spidy 0.3.12 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +53 -8
- data/.ruby-version +1 -1
- data/CLAUDE.md +28 -0
- data/Gemfile +20 -2
- data/Gemfile.lock +178 -69
- data/README.md +96 -17
- data/Rakefile +0 -2
- data/bin/console +2 -3
- data/example/check_ferrum.rb +114 -0
- data/example/check_lightpanda.rb +59 -0
- data/example/connect_test.rb +48 -0
- data/example/lightpanda_links.rb +80 -0
- data/example/master_detail.rb +1 -3
- data/example/proxy.rb +0 -2
- data/example/retry.rb +0 -2
- data/example/run_with_lightpanda.rb +25 -0
- data/example/simple_test.rb +53 -0
- data/example/test_lightpanda.rb +86 -0
- data/example/wikip.rb +2 -4
- data/exe/spidy +0 -3
- data/lib/spidy/binder/error.rb +0 -2
- data/lib/spidy/binder/html.rb +0 -2
- data/lib/spidy/binder/json.rb +0 -2
- data/lib/spidy/binder/xml.rb +0 -2
- data/lib/spidy/binder.rb +0 -2
- data/lib/spidy/command_line.rb +4 -6
- data/lib/spidy/connector/direct.rb +0 -2
- data/lib/spidy/connector/html.rb +0 -2
- data/lib/spidy/connector/json.rb +2 -4
- data/lib/spidy/connector/lightpanda.rb +161 -0
- data/lib/spidy/connector/xml.rb +4 -6
- data/lib/spidy/connector.rb +7 -8
- data/lib/spidy/console.rb +0 -2
- data/lib/spidy/definition.rb +2 -4
- data/lib/spidy/definition_file.rb +0 -2
- data/lib/spidy/definition_object.rb +0 -2
- data/lib/spidy/shell.rb +6 -3
- data/lib/spidy/spider.rb +2 -4
- data/lib/spidy/version.rb +1 -3
- data/lib/spidy.rb +3 -5
- data/spidy.gemspec +4 -17
- metadata +16 -138
- data/.rubocop_todo.yml +0 -13
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 743eadfde1aa8f5e9dbfde067b1c92e38014f274bd59502ca64d845d622c3e53
|
4
|
+
data.tar.gz: 3b89159ea679762e361214ecf9ece14642ff1aeb48b978084bfd35c71e3ad8ea
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: bad3fd94c682d94a2d92130759178ccea388701cbcbf2ef0e125db8d558af349e81e6c2f623d78277dfffcb89f82d60d7c06d32fef158990cdd3ec19118dc63f
|
7
|
+
data.tar.gz: 24217cbb1c12ebbc4fee9f18e3a3583f2194e33c7dfac99a270cde46f93367a6a397375b08905bb04de39a12f12226f13795c794355ca87ba6f155f8da45b5c6
|
data/.rubocop.yml
CHANGED
@@ -1,34 +1,79 @@
|
|
1
|
-
|
1
|
+
plugins:
|
2
|
+
- rubocop-performance
|
3
|
+
- rubocop-rspec
|
4
|
+
|
2
5
|
AllCops:
|
3
|
-
TargetRubyVersion: 3.
|
6
|
+
TargetRubyVersion: 3.4.2
|
4
7
|
NewCops: enable
|
5
8
|
DisplayCopNames: true
|
9
|
+
Exclude:
|
10
|
+
- 'vendor/**/*'
|
11
|
+
- 'bin/**/*'
|
12
|
+
- 'tmp/**/*'
|
13
|
+
|
14
|
+
Gemspec/RequiredRubyVersion:
|
15
|
+
Enabled: false
|
16
|
+
|
17
|
+
Style/FrozenStringLiteralComment:
|
18
|
+
EnforcedStyle: never
|
6
19
|
|
20
|
+
# Style
|
7
21
|
Style/ClassAndModuleChildren:
|
8
22
|
Enabled: false
|
9
23
|
|
10
24
|
Style/SignalException:
|
11
25
|
EnforcedStyle: semantic
|
12
26
|
|
27
|
+
Style/Documentation:
|
28
|
+
Enabled: false
|
29
|
+
|
30
|
+
Style/StringLiterals:
|
31
|
+
EnforcedStyle: single_quotes
|
32
|
+
ConsistentQuotesInMultiline: true
|
33
|
+
|
34
|
+
# Naming
|
13
35
|
Naming/MethodParameterName:
|
14
36
|
AllowedNames:
|
15
37
|
- as
|
38
|
+
- id
|
39
|
+
- io
|
40
|
+
- ip
|
41
|
+
- of
|
42
|
+
- on
|
43
|
+
- to
|
44
|
+
- up
|
16
45
|
|
46
|
+
# Metrics
|
17
47
|
Metrics/AbcSize:
|
18
|
-
Max:
|
48
|
+
Max: 25
|
19
49
|
Exclude:
|
50
|
+
- 'lib/spidy/connector/lightpanda.rb'
|
20
51
|
|
21
52
|
Metrics/MethodLength:
|
22
|
-
Max:
|
53
|
+
Max: 20
|
54
|
+
Exclude:
|
55
|
+
- 'lib/spidy/connector/lightpanda.rb'
|
56
|
+
|
57
|
+
Metrics/ClassLength:
|
58
|
+
Max: 150
|
59
|
+
Exclude:
|
60
|
+
- 'lib/spidy/connector/lightpanda.rb'
|
23
61
|
|
24
|
-
|
62
|
+
Layout/LineLength:
|
25
63
|
Max: 130
|
26
64
|
|
27
65
|
Metrics/BlockLength:
|
28
66
|
Max: 120
|
29
|
-
|
30
|
-
|
31
|
-
|
67
|
+
Exclude:
|
68
|
+
- 'spec/**/*'
|
69
|
+
- 'example/**/*'
|
32
70
|
|
33
71
|
Layout/EmptyLineAfterGuardClause:
|
34
72
|
Enabled: false
|
73
|
+
|
74
|
+
# RSpec
|
75
|
+
RSpec/ExampleLength:
|
76
|
+
Max: 15
|
77
|
+
|
78
|
+
RSpec/MultipleExpectations:
|
79
|
+
Max: 5
|
data/.ruby-version
CHANGED
@@ -1 +1 @@
|
|
1
|
-
3.
|
1
|
+
3.4.2
|
data/CLAUDE.md
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
# Claude Helper for Spidy
|
2
|
+
|
3
|
+
## Build/Test/Lint Commands
|
4
|
+
- Install dependencies: `bundle install`
|
5
|
+
- Run all tests: `bundle exec rake spec`
|
6
|
+
- Run single test: `bundle exec rspec spec/path/to_spec.rb:LINE_NUMBER`
|
7
|
+
- Install gem locally: `bundle exec rake install`
|
8
|
+
- Release gem: `bundle exec rake release`
|
9
|
+
|
10
|
+
## Code Style Guidelines
|
11
|
+
- **Naming Conventions**:
|
12
|
+
- snake_case for methods/variables/files
|
13
|
+
- CamelCase for classes/modules
|
14
|
+
- SCREAMING_SNAKE_CASE for constants
|
15
|
+
- **File Organization**: Match file paths to module/class hierarchy
|
16
|
+
- **Imports**:
|
17
|
+
- Add `# frozen_string_literal: true` at file start
|
18
|
+
- Use `extend ActiveSupport::Autoload` for modules with sub-modules
|
19
|
+
- **Error Handling**: Create custom error classes inheriting from StandardError
|
20
|
+
- **Documentation**: Add brief comments before classes and methods
|
21
|
+
- **Testing**:
|
22
|
+
- Use RSpec with `expect` syntax
|
23
|
+
- Organize with `describe` and `specify` blocks
|
24
|
+
- Name test files with `_spec.rb` suffix
|
25
|
+
|
26
|
+
## Dependencies
|
27
|
+
- Runtime: activesupport, mechanize, socksify, tor
|
28
|
+
- Development: bundler, capybara_discoball, ffaker, rake, rspec, sinatra
|
data/Gemfile
CHANGED
@@ -1,6 +1,24 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
1
|
source 'https://rubygems.org'
|
4
2
|
|
5
3
|
# Specify your gem's dependencies in crawler.gemspec
|
6
4
|
gemspec
|
5
|
+
|
6
|
+
gem 'irb'
|
7
|
+
gem 'rackup'
|
8
|
+
gem 'webrick'
|
9
|
+
|
10
|
+
gem 'capybara_discoball'
|
11
|
+
gem 'ffaker'
|
12
|
+
gem 'rake', '~> 13.0'
|
13
|
+
gem 'rspec', '~> 3.0'
|
14
|
+
gem 'rspec-command'
|
15
|
+
gem 'sinatra'
|
16
|
+
|
17
|
+
group :development do
|
18
|
+
gem 'ferrum'
|
19
|
+
|
20
|
+
gem 'rubocop', require: false
|
21
|
+
gem 'rubocop-performance', require: false
|
22
|
+
gem 'rubocop-rake', require: false
|
23
|
+
gem 'rubocop-rspec', require: false
|
24
|
+
end
|
data/Gemfile.lock
CHANGED
@@ -1,136 +1,245 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
spidy (0.
|
5
|
-
activesupport
|
4
|
+
spidy (1.0.0)
|
5
|
+
activesupport (~> 7.1)
|
6
6
|
mechanize
|
7
|
-
pry
|
8
7
|
socksify
|
9
8
|
tor
|
10
9
|
|
11
10
|
GEM
|
12
11
|
remote: https://rubygems.org/
|
13
12
|
specs:
|
14
|
-
activesupport (7.
|
15
|
-
|
13
|
+
activesupport (7.2.2.1)
|
14
|
+
base64
|
15
|
+
benchmark (>= 0.3)
|
16
|
+
bigdecimal
|
17
|
+
concurrent-ruby (~> 1.0, >= 1.3.1)
|
18
|
+
connection_pool (>= 2.2.5)
|
19
|
+
drb
|
16
20
|
i18n (>= 1.6, < 2)
|
21
|
+
logger (>= 1.4.2)
|
17
22
|
minitest (>= 5.1)
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
23
|
+
securerandom (>= 0.3)
|
24
|
+
tzinfo (~> 2.0, >= 2.0.5)
|
25
|
+
addressable (2.8.7)
|
26
|
+
public_suffix (>= 2.0.2, < 7.0)
|
27
|
+
ast (2.4.3)
|
28
|
+
base64 (0.2.0)
|
29
|
+
benchmark (0.4.0)
|
30
|
+
bigdecimal (3.1.9)
|
31
|
+
capybara (3.40.0)
|
22
32
|
addressable
|
23
33
|
matrix
|
24
34
|
mini_mime (>= 0.1.3)
|
25
|
-
nokogiri (~> 1.
|
35
|
+
nokogiri (~> 1.11)
|
26
36
|
rack (>= 1.6.0)
|
27
37
|
rack-test (>= 0.6.3)
|
28
38
|
regexp_parser (>= 1.5, < 3.0)
|
29
39
|
xpath (~> 3.2)
|
30
40
|
capybara_discoball (0.1.0)
|
31
41
|
capybara (>= 2.7, < 4)
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
diff-lcs (1.
|
36
|
-
domain_name (0.
|
37
|
-
|
38
|
-
|
39
|
-
|
42
|
+
concurrent-ruby (1.3.5)
|
43
|
+
connection_pool (2.5.0)
|
44
|
+
date (3.4.1)
|
45
|
+
diff-lcs (1.6.1)
|
46
|
+
domain_name (0.6.20240107)
|
47
|
+
drb (2.2.1)
|
48
|
+
ferrum (0.16)
|
49
|
+
addressable (~> 2.5)
|
50
|
+
base64 (~> 0.2)
|
51
|
+
concurrent-ruby (~> 1.1)
|
52
|
+
webrick (~> 1.7)
|
53
|
+
websocket-driver (~> 0.7)
|
54
|
+
ffaker (2.24.0)
|
55
|
+
http-cookie (1.0.8)
|
40
56
|
domain_name (~> 0.5)
|
41
|
-
i18n (1.
|
57
|
+
i18n (1.14.7)
|
42
58
|
concurrent-ruby (~> 1.0)
|
59
|
+
io-console (0.8.0)
|
60
|
+
irb (1.15.2)
|
61
|
+
pp (>= 0.6.0)
|
62
|
+
rdoc (>= 4.0.0)
|
63
|
+
reline (>= 0.4.2)
|
64
|
+
json (2.10.2)
|
65
|
+
language_server-protocol (3.17.0.4)
|
66
|
+
lint_roller (1.1.0)
|
67
|
+
logger (1.7.0)
|
43
68
|
matrix (0.4.2)
|
44
|
-
mechanize (2.
|
69
|
+
mechanize (2.14.0)
|
45
70
|
addressable (~> 2.8)
|
71
|
+
base64
|
46
72
|
domain_name (~> 0.5, >= 0.5.20190701)
|
47
73
|
http-cookie (~> 1.0, >= 1.0.3)
|
48
|
-
mime-types (~> 3.
|
74
|
+
mime-types (~> 3.3)
|
49
75
|
net-http-digest_auth (~> 1.4, >= 1.4.1)
|
50
76
|
net-http-persistent (>= 2.5.2, < 5.0.dev)
|
77
|
+
nkf
|
51
78
|
nokogiri (~> 1.11, >= 1.11.2)
|
52
79
|
rubyntlm (~> 0.6, >= 0.6.3)
|
53
80
|
webrick (~> 1.7)
|
54
81
|
webrobots (~> 0.1.2)
|
55
|
-
|
56
|
-
|
82
|
+
mime-types (3.6.2)
|
83
|
+
logger
|
57
84
|
mime-types-data (~> 3.2015)
|
58
|
-
mime-types-data (3.
|
59
|
-
mini_mime (1.1.
|
60
|
-
minitest (5.
|
85
|
+
mime-types-data (3.2025.0408)
|
86
|
+
mini_mime (1.1.5)
|
87
|
+
minitest (5.25.5)
|
61
88
|
mixlib-shellout (2.4.4)
|
62
|
-
mustermann (
|
89
|
+
mustermann (3.0.3)
|
63
90
|
ruby2_keywords (~> 0.0.1)
|
64
91
|
net-http-digest_auth (1.4.1)
|
65
|
-
net-http-persistent (4.0.
|
92
|
+
net-http-persistent (4.0.5)
|
66
93
|
connection_pool (~> 2.2)
|
67
|
-
|
94
|
+
nkf (0.2.0)
|
95
|
+
nokogiri (1.18.7-aarch64-linux-gnu)
|
96
|
+
racc (~> 1.4)
|
97
|
+
nokogiri (1.18.7-aarch64-linux-musl)
|
98
|
+
racc (~> 1.4)
|
99
|
+
nokogiri (1.18.7-arm-linux-gnu)
|
100
|
+
racc (~> 1.4)
|
101
|
+
nokogiri (1.18.7-arm-linux-musl)
|
102
|
+
racc (~> 1.4)
|
103
|
+
nokogiri (1.18.7-arm64-darwin)
|
104
|
+
racc (~> 1.4)
|
105
|
+
nokogiri (1.18.7-x86_64-darwin)
|
106
|
+
racc (~> 1.4)
|
107
|
+
nokogiri (1.18.7-x86_64-linux-gnu)
|
108
|
+
racc (~> 1.4)
|
109
|
+
nokogiri (1.18.7-x86_64-linux-musl)
|
68
110
|
racc (~> 1.4)
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
111
|
+
parallel (1.27.0)
|
112
|
+
parser (3.3.8.0)
|
113
|
+
ast (~> 2.4.1)
|
114
|
+
racc
|
115
|
+
pp (0.6.2)
|
116
|
+
prettyprint
|
117
|
+
prettyprint (0.2.0)
|
118
|
+
prism (1.4.0)
|
119
|
+
psych (5.2.3)
|
120
|
+
date
|
121
|
+
stringio
|
122
|
+
public_suffix (6.0.1)
|
123
|
+
racc (1.8.1)
|
124
|
+
rack (3.1.13)
|
125
|
+
rack-protection (4.1.1)
|
126
|
+
base64 (>= 0.1.0)
|
127
|
+
logger (>= 1.6.0)
|
128
|
+
rack (>= 3.0.0, < 4)
|
129
|
+
rack-session (2.1.0)
|
130
|
+
base64 (>= 0.1.0)
|
131
|
+
rack (>= 3.0.0)
|
132
|
+
rack-test (2.2.0)
|
133
|
+
rack (>= 1.3)
|
134
|
+
rackup (2.2.1)
|
135
|
+
rack (>= 3)
|
136
|
+
rainbow (3.1.1)
|
137
|
+
rake (13.2.1)
|
138
|
+
rdoc (6.13.1)
|
139
|
+
psych (>= 4.0.0)
|
140
|
+
regexp_parser (2.10.0)
|
141
|
+
reline (0.6.1)
|
142
|
+
io-console (~> 0.5)
|
143
|
+
rspec (3.13.0)
|
144
|
+
rspec-core (~> 3.13.0)
|
145
|
+
rspec-expectations (~> 3.13.0)
|
146
|
+
rspec-mocks (~> 3.13.0)
|
85
147
|
rspec-command (1.0.3)
|
86
148
|
mixlib-shellout (~> 2.0)
|
87
149
|
rspec (~> 3.2)
|
88
150
|
rspec-its (~> 1.2)
|
89
|
-
rspec-core (3.
|
90
|
-
rspec-support (~> 3.
|
91
|
-
rspec-expectations (3.
|
151
|
+
rspec-core (3.13.3)
|
152
|
+
rspec-support (~> 3.13.0)
|
153
|
+
rspec-expectations (3.13.3)
|
92
154
|
diff-lcs (>= 1.2.0, < 2.0)
|
93
|
-
rspec-support (~> 3.
|
94
|
-
rspec-its (1.3.
|
155
|
+
rspec-support (~> 3.13.0)
|
156
|
+
rspec-its (1.3.1)
|
95
157
|
rspec-core (>= 3.0.0)
|
96
158
|
rspec-expectations (>= 3.0.0)
|
97
|
-
rspec-mocks (3.
|
159
|
+
rspec-mocks (3.13.2)
|
98
160
|
diff-lcs (>= 1.2.0, < 2.0)
|
99
|
-
rspec-support (~> 3.
|
100
|
-
rspec-support (3.
|
161
|
+
rspec-support (~> 3.13.0)
|
162
|
+
rspec-support (3.13.2)
|
163
|
+
rubocop (1.75.2)
|
164
|
+
json (~> 2.3)
|
165
|
+
language_server-protocol (~> 3.17.0.2)
|
166
|
+
lint_roller (~> 1.1.0)
|
167
|
+
parallel (~> 1.10)
|
168
|
+
parser (>= 3.3.0.2)
|
169
|
+
rainbow (>= 2.2.2, < 4.0)
|
170
|
+
regexp_parser (>= 2.9.3, < 3.0)
|
171
|
+
rubocop-ast (>= 1.44.0, < 2.0)
|
172
|
+
ruby-progressbar (~> 1.7)
|
173
|
+
unicode-display_width (>= 2.4.0, < 4.0)
|
174
|
+
rubocop-ast (1.44.1)
|
175
|
+
parser (>= 3.3.7.2)
|
176
|
+
prism (~> 1.4)
|
177
|
+
rubocop-performance (1.25.0)
|
178
|
+
lint_roller (~> 1.1)
|
179
|
+
rubocop (>= 1.75.0, < 2.0)
|
180
|
+
rubocop-ast (>= 1.38.0, < 2.0)
|
181
|
+
rubocop-rake (0.7.1)
|
182
|
+
lint_roller (~> 1.1)
|
183
|
+
rubocop (>= 1.72.1)
|
184
|
+
rubocop-rspec (3.5.0)
|
185
|
+
lint_roller (~> 1.1)
|
186
|
+
rubocop (~> 1.72, >= 1.72.1)
|
187
|
+
ruby-progressbar (1.13.0)
|
101
188
|
ruby2_keywords (0.0.5)
|
102
|
-
rubyntlm (0.6.
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
189
|
+
rubyntlm (0.6.5)
|
190
|
+
base64
|
191
|
+
securerandom (0.4.1)
|
192
|
+
sinatra (4.1.1)
|
193
|
+
logger (>= 1.6.0)
|
194
|
+
mustermann (~> 3.0)
|
195
|
+
rack (>= 3.0.0, < 4)
|
196
|
+
rack-protection (= 4.1.1)
|
197
|
+
rack-session (>= 2.0.0, < 3)
|
107
198
|
tilt (~> 2.0)
|
108
199
|
socksify (1.7.1)
|
109
|
-
|
110
|
-
|
111
|
-
|
200
|
+
stringio (3.1.6)
|
201
|
+
tilt (2.6.0)
|
202
|
+
tor (0.1.7)
|
203
|
+
tzinfo (2.0.6)
|
112
204
|
concurrent-ruby (~> 1.0)
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
webrick (1.
|
205
|
+
unicode-display_width (3.1.4)
|
206
|
+
unicode-emoji (~> 4.0, >= 4.0.4)
|
207
|
+
unicode-emoji (4.0.4)
|
208
|
+
webrick (1.9.1)
|
117
209
|
webrobots (0.1.2)
|
210
|
+
websocket-driver (0.7.7)
|
211
|
+
base64
|
212
|
+
websocket-extensions (>= 0.1.0)
|
213
|
+
websocket-extensions (0.1.5)
|
118
214
|
xpath (3.2.0)
|
119
215
|
nokogiri (~> 1.8)
|
120
216
|
|
121
217
|
PLATFORMS
|
122
|
-
|
218
|
+
aarch64-linux-gnu
|
219
|
+
aarch64-linux-musl
|
220
|
+
arm-linux-gnu
|
221
|
+
arm-linux-musl
|
222
|
+
arm64-darwin
|
223
|
+
x86_64-darwin
|
224
|
+
x86_64-linux-gnu
|
225
|
+
x86_64-linux-musl
|
123
226
|
|
124
227
|
DEPENDENCIES
|
125
|
-
bundler (~> 2.0)
|
126
228
|
capybara_discoball
|
229
|
+
ferrum
|
127
230
|
ffaker
|
128
|
-
|
231
|
+
irb
|
232
|
+
rackup
|
129
233
|
rake (~> 13.0)
|
130
234
|
rspec (~> 3.0)
|
131
235
|
rspec-command
|
236
|
+
rubocop
|
237
|
+
rubocop-performance
|
238
|
+
rubocop-rake
|
239
|
+
rubocop-rspec
|
132
240
|
sinatra
|
133
241
|
spidy!
|
242
|
+
webrick
|
134
243
|
|
135
244
|
BUNDLED WITH
|
136
|
-
2.
|
245
|
+
2.6.5
|
data/README.md
CHANGED
@@ -20,14 +20,73 @@ Or install it yourself as:
|
|
20
20
|
|
21
21
|
## Usage
|
22
22
|
|
23
|
-
###
|
23
|
+
### Connectors
|
24
24
|
|
25
|
-
|
25
|
+
Spidy supports different connectors for fetching web pages:
|
26
|
+
|
27
|
+
1. **HTML Connector (Mechanize)**: Default connector for regular HTTP requests and HTML parsing
|
28
|
+
2. **JSON Connector**: For parsing JSON APIs
|
29
|
+
3. **XML Connector**: For parsing XML responses
|
30
|
+
4. **Lightpanda Connector**: For JavaScript-rendered websites (uses Playwright)
|
31
|
+
|
32
|
+
#### Lightpanda Connector for JavaScript-Rendered Websites
|
33
|
+
|
34
|
+
The Lightpanda connector allows you to process JavaScript-rendered websites by connecting to a running lightpanda CDP server.
|
35
|
+
|
36
|
+
##### Prerequisites
|
37
|
+
|
38
|
+
1. Install the Playwright Ruby client:
|
39
|
+
|
40
|
+
```bash
|
41
|
+
$ gem install playwright-ruby-client
|
42
|
+
```
|
43
|
+
|
44
|
+
2. Start a lightpanda CDP server in a separate terminal:
|
45
|
+
|
46
|
+
```bash
|
47
|
+
$ lightpanda serve --host 127.0.0.1 --port 9222
|
48
|
+
```
|
49
|
+
|
50
|
+
##### Usage
|
51
|
+
|
52
|
+
```ruby
|
53
|
+
# Define a scraper with lightpanda support
|
54
|
+
scraper = Spidy.define do
|
55
|
+
# Use the :lightpanda connector for JavaScript-rendered sites
|
56
|
+
spider(as: :lightpanda) do |yielder, connector, url|
|
57
|
+
connector.call(url) do |page|
|
58
|
+
# Process the JavaScript-rendered page
|
59
|
+
# page is a Nokogiri-like object
|
60
|
+
yielder.call(page)
|
61
|
+
end
|
62
|
+
end
|
63
|
+
|
64
|
+
define(as: :html) do
|
65
|
+
let(:title, 'title')
|
66
|
+
# Extract content from JS-rendered page...
|
67
|
+
end
|
68
|
+
end
|
69
|
+
```
|
70
|
+
|
71
|
+
##### Configuration
|
72
|
+
|
73
|
+
You can customize the lightpanda CDP server connection using environment variables:
|
74
|
+
|
75
|
+
```bash
|
76
|
+
# Set custom host and port
|
77
|
+
$ LIGHTPANDA_HOST=192.168.1.100 LIGHTPANDA_PORT=9333 ruby your_script.rb
|
78
|
+
```
|
79
|
+
|
80
|
+
Check `example/playwright_example.rb` for a complete example.
|
81
|
+
|
82
|
+
### Command Line Usage
|
83
|
+
|
84
|
+
Create a definition file (e.g., website.rb):
|
26
85
|
```rb
|
27
|
-
Spidy.
|
86
|
+
Spidy.define do
|
28
87
|
spider(as: :html) do |yielder, connector, url|
|
29
88
|
connector.call(url) do |html|
|
30
|
-
# html
|
89
|
+
# html is a Nokogiri object (from Mechanize)
|
31
90
|
yielder.call(url)
|
32
91
|
end
|
33
92
|
end
|
@@ -37,41 +96,61 @@ Spidy.defin do
|
|
37
96
|
end
|
38
97
|
end
|
39
98
|
```
|
99
|
+
|
100
|
+
Use it from the command line:
|
40
101
|
```bash
|
41
102
|
echo 'http://example.com' | spidy each website.rb > urls
|
42
103
|
cat urls | spidy call website.rb > website.json
|
43
|
-
#
|
104
|
+
# shorthand
|
44
105
|
echo 'http://example.com' | spidy each website.rb | spidy call website.rb | jq .
|
45
106
|
```
|
46
107
|
|
47
|
-
###
|
108
|
+
### Development Console
|
109
|
+
|
110
|
+
Start an interactive console with your definition:
|
48
111
|
```bash
|
49
112
|
spidy console website.rb
|
50
113
|
```
|
51
114
|
|
52
|
-
|
115
|
+
Reload your source code during development:
|
53
116
|
```
|
54
|
-
|
117
|
+
irb(#<Spidy::Console>)> reload!
|
55
118
|
```
|
56
119
|
|
120
|
+
Example console usage:
|
57
121
|
```rb
|
58
122
|
each('http://example.com') { |url| break url }
|
59
|
-
call('http://example.com') { |html| break html } # html
|
123
|
+
call('http://example.com') { |html| break html } # html is a Nokogiri object (from Mechanize)
|
60
124
|
```
|
61
125
|
|
62
|
-
###
|
126
|
+
### Ruby Code Usage
|
127
|
+
|
128
|
+
Create and use a scraper in your Ruby code:
|
63
129
|
```rb
|
64
|
-
|
65
|
-
#
|
130
|
+
scraper = Spidy.define do
|
131
|
+
# Implement spiders and scrapers
|
132
|
+
spider(as: :html) do |yielder, connector, url|
|
133
|
+
connector.call(url) do |page|
|
134
|
+
yielder.call(page)
|
135
|
+
end
|
136
|
+
end
|
137
|
+
|
138
|
+
define(as: :html) do
|
139
|
+
let(:title, 'title')
|
140
|
+
let(:links) { |doc| doc.css('a').map { |a| a['href'] } }
|
141
|
+
end
|
66
142
|
end
|
67
143
|
|
68
|
-
a
|
69
|
-
|
144
|
+
# Extract URLs from a site
|
145
|
+
scraper.each(url) do |page_url|
|
146
|
+
# Process each URL found
|
147
|
+
puts page_url
|
70
148
|
end
|
71
149
|
|
72
|
-
a
|
73
|
-
|
74
|
-
|
150
|
+
# Extract structured data from a site
|
151
|
+
result = scraper.call(url)
|
152
|
+
puts "Title: #{result[:title]}"
|
153
|
+
puts "Found #{result[:links].size} links"
|
75
154
|
```
|
76
155
|
|
77
156
|
## Development
|
data/Rakefile
CHANGED
data/bin/console
CHANGED
@@ -3,12 +3,11 @@
|
|
3
3
|
|
4
4
|
require 'bundler/setup'
|
5
5
|
require 'spidy'
|
6
|
+
require 'irb'
|
6
7
|
|
7
8
|
# You can add fixtures and/or initialization code here to make experimenting
|
8
9
|
# with your gem easier. You can also use a different console, if you like.
|
9
10
|
|
10
|
-
# (If you use this, don't forget to add pry to your Gemfile!)
|
11
|
-
require 'pry'
|
12
11
|
def reload!
|
13
12
|
ActiveSupport::Dependencies.clear
|
14
13
|
ActiveSupport::DescendantsTracker.clear
|
@@ -18,5 +17,5 @@ end
|
|
18
17
|
if ARGV[0]
|
19
18
|
Spidy.open(ARGV[0]).console
|
20
19
|
else
|
21
|
-
|
20
|
+
IRB.start
|
22
21
|
end
|