spidy 0.4.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +53 -8
- data/Gemfile +18 -4
- data/Gemfile.lock +71 -19
- data/README.md +95 -16
- data/Rakefile +0 -2
- data/example/check_ferrum.rb +114 -0
- data/example/check_lightpanda.rb +59 -0
- data/example/connect_test.rb +48 -0
- data/example/lightpanda_links.rb +80 -0
- data/example/master_detail.rb +1 -3
- data/example/proxy.rb +0 -2
- data/example/retry.rb +0 -2
- data/example/run_with_lightpanda.rb +25 -0
- data/example/simple_test.rb +53 -0
- data/example/test_lightpanda.rb +86 -0
- data/example/wikip.rb +2 -4
- data/exe/spidy +0 -2
- data/lib/spidy/binder/error.rb +0 -2
- data/lib/spidy/binder/html.rb +0 -2
- data/lib/spidy/binder/json.rb +0 -2
- data/lib/spidy/binder/xml.rb +0 -2
- data/lib/spidy/binder.rb +0 -2
- data/lib/spidy/command_line.rb +4 -6
- data/lib/spidy/connector/direct.rb +0 -2
- data/lib/spidy/connector/html.rb +0 -2
- data/lib/spidy/connector/json.rb +2 -4
- data/lib/spidy/connector/lightpanda.rb +161 -0
- data/lib/spidy/connector/xml.rb +4 -6
- data/lib/spidy/connector.rb +7 -8
- data/lib/spidy/console.rb +0 -2
- data/lib/spidy/definition.rb +2 -4
- data/lib/spidy/definition_file.rb +0 -2
- data/lib/spidy/definition_object.rb +0 -2
- data/lib/spidy/shell.rb +0 -2
- data/lib/spidy/spider.rb +2 -4
- data/lib/spidy/version.rb +1 -3
- data/lib/spidy.rb +3 -5
- data/spidy.gemspec +4 -15
- metadata +11 -102
- data/.rubocop_todo.yml +0 -13
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 743eadfde1aa8f5e9dbfde067b1c92e38014f274bd59502ca64d845d622c3e53
|
4
|
+
data.tar.gz: 3b89159ea679762e361214ecf9ece14642ff1aeb48b978084bfd35c71e3ad8ea
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: bad3fd94c682d94a2d92130759178ccea388701cbcbf2ef0e125db8d558af349e81e6c2f623d78277dfffcb89f82d60d7c06d32fef158990cdd3ec19118dc63f
|
7
|
+
data.tar.gz: 24217cbb1c12ebbc4fee9f18e3a3583f2194e33c7dfac99a270cde46f93367a6a397375b08905bb04de39a12f12226f13795c794355ca87ba6f155f8da45b5c6
|
data/.rubocop.yml
CHANGED
@@ -1,34 +1,79 @@
|
|
1
|
-
|
1
|
+
plugins:
|
2
|
+
- rubocop-performance
|
3
|
+
- rubocop-rspec
|
4
|
+
|
2
5
|
AllCops:
|
3
|
-
TargetRubyVersion: 3.
|
6
|
+
TargetRubyVersion: 3.4.2
|
4
7
|
NewCops: enable
|
5
8
|
DisplayCopNames: true
|
9
|
+
Exclude:
|
10
|
+
- 'vendor/**/*'
|
11
|
+
- 'bin/**/*'
|
12
|
+
- 'tmp/**/*'
|
13
|
+
|
14
|
+
Gemspec/RequiredRubyVersion:
|
15
|
+
Enabled: false
|
16
|
+
|
17
|
+
Style/FrozenStringLiteralComment:
|
18
|
+
EnforcedStyle: never
|
6
19
|
|
20
|
+
# Style
|
7
21
|
Style/ClassAndModuleChildren:
|
8
22
|
Enabled: false
|
9
23
|
|
10
24
|
Style/SignalException:
|
11
25
|
EnforcedStyle: semantic
|
12
26
|
|
27
|
+
Style/Documentation:
|
28
|
+
Enabled: false
|
29
|
+
|
30
|
+
Style/StringLiterals:
|
31
|
+
EnforcedStyle: single_quotes
|
32
|
+
ConsistentQuotesInMultiline: true
|
33
|
+
|
34
|
+
# Naming
|
13
35
|
Naming/MethodParameterName:
|
14
36
|
AllowedNames:
|
15
37
|
- as
|
38
|
+
- id
|
39
|
+
- io
|
40
|
+
- ip
|
41
|
+
- of
|
42
|
+
- on
|
43
|
+
- to
|
44
|
+
- up
|
16
45
|
|
46
|
+
# Metrics
|
17
47
|
Metrics/AbcSize:
|
18
|
-
Max:
|
48
|
+
Max: 25
|
19
49
|
Exclude:
|
50
|
+
- 'lib/spidy/connector/lightpanda.rb'
|
20
51
|
|
21
52
|
Metrics/MethodLength:
|
22
|
-
Max:
|
53
|
+
Max: 20
|
54
|
+
Exclude:
|
55
|
+
- 'lib/spidy/connector/lightpanda.rb'
|
56
|
+
|
57
|
+
Metrics/ClassLength:
|
58
|
+
Max: 150
|
59
|
+
Exclude:
|
60
|
+
- 'lib/spidy/connector/lightpanda.rb'
|
23
61
|
|
24
|
-
|
62
|
+
Layout/LineLength:
|
25
63
|
Max: 130
|
26
64
|
|
27
65
|
Metrics/BlockLength:
|
28
66
|
Max: 120
|
29
|
-
|
30
|
-
|
31
|
-
|
67
|
+
Exclude:
|
68
|
+
- 'spec/**/*'
|
69
|
+
- 'example/**/*'
|
32
70
|
|
33
71
|
Layout/EmptyLineAfterGuardClause:
|
34
72
|
Enabled: false
|
73
|
+
|
74
|
+
# RSpec
|
75
|
+
RSpec/ExampleLength:
|
76
|
+
Max: 15
|
77
|
+
|
78
|
+
RSpec/MultipleExpectations:
|
79
|
+
Max: 5
|
data/Gemfile
CHANGED
@@ -1,10 +1,24 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
1
|
source 'https://rubygems.org'
|
4
2
|
|
5
3
|
# Specify your gem's dependencies in crawler.gemspec
|
6
4
|
gemspec
|
7
5
|
|
8
|
-
gem 'webrick'
|
9
|
-
gem 'rackup'
|
10
6
|
gem 'irb'
|
7
|
+
gem 'rackup'
|
8
|
+
gem 'webrick'
|
9
|
+
|
10
|
+
gem 'capybara_discoball'
|
11
|
+
gem 'ffaker'
|
12
|
+
gem 'rake', '~> 13.0'
|
13
|
+
gem 'rspec', '~> 3.0'
|
14
|
+
gem 'rspec-command'
|
15
|
+
gem 'sinatra'
|
16
|
+
|
17
|
+
group :development do
|
18
|
+
gem 'ferrum'
|
19
|
+
|
20
|
+
gem 'rubocop', require: false
|
21
|
+
gem 'rubocop-performance', require: false
|
22
|
+
gem 'rubocop-rake', require: false
|
23
|
+
gem 'rubocop-rspec', require: false
|
24
|
+
end
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
spidy (0.4.0)
|
4
|
+
spidy (1.0.0)
|
5
5
|
activesupport (~> 7.1)
|
6
6
|
mechanize
|
7
7
|
socksify
|
@@ -24,6 +24,7 @@ GEM
|
|
24
24
|
tzinfo (~> 2.0, >= 2.0.5)
|
25
25
|
addressable (2.8.7)
|
26
26
|
public_suffix (>= 2.0.2, < 7.0)
|
27
|
+
ast (2.4.3)
|
27
28
|
base64 (0.2.0)
|
28
29
|
benchmark (0.4.0)
|
29
30
|
bigdecimal (3.1.9)
|
@@ -41,20 +42,29 @@ GEM
|
|
41
42
|
concurrent-ruby (1.3.5)
|
42
43
|
connection_pool (2.5.0)
|
43
44
|
date (3.4.1)
|
44
|
-
diff-lcs (1.6.
|
45
|
+
diff-lcs (1.6.1)
|
45
46
|
domain_name (0.6.20240107)
|
46
47
|
drb (2.2.1)
|
48
|
+
ferrum (0.16)
|
49
|
+
addressable (~> 2.5)
|
50
|
+
base64 (~> 0.2)
|
51
|
+
concurrent-ruby (~> 1.1)
|
52
|
+
webrick (~> 1.7)
|
53
|
+
websocket-driver (~> 0.7)
|
47
54
|
ffaker (2.24.0)
|
48
55
|
http-cookie (1.0.8)
|
49
56
|
domain_name (~> 0.5)
|
50
57
|
i18n (1.14.7)
|
51
58
|
concurrent-ruby (~> 1.0)
|
52
59
|
io-console (0.8.0)
|
53
|
-
irb (1.15.
|
60
|
+
irb (1.15.2)
|
54
61
|
pp (>= 0.6.0)
|
55
62
|
rdoc (>= 4.0.0)
|
56
63
|
reline (>= 0.4.2)
|
57
|
-
|
64
|
+
json (2.10.2)
|
65
|
+
language_server-protocol (3.17.0.4)
|
66
|
+
lint_roller (1.1.0)
|
67
|
+
logger (1.7.0)
|
58
68
|
matrix (0.4.2)
|
59
69
|
mechanize (2.14.0)
|
60
70
|
addressable (~> 2.8)
|
@@ -69,10 +79,10 @@ GEM
|
|
69
79
|
rubyntlm (~> 0.6, >= 0.6.3)
|
70
80
|
webrick (~> 1.7)
|
71
81
|
webrobots (~> 0.1.2)
|
72
|
-
mime-types (3.6.
|
82
|
+
mime-types (3.6.2)
|
73
83
|
logger
|
74
84
|
mime-types-data (~> 3.2015)
|
75
|
-
mime-types-data (3.2025.
|
85
|
+
mime-types-data (3.2025.0408)
|
76
86
|
mini_mime (1.1.5)
|
77
87
|
minitest (5.25.5)
|
78
88
|
mixlib-shellout (2.4.4)
|
@@ -82,31 +92,36 @@ GEM
|
|
82
92
|
net-http-persistent (4.0.5)
|
83
93
|
connection_pool (~> 2.2)
|
84
94
|
nkf (0.2.0)
|
85
|
-
nokogiri (1.18.
|
95
|
+
nokogiri (1.18.7-aarch64-linux-gnu)
|
86
96
|
racc (~> 1.4)
|
87
|
-
nokogiri (1.18.
|
97
|
+
nokogiri (1.18.7-aarch64-linux-musl)
|
88
98
|
racc (~> 1.4)
|
89
|
-
nokogiri (1.18.
|
99
|
+
nokogiri (1.18.7-arm-linux-gnu)
|
90
100
|
racc (~> 1.4)
|
91
|
-
nokogiri (1.18.
|
101
|
+
nokogiri (1.18.7-arm-linux-musl)
|
92
102
|
racc (~> 1.4)
|
93
|
-
nokogiri (1.18.
|
103
|
+
nokogiri (1.18.7-arm64-darwin)
|
94
104
|
racc (~> 1.4)
|
95
|
-
nokogiri (1.18.
|
105
|
+
nokogiri (1.18.7-x86_64-darwin)
|
96
106
|
racc (~> 1.4)
|
97
|
-
nokogiri (1.18.
|
107
|
+
nokogiri (1.18.7-x86_64-linux-gnu)
|
98
108
|
racc (~> 1.4)
|
99
|
-
nokogiri (1.18.
|
109
|
+
nokogiri (1.18.7-x86_64-linux-musl)
|
100
110
|
racc (~> 1.4)
|
111
|
+
parallel (1.27.0)
|
112
|
+
parser (3.3.8.0)
|
113
|
+
ast (~> 2.4.1)
|
114
|
+
racc
|
101
115
|
pp (0.6.2)
|
102
116
|
prettyprint
|
103
117
|
prettyprint (0.2.0)
|
118
|
+
prism (1.4.0)
|
104
119
|
psych (5.2.3)
|
105
120
|
date
|
106
121
|
stringio
|
107
122
|
public_suffix (6.0.1)
|
108
123
|
racc (1.8.1)
|
109
|
-
rack (3.1.
|
124
|
+
rack (3.1.13)
|
110
125
|
rack-protection (4.1.1)
|
111
126
|
base64 (>= 0.1.0)
|
112
127
|
logger (>= 1.6.0)
|
@@ -118,11 +133,12 @@ GEM
|
|
118
133
|
rack (>= 1.3)
|
119
134
|
rackup (2.2.1)
|
120
135
|
rack (>= 3)
|
136
|
+
rainbow (3.1.1)
|
121
137
|
rake (13.2.1)
|
122
|
-
rdoc (6.
|
138
|
+
rdoc (6.13.1)
|
123
139
|
psych (>= 4.0.0)
|
124
140
|
regexp_parser (2.10.0)
|
125
|
-
reline (0.6.
|
141
|
+
reline (0.6.1)
|
126
142
|
io-console (~> 0.5)
|
127
143
|
rspec (3.13.0)
|
128
144
|
rspec-core (~> 3.13.0)
|
@@ -144,6 +160,31 @@ GEM
|
|
144
160
|
diff-lcs (>= 1.2.0, < 2.0)
|
145
161
|
rspec-support (~> 3.13.0)
|
146
162
|
rspec-support (3.13.2)
|
163
|
+
rubocop (1.75.2)
|
164
|
+
json (~> 2.3)
|
165
|
+
language_server-protocol (~> 3.17.0.2)
|
166
|
+
lint_roller (~> 1.1.0)
|
167
|
+
parallel (~> 1.10)
|
168
|
+
parser (>= 3.3.0.2)
|
169
|
+
rainbow (>= 2.2.2, < 4.0)
|
170
|
+
regexp_parser (>= 2.9.3, < 3.0)
|
171
|
+
rubocop-ast (>= 1.44.0, < 2.0)
|
172
|
+
ruby-progressbar (~> 1.7)
|
173
|
+
unicode-display_width (>= 2.4.0, < 4.0)
|
174
|
+
rubocop-ast (1.44.1)
|
175
|
+
parser (>= 3.3.7.2)
|
176
|
+
prism (~> 1.4)
|
177
|
+
rubocop-performance (1.25.0)
|
178
|
+
lint_roller (~> 1.1)
|
179
|
+
rubocop (>= 1.75.0, < 2.0)
|
180
|
+
rubocop-ast (>= 1.38.0, < 2.0)
|
181
|
+
rubocop-rake (0.7.1)
|
182
|
+
lint_roller (~> 1.1)
|
183
|
+
rubocop (>= 1.72.1)
|
184
|
+
rubocop-rspec (3.5.0)
|
185
|
+
lint_roller (~> 1.1)
|
186
|
+
rubocop (~> 1.72, >= 1.72.1)
|
187
|
+
ruby-progressbar (1.13.0)
|
147
188
|
ruby2_keywords (0.0.5)
|
148
189
|
rubyntlm (0.6.5)
|
149
190
|
base64
|
@@ -156,13 +197,20 @@ GEM
|
|
156
197
|
rack-session (>= 2.0.0, < 3)
|
157
198
|
tilt (~> 2.0)
|
158
199
|
socksify (1.7.1)
|
159
|
-
stringio (3.1.
|
200
|
+
stringio (3.1.6)
|
160
201
|
tilt (2.6.0)
|
161
202
|
tor (0.1.7)
|
162
203
|
tzinfo (2.0.6)
|
163
204
|
concurrent-ruby (~> 1.0)
|
205
|
+
unicode-display_width (3.1.4)
|
206
|
+
unicode-emoji (~> 4.0, >= 4.0.4)
|
207
|
+
unicode-emoji (4.0.4)
|
164
208
|
webrick (1.9.1)
|
165
209
|
webrobots (0.1.2)
|
210
|
+
websocket-driver (0.7.7)
|
211
|
+
base64
|
212
|
+
websocket-extensions (>= 0.1.0)
|
213
|
+
websocket-extensions (0.1.5)
|
166
214
|
xpath (3.2.0)
|
167
215
|
nokogiri (~> 1.8)
|
168
216
|
|
@@ -177,14 +225,18 @@ PLATFORMS
|
|
177
225
|
x86_64-linux-musl
|
178
226
|
|
179
227
|
DEPENDENCIES
|
180
|
-
bundler (~> 2.0)
|
181
228
|
capybara_discoball
|
229
|
+
ferrum
|
182
230
|
ffaker
|
183
231
|
irb
|
184
232
|
rackup
|
185
233
|
rake (~> 13.0)
|
186
234
|
rspec (~> 3.0)
|
187
235
|
rspec-command
|
236
|
+
rubocop
|
237
|
+
rubocop-performance
|
238
|
+
rubocop-rake
|
239
|
+
rubocop-rspec
|
188
240
|
sinatra
|
189
241
|
spidy!
|
190
242
|
webrick
|
data/README.md
CHANGED
@@ -20,14 +20,73 @@ Or install it yourself as:
|
|
20
20
|
|
21
21
|
## Usage
|
22
22
|
|
23
|
-
###
|
23
|
+
### Connectors
|
24
24
|
|
25
|
-
|
25
|
+
Spidy supports different connectors for fetching web pages:
|
26
|
+
|
27
|
+
1. **HTML Connector (Mechanize)**: Default connector for regular HTTP requests and HTML parsing
|
28
|
+
2. **JSON Connector**: For parsing JSON APIs
|
29
|
+
3. **XML Connector**: For parsing XML responses
|
30
|
+
4. **Lightpanda Connector**: For JavaScript-rendered websites (uses Playwright)
|
31
|
+
|
32
|
+
#### Lightpanda Connector for JavaScript-Rendered Websites
|
33
|
+
|
34
|
+
The Lightpanda connector allows you to process JavaScript-rendered websites by connecting to a running lightpanda CDP server.
|
35
|
+
|
36
|
+
##### Prerequisites
|
37
|
+
|
38
|
+
1. Install the Playwright Ruby client:
|
39
|
+
|
40
|
+
```bash
|
41
|
+
$ gem install playwright-ruby-client
|
42
|
+
```
|
43
|
+
|
44
|
+
2. Start a lightpanda CDP server in a separate terminal:
|
45
|
+
|
46
|
+
```bash
|
47
|
+
$ lightpanda serve --host 127.0.0.1 --port 9222
|
48
|
+
```
|
49
|
+
|
50
|
+
##### Usage
|
51
|
+
|
52
|
+
```ruby
|
53
|
+
# Define a scraper with lightpanda support
|
54
|
+
scraper = Spidy.define do
|
55
|
+
# Use the :lightpanda connector for JavaScript-rendered sites
|
56
|
+
spider(as: :lightpanda) do |yielder, connector, url|
|
57
|
+
connector.call(url) do |page|
|
58
|
+
# Process the JavaScript-rendered page
|
59
|
+
# page is a Nokogiri-like object
|
60
|
+
yielder.call(page)
|
61
|
+
end
|
62
|
+
end
|
63
|
+
|
64
|
+
define(as: :html) do
|
65
|
+
let(:title, 'title')
|
66
|
+
# Extract content from JS-rendered page...
|
67
|
+
end
|
68
|
+
end
|
69
|
+
```
|
70
|
+
|
71
|
+
##### Configuration
|
72
|
+
|
73
|
+
You can customize the lightpanda CDP server connection using environment variables:
|
74
|
+
|
75
|
+
```bash
|
76
|
+
# Set custom host and port
|
77
|
+
$ LIGHTPANDA_HOST=192.168.1.100 LIGHTPANDA_PORT=9333 ruby your_script.rb
|
78
|
+
```
|
79
|
+
|
80
|
+
Check `example/playwright_example.rb` for a complete example.
|
81
|
+
|
82
|
+
### Command Line Usage
|
83
|
+
|
84
|
+
Create a definition file (e.g., website.rb):
|
26
85
|
```rb
|
27
|
-
Spidy.defin do
|
86
|
+
Spidy.define do
|
28
87
|
spider(as: :html) do |yielder, connector, url|
|
29
88
|
connector.call(url) do |html|
|
30
|
-
# html
|
89
|
+
# html is a Nokogiri object (from Mechanize)
|
31
90
|
yielder.call(url)
|
32
91
|
end
|
33
92
|
end
|
@@ -37,41 +96,61 @@ Spidy.defin do
|
|
37
96
|
end
|
38
97
|
end
|
39
98
|
```
|
99
|
+
|
100
|
+
Use it from the command line:
|
40
101
|
```bash
|
41
102
|
echo 'http://example.com' | spidy each website.rb > urls
|
42
103
|
cat urls | spidy call website.rb > website.json
|
43
|
-
#
|
104
|
+
# shorthand
|
44
105
|
echo 'http://example.com' | spidy each website.rb | spidy call website.rb | jq .
|
45
106
|
```
|
46
107
|
|
47
|
-
###
|
108
|
+
### Development Console
|
109
|
+
|
110
|
+
Start an interactive console with your definition:
|
48
111
|
```bash
|
49
112
|
spidy console website.rb
|
50
113
|
```
|
51
114
|
|
52
|
-
|
115
|
+
Reload your source code during development:
|
53
116
|
```
|
54
117
|
irb(#<Spidy::Console>)> reload!
|
55
118
|
```
|
56
119
|
|
120
|
+
Example console usage:
|
57
121
|
```rb
|
58
122
|
each('http://example.com') { |url| break url }
|
59
|
-
call('http://example.com') { |html| break html } # html
|
123
|
+
call('http://example.com') { |html| break html } # html is a Nokogiri object (from Mechanize)
|
60
124
|
```
|
61
125
|
|
62
|
-
###
|
126
|
+
### Ruby Code Usage
|
127
|
+
|
128
|
+
Create and use a scraper in your Ruby code:
|
63
129
|
```rb
|
64
|
-
|
65
|
-
#
|
130
|
+
scraper = Spidy.define do
|
131
|
+
# Implement spiders and scrapers
|
132
|
+
spider(as: :html) do |yielder, connector, url|
|
133
|
+
connector.call(url) do |page|
|
134
|
+
yielder.call(page)
|
135
|
+
end
|
136
|
+
end
|
137
|
+
|
138
|
+
define(as: :html) do
|
139
|
+
let(:title, 'title')
|
140
|
+
let(:links) { |doc| doc.css('a').map { |a| a['href'] } }
|
141
|
+
end
|
66
142
|
end
|
67
143
|
|
68
|
-
a
|
69
|
-
|
144
|
+
# Extract URLs from a site
|
145
|
+
scraper.each(url) do |page_url|
|
146
|
+
# Process each URL found
|
147
|
+
puts page_url
|
70
148
|
end
|
71
149
|
|
72
|
-
a
|
73
|
-
|
74
|
-
|
150
|
+
# Extract structured data from a site
|
151
|
+
result = scraper.call(url)
|
152
|
+
puts "Title: #{result[:title]}"
|
153
|
+
puts "Found #{result[:links].size} links"
|
75
154
|
```
|
76
155
|
|
77
156
|
## Development
|
data/Rakefile
CHANGED
data/example/check_ferrum.rb
ADDED
@@ -0,0 +1,114 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# This script checks what Ferrum API methods are available
|
4
|
+
begin
|
5
|
+
require 'ferrum'
|
6
|
+
puts 'Ferrum gem is loaded!'
|
7
|
+
|
8
|
+
# Check Ferrum version
|
9
|
+
puts "Ferrum version: #{begin
|
10
|
+
Ferrum::VERSION
|
11
|
+
rescue StandardError
|
12
|
+
'unknown'
|
13
|
+
end}"
|
14
|
+
|
15
|
+
# Try to create a browser instance
|
16
|
+
puts "\nTrying to create a browser instance..."
|
17
|
+
|
18
|
+
# Find Chrome executable path
|
19
|
+
def find_chrome_path
|
20
|
+
# Common locations on macOS
|
21
|
+
macos_paths = [
|
22
|
+
'/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
|
23
|
+
'/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary',
|
24
|
+
'/Applications/Chromium.app/Contents/MacOS/Chromium',
|
25
|
+
'/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge'
|
26
|
+
]
|
27
|
+
|
28
|
+
# Check macOS paths
|
29
|
+
macos_paths.each do |path|
|
30
|
+
return path if File.exist?(path)
|
31
|
+
end
|
32
|
+
|
33
|
+
# Try to locate Chrome using 'which' command
|
34
|
+
%w[google-chrome chromium chromium-browser].each do |browser|
|
35
|
+
path = `which #{browser} 2>/dev/null`.strip
|
36
|
+
return path if path != '' && File.exist?(path)
|
37
|
+
end
|
38
|
+
|
39
|
+
nil
|
40
|
+
end
|
41
|
+
|
42
|
+
# Get Chrome path
|
43
|
+
chrome_path = ENV['CHROME_PATH'] || find_chrome_path
|
44
|
+
if chrome_path
|
45
|
+
puts "Using Chrome executable: #{chrome_path}"
|
46
|
+
else
|
47
|
+
puts 'No Chrome executable found. Using default.'
|
48
|
+
end
|
49
|
+
|
50
|
+
# Create browser with options
|
51
|
+
options = {
|
52
|
+
headless: true,
|
53
|
+
window_size: [1280, 800]
|
54
|
+
}
|
55
|
+
|
56
|
+
# Add Chrome path if available
|
57
|
+
options[:browser_path] = chrome_path if chrome_path
|
58
|
+
|
59
|
+
browser = Ferrum::Browser.new(options)
|
60
|
+
puts 'Browser instance created successfully!'
|
61
|
+
|
62
|
+
# Check available methods on browser
|
63
|
+
puts "\nAvailable methods on browser object:"
|
64
|
+
browser_methods = (browser.methods - Object.methods).sort
|
65
|
+
puts browser_methods.join(', ')
|
66
|
+
|
67
|
+
# Check if headers method exists
|
68
|
+
puts "\nDoes browser respond to 'headers='? #{browser.respond_to?(:headers=)}"
|
69
|
+
|
70
|
+
# Check available methods on browser.network
|
71
|
+
if browser.respond_to?(:network)
|
72
|
+
puts "\nAvailable methods on browser.network object:"
|
73
|
+
network_methods = (browser.network.methods - Object.methods).sort
|
74
|
+
puts network_methods.join(', ')
|
75
|
+
|
76
|
+
# Check if wait_for_idle method exists and what parameters it accepts
|
77
|
+
if browser.network.respond_to?(:wait_for_idle)
|
78
|
+
puts "\nExamine wait_for_idle method:"
|
79
|
+
begin
|
80
|
+
# Try with timeout parameter
|
81
|
+
browser.network.wait_for_idle(timeout: 1)
|
82
|
+
puts 'wait_for_idle accepts timeout parameter'
|
83
|
+
rescue ArgumentError => e
|
84
|
+
puts "wait_for_idle does not accept timeout parameter: #{e.message}"
|
85
|
+
rescue StandardError => e
|
86
|
+
puts "Error calling wait_for_idle with timeout: #{e.message}"
|
87
|
+
end
|
88
|
+
else
|
89
|
+
puts "\nwait_for_idle method not available on network object"
|
90
|
+
end
|
91
|
+
else
|
92
|
+
puts "\nnetwork method not available on browser object"
|
93
|
+
end
|
94
|
+
|
95
|
+
# Test goto method
|
96
|
+
puts "\nTesting navigation with goto method:"
|
97
|
+
begin
|
98
|
+
browser.goto('https://example.com')
|
99
|
+
puts 'Navigation successful!'
|
100
|
+
puts "Page title: #{browser.title}"
|
101
|
+
rescue StandardError => e
|
102
|
+
puts "Error during navigation: #{e.message}"
|
103
|
+
end
|
104
|
+
|
105
|
+
# Clean up
|
106
|
+
browser.quit
|
107
|
+
puts "\nBrowser closed successfully"
|
108
|
+
rescue LoadError => e
|
109
|
+
puts "Error: Ferrum gem is not installed: #{e.message}"
|
110
|
+
puts 'Install it with: gem install ferrum'
|
111
|
+
rescue StandardError => e
|
112
|
+
puts "Error: #{e.message}"
|
113
|
+
puts e.backtrace.join("\n")
|
114
|
+
end
|
data/example/check_lightpanda.rb
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# Script to check if Lightpanda is running and start it if needed
|
4
|
+
require 'net/http'
|
5
|
+
|
6
|
+
def lightpanda_running?(host = '127.0.0.1', port = 9222)
|
7
|
+
uri = URI("http://#{host}:#{port}/json/version")
|
8
|
+
response = Net::HTTP.get_response(uri)
|
9
|
+
response.is_a?(Net::HTTPSuccess)
|
10
|
+
rescue StandardError
|
11
|
+
false
|
12
|
+
end
|
13
|
+
|
14
|
+
def start_lightpanda(host = '127.0.0.1', port = 9222)
|
15
|
+
puts 'Starting Lightpanda...'
|
16
|
+
|
17
|
+
# Build the command to start Lightpanda in the background
|
18
|
+
cmd = "/Users/aileron/bin/lightpanda serve --host #{host} --port #{port} > /tmp/lightpanda.log 2>&1 &"
|
19
|
+
|
20
|
+
# Execute the command
|
21
|
+
result = system(cmd)
|
22
|
+
|
23
|
+
if result
|
24
|
+
puts "Lightpanda started! Service should be available at http://#{host}:#{port}"
|
25
|
+
|
26
|
+
# Wait for it to be ready
|
27
|
+
10.times do
|
28
|
+
if lightpanda_running?(host, port)
|
29
|
+
puts 'Lightpanda is now running and accepting connections!'
|
30
|
+
return true
|
31
|
+
end
|
32
|
+
puts 'Waiting for Lightpanda to start...'
|
33
|
+
sleep 1
|
34
|
+
end
|
35
|
+
|
36
|
+
puts "Lightpanda might have started but isn't responding yet."
|
37
|
+
puts 'Check /tmp/lightpanda.log for details.'
|
38
|
+
else
|
39
|
+
puts 'Failed to start Lightpanda. Make sure the path is correct: /Users/aileron/bin/lightpanda'
|
40
|
+
end
|
41
|
+
false
|
42
|
+
end
|
43
|
+
|
44
|
+
# Main script
|
45
|
+
host = '127.0.0.1'
|
46
|
+
port = 9222
|
47
|
+
|
48
|
+
if lightpanda_running?(host, port)
|
49
|
+
puts "✅ Lightpanda is already running at http://#{host}:#{port}"
|
50
|
+
else
|
51
|
+
puts "❌ Lightpanda is not running at http://#{host}:#{port}"
|
52
|
+
|
53
|
+
if ARGV.include?('--start') || ARGV.include?('-s')
|
54
|
+
start_lightpanda(host, port)
|
55
|
+
else
|
56
|
+
puts 'Run this script with --start or -s option to start Lightpanda automatically:'
|
57
|
+
puts " #{$PROGRAM_NAME} --start"
|
58
|
+
end
|
59
|
+
end
|
data/example/connect_test.rb
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# Test connecting to existing Chrome instance at 127.0.0.1:9222
|
4
|
+
|
5
|
+
begin
|
6
|
+
require 'ferrum'
|
7
|
+
puts 'Successfully loaded Ferrum'
|
8
|
+
rescue LoadError => e
|
9
|
+
puts "Ferrum is not installed: #{e.message}"
|
10
|
+
puts "Run 'gem install ferrum' to install it"
|
11
|
+
exit 1
|
12
|
+
end
|
13
|
+
|
14
|
+
puts 'Testing connection to Chrome at 127.0.0.1:9222'
|
15
|
+
puts '=============================================='
|
16
|
+
|
17
|
+
begin
|
18
|
+
# Connect to the remote Chrome instance
|
19
|
+
# Note: We're setting process: false to prevent launching a new browser
|
20
|
+
browser = Ferrum::Browser.new(
|
21
|
+
url: 'http://127.0.0.1:9222',
|
22
|
+
process: false
|
23
|
+
)
|
24
|
+
|
25
|
+
# Access a test URL
|
26
|
+
url = 'https://example.com'
|
27
|
+
puts "Accessing: #{url}"
|
28
|
+
browser.goto(url)
|
29
|
+
|
30
|
+
# Get page title
|
31
|
+
title = browser.title
|
32
|
+
puts "Page title: #{title}"
|
33
|
+
|
34
|
+
# Clean up browser connection (but don't close Chrome)
|
35
|
+
browser.quit
|
36
|
+
|
37
|
+
puts "\nSuccess! Connected to Chrome at 127.0.0.1:9222"
|
38
|
+
rescue StandardError => e
|
39
|
+
puts "Error: #{e.message}"
|
40
|
+
puts e.backtrace.join("\n")
|
41
|
+
|
42
|
+
puts "\nTroubleshooting tips:"
|
43
|
+
puts '1. Make sure Chrome is running with remote debugging enabled'
|
44
|
+
puts '2. Verify the command: /Users/aileron/bin/lightpanda serve --host 127.0.0.1 --port 9222'
|
45
|
+
puts '3. Check if you can access http://127.0.0.1:9222/json/version in your browser'
|
46
|
+
end
|
47
|
+
|
48
|
+
puts "\nTest completed"
|