spidy 0.3.12 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,161 @@
1
#
# Connector for JavaScript-rendered pages, driven over the Chrome DevTools
# Protocol (CDP) through the Ferrum gem.
#
# NOTE(review): despite the class name, the current implementation launches a
# local headless Chrome/Chromium through Ferrum. +host+ and +port+ are stored
# and exposed via readers, but nothing in this class uses them to attach to a
# running Lightpanda CDP server yet.
#
class Spidy::Connector::Lightpanda
  include Spidy::Connector::StaticAccessor

  attr_reader :user_agent, :host, :port

  DEFAULT_HOST = '127.0.0.1'.freeze
  DEFAULT_PORT = 9222

  # user_agent - User-Agent string handed over by the connector factory.
  #              It is stored but not applied to the browser (see #fetch_with_ferrum).
  # host       - CDP host; falls back to ENV['LIGHTPANDA_HOST'], then DEFAULT_HOST.
  # port       - CDP port; falls back to ENV['LIGHTPANDA_PORT'], then DEFAULT_PORT.
  #              Always exposed as an Integer, whatever the source was.
  #
  # Raises RuntimeError when the optional ferrum gem is not installed.
  def initialize(user_agent:, host: nil, port: nil)
    # ferrum is an optional dependency, so it is loaded lazily on first use.
    begin
      require 'ferrum'
    rescue LoadError
      raise 'Ferrum gem is required. Please install with: gem install ferrum'
    end

    @user_agent = user_agent
    @host = host || ENV['LIGHTPANDA_HOST'] || DEFAULT_HOST
    @port = normalize_port(port || ENV['LIGHTPANDA_PORT'])
  end

  # Renders +url+ in the browser and yields a LightpandaPage wrapping the
  # resulting HTML. Returns whatever the block returns.
  #
  # Raises RuntimeError when +url+ is blank.
  def call(url)
    fail 'url is not specified' if url.blank?

    # Definition files may hand over URLs with surrounding whitespace/newlines.
    target_url = url.to_s.strip
    debug_log("Processing URL: #{target_url}")

    yield(fetch_with_ferrum(target_url))
  end

  # Part of the connector interface; this connector keeps no state between
  # calls, so there is nothing to refresh.
  def refresh!; end

  private

  # Prints +message+ to stdout only when the DEBUG environment variable is set.
  def debug_log(message)
    puts message if ENV['DEBUG']
  end

  # Coerces a port given as Integer or numeric String (ENV values are always
  # Strings) into an Integer. Missing or non-numeric values fall back to
  # DEFAULT_PORT.
  def normalize_port(value)
    Integer(value.to_s, 10, exception: false) || DEFAULT_PORT
  end

  # Waits until the page has no pending network activity.
  # First tries the call with a timeout keyword; if that signature is rejected
  # with ArgumentError, retries without it.
  def wait_for_network_idle(browser)
    debug_log('Waiting for network idle...')
    browser.network.wait_for_idle(timeout: 15)
  rescue ArgumentError
    wait_for_network_idle_without_timeout(browser)
  end

  # Fallback for Ferrum versions whose wait_for_idle takes no timeout keyword.
  # Any failure here is deliberately swallowed and replaced by a fixed pause,
  # so a page can still be read on a best-effort basis.
  def wait_for_network_idle_without_timeout(browser)
    browser.network.wait_for_idle
  rescue StandardError => e
    debug_log("Warning: Could not wait for network idle: #{e.message}")
    sleep 5
  end

  # Navigates +browser+ to +url+, waits for the network to settle and returns
  # the rendered HTML as reported by the browser.
  def navigate_and_get_content(browser, url)
    debug_log("Navigating to: #{url}")
    browser.goto(url)

    wait_for_network_idle(browser)

    debug_log('Getting page content...')
    browser.body
  end

  # Builds the option hash passed to Ferrum::Browser.new.
  # CHROME_PATH is honoured only when it points to an existing file.
  def create_browser_options
    options = {
      headless: true,
      timeout: 20,
      process_timeout: 20,
      window_size: [1280, 800]
    }

    chrome_path = ENV['CHROME_PATH']
    options[:browser_path] = chrome_path if chrome_path && File.exist?(chrome_path)

    options
  end

  # Starts a browser, yields it and guarantees it is shut down afterwards,
  # whether the block succeeds or raises. Errors are logged (in DEBUG mode)
  # and re-raised unchanged. Returns the block's value.
  def with_browser
    browser = Ferrum::Browser.new(create_browser_options)
    yield(browser)
  rescue StandardError => e
    debug_log("Error during page navigation: #{e.class} - #{e.message}")
    raise
  ensure
    # browser is nil when Ferrum::Browser.new itself failed.
    browser&.quit
  end

  # Renders +url+ in a throw-away browser and wraps the HTML in a LightpandaPage.
  def fetch_with_ferrum(url)
    debug_log('Using direct Chrome/Chromium instead of Lightpanda')

    html_content = with_browser do |browser|
      # The configured user agent is intentionally not applied to the browser.
      if @user_agent.present?
        debug_log('User agent setting will be skipped - not supported in your Ferrum version')
      end

      navigate_and_get_content(browser, url)
    end

    LightpandaPage.new(url, html_content)
  end

  #
  # Page-like object exposing the subset of the Mechanize::Page interface used
  # by spider definitions, backed by a Nokogiri HTML document.
  #
  class LightpandaPage
    attr_reader :uri, :body, :title, :code, :response_code

    # url          - the URL that was rendered (stored as given, not parsed).
    # html_content - rendered HTML source.
    def initialize(url, html_content)
      @uri = url
      @body = html_content
      @doc = Nokogiri::HTML(html_content)
      @title = @doc.title
      # The real HTTP status is not captured; a successful render is reported as 200.
      @code = '200'
      @response_code = '200'
    end

    # Node lookups are delegated to the parsed document with all arguments
    # (including blocks and keywords) passed through untouched.
    def search(...) = @doc.search(...)

    def at(...) = @doc.at(...)

    def css(...) = @doc.css(...)

    def xpath(...) = @doc.xpath(...)

    # Encoding reported by the parsed document.
    def encoding
      @doc.encoding
    end

    # Same contract as ActiveSupport's Object#try:
    #   page.try(:title)        # calls the method only if the page responds to it
    #   page.try { |p| p.body } # yields the page
    # Returns nil when the page does not respond to the requested method.
    def try(*args, &block)
      if args.empty? && block
        block.arity.zero? ? instance_eval(&block) : yield(self)
      elsif respond_to?(args.first)
        public_send(*args, &block)
      end
    end
  end
end
@@ -1,20 +1,18 @@
1
- # frozen_string_literal: true
2
-
3
1
  #
4
2
  # xml
5
3
  #
6
4
  class Spidy::Connector::Xml
7
5
  include Spidy::Connector::StaticAccessor
8
6
 
9
- def call(url, &block)
7
+ def call(url, &)
10
8
  fail 'URL is undefined' if url.blank?
11
9
 
12
- connect(url, &block)
10
+ connect(url, &)
13
11
  end
14
12
 
15
- def connect(url, &block)
13
+ def connect(url)
16
14
  OpenURI.open_uri(url, 'User-Agent' => @user_agent) do |body|
17
- block.call Nokogiri::XML(body.read.gsub(/[\x00-\x09\x0B\x0C\x0E-\x1F\x7F]/, ''), url)
15
+ yield Nokogiri::XML(body.read.gsub(/[\x00-\x09\x0B\x0C\x0E-\x1F\x7F]/, ''), url)
18
16
  end
19
17
  rescue OpenURI::HTTPError => e
20
18
  raise Spidy::Connector::Retry.new(error: e, response_code: e.io.status[0])
@@ -1,5 +1,3 @@
1
- # frozen_string_literal: true
2
-
3
1
  #
4
2
  # This class is responsible for actually making a network connection and downloading hypertext
5
3
  #
@@ -9,6 +7,7 @@ module Spidy::Connector
9
7
  autoload :Html
10
8
  autoload :Json
11
9
  autoload :Xml
10
+ autoload :Lightpanda
12
11
 
13
12
  DEFAULT_WAIT_TIME = 5
14
13
 
@@ -35,9 +34,9 @@ module Spidy::Connector
35
34
  module StaticAccessor
36
35
  extend ActiveSupport::Concern
37
36
  class_methods do
38
- def call(url, wait_time: 5, logger: Spidy::Connector::DEFAULT_LOGGER, user_agent: Spidy::Connector::USER_AGENT, &block)
37
+ def call(url, wait_time: 5, logger: Spidy::Connector::DEFAULT_LOGGER, user_agent: Spidy::Connector::USER_AGENT, &)
39
38
  ::Spidy::Connector::RetryableCaller.new(new(user_agent: user_agent), wait_time: wait_time, logger: logger).call(
40
- url, &block
39
+ url, &
41
40
  )
42
41
  end
43
42
  end
@@ -75,9 +74,9 @@ module Spidy::Connector
75
74
  connect(url, &block)
76
75
  end
77
76
 
78
- def connect(url, retry_attempt_count: @retry_attempt_count, &block)
77
+ def connect(url, retry_attempt_count: @retry_attempt_count, &)
79
78
  logger.call('connnector.get': url, 'connnector.accessed': Time.current)
80
- origin_connector.call(url, &block)
79
+ origin_connector.call(url, &)
81
80
  rescue Spidy::Connector::Retry => e
82
81
  logger.call('retry.accessed': Time.current,
83
82
  'retry.uri': url,
@@ -105,9 +104,9 @@ module Spidy::Connector
105
104
  @socks_proxy = socks_proxy
106
105
  end
107
106
 
108
- def call(url, &block)
107
+ def call(url, &)
109
108
  Socksify.proxy(socks_proxy[:host], socks_proxy[:port]) do
110
- connector.call(url, &block)
109
+ connector.call(url, &)
111
110
  end
112
111
  end
113
112
 
data/lib/spidy/console.rb CHANGED
@@ -1,5 +1,3 @@
1
- # frozen_string_literal: true
2
-
3
1
  #
4
2
  # spidy console
5
3
  #
@@ -1,5 +1,3 @@
1
- # frozen_string_literal: true
2
-
3
1
  #
4
2
  # Class representing a website defined by DSL
5
3
  #
@@ -42,12 +40,12 @@ module Spidy::Definition
42
40
  end
43
41
  end
44
42
 
45
- def spider(name = :default, connector: nil, as: nil, &define_block)
43
+ def spider(name = :default, connector: nil, as: nil)
46
44
  @namespace ||= {}
47
45
  connector = Spidy::Connector.get(connector || as, wait_time: @wait_time, user_agent: @user_agent,
48
46
  socks_proxy: @socks_proxy)
49
47
  @namespace[:"#{name}_spider"] = proc do |source, &yielder|
50
- define_block.call(yielder, connector, source)
48
+ yield(yielder, connector, source)
51
49
  end
52
50
  end
53
51
 
@@ -1,5 +1,3 @@
1
- # frozen_string_literal: true
2
-
3
1
  #
4
2
  # spidy interface binding
5
3
  #
@@ -1,5 +1,3 @@
1
- # frozen_string_literal: true
2
-
3
1
  #
4
2
  # An object that represents the scraper defined by define block.
5
3
  #
data/lib/spidy/shell.rb CHANGED
@@ -1,5 +1,3 @@
1
- # frozen_string_literal: true
2
-
3
1
  #
4
2
  # spidy Shell
5
3
  #
@@ -9,7 +7,12 @@ class Spidy::Shell
9
7
  end
10
8
 
11
9
  def interactive
12
- Pry.start(Spidy::Console.new(@definition_file))
10
+ console = Spidy::Console.new(@definition_file)
11
+ require 'irb'
12
+ IRB.setup(nil)
13
+ irb = IRB::Irb.new(IRB::WorkSpace.new(console))
14
+ IRB.conf[:MAIN_CONTEXT] = irb.context
15
+ irb.eval_input
13
16
  end
14
17
 
15
18
  def command_line
data/lib/spidy/spider.rb CHANGED
@@ -1,11 +1,9 @@
1
- # frozen_string_literal: true
2
-
3
1
  #
4
2
  # Spider
5
3
  #
6
4
  class Spidy::Spider
7
- def initialize(&block)
8
- define_singleton_method(:bind, &block)
5
+ def initialize(&)
6
+ define_singleton_method(:bind, &)
9
7
  end
10
8
 
11
9
  def call(resource)
data/lib/spidy/version.rb CHANGED
@@ -1,5 +1,3 @@
1
- # frozen_string_literal: true
2
-
3
1
  module Spidy
4
- VERSION = '0.3.12'
2
+ VERSION = '1.0.0'.freeze
5
3
  end
data/lib/spidy.rb CHANGED
@@ -1,6 +1,4 @@
1
- # frozen_string_literal: true
2
-
3
- require 'spidy/version'
1
+ require_relative 'spidy/version'
4
2
  require 'active_support/all'
5
3
  require 'mechanize'
6
4
  require 'open-uri'
@@ -29,11 +27,11 @@ module Spidy
29
27
  Spidy::DefinitionFile.open(filepath).spidy
30
28
  end
31
29
 
32
- def self.define(&block)
30
+ def self.define(&)
33
31
  spidy = Module.new do
34
32
  class_eval do
35
33
  extend ::Spidy::Definition
36
- module_eval(&block)
34
+ module_eval(&)
37
35
  end
38
36
  end
39
37
  spidy.instance_eval do
data/spidy.gemspec CHANGED
@@ -1,5 +1,3 @@
1
- # frozen_string_literal: true
2
-
3
1
  lib = File.expand_path('lib', __dir__)
4
2
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
5
3
  require 'spidy/version'
@@ -23,21 +21,10 @@ Gem::Specification.new do |spec|
23
21
  spec.bindir = 'exe'
24
22
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
25
23
  spec.require_paths = ['lib']
26
-
27
- spec.add_development_dependency 'bundler', '~> 2.0'
28
- spec.add_development_dependency 'capybara_discoball'
29
- spec.add_development_dependency 'ffaker'
30
- spec.add_development_dependency 'pry'
31
- spec.add_development_dependency 'rake', '~> 13.0'
32
- spec.add_development_dependency 'rspec', '~> 3.0'
33
- spec.add_development_dependency 'rspec-command'
34
- spec.add_development_dependency 'sinatra'
35
-
36
- spec.add_runtime_dependency 'activesupport'
37
- spec.add_runtime_dependency 'mechanize'
38
- spec.add_runtime_dependency 'pry'
39
- spec.add_runtime_dependency 'socksify'
40
- spec.add_runtime_dependency 'tor'
24
+ spec.add_dependency 'activesupport', '~> 7.1'
25
+ spec.add_dependency 'mechanize'
26
+ spec.add_dependency 'socksify'
27
+ spec.add_dependency 'tor'
41
28
  spec.metadata = {
42
29
  'rubygems_mfa_required' => 'true'
43
30
  }
metadata CHANGED
@@ -1,141 +1,28 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spidy
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.12
4
+ version: 1.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - aileron
8
- autorequire:
9
8
  bindir: exe
10
9
  cert_chain: []
11
- date: 2022-02-09 00:00:00.000000000 Z
10
+ date: 2025-04-15 00:00:00.000000000 Z
12
11
  dependencies:
13
- - !ruby/object:Gem::Dependency
14
- name: bundler
15
- requirement: !ruby/object:Gem::Requirement
16
- requirements:
17
- - - "~>"
18
- - !ruby/object:Gem::Version
19
- version: '2.0'
20
- type: :development
21
- prerelease: false
22
- version_requirements: !ruby/object:Gem::Requirement
23
- requirements:
24
- - - "~>"
25
- - !ruby/object:Gem::Version
26
- version: '2.0'
27
- - !ruby/object:Gem::Dependency
28
- name: capybara_discoball
29
- requirement: !ruby/object:Gem::Requirement
30
- requirements:
31
- - - ">="
32
- - !ruby/object:Gem::Version
33
- version: '0'
34
- type: :development
35
- prerelease: false
36
- version_requirements: !ruby/object:Gem::Requirement
37
- requirements:
38
- - - ">="
39
- - !ruby/object:Gem::Version
40
- version: '0'
41
- - !ruby/object:Gem::Dependency
42
- name: ffaker
43
- requirement: !ruby/object:Gem::Requirement
44
- requirements:
45
- - - ">="
46
- - !ruby/object:Gem::Version
47
- version: '0'
48
- type: :development
49
- prerelease: false
50
- version_requirements: !ruby/object:Gem::Requirement
51
- requirements:
52
- - - ">="
53
- - !ruby/object:Gem::Version
54
- version: '0'
55
- - !ruby/object:Gem::Dependency
56
- name: pry
57
- requirement: !ruby/object:Gem::Requirement
58
- requirements:
59
- - - ">="
60
- - !ruby/object:Gem::Version
61
- version: '0'
62
- type: :development
63
- prerelease: false
64
- version_requirements: !ruby/object:Gem::Requirement
65
- requirements:
66
- - - ">="
67
- - !ruby/object:Gem::Version
68
- version: '0'
69
- - !ruby/object:Gem::Dependency
70
- name: rake
71
- requirement: !ruby/object:Gem::Requirement
72
- requirements:
73
- - - "~>"
74
- - !ruby/object:Gem::Version
75
- version: '13.0'
76
- type: :development
77
- prerelease: false
78
- version_requirements: !ruby/object:Gem::Requirement
79
- requirements:
80
- - - "~>"
81
- - !ruby/object:Gem::Version
82
- version: '13.0'
83
- - !ruby/object:Gem::Dependency
84
- name: rspec
85
- requirement: !ruby/object:Gem::Requirement
86
- requirements:
87
- - - "~>"
88
- - !ruby/object:Gem::Version
89
- version: '3.0'
90
- type: :development
91
- prerelease: false
92
- version_requirements: !ruby/object:Gem::Requirement
93
- requirements:
94
- - - "~>"
95
- - !ruby/object:Gem::Version
96
- version: '3.0'
97
- - !ruby/object:Gem::Dependency
98
- name: rspec-command
99
- requirement: !ruby/object:Gem::Requirement
100
- requirements:
101
- - - ">="
102
- - !ruby/object:Gem::Version
103
- version: '0'
104
- type: :development
105
- prerelease: false
106
- version_requirements: !ruby/object:Gem::Requirement
107
- requirements:
108
- - - ">="
109
- - !ruby/object:Gem::Version
110
- version: '0'
111
- - !ruby/object:Gem::Dependency
112
- name: sinatra
113
- requirement: !ruby/object:Gem::Requirement
114
- requirements:
115
- - - ">="
116
- - !ruby/object:Gem::Version
117
- version: '0'
118
- type: :development
119
- prerelease: false
120
- version_requirements: !ruby/object:Gem::Requirement
121
- requirements:
122
- - - ">="
123
- - !ruby/object:Gem::Version
124
- version: '0'
125
12
  - !ruby/object:Gem::Dependency
126
13
  name: activesupport
127
14
  requirement: !ruby/object:Gem::Requirement
128
15
  requirements:
129
- - - ">="
16
+ - - "~>"
130
17
  - !ruby/object:Gem::Version
131
- version: '0'
18
+ version: '7.1'
132
19
  type: :runtime
133
20
  prerelease: false
134
21
  version_requirements: !ruby/object:Gem::Requirement
135
22
  requirements:
136
- - - ">="
23
+ - - "~>"
137
24
  - !ruby/object:Gem::Version
138
- version: '0'
25
+ version: '7.1'
139
26
  - !ruby/object:Gem::Dependency
140
27
  name: mechanize
141
28
  requirement: !ruby/object:Gem::Requirement
@@ -150,20 +37,6 @@ dependencies:
150
37
  - - ">="
151
38
  - !ruby/object:Gem::Version
152
39
  version: '0'
153
- - !ruby/object:Gem::Dependency
154
- name: pry
155
- requirement: !ruby/object:Gem::Requirement
156
- requirements:
157
- - - ">="
158
- - !ruby/object:Gem::Version
159
- version: '0'
160
- type: :runtime
161
- prerelease: false
162
- version_requirements: !ruby/object:Gem::Requirement
163
- requirements:
164
- - - ">="
165
- - !ruby/object:Gem::Version
166
- version: '0'
167
40
  - !ruby/object:Gem::Dependency
168
41
  name: socksify
169
42
  requirement: !ruby/object:Gem::Requirement
@@ -192,7 +65,6 @@ dependencies:
192
65
  - - ">="
193
66
  - !ruby/object:Gem::Version
194
67
  version: '0'
195
- description:
196
68
  email:
197
69
  - aileron.cc@gmail.com
198
70
  executables:
@@ -203,10 +75,10 @@ files:
203
75
  - ".gitignore"
204
76
  - ".rspec"
205
77
  - ".rubocop.yml"
206
- - ".rubocop_todo.yml"
207
78
  - ".ruby-version"
208
79
  - ".travis.yml"
209
80
  - CHANGELOG.md
81
+ - CLAUDE.md
210
82
  - CODE_OF_CONDUCT.md
211
83
  - Gemfile
212
84
  - Gemfile.lock
@@ -215,9 +87,16 @@ files:
215
87
  - Rakefile
216
88
  - bin/console
217
89
  - bin/setup
90
+ - example/check_ferrum.rb
91
+ - example/check_lightpanda.rb
92
+ - example/connect_test.rb
93
+ - example/lightpanda_links.rb
218
94
  - example/master_detail.rb
219
95
  - example/proxy.rb
220
96
  - example/retry.rb
97
+ - example/run_with_lightpanda.rb
98
+ - example/simple_test.rb
99
+ - example/test_lightpanda.rb
221
100
  - example/wikip.rb
222
101
  - exe/spidy
223
102
  - lib/spidy.rb
@@ -231,6 +110,7 @@ files:
231
110
  - lib/spidy/connector/direct.rb
232
111
  - lib/spidy/connector/html.rb
233
112
  - lib/spidy/connector/json.rb
113
+ - lib/spidy/connector/lightpanda.rb
234
114
  - lib/spidy/connector/xml.rb
235
115
  - lib/spidy/console.rb
236
116
  - lib/spidy/definition.rb
@@ -246,7 +126,6 @@ licenses:
246
126
  - MIT
247
127
  metadata:
248
128
  rubygems_mfa_required: 'true'
249
- post_install_message:
250
129
  rdoc_options: []
251
130
  require_paths:
252
131
  - lib
@@ -261,8 +140,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
261
140
  - !ruby/object:Gem::Version
262
141
  version: '0'
263
142
  requirements: []
264
- rubygems_version: 3.2.22
265
- signing_key:
143
+ rubygems_version: 3.6.6
266
144
  specification_version: 4
267
145
  summary: web spider dsl
268
146
  test_files: []
data/.rubocop_todo.yml DELETED
@@ -1,13 +0,0 @@
1
- # This configuration was generated by
2
- # `rubocop --auto-gen-config`
3
- # on 2019-03-29 18:00:03 +0900 using RuboCop version 0.66.0.
4
- # The point is for the user to remove these configuration records
5
- # one by one as the offenses are removed from the code base.
6
- # Note that changes in the inspected code, or installation of new
7
- # versions of RuboCop, may require this file to be generated again.
8
-
9
- # Offense count: 7
10
- # Configuration parameters: AllowHeredoc, AllowURI, URISchemes, IgnoreCopDirectives, IgnoredPatterns.
11
- # URISchemes: http, https
12
- Metrics/LineLength:
13
- Max: 96