rubycrawl 0.1.3 → 0.2.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
data/lib/rubycrawl.rb CHANGED
@@ -3,25 +3,26 @@
3
3
  require_relative 'rubycrawl/version'
4
4
  require_relative 'rubycrawl/errors'
5
5
  require_relative 'rubycrawl/helpers'
6
- require_relative 'rubycrawl/service_client'
6
+ require_relative 'rubycrawl/browser'
7
7
  require_relative 'rubycrawl/url_normalizer'
8
8
  require_relative 'rubycrawl/markdown_converter'
9
9
  require_relative 'rubycrawl/result'
10
10
  require_relative 'rubycrawl/site_crawler'
11
11
  require_relative 'rubycrawl/railtie' if defined?(Rails)
12
12
 
13
- # RubyCrawl provides a simple interface for crawling pages via a local Playwright service.
13
+ # RubyCrawl is a pure Ruby web crawler with full JavaScript rendering via Ferrum.
14
14
  class RubyCrawl
15
15
  include Helpers
16
16
 
17
- DEFAULT_HOST = '127.0.0.1'
18
- DEFAULT_PORT = 3344
19
-
20
17
  class << self
21
18
  def client
22
19
  @client ||= new
23
20
  end
24
21
 
22
+ # Crawl a single URL and return a Result.
23
+ # @param url [String]
24
+ # @param options [Hash] wait_until:, block_resources:, max_attempts:
25
+ # @return [RubyCrawl::Result]
25
26
  def crawl(url, **options)
26
27
  client.crawl(url, **options)
27
28
  end
@@ -34,12 +35,12 @@ class RubyCrawl
34
35
  # @param max_depth [Integer] Maximum link depth from start URL (default: 3)
35
36
  # @param same_host_only [Boolean] Only follow links on the same host (default: true)
36
37
  # @yield [page] Yields each page result as it is crawled
37
- # @yieldparam page [SiteCrawler::PageResult] The crawled page result
38
+ # @yieldparam page [SiteCrawler::PageResult]
38
39
  # @return [Integer] Number of pages crawled
39
40
  #
40
- # @example Save pages to database
41
+ # @example
41
42
  # RubyCrawl.crawl_site("https://example.com", max_pages: 100) do |page|
42
- # Page.create!(url: page.url, html: page.html, depth: page.depth)
43
+ # Page.create!(url: page.url, content: page.clean_text, depth: page.depth)
43
44
  # end
44
45
  def crawl_site(url, ...)
45
46
  client.crawl_site(url, ...)
@@ -52,90 +53,60 @@ class RubyCrawl
52
53
 
53
54
  def initialize(**options)
54
55
  load_options(options)
55
- build_service_client
56
+ @browser = Browser.new(
57
+ timeout: @timeout,
58
+ headless: @headless,
59
+ browser_options: @browser_options
60
+ )
56
61
  end
57
62
 
58
- def crawl(url, wait_until: @wait_until, block_resources: @block_resources, retries: @max_retries)
63
+ def crawl(url, wait_until: @wait_until, block_resources: @block_resources, max_attempts: @max_attempts)
59
64
  validate_url!(url)
60
- @service_client.ensure_running
61
- with_retries(retries) do
62
- payload = build_payload(url, wait_until, block_resources)
63
- response = @service_client.post_json('/crawl', payload)
64
- raise_node_error!(response)
65
- build_result(response)
65
+ validate_wait_until!(wait_until)
66
+ with_retries(max_attempts) do
67
+ @browser.crawl(url, wait_until: wait_until, block_resources: block_resources)
66
68
  end
67
69
  end
68
70
 
69
- # Crawl multiple pages starting from a URL, following links.
70
- # @see RubyCrawl.crawl_site
71
71
  def crawl_site(url, **options, &block)
72
- @service_client.ensure_running
73
72
  crawler_options = build_crawler_options(options)
74
- crawler = SiteCrawler.new(self, crawler_options)
75
- crawler.crawl(url, &block)
73
+ SiteCrawler.new(self, crawler_options).crawl(url, &block)
76
74
  end
77
75
 
78
76
  private
79
77
 
80
- def raise_node_error!(response)
81
- return unless response.is_a?(Hash) && response['error']
82
-
83
- error_code = response['error']
84
- error_message = response['message'] || error_code
85
- raise error_class_for(error_code), error_message_for(error_code, error_message)
78
+ def load_options(options)
79
+ @wait_until = options.fetch(:wait_until, nil)
80
+ @block_resources = options.fetch(:block_resources, nil)
81
+ @max_attempts = options.fetch(:max_attempts, 3)
82
+ @timeout = options.fetch(:timeout, 30)
83
+ @headless = options.fetch(:headless, true)
84
+ @browser_options = options.fetch(:browser_options, {})
86
85
  end
87
86
 
88
- def with_retries(retries)
87
+ def with_retries(max_attempts)
89
88
  attempt = 0
90
89
  begin
91
90
  yield
92
91
  rescue ServiceError, TimeoutError => e
93
92
  attempt += 1
94
- raise unless attempt < retries
93
+ raise unless attempt < max_attempts
95
94
 
96
- retry_with_backoff(attempt, retries, e)
95
+ backoff = 2**attempt
96
+ warn "[rubycrawl] Attempt #{attempt + 1}/#{max_attempts} failed, retrying in #{backoff}s: #{e.message}"
97
+ sleep(backoff)
97
98
  retry
98
99
  end
99
100
  end
100
101
 
101
- def load_options(options)
102
- @host = options.fetch(:host, DEFAULT_HOST)
103
- @port = Integer(options.fetch(:port, DEFAULT_PORT))
104
- @node_dir = options.fetch(:node_dir, default_node_dir)
105
- @node_bin = options.fetch(:node_bin, ENV.fetch('RUBYCRAWL_NODE_BIN', nil)) || 'node'
106
- @node_log = options.fetch(:node_log, ENV.fetch('RUBYCRAWL_NODE_LOG', nil))
107
- @wait_until = options.fetch(:wait_until, nil)
108
- @block_resources = options.fetch(:block_resources, nil)
109
- @max_retries = options.fetch(:max_retries, 3)
110
- end
111
-
112
- def build_service_client
113
- @service_client = ServiceClient.new(
114
- host: @host,
115
- port: @port,
116
- node_dir: @node_dir,
117
- node_bin: @node_bin,
118
- node_log: @node_log
119
- )
120
- end
121
-
122
- def retry_with_backoff(attempt, retries, error)
123
- backoff_seconds = 2**attempt
124
- warn "[rubycrawl] Retry #{attempt}/#{retries - 1} after #{backoff_seconds}s: #{error.message}"
125
- sleep(backoff_seconds)
126
- end
127
-
128
102
  def build_crawler_options(options)
129
103
  {
130
- max_pages: options.fetch(:max_pages, 50),
131
- max_depth: options.fetch(:max_depth, 3),
132
- same_host_only: options.fetch(:same_host_only, true),
133
- wait_until: options.fetch(:wait_until, @wait_until),
134
- block_resources: options.fetch(:block_resources, @block_resources)
104
+ max_pages: options.fetch(:max_pages, 50),
105
+ max_depth: options.fetch(:max_depth, 3),
106
+ same_host_only: options.fetch(:same_host_only, true),
107
+ wait_until: options.fetch(:wait_until, @wait_until),
108
+ block_resources: options.fetch(:block_resources, @block_resources),
109
+ max_attempts: options.fetch(:max_attempts, @max_attempts)
135
110
  }
136
111
  end
137
-
138
- def default_node_dir
139
- File.expand_path('../node', __dir__)
140
- end
141
112
  end
data/rubycrawl.gemspec CHANGED
@@ -8,21 +8,21 @@ Gem::Specification.new do |spec|
8
8
  spec.authors = ['RubyCrawl contributors']
9
9
  spec.email = ['ganesh.navale@zohomail.in']
10
10
 
11
- spec.summary = 'Playwright-based web crawler for Ruby'
12
- spec.description = 'A Ruby-first web crawler that orchestrates a local Playwright service.'
11
+ spec.summary = 'Pure Ruby web crawler with full JavaScript rendering'
12
+ spec.description = 'rubycrawl uses Ferrum (Chrome DevTools Protocol) for JS rendering.'
13
13
  spec.homepage = 'https://github.com/craft-wise/rubycrawl'
14
14
  spec.license = 'MIT'
15
15
 
16
16
  spec.required_ruby_version = '>= 3.0'
17
17
 
18
- spec.files = Dir.glob('{lib}/**/*', File::FNM_DOTMATCH).reject { |f| File.directory?(f) }
19
- spec.files += %w[README.md LICENSE Gemfile Rakefile rubycrawl.gemspec .rspec]
18
+ spec.files = Dir.glob('{lib}/**/*', File::FNM_DOTMATCH).reject { |f| File.directory?(f) }
19
+ spec.files += %w[README.md LICENSE Rakefile rubycrawl.gemspec .rspec]
20
20
 
21
21
  spec.bindir = 'bin'
22
22
  spec.executables = []
23
23
  spec.require_paths = ['lib']
24
24
 
25
- # For HTML to Markdown conversion
25
+ spec.add_dependency 'ferrum', '~> 0.15'
26
26
  spec.add_dependency 'reverse_markdown', '~> 2.1'
27
27
 
28
28
  spec.metadata['rubygems_mfa_required'] = 'true'
metadata CHANGED
@@ -1,15 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rubycrawl
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - RubyCrawl contributors
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2026-01-26 00:00:00.000000000 Z
11
+ date: 2026-03-16 00:00:00.000000000 Z
12
12
  dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: ferrum
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '0.15'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '0.15'
13
27
  - !ruby/object:Gem::Dependency
14
28
  name: reverse_markdown
15
29
  requirement: !ruby/object:Gem::Requirement
@@ -24,7 +38,7 @@ dependencies:
24
38
  - - "~>"
25
39
  - !ruby/object:Gem::Version
26
40
  version: '2.1'
27
- description: A Ruby-first web crawler that orchestrates a local Playwright service.
41
+ description: rubycrawl uses Ferrum (Chrome DevTools Protocol) for JS rendering.
28
42
  email:
29
43
  - ganesh.navale@zohomail.in
30
44
  executables: []
@@ -32,17 +46,17 @@ extensions: []
32
46
  extra_rdoc_files: []
33
47
  files:
34
48
  - ".rspec"
35
- - Gemfile
36
49
  - LICENSE
37
50
  - README.md
38
51
  - Rakefile
39
52
  - lib/rubycrawl.rb
53
+ - lib/rubycrawl/browser.rb
54
+ - lib/rubycrawl/browser/extraction.rb
40
55
  - lib/rubycrawl/errors.rb
41
56
  - lib/rubycrawl/helpers.rb
42
57
  - lib/rubycrawl/markdown_converter.rb
43
58
  - lib/rubycrawl/railtie.rb
44
59
  - lib/rubycrawl/result.rb
45
- - lib/rubycrawl/service_client.rb
46
60
  - lib/rubycrawl/site_crawler.rb
47
61
  - lib/rubycrawl/tasks/install.rake
48
62
  - lib/rubycrawl/url_normalizer.rb
@@ -71,5 +85,5 @@ requirements: []
71
85
  rubygems_version: 3.5.22
72
86
  signing_key:
73
87
  specification_version: 4
74
- summary: Playwright-based web crawler for Ruby
88
+ summary: Pure Ruby web crawler with full JavaScript rendering
75
89
  test_files: []
data/Gemfile DELETED
@@ -1,11 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- source 'https://rubygems.org'
4
-
5
- gemspec
6
-
7
- group :development do
8
- gem 'rake', '>= 13.0'
9
- gem 'rspec', '>= 3.12'
10
- gem 'rubocop', '>= 1.50'
11
- end
@@ -1,86 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'json'
4
- require 'net/http'
5
- require 'uri'
6
-
7
- class RubyCrawl
8
- # Handles node service lifecycle and HTTP requests.
9
- class ServiceClient
10
- def initialize(host:, port:, node_dir:, node_bin:, node_log:)
11
- @host = host
12
- @port = Integer(port)
13
- @node_dir = node_dir
14
- @node_bin = node_bin
15
- @node_log = node_log
16
- @node_pid = nil
17
- end
18
-
19
- def ensure_running
20
- return if healthy?
21
-
22
- start_service
23
- wait_until_healthy
24
- end
25
-
26
- def post_json(path, body)
27
- uri = URI("http://#{@host}:#{@port}#{path}")
28
- request = build_request(uri, body)
29
- response = perform_request(uri, request)
30
- JSON.parse(response.body)
31
- rescue JSON::ParserError => e
32
- raise ServiceError, "Node service returned invalid JSON: #{e.message}"
33
- rescue Errno::ECONNREFUSED, Errno::ECONNRESET => e
34
- raise ServiceError, "Cannot connect to node service at #{uri}: #{e.message}"
35
- rescue Net::OpenTimeout, Net::ReadTimeout => e
36
- raise TimeoutError, "Request to node service timed out: #{e.message}"
37
- end
38
-
39
- private
40
-
41
- def build_request(uri, body)
42
- request = Net::HTTP::Post.new(uri)
43
- request['Content-Type'] = 'application/json'
44
- request.body = JSON.generate(body)
45
- request
46
- end
47
-
48
- def perform_request(uri, request)
49
- Net::HTTP.start(uri.host, uri.port, open_timeout: 5, read_timeout: 30) do |http|
50
- http.request(request)
51
- end
52
- end
53
-
54
- def start_service
55
- raise ServiceError, "rubycrawl node service directory not found: #{@node_dir}" unless Dir.exist?(@node_dir)
56
-
57
- env = { 'RUBYCRAWL_NODE_PORT' => @port.to_s }
58
- out = @node_log ? File.open(@node_log, 'a') : File::NULL
59
- err = @node_log ? out : File::NULL
60
- @node_pid = Process.spawn(env, @node_bin, 'src/index.js', chdir: @node_dir, out: out, err: err)
61
- Process.detach(@node_pid)
62
- end
63
-
64
- def wait_until_healthy(timeout: 5)
65
- deadline = Time.now + timeout
66
- until Time.now > deadline
67
- return true if healthy?
68
-
69
- sleep 0.2
70
- end
71
-
72
- raise ServiceError, "rubycrawl node service failed to start within #{timeout}s. " \
73
- "Check logs at #{@node_log || 'RUBYCRAWL_NODE_LOG'}"
74
- end
75
-
76
- def healthy?
77
- uri = URI("http://#{@host}:#{@port}/health")
78
- response = Net::HTTP.start(uri.host, uri.port, open_timeout: 1, read_timeout: 1) do |http|
79
- http.get(uri.request_uri)
80
- end
81
- response.is_a?(Net::HTTPSuccess)
82
- rescue StandardError
83
- false
84
- end
85
- end
86
- end