rubycrawl 0.1.4 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/rubycrawl.rb CHANGED
@@ -3,25 +3,26 @@
3
3
  require_relative 'rubycrawl/version'
4
4
  require_relative 'rubycrawl/errors'
5
5
  require_relative 'rubycrawl/helpers'
6
- require_relative 'rubycrawl/service_client'
6
+ require_relative 'rubycrawl/browser'
7
7
  require_relative 'rubycrawl/url_normalizer'
8
8
  require_relative 'rubycrawl/markdown_converter'
9
9
  require_relative 'rubycrawl/result'
10
10
  require_relative 'rubycrawl/site_crawler'
11
11
  require_relative 'rubycrawl/railtie' if defined?(Rails)
12
12
 
13
- # RubyCrawl provides a simple interface for crawling pages via a local Playwright service.
13
+ # RubyCrawl is a pure Ruby web crawler with full JavaScript rendering via Ferrum.
14
14
  class RubyCrawl
15
15
  include Helpers
16
16
 
17
- DEFAULT_HOST = '127.0.0.1'
18
- DEFAULT_PORT = 3344
19
-
20
17
  class << self
21
18
  def client
22
19
  @client ||= new
23
20
  end
24
21
 
22
+ # Crawl a single URL and return a Result.
23
+ # @param url [String]
24
+ # @param options [Hash] supported keys: wait_until:, block_resources:, max_attempts:
25
+ # @return [RubyCrawl::Result]
25
26
  def crawl(url, **options)
26
27
  client.crawl(url, **options)
27
28
  end
@@ -34,29 +35,17 @@ class RubyCrawl
34
35
  # @param max_depth [Integer] Maximum link depth from start URL (default: 3)
35
36
  # @param same_host_only [Boolean] Only follow links on the same host (default: true)
36
37
  # @yield [page] Yields each page result as it is crawled
37
- # @yieldparam page [SiteCrawler::PageResult] The crawled page result
38
+ # @yieldparam page [SiteCrawler::PageResult]
38
39
  # @return [Integer] Number of pages crawled
39
40
  #
40
- # @example Save pages to database
41
+ # @example
41
42
  # RubyCrawl.crawl_site("https://example.com", max_pages: 100) do |page|
42
- # Page.create!(url: page.url, html: page.html, depth: page.depth)
43
+ # Page.create!(url: page.url, content: page.clean_text, depth: page.depth)
43
44
  # end
44
45
  def crawl_site(url, ...)
45
46
  client.crawl_site(url, ...)
46
47
  end
47
48
 
48
- # Create a session for reusing browser context across multiple crawls.
49
- # @return [String] session_id
50
- def create_session
51
- client.create_session
52
- end
53
-
54
- # Destroy a session and close its browser context.
55
- # @param session_id [String]
56
- def destroy_session(session_id)
57
- client.destroy_session(session_id)
58
- end
59
-
60
49
  def configure(**options)
61
50
  @client = new(**options)
62
51
  end
@@ -64,104 +53,60 @@ class RubyCrawl
64
53
 
65
54
  def initialize(**options)
66
55
  load_options(options)
67
- build_service_client
56
+ @browser = Browser.new(
57
+ timeout: @timeout,
58
+ headless: @headless,
59
+ browser_options: @browser_options
60
+ )
68
61
  end
69
62
 
70
- def crawl(url, wait_until: @wait_until, block_resources: @block_resources, max_attempts: @max_attempts, session_id: nil)
63
+ def crawl(url, wait_until: @wait_until, block_resources: @block_resources, max_attempts: @max_attempts)
71
64
  validate_url!(url)
72
- @service_client.ensure_running
65
+ validate_wait_until!(wait_until)
73
66
  with_retries(max_attempts) do
74
- payload = build_payload(url, wait_until, block_resources, session_id)
75
- response = @service_client.post_json('/crawl', payload)
76
- raise_node_error!(response)
77
- build_result(response)
67
+ @browser.crawl(url, wait_until: wait_until, block_resources: block_resources)
78
68
  end
79
69
  end
80
70
 
81
- # Create a session for reusing browser context.
82
- # @return [String] session_id
83
- def create_session
84
- @service_client.ensure_running
85
- @service_client.create_session
86
- end
87
-
88
- # Destroy a session.
89
- # @param session_id [String]
90
- def destroy_session(session_id)
91
- @service_client.destroy_session(session_id)
92
- end
93
-
94
- # Crawl multiple pages starting from a URL, following links.
95
- # @see RubyCrawl.crawl_site
96
71
  def crawl_site(url, **options, &block)
97
- @service_client.ensure_running
98
72
  crawler_options = build_crawler_options(options)
99
- crawler = SiteCrawler.new(self, crawler_options)
100
- crawler.crawl(url, &block)
73
+ SiteCrawler.new(self, crawler_options).crawl(url, &block)
101
74
  end
102
75
 
103
76
  private
104
77
 
105
- def raise_node_error!(response)
106
- return unless response.is_a?(Hash) && response['error']
107
-
108
- error_code = response['error']
109
- error_message = response['message'] || error_code
110
- raise error_class_for(error_code), error_message_for(error_code, error_message)
78
+ def load_options(options)
79
+ @wait_until = options.fetch(:wait_until, nil)
80
+ @block_resources = options.fetch(:block_resources, nil)
81
+ @max_attempts = options.fetch(:max_attempts, 3)
82
+ @timeout = options.fetch(:timeout, 30)
83
+ @headless = options.fetch(:headless, true)
84
+ @browser_options = options.fetch(:browser_options, {})
111
85
  end
112
86
 
113
- def with_retries(retries)
87
+ def with_retries(max_attempts)
114
88
  attempt = 0
115
89
  begin
116
90
  yield
117
91
  rescue ServiceError, TimeoutError => e
118
92
  attempt += 1
119
- raise unless attempt < retries
93
+ raise unless attempt < max_attempts
120
94
 
121
- retry_with_backoff(attempt, retries, e)
95
+ backoff = 2**attempt
96
+ warn "[rubycrawl] Attempt #{attempt + 1}/#{max_attempts} failed, retrying in #{backoff}s: #{e.message}"
97
+ sleep(backoff)
122
98
  retry
123
99
  end
124
100
  end
125
101
 
126
- def load_options(options)
127
- @host = options.fetch(:host, DEFAULT_HOST)
128
- @port = Integer(options.fetch(:port, DEFAULT_PORT))
129
- @node_dir = options.fetch(:node_dir, default_node_dir)
130
- @node_bin = options.fetch(:node_bin, ENV.fetch('RUBYCRAWL_NODE_BIN', nil)) || 'node'
131
- @node_log = options.fetch(:node_log, ENV.fetch('RUBYCRAWL_NODE_LOG', nil))
132
- @wait_until = options.fetch(:wait_until, nil)
133
- @block_resources = options.fetch(:block_resources, nil)
134
- @max_attempts = options.fetch(:max_attempts, 3)
135
- end
136
-
137
- def build_service_client
138
- @service_client = ServiceClient.new(
139
- host: @host,
140
- port: @port,
141
- node_dir: @node_dir,
142
- node_bin: @node_bin,
143
- node_log: @node_log
144
- )
145
- end
146
-
147
- def retry_with_backoff(attempt, max_attempts, error)
148
- backoff_seconds = 2**attempt
149
- warn "[rubycrawl] Attempt #{attempt + 1}/#{max_attempts} failed, retrying in #{backoff_seconds}s: #{error.message}"
150
- sleep(backoff_seconds)
151
- end
152
-
153
102
  def build_crawler_options(options)
154
103
  {
155
- max_pages: options.fetch(:max_pages, 50),
156
- max_depth: options.fetch(:max_depth, 3),
157
- same_host_only: options.fetch(:same_host_only, true),
158
- wait_until: options.fetch(:wait_until, @wait_until),
104
+ max_pages: options.fetch(:max_pages, 50),
105
+ max_depth: options.fetch(:max_depth, 3),
106
+ same_host_only: options.fetch(:same_host_only, true),
107
+ wait_until: options.fetch(:wait_until, @wait_until),
159
108
  block_resources: options.fetch(:block_resources, @block_resources),
160
- max_attempts: options.fetch(:max_attempts, @max_attempts)
109
+ max_attempts: options.fetch(:max_attempts, @max_attempts)
161
110
  }
162
111
  end
163
-
164
- def default_node_dir
165
- File.expand_path('../node', __dir__)
166
- end
167
112
  end
data/rubycrawl.gemspec CHANGED
@@ -8,22 +8,21 @@ Gem::Specification.new do |spec|
8
8
  spec.authors = ['RubyCrawl contributors']
9
9
  spec.email = ['ganesh.navale@zohomail.in']
10
10
 
11
- spec.summary = 'Playwright-based web crawler for Ruby'
12
- spec.description = 'A Ruby-first web crawler that orchestrates a local Playwright service.'
11
+ spec.summary = 'Pure Ruby web crawler with full JavaScript rendering'
12
+ spec.description = 'rubycrawl uses Ferrum (Chrome DevTools Protocol) for JS rendering.'
13
13
  spec.homepage = 'https://github.com/craft-wise/rubycrawl'
14
14
  spec.license = 'MIT'
15
15
 
16
16
  spec.required_ruby_version = '>= 3.0'
17
17
 
18
18
  spec.files = Dir.glob('{lib}/**/*', File::FNM_DOTMATCH).reject { |f| File.directory?(f) }
19
- spec.files += Dir.glob('node/**/*', File::FNM_DOTMATCH).reject { |f| File.directory?(f) || f.include?('node_modules') }
20
19
  spec.files += %w[README.md LICENSE Rakefile rubycrawl.gemspec .rspec]
21
20
 
22
21
  spec.bindir = 'bin'
23
22
  spec.executables = []
24
23
  spec.require_paths = ['lib']
25
24
 
26
- # For HTML to Markdown conversion
25
+ spec.add_dependency 'ferrum', '~> 0.15'
27
26
  spec.add_dependency 'reverse_markdown', '~> 2.1'
28
27
 
29
28
  spec.metadata['rubygems_mfa_required'] = 'true'
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rubycrawl
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.4
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - RubyCrawl contributors
@@ -10,6 +10,20 @@ bindir: bin
10
10
  cert_chain: []
11
11
  date: 2026-03-16 00:00:00.000000000 Z
12
12
  dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: ferrum
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '0.15'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '0.15'
13
27
  - !ruby/object:Gem::Dependency
14
28
  name: reverse_markdown
15
29
  requirement: !ruby/object:Gem::Requirement
@@ -24,7 +38,7 @@ dependencies:
24
38
  - - "~>"
25
39
  - !ruby/object:Gem::Version
26
40
  version: '2.1'
27
- description: A Ruby-first web crawler that orchestrates a local Playwright service.
41
+ description: rubycrawl uses Ferrum (Chrome DevTools Protocol) for JS rendering.
28
42
  email:
29
43
  - ganesh.navale@zohomail.in
30
44
  executables: []
@@ -36,22 +50,17 @@ files:
36
50
  - README.md
37
51
  - Rakefile
38
52
  - lib/rubycrawl.rb
53
+ - lib/rubycrawl/browser.rb
54
+ - lib/rubycrawl/browser/extraction.rb
39
55
  - lib/rubycrawl/errors.rb
40
56
  - lib/rubycrawl/helpers.rb
41
57
  - lib/rubycrawl/markdown_converter.rb
42
58
  - lib/rubycrawl/railtie.rb
43
59
  - lib/rubycrawl/result.rb
44
- - lib/rubycrawl/service_client.rb
45
60
  - lib/rubycrawl/site_crawler.rb
46
61
  - lib/rubycrawl/tasks/install.rake
47
62
  - lib/rubycrawl/url_normalizer.rb
48
63
  - lib/rubycrawl/version.rb
49
- - node/.gitignore
50
- - node/.npmrc
51
- - node/README.md
52
- - node/package-lock.json
53
- - node/package.json
54
- - node/src/index.js
55
64
  - rubycrawl.gemspec
56
65
  homepage: https://github.com/craft-wise/rubycrawl
57
66
  licenses:
@@ -76,5 +85,5 @@ requirements: []
76
85
  rubygems_version: 3.5.22
77
86
  signing_key:
78
87
  specification_version: 4
79
- summary: Playwright-based web crawler for Ruby
88
+ summary: Pure Ruby web crawler with full JavaScript rendering
80
89
  test_files: []
@@ -1,108 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'json'
4
- require 'net/http'
5
- require 'uri'
6
-
7
- class RubyCrawl
8
- # Handles node service lifecycle and HTTP requests.
9
- class ServiceClient
10
- def initialize(host:, port:, node_dir:, node_bin:, node_log:)
11
- @host = host
12
- @port = Integer(port)
13
- @node_dir = node_dir
14
- @node_bin = node_bin
15
- @node_log = node_log
16
- @node_pid = nil
17
- end
18
-
19
- def ensure_running
20
- return if healthy?
21
-
22
- start_service
23
- wait_until_healthy
24
- end
25
-
26
- def post_json(path, body)
27
- uri = URI("http://#{@host}:#{@port}#{path}")
28
- request = build_request(uri, body)
29
- response = perform_request(uri, request)
30
- JSON.parse(response.body)
31
- rescue JSON::ParserError => e
32
- raise ServiceError, "Node service returned invalid JSON: #{e.message}"
33
- rescue Errno::ECONNREFUSED, Errno::ECONNRESET => e
34
- raise ServiceError, "Cannot connect to node service at #{uri}: #{e.message}"
35
- rescue Net::OpenTimeout, Net::ReadTimeout => e
36
- raise TimeoutError, "Request to node service timed out: #{e.message}"
37
- end
38
-
39
- # Create a session for reusing browser context across multiple crawls.
40
- # @return [String] session_id
41
- def create_session
42
- response = post_json('/session/create', {})
43
- raise ServiceError, "Failed to create session: #{response['error']}" if response['error']
44
-
45
- response['session_id']
46
- end
47
-
48
- # Destroy a session and close its browser context.
49
- # @param session_id [String]
50
- def destroy_session(session_id)
51
- post_json('/session/destroy', { session_id: session_id })
52
- rescue StandardError
53
- # Ignore errors on destroy - context may already be closed
54
- nil
55
- end
56
-
57
- private
58
-
59
- def build_request(uri, body)
60
- request = Net::HTTP::Post.new(uri)
61
- request['Content-Type'] = 'application/json'
62
- request.body = JSON.generate(body)
63
- request
64
- end
65
-
66
- def perform_request(uri, request)
67
- Net::HTTP.start(uri.host, uri.port, open_timeout: 5, read_timeout: 30) do |http|
68
- http.request(request)
69
- end
70
- end
71
-
72
- def start_service
73
- raise ServiceError, "rubycrawl node service directory not found: #{@node_dir}" unless Dir.exist?(@node_dir)
74
-
75
- env = { 'RUBYCRAWL_NODE_PORT' => @port.to_s }
76
- if @node_log
77
- out = File.open(@node_log, 'a')
78
- @node_pid = Process.spawn(env, @node_bin, 'src/index.js', chdir: @node_dir, out: out, err: out)
79
- out.close
80
- else
81
- @node_pid = Process.spawn(env, @node_bin, 'src/index.js', chdir: @node_dir, out: File::NULL, err: File::NULL)
82
- end
83
- Process.detach(@node_pid)
84
- end
85
-
86
- def wait_until_healthy(timeout: 5)
87
- deadline = Time.now + timeout
88
- until Time.now > deadline
89
- return true if healthy?
90
-
91
- sleep 0.2
92
- end
93
-
94
- raise ServiceError, "rubycrawl node service failed to start within #{timeout}s. " \
95
- "Check logs at #{@node_log || 'RUBYCRAWL_NODE_LOG'}"
96
- end
97
-
98
- def healthy?
99
- uri = URI("http://#{@host}:#{@port}/health")
100
- response = Net::HTTP.start(uri.host, uri.port, open_timeout: 1, read_timeout: 1) do |http|
101
- http.get(uri.request_uri)
102
- end
103
- response.is_a?(Net::HTTPSuccess)
104
- rescue StandardError
105
- false
106
- end
107
- end
108
- end
data/node/.gitignore DELETED
@@ -1,2 +0,0 @@
1
- /node_modules
2
- /.env
data/node/.npmrc DELETED
@@ -1 +0,0 @@
1
- fund=false
data/node/README.md DELETED
@@ -1,19 +0,0 @@
1
- # rubycrawl Node Service
2
-
3
- Local Playwright-backed HTTP service used by the Ruby gem.
4
-
5
- ## Run
6
-
7
- ```
8
- npm install
9
- npm start
10
- ```
11
-
12
- ## Environment
13
-
14
- Create a `.env` file (or copy from `.env.example`) if you need custom settings.
15
-
16
- ## Endpoints
17
-
18
- - `POST /crawl` JSON body: `{ "url": "https://example.com" }`
19
- - `GET /health`
@@ -1,72 +0,0 @@
1
- {
2
- "name": "rubycrawl-node",
3
- "version": "0.1.0",
4
- "lockfileVersion": 3,
5
- "requires": true,
6
- "packages": {
7
- "": {
8
- "name": "rubycrawl-node",
9
- "version": "0.1.0",
10
- "dependencies": {
11
- "dotenv": "^16.4.5",
12
- "playwright": "^1.41.0"
13
- }
14
- },
15
- "node_modules/dotenv": {
16
- "version": "16.6.1",
17
- "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.6.1.tgz",
18
- "integrity": "sha512-uBq4egWHTcTt33a72vpSG0z3HnPuIl6NqYcTrKEg2azoEyl2hpW0zqlxysq2pK9HlDIHyHyakeYaYnSAwd8bow==",
19
- "license": "BSD-2-Clause",
20
- "engines": {
21
- "node": ">=12"
22
- },
23
- "funding": {
24
- "url": "https://dotenvx.com"
25
- }
26
- },
27
- "node_modules/fsevents": {
28
- "version": "2.3.2",
29
- "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz",
30
- "integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==",
31
- "hasInstallScript": true,
32
- "license": "MIT",
33
- "optional": true,
34
- "os": [
35
- "darwin"
36
- ],
37
- "engines": {
38
- "node": "^8.16.0 || ^10.6.0 || >=11.0.0"
39
- }
40
- },
41
- "node_modules/playwright": {
42
- "version": "1.58.0",
43
- "resolved": "https://registry.npmjs.org/playwright/-/playwright-1.58.0.tgz",
44
- "integrity": "sha512-2SVA0sbPktiIY/MCOPX8e86ehA/e+tDNq+e5Y8qjKYti2Z/JG7xnronT/TXTIkKbYGWlCbuucZ6dziEgkoEjQQ==",
45
- "license": "Apache-2.0",
46
- "dependencies": {
47
- "playwright-core": "1.58.0"
48
- },
49
- "bin": {
50
- "playwright": "cli.js"
51
- },
52
- "engines": {
53
- "node": ">=18"
54
- },
55
- "optionalDependencies": {
56
- "fsevents": "2.3.2"
57
- }
58
- },
59
- "node_modules/playwright-core": {
60
- "version": "1.58.0",
61
- "resolved": "https://registry.npmjs.org/playwright-core/-/playwright-core-1.58.0.tgz",
62
- "integrity": "sha512-aaoB1RWrdNi3//rOeKuMiS65UCcgOVljU46At6eFcOFPFHWtd2weHRRow6z/n+Lec0Lvu0k9ZPKJSjPugikirw==",
63
- "license": "Apache-2.0",
64
- "bin": {
65
- "playwright-core": "cli.js"
66
- },
67
- "engines": {
68
- "node": ">=18"
69
- }
70
- }
71
- }
72
- }
data/node/package.json DELETED
@@ -1,14 +0,0 @@
1
- {
2
- "name": "rubycrawl-node",
3
- "version": "0.1.0",
4
- "private": true,
5
- "type": "module",
6
- "main": "src/index.js",
7
- "scripts": {
8
- "start": "node src/index.js"
9
- },
10
- "dependencies": {
11
- "dotenv": "^16.4.5",
12
- "playwright": "^1.41.0"
13
- }
14
- }