rubycrawl 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -17,14 +17,22 @@ class RubyCrawl
17
17
  if uri.host&.match?(/^(localhost|127\.|192\.168\.|10\.|172\.(1[6-9]|2[0-9]|3[01]))/)
18
18
  warn '[rubycrawl] Warning: Crawling internal/private IP addresses'
19
19
  end
20
- rescue URI::InvalidURIError => e
20
+ rescue URI::InvalidURIError, TypeError => e
21
21
  raise ConfigurationError, "Invalid URL: #{e.message}"
22
22
  end
23
23
 
24
- def build_payload(url, wait_until, block_resources)
24
+ VALID_WAIT_UNTIL = %w[load domcontentloaded networkidle commit].freeze
25
+
26
+ def build_payload(url, wait_until, block_resources, session_id = nil)
27
+ if wait_until && !VALID_WAIT_UNTIL.include?(wait_until.to_s)
28
+ raise ConfigurationError,
29
+ "Invalid wait_until: #{wait_until.inspect}. Must be one of: #{VALID_WAIT_UNTIL.join(', ')}"
30
+ end
31
+
25
32
  payload = { url: url }
26
33
  payload[:wait_until] = wait_until if wait_until
27
34
  payload[:block_resources] = block_resources unless block_resources.nil?
35
+ payload[:session_id] = session_id if session_id
28
36
  payload
29
37
  end
30
38
 
@@ -39,11 +47,9 @@ class RubyCrawl
39
47
 
40
48
  def error_class_for(error_code)
41
49
  case error_code
42
- when 'navigation_timeout', 'crawl_timeout'
43
- TimeoutError
44
- when 'navigation_failed', 'crawl_failed'
50
+ when 'crawl_failed'
45
51
  NavigationError
46
- when 'invalid_json', 'invalid_json_response'
52
+ when 'invalid_json', 'session_create_failed', 'session_destroy_failed'
47
53
  ServiceError
48
54
  else
49
55
  Error
@@ -52,12 +58,10 @@ class RubyCrawl
52
58
 
53
59
  def error_message_for(error_code, error_message)
54
60
  case error_code
55
- when 'navigation_timeout', 'crawl_timeout'
56
- "Crawl timeout: #{error_message}"
57
- when 'navigation_failed', 'crawl_failed'
61
+ when 'crawl_failed'
58
62
  "Navigation failed: #{error_message}"
59
- when 'invalid_json', 'invalid_json_response'
60
- "Node service returned invalid JSON: #{error_message}"
63
+ when 'invalid_json', 'session_create_failed', 'session_destroy_failed'
64
+ "Service error [#{error_code}]: #{error_message}"
61
65
  else
62
66
  "Crawl error [#{error_code}]: #{error_message}"
63
67
  end
@@ -15,10 +15,10 @@ class RubyCrawl
15
15
 
16
16
  # Convert HTML to Markdown with resolved URLs.
17
17
  #
18
- # @param html [String] The HTML content to convert
18
+ # @param html [String] The page HTML to convert
19
19
  # @param base_url [String, nil] Base URL to resolve relative URLs
20
- # @param options [Hash] Options for conversion
21
- # @return [String] The Markdown content with absolute URLs
20
+ # @param options [Hash] Options passed to ReverseMarkdown
21
+ # @return [String] Markdown content with absolute URLs
22
22
  def convert(html, base_url: nil, **options)
23
23
  return '' if html.nil? || html.empty?
24
24
 
@@ -1,38 +1,37 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  class RubyCrawl
4
- # Result object with lazy markdown conversion.
4
+ # Result object with lazy clean_markdown conversion.
5
5
  class Result
6
6
  attr_reader :text, :html, :links, :metadata
7
7
 
8
- def initialize(text:, html:, links:, metadata:, markdown: nil)
8
+ def initialize(text:, html:, links:, metadata:)
9
9
  @text = text
10
10
  @html = html
11
11
  @links = links
12
12
  @metadata = metadata
13
- @markdown = markdown unless markdown.to_s.empty?
14
13
  end
15
14
 
16
- # Returns markdown, converting from HTML lazily if needed.
15
+ # Returns clean markdown converted from the page HTML.
17
16
  # Relative URLs are resolved using the page's final_url.
18
17
  #
19
18
  # @return [String] Markdown content with absolute URLs
20
- def markdown
21
- @markdown ||= MarkdownConverter.convert(html, base_url: final_url)
19
+ def clean_markdown
20
+ @clean_markdown ||= MarkdownConverter.convert(html, base_url: final_url)
22
21
  end
23
22
 
24
23
  # The final URL after redirects.
25
24
  #
26
25
  # @return [String, nil]
27
26
  def final_url
28
- metadata['final_url'] || metadata[:final_url]
27
+ metadata['final_url']
29
28
  end
30
29
 
31
- # Check if markdown has been computed.
30
+ # Check if clean_markdown has been computed.
32
31
  #
33
32
  # @return [Boolean]
34
- def markdown?
35
- !@markdown.nil?
33
+ def clean_markdown?
34
+ !@clean_markdown.nil?
36
35
  end
37
36
 
38
37
  def to_h
@@ -41,7 +40,7 @@ class RubyCrawl
41
40
  html: html,
42
41
  links: links,
43
42
  metadata: metadata,
44
- markdown: markdown
43
+ clean_markdown: @clean_markdown
45
44
  }
46
45
  end
47
46
  end
@@ -36,6 +36,24 @@ class RubyCrawl
36
36
  raise TimeoutError, "Request to node service timed out: #{e.message}"
37
37
  end
38
38
 
39
+ # Create a session for reusing browser context across multiple crawls.
40
+ # @return [String] session_id
41
+ def create_session
42
+ response = post_json('/session/create', {})
43
+ raise ServiceError, "Failed to create session: #{response['error']}" if response['error']
44
+
45
+ response['session_id']
46
+ end
47
+
48
+ # Destroy a session and close its browser context.
49
+ # @param session_id [String]
50
+ def destroy_session(session_id)
51
+ post_json('/session/destroy', { session_id: session_id })
52
+ rescue StandardError
53
+ # Ignore errors on destroy - context may already be closed
54
+ nil
55
+ end
56
+
39
57
  private
40
58
 
41
59
  def build_request(uri, body)
@@ -55,9 +73,13 @@ class RubyCrawl
55
73
  raise ServiceError, "rubycrawl node service directory not found: #{@node_dir}" unless Dir.exist?(@node_dir)
56
74
 
57
75
  env = { 'RUBYCRAWL_NODE_PORT' => @port.to_s }
58
- out = @node_log ? File.open(@node_log, 'a') : File::NULL
59
- err = @node_log ? out : File::NULL
60
- @node_pid = Process.spawn(env, @node_bin, 'src/index.js', chdir: @node_dir, out: out, err: err)
76
+ if @node_log
77
+ out = File.open(@node_log, 'a')
78
+ @node_pid = Process.spawn(env, @node_bin, 'src/index.js', chdir: @node_dir, out: out, err: out)
79
+ out.close
80
+ else
81
+ @node_pid = Process.spawn(env, @node_bin, 'src/index.js', chdir: @node_dir, out: File::NULL, err: File::NULL)
82
+ end
61
83
  Process.detach(@node_pid)
62
84
  end
63
85
 
@@ -5,7 +5,7 @@ require 'set'
5
5
  class RubyCrawl
6
6
  # BFS crawler that follows links with deduplication.
7
7
  class SiteCrawler
8
- # Page result yielded to the block with lazy markdown.
8
+ # Page result yielded to the block with lazy clean_markdown.
9
9
  class PageResult
10
10
  attr_reader :url, :html, :links, :metadata, :depth
11
11
 
@@ -17,14 +17,15 @@ class RubyCrawl
17
17
  @depth = depth
18
18
  end
19
19
 
20
- # Lazy markdown conversion with resolved URLs.
21
- def markdown
22
- @markdown ||= MarkdownConverter.convert(html, base_url: final_url)
20
+ # Returns clean markdown converted from the page HTML.
21
+ # Relative URLs are resolved using the page's final_url.
22
+ def clean_markdown
23
+ @clean_markdown ||= MarkdownConverter.convert(html, base_url: final_url)
23
24
  end
24
25
 
25
26
  # The final URL after redirects.
26
27
  def final_url
27
- metadata['final_url'] || metadata[:final_url] || url
28
+ metadata['final_url'] || url
28
29
  end
29
30
  end
30
31
 
@@ -35,8 +36,10 @@ class RubyCrawl
35
36
  @same_host_only = options.fetch(:same_host_only, true)
36
37
  @wait_until = options.fetch(:wait_until, nil)
37
38
  @block_resources = options.fetch(:block_resources, nil)
39
+ @max_attempts = options.fetch(:max_attempts, nil)
38
40
  @visited = Set.new
39
41
  @queue = []
42
+ @session_id = nil
40
43
  end
41
44
 
42
45
  def crawl(start_url, &block)
@@ -46,8 +49,11 @@ class RubyCrawl
46
49
  raise ConfigurationError, "Invalid start URL: #{start_url}" unless normalized
47
50
 
48
51
  @base_url = normalized
52
+ @session_id = @client.create_session
49
53
  enqueue(normalized, 0)
50
54
  process_queue(&block)
55
+ ensure
56
+ @client.destroy_session(@session_id) if @session_id
51
57
  end
52
58
 
53
59
  private
@@ -77,7 +83,9 @@ class RubyCrawl
77
83
  end
78
84
 
79
85
  def crawl_page(url, depth)
80
- result = @client.crawl(url, wait_until: @wait_until, block_resources: @block_resources)
86
+ opts = { wait_until: @wait_until, block_resources: @block_resources, session_id: @session_id }
87
+ opts[:max_attempts] = @max_attempts if @max_attempts
88
+ result = @client.crawl(url, **opts)
81
89
  build_page_result(url, depth, result)
82
90
  rescue Error => e
83
91
  warn "[rubycrawl] Failed to crawl #{url}: #{e.message}"
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  class RubyCrawl
4
- VERSION = '0.1.3'
4
+ VERSION = '0.1.4'
5
5
  end
data/lib/rubycrawl.rb CHANGED
@@ -45,6 +45,18 @@ class RubyCrawl
45
45
  client.crawl_site(url, ...)
46
46
  end
47
47
 
48
+ # Create a session for reusing browser context across multiple crawls.
49
+ # @return [String] session_id
50
+ def create_session
51
+ client.create_session
52
+ end
53
+
54
+ # Destroy a session and close its browser context.
55
+ # @param session_id [String]
56
+ def destroy_session(session_id)
57
+ client.destroy_session(session_id)
58
+ end
59
+
48
60
  def configure(**options)
49
61
  @client = new(**options)
50
62
  end
@@ -55,17 +67,30 @@ class RubyCrawl
55
67
  build_service_client
56
68
  end
57
69
 
58
- def crawl(url, wait_until: @wait_until, block_resources: @block_resources, retries: @max_retries)
70
+ def crawl(url, wait_until: @wait_until, block_resources: @block_resources, max_attempts: @max_attempts, session_id: nil)
59
71
  validate_url!(url)
60
72
  @service_client.ensure_running
61
- with_retries(retries) do
62
- payload = build_payload(url, wait_until, block_resources)
73
+ with_retries(max_attempts) do
74
+ payload = build_payload(url, wait_until, block_resources, session_id)
63
75
  response = @service_client.post_json('/crawl', payload)
64
76
  raise_node_error!(response)
65
77
  build_result(response)
66
78
  end
67
79
  end
68
80
 
81
+ # Create a session for reusing browser context.
82
+ # @return [String] session_id
83
+ def create_session
84
+ @service_client.ensure_running
85
+ @service_client.create_session
86
+ end
87
+
88
+ # Destroy a session.
89
+ # @param session_id [String]
90
+ def destroy_session(session_id)
91
+ @service_client.destroy_session(session_id)
92
+ end
93
+
69
94
  # Crawl multiple pages starting from a URL, following links.
70
95
  # @see RubyCrawl.crawl_site
71
96
  def crawl_site(url, **options, &block)
@@ -106,7 +131,7 @@ class RubyCrawl
106
131
  @node_log = options.fetch(:node_log, ENV.fetch('RUBYCRAWL_NODE_LOG', nil))
107
132
  @wait_until = options.fetch(:wait_until, nil)
108
133
  @block_resources = options.fetch(:block_resources, nil)
109
- @max_retries = options.fetch(:max_retries, 3)
134
+ @max_attempts = options.fetch(:max_attempts, 3)
110
135
  end
111
136
 
112
137
  def build_service_client
@@ -119,9 +144,9 @@ class RubyCrawl
119
144
  )
120
145
  end
121
146
 
122
- def retry_with_backoff(attempt, retries, error)
147
+ def retry_with_backoff(attempt, max_attempts, error)
123
148
  backoff_seconds = 2**attempt
124
- warn "[rubycrawl] Retry #{attempt}/#{retries - 1} after #{backoff_seconds}s: #{error.message}"
149
+ warn "[rubycrawl] Attempt #{attempt + 1}/#{max_attempts} failed, retrying in #{backoff_seconds}s: #{error.message}"
125
150
  sleep(backoff_seconds)
126
151
  end
127
152
 
@@ -131,7 +156,8 @@ class RubyCrawl
131
156
  max_depth: options.fetch(:max_depth, 3),
132
157
  same_host_only: options.fetch(:same_host_only, true),
133
158
  wait_until: options.fetch(:wait_until, @wait_until),
134
- block_resources: options.fetch(:block_resources, @block_resources)
159
+ block_resources: options.fetch(:block_resources, @block_resources),
160
+ max_attempts: options.fetch(:max_attempts, @max_attempts)
135
161
  }
136
162
  end
137
163
 
data/node/.gitignore ADDED
@@ -0,0 +1,2 @@
1
+ /node_modules
2
+ /.env
data/node/.npmrc ADDED
@@ -0,0 +1 @@
1
+ fund=false
data/node/README.md ADDED
@@ -0,0 +1,19 @@
1
+ # rubycrawl Node Service
2
+
3
+ Local Playwright-backed HTTP service used by the Ruby gem.
4
+
5
+ ## Run
6
+
7
+ ```
8
+ npm install
9
+ npm start
10
+ ```
11
+
12
+ ## Environment
13
+
14
+ Create a `.env` file (or copy from `.env.example`) if you need custom settings.
15
+
16
+ ## Endpoints
17
+
18
+ - `POST /crawl` JSON body: `{ "url": "https://example.com" }`
19
+ - `GET /health`
@@ -0,0 +1,72 @@
1
+ {
2
+ "name": "rubycrawl-node",
3
+ "version": "0.1.0",
4
+ "lockfileVersion": 3,
5
+ "requires": true,
6
+ "packages": {
7
+ "": {
8
+ "name": "rubycrawl-node",
9
+ "version": "0.1.0",
10
+ "dependencies": {
11
+ "dotenv": "^16.4.5",
12
+ "playwright": "^1.41.0"
13
+ }
14
+ },
15
+ "node_modules/dotenv": {
16
+ "version": "16.6.1",
17
+ "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.6.1.tgz",
18
+ "integrity": "sha512-uBq4egWHTcTt33a72vpSG0z3HnPuIl6NqYcTrKEg2azoEyl2hpW0zqlxysq2pK9HlDIHyHyakeYaYnSAwd8bow==",
19
+ "license": "BSD-2-Clause",
20
+ "engines": {
21
+ "node": ">=12"
22
+ },
23
+ "funding": {
24
+ "url": "https://dotenvx.com"
25
+ }
26
+ },
27
+ "node_modules/fsevents": {
28
+ "version": "2.3.2",
29
+ "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz",
30
+ "integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==",
31
+ "hasInstallScript": true,
32
+ "license": "MIT",
33
+ "optional": true,
34
+ "os": [
35
+ "darwin"
36
+ ],
37
+ "engines": {
38
+ "node": "^8.16.0 || ^10.6.0 || >=11.0.0"
39
+ }
40
+ },
41
+ "node_modules/playwright": {
42
+ "version": "1.58.0",
43
+ "resolved": "https://registry.npmjs.org/playwright/-/playwright-1.58.0.tgz",
44
+ "integrity": "sha512-2SVA0sbPktiIY/MCOPX8e86ehA/e+tDNq+e5Y8qjKYti2Z/JG7xnronT/TXTIkKbYGWlCbuucZ6dziEgkoEjQQ==",
45
+ "license": "Apache-2.0",
46
+ "dependencies": {
47
+ "playwright-core": "1.58.0"
48
+ },
49
+ "bin": {
50
+ "playwright": "cli.js"
51
+ },
52
+ "engines": {
53
+ "node": ">=18"
54
+ },
55
+ "optionalDependencies": {
56
+ "fsevents": "2.3.2"
57
+ }
58
+ },
59
+ "node_modules/playwright-core": {
60
+ "version": "1.58.0",
61
+ "resolved": "https://registry.npmjs.org/playwright-core/-/playwright-core-1.58.0.tgz",
62
+ "integrity": "sha512-aaoB1RWrdNi3//rOeKuMiS65UCcgOVljU46At6eFcOFPFHWtd2weHRRow6z/n+Lec0Lvu0k9ZPKJSjPugikirw==",
63
+ "license": "Apache-2.0",
64
+ "bin": {
65
+ "playwright-core": "cli.js"
66
+ },
67
+ "engines": {
68
+ "node": ">=18"
69
+ }
70
+ }
71
+ }
72
+ }
data/node/package.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "name": "rubycrawl-node",
3
+ "version": "0.1.0",
4
+ "private": true,
5
+ "type": "module",
6
+ "main": "src/index.js",
7
+ "scripts": {
8
+ "start": "node src/index.js"
9
+ },
10
+ "dependencies": {
11
+ "dotenv": "^16.4.5",
12
+ "playwright": "^1.41.0"
13
+ }
14
+ }