RubyGems - rubycrawl - Versions diffs - 0.1.3 → 0.1.4 - Mend

rubycrawl 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18)

checksums.yaml +4 -4
data/README.md +427 -210
data/lib/rubycrawl/helpers.rb +15 -11
data/lib/rubycrawl/markdown_converter.rb +3 -3
data/lib/rubycrawl/result.rb +10 -11
data/lib/rubycrawl/service_client.rb +25 -3
data/lib/rubycrawl/site_crawler.rb +14 -6
data/lib/rubycrawl/version.rb +1 -1
data/lib/rubycrawl.rb +33 -7
data/node/.gitignore +2 -0
data/node/.npmrc +1 -0
data/node/README.md +19 -0
data/node/package-lock.json +72 -0
data/node/package.json +14 -0
data/node/src/index.js +389 -0
data/rubycrawl.gemspec +3 -2
metadata +8 -3
data/Gemfile +0 -11

data/lib/rubycrawl/helpers.rb CHANGED Viewed

@@ -17,14 +17,22 @@ class RubyCrawl
       if uri.host&.match?(/^(localhost|127\.|192\.168\.|10\.|172\.(1[6-9]|2[0-9]|3[01]))/)
         warn '[rubycrawl] Warning: Crawling internal/private IP addresses'
       end
-    rescue URI::InvalidURIError => e
+    rescue URI::InvalidURIError, TypeError => e
       raise ConfigurationError, "Invalid URL: #{e.message}"
     end
-    def build_payload(url, wait_until, block_resources)
+    VALID_WAIT_UNTIL = %w[load domcontentloaded networkidle commit].freeze
+    def build_payload(url, wait_until, block_resources, session_id = nil)
+      if wait_until && !VALID_WAIT_UNTIL.include?(wait_until.to_s)
+        raise ConfigurationError,
+              "Invalid wait_until: #{wait_until.inspect}. Must be one of: #{VALID_WAIT_UNTIL.join(', ')}"
+      end
       payload = { url: url }
       payload[:wait_until] = wait_until if wait_until
       payload[:block_resources] = block_resources unless block_resources.nil?
+      payload[:session_id] = session_id if session_id
       payload
     end
@@ -39,11 +47,9 @@ class RubyCrawl
     def error_class_for(error_code)
       case error_code
-      when 'navigation_timeout', 'crawl_timeout'
-        TimeoutError
-      when 'navigation_failed', 'crawl_failed'
+      when 'crawl_failed'
         NavigationError
-      when 'invalid_json', 'invalid_json_response'
+      when 'invalid_json', 'session_create_failed', 'session_destroy_failed'
         ServiceError
       else
         Error
@@ -52,12 +58,10 @@ class RubyCrawl
     def error_message_for(error_code, error_message)
       case error_code
-      when 'navigation_timeout', 'crawl_timeout'
-        "Crawl timeout: #{error_message}"
-      when 'navigation_failed', 'crawl_failed'
+      when 'crawl_failed'
         "Navigation failed: #{error_message}"
-      when 'invalid_json', 'invalid_json_response'
-        "Node service returned invalid JSON: #{error_message}"
+      when 'invalid_json', 'session_create_failed', 'session_destroy_failed'
+        "Service error [#{error_code}]: #{error_message}"
       else
         "Crawl error [#{error_code}]: #{error_message}"
       end

data/lib/rubycrawl/markdown_converter.rb CHANGED Viewed

@@ -15,10 +15,10 @@ class RubyCrawl
     # Convert HTML to Markdown with resolved URLs.
     #
-    # @param html [String] The HTML content to convert
+    # @param html [String] The page HTML to convert
     # @param base_url [String, nil] Base URL to resolve relative URLs
-    # @param options [Hash] Options for conversion
-    # @return [String] The Markdown content with absolute URLs
+    # @param options [Hash] Options passed to ReverseMarkdown
+    # @return [String] Markdown content with absolute URLs
     def convert(html, base_url: nil, **options)
       return '' if html.nil? || html.empty?

data/lib/rubycrawl/result.rb CHANGED Viewed

@@ -1,38 +1,37 @@
 # frozen_string_literal: true
 class RubyCrawl
-  # Result object with lazy markdown conversion.
+  # Result object with lazy clean_markdown conversion.
   class Result
     attr_reader :text, :html, :links, :metadata
-    def initialize(text:, html:, links:, metadata:, markdown: nil)
+    def initialize(text:, html:, links:, metadata:)
       @text = text
       @html = html
       @links = links
       @metadata = metadata
-      @markdown = markdown unless markdown.to_s.empty?
     end
-    # Returns markdown, converting from HTML lazily if needed.
+    # Returns clean markdown converted from the page HTML.
     # Relative URLs are resolved using the page's final_url.
     #
     # @return [String] Markdown content with absolute URLs
-    def markdown
-      @markdown ||= MarkdownConverter.convert(html, base_url: final_url)
+    def clean_markdown
+      @clean_markdown ||= MarkdownConverter.convert(html, base_url: final_url)
     end
     # The final URL after redirects.
     #
     # @return [String, nil]
     def final_url
-      metadata['final_url'] || metadata[:final_url]
+      metadata['final_url']
     end
-    # Check if markdown has been computed.
+    # Check if clean_markdown has been computed.
     #
     # @return [Boolean]
-    def markdown?
-      !@markdown.nil?
+    def clean_markdown?
+      !@clean_markdown.nil?
     end
     def to_h
@@ -41,7 +40,7 @@ class RubyCrawl
         html: html,
         links: links,
         metadata: metadata,
-        markdown: markdown
+        clean_markdown: @clean_markdown
       }
     end
   end

data/lib/rubycrawl/service_client.rb CHANGED Viewed

@@ -36,6 +36,24 @@ class RubyCrawl
       raise TimeoutError, "Request to node service timed out: #{e.message}"
     end
+    # Create a session for reusing browser context across multiple crawls.
+    # @return [String] session_id
+    def create_session
+      response = post_json('/session/create', {})
+      raise ServiceError, "Failed to create session: #{response['error']}" if response['error']
+      response['session_id']
+    end
+    # Destroy a session and close its browser context.
+    # @param session_id [String]
+    def destroy_session(session_id)
+      post_json('/session/destroy', { session_id: session_id })
+    rescue StandardError
+      # Ignore errors on destroy - context may already be closed
+      nil
+    end
     private
     def build_request(uri, body)
@@ -55,9 +73,13 @@ class RubyCrawl
       raise ServiceError, "rubycrawl node service directory not found: #{@node_dir}" unless Dir.exist?(@node_dir)
       env = { 'RUBYCRAWL_NODE_PORT' => @port.to_s }
-      out = @node_log ? File.open(@node_log, 'a') : File::NULL
-      err = @node_log ? out : File::NULL
-      @node_pid = Process.spawn(env, @node_bin, 'src/index.js', chdir: @node_dir, out: out, err: err)
+      if @node_log
+        out = File.open(@node_log, 'a')
+        @node_pid = Process.spawn(env, @node_bin, 'src/index.js', chdir: @node_dir, out: out, err: out)
+        out.close
+      else
+        @node_pid = Process.spawn(env, @node_bin, 'src/index.js', chdir: @node_dir, out: File::NULL, err: File::NULL)
+      end
       Process.detach(@node_pid)
     end

data/lib/rubycrawl/site_crawler.rb CHANGED Viewed

@@ -5,7 +5,7 @@ require 'set'
 class RubyCrawl
   # BFS crawler that follows links with deduplication.
   class SiteCrawler
-    # Page result yielded to the block with lazy markdown.
+    # Page result yielded to the block with lazy clean_markdown.
     class PageResult
       attr_reader :url, :html, :links, :metadata, :depth
@@ -17,14 +17,15 @@ class RubyCrawl
         @depth = depth
       end
-      # Lazy markdown conversion with resolved URLs.
-      def markdown
-        @markdown ||= MarkdownConverter.convert(html, base_url: final_url)
+      # Returns clean markdown converted from the page HTML.
+      # Relative URLs are resolved using the page's final_url.
+      def clean_markdown
+        @clean_markdown ||= MarkdownConverter.convert(html, base_url: final_url)
       end
       # The final URL after redirects.
       def final_url
-        metadata['final_url'] || metadata[:final_url] || url
+        metadata['final_url'] || url
       end
     end
@@ -35,8 +36,10 @@ class RubyCrawl
       @same_host_only = options.fetch(:same_host_only, true)
       @wait_until = options.fetch(:wait_until, nil)
       @block_resources = options.fetch(:block_resources, nil)
+      @max_attempts = options.fetch(:max_attempts, nil)
       @visited = Set.new
       @queue = []
+      @session_id = nil
     end
     def crawl(start_url, &block)
@@ -46,8 +49,11 @@ class RubyCrawl
       raise ConfigurationError, "Invalid start URL: #{start_url}" unless normalized
       @base_url = normalized
+      @session_id = @client.create_session
       enqueue(normalized, 0)
       process_queue(&block)
+    ensure
+      @client.destroy_session(@session_id) if @session_id
     end
     private
@@ -77,7 +83,9 @@ class RubyCrawl
     end
     def crawl_page(url, depth)
-      result = @client.crawl(url, wait_until: @wait_until, block_resources: @block_resources)
+      opts = { wait_until: @wait_until, block_resources: @block_resources, session_id: @session_id }
+      opts[:max_attempts] = @max_attempts if @max_attempts
+      result = @client.crawl(url, **opts)
       build_page_result(url, depth, result)
     rescue Error => e
       warn "[rubycrawl] Failed to crawl #{url}: #{e.message}"

data/lib/rubycrawl/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 class RubyCrawl
-  VERSION = '0.1.3'
+  VERSION = '0.1.4'
 end

data/lib/rubycrawl.rb CHANGED Viewed

@@ -45,6 +45,18 @@ class RubyCrawl
       client.crawl_site(url, ...)
     end
+    # Create a session for reusing browser context across multiple crawls.
+    # @return [String] session_id
+    def create_session
+      client.create_session
+    end
+    # Destroy a session and close its browser context.
+    # @param session_id [String]
+    def destroy_session(session_id)
+      client.destroy_session(session_id)
+    end
     def configure(**options)
       @client = new(**options)
     end
@@ -55,17 +67,30 @@ class RubyCrawl
     build_service_client
   end
-  def crawl(url, wait_until: @wait_until, block_resources: @block_resources, retries: @max_retries)
+  def crawl(url, wait_until: @wait_until, block_resources: @block_resources, max_attempts: @max_attempts, session_id: nil)
     validate_url!(url)
     @service_client.ensure_running
-    with_retries(retries) do
-      payload = build_payload(url, wait_until, block_resources)
+    with_retries(max_attempts) do
+      payload = build_payload(url, wait_until, block_resources, session_id)
       response = @service_client.post_json('/crawl', payload)
       raise_node_error!(response)
       build_result(response)
     end
   end
+  # Create a session for reusing browser context.
+  # @return [String] session_id
+  def create_session
+    @service_client.ensure_running
+    @service_client.create_session
+  end
+  # Destroy a session.
+  # @param session_id [String]
+  def destroy_session(session_id)
+    @service_client.destroy_session(session_id)
+  end
   # Crawl multiple pages starting from a URL, following links.
   # @see RubyCrawl.crawl_site
   def crawl_site(url, **options, &block)
@@ -106,7 +131,7 @@ class RubyCrawl
     @node_log = options.fetch(:node_log, ENV.fetch('RUBYCRAWL_NODE_LOG', nil))
     @wait_until = options.fetch(:wait_until, nil)
     @block_resources = options.fetch(:block_resources, nil)
-    @max_retries = options.fetch(:max_retries, 3)
+    @max_attempts = options.fetch(:max_attempts, 3)
   end
   def build_service_client
@@ -119,9 +144,9 @@ class RubyCrawl
     )
   end
-  def retry_with_backoff(attempt, retries, error)
+  def retry_with_backoff(attempt, max_attempts, error)
     backoff_seconds = 2**attempt
-    warn "[rubycrawl] Retry #{attempt}/#{retries - 1} after #{backoff_seconds}s: #{error.message}"
+    warn "[rubycrawl] Attempt #{attempt + 1}/#{max_attempts} failed, retrying in #{backoff_seconds}s: #{error.message}"
     sleep(backoff_seconds)
   end
@@ -131,7 +156,8 @@ class RubyCrawl
       max_depth: options.fetch(:max_depth, 3),
       same_host_only: options.fetch(:same_host_only, true),
       wait_until: options.fetch(:wait_until, @wait_until),
-      block_resources: options.fetch(:block_resources, @block_resources)
+      block_resources: options.fetch(:block_resources, @block_resources),
+      max_attempts: options.fetch(:max_attempts, @max_attempts)
     }
   end

data/node/.gitignore ADDED Viewed

@@ -0,0 +1,2 @@
+/node_modules
+/.env

data/node/.npmrc ADDED Viewed

@@ -0,0 +1 @@
+fund=false

data/node/README.md ADDED Viewed

@@ -0,0 +1,19 @@
+# rubycrawl Node Service
+Local Playwright-backed HTTP service used by the Ruby gem.
+## Run
+```
+npm install
+npm start
+```
+## Environment
+Create a `.env` file (or copy from `.env.example`) if you need custom settings.
+## Endpoints
+- `POST /crawl` JSON body: `{ "url": "https://example.com" }`
+- `GET /health`

data/node/package-lock.json ADDED Viewed

@@ -0,0 +1,72 @@
+{
+  "name": "rubycrawl-node",
+  "version": "0.1.0",
+  "lockfileVersion": 3,
+  "requires": true,
+  "packages": {
+    "": {
+      "name": "rubycrawl-node",
+      "version": "0.1.0",
+      "dependencies": {
+        "dotenv": "^16.4.5",
+        "playwright": "^1.41.0"
+      }
+    },
+    "node_modules/dotenv": {
+      "version": "16.6.1",
+      "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.6.1.tgz",
+      "integrity": "sha512-uBq4egWHTcTt33a72vpSG0z3HnPuIl6NqYcTrKEg2azoEyl2hpW0zqlxysq2pK9HlDIHyHyakeYaYnSAwd8bow==",
+      "license": "BSD-2-Clause",
+      "engines": {
+        "node": ">=12"
+      },
+      "funding": {
+        "url": "https://dotenvx.com"
+      }
+    },
+    "node_modules/fsevents": {
+      "version": "2.3.2",
+      "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz",
+      "integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==",
+      "hasInstallScript": true,
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "darwin"
+      ],
+      "engines": {
+        "node": "^8.16.0 || ^10.6.0 || >=11.0.0"
+      }
+    },
+    "node_modules/playwright": {
+      "version": "1.58.0",
+      "resolved": "https://registry.npmjs.org/playwright/-/playwright-1.58.0.tgz",
+      "integrity": "sha512-2SVA0sbPktiIY/MCOPX8e86ehA/e+tDNq+e5Y8qjKYti2Z/JG7xnronT/TXTIkKbYGWlCbuucZ6dziEgkoEjQQ==",
+      "license": "Apache-2.0",
+      "dependencies": {
+        "playwright-core": "1.58.0"
+      },
+      "bin": {
+        "playwright": "cli.js"
+      },
+      "engines": {
+        "node": ">=18"
+      },
+      "optionalDependencies": {
+        "fsevents": "2.3.2"
+      }
+    },
+    "node_modules/playwright-core": {
+      "version": "1.58.0",
+      "resolved": "https://registry.npmjs.org/playwright-core/-/playwright-core-1.58.0.tgz",
+      "integrity": "sha512-aaoB1RWrdNi3//rOeKuMiS65UCcgOVljU46At6eFcOFPFHWtd2weHRRow6z/n+Lec0Lvu0k9ZPKJSjPugikirw==",
+      "license": "Apache-2.0",
+      "bin": {
+        "playwright-core": "cli.js"
+      },
+      "engines": {
+        "node": ">=18"
+      }
+    }
+  }
+}

data/node/package.json ADDED Viewed

@@ -0,0 +1,14 @@
+{
+  "name": "rubycrawl-node",
+  "version": "0.1.0",
+  "private": true,
+  "type": "module",
+  "main": "src/index.js",
+  "scripts": {
+    "start": "node src/index.js"
+  },
+  "dependencies": {
+    "dotenv": "^16.4.5",
+    "playwright": "^1.41.0"
+  }
+}