crawlr 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/README.md +1 -1
- data/lib/crawlr/parser.rb +7 -6
- data/lib/crawlr/robots.rb +62 -44
- data/lib/crawlr/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: b74c6111f3df0866bf50f28c5510b637c74264fe8809d57092d2694d1694c974
+  data.tar.gz: 93799e89ba870575b86d9f70f4938c145cefa8aea8effc60d0bbacc1e9919a87
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 492e82dddbc07a130135137c94f561307f0692a405f630eec51500cd8b4046ad7a3147f010758bdfa91f1b2aa6dd23135fdcbea5a9ba62c76c2715525a117fbb
+  data.tar.gz: ad76ec821b6b4929779c1823107c1b21ca8180ec3fea3b71402d66a8853eddeb986bd8895f0146ce8e14d88d9090d3df696b8015c8b51b0c1f5ca224b36b6986
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -3,7 +3,7 @@
 A powerful, async Ruby web scraping framework designed for respectful and efficient data extraction. Built with modern Ruby practices, crawlr provides a clean API for scraping websites while respecting robots.txt, managing cookies, rotating proxies, and handling complex scraping scenarios.
 
 [](https://badge.fury.io/rb/crawlr)
-[](https://github.com/aristorap/crawlr/actions/workflows/
+[](https://github.com/aristorap/crawlr/actions/workflows/main.yml)
 
 ## ✨ Features
 
data/lib/crawlr/parser.rb
CHANGED
@@ -165,15 +165,16 @@ module Crawlr
 
       callbacks_by_format.each do |format, format_callbacks|
         doc = parse_content(format, content)
-
-        format_callbacks.each do |callback|
-          Crawlr.logger.debug "Applying callback: #{callback[:selector_type]} #{callback[:selector]}"
-          nodes = extract_nodes(doc, callback[:selector_type], callback[:selector])
-          nodes.each { |node| callback[:block].call(node, context) }
-        end
+        format_callbacks.each { |callback| apply_callback(doc, callback, context) }
       end
     end
 
+    private_class_method def self.apply_callback(doc, callback, context)
+      Crawlr.logger.debug "Applying callback: #{callback[:selector_type]} #{callback[:selector]}"
+      nodes = extract_nodes(doc, callback[:selector_type], callback[:selector])
+      nodes.each { |node| callback[:block].call(node, context) }
+    end
+
     # Parses content using the appropriate Nokogiri parser
     #
     # Creates a Nokogiri document object using either the HTML or XML parser
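Note: the parser change above only extracts the per-callback work into a private `apply_callback` helper; behavior is unchanged, and the one-line `each` keeps the callback application unit-testable on its own. Below is a minimal, self-contained sketch of that flow against a Nokogiri document. The callback hash keys (`:selector_type`, `:selector`, `:block`) come from the diff, but `demo_apply_callback`, the CSS/XPath dispatch, and the sample HTML are illustrative assumptions rather than crawlr's actual `extract_nodes` implementation.

```ruby
require "nokogiri"

# Illustrative stand-in for the extracted apply_callback helper (not crawlr's API):
# select nodes with the callback's selector, then hand each node to its block.
def demo_apply_callback(doc, callback, context)
  nodes =
    case callback[:selector_type]
    when :css   then doc.css(callback[:selector])
    when :xpath then doc.xpath(callback[:selector])
    else []
    end
  nodes.each { |node| callback[:block].call(node, context) }
end

doc = Nokogiri::HTML('<ul><li><a href="/a">A</a></li><li><a href="/b">B</a></li></ul>')

# Callback shape taken from the diff: :selector_type, :selector, :block.
callback = {
  selector_type: :css,
  selector: "li a",
  block: ->(node, ctx) { ctx[:links] << node["href"] }
}

context = { links: [] }
demo_apply_callback(doc, callback, context)
p context[:links] # => ["/a", "/b"]
```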
data/lib/crawlr/robots.rb
CHANGED
@@ -130,25 +130,13 @@ module Crawlr
     # robots.allowed?('https://site.com/temporary/', 'Bot') #=> true
     def allowed?(url, user_agent)
       rule = get_rule(url, user_agent)
-      return true unless rule
+      return true unless rule
 
       path = URI.parse(url).path
-      matched = []
-
-      # Match allow/disallow using fnmatch (robots.txt style)
-      rule.allow.each do |pattern|
-        matched << [:allow, pattern] if robots_match?(pattern, path)
-      end
-
-      rule.disallow.each do |pattern|
-        matched << [:disallow, pattern] if robots_match?(pattern, path)
-      end
-
+      matched = matched_rules(rule, path)
       return true if matched.empty?
 
-
-      action, = matched.max_by { |_, p| p.length }
-      action == :allow
+      longest_match_allows?(matched)
     end
 
     # Parses robots.txt content and stores rules for the given URL's domain
@@ -204,6 +192,25 @@ module Crawlr
 
     private
 
+    def matched_rules(rule, path)
+      matched = []
+
+      rule.allow.each do |pattern|
+        matched << [:allow, pattern] if robots_match?(pattern, path)
+      end
+
+      rule.disallow.each do |pattern|
+        matched << [:disallow, pattern] if robots_match?(pattern, path)
+      end
+
+      matched
+    end
+
+    def longest_match_allows?(matched)
+      action, = matched.max_by { |_, pattern| pattern.length }
+      action == :allow
+    end
+
     # Finds the most applicable rule for a URL and user-agent combination
     #
     # Implements the robots.txt user-agent matching algorithm:
@@ -222,11 +229,7 @@ module Crawlr
       return nil unless rules
 
       # Case-insensitive prefix match
-      applicable_rules = rules.select do |rule|
-        next if rule.user_agent.nil?
-
-        user_agent.downcase.start_with?(rule.user_agent.downcase)
-      end
+      applicable_rules = rules_by_prefix_match(user_agent, rules)
 
       # Fallback to wildcard
       applicable_rules = rules.select { |rule| rule.user_agent == "*" } if applicable_rules.empty?
@@ -235,6 +238,14 @@ module Crawlr
       applicable_rules.max_by { |r| r.user_agent.length }
     end
 
+    def rules_by_prefix_match(user_agent, rules)
+      rules.select do |rule|
+        next if rule.user_agent.nil?
+
+        user_agent.downcase.start_with?(rule.user_agent.downcase)
+      end
+    end
+
     # Tests if a robots.txt pattern matches a given path
     #
     # Implements robots.txt pattern matching including:
@@ -291,38 +302,45 @@ module Crawlr
     # }
     # }
     def parse_to_hash(content)
-      robots_hash = {
-        sitemap: [],
-        rules: {}
-      }
-
+      robots_hash = { sitemap: [], rules: {} }
       curr_user_agents = []
 
       content.each_line do |line|
-
-        next if clean_line.empty? || clean_line.start_with?("#")
-
-        key, value = clean_line.split(":", 2).map(&:strip)
+        key, value = parse_line(line)
         next unless key && value
 
-
-
-        case key
-        when "sitemap"
-          robots_hash[:sitemap] << value
-        when "user-agent"
-          curr_user_agents = [value]
-          robots_hash[:rules][value] ||= { allow: [], disallow: [], crawl_delay: nil }
-        when "allow"
-          curr_user_agents.each { |ua| robots_hash[:rules][ua][:allow] << value }
-        when "disallow"
-          curr_user_agents.each { |ua| robots_hash[:rules][ua][:disallow] << value }
-        when "crawl-delay"
-          curr_user_agents.each { |ua| robots_hash[:rules][ua][:crawl_delay] = value }
-        end
+        curr_user_agents = apply_rule(robots_hash, key, value, curr_user_agents)
       end
 
      robots_hash
     end
+
+    def parse_line(line)
+      clean_line = line.strip
+      return if clean_line.empty? || clean_line.start_with?("#")
+
+      key, value = clean_line.split(":", 2).map(&:strip)
+      return unless key && value
+
+      [key.downcase, value]
+    end
+
+    def apply_rule(robots_hash, key, value, curr_user_agents)
+      case key
+      when "sitemap"
+        robots_hash[:sitemap] << value
+      when "user-agent"
+        curr_user_agents = [value]
+        robots_hash[:rules][value] ||= { allow: [], disallow: [], crawl_delay: nil }
+      when "allow"
+        curr_user_agents.each { |ua| robots_hash[:rules][ua][:allow] << value }
+      when "disallow"
+        curr_user_agents.each { |ua| robots_hash[:rules][ua][:disallow] << value }
+      when "crawl-delay"
+        curr_user_agents.each { |ua| robots_hash[:rules][ua][:crawl_delay] = value }
+      end
+
+      curr_user_agents
+    end
   end
 end
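Note: the robots.rb refactor splits `allowed?` into `matched_rules` and `longest_match_allows?` without changing the decision rule: every allow/disallow pattern that matches the path is collected, and the longest matched pattern wins. A rough standalone sketch of that precedence follows; the `Rule` struct, the sample patterns, and the `File.fnmatch`-based matcher are illustrative stand-ins, since crawlr's own `robots_match?` implements the full robots.txt pattern syntax.

```ruby
# Standalone sketch of the longest-match-wins precedence preserved by the refactor.
# Rule, the sample patterns, and the fnmatch-based matcher are illustrative stand-ins.
Rule = Struct.new(:allow, :disallow)

# Rough approximation of robots-style matching: treat the pattern as a prefix,
# letting fnmatch's "*" cover the rest of the path.
def robots_match?(pattern, path)
  File.fnmatch("#{pattern}*", path)
end

# Collect every allow/disallow pattern that matches the path (as matched_rules does).
def matched_rules(rule, path)
  matched = []
  rule.allow.each    { |p| matched << [:allow, p] if robots_match?(p, path) }
  rule.disallow.each { |p| matched << [:disallow, p] if robots_match?(p, path) }
  matched
end

# The longest matched pattern decides the outcome (as longest_match_allows? does).
def longest_match_allows?(matched)
  action, = matched.max_by { |_, pattern| pattern.length }
  action == :allow
end

rule = Rule.new(["/private/reports/"], ["/private/"])

["/private/reports/q3.html", "/private/notes.txt", "/public/index.html"].each do |path|
  matched = matched_rules(rule, path)
  verdict = matched.empty? || longest_match_allows?(matched)
  puts "#{path} -> #{verdict ? 'allowed' : 'blocked'}"
end
# /private/reports/q3.html -> allowed  (the longer Allow pattern wins)
# /private/notes.txt -> blocked        (only Disallow: /private/ matches)
# /public/index.html -> allowed        (no rule matches)
```

Longest-match precedence is what lets a specific `Allow: /private/reports/` carve an exception out of a broader `Disallow: /private/`, and an empty match list defaults to allowed.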
data/lib/crawlr/version.rb
CHANGED