crawlr 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,331 @@
+ # frozen_string_literal: true
+
+ require "uri"
+
+ module Crawlr
+   # Robots.txt parser and compliance checker for respectful web scraping.
+   #
+   # The Robots class implements full robots.txt specification compliance,
+   # including user-agent matching, path pattern matching with wildcards,
+   # allow/disallow precedence rules, and crawl-delay directives. It helps
+   # ensure that scrapers respect website crawling policies and avoid
+   # making unwanted requests.
+   #
+   # @example Basic robots.txt compliance
+   #   robots = Crawlr::Robots.new
+   #
+   #   # Parse robots.txt content
+   #   robots_content = <<~ROBOTS
+   #     User-agent: *
+   #     Disallow: /private/
+   #     Allow: /public/
+   #     Crawl-delay: 1
+   #   ROBOTS
+   #
+   #   robots.parse('https://example.com', robots_content)
+   #
+   #   # Check URL permissions
+   #   robots.allowed?('https://example.com/public/page', 'MyBot/1.0') #=> true
+   #   robots.allowed?('https://example.com/private/data', 'MyBot/1.0') #=> false
+   #
+   # @example Complex user-agent matching
+   #   robots_content = <<~ROBOTS
+   #     User-agent: Googlebot
+   #     Disallow: /admin/
+   #
+   #     User-agent: *
+   #     Disallow: /
+   #     Allow: /public/
+   #   ROBOTS
+   #
+   #   robots.parse('https://site.com', robots_content)
+   #
+   #   robots.allowed?('https://site.com/admin/', 'Googlebot/2.1') #=> false
+   #   robots.allowed?('https://site.com/public/', 'Googlebot/2.1') #=> true
+   #   robots.allowed?('https://site.com/anything/', 'OtherBot/1.0') #=> false
+   #
+   # @example Wildcard pattern matching
+   #   robots_content = <<~ROBOTS
+   #     User-agent: *
+   #     Disallow: /*.pdf$
+   #     Disallow: /temp/*
+   #     Allow: /temp/public/*
+   #   ROBOTS
+   #
+   #   robots.parse('https://example.com', robots_content)
+   #
+   #   robots.allowed?('https://example.com/document.pdf', 'Bot') #=> false
+   #   robots.allowed?('https://example.com/temp/secret.txt', 'Bot') #=> false
+   #   robots.allowed?('https://example.com/temp/public/file.txt', 'Bot') #=> true
+   #
+   # @author [Your Name]
+   # @since 0.1.0
+   class Robots
+     # Represents a robots.txt rule for a specific user-agent
+     #
+     # @!attribute [r] user_agent
+     #   @return [String] User-agent pattern this rule applies to
+     # @!attribute [r] allow
+     #   @return [Array<String>] Array of allowed path patterns
+     # @!attribute [r] disallow
+     #   @return [Array<String>] Array of disallowed path patterns
+     # @!attribute [r] crawl_delay
+     #   @return [String, nil] Crawl delay in seconds for this user-agent
+     Rule = Struct.new(:user_agent, :allow, :disallow, :crawl_delay)
+
+     # @return [Hash<String, Array<Rule>>] Internal store of parsed robots.txt rules by domain
+     attr_reader :store
+
+     # Initializes a new Robots instance
+     #
+     # Creates an empty store for caching parsed robots.txt files by domain.
+     # Each domain's robots.txt is parsed once and cached for subsequent
+     # permission checks.
+     #
+     # @example
+     #   robots = Crawlr::Robots.new
+     def initialize
+       @store = {}
+     end
+
+     # Checks if robots.txt has been parsed and cached for a given origin
+     #
+     # @param origin [String] The origin URL (scheme + host + port)
+     # @return [Boolean] true if robots.txt data exists for this origin
+     #
+     # @example
+     #   robots.exists?('https://example.com') #=> false
+     #   robots.parse('https://example.com', robots_content)
+     #   robots.exists?('https://example.com') #=> true
+     def exists?(origin)
+       @store.key?((URI.parse(origin).host || origin).downcase)
+     end
+
+     # Determines if a URL is allowed to be crawled according to robots.txt rules
+     #
+     # This method implements the full robots.txt specification including:
+     # - User-agent matching with prefix matching and wildcards
+     # - Path pattern matching with wildcards and end anchors
+     # - Allow/disallow precedence with longest match wins
+     # - Graceful fallback when no robots.txt exists
+     #
+     # @param url [String] The full URL to check for crawling permission
+     # @param user_agent [String] The user-agent string to match against rules
+     # @return [Boolean] true if the URL is allowed to be crawled
+     #
+     # @example Basic permission checking
+     #   robots.allowed?('https://example.com/page.html', 'MyBot/1.0')
+     #
+     # @example With specific user-agent rules
+     #   # robots.txt contains specific rules for "MyBot"
+     #   robots.allowed?('https://site.com/admin/', 'MyBot/2.0') #=> depends on rules
+     #   robots.allowed?('https://site.com/admin/', 'OtherBot') #=> uses wildcard rules
+     #
+     # @example Pattern matching examples
+     #   # robots.txt: Disallow: /*.pdf$
+     #   robots.allowed?('https://site.com/doc.pdf', 'Bot') #=> false
+     #   robots.allowed?('https://site.com/doc.pdf.html', 'Bot') #=> true
+     #
+     #   # robots.txt: Disallow: /temp/*
+     #   robots.allowed?('https://site.com/temp/file.txt', 'Bot') #=> false
+     #   robots.allowed?('https://site.com/temporary/', 'Bot') #=> true
+     def allowed?(url, user_agent)
+       rule = get_rule(url, user_agent)
+       return true unless rule # if no robots.txt or no rule, allow
+
+       path = URI.parse(url).path
+       matched = []
+
+       # Collect allow/disallow patterns that match the request path (robots.txt prefix-style matching)
+       rule.allow.each do |pattern|
+         matched << [:allow, pattern] if robots_match?(pattern, path)
+       end
+
+       rule.disallow.each do |pattern|
+         matched << [:disallow, pattern] if robots_match?(pattern, path)
+       end
+
+       return true if matched.empty?
+
+       # Longest match wins
+       action, = matched.max_by { |_, p| p.length }
+       action == :allow
+     end
+
+     # Parses robots.txt content and stores rules for the given URL's domain
+     #
+     # Extracts and processes all robots.txt directives including:
+     # - User-agent declarations
+     # - Allow and Disallow rules
+     # - Crawl-delay directives
+     # - Sitemap declarations
+     # - Comment and empty line handling
+     #
+     # @param url [String] The URL where this robots.txt was fetched from
+     # @param content [String] Raw robots.txt file content
+     # @return [void]
+     #
+     # @example Parse standard robots.txt
+     #   robots_content = <<~ROBOTS
+     #     # This is a comment
+     #     User-agent: *
+     #     Disallow: /private/
+     #     Allow: /public/
+     #     Crawl-delay: 2
+     #
+     #     User-agent: Googlebot
+     #     Allow: /
+     #
+     #     Sitemap: https://example.com/sitemap.xml
+     #   ROBOTS
+     #
+     #   robots.parse('https://example.com/robots.txt', robots_content)
+     #
+     # @example Parse with wildcards and patterns
+     #   robots_content = <<~ROBOTS
+     #     User-agent: *
+     #     Disallow: /*.json$
+     #     Disallow: /api/v*/private/
+     #     Allow: /api/v*/public/
+     #   ROBOTS
+     #
+     #   robots.parse('https://api.example.com', robots_content)
+     def parse(url, content)
+       uri = URI.parse(url)
+       domain = uri.host.downcase
+       hash = parse_to_hash(content)
+
+       rules = []
+       hash[:rules].each do |user_agent, rule|
+         rules << Rule.new(user_agent, rule[:allow], rule[:disallow], rule[:crawl_delay])
+       end
+
+       @store[domain] ||= rules
+     end
+
+     private
+
+     # Finds the most applicable rule for a URL and user-agent combination
+     #
+     # Implements the robots.txt user-agent matching algorithm:
+     # 1. Find rules with user-agent prefix matching (case-insensitive)
+     # 2. If no matches, fall back to wildcard (*) rules
+     # 3. Return the most specific match (longest user-agent string)
+     #
+     # @param url [String] URL to find rules for
+     # @param user_agent [String] User-agent to match
+     # @return [Rule, nil] Most applicable rule or nil if none found
+     # @api private
+     def get_rule(url, user_agent)
+       uri = URI.parse(url)
+       domain = uri.host.downcase
+       rules = @store[domain]
+       return nil unless rules
+
+       # Case-insensitive prefix match
+       applicable_rules = rules.select do |rule|
+         next if rule.user_agent.nil?
+
+         user_agent.downcase.start_with?(rule.user_agent.downcase)
+       end
+
+       # Fallback to wildcard
+       applicable_rules = rules.select { |rule| rule.user_agent == "*" } if applicable_rules.empty?
+
+       # Most specific (longest UA name) wins
+       applicable_rules.max_by { |r| r.user_agent.length }
+     end
+
+     # Tests if a robots.txt pattern matches a given path
+     #
+     # Implements robots.txt pattern matching including:
+     # - Prefix matching of patterns against the URL path
+     # - Wildcard (*) support for matching any character sequence
+     # - End anchor ($) support for exact suffix matching
+     #
+     # @param pattern [String] robots.txt path pattern (may include wildcards and anchors)
+     # @param path [String] URL path to test against pattern
+     # @return [Boolean] true if pattern matches the path
+     # @api private
+     #
+     # @example Wildcard patterns
+     #   robots_match?('/temp/*', '/temp/file.txt') #=> true
+     #   robots_match?('/temp/*', '/temporary/') #=> false
+     #
+     # @example End anchor patterns
+     #   robots_match?('*.pdf$', '/document.pdf') #=> true
+     #   robots_match?('*.pdf$', '/document.pdf.html') #=> false
+     #
+     # @example Prefix matching
+     #   robots_match?('/admin/', '/admin/') #=> true
+     #   robots_match?('/admin/', '/admin/page.html') #=> true
+     def robots_match?(pattern, path)
+       return false if pattern.empty?
+
+       # Handle `$` end anchor (match the exact end of the path)
+       anchored = pattern.end_with?("$")
+       pattern = pattern.chomp("$") if anchored
+
+       # robots.txt patterns are prefix matches; `*` matches any character sequence
+       regexp_body = Regexp.escape(pattern).gsub('\*', ".*")
+       regexp = anchored ? /\A#{regexp_body}\z/ : /\A#{regexp_body}/
+       path.match?(regexp)
+     end
+
+     # Parses robots.txt content into a structured hash format
+     #
+     # Processes the raw robots.txt file line by line, handling:
+     # - User-agent declarations and grouping
+     # - Allow/Disallow rule accumulation
+     # - Crawl-delay value extraction
+     # - Sitemap URL collection
+     # - Comment and whitespace filtering
+     #
+     # @param content [String] Raw robots.txt file content
+     # @return [Hash] Structured hash with :sitemap and :rules keys
+     # @api private
+     #
+     # @example Return structure
+     #   {
+     #     sitemap: ['https://example.com/sitemap.xml'],
+     #     rules: {
+     #       '*' => { allow: ['/public/'], disallow: ['/private/'], crawl_delay: '1' },
+     #       'Googlebot' => { allow: ['/'], disallow: [], crawl_delay: nil }
+     #     }
+     #   }
+     def parse_to_hash(content)
+       robots_hash = {
+         sitemap: [],
+         rules: {}
+       }
+
+       curr_user_agents = []
+
+       content.each_line do |line|
+         clean_line = line.strip
+         next if clean_line.empty? || clean_line.start_with?("#")
+
+         key, value = clean_line.split(":", 2).map(&:strip)
+         next unless key && value
+
+         key = key.downcase
+
+         case key
+         when "sitemap"
+           robots_hash[:sitemap] << value
+         when "user-agent"
+           curr_user_agents = [value]
+           robots_hash[:rules][value] ||= { allow: [], disallow: [], crawl_delay: nil }
+         when "allow"
+           curr_user_agents.each { |ua| robots_hash[:rules][ua][:allow] << value }
+         when "disallow"
+           curr_user_agents.each { |ua| robots_hash[:rules][ua][:disallow] << value }
+         when "crawl-delay"
+           curr_user_agents.each { |ua| robots_hash[:rules][ua][:crawl_delay] = value }
+         end
+       end
+
+       robots_hash
+     end
+   end
+ end
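The Robots class above covers the whole parse-and-check cycle. The sketch below shows one way calling code might wire it up; the Net::HTTP fetch and the require path are illustrative assumptions (the gem's own HTTP layer is not part of this diff), and Crawl-delay is read through the public store reader because no dedicated accessor is exposed.

    require "net/http"
    require "uri"
    require "crawlr/robots" # assumed load path; adjust to however the gem is loaded

    robots = Crawlr::Robots.new
    origin = "https://example.com"

    # Fetch and parse robots.txt once per origin (plain Net::HTTP here for illustration)
    unless robots.exists?(origin)
      robots.parse(origin, Net::HTTP.get(URI.parse("#{origin}/robots.txt")))
    end

    url = "#{origin}/public/page"
    if robots.allowed?(url, "MyBot/1.0")
      # The parsed Rule structs are reachable through #store, keyed by downcased host;
      # crawl_delay values are stored as raw strings.
      delay = robots.store[URI.parse(origin).host.downcase]&.find { |r| r.user_agent == "*" }&.crawl_delay
      sleep(delay.to_f) if delay
      # ... fetch `url` with whatever HTTP client the surrounding crawler uses ...
    end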
@@ -0,0 +1,5 @@
+ # frozen_string_literal: true
+
+ module Crawlr
+   VERSION = "0.1.0"
+ end
@@ -0,0 +1,190 @@
+ # frozen_string_literal: true
+
+ require "concurrent"
+
+ module Crawlr
+   # Thread-safe visit tracking system for URL deduplication and history management.
+   #
+   # The Visits class maintains a record of visited URLs to prevent duplicate
+   # requests during scraping sessions. It uses concurrent data structures to
+   # ensure thread safety in parallel scraping environments and implements
+   # memory management through configurable visit limits with automatic cache
+   # reset when limits are reached.
+   #
+   # @example Basic visit tracking
+   #   config = Crawlr::Config.new(allow_url_revisit: false, max_visited: 1000)
+   #   visits = Crawlr::Visits.new(config)
+   #
+   #   visits.new?('https://example.com/page1') #=> true (first time)
+   #   visits.register('https://example.com/page1')
+   #   visits.new?('https://example.com/page1') #=> false (already visited)
+   #
+   # @example With URL revisiting allowed
+   #   config = Crawlr::Config.new(allow_url_revisit: true)
+   #   visits = Crawlr::Visits.new(config)
+   #
+   #   visits.new?('https://example.com/page') #=> true (always allowed)
+   #   visits.register('https://example.com/page')
+   #   visits.new?('https://example.com/page') #=> true (revisiting allowed)
+   #
+   # @example Memory management with limits
+   #   config = Crawlr::Config.new(max_visited: 5)
+   #   visits = Crawlr::Visits.new(config)
+   #
+   #   # Add URLs up to limit
+   #   (1..5).each do |i|
+   #     visits.register("https://example.com/page#{i}")
+   #   end
+   #
+   #   # Next check triggers cache reset
+   #   visits.new?('https://example.com/page6') #=> true (cache was reset)
+   #   visits.stats[:visited_count] #=> 0 (cache cleared)
+   #
+   # @example Thread-safe parallel scraping
+   #   visits = Crawlr::Visits.new(config)
+   #
+   #   # Safe to use across multiple threads
+   #   threads = 10.times.map do |i|
+   #     Thread.new do
+   #       url = "https://example.com/thread#{i}/page"
+   #       if visits.new?(url)
+   #         visits.register(url)
+   #         scrape_page(url)
+   #       end
+   #     end
+   #   end
+   #
+   #   threads.each(&:join)
+   #
+   # @author [Your Name]
+   # @since 0.1.0
+   class Visits
+     # Initializes a new Visits tracker with the given configuration
+     #
+     # Creates a thread-safe concurrent map for storing visited URLs and
+     # configures behavior based on the provided settings for revisiting
+     # and memory management.
+     #
+     # @param config [Crawlr::Config] Configuration object with visit tracking settings
+     # @option config [Boolean] :allow_url_revisit Whether to allow revisiting URLs
+     # @option config [Integer] :max_visited Maximum URLs to track before cache reset
+     #
+     # @example
+     #   config = Crawlr::Config.new(
+     #     allow_url_revisit: false,
+     #     max_visited: 10_000
+     #   )
+     #   visits = Crawlr::Visits.new(config)
+     def initialize(config)
+       @config = config
+       @visited = Concurrent::Map.new
+     end
+
+     # Registers a URL as visited in the tracking system
+     #
+     # Marks the given URL as visited by storing it in the concurrent map.
+     # This method is thread-safe and can be called from multiple threads
+     # simultaneously without risk of data corruption.
+     #
+     # @param url [String] The URL to mark as visited
+     # @return [Boolean] Always returns true (the stored value)
+     #
+     # @example
+     #   visits.register('https://example.com/page')
+     #   visits.register('https://api.example.com/data?id=123')
+     def register(url)
+       @visited[url] = true
+     end
+
+     # Checks if the visit tracking system is empty
+     #
+     # Useful for determining if this is the first URL being processed
+     # or if the cache has been recently cleared. Can be used to apply
+     # different behavior for initial requests (like skipping delays).
+     #
+     # @return [Boolean] true if no URLs have been visited or cache is empty
+     #
+     # @example
+     #   visits.blank? #=> true (no visits yet)
+     #   visits.register('https://example.com')
+     #   visits.blank? #=> false (has visits)
+     def blank?
+       @visited.keys.empty?
+     end
+
+     # Returns statistics about the visit tracking system
+     #
+     # Provides metrics about the current state of visit tracking including
+     # the number of URLs currently stored and the configured maximum limit.
+     # Useful for monitoring memory usage and debugging scraping behavior.
+     #
+     # @return [Hash<Symbol, Integer>] Statistics hash containing visit metrics
+     # @option return [Integer] :visited_count Number of URLs currently tracked
+     # @option return [Integer] :max_visited Maximum URLs before cache reset
+     #
+     # @example
+     #   stats = visits.stats
+     #   puts "Visited #{stats[:visited_count]} / #{stats[:max_visited]} URLs"
+     #
+     #   if stats[:visited_count] > stats[:max_visited] * 0.8
+     #     puts "Approaching visit limit, cache will reset soon"
+     #   end
+     def stats
+       {
+         visited_count: @visited.size,
+         max_visited: @config.max_visited
+       }
+     end
+
+     # Determines if a URL is new (not previously visited)
+     #
+     # This method implements the core visit deduplication logic including:
+     # - Automatic cache reset when maximum visit limit is reached
+     # - Configurable URL revisiting behavior
+     # - Thread-safe duplicate detection
+     # - Logging for debugging and monitoring
+     #
+     # The method performs memory management by clearing the visited cache
+     # when the configured maximum is reached, preventing unbounded memory
+     # growth during long-running scraping sessions.
+     #
+     # @param url [String] URL to check for previous visits
+     # @return [Boolean] true if URL is new or revisiting is allowed, false if already visited
+     #
+     # @example Basic deduplication
+     #   visits.new?('https://example.com/page1') #=> true
+     #   visits.register('https://example.com/page1')
+     #   visits.new?('https://example.com/page1') #=> false
+     #
+     # @example With revisiting enabled
+     #   # config.allow_url_revisit = true
+     #   visits.new?('https://example.com/page') #=> true (always)
+     #
+     # @example Memory limit handling
+     #   # When max_visited limit is reached
+     #   visits.new?('https://example.com/new') #=> true (cache reset)
+     #   # Previous visits are forgotten after reset
+     #
+     # @example In parallel scraping context
+     #   # Thread-safe checking across multiple workers
+     #   if visits.new?(discovered_url)
+     #     visits.register(discovered_url)
+     #     process_url(discovered_url)
+     #   else
+     #     skip_duplicate(discovered_url)
+     #   end
+     def new?(url)
+       # Reset if max visited reached
+       if @visited.size >= @config.max_visited
+         Crawlr.logger.warn "Reached max visited URLs limit (#{@config.max_visited}). Resetting visited cache."
+         @visited.clear
+       end
+
+       return true if @config.allow_url_revisit
+       return true unless @visited.key?(url)
+
+       Crawlr.logger.debug "Already visited #{url}; Skipping"
+       false
+     end
+   end
+ end
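As a usage sketch, the loop below shows how Visits might gate a simple crawl queue. Crawlr::Config is referenced by the doc examples above but is not included in this diff, so the keyword arguments here are assumptions taken from those examples; fetch_page is a hypothetical placeholder for the caller's own fetch step.

    require "crawlr"        # sets up Crawlr.logger, which Visits#new? uses
    require "crawlr/visits" # assumed load path

    config = Crawlr::Config.new(allow_url_revisit: false, max_visited: 10_000)
    visits = Crawlr::Visits.new(config)

    queue = [
      "https://example.com/",
      "https://example.com/about",
      "https://example.com/" # duplicate; skipped and logged at debug level
    ]

    queue.each do |url|
      next unless visits.new?(url)

      visits.register(url)
      # fetch_page(url) would go here (placeholder for the caller's fetch/parse step)
    end

    p visits.stats #=> {:visited_count=>2, :max_visited=>10000}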
data/lib/crawlr.rb ADDED
@@ -0,0 +1,17 @@
+ # frozen_string_literal: true
+
+ require "logger"
+ require_relative "crawlr/version"
+
+ # A Ruby scraping framework for parsing HTML and XML documents
+ # @author [Your Name]
+ # @since 0.1.0
+ module Crawlr
+   class Error < StandardError; end
+
+   class << self
+     attr_accessor :logger
+   end
+
+   self.logger = Logger.new($stdout, level: Logger::INFO)
+ end
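Because the logger is exposed as a writable module attribute, callers can swap it out; a minimal sketch, assuming the gem is loaded with require "crawlr":

    require "crawlr"

    # Send crawl logs to a file and enable the debug lines emitted by Visits#new?
    Crawlr.logger = Logger.new("crawl.log", level: Logger::DEBUG)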
data/sig/crawlr.rbs ADDED
@@ -0,0 +1,4 @@
1
+ module Crawlr
2
+ VERSION: String
3
+ # See the writing guide of rbs: https://github.com/ruby/rbs#guides
4
+ end