crawlr 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,286 @@
+ # frozen_string_literal: true
+
+ require "async"
+ require "async/timeout"
+ require "async/http/internet"
+ require "http/cookie_jar"
+
+ module Crawlr
+   # Handles fetching documents via async HTTP with proxy and cookie support.
+   #
+   # The HTTPInterface class provides a high-level async HTTP client specifically
+   # designed for web scraping. It supports proxy rotation, cookie management,
+   # configurable timeouts, and transforms raw HTTP responses into a simplified
+   # response structure suitable for content processing.
+   #
+   # @example Basic HTTP fetching
+   #   config = Crawlr::Config.new(timeout: 10)
+   #   http = Crawlr::HTTPInterface.new(config)
+   #
+   #   response = http.get('https://example.com')
+   #   puts response.status #=> 200
+   #   puts response.body #=> HTML content
+   #
+   # @example With cookie support
+   #   config = Crawlr::Config.new(allow_cookies: true)
+   #   http = Crawlr::HTTPInterface.new(config)
+   #
+   #   # Cookies are automatically managed across requests
+   #   login_response = http.get('https://site.com/login')
+   #   profile_response = http.get('https://site.com/profile') # Uses login cookies
+   #
+   # @example With proxy rotation
+   #   config = Crawlr::Config.new(
+   #     proxies: ['http://proxy1:8080', 'socks5://proxy2:1080'],
+   #     proxy_strategy: :round_robin
+   #   )
+   #   http = Crawlr::HTTPInterface.new(config)
+   #
+   #   response = http.get('https://example.com') # Uses proxy1
+   #   response = http.get('https://example.com') # Uses proxy2
+   #
+   # @example With request hooks
+   #   response = http.get('https://api.example.com') do |url, headers|
+   #     headers['Authorization'] = "Bearer #{get_token()}"
+   #     headers['X-Request-ID'] = SecureRandom.uuid
+   #   end
+   #
+   # @author [Your Name]
+   # @since 0.1.0
+   class HTTPInterface
+     # Simplified HTTP response structure for internal use
+     #
+     # @!attribute [r] url
+     #   @return [String] The requested URL
+     # @!attribute [r] status
+     #   @return [Integer] HTTP status code
+     # @!attribute [r] headers
+     #   @return [Hash] HTTP response headers
+     # @!attribute [r] version
+     #   @return [String] HTTP protocol version
+     # @!attribute [r] body
+     #   @return [String, nil] Response body content
+     Response = Struct.new(:url, :status, :headers, :version, :body)
+
+     # @return [Crawlr::Config] Configuration object containing HTTP settings
+     attr_reader :config
+
+     # Initializes a new HTTPInterface with the given configuration
+     #
+     # Sets up cookie management (if enabled) and proxy rotation state.
+     # The cookie jar persists across all requests made by this interface instance.
+     #
+     # @param config [Crawlr::Config] Configuration object with HTTP settings
+     # @option config [Boolean] :allow_cookies Enable cookie jar management
+     # @option config [Array<String>] :proxies List of proxy URLs
+     # @option config [Symbol] :proxy_strategy Proxy selection strategy (:round_robin, :random)
+     # @option config [Integer] :timeout Request timeout in seconds
+     # @option config [Hash] :headers Default headers for all requests
+     #
+     # @example
+     #   config = Crawlr::Config.new(
+     #     allow_cookies: true,
+     #     timeout: 15,
+     #     proxies: ['http://proxy.example.com:8080']
+     #   )
+     #   http = Crawlr::HTTPInterface.new(config)
+     def initialize(config)
+       @config = config
+       @cookie_jar = @config.allow_cookies ? HTTP::CookieJar.new : nil
+       @proxy_index = 0
+     end
+
+     # Performs an HTTP GET request with full async support and cookie management
+     #
+     # This method handles the complete HTTP request lifecycle including:
+     # - Proxy selection and connection setup
+     # - Cookie retrieval and attachment
+     # - Request header customization via block
+     # - Async execution with timeout handling
+     # - Response cookie parsing and storage
+     # - Resource cleanup and connection closing
+     #
+     # @param url [String] The URL to fetch
+     # @yield [url, headers] Optional block for request customization
+     # @yieldparam url [String] The URL being requested
+     # @yieldparam headers [Hash] Mutable headers hash for customization
+     # @return [HTTPInterface::Response] Simplified response object
+     # @raise [Async::TimeoutError] When request exceeds configured timeout
+     # @raise [URI::InvalidURIError] When URL is malformed
+     # @raise [StandardError] For other HTTP-related errors
+     #
+     # @example Basic GET request
+     #   response = http.get('https://example.com/api/data')
+     #   if response.status == 200
+     #     data = JSON.parse(response.body)
+     #   end
+     #
+     # @example With custom headers
+     #   response = http.get('https://api.service.com/endpoint') do |url, headers|
+     #     headers['Accept'] = 'application/json'
+     #     headers['X-API-Key'] = ENV['API_KEY']
+     #     headers['User-Agent'] = 'MyBot/1.0'
+     #   end
+     #
+     # @example With authentication
+     #   response = http.get('https://secure.site.com/data') do |url, headers|
+     #     token = authenticate_user(url)
+     #     headers['Authorization'] = "Bearer #{token}"
+     #   end
+     #
+     # @example Error handling
+     #   begin
+     #     response = http.get('https://unreliable.com/data')
+     #   rescue Async::TimeoutError
+     #     puts "Request timed out"
+     #   rescue StandardError => e
+     #     puts "Request failed: #{e.message}"
+     #   end
+     def get(url)
+       Crawlr.logger.debug "Fetching #{url}"
+
+       uri = URI.parse(url)
+       proxy_url = next_proxy
+       internet = build_internet_connection(proxy_url)
+
+       request_headers = @config.headers.dup
+
+       if @config.allow_cookies
+         cookie_header = HTTP::Cookie.cookie_value(@cookie_jar.cookies(uri))
+         request_headers["cookie"] = cookie_header if cookie_header && !cookie_header.empty?
+       end
+
+       yield(url, request_headers) if block_given?
+
+       raw_response = nil
+       begin
+         Sync do |task|
+           raw_response = task.with_timeout(@config.timeout) do
+             internet.get(url, request_headers)
+           end
+         end
+
+         parse_and_set_cookies(uri, raw_response) if @config.allow_cookies && raw_response
+         make_response_struct(url, raw_response)
+       rescue Async::TimeoutError
+         Crawlr.logger.warn "Timeout fetching #{url} after #{@config.timeout}sec"
+         raise
+       ensure
+         raw_response&.close
+         internet&.close
+         Crawlr.logger.debug "Done fetching #{url}"
+       end
+     end
+
+     private
+
+     # Builds an async HTTP connection with optional proxy support
+     #
+     # Creates either a direct internet connection or a proxied connection
+     # based on the provided proxy URL. Supports HTTP and SOCKS5 proxies.
+     #
+     # @param proxy [String, nil] Proxy URL or nil for direct connection
+     # @return [Async::HTTP::Internet, Async::HTTP::Client] HTTP connection object
+     # @raise [URI::InvalidURIError] When proxy URL is malformed
+     # @api private
+     #
+     # @example Direct connection
+     #   connection = build_internet_connection(nil)
+     #
+     # @example HTTP proxy
+     #   connection = build_internet_connection('http://proxy.example.com:8080')
+     #
+     # @example SOCKS proxy with authentication
+     #   connection = build_internet_connection('socks5://user:pass@proxy.example.com:1080')
+     def build_internet_connection(proxy = nil)
+       if proxy
+         # Expected format: "http://user:pass@host:port" or "socks5://host:port"
+         uri = URI.parse(proxy)
+         Crawlr.logger.debug "Using proxy: #{uri}"
+         # Async::HTTP::Proxy requires target endpoint
+         endpoint = Async::HTTP::Endpoint.parse(uri.to_s)
+         Async::HTTP::Client.new(endpoint)
+       else
+         Async::HTTP::Internet.new
+       end
+     end
+
+     # Selects the next proxy according to the configured strategy
+     #
+     # Implements proxy rotation strategies to distribute requests across
+     # multiple proxy servers. Maintains state for round-robin selection.
+     #
+     # @return [String, nil] Next proxy URL or nil if no proxies configured
+     # @raise [StandardError] When proxy_strategy is unknown
+     # @api private
+     #
+     # @example Round-robin selection
+     #   proxy = next_proxy # Returns first proxy
+     #   proxy = next_proxy # Returns second proxy
+     #   proxy = next_proxy # Wraps back to first proxy
+     #
+     # @example Random selection
+     #   # config.proxy_strategy = :random
+     #   proxy = next_proxy # Returns random proxy from list
+     def next_proxy
+       return nil if @config.proxies.empty?
+
+       case @config.proxy_strategy
+       when :round_robin
+         proxy = @config.proxies[@proxy_index % @config.proxies.size]
+         @proxy_index += 1
+         proxy
+       when :random
+         @config.proxies.sample
+       else
+         raise "Unknown proxy strategy: #{@config.proxy_strategy}"
+       end
+     end
+
+     # Creates a simplified response struct from the raw HTTP response
+     #
+     # Transforms the async-http response object into a simplified structure
+     # that's easier to work with in the scraping framework. Safely handles
+     # body reading with error recovery.
+     #
+     # @param url [String] The original request URL
+     # @param response [Async::HTTP::Response] Raw async-http response object
+     # @return [HTTPInterface::Response] Simplified response struct
+     # @api private
+     def make_response_struct(url, response)
+       body = begin
+         response.read
+       rescue StandardError
+         nil
+       end
+
+       Response.new(url, response.status, response.headers, response.version, body)
+     end
+
+     # Parses and stores cookies from HTTP response headers
+     #
+     # Extracts Set-Cookie headers from the response and adds them to the
+     # internal cookie jar for use in subsequent requests. Handles multiple
+     # cookies and logs cookie information for debugging.
+     #
+     # @param uri [URI] The request URI for cookie domain/path context
+     # @param response [Async::HTTP::Response] HTTP response containing cookies
+     # @return [void]
+     # @api private
+     #
+     # @example Cookie processing
+     #   # Response contains: Set-Cookie: session_id=abc123; Domain=.example.com; Path=/
+     #   parse_and_set_cookies(uri, response)
+     #   # Cookie is stored and will be sent with future requests to example.com
+     def parse_and_set_cookies(uri, response)
+       set_cookies = response.headers["set-cookie"]
+       Array(set_cookies).each do |set_cookie|
+         HTTP::Cookie.parse(set_cookie.to_s, uri).each do |cookie|
+           @cookie_jar.add(cookie)
+           Crawlr.logger.debug "Received cookie: #{cookie.name}=#{cookie.value};" \
+                               " domain=#{cookie.domain}, path=#{cookie.path}"
+         end
+       end
+     end
+   end
+ end
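
To show how the class above fits together end to end, here is a brief usage sketch. This is illustrative caller code, not part of the package: it assumes the gem is loaded with require "crawlr" and that Crawlr::Config accepts the keyword arguments shown in the YARD examples (timeout:, allow_cookies:, proxies:, proxy_strategy:).

    # Hypothetical caller code; Config arguments are assumed from the examples above.
    require "crawlr"

    config = Crawlr::Config.new(
      timeout: 10,
      allow_cookies: true,
      proxies: ["http://proxy1:8080"],
      proxy_strategy: :round_robin
    )
    http = Crawlr::HTTPInterface.new(config)

    begin
      # The block mirrors the documented request hook: headers can be mutated per request.
      response = http.get("https://example.com") do |_url, headers|
        headers["User-Agent"] = "MyBot/1.0"
      end
      puts "#{response.status} #{response.body&.bytesize} bytes"
    rescue Async::TimeoutError
      warn "request timed out"
    end

The second file in the diff follows.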
@@ -0,0 +1,242 @@
+ # frozen_string_literal: true
+
+ require "nokogiri"
+
+ module Crawlr
+   # Document parsing and callback execution engine.
+   #
+   # The Parser module provides the core document processing functionality for
+   # the Crawlr framework. It efficiently parses HTML and XML content using
+   # Nokogiri and executes registered callbacks on matching elements. The module
+   # optimizes performance by grouping callbacks by document format to minimize
+   # parsing overhead.
+   #
+   # @example Basic callback execution
+   #   content = '<html><body><h1>Title</h1><p>Content</p></body></html>'
+   #
+   #   callbacks = [
+   #     {
+   #       format: :html,
+   #       selector_type: :css,
+   #       selector: 'h1',
+   #       block: ->(node, ctx) { ctx.titles << node.text }
+   #     }
+   #   ]
+   #
+   #   context = OpenStruct.new(titles: [])
+   #   Crawlr::Parser.apply_callbacks(
+   #     content: content,
+   #     callbacks: callbacks,
+   #     context: context
+   #   )
+   #   puts context.titles #=> ["Title"]
+   #
+   # @example Mixed HTML and XML parsing
+   #   callbacks = [
+   #     {
+   #       format: :html,
+   #       selector_type: :css,
+   #       selector: '.product',
+   #       block: ->(node, ctx) { process_html_product(node, ctx) }
+   #     },
+   #     {
+   #       format: :xml,
+   #       selector_type: :xpath,
+   #       selector: '//item[@type="product"]',
+   #       block: ->(node, ctx) { process_xml_product(node, ctx) }
+   #     }
+   #   ]
+   #
+   #   Crawlr::Parser.apply_callbacks(
+   #     content: xml_content,
+   #     callbacks: callbacks,
+   #     context: scraping_context
+   #   )
+   #
+   # @example Performance optimization with format grouping
+   #   # Multiple callbacks for same format - document parsed only once
+   #   callbacks = [
+   #     { format: :html, selector_type: :css, selector: 'title', block: title_proc },
+   #     { format: :html, selector_type: :css, selector: 'meta', block: meta_proc },
+   #     { format: :html, selector_type: :xpath, selector: '//a[@href]', block: link_proc }
+   #   ]
+   #
+   #   # HTML content parsed once, all callbacks executed on same document
+   #   Crawlr::Parser.apply_callbacks(content: html, callbacks: callbacks, context: ctx)
+   #
+   # @author [Your Name]
+   # @since 0.1.0
+   module Parser
+     # Applies registered callbacks to parsed document content
+     #
+     # This method is the main entry point for document processing. It efficiently
+     # handles multiple callbacks by grouping them by document format, ensuring
+     # that each piece of content is parsed only once per format regardless of
+     # how many callbacks are registered for that format.
+     #
+     # The method performs the following operations:
+     # 1. Groups callbacks by document format (:html or :xml)
+     # 2. Parses content once per format using appropriate Nokogiri parser
+     # 3. Executes all callbacks for each format on the parsed document
+     # 4. Extracts matching nodes using CSS or XPath selectors
+     # 5. Calls callback blocks with matched nodes and context
+     #
+     # @param content [String] Raw HTML or XML content to parse
+     # @param callbacks [Array<Hash>] Array of callback configuration hashes
+     # @param context [Object] Context object passed to callback blocks
+     # @option callbacks [Symbol] :format Document format (:html or :xml, defaults to :html)
+     # @option callbacks [Symbol] :selector_type Selector type (:css or :xpath)
+     # @option callbacks [String] :selector CSS or XPath selector string
+     # @option callbacks [Proc] :block Callback block to execute on matching nodes
+     # @return [void]
+     #
+     # @example Single callback execution
+     #   callbacks = [{
+     #     format: :html,
+     #     selector_type: :css,
+     #     selector: '.article-title',
+     #     block: ->(node, ctx) { ctx.titles << node.text.strip }
+     #   }]
+     #
+     #   Crawlr::Parser.apply_callbacks(
+     #     content: html_content,
+     #     callbacks: callbacks,
+     #     context: context_object
+     #   )
+     #
+     # @example Multiple callbacks with different selectors
+     #   callbacks = [
+     #     {
+     #       format: :html,
+     #       selector_type: :css,
+     #       selector: 'h1, h2, h3',
+     #       block: ->(node, ctx) { ctx.headings << { text: node.text, level: node.name } }
+     #     },
+     #     {
+     #       format: :html,
+     #       selector_type: :xpath,
+     #       selector: '//a[@href and text()]',
+     #       block: ->(node, ctx) { ctx.links << { url: node['href'], text: node.text } }
+     #     }
+     #   ]
+     #
+     #   Crawlr::Parser.apply_callbacks(
+     #     content: page_html,
+     #     callbacks: callbacks,
+     #     context: scraping_context
+     #   )
+     #
+     # @example XML feed processing
+     #   callbacks = [{
+     #     format: :xml,
+     #     selector_type: :xpath,
+     #     selector: '//item/title',
+     #     block: ->(node, ctx) { ctx.feed_titles << node.text }
+     #   }]
+     #
+     #   Crawlr::Parser.apply_callbacks(
+     #     content: rss_xml,
+     #     callbacks: callbacks,
+     #     context: feed_context
+     #   )
+     #
+     # @example Complex data extraction
+     #   callbacks = [{
+     #     format: :html,
+     #     selector_type: :css,
+     #     selector: '.product-card',
+     #     block: ->(node, ctx) {
+     #       product = {
+     #         name: node.css('.product-name').text,
+     #         price: node.css('.price').text,
+     #         image: node.css('img')&.first&.[]('src')
+     #       }
+     #       ctx.products << product
+     #     }
+     #   }]
+     #
+     #   Crawlr::Parser.apply_callbacks(
+     #     content: product_page_html,
+     #     callbacks: callbacks,
+     #     context: product_context
+     #   )
+     def self.apply_callbacks(content:, callbacks:, context:)
+       # Group callbacks by format to minimize parsing
+       callbacks_by_format = callbacks.group_by { |cb| cb[:format] || :html }
+
+       callbacks_by_format.each do |format, format_callbacks|
+         doc = parse_content(format, content)
+
+         format_callbacks.each do |callback|
+           Crawlr.logger.debug "Applying callback: #{callback[:selector_type]} #{callback[:selector]}"
+           nodes = extract_nodes(doc, callback[:selector_type], callback[:selector])
+           nodes.each { |node| callback[:block].call(node, context) }
+         end
+       end
+     end
+
+     # Parses content using the appropriate Nokogiri parser
+     #
+     # Creates a Nokogiri document object using either the HTML or XML parser
+     # based on the specified format. The HTML parser is more lenient and
+     # handles malformed markup better, while the XML parser is stricter and
+     # preserves XML-specific features.
+     #
+     # @param format [Symbol] Document format (:html or :xml)
+     # @param content [String] Raw document content to parse
+     # @return [Nokogiri::HTML::Document, Nokogiri::XML::Document] Parsed document
+     # @raise [ArgumentError] When format is not :html or :xml
+     # @api private
+     #
+     # @example HTML parsing
+     #   doc = parse_content(:html, '<html><body>Hello</body></html>')
+     #   doc.class #=> Nokogiri::HTML::Document
+     #
+     # @example XML parsing
+     #   doc = parse_content(:xml, '<?xml version="1.0"?><root><item>data</item></root>')
+     #   doc.class #=> Nokogiri::XML::Document
+     private_class_method def self.parse_content(format, content)
+       case format
+       when :html then Nokogiri::HTML(content)
+       when :xml then Nokogiri::XML(content)
+       else raise ArgumentError, "Unsupported format #{format}"
+       end
+     end
+
+     # Extracts nodes from parsed document using specified selector
+     #
+     # Executes CSS or XPath selectors against the parsed document to find
+     # matching elements. Returns a NodeSet that can be iterated over to
+     # process each matching element.
+     #
+     # @param doc [Nokogiri::HTML::Document, Nokogiri::XML::Document] Parsed document
+     # @param selector_type [Symbol] Type of selector (:css or :xpath)
+     # @param selector [String] Selector expression to find matching nodes
+     # @return [Nokogiri::XML::NodeSet] Collection of matching nodes
+     # @raise [ArgumentError] When selector_type is not :css or :xpath
+     # @api private
+     #
+     # @example CSS selector extraction
+     #   nodes = extract_nodes(doc, :css, '.product-title')
+     #   nodes.each { |node| puts node.text }
+     #
+     # @example XPath selector extraction
+     #   nodes = extract_nodes(doc, :xpath, '//div[@class="content"]//p')
+     #   nodes.each { |node| process_paragraph(node) }
+     #
+     # @example Complex CSS selector
+     #   nodes = extract_nodes(doc, :css, 'article > header h1, article > header h2')
+     #   # Returns all h1 and h2 elements inside headers that are direct children of an article
+     #
+     # @example XPath with attributes
+     #   nodes = extract_nodes(doc, :xpath, '//a[@href and contains(@class, "external")]')
+     #   # Returns all links that have an href attribute and a class containing "external"
+     private_class_method def self.extract_nodes(doc, selector_type, selector)
+       case selector_type
+       when :css then doc.css(selector)
+       when :xpath then doc.xpath(selector)
+       else raise ArgumentError, "Unsupported selector type #{selector_type}"
+       end
+     end
+   end
+ end
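
Finally, a hypothetical end-to-end sketch (not part of the diff) showing the two files working together: HTTPInterface fetches a page and Parser runs a single CSS callback over the body. The Context struct and the Crawlr::Config arguments are assumptions made for illustration only.

    # Hypothetical glue code; assumes Config exposes the timeout: option used above.
    require "crawlr"

    Context = Struct.new(:links)

    http    = Crawlr::HTTPInterface.new(Crawlr::Config.new(timeout: 10))
    context = Context.new([])

    callbacks = [{
      format: :html,
      selector_type: :css,
      selector: "a[href]",
      block: ->(node, ctx) { ctx.links << node["href"] }
    }]

    response = http.get("https://example.com")
    # The body can be nil if reading failed, so guard before parsing.
    if response.body
      Crawlr::Parser.apply_callbacks(content: response.body, callbacks: callbacks, context: context)
    end
    puts context.links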