nous 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +68 -0
  3. data/README.md +82 -10
  4. data/lib/nous/cli.rb +13 -10
  5. data/lib/nous/command.rb +2 -2
  6. data/lib/nous/configuration_builder.rb +56 -0
  7. data/lib/nous/converter.rb +1 -1
  8. data/lib/nous/crawler/async_page_fetcher.rb +83 -0
  9. data/lib/nous/crawler/link_extractor.rb +11 -11
  10. data/lib/nous/crawler/recursive_page_fetcher.rb +103 -0
  11. data/lib/nous/crawler/redirect_follower.rb +60 -0
  12. data/lib/nous/crawler/single_page_fetcher.rb +112 -0
  13. data/lib/nous/crawler/url_filter.rb +6 -6
  14. data/lib/nous/crawler.rb +15 -70
  15. data/lib/nous/extractor/default/client.rb +68 -0
  16. data/lib/nous/extractor/default.rb +10 -6
  17. data/lib/nous/extractor/jina/client.rb +4 -4
  18. data/lib/nous/extractor/jina.rb +10 -9
  19. data/lib/nous/fetcher/extraction_runner.rb +31 -0
  20. data/lib/nous/fetcher/page_extractor.rb +40 -0
  21. data/lib/nous/fetcher.rb +38 -11
  22. data/lib/nous/primitives/configuration.rb +17 -0
  23. data/lib/nous/primitives/extracted_content.rb +5 -0
  24. data/lib/nous/primitives/fetch_record.rb +26 -0
  25. data/lib/nous/primitives/fetch_result.rb +21 -0
  26. data/lib/nous/primitives/page.rb +5 -0
  27. data/lib/nous/primitives/url.rb +45 -0
  28. data/lib/nous/serializer.rb +14 -3
  29. data/lib/nous/url_resolver.rb +25 -0
  30. data/lib/nous/version.rb +1 -1
  31. data/lib/nous.rb +6 -5
  32. metadata +44 -8
  33. data/lib/nous/configuration.rb +0 -39
  34. data/lib/nous/crawler/page_fetcher.rb +0 -47
  35. data/lib/nous/error.rb +0 -5
  36. data/lib/nous/extraction_runner.rb +0 -29
  37. data/lib/nous/extraction_thread.rb +0 -28
  38. data/lib/nous/extractor.rb +0 -46
  39. data/lib/nous/page.rb +0 -5
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c44bdc52070c6430739f9b0258ea53e3dafc1cff42d87814fd940c2e9e26ee94
4
- data.tar.gz: e4b42ca9917d7e4656f8e8bc2d9b8b328781021c2ed02e9e1912bdb9ce8ac744
3
+ metadata.gz: c73c21d427c9bb99cc148e089ed5899e7aa9e3ca86a4825540380d41771354d2
4
+ data.tar.gz: 4b361a7aed3c0dfb28a6a650b0813d371622c82b910b8880502281047152a739
5
5
  SHA512:
6
- metadata.gz: f55c5122dd9a53611c7045e648c34870f9e423afae6777d0004f0bc909c0b916fd5a8a0350168d286e2e63339be6ae393f9ee02cbe4703d1a392fceaee317fd0
7
- data.tar.gz: fb6bdb6b9c283bc8350a4e697412869e9c1062af0659ad5b56aee6a0cdcad33983f8a15da9a94f3ae37b45b469a5d67a1c5e977d3d2c8b27a59e8f66eeedd59c
6
+ metadata.gz: aacbc4777dc1e5bd66513ddc3bc5a1f667276ac2e89ff781f7b43762133ae640d68bc1191c11b61ee00ac7911a128fcf6ad80653f86d369d284223e830f09120
7
+ data.tar.gz: 90fc8f0cf3c30c6e06bf6aeebd2790539ff80d0d63469a367c4a09b008c9ceed3dedc0d18528e5f6fab3c4a02103b4d2c7c9b3788b1708c6a1a46790d9f5cbab
data/CHANGELOG.md CHANGED
@@ -1,5 +1,67 @@
1
1
  ## [Unreleased]
2
2
 
3
+ ## [0.4.0] - 2026-04-11
4
+
5
+ ### Added
6
+
7
+ - **New `details: true` option for `Nous.fetch`** - Returns a `FetchResult` object containing both successful pages and failed fetch/extraction attempts. This enables explicit failure handling without exceptions.
8
+ ```ruby
9
+ result = Nous.fetch("https://example.com", details: true)
10
+ result.pages # Array<Page> - successfully extracted
11
+ result.failures # [{requested_url:, error:}, ...]
12
+ ```
13
+
14
+ - **Page metadata** - Every extracted page now includes provenance information:
15
+ - `extractor`: Which extractor backend was used (e.g., "Nous::Extractor::Default")
16
+ - `requested_url`: The original URL before any redirects
17
+ - `content_type`: HTTP Content-Type header from the response
18
+ - `redirected`: Boolean indicating if redirects occurred
19
+
20
+ - **FetchRecord internal primitive** - Unified fetch result representation that captures both success and failure cases with full provenance tracking. Replaces the previous `RawPage` which only handled successful fetches.
21
+
22
+ - **Configuration#single_page? helper** - Convenience predicate method for checking if the current configuration is in single-page (non-recursive) mode.
23
+
24
+ ### Changed
25
+
26
+ - **Improved title extraction** - Title extraction now uses a fallback chain: readability extracted title → HTML `<title>` tag → first `<h1>` element. This significantly improves title reliability on pages where readability fails to identify the title.
27
+
28
+ - **Reduced aggressive DOM stripping** - The default extractor now preserves more content before readability processing. Previously removed elements (`header`, `img`, `video`, `svg`, `link`) are now retained, providing better context for readability scoring and preserving useful content like captions and bylines.
29
+
30
+ - **Unified fetch contract** - Both single-page and recursive crawling now use the same internal `FetchRecord` structure, ensuring consistent provenance tracking and failure handling across all fetch modes.
31
+
32
+ - **Serializer schema updated** - Both text and JSON output formats now include:
33
+ - `pathname`: URL path component
34
+ - `extractor`: Which extractor processed the page
35
+ - Full metadata object (JSON only)
36
+
37
+ ### Fixed
38
+
39
+ - **JSON serialization** - The JSON output now correctly includes the `pathname` field that was documented but missing in previous versions.
40
+
41
+ - **Extraction failure visibility** - Previously, extraction failures were only visible with debug logging enabled. The new `FetchResult` structure makes failures programmatically accessible.
42
+
43
+ ### Internal Changes
44
+
45
+ - **Duck-typed extractor interface** - Extractors now receive the full `FetchRecord` object and can access the fields they need (`Default` uses `record.html`, `Jina` uses `record.final_url`).
46
+
47
+ - **Removed `RawPage` primitive** - Superseded by the richer `FetchRecord` which handles both success and failure uniformly.
48
+
49
+ ## [0.3.0] - 2026-02-23
50
+
51
+ - Remove `Nous::Error` base hierarchy; colocated errors inherit directly from `StandardError` with descriptive names
52
+ - Move extraction pipeline under `Nous::Fetcher::*` namespace (`ExtractionRunner`, `ExtractionThread`)
53
+ - Move readability command into `Nous::Extractor::Default::Client`, mirroring Jina structure
54
+ - `Nous::Extractor` is now a module namespace (implicit via Zeitwerk), no longer a Command
55
+ - Shared `Extractor::ExtractionError` contract: all extractor backends raise this on failure
56
+ - Pull `seed_url` off `Configuration`; `Crawler` owns URL parsing and validation directly
57
+ - Explicit rescue lists in CLI and extraction thread instead of broad `Nous::Error` rescue
58
+ - Rename `--verbose`/`-v` to `--debug`/`-d`; `-v` is now `--version`
59
+ - Add `Nous::Url`, `Nous::UrlResolver`, and `Crawler::RedirectFollower` to correctly handle redirects and path encoding (including spaces)
60
+ - Add `-r`/`--recursive`; default mode now fetches only the seed page unless recursion is explicitly enabled
61
+ - Split crawler fetchers by mode: `Crawler::AsyncPageFetcher`, `Crawler::RecursivePageFetcher`, and `Crawler::SinglePageFetcher`
62
+ - Move configuration construction to `ConfigurationBuilder` and `Data.define`-based `Configuration` primitive
63
+ - Add `faraday-follow_redirects` for single-page redirect handling and update integration/spec coverage for recursive and single-page flows
64
+
3
65
  ## [0.2.0] - 2026-02-21
4
66
 
5
67
  - Promote Configuration to module-level singleton (`Nous.configure`, `Nous.configuration`)
@@ -13,3 +75,9 @@
13
75
  ## [0.1.0] - 2026-02-21
14
76
 
15
77
  - Initial release
78
+
79
+ [Unreleased]: https://github.com/danfrenette/nous/compare/v0.4.0...HEAD
80
+ [0.4.0]: https://github.com/danfrenette/nous/compare/v0.3.0...v0.4.0
81
+ [0.3.0]: https://github.com/danfrenette/nous/compare/v0.2.0...v0.3.0
82
+ [0.2.0]: https://github.com/danfrenette/nous/compare/v0.1.0...v0.2.0
83
+ [0.1.0]: https://github.com/danfrenette/nous/releases/tag/v0.1.0
data/README.md CHANGED
@@ -42,8 +42,8 @@ nous https://example.com -s "article.post"
42
42
  # Use Jina Reader API for JS-rendered sites (Next.js, SPAs)
43
43
  nous https://example.com --jina
44
44
 
45
- # Verbose logging
46
- nous https://example.com -v
45
+ # Debug logging
46
+ nous https://example.com -d
47
47
  ```
48
48
 
49
49
  ### Options
@@ -58,17 +58,21 @@ nous https://example.com -v
58
58
  | `-l`, `--limit N` | Maximum pages to fetch | `100` |
59
59
  | `--timeout N` | Per-request timeout in seconds | `15` |
60
60
  | `--jina` | Use Jina Reader API for extraction | off |
61
- | `-v`, `--verbose` | Verbose logging to stderr | off |
61
+ | `-v`, `--version` | Print version and exit | off |
62
+ | `-h`, `--help` | Print usage and exit | off |
63
+ | `-d`, `--debug` | Debug logging to stderr | off |
62
64
 
63
65
  ## Ruby API
64
66
 
67
+ ### Basic Usage
68
+
65
69
  ```ruby
66
70
  require "nous"
67
71
 
68
72
  # Fetch pages with the default extractor
69
73
  pages = Nous.fetch("https://example.com", limit: 10, concurrency: 3)
70
74
 
71
- # Each page is a Nous::Page with title, url, pathname, content
75
+ # Each page is a Nous::Page with title, url, pathname, content, metadata
72
76
  pages.each do |page|
73
77
  puts "#{page.title} (#{page.url})"
74
78
  puts page.content
@@ -87,11 +91,70 @@ pages = Nous.fetch("https://spa-site.com",
87
91
  )
88
92
  ```
89
93
 
94
+ ### Detailed Results
95
+
96
+ Use the `details: true` option to receive full fetch results including failures:
97
+
98
+ ```ruby
99
+ result = Nous.fetch("https://example.com", details: true)
100
+
101
+ result.pages # Array<Nous::Page> - successfully extracted pages
102
+ result.failures # Array<{requested_url:, error:}> - failed fetches
103
+ result.total_requested # Integer - total URLs attempted
104
+ result.all_succeeded? # Boolean - true if no failures
105
+ result.any_succeeded? # Boolean - true if at least one page extracted
106
+ ```
107
+
108
+ This is useful when you need to handle failures explicitly:
109
+
110
+ ```ruby
111
+ result = Nous.fetch("https://example.com/api-docs", details: true)
112
+
113
+ if result.failures.any?
114
+ puts "Failed to fetch:"
115
+ result.failures.each do |failure|
116
+ puts " #{failure[:requested_url]}: #{failure[:error]}"
117
+ end
118
+ end
119
+
120
+ result.pages.each do |page|
121
+ puts "Successfully extracted: #{page.title}"
122
+ end
123
+ ```
124
+
125
+ ### Page Structure
126
+
127
+ Each extracted page contains:
128
+
129
+ | Field | Type | Description |
130
+ |-------|------|-------------|
131
+ | `title` | String | Page title (fallback chain: readability → `<title>` tag → `<h1>`) |
132
+ | `url` | String | Final URL after redirects |
133
+ | `pathname` | String | URL path component |
134
+ | `content` | String | Extracted content as Markdown |
135
+ | `metadata` | Hash | Provenance information (see below) |
136
+
137
+ ### Page Metadata
138
+
139
+ ```ruby
140
+ page.metadata # => {
141
+ # extractor: "Nous::Extractor::Default", # Which extractor was used
142
+ # requested_url: "https://example.com/blog", # Original URL before redirects
143
+ # content_type: "text/html; charset=utf-8", # HTTP Content-Type header
144
+ # redirected: true # Whether redirects occurred
145
+ # }
146
+ ```
147
+
90
148
  ## Extraction Backends
91
149
 
92
150
  ### Default (ruby-readability)
93
151
 
94
- Parses static HTML using [ruby-readability](https://github.com/cantino/ruby-readability), strips noisy elements (nav, footer, script, header), and converts to Markdown via [reverse_markdown](https://github.com/xijo/reverse_markdown). Fast and requires no external services, but cannot extract content from JS-rendered pages.
152
+ Parses static HTML using [ruby-readability](https://github.com/cantino/ruby-readability), strips noisy elements (script, style, nav, footer), and converts to Markdown via [reverse_markdown](https://github.com/xijo/reverse_markdown). Fast and requires no external services, but cannot extract content from JS-rendered pages.
153
+
154
+ Title extraction uses a fallback chain:
155
+ 1. Readability's extracted title
156
+ 2. Original `<title>` tag from HTML
157
+ 3. First `<h1>` from extracted content
95
158
 
96
159
  ### Jina Reader API
97
160
 
@@ -105,13 +168,15 @@ XML-tagged output designed for LLM context windows:
105
168
 
106
169
  ```xml
107
170
  <page>
108
- <title>Page Title</title>
109
- <url>https://example.com/page</url>
110
- <content>
171
+ <title>Page Title</title>
172
+ <url>https://example.com/page</url>
173
+ <pathname>/page</pathname>
174
+ <extractor>Nous::Extractor::Default</extractor>
175
+ <content>
111
176
  # Heading
112
177
 
113
178
  Extracted markdown content...
114
- </content>
179
+ </content>
115
180
  </page>
116
181
  ```
117
182
 
@@ -123,7 +188,13 @@ Extracted markdown content...
123
188
  "title": "Page Title",
124
189
  "url": "https://example.com/page",
125
190
  "pathname": "/page",
126
- "content": "# Heading\n\nExtracted markdown content..."
191
+ "content": "# Heading\n\nExtracted markdown content...",
192
+ "metadata": {
193
+ "extractor": "Nous::Extractor::Default",
194
+ "requested_url": "https://example.com/page",
195
+ "content_type": "text/html; charset=utf-8",
196
+ "redirected": false
197
+ }
127
198
  }
128
199
  ]
129
200
  ```
@@ -134,6 +205,7 @@ Extracted markdown content...
134
205
  bin/setup # Install dependencies
135
206
  bundle exec rspec # Run tests
136
207
  bundle exec standardrb # Lint
208
+ bundle exec exe/nous # Run the command-line tool in development
137
209
  ```
138
210
 
139
211
  ## License
data/lib/nous/cli.rb CHANGED
@@ -4,7 +4,7 @@ require "optparse"
4
4
 
5
5
  module Nous
6
6
  class Cli
7
- class Error < Nous::Error; end
7
+ class CliError < StandardError; end
8
8
 
9
9
  def initialize(argv)
10
10
  @argv = argv
@@ -18,7 +18,9 @@ module Nous
18
18
  pages = Nous.fetch(seed_url, **fetch_options)
19
19
  output = Nous.serialize(pages, format: options[:format])
20
20
  write_output(output)
21
- rescue Nous::Error => e
21
+ rescue CliError,
22
+ Fetcher::FetchError,
23
+ Serializer::SerializationError => e
22
24
  warn("nous: #{e.message}")
23
25
  exit 1
24
26
  end
@@ -32,7 +34,7 @@ module Nous
32
34
  end
33
35
 
34
36
  def fetch_options
35
- opts = options.slice(:concurrency, :match, :limit, :timeout, :verbose)
37
+ opts = options.slice(*Configuration.members)
36
38
  opts[:extractor] = extractor
37
39
  opts
38
40
  end
@@ -44,7 +46,7 @@ module Nous
44
46
  end
45
47
 
46
48
  def validate!
47
- raise Error, "no URL provided. Usage: nous <url> [options]" unless seed_url
49
+ raise CliError, "no URL provided. Usage: nous <url> [options]" unless seed_url
48
50
  end
49
51
 
50
52
  def write_output(output)
@@ -58,7 +60,7 @@ module Nous
58
60
  def parse_options!
59
61
  parser.parse!(argv)
60
62
  rescue OptionParser::InvalidOption => e
61
- raise Error, e.message
63
+ raise CliError, e.message
62
64
  end
63
65
 
64
66
  def parser
@@ -77,13 +79,14 @@ module Nous
77
79
  opts.on("-l", "--limit N", Integer, "Maximum pages to fetch") { |v| options[:limit] = v }
78
80
  opts.on("--timeout N", Integer, "Per-request timeout in seconds (default: 15)") { |v| options[:timeout] = v }
79
81
  opts.on("--jina", "Use Jina Reader API for extraction (handles JS-rendered sites)") { options[:jina] = true }
80
- opts.on("-v", "--verbose", "Verbose logging to stderr") { options[:verbose] = true }
81
- opts.on("-h", "--help", "Show help") do
82
- $stdout.puts(opts)
82
+ opts.on("-r", "--recursive", "Follow same-host links recursively") { options[:recursive] = true }
83
+ opts.on("-d", "--debug", "Debug logging to stderr") { options[:debug] = true }
84
+ opts.on("-v", "--version", "Show version") do
85
+ $stdout.puts("nous #{Nous::VERSION}")
83
86
  exit
84
87
  end
85
- opts.on("--version", "Show version") do
86
- $stdout.puts("nous #{Nous::VERSION}")
88
+ opts.on("-h", "--help", "Show help") do
89
+ $stdout.puts(opts)
87
90
  exit
88
91
  end
89
92
  end
data/lib/nous/command.rb CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  module Nous
4
4
  class Command
5
- class Error < Nous::Error; end
5
+ class CommandError < StandardError; end
6
6
 
7
7
  class Result
8
8
  attr_reader :payload, :error, :metadata
@@ -27,7 +27,7 @@ module Nous
27
27
  command = new(...)
28
28
  command.call
29
29
  rescue => e
30
- return command.failure(Error.new("unexpected: #{e.message}")) if command
30
+ return command.failure(CommandError.new("unexpected: #{e.message}")) if command
31
31
 
32
32
  Result.new(success: false, error: e)
33
33
  end
@@ -0,0 +1,56 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Nous
4
+ class ConfigurationBuilder
5
+ class UnknownOptionError < StandardError; end
6
+
7
+ DEFAULTS = {
8
+ concurrency: 3,
9
+ match: [],
10
+ limit: 100,
11
+ timeout: 15,
12
+ debug: false,
13
+ keep_query: false,
14
+ recursive: false
15
+ }.freeze
16
+
17
+ def self.call(**options)
18
+ new(options).call
19
+ end
20
+
21
+ def initialize(options)
22
+ @options = options
23
+ end
24
+
25
+ def call
26
+ validate_keys!
27
+
28
+ Configuration.new(**coerced_options)
29
+ end
30
+
31
+ private
32
+
33
+ attr_reader :options
34
+
35
+ def validate_keys!
36
+ unknown = options.keys - Configuration.members
37
+ return if unknown.empty?
38
+
39
+ raise UnknownOptionError, "unknown option(s): #{unknown.join(", ")}"
40
+ end
41
+
42
+ def coerced_options
43
+ merged = DEFAULTS.merge(options)
44
+
45
+ {
46
+ concurrency: Integer(merged[:concurrency]).clamp(1, 20),
47
+ match: Array(merged[:match]),
48
+ limit: Integer(merged[:limit]).clamp(1, 10_000),
49
+ timeout: Integer(merged[:timeout]),
50
+ debug: !!merged[:debug],
51
+ keep_query: !!merged[:keep_query],
52
+ recursive: !!merged[:recursive]
53
+ }
54
+ end
55
+ end
56
+ end
@@ -4,7 +4,7 @@ require "reverse_markdown"
4
4
 
5
5
  module Nous
6
6
  class Converter < Command
7
- class Error < Command::Error; end
7
+ class ConversionError < StandardError; end
8
8
 
9
9
  def initialize(html:)
10
10
  @html = html
@@ -0,0 +1,83 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Nous
4
+ class Crawler < Command
5
+ class AsyncPageFetcher
6
+ HTML_CONTENT_TYPES = %w[text/html application/xhtml+xml].freeze
7
+
8
+ def initialize(client:, seed_host:)
9
+ @client = client
10
+ @seed_host = seed_host
11
+ end
12
+
13
+ def fetch(url)
14
+ Async::Task.current.with_timeout(config.timeout) do
15
+ result = RedirectFollower.call(client:, seed_host:, url:)
16
+ return build_failed_record(url, result.error.message) if result.failure?
17
+
18
+ response, final_url = result.payload
19
+ content_type = response.headers["content-type"].to_s
20
+ redirected = final_url.to_s != url
21
+
22
+ return build_failed_record(url, "status #{response.status}") unless response.status == 200
23
+ return build_failed_record(url, "non-html content") unless html?(content_type)
24
+
25
+ build_success_record(
26
+ url: url,
27
+ final_url: final_url.to_s,
28
+ pathname: final_url.path,
29
+ html: response.read,
30
+ content_type: content_type,
31
+ redirected: redirected
32
+ )
33
+ ensure
34
+ response&.close
35
+ end
36
+ rescue Async::TimeoutError
37
+ build_failed_record(url, "timeout after #{config.timeout}s")
38
+ rescue IOError, SocketError, Errno::ECONNREFUSED => e
39
+ build_failed_record(url, e.message)
40
+ end
41
+
42
+ private
43
+
44
+ attr_reader :client, :seed_host
45
+
46
+ def config
47
+ Nous.configuration
48
+ end
49
+
50
+ def html?(content_type)
51
+ HTML_CONTENT_TYPES.any? { |type| content_type.include?(type) }
52
+ end
53
+
54
+ def build_success_record(url:, final_url:, pathname:, html:, content_type:, redirected:)
55
+ FetchRecord.new(
56
+ requested_url: url,
57
+ final_url: final_url,
58
+ pathname: pathname,
59
+ html: html,
60
+ content_type: content_type,
61
+ ok: true,
62
+ error: nil,
63
+ redirected: redirected
64
+ )
65
+ end
66
+
67
+ def build_failed_record(url, error)
68
+ FetchRecord.new(
69
+ requested_url: url,
70
+ final_url: nil,
71
+ pathname: Url.new(url).path,
72
+ html: nil,
73
+ content_type: nil,
74
+ ok: false,
75
+ error: error,
76
+ redirected: false
77
+ ).tap do |record|
78
+ warn("[nous] skip #{url}: #{error}") if config.debug?
79
+ end
80
+ end
81
+ end
82
+ end
83
+ end
@@ -1,5 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require "nokogiri"
4
+
3
5
  module Nous
4
6
  class Crawler < Command
5
7
  class LinkExtractor
@@ -8,9 +10,7 @@ module Nous
8
10
  end
9
11
 
10
12
  def extract(current_url, html)
11
- base_uri = URI.parse(current_url)
12
-
13
- anchors(html).filter_map { |href| resolve(base_uri, href) }.uniq
13
+ anchors(html).filter_map { |href| resolve(current_url, href) }.uniq
14
14
  end
15
15
 
16
16
  private
@@ -21,19 +21,19 @@ module Nous
21
21
  Nokogiri::HTML(html).css("a[href]").map { |node| node["href"] }
22
22
  end
23
23
 
24
- def resolve(base_uri, href)
24
+ def resolve(current_url, href)
25
25
  return unless url_filter.allowed?(href)
26
26
 
27
- uri = URI.join(base_uri, href)
28
- return unless url_filter.same_host?(uri)
27
+ result = UrlResolver.call(base_url: current_url, href:)
28
+ return unless result.success?
29
+
30
+ url = result.payload
31
+ return unless url_filter.same_host?(url)
29
32
 
30
- canonical = url_filter.canonicalize(uri)
31
- return unless url_filter.matches_path?(URI.parse(canonical).path)
33
+ canonical = url_filter.canonicalize(url)
34
+ return unless url_filter.matches_path?(Url.new(canonical).path)
32
35
 
33
36
  canonical
34
- rescue URI::InvalidURIError => e
35
- warn("[nous] malformed href #{href.inspect}: #{e.message}") if Nous.configuration.verbose?
36
- nil
37
37
  end
38
38
  end
39
39
  end
@@ -0,0 +1,103 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "async"
4
+ require "async/http/internet"
5
+
6
+ module Nous
7
+ class Crawler < Command
8
+ class RecursivePageFetcher < Command
9
+ def initialize(seed_url:, http_client: nil)
10
+ @seed_uri = Url.new(seed_url)
11
+ @http_client = http_client
12
+ @records = []
13
+ @queue = [url_filter.canonicalize(seed_uri)]
14
+ @seen = Set.new(queue)
15
+ end
16
+
17
+ def call
18
+ suppress_async_warnings unless config.debug?
19
+
20
+ open_connection do |client|
21
+ crawl(client)
22
+ end
23
+
24
+ success(payload: records)
25
+ end
26
+
27
+ private
28
+
29
+ attr_reader :seed_uri, :http_client, :records, :queue, :seen
30
+
31
+ def config
32
+ Nous.configuration
33
+ end
34
+
35
+ def crawl(client)
36
+ fetch_and_enqueue(queue.shift(config.concurrency), client) while queue.any? && within_limit?
37
+ end
38
+
39
+ def fetch_and_enqueue(batch, client)
40
+ fetch_batch(batch, client).each do |record|
41
+ next unless record.ok
42
+ break unless within_limit?
43
+
44
+ records << record
45
+ seen << record.final_url
46
+ enqueue_links(record)
47
+ end
48
+ end
49
+
50
+ def fetch_batch(urls, client)
51
+ tasks = []
52
+
53
+ Async do |task|
54
+ urls.each do |url|
55
+ tasks << task.async { page_fetcher(client).fetch(url) }
56
+ end
57
+ end.wait
58
+
59
+ tasks.map(&:wait)
60
+ end
61
+
62
+ def enqueue_links(record)
63
+ link_extractor.extract(record.final_url, record.html).each do |url|
64
+ next if seen.include?(url)
65
+
66
+ seen << url
67
+ queue << url
68
+ end
69
+ end
70
+
71
+ def within_limit?
72
+ records.count(&:ok) < config.limit
73
+ end
74
+
75
+ def open_connection
76
+ client = http_client || Async::HTTP::Internet.new
77
+
78
+ Async do
79
+ yield client
80
+ ensure
81
+ client.close
82
+ end.wait
83
+ end
84
+
85
+ def page_fetcher(client)
86
+ AsyncPageFetcher.new(client:, seed_host: seed_uri.host)
87
+ end
88
+
89
+ def url_filter
90
+ @url_filter ||= UrlFilter.new(seed_uri:)
91
+ end
92
+
93
+ def link_extractor
94
+ @link_filter ||= LinkExtractor.new(url_filter:)
95
+ end
96
+
97
+ def suppress_async_warnings
98
+ require "console"
99
+ Console.logger.level = :error
100
+ end
101
+ end
102
+ end
103
+ end
@@ -0,0 +1,60 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Nous
4
+ class Crawler < Command
5
+ class RedirectFollower < Command
6
+ class RedirectError < StandardError; end
7
+
8
+ MAX_HOPS = 5
9
+
10
+ def initialize(client:, seed_host:, url:, hops_remaining: MAX_HOPS)
11
+ @client = client
12
+ @seed_host = seed_host
13
+ @url = url
14
+ @hops_remaining = hops_remaining
15
+ end
16
+
17
+ def call
18
+ response = client.get(url, {})
19
+
20
+ return success(payload: [response, Url.new(url)]) unless redirect?(response.status)
21
+
22
+ response.close
23
+ follow(response.headers["location"])
24
+ end
25
+
26
+ private
27
+
28
+ attr_reader :client, :seed_host, :url, :hops_remaining
29
+
30
+ def redirect?(status)
31
+ (300..399).cover?(status)
32
+ end
33
+
34
+ def follow(location)
35
+ target = resolve_target(location)
36
+ return target if target.failure?
37
+
38
+ self.class.call(client:, seed_host:, url: target.payload.to_s, hops_remaining: hops_remaining - 1)
39
+ end
40
+
41
+ def resolve_target(location)
42
+ return failure(RedirectError.new("redirect without location from #{url}")) unless location
43
+ return failure(RedirectError.new("too many redirects from #{url}")) if hops_remaining <= 0
44
+
45
+ result = UrlResolver.call(base_url: url, href: location)
46
+ return failure(RedirectError.new(result.error.message)) if result.failure?
47
+
48
+ unless safe?(result.payload)
49
+ return failure(RedirectError.new("redirect to #{result.payload} outside #{seed_host}"))
50
+ end
51
+
52
+ result
53
+ end
54
+
55
+ def safe?(target)
56
+ target.http? && target.host == seed_host
57
+ end
58
+ end
59
+ end
60
+ end