nous 0.2.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +68 -0
- data/README.md +82 -10
- data/lib/nous/cli.rb +13 -10
- data/lib/nous/command.rb +2 -2
- data/lib/nous/configuration_builder.rb +56 -0
- data/lib/nous/converter.rb +1 -1
- data/lib/nous/crawler/async_page_fetcher.rb +83 -0
- data/lib/nous/crawler/link_extractor.rb +11 -11
- data/lib/nous/crawler/recursive_page_fetcher.rb +103 -0
- data/lib/nous/crawler/redirect_follower.rb +60 -0
- data/lib/nous/crawler/single_page_fetcher.rb +112 -0
- data/lib/nous/crawler/url_filter.rb +6 -6
- data/lib/nous/crawler.rb +15 -70
- data/lib/nous/extractor/default/client.rb +68 -0
- data/lib/nous/extractor/default.rb +10 -6
- data/lib/nous/extractor/jina/client.rb +4 -4
- data/lib/nous/extractor/jina.rb +10 -9
- data/lib/nous/fetcher/extraction_runner.rb +31 -0
- data/lib/nous/fetcher/page_extractor.rb +40 -0
- data/lib/nous/fetcher.rb +38 -11
- data/lib/nous/primitives/configuration.rb +17 -0
- data/lib/nous/primitives/extracted_content.rb +5 -0
- data/lib/nous/primitives/fetch_record.rb +26 -0
- data/lib/nous/primitives/fetch_result.rb +21 -0
- data/lib/nous/primitives/page.rb +5 -0
- data/lib/nous/primitives/url.rb +45 -0
- data/lib/nous/serializer.rb +14 -3
- data/lib/nous/url_resolver.rb +25 -0
- data/lib/nous/version.rb +1 -1
- data/lib/nous.rb +6 -5
- metadata +44 -8
- data/lib/nous/configuration.rb +0 -39
- data/lib/nous/crawler/page_fetcher.rb +0 -47
- data/lib/nous/error.rb +0 -5
- data/lib/nous/extraction_runner.rb +0 -29
- data/lib/nous/extraction_thread.rb +0 -28
- data/lib/nous/extractor.rb +0 -46
- data/lib/nous/page.rb +0 -5
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: c73c21d427c9bb99cc148e089ed5899e7aa9e3ca86a4825540380d41771354d2
|
|
4
|
+
data.tar.gz: 4b361a7aed3c0dfb28a6a650b0813d371622c82b910b8880502281047152a739
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: aacbc4777dc1e5bd66513ddc3bc5a1f667276ac2e89ff781f7b43762133ae640d68bc1191c11b61ee00ac7911a128fcf6ad80653f86d369d284223e830f09120
|
|
7
|
+
data.tar.gz: 90fc8f0cf3c30c6e06bf6aeebd2790539ff80d0d63469a367c4a09b008c9ceed3dedc0d18528e5f6fab3c4a02103b4d2c7c9b3788b1708c6a1a46790d9f5cbab
|
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,67 @@
|
|
|
1
1
|
## [Unreleased]
|
|
2
2
|
|
|
3
|
+
## [0.4.0] - 2026-04-11
|
|
4
|
+
|
|
5
|
+
### Added
|
|
6
|
+
|
|
7
|
+
- **New `details: true` option for `Nous.fetch`** - Returns a `FetchResult` object containing both successful pages and failed fetch/extraction attempts. This enables explicit failure handling without exceptions.
|
|
8
|
+
```ruby
|
|
9
|
+
result = Nous.fetch("https://example.com", details: true)
|
|
10
|
+
result.pages # Array<Page> - successfully extracted
|
|
11
|
+
result.failures # [{requested_url:, error:}, ...]
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
- **Page metadata** - Every extracted page now includes provenance information:
|
|
15
|
+
- `extractor`: Which extractor backend was used (e.g., "Nous::Extractor::Default")
|
|
16
|
+
- `requested_url`: The original URL before any redirects
|
|
17
|
+
- `content_type`: HTTP Content-Type header from the response
|
|
18
|
+
- `redirected`: Boolean indicating if redirects occurred
|
|
19
|
+
|
|
20
|
+
- **FetchRecord internal primitive** - Unified fetch result representation that captures both success and failure cases with full provenance tracking. Replaces the previous `RawPage` which only handled successful fetches.
|
|
21
|
+
|
|
22
|
+
- **Configuration#single_page? helper** - Convenience predicate method for checking if the current configuration is in single-page (non-recursive) mode.
|
|
23
|
+
|
|
24
|
+
### Changed
|
|
25
|
+
|
|
26
|
+
- **Improved title extraction** - Title extraction now uses a fallback chain: readability extracted title → HTML `<title>` tag → first `<h1>` element. This significantly improves title reliability on pages where readability fails to identify the title.
|
|
27
|
+
|
|
28
|
+
- **Reduced aggressive DOM stripping** - The default extractor now preserves more content before readability processing. Previously removed elements (`header`, `img`, `video`, `svg`, `link`) are now retained, providing better context for readability scoring and preserving useful content like captions and bylines.
|
|
29
|
+
|
|
30
|
+
- **Unified fetch contract** - Both single-page and recursive crawling now use the same internal `FetchRecord` structure, ensuring consistent provenance tracking and failure handling across all fetch modes.
|
|
31
|
+
|
|
32
|
+
- **Serializer schema updated** - Both text and JSON output formats now include:
|
|
33
|
+
- `pathname`: URL path component
|
|
34
|
+
- `extractor`: Which extractor processed the page
|
|
35
|
+
- Full metadata object (JSON only)
|
|
36
|
+
|
|
37
|
+
### Fixed
|
|
38
|
+
|
|
39
|
+
- **JSON serialization** - The JSON output now correctly includes the `pathname` field that was documented but missing in previous versions.
|
|
40
|
+
|
|
41
|
+
- **Extraction failure visibility** - Previously, extraction failures were only visible with debug logging enabled. The new `FetchResult` structure makes failures programmatically accessible.
|
|
42
|
+
|
|
43
|
+
### Internal Changes
|
|
44
|
+
|
|
45
|
+
- **Duck-typed extractor interface** - Extractors now receive the full `FetchRecord` object and can access the fields they need (`Default` uses `record.html`, `Jina` uses `record.final_url`).
|
|
46
|
+
|
|
47
|
+
- **Removed `RawPage` primitive** - Superseded by the richer `FetchRecord` which handles both success and failure uniformly.
|
|
48
|
+
|
|
49
|
+
## [0.3.0] - 2026-02-23
|
|
50
|
+
|
|
51
|
+
- Remove `Nous::Error` base hierarchy; colocated errors inherit directly from `StandardError` with descriptive names
|
|
52
|
+
- Move extraction pipeline under `Nous::Fetcher::*` namespace (`ExtractionRunner`, `ExtractionThread`)
|
|
53
|
+
- Move readability command into `Nous::Extractor::Default::Client`, mirroring Jina structure
|
|
54
|
+
- `Nous::Extractor` is now a module namespace (implicit via Zeitwerk), no longer a Command
|
|
55
|
+
- Shared `Extractor::ExtractionError` contract: all extractor backends raise this on failure
|
|
56
|
+
- Pull `seed_url` off `Configuration`; `Crawler` owns URL parsing and validation directly
|
|
57
|
+
- Explicit rescue lists in CLI and extraction thread instead of broad `Nous::Error` rescue
|
|
58
|
+
- Rename `--verbose`/`-v` to `--debug`/`-d`; `-v` is now `--version`
|
|
59
|
+
- Add `Nous::Url`, `Nous::UrlResolver`, and `Crawler::RedirectFollower` to correctly handle redirects and path encoding (including spaces)
|
|
60
|
+
- Add `-r`/`--recursive`; default mode now fetches only the seed page unless recursion is explicitly enabled
|
|
61
|
+
- Split crawler fetchers by mode: `Crawler::AsyncPageFetcher`, `Crawler::RecursivePageFetcher`, and `Crawler::SinglePageFetcher`
|
|
62
|
+
- Move configuration construction to `ConfigurationBuilder` and `Data.define`-based `Configuration` primitive
|
|
63
|
+
- Add `faraday-follow_redirects` for single-page redirect handling and update integration/spec coverage for recursive and single-page flows
|
|
64
|
+
|
|
3
65
|
## [0.2.0] - 2026-02-21
|
|
4
66
|
|
|
5
67
|
- Promote Configuration to module-level singleton (`Nous.configure`, `Nous.configuration`)
|
|
@@ -13,3 +75,9 @@
|
|
|
13
75
|
## [0.1.0] - 2026-02-21
|
|
14
76
|
|
|
15
77
|
- Initial release
|
|
78
|
+
|
|
79
|
+
[Unreleased]: https://github.com/danfrenette/nous/compare/v0.4.0...HEAD
|
|
80
|
+
[0.4.0]: https://github.com/danfrenette/nous/compare/v0.3.0...v0.4.0
|
|
81
|
+
[0.3.0]: https://github.com/danfrenette/nous/compare/v0.2.0...v0.3.0
|
|
82
|
+
[0.2.0]: https://github.com/danfrenette/nous/compare/v0.1.0...v0.2.0
|
|
83
|
+
[0.1.0]: https://github.com/danfrenette/nous/releases/tag/v0.1.0
|
data/README.md
CHANGED
|
@@ -42,8 +42,8 @@ nous https://example.com -s "article.post"
|
|
|
42
42
|
# Use Jina Reader API for JS-rendered sites (Next.js, SPAs)
|
|
43
43
|
nous https://example.com --jina
|
|
44
44
|
|
|
45
|
-
#
|
|
46
|
-
nous https://example.com -
|
|
45
|
+
# Debug logging
|
|
46
|
+
nous https://example.com -d
|
|
47
47
|
```
|
|
48
48
|
|
|
49
49
|
### Options
|
|
@@ -58,17 +58,21 @@ nous https://example.com -v
|
|
|
58
58
|
| `-l`, `--limit N` | Maximum pages to fetch | `100` |
|
|
59
59
|
| `--timeout N` | Per-request timeout in seconds | `15` |
|
|
60
60
|
| `--jina` | Use Jina Reader API for extraction | off |
|
|
61
|
-
| `-v`, `--
|
|
61
|
+
| `-v`, `--version` | Print version and exit | off |
|
|
62
|
+
| `-h`, `--help` | Print usage and exit | off |
|
|
63
|
+
| `-d`, `--debug` | Debug logging to stderr | off |
|
|
62
64
|
|
|
63
65
|
## Ruby API
|
|
64
66
|
|
|
67
|
+
### Basic Usage
|
|
68
|
+
|
|
65
69
|
```ruby
|
|
66
70
|
require "nous"
|
|
67
71
|
|
|
68
72
|
# Fetch pages with the default extractor
|
|
69
73
|
pages = Nous.fetch("https://example.com", limit: 10, concurrency: 3)
|
|
70
74
|
|
|
71
|
-
# Each page is a Nous::Page with title, url, pathname, content
|
|
75
|
+
# Each page is a Nous::Page with title, url, pathname, content, metadata
|
|
72
76
|
pages.each do |page|
|
|
73
77
|
puts "#{page.title} (#{page.url})"
|
|
74
78
|
puts page.content
|
|
@@ -87,11 +91,70 @@ pages = Nous.fetch("https://spa-site.com",
|
|
|
87
91
|
)
|
|
88
92
|
```
|
|
89
93
|
|
|
94
|
+
### Detailed Results
|
|
95
|
+
|
|
96
|
+
Use the `details: true` option to receive full fetch results including failures:
|
|
97
|
+
|
|
98
|
+
```ruby
|
|
99
|
+
result = Nous.fetch("https://example.com", details: true)
|
|
100
|
+
|
|
101
|
+
result.pages # Array<Nous::Page> - successfully extracted pages
|
|
102
|
+
result.failures # Array<{requested_url:, error:}> - failed fetches
|
|
103
|
+
result.total_requested # Integer - total URLs attempted
|
|
104
|
+
result.all_succeeded? # Boolean - true if no failures
|
|
105
|
+
result.any_succeeded? # Boolean - true if at least one page extracted
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
This is useful when you need to handle failures explicitly:
|
|
109
|
+
|
|
110
|
+
```ruby
|
|
111
|
+
result = Nous.fetch("https://example.com/api-docs", details: true)
|
|
112
|
+
|
|
113
|
+
if result.failures.any?
|
|
114
|
+
puts "Failed to fetch:"
|
|
115
|
+
result.failures.each do |failure|
|
|
116
|
+
puts " #{failure[:requested_url]}: #{failure[:error]}"
|
|
117
|
+
end
|
|
118
|
+
end
|
|
119
|
+
|
|
120
|
+
result.pages.each do |page|
|
|
121
|
+
puts "Successfully extracted: #{page.title}"
|
|
122
|
+
end
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
### Page Structure
|
|
126
|
+
|
|
127
|
+
Each extracted page contains:
|
|
128
|
+
|
|
129
|
+
| Field | Type | Description |
|
|
130
|
+
|-------|------|-------------|
|
|
131
|
+
| `title` | String | Page title (fallback chain: readability → `<title>` tag → `<h1>`) |
|
|
132
|
+
| `url` | String | Final URL after redirects |
|
|
133
|
+
| `pathname` | String | URL path component |
|
|
134
|
+
| `content` | String | Extracted content as Markdown |
|
|
135
|
+
| `metadata` | Hash | Provenance information (see below) |
|
|
136
|
+
|
|
137
|
+
### Page Metadata
|
|
138
|
+
|
|
139
|
+
```ruby
|
|
140
|
+
page.metadata # => {
|
|
141
|
+
# extractor: "Nous::Extractor::Default", # Which extractor was used
|
|
142
|
+
# requested_url: "https://example.com/blog", # Original URL before redirects
|
|
143
|
+
# content_type: "text/html; charset=utf-8", # HTTP Content-Type header
|
|
144
|
+
# redirected: true # Whether redirects occurred
|
|
145
|
+
# }
|
|
146
|
+
```
|
|
147
|
+
|
|
90
148
|
## Extraction Backends
|
|
91
149
|
|
|
92
150
|
### Default (ruby-readability)
|
|
93
151
|
|
|
94
|
-
Parses static HTML using [ruby-readability](https://github.com/cantino/ruby-readability), strips noisy elements (
|
|
152
|
+
Parses static HTML using [ruby-readability](https://github.com/cantino/ruby-readability), strips noisy elements (script, style, nav, footer), and converts to Markdown via [reverse_markdown](https://github.com/xijo/reverse_markdown). Fast and requires no external services, but cannot extract content from JS-rendered pages.
|
|
153
|
+
|
|
154
|
+
Title extraction uses a fallback chain:
|
|
155
|
+
1. Readability's extracted title
|
|
156
|
+
2. Original `<title>` tag from HTML
|
|
157
|
+
3. First `<h1>` from extracted content
|
|
95
158
|
|
|
96
159
|
### Jina Reader API
|
|
97
160
|
|
|
@@ -105,13 +168,15 @@ XML-tagged output designed for LLM context windows:
|
|
|
105
168
|
|
|
106
169
|
```xml
|
|
107
170
|
<page>
|
|
108
|
-
<title>Page Title</title>
|
|
109
|
-
<url>https://example.com/page</url>
|
|
110
|
-
<
|
|
171
|
+
<title>Page Title</title>
|
|
172
|
+
<url>https://example.com/page</url>
|
|
173
|
+
<pathname>/page</pathname>
|
|
174
|
+
<extractor>Nous::Extractor::Default</extractor>
|
|
175
|
+
<content>
|
|
111
176
|
# Heading
|
|
112
177
|
|
|
113
178
|
Extracted markdown content...
|
|
114
|
-
</content>
|
|
179
|
+
</content>
|
|
115
180
|
</page>
|
|
116
181
|
```
|
|
117
182
|
|
|
@@ -123,7 +188,13 @@ Extracted markdown content...
|
|
|
123
188
|
"title": "Page Title",
|
|
124
189
|
"url": "https://example.com/page",
|
|
125
190
|
"pathname": "/page",
|
|
126
|
-
"content": "# Heading\n\nExtracted markdown content..."
|
|
191
|
+
"content": "# Heading\n\nExtracted markdown content...",
|
|
192
|
+
"metadata": {
|
|
193
|
+
"extractor": "Nous::Extractor::Default",
|
|
194
|
+
"requested_url": "https://example.com/page",
|
|
195
|
+
"content_type": "text/html; charset=utf-8",
|
|
196
|
+
"redirected": false
|
|
197
|
+
}
|
|
127
198
|
}
|
|
128
199
|
]
|
|
129
200
|
```
|
|
@@ -134,6 +205,7 @@ Extracted markdown content...
|
|
|
134
205
|
bin/setup # Install dependencies
|
|
135
206
|
bundle exec rspec # Run tests
|
|
136
207
|
bundle exec standardrb # Lint
|
|
208
|
+
bundle exec exe/nous # Run the command line in-development
|
|
137
209
|
```
|
|
138
210
|
|
|
139
211
|
## License
|
data/lib/nous/cli.rb
CHANGED
|
@@ -4,7 +4,7 @@ require "optparse"
|
|
|
4
4
|
|
|
5
5
|
module Nous
|
|
6
6
|
class Cli
|
|
7
|
-
class
|
|
7
|
+
class CliError < StandardError; end
|
|
8
8
|
|
|
9
9
|
def initialize(argv)
|
|
10
10
|
@argv = argv
|
|
@@ -18,7 +18,9 @@ module Nous
|
|
|
18
18
|
pages = Nous.fetch(seed_url, **fetch_options)
|
|
19
19
|
output = Nous.serialize(pages, format: options[:format])
|
|
20
20
|
write_output(output)
|
|
21
|
-
rescue
|
|
21
|
+
rescue CliError,
|
|
22
|
+
Fetcher::FetchError,
|
|
23
|
+
Serializer::SerializationError => e
|
|
22
24
|
warn("nous: #{e.message}")
|
|
23
25
|
exit 1
|
|
24
26
|
end
|
|
@@ -32,7 +34,7 @@ module Nous
|
|
|
32
34
|
end
|
|
33
35
|
|
|
34
36
|
def fetch_options
|
|
35
|
-
opts = options.slice(
|
|
37
|
+
opts = options.slice(*Configuration.members)
|
|
36
38
|
opts[:extractor] = extractor
|
|
37
39
|
opts
|
|
38
40
|
end
|
|
@@ -44,7 +46,7 @@ module Nous
|
|
|
44
46
|
end
|
|
45
47
|
|
|
46
48
|
def validate!
|
|
47
|
-
raise
|
|
49
|
+
raise CliError, "no URL provided. Usage: nous <url> [options]" unless seed_url
|
|
48
50
|
end
|
|
49
51
|
|
|
50
52
|
def write_output(output)
|
|
@@ -58,7 +60,7 @@ module Nous
|
|
|
58
60
|
def parse_options!
|
|
59
61
|
parser.parse!(argv)
|
|
60
62
|
rescue OptionParser::InvalidOption => e
|
|
61
|
-
raise
|
|
63
|
+
raise CliError, e.message
|
|
62
64
|
end
|
|
63
65
|
|
|
64
66
|
def parser
|
|
@@ -77,13 +79,14 @@ module Nous
|
|
|
77
79
|
opts.on("-l", "--limit N", Integer, "Maximum pages to fetch") { |v| options[:limit] = v }
|
|
78
80
|
opts.on("--timeout N", Integer, "Per-request timeout in seconds (default: 15)") { |v| options[:timeout] = v }
|
|
79
81
|
opts.on("--jina", "Use Jina Reader API for extraction (handles JS-rendered sites)") { options[:jina] = true }
|
|
80
|
-
opts.on("-
|
|
81
|
-
opts.on("-
|
|
82
|
-
|
|
82
|
+
opts.on("-r", "--recursive", "Follow same-host links recursively") { options[:recursive] = true }
|
|
83
|
+
opts.on("-d", "--debug", "Debug logging to stderr") { options[:debug] = true }
|
|
84
|
+
opts.on("-v", "--version", "Show version") do
|
|
85
|
+
$stdout.puts("nous #{Nous::VERSION}")
|
|
83
86
|
exit
|
|
84
87
|
end
|
|
85
|
-
opts.on("--
|
|
86
|
-
$stdout.puts(
|
|
88
|
+
opts.on("-h", "--help", "Show help") do
|
|
89
|
+
$stdout.puts(opts)
|
|
87
90
|
exit
|
|
88
91
|
end
|
|
89
92
|
end
|
data/lib/nous/command.rb
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
module Nous
|
|
4
4
|
class Command
|
|
5
|
-
class
|
|
5
|
+
class CommandError < StandardError; end
|
|
6
6
|
|
|
7
7
|
class Result
|
|
8
8
|
attr_reader :payload, :error, :metadata
|
|
@@ -27,7 +27,7 @@ module Nous
|
|
|
27
27
|
command = new(...)
|
|
28
28
|
command.call
|
|
29
29
|
rescue => e
|
|
30
|
-
return command.failure(
|
|
30
|
+
return command.failure(CommandError.new("unexpected: #{e.message}")) if command
|
|
31
31
|
|
|
32
32
|
Result.new(success: false, error: e)
|
|
33
33
|
end
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Nous
  # Builds a Configuration from caller-supplied options.
  #
  # Option keys are checked against Configuration.members, missing keys fall
  # back to DEFAULTS, and every value is coerced before being handed to
  # Configuration.new.
  class ConfigurationBuilder
    # Raised when an option key is not a member of Configuration.
    class UnknownOptionError < StandardError; end

    # Values used for any option the caller does not supply. The nested array
    # is frozen too, so the defaults cannot be altered through a reference.
    DEFAULTS = {
      concurrency: 3,
      match: [].freeze,
      limit: 100,
      timeout: 15,
      debug: false,
      keep_query: false,
      recursive: false
    }.freeze

    # Convenience entry point: builds and returns the Configuration.
    #
    # Raises UnknownOptionError for keys Configuration does not declare, and
    # whatever Kernel#Integer raises (ArgumentError/TypeError) when
    # concurrency, limit or timeout cannot be converted.
    def self.call(**options)
      new(options).call
    end

    # options - Hash of option name (Symbol) to raw value.
    def initialize(options)
      @options = options
    end

    # Validates the option keys, then returns a new Configuration built from
    # the coerced values.
    def call
      validate_keys!

      Configuration.new(**coerced_options)
    end

    private

    attr_reader :options

    # Rejects any key that Configuration does not declare as a member.
    def validate_keys!
      unknown = options.keys - Configuration.members
      return if unknown.empty?

      raise UnknownOptionError, "unknown option(s): #{unknown.join(", ")}"
    end

    # Merges the caller's options over DEFAULTS and normalises each value.
    def coerced_options
      merged = DEFAULTS.merge(options)

      {
        concurrency: Integer(merged[:concurrency]).clamp(1, 20),
        # Kernel#Array returns an Array argument unchanged, so copy it: the
        # configuration must not share state with DEFAULTS or with the array
        # the caller passed in.
        match: Array(merged[:match]).dup,
        limit: Integer(merged[:limit]).clamp(1, 10_000),
        timeout: Integer(merged[:timeout]),
        debug: !!merged[:debug],
        keep_query: !!merged[:keep_query],
        recursive: !!merged[:recursive]
      }
    end
  end
end
|
data/lib/nous/converter.rb
CHANGED
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Nous
  class Crawler < Command
    # Fetches one URL and reports the outcome as a FetchRecord, for both the
    # success and the failure case.
    #
    # #fetch calls Async::Task.current, so it must be invoked from inside an
    # Async task.
    class AsyncPageFetcher
      # Media types treated as HTML; matched as substrings of the
      # Content-Type header value.
      HTML_CONTENT_TYPES = %w[text/html application/xhtml+xml].freeze

      # client    - HTTP client handed to RedirectFollower.
      # seed_host - host handed to RedirectFollower together with the client.
      def initialize(client:, seed_host:)
        @client = client
        @seed_host = seed_host
      end

      # Fetches +url+ (following redirects via RedirectFollower) within the
      # configured timeout.
      #
      # url - the URL to fetch; it is compared against final_url.to_s, so a
      #       String is expected.
      #
      # Returns a FetchRecord with ok: true when the final response has
      # status 200 and an HTML content type, otherwise a FetchRecord with
      # ok: false and a short error message. Timeouts and the listed
      # IO/socket errors are converted into failed records instead of raising.
      def fetch(url)
        Async::Task.current.with_timeout(config.timeout) do
          result = RedirectFollower.call(client:, seed_host:, url:)
          return build_failed_record(url, result.error.message) if result.failure?

          response, final_url = result.payload
          content_type = response.headers["content-type"].to_s
          redirected = final_url.to_s != url

          return build_failed_record(url, "status #{response.status}") unless response.status == 200
          return build_failed_record(url, "non-html content") unless html?(content_type)

          build_success_record(
            url: url,
            final_url: final_url.to_s,
            pathname: final_url.path,
            html: response.read,
            content_type: content_type,
            redirected: redirected
          )
        ensure
          # response is still nil when the redirect lookup failed before a
          # payload was unpacked, hence the safe navigation.
          response&.close
        end
      rescue Async::TimeoutError
        build_failed_record(url, "timeout after #{config.timeout}s")
      rescue IOError, SocketError, Errno::ECONNREFUSED => e
        build_failed_record(url, e.message)
      end

      private

      attr_reader :client, :seed_host

      def config
        Nous.configuration
      end

      # True when the Content-Type value names one of HTML_CONTENT_TYPES.
      # Media type names are case-insensitive, so the header value is
      # downcased before matching (e.g. "Text/HTML; charset=UTF-8").
      def html?(content_type)
        normalized = content_type.downcase
        HTML_CONTENT_TYPES.any? { |type| normalized.include?(type) }
      end

      # Builds the FetchRecord for a page that was fetched successfully.
      def build_success_record(url:, final_url:, pathname:, html:, content_type:, redirected:)
        FetchRecord.new(
          requested_url: url,
          final_url: final_url,
          pathname: pathname,
          html: html,
          content_type: content_type,
          ok: true,
          error: nil,
          redirected: redirected
        )
      end

      # Builds the FetchRecord for a failed fetch and, in debug mode, logs the
      # skip to stderr. The record is built first so the log line is only
      # written once the record exists.
      def build_failed_record(url, error)
        FetchRecord.new(
          requested_url: url,
          final_url: nil,
          pathname: Url.new(url).path,
          html: nil,
          content_type: nil,
          ok: false,
          error: error,
          redirected: false
        ).tap do
          warn("[nous] skip #{url}: #{error}") if config.debug?
        end
      end
    end
  end
end
|
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require "nokogiri"
|
|
4
|
+
|
|
3
5
|
module Nous
|
|
4
6
|
class Crawler < Command
|
|
5
7
|
class LinkExtractor
|
|
@@ -8,9 +10,7 @@ module Nous
|
|
|
8
10
|
end
|
|
9
11
|
|
|
10
12
|
def extract(current_url, html)
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
anchors(html).filter_map { |href| resolve(base_uri, href) }.uniq
|
|
13
|
+
anchors(html).filter_map { |href| resolve(current_url, href) }.uniq
|
|
14
14
|
end
|
|
15
15
|
|
|
16
16
|
private
|
|
@@ -21,19 +21,19 @@ module Nous
|
|
|
21
21
|
Nokogiri::HTML(html).css("a[href]").map { |node| node["href"] }
|
|
22
22
|
end
|
|
23
23
|
|
|
24
|
-
def resolve(
|
|
24
|
+
def resolve(current_url, href)
|
|
25
25
|
return unless url_filter.allowed?(href)
|
|
26
26
|
|
|
27
|
-
|
|
28
|
-
return unless
|
|
27
|
+
result = UrlResolver.call(base_url: current_url, href:)
|
|
28
|
+
return unless result.success?
|
|
29
|
+
|
|
30
|
+
url = result.payload
|
|
31
|
+
return unless url_filter.same_host?(url)
|
|
29
32
|
|
|
30
|
-
canonical = url_filter.canonicalize(
|
|
31
|
-
return unless url_filter.matches_path?(
|
|
33
|
+
canonical = url_filter.canonicalize(url)
|
|
34
|
+
return unless url_filter.matches_path?(Url.new(canonical).path)
|
|
32
35
|
|
|
33
36
|
canonical
|
|
34
|
-
rescue URI::InvalidURIError => e
|
|
35
|
-
warn("[nous] malformed href #{href.inspect}: #{e.message}") if Nous.configuration.verbose?
|
|
36
|
-
nil
|
|
37
37
|
end
|
|
38
38
|
end
|
|
39
39
|
end
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "async"
|
|
4
|
+
require "async/http/internet"
|
|
5
|
+
|
|
6
|
+
module Nous
  class Crawler < Command
    # Breadth-style crawler: starting from the seed URL it fetches pages in
    # batches of config.concurrency, collects the successful FetchRecords and
    # enqueues the links extracted from each one until the queue is empty or
    # config.limit records have been collected.
    class RecursivePageFetcher < Command
      # seed_url    - URL the crawl starts from.
      # http_client - optional HTTP client; when nil an
      #               Async::HTTP::Internet is created in #open_connection.
      def initialize(seed_url:, http_client: nil)
        @seed_uri = Url.new(seed_url)
        @http_client = http_client
        @records = []
        @queue = [url_filter.canonicalize(seed_uri)]
        @seen = Set.new(queue)
      end

      # Runs the crawl and returns a success result whose payload is the
      # array of collected FetchRecords.
      def call
        suppress_async_warnings unless config.debug?

        open_connection do |client|
          crawl(client)
        end

        success(payload: records)
      end

      private

      attr_reader :seed_uri, :http_client, :records, :queue, :seen

      def config
        Nous.configuration
      end

      # Drains the queue one batch at a time while the limit allows.
      def crawl(client)
        fetch_and_enqueue(queue.shift(config.concurrency), client) while queue.any? && within_limit?
      end

      # Fetches a batch, keeps the successful records and queues their links.
      #
      # NOTE(review): records with ok == false are skipped here, so they never
      # reach the payload returned by #call — confirm this is intended.
      def fetch_and_enqueue(batch, client)
        fetch_batch(batch, client).each do |record|
          next unless record.ok
          break unless within_limit?

          records << record
          seen << record.final_url
          enqueue_links(record)
        end
      end

      # Fetches every URL of the batch in its own child task and returns the
      # resulting FetchRecords in the same order as +urls+.
      def fetch_batch(urls, client)
        # The fetcher only holds the client and the seed host, so one
        # instance can serve the whole batch.
        fetcher = page_fetcher(client)
        tasks = []

        Async do |task|
          urls.each do |url|
            tasks << task.async { fetcher.fetch(url) }
          end
        end.wait

        tasks.map(&:wait)
      end

      # Queues every link of the page that has not been seen before.
      def enqueue_links(record)
        link_extractor.extract(record.final_url, record.html).each do |url|
          next if seen.include?(url)

          seen << url
          queue << url
        end
      end

      # Only ok records are appended in #fetch_and_enqueue, so this counts
      # every collected record.
      def within_limit?
        records.count(&:ok) < config.limit
      end

      # Yields the HTTP client inside an Async task and closes it afterwards.
      def open_connection
        client = http_client || Async::HTTP::Internet.new

        Async do
          yield client
        ensure
          client.close
        end.wait
      end

      def page_fetcher(client)
        AsyncPageFetcher.new(client:, seed_host: seed_uri.host)
      end

      def url_filter
        @url_filter ||= UrlFilter.new(seed_uri:)
      end

      def link_extractor
        @link_extractor ||= LinkExtractor.new(url_filter:)
      end

      # Raises the Console logger level to :error (used when not in debug mode).
      def suppress_async_warnings
        require "console"
        Console.logger.level = :error
      end
    end
  end
end
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Nous
  class Crawler < Command
    # Performs a GET for a URL and follows HTTP redirects, one hop per
    # command invocation, as long as each target is an http(s) URL on the
    # seed host and the hop budget is not used up.
    #
    # On success the payload is a two-element array: the final response and
    # the Url it was fetched from. The response is returned open.
    class RedirectFollower < Command
      # Wraps every reason a redirect chain could not be followed.
      class RedirectError < StandardError; end

      # Number of redirects followed before giving up.
      MAX_HOPS = 5

      def initialize(client:, seed_host:, url:, hops_remaining: MAX_HOPS)
        @client = client
        @seed_host = seed_host
        @url = url
        @hops_remaining = hops_remaining
      end

      # Fetches the URL; a 3xx response is closed and its Location followed,
      # anything else is handed back as the success payload.
      def call
        response = client.get(url, {})

        if redirect?(response.status)
          response.close
          follow(response.headers["location"])
        else
          success(payload: [response, Url.new(url)])
        end
      end

      private

      attr_reader :client, :seed_host, :url, :hops_remaining

      def redirect?(status)
        (300..399).cover?(status)
      end

      # Resolves the Location value and, when that works, starts a new
      # follower for the target with one hop less.
      def follow(location)
        next_hop = resolve_target(location)

        if next_hop.failure?
          next_hop
        else
          self.class.call(client:, seed_host:, url: next_hop.payload.to_s, hops_remaining: hops_remaining - 1)
        end
      end

      # Returns the UrlResolver result for the redirect target, or a failure
      # result when the Location is missing, the hop budget is exhausted, the
      # target cannot be resolved, or it is not safe to follow.
      def resolve_target(location)
        return redirect_failure("redirect without location from #{url}") unless location
        return redirect_failure("too many redirects from #{url}") if hops_remaining <= 0

        resolution = UrlResolver.call(base_url: url, href: location)
        return redirect_failure(resolution.error.message) if resolution.failure?
        return resolution if safe?(resolution.payload)

        redirect_failure("redirect to #{resolution.payload} outside #{seed_host}")
      end

      # Failure result carrying a RedirectError with the given message.
      def redirect_failure(message)
        failure(RedirectError.new(message))
      end

      def safe?(target)
        target.http? && target.host == seed_host
      end
    end
  end
end
|