nous 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/README.md +6 -3
- data/lib/nous/cli.rb +13 -10
- data/lib/nous/command.rb +2 -2
- data/lib/nous/configuration_builder.rb +56 -0
- data/lib/nous/converter.rb +1 -1
- data/lib/nous/crawler/{page_fetcher.rb → async_page_fetcher.rb} +10 -6
- data/lib/nous/crawler/link_extractor.rb +11 -11
- data/lib/nous/crawler/recursive_page_fetcher.rb +103 -0
- data/lib/nous/crawler/redirect_follower.rb +60 -0
- data/lib/nous/crawler/single_page_fetcher.rb +72 -0
- data/lib/nous/crawler/url_filter.rb +6 -6
- data/lib/nous/crawler.rb +15 -70
- data/lib/nous/extractor/default/client.rb +50 -0
- data/lib/nous/extractor/default.rb +10 -6
- data/lib/nous/extractor/jina/client.rb +4 -4
- data/lib/nous/extractor/jina.rb +10 -9
- data/lib/nous/fetcher/extraction_runner.rb +31 -0
- data/lib/nous/fetcher/page_extractor.rb +34 -0
- data/lib/nous/fetcher.rb +7 -6
- data/lib/nous/primitives/configuration.rb +16 -0
- data/lib/nous/primitives/extracted_content.rb +5 -0
- data/lib/nous/primitives/raw_page.rb +5 -0
- data/lib/nous/primitives/url.rb +45 -0
- data/lib/nous/serializer.rb +5 -2
- data/lib/nous/url_resolver.rb +25 -0
- data/lib/nous/version.rb +1 -1
- data/lib/nous.rb +6 -5
- metadata +43 -8
- data/lib/nous/configuration.rb +0 -39
- data/lib/nous/error.rb +0 -5
- data/lib/nous/extraction_runner.rb +0 -29
- data/lib/nous/extraction_thread.rb +0 -28
- data/lib/nous/extractor.rb +0 -46
- data/lib/nous/{page.rb → primitives/page.rb} +0 -0
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 7636d207654dbf38a64aeec480164c0e57b3c8bf98ac8373e576f692896fb3a3
|
|
4
|
+
data.tar.gz: 62ae3b01ec837d71caf104710c42bde82df6d50e6c7acc50252f2902ef9b2046
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: af52f527a8720d46cd00f3a42814d432730d105aed05ebb2435f1546afb2140bd88fd1e2c6f4e75c0226afd5ef6c9072c6919518bae366047eb022f24b30ffcd
|
|
7
|
+
data.tar.gz: '049b133f406f694771617c34d3adfef2aa64ef3aa5608d89d98230b2f596e4cdef831e53f9ee039aa247a6c588900fc51cfd07a7b456dec3ee524e83abe08b93'
|
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,21 @@
|
|
|
1
1
|
## [Unreleased]
|
|
2
2
|
|
|
3
|
+
## [0.3.0] - 2026-02-23
|
|
4
|
+
|
|
5
|
+
- Remove `Nous::Error` base hierarchy; colocated errors inherit directly from `StandardError` with descriptive names
|
|
6
|
+
- Move extraction pipeline under `Nous::Fetcher::*` namespace (`ExtractionRunner`, `ExtractionThread`)
|
|
7
|
+
- Move readability command into `Nous::Extractor::Default::Client`, mirroring Jina structure
|
|
8
|
+
- `Nous::Extractor` is now a module namespace (implicit via Zeitwerk), no longer a Command
|
|
9
|
+
- Shared `Extractor::ExtractionError` contract: all extractor backends raise this on failure
|
|
10
|
+
- Pull `seed_url` off `Configuration`; `Crawler` owns URL parsing and validation directly
|
|
11
|
+
- Explicit rescue lists in CLI and extraction thread instead of broad `Nous::Error` rescue
|
|
12
|
+
- Rename `--verbose`/`-v` to `--debug`/`-d`; `-v` is now `--version`
|
|
13
|
+
- Add `Nous::Url`, `Nous::UrlResolver`, and `Crawler::RedirectFollower` to correctly handle redirects and path encoding (including spaces)
|
|
14
|
+
- Add `-r`/`--recursive`; default mode now fetches only the seed page unless recursion is explicitly enabled
|
|
15
|
+
- Split crawler fetchers by mode: `Crawler::AsyncPageFetcher`, `Crawler::RecursivePageFetcher`, and `Crawler::SinglePageFetcher`
|
|
16
|
+
- Move configuration construction to `ConfigurationBuilder` and `Data.define`-based `Configuration` primitive
|
|
17
|
+
- Add `faraday-follow_redirects` for single-page redirect handling and update integration/spec coverage for recursive and single-page flows
|
|
18
|
+
|
|
3
19
|
## [0.2.0] - 2026-02-21
|
|
4
20
|
|
|
5
21
|
- Promote Configuration to module-level singleton (`Nous.configure`, `Nous.configuration`)
|
data/README.md
CHANGED
|
@@ -42,8 +42,8 @@ nous https://example.com -s "article.post"
|
|
|
42
42
|
# Use Jina Reader API for JS-rendered sites (Next.js, SPAs)
|
|
43
43
|
nous https://example.com --jina
|
|
44
44
|
|
|
45
|
-
#
|
|
46
|
-
nous https://example.com -
|
|
45
|
+
# Debug logging
|
|
46
|
+
nous https://example.com -d
|
|
47
47
|
```
|
|
48
48
|
|
|
49
49
|
### Options
|
|
@@ -58,7 +58,9 @@ nous https://example.com -v
|
|
|
58
58
|
| `-l`, `--limit N` | Maximum pages to fetch | `100` |
|
|
59
59
|
| `--timeout N` | Per-request timeout in seconds | `15` |
|
|
60
60
|
| `--jina` | Use Jina Reader API for extraction | off |
|
|
61
|
-
| `-v`, `--
|
|
61
|
+
| `-v`, `--version` | Print version and exit | off |
|
|
62
|
+
| `-h`, `--help` | Print usage and exit | off |
|
|
63
|
+
| `-d`, `--debug` | Debug logging to stderr | off |
|
|
62
64
|
|
|
63
65
|
## Ruby API
|
|
64
66
|
|
|
@@ -134,6 +136,7 @@ Extracted markdown content...
|
|
|
134
136
|
bin/setup # Install dependencies
|
|
135
137
|
bundle exec rspec # Run tests
|
|
136
138
|
bundle exec standardrb # Lint
|
|
139
|
+
bundle exec exe/nous # Run the command line in-development
|
|
137
140
|
```
|
|
138
141
|
|
|
139
142
|
## License
|
data/lib/nous/cli.rb
CHANGED
|
@@ -4,7 +4,7 @@ require "optparse"
|
|
|
4
4
|
|
|
5
5
|
module Nous
|
|
6
6
|
class Cli
|
|
7
|
-
class
|
|
7
|
+
class CliError < StandardError; end
|
|
8
8
|
|
|
9
9
|
def initialize(argv)
|
|
10
10
|
@argv = argv
|
|
@@ -18,7 +18,9 @@ module Nous
|
|
|
18
18
|
pages = Nous.fetch(seed_url, **fetch_options)
|
|
19
19
|
output = Nous.serialize(pages, format: options[:format])
|
|
20
20
|
write_output(output)
|
|
21
|
-
rescue
|
|
21
|
+
rescue CliError,
|
|
22
|
+
Fetcher::FetchError,
|
|
23
|
+
Serializer::SerializationError => e
|
|
22
24
|
warn("nous: #{e.message}")
|
|
23
25
|
exit 1
|
|
24
26
|
end
|
|
@@ -32,7 +34,7 @@ module Nous
|
|
|
32
34
|
end
|
|
33
35
|
|
|
34
36
|
def fetch_options
|
|
35
|
-
opts = options.slice(
|
|
37
|
+
opts = options.slice(*Configuration.members)
|
|
36
38
|
opts[:extractor] = extractor
|
|
37
39
|
opts
|
|
38
40
|
end
|
|
@@ -44,7 +46,7 @@ module Nous
|
|
|
44
46
|
end
|
|
45
47
|
|
|
46
48
|
def validate!
|
|
47
|
-
raise
|
|
49
|
+
raise CliError, "no URL provided. Usage: nous <url> [options]" unless seed_url
|
|
48
50
|
end
|
|
49
51
|
|
|
50
52
|
def write_output(output)
|
|
@@ -58,7 +60,7 @@ module Nous
|
|
|
58
60
|
def parse_options!
|
|
59
61
|
parser.parse!(argv)
|
|
60
62
|
rescue OptionParser::InvalidOption => e
|
|
61
|
-
raise
|
|
63
|
+
raise CliError, e.message
|
|
62
64
|
end
|
|
63
65
|
|
|
64
66
|
def parser
|
|
@@ -77,13 +79,14 @@ module Nous
|
|
|
77
79
|
opts.on("-l", "--limit N", Integer, "Maximum pages to fetch") { |v| options[:limit] = v }
|
|
78
80
|
opts.on("--timeout N", Integer, "Per-request timeout in seconds (default: 15)") { |v| options[:timeout] = v }
|
|
79
81
|
opts.on("--jina", "Use Jina Reader API for extraction (handles JS-rendered sites)") { options[:jina] = true }
|
|
80
|
-
opts.on("-
|
|
81
|
-
opts.on("-
|
|
82
|
-
|
|
82
|
+
opts.on("-r", "--recursive", "Follow same-host links recursively") { options[:recursive] = true }
|
|
83
|
+
opts.on("-d", "--debug", "Debug logging to stderr") { options[:debug] = true }
|
|
84
|
+
opts.on("-v", "--version", "Show version") do
|
|
85
|
+
$stdout.puts("nous #{Nous::VERSION}")
|
|
83
86
|
exit
|
|
84
87
|
end
|
|
85
|
-
opts.on("--
|
|
86
|
-
$stdout.puts(
|
|
88
|
+
opts.on("-h", "--help", "Show help") do
|
|
89
|
+
$stdout.puts(opts)
|
|
87
90
|
exit
|
|
88
91
|
end
|
|
89
92
|
end
|
data/lib/nous/command.rb
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
module Nous
|
|
4
4
|
class Command
|
|
5
|
-
class
|
|
5
|
+
class CommandError < StandardError; end
|
|
6
6
|
|
|
7
7
|
class Result
|
|
8
8
|
attr_reader :payload, :error, :metadata
|
|
@@ -27,7 +27,7 @@ module Nous
|
|
|
27
27
|
command = new(...)
|
|
28
28
|
command.call
|
|
29
29
|
rescue => e
|
|
30
|
-
return command.failure(
|
|
30
|
+
return command.failure(CommandError.new("unexpected: #{e.message}")) if command
|
|
31
31
|
|
|
32
32
|
Result.new(success: false, error: e)
|
|
33
33
|
end
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Nous
|
|
4
|
+
class ConfigurationBuilder
|
|
5
|
+
class UnknownOptionError < StandardError; end
|
|
6
|
+
|
|
7
|
+
DEFAULTS = {
|
|
8
|
+
concurrency: 3,
|
|
9
|
+
match: [],
|
|
10
|
+
limit: 100,
|
|
11
|
+
timeout: 15,
|
|
12
|
+
debug: false,
|
|
13
|
+
keep_query: false,
|
|
14
|
+
recursive: false
|
|
15
|
+
}.freeze
|
|
16
|
+
|
|
17
|
+
def self.call(**options)
|
|
18
|
+
new(options).call
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
def initialize(options)
|
|
22
|
+
@options = options
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
def call
|
|
26
|
+
validate_keys!
|
|
27
|
+
|
|
28
|
+
Configuration.new(**coerced_options)
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
private
|
|
32
|
+
|
|
33
|
+
attr_reader :options
|
|
34
|
+
|
|
35
|
+
def validate_keys!
|
|
36
|
+
unknown = options.keys - Configuration.members
|
|
37
|
+
return if unknown.empty?
|
|
38
|
+
|
|
39
|
+
raise UnknownOptionError, "unknown option(s): #{unknown.join(", ")}"
|
|
40
|
+
end
|
|
41
|
+
|
|
42
|
+
def coerced_options
|
|
43
|
+
merged = DEFAULTS.merge(options)
|
|
44
|
+
|
|
45
|
+
{
|
|
46
|
+
concurrency: Integer(merged[:concurrency]).clamp(1, 20),
|
|
47
|
+
match: Array(merged[:match]),
|
|
48
|
+
limit: Integer(merged[:limit]).clamp(1, 10_000),
|
|
49
|
+
timeout: Integer(merged[:timeout]),
|
|
50
|
+
debug: !!merged[:debug],
|
|
51
|
+
keep_query: !!merged[:keep_query],
|
|
52
|
+
recursive: !!merged[:recursive]
|
|
53
|
+
}
|
|
54
|
+
end
|
|
55
|
+
end
|
|
56
|
+
end
|
data/lib/nous/converter.rb
CHANGED
|
@@ -2,20 +2,24 @@
|
|
|
2
2
|
|
|
3
3
|
module Nous
|
|
4
4
|
class Crawler < Command
|
|
5
|
-
class
|
|
5
|
+
class AsyncPageFetcher
|
|
6
6
|
HTML_CONTENT_TYPES = %w[text/html application/xhtml+xml].freeze
|
|
7
7
|
|
|
8
|
-
def initialize(client:)
|
|
8
|
+
def initialize(client:, seed_host:)
|
|
9
9
|
@client = client
|
|
10
|
+
@seed_host = seed_host
|
|
10
11
|
end
|
|
11
12
|
|
|
12
13
|
def fetch(url)
|
|
13
14
|
Async::Task.current.with_timeout(config.timeout) do
|
|
14
|
-
|
|
15
|
+
result = RedirectFollower.call(client:, seed_host:, url:)
|
|
16
|
+
return skip(url, result.error.message) if result.failure?
|
|
17
|
+
|
|
18
|
+
response, final_url = result.payload
|
|
15
19
|
return skip(url, "status #{response.status}") unless response.status == 200
|
|
16
20
|
return skip(url, "non-html content") unless html?(response)
|
|
17
21
|
|
|
18
|
-
|
|
22
|
+
RawPage.new(url: final_url.to_s, pathname: final_url.path, html: response.read)
|
|
19
23
|
ensure
|
|
20
24
|
response&.close
|
|
21
25
|
end
|
|
@@ -27,7 +31,7 @@ module Nous
|
|
|
27
31
|
|
|
28
32
|
private
|
|
29
33
|
|
|
30
|
-
attr_reader :client
|
|
34
|
+
attr_reader :client, :seed_host
|
|
31
35
|
|
|
32
36
|
def config
|
|
33
37
|
Nous.configuration
|
|
@@ -39,7 +43,7 @@ module Nous
|
|
|
39
43
|
end
|
|
40
44
|
|
|
41
45
|
def skip(url, reason)
|
|
42
|
-
warn("[nous] skip #{url}: #{reason}") if config.
|
|
46
|
+
warn("[nous] skip #{url}: #{reason}") if config.debug?
|
|
43
47
|
nil
|
|
44
48
|
end
|
|
45
49
|
end
|
data/lib/nous/crawler/link_extractor.rb
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require "nokogiri"
|
|
4
|
+
|
|
3
5
|
module Nous
|
|
4
6
|
class Crawler < Command
|
|
5
7
|
class LinkExtractor
|
|
@@ -8,9 +10,7 @@ module Nous
|
|
|
8
10
|
end
|
|
9
11
|
|
|
10
12
|
def extract(current_url, html)
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
anchors(html).filter_map { |href| resolve(base_uri, href) }.uniq
|
|
13
|
+
anchors(html).filter_map { |href| resolve(current_url, href) }.uniq
|
|
14
14
|
end
|
|
15
15
|
|
|
16
16
|
private
|
|
@@ -21,19 +21,19 @@ module Nous
|
|
|
21
21
|
Nokogiri::HTML(html).css("a[href]").map { |node| node["href"] }
|
|
22
22
|
end
|
|
23
23
|
|
|
24
|
-
def resolve(
|
|
24
|
+
def resolve(current_url, href)
|
|
25
25
|
return unless url_filter.allowed?(href)
|
|
26
26
|
|
|
27
|
-
|
|
28
|
-
return unless
|
|
27
|
+
result = UrlResolver.call(base_url: current_url, href:)
|
|
28
|
+
return unless result.success?
|
|
29
|
+
|
|
30
|
+
url = result.payload
|
|
31
|
+
return unless url_filter.same_host?(url)
|
|
29
32
|
|
|
30
|
-
canonical = url_filter.canonicalize(
|
|
31
|
-
return unless url_filter.matches_path?(
|
|
33
|
+
canonical = url_filter.canonicalize(url)
|
|
34
|
+
return unless url_filter.matches_path?(Url.new(canonical).path)
|
|
32
35
|
|
|
33
36
|
canonical
|
|
34
|
-
rescue URI::InvalidURIError => e
|
|
35
|
-
warn("[nous] malformed href #{href.inspect}: #{e.message}") if Nous.configuration.verbose?
|
|
36
|
-
nil
|
|
37
37
|
end
|
|
38
38
|
end
|
|
39
39
|
end
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "async"
|
|
4
|
+
require "async/http/internet"
|
|
5
|
+
|
|
6
|
+
module Nous
|
|
7
|
+
class Crawler < Command
|
|
8
|
+
class RecursivePageFetcher < Command
|
|
9
|
+
def initialize(seed_url:, http_client: nil)
|
|
10
|
+
@seed_uri = Url.new(seed_url)
|
|
11
|
+
@http_client = http_client
|
|
12
|
+
@pages = []
|
|
13
|
+
@queue = [url_filter.canonicalize(seed_uri)]
|
|
14
|
+
@seen = Set.new(queue)
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
def call
|
|
18
|
+
suppress_async_warnings unless config.debug?
|
|
19
|
+
|
|
20
|
+
open_connection do |client|
|
|
21
|
+
crawl(client)
|
|
22
|
+
end
|
|
23
|
+
|
|
24
|
+
success(payload: pages)
|
|
25
|
+
end
|
|
26
|
+
|
|
27
|
+
private
|
|
28
|
+
|
|
29
|
+
attr_reader :seed_uri, :http_client, :pages, :queue, :seen
|
|
30
|
+
|
|
31
|
+
def config
|
|
32
|
+
Nous.configuration
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
def crawl(client)
|
|
36
|
+
fetch_and_enqueue(queue.shift(config.concurrency), client) while queue.any? && within_limit?
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
def fetch_and_enqueue(batch, client)
|
|
40
|
+
fetch_batch(batch, client).each do |page|
|
|
41
|
+
next unless page
|
|
42
|
+
break unless within_limit?
|
|
43
|
+
|
|
44
|
+
pages << page
|
|
45
|
+
seen << page.url
|
|
46
|
+
enqueue_links(page)
|
|
47
|
+
end
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
def fetch_batch(urls, client)
|
|
51
|
+
tasks = []
|
|
52
|
+
|
|
53
|
+
Async do |task|
|
|
54
|
+
urls.each do |url|
|
|
55
|
+
tasks << task.async { page_fetcher(client).fetch(url) }
|
|
56
|
+
end
|
|
57
|
+
end.wait
|
|
58
|
+
|
|
59
|
+
tasks.map(&:wait)
|
|
60
|
+
end
|
|
61
|
+
|
|
62
|
+
def enqueue_links(page)
|
|
63
|
+
link_extractor.extract(page.url, page.html).each do |url|
|
|
64
|
+
next if seen.include?(url)
|
|
65
|
+
|
|
66
|
+
seen << url
|
|
67
|
+
queue << url
|
|
68
|
+
end
|
|
69
|
+
end
|
|
70
|
+
|
|
71
|
+
def within_limit?
|
|
72
|
+
pages.length < config.limit
|
|
73
|
+
end
|
|
74
|
+
|
|
75
|
+
def open_connection
|
|
76
|
+
client = http_client || Async::HTTP::Internet.new
|
|
77
|
+
|
|
78
|
+
Async do
|
|
79
|
+
yield client
|
|
80
|
+
ensure
|
|
81
|
+
client.close
|
|
82
|
+
end.wait
|
|
83
|
+
end
|
|
84
|
+
|
|
85
|
+
def page_fetcher(client)
|
|
86
|
+
AsyncPageFetcher.new(client:, seed_host: seed_uri.host)
|
|
87
|
+
end
|
|
88
|
+
|
|
89
|
+
def url_filter
|
|
90
|
+
@url_filter ||= UrlFilter.new(seed_uri:)
|
|
91
|
+
end
|
|
92
|
+
|
|
93
|
+
def link_extractor
|
|
94
|
+
@link_extractor ||= LinkExtractor.new(url_filter:)
|
|
95
|
+
end
|
|
96
|
+
|
|
97
|
+
def suppress_async_warnings
|
|
98
|
+
require "console"
|
|
99
|
+
Console.logger.level = :error
|
|
100
|
+
end
|
|
101
|
+
end
|
|
102
|
+
end
|
|
103
|
+
end
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Nous
|
|
4
|
+
class Crawler < Command
|
|
5
|
+
class RedirectFollower < Command
|
|
6
|
+
class RedirectError < StandardError; end
|
|
7
|
+
|
|
8
|
+
MAX_HOPS = 5
|
|
9
|
+
|
|
10
|
+
def initialize(client:, seed_host:, url:, hops_remaining: MAX_HOPS)
|
|
11
|
+
@client = client
|
|
12
|
+
@seed_host = seed_host
|
|
13
|
+
@url = url
|
|
14
|
+
@hops_remaining = hops_remaining
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
def call
|
|
18
|
+
response = client.get(url, {})
|
|
19
|
+
|
|
20
|
+
return success(payload: [response, Url.new(url)]) unless redirect?(response.status)
|
|
21
|
+
|
|
22
|
+
response.close
|
|
23
|
+
follow(response.headers["location"])
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
private
|
|
27
|
+
|
|
28
|
+
attr_reader :client, :seed_host, :url, :hops_remaining
|
|
29
|
+
|
|
30
|
+
def redirect?(status)
|
|
31
|
+
(300..399).cover?(status)
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
def follow(location)
|
|
35
|
+
target = resolve_target(location)
|
|
36
|
+
return target if target.failure?
|
|
37
|
+
|
|
38
|
+
self.class.call(client:, seed_host:, url: target.payload.to_s, hops_remaining: hops_remaining - 1)
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
def resolve_target(location)
|
|
42
|
+
return failure(RedirectError.new("redirect without location from #{url}")) unless location
|
|
43
|
+
return failure(RedirectError.new("too many redirects from #{url}")) if hops_remaining <= 0
|
|
44
|
+
|
|
45
|
+
result = UrlResolver.call(base_url: url, href: location)
|
|
46
|
+
return failure(RedirectError.new(result.error.message)) if result.failure?
|
|
47
|
+
|
|
48
|
+
unless safe?(result.payload)
|
|
49
|
+
return failure(RedirectError.new("redirect to #{result.payload} outside #{seed_host}"))
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
result
|
|
53
|
+
end
|
|
54
|
+
|
|
55
|
+
def safe?(target)
|
|
56
|
+
target.http? && target.host == seed_host
|
|
57
|
+
end
|
|
58
|
+
end
|
|
59
|
+
end
|
|
60
|
+
end
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "faraday"
|
|
4
|
+
require "faraday/follow_redirects"
|
|
5
|
+
|
|
6
|
+
module Nous
|
|
7
|
+
class Crawler < Command
|
|
8
|
+
class SinglePageFetcher < Command
|
|
9
|
+
class FetchError < StandardError; end
|
|
10
|
+
|
|
11
|
+
HTML_CONTENT_TYPES = %w[text/html application/xhtml+xml].freeze
|
|
12
|
+
MAX_REDIRECTS = 5
|
|
13
|
+
|
|
14
|
+
def initialize(url:, http_client: nil)
|
|
15
|
+
@url = url
|
|
16
|
+
@seed_host = Url.new(url).host
|
|
17
|
+
@connection = http_client || build_connection
|
|
18
|
+
end
|
|
19
|
+
|
|
20
|
+
def call
|
|
21
|
+
response = connection.get(url)
|
|
22
|
+
final_url = resolve_final_url(response)
|
|
23
|
+
|
|
24
|
+
validate_host!(final_url)
|
|
25
|
+
validate_html!(response)
|
|
26
|
+
|
|
27
|
+
raw_page = RawPage.new(url: final_url.to_s, pathname: final_url.path, html: response.body)
|
|
28
|
+
success(payload: [raw_page])
|
|
29
|
+
rescue FetchError => e
|
|
30
|
+
failure(e)
|
|
31
|
+
rescue Faraday::Error => e
|
|
32
|
+
failure(FetchError.new(e.message))
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
private
|
|
36
|
+
|
|
37
|
+
attr_reader :url, :seed_host, :connection
|
|
38
|
+
|
|
39
|
+
def config
|
|
40
|
+
Nous.configuration
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
def resolve_final_url(response)
|
|
44
|
+
location = response.env.url.to_s
|
|
45
|
+
Url.new(location)
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
def validate_host!(final_url)
|
|
49
|
+
return if final_url.host == seed_host
|
|
50
|
+
|
|
51
|
+
raise FetchError, "redirected to #{final_url} outside #{seed_host}"
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
def validate_html!(response)
|
|
55
|
+
content_type = response.headers["content-type"].to_s
|
|
56
|
+
return if HTML_CONTENT_TYPES.any? { |type| content_type.include?(type) }
|
|
57
|
+
|
|
58
|
+
raise FetchError, "non-html content: #{content_type}"
|
|
59
|
+
end
|
|
60
|
+
|
|
61
|
+
def build_connection
|
|
62
|
+
Faraday.new do |f|
|
|
63
|
+
f.response :follow_redirects, limit: MAX_REDIRECTS
|
|
64
|
+
f.response :raise_error
|
|
65
|
+
|
|
66
|
+
f.options.timeout = config.timeout
|
|
67
|
+
f.options.open_timeout = config.timeout
|
|
68
|
+
end
|
|
69
|
+
end
|
|
70
|
+
end
|
|
71
|
+
end
|
|
72
|
+
end
|
data/lib/nous/crawler/url_filter.rb
CHANGED
|
@@ -5,10 +5,10 @@ module Nous
|
|
|
5
5
|
class UrlFilter
|
|
6
6
|
IGNORED_SCHEMES = %w[mailto: javascript: tel:].freeze
|
|
7
7
|
|
|
8
|
-
def initialize(
|
|
9
|
-
@host =
|
|
10
|
-
@match =
|
|
11
|
-
@keep_query =
|
|
8
|
+
def initialize(seed_uri:)
|
|
9
|
+
@host = seed_uri.host
|
|
10
|
+
@match = Nous.configuration.match
|
|
11
|
+
@keep_query = Nous.configuration.keep_query
|
|
12
12
|
end
|
|
13
13
|
|
|
14
14
|
def canonicalize(uri)
|
|
@@ -25,8 +25,8 @@ module Nous
|
|
|
25
25
|
IGNORED_SCHEMES.none? { |s| href.start_with?(s) }
|
|
26
26
|
end
|
|
27
27
|
|
|
28
|
-
def same_host?(
|
|
29
|
-
|
|
28
|
+
def same_host?(url)
|
|
29
|
+
url.http? && url.host == host
|
|
30
30
|
end
|
|
31
31
|
|
|
32
32
|
def matches_path?(path)
|
data/lib/nous/crawler.rb
CHANGED
|
@@ -1,91 +1,36 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
require "async"
|
|
4
|
-
require "async/http/internet"
|
|
5
|
-
require "nokogiri"
|
|
6
|
-
require "uri"
|
|
7
|
-
|
|
8
3
|
module Nous
|
|
9
4
|
class Crawler < Command
|
|
10
|
-
class
|
|
5
|
+
class CrawlError < StandardError; end
|
|
11
6
|
|
|
12
|
-
def initialize(seed_url:)
|
|
7
|
+
def initialize(seed_url:, http_client: nil)
|
|
13
8
|
@seed_url = seed_url
|
|
9
|
+
@http_client = http_client
|
|
10
|
+
parse_seed!
|
|
14
11
|
end
|
|
15
12
|
|
|
16
13
|
def call
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
Async do
|
|
24
|
-
client = Async::HTTP::Internet.new
|
|
25
|
-
begin
|
|
26
|
-
crawl(queue:, seen:, pages:, client:)
|
|
27
|
-
ensure
|
|
28
|
-
client.close
|
|
29
|
-
end
|
|
30
|
-
end.wait
|
|
31
|
-
|
|
32
|
-
success(payload: pages)
|
|
14
|
+
if config.recursive?
|
|
15
|
+
RecursivePageFetcher.call(seed_url:, http_client:)
|
|
16
|
+
else
|
|
17
|
+
SinglePageFetcher.call(url: seed_url, http_client:)
|
|
18
|
+
end
|
|
33
19
|
end
|
|
34
20
|
|
|
35
21
|
private
|
|
36
22
|
|
|
37
|
-
attr_reader :seed_url
|
|
23
|
+
attr_reader :seed_url, :http_client
|
|
38
24
|
|
|
39
25
|
def config
|
|
40
26
|
Nous.configuration
|
|
41
27
|
end
|
|
42
28
|
|
|
43
|
-
def
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
pages << page
|
|
50
|
-
break if pages.length >= config.limit
|
|
51
|
-
|
|
52
|
-
link_extractor.extract(page[:url], page[:html]).each do |url|
|
|
53
|
-
next if seen.include?(url)
|
|
54
|
-
|
|
55
|
-
seen << url
|
|
56
|
-
queue << url
|
|
57
|
-
end
|
|
58
|
-
end
|
|
59
|
-
end
|
|
60
|
-
end
|
|
61
|
-
|
|
62
|
-
def fetch_batch(urls, client)
|
|
63
|
-
tasks = []
|
|
64
|
-
|
|
65
|
-
Async do |task|
|
|
66
|
-
urls.each do |url|
|
|
67
|
-
tasks << task.async { page_fetcher(client).fetch(url) }
|
|
68
|
-
end
|
|
69
|
-
end.wait
|
|
70
|
-
|
|
71
|
-
tasks.map(&:wait)
|
|
72
|
-
end
|
|
73
|
-
|
|
74
|
-
def url_filter
|
|
75
|
-
@url_filter ||= UrlFilter.new(config)
|
|
76
|
-
end
|
|
77
|
-
|
|
78
|
-
def link_extractor
|
|
79
|
-
@link_extractor ||= LinkExtractor.new(url_filter:)
|
|
80
|
-
end
|
|
81
|
-
|
|
82
|
-
def page_fetcher(client)
|
|
83
|
-
PageFetcher.new(client:)
|
|
84
|
-
end
|
|
85
|
-
|
|
86
|
-
def suppress_async_warnings
|
|
87
|
-
require "console"
|
|
88
|
-
Console.logger.level = :error
|
|
29
|
+
def parse_seed!
|
|
30
|
+
parsed = Url.new(seed_url)
|
|
31
|
+
raise CrawlError, "seed URL must be http or https" unless parsed.http?
|
|
32
|
+
rescue ArgumentError => e
|
|
33
|
+
raise CrawlError, "invalid seed URL: #{e.message}"
|
|
89
34
|
end
|
|
90
35
|
end
|
|
91
36
|
end
|