nous 0.2.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +68 -0
- data/README.md +82 -10
- data/lib/nous/cli.rb +13 -10
- data/lib/nous/command.rb +2 -2
- data/lib/nous/configuration_builder.rb +56 -0
- data/lib/nous/converter.rb +1 -1
- data/lib/nous/crawler/async_page_fetcher.rb +83 -0
- data/lib/nous/crawler/link_extractor.rb +11 -11
- data/lib/nous/crawler/recursive_page_fetcher.rb +103 -0
- data/lib/nous/crawler/redirect_follower.rb +60 -0
- data/lib/nous/crawler/single_page_fetcher.rb +112 -0
- data/lib/nous/crawler/url_filter.rb +6 -6
- data/lib/nous/crawler.rb +15 -70
- data/lib/nous/extractor/default/client.rb +68 -0
- data/lib/nous/extractor/default.rb +10 -6
- data/lib/nous/extractor/jina/client.rb +4 -4
- data/lib/nous/extractor/jina.rb +10 -9
- data/lib/nous/fetcher/extraction_runner.rb +31 -0
- data/lib/nous/fetcher/page_extractor.rb +40 -0
- data/lib/nous/fetcher.rb +38 -11
- data/lib/nous/primitives/configuration.rb +17 -0
- data/lib/nous/primitives/extracted_content.rb +5 -0
- data/lib/nous/primitives/fetch_record.rb +26 -0
- data/lib/nous/primitives/fetch_result.rb +21 -0
- data/lib/nous/primitives/page.rb +5 -0
- data/lib/nous/primitives/url.rb +45 -0
- data/lib/nous/serializer.rb +14 -3
- data/lib/nous/url_resolver.rb +25 -0
- data/lib/nous/version.rb +1 -1
- data/lib/nous.rb +6 -5
- metadata +44 -8
- data/lib/nous/configuration.rb +0 -39
- data/lib/nous/crawler/page_fetcher.rb +0 -47
- data/lib/nous/error.rb +0 -5
- data/lib/nous/extraction_runner.rb +0 -29
- data/lib/nous/extraction_thread.rb +0 -28
- data/lib/nous/extractor.rb +0 -46
- data/lib/nous/page.rb +0 -5
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
# frozen_string_literal: true

require "faraday"
require "faraday/follow_redirects"

module Nous
  class Crawler < Command
    # Fetches a single page over HTTP (following up to MAX_REDIRECTS
    # redirects) and wraps the outcome in one FetchRecord. Transport errors
    # and validation failures never escape: they are converted into a failed
    # record, so the caller always receives a successful result whose
    # payload carries the per-record ok/error state.
    class SinglePageFetcher < Command
      class FetchError < StandardError; end

      HTML_CONTENT_TYPES = %w[text/html application/xhtml+xml].freeze
      MAX_REDIRECTS = 5

      # @param url [String] the URL to fetch
      # @param http_client [Faraday::Connection, nil] injectable connection
      #   (handy in tests); defaults to a connection built from configuration
      def initialize(url:, http_client: nil)
        @url = url
        @seed_host = Url.new(url).host
        @connection = http_client || build_connection
      end

      # Always returns success; inspect the record's +ok+/+error+ fields to
      # find out whether the fetch itself worked.
      def call
        response = connection.get(url)
        final_url = resolve_final_url(response)
        content_type = response.headers["content-type"].to_s
        redirected = final_url.to_s != url

        record = build_record(
          final_url: final_url.to_s,
          pathname: final_url.path,
          html: response.body,
          content_type: content_type,
          redirected: redirected
        )

        validate!(record)

        success(payload: [record])
      rescue FetchError, Faraday::Error => e
        # Validation and transport failures previously had two identical
        # rescue branches; they share one handler that degrades to a failed
        # record instead of raising, keeping the crawl loop resilient.
        success(payload: [build_failed_record(error: e.message)])
      end

      private

      attr_reader :url, :seed_host, :connection

      def config
        Nous.configuration
      end

      def build_record(final_url:, pathname:, html:, content_type:, redirected:)
        FetchRecord.new(
          requested_url: url,
          final_url: final_url,
          pathname: pathname,
          html: html,
          content_type: content_type,
          ok: true,
          error: nil,
          redirected: redirected
        )
      end

      def build_failed_record(error:)
        FetchRecord.new(
          requested_url: url,
          final_url: nil,
          pathname: Url.new(url).path,
          html: nil,
          content_type: nil,
          ok: false,
          error: error,
          redirected: false
        )
      end

      def validate!(record)
        validate_host!(record.final_url)
        validate_html!(record.content_type)
      end

      # The follow_redirects middleware rewrites env.url to the last URL
      # requested, so the response env carries the post-redirect location.
      def resolve_final_url(response)
        location = response.env.url.to_s
        Url.new(location)
      end

      # Refuse redirects that escape the seed host.
      def validate_host!(final_url)
        return if Url.new(final_url).host == seed_host

        raise FetchError, "redirected to #{final_url} outside #{seed_host}"
      end

      def validate_html!(content_type)
        return if HTML_CONTENT_TYPES.any? { |type| content_type.include?(type) }

        raise FetchError, "non-html content: #{content_type}"
      end

      def build_connection
        Faraday.new do |f|
          f.response :follow_redirects, limit: MAX_REDIRECTS
          f.response :raise_error

          f.options.timeout = config.timeout
          f.options.open_timeout = config.timeout
        end
      end
    end
  end
end
|
|
@@ -5,10 +5,10 @@ module Nous
|
|
|
5
5
|
class UrlFilter
|
|
6
6
|
IGNORED_SCHEMES = %w[mailto: javascript: tel:].freeze
|
|
7
7
|
|
|
8
|
-
def initialize(
|
|
9
|
-
@host =
|
|
10
|
-
@match =
|
|
11
|
-
@keep_query =
|
|
8
|
+
def initialize(seed_uri:)
|
|
9
|
+
@host = seed_uri.host
|
|
10
|
+
@match = Nous.configuration.match
|
|
11
|
+
@keep_query = Nous.configuration.keep_query
|
|
12
12
|
end
|
|
13
13
|
|
|
14
14
|
def canonicalize(uri)
|
|
@@ -25,8 +25,8 @@ module Nous
|
|
|
25
25
|
IGNORED_SCHEMES.none? { |s| href.start_with?(s) }
|
|
26
26
|
end
|
|
27
27
|
|
|
28
|
-
def same_host?(
|
|
29
|
-
|
|
28
|
+
def same_host?(url)
|
|
29
|
+
url.http? && url.host == host
|
|
30
30
|
end
|
|
31
31
|
|
|
32
32
|
def matches_path?(path)
|
data/lib/nous/crawler.rb
CHANGED
|
@@ -1,91 +1,36 @@
|
|
|
1
1
|
# frozen_string_literal: true

module Nous
  # Entry point for crawling: validates the seed URL up front, then
  # delegates to the recursive or single-page fetcher depending on the
  # current configuration.
  class Crawler < Command
    class CrawlError < StandardError; end

    # @param seed_url [String] starting URL (must be http/https)
    # @param http_client [Object, nil] optional injected HTTP client,
    #   forwarded to the page fetchers
    def initialize(seed_url:, http_client: nil)
      @seed_url = seed_url
      @http_client = http_client
      parse_seed!
    end

    def call
      return RecursivePageFetcher.call(seed_url:, http_client:) if config.recursive?

      SinglePageFetcher.call(url: seed_url, http_client:)
    end

    private

    attr_reader :seed_url, :http_client

    def config
      Nous.configuration
    end

    # Fail fast on malformed or non-HTTP seeds, wrapping parse errors in
    # the crawler's own error type.
    def parse_seed!
      raise CrawlError, "seed URL must be http or https" unless Url.new(seed_url).http?
    rescue ArgumentError => e
      raise CrawlError, "invalid seed URL: #{e.message}"
    end
  end
end
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
# frozen_string_literal: true

require "readability"

module Nous
  module Extractor
    class Default
      # Runs Readability over sanitized HTML and yields {title:, content:}.
      class Client < Command
        class ExtractionError < StandardError; end

        NOISY_TAGS = %w[script style nav footer].freeze

        # @param html [String] raw page HTML
        # @param selector [String, nil] optional CSS selector that scopes
        #   extraction to a subtree of the document
        def initialize(html:, selector: nil)
          @html = html
          @selector = selector
        end

        def call
          readable = ::Readability::Document.new(prepared_html)

          text = Nokogiri::HTML(readable.content).text.strip
          return failure(ExtractionError.new("readability returned no content")) if text.empty?

          success(payload: {title: resolve_title(readable), content: readable.content})
        end

        private

        attr_reader :html, :selector

        # Parse the HTML, remember the document <title> (scoping may remove
        # it), optionally narrow to the selector, then drop noisy tags
        # before Readability runs.
        def prepared_html
          document = Nokogiri::HTML(html)
          original_title(document)
          document = scope(document, selector) if selector
          strip_tags(document)
          document.to_html
        end

        # Memoizes the original <title> text (always a String, possibly "").
        def original_title(document)
          @original_title ||= document.at_css("title")&.text.to_s.strip
        end

        # Narrow the document to the selector's subtree; fall back to the
        # whole document when nothing matches.
        def scope(document, css)
          node = document.at_css(css)
          return document if node.nil?

          Nokogiri::HTML.fragment(node.to_html)
        end

        def strip_tags(document)
          NOISY_TAGS.each { |tag| document.css(tag).each(&:remove) }
        end

        # Title preference order: Readability's title, then the original
        # <title>, then the first <h1> in the extracted content.
        def resolve_title(readable)
          primary = readable.title.to_s.strip
          return primary unless primary.empty?
          return @original_title unless @original_title.empty?

          title_from_content(readable.content)
        end

        def title_from_content(content)
          Nokogiri::HTML(content).at_css("h1")&.text.to_s.strip
        end
      end
    end
  end
end
|
|
@@ -1,17 +1,21 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
module Nous
|
|
4
|
-
|
|
5
|
-
class
|
|
4
|
+
module Extractor
|
|
5
|
+
class ExtractionError < StandardError; end
|
|
6
|
+
|
|
7
|
+
class Default < Command
|
|
6
8
|
def initialize(selector: nil)
|
|
7
9
|
@selector = selector
|
|
8
10
|
end
|
|
9
11
|
|
|
10
|
-
def extract(
|
|
11
|
-
extracted = extract_content(
|
|
12
|
+
def extract(record)
|
|
13
|
+
extracted = extract_content(record.html)
|
|
12
14
|
markdown = convert_to_markdown(extracted[:content])
|
|
13
15
|
|
|
14
|
-
|
|
16
|
+
success(payload: ExtractedContent.new(title: extracted[:title], content: markdown))
|
|
17
|
+
rescue Client::ExtractionError, Converter::ConversionError => e
|
|
18
|
+
failure(ExtractionError.new(e.message))
|
|
15
19
|
end
|
|
16
20
|
|
|
17
21
|
private
|
|
@@ -19,7 +23,7 @@ module Nous
|
|
|
19
23
|
attr_reader :selector
|
|
20
24
|
|
|
21
25
|
def extract_content(html)
|
|
22
|
-
result =
|
|
26
|
+
result = Client.call(html:, selector:)
|
|
23
27
|
raise result.error if result.failure?
|
|
24
28
|
|
|
25
29
|
result.payload
|
|
@@ -5,10 +5,10 @@ require "faraday/retry"
|
|
|
5
5
|
require "json"
|
|
6
6
|
|
|
7
7
|
module Nous
|
|
8
|
-
|
|
8
|
+
module Extractor
|
|
9
9
|
class Jina
|
|
10
10
|
class Client
|
|
11
|
-
class
|
|
11
|
+
class RequestError < StandardError; end
|
|
12
12
|
|
|
13
13
|
BASE_URL = "https://r.jina.ai"
|
|
14
14
|
RETRYABLE_STATUSES = [429, 500, 502, 503, 504].freeze
|
|
@@ -22,7 +22,7 @@ module Nous
|
|
|
22
22
|
response = connection.get("/#{url}")
|
|
23
23
|
parse(response.body)
|
|
24
24
|
rescue Faraday::Error => e
|
|
25
|
-
raise
|
|
25
|
+
raise RequestError, e.message
|
|
26
26
|
end
|
|
27
27
|
|
|
28
28
|
private
|
|
@@ -51,7 +51,7 @@ module Nous
|
|
|
51
51
|
def parse(body)
|
|
52
52
|
JSON.parse(body)
|
|
53
53
|
rescue JSON::ParserError => e
|
|
54
|
-
raise
|
|
54
|
+
raise RequestError, "invalid JSON: #{e.message}"
|
|
55
55
|
end
|
|
56
56
|
end
|
|
57
57
|
end
|
data/lib/nous/extractor/jina.rb
CHANGED
|
@@ -1,20 +1,21 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
module Nous
|
|
4
|
-
|
|
5
|
-
class Jina
|
|
6
|
-
class Error < Nous::Error; end
|
|
7
|
-
|
|
4
|
+
module Extractor
|
|
5
|
+
class Jina < Command
|
|
8
6
|
def initialize(api_key: nil, timeout: 30, **client_options)
|
|
9
7
|
@client = Client.new(api_key: api_key || ENV["JINA_API_KEY"], timeout:, **client_options)
|
|
10
8
|
end
|
|
11
9
|
|
|
12
|
-
def extract(
|
|
13
|
-
body = client.get(
|
|
10
|
+
def extract(record)
|
|
11
|
+
body = client.get(record.final_url)
|
|
14
12
|
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
13
|
+
success(payload: ExtractedContent.new(
|
|
14
|
+
title: body.dig("data", "title") || "",
|
|
15
|
+
content: body.dig("data", "content") || ""
|
|
16
|
+
))
|
|
17
|
+
rescue Client::RequestError => e
|
|
18
|
+
failure(ExtractionError.new(e.message))
|
|
18
19
|
end
|
|
19
20
|
|
|
20
21
|
private
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# frozen_string_literal: true

module Nous
  class Fetcher < Command
    # Runs the extractor over fetch records in concurrent batches and
    # collects the pages from the successful extractions.
    class ExtractionRunner < Command
      class ExtractionError < StandardError; end

      # @param records [Array] fetch records to extract from
      # @param extractor [#extract] extraction backend
      def initialize(records:, extractor:)
        @records = records
        @extractor = extractor
      end

      # Slices the records by the configured concurrency and processes each
      # slice with one thread per record; failed extractions are dropped.
      def call
        pages = records
                .each_slice(Nous.configuration.concurrency)
                .flat_map { |batch| extract_batch(batch) }

        success(payload: pages)
      end

      private

      attr_reader :records, :extractor

      # Thread#value joins the thread and re-raises anything it raised.
      def extract_batch(batch)
        batch
          .map { |record| Thread.new { PageExtractor.call(extractor:, record:) } }
          .map(&:value)
          .select(&:success?)
          .map(&:payload)
      end
    end
  end
end
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
# frozen_string_literal: true

module Nous
  class Fetcher < Command
    # Turns one fetch record into a Page by running the extractor over it,
    # attaching provenance metadata to the result.
    class PageExtractor < Command
      # @param extractor [#extract] extraction backend
      # @param record [FetchRecord] a successfully fetched record
      def initialize(extractor:, record:)
        @extractor = extractor
        @record = record
      end

      def call
        result = extractor.extract(record)
        return handle_failure(result) unless result.success?

        success(payload: build_page(result.payload))
      end

      private

      attr_reader :extractor, :record

      # Log the skip when debugging, then propagate the extractor's error.
      def handle_failure(result)
        warn("[nous] extract skip #{record.final_url}: #{result.error.message}") if Nous.configuration.debug?
        failure(result.error)
      end

      def build_page(extracted)
        Page.new(
          title: extracted.title,
          url: record.final_url,
          pathname: record.pathname,
          content: extracted.content,
          metadata: {
            extractor: extractor.class.name,
            requested_url: record.requested_url,
            content_type: record.content_type,
            redirected: record.redirected
          }
        )
      end
    end
  end
end
|
data/lib/nous/fetcher.rb
CHANGED
|
@@ -2,35 +2,62 @@
|
|
|
2
2
|
|
|
3
3
|
module Nous
  # Orchestrates crawl + extraction: crawls from the seed URL, extracts
  # pages from the successful fetch records, and returns either the bare
  # pages or a detailed FetchResult including failure information.
  class Fetcher < Command
    class FetchError < StandardError; end

    # @param seed_url [String] starting URL
    # @param extractor [#extract] extraction backend (Readability by default)
    # @param http_client [Object, nil] optional injected HTTP client
    # @param details [Boolean] when true, the payload is a FetchResult
    #   instead of a plain array of pages
    def initialize(seed_url:, extractor: Extractor::Default.new, http_client: nil, details: false)
      @seed_url = seed_url
      @extractor = extractor
      @http_client = http_client
      @single_page = Nous.configuration.single_page?
      @details = details
    end

    # @raise [FetchError] when a single-page fetch (without details) fails,
    #   or when crawling/extraction itself errors out
    def call
      records = crawl
      succeeded, failed = records.partition(&:ok)

      if single_page && !details && succeeded.empty?
        raise FetchError, failed.first&.error || "fetch failed"
      end

      pages = extract(succeeded)
      return success(payload: pages) unless details

      success(payload: FetchResult.new(
        pages: pages,
        failures: build_failures(failed),
        total_requested: records.length
      ))
    end

    private

    attr_reader :seed_url, :extractor, :http_client, :single_page, :details

    def crawl
      unwrap(Crawler.call(seed_url:, http_client:))
    end

    def extract(records)
      unwrap(ExtractionRunner.call(records:, extractor:))
    end

    # Raise on a failed command result, otherwise hand back its payload.
    def unwrap(result)
      raise FetchError, result.error.message if result.failure?

      result.payload
    end

    def build_failures(records)
      records.map { |record| {requested_url: record.requested_url, error: record.error} }
    end
  end
end
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# frozen_string_literal: true

module Nous
  # Immutable runtime settings for a crawl/fetch run. Member order matters:
  # Data instances may be constructed positionally.
  Configuration = Data.define(
    :concurrency,
    :match,
    :limit,
    :timeout,
    :debug,
    :keep_query,
    :recursive
  ) do
    # Whether verbose diagnostics are enabled.
    def debug?
      debug
    end

    # Whether the crawl follows links beyond the seed page.
    def recursive?
      recursive
    end

    # Opposite of #recursive?.
    def single_page?
      !recursive
    end
  end
end
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# frozen_string_literal: true

module Nous
  # Outcome of fetching one URL. +ok+ is true for a successful fetch; a
  # failed fetch carries +error+ and leaves the content fields nil.
  FetchRecord = Data.define(
    :requested_url,
    :final_url,
    :pathname,
    :html,
    :content_type,
    :ok,
    :error,
    :redirected
  ) do
    # Only +requested_url+ and +pathname+ are required; everything else
    # defaults to the "nothing fetched / clean" state.
    def initialize(requested_url:, pathname:, final_url: nil, html: nil,
                   content_type: nil, ok: true, error: nil, redirected: false)
      super
    end
  end
end
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# frozen_string_literal: true

module Nous
  # Aggregated outcome of a fetch run: extracted pages plus a summary of
  # the fetches that failed.
  FetchResult = Data.define(:pages, :failures, :total_requested) do
    # Number of successfully extracted pages.
    def succeeded = pages.length

    # Number of failed fetches.
    def failed = failures.length

    def all_succeeded? = failures.empty?

    def any_succeeded? = pages.any?
  end
end
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# frozen_string_literal: true

require "addressable/uri"

module Nous
  # Normalized URL value object wrapping Addressable::URI. Two Urls are
  # equal when their normalized string forms match, so instances work as
  # Hash keys and Set members.
  class Url
    # @param raw [#to_s] URL-ish input; stripped and normalized on parse
    # @raise [ArgumentError] when the input is blank or unparseable
    def initialize(raw)
      @uri = Addressable::URI.parse(raw.to_s.strip).normalize
      raise ArgumentError, "invalid URL: #{raw}" if uri.to_s.empty?
    rescue Addressable::URI::InvalidURIError => e
      raise ArgumentError, "invalid URL: #{e.message}"
    end

    def host
      uri.host
    end

    # Empty paths normalize to "/" so path comparisons stay stable.
    def path
      uri.path.empty? ? "/" : uri.path
    end

    def http?
      uri.scheme == "http" || uri.scheme == "https"
    end

    def to_s
      uri.to_s
    end

    def ==(other)
      other.is_a?(Url) && to_s == other.to_s
    end
    alias_method :eql?, :==

    def hash
      to_s.hash
    end

    private

    attr_reader :uri
  end
end
|