nous 0.2.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +68 -0
- data/README.md +82 -10
- data/lib/nous/cli.rb +13 -10
- data/lib/nous/command.rb +2 -2
- data/lib/nous/configuration_builder.rb +56 -0
- data/lib/nous/converter.rb +1 -1
- data/lib/nous/crawler/async_page_fetcher.rb +83 -0
- data/lib/nous/crawler/link_extractor.rb +11 -11
- data/lib/nous/crawler/recursive_page_fetcher.rb +103 -0
- data/lib/nous/crawler/redirect_follower.rb +60 -0
- data/lib/nous/crawler/single_page_fetcher.rb +112 -0
- data/lib/nous/crawler/url_filter.rb +6 -6
- data/lib/nous/crawler.rb +15 -70
- data/lib/nous/extractor/default/client.rb +68 -0
- data/lib/nous/extractor/default.rb +10 -6
- data/lib/nous/extractor/jina/client.rb +4 -4
- data/lib/nous/extractor/jina.rb +10 -9
- data/lib/nous/fetcher/extraction_runner.rb +31 -0
- data/lib/nous/fetcher/page_extractor.rb +40 -0
- data/lib/nous/fetcher.rb +38 -11
- data/lib/nous/primitives/configuration.rb +17 -0
- data/lib/nous/primitives/extracted_content.rb +5 -0
- data/lib/nous/primitives/fetch_record.rb +26 -0
- data/lib/nous/primitives/fetch_result.rb +21 -0
- data/lib/nous/primitives/page.rb +5 -0
- data/lib/nous/primitives/url.rb +45 -0
- data/lib/nous/serializer.rb +14 -3
- data/lib/nous/url_resolver.rb +25 -0
- data/lib/nous/version.rb +1 -1
- data/lib/nous.rb +6 -5
- metadata +44 -8
- data/lib/nous/configuration.rb +0 -39
- data/lib/nous/crawler/page_fetcher.rb +0 -47
- data/lib/nous/error.rb +0 -5
- data/lib/nous/extraction_runner.rb +0 -29
- data/lib/nous/extraction_thread.rb +0 -28
- data/lib/nous/extractor.rb +0 -46
- data/lib/nous/page.rb +0 -5
data/lib/nous/serializer.rb
CHANGED
|
@@ -4,7 +4,7 @@ require "json"
|
|
|
4
4
|
|
|
5
5
|
module Nous
|
|
6
6
|
class Serializer < Command
|
|
7
|
-
class
|
|
7
|
+
class SerializationError < StandardError; end
|
|
8
8
|
|
|
9
9
|
FORMATS = %i[text json].freeze
|
|
10
10
|
|
|
@@ -24,7 +24,10 @@ module Nous
|
|
|
24
24
|
attr_reader :pages, :format
|
|
25
25
|
|
|
26
26
|
def validate_format!
|
|
27
|
-
|
|
27
|
+
return if FORMATS.include?(format)
|
|
28
|
+
|
|
29
|
+
raise SerializationError,
|
|
30
|
+
"unknown format: #{format}. Must be one of: #{FORMATS.join(", ")}"
|
|
28
31
|
end
|
|
29
32
|
|
|
30
33
|
def serialize_text
|
|
@@ -40,6 +43,8 @@ module Nous
|
|
|
40
43
|
<page>
|
|
41
44
|
<title>#{page.title}</title>
|
|
42
45
|
<url>#{page.url}</url>
|
|
46
|
+
<pathname>#{page.pathname}</pathname>
|
|
47
|
+
<extractor>#{page.metadata[:extractor]}</extractor>
|
|
43
48
|
<content>
|
|
44
49
|
#{page.content}
|
|
45
50
|
</content>
|
|
@@ -48,7 +53,13 @@ module Nous
|
|
|
48
53
|
end
|
|
49
54
|
|
|
50
55
|
def json_page(page)
|
|
51
|
-
{
|
|
56
|
+
{
|
|
57
|
+
title: page.title,
|
|
58
|
+
url: page.url,
|
|
59
|
+
pathname: page.pathname,
|
|
60
|
+
content: page.content,
|
|
61
|
+
metadata: page.metadata
|
|
62
|
+
}
|
|
52
63
|
end
|
|
53
64
|
end
|
|
54
65
|
end
|
data/lib/nous/url_resolver.rb
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "addressable/uri"
|
|
4
|
+
|
|
5
|
+
module Nous
|
|
6
|
+
class UrlResolver < Command
|
|
7
|
+
class ResolutionError < StandardError; end
|
|
8
|
+
|
|
9
|
+
def initialize(base_url:, href:)
|
|
10
|
+
@base_uri = Addressable::URI.parse(base_url.to_s)
|
|
11
|
+
@href = href.to_s.strip
|
|
12
|
+
end
|
|
13
|
+
|
|
14
|
+
def call
|
|
15
|
+
joined = base_uri.join(href)
|
|
16
|
+
success(payload: Url.new(joined))
|
|
17
|
+
rescue Addressable::URI::InvalidURIError => e
|
|
18
|
+
failure(ResolutionError.new("cannot resolve #{href.inspect}: #{e.message}"))
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
private
|
|
22
|
+
|
|
23
|
+
attr_reader :base_uri, :href
|
|
24
|
+
end
|
|
25
|
+
end
|
data/lib/nous/version.rb
CHANGED
data/lib/nous.rb
CHANGED
|
@@ -3,24 +3,25 @@
|
|
|
3
3
|
require "zeitwerk"
|
|
4
4
|
|
|
5
5
|
loader = Zeitwerk::Loader.for_gem
|
|
6
|
+
loader.collapse("#{__dir__}/nous/primitives")
|
|
6
7
|
loader.setup
|
|
7
8
|
|
|
8
9
|
module Nous
|
|
9
10
|
class << self
|
|
10
11
|
attr_reader :configuration
|
|
11
12
|
|
|
12
|
-
def configure(
|
|
13
|
-
@configuration =
|
|
13
|
+
def configure(...)
|
|
14
|
+
@configuration = ConfigurationBuilder.call(...)
|
|
14
15
|
end
|
|
15
16
|
|
|
16
17
|
def reset_configuration!
|
|
17
18
|
@configuration = nil
|
|
18
19
|
end
|
|
19
20
|
|
|
20
|
-
def fetch(seed_url, extractor: Extractor::Default.new, **options)
|
|
21
|
-
configure(
|
|
21
|
+
def fetch(seed_url, extractor: Extractor::Default.new, http_client: nil, details: false, **options)
|
|
22
|
+
configure(**options)
|
|
22
23
|
|
|
23
|
-
result = Fetcher.call(seed_url:, extractor:)
|
|
24
|
+
result = Fetcher.call(seed_url:, extractor:, http_client:, details:)
|
|
24
25
|
raise result.error if result.failure?
|
|
25
26
|
|
|
26
27
|
result.payload
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: nous
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.2.0
|
|
4
|
+
version: 0.4.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Dan Frenette
|
|
@@ -9,6 +9,20 @@ bindir: exe
|
|
|
9
9
|
cert_chain: []
|
|
10
10
|
date: 1980-01-02 00:00:00.000000000 Z
|
|
11
11
|
dependencies:
|
|
12
|
+
- !ruby/object:Gem::Dependency
|
|
13
|
+
name: addressable
|
|
14
|
+
requirement: !ruby/object:Gem::Requirement
|
|
15
|
+
requirements:
|
|
16
|
+
- - "~>"
|
|
17
|
+
- !ruby/object:Gem::Version
|
|
18
|
+
version: '2.8'
|
|
19
|
+
type: :runtime
|
|
20
|
+
prerelease: false
|
|
21
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
22
|
+
requirements:
|
|
23
|
+
- - "~>"
|
|
24
|
+
- !ruby/object:Gem::Version
|
|
25
|
+
version: '2.8'
|
|
12
26
|
- !ruby/object:Gem::Dependency
|
|
13
27
|
name: async
|
|
14
28
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -51,6 +65,20 @@ dependencies:
|
|
|
51
65
|
- - "~>"
|
|
52
66
|
- !ruby/object:Gem::Version
|
|
53
67
|
version: '2.12'
|
|
68
|
+
- !ruby/object:Gem::Dependency
|
|
69
|
+
name: faraday-follow_redirects
|
|
70
|
+
requirement: !ruby/object:Gem::Requirement
|
|
71
|
+
requirements:
|
|
72
|
+
- - "~>"
|
|
73
|
+
- !ruby/object:Gem::Version
|
|
74
|
+
version: '0.5'
|
|
75
|
+
type: :runtime
|
|
76
|
+
prerelease: false
|
|
77
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
78
|
+
requirements:
|
|
79
|
+
- - "~>"
|
|
80
|
+
- !ruby/object:Gem::Version
|
|
81
|
+
version: '0.5'
|
|
54
82
|
- !ruby/object:Gem::Dependency
|
|
55
83
|
name: faraday-retry
|
|
56
84
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -197,22 +225,30 @@ files:
|
|
|
197
225
|
- lib/nous.rb
|
|
198
226
|
- lib/nous/cli.rb
|
|
199
227
|
- lib/nous/command.rb
|
|
200
|
-
- lib/nous/configuration.rb
|
|
228
|
+
- lib/nous/configuration_builder.rb
|
|
201
229
|
- lib/nous/converter.rb
|
|
202
230
|
- lib/nous/crawler.rb
|
|
231
|
+
- lib/nous/crawler/async_page_fetcher.rb
|
|
203
232
|
- lib/nous/crawler/link_extractor.rb
|
|
204
|
-
- lib/nous/crawler/page_fetcher.rb
|
|
233
|
+
- lib/nous/crawler/recursive_page_fetcher.rb
|
|
234
|
+
- lib/nous/crawler/redirect_follower.rb
|
|
235
|
+
- lib/nous/crawler/single_page_fetcher.rb
|
|
205
236
|
- lib/nous/crawler/url_filter.rb
|
|
206
|
-
- lib/nous/error.rb
|
|
207
|
-
- lib/nous/extraction_runner.rb
|
|
208
|
-
- lib/nous/extraction_thread.rb
|
|
209
|
-
- lib/nous/extractor.rb
|
|
210
237
|
- lib/nous/extractor/default.rb
|
|
238
|
+
- lib/nous/extractor/default/client.rb
|
|
211
239
|
- lib/nous/extractor/jina.rb
|
|
212
240
|
- lib/nous/extractor/jina/client.rb
|
|
213
241
|
- lib/nous/fetcher.rb
|
|
214
|
-
- lib/nous/page.rb
|
|
242
|
+
- lib/nous/fetcher/extraction_runner.rb
|
|
243
|
+
- lib/nous/fetcher/page_extractor.rb
|
|
244
|
+
- lib/nous/primitives/configuration.rb
|
|
245
|
+
- lib/nous/primitives/extracted_content.rb
|
|
246
|
+
- lib/nous/primitives/fetch_record.rb
|
|
247
|
+
- lib/nous/primitives/fetch_result.rb
|
|
248
|
+
- lib/nous/primitives/page.rb
|
|
249
|
+
- lib/nous/primitives/url.rb
|
|
215
250
|
- lib/nous/serializer.rb
|
|
251
|
+
- lib/nous/url_resolver.rb
|
|
216
252
|
- lib/nous/version.rb
|
|
217
253
|
- sig/nous.rbs
|
|
218
254
|
homepage: https://github.com/danfrenette/nous
|
data/lib/nous/configuration.rb
DELETED
|
@@ -1,39 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require "uri"
|
|
4
|
-
|
|
5
|
-
module Nous
|
|
6
|
-
class Configuration
|
|
7
|
-
class Error < Nous::Error; end
|
|
8
|
-
|
|
9
|
-
attr_reader :seed, :concurrency, :match, :limit, :timeout, :keep_query
|
|
10
|
-
|
|
11
|
-
DEFAULT_CONCURRENCY = 3
|
|
12
|
-
DEFAULT_LIMIT = 100
|
|
13
|
-
DEFAULT_TIMEOUT = 15
|
|
14
|
-
|
|
15
|
-
def initialize(seed_url:, concurrency: DEFAULT_CONCURRENCY, match: [], limit: DEFAULT_LIMIT,
|
|
16
|
-
timeout: DEFAULT_TIMEOUT, verbose: false, keep_query: false)
|
|
17
|
-
@seed = parse_seed!(seed_url)
|
|
18
|
-
@concurrency = Integer(concurrency).clamp(1, 20)
|
|
19
|
-
@match = Array(match)
|
|
20
|
-
@limit = Integer(limit).clamp(1, 10_000)
|
|
21
|
-
@timeout = Integer(timeout)
|
|
22
|
-
@verbose = verbose
|
|
23
|
-
@keep_query = keep_query
|
|
24
|
-
end
|
|
25
|
-
|
|
26
|
-
def verbose? = @verbose
|
|
27
|
-
|
|
28
|
-
private
|
|
29
|
-
|
|
30
|
-
def parse_seed!(url)
|
|
31
|
-
uri = URI.parse(url)
|
|
32
|
-
raise Error, "seed URL must be http or https" unless uri.is_a?(URI::HTTP)
|
|
33
|
-
|
|
34
|
-
uri
|
|
35
|
-
rescue URI::InvalidURIError => e
|
|
36
|
-
raise Error, "invalid seed URL: #{e.message}"
|
|
37
|
-
end
|
|
38
|
-
end
|
|
39
|
-
end
|
data/lib/nous/crawler/page_fetcher.rb
DELETED
|
@@ -1,47 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module Nous
|
|
4
|
-
class Crawler < Command
|
|
5
|
-
class PageFetcher
|
|
6
|
-
HTML_CONTENT_TYPES = %w[text/html application/xhtml+xml].freeze
|
|
7
|
-
|
|
8
|
-
def initialize(client:)
|
|
9
|
-
@client = client
|
|
10
|
-
end
|
|
11
|
-
|
|
12
|
-
def fetch(url)
|
|
13
|
-
Async::Task.current.with_timeout(config.timeout) do
|
|
14
|
-
response = client.get(url, {})
|
|
15
|
-
return skip(url, "status #{response.status}") unless response.status == 200
|
|
16
|
-
return skip(url, "non-html content") unless html?(response)
|
|
17
|
-
|
|
18
|
-
{url:, pathname: URI.parse(url).path, html: response.read}
|
|
19
|
-
ensure
|
|
20
|
-
response&.close
|
|
21
|
-
end
|
|
22
|
-
rescue Async::TimeoutError
|
|
23
|
-
skip(url, "timeout after #{config.timeout}s")
|
|
24
|
-
rescue IOError, SocketError, Errno::ECONNREFUSED => e
|
|
25
|
-
skip(url, e.message)
|
|
26
|
-
end
|
|
27
|
-
|
|
28
|
-
private
|
|
29
|
-
|
|
30
|
-
attr_reader :client
|
|
31
|
-
|
|
32
|
-
def config
|
|
33
|
-
Nous.configuration
|
|
34
|
-
end
|
|
35
|
-
|
|
36
|
-
def html?(response)
|
|
37
|
-
content_type = response.headers["content-type"].to_s
|
|
38
|
-
HTML_CONTENT_TYPES.any? { |type| content_type.include?(type) }
|
|
39
|
-
end
|
|
40
|
-
|
|
41
|
-
def skip(url, reason)
|
|
42
|
-
warn("[nous] skip #{url}: #{reason}") if config.verbose?
|
|
43
|
-
nil
|
|
44
|
-
end
|
|
45
|
-
end
|
|
46
|
-
end
|
|
47
|
-
end
|
data/lib/nous/error.rb
DELETED
data/lib/nous/extraction_runner.rb
DELETED
|
@@ -1,29 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module Nous
|
|
4
|
-
class ExtractionRunner < Command
|
|
5
|
-
class Error < Command::Error; end
|
|
6
|
-
|
|
7
|
-
def initialize(raw_pages:, extractor:)
|
|
8
|
-
@raw_pages = raw_pages
|
|
9
|
-
@extractor = extractor
|
|
10
|
-
end
|
|
11
|
-
|
|
12
|
-
def call
|
|
13
|
-
pages = raw_pages.each_slice(Nous.configuration.concurrency).each_with_object([]) do |batch, results|
|
|
14
|
-
threads = batch.map { |raw| Thread.new { ExtractionThread.new(extractor:, raw_page: raw).call } }
|
|
15
|
-
|
|
16
|
-
threads.each do |thread|
|
|
17
|
-
result = thread.value
|
|
18
|
-
results << result if result
|
|
19
|
-
end
|
|
20
|
-
end
|
|
21
|
-
|
|
22
|
-
success(payload: pages)
|
|
23
|
-
end
|
|
24
|
-
|
|
25
|
-
private
|
|
26
|
-
|
|
27
|
-
attr_reader :raw_pages, :extractor
|
|
28
|
-
end
|
|
29
|
-
end
|
data/lib/nous/extraction_thread.rb
DELETED
|
@@ -1,28 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module Nous
|
|
4
|
-
class ExtractionThread
|
|
5
|
-
def initialize(extractor:, raw_page:)
|
|
6
|
-
@extractor = extractor
|
|
7
|
-
@raw_page = raw_page
|
|
8
|
-
end
|
|
9
|
-
|
|
10
|
-
def call
|
|
11
|
-
extracted = extractor.extract(raw_page)
|
|
12
|
-
|
|
13
|
-
Page.new(
|
|
14
|
-
title: extracted[:title],
|
|
15
|
-
url: raw_page[:url],
|
|
16
|
-
pathname: raw_page[:pathname],
|
|
17
|
-
content: extracted[:content]
|
|
18
|
-
)
|
|
19
|
-
rescue Nous::Error => e
|
|
20
|
-
warn("[nous] extract skip #{raw_page[:url]}: #{e.message}") if Nous.configuration.verbose?
|
|
21
|
-
nil
|
|
22
|
-
end
|
|
23
|
-
|
|
24
|
-
private
|
|
25
|
-
|
|
26
|
-
attr_reader :extractor, :raw_page
|
|
27
|
-
end
|
|
28
|
-
end
|
data/lib/nous/extractor.rb
DELETED
|
@@ -1,46 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require "readability"
|
|
4
|
-
|
|
5
|
-
module Nous
|
|
6
|
-
class Extractor < Command
|
|
7
|
-
class Error < Command::Error; end
|
|
8
|
-
|
|
9
|
-
NOISY_TAGS = %w[script style link nav header footer img video svg].freeze
|
|
10
|
-
|
|
11
|
-
def initialize(html:, selector: nil)
|
|
12
|
-
@html = html
|
|
13
|
-
@selector = selector
|
|
14
|
-
end
|
|
15
|
-
|
|
16
|
-
def call
|
|
17
|
-
doc = Nokogiri::HTML(html)
|
|
18
|
-
doc = scope_to_selector(doc) if selector
|
|
19
|
-
strip_noisy_tags(doc)
|
|
20
|
-
|
|
21
|
-
readable = Readability::Document.new(doc.to_html)
|
|
22
|
-
text = Nokogiri::HTML(readable.content).text.strip
|
|
23
|
-
|
|
24
|
-
return failure(Error.new("readability returned no content")) if text.empty?
|
|
25
|
-
|
|
26
|
-
success(payload: {title: readable.title, content: readable.content})
|
|
27
|
-
end
|
|
28
|
-
|
|
29
|
-
private
|
|
30
|
-
|
|
31
|
-
attr_reader :html, :selector
|
|
32
|
-
|
|
33
|
-
def scope_to_selector(doc)
|
|
34
|
-
scoped = doc.at_css(selector)
|
|
35
|
-
return doc unless scoped
|
|
36
|
-
|
|
37
|
-
fragment = Nokogiri::HTML::Document.new
|
|
38
|
-
fragment.root = scoped
|
|
39
|
-
fragment
|
|
40
|
-
end
|
|
41
|
-
|
|
42
|
-
def strip_noisy_tags(doc)
|
|
43
|
-
NOISY_TAGS.each { |tag| doc.css(tag).each(&:remove) }
|
|
44
|
-
end
|
|
45
|
-
end
|
|
46
|
-
end
|