nous 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +68 -0
  3. data/README.md +82 -10
  4. data/lib/nous/cli.rb +13 -10
  5. data/lib/nous/command.rb +2 -2
  6. data/lib/nous/configuration_builder.rb +56 -0
  7. data/lib/nous/converter.rb +1 -1
  8. data/lib/nous/crawler/async_page_fetcher.rb +83 -0
  9. data/lib/nous/crawler/link_extractor.rb +11 -11
  10. data/lib/nous/crawler/recursive_page_fetcher.rb +103 -0
  11. data/lib/nous/crawler/redirect_follower.rb +60 -0
  12. data/lib/nous/crawler/single_page_fetcher.rb +112 -0
  13. data/lib/nous/crawler/url_filter.rb +6 -6
  14. data/lib/nous/crawler.rb +15 -70
  15. data/lib/nous/extractor/default/client.rb +68 -0
  16. data/lib/nous/extractor/default.rb +10 -6
  17. data/lib/nous/extractor/jina/client.rb +4 -4
  18. data/lib/nous/extractor/jina.rb +10 -9
  19. data/lib/nous/fetcher/extraction_runner.rb +31 -0
  20. data/lib/nous/fetcher/page_extractor.rb +40 -0
  21. data/lib/nous/fetcher.rb +38 -11
  22. data/lib/nous/primitives/configuration.rb +17 -0
  23. data/lib/nous/primitives/extracted_content.rb +5 -0
  24. data/lib/nous/primitives/fetch_record.rb +26 -0
  25. data/lib/nous/primitives/fetch_result.rb +21 -0
  26. data/lib/nous/primitives/page.rb +5 -0
  27. data/lib/nous/primitives/url.rb +45 -0
  28. data/lib/nous/serializer.rb +14 -3
  29. data/lib/nous/url_resolver.rb +25 -0
  30. data/lib/nous/version.rb +1 -1
  31. data/lib/nous.rb +6 -5
  32. metadata +44 -8
  33. data/lib/nous/configuration.rb +0 -39
  34. data/lib/nous/crawler/page_fetcher.rb +0 -47
  35. data/lib/nous/error.rb +0 -5
  36. data/lib/nous/extraction_runner.rb +0 -29
  37. data/lib/nous/extraction_thread.rb +0 -28
  38. data/lib/nous/extractor.rb +0 -46
  39. data/lib/nous/page.rb +0 -5
data/lib/nous/serializer.rb CHANGED
@@ -4,7 +4,7 @@ require "json"
4
4
 
5
5
  module Nous
6
6
  class Serializer < Command
7
- class Error < Command::Error; end
7
+ class SerializationError < StandardError; end
8
8
 
9
9
  FORMATS = %i[text json].freeze
10
10
 
@@ -24,7 +24,10 @@ module Nous
24
24
  attr_reader :pages, :format
25
25
 
26
26
  def validate_format!
27
- raise Error, "unknown format: #{format}. Must be one of: #{FORMATS.join(", ")}" unless FORMATS.include?(format)
27
+ return if FORMATS.include?(format)
28
+
29
+ raise SerializationError,
30
+ "unknown format: #{format}. Must be one of: #{FORMATS.join(", ")}"
28
31
  end
29
32
 
30
33
  def serialize_text
@@ -40,6 +43,8 @@ module Nous
40
43
  <page>
41
44
  <title>#{page.title}</title>
42
45
  <url>#{page.url}</url>
46
+ <pathname>#{page.pathname}</pathname>
47
+ <extractor>#{page.metadata[:extractor]}</extractor>
43
48
  <content>
44
49
  #{page.content}
45
50
  </content>
@@ -48,7 +53,13 @@ module Nous
48
53
  end
49
54
 
50
55
  def json_page(page)
51
- {title: page.title, url: page.url, content: page.content}
56
+ {
57
+ title: page.title,
58
+ url: page.url,
59
+ pathname: page.pathname,
60
+ content: page.content,
61
+ metadata: page.metadata
62
+ }
52
63
  end
53
64
  end
54
65
  end
data/lib/nous/url_resolver.rb ADDED
@@ -0,0 +1,25 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "addressable/uri"
4
+
5
+ module Nous
6
+ class UrlResolver < Command
7
+ class ResolutionError < StandardError; end
8
+
9
+ def initialize(base_url:, href:)
10
+ @base_uri = Addressable::URI.parse(base_url.to_s)
11
+ @href = href.to_s.strip
12
+ end
13
+
14
+ def call
15
+ joined = base_uri.join(href)
16
+ success(payload: Url.new(joined))
17
+ rescue Addressable::URI::InvalidURIError => e
18
+ failure(ResolutionError.new("cannot resolve #{href.inspect}: #{e.message}"))
19
+ end
20
+
21
+ private
22
+
23
+ attr_reader :base_uri, :href
24
+ end
25
+ end
data/lib/nous/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Nous
4
- VERSION = '0.2.0'
4
+ VERSION = "0.4.0"
5
5
  end
data/lib/nous.rb CHANGED
@@ -3,24 +3,25 @@
3
3
  require "zeitwerk"
4
4
 
5
5
  loader = Zeitwerk::Loader.for_gem
6
+ loader.collapse("#{__dir__}/nous/primitives")
6
7
  loader.setup
7
8
 
8
9
  module Nous
9
10
  class << self
10
11
  attr_reader :configuration
11
12
 
12
- def configure(seed_url:, **options)
13
- @configuration = Configuration.new(seed_url:, **options)
13
+ def configure(...)
14
+ @configuration = ConfigurationBuilder.call(...)
14
15
  end
15
16
 
16
17
  def reset_configuration!
17
18
  @configuration = nil
18
19
  end
19
20
 
20
- def fetch(seed_url, extractor: Extractor::Default.new, **options)
21
- configure(seed_url:, **options)
21
+ def fetch(seed_url, extractor: Extractor::Default.new, http_client: nil, details: false, **options)
22
+ configure(**options)
22
23
 
23
- result = Fetcher.call(seed_url:, extractor:)
24
+ result = Fetcher.call(seed_url:, extractor:, http_client:, details:)
24
25
  raise result.error if result.failure?
25
26
 
26
27
  result.payload
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: nous
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dan Frenette
@@ -9,6 +9,20 @@ bindir: exe
9
9
  cert_chain: []
10
10
  date: 1980-01-02 00:00:00.000000000 Z
11
11
  dependencies:
12
+ - !ruby/object:Gem::Dependency
13
+ name: addressable
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - "~>"
17
+ - !ruby/object:Gem::Version
18
+ version: '2.8'
19
+ type: :runtime
20
+ prerelease: false
21
+ version_requirements: !ruby/object:Gem::Requirement
22
+ requirements:
23
+ - - "~>"
24
+ - !ruby/object:Gem::Version
25
+ version: '2.8'
12
26
  - !ruby/object:Gem::Dependency
13
27
  name: async
14
28
  requirement: !ruby/object:Gem::Requirement
@@ -51,6 +65,20 @@ dependencies:
51
65
  - - "~>"
52
66
  - !ruby/object:Gem::Version
53
67
  version: '2.12'
68
+ - !ruby/object:Gem::Dependency
69
+ name: faraday-follow_redirects
70
+ requirement: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - "~>"
73
+ - !ruby/object:Gem::Version
74
+ version: '0.5'
75
+ type: :runtime
76
+ prerelease: false
77
+ version_requirements: !ruby/object:Gem::Requirement
78
+ requirements:
79
+ - - "~>"
80
+ - !ruby/object:Gem::Version
81
+ version: '0.5'
54
82
  - !ruby/object:Gem::Dependency
55
83
  name: faraday-retry
56
84
  requirement: !ruby/object:Gem::Requirement
@@ -197,22 +225,30 @@ files:
197
225
  - lib/nous.rb
198
226
  - lib/nous/cli.rb
199
227
  - lib/nous/command.rb
200
- - lib/nous/configuration.rb
228
+ - lib/nous/configuration_builder.rb
201
229
  - lib/nous/converter.rb
202
230
  - lib/nous/crawler.rb
231
+ - lib/nous/crawler/async_page_fetcher.rb
203
232
  - lib/nous/crawler/link_extractor.rb
204
- - lib/nous/crawler/page_fetcher.rb
233
+ - lib/nous/crawler/recursive_page_fetcher.rb
234
+ - lib/nous/crawler/redirect_follower.rb
235
+ - lib/nous/crawler/single_page_fetcher.rb
205
236
  - lib/nous/crawler/url_filter.rb
206
- - lib/nous/error.rb
207
- - lib/nous/extraction_runner.rb
208
- - lib/nous/extraction_thread.rb
209
- - lib/nous/extractor.rb
210
237
  - lib/nous/extractor/default.rb
238
+ - lib/nous/extractor/default/client.rb
211
239
  - lib/nous/extractor/jina.rb
212
240
  - lib/nous/extractor/jina/client.rb
213
241
  - lib/nous/fetcher.rb
214
- - lib/nous/page.rb
242
+ - lib/nous/fetcher/extraction_runner.rb
243
+ - lib/nous/fetcher/page_extractor.rb
244
+ - lib/nous/primitives/configuration.rb
245
+ - lib/nous/primitives/extracted_content.rb
246
+ - lib/nous/primitives/fetch_record.rb
247
+ - lib/nous/primitives/fetch_result.rb
248
+ - lib/nous/primitives/page.rb
249
+ - lib/nous/primitives/url.rb
215
250
  - lib/nous/serializer.rb
251
+ - lib/nous/url_resolver.rb
216
252
  - lib/nous/version.rb
217
253
  - sig/nous.rbs
218
254
  homepage: https://github.com/danfrenette/nous
data/lib/nous/configuration.rb DELETED
@@ -1,39 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require "uri"
4
-
5
- module Nous
6
- class Configuration
7
- class Error < Nous::Error; end
8
-
9
- attr_reader :seed, :concurrency, :match, :limit, :timeout, :keep_query
10
-
11
- DEFAULT_CONCURRENCY = 3
12
- DEFAULT_LIMIT = 100
13
- DEFAULT_TIMEOUT = 15
14
-
15
- def initialize(seed_url:, concurrency: DEFAULT_CONCURRENCY, match: [], limit: DEFAULT_LIMIT,
16
- timeout: DEFAULT_TIMEOUT, verbose: false, keep_query: false)
17
- @seed = parse_seed!(seed_url)
18
- @concurrency = Integer(concurrency).clamp(1, 20)
19
- @match = Array(match)
20
- @limit = Integer(limit).clamp(1, 10_000)
21
- @timeout = Integer(timeout)
22
- @verbose = verbose
23
- @keep_query = keep_query
24
- end
25
-
26
- def verbose? = @verbose
27
-
28
- private
29
-
30
- def parse_seed!(url)
31
- uri = URI.parse(url)
32
- raise Error, "seed URL must be http or https" unless uri.is_a?(URI::HTTP)
33
-
34
- uri
35
- rescue URI::InvalidURIError => e
36
- raise Error, "invalid seed URL: #{e.message}"
37
- end
38
- end
39
- end
data/lib/nous/crawler/page_fetcher.rb DELETED
@@ -1,47 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Nous
4
- class Crawler < Command
5
- class PageFetcher
6
- HTML_CONTENT_TYPES = %w[text/html application/xhtml+xml].freeze
7
-
8
- def initialize(client:)
9
- @client = client
10
- end
11
-
12
- def fetch(url)
13
- Async::Task.current.with_timeout(config.timeout) do
14
- response = client.get(url, {})
15
- return skip(url, "status #{response.status}") unless response.status == 200
16
- return skip(url, "non-html content") unless html?(response)
17
-
18
- {url:, pathname: URI.parse(url).path, html: response.read}
19
- ensure
20
- response&.close
21
- end
22
- rescue Async::TimeoutError
23
- skip(url, "timeout after #{config.timeout}s")
24
- rescue IOError, SocketError, Errno::ECONNREFUSED => e
25
- skip(url, e.message)
26
- end
27
-
28
- private
29
-
30
- attr_reader :client
31
-
32
- def config
33
- Nous.configuration
34
- end
35
-
36
- def html?(response)
37
- content_type = response.headers["content-type"].to_s
38
- HTML_CONTENT_TYPES.any? { |type| content_type.include?(type) }
39
- end
40
-
41
- def skip(url, reason)
42
- warn("[nous] skip #{url}: #{reason}") if config.verbose?
43
- nil
44
- end
45
- end
46
- end
47
- end
data/lib/nous/error.rb DELETED
@@ -1,5 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Nous
4
- class Error < StandardError; end
5
- end
data/lib/nous/extraction_runner.rb DELETED
@@ -1,29 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Nous
4
- class ExtractionRunner < Command
5
- class Error < Command::Error; end
6
-
7
- def initialize(raw_pages:, extractor:)
8
- @raw_pages = raw_pages
9
- @extractor = extractor
10
- end
11
-
12
- def call
13
- pages = raw_pages.each_slice(Nous.configuration.concurrency).each_with_object([]) do |batch, results|
14
- threads = batch.map { |raw| Thread.new { ExtractionThread.new(extractor:, raw_page: raw).call } }
15
-
16
- threads.each do |thread|
17
- result = thread.value
18
- results << result if result
19
- end
20
- end
21
-
22
- success(payload: pages)
23
- end
24
-
25
- private
26
-
27
- attr_reader :raw_pages, :extractor
28
- end
29
- end
data/lib/nous/extraction_thread.rb DELETED
@@ -1,28 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Nous
4
- class ExtractionThread
5
- def initialize(extractor:, raw_page:)
6
- @extractor = extractor
7
- @raw_page = raw_page
8
- end
9
-
10
- def call
11
- extracted = extractor.extract(raw_page)
12
-
13
- Page.new(
14
- title: extracted[:title],
15
- url: raw_page[:url],
16
- pathname: raw_page[:pathname],
17
- content: extracted[:content]
18
- )
19
- rescue Nous::Error => e
20
- warn("[nous] extract skip #{raw_page[:url]}: #{e.message}") if Nous.configuration.verbose?
21
- nil
22
- end
23
-
24
- private
25
-
26
- attr_reader :extractor, :raw_page
27
- end
28
- end
data/lib/nous/extractor.rb DELETED
@@ -1,46 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require "readability"
4
-
5
- module Nous
6
- class Extractor < Command
7
- class Error < Command::Error; end
8
-
9
- NOISY_TAGS = %w[script style link nav header footer img video svg].freeze
10
-
11
- def initialize(html:, selector: nil)
12
- @html = html
13
- @selector = selector
14
- end
15
-
16
- def call
17
- doc = Nokogiri::HTML(html)
18
- doc = scope_to_selector(doc) if selector
19
- strip_noisy_tags(doc)
20
-
21
- readable = Readability::Document.new(doc.to_html)
22
- text = Nokogiri::HTML(readable.content).text.strip
23
-
24
- return failure(Error.new("readability returned no content")) if text.empty?
25
-
26
- success(payload: {title: readable.title, content: readable.content})
27
- end
28
-
29
- private
30
-
31
- attr_reader :html, :selector
32
-
33
- def scope_to_selector(doc)
34
- scoped = doc.at_css(selector)
35
- return doc unless scoped
36
-
37
- fragment = Nokogiri::HTML::Document.new
38
- fragment.root = scoped
39
- fragment
40
- end
41
-
42
- def strip_noisy_tags(doc)
43
- NOISY_TAGS.each { |tag| doc.css(tag).each(&:remove) }
44
- end
45
- end
46
- end
data/lib/nous/page.rb DELETED
@@ -1,5 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Nous
4
- Page = Data.define(:title, :url, :pathname, :content)
5
- end