nous 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +68 -0
  3. data/README.md +82 -10
  4. data/lib/nous/cli.rb +13 -10
  5. data/lib/nous/command.rb +2 -2
  6. data/lib/nous/configuration_builder.rb +56 -0
  7. data/lib/nous/converter.rb +1 -1
  8. data/lib/nous/crawler/async_page_fetcher.rb +83 -0
  9. data/lib/nous/crawler/link_extractor.rb +11 -11
  10. data/lib/nous/crawler/recursive_page_fetcher.rb +103 -0
  11. data/lib/nous/crawler/redirect_follower.rb +60 -0
  12. data/lib/nous/crawler/single_page_fetcher.rb +112 -0
  13. data/lib/nous/crawler/url_filter.rb +6 -6
  14. data/lib/nous/crawler.rb +15 -70
  15. data/lib/nous/extractor/default/client.rb +68 -0
  16. data/lib/nous/extractor/default.rb +10 -6
  17. data/lib/nous/extractor/jina/client.rb +4 -4
  18. data/lib/nous/extractor/jina.rb +10 -9
  19. data/lib/nous/fetcher/extraction_runner.rb +31 -0
  20. data/lib/nous/fetcher/page_extractor.rb +40 -0
  21. data/lib/nous/fetcher.rb +38 -11
  22. data/lib/nous/primitives/configuration.rb +17 -0
  23. data/lib/nous/primitives/extracted_content.rb +5 -0
  24. data/lib/nous/primitives/fetch_record.rb +26 -0
  25. data/lib/nous/primitives/fetch_result.rb +21 -0
  26. data/lib/nous/primitives/page.rb +5 -0
  27. data/lib/nous/primitives/url.rb +45 -0
  28. data/lib/nous/serializer.rb +14 -3
  29. data/lib/nous/url_resolver.rb +25 -0
  30. data/lib/nous/version.rb +1 -1
  31. data/lib/nous.rb +6 -5
  32. metadata +44 -8
  33. data/lib/nous/configuration.rb +0 -39
  34. data/lib/nous/crawler/page_fetcher.rb +0 -47
  35. data/lib/nous/error.rb +0 -5
  36. data/lib/nous/extraction_runner.rb +0 -29
  37. data/lib/nous/extraction_thread.rb +0 -28
  38. data/lib/nous/extractor.rb +0 -46
  39. data/lib/nous/page.rb +0 -5
@@ -0,0 +1,112 @@
# frozen_string_literal: true

require "faraday"
require "faraday/follow_redirects"

module Nous
  class Crawler < Command
    # Fetches exactly one URL over HTTP (following up to MAX_REDIRECTS
    # redirects) and returns a successful Command result whose payload is
    # a one-element array of FetchRecord. Fetch problems — cross-host
    # redirects, non-HTML responses, transport errors — are reported via a
    # failed record rather than raised to the caller.
    class SinglePageFetcher < Command
      class FetchError < StandardError; end

      HTML_CONTENT_TYPES = %w[text/html application/xhtml+xml].freeze
      MAX_REDIRECTS = 5

      # url:         the URL to fetch; its host becomes the only host a
      #              redirect is allowed to land on
      # http_client: optional injected Faraday connection (e.g. for tests);
      #              defaults to the connection built by #build_connection
      def initialize(url:, http_client: nil)
        @url = url
        @seed_host = Url.new(url).host
        @connection = http_client || build_connection
      end

      def call
        response = connection.get(url)
        final_url = resolve_final_url(response)
        content_type = response.headers["content-type"].to_s
        # NOTE(review): compares the normalized final URL to the raw
        # requested string, so pure normalization (trailing slash, case)
        # is also flagged as a redirect — confirm this is intended.
        redirected = final_url.to_s != url

        record = build_record(
          final_url: final_url.to_s,
          pathname: final_url.path,
          html: response.body,
          content_type: content_type,
          redirected: redirected
        )

        validate!(record)

        success(payload: [record])
      rescue FetchError, Faraday::Error => e
        # Validation and transport failures collapse into the same
        # failed-record shape so callers always get a uniform payload.
        success(payload: [build_failed_record(error: e.message)])
      end

      private

      attr_reader :url, :seed_host, :connection

      def config
        Nous.configuration
      end

      # Builds a successful FetchRecord for the fetched response.
      def build_record(final_url:, pathname:, html:, content_type:, redirected:)
        FetchRecord.new(
          requested_url: url,
          final_url: final_url,
          pathname: pathname,
          html: html,
          content_type: content_type,
          ok: true,
          error: nil,
          redirected: redirected
        )
      end

      # Builds a FetchRecord describing a failed fetch; only the requested
      # URL and its pathname are known at this point.
      def build_failed_record(error:)
        FetchRecord.new(
          requested_url: url,
          final_url: nil,
          pathname: Url.new(url).path,
          html: nil,
          content_type: nil,
          ok: false,
          error: error,
          redirected: false
        )
      end

      # Raises FetchError when the final location or the content type is
      # unacceptable.
      def validate!(record)
        validate_host!(record.final_url)
        validate_html!(record.content_type)
      end

      # The follow_redirects middleware leaves the final location on the
      # response env, so the env URL is the post-redirect URL.
      def resolve_final_url(response)
        location = response.env.url.to_s
        Url.new(location)
      end

      def validate_host!(final_url)
        return if Url.new(final_url).host == seed_host

        raise FetchError, "redirected to #{final_url} outside #{seed_host}"
      end

      def validate_html!(content_type)
        return if HTML_CONTENT_TYPES.any? { |type| content_type.include?(type) }

        raise FetchError, "non-html content: #{content_type}"
      end

      def build_connection
        Faraday.new do |f|
          f.response :follow_redirects, limit: MAX_REDIRECTS
          f.response :raise_error

          f.options.timeout = config.timeout
          f.options.open_timeout = config.timeout
        end
      end
    end
  end
end
@@ -5,10 +5,10 @@ module Nous
5
5
  class UrlFilter
6
6
  IGNORED_SCHEMES = %w[mailto: javascript: tel:].freeze
7
7
 
8
- def initialize(config)
9
- @host = config.seed.host
10
- @match = config.match
11
- @keep_query = config.keep_query
8
+ def initialize(seed_uri:)
9
+ @host = seed_uri.host
10
+ @match = Nous.configuration.match
11
+ @keep_query = Nous.configuration.keep_query
12
12
  end
13
13
 
14
14
  def canonicalize(uri)
@@ -25,8 +25,8 @@ module Nous
25
25
  IGNORED_SCHEMES.none? { |s| href.start_with?(s) }
26
26
  end
27
27
 
28
- def same_host?(uri)
29
- uri.is_a?(URI::HTTP) && uri.host == host
28
+ def same_host?(url)
29
+ url.http? && url.host == host
30
30
  end
31
31
 
32
32
  def matches_path?(path)
data/lib/nous/crawler.rb CHANGED
@@ -1,91 +1,36 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require "async"
4
- require "async/http/internet"
5
- require "nokogiri"
6
- require "uri"
7
-
8
3
  module Nous
9
4
  class Crawler < Command
10
- class Error < Command::Error; end
5
+ class CrawlError < StandardError; end
11
6
 
12
- def initialize(seed_url:)
7
+ def initialize(seed_url:, http_client: nil)
13
8
  @seed_url = seed_url
9
+ @http_client = http_client
10
+ parse_seed!
14
11
  end
15
12
 
16
13
  def call
17
- suppress_async_warnings unless config.verbose?
18
-
19
- pages = []
20
- queue = [url_filter.canonicalize(config.seed)]
21
- seen = Set.new(queue)
22
-
23
- Async do
24
- client = Async::HTTP::Internet.new
25
- begin
26
- crawl(queue:, seen:, pages:, client:)
27
- ensure
28
- client.close
29
- end
30
- end.wait
31
-
32
- success(payload: pages)
14
+ if config.recursive?
15
+ RecursivePageFetcher.call(seed_url:, http_client:)
16
+ else
17
+ SinglePageFetcher.call(url: seed_url, http_client:)
18
+ end
33
19
  end
34
20
 
35
21
  private
36
22
 
37
- attr_reader :seed_url
23
+ attr_reader :seed_url, :http_client
38
24
 
39
25
  def config
40
26
  Nous.configuration
41
27
  end
42
28
 
43
- def crawl(queue:, seen:, pages:, client:)
44
- while queue.any? && pages.length < config.limit
45
- batch = queue.shift(config.concurrency)
46
- fetch_batch(batch, client).each do |page|
47
- next unless page
48
-
49
- pages << page
50
- break if pages.length >= config.limit
51
-
52
- link_extractor.extract(page[:url], page[:html]).each do |url|
53
- next if seen.include?(url)
54
-
55
- seen << url
56
- queue << url
57
- end
58
- end
59
- end
60
- end
61
-
62
- def fetch_batch(urls, client)
63
- tasks = []
64
-
65
- Async do |task|
66
- urls.each do |url|
67
- tasks << task.async { page_fetcher(client).fetch(url) }
68
- end
69
- end.wait
70
-
71
- tasks.map(&:wait)
72
- end
73
-
74
- def url_filter
75
- @url_filter ||= UrlFilter.new(config)
76
- end
77
-
78
- def link_extractor
79
- @link_extractor ||= LinkExtractor.new(url_filter:)
80
- end
81
-
82
- def page_fetcher(client)
83
- PageFetcher.new(client:)
84
- end
85
-
86
- def suppress_async_warnings
87
- require "console"
88
- Console.logger.level = :error
29
+ def parse_seed!
30
+ parsed = Url.new(seed_url)
31
+ raise CrawlError, "seed URL must be http or https" unless parsed.http?
32
+ rescue ArgumentError => e
33
+ raise CrawlError, "invalid seed URL: #{e.message}"
89
34
  end
90
35
  end
91
36
  end
@@ -0,0 +1,68 @@
# frozen_string_literal: true

require "readability"

module Nous
  module Extractor
    class Default
      # Runs Readability over pre-cleaned HTML and returns a payload of
      # {title:, content:}. Fails when Readability produces no usable text.
      class Client < Command
        class ExtractionError < StandardError; end

        NOISY_TAGS = %w[script style nav footer].freeze

        # html:     raw page HTML to extract from
        # selector: optional CSS selector that scopes extraction to one node
        def initialize(html:, selector: nil)
          @html = html
          @selector = selector
        end

        def call
          document = ::Readability::Document.new(prepared_html)

          plain_text = Nokogiri::HTML(document.content).text.strip
          return failure(ExtractionError.new("readability returned no content")) if plain_text.empty?

          success(payload: {title: resolve_title(document), content: document.content})
        end

        private

        attr_reader :html, :selector

        # Parses the HTML, remembers the original <title>, optionally scopes
        # to the selector, strips noisy tags, and re-serializes.
        def prepared_html
          parsed = Nokogiri::HTML(html)
          original_title(parsed)
          parsed = scope(parsed, selector) if selector
          strip_tags(parsed)
          parsed.to_html
        end

        # Memoizes the document's <title> text before any scoping removes it.
        def original_title(doc)
          @original_title ||= doc.at_css("title")&.text.to_s.strip
        end

        # Narrows the document to the selected node; falls back to the whole
        # document when the selector matches nothing.
        def scope(doc, selector)
          node = doc.at_css(selector)
          return doc if node.nil?

          Nokogiri::HTML.fragment(node.to_html)
        end

        def strip_tags(doc)
          NOISY_TAGS.each { |tag| doc.css(tag).each(&:remove) }
        end

        # Title preference order: Readability's title, the original <title>,
        # then the first <h1> in the extracted content.
        def resolve_title(readable)
          readability_title = readable.title.to_s.strip
          return readability_title unless readability_title.empty?
          return @original_title unless @original_title.empty?

          title_from_content(readable.content)
        end

        def title_from_content(content)
          Nokogiri::HTML(content).at_css("h1")&.text.to_s.strip
        end
      end
    end
  end
end
@@ -1,17 +1,21 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Nous
4
- class Extractor
5
- class Default
4
+ module Extractor
5
+ class ExtractionError < StandardError; end
6
+
7
+ class Default < Command
6
8
  def initialize(selector: nil)
7
9
  @selector = selector
8
10
  end
9
11
 
10
- def extract(page)
11
- extracted = extract_content(page[:html])
12
+ def extract(record)
13
+ extracted = extract_content(record.html)
12
14
  markdown = convert_to_markdown(extracted[:content])
13
15
 
14
- {title: extracted[:title], content: markdown}
16
+ success(payload: ExtractedContent.new(title: extracted[:title], content: markdown))
17
+ rescue Client::ExtractionError, Converter::ConversionError => e
18
+ failure(ExtractionError.new(e.message))
15
19
  end
16
20
 
17
21
  private
@@ -19,7 +23,7 @@ module Nous
19
23
  attr_reader :selector
20
24
 
21
25
  def extract_content(html)
22
- result = Extractor.call(html:, selector:)
26
+ result = Client.call(html:, selector:)
23
27
  raise result.error if result.failure?
24
28
 
25
29
  result.payload
@@ -5,10 +5,10 @@ require "faraday/retry"
5
5
  require "json"
6
6
 
7
7
  module Nous
8
- class Extractor
8
+ module Extractor
9
9
  class Jina
10
10
  class Client
11
- class Error < Nous::Error; end
11
+ class RequestError < StandardError; end
12
12
 
13
13
  BASE_URL = "https://r.jina.ai"
14
14
  RETRYABLE_STATUSES = [429, 500, 502, 503, 504].freeze
@@ -22,7 +22,7 @@ module Nous
22
22
  response = connection.get("/#{url}")
23
23
  parse(response.body)
24
24
  rescue Faraday::Error => e
25
- raise Error, e.message
25
+ raise RequestError, e.message
26
26
  end
27
27
 
28
28
  private
@@ -51,7 +51,7 @@ module Nous
51
51
  def parse(body)
52
52
  JSON.parse(body)
53
53
  rescue JSON::ParserError => e
54
- raise Error, "invalid JSON: #{e.message}"
54
+ raise RequestError, "invalid JSON: #{e.message}"
55
55
  end
56
56
  end
57
57
  end
@@ -1,20 +1,21 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Nous
4
- class Extractor
5
- class Jina
6
- class Error < Nous::Error; end
7
-
4
+ module Extractor
5
+ class Jina < Command
8
6
  def initialize(api_key: nil, timeout: 30, **client_options)
9
7
  @client = Client.new(api_key: api_key || ENV["JINA_API_KEY"], timeout:, **client_options)
10
8
  end
11
9
 
12
- def extract(page)
13
- body = client.get(page[:url])
10
+ def extract(record)
11
+ body = client.get(record.final_url)
14
12
 
15
- {title: body.dig("data", "title") || "", content: body.dig("data", "content") || ""}
16
- rescue Client::Error => e
17
- raise Error, e.message
13
+ success(payload: ExtractedContent.new(
14
+ title: body.dig("data", "title") || "",
15
+ content: body.dig("data", "content") || ""
16
+ ))
17
+ rescue Client::RequestError => e
18
+ failure(ExtractionError.new(e.message))
18
19
  end
19
20
 
20
21
  private
@@ -0,0 +1,31 @@
# frozen_string_literal: true

module Nous
  class Fetcher < Command
    # Extracts pages from fetch records in thread batches sized by the
    # configured concurrency. Failed extractions are silently dropped from
    # the payload (PageExtractor handles any debug logging).
    class ExtractionRunner < Command
      class ExtractionError < StandardError; end

      # records:   enumerable of FetchRecord to extract from
      # extractor: object responding to #extract(record)
      def initialize(records:, extractor:)
        @records = records
        @extractor = extractor
      end

      def call
        extracted = []

        records.each_slice(Nous.configuration.concurrency) do |slice|
          workers = slice.map do |record|
            Thread.new { PageExtractor.call(extractor:, record:) }
          end

          # Thread#value joins the worker and returns its result.
          workers.each do |worker|
            outcome = worker.value
            extracted << outcome.payload if outcome.success?
          end
        end

        success(payload: extracted)
      end

      private

      attr_reader :records, :extractor
    end
  end
end
@@ -0,0 +1,40 @@
# frozen_string_literal: true

module Nous
  class Fetcher < Command
    # Runs the extractor against a single fetch record and wraps the
    # extracted content into a Page value object.
    class PageExtractor < Command
      # extractor: object responding to #extract(record)
      # record:    the FetchRecord holding the fetched HTML and metadata
      def initialize(extractor:, record:)
        @extractor = extractor
        @record = record
      end

      def call
        extraction = extractor.extract(record)
        return handle_failure(extraction) unless extraction.success?

        success(payload: build_page(extraction.payload))
      end

      private

      attr_reader :extractor, :record

      # Logs the skip when debugging and propagates the extraction failure.
      def handle_failure(extraction)
        warn("[nous] extract skip #{record.final_url}: #{extraction.error.message}") if Nous.configuration.debug?
        failure(extraction.error)
      end

      # Assembles the public Page from the extracted content plus
      # provenance metadata from the fetch record.
      def build_page(content)
        Page.new(
          title: content.title,
          url: record.final_url,
          pathname: record.pathname,
          content: content.content,
          metadata: {
            extractor: extractor.class.name,
            requested_url: record.requested_url,
            content_type: record.content_type,
            redirected: record.redirected
          }
        )
      end
    end
  end
end
data/lib/nous/fetcher.rb CHANGED
@@ -2,35 +2,62 @@
2
2
 
3
3
  module Nous
4
4
  class Fetcher < Command
5
- class Error < Command::Error; end
5
+ class FetchError < StandardError; end
6
6
 
7
- def initialize(seed_url:, extractor: Extractor::Default.new)
7
+ def initialize(seed_url:, extractor: Extractor::Default.new, http_client: nil, details: false)
8
8
  @seed_url = seed_url
9
9
  @extractor = extractor
10
+ @http_client = http_client
11
+ @single_page = Nous.configuration.single_page?
12
+ @details = details
10
13
  end
11
14
 
12
15
  def call
13
- raw_pages = crawl
14
- pages = extract(raw_pages)
15
- success(payload: pages)
16
+ records = crawl
17
+ successful_records, failed_records = records.partition(&:ok)
18
+
19
+ if single_page && !details && successful_records.empty?
20
+ raise FetchError, failed_records.first&.error || "fetch failed"
21
+ end
22
+
23
+ pages = extract(successful_records)
24
+
25
+ if details
26
+ success(payload: FetchResult.new(
27
+ pages: pages,
28
+ failures: build_failures(failed_records),
29
+ total_requested: records.length
30
+ ))
31
+ else
32
+ success(payload: pages)
33
+ end
16
34
  end
17
35
 
18
36
  private
19
37
 
20
- attr_reader :seed_url, :extractor
38
+ attr_reader :seed_url, :extractor, :http_client, :single_page, :details
21
39
 
22
40
  def crawl
23
- result = Crawler.call(seed_url:)
24
- raise Error, result.error.message if result.failure?
41
+ result = Crawler.call(seed_url:, http_client:)
42
+ raise FetchError, result.error.message if result.failure?
25
43
 
26
44
  result.payload
27
45
  end
28
46
 
29
- def extract(raw_pages)
30
- result = ExtractionRunner.call(raw_pages:, extractor:)
31
- raise Error, result.error.message if result.failure?
47
+ def extract(records)
48
+ result = ExtractionRunner.call(records:, extractor:)
49
+ raise FetchError, result.error.message if result.failure?
32
50
 
33
51
  result.payload
34
52
  end
53
+
54
+ def build_failures(records)
55
+ records.map do |record|
56
+ {
57
+ requested_url: record.requested_url,
58
+ error: record.error
59
+ }
60
+ end
61
+ end
35
62
  end
36
63
  end
@@ -0,0 +1,17 @@
# frozen_string_literal: true

module Nous
  # Immutable runtime settings for a crawl/fetch session.
  Configuration = Data.define(
    :concurrency,
    :match,
    :limit,
    :timeout,
    :debug,
    :keep_query,
    :recursive
  ) do
    def debug?
      debug
    end

    def recursive?
      recursive
    end

    # A run is single-page exactly when recursion is disabled.
    def single_page?
      !recursive
    end
  end
end
@@ -0,0 +1,5 @@
# frozen_string_literal: true

module Nous
  # Immutable value object pairing a page's extracted title with its
  # extracted content, as produced by an Extractor implementation.
  ExtractedContent = Data.define(:title, :content)
end
@@ -0,0 +1,26 @@
# frozen_string_literal: true

module Nous
  # One fetched URL: what was requested, what came back (possibly after a
  # redirect), and whether the fetch succeeded. Failed records carry
  # `ok: false` plus an `error` message and no response fields.
  FetchRecord = Data.define(
    :requested_url,
    :final_url,
    :pathname,
    :html,
    :content_type,
    :ok,
    :error,
    :redirected
  ) do
    # Only requested_url and pathname are mandatory; the remaining members
    # default to the "no response" shape of a freshly failed/empty record.
    def initialize(
      requested_url:,
      pathname:,
      final_url: nil,
      html: nil,
      content_type: nil,
      ok: true,
      error: nil,
      redirected: false
    )
      super
    end
  end
end
@@ -0,0 +1,21 @@
# frozen_string_literal: true

module Nous
  # Summary of a fetch run: extracted pages, per-URL failure details, and
  # the total number of URLs requested.
  FetchResult = Data.define(:pages, :failures, :total_requested) do
    # Count of successfully extracted pages.
    def succeeded = pages.length

    # Count of URLs that failed to fetch or extract.
    def failed = failures.length

    def all_succeeded? = failures.empty?

    def any_succeeded? = pages.any?
  end
end
@@ -0,0 +1,5 @@
# frozen_string_literal: true

module Nous
  # Immutable value object for a fully processed page: extracted title and
  # content plus the final URL, its pathname, and a metadata hash.
  Page = Data.define(:title, :url, :pathname, :content, :metadata)
end
@@ -0,0 +1,45 @@
# frozen_string_literal: true

require "addressable/uri"

module Nous
  # Thin wrapper around Addressable::URI: normalized parsing, value
  # semantics (==/eql?/hash), and the small query surface the crawler uses.
  class Url
    # Raises ArgumentError when the input is blank or cannot be parsed.
    def initialize(raw)
      @uri = Addressable::URI.parse(raw.to_s.strip).normalize
      raise ArgumentError, "invalid URL: #{raw}" if uri.to_s.empty?
    rescue Addressable::URI::InvalidURIError => e
      raise ArgumentError, "invalid URL: #{e.message}"
    end

    def host = uri.host

    # An empty path normalizes to "/" so every record carries a pathname.
    def path
      uri.path.empty? ? "/" : uri.path
    end

    def http? = %w[http https].include?(uri.scheme)

    def to_s = uri.to_s

    # Urls compare equal when their normalized string forms match.
    def ==(other)
      other.is_a?(Url) && to_s == other.to_s
    end
    alias_method :eql?, :==

    def hash = to_s.hash

    private

    attr_reader :uri
  end
end