nous 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: e781bd7a823a8022f4a6e5a4346c183aadde727a59306103be9ffa937b7029dc
- data.tar.gz: 80bcb682c861204b589c3a548088d5e5806d23281d509e2556e27f9fa8ef3960
+ metadata.gz: c44bdc52070c6430739f9b0258ea53e3dafc1cff42d87814fd940c2e9e26ee94
+ data.tar.gz: e4b42ca9917d7e4656f8e8bc2d9b8b328781021c2ed02e9e1912bdb9ce8ac744
  SHA512:
- metadata.gz: 4cd9a3a161b7203689063820d9e5bc5fda4c04be288262ac5bdddeaae46bc283bd2a85a4029579b28329c5f1bcc6d324d3042ce4bde9a53763ab4d19830bfa28
- data.tar.gz: 3b0a4e96b51060064f3494b7227aa43ffa0e487a9a01f36ce610be7c852c5a86570665bbd92be07a622fa3b25efd2eef1c772f1db56161d63c062d770a973e26
+ metadata.gz: f55c5122dd9a53611c7045e648c34870f9e423afae6777d0004f0bc909c0b916fd5a8a0350168d286e2e63339be6ae393f9ee02cbe4703d1a392fceaee317fd0
+ data.tar.gz: fb6bdb6b9c283bc8350a4e697412869e9c1062af0659ad5b56aee6a0cdcad33983f8a15da9a94f3ae37b45b469a5d67a1c5e977d3d2c8b27a59e8f66eeedd59c
data/.ruby-version ADDED
@@ -0,0 +1 @@
+ 4.0.1
data/CHANGELOG.md CHANGED
@@ -1,5 +1,15 @@
  ## [Unreleased]

+ ## [0.2.0] - 2026-02-21
+
+ - Promote Configuration to module-level singleton (`Nous.configure`, `Nous.configuration`)
+ - Eliminate verbose/concurrency/timeout parameter drilling through pipeline classes
+ - Promote ExtractionRunner to Command pattern
+ - Suppress async-pool gardener ThreadError in non-verbose mode
+ - Add CLI specs and full pipeline integration test
+ - Replace boilerplate README with real documentation
+ - Upgrade to Ruby 4.0.1
+
  ## [0.1.0] - 2026-02-21

  - Initial release
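For orientation before the source hunks, here is a minimal sketch of the new module-level singleton, pieced together from the `lib/nous.rb` and `lib/nous/configuration.rb` hunks below; the URL and option values are illustrative only.

```ruby
require "nous"

Nous.configure(seed_url: "https://example.com", concurrency: 5, limit: 50, verbose: true)

Nous.configuration.concurrency # => 5
Nous.configuration.verbose?    # => true
Nous.configuration.seed        # => #<URI::HTTPS https://example.com>

Nous.reset_configuration!      # clears the singleton between runs
```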
data/README.md CHANGED
@@ -1,6 +1,6 @@
  # Nous

- Crawl websites and extract readable Markdown, optimized for LLM consumption.
+ Crawl websites and extract readable Markdown, optimized for LLM consumption. Inspired by [sitefetch](https://github.com/egoist/sitefetch).

  Nous fetches same-host pages starting from a seed URL, extracts readable content, and outputs clean Markdown as XML-tagged text or JSON. It supports concurrent crawling, glob-based URL filtering, and two extraction backends: a local parser (ruby-readability) and the Jina Reader API for JS-rendered sites.

@@ -131,8 +131,8 @@ Extracted markdown content...
  ## Development

  ```bash
- bin/setup # Install dependencies
- bundle exec rspec # Run tests
+ bin/setup # Install dependencies
+ bundle exec rspec # Run tests
  bundle exec standardrb # Lint
  ```

data/lib/nous/configuration.rb ADDED
@@ -0,0 +1,39 @@
+ # frozen_string_literal: true
+
+ require "uri"
+
+ module Nous
+ class Configuration
+ class Error < Nous::Error; end
+
+ attr_reader :seed, :concurrency, :match, :limit, :timeout, :keep_query
+
+ DEFAULT_CONCURRENCY = 3
+ DEFAULT_LIMIT = 100
+ DEFAULT_TIMEOUT = 15
+
+ def initialize(seed_url:, concurrency: DEFAULT_CONCURRENCY, match: [], limit: DEFAULT_LIMIT,
+ timeout: DEFAULT_TIMEOUT, verbose: false, keep_query: false)
+ @seed = parse_seed!(seed_url)
+ @concurrency = Integer(concurrency).clamp(1, 20)
+ @match = Array(match)
+ @limit = Integer(limit).clamp(1, 10_000)
+ @timeout = Integer(timeout)
+ @verbose = verbose
+ @keep_query = keep_query
+ end
+
+ def verbose? = @verbose
+
+ private
+
+ def parse_seed!(url)
+ uri = URI.parse(url)
+ raise Error, "seed URL must be http or https" unless uri.is_a?(URI::HTTP)
+
+ uri
+ rescue URI::InvalidURIError => e
+ raise Error, "invalid seed URL: #{e.message}"
+ end
+ end
+ end
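A hedged illustration of how the clamping and seed validation defined above behave; the input values are invented for the example.

```ruby
require "nous" # assumes the gem is installed

config = Nous::Configuration.new(seed_url: "https://example.com", concurrency: 99, limit: 0)
config.concurrency # => 20  (clamped to 1..20)
config.limit       # => 1   (clamped to 1..10_000)
config.timeout     # => 15  (DEFAULT_TIMEOUT)
config.verbose?    # => false

Nous::Configuration.new(seed_url: "ftp://example.com")
# => raises Nous::Configuration::Error, "seed URL must be http or https"
```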
data/lib/nous/crawler/link_extractor.rb CHANGED
@@ -3,9 +3,8 @@
  module Nous
  class Crawler < Command
  class LinkExtractor
- def initialize(url_filter:, verbose: false)
+ def initialize(url_filter:)
  @url_filter = url_filter
- @verbose = verbose
  end

  def extract(current_url, html)
@@ -16,7 +15,7 @@ module Nous

  private

- attr_reader :url_filter, :verbose
+ attr_reader :url_filter

  def anchors(html)
  Nokogiri::HTML(html).css("a[href]").map { |node| node["href"] }
@@ -33,7 +32,7 @@ module Nous

  canonical
  rescue URI::InvalidURIError => e
- warn("[nous] malformed href #{href.inspect}: #{e.message}") if verbose
+ warn("[nous] malformed href #{href.inspect}: #{e.message}") if Nous.configuration.verbose?
  nil
  end
  end
data/lib/nous/crawler/page_fetcher.rb CHANGED
@@ -5,14 +5,12 @@ module Nous
  class PageFetcher
  HTML_CONTENT_TYPES = %w[text/html application/xhtml+xml].freeze

- def initialize(client:, timeout:, verbose: false)
+ def initialize(client:)
  @client = client
- @timeout = timeout
- @verbose = verbose
  end

  def fetch(url)
- Async::Task.current.with_timeout(timeout) do
+ Async::Task.current.with_timeout(config.timeout) do
  response = client.get(url, {})
  return skip(url, "status #{response.status}") unless response.status == 200
  return skip(url, "non-html content") unless html?(response)
@@ -22,14 +20,18 @@ module Nous
  response&.close
  end
  rescue Async::TimeoutError
- skip(url, "timeout after #{timeout}s")
+ skip(url, "timeout after #{config.timeout}s")
  rescue IOError, SocketError, Errno::ECONNREFUSED => e
  skip(url, e.message)
  end

  private

- attr_reader :client, :timeout, :verbose
+ attr_reader :client
+
+ def config
+ Nous.configuration
+ end

  def html?(response)
  content_type = response.headers["content-type"].to_s
@@ -37,7 +39,7 @@ module Nous
  end

  def skip(url, reason)
- warn("[nous] skip #{url}: #{reason}") if verbose
+ warn("[nous] skip #{url}: #{reason}") if config.verbose?
  nil
  end
  end
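The fetch method above wraps each request in `Async::Task.current.with_timeout` and converts failures into skips. A standalone sketch of that pattern, with `sleep` standing in for the real HTTP call (nothing below comes from the gem itself):

```ruby
require "async"

# Hypothetical helper: mimics PageFetcher's timeout-and-skip behaviour.
def fetch_with_deadline(url, timeout:)
  Async::Task.current.with_timeout(timeout) do
    sleep(timeout * 2) # pretend the server is slow
    "<html>…</html>"
  end
rescue Async::TimeoutError
  warn("[nous] skip #{url}: timeout after #{timeout}s")
  nil
end

Async do
  fetch_with_deadline("https://example.com/slow", timeout: 1) # => nil, with a warning
end
```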
data/lib/nous/crawler.rb CHANGED
@@ -9,11 +9,13 @@ module Nous
  class Crawler < Command
  class Error < Command::Error; end

- def initialize(seed_url:, **options)
- @config = Configuration.new(seed_url:, **options)
+ def initialize(seed_url:)
+ @seed_url = seed_url
  end

  def call
+ suppress_async_warnings unless config.verbose?
+
  pages = []
  queue = [url_filter.canonicalize(config.seed)]
  seen = Set.new(queue)
@@ -32,7 +34,11 @@ module Nous

  private

- attr_reader :config
+ attr_reader :seed_url
+
+ def config
+ Nous.configuration
+ end

  def crawl(queue:, seen:, pages:, client:)
  while queue.any? && pages.length < config.limit
@@ -70,11 +76,16 @@ module Nous
  end

  def link_extractor
- @link_extractor ||= LinkExtractor.new(url_filter:, verbose: config.verbose)
+ @link_extractor ||= LinkExtractor.new(url_filter:)
  end

  def page_fetcher(client)
- PageFetcher.new(client:, timeout: config.timeout, verbose: config.verbose)
+ PageFetcher.new(client:)
+ end
+
+ def suppress_async_warnings
+ require "console"
+ Console.logger.level = :error
  end
  end
  end
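The `call`/`crawl` lines above imply a breadth-first loop over a queue with a `Set` guarding against revisits. A self-contained sketch of that pattern (the `fake_links` helper is invented for the example; the real crawler fetches pages and extracts links instead):

```ruby
require "set"

def fake_links(url)
  ["#{url}/a", "#{url}/b"]
end

def crawl(seed, limit:)
  pages = []
  queue = [seed]
  seen  = Set.new(queue)

  while queue.any? && pages.length < limit
    url = queue.shift
    pages << url
    fake_links(url).each do |link|
      queue << link if seen.add?(link) # Set#add? returns nil for duplicates
    end
  end

  pages
end

crawl("https://example.com", limit: 5).length # => 5
```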
data/lib/nous/extraction_runner.rb CHANGED
@@ -1,31 +1,29 @@
  # frozen_string_literal: true

  module Nous
- class ExtractionRunner
- def initialize(raw_pages:, extractor:, concurrency: 3, verbose: false)
+ class ExtractionRunner < Command
+ class Error < Command::Error; end
+
+ def initialize(raw_pages:, extractor:)
  @raw_pages = raw_pages
  @extractor = extractor
- @concurrency = Integer(concurrency).clamp(1, 20)
- @verbose = verbose
  end

  def call
- raw_pages.each_slice(concurrency).each_with_object([]) do |batch, pages|
- threads = batch.map { |raw| Thread.new { build_thread(raw).call } }
+ pages = raw_pages.each_slice(Nous.configuration.concurrency).each_with_object([]) do |batch, results|
+ threads = batch.map { |raw| Thread.new { ExtractionThread.new(extractor:, raw_page: raw).call } }

  threads.each do |thread|
  result = thread.value
- pages << result if result
+ results << result if result
  end
  end
+
+ success(payload: pages)
  end

  private

- attr_reader :raw_pages, :extractor, :concurrency, :verbose
-
- def build_thread(raw_page)
- ExtractionThread.new(extractor:, raw_page:, verbose:)
- end
+ attr_reader :raw_pages, :extractor
  end
  end
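ExtractionRunner's `call` fans each batch out to plain threads and drops nil results. A pattern-only sketch with a stub block in place of `ExtractionThread` (inputs invented):

```ruby
items       = (1..10).to_a
concurrency = 3

results = items.each_slice(concurrency).each_with_object([]) do |batch, acc|
  threads = batch.map { |item| Thread.new { item * 2 if item.even? } }

  threads.each do |thread|
    value = thread.value   # blocks until the thread finishes, returns its result
    acc << value if value  # nil results (skipped items) are dropped
  end
end

results # => [4, 8, 12, 16, 20]
```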
data/lib/nous/extraction_thread.rb CHANGED
@@ -2,10 +2,9 @@

  module Nous
  class ExtractionThread
- def initialize(extractor:, raw_page:, verbose: false)
+ def initialize(extractor:, raw_page:)
  @extractor = extractor
  @raw_page = raw_page
- @verbose = verbose
  end

  def call
@@ -18,12 +17,12 @@ module Nous
  content: extracted[:content]
  )
  rescue Nous::Error => e
- warn("[nous] extract skip #{raw_page[:url]}: #{e.message}") if verbose
+ warn("[nous] extract skip #{raw_page[:url]}: #{e.message}") if Nous.configuration.verbose?
  nil
  end

  private

- attr_reader :extractor, :raw_page, :verbose
+ attr_reader :extractor, :raw_page
  end
  end
data/lib/nous/fetcher.rb CHANGED
@@ -4,10 +4,9 @@ module Nous
  class Fetcher < Command
  class Error < Command::Error; end

- def initialize(seed_url:, extractor: Extractor::Default.new, **crawler_options)
+ def initialize(seed_url:, extractor: Extractor::Default.new)
  @seed_url = seed_url
  @extractor = extractor
- @crawler_options = crawler_options
  end

  def call
@@ -18,22 +17,20 @@ module Nous

  private

- attr_reader :seed_url, :extractor, :crawler_options
+ attr_reader :seed_url, :extractor

  def crawl
- result = Crawler.call(seed_url:, **crawler_options)
+ result = Crawler.call(seed_url:)
  raise Error, result.error.message if result.failure?

  result.payload
  end

  def extract(raw_pages)
- ExtractionRunner.new(
- raw_pages:,
- extractor:,
- concurrency: crawler_options.fetch(:concurrency, 3),
- verbose: crawler_options.fetch(:verbose, false)
- ).call
+ result = ExtractionRunner.call(raw_pages:, extractor:)
+ raise Error, result.error.message if result.failure?
+
+ result.payload
  end
  end
  end
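`Fetcher`, `Crawler`, and `ExtractionRunner` all rely on a shared `Command` base class that is not part of this diff (`lib/nous/command.rb` is unchanged). The sketch below is only a guess at the minimal interface those hunks assume — `.call`, `#success(payload:)`, `result.failure?`, `result.error`, `result.payload` — not the gem's actual implementation:

```ruby
module Nous
  class Command
    class Error < StandardError; end # the real class likely inherits Nous::Error

    # Lightweight result object: either a payload or an error.
    Result = Struct.new(:payload, :error, keyword_init: true) do
      def failure? = !error.nil?
    end

    def self.call(...)
      new(...).call
    end

    private

    def success(payload:) = Result.new(payload:)
    def failure(error:)   = Result.new(error:)
  end
end
```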
data/lib/nous/version.rb CHANGED
@@ -1,5 +1,5 @@
  # frozen_string_literal: true

  module Nous
- VERSION = "0.1.0"
+ VERSION = '0.2.0'
  end
data/lib/nous.rb CHANGED
@@ -6,19 +6,31 @@ loader = Zeitwerk::Loader.for_gem
  loader.setup

  module Nous
- module_function
+ class << self
+ attr_reader :configuration

- def fetch(seed_url, **options)
- result = Fetcher.call(seed_url:, **options)
- raise result.error if result.failure?
+ def configure(seed_url:, **options)
+ @configuration = Configuration.new(seed_url:, **options)
+ end

- result.payload
- end
+ def reset_configuration!
+ @configuration = nil
+ end
+
+ def fetch(seed_url, extractor: Extractor::Default.new, **options)
+ configure(seed_url:, **options)
+
+ result = Fetcher.call(seed_url:, extractor:)
+ raise result.error if result.failure?
+
+ result.payload
+ end

- def serialize(pages, format: :text)
- result = Serializer.call(pages:, format:)
- raise result.error if result.failure?
+ def serialize(pages, format: :text)
+ result = Serializer.call(pages:, format:)
+ raise result.error if result.failure?

- result.payload
+ result.payload
+ end
  end
  end
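Putting the public API together, a hedged end-to-end sketch of fetch and serialize after this change; the URL, glob, and option values are illustrative, not taken from the gem's documentation:

```ruby
require "nous"

pages = Nous.fetch(
  "https://example.com/docs",
  limit: 25,
  match: ["/docs/**"],
  concurrency: 5
)

puts Nous.serialize(pages, format: :text)                      # XML-tagged text
File.write("pages.json", Nous.serialize(pages, format: :json)) # JSON
```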
metadata CHANGED
@@ -1,14 +1,13 @@
  --- !ruby/object:Gem::Specification
  name: nous
  version: !ruby/object:Gem::Version
- version: 0.1.0
+ version: 0.2.0
  platform: ruby
  authors:
  - Dan Frenette
- autorequire:
  bindir: exe
  cert_chain: []
- date: 2026-02-21 00:00:00.000000000 Z
+ date: 1980-01-02 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: async
@@ -188,6 +187,7 @@ extensions: []
  extra_rdoc_files: []
  files:
  - ".rspec"
+ - ".ruby-version"
  - ".standard.yml"
  - CHANGELOG.md
  - LICENSE.txt
@@ -197,9 +197,9 @@ files:
  - lib/nous.rb
  - lib/nous/cli.rb
  - lib/nous/command.rb
+ - lib/nous/configuration.rb
  - lib/nous/converter.rb
  - lib/nous/crawler.rb
- - lib/nous/crawler/configuration.rb
  - lib/nous/crawler/link_extractor.rb
  - lib/nous/crawler/page_fetcher.rb
  - lib/nous/crawler/url_filter.rb
@@ -222,7 +222,6 @@ metadata:
  homepage_uri: https://github.com/danfrenette/nous
  source_code_uri: https://github.com/danfrenette/nous
  changelog_uri: https://github.com/danfrenette/nous/blob/main/CHANGELOG.md
- post_install_message:
  rdoc_options: []
  require_paths:
  - lib
@@ -237,8 +236,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  - !ruby/object:Gem::Version
  version: '0'
  requirements: []
- rubygems_version: 3.5.9
- signing_key:
+ rubygems_version: 4.0.3
  specification_version: 4
  summary: Crawl websites and extract readable markdown for LLM workflows
  test_files: []
data/lib/nous/crawler/configuration.rb DELETED
@@ -1,39 +0,0 @@
- # frozen_string_literal: true
-
- require "uri"
-
- module Nous
- class Crawler < Command
- class Error < Command::Error; end
-
- class Configuration
- attr_reader :seed, :concurrency, :match, :limit, :timeout, :verbose, :keep_query
-
- DEFAULT_CONCURRENCY = 3
- DEFAULT_LIMIT = 100
- DEFAULT_TIMEOUT = 15
-
- def initialize(seed_url:, concurrency: DEFAULT_CONCURRENCY, match: [], limit: DEFAULT_LIMIT,
- timeout: DEFAULT_TIMEOUT, verbose: false, keep_query: false)
- @seed = parse_seed!(seed_url)
- @concurrency = Integer(concurrency).clamp(1, 20)
- @match = Array(match)
- @limit = Integer(limit).clamp(1, 10_000)
- @timeout = Integer(timeout)
- @verbose = verbose
- @keep_query = keep_query
- end
-
- private
-
- def parse_seed!(url)
- uri = URI.parse(url)
- raise Error, "seed URL must be http or https" unless uri.is_a?(URI::HTTP)
-
- uri
- rescue URI::InvalidURIError => e
- raise Error, "invalid seed URL: #{e.message}"
- end
- end
- end
- end