nous 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.ruby-version +1 -0
- data/CHANGELOG.md +10 -0
- data/README.md +3 -3
- data/lib/nous/configuration.rb +39 -0
- data/lib/nous/crawler/link_extractor.rb +3 -4
- data/lib/nous/crawler/page_fetcher.rb +9 -7
- data/lib/nous/crawler.rb +16 -5
- data/lib/nous/extraction_runner.rb +10 -12
- data/lib/nous/extraction_thread.rb +3 -4
- data/lib/nous/fetcher.rb +7 -10
- data/lib/nous/version.rb +1 -1
- data/lib/nous.rb +22 -10
- metadata +5 -7
- data/lib/nous/crawler/configuration.rb +0 -39
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: c44bdc52070c6430739f9b0258ea53e3dafc1cff42d87814fd940c2e9e26ee94
+  data.tar.gz: e4b42ca9917d7e4656f8e8bc2d9b8b328781021c2ed02e9e1912bdb9ce8ac744
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: f55c5122dd9a53611c7045e648c34870f9e423afae6777d0004f0bc909c0b916fd5a8a0350168d286e2e63339be6ae393f9ee02cbe4703d1a392fceaee317fd0
+  data.tar.gz: fb6bdb6b9c283bc8350a4e697412869e9c1062af0659ad5b56aee6a0cdcad33983f8a15da9a94f3ae37b45b469a5d67a1c5e977d3d2c8b27a59e8f66eeedd59c
data/.ruby-version
ADDED
@@ -0,0 +1 @@
+4.0.1
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,15 @@
 ## [Unreleased]
 
+## [0.2.0] - 2026-02-21
+
+- Promote Configuration to module-level singleton (`Nous.configure`, `Nous.configuration`)
+- Eliminate verbose/concurrency/timeout parameter drilling through pipeline classes
+- Promote ExtractionRunner to Command pattern
+- Suppress async-pool gardener ThreadError in non-verbose mode
+- Add CLI specs and full pipeline integration test
+- Replace boilerplate README with real documentation
+- Upgrade to Ruby 4.0.1
+
 ## [0.1.0] - 2026-02-21
 
 - Initial release
data/README.md
CHANGED
@@ -1,6 +1,6 @@
 # Nous
 
-Crawl websites and extract readable Markdown, optimized for LLM consumption.
+Crawl websites and extract readable Markdown, optimized for LLM consumption. Inspired by [sitefetch](https://github.com/egoist/sitefetch).
 
 Nous fetches same-host pages starting from a seed URL, extracts readable content, and outputs clean Markdown as XML-tagged text or JSON. It supports concurrent crawling, glob-based URL filtering, and two extraction backends: a local parser (ruby-readability) and the Jina Reader API for JS-rendered sites.
 
@@ -131,8 +131,8 @@ Extracted markdown content...
 ## Development
 
 ```bash
-bin/setup
-bundle exec rspec
+bin/setup # Install dependencies
+bundle exec rspec # Run tests
 bundle exec standardrb # Lint
 ```
 
data/lib/nous/configuration.rb
ADDED
@@ -0,0 +1,39 @@
+# frozen_string_literal: true
+
+require "uri"
+
+module Nous
+  class Configuration
+    class Error < Nous::Error; end
+
+    attr_reader :seed, :concurrency, :match, :limit, :timeout, :keep_query
+
+    DEFAULT_CONCURRENCY = 3
+    DEFAULT_LIMIT = 100
+    DEFAULT_TIMEOUT = 15
+
+    def initialize(seed_url:, concurrency: DEFAULT_CONCURRENCY, match: [], limit: DEFAULT_LIMIT,
+      timeout: DEFAULT_TIMEOUT, verbose: false, keep_query: false)
+      @seed = parse_seed!(seed_url)
+      @concurrency = Integer(concurrency).clamp(1, 20)
+      @match = Array(match)
+      @limit = Integer(limit).clamp(1, 10_000)
+      @timeout = Integer(timeout)
+      @verbose = verbose
+      @keep_query = keep_query
+    end
+
+    def verbose? = @verbose
+
+    private
+
+    def parse_seed!(url)
+      uri = URI.parse(url)
+      raise Error, "seed URL must be http or https" unless uri.is_a?(URI::HTTP)
+
+      uri
+    rescue URI::InvalidURIError => e
+      raise Error, "invalid seed URL: #{e.message}"
+    end
+  end
+end
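The class above is what the new module-level `Nous.configure` (see `data/lib/nous.rb` further down) instantiates. A minimal usage sketch based only on the code in this diff; the seed URL and option values are illustrative:

```ruby
require "nous"

# Out-of-range values are clamped by Configuration#initialize above.
Nous.configure(seed_url: "https://example.com", concurrency: 50, limit: 0)

config = Nous.configuration
config.concurrency # => 20 (clamped to 1..20)
config.limit       # => 1  (clamped to 1..10_000)
config.verbose?    # => false (default)

# Non-HTTP(S) seeds are rejected by parse_seed!.
Nous.configure(seed_url: "ftp://example.com") # raises Nous::Configuration::Error
```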
data/lib/nous/crawler/link_extractor.rb
CHANGED
@@ -3,9 +3,8 @@
 module Nous
   class Crawler < Command
     class LinkExtractor
-      def initialize(url_filter
+      def initialize(url_filter:)
         @url_filter = url_filter
-        @verbose = verbose
       end
 
       def extract(current_url, html)
@@ -16,7 +15,7 @@ module Nous
 
       private
 
-      attr_reader :url_filter
+      attr_reader :url_filter
 
       def anchors(html)
         Nokogiri::HTML(html).css("a[href]").map { |node| node["href"] }
@@ -33,7 +32,7 @@ module Nous
 
         canonical
       rescue URI::InvalidURIError => e
-        warn("[nous] malformed href #{href.inspect}: #{e.message}") if verbose
+        warn("[nous] malformed href #{href.inspect}: #{e.message}") if Nous.configuration.verbose?
         nil
       end
     end
data/lib/nous/crawler/page_fetcher.rb
CHANGED
@@ -5,14 +5,12 @@ module Nous
     class PageFetcher
       HTML_CONTENT_TYPES = %w[text/html application/xhtml+xml].freeze
 
-      def initialize(client
+      def initialize(client:)
         @client = client
-        @timeout = timeout
-        @verbose = verbose
       end
 
       def fetch(url)
-        Async::Task.current.with_timeout(timeout) do
+        Async::Task.current.with_timeout(config.timeout) do
           response = client.get(url, {})
           return skip(url, "status #{response.status}") unless response.status == 200
           return skip(url, "non-html content") unless html?(response)
@@ -22,14 +20,18 @@ module Nous
           response&.close
         end
       rescue Async::TimeoutError
-        skip(url, "timeout after #{timeout}s")
+        skip(url, "timeout after #{config.timeout}s")
       rescue IOError, SocketError, Errno::ECONNREFUSED => e
         skip(url, e.message)
       end
 
       private
 
-      attr_reader :client
+      attr_reader :client
+
+      def config
+        Nous.configuration
+      end
 
       def html?(response)
         content_type = response.headers["content-type"].to_s
@@ -37,7 +39,7 @@ module Nous
       end
 
       def skip(url, reason)
-        warn("[nous] skip #{url}: #{reason}") if verbose
+        warn("[nous] skip #{url}: #{reason}") if config.verbose?
         nil
       end
     end
data/lib/nous/crawler.rb
CHANGED
@@ -9,11 +9,13 @@ module Nous
   class Crawler < Command
     class Error < Command::Error; end
 
-    def initialize(seed_url
-      @
+    def initialize(seed_url:)
+      @seed_url = seed_url
     end
 
     def call
+      suppress_async_warnings unless config.verbose?
+
       pages = []
       queue = [url_filter.canonicalize(config.seed)]
       seen = Set.new(queue)
@@ -32,7 +34,11 @@ module Nous
 
     private
 
-    attr_reader :
+    attr_reader :seed_url
+
+    def config
+      Nous.configuration
+    end
 
     def crawl(queue:, seen:, pages:, client:)
       while queue.any? && pages.length < config.limit
@@ -70,11 +76,16 @@ module Nous
     end
 
     def link_extractor
-      @link_extractor ||= LinkExtractor.new(url_filter
+      @link_extractor ||= LinkExtractor.new(url_filter:)
     end
 
     def page_fetcher(client)
-      PageFetcher.new(client
+      PageFetcher.new(client:)
+    end
+
+    def suppress_async_warnings
+      require "console"
+      Console.logger.level = :error
     end
   end
 end
data/lib/nous/extraction_runner.rb
CHANGED
@@ -1,31 +1,29 @@
 # frozen_string_literal: true
 
 module Nous
-  class ExtractionRunner
-
+  class ExtractionRunner < Command
+    class Error < Command::Error; end
+
+    def initialize(raw_pages:, extractor:)
       @raw_pages = raw_pages
       @extractor = extractor
-      @concurrency = Integer(concurrency).clamp(1, 20)
-      @verbose = verbose
     end
 
     def call
-      raw_pages.each_slice(concurrency).each_with_object([]) do |batch,
-        threads = batch.map { |raw| Thread.new {
+      pages = raw_pages.each_slice(Nous.configuration.concurrency).each_with_object([]) do |batch, results|
+        threads = batch.map { |raw| Thread.new { ExtractionThread.new(extractor:, raw_page: raw).call } }
 
         threads.each do |thread|
           result = thread.value
-
+          results << result if result
         end
       end
+
+      success(payload: pages)
     end
 
     private
 
-    attr_reader :raw_pages, :extractor
-
-    def build_thread(raw_page)
-      ExtractionThread.new(extractor:, raw_page:, verbose:)
-    end
+    attr_reader :raw_pages, :extractor
   end
 end
data/lib/nous/extraction_thread.rb
CHANGED
@@ -2,10 +2,9 @@
 
 module Nous
   class ExtractionThread
-    def initialize(extractor:, raw_page
+    def initialize(extractor:, raw_page:)
       @extractor = extractor
       @raw_page = raw_page
-      @verbose = verbose
     end
 
     def call
@@ -18,12 +17,12 @@ module Nous
         content: extracted[:content]
       )
     rescue Nous::Error => e
-      warn("[nous] extract skip #{raw_page[:url]}: #{e.message}") if verbose
+      warn("[nous] extract skip #{raw_page[:url]}: #{e.message}") if Nous.configuration.verbose?
       nil
     end
 
     private
 
-    attr_reader :extractor, :raw_page
+    attr_reader :extractor, :raw_page
   end
 end
data/lib/nous/fetcher.rb
CHANGED
@@ -4,10 +4,9 @@ module Nous
   class Fetcher < Command
     class Error < Command::Error; end
 
-    def initialize(seed_url:, extractor: Extractor::Default.new
+    def initialize(seed_url:, extractor: Extractor::Default.new)
       @seed_url = seed_url
       @extractor = extractor
-      @crawler_options = crawler_options
     end
 
     def call
@@ -18,22 +17,20 @@ module Nous
 
     private
 
-    attr_reader :seed_url, :extractor
+    attr_reader :seed_url, :extractor
 
     def crawl
-      result = Crawler.call(seed_url
+      result = Crawler.call(seed_url:)
       raise Error, result.error.message if result.failure?
 
       result.payload
     end
 
     def extract(raw_pages)
-      ExtractionRunner.
-
-
-
-        verbose: crawler_options.fetch(:verbose, false)
-      ).call
+      result = ExtractionRunner.call(raw_pages:, extractor:)
+      raise Error, result.error.message if result.failure?
+
+      result.payload
     end
   end
 end
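Fetcher, Crawler, and ExtractionRunner above all share the same Command result contract (`.call`, `failure?`, `error`, `payload`, `success(payload:)`). `lib/nous/command.rb` itself is unchanged and not included in this diff, so the following is only a hypothetical sketch of a base class consistent with those calls:

```ruby
# Hypothetical sketch; the real lib/nous/command.rb is not part of this diff.
module Nous
  class Command
    class Error < Nous::Error; end

    # Wraps either a payload (success) or an error (failure).
    Result = Struct.new(:payload, :error, keyword_init: true) do
      def success? = error.nil?
      def failure? = !success?
    end

    # Class-level entry point, e.g. Crawler.call(seed_url:).
    def self.call(...) = new(...).call

    private

    def success(payload:) = Result.new(payload:)
    def failure(error:) = Result.new(error:)
  end
end
```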
data/lib/nous/version.rb
CHANGED
data/lib/nous.rb
CHANGED
@@ -6,19 +6,31 @@ loader = Zeitwerk::Loader.for_gem
 loader.setup
 
 module Nous
-
+  class << self
+    attr_reader :configuration
 
-
-
-
+    def configure(seed_url:, **options)
+      @configuration = Configuration.new(seed_url:, **options)
+    end
 
-
-
+    def reset_configuration!
+      @configuration = nil
+    end
+
+    def fetch(seed_url, extractor: Extractor::Default.new, **options)
+      configure(seed_url:, **options)
+
+      result = Fetcher.call(seed_url:, extractor:)
+      raise result.error if result.failure?
+
+      result.payload
+    end
 
-
-
-
+    def serialize(pages, format: :text)
+      result = Serializer.call(pages:, format:)
+      raise result.error if result.failure?
 
-
+      result.payload
+    end
   end
 end
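With configuration promoted to module level, the public entry point collapses to `Nous.fetch` plus `Nous.serialize`. A rough usage sketch, assuming the options forwarded by `Nous.fetch` are the ones `Configuration#initialize` accepts; the URL, glob, and values are illustrative:

```ruby
require "nous"

pages = Nous.fetch(
  "https://example.com/docs",
  concurrency: 5,        # clamped to 1..20 by Configuration
  limit: 50,             # clamped to 1..10_000
  match: ["**/docs/**"], # glob-based URL filtering (see README)
  verbose: true          # also skips the async warning suppression
)

puts Nous.serialize(pages, format: :text) # the README also mentions JSON output
```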
metadata
CHANGED
@@ -1,14 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: nous
 version: !ruby/object:Gem::Version
-  version: 0.
+  version: 0.2.0
 platform: ruby
 authors:
 - Dan Frenette
-autorequire:
 bindir: exe
 cert_chain: []
-date:
+date: 1980-01-02 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: async
@@ -188,6 +187,7 @@ extensions: []
 extra_rdoc_files: []
 files:
 - ".rspec"
+- ".ruby-version"
 - ".standard.yml"
 - CHANGELOG.md
 - LICENSE.txt
@@ -197,9 +197,9 @@ files:
 - lib/nous.rb
 - lib/nous/cli.rb
 - lib/nous/command.rb
+- lib/nous/configuration.rb
 - lib/nous/converter.rb
 - lib/nous/crawler.rb
-- lib/nous/crawler/configuration.rb
 - lib/nous/crawler/link_extractor.rb
 - lib/nous/crawler/page_fetcher.rb
 - lib/nous/crawler/url_filter.rb
@@ -222,7 +222,6 @@ metadata:
   homepage_uri: https://github.com/danfrenette/nous
   source_code_uri: https://github.com/danfrenette/nous
   changelog_uri: https://github.com/danfrenette/nous/blob/main/CHANGELOG.md
-post_install_message:
 rdoc_options: []
 require_paths:
 - lib
@@ -237,8 +236,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 - !ruby/object:Gem::Version
   version: '0'
 requirements: []
-rubygems_version:
-signing_key:
+rubygems_version: 4.0.3
 specification_version: 4
 summary: Crawl websites and extract readable markdown for LLM workflows
 test_files: []
data/lib/nous/crawler/configuration.rb
DELETED
@@ -1,39 +0,0 @@
-# frozen_string_literal: true
-
-require "uri"
-
-module Nous
-  class Crawler < Command
-    class Error < Command::Error; end
-
-    class Configuration
-      attr_reader :seed, :concurrency, :match, :limit, :timeout, :verbose, :keep_query
-
-      DEFAULT_CONCURRENCY = 3
-      DEFAULT_LIMIT = 100
-      DEFAULT_TIMEOUT = 15
-
-      def initialize(seed_url:, concurrency: DEFAULT_CONCURRENCY, match: [], limit: DEFAULT_LIMIT,
-        timeout: DEFAULT_TIMEOUT, verbose: false, keep_query: false)
-        @seed = parse_seed!(seed_url)
-        @concurrency = Integer(concurrency).clamp(1, 20)
-        @match = Array(match)
-        @limit = Integer(limit).clamp(1, 10_000)
-        @timeout = Integer(timeout)
-        @verbose = verbose
-        @keep_query = keep_query
-      end
-
-      private
-
-      def parse_seed!(url)
-        uri = URI.parse(url)
-        raise Error, "seed URL must be http or https" unless uri.is_a?(URI::HTTP)
-
-        uri
-      rescue URI::InvalidURIError => e
-        raise Error, "invalid seed URL: #{e.message}"
-      end
-    end
-  end
-end