nous 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA256:
+   metadata.gz: e781bd7a823a8022f4a6e5a4346c183aadde727a59306103be9ffa937b7029dc
+   data.tar.gz: 80bcb682c861204b589c3a548088d5e5806d23281d509e2556e27f9fa8ef3960
+ SHA512:
+   metadata.gz: 4cd9a3a161b7203689063820d9e5bc5fda4c04be288262ac5bdddeaae46bc283bd2a85a4029579b28329c5f1bcc6d324d3042ce4bde9a53763ab4d19830bfa28
+   data.tar.gz: 3b0a4e96b51060064f3494b7227aa43ffa0e487a9a01f36ce610be7c852c5a86570665bbd92be07a622fa3b25efd2eef1c772f1db56161d63c062d770a973e26
data/.rspec ADDED
@@ -0,0 +1,3 @@
+ --format documentation
+ --color
+ --require spec_helper
data/.standard.yml ADDED
@@ -0,0 +1,3 @@
+ # For available configuration options, see:
+ # https://github.com/standardrb/standard
+ ruby_version: 3.1
data/CHANGELOG.md ADDED
@@ -0,0 +1,5 @@
+ ## [Unreleased]
+
+ ## [0.1.0] - 2026-02-21
+
+ - Initial release
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
+ The MIT License (MIT)
+
+ Copyright (c) 2026 Dan Frenette
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,141 @@
+ # Nous
+
+ Crawl websites and extract readable Markdown, optimized for LLM consumption.
+
+ Nous fetches same-host pages starting from a seed URL, extracts readable content, and outputs clean Markdown as XML-tagged text or JSON. It supports concurrent crawling, glob-based URL filtering, and two extraction backends: a local parser (ruby-readability) and the Jina Reader API for JS-rendered sites.
+
+ ## Installation
+
+ Add to your Gemfile:
+
+ ```ruby
+ gem "nous"
+ ```
+
+ Or install directly:
+
+ ```bash
+ gem install nous
+ ```
+
+ ## CLI Usage
+
+ ```bash
+ # Crawl a site and print extracted content to stdout
+ nous https://example.com
+
+ # Output as JSON
+ nous https://example.com -f json
+
+ # Write to a file
+ nous https://example.com -o site.md
+
+ # Limit pages and increase concurrency
+ nous https://example.com -l 20 -c 5
+
+ # Only crawl pages matching a glob pattern
+ nous https://example.com -m "/blog/*"
+
+ # Scope extraction to a CSS selector
+ nous https://example.com -s "article.post"
+
+ # Use Jina Reader API for JS-rendered sites (Next.js, SPAs)
+ nous https://example.com --jina
+
+ # Verbose logging
+ nous https://example.com -v
+ ```
+
+ ### Options
+
+ | Flag | Description | Default |
+ |------|-------------|---------|
+ | `-o`, `--output PATH` | Write output to file | stdout |
+ | `-f`, `--format FORMAT` | Output format: `text` or `json` | `text` |
+ | `-c`, `--concurrency N` | Concurrent requests | `3` |
+ | `-m`, `--match PATTERN` | Glob filter for URLs (repeatable) | none |
+ | `-s`, `--selector SELECTOR` | CSS selector to scope extraction | none |
+ | `-l`, `--limit N` | Maximum pages to fetch | `100` |
+ | `--timeout N` | Per-request timeout in seconds | `15` |
+ | `--jina` | Use Jina Reader API for extraction | off |
+ | `-v`, `--verbose` | Verbose logging to stderr | off |
+
+ ## Ruby API
+
+ ```ruby
+ require "nous"
+
+ # Fetch pages with the default extractor
+ pages = Nous.fetch("https://example.com", limit: 10, concurrency: 3)
+
+ # Each page is a Nous::Page with title, url, pathname, content
+ pages.each do |page|
+   puts "#{page.title} (#{page.url})"
+   puts page.content
+ end
+
+ # Serialize to XML-tagged text
+ text = Nous.serialize(pages, format: :text)
+
+ # Serialize to JSON
+ json = Nous.serialize(pages, format: :json)
+
+ # Use the Jina extractor for JS-heavy sites
+ pages = Nous.fetch("https://spa-site.com",
+   extractor: Nous::Extractor::Jina.new,
+   limit: 5
+ )
+ ```
+
+ ## Extraction Backends
+
+ ### Default (ruby-readability)
+
+ Parses static HTML using [ruby-readability](https://github.com/cantino/ruby-readability), strips noisy elements (script, style, nav, header, footer, and media tags), and converts the result to Markdown via [reverse_markdown](https://github.com/xijo/reverse_markdown). Fast and requires no external services, but cannot extract content from JS-rendered pages.
+
+ ### Jina Reader API
+
+ Uses the [Jina Reader API](https://jina.ai/reader/), which renders pages with headless Chrome. Handles Next.js App Router, React Server Components, SPAs, and other JS-heavy sites. The free tier allows 20 requests per minute without an API key, or 500 requests per minute when a `JINA_API_KEY` environment variable is set.
+
+ ## Output Formats
+
+ ### Text (default)
+
+ XML-tagged output designed for LLM context windows:
+
+ ```xml
+ <page>
+   <title>Page Title</title>
+   <url>https://example.com/page</url>
+   <content>
+     # Heading
+
+     Extracted markdown content...
+   </content>
+ </page>
+ ```
+
+ ### JSON
+
+ ```json
+ [
+   {
+     "title": "Page Title",
+     "url": "https://example.com/page",
+     "pathname": "/page",
+     "content": "# Heading\n\nExtracted markdown content..."
+   }
+ ]
+ ```
+
+ ## Development
+
+ ```bash
+ bin/setup               # Install dependencies
+ bundle exec rspec       # Run tests
+ bundle exec standardrb  # Lint
+ ```
+
+ ## License
+
+ MIT License. See [LICENSE.txt](LICENSE.txt).
data/Rakefile ADDED
@@ -0,0 +1,10 @@
+ # frozen_string_literal: true
+
+ require "bundler/gem_tasks"
+ require "rspec/core/rake_task"
+
+ RSpec::Core::RakeTask.new(:spec)
+
+ require "standard/rake"
+
+ task default: %i[spec standard]
data/exe/nous ADDED
@@ -0,0 +1,6 @@
+ #!/usr/bin/env ruby
+ # frozen_string_literal: true
+
+ require "nous"
+
+ Nous::Cli.new(ARGV).run
data/lib/nous/cli.rb ADDED
@@ -0,0 +1,92 @@
+ # frozen_string_literal: true
+
+ require "optparse"
+
+ module Nous
+   class Cli
+     class Error < Nous::Error; end
+
+     def initialize(argv)
+       @argv = argv
+       @options = {format: :text, concurrency: 3, limit: 100, timeout: 15}
+     end
+
+     def run
+       parse_options!
+       validate!
+
+       pages = Nous.fetch(seed_url, **fetch_options)
+       output = Nous.serialize(pages, format: options[:format])
+       write_output(output)
+     rescue Nous::Error => e
+       warn("nous: #{e.message}")
+       exit 1
+     end
+
+     private
+
+     attr_reader :argv, :options
+
+     def seed_url
+       argv.first
+     end
+
+     def fetch_options
+       opts = options.slice(:concurrency, :match, :limit, :timeout, :verbose)
+       opts[:extractor] = extractor
+       opts
+     end
+
+     def extractor
+       return Extractor::Jina.new if options[:jina]
+
+       Extractor::Default.new(selector: options[:selector])
+     end
+
+     def validate!
+       raise Error, "no URL provided. Usage: nous <url> [options]" unless seed_url
+     end
+
+     def write_output(output)
+       if options[:output]
+         File.write(options[:output], output)
+       else
+         $stdout.puts(output)
+       end
+     end
+
+     def parse_options!
+       parser.parse!(argv)
+     rescue OptionParser::InvalidOption => e
+       raise Error, e.message
+     end
+
+     def parser
+       OptionParser.new do |opts|
+         opts.banner = "Usage: nous <url> [options]"
+
+         opts.on("-o", "--output PATH", "Write output to file (default: stdout)") { |v| options[:output] = v }
+         opts.on("-f", "--format FORMAT", "Output format: text or json (default: text)") do |v|
+           options[:format] = v.to_sym
+         end
+         opts.on("-c", "--concurrency N", Integer, "Concurrent requests (default: 3)") { |v| options[:concurrency] = v }
+         opts.on("-m", "--match PATTERN", "Only include pages matching glob (repeatable)") do |v|
+           (options[:match] ||= []) << v
+         end
+         opts.on("-s", "--selector SELECTOR", "CSS selector to scope extraction") { |v| options[:selector] = v }
+         opts.on("-l", "--limit N", Integer, "Maximum pages to fetch") { |v| options[:limit] = v }
+         opts.on("--timeout N", Integer, "Per-request timeout in seconds (default: 15)") { |v| options[:timeout] = v }
+         opts.on("--jina", "Use Jina Reader API for extraction (handles JS-rendered sites)") { options[:jina] = true }
+         opts.on("-v", "--verbose", "Verbose logging to stderr") { options[:verbose] = true }
+         opts.on("-h", "--help", "Show help") do
+           $stdout.puts(opts)
+           exit
+         end
+         opts.on("--version", "Show version") do
+           $stdout.puts("nous #{Nous::VERSION}")
+           exit
+         end
+       end
+     end
+   end
+ end
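The CLI is a thin wrapper over the library API, so it can also be driven programmatically. A minimal sketch (the argv array mirrors the shell invocation; the URL is illustrative):

```ruby
require "nous"

# Equivalent to: nous https://example.com -f json -l 5
# Prints serialized JSON to stdout; exits 1 on any Nous::Error.
Nous::Cli.new(["https://example.com", "-f", "json", "-l", "5"]).run
```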
data/lib/nous/command.rb ADDED
@@ -0,0 +1,43 @@
+ # frozen_string_literal: true
+
+ module Nous
+   class Command
+     class Error < Nous::Error; end
+
+     class Result
+       attr_reader :payload, :error, :metadata
+
+       def initialize(success:, payload: nil, error: nil, metadata: {})
+         @success = success
+         @payload = payload
+         @error = error
+         @metadata = metadata
+       end
+
+       def success?
+         @success
+       end
+
+       def failure?
+         !@success
+       end
+     end
+
+     def self.call(...)
+       command = new(...)
+       command.call
+     rescue => e
+       return command.failure(Error.new("unexpected: #{e.message}")) if command
+
+       Result.new(success: false, error: e)
+     end
+
+     def success(payload:, metadata: {})
+       Result.new(success: true, payload:, metadata:)
+     end
+
+     def failure(error, metadata: {})
+       Result.new(success: false, error:, metadata:)
+     end
+   end
+ end
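Every service object in the gem follows this Command pattern: subclasses implement `call` and return a `Result` via `success`/`failure`, while `Command.call` wraps construction and unexpected exceptions. A minimal sketch with a hypothetical subclass:

```ruby
# Hypothetical subclass, purely for illustration.
class Upcaser < Nous::Command
  def initialize(text:)
    @text = text
  end

  def call
    return failure(Error.new("empty input")) if @text.empty?

    success(payload: @text.upcase)
  end
end

result = Upcaser.call(text: "hello")
result.success? # => true
result.payload  # => "HELLO"
```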
data/lib/nous/converter.rb ADDED
@@ -0,0 +1,22 @@
+ # frozen_string_literal: true
+
+ require "reverse_markdown"
+
+ module Nous
+   class Converter < Command
+     class Error < Command::Error; end
+
+     def initialize(html:)
+       @html = html
+     end
+
+     def call
+       markdown = ReverseMarkdown.convert(html, github_flavored: true).strip
+       success(payload: markdown)
+     end
+
+     private
+
+     attr_reader :html
+   end
+ end
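A quick sketch of the converter in isolation (the output shown is what reverse_markdown typically produces for this input):

```ruby
result = Nous::Converter.call(html: "<h1>Hello</h1><p>World</p>")
result.success? # => true
result.payload  # => "# Hello\n\nWorld"
```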
data/lib/nous/crawler/configuration.rb ADDED
@@ -0,0 +1,39 @@
+ # frozen_string_literal: true
+
+ require "uri"
+
+ module Nous
+   class Crawler < Command
+     class Error < Command::Error; end
+
+     class Configuration
+       attr_reader :seed, :concurrency, :match, :limit, :timeout, :verbose, :keep_query
+
+       DEFAULT_CONCURRENCY = 3
+       DEFAULT_LIMIT = 100
+       DEFAULT_TIMEOUT = 15
+
+       def initialize(seed_url:, concurrency: DEFAULT_CONCURRENCY, match: [], limit: DEFAULT_LIMIT,
+         timeout: DEFAULT_TIMEOUT, verbose: false, keep_query: false)
+         @seed = parse_seed!(seed_url)
+         @concurrency = Integer(concurrency).clamp(1, 20)
+         @match = Array(match)
+         @limit = Integer(limit).clamp(1, 10_000)
+         @timeout = Integer(timeout)
+         @verbose = verbose
+         @keep_query = keep_query
+       end
+
+       private
+
+       def parse_seed!(url)
+         uri = URI.parse(url)
+         raise Error, "seed URL must be http or https" unless uri.is_a?(URI::HTTP)
+
+         uri
+       rescue URI::InvalidURIError => e
+         raise Error, "invalid seed URL: #{e.message}"
+       end
+     end
+   end
+ end
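Note the clamping: out-of-range values are coerced rather than rejected. A sketch:

```ruby
config = Nous::Crawler::Configuration.new(seed_url: "https://example.com", concurrency: 50)
config.concurrency # => 20 (clamped to the 1..20 range)
config.seed        # => #<URI::HTTPS https://example.com>
```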
data/lib/nous/crawler/link_extractor.rb ADDED
@@ -0,0 +1,41 @@
+ # frozen_string_literal: true
+
+ module Nous
+   class Crawler < Command
+     class LinkExtractor
+       def initialize(url_filter:, verbose: false)
+         @url_filter = url_filter
+         @verbose = verbose
+       end
+
+       def extract(current_url, html)
+         base_uri = URI.parse(current_url)
+
+         anchors(html).filter_map { |href| resolve(base_uri, href) }.uniq
+       end
+
+       private
+
+       attr_reader :url_filter, :verbose
+
+       def anchors(html)
+         Nokogiri::HTML(html).css("a[href]").map { |node| node["href"] }
+       end
+
+       def resolve(base_uri, href)
+         return unless url_filter.allowed?(href)
+
+         uri = URI.join(base_uri, href)
+         return unless url_filter.same_host?(uri)
+
+         canonical = url_filter.canonicalize(uri)
+         return unless url_filter.matches_path?(URI.parse(canonical).path)
+
+         canonical
+       rescue URI::InvalidURIError => e
+         warn("[nous] malformed href #{href.inspect}: #{e.message}") if verbose
+         nil
+       end
+     end
+   end
+ end
data/lib/nous/crawler/page_fetcher.rb ADDED
@@ -0,0 +1,45 @@
+ # frozen_string_literal: true
+
+ module Nous
+   class Crawler < Command
+     class PageFetcher
+       HTML_CONTENT_TYPES = %w[text/html application/xhtml+xml].freeze
+
+       def initialize(client:, timeout:, verbose: false)
+         @client = client
+         @timeout = timeout
+         @verbose = verbose
+       end
+
+       def fetch(url)
+         Async::Task.current.with_timeout(timeout) do
+           response = client.get(url, {})
+           return skip(url, "status #{response.status}") unless response.status == 200
+           return skip(url, "non-html content") unless html?(response)
+
+           {url:, pathname: URI.parse(url).path, html: response.read}
+         ensure
+           response&.close
+         end
+       rescue Async::TimeoutError
+         skip(url, "timeout after #{timeout}s")
+       rescue IOError, SocketError, Errno::ECONNREFUSED => e
+         skip(url, e.message)
+       end
+
+       private
+
+       attr_reader :client, :timeout, :verbose
+
+       def html?(response)
+         content_type = response.headers["content-type"].to_s
+         HTML_CONTENT_TYPES.any? { |type| content_type.include?(type) }
+       end
+
+       def skip(url, reason)
+         warn("[nous] skip #{url}: #{reason}") if verbose
+         nil
+       end
+     end
+   end
+ end
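`PageFetcher` must run inside an Async reactor because it relies on `Async::Task.current`. A standalone sketch (normally the Crawler drives this; the URL is illustrative):

```ruby
require "async"
require "async/http/internet"

Async do
  client = Async::HTTP::Internet.new
  fetcher = Nous::Crawler::PageFetcher.new(client:, timeout: 15, verbose: true)

  page = fetcher.fetch("https://example.com/")
  puts page[:pathname] if page # nil on non-200, non-HTML, or timeout
ensure
  client&.close
end
```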
data/lib/nous/crawler/url_filter.rb ADDED
@@ -0,0 +1,43 @@
+ # frozen_string_literal: true
+
+ module Nous
+   class Crawler < Command
+     class UrlFilter
+       IGNORED_SCHEMES = %w[mailto: javascript: tel:].freeze
+
+       def initialize(config)
+         @host = config.seed.host
+         @match = config.match
+         @keep_query = config.keep_query
+       end
+
+       def canonicalize(uri)
+         uri = URI.parse(uri.to_s)
+         uri.fragment = nil
+         uri.query = nil unless keep_query
+         uri.path = "/" if uri.path.empty?
+         uri.to_s
+       end
+
+       def allowed?(href)
+         return false if href.strip.empty?
+
+         IGNORED_SCHEMES.none? { |s| href.start_with?(s) }
+       end
+
+       def same_host?(uri)
+         uri.is_a?(URI::HTTP) && uri.host == host
+       end
+
+       def matches_path?(path)
+         return true if match.empty?
+
+         match.any? { |pattern| File.fnmatch(pattern, path, File::FNM_PATHNAME | File::FNM_EXTGLOB) }
+       end
+
+       private
+
+       attr_reader :host, :match, :keep_query
+     end
+   end
+ end
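The `-m` globs use `File.fnmatch` with `FNM_PATHNAME`, so `*` does not cross `/` boundaries; a recursive pattern is needed for nested paths. For example:

```ruby
flags = File::FNM_PATHNAME | File::FNM_EXTGLOB

File.fnmatch("/blog/*", "/blog/post", flags)         # => true
File.fnmatch("/blog/*", "/blog/2024/post", flags)    # => false ("*" stops at "/")
File.fnmatch("/blog/**/*", "/blog/2024/post", flags) # => true
```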
data/lib/nous/crawler.rb ADDED
@@ -0,0 +1,80 @@
+ # frozen_string_literal: true
+
+ require "async"
+ require "async/http/internet"
+ require "nokogiri"
+ require "uri"
+
+ module Nous
+   class Crawler < Command
+     class Error < Command::Error; end
+
+     def initialize(seed_url:, **options)
+       @config = Configuration.new(seed_url:, **options)
+     end
+
+     def call
+       pages = []
+       queue = [url_filter.canonicalize(config.seed)]
+       seen = Set.new(queue)
+
+       Async do
+         client = Async::HTTP::Internet.new
+         begin
+           crawl(queue:, seen:, pages:, client:)
+         ensure
+           client.close
+         end
+       end.wait
+
+       success(payload: pages)
+     end
+
+     private
+
+     attr_reader :config
+
+     def crawl(queue:, seen:, pages:, client:)
+       while queue.any? && pages.length < config.limit
+         batch = queue.shift(config.concurrency)
+         fetch_batch(batch, client).each do |page|
+           next unless page
+
+           pages << page
+           break if pages.length >= config.limit
+
+           link_extractor.extract(page[:url], page[:html]).each do |url|
+             next if seen.include?(url)
+
+             seen << url
+             queue << url
+           end
+         end
+       end
+     end
+
+     def fetch_batch(urls, client)
+       tasks = []
+
+       Async do |task|
+         urls.each do |url|
+           tasks << task.async { page_fetcher(client).fetch(url) }
+         end
+       end.wait
+
+       tasks.map(&:wait)
+     end
+
+     def url_filter
+       @url_filter ||= UrlFilter.new(config)
+     end
+
+     def link_extractor
+       @link_extractor ||= LinkExtractor.new(url_filter:, verbose: config.verbose)
+     end
+
+     def page_fetcher(client)
+       PageFetcher.new(client:, timeout: config.timeout, verbose: config.verbose)
+     end
+   end
+ end
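The crawler can be exercised on its own; it returns raw page hashes (`:url`, `:pathname`, `:html`) rather than `Nous::Page` structs. A sketch, with an illustrative URL:

```ruby
result = Nous::Crawler.call(seed_url: "https://example.com", limit: 5, verbose: true)

if result.success?
  result.payload.each { |raw| puts "#{raw[:pathname]} (#{raw[:html].bytesize} bytes)" }
else
  warn result.error.message
end
```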
data/lib/nous/error.rb ADDED
@@ -0,0 +1,5 @@
+ # frozen_string_literal: true
+
+ module Nous
+   class Error < StandardError; end
+ end
data/lib/nous/extraction_runner.rb ADDED
@@ -0,0 +1,31 @@
+ # frozen_string_literal: true
+
+ module Nous
+   class ExtractionRunner
+     def initialize(raw_pages:, extractor:, concurrency: 3, verbose: false)
+       @raw_pages = raw_pages
+       @extractor = extractor
+       @concurrency = Integer(concurrency).clamp(1, 20)
+       @verbose = verbose
+     end
+
+     def call
+       raw_pages.each_slice(concurrency).each_with_object([]) do |batch, pages|
+         threads = batch.map { |raw| Thread.new { build_thread(raw).call } }
+
+         threads.each do |thread|
+           result = thread.value
+           pages << result if result
+         end
+       end
+     end
+
+     private
+
+     attr_reader :raw_pages, :extractor, :concurrency, :verbose
+
+     def build_thread(raw_page)
+       ExtractionThread.new(extractor:, raw_page:, verbose:)
+     end
+   end
+ end
data/lib/nous/extraction_thread.rb ADDED
@@ -0,0 +1,29 @@
+ # frozen_string_literal: true
+
+ module Nous
+   class ExtractionThread
+     def initialize(extractor:, raw_page:, verbose: false)
+       @extractor = extractor
+       @raw_page = raw_page
+       @verbose = verbose
+     end
+
+     def call
+       extracted = extractor.extract(raw_page)
+
+       Page.new(
+         title: extracted[:title],
+         url: raw_page[:url],
+         pathname: raw_page[:pathname],
+         content: extracted[:content]
+       )
+     rescue Nous::Error => e
+       warn("[nous] extract skip #{raw_page[:url]}: #{e.message}") if verbose
+       nil
+     end
+
+     private
+
+     attr_reader :extractor, :raw_page, :verbose
+   end
+ end
data/lib/nous/extractor/default.rb ADDED
@@ -0,0 +1,36 @@
+ # frozen_string_literal: true
+
+ module Nous
+   class Extractor
+     class Default
+       def initialize(selector: nil)
+         @selector = selector
+       end
+
+       def extract(page)
+         extracted = extract_content(page[:html])
+         markdown = convert_to_markdown(extracted[:content])
+
+         {title: extracted[:title], content: markdown}
+       end
+
+       private
+
+       attr_reader :selector
+
+       def extract_content(html)
+         result = Extractor.call(html:, selector:)
+         raise result.error if result.failure?
+
+         result.payload
+       end
+
+       def convert_to_markdown(html)
+         result = Converter.call(html:)
+         raise result.error if result.failure?
+
+         result.payload
+       end
+     end
+   end
+ end
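Extractors expose a single `extract(page)` method over a raw page hash, which is what makes the two backends interchangeable. A sketch using the default backend directly (the HTML here is a placeholder; readability needs a reasonably content-rich document to score paragraphs, and raises on empty results):

```ruby
extractor = Nous::Extractor::Default.new(selector: "article")
raw_page = {
  url: "https://example.com/post",
  pathname: "/post",
  html: File.read("page.html") # some previously fetched HTML
}

extracted = extractor.extract(raw_page)
extracted[:title]   # title as determined by readability
extracted[:content] # extracted body converted to Markdown
```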
data/lib/nous/extractor/jina/client.rb ADDED
@@ -0,0 +1,59 @@
+ # frozen_string_literal: true
+
+ require "faraday"
+ require "faraday/retry"
+ require "json"
+
+ module Nous
+   class Extractor
+     class Jina
+       class Client
+         class Error < Nous::Error; end
+
+         BASE_URL = "https://r.jina.ai"
+         RETRYABLE_STATUSES = [429, 500, 502, 503, 504].freeze
+         MAX_RETRIES = 3
+
+         def initialize(api_key: nil, timeout: 30, retry_interval: 1)
+           @connection = build_connection(api_key:, timeout:, retry_interval:)
+         end
+
+         def get(url)
+           response = connection.get("/#{url}")
+           parse(response.body)
+         rescue Faraday::Error => e
+           raise Error, e.message
+         end
+
+         private
+
+         attr_reader :connection
+
+         def build_connection(api_key:, timeout:, retry_interval:)
+           Faraday.new(url: BASE_URL) do |f|
+             f.response :raise_error
+
+             f.request :retry,
+               max: MAX_RETRIES,
+               interval: retry_interval,
+               backoff_factor: 2,
+               retry_statuses: RETRYABLE_STATUSES
+
+             f.headers["Accept"] = "application/json"
+             f.headers["X-No-Cache"] = "true"
+             f.headers["Authorization"] = "Bearer #{api_key}" if api_key
+
+             f.options.timeout = timeout
+             f.options.open_timeout = timeout
+           end
+         end
+
+         def parse(body)
+           JSON.parse(body)
+         rescue JSON::ParserError => e
+           raise Error, "invalid JSON: #{e.message}"
+         end
+       end
+     end
+   end
+ end
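The client prepends the target URL to the reader host, so `get` issues `GET https://r.jina.ai/<url>` and parses the JSON body. A sketch:

```ruby
client = Nous::Extractor::Jina::Client.new(api_key: ENV["JINA_API_KEY"], timeout: 30)

body = client.get("https://example.com") # GET https://r.jina.ai/https://example.com
body.dig("data", "content")              # Markdown rendered by the Reader API
```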
data/lib/nous/extractor/jina.rb ADDED
@@ -0,0 +1,25 @@
+ # frozen_string_literal: true
+
+ module Nous
+   class Extractor
+     class Jina
+       class Error < Nous::Error; end
+
+       def initialize(api_key: nil, timeout: 30, **client_options)
+         @client = Client.new(api_key: api_key || ENV["JINA_API_KEY"], timeout:, **client_options)
+       end
+
+       def extract(page)
+         body = client.get(page[:url])
+
+         {title: body.dig("data", "title") || "", content: body.dig("data", "content") || ""}
+       rescue Client::Error => e
+         raise Error, e.message
+       end
+
+       private
+
+       attr_reader :client
+     end
+   end
+ end
data/lib/nous/extractor.rb ADDED
@@ -0,0 +1,46 @@
+ # frozen_string_literal: true
+
+ require "readability"
+
+ module Nous
+   class Extractor < Command
+     class Error < Command::Error; end
+
+     NOISY_TAGS = %w[script style link nav header footer img video svg].freeze
+
+     def initialize(html:, selector: nil)
+       @html = html
+       @selector = selector
+     end
+
+     def call
+       doc = Nokogiri::HTML(html)
+       doc = scope_to_selector(doc) if selector
+       strip_noisy_tags(doc)
+
+       readable = Readability::Document.new(doc.to_html)
+       text = Nokogiri::HTML(readable.content).text.strip
+
+       return failure(Error.new("readability returned no content")) if text.empty?
+
+       success(payload: {title: readable.title, content: readable.content})
+     end
+
+     private
+
+     attr_reader :html, :selector
+
+     def scope_to_selector(doc)
+       scoped = doc.at_css(selector)
+       return doc unless scoped
+
+       fragment = Nokogiri::HTML::Document.new
+       fragment.root = scoped
+       fragment
+     end
+
+     def strip_noisy_tags(doc)
+       NOISY_TAGS.each { |tag| doc.css(tag).each(&:remove) }
+     end
+   end
+ end
data/lib/nous/fetcher.rb ADDED
@@ -0,0 +1,39 @@
+ # frozen_string_literal: true
+
+ module Nous
+   class Fetcher < Command
+     class Error < Command::Error; end
+
+     def initialize(seed_url:, extractor: Extractor::Default.new, **crawler_options)
+       @seed_url = seed_url
+       @extractor = extractor
+       @crawler_options = crawler_options
+     end
+
+     def call
+       raw_pages = crawl
+       pages = extract(raw_pages)
+       success(payload: pages)
+     end
+
+     private
+
+     attr_reader :seed_url, :extractor, :crawler_options
+
+     def crawl
+       result = Crawler.call(seed_url:, **crawler_options)
+       raise Error, result.error.message if result.failure?
+
+       result.payload
+     end
+
+     def extract(raw_pages)
+       ExtractionRunner.new(
+         raw_pages:,
+         extractor:,
+         concurrency: crawler_options.fetch(:concurrency, 3),
+         verbose: crawler_options.fetch(:verbose, false)
+       ).call
+     end
+   end
+ end
data/lib/nous/page.rb ADDED
@@ -0,0 +1,5 @@
+ # frozen_string_literal: true
+
+ module Nous
+   Page = Data.define(:title, :url, :pathname, :content)
+ end
data/lib/nous/serializer.rb ADDED
@@ -0,0 +1,54 @@
+ # frozen_string_literal: true
+
+ require "json"
+
+ module Nous
+   class Serializer < Command
+     class Error < Command::Error; end
+
+     FORMATS = %i[text json].freeze
+
+     def initialize(pages:, format: :text)
+       @pages = pages
+       @format = format.to_sym
+       validate_format!
+     end
+
+     def call
+       output = (format == :json) ? serialize_json : serialize_text
+       success(payload: output)
+     end
+
+     private
+
+     attr_reader :pages, :format
+
+     def validate_format!
+       raise Error, "unknown format: #{format}. Must be one of: #{FORMATS.join(", ")}" unless FORMATS.include?(format)
+     end
+
+     def serialize_text
+       pages.map { |page| text_page(page) }.join("\n\n")
+     end
+
+     def serialize_json
+       JSON.pretty_generate(pages.map { |page| json_page(page) })
+     end
+
+     def text_page(page)
+       <<~XML
+         <page>
+           <title>#{page.title}</title>
+           <url>#{page.url}</url>
+           <content>
+             #{page.content}
+           </content>
+         </page>
+       XML
+     end
+
+     def json_page(page)
+       {title: page.title, url: page.url, pathname: page.pathname, content: page.content}
+     end
+   end
+ end
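A sketch of both serialization paths over a hand-built page, plus how an unknown format surfaces through the Command result:

```ruby
page = Nous::Page.new(title: "Hi", url: "https://example.com/", pathname: "/", content: "# Hi")

Nous::Serializer.call(pages: [page], format: :text).payload # XML-tagged text
Nous::Serializer.call(pages: [page], format: :json).payload # pretty-printed JSON array
Nous::Serializer.call(pages: [page], format: :csv).failure? # => true (unknown format)
```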
data/lib/nous/version.rb ADDED
@@ -0,0 +1,5 @@
+ # frozen_string_literal: true
+
+ module Nous
+   VERSION = "0.1.0"
+ end
data/lib/nous.rb ADDED
@@ -0,0 +1,24 @@
+ # frozen_string_literal: true
+
+ require "zeitwerk"
+
+ loader = Zeitwerk::Loader.for_gem
+ loader.setup
+
+ module Nous
+   module_function
+
+   def fetch(seed_url, **options)
+     result = Fetcher.call(seed_url:, **options)
+     raise result.error if result.failure?
+
+     result.payload
+   end
+
+   def serialize(pages, format: :text)
+     result = Serializer.call(pages:, format:)
+     raise result.error if result.failure?
+
+     result.payload
+   end
+ end
data/sig/nous.rbs ADDED
@@ -0,0 +1,4 @@
+ module Nous
+   VERSION: String
+   # See the writing guide of rbs: https://github.com/ruby/rbs#guides
+ end
metadata ADDED
@@ -0,0 +1,244 @@
+ --- !ruby/object:Gem::Specification
+ name: nous
+ version: !ruby/object:Gem::Version
+   version: 0.1.0
+ platform: ruby
+ authors:
+ - Dan Frenette
+ autorequire:
+ bindir: exe
+ cert_chain: []
+ date: 2026-02-21 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: async
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '2.24'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '2.24'
+ - !ruby/object:Gem::Dependency
+   name: async-http
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.88'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.88'
+ - !ruby/object:Gem::Dependency
+   name: faraday
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '2.12'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '2.12'
+ - !ruby/object:Gem::Dependency
+   name: faraday-retry
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '2.2'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '2.2'
+ - !ruby/object:Gem::Dependency
+   name: nokogiri
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.16'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.16'
+ - !ruby/object:Gem::Dependency
+   name: reverse_markdown
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '3.0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '3.0'
+ - !ruby/object:Gem::Dependency
+   name: ruby-readability
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.7'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.7'
+ - !ruby/object:Gem::Dependency
+   name: zeitwerk
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '2.6'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '2.6'
+ - !ruby/object:Gem::Dependency
+   name: rake
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '13.0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '13.0'
+ - !ruby/object:Gem::Dependency
+   name: rspec
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '3.13'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '3.13'
+ - !ruby/object:Gem::Dependency
+   name: standard
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.42'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.42'
+ - !ruby/object:Gem::Dependency
+   name: webmock
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '3.25'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '3.25'
+ description: Nous crawls same-host web pages, extracts readable content, and serializes
+   clean Markdown as text or JSON.
+ email:
+ - dan.r.frenette@gmail.com
+ executables:
+ - nous
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - ".rspec"
+ - ".standard.yml"
+ - CHANGELOG.md
+ - LICENSE.txt
+ - README.md
+ - Rakefile
+ - exe/nous
+ - lib/nous.rb
+ - lib/nous/cli.rb
+ - lib/nous/command.rb
+ - lib/nous/converter.rb
+ - lib/nous/crawler.rb
+ - lib/nous/crawler/configuration.rb
+ - lib/nous/crawler/link_extractor.rb
+ - lib/nous/crawler/page_fetcher.rb
+ - lib/nous/crawler/url_filter.rb
+ - lib/nous/error.rb
+ - lib/nous/extraction_runner.rb
+ - lib/nous/extraction_thread.rb
+ - lib/nous/extractor.rb
+ - lib/nous/extractor/default.rb
+ - lib/nous/extractor/jina.rb
+ - lib/nous/extractor/jina/client.rb
+ - lib/nous/fetcher.rb
+ - lib/nous/page.rb
+ - lib/nous/serializer.rb
+ - lib/nous/version.rb
+ - sig/nous.rbs
+ homepage: https://github.com/danfrenette/nous
+ licenses:
+ - MIT
+ metadata:
+   homepage_uri: https://github.com/danfrenette/nous
+   source_code_uri: https://github.com/danfrenette/nous
+   changelog_uri: https://github.com/danfrenette/nous/blob/main/CHANGELOG.md
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: 3.2.0
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubygems_version: 3.5.9
+ signing_key:
+ specification_version: 4
+ summary: Crawl websites and extract readable markdown for LLM workflows
+ test_files: []