nous 0.1.0
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.standard.yml +3 -0
- data/CHANGELOG.md +5 -0
- data/LICENSE.txt +21 -0
- data/README.md +141 -0
- data/Rakefile +10 -0
- data/exe/nous +6 -0
- data/lib/nous/cli.rb +92 -0
- data/lib/nous/command.rb +43 -0
- data/lib/nous/converter.rb +22 -0
- data/lib/nous/crawler/configuration.rb +39 -0
- data/lib/nous/crawler/link_extractor.rb +41 -0
- data/lib/nous/crawler/page_fetcher.rb +45 -0
- data/lib/nous/crawler/url_filter.rb +43 -0
- data/lib/nous/crawler.rb +80 -0
- data/lib/nous/error.rb +5 -0
- data/lib/nous/extraction_runner.rb +31 -0
- data/lib/nous/extraction_thread.rb +29 -0
- data/lib/nous/extractor/default.rb +36 -0
- data/lib/nous/extractor/jina/client.rb +59 -0
- data/lib/nous/extractor/jina.rb +25 -0
- data/lib/nous/extractor.rb +46 -0
- data/lib/nous/fetcher.rb +39 -0
- data/lib/nous/page.rb +5 -0
- data/lib/nous/serializer.rb +54 -0
- data/lib/nous/version.rb +5 -0
- data/lib/nous.rb +24 -0
- data/sig/nous.rbs +4 -0
- metadata +244 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA256:
+  metadata.gz: e781bd7a823a8022f4a6e5a4346c183aadde727a59306103be9ffa937b7029dc
+  data.tar.gz: 80bcb682c861204b589c3a548088d5e5806d23281d509e2556e27f9fa8ef3960
+SHA512:
+  metadata.gz: 4cd9a3a161b7203689063820d9e5bc5fda4c04be288262ac5bdddeaae46bc283bd2a85a4029579b28329c5f1bcc6d324d3042ce4bde9a53763ab4d19830bfa28
+  data.tar.gz: 3b0a4e96b51060064f3494b7227aa43ffa0e487a9a01f36ce610be7c852c5a86570665bbd92be07a622fa3b25efd2eef1c772f1db56161d63c062d770a973e26
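The digests above cover the two payloads packed inside the released `.gem` archive. A minimal Ruby sketch for checking them locally, assuming a fetched `nous-0.1.0.gem` in the current directory (the filename is illustrative):

require "digest"
require "rubygems/package"

# A .gem file is a tar archive whose entries include metadata.gz and
# data.tar.gz; their SHA256 digests should match checksums.yaml above.
File.open("nous-0.1.0.gem", "rb") do |io|
  tar = Gem::Package::TarReader.new(io)
  tar.each do |entry|
    next unless %w[metadata.gz data.tar.gz].include?(entry.full_name)

    puts "#{entry.full_name}: #{Digest::SHA256.hexdigest(entry.read)}"
  end
end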
data/.rspec
ADDED
data/.standard.yml
ADDED
data/CHANGELOG.md
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2026 Dan Frenette
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
data/README.md
ADDED
@@ -0,0 +1,141 @@
+# Nous
+
+Crawl websites and extract readable Markdown, optimized for LLM consumption.
+
+Nous fetches same-host pages starting from a seed URL, extracts readable content, and outputs clean Markdown as XML-tagged text or JSON. It supports concurrent crawling, glob-based URL filtering, and two extraction backends: a local parser (ruby-readability) and the Jina Reader API for JS-rendered sites.
+
+## Installation
+
+Add to your Gemfile:
+
+```ruby
+gem "nous"
+```
+
+Or install directly:
+
+```bash
+gem install nous
+```
+
+## CLI Usage
+
+```bash
+# Crawl a site and print extracted content to stdout
+nous https://example.com
+
+# Output as JSON
+nous https://example.com -f json
+
+# Write to a file
+nous https://example.com -o site.md
+
+# Limit pages and increase concurrency
+nous https://example.com -l 20 -c 5
+
+# Only crawl pages matching a glob pattern
+nous https://example.com -m "/blog/*"
+
+# Scope extraction to a CSS selector
+nous https://example.com -s "article.post"
+
+# Use Jina Reader API for JS-rendered sites (Next.js, SPAs)
+nous https://example.com --jina
+
+# Verbose logging
+nous https://example.com -v
+```
+
+### Options
+
+| Flag | Description | Default |
+|------|-------------|---------|
+| `-o`, `--output PATH` | Write output to file | stdout |
+| `-f`, `--format FORMAT` | Output format: `text` or `json` | `text` |
+| `-c`, `--concurrency N` | Concurrent requests | `3` |
+| `-m`, `--match PATTERN` | Glob filter for URLs (repeatable) | none |
+| `-s`, `--selector SELECTOR` | CSS selector to scope extraction | none |
+| `-l`, `--limit N` | Maximum pages to fetch | `100` |
+| `--timeout N` | Per-request timeout in seconds | `15` |
+| `--jina` | Use Jina Reader API for extraction | off |
+| `-v`, `--verbose` | Verbose logging to stderr | off |
+
+## Ruby API
+
+```ruby
+require "nous"
+
+# Fetch pages with the default extractor
+pages = Nous.fetch("https://example.com", limit: 10, concurrency: 3)
+
+# Each page is a Nous::Page with title, url, pathname, content
+pages.each do |page|
+  puts "#{page.title} (#{page.url})"
+  puts page.content
+end
+
+# Serialize to XML-tagged text
+text = Nous.serialize(pages, format: :text)
+
+# Serialize to JSON
+json = Nous.serialize(pages, format: :json)
+
+# Use the Jina extractor for JS-heavy sites
+pages = Nous.fetch("https://spa-site.com",
+  extractor: Nous::Extractor::Jina.new,
+  limit: 5
+)
+```
+
+## Extraction Backends
+
+### Default (ruby-readability)
+
+Parses static HTML using [ruby-readability](https://github.com/cantino/ruby-readability), strips noisy elements (nav, footer, script, header), and converts to Markdown via [reverse_markdown](https://github.com/xijo/reverse_markdown). Fast and requires no external services, but cannot extract content from JS-rendered pages.
+
+### Jina Reader API
+
+Uses the [Jina Reader API](https://jina.ai/reader/) which renders pages with headless Chrome. Handles Next.js App Router, React Server Components, SPAs, and other JS-heavy sites. Free tier allows 20 requests/minute without a key, or 500 RPM with a `JINA_API_KEY` environment variable.
+
+## Output Formats
+
+### Text (default)
+
+XML-tagged output designed for LLM context windows:
+
+```xml
+<page>
+<title>Page Title</title>
+<url>https://example.com/page</url>
+<content>
+# Heading
+
+Extracted markdown content...
+</content>
+</page>
+```
+
+### JSON
+
+```json
+[
+  {
+    "title": "Page Title",
+    "url": "https://example.com/page",
+    "pathname": "/page",
+    "content": "# Heading\n\nExtracted markdown content..."
+  }
+]
+```
+
+## Development
+
+```bash
+bin/setup              # Install dependencies
+bundle exec rspec      # Run tests
+bundle exec standardrb # Lint
+```
+
+## License
+
+MIT License. See [LICENSE.txt](LICENSE.txt).
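The README's Extraction Backends section describes the default pipeline: readability extraction followed by Markdown conversion. A minimal standalone sketch of that pipeline, assuming the ruby-readability and reverse_markdown gems are installed; the sample HTML is invented:

require "readability"       # ruby-readability gem
require "reverse_markdown"

html = <<~HTML
  <html><body>
    <nav>Site navigation to be stripped</nav>
    <article>
      <h1>Hello</h1>
      <p>A long enough paragraph of readable body text for the
      readability heuristics to pick as the main content.</p>
    </article>
  </body></html>
HTML

# Readability picks the main content block; ReverseMarkdown turns it into Markdown.
readable = Readability::Document.new(html)
markdown = ReverseMarkdown.convert(readable.content, github_flavored: true).strip
puts readable.title
puts markdown

On very small or script-heavy documents the readability heuristics may find little or no content, which is why the gem's Extractor below fails with "readability returned no content" when the extracted text is empty.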
data/Rakefile
ADDED
data/exe/nous
ADDED
data/lib/nous/cli.rb
ADDED
@@ -0,0 +1,92 @@
+# frozen_string_literal: true
+
+require "optparse"
+
+module Nous
+  class Cli
+    class Error < Nous::Error; end
+
+    def initialize(argv)
+      @argv = argv
+      @options = {format: :text, concurrency: 3, limit: 100, timeout: 15}
+    end
+
+    def run
+      parse_options!
+      validate!
+
+      pages = Nous.fetch(seed_url, **fetch_options)
+      output = Nous.serialize(pages, format: options[:format])
+      write_output(output)
+    rescue Nous::Error => e
+      warn("nous: #{e.message}")
+      exit 1
+    end
+
+    private
+
+    attr_reader :argv, :options
+
+    def seed_url
+      argv.first
+    end
+
+    def fetch_options
+      opts = options.slice(:concurrency, :match, :limit, :timeout, :verbose)
+      opts[:extractor] = extractor
+      opts
+    end
+
+    def extractor
+      return Extractor::Jina.new if options[:jina]
+
+      Extractor::Default.new(selector: options[:selector])
+    end
+
+    def validate!
+      raise Error, "no URL provided. Usage: nous <url> [options]" unless seed_url
+    end
+
+    def write_output(output)
+      if options[:output]
+        File.write(options[:output], output)
+      else
+        $stdout.puts(output)
+      end
+    end
+
+    def parse_options!
+      parser.parse!(argv)
+    rescue OptionParser::InvalidOption => e
+      raise Error, e.message
+    end
+
+    def parser
+      OptionParser.new do |opts|
+        opts.banner = "Usage: nous <url> [options]"
+
+        opts.on("-o", "--output PATH", "Write output to file (default: stdout)") { |v| options[:output] = v }
+        opts.on("-f", "--format FORMAT", "Output format: text or json (default: text)") do |v|
+          options[:format] = v.to_sym
+        end
+        opts.on("-c", "--concurrency N", Integer, "Concurrent requests (default: 3)") { |v| options[:concurrency] = v }
+        opts.on("-m", "--match PATTERN", "Only include pages matching glob (repeatable)") do |v|
+          (options[:match] ||= []) << v
+        end
+        opts.on("-s", "--selector SELECTOR", "CSS selector to scope extraction") { |v| options[:selector] = v }
+        opts.on("-l", "--limit N", Integer, "Maximum pages to fetch") { |v| options[:limit] = v }
+        opts.on("--timeout N", Integer, "Per-request timeout in seconds (default: 15)") { |v| options[:timeout] = v }
+        opts.on("--jina", "Use Jina Reader API for extraction (handles JS-rendered sites)") { options[:jina] = true }
+        opts.on("-v", "--verbose", "Verbose logging to stderr") { options[:verbose] = true }
+        opts.on("-h", "--help", "Show help") do
+          $stdout.puts(opts)
+          exit
+        end
+        opts.on("--version", "Show version") do
+          $stdout.puts("nous #{Nous::VERSION}")
+          exit
+        end
+      end
+    end
+  end
+end
data/lib/nous/command.rb
ADDED
@@ -0,0 +1,43 @@
+# frozen_string_literal: true
+
+module Nous
+  class Command
+    class Error < Nous::Error; end
+
+    class Result
+      attr_reader :payload, :error, :metadata
+
+      def initialize(success:, payload: nil, error: nil, metadata: {})
+        @success = success
+        @payload = payload
+        @error = error
+        @metadata = metadata
+      end
+
+      def success?
+        @success
+      end
+
+      def failure?
+        !@success
+      end
+    end
+
+    def self.call(...)
+      command = new(...)
+      command.call
+    rescue => e
+      return command.failure(Error.new("unexpected: #{e.message}")) if command
+
+      Result.new(success: false, error: e)
+    end
+
+    def success(payload:, metadata: {})
+      Result.new(success: true, payload:, metadata:)
+    end
+
+    def failure(error, metadata: {})
+      Result.new(success: false, error:, metadata:)
+    end
+  end
+end
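Command is the backbone the Crawler, Converter, Extractor, Fetcher, and Serializer classes below all build on: `.call` instantiates, runs, and wraps stray exceptions in a failed Result. A minimal sketch of how a subclass plugs in; `Shout` is hypothetical, not part of the gem:

module Nous
  class Shout < Command
    class Error < Command::Error; end

    def initialize(text:)
      @text = text
    end

    def call
      # Expected failures return a Result; unexpected exceptions are
      # caught by Command.call and wrapped in one.
      return failure(Error.new("empty input")) if @text.empty?

      success(payload: @text.upcase)
    end
  end
end

result = Nous::Shout.call(text: "hello")
result.success?  # => true
result.payload   # => "HELLO"
Nous::Shout.call(text: "").error.message  # => "empty input"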
data/lib/nous/converter.rb
ADDED
@@ -0,0 +1,22 @@
+# frozen_string_literal: true
+
+require "reverse_markdown"
+
+module Nous
+  class Converter < Command
+    class Error < Command::Error; end
+
+    def initialize(html:)
+      @html = html
+    end
+
+    def call
+      markdown = ReverseMarkdown.convert(html, github_flavored: true).strip
+      success(payload: markdown)
+    end
+
+    private
+
+    attr_reader :html
+  end
+end
data/lib/nous/crawler/configuration.rb
ADDED
@@ -0,0 +1,39 @@
+# frozen_string_literal: true
+
+require "uri"
+
+module Nous
+  class Crawler < Command
+    class Error < Command::Error; end
+
+    class Configuration
+      attr_reader :seed, :concurrency, :match, :limit, :timeout, :verbose, :keep_query
+
+      DEFAULT_CONCURRENCY = 3
+      DEFAULT_LIMIT = 100
+      DEFAULT_TIMEOUT = 15
+
+      def initialize(seed_url:, concurrency: DEFAULT_CONCURRENCY, match: [], limit: DEFAULT_LIMIT,
+        timeout: DEFAULT_TIMEOUT, verbose: false, keep_query: false)
+        @seed = parse_seed!(seed_url)
+        @concurrency = Integer(concurrency).clamp(1, 20)
+        @match = Array(match)
+        @limit = Integer(limit).clamp(1, 10_000)
+        @timeout = Integer(timeout)
+        @verbose = verbose
+        @keep_query = keep_query
+      end
+
+      private
+
+      def parse_seed!(url)
+        uri = URI.parse(url)
+        raise Error, "seed URL must be http or https" unless uri.is_a?(URI::HTTP)
+
+        uri
+      rescue URI::InvalidURIError => e
+        raise Error, "invalid seed URL: #{e.message}"
+      end
+    end
+  end
+end
data/lib/nous/crawler/link_extractor.rb
ADDED
@@ -0,0 +1,41 @@
+# frozen_string_literal: true
+
+module Nous
+  class Crawler < Command
+    class LinkExtractor
+      def initialize(url_filter:, verbose: false)
+        @url_filter = url_filter
+        @verbose = verbose
+      end
+
+      def extract(current_url, html)
+        base_uri = URI.parse(current_url)
+
+        anchors(html).filter_map { |href| resolve(base_uri, href) }.uniq
+      end
+
+      private
+
+      attr_reader :url_filter, :verbose
+
+      def anchors(html)
+        Nokogiri::HTML(html).css("a[href]").map { |node| node["href"] }
+      end
+
+      def resolve(base_uri, href)
+        return unless url_filter.allowed?(href)
+
+        uri = URI.join(base_uri, href)
+        return unless url_filter.same_host?(uri)
+
+        canonical = url_filter.canonicalize(uri)
+        return unless url_filter.matches_path?(URI.parse(canonical).path)
+
+        canonical
+      rescue URI::InvalidURIError => e
+        warn("[nous] malformed href #{href.inspect}: #{e.message}") if verbose
+        nil
+      end
+    end
+  end
+end
data/lib/nous/crawler/page_fetcher.rb
ADDED
@@ -0,0 +1,45 @@
+# frozen_string_literal: true
+
+module Nous
+  class Crawler < Command
+    class PageFetcher
+      HTML_CONTENT_TYPES = %w[text/html application/xhtml+xml].freeze
+
+      def initialize(client:, timeout:, verbose: false)
+        @client = client
+        @timeout = timeout
+        @verbose = verbose
+      end
+
+      def fetch(url)
+        Async::Task.current.with_timeout(timeout) do
+          response = client.get(url, {})
+          return skip(url, "status #{response.status}") unless response.status == 200
+          return skip(url, "non-html content") unless html?(response)
+
+          {url:, pathname: URI.parse(url).path, html: response.read}
+        ensure
+          response&.close
+        end
+      rescue Async::TimeoutError
+        skip(url, "timeout after #{timeout}s")
+      rescue IOError, SocketError, Errno::ECONNREFUSED => e
+        skip(url, e.message)
+      end
+
+      private
+
+      attr_reader :client, :timeout, :verbose
+
+      def html?(response)
+        content_type = response.headers["content-type"].to_s
+        HTML_CONTENT_TYPES.any? { |type| content_type.include?(type) }
+      end
+
+      def skip(url, reason)
+        warn("[nous] skip #{url}: #{reason}") if verbose
+        nil
+      end
+    end
+  end
+end
data/lib/nous/crawler/url_filter.rb
ADDED
@@ -0,0 +1,43 @@
+# frozen_string_literal: true
+
+module Nous
+  class Crawler < Command
+    class UrlFilter
+      IGNORED_SCHEMES = %w[mailto: javascript: tel:].freeze
+
+      def initialize(config)
+        @host = config.seed.host
+        @match = config.match
+        @keep_query = config.keep_query
+      end
+
+      def canonicalize(uri)
+        uri = URI.parse(uri.to_s)
+        uri.fragment = nil
+        uri.query = nil unless keep_query
+        uri.path = "/" if uri.path.empty?
+        uri.to_s
+      end
+
+      def allowed?(href)
+        return false if href.strip.empty?
+
+        IGNORED_SCHEMES.none? { |s| href.start_with?(s) }
+      end
+
+      def same_host?(uri)
+        uri.is_a?(URI::HTTP) && uri.host == host
+      end
+
+      def matches_path?(path)
+        return true if match.empty?
+
+        match.any? { |pattern| File.fnmatch(pattern, path, File::FNM_PATHNAME | File::FNM_EXTGLOB) }
+      end
+
+      private
+
+      attr_reader :host, :match, :keep_query
+    end
+  end
+end
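For reference, `matches_path?` above delegates to `File.fnmatch` with `FNM_PATHNAME` (so `*` stops at path separators) and `FNM_EXTGLOB` (so `{a,b}` alternation works). A few illustrative calls with invented patterns:

FLAGS = File::FNM_PATHNAME | File::FNM_EXTGLOB

File.fnmatch("/blog/*", "/blog/intro", FLAGS)          # => true
File.fnmatch("/blog/*", "/blog/2024/intro", FLAGS)     # => false ("*" will not cross "/")
File.fnmatch("/blog/**/*", "/blog/2024/intro", FLAGS)  # => true ("**/" spans directory levels)
File.fnmatch("/{docs,blog}/*", "/docs/setup", FLAGS)   # => true (braces enabled by FNM_EXTGLOB)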
data/lib/nous/crawler.rb
ADDED
@@ -0,0 +1,80 @@
+# frozen_string_literal: true
+
+require "async"
+require "async/http/internet"
+require "nokogiri"
+require "uri"
+
+module Nous
+  class Crawler < Command
+    class Error < Command::Error; end
+
+    def initialize(seed_url:, **options)
+      @config = Configuration.new(seed_url:, **options)
+    end
+
+    def call
+      pages = []
+      queue = [url_filter.canonicalize(config.seed)]
+      seen = Set.new(queue)
+
+      Async do
+        client = Async::HTTP::Internet.new
+        begin
+          crawl(queue:, seen:, pages:, client:)
+        ensure
+          client.close
+        end
+      end.wait
+
+      success(payload: pages)
+    end
+
+    private
+
+    attr_reader :config
+
+    def crawl(queue:, seen:, pages:, client:)
+      while queue.any? && pages.length < config.limit
+        batch = queue.shift(config.concurrency)
+        fetch_batch(batch, client).each do |page|
+          next unless page
+
+          pages << page
+          break if pages.length >= config.limit
+
+          link_extractor.extract(page[:url], page[:html]).each do |url|
+            next if seen.include?(url)
+
+            seen << url
+            queue << url
+          end
+        end
+      end
+    end
+
+    def fetch_batch(urls, client)
+      tasks = []
+
+      Async do |task|
+        urls.each do |url|
+          tasks << task.async { page_fetcher(client).fetch(url) }
+        end
+      end.wait
+
+      tasks.map(&:wait)
+    end
+
+    def url_filter
+      @url_filter ||= UrlFilter.new(config)
+    end
+
+    def link_extractor
+      @link_extractor ||= LinkExtractor.new(url_filter:, verbose: config.verbose)
+    end
+
+    def page_fetcher(client)
+      PageFetcher.new(client:, timeout: config.timeout, verbose: config.verbose)
+    end
+  end
+end
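`fetch_batch` above relies on the async gem's task tree: a parent `Async` block spawns one child task per URL, and each child's `.wait` yields its return value. A self-contained sketch of the same pattern, with a sleep standing in for the HTTP request:

require "async"

urls = %w[https://example.com/a https://example.com/b https://example.com/c]

results = Async do |task|
  children = urls.map do |url|
    # Each child yields to the scheduler while "fetching", so the
    # batch runs concurrently rather than serially.
    task.async do
      sleep 0.1 # stand-in for a real HTTP request
      "fetched #{url}"
    end
  end

  children.map(&:wait) # collect results in submission order
end.wait

p results # => ["fetched https://example.com/a", ...]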
data/lib/nous/error.rb
ADDED
data/lib/nous/extraction_runner.rb
ADDED
@@ -0,0 +1,31 @@
+# frozen_string_literal: true
+
+module Nous
+  class ExtractionRunner
+    def initialize(raw_pages:, extractor:, concurrency: 3, verbose: false)
+      @raw_pages = raw_pages
+      @extractor = extractor
+      @concurrency = Integer(concurrency).clamp(1, 20)
+      @verbose = verbose
+    end
+
+    def call
+      raw_pages.each_slice(concurrency).each_with_object([]) do |batch, pages|
+        threads = batch.map { |raw| Thread.new { build_thread(raw).call } }
+
+        threads.each do |thread|
+          result = thread.value
+          pages << result if result
+        end
+      end
+    end
+
+    private
+
+    attr_reader :raw_pages, :extractor, :concurrency, :verbose
+
+    def build_thread(raw_page)
+      ExtractionThread.new(extractor:, raw_page:, verbose:)
+    end
+  end
+end
data/lib/nous/extraction_thread.rb
ADDED
@@ -0,0 +1,29 @@
+# frozen_string_literal: true
+
+module Nous
+  class ExtractionThread
+    def initialize(extractor:, raw_page:, verbose: false)
+      @extractor = extractor
+      @raw_page = raw_page
+      @verbose = verbose
+    end
+
+    def call
+      extracted = extractor.extract(raw_page)
+
+      Page.new(
+        title: extracted[:title],
+        url: raw_page[:url],
+        pathname: raw_page[:pathname],
+        content: extracted[:content]
+      )
+    rescue Nous::Error => e
+      warn("[nous] extract skip #{raw_page[:url]}: #{e.message}") if verbose
+      nil
+    end
+
+    private
+
+    attr_reader :extractor, :raw_page, :verbose
+  end
+end
data/lib/nous/extractor/default.rb
ADDED
@@ -0,0 +1,36 @@
+# frozen_string_literal: true
+
+module Nous
+  class Extractor
+    class Default
+      def initialize(selector: nil)
+        @selector = selector
+      end
+
+      def extract(page)
+        extracted = extract_content(page[:html])
+        markdown = convert_to_markdown(extracted[:content])
+
+        {title: extracted[:title], content: markdown}
+      end
+
+      private
+
+      attr_reader :selector
+
+      def extract_content(html)
+        result = Extractor.call(html:, selector:)
+        raise result.error if result.failure?
+
+        result.payload
+      end
+
+      def convert_to_markdown(html)
+        result = Converter.call(html:)
+        raise result.error if result.failure?
+
+        result.payload
+      end
+    end
+  end
+end
data/lib/nous/extractor/jina/client.rb
ADDED
@@ -0,0 +1,59 @@
+# frozen_string_literal: true
+
+require "faraday"
+require "faraday/retry"
+require "json"
+
+module Nous
+  class Extractor
+    class Jina
+      class Client
+        class Error < Nous::Error; end
+
+        BASE_URL = "https://r.jina.ai"
+        RETRYABLE_STATUSES = [429, 500, 502, 503, 504].freeze
+        MAX_RETRIES = 3
+
+        def initialize(api_key: nil, timeout: 30, retry_interval: 1)
+          @connection = build_connection(api_key:, timeout:, retry_interval:)
+        end
+
+        def get(url)
+          response = connection.get("/#{url}")
+          parse(response.body)
+        rescue Faraday::Error => e
+          raise Error, e.message
+        end
+
+        private
+
+        attr_reader :connection
+
+        def build_connection(api_key:, timeout:, retry_interval:)
+          Faraday.new(url: BASE_URL) do |f|
+            f.response :raise_error
+
+            f.request :retry,
+              max: MAX_RETRIES,
+              interval: retry_interval,
+              backoff_factor: 2,
+              retry_statuses: RETRYABLE_STATUSES
+
+            f.headers["Accept"] = "application/json"
+            f.headers["X-No-Cache"] = "true"
+            f.headers["Authorization"] = "Bearer #{api_key}" if api_key
+
+            f.options.timeout = timeout
+            f.options.open_timeout = timeout
+          end
+        end
+
+        def parse(body)
+          JSON.parse(body)
+        rescue JSON::ParserError => e
+          raise Error, "invalid JSON: #{e.message}"
+        end
+      end
+    end
+  end
+end
data/lib/nous/extractor/jina.rb
ADDED
@@ -0,0 +1,25 @@
+# frozen_string_literal: true
+
+module Nous
+  class Extractor
+    class Jina
+      class Error < Nous::Error; end
+
+      def initialize(api_key: nil, timeout: 30, **client_options)
+        @client = Client.new(api_key: api_key || ENV["JINA_API_KEY"], timeout:, **client_options)
+      end
+
+      def extract(page)
+        body = client.get(page[:url])
+
+        {title: body.dig("data", "title") || "", content: body.dig("data", "content") || ""}
+      rescue Client::Error => e
+        raise Error, e.message
+      end
+
+      private
+
+      attr_reader :client
+    end
+  end
+end
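Putting the two Jina classes together: the extractor wraps the client, falls back to `ENV["JINA_API_KEY"]` when no key is passed, and maps the Reader API's JSON envelope to the `{title:, content:}` shape the extraction pipeline expects. A minimal sketch; the URL is invented:

require "nous"

# api_key: omitted, so ENV["JINA_API_KEY"] is used if set.
extractor = Nous::Extractor::Jina.new(timeout: 30)

page = extractor.extract({url: "https://spa-site.example/app-router-page"})
puts page[:title]
puts page[:content] # Markdown rendered by the Jina Reader API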
data/lib/nous/extractor.rb
ADDED
@@ -0,0 +1,46 @@
+# frozen_string_literal: true
+
+require "readability"
+
+module Nous
+  class Extractor < Command
+    class Error < Command::Error; end
+
+    NOISY_TAGS = %w[script style link nav header footer img video svg].freeze
+
+    def initialize(html:, selector: nil)
+      @html = html
+      @selector = selector
+    end
+
+    def call
+      doc = Nokogiri::HTML(html)
+      doc = scope_to_selector(doc) if selector
+      strip_noisy_tags(doc)
+
+      readable = Readability::Document.new(doc.to_html)
+      text = Nokogiri::HTML(readable.content).text.strip
+
+      return failure(Error.new("readability returned no content")) if text.empty?
+
+      success(payload: {title: readable.title, content: readable.content})
+    end
+
+    private
+
+    attr_reader :html, :selector
+
+    def scope_to_selector(doc)
+      scoped = doc.at_css(selector)
+      return doc unless scoped
+
+      fragment = Nokogiri::HTML::Document.new
+      fragment.root = scoped
+      fragment
+    end
+
+    def strip_noisy_tags(doc)
+      NOISY_TAGS.each { |tag| doc.css(tag).each(&:remove) }
+    end
+  end
+end
data/lib/nous/fetcher.rb
ADDED
@@ -0,0 +1,39 @@
+# frozen_string_literal: true
+
+module Nous
+  class Fetcher < Command
+    class Error < Command::Error; end
+
+    def initialize(seed_url:, extractor: Extractor::Default.new, **crawler_options)
+      @seed_url = seed_url
+      @extractor = extractor
+      @crawler_options = crawler_options
+    end
+
+    def call
+      raw_pages = crawl
+      pages = extract(raw_pages)
+      success(payload: pages)
+    end
+
+    private
+
+    attr_reader :seed_url, :extractor, :crawler_options
+
+    def crawl
+      result = Crawler.call(seed_url:, **crawler_options)
+      raise Error, result.error.message if result.failure?
+
+      result.payload
+    end
+
+    def extract(raw_pages)
+      ExtractionRunner.new(
+        raw_pages:,
+        extractor:,
+        concurrency: crawler_options.fetch(:concurrency, 3),
+        verbose: crawler_options.fetch(:verbose, false)
+      ).call
+    end
+  end
+end
data/lib/nous/page.rb
ADDED
data/lib/nous/serializer.rb
ADDED
@@ -0,0 +1,54 @@
+# frozen_string_literal: true
+
+require "json"
+
+module Nous
+  class Serializer < Command
+    class Error < Command::Error; end
+
+    FORMATS = %i[text json].freeze
+
+    def initialize(pages:, format: :text)
+      @pages = pages
+      @format = format.to_sym
+      validate_format!
+    end
+
+    def call
+      output = (format == :json) ? serialize_json : serialize_text
+      success(payload: output)
+    end
+
+    private
+
+    attr_reader :pages, :format
+
+    def validate_format!
+      raise Error, "unknown format: #{format}. Must be one of: #{FORMATS.join(", ")}" unless FORMATS.include?(format)
+    end
+
+    def serialize_text
+      pages.map { |page| text_page(page) }.join("\n\n")
+    end
+
+    def serialize_json
+      JSON.pretty_generate(pages.map { |page| json_page(page) })
+    end
+
+    def text_page(page)
+      <<~XML
+        <page>
+        <title>#{page.title}</title>
+        <url>#{page.url}</url>
+        <content>
+        #{page.content}
+        </content>
+        </page>
+      XML
+    end
+
+    def json_page(page)
+      {title: page.title, url: page.url, content: page.content}
+    end
+  end
+end
data/lib/nous/version.rb
ADDED
data/lib/nous.rb
ADDED
@@ -0,0 +1,24 @@
+# frozen_string_literal: true
+
+require "zeitwerk"
+
+loader = Zeitwerk::Loader.for_gem
+loader.setup
+
+module Nous
+  module_function
+
+  def fetch(seed_url, **options)
+    result = Fetcher.call(seed_url:, **options)
+    raise result.error if result.failure?
+
+    result.payload
+  end
+
+  def serialize(pages, format: :text)
+    result = Serializer.call(pages:, format:)
+    raise result.error if result.failure?
+
+    result.payload
+  end
+end
data/sig/nous.rbs
ADDED
metadata
ADDED
@@ -0,0 +1,244 @@
+--- !ruby/object:Gem::Specification
+name: nous
+version: !ruby/object:Gem::Version
+  version: 0.1.0
+platform: ruby
+authors:
+- Dan Frenette
+autorequire:
+bindir: exe
+cert_chain: []
+date: 2026-02-21 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: async
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.24'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.24'
+- !ruby/object:Gem::Dependency
+  name: async-http
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.88'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.88'
+- !ruby/object:Gem::Dependency
+  name: faraday
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.12'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.12'
+- !ruby/object:Gem::Dependency
+  name: faraday-retry
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.2'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.2'
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.16'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.16'
+- !ruby/object:Gem::Dependency
+  name: reverse_markdown
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.0'
+- !ruby/object:Gem::Dependency
+  name: ruby-readability
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.7'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.7'
+- !ruby/object:Gem::Dependency
+  name: zeitwerk
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.6'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.6'
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '13.0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '13.0'
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.13'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.13'
+- !ruby/object:Gem::Dependency
+  name: standard
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.42'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.42'
+- !ruby/object:Gem::Dependency
+  name: webmock
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.25'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.25'
+description: Nous crawls same-host web pages, extracts readable content, and serializes
+  clean Markdown as text or JSON.
+email:
+- dan.r.frenette@gmail.com
+executables:
+- nous
+extensions: []
+extra_rdoc_files: []
+files:
+- ".rspec"
+- ".standard.yml"
+- CHANGELOG.md
+- LICENSE.txt
+- README.md
+- Rakefile
+- exe/nous
+- lib/nous.rb
+- lib/nous/cli.rb
+- lib/nous/command.rb
+- lib/nous/converter.rb
+- lib/nous/crawler.rb
+- lib/nous/crawler/configuration.rb
+- lib/nous/crawler/link_extractor.rb
+- lib/nous/crawler/page_fetcher.rb
+- lib/nous/crawler/url_filter.rb
+- lib/nous/error.rb
+- lib/nous/extraction_runner.rb
+- lib/nous/extraction_thread.rb
+- lib/nous/extractor.rb
+- lib/nous/extractor/default.rb
+- lib/nous/extractor/jina.rb
+- lib/nous/extractor/jina/client.rb
+- lib/nous/fetcher.rb
+- lib/nous/page.rb
+- lib/nous/serializer.rb
+- lib/nous/version.rb
+- sig/nous.rbs
+homepage: https://github.com/danfrenette/nous
+licenses:
+- MIT
+metadata:
+  homepage_uri: https://github.com/danfrenette/nous
+  source_code_uri: https://github.com/danfrenette/nous
+  changelog_uri: https://github.com/danfrenette/nous/blob/main/CHANGELOG.md
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: 3.2.0
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubygems_version: 3.5.9
+signing_key:
+specification_version: 4
+summary: Crawl websites and extract readable markdown for LLM workflows
+test_files: []