llm_scraper 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.env.example +10 -0
- data/CHANGELOG.md +5 -0
- data/CODE_OF_CONDUCT.md +10 -0
- data/LICENSE.txt +21 -0
- data/README.md +268 -0
- data/Rakefile +12 -0
- data/lib/llm_scraper/configuration.rb +24 -0
- data/lib/llm_scraper/content_fetchers/base.rb +25 -0
- data/lib/llm_scraper/content_fetchers/firecrawl.rb +34 -0
- data/lib/llm_scraper/content_fetchers/jina.rb +31 -0
- data/lib/llm_scraper/content_fetchers/local.rb +57 -0
- data/lib/llm_scraper/content_fetchers/markdownify.rb +34 -0
- data/lib/llm_scraper/llm_clients/anthropic.rb +53 -0
- data/lib/llm_scraper/llm_clients/base.rb +45 -0
- data/lib/llm_scraper/llm_clients/openai_compatible.rb +50 -0
- data/lib/llm_scraper/prompt_builder.rb +73 -0
- data/lib/llm_scraper/response_parser.rb +80 -0
- data/lib/llm_scraper/result.rb +20 -0
- data/lib/llm_scraper/schema.rb +72 -0
- data/lib/llm_scraper/scraper.rb +131 -0
- data/lib/llm_scraper/version.rb +5 -0
- data/lib/llm_scraper.rb +31 -0
- data/sig/llm_scraper.rbs +4 -0
- metadata +179 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: 7b23032aa5d35d2f2943290b0f353af606eea3590e4eaf5d7e54a689775a423f
|
|
4
|
+
data.tar.gz: 11bd8dc76ccda17b63d902bd0ce0d13d9c2badb139359e2448ba97a597ef948f
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: a1c6eb718ce331dcb4d895e2b2810d9f513208d8db2f7a1723be84bc41b13047cf13a33f625456183d19bc4b9af001cad578e952eae89e8a0f1d18317791d7a7
|
|
7
|
+
data.tar.gz: 8e819709d2fe96e9fcafe422e881ec853a13dc5041644ed2cb17e1460942f4a7fa1e859012609f8e4188c3c126e5658fd33de8101b435915c2d483797daf30a2
|
data/.env.example
ADDED
data/CHANGELOG.md
ADDED
data/CODE_OF_CONDUCT.md
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
# Code of Conduct
|
|
2
|
+
|
|
3
|
+
"llm_scraper" follows [The Ruby Community Conduct Guideline](https://www.ruby-lang.org/en/conduct) in all "collaborative space", which is defined as community communications channels (such as mailing lists, submitted patches, commit comments, etc.):
|
|
4
|
+
|
|
5
|
+
* Participants will be tolerant of opposing views.
|
|
6
|
+
* Participants must ensure that their language and actions are free of personal attacks and disparaging personal remarks.
|
|
7
|
+
* When interpreting the words and actions of others, participants should always assume good intentions.
|
|
8
|
+
* Behaviour which can be reasonably considered harassment will not be tolerated.
|
|
9
|
+
|
|
10
|
+
If you have any concerns about behaviour within this project, please contact us at [cuongnguyenfu@gmail.com](mailto:cuongnguyenfu@gmail.com).
|
data/LICENSE.txt
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
The MIT License (MIT)
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 cuongnc0211
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
|
13
|
+
all copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
|
@@ -0,0 +1,268 @@
|
|
|
1
|
+
# LlmScraper
|
|
2
|
+
|
|
3
|
+
Extract structured JSON from any web page using LLMs — more reliable than CSS selectors, works even when markup changes.
|
|
4
|
+
|
|
5
|
+
```
|
|
6
|
+
URL → ContentFetcher (URL → Markdown) → LlmClient (Markdown → JSON) → Result
|
|
7
|
+
```
|
|
8
|
+
|
|
9
|
+
Two independent layers let you mix any fetcher with any LLM provider.
|
|
10
|
+
|
|
11
|
+
## Features
|
|
12
|
+
|
|
13
|
+
- **Schema DSL** — declare fields with `type`, `what`, `how`, `examples`, `enum`, `required`, `default`
|
|
14
|
+
- **Multiple fetchers** — Jina AI, Firecrawl, ScrapeGraphAI Markdownify, or local Nokogiri (+ optional Ferrum for SPA)
|
|
15
|
+
- **Multiple LLM providers** — any OpenAI-compatible API (DeepSeek, Kimi, GLM, Gemini, OpenRouter…) or Anthropic native
|
|
16
|
+
- **Automatic retry** — re-prompts once with stricter instructions on JSON parse failure
|
|
17
|
+
- **Cost estimation** — `result.cost_usd` based on token usage
|
|
18
|
+
- **Minimal dependencies** — Faraday + Nokogiri + Zeitwerk, no Rails required
|
|
19
|
+
|
|
20
|
+
## Installation
|
|
21
|
+
|
|
22
|
+
Add to your Gemfile:
|
|
23
|
+
|
|
24
|
+
```ruby
|
|
25
|
+
gem "llm_scraper"
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
Or install directly:
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
gem install llm_scraper
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
## Quick Start
|
|
35
|
+
|
|
36
|
+
```ruby
|
|
37
|
+
require "llm_scraper"
|
|
38
|
+
|
|
39
|
+
LlmScraper.configure do |c|
|
|
40
|
+
c.llm_provider = :openai_compatible
|
|
41
|
+
c.llm_base_url = "https://api.deepseek.com/v1"
|
|
42
|
+
c.llm_api_key = ENV["DEEPSEEK_API_KEY"]
|
|
43
|
+
c.llm_model = "deepseek-v4-flash"
|
|
44
|
+
c.fetcher = :jina
|
|
45
|
+
c.jina_api_key = ENV["JINA_API_KEY"] # optional — 200 req/day free without key
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
schema = LlmScraper::Schema.define do
|
|
49
|
+
field :name, type: :string, required: true, description: "Full name of the artisan"
|
|
50
|
+
field :price, type: :number, what: "Current retail price",
|
|
51
|
+
how: "Return CNY value as a number, strip ¥ symbol"
|
|
52
|
+
field :style, type: :string, enum: ["yixing", "zhuni", "duanni"]
|
|
53
|
+
end
|
|
54
|
+
|
|
55
|
+
result = LlmScraper::Scraper.new(schema: schema).scrape("https://example.com/teapot")
|
|
56
|
+
|
|
57
|
+
result.success? # => true
|
|
58
|
+
result.data # => { name: "Gu Jingzhou", price: 15000, style: "yixing" }
|
|
59
|
+
result.tokens_used # => { input: 4821, output: 87 }
|
|
60
|
+
result.cost_usd # => 0.0009
|
|
61
|
+
result.fetcher # => :jina
|
|
62
|
+
result.provider # => :openai_compatible
|
|
63
|
+
result.duration_ms # => 1842
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## Schema DSL
|
|
67
|
+
|
|
68
|
+
```ruby
|
|
69
|
+
schema = LlmScraper::Schema.define do
|
|
70
|
+
# Simple field — description is enough
|
|
71
|
+
field :name, type: :string, required: true, description: "Full artisan name"
|
|
72
|
+
field :available, type: :boolean, default: true, description: "In stock status"
|
|
73
|
+
|
|
74
|
+
# Complex field — what identifies the field, how tells the LLM how to extract it
|
|
75
|
+
field :price,
|
|
76
|
+
type: :number,
|
|
77
|
+
what: "Current retail price (not auction, not historical)",
|
|
78
|
+
how: "Return CNY as a plain number, strip ¥. If multiple prices, take the lowest",
|
|
79
|
+
examples: [1500, 8000, 25000]
|
|
80
|
+
|
|
81
|
+
# Closed-set field — LLM must pick from the list
|
|
82
|
+
field :clay_type,
|
|
83
|
+
type: :string,
|
|
84
|
+
what: "Clay type used",
|
|
85
|
+
how: "Return lowercase English name",
|
|
86
|
+
enum: ["zisha", "zhuni", "duanni", "hongni"]
|
|
87
|
+
|
|
88
|
+
# Array field
|
|
89
|
+
field :techniques, type: :array, items: :string,
|
|
90
|
+
description: "Distinctive crafting techniques"
|
|
91
|
+
end
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
### Field options
|
|
95
|
+
|
|
96
|
+
| Option | Purpose |
|
|
97
|
+
|---|---|
|
|
98
|
+
| `type` | `:string`, `:number`, `:boolean`, `:array`, `:object` |
|
|
99
|
+
| `description` | Alias for `what` — use for simple fields |
|
|
100
|
+
| `what` | What this field is (identity, disambiguation) |
|
|
101
|
+
| `how` | Extraction instruction (normalization, format, edge cases) |
|
|
102
|
+
| `examples` | Few-shot values to improve accuracy |
|
|
103
|
+
| `enum` | Closed-set — LLM must pick one of these values |
|
|
104
|
+
| `required` | Raises `ParseError` if null after extraction |
|
|
105
|
+
| `default` | Fallback value when field is missing |
|
|
106
|
+
| `items` | Element type for `type: :array` |
|
|
107
|
+
|
|
108
|
+
Schema can also be a plain Hash:
|
|
109
|
+
|
|
110
|
+
```ruby
|
|
111
|
+
schema = {
|
|
112
|
+
name: { type: :string, required: true, description: "Artisan name" },
|
|
113
|
+
price: { type: :number, description: "Price in CNY" },
|
|
114
|
+
}
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
## Fetchers
|
|
118
|
+
|
|
119
|
+
### Jina AI (default, recommended)
|
|
120
|
+
|
|
121
|
+
Clean Markdown via `r.jina.ai` — no JS execution needed, generous free tier.
|
|
122
|
+
|
|
123
|
+
```ruby
|
|
124
|
+
c.fetcher = :jina
|
|
125
|
+
c.jina_api_key = ENV["JINA_API_KEY"] # optional, ~200 req/day without key
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
### Firecrawl
|
|
129
|
+
|
|
130
|
+
Higher fidelity, handles JS-heavy pages, 1 credit per page.
|
|
131
|
+
|
|
132
|
+
```ruby
|
|
133
|
+
c.fetcher = :firecrawl
|
|
134
|
+
c.firecrawl_api_key = ENV["FIRECRAWL_API_KEY"]
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
### ScrapeGraphAI Markdownify
|
|
138
|
+
|
|
139
|
+
```ruby
|
|
140
|
+
c.fetcher = :markdownify
|
|
141
|
+
c.markdownify_api_key = ENV["MARKDOWNIFY_API_KEY"]
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
### Local (Nokogiri)
|
|
145
|
+
|
|
146
|
+
No external API — fetches directly and strips boilerplate HTML with Nokogiri.
|
|
147
|
+
|
|
148
|
+
```ruby
|
|
149
|
+
c.fetcher = :local
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
For SPA pages that require JavaScript, add `ferrum` to your Gemfile:
|
|
153
|
+
|
|
154
|
+
```ruby
|
|
155
|
+
gem "ferrum"
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
## LLM Providers
|
|
159
|
+
|
|
160
|
+
### OpenAI-compatible (DeepSeek, Kimi, GLM, Gemini, OpenRouter…)
|
|
161
|
+
|
|
162
|
+
```ruby
|
|
163
|
+
# DeepSeek V4 Flash — cheap and accurate
|
|
164
|
+
c.llm_provider = :openai_compatible
|
|
165
|
+
c.llm_base_url = "https://api.deepseek.com/v1"
|
|
166
|
+
c.llm_api_key = ENV["DEEPSEEK_API_KEY"]
|
|
167
|
+
c.llm_model = "deepseek-v4-flash"
|
|
168
|
+
|
|
169
|
+
# GLM-4.7-Flash — free, good for testing
|
|
170
|
+
c.llm_base_url = "https://open.bigmodel.cn/api/paas/v4"
|
|
171
|
+
c.llm_api_key = ENV["GLM_API_KEY"]
|
|
172
|
+
c.llm_model = "glm-4.7-flash"
|
|
173
|
+
|
|
174
|
+
# Gemini 2.5 Flash
|
|
175
|
+
c.llm_base_url = "https://generativelanguage.googleapis.com/v1beta/openai"
|
|
176
|
+
c.llm_api_key = ENV["GEMINI_API_KEY"]
|
|
177
|
+
c.llm_model = "gemini-2.5-flash"
|
|
178
|
+
|
|
179
|
+
# Kimi K2.5 — long context, auto cache
|
|
180
|
+
c.llm_base_url = "https://api.moonshot.ai/v1"
|
|
181
|
+
c.llm_api_key = ENV["KIMI_API_KEY"]
|
|
182
|
+
c.llm_model = "kimi-k2.5"
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
### Anthropic
|
|
186
|
+
|
|
187
|
+
```ruby
|
|
188
|
+
c.llm_provider = :anthropic
|
|
189
|
+
c.llm_api_key = ENV["ANTHROPIC_API_KEY"]
|
|
190
|
+
c.llm_model = "claude-haiku-4-5-20251001"
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
## API
|
|
194
|
+
|
|
195
|
+
### `Scraper#scrape(url, rescue_errors: false)`
|
|
196
|
+
|
|
197
|
+
Fetch URL then extract. Raises on error by default; pass `rescue_errors: true` to get a failure `Result` instead.
|
|
198
|
+
|
|
199
|
+
### `Scraper#extract(content)`
|
|
200
|
+
|
|
201
|
+
Extract from raw HTML/Markdown — skips the fetch step.
|
|
202
|
+
|
|
203
|
+
### `Scraper#scrape_batch(urls)`
|
|
204
|
+
|
|
205
|
+
Scrapes multiple URLs. Never raises — errors are captured in `result.error` per item.
|
|
206
|
+
|
|
207
|
+
```ruby
|
|
208
|
+
results = scraper.scrape_batch(["https://...", "https://..."])
|
|
209
|
+
results.each { |r| puts r.data if r.success? }
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
### `Scraper#with_provider(provider)` / `#with_fetcher(fetcher)`
|
|
213
|
+
|
|
214
|
+
Return a new `Scraper` with a swapped provider or fetcher — original is unchanged.
|
|
215
|
+
|
|
216
|
+
```ruby
|
|
217
|
+
cheap = scraper.with_provider(:openai_compatible)
|
|
218
|
+
accurate = scraper.with_provider(:anthropic)
|
|
219
|
+
offline = scraper.with_fetcher(:local)
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
### `Result`
|
|
223
|
+
|
|
224
|
+
| Field | Type | Description |
|
|
225
|
+
|---|---|---|
|
|
226
|
+
| `data` | `Hash` | Extracted fields (symbol keys) |
|
|
227
|
+
| `success?` | `Boolean` | |
|
|
228
|
+
| `error` | `String\|nil` | Error message on failure |
|
|
229
|
+
| `url` | `String\|nil` | Source URL |
|
|
230
|
+
| `fetcher` | `Symbol` | Fetcher used |
|
|
231
|
+
| `provider` | `Symbol` | LLM provider used |
|
|
232
|
+
| `model` | `String` | Model name |
|
|
233
|
+
| `tokens_used` | `Hash` | `{ input:, output: }` |
|
|
234
|
+
| `cost_usd` | `Float` | Estimated cost |
|
|
235
|
+
| `duration_ms` | `Integer` | Total wall time |
|
|
236
|
+
|
|
237
|
+
## Estimated Cost (1,000 pages/day)
|
|
238
|
+
|
|
239
|
+
| Combo | Fetcher | LLM/day | Total/day |
|
|
240
|
+
|---|---|---|---|
|
|
241
|
+
| Jina free + GLM-4.7-Flash | $0 | $0 | **$0** |
|
|
242
|
+
| Jina free + DeepSeek V4 Flash | $0 | ~$0.85 | **~$0.85** |
|
|
243
|
+
| Local + DeepSeek V4 Flash | $0 | ~$2–4 | **~$2–4** |
|
|
244
|
+
| Jina free + Claude Haiku | $0 | ~$3–5 | **~$3–5** |
|
|
245
|
+
|
|
246
|
+
> Local fetcher produces ~4× more tokens than Jina Markdown.
|
|
247
|
+
|
|
248
|
+
## Development
|
|
249
|
+
|
|
250
|
+
```bash
|
|
251
|
+
git clone https://github.com/cuongnc0211/llm_scraper
|
|
252
|
+
cd llm_scraper
|
|
253
|
+
bundle install
|
|
254
|
+
|
|
255
|
+
cp .env.example .env
|
|
256
|
+
# Add your API keys to .env
|
|
257
|
+
|
|
258
|
+
bundle exec rspec # run tests
|
|
259
|
+
bin/console # interactive console with dotenv loaded
|
|
260
|
+
```
|
|
261
|
+
|
|
262
|
+
## Contributing
|
|
263
|
+
|
|
264
|
+
Bug reports and pull requests are welcome at https://github.com/cuongnc0211/llm_scraper.
|
|
265
|
+
|
|
266
|
+
## License
|
|
267
|
+
|
|
268
|
+
MIT — see [LICENSE.txt](LICENSE.txt).
|
data/Rakefile
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module LlmScraper
  # Plain settings object for the gem: which LLM to call, which content
  # fetcher to use, and the credentials each of them needs.
  class Configuration
    # LLM side.
    #   llm_provider - :openai_compatible | :anthropic
    #   llm_base_url - e.g. "https://api.deepseek.com/v1"
    #   llm_api_key  - credential sent to the LLM provider
    #   llm_model    - e.g. "deepseek-v4-flash"
    #   llm_timeout  - seconds
    #   max_retries  - retry budget (the code consuming it is not in this class)
    attr_accessor :llm_provider, :llm_base_url, :llm_api_key,
                  :llm_model, :llm_timeout, :max_retries

    # Fetcher side.
    #   fetcher             - :jina | :firecrawl | :markdownify | :local
    #   jina_api_key        - nil = unauthenticated (~200 req/day limit)
    #   firecrawl_api_key   - credential for the Firecrawl fetcher
    #   markdownify_api_key - credential for the Markdownify fetcher
    attr_accessor :fetcher, :jina_api_key, :firecrawl_api_key, :markdownify_api_key

    # Seeds the defaults; every setting not assigned here starts out as nil.
    def initialize
      @fetcher      = :jina
      @llm_provider = :openai_compatible
      @max_retries  = 3
      @llm_timeout  = 30
    end
  end
end
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module LlmScraper
  module ContentFetchers
    # Common parent of the content fetchers. Subclasses override #fetch and
    # may use #build_connection to obtain a configured Faraday connection.
    class Base
      # Turns a URL into cleaned text/markdown. Abstract here.
      #
      # @param url [String]
      # @return [String] cleaned text/markdown content
      # @raise [LlmScraper::FetchError] in concrete fetchers
      # @raise [NotImplementedError] always, when called on a class that did not override it
      def fetch(url)
        raise NotImplementedError, "#{self.class}#fetch not implemented"
      end

      private

      # Builds a Faraday connection with the :retry request middleware
      # (max: 3, interval: 1, backoff_factor: 2), the given read timeout and
      # a fixed open timeout of 10.
      #
      # @param base_url [String, nil] passed to Faraday as `url:`
      # @param timeout [Integer] value assigned to `options.timeout`
      # @return [Faraday::Connection]
      def build_connection(base_url: nil, timeout: 30)
        Faraday.new(url: base_url) do |faraday|
          faraday.request(:retry, max: 3, interval: 1, backoff_factor: 2)
          faraday.options.timeout = timeout
          faraday.options.open_timeout = 10
          faraday.adapter(Faraday.default_adapter)
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module LlmScraper
  module ContentFetchers
    # Fetcher backed by the Firecrawl HTTP API: asks it to scrape a URL and
    # returns the markdown it produces.
    class Firecrawl < Base
      BASE_URL = "https://api.firecrawl.dev"

      # @param config [LlmScraper::Configuration] source of `firecrawl_api_key`
      def initialize(config = LlmScraper.configuration)
        @config = config
        @conn = build_connection(base_url: BASE_URL, timeout: 60)
      end

      # POSTs `{ url:, formats: ["markdown"] }` to /v1/scrape and returns the
      # value found at data.markdown in the JSON reply.
      #
      # @param url [String]
      # @return [String] markdown content
      # @raise [LlmScraper::FetchError] on a non-success status, a transport
      #   error, an unparseable body, or a reply without markdown
      def fetch(url)
        reply = @conn.post("/v1/scrape") do |request|
          request.headers["Authorization"] = "Bearer #{@config.firecrawl_api_key}"
          request.headers["Content-Type"] = "application/json"
          request.body = JSON.generate(url: url, formats: ["markdown"])
        end

        unless reply.success?
          raise LlmScraper::FetchError, "Firecrawl error (#{reply.status}): #{reply.body}"
        end

        markdown = JSON.parse(reply.body).dig("data", "markdown")
        return markdown if markdown

        raise(LlmScraper::FetchError, "Firecrawl returned no markdown for #{url}")
      rescue Faraday::Error => e
        raise LlmScraper::FetchError, "Firecrawl fetch error: #{e.message}"
      rescue JSON::ParserError => e
        raise LlmScraper::FetchError, "Firecrawl response parse error: #{e.message}"
      end
    end
  end
end
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module LlmScraper
  module ContentFetchers
    # Fetcher backed by r.jina.ai: the target URL is appended to the reader
    # host and the response body is returned as-is.
    class Jina < Base
      BASE_URL = "https://r.jina.ai"

      # @param config [LlmScraper::Configuration] source of the optional `jina_api_key`
      def initialize(config = LlmScraper.configuration)
        @config = config
        @conn = build_connection(base_url: BASE_URL, timeout: 30)
      end

      # GETs "/<url>" with `Accept: text/markdown`. An Authorization header is
      # sent only when a key is configured; without one a warning is printed
      # on every call.
      #
      # @param url [String]
      # @return [String] markdown content (the raw response body)
      # @raise [LlmScraper::FetchError] on a non-success status or transport error
      def fetch(url)
        api_key = @config.jina_api_key
        warn "[LlmScraper] No jina_api_key set — unauthenticated (~200 req/day limit)" if api_key.nil?

        reply = @conn.get("/#{url}") do |request|
          request.headers["Accept"] = "text/markdown"
          request.headers["Authorization"] = "Bearer #{api_key}" if api_key
        end

        return reply.body if reply.success?

        raise LlmScraper::FetchError, "Jina error (#{reply.status}): #{reply.body}"
      rescue Faraday::Error => e
        raise LlmScraper::FetchError, "Jina fetch error: #{e.message}"
      end
    end
  end
end
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module LlmScraper
  module ContentFetchers
    # Fetcher that talks to the target site directly (no external API) and
    # reduces the HTML to plain text with Nokogiri. Can optionally render the
    # page in a headless browser via the Ferrum gem.
    class Local < Base
      # @param config [LlmScraper::Configuration]
      def initialize(config = LlmScraper.configuration)
        @config = config
      end

      # @param url [String]
      # @param spa [Boolean] use Ferrum for JS-rendered pages
      # @return [String] cleaned plain text
      # @raise [LlmScraper::FetchError]
      def fetch(url, spa: false)
        spa ? fetch_spa(url) : fetch_static(url)
      end

      private

      # Plain HTTP GET followed by HTML cleanup.
      def fetch_static(url)
        conn = build_connection(timeout: 30)
        response = conn.get(url)
        raise LlmScraper::FetchError, "HTTP #{response.status} for #{url}" unless response.success?

        clean_html(response.body)
      rescue Faraday::Error => e
        raise LlmScraper::FetchError, "Failed to fetch #{url}: #{e.message}"
      end

      # Renders the page in a headless browser, waits for the network to go
      # idle, then cleans the resulting HTML.
      def fetch_spa(url)
        # Ferrum is an optional dependency, so it is loaded lazily here.
        begin
          require "ferrum"
        rescue LoadError
          raise LlmScraper::FetchError,
                "Ferrum gem required for SPA fetching. Add `gem 'ferrum'` to your Gemfile."
        end

        browser = Ferrum::Browser.new(headless: true)
        html =
          begin
            browser.go_to(url)
            browser.network.wait_for_idle
            browser.body
          ensure
            # Always shut the browser down, even when navigation fails;
            # otherwise the browser started above would be left running.
            browser.quit
          end

        clean_html(html)
      rescue LlmScraper::FetchError
        raise
      rescue => e
        raise LlmScraper::FetchError, "SPA fetch failed for #{url}: #{e.message}"
      end

      # Strips boilerplate, collapses whitespace — reduces ~50k tokens → ~5k
      # (author's estimate). Removes script/style/nav/footer/header elements
      # and anything carrying an aria-hidden attribute, then returns the
      # body's text with whitespace runs squeezed to a single space.
      def clean_html(html)
        doc = Nokogiri::HTML(html)
        doc.css("script, style, nav, footer, header, [aria-hidden]").remove
        doc.css("body").inner_text.gsub(/\s+/, " ").strip
      end
    end
  end
end
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module LlmScraper
  module ContentFetchers
    # Fetcher backed by the ScrapeGraphAI "markdownify" endpoint.
    class Markdownify < Base
      BASE_URL = "https://api.scrapegraphai.com"

      # @param config [LlmScraper::Configuration] source of `markdownify_api_key`
      def initialize(config = LlmScraper.configuration)
        @config = config
        @conn = build_connection(base_url: BASE_URL, timeout: 60)
      end

      # POSTs `{ website_url: }` to /v1/markdownify, authenticating with the
      # SGAI-APIKEY header, and returns the "result" entry of the JSON reply.
      #
      # @param url [String]
      # @return [String] markdown content
      # @raise [LlmScraper::FetchError] on a non-success status, a transport
      #   error, an unparseable body, or a reply without "result"
      def fetch(url)
        reply = @conn.post("/v1/markdownify") do |request|
          request.headers["SGAI-APIKEY"] = @config.markdownify_api_key
          request.headers["Content-Type"] = "application/json"
          request.body = JSON.generate(website_url: url)
        end

        unless reply.success?
          raise LlmScraper::FetchError, "Markdownify error (#{reply.status}): #{reply.body}"
        end

        markdown = JSON.parse(reply.body)["result"]
        return markdown if markdown

        raise(LlmScraper::FetchError, "Markdownify returned no content for #{url}")
      rescue Faraday::Error => e
        raise LlmScraper::FetchError, "Markdownify fetch error: #{e.message}"
      rescue JSON::ParserError => e
        raise LlmScraper::FetchError, "Markdownify response parse error: #{e.message}"
      end
    end
  end
end
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module LlmScraper
  module LlmClients
    # Client for the Anthropic Messages endpoint.
    class Anthropic < Base
      BASE_URL = "https://api.anthropic.com"
      API_VERSION = "2023-06-01"

      # @param config [LlmScraper::Configuration] supplies api key, model and timeout
      def initialize(config = LlmScraper.configuration)
        @config = config
        @conn = build_connection(base_url: BASE_URL, timeout: config.llm_timeout)
      end

      # Sends the prompt as a single user message to v1/messages.
      #
      # @param prompt [String]
      # @return [Hash] { content:, tokens:, cost_usd: }
      # @raise [LlmScraper::LlmError] on transport errors, non-success
      #   statuses, or an unparseable response body
      def complete(prompt)
        reply = @conn.post("v1/messages") do |request|
          request.headers["x-api-key"] = @config.llm_api_key
          request.headers["anthropic-version"] = API_VERSION
          request.headers["Content-Type"] = "application/json"
          request.body = request_payload(prompt)
        end

        handle_response(reply)
      rescue Faraday::Error => e
        raise LlmScraper::LlmError, "Anthropic request failed: #{e.message}"
      end

      private

      # JSON body for the request: configured model, max_tokens of 1024,
      # and the prompt as the only message.
      def request_payload(prompt)
        JSON.generate(
          model: @config.llm_model,
          max_tokens: 1024,
          messages: [{ role: "user", content: prompt }]
        )
      end

      # Converts the HTTP reply into the common result hash. The text is read
      # from content[0].text and token counts from usage.input_tokens and
      # usage.output_tokens (missing counts become 0 through #to_i).
      def handle_response(response)
        unless response.success?
          raise LlmScraper::LlmError, "Anthropic API error #{response.status}: #{response.body}"
        end

        parsed = JSON.parse(response.body)
        text = parsed.dig("content", 0, "text")
        tokens_in = parsed.dig("usage", "input_tokens").to_i
        tokens_out = parsed.dig("usage", "output_tokens").to_i

        {
          content: text,
          tokens: { input: tokens_in, output: tokens_out },
          cost_usd: estimate_cost(@config.llm_model, tokens_in, tokens_out)
        }
      rescue JSON::ParserError => e
        raise LlmScraper::LlmError, "Failed to parse Anthropic response: #{e.message}"
      end
    end
  end
end
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module LlmScraper
  module LlmClients
    # Common parent of the LLM clients: holds the pricing table, the cost
    # estimator and the Faraday connection factory. Subclasses override #complete.
    class Base
      # Pricing in USD per 1M tokens (updated 2026-06)
      PRICING = {
        "deepseek-v4-flash" => { input: 0.14, output: 0.28 },
        "deepseek-v4-pro" => { input: 1.74, output: 3.48 },
        "kimi-k2.5" => { input: 0.60, output: 3.00 },
        "glm-4.7-flash" => { input: 0.0, output: 0.0 },
        "claude-haiku-4-5" => { input: 0.80, output: 4.00 },
        "claude-haiku-4-5-20251001" => { input: 0.80, output: 4.00 },
        "gemini-2.5-flash" => { input: 0.15, output: 0.60 },
      }.freeze

      # Sends a prompt to the model. Abstract here.
      #
      # @param prompt [String]
      # @return [Hash] { content: String, tokens: { input: Integer, output: Integer }, cost_usd: Float }
      # @raise [LlmScraper::LlmError] in concrete clients
      # @raise [NotImplementedError] always, when called on a class that did not override it
      def complete(prompt)
        raise NotImplementedError, "#{self.class}#complete not implemented"
      end

      private

      # Estimated USD cost of one call. Models missing from PRICING are
      # costed at zero.
      #
      # @param model [String] key into PRICING
      # @param input_tokens [Integer]
      # @param output_tokens [Integer]
      # @return [Float]
      def estimate_cost(model, input_tokens, output_tokens)
        rates = PRICING.fetch(model) { { input: 0.0, output: 0.0 } }
        input_cost = input_tokens * rates[:input]
        output_cost = output_tokens * rates[:output]
        (input_cost + output_cost) / 1_000_000.0
      end

      # Builds a Faraday connection with the :retry request middleware
      # (max: 2, interval: 1, backoff_factor: 2, retrying on statuses
      # 429/500/502/503/504), the given read timeout and an open timeout of 10.
      #
      # @param base_url [String]
      # @param timeout [Integer] value assigned to `options.timeout`
      # @return [Faraday::Connection]
      def build_connection(base_url:, timeout: 30)
        # Trailing slash required so relative paths (e.g. "chat/completions")
        # are appended correctly when base_url itself has a path component
        normalized = base_url.end_with?("/") ? base_url : "#{base_url}/"

        Faraday.new(url: normalized) do |faraday|
          faraday.request(
            :retry,
            max: 2, interval: 1, backoff_factor: 2,
            retry_statuses: [429, 500, 502, 503, 504]
          )
          faraday.options.timeout = timeout
          faraday.options.open_timeout = 10
          faraday.adapter(Faraday.default_adapter)
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module LlmScraper
  module LlmClients
    # Client for any API that exposes an OpenAI-style chat/completions
    # endpoint under the configured base URL.
    class OpenaiCompatible < Base
      # @param config [LlmScraper::Configuration] supplies base URL, api key, model and timeout
      def initialize(config = LlmScraper.configuration)
        @config = config
        @conn = build_connection(base_url: config.llm_base_url, timeout: config.llm_timeout)
      end

      # Sends the prompt as a single user message with temperature 0 and a
      # json_object response format.
      #
      # @param prompt [String]
      # @return [Hash] { content:, tokens:, cost_usd: }
      # @raise [LlmScraper::LlmError] on transport errors, non-success
      #   statuses, or an unparseable response body
      def complete(prompt)
        reply = @conn.post("chat/completions") do |request|
          request.headers["Authorization"] = "Bearer #{@config.llm_api_key}"
          request.headers["Content-Type"] = "application/json"
          request.body = request_payload(prompt)
        end

        handle_response(reply)
      rescue Faraday::Error => e
        raise LlmScraper::LlmError, "LLM request failed: #{e.message}"
      end

      private

      # JSON body for the chat/completions request.
      def request_payload(prompt)
        JSON.generate(
          model: @config.llm_model,
          messages: [{ role: "user", content: prompt }],
          temperature: 0,
          response_format: { type: "json_object" }
        )
      end

      # Converts the HTTP reply into the common result hash. The text is read
      # from choices[0].message.content and token counts from
      # usage.prompt_tokens and usage.completion_tokens (missing counts
      # become 0 through #to_i).
      def handle_response(response)
        unless response.success?
          raise LlmScraper::LlmError, "LLM API error #{response.status}: #{response.body}"
        end

        parsed = JSON.parse(response.body)
        text = parsed.dig("choices", 0, "message", "content")
        tokens_in = parsed.dig("usage", "prompt_tokens").to_i
        tokens_out = parsed.dig("usage", "completion_tokens").to_i

        {
          content: text,
          tokens: { input: tokens_in, output: tokens_out },
          cost_usd: estimate_cost(@config.llm_model, tokens_in, tokens_out)
        }
      rescue JSON::ParserError => e
        raise LlmScraper::LlmError, "Failed to parse LLM response: #{e.message}"
      end
    end
  end
end
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module LlmScraper
  # Renders the extraction prompt sent to the LLM: a fixed instruction
  # header, one entry per schema field, the output rules, and the content.
  class PromptBuilder
    # Convenience wrapper around `new(schema).build(content)`.
    #
    # @param schema [Schema]
    # @param content [String]
    # @return [String]
    def self.build(schema, content)
      new(schema).build(content)
    end

    # @param schema [Schema] must respond to #fields, yielding name/field pairs
    def initialize(schema)
      @schema = schema
    end

    # @param content [String] page text/markdown to extract from
    # @return [String] the complete prompt
    def build(content)
      <<~PROMPT
        Extract the following fields from the content below.
        Return ONLY a valid JSON object. No markdown fences. No explanation.

        Fields:
        #{render_fields}
        Rules:
        - Missing field → null (never omit the key)
        - Return nothing except the JSON object

        Content:
        #{content}
      PROMPT
    end

    private

    # One rendered entry per field, newline-separated.
    def render_fields
      entries = @schema.fields.map do |name, field|
        render_field(name, field)
      end
      entries.join("\n")
    end

    # Fields that carry instructions get the multiline form, the rest a single line.
    def render_field(name, field)
      field.has_instructions? ? render_detailed(name, field) : render_inline(name, field)
    end

    # Simple fields: single line. Falls back to the field name when no
    # `what` text is set.
    def render_inline(name, field)
      "- #{name} (#{build_type_label(field)}): #{field.what || name.to_s}"
    end

    # Fields with how/examples/enum: multiline block. Each optional line is
    # emitted only when the corresponding attribute is set.
    def render_detailed(name, field)
      [
        "- #{name} (#{build_type_label(field)}):",
        (" Field: #{field.what}" if field.what),
        (" Instructions: #{field.how}" if field.how),
        (" Examples: #{field.examples.join(", ")}" if field.examples),
        (" Allowed values: #{field.enum.join(", ")}" if field.enum)
      ].compact.join("\n")
    end

    # Human-readable type: "array of <items>" / "array" / the type name,
    # with ", required" appended for required fields.
    def build_type_label(field)
      label =
        if field.type != :array
          field.type.to_s
        elsif field.items
          "array of #{field.items}"
        else
          "array"
        end

      return label unless field.required

      "#{label}, required"
    end
  end
end
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module LlmScraper
  # Turns the raw text returned by the LLM into a Hash keyed by schema field
  # name, coercing each value to the field's declared type and enforcing
  # required fields.
  class ResponseParser
    # Convenience wrapper around `new(schema).parse(content)`.
    #
    # @param content [String] raw LLM response
    # @param schema [Schema]
    # @return [Hash]
    # @raise [LlmScraper::ParseError]
    def self.parse(content, schema)
      new(schema).parse(content)
    end

    # @param schema [Schema] must respond to #fields, yielding name/field pairs
    def initialize(schema)
      @schema = schema
    end

    # @param content [String, nil] raw LLM response; nil is treated as empty
    # @return [Hash] one entry per schema field
    # @raise [LlmScraper::ParseError] when the response is not valid JSON, is
    #   not a JSON object, or a required field ends up nil
    def parse(content)
      data = JSON.parse(strip_fences(content))

      # A bare array/string/number/null is valid JSON but cannot be mapped
      # onto named fields; report it as a parse failure instead of letting a
      # TypeError/NoMethodError escape from #coerce.
      unless data.is_a?(Hash)
        raise LlmScraper::ParseError, "Expected a JSON object from LLM, got #{data.class}\nRaw: #{content}"
      end

      coerced = coerce(data)
      validate_required!(coerced)
      coerced
    rescue JSON::ParserError => e
      raise LlmScraper::ParseError, "Invalid JSON from LLM: #{e.message}\nRaw: #{content}"
    end

    private

    # Strip markdown fences in case LLM ignores the instruction.
    # `to_s` makes a nil response an empty string, which JSON.parse then
    # rejects with a ParserError that #parse converts to ParseError.
    def strip_fences(content)
      content
        .to_s
        .gsub(/\A```(?:json)?\s*/i, "")
        .gsub(/\s*```\z/, "")
        .strip
    end

    # Builds the result hash: looks each field up by its string name and
    # substitutes the field default when the key is absent or null.
    def coerce(data)
      @schema.fields.each_with_object({}) do |(name, field), result|
        raw = data[name.to_s]
        result[name] = raw.nil? ? field.default : coerce_value(raw, field)
      end
    end

    # Converts one raw JSON value to the field's declared type.
    def coerce_value(value, field)
      return nil if value.nil?

      case field.type
      when :number then coerce_number(value)
      when :boolean then coerce_boolean(value)
      when :array then Array(value)
      # Nested objects are passed through untouched; stringifying them would
      # discard their structure.
      when :object then value
      else value.to_s
      end
    end

    # Strips currency symbols and thousands separators: "¥150,000" → 150000.
    # Returns nil when the value contains no digit at all (e.g. "-" or "N/A").
    def coerce_number(value)
      return value if value.is_a?(Numeric)

      cleaned = value.to_s.gsub(/[^\d.\-]/, "")
      return nil unless cleaned.match?(/\d/)

      cleaned.include?(".") ? cleaned.to_f : cleaned.to_i
    end

    # true/false pass through; otherwise "true", "yes" and "1"
    # (case-insensitive) are true and everything else is false.
    def coerce_boolean(value)
      return value if value == true || value == false

      %w[true yes 1].include?(value.to_s.downcase)
    end

    # @raise [LlmScraper::ParseError] for the first required field whose value is nil
    def validate_required!(data)
      @schema.fields.each do |name, field|
        next unless field.required
        next unless data[name].nil?

        raise LlmScraper::ParseError, "Required field '#{name}' is missing or null in LLM response"
      end
    end
  end
end
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module LlmScraper
  # Outcome of one scrape/extract call, built with keyword arguments.
  # Members that are not supplied are nil.
  Result = Struct.new(
    :data,        # Hash — extracted fields
    :success,     # Boolean
    :error,       # String | nil
    :url,         # String | nil
    :fetcher,     # Symbol — fetcher used
    :provider,    # Symbol — LLM provider used
    :model,       # String — model name
    :tokens_used, # Hash — { input: Integer, output: Integer }
    :cost_usd,    # Float — estimated cost
    :duration_ms, # Integer — total time in milliseconds
    keyword_init: true
  ) do
    # @return [Object] the stored success flag, unchanged
    def success?
      success
    end

    # @return [Boolean] true when the success flag is false or nil
    def failure?
      !success
    end
  end
end
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module LlmScraper
  # Ordered collection of field definitions describing what to extract.
  # Build one with the block DSL ({.define}) or from a plain Hash
  # ({.from_hash}).
  class Schema
    # @return [Hash{Symbol => Field}] fields in definition order
    attr_reader :fields

    # Evaluates the block in the context of a new schema, so `field` can be
    # called bare inside it. Without a block an empty schema is returned.
    #
    # @param block [Proc] DSL block
    # @return [Schema]
    def self.define(&block)
      schema = new
      schema.instance_eval(&block) if block
      schema
    end

    # Builds a schema from a Hash; field names and option keys may be
    # Strings or Symbols.
    #
    # @param hash [Hash] { field_name => { type:, description:, ... } }
    # @return [Schema]
    # @raise [LlmScraper::SchemaError] when an entry is not a Hash, has no
    #   :type, or names an unsupported type
    def self.from_hash(hash)
      schema = new
      hash.each do |name, opts|
        unless opts.is_a?(Hash)
          raise LlmScraper::SchemaError, "Field '#{name}': options must be a Hash, got #{opts.class}"
        end

        options = opts.transform_keys(&:to_sym)
        # Checked here so a malformed definition is reported as a schema
        # problem instead of a missing-keyword ArgumentError.
        raise LlmScraper::SchemaError, "Field '#{name}': missing required option :type" unless options.key?(:type)

        schema.field(name.to_sym, **options)
      end
      schema
    end

    def initialize
      @fields = {}
    end

    # Adds a field, replacing any earlier field with the same name.
    #
    # @param name [Symbol]
    # @param type [Symbol] :string | :number | :boolean | :array | :object
    # @param options [Hash] forwarded to {Field#initialize}
    # @return [Field] the stored field
    # @raise [LlmScraper::SchemaError] when the type is unsupported
    def field(name, type:, **options)
      @fields[name.to_sym] = Field.new(name: name.to_sym, type: type, **options)
    end

    # A single field definition: its name, type and optional guidance.
    class Field
      VALID_TYPES = %i[string number boolean array object].freeze

      attr_reader :name, :type, :what, :how, :examples, :enum,
                  :required, :default, :items

      # @param name [Symbol]
      # @param type [Symbol, String] one of VALID_TYPES
      # @param description [String, nil] used as `what` when `what` is not given
      # @param what [String, nil]
      # @param how [String, nil]
      # @param examples [Array, nil]
      # @param enum [Array, nil]
      # @param required [Boolean]
      # @param default [Object, nil]
      # @param items [Object, nil]
      # @param _rest [Hash] any other options are accepted and ignored
      # @raise [LlmScraper::SchemaError] when the type is unsupported
      def initialize(name:, type:, description: nil, what: nil, how: nil,
                     examples: nil, enum: nil, required: false,
                     default: nil, items: nil, **_rest)
        @name = name
        # nil and other values without #to_sym are kept as given so that
        # validate! rejects them with a SchemaError instead of this line
        # failing with a NoMethodError.
        @type = type.respond_to?(:to_sym) ? type.to_sym : type
        @what = what || description
        @how = how
        @examples = examples
        @enum = enum
        @required = required
        @default = default
        @items = items

        validate!
      end

      # True when field has extraction instructions beyond just a label
      def has_instructions?
        !@how.nil? || !@examples.nil? || !@enum.nil?
      end

      private

      def validate!
        return if VALID_TYPES.include?(@type)

        raise LlmScraper::SchemaError, "Field '#{@name}': unsupported type '#{@type}'"
      end
    end
  end
end
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module LlmScraper
  # Runs the extraction pipeline: fetch page content, build a prompt from
  # the schema, ask the configured LLM, and parse its answer into a Result.
  class Scraper
    # @param schema [Schema, Hash] a Hash is converted with Schema.from_hash
    # @param config [Configuration]
    # @raise [LlmScraper::SchemaError] when schema is neither a Schema nor a Hash
    def initialize(schema:, config: LlmScraper.configuration)
      @schema = normalize_schema(schema)
      @config = config
    end

    # @param url [String]
    # @param rescue_errors [Boolean] return error Result instead of raising
    # @return [Result] duration_ms is set on success and on rescued failure
    # @raise [LlmScraper::Error] when rescue_errors is false
    def scrape(url, rescue_errors: false)
      start = monotonic_now
      attach_timing(run_pipeline(url: url), start)
    rescue LlmScraper::Error => e
      raise unless rescue_errors

      failure = Result.new(success: false, error: e.message, url: url,
                           fetcher: @config.fetcher, provider: @config.llm_provider,
                           model: @config.llm_model)
      # Failed attempts are timed as well, so batch callers can see how long
      # each URL took regardless of outcome.
      attach_timing(failure, start)
    end

    # Extract from raw content — skips fetching step
    # @param content [String]
    # @return [Result]
    def extract(content)
      start = monotonic_now
      attach_timing(run_llm_pipeline(content: content), start)
    end

    # Scrapes each URL in order with rescue_errors enabled.
    #
    # @param urls [Array<String>]
    # @return [Array<Result>] LlmScraper::Error failures are captured in
    #   result.error instead of being raised
    def scrape_batch(urls)
      urls.map { |url| scrape(url, rescue_errors: true) }
    end

    # @param provider [Symbol] :openai_compatible | :anthropic
    # @return [Scraper] new instance with swapped LLM provider
    def with_provider(provider)
      self.class.new(schema: @schema, config: clone_config(llm_provider: provider))
    end

    # @param fetcher [Symbol] :jina | :firecrawl | :markdownify | :local
    # @return [Scraper] new instance with swapped fetcher
    def with_fetcher(fetcher)
      self.class.new(schema: @schema, config: clone_config(fetcher: fetcher))
    end

    private

    # Fetches the page, runs the LLM stage, then records url and fetcher.
    def run_pipeline(url:)
      content = build_fetcher.fetch(url)
      result = run_llm_pipeline(content: content)
      result.url = url
      result.fetcher = @config.fetcher
      result
    end

    # Builds the prompt, calls the LLM and parses the answer.
    #
    # NOTE(review): tokens_used and cost_usd are taken from the first
    # completion only; when parse_with_retry issues a second request, that
    # request's usage is not included. Confirm whether it should be added.
    def run_llm_pipeline(content:)
      client = build_llm_client
      prompt = PromptBuilder.build(@schema, content)
      llm_result = client.complete(prompt)
      data = parse_with_retry(llm_result[:content], client, prompt)

      Result.new(
        data: data,
        success: true,
        provider: @config.llm_provider,
        model: @config.llm_model,
        tokens_used: llm_result[:tokens],
        cost_usd: llm_result[:cost_usd]
      )
    end

    # Retry once with a stricter prompt on JSON parse failure.
    # A ParseError from the second attempt propagates to the caller.
    def parse_with_retry(content, client, original_prompt)
      ResponseParser.parse(content, @schema)
    rescue LlmScraper::ParseError
      retry_prompt = original_prompt +
                     "\n\nCRITICAL: Return ONLY the JSON object. Example output: {\"field\": \"value\"}"
      llm_result = client.complete(retry_prompt)
      ResponseParser.parse(llm_result[:content], @schema)
    end

    # @raise [ConfigurationError] for an unrecognised fetcher name
    def build_fetcher
      case @config.fetcher
      when :jina then ContentFetchers::Jina.new(@config)
      when :firecrawl then ContentFetchers::Firecrawl.new(@config)
      when :markdownify then ContentFetchers::Markdownify.new(@config)
      when :local then ContentFetchers::Local.new(@config)
      else raise ConfigurationError, "Unknown fetcher: #{@config.fetcher.inspect}"
      end
    end

    # @raise [ConfigurationError] for an unrecognised provider name
    def build_llm_client
      case @config.llm_provider
      when :openai_compatible then LlmClients::OpenaiCompatible.new(@config)
      when :anthropic then LlmClients::Anthropic.new(@config)
      else raise ConfigurationError, "Unknown LLM provider: #{@config.llm_provider.inspect}"
      end
    end

    def normalize_schema(schema)
      case schema
      when Schema then schema
      when Hash then Schema.from_hash(schema)
      else raise SchemaError, "schema must be a Hash or LlmScraper::Schema instance"
      end
    end

    # Copies the config with #dup and applies each override through its
    # writer, leaving the original config object untouched.
    def clone_config(**overrides)
      new_config = @config.dup
      overrides.each { |key, val| new_config.public_send(:"#{key}=", val) }
      new_config
    end

    # Stores the elapsed time since start, rounded to whole milliseconds.
    def attach_timing(result, start)
      result.duration_ms = ((monotonic_now - start) * 1000).round
      result
    end

    # Monotonic clock reading in seconds; unaffected by wall-clock changes.
    def monotonic_now
      Process.clock_gettime(Process::CLOCK_MONOTONIC)
    end
  end
end
|
data/lib/llm_scraper.rb
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "zeitwerk"
|
|
4
|
+
require "faraday"
|
|
5
|
+
require "faraday/retry"
|
|
6
|
+
require "nokogiri"
|
|
7
|
+
require "json"
|
|
8
|
+
|
|
9
|
+
loader = Zeitwerk::Loader.for_gem
|
|
10
|
+
loader.setup
|
|
11
|
+
|
|
12
|
+
module LlmScraper
  # Base class of every error defined by this gem; rescue it to catch them all.
  class Error < StandardError
  end

  class FetchError < Error
  end

  class LlmError < Error
  end

  class ParseError < Error
  end

  class SchemaError < Error
  end

  class ConfigurationError < Error
  end

  class << self
    # Yields the shared configuration object to the caller's block.
    #
    # @yieldparam config [Configuration]
    # @return [Object] whatever the block returns
    def configure
      yield configuration
    end

    # @return [Configuration] the shared instance, created on first access
    def configuration
      @configuration ||= Configuration.new
    end

    # Replaces the shared configuration with a fresh instance.
    #
    # @return [Configuration] the new instance
    def reset_configuration!
      @configuration = Configuration.new
    end
  end
end
|
data/sig/llm_scraper.rbs
ADDED
metadata
ADDED
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: llm_scraper
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.1.0
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- cuongnc0211
|
|
8
|
+
bindir: exe
|
|
9
|
+
cert_chain: []
|
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
|
11
|
+
dependencies:
|
|
12
|
+
- !ruby/object:Gem::Dependency
|
|
13
|
+
name: faraday
|
|
14
|
+
requirement: !ruby/object:Gem::Requirement
|
|
15
|
+
requirements:
|
|
16
|
+
- - ">="
|
|
17
|
+
- !ruby/object:Gem::Version
|
|
18
|
+
version: '1.10'
|
|
19
|
+
type: :runtime
|
|
20
|
+
prerelease: false
|
|
21
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
22
|
+
requirements:
|
|
23
|
+
- - ">="
|
|
24
|
+
- !ruby/object:Gem::Version
|
|
25
|
+
version: '1.10'
|
|
26
|
+
- !ruby/object:Gem::Dependency
|
|
27
|
+
name: faraday-retry
|
|
28
|
+
requirement: !ruby/object:Gem::Requirement
|
|
29
|
+
requirements:
|
|
30
|
+
- - ">="
|
|
31
|
+
- !ruby/object:Gem::Version
|
|
32
|
+
version: '2.0'
|
|
33
|
+
type: :runtime
|
|
34
|
+
prerelease: false
|
|
35
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
36
|
+
requirements:
|
|
37
|
+
- - ">="
|
|
38
|
+
- !ruby/object:Gem::Version
|
|
39
|
+
version: '2.0'
|
|
40
|
+
- !ruby/object:Gem::Dependency
|
|
41
|
+
name: nokogiri
|
|
42
|
+
requirement: !ruby/object:Gem::Requirement
|
|
43
|
+
requirements:
|
|
44
|
+
- - ">="
|
|
45
|
+
- !ruby/object:Gem::Version
|
|
46
|
+
version: '1.15'
|
|
47
|
+
type: :runtime
|
|
48
|
+
prerelease: false
|
|
49
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
50
|
+
requirements:
|
|
51
|
+
- - ">="
|
|
52
|
+
- !ruby/object:Gem::Version
|
|
53
|
+
version: '1.15'
|
|
54
|
+
- !ruby/object:Gem::Dependency
|
|
55
|
+
name: zeitwerk
|
|
56
|
+
requirement: !ruby/object:Gem::Requirement
|
|
57
|
+
requirements:
|
|
58
|
+
- - ">="
|
|
59
|
+
- !ruby/object:Gem::Version
|
|
60
|
+
version: '2.6'
|
|
61
|
+
type: :runtime
|
|
62
|
+
prerelease: false
|
|
63
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
64
|
+
requirements:
|
|
65
|
+
- - ">="
|
|
66
|
+
- !ruby/object:Gem::Version
|
|
67
|
+
version: '2.6'
|
|
68
|
+
- !ruby/object:Gem::Dependency
|
|
69
|
+
name: rspec
|
|
70
|
+
requirement: !ruby/object:Gem::Requirement
|
|
71
|
+
requirements:
|
|
72
|
+
- - "~>"
|
|
73
|
+
- !ruby/object:Gem::Version
|
|
74
|
+
version: '3.0'
|
|
75
|
+
type: :development
|
|
76
|
+
prerelease: false
|
|
77
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
78
|
+
requirements:
|
|
79
|
+
- - "~>"
|
|
80
|
+
- !ruby/object:Gem::Version
|
|
81
|
+
version: '3.0'
|
|
82
|
+
- !ruby/object:Gem::Dependency
|
|
83
|
+
name: vcr
|
|
84
|
+
requirement: !ruby/object:Gem::Requirement
|
|
85
|
+
requirements:
|
|
86
|
+
- - "~>"
|
|
87
|
+
- !ruby/object:Gem::Version
|
|
88
|
+
version: '6.0'
|
|
89
|
+
type: :development
|
|
90
|
+
prerelease: false
|
|
91
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
92
|
+
requirements:
|
|
93
|
+
- - "~>"
|
|
94
|
+
- !ruby/object:Gem::Version
|
|
95
|
+
version: '6.0'
|
|
96
|
+
- !ruby/object:Gem::Dependency
|
|
97
|
+
name: webmock
|
|
98
|
+
requirement: !ruby/object:Gem::Requirement
|
|
99
|
+
requirements:
|
|
100
|
+
- - "~>"
|
|
101
|
+
- !ruby/object:Gem::Version
|
|
102
|
+
version: '3.0'
|
|
103
|
+
type: :development
|
|
104
|
+
prerelease: false
|
|
105
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
106
|
+
requirements:
|
|
107
|
+
- - "~>"
|
|
108
|
+
- !ruby/object:Gem::Version
|
|
109
|
+
version: '3.0'
|
|
110
|
+
- !ruby/object:Gem::Dependency
|
|
111
|
+
name: rubocop
|
|
112
|
+
requirement: !ruby/object:Gem::Requirement
|
|
113
|
+
requirements:
|
|
114
|
+
- - "~>"
|
|
115
|
+
- !ruby/object:Gem::Version
|
|
116
|
+
version: '1.0'
|
|
117
|
+
type: :development
|
|
118
|
+
prerelease: false
|
|
119
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
120
|
+
requirements:
|
|
121
|
+
- - "~>"
|
|
122
|
+
- !ruby/object:Gem::Version
|
|
123
|
+
version: '1.0'
|
|
124
|
+
description: 'Pipeline: URL → ContentFetcher (Markdown) → LlmClient (JSON). Supports
|
|
125
|
+
Jina/Firecrawl fetchers and OpenAI-compatible/Anthropic LLM providers.'
|
|
126
|
+
email:
|
|
127
|
+
- cuongnguyenfu@gmail.com
|
|
128
|
+
executables: []
|
|
129
|
+
extensions: []
|
|
130
|
+
extra_rdoc_files: []
|
|
131
|
+
files:
|
|
132
|
+
- ".env.example"
|
|
133
|
+
- CHANGELOG.md
|
|
134
|
+
- CODE_OF_CONDUCT.md
|
|
135
|
+
- LICENSE.txt
|
|
136
|
+
- README.md
|
|
137
|
+
- Rakefile
|
|
138
|
+
- lib/llm_scraper.rb
|
|
139
|
+
- lib/llm_scraper/configuration.rb
|
|
140
|
+
- lib/llm_scraper/content_fetchers/base.rb
|
|
141
|
+
- lib/llm_scraper/content_fetchers/firecrawl.rb
|
|
142
|
+
- lib/llm_scraper/content_fetchers/jina.rb
|
|
143
|
+
- lib/llm_scraper/content_fetchers/local.rb
|
|
144
|
+
- lib/llm_scraper/content_fetchers/markdownify.rb
|
|
145
|
+
- lib/llm_scraper/llm_clients/anthropic.rb
|
|
146
|
+
- lib/llm_scraper/llm_clients/base.rb
|
|
147
|
+
- lib/llm_scraper/llm_clients/openai_compatible.rb
|
|
148
|
+
- lib/llm_scraper/prompt_builder.rb
|
|
149
|
+
- lib/llm_scraper/response_parser.rb
|
|
150
|
+
- lib/llm_scraper/result.rb
|
|
151
|
+
- lib/llm_scraper/schema.rb
|
|
152
|
+
- lib/llm_scraper/scraper.rb
|
|
153
|
+
- lib/llm_scraper/version.rb
|
|
154
|
+
- sig/llm_scraper.rbs
|
|
155
|
+
homepage: https://github.com/cuongnc0211/llm_scraper
|
|
156
|
+
licenses:
|
|
157
|
+
- MIT
|
|
158
|
+
metadata:
|
|
159
|
+
homepage_uri: https://github.com/cuongnc0211/llm_scraper
|
|
160
|
+
source_code_uri: https://github.com/cuongnc0211/llm_scraper
|
|
161
|
+
rubygems_mfa_required: 'true'
|
|
162
|
+
rdoc_options: []
|
|
163
|
+
require_paths:
|
|
164
|
+
- lib
|
|
165
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
166
|
+
requirements:
|
|
167
|
+
- - ">="
|
|
168
|
+
- !ruby/object:Gem::Version
|
|
169
|
+
version: 3.1.0
|
|
170
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
171
|
+
requirements:
|
|
172
|
+
- - ">="
|
|
173
|
+
- !ruby/object:Gem::Version
|
|
174
|
+
version: '0'
|
|
175
|
+
requirements: []
|
|
176
|
+
rubygems_version: 4.0.3
|
|
177
|
+
specification_version: 4
|
|
178
|
+
summary: Extract structured JSON from web pages using LLMs
|
|
179
|
+
test_files: []
|