nukitori 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +6 -13
- data/lib/nukitori/version.rb +1 -1
- data/lib/nukitori.rb +0 -6
- metadata +7 -2
- data/lib/nukitori/models.json +0 -7428
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 37519a4a86c49b4904f08d4c6d1b91109ce00df94523c5cccb31bcc5ac8d55b5
|
|
4
|
+
data.tar.gz: 544da85b164aade937654ed702b49f6aa49b193d672d4108dc8952b25c1b3514
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 64a804009ff6786aee759fd3ca16dee56f60da253a0438c6fceac1159680aa47c07917ef12d8bc49fcf767dab12ef25c4db491f04c4b9ea487010f35021c3193
|
|
7
|
+
data.tar.gz: ff17ddd62a07c37c8b04b3b09bed0f1ace7b98bbc48feb02bcac8a10d7acb4dd47a5dc71db6a2584b7d9cc7de4eb6bd702f591b9e1984137e07a606812c1b232
|
data/README.md
CHANGED
|
@@ -8,17 +8,16 @@ Nukitori is a Ruby gem for HTML data extraction that uses an LLM once to generat
|
|
|
8
8
|
- **Robust reusable schemas** — avoids page-specific IDs, dynamic hashes, and fragile selectors
|
|
9
9
|
- **Transparent output** — generated schemas are plain JSON, easy to inspect, diff, and version
|
|
10
10
|
- **Token-optimized** — strips scripts, styles, and redundant DOM before sending HTML to the LLM
|
|
11
|
-
- **Any LLM provider** — works with OpenAI, Anthropic, Gemini, and local models
|
|
12
|
-
|
|
13
|
-
Define what you want to extract from HTML using a simple schema DSL:
|
|
11
|
+
- **Any LLM provider** — works with OpenAI, Anthropic, Gemini, and local models:
|
|
14
12
|
|
|
15
13
|
```ruby
|
|
16
|
-
# github_extract.rb
|
|
14
|
+
# example_extract.rb
|
|
17
15
|
require 'nukitori'
|
|
18
16
|
require 'json'
|
|
19
17
|
|
|
20
18
|
html = "<HTML DOM from https://github.com/search?q=ruby+web+scraping&type=repositories>"
|
|
21
19
|
|
|
20
|
+
# define what you want to extract from HTML using simple DSL:
|
|
22
21
|
data = Nukitori(html, 'schema.json') do
|
|
23
22
|
integer :repositories_found_count
|
|
24
23
|
array :repositories do
|
|
@@ -35,12 +34,10 @@ end
|
|
|
35
34
|
File.write('results.json', JSON.pretty_generate(data))
|
|
36
35
|
```
|
|
37
36
|
|
|
38
|
-
On the first run `$ ruby github_extract.rb` Nukitori uses AI to generate a reusable XPath extraction schema:
|
|
39
|
-
|
|
40
|
-
<details>
|
|
41
|
-
<summary><code>schema.json</code> (click to expand)</summary><br>
|
|
37
|
+
On the first run `$ ruby example_extract.rb` Nukitori uses AI to generate a reusable XPath extraction schema:
|
|
42
38
|
|
|
43
39
|
```json
|
|
40
|
+
/* schema.json */
|
|
44
41
|
{
|
|
45
42
|
"repositories_found_count": {
|
|
46
43
|
"xpath": "//a[@data-testid='nav-item-repositories']//span[@data-testid='resolved-count-label']",
|
|
@@ -78,14 +75,11 @@ On the first run `$ ruby github_extract.rb` Nukitori uses AI to generate a reusa
|
|
|
78
75
|
}
|
|
79
76
|
}
|
|
80
77
|
```
|
|
81
|
-
</details>
|
|
82
78
|
|
|
83
79
|
After that, Nukitori extracts structured data from similar HTMLs without any LLM calls, in milliseconds:
|
|
84
80
|
|
|
85
|
-
<details>
|
|
86
|
-
<summary><code>results.json</code> (click to expand)</summary><br>
|
|
87
|
-
|
|
88
81
|
```json
|
|
82
|
+
/* results.json */
|
|
89
83
|
{
|
|
90
84
|
"repositories_found_count": 314,
|
|
91
85
|
"repositories": [
|
|
@@ -114,7 +108,6 @@ After that, Nukitori extracts structured data from similar HTMLs without any LLM
|
|
|
114
108
|
]
|
|
115
109
|
}
|
|
116
110
|
```
|
|
117
|
-
</details>
|
|
118
111
|
|
|
119
112
|
## Installation
|
|
120
113
|
|
data/lib/nukitori/version.rb
CHANGED
data/lib/nukitori.rb
CHANGED
|
@@ -14,12 +14,8 @@ require_relative 'nukitori/schema_extractor'
|
|
|
14
14
|
require_relative 'nukitori/llm_extractor'
|
|
15
15
|
|
|
16
16
|
module Nukitori
|
|
17
|
-
# Path to bundled models.json with up-to-date model definitions
|
|
18
|
-
MODELS_JSON = File.expand_path('nukitori/models.json', __dir__)
|
|
19
17
|
class << self
|
|
20
18
|
# Configure RubyLLM through Nukitori
|
|
21
|
-
# Automatically uses bundled models.json with latest model definitions
|
|
22
|
-
#
|
|
23
19
|
# @example
|
|
24
20
|
# Nukitori.configure do |config|
|
|
25
21
|
# config.default_model = 'gpt-5.2'
|
|
@@ -28,8 +24,6 @@ module Nukitori
|
|
|
28
24
|
#
|
|
29
25
|
def configure
|
|
30
26
|
RubyLLM.configure do |config|
|
|
31
|
-
# Use bundled models.json with up-to-date model definitions
|
|
32
|
-
config.model_registry_file = MODELS_JSON
|
|
33
27
|
yield config if block_given?
|
|
34
28
|
end
|
|
35
29
|
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: nukitori
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.0
|
|
4
|
+
version: 0.1.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Victor Afanasev
|
|
@@ -30,6 +30,9 @@ dependencies:
|
|
|
30
30
|
- - "~>"
|
|
31
31
|
- !ruby/object:Gem::Version
|
|
32
32
|
version: '1.9'
|
|
33
|
+
- - ">="
|
|
34
|
+
- !ruby/object:Gem::Version
|
|
35
|
+
version: 1.9.2
|
|
33
36
|
type: :runtime
|
|
34
37
|
prerelease: false
|
|
35
38
|
version_requirements: !ruby/object:Gem::Requirement
|
|
@@ -37,6 +40,9 @@ dependencies:
|
|
|
37
40
|
- - "~>"
|
|
38
41
|
- !ruby/object:Gem::Version
|
|
39
42
|
version: '1.9'
|
|
43
|
+
- - ">="
|
|
44
|
+
- !ruby/object:Gem::Version
|
|
45
|
+
version: 1.9.2
|
|
40
46
|
description: Nukitori is a Ruby gem for HTML data extraction. It uses an LLM once
|
|
41
47
|
to generate reusable XPath schemas, then extracts structured data from similarly
|
|
42
48
|
structured pages using plain Nokogiri. This makes scraping fast, predictable, and
|
|
@@ -55,7 +61,6 @@ files:
|
|
|
55
61
|
- lib/nukitori/chat_factory.rb
|
|
56
62
|
- lib/nukitori/html_preprocessor.rb
|
|
57
63
|
- lib/nukitori/llm_extractor.rb
|
|
58
|
-
- lib/nukitori/models.json
|
|
59
64
|
- lib/nukitori/response_parser.rb
|
|
60
65
|
- lib/nukitori/schema_extractor.rb
|
|
61
66
|
- lib/nukitori/schema_generator.rb
|