nukitori 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,18 @@
1
# frozen_string_literal: true

require 'json'

module Nukitori
  # Parses LLM response content (handles both Hash and markdown-wrapped JSON).
  class ResponseParser
    # Opening markdown fence with an optional language tag (```json, ```JSON,
    # ```ruby, ...) — matched case-tolerantly so any tagged fence is stripped,
    # not just lowercase "json".
    OPENING_FENCE = /\A```[a-zA-Z]*\s*/
    # Closing fence at end of string, with any preceding whitespace.
    CLOSING_FENCE = /\s*```\z/

    # @param content [Hash, String, #to_s] Response content from LLM
    # @return [Object] the Hash unchanged, or the result of JSON.parse on the
    #   unfenced text (usually a Hash; an Array for top-level JSON arrays)
    # @raise [JSON::ParserError] when the unwrapped text is not valid JSON
    def self.parse(content)
      return content if content.is_a?(Hash)

      # Anything non-Hash is treated as text (to_s is a no-op for Strings).
      text = content.to_s.strip
      text = text.sub(OPENING_FENCE, '').sub(CLOSING_FENCE, '')
      JSON.parse(text)
    end
  end
end
@@ -0,0 +1,127 @@
1
# frozen_string_literal: true

module Nukitori
  # Applies an XPath schema (as produced by SchemaGenerator) to an HTML
  # document and returns the extracted values as plain Hashes/Arrays.
  class SchemaExtractor
    attr_reader :schema

    # @param schema [Hash] XPath schema; Symbol keys are accepted and
    #   normalized to Strings
    def initialize(schema)
      @schema = deep_stringify_keys(schema)
    end

    # Extract data from HTML using the XPath schema.
    # @param html [String, Nokogiri::HTML::Document] HTML string or Nokogiri document
    # @return [Hash] Extracted data keyed by top-level schema field names
    def extract(html)
      doc = html.is_a?(Nokogiri::HTML::Document) ? html : Nokogiri::HTML(html)
      extract_fields(doc, schema)
    end

    private

    # Recursively convert Hash keys to Strings so lookups are uniform
    # regardless of how the schema was authored (JSON vs Ruby literals).
    def deep_stringify_keys(obj)
      case obj
      when Hash
        obj.each_with_object({}) do |(k, v), result|
          result[k.to_s] = deep_stringify_keys(v)
        end
      when Array
        obj.map { |v| deep_stringify_keys(v) }
      else
        obj
      end
    end

    # Extract every field of +fields+ relative to +context+ (document or node).
    def extract_fields(context, fields)
      result = {}
      fields.each do |field_name, field_def|
        result[field_name] = extract_field(context, field_def)
      end
      result
    end

    # Dispatch a single field definition by its declared 'type'.
    # A bare String is accepted as shorthand for a string-typed XPath,
    # i.e. "//h1" behaves like {'xpath' => '//h1', 'type' => 'string'}
    # (previously such definitions were silently dropped as nil).
    def extract_field(context, field_def)
      return extract_primitive(context, 'xpath' => field_def, 'type' => 'string') if field_def.is_a?(String)

      case field_def['type']
      when 'array'
        extract_array(context, field_def)
      when 'object'
        extract_object(context, field_def)
      else
        # Untyped definitions are treated as primitives when an xpath is
        # present; anything else yields nil.
        extract_primitive(context, field_def) if field_def['xpath']
      end
    end

    # Arrays select repeating containers via 'container_xpath', then extract
    # each item with the 'items' definition (primitive or nested field map).
    def extract_array(context, field_def)
      container_xpath = field_def['container_xpath']
      items_def = field_def['items']

      return [] unless container_xpath && items_def

      containers = context.xpath(container_xpath)

      # Simple array (strings) vs array of objects
      if items_def['xpath']
        containers.map { |c| extract_primitive(c, items_def) }
      else
        containers.map { |c| extract_fields(c, items_def) }
      end
    end

    # Objects optionally re-root the context via 'context_xpath', then extract
    # their 'properties'. Returns nil when the context node is absent or the
    # schema has no 'properties' (malformed schemas no longer raise
    # NoMethodError on nil).
    def extract_object(context, field_def)
      properties = field_def['properties']
      context_xpath = field_def['context_xpath']

      return nil unless properties

      if context_xpath
        context = context.at_xpath(context_xpath)
        return nil unless context
      end

      extract_fields(context, properties)
    end

    # Evaluate a primitive field's xpath and convert the raw text to the
    # declared type (defaults to 'string').
    def extract_primitive(context, field_def)
      xpath = field_def['xpath']
      type = field_def['type'] || 'string'

      return nil unless xpath

      result = context.xpath(xpath)
      raw_value = extract_raw_value(result)

      return nil if raw_value.nil?

      convert_to_type(raw_value, type)
    end

    # Reduce an XPath evaluation result to a stripped String, or nil when
    # nothing matched. Attribute nodes yield their value, elements their text.
    def extract_raw_value(xpath_result)
      return nil if xpath_result.nil?
      return nil if xpath_result.is_a?(Nokogiri::XML::NodeSet) && xpath_result.empty?

      value = if xpath_result.is_a?(Nokogiri::XML::NodeSet)
                node = xpath_result.first
                node.is_a?(Nokogiri::XML::Attr) ? node.value : node.text
              else
                # xpath() can return non-NodeSet results (e.g. XPath function
                # values); stringify before stripping.
                xpath_result.to_s
              end

      value.strip
    end

    # Coerce a raw String into the schema's declared type. Unknown types are
    # passed through unchanged.
    def convert_to_type(value, type)
      case type
      when 'string'
        # Collapse internal whitespace runs to single spaces.
        value.to_s.gsub(/\s+/, ' ').strip
      when 'integer'
        # Keep digits and minus sign; "$1,234" => 1234.
        value.gsub(/[^\d-]/, '').to_i
      when 'number', 'float'
        value.gsub(/[^\d.-]/, '').to_f
      when 'boolean'
        %w[true yes 1 on].include?(value.to_s.downcase)
      else
        value
      end
    end
  end
end
@@ -0,0 +1,147 @@
1
# frozen_string_literal: true

require 'json'

module Nukitori
  # Uses an LLM (via RubyLLM's chat interface) to turn a RubyLLM::Schema
  # definition block plus one sample HTML page into a reusable XPath
  # extraction schema (consumed by SchemaExtractor).
  class SchemaGenerator
    attr_reader :model

    # @param model [String, nil] LLM model to use
    # @param block [Proc] Schema definition block
    # @raise [ArgumentError] when no block is given
    #
    # @example
    #   generator = Nukitori::SchemaGenerator.new do
    #     array :repos do
    #       object do
    #         string :name
    #         string :url
    #       end
    #     end
    #   end
    #
    def initialize(model: nil, &block)
      raise ArgumentError, 'Block required for schema definition' unless block_given?

      @model = model
      @schema_block = block
    end

    # Create extraction schema for given HTML.
    # @param html [String, Nokogiri::HTML::Document] HTML string or Nokogiri document
    # @return [Hash] Generated extraction schema (parsed from the LLM's JSON reply)
    #
    # @example
    #   extraction_schema = generator.create_extraction_schema_for(html)
    #
    def create_extraction_schema_for(html)
      doc = html.is_a?(Nokogiri::HTML::Document) ? html : Nokogiri::HTML(html)
      # Requirements = JSON rendering of the user's RubyLLM::Schema block.
      requirements = build_requirements(&@schema_block)
      # NOTE(review): HtmlPreprocessor presumably trims the document for token
      # budget — behavior not visible from this file; confirm there.
      processed_html = HtmlPreprocessor.process(doc)
      normalized_requirements = normalize_requirements(requirements)
      prompt = build_prompt(normalized_requirements)

      chat = ChatFactory.create(model:)
      # Prompt goes in as system instructions; the HTML is the user message.
      chat.with_instructions(prompt)

      response = chat.ask(processed_html)
      # ResponseParser tolerates markdown-fenced JSON replies.
      ResponseParser.parse(response.content)
    end

    private

    # Build an anonymous RubyLLM::Schema subclass from the DSL block and
    # serialize it to a JSON string.
    def build_requirements(&block)
      schema_class = Class.new(RubyLLM::Schema, &block)
      schema_class.new.to_json
    end

    # Reduce the serialized schema to just its properties map — the only part
    # the prompt needs.
    def normalize_requirements(requirements)
      schema_json = JSON.parse(requirements)
      schema_json.dig('schema', 'properties')
    end

    # Render the system prompt, embedding the requirements as JSON.
    # The heredoc text is the LLM contract — edit with care.
    def build_prompt(requirements)
      <<~PROMPT
        You are an expert at analyzing HTML structure and generating XPath expressions.

        ## Task
        Analyze the provided HTML and generate an XPath schema that can extract data
        matching the requirements schema below. Return ONLY valid JSON, no other text.

        ## Requirements Schema (what to extract)
        ```json
        #{requirements.to_json}
        ```

        ## XPath Schema Format

        For each field in requirements, generate the corresponding XPath definition:

        1. **For primitive types** (string, integer, number, boolean):
           ```json
           {
             "field_name": {
               "xpath": "//div[@class='example']",
               "type": "string"
             }
           }
           ```

        2. **For arrays of objects**:
           ```json
           {
             "items_list": {
               "type": "array",
               "container_xpath": "//div[@class='item']",
               "items": {
                 "name": {"xpath": ".//h3", "type": "string"},
                 "price": {"xpath": ".//span[@class='price']", "type": "number"}
               }
             }
           }
           ```

        3. **For arrays of strings**:
           ```json
           {
             "tags": {
               "type": "array",
               "container_xpath": ".//a[@class='tag']",
               "items": {
                 "xpath": ".",
                 "type": "string"
               }
             }
           }
           ```

        ## XPath Rules

        - Use `container_xpath` to identify repeating elements for arrays
        - Use relative XPaths (starting with `.//` or `.`) for fields inside arrays
        - Do NOT use `/text()` - just select the element, we extract text automatically
        - Use `@attr` to extract attribute values (e.g., `@href`, `@src`), especially for schema attributes which ends at `link` or `url`
        - Prefer semantic attributes: `@data-testid`, `@role`, `@aria-label`
        - Prefer tag structure: `//article//h3/a` over class-based selectors

        ## CRITICAL: Avoid Page-Specific Values

        The XPath schema must work on ALL similar pages, not just this one. NEVER use:

        - **IDs with numbers/UUIDs**: `@id='product-price-19140'` or `@id='item-abc123'` — these are page-specific
        - **Dynamic function names**: `@x-data='initComponent_6956666703fbc()'` — these change per page load
        - **Hashed class names**: `Box-sc-62in7e-0`, `css-1a2b3c`, `styled-xyz123` — generated by CSS-in-JS
        - **Session/random tokens**: any attribute value that looks like a hash or random string

        Instead, use:
        - Class names that describe purpose: `@class='price'`, `@class='product-title'`
        - Structural patterns: `//div[@class='product']//span[contains(@class,'price')]`
        - Partial matches when needed: `contains(@id,'product-price')` instead of exact ID
        - Tag hierarchy: `//article//h3/a` — relies on structure, not dynamic values

        ## Output

        Return ONLY the JSON XPath schema. No explanations, no markdown code blocks.
      PROMPT
    end
  end
end
@@ -0,0 +1,5 @@
1
# frozen_string_literal: true

module Nukitori
  # Gem version, kept in its own file so the gemspec can load it standalone.
  VERSION = '0.1.0'
end
data/lib/nukitori.rb ADDED
@@ -0,0 +1,122 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'nokogiri'
4
+ require 'ruby_llm'
5
+ require 'ruby_llm/schema'
6
+ require 'json'
7
+
8
+ require_relative 'nukitori/version'
9
+ require_relative 'nukitori/response_parser'
10
+ require_relative 'nukitori/html_preprocessor'
11
+ require_relative 'nukitori/chat_factory'
12
+ require_relative 'nukitori/schema_generator'
13
+ require_relative 'nukitori/schema_extractor'
14
+ require_relative 'nukitori/llm_extractor'
15
+
16
module Nukitori
  # Absolute path to the bundled models.json; handed to RubyLLM so model
  # lookups work from the shipped registry file.
  MODELS_JSON = File.expand_path('nukitori/models.json', __dir__)

  class << self
    # Configure RubyLLM through Nukitori. The bundled models.json is always
    # wired in; the optional block then receives the RubyLLM config object.
    #
    # @example
    #   Nukitori.configure do |config|
    #     config.default_model = 'gpt-5.2'
    #     config.openai_api_key = ENV['OPENAI_API_KEY']
    #   end
    #
    def configure
      RubyLLM.configure do |rubyllm_config|
        rubyllm_config.model_registry_file = MODELS_JSON
        yield rubyllm_config if block_given?
      end
    end

    # Main entry point — also reachable as Nukitori(html, 'schema.json') { ... }.
    #
    # With a +schema_path+, the XPath schema is generated once by the LLM and
    # cached on disk; later calls reuse it via plain Nokogiri. Without one,
    # the LLM is consulted on every call (AI-only mode).
    #
    # @param html [String, Nokogiri::HTML::Document] HTML content or Nokogiri doc
    # @param schema_path [String, nil] Path to cache extraction schema (optional)
    # @param model [String, nil] LLM model to use (overrides default_model)
    # @param prefix [String, nil] Key under which the schema is stored when one
    #   file holds several schemas
    # @param block [Proc] Schema definition block (RubyLLM::Schema DSL)
    # @return [Hash] Extracted data
    # @raise [ArgumentError] when no block is given
    def call(html, schema_path = nil, model: nil, prefix: nil, &block)
      raise ArgumentError, 'Block required for schema definition' unless block_given?

      return LlmExtractor.extract(html, model:, &block) unless schema_path

      extract_with_schema(html, schema_path, model:, prefix:, &block)
    end

    private

    # XPath-based extraction backed by an on-disk schema cache: reuse the
    # cached schema when present, otherwise generate and persist it first.
    def extract_with_schema(html, schema_path, model: nil, prefix: nil, &block)
      doc = html.is_a?(Nokogiri::HTML::Document) ? html : Nokogiri::HTML(html)

      xpath_schema = read_cached_schema(schema_path, prefix) ||
                     generate_and_save_schema(doc, schema_path, model:, prefix:, &block)

      SchemaExtractor.new(xpath_schema).extract(doc)
    end

    # Load a previously saved schema from +path+, honoring +prefix+ when the
    # file holds several schemas. Returns nil when nothing usable is cached.
    def read_cached_schema(path, prefix)
      return nil unless File.exist?(path)

      stored = JSON.parse(File.read(path))
      prefix ? stored[prefix] : stored
    end

    # Ask the LLM for a fresh XPath schema, write it to +path+ (merged under
    # +prefix+ when given, preserving sibling schemas), and return it.
    def generate_and_save_schema(doc, path, model: nil, prefix: nil, &block)
      xpath_schema = SchemaGenerator.new(model:, &block).create_extraction_schema_for(doc)

      payload =
        if prefix
          previous = File.exist?(path) ? JSON.parse(File.read(path)) : {}
          previous.merge(prefix => xpath_schema)
        else
          xpath_schema
        end
      File.write(path, JSON.pretty_generate(payload))

      xpath_schema
    end
  end
end
118
+
119
# Global DSL shim so callers can write Nukitori(html, 'schema.json') { ... }
# instead of Nukitori.call(...). Pure delegation; see Nukitori.call for docs.
def Nukitori(html, schema_path = nil, model: nil, prefix: nil, &block)
  Nukitori.call(html, schema_path, model: model, prefix: prefix, &block)
end
data/sig/nukitori.rbs ADDED
@@ -0,0 +1,4 @@
1
module Nukitori
  # Gem version string; mirrors Nukitori::VERSION in lib/nukitori/version.rb.
  # See the writing guide of rbs: https://github.com/ruby/rbs#guides
  VERSION: String
end
metadata ADDED
@@ -0,0 +1,88 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: nukitori
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Victor Afanasev
8
+ bindir: exe
9
+ cert_chain: []
10
+ date: 1980-01-02 00:00:00.000000000 Z
11
+ dependencies:
12
+ - !ruby/object:Gem::Dependency
13
+ name: nokogiri
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - "~>"
17
+ - !ruby/object:Gem::Version
18
+ version: '1.19'
19
+ type: :runtime
20
+ prerelease: false
21
+ version_requirements: !ruby/object:Gem::Requirement
22
+ requirements:
23
+ - - "~>"
24
+ - !ruby/object:Gem::Version
25
+ version: '1.19'
26
+ - !ruby/object:Gem::Dependency
27
+ name: ruby_llm
28
+ requirement: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - "~>"
31
+ - !ruby/object:Gem::Version
32
+ version: '1.9'
33
+ type: :runtime
34
+ prerelease: false
35
+ version_requirements: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - "~>"
38
+ - !ruby/object:Gem::Version
39
+ version: '1.9'
40
+ description: Nukitori is a Ruby gem for HTML data extraction. It uses an LLM once
41
+ to generate reusable XPath schemas, then extracts structured data from similarly
42
+ structured pages using plain Nokogiri. This makes scraping fast, predictable, and
43
+ cheap for repeated runs.
44
+ email:
45
+ - vicfreefly@gmail.com
46
+ executables: []
47
+ extensions: []
48
+ extra_rdoc_files: []
49
+ files:
50
+ - CHANGELOG.md
51
+ - LICENSE.txt
52
+ - README.md
53
+ - Rakefile
54
+ - lib/nukitori.rb
55
+ - lib/nukitori/chat_factory.rb
56
+ - lib/nukitori/html_preprocessor.rb
57
+ - lib/nukitori/llm_extractor.rb
58
+ - lib/nukitori/models.json
59
+ - lib/nukitori/response_parser.rb
60
+ - lib/nukitori/schema_extractor.rb
61
+ - lib/nukitori/schema_generator.rb
62
+ - lib/nukitori/version.rb
63
+ - sig/nukitori.rbs
64
+ homepage: https://github.com/vifreefly/nukitori
65
+ licenses:
66
+ - MIT
67
+ metadata:
68
+ homepage_uri: https://github.com/vifreefly/nukitori
69
+ source_code_uri: https://github.com/vifreefly/nukitori
70
+ changelog_uri: https://github.com/vifreefly/nukitori/blob/main/CHANGELOG.md
71
+ rdoc_options: []
72
+ require_paths:
73
+ - lib
74
+ required_ruby_version: !ruby/object:Gem::Requirement
75
+ requirements:
76
+ - - ">="
77
+ - !ruby/object:Gem::Version
78
+ version: 3.2.0
79
+ required_rubygems_version: !ruby/object:Gem::Requirement
80
+ requirements:
81
+ - - ">="
82
+ - !ruby/object:Gem::Version
83
+ version: '0'
84
+ requirements: []
85
+ rubygems_version: 4.0.1
86
+ specification_version: 4
87
+ summary: Generate reusable XPath schemas with an LLM, then scrape HTML without AI
88
+ test_files: []