nukitori 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,18 @@
1
# frozen_string_literal: true

require 'json'

module Nukitori
  # Parses LLM response content (handles both Hash and markdown-wrapped JSON).
  class ResponseParser
    # Opening markdown fence with an optional language tag (```json, ```JSON,
    # ```ruby, ...) — matched case-tolerantly so any tagged fence is stripped,
    # not just lowercase "json".
    OPENING_FENCE = /\A```[a-zA-Z]*\s*/
    # Closing fence at end of string, with any preceding whitespace.
    CLOSING_FENCE = /\s*```\z/

    # @param content [Hash, String, #to_s] Response content from LLM
    # @return [Object] the Hash unchanged, or the result of JSON.parse on the
    #   unfenced text (usually a Hash; an Array for top-level JSON arrays)
    # @raise [JSON::ParserError] when the unwrapped text is not valid JSON
    def self.parse(content)
      return content if content.is_a?(Hash)

      # Anything non-Hash is treated as text (to_s is a no-op for Strings).
      text = content.to_s.strip
      text = text.sub(OPENING_FENCE, '').sub(CLOSING_FENCE, '')
      JSON.parse(text)
    end
  end
end
@@ -0,0 +1,127 @@
1
# frozen_string_literal: true

module Nukitori
  # Applies an XPath schema (as produced by SchemaGenerator) to an HTML
  # document and returns the extracted values as plain Hashes/Arrays.
  class SchemaExtractor
    attr_reader :schema

    # @param schema [Hash] XPath schema; Symbol keys are accepted and
    #   normalized to Strings
    def initialize(schema)
      @schema = deep_stringify_keys(schema)
    end

    # Extract data from HTML using the XPath schema.
    # @param html [String, Nokogiri::HTML::Document] HTML string or Nokogiri document
    # @return [Hash] Extracted data keyed by top-level schema field names
    def extract(html)
      doc = html.is_a?(Nokogiri::HTML::Document) ? html : Nokogiri::HTML(html)
      extract_fields(doc, schema)
    end

    private

    # Recursively convert Hash keys to Strings so lookups are uniform
    # regardless of how the schema was authored (JSON vs Ruby literals).
    def deep_stringify_keys(obj)
      case obj
      when Hash
        obj.each_with_object({}) do |(k, v), result|
          result[k.to_s] = deep_stringify_keys(v)
        end
      when Array
        obj.map { |v| deep_stringify_keys(v) }
      else
        obj
      end
    end

    # Extract every field of +fields+ relative to +context+ (document or node).
    def extract_fields(context, fields)
      result = {}
      fields.each do |field_name, field_def|
        result[field_name] = extract_field(context, field_def)
      end
      result
    end

    # Dispatch a single field definition by its declared 'type'.
    # A bare String is accepted as shorthand for a string-typed XPath,
    # i.e. "//h1" behaves like {'xpath' => '//h1', 'type' => 'string'}
    # (previously such definitions were silently dropped as nil).
    def extract_field(context, field_def)
      return extract_primitive(context, 'xpath' => field_def, 'type' => 'string') if field_def.is_a?(String)

      case field_def['type']
      when 'array'
        extract_array(context, field_def)
      when 'object'
        extract_object(context, field_def)
      else
        # Untyped definitions are treated as primitives when an xpath is
        # present; anything else yields nil.
        extract_primitive(context, field_def) if field_def['xpath']
      end
    end

    # Arrays select repeating containers via 'container_xpath', then extract
    # each item with the 'items' definition (primitive or nested field map).
    def extract_array(context, field_def)
      container_xpath = field_def['container_xpath']
      items_def = field_def['items']

      return [] unless container_xpath && items_def

      containers = context.xpath(container_xpath)

      # Simple array (strings) vs array of objects
      if items_def['xpath']
        containers.map { |c| extract_primitive(c, items_def) }
      else
        containers.map { |c| extract_fields(c, items_def) }
      end
    end

    # Objects optionally re-root the context via 'context_xpath', then extract
    # their 'properties'. Returns nil when the context node is absent or the
    # schema has no 'properties' (malformed schemas no longer raise
    # NoMethodError on nil).
    def extract_object(context, field_def)
      properties = field_def['properties']
      context_xpath = field_def['context_xpath']

      return nil unless properties

      if context_xpath
        context = context.at_xpath(context_xpath)
        return nil unless context
      end

      extract_fields(context, properties)
    end

    # Evaluate a primitive field's xpath and convert the raw text to the
    # declared type (defaults to 'string').
    def extract_primitive(context, field_def)
      xpath = field_def['xpath']
      type = field_def['type'] || 'string'

      return nil unless xpath

      result = context.xpath(xpath)
      raw_value = extract_raw_value(result)

      return nil if raw_value.nil?

      convert_to_type(raw_value, type)
    end

    # Reduce an XPath evaluation result to a stripped String, or nil when
    # nothing matched. Attribute nodes yield their value, elements their text.
    def extract_raw_value(xpath_result)
      return nil if xpath_result.nil?
      return nil if xpath_result.is_a?(Nokogiri::XML::NodeSet) && xpath_result.empty?

      value = if xpath_result.is_a?(Nokogiri::XML::NodeSet)
                node = xpath_result.first
                node.is_a?(Nokogiri::XML::Attr) ? node.value : node.text
              else
                # xpath() can return non-NodeSet results (e.g. XPath function
                # values); stringify before stripping.
                xpath_result.to_s
              end

      value.strip
    end

    # Coerce a raw String into the schema's declared type. Unknown types are
    # passed through unchanged.
    def convert_to_type(value, type)
      case type
      when 'string'
        # Collapse internal whitespace runs to single spaces.
        value.to_s.gsub(/\s+/, ' ').strip
      when 'integer'
        # Keep digits and minus sign; "$1,234" => 1234.
        value.gsub(/[^\d-]/, '').to_i
      when 'number', 'float'
        value.gsub(/[^\d.-]/, '').to_f
      when 'boolean'
        %w[true yes 1 on].include?(value.to_s.downcase)
      else
        value
      end
    end
  end
end
@@ -0,0 +1,147 @@
1
# frozen_string_literal: true

require 'json'

module Nukitori
  # Uses an LLM (via RubyLLM's chat interface) to turn a RubyLLM::Schema
  # definition block plus one sample HTML page into a reusable XPath
  # extraction schema (consumed by SchemaExtractor).
  class SchemaGenerator
    attr_reader :model

    # @param model [String, nil] LLM model to use
    # @param block [Proc] Schema definition block
    # @raise [ArgumentError] when no block is given
    #
    # @example
    #   generator = Nukitori::SchemaGenerator.new do
    #     array :repos do
    #       object do
    #         string :name
    #         string :url
    #       end
    #     end
    #   end
    #
    def initialize(model: nil, &block)
      raise ArgumentError, 'Block required for schema definition' unless block_given?

      @model = model
      @schema_block = block
    end

    # Create extraction schema for given HTML.
    # @param html [String, Nokogiri::HTML::Document] HTML string or Nokogiri document
    # @return [Hash] Generated extraction schema (parsed from the LLM's JSON reply)
    #
    # @example
    #   extraction_schema = generator.create_extraction_schema_for(html)
    #
    def create_extraction_schema_for(html)
      doc = html.is_a?(Nokogiri::HTML::Document) ? html : Nokogiri::HTML(html)
      # Requirements = JSON rendering of the user's RubyLLM::Schema block.
      requirements = build_requirements(&@schema_block)
      # NOTE(review): HtmlPreprocessor presumably trims the document for token
      # budget — behavior not visible from this file; confirm there.
      processed_html = HtmlPreprocessor.process(doc)
      normalized_requirements = normalize_requirements(requirements)
      prompt = build_prompt(normalized_requirements)

      chat = ChatFactory.create(model:)
      # Prompt goes in as system instructions; the HTML is the user message.
      chat.with_instructions(prompt)

      response = chat.ask(processed_html)
      # ResponseParser tolerates markdown-fenced JSON replies.
      ResponseParser.parse(response.content)
    end

    private

    # Build an anonymous RubyLLM::Schema subclass from the DSL block and
    # serialize it to a JSON string.
    def build_requirements(&block)
      schema_class = Class.new(RubyLLM::Schema, &block)
      schema_class.new.to_json
    end

    # Reduce the serialized schema to just its properties map — the only part
    # the prompt needs.
    def normalize_requirements(requirements)
      schema_json = JSON.parse(requirements)
      schema_json.dig('schema', 'properties')
    end

    # Render the system prompt, embedding the requirements as JSON.
    # The heredoc text is the LLM contract — edit with care.
    def build_prompt(requirements)
      <<~PROMPT
        You are an expert at analyzing HTML structure and generating XPath expressions.

        ## Task
        Analyze the provided HTML and generate an XPath schema that can extract data
        matching the requirements schema below. Return ONLY valid JSON, no other text.

        ## Requirements Schema (what to extract)
        ```json
        #{requirements.to_json}
        ```

        ## XPath Schema Format

        For each field in requirements, generate the corresponding XPath definition:

        1. **For primitive types** (string, integer, number, boolean):
           ```json
           {
             "field_name": {
               "xpath": "//div[@class='example']",
               "type": "string"
             }
           }
           ```

        2. **For arrays of objects**:
           ```json
           {
             "items_list": {
               "type": "array",
               "container_xpath": "//div[@class='item']",
               "items": {
                 "name": {"xpath": ".//h3", "type": "string"},
                 "price": {"xpath": ".//span[@class='price']", "type": "number"}
               }
             }
           }
           ```

        3. **For arrays of strings**:
           ```json
           {
             "tags": {
               "type": "array",
               "container_xpath": ".//a[@class='tag']",
               "items": {
                 "xpath": ".",
                 "type": "string"
               }
             }
           }
           ```

        ## XPath Rules

        - Use `container_xpath` to identify repeating elements for arrays
        - Use relative XPaths (starting with `.//` or `.`) for fields inside arrays
        - Do NOT use `/text()` - just select the element, we extract text automatically
        - Use `@attr` to extract attribute values (e.g., `@href`, `@src`), especially for schema attributes which ends at `link` or `url`
        - Prefer semantic attributes: `@data-testid`, `@role`, `@aria-label`
        - Prefer tag structure: `//article//h3/a` over class-based selectors

        ## CRITICAL: Avoid Page-Specific Values

        The XPath schema must work on ALL similar pages, not just this one. NEVER use:

        - **IDs with numbers/UUIDs**: `@id='product-price-19140'` or `@id='item-abc123'` — these are page-specific
        - **Dynamic function names**: `@x-data='initComponent_6956666703fbc()'` — these change per page load
        - **Hashed class names**: `Box-sc-62in7e-0`, `css-1a2b3c`, `styled-xyz123` — generated by CSS-in-JS
        - **Session/random tokens**: any attribute value that looks like a hash or random string

        Instead, use:
        - Class names that describe purpose: `@class='price'`, `@class='product-title'`
        - Structural patterns: `//div[@class='product']//span[contains(@class,'price')]`
        - Partial matches when needed: `contains(@id,'product-price')` instead of exact ID
        - Tag hierarchy: `//article//h3/a` — relies on structure, not dynamic values

        ## Output

        Return ONLY the JSON XPath schema. No explanations, no markdown code blocks.
      PROMPT
    end
  end
end
@@ -0,0 +1,5 @@
1
# frozen_string_literal: true

module Nukitori
  # Gem version, kept in its own file so the gemspec can load it standalone.
  VERSION = '0.1.0'
end
data/lib/nukitori.rb ADDED
@@ -0,0 +1,122 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'nokogiri'
4
+ require 'ruby_llm'
5
+ require 'ruby_llm/schema'
6
+ require 'json'
7
+
8
+ require_relative 'nukitori/version'
9
+ require_relative 'nukitori/response_parser'
10
+ require_relative 'nukitori/html_preprocessor'
11
+ require_relative 'nukitori/chat_factory'
12
+ require_relative 'nukitori/schema_generator'
13
+ require_relative 'nukitori/schema_extractor'
14
+ require_relative 'nukitori/llm_extractor'
15
+
16
module Nukitori
  # Absolute path to the bundled models.json; handed to RubyLLM so model
  # lookups work from the shipped registry file.
  MODELS_JSON = File.expand_path('nukitori/models.json', __dir__)

  class << self
    # Configure RubyLLM through Nukitori. The bundled models.json is always
    # wired in; the optional block then receives the RubyLLM config object.
    #
    # @example
    #   Nukitori.configure do |config|
    #     config.default_model = 'gpt-5.2'
    #     config.openai_api_key = ENV['OPENAI_API_KEY']
    #   end
    #
    def configure
      RubyLLM.configure do |rubyllm_config|
        rubyllm_config.model_registry_file = MODELS_JSON
        yield rubyllm_config if block_given?
      end
    end

    # Main entry point — also reachable as Nukitori(html, 'schema.json') { ... }.
    #
    # With a +schema_path+, the XPath schema is generated once by the LLM and
    # cached on disk; later calls reuse it via plain Nokogiri. Without one,
    # the LLM is consulted on every call (AI-only mode).
    #
    # @param html [String, Nokogiri::HTML::Document] HTML content or Nokogiri doc
    # @param schema_path [String, nil] Path to cache extraction schema (optional)
    # @param model [String, nil] LLM model to use (overrides default_model)
    # @param prefix [String, nil] Key under which the schema is stored when one
    #   file holds several schemas
    # @param block [Proc] Schema definition block (RubyLLM::Schema DSL)
    # @return [Hash] Extracted data
    # @raise [ArgumentError] when no block is given
    def call(html, schema_path = nil, model: nil, prefix: nil, &block)
      raise ArgumentError, 'Block required for schema definition' unless block_given?

      return LlmExtractor.extract(html, model:, &block) unless schema_path

      extract_with_schema(html, schema_path, model:, prefix:, &block)
    end

    private

    # XPath-based extraction backed by an on-disk schema cache: reuse the
    # cached schema when present, otherwise generate and persist it first.
    def extract_with_schema(html, schema_path, model: nil, prefix: nil, &block)
      doc = html.is_a?(Nokogiri::HTML::Document) ? html : Nokogiri::HTML(html)

      xpath_schema = read_cached_schema(schema_path, prefix) ||
                     generate_and_save_schema(doc, schema_path, model:, prefix:, &block)

      SchemaExtractor.new(xpath_schema).extract(doc)
    end

    # Load a previously saved schema from +path+, honoring +prefix+ when the
    # file holds several schemas. Returns nil when nothing usable is cached.
    def read_cached_schema(path, prefix)
      return nil unless File.exist?(path)

      stored = JSON.parse(File.read(path))
      prefix ? stored[prefix] : stored
    end

    # Ask the LLM for a fresh XPath schema, write it to +path+ (merged under
    # +prefix+ when given, preserving sibling schemas), and return it.
    def generate_and_save_schema(doc, path, model: nil, prefix: nil, &block)
      xpath_schema = SchemaGenerator.new(model:, &block).create_extraction_schema_for(doc)

      payload =
        if prefix
          previous = File.exist?(path) ? JSON.parse(File.read(path)) : {}
          previous.merge(prefix => xpath_schema)
        else
          xpath_schema
        end
      File.write(path, JSON.pretty_generate(payload))

      xpath_schema
    end
  end
end
118
+
119
# Global DSL shim so callers can write Nukitori(html, 'schema.json') { ... }
# instead of Nukitori.call(...). Pure delegation; see Nukitori.call for docs.
def Nukitori(html, schema_path = nil, model: nil, prefix: nil, &block)
  Nukitori.call(html, schema_path, model: model, prefix: prefix, &block)
end
data/sig/nukitori.rbs ADDED
@@ -0,0 +1,4 @@
1
module Nukitori
  # Gem version string; mirrors Nukitori::VERSION in lib/nukitori/version.rb.
  # See the writing guide of rbs: https://github.com/ruby/rbs#guides
  VERSION: String
end
metadata ADDED
@@ -0,0 +1,88 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: nukitori
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Victor Afanasev
8
+ bindir: exe
9
+ cert_chain: []
10
+ date: 1980-01-02 00:00:00.000000000 Z
11
+ dependencies:
12
+ - !ruby/object:Gem::Dependency
13
+ name: nokogiri
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - "~>"
17
+ - !ruby/object:Gem::Version
18
+ version: '1.19'
19
+ type: :runtime
20
+ prerelease: false
21
+ version_requirements: !ruby/object:Gem::Requirement
22
+ requirements:
23
+ - - "~>"
24
+ - !ruby/object:Gem::Version
25
+ version: '1.19'
26
+ - !ruby/object:Gem::Dependency
27
+ name: ruby_llm
28
+ requirement: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - "~>"
31
+ - !ruby/object:Gem::Version
32
+ version: '1.9'
33
+ type: :runtime
34
+ prerelease: false
35
+ version_requirements: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - "~>"
38
+ - !ruby/object:Gem::Version
39
+ version: '1.9'
40
+ description: Nukitori is a Ruby gem for HTML data extraction. It uses an LLM once
41
+ to generate reusable XPath schemas, then extracts structured data from similarly
42
+ structured pages using plain Nokogiri. This makes scraping fast, predictable, and
43
+ cheap for repeated runs.
44
+ email:
45
+ - vicfreefly@gmail.com
46
+ executables: []
47
+ extensions: []
48
+ extra_rdoc_files: []
49
+ files:
50
+ - CHANGELOG.md
51
+ - LICENSE.txt
52
+ - README.md
53
+ - Rakefile
54
+ - lib/nukitori.rb
55
+ - lib/nukitori/chat_factory.rb
56
+ - lib/nukitori/html_preprocessor.rb
57
+ - lib/nukitori/llm_extractor.rb
58
+ - lib/nukitori/models.json
59
+ - lib/nukitori/response_parser.rb
60
+ - lib/nukitori/schema_extractor.rb
61
+ - lib/nukitori/schema_generator.rb
62
+ - lib/nukitori/version.rb
63
+ - sig/nukitori.rbs
64
+ homepage: https://github.com/vifreefly/nukitori
65
+ licenses:
66
+ - MIT
67
+ metadata:
68
+ homepage_uri: https://github.com/vifreefly/nukitori
69
+ source_code_uri: https://github.com/vifreefly/nukitori
70
+ changelog_uri: https://github.com/vifreefly/nukitori/blob/main/CHANGELOG.md
71
+ rdoc_options: []
72
+ require_paths:
73
+ - lib
74
+ required_ruby_version: !ruby/object:Gem::Requirement
75
+ requirements:
76
+ - - ">="
77
+ - !ruby/object:Gem::Version
78
+ version: 3.2.0
79
+ required_rubygems_version: !ruby/object:Gem::Requirement
80
+ requirements:
81
+ - - ">="
82
+ - !ruby/object:Gem::Version
83
+ version: '0'
84
+ requirements: []
85
+ rubygems_version: 4.0.1
86
+ specification_version: 4
87
+ summary: Generate reusable XPath schemas with an LLM, then scrape HTML without AI
88
+ test_files: []