RubyGems - iriq - Versions diffs - 0.1.0 → 0.30.2 - Mend

iriq 0.1.0 → 0.30.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (46) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +87 -0
data/CLAUDE.md +208 -0
data/Gemfile.lock +8 -2
data/Makefile +113 -0
data/README.md +249 -270
data/completions/_iriq +52 -0
data/completions/iriq.bash +70 -0
data/docs/ARCHITECTURE.md +223 -0
data/docs/ROADMAP.md +190 -0
data/iriq.gemspec +5 -4
data/lib/iriq/cli.rb +402 -49
data/lib/iriq/cluster.rb +304 -8
data/lib/iriq/clusterer.rb +19 -44
data/lib/iriq/corpus.rb +417 -81
data/lib/iriq/cross_host_shape.rb +37 -0
data/lib/iriq/event.rb +22 -0
data/lib/iriq/evidence.rb +114 -0
data/lib/iriq/explanation.rb +1 -1
data/lib/iriq/normalizer.rb +71 -29
data/lib/iriq/parser.rb +1 -1
data/lib/iriq/path_shape.rb +30 -24
data/lib/iriq/position.rb +75 -0
data/lib/iriq/position_stats.rb +74 -8
data/lib/iriq/recognizer.rb +54 -0
data/lib/iriq/recognizer_proposal.rb +167 -0
data/lib/iriq/recognizers/date.rb +53 -0
data/lib/iriq/recognizers/integer.rb +37 -0
data/lib/iriq/recognizers/uuid.rb +16 -0
data/lib/iriq/reducer.rb +37 -0
data/lib/iriq/registrable_domain.rb +56 -0
data/lib/iriq/segment_classifier.rb +475 -23
data/lib/iriq/segment_hints.rb +9 -0
data/lib/iriq/shape.rb +106 -0
data/lib/iriq/specificity.rb +35 -0
data/lib/iriq/storage/json.rb +43 -0
data/lib/iriq/storage/memory.rb +209 -0
data/lib/iriq/storage/sqlite.rb +546 -0
data/lib/iriq/storage.rb +35 -0
data/lib/iriq/synthesized_recognizer.rb +56 -0
data/lib/iriq/trace.rb +294 -0
data/lib/iriq/version.rb +1 -1
data/lib/iriq.rb +18 -0
metadata +44 -8
data/script/benchmark.rb +0 -81
data/script/memory.rb +0 -121

data/lib/iriq/evidence.rb ADDED Viewed

@@ -0,0 +1,114 @@
+module Iriq
+  # Evidence is the structured substrate for explanation. Each Record
+  # captures one fact about the system's reasoning: "this segment
+  # classified as :integer because the Integer recognizer fired with
+  # specificity TYPED", "the IPv4 type collapses to {ip} by policy",
+  # "Position P is mostly variable because of corpus stats".
+  #
+  # Trace and Explanation are views over a list of Evidence records;
+  # the structured form is what programmatic consumers (test assertions,
+  # PR-diff annotators, downstream tooling) should build on. Human note
+  # strings emitted by Trace are derived from Evidence payloads, so
+  # adding a new note kind starts with adding a new Evidence shape.
+  #
+  # Two axes:
+  #
+  #   subject_kind ∈ {:segment, :position, :cluster}
+  #     What this Evidence is about. Today most Evidence is :segment
+  #     (per-segment classification facts). :position and :cluster
+  #     Evidence become load-bearing once corpus-informed Trace lands
+  #     in a follow-up step.
+  #
+  #   source ∈ {:lexical, :recognizer, :corpus, :neighbor, :policy}
+  #     What kind of fact is being asserted.
+  #       :lexical    — pure shape match (e.g. "matches DATE_RE")
+  #       :recognizer — a named Recognizer fired with confidence/specificity
+  #       :corpus     — aggregated counts/distributions support this
+  #       :neighbor   — adjacent context informed this (prior literal,
+  #                     param name hint)
+  #       :policy     — a normalization policy applied (ip umbrella
+  #                     collapse, canonical date, currency upcase)
+  module Evidence
+    SUBJECT_KINDS = %i[segment position cluster].freeze
+    SOURCES       = %i[lexical recognizer corpus neighbor policy].freeze
+    # A single Evidence fact. Unknown subject_kind / source raise ArgumentError.
+    #
+    #   subject_kind — :segment | :position | :cluster
+    #   subject      — kind-specific identity:
+    #                    :segment  → { index:, value: }
+    #                    :position → Iriq::Position
+    #                    :cluster  → cluster key (string)
+    #   source       — :lexical | :recognizer | :corpus | :neighbor | :policy
+    #   payload      — source-and-kind-specific structured data (nil → {})
+    #   weight       — optional float in [0,1] (not range-checked here) —
+    #                  contribution to the ultimate decision. nil when
+    #                  scoring is not meaningful; #to_h then omits the key.
+    #   notes        — optional human-readable strings (nil → []). Trace renders
+    #                  these directly; programmatic consumers can ignore.
+    class Record
+      attr_reader :subject_kind, :subject, :source, :payload, :weight, :notes
+      def initialize(subject_kind:, subject:, source:, payload:, weight: nil, notes: [])
+        unless SUBJECT_KINDS.include?(subject_kind)
+          raise ArgumentError, "subject_kind must be one of #{SUBJECT_KINDS.inspect}"
+        end
+        unless SOURCES.include?(source)
+          raise ArgumentError, "source must be one of #{SOURCES.inspect}"
+        end
+        @subject_kind = subject_kind
+        @subject      = subject
+        @source       = source
+        @payload      = payload || {} # a nil payload is stored as an empty Hash
+        @weight       = weight
+        @notes        = notes || [] # a nil notes list is stored as an empty Array
+      end
+      def to_h # Hash view of the record with symbol keys
+        {
+          subject_kind: @subject_kind,
+          subject:      subject_serialized,
+          source:       @source,
+          payload:      @payload,
+          weight:       @weight,
+          notes:        @notes,
+        }.compact # drops weight: when it is nil
+      end
+      private
+      def subject_serialized # Hash subjects pass through; other subjects with #to_h (e.g. Position) are converted
+        return @subject.to_h if @subject.respond_to?(:to_h) && !@subject.is_a?(Hash)
+        @subject
+      end
+    end
+    module_function # the factories below are callable as Evidence.segment(...), Evidence.position(...), Evidence.cluster(...)
+    # Factories so call sites don't have to repeat subject_kind:.
+    def segment(index:, value:, source:, payload:, weight: nil, notes: [])
+      Record.new(
+        subject_kind: :segment,
+        subject:      { index: index, value: value },
+        source:       source, payload: payload, weight: weight, notes: notes,
+      )
+    end
+    def position(position:, source:, payload:, weight: nil, notes: []) # subject is the given position object, stored as-is
+      Record.new(
+        subject_kind: :position,
+        subject:      position,
+        source:       source, payload: payload, weight: weight, notes: notes,
+      )
+    end
+    def cluster(key:, source:, payload:, weight: nil, notes: []) # subject is the cluster key, stored as-is
+      Record.new(
+        subject_kind: :cluster,
+        subject:      key,
+        source:       source, payload: payload, weight: weight, notes: notes,
+      )
+    end
+  end
+end

data/lib/iriq/explanation.rb CHANGED Viewed

@@ -4,7 +4,7 @@ module Iriq
   #   Explanation.explain("https://foo.com/users/123")
   #   # => [
   #   #      { value: "users", type: :literal,    variable: false, hint: nil       },
-  #   #      { value: "123",   type: :integer_id, variable: true,  hint: "user_id" },
+  #   #      { value: "123",   type: :integer, variable: true,  hint: "user_id" },
   #   #    ]
   module Explanation
     module_function

data/lib/iriq/normalizer.rb CHANGED Viewed

@@ -5,46 +5,88 @@ module Iriq
   #   # => "https://foo.com/users/{user_id}"
   #
   # The form is intended for grouping/diffing — it is not a round-trippable URL.
+  #
+  # Path + query rendering dispatches through an evidence source so the
+  # mechanical (classifier-only) and corpus-informed code paths share one
+  # entry point. When `evidence` is nil, NullEvidenceSource provides the
+  # mechanical behavior (PathShape + param-name-hint query rules). When a
+  # Corpus is passed as `evidence`, its observed Position / Cluster stats
+  # drive the rendering (variability promotion, popular outlier
+  # preservation, cluster-inferred query types).
   module Normalizer
     module_function
-    def normalize(input, classifier: SegmentClassifier::DEFAULT, hints: true)
+    def normalize(input, classifier: SegmentClassifier::DEFAULT, hints: true, evidence: nil)
       iri = input.is_a?(Identifier) ? input : Parser.parse(input)
-      normalize_identifier(iri, classifier: classifier, hints: hints)
+      normalize_identifier(iri, classifier: classifier, hints: hints, evidence: evidence)
+    end
+    def normalize_identifier(iri, classifier: SegmentClassifier::DEFAULT, hints: true, evidence: nil)
+      return normalize_urn(iri, classifier, hints) if iri.urn?
+      src = evidence || NullEvidenceSource.new
+      out = +""
+      out << "#{iri.scheme}://" if iri.scheme
+      out << iri.host if iri.host
+      out << ":#{iri.port}" if iri.port
+      out << src.render_path(iri, classifier, hints)
+      if iri.query_params && !iri.query_params.empty?
+        out << "?" << src.render_query(iri, classifier)
+      end
+      out
     end
-    def normalize_identifier(iri, classifier: SegmentClassifier::DEFAULT, hints: true)
-      if iri.urn?
-        if iri.scheme == "urn" && iri.nss && iri.nss.include?(":")
-          ns, value = iri.nss.split(":", 2)
-          entry     = SegmentHints.derive([ns, value], classifier).last
-          shaped    = if entry[:variable]
-                        "{#{(hints && entry[:hint]) || entry[:type]}}"
-                      else
-                        entry[:value]
-                      end
-          "urn:#{ns}:#{shaped}"
+    def normalize_urn(iri, classifier, hints)
+      return iri.canonical unless iri.scheme == "urn" && iri.nss && iri.nss.include?(":")
+      ns, value = iri.nss.split(":", 2)
+      entry     = SegmentHints.derive([ns, value], classifier).last
+      shaped =
+        if entry[:type] == :date && (canon = SegmentClassifier.canonical_date(entry[:value]))
+          canon
+        elsif entry[:type] == :currency && (canon = SegmentClassifier.canonical_currency(entry[:value]))
+          canon
+        elsif entry[:variable]
+          "{#{(hints && entry[:hint]) || SegmentClassifier.display_type(entry[:type])}}"
         else
-          iri.canonical
+          entry[:value]
         end
-      else
-        out = +""
-        out << "#{iri.scheme}://" if iri.scheme
-        out << iri.host if iri.host
-        out << ":#{iri.port}" if iri.port
-        out << PathShape.new(classifier: classifier, hints: hints).for(iri.path_segments)
-        if iri.query_params && !iri.query_params.empty?
-          out << "?" + shape_query(iri.query_params, classifier)
-        end
-        out
-      end
+      "urn:#{ns}:#{shaped}"
+    end
+  end
+  # NullEvidenceSource is the default evidence source — purely
+  # classifier-driven, no corpus signal. The Normalizer's mechanical
+  # behavior is what this produces. Implements the same {render_path,
+  # render_query} interface that Corpus implements for the corpus-informed
+  # path.
+  class NullEvidenceSource
+    def render_path(iri, classifier, hints) # returns whatever PathShape#for yields for iri.path_segments
+      PathShape.new(
+        classifier: classifier, hints: hints,
+        canonical_dates: true, canonical_currencies: true, # canonical date/currency rendering is always enabled here
+      ).for(iri.path_segments)
     end
-    def shape_query(params, classifier)
-      params.keys.sort.map do |k|
-        v    = params[k]
+    def render_query(iri, classifier)
+      iri.query_params.keys.sort.map do |k|
+        v    = iri.query_params[k]
         type = classifier.classify(v.to_s)
-        shaped = classifier.variable?(type) ? "{#{type}}" : v
+        # Param-name hint can lift a generic literal/opaque_id/slug into
+        # a semantic type — `?phone=unknown` becomes `{phone}`.
+        if (hint = SegmentClassifier.param_name_hint(k, type))
+          type = hint
+        end
+        shaped =
+          if type == :date && (canon = SegmentClassifier.canonical_date(v.to_s))
+            canon
+          elsif type == :currency && (canon = SegmentClassifier.canonical_currency(v.to_s))
+            canon
+          elsif classifier.variable?(type)
+            "{#{SegmentClassifier.display_type(type)}}"
+          else
+            v
+          end
         "#{k}=#{shaped}"
       end.join("&")
     end

data/lib/iriq/parser.rb CHANGED Viewed

@@ -3,7 +3,7 @@ module Iriq
   #
   # Intentionally NOT a full RFC 3986 / 3987 / WHATWG URL implementation. We
   # accept enough of the common shapes (URLs, scheme-less hosts, URNs, raw
-  # Unicode hosts and paths) to support normalization and clustering.
+  # Unicode hosts and paths) to support extraction, normalization, and clustering.
   module Parser
     SCHEME_RE = /\A([a-zA-Z][a-zA-Z0-9+\-.]*):/.freeze

data/lib/iriq/path_shape.rb CHANGED Viewed

@@ -1,7 +1,8 @@
 module Iriq
-  # Converts a sequence of path segments into a route-shape string by
-  # replacing variable segments with `{hint}` placeholders, falling back to
-  # `{type}` when no hint is available.
+  # Renderer that produces a route-shape string by replacing variable
+  # segments with `{hint}` placeholders. As of v0.16 this is a thin wrapper
+  # around Iriq::Shape — kept for back-compat with callers that still want
+  # to get a string in one call.
   #
   #   PathShape.for(["users", "123", "orders", "456"])
   #   # => "/users/{user_id}/orders/{order_id}"
@@ -9,37 +10,42 @@ module Iriq
   # Pass `hints: false` to use raw types instead:
   #
   #   PathShape.for(["users", "123"], hints: false)
-  #   # => "/users/{integer_id}"
+  #   # => "/users/{integer}"
+  #
+  # Pass `canonical_dates: true` to render date-typed segments in canonical
+  # ISO form (2024/01/15 → 2024-01-15) instead of as a `{date}` placeholder.
+  # Pass `canonical_currencies: true` for the same treatment of currency
+  # codes (`usd` → `USD`).
+  #
+  # For new code, prefer building an Iriq::Shape directly and calling
+  # `#render`. PathShape stays available for the common string-only path.
   class PathShape
-    def initialize(classifier: SegmentClassifier::DEFAULT, hints: true)
-      @classifier = classifier
-      @hints      = hints
+    def initialize(classifier: SegmentClassifier::DEFAULT, hints: true,
+                   canonical_dates: false, canonical_currencies: false)
+      @classifier           = classifier
+      @hints                = hints
+      @canonical_dates      = canonical_dates
+      @canonical_currencies = canonical_currencies
     end
     def for(segments)
-      return "/" if segments.nil? || segments.empty?
-      from_entries(SegmentHints.derive(segments, @classifier))
+      from_entries(SegmentHints.derive(segments || [], @classifier))
     end
     # Build a shape string from already-derived SegmentHints entries.
-    # Used by Corpus to avoid re-deriving entries per observation when it
-    # needs multiple shape variants (raw and hinted).
     def from_entries(entries)
-      return "/" if entries.nil? || entries.empty?
-      "/" + entries.map { |e| shape_token(e) }.join("/")
-    end
-    def shape_token(entry)
-      return entry[:value] unless entry[:variable]
-      placeholder = @hints ? (entry[:hint] || entry[:type]) : entry[:type]
-      "{#{placeholder}}"
+      Shape.from_entries(entries).render(
+        hints: @hints,
+        canonical_dates: @canonical_dates,
+        canonical_currencies: @canonical_currencies,
+      )
     end
-    def self.for(segments, classifier: SegmentClassifier::DEFAULT, hints: true)
-      new(classifier: classifier, hints: hints).for(segments)
+    def self.for(segments, classifier: SegmentClassifier::DEFAULT, hints: true,
+                 canonical_dates: false, canonical_currencies: false)
+      new(classifier: classifier, hints: hints,
+          canonical_dates: canonical_dates,
+          canonical_currencies: canonical_currencies).for(segments)
     end
   end
 end

data/lib/iriq/position.rb ADDED Viewed

@@ -0,0 +1,75 @@
+module Iriq
+  # A typed slot in a host's URL structure.
+  #
+  # Two observations occupy the same Position when (host, scope, locator)
+  # match exactly. Position is the keying type used by Storage for
+  # frequency tables and by Cluster for per-slot inference.
+  #
+  # host    — the EFFECTIVE host per Corpus#host_strategy. Observations of
+  #           api.foo.com and app.foo.com under :registrable share the
+  #           same Position. The original host stays on the Identifier.
+  # scope   — :path or :query.
+  # locator — for :path, the typed prefix built up to this slot, e.g.
+  #           "/orgs/{opaque_id}/users" for the integer slot in
+  #           /orgs/abc/users/123. (Variable segments render as their
+  #           hint or display-type, so the prefix groups across observations
+  #           regardless of the specific IDs seen.)
+  #         — for :query, the ?key= parameter name.
+  #
+  # Position implements value equality and is safe to use as a Hash key.
+  class Position
+    SCOPES = %i[path query].freeze
+    attr_reader :host, :scope, :locator
+    def self.path(host:, prefix:) # path-scoped Position; prefix becomes the locator
+      new(host: host, scope: :path, locator: prefix)
+    end
+    def self.query(host:, name:) # query-scoped Position; name becomes the locator
+      new(host: host, scope: :query, locator: name)
+    end
+    def initialize(host:, scope:, locator:)
+      raise ArgumentError, "scope must be one of #{SCOPES.inspect}" unless SCOPES.include?(scope)
+      @host    = host
+      @scope   = scope
+      @locator = locator
+    end
+    def path?;  @scope == :path;  end
+    def query?; @scope == :query; end
+    def ==(other) # value equality over host, scope and locator
+      other.is_a?(Position) &&
+        other.host == @host &&
+        other.scope == @scope &&
+        other.locator == @locator
+    end
+    alias eql? == # Hash key lookup uses eql? together with #hash, so it shares =='s definition
+    def hash # built from the same three fields that == compares
+      [@host, @scope, @locator].hash
+    end
+    def to_h # symbol-keyed view; scope stays a Symbol (contrast with #to_dump)
+      { host: @host, scope: @scope, locator: @locator }
+    end
+    def to_s
+      "Position(#{@host.inspect}, #{@scope}, #{@locator.inspect})"
+    end
+    alias inspect to_s
+    # Serialized form used by JSON / SQLite storage. Scope is emitted as
+    # a string for cross-runtime compatibility; keys are strings as well.
+    def to_dump
+      { "host" => @host, "scope" => @scope.to_s, "locator" => @locator }
+    end
+    def self.from_dump(h) # inverse of #to_dump; reads the string keys "host", "scope", "locator"
+      new(host: h["host"], scope: h["scope"].to_sym, locator: h["locator"])
+    end
+  end
+end

data/lib/iriq/position_stats.rb CHANGED Viewed

@@ -3,16 +3,29 @@ module Iriq
   # Value cardinality is capped so a high-entropy position (UUIDs, timestamps)
   # doesn't grow memory without bound — `total` keeps growing accurately, but
   # only the first `max_values` distinct values are tracked individually.
+  # Existing tracked values still receive increments after the cap is hit;
+  # only NEW distinct values are dropped.
   class PositionStats
-    DEFAULT_MAX_VALUES = 1_000
+    DEFAULT_MAX_VALUES = 5_000
-    attr_reader :value_counts, :type_counts, :total, :max_values
+    attr_reader :value_counts, :type_counts, :total, :max_values,
+                :numeric_count, :numeric_min, :numeric_max, :numeric_sum
+    NUMERIC_TYPES = %i[integer float].freeze
     def initialize(max_values: DEFAULT_MAX_VALUES)
-      @value_counts = Hash.new(0)
-      @type_counts  = Hash.new(0)
-      @total        = 0
-      @max_values   = max_values
+      @value_counts  = Hash.new(0)
+      @type_counts   = Hash.new(0)
+      @total         = 0
+      @max_values    = max_values
+      # Range stats for numeric observations only. Lets the corpus
+      # promote /articles/2024 etc. to :year when all values land in
+      # 1900..2100, and surfaces min/max/avg on ParamSummary for
+      # general numeric params.
+      @numeric_count = 0
+      @numeric_min   = nil
+      @numeric_max   = nil
+      @numeric_sum   = 0.0
     end
     def observe(value, type)
@@ -21,8 +34,31 @@ module Iriq
       if @value_counts.size < @max_values || @value_counts.key?(value)
         @value_counts[value] += 1
       end
+      record_numeric(value, type)
+    end
+    def numeric_avg # mean of the recorded numeric observations, or nil when there are none
+      return nil if @numeric_count.zero? # guards the division below against a zero count
+      @numeric_sum / @numeric_count
     end
+    private
+    def record_numeric(value, type) # folds one observation into the numeric count/min/max/sum
+      return unless NUMERIC_TYPES.include?(type) # only types listed in NUMERIC_TYPES feed the range stats
+      n = Float(value) rescue nil # NOTE(review): modifier rescue hides any StandardError; Float(value, exception: false) is the narrower form — confirm
+      return unless n # unparseable values are skipped; n is a Float, so min/max/sum hold Floats
+      @numeric_count += 1
+      @numeric_min = n if @numeric_min.nil? || n < @numeric_min
+      @numeric_max = n if @numeric_max.nil? || n > @numeric_max
+      @numeric_sum += n
+    end
+    public
     def cardinality
       @value_counts.size
     end
@@ -42,13 +78,37 @@ module Iriq
       (@value_counts[value] || 0).to_f / @total
     end
+    # Most common type; nil when no type has been counted. Count ties break
+    # lexicographically by type name, keeping the result deterministic and
+    # in line with Go's DominantType (Go's map iteration is randomized).
+    def dominant_type
+      best = nil
+      best_count = -1 # sentinel that any count above -1 replaces
+      @type_counts.each do |t, n|
+        if n > best_count || (n == best_count && t.to_s < best.to_s) # higher count wins; equal counts go to the smaller name
+          best = t
+          best_count = n
+        end
+      end
+      best
+    end
     def dump
-      {
-        "value_counts" => @value_counts,
+      # Dup the hashes so callers can mutate the dump structure (test
+      # fixtures, post-processing) without aliasing the live state.
+      out = {
+        "value_counts" => @value_counts.dup,
         "type_counts"  => @type_counts.transform_keys(&:to_s),
         "total"        => @total,
         "max_values"   => @max_values,
       }
+      if @numeric_count.positive?
+        out["numeric_count"] = @numeric_count
+        out["numeric_min"]   = @numeric_min
+        out["numeric_max"]   = @numeric_max
+        out["numeric_sum"]   = @numeric_sum
+      end
+      out
     end
     def self.from_dump(h)
@@ -58,6 +118,12 @@ module Iriq
       tc = Hash.new(0).merge(h["type_counts"].transform_keys(&:to_sym))
       stats.instance_variable_set(:@value_counts, vc)
       stats.instance_variable_set(:@type_counts, tc)
+      if h["numeric_count"]
+        stats.instance_variable_set(:@numeric_count, h["numeric_count"])
+        stats.instance_variable_set(:@numeric_min, h["numeric_min"])
+        stats.instance_variable_set(:@numeric_max, h["numeric_max"])
+        stats.instance_variable_set(:@numeric_sum, h["numeric_sum"])
+      end
       stats
     end
   end

data/lib/iriq/recognizer.rb ADDED Viewed

@@ -0,0 +1,54 @@
+module Iriq
+  # Pluggable single-type classifier.
+  #
+  # A Recognizer encapsulates "this string-shape implies this type" plus the
+  # canonical form (if any). The ensemble-based SegmentClassifier consults
+  # Recognizers in order and picks the first that fires. (Scored-ensemble
+  # voting comes in a follow-up; for now each fire is decisive.)
+  #
+  # try(segment) -> { type:, confidence:, specificity:, canonical:, notes: } | nil
+  #   nil   — this Recognizer does not claim the segment.
+  #   type  — symbol from the recognized vocabulary.
+  #   confidence — float in [0, 1]. Phase-1 step 2 always returns 1.0 when a
+  #     Recognizer fires; calibration arrives with the scored ensemble (step 4).
+  #   specificity — multiplied with confidence by .ensemble; 0.0 when absent.
+  #   canonical — canonical form (e.g. ISO date for :date). nil ≡ "use input".
+  #   notes — optional array of strings the Trace view may surface.
+  #
+  # Recognizers are instantiated once and shared (they hold no per-call
+  # state). See Iriq::Recognizers::UUID / DATE / INTEGER for the built-ins.
+  class Recognizer
+    def try(_segment) # abstract hook; this base implementation always raises
+      raise NotImplementedError # NOTE(review): a ScriptError subclass, so a bare `rescue` does not catch it
+    end
+    # Run each Recognizer against the segment and return the winning
+    # Verdict — the one with max(specificity × confidence). Ties go to
+    # the earlier Recognizer in the list (stable, deterministic).
+    # Returns nil when no Recognizer fires.
+    #
+    # Stepping-stone toward the full scored ensemble: today only three
+    # Recognizers participate (uuid, date, integer) and they're
+    # mutually-exclusive on shape, so the ensemble is effectively a
+    # short-circuit OR. As more Recognizers carve out of SegmentClassifier
+    # they'll join the pool and the scoring becomes load-bearing.
+    def self.ensemble(segment, *recognizers)
+      best = nil
+      best_score = -1.0 # starts below 0.0 so a verdict that scores 0.0 still beats "no verdict"
+      recognizers.each do |r|
+        v = r.try(segment)
+        next unless v
+        score = (v[:specificity] || 0.0) * (v[:confidence] || 0.0) # a missing factor counts as 0.0
+        if score > best_score # strict > keeps the earlier Recognizer on ties
+          best       = v
+          best_score = score
+        end
+      end
+      best
+    end
+  end
+  module Recognizers
+  end
+end