RubyGems - iriq - Versions diffs - 0.2.0 → 0.30.2 - Mend

iriq 0.2.0 → 0.30.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +78 -0
data/CLAUDE.md +128 -41
data/Gemfile.lock +4 -4
data/Makefile +80 -23
data/README.md +225 -347
data/completions/_iriq +52 -0
data/completions/iriq.bash +70 -0
data/docs/ARCHITECTURE.md +223 -0
data/docs/ROADMAP.md +190 -0
data/iriq.gemspec +2 -2
data/lib/iriq/cli.rb +398 -46
data/lib/iriq/cluster.rb +284 -12
data/lib/iriq/corpus.rb +318 -36
data/lib/iriq/cross_host_shape.rb +37 -0
data/lib/iriq/event.rb +22 -0
data/lib/iriq/evidence.rb +114 -0
data/lib/iriq/explanation.rb +1 -1
data/lib/iriq/normalizer.rb +71 -29
data/lib/iriq/path_shape.rb +30 -24
data/lib/iriq/position.rb +75 -0
data/lib/iriq/position_stats.rb +74 -8
data/lib/iriq/recognizer.rb +54 -0
data/lib/iriq/recognizer_proposal.rb +167 -0
data/lib/iriq/recognizers/date.rb +53 -0
data/lib/iriq/recognizers/integer.rb +37 -0
data/lib/iriq/recognizers/uuid.rb +16 -0
data/lib/iriq/reducer.rb +37 -0
data/lib/iriq/registrable_domain.rb +56 -0
data/lib/iriq/segment_classifier.rb +475 -23
data/lib/iriq/segment_hints.rb +9 -0
data/lib/iriq/shape.rb +106 -0
data/lib/iriq/specificity.rb +35 -0
data/lib/iriq/storage/memory.rb +83 -12
data/lib/iriq/storage/sqlite.rb +216 -37
data/lib/iriq/synthesized_recognizer.rb +56 -0
data/lib/iriq/trace.rb +294 -0
data/lib/iriq/version.rb +1 -1
data/lib/iriq.rb +17 -0
metadata +22 -3

data/lib/iriq/cluster.rb CHANGED Viewed

@@ -1,31 +1,79 @@
 module Iriq
   # A group of identifiers that share a host + shape key. Tracks examples and
   # per-position segment statistics so callers can ask which positions are
-  # actually stable in practice (e.g. /users/ always literal, /{integer_id}
+  # actually stable in practice (e.g. /users/ always literal, /{integer}
   # always variable).
   class Cluster
-    attr_reader :key, :host, :scheme, :shape, :examples, :count
+    attr_reader :key, :host, :scheme, :shape, :examples, :count, :param_stats, :max_values
+    # Structured Shape lazily derived from the first observed example —
+    # Iriq::Shape, or nil if no examples are present yet. Cached once it has
+    # been derived; the classifier passed on later calls is then ignored.
+    def shape_object(classifier: SegmentClassifier::DEFAULT)
+      return @shape_object if @shape_object
+      return nil if @examples.empty?
+      @shape_object = Shape.from_segments(@examples.first.path_segments, classifier: classifier)
+    end
     MAX_EXAMPLES = 10
-    def initialize(key:, host:, scheme:, shape:)
+    # Share of date-typed observations required before the corpus promotes
+    # a param to :date. 8-digit IDs in the 1900..2100 range look like
+    # YYYYMMDD by accident — without quorum we'd canonicalize random IDs.
+    DATE_CONFIDENCE_THRESHOLD = 0.8
+    # `:number` umbrella thresholds. Promote a position to :number when
+    # the combined :integer + :float observations reach
+    # NUMBER_CONFIDENCE_THRESHOLD of all observations AND neither subtype
+    # alone reaches NUMBER_SUBTYPE_THRESHOLD (we have a clear numeric
+    # pattern but it isn't purely ints or purely floats).
+    NUMBER_CONFIDENCE_THRESHOLD = 0.8
+    NUMBER_SUBTYPE_THRESHOLD    = 0.8
+    # `:enum` thresholds. Promote a param to :enum when the corpus has seen
+    # enough samples to trust the bound, the value set is small, each value
+    # appears more than once (rules out singletons), and the tracked values
+    # account for nearly all observations (lets a few stragglers through).
+    ENUM_MIN_OBSERVATIONS = 20
+    ENUM_MAX_CARDINALITY  = 10
+    ENUM_MIN_VALUE_COUNT  = 2
+    ENUM_MIN_COVERAGE     = 0.95
+    def initialize(key:, host:, scheme:, shape:, max_values: PositionStats::DEFAULT_MAX_VALUES)
       @key            = key
       @host           = host
       @scheme         = scheme
       @shape          = shape
+      @shape_object   = nil
       @examples       = []
+      @example_keys   = Set.new
       @count          = 0
       @segment_counts = []
+      @max_values     = max_values
+      # Query-param stats keyed by param name. Each is a PositionStats — same
+      # cardinality cap, same type-counts machinery, just indexed by ?key=
+      # instead of by path position.
+      @param_stats    = {}
     end
-    def add(identifier)
+    def add(identifier, classifier: SegmentClassifier::DEFAULT)
       @count += 1
-      @examples << identifier if @examples.size < MAX_EXAMPLES
+      if @examples.size < MAX_EXAMPLES
+        canon = identifier.canonical
+        @examples << identifier unless @example_keys.include?(canon)
+        @example_keys << canon
+      end
       identifier.path_segments.each_with_index do |seg, i|
         @segment_counts[i] ||= Hash.new(0)
         @segment_counts[i][seg] += 1
       end
+      return unless identifier.query_params
+      identifier.query_params.each do |name, value|
+        stats = @param_stats[name] ||= PositionStats.new(max_values: @max_values)
+        stats.observe(value.to_s, classifier.classify(value.to_s))
+      end
     end
     # Per-position summary:
@@ -52,9 +100,223 @@ module Iriq
         count:    count,
         examples: examples.map(&:canonical),
         segments: segment_stats,
+        params:   param_summary,
       }
     end
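+    # Per-param summary, ordered by descending count (equivalently,
+    # descending presence) with ties broken by name. Each entry has at least:
+    #   { name: "page", count: N, type: :integer, cardinality: K, presence: 0.83 }
+    # presence is count / @count — the fraction of observations that had
+    # this param. Type-specific keys (:values, :value_distribution,
+    # :subtype_distribution, :kind_distribution, :min/:max/:avg) are added
+    # only when they apply.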
+    def param_summary
+      return [] if @param_stats.empty?
+      @param_stats.map { |name, _stats|
+        stats = @param_stats[name]
+        type  = param_type(name)
+        row   = {
+          name:        name,
+          count:       stats.total,
+          type:        type,
+          cardinality: stats.cardinality,
+          presence:    @count.positive? ? stats.total.to_f / @count : 0.0,
+        }
+        row[:values] = enum_values(stats) if type == :enum
+        # Verbose value distribution — fractions of total observations.
+        # Boolean and enum positions get the per-value breakdown (e.g.
+        # `true: 0.97, false: 0.03`). Number positions get the int-vs-float
+        # split via :subtype_distribution.
+        if type == :boolean || type == :enum
+          row[:value_distribution] = value_distribution(stats)
+        end
+        if type == :number
+          row[:subtype_distribution] = subtype_distribution(stats, %i[integer float])
+        end
+        # :file kind breakdown — derived from tracked value_counts at
+        # summary time. Best-effort: only reflects observations within
+        # the value-tracking cap.
+        if type == :file
+          row[:kind_distribution] = file_kind_distribution(stats)
+        end
+        if stats.numeric_count.positive?
+          row[:min] = stats.numeric_min
+          row[:max] = stats.numeric_max
+          row[:avg] = stats.numeric_avg
+        end
+        row
+      }.sort_by { |row| [-row[:count], row[:name]] }
+    end
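+    # Returns the type the corpus is confident enough to call this param,
+    # or nil when the param has no recorded observations. Starts from
+    # stats.dominant_type and applies overrides in order: :year, then
+    # :http_status, then :enum (unless the dominant type is :boolean), then
+    # the :date gate — when :date is dominant but below
+    # DATE_CONFIDENCE_THRESHOLD, falls back to the most-common non-date
+    # type (or :literal if none exists) — then the :number umbrella, and
+    # finally the param-name hint. Shared by Cluster#param_summary and
+    # Corpus#inferred_param_type so both views agree on what the corpus
+    # "thinks" about a param.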
+    def param_type(name)
+      stats = @param_stats[name]
+      return nil unless stats
+      return nil if stats.total.zero?
+      type = stats.dominant_type
+      # :year takes priority over :enum for numeric range columns —
+      # a "years 2020..2026" position is more useful described as a
+      # ranged year than as an enum of those specific values.
+      return :year if year_position?(type, stats)
+      # :http_status — 3-digit ints clustered in 100..599 are almost
+      # certainly HTTP statuses. Same shape as :year (range check) but
+      # tighter window. Useful for `?status=...` or path positions that
+      # echo a status code.
+      return :http_status if http_status_position?(type, stats)
+      # :enum check — bounded set of repeated values trumps the underlying
+      # value type. `?status=active|draft|archived` surfaces as :enum
+      # (with the value list) rather than :literal even though each value
+      # individually classifies as a literal. Skip the override when the
+      # dominant type is already specific (`:boolean` carries more meaning
+      # than a 2-value enum).
+      return :enum if enum?(stats) && type != :boolean
+      # :date gate — demote when there isn't enough date-typed quorum.
+      if type == :date
+        date_frac = stats.type_counts[:date].to_f / stats.total
+        return type if date_frac >= DATE_CONFIDENCE_THRESHOLD
+        return dominant_excluding(stats, :date) || :literal
+      end
+      # :number umbrella — promote when ints + floats together dominate
+      # but neither alone is the clear winner.
+      if type == :integer || type == :float
+        int_frac   = stats.type_counts[:integer].to_f / stats.total
+        float_frac = stats.type_counts[:float].to_f / stats.total
+        if int_frac < NUMBER_SUBTYPE_THRESHOLD &&
+           float_frac < NUMBER_SUBTYPE_THRESHOLD &&
+           (int_frac + float_frac) >= NUMBER_CONFIDENCE_THRESHOLD
+          return :number
+        end
+      end
+      # Param-name fallback — `?phone=...` overrides a generic literal
+      # type with `:phone` when the value's shape was too weak to detect
+      # on its own. Only fires for overridable types (literal/opaque_id/slug).
+      if (hint = SegmentClassifier.param_name_hint(name, type))
+        return hint
+      end
+      type
+    end
+    YEAR_RANGE              = 1900..2100
+    YEAR_MIN_OBSERVATIONS   = 5
+    YEAR_MIN_DISTINCT       = 2
+    YEAR_MAX_DISTINCT       = 150
+    def year_position?(type, stats)
+      return false unless type == :integer
+      return false if stats.numeric_count.zero?
+      return false if stats.cardinality < YEAR_MIN_DISTINCT
+      return false if stats.cardinality > YEAR_MAX_DISTINCT
+      return false if stats.total < YEAR_MIN_OBSERVATIONS
+      YEAR_RANGE.cover?(stats.numeric_min) && YEAR_RANGE.cover?(stats.numeric_max)
+    end
+    HTTP_STATUS_RANGE            = 100..599
+    HTTP_STATUS_MIN_OBSERVATIONS = 5
+    HTTP_STATUS_MIN_DISTINCT     = 2
+    HTTP_STATUS_MAX_DISTINCT     = 30
+    def http_status_position?(type, stats)
+      return false unless type == :integer
+      return false if stats.numeric_count.zero?
+      return false if stats.cardinality < HTTP_STATUS_MIN_DISTINCT
+      return false if stats.cardinality > HTTP_STATUS_MAX_DISTINCT
+      return false if stats.total < HTTP_STATUS_MIN_OBSERVATIONS
+      HTTP_STATUS_RANGE.cover?(stats.numeric_min) && HTTP_STATUS_RANGE.cover?(stats.numeric_max)
+    end
+    # True when stats shows a bounded set of repeated values worth treating
+    # as an enum. See ENUM_* constants at the top of this class.
+    def enum?(stats)
+      return false if stats.total < ENUM_MIN_OBSERVATIONS
+      return false if stats.cardinality.zero? || stats.cardinality > ENUM_MAX_CARDINALITY
+      return false if stats.value_counts.any? { |_, n| n < ENUM_MIN_VALUE_COUNT }
+      coverage = stats.value_counts.values.sum.to_f / stats.total
+      coverage >= ENUM_MIN_COVERAGE
+    end
+    # Distinct values tracked for this param, ordered by descending count
+    # (lex tie-break). Returned alongside :enum-typed rows in param_summary
+    # so verbose/explain consumers can render the value set.
+    def enum_values(stats)
+      stats.value_counts.sort_by { |v, n| [-n, v] }.map(&:first)
+    end
+    # value_distribution returns the fraction of total observations each
+    # tracked value represents, ordered by descending count then lex. Used
+    # by param_summary for :boolean and :enum positions so callers can
+    # render "true 97%, false 3%"-style breakdowns.
+    def value_distribution(stats)
+      return {} if stats.total.zero?
+      stats.value_counts.sort_by { |v, n| [-n, v] }.to_h.transform_values do |n|
+        (n.to_f / stats.total).round(4)
+      end
+    end
+    # subtype_distribution slices type_counts to a specific subset and
+    # returns the fraction each subtype represents. Used for the :number
+    # umbrella to expose the int-vs-float split.
+    def subtype_distribution(stats, subtypes)
+      return {} if stats.total.zero?
+      subtypes.each_with_object({}) do |t, out|
+        n = stats.type_counts[t] || 0
+        out[t] = (n.to_f / stats.total).round(4) if n.positive?
+      end
+    end
+    # file_kind_distribution buckets tracked values by file kind and
+    # returns the fraction each kind represents over tracked observations.
+    # `:unknown` covers values that classified as :file but whose extension
+    # isn't in the kind allowlist (shouldn't normally happen since the
+    # classifier already gates on the kind map). Fractions are normalized
+    # by the tracked total, so they sum to ~1.0 (modulo rounding);
+    # observations beyond the value-tracking cap are not represented.
+    def file_kind_distribution(stats)
+      return {} if stats.value_counts.empty?
+      total = stats.value_counts.values.sum
+      return {} if total.zero?
+      kinds = Hash.new(0)
+      stats.value_counts.each do |value, n|
+        kind = SegmentClassifier.file_kind(value) || :unknown
+        kinds[kind] += n
+      end
+      kinds.sort_by { |k, n| [-n, k.to_s] }.to_h.transform_values do |n|
+        (n.to_f / total).round(4)
+      end
+    end
+    # Most common type in stats.type_counts excluding `skip` — lex tie-break
+    # so the choice is deterministic across runtimes.
+    def dominant_excluding(stats, skip)
+      best = nil
+      best_count = -1
+      stats.type_counts.each do |t, n|
+        next if t == skip
+        if n > best_count || (n == best_count && t.to_s < best.to_s)
+          best = t
+          best_count = n
+        end
+      end
+      best
+    end
     # JSON-friendly dump for persistence (distinct from #to_h which is a
     # display form). Examples are dumped as canonical strings and re-parsed
     # on load.
@@ -67,22 +329,31 @@ module Iriq
         "count"          => count,
         "examples"       => examples.map(&:canonical),
         "segment_counts" => @segment_counts.map { |h| h || {} },
+        "param_stats"    => @param_stats.transform_values(&:dump),
       }
     end
-    def self.from_dump(h)
-      cluster = new(key: h["key"], host: h["host"], scheme: h["scheme"], shape: h["shape"])
+    def self.from_dump(h, max_values: PositionStats::DEFAULT_MAX_VALUES)
+      cluster = new(
+        key: h["key"], host: h["host"], scheme: h["scheme"], shape: h["shape"],
+        max_values: max_values,
+      )
       cluster.instance_variable_set(:@count, h["count"])
-      cluster.instance_variable_set(:@examples, h["examples"].map { |s| Parser.parse(s) })
+      examples = h["examples"].map { |s| Parser.parse(s) }
+      cluster.instance_variable_set(:@examples, examples)
+      cluster.instance_variable_set(:@example_keys, examples.map(&:canonical).to_set)
       cluster.instance_variable_set(:@segment_counts, h["segment_counts"].map { |sub| Hash.new(0).merge(sub) })
+      params = (h["param_stats"] || {}).transform_values { |sd| PositionStats.from_dump(sd) }
+      cluster.instance_variable_set(:@param_stats, params)
       cluster
     end
     # Shared cluster-key derivation. Returns [key, host, scheme, shape] —
     # callers that already have a hinted shape can pass it in to skip the
     # recomputation; URN inputs ignore the override and always derive their
-    # own shape from the NSS value.
-    def self.key_for(iri, classifier:, shape: nil)
+    # own shape from the NSS value. `host:` overrides iri.host — used by
+    # Corpus when host_strategy collapses subdomains or ignores the host.
+    def self.key_for(iri, classifier:, shape: nil, host: nil)
       if iri.urn?
         ns, value = (iri.nss || "").split(":", 2)
         derived = value ? urn_value_shape(ns, value, classifier) : nil
@@ -90,8 +361,9 @@ module Iriq
         [key, nil, "urn", key]
       else
         shape ||= PathShape.new(classifier: classifier).for(iri.path_segments)
-        key = "#{iri.scheme}://#{iri.host}#{shape}"
-        [key, iri.host, iri.scheme, shape]
+        effective_host = host.nil? ? iri.host : host
+        key = "#{iri.scheme}://#{effective_host}#{shape}"
+        [key, effective_host, iri.scheme, shape]
       end
     end