RubyGems - iriq - Versions diffs - 0.1.0 → 0.30.2 - Mend

iriq 0.1.0 → 0.30.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (46) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +87 -0
data/CLAUDE.md +208 -0
data/Gemfile.lock +8 -2
data/Makefile +113 -0
data/README.md +249 -270
data/completions/_iriq +52 -0
data/completions/iriq.bash +70 -0
data/docs/ARCHITECTURE.md +223 -0
data/docs/ROADMAP.md +190 -0
data/iriq.gemspec +5 -4
data/lib/iriq/cli.rb +402 -49
data/lib/iriq/cluster.rb +304 -8
data/lib/iriq/clusterer.rb +19 -44
data/lib/iriq/corpus.rb +417 -81
data/lib/iriq/cross_host_shape.rb +37 -0
data/lib/iriq/event.rb +22 -0
data/lib/iriq/evidence.rb +114 -0
data/lib/iriq/explanation.rb +1 -1
data/lib/iriq/normalizer.rb +71 -29
data/lib/iriq/parser.rb +1 -1
data/lib/iriq/path_shape.rb +30 -24
data/lib/iriq/position.rb +75 -0
data/lib/iriq/position_stats.rb +74 -8
data/lib/iriq/recognizer.rb +54 -0
data/lib/iriq/recognizer_proposal.rb +167 -0
data/lib/iriq/recognizers/date.rb +53 -0
data/lib/iriq/recognizers/integer.rb +37 -0
data/lib/iriq/recognizers/uuid.rb +16 -0
data/lib/iriq/reducer.rb +37 -0
data/lib/iriq/registrable_domain.rb +56 -0
data/lib/iriq/segment_classifier.rb +475 -23
data/lib/iriq/segment_hints.rb +9 -0
data/lib/iriq/shape.rb +106 -0
data/lib/iriq/specificity.rb +35 -0
data/lib/iriq/storage/json.rb +43 -0
data/lib/iriq/storage/memory.rb +209 -0
data/lib/iriq/storage/sqlite.rb +546 -0
data/lib/iriq/storage.rb +35 -0
data/lib/iriq/synthesized_recognizer.rb +56 -0
data/lib/iriq/trace.rb +294 -0
data/lib/iriq/version.rb +1 -1
data/lib/iriq.rb +18 -0
metadata +44 -8
data/script/benchmark.rb +0 -81
data/script/memory.rb +0 -121

data/lib/iriq/corpus.rb CHANGED Viewed

@@ -7,6 +7,10 @@ module Iriq
   #
   # The deterministic, single-IRI API (Iriq.normalize/explain) is unchanged —
   # Corpus#normalize and Corpus#explain are the corpus-informed variants.
+  #
+  # State lives in a Storage backend (Memory by default; Json or Sqlite when
+  # opened against a file). The classification logic on top is identical
+  # regardless of where the counters live.
   class Corpus
     # Type-based: position is "mostly variable" (UUIDs/integers/etc.).
     VARIABLE_DOMINANCE_THRESHOLD = 0.8
@@ -38,44 +42,274 @@ module Iriq
     POPULAR_MIN_COUNT         = 5
     POPULAR_BASELINE_MULTIPLE = 3
-    attr_reader :host_counts, :path_length_counts, :raw_shape_counts,
-                :fingerprint_counts, :position_stats
+    HOST_STRATEGIES = %i[full registrable none].freeze
+    attr_reader :storage, :host_strategy, :classifier
     def initialize(classifier: SegmentClassifier::DEFAULT,
-                   max_values_per_position: PositionStats::DEFAULT_MAX_VALUES)
-      @classifier              = classifier
-      @max_values_per_position = max_values_per_position
-      @host_counts             = Hash.new(0)
-      @path_length_counts      = Hash.new(0)
-      @raw_shape_counts        = Hash.new(0)
-      @fingerprint_counts      = Hash.new(0)
-      @position_stats          = {}
-      @clusterer               = Clusterer.new(classifier: classifier)
+                   max_values_per_position: PositionStats::DEFAULT_MAX_VALUES,
+                   host_strategy: :full,
+                   storage: nil)
+      raise ArgumentError, "host_strategy must be one of #{HOST_STRATEGIES.inspect}" \
+        unless HOST_STRATEGIES.include?(host_strategy)
+      @classifier    = classifier
+      @host_strategy = host_strategy
+      @storage       = storage || Storage::Memory.new(
+        classifier: classifier,
+        max_values_per_position: max_values_per_position,
+      )
+    end
+    # Open a corpus against `path`. File extension picks the backend:
+    # `.db`/`.sqlite`/`.sqlite3` use SQLite (incremental writes); anything
+    # else uses JSON.
+    def self.open(path, classifier: SegmentClassifier::DEFAULT,
+                        max_values_per_position: PositionStats::DEFAULT_MAX_VALUES,
+                        host_strategy: :full)
+      storage = Storage.open(path,
+                             classifier: classifier,
+                             max_values_per_position: max_values_per_position)
+      corpus = new(classifier: classifier, storage: storage, host_strategy: host_strategy)
+      corpus.send(:reapply_activated_recognizers!) if storage.respond_to?(:each_activated_recognizer)
+      corpus
+    end
+    # Normalize the host for keying purposes. `:full` keeps the original
+    # host; `:registrable` collapses subdomains via the inline-PSL heuristic
+    # (api.foo.com + app.foo.com → foo.com); `:none` ignores host entirely
+    # so clusters group across all hosts by shape alone.
+    def effective_host(host)
+      case @host_strategy
+      when :registrable then RegistrableDomain.for(host)
+      when :none        then ""
+      else                   host
+      end
     end
     # Observe a single IRI. Returns an Observation.
+    #
+    # Internally: builds an Event list for the IRI, then applies each event
+    # through the Reducer registry inside a single storage transaction. The
+    # event list is transient today — a future commit can persist it and
+    # replay against alternate reducers / thresholds for re-runnable
+    # inference. See lib/iriq/event.rb and lib/iriq/reducer.rb.
     def observe(input)
+      iri     = coerce(input)
+      events  = events_for(iri)
+      cluster = nil
+      @storage.transaction do |s|
+        events.each do |e|
+          result = Reducer.apply(e, s)
+          cluster = result if e.is_a?(Event::ClusterAddition)
+        end
+        s.record_observation(iri.canonical) if s.respond_to?(:record_observation)
+      end
+      Observation.new(corpus: self, identifier: iri, cluster: cluster)
+    end
+    # Drop every materialized view (host counts, position stats, clusters,
+    # …) and rebuild them by replaying the source-IRI log through the
+    # current events + reducers pipeline. Useful for:
+    #
+    #   - Tuning thresholds (swap a Corpus constant, call reinfer)
+    #   - Swapping the classifier (open the Corpus with a different
+    #     classifier, call reinfer — events are re-derived from raw IRIs)
+    #   - Recovering after a Reducer-set change
+    #
+    # Wrapped in a single backend transaction so a failure mid-replay
+    # leaves the prior views intact.
+    def reinfer
+      @storage.transaction do |s|
+        iris = []  # snapshot of the source-IRI log, collected before the views are cleared
+        s.each_observed_iri { |canonical| iris << canonical }  # NOTE(review): no respond_to? guard here, unlike record_observation in #observe — confirm every backend implements the log
+        s.clear_materialized_views
+        iris.each do |canonical|
+          iri = Parser.parse(canonical)  # events are re-derived from the stored canonical string
+          events_for(iri).each { |e| Reducer.apply(e, s) }  # replay only: record_observation is not called, so the log itself is not appended to
+        end
+      end
+      nil  # explicit nil rather than the transaction block's value
+    end
+    # Number of IRIs in the source-IRI log. The materialized views are
+    # derived from this log; reinfer replays it.
+    def observed_iri_count
+      return @storage.observed_iri_count if @storage.respond_to?(:observed_iri_count)
+      0
+    end
+    # Scan observed values for shape patterns that recur frequently enough
+    # to suggest a new Recognizer. Returns RecognizerProposal records; nothing
+    # is automatically applied — the proposal carries enough evidence for a
+    # human to decide whether to bake the Recognizer in.
+    #
+    # Strategies are pluggable; the default set lives in
+    # Iriq::ProposalStrategy::DEFAULTS. Pass `strategies:` to limit / extend.
+    # Pass `min_observations:` / `min_coverage:` / `min_hosts:` to tune
+    # what passes the noise floor.
+    def propose_recognizers(strategies: ProposalStrategy::DEFAULTS, **opts)
+      strategies.flat_map { |s| s.propose(@storage, **opts) }
+    end
+    # Promote a RecognizerProposal into a live Recognizer for this corpus.
+    #
+    # Mechanics:
+    #   1. Synthesize a SynthesizedRecognizer from the proposal's prefix.
+    #   2. Switch to a per-corpus classifier (if we were sharing the
+    #      module-level DEFAULT) so activation doesn't leak to other
+    #      corpora using the same default singleton.
+    #   3. Register the Recognizer on the classifier — the ensemble
+    #      picks it up on the next classify() call.
+    #   4. Persist the activation in storage so reopens re-apply it.
+    #   5. Reinfer so existing observations get re-classified through
+    #      the new Recognizer.
+    #
+    # Returns the synthesized Recognizer.
+    def activate_proposal(proposal)
+      recognizer = SynthesizedRecognizer.from_proposal(proposal)
+      ensure_per_corpus_classifier!  # may replace @classifier; NOTE(review): @storage was constructed with the earlier classifier — confirm the backend does not keep using it
+      @classifier.register_recognizer(recognizer)
+      if @storage.respond_to?(:record_activated_recognizer)  # persistence is optional: backends without this hook skip it
+        @storage.record_activated_recognizer(recognizer.to_dump)
+      end
+      reinfer  # replay the source-IRI log through the updated classifier
+      recognizer
+    end
+    # Convenience: activate every proposal whose confidence clears the
+    # given threshold. Returns the activated Recognizers. Confidence
+    # incorporates both per-position coverage AND cross-host
+    # corroboration — see RecognizerProposal#compute_confidence.
+    def activate_proposals_above(confidence_threshold, **propose_opts)
+      proposals = propose_recognizers(**propose_opts)
+      proposals.select { |p| p.confidence >= confidence_threshold }.map { |p| activate_proposal(p) }
+    end
+    # Number of activated recognizers persisted with this corpus.
+    def activated_recognizer_count
+      return @storage.activated_recognizer_count if @storage.respond_to?(:activated_recognizer_count)
+      0
+    end
+    # Route shapes that recur across `min_hosts` or more distinct hosts.
+    # Returns CrossHostShape records sorted by host_count desc, then by
+    # observation_count desc, then by shape (stable, deterministic).
+    #
+    # Cross-host recurrence is independent evidence of a real semantic
+    # pattern — two unrelated hosts inventing the same `/users/{integer}`
+    # structure by accident is unlikely. A natural follow-up is feeding
+    # this signal back into RecognizerProposal confidence: a proposal
+    # supported by N hosts is much stronger than one seen on a single
+    # host with the same per-position coverage.
+    def cross_host_shapes(min_hosts: 2)
+      by_shape = Hash.new { |h, k| h[k] = { hosts: Set.new, count: 0 } }
+      @storage.clusters.each do |cluster|
+        # Skip non-URL clusters (URN clusters have no host).
+        next if cluster.host.nil? || cluster.host.empty?
+        agg = by_shape[cluster.shape]
+        agg[:hosts] << cluster.host
+        agg[:count] += cluster.count
+      end
+      by_shape.filter_map do |shape, data|
+        next nil if data[:hosts].size < min_hosts
+        CrossHostShape.new(
+          shape:             shape,
+          hosts:             data[:hosts],
+          observation_count: data[:count],
+        )
+      end.sort_by { |s| [-s.host_count, -s.observation_count, s.shape] }
+    end
+    # Build the ordered Event list for `input` without applying it. Useful
+    # for inspection, tests, and future event-log persistence. Each call is
+    # pure — no storage side-effects.
+    def events_for(input)
       iri = coerce(input)
       hinted_entries = SegmentHints.derive(iri.path_segments, @classifier)
-      record_aggregates(iri, hinted_entries)
+      raw_shape    = PathShape.new(classifier: @classifier, hints: false).from_entries(hinted_entries)
       hinted_shape = PathShape.new(classifier: @classifier, hints: true).from_entries(hinted_entries)
-      cluster = @clusterer.add(iri, shape: hinted_shape)
-      Observation.new(corpus: self, identifier: iri, cluster: cluster)
+      keying_host  = effective_host(iri.host)
+      events = [
+        Event::HostSeen.new(keying_host),
+        Event::PathLengthSeen.new(iri.path_segments.size),
+        Event::RawShapeSeen.new(raw_shape),
+        Event::FingerprintSeen.new(hinted_shape),
+      ]
+      prefix = ""
+      hinted_entries.each do |entry|
+        events << Event::PositionSeen.new(
+          Position.path(host: keying_host, prefix: prefix),
+          entry[:value], entry[:type],
+        )
+        prefix = "#{prefix}/#{placeholder(entry)}"
+      end
+      key, host, scheme, shape = Cluster.key_for(iri, classifier: @classifier, shape: hinted_shape, host: keying_host)
+      events << Event::ClusterAddition.new(key, host, scheme, shape, iri)
+      events
     end
     # Corpus-informed normalization. Falls back to mechanical normalization
-    # when the corpus has no signal for a position.
+    # when the corpus has no signal for a position. Implemented as a thin
+    # call into Normalizer with `evidence: self`; the corpus-informed path
+    # and query rendering live in #render_path / #render_query below
+    # (the evidence-source interface).
     def normalize(input)
       iri = coerce(input)
-      return Normalizer.normalize_identifier(iri) if iri.urn? || iri.path_segments.empty?
+      Normalizer.normalize_identifier(iri, classifier: @classifier, hints: true, evidence: self)
+    end
+    # Evidence-source interface — called by Normalizer when this Corpus is
+    # passed as `evidence:`. Renders the path using corpus-informed
+    # classifications (variability promotion, popular-outlier preservation).
+    # Always emits a leading "/" — empty path collapses to "/" to match
+    # mechanical output and anchor any trailing query.
+    def render_path(iri, _classifier, _hints)
       tokens = annotate_segments(iri).map { |entry| corpus_token(entry) }
-      out = +""
-      out << "#{iri.scheme}://" if iri.scheme
-      out << iri.host if iri.host
-      out << ":#{iri.port}" if iri.port
-      out << "/" << tokens.join("/")
-      out
+      "/" + tokens.join("/")
+    end
+    # Evidence-source interface — render the query string with
+    # cluster-inferred param types where available. The mechanical
+    # NullEvidenceSource provides the classifier-only fallback; this
+    # version prefers the cluster's observed type per param (dominant
+    # type_count, subject to the corpus thresholds).
+    def render_query(iri, _classifier = @classifier)
+      hinted_shape = PathShape.new(classifier: @classifier, hints: true)
+                              .from_entries(SegmentHints.derive(iri.path_segments, @classifier))
+      key, * = Cluster.key_for(iri, classifier: @classifier, shape: hinted_shape,
+                               host: effective_host(iri.host))
+      cluster = @storage.cluster_for(key)
+      iri.query_params.keys.sort.map do |k|
+        v = iri.query_params[k].to_s
+        type = inferred_param_type(cluster, k, v)
+        shaped = render_param_value(v, type)
+        "#{k}=#{shaped}"
+      end.join("&")
+    end
+    # Inferred params for the cluster `input` would fall into. Returns the
+    # same shape as Cluster#param_summary — useful for "what query params
+    # might this URL accept?" tooling. Empty array if no cluster has been
+    # observed for this shape yet.
+    def params_for(input)
+      iri = coerce(input)
+      hinted_shape = PathShape.new(classifier: @classifier, hints: true)
+                              .from_entries(SegmentHints.derive(iri.path_segments, @classifier))
+      key, * = Cluster.key_for(iri, classifier: @classifier, shape: hinted_shape,
+                               host: effective_host(iri.host))
+      cluster = @storage.cluster_for(key)
+      cluster ? cluster.param_summary : []
     end
     # Per-segment explanation with corpus-informed `classification`.
@@ -89,55 +323,93 @@ module Iriq
       end
     end
+    def host_counts;        @storage.host_counts;        end
+    def path_length_counts; @storage.path_length_counts; end
+    def raw_shape_counts;   @storage.raw_shape_counts;   end
+    def fingerprint_counts; @storage.fingerprint_counts; end
+    # Iterates Position → PositionStats over all observed positions.
+    # Used by inspection tooling; not part of the hot path.
+    def each_position_stats(&block)
+      @storage.each_position_stats(&block)
+    end
     def clusters
-      @clusterer.clusters
+      @storage.clusters
     end
     def size
-      @clusterer.size
+      @storage.cluster_size
     end
-    # Stats for a given (host, prefix_shape) — useful for tests and
+    # Stats for a given (host, path-prefix) — useful for tests and
     # debugging. Returns nil if nothing has been observed there.
-    def stats_for(host, prefix)
-      @position_stats[[host, prefix]]
+    # Accepts either a Position or (host, prefix) for ergonomics.
+    def stats_for(host_or_position, prefix = nil)
+      position = host_or_position.is_a?(Position) ? host_or_position : Position.path(host: host_or_position, prefix: prefix)
+      @storage.position_stats(position)
     end
-    private
+    # Persist the corpus.
+    #
+    #   save()           → flush the backend in place (JSON writes its file,
+    #                      SQLite is already on disk).
+    #   save(same_path)  → same as save() — idempotent for the backend's path.
+    #   save(other_path) → export to other_path as JSON, regardless of the
+    #                      live backend.
+    def save(path = nil)
+      backend_path = @storage.respond_to?(:path) ? @storage.path : nil  # nil when the backend exposes no path
+      if path.nil? || path == backend_path  # compared with ==; presumably plain strings, so a differently spelled path to the same file takes the export branch
+        @storage.save
+      else
+        write_json_dump(path)  # export branch: always JSON, built from memory_view
+      end
+    end
-    def coerce(input)
-      input.is_a?(Identifier) ? input : Parser.parse(input)
+    def close
+      @storage.close
+    end
+    # Wrap many observations in a single backend transaction. For SQLite this
+    # turns thousands of fsyncs into one; for in-memory backends it's a
+    # no-op. Use when ingesting a batch.
+    def batch(&block)
+      @storage.batch(&block)
     end
-    def record_aggregates(iri, hinted_entries)
-      @host_counts[iri.host] += 1 if iri.host
-      @path_length_counts[iri.path_segments.size] += 1
+    private
-      raw = PathShape.new(classifier: @classifier, hints: false).from_entries(hinted_entries)
-      fp  = PathShape.new(classifier: @classifier, hints: true).from_entries(hinted_entries)
-      @raw_shape_counts[raw] += 1
-      @fingerprint_counts[fp] += 1
+    # If we're still sharing the module-level DEFAULT classifier, switch
+    # to our own copy so register_recognizer doesn't leak into other
+    # corpora using the same default singleton.
+    def ensure_per_corpus_classifier!
+      return if @classifier != SegmentClassifier::DEFAULT
-      record_position_stats(iri, hinted_entries)
+      @classifier = SegmentClassifier.new
     end
-    def record_position_stats(iri, hinted_entries)
-      prefix = ""
-      hinted_entries.each do |entry|
-        key   = [iri.host, prefix]
-        stats = @position_stats[key] ||= PositionStats.new(max_values: @max_values_per_position)
-        stats.observe(entry[:value], entry[:type])
-        prefix = "#{prefix}/#{placeholder(entry)}"
+    # On Corpus.open, walk the stored activations and register each one
+    # on this corpus's classifier. Switches to a per-corpus classifier
+    # if any activations exist.
+    def reapply_activated_recognizers!
+      return if @storage.activated_recognizer_count.zero?  # NOTE(review): Corpus.open only checks respond_to?(:each_activated_recognizer); activated_recognizer_count is assumed to exist too
+      ensure_per_corpus_classifier!  # switch off the shared DEFAULT before registering anything
+      @storage.each_activated_recognizer do |dump|
+        @classifier.register_recognizer(SynthesizedRecognizer.from_dump(dump))
       end
     end
-    # Walks the IRI's segments and returns hint-derived entries enriched with
-    # the (host, prefix) PositionStats reference and a :classification symbol.
+    def coerce(input)
+      input.is_a?(Identifier) ? input : Parser.parse(input)
+    end
     def annotate_segments(iri)
       hinted = SegmentHints.derive(iri.path_segments, @classifier)
       prefix = ""
+      keying_host = effective_host(iri.host)
       hinted.map do |entry|
-        stats = @position_stats[[iri.host, prefix]]
+        stats = @storage.position_stats(Position.path(host: keying_host, prefix: prefix))
         out = entry.merge(
           prefix:         prefix,
           classification: classify(entry, stats),
@@ -150,14 +422,28 @@ module Iriq
     def placeholder(entry)
       return entry[:value] unless entry[:variable]
-      "{#{entry[:hint] || entry[:type]}}"
+      "{#{entry[:hint] || SegmentClassifier.display_type(entry[:type])}}"
     end
+    # Types whose values are often a small fixed set (or a single static
+    # value baked into a REST route). For these, run through the same
+    # cardinality / value-fraction analysis literals get — a dominant
+    # value gets preserved as :stable_literal instead of being
+    # placeholdered as a generic {version}/{slug}/etc.
+    #
+    # Slug + opaque_id are here because a lot of route literals
+    # accidentally match those shapes (`/users/{id}/create-new`,
+    # reference codes like `WK1234`). When a single value dominates the
+    # position, the literal is almost always the better display.
+    STABLE_VARIABLE_TYPES = %i[version locale currency boolean slug opaque_id].freeze
     def classify(entry, stats)
       variable = entry[:variable]
       return variable ? :variable_identifier : :ambiguous if stats.nil? || stats.total.zero?
-      return :variable_identifier if variable
+      if variable && !STABLE_VARIABLE_TYPES.include?(entry[:type])
+        return :variable_identifier
+      end
       value            = entry[:value]
       total            = stats.total
@@ -166,6 +452,17 @@ module Iriq
       enough_data      = total >= MIN_OBSERVATIONS_FOR_INFERENCE
       value_frac       = stats.value_fraction(value)
+      # For STABLE_VARIABLE_TYPES (version, locale, currency, boolean, slug,
+      # opaque_id), a dominant value wins over the variable-dominance branch:
+      # a single-version /api/v1/... pattern stays as the literal `v1`
+      # rather than placeholdering to {version}. Without dominance,
+      # fall through to :variable_identifier (the per-type placeholder).
+      if variable
+        return :stable_literal if value_frac >= STABLE_LITERAL_THRESHOLD
+        return :variable_identifier
+      end
       if enough_data && variable_frac >= VARIABLE_DOMINANCE_THRESHOLD
         # Position is dominated by variable types (UUIDs, integers, etc.).
         # A literal here is a special-case outlier (e.g. /users/me).
@@ -204,6 +501,28 @@ module Iriq
       stats.value_fraction(value) >= POPULAR_BASELINE_MULTIPLE * baseline
     end
+    def inferred_param_type(cluster, name, value)
+      # Prefer the cluster's confident type when we have enough samples;
+      # otherwise classify the current value directly. Cluster#param_type
+      # applies the :date quorum gate (see Cluster::DATE_CONFIDENCE_THRESHOLD).
+      stats = cluster && cluster.param_stats[name]
+      if stats && stats.total >= MIN_OBSERVATIONS_FOR_INFERENCE
+        cluster.param_type(name) || @classifier.classify(value)
+      else
+        @classifier.classify(value)
+      end
+    end
+    def render_param_value(value, type)
+      if type == :date && (canon = SegmentClassifier.canonical_date(value))
+        canon
+      elsif @classifier.variable?(type)
+        "{#{SegmentClassifier.display_type(type)}}"
+      else
+        value
+      end
+    end
     def corpus_token(entry)
       case entry[:classification]
       when :variable_identifier, :corpus_inferred_variable
@@ -214,7 +533,13 @@ module Iriq
     end
     def placeholder_for_variable(entry)
-      return "{#{entry[:hint] || entry[:type]}}" if entry[:variable]
+      # Dates render in canonical ISO form rather than as a `{date}` placeholder
+      # — matches what mechanical Iriq.normalize does for path segments and
+      # what render_param_value does for query params.
+      if entry[:type] == :date && (canon = SegmentClassifier.canonical_date(entry[:value]))
+        return canon
+      end
+      return "{#{entry[:hint] || SegmentClassifier.display_type(entry[:type])}}" if entry[:variable]
       # corpus-inferred variable: classifier said literal, corpus says
       # otherwise. Derive a hint from the prefix's last literal segment if
@@ -226,43 +551,54 @@ module Iriq
     public
+    # --- Legacy dump/load (JSON shape) ------------------------------------
+    #
+    # The pre-Storage release exposed `Corpus#dump`, `Corpus#save(path)`, and
+    # `Corpus.load(path)` for JSON-backed persistence. Those names still work
+    # but are now thin wrappers around the appropriate Storage backend.
     def dump
-      {
-        "host_counts"             => @host_counts,
-        "path_length_counts"      => @path_length_counts.transform_keys(&:to_s),
-        "raw_shape_counts"        => @raw_shape_counts,
-        "fingerprint_counts"      => @fingerprint_counts,
-        "max_values_per_position" => @max_values_per_position,
-        "position_stats"          => @position_stats.map { |(host, prefix), s| [host, prefix, s.dump] },
-        "clusterer"               => @clusterer.dump,
-      }
-    end
-    def save(path)
-      tmp = "#{path}.tmp"
-      File.write(tmp, JSON.generate(dump))
-      File.rename(tmp, path)
+      memory_view.to_dump
     end
     def self.from_dump(h, classifier: SegmentClassifier::DEFAULT)
-      c = new(
-        classifier: classifier,
-        max_values_per_position: h.fetch("max_values_per_position", PositionStats::DEFAULT_MAX_VALUES),
-      )
-      c.instance_variable_set(:@host_counts,        Hash.new(0).merge(h["host_counts"]))
-      c.instance_variable_set(:@path_length_counts, Hash.new(0).merge(h["path_length_counts"].transform_keys(&:to_i)))
-      c.instance_variable_set(:@raw_shape_counts,   Hash.new(0).merge(h["raw_shape_counts"]))
-      c.instance_variable_set(:@fingerprint_counts, Hash.new(0).merge(h["fingerprint_counts"]))
-      stats = h["position_stats"].each_with_object({}) do |(host, prefix, sdump), acc|
-        acc[[host, prefix]] = PositionStats.from_dump(sdump)
-      end
-      c.instance_variable_set(:@position_stats, stats)
-      c.instance_variable_set(:@clusterer, Clusterer.from_dump(h["clusterer"], classifier: classifier))
-      c
+      max_values = h.fetch("max_values_per_position", PositionStats::DEFAULT_MAX_VALUES)
+      storage = Storage::Memory.new(classifier: classifier, max_values_per_position: max_values)
+      storage.load_dump!(h)
+      new(classifier: classifier, storage: storage)
     end
     def self.load(path, classifier: SegmentClassifier::DEFAULT)
-      from_dump(JSON.parse(File.read(path)), classifier: classifier)
+      open(path, classifier: classifier)
+    end
+    private
+    def write_json_dump(path)
+      tmp = "#{path}.tmp"
+      File.write(tmp, JSON.generate(memory_view.to_dump))
+      File.rename(tmp, path)
+    end
+    # Materialize a Memory snapshot of the current state — used by dump and
+    # write_json_dump for backends that don't natively emit the JSON shape.
+    def memory_view
+      return @storage if @storage.respond_to?(:to_dump)
+      mem = Storage::Memory.new(
+        classifier: @classifier,
+        max_values_per_position: @storage.max_values_per_position,
+      )
+      mem.instance_variable_set(:@host_counts,        Hash.new(0).merge(@storage.host_counts))
+      mem.instance_variable_set(:@path_length_counts, Hash.new(0).merge(@storage.path_length_counts))
+      mem.instance_variable_set(:@raw_shape_counts,   Hash.new(0).merge(@storage.raw_shape_counts))
+      mem.instance_variable_set(:@fingerprint_counts, Hash.new(0).merge(@storage.fingerprint_counts))
+      ps = {}
+      @storage.each_position_stats { |key, stats| ps[key] = stats }
+      mem.instance_variable_set(:@position_stats, ps)
+      clusters_h = @storage.clusters.each_with_object({}) { |c, h| h[c.key] = c }
+      mem.instance_variable_set(:@clusters, clusters_h)
+      mem
     end
   end
 end

data/lib/iriq/cross_host_shape.rb ADDED Viewed

@@ -0,0 +1,37 @@
+require "set"
+module Iriq
+  # A route shape that recurs across multiple hosts.
+  #
+  # Emitted by Corpus#cross_host_shapes. The shape string ("/users/{user_id}")
+  # is the cluster's rendered placeholder form; two clusters with the same
+  # shape but different hosts coalesce into one CrossHostShape record.
+  #
+  # A shape appearing at N hosts is strong evidence of a semantic pattern
+  # rather than a host-local quirk — independent hosts are unlikely to
+  # invent the same `/users/{integer}` structure by accident. Future work
+  # can feed this signal into proposal confidence and corpus-informed
+  # normalization (raise weight when a Shape has cross-host support).
+  class CrossHostShape
+    attr_reader :shape, :hosts, :observation_count
+    def initialize(shape:, hosts:, observation_count:)
+      @shape             = shape
+      @hosts             = hosts.is_a?(Set) ? hosts.dup.freeze : Set.new(hosts).freeze  # always a frozen Set; a Set argument is dup'ed so the caller's own set is not frozen
+      @observation_count = observation_count
+    end
+    def host_count
+      @hosts.size  # number of distinct hosts
+    end
+    def to_h
+      {
+        shape:             @shape,
+        hosts:             @hosts.to_a.sort,  # sorted array so the output order is deterministic
+        host_count:        host_count,
+        observation_count: @observation_count,
+      }
+    end
+  end
+end

data/lib/iriq/event.rb ADDED Viewed

@@ -0,0 +1,22 @@
+module Iriq
+  # Events are the atomic observation-time facts emitted by Corpus#observe
+  # before any state changes. A single observe(iri) call emits a small
+  # ordered list of Events; Reducers consume that list to update materialized
+  # views (host counts, position stats, clusters, etc.).
+  #
+  # Today the event list is transient — built fresh per observe(), applied,
+  # and discarded. The shape is in place so a future commit can persist the
+  # log and replay it to re-derive materialized views without re-feeding
+  # source IRIs (the "re-runnable inference" win from ROADMAP.md).
+  #
+  # Each Event is a Struct so callers can pattern-match on type and access
+  # fields positionally or by name.
+  module Event
+    HostSeen        = Struct.new(:host)  # keying host, i.e. the host after Corpus#effective_host
+    PathLengthSeen  = Struct.new(:length)  # number of path segments in the IRI
+    RawShapeSeen    = Struct.new(:shape)  # PathShape built with hints: false
+    FingerprintSeen = Struct.new(:shape)  # PathShape built with hints: true
+    PositionSeen    = Struct.new(:position, :value, :type)  # one per path segment: Position plus the segment's value and type
+    ClusterAddition = Struct.new(:key, :host, :scheme, :shape, :identifier)  # fields from Cluster.key_for plus the source IRI; Corpus#observe uses this event's Reducer result as the cluster
+  end
+end