RubyGems - iriq - Versions diffs - 0.1.0 → 0.2.0 - Mend

iriq 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

data/lib/iriq/storage/sqlite.rb ADDED Viewed

@@ -0,0 +1,367 @@
+require "sqlite3"
+module Iriq
+  module Storage
+    # Sqlite is the incremental-write backend. Each observation translates
+    # to a handful of UPSERTs against a long-lived connection; nothing is
+    # materialized in memory beyond what reads explicitly ask for.
+    #
+    # WAL journaling lets multiple processes observe against the same file
+    # concurrently — the writer is serialized, readers are not blocked, and
+    # the existing `iriq --corpus c.db <url>` pattern works without a flock
+    # at the application layer.
+    class Sqlite
+      # Version stamp stored in the `meta` table under 'schema_version' the
+      # first time a database is set up (see #setup!).
+      SCHEMA_VERSION = 1
+      # DDL run by #setup!. Every statement is CREATE TABLE IF NOT EXISTS, so
+      # executing it against an already-initialised database changes nothing.
+      #
+      #   meta                 key/value settings (schema_version,
+      #                        max_values_per_position)
+      #   host_counts          one counter row per host
+      #   path_length_counts   one counter row per path length
+      #   raw_shape_counts     one counter row per raw shape
+      #   fingerprint_counts   one counter row per fingerprint shape
+      #   position_stats       total observations per (host, prefix)
+      #   position_values      value counts per (host, prefix)
+      #   position_types       type counts per (host, prefix)
+      #   clusters             one row per cluster key; `ord` is assigned on
+      #                        insert and used to order reads
+      #   cluster_examples     stored example canonical strings per cluster
+      #   cluster_segments     segment value counts per cluster and position
+      SCHEMA = <<~SQL.freeze
+        CREATE TABLE IF NOT EXISTS meta (
+          key   TEXT PRIMARY KEY,
+          value TEXT
+        );
+        CREATE TABLE IF NOT EXISTS host_counts (
+          host  TEXT PRIMARY KEY,
+          count INTEGER NOT NULL
+        );
+        CREATE TABLE IF NOT EXISTS path_length_counts (
+          length INTEGER PRIMARY KEY,
+          count  INTEGER NOT NULL
+        );
+        CREATE TABLE IF NOT EXISTS raw_shape_counts (
+          shape TEXT PRIMARY KEY,
+          count INTEGER NOT NULL
+        );
+        CREATE TABLE IF NOT EXISTS fingerprint_counts (
+          shape TEXT PRIMARY KEY,
+          count INTEGER NOT NULL
+        );
+        CREATE TABLE IF NOT EXISTS position_stats (
+          host   TEXT NOT NULL,
+          prefix TEXT NOT NULL,
+          total  INTEGER NOT NULL DEFAULT 0,
+          PRIMARY KEY (host, prefix)
+        );
+        CREATE TABLE IF NOT EXISTS position_values (
+          host   TEXT NOT NULL,
+          prefix TEXT NOT NULL,
+          value  TEXT NOT NULL,
+          count  INTEGER NOT NULL,
+          PRIMARY KEY (host, prefix, value)
+        );
+        CREATE TABLE IF NOT EXISTS position_types (
+          host   TEXT NOT NULL,
+          prefix TEXT NOT NULL,
+          type   TEXT NOT NULL,
+          count  INTEGER NOT NULL,
+          PRIMARY KEY (host, prefix, type)
+        );
+        CREATE TABLE IF NOT EXISTS clusters (
+          key    TEXT PRIMARY KEY,
+          host   TEXT,
+          scheme TEXT,
+          shape  TEXT,
+          count  INTEGER NOT NULL DEFAULT 0,
+          ord    INTEGER NOT NULL
+        );
+        CREATE TABLE IF NOT EXISTS cluster_examples (
+          cluster_key TEXT NOT NULL,
+          position    INTEGER NOT NULL,
+          canonical   TEXT NOT NULL,
+          PRIMARY KEY (cluster_key, position)
+        );
+        CREATE TABLE IF NOT EXISTS cluster_segments (
+          cluster_key TEXT NOT NULL,
+          position    INTEGER NOT NULL,
+          value       TEXT NOT NULL,
+          count       INTEGER NOT NULL,
+          PRIMARY KEY (cluster_key, position, value)
+        );
+      SQL
+      # path: the location handed to SQLite3::Database.new.
+      # max_values_per_position: cap on distinct values kept per (host, prefix);
+      # may be replaced by the value stored in the database during #setup!.
+      attr_reader :path, :max_values_per_position
+      def self.open(path, classifier: SegmentClassifier::DEFAULT,
+                          max_values_per_position: PositionStats::DEFAULT_MAX_VALUES)
+        new(path: path, classifier: classifier, max_values_per_position: max_values_per_position).tap(&:setup!)
+      end
+      def initialize(path:, classifier: SegmentClassifier::DEFAULT,
+                     max_values_per_position: PositionStats::DEFAULT_MAX_VALUES)
+        @path                    = path
+        @classifier              = classifier
+        @max_values_per_position = max_values_per_position
+        @db                      = SQLite3::Database.new(path)
+        # busy_timeout MUST come first: other PRAGMAs (journal_mode in
+        # particular) can themselves block on the write lock under
+        # concurrent open, and without busy_timeout set they fail
+        # immediately with SQLITE_BUSY.
+        @db.execute("PRAGMA busy_timeout = 30000")
+        @db.execute("PRAGMA journal_mode = WAL")
+        @db.execute("PRAGMA synchronous = NORMAL")
+        @db.execute("PRAGMA foreign_keys = ON")
+        @in_batch = false
+      end
+      def setup!
+        @db.execute_batch(SCHEMA)
+        existing = @db.get_first_value("SELECT value FROM meta WHERE key = 'schema_version'")
+        if existing.nil?
+          @db.execute("INSERT INTO meta (key, value) VALUES ('schema_version', ?)", SCHEMA_VERSION.to_s)
+          @db.execute("INSERT INTO meta (key, value) VALUES ('max_values_per_position', ?)",
+                      @max_values_per_position.to_s)
+        else
+          @max_values_per_position = (@db.get_first_value(
+            "SELECT value FROM meta WHERE key = 'max_values_per_position'"
+          ) || @max_values_per_position).to_i
+        end
+        self
+      end
+      def transaction
+        # While inside an outer batch, observe()-time transactions become
+        # no-ops — the outer batch wraps everything in one txn for speed.
+        return yield(self) if @in_batch
+        @db.transaction
+        yield self
+        @db.commit
+      rescue
+        @db.rollback rescue nil
+        raise
+      end
+      # Wrap many observations in a single transaction. Cuts SQLite write
+      # overhead from O(observations) fsyncs to O(1).
+      def batch
+        return yield if @in_batch
+        @in_batch = true
+        @db.transaction
+        begin
+          yield
+          @db.commit
+        rescue
+          @db.rollback rescue nil
+          raise
+        ensure
+          @in_batch = false
+        end
+      end
+      # Saving is automatic — incremental UPSERTs hit disk on commit. flush
+      # makes that explicit; close releases the connection.
+      def flush; end
+      # No-op that returns nil. The optional `_path` argument is accepted and
+      # ignored.
+      def save(_path = nil)
+        # Already persisted. Provided for parity with the JSON backend.
+      end
+      def close
+        # Checkpoint + truncate the WAL so the .db-wal sidecar doesn't grow
+        # unbounded across long-lived `iriq --corpus c.db` sessions.
+        @db.execute("PRAGMA wal_checkpoint(TRUNCATE)") rescue nil
+        @db.close
+      end
+      # --- Increments -------------------------------------------------------
+      def increment_host(host)
+        return unless host
+        @db.execute(<<~SQL, host)
+          INSERT INTO host_counts (host, count) VALUES (?, 1)
+          ON CONFLICT(host) DO UPDATE SET count = count + 1
+        SQL
+      end
+      def increment_path_length(length)
+        @db.execute(<<~SQL, length)
+          INSERT INTO path_length_counts (length, count) VALUES (?, 1)
+          ON CONFLICT(length) DO UPDATE SET count = count + 1
+        SQL
+      end
+      # Adds one to the raw_shape_counts row for `shape` (inserted at 1 if
+      # absent).
+      def increment_raw_shape(shape)
+        upsert_shape("raw_shape_counts", shape)
+      end
+      # Adds one to the fingerprint_counts row for `shape` (inserted at 1 if
+      # absent).
+      def increment_fingerprint(shape)
+        upsert_shape("fingerprint_counts", shape)
+      end
+      # Records one observation of `value`, classified as `type`, at the
+      # position identified by (host, prefix). A nil host is stored as "".
+      #
+      # Issues several statements and does not open a transaction itself.
+      def observe_position(host, prefix, value, type)
+        host ||= ""
+        @db.execute(<<~SQL, [host, prefix])
+          INSERT INTO position_stats (host, prefix, total) VALUES (?, ?, 1)
+          ON CONFLICT(host, prefix) DO UPDATE SET total = total + 1
+        SQL
+        # Type counts are unbounded — always upsert.
+        @db.execute(<<~SQL, [host, prefix, type.to_s])
+          INSERT INTO position_types (host, prefix, type, count) VALUES (?, ?, ?, 1)
+          ON CONFLICT(host, prefix, type) DO UPDATE SET count = count + 1
+        SQL
+        # Value counts are capped at max_values_per_position. If the value
+        # already exists, increment it; otherwise insert only when
+        # cardinality is below the cap. Two-step rather than ON CONFLICT
+        # because we need to enforce the cap on insert.
+        @db.execute(<<~SQL, [host, prefix, value])
+          UPDATE position_values SET count = count + 1
+          WHERE host = ? AND prefix = ? AND value = ?
+        SQL
+        # `changes` refers to the UPDATE just above: zero rows touched means
+        # this value has no row yet. Must be read before any other statement.
+        if @db.changes.zero?
+          card = @db.get_first_value(
+            "SELECT COUNT(*) FROM position_values WHERE host = ? AND prefix = ?",
+            [host, prefix],
+          )
+          # At or over the cap the new value is dropped; the total and type
+          # counts above have still been bumped.
+          if card < @max_values_per_position
+            @db.execute(
+              "INSERT INTO position_values (host, prefix, value, count) VALUES (?, ?, ?, 1)",
+              [host, prefix, value],
+            )
+          end
+        end
+      end
+      # Records `identifier` as a member of the cluster `key`: upserts the
+      # cluster row, stores the identifier's canonical form as an example
+      # while fewer than Cluster::MAX_EXAMPLES are stored, and bumps the
+      # per-position count for each of its path segments.
+      #
+      # host, scheme and shape are only written when the cluster row is first
+      # inserted; on conflict just the count changes.
+      #
+      # Returns the cluster as re-read from the database by load_cluster.
+      def add_to_cluster(key, host, scheme, shape, identifier)
+        # Insert the cluster row if new (with a monotonic ord for stable
+        # iteration), then bump its count.
+        @db.execute(<<~SQL, [key, host, scheme, shape])
+          INSERT INTO clusters (key, host, scheme, shape, count, ord)
+          VALUES (?, ?, ?, ?, 1, (SELECT COALESCE(MAX(ord), 0) + 1 FROM clusters))
+          ON CONFLICT(key) DO UPDATE SET count = count + 1
+        SQL
+        # Examples — capped at Cluster::MAX_EXAMPLES.
+        examples_count = @db.get_first_value(
+          "SELECT COUNT(*) FROM cluster_examples WHERE cluster_key = ?", [key],
+        )
+        if examples_count < Cluster::MAX_EXAMPLES
+          # The current count doubles as the next 0-based position.
+          @db.execute(<<~SQL, [key, examples_count, identifier.canonical])
+            INSERT INTO cluster_examples (cluster_key, position, canonical)
+            VALUES (?, ?, ?)
+          SQL
+        end
+        # Per-position segment counts — uncapped.
+        identifier.path_segments.each_with_index do |seg, i|
+          @db.execute(<<~SQL, [key, i, seg])
+            INSERT INTO cluster_segments (cluster_key, position, value, count) VALUES (?, ?, ?, 1)
+            ON CONFLICT(cluster_key, position, value) DO UPDATE SET count = count + 1
+          SQL
+        end
+        load_cluster(key)
+      end
+      # --- Reads ------------------------------------------------------------
+      # Hash of host => count read from host_counts; missing keys read as 0.
+      def host_counts
+        rows_to_count_hash("host_counts", "host")
+      end
+      def path_length_counts
+        h = Hash.new(0)
+        @db.execute("SELECT length, count FROM path_length_counts") { |r| h[r[0]] = r[1] }
+        h
+      end
+      # Hash of shape => count read from raw_shape_counts; missing keys read
+      # as 0.
+      def raw_shape_counts
+        rows_to_count_hash("raw_shape_counts", "shape")
+      end
+      # Hash of shape => count read from fingerprint_counts; missing keys
+      # read as 0.
+      def fingerprint_counts
+        rows_to_count_hash("fingerprint_counts", "shape")
+      end
+      def position_stats(host, prefix)
+        host ||= ""
+        total = @db.get_first_value(
+          "SELECT total FROM position_stats WHERE host = ? AND prefix = ?", [host, prefix],
+        )
+        return nil if total.nil?
+        stats = PositionStats.new(max_values: @max_values_per_position)
+        stats.instance_variable_set(:@total, total)
+        vc = Hash.new(0)
+        @db.execute(
+          "SELECT value, count FROM position_values WHERE host = ? AND prefix = ?", [host, prefix]
+        ) { |r| vc[r[0]] = r[1] }
+        stats.instance_variable_set(:@value_counts, vc)
+        tc = Hash.new(0)
+        @db.execute(
+          "SELECT type, count FROM position_types WHERE host = ? AND prefix = ?", [host, prefix]
+        ) { |r| tc[r[0].to_sym] = r[1] }
+        stats.instance_variable_set(:@type_counts, tc)
+        stats
+      end
+      def each_position_stats
+        seen = []
+        @db.execute("SELECT DISTINCT host, prefix FROM position_stats ORDER BY ROWID") do |row|
+          seen << row
+        end
+        seen.each { |host, prefix| yield [host, prefix], position_stats(host, prefix) }
+      end
+      # Every cluster, fully loaded, as an Array in ascending `ord` order.
+      # Runs one load_cluster (several queries) per cluster row, from inside
+      # the block of the key query.
+      def clusters
+        out = []
+        @db.execute("SELECT key FROM clusters ORDER BY ord") do |row|
+          out << load_cluster(row[0])
+        end
+        out
+      end
+      # Number of rows in the clusters table.
+      def cluster_size
+        @db.get_first_value("SELECT COUNT(*) FROM clusters")
+      end
+      private
+      # Adds one to the `shape` row of `table`, inserting it at 1 if absent.
+      #
+      # `table` is interpolated into the SQL, not bound; the callers in this
+      # class only ever pass the literal names "raw_shape_counts" and
+      # "fingerprint_counts". `shape` is passed as a bound parameter.
+      def upsert_shape(table, shape)
+        @db.execute(<<~SQL, shape)
+          INSERT INTO #{table} (shape, count) VALUES (?, 1)
+          ON CONFLICT(shape) DO UPDATE SET count = count + 1
+        SQL
+      end
+      def rows_to_count_hash(table, key_col)
+        h = Hash.new(0)
+        @db.execute("SELECT #{key_col}, count FROM #{table}") { |r| h[r[0]] = r[1] }
+        h
+      end
+      # Assembles a Cluster for `key` from the clusters, cluster_examples and
+      # cluster_segments tables. Returns nil when no clusters row exists.
+      #
+      # Count, examples and segment counts are written straight into the
+      # Cluster's instance variables.
+      def load_cluster(key)
+        row = @db.get_first_row(
+          "SELECT key, host, scheme, shape, count FROM clusters WHERE key = ?", [key],
+        )
+        return nil unless row
+        c = Cluster.new(key: row[0], host: row[1], scheme: row[2], shape: row[3])
+        c.instance_variable_set(:@count, row[4])
+        # Examples are stored as canonical strings and re-parsed on load.
+        examples = []
+        @db.execute(
+          "SELECT canonical FROM cluster_examples WHERE cluster_key = ? ORDER BY position", [key]
+        ) { |r| examples << Parser.parse(r[0]) }
+        c.instance_variable_set(:@examples, examples)
+        # Array indexed by segment position, each entry a value => count
+        # Hash; a position with no rows is left as nil.
+        seg_counts = []
+        @db.execute(
+          "SELECT position, value, count FROM cluster_segments WHERE cluster_key = ? ORDER BY position",
+          [key],
+        ) do |r|
+          pos = r[0]
+          seg_counts[pos] ||= Hash.new(0)
+          seg_counts[pos][r[1]] = r[2]
+        end
+        c.instance_variable_set(:@segment_counts, seg_counts)
+        c
+      end
+    end
+  end
+end

data/lib/iriq/storage.rb ADDED Viewed

@@ -0,0 +1,35 @@
+module Iriq
+  # Storage is the persistence layer for a Corpus. It owns every counter and
+  # per-(host, prefix) frequency map; the Corpus class delegates state to it.
+  #
+  # Three concrete backends ship:
+  #
+  #   Storage::Memory   — in-memory only; matches the original behavior.
+  #   Storage::Json     — Memory backend wrapped with load/save against a JSON file.
+  #   Storage::Sqlite   — incremental UPSERTs against a SQLite database.
+  #
+  # File-extension dispatch keeps callers simple: `.json` (or anything else)
+  # picks Json, `.db`/`.sqlite`/`.sqlite3` picks Sqlite.
+  module Storage
+    SQLITE_EXTS = %w[.db .sqlite .sqlite3].freeze
+    module_function
+    # Opens (or creates) a storage at `path`, picking the backend by extension.
+    # If `path` is nil, returns a Memory backend.
+    def open(path, classifier: SegmentClassifier::DEFAULT,
+                   max_values_per_position: PositionStats::DEFAULT_MAX_VALUES)
+      return Memory.new(classifier: classifier, max_values_per_position: max_values_per_position) if path.nil?
+      if SQLITE_EXTS.include?(File.extname(path).downcase)
+        require "iriq/storage/sqlite"
+        Sqlite.open(path, classifier: classifier, max_values_per_position: max_values_per_position)
+      else
+        require "iriq/storage/json"
+        Json.open(path, classifier: classifier, max_values_per_position: max_values_per_position)
+      end
+    end
+  end
+end
+require "iriq/storage/memory"

data/lib/iriq/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Iriq
-  VERSION = "0.1.0"
+  VERSION = "0.2.0"
 end

data/lib/iriq.rb CHANGED Viewed

@@ -12,6 +12,7 @@ require "iriq/cluster"
 require "iriq/clusterer"
 require "iriq/position_stats"
 require "iriq/observation"
+require "iriq/storage"
 require "iriq/corpus"
 require "iriq/extractor"
 require "iriq/cli"

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: iriq
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.2.0
 platform: ruby
 authors:
 - Daniel Pepper
@@ -65,17 +65,32 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: '0.22'
-description: Semantic IRI/URI/URL/URN parsing, normalization, classification, and
-  clustering.
+- !ruby/object:Gem::Dependency
+  name: sqlite3
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '1.6'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '1.6'
+description: IRI extraction, normalization, and clustering.
 executables:
 - iriq
 extensions: []
 extra_rdoc_files: []
 files:
 - CHANGELOG.md
+- CLAUDE.md
 - Gemfile
 - Gemfile.lock
 - LICENSE.txt
+- Makefile
 - README.md
 - exe/iriq
 - iriq.gemspec
@@ -96,9 +111,11 @@ files:
 - lib/iriq/position_stats.rb
 - lib/iriq/segment_classifier.rb
 - lib/iriq/segment_hints.rb
+- lib/iriq/storage.rb
+- lib/iriq/storage/json.rb
+- lib/iriq/storage/memory.rb
+- lib/iriq/storage/sqlite.rb
 - lib/iriq/version.rb
-- script/benchmark.rb
-- script/memory.rb
 homepage: https://github.com/dpep/iriq
 licenses:
 - MIT
@@ -119,5 +136,5 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 requirements: []
 rubygems_version: 3.6.9
 specification_version: 4
-summary: Semantic IRI normalization and clustering.
+summary: IRI extraction, normalization, and clustering.
 test_files: []

data/script/benchmark.rb DELETED Viewed

@@ -1,81 +0,0 @@
-#!/usr/bin/env ruby
-# Performance benchmark for the main hot paths in Iriq.
-#
-# Usage:
-#   bundle exec script/benchmark.rb              # default sizes
-#   bundle exec script/benchmark.rb 50000        # custom "large" size
-#
-# Inputs are generated deterministically from IriGenerator so results are
-# comparable across runs.
-require "benchmark"
-require "tempfile"
-$LOAD_PATH.unshift File.expand_path("../lib", __dir__)
-$LOAD_PATH.unshift File.expand_path("../spec/support", __dir__)
-require "iriq"
-require "iri_generator"
-LARGE = Integer(ARGV[0] || 10_000)
-SMALL = [LARGE / 10, 1_000].min
-HUGE  = LARGE * 10
-puts "Iriq benchmark — Ruby #{RUBY_VERSION}, Iriq #{Iriq::VERSION}"
-puts "Sizes: small=#{SMALL}, large=#{LARGE}, huge=#{HUGE}"
-puts
-small_urls = IriGenerator.urls(count: SMALL, seed: 1)
-large_urls = IriGenerator.urls(count: LARGE, seed: 1)
-huge_urls  = IriGenerator.urls(count: HUGE,  seed: 1)
-# ~ LARGE URLs embedded in prose
-text_blob = small_urls.map { |u| "Some prose about #{u} here, also random words." }.join(" ") * (LARGE / SMALL)
-puts "Text blob: #{text_blob.bytesize / 1024} KB (~#{LARGE} URLs embedded)"
-puts
-results = {}
-Benchmark.bm(42) do |x|
-  results[:parse]     = x.report("parse #{LARGE} URLs")                  { large_urls.each { |u| Iriq.parse(u) } }
-  results[:normalize] = x.report("normalize #{LARGE} URLs (deterministic)") { large_urls.each { |u| Iriq.normalize(u) } }
-  results[:explain]   = x.report("explain #{LARGE} URLs (deterministic)")   { large_urls.each { |u| Iriq.explain(u) } }
-  results[:extract]   = x.report("extract from ~#{text_blob.bytesize / 1024} KB text")     { Iriq.extract(text_blob) }
-  results[:observe_small] = x.report("Corpus.observe #{SMALL} URLs") do
-    c = Iriq::Corpus.new
-    small_urls.each { |u| c.observe(u) }
-  end
-  results[:observe_large] = x.report("Corpus.observe #{LARGE} URLs") do
-    c = Iriq::Corpus.new
-    large_urls.each { |u| c.observe(u) }
-  end
-  results[:observe_huge] = x.report("Corpus.observe #{HUGE} URLs") do
-    c = Iriq::Corpus.new
-    huge_urls.each { |u| c.observe(u) }
-  end
-  results[:roundtrip] = x.report("Corpus save+load (#{LARGE} observations)") do
-    c = Iriq::Corpus.new
-    large_urls.each { |u| c.observe(u) }
-    Tempfile.open(["iriq-bench", ".json"]) do |f|
-      c.save(f.path)
-      Iriq::Corpus.load(f.path)
-    end
-  end
-end
-puts
-puts "Throughput summary:"
-[
-  [:parse,         LARGE, "URLs/s"],
-  [:normalize,     LARGE, "URLs/s"],
-  [:explain,       LARGE, "URLs/s"],
-  [:observe_small, SMALL, "URLs/s"],
-  [:observe_large, LARGE, "URLs/s"],
-  [:observe_huge,  HUGE,  "URLs/s"],
-].each do |key, n, unit|
-  per_sec = n / results[key].real
-  printf("  %-30s %12s %s\n", key, per_sec.round.to_s, unit)
-end
-extract_mb = text_blob.bytesize / (1024.0 * 1024.0)
-printf("  %-30s %12s MB/s\n", :extract, (extract_mb / results[:extract].real).round(2).to_s)

data/script/memory.rb DELETED Viewed

@@ -1,121 +0,0 @@
-#!/usr/bin/env ruby
-# Memory profile for the main code paths in Iriq.
-#
-# Usage:
-#   bundle exec script/memory.rb              # default sizes
-#   bundle exec script/memory.rb 50000        # custom corpus size
-#
-# Reports retained memory per operation, cache footprints, and memory
-# growth across corpus sizes (to verify linear scaling — no leaks).
-require "objspace"
-$LOAD_PATH.unshift File.expand_path("../lib", __dir__)
-$LOAD_PATH.unshift File.expand_path("../spec/support", __dir__)
-require "iriq"
-require "iri_generator"
-CORPUS_SIZE = Integer(ARGV[0] || 10_000)
-SIZES       = [1_000, 10_000, 100_000].uniq.sort
-SIZES << CORPUS_SIZE unless SIZES.include?(CORPUS_SIZE)
-SIZES.sort!
-# Bytes → KB / MB string for display.
-def fmt_bytes(n)
-  if n < 1024
-    "#{n} B"
-  elsif n < 1024 * 1024
-    format("%.1f KB", n / 1024.0)
-  else
-    format("%.2f MB", n / (1024.0 * 1024.0))
-  end
-end
-# Run a block in isolation: GC before + after, return delta in bytes.
-def measure_retained(&block)
-  GC.start
-  before = ObjectSpace.memsize_of_all
-  result = block.call
-  GC.start
-  after  = ObjectSpace.memsize_of_all
-  [after - before, result]
-end
-# Reset caches so each scenario starts clean.
-def reset_caches
-  Iriq::SegmentClassifier::DEFAULT.instance_variable_get(:@cache).clear
-  Iriq::Inflector.instance_variable_get(:@cache)&.clear
-end
-puts "Iriq memory profile — Ruby #{RUBY_VERSION}, Iriq #{Iriq::VERSION}"
-puts
-# -- Section 1: memory growth across corpus sizes --
-puts "── corpus retained memory by N (verifies linear growth) ──"
-printf("  %-12s %-14s %-14s %-10s\n", "N obs", "retained", "per obs", "allocs")
-SIZES.each do |n|
-  reset_caches
-  urls = IriGenerator.urls(count: n, seed: 1)
-  alloc_before = GC.stat(:total_allocated_objects)
-  retained, _ = measure_retained do
-    c = Iriq::Corpus.new
-    urls.each { |u| c.observe(u) }
-    c
-  end
-  alloc_total = GC.stat(:total_allocated_objects) - alloc_before
-  printf("  %-12s %-14s %-14s %-10s\n", n, fmt_bytes(retained), fmt_bytes(retained / n), alloc_total)
-end
-puts
-# -- Section 2: corpus state breakdown at CORPUS_SIZE --
-puts "── corpus state breakdown at N=#{CORPUS_SIZE} ──"
-reset_caches
-urls = IriGenerator.urls(count: CORPUS_SIZE, seed: 1)
-corpus = Iriq::Corpus.new
-urls.each { |u| corpus.observe(u) }
-puts "  unique hosts:           #{corpus.host_counts.size}"
-puts "  unique fingerprints:    #{corpus.fingerprint_counts.size}"
-puts "  unique raw shapes:      #{corpus.raw_shape_counts.size}"
-puts "  clusters:               #{corpus.size}"
-puts "  position_stats entries: #{corpus.position_stats.size}"
-puts "  total observed values:  #{corpus.position_stats.sum { |_, s| s.value_counts.size }}"
-puts
-# -- Section 3: cache footprints --
-puts "── memoization caches ──"
-classifier_cache = Iriq::SegmentClassifier::DEFAULT.instance_variable_get(:@cache)
-inflector_cache  = Iriq::Inflector.instance_variable_get(:@cache) || {}
-puts "  classifier cache: #{classifier_cache.size} entries (cap #{Iriq::SegmentClassifier::CACHE_MAX})"
-puts "  inflector cache:  #{inflector_cache.size} entries (cap #{Iriq::Inflector::CACHE_MAX})"
-puts
-# -- Section 4: per-operation memory cost --
-puts "── retained memory per operation (N=#{CORPUS_SIZE}) ──"
-urls = IriGenerator.urls(count: CORPUS_SIZE, seed: 1)
-text_blob = urls.map { |u| "Some prose about #{u} here." }.join(" ")
-[
-  ["parse #{CORPUS_SIZE} URLs (discarded after)", ->{ urls.each { |u| Iriq.parse(u) } }],
-  ["normalize #{CORPUS_SIZE} URLs",               ->{ urls.each { |u| Iriq.normalize(u) } }],
-  ["explain #{CORPUS_SIZE} URLs",                 ->{ urls.each { |u| Iriq.explain(u) } }],
-  ["extract from #{fmt_bytes(text_blob.bytesize)} prose", ->{ Iriq.extract(text_blob) }],
-  ["Corpus.observe #{CORPUS_SIZE} URLs",          ->{ c = Iriq::Corpus.new; urls.each { |u| c.observe(u) }; c }],
-].each do |label, op|
-  reset_caches
-  retained, _ = measure_retained(&op)
-  printf("  %-50s %s\n", label, fmt_bytes(retained))
-end
-puts
-# -- Section 5: persistence overhead --
-puts "── save/load roundtrip (N=#{CORPUS_SIZE}) ──"
-require "tempfile"
-reset_caches
-corpus = Iriq::Corpus.new
-urls.each { |u| corpus.observe(u) }
-Tempfile.open(["iriq-mem", ".json"]) do |f|
-  corpus.save(f.path)
-  bytes = File.size(f.path)
-  puts "  JSON file on disk:  #{fmt_bytes(bytes)}"
-  puts "  ratio:              #{format("%.2f bytes/obs", bytes.to_f / CORPUS_SIZE)}"
-end