RubyGems - iriq - Versions diffs - 0.0.1 → 0.1.0 - Mend

iriq 0.0.1 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23)

checksums.yaml +4 -4
data/CHANGELOG.md +16 -0
data/Gemfile.lock +2 -2
data/README.md +227 -33
data/lib/iriq/cli.rb +288 -100
data/lib/iriq/cluster.rb +23 -0
data/lib/iriq/clusterer.rb +32 -17
data/lib/iriq/corpus.rb +268 -0
data/lib/iriq/explanation.rb +6 -22
data/lib/iriq/extractor.rb +125 -0
data/lib/iriq/identifier.rb +11 -3
data/lib/iriq/inflector.rb +145 -0
data/lib/iriq/normalizer.rb +11 -8
data/lib/iriq/observation.rb +25 -0
data/lib/iriq/path_shape.rb +27 -9
data/lib/iriq/position_stats.rb +64 -0
data/lib/iriq/segment_classifier.rb +31 -7
data/lib/iriq/segment_hints.rb +32 -0
data/lib/iriq/version.rb +1 -1
data/lib/iriq.rb +10 -0
data/script/benchmark.rb +81 -0
data/script/memory.rb +121 -0
metadata +9 -1

data/lib/iriq/path_shape.rb CHANGED

@@ -1,27 +1,45 @@
 module Iriq
   # Converts a sequence of path segments into a route-shape string by
-  # replacing variable segments with `{type}` placeholders.
+  # replacing variable segments with `{hint}` placeholders, falling back to
+  # `{type}` when no hint is available.
   #
   #   PathShape.for(["users", "123", "orders", "456"])
-  #   # => "/users/{integer_id}/orders/{integer_id}"
+  #   # => "/users/{user_id}/orders/{order_id}"
+  #
+  # Pass `hints: false` to use raw types instead:
+  #
+  #   PathShape.for(["users", "123"], hints: false)
+  #   # => "/users/{integer_id}"
   class PathShape
-    def initialize(classifier: SegmentClassifier.new)
+    def initialize(classifier: SegmentClassifier::DEFAULT, hints: true)
       @classifier = classifier
+      @hints      = hints
     end
     def for(segments)
       return "/" if segments.nil? || segments.empty?
-      "/" + segments.map { |s| shape_segment(s) }.join("/")
+      from_entries(SegmentHints.derive(segments, @classifier))
+    end
+    # Build a shape string from already-derived SegmentHints entries.
+    # Used by Corpus to avoid re-deriving entries per observation when it
+    # needs multiple shape variants (raw and hinted).
+    def from_entries(entries)
+      return "/" if entries.nil? || entries.empty?
+      "/" + entries.map { |e| shape_token(e) }.join("/")
     end
-    def shape_segment(segment)
-      type = @classifier.classify(segment)
-      @classifier.variable?(type) ? "{#{type}}" : segment
+    def shape_token(entry)
+      return entry[:value] unless entry[:variable]
+      placeholder = @hints ? (entry[:hint] || entry[:type]) : entry[:type]
+      "{#{placeholder}}"
     end
-    def self.for(segments, classifier: SegmentClassifier.new)
-      new(classifier: classifier).for(segments)
+    def self.for(segments, classifier: SegmentClassifier::DEFAULT, hints: true)
+      new(classifier: classifier, hints: hints).for(segments)
     end
   end
 end

data/lib/iriq/position_stats.rb ADDED

@@ -0,0 +1,64 @@
+module Iriq
+  # Rolling frequency counts for a single (host, prefix-shape, position).
+  # Value cardinality is capped so a high-entropy position (UUIDs, timestamps)
+  # doesn't grow memory without bound — `total` keeps growing accurately, but
+  # only the first `max_values` distinct values are tracked individually.
+  class PositionStats
+    DEFAULT_MAX_VALUES = 1_000
+    attr_reader :value_counts, :type_counts, :total, :max_values
+    def initialize(max_values: DEFAULT_MAX_VALUES)
+      @value_counts = Hash.new(0)
+      @type_counts  = Hash.new(0)
+      @total        = 0
+      @max_values   = max_values
+    end
+    def observe(value, type)
+      @total += 1
+      @type_counts[type] += 1
+      if @value_counts.size < @max_values || @value_counts.key?(value)
+        @value_counts[value] += 1
+      end
+    end
+    def cardinality
+      @value_counts.size
+    end
+    # Fraction of observations whose type was variable (i.e. classifier said
+    # not :literal).
+    def variable_fraction(classifier)
+      return 0.0 if @total.zero?
+      var = @type_counts.sum { |t, c| classifier.variable?(t) ? c : 0 }
+      var.to_f / @total
+    end
+    def value_fraction(value)
+      return 0.0 if @total.zero?
+      (@value_counts[value] || 0).to_f / @total
+    end
+    def dump
+      {
+        "value_counts" => @value_counts,
+        "type_counts"  => @type_counts.transform_keys(&:to_s),
+        "total"        => @total,
+        "max_values"   => @max_values,
+      }
+    end
+    def self.from_dump(h)
+      stats = new(max_values: h["max_values"])
+      stats.instance_variable_set(:@total, h["total"])
+      vc = Hash.new(0).merge(h["value_counts"])
+      tc = Hash.new(0).merge(h["type_counts"].transform_keys(&:to_sym))
+      stats.instance_variable_set(:@value_counts, vc)
+      stats.instance_variable_set(:@type_counts, tc)
+      stats
+    end
+  end
+end

data/lib/iriq/segment_classifier.rb CHANGED

@@ -20,9 +20,34 @@ module Iriq
     TS_SECONDS_RANGE = 1_000_000_000..9_999_999_999
     TS_MILLIS_RANGE  = 1_000_000_000_000..9_999_999_999_999
+    # Bounded memoization: classification of a given string is pure, so
+    # repeat segments (e.g. /users in countless paths) can be cached. Cap
+    # keeps the cache from unbounded growth when inputs are dominated by
+    # unique IDs.
+    CACHE_MAX = 10_000
+    def initialize
+      @cache = {}
+    end
     def classify(segment)
       return :literal if segment.nil? || segment.empty?
+      cached = @cache[segment]
+      return cached if cached
+      @cache.clear if @cache.size >= CACHE_MAX
+      @cache[segment] = compute_classification(segment)
+    end
+    # Anything except :literal is considered variable for shape/explain.
+    def variable?(type)
+      type != :literal
+    end
+    private
+    def compute_classification(segment)
       case segment
       when UUID_RE     then :uuid
       when DATE_RE     then :date
@@ -36,13 +61,6 @@ module Iriq
       end
     end
-    # Anything except :literal is considered variable for shape/explain.
-    def variable?(type)
-      type != :literal
-    end
-    private
     def classify_integer(segment)
       n = segment.to_i
       return :timestamp if TS_MILLIS_RANGE.cover?(n)
@@ -50,5 +68,11 @@ module Iriq
       :integer_id
     end
+    public
+    # Shared singleton — preferred default for callers that don't bring
+    # their own classifier (saves a per-call allocation).
+    DEFAULT = new
   end
 end

data/lib/iriq/segment_hints.rb ADDED

@@ -0,0 +1,32 @@
+module Iriq
+  # Walks a segment list and annotates each entry with the type, whether it's
+  # variable, and a RESTful "hint" (e.g. `user_id`) when a variable segment
+  # follows a literal one — `/users/123` ⇒ hint `user_id`.
+  module SegmentHints
+    module_function
+    def derive(segments, classifier)
+      segments.each_with_index.map do |seg, i|
+        type     = classifier.classify(seg)
+        variable = classifier.variable?(type)
+        {
+          value:    seg,
+          type:     type,
+          variable: variable,
+          hint:     hint_for(segments, i, type, variable, classifier),
+        }
+      end
+    end
+    def hint_for(segments, i, type, variable, classifier)
+      return nil unless variable && i > 0
+      prev = segments[i - 1]
+      return nil unless classifier.classify(prev) == :literal
+      base   = Inflector.singularize(prev)
+      suffix = type == :uuid ? "_uuid" : "_id"
+      "#{base}#{suffix}"
+    end
+  end
+end

data/lib/iriq/version.rb CHANGED

@@ -1,3 +1,3 @@
 module Iriq
-  VERSION = "0.0.1"
+  VERSION = "0.1.0"
 end

data/lib/iriq.rb CHANGED

@@ -1,13 +1,19 @@
 require "iriq/version"
 require "iriq/errors"
+require "iriq/inflector"
 require "iriq/identifier"
 require "iriq/parser"
 require "iriq/segment_classifier"
+require "iriq/segment_hints"
 require "iriq/path_shape"
 require "iriq/normalizer"
 require "iriq/explanation"
 require "iriq/cluster"
 require "iriq/clusterer"
+require "iriq/position_stats"
+require "iriq/observation"
+require "iriq/corpus"
+require "iriq/extractor"
 require "iriq/cli"
 module Iriq
@@ -23,5 +29,9 @@ module Iriq
     def explain(input)
       Explanation.explain(input)
     end
+    def extract(text)
+      Extractor.new.extract(text)
+    end
   end
 end

data/script/benchmark.rb ADDED

@@ -0,0 +1,81 @@
+#!/usr/bin/env ruby
+# Performance benchmark for the main hot paths in Iriq.
+#
+# Usage:
+#   bundle exec script/benchmark.rb              # default sizes
+#   bundle exec script/benchmark.rb 50000        # custom "large" size
+#
+# Inputs are generated deterministically from IriGenerator so results are
+# comparable across runs.
+require "benchmark"
+require "tempfile"
+$LOAD_PATH.unshift File.expand_path("../lib", __dir__)
+$LOAD_PATH.unshift File.expand_path("../spec/support", __dir__)
+require "iriq"
+require "iri_generator"
+LARGE = Integer(ARGV[0] || 10_000)
+SMALL = [LARGE / 10, 1_000].min
+HUGE  = LARGE * 10
+puts "Iriq benchmark — Ruby #{RUBY_VERSION}, Iriq #{Iriq::VERSION}"
+puts "Sizes: small=#{SMALL}, large=#{LARGE}, huge=#{HUGE}"
+puts
+small_urls = IriGenerator.urls(count: SMALL, seed: 1)
+large_urls = IriGenerator.urls(count: LARGE, seed: 1)
+huge_urls  = IriGenerator.urls(count: HUGE,  seed: 1)
+# ~ LARGE URLs embedded in prose
+text_blob = small_urls.map { |u| "Some prose about #{u} here, also random words." }.join(" ") * (LARGE / SMALL)
+puts "Text blob: #{text_blob.bytesize / 1024} KB (~#{LARGE} URLs embedded)"
+puts
+results = {}
+Benchmark.bm(42) do |x|
+  results[:parse]     = x.report("parse #{LARGE} URLs")                  { large_urls.each { |u| Iriq.parse(u) } }
+  results[:normalize] = x.report("normalize #{LARGE} URLs (deterministic)") { large_urls.each { |u| Iriq.normalize(u) } }
+  results[:explain]   = x.report("explain #{LARGE} URLs (deterministic)")   { large_urls.each { |u| Iriq.explain(u) } }
+  results[:extract]   = x.report("extract from ~#{text_blob.bytesize / 1024} KB text")     { Iriq.extract(text_blob) }
+  results[:observe_small] = x.report("Corpus.observe #{SMALL} URLs") do
+    c = Iriq::Corpus.new
+    small_urls.each { |u| c.observe(u) }
+  end
+  results[:observe_large] = x.report("Corpus.observe #{LARGE} URLs") do
+    c = Iriq::Corpus.new
+    large_urls.each { |u| c.observe(u) }
+  end
+  results[:observe_huge] = x.report("Corpus.observe #{HUGE} URLs") do
+    c = Iriq::Corpus.new
+    huge_urls.each { |u| c.observe(u) }
+  end
+  results[:roundtrip] = x.report("Corpus save+load (#{LARGE} observations)") do
+    c = Iriq::Corpus.new
+    large_urls.each { |u| c.observe(u) }
+    Tempfile.open(["iriq-bench", ".json"]) do |f|
+      c.save(f.path)
+      Iriq::Corpus.load(f.path)
+    end
+  end
+end
+puts
+puts "Throughput summary:"
+[
+  [:parse,         LARGE, "URLs/s"],
+  [:normalize,     LARGE, "URLs/s"],
+  [:explain,       LARGE, "URLs/s"],
+  [:observe_small, SMALL, "URLs/s"],
+  [:observe_large, LARGE, "URLs/s"],
+  [:observe_huge,  HUGE,  "URLs/s"],
+].each do |key, n, unit|
+  per_sec = n / results[key].real
+  printf("  %-30s %12s %s\n", key, per_sec.round.to_s, unit)
+end
+extract_mb = text_blob.bytesize / (1024.0 * 1024.0)
+printf("  %-30s %12s MB/s\n", :extract, (extract_mb / results[:extract].real).round(2).to_s)

data/script/memory.rb ADDED

@@ -0,0 +1,121 @@
+#!/usr/bin/env ruby
+# Memory profile for the main code paths in Iriq.
+#
+# Usage:
+#   bundle exec script/memory.rb              # default sizes
+#   bundle exec script/memory.rb 50000        # custom corpus size
+#
+# Reports retained memory per operation, cache footprints, and memory
+# growth across corpus sizes (to verify linear scaling — no leaks).
+require "objspace"
+$LOAD_PATH.unshift File.expand_path("../lib", __dir__)
+$LOAD_PATH.unshift File.expand_path("../spec/support", __dir__)
+require "iriq"
+require "iri_generator"
+CORPUS_SIZE = Integer(ARGV[0] || 10_000)
+SIZES       = [1_000, 10_000, 100_000].uniq.sort
+SIZES << CORPUS_SIZE unless SIZES.include?(CORPUS_SIZE)
+SIZES.sort!
+# Bytes → KB / MB string for display.
+def fmt_bytes(n)
+  if n < 1024
+    "#{n} B"
+  elsif n < 1024 * 1024
+    format("%.1f KB", n / 1024.0)
+  else
+    format("%.2f MB", n / (1024.0 * 1024.0))
+  end
+end
+# Run a block in isolation: GC before + after, return delta in bytes.
+def measure_retained(&block)
+  GC.start
+  before = ObjectSpace.memsize_of_all
+  result = block.call
+  GC.start
+  after  = ObjectSpace.memsize_of_all
+  [after - before, result]
+end
+# Reset caches so each scenario starts clean.
+def reset_caches
+  Iriq::SegmentClassifier::DEFAULT.instance_variable_get(:@cache).clear
+  Iriq::Inflector.instance_variable_get(:@cache)&.clear
+end
+puts "Iriq memory profile — Ruby #{RUBY_VERSION}, Iriq #{Iriq::VERSION}"
+puts
+# -- Section 1: memory growth across corpus sizes --
+puts "── corpus retained memory by N (verifies linear growth) ──"
+printf("  %-12s %-14s %-14s %-10s\n", "N obs", "retained", "per obs", "allocs")
+SIZES.each do |n|
+  reset_caches
+  urls = IriGenerator.urls(count: n, seed: 1)
+  alloc_before = GC.stat(:total_allocated_objects)
+  retained, _ = measure_retained do
+    c = Iriq::Corpus.new
+    urls.each { |u| c.observe(u) }
+    c
+  end
+  alloc_total = GC.stat(:total_allocated_objects) - alloc_before
+  printf("  %-12s %-14s %-14s %-10s\n", n, fmt_bytes(retained), fmt_bytes(retained / n), alloc_total)
+end
+puts
+# -- Section 2: corpus state breakdown at CORPUS_SIZE --
+puts "── corpus state breakdown at N=#{CORPUS_SIZE} ──"
+reset_caches
+urls = IriGenerator.urls(count: CORPUS_SIZE, seed: 1)
+corpus = Iriq::Corpus.new
+urls.each { |u| corpus.observe(u) }
+puts "  unique hosts:           #{corpus.host_counts.size}"
+puts "  unique fingerprints:    #{corpus.fingerprint_counts.size}"
+puts "  unique raw shapes:      #{corpus.raw_shape_counts.size}"
+puts "  clusters:               #{corpus.size}"
+puts "  position_stats entries: #{corpus.position_stats.size}"
+puts "  total observed values:  #{corpus.position_stats.sum { |_, s| s.value_counts.size }}"
+puts
+# -- Section 3: cache footprints --
+puts "── memoization caches ──"
+classifier_cache = Iriq::SegmentClassifier::DEFAULT.instance_variable_get(:@cache)
+inflector_cache  = Iriq::Inflector.instance_variable_get(:@cache) || {}
+puts "  classifier cache: #{classifier_cache.size} entries (cap #{Iriq::SegmentClassifier::CACHE_MAX})"
+puts "  inflector cache:  #{inflector_cache.size} entries (cap #{Iriq::Inflector::CACHE_MAX})"
+puts
+# -- Section 4: per-operation memory cost --
+puts "── retained memory per operation (N=#{CORPUS_SIZE}) ──"
+urls = IriGenerator.urls(count: CORPUS_SIZE, seed: 1)
+text_blob = urls.map { |u| "Some prose about #{u} here." }.join(" ")
+[
+  ["parse #{CORPUS_SIZE} URLs (discarded after)", ->{ urls.each { |u| Iriq.parse(u) } }],
+  ["normalize #{CORPUS_SIZE} URLs",               ->{ urls.each { |u| Iriq.normalize(u) } }],
+  ["explain #{CORPUS_SIZE} URLs",                 ->{ urls.each { |u| Iriq.explain(u) } }],
+  ["extract from #{fmt_bytes(text_blob.bytesize)} prose", ->{ Iriq.extract(text_blob) }],
+  ["Corpus.observe #{CORPUS_SIZE} URLs",          ->{ c = Iriq::Corpus.new; urls.each { |u| c.observe(u) }; c }],
+].each do |label, op|
+  reset_caches
+  retained, _ = measure_retained(&op)
+  printf("  %-50s %s\n", label, fmt_bytes(retained))
+end
+puts
+# -- Section 5: persistence overhead --
+puts "── save/load roundtrip (N=#{CORPUS_SIZE}) ──"
+require "tempfile"
+reset_caches
+corpus = Iriq::Corpus.new
+urls.each { |u| corpus.observe(u) }
+Tempfile.open(["iriq-mem", ".json"]) do |f|
+  corpus.save(f.path)
+  bytes = File.size(f.path)
+  puts "  JSON file on disk:  #{fmt_bytes(bytes)}"
+  puts "  ratio:              #{format("%.2f bytes/obs", bytes.to_f / CORPUS_SIZE)}"
+end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: iriq
 version: !ruby/object:Gem::Version
-  version: 0.0.1
+  version: 0.1.0
 platform: ruby
 authors:
 - Daniel Pepper
@@ -83,14 +83,22 @@ files:
 - lib/iriq/cli.rb
 - lib/iriq/cluster.rb
 - lib/iriq/clusterer.rb
+- lib/iriq/corpus.rb
 - lib/iriq/errors.rb
 - lib/iriq/explanation.rb
+- lib/iriq/extractor.rb
 - lib/iriq/identifier.rb
+- lib/iriq/inflector.rb
 - lib/iriq/normalizer.rb
+- lib/iriq/observation.rb
 - lib/iriq/parser.rb
 - lib/iriq/path_shape.rb
+- lib/iriq/position_stats.rb
 - lib/iriq/segment_classifier.rb
+- lib/iriq/segment_hints.rb
 - lib/iriq/version.rb
+- script/benchmark.rb
+- script/memory.rb
 homepage: https://github.com/dpep/iriq
 licenses:
 - MIT