RubyGems - iriq - Versions diffs - 0.0.1 → 0.2.0 - Mend

iriq 0.0.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29)

checksums.yaml +4 -4
data/CHANGELOG.md +25 -0
data/CLAUDE.md +121 -0
data/Gemfile.lock +8 -2
data/Makefile +56 -0
data/README.md +334 -39
data/iriq.gemspec +4 -3
data/lib/iriq/cli.rb +289 -100
data/lib/iriq/cluster.rb +47 -0
data/lib/iriq/clusterer.rb +29 -39
data/lib/iriq/corpus.rb +322 -0
data/lib/iriq/explanation.rb +6 -22
data/lib/iriq/extractor.rb +125 -0
data/lib/iriq/identifier.rb +11 -3
data/lib/iriq/inflector.rb +145 -0
data/lib/iriq/normalizer.rb +11 -8
data/lib/iriq/observation.rb +25 -0
data/lib/iriq/parser.rb +1 -1
data/lib/iriq/path_shape.rb +27 -9
data/lib/iriq/position_stats.rb +64 -0
data/lib/iriq/segment_classifier.rb +31 -7
data/lib/iriq/segment_hints.rb +32 -0
data/lib/iriq/storage/json.rb +43 -0
data/lib/iriq/storage/memory.rb +138 -0
data/lib/iriq/storage/sqlite.rb +367 -0
data/lib/iriq/storage.rb +35 -0
data/lib/iriq/version.rb +1 -1
data/lib/iriq.rb +11 -0
metadata +29 -4

data/lib/iriq/normalizer.rb CHANGED Viewed

@@ -2,24 +2,27 @@ module Iriq
   # Produces a canonical, shape-aware string for an identifier.
   #
   #   Normalizer.normalize("https://Foo.com:443/users/123")
-  #   # => "https://foo.com/users/{integer_id}"
+  #   # => "https://foo.com/users/{user_id}"
   #
   # The form is intended for grouping/diffing — it is not a round-trippable URL.
   module Normalizer
     module_function
-    def normalize(input, classifier: SegmentClassifier.new)
+    def normalize(input, classifier: SegmentClassifier::DEFAULT, hints: true)
       iri = input.is_a?(Identifier) ? input : Parser.parse(input)
-      normalize_identifier(iri, classifier: classifier)
+      normalize_identifier(iri, classifier: classifier, hints: hints)
     end
-    def normalize_identifier(iri, classifier: SegmentClassifier.new)
+    def normalize_identifier(iri, classifier: SegmentClassifier::DEFAULT, hints: true)
       if iri.urn?
-        # urn:isbn:0451450523 -> urn:isbn:{integer_id}
         if iri.scheme == "urn" && iri.nss && iri.nss.include?(":")
           ns, value = iri.nss.split(":", 2)
-          type      = classifier.classify(value)
-          shaped    = classifier.variable?(type) ? "{#{type}}" : value
+          entry     = SegmentHints.derive([ns, value], classifier).last
+          shaped    = if entry[:variable]
+                        "{#{(hints && entry[:hint]) || entry[:type]}}"
+                      else
+                        entry[:value]
+                      end
           "urn:#{ns}:#{shaped}"
         else
           iri.canonical
@@ -29,7 +32,7 @@ module Iriq
         out << "#{iri.scheme}://" if iri.scheme
         out << iri.host if iri.host
         out << ":#{iri.port}" if iri.port
-        out << PathShape.new(classifier: classifier).for(iri.path_segments)
+        out << PathShape.new(classifier: classifier, hints: hints).for(iri.path_segments)
         if iri.query_params && !iri.query_params.empty?
           out << "?" + shape_query(iri.query_params, classifier)
         end

data/lib/iriq/observation.rb ADDED Viewed

@@ -0,0 +1,25 @@
+module Iriq
+  # The result of Corpus#observe. Lightweight value object — heavy work
+  # (explanation, normalization) is deferred until you ask.
+  #
+  # Holds the corpus that produced it, the observed identifier, and the
+  # cluster handed to the constructor. `identifier` and `cluster` are public
+  # readers; the corpus is only used internally for delegation.
+  class Observation
+    attr_reader :identifier, :cluster
+    # Stores the three collaborators as given; performs no work itself.
+    def initialize(corpus:, identifier:, cluster:)
+      @corpus     = corpus
+      @identifier = identifier
+      @cluster    = cluster
+    end
+    # Normalized form of the identifier, computed on first call and memoized.
+    # Only the identifier is passed, so Normalizer.normalize_identifier runs
+    # with its own default keyword arguments rather than anything held by
+    # the corpus.
+    # NOTE(review): confirm that bypassing the corpus here is intended.
+    def fingerprint
+      @fingerprint ||= Normalizer.normalize_identifier(@identifier)
+    end
+    # Delegates to the corpus' `explain` on first call and memoizes the
+    # result. Because of `||=`, a nil/false result is recomputed every call.
+    def explanation
+      @explanation ||= @corpus.explain(@identifier)
+    end
+    # Delegates to the corpus' `normalize` on every call; not memoized.
+    def normalize
+      @corpus.normalize(@identifier)
+    end
+  end
+end

data/lib/iriq/parser.rb CHANGED Viewed

@@ -3,7 +3,7 @@ module Iriq
   #
   # Intentionally NOT a full RFC 3986 / 3987 / WHATWG URL implementation. We
   # accept enough of the common shapes (URLs, scheme-less hosts, URNs, raw
-  # Unicode hosts and paths) to support normalization and clustering.
+  # Unicode hosts and paths) to support extraction, normalization, and clustering.
   module Parser
     SCHEME_RE = /\A([a-zA-Z][a-zA-Z0-9+\-.]*):/.freeze

data/lib/iriq/path_shape.rb CHANGED Viewed

@@ -1,27 +1,45 @@
 module Iriq
   # Converts a sequence of path segments into a route-shape string by
-  # replacing variable segments with `{type}` placeholders.
+  # replacing variable segments with `{hint}` placeholders, falling back to
+  # `{type}` when no hint is available.
   #
   #   PathShape.for(["users", "123", "orders", "456"])
-  #   # => "/users/{integer_id}/orders/{integer_id}"
+  #   # => "/users/{user_id}/orders/{order_id}"
+  #
+  # Pass `hints: false` to use raw types instead:
+  #
+  #   PathShape.for(["users", "123"], hints: false)
+  #   # => "/users/{integer_id}"
   class PathShape
-    def initialize(classifier: SegmentClassifier.new)
+    def initialize(classifier: SegmentClassifier::DEFAULT, hints: true)
       @classifier = classifier
+      @hints      = hints
     end
     def for(segments)
       return "/" if segments.nil? || segments.empty?
-      "/" + segments.map { |s| shape_segment(s) }.join("/")
+      from_entries(SegmentHints.derive(segments, @classifier))
+    end
+    # Build a shape string from already-derived SegmentHints entries.
+    # Used by Corpus to avoid re-deriving entries per observation when it
+    # needs multiple shape variants (raw and hinted).
+    def from_entries(entries)
+      return "/" if entries.nil? || entries.empty?
+      "/" + entries.map { |e| shape_token(e) }.join("/")
     end
-    def shape_segment(segment)
-      type = @classifier.classify(segment)
-      @classifier.variable?(type) ? "{#{type}}" : segment
+    def shape_token(entry)
+      return entry[:value] unless entry[:variable]
+      placeholder = @hints ? (entry[:hint] || entry[:type]) : entry[:type]
+      "{#{placeholder}}"
     end
-    def self.for(segments, classifier: SegmentClassifier.new)
-      new(classifier: classifier).for(segments)
+    def self.for(segments, classifier: SegmentClassifier::DEFAULT, hints: true)
+      new(classifier: classifier, hints: hints).for(segments)
     end
   end
 end

data/lib/iriq/position_stats.rb ADDED Viewed

@@ -0,0 +1,64 @@
+module Iriq
+  # Rolling frequency counts for a single (host, prefix-shape, position).
+  # Value cardinality is capped so a high-entropy position (UUIDs, timestamps)
+  # doesn't grow memory without bound — `total` keeps growing accurately, but
+  # only the first `max_values` distinct values are tracked individually.
+  class PositionStats
+    DEFAULT_MAX_VALUES = 1_000
+    attr_reader :value_counts, :type_counts, :total, :max_values
+    # max_values: upper bound on the number of distinct values tracked in
+    # `value_counts`.
+    def initialize(max_values: DEFAULT_MAX_VALUES)
+      @value_counts = Hash.new(0)
+      @type_counts  = Hash.new(0)
+      @total        = 0
+      @max_values   = max_values
+    end
+    # Records one observation. `total` and the per-type count always grow;
+    # the per-value count grows only if the value is already tracked or the
+    # cap on distinct values has not been reached yet.
+    def observe(value, type)
+      @total += 1
+      @type_counts[type] += 1
+      if @value_counts.size < @max_values || @value_counts.key?(value)
+        @value_counts[value] += 1
+      end
+    end
+    # Number of distinct values currently tracked (never exceeds the cap
+    # through `observe`).
+    def cardinality
+      @value_counts.size
+    end
+    # Fraction of observations whose type was variable (i.e. classifier said
+    # not :literal). `classifier` must respond to `variable?(type)`.
+    # Returns 0.0 when nothing has been observed.
+    def variable_fraction(classifier)
+      return 0.0 if @total.zero?
+      var = @type_counts.sum { |t, c| classifier.variable?(t) ? c : 0 }
+      var.to_f / @total
+    end
+    # Fraction of all observations that carried exactly `value`. Values that
+    # were never tracked (unseen, or dropped by the cap) count as zero.
+    # Returns 0.0 when nothing has been observed.
+    def value_fraction(value)
+      return 0.0 if @total.zero?
+      @value_counts.fetch(value, 0).to_f / @total
+    end
+    # Serializable snapshot with string keys. Type symbols are stringified;
+    # `value_counts` is the live internal hash, not a copy.
+    def dump
+      {
+        "value_counts" => @value_counts,
+        "type_counts"  => @type_counts.transform_keys(&:to_s),
+        "total"        => @total,
+        "max_values"   => @max_values,
+      }
+    end
+    # Rebuilds an instance from a `dump`-shaped hash. Type keys are turned
+    # back into symbols and both count hashes get a default of 0 again.
+    #
+    # A dump without "max_values" (or with a null one) falls back to
+    # DEFAULT_MAX_VALUES; passing nil through would make the next `observe`
+    # fail when comparing the tracked-value count against nil.
+    def self.from_dump(h)
+      stats = new(max_values: h["max_values"] || DEFAULT_MAX_VALUES)
+      stats.instance_variable_set(:@total, h["total"])
+      vc = Hash.new(0).merge(h["value_counts"])
+      tc = Hash.new(0).merge(h["type_counts"].transform_keys(&:to_sym))
+      stats.instance_variable_set(:@value_counts, vc)
+      stats.instance_variable_set(:@type_counts, tc)
+      stats
+    end
+  end
+end

data/lib/iriq/segment_classifier.rb CHANGED Viewed

@@ -20,9 +20,34 @@ module Iriq
     TS_SECONDS_RANGE = 1_000_000_000..9_999_999_999
     TS_MILLIS_RANGE  = 1_000_000_000_000..9_999_999_999_999
+    # Bounded memoization: classification of a given string is pure, so
+    # repeat segments (e.g. /users in countless paths) can be cached. Cap
+    # keeps the cache from unbounded growth when inputs are dominated by
+    # unique IDs.
+    CACHE_MAX = 10_000
+    def initialize
+      @cache = {}
+    end
     def classify(segment)
       return :literal if segment.nil? || segment.empty?
+      cached = @cache[segment]
+      return cached if cached
+      @cache.clear if @cache.size >= CACHE_MAX
+      @cache[segment] = compute_classification(segment)
+    end
+    # Anything except :literal is considered variable for shape/explain.
+    def variable?(type)
+      type != :literal
+    end
+    private
+    def compute_classification(segment)
       case segment
       when UUID_RE     then :uuid
       when DATE_RE     then :date
@@ -36,13 +61,6 @@ module Iriq
       end
     end
-    # Anything except :literal is considered variable for shape/explain.
-    def variable?(type)
-      type != :literal
-    end
-    private
     def classify_integer(segment)
       n = segment.to_i
       return :timestamp if TS_MILLIS_RANGE.cover?(n)
@@ -50,5 +68,11 @@ module Iriq
       :integer_id
     end
+    public
+    # Shared singleton — preferred default for callers that don't bring
+    # their own classifier (saves a per-call allocation).
+    DEFAULT = new
   end
 end

data/lib/iriq/segment_hints.rb ADDED Viewed

@@ -0,0 +1,32 @@
+module Iriq
+  # Walks a segment list and annotates each entry with the type, whether it's
+  # variable, and a RESTful "hint" (e.g. `user_id`) when a variable segment
+  # follows a literal one — `/users/123` ⇒ hint `user_id`.
+  module SegmentHints
+    module_function
+    # Returns one Hash per segment, in order, with the keys
+    # :value (the segment itself), :type (classifier.classify result),
+    # :variable (classifier.variable?(type)) and :hint (String or nil).
+    #
+    # `classifier` must respond to `classify(segment)` and `variable?(type)`.
+    def derive(segments, classifier)
+      segments.each_with_index.map do |seg, i|
+        type     = classifier.classify(seg)
+        variable = classifier.variable?(type)
+        {
+          value:    seg,
+          type:     type,
+          variable: variable,
+          hint:     hint_for(segments, i, type, variable, classifier),
+        }
+      end
+    end
+    # Hint for the segment at index `i`, or nil when none applies.
+    #
+    # A hint exists only if the segment is variable, is not the first
+    # segment, and the segment right before it classifies as :literal. The
+    # hint is the singularized previous segment plus "_uuid" for :uuid
+    # segments and "_id" for every other variable type.
+    def hint_for(segments, i, type, variable, classifier)
+      return nil unless variable && i > 0
+      prev = segments[i - 1]
+      # The previous segment is classified again here rather than reusing the
+      # type already computed for it in `derive`.
+      return nil unless classifier.classify(prev) == :literal
+      base   = Inflector.singularize(prev)
+      suffix = type == :uuid ? "_uuid" : "_id"
+      "#{base}#{suffix}"
+    end
+  end
+end

data/lib/iriq/storage/json.rb ADDED Viewed

@@ -0,0 +1,43 @@
+require "json"
+module Iriq
+  module Storage
+    # Json wraps Memory with load-from-file at open and save-to-file at close.
+    # Same JSON shape as the pre-Storage release, so files round-trip across
+    # versions.
+    class Json < Memory
+      # File this store loads from / saves to by default; nil when unset.
+      attr_reader :path
+      # `path` is remembered as the default save target; all other keyword
+      # options are forwarded unchanged to Memory#initialize.
+      def initialize(path: nil, **opts)
+        super(**opts)
+        @path = path
+      end
+      # Builds a store bound to `path` and loads it when the file exists and
+      # is non-empty. A missing or empty file yields an empty store that still
+      # remembers `path` for a later `save`.
+      def self.open(path, **opts)
+        s = new(path: path, **opts)
+        s.load!(path) if File.exist?(path) && File.size(path).positive?
+        s
+      end
+      # Replaces the in-memory state with the JSON document at `path` and
+      # makes `path` the default save target. Returns self. An empty file is
+      # ignored (state and @path are left untouched).
+      def load!(path)
+        data = File.read(path)
+        return self if data.empty?
+        load_dump!(JSON.parse(data))
+        @path = path
+        self
+      end
+      # save writes atomically (tmp + rename). Defaults to the path passed at
+      # open(); pass an explicit path to write elsewhere.
+      #
+      # Raises ArgumentError when neither an argument nor a remembered path is
+      # available. Saving to an explicit path does not change the remembered
+      # one.
+      # NOTE(review): the temp file is always "<target>.tmp" and there is no
+      # rescue/ensure, so a failed write or rename leaves it behind.
+      def save(path = nil)
+        target = path || @path
+        raise ArgumentError, "no path provided" unless target
+        tmp = "#{target}.tmp"
+        File.write(tmp, JSON.generate(to_dump))
+        File.rename(tmp, target)
+      end
+    end
+  end
+end

data/lib/iriq/storage/memory.rb ADDED Viewed

@@ -0,0 +1,138 @@
+module Iriq
+  module Storage
+    # Memory is the canonical backend — every other backend either wraps it
+    # (Json) or implements the same surface against an external store (Sqlite).
+    #
+    # The contract is small enough to enumerate up top:
+    #
+    #   increment_host(host)
+    #   increment_path_length(length)
+    #   increment_raw_shape(shape)
+    #   increment_fingerprint(shape)
+    #   observe_position(host, prefix, value, type)
+    #   add_to_cluster(key, host, scheme, shape, identifier)
+    #
+    #   host_counts / path_length_counts / raw_shape_counts / fingerprint_counts
+    #   position_stats(host, prefix) / each_position_stats { ... }
+    #   clusters / cluster_size
+    #
+    #   transaction { ... }    # backends may batch within
+    #   batch { ... }          # plain yield for Memory
+    #   flush                  # commit pending writes (no-op for Memory)
+    #   save(path = nil)       # persist to a file (no-op for Memory)
+    #   close                  # release resources
+    #
+    #   load_dump!(hash) / to_dump   # bulk import/export of the whole state
+    class Memory
+      attr_reader :max_values_per_position
+      # Path of the underlying file, if any. Memory backends are unpathed;
+      # Json/Sqlite override.
+      def path; nil; end
+      # classifier: stored in @classifier; not read anywhere else in this
+      #   class body.
+      # max_values_per_position: cap handed to every PositionStats this store
+      #   creates in `observe_position`.
+      def initialize(classifier: SegmentClassifier::DEFAULT,
+                     max_values_per_position: PositionStats::DEFAULT_MAX_VALUES)
+        @classifier              = classifier
+        @max_values_per_position = max_values_per_position
+        @host_counts             = Hash.new(0)
+        @path_length_counts      = Hash.new(0)
+        @raw_shape_counts        = Hash.new(0)
+        @fingerprint_counts      = Hash.new(0)
+        @position_stats          = {}
+        @clusters                = {}
+      end
+      # Yields the store itself; nothing is batched or rolled back here.
+      def transaction
+        yield self
+      end
+      # Yields with no arguments.
+      def batch
+        yield
+      end
+      def flush;  end
+      def close;  end
+      # No-op for in-memory; subclasses override.
+      def save(path = nil); end
+      # --- Increments -------------------------------------------------------
+      # A nil/false host is ignored.
+      def increment_host(host)
+        @host_counts[host] += 1 if host
+      end
+      def increment_path_length(length)
+        @path_length_counts[length] += 1
+      end
+      def increment_raw_shape(shape)
+        @raw_shape_counts[shape] += 1
+      end
+      def increment_fingerprint(shape)
+        @fingerprint_counts[shape] += 1
+      end
+      # Records `value`/`type` in the PositionStats keyed by [host, prefix],
+      # creating it with this store's cap on first use.
+      def observe_position(host, prefix, value, type)
+        stats = @position_stats[[host, prefix]] ||= PositionStats.new(max_values: @max_values_per_position)
+        stats.observe(value, type)
+      end
+      # Adds `identifier` to the cluster stored under `key`, creating the
+      # cluster from host/scheme/shape on first use (later calls with the
+      # same key do not update those attributes). Returns the cluster.
+      def add_to_cluster(key, host, scheme, shape, identifier)
+        cluster = @clusters[key] ||= Cluster.new(key: key, host: host, scheme: scheme, shape: shape)
+        cluster.add(identifier)
+        cluster
+      end
+      # --- Reads ------------------------------------------------------------
+      # The four count readers return the live internal hashes, not copies.
+      def host_counts;        @host_counts;        end
+      def path_length_counts; @path_length_counts; end
+      def raw_shape_counts;   @raw_shape_counts;   end
+      def fingerprint_counts; @fingerprint_counts; end
+      # PositionStats for [host, prefix], or nil if nothing was observed there.
+      def position_stats(host, prefix)
+        @position_stats[[host, prefix]]
+      end
+      # Iterates the internal hash, yielding ([host, prefix], stats) pairs.
+      def each_position_stats(&block)
+        @position_stats.each(&block)
+      end
+      # All clusters as an array.
+      def clusters
+        @clusters.values
+      end
+      # Number of clusters (not the number of clustered identifiers).
+      def cluster_size
+        @clusters.size
+      end
+      # --- Bulk load (used by JSON backend) --------------------------------
+      # Replaces the whole state with the contents of a `to_dump`-shaped hash
+      # (string keys) and returns self. Path-length keys are converted back
+      # to integers. "max_values_per_position" and "clusterer" fall back to
+      # defaults when absent; the other keys are read with `[]` and must be
+      # present.
+      # NOTE(review): a dump lacking "position_stats" or one of the count
+      # hashes raises here — confirm older files always carry them.
+      def load_dump!(h)
+        @host_counts        = Hash.new(0).merge(h["host_counts"])
+        @path_length_counts = Hash.new(0).merge(h["path_length_counts"].transform_keys(&:to_i))
+        @raw_shape_counts   = Hash.new(0).merge(h["raw_shape_counts"])
+        @fingerprint_counts = Hash.new(0).merge(h["fingerprint_counts"])
+        @max_values_per_position = h.fetch("max_values_per_position", PositionStats::DEFAULT_MAX_VALUES)
+        # Each entry is a [host, prefix, stats_dump] triple, as written by to_dump.
+        @position_stats = h["position_stats"].each_with_object({}) do |(host, prefix, sdump), acc|
+          acc[[host, prefix]] = PositionStats.from_dump(sdump)
+        end
+        cdump = h.fetch("clusterer", { "clusters" => {} })
+        @clusters = cdump["clusters"].transform_values { |c| Cluster.from_dump(c) }
+        self
+      end
+      # Whole state as a hash with string keys. Path-length keys are
+      # stringified, position stats are flattened to [host, prefix, dump]
+      # triples (the in-memory key is a [host, prefix] array), and clusters
+      # are nested under "clusterer" => "clusters". The count hashes are the
+      # live internal objects, not copies.
+      def to_dump
+        {
+          "host_counts"             => @host_counts,
+          "path_length_counts"      => @path_length_counts.transform_keys(&:to_s),
+          "raw_shape_counts"        => @raw_shape_counts,
+          "fingerprint_counts"      => @fingerprint_counts,
+          "max_values_per_position" => @max_values_per_position,
+          "position_stats"          => @position_stats.map { |(host, prefix), s| [host, prefix, s.dump] },
+          "clusterer"               => {
+            "clusters" => @clusters.transform_values(&:dump),
+          },
+        }
+      end
+    end
+  end
+end