iriq 0.0.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,24 +2,27 @@ module Iriq
2
2
  # Produces a canonical, shape-aware string for an identifier.
3
3
  #
4
4
  # Normalizer.normalize("https://Foo.com:443/users/123")
5
- # # => "https://foo.com/users/{integer_id}"
5
+ # # => "https://foo.com/users/{user_id}"
6
6
  #
7
7
  # The form is intended for grouping/diffing — it is not a round-trippable URL.
8
8
  module Normalizer
9
9
  module_function
10
10
 
11
- def normalize(input, classifier: SegmentClassifier.new)
11
+ def normalize(input, classifier: SegmentClassifier::DEFAULT, hints: true)
12
12
  iri = input.is_a?(Identifier) ? input : Parser.parse(input)
13
- normalize_identifier(iri, classifier: classifier)
13
+ normalize_identifier(iri, classifier: classifier, hints: hints)
14
14
  end
15
15
 
16
- def normalize_identifier(iri, classifier: SegmentClassifier.new)
16
+ def normalize_identifier(iri, classifier: SegmentClassifier::DEFAULT, hints: true)
17
17
  if iri.urn?
18
- # urn:isbn:0451450523 -> urn:isbn:{integer_id}
19
18
  if iri.scheme == "urn" && iri.nss && iri.nss.include?(":")
20
19
  ns, value = iri.nss.split(":", 2)
21
- type = classifier.classify(value)
22
- shaped = classifier.variable?(type) ? "{#{type}}" : value
20
+ entry = SegmentHints.derive([ns, value], classifier).last
21
+ shaped = if entry[:variable]
22
+ "{#{(hints && entry[:hint]) || entry[:type]}}"
23
+ else
24
+ entry[:value]
25
+ end
23
26
  "urn:#{ns}:#{shaped}"
24
27
  else
25
28
  iri.canonical
@@ -29,7 +32,7 @@ module Iriq
29
32
  out << "#{iri.scheme}://" if iri.scheme
30
33
  out << iri.host if iri.host
31
34
  out << ":#{iri.port}" if iri.port
32
- out << PathShape.new(classifier: classifier).for(iri.path_segments)
35
+ out << PathShape.new(classifier: classifier, hints: hints).for(iri.path_segments)
33
36
  if iri.query_params && !iri.query_params.empty?
34
37
  out << "?" + shape_query(iri.query_params, classifier)
35
38
  end
@@ -0,0 +1,25 @@
1
module Iriq
  # Value object returned by Corpus#observe.
  #
  # It only holds references to the corpus, the identifier and the cluster;
  # the derived values below are computed on demand rather than at
  # construction time.
  class Observation
    attr_reader :identifier, :cluster

    def initialize(corpus:, identifier:, cluster:)
      @corpus, @identifier, @cluster = corpus, identifier, cluster
    end

    # Shape string for the identifier, produced by
    # Normalizer.normalize_identifier with its default options. Cached once a
    # truthy value has been computed.
    def fingerprint
      @fingerprint || (@fingerprint = Normalizer.normalize_identifier(@identifier))
    end

    # Whatever the corpus' #explain returns for this identifier. Cached once a
    # truthy value has been computed.
    def explanation
      @explanation || (@explanation = @corpus.explain(@identifier))
    end

    # Delegates to the corpus on every call (not cached).
    def normalize
      @corpus.normalize(@identifier)
    end
  end
end
data/lib/iriq/parser.rb CHANGED
@@ -3,7 +3,7 @@ module Iriq
3
3
  #
4
4
  # Intentionally NOT a full RFC 3986 / 3987 / WHATWG URL implementation. We
5
5
  # accept enough of the common shapes (URLs, scheme-less hosts, URNs, raw
6
- # Unicode hosts and paths) to support normalization and clustering.
6
+ # Unicode hosts and paths) to support extraction, normalization, and clustering.
7
7
  module Parser
8
8
  SCHEME_RE = /\A([a-zA-Z][a-zA-Z0-9+\-.]*):/.freeze
9
9
 
@@ -1,27 +1,45 @@
1
1
  module Iriq
2
2
  # Converts a sequence of path segments into a route-shape string by
3
- # replacing variable segments with `{type}` placeholders.
3
+ # replacing variable segments with `{hint}` placeholders, falling back to
4
+ # `{type}` when no hint is available.
4
5
  #
5
6
  # PathShape.for(["users", "123", "orders", "456"])
6
- # # => "/users/{integer_id}/orders/{integer_id}"
7
+ # # => "/users/{user_id}/orders/{order_id}"
8
+ #
9
+ # Pass `hints: false` to use raw types instead:
10
+ #
11
+ # PathShape.for(["users", "123"], hints: false)
12
+ # # => "/users/{integer_id}"
7
13
  class PathShape
8
- def initialize(classifier: SegmentClassifier.new)
14
+ def initialize(classifier: SegmentClassifier::DEFAULT, hints: true)
9
15
  @classifier = classifier
16
+ @hints = hints
10
17
  end
11
18
 
12
19
  def for(segments)
13
20
  return "/" if segments.nil? || segments.empty?
14
21
 
15
- "/" + segments.map { |s| shape_segment(s) }.join("/")
22
+ from_entries(SegmentHints.derive(segments, @classifier))
23
+ end
24
+
25
+ # Build a shape string from already-derived SegmentHints entries.
26
+ # Used by Corpus to avoid re-deriving entries per observation when it
27
+ # needs multiple shape variants (raw and hinted).
28
+ def from_entries(entries)
29
+ return "/" if entries.nil? || entries.empty?
30
+
31
+ "/" + entries.map { |e| shape_token(e) }.join("/")
16
32
  end
17
33
 
18
- def shape_segment(segment)
19
- type = @classifier.classify(segment)
20
- @classifier.variable?(type) ? "{#{type}}" : segment
34
+ def shape_token(entry)
35
+ return entry[:value] unless entry[:variable]
36
+
37
+ placeholder = @hints ? (entry[:hint] || entry[:type]) : entry[:type]
38
+ "{#{placeholder}}"
21
39
  end
22
40
 
23
- def self.for(segments, classifier: SegmentClassifier.new)
24
- new(classifier: classifier).for(segments)
41
+ def self.for(segments, classifier: SegmentClassifier::DEFAULT, hints: true)
42
+ new(classifier: classifier, hints: hints).for(segments)
25
43
  end
26
44
  end
27
45
  end
@@ -0,0 +1,64 @@
1
module Iriq
  # Rolling frequency counts for a single (host, prefix-shape, position).
  # Value cardinality is capped so a high-entropy position (UUIDs, timestamps)
  # doesn't grow memory without bound — `total` keeps growing accurately, but
  # only the first `max_values` distinct values are tracked individually.
  class PositionStats
    DEFAULT_MAX_VALUES = 1_000

    attr_reader :value_counts, :type_counts, :total, :max_values

    # max_values: maximum number of distinct values tracked in value_counts.
    def initialize(max_values: DEFAULT_MAX_VALUES)
      @value_counts = Hash.new(0)
      @type_counts = Hash.new(0)
      @total = 0
      @max_values = max_values
    end

    # Records one observation. `total` and the per-type count always grow;
    # the per-value count grows only for values already tracked or while
    # there is still room under the cap.
    def observe(value, type)
      @total += 1
      @type_counts[type] += 1
      return unless @value_counts.key?(value) || @value_counts.size < @max_values

      @value_counts[value] += 1
    end

    # Number of distinct values currently tracked (never exceeds the cap for
    # values added through #observe).
    def cardinality
      @value_counts.size
    end

    # Fraction of observations whose type was variable (i.e. classifier said
    # not :literal). Returns 0.0 when nothing has been observed.
    def variable_fraction(classifier)
      return 0.0 if @total.zero?

      variable_total = @type_counts.sum { |type, count| classifier.variable?(type) ? count : 0 }
      variable_total.to_f / @total
    end

    # Fraction of all observations that carried `value`. Untracked values
    # (never seen, or dropped by the cap) count as zero.
    def value_fraction(value)
      return 0.0 if @total.zero?

      # fetch does not rely on the hash's default value and never inserts.
      @value_counts.fetch(value, 0).to_f / @total
    end

    # String-keyed snapshot of the counters. Both count hashes are copies, so
    # mutating the returned structure does not touch this object.
    def dump
      {
        "value_counts" => @value_counts.dup,
        "type_counts" => @type_counts.transform_keys(&:to_s),
        "total" => @total,
        "max_values" => @max_values,
      }
    end

    # Rebuilds an instance from the structure produced by #dump. A missing
    # or null "max_values" falls back to DEFAULT_MAX_VALUES so the restored
    # object can keep observing (a nil cap would make #observe raise).
    def self.from_dump(h)
      stats = new(max_values: h["max_values"] || DEFAULT_MAX_VALUES)
      stats.send(
        :restore,
        total: h["total"],
        value_counts: h["value_counts"],
        type_counts: h["type_counts"].transform_keys(&:to_sym)
      )
      stats
    end

    private

    # Replaces the counters wholesale; used only by .from_dump. The hashes
    # are re-wrapped so they keep the zero default that #observe relies on.
    def restore(total:, value_counts:, type_counts:)
      @total = total
      @value_counts = Hash.new(0).merge(value_counts)
      @type_counts = Hash.new(0).merge(type_counts)
    end
  end
end
@@ -20,9 +20,34 @@ module Iriq
20
20
  TS_SECONDS_RANGE = 1_000_000_000..9_999_999_999
21
21
  TS_MILLIS_RANGE = 1_000_000_000_000..9_999_999_999_999
22
22
 
23
+ # Bounded memoization: classification of a given string is pure, so
24
+ # repeat segments (e.g. /users in countless paths) can be cached. Cap
25
+ # keeps the cache from unbounded growth when inputs are dominated by
26
+ # unique IDs.
27
+ CACHE_MAX = 10_000
28
+
29
+ def initialize
30
+ @cache = {}
31
+ end
32
+
23
33
  def classify(segment)
24
34
  return :literal if segment.nil? || segment.empty?
25
35
 
36
+ cached = @cache[segment]
37
+ return cached if cached
38
+
39
+ @cache.clear if @cache.size >= CACHE_MAX
40
+ @cache[segment] = compute_classification(segment)
41
+ end
42
+
43
+ # Anything except :literal is considered variable for shape/explain.
44
+ def variable?(type)
45
+ type != :literal
46
+ end
47
+
48
+ private
49
+
50
+ def compute_classification(segment)
26
51
  case segment
27
52
  when UUID_RE then :uuid
28
53
  when DATE_RE then :date
@@ -36,13 +61,6 @@ module Iriq
36
61
  end
37
62
  end
38
63
 
39
- # Anything except :literal is considered variable for shape/explain.
40
- def variable?(type)
41
- type != :literal
42
- end
43
-
44
- private
45
-
46
64
  def classify_integer(segment)
47
65
  n = segment.to_i
48
66
  return :timestamp if TS_MILLIS_RANGE.cover?(n)
@@ -50,5 +68,11 @@ module Iriq
50
68
 
51
69
  :integer_id
52
70
  end
71
+
72
+ public
73
+
74
+ # Shared singleton — preferred default for callers that don't bring
75
+ # their own classifier (saves a per-call allocation).
76
+ DEFAULT = new
53
77
  end
54
78
  end
@@ -0,0 +1,32 @@
1
module Iriq
  # Walks a segment list and annotates each entry with the type, whether it's
  # variable, and a RESTful "hint" (e.g. `user_id`) when a variable segment
  # follows a literal one — `/users/123` ⇒ hint `user_id`.
  module SegmentHints
    module_function

    # Returns one hash per segment, in order, with the keys
    # :value, :type, :variable and :hint (nil when no hint applies).
    #
    # `classifier` must respond to #classify(segment) and #variable?(type).
    def derive(segments, classifier)
      segments.each_with_index.map do |segment, index|
        type = classifier.classify(segment)
        variable = classifier.variable?(type)
        {
          value: segment,
          type: type,
          variable: variable,
          hint: hint_for(segments, index, type, variable, classifier),
        }
      end
    end

    # Hint for the segment at index `i`, or nil. A hint exists only when the
    # segment is variable and the segment right before it is a non-empty
    # literal; it is the singularized previous segment plus `_uuid` for
    # :uuid types and `_id` for every other variable type.
    def hint_for(segments, i, type, variable, classifier)
      return nil unless variable && i.positive?

      prev = segments[i - 1]
      # A blank previous segment has no name to build on; without this guard
      # it would yield a bare "_id"/"_uuid" hint if the classifier treats
      # blank segments as literals.
      return nil if prev.nil? || prev.empty?
      return nil unless classifier.classify(prev) == :literal

      suffix = type == :uuid ? "_uuid" : "_id"
      "#{Inflector.singularize(prev)}#{suffix}"
    end
  end
end
@@ -0,0 +1,43 @@
1
+ require "json"
2
+
3
module Iriq
  module Storage
    # Json wraps Memory with load-from-file at open and save-to-file at close.
    # Same JSON shape as the pre-Storage release, so files round-trip across
    # versions.
    class Json < Memory
      attr_reader :path

      # path: default file for #save (may be nil). Remaining options are
      # forwarded to Memory#initialize.
      def initialize(path: nil, **opts)
        super(**opts)
        @path = path
      end

      # Builds a backend bound to `path` and loads it when the file exists
      # and is non-empty.
      def self.open(path, **opts)
        store = new(path: path, **opts)
        # File.size? is nil for a missing or zero-length file, so one call
        # covers both checks.
        store.load!(path) if File.size?(path)
        store
      end

      # Replaces the in-memory state with the JSON document at `path` and
      # remembers `path` as the default save target. An empty file is left
      # alone (state and path unchanged). Errors from File.read / JSON.parse
      # propagate to the caller.
      def load!(path)
        data = File.read(path)
        return self if data.empty?

        load_dump!(JSON.parse(data))
        @path = path
        self
      end

      # save writes atomically (tmp + rename). Defaults to the path passed at
      # open(); pass an explicit path to write elsewhere.
      #
      # Raises ArgumentError when neither an argument nor a stored path is
      # available. The temporary file is removed if writing or renaming
      # fails, and its name includes the process id so two processes saving
      # to the same target do not write through the same temp file.
      def save(path = nil)
        target = path || @path
        raise ArgumentError, "no path provided" unless target

        tmp = "#{target}.#{Process.pid}.tmp"
        begin
          File.write(tmp, JSON.generate(to_dump))
          File.rename(tmp, target)
        ensure
          # After a successful rename the temp file no longer exists.
          File.delete(tmp) if File.exist?(tmp)
        end
      end
    end
  end
end
@@ -0,0 +1,138 @@
1
module Iriq
  module Storage
    # Memory is the canonical backend — every other backend either wraps it
    # (Json) or implements the same surface against an external store (Sqlite).
    #
    # The contract is small enough to enumerate up top:
    #
    #   increment_host(host)
    #   increment_path_length(length)
    #   increment_raw_shape(shape)
    #   increment_fingerprint(shape)
    #   observe_position(host, prefix, value, type)
    #   add_to_cluster(key, host, scheme, shape, identifier)
    #
    #   host_counts / path_length_counts / raw_shape_counts / fingerprint_counts
    #   position_stats(host, prefix)
    #   clusters / cluster_size
    #
    #   transaction { ... }   # backends may batch within
    #   flush                 # commit pending writes (no-op for Memory)
    #   close                 # release resources
    class Memory
      attr_reader :max_values_per_position,
                  :host_counts, :path_length_counts, :raw_shape_counts, :fingerprint_counts

      # Path of the underlying file, if any. Memory backends are unpathed;
      # Json/Sqlite override.
      def path
        nil
      end

      # classifier: stored for subclasses/backends; not consulted by Memory's
      #   own methods.
      # max_values_per_position: cap handed to every PositionStats created by
      #   #observe_position.
      def initialize(classifier: SegmentClassifier::DEFAULT,
                     max_values_per_position: PositionStats::DEFAULT_MAX_VALUES)
        @classifier = classifier
        @max_values_per_position = max_values_per_position
        @host_counts = Hash.new(0)
        @path_length_counts = Hash.new(0)
        @raw_shape_counts = Hash.new(0)
        @fingerprint_counts = Hash.new(0)
        @position_stats = {}
        @clusters = {}
      end

      # Yields the backend itself; nothing is batched in memory.
      def transaction
        yield self
      end

      # Yields without arguments; nothing is batched in memory.
      def batch
        yield
      end

      def flush; end

      def close; end

      # No-op for in-memory; subclasses override.
      def save(path = nil); end

      # --- Increments -------------------------------------------------------

      # nil hosts are ignored.
      def increment_host(host)
        @host_counts[host] += 1 if host
      end

      def increment_path_length(length)
        @path_length_counts[length] += 1
      end

      def increment_raw_shape(shape)
        @raw_shape_counts[shape] += 1
      end

      def increment_fingerprint(shape)
        @fingerprint_counts[shape] += 1
      end

      # Records `value`/`type` in the stats bucket for (host, prefix),
      # creating the bucket on first use.
      def observe_position(host, prefix, value, type)
        stats = @position_stats[[host, prefix]] ||= PositionStats.new(max_values: @max_values_per_position)
        stats.observe(value, type)
      end

      # Adds `identifier` to the cluster stored under `key`, creating the
      # cluster on first use, and returns the cluster.
      def add_to_cluster(key, host, scheme, shape, identifier)
        cluster = @clusters[key] ||= Cluster.new(key: key, host: host, scheme: scheme, shape: shape)
        cluster.add(identifier)
        cluster
      end

      # --- Reads ------------------------------------------------------------

      # Stats bucket for (host, prefix), or nil when nothing was observed.
      def position_stats(host, prefix)
        @position_stats[[host, prefix]]
      end

      # Yields each ([host, prefix], stats) pair.
      def each_position_stats(&block)
        @position_stats.each(&block)
      end

      def clusters
        @clusters.values
      end

      def cluster_size
        @clusters.size
      end

      # --- Bulk load (used by JSON backend) --------------------------------

      # Replaces all state with the contents of a hash shaped like #to_dump's
      # result (string keys, as produced by a JSON round trip). Every section
      # is optional: a missing or null section loads as empty, and a missing
      # "max_values_per_position" falls back to the PositionStats default.
      # Returns self.
      def load_dump!(h)
        @host_counts = counter_from(h["host_counts"])
        @path_length_counts = counter_from((h["path_length_counts"] || {}).transform_keys(&:to_i))
        @raw_shape_counts = counter_from(h["raw_shape_counts"])
        @fingerprint_counts = counter_from(h["fingerprint_counts"])
        @max_values_per_position = h["max_values_per_position"] || PositionStats::DEFAULT_MAX_VALUES
        @position_stats = (h["position_stats"] || []).each_with_object({}) do |(host, prefix, stats_dump), acc|
          acc[[host, prefix]] = PositionStats.from_dump(stats_dump)
        end
        cluster_dumps = (h["clusterer"] || {})["clusters"] || {}
        @clusters = cluster_dumps.transform_values { |c| Cluster.from_dump(c) }
        self
      end

      # Serializable snapshot. Integer path lengths become string keys and
      # position stats become [host, prefix, dump] triples, since JSON object
      # keys must be strings.
      def to_dump
        {
          "host_counts" => @host_counts,
          "path_length_counts" => @path_length_counts.transform_keys(&:to_s),
          "raw_shape_counts" => @raw_shape_counts,
          "fingerprint_counts" => @fingerprint_counts,
          "max_values_per_position" => @max_values_per_position,
          "position_stats" => @position_stats.map { |(host, prefix), s| [host, prefix, s.dump] },
          "clusterer" => {
            "clusters" => @clusters.transform_values(&:dump),
          },
        }
      end

      private

      # Zero-defaulting counter hash seeded from `raw` (nil loads as empty).
      def counter_from(raw)
        Hash.new(0).merge(raw || {})
      end
    end
  end
end