iriq 0.0.1 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,27 +1,45 @@
1
1
  module Iriq
2
2
  # Converts a sequence of path segments into a route-shape string by
3
- # replacing variable segments with `{type}` placeholders.
3
+ # replacing variable segments with `{hint}` placeholders, falling back to
4
+ # `{type}` when no hint is available.
4
5
  #
5
6
  # PathShape.for(["users", "123", "orders", "456"])
6
- # # => "/users/{integer_id}/orders/{integer_id}"
7
+ # # => "/users/{user_id}/orders/{order_id}"
8
+ #
9
+ # Pass `hints: false` to use raw types instead:
10
+ #
11
+ # PathShape.for(["users", "123"], hints: false)
12
+ # # => "/users/{integer_id}"
7
13
  class PathShape
8
- def initialize(classifier: SegmentClassifier.new)
14
+ def initialize(classifier: SegmentClassifier::DEFAULT, hints: true)
9
15
  @classifier = classifier
16
+ @hints = hints
10
17
  end
11
18
 
12
19
  def for(segments)
13
20
  return "/" if segments.nil? || segments.empty?
14
21
 
15
- "/" + segments.map { |s| shape_segment(s) }.join("/")
22
+ from_entries(SegmentHints.derive(segments, @classifier))
23
+ end
24
+
25
+ # Build a shape string from already-derived SegmentHints entries.
26
+ # Used by Corpus to avoid re-deriving entries per observation when it
27
+ # needs multiple shape variants (raw and hinted).
28
+ def from_entries(entries)
29
+ return "/" if entries.nil? || entries.empty?
30
+
31
+ "/" + entries.map { |e| shape_token(e) }.join("/")
16
32
  end
17
33
 
18
- def shape_segment(segment)
19
- type = @classifier.classify(segment)
20
- @classifier.variable?(type) ? "{#{type}}" : segment
34
+ def shape_token(entry)
35
+ return entry[:value] unless entry[:variable]
36
+
37
+ placeholder = @hints ? (entry[:hint] || entry[:type]) : entry[:type]
38
+ "{#{placeholder}}"
21
39
  end
22
40
 
23
- def self.for(segments, classifier: SegmentClassifier.new)
24
- new(classifier: classifier).for(segments)
41
+ def self.for(segments, classifier: SegmentClassifier::DEFAULT, hints: true)
42
+ new(classifier: classifier, hints: hints).for(segments)
25
43
  end
26
44
  end
27
45
  end
@@ -0,0 +1,64 @@
1
module Iriq
  # Rolling frequency counts for a single (host, prefix-shape, position).
  #
  # Value cardinality is capped so a high-entropy position (UUIDs, timestamps)
  # doesn't grow memory without bound: `total` and `type_counts` count every
  # observation, but only the first `max_values` distinct values are tracked
  # individually in `value_counts`.
  class PositionStats
    DEFAULT_MAX_VALUES = 1_000

    attr_reader :value_counts, :type_counts, :total, :max_values

    # @param max_values [Integer] cap on the number of distinct values that
    #   get their own counter
    def initialize(max_values: DEFAULT_MAX_VALUES)
      @value_counts = Hash.new(0)
      @type_counts = Hash.new(0)
      @total = 0
      @max_values = max_values
    end

    # Record one observation of `value`, classified as `type`.
    #
    # @param value [Object] the observed segment value (hash key)
    # @param type [Symbol] the classifier's type for that value
    def observe(value, type)
      @total += 1
      @type_counts[type] += 1
      # Already-tracked values keep counting after the cap is hit; only
      # previously unseen values are dropped.
      @value_counts[value] += 1 if @value_counts.size < @max_values || @value_counts.key?(value)
    end

    # @return [Integer] number of distinct values tracked (at most the cap,
    #   unless a larger dump was loaded)
    def cardinality
      @value_counts.size
    end

    # Fraction of observations whose type was variable (i.e. classifier said
    # not :literal).
    #
    # @param classifier [#variable?] object answering `variable?(type)`
    # @return [Float] 0.0 when nothing has been observed
    def variable_fraction(classifier)
      return 0.0 if @total.zero?

      variable_total = @type_counts.sum { |type, count| classifier.variable?(type) ? count : 0 }
      variable_total.to_f / @total
    end

    # @param value [Object] a value that may or may not be tracked
    # @return [Float] share of all observations that were exactly `value`;
    #   0.0 for untracked values or when nothing has been observed
    def value_fraction(value)
      return 0.0 if @total.zero?

      # `fetch` avoids relying on the hash default and never inserts a key.
      @value_counts.fetch(value, 0).to_f / @total
    end

    # Serializable snapshot. Type keys are stringified; `value_counts` is the
    # live hash (not a copy), so callers should serialize rather than mutate.
    #
    # @return [Hash{String => Object}]
    def dump
      {
        "value_counts" => @value_counts,
        "type_counts" => @type_counts.transform_keys(&:to_s),
        "total" => @total,
        "max_values" => @max_values,
      }
    end

    # Rebuild an instance from a hash shaped like the output of #dump.
    #
    # Missing or nil keys fall back to empty counts / zero total / the default
    # cap, so a partial dump loads into a usable object instead of one that
    # raises on the next #observe.
    #
    # @param h [Hash{String => Object}]
    # @return [PositionStats]
    def self.from_dump(h)
      stats = new(max_values: h["max_values"] || DEFAULT_MAX_VALUES)
      stats.send(
        :restore,
        total: h["total"] || 0,
        value_counts: h["value_counts"] || {},
        type_counts: (h["type_counts"] || {}).transform_keys(&:to_sym)
      )
      stats
    end

    private

    # Replace the counters wholesale; only used by .from_dump.
    def restore(total:, value_counts:, type_counts:)
      @total = total
      # `update` on a fresh Hash.new(0) copies the entries and keeps the
      # zero default that #observe relies on.
      @value_counts = Hash.new(0).update(value_counts)
      @type_counts = Hash.new(0).update(type_counts)
      self
    end
  end
end
@@ -20,9 +20,34 @@ module Iriq
20
20
  TS_SECONDS_RANGE = 1_000_000_000..9_999_999_999
21
21
  TS_MILLIS_RANGE = 1_000_000_000_000..9_999_999_999_999
22
22
 
23
+ # Bounded memoization: classification of a given string is pure, so
24
+ # repeat segments (e.g. /users in countless paths) can be cached. Cap
25
+ # keeps the cache from unbounded growth when inputs are dominated by
26
+ # unique IDs.
27
+ CACHE_MAX = 10_000
28
+
29
+ def initialize
30
+ @cache = {}
31
+ end
32
+
23
33
  def classify(segment)
24
34
  return :literal if segment.nil? || segment.empty?
25
35
 
36
+ cached = @cache[segment]
37
+ return cached if cached
38
+
39
+ @cache.clear if @cache.size >= CACHE_MAX
40
+ @cache[segment] = compute_classification(segment)
41
+ end
42
+
43
+ # Anything except :literal is considered variable for shape/explain.
44
+ def variable?(type)
45
+ type != :literal
46
+ end
47
+
48
+ private
49
+
50
+ def compute_classification(segment)
26
51
  case segment
27
52
  when UUID_RE then :uuid
28
53
  when DATE_RE then :date
@@ -36,13 +61,6 @@ module Iriq
36
61
  end
37
62
  end
38
63
 
39
- # Anything except :literal is considered variable for shape/explain.
40
- def variable?(type)
41
- type != :literal
42
- end
43
-
44
- private
45
-
46
64
  def classify_integer(segment)
47
65
  n = segment.to_i
48
66
  return :timestamp if TS_MILLIS_RANGE.cover?(n)
@@ -50,5 +68,11 @@ module Iriq
50
68
 
51
69
  :integer_id
52
70
  end
71
+
72
+ public
73
+
74
+ # Shared singleton — preferred default for callers that don't bring
75
+ # their own classifier (saves a per-call allocation).
76
+ DEFAULT = new
53
77
  end
54
78
  end
@@ -0,0 +1,32 @@
1
module Iriq
  # Walks a segment list and annotates each entry with the type, whether it's
  # variable, and a RESTful "hint" (e.g. `user_id`) when a variable segment
  # follows a literal one — `/users/123` ⇒ hint `user_id`.
  module SegmentHints
    module_function

    # Annotate every segment.
    #
    # @param segments [Array<String>, nil] path segments in order
    # @param classifier [#classify, #variable?] segment classifier
    # @return [Array<Hash>] one `{value:, type:, variable:, hint:}` hash per
    #   segment; empty when `segments` is nil
    def derive(segments, classifier)
      return [] if segments.nil?

      segments.each_with_index.map do |seg, i|
        type = classifier.classify(seg)
        variable = classifier.variable?(type)
        {
          value: seg,
          type: type,
          variable: variable,
          hint: hint_for(segments, i, type, variable, classifier),
        }
      end
    end

    # Name a variable segment after the literal segment just before it.
    #
    # @param segments [Array<String>] the full segment list
    # @param i [Integer] index of the segment being hinted
    # @param type [Symbol] that segment's classified type
    # @param variable [Boolean] whether that segment is variable
    # @param classifier [#classify] used to check the preceding segment
    # @return [String, nil] e.g. "user_id" / "file_uuid"; nil when the segment
    #   is not variable, is first, or is not preceded by a non-empty literal
    def hint_for(segments, i, type, variable, classifier)
      return nil unless variable && i.positive?

      prev = segments[i - 1]
      # A blank predecessor (e.g. from "//123") may still classify as
      # :literal but has no name to borrow; without this guard the hint
      # would be the bare suffix "_id".
      return nil if prev.nil? || prev.empty?
      return nil unless classifier.classify(prev) == :literal

      base = Inflector.singularize(prev)
      suffix = type == :uuid ? "_uuid" : "_id"
      "#{base}#{suffix}"
    end
  end
end
data/lib/iriq/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Iriq
2
- VERSION = "0.0.1"
2
+ VERSION = "0.1.0"
3
3
  end
data/lib/iriq.rb CHANGED
@@ -1,13 +1,19 @@
1
1
  require "iriq/version"
2
2
  require "iriq/errors"
3
+ require "iriq/inflector"
3
4
  require "iriq/identifier"
4
5
  require "iriq/parser"
5
6
  require "iriq/segment_classifier"
7
+ require "iriq/segment_hints"
6
8
  require "iriq/path_shape"
7
9
  require "iriq/normalizer"
8
10
  require "iriq/explanation"
9
11
  require "iriq/cluster"
10
12
  require "iriq/clusterer"
13
+ require "iriq/position_stats"
14
+ require "iriq/observation"
15
+ require "iriq/corpus"
16
+ require "iriq/extractor"
11
17
  require "iriq/cli"
12
18
 
13
19
  module Iriq
@@ -23,5 +29,9 @@ module Iriq
23
29
  def explain(input)
24
30
  Explanation.explain(input)
25
31
  end
32
+
33
+ def extract(text)
34
+ Extractor.new.extract(text)
35
+ end
26
36
  end
27
37
  end
@@ -0,0 +1,81 @@
1
+ #!/usr/bin/env ruby
2
+ # Performance benchmark for the main hot paths in Iriq.
3
+ #
4
+ # Usage:
5
+ # bundle exec script/benchmark.rb # default sizes
6
+ # bundle exec script/benchmark.rb 50000 # custom "large" size
7
+ #
8
+ # Inputs are generated deterministically from IriGenerator so results are
9
+ # comparable across runs.
10
+
11
+ require "benchmark"
12
+ require "tempfile"
13
+
14
+ $LOAD_PATH.unshift File.expand_path("../lib", __dir__)
15
+ $LOAD_PATH.unshift File.expand_path("../spec/support", __dir__)
16
+ require "iriq"
17
+ require "iri_generator"
18
+
19
# Workload sizes. LARGE comes from the first CLI argument (default 10_000);
# SMALL is a tenth of it, kept within 1..1_000; HUGE is ten times LARGE.
LARGE = Integer(ARGV[0] || 10_000)
abort "size must be a positive integer (got #{LARGE})" unless LARGE.positive?

# Clamp to at least 1: for LARGE < 10 the integer division yields 0, which
# would make `LARGE / SMALL` below raise ZeroDivisionError.
SMALL = (LARGE / 10).clamp(1, 1_000)
HUGE = LARGE * 10

puts "Iriq benchmark — Ruby #{RUBY_VERSION}, Iriq #{Iriq::VERSION}"
puts "Sizes: small=#{SMALL}, large=#{LARGE}, huge=#{HUGE}"
puts

# Same seed for every size so runs are comparable.
small_urls = IriGenerator.urls(count: SMALL, seed: 1)
large_urls = IriGenerator.urls(count: LARGE, seed: 1)
huge_urls = IriGenerator.urls(count: HUGE, seed: 1)

# ~ LARGE URLs embedded in prose: the SMALL-url paragraph repeated
# LARGE / SMALL times.
text_blob = small_urls.map { |u| "Some prose about #{u} here, also random words." }.join(" ") * (LARGE / SMALL)
puts "Text blob: #{text_blob.bytesize / 1024} KB (~#{LARGE} URLs embedded)"
puts

# Timing (Benchmark::Tms) per scenario, keyed for the summary below.
results = {}
Benchmark.bm(42) do |x|
  results[:parse] = x.report("parse #{LARGE} URLs") { large_urls.each { |u| Iriq.parse(u) } }
  results[:normalize] = x.report("normalize #{LARGE} URLs (deterministic)") { large_urls.each { |u| Iriq.normalize(u) } }
  results[:explain] = x.report("explain #{LARGE} URLs (deterministic)") { large_urls.each { |u| Iriq.explain(u) } }
  results[:extract] = x.report("extract from ~#{text_blob.bytesize / 1024} KB text") { Iriq.extract(text_blob) }

  results[:observe_small] = x.report("Corpus.observe #{SMALL} URLs") do
    c = Iriq::Corpus.new
    small_urls.each { |u| c.observe(u) }
  end
  results[:observe_large] = x.report("Corpus.observe #{LARGE} URLs") do
    c = Iriq::Corpus.new
    large_urls.each { |u| c.observe(u) }
  end
  results[:observe_huge] = x.report("Corpus.observe #{HUGE} URLs") do
    c = Iriq::Corpus.new
    huge_urls.each { |u| c.observe(u) }
  end

  results[:roundtrip] = x.report("Corpus save+load (#{LARGE} observations)") do
    c = Iriq::Corpus.new
    large_urls.each { |u| c.observe(u) }
    # Block form closes and removes the temp file when done.
    Tempfile.open(["iriq-bench", ".json"]) do |f|
      c.save(f.path)
      Iriq::Corpus.load(f.path)
    end
  end
end

puts
puts "Throughput summary:"
[
  [:parse, LARGE, "URLs/s"],
  [:normalize, LARGE, "URLs/s"],
  [:explain, LARGE, "URLs/s"],
  [:observe_small, SMALL, "URLs/s"],
  [:observe_large, LARGE, "URLs/s"],
  [:observe_huge, HUGE, "URLs/s"],
].each do |key, n, unit|
  # Wall-clock throughput: items divided by elapsed real seconds.
  per_sec = n / results[key].real
  printf(" %-30s %12s %s\n", key, per_sec.round.to_s, unit)
end

extract_mb = text_blob.bytesize / (1024.0 * 1024.0)
printf(" %-30s %12s MB/s\n", :extract, (extract_mb / results[:extract].real).round(2).to_s)
data/script/memory.rb ADDED
@@ -0,0 +1,121 @@
1
+ #!/usr/bin/env ruby
2
+ # Memory profile for the main code paths in Iriq.
3
+ #
4
+ # Usage:
5
+ # bundle exec script/memory.rb # default sizes
6
+ # bundle exec script/memory.rb 50000 # custom corpus size
7
+ #
8
+ # Reports retained memory per operation, cache footprints, and memory
9
+ # growth across corpus sizes (to verify linear scaling — no leaks).
10
+
11
+ require "objspace"
12
+
13
+ $LOAD_PATH.unshift File.expand_path("../lib", __dir__)
14
+ $LOAD_PATH.unshift File.expand_path("../spec/support", __dir__)
15
+ require "iriq"
16
+ require "iri_generator"
17
+
18
# Corpus size for the detailed sections; first CLI argument, default 10_000.
CORPUS_SIZE = Integer(ARGV[0] || 10_000)
# A zero size would end up in SIZES and make the per-observation division
# in the growth table raise ZeroDivisionError, so reject it up front.
abort "corpus size must be a positive integer (got #{CORPUS_SIZE})" unless CORPUS_SIZE.positive?

# Sizes for the growth table: the fixed ladder plus CORPUS_SIZE, deduplicated
# (array union) and ascending. Built in one expression and frozen instead of
# mutating the constant after definition.
SIZES = ([1_000, 10_000, 100_000] | [CORPUS_SIZE]).sort.freeze
22
+
23
# Bytes → B / KB / MB string for display.
#
# The unit is chosen by magnitude so negative deltas (memory released
# between two measurements) scale like positive ones instead of always
# printing as raw bytes.
#
# @param n [Integer] byte count, may be negative
# @return [String] e.g. "512 B", "1.5 KB", "-3.00 MB"
def fmt_bytes(n)
  magnitude = n.abs
  if magnitude < 1024
    "#{n} B"
  elsif magnitude < 1024 * 1024
    format("%.1f KB", n / 1024.0)
  else
    format("%.2f MB", n / (1024.0 * 1024.0))
  end
end
33
+
34
# Measure how many bytes a block leaves behind.
#
# Runs a full GC, snapshots ObjectSpace.memsize_of_all, calls the block,
# runs GC again and snapshots once more. The block's return value is still
# referenced during the second GC, so whatever it keeps alive is counted.
#
# @return [Array(Integer, Object)] byte delta (may be negative) and the
#   block's return value
def measure_retained(&block)
  GC.start
  baseline = ObjectSpace.memsize_of_all
  outcome = block.call
  GC.start
  delta = ObjectSpace.memsize_of_all - baseline
  [delta, outcome]
end
43
+
44
# Empty the memoization caches so each scenario starts clean: the shared
# default classifier's `@cache`, and the Inflector's `@cache` when that
# instance variable is set.
def reset_caches
  classifier_cache = Iriq::SegmentClassifier::DEFAULT.instance_variable_get(:@cache)
  classifier_cache.clear

  inflector_cache = Iriq::Inflector.instance_variable_get(:@cache)
  inflector_cache&.clear
end
49
+
50
puts "Iriq memory profile — Ruby #{RUBY_VERSION}, Iriq #{Iriq::VERSION}"
puts

# -- Section 1: memory growth across corpus sizes --
# One row per size: bytes retained by a freshly filled corpus, the same
# figure per observation, and the number of objects allocated meanwhile.
puts "── corpus retained memory by N (verifies linear growth) ──"
print format(" %-12s %-14s %-14s %-10s\n", "N obs", "retained", "per obs", "allocs")
SIZES.each do |n|
  reset_caches
  urls = IriGenerator.urls(count: n, seed: 1)
  allocated_at_start = GC.stat(:total_allocated_objects)
  retained, _corpus = measure_retained do
    filled = Iriq::Corpus.new
    urls.each { |url| filled.observe(url) }
    filled
  end
  allocated = GC.stat(:total_allocated_objects) - allocated_at_start
  print format(" %-12s %-14s %-14s %-10s\n", n, fmt_bytes(retained), fmt_bytes(retained / n), allocated)
end
puts

# -- Section 2: corpus state breakdown at CORPUS_SIZE --
puts "── corpus state breakdown at N=#{CORPUS_SIZE} ──"
reset_caches
urls = IriGenerator.urls(count: CORPUS_SIZE, seed: 1)
corpus = Iriq::Corpus.new
urls.each { |url| corpus.observe(url) }
tracked_values = corpus.position_stats.sum { |_, stats| stats.value_counts.size }
puts " unique hosts: #{corpus.host_counts.size}"
puts " unique fingerprints: #{corpus.fingerprint_counts.size}"
puts " unique raw shapes: #{corpus.raw_shape_counts.size}"
puts " clusters: #{corpus.size}"
puts " position_stats entries: #{corpus.position_stats.size}"
puts " total observed values: #{tracked_values}"
puts

# -- Section 3: cache footprints --
# Caches are read as left behind by the Section 2 run.
puts "── memoization caches ──"
classifier_entries = Iriq::SegmentClassifier::DEFAULT.instance_variable_get(:@cache)
inflector_entries = Iriq::Inflector.instance_variable_get(:@cache) || {}
puts " classifier cache: #{classifier_entries.size} entries (cap #{Iriq::SegmentClassifier::CACHE_MAX})"
puts " inflector cache: #{inflector_entries.size} entries (cap #{Iriq::Inflector::CACHE_MAX})"
puts

# -- Section 4: per-operation memory cost --
puts "── retained memory per operation (N=#{CORPUS_SIZE}) ──"
urls = IriGenerator.urls(count: CORPUS_SIZE, seed: 1)
text_blob = urls.map { |url| "Some prose about #{url} here." }.join(" ")

# Label => operation, measured in insertion order with caches reset first.
operations = {
  "parse #{CORPUS_SIZE} URLs (discarded after)" => -> { urls.each { |url| Iriq.parse(url) } },
  "normalize #{CORPUS_SIZE} URLs" => -> { urls.each { |url| Iriq.normalize(url) } },
  "explain #{CORPUS_SIZE} URLs" => -> { urls.each { |url| Iriq.explain(url) } },
  "extract from #{fmt_bytes(text_blob.bytesize)} prose" => -> { Iriq.extract(text_blob) },
  "Corpus.observe #{CORPUS_SIZE} URLs" => lambda {
    filled = Iriq::Corpus.new
    urls.each { |url| filled.observe(url) }
    filled
  },
}
operations.each do |label, op|
  reset_caches
  retained, _result = measure_retained(&op)
  print format(" %-50s %s\n", label, fmt_bytes(retained))
end
puts

# -- Section 5: persistence overhead --
# Reuses the URL list from Section 4; reports the size of the saved JSON.
puts "── save/load roundtrip (N=#{CORPUS_SIZE}) ──"
require "tempfile"
reset_caches
corpus = Iriq::Corpus.new
urls.each { |url| corpus.observe(url) }
Tempfile.open(["iriq-mem", ".json"]) do |file|
  corpus.save(file.path)
  bytes = File.size(file.path)
  bytes_per_obs = bytes.to_f / CORPUS_SIZE
  puts " JSON file on disk: #{fmt_bytes(bytes)}"
  puts " ratio: #{format("%.2f bytes/obs", bytes_per_obs)}"
end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: iriq
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Daniel Pepper
@@ -83,14 +83,22 @@ files:
83
83
  - lib/iriq/cli.rb
84
84
  - lib/iriq/cluster.rb
85
85
  - lib/iriq/clusterer.rb
86
+ - lib/iriq/corpus.rb
86
87
  - lib/iriq/errors.rb
87
88
  - lib/iriq/explanation.rb
89
+ - lib/iriq/extractor.rb
88
90
  - lib/iriq/identifier.rb
91
+ - lib/iriq/inflector.rb
89
92
  - lib/iriq/normalizer.rb
93
+ - lib/iriq/observation.rb
90
94
  - lib/iriq/parser.rb
91
95
  - lib/iriq/path_shape.rb
96
+ - lib/iriq/position_stats.rb
92
97
  - lib/iriq/segment_classifier.rb
98
+ - lib/iriq/segment_hints.rb
93
99
  - lib/iriq/version.rb
100
+ - script/benchmark.rb
101
+ - script/memory.rb
94
102
  homepage: https://github.com/dpep/iriq
95
103
  licenses:
96
104
  - MIT