iriq 0.0.1 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/Gemfile.lock +2 -2
- data/README.md +227 -33
- data/lib/iriq/cli.rb +288 -100
- data/lib/iriq/cluster.rb +23 -0
- data/lib/iriq/clusterer.rb +32 -17
- data/lib/iriq/corpus.rb +268 -0
- data/lib/iriq/explanation.rb +6 -22
- data/lib/iriq/extractor.rb +125 -0
- data/lib/iriq/identifier.rb +11 -3
- data/lib/iriq/inflector.rb +145 -0
- data/lib/iriq/normalizer.rb +11 -8
- data/lib/iriq/observation.rb +25 -0
- data/lib/iriq/path_shape.rb +27 -9
- data/lib/iriq/position_stats.rb +64 -0
- data/lib/iriq/segment_classifier.rb +31 -7
- data/lib/iriq/segment_hints.rb +32 -0
- data/lib/iriq/version.rb +1 -1
- data/lib/iriq.rb +10 -0
- data/script/benchmark.rb +81 -0
- data/script/memory.rb +121 -0
- metadata +9 -1
data/lib/iriq/path_shape.rb
CHANGED
|
@@ -1,27 +1,45 @@
|
|
|
1
1
|
module Iriq
  # Converts a sequence of path segments into a route-shape string by
  # replacing variable segments with `{hint}` placeholders, falling back to
  # `{type}` when no hint is available.
  #
  #   PathShape.for(["users", "123", "orders", "456"])
  #   # => "/users/{user_id}/orders/{order_id}"
  #
  # Pass `hints: false` to use raw types instead:
  #
  #   PathShape.for(["users", "123"], hints: false)
  #   # => "/users/{integer_id}"
  class PathShape
    # classifier - object handed to SegmentHints.derive to classify segments.
    # hints      - when true, prefer an entry's :hint over its :type for
    #              placeholders; when false, always use the :type.
    def initialize(classifier: SegmentClassifier::DEFAULT, hints: true)
      @classifier = classifier
      @hints = hints
    end

    # Build the shape string for raw path segments.
    # Returns "/" for nil or empty input.
    def for(segments)
      return "/" if segments.nil? || segments.empty?

      from_entries(SegmentHints.derive(segments, @classifier))
    end

    # Build a shape string from already-derived SegmentHints entries.
    # Used by Corpus to avoid re-deriving entries per observation when it
    # needs multiple shape variants (raw and hinted).
    # Returns "/" for nil or empty input.
    def from_entries(entries)
      return "/" if entries.nil? || entries.empty?

      "/" + entries.map { |e| shape_token(e) }.join("/")
    end

    # Render a single entry (a hash with :value, :variable, :type and :hint
    # keys): the literal value for non-variable entries, otherwise a
    # `{placeholder}`.
    def shape_token(entry)
      return entry[:value] unless entry[:variable]

      # With hints enabled the hint wins, but an entry without one still
      # falls back to its type.
      placeholder = @hints ? (entry[:hint] || entry[:type]) : entry[:type]
      "{#{placeholder}}"
    end

    # Convenience wrapper: build a one-off PathShape and shape `segments`.
    def self.for(segments, classifier: SegmentClassifier::DEFAULT, hints: true)
      new(classifier: classifier, hints: hints).for(segments)
    end
  end
end
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
module Iriq
  # Rolling frequency counts for a single (host, prefix-shape, position).
  # Value cardinality is capped so a high-entropy position (UUIDs, timestamps)
  # doesn't grow memory without bound — `total` keeps growing accurately, but
  # only the first `max_values` distinct values are tracked individually.
  class PositionStats
    DEFAULT_MAX_VALUES = 1_000

    attr_reader :value_counts, :type_counts, :total, :max_values

    # max_values - maximum number of distinct values tracked in value_counts.
    def initialize(max_values: DEFAULT_MAX_VALUES)
      @value_counts = Hash.new(0)
      @type_counts = Hash.new(0)
      @total = 0
      @max_values = max_values
    end

    # Record one observation of `value` classified as `type`.
    # `total` and the per-type count always advance; the per-value count only
    # advances while there is room for a new value or the value is already
    # tracked.
    def observe(value, type)
      @total += 1
      @type_counts[type] += 1
      if @value_counts.size < @max_values || @value_counts.key?(value)
        @value_counts[value] += 1
      end
    end

    # Number of distinct values currently tracked (never more than the cap
    # once it has been reached).
    def cardinality
      @value_counts.size
    end

    # Fraction of observations whose type was variable (i.e. classifier said
    # not :literal). Returns 0.0 before anything has been observed.
    def variable_fraction(classifier)
      return 0.0 if @total.zero?

      var = @type_counts.sum { |t, c| classifier.variable?(t) ? c : 0 }
      var.to_f / @total
    end

    # Fraction of all observations that carried exactly `value`. Untracked
    # values count as zero. Returns 0.0 before anything has been observed.
    def value_fraction(value)
      return 0.0 if @total.zero?

      @value_counts.fetch(value, 0).to_f / @total
    end

    # Serializable snapshot with string keys. Both count hashes are copies,
    # so mutating the snapshot cannot corrupt the live counters.
    def dump
      {
        "value_counts" => @value_counts.dup,
        "type_counts" => @type_counts.transform_keys(&:to_s),
        "total" => @total,
        "max_values" => @max_values,
      }
    end

    # Rebuild a PositionStats from a hash shaped like the output of #dump.
    # Missing keys fall back to an empty/default state instead of raising
    # here or leaving an object that crashes on the next #observe.
    def self.from_dump(h)
      stats = new(max_values: h["max_values"] || DEFAULT_MAX_VALUES)
      # Hash.new(0).merge keeps the zero default on the restored hashes.
      vc = Hash.new(0).merge(h["value_counts"] || {})
      tc = Hash.new(0).merge((h["type_counts"] || {}).transform_keys(&:to_sym))
      stats.instance_variable_set(:@total, h["total"] || 0)
      stats.instance_variable_set(:@value_counts, vc)
      stats.instance_variable_set(:@type_counts, tc)
      stats
    end
  end
end
|
|
@@ -20,9 +20,34 @@ module Iriq
|
|
|
20
20
|
TS_SECONDS_RANGE = 1_000_000_000..9_999_999_999
|
|
21
21
|
TS_MILLIS_RANGE = 1_000_000_000_000..9_999_999_999_999
|
|
22
22
|
|
|
23
|
+
# Bounded memoization: classification of a given string is pure, so
|
|
24
|
+
# repeat segments (e.g. /users in countless paths) can be cached. Cap
|
|
25
|
+
# keeps the cache from unbounded growth when inputs are dominated by
|
|
26
|
+
# unique IDs.
|
|
27
|
+
CACHE_MAX = 10_000
|
|
28
|
+
|
|
29
|
+
def initialize
|
|
30
|
+
@cache = {}
|
|
31
|
+
end
|
|
32
|
+
|
|
23
33
|
def classify(segment)
|
|
24
34
|
return :literal if segment.nil? || segment.empty?
|
|
25
35
|
|
|
36
|
+
cached = @cache[segment]
|
|
37
|
+
return cached if cached
|
|
38
|
+
|
|
39
|
+
@cache.clear if @cache.size >= CACHE_MAX
|
|
40
|
+
@cache[segment] = compute_classification(segment)
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
# Anything except :literal is considered variable for shape/explain.
|
|
44
|
+
def variable?(type)
|
|
45
|
+
type != :literal
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
private
|
|
49
|
+
|
|
50
|
+
def compute_classification(segment)
|
|
26
51
|
case segment
|
|
27
52
|
when UUID_RE then :uuid
|
|
28
53
|
when DATE_RE then :date
|
|
@@ -36,13 +61,6 @@ module Iriq
|
|
|
36
61
|
end
|
|
37
62
|
end
|
|
38
63
|
|
|
39
|
-
# Anything except :literal is considered variable for shape/explain.
|
|
40
|
-
def variable?(type)
|
|
41
|
-
type != :literal
|
|
42
|
-
end
|
|
43
|
-
|
|
44
|
-
private
|
|
45
|
-
|
|
46
64
|
def classify_integer(segment)
|
|
47
65
|
n = segment.to_i
|
|
48
66
|
return :timestamp if TS_MILLIS_RANGE.cover?(n)
|
|
@@ -50,5 +68,11 @@ module Iriq
|
|
|
50
68
|
|
|
51
69
|
:integer_id
|
|
52
70
|
end
|
|
71
|
+
|
|
72
|
+
public
|
|
73
|
+
|
|
74
|
+
# Shared singleton — preferred default for callers that don't bring
|
|
75
|
+
# their own classifier (saves a per-call allocation).
|
|
76
|
+
DEFAULT = new
|
|
53
77
|
end
|
|
54
78
|
end
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
module Iriq
  # Walks a segment list and annotates each entry with the type, whether it's
  # variable, and a RESTful "hint" (e.g. `user_id`) when a variable segment
  # follows a literal one — `/users/123` ⇒ hint `user_id`.
  module SegmentHints
    module_function

    # Returns one hash per segment with :value, :type, :variable and :hint
    # keys. A nil segment list yields an empty array.
    def derive(segments, classifier)
      return [] if segments.nil?

      segments.each_with_index.map do |seg, i|
        type = classifier.classify(seg)
        variable = classifier.variable?(type)
        {
          value: seg,
          type: type,
          variable: variable,
          hint: hint_for(segments, i, type, variable, classifier),
        }
      end
    end

    # Hint for the segment at index `i`, or nil when none applies: the
    # segment must be variable and directly preceded by a non-empty literal
    # segment. UUID-typed segments get a `_uuid` suffix, everything else `_id`.
    def hint_for(segments, i, type, variable, classifier)
      return nil unless variable && i > 0

      prev = segments[i - 1]
      # A blank predecessor has no name to build a hint from; without this
      # guard it would produce a bare "_id"/"_uuid" hint.
      return nil if prev.nil? || prev.empty?
      return nil unless classifier.classify(prev) == :literal

      base = Inflector.singularize(prev)
      suffix = type == :uuid ? "_uuid" : "_id"
      "#{base}#{suffix}"
    end
  end
end
|
data/lib/iriq/version.rb
CHANGED
data/lib/iriq.rb
CHANGED
|
@@ -1,13 +1,19 @@
|
|
|
1
1
|
require "iriq/version"
|
|
2
2
|
require "iriq/errors"
|
|
3
|
+
require "iriq/inflector"
|
|
3
4
|
require "iriq/identifier"
|
|
4
5
|
require "iriq/parser"
|
|
5
6
|
require "iriq/segment_classifier"
|
|
7
|
+
require "iriq/segment_hints"
|
|
6
8
|
require "iriq/path_shape"
|
|
7
9
|
require "iriq/normalizer"
|
|
8
10
|
require "iriq/explanation"
|
|
9
11
|
require "iriq/cluster"
|
|
10
12
|
require "iriq/clusterer"
|
|
13
|
+
require "iriq/position_stats"
|
|
14
|
+
require "iriq/observation"
|
|
15
|
+
require "iriq/corpus"
|
|
16
|
+
require "iriq/extractor"
|
|
11
17
|
require "iriq/cli"
|
|
12
18
|
|
|
13
19
|
module Iriq
|
|
@@ -23,5 +29,9 @@ module Iriq
|
|
|
23
29
|
def explain(input)
|
|
24
30
|
Explanation.explain(input)
|
|
25
31
|
end
|
|
32
|
+
|
|
33
|
+
def extract(text)
|
|
34
|
+
Extractor.new.extract(text)
|
|
35
|
+
end
|
|
26
36
|
end
|
|
27
37
|
end
|
data/script/benchmark.rb
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
# Performance benchmark for the main hot paths in Iriq.
#
# Usage:
#   bundle exec script/benchmark.rb          # default sizes
#   bundle exec script/benchmark.rb 50000    # custom "large" size
#
# Inputs are generated deterministically from IriGenerator so results are
# comparable across runs.

require "benchmark"
require "tempfile"

$LOAD_PATH.unshift File.expand_path("../lib", __dir__)
$LOAD_PATH.unshift File.expand_path("../spec/support", __dir__)
require "iriq"
require "iri_generator"

LARGE = Integer(ARGV[0] || 10_000)
abort "size must be a positive integer (got #{LARGE})" unless LARGE.positive?

# A tenth of LARGE, capped at 1_000 and floored at 1: sizes below 10 would
# otherwise make SMALL zero and the `LARGE / SMALL` repeat count below raise
# ZeroDivisionError.
SMALL = (LARGE / 10).clamp(1, 1_000)
HUGE = LARGE * 10

puts "Iriq benchmark — Ruby #{RUBY_VERSION}, Iriq #{Iriq::VERSION}"
puts "Sizes: small=#{SMALL}, large=#{LARGE}, huge=#{HUGE}"
puts

small_urls = IriGenerator.urls(count: SMALL, seed: 1)
large_urls = IriGenerator.urls(count: LARGE, seed: 1)
huge_urls = IriGenerator.urls(count: HUGE, seed: 1)

# ~ LARGE URLs embedded in prose
text_blob = small_urls.map { |u| "Some prose about #{u} here, also random words." }.join(" ") * (LARGE / SMALL)
puts "Text blob: #{text_blob.bytesize / 1024} KB (~#{LARGE} URLs embedded)"
puts

# Each report returns its timing object; keep them for the summary below.
results = {}
Benchmark.bm(42) do |x|
  results[:parse] = x.report("parse #{LARGE} URLs") { large_urls.each { |u| Iriq.parse(u) } }
  results[:normalize] = x.report("normalize #{LARGE} URLs (deterministic)") { large_urls.each { |u| Iriq.normalize(u) } }
  results[:explain] = x.report("explain #{LARGE} URLs (deterministic)") { large_urls.each { |u| Iriq.explain(u) } }
  results[:extract] = x.report("extract from ~#{text_blob.bytesize / 1024} KB text") { Iriq.extract(text_blob) }

  results[:observe_small] = x.report("Corpus.observe #{SMALL} URLs") do
    c = Iriq::Corpus.new
    small_urls.each { |u| c.observe(u) }
  end
  results[:observe_large] = x.report("Corpus.observe #{LARGE} URLs") do
    c = Iriq::Corpus.new
    large_urls.each { |u| c.observe(u) }
  end
  results[:observe_huge] = x.report("Corpus.observe #{HUGE} URLs") do
    c = Iriq::Corpus.new
    huge_urls.each { |u| c.observe(u) }
  end

  results[:roundtrip] = x.report("Corpus save+load (#{LARGE} observations)") do
    c = Iriq::Corpus.new
    large_urls.each { |u| c.observe(u) }
    Tempfile.open(["iriq-bench", ".json"]) do |f|
      c.save(f.path)
      Iriq::Corpus.load(f.path)
    end
  end
end

puts
puts "Throughput summary:"
[
  [:parse, LARGE, "URLs/s"],
  [:normalize, LARGE, "URLs/s"],
  [:explain, LARGE, "URLs/s"],
  [:observe_small, SMALL, "URLs/s"],
  [:observe_large, LARGE, "URLs/s"],
  [:observe_huge, HUGE, "URLs/s"],
].each do |key, n, unit|
  elapsed = results[key].real
  # A zero elapsed time would give an infinite rate, and Float#round raises
  # FloatDomainError on infinity.
  per_sec = elapsed.positive? ? (n / elapsed).round.to_s : "n/a"
  printf(" %-30s %12s %s\n", key, per_sec, unit)
end

extract_mb = text_blob.bytesize / (1024.0 * 1024.0)
extract_elapsed = results[:extract].real
extract_rate = extract_elapsed.positive? ? (extract_mb / extract_elapsed).round(2).to_s : "n/a"
printf(" %-30s %12s MB/s\n", :extract, extract_rate)
|
data/script/memory.rb
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# Memory profile for the main code paths in Iriq.
|
|
3
|
+
#
|
|
4
|
+
# Usage:
|
|
5
|
+
# bundle exec script/memory.rb # default sizes
|
|
6
|
+
# bundle exec script/memory.rb 50000 # custom corpus size
|
|
7
|
+
#
|
|
8
|
+
# Reports retained memory per operation, cache footprints, and memory
|
|
9
|
+
# growth across corpus sizes (to verify linear scaling — no leaks).
|
|
10
|
+
|
|
11
|
+
require "objspace"
|
|
12
|
+
|
|
13
|
+
$LOAD_PATH.unshift File.expand_path("../lib", __dir__)
|
|
14
|
+
$LOAD_PATH.unshift File.expand_path("../spec/support", __dir__)
|
|
15
|
+
require "iriq"
|
|
16
|
+
require "iri_generator"
|
|
17
|
+
|
|
18
|
+
# Corpus size for the detailed sections; first CLI argument, default 10_000.
CORPUS_SIZE = Integer(ARGV[0] || 10_000)
# The report divides by the observation count, so zero or negative sizes
# cannot be profiled.
abort "corpus size must be a positive integer (got #{CORPUS_SIZE})" unless CORPUS_SIZE.positive?

# Sizes profiled in the growth table: the fixed ladder plus CORPUS_SIZE,
# de-duplicated and ascending. Built in one expression and frozen instead of
# mutating the constant after assignment.
SIZES = ([1_000, 10_000, 100_000] | [CORPUS_SIZE]).sort.freeze
|
|
22
|
+
|
|
23
|
+
# Bytes → B / KB / MB string for display.
# The unit is chosen by magnitude, so negative deltas (memory shrank between
# two measurements) are scaled the same way as positive ones and keep their
# sign.
def fmt_bytes(n)
  magnitude = n.abs
  if magnitude < 1024
    "#{n} B"
  elsif magnitude < 1024 * 1024
    format("%.1f KB", n / 1024.0)
  else
    format("%.2f MB", n / (1024.0 * 1024.0))
  end
end
|
|
33
|
+
|
|
34
|
+
# Measure how much memory a block leaves behind.
# Runs a GC pass and samples ObjectSpace.memsize_of_all before the block,
# then does the same after it; the block's return value stays referenced
# while the second sample is taken.
# Returns [delta_in_bytes, block_result].
def measure_retained(&block)
  GC.start
  baseline = ObjectSpace.memsize_of_all

  outcome = block.call

  GC.start
  delta = ObjectSpace.memsize_of_all - baseline
  [delta, outcome]
end
|
|
43
|
+
|
|
44
|
+
# Empty the memoization caches so every scenario starts from a clean slate.
# Both caches are reached through their @cache instance variable; the
# Inflector one may not be set yet, in which case it is left alone.
def reset_caches
  classifier_cache = Iriq::SegmentClassifier::DEFAULT.instance_variable_get(:@cache)
  classifier_cache.clear

  inflector_cache = Iriq::Inflector.instance_variable_get(:@cache)
  inflector_cache.clear unless inflector_cache.nil?
end
|
|
49
|
+
|
|
50
|
+
puts "Iriq memory profile — Ruby #{RUBY_VERSION}, Iriq #{Iriq::VERSION}"
puts

# -- Section 1: memory growth across corpus sizes --
puts "── corpus retained memory by N (verifies linear growth) ──"
printf(" %-12s %-14s %-14s %-10s\n", "N obs", "retained", "per obs", "allocs")
SIZES.each do |n|
  reset_caches
  urls = IriGenerator.urls(count: n, seed: 1)
  alloc_before = GC.stat(:total_allocated_objects)
  # The corpus is returned from the block so it is still referenced when the
  # "after" sample is taken.
  retained, _ = measure_retained do
    c = Iriq::Corpus.new
    urls.each { |u| c.observe(u) }
    c
  end
  alloc_total = GC.stat(:total_allocated_objects) - alloc_before
  # Guard the per-observation average: a size of 0 would otherwise raise
  # ZeroDivisionError.
  per_obs = n.zero? ? 0 : retained / n
  printf(" %-12s %-14s %-14s %-10s\n", n, fmt_bytes(retained), fmt_bytes(per_obs), alloc_total)
end
puts

# -- Section 2: corpus state breakdown at CORPUS_SIZE --
puts "── corpus state breakdown at N=#{CORPUS_SIZE} ──"
reset_caches
urls = IriGenerator.urls(count: CORPUS_SIZE, seed: 1)
corpus = Iriq::Corpus.new
urls.each { |u| corpus.observe(u) }
puts " unique hosts: #{corpus.host_counts.size}"
puts " unique fingerprints: #{corpus.fingerprint_counts.size}"
puts " unique raw shapes: #{corpus.raw_shape_counts.size}"
puts " clusters: #{corpus.size}"
puts " position_stats entries: #{corpus.position_stats.size}"
puts " total observed values: #{corpus.position_stats.sum { |_, s| s.value_counts.size }}"
puts

# -- Section 3: cache footprints --
# Read straight after the Section 2 run, so the sizes reflect what that run
# left in the caches.
puts "── memoization caches ──"
classifier_cache = Iriq::SegmentClassifier::DEFAULT.instance_variable_get(:@cache)
inflector_cache = Iriq::Inflector.instance_variable_get(:@cache) || {}
puts " classifier cache: #{classifier_cache.size} entries (cap #{Iriq::SegmentClassifier::CACHE_MAX})"
puts " inflector cache: #{inflector_cache.size} entries (cap #{Iriq::Inflector::CACHE_MAX})"
puts

# -- Section 4: per-operation memory cost --
puts "── retained memory per operation (N=#{CORPUS_SIZE}) ──"
urls = IriGenerator.urls(count: CORPUS_SIZE, seed: 1)
text_blob = urls.map { |u| "Some prose about #{u} here." }.join(" ")

[
  ["parse #{CORPUS_SIZE} URLs (discarded after)", -> { urls.each { |u| Iriq.parse(u) } }],
  ["normalize #{CORPUS_SIZE} URLs", -> { urls.each { |u| Iriq.normalize(u) } }],
  ["explain #{CORPUS_SIZE} URLs", -> { urls.each { |u| Iriq.explain(u) } }],
  ["extract from #{fmt_bytes(text_blob.bytesize)} prose", -> { Iriq.extract(text_blob) }],
  ["Corpus.observe #{CORPUS_SIZE} URLs", -> { c = Iriq::Corpus.new; urls.each { |u| c.observe(u) }; c }],
].each do |label, op|
  reset_caches
  retained, _ = measure_retained(&op)
  printf(" %-50s %s\n", label, fmt_bytes(retained))
end
puts

# -- Section 5: persistence overhead --
puts "── save/load roundtrip (N=#{CORPUS_SIZE}) ──"
require "tempfile"
reset_caches
corpus = Iriq::Corpus.new
urls.each { |u| corpus.observe(u) }
Tempfile.open(["iriq-mem", ".json"]) do |f|
  corpus.save(f.path)
  bytes = File.size(f.path)
  puts " JSON file on disk: #{fmt_bytes(bytes)}"
  puts " ratio: #{format("%.2f bytes/obs", bytes.to_f / CORPUS_SIZE)}"
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: iriq
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.0
|
|
4
|
+
version: 0.1.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Daniel Pepper
|
|
@@ -83,14 +83,22 @@ files:
|
|
|
83
83
|
- lib/iriq/cli.rb
|
|
84
84
|
- lib/iriq/cluster.rb
|
|
85
85
|
- lib/iriq/clusterer.rb
|
|
86
|
+
- lib/iriq/corpus.rb
|
|
86
87
|
- lib/iriq/errors.rb
|
|
87
88
|
- lib/iriq/explanation.rb
|
|
89
|
+
- lib/iriq/extractor.rb
|
|
88
90
|
- lib/iriq/identifier.rb
|
|
91
|
+
- lib/iriq/inflector.rb
|
|
89
92
|
- lib/iriq/normalizer.rb
|
|
93
|
+
- lib/iriq/observation.rb
|
|
90
94
|
- lib/iriq/parser.rb
|
|
91
95
|
- lib/iriq/path_shape.rb
|
|
96
|
+
- lib/iriq/position_stats.rb
|
|
92
97
|
- lib/iriq/segment_classifier.rb
|
|
98
|
+
- lib/iriq/segment_hints.rb
|
|
93
99
|
- lib/iriq/version.rb
|
|
100
|
+
- script/benchmark.rb
|
|
101
|
+
- script/memory.rb
|
|
94
102
|
homepage: https://github.com/dpep/iriq
|
|
95
103
|
licenses:
|
|
96
104
|
- MIT
|