iriq 0.2.0 → 0.30.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +78 -0
- data/CLAUDE.md +128 -41
- data/Gemfile.lock +4 -4
- data/Makefile +80 -23
- data/README.md +225 -347
- data/completions/_iriq +52 -0
- data/completions/iriq.bash +70 -0
- data/docs/ARCHITECTURE.md +223 -0
- data/docs/ROADMAP.md +190 -0
- data/iriq.gemspec +2 -2
- data/lib/iriq/cli.rb +398 -46
- data/lib/iriq/cluster.rb +284 -12
- data/lib/iriq/corpus.rb +318 -36
- data/lib/iriq/cross_host_shape.rb +37 -0
- data/lib/iriq/event.rb +22 -0
- data/lib/iriq/evidence.rb +114 -0
- data/lib/iriq/explanation.rb +1 -1
- data/lib/iriq/normalizer.rb +71 -29
- data/lib/iriq/path_shape.rb +30 -24
- data/lib/iriq/position.rb +75 -0
- data/lib/iriq/position_stats.rb +74 -8
- data/lib/iriq/recognizer.rb +54 -0
- data/lib/iriq/recognizer_proposal.rb +167 -0
- data/lib/iriq/recognizers/date.rb +53 -0
- data/lib/iriq/recognizers/integer.rb +37 -0
- data/lib/iriq/recognizers/uuid.rb +16 -0
- data/lib/iriq/reducer.rb +37 -0
- data/lib/iriq/registrable_domain.rb +56 -0
- data/lib/iriq/segment_classifier.rb +475 -23
- data/lib/iriq/segment_hints.rb +9 -0
- data/lib/iriq/shape.rb +106 -0
- data/lib/iriq/specificity.rb +35 -0
- data/lib/iriq/storage/memory.rb +83 -12
- data/lib/iriq/storage/sqlite.rb +216 -37
- data/lib/iriq/synthesized_recognizer.rb +56 -0
- data/lib/iriq/trace.rb +294 -0
- data/lib/iriq/version.rb +1 -1
- data/lib/iriq.rb +17 -0
- metadata +22 -3
data/lib/iriq/normalizer.rb
CHANGED
|
@@ -5,46 +5,88 @@ module Iriq
|
|
|
5
5
|
# # => "https://foo.com/users/{user_id}"
|
|
6
6
|
#
|
|
7
7
|
# The form is intended for grouping/diffing — it is not a round-trippable URL.
|
|
8
|
+
#
|
|
9
|
+
# Path + query rendering dispatches through an evidence source so the
|
|
10
|
+
# mechanical (classifier-only) and corpus-informed code paths share one
|
|
11
|
+
# entry point. When `evidence` is nil, NullEvidenceSource provides the
|
|
12
|
+
# mechanical behavior (PathShape + param-name-hint query rules). When a
|
|
13
|
+
# Corpus is passed as `evidence`, its observed Position / Cluster stats
|
|
14
|
+
# drive the rendering (variability promotion, popular outlier
|
|
15
|
+
# preservation, cluster-inferred query types).
|
|
8
16
|
module Normalizer
|
|
9
17
|
module_function
|
|
10
18
|
|
|
11
|
-
def normalize(input, classifier: SegmentClassifier::DEFAULT, hints: true)
|
|
19
|
+
def normalize(input, classifier: SegmentClassifier::DEFAULT, hints: true, evidence: nil)
|
|
12
20
|
iri = input.is_a?(Identifier) ? input : Parser.parse(input)
|
|
13
|
-
normalize_identifier(iri, classifier: classifier, hints: hints)
|
|
21
|
+
normalize_identifier(iri, classifier: classifier, hints: hints, evidence: evidence)
|
|
22
|
+
end
|
|
23
|
+
|
|
24
|
+
# Render an already-parsed identifier in its normalized (grouping) form.
#
# URNs are delegated to normalize_urn. Everything else is assembled as
# scheme, host, port, path, then query; the path and query text come
# from the evidence source (`evidence`, or a fresh NullEvidenceSource
# when none is given).
def normalize_identifier(iri, classifier: SegmentClassifier::DEFAULT, hints: true, evidence: nil)
  return normalize_urn(iri, classifier, hints) if iri.urn?

  source = evidence || NullEvidenceSource.new
  rendered = +""

  scheme = iri.scheme
  rendered << "#{scheme}://" if scheme

  host = iri.host
  rendered << host if host

  port = iri.port
  rendered << ":#{port}" if port

  rendered << source.render_path(iri, classifier, hints)

  # Only emit the `?` separator when there is at least one query param.
  params = iri.query_params
  has_query = params && !params.empty?
  rendered << "?" << source.render_query(iri, classifier) if has_query

  rendered
end
|
|
15
38
|
|
|
16
|
-
def
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
39
|
+
def normalize_urn(iri, classifier, hints)
|
|
40
|
+
return iri.canonical unless iri.scheme == "urn" && iri.nss && iri.nss.include?(":")
|
|
41
|
+
|
|
42
|
+
ns, value = iri.nss.split(":", 2)
|
|
43
|
+
entry = SegmentHints.derive([ns, value], classifier).last
|
|
44
|
+
shaped =
|
|
45
|
+
if entry[:type] == :date && (canon = SegmentClassifier.canonical_date(entry[:value]))
|
|
46
|
+
canon
|
|
47
|
+
elsif entry[:type] == :currency && (canon = SegmentClassifier.canonical_currency(entry[:value]))
|
|
48
|
+
canon
|
|
49
|
+
elsif entry[:variable]
|
|
50
|
+
"{#{(hints && entry[:hint]) || SegmentClassifier.display_type(entry[:type])}}"
|
|
27
51
|
else
|
|
28
|
-
|
|
52
|
+
entry[:value]
|
|
29
53
|
end
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
54
|
+
"urn:#{ns}:#{shaped}"
|
|
55
|
+
end
|
|
56
|
+
end
|
|
57
|
+
|
|
58
|
+
# NullEvidenceSource is the default evidence source — purely
|
|
59
|
+
# classifier-driven, no corpus signal. The Normalizer's mechanical
|
|
60
|
+
# behavior is what this produces. Implements the same {render_path,
|
|
61
|
+
# render_query} interface that Corpus implements for the corpus-informed
|
|
62
|
+
# path.
|
|
63
|
+
class NullEvidenceSource
|
|
64
|
+
def render_path(iri, classifier, hints)
|
|
65
|
+
PathShape.new(
|
|
66
|
+
classifier: classifier, hints: hints,
|
|
67
|
+
canonical_dates: true, canonical_currencies: true,
|
|
68
|
+
).for(iri.path_segments)
|
|
41
69
|
end
|
|
42
70
|
|
|
43
|
-
def
|
|
44
|
-
|
|
45
|
-
v =
|
|
71
|
+
def render_query(iri, classifier)
|
|
72
|
+
iri.query_params.keys.sort.map do |k|
|
|
73
|
+
v = iri.query_params[k]
|
|
46
74
|
type = classifier.classify(v.to_s)
|
|
47
|
-
|
|
75
|
+
# Param-name hint can lift a generic literal/opaque_id/slug into
|
|
76
|
+
# a semantic type — `?phone=unknown` becomes `{phone}`.
|
|
77
|
+
if (hint = SegmentClassifier.param_name_hint(k, type))
|
|
78
|
+
type = hint
|
|
79
|
+
end
|
|
80
|
+
shaped =
|
|
81
|
+
if type == :date && (canon = SegmentClassifier.canonical_date(v.to_s))
|
|
82
|
+
canon
|
|
83
|
+
elsif type == :currency && (canon = SegmentClassifier.canonical_currency(v.to_s))
|
|
84
|
+
canon
|
|
85
|
+
elsif classifier.variable?(type)
|
|
86
|
+
"{#{SegmentClassifier.display_type(type)}}"
|
|
87
|
+
else
|
|
88
|
+
v
|
|
89
|
+
end
|
|
48
90
|
"#{k}=#{shaped}"
|
|
49
91
|
end.join("&")
|
|
50
92
|
end
|
data/lib/iriq/path_shape.rb
CHANGED
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
module Iriq
|
|
2
|
-
#
|
|
3
|
-
#
|
|
4
|
-
#
|
|
2
|
+
# Renderer that produces a route-shape string by replacing variable
|
|
3
|
+
# segments with `{hint}` placeholders. As of v0.16 this is a thin wrapper
|
|
4
|
+
# around Iriq::Shape — kept for back-compat with callers that still want
|
|
5
|
+
# to get a string in one call.
|
|
5
6
|
#
|
|
6
7
|
# PathShape.for(["users", "123", "orders", "456"])
|
|
7
8
|
# # => "/users/{user_id}/orders/{order_id}"
|
|
@@ -9,37 +10,42 @@ module Iriq
|
|
|
9
10
|
# Pass `hints: false` to use raw types instead:
|
|
10
11
|
#
|
|
11
12
|
# PathShape.for(["users", "123"], hints: false)
|
|
12
|
-
# # => "/users/{
|
|
13
|
+
# # => "/users/{integer}"
|
|
14
|
+
#
|
|
15
|
+
# Pass `canonical_dates: true` to render date-typed segments in canonical
|
|
16
|
+
# ISO form (2024/01/15 → 2024-01-15) instead of as a `{date}` placeholder.
|
|
17
|
+
# Pass `canonical_currencies: true` for the same treatment of currency
|
|
18
|
+
# codes (`usd` → `USD`).
|
|
19
|
+
#
|
|
20
|
+
# For new code, prefer building an Iriq::Shape directly and calling
|
|
21
|
+
# `#render`. PathShape stays available for the common string-only path.
|
|
13
22
|
class PathShape
|
|
14
|
-
def initialize(classifier: SegmentClassifier::DEFAULT, hints: true
|
|
15
|
-
|
|
16
|
-
@
|
|
23
|
+
def initialize(classifier: SegmentClassifier::DEFAULT, hints: true,
|
|
24
|
+
canonical_dates: false, canonical_currencies: false)
|
|
25
|
+
@classifier = classifier
|
|
26
|
+
@hints = hints
|
|
27
|
+
@canonical_dates = canonical_dates
|
|
28
|
+
@canonical_currencies = canonical_currencies
|
|
17
29
|
end
|
|
18
30
|
|
|
19
31
|
def for(segments)
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
from_entries(SegmentHints.derive(segments, @classifier))
|
|
32
|
+
from_entries(SegmentHints.derive(segments || [], @classifier))
|
|
23
33
|
end
|
|
24
34
|
|
|
25
35
|
# Build a shape string from already-derived SegmentHints entries.
|
|
26
|
-
# Used by Corpus to avoid re-deriving entries per observation when it
|
|
27
|
-
# needs multiple shape variants (raw and hinted).
|
|
28
36
|
def from_entries(entries)
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
def shape_token(entry)
|
|
35
|
-
return entry[:value] unless entry[:variable]
|
|
36
|
-
|
|
37
|
-
placeholder = @hints ? (entry[:hint] || entry[:type]) : entry[:type]
|
|
38
|
-
"{#{placeholder}}"
|
|
37
|
+
Shape.from_entries(entries).render(
|
|
38
|
+
hints: @hints,
|
|
39
|
+
canonical_dates: @canonical_dates,
|
|
40
|
+
canonical_currencies: @canonical_currencies,
|
|
41
|
+
)
|
|
39
42
|
end
|
|
40
43
|
|
|
41
|
-
def self.for(segments, classifier: SegmentClassifier::DEFAULT, hints: true
|
|
42
|
-
|
|
44
|
+
def self.for(segments, classifier: SegmentClassifier::DEFAULT, hints: true,
|
|
45
|
+
canonical_dates: false, canonical_currencies: false)
|
|
46
|
+
new(classifier: classifier, hints: hints,
|
|
47
|
+
canonical_dates: canonical_dates,
|
|
48
|
+
canonical_currencies: canonical_currencies).for(segments)
|
|
43
49
|
end
|
|
44
50
|
end
|
|
45
51
|
end
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
module Iriq
|
|
2
|
+
# A typed slot in a host's URL structure.
|
|
3
|
+
#
|
|
4
|
+
# Two observations occupy the same Position when (host, scope, locator)
|
|
5
|
+
# match exactly. Position is the keying type used by Storage for
|
|
6
|
+
# frequency tables and by Cluster for per-slot inference.
|
|
7
|
+
#
|
|
8
|
+
# host — the EFFECTIVE host per Corpus#host_strategy. Observations of
|
|
9
|
+
# api.foo.com and app.foo.com under :registrable share the
|
|
10
|
+
# same Position. The original host stays on the Identifier.
|
|
11
|
+
# scope — :path or :query.
|
|
12
|
+
# locator — for :path, the typed prefix built up to this slot, e.g.
|
|
13
|
+
# "/orgs/{opaque_id}/users" for the integer slot in
|
|
14
|
+
# /orgs/abc/users/123. (Variable segments render as their
|
|
15
|
+
# hint or display-type, so the prefix groups across observations
|
|
16
|
+
# regardless of the specific IDs seen.)
|
|
17
|
+
# — for :query, the ?key= parameter name.
|
|
18
|
+
#
|
|
19
|
+
# Position implements value equality and is safe to use as a Hash key.
|
|
20
|
+
class Position
  # The only scopes a Position may carry.
  SCOPES = %i[path query].freeze

  attr_reader :host, :scope, :locator

  class << self
    # Build a path-scoped Position; `prefix` becomes the locator.
    def path(host:, prefix:)
      new(host: host, scope: :path, locator: prefix)
    end

    # Build a query-scoped Position; `name` becomes the locator.
    def query(host:, name:)
      new(host: host, scope: :query, locator: name)
    end

    # Rebuild a Position from the string-keyed hash produced by #to_dump.
    def from_dump(h)
      new(host: h["host"], scope: h["scope"].to_sym, locator: h["locator"])
    end
  end

  # Raises ArgumentError when `scope` is not one of SCOPES.
  def initialize(host:, scope:, locator:)
    unless SCOPES.include?(scope)
      raise ArgumentError, "scope must be one of #{SCOPES.inspect}"
    end

    @host = host
    @scope = scope
    @locator = locator
  end

  def path?
    @scope == :path
  end

  def query?
    @scope == :query
  end

  # Value equality: same class family and identical host/scope/locator.
  def ==(other)
    return false unless other.is_a?(Position)

    other.host == @host && other.scope == @scope && other.locator == @locator
  end
  alias eql? ==

  # Hash derived from the same triple as #==, so equal Positions collide
  # as Hash keys.
  def hash
    [@host, @scope, @locator].hash
  end

  # Symbol-keyed view of the triple.
  def to_h
    { host: @host, scope: @scope, locator: @locator }
  end

  def to_s
    format("Position(%s, %s, %s)", @host.inspect, @scope, @locator.inspect)
  end
  alias inspect to_s

  # String-keyed form with the scope emitted as a String; the inverse of
  # Position.from_dump.
  def to_dump
    { "host" => @host, "scope" => @scope.to_s, "locator" => @locator }
  end
end
|
|
75
|
+
end
|
data/lib/iriq/position_stats.rb
CHANGED
|
@@ -3,16 +3,29 @@ module Iriq
|
|
|
3
3
|
# Value cardinality is capped so a high-entropy position (UUIDs, timestamps)
|
|
4
4
|
# doesn't grow memory without bound — `total` keeps growing accurately, but
|
|
5
5
|
# only the first `max_values` distinct values are tracked individually.
|
|
6
|
+
# Existing tracked values still receive increments after the cap is hit;
|
|
7
|
+
# only NEW distinct values are dropped.
|
|
6
8
|
class PositionStats
|
|
7
|
-
DEFAULT_MAX_VALUES =
|
|
9
|
+
DEFAULT_MAX_VALUES = 5_000
|
|
8
10
|
|
|
9
|
-
attr_reader :value_counts, :type_counts, :total, :max_values
|
|
11
|
+
attr_reader :value_counts, :type_counts, :total, :max_values,
|
|
12
|
+
:numeric_count, :numeric_min, :numeric_max, :numeric_sum
|
|
13
|
+
|
|
14
|
+
NUMERIC_TYPES = %i[integer float].freeze
|
|
10
15
|
|
|
11
16
|
def initialize(max_values: DEFAULT_MAX_VALUES)
|
|
12
|
-
@value_counts
|
|
13
|
-
@type_counts
|
|
14
|
-
@total
|
|
15
|
-
@max_values
|
|
17
|
+
@value_counts = Hash.new(0)
|
|
18
|
+
@type_counts = Hash.new(0)
|
|
19
|
+
@total = 0
|
|
20
|
+
@max_values = max_values
|
|
21
|
+
# Range stats for numeric observations only. Lets the corpus
|
|
22
|
+
# promote /articles/2024 etc. to :year when all values land in
|
|
23
|
+
# 1900..2100, and surfaces min/max/avg on ParamSummary for
|
|
24
|
+
# general numeric params.
|
|
25
|
+
@numeric_count = 0
|
|
26
|
+
@numeric_min = nil
|
|
27
|
+
@numeric_max = nil
|
|
28
|
+
@numeric_sum = 0.0
|
|
16
29
|
end
|
|
17
30
|
|
|
18
31
|
def observe(value, type)
|
|
@@ -21,8 +34,31 @@ module Iriq
|
|
|
21
34
|
if @value_counts.size < @max_values || @value_counts.key?(value)
|
|
22
35
|
@value_counts[value] += 1
|
|
23
36
|
end
|
|
37
|
+
record_numeric(value, type)
|
|
38
|
+
end
|
|
39
|
+
|
|
40
|
+
# Mean of the numeric observations recorded so far; nil when there are none.
def numeric_avg
  @numeric_count.zero? ? nil : @numeric_sum / @numeric_count
end
|
|
25
45
|
|
|
46
|
+
private
|
|
47
|
+
|
|
48
|
+
# Fold one observation into the numeric range stats (count/min/max/sum).
#
# Only observations whose classified `type` is in NUMERIC_TYPES are
# considered. `value` is parsed with Kernel#Float; values that do not
# parse are ignored. Values that parse to a non-finite Float (for
# example an out-of-range digit string that becomes Infinity) are also
# ignored, because a single such value would permanently turn
# min/max/sum (and therefore the average) into Infinity or NaN.
def record_numeric(value, type)
  return unless NUMERIC_TYPES.include?(type)

  # `exception: false` yields nil on unparseable input without the
  # blanket `rescue` modifier, which would also hide unrelated errors.
  n = Float(value, exception: false)
  return unless n&.finite?

  @numeric_count += 1
  @numeric_min = n if @numeric_min.nil? || n < @numeric_min
  @numeric_max = n if @numeric_max.nil? || n > @numeric_max
  @numeric_sum += n
end
|
|
59
|
+
|
|
60
|
+
public
|
|
61
|
+
|
|
26
62
|
def cardinality
|
|
27
63
|
@value_counts.size
|
|
28
64
|
end
|
|
@@ -42,13 +78,37 @@ module Iriq
|
|
|
42
78
|
(@value_counts[value] || 0).to_f / @total
|
|
43
79
|
end
|
|
44
80
|
|
|
81
|
+
# Most common type. On count ties, breaks lexicographically by type
|
|
82
|
+
# symbol name so the result is deterministic and matches Go's
|
|
83
|
+
# DominantType (Go's map iteration is randomized).
|
|
84
|
+
# Type with the highest observation count, or nil when no types have
# been recorded. When counts tie, the type whose name sorts first
# (by `to_s`) wins, so the answer does not depend on iteration order.
def dominant_type
  winner = nil
  top = -1
  @type_counts.each_pair do |type, count|
    next if count < top
    next if count == top && type.to_s >= winner.to_s

    winner = type
    top = count
  end
  winner
end
|
|
95
|
+
|
|
45
96
|
def dump
|
|
46
|
-
|
|
47
|
-
|
|
97
|
+
# Dup the hashes so callers can mutate the dump structure (test
|
|
98
|
+
# fixtures, post-processing) without aliasing the live state.
|
|
99
|
+
out = {
|
|
100
|
+
"value_counts" => @value_counts.dup,
|
|
48
101
|
"type_counts" => @type_counts.transform_keys(&:to_s),
|
|
49
102
|
"total" => @total,
|
|
50
103
|
"max_values" => @max_values,
|
|
51
104
|
}
|
|
105
|
+
if @numeric_count.positive?
|
|
106
|
+
out["numeric_count"] = @numeric_count
|
|
107
|
+
out["numeric_min"] = @numeric_min
|
|
108
|
+
out["numeric_max"] = @numeric_max
|
|
109
|
+
out["numeric_sum"] = @numeric_sum
|
|
110
|
+
end
|
|
111
|
+
out
|
|
52
112
|
end
|
|
53
113
|
|
|
54
114
|
def self.from_dump(h)
|
|
@@ -58,6 +118,12 @@ module Iriq
|
|
|
58
118
|
tc = Hash.new(0).merge(h["type_counts"].transform_keys(&:to_sym))
|
|
59
119
|
stats.instance_variable_set(:@value_counts, vc)
|
|
60
120
|
stats.instance_variable_set(:@type_counts, tc)
|
|
121
|
+
if h["numeric_count"]
|
|
122
|
+
stats.instance_variable_set(:@numeric_count, h["numeric_count"])
|
|
123
|
+
stats.instance_variable_set(:@numeric_min, h["numeric_min"])
|
|
124
|
+
stats.instance_variable_set(:@numeric_max, h["numeric_max"])
|
|
125
|
+
stats.instance_variable_set(:@numeric_sum, h["numeric_sum"])
|
|
126
|
+
end
|
|
61
127
|
stats
|
|
62
128
|
end
|
|
63
129
|
end
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
module Iriq
|
|
2
|
+
# Pluggable single-type classifier.
|
|
3
|
+
#
|
|
4
|
+
# A Recognizer encapsulates "this string-shape implies this type" plus the
|
|
5
|
+
# canonical form (if any). The ensemble-based SegmentClassifier consults
|
|
6
|
+
# Recognizers in order and picks the first that fires. (Scored-ensemble
|
|
7
|
+
# voting comes in a follow-up; for now each fire is decisive.)
|
|
8
|
+
#
|
|
9
|
+
# try(segment) -> { type:, confidence:, specificity:, canonical:, notes: } | nil
|
|
10
|
+
# nil — this Recognizer does not claim the segment.
|
|
11
|
+
# type — symbol from the recognized vocabulary.
|
|
12
|
+
# confidence — float in [0, 1]. Phase-1 step 2 always returns 1.0
|
|
13
|
+
# when a Recognizer fires; calibration arrives with the scored
|
|
14
|
+
# ensemble in step 4.
|
|
15
|
+
# canonical — canonical form (e.g. ISO date for :date). nil ≡ "use input".
|
|
16
|
+
# notes — optional array of strings the Trace view may surface.
|
|
17
|
+
#
|
|
18
|
+
# Recognizers are instantiated once and shared (they hold no per-call
|
|
19
|
+
# state). See Iriq::Recognizers::UUID / DATE / INTEGER for the built-ins.
|
|
20
|
+
class Recognizer
  # Subclasses override this to return a verdict Hash for `segment`, or
  # nil when they do not claim it. The base implementation always raises.
  def try(_segment)
    raise NotImplementedError
  end

  # Ask every recognizer, in order, for a verdict on `segment` and return
  # the verdict with the highest specificity × confidence score (a
  # missing factor counts as 0.0). A later verdict replaces the current
  # leader only when its score is strictly higher, so ties stay with the
  # earlier recognizer. Returns nil when nothing fires.
  def self.ensemble(segment, *recognizers)
    winner, = recognizers.inject([nil, -1.0]) do |(leader, high), recognizer|
      verdict = recognizer.try(segment)
      next [leader, high] unless verdict

      weight = (verdict[:specificity] || 0.0) * (verdict[:confidence] || 0.0)
      weight > high ? [verdict, weight] : [leader, high]
    end
    winner
  end
end
|
|
51
|
+
|
|
52
|
+
module Recognizers
|
|
53
|
+
end
|
|
54
|
+
end
|
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
require "set"
|
|
2
|
+
|
|
3
|
+
module Iriq
|
|
4
|
+
# A suggestion that a new Recognizer should be added to the system.
|
|
5
|
+
#
|
|
6
|
+
# Emitted by Corpus#propose_recognizers. NOT automatically activated —
|
|
7
|
+
# proposals carry enough evidence for a human to judge whether to add
|
|
8
|
+
# the Recognizer to the built-in set (or, later, to register it
|
|
9
|
+
# dynamically via a public Recognizer registry).
|
|
10
|
+
#
|
|
11
|
+
# prefix — the detected shape signature (e.g. "ghp_")
|
|
12
|
+
# suggested_type — Symbol name we'd register the Recognizer under
|
|
13
|
+
# if accepted (e.g. :ghp)
|
|
14
|
+
# positions — every Position where the proposal matched
|
|
15
|
+
# hosts — distinct hosts the proposal was seen at; a high
|
|
16
|
+
# count is strong evidence the pattern isn't
|
|
17
|
+
# host-local
|
|
18
|
+
# coverage — fraction of sampled observations at affected
|
|
19
|
+
# Positions matching the proposal pattern
|
|
20
|
+
# observation_count — total matching observations across positions
|
|
21
|
+
# sample_values — up to 5 example matches, for the human reviewer
|
|
22
|
+
# strategy — the ProposalStrategy that emitted this record
|
|
23
|
+
class RecognizerProposal
  attr_reader :prefix, :suggested_type, :positions, :hosts,
              :coverage, :confidence, :observation_count,
              :sample_values, :strategy

  # Build an immutable-ish proposal record.
  #
  # `positions`, `hosts` and `sample_values` are copied before being
  # frozen, so the collections the caller passed in stay mutable.
  # `confidence` is derived from coverage and host count unless an
  # explicit (non-nil) value is supplied.
  def initialize(prefix:, suggested_type:, positions:, hosts:,
                 coverage:, observation_count:, sample_values:,
                 strategy:, confidence: nil)
    @prefix = prefix
    @suggested_type = suggested_type
    @positions = positions.dup.freeze
    @hosts = hosts.is_a?(Set) ? hosts.dup.freeze : Set.new(hosts).freeze
    @coverage = coverage
    @observation_count = observation_count
    @sample_values = sample_values.dup.freeze
    @strategy = strategy
    @confidence = confidence.nil? ? compute_confidence : confidence
  end

  # Symbol-keyed Hash view. Positions are rendered through their own
  # #to_h and hosts are emitted as a sorted Array.
  def to_h
    {
      prefix: @prefix,
      suggested_type: @suggested_type,
      positions: @positions.map(&:to_h),
      hosts: @hosts.to_a.sort,
      coverage: @coverage,
      confidence: @confidence,
      observation_count: @observation_count,
      sample_values: @sample_values,
      strategy: @strategy,
    }
  end

  private

  # Confidence = coverage + linear cross-host boost, capped at 1.0.
  # A single-host proposal gets its raw coverage (no boost); each
  # additional host adds ProposalStrategy::CROSS_HOST_BOOST_PER_HOST.
  # The extra-host count is floored at zero so an empty host set cannot
  # push confidence below the raw coverage.
  def compute_confidence
    extra_hosts = [@hosts.size - 1, 0].max
    score = @coverage + extra_hosts * ProposalStrategy::CROSS_HOST_BOOST_PER_HOST
    score > 1.0 ? 1.0 : score
  end
end
|
|
69
|
+
|
|
70
|
+
# Pluggable proposal-detection strategies. Each strategy.propose(storage, **opts)
|
|
71
|
+
# returns an array of RecognizerProposal. Adding a new detection rule =
|
|
72
|
+
# add a class with #propose; register it via DEFAULTS.
|
|
73
|
+
module ProposalStrategy
  # Default minimum total matching observations across positions before
  # we'll emit a proposal. Below this the signal is too noisy.
  DEFAULT_MIN_OBSERVATIONS = 20
  # Fraction of observations at affected Positions that must match the
  # proposal pattern.
  DEFAULT_MIN_COVERAGE = 0.7
  # Minimum number of distinct hosts the proposal must appear at.
  # Raising it to 2+ favors cross-host patterns over host-local ones.
  DEFAULT_MIN_HOSTS = 1
  # Confidence boost added per additional host beyond the first;
  # single-host patterns get no boost.
  CROSS_HOST_BOOST_PER_HOST = 0.05

  # Detects `<prefix>_<alphanumeric>` values at positions whose dominant
  # type is slug or opaque_id — e.g. GitHub PAT (`ghp_…`) or Stripe
  # customer ID (`cus_…`) shapes. The pattern is anchored at both ends
  # and the suffix must be purely alphanumeric, so values with further
  # separators (`my-cool-post`, `red_team_member`, `sk_test_…`) do not
  # match and cannot trigger a proposal.
  class PrefixUnderscoreId
    PATTERN = /\A([a-z]+)_([A-Za-z0-9]+)\z/.freeze
    NAME = :prefix_underscore_id

    # Scan every (position, stats) pair yielded by
    # `storage.each_position_stats` and return an Array of
    # RecognizerProposal, one per detected prefix that clears all three
    # thresholds, ordered by descending confidence and then by prefix.
    #
    # min_observations — minimum summed count of matching values.
    # min_coverage     — minimum matching_count / total observations at
    #                    the positions where the prefix was seen.
    # min_hosts        — minimum number of distinct position hosts.
    def propose(storage,
                min_observations: DEFAULT_MIN_OBSERVATIONS,
                min_coverage: DEFAULT_MIN_COVERAGE,
                min_hosts: DEFAULT_MIN_HOSTS)
      per_prefix = Hash.new { |h, k| h[k] = empty_accumulator }

      storage.each_position_stats do |position, stats|
        next unless slug_or_opaque?(stats)

        stats.value_counts.each do |value, count|
          m = PATTERN.match(value)
          next unless m

          acc = per_prefix["#{m[1]}_"]
          acc[:matching_count] += count
          # Set#add? is nil when the position was already recorded, so
          # each position's total contributes to the denominator once.
          acc[:position_observations] += stats.total if acc[:positions].add?(position)
          acc[:hosts] << position.host
          # A Set keeps one copy of a value seen at several positions,
          # so the emitted samples never contain duplicates.
          acc[:matches] << value
        end
      end

      proposals = per_prefix.filter_map do |prefix, acc|
        next nil if acc[:matching_count] < min_observations
        next nil if acc[:hosts].size < min_hosts

        coverage = acc[:matching_count].to_f / acc[:position_observations]
        next nil if coverage < min_coverage

        RecognizerProposal.new(
          prefix: prefix,
          suggested_type: prefix.chomp("_").to_sym,
          positions: acc[:positions].to_a,
          hosts: acc[:hosts],
          coverage: coverage,
          observation_count: acc[:matching_count],
          # Sort + cap to 5 so the samples are stable regardless of
          # iteration order. They are illustrative for humans;
          # alphabetical is fine.
          sample_values: acc[:matches].sort.first(5),
          strategy: NAME,
        )
      end

      proposals.sort_by { |p| [-p.confidence, p.prefix] }
    end

    private

    # Fresh per-prefix accumulator.
    def empty_accumulator
      {
        positions: Set.new,
        hosts: Set.new,
        matching_count: 0,
        position_observations: 0,
        matches: Set.new,
      }
    end

    # True when the position's most frequent type is :slug or
    # :opaque_id. Count ties are broken by type name (lexicographic),
    # the same rule PositionStats#dominant_type documents, rather than
    # by hash insertion order.
    def slug_or_opaque?(stats)
      dominant, = stats.type_counts.min_by { |type, count| [-count, type.to_s] }
      dominant == :slug || dominant == :opaque_id
    end
  end

  DEFAULTS = [PrefixUnderscoreId.new].freeze
end
|
|
167
|
+
end
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
module Iriq
|
|
2
|
+
module Recognizers
|
|
3
|
+
# ISO 8601 (YYYY-MM-DD), slash form (YYYY/MM/DD), and US-style
|
|
4
|
+
# (M/D/YYYY) date shapes. Compact YYYYMMDD lives on the Integer
|
|
5
|
+
# recognizer — it sees the digits-only input first.
|
|
6
|
+
#
|
|
7
|
+
# Conservative: DD/MM/YYYY is intentionally NOT recognized — from a
|
|
8
|
+
# bare segment we can't tell it apart from MM/DD/YYYY.
|
|
9
|
+
class Date < Recognizer
  ISO_PATTERN = /\A\d{4}-\d{2}-\d{2}\z/.freeze
  SLASH_PATTERN = %r{\A\d{4}/\d{2}/\d{2}\z}.freeze
  US_PATTERN = %r{\A(\d{1,2})/(\d{1,2})/(\d{4})\z}.freeze

  # Claim `segment` as a :date when it has one of the three supported
  # shapes; nil otherwise. Only the shape is checked here — plausibility
  # of the year/month/day values is checked by .canonical.
  def try(segment)
    # Cheap pre-filter: every supported shape contains "-" or "/".
    return nil unless segment.include?("-") || segment.include?("/")

    shapes = [ISO_PATTERN, SLASH_PATTERN, US_PATTERN]
    return nil if shapes.none? { |shape| shape.match?(segment) }

    { type: :date, confidence: 1.0, specificity: Specificity::STRUCTURED }
  end

  # Canonicalize a recognized date to ISO 8601 (YYYY-MM-DD). Returns nil
  # for nil input, for values in none of the supported shapes, and for
  # values whose parts fail .plausible?. Day-of-month validity (Feb 30,
  # Apr 31) is deliberately not checked.
  def self.canonical(value)
    return nil if value.nil?

    case value
    when ISO_PATTERN
      value if plausible?(value[0, 4], value[5, 2], value[8, 2])
    when SLASH_PATTERN
      value.tr("/", "-") if plausible?(value[0, 4], value[5, 2], value[8, 2])
    when US_PATTERN
      # The `when` above just matched US_PATTERN, so last_match holds
      # its month/day/year captures.
      month, day, year = ::Regexp.last_match.captures
      month = month.rjust(2, "0")
      day = day.rjust(2, "0")
      "#{year}-#{month}-#{day}" if plausible?(year, month, day)
    end
  end

  # Range check only: year 1900..2100, month 1..12, day 1..31.
  def self.plausible?(y, m, d)
    (1900..2100).cover?(y.to_i) && (1..12).cover?(m.to_i) && (1..31).cover?(d.to_i)
  end
end
|
|
50
|
+
|
|
51
|
+
DATE = Date.new
|
|
52
|
+
end
|
|
53
|
+
end
|