iriq 0.1.0 → 0.30.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +87 -0
- data/CLAUDE.md +208 -0
- data/Gemfile.lock +8 -2
- data/Makefile +113 -0
- data/README.md +249 -270
- data/completions/_iriq +52 -0
- data/completions/iriq.bash +70 -0
- data/docs/ARCHITECTURE.md +223 -0
- data/docs/ROADMAP.md +190 -0
- data/iriq.gemspec +5 -4
- data/lib/iriq/cli.rb +402 -49
- data/lib/iriq/cluster.rb +304 -8
- data/lib/iriq/clusterer.rb +19 -44
- data/lib/iriq/corpus.rb +417 -81
- data/lib/iriq/cross_host_shape.rb +37 -0
- data/lib/iriq/event.rb +22 -0
- data/lib/iriq/evidence.rb +114 -0
- data/lib/iriq/explanation.rb +1 -1
- data/lib/iriq/normalizer.rb +71 -29
- data/lib/iriq/parser.rb +1 -1
- data/lib/iriq/path_shape.rb +30 -24
- data/lib/iriq/position.rb +75 -0
- data/lib/iriq/position_stats.rb +74 -8
- data/lib/iriq/recognizer.rb +54 -0
- data/lib/iriq/recognizer_proposal.rb +167 -0
- data/lib/iriq/recognizers/date.rb +53 -0
- data/lib/iriq/recognizers/integer.rb +37 -0
- data/lib/iriq/recognizers/uuid.rb +16 -0
- data/lib/iriq/reducer.rb +37 -0
- data/lib/iriq/registrable_domain.rb +56 -0
- data/lib/iriq/segment_classifier.rb +475 -23
- data/lib/iriq/segment_hints.rb +9 -0
- data/lib/iriq/shape.rb +106 -0
- data/lib/iriq/specificity.rb +35 -0
- data/lib/iriq/storage/json.rb +43 -0
- data/lib/iriq/storage/memory.rb +209 -0
- data/lib/iriq/storage/sqlite.rb +546 -0
- data/lib/iriq/storage.rb +35 -0
- data/lib/iriq/synthesized_recognizer.rb +56 -0
- data/lib/iriq/trace.rb +294 -0
- data/lib/iriq/version.rb +1 -1
- data/lib/iriq.rb +18 -0
- metadata +44 -8
- data/script/benchmark.rb +0 -81
- data/script/memory.rb +0 -121
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
module Iriq
|
|
2
|
+
# Evidence is the structured substrate for explanation. Each Record
|
|
3
|
+
# captures one fact about the system's reasoning: "this segment
|
|
4
|
+
# classified as :integer because the Integer recognizer fired with
|
|
5
|
+
# specificity TYPED", "the IPv4 type collapses to {ip} by policy",
|
|
6
|
+
# "Position P is mostly variable because of corpus stats".
|
|
7
|
+
#
|
|
8
|
+
# Trace and Explanation are views over a list of Evidence records;
|
|
9
|
+
# the structured form is what programmatic consumers (test assertions,
|
|
10
|
+
# PR-diff annotators, downstream tooling) should build on. Human note
|
|
11
|
+
# strings emitted by Trace are derived from Evidence payloads, so
|
|
12
|
+
# adding a new note kind starts with adding a new Evidence shape.
|
|
13
|
+
#
|
|
14
|
+
# Two axes:
|
|
15
|
+
#
|
|
16
|
+
# subject_kind ∈ {:segment, :position, :cluster}
|
|
17
|
+
# What this Evidence is about. Today most Evidence is :segment
|
|
18
|
+
# (per-segment classification facts). :position and :cluster
|
|
19
|
+
# Evidence become load-bearing once corpus-informed Trace lands
|
|
20
|
+
# in a follow-up step.
|
|
21
|
+
#
|
|
22
|
+
# source ∈ {:lexical, :recognizer, :corpus, :neighbor, :policy}
|
|
23
|
+
# What kind of fact is being asserted.
|
|
24
|
+
# :lexical — pure shape match (e.g. "matches DATE_RE")
|
|
25
|
+
# :recognizer — a named Recognizer fired with confidence/specificity
|
|
26
|
+
# :corpus — aggregated counts/distributions support this
|
|
27
|
+
# :neighbor — adjacent context informed this (prior literal,
|
|
28
|
+
# param name hint)
|
|
29
|
+
# :policy — a normalization policy applied (ip umbrella
|
|
30
|
+
# collapse, canonical date, currency upcase)
|
|
31
|
+
module Evidence
  # Closed vocabularies accepted by Record#initialize.
  SUBJECT_KINDS = %i[segment position cluster].freeze
  SOURCES = %i[lexical recognizer corpus neighbor policy].freeze

  # One Evidence fact.
  #
  #   subject_kind — one of SUBJECT_KINDS
  #   subject      — identity of the thing described; the factories below
  #                  pass { index:, value: } for segments, a position object
  #                  for positions, and a key for clusters
  #   source       — one of SOURCES
  #   payload      — structured data for this fact; nil is stored as {}
  #   weight       — optional; left out of #to_h when nil
  #   notes        — optional human-readable strings; nil is stored as []
  class Record
    attr_reader :subject_kind, :subject, :source, :payload, :weight, :notes

    # Raises ArgumentError when subject_kind or source is outside its
    # vocabulary.
    def initialize(subject_kind:, subject:, source:, payload:, weight: nil, notes: [])
      raise ArgumentError, "subject_kind must be one of #{SUBJECT_KINDS.inspect}" unless SUBJECT_KINDS.include?(subject_kind)
      raise ArgumentError, "source must be one of #{SOURCES.inspect}" unless SOURCES.include?(source)

      @subject_kind = subject_kind
      @subject = subject
      @source = source
      @payload = payload || {}
      @weight = weight
      @notes = notes || []
    end

    # Hash form of the record. Entries whose value is nil are dropped.
    def to_h
      fields = {
        subject_kind: @subject_kind,
        subject: subject_serialized,
        source: @source,
        payload: @payload,
        weight: @weight,
        notes: @notes,
      }
      fields.reject { |_name, val| val.nil? }
    end

    private

    # Hashes, and subjects that cannot convert themselves, pass through
    # unchanged; anything else that responds to #to_h is converted.
    def subject_serialized
      passthrough = @subject.is_a?(Hash) || !@subject.respond_to?(:to_h)
      passthrough ? @subject : @subject.to_h
    end
  end

  module_function

  # Factories: one per subject kind, so call sites never spell out
  # subject_kind: themselves.

  # Record about a single segment, identified by its index and raw value.
  def segment(index:, value:, source:, payload:, weight: nil, notes: [])
    Record.new(
      subject_kind: :segment,
      subject: { index: index, value: value },
      source: source,
      payload: payload,
      weight: weight,
      notes: notes,
    )
  end

  # Record about a position; the given object is stored as the subject.
  def position(position:, source:, payload:, weight: nil, notes: [])
    Record.new(
      subject_kind: :position,
      subject: position,
      source: source,
      payload: payload,
      weight: weight,
      notes: notes,
    )
  end

  # Record about a cluster, identified by its key.
  def cluster(key:, source:, payload:, weight: nil, notes: [])
    Record.new(
      subject_kind: :cluster,
      subject: key,
      source: source,
      payload: payload,
      weight: weight,
      notes: notes,
    )
  end
end
|
|
114
|
+
end
|
data/lib/iriq/explanation.rb
CHANGED
|
@@ -4,7 +4,7 @@ module Iriq
|
|
|
4
4
|
# Explanation.explain("https://foo.com/users/123")
|
|
5
5
|
# # => [
|
|
6
6
|
# # { value: "users", type: :literal, variable: false, hint: nil },
|
|
7
|
-
# # { value: "123", type: :
|
|
7
|
+
# # { value: "123", type: :integer, variable: true, hint: "user_id" },
|
|
8
8
|
# # ]
|
|
9
9
|
module Explanation
|
|
10
10
|
module_function
|
data/lib/iriq/normalizer.rb
CHANGED
|
@@ -5,46 +5,88 @@ module Iriq
|
|
|
5
5
|
# # => "https://foo.com/users/{user_id}"
|
|
6
6
|
#
|
|
7
7
|
# The form is intended for grouping/diffing — it is not a round-trippable URL.
|
|
8
|
+
#
|
|
9
|
+
# Path + query rendering dispatches through an evidence source so the
|
|
10
|
+
# mechanical (classifier-only) and corpus-informed code paths share one
|
|
11
|
+
# entry point. When `evidence` is nil, NullEvidenceSource provides the
|
|
12
|
+
# mechanical behavior (PathShape + param-name-hint query rules). When a
|
|
13
|
+
# Corpus is passed as `evidence`, its observed Position / Cluster stats
|
|
14
|
+
# drive the rendering (variability promotion, popular outlier
|
|
15
|
+
# preservation, cluster-inferred query types).
|
|
8
16
|
module Normalizer
|
|
9
17
|
module_function
|
|
10
18
|
|
|
11
|
-
def normalize(input, classifier: SegmentClassifier::DEFAULT, hints: true)
|
|
19
|
+
# Entry point: accepts either an Identifier or anything Parser.parse
# understands, and hands the resulting identifier to normalize_identifier
# together with the unchanged keyword options.
def normalize(input, classifier: SegmentClassifier::DEFAULT, hints: true, evidence: nil)
  identifier =
    if input.is_a?(Identifier)
      input
    else
      Parser.parse(input)
    end

  normalize_identifier(
    identifier,
    classifier: classifier,
    hints: hints,
    evidence: evidence,
  )
end
|
|
23
|
+
|
|
24
|
+
# Renders an already-parsed identifier in normalized form.
#
# URNs are delegated to normalize_urn. For everything else the result is
# assembled left to right: "scheme://", host, ":port" (each only when
# present), then the path rendered by the evidence source, then "?" plus
# the rendered query when the identifier has a non-empty query_params.
#
# `evidence` is any object answering render_path(iri, classifier, hints)
# and render_query(iri, classifier); a NullEvidenceSource is created when
# none is supplied.
def normalize_identifier(iri, classifier: SegmentClassifier::DEFAULT, hints: true, evidence: nil)
  if iri.urn?
    return normalize_urn(iri, classifier, hints)
  end

  renderer = evidence || NullEvidenceSource.new
  rendered = "".dup

  rendered.concat("#{iri.scheme}://") if iri.scheme
  rendered.concat(iri.host) if iri.host
  rendered.concat(":#{iri.port}") if iri.port
  rendered.concat(renderer.render_path(iri, classifier, hints))

  has_query = iri.query_params && !iri.query_params.empty?
  rendered.concat("?", renderer.render_query(iri, classifier)) if has_query

  rendered
end
|
|
15
38
|
|
|
16
|
-
def
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
39
|
+
# Normalizes a URN of the form "urn:<namespace>:<value>".
#
# Anything that is not scheme "urn" with a colon inside its NSS is returned
# as iri.canonical. Otherwise the NSS is split once on ":" and the value
# part is rendered, in priority order, as:
#   1. its canonical date, when typed :date and a canonical form exists
#   2. its canonical currency, when typed :currency and one exists
#   3. a "{placeholder}" when the derived entry is variable (the hint when
#      `hints` is truthy and a hint exists, the display type otherwise)
#   4. the raw value
def normalize_urn(iri, classifier, hints)
  namespaced = iri.scheme == "urn" && iri.nss && iri.nss.include?(":")
  return iri.canonical unless namespaced

  namespace, raw_value = iri.nss.split(":", 2)
  derived = SegmentHints.derive([namespace, raw_value], classifier).last
  kind = derived[:type]

  rendered = nil
  rendered = SegmentClassifier.canonical_date(derived[:value]) if kind == :date
  rendered = SegmentClassifier.canonical_currency(derived[:value]) if !rendered && kind == :currency

  unless rendered
    rendered =
      if derived[:variable]
        label = (hints && derived[:hint]) || SegmentClassifier.display_type(kind)
        "{#{label}}"
      else
        derived[:value]
      end
  end

  "urn:#{namespace}:#{rendered}"
end
|
|
56
|
+
end
|
|
57
|
+
|
|
58
|
+
# NullEvidenceSource is the default evidence source — purely
|
|
59
|
+
# classifier-driven, no corpus signal. The Normalizer's mechanical
|
|
60
|
+
# behavior is what this produces. Implements the same {render_path,
|
|
61
|
+
# render_query} interface that Corpus implements for the corpus-informed
|
|
62
|
+
# path.
|
|
63
|
+
class NullEvidenceSource
|
|
64
|
+
# Mechanical path rendering: the identifier's path segments go through
# PathShape with canonical date and currency rendering switched on.
def render_path(iri, classifier, hints)
  PathShape.for(
    iri.path_segments,
    classifier: classifier,
    hints: hints,
    canonical_dates: true,
    canonical_currencies: true,
  )
end
|
|
42
70
|
|
|
43
|
-
def
|
|
44
|
-
|
|
45
|
-
v =
|
|
71
|
+
# Mechanical query rendering. Parameters are emitted as "name=value"
# pairs in sorted key order, joined with "&".
#
# Each value is classified from its string form; a param-name hint, when
# one exists, replaces that type (so `?phone=unknown` renders `{phone}`).
# The value is then rendered, in priority order, as its canonical date,
# its canonical currency, a "{type}" placeholder when the classifier calls
# the type variable, or the raw value.
def render_query(iri, classifier)
  pairs = iri.query_params.keys.sort.map do |name|
    raw = iri.query_params[name]
    text = raw.to_s

    kind = classifier.classify(text)
    kind = SegmentClassifier.param_name_hint(name, kind) || kind

    canonical =
      case kind
      when :date then SegmentClassifier.canonical_date(text)
      when :currency then SegmentClassifier.canonical_currency(text)
      end

    rendered =
      if canonical
        canonical
      elsif classifier.variable?(kind)
        "{#{SegmentClassifier.display_type(kind)}}"
      else
        raw
      end

    "#{name}=#{rendered}"
  end

  pairs.join("&")
end
|
data/lib/iriq/parser.rb
CHANGED
|
@@ -3,7 +3,7 @@ module Iriq
|
|
|
3
3
|
#
|
|
4
4
|
# Intentionally NOT a full RFC 3986 / 3987 / WHATWG URL implementation. We
|
|
5
5
|
# accept enough of the common shapes (URLs, scheme-less hosts, URNs, raw
|
|
6
|
-
# Unicode hosts and paths) to support normalization and clustering.
|
|
6
|
+
# Unicode hosts and paths) to support extraction, normalization, and clustering.
|
|
7
7
|
module Parser
|
|
8
8
|
SCHEME_RE = /\A([a-zA-Z][a-zA-Z0-9+\-.]*):/.freeze
|
|
9
9
|
|
data/lib/iriq/path_shape.rb
CHANGED
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
module Iriq
|
|
2
|
-
#
|
|
3
|
-
#
|
|
4
|
-
#
|
|
2
|
+
# Renderer that produces a route-shape string by replacing variable
|
|
3
|
+
# segments with `{hint}` placeholders. As of v0.16 this is a thin wrapper
|
|
4
|
+
# around Iriq::Shape — kept for back-compat with callers that still want
|
|
5
|
+
# to get a string in one call.
|
|
5
6
|
#
|
|
6
7
|
# PathShape.for(["users", "123", "orders", "456"])
|
|
7
8
|
# # => "/users/{user_id}/orders/{order_id}"
|
|
@@ -9,37 +10,42 @@ module Iriq
|
|
|
9
10
|
# Pass `hints: false` to use raw types instead:
|
|
10
11
|
#
|
|
11
12
|
# PathShape.for(["users", "123"], hints: false)
|
|
12
|
-
# # => "/users/{
|
|
13
|
+
# # => "/users/{integer}"
|
|
14
|
+
#
|
|
15
|
+
# Pass `canonical_dates: true` to render date-typed segments in canonical
|
|
16
|
+
# ISO form (2024/01/15 → 2024-01-15) instead of as a `{date}` placeholder.
|
|
17
|
+
# Pass `canonical_currencies: true` for the same treatment of currency
|
|
18
|
+
# codes (`usd` → `USD`).
|
|
19
|
+
#
|
|
20
|
+
# For new code, prefer building an Iriq::Shape directly and calling
|
|
21
|
+
# `#render`. PathShape stays available for the common string-only path.
|
|
13
22
|
class PathShape
  # classifier            — forwarded to SegmentHints.derive
  # hints                 — forwarded to Shape#render
  # canonical_dates       — forwarded to Shape#render
  # canonical_currencies  — forwarded to Shape#render
  def initialize(classifier: SegmentClassifier::DEFAULT, hints: true,
                 canonical_dates: false, canonical_currencies: false)
    @classifier = classifier
    @hints = hints
    @canonical_dates = canonical_dates
    @canonical_currencies = canonical_currencies
  end

  # Shape string for a list of raw path segments; nil is treated as an
  # empty list.
  def for(segments)
    derived = SegmentHints.derive(segments || [], @classifier)
    from_entries(derived)
  end

  # Shape string for entries that SegmentHints has already derived.
  def from_entries(entries)
    render_options = {
      hints: @hints,
      canonical_dates: @canonical_dates,
      canonical_currencies: @canonical_currencies,
    }
    Shape.from_entries(entries).render(**render_options)
  end

  class << self
    # One-call convenience: builds a PathShape with the given options and
    # renders `segments` with it.
    def for(segments, classifier: SegmentClassifier::DEFAULT, hints: true,
            canonical_dates: false, canonical_currencies: false)
      shaper = new(
        classifier: classifier,
        hints: hints,
        canonical_dates: canonical_dates,
        canonical_currencies: canonical_currencies,
      )
      shaper.for(segments)
    end
  end
end
|
|
45
51
|
end
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
module Iriq
|
|
2
|
+
# A typed slot in a host's URL structure.
|
|
3
|
+
#
|
|
4
|
+
# Two observations occupy the same Position when (host, scope, locator)
|
|
5
|
+
# match exactly. Position is the keying type used by Storage for
|
|
6
|
+
# frequency tables and by Cluster for per-slot inference.
|
|
7
|
+
#
|
|
8
|
+
# host — the EFFECTIVE host per Corpus#host_strategy. Observations of
|
|
9
|
+
# api.foo.com and app.foo.com under :registrable share the
|
|
10
|
+
# same Position. The original host stays on the Identifier.
|
|
11
|
+
# scope — :path or :query.
|
|
12
|
+
# locator — for :path, the typed prefix built up to this slot, e.g.
|
|
13
|
+
# "/orgs/{opaque_id}/users" for the integer slot in
|
|
14
|
+
# /orgs/abc/users/123. (Variable segments render as their
|
|
15
|
+
# hint or display-type, so the prefix groups across observations
|
|
16
|
+
# regardless of the specific IDs seen.)
|
|
17
|
+
# — for :query, the ?key= parameter name.
|
|
18
|
+
#
|
|
19
|
+
# Position implements value equality and is safe to use as a Hash key.
|
|
20
|
+
class Position
  # The only scopes a Position may carry.
  SCOPES = %i[path query].freeze

  attr_reader :host, :scope, :locator

  # Builds a :path Position; `prefix` becomes the locator.
  def self.path(host:, prefix:)
    new(host: host, scope: :path, locator: prefix)
  end

  # Builds a :query Position; the parameter `name` becomes the locator.
  def self.query(host:, name:)
    new(host: host, scope: :query, locator: name)
  end

  # Raises ArgumentError unless `scope` is one of SCOPES.
  def initialize(host:, scope:, locator:)
    raise ArgumentError, "scope must be one of #{SCOPES.inspect}" unless SCOPES.include?(scope)

    @host = host
    @scope = scope
    @locator = locator
  end

  def path?
    @scope == :path
  end

  def query?
    @scope == :query
  end

  # Value equality over (host, scope, locator). #eql? and #hash agree with
  # it, so equal Positions land on the same Hash key.
  def ==(other)
    other.is_a?(Position) &&
      other.host == @host &&
      other.scope == @scope &&
      other.locator == @locator
  end
  alias eql? ==

  def hash
    [@host, @scope, @locator].hash
  end

  # Symbol-keyed form; scope stays a Symbol.
  def to_h
    { host: @host, scope: @scope, locator: @locator }
  end

  def to_s
    "Position(#{@host.inspect}, #{@scope}, #{@locator.inspect})"
  end
  alias inspect to_s

  # Serialized form used by JSON / SQLite storage. Scope is emitted as
  # a string for cross-runtime compatibility.
  def to_dump
    { "host" => @host, "scope" => @scope.to_s, "locator" => @locator }
  end

  # Rebuilds a Position from the Hash produced by #to_dump.
  #
  # The stored scope is matched against SCOPES by name rather than being
  # converted with #to_sym, so no Symbol is created from unvalidated dump
  # data. A missing or unknown scope resolves to nil and is rejected by
  # #initialize with the usual ArgumentError.
  def self.from_dump(h)
    stored = h["scope"].to_s
    scope = SCOPES.find { |candidate| candidate.to_s == stored }
    new(host: h["host"], scope: scope, locator: h["locator"])
  end
end
|
|
75
|
+
end
|
data/lib/iriq/position_stats.rb
CHANGED
|
@@ -3,16 +3,29 @@ module Iriq
|
|
|
3
3
|
# Value cardinality is capped so a high-entropy position (UUIDs, timestamps)
|
|
4
4
|
# doesn't grow memory without bound — `total` keeps growing accurately, but
|
|
5
5
|
# only the first `max_values` distinct values are tracked individually.
|
|
6
|
+
# Existing tracked values still receive increments after the cap is hit;
|
|
7
|
+
# only NEW distinct values are dropped.
|
|
6
8
|
class PositionStats
|
|
7
|
-
DEFAULT_MAX_VALUES =
|
|
9
|
+
DEFAULT_MAX_VALUES = 5_000
|
|
8
10
|
|
|
9
|
-
attr_reader :value_counts, :type_counts, :total, :max_values
|
|
11
|
+
attr_reader :value_counts, :type_counts, :total, :max_values,
|
|
12
|
+
:numeric_count, :numeric_min, :numeric_max, :numeric_sum
|
|
13
|
+
|
|
14
|
+
NUMERIC_TYPES = %i[integer float].freeze
|
|
10
15
|
|
|
11
16
|
def initialize(max_values: DEFAULT_MAX_VALUES)
|
|
12
|
-
@value_counts
|
|
13
|
-
@type_counts
|
|
14
|
-
@total
|
|
15
|
-
@max_values
|
|
17
|
+
@value_counts = Hash.new(0)
|
|
18
|
+
@type_counts = Hash.new(0)
|
|
19
|
+
@total = 0
|
|
20
|
+
@max_values = max_values
|
|
21
|
+
# Range stats for numeric observations only. Lets the corpus
|
|
22
|
+
# promote /articles/2024 etc. to :year when all values land in
|
|
23
|
+
# 1900..2100, and surfaces min/max/avg on ParamSummary for
|
|
24
|
+
# general numeric params.
|
|
25
|
+
@numeric_count = 0
|
|
26
|
+
@numeric_min = nil
|
|
27
|
+
@numeric_max = nil
|
|
28
|
+
@numeric_sum = 0.0
|
|
16
29
|
end
|
|
17
30
|
|
|
18
31
|
def observe(value, type)
|
|
@@ -21,8 +34,31 @@ module Iriq
|
|
|
21
34
|
if @value_counts.size < @max_values || @value_counts.key?(value)
|
|
22
35
|
@value_counts[value] += 1
|
|
23
36
|
end
|
|
37
|
+
record_numeric(value, type)
|
|
38
|
+
end
|
|
39
|
+
|
|
40
|
+
# Mean of the numeric observations recorded so far, or nil when none
# have been recorded.
def numeric_avg
  @numeric_sum / @numeric_count unless @numeric_count.zero?
end
|
|
25
45
|
|
|
46
|
+
private
|
|
47
|
+
|
|
48
|
+
# Folds one observation into the numeric range stats (count, min, max,
# sum). Observations whose type is not in NUMERIC_TYPES, or whose value
# cannot be converted to a Float, are ignored.
def record_numeric(value, type)
  return unless NUMERIC_TYPES.include?(type)

  # `exception: false` (Ruby 2.6+) yields nil for unconvertible input
  # without resorting to a blanket inline `rescue`, which would also hide
  # unrelated errors.
  number = Float(value, exception: false)
  return unless number

  @numeric_count += 1
  @numeric_min = number if @numeric_min.nil? || number < @numeric_min
  @numeric_max = number if @numeric_max.nil? || number > @numeric_max
  @numeric_sum += number
end
|
|
59
|
+
|
|
60
|
+
public
|
|
61
|
+
|
|
26
62
|
def cardinality
|
|
27
63
|
@value_counts.size
|
|
28
64
|
end
|
|
@@ -42,13 +78,37 @@ module Iriq
|
|
|
42
78
|
(@value_counts[value] || 0).to_f / @total
|
|
43
79
|
end
|
|
44
80
|
|
|
81
|
+
# Most common type. On count ties, breaks lexicographically by type
|
|
82
|
+
# symbol name so the result is deterministic and matches Go's
|
|
83
|
+
# DominantType (Go's map iteration is randomized).
|
|
84
|
+
# Type with the highest count, or nil when nothing has been counted.
# The sort key [-count, name] selects the largest count first and, among
# equal counts, the alphabetically smallest type name.
def dominant_type
  winner = @type_counts.min_by { |type, count| [-count, type.to_s] }
  winner&.first
end
|
|
95
|
+
|
|
45
96
|
# String-keyed snapshot of the stats. value_counts is a copy and
# type_counts is re-keyed by type name, so callers can edit the result
# without touching live state. The four numeric_* entries are appended
# only once at least one numeric observation has been recorded.
def dump
  snapshot = {
    "value_counts" => @value_counts.dup,
    "type_counts" => @type_counts.transform_keys(&:to_s),
    "total" => @total,
    "max_values" => @max_values,
  }
  return snapshot unless @numeric_count.positive?

  snapshot.merge!(
    "numeric_count" => @numeric_count,
    "numeric_min" => @numeric_min,
    "numeric_max" => @numeric_max,
    "numeric_sum" => @numeric_sum,
  )
end
|
|
53
113
|
|
|
54
114
|
def self.from_dump(h)
|
|
@@ -58,6 +118,12 @@ module Iriq
|
|
|
58
118
|
tc = Hash.new(0).merge(h["type_counts"].transform_keys(&:to_sym))
|
|
59
119
|
stats.instance_variable_set(:@value_counts, vc)
|
|
60
120
|
stats.instance_variable_set(:@type_counts, tc)
|
|
121
|
+
if h["numeric_count"]
|
|
122
|
+
stats.instance_variable_set(:@numeric_count, h["numeric_count"])
|
|
123
|
+
stats.instance_variable_set(:@numeric_min, h["numeric_min"])
|
|
124
|
+
stats.instance_variable_set(:@numeric_max, h["numeric_max"])
|
|
125
|
+
stats.instance_variable_set(:@numeric_sum, h["numeric_sum"])
|
|
126
|
+
end
|
|
61
127
|
stats
|
|
62
128
|
end
|
|
63
129
|
end
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
module Iriq
|
|
2
|
+
# Pluggable single-type classifier.
|
|
3
|
+
#
|
|
4
|
+
# A Recognizer encapsulates "this string-shape implies this type" plus the
|
|
5
|
+
# canonical form (if any). The ensemble-based SegmentClassifier consults
|
|
6
|
+
# Recognizers in order and picks the first that fires. (Scored-ensemble
|
|
7
|
+
# voting comes in a follow-up; for now each fire is decisive.)
|
|
8
|
+
#
|
|
9
|
+
# try(segment) -> { type:, specificity:, confidence:, canonical:, notes: } | nil
|
|
10
|
+
# nil — this Recognizer does not claim the segment.
|
|
11
|
+
# type — symbol from the recognized vocabulary.
|
|
12
|
+
# confidence — float in [0, 1]. Phase-1 step 2 always returns 1.0
|
|
13
|
+
# when a Recognizer fires; calibration arrives with the scored
|
|
14
|
+
# ensemble in step 4.
|
|
15
|
+
# canonical — canonical form (e.g. ISO date for :date). nil ≡ "use input".
|
|
16
|
+
# notes — optional array of strings the Trace view may surface.
|
|
17
|
+
#
|
|
18
|
+
# Recognizers are instantiated once and shared (they hold no per-call
|
|
19
|
+
# state). See Iriq::Recognizers::UUID / DATE / INTEGER for the built-ins.
|
|
20
|
+
class Recognizer
  # Abstract hook: subclasses return a verdict Hash when they claim the
  # segment and nil when they do not.
  #
  # Raises NotImplementedError, naming the offending class, when a
  # subclass has not overridden it.
  def try(_segment)
    raise NotImplementedError, "#{self.class.name} must implement #try"
  end

  # Runs every recognizer against `segment` and returns the verdict with
  # the highest score, where score = verdict[:specificity] *
  # verdict[:confidence] and a missing factor counts as 0.0.
  #
  # Every recognizer is tried; there is no short-circuit. On equal scores
  # the earliest recognizer in the argument list wins, because a later
  # verdict must score strictly higher to replace the current best.
  # Returns nil when no recognizer claims the segment.
  def self.ensemble(segment, *recognizers)
    best = nil
    best_score = -1.0

    recognizers.each do |recognizer|
      verdict = recognizer.try(segment)
      next unless verdict

      score = (verdict[:specificity] || 0.0) * (verdict[:confidence] || 0.0)
      next unless score > best_score

      best = verdict
      best_score = score
    end

    best
  end
end
|
|
51
|
+
|
|
52
|
+
module Recognizers
|
|
53
|
+
end
|
|
54
|
+
end
|