iriq 0.1.0 → 0.30.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +87 -0
  3. data/CLAUDE.md +208 -0
  4. data/Gemfile.lock +8 -2
  5. data/Makefile +113 -0
  6. data/README.md +249 -270
  7. data/completions/_iriq +52 -0
  8. data/completions/iriq.bash +70 -0
  9. data/docs/ARCHITECTURE.md +223 -0
  10. data/docs/ROADMAP.md +190 -0
  11. data/iriq.gemspec +5 -4
  12. data/lib/iriq/cli.rb +402 -49
  13. data/lib/iriq/cluster.rb +304 -8
  14. data/lib/iriq/clusterer.rb +19 -44
  15. data/lib/iriq/corpus.rb +417 -81
  16. data/lib/iriq/cross_host_shape.rb +37 -0
  17. data/lib/iriq/event.rb +22 -0
  18. data/lib/iriq/evidence.rb +114 -0
  19. data/lib/iriq/explanation.rb +1 -1
  20. data/lib/iriq/normalizer.rb +71 -29
  21. data/lib/iriq/parser.rb +1 -1
  22. data/lib/iriq/path_shape.rb +30 -24
  23. data/lib/iriq/position.rb +75 -0
  24. data/lib/iriq/position_stats.rb +74 -8
  25. data/lib/iriq/recognizer.rb +54 -0
  26. data/lib/iriq/recognizer_proposal.rb +167 -0
  27. data/lib/iriq/recognizers/date.rb +53 -0
  28. data/lib/iriq/recognizers/integer.rb +37 -0
  29. data/lib/iriq/recognizers/uuid.rb +16 -0
  30. data/lib/iriq/reducer.rb +37 -0
  31. data/lib/iriq/registrable_domain.rb +56 -0
  32. data/lib/iriq/segment_classifier.rb +475 -23
  33. data/lib/iriq/segment_hints.rb +9 -0
  34. data/lib/iriq/shape.rb +106 -0
  35. data/lib/iriq/specificity.rb +35 -0
  36. data/lib/iriq/storage/json.rb +43 -0
  37. data/lib/iriq/storage/memory.rb +209 -0
  38. data/lib/iriq/storage/sqlite.rb +546 -0
  39. data/lib/iriq/storage.rb +35 -0
  40. data/lib/iriq/synthesized_recognizer.rb +56 -0
  41. data/lib/iriq/trace.rb +294 -0
  42. data/lib/iriq/version.rb +1 -1
  43. data/lib/iriq.rb +18 -0
  44. metadata +44 -8
  45. data/script/benchmark.rb +0 -81
  46. data/script/memory.rb +0 -121
@@ -0,0 +1,114 @@
1
+ module Iriq
2
+ # Evidence is the structured substrate for explanation. Each Record
3
+ # captures one fact about the system's reasoning: "this segment
4
+ # classified as :integer because the Integer recognizer fired with
5
+ # specificity TYPED", "the IPv4 type collapses to {ip} by policy",
6
+ # "Position P is mostly variable because of corpus stats".
7
+ #
8
+ # Trace and Explanation are views over a list of Evidence records;
9
+ # the structured form is what programmatic consumers (test assertions,
10
+ # PR-diff annotators, downstream tooling) should build on. Human note
11
+ # strings emitted by Trace are derived from Evidence payloads, so
12
+ # adding a new note kind starts with adding a new Evidence shape.
13
+ #
14
+ # Two axes:
15
+ #
16
+ # subject_kind ∈ {:segment, :position, :cluster}
17
+ # What this Evidence is about. Today most Evidence is :segment
18
+ # (per-segment classification facts). :position and :cluster
19
+ # Evidence become load-bearing once corpus-informed Trace lands
20
+ # in a follow-up step.
21
+ #
22
+ # source ∈ {:lexical, :recognizer, :corpus, :neighbor, :policy}
23
+ # What kind of fact is being asserted.
24
+ # :lexical — pure shape match (e.g. "matches DATE_RE")
25
+ # :recognizer — a named Recognizer fired with confidence/specificity
26
+ # :corpus — aggregated counts/distributions support this
27
+ # :neighbor — adjacent context informed this (prior literal,
28
+ # param name hint)
29
+ # :policy — a normalization policy applied (ip umbrella
30
+ # collapse, canonical date, currency upcase)
31
+ module Evidence
32
+ SUBJECT_KINDS = %i[segment position cluster].freeze
33
+ SOURCES = %i[lexical recognizer corpus neighbor policy].freeze
34
+
35
+ # A single Evidence fact.
36
+ #
37
+ # subject_kind — :segment | :position | :cluster
38
+ # subject — kind-specific identity:
39
+ # :segment → { index:, value: }
40
+ # :position → Iriq::Position
41
+ # :cluster → cluster key (string)
42
+ # source — :lexical | :recognizer | :corpus | :neighbor | :policy
43
+ # payload — source-and-kind-specific structured data
44
+ # weight — optional float in [0,1] — contribution to the
45
+ # ultimate decision. Set when scoring is meaningful;
46
+ # nil otherwise.
47
+ # notes — optional human-readable strings. Trace renders
48
+ # these directly; programmatic consumers can ignore them.
49
+ class Record
50
+ attr_reader :subject_kind, :subject, :source, :payload, :weight, :notes
51
+
52
+ def initialize(subject_kind:, subject:, source:, payload:, weight: nil, notes: [])
53
+ unless SUBJECT_KINDS.include?(subject_kind)
54
+ raise ArgumentError, "subject_kind must be one of #{SUBJECT_KINDS.inspect}"
55
+ end
56
+ unless SOURCES.include?(source)
57
+ raise ArgumentError, "source must be one of #{SOURCES.inspect}"
58
+ end
59
+
60
+ @subject_kind = subject_kind
61
+ @subject = subject
62
+ @source = source
63
+ @payload = payload || {}
64
+ @weight = weight
65
+ @notes = notes || []
66
+ end
67
+
68
+ def to_h
69
+ {
70
+ subject_kind: @subject_kind,
71
+ subject: subject_serialized,
72
+ source: @source,
73
+ payload: @payload,
74
+ weight: @weight,
75
+ notes: @notes,
76
+ }.compact
77
+ end
78
+
79
+ private
80
+
81
+ def subject_serialized
82
+ return @subject.to_h if @subject.respond_to?(:to_h) && !@subject.is_a?(Hash)
83
+ @subject
84
+ end
85
+ end
86
+
87
+ module_function
88
+
89
+ # Factories so call sites don't have to repeat subject_kind:.
90
+ def segment(index:, value:, source:, payload:, weight: nil, notes: [])
91
+ Record.new(
92
+ subject_kind: :segment,
93
+ subject: { index: index, value: value },
94
+ source: source, payload: payload, weight: weight, notes: notes,
95
+ )
96
+ end
97
+
98
+ def position(position:, source:, payload:, weight: nil, notes: [])
99
+ Record.new(
100
+ subject_kind: :position,
101
+ subject: position,
102
+ source: source, payload: payload, weight: weight, notes: notes,
103
+ )
104
+ end
105
+
106
+ def cluster(key:, source:, payload:, weight: nil, notes: [])
107
+ Record.new(
108
+ subject_kind: :cluster,
109
+ subject: key,
110
+ source: source, payload: payload, weight: weight, notes: notes,
111
+ )
112
+ end
113
+ end
114
+ end
@@ -4,7 +4,7 @@ module Iriq
4
4
  # Explanation.explain("https://foo.com/users/123")
5
5
  # # => [
6
6
  # # { value: "users", type: :literal, variable: false, hint: nil },
7
- # # { value: "123", type: :integer_id, variable: true, hint: "user_id" },
7
+ # # { value: "123", type: :integer, variable: true, hint: "user_id" },
8
8
  # # ]
9
9
  module Explanation
10
10
  module_function
@@ -5,46 +5,88 @@ module Iriq
5
5
  # # => "https://foo.com/users/{user_id}"
6
6
  #
7
7
  # The form is intended for grouping/diffing — it is not a round-trippable URL.
8
+ #
9
+ # Path + query rendering dispatches through an evidence source so the
10
+ # mechanical (classifier-only) and corpus-informed code paths share one
11
+ # entry point. When `evidence` is nil, NullEvidenceSource provides the
12
+ # mechanical behavior (PathShape + param-name-hint query rules). When a
13
+ # Corpus is passed as `evidence`, its observed Position / Cluster stats
14
+ # drive the rendering (variability promotion, popular outlier
15
+ # preservation, cluster-inferred query types).
8
16
  module Normalizer
9
17
  module_function
10
18
 
11
- def normalize(input, classifier: SegmentClassifier::DEFAULT, hints: true)
19
+ def normalize(input, classifier: SegmentClassifier::DEFAULT, hints: true, evidence: nil)
12
20
  iri = input.is_a?(Identifier) ? input : Parser.parse(input)
13
- normalize_identifier(iri, classifier: classifier, hints: hints)
21
+ normalize_identifier(iri, classifier: classifier, hints: hints, evidence: evidence)
22
+ end
23
+
24
+ def normalize_identifier(iri, classifier: SegmentClassifier::DEFAULT, hints: true, evidence: nil)
25
+ return normalize_urn(iri, classifier, hints) if iri.urn?
26
+
27
+ src = evidence || NullEvidenceSource.new
28
+ out = +""
29
+ out << "#{iri.scheme}://" if iri.scheme
30
+ out << iri.host if iri.host
31
+ out << ":#{iri.port}" if iri.port
32
+ out << src.render_path(iri, classifier, hints)
33
+ if iri.query_params && !iri.query_params.empty?
34
+ out << "?" << src.render_query(iri, classifier)
35
+ end
36
+ out
14
37
  end
15
38
 
16
- def normalize_identifier(iri, classifier: SegmentClassifier::DEFAULT, hints: true)
17
- if iri.urn?
18
- if iri.scheme == "urn" && iri.nss && iri.nss.include?(":")
19
- ns, value = iri.nss.split(":", 2)
20
- entry = SegmentHints.derive([ns, value], classifier).last
21
- shaped = if entry[:variable]
22
- "{#{(hints && entry[:hint]) || entry[:type]}}"
23
- else
24
- entry[:value]
25
- end
26
- "urn:#{ns}:#{shaped}"
39
+ def normalize_urn(iri, classifier, hints)
40
+ return iri.canonical unless iri.scheme == "urn" && iri.nss && iri.nss.include?(":")
41
+
42
+ ns, value = iri.nss.split(":", 2)
43
+ entry = SegmentHints.derive([ns, value], classifier).last
44
+ shaped =
45
+ if entry[:type] == :date && (canon = SegmentClassifier.canonical_date(entry[:value]))
46
+ canon
47
+ elsif entry[:type] == :currency && (canon = SegmentClassifier.canonical_currency(entry[:value]))
48
+ canon
49
+ elsif entry[:variable]
50
+ "{#{(hints && entry[:hint]) || SegmentClassifier.display_type(entry[:type])}}"
27
51
  else
28
- iri.canonical
52
+ entry[:value]
29
53
  end
30
- else
31
- out = +""
32
- out << "#{iri.scheme}://" if iri.scheme
33
- out << iri.host if iri.host
34
- out << ":#{iri.port}" if iri.port
35
- out << PathShape.new(classifier: classifier, hints: hints).for(iri.path_segments)
36
- if iri.query_params && !iri.query_params.empty?
37
- out << "?" + shape_query(iri.query_params, classifier)
38
- end
39
- out
40
- end
54
+ "urn:#{ns}:#{shaped}"
55
+ end
56
+ end
57
+
58
+ # NullEvidenceSource is the default evidence source — purely
59
+ # classifier-driven, no corpus signal. The Normalizer's mechanical
60
+ # behavior is what this produces. Implements the same {render_path,
61
+ # render_query} interface that Corpus implements for the corpus-informed
62
+ # path.
63
+ class NullEvidenceSource
64
+ def render_path(iri, classifier, hints)
65
+ PathShape.new(
66
+ classifier: classifier, hints: hints,
67
+ canonical_dates: true, canonical_currencies: true,
68
+ ).for(iri.path_segments)
41
69
  end
42
70
 
43
- def shape_query(params, classifier)
44
- params.keys.sort.map do |k|
45
- v = params[k]
71
+ def render_query(iri, classifier)
72
+ iri.query_params.keys.sort.map do |k|
73
+ v = iri.query_params[k]
46
74
  type = classifier.classify(v.to_s)
47
- shaped = classifier.variable?(type) ? "{#{type}}" : v
75
+ # Param-name hint can lift a generic literal/opaque_id/slug into
76
+ # a semantic type — `?phone=unknown` becomes `{phone}`.
77
+ if (hint = SegmentClassifier.param_name_hint(k, type))
78
+ type = hint
79
+ end
80
+ shaped =
81
+ if type == :date && (canon = SegmentClassifier.canonical_date(v.to_s))
82
+ canon
83
+ elsif type == :currency && (canon = SegmentClassifier.canonical_currency(v.to_s))
84
+ canon
85
+ elsif classifier.variable?(type)
86
+ "{#{SegmentClassifier.display_type(type)}}"
87
+ else
88
+ v
89
+ end
48
90
  "#{k}=#{shaped}"
49
91
  end.join("&")
50
92
  end
data/lib/iriq/parser.rb CHANGED
@@ -3,7 +3,7 @@ module Iriq
3
3
  #
4
4
  # Intentionally NOT a full RFC 3986 / 3987 / WHATWG URL implementation. We
5
5
  # accept enough of the common shapes (URLs, scheme-less hosts, URNs, raw
6
- # Unicode hosts and paths) to support normalization and clustering.
6
+ # Unicode hosts and paths) to support extraction, normalization, and clustering.
7
7
  module Parser
8
8
  SCHEME_RE = /\A([a-zA-Z][a-zA-Z0-9+\-.]*):/.freeze
9
9
 
@@ -1,7 +1,8 @@
1
1
  module Iriq
2
- # Converts a sequence of path segments into a route-shape string by
3
- # replacing variable segments with `{hint}` placeholders, falling back to
4
- # `{type}` when no hint is available.
2
+ # Renderer that produces a route-shape string by replacing variable
3
+ # segments with `{hint}` placeholders. As of v0.16 this is a thin wrapper
4
+ # around Iriq::Shape kept for back-compat with callers that still want
5
+ # to get a string in one call.
5
6
  #
6
7
  # PathShape.for(["users", "123", "orders", "456"])
7
8
  # # => "/users/{user_id}/orders/{order_id}"
@@ -9,37 +10,42 @@ module Iriq
9
10
  # Pass `hints: false` to use raw types instead:
10
11
  #
11
12
  # PathShape.for(["users", "123"], hints: false)
12
- # # => "/users/{integer_id}"
13
+ # # => "/users/{integer}"
14
+ #
15
+ # Pass `canonical_dates: true` to render date-typed segments in canonical
16
+ # ISO form (2024/01/15 → 2024-01-15) instead of as a `{date}` placeholder.
17
+ # Pass `canonical_currencies: true` for the same treatment of currency
18
+ # codes (`usd` → `USD`).
19
+ #
20
+ # For new code, prefer building an Iriq::Shape directly and calling
21
+ # `#render`. PathShape stays available for the common string-only path.
13
22
  class PathShape
14
- def initialize(classifier: SegmentClassifier::DEFAULT, hints: true)
15
- @classifier = classifier
16
- @hints = hints
23
+ def initialize(classifier: SegmentClassifier::DEFAULT, hints: true,
24
+ canonical_dates: false, canonical_currencies: false)
25
+ @classifier = classifier
26
+ @hints = hints
27
+ @canonical_dates = canonical_dates
28
+ @canonical_currencies = canonical_currencies
17
29
  end
18
30
 
19
31
  def for(segments)
20
- return "/" if segments.nil? || segments.empty?
21
-
22
- from_entries(SegmentHints.derive(segments, @classifier))
32
+ from_entries(SegmentHints.derive(segments || [], @classifier))
23
33
  end
24
34
 
25
35
  # Build a shape string from already-derived SegmentHints entries.
26
- # Used by Corpus to avoid re-deriving entries per observation when it
27
- # needs multiple shape variants (raw and hinted).
28
36
  def from_entries(entries)
29
- return "/" if entries.nil? || entries.empty?
30
-
31
- "/" + entries.map { |e| shape_token(e) }.join("/")
32
- end
33
-
34
- def shape_token(entry)
35
- return entry[:value] unless entry[:variable]
36
-
37
- placeholder = @hints ? (entry[:hint] || entry[:type]) : entry[:type]
38
- "{#{placeholder}}"
37
+ Shape.from_entries(entries).render(
38
+ hints: @hints,
39
+ canonical_dates: @canonical_dates,
40
+ canonical_currencies: @canonical_currencies,
41
+ )
39
42
  end
40
43
 
41
- def self.for(segments, classifier: SegmentClassifier::DEFAULT, hints: true)
42
- new(classifier: classifier, hints: hints).for(segments)
44
+ def self.for(segments, classifier: SegmentClassifier::DEFAULT, hints: true,
45
+ canonical_dates: false, canonical_currencies: false)
46
+ new(classifier: classifier, hints: hints,
47
+ canonical_dates: canonical_dates,
48
+ canonical_currencies: canonical_currencies).for(segments)
43
49
  end
44
50
  end
45
51
  end
@@ -0,0 +1,75 @@
1
+ module Iriq
2
+ # A typed slot in a host's URL structure.
3
+ #
4
+ # Two observations occupy the same Position when (host, scope, locator)
5
+ # match exactly. Position is the keying type used by Storage for
6
+ # frequency tables and by Cluster for per-slot inference.
7
+ #
8
+ # host — the EFFECTIVE host per Corpus#host_strategy. Observations of
9
+ # api.foo.com and app.foo.com under :registrable share the
10
+ # same Position. The original host stays on the Identifier.
11
+ # scope — :path or :query.
12
+ # locator — for :path, the typed prefix built up to this slot, e.g.
13
+ # "/orgs/{opaque_id}/users" for the integer slot in
14
+ # /orgs/abc/users/123. (Variable segments render as their
15
+ # hint or display-type, so the prefix groups across observations
16
+ # regardless of the specific IDs seen.)
17
+ # — for :query, the ?key= parameter name.
18
+ #
19
+ # Position implements value equality and is safe to use as a Hash key.
20
+ class Position
21
+ SCOPES = %i[path query].freeze
22
+
23
+ attr_reader :host, :scope, :locator
24
+
25
+ def self.path(host:, prefix:)
26
+ new(host: host, scope: :path, locator: prefix)
27
+ end
28
+
29
+ def self.query(host:, name:)
30
+ new(host: host, scope: :query, locator: name)
31
+ end
32
+
33
+ def initialize(host:, scope:, locator:)
34
+ raise ArgumentError, "scope must be one of #{SCOPES.inspect}" unless SCOPES.include?(scope)
35
+
36
+ @host = host
37
+ @scope = scope
38
+ @locator = locator
39
+ end
40
+
41
+ def path?; @scope == :path; end
42
+ def query?; @scope == :query; end
43
+
44
+ def ==(other)
45
+ other.is_a?(Position) &&
46
+ other.host == @host &&
47
+ other.scope == @scope &&
48
+ other.locator == @locator
49
+ end
50
+ alias eql? ==
51
+
52
+ def hash
53
+ [@host, @scope, @locator].hash
54
+ end
55
+
56
+ def to_h
57
+ { host: @host, scope: @scope, locator: @locator }
58
+ end
59
+
60
+ def to_s
61
+ "Position(#{@host.inspect}, #{@scope}, #{@locator.inspect})"
62
+ end
63
+ alias inspect to_s
64
+
65
+ # Serialized form used by JSON / SQLite storage. Scope is emitted as
66
+ # a string for cross-runtime compatibility.
67
+ def to_dump
68
+ { "host" => @host, "scope" => @scope.to_s, "locator" => @locator }
69
+ end
70
+
71
+ def self.from_dump(h)
72
+ new(host: h["host"], scope: h["scope"].to_sym, locator: h["locator"])
73
+ end
74
+ end
75
+ end
@@ -3,16 +3,29 @@ module Iriq
3
3
  # Value cardinality is capped so a high-entropy position (UUIDs, timestamps)
4
4
  # doesn't grow memory without bound — `total` keeps growing accurately, but
5
5
  # only the first `max_values` distinct values are tracked individually.
6
+ # Existing tracked values still receive increments after the cap is hit;
7
+ # only NEW distinct values are dropped.
6
8
  class PositionStats
7
- DEFAULT_MAX_VALUES = 1_000
9
+ DEFAULT_MAX_VALUES = 5_000
8
10
 
9
- attr_reader :value_counts, :type_counts, :total, :max_values
11
+ attr_reader :value_counts, :type_counts, :total, :max_values,
12
+ :numeric_count, :numeric_min, :numeric_max, :numeric_sum
13
+
14
+ NUMERIC_TYPES = %i[integer float].freeze
10
15
 
11
16
  def initialize(max_values: DEFAULT_MAX_VALUES)
12
- @value_counts = Hash.new(0)
13
- @type_counts = Hash.new(0)
14
- @total = 0
15
- @max_values = max_values
17
+ @value_counts = Hash.new(0)
18
+ @type_counts = Hash.new(0)
19
+ @total = 0
20
+ @max_values = max_values
21
+ # Range stats for numeric observations only. Lets the corpus
22
+ # promote /articles/2024 etc. to :year when all values land in
23
+ # 1900..2100, and surfaces min/max/avg on ParamSummary for
24
+ # general numeric params.
25
+ @numeric_count = 0
26
+ @numeric_min = nil
27
+ @numeric_max = nil
28
+ @numeric_sum = 0.0
16
29
  end
17
30
 
18
31
  def observe(value, type)
@@ -21,8 +34,31 @@ module Iriq
21
34
  if @value_counts.size < @max_values || @value_counts.key?(value)
22
35
  @value_counts[value] += 1
23
36
  end
37
+ record_numeric(value, type)
38
+ end
39
+
40
+ def numeric_avg
41
+ return nil if @numeric_count.zero?
42
+
43
+ @numeric_sum / @numeric_count
24
44
  end
25
45
 
46
+ private
47
+
48
+ def record_numeric(value, type)
49
+ return unless NUMERIC_TYPES.include?(type)
50
+
51
+ n = Float(value) rescue nil
52
+ return unless n
53
+
54
+ @numeric_count += 1
55
+ @numeric_min = n if @numeric_min.nil? || n < @numeric_min
56
+ @numeric_max = n if @numeric_max.nil? || n > @numeric_max
57
+ @numeric_sum += n
58
+ end
59
+
60
+ public
61
+
26
62
  def cardinality
27
63
  @value_counts.size
28
64
  end
@@ -42,13 +78,37 @@ module Iriq
42
78
  (@value_counts[value] || 0).to_f / @total
43
79
  end
44
80
 
81
+ # Most common type. On count ties, breaks lexicographically by type
82
+ # symbol name so the result is deterministic and matches Go's
83
+ # DominantType (Go's map iteration is randomized).
84
+ def dominant_type
85
+ best = nil
86
+ best_count = -1
87
+ @type_counts.each do |t, n|
88
+ if n > best_count || (n == best_count && t.to_s < best.to_s)
89
+ best = t
90
+ best_count = n
91
+ end
92
+ end
93
+ best
94
+ end
95
+
45
96
  def dump
46
- {
47
- "value_counts" => @value_counts,
97
+ # Dup the hashes so callers can mutate the dump structure (test
98
+ # fixtures, post-processing) without aliasing the live state.
99
+ out = {
100
+ "value_counts" => @value_counts.dup,
48
101
  "type_counts" => @type_counts.transform_keys(&:to_s),
49
102
  "total" => @total,
50
103
  "max_values" => @max_values,
51
104
  }
105
+ if @numeric_count.positive?
106
+ out["numeric_count"] = @numeric_count
107
+ out["numeric_min"] = @numeric_min
108
+ out["numeric_max"] = @numeric_max
109
+ out["numeric_sum"] = @numeric_sum
110
+ end
111
+ out
52
112
  end
53
113
 
54
114
  def self.from_dump(h)
@@ -58,6 +118,12 @@ module Iriq
58
118
  tc = Hash.new(0).merge(h["type_counts"].transform_keys(&:to_sym))
59
119
  stats.instance_variable_set(:@value_counts, vc)
60
120
  stats.instance_variable_set(:@type_counts, tc)
121
+ if h["numeric_count"]
122
+ stats.instance_variable_set(:@numeric_count, h["numeric_count"])
123
+ stats.instance_variable_set(:@numeric_min, h["numeric_min"])
124
+ stats.instance_variable_set(:@numeric_max, h["numeric_max"])
125
+ stats.instance_variable_set(:@numeric_sum, h["numeric_sum"])
126
+ end
61
127
  stats
62
128
  end
63
129
  end
@@ -0,0 +1,54 @@
1
+ module Iriq
2
+ # Pluggable single-type classifier.
3
+ #
4
+ # A Recognizer encapsulates "this string-shape implies this type" plus the
5
+ # canonical form (if any). The ensemble-based SegmentClassifier consults
6
+ # Recognizers in order and picks the first that fires. (Scored-ensemble
7
+ # voting comes in a follow-up; for now each fire is decisive.)
8
+ #
9
+ # try(segment) -> { type:, confidence:, specificity:, canonical:, notes: } | nil
10
+ # nil — this Recognizer does not claim the segment.
11
+ # type — symbol from the recognized vocabulary.
12
+ # confidence — float in [0, 1]. Phase-1 step 2 always returns 1.0
13
+ # when a Recognizer fires; calibration arrives with the scored
14
+ # ensemble in step 4.
15
+ # canonical — canonical form (e.g. ISO date for :date). nil ≡ "use input".
16
+ # notes — optional array of strings the Trace view may surface.
17
+ #
18
+ # Recognizers are instantiated once and shared (they hold no per-call
19
+ # state). See Iriq::Recognizers::UUID / DATE / INTEGER for the built-ins.
20
+ class Recognizer
21
+ def try(_segment)
22
+ raise NotImplementedError
23
+ end
24
+
25
+ # Run each Recognizer against the segment and return the winning
26
+ # Verdict — the one with max(specificity × confidence). Ties go to
27
+ # the earlier Recognizer in the list (stable, deterministic).
28
+ # Returns nil when no Recognizer fires.
29
+ #
30
+ # Stepping-stone toward the full scored ensemble: today only three
31
+ # Recognizers participate (uuid, date, integer) and they're
32
+ # mutually-exclusive on shape, so the ensemble is effectively a
33
+ # short-circuit OR. As more Recognizers carve out of SegmentClassifier
34
+ # they'll join the pool and the scoring becomes load-bearing.
35
+ def self.ensemble(segment, *recognizers)
36
+ best = nil
37
+ best_score = -1.0
38
+ recognizers.each do |r|
39
+ v = r.try(segment)
40
+ next unless v
41
+
42
+ score = (v[:specificity] || 0.0) * (v[:confidence] || 0.0)
43
+ if score > best_score
44
+ best = v
45
+ best_score = score
46
+ end
47
+ end
48
+ best
49
+ end
50
+ end
51
+
52
+ module Recognizers
53
+ end
54
+ end