iriq 0.1.0 → 0.30.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +87 -0
  3. data/CLAUDE.md +208 -0
  4. data/Gemfile.lock +8 -2
  5. data/Makefile +113 -0
  6. data/README.md +249 -270
  7. data/completions/_iriq +52 -0
  8. data/completions/iriq.bash +70 -0
  9. data/docs/ARCHITECTURE.md +223 -0
  10. data/docs/ROADMAP.md +190 -0
  11. data/iriq.gemspec +5 -4
  12. data/lib/iriq/cli.rb +402 -49
  13. data/lib/iriq/cluster.rb +304 -8
  14. data/lib/iriq/clusterer.rb +19 -44
  15. data/lib/iriq/corpus.rb +417 -81
  16. data/lib/iriq/cross_host_shape.rb +37 -0
  17. data/lib/iriq/event.rb +22 -0
  18. data/lib/iriq/evidence.rb +114 -0
  19. data/lib/iriq/explanation.rb +1 -1
  20. data/lib/iriq/normalizer.rb +71 -29
  21. data/lib/iriq/parser.rb +1 -1
  22. data/lib/iriq/path_shape.rb +30 -24
  23. data/lib/iriq/position.rb +75 -0
  24. data/lib/iriq/position_stats.rb +74 -8
  25. data/lib/iriq/recognizer.rb +54 -0
  26. data/lib/iriq/recognizer_proposal.rb +167 -0
  27. data/lib/iriq/recognizers/date.rb +53 -0
  28. data/lib/iriq/recognizers/integer.rb +37 -0
  29. data/lib/iriq/recognizers/uuid.rb +16 -0
  30. data/lib/iriq/reducer.rb +37 -0
  31. data/lib/iriq/registrable_domain.rb +56 -0
  32. data/lib/iriq/segment_classifier.rb +475 -23
  33. data/lib/iriq/segment_hints.rb +9 -0
  34. data/lib/iriq/shape.rb +106 -0
  35. data/lib/iriq/specificity.rb +35 -0
  36. data/lib/iriq/storage/json.rb +43 -0
  37. data/lib/iriq/storage/memory.rb +209 -0
  38. data/lib/iriq/storage/sqlite.rb +546 -0
  39. data/lib/iriq/storage.rb +35 -0
  40. data/lib/iriq/synthesized_recognizer.rb +56 -0
  41. data/lib/iriq/trace.rb +294 -0
  42. data/lib/iriq/version.rb +1 -1
  43. data/lib/iriq.rb +18 -0
  44. metadata +44 -8
  45. data/script/benchmark.rb +0 -81
  46. data/script/memory.rb +0 -121
@@ -0,0 +1,167 @@
1
+ require "set"
2
+
3
+ module Iriq
4
# A suggestion that a new Recognizer should be added to the system.
#
# Emitted by Corpus#propose_recognizers. NOT automatically activated —
# proposals carry enough evidence for a human to judge whether to add
# the Recognizer to the built-in set (or, later, to register it
# dynamically via a public Recognizer registry).
#
#   prefix            — the detected shape signature (e.g. "ghp_")
#   suggested_type    — Symbol name we'd register the Recognizer under
#                       if accepted (e.g. :ghp)
#   positions         — every Position where the proposal matched
#   hosts             — distinct hosts the proposal was seen at; a high
#                       count is strong evidence the pattern isn't
#                       host-local
#   coverage          — fraction of sampled observations at affected
#                       Positions matching the proposal pattern
#   confidence        — the value passed to the constructor, or, when
#                       none is given, coverage plus a per-host boost
#                       capped at 1.0 (see #compute_confidence)
#   observation_count — total matching observations across positions
#   sample_values     — example matches for the human reviewer
#   strategy          — the ProposalStrategy that emitted this record
class RecognizerProposal
  attr_reader :prefix, :suggested_type, :positions, :hosts,
              :coverage, :confidence, :observation_count,
              :sample_values, :strategy

  # All collection arguments are copied before being frozen, so the
  # caller's own arrays/sets stay mutable after constructing a proposal.
  #
  # confidence: is optional; when nil it is derived from coverage and
  # the number of distinct hosts.
  def initialize(prefix:, suggested_type:, positions:, hosts:,
                 coverage:, observation_count:, sample_values:,
                 strategy:, confidence: nil)
    @prefix            = prefix
    @suggested_type    = suggested_type
    @positions         = positions.dup.freeze
    @hosts             = hosts.is_a?(Set) ? hosts.dup.freeze : Set.new(hosts).freeze
    @coverage          = coverage
    @observation_count = observation_count
    @sample_values     = sample_values.dup.freeze
    @strategy          = strategy
    @confidence        = confidence.nil? ? compute_confidence : confidence
  end

  # Plain-Hash view of the proposal. Positions are converted with their
  # own #to_h and hosts are emitted as a sorted Array so the output is
  # stable across runs.
  def to_h
    {
      prefix: @prefix,
      suggested_type: @suggested_type,
      positions: @positions.map(&:to_h),
      hosts: @hosts.to_a.sort,
      coverage: @coverage,
      confidence: @confidence,
      observation_count: @observation_count,
      sample_values: @sample_values,
      strategy: @strategy,
    }
  end

  private

  # Confidence = coverage + linear cross-host boost, capped at 1.0.
  # Single-host proposals get their raw coverage as confidence (no
  # boost). Each additional host adds
  # ProposalStrategy::CROSS_HOST_BOOST_PER_HOST to the score, so the
  # higher the coverage, the fewer hosts it takes to reach the cap.
  def compute_confidence
    # An empty host set must not produce a negative boost: without the
    # floor, zero hosts would score *below* the raw coverage.
    extra_hosts = [@hosts.size - 1, 0].max
    score = @coverage + extra_hosts * ProposalStrategy::CROSS_HOST_BOOST_PER_HOST
    score > 1.0 ? 1.0 : score
  end
end
69
+
70
# Pluggable proposal-detection strategies. Each strategy.propose(storage, **opts)
# returns an array of RecognizerProposal. Adding a new detection rule =
# add a class with #propose; register it via DEFAULTS.
module ProposalStrategy
  # Default minimum total matching observations across positions before
  # we'll emit a proposal. Below this the signal is too noisy.
  DEFAULT_MIN_OBSERVATIONS = 20
  # Fraction of sampled observations at affected Positions that must
  # match the proposal pattern.
  DEFAULT_MIN_COVERAGE = 0.7
  # Minimum number of distinct hosts the proposal must appear at. For
  # single-host corpora this defaults to 1; bumping to 2+ promotes
  # cross-host patterns over host-local ones.
  DEFAULT_MIN_HOSTS = 1
  # Confidence boost added per additional host beyond the first (ten
  # hosts add +0.45). The boost itself is not capped; the consumer of
  # this constant is expected to cap the combined score. Single-host
  # patterns get no boost (their coverage IS their confidence).
  CROSS_HOST_BOOST_PER_HOST = 0.05

  # Detects `<prefix>_<alphanumeric>` patterns at slug/opaque_id
  # positions — e.g. GitHub PATs (`ghp_…`) or Stripe customer IDs
  # (`cus_…`). The prefix is lowercase letters only and the suffix is
  # alphanumeric only, so values with no underscore (`my-cool-post`) or
  # with more than one (`red_team_member`, `sk_test_…`) never match.
  # A slug with exactly one underscore (`red_team`) still matches; the
  # observation/coverage thresholds are what filter those out.
  class PrefixUnderscoreId
    PATTERN = /\A([a-z]+)_([A-Za-z0-9]+)\z/.freeze
    NAME = :prefix_underscore_id
    # Maximum number of example values attached to a proposal.
    SAMPLE_LIMIT = 5

    # Scan every position in +storage+ and return the proposals that
    # clear all three thresholds, ordered by descending confidence and
    # then by prefix.
    #
    # storage must respond to #each_position_stats, yielding
    # (position, stats) pairs; stats must expose #type_counts,
    # #value_counts and #total, and position must expose #host.
    def propose(storage,
                min_observations: DEFAULT_MIN_OBSERVATIONS,
                min_coverage: DEFAULT_MIN_COVERAGE,
                min_hosts: DEFAULT_MIN_HOSTS)
      proposals = accumulate(storage).filter_map do |prefix, acc|
        build_proposal(prefix, acc, min_observations, min_coverage, min_hosts)
      end
      proposals.sort_by { |proposal| [-proposal.confidence, proposal.prefix] }
    end

    private

    # Group matching values by prefix, collecting counts, positions,
    # hosts and the distinct matching values for each prefix.
    def accumulate(storage)
      per_prefix = Hash.new { |h, k| h[k] = empty_accumulator }

      storage.each_position_stats do |position, stats|
        next unless slug_or_opaque?(stats)

        stats.value_counts.each do |value, count|
          match = PATTERN.match(value)
          next unless match

          acc = per_prefix["#{match[1]}_"]
          acc[:matching_count] += count
          # Count each position's total once, however many of its values
          # match (Set#add? returns nil when the position is already in).
          acc[:position_observations] += stats.total if acc[:positions].add?(position)
          acc[:hosts] << position.host
          # Kept in a Set: the same value can occur at several positions
          # and must not appear twice among the samples.
          acc[:matches] << value
        end
      end

      per_prefix
    end

    # Turn one accumulator into a RecognizerProposal, or nil when any
    # threshold is missed.
    def build_proposal(prefix, acc, min_observations, min_coverage, min_hosts)
      return nil if acc[:matching_count] < min_observations
      return nil if acc[:hosts].size < min_hosts
      # No denominator means no meaningful coverage; without this guard
      # the division below would yield Infinity or NaN.
      return nil unless acc[:position_observations].positive?

      coverage = acc[:matching_count].to_f / acc[:position_observations]
      return nil if coverage < min_coverage

      RecognizerProposal.new(
        prefix: prefix,
        suggested_type: prefix.chomp("_").to_sym,
        positions: acc[:positions].to_a,
        hosts: acc[:hosts],
        coverage: coverage,
        observation_count: acc[:matching_count],
        # Sort + cap so the samples do not depend on Hash iteration
        # order. They are illustrative for humans; alphabetical is fine.
        sample_values: acc[:matches].sort.first(SAMPLE_LIMIT),
        strategy: NAME,
      )
    end

    def empty_accumulator
      {
        positions: Set.new,
        hosts: Set.new,
        matching_count: 0,
        position_observations: 0,
        matches: Set.new,
      }
    end

    # True when the most frequent type at the position is :slug or
    # :opaque_id. An empty type_counts yields false; on a tie the
    # winner is whichever entry max_by returns.
    def slug_or_opaque?(stats)
      dominant = stats.type_counts.max_by { |_, count| count }&.first
      dominant == :slug || dominant == :opaque_id
    end
  end

  DEFAULTS = [PrefixUnderscoreId.new].freeze
end
167
+ end
@@ -0,0 +1,53 @@
1
+ module Iriq
2
module Recognizers
  # Recognizes three date spellings: ISO 8601 (YYYY-MM-DD), the slash
  # form (YYYY/MM/DD) and US-style (M/D/YYYY). Compact YYYYMMDD is left
  # to the Integer recognizer, which handles digits-only input.
  #
  # Conservative: DD/MM/YYYY is intentionally NOT recognized — from a
  # bare segment we can't tell it apart from MM/DD/YYYY.
  class Date < Recognizer
    ISO_PATTERN = /\A\d{4}-\d{2}-\d{2}\z/.freeze
    SLASH_PATTERN = %r{\A\d{4}/\d{2}/\d{2}\z}.freeze
    US_PATTERN = %r{\A(\d{1,2})/(\d{1,2})/(\d{4})\z}.freeze

    # Shape-only check: returns the :date result Hash when the segment
    # has one of the three layouts, nil otherwise. Field ranges are not
    # validated here; .canonical does that.
    def try(segment)
      # Cheap separator probe before running any regex.
      return nil unless segment.include?("-") || segment.include?("/")
      return nil unless date_shaped?(segment)

      { type: :date, confidence: 1.0, specificity: Specificity::STRUCTURED }
    end

    # Canonicalize a recognized date to ISO 8601 (YYYY-MM-DD). Returns
    # nil for nil, for values matching none of the patterns, and for
    # values whose fields fail .plausible?. Day-of-month validity
    # (Feb 30, Apr 31) is deliberately not checked.
    def self.canonical(value)
      return nil if value.nil?

      case value
      when ISO_PATTERN
        value if plausible?(value[0..3], value[5..6], value[8..9])
      when SLASH_PATTERN
        value.tr("/", "-") if plausible?(value[0..3], value[5..6], value[8..9])
      when US_PATTERN
        # The `when` above populated the last-match data for US_PATTERN.
        month, day, year = ::Regexp.last_match.captures
        month = month.rjust(2, "0")
        day = day.rjust(2, "0")
        "#{year}-#{month}-#{day}" if plausible?(year, month, day)
      end
    end

    # Range check on the three fields: year 1900..2100, month 1..12,
    # day 1..31. Arguments are converted with #to_i first.
    def self.plausible?(y, m, d)
      year, month, day = [y, m, d].map(&:to_i)
      year.between?(1900, 2100) && month.between?(1, 12) && day.between?(1, 31)
    end

    private

    # True when the segment matches any of the three date layouts.
    def date_shaped?(segment)
      ISO_PATTERN.match?(segment) ||
        SLASH_PATTERN.match?(segment) ||
        US_PATTERN.match?(segment)
    end
  end

  DATE = Date.new
end
53
+ end
@@ -0,0 +1,37 @@
1
+ module Iriq
2
module Recognizers
  # Base-10 integer. Digit-only segments are further split into
  # :timestamp (value in the UNIX seconds or millis range below) and
  # :date (eight digits forming a plausible YYYYMMDD); everything else
  # digit-only is :integer. The most specific type wins.
  class Integer < Recognizer
    PATTERN = /\A\d+\z/.freeze
    COMPACT_DATE_PATTERN = /\A\d{8}\z/.freeze
    TS_SECONDS_RANGE = 1_000_000_000..9_999_999_999
    TS_MILLIS_RANGE = 1_000_000_000_000..9_999_999_999_999

    # Returns a result Hash for digit-only segments, nil for anything
    # else (including the empty string).
    def try(segment)
      # Byte-level fast path: bail out unless the first byte is an
      # ASCII digit (0x30..0x39) before touching the regex.
      lead = segment.getbyte(0)
      return nil if lead.nil? || lead < 0x30 || lead > 0x39
      return nil unless PATTERN.match?(segment)

      number = segment.to_i
      kind, specificity =
        if TS_MILLIS_RANGE.cover?(number) || TS_SECONDS_RANGE.cover?(number)
          [:timestamp, Specificity::BOUNDED]
        elsif compact_date?(segment)
          [:date, Specificity::STRUCTURED]
        else
          [:integer, Specificity::TYPED]
        end

      { type: kind, confidence: 1.0, specificity: specificity }
    end

    private

    # True for exactly eight digits whose YYYY / MM / DD slices fall in
    # 1900..2100, 1..12 and 1..31 respectively.
    def compact_date?(segment)
      return false unless COMPACT_DATE_PATTERN.match?(segment)

      year = segment[0, 4].to_i
      month = segment[4, 2].to_i
      day = segment[6, 2].to_i
      year.between?(1900, 2100) && month.between?(1, 12) && day.between?(1, 31)
    end
  end

  INTEGER = Integer.new
end
37
+ end
@@ -0,0 +1,16 @@
1
+ module Iriq
2
module Recognizers
  # RFC 4122 UUID in the 8-4-4-4-12 hex layout. Shape-only — version
  # and variant bits are not validated.
  class Uuid < Recognizer
    PATTERN = /\A\h{8}-\h{4}-\h{4}-\h{4}-\h{12}\z/.freeze

    # Returns the :uuid result Hash when the segment has the UUID
    # layout, nil otherwise.
    def try(segment)
      # Length and dash probes are cheap rejections ahead of the regex.
      uuid_shaped = segment.size == 36 &&
                    segment.include?("-") &&
                    PATTERN.match?(segment)
      return nil unless uuid_shaped

      { type: :uuid, confidence: 1.0, specificity: Specificity::SEMANTIC }
    end
  end

  UUID = Uuid.new
end
16
+ end
@@ -0,0 +1,37 @@
1
+ module Iriq
2
# Reducers consume the Event stream emitted by Corpus#observe and update
# the storage backend's materialized views. Each Reducer is a callable
# that takes (event, storage) and performs the matching storage
# operation. Events whose class has no entry in the reducer table are
# no-ops.
#
# Adding a new metric is: define a new Event subtype, write a Reducer that
# handles it, register it in DEFAULTS — no other module changes.
module Reducer
  # Reducer table: { event_class => [lambda(event, storage) -> result] }.
  # Each lambda returns whatever the underlying storage call returns, so
  # callers (Corpus#observe) can pick up the cluster they need to return.
  DEFAULTS = {
    Event::HostSeen => [
      lambda { |event, storage| storage.increment_host(event.host) },
    ],
    Event::PathLengthSeen => [
      lambda { |event, storage| storage.increment_path_length(event.length) },
    ],
    Event::RawShapeSeen => [
      lambda { |event, storage| storage.increment_raw_shape(event.shape) },
    ],
    Event::FingerprintSeen => [
      lambda { |event, storage| storage.increment_fingerprint(event.shape) },
    ],
    Event::PositionSeen => [
      lambda { |event, storage| storage.observe_position(event.position, event.value, event.type) },
    ],
    Event::ClusterAddition => [
      lambda do |event, storage|
        storage.add_to_cluster(event.key, event.host, event.scheme, event.shape, event.identifier)
      end,
    ],
  }.freeze

  module_function

  # Run every reducer registered under the event's exact class, in
  # order, and return the last non-nil value any of them produced (nil
  # when none is registered or all returned nil). Corpus#observe uses
  # this to pick up the Cluster created/updated by Event::ClusterAddition.
  def apply(event, storage, reducers: DEFAULTS)
    reducers.fetch(event.class, []).reduce(nil) do |latest, reducer|
      outcome = reducer.call(event, storage)
      outcome.nil? ? latest : outcome
    end
  end
end
37
+ end
@@ -0,0 +1,56 @@
1
+ module Iriq
2
# Heuristic registrable-domain extractor. Strips subdomains so that
# api.foo.com and app.foo.com both resolve to foo.com.
#
# Uses an inline allowlist of about 90 common two-label public suffixes
# (.co.uk, .com.au, .gov.uk, etc.) — enough for the bulk of real-world
# traffic without the ~3 MB cost of bundling the full Public Suffix
# List. Suffixes missing from the list (.priv.no, deeper ones such as
# .tas.gov.au, etc.) will be over-stripped; install the `public_suffix`
# gem and wire it in if accuracy on those matters for your workload.
module RegistrableDomain
  # rubocop:disable Layout/LineLength
  TWO_LABEL_SUFFIXES = %w[
    co.uk org.uk gov.uk ac.uk net.uk me.uk ltd.uk plc.uk sch.uk
    co.jp ac.jp or.jp ne.jp go.jp gr.jp ed.jp lg.jp
    com.au net.au org.au edu.au gov.au asn.au id.au
    co.nz net.nz org.nz govt.nz ac.nz school.nz
    com.br net.br org.br gov.br edu.br
    com.cn net.cn org.cn gov.cn edu.cn ac.cn
    co.za net.za org.za gov.za ac.za
    co.kr ne.kr or.kr re.kr go.kr ac.kr
    co.in net.in org.in gov.in ac.in
    co.il net.il org.il gov.il ac.il muni.il
    com.mx net.mx org.mx gob.mx edu.mx
    com.ar net.ar org.ar gov.ar
    com.hk net.hk org.hk gov.hk edu.hk
    com.tw net.tw org.tw gov.tw edu.tw
    com.sg net.sg org.sg gov.sg edu.sg per.sg
    com.tr net.tr org.tr gov.tr edu.tr k12.tr
  ].to_set.freeze
  # rubocop:enable Layout/LineLength

  IP_V4_RE = /\A\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\z/.freeze

  module_function

  # Given an authority (hostname, no port), return the registrable
  # domain. Returns the input unchanged for nil, the empty string,
  # dotted-quad IPv4 literals, and hosts of one or two labels
  # (`localhost`, `foo.com`, `co.uk`).
  #
  # The suffix lookup is case-insensitive because DNS names are; the
  # returned string keeps the caller's original casing.
  def for(host)
    return host if host.nil? || host.empty?
    return host if IP_V4_RE.match?(host)

    labels = host.split(".")
    return host if labels.size <= 2

    # Known two-label public suffix → keep three labels (`foo.co.uk`);
    # otherwise keep two (`foo.com`).
    tail_two = labels.last(2).join(".").downcase
    keep = TWO_LABEL_SUFFIXES.include?(tail_two) ? 3 : 2
    labels.last(keep).join(".")
  end
end
56
+ end