iriq 0.2.0 → 0.30.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +78 -0
- data/CLAUDE.md +128 -41
- data/Gemfile.lock +4 -4
- data/Makefile +80 -23
- data/README.md +225 -347
- data/completions/_iriq +52 -0
- data/completions/iriq.bash +70 -0
- data/docs/ARCHITECTURE.md +223 -0
- data/docs/ROADMAP.md +190 -0
- data/iriq.gemspec +2 -2
- data/lib/iriq/cli.rb +398 -46
- data/lib/iriq/cluster.rb +284 -12
- data/lib/iriq/corpus.rb +318 -36
- data/lib/iriq/cross_host_shape.rb +37 -0
- data/lib/iriq/event.rb +22 -0
- data/lib/iriq/evidence.rb +114 -0
- data/lib/iriq/explanation.rb +1 -1
- data/lib/iriq/normalizer.rb +71 -29
- data/lib/iriq/path_shape.rb +30 -24
- data/lib/iriq/position.rb +75 -0
- data/lib/iriq/position_stats.rb +74 -8
- data/lib/iriq/recognizer.rb +54 -0
- data/lib/iriq/recognizer_proposal.rb +167 -0
- data/lib/iriq/recognizers/date.rb +53 -0
- data/lib/iriq/recognizers/integer.rb +37 -0
- data/lib/iriq/recognizers/uuid.rb +16 -0
- data/lib/iriq/reducer.rb +37 -0
- data/lib/iriq/registrable_domain.rb +56 -0
- data/lib/iriq/segment_classifier.rb +475 -23
- data/lib/iriq/segment_hints.rb +9 -0
- data/lib/iriq/shape.rb +106 -0
- data/lib/iriq/specificity.rb +35 -0
- data/lib/iriq/storage/memory.rb +83 -12
- data/lib/iriq/storage/sqlite.rb +216 -37
- data/lib/iriq/synthesized_recognizer.rb +56 -0
- data/lib/iriq/trace.rb +294 -0
- data/lib/iriq/version.rb +1 -1
- data/lib/iriq.rb +17 -0
- metadata +22 -3
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
module Iriq
  module Recognizers
    # Recognizes segments made up solely of decimal digits. UNIX timestamps
    # (seconds or milliseconds) and compact YYYYMMDD dates are digit-only
    # too, so the most specific reading wins: :timestamp is tried first,
    # then :date, and plain :integer is the fallback.
    class Integer < Recognizer
      PATTERN = /\A\d+\z/.freeze
      COMPACT_DATE_PATTERN = /\A\d{8}\z/.freeze
      TS_SECONDS_RANGE = 1_000_000_000..9_999_999_999
      TS_MILLIS_RANGE = 1_000_000_000_000..9_999_999_999_999

      # Returns a hash with :type, :confidence and :specificity for an
      # all-digit segment, or nil when the segment is anything else.
      def try(segment)
        # Byte-level pre-check: the first byte must be ASCII '0'..'9'
        # (0x30..0x39) before the regex is consulted. An empty segment has
        # no first byte and is rejected here as well.
        lead = segment.getbyte(0)
        return nil if lead.nil? || lead < 0x30 || lead > 0x39
        return nil unless PATTERN.match?(segment)

        value = segment.to_i
        if TS_SECONDS_RANGE.cover?(value) || TS_MILLIS_RANGE.cover?(value)
          { type: :timestamp, confidence: 1.0, specificity: Specificity::BOUNDED }
        elsif COMPACT_DATE_PATTERN.match?(segment) &&
              segment[0, 4].to_i.between?(1900, 2100) &&
              segment[4, 2].to_i.between?(1, 12) &&
              segment[6, 2].to_i.between?(1, 31)
          # Plausibility check only: year, month and day are range-checked
          # independently, so the day is not validated against the month.
          { type: :date, confidence: 1.0, specificity: Specificity::STRUCTURED }
        else
          { type: :integer, confidence: 1.0, specificity: Specificity::TYPED }
        end
      end
    end

    INTEGER = Integer.new
  end
end
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
module Iriq
  module Recognizers
    # Matches the 8-4-4-4-12 hexadecimal UUID layout (RFC 4122). Only the
    # textual shape is checked; version and variant bits are not inspected.
    class Uuid < Recognizer
      PATTERN = /\A\h{8}-\h{4}-\h{4}-\h{4}-\h{12}\z/.freeze

      # Returns a :uuid result hash when the segment has the UUID shape,
      # nil otherwise.
      def try(segment)
        # Length and hyphen checks run before the regex is consulted.
        return nil if segment.size != 36
        return nil unless segment.include?("-")
        return nil unless PATTERN.match?(segment)

        { type: :uuid, confidence: 1.0, specificity: Specificity::SEMANTIC }
      end
    end

    UUID = Uuid.new
  end
end
|
data/lib/iriq/reducer.rb
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
module Iriq
  # Reducers translate Events into storage operations. Each reducer is a
  # callable taking (event, storage) that invokes one storage method with
  # fields read from the event. An event whose class has no registered
  # reducers is a no-op.
  #
  # To add a metric: define an Event subtype, write a reducer for it, and
  # add an entry to DEFAULTS.
  module Reducer
    # Maps an event class to the list of callables run for that class.
    # Each callable returns whatever the underlying storage call returns,
    # so the caller of .apply can pick that value up.
    DEFAULTS = {
      Event::HostSeen => [
        ->(event, storage) { storage.increment_host(event.host) }
      ],
      Event::PathLengthSeen => [
        ->(event, storage) { storage.increment_path_length(event.length) }
      ],
      Event::RawShapeSeen => [
        ->(event, storage) { storage.increment_raw_shape(event.shape) }
      ],
      Event::FingerprintSeen => [
        ->(event, storage) { storage.increment_fingerprint(event.shape) }
      ],
      Event::PositionSeen => [
        ->(event, storage) { storage.observe_position(event.position, event.value, event.type) }
      ],
      Event::ClusterAddition => [
        ->(event, storage) { storage.add_to_cluster(event.key, event.host, event.scheme, event.shape, event.identifier) }
      ],
    }.freeze

    module_function

    # Runs every reducer registered under event.class, in order, against
    # the given storage. Returns the last non-nil value any of them
    # produced, or nil when none did (including when nothing is
    # registered for the event's class).
    def apply(event, storage, reducers: DEFAULTS)
      reducers.fetch(event.class, []).reduce(nil) do |latest, handler|
        outcome = handler.call(event, storage)
        outcome.nil? ? latest : outcome
      end
    end
  end
end
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
module Iriq
  # Heuristic registrable-domain extractor. Strips subdomains so that
  # api.foo.com and app.foo.com both resolve to foo.com.
  #
  # Uses an inline allowlist of common two-label public suffixes (.co.uk,
  # .com.au, .gov.uk, etc.) instead of bundling the full Public Suffix
  # List. Multi-label suffixes that are not on the list (.priv.no,
  # .tas.gov.au, etc.) will be over-stripped; install the `public_suffix`
  # gem and wire it in if accuracy on those matters for your workload.
  module RegistrableDomain
    # All entries are lowercase; lookups must downcase before testing.
    # rubocop:disable Layout/LineLength
    TWO_LABEL_SUFFIXES = %w[
      co.uk org.uk gov.uk ac.uk net.uk me.uk ltd.uk plc.uk sch.uk
      co.jp ac.jp or.jp ne.jp go.jp gr.jp ed.jp lg.jp
      com.au net.au org.au edu.au gov.au asn.au id.au
      co.nz net.nz org.nz govt.nz ac.nz school.nz
      com.br net.br org.br gov.br edu.br
      com.cn net.cn org.cn gov.cn edu.cn ac.cn
      co.za net.za org.za gov.za ac.za
      co.kr ne.kr or.kr re.kr go.kr ac.kr
      co.in net.in org.in gov.in ac.in
      co.il net.il org.il gov.il ac.il muni.il
      com.mx net.mx org.mx gob.mx edu.mx
      com.ar net.ar org.ar gov.ar
      com.hk net.hk org.hk gov.hk edu.hk
      com.tw net.tw org.tw gov.tw edu.tw
      com.sg net.sg org.sg gov.sg edu.sg per.sg
      com.tr net.tr org.tr gov.tr edu.tr k12.tr
    ].to_set.freeze
    # rubocop:enable Layout/LineLength

    # Dotted-quad shape only; octet values are not range-checked.
    IP_V4_RE = /\A\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\z/.freeze

    module_function

    # Given an authority (hostname, no port), return the registrable
    # domain. Returns the input unchanged for nil, the empty string,
    # IPv4 literals, and hosts with two labels or fewer (`localhost`,
    # `foo.com`). Otherwise returns the last three labels when the last
    # two form a known public suffix (`foo.co.uk`), and the last two
    # labels in every other case. The casing of the returned labels is
    # whatever the caller passed in.
    def for(host)
      return host if host.nil? || host.empty?
      return host if IP_V4_RE.match?(host)

      labels = host.split(".")
      return host if labels.size <= 2

      # DNS names are case-insensitive, but the allowlist is lowercase:
      # downcase only the lookup key so `WWW.FOO.CO.UK` is not collapsed
      # to the bare public suffix `CO.UK`.
      suffix_key = labels.last(2).join(".").downcase
      keep = TWO_LABEL_SUFFIXES.include?(suffix_key) ? 3 : 2
      labels.last(keep).join(".")
    end
  end
end
|