iriq 0.2.0 → 0.30.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,37 @@
1
require "date"

module Iriq
  module Recognizers
    # Recognizes segments made up solely of base-10 digits.
    #
    # Digit-only strings share their lexical shape with UNIX timestamps and
    # compact dates, so the most specific reading wins:
    #
    #   :timestamp — the value falls in the 10-digit (seconds) or 13-digit
    #                (milliseconds) range
    #   :date      — exactly eight digits forming a real YYYYMMDD calendar
    #                date with a year in COMPACT_DATE_YEARS
    #   :integer   — any other digit-only segment
    class Integer < Recognizer
      PATTERN = /\A\d+\z/.freeze
      COMPACT_DATE_PATTERN = /\A\d{8}\z/.freeze
      TS_SECONDS_RANGE = 1_000_000_000..9_999_999_999
      TS_MILLIS_RANGE = 1_000_000_000_000..9_999_999_999_999
      # Years for which an eight-digit segment is still treated as a date
      # rather than as an arbitrary integer.
      COMPACT_DATE_YEARS = (1900..2100).freeze

      # Classify +segment+ as a timestamp, date, or plain integer.
      #
      # @param segment [String] the candidate segment
      # @return [Hash, nil] a hash with :type, :confidence and :specificity
      #   keys, or nil when the segment is not digit-only
      def try(segment)
        # Byte-level pre-check: reject empty segments and anything that does
        # not start with an ASCII digit before running the regexp.
        lead = segment.getbyte(0)
        return nil unless lead && lead >= 0x30 && lead <= 0x39 && PATTERN.match?(segment)

        value = segment.to_i
        if TS_MILLIS_RANGE.cover?(value) || TS_SECONDS_RANGE.cover?(value)
          return { type: :timestamp, confidence: 1.0, specificity: Specificity::BOUNDED }
        end

        if compact_date?(segment)
          return { type: :date, confidence: 1.0, specificity: Specificity::STRUCTURED }
        end

        { type: :integer, confidence: 1.0, specificity: Specificity::TYPED }
      end

      private

      # True when +segment+ is eight digits that spell an existing calendar
      # date. Date.valid_date? rejects impossible combinations such as
      # 20230231 or 20230229, which a plain 1..31 day check would let through.
      def compact_date?(segment)
        return false unless COMPACT_DATE_PATTERN.match?(segment)

        year = segment[0, 4].to_i
        return false unless COMPACT_DATE_YEARS.cover?(year)

        ::Date.valid_date?(year, segment[4, 2].to_i, segment[6, 2].to_i)
      end
    end

    INTEGER = Integer.new
  end
end
@@ -0,0 +1,16 @@
1
module Iriq
  module Recognizers
    # Recognizes the 8-4-4-4-12 hexadecimal layout of an RFC 4122 UUID.
    # Only the lexical shape is checked; version and variant bits are not
    # inspected.
    class Uuid < Recognizer
      PATTERN = /\A\h{8}-\h{4}-\h{4}-\h{4}-\h{12}\z/.freeze

      # @param segment [String] the candidate segment
      # @return [Hash, nil] the :uuid classification, or nil when the segment
      #   does not have the UUID shape
      def try(segment)
        # Length and hyphen checks run before the regexp so that segments
        # which cannot possibly match are rejected without invoking it.
        return nil if segment.size != 36
        return nil unless segment.include?("-")
        return nil unless PATTERN.match?(segment)

        { type: :uuid, confidence: 1.0, specificity: Specificity::SEMANTIC }
      end
    end

    UUID = Uuid.new
  end
end
@@ -0,0 +1,37 @@
1
module Iriq
  # Reducers translate events into storage operations. Each reducer is a
  # callable taking (event, storage) and invoking one method on the storage
  # object. Reducers are looked up by the event's exact class; an event whose
  # class has no entry is a no-op.
  #
  # To add a new metric: define a new Event subtype, write a reducer for it,
  # and register it in DEFAULTS.
  module Reducer
    # Registry shape: { event_class => [lambda(event, storage) -> result] }.
    # Every lambda returns whatever the underlying storage call returns, so
    # the caller of .apply can receive that value.
    DEFAULTS = {
      Event::HostSeen => [->(event, store) { store.increment_host(event.host) }],
      Event::PathLengthSeen => [->(event, store) { store.increment_path_length(event.length) }],
      Event::RawShapeSeen => [->(event, store) { store.increment_raw_shape(event.shape) }],
      Event::FingerprintSeen => [->(event, store) { store.increment_fingerprint(event.shape) }],
      Event::PositionSeen => [
        ->(event, store) { store.observe_position(event.position, event.value, event.type) },
      ],
      Event::ClusterAddition => [
        lambda do |event, store|
          store.add_to_cluster(event.key, event.host, event.scheme, event.shape, event.identifier)
        end,
      ],
    }.freeze

    module_function

    # Run every reducer registered for the event's class against +storage+,
    # in registration order.
    #
    # @param event [Object] the event to dispatch; its class selects the reducers
    # @param storage [Object] the storage backend handed to each reducer
    # @param reducers [Hash] registry mapping event classes to arrays of callables
    # @return [Object, nil] the last non-nil reducer result, or nil when no
    #   reducer is registered or every reducer returned nil
    def apply(event, storage, reducers: DEFAULTS)
      reducers.fetch(event.class, []).reduce(nil) do |latest, reducer|
        outcome = reducer.call(event, storage)
        # Only nil is skipped; a false result still replaces the previous one.
        outcome.nil? ? latest : outcome
      end
    end
  end
end
@@ -0,0 +1,56 @@
1
module Iriq
  # Heuristic registrable-domain extractor. Strips subdomains so that
  # api.foo.com and app.foo.com both resolve to foo.com.
  #
  # Uses an inline allowlist of common two-label public suffixes (.co.uk,
  # .com.au, .gov.uk, etc.) instead of bundling the full Public Suffix List.
  # Suffixes that are not on the list (.priv.no, .tas.gov.au, etc.) will be
  # over-stripped; install the `public_suffix` gem and wire it in if accuracy
  # on those matters for your workload.
  module RegistrableDomain
    # rubocop:disable Layout/LineLength
    TWO_LABEL_SUFFIXES = %w[
      co.uk org.uk gov.uk ac.uk net.uk me.uk ltd.uk plc.uk sch.uk
      co.jp ac.jp or.jp ne.jp go.jp gr.jp ed.jp lg.jp
      com.au net.au org.au edu.au gov.au asn.au id.au
      co.nz net.nz org.nz govt.nz ac.nz school.nz
      com.br net.br org.br gov.br edu.br
      com.cn net.cn org.cn gov.cn edu.cn ac.cn
      co.za net.za org.za gov.za ac.za
      co.kr ne.kr or.kr re.kr go.kr ac.kr
      co.in net.in org.in gov.in ac.in
      co.il net.il org.il gov.il ac.il muni.il
      com.mx net.mx org.mx gob.mx edu.mx
      com.ar net.ar org.ar gov.ar
      com.hk net.hk org.hk gov.hk edu.hk
      com.tw net.tw org.tw gov.tw edu.tw
      com.sg net.sg org.sg gov.sg edu.sg per.sg
      com.tr net.tr org.tr gov.tr edu.tr k12.tr
    ].to_set.freeze
    # rubocop:enable Layout/LineLength

    IP_V4_RE = /\A\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\z/.freeze

    module_function

    # Given an authority (hostname, no port), return the registrable domain.
    #
    # The input is returned unchanged for nil/empty values, IP literals
    # (IPv4 and IPv6), single-label hosts (`localhost`), and hosts that
    # consist of only two labels. The suffix lookup is case-insensitive, but
    # the returned string keeps the caller's original casing.
    #
    # @param host [String, nil] the hostname to reduce
    # @return [String, nil] the registrable domain, or +host+ itself when no
    #   stripping applies
    def for(host)
      return host if host.nil? || host.empty?
      return host if IP_V4_RE.match?(host)
      # IPv6 literals may embed a dotted quad (`::ffff:192.168.0.1`), which
      # would otherwise be split on "." and mangled. Requiring a bracket or
      # at least two colons leaves a stray `host:port` input on its old path.
      return host if host.start_with?("[") || host.count(":") > 1

      labels = host.split(".")
      return host if labels.size <= 2

      # Hostnames are case-insensitive, so normalise only for the lookup.
      tail_two = labels.last(2).join(".")
      if TWO_LABEL_SUFFIXES.include?(tail_two.downcase)
        # Multi-label public suffix — keep last 3 labels (`foo.co.uk`).
        labels.last(3).join(".")
      else
        tail_two
      end
    end
  end
end
+ end