apidepth 0.5.0 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d59e0f9e95678afd4a0f9ce173de1b63ac893ab01cbc68fc1d6bbcedc44bfaad
4
- data.tar.gz: 29524e4a1687cafb241caa6c07dbb2c1139ef2e44bc92f8ded25dc15c1305a92
3
+ metadata.gz: 69006b5a613f60527b41de94536502c71526d9e9dd274625c7f235d03624d194
4
+ data.tar.gz: 76941c29e08ec0c4a40e310a017f57f9c32f1c90cc37820ce794834c90ad8d15
5
5
  SHA512:
6
- metadata.gz: a4a88ac5df80e6af2987cfa29d354c67c5275c02e30caacbae086558d2a0ee1fe1304a42c11b679b2ec6a1358a1cd0e69c3969a5d9d400c246723592df448599
7
- data.tar.gz: 2005a0c7164ed0b20da0e1013cdc250be7d09f66fdfdd65f465168f38f4a105bfa6370e3a98bef015502f7cb2ba961a99189a6232e0098eb47ca11c588d90ed4
6
+ metadata.gz: d5e0babbb150df6d38aa32c83df843da7d7991c8fd17be37010d1454eaef8a936b878c4efdc8eb3373d714c70e75a124a99ada7d3fa43e248c4eb464920f833c
7
+ data.tar.gz: fd034d845f43a8f34af26f6cf99067ec51c690168744d379d19727de2c383ea88a8e9ffd89dbdc8ffc79b6fbd2e73fa29f684b1415c7708c1c88bc78ee01f52b
@@ -1,5 +1,4 @@
1
1
  # lib/apidepth/model_name_extractor.rb
2
- require "json"
3
2
  require "set"
4
3
  #
5
4
  # Extracts the model name from AI vendor JSON response bodies.
@@ -19,7 +18,12 @@ require "set"
19
18
  #
20
19
  # Streaming safety: streamed responses have Content-Type: text/event-stream, not
21
20
  # application/json. The content-type guard exits early before any body read.
22
- # The 8KB truncation is a belt-and-suspenders guard against unusually large bodies.
21
+ #
22
+ # Extraction strategy (RUBY-018): scan for the JSON "model": "<value>" field
23
+ # with a linear regex rather than JSON.parse-ing a truncated body. Embeddings
24
+ # and batch responses place `model` AFTER a large `data` array, so the old
25
+ # parse-after-8KB-truncate approach produced invalid JSON and silently dropped
26
+ # the model. The regex finds the first structural model field wherever it sits.
23
27
 
24
28
  module Apidepth
25
29
  module ModelNameExtractor
@@ -31,20 +35,33 @@ module Apidepth
31
35
  api.cohere.com
32
36
  ].to_set.freeze
33
37
 
34
- MAX_BODY_BYTES = 8_192
38
+ # Upper bound on how far into the body we scan for the model field. 256 KB
39
+ # comfortably covers realistic embeddings/batch responses (a few-input OpenAI
40
+ # embeddings body is ~23 KB) while bounding work on pathologically large bodies.
41
+ MODEL_SCAN_MAX_BYTES = 262_144
42
+
43
+ # Matches a structural JSON "model": "<value>" pair. Escaped quotes inside
44
+ # string values appear as \" so this never matches a "model" mentioned inside
45
+ # another JSON string. First match wins (the top-level model field).
46
+ MODEL_RE = /"model"\s*:\s*"([^"]+)"/.freeze
35
47
 
36
48
  def self.extract(host, response)
37
49
  return nil unless Apidepth.configuration.capture_model_names
38
- return nil unless AI_VENDOR_HOSTS.include?(host)
50
+ # Case-insensitive host match (RUBY-019): DNS hostnames are case-insensitive,
51
+ # so a vendor declared with mixed case (e.g. via extra_vendors) still matches.
52
+ return nil unless AI_VENDOR_HOSTS.include?(host.to_s.downcase)
39
53
  return nil unless response["content-type"]&.include?("application/json")
40
54
 
41
55
  body = response.body
42
56
  return nil if body.nil? || body.empty?
43
57
 
44
- parsed = JSON.parse(body.byteslice(0, MAX_BODY_BYTES), symbolize_names: true)
45
- model = parsed[:model]
46
- model.is_a?(String) && !model.empty? ? model : nil
47
- rescue JSON::ParserError, Encoding::UndefinedConversionError, TypeError
58
+ scan = body.byteslice(0, MODEL_SCAN_MAX_BYTES).to_s.dup.force_encoding("UTF-8")
59
+ match = MODEL_RE.match(scan)
60
+ match && !match[1].empty? ? match[1] : nil
61
+ rescue StandardError
62
+ # Covers malformed/invalid-encoding bodies and non-buffered streaming
63
+ # bodies (e.g. Net::ReadAdapter, which has no #empty?). Returning nil keeps
64
+ # the surrounding telemetry event intact rather than dropping it (RUBY-017).
48
65
  nil
49
66
  end
50
67
  end
@@ -58,12 +58,22 @@ module Apidepth
58
58
  }
59
59
  }.freeze
60
60
 
61
+ # Generic fallbacks applied after vendor-specific patterns. Canonical across
62
+ # all SDKs (XSDK-NORM) — see apidepth-collector/tests/fixtures/endpoint_cases.json.
63
+ # The :token rule requires at least one digit (?=[a-z0-9]*\d) so 24+ char
64
+ # readable slugs are left intact while opaque IDs/tokens — which effectively
65
+ # always contain a digit — are collapsed. UUID is case-insensitive.
61
66
  GENERIC_PATTERNS = [
62
- [%r{/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}}, "/:uuid"],
63
- [%r{/\d{4,}}, "/:id"],
64
- [%r{/[a-z0-9]{24,}}, "/:token"]
67
+ [%r{/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}}i, "/:uuid"],
68
+ [%r{/\d{4,}}, "/:id"],
69
+ [%r{/(?=[a-z0-9]*\d)[a-z0-9]{24,}}i, "/:token"]
65
70
  ].freeze
66
71
 
72
+ # Upper bound on path length we run the generic normalizers against. Realistic
73
+ # paths are well under 4 KB; above this we skip normalization because the
74
+ # :token lookahead is O(n^2) worst-case on a long digit-free alnum run.
75
+ GENERIC_MAX_PATH = 4096
76
+
67
77
  # True when the runtime supports Regexp.timeout (introduced in Ruby 3.2).
68
78
  # Used by apply_vendor_normalizers to enable ReDoS protection when available.
69
79
  RUBY_GTE_3_2 = Gem::Version.new(RUBY_VERSION) >= Gem::Version.new("3.2")
@@ -157,7 +167,13 @@ module Apidepth
157
167
  next
158
168
  end
159
169
 
160
- [Regexp.new(match), rule["replace"].to_s]
170
+ # ReDoS protection: bake a per-pattern timeout into the Regexp at
171
+ # compile time on Ruby >= 3.2 (RUBY-020). This bounds match time for
172
+ # a pathological pattern from a compromised/misconfigured registry
173
+ # without mutating the process-global Regexp.timeout on every request
174
+ # (which would impose the limit on unrelated regexes in other threads).
175
+ compiled = RUBY_GTE_3_2 ? Regexp.new(match, timeout: 0.001) : Regexp.new(match)
176
+ [compiled, rule["replace"].to_s]
161
177
  rescue RegexpError => e
162
178
  Apidepth.logger&.warn(
163
179
  "[Apidepth] Skipping invalid pattern for #{Apidepth.sanitize_log(slug)} " \
@@ -179,25 +195,23 @@ module Apidepth
179
195
  # broader catch-alls (e.g. /v1/:resource/:id). A less-specific rule placed
180
196
  # earlier will shadow any more-specific rules that follow it.
181
197
  #
182
- # ReDoS protection: on Ruby >= 3.2 we apply a per-match timeout of 1ms so
183
- # that a pathological pattern from a compromised or misconfigured registry
184
- # cannot stall the request thread indefinitely. On older Ruby, Regexp.timeout
185
- # is not available — use a trusted, internally-reviewed registry source.
198
+ # ReDoS protection: each compiled pattern carries its own 1ms timeout (set
199
+ # in build_patterns on Ruby >= 3.2), so a pathological pattern from a
200
+ # compromised or misconfigured registry cannot stall the request thread
201
+ # indefinitely — without touching the process-global Regexp.timeout
202
+ # (RUBY-020). On older Ruby the timeout is absent; use a trusted,
203
+ # internally-reviewed registry source. A Regexp::TimeoutError that trips
204
+ # here propagates to identify's caller, which already rescues StandardError.
186
205
  def apply_vendor_normalizers(rules, path)
187
- if RUBY_GTE_3_2
188
- saved_timeout = Regexp.timeout
189
- Regexp.timeout = 0.001
190
- end
191
-
192
206
  rules.each do |pattern, replacement|
193
207
  return path.gsub(pattern, replacement) if path.match?(pattern)
194
208
  end
195
209
  path
196
- ensure
197
- Regexp.timeout = saved_timeout if RUBY_GTE_3_2
198
210
  end
199
211
 
200
212
  def apply_generic_normalizers(path)
213
+ return path if path.length > GENERIC_MAX_PATH
214
+
201
215
  GENERIC_PATTERNS.reduce(path) do |p, (pattern, replacement)|
202
216
  p.gsub(pattern, replacement)
203
217
  end
@@ -1,5 +1,5 @@
1
1
  # lib/apidepth/version.rb
2
2
 
3
3
  module Apidepth
4
- VERSION = "0.5.0".freeze
4
+ VERSION = "0.5.1".freeze
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: apidepth
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.0
4
+ version: 0.5.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Apidepth
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2026-06-03 00:00:00.000000000 Z
11
+ date: 2026-06-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: json