oscura-0.5.1-py3-none-any.whl → oscura-0.7.0-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package versions as they appear in the public registry.
Files changed (497)
  1. oscura/__init__.py +169 -167
  2. oscura/analyzers/__init__.py +3 -0
  3. oscura/analyzers/classification.py +659 -0
  4. oscura/analyzers/digital/edges.py +325 -65
  5. oscura/analyzers/digital/quality.py +293 -166
  6. oscura/analyzers/digital/timing.py +260 -115
  7. oscura/analyzers/digital/timing_numba.py +334 -0
  8. oscura/analyzers/entropy.py +605 -0
  9. oscura/analyzers/eye/diagram.py +176 -109
  10. oscura/analyzers/eye/metrics.py +5 -5
  11. oscura/analyzers/jitter/__init__.py +6 -4
  12. oscura/analyzers/jitter/ber.py +52 -52
  13. oscura/analyzers/jitter/classification.py +156 -0
  14. oscura/analyzers/jitter/decomposition.py +163 -113
  15. oscura/analyzers/jitter/spectrum.py +80 -64
  16. oscura/analyzers/ml/__init__.py +39 -0
  17. oscura/analyzers/ml/features.py +600 -0
  18. oscura/analyzers/ml/signal_classifier.py +604 -0
  19. oscura/analyzers/packet/daq.py +246 -158
  20. oscura/analyzers/packet/parser.py +12 -1
  21. oscura/analyzers/packet/payload.py +50 -2110
  22. oscura/analyzers/packet/payload_analysis.py +361 -181
  23. oscura/analyzers/packet/payload_patterns.py +133 -70
  24. oscura/analyzers/packet/stream.py +84 -23
  25. oscura/analyzers/patterns/__init__.py +26 -5
  26. oscura/analyzers/patterns/anomaly_detection.py +908 -0
  27. oscura/analyzers/patterns/clustering.py +169 -108
  28. oscura/analyzers/patterns/clustering_optimized.py +227 -0
  29. oscura/analyzers/patterns/discovery.py +1 -1
  30. oscura/analyzers/patterns/matching.py +581 -197
  31. oscura/analyzers/patterns/pattern_mining.py +778 -0
  32. oscura/analyzers/patterns/periodic.py +121 -38
  33. oscura/analyzers/patterns/sequences.py +175 -78
  34. oscura/analyzers/power/conduction.py +1 -1
  35. oscura/analyzers/power/soa.py +6 -6
  36. oscura/analyzers/power/switching.py +250 -110
  37. oscura/analyzers/protocol/__init__.py +17 -1
  38. oscura/analyzers/protocols/base.py +6 -6
  39. oscura/analyzers/protocols/ble/__init__.py +38 -0
  40. oscura/analyzers/protocols/ble/analyzer.py +809 -0
  41. oscura/analyzers/protocols/ble/uuids.py +288 -0
  42. oscura/analyzers/protocols/can.py +257 -127
  43. oscura/analyzers/protocols/can_fd.py +107 -80
  44. oscura/analyzers/protocols/flexray.py +139 -80
  45. oscura/analyzers/protocols/hdlc.py +93 -58
  46. oscura/analyzers/protocols/i2c.py +247 -106
  47. oscura/analyzers/protocols/i2s.py +138 -86
  48. oscura/analyzers/protocols/industrial/__init__.py +40 -0
  49. oscura/analyzers/protocols/industrial/bacnet/__init__.py +33 -0
  50. oscura/analyzers/protocols/industrial/bacnet/analyzer.py +708 -0
  51. oscura/analyzers/protocols/industrial/bacnet/encoding.py +412 -0
  52. oscura/analyzers/protocols/industrial/bacnet/services.py +622 -0
  53. oscura/analyzers/protocols/industrial/ethercat/__init__.py +30 -0
  54. oscura/analyzers/protocols/industrial/ethercat/analyzer.py +474 -0
  55. oscura/analyzers/protocols/industrial/ethercat/mailbox.py +339 -0
  56. oscura/analyzers/protocols/industrial/ethercat/topology.py +166 -0
  57. oscura/analyzers/protocols/industrial/modbus/__init__.py +31 -0
  58. oscura/analyzers/protocols/industrial/modbus/analyzer.py +525 -0
  59. oscura/analyzers/protocols/industrial/modbus/crc.py +79 -0
  60. oscura/analyzers/protocols/industrial/modbus/functions.py +436 -0
  61. oscura/analyzers/protocols/industrial/opcua/__init__.py +21 -0
  62. oscura/analyzers/protocols/industrial/opcua/analyzer.py +552 -0
  63. oscura/analyzers/protocols/industrial/opcua/datatypes.py +446 -0
  64. oscura/analyzers/protocols/industrial/opcua/services.py +264 -0
  65. oscura/analyzers/protocols/industrial/profinet/__init__.py +23 -0
  66. oscura/analyzers/protocols/industrial/profinet/analyzer.py +441 -0
  67. oscura/analyzers/protocols/industrial/profinet/dcp.py +263 -0
  68. oscura/analyzers/protocols/industrial/profinet/ptcp.py +200 -0
  69. oscura/analyzers/protocols/jtag.py +180 -98
  70. oscura/analyzers/protocols/lin.py +219 -114
  71. oscura/analyzers/protocols/manchester.py +4 -4
  72. oscura/analyzers/protocols/onewire.py +253 -149
  73. oscura/analyzers/protocols/parallel_bus/__init__.py +20 -0
  74. oscura/analyzers/protocols/parallel_bus/centronics.py +92 -0
  75. oscura/analyzers/protocols/parallel_bus/gpib.py +137 -0
  76. oscura/analyzers/protocols/spi.py +192 -95
  77. oscura/analyzers/protocols/swd.py +321 -167
  78. oscura/analyzers/protocols/uart.py +267 -125
  79. oscura/analyzers/protocols/usb.py +235 -131
  80. oscura/analyzers/side_channel/power.py +17 -12
  81. oscura/analyzers/signal/__init__.py +15 -0
  82. oscura/analyzers/signal/timing_analysis.py +1086 -0
  83. oscura/analyzers/signal_integrity/__init__.py +4 -1
  84. oscura/analyzers/signal_integrity/sparams.py +2 -19
  85. oscura/analyzers/spectral/chunked.py +129 -60
  86. oscura/analyzers/spectral/chunked_fft.py +300 -94
  87. oscura/analyzers/spectral/chunked_wavelet.py +100 -80
  88. oscura/analyzers/statistical/checksum.py +376 -217
  89. oscura/analyzers/statistical/classification.py +229 -107
  90. oscura/analyzers/statistical/entropy.py +78 -53
  91. oscura/analyzers/statistics/correlation.py +407 -211
  92. oscura/analyzers/statistics/outliers.py +2 -2
  93. oscura/analyzers/statistics/streaming.py +30 -5
  94. oscura/analyzers/validation.py +216 -101
  95. oscura/analyzers/waveform/measurements.py +9 -0
  96. oscura/analyzers/waveform/measurements_with_uncertainty.py +31 -15
  97. oscura/analyzers/waveform/spectral.py +500 -228
  98. oscura/api/__init__.py +31 -5
  99. oscura/api/dsl/__init__.py +582 -0
  100. oscura/{dsl → api/dsl}/commands.py +43 -76
  101. oscura/{dsl → api/dsl}/interpreter.py +26 -51
  102. oscura/{dsl → api/dsl}/parser.py +107 -77
  103. oscura/{dsl → api/dsl}/repl.py +2 -2
  104. oscura/api/dsl.py +1 -1
  105. oscura/{integrations → api/integrations}/__init__.py +1 -1
  106. oscura/{integrations → api/integrations}/llm.py +201 -102
  107. oscura/api/operators.py +3 -3
  108. oscura/api/optimization.py +144 -30
  109. oscura/api/rest_server.py +921 -0
  110. oscura/api/server/__init__.py +17 -0
  111. oscura/api/server/dashboard.py +850 -0
  112. oscura/api/server/static/README.md +34 -0
  113. oscura/api/server/templates/base.html +181 -0
  114. oscura/api/server/templates/export.html +120 -0
  115. oscura/api/server/templates/home.html +284 -0
  116. oscura/api/server/templates/protocols.html +58 -0
  117. oscura/api/server/templates/reports.html +43 -0
  118. oscura/api/server/templates/session_detail.html +89 -0
  119. oscura/api/server/templates/sessions.html +83 -0
  120. oscura/api/server/templates/waveforms.html +73 -0
  121. oscura/automotive/__init__.py +8 -1
  122. oscura/automotive/can/__init__.py +10 -0
  123. oscura/automotive/can/checksum.py +3 -1
  124. oscura/automotive/can/dbc_generator.py +590 -0
  125. oscura/automotive/can/message_wrapper.py +121 -74
  126. oscura/automotive/can/patterns.py +98 -21
  127. oscura/automotive/can/session.py +292 -56
  128. oscura/automotive/can/state_machine.py +6 -3
  129. oscura/automotive/can/stimulus_response.py +97 -75
  130. oscura/automotive/dbc/__init__.py +10 -2
  131. oscura/automotive/dbc/generator.py +84 -56
  132. oscura/automotive/dbc/parser.py +6 -6
  133. oscura/automotive/dtc/data.json +17 -102
  134. oscura/automotive/dtc/database.py +2 -2
  135. oscura/automotive/flexray/__init__.py +31 -0
  136. oscura/automotive/flexray/analyzer.py +504 -0
  137. oscura/automotive/flexray/crc.py +185 -0
  138. oscura/automotive/flexray/fibex.py +449 -0
  139. oscura/automotive/j1939/__init__.py +45 -8
  140. oscura/automotive/j1939/analyzer.py +605 -0
  141. oscura/automotive/j1939/spns.py +326 -0
  142. oscura/automotive/j1939/transport.py +306 -0
  143. oscura/automotive/lin/__init__.py +47 -0
  144. oscura/automotive/lin/analyzer.py +612 -0
  145. oscura/automotive/loaders/blf.py +13 -2
  146. oscura/automotive/loaders/csv_can.py +143 -72
  147. oscura/automotive/loaders/dispatcher.py +50 -2
  148. oscura/automotive/loaders/mdf.py +86 -45
  149. oscura/automotive/loaders/pcap.py +111 -61
  150. oscura/automotive/uds/__init__.py +4 -0
  151. oscura/automotive/uds/analyzer.py +725 -0
  152. oscura/automotive/uds/decoder.py +140 -58
  153. oscura/automotive/uds/models.py +7 -1
  154. oscura/automotive/visualization.py +1 -1
  155. oscura/cli/analyze.py +348 -0
  156. oscura/cli/batch.py +142 -122
  157. oscura/cli/benchmark.py +275 -0
  158. oscura/cli/characterize.py +137 -82
  159. oscura/cli/compare.py +224 -131
  160. oscura/cli/completion.py +250 -0
  161. oscura/cli/config_cmd.py +361 -0
  162. oscura/cli/decode.py +164 -87
  163. oscura/cli/export.py +286 -0
  164. oscura/cli/main.py +115 -31
  165. oscura/{onboarding → cli/onboarding}/__init__.py +3 -3
  166. oscura/{onboarding → cli/onboarding}/help.py +80 -58
  167. oscura/{onboarding → cli/onboarding}/tutorials.py +97 -72
  168. oscura/{onboarding → cli/onboarding}/wizard.py +55 -36
  169. oscura/cli/progress.py +147 -0
  170. oscura/cli/shell.py +157 -135
  171. oscura/cli/validate_cmd.py +204 -0
  172. oscura/cli/visualize.py +158 -0
  173. oscura/convenience.py +125 -79
  174. oscura/core/__init__.py +4 -2
  175. oscura/core/backend_selector.py +3 -3
  176. oscura/core/cache.py +126 -15
  177. oscura/core/cancellation.py +1 -1
  178. oscura/{config → core/config}/__init__.py +20 -11
  179. oscura/{config → core/config}/defaults.py +1 -1
  180. oscura/{config → core/config}/loader.py +7 -5
  181. oscura/{config → core/config}/memory.py +5 -5
  182. oscura/{config → core/config}/migration.py +1 -1
  183. oscura/{config → core/config}/pipeline.py +99 -23
  184. oscura/{config → core/config}/preferences.py +1 -1
  185. oscura/{config → core/config}/protocol.py +3 -3
  186. oscura/{config → core/config}/schema.py +426 -272
  187. oscura/{config → core/config}/settings.py +1 -1
  188. oscura/{config → core/config}/thresholds.py +195 -153
  189. oscura/core/correlation.py +5 -6
  190. oscura/core/cross_domain.py +0 -2
  191. oscura/core/debug.py +9 -5
  192. oscura/{extensibility → core/extensibility}/docs.py +158 -70
  193. oscura/{extensibility → core/extensibility}/extensions.py +160 -76
  194. oscura/{extensibility → core/extensibility}/logging.py +1 -1
  195. oscura/{extensibility → core/extensibility}/measurements.py +1 -1
  196. oscura/{extensibility → core/extensibility}/plugins.py +1 -1
  197. oscura/{extensibility → core/extensibility}/templates.py +73 -3
  198. oscura/{extensibility → core/extensibility}/validation.py +1 -1
  199. oscura/core/gpu_backend.py +11 -7
  200. oscura/core/log_query.py +101 -11
  201. oscura/core/logging.py +126 -54
  202. oscura/core/logging_advanced.py +5 -5
  203. oscura/core/memory_limits.py +108 -70
  204. oscura/core/memory_monitor.py +2 -2
  205. oscura/core/memory_progress.py +7 -7
  206. oscura/core/memory_warnings.py +1 -1
  207. oscura/core/numba_backend.py +13 -13
  208. oscura/{plugins → core/plugins}/__init__.py +9 -9
  209. oscura/{plugins → core/plugins}/base.py +7 -7
  210. oscura/{plugins → core/plugins}/cli.py +3 -3
  211. oscura/{plugins → core/plugins}/discovery.py +186 -106
  212. oscura/{plugins → core/plugins}/lifecycle.py +1 -1
  213. oscura/{plugins → core/plugins}/manager.py +7 -7
  214. oscura/{plugins → core/plugins}/registry.py +3 -3
  215. oscura/{plugins → core/plugins}/versioning.py +1 -1
  216. oscura/core/progress.py +16 -1
  217. oscura/core/provenance.py +8 -2
  218. oscura/{schemas → core/schemas}/__init__.py +2 -2
  219. oscura/{schemas → core/schemas}/device_mapping.json +2 -8
  220. oscura/{schemas → core/schemas}/packet_format.json +4 -24
  221. oscura/{schemas → core/schemas}/protocol_definition.json +2 -12
  222. oscura/core/types.py +4 -0
  223. oscura/core/uncertainty.py +3 -3
  224. oscura/correlation/__init__.py +52 -0
  225. oscura/correlation/multi_protocol.py +811 -0
  226. oscura/discovery/auto_decoder.py +117 -35
  227. oscura/discovery/comparison.py +191 -86
  228. oscura/discovery/quality_validator.py +155 -68
  229. oscura/discovery/signal_detector.py +196 -79
  230. oscura/export/__init__.py +18 -8
  231. oscura/export/kaitai_struct.py +513 -0
  232. oscura/export/scapy_layer.py +801 -0
  233. oscura/export/wireshark/generator.py +1 -1
  234. oscura/export/wireshark/templates/dissector.lua.j2 +2 -2
  235. oscura/export/wireshark_dissector.py +746 -0
  236. oscura/guidance/wizard.py +207 -111
  237. oscura/hardware/__init__.py +19 -0
  238. oscura/{acquisition → hardware/acquisition}/__init__.py +4 -4
  239. oscura/{acquisition → hardware/acquisition}/file.py +2 -2
  240. oscura/{acquisition → hardware/acquisition}/hardware.py +7 -7
  241. oscura/{acquisition → hardware/acquisition}/saleae.py +15 -12
  242. oscura/{acquisition → hardware/acquisition}/socketcan.py +1 -1
  243. oscura/{acquisition → hardware/acquisition}/streaming.py +2 -2
  244. oscura/{acquisition → hardware/acquisition}/synthetic.py +3 -3
  245. oscura/{acquisition → hardware/acquisition}/visa.py +33 -11
  246. oscura/hardware/firmware/__init__.py +29 -0
  247. oscura/hardware/firmware/pattern_recognition.py +874 -0
  248. oscura/hardware/hal_detector.py +736 -0
  249. oscura/hardware/security/__init__.py +37 -0
  250. oscura/hardware/security/side_channel_detector.py +1126 -0
  251. oscura/inference/__init__.py +4 -0
  252. oscura/inference/active_learning/observation_table.py +4 -1
  253. oscura/inference/alignment.py +216 -123
  254. oscura/inference/bayesian.py +113 -33
  255. oscura/inference/crc_reverse.py +101 -55
  256. oscura/inference/logic.py +6 -2
  257. oscura/inference/message_format.py +342 -183
  258. oscura/inference/protocol.py +95 -44
  259. oscura/inference/protocol_dsl.py +180 -82
  260. oscura/inference/signal_intelligence.py +1439 -706
  261. oscura/inference/spectral.py +99 -57
  262. oscura/inference/state_machine.py +810 -158
  263. oscura/inference/stream.py +270 -110
  264. oscura/iot/__init__.py +34 -0
  265. oscura/iot/coap/__init__.py +32 -0
  266. oscura/iot/coap/analyzer.py +668 -0
  267. oscura/iot/coap/options.py +212 -0
  268. oscura/iot/lorawan/__init__.py +21 -0
  269. oscura/iot/lorawan/crypto.py +206 -0
  270. oscura/iot/lorawan/decoder.py +801 -0
  271. oscura/iot/lorawan/mac_commands.py +341 -0
  272. oscura/iot/mqtt/__init__.py +27 -0
  273. oscura/iot/mqtt/analyzer.py +999 -0
  274. oscura/iot/mqtt/properties.py +315 -0
  275. oscura/iot/zigbee/__init__.py +31 -0
  276. oscura/iot/zigbee/analyzer.py +615 -0
  277. oscura/iot/zigbee/security.py +153 -0
  278. oscura/iot/zigbee/zcl.py +349 -0
  279. oscura/jupyter/display.py +125 -45
  280. oscura/{exploratory → jupyter/exploratory}/__init__.py +8 -8
  281. oscura/{exploratory → jupyter/exploratory}/error_recovery.py +298 -141
  282. oscura/jupyter/exploratory/fuzzy.py +746 -0
  283. oscura/{exploratory → jupyter/exploratory}/fuzzy_advanced.py +258 -100
  284. oscura/{exploratory → jupyter/exploratory}/legacy.py +464 -242
  285. oscura/{exploratory → jupyter/exploratory}/parse.py +167 -145
  286. oscura/{exploratory → jupyter/exploratory}/recovery.py +119 -87
  287. oscura/jupyter/exploratory/sync.py +612 -0
  288. oscura/{exploratory → jupyter/exploratory}/unknown.py +299 -176
  289. oscura/jupyter/magic.py +4 -4
  290. oscura/{ui → jupyter/ui}/__init__.py +2 -2
  291. oscura/{ui → jupyter/ui}/formatters.py +3 -3
  292. oscura/{ui → jupyter/ui}/progressive_display.py +153 -82
  293. oscura/loaders/__init__.py +183 -67
  294. oscura/loaders/binary.py +88 -1
  295. oscura/loaders/chipwhisperer.py +153 -137
  296. oscura/loaders/configurable.py +208 -86
  297. oscura/loaders/csv_loader.py +458 -215
  298. oscura/loaders/hdf5_loader.py +278 -119
  299. oscura/loaders/lazy.py +87 -54
  300. oscura/loaders/mmap_loader.py +1 -1
  301. oscura/loaders/numpy_loader.py +253 -116
  302. oscura/loaders/pcap.py +226 -151
  303. oscura/loaders/rigol.py +110 -49
  304. oscura/loaders/sigrok.py +201 -78
  305. oscura/loaders/tdms.py +81 -58
  306. oscura/loaders/tektronix.py +291 -174
  307. oscura/loaders/touchstone.py +182 -87
  308. oscura/loaders/tss.py +456 -0
  309. oscura/loaders/vcd.py +215 -117
  310. oscura/loaders/wav.py +155 -68
  311. oscura/reporting/__init__.py +9 -0
  312. oscura/reporting/analyze.py +352 -146
  313. oscura/reporting/argument_preparer.py +69 -14
  314. oscura/reporting/auto_report.py +97 -61
  315. oscura/reporting/batch.py +131 -58
  316. oscura/reporting/chart_selection.py +57 -45
  317. oscura/reporting/comparison.py +63 -17
  318. oscura/reporting/content/executive.py +76 -24
  319. oscura/reporting/core_formats/multi_format.py +11 -8
  320. oscura/reporting/engine.py +312 -158
  321. oscura/reporting/enhanced_reports.py +949 -0
  322. oscura/reporting/export.py +86 -43
  323. oscura/reporting/formatting/numbers.py +69 -42
  324. oscura/reporting/html.py +139 -58
  325. oscura/reporting/index.py +137 -65
  326. oscura/reporting/output.py +158 -67
  327. oscura/reporting/pdf.py +67 -102
  328. oscura/reporting/plots.py +191 -112
  329. oscura/reporting/sections.py +88 -47
  330. oscura/reporting/standards.py +104 -61
  331. oscura/reporting/summary_generator.py +75 -55
  332. oscura/reporting/tables.py +138 -54
  333. oscura/reporting/templates/enhanced/protocol_re.html +525 -0
  334. oscura/sessions/__init__.py +14 -23
  335. oscura/sessions/base.py +3 -3
  336. oscura/sessions/blackbox.py +106 -10
  337. oscura/sessions/generic.py +2 -2
  338. oscura/sessions/legacy.py +783 -0
  339. oscura/side_channel/__init__.py +63 -0
  340. oscura/side_channel/dpa.py +1025 -0
  341. oscura/utils/__init__.py +15 -1
  342. oscura/utils/bitwise.py +118 -0
  343. oscura/{builders → utils/builders}/__init__.py +1 -1
  344. oscura/{comparison → utils/comparison}/__init__.py +6 -6
  345. oscura/{comparison → utils/comparison}/compare.py +202 -101
  346. oscura/{comparison → utils/comparison}/golden.py +83 -63
  347. oscura/{comparison → utils/comparison}/limits.py +313 -89
  348. oscura/{comparison → utils/comparison}/mask.py +151 -45
  349. oscura/{comparison → utils/comparison}/trace_diff.py +1 -1
  350. oscura/{comparison → utils/comparison}/visualization.py +147 -89
  351. oscura/{component → utils/component}/__init__.py +3 -3
  352. oscura/{component → utils/component}/impedance.py +122 -58
  353. oscura/{component → utils/component}/reactive.py +165 -168
  354. oscura/{component → utils/component}/transmission_line.py +3 -3
  355. oscura/{filtering → utils/filtering}/__init__.py +6 -6
  356. oscura/{filtering → utils/filtering}/base.py +1 -1
  357. oscura/{filtering → utils/filtering}/convenience.py +2 -2
  358. oscura/{filtering → utils/filtering}/design.py +169 -93
  359. oscura/{filtering → utils/filtering}/filters.py +2 -2
  360. oscura/{filtering → utils/filtering}/introspection.py +2 -2
  361. oscura/utils/geometry.py +31 -0
  362. oscura/utils/imports.py +184 -0
  363. oscura/utils/lazy.py +1 -1
  364. oscura/{math → utils/math}/__init__.py +2 -2
  365. oscura/{math → utils/math}/arithmetic.py +114 -48
  366. oscura/{math → utils/math}/interpolation.py +139 -106
  367. oscura/utils/memory.py +129 -66
  368. oscura/utils/memory_advanced.py +92 -9
  369. oscura/utils/memory_extensions.py +10 -8
  370. oscura/{optimization → utils/optimization}/__init__.py +1 -1
  371. oscura/{optimization → utils/optimization}/search.py +2 -2
  372. oscura/utils/performance/__init__.py +58 -0
  373. oscura/utils/performance/caching.py +889 -0
  374. oscura/utils/performance/lsh_clustering.py +333 -0
  375. oscura/utils/performance/memory_optimizer.py +699 -0
  376. oscura/utils/performance/optimizations.py +675 -0
  377. oscura/utils/performance/parallel.py +654 -0
  378. oscura/utils/performance/profiling.py +661 -0
  379. oscura/{pipeline → utils/pipeline}/base.py +1 -1
  380. oscura/{pipeline → utils/pipeline}/composition.py +1 -1
  381. oscura/{pipeline → utils/pipeline}/parallel.py +3 -2
  382. oscura/{pipeline → utils/pipeline}/pipeline.py +1 -1
  383. oscura/{pipeline → utils/pipeline}/reverse_engineering.py +412 -221
  384. oscura/{search → utils/search}/__init__.py +3 -3
  385. oscura/{search → utils/search}/anomaly.py +188 -58
  386. oscura/utils/search/context.py +294 -0
  387. oscura/{search → utils/search}/pattern.py +138 -10
  388. oscura/utils/serial.py +51 -0
  389. oscura/utils/storage/__init__.py +61 -0
  390. oscura/utils/storage/database.py +1166 -0
  391. oscura/{streaming → utils/streaming}/chunked.py +302 -143
  392. oscura/{streaming → utils/streaming}/progressive.py +1 -1
  393. oscura/{streaming → utils/streaming}/realtime.py +3 -2
  394. oscura/{triggering → utils/triggering}/__init__.py +6 -6
  395. oscura/{triggering → utils/triggering}/base.py +6 -6
  396. oscura/{triggering → utils/triggering}/edge.py +2 -2
  397. oscura/{triggering → utils/triggering}/pattern.py +2 -2
  398. oscura/{triggering → utils/triggering}/pulse.py +115 -74
  399. oscura/{triggering → utils/triggering}/window.py +2 -2
  400. oscura/utils/validation.py +32 -0
  401. oscura/validation/__init__.py +121 -0
  402. oscura/{compliance → validation/compliance}/__init__.py +5 -5
  403. oscura/{compliance → validation/compliance}/advanced.py +5 -5
  404. oscura/{compliance → validation/compliance}/masks.py +1 -1
  405. oscura/{compliance → validation/compliance}/reporting.py +127 -53
  406. oscura/{compliance → validation/compliance}/testing.py +114 -52
  407. oscura/validation/compliance_tests.py +915 -0
  408. oscura/validation/fuzzer.py +990 -0
  409. oscura/validation/grammar_tests.py +596 -0
  410. oscura/validation/grammar_validator.py +904 -0
  411. oscura/validation/hil_testing.py +977 -0
  412. oscura/{quality → validation/quality}/__init__.py +4 -4
  413. oscura/{quality → validation/quality}/ensemble.py +251 -171
  414. oscura/{quality → validation/quality}/explainer.py +3 -3
  415. oscura/{quality → validation/quality}/scoring.py +1 -1
  416. oscura/{quality → validation/quality}/warnings.py +4 -4
  417. oscura/validation/regression_suite.py +808 -0
  418. oscura/validation/replay.py +788 -0
  419. oscura/{testing → validation/testing}/__init__.py +2 -2
  420. oscura/{testing → validation/testing}/synthetic.py +5 -5
  421. oscura/visualization/__init__.py +9 -0
  422. oscura/visualization/accessibility.py +1 -1
  423. oscura/visualization/annotations.py +64 -67
  424. oscura/visualization/colors.py +7 -7
  425. oscura/visualization/digital.py +180 -81
  426. oscura/visualization/eye.py +236 -85
  427. oscura/visualization/interactive.py +320 -143
  428. oscura/visualization/jitter.py +587 -247
  429. oscura/visualization/layout.py +169 -134
  430. oscura/visualization/optimization.py +103 -52
  431. oscura/visualization/palettes.py +1 -1
  432. oscura/visualization/power.py +427 -211
  433. oscura/visualization/power_extended.py +626 -297
  434. oscura/visualization/presets.py +2 -0
  435. oscura/visualization/protocols.py +495 -181
  436. oscura/visualization/render.py +79 -63
  437. oscura/visualization/reverse_engineering.py +171 -124
  438. oscura/visualization/signal_integrity.py +460 -279
  439. oscura/visualization/specialized.py +190 -100
  440. oscura/visualization/spectral.py +670 -255
  441. oscura/visualization/thumbnails.py +166 -137
  442. oscura/visualization/waveform.py +150 -63
  443. oscura/workflows/__init__.py +3 -0
  444. oscura/{batch → workflows/batch}/__init__.py +5 -5
  445. oscura/{batch → workflows/batch}/advanced.py +150 -75
  446. oscura/workflows/batch/aggregate.py +531 -0
  447. oscura/workflows/batch/analyze.py +236 -0
  448. oscura/{batch → workflows/batch}/logging.py +2 -2
  449. oscura/{batch → workflows/batch}/metrics.py +1 -1
  450. oscura/workflows/complete_re.py +1144 -0
  451. oscura/workflows/compliance.py +44 -54
  452. oscura/workflows/digital.py +197 -51
  453. oscura/workflows/legacy/__init__.py +12 -0
  454. oscura/{workflow → workflows/legacy}/dag.py +4 -1
  455. oscura/workflows/multi_trace.py +9 -9
  456. oscura/workflows/power.py +42 -62
  457. oscura/workflows/protocol.py +82 -49
  458. oscura/workflows/reverse_engineering.py +351 -150
  459. oscura/workflows/signal_integrity.py +157 -82
  460. oscura-0.7.0.dist-info/METADATA +661 -0
  461. oscura-0.7.0.dist-info/RECORD +591 -0
  462. oscura/batch/aggregate.py +0 -300
  463. oscura/batch/analyze.py +0 -139
  464. oscura/dsl/__init__.py +0 -73
  465. oscura/exceptions.py +0 -59
  466. oscura/exploratory/fuzzy.py +0 -513
  467. oscura/exploratory/sync.py +0 -384
  468. oscura/exporters/__init__.py +0 -94
  469. oscura/exporters/csv.py +0 -303
  470. oscura/exporters/exporters.py +0 -44
  471. oscura/exporters/hdf5.py +0 -217
  472. oscura/exporters/html_export.py +0 -701
  473. oscura/exporters/json_export.py +0 -291
  474. oscura/exporters/markdown_export.py +0 -367
  475. oscura/exporters/matlab_export.py +0 -354
  476. oscura/exporters/npz_export.py +0 -219
  477. oscura/exporters/spice_export.py +0 -210
  478. oscura/search/context.py +0 -149
  479. oscura/session/__init__.py +0 -34
  480. oscura/session/annotations.py +0 -289
  481. oscura/session/history.py +0 -313
  482. oscura/session/session.py +0 -520
  483. oscura/workflow/__init__.py +0 -13
  484. oscura-0.5.1.dist-info/METADATA +0 -583
  485. oscura-0.5.1.dist-info/RECORD +0 -481
  486. /oscura/core/{config.py → config/legacy.py} +0 -0
  487. /oscura/{extensibility → core/extensibility}/__init__.py +0 -0
  488. /oscura/{extensibility → core/extensibility}/registry.py +0 -0
  489. /oscura/{plugins → core/plugins}/isolation.py +0 -0
  490. /oscura/{schemas → core/schemas}/bus_configuration.json +0 -0
  491. /oscura/{builders → utils/builders}/signal_builder.py +0 -0
  492. /oscura/{optimization → utils/optimization}/parallel.py +0 -0
  493. /oscura/{pipeline → utils/pipeline}/__init__.py +0 -0
  494. /oscura/{streaming → utils/streaming}/__init__.py +0 -0
  495. {oscura-0.5.1.dist-info → oscura-0.7.0.dist-info}/WHEEL +0 -0
  496. {oscura-0.5.1.dist-info → oscura-0.7.0.dist-info}/entry_points.txt +0 -0
  497. {oscura-0.5.1.dist-info → oscura-0.7.0.dist-info}/licenses/LICENSE +0 -0
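
The dominant change in this release is a package-layout consolidation: former top-level subpackages move under api, core, utils, validation, jupyter, hardware, and workflows, as the {old → new} rename entries above show. A minimal migration sketch for import sites, assuming the moved modules keep their public names as the rename entries suggest (hypothetical; not verified against the 0.7.0 wheel, and whether the 0.5.1 paths survive via compatibility shims is unknown):

    # Hypothetical import-path migration implied by the rename entries above.
    from oscura.api.dsl import parser            # was: oscura.dsl.parser
    from oscura.core.config import loader        # was: oscura.config.loader
    from oscura.core.plugins import manager      # was: oscura.plugins.manager
    from oscura.utils.comparison import compare  # was: oscura.comparison.compare

The hunk below is from oscura/analyzers/packet/payload.py (+50 −2110 in the list above), which shrinks from a ~2,100-line monolith to a thin re-export facade.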
@@ -8,2137 +8,80 @@
8
8
 
9
9
  This module provides comprehensive payload extraction from PCAP packets,
10
10
  pattern search capabilities, delimiter detection, and comparison tools.
11
+
12
+ This is the public API module that re-exports functionality from specialized modules:
13
+ - payload_analysis: Field inference, diff, clustering
14
+ - payload_patterns: Pattern search, delimiters, boundaries
15
+ - payload_extraction: Payload extraction utilities
11
16
  """
12
17
 
13
18
  from __future__ import annotations
14
19
 
15
- import logging
16
- import re
17
- import struct
18
- from collections import Counter
19
- from collections.abc import Iterator, Sequence
20
- from dataclasses import dataclass, field
21
- from typing import Any, Literal, cast
22
-
23
- import numpy as np
24
-
25
- logger = logging.getLogger(__name__)
26
-
27
-
28
- @dataclass
29
- class PayloadInfo:
30
- """Extracted payload with metadata.
31
-
32
- Implements RE-PAY-001: Payload with preserved metadata.
33
-
34
- Attributes:
35
- data: Payload bytes.
36
- packet_index: Index of source packet.
37
- timestamp: Packet timestamp (optional).
38
- src_ip: Source IP address (optional).
39
- dst_ip: Destination IP address (optional).
40
- src_port: Source port (optional).
41
- dst_port: Destination port (optional).
42
- protocol: Protocol name (optional).
43
- is_fragment: Whether packet is a fragment.
44
- fragment_offset: Fragment offset if fragmented.
45
- """
46
-
47
- data: bytes
48
- packet_index: int
49
- timestamp: float | None = None
50
- src_ip: str | None = None
51
- dst_ip: str | None = None
52
- src_port: int | None = None
53
- dst_port: int | None = None
54
- protocol: str | None = None
55
- is_fragment: bool = False
56
- fragment_offset: int = 0
57
-
58
-
59
- @dataclass
60
- class PatternMatch:
61
- """Pattern match result.
62
-
63
- Implements RE-PAY-002: Pattern match with location info.
64
-
65
- Attributes:
66
- pattern_name: Name of matched pattern.
67
- offset: Byte offset within payload.
68
- matched: Matched bytes.
69
- packet_index: Source packet index.
70
- context: Surrounding bytes for context.
71
- """
72
-
73
- pattern_name: str
74
- offset: int
75
- matched: bytes
76
- packet_index: int
77
- context: bytes = b""
78
-
79
-
80
- @dataclass
81
- class DelimiterResult:
82
- """Detected delimiter information.
83
-
84
- Implements RE-PAY-003: Delimiter detection result.
85
-
86
- Attributes:
87
- delimiter: Detected delimiter bytes.
88
- delimiter_type: Type of delimiter (fixed, length_prefix, pattern).
89
- confidence: Detection confidence (0-1).
90
- occurrences: Number of occurrences found.
91
- positions: List of positions where delimiter found.
92
- """
93
-
94
- delimiter: bytes
95
- delimiter_type: Literal["fixed", "length_prefix", "pattern"]
96
- confidence: float
97
- occurrences: int
98
- positions: list[int] = field(default_factory=list)
99
-
100
-
101
- @dataclass
102
- class LengthPrefixResult:
103
- """Length prefix detection result.
104
-
105
- Implements RE-PAY-003: Length prefix format detection.
106
-
107
- Attributes:
108
- detected: Whether length prefix was detected.
109
- length_bytes: Number of bytes for length field.
110
- endian: Endianness (big or little).
111
- offset: Offset of length field from message start.
112
- includes_length: Whether length includes the length field itself.
113
- confidence: Detection confidence (0-1).
114
- """
115
-
116
- detected: bool
117
- length_bytes: int = 0
118
- endian: Literal["big", "little"] = "big"
119
- offset: int = 0
120
- includes_length: bool = False
121
- confidence: float = 0.0
122
-
123
-
124
- @dataclass
125
- class MessageBoundary:
126
- """Message boundary information.
127
-
128
- Implements RE-PAY-003: Message boundary detection.
129
-
130
- Attributes:
131
- start: Start offset of message.
132
- end: End offset of message.
133
- length: Message length.
134
- data: Message data.
135
- index: Message index.
136
- """
137
-
138
- start: int
139
- end: int
140
- length: int
141
- data: bytes
142
- index: int
143
-
144
-
145
- @dataclass
146
- class PayloadDiff:
147
- """Difference between two payloads.
148
-
149
- Implements RE-PAY-005: Payload comparison result.
150
-
151
- Attributes:
152
- common_prefix_length: Length of common prefix.
153
- common_suffix_length: Length of common suffix.
154
- differences: List of (offset, byte_a, byte_b) for differences.
155
- similarity: Similarity score (0-1).
156
- edit_distance: Levenshtein edit distance.
157
- """
158
-
159
- common_prefix_length: int
160
- common_suffix_length: int
161
- differences: list[tuple[int, int, int]]
162
- similarity: float
163
- edit_distance: int
164
-
165
-
166
- @dataclass
167
- class VariablePositions:
168
- """Analysis of which byte positions vary across payloads.
169
-
170
- Implements RE-PAY-005: Variable position analysis.
171
-
172
- Attributes:
173
- constant_positions: Positions that are constant.
174
- variable_positions: Positions that vary.
175
- constant_values: Values at constant positions.
176
- variance_by_position: Variance at each position.
177
- """
178
-
179
- constant_positions: list[int]
180
- variable_positions: list[int]
181
- constant_values: dict[int, int]
182
- variance_by_position: np.ndarray[tuple[int], np.dtype[np.float64]]
183
-
184
-
185
- @dataclass
186
- class PayloadCluster:
187
- """Cluster of similar payloads.
188
-
189
- Implements RE-PAY-005: Payload clustering result.
190
-
191
- Attributes:
192
- cluster_id: Cluster identifier.
193
- payloads: List of payload data in cluster.
194
- indices: Original indices of payloads.
195
- representative: Representative payload (centroid).
196
- size: Number of payloads in cluster.
197
- """
198
-
199
- cluster_id: int
200
- payloads: list[bytes]
201
- indices: list[int]
202
- representative: bytes
203
- size: int
204
-
205
-
206
- # =============================================================================
207
- # RE-PAY-004: Payload Field Inference
208
- # =============================================================================
209
-
210
-
211
- @dataclass
212
- class InferredField:
213
- """Inferred field from binary payload.
214
-
215
- Implements RE-PAY-004: Inferred field structure.
216
-
217
- Attributes:
218
- name: Field name (auto-generated).
219
- offset: Byte offset within message.
220
- size: Field size in bytes.
221
- inferred_type: Inferred data type.
222
- endianness: Detected endianness.
223
- is_constant: Whether field is constant across messages.
224
- is_sequence: Whether field appears to be a counter/sequence.
225
- is_checksum: Whether field appears to be a checksum.
226
- constant_value: Value if constant.
227
- confidence: Inference confidence (0-1).
228
- sample_values: Sample values from messages.
229
- """
230
-
231
- name: str
232
- offset: int
233
- size: int
234
- inferred_type: Literal[
235
- "uint8",
236
- "uint16",
237
- "uint32",
238
- "uint64",
239
- "int8",
240
- "int16",
241
- "int32",
242
- "int64",
243
- "float32",
244
- "float64",
245
- "bytes",
246
- "string",
247
- "unknown",
248
- ]
249
- endianness: Literal["big", "little", "n/a"] = "n/a"
250
- is_constant: bool = False
251
- is_sequence: bool = False
252
- is_checksum: bool = False
253
- constant_value: bytes | None = None
254
- confidence: float = 0.5
255
- sample_values: list[Any] = field(default_factory=list)
256
-
257
-
258
- @dataclass
259
- class MessageSchema:
260
- """Inferred message schema.
261
-
262
- Implements RE-PAY-004: Complete message schema.
263
-
264
- Attributes:
265
- fields: List of inferred fields.
266
- message_length: Total message length.
267
- fixed_length: Whether all messages have same length.
268
- length_range: (min, max) length range.
269
- sample_count: Number of samples analyzed.
270
- confidence: Overall schema confidence.
271
- """
272
-
273
- fields: list[InferredField]
274
- message_length: int
275
- fixed_length: bool
276
- length_range: tuple[int, int]
277
- sample_count: int
278
- confidence: float
279
-
280
-
281
- class FieldInferrer:
282
- """Infer field structure within binary payloads.
283
-
284
- Implements RE-PAY-004: Payload Field Inference.
285
-
286
- Uses statistical analysis, alignment detection, and type inference
287
- to reconstruct message formats from binary payload samples.
288
-
289
- Example:
290
- >>> inferrer = FieldInferrer()
291
- >>> messages = [pkt.data for pkt in udp_packets]
292
- >>> schema = inferrer.infer_fields(messages)
293
- >>> for field in schema.fields:
294
- ... print(f"{field.name}: {field.inferred_type} at offset {field.offset}")
295
- """
296
-
297
- def __init__(
298
- self,
299
- min_samples: int = 10,
300
- entropy_threshold: float = 0.5,
301
- sequence_threshold: int = 3,
302
- ) -> None:
303
- """Initialize field inferrer.
304
-
305
- Args:
306
- min_samples: Minimum samples for reliable inference.
307
- entropy_threshold: Entropy change threshold for boundary detection.
308
- sequence_threshold: Minimum consecutive incrementing values for sequence.
309
- """
310
- self.min_samples = min_samples
311
- self.entropy_threshold = entropy_threshold
312
- self.sequence_threshold = sequence_threshold
313
-
314
- def infer_fields(
315
- self,
316
- messages: Sequence[bytes],
317
- min_samples: int | None = None,
318
- ) -> MessageSchema:
319
- """Infer field structure from message samples.
320
-
321
- Implements RE-PAY-004: Complete field inference.
322
-
323
- Args:
324
- messages: List of binary message samples.
325
- min_samples: Override minimum sample count.
326
-
327
- Returns:
328
- MessageSchema with inferred field structure.
329
-
330
- Example:
331
- >>> schema = inferrer.infer_fields(messages)
332
- >>> print(f"Detected {len(schema.fields)} fields")
333
- """
334
- if not messages:
335
- return MessageSchema(
336
- fields=[],
337
- message_length=0,
338
- fixed_length=True,
339
- length_range=(0, 0),
340
- sample_count=0,
341
- confidence=0.0,
342
- )
343
-
344
- min_samples = min_samples or self.min_samples
345
- lengths = [len(m) for m in messages]
346
- min_len = min(lengths)
347
- max_len = max(lengths)
348
- fixed_length = min_len == max_len
349
-
350
- # Use shortest message length for analysis
351
- analysis_length = min_len
352
-
353
- # Find field boundaries using entropy transitions
354
- boundaries = self._detect_field_boundaries(messages, analysis_length)
355
-
356
- # Infer field types for each segment
357
- fields = []
358
- for i, (start, end) in enumerate(boundaries):
359
- field = self._infer_field(messages, start, end, i)
360
- fields.append(field)
361
-
362
- # Calculate overall confidence
363
- if fields:
364
- confidence = sum(f.confidence for f in fields) / len(fields)
365
- else:
366
- confidence = 0.0
367
-
368
- return MessageSchema(
369
- fields=fields,
370
- message_length=analysis_length,
371
- fixed_length=fixed_length,
372
- length_range=(min_len, max_len),
373
- sample_count=len(messages),
374
- confidence=confidence,
375
- )
376
-
377
- def detect_field_types(
378
- self,
379
- messages: Sequence[bytes],
380
- boundaries: list[tuple[int, int]],
381
- ) -> list[InferredField]:
382
- """Detect field types for given boundaries.
383
-
384
- Implements RE-PAY-004: Field type detection.
385
-
386
- Args:
387
- messages: Message samples.
388
- boundaries: List of (start, end) field boundaries.
389
-
390
- Returns:
391
- List of InferredField with type information.
392
- """
393
- fields = []
394
- for i, (start, end) in enumerate(boundaries):
395
- field = self._infer_field(messages, start, end, i)
396
- fields.append(field)
397
- return fields
398
-
399
- def find_sequence_fields(
400
- self,
401
- messages: Sequence[bytes],
402
- ) -> list[tuple[int, int]]:
403
- """Find fields that appear to be sequence/counter values.
404
-
405
- Implements RE-PAY-004: Sequence field detection.
406
-
407
- Args:
408
- messages: Message samples (should be in order).
409
-
410
- Returns:
411
- List of (offset, size) for sequence fields.
412
-
413
- Raises:
414
- ValueError: If messages are too short for field extraction.
415
- """
416
- if len(messages) < self.sequence_threshold:
417
- return []
418
-
419
- min_len = min(len(m) for m in messages)
420
- sequence_fields = []
421
-
422
- # Check each possible field size at each offset
423
- for size in [1, 2, 4]:
424
- for offset in range(min_len - size + 1):
425
- values = []
426
- try:
427
- for msg in messages:
428
- # Validate message length before slicing
429
- if len(msg) < offset + size:
430
- raise ValueError(
431
- f"Message too short: expected at least {offset + size} bytes, "
432
- f"got {len(msg)} bytes"
433
- )
434
- # Try both endianness
435
- val_be = int.from_bytes(msg[offset : offset + size], "big")
436
- values.append(val_be)
437
-
438
- if self._is_sequence(values):
439
- sequence_fields.append((offset, size))
440
- except (ValueError, IndexError) as e:
441
- # Skip this offset/size combination if extraction fails
442
- logger.debug(f"Skipping field at offset={offset}, size={size}: {e}")
443
- continue
444
-
445
- return sequence_fields
446
-
447
- def find_checksum_fields(
448
- self,
449
- messages: Sequence[bytes],
450
- ) -> list[tuple[int, int, str]]:
451
- """Find fields that appear to be checksums.
452
-
453
- Implements RE-PAY-004: Checksum field detection.
454
-
455
- Args:
456
- messages: Message samples.
457
-
458
- Returns:
459
- List of (offset, size, algorithm_hint) for checksum fields.
460
-
461
- Raises:
462
- ValueError: If checksum field validation fails.
463
- """
464
- if len(messages) < 5:
465
- return []
466
-
467
- min_len = min(len(m) for m in messages)
468
- checksum_fields = []
469
-
470
- # Common checksum sizes and positions
471
- for size in [1, 2, 4]:
472
- # Check last position (most common)
473
- for offset in [min_len - size, 0]:
474
- if offset < 0:
475
- continue
476
-
477
- try:
478
- # Validate offset and size before processing
479
- if offset + size > min_len:
480
- raise ValueError(
481
- f"Invalid checksum field: offset={offset} + size={size} exceeds "
482
- f"minimum message length={min_len}"
483
- )
484
-
485
- # Extract field values and message content
486
- score = self._check_checksum_correlation(messages, offset, size)
487
-
488
- if score > 0.8:
489
- algorithm = self._guess_checksum_algorithm(messages, offset, size)
490
- checksum_fields.append((offset, size, algorithm))
491
- except (ValueError, IndexError) as e:
492
- # Skip this offset/size combination if validation fails
493
- logger.debug(f"Skipping checksum field at offset={offset}, size={size}: {e}")
494
- continue
495
-
496
- return checksum_fields
497
-
498
- def _detect_field_boundaries(
499
- self,
500
- messages: Sequence[bytes],
501
- max_length: int,
502
- ) -> list[tuple[int, int]]:
503
- """Detect field boundaries using entropy analysis.
504
-
505
- Args:
506
- messages: Message samples.
507
- max_length: Maximum length to analyze.
508
-
509
- Returns:
510
- List of (start, end) boundaries.
511
- """
512
- if max_length == 0:
513
- return []
514
-
515
- # Calculate per-byte entropy
516
- byte_entropies = []
517
- for pos in range(max_length):
518
- values = [m[pos] for m in messages if len(m) > pos]
519
- if len(values) < 2:
520
- byte_entropies.append(0.0)
521
- continue
522
-
523
- counts = Counter(values)
524
- total = len(values)
525
- entropy = 0.0
526
- for count in counts.values():
527
- if count > 0:
528
- p = count / total
529
- entropy -= p * np.log2(p)
530
- byte_entropies.append(entropy)
531
-
532
- # Find boundaries at entropy transitions
533
- boundaries = []
534
- current_start = 0
535
-
536
- for i in range(1, len(byte_entropies)):
537
- delta = abs(byte_entropies[i] - byte_entropies[i - 1])
538
-
539
- # Also check for constant vs variable patterns
540
- if delta > self.entropy_threshold:
541
- if i > current_start:
542
- boundaries.append((current_start, i))
543
- current_start = i
544
-
545
- # Add final segment
546
- if max_length > current_start:
547
- boundaries.append((current_start, max_length))
548
-
549
- # Merge very small segments
550
- merged: list[tuple[int, int]] = []
551
- for start, end in boundaries:
552
- if merged and start - merged[-1][1] == 0 and end - start < 2:
553
- # Merge with previous
554
- merged[-1] = (merged[-1][0], end)
555
- else:
556
- merged.append((start, end))
557
-
558
- return merged if merged else [(0, max_length)]
559
-
560
- def _infer_field(
561
- self,
562
- messages: Sequence[bytes],
563
- start: int,
564
- end: int,
565
- index: int,
566
- ) -> InferredField:
567
- """Infer type for a single field.
568
-
569
- Args:
570
- messages: Message samples.
571
- start: Field start offset.
572
- end: Field end offset.
573
- index: Field index for naming.
574
-
575
- Returns:
576
- InferredField with inferred type.
577
- """
578
- size = end - start
579
- name = f"field_{index}"
580
-
581
- # Extract field values
582
- values = []
583
- raw_values = []
584
- for msg in messages:
585
- if len(msg) >= end:
586
- field_bytes = msg[start:end]
587
- raw_values.append(field_bytes)
588
- values.append(field_bytes)
589
-
590
- if not values:
591
- return InferredField(
592
- name=name,
593
- offset=start,
594
- size=size,
595
- inferred_type="unknown",
596
- confidence=0.0,
597
- )
598
-
599
- # Check if constant
600
- unique_values = set(raw_values)
601
- is_constant = len(unique_values) == 1
602
-
603
- # Check if sequence
604
- is_sequence = False
605
- if not is_constant and size in [1, 2, 4, 8]:
606
- int_values = [int.from_bytes(v, "big") for v in raw_values]
607
- is_sequence = self._is_sequence(int_values)
608
-
609
- # Check for checksum patterns
610
- is_checksum = False
611
- if start >= min(len(m) for m in messages) - 4:
612
- score = self._check_checksum_correlation(messages, start, size)
613
- is_checksum = score > 0.7
614
-
615
- # Infer type
616
- inferred_type, endianness, confidence = self._infer_type(raw_values, size)
617
-
618
- # Sample values for debugging
619
- sample_values: list[int | str] = []
620
- for v in raw_values[:5]:
621
- if inferred_type.startswith("uint") or inferred_type.startswith("int"):
622
- try:
623
- # Cast endianness to Literal type for type checker
624
- byte_order: Literal["big", "little"] = (
625
- "big" if endianness == "n/a" else endianness # type: ignore[assignment]
626
- )
627
- sample_values.append(int.from_bytes(v, byte_order))
628
- except Exception:
629
- sample_values.append(v.hex())
630
- elif inferred_type == "string":
631
- try:
632
- sample_values.append(v.decode("utf-8", errors="replace"))
633
- except Exception:
634
- sample_values.append(v.hex())
635
- else:
636
- sample_values.append(v.hex())
637
-
638
- # Cast to Literal types for type checker
639
- inferred_type_literal: Literal[
640
- "uint8",
641
- "uint16",
642
- "uint32",
643
- "uint64",
644
- "int8",
645
- "int16",
646
- "int32",
647
- "int64",
648
- "float32",
649
- "float64",
650
- "bytes",
651
- "string",
652
- "unknown",
653
- ] = inferred_type # type: ignore[assignment]
654
- endianness_literal: Literal["big", "little", "n/a"] = endianness # type: ignore[assignment]
655
-
656
- return InferredField(
657
- name=name,
658
- offset=start,
659
- size=size,
660
- inferred_type=inferred_type_literal,
661
- endianness=endianness_literal,
662
- is_constant=is_constant,
663
- is_sequence=is_sequence,
664
- is_checksum=is_checksum,
665
- constant_value=raw_values[0] if is_constant else None,
666
- confidence=confidence,
667
- sample_values=sample_values,
668
- )
669
-
670
- def _infer_type(
671
- self,
672
- values: list[bytes],
673
- size: int,
674
- ) -> tuple[str, str, float]:
675
- """Infer data type from values.
676
-
677
- Args:
678
- values: Field values.
679
- size: Field size.
680
-
681
- Returns:
682
- Tuple of (type, endianness, confidence).
683
- """
684
- if not values:
685
- return "unknown", "n/a", 0.0
686
-
687
- # Check for string (high printable ratio)
688
- printable_ratio = sum(
689
- 1 for v in values for b in v if 32 <= b <= 126 or b in (9, 10, 13)
690
- ) / (len(values) * size)
691
-
692
- if printable_ratio > 0.8:
693
- return "string", "n/a", printable_ratio
694
-
695
- # Check for standard integer sizes
696
- if size == 1:
697
- return "uint8", "n/a", 0.9
698
-
699
- elif size == 2:
700
- # Try to detect endianness
701
- be_variance = np.var([int.from_bytes(v, "big") for v in values])
702
- le_variance = np.var([int.from_bytes(v, "little") for v in values])
703
-
704
- if be_variance < le_variance:
705
- endian = "big"
706
- else:
707
- endian = "little"
708
-
709
- return "uint16", endian, 0.8
710
-
711
- elif size == 4:
712
- # Check for float
713
- float_valid = 0
714
- for v in values:
715
- try:
716
- f = struct.unpack(">f", v)[0]
717
- if not (np.isnan(f) or np.isinf(f)) and -1e10 < f < 1e10:
718
- float_valid += 1
719
- except Exception:
720
- pass
721
-
722
- if float_valid / len(values) > 0.8:
723
- return "float32", "big", 0.7
724
-
725
- # Otherwise integer
726
- be_variance = np.var([int.from_bytes(v, "big") for v in values])
727
- le_variance = np.var([int.from_bytes(v, "little") for v in values])
728
- endian = "big" if be_variance < le_variance else "little"
729
- return "uint32", endian, 0.8
730
-
731
- elif size == 8:
732
- # Check for float64 or uint64
733
- be_variance = np.var([int.from_bytes(v, "big") for v in values])
734
- le_variance = np.var([int.from_bytes(v, "little") for v in values])
735
- endian = "big" if be_variance < le_variance else "little"
736
- return "uint64", endian, 0.7
737
-
738
- else:
739
- return "bytes", "n/a", 0.6
740
-
741
- def _is_sequence(self, values: list[int]) -> bool:
742
- """Check if values form a sequence.
743
-
744
- Args:
745
- values: Integer values.
746
-
747
- Returns:
748
- True if values are incrementing/decrementing.
749
- """
750
- if len(values) < self.sequence_threshold:
751
- return False
752
-
753
- # Check for incrementing sequence
754
- diffs = [values[i + 1] - values[i] for i in range(len(values) - 1)]
755
-
756
- # Most diffs should be 1 (or consistent)
757
- counter = Counter(diffs)
758
- if not counter:
759
- return False
760
-
761
- most_common_diff, count = counter.most_common(1)[0]
762
- ratio = count / len(diffs)
763
-
764
- return ratio > 0.8 and most_common_diff in [1, -1, 0]
765
-
766
- def _check_checksum_correlation(
767
- self,
768
- messages: Sequence[bytes],
769
- offset: int,
770
- size: int,
771
- ) -> float:
772
- """Check if field correlates with message content like a checksum.
773
-
774
- Args:
775
- messages: Message samples.
776
- offset: Field offset.
777
- size: Field size.
778
-
779
- Returns:
780
- Correlation score (0-1).
781
- """
782
- # Simple heuristic: checksum fields have high correlation with
783
- # changes in other parts of the message
784
-
785
- if len(messages) < 5:
786
- return 0.0
787
-
788
- # Extract checksum values and message content
789
- checksums = []
790
- contents = []
791
-
792
- for msg in messages:
793
- if len(msg) >= offset + size:
794
- checksums.append(int.from_bytes(msg[offset : offset + size], "big"))
795
- # Content before checksum
796
- content = msg[:offset] + msg[offset + size :]
797
- contents.append(sum(content) % 65536)
798
-
799
- if len(checksums) < 5:
800
- return 0.0
801
-
802
- # Check if checksum changes correlate with content changes
803
- unique_contents = len(set(contents))
804
- unique_checksums = len(set(checksums))
805
-
806
- if unique_contents == 1 and unique_checksums == 1:
807
- return 0.3 # Both constant - inconclusive
808
-
809
- # Simple correlation check
810
- if unique_contents > 1 and unique_checksums > 1:
811
- return 0.8
812
-
813
- return 0.3
814
-
815
- def _guess_checksum_algorithm(
816
- self,
817
- messages: Sequence[bytes],
818
- offset: int,
819
- size: int,
820
- ) -> str:
821
- """Guess the checksum algorithm.
822
-
823
- Args:
824
- messages: Message samples.
825
- offset: Checksum offset.
826
- size: Checksum size.
827
-
828
- Returns:
829
- Algorithm name hint.
830
- """
831
- if size == 1:
832
- return "xor8_or_sum8"
833
- elif size == 2:
834
- return "crc16_or_sum16"
835
- elif size == 4:
836
- return "crc32"
837
- return "unknown"
838
-
839
-
840
- # =============================================================================
841
- # RE-PAY-004: Convenience functions
842
- # =============================================================================
843
-
844
-
845
- def infer_fields(messages: Sequence[bytes], min_samples: int = 10) -> MessageSchema:
846
- """Infer field structure from message samples.
847
-
848
- Implements RE-PAY-004: Payload Field Inference.
849
-
850
- Args:
851
- messages: List of binary message samples.
852
- min_samples: Minimum samples for reliable inference.
853
-
854
- Returns:
855
- MessageSchema with inferred field structure.
856
-
857
- Example:
858
- >>> messages = [pkt.data for pkt in packets]
859
- >>> schema = infer_fields(messages)
860
- >>> for field in schema.fields:
861
- ... print(f"{field.name}: {field.inferred_type}")
862
- """
863
- inferrer = FieldInferrer(min_samples=min_samples)
864
- return inferrer.infer_fields(messages)
865
-
866
-
867
- def detect_field_types(
868
- messages: Sequence[bytes],
869
- boundaries: list[tuple[int, int]],
870
- ) -> list[InferredField]:
871
- """Detect field types for given boundaries.
872
-
873
- Implements RE-PAY-004: Field type detection.
874
-
875
- Args:
876
- messages: Message samples.
877
- boundaries: List of (start, end) field boundaries.
878
-
879
- Returns:
880
- List of InferredField with type information.
881
- """
882
- inferrer = FieldInferrer()
883
- return inferrer.detect_field_types(messages, boundaries)
884
-
885
-
886
- def find_sequence_fields(messages: Sequence[bytes]) -> list[tuple[int, int]]:
887
- """Find fields that appear to be sequence/counter values.
888
-
889
- Implements RE-PAY-004: Sequence field detection.
890
-
891
- Args:
892
- messages: Message samples (should be in order).
893
-
894
- Returns:
895
- List of (offset, size) for sequence fields.
896
- """
897
- inferrer = FieldInferrer()
898
- return inferrer.find_sequence_fields(messages)
899
-
900
-
901
- def find_checksum_fields(messages: Sequence[bytes]) -> list[tuple[int, int, str]]:
902
- """Find fields that appear to be checksums.
903
-
904
- Implements RE-PAY-004: Checksum field detection.
905
-
906
- Args:
907
- messages: Message samples.
908
-
909
- Returns:
910
- List of (offset, size, algorithm_hint) for checksum fields.
911
- """
912
- inferrer = FieldInferrer()
913
- return inferrer.find_checksum_fields(messages)
914
-
915
-
916
- class PayloadExtractor:
917
- """Extract payloads from network packets.
918
-
919
- Implements RE-PAY-001: Payload Extraction Framework.
920
-
921
- Provides zero-copy payload extraction from UDP/TCP packets
922
- with metadata preservation and fragment handling.
923
-
924
- Example:
925
-        >>> extractor = PayloadExtractor()
-        >>> payloads = extractor.extract_all_payloads(packets, protocol="UDP")
-        >>> for p in payloads:
-        ...     print(f"{p.src_ip}:{p.src_port} -> {len(p.data)} bytes")
-    """
-
-    def __init__(
-        self,
-        include_headers: bool = False,
-        zero_copy: bool = True,
-        return_type: Literal["bytes", "memoryview", "numpy"] = "bytes",
-    ) -> None:
-        """Initialize payload extractor.
-
-        Args:
-            include_headers: Include protocol headers in payload.
-            zero_copy: Use zero-copy memoryview where possible.
-            return_type: Type for returned payload data.
-        """
-        self.include_headers = include_headers
-        self.zero_copy = zero_copy
-        self.return_type = return_type
-
-    def extract_payload(
-        self,
-        packet: dict[str, Any] | bytes,
-        layer: Literal["ethernet", "ip", "transport", "application"] = "application",
-    ) -> bytes | memoryview | np.ndarray[tuple[int], np.dtype[np.uint8]]:
-        """Extract payload from a single packet.
-
-        Implements RE-PAY-001: Single packet payload extraction.
-
-        Args:
-            packet: Packet data (dict with 'data' key or raw bytes).
-            layer: OSI layer to extract from.
-
-        Returns:
-            Payload data in requested format.
-
-        Example:
-            >>> payload = extractor.extract_payload(packet)
-            >>> print(f"Payload: {len(payload)} bytes")
-        """
-        # Handle different packet formats
-        if isinstance(packet, dict):
-            raw_data = packet.get("data", packet.get("payload", b""))
-            if isinstance(raw_data, list | tuple):
-                raw_data = bytes(raw_data)
-        else:
-            raw_data = packet
-
-        if not raw_data:
-            return self._format_output(b"")
-
-        # For raw bytes, return as-is
-        if layer == "application":
-            return self._format_output(raw_data)
-
-        # Layer-based extraction would require protocol parsing
-        # For now, return full data
-        return self._format_output(raw_data)
-
-    def extract_all_payloads(
-        self,
-        packets: Sequence[dict[str, Any] | bytes],
-        protocol: str | None = None,
-        port_filter: tuple[int | None, int | None] | None = None,
-    ) -> list[PayloadInfo]:
-        """Extract payloads from all packets with metadata.
-
-        Implements RE-PAY-001: Batch payload extraction with metadata.
-
-        Args:
-            packets: Sequence of packets.
-            protocol: Filter by protocol (e.g., "UDP", "TCP").
-            port_filter: (src_port, dst_port) filter tuple.
-
-        Returns:
-            List of PayloadInfo with extracted data and metadata.
-
-        Example:
-            >>> payloads = extractor.extract_all_payloads(packets, protocol="UDP")
-            >>> print(f"Extracted {len(payloads)} payloads")
-        """
-        results = []
-
-        for i, packet in enumerate(packets):
-            if isinstance(packet, dict):
-                # Extract metadata from dict
-                pkt_protocol = packet.get("protocol", "")
-                src_port = packet.get("src_port")
-                dst_port = packet.get("dst_port")
-
-                # Apply filters
-                if protocol and pkt_protocol.upper() != protocol.upper():
-                    continue
-
-                if port_filter:
-                    if port_filter[0] is not None and src_port != port_filter[0]:
-                        continue
-                    if port_filter[1] is not None and dst_port != port_filter[1]:
-                        continue
-
-                payload = self.extract_payload(packet)
-                if isinstance(payload, memoryview | np.ndarray):
-                    payload = bytes(payload)
-
-                info = PayloadInfo(
-                    data=payload,
-                    packet_index=i,
-                    timestamp=packet.get("timestamp"),
-                    src_ip=packet.get("src_ip"),
-                    dst_ip=packet.get("dst_ip"),
-                    src_port=src_port,
-                    dst_port=dst_port,
-                    protocol=pkt_protocol,
-                    is_fragment=packet.get("is_fragment", False),
-                    fragment_offset=packet.get("fragment_offset", 0),
-                )
-                results.append(info)
-            else:
-                # Raw bytes
-                payload = bytes(packet)
-                info = PayloadInfo(data=payload, packet_index=i)
-                results.append(info)
-
-        return results
-
-    def iter_payloads(
-        self,
-        packets: Sequence[dict[str, Any] | bytes],
-    ) -> Iterator[PayloadInfo]:
-        """Iterate over payloads for memory-efficient processing.
-
-        Implements RE-PAY-001: Streaming payload iteration.
-
-        Args:
-            packets: Sequence of packets.
-
-        Yields:
-            PayloadInfo for each packet.
-        """
-        for i, packet in enumerate(packets):
-            payload = self.extract_payload(packet)
-            if isinstance(payload, memoryview | np.ndarray):
-                payload = bytes(payload)
-
-            if isinstance(packet, dict):
-                info = PayloadInfo(
-                    data=payload,
-                    packet_index=i,
-                    timestamp=packet.get("timestamp"),
-                    src_ip=packet.get("src_ip"),
-                    dst_ip=packet.get("dst_ip"),
-                    src_port=packet.get("src_port"),
-                    dst_port=packet.get("dst_port"),
-                    protocol=packet.get("protocol"),
-                )
-            else:
-                info = PayloadInfo(data=payload, packet_index=i)
-
-            yield info
-
-    def _format_output(
-        self, data: bytes
-    ) -> bytes | memoryview | np.ndarray[tuple[int], np.dtype[np.uint8]]:
-        """Format output according to return_type setting."""
-        if self.return_type == "bytes":
-            return data
-        elif self.return_type == "memoryview":
-            return memoryview(data)
-        # self.return_type == "numpy"
-        return np.frombuffer(data, dtype=np.uint8)
-
-
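For orientation, a minimal usage sketch of the extractor removed above (the sample packets are invented; PayloadExtractor and PayloadInfo now live in oscura.analyzers.packet.payload_extraction, per the imports added further down):

    import numpy as np

    from oscura.analyzers.packet.payload_extraction import PayloadExtractor

    # Invented packets: dicts carry metadata plus raw payload bytes;
    # bare bytes objects are treated as the payload itself.
    packets = [
        {"protocol": "UDP", "src_port": 5005, "dst_port": 5005, "data": b"\xaa\x55\x01\x02"},
        b"\xde\xad\xbe\xef",
    ]

    extractor = PayloadExtractor(return_type="numpy")
    arr = extractor.extract_payload(packets[0])  # uint8 ndarray via np.frombuffer
    assert arr.dtype == np.uint8 and arr.shape == (4,)

    # iter_payloads yields PayloadInfo lazily instead of building the full list.
    for info in extractor.iter_payloads(packets):
        print(info.packet_index, len(info.data))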
-def search_pattern(
-    packets: Sequence[dict[str, Any] | bytes],
-    pattern: bytes | str,
-    pattern_type: Literal["exact", "wildcard", "regex"] = "exact",
-    context_bytes: int = 8,
-) -> list[PatternMatch]:
-    """Search for pattern in packet payloads.
-
-    Implements RE-PAY-002: Payload Pattern Search.
-
-    Args:
-        packets: Sequence of packets to search.
-        pattern: Pattern to search for.
-        pattern_type: Type of pattern matching.
-        context_bytes: Number of context bytes around match.
-
-    Returns:
-        List of PatternMatch results.
-
-    Example:
-        >>> matches = search_pattern(packets, b'\\x00\\x01\\x00\\x00')
-        >>> for m in matches:
-        ...     print(f"Found at packet {m.packet_index}, offset {m.offset}")
-    """
-    extractor = PayloadExtractor()
-    results = []
-
-    for i, packet in enumerate(packets):
-        payload = extractor.extract_payload(packet)
-        if isinstance(payload, memoryview | np.ndarray):
-            payload = bytes(payload)
-
-        matches = _find_pattern_in_data(payload, pattern, pattern_type)
-
-        for offset, matched in matches:
-            # Get context
-            start = max(0, offset - context_bytes)
-            end = min(len(payload), offset + len(matched) + context_bytes)
-            context = payload[start:end]
-
-            results.append(
-                PatternMatch(
-                    pattern_name=pattern.hex() if isinstance(pattern, bytes) else str(pattern),
-                    offset=offset,
-                    matched=matched,
-                    packet_index=i,
-                    context=context,
-                )
-            )
-
-    return results
-
-
-def search_patterns(
-    packets: Sequence[dict[str, Any] | bytes],
-    patterns: dict[str, bytes | str],
-    context_bytes: int = 8,
-) -> dict[str, list[PatternMatch]]:
-    """Search for multiple patterns simultaneously.
-
-    Implements RE-PAY-002: Multi-pattern search.
-
-    Args:
-        packets: Sequence of packets to search.
-        patterns: Dictionary mapping names to patterns.
-        context_bytes: Number of context bytes around match.
-
-    Returns:
-        Dictionary mapping pattern names to match lists.
-
-    Example:
-        >>> signatures = {
-        ...     "header_a": b'\\xAA\\x55',
-        ...     "header_b": b'\\xDE\\xAD',
-        ... }
-        >>> results = search_patterns(packets, signatures)
-        >>> for name, matches in results.items():
-        ...     print(f"{name}: {len(matches)} matches")
-    """
-    results: dict[str, list[PatternMatch]] = {name: [] for name in patterns}
-    extractor = PayloadExtractor()
-
-    for i, packet in enumerate(packets):
-        payload = extractor.extract_payload(packet)
-        if isinstance(payload, memoryview | np.ndarray):
-            payload = bytes(payload)
-
-        for name, pattern in patterns.items():
-            # Detect pattern type
-            if isinstance(pattern, bytes):
-                if b"??" in pattern or b"\\x??" in pattern:
-                    pattern_type = "wildcard"
-                else:
-                    pattern_type = "exact"
-            else:
-                pattern_type = "regex"
-
-            matches = _find_pattern_in_data(payload, pattern, pattern_type)
-
-            for offset, matched in matches:
-                start = max(0, offset - context_bytes)
-                end = min(len(payload), offset + len(matched) + context_bytes)
-                context = payload[start:end]
-
-                results[name].append(
-                    PatternMatch(
-                        pattern_name=name,
-                        offset=offset,
-                        matched=matched,
-                        packet_index=i,
-                        context=context,
-                    )
-                )
-
-    return results
-
-
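A short sketch of the auto-detection rule above (the frames are made up): a bytes pattern containing "??" is treated as a wildcard, everything else as an exact match:

    from oscura.analyzers.packet.payload_patterns import search_patterns

    frames = [b"\xaa\x55\x10\x01rest", b"\xaa\x55\x20\x02rest", b"noise"]

    # Each "??" stands for exactly one arbitrary byte once converted to regex.
    results = search_patterns(frames, {"hdr": b"\xaa\x55??"})
    for m in results["hdr"]:
        print(m.packet_index, m.offset, m.matched.hex())  # two hits, offset 0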
-def filter_by_pattern(
-    packets: Sequence[dict[str, Any] | bytes],
-    pattern: bytes | str,
-    pattern_type: Literal["exact", "wildcard", "regex"] = "exact",
-) -> list[dict[str, Any] | bytes]:
-    """Filter packets that contain a pattern.
-
-    Implements RE-PAY-002: Pattern-based filtering.
-
-    Args:
-        packets: Sequence of packets.
-        pattern: Pattern to match.
-        pattern_type: Type of pattern matching.
-
-    Returns:
-        List of packets containing the pattern.
-    """
-    extractor = PayloadExtractor()
-    result = []
-
-    for packet in packets:
-        payload = extractor.extract_payload(packet)
-        if isinstance(payload, memoryview | np.ndarray):
-            payload = bytes(payload)
-
-        matches = _find_pattern_in_data(payload, pattern, pattern_type)
-        if len(matches) > 0:
-            result.append(packet)
-
-    return result
-
-
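filter_by_pattern carries no doctest above, so a small sketch with invented frames:

    from oscura.analyzers.packet.payload_patterns import filter_by_pattern

    frames = [b"\x02hello\x03", b"noise", b"\x02bye\x03"]
    kept = filter_by_pattern(frames, b"\x02")  # keep frames containing STX
    assert kept == [b"\x02hello\x03", b"\x02bye\x03"]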
-def detect_delimiter(
-    payloads: Sequence[bytes] | bytes,
-    candidates: list[bytes] | None = None,
-) -> DelimiterResult:
-    """Automatically detect message delimiter.
-
-    Implements RE-PAY-003: Delimiter detection.
-
-    Args:
-        payloads: Payload data or list of payloads.
-        candidates: Optional list of candidate delimiters to test.
-
-    Returns:
-        DelimiterResult with detected delimiter info.
-
-    Example:
-        >>> data = b'msg1\\r\\nmsg2\\r\\nmsg3\\r\\n'
-        >>> result = detect_delimiter(data)
-        >>> print(f"Delimiter: {result.delimiter!r}")
-    """
-    # Combine payloads if list
-    if isinstance(payloads, list | tuple):
-        data: bytes = b"".join(payloads)
-    else:
-        # Type narrowing: payloads is bytes here
-        data = cast("bytes", payloads)
-
-    if not data:
-        return DelimiterResult(
-            delimiter=b"",
-            delimiter_type="fixed",
-            confidence=0.0,
-            occurrences=0,
-        )
-
-    # Default candidates
-    if candidates is None:
-        candidates = [
-            b"\r\n",  # CRLF
-            b"\n",  # LF
-            b"\x00",  # Null
-            b"\r",  # CR
-            b"\x0d\x0a",  # CRLF (explicit)
-        ]
-
-    best_result = None
-    best_score = 0.0
-
-    for delim in candidates:
-        if len(delim) == 0:
-            continue
-
-        count = data.count(delim)
-        if count < 2:
-            continue
-
-        # Calculate score based on frequency and regularity
-        positions = []
-        pos = 0
-        while True:
-            pos = data.find(delim, pos)
-            if pos == -1:
-                break
-            positions.append(pos)
-            pos += len(delim)
-
-        if len(positions) < 2:
-            continue
-
-        # Calculate interval regularity
-        intervals = [positions[i + 1] - positions[i] for i in range(len(positions) - 1)]
-        if len(intervals) > 0:
-            mean_interval = sum(intervals) / len(intervals)
-            if mean_interval > 0:
-                variance = sum((x - mean_interval) ** 2 for x in intervals) / len(intervals)
-                cv = (variance**0.5) / mean_interval if mean_interval > 0 else 1.0
-                regularity = 1.0 / (1.0 + cv)
-            else:
-                regularity = 0.0
-        else:
-            regularity = 0.0
-
-        # Score combines frequency and regularity
-        score = count * (0.5 + 0.5 * regularity)
-
-        if score > best_score:
-            best_score = score
-            best_result = DelimiterResult(
-                delimiter=delim,
-                delimiter_type="fixed",
-                confidence=min(1.0, regularity * 0.8 + 0.2 * min(1.0, count / 10)),
-                occurrences=count,
-                positions=positions,
-            )
-
-    if best_result is None:
-        return DelimiterResult(
-            delimiter=b"",
-            delimiter_type="fixed",
-            confidence=0.0,
-            occurrences=0,
-        )
-
-    return best_result
-
-
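The score above multiplies the occurrence count by interval regularity, where regularity = 1 / (1 + CV) and CV is the coefficient of variation of the gaps between consecutive hits. A worked sketch:

    from oscura.analyzers.packet.payload_patterns import detect_delimiter

    data = b"msg1\r\nmsg2\r\nmsg3\r\n"
    # b"\r\n" hits at offsets 4, 10, 16 -> intervals [6, 6], CV = 0,
    # regularity = 1.0, score = 3 * (0.5 + 0.5 * 1.0) = 3.0. b"\n" ties at
    # 3.0, but CRLF is tested first and only a strictly higher score
    # replaces the current best.
    result = detect_delimiter(data)
    assert result.delimiter == b"\r\n" and result.occurrences == 3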
-def detect_length_prefix(
-    payloads: Sequence[bytes],
-    max_length_bytes: int = 4,
-) -> LengthPrefixResult:
-    """Detect length-prefixed message format.
-
-    Implements RE-PAY-003: Length prefix detection.
-
-    Args:
-        payloads: List of payload samples.
-        max_length_bytes: Maximum length field size to test.
-
-    Returns:
-        LengthPrefixResult with detected format.
-
-    Example:
-        >>> result = detect_length_prefix(payloads)
-        >>> if result.detected:
-        ...     print(f"Length field: {result.length_bytes} bytes, {result.endian}")
-    """
-    if not payloads:
-        return LengthPrefixResult(detected=False)
-
-    # Concatenate payloads for analysis
-    data = b"".join(payloads)
-
-    best_result = LengthPrefixResult(detected=False)
-    best_score = 0.0
-
-    # Try different length field sizes and offsets.
-    # IMPORTANT: Prefer larger length_bytes values when scores are equal
-    # by iterating in reverse order (4, 2, 1); the strict > comparison
-    # below then keeps the first (larger) candidate on ties.
-    for length_bytes in [4, 2, 1]:
-        if length_bytes > max_length_bytes:
-            continue
-
-        for endian_str in ["big", "little"]:
-            endian: Literal["big", "little"] = endian_str  # type: ignore[assignment]
-            for offset in range(min(8, len(data) - length_bytes)):
-                for includes_length in [False, True]:
-                    score, matches = _test_length_prefix(
-                        data, length_bytes, endian, offset, includes_length
-                    )
-
-                    # Use > to prefer larger length_bytes (tested first) when scores are equal
-                    if score > best_score and matches >= 3:
-                        best_score = score
-                        best_result = LengthPrefixResult(
-                            detected=True,
-                            length_bytes=length_bytes,
-                            endian=endian,
-                            offset=offset,
-                            includes_length=includes_length,
-                            confidence=score,
-                        )
-
-    return best_result
-
-
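A sketch of the detector against a hand-built stream (three messages, each prefixed by a big-endian u16 length that excludes the prefix itself):

    from oscura.analyzers.packet.payload_patterns import detect_length_prefix

    stream = b""
    for body in (b"alpha", b"bravo!!", b"charlie8"):
        stream += len(body).to_bytes(2, "big") + body

    result = detect_length_prefix([stream])
    # Expected: length_bytes=2, endian="big", offset=0, includes_length=False.
    # A 1-byte reading at offset 1 chains through the same stream and ties on
    # score; the [4, 2, 1] iteration order plus the strict > keeps the u16 result.
    print(result.detected, result.length_bytes, result.endian)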
-def find_message_boundaries(
-    payloads: Sequence[bytes] | bytes,
-    delimiter: bytes | DelimiterResult | None = None,
-    length_prefix: LengthPrefixResult | None = None,
-) -> list[MessageBoundary]:
-    """Find message boundaries in payload data.
-
-    Implements RE-PAY-003: Message boundary detection.
-
-    Args:
-        payloads: Payload data or list of payloads.
-        delimiter: Delimiter to use (auto-detect if None).
-        length_prefix: Length prefix format (test if None).
-
-    Returns:
-        List of MessageBoundary objects.
-
-    Example:
-        >>> boundaries = find_message_boundaries(data)
-        >>> for b in boundaries:
-        ...     print(f"Message {b.index}: {b.length} bytes")
-    """
-    # Combine payloads if list
-    if isinstance(payloads, list | tuple):
-        data: bytes = b"".join(payloads)
-    else:
-        # Type narrowing: payloads is bytes here
-        data = cast("bytes", payloads)
-
-    if not data:
-        return []
-
-    boundaries = []
-
-    # Try length prefix first
-    if length_prefix is None:
-        length_prefix = detect_length_prefix([data] if isinstance(data, bytes) else list(payloads))
-
-    if length_prefix.detected:
-        boundaries = _extract_length_prefixed_messages(data, length_prefix)
-        if len(boundaries) > 0:
-            return boundaries
-
-    # Fall back to delimiter
-    if delimiter is None:
-        delimiter = detect_delimiter(data)
-
-    if isinstance(delimiter, DelimiterResult):
-        delim = delimiter.delimiter
-    else:
-        delim = delimiter
-
-    if not delim:
-        # No delimiter found, return whole data as one message
-        return [MessageBoundary(start=0, end=len(data), length=len(data), data=data, index=0)]
-
-    # Split by delimiter
-    parts = data.split(delim)
-    current_offset = 0
-
-    for _i, part in enumerate(parts):
-        if part:  # Skip empty parts
-            boundaries.append(
-                MessageBoundary(
-                    start=current_offset,
-                    end=current_offset + len(part),
-                    length=len(part),
-                    data=part,
-                    index=len(boundaries),
-                )
-            )
-        current_offset += len(part) + len(delim)
-
-    return boundaries
-
-
-def segment_messages(
-    payloads: Sequence[bytes] | bytes,
-    delimiter: bytes | None = None,
-    length_prefix: LengthPrefixResult | None = None,
-) -> list[bytes]:
-    """Segment stream into individual messages.
-
-    Implements RE-PAY-003: Message segmentation.
-
-    Args:
-        payloads: Payload data or list of payloads.
-        delimiter: Delimiter to use (auto-detect if None).
-        length_prefix: Length prefix format (auto-detect if None).
-
-    Returns:
-        List of message bytes.
-    """
-    boundaries = find_message_boundaries(payloads, delimiter, length_prefix)
-    return [b.data for b in boundaries]
-
-
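An end-to-end segmentation sketch: no length prefix is detectable in this toy stream, so boundary detection falls back to CRLF splitting:

    from oscura.analyzers.packet.payload_patterns import segment_messages

    msgs = segment_messages(b"PING\r\nPONG\r\nDATA 42\r\n")
    assert msgs == [b"PING", b"PONG", b"DATA 42"]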
-def diff_payloads(payload_a: bytes, payload_b: bytes) -> PayloadDiff:
-    """Compare two payloads and identify differences.
-
-    Implements RE-PAY-005: Payload differential analysis.
-
-    Args:
-        payload_a: First payload.
-        payload_b: Second payload.
-
-    Returns:
-        PayloadDiff with comparison results.
-
-    Example:
-        >>> diff = diff_payloads(pkt1.data, pkt2.data)
-        >>> print(f"Common prefix: {diff.common_prefix_length} bytes")
-        >>> print(f"Different bytes: {len(diff.differences)}")
-    """
-    # Find common prefix
-    common_prefix = 0
-    min_len = min(len(payload_a), len(payload_b))
-    for i in range(min_len):
-        if payload_a[i] == payload_b[i]:
-            common_prefix += 1
-        else:
-            break
-
-    # Find common suffix
-    common_suffix = 0
-    for i in range(1, min_len - common_prefix + 1):
-        if payload_a[-i] == payload_b[-i]:
-            common_suffix += 1
-        else:
-            break
-
-    # Find all differences
-    differences = []
-    for i in range(min_len):
-        if payload_a[i] != payload_b[i]:
-            differences.append((i, payload_a[i], payload_b[i]))
-
-    # Add length differences
-    if len(payload_a) > len(payload_b):
-        for i in range(len(payload_b), len(payload_a)):
-            differences.append((i, payload_a[i], -1))
-    elif len(payload_b) > len(payload_a):
-        for i in range(len(payload_a), len(payload_b)):
-            differences.append((i, -1, payload_b[i]))
-
-    # Calculate similarity
-    max_len = max(len(payload_a), len(payload_b))
-    if max_len == 0:
-        similarity = 1.0
-    else:
-        matching = min_len - len([d for d in differences if d[0] < min_len])
-        similarity = matching / max_len
-
-    # Calculate edit distance (simplified Levenshtein)
-    edit_distance = _levenshtein_distance(payload_a, payload_b)
-
-    return PayloadDiff(
-        common_prefix_length=common_prefix,
-        common_suffix_length=common_suffix,
-        differences=differences,
-        similarity=similarity,
-        edit_distance=edit_distance,
-    )
-
-
-def find_common_bytes(payloads: Sequence[bytes]) -> bytes:
-    """Find common prefix across all payloads.
-
-    Implements RE-PAY-005: Common byte analysis.
-
-    Args:
-        payloads: List of payloads to analyze.
-
-    Returns:
-        Common prefix bytes.
-    """
-    if not payloads:
-        return b""
-
-    if len(payloads) == 1:
-        return payloads[0]
-
-    # Find minimum length
-    min_len = min(len(p) for p in payloads)
-
-    # Find common prefix
-    common = bytearray()
-    for i in range(min_len):
-        byte = payloads[0][i]
-        if all(p[i] == byte for p in payloads):
-            common.append(byte)
-        else:
-            break
-
-    return bytes(common)
-
-
-def find_variable_positions(payloads: Sequence[bytes]) -> VariablePositions:
-    """Identify which byte positions vary across payloads.
-
-    Implements RE-PAY-005: Variable position detection.
-
-    Args:
-        payloads: List of payloads to analyze.
-
-    Returns:
-        VariablePositions with constant and variable position info.
-
-    Example:
-        >>> result = find_variable_positions(payloads)
-        >>> print(f"Constant positions: {result.constant_positions}")
-        >>> print(f"Variable positions: {result.variable_positions}")
-    """
-    if not payloads:
-        return VariablePositions(
-            constant_positions=[],
-            variable_positions=[],
-            constant_values={},
-            variance_by_position=np.array([]),
-        )
-
-    # Use shortest payload length
-    min_len = min(len(p) for p in payloads)
-
-    constant_positions = []
-    variable_positions = []
-    constant_values = {}
-    variances = []
-
-    for i in range(min_len):
-        values = [p[i] for p in payloads]
-        unique = set(values)
-
-        if len(unique) == 1:
-            constant_positions.append(i)
-            constant_values[i] = values[0]
-            variances.append(0.0)
-        else:
-            variable_positions.append(i)
-            variances.append(float(np.var(values)))
-
-    return VariablePositions(
-        constant_positions=constant_positions,
-        variable_positions=variable_positions,
-        constant_values=constant_values,
-        variance_by_position=np.array(variances),
-    )
-
-
-def compute_similarity(
-    payload_a: bytes,
-    payload_b: bytes,
-    metric: Literal["levenshtein", "hamming", "jaccard"] = "levenshtein",
-) -> float:
-    """Compute similarity between two payloads.
-
-    Implements RE-PAY-005: Similarity computation.
-
-    Args:
-        payload_a: First payload.
-        payload_b: Second payload.
-        metric: Similarity metric to use.
-
-    Returns:
-        Similarity score (0-1).
-    """
-    if metric == "levenshtein":
-        max_len = max(len(payload_a), len(payload_b))
-        if max_len == 0:
-            return 1.0
-        distance = _levenshtein_distance(payload_a, payload_b)
-        return 1.0 - (distance / max_len)
-
-    elif metric == "hamming":
-        if len(payload_a) != len(payload_b):
-            # Pad shorter one
-            max_len = max(len(payload_a), len(payload_b))
-            payload_a = payload_a.ljust(max_len, b"\x00")
-            payload_b = payload_b.ljust(max_len, b"\x00")
-
-        matches = sum(a == b for a, b in zip(payload_a, payload_b, strict=True))
-        return matches / len(payload_a) if payload_a else 1.0
-
-    # metric == "jaccard"
-    # Treat bytes as sets
-    set_a = set(payload_a)
-    set_b = set(payload_b)
-    intersection = len(set_a & set_b)
-    union = len(set_a | set_b)
-    return intersection / union if union > 0 else 1.0
-
-
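The three metrics behave quite differently on the same pair; a quick comparison (values follow directly from the definitions above):

    from oscura.analyzers.packet.payload_analysis import compute_similarity

    a, b = b"\x01\x02\x03\x04", b"\x01\x02\xff\x04"
    print(compute_similarity(a, b, metric="levenshtein"))  # 0.75: one edit over max length 4
    print(compute_similarity(a, b, metric="hamming"))      # 0.75: 3 of 4 positions match
    print(compute_similarity(a, b, metric="jaccard"))      # 0.6: |{1,2,4}| / |{1,2,3,4,255}|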
-def cluster_payloads(
-    payloads: Sequence[bytes],
-    threshold: float = 0.8,
-    algorithm: Literal["greedy", "dbscan"] = "greedy",
-) -> list[PayloadCluster]:
-    """Cluster similar payloads together.
-
-    Implements RE-PAY-005: Payload clustering.
-
-    Args:
-        payloads: List of payloads to cluster.
-        threshold: Similarity threshold for clustering.
-        algorithm: Clustering algorithm.
-
-    Returns:
-        List of PayloadCluster objects.
-
-    Example:
-        >>> clusters = cluster_payloads(payloads, threshold=0.85)
-        >>> for c in clusters:
-        ...     print(f"Cluster {c.cluster_id}: {c.size} payloads")
-    """
-    if not payloads:
-        return []
-
-    if algorithm == "greedy":
-        return _cluster_greedy_optimized(payloads, threshold)
-    # algorithm == "dbscan"
-    return _cluster_dbscan(payloads, threshold)
-
-
-def correlate_request_response(
-    requests: Sequence[PayloadInfo],
-    responses: Sequence[PayloadInfo],
-    max_delay: float = 1.0,
-) -> list[tuple[PayloadInfo, PayloadInfo, float]]:
-    """Correlate request payloads with responses.
-
-    Implements RE-PAY-005: Request-response correlation.
-
-    Args:
-        requests: List of request PayloadInfo.
-        responses: List of response PayloadInfo.
-        max_delay: Maximum time between request and response.
-
-    Returns:
-        List of (request, response, latency) tuples.
-    """
-    pairs = []
-
-    for request in requests:
-        if request.timestamp is None:
-            continue
-
-        best_response = None
-        best_latency = float("inf")
-
-        for response in responses:
-            if response.timestamp is None:
-                continue
-
-            latency = response.timestamp - request.timestamp
-            if 0 <= latency <= max_delay and latency < best_latency:
-                best_response = response
-                best_latency = latency
-
-        if best_response is not None:
-            pairs.append((request, best_response, best_latency))
-
-    return pairs
-
-
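A correlation sketch, assuming the PayloadInfo dataclass fields used throughout this file (data, packet_index, timestamp); the timestamps are invented:

    from oscura.analyzers.packet.payload_analysis import correlate_request_response
    from oscura.analyzers.packet.payload_extraction import PayloadInfo

    reqs = [PayloadInfo(data=b"GET", packet_index=0, timestamp=0.00)]
    resps = [PayloadInfo(data=b"200", packet_index=1, timestamp=0.25)]
    for req, resp, latency in correlate_request_response(reqs, resps, max_delay=1.0):
        print(latency)  # 0.25: the earliest in-window response wins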
-# =============================================================================
-# Helper functions
-# =============================================================================
-
-
-def _find_pattern_in_data(
-    data: bytes,
-    pattern: bytes | str,
-    pattern_type: str,
-) -> list[tuple[int, bytes]]:
-    """Find pattern occurrences in data."""
-    matches = []
-
-    if pattern_type == "exact":
-        if isinstance(pattern, str):
-            pattern = pattern.encode()
-        pos = 0
-        while True:
-            pos = data.find(pattern, pos)
-            if pos == -1:
-                break
-            matches.append((pos, pattern))
-            pos += 1
-
-    elif pattern_type == "wildcard":
-        # Convert wildcard pattern to regex: each "??" matches one arbitrary
-        # byte; literal chunks are escaped so regex metacharacters in the
-        # pattern bytes cannot misfire.
-        if isinstance(pattern, bytes):
-            regex_pattern = b".".join(re.escape(part) for part in pattern.split(b"??"))
-            try:
-                for match in re.finditer(regex_pattern, data, re.DOTALL):
-                    matches.append((match.start(), match.group()))
-            except re.error:
-                pass
-
-    elif pattern_type == "regex":
-        if isinstance(pattern, str):
-            pattern = pattern.encode()
-        try:
-            for match in re.finditer(pattern, data, re.DOTALL):
-                matches.append((match.start(), match.group()))
-        except re.error:
-            pass
-
-    return matches
-
-
-def _test_length_prefix(
-    data: bytes,
-    length_bytes: int,
-    endian: str,
-    offset: int,
-    includes_length: bool,
-) -> tuple[float, int]:
-    """Test if data follows a length-prefix pattern."""
-    matches = 0
-    pos = 0
-
-    while pos + offset + length_bytes <= len(data):
-        # Read length field
-        length_data = data[pos + offset : pos + offset + length_bytes]
-        if endian == "big":
-            length = int.from_bytes(length_data, "big")
-        else:
-            length = int.from_bytes(length_data, "little")
-
-        if includes_length:
-            expected_end = pos + length
-        else:
-            expected_end = pos + offset + length_bytes + length
-
-        # Check if this makes sense
-        if 0 < length < 65536 and expected_end <= len(data):
-            matches += 1
-            pos = expected_end
-        else:
-            break
-
-    # Score based on matches and coverage
-    coverage = pos / len(data) if len(data) > 0 else 0
-    score = min(1.0, matches / 5) * coverage
-
-    return score, matches
-
-
-def _extract_length_prefixed_messages(
-    data: bytes,
-    length_prefix: LengthPrefixResult,
-) -> list[MessageBoundary]:
-    """Extract messages using detected length prefix format."""
-    boundaries = []
-    pos = 0
-    index = 0
-
-    while pos + length_prefix.offset + length_prefix.length_bytes <= len(data):
-        # Read length
-        length_data = data[
-            pos + length_prefix.offset : pos + length_prefix.offset + length_prefix.length_bytes
-        ]
-        if length_prefix.endian == "big":
-            length = int.from_bytes(length_data, "big")
-        else:
-            length = int.from_bytes(length_data, "little")
-
-        if length_prefix.includes_length:
-            end = pos + length
-        else:
-            end = pos + length_prefix.offset + length_prefix.length_bytes + length
-
-        if end > len(data) or length <= 0:
-            break
-
-        msg_data = data[pos:end]
-        boundaries.append(
-            MessageBoundary(
-                start=pos,
-                end=end,
-                length=end - pos,
-                data=msg_data,
-                index=index,
-            )
-        )
-
-        pos = end
-        index += 1
-
-    return boundaries
-
-
-def _levenshtein_distance(a: bytes, b: bytes) -> int:
-    """Calculate Levenshtein edit distance between two byte sequences."""
-    if len(a) < len(b):
-        return _levenshtein_distance(b, a)
-
-    if len(b) == 0:
-        return len(a)
-
-    previous_row: list[int] = list(range(len(b) + 1))
-    for i, c1 in enumerate(a):
-        current_row = [i + 1]
-        for j, c2 in enumerate(b):
-            insertions = previous_row[j + 1] + 1
-            deletions = current_row[j] + 1
-            substitutions = previous_row[j] + (c1 != c2)
-            current_row.append(min(insertions, deletions, substitutions))
-        previous_row = current_row
-
-    return previous_row[-1]
-
-
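A worked trace of the two-row DP above, for b"abc" against b"axc":

    # previous_row starts as [0, 1, 2, 3]; after consuming each byte of a:
    #   b"a" -> [1, 0, 1, 2]
    #   b"b" -> [2, 1, 1, 2]
    #   b"c" -> [3, 2, 2, 1]
    # previous_row[-1] == 1: a single substitution (b -> x), so the derived
    # Levenshtein similarity is 1 - 1/3, about 0.667.
    from oscura.analyzers.packet.payload_analysis import compute_similarity

    assert abs(compute_similarity(b"abc", b"axc") - 2 / 3) < 1e-9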
-def _fast_similarity(payload_a: bytes, payload_b: bytes, threshold: float) -> float | None:
-    """Fast similarity check with early termination.
-
-    Uses length-based filtering and sampling to quickly reject dissimilar payloads.
-    Returns None if payloads are likely similar (needs full check),
-    or a similarity value if they can be quickly determined.
-
-    Args:
-        payload_a: First payload.
-        payload_b: Second payload.
-        threshold: Similarity threshold for clustering.
-
-    Returns:
-        Similarity value if quickly determined, None if full check needed.
-    """
-    len_a = len(payload_a)
-    len_b = len(payload_b)
-
-    # Empty payloads
-    if len_a == 0 and len_b == 0:
-        return 1.0
-    if len_a == 0 or len_b == 0:
-        return 0.0
-
-    # Length difference filter: if lengths differ by more than (1-threshold)*max_len,
-    # similarity can't exceed threshold
-    max_len = max(len_a, len_b)
-    min_len = min(len_a, len_b)
-    _length_diff = max_len - min_len
-
-    # Maximum possible similarity given length difference
-    max_possible_similarity = min_len / max_len
-    if max_possible_similarity < threshold:
-        return max_possible_similarity
-
-    # For same-length payloads, use fast hamming similarity
-    if len_a == len_b:
-        # Sample comparison for large payloads
-        if len_a > 50:
-            # Sample first 16, last 16, and some middle bytes
-            sample_size = min(48, len_a)
-            mismatches = 0
-
-            # First 16 bytes
-            for i in range(min(16, len_a)):
-                if payload_a[i] != payload_b[i]:
-                    mismatches += 1
-
-            # Last 16 bytes
-            for i in range(1, min(17, len_a + 1)):
-                if payload_a[-i] != payload_b[-i]:
-                    mismatches += 1
-
-            # Middle samples
-            if len_a > 32:
-                step = (len_a - 32) // 16
-                if step > 0:
-                    for i in range(16, len_a - 16, step):
-                        if payload_a[i] != payload_b[i]:
-                            mismatches += 1
-
-            # Estimate similarity from sample
-            estimated_similarity = 1.0 - (mismatches / sample_size)
-
-            # If sample shows very low similarity, reject early
-            if estimated_similarity < threshold * 0.8:
-                return estimated_similarity
-
-        # Full hamming comparison for same-length payloads (faster than Levenshtein)
-        matches = sum(a == b for a, b in zip(payload_a, payload_b, strict=True))
-        return matches / len_a
-
-    # For different-length payloads, use common prefix/suffix heuristic
-    common_prefix = 0
-    for i in range(min_len):
-        if payload_a[i] == payload_b[i]:
-            common_prefix += 1
-        else:
-            break
-
-    common_suffix = 0
-    for i in range(1, min_len - common_prefix + 1):
-        if payload_a[-i] == payload_b[-i]:
-            common_suffix += 1
-        else:
-            break
-
-    # Estimate similarity from prefix/suffix
-    common_bytes = common_prefix + common_suffix
-    estimated_similarity = common_bytes / max_len
-
-    # If common bytes suggest low similarity, reject
-    if estimated_similarity < threshold * 0.7:
-        return estimated_similarity
-
-    # Need full comparison
-    return None
-
-
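The length filter is sound because a Levenshtein distance is at least the length gap, so similarity 1 - distance/max_len can never exceed min_len/max_len. An illustration with invented payloads:

    a, b = b"A" * 100, b"A" * 60   # identical content, but a 40-byte length gap
    bound = min(len(a), len(b)) / max(len(a), len(b))  # 0.6
    # At threshold 0.8 this pair is rejected without comparing a single byte:
    # even a perfect alignment cannot push similarity above 0.6.
    assert bound < 0.8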
-def _cluster_greedy_optimized(
-    payloads: Sequence[bytes],
-    threshold: float,
-) -> list[PayloadCluster]:
-    """Optimized greedy clustering algorithm.
-
-    Uses fast pre-filtering based on length and sampling to avoid
-    expensive Levenshtein distance calculations when possible.
-
-    Args:
-        payloads: List of payloads to cluster.
-        threshold: Similarity threshold for clustering.
-
-    Returns:
-        List of PayloadCluster objects.
-    """
-    clusters: list[PayloadCluster] = []
-    assigned = [False] * len(payloads)
-
-    # Precompute lengths for fast filtering
-    lengths = [len(p) for p in payloads]
-
-    for i, payload in enumerate(payloads):
-        if assigned[i]:
-            continue
-
-        # Start new cluster
-        cluster_payloads = [payload]
-        cluster_indices = [i]
-        assigned[i] = True
-
-        payload_len = lengths[i]
-
-        # Find similar payloads
-        for j in range(i + 1, len(payloads)):
-            if assigned[j]:
-                continue
-
-            other_len = lengths[j]
-
-            # Quick length-based rejection (guarded so two empty payloads
-            # do not divide by zero)
-            max_len = max(payload_len, other_len)
-            min_len = min(payload_len, other_len)
-            if max_len > 0 and min_len / max_len < threshold:
-                continue
-
-            # Try fast similarity check first
-            fast_result = _fast_similarity(payload, payloads[j], threshold)
-
-            if fast_result is not None:
-                similarity = fast_result
-            else:
-                # Fall back to Levenshtein for uncertain cases
-                similarity = compute_similarity(payload, payloads[j])
-
-            if similarity >= threshold:
-                cluster_payloads.append(payloads[j])
-                cluster_indices.append(j)
-                assigned[j] = True
-
-        clusters.append(
-            PayloadCluster(
-                cluster_id=len(clusters),
-                payloads=cluster_payloads,
-                indices=cluster_indices,
-                representative=payload,
-                size=len(cluster_payloads),
-            )
-        )
-
-    return clusters
-
-
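A clustering sketch on invented frames: the two SENSOR frames differ in one byte of eleven (hamming 10/11, about 0.91), while the text frame matches neither:

    from oscura.analyzers.packet.payload_analysis import cluster_payloads

    payloads = [b"\x10\x00SENSOR=01", b"\x10\x00SENSOR=02", b"HELLO WORLD"]
    clusters = cluster_payloads(payloads, threshold=0.85)
    assert [c.size for c in clusters] == [2, 1]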
-def _cluster_greedy(
-    payloads: Sequence[bytes],
-    threshold: float,
-) -> list[PayloadCluster]:
-    """Greedy clustering algorithm (legacy; delegates to the optimized version)."""
-    return _cluster_greedy_optimized(payloads, threshold)
-
-
-def _cluster_dbscan(
-    payloads: Sequence[bytes],
-    threshold: float,
-) -> list[PayloadCluster]:
-    """DBSCAN-style clustering (simplified)."""
-    # For simplicity, fall back to greedy
-    # Full DBSCAN would require scipy or a custom implementation
-    return _cluster_greedy_optimized(payloads, threshold)
-
+# RE-PAY-004 & RE-PAY-005: Field Inference and Comparison
+from oscura.analyzers.packet.payload_analysis import (
+    FieldInferrer,
+    InferredField,
+    MessageSchema,
+    PayloadCluster,
+    PayloadDiff,
+    VariablePositions,
+    cluster_payloads,
+    compute_similarity,
+    correlate_request_response,
+    detect_field_types,
+    diff_payloads,
+    find_checksum_fields,
+    find_common_bytes,
+    find_sequence_fields,
+    find_variable_positions,
+    infer_fields,
+)
+
+# RE-PAY-001: Payload Extraction
+from oscura.analyzers.packet.payload_extraction import (
+    PayloadExtractor,
+    PayloadInfo,
+)
+
+# RE-PAY-002 & RE-PAY-003: Pattern Search and Delimiter Detection
+from oscura.analyzers.packet.payload_patterns import (
+    DelimiterResult,
+    LengthPrefixResult,
+    MessageBoundary,
+    PatternMatch,
+    detect_delimiter,
+    detect_length_prefix,
+    filter_by_pattern,
+    find_message_boundaries,
+    search_pattern,
+    search_patterns,
+    segment_messages,
+)
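With the module body replaced by these re-exports, the old import path keeps working while the implementations live in the three split modules; a quick sanity sketch:

    from oscura.analyzers.packet.payload import PayloadExtractor, segment_messages
    from oscura.analyzers.packet.payload_extraction import PayloadExtractor as _PE

    assert PayloadExtractor is _PE  # same class, re-exported through the facade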
 
 __all__ = [
+    # RE-PAY-003: Delimiter Detection
     "DelimiterResult",
     "FieldInferrer",
-    # RE-PAY-004: Field inference
+    # RE-PAY-004: Field Inference
     "InferredField",
     "LengthPrefixResult",
     "MessageBoundary",
     "MessageSchema",
+    # RE-PAY-002: Pattern Search
     "PatternMatch",
     "PayloadCluster",
+    # RE-PAY-005: Payload Comparison
     "PayloadDiff",
-    # Classes
     "PayloadExtractor",
-    # Data classes
+    # RE-PAY-001: Payload Extraction
     "PayloadInfo",
     "VariablePositions",
     "cluster_payloads",
     "compute_similarity",
     "correlate_request_response",
-    # RE-PAY-003: Delimiter detection
     "detect_delimiter",
     "detect_field_types",
     "detect_length_prefix",
-    # RE-PAY-005: Comparison
     "diff_payloads",
     "filter_by_pattern",
     "find_checksum_fields",
@@ -2147,9 +90,6 @@ __all__ = [
     "find_sequence_fields",
     "find_variable_positions",
     "infer_fields",
-    # RE-PAY-001: Extraction
-    # (via PayloadExtractor methods)
-    # RE-PAY-002: Pattern search
     "search_pattern",
     "search_patterns",
     "segment_messages",