oscura: 0.5.1-py3-none-any.whl → 0.7.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- oscura/__init__.py +169 -167
- oscura/analyzers/__init__.py +3 -0
- oscura/analyzers/classification.py +659 -0
- oscura/analyzers/digital/edges.py +325 -65
- oscura/analyzers/digital/quality.py +293 -166
- oscura/analyzers/digital/timing.py +260 -115
- oscura/analyzers/digital/timing_numba.py +334 -0
- oscura/analyzers/entropy.py +605 -0
- oscura/analyzers/eye/diagram.py +176 -109
- oscura/analyzers/eye/metrics.py +5 -5
- oscura/analyzers/jitter/__init__.py +6 -4
- oscura/analyzers/jitter/ber.py +52 -52
- oscura/analyzers/jitter/classification.py +156 -0
- oscura/analyzers/jitter/decomposition.py +163 -113
- oscura/analyzers/jitter/spectrum.py +80 -64
- oscura/analyzers/ml/__init__.py +39 -0
- oscura/analyzers/ml/features.py +600 -0
- oscura/analyzers/ml/signal_classifier.py +604 -0
- oscura/analyzers/packet/daq.py +246 -158
- oscura/analyzers/packet/parser.py +12 -1
- oscura/analyzers/packet/payload.py +50 -2110
- oscura/analyzers/packet/payload_analysis.py +361 -181
- oscura/analyzers/packet/payload_patterns.py +133 -70
- oscura/analyzers/packet/stream.py +84 -23
- oscura/analyzers/patterns/__init__.py +26 -5
- oscura/analyzers/patterns/anomaly_detection.py +908 -0
- oscura/analyzers/patterns/clustering.py +169 -108
- oscura/analyzers/patterns/clustering_optimized.py +227 -0
- oscura/analyzers/patterns/discovery.py +1 -1
- oscura/analyzers/patterns/matching.py +581 -197
- oscura/analyzers/patterns/pattern_mining.py +778 -0
- oscura/analyzers/patterns/periodic.py +121 -38
- oscura/analyzers/patterns/sequences.py +175 -78
- oscura/analyzers/power/conduction.py +1 -1
- oscura/analyzers/power/soa.py +6 -6
- oscura/analyzers/power/switching.py +250 -110
- oscura/analyzers/protocol/__init__.py +17 -1
- oscura/analyzers/protocols/base.py +6 -6
- oscura/analyzers/protocols/ble/__init__.py +38 -0
- oscura/analyzers/protocols/ble/analyzer.py +809 -0
- oscura/analyzers/protocols/ble/uuids.py +288 -0
- oscura/analyzers/protocols/can.py +257 -127
- oscura/analyzers/protocols/can_fd.py +107 -80
- oscura/analyzers/protocols/flexray.py +139 -80
- oscura/analyzers/protocols/hdlc.py +93 -58
- oscura/analyzers/protocols/i2c.py +247 -106
- oscura/analyzers/protocols/i2s.py +138 -86
- oscura/analyzers/protocols/industrial/__init__.py +40 -0
- oscura/analyzers/protocols/industrial/bacnet/__init__.py +33 -0
- oscura/analyzers/protocols/industrial/bacnet/analyzer.py +708 -0
- oscura/analyzers/protocols/industrial/bacnet/encoding.py +412 -0
- oscura/analyzers/protocols/industrial/bacnet/services.py +622 -0
- oscura/analyzers/protocols/industrial/ethercat/__init__.py +30 -0
- oscura/analyzers/protocols/industrial/ethercat/analyzer.py +474 -0
- oscura/analyzers/protocols/industrial/ethercat/mailbox.py +339 -0
- oscura/analyzers/protocols/industrial/ethercat/topology.py +166 -0
- oscura/analyzers/protocols/industrial/modbus/__init__.py +31 -0
- oscura/analyzers/protocols/industrial/modbus/analyzer.py +525 -0
- oscura/analyzers/protocols/industrial/modbus/crc.py +79 -0
- oscura/analyzers/protocols/industrial/modbus/functions.py +436 -0
- oscura/analyzers/protocols/industrial/opcua/__init__.py +21 -0
- oscura/analyzers/protocols/industrial/opcua/analyzer.py +552 -0
- oscura/analyzers/protocols/industrial/opcua/datatypes.py +446 -0
- oscura/analyzers/protocols/industrial/opcua/services.py +264 -0
- oscura/analyzers/protocols/industrial/profinet/__init__.py +23 -0
- oscura/analyzers/protocols/industrial/profinet/analyzer.py +441 -0
- oscura/analyzers/protocols/industrial/profinet/dcp.py +263 -0
- oscura/analyzers/protocols/industrial/profinet/ptcp.py +200 -0
- oscura/analyzers/protocols/jtag.py +180 -98
- oscura/analyzers/protocols/lin.py +219 -114
- oscura/analyzers/protocols/manchester.py +4 -4
- oscura/analyzers/protocols/onewire.py +253 -149
- oscura/analyzers/protocols/parallel_bus/__init__.py +20 -0
- oscura/analyzers/protocols/parallel_bus/centronics.py +92 -0
- oscura/analyzers/protocols/parallel_bus/gpib.py +137 -0
- oscura/analyzers/protocols/spi.py +192 -95
- oscura/analyzers/protocols/swd.py +321 -167
- oscura/analyzers/protocols/uart.py +267 -125
- oscura/analyzers/protocols/usb.py +235 -131
- oscura/analyzers/side_channel/power.py +17 -12
- oscura/analyzers/signal/__init__.py +15 -0
- oscura/analyzers/signal/timing_analysis.py +1086 -0
- oscura/analyzers/signal_integrity/__init__.py +4 -1
- oscura/analyzers/signal_integrity/sparams.py +2 -19
- oscura/analyzers/spectral/chunked.py +129 -60
- oscura/analyzers/spectral/chunked_fft.py +300 -94
- oscura/analyzers/spectral/chunked_wavelet.py +100 -80
- oscura/analyzers/statistical/checksum.py +376 -217
- oscura/analyzers/statistical/classification.py +229 -107
- oscura/analyzers/statistical/entropy.py +78 -53
- oscura/analyzers/statistics/correlation.py +407 -211
- oscura/analyzers/statistics/outliers.py +2 -2
- oscura/analyzers/statistics/streaming.py +30 -5
- oscura/analyzers/validation.py +216 -101
- oscura/analyzers/waveform/measurements.py +9 -0
- oscura/analyzers/waveform/measurements_with_uncertainty.py +31 -15
- oscura/analyzers/waveform/spectral.py +500 -228
- oscura/api/__init__.py +31 -5
- oscura/api/dsl/__init__.py +582 -0
- oscura/{dsl → api/dsl}/commands.py +43 -76
- oscura/{dsl → api/dsl}/interpreter.py +26 -51
- oscura/{dsl → api/dsl}/parser.py +107 -77
- oscura/{dsl → api/dsl}/repl.py +2 -2
- oscura/api/dsl.py +1 -1
- oscura/{integrations → api/integrations}/__init__.py +1 -1
- oscura/{integrations → api/integrations}/llm.py +201 -102
- oscura/api/operators.py +3 -3
- oscura/api/optimization.py +144 -30
- oscura/api/rest_server.py +921 -0
- oscura/api/server/__init__.py +17 -0
- oscura/api/server/dashboard.py +850 -0
- oscura/api/server/static/README.md +34 -0
- oscura/api/server/templates/base.html +181 -0
- oscura/api/server/templates/export.html +120 -0
- oscura/api/server/templates/home.html +284 -0
- oscura/api/server/templates/protocols.html +58 -0
- oscura/api/server/templates/reports.html +43 -0
- oscura/api/server/templates/session_detail.html +89 -0
- oscura/api/server/templates/sessions.html +83 -0
- oscura/api/server/templates/waveforms.html +73 -0
- oscura/automotive/__init__.py +8 -1
- oscura/automotive/can/__init__.py +10 -0
- oscura/automotive/can/checksum.py +3 -1
- oscura/automotive/can/dbc_generator.py +590 -0
- oscura/automotive/can/message_wrapper.py +121 -74
- oscura/automotive/can/patterns.py +98 -21
- oscura/automotive/can/session.py +292 -56
- oscura/automotive/can/state_machine.py +6 -3
- oscura/automotive/can/stimulus_response.py +97 -75
- oscura/automotive/dbc/__init__.py +10 -2
- oscura/automotive/dbc/generator.py +84 -56
- oscura/automotive/dbc/parser.py +6 -6
- oscura/automotive/dtc/data.json +17 -102
- oscura/automotive/dtc/database.py +2 -2
- oscura/automotive/flexray/__init__.py +31 -0
- oscura/automotive/flexray/analyzer.py +504 -0
- oscura/automotive/flexray/crc.py +185 -0
- oscura/automotive/flexray/fibex.py +449 -0
- oscura/automotive/j1939/__init__.py +45 -8
- oscura/automotive/j1939/analyzer.py +605 -0
- oscura/automotive/j1939/spns.py +326 -0
- oscura/automotive/j1939/transport.py +306 -0
- oscura/automotive/lin/__init__.py +47 -0
- oscura/automotive/lin/analyzer.py +612 -0
- oscura/automotive/loaders/blf.py +13 -2
- oscura/automotive/loaders/csv_can.py +143 -72
- oscura/automotive/loaders/dispatcher.py +50 -2
- oscura/automotive/loaders/mdf.py +86 -45
- oscura/automotive/loaders/pcap.py +111 -61
- oscura/automotive/uds/__init__.py +4 -0
- oscura/automotive/uds/analyzer.py +725 -0
- oscura/automotive/uds/decoder.py +140 -58
- oscura/automotive/uds/models.py +7 -1
- oscura/automotive/visualization.py +1 -1
- oscura/cli/analyze.py +348 -0
- oscura/cli/batch.py +142 -122
- oscura/cli/benchmark.py +275 -0
- oscura/cli/characterize.py +137 -82
- oscura/cli/compare.py +224 -131
- oscura/cli/completion.py +250 -0
- oscura/cli/config_cmd.py +361 -0
- oscura/cli/decode.py +164 -87
- oscura/cli/export.py +286 -0
- oscura/cli/main.py +115 -31
- oscura/{onboarding → cli/onboarding}/__init__.py +3 -3
- oscura/{onboarding → cli/onboarding}/help.py +80 -58
- oscura/{onboarding → cli/onboarding}/tutorials.py +97 -72
- oscura/{onboarding → cli/onboarding}/wizard.py +55 -36
- oscura/cli/progress.py +147 -0
- oscura/cli/shell.py +157 -135
- oscura/cli/validate_cmd.py +204 -0
- oscura/cli/visualize.py +158 -0
- oscura/convenience.py +125 -79
- oscura/core/__init__.py +4 -2
- oscura/core/backend_selector.py +3 -3
- oscura/core/cache.py +126 -15
- oscura/core/cancellation.py +1 -1
- oscura/{config → core/config}/__init__.py +20 -11
- oscura/{config → core/config}/defaults.py +1 -1
- oscura/{config → core/config}/loader.py +7 -5
- oscura/{config → core/config}/memory.py +5 -5
- oscura/{config → core/config}/migration.py +1 -1
- oscura/{config → core/config}/pipeline.py +99 -23
- oscura/{config → core/config}/preferences.py +1 -1
- oscura/{config → core/config}/protocol.py +3 -3
- oscura/{config → core/config}/schema.py +426 -272
- oscura/{config → core/config}/settings.py +1 -1
- oscura/{config → core/config}/thresholds.py +195 -153
- oscura/core/correlation.py +5 -6
- oscura/core/cross_domain.py +0 -2
- oscura/core/debug.py +9 -5
- oscura/{extensibility → core/extensibility}/docs.py +158 -70
- oscura/{extensibility → core/extensibility}/extensions.py +160 -76
- oscura/{extensibility → core/extensibility}/logging.py +1 -1
- oscura/{extensibility → core/extensibility}/measurements.py +1 -1
- oscura/{extensibility → core/extensibility}/plugins.py +1 -1
- oscura/{extensibility → core/extensibility}/templates.py +73 -3
- oscura/{extensibility → core/extensibility}/validation.py +1 -1
- oscura/core/gpu_backend.py +11 -7
- oscura/core/log_query.py +101 -11
- oscura/core/logging.py +126 -54
- oscura/core/logging_advanced.py +5 -5
- oscura/core/memory_limits.py +108 -70
- oscura/core/memory_monitor.py +2 -2
- oscura/core/memory_progress.py +7 -7
- oscura/core/memory_warnings.py +1 -1
- oscura/core/numba_backend.py +13 -13
- oscura/{plugins → core/plugins}/__init__.py +9 -9
- oscura/{plugins → core/plugins}/base.py +7 -7
- oscura/{plugins → core/plugins}/cli.py +3 -3
- oscura/{plugins → core/plugins}/discovery.py +186 -106
- oscura/{plugins → core/plugins}/lifecycle.py +1 -1
- oscura/{plugins → core/plugins}/manager.py +7 -7
- oscura/{plugins → core/plugins}/registry.py +3 -3
- oscura/{plugins → core/plugins}/versioning.py +1 -1
- oscura/core/progress.py +16 -1
- oscura/core/provenance.py +8 -2
- oscura/{schemas → core/schemas}/__init__.py +2 -2
- oscura/{schemas → core/schemas}/device_mapping.json +2 -8
- oscura/{schemas → core/schemas}/packet_format.json +4 -24
- oscura/{schemas → core/schemas}/protocol_definition.json +2 -12
- oscura/core/types.py +4 -0
- oscura/core/uncertainty.py +3 -3
- oscura/correlation/__init__.py +52 -0
- oscura/correlation/multi_protocol.py +811 -0
- oscura/discovery/auto_decoder.py +117 -35
- oscura/discovery/comparison.py +191 -86
- oscura/discovery/quality_validator.py +155 -68
- oscura/discovery/signal_detector.py +196 -79
- oscura/export/__init__.py +18 -8
- oscura/export/kaitai_struct.py +513 -0
- oscura/export/scapy_layer.py +801 -0
- oscura/export/wireshark/generator.py +1 -1
- oscura/export/wireshark/templates/dissector.lua.j2 +2 -2
- oscura/export/wireshark_dissector.py +746 -0
- oscura/guidance/wizard.py +207 -111
- oscura/hardware/__init__.py +19 -0
- oscura/{acquisition → hardware/acquisition}/__init__.py +4 -4
- oscura/{acquisition → hardware/acquisition}/file.py +2 -2
- oscura/{acquisition → hardware/acquisition}/hardware.py +7 -7
- oscura/{acquisition → hardware/acquisition}/saleae.py +15 -12
- oscura/{acquisition → hardware/acquisition}/socketcan.py +1 -1
- oscura/{acquisition → hardware/acquisition}/streaming.py +2 -2
- oscura/{acquisition → hardware/acquisition}/synthetic.py +3 -3
- oscura/{acquisition → hardware/acquisition}/visa.py +33 -11
- oscura/hardware/firmware/__init__.py +29 -0
- oscura/hardware/firmware/pattern_recognition.py +874 -0
- oscura/hardware/hal_detector.py +736 -0
- oscura/hardware/security/__init__.py +37 -0
- oscura/hardware/security/side_channel_detector.py +1126 -0
- oscura/inference/__init__.py +4 -0
- oscura/inference/active_learning/observation_table.py +4 -1
- oscura/inference/alignment.py +216 -123
- oscura/inference/bayesian.py +113 -33
- oscura/inference/crc_reverse.py +101 -55
- oscura/inference/logic.py +6 -2
- oscura/inference/message_format.py +342 -183
- oscura/inference/protocol.py +95 -44
- oscura/inference/protocol_dsl.py +180 -82
- oscura/inference/signal_intelligence.py +1439 -706
- oscura/inference/spectral.py +99 -57
- oscura/inference/state_machine.py +810 -158
- oscura/inference/stream.py +270 -110
- oscura/iot/__init__.py +34 -0
- oscura/iot/coap/__init__.py +32 -0
- oscura/iot/coap/analyzer.py +668 -0
- oscura/iot/coap/options.py +212 -0
- oscura/iot/lorawan/__init__.py +21 -0
- oscura/iot/lorawan/crypto.py +206 -0
- oscura/iot/lorawan/decoder.py +801 -0
- oscura/iot/lorawan/mac_commands.py +341 -0
- oscura/iot/mqtt/__init__.py +27 -0
- oscura/iot/mqtt/analyzer.py +999 -0
- oscura/iot/mqtt/properties.py +315 -0
- oscura/iot/zigbee/__init__.py +31 -0
- oscura/iot/zigbee/analyzer.py +615 -0
- oscura/iot/zigbee/security.py +153 -0
- oscura/iot/zigbee/zcl.py +349 -0
- oscura/jupyter/display.py +125 -45
- oscura/{exploratory → jupyter/exploratory}/__init__.py +8 -8
- oscura/{exploratory → jupyter/exploratory}/error_recovery.py +298 -141
- oscura/jupyter/exploratory/fuzzy.py +746 -0
- oscura/{exploratory → jupyter/exploratory}/fuzzy_advanced.py +258 -100
- oscura/{exploratory → jupyter/exploratory}/legacy.py +464 -242
- oscura/{exploratory → jupyter/exploratory}/parse.py +167 -145
- oscura/{exploratory → jupyter/exploratory}/recovery.py +119 -87
- oscura/jupyter/exploratory/sync.py +612 -0
- oscura/{exploratory → jupyter/exploratory}/unknown.py +299 -176
- oscura/jupyter/magic.py +4 -4
- oscura/{ui → jupyter/ui}/__init__.py +2 -2
- oscura/{ui → jupyter/ui}/formatters.py +3 -3
- oscura/{ui → jupyter/ui}/progressive_display.py +153 -82
- oscura/loaders/__init__.py +183 -67
- oscura/loaders/binary.py +88 -1
- oscura/loaders/chipwhisperer.py +153 -137
- oscura/loaders/configurable.py +208 -86
- oscura/loaders/csv_loader.py +458 -215
- oscura/loaders/hdf5_loader.py +278 -119
- oscura/loaders/lazy.py +87 -54
- oscura/loaders/mmap_loader.py +1 -1
- oscura/loaders/numpy_loader.py +253 -116
- oscura/loaders/pcap.py +226 -151
- oscura/loaders/rigol.py +110 -49
- oscura/loaders/sigrok.py +201 -78
- oscura/loaders/tdms.py +81 -58
- oscura/loaders/tektronix.py +291 -174
- oscura/loaders/touchstone.py +182 -87
- oscura/loaders/tss.py +456 -0
- oscura/loaders/vcd.py +215 -117
- oscura/loaders/wav.py +155 -68
- oscura/reporting/__init__.py +9 -0
- oscura/reporting/analyze.py +352 -146
- oscura/reporting/argument_preparer.py +69 -14
- oscura/reporting/auto_report.py +97 -61
- oscura/reporting/batch.py +131 -58
- oscura/reporting/chart_selection.py +57 -45
- oscura/reporting/comparison.py +63 -17
- oscura/reporting/content/executive.py +76 -24
- oscura/reporting/core_formats/multi_format.py +11 -8
- oscura/reporting/engine.py +312 -158
- oscura/reporting/enhanced_reports.py +949 -0
- oscura/reporting/export.py +86 -43
- oscura/reporting/formatting/numbers.py +69 -42
- oscura/reporting/html.py +139 -58
- oscura/reporting/index.py +137 -65
- oscura/reporting/output.py +158 -67
- oscura/reporting/pdf.py +67 -102
- oscura/reporting/plots.py +191 -112
- oscura/reporting/sections.py +88 -47
- oscura/reporting/standards.py +104 -61
- oscura/reporting/summary_generator.py +75 -55
- oscura/reporting/tables.py +138 -54
- oscura/reporting/templates/enhanced/protocol_re.html +525 -0
- oscura/sessions/__init__.py +14 -23
- oscura/sessions/base.py +3 -3
- oscura/sessions/blackbox.py +106 -10
- oscura/sessions/generic.py +2 -2
- oscura/sessions/legacy.py +783 -0
- oscura/side_channel/__init__.py +63 -0
- oscura/side_channel/dpa.py +1025 -0
- oscura/utils/__init__.py +15 -1
- oscura/utils/bitwise.py +118 -0
- oscura/{builders → utils/builders}/__init__.py +1 -1
- oscura/{comparison → utils/comparison}/__init__.py +6 -6
- oscura/{comparison → utils/comparison}/compare.py +202 -101
- oscura/{comparison → utils/comparison}/golden.py +83 -63
- oscura/{comparison → utils/comparison}/limits.py +313 -89
- oscura/{comparison → utils/comparison}/mask.py +151 -45
- oscura/{comparison → utils/comparison}/trace_diff.py +1 -1
- oscura/{comparison → utils/comparison}/visualization.py +147 -89
- oscura/{component → utils/component}/__init__.py +3 -3
- oscura/{component → utils/component}/impedance.py +122 -58
- oscura/{component → utils/component}/reactive.py +165 -168
- oscura/{component → utils/component}/transmission_line.py +3 -3
- oscura/{filtering → utils/filtering}/__init__.py +6 -6
- oscura/{filtering → utils/filtering}/base.py +1 -1
- oscura/{filtering → utils/filtering}/convenience.py +2 -2
- oscura/{filtering → utils/filtering}/design.py +169 -93
- oscura/{filtering → utils/filtering}/filters.py +2 -2
- oscura/{filtering → utils/filtering}/introspection.py +2 -2
- oscura/utils/geometry.py +31 -0
- oscura/utils/imports.py +184 -0
- oscura/utils/lazy.py +1 -1
- oscura/{math → utils/math}/__init__.py +2 -2
- oscura/{math → utils/math}/arithmetic.py +114 -48
- oscura/{math → utils/math}/interpolation.py +139 -106
- oscura/utils/memory.py +129 -66
- oscura/utils/memory_advanced.py +92 -9
- oscura/utils/memory_extensions.py +10 -8
- oscura/{optimization → utils/optimization}/__init__.py +1 -1
- oscura/{optimization → utils/optimization}/search.py +2 -2
- oscura/utils/performance/__init__.py +58 -0
- oscura/utils/performance/caching.py +889 -0
- oscura/utils/performance/lsh_clustering.py +333 -0
- oscura/utils/performance/memory_optimizer.py +699 -0
- oscura/utils/performance/optimizations.py +675 -0
- oscura/utils/performance/parallel.py +654 -0
- oscura/utils/performance/profiling.py +661 -0
- oscura/{pipeline → utils/pipeline}/base.py +1 -1
- oscura/{pipeline → utils/pipeline}/composition.py +1 -1
- oscura/{pipeline → utils/pipeline}/parallel.py +3 -2
- oscura/{pipeline → utils/pipeline}/pipeline.py +1 -1
- oscura/{pipeline → utils/pipeline}/reverse_engineering.py +412 -221
- oscura/{search → utils/search}/__init__.py +3 -3
- oscura/{search → utils/search}/anomaly.py +188 -58
- oscura/utils/search/context.py +294 -0
- oscura/{search → utils/search}/pattern.py +138 -10
- oscura/utils/serial.py +51 -0
- oscura/utils/storage/__init__.py +61 -0
- oscura/utils/storage/database.py +1166 -0
- oscura/{streaming → utils/streaming}/chunked.py +302 -143
- oscura/{streaming → utils/streaming}/progressive.py +1 -1
- oscura/{streaming → utils/streaming}/realtime.py +3 -2
- oscura/{triggering → utils/triggering}/__init__.py +6 -6
- oscura/{triggering → utils/triggering}/base.py +6 -6
- oscura/{triggering → utils/triggering}/edge.py +2 -2
- oscura/{triggering → utils/triggering}/pattern.py +2 -2
- oscura/{triggering → utils/triggering}/pulse.py +115 -74
- oscura/{triggering → utils/triggering}/window.py +2 -2
- oscura/utils/validation.py +32 -0
- oscura/validation/__init__.py +121 -0
- oscura/{compliance → validation/compliance}/__init__.py +5 -5
- oscura/{compliance → validation/compliance}/advanced.py +5 -5
- oscura/{compliance → validation/compliance}/masks.py +1 -1
- oscura/{compliance → validation/compliance}/reporting.py +127 -53
- oscura/{compliance → validation/compliance}/testing.py +114 -52
- oscura/validation/compliance_tests.py +915 -0
- oscura/validation/fuzzer.py +990 -0
- oscura/validation/grammar_tests.py +596 -0
- oscura/validation/grammar_validator.py +904 -0
- oscura/validation/hil_testing.py +977 -0
- oscura/{quality → validation/quality}/__init__.py +4 -4
- oscura/{quality → validation/quality}/ensemble.py +251 -171
- oscura/{quality → validation/quality}/explainer.py +3 -3
- oscura/{quality → validation/quality}/scoring.py +1 -1
- oscura/{quality → validation/quality}/warnings.py +4 -4
- oscura/validation/regression_suite.py +808 -0
- oscura/validation/replay.py +788 -0
- oscura/{testing → validation/testing}/__init__.py +2 -2
- oscura/{testing → validation/testing}/synthetic.py +5 -5
- oscura/visualization/__init__.py +9 -0
- oscura/visualization/accessibility.py +1 -1
- oscura/visualization/annotations.py +64 -67
- oscura/visualization/colors.py +7 -7
- oscura/visualization/digital.py +180 -81
- oscura/visualization/eye.py +236 -85
- oscura/visualization/interactive.py +320 -143
- oscura/visualization/jitter.py +587 -247
- oscura/visualization/layout.py +169 -134
- oscura/visualization/optimization.py +103 -52
- oscura/visualization/palettes.py +1 -1
- oscura/visualization/power.py +427 -211
- oscura/visualization/power_extended.py +626 -297
- oscura/visualization/presets.py +2 -0
- oscura/visualization/protocols.py +495 -181
- oscura/visualization/render.py +79 -63
- oscura/visualization/reverse_engineering.py +171 -124
- oscura/visualization/signal_integrity.py +460 -279
- oscura/visualization/specialized.py +190 -100
- oscura/visualization/spectral.py +670 -255
- oscura/visualization/thumbnails.py +166 -137
- oscura/visualization/waveform.py +150 -63
- oscura/workflows/__init__.py +3 -0
- oscura/{batch → workflows/batch}/__init__.py +5 -5
- oscura/{batch → workflows/batch}/advanced.py +150 -75
- oscura/workflows/batch/aggregate.py +531 -0
- oscura/workflows/batch/analyze.py +236 -0
- oscura/{batch → workflows/batch}/logging.py +2 -2
- oscura/{batch → workflows/batch}/metrics.py +1 -1
- oscura/workflows/complete_re.py +1144 -0
- oscura/workflows/compliance.py +44 -54
- oscura/workflows/digital.py +197 -51
- oscura/workflows/legacy/__init__.py +12 -0
- oscura/{workflow → workflows/legacy}/dag.py +4 -1
- oscura/workflows/multi_trace.py +9 -9
- oscura/workflows/power.py +42 -62
- oscura/workflows/protocol.py +82 -49
- oscura/workflows/reverse_engineering.py +351 -150
- oscura/workflows/signal_integrity.py +157 -82
- oscura-0.7.0.dist-info/METADATA +661 -0
- oscura-0.7.0.dist-info/RECORD +591 -0
- oscura/batch/aggregate.py +0 -300
- oscura/batch/analyze.py +0 -139
- oscura/dsl/__init__.py +0 -73
- oscura/exceptions.py +0 -59
- oscura/exploratory/fuzzy.py +0 -513
- oscura/exploratory/sync.py +0 -384
- oscura/exporters/__init__.py +0 -94
- oscura/exporters/csv.py +0 -303
- oscura/exporters/exporters.py +0 -44
- oscura/exporters/hdf5.py +0 -217
- oscura/exporters/html_export.py +0 -701
- oscura/exporters/json_export.py +0 -291
- oscura/exporters/markdown_export.py +0 -367
- oscura/exporters/matlab_export.py +0 -354
- oscura/exporters/npz_export.py +0 -219
- oscura/exporters/spice_export.py +0 -210
- oscura/search/context.py +0 -149
- oscura/session/__init__.py +0 -34
- oscura/session/annotations.py +0 -289
- oscura/session/history.py +0 -313
- oscura/session/session.py +0 -520
- oscura/workflow/__init__.py +0 -13
- oscura-0.5.1.dist-info/METADATA +0 -583
- oscura-0.5.1.dist-info/RECORD +0 -481
- /oscura/core/{config.py → config/legacy.py} +0 -0
- /oscura/{extensibility → core/extensibility}/__init__.py +0 -0
- /oscura/{extensibility → core/extensibility}/registry.py +0 -0
- /oscura/{plugins → core/plugins}/isolation.py +0 -0
- /oscura/{schemas → core/schemas}/bus_configuration.json +0 -0
- /oscura/{builders → utils/builders}/signal_builder.py +0 -0
- /oscura/{optimization → utils/optimization}/parallel.py +0 -0
- /oscura/{pipeline → utils/pipeline}/__init__.py +0 -0
- /oscura/{streaming → utils/streaming}/__init__.py +0 -0
- {oscura-0.5.1.dist-info → oscura-0.7.0.dist-info}/WHEEL +0 -0
- {oscura-0.5.1.dist-info → oscura-0.7.0.dist-info}/entry_points.txt +0 -0
- {oscura-0.5.1.dist-info → oscura-0.7.0.dist-info}/licenses/LICENSE +0 -0
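
Most of the renames above move formerly top-level packages under `api/`, `core/`, `hardware/`, `jupyter/`, `utils/`, `validation/`, and `workflows/`. A minimal before/after sketch for three of the listed moves, assuming imports track the new file locations directly; whether 0.7.0 ships compatibility shims at the old paths is not shown in this diff:

```python
# Hypothetical migration sketch based solely on the {old -> new} entries above.

# 0.5.1 layout:
# from oscura.dsl import parser
# from oscura.config import loader
# from oscura.acquisition import saleae

# 0.7.0 layout:
from oscura.api.dsl import parser            # oscura/{dsl -> api/dsl}/parser.py
from oscura.core.config import loader        # oscura/{config -> core/config}/loader.py
from oscura.hardware.acquisition import saleae  # oscura/{acquisition -> hardware/acquisition}/saleae.py
```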
|
@@ -8,2137 +8,80 @@
|
|
|
8
8
|
|
|
9
9
|
This module provides comprehensive payload extraction from PCAP packets,
|
|
10
10
|
pattern search capabilities, delimiter detection, and comparison tools.
|
|
11
|
+
|
|
12
|
+
This is the public API module that re-exports functionality from specialized modules:
|
|
13
|
+
- payload_analysis: Field inference, diff, clustering
|
|
14
|
+
- payload_patterns: Pattern search, delimiters, boundaries
|
|
15
|
+
- payload_extraction: Payload extraction utilities
|
|
11
16
|
"""
|
|
12
17
|
|
|
13
18
|
from __future__ import annotations
|
|
14
19
|
|
|
15
|
-
|
|
16
|
-
import
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
is_fragment: bool = False
|
|
56
|
-
fragment_offset: int = 0
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
@dataclass
|
|
60
|
-
class PatternMatch:
|
|
61
|
-
"""Pattern match result.
|
|
62
|
-
|
|
63
|
-
Implements RE-PAY-002: Pattern match with location info.
|
|
64
|
-
|
|
65
|
-
Attributes:
|
|
66
|
-
pattern_name: Name of matched pattern.
|
|
67
|
-
offset: Byte offset within payload.
|
|
68
|
-
matched: Matched bytes.
|
|
69
|
-
packet_index: Source packet index.
|
|
70
|
-
context: Surrounding bytes for context.
|
|
71
|
-
"""
|
|
72
|
-
|
|
73
|
-
pattern_name: str
|
|
74
|
-
offset: int
|
|
75
|
-
matched: bytes
|
|
76
|
-
packet_index: int
|
|
77
|
-
context: bytes = b""
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
@dataclass
|
|
81
|
-
class DelimiterResult:
|
|
82
|
-
"""Detected delimiter information.
|
|
83
|
-
|
|
84
|
-
Implements RE-PAY-003: Delimiter detection result.
|
|
85
|
-
|
|
86
|
-
Attributes:
|
|
87
|
-
delimiter: Detected delimiter bytes.
|
|
88
|
-
delimiter_type: Type of delimiter (fixed, length_prefix, pattern).
|
|
89
|
-
confidence: Detection confidence (0-1).
|
|
90
|
-
occurrences: Number of occurrences found.
|
|
91
|
-
positions: List of positions where delimiter found.
|
|
92
|
-
"""
|
|
93
|
-
|
|
94
|
-
delimiter: bytes
|
|
95
|
-
delimiter_type: Literal["fixed", "length_prefix", "pattern"]
|
|
96
|
-
confidence: float
|
|
97
|
-
occurrences: int
|
|
98
|
-
positions: list[int] = field(default_factory=list)
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
@dataclass
|
|
102
|
-
class LengthPrefixResult:
|
|
103
|
-
"""Length prefix detection result.
|
|
104
|
-
|
|
105
|
-
Implements RE-PAY-003: Length prefix format detection.
|
|
106
|
-
|
|
107
|
-
Attributes:
|
|
108
|
-
detected: Whether length prefix was detected.
|
|
109
|
-
length_bytes: Number of bytes for length field.
|
|
110
|
-
endian: Endianness (big or little).
|
|
111
|
-
offset: Offset of length field from message start.
|
|
112
|
-
includes_length: Whether length includes the length field itself.
|
|
113
|
-
confidence: Detection confidence (0-1).
|
|
114
|
-
"""
|
|
115
|
-
|
|
116
|
-
detected: bool
|
|
117
|
-
length_bytes: int = 0
|
|
118
|
-
endian: Literal["big", "little"] = "big"
|
|
119
|
-
offset: int = 0
|
|
120
|
-
includes_length: bool = False
|
|
121
|
-
confidence: float = 0.0
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
@dataclass
|
|
125
|
-
class MessageBoundary:
|
|
126
|
-
"""Message boundary information.
|
|
127
|
-
|
|
128
|
-
Implements RE-PAY-003: Message boundary detection.
|
|
129
|
-
|
|
130
|
-
Attributes:
|
|
131
|
-
start: Start offset of message.
|
|
132
|
-
end: End offset of message.
|
|
133
|
-
length: Message length.
|
|
134
|
-
data: Message data.
|
|
135
|
-
index: Message index.
|
|
136
|
-
"""
|
|
137
|
-
|
|
138
|
-
start: int
|
|
139
|
-
end: int
|
|
140
|
-
length: int
|
|
141
|
-
data: bytes
|
|
142
|
-
index: int
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
@dataclass
|
|
146
|
-
class PayloadDiff:
|
|
147
|
-
"""Difference between two payloads.
|
|
148
|
-
|
|
149
|
-
Implements RE-PAY-005: Payload comparison result.
|
|
150
|
-
|
|
151
|
-
Attributes:
|
|
152
|
-
common_prefix_length: Length of common prefix.
|
|
153
|
-
common_suffix_length: Length of common suffix.
|
|
154
|
-
differences: List of (offset, byte_a, byte_b) for differences.
|
|
155
|
-
similarity: Similarity score (0-1).
|
|
156
|
-
edit_distance: Levenshtein edit distance.
|
|
157
|
-
"""
|
|
158
|
-
|
|
159
|
-
common_prefix_length: int
|
|
160
|
-
common_suffix_length: int
|
|
161
|
-
differences: list[tuple[int, int, int]]
|
|
162
|
-
similarity: float
|
|
163
|
-
edit_distance: int
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
@dataclass
|
|
167
|
-
class VariablePositions:
|
|
168
|
-
"""Analysis of which byte positions vary across payloads.
|
|
169
|
-
|
|
170
|
-
Implements RE-PAY-005: Variable position analysis.
|
|
171
|
-
|
|
172
|
-
Attributes:
|
|
173
|
-
constant_positions: Positions that are constant.
|
|
174
|
-
variable_positions: Positions that vary.
|
|
175
|
-
constant_values: Values at constant positions.
|
|
176
|
-
variance_by_position: Variance at each position.
|
|
177
|
-
"""
|
|
178
|
-
|
|
179
|
-
constant_positions: list[int]
|
|
180
|
-
variable_positions: list[int]
|
|
181
|
-
constant_values: dict[int, int]
|
|
182
|
-
variance_by_position: np.ndarray[tuple[int], np.dtype[np.float64]]
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
@dataclass
|
|
186
|
-
class PayloadCluster:
|
|
187
|
-
"""Cluster of similar payloads.
|
|
188
|
-
|
|
189
|
-
Implements RE-PAY-005: Payload clustering result.
|
|
190
|
-
|
|
191
|
-
Attributes:
|
|
192
|
-
cluster_id: Cluster identifier.
|
|
193
|
-
payloads: List of payload data in cluster.
|
|
194
|
-
indices: Original indices of payloads.
|
|
195
|
-
representative: Representative payload (centroid).
|
|
196
|
-
size: Number of payloads in cluster.
|
|
197
|
-
"""
|
|
198
|
-
|
|
199
|
-
cluster_id: int
|
|
200
|
-
payloads: list[bytes]
|
|
201
|
-
indices: list[int]
|
|
202
|
-
representative: bytes
|
|
203
|
-
size: int
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
# =============================================================================
|
|
207
|
-
# RE-PAY-004: Payload Field Inference
|
|
208
|
-
# =============================================================================
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
@dataclass
|
|
212
|
-
class InferredField:
|
|
213
|
-
"""Inferred field from binary payload.
|
|
214
|
-
|
|
215
|
-
Implements RE-PAY-004: Inferred field structure.
|
|
216
|
-
|
|
217
|
-
Attributes:
|
|
218
|
-
name: Field name (auto-generated).
|
|
219
|
-
offset: Byte offset within message.
|
|
220
|
-
size: Field size in bytes.
|
|
221
|
-
inferred_type: Inferred data type.
|
|
222
|
-
endianness: Detected endianness.
|
|
223
|
-
is_constant: Whether field is constant across messages.
|
|
224
|
-
is_sequence: Whether field appears to be a counter/sequence.
|
|
225
|
-
is_checksum: Whether field appears to be a checksum.
|
|
226
|
-
constant_value: Value if constant.
|
|
227
|
-
confidence: Inference confidence (0-1).
|
|
228
|
-
sample_values: Sample values from messages.
|
|
229
|
-
"""
|
|
230
|
-
|
|
231
|
-
name: str
|
|
232
|
-
offset: int
|
|
233
|
-
size: int
|
|
234
|
-
inferred_type: Literal[
|
|
235
|
-
"uint8",
|
|
236
|
-
"uint16",
|
|
237
|
-
"uint32",
|
|
238
|
-
"uint64",
|
|
239
|
-
"int8",
|
|
240
|
-
"int16",
|
|
241
|
-
"int32",
|
|
242
|
-
"int64",
|
|
243
|
-
"float32",
|
|
244
|
-
"float64",
|
|
245
|
-
"bytes",
|
|
246
|
-
"string",
|
|
247
|
-
"unknown",
|
|
248
|
-
]
|
|
249
|
-
endianness: Literal["big", "little", "n/a"] = "n/a"
|
|
250
|
-
is_constant: bool = False
|
|
251
|
-
is_sequence: bool = False
|
|
252
|
-
is_checksum: bool = False
|
|
253
|
-
constant_value: bytes | None = None
|
|
254
|
-
confidence: float = 0.5
|
|
255
|
-
sample_values: list[Any] = field(default_factory=list)
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
@dataclass
|
|
259
|
-
class MessageSchema:
|
|
260
|
-
"""Inferred message schema.
|
|
261
|
-
|
|
262
|
-
Implements RE-PAY-004: Complete message schema.
|
|
263
|
-
|
|
264
|
-
Attributes:
|
|
265
|
-
fields: List of inferred fields.
|
|
266
|
-
message_length: Total message length.
|
|
267
|
-
fixed_length: Whether all messages have same length.
|
|
268
|
-
length_range: (min, max) length range.
|
|
269
|
-
sample_count: Number of samples analyzed.
|
|
270
|
-
confidence: Overall schema confidence.
|
|
271
|
-
"""
|
|
272
|
-
|
|
273
|
-
fields: list[InferredField]
|
|
274
|
-
message_length: int
|
|
275
|
-
fixed_length: bool
|
|
276
|
-
length_range: tuple[int, int]
|
|
277
|
-
sample_count: int
|
|
278
|
-
confidence: float
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
class FieldInferrer:
|
|
282
|
-
"""Infer field structure within binary payloads.
|
|
283
|
-
|
|
284
|
-
Implements RE-PAY-004: Payload Field Inference.
|
|
285
|
-
|
|
286
|
-
Uses statistical analysis, alignment detection, and type inference
|
|
287
|
-
to reconstruct message formats from binary payload samples.
|
|
288
|
-
|
|
289
|
-
Example:
|
|
290
|
-
>>> inferrer = FieldInferrer()
|
|
291
|
-
>>> messages = [pkt.data for pkt in udp_packets]
|
|
292
|
-
>>> schema = inferrer.infer_fields(messages)
|
|
293
|
-
>>> for field in schema.fields:
|
|
294
|
-
... print(f"{field.name}: {field.inferred_type} at offset {field.offset}")
|
|
295
|
-
"""
|
|
296
|
-
|
|
297
|
-
def __init__(
|
|
298
|
-
self,
|
|
299
|
-
min_samples: int = 10,
|
|
300
|
-
entropy_threshold: float = 0.5,
|
|
301
|
-
sequence_threshold: int = 3,
|
|
302
|
-
) -> None:
|
|
303
|
-
"""Initialize field inferrer.
|
|
304
|
-
|
|
305
|
-
Args:
|
|
306
|
-
min_samples: Minimum samples for reliable inference.
|
|
307
|
-
entropy_threshold: Entropy change threshold for boundary detection.
|
|
308
|
-
sequence_threshold: Minimum consecutive incrementing values for sequence.
|
|
309
|
-
"""
|
|
310
|
-
self.min_samples = min_samples
|
|
311
|
-
self.entropy_threshold = entropy_threshold
|
|
312
|
-
self.sequence_threshold = sequence_threshold
|
|
313
|
-
|
|
314
|
-
def infer_fields(
|
|
315
|
-
self,
|
|
316
|
-
messages: Sequence[bytes],
|
|
317
|
-
min_samples: int | None = None,
|
|
318
|
-
) -> MessageSchema:
|
|
319
|
-
"""Infer field structure from message samples.
|
|
320
|
-
|
|
321
|
-
Implements RE-PAY-004: Complete field inference.
|
|
322
|
-
|
|
323
|
-
Args:
|
|
324
|
-
messages: List of binary message samples.
|
|
325
|
-
min_samples: Override minimum sample count.
|
|
326
|
-
|
|
327
|
-
Returns:
|
|
328
|
-
MessageSchema with inferred field structure.
|
|
329
|
-
|
|
330
|
-
Example:
|
|
331
|
-
>>> schema = inferrer.infer_fields(messages)
|
|
332
|
-
>>> print(f"Detected {len(schema.fields)} fields")
|
|
333
|
-
"""
|
|
334
|
-
if not messages:
|
|
335
|
-
return MessageSchema(
|
|
336
|
-
fields=[],
|
|
337
|
-
message_length=0,
|
|
338
|
-
fixed_length=True,
|
|
339
|
-
length_range=(0, 0),
|
|
340
|
-
sample_count=0,
|
|
341
|
-
confidence=0.0,
|
|
342
|
-
)
|
|
343
|
-
|
|
344
|
-
min_samples = min_samples or self.min_samples
|
|
345
|
-
lengths = [len(m) for m in messages]
|
|
346
|
-
min_len = min(lengths)
|
|
347
|
-
max_len = max(lengths)
|
|
348
|
-
fixed_length = min_len == max_len
|
|
349
|
-
|
|
350
|
-
# Use shortest message length for analysis
|
|
351
|
-
analysis_length = min_len
|
|
352
|
-
|
|
353
|
-
# Find field boundaries using entropy transitions
|
|
354
|
-
boundaries = self._detect_field_boundaries(messages, analysis_length)
|
|
355
|
-
|
|
356
|
-
# Infer field types for each segment
|
|
357
|
-
fields = []
|
|
358
|
-
for i, (start, end) in enumerate(boundaries):
|
|
359
|
-
field = self._infer_field(messages, start, end, i)
|
|
360
|
-
fields.append(field)
|
|
361
|
-
|
|
362
|
-
# Calculate overall confidence
|
|
363
|
-
if fields:
|
|
364
|
-
confidence = sum(f.confidence for f in fields) / len(fields)
|
|
365
|
-
else:
|
|
366
|
-
confidence = 0.0
|
|
367
|
-
|
|
368
|
-
return MessageSchema(
|
|
369
|
-
fields=fields,
|
|
370
|
-
message_length=analysis_length,
|
|
371
|
-
fixed_length=fixed_length,
|
|
372
|
-
length_range=(min_len, max_len),
|
|
373
|
-
sample_count=len(messages),
|
|
374
|
-
confidence=confidence,
|
|
375
|
-
)
|
|
376
|
-
|
|
377
|
-
def detect_field_types(
|
|
378
|
-
self,
|
|
379
|
-
messages: Sequence[bytes],
|
|
380
|
-
boundaries: list[tuple[int, int]],
|
|
381
|
-
) -> list[InferredField]:
|
|
382
|
-
"""Detect field types for given boundaries.
|
|
383
|
-
|
|
384
|
-
Implements RE-PAY-004: Field type detection.
|
|
385
|
-
|
|
386
|
-
Args:
|
|
387
|
-
messages: Message samples.
|
|
388
|
-
boundaries: List of (start, end) field boundaries.
|
|
389
|
-
|
|
390
|
-
Returns:
|
|
391
|
-
List of InferredField with type information.
|
|
392
|
-
"""
|
|
393
|
-
fields = []
|
|
394
|
-
for i, (start, end) in enumerate(boundaries):
|
|
395
|
-
field = self._infer_field(messages, start, end, i)
|
|
396
|
-
fields.append(field)
|
|
397
|
-
return fields
|
|
398
|
-
|
|
399
|
-
def find_sequence_fields(
|
|
400
|
-
self,
|
|
401
|
-
messages: Sequence[bytes],
|
|
402
|
-
) -> list[tuple[int, int]]:
|
|
403
|
-
"""Find fields that appear to be sequence/counter values.
|
|
404
|
-
|
|
405
|
-
Implements RE-PAY-004: Sequence field detection.
|
|
406
|
-
|
|
407
|
-
Args:
|
|
408
|
-
messages: Message samples (should be in order).
|
|
409
|
-
|
|
410
|
-
Returns:
|
|
411
|
-
List of (offset, size) for sequence fields.
|
|
412
|
-
|
|
413
|
-
Raises:
|
|
414
|
-
ValueError: If messages are too short for field extraction.
|
|
415
|
-
"""
|
|
416
|
-
if len(messages) < self.sequence_threshold:
|
|
417
|
-
return []
|
|
418
|
-
|
|
419
|
-
min_len = min(len(m) for m in messages)
|
|
420
|
-
sequence_fields = []
|
|
421
|
-
|
|
422
|
-
# Check each possible field size at each offset
|
|
423
|
-
for size in [1, 2, 4]:
|
|
424
|
-
for offset in range(min_len - size + 1):
|
|
425
|
-
values = []
|
|
426
|
-
try:
|
|
427
|
-
for msg in messages:
|
|
428
|
-
# Validate message length before slicing
|
|
429
|
-
if len(msg) < offset + size:
|
|
430
|
-
raise ValueError(
|
|
431
|
-
f"Message too short: expected at least {offset + size} bytes, "
|
|
432
|
-
f"got {len(msg)} bytes"
|
|
433
|
-
)
|
|
434
|
-
# Try both endianness
|
|
435
|
-
val_be = int.from_bytes(msg[offset : offset + size], "big")
|
|
436
|
-
values.append(val_be)
|
|
437
|
-
|
|
438
|
-
if self._is_sequence(values):
|
|
439
|
-
sequence_fields.append((offset, size))
|
|
440
|
-
except (ValueError, IndexError) as e:
|
|
441
|
-
# Skip this offset/size combination if extraction fails
|
|
442
|
-
logger.debug(f"Skipping field at offset={offset}, size={size}: {e}")
|
|
443
|
-
continue
|
|
444
|
-
|
|
445
|
-
return sequence_fields
|
|
446
|
-
|
|
447
|
-
def find_checksum_fields(
|
|
448
|
-
self,
|
|
449
|
-
messages: Sequence[bytes],
|
|
450
|
-
) -> list[tuple[int, int, str]]:
|
|
451
|
-
"""Find fields that appear to be checksums.
|
|
452
|
-
|
|
453
|
-
Implements RE-PAY-004: Checksum field detection.
|
|
454
|
-
|
|
455
|
-
Args:
|
|
456
|
-
messages: Message samples.
|
|
457
|
-
|
|
458
|
-
Returns:
|
|
459
|
-
List of (offset, size, algorithm_hint) for checksum fields.
|
|
460
|
-
|
|
461
|
-
Raises:
|
|
462
|
-
ValueError: If checksum field validation fails.
|
|
463
|
-
"""
|
|
464
|
-
if len(messages) < 5:
|
|
465
|
-
return []
|
|
466
|
-
|
|
467
|
-
min_len = min(len(m) for m in messages)
|
|
468
|
-
checksum_fields = []
|
|
469
|
-
|
|
470
|
-
# Common checksum sizes and positions
|
|
471
|
-
for size in [1, 2, 4]:
|
|
472
|
-
# Check last position (most common)
|
|
473
|
-
for offset in [min_len - size, 0]:
|
|
474
|
-
if offset < 0:
|
|
475
|
-
continue
|
|
476
|
-
|
|
477
|
-
try:
|
|
478
|
-
# Validate offset and size before processing
|
|
479
|
-
if offset + size > min_len:
|
|
480
|
-
raise ValueError(
|
|
481
|
-
f"Invalid checksum field: offset={offset} + size={size} exceeds "
|
|
482
|
-
f"minimum message length={min_len}"
|
|
483
|
-
)
|
|
484
|
-
|
|
485
|
-
# Extract field values and message content
|
|
486
|
-
score = self._check_checksum_correlation(messages, offset, size)
|
|
487
|
-
|
|
488
|
-
if score > 0.8:
|
|
489
|
-
algorithm = self._guess_checksum_algorithm(messages, offset, size)
|
|
490
|
-
checksum_fields.append((offset, size, algorithm))
|
|
491
|
-
except (ValueError, IndexError) as e:
|
|
492
|
-
# Skip this offset/size combination if validation fails
|
|
493
|
-
logger.debug(f"Skipping checksum field at offset={offset}, size={size}: {e}")
|
|
494
|
-
continue
|
|
495
|
-
|
|
496
|
-
return checksum_fields
|
|
497
|
-
|
|
498
|
-
def _detect_field_boundaries(
|
|
499
|
-
self,
|
|
500
|
-
messages: Sequence[bytes],
|
|
501
|
-
max_length: int,
|
|
502
|
-
) -> list[tuple[int, int]]:
|
|
503
|
-
"""Detect field boundaries using entropy analysis.
|
|
504
|
-
|
|
505
|
-
Args:
|
|
506
|
-
messages: Message samples.
|
|
507
|
-
max_length: Maximum length to analyze.
|
|
508
|
-
|
|
509
|
-
Returns:
|
|
510
|
-
List of (start, end) boundaries.
|
|
511
|
-
"""
|
|
512
|
-
if max_length == 0:
|
|
513
|
-
return []
|
|
514
|
-
|
|
515
|
-
# Calculate per-byte entropy
|
|
516
|
-
byte_entropies = []
|
|
517
|
-
for pos in range(max_length):
|
|
518
|
-
values = [m[pos] for m in messages if len(m) > pos]
|
|
519
|
-
if len(values) < 2:
|
|
520
|
-
byte_entropies.append(0.0)
|
|
521
|
-
continue
|
|
522
|
-
|
|
523
|
-
counts = Counter(values)
|
|
524
|
-
total = len(values)
|
|
525
|
-
entropy = 0.0
|
|
526
|
-
for count in counts.values():
|
|
527
|
-
if count > 0:
|
|
528
|
-
p = count / total
|
|
529
|
-
entropy -= p * np.log2(p)
|
|
530
|
-
byte_entropies.append(entropy)
|
|
531
|
-
|
|
532
|
-
# Find boundaries at entropy transitions
|
|
533
|
-
boundaries = []
|
|
534
|
-
current_start = 0
|
|
535
|
-
|
|
536
|
-
for i in range(1, len(byte_entropies)):
|
|
537
|
-
delta = abs(byte_entropies[i] - byte_entropies[i - 1])
|
|
538
|
-
|
|
539
|
-
# Also check for constant vs variable patterns
|
|
540
|
-
if delta > self.entropy_threshold:
|
|
541
|
-
if i > current_start:
|
|
542
|
-
boundaries.append((current_start, i))
|
|
543
|
-
current_start = i
|
|
544
|
-
|
|
545
|
-
# Add final segment
|
|
546
|
-
if max_length > current_start:
|
|
547
|
-
boundaries.append((current_start, max_length))
|
|
548
|
-
|
|
549
|
-
# Merge very small segments
|
|
550
|
-
merged: list[tuple[int, int]] = []
|
|
551
|
-
for start, end in boundaries:
|
|
552
|
-
if merged and start - merged[-1][1] == 0 and end - start < 2:
|
|
553
|
-
# Merge with previous
|
|
554
|
-
merged[-1] = (merged[-1][0], end)
|
|
555
|
-
else:
|
|
556
|
-
merged.append((start, end))
|
|
557
|
-
|
|
558
|
-
return merged if merged else [(0, max_length)]
|
|
559
|
-
|
|
560
|
-
def _infer_field(
|
|
561
|
-
self,
|
|
562
|
-
messages: Sequence[bytes],
|
|
563
|
-
start: int,
|
|
564
|
-
end: int,
|
|
565
|
-
index: int,
|
|
566
|
-
) -> InferredField:
|
|
567
|
-
"""Infer type for a single field.
|
|
568
|
-
|
|
569
|
-
Args:
|
|
570
|
-
messages: Message samples.
|
|
571
|
-
start: Field start offset.
|
|
572
|
-
end: Field end offset.
|
|
573
|
-
index: Field index for naming.
|
|
574
|
-
|
|
575
|
-
Returns:
|
|
576
|
-
InferredField with inferred type.
|
|
577
|
-
"""
|
|
578
|
-
size = end - start
|
|
579
|
-
name = f"field_{index}"
|
|
580
|
-
|
|
581
|
-
# Extract field values
|
|
582
|
-
values = []
|
|
583
|
-
raw_values = []
|
|
584
|
-
for msg in messages:
|
|
585
|
-
if len(msg) >= end:
|
|
586
|
-
field_bytes = msg[start:end]
|
|
587
|
-
raw_values.append(field_bytes)
|
|
588
|
-
values.append(field_bytes)
|
|
589
|
-
|
|
590
|
-
if not values:
|
|
591
|
-
return InferredField(
|
|
592
|
-
name=name,
|
|
593
|
-
offset=start,
|
|
594
|
-
size=size,
|
|
595
|
-
inferred_type="unknown",
|
|
596
|
-
confidence=0.0,
|
|
597
|
-
)
|
|
598
|
-
|
|
599
|
-
# Check if constant
|
|
600
|
-
unique_values = set(raw_values)
|
|
601
|
-
is_constant = len(unique_values) == 1
|
|
602
|
-
|
|
603
|
-
# Check if sequence
|
|
604
|
-
is_sequence = False
|
|
605
|
-
if not is_constant and size in [1, 2, 4, 8]:
|
|
606
|
-
int_values = [int.from_bytes(v, "big") for v in raw_values]
|
|
607
|
-
is_sequence = self._is_sequence(int_values)
|
|
608
|
-
|
|
609
|
-
# Check for checksum patterns
|
|
610
|
-
is_checksum = False
|
|
611
|
-
if start >= min(len(m) for m in messages) - 4:
|
|
612
|
-
score = self._check_checksum_correlation(messages, start, size)
|
|
613
|
-
is_checksum = score > 0.7
|
|
614
|
-
|
|
615
|
-
# Infer type
|
|
616
|
-
inferred_type, endianness, confidence = self._infer_type(raw_values, size)
|
|
617
|
-
|
|
618
|
-
# Sample values for debugging
|
|
619
|
-
sample_values: list[int | str] = []
|
|
620
|
-
for v in raw_values[:5]:
|
|
621
|
-
if inferred_type.startswith("uint") or inferred_type.startswith("int"):
|
|
622
|
-
try:
|
|
623
|
-
# Cast endianness to Literal type for type checker
|
|
624
|
-
byte_order: Literal["big", "little"] = (
|
|
625
|
-
"big" if endianness == "n/a" else endianness # type: ignore[assignment]
|
|
626
|
-
)
|
|
627
|
-
sample_values.append(int.from_bytes(v, byte_order))
|
|
628
|
-
except Exception:
|
|
629
|
-
sample_values.append(v.hex())
|
|
630
|
-
elif inferred_type == "string":
|
|
631
|
-
try:
|
|
632
|
-
sample_values.append(v.decode("utf-8", errors="replace"))
|
|
633
|
-
except Exception:
|
|
634
|
-
sample_values.append(v.hex())
|
|
635
|
-
else:
|
|
636
|
-
sample_values.append(v.hex())
|
|
637
|
-
|
|
638
|
-
# Cast to Literal types for type checker
|
|
639
|
-
inferred_type_literal: Literal[
|
|
640
|
-
"uint8",
|
|
641
|
-
"uint16",
|
|
642
|
-
"uint32",
|
|
643
|
-
"uint64",
|
|
644
|
-
"int8",
|
|
645
|
-
"int16",
|
|
646
|
-
"int32",
|
|
647
|
-
"int64",
|
|
648
|
-
"float32",
|
|
649
|
-
"float64",
|
|
650
|
-
"bytes",
|
|
651
|
-
"string",
|
|
652
|
-
"unknown",
|
|
653
|
-
] = inferred_type # type: ignore[assignment]
|
|
654
|
-
endianness_literal: Literal["big", "little", "n/a"] = endianness # type: ignore[assignment]
|
|
655
|
-
|
|
656
|
-
return InferredField(
|
|
657
|
-
name=name,
|
|
658
|
-
offset=start,
|
|
659
|
-
size=size,
|
|
660
|
-
inferred_type=inferred_type_literal,
|
|
661
|
-
endianness=endianness_literal,
|
|
662
|
-
is_constant=is_constant,
|
|
663
|
-
is_sequence=is_sequence,
|
|
664
|
-
is_checksum=is_checksum,
|
|
665
|
-
constant_value=raw_values[0] if is_constant else None,
|
|
666
|
-
confidence=confidence,
|
|
667
|
-
sample_values=sample_values,
|
|
668
|
-
)
|
|
669
|
-
|
|
670
|
-
def _infer_type(
|
|
671
|
-
self,
|
|
672
|
-
values: list[bytes],
|
|
673
|
-
size: int,
|
|
674
|
-
) -> tuple[str, str, float]:
|
|
675
|
-
"""Infer data type from values.
|
|
676
|
-
|
|
677
|
-
Args:
|
|
678
|
-
values: Field values.
|
|
679
|
-
size: Field size.
|
|
680
|
-
|
|
681
|
-
Returns:
|
|
682
|
-
Tuple of (type, endianness, confidence).
|
|
683
|
-
"""
|
|
684
|
-
if not values:
|
|
685
|
-
return "unknown", "n/a", 0.0
|
|
686
|
-
|
|
687
|
-
# Check for string (high printable ratio)
|
|
688
|
-
printable_ratio = sum(
|
|
689
|
-
1 for v in values for b in v if 32 <= b <= 126 or b in (9, 10, 13)
|
|
690
|
-
) / (len(values) * size)
|
|
691
|
-
|
|
692
|
-
if printable_ratio > 0.8:
|
|
693
|
-
return "string", "n/a", printable_ratio
|
|
694
|
-
|
|
695
|
-
# Check for standard integer sizes
|
|
696
|
-
if size == 1:
|
|
697
|
-
return "uint8", "n/a", 0.9
|
|
698
|
-
|
|
699
|
-
elif size == 2:
|
|
700
|
-
# Try to detect endianness
|
|
701
|
-
be_variance = np.var([int.from_bytes(v, "big") for v in values])
|
|
702
|
-
le_variance = np.var([int.from_bytes(v, "little") for v in values])
|
|
703
|
-
|
|
704
|
-
if be_variance < le_variance:
|
|
705
|
-
endian = "big"
|
|
706
|
-
else:
|
|
707
|
-
endian = "little"
|
|
708
|
-
|
|
709
|
-
return "uint16", endian, 0.8
|
|
710
|
-
|
|
711
|
-
elif size == 4:
|
|
712
|
-
# Check for float
|
|
713
|
-
float_valid = 0
|
|
714
|
-
for v in values:
|
|
715
|
-
try:
|
|
716
|
-
f = struct.unpack(">f", v)[0]
|
|
717
|
-
if not (np.isnan(f) or np.isinf(f)) and -1e10 < f < 1e10:
|
|
718
|
-
float_valid += 1
|
|
719
|
-
except Exception:
|
|
720
|
-
pass
|
|
721
|
-
|
|
722
|
-
if float_valid / len(values) > 0.8:
|
|
723
|
-
return "float32", "big", 0.7
|
|
724
|
-
|
|
725
|
-
# Otherwise integer
|
|
726
|
-
be_variance = np.var([int.from_bytes(v, "big") for v in values])
|
|
727
|
-
le_variance = np.var([int.from_bytes(v, "little") for v in values])
|
|
728
|
-
endian = "big" if be_variance < le_variance else "little"
|
|
729
|
-
return "uint32", endian, 0.8
|
|
730
|
-
|
|
731
|
-
elif size == 8:
|
|
732
|
-
# Check for float64 or uint64
|
|
733
|
-
be_variance = np.var([int.from_bytes(v, "big") for v in values])
|
|
734
|
-
le_variance = np.var([int.from_bytes(v, "little") for v in values])
|
|
735
|
-
endian = "big" if be_variance < le_variance else "little"
|
|
736
|
-
return "uint64", endian, 0.7
|
|
737
|
-
|
|
738
|
-
else:
|
|
739
|
-
return "bytes", "n/a", 0.6
|
|
740
|
-
|
|
741
|
-
def _is_sequence(self, values: list[int]) -> bool:
|
|
742
|
-
"""Check if values form a sequence.
|
|
743
|
-
|
|
744
|
-
Args:
|
|
745
|
-
values: Integer values.
|
|
746
|
-
|
|
747
|
-
Returns:
|
|
748
|
-
True if values are incrementing/decrementing.
|
|
749
|
-
"""
|
|
750
|
-
if len(values) < self.sequence_threshold:
|
|
751
|
-
return False
|
|
752
|
-
|
|
753
|
-
# Check for incrementing sequence
|
|
754
|
-
diffs = [values[i + 1] - values[i] for i in range(len(values) - 1)]
|
|
755
|
-
|
|
756
|
-
# Most diffs should be 1 (or consistent)
|
|
757
|
-
counter = Counter(diffs)
|
|
758
|
-
if not counter:
|
|
759
|
-
return False
|
|
760
|
-
|
|
761
|
-
most_common_diff, count = counter.most_common(1)[0]
|
|
762
|
-
ratio = count / len(diffs)
|
|
763
|
-
|
|
764
|
-
return ratio > 0.8 and most_common_diff in [1, -1, 0]
|
|
765
|
-
|
|
766
|
-
def _check_checksum_correlation(
|
|
767
|
-
self,
|
|
768
|
-
messages: Sequence[bytes],
|
|
769
|
-
offset: int,
|
|
770
|
-
size: int,
|
|
771
|
-
) -> float:
|
|
772
|
-
"""Check if field correlates with message content like a checksum.
|
|
773
|
-
|
|
774
|
-
Args:
|
|
775
|
-
messages: Message samples.
|
|
776
|
-
offset: Field offset.
|
|
777
|
-
size: Field size.
|
|
778
|
-
|
|
779
|
-
Returns:
|
|
780
|
-
Correlation score (0-1).
|
|
781
|
-
"""
|
|
782
|
-
# Simple heuristic: checksum fields have high correlation with
|
|
783
|
-
# changes in other parts of the message
|
|
784
|
-
|
|
785
|
-
if len(messages) < 5:
|
|
786
|
-
return 0.0
|
|
787
|
-
|
|
788
|
-
# Extract checksum values and message content
|
|
789
|
-
checksums = []
|
|
790
|
-
contents = []
|
|
791
|
-
|
|
792
|
-
for msg in messages:
|
|
793
|
-
if len(msg) >= offset + size:
|
|
794
|
-
checksums.append(int.from_bytes(msg[offset : offset + size], "big"))
|
|
795
|
-
# Content before checksum
|
|
796
|
-
content = msg[:offset] + msg[offset + size :]
|
|
797
|
-
contents.append(sum(content) % 65536)
|
|
798
|
-
|
|
799
|
-
if len(checksums) < 5:
|
|
800
|
-
return 0.0
|
|
801
|
-
|
|
802
|
-
# Check if checksum changes correlate with content changes
|
|
803
|
-
unique_contents = len(set(contents))
|
|
804
|
-
unique_checksums = len(set(checksums))
|
|
805
|
-
|
|
806
|
-
if unique_contents == 1 and unique_checksums == 1:
|
|
807
|
-
return 0.3 # Both constant - inconclusive
|
|
808
|
-
|
|
809
|
-
# Simple correlation check
|
|
810
|
-
if unique_contents > 1 and unique_checksums > 1:
|
|
811
|
-
return 0.8
|
|
812
|
-
|
|
813
|
-
return 0.3
|
|
814
|
-
|
|
815
|
-
def _guess_checksum_algorithm(
|
|
816
|
-
self,
|
|
817
|
-
messages: Sequence[bytes],
|
|
818
|
-
offset: int,
|
|
819
|
-
size: int,
|
|
820
|
-
) -> str:
|
|
821
|
-
"""Guess the checksum algorithm.
|
|
822
|
-
|
|
823
|
-
Args:
|
|
824
|
-
messages: Message samples.
|
|
825
|
-
offset: Checksum offset.
|
|
826
|
-
size: Checksum size.
|
|
827
|
-
|
|
828
|
-
Returns:
|
|
829
|
-
Algorithm name hint.
|
|
830
|
-
"""
|
|
831
|
-
if size == 1:
|
|
832
|
-
return "xor8_or_sum8"
|
|
833
|
-
elif size == 2:
|
|
834
|
-
return "crc16_or_sum16"
|
|
835
|
-
elif size == 4:
|
|
836
|
-
return "crc32"
|
|
837
|
-
return "unknown"
|
|
838
|
-
|
|
839
|
-
|
|
840
|
-
# =============================================================================
|
|
841
|
-
# RE-PAY-004: Convenience functions
|
|
842
|
-
# =============================================================================
|
|
843
|
-
|
|
844
|
-
|
|
845
|
-
def infer_fields(messages: Sequence[bytes], min_samples: int = 10) -> MessageSchema:
|
|
846
|
-
"""Infer field structure from message samples.
|
|
847
|
-
|
|
848
|
-
Implements RE-PAY-004: Payload Field Inference.
|
|
849
|
-
|
|
850
|
-
Args:
|
|
851
|
-
messages: List of binary message samples.
|
|
852
|
-
min_samples: Minimum samples for reliable inference.
|
|
853
|
-
|
|
854
|
-
Returns:
|
|
855
|
-
MessageSchema with inferred field structure.
|
|
856
|
-
|
|
857
|
-
Example:
|
|
858
|
-
>>> messages = [pkt.data for pkt in packets]
|
|
859
|
-
>>> schema = infer_fields(messages)
|
|
860
|
-
>>> for field in schema.fields:
|
|
861
|
-
... print(f"{field.name}: {field.inferred_type}")
|
|
862
|
-
"""
|
|
863
|
-
inferrer = FieldInferrer(min_samples=min_samples)
|
|
864
|
-
return inferrer.infer_fields(messages)
|
|
865
|
-
|
|
866
|
-
|
|
867
|
-
def detect_field_types(
|
|
868
|
-
messages: Sequence[bytes],
|
|
869
|
-
boundaries: list[tuple[int, int]],
|
|
870
|
-
) -> list[InferredField]:
|
|
871
|
-
"""Detect field types for given boundaries.
|
|
872
|
-
|
|
873
|
-
Implements RE-PAY-004: Field type detection.
|
|
874
|
-
|
|
875
|
-
Args:
|
|
876
|
-
messages: Message samples.
|
|
877
|
-
boundaries: List of (start, end) field boundaries.
|
|
878
|
-
|
|
879
|
-
Returns:
|
|
880
|
-
List of InferredField with type information.
|
|
881
|
-
"""
|
|
882
|
-
inferrer = FieldInferrer()
|
|
883
|
-
return inferrer.detect_field_types(messages, boundaries)
|
|
884
|
-
|
|
885
|
-
|
|
886
|
-
def find_sequence_fields(messages: Sequence[bytes]) -> list[tuple[int, int]]:
|
|
887
|
-
"""Find fields that appear to be sequence/counter values.
|
|
888
|
-
|
|
889
|
-
Implements RE-PAY-004: Sequence field detection.
|
|
890
|
-
|
|
891
|
-
Args:
|
|
892
|
-
messages: Message samples (should be in order).
|
|
893
|
-
|
|
894
|
-
Returns:
|
|
895
|
-
List of (offset, size) for sequence fields.
|
|
896
|
-
"""
|
|
897
|
-
inferrer = FieldInferrer()
|
|
898
|
-
return inferrer.find_sequence_fields(messages)
|
|
899
|
-
|
|
900
|
-
|
|
901
|
-
def find_checksum_fields(messages: Sequence[bytes]) -> list[tuple[int, int, str]]:
|
|
902
|
-
"""Find fields that appear to be checksums.
|
|
903
|
-
|
|
904
|
-
Implements RE-PAY-004: Checksum field detection.
|
|
905
|
-
|
|
906
|
-
Args:
|
|
907
|
-
messages: Message samples.
|
|
908
|
-
|
|
909
|
-
Returns:
|
|
910
|
-
List of (offset, size, algorithm_hint) for checksum fields.
|
|
911
|
-
"""
|
|
912
|
-
inferrer = FieldInferrer()
|
|
913
|
-
return inferrer.find_checksum_fields(messages)
-
-
-class PayloadExtractor:
-    """Extract payloads from network packets.
-
-    Implements RE-PAY-001: Payload Extraction Framework.
-
-    Provides zero-copy payload extraction from UDP/TCP packets
-    with metadata preservation and fragment handling.
-
-    Example:
-        >>> extractor = PayloadExtractor()
-        >>> payloads = extractor.extract_all_payloads(packets, protocol="UDP")
-        >>> for p in payloads:
-        ...     print(f"{p.src_ip}:{p.src_port} -> {len(p.data)} bytes")
-    """
-
-    def __init__(
-        self,
-        include_headers: bool = False,
-        zero_copy: bool = True,
-        return_type: Literal["bytes", "memoryview", "numpy"] = "bytes",
-    ) -> None:
-        """Initialize payload extractor.
-
-        Args:
-            include_headers: Include protocol headers in payload.
-            zero_copy: Use zero-copy memoryview where possible.
-            return_type: Type for returned payload data.
-        """
-        self.include_headers = include_headers
-        self.zero_copy = zero_copy
-        self.return_type = return_type
-
-    def extract_payload(
-        self,
-        packet: dict[str, Any] | bytes,
-        layer: Literal["ethernet", "ip", "transport", "application"] = "application",
-    ) -> bytes | memoryview | np.ndarray[tuple[int], np.dtype[np.uint8]]:
-        """Extract payload from a single packet.
-
-        Implements RE-PAY-001: Single packet payload extraction.
-
-        Args:
-            packet: Packet data (dict with 'data' key or raw bytes).
-            layer: OSI layer to extract from.
-
-        Returns:
-            Payload data in requested format.
-
-        Example:
-            >>> payload = extractor.extract_payload(packet)
-            >>> print(f"Payload: {len(payload)} bytes")
-        """
-        # Handle different packet formats
-        if isinstance(packet, dict):
-            raw_data = packet.get("data", packet.get("payload", b""))
-            if isinstance(raw_data, list | tuple):
-                raw_data = bytes(raw_data)
-        else:
-            raw_data = packet
-
-        if not raw_data:
-            return self._format_output(b"")
-
-        # For raw bytes, return as-is
-        if layer == "application":
-            return self._format_output(raw_data)
-
-        # Layer-based extraction would require protocol parsing
-        # For now, return full data
-        return self._format_output(raw_data)
-
-    def extract_all_payloads(
-        self,
-        packets: Sequence[dict[str, Any] | bytes],
-        protocol: str | None = None,
-        port_filter: tuple[int | None, int | None] | None = None,
-    ) -> list[PayloadInfo]:
-        """Extract payloads from all packets with metadata.
-
-        Implements RE-PAY-001: Batch payload extraction with metadata.
-
-        Args:
-            packets: Sequence of packets.
-            protocol: Filter by protocol (e.g., "UDP", "TCP").
-            port_filter: (src_port, dst_port) filter tuple.
-
-        Returns:
-            List of PayloadInfo with extracted data and metadata.
-
-        Example:
-            >>> payloads = extractor.extract_all_payloads(packets, protocol="UDP")
-            >>> print(f"Extracted {len(payloads)} payloads")
-        """
-        results = []
-
-        for i, packet in enumerate(packets):
-            if isinstance(packet, dict):
-                # Extract metadata from dict
-                pkt_protocol = packet.get("protocol", "")
-                src_port = packet.get("src_port")
-                dst_port = packet.get("dst_port")
-
-                # Apply filters
-                if protocol and pkt_protocol.upper() != protocol.upper():
-                    continue
-
-                if port_filter:
-                    if port_filter[0] is not None and src_port != port_filter[0]:
-                        continue
-                    if port_filter[1] is not None and dst_port != port_filter[1]:
-                        continue
-
-                payload = self.extract_payload(packet)
-                if isinstance(payload, memoryview | np.ndarray):
-                    payload = bytes(payload)
-
-                info = PayloadInfo(
-                    data=payload,
-                    packet_index=i,
-                    timestamp=packet.get("timestamp"),
-                    src_ip=packet.get("src_ip"),
-                    dst_ip=packet.get("dst_ip"),
-                    src_port=src_port,
-                    dst_port=dst_port,
-                    protocol=pkt_protocol,
-                    is_fragment=packet.get("is_fragment", False),
-                    fragment_offset=packet.get("fragment_offset", 0),
-                )
-                results.append(info)
-            else:
-                # Raw bytes
-                payload = bytes(packet)
-                info = PayloadInfo(data=payload, packet_index=i)
-                results.append(info)
-
-        return results
-
-    def iter_payloads(
-        self,
-        packets: Sequence[dict[str, Any] | bytes],
-    ) -> Iterator[PayloadInfo]:
-        """Iterate over payloads for memory-efficient processing.
-
-        Implements RE-PAY-001: Streaming payload iteration.
-
-        Args:
-            packets: Sequence of packets.
-
-        Yields:
-            PayloadInfo for each packet.
-        """
-        for i, packet in enumerate(packets):
-            payload = self.extract_payload(packet)
-            if isinstance(payload, memoryview | np.ndarray):
-                payload = bytes(payload)
-
-            if isinstance(packet, dict):
-                info = PayloadInfo(
-                    data=payload,
-                    packet_index=i,
-                    timestamp=packet.get("timestamp"),
-                    src_ip=packet.get("src_ip"),
-                    dst_ip=packet.get("dst_ip"),
-                    src_port=packet.get("src_port"),
-                    dst_port=packet.get("dst_port"),
-                    protocol=packet.get("protocol"),
-                )
-            else:
-                info = PayloadInfo(data=payload, packet_index=i)
-
-            yield info
-
-    def _format_output(
-        self, data: bytes
-    ) -> bytes | memoryview | np.ndarray[tuple[int], np.dtype[np.uint8]]:
-        """Format output according to return_type setting."""
-        if self.return_type == "bytes":
-            return data
-        elif self.return_type == "memoryview":
-            return memoryview(data)
-        # self.return_type == "numpy"
-        return np.frombuffer(data, dtype=np.uint8)
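The `PayloadExtractor` class removed above is re-imported from `oscura.analyzers.packet.payload_extraction` by the new import block at the end of this file, so the public API is unchanged. A minimal usage sketch, assuming oscura 0.7.0 is installed and importing through the re-exporting `payload` module:

```python
# Hedged usage sketch for PayloadExtractor; import path taken from the
# re-exports added at the bottom of this file (assumes oscura 0.7.0).
from oscura.analyzers.packet.payload import PayloadExtractor

packets = [
    {"data": b"\xaa\x55\x01\x02", "protocol": "UDP", "src_port": 5000, "dst_port": 6000},
    b"\xde\xad\xbe\xef",  # raw bytes are passed through as-is
]

extractor = PayloadExtractor(return_type="bytes")
for info in extractor.iter_payloads(packets):
    # Raw-bytes packets carry no metadata, so info.protocol may be None.
    print(info.packet_index, len(info.data), info.protocol)
```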
-
-
-def search_pattern(
-    packets: Sequence[dict[str, Any] | bytes],
-    pattern: bytes | str,
-    pattern_type: Literal["exact", "wildcard", "regex"] = "exact",
-    context_bytes: int = 8,
-) -> list[PatternMatch]:
-    """Search for pattern in packet payloads.
-
-    Implements RE-PAY-002: Payload Pattern Search.
-
-    Args:
-        packets: Sequence of packets to search.
-        pattern: Pattern to search for.
-        pattern_type: Type of pattern matching.
-        context_bytes: Number of context bytes around match.
-
-    Returns:
-        List of PatternMatch results.
-
-    Example:
-        >>> matches = search_pattern(packets, b'\\x00\\x01\\x00\\x00')
-        >>> for m in matches:
-        ...     print(f"Found at packet {m.packet_index}, offset {m.offset}")
-    """
-    extractor = PayloadExtractor()
-    results = []
-
-    for i, packet in enumerate(packets):
-        payload = extractor.extract_payload(packet)
-        if isinstance(payload, memoryview | np.ndarray):
-            payload = bytes(payload)
-
-        matches = _find_pattern_in_data(payload, pattern, pattern_type)
-
-        for offset, matched in matches:
-            # Get context
-            start = max(0, offset - context_bytes)
-            end = min(len(payload), offset + len(matched) + context_bytes)
-            context = payload[start:end]
-
-            results.append(
-                PatternMatch(
-                    pattern_name=pattern.hex() if isinstance(pattern, bytes) else str(pattern),
-                    offset=offset,
-                    matched=matched,
-                    packet_index=i,
-                    context=context,
-                )
-            )
-
-    return results
-
-
-def search_patterns(
-    packets: Sequence[dict[str, Any] | bytes],
-    patterns: dict[str, bytes | str],
-    context_bytes: int = 8,
-) -> dict[str, list[PatternMatch]]:
-    """Search for multiple patterns simultaneously.
-
-    Implements RE-PAY-002: Multi-pattern search.
-
-    Args:
-        packets: Sequence of packets to search.
-        patterns: Dictionary mapping names to patterns.
-        context_bytes: Number of context bytes around match.
-
-    Returns:
-        Dictionary mapping pattern names to match lists.
-
-    Example:
-        >>> signatures = {
-        ...     "header_a": b'\\xAA\\x55',
-        ...     "header_b": b'\\xDE\\xAD',
-        ... }
-        >>> results = search_patterns(packets, signatures)
-        >>> for name, matches in results.items():
-        ...     print(f"{name}: {len(matches)} matches")
-    """
-    results: dict[str, list[PatternMatch]] = {name: [] for name in patterns}
-    extractor = PayloadExtractor()
-
-    for i, packet in enumerate(packets):
-        payload = extractor.extract_payload(packet)
-        if isinstance(payload, memoryview | np.ndarray):
-            payload = bytes(payload)
-
-        for name, pattern in patterns.items():
-            # Detect pattern type
-            if isinstance(pattern, bytes):
-                if b"??" in pattern or b"\\x??" in pattern:
-                    pattern_type = "wildcard"
-                else:
-                    pattern_type = "exact"
-            else:
-                pattern_type = "regex"
-
-            matches = _find_pattern_in_data(payload, pattern, pattern_type)
-
-            for offset, matched in matches:
-                start = max(0, offset - context_bytes)
-                end = min(len(payload), offset + len(matched) + context_bytes)
-                context = payload[start:end]
-
-                results[name].append(
-                    PatternMatch(
-                        pattern_name=name,
-                        offset=offset,
-                        matched=matched,
-                        packet_index=i,
-                        context=context,
-                    )
-                )
-
-    return results
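A short self-contained run of the multi-pattern search, mirroring the docstring example above (assumes oscura 0.7.0; byte patterns without `??` are auto-detected as exact matches):

```python
from oscura.analyzers.packet.payload import search_patterns

packets = [b"\xaa\x55\x01\x02\x03", b"\x00\xde\xad\xff", b"\xaa\x55\xde\xad"]
signatures = {"header_a": b"\xaa\x55", "header_b": b"\xde\xad"}

results = search_patterns(packets, signatures, context_bytes=2)
for name, matches in results.items():
    for m in matches:
        print(f"{name}: packet {m.packet_index}, offset {m.offset}, context {m.context.hex()}")
```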
-
-
-def filter_by_pattern(
-    packets: Sequence[dict[str, Any] | bytes],
-    pattern: bytes | str,
-    pattern_type: Literal["exact", "wildcard", "regex"] = "exact",
-) -> list[dict[str, Any] | bytes]:
-    """Filter packets that contain a pattern.
-
-    Implements RE-PAY-002: Pattern-based filtering.
-
-    Args:
-        packets: Sequence of packets.
-        pattern: Pattern to match.
-        pattern_type: Type of pattern matching.
-
-    Returns:
-        List of packets containing the pattern.
-    """
-    extractor = PayloadExtractor()
-    result = []
-
-    for packet in packets:
-        payload = extractor.extract_payload(packet)
-        if isinstance(payload, memoryview | np.ndarray):
-            payload = bytes(payload)
-
-        matches = _find_pattern_in_data(payload, pattern, pattern_type)
-        if len(matches) > 0:
-            result.append(packet)
-
-    return result
-
-
-def detect_delimiter(
-    payloads: Sequence[bytes] | bytes,
-    candidates: list[bytes] | None = None,
-) -> DelimiterResult:
-    """Automatically detect message delimiter.
-
-    Implements RE-PAY-003: Delimiter detection.
-
-    Args:
-        payloads: Payload data or list of payloads.
-        candidates: Optional list of candidate delimiters to test.
-
-    Returns:
-        DelimiterResult with detected delimiter info.
-
-    Example:
-        >>> data = b'msg1\\r\\nmsg2\\r\\nmsg3\\r\\n'
-        >>> result = detect_delimiter(data)
-        >>> print(f"Delimiter: {result.delimiter!r}")
-    """
-    # Combine payloads if list
-    if isinstance(payloads, list | tuple):
-        data: bytes = b"".join(payloads)
-    else:
-        # Type narrowing: payloads is bytes here
-        data = cast("bytes", payloads)
-
-    if not data:
-        return DelimiterResult(
-            delimiter=b"",
-            delimiter_type="fixed",
-            confidence=0.0,
-            occurrences=0,
-        )
-
-    # Default candidates
-    if candidates is None:
-        candidates = [
-            b"\r\n",  # CRLF
-            b"\n",  # LF
-            b"\x00",  # Null
-            b"\r",  # CR
-            b"\x0d\x0a",  # CRLF (explicit)
-        ]
-
-    best_result = None
-    best_score = 0.0
-
-    for delim in candidates:
-        if len(delim) == 0:
-            continue
-
-        count = data.count(delim)
-        if count < 2:
-            continue
-
-        # Calculate score based on frequency and regularity
-        positions = []
-        pos = 0
-        while True:
-            pos = data.find(delim, pos)
-            if pos == -1:
-                break
-            positions.append(pos)
-            pos += len(delim)
-
-        if len(positions) < 2:
-            continue
-
-        # Calculate interval regularity
-        intervals = [positions[i + 1] - positions[i] for i in range(len(positions) - 1)]
-        if len(intervals) > 0:
-            mean_interval = sum(intervals) / len(intervals)
-            if mean_interval > 0:
-                variance = sum((x - mean_interval) ** 2 for x in intervals) / len(intervals)
-                cv = (variance**0.5) / mean_interval if mean_interval > 0 else 1.0
-                regularity = 1.0 / (1.0 + cv)
-            else:
-                regularity = 0.0
-        else:
-            regularity = 0.0
-
-        # Score combines frequency and regularity
-        score = count * (0.5 + 0.5 * regularity)
-
-        if score > best_score:
-            best_score = score
-            best_result = DelimiterResult(
-                delimiter=delim,
-                delimiter_type="fixed",
-                confidence=min(1.0, regularity * 0.8 + 0.2 * min(1.0, count / 10)),
-                occurrences=count,
-                positions=positions,
-            )
-
-    if best_result is None:
-        return DelimiterResult(
-            delimiter=b"",
-            delimiter_type="fixed",
-            confidence=0.0,
-            occurrences=0,
-        )
-
-    return best_result
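A worked run of the scoring loop above: with four evenly spaced CRLF terminators, `b"\r\n"` scores highest because it is both frequent and regular (assumes oscura 0.7.0):

```python
from oscura.analyzers.packet.payload import detect_delimiter

stream = b"msg1\r\nmsg2\r\nmsg3\r\nmsg4\r\n"
result = detect_delimiter(stream)
# Intervals between matches are all 6 bytes, so regularity is 1.0 and
# b"\r\n" (tested before b"\n") wins the strict > comparison.
print(result.delimiter, result.occurrences, round(result.confidence, 2))
```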
-
-
-def detect_length_prefix(
-    payloads: Sequence[bytes],
-    max_length_bytes: int = 4,
-) -> LengthPrefixResult:
-    """Detect length-prefixed message format.
-
-    Implements RE-PAY-003: Length prefix detection.
-
-    Args:
-        payloads: List of payload samples.
-        max_length_bytes: Maximum length field size to test.
-
-    Returns:
-        LengthPrefixResult with detected format.
-
-    Example:
-        >>> result = detect_length_prefix(payloads)
-        >>> if result.detected:
-        ...     print(f"Length field: {result.length_bytes} bytes, {result.endian}")
-    """
-    if not payloads:
-        return LengthPrefixResult(detected=False)
-
-    # Concatenate payloads for analysis
-    data = b"".join(payloads)
-
-    best_result = LengthPrefixResult(detected=False)
-    best_score = 0.0
-
-    # Try different length field sizes and offsets
-    # IMPORTANT: Prefer larger length_bytes values when scores are equal
-    # by iterating in reverse order (4, 2, 1) and using >= for comparison
-    for length_bytes in [4, 2, 1]:
-        if length_bytes > max_length_bytes:
-            continue
-
-        for endian_str in ["big", "little"]:
-            endian: Literal["big", "little"] = endian_str  # type: ignore[assignment]
-            for offset in range(min(8, len(data) - length_bytes)):
-                for includes_length in [False, True]:
-                    score, matches = _test_length_prefix(
-                        data, length_bytes, endian, offset, includes_length
-                    )
-
-                    # Use > to prefer larger length_bytes (tested first) when scores are equal
-                    if score > best_score and matches >= 3:
-                        best_score = score
-                        best_result = LengthPrefixResult(
-                            detected=True,
-                            length_bytes=length_bytes,
-                            endian=endian,
-                            offset=offset,
-                            includes_length=includes_length,
-                            confidence=score,
-                        )
-
-    return best_result
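A sketch that frames four messages with a 2-byte big-endian length prefix via `struct` and feeds the stream back to the detector (assumes oscura 0.7.0). Because `length_bytes` is tested in the order 4, 2, 1 and ties do not replace an earlier best, this stream should come back as a 2-byte big-endian prefix at offset 0 that excludes itself:

```python
import struct

from oscura.analyzers.packet.payload import detect_length_prefix

msgs = [b"hello", b"payload-two", b"xyz", b"fourth-message"]
stream = b"".join(struct.pack(">H", len(m)) + m for m in msgs)  # >H = 2-byte big-endian

result = detect_length_prefix([stream], max_length_bytes=2)
if result.detected:
    print(result.length_bytes, result.endian, result.offset, result.includes_length)
```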
-
-
-def find_message_boundaries(
-    payloads: Sequence[bytes] | bytes,
-    delimiter: bytes | DelimiterResult | None = None,
-    length_prefix: LengthPrefixResult | None = None,
-) -> list[MessageBoundary]:
-    """Find message boundaries in payload data.
-
-    Implements RE-PAY-003: Message boundary detection.
-
-    Args:
-        payloads: Payload data or list of payloads.
-        delimiter: Delimiter to use (auto-detect if None).
-        length_prefix: Length prefix format (test if None).
-
-    Returns:
-        List of MessageBoundary objects.
-
-    Example:
-        >>> boundaries = find_message_boundaries(data)
-        >>> for b in boundaries:
-        ...     print(f"Message {b.index}: {b.length} bytes")
-    """
-    # Combine payloads if list
-    if isinstance(payloads, list | tuple):
-        data: bytes = b"".join(payloads)
-    else:
-        # Type narrowing: payloads is bytes here
-        data = cast("bytes", payloads)
-
-    if not data:
-        return []
-
-    boundaries = []
-
-    # Try length prefix first
-    if length_prefix is None:
-        length_prefix = detect_length_prefix([data] if isinstance(data, bytes) else list(payloads))
-
-    if length_prefix.detected:
-        boundaries = _extract_length_prefixed_messages(data, length_prefix)
-        if len(boundaries) > 0:
-            return boundaries
-
-    # Fall back to delimiter
-    if delimiter is None:
-        delimiter = detect_delimiter(data)
-
-    if isinstance(delimiter, DelimiterResult):
-        delim = delimiter.delimiter
-    else:
-        delim = delimiter
-
-    if not delim:
-        # No delimiter found, return whole data as one message
-        return [MessageBoundary(start=0, end=len(data), length=len(data), data=data, index=0)]
-
-    # Split by delimiter
-    parts = data.split(delim)
-    current_offset = 0
-
-    for _i, part in enumerate(parts):
-        if part:  # Skip empty parts
-            boundaries.append(
-                MessageBoundary(
-                    start=current_offset,
-                    end=current_offset + len(part),
-                    length=len(part),
-                    data=part,
-                    index=len(boundaries),
-                )
-            )
-        current_offset += len(part) + len(delim)
-
-    return boundaries
-
-
-def segment_messages(
-    payloads: Sequence[bytes] | bytes,
-    delimiter: bytes | None = None,
-    length_prefix: LengthPrefixResult | None = None,
-) -> list[bytes]:
-    """Segment stream into individual messages.
-
-    Implements RE-PAY-003: Message segmentation.
-
-    Args:
-        payloads: Payload data or list of payloads.
-        delimiter: Delimiter to use (auto-detect if None).
-        length_prefix: Length prefix format (auto-detect if None).
-
-    Returns:
-        List of message bytes.
-    """
-    boundaries = find_message_boundaries(payloads, delimiter, length_prefix)
-    return [b.data for b in boundaries]
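End-to-end segmentation sketch: with no length prefix detectable, the pipeline above falls back to delimiter detection and splits on the auto-detected `b"\n"` (assumes oscura 0.7.0):

```python
from oscura.analyzers.packet.payload import segment_messages

stream = b"alpha\nbravo\ncharlie\ndelta\n"
for msg in segment_messages(stream):
    print(msg)  # b'alpha', b'bravo', b'charlie', b'delta'
```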
-
-
-def diff_payloads(payload_a: bytes, payload_b: bytes) -> PayloadDiff:
-    """Compare two payloads and identify differences.
-
-    Implements RE-PAY-005: Payload differential analysis.
-
-    Args:
-        payload_a: First payload.
-        payload_b: Second payload.
-
-    Returns:
-        PayloadDiff with comparison results.
-
-    Example:
-        >>> diff = diff_payloads(pkt1.data, pkt2.data)
-        >>> print(f"Common prefix: {diff.common_prefix_length} bytes")
-        >>> print(f"Different bytes: {len(diff.differences)}")
-    """
-    # Find common prefix
-    common_prefix = 0
-    min_len = min(len(payload_a), len(payload_b))
-    for i in range(min_len):
-        if payload_a[i] == payload_b[i]:
-            common_prefix += 1
-        else:
-            break
-
-    # Find common suffix
-    common_suffix = 0
-    for i in range(1, min_len - common_prefix + 1):
-        if payload_a[-i] == payload_b[-i]:
-            common_suffix += 1
-        else:
-            break
-
-    # Find all differences
-    differences = []
-    for i in range(min_len):
-        if payload_a[i] != payload_b[i]:
-            differences.append((i, payload_a[i], payload_b[i]))
-
-    # Add length differences
-    if len(payload_a) > len(payload_b):
-        for i in range(len(payload_b), len(payload_a)):
-            differences.append((i, payload_a[i], -1))
-    elif len(payload_b) > len(payload_a):
-        for i in range(len(payload_a), len(payload_b)):
-            differences.append((i, -1, payload_b[i]))
-
-    # Calculate similarity
-    max_len = max(len(payload_a), len(payload_b))
-    if max_len == 0:
-        similarity = 1.0
-    else:
-        matching = min_len - len([d for d in differences if d[0] < min_len])
-        similarity = matching / max_len
-
-    # Calculate edit distance (simplified Levenshtein)
-    edit_distance = _levenshtein_distance(payload_a, payload_b)
-
-    return PayloadDiff(
-        common_prefix_length=common_prefix,
-        common_suffix_length=common_suffix,
-        differences=differences,
-        similarity=similarity,
-        edit_distance=edit_distance,
-    )
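A worked example of the diff structure: two frames sharing a two-byte header, differing in a counter byte and in length (assumes oscura 0.7.0):

```python
from oscura.analyzers.packet.payload import diff_payloads

a = b"\xaa\x55\x01\x00\xff"
b = b"\xaa\x55\x02\x00\xff\xee"
d = diff_payloads(a, b)
print(d.common_prefix_length)  # 2: both start with aa 55
print(d.differences)           # [(2, 1, 2), (5, -1, 238)]; -1 marks a missing byte
print(round(d.similarity, 2))  # 0.67: 4 matching bytes over max length 6
```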
-
-
-def find_common_bytes(payloads: Sequence[bytes]) -> bytes:
-    """Find common prefix across all payloads.
-
-    Implements RE-PAY-005: Common byte analysis.
-
-    Args:
-        payloads: List of payloads to analyze.
-
-    Returns:
-        Common prefix bytes.
-    """
-    if not payloads:
-        return b""
-
-    if len(payloads) == 1:
-        return payloads[0]
-
-    # Find minimum length
-    min_len = min(len(p) for p in payloads)
-
-    # Find common prefix
-    common = bytearray()
-    for i in range(min_len):
-        byte = payloads[0][i]
-        if all(p[i] == byte for p in payloads):
-            common.append(byte)
-        else:
-            break
-
-    return bytes(common)
-
-
-def find_variable_positions(payloads: Sequence[bytes]) -> VariablePositions:
-    """Identify which byte positions vary across payloads.
-
-    Implements RE-PAY-005: Variable position detection.
-
-    Args:
-        payloads: List of payloads to analyze.
-
-    Returns:
-        VariablePositions with constant and variable position info.
-
-    Example:
-        >>> result = find_variable_positions(payloads)
-        >>> print(f"Constant positions: {result.constant_positions}")
-        >>> print(f"Variable positions: {result.variable_positions}")
-    """
-    if not payloads:
-        return VariablePositions(
-            constant_positions=[],
-            variable_positions=[],
-            constant_values={},
-            variance_by_position=np.array([]),
-        )
-
-    # Use shortest payload length
-    min_len = min(len(p) for p in payloads)
-
-    constant_positions = []
-    variable_positions = []
-    constant_values = {}
-    variances = []
-
-    for i in range(min_len):
-        values = [p[i] for p in payloads]
-        unique = set(values)
-
-        if len(unique) == 1:
-            constant_positions.append(i)
-            constant_values[i] = values[0]
-            variances.append(0.0)
-        else:
-            variable_positions.append(i)
-            variances.append(float(np.var(values)))
-
-    return VariablePositions(
-        constant_positions=constant_positions,
-        variable_positions=variable_positions,
-        constant_values=constant_values,
-        variance_by_position=np.array(variances),
-    )
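Sketch separating constant header bytes from a varying counter across captured frames (assumes oscura 0.7.0):

```python
from oscura.analyzers.packet.payload import find_variable_positions

frames = [b"\xaa\x55\x00\x10", b"\xaa\x55\x01\x10", b"\xaa\x55\x02\x10"]
result = find_variable_positions(frames)
print(result.constant_positions)  # [0, 1, 3]
print(result.variable_positions)  # [2]
print(result.constant_values)     # {0: 170, 1: 85, 3: 16}
```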
-
-
-def compute_similarity(
-    payload_a: bytes,
-    payload_b: bytes,
-    metric: Literal["levenshtein", "hamming", "jaccard"] = "levenshtein",
-) -> float:
-    """Compute similarity between two payloads.
-
-    Implements RE-PAY-005: Similarity computation.
-
-    Args:
-        payload_a: First payload.
-        payload_b: Second payload.
-        metric: Similarity metric to use.
-
-    Returns:
-        Similarity score (0-1).
-    """
-    if metric == "levenshtein":
-        max_len = max(len(payload_a), len(payload_b))
-        if max_len == 0:
-            return 1.0
-        distance = _levenshtein_distance(payload_a, payload_b)
-        return 1.0 - (distance / max_len)
-
-    elif metric == "hamming":
-        if len(payload_a) != len(payload_b):
-            # Pad shorter one
-            max_len = max(len(payload_a), len(payload_b))
-            payload_a = payload_a.ljust(max_len, b"\x00")
-            payload_b = payload_b.ljust(max_len, b"\x00")
-
-        matches = sum(a == b for a, b in zip(payload_a, payload_b, strict=True))
-        return matches / len(payload_a) if payload_a else 1.0
-
-    # metric == "jaccard"
-    # Treat bytes as sets
-    set_a = set(payload_a)
-    set_b = set(payload_b)
-    intersection = len(set_a & set_b)
-    union = len(set_a | set_b)
-    return intersection / union if union > 0 else 1.0
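The three metrics can disagree on the same pair; a quick comparison (assumes oscura 0.7.0):

```python
from oscura.analyzers.packet.payload import compute_similarity

a, b = b"ABCD", b"ABXD"
print(compute_similarity(a, b, metric="levenshtein"))  # 0.75: one edit over 4 bytes
print(compute_similarity(a, b, metric="hamming"))      # 0.75: 3 of 4 positions match
print(compute_similarity(a, b, metric="jaccard"))      # 0.6: byte sets share 3 of 5 values
```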
-
-
-def cluster_payloads(
-    payloads: Sequence[bytes],
-    threshold: float = 0.8,
-    algorithm: Literal["greedy", "dbscan"] = "greedy",
-) -> list[PayloadCluster]:
-    """Cluster similar payloads together.
-
-    Implements RE-PAY-005: Payload clustering.
-
-    Args:
-        payloads: List of payloads to cluster.
-        threshold: Similarity threshold for clustering.
-        algorithm: Clustering algorithm.
-
-    Returns:
-        List of PayloadCluster objects.
-
-    Example:
-        >>> clusters = cluster_payloads(payloads, threshold=0.85)
-        >>> for c in clusters:
-        ...     print(f"Cluster {c.cluster_id}: {c.size} payloads")
-    """
-    if not payloads:
-        return []
-
-    if algorithm == "greedy":
-        return _cluster_greedy_optimized(payloads, threshold)
-    # algorithm == "dbscan"
-    return _cluster_dbscan(payloads, threshold)
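Greedy clustering sketch at a 0.8 threshold: the one-byte variant joins the first cluster via the fast Hamming path, while the unrelated frame starts its own (assumes oscura 0.7.0):

```python
from oscura.analyzers.packet.payload import cluster_payloads

payloads = [
    b"\xaa\x55\x00\x01\x02\x03",
    b"\xaa\x55\x00\x01\x02\x04",  # 5/6 bytes match -> similarity 0.83 >= 0.8
    b"\xde\xad\xbe\xef\x00\x00",  # no bytes match -> its own cluster
]
for c in cluster_payloads(payloads, threshold=0.8):
    print(c.cluster_id, c.size, c.representative.hex())
```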
-
-
-def correlate_request_response(
-    requests: Sequence[PayloadInfo],
-    responses: Sequence[PayloadInfo],
-    max_delay: float = 1.0,
-) -> list[tuple[PayloadInfo, PayloadInfo, float]]:
-    """Correlate request payloads with responses.
-
-    Implements RE-PAY-005: Request-response correlation.
-
-    Args:
-        requests: List of request PayloadInfo.
-        responses: List of response PayloadInfo.
-        max_delay: Maximum time between request and response.
-
-    Returns:
-        List of (request, response, latency) tuples.
-    """
-    pairs = []
-
-    for request in requests:
-        if request.timestamp is None:
-            continue
-
-        best_response = None
-        best_latency = float("inf")
-
-        for response in responses:
-            if response.timestamp is None:
-                continue
-
-            latency = response.timestamp - request.timestamp
-            if 0 <= latency <= max_delay and latency < best_latency:
-                best_response = response
-                best_latency = latency
-
-        if best_response is not None:
-            pairs.append((request, best_response, best_latency))
-
-    return pairs
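Correlation sketch: each request is paired with the earliest response inside the window, so the 50 ms response wins over the 900 ms one (assumes oscura 0.7.0; only the `data`, `packet_index`, and `timestamp` fields used by the function itself are set):

```python
from oscura.analyzers.packet.payload import PayloadInfo, correlate_request_response

requests = [PayloadInfo(data=b"GET", packet_index=0, timestamp=0.00)]
responses = [
    PayloadInfo(data=b"200", packet_index=1, timestamp=0.05),
    PayloadInfo(data=b"200", packet_index=2, timestamp=0.90),
]
for req, resp, latency in correlate_request_response(requests, responses):
    print(req.packet_index, "->", resp.packet_index, f"{latency * 1000:.0f} ms")
```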
-
-
-# =============================================================================
-# Helper functions
-# =============================================================================
-
-
-def _find_pattern_in_data(
-    data: bytes,
-    pattern: bytes | str,
-    pattern_type: str,
-) -> list[tuple[int, bytes]]:
-    """Find pattern occurrences in data."""
-    matches = []
-
-    if pattern_type == "exact":
-        if isinstance(pattern, str):
-            pattern = pattern.encode()
-        pos = 0
-        while True:
-            pos = data.find(pattern, pos)
-            if pos == -1:
-                break
-            matches.append((pos, pattern))
-            pos += 1
-
-    elif pattern_type == "wildcard":
-        # Convert wildcard pattern to regex
-        if isinstance(pattern, bytes):
-            # Replace ?? with . for single byte match
-            regex_pattern = pattern.replace(b"??", b".")
-            try:
-                for match in re.finditer(regex_pattern, data, re.DOTALL):
-                    matches.append((match.start(), match.group()))
-            except re.error:
-                pass
-
-    elif pattern_type == "regex":
-        if isinstance(pattern, str):
-            pattern = pattern.encode()
-        try:
-            for match in re.finditer(pattern, data, re.DOTALL):
-                matches.append((match.start(), match.group()))
-        except re.error:
-            pass
-
-    return matches
-
-
-def _test_length_prefix(
-    data: bytes,
-    length_bytes: int,
-    endian: str,
-    offset: int,
-    includes_length: bool,
-) -> tuple[float, int]:
-    """Test if data follows a length-prefix pattern."""
-    matches = 0
-    pos = 0
-
-    while pos + offset + length_bytes <= len(data):
-        # Read length field
-        length_data = data[pos + offset : pos + offset + length_bytes]
-        if endian == "big":
-            length = int.from_bytes(length_data, "big")
-        else:
-            length = int.from_bytes(length_data, "little")
-
-        if includes_length:
-            expected_end = pos + length
-        else:
-            expected_end = pos + offset + length_bytes + length
-
-        # Check if this makes sense
-        if 0 < length < 65536 and expected_end <= len(data):
-            matches += 1
-            pos = expected_end
-        else:
-            break
-
-    # Score based on matches and coverage
-    coverage = pos / len(data) if len(data) > 0 else 0
-    score = min(1.0, matches / 5) * coverage
-
-    return score, matches
-
-
-def _extract_length_prefixed_messages(
-    data: bytes,
-    length_prefix: LengthPrefixResult,
-) -> list[MessageBoundary]:
-    """Extract messages using detected length prefix format."""
-    boundaries = []
-    pos = 0
-    index = 0
-
-    while pos + length_prefix.offset + length_prefix.length_bytes <= len(data):
-        # Read length
-        length_data = data[
-            pos + length_prefix.offset : pos + length_prefix.offset + length_prefix.length_bytes
-        ]
-        if length_prefix.endian == "big":
-            length = int.from_bytes(length_data, "big")
-        else:
-            length = int.from_bytes(length_data, "little")
-
-        if length_prefix.includes_length:
-            end = pos + length
-        else:
-            end = pos + length_prefix.offset + length_prefix.length_bytes + length
-
-        if end > len(data) or length <= 0:
-            break
-
-        msg_data = data[pos:end]
-        boundaries.append(
-            MessageBoundary(
-                start=pos,
-                end=end,
-                length=end - pos,
-                data=msg_data,
-                index=index,
-            )
-        )
-
-        pos = end
-        index += 1
-
-    return boundaries
-
-
-def _levenshtein_distance(a: bytes, b: bytes) -> int:
-    """Calculate Levenshtein edit distance between two byte sequences."""
-    if len(a) < len(b):
-        return _levenshtein_distance(b, a)
-
-    if len(b) == 0:
-        return len(a)
-
-    previous_row: list[int] = list(range(len(b) + 1))
-    for i, c1 in enumerate(a):
-        current_row = [i + 1]
-        for j, c2 in enumerate(b):
-            insertions = previous_row[j + 1] + 1
-            deletions = current_row[j] + 1
-            substitutions = previous_row[j] + (c1 != c2)
-            current_row.append(min(insertions, deletions, substitutions))
-        previous_row = current_row
-
-    return previous_row[-1]
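The helper above is the classic two-row dynamic program; a self-contained copy with known distances makes the recurrence easy to sanity-check:

```python
def levenshtein(a: bytes, b: bytes) -> int:
    # Keep the shorter sequence as b so only two rows are ever held.
    if len(a) < len(b):
        a, b = b, a
    if not b:
        return len(a)
    prev = list(range(len(b) + 1))
    for i, c1 in enumerate(a):
        cur = [i + 1]
        for j, c2 in enumerate(b):
            cur.append(min(
                prev[j + 1] + 1,       # insertion
                cur[j] + 1,            # deletion
                prev[j] + (c1 != c2),  # substitution (free when bytes match)
            ))
        prev = cur
    return prev[-1]

assert levenshtein(b"kitten", b"sitting") == 3
assert levenshtein(b"", b"abc") == 3
assert levenshtein(b"abc", b"abc") == 0
```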
-
-
-def _fast_similarity(payload_a: bytes, payload_b: bytes, threshold: float) -> float | None:
-    """Fast similarity check with early termination.
-
-    Uses length-based filtering and sampling to quickly reject dissimilar payloads.
-    Returns None if payloads are likely similar (needs full check),
-    or a similarity value if they can be quickly determined.
-
-    Args:
-        payload_a: First payload.
-        payload_b: Second payload.
-        threshold: Similarity threshold for clustering.
-
-    Returns:
-        Similarity value if quickly determined, None if full check needed.
-    """
-    len_a = len(payload_a)
-    len_b = len(payload_b)
-
-    # Empty payloads
-    if len_a == 0 and len_b == 0:
-        return 1.0
-    if len_a == 0 or len_b == 0:
-        return 0.0
-
-    # Length difference filter: if lengths differ by more than (1-threshold)*max_len,
-    # similarity can't exceed threshold
-    max_len = max(len_a, len_b)
-    min_len = min(len_a, len_b)
-    _length_diff = max_len - min_len
-
-    # Maximum possible similarity given length difference
-    max_possible_similarity = min_len / max_len
-    if max_possible_similarity < threshold:
-        return max_possible_similarity
-
-    # For same-length payloads, use fast hamming similarity
-    if len_a == len_b:
-        # Sample comparison for large payloads
-        if len_a > 50:
-            # Sample first 16, last 16, and some middle bytes
-            sample_size = min(48, len_a)
-            mismatches = 0
-
-            # First 16 bytes
-            for i in range(min(16, len_a)):
-                if payload_a[i] != payload_b[i]:
-                    mismatches += 1
-
-            # Last 16 bytes
-            for i in range(1, min(17, len_a + 1)):
-                if payload_a[-i] != payload_b[-i]:
-                    mismatches += 1
-
-            # Middle samples
-            if len_a > 32:
-                step = (len_a - 32) // 16
-                if step > 0:
-                    for i in range(16, len_a - 16, step):
-                        if payload_a[i] != payload_b[i]:
-                            mismatches += 1
-
-            # Estimate similarity from sample
-            estimated_similarity = 1.0 - (mismatches / sample_size)
-
-            # If sample shows very low similarity, reject early
-            if estimated_similarity < threshold * 0.8:
-                return estimated_similarity
-
-        # Full hamming comparison for same-length payloads (faster than Levenshtein)
-        matches = sum(a == b for a, b in zip(payload_a, payload_b, strict=True))
-        return matches / len_a
-
-    # For different-length payloads, use common prefix/suffix heuristic
-    common_prefix = 0
-    for i in range(min_len):
-        if payload_a[i] == payload_b[i]:
-            common_prefix += 1
-        else:
-            break
-
-    common_suffix = 0
-    for i in range(1, min_len - common_prefix + 1):
-        if payload_a[-i] == payload_b[-i]:
-            common_suffix += 1
-        else:
-            break
-
-    # Estimate similarity from prefix/suffix
-    common_bytes = common_prefix + common_suffix
-    estimated_similarity = common_bytes / max_len
-
-    # If common bytes suggest low similarity, reject
-    if estimated_similarity < threshold * 0.7:
-        return estimated_similarity
-
-    # Need full comparison
-    return None
-
-
-def _cluster_greedy_optimized(
-    payloads: Sequence[bytes],
-    threshold: float,
-) -> list[PayloadCluster]:
-    """Optimized greedy clustering algorithm.
-
-    Uses fast pre-filtering based on length and sampling to avoid
-    expensive Levenshtein distance calculations when possible.
-
-    Args:
-        payloads: List of payloads to cluster.
-        threshold: Similarity threshold for clustering.
-
-    Returns:
-        List of PayloadCluster objects.
-    """
-    clusters: list[PayloadCluster] = []
-    assigned = [False] * len(payloads)
-
-    # Precompute lengths for fast filtering
-    lengths = [len(p) for p in payloads]
-
-    for i, payload in enumerate(payloads):
-        if assigned[i]:
-            continue
-
-        # Start new cluster
-        cluster_payloads = [payload]
-        cluster_indices = [i]
-        assigned[i] = True
-
-        payload_len = lengths[i]
-
-        # Find similar payloads
-        for j in range(i + 1, len(payloads)):
-            if assigned[j]:
-                continue
-
-            other_len = lengths[j]
-
-            # Quick length-based rejection
-            max_len = max(payload_len, other_len)
-            min_len = min(payload_len, other_len)
-            if min_len / max_len < threshold:
-                continue
-
-            # Try fast similarity check first
-            fast_result = _fast_similarity(payload, payloads[j], threshold)
-
-            if fast_result is not None:
-                similarity = fast_result
-            else:
-                # Fall back to Levenshtein for uncertain cases
-                similarity = compute_similarity(payload, payloads[j])
-
-            if similarity >= threshold:
-                cluster_payloads.append(payloads[j])
-                cluster_indices.append(j)
-                assigned[j] = True
-
-        clusters.append(
-            PayloadCluster(
-                cluster_id=len(clusters),
-                payloads=cluster_payloads,
-                indices=cluster_indices,
-                representative=payload,
-                size=len(cluster_payloads),
-            )
-        )
-
-    return clusters
-
-
-def _cluster_greedy(
-    payloads: Sequence[bytes],
-    threshold: float,
-) -> list[PayloadCluster]:
-    """Greedy clustering algorithm (legacy, uses optimized version)."""
-    return _cluster_greedy_optimized(payloads, threshold)
-
-
-def _cluster_dbscan(
-    payloads: Sequence[bytes],
-    threshold: float,
-) -> list[PayloadCluster]:
-    """DBSCAN-style clustering (simplified)."""
-    # For simplicity, fall back to greedy
-    # Full DBSCAN would require scipy or custom implementation
-    return _cluster_greedy_optimized(payloads, threshold)
-
+# RE-PAY-004 & RE-PAY-005: Field Inference and Comparison
+from oscura.analyzers.packet.payload_analysis import (
+    FieldInferrer,
+    InferredField,
+    MessageSchema,
+    PayloadCluster,
+    PayloadDiff,
+    VariablePositions,
+    cluster_payloads,
+    compute_similarity,
+    correlate_request_response,
+    detect_field_types,
+    diff_payloads,
+    find_checksum_fields,
+    find_common_bytes,
+    find_sequence_fields,
+    find_variable_positions,
+    infer_fields,
+)
+
+# RE-PAY-001: Payload Extraction
+from oscura.analyzers.packet.payload_extraction import (
+    PayloadExtractor,
+    PayloadInfo,
+)
+
+# RE-PAY-002 & RE-PAY-003: Pattern Search and Delimiter Detection
+from oscura.analyzers.packet.payload_patterns import (
+    DelimiterResult,
+    LengthPrefixResult,
+    MessageBoundary,
+    PatternMatch,
+    detect_delimiter,
+    detect_length_prefix,
+    filter_by_pattern,
+    find_message_boundaries,
+    search_pattern,
+    search_patterns,
+    segment_messages,
+)
 
 __all__ = [
+    # RE-PAY-003: Delimiter Detection
     "DelimiterResult",
     "FieldInferrer",
-    # RE-PAY-004: Field
+    # RE-PAY-004: Field Inference
    "InferredField",
     "LengthPrefixResult",
     "MessageBoundary",
     "MessageSchema",
+    # RE-PAY-002: Pattern Search
     "PatternMatch",
     "PayloadCluster",
+    # RE-PAY-005: Payload Comparison
     "PayloadDiff",
-    # Classes
     "PayloadExtractor",
-    #
+    # RE-PAY-001: Payload Extraction
     "PayloadInfo",
     "VariablePositions",
     "cluster_payloads",
     "compute_similarity",
     "correlate_request_response",
-    # RE-PAY-003: Delimiter detection
     "detect_delimiter",
     "detect_field_types",
     "detect_length_prefix",
-    # RE-PAY-005: Comparison
     "diff_payloads",
     "filter_by_pattern",
     "find_checksum_fields",
@@ -2147,9 +90,6 @@ __all__ = [
     "find_sequence_fields",
     "find_variable_positions",
     "infer_fields",
-    # RE-PAY-001: Extraction
-    # (via PayloadExtractor methods)
-    # RE-PAY-002: Pattern search
     "search_pattern",
     "search_patterns",
     "segment_messages",