oscura 0.0.1__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- oscura/__init__.py +813 -8
- oscura/__main__.py +392 -0
- oscura/analyzers/__init__.py +37 -0
- oscura/analyzers/digital/__init__.py +177 -0
- oscura/analyzers/digital/bus.py +691 -0
- oscura/analyzers/digital/clock.py +805 -0
- oscura/analyzers/digital/correlation.py +720 -0
- oscura/analyzers/digital/edges.py +632 -0
- oscura/analyzers/digital/extraction.py +413 -0
- oscura/analyzers/digital/quality.py +878 -0
- oscura/analyzers/digital/signal_quality.py +877 -0
- oscura/analyzers/digital/thresholds.py +708 -0
- oscura/analyzers/digital/timing.py +1104 -0
- oscura/analyzers/eye/__init__.py +46 -0
- oscura/analyzers/eye/diagram.py +434 -0
- oscura/analyzers/eye/metrics.py +555 -0
- oscura/analyzers/jitter/__init__.py +83 -0
- oscura/analyzers/jitter/ber.py +333 -0
- oscura/analyzers/jitter/decomposition.py +759 -0
- oscura/analyzers/jitter/measurements.py +413 -0
- oscura/analyzers/jitter/spectrum.py +220 -0
- oscura/analyzers/measurements.py +40 -0
- oscura/analyzers/packet/__init__.py +171 -0
- oscura/analyzers/packet/daq.py +1077 -0
- oscura/analyzers/packet/metrics.py +437 -0
- oscura/analyzers/packet/parser.py +327 -0
- oscura/analyzers/packet/payload.py +2156 -0
- oscura/analyzers/packet/payload_analysis.py +1312 -0
- oscura/analyzers/packet/payload_extraction.py +236 -0
- oscura/analyzers/packet/payload_patterns.py +670 -0
- oscura/analyzers/packet/stream.py +359 -0
- oscura/analyzers/patterns/__init__.py +266 -0
- oscura/analyzers/patterns/clustering.py +1036 -0
- oscura/analyzers/patterns/discovery.py +539 -0
- oscura/analyzers/patterns/learning.py +797 -0
- oscura/analyzers/patterns/matching.py +1091 -0
- oscura/analyzers/patterns/periodic.py +650 -0
- oscura/analyzers/patterns/sequences.py +767 -0
- oscura/analyzers/power/__init__.py +116 -0
- oscura/analyzers/power/ac_power.py +391 -0
- oscura/analyzers/power/basic.py +383 -0
- oscura/analyzers/power/conduction.py +314 -0
- oscura/analyzers/power/efficiency.py +297 -0
- oscura/analyzers/power/ripple.py +356 -0
- oscura/analyzers/power/soa.py +372 -0
- oscura/analyzers/power/switching.py +479 -0
- oscura/analyzers/protocol/__init__.py +150 -0
- oscura/analyzers/protocols/__init__.py +150 -0
- oscura/analyzers/protocols/base.py +500 -0
- oscura/analyzers/protocols/can.py +620 -0
- oscura/analyzers/protocols/can_fd.py +448 -0
- oscura/analyzers/protocols/flexray.py +405 -0
- oscura/analyzers/protocols/hdlc.py +399 -0
- oscura/analyzers/protocols/i2c.py +368 -0
- oscura/analyzers/protocols/i2s.py +296 -0
- oscura/analyzers/protocols/jtag.py +393 -0
- oscura/analyzers/protocols/lin.py +445 -0
- oscura/analyzers/protocols/manchester.py +333 -0
- oscura/analyzers/protocols/onewire.py +501 -0
- oscura/analyzers/protocols/spi.py +334 -0
- oscura/analyzers/protocols/swd.py +325 -0
- oscura/analyzers/protocols/uart.py +393 -0
- oscura/analyzers/protocols/usb.py +495 -0
- oscura/analyzers/signal_integrity/__init__.py +63 -0
- oscura/analyzers/signal_integrity/embedding.py +294 -0
- oscura/analyzers/signal_integrity/equalization.py +370 -0
- oscura/analyzers/signal_integrity/sparams.py +484 -0
- oscura/analyzers/spectral/__init__.py +53 -0
- oscura/analyzers/spectral/chunked.py +273 -0
- oscura/analyzers/spectral/chunked_fft.py +571 -0
- oscura/analyzers/spectral/chunked_wavelet.py +391 -0
- oscura/analyzers/spectral/fft.py +92 -0
- oscura/analyzers/statistical/__init__.py +250 -0
- oscura/analyzers/statistical/checksum.py +923 -0
- oscura/analyzers/statistical/chunked_corr.py +228 -0
- oscura/analyzers/statistical/classification.py +778 -0
- oscura/analyzers/statistical/entropy.py +1113 -0
- oscura/analyzers/statistical/ngrams.py +614 -0
- oscura/analyzers/statistics/__init__.py +119 -0
- oscura/analyzers/statistics/advanced.py +885 -0
- oscura/analyzers/statistics/basic.py +263 -0
- oscura/analyzers/statistics/correlation.py +630 -0
- oscura/analyzers/statistics/distribution.py +298 -0
- oscura/analyzers/statistics/outliers.py +463 -0
- oscura/analyzers/statistics/streaming.py +93 -0
- oscura/analyzers/statistics/trend.py +520 -0
- oscura/analyzers/validation.py +598 -0
- oscura/analyzers/waveform/__init__.py +36 -0
- oscura/analyzers/waveform/measurements.py +943 -0
- oscura/analyzers/waveform/measurements_with_uncertainty.py +371 -0
- oscura/analyzers/waveform/spectral.py +1689 -0
- oscura/analyzers/waveform/wavelets.py +298 -0
- oscura/api/__init__.py +62 -0
- oscura/api/dsl.py +538 -0
- oscura/api/fluent.py +571 -0
- oscura/api/operators.py +498 -0
- oscura/api/optimization.py +392 -0
- oscura/api/profiling.py +396 -0
- oscura/automotive/__init__.py +73 -0
- oscura/automotive/can/__init__.py +52 -0
- oscura/automotive/can/analysis.py +356 -0
- oscura/automotive/can/checksum.py +250 -0
- oscura/automotive/can/correlation.py +212 -0
- oscura/automotive/can/discovery.py +355 -0
- oscura/automotive/can/message_wrapper.py +375 -0
- oscura/automotive/can/models.py +385 -0
- oscura/automotive/can/patterns.py +381 -0
- oscura/automotive/can/session.py +452 -0
- oscura/automotive/can/state_machine.py +300 -0
- oscura/automotive/can/stimulus_response.py +461 -0
- oscura/automotive/dbc/__init__.py +15 -0
- oscura/automotive/dbc/generator.py +156 -0
- oscura/automotive/dbc/parser.py +146 -0
- oscura/automotive/dtc/__init__.py +30 -0
- oscura/automotive/dtc/database.py +3036 -0
- oscura/automotive/j1939/__init__.py +14 -0
- oscura/automotive/j1939/decoder.py +745 -0
- oscura/automotive/loaders/__init__.py +35 -0
- oscura/automotive/loaders/asc.py +98 -0
- oscura/automotive/loaders/blf.py +77 -0
- oscura/automotive/loaders/csv_can.py +136 -0
- oscura/automotive/loaders/dispatcher.py +136 -0
- oscura/automotive/loaders/mdf.py +331 -0
- oscura/automotive/loaders/pcap.py +132 -0
- oscura/automotive/obd/__init__.py +14 -0
- oscura/automotive/obd/decoder.py +707 -0
- oscura/automotive/uds/__init__.py +48 -0
- oscura/automotive/uds/decoder.py +265 -0
- oscura/automotive/uds/models.py +64 -0
- oscura/automotive/visualization.py +369 -0
- oscura/batch/__init__.py +55 -0
- oscura/batch/advanced.py +627 -0
- oscura/batch/aggregate.py +300 -0
- oscura/batch/analyze.py +139 -0
- oscura/batch/logging.py +487 -0
- oscura/batch/metrics.py +556 -0
- oscura/builders/__init__.py +41 -0
- oscura/builders/signal_builder.py +1131 -0
- oscura/cli/__init__.py +14 -0
- oscura/cli/batch.py +339 -0
- oscura/cli/characterize.py +273 -0
- oscura/cli/compare.py +775 -0
- oscura/cli/decode.py +551 -0
- oscura/cli/main.py +247 -0
- oscura/cli/shell.py +350 -0
- oscura/comparison/__init__.py +66 -0
- oscura/comparison/compare.py +397 -0
- oscura/comparison/golden.py +487 -0
- oscura/comparison/limits.py +391 -0
- oscura/comparison/mask.py +434 -0
- oscura/comparison/trace_diff.py +30 -0
- oscura/comparison/visualization.py +481 -0
- oscura/compliance/__init__.py +70 -0
- oscura/compliance/advanced.py +756 -0
- oscura/compliance/masks.py +363 -0
- oscura/compliance/reporting.py +483 -0
- oscura/compliance/testing.py +298 -0
- oscura/component/__init__.py +38 -0
- oscura/component/impedance.py +365 -0
- oscura/component/reactive.py +598 -0
- oscura/component/transmission_line.py +312 -0
- oscura/config/__init__.py +191 -0
- oscura/config/defaults.py +254 -0
- oscura/config/loader.py +348 -0
- oscura/config/memory.py +271 -0
- oscura/config/migration.py +458 -0
- oscura/config/pipeline.py +1077 -0
- oscura/config/preferences.py +530 -0
- oscura/config/protocol.py +875 -0
- oscura/config/schema.py +713 -0
- oscura/config/settings.py +420 -0
- oscura/config/thresholds.py +599 -0
- oscura/convenience.py +457 -0
- oscura/core/__init__.py +299 -0
- oscura/core/audit.py +457 -0
- oscura/core/backend_selector.py +405 -0
- oscura/core/cache.py +590 -0
- oscura/core/cancellation.py +439 -0
- oscura/core/confidence.py +225 -0
- oscura/core/config.py +506 -0
- oscura/core/correlation.py +216 -0
- oscura/core/cross_domain.py +422 -0
- oscura/core/debug.py +301 -0
- oscura/core/edge_cases.py +541 -0
- oscura/core/exceptions.py +535 -0
- oscura/core/gpu_backend.py +523 -0
- oscura/core/lazy.py +832 -0
- oscura/core/log_query.py +540 -0
- oscura/core/logging.py +931 -0
- oscura/core/logging_advanced.py +952 -0
- oscura/core/memoize.py +171 -0
- oscura/core/memory_check.py +274 -0
- oscura/core/memory_guard.py +290 -0
- oscura/core/memory_limits.py +336 -0
- oscura/core/memory_monitor.py +453 -0
- oscura/core/memory_progress.py +465 -0
- oscura/core/memory_warnings.py +315 -0
- oscura/core/numba_backend.py +362 -0
- oscura/core/performance.py +352 -0
- oscura/core/progress.py +524 -0
- oscura/core/provenance.py +358 -0
- oscura/core/results.py +331 -0
- oscura/core/types.py +504 -0
- oscura/core/uncertainty.py +383 -0
- oscura/discovery/__init__.py +52 -0
- oscura/discovery/anomaly_detector.py +672 -0
- oscura/discovery/auto_decoder.py +415 -0
- oscura/discovery/comparison.py +497 -0
- oscura/discovery/quality_validator.py +528 -0
- oscura/discovery/signal_detector.py +769 -0
- oscura/dsl/__init__.py +73 -0
- oscura/dsl/commands.py +246 -0
- oscura/dsl/interpreter.py +455 -0
- oscura/dsl/parser.py +689 -0
- oscura/dsl/repl.py +172 -0
- oscura/exceptions.py +59 -0
- oscura/exploratory/__init__.py +111 -0
- oscura/exploratory/error_recovery.py +642 -0
- oscura/exploratory/fuzzy.py +513 -0
- oscura/exploratory/fuzzy_advanced.py +786 -0
- oscura/exploratory/legacy.py +831 -0
- oscura/exploratory/parse.py +358 -0
- oscura/exploratory/recovery.py +275 -0
- oscura/exploratory/sync.py +382 -0
- oscura/exploratory/unknown.py +707 -0
- oscura/export/__init__.py +25 -0
- oscura/export/wireshark/README.md +265 -0
- oscura/export/wireshark/__init__.py +47 -0
- oscura/export/wireshark/generator.py +312 -0
- oscura/export/wireshark/lua_builder.py +159 -0
- oscura/export/wireshark/templates/dissector.lua.j2 +92 -0
- oscura/export/wireshark/type_mapping.py +165 -0
- oscura/export/wireshark/validator.py +105 -0
- oscura/exporters/__init__.py +94 -0
- oscura/exporters/csv.py +303 -0
- oscura/exporters/exporters.py +44 -0
- oscura/exporters/hdf5.py +219 -0
- oscura/exporters/html_export.py +701 -0
- oscura/exporters/json_export.py +291 -0
- oscura/exporters/markdown_export.py +367 -0
- oscura/exporters/matlab_export.py +354 -0
- oscura/exporters/npz_export.py +219 -0
- oscura/exporters/spice_export.py +210 -0
- oscura/extensibility/__init__.py +131 -0
- oscura/extensibility/docs.py +752 -0
- oscura/extensibility/extensions.py +1125 -0
- oscura/extensibility/logging.py +259 -0
- oscura/extensibility/measurements.py +485 -0
- oscura/extensibility/plugins.py +414 -0
- oscura/extensibility/registry.py +346 -0
- oscura/extensibility/templates.py +913 -0
- oscura/extensibility/validation.py +651 -0
- oscura/filtering/__init__.py +89 -0
- oscura/filtering/base.py +563 -0
- oscura/filtering/convenience.py +564 -0
- oscura/filtering/design.py +725 -0
- oscura/filtering/filters.py +32 -0
- oscura/filtering/introspection.py +605 -0
- oscura/guidance/__init__.py +24 -0
- oscura/guidance/recommender.py +429 -0
- oscura/guidance/wizard.py +518 -0
- oscura/inference/__init__.py +251 -0
- oscura/inference/active_learning/README.md +153 -0
- oscura/inference/active_learning/__init__.py +38 -0
- oscura/inference/active_learning/lstar.py +257 -0
- oscura/inference/active_learning/observation_table.py +230 -0
- oscura/inference/active_learning/oracle.py +78 -0
- oscura/inference/active_learning/teachers/__init__.py +15 -0
- oscura/inference/active_learning/teachers/simulator.py +192 -0
- oscura/inference/adaptive_tuning.py +453 -0
- oscura/inference/alignment.py +653 -0
- oscura/inference/bayesian.py +943 -0
- oscura/inference/binary.py +1016 -0
- oscura/inference/crc_reverse.py +711 -0
- oscura/inference/logic.py +288 -0
- oscura/inference/message_format.py +1305 -0
- oscura/inference/protocol.py +417 -0
- oscura/inference/protocol_dsl.py +1084 -0
- oscura/inference/protocol_library.py +1230 -0
- oscura/inference/sequences.py +809 -0
- oscura/inference/signal_intelligence.py +1509 -0
- oscura/inference/spectral.py +215 -0
- oscura/inference/state_machine.py +634 -0
- oscura/inference/stream.py +918 -0
- oscura/integrations/__init__.py +59 -0
- oscura/integrations/llm.py +1827 -0
- oscura/jupyter/__init__.py +32 -0
- oscura/jupyter/display.py +268 -0
- oscura/jupyter/magic.py +334 -0
- oscura/loaders/__init__.py +526 -0
- oscura/loaders/binary.py +69 -0
- oscura/loaders/configurable.py +1255 -0
- oscura/loaders/csv.py +26 -0
- oscura/loaders/csv_loader.py +473 -0
- oscura/loaders/hdf5.py +9 -0
- oscura/loaders/hdf5_loader.py +510 -0
- oscura/loaders/lazy.py +370 -0
- oscura/loaders/mmap_loader.py +583 -0
- oscura/loaders/numpy_loader.py +436 -0
- oscura/loaders/pcap.py +432 -0
- oscura/loaders/preprocessing.py +368 -0
- oscura/loaders/rigol.py +287 -0
- oscura/loaders/sigrok.py +321 -0
- oscura/loaders/tdms.py +367 -0
- oscura/loaders/tektronix.py +711 -0
- oscura/loaders/validation.py +584 -0
- oscura/loaders/vcd.py +464 -0
- oscura/loaders/wav.py +233 -0
- oscura/math/__init__.py +45 -0
- oscura/math/arithmetic.py +824 -0
- oscura/math/interpolation.py +413 -0
- oscura/onboarding/__init__.py +39 -0
- oscura/onboarding/help.py +498 -0
- oscura/onboarding/tutorials.py +405 -0
- oscura/onboarding/wizard.py +466 -0
- oscura/optimization/__init__.py +19 -0
- oscura/optimization/parallel.py +440 -0
- oscura/optimization/search.py +532 -0
- oscura/pipeline/__init__.py +43 -0
- oscura/pipeline/base.py +338 -0
- oscura/pipeline/composition.py +242 -0
- oscura/pipeline/parallel.py +448 -0
- oscura/pipeline/pipeline.py +375 -0
- oscura/pipeline/reverse_engineering.py +1119 -0
- oscura/plugins/__init__.py +122 -0
- oscura/plugins/base.py +272 -0
- oscura/plugins/cli.py +497 -0
- oscura/plugins/discovery.py +411 -0
- oscura/plugins/isolation.py +418 -0
- oscura/plugins/lifecycle.py +959 -0
- oscura/plugins/manager.py +493 -0
- oscura/plugins/registry.py +421 -0
- oscura/plugins/versioning.py +372 -0
- oscura/py.typed +0 -0
- oscura/quality/__init__.py +65 -0
- oscura/quality/ensemble.py +740 -0
- oscura/quality/explainer.py +338 -0
- oscura/quality/scoring.py +616 -0
- oscura/quality/warnings.py +456 -0
- oscura/reporting/__init__.py +248 -0
- oscura/reporting/advanced.py +1234 -0
- oscura/reporting/analyze.py +448 -0
- oscura/reporting/argument_preparer.py +596 -0
- oscura/reporting/auto_report.py +507 -0
- oscura/reporting/batch.py +615 -0
- oscura/reporting/chart_selection.py +223 -0
- oscura/reporting/comparison.py +330 -0
- oscura/reporting/config.py +615 -0
- oscura/reporting/content/__init__.py +39 -0
- oscura/reporting/content/executive.py +127 -0
- oscura/reporting/content/filtering.py +191 -0
- oscura/reporting/content/minimal.py +257 -0
- oscura/reporting/content/verbosity.py +162 -0
- oscura/reporting/core.py +508 -0
- oscura/reporting/core_formats/__init__.py +17 -0
- oscura/reporting/core_formats/multi_format.py +210 -0
- oscura/reporting/engine.py +836 -0
- oscura/reporting/export.py +366 -0
- oscura/reporting/formatting/__init__.py +129 -0
- oscura/reporting/formatting/emphasis.py +81 -0
- oscura/reporting/formatting/numbers.py +403 -0
- oscura/reporting/formatting/standards.py +55 -0
- oscura/reporting/formatting.py +466 -0
- oscura/reporting/html.py +578 -0
- oscura/reporting/index.py +590 -0
- oscura/reporting/multichannel.py +296 -0
- oscura/reporting/output.py +379 -0
- oscura/reporting/pdf.py +373 -0
- oscura/reporting/plots.py +731 -0
- oscura/reporting/pptx_export.py +360 -0
- oscura/reporting/renderers/__init__.py +11 -0
- oscura/reporting/renderers/pdf.py +94 -0
- oscura/reporting/sections.py +471 -0
- oscura/reporting/standards.py +680 -0
- oscura/reporting/summary_generator.py +368 -0
- oscura/reporting/tables.py +397 -0
- oscura/reporting/template_system.py +724 -0
- oscura/reporting/templates/__init__.py +15 -0
- oscura/reporting/templates/definition.py +205 -0
- oscura/reporting/templates/index.html +649 -0
- oscura/reporting/templates/index.md +173 -0
- oscura/schemas/__init__.py +158 -0
- oscura/schemas/bus_configuration.json +322 -0
- oscura/schemas/device_mapping.json +182 -0
- oscura/schemas/packet_format.json +418 -0
- oscura/schemas/protocol_definition.json +363 -0
- oscura/search/__init__.py +16 -0
- oscura/search/anomaly.py +292 -0
- oscura/search/context.py +149 -0
- oscura/search/pattern.py +160 -0
- oscura/session/__init__.py +34 -0
- oscura/session/annotations.py +289 -0
- oscura/session/history.py +313 -0
- oscura/session/session.py +445 -0
- oscura/streaming/__init__.py +43 -0
- oscura/streaming/chunked.py +611 -0
- oscura/streaming/progressive.py +393 -0
- oscura/streaming/realtime.py +622 -0
- oscura/testing/__init__.py +54 -0
- oscura/testing/synthetic.py +808 -0
- oscura/triggering/__init__.py +68 -0
- oscura/triggering/base.py +229 -0
- oscura/triggering/edge.py +353 -0
- oscura/triggering/pattern.py +344 -0
- oscura/triggering/pulse.py +581 -0
- oscura/triggering/window.py +453 -0
- oscura/ui/__init__.py +48 -0
- oscura/ui/formatters.py +526 -0
- oscura/ui/progressive_display.py +340 -0
- oscura/utils/__init__.py +99 -0
- oscura/utils/autodetect.py +338 -0
- oscura/utils/buffer.py +389 -0
- oscura/utils/lazy.py +407 -0
- oscura/utils/lazy_imports.py +147 -0
- oscura/utils/memory.py +836 -0
- oscura/utils/memory_advanced.py +1326 -0
- oscura/utils/memory_extensions.py +465 -0
- oscura/utils/progressive.py +352 -0
- oscura/utils/windowing.py +362 -0
- oscura/visualization/__init__.py +321 -0
- oscura/visualization/accessibility.py +526 -0
- oscura/visualization/annotations.py +374 -0
- oscura/visualization/axis_scaling.py +305 -0
- oscura/visualization/colors.py +453 -0
- oscura/visualization/digital.py +337 -0
- oscura/visualization/eye.py +420 -0
- oscura/visualization/histogram.py +281 -0
- oscura/visualization/interactive.py +858 -0
- oscura/visualization/jitter.py +702 -0
- oscura/visualization/keyboard.py +394 -0
- oscura/visualization/layout.py +365 -0
- oscura/visualization/optimization.py +1028 -0
- oscura/visualization/palettes.py +446 -0
- oscura/visualization/plot.py +92 -0
- oscura/visualization/power.py +290 -0
- oscura/visualization/power_extended.py +626 -0
- oscura/visualization/presets.py +467 -0
- oscura/visualization/protocols.py +932 -0
- oscura/visualization/render.py +207 -0
- oscura/visualization/rendering.py +444 -0
- oscura/visualization/reverse_engineering.py +791 -0
- oscura/visualization/signal_integrity.py +808 -0
- oscura/visualization/specialized.py +553 -0
- oscura/visualization/spectral.py +811 -0
- oscura/visualization/styles.py +381 -0
- oscura/visualization/thumbnails.py +311 -0
- oscura/visualization/time_axis.py +351 -0
- oscura/visualization/waveform.py +367 -0
- oscura/workflow/__init__.py +13 -0
- oscura/workflow/dag.py +377 -0
- oscura/workflows/__init__.py +58 -0
- oscura/workflows/compliance.py +280 -0
- oscura/workflows/digital.py +272 -0
- oscura/workflows/multi_trace.py +502 -0
- oscura/workflows/power.py +178 -0
- oscura/workflows/protocol.py +492 -0
- oscura/workflows/reverse_engineering.py +639 -0
- oscura/workflows/signal_integrity.py +227 -0
- oscura-0.1.1.dist-info/METADATA +300 -0
- oscura-0.1.1.dist-info/RECORD +463 -0
- oscura-0.1.1.dist-info/entry_points.txt +2 -0
- {oscura-0.0.1.dist-info → oscura-0.1.1.dist-info}/licenses/LICENSE +1 -1
- oscura-0.0.1.dist-info/METADATA +0 -63
- oscura-0.0.1.dist-info/RECORD +0 -5
- {oscura-0.0.1.dist-info → oscura-0.1.1.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,1305 @@
|
|
|
1
|
+
"""Message format inference using statistical analysis.
|
|
2
|
+
|
|
3
|
+
Requirements addressed: PSI-001
|
|
4
|
+
|
|
5
|
+
This module automatically infers message field structure from collections of
|
|
6
|
+
similar messages for protocol reverse engineering.
|
|
7
|
+
|
|
8
|
+
Key capabilities:
|
|
9
|
+
- Detect field boundaries via entropy transitions
|
|
10
|
+
- Classify fields as constant, variable, or sequential
|
|
11
|
+
- Infer field types (integer, counter, timestamp, checksum)
|
|
12
|
+
- Detect field dependencies (length fields, checksums)
|
|
13
|
+
- Generate message format specifications
|
|
14
|
+
- Voting expert ensemble for improved boundary detection (IPART-style)
|
|
15
|
+
|
|
16
|
+
References:
|
|
17
|
+
IPART: IP Packet Analysis using Random Forests. IEEE ISSRE 2014.
|
|
18
|
+
Discoverer: Automatic Protocol Reverse Engineering. USENIX Security 2007.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from dataclasses import dataclass
|
|
22
|
+
from dataclasses import field as dataclass_field
|
|
23
|
+
from typing import Any, Literal
|
|
24
|
+
|
|
25
|
+
import numpy as np
|
|
26
|
+
from numpy.typing import NDArray
|
|
27
|
+
|
|
28
|
+
from oscura.inference.alignment import align_local
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@dataclass
|
|
32
|
+
class InferredField:
|
|
33
|
+
"""An inferred message field.
|
|
34
|
+
|
|
35
|
+
: Field classification and type inference.
|
|
36
|
+
|
|
37
|
+
Attributes:
|
|
38
|
+
name: Auto-generated field name
|
|
39
|
+
offset: Byte offset from message start
|
|
40
|
+
size: Field size in bytes
|
|
41
|
+
field_type: Inferred field type classification
|
|
42
|
+
entropy: Shannon entropy of field values
|
|
43
|
+
variance: Statistical variance of field values
|
|
44
|
+
confidence: Confidence score (0-1) for type inference
|
|
45
|
+
values_seen: Sample values for validation
|
|
46
|
+
evidence: Evidence from each expert (for ensemble methods)
|
|
47
|
+
"""
|
|
48
|
+
|
|
49
|
+
name: str
|
|
50
|
+
offset: int
|
|
51
|
+
size: int
|
|
52
|
+
field_type: Literal["constant", "counter", "timestamp", "length", "checksum", "data", "unknown"]
|
|
53
|
+
entropy: float
|
|
54
|
+
variance: float
|
|
55
|
+
confidence: float
|
|
56
|
+
values_seen: list[Any] = dataclass_field(default_factory=list) # Sample values
|
|
57
|
+
evidence: dict[str, bool] = dataclass_field(default_factory=dict) # Expert evidence
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
@dataclass
|
|
61
|
+
class MessageSchema:
|
|
62
|
+
"""Inferred message format schema.
|
|
63
|
+
|
|
64
|
+
: Complete message format specification.
|
|
65
|
+
|
|
66
|
+
Attributes:
|
|
67
|
+
total_size: Total message size in bytes
|
|
68
|
+
fields: List of inferred fields
|
|
69
|
+
field_boundaries: Byte offsets of field starts
|
|
70
|
+
header_size: Detected header size
|
|
71
|
+
payload_offset: Start of payload region
|
|
72
|
+
checksum_field: Detected checksum field if any
|
|
73
|
+
length_field: Detected length field if any
|
|
74
|
+
"""
|
|
75
|
+
|
|
76
|
+
total_size: int
|
|
77
|
+
fields: list[InferredField]
|
|
78
|
+
field_boundaries: list[int] # Byte offsets of field starts
|
|
79
|
+
header_size: int # Detected header size
|
|
80
|
+
payload_offset: int
|
|
81
|
+
checksum_field: InferredField | None
|
|
82
|
+
length_field: InferredField | None
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
class MessageFormatInferrer:
|
|
86
|
+
"""Infer message format from samples.
|
|
87
|
+
|
|
88
|
+
: Message format inference using entropy and variance analysis.
|
|
89
|
+
|
|
90
|
+
Algorithm:
|
|
91
|
+
1. Detect field boundaries using entropy transitions
|
|
92
|
+
2. Classify fields based on statistical patterns
|
|
93
|
+
3. Detect dependencies between fields
|
|
94
|
+
4. Generate complete schema
|
|
95
|
+
"""
|
|
96
|
+
|
|
97
|
+
def __init__(self, min_samples: int = 10):
|
|
98
|
+
"""Initialize inferrer.
|
|
99
|
+
|
|
100
|
+
Args:
|
|
101
|
+
min_samples: Minimum number of message samples required
|
|
102
|
+
"""
|
|
103
|
+
self.min_samples = min_samples
|
|
104
|
+
|
|
105
|
+
def infer_format(self, messages: list[bytes | NDArray[np.uint8]]) -> MessageSchema:
|
|
106
|
+
"""Infer message format from collection of similar messages.
|
|
107
|
+
|
|
108
|
+
: Complete format inference workflow.
|
|
109
|
+
|
|
110
|
+
Args:
|
|
111
|
+
messages: List of message samples (bytes or np.ndarray)
|
|
112
|
+
|
|
113
|
+
Returns:
|
|
114
|
+
MessageSchema with inferred field structure
|
|
115
|
+
|
|
116
|
+
Raises:
|
|
117
|
+
ValueError: If insufficient samples or invalid input
|
|
118
|
+
"""
|
|
119
|
+
if len(messages) < self.min_samples:
|
|
120
|
+
raise ValueError(f"Need at least {self.min_samples} messages, got {len(messages)}")
|
|
121
|
+
|
|
122
|
+
# Convert to numpy arrays for processing
|
|
123
|
+
msg_arrays = []
|
|
124
|
+
for msg in messages:
|
|
125
|
+
if isinstance(msg, bytes):
|
|
126
|
+
msg_arrays.append(np.frombuffer(msg, dtype=np.uint8))
|
|
127
|
+
elif isinstance(msg, np.ndarray):
|
|
128
|
+
msg_arrays.append(msg.astype(np.uint8))
|
|
129
|
+
else:
|
|
130
|
+
raise ValueError(f"Invalid message type: {type(msg)}")
|
|
131
|
+
|
|
132
|
+
# Check all messages are same length
|
|
133
|
+
lengths = [len(m) for m in msg_arrays]
|
|
134
|
+
if len(set(lengths)) > 1:
|
|
135
|
+
raise ValueError(f"Messages have varying lengths: {set(lengths)}")
|
|
136
|
+
|
|
137
|
+
msg_len = lengths[0]
|
|
138
|
+
|
|
139
|
+
# Detect field boundaries
|
|
140
|
+
boundaries = self.detect_field_boundaries(msg_arrays, method="combined")
|
|
141
|
+
|
|
142
|
+
# Detect field types
|
|
143
|
+
fields = self.detect_field_types(msg_arrays, boundaries)
|
|
144
|
+
|
|
145
|
+
# Determine header size (first high-entropy transition or first 4 fields)
|
|
146
|
+
header_size = self._estimate_header_size(fields)
|
|
147
|
+
|
|
148
|
+
# Find checksum and length fields
|
|
149
|
+
checksum_field = None
|
|
150
|
+
length_field = None
|
|
151
|
+
|
|
152
|
+
for f in fields:
|
|
153
|
+
if f.field_type == "checksum":
|
|
154
|
+
checksum_field = f
|
|
155
|
+
elif f.field_type == "length":
|
|
156
|
+
length_field = f
|
|
157
|
+
|
|
158
|
+
# Payload starts after header
|
|
159
|
+
payload_offset = header_size
|
|
160
|
+
|
|
161
|
+
schema = MessageSchema(
|
|
162
|
+
total_size=msg_len,
|
|
163
|
+
fields=fields,
|
|
164
|
+
field_boundaries=boundaries,
|
|
165
|
+
header_size=header_size,
|
|
166
|
+
payload_offset=payload_offset,
|
|
167
|
+
checksum_field=checksum_field,
|
|
168
|
+
length_field=length_field,
|
|
169
|
+
)
|
|
170
|
+
|
|
171
|
+
return schema
|
|
172
|
+
|
|
173
|
+
def detect_field_boundaries(
|
|
174
|
+
self,
|
|
175
|
+
messages: list[NDArray[np.uint8]],
|
|
176
|
+
method: Literal["entropy", "variance", "combined"] = "combined",
|
|
177
|
+
) -> list[int]:
|
|
178
|
+
"""Detect field boundaries using entropy transitions.
|
|
179
|
+
|
|
180
|
+
: Boundary detection via statistical transitions.
|
|
181
|
+
|
|
182
|
+
Args:
|
|
183
|
+
messages: List of message arrays
|
|
184
|
+
method: Detection method ('entropy', 'variance', or 'combined')
|
|
185
|
+
|
|
186
|
+
Returns:
|
|
187
|
+
List of byte offsets marking field starts (always includes 0)
|
|
188
|
+
"""
|
|
189
|
+
if not messages:
|
|
190
|
+
return [0]
|
|
191
|
+
|
|
192
|
+
msg_len = len(messages[0])
|
|
193
|
+
boundaries = [0] # Always start at offset 0
|
|
194
|
+
|
|
195
|
+
if method in ["entropy", "combined"]:
|
|
196
|
+
# Calculate entropy at each byte position
|
|
197
|
+
entropies = []
|
|
198
|
+
for offset in range(msg_len):
|
|
199
|
+
entropy = self._calculate_byte_entropy(messages, offset)
|
|
200
|
+
entropies.append(entropy)
|
|
201
|
+
|
|
202
|
+
# Find transitions (entropy changes > threshold)
|
|
203
|
+
entropy_threshold = 1.5 # bits
|
|
204
|
+
for i in range(1, len(entropies)):
|
|
205
|
+
delta = abs(entropies[i] - entropies[i - 1])
|
|
206
|
+
if delta > entropy_threshold and i not in boundaries:
|
|
207
|
+
boundaries.append(i)
|
|
208
|
+
|
|
209
|
+
if method in ["variance", "combined"]:
|
|
210
|
+
# Calculate variance at each byte position
|
|
211
|
+
variances = []
|
|
212
|
+
for offset in range(msg_len):
|
|
213
|
+
values = [msg[offset] for msg in messages]
|
|
214
|
+
variance = np.var(values)
|
|
215
|
+
variances.append(variance)
|
|
216
|
+
|
|
217
|
+
# Find variance transitions
|
|
218
|
+
var_threshold = 1000.0
|
|
219
|
+
for i in range(1, len(variances)):
|
|
220
|
+
delta = abs(variances[i] - variances[i - 1])
|
|
221
|
+
if delta > var_threshold and i not in boundaries:
|
|
222
|
+
boundaries.append(i)
|
|
223
|
+
|
|
224
|
+
# Sort and ensure we don't have too many tiny fields
|
|
225
|
+
boundaries = sorted(set(boundaries))
|
|
226
|
+
|
|
227
|
+
# Merge boundaries that are too close (< 2 bytes apart)
|
|
228
|
+
merged = [boundaries[0]]
|
|
229
|
+
for b in boundaries[1:]:
|
|
230
|
+
if b - merged[-1] >= 2:
|
|
231
|
+
merged.append(b)
|
|
232
|
+
|
|
233
|
+
return merged
|
|
234
|
+
|
|
235
|
+
def detect_boundaries_voting(
|
|
236
|
+
self,
|
|
237
|
+
messages: list[bytes],
|
|
238
|
+
min_confidence: float = 0.6,
|
|
239
|
+
) -> list[int]:
|
|
240
|
+
"""Detect field boundaries using voting expert algorithm.
|
|
241
|
+
|
|
242
|
+
: IPART-style voting expert for boundary detection.
|
|
243
|
+
|
|
244
|
+
Combines multiple detection strategies:
|
|
245
|
+
1. Entropy-based detection
|
|
246
|
+
2. Alignment-based detection (Smith-Waterman)
|
|
247
|
+
3. Statistical variance detection
|
|
248
|
+
4. Byte value distribution analysis
|
|
249
|
+
5. N-gram frequency analysis
|
|
250
|
+
|
|
251
|
+
Each "expert" votes on likely boundaries. Boundaries with
|
|
252
|
+
votes >= min_confidence threshold are returned.
|
|
253
|
+
|
|
254
|
+
Args:
|
|
255
|
+
messages: List of protocol messages (bytes)
|
|
256
|
+
min_confidence: Minimum vote fraction to accept boundary (0.0-1.0)
|
|
257
|
+
|
|
258
|
+
Returns:
|
|
259
|
+
List of byte positions that are likely field boundaries
|
|
260
|
+
|
|
261
|
+
References:
|
|
262
|
+
IPART: IP Packet Analysis using Random Forests.
|
|
263
|
+
IEEE ISSRE 2014.
|
|
264
|
+
"""
|
|
265
|
+
if not messages:
|
|
266
|
+
return [0]
|
|
267
|
+
|
|
268
|
+
# Convert to numpy arrays for processing
|
|
269
|
+
msg_arrays = []
|
|
270
|
+
for msg in messages:
|
|
271
|
+
msg_arrays.append(np.frombuffer(msg, dtype=np.uint8))
|
|
272
|
+
|
|
273
|
+
# Run each expert
|
|
274
|
+
experts = [
|
|
275
|
+
self._expert_entropy(msg_arrays),
|
|
276
|
+
self._expert_alignment(messages),
|
|
277
|
+
self._expert_variance(msg_arrays),
|
|
278
|
+
self._expert_distribution(msg_arrays),
|
|
279
|
+
self._expert_ngrams(msg_arrays, n=2),
|
|
280
|
+
]
|
|
281
|
+
|
|
282
|
+
num_experts = len(experts)
|
|
283
|
+
|
|
284
|
+
# Collect all possible boundary positions
|
|
285
|
+
all_boundaries = set()
|
|
286
|
+
for expert_boundaries in experts:
|
|
287
|
+
all_boundaries.update(expert_boundaries)
|
|
288
|
+
|
|
289
|
+
# Count votes for each boundary
|
|
290
|
+
boundary_votes: dict[int, int] = {}
|
|
291
|
+
for boundary in all_boundaries:
|
|
292
|
+
votes = sum(1 for expert in experts if boundary in expert)
|
|
293
|
+
boundary_votes[boundary] = votes
|
|
294
|
+
|
|
295
|
+
# Filter by confidence threshold
|
|
296
|
+
min_votes = int(num_experts * min_confidence)
|
|
297
|
+
accepted_boundaries = [pos for pos, votes in boundary_votes.items() if votes >= min_votes]
|
|
298
|
+
|
|
299
|
+
# Always include position 0
|
|
300
|
+
if 0 not in accepted_boundaries:
|
|
301
|
+
accepted_boundaries.append(0)
|
|
302
|
+
|
|
303
|
+
# Sort and merge close boundaries
|
|
304
|
+
accepted_boundaries = sorted(accepted_boundaries)
|
|
305
|
+
|
|
306
|
+
# Merge boundaries that are too close (< 2 bytes apart)
|
|
307
|
+
merged = [accepted_boundaries[0]]
|
|
308
|
+
for b in accepted_boundaries[1:]:
|
|
309
|
+
if b - merged[-1] >= 2:
|
|
310
|
+
merged.append(b)
|
|
311
|
+
|
|
312
|
+
return merged
|
|
313
|
+
|
|
314
|
+
def _expert_entropy(self, messages: list[NDArray[np.uint8]]) -> set[int]:
|
|
315
|
+
"""Detect boundaries based on entropy changes.
|
|
316
|
+
|
|
317
|
+
: Entropy-based boundary expert.
|
|
318
|
+
|
|
319
|
+
Args:
|
|
320
|
+
messages: List of message arrays
|
|
321
|
+
|
|
322
|
+
Returns:
|
|
323
|
+
Set of boundary positions
|
|
324
|
+
"""
|
|
325
|
+
if not messages:
|
|
326
|
+
return {0}
|
|
327
|
+
|
|
328
|
+
msg_len = len(messages[0])
|
|
329
|
+
boundaries = {0}
|
|
330
|
+
|
|
331
|
+
# Calculate entropy at each byte position
|
|
332
|
+
entropies = []
|
|
333
|
+
for offset in range(msg_len):
|
|
334
|
+
entropy = self._calculate_byte_entropy(messages, offset)
|
|
335
|
+
entropies.append(entropy)
|
|
336
|
+
|
|
337
|
+
# Find transitions (entropy changes > threshold)
|
|
338
|
+
entropy_threshold = 1.5 # bits
|
|
339
|
+
for i in range(1, len(entropies)):
|
|
340
|
+
delta = abs(entropies[i] - entropies[i - 1])
|
|
341
|
+
if delta > entropy_threshold:
|
|
342
|
+
boundaries.add(i)
|
|
343
|
+
|
|
344
|
+
return boundaries
|
|
345
|
+
|
|
346
|
+
def _expert_alignment(self, messages: list[bytes]) -> set[int]:
|
|
347
|
+
"""Detect boundaries using Smith-Waterman alignment.
|
|
348
|
+
|
|
349
|
+
: Alignment-based boundary expert.
|
|
350
|
+
|
|
351
|
+
Uses local alignment to find conserved vs. variable regions.
|
|
352
|
+
Transitions between regions indicate likely boundaries.
|
|
353
|
+
|
|
354
|
+
Args:
|
|
355
|
+
messages: List of protocol messages
|
|
356
|
+
|
|
357
|
+
Returns:
|
|
358
|
+
Set of boundary positions
|
|
359
|
+
"""
|
|
360
|
+
if len(messages) < 2:
|
|
361
|
+
return {0}
|
|
362
|
+
|
|
363
|
+
boundaries = {0}
|
|
364
|
+
|
|
365
|
+
# Compare first message to several others
|
|
366
|
+
num_comparisons = min(5, len(messages) - 1)
|
|
367
|
+
for i in range(1, num_comparisons + 1):
|
|
368
|
+
result = align_local(messages[0], messages[i])
|
|
369
|
+
|
|
370
|
+
# Boundaries at transitions between conserved and variable regions
|
|
371
|
+
for start, _end in result.conserved_regions:
|
|
372
|
+
if start > 0:
|
|
373
|
+
boundaries.add(start)
|
|
374
|
+
|
|
375
|
+
for start, _end in result.variable_regions:
|
|
376
|
+
if start > 0:
|
|
377
|
+
boundaries.add(start)
|
|
378
|
+
|
|
379
|
+
return boundaries
|
|
380
|
+
|
|
381
|
+
def _expert_variance(self, messages: list[NDArray[np.uint8]]) -> set[int]:
|
|
382
|
+
"""Detect boundaries based on statistical variance.
|
|
383
|
+
|
|
384
|
+
: Variance-based boundary expert.
|
|
385
|
+
|
|
386
|
+
Args:
|
|
387
|
+
messages: List of message arrays
|
|
388
|
+
|
|
389
|
+
Returns:
|
|
390
|
+
Set of boundary positions
|
|
391
|
+
"""
|
|
392
|
+
if not messages:
|
|
393
|
+
return {0}
|
|
394
|
+
|
|
395
|
+
msg_len = len(messages[0])
|
|
396
|
+
boundaries = {0}
|
|
397
|
+
|
|
398
|
+
# Calculate variance at each byte position
|
|
399
|
+
variances = []
|
|
400
|
+
for offset in range(msg_len):
|
|
401
|
+
values = [msg[offset] for msg in messages]
|
|
402
|
+
variance = np.var(values)
|
|
403
|
+
variances.append(variance)
|
|
404
|
+
|
|
405
|
+
# Find variance transitions
|
|
406
|
+
var_threshold = 1000.0
|
|
407
|
+
for i in range(1, len(variances)):
|
|
408
|
+
delta = abs(variances[i] - variances[i - 1])
|
|
409
|
+
if delta > var_threshold:
|
|
410
|
+
boundaries.add(i)
|
|
411
|
+
|
|
412
|
+
return boundaries
|
|
413
|
+
|
|
414
|
+
def _expert_distribution(self, messages: list[NDArray[np.uint8]]) -> set[int]:
|
|
415
|
+
"""Detect boundaries from byte value distribution changes.
|
|
416
|
+
|
|
417
|
+
: Distribution-based boundary expert.
|
|
418
|
+
|
|
419
|
+
Analyzes how the distribution of byte values changes
|
|
420
|
+
across positions. Sharp changes suggest boundaries.
|
|
421
|
+
|
|
422
|
+
Args:
|
|
423
|
+
messages: List of message arrays
|
|
424
|
+
|
|
425
|
+
Returns:
|
|
426
|
+
Set of boundary positions
|
|
427
|
+
"""
|
|
428
|
+
if not messages:
|
|
429
|
+
return {0}
|
|
430
|
+
|
|
431
|
+
msg_len = len(messages[0])
|
|
432
|
+
boundaries = {0}
|
|
433
|
+
|
|
434
|
+
# Calculate distribution metrics at each position
|
|
435
|
+
distributions = []
|
|
436
|
+
for offset in range(msg_len):
|
|
437
|
+
values = [msg[offset] for msg in messages]
|
|
438
|
+
# Use unique count as distribution metric
|
|
439
|
+
unique_count = len(set(values))
|
|
440
|
+
distributions.append(unique_count)
|
|
441
|
+
|
|
442
|
+
# Find sharp changes in distribution
|
|
443
|
+
for i in range(1, len(distributions)):
|
|
444
|
+
# Ratio of change
|
|
445
|
+
if distributions[i - 1] > 0:
|
|
446
|
+
ratio = distributions[i] / distributions[i - 1]
|
|
447
|
+
# Significant change (>2x or <0.5x)
|
|
448
|
+
if ratio > 2.0 or ratio < 0.5:
|
|
449
|
+
boundaries.add(i)
|
|
450
|
+
|
|
451
|
+
return boundaries
|
|
452
|
+
|
|
453
|
+
def _expert_ngrams(self, messages: list[NDArray[np.uint8]], n: int = 2) -> set[int]:
|
|
454
|
+
"""Detect boundaries using n-gram frequency analysis.
|
|
455
|
+
|
|
456
|
+
: N-gram based boundary expert.
|
|
457
|
+
|
|
458
|
+
Analyzes how n-gram patterns change across positions.
|
|
459
|
+
Different n-gram distributions suggest different fields.
|
|
460
|
+
|
|
461
|
+
Args:
|
|
462
|
+
messages: List of message arrays
|
|
463
|
+
n: N-gram size (default: 2)
|
|
464
|
+
|
|
465
|
+
Returns:
|
|
466
|
+
Set of boundary positions
|
|
467
|
+
"""
|
|
468
|
+
if not messages or len(messages[0]) < n:
|
|
469
|
+
return {0}
|
|
470
|
+
|
|
471
|
+
msg_len = len(messages[0])
|
|
472
|
+
boundaries = {0}
|
|
473
|
+
|
|
474
|
+
# Collect n-grams at each position
|
|
475
|
+
ngram_sets = []
|
|
476
|
+
for offset in range(msg_len - n + 1):
|
|
477
|
+
ngrams = set()
|
|
478
|
+
for msg in messages:
|
|
479
|
+
if offset + n <= len(msg):
|
|
480
|
+
ngram = tuple(msg[offset : offset + n])
|
|
481
|
+
ngrams.add(ngram)
|
|
482
|
+
ngram_sets.append(ngrams)
|
|
483
|
+
|
|
484
|
+
# Find positions where n-gram patterns change significantly
|
|
485
|
+
for i in range(1, len(ngram_sets)):
|
|
486
|
+
# Calculate Jaccard similarity between adjacent positions
|
|
487
|
+
set1 = ngram_sets[i - 1]
|
|
488
|
+
set2 = ngram_sets[i]
|
|
489
|
+
|
|
490
|
+
if len(set1) == 0 or len(set2) == 0:
|
|
491
|
+
continue
|
|
492
|
+
|
|
493
|
+
intersection = len(set1 & set2)
|
|
494
|
+
union = len(set1 | set2)
|
|
495
|
+
|
|
496
|
+
if union > 0:
|
|
497
|
+
similarity = intersection / union
|
|
498
|
+
# Low similarity suggests boundary
|
|
499
|
+
if similarity < 0.3:
|
|
500
|
+
boundaries.add(i)
|
|
501
|
+
|
|
502
|
+
return boundaries
|
|
503
|
+
|
|
504
|
+
def infer_format_ensemble(
|
|
505
|
+
self,
|
|
506
|
+
messages: list[bytes | NDArray[np.uint8]],
|
|
507
|
+
min_field_confidence: float = 0.6,
|
|
508
|
+
min_boundary_confidence: float = 0.6,
|
|
509
|
+
) -> MessageSchema:
|
|
510
|
+
"""Infer message format using ensemble of techniques.
|
|
511
|
+
|
|
512
|
+
: Ensemble-based format inference with confidence scoring.
|
|
513
|
+
|
|
514
|
+
Combines:
|
|
515
|
+
- Voting expert for boundary detection
|
|
516
|
+
- Multiple field type detectors
|
|
517
|
+
- Confidence scoring for each field
|
|
518
|
+
|
|
519
|
+
Args:
|
|
520
|
+
messages: List of protocol messages
|
|
521
|
+
min_field_confidence: Minimum confidence to include field
|
|
522
|
+
min_boundary_confidence: Minimum confidence for boundaries
|
|
523
|
+
|
|
524
|
+
Returns:
|
|
525
|
+
Message schema with confidence-scored fields
|
|
526
|
+
|
|
527
|
+
Raises:
|
|
528
|
+
ValueError: If insufficient messages provided
|
|
529
|
+
"""
|
|
530
|
+
if len(messages) < self.min_samples:
|
|
531
|
+
raise ValueError(f"Need at least {self.min_samples} messages, got {len(messages)}")
|
|
532
|
+
|
|
533
|
+
# Convert all to bytes for voting
|
|
534
|
+
bytes_messages = []
|
|
535
|
+
for msg in messages:
|
|
536
|
+
if isinstance(msg, bytes):
|
|
537
|
+
bytes_messages.append(msg)
|
|
538
|
+
elif isinstance(msg, np.ndarray):
|
|
539
|
+
bytes_messages.append(msg.tobytes())
|
|
540
|
+
else:
|
|
541
|
+
raise ValueError(f"Invalid message type: {type(msg)}")
|
|
542
|
+
|
|
543
|
+
# Check all messages are same length
|
|
544
|
+
lengths = [len(m) for m in bytes_messages]
|
|
545
|
+
if len(set(lengths)) > 1:
|
|
546
|
+
raise ValueError(f"Messages have varying lengths: {set(lengths)}")
|
|
547
|
+
|
|
548
|
+
msg_len = lengths[0]
|
|
549
|
+
|
|
550
|
+
# Detect boundaries with voting
|
|
551
|
+
boundaries = self.detect_boundaries_voting(
|
|
552
|
+
bytes_messages, min_confidence=min_boundary_confidence
|
|
553
|
+
)
|
|
554
|
+
|
|
555
|
+
# Convert to numpy arrays for field type detection
|
|
556
|
+
msg_arrays = []
|
|
557
|
+
for msg in bytes_messages:
|
|
558
|
+
msg_arrays.append(np.frombuffer(msg, dtype=np.uint8))
|
|
559
|
+
|
|
560
|
+
# Extract field candidates
|
|
561
|
+
fields: list[InferredField] = []
|
|
562
|
+
for i in range(len(boundaries)):
|
|
563
|
+
offset = boundaries[i]
|
|
564
|
+
|
|
565
|
+
# Determine field size
|
|
566
|
+
if i < len(boundaries) - 1:
|
|
567
|
+
size = boundaries[i + 1] - offset
|
|
568
|
+
else:
|
|
569
|
+
size = msg_len - offset
|
|
570
|
+
|
|
571
|
+
# Extract field values
|
|
572
|
+
field_data = self._extract_field_data(msg_arrays, offset, size)
|
|
573
|
+
|
|
574
|
+
# Run multiple field type detectors
|
|
575
|
+
entropy_type, entropy_conf = self._detect_type_entropy(field_data)
|
|
576
|
+
pattern_type, pattern_conf = self._detect_type_patterns(
|
|
577
|
+
field_data, offset, size, msg_len
|
|
578
|
+
)
|
|
579
|
+
stats_type, stats_conf = self._detect_type_statistics(field_data)
|
|
580
|
+
|
|
581
|
+
# Vote on field type
|
|
582
|
+
field_type, confidence, evidence = self._vote_field_type(
|
|
583
|
+
[
|
|
584
|
+
(entropy_type, entropy_conf),
|
|
585
|
+
(pattern_type, pattern_conf),
|
|
586
|
+
(stats_type, stats_conf),
|
|
587
|
+
]
|
|
588
|
+
)
|
|
589
|
+
|
|
590
|
+
if confidence >= min_field_confidence:
|
|
591
|
+
# Sample values (first 5)
|
|
592
|
+
sample_values = field_data["values"][:5]
|
|
593
|
+
|
|
594
|
+
field_obj = InferredField(
|
|
595
|
+
name=f"field_{len(fields)}",
|
|
596
|
+
offset=offset,
|
|
597
|
+
size=size,
|
|
598
|
+
field_type=field_type, # type: ignore[arg-type]
|
|
599
|
+
entropy=float(field_data["entropy"]),
|
|
600
|
+
variance=float(field_data["variance"]),
|
|
601
|
+
confidence=confidence,
|
|
602
|
+
values_seen=sample_values,
|
|
603
|
+
evidence=evidence,
|
|
604
|
+
)
|
|
605
|
+
|
|
606
|
+
fields.append(field_obj)
|
|
607
|
+
|
|
608
|
+
# Determine header size
|
|
609
|
+
header_size = self._estimate_header_size(fields)
|
|
610
|
+
|
|
611
|
+
# Find checksum and length fields
|
|
612
|
+
checksum_field = None
|
|
613
|
+
length_field = None
|
|
614
|
+
|
|
615
|
+
for f in fields:
|
|
616
|
+
if f.field_type == "checksum":
|
|
617
|
+
checksum_field = f
|
|
618
|
+
elif f.field_type == "length":
|
|
619
|
+
length_field = f
|
|
620
|
+
|
|
621
|
+
# Payload starts after header
|
|
622
|
+
payload_offset = header_size
|
|
623
|
+
|
|
624
|
+
schema = MessageSchema(
|
|
625
|
+
total_size=msg_len,
|
|
626
|
+
fields=fields,
|
|
627
|
+
field_boundaries=boundaries,
|
|
628
|
+
header_size=header_size,
|
|
629
|
+
payload_offset=payload_offset,
|
|
630
|
+
checksum_field=checksum_field,
|
|
631
|
+
length_field=length_field,
|
|
632
|
+
)
|
|
633
|
+
|
|
634
|
+
return schema
|
|
635
|
+
|
|
636
|
+
def _extract_field_data(
|
|
637
|
+
self, messages: list[NDArray[np.uint8]], offset: int, size: int
|
|
638
|
+
) -> dict[str, Any]:
|
|
639
|
+
"""Extract field data for type detection.
|
|
640
|
+
|
|
641
|
+
Args:
|
|
642
|
+
messages: List of message arrays
|
|
643
|
+
offset: Field offset
|
|
644
|
+
size: Field size
|
|
645
|
+
|
|
646
|
+
Returns:
|
|
647
|
+
Dictionary with field values and statistics
|
|
648
|
+
"""
|
|
649
|
+
values: list[int | tuple[int, ...]]
|
|
650
|
+
if size <= 4:
|
|
651
|
+
# Use integer representation for small fields
|
|
652
|
+
int_values: list[int] = []
|
|
653
|
+
for msg in messages:
|
|
654
|
+
if size == 1:
|
|
655
|
+
val_int = int(msg[offset])
|
|
656
|
+
elif size == 2:
|
|
657
|
+
val_int = int(msg[offset]) << 8 | int(msg[offset + 1])
|
|
658
|
+
elif size == 4:
|
|
659
|
+
val_int = (
|
|
660
|
+
int(msg[offset]) << 24
|
|
661
|
+
| int(msg[offset + 1]) << 16
|
|
662
|
+
| int(msg[offset + 2]) << 8
|
|
663
|
+
| int(msg[offset + 3])
|
|
664
|
+
)
|
|
665
|
+
else: # size == 3
|
|
666
|
+
val_int = (
|
|
667
|
+
int(msg[offset]) << 16 | int(msg[offset + 1]) << 8 | int(msg[offset + 2])
|
|
668
|
+
)
|
|
669
|
+
int_values.append(val_int)
|
|
670
|
+
values = list(int_values) # type: ignore[assignment]
|
|
671
|
+
else:
|
|
672
|
+
# For larger fields, use bytes
|
|
673
|
+
tuple_values: list[tuple[int, ...]] = []
|
|
674
|
+
for msg in messages:
|
|
675
|
+
val_tuple = tuple(int(b) for b in msg[offset : offset + size])
|
|
676
|
+
tuple_values.append(val_tuple)
|
|
677
|
+
values = list(tuple_values) # type: ignore[assignment]
|
|
678
|
+
|
|
679
|
+
# Calculate statistics
|
|
680
|
+
if size > 4:
|
|
681
|
+
# Bytes field - calculate entropy across all bytes
|
|
682
|
+
all_bytes_list: list[int] = []
|
|
683
|
+
for v in values:
|
|
684
|
+
if isinstance(v, tuple):
|
|
685
|
+
all_bytes_list.extend(v)
|
|
686
|
+
all_bytes = np.array(all_bytes_list, dtype=np.uint8)
|
|
687
|
+
entropy = self._calculate_entropy(all_bytes)
|
|
688
|
+
variance = float(np.var(all_bytes))
|
|
689
|
+
else:
|
|
690
|
+
entropy = self._calculate_entropy(np.array(values, dtype=np.int64))
|
|
691
|
+
variance = float(np.var(values))
|
|
692
|
+
|
|
693
|
+
return {
|
|
694
|
+
"values": values,
|
|
695
|
+
"offset": offset,
|
|
696
|
+
"size": size,
|
|
697
|
+
"entropy": entropy,
|
|
698
|
+
"variance": variance,
|
|
699
|
+
}
|
|
700
|
+
|
|
701
|
+
def _detect_type_entropy(self, field_data: dict[str, Any]) -> tuple[str, float]:
|
|
702
|
+
"""Detect field type using entropy analysis.
|
|
703
|
+
|
|
704
|
+
: Entropy-based field type detection.
|
|
705
|
+
|
|
706
|
+
Args:
|
|
707
|
+
field_data: Field data dictionary
|
|
708
|
+
|
|
709
|
+
Returns:
|
|
710
|
+
Tuple of (field_type, confidence)
|
|
711
|
+
"""
|
|
712
|
+
entropy = field_data["entropy"]
|
|
713
|
+
values = field_data["values"]
|
|
714
|
+
|
|
715
|
+
# Check if all values are identical (constant)
|
|
716
|
+
if len(set(values)) == 1:
|
|
717
|
+
return ("constant", 1.0)
|
|
718
|
+
|
|
719
|
+
# Low entropy suggests constant or semi-constant
|
|
720
|
+
if entropy < 1.0:
|
|
721
|
+
return ("constant", 0.8)
|
|
722
|
+
# Very high entropy suggests random data
|
|
723
|
+
elif entropy > 7.0:
|
|
724
|
+
return ("data", 0.7)
|
|
725
|
+
# Medium entropy could be various types
|
|
726
|
+
else:
|
|
727
|
+
return ("unknown", 0.3)
|
|
728
|
+
|
|
729
|
+
def _detect_type_patterns(
|
|
730
|
+
self, field_data: dict[str, Any], offset: int, size: int, msg_len: int
|
|
731
|
+
) -> tuple[str, float]:
|
|
732
|
+
"""Detect field type using pattern matching.
|
|
733
|
+
|
|
734
|
+
: Pattern-based field type detection.
|
|
735
|
+
|
|
736
|
+
Detects:
|
|
737
|
+
- Counters (incrementing values)
|
|
738
|
+
- Lengths (correlates with message size)
|
|
739
|
+
- Checksums (high entropy, end of message)
|
|
740
|
+
- Constants (no variation)
|
|
741
|
+
- Timestamps (steady increase)
|
|
742
|
+
|
|
743
|
+
Args:
|
|
744
|
+
field_data: Field data dictionary
|
|
745
|
+
offset: Field offset
|
|
746
|
+
size: Field size
|
|
747
|
+
msg_len: Total message length
|
|
748
|
+
|
|
749
|
+
Returns:
|
|
750
|
+
Tuple of (field_type, confidence)
|
|
751
|
+
"""
|
|
752
|
+
values = field_data["values"]
|
|
753
|
+
|
|
754
|
+
# Check for counter (if integer values)
|
|
755
|
+
if not isinstance(values[0], tuple):
|
|
756
|
+
int_values = [v for v in values if isinstance(v, int)]
|
|
757
|
+
if self._detect_counter_field(int_values):
|
|
758
|
+
return ("counter", 0.9)
|
|
759
|
+
|
|
760
|
+
# Check for timestamp (similar to counter but larger values)
|
|
761
|
+
if len(int_values) >= 3:
|
|
762
|
+
diffs = [int_values[i + 1] - int_values[i] for i in range(len(int_values) - 1)]
|
|
763
|
+
positive_diffs = [d for d in diffs if d > 0]
|
|
764
|
+
if len(positive_diffs) >= len(diffs) * 0.7:
|
|
765
|
+
# Check if increments are relatively steady
|
|
766
|
+
if len(positive_diffs) > 0:
|
|
767
|
+
avg_diff = sum(positive_diffs) / len(positive_diffs)
|
|
768
|
+
if avg_diff > 100: # Large increments suggest timestamp
|
|
769
|
+
return ("timestamp", 0.7)
|
|
770
|
+
|
|
771
|
+
# Check for length field (small values, near start)
|
|
772
|
+
if offset < 8 and size <= 2 and not isinstance(values[0], tuple):
|
|
773
|
+
int_values = [v for v in values if isinstance(v, int)]
|
|
774
|
+
if int_values:
|
|
775
|
+
max_val = max(int_values)
|
|
776
|
+
if max_val < msg_len * 2:
|
|
777
|
+
return ("length", 0.6)
|
|
778
|
+
|
|
779
|
+
# Check for checksum (near end of message, but not the entire message)
|
|
780
|
+
if offset + size >= msg_len - 4 and offset > 0:
|
|
781
|
+
return ("checksum", 0.5)
|
|
782
|
+
|
|
783
|
+
return ("unknown", 0.3)
|
|
784
|
+
|
|
785
|
+
def _detect_type_statistics(self, field_data: dict[str, Any]) -> tuple[str, float]:
|
|
786
|
+
"""Detect field type using statistical properties.
|
|
787
|
+
|
|
788
|
+
: Statistics-based field type detection.
|
|
789
|
+
|
|
790
|
+
Args:
|
|
791
|
+
field_data: Field data dictionary
|
|
792
|
+
|
|
793
|
+
Returns:
|
|
794
|
+
Tuple of (field_type, confidence)
|
|
795
|
+
"""
|
|
796
|
+
variance = field_data["variance"]
|
|
797
|
+
entropy = field_data["entropy"]
|
|
798
|
+
values = field_data["values"]
|
|
799
|
+
|
|
800
|
+
# Check if all values identical (truly constant)
|
|
801
|
+
if len(set(values)) == 1:
|
|
802
|
+
return ("constant", 0.9)
|
|
803
|
+
# Very low variance suggests constant
|
|
804
|
+
elif variance < 10:
|
|
805
|
+
return ("constant", 0.7)
|
|
806
|
+
# High entropy and variance suggests data
|
|
807
|
+
elif entropy > 6.0 and variance > 1000:
|
|
808
|
+
return ("data", 0.6)
|
|
809
|
+
else:
|
|
810
|
+
return ("unknown", 0.4)
|
|
811
|
+
|
|
812
|
+
def _vote_field_type(
|
|
813
|
+
self, detections: list[tuple[str, float]]
|
|
814
|
+
) -> tuple[str, float, dict[str, bool]]:
|
|
815
|
+
"""Vote on field type from multiple detectors.
|
|
816
|
+
|
|
817
|
+
: Voting mechanism for field type.
|
|
818
|
+
|
|
819
|
+
Args:
|
|
820
|
+
detections: List of (field_type, confidence) tuples from detectors
|
|
821
|
+
|
|
822
|
+
Returns:
|
|
823
|
+
Tuple of (field_type, confidence, evidence_dict)
|
|
824
|
+
"""
|
|
825
|
+
# Weight votes by confidence
|
|
826
|
+
votes: dict[str, float] = {}
|
|
827
|
+
evidence: dict[str, bool] = {}
|
|
828
|
+
|
|
829
|
+
detector_names = ["entropy", "patterns", "statistics"]
|
|
830
|
+
|
|
831
|
+
for i, (field_type, confidence) in enumerate(detections):
|
|
832
|
+
detector_name = detector_names[i] if i < len(detector_names) else f"detector_{i}"
|
|
833
|
+
|
|
834
|
+
if field_type not in votes:
|
|
835
|
+
votes[field_type] = 0.0
|
|
836
|
+
|
|
837
|
+
votes[field_type] += confidence
|
|
838
|
+
|
|
839
|
+
# Record evidence
|
|
840
|
+
evidence[f"{detector_name}_voted_{field_type}"] = True
|
|
841
|
+
|
|
842
|
+
# Find type with highest vote
|
|
843
|
+
if not votes:
|
|
844
|
+
return ("unknown", 0.0, evidence)
|
|
845
|
+
|
|
846
|
+
best_type = max(votes.items(), key=lambda x: x[1])
|
|
847
|
+
field_type = best_type[0]
|
|
848
|
+
total_confidence = best_type[1]
|
|
849
|
+
|
|
850
|
+
# Calculate total possible votes
|
|
851
|
+
total_possible = sum(conf for _, conf in detections)
|
|
852
|
+
|
|
853
|
+
# Normalize confidence as fraction of total possible votes
|
|
854
|
+
if total_possible > 0:
|
|
855
|
+
normalized_confidence = total_confidence / total_possible
|
|
856
|
+
else:
|
|
857
|
+
normalized_confidence = 0.0
|
|
858
|
+
|
|
859
|
+
return (field_type, normalized_confidence, evidence)
|
|
860
|
+
|
|
861
|
+
def detect_field_types(
|
|
862
|
+
self, messages: list[NDArray[np.uint8]], boundaries: list[int]
|
|
863
|
+
) -> list[InferredField]:
|
|
864
|
+
"""Classify field types based on value patterns.
|
|
865
|
+
|
|
866
|
+
: Field type classification.
|
|
867
|
+
|
|
868
|
+
Args:
|
|
869
|
+
messages: List of message arrays
|
|
870
|
+
boundaries: Field boundary offsets
|
|
871
|
+
|
|
872
|
+
Returns:
|
|
873
|
+
List of InferredField objects
|
|
874
|
+
"""
|
|
875
|
+
fields = []
|
|
876
|
+
|
|
877
|
+
for i in range(len(boundaries)):
|
|
878
|
+
offset = boundaries[i]
|
|
879
|
+
|
|
880
|
+
# Determine field size
|
|
881
|
+
if i < len(boundaries) - 1:
|
|
882
|
+
size = boundaries[i + 1] - offset
|
|
883
|
+
else:
|
|
884
|
+
size = len(messages[0]) - offset
|
|
885
|
+
|
|
886
|
+
# Extract field values
|
|
887
|
+
values: list[int | tuple[int, ...]]
|
|
888
|
+
if size <= 4:
|
|
889
|
+
# Use integer representation for small fields
|
|
890
|
+
int_values: list[int] = []
|
|
891
|
+
for msg in messages:
|
|
892
|
+
if size == 1:
|
|
893
|
+
val_int = int(msg[offset])
|
|
894
|
+
elif size == 2:
|
|
895
|
+
val_int = int(msg[offset]) << 8 | int(msg[offset + 1])
|
|
896
|
+
elif size == 4:
|
|
897
|
+
val_int = (
|
|
898
|
+
int(msg[offset]) << 24
|
|
899
|
+
| int(msg[offset + 1]) << 16
|
|
900
|
+
| int(msg[offset + 2]) << 8
|
|
901
|
+
| int(msg[offset + 3])
|
|
902
|
+
)
|
|
903
|
+
else: # size == 3
|
|
904
|
+
val_int = (
|
|
905
|
+
int(msg[offset]) << 16
|
|
906
|
+
| int(msg[offset + 1]) << 8
|
|
907
|
+
| int(msg[offset + 2])
|
|
908
|
+
)
|
|
909
|
+
int_values.append(val_int)
|
|
910
|
+
values = list(int_values) # type: ignore[assignment]
|
|
911
|
+
else:
|
|
912
|
+
# For larger fields, use bytes
|
|
913
|
+
tuple_values: list[tuple[int, ...]] = []
|
|
914
|
+
for msg in messages:
|
|
915
|
+
val_tuple = tuple(int(b) for b in msg[offset : offset + size])
|
|
916
|
+
tuple_values.append(val_tuple)
|
|
917
|
+
values = list(tuple_values) # type: ignore[assignment]
|
|
918
|
+
|
|
919
|
+
# Calculate statistics
|
|
920
|
+
if size > 4:
|
|
921
|
+
# Bytes field - calculate entropy across all bytes
|
|
922
|
+
all_bytes_list: list[int] = []
|
|
923
|
+
for v in values:
|
|
924
|
+
if isinstance(v, tuple):
|
|
925
|
+
all_bytes_list.extend(v)
|
|
926
|
+
all_bytes = np.array(all_bytes_list, dtype=np.uint8)
|
|
927
|
+
entropy = self._calculate_entropy(all_bytes)
|
|
928
|
+
variance = float(np.var(all_bytes))
|
|
929
|
+
else:
|
|
930
|
+
entropy = self._calculate_entropy(np.array(values, dtype=np.int64))
|
|
931
|
+
variance = float(np.var(values))
|
|
932
|
+
|
|
933
|
+
# Classify field type
|
|
934
|
+
field_type, confidence = self._classify_field(values, offset, size, messages)
|
|
935
|
+
|
|
936
|
+
# Sample values (first 5)
|
|
937
|
+
sample_values = values[:5]
|
|
938
|
+
|
|
939
|
+
field_obj = InferredField(
|
|
940
|
+
name=f"field_{i}",
|
|
941
|
+
offset=offset,
|
|
942
|
+
size=size,
|
|
943
|
+
field_type=field_type, # type: ignore[arg-type]
|
|
944
|
+
entropy=float(entropy),
|
|
945
|
+
variance=float(variance),
|
|
946
|
+
confidence=confidence,
|
|
947
|
+
values_seen=sample_values,
|
|
948
|
+
)
|
|
949
|
+
|
|
950
|
+
fields.append(field_obj)
|
|
951
|
+
|
|
952
|
+
return fields
|
|
953
|
+
|
|
954
|
+
def find_dependencies(
|
|
955
|
+
self, messages: list[NDArray[np.uint8]], schema: MessageSchema
|
|
956
|
+
) -> dict[str, str]:
|
|
957
|
+
"""Find dependencies between fields (e.g., length->payload).
|
|
958
|
+
|
|
959
|
+
: Field dependency detection.
|
|
960
|
+
|
|
961
|
+
Args:
|
|
962
|
+
messages: List of message arrays
|
|
963
|
+
schema: Inferred message schema
|
|
964
|
+
|
|
965
|
+
Returns:
|
|
966
|
+
Dictionary mapping field names to dependency descriptions
|
|
967
|
+
"""
|
|
968
|
+
dependencies = {}
|
|
969
|
+
|
|
970
|
+
# Check for length field dependencies
|
|
971
|
+
for field in schema.fields:
|
|
972
|
+
if field.field_type == "length":
|
|
973
|
+
# Check if any field size correlates with this length value
|
|
974
|
+
for msg in messages:
|
|
975
|
+
_length_val = self._extract_field_value(msg, field)
|
|
976
|
+
# Look for fields that might be variable length
|
|
977
|
+
# This is a simplified check
|
|
978
|
+
dependencies[field.name] = "Potential length indicator"
|
|
979
|
+
|
|
980
|
+
return dependencies
|
|
981
|
+
|
|
982
|
+
def _calculate_byte_entropy(self, messages: list[NDArray[np.uint8]], offset: int) -> float:
|
|
983
|
+
"""Calculate entropy at byte offset across messages.
|
|
984
|
+
|
|
985
|
+
: Entropy calculation for boundary detection.
|
|
986
|
+
|
|
987
|
+
Args:
|
|
988
|
+
messages: List of message arrays
|
|
989
|
+
offset: Byte offset to analyze
|
|
990
|
+
|
|
991
|
+
Returns:
|
|
992
|
+
Shannon entropy in bits
|
|
993
|
+
"""
|
|
994
|
+
values = [msg[offset] for msg in messages]
|
|
995
|
+
return float(self._calculate_entropy(np.array(values)))
|
|
996
|
+
|
|
997
|
+
    def _calculate_entropy(self, values: NDArray[np.int_ | np.uint8]) -> float:
        """Calculate Shannon entropy of values.

        Args:
            values: Array of values

        Returns:
            Entropy in bits
        """
        if len(values) == 0:
            return 0.0

        # Count frequencies
        _unique, counts = np.unique(values, return_counts=True)
        probabilities = counts / len(values)

        # Calculate Shannon entropy; the small epsilon guards against log2(0)
        entropy = -np.sum(probabilities * np.log2(probabilities + 1e-10))
        return float(entropy)

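This entropy helper is the statistic that drives both boundary detection and field classification. A minimal standalone sketch (illustrative, not part of the package) of the same formula shows the two extremes it is meant to separate:

```python
# Standalone sketch, not part of oscura: Shannon entropy as computed above.
# A constant byte column scores ~0 bits; bytes drawn uniformly from 0..255
# approach 8 bits.
import numpy as np

def shannon_entropy(values: np.ndarray) -> float:
    _unique, counts = np.unique(values, return_counts=True)
    p = counts / len(values)
    return float(-np.sum(p * np.log2(p + 1e-10)))

constant = np.zeros(1000, dtype=np.uint8)
random_bytes = np.random.default_rng(0).integers(0, 256, 1000, dtype=np.uint8)
print(shannon_entropy(constant))      # ~0.0 bits -> looks like a constant field
print(shannon_entropy(random_bytes))  # ~7.8 bits -> looks like payload data
```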
    def _classify_field(
        self,
        values: list[int | tuple[int, ...]],
        offset: int,
        size: int,
        messages: list[NDArray[np.uint8]],
    ) -> tuple[str, float]:
        """Classify field type based on patterns.

        Field type classification logic.

        Args:
            values: Field values across all messages
            offset: Field offset
            size: Field size
            messages: Original messages

        Returns:
            Tuple of (field_type, confidence)
        """
        # Handle byte fields (larger than 4 bytes)
        if isinstance(values[0], tuple):
            # Check if all tuples are identical (truly constant)
            if len(set(values)) == 1:
                return ("constant", 1.0)

            entropy = self._calculate_entropy(np.concatenate([np.array(v) for v in values]))
            if entropy < 1.0:
                return ("constant", 0.9)
            elif entropy > 7.0:
                return ("data", 0.6)
            else:
                return ("data", 0.5)

        # Check for constant field
        if len(set(values)) == 1:
            return ("constant", 1.0)

        # Check for counter field
        if not isinstance(values[0], tuple) and self._detect_counter_field(  # type: ignore[misc, unreachable]
            [v for v in values if isinstance(v, int)]
        ):
            return ("counter", 0.9)

        # Check for checksum (if near end of message)
        msg_len = len(messages[0])
        if offset + size >= msg_len - 4:  # Within last 4 bytes
            if self._detect_checksum_field(messages, offset, size):
                return ("checksum", 0.8)

        # Check for length field (small values, near start)
        if offset < 8 and size <= 2:
            if not isinstance(values[0], tuple):  # type: ignore[unreachable]
                max_val = max(v for v in values if isinstance(v, int))
                if max_val < msg_len * 2:  # Reasonable length value
                    return ("length", 0.6)

        # Fall back to variance/entropy thresholds
        variance = np.var(values)
        entropy = self._calculate_entropy(np.array(values))

        if variance < 10:
            return ("constant", 0.6)
        elif entropy > 6.0:
            return ("data", 0.7)
        else:
            return ("unknown", 0.5)

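The threshold ladder above (constant below 1.0 bits of entropy, payload above 7.0, variance under 10, and so on) is easiest to see on synthetic columns. A standalone sketch, illustrative only, reusing the same entropy formula:

```python
# Standalone sketch, not part of oscura: where three synthetic field
# columns land in the threshold ladder used by _classify_field.
import numpy as np

def entropy(v: np.ndarray) -> float:
    _u, c = np.unique(v, return_counts=True)
    p = c / len(v)
    return float(-np.sum(p * np.log2(p + 1e-10)))

rng = np.random.default_rng(0)
columns = {
    "constant": np.full(200, 0x55),        # one distinct value -> "constant"
    "counter": np.arange(200) % 256,       # successive diffs of 1 -> "counter"
    "payload": rng.integers(0, 256, 200),  # entropy > 6 bits -> "data"
}
for name, col in columns.items():
    print(name, round(float(np.var(col)), 1), round(entropy(col), 2))
```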
    def _detect_counter_field(self, values: list[int]) -> bool:
        """Check if values form a counter sequence.

        Counter field detection.

        Args:
            values: List of integer values

        Returns:
            True if values appear to be a counter
        """
        if len(values) < 3:
            return False

        # Check for monotonic increase
        diffs = [values[i + 1] - values[i] for i in range(len(values) - 1)]

        # Allow wrap-around: drop the negative diffs
        diffs_filtered = [d for d in diffs if d >= 0]

        # Require that most diffs survive filtering...
        if len(diffs_filtered) < len(diffs) * 0.7:
            return False

        # ...and that most of the survivors are increments of 1
        ones = sum(1 for d in diffs_filtered if d == 1)
        return ones >= len(diffs_filtered) * 0.7

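The wrap-around tolerance matters for small counters. A standalone sketch (not part of the package) tracing the heuristic on an 8-bit counter that rolls over:

```python
# Standalone sketch, not part of oscura: the counter heuristic drops
# negative diffs (wrap-around), then requires that >= 70% of diffs survive
# and that >= 70% of the survivors equal 1.
values = [252, 253, 254, 255, 0, 1, 2, 3]  # 8-bit counter that wraps once
diffs = [b - a for a, b in zip(values, values[1:])]  # one diff is -255
survivors = [d for d in diffs if d >= 0]
ones = sum(1 for d in survivors if d == 1)
print(len(survivors) >= len(diffs) * 0.7)  # True
print(ones >= len(survivors) * 0.7)        # True -> classified as counter
```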
    def _detect_checksum_field(
        self, messages: list[NDArray[np.uint8]], field_offset: int, field_size: int
    ) -> bool:
        """Check if field is likely a checksum.

        Checksum field detection.

        Args:
            messages: List of message arrays
            field_offset: Offset of potential checksum field
            field_size: Size of potential checksum field

        Returns:
            True if field appears to be a checksum
        """
        if field_size not in [1, 2, 4]:
            return False

        # Try simple XOR checksum
        for msg in messages[: min(5, len(messages))]:
            # Calculate XOR of all bytes before the checksum
            xor_sum = 0
            for i in range(field_offset):
                xor_sum ^= int(msg[i])

            # Extract checksum value (big-endian)
            if field_size == 1:
                checksum = int(msg[field_offset])
            elif field_size == 2:
                checksum = int(msg[field_offset]) << 8 | int(msg[field_offset + 1])
            else:
                checksum = (
                    int(msg[field_offset]) << 24
                    | int(msg[field_offset + 1]) << 16
                    | int(msg[field_offset + 2]) << 8
                    | int(msg[field_offset + 3])
                )

            # Only single-byte XOR checksums are verified; any mismatch
            # (which includes every multi-byte field) fails immediately
            if field_size == 1 and (xor_sum & 0xFF) == checksum:
                continue
            else:
                return False  # Not a match

        return True  # All messages matched

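In practice this means only XOR-8 trailers are confirmed. A standalone sketch (not part of the package) of a frame that passes the check:

```python
# Standalone sketch, not part of oscura: builds a frame whose last byte is
# the XOR of all preceding bytes -- the pattern the XOR-8 check above
# accepts for single-byte trailer fields.
body = bytes([0x02, 0x10, 0x2A])
xor_sum = 0
for b in body:
    xor_sum ^= b
frame = body + bytes([xor_sum])
print(frame.hex(), hex(xor_sum))  # 02102a38 0x38
```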
    def _estimate_header_size(self, fields: list[InferredField]) -> int:
        """Estimate header size from field patterns.

        Args:
            fields: List of inferred fields

        Returns:
            Estimated header size in bytes
        """
        # Look for transition from low-entropy to high-entropy
        for i, field in enumerate(fields):
            if field.field_type == "data" and field.entropy > 6.0:
                if i > 0:
                    return field.offset

        # Default: first 4 fields or 16 bytes
        if len(fields) >= 5:
            # Header includes first 4 fields, so return offset of 5th field
            return fields[4].offset
        elif len(fields) >= 4:
            # If exactly 4 fields, header is up to end of 4th field
            return fields[3].offset + fields[3].size
        elif fields:
            # Fewer than 4 fields - use offset of last field
            return min(16, fields[-1].offset)
        else:
            return 16

    def _extract_field_value(self, msg: NDArray[np.uint8], field: InferredField) -> int:
        """Extract field value from message.

        Args:
            msg: Message array
            field: Field definition

        Returns:
            Field value as integer
        """
        if field.size == 1:
            return int(msg[field.offset])
        elif field.size == 2:
            return int(msg[field.offset]) << 8 | int(msg[field.offset + 1])
        elif field.size == 4:
            return (
                int(msg[field.offset]) << 24
                | int(msg[field.offset + 1]) << 16
                | int(msg[field.offset + 2]) << 8
                | int(msg[field.offset + 3])
            )
        else:
            # Return first byte for larger fields
            return int(msg[field.offset])

def infer_format(messages: list[bytes | NDArray[np.uint8]], min_samples: int = 10) -> MessageSchema:
    """Convenience function for format inference.

    Top-level API for message format inference.

    Args:
        messages: List of message samples (bytes or np.ndarray)
        min_samples: Minimum required samples

    Returns:
        MessageSchema with inferred structure
    """
    inferrer = MessageFormatInferrer(min_samples=min_samples)
    return inferrer.infer_format(messages)

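A usage sketch for the convenience function; the frame layout below is invented for illustration, and the inferred names and types depend on the data:

```python
# Usage sketch (illustrative): a constant sync byte, a counter byte, and a
# fixed tail. The inferred field names/types are heuristic.
frames = [bytes([0xAA, i, 0x00, 0x10, 0x00, 0x00]) for i in range(16)]
schema = infer_format(frames, min_samples=10)
for field in schema.fields:
    print(field.name, field.offset, field.size, field.field_type)
```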
def detect_field_types(
    messages: list[bytes | NDArray[np.uint8]] | bytes | NDArray[np.uint8],
    boundaries: list[int] | None = None,
) -> list[InferredField]:
    """Detect field types at boundaries.

    Field type detection.

    Args:
        messages: List of message samples OR a single message
        boundaries: Field boundary offsets (auto-detected if not provided)

    Returns:
        List of InferredField objects

    Raises:
        ValueError: If message type is invalid.
    """
    inferrer = MessageFormatInferrer()

    # Handle single message case - convert to list
    if isinstance(messages, (bytes, np.ndarray)):
        messages_list: list[bytes | NDArray[np.uint8]] = [messages]
    else:
        messages_list = messages

    # Convert to arrays
    msg_arrays = []
    for msg in messages_list:
        if isinstance(msg, bytes):
            msg_arrays.append(np.frombuffer(msg, dtype=np.uint8))
        elif isinstance(msg, np.ndarray):
            msg_arrays.append(msg.astype(np.uint8))
        else:
            raise ValueError(f"Invalid message type: {type(msg)}")

    # Auto-detect boundaries if not provided
    if boundaries is None:
        boundaries = inferrer.detect_field_boundaries(msg_arrays, method="combined")

    return inferrer.detect_field_types(msg_arrays, boundaries)

def find_dependencies(
    messages: list[bytes | NDArray[np.uint8]], schema: MessageSchema | None = None
) -> dict[str, str]:
    """Find field dependencies.

    Field dependency analysis.

    Args:
        messages: List of message samples
        schema: Message schema (auto-inferred if not provided)

    Returns:
        Dictionary of dependencies

    Raises:
        ValueError: If message type is invalid.
    """
    inferrer = MessageFormatInferrer()

    # Convert to arrays
    msg_arrays = []
    for msg in messages:
        if isinstance(msg, bytes):
            msg_arrays.append(np.frombuffer(msg, dtype=np.uint8))
        elif isinstance(msg, np.ndarray):
            msg_arrays.append(msg.astype(np.uint8))
        else:
            raise ValueError(f"Invalid message type: {type(msg)}")

    # Auto-infer schema if not provided
    if schema is None:
        # Cast to expected type (msg_arrays contains only NDArray after conversion)
        schema = inferrer.infer_format(msg_arrays)  # type: ignore[arg-type]

    return inferrer.find_dependencies(msg_arrays, schema)  # type: ignore[arg-type]
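The module-level helpers auto-derive boundaries and schema when they are omitted. A closing usage sketch (again an invented frame layout; the output is heuristic):

```python
# Usage sketch (illustrative): sync byte, length-like byte, counter, tail.
frames = [bytes([0x7E, 4, i & 0xFF, 0x00, 0x10, 0x20]) for i in range(20)]
fields = detect_field_types(frames)  # boundaries auto-detected ("combined")
deps = find_dependencies(frames)     # schema inferred first, then analyzed
print([f.field_type for f in fields])
print(deps)
```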