oscura 0.0.1__py3-none-any.whl → 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- oscura/__init__.py +813 -8
- oscura/__main__.py +392 -0
- oscura/analyzers/__init__.py +37 -0
- oscura/analyzers/digital/__init__.py +177 -0
- oscura/analyzers/digital/bus.py +691 -0
- oscura/analyzers/digital/clock.py +805 -0
- oscura/analyzers/digital/correlation.py +720 -0
- oscura/analyzers/digital/edges.py +632 -0
- oscura/analyzers/digital/extraction.py +413 -0
- oscura/analyzers/digital/quality.py +878 -0
- oscura/analyzers/digital/signal_quality.py +877 -0
- oscura/analyzers/digital/thresholds.py +708 -0
- oscura/analyzers/digital/timing.py +1104 -0
- oscura/analyzers/eye/__init__.py +46 -0
- oscura/analyzers/eye/diagram.py +434 -0
- oscura/analyzers/eye/metrics.py +555 -0
- oscura/analyzers/jitter/__init__.py +83 -0
- oscura/analyzers/jitter/ber.py +333 -0
- oscura/analyzers/jitter/decomposition.py +759 -0
- oscura/analyzers/jitter/measurements.py +413 -0
- oscura/analyzers/jitter/spectrum.py +220 -0
- oscura/analyzers/measurements.py +40 -0
- oscura/analyzers/packet/__init__.py +171 -0
- oscura/analyzers/packet/daq.py +1077 -0
- oscura/analyzers/packet/metrics.py +437 -0
- oscura/analyzers/packet/parser.py +327 -0
- oscura/analyzers/packet/payload.py +2156 -0
- oscura/analyzers/packet/payload_analysis.py +1312 -0
- oscura/analyzers/packet/payload_extraction.py +236 -0
- oscura/analyzers/packet/payload_patterns.py +670 -0
- oscura/analyzers/packet/stream.py +359 -0
- oscura/analyzers/patterns/__init__.py +266 -0
- oscura/analyzers/patterns/clustering.py +1036 -0
- oscura/analyzers/patterns/discovery.py +539 -0
- oscura/analyzers/patterns/learning.py +797 -0
- oscura/analyzers/patterns/matching.py +1091 -0
- oscura/analyzers/patterns/periodic.py +650 -0
- oscura/analyzers/patterns/sequences.py +767 -0
- oscura/analyzers/power/__init__.py +116 -0
- oscura/analyzers/power/ac_power.py +391 -0
- oscura/analyzers/power/basic.py +383 -0
- oscura/analyzers/power/conduction.py +314 -0
- oscura/analyzers/power/efficiency.py +297 -0
- oscura/analyzers/power/ripple.py +356 -0
- oscura/analyzers/power/soa.py +372 -0
- oscura/analyzers/power/switching.py +479 -0
- oscura/analyzers/protocol/__init__.py +150 -0
- oscura/analyzers/protocols/__init__.py +150 -0
- oscura/analyzers/protocols/base.py +500 -0
- oscura/analyzers/protocols/can.py +620 -0
- oscura/analyzers/protocols/can_fd.py +448 -0
- oscura/analyzers/protocols/flexray.py +405 -0
- oscura/analyzers/protocols/hdlc.py +399 -0
- oscura/analyzers/protocols/i2c.py +368 -0
- oscura/analyzers/protocols/i2s.py +296 -0
- oscura/analyzers/protocols/jtag.py +393 -0
- oscura/analyzers/protocols/lin.py +445 -0
- oscura/analyzers/protocols/manchester.py +333 -0
- oscura/analyzers/protocols/onewire.py +501 -0
- oscura/analyzers/protocols/spi.py +334 -0
- oscura/analyzers/protocols/swd.py +325 -0
- oscura/analyzers/protocols/uart.py +393 -0
- oscura/analyzers/protocols/usb.py +495 -0
- oscura/analyzers/signal_integrity/__init__.py +63 -0
- oscura/analyzers/signal_integrity/embedding.py +294 -0
- oscura/analyzers/signal_integrity/equalization.py +370 -0
- oscura/analyzers/signal_integrity/sparams.py +484 -0
- oscura/analyzers/spectral/__init__.py +53 -0
- oscura/analyzers/spectral/chunked.py +273 -0
- oscura/analyzers/spectral/chunked_fft.py +571 -0
- oscura/analyzers/spectral/chunked_wavelet.py +391 -0
- oscura/analyzers/spectral/fft.py +92 -0
- oscura/analyzers/statistical/__init__.py +250 -0
- oscura/analyzers/statistical/checksum.py +923 -0
- oscura/analyzers/statistical/chunked_corr.py +228 -0
- oscura/analyzers/statistical/classification.py +778 -0
- oscura/analyzers/statistical/entropy.py +1113 -0
- oscura/analyzers/statistical/ngrams.py +614 -0
- oscura/analyzers/statistics/__init__.py +119 -0
- oscura/analyzers/statistics/advanced.py +885 -0
- oscura/analyzers/statistics/basic.py +263 -0
- oscura/analyzers/statistics/correlation.py +630 -0
- oscura/analyzers/statistics/distribution.py +298 -0
- oscura/analyzers/statistics/outliers.py +463 -0
- oscura/analyzers/statistics/streaming.py +93 -0
- oscura/analyzers/statistics/trend.py +520 -0
- oscura/analyzers/validation.py +598 -0
- oscura/analyzers/waveform/__init__.py +36 -0
- oscura/analyzers/waveform/measurements.py +943 -0
- oscura/analyzers/waveform/measurements_with_uncertainty.py +371 -0
- oscura/analyzers/waveform/spectral.py +1689 -0
- oscura/analyzers/waveform/wavelets.py +298 -0
- oscura/api/__init__.py +62 -0
- oscura/api/dsl.py +538 -0
- oscura/api/fluent.py +571 -0
- oscura/api/operators.py +498 -0
- oscura/api/optimization.py +392 -0
- oscura/api/profiling.py +396 -0
- oscura/automotive/__init__.py +73 -0
- oscura/automotive/can/__init__.py +52 -0
- oscura/automotive/can/analysis.py +356 -0
- oscura/automotive/can/checksum.py +250 -0
- oscura/automotive/can/correlation.py +212 -0
- oscura/automotive/can/discovery.py +355 -0
- oscura/automotive/can/message_wrapper.py +375 -0
- oscura/automotive/can/models.py +385 -0
- oscura/automotive/can/patterns.py +381 -0
- oscura/automotive/can/session.py +452 -0
- oscura/automotive/can/state_machine.py +300 -0
- oscura/automotive/can/stimulus_response.py +461 -0
- oscura/automotive/dbc/__init__.py +15 -0
- oscura/automotive/dbc/generator.py +156 -0
- oscura/automotive/dbc/parser.py +146 -0
- oscura/automotive/dtc/__init__.py +30 -0
- oscura/automotive/dtc/database.py +3036 -0
- oscura/automotive/j1939/__init__.py +14 -0
- oscura/automotive/j1939/decoder.py +745 -0
- oscura/automotive/loaders/__init__.py +35 -0
- oscura/automotive/loaders/asc.py +98 -0
- oscura/automotive/loaders/blf.py +77 -0
- oscura/automotive/loaders/csv_can.py +136 -0
- oscura/automotive/loaders/dispatcher.py +136 -0
- oscura/automotive/loaders/mdf.py +331 -0
- oscura/automotive/loaders/pcap.py +132 -0
- oscura/automotive/obd/__init__.py +14 -0
- oscura/automotive/obd/decoder.py +707 -0
- oscura/automotive/uds/__init__.py +48 -0
- oscura/automotive/uds/decoder.py +265 -0
- oscura/automotive/uds/models.py +64 -0
- oscura/automotive/visualization.py +369 -0
- oscura/batch/__init__.py +55 -0
- oscura/batch/advanced.py +627 -0
- oscura/batch/aggregate.py +300 -0
- oscura/batch/analyze.py +139 -0
- oscura/batch/logging.py +487 -0
- oscura/batch/metrics.py +556 -0
- oscura/builders/__init__.py +41 -0
- oscura/builders/signal_builder.py +1131 -0
- oscura/cli/__init__.py +14 -0
- oscura/cli/batch.py +339 -0
- oscura/cli/characterize.py +273 -0
- oscura/cli/compare.py +775 -0
- oscura/cli/decode.py +551 -0
- oscura/cli/main.py +247 -0
- oscura/cli/shell.py +350 -0
- oscura/comparison/__init__.py +66 -0
- oscura/comparison/compare.py +397 -0
- oscura/comparison/golden.py +487 -0
- oscura/comparison/limits.py +391 -0
- oscura/comparison/mask.py +434 -0
- oscura/comparison/trace_diff.py +30 -0
- oscura/comparison/visualization.py +481 -0
- oscura/compliance/__init__.py +70 -0
- oscura/compliance/advanced.py +756 -0
- oscura/compliance/masks.py +363 -0
- oscura/compliance/reporting.py +483 -0
- oscura/compliance/testing.py +298 -0
- oscura/component/__init__.py +38 -0
- oscura/component/impedance.py +365 -0
- oscura/component/reactive.py +598 -0
- oscura/component/transmission_line.py +312 -0
- oscura/config/__init__.py +191 -0
- oscura/config/defaults.py +254 -0
- oscura/config/loader.py +348 -0
- oscura/config/memory.py +271 -0
- oscura/config/migration.py +458 -0
- oscura/config/pipeline.py +1077 -0
- oscura/config/preferences.py +530 -0
- oscura/config/protocol.py +875 -0
- oscura/config/schema.py +713 -0
- oscura/config/settings.py +420 -0
- oscura/config/thresholds.py +599 -0
- oscura/convenience.py +457 -0
- oscura/core/__init__.py +299 -0
- oscura/core/audit.py +457 -0
- oscura/core/backend_selector.py +405 -0
- oscura/core/cache.py +590 -0
- oscura/core/cancellation.py +439 -0
- oscura/core/confidence.py +225 -0
- oscura/core/config.py +506 -0
- oscura/core/correlation.py +216 -0
- oscura/core/cross_domain.py +422 -0
- oscura/core/debug.py +301 -0
- oscura/core/edge_cases.py +541 -0
- oscura/core/exceptions.py +535 -0
- oscura/core/gpu_backend.py +523 -0
- oscura/core/lazy.py +832 -0
- oscura/core/log_query.py +540 -0
- oscura/core/logging.py +931 -0
- oscura/core/logging_advanced.py +952 -0
- oscura/core/memoize.py +171 -0
- oscura/core/memory_check.py +274 -0
- oscura/core/memory_guard.py +290 -0
- oscura/core/memory_limits.py +336 -0
- oscura/core/memory_monitor.py +453 -0
- oscura/core/memory_progress.py +465 -0
- oscura/core/memory_warnings.py +315 -0
- oscura/core/numba_backend.py +362 -0
- oscura/core/performance.py +352 -0
- oscura/core/progress.py +524 -0
- oscura/core/provenance.py +358 -0
- oscura/core/results.py +331 -0
- oscura/core/types.py +504 -0
- oscura/core/uncertainty.py +383 -0
- oscura/discovery/__init__.py +52 -0
- oscura/discovery/anomaly_detector.py +672 -0
- oscura/discovery/auto_decoder.py +415 -0
- oscura/discovery/comparison.py +497 -0
- oscura/discovery/quality_validator.py +528 -0
- oscura/discovery/signal_detector.py +769 -0
- oscura/dsl/__init__.py +73 -0
- oscura/dsl/commands.py +246 -0
- oscura/dsl/interpreter.py +455 -0
- oscura/dsl/parser.py +689 -0
- oscura/dsl/repl.py +172 -0
- oscura/exceptions.py +59 -0
- oscura/exploratory/__init__.py +111 -0
- oscura/exploratory/error_recovery.py +642 -0
- oscura/exploratory/fuzzy.py +513 -0
- oscura/exploratory/fuzzy_advanced.py +786 -0
- oscura/exploratory/legacy.py +831 -0
- oscura/exploratory/parse.py +358 -0
- oscura/exploratory/recovery.py +275 -0
- oscura/exploratory/sync.py +382 -0
- oscura/exploratory/unknown.py +707 -0
- oscura/export/__init__.py +25 -0
- oscura/export/wireshark/README.md +265 -0
- oscura/export/wireshark/__init__.py +47 -0
- oscura/export/wireshark/generator.py +312 -0
- oscura/export/wireshark/lua_builder.py +159 -0
- oscura/export/wireshark/templates/dissector.lua.j2 +92 -0
- oscura/export/wireshark/type_mapping.py +165 -0
- oscura/export/wireshark/validator.py +105 -0
- oscura/exporters/__init__.py +94 -0
- oscura/exporters/csv.py +303 -0
- oscura/exporters/exporters.py +44 -0
- oscura/exporters/hdf5.py +219 -0
- oscura/exporters/html_export.py +701 -0
- oscura/exporters/json_export.py +291 -0
- oscura/exporters/markdown_export.py +367 -0
- oscura/exporters/matlab_export.py +354 -0
- oscura/exporters/npz_export.py +219 -0
- oscura/exporters/spice_export.py +210 -0
- oscura/extensibility/__init__.py +131 -0
- oscura/extensibility/docs.py +752 -0
- oscura/extensibility/extensions.py +1125 -0
- oscura/extensibility/logging.py +259 -0
- oscura/extensibility/measurements.py +485 -0
- oscura/extensibility/plugins.py +414 -0
- oscura/extensibility/registry.py +346 -0
- oscura/extensibility/templates.py +913 -0
- oscura/extensibility/validation.py +651 -0
- oscura/filtering/__init__.py +89 -0
- oscura/filtering/base.py +563 -0
- oscura/filtering/convenience.py +564 -0
- oscura/filtering/design.py +725 -0
- oscura/filtering/filters.py +32 -0
- oscura/filtering/introspection.py +605 -0
- oscura/guidance/__init__.py +24 -0
- oscura/guidance/recommender.py +429 -0
- oscura/guidance/wizard.py +518 -0
- oscura/inference/__init__.py +251 -0
- oscura/inference/active_learning/README.md +153 -0
- oscura/inference/active_learning/__init__.py +38 -0
- oscura/inference/active_learning/lstar.py +257 -0
- oscura/inference/active_learning/observation_table.py +230 -0
- oscura/inference/active_learning/oracle.py +78 -0
- oscura/inference/active_learning/teachers/__init__.py +15 -0
- oscura/inference/active_learning/teachers/simulator.py +192 -0
- oscura/inference/adaptive_tuning.py +453 -0
- oscura/inference/alignment.py +653 -0
- oscura/inference/bayesian.py +943 -0
- oscura/inference/binary.py +1016 -0
- oscura/inference/crc_reverse.py +711 -0
- oscura/inference/logic.py +288 -0
- oscura/inference/message_format.py +1305 -0
- oscura/inference/protocol.py +417 -0
- oscura/inference/protocol_dsl.py +1084 -0
- oscura/inference/protocol_library.py +1230 -0
- oscura/inference/sequences.py +809 -0
- oscura/inference/signal_intelligence.py +1509 -0
- oscura/inference/spectral.py +215 -0
- oscura/inference/state_machine.py +634 -0
- oscura/inference/stream.py +918 -0
- oscura/integrations/__init__.py +59 -0
- oscura/integrations/llm.py +1827 -0
- oscura/jupyter/__init__.py +32 -0
- oscura/jupyter/display.py +268 -0
- oscura/jupyter/magic.py +334 -0
- oscura/loaders/__init__.py +526 -0
- oscura/loaders/binary.py +69 -0
- oscura/loaders/configurable.py +1255 -0
- oscura/loaders/csv.py +26 -0
- oscura/loaders/csv_loader.py +473 -0
- oscura/loaders/hdf5.py +9 -0
- oscura/loaders/hdf5_loader.py +510 -0
- oscura/loaders/lazy.py +370 -0
- oscura/loaders/mmap_loader.py +583 -0
- oscura/loaders/numpy_loader.py +436 -0
- oscura/loaders/pcap.py +432 -0
- oscura/loaders/preprocessing.py +368 -0
- oscura/loaders/rigol.py +287 -0
- oscura/loaders/sigrok.py +321 -0
- oscura/loaders/tdms.py +367 -0
- oscura/loaders/tektronix.py +711 -0
- oscura/loaders/validation.py +584 -0
- oscura/loaders/vcd.py +464 -0
- oscura/loaders/wav.py +233 -0
- oscura/math/__init__.py +45 -0
- oscura/math/arithmetic.py +824 -0
- oscura/math/interpolation.py +413 -0
- oscura/onboarding/__init__.py +39 -0
- oscura/onboarding/help.py +498 -0
- oscura/onboarding/tutorials.py +405 -0
- oscura/onboarding/wizard.py +466 -0
- oscura/optimization/__init__.py +19 -0
- oscura/optimization/parallel.py +440 -0
- oscura/optimization/search.py +532 -0
- oscura/pipeline/__init__.py +43 -0
- oscura/pipeline/base.py +338 -0
- oscura/pipeline/composition.py +242 -0
- oscura/pipeline/parallel.py +448 -0
- oscura/pipeline/pipeline.py +375 -0
- oscura/pipeline/reverse_engineering.py +1119 -0
- oscura/plugins/__init__.py +122 -0
- oscura/plugins/base.py +272 -0
- oscura/plugins/cli.py +497 -0
- oscura/plugins/discovery.py +411 -0
- oscura/plugins/isolation.py +418 -0
- oscura/plugins/lifecycle.py +959 -0
- oscura/plugins/manager.py +493 -0
- oscura/plugins/registry.py +421 -0
- oscura/plugins/versioning.py +372 -0
- oscura/py.typed +0 -0
- oscura/quality/__init__.py +65 -0
- oscura/quality/ensemble.py +740 -0
- oscura/quality/explainer.py +338 -0
- oscura/quality/scoring.py +616 -0
- oscura/quality/warnings.py +456 -0
- oscura/reporting/__init__.py +248 -0
- oscura/reporting/advanced.py +1234 -0
- oscura/reporting/analyze.py +448 -0
- oscura/reporting/argument_preparer.py +596 -0
- oscura/reporting/auto_report.py +507 -0
- oscura/reporting/batch.py +615 -0
- oscura/reporting/chart_selection.py +223 -0
- oscura/reporting/comparison.py +330 -0
- oscura/reporting/config.py +615 -0
- oscura/reporting/content/__init__.py +39 -0
- oscura/reporting/content/executive.py +127 -0
- oscura/reporting/content/filtering.py +191 -0
- oscura/reporting/content/minimal.py +257 -0
- oscura/reporting/content/verbosity.py +162 -0
- oscura/reporting/core.py +508 -0
- oscura/reporting/core_formats/__init__.py +17 -0
- oscura/reporting/core_formats/multi_format.py +210 -0
- oscura/reporting/engine.py +836 -0
- oscura/reporting/export.py +366 -0
- oscura/reporting/formatting/__init__.py +129 -0
- oscura/reporting/formatting/emphasis.py +81 -0
- oscura/reporting/formatting/numbers.py +403 -0
- oscura/reporting/formatting/standards.py +55 -0
- oscura/reporting/formatting.py +466 -0
- oscura/reporting/html.py +578 -0
- oscura/reporting/index.py +590 -0
- oscura/reporting/multichannel.py +296 -0
- oscura/reporting/output.py +379 -0
- oscura/reporting/pdf.py +373 -0
- oscura/reporting/plots.py +731 -0
- oscura/reporting/pptx_export.py +360 -0
- oscura/reporting/renderers/__init__.py +11 -0
- oscura/reporting/renderers/pdf.py +94 -0
- oscura/reporting/sections.py +471 -0
- oscura/reporting/standards.py +680 -0
- oscura/reporting/summary_generator.py +368 -0
- oscura/reporting/tables.py +397 -0
- oscura/reporting/template_system.py +724 -0
- oscura/reporting/templates/__init__.py +15 -0
- oscura/reporting/templates/definition.py +205 -0
- oscura/reporting/templates/index.html +649 -0
- oscura/reporting/templates/index.md +173 -0
- oscura/schemas/__init__.py +158 -0
- oscura/schemas/bus_configuration.json +322 -0
- oscura/schemas/device_mapping.json +182 -0
- oscura/schemas/packet_format.json +418 -0
- oscura/schemas/protocol_definition.json +363 -0
- oscura/search/__init__.py +16 -0
- oscura/search/anomaly.py +292 -0
- oscura/search/context.py +149 -0
- oscura/search/pattern.py +160 -0
- oscura/session/__init__.py +34 -0
- oscura/session/annotations.py +289 -0
- oscura/session/history.py +313 -0
- oscura/session/session.py +445 -0
- oscura/streaming/__init__.py +43 -0
- oscura/streaming/chunked.py +611 -0
- oscura/streaming/progressive.py +393 -0
- oscura/streaming/realtime.py +622 -0
- oscura/testing/__init__.py +54 -0
- oscura/testing/synthetic.py +808 -0
- oscura/triggering/__init__.py +68 -0
- oscura/triggering/base.py +229 -0
- oscura/triggering/edge.py +353 -0
- oscura/triggering/pattern.py +344 -0
- oscura/triggering/pulse.py +581 -0
- oscura/triggering/window.py +453 -0
- oscura/ui/__init__.py +48 -0
- oscura/ui/formatters.py +526 -0
- oscura/ui/progressive_display.py +340 -0
- oscura/utils/__init__.py +99 -0
- oscura/utils/autodetect.py +338 -0
- oscura/utils/buffer.py +389 -0
- oscura/utils/lazy.py +407 -0
- oscura/utils/lazy_imports.py +147 -0
- oscura/utils/memory.py +836 -0
- oscura/utils/memory_advanced.py +1326 -0
- oscura/utils/memory_extensions.py +465 -0
- oscura/utils/progressive.py +352 -0
- oscura/utils/windowing.py +362 -0
- oscura/visualization/__init__.py +321 -0
- oscura/visualization/accessibility.py +526 -0
- oscura/visualization/annotations.py +374 -0
- oscura/visualization/axis_scaling.py +305 -0
- oscura/visualization/colors.py +453 -0
- oscura/visualization/digital.py +337 -0
- oscura/visualization/eye.py +420 -0
- oscura/visualization/histogram.py +281 -0
- oscura/visualization/interactive.py +858 -0
- oscura/visualization/jitter.py +702 -0
- oscura/visualization/keyboard.py +394 -0
- oscura/visualization/layout.py +365 -0
- oscura/visualization/optimization.py +1028 -0
- oscura/visualization/palettes.py +446 -0
- oscura/visualization/plot.py +92 -0
- oscura/visualization/power.py +290 -0
- oscura/visualization/power_extended.py +626 -0
- oscura/visualization/presets.py +467 -0
- oscura/visualization/protocols.py +932 -0
- oscura/visualization/render.py +207 -0
- oscura/visualization/rendering.py +444 -0
- oscura/visualization/reverse_engineering.py +791 -0
- oscura/visualization/signal_integrity.py +808 -0
- oscura/visualization/specialized.py +553 -0
- oscura/visualization/spectral.py +811 -0
- oscura/visualization/styles.py +381 -0
- oscura/visualization/thumbnails.py +311 -0
- oscura/visualization/time_axis.py +351 -0
- oscura/visualization/waveform.py +367 -0
- oscura/workflow/__init__.py +13 -0
- oscura/workflow/dag.py +377 -0
- oscura/workflows/__init__.py +58 -0
- oscura/workflows/compliance.py +280 -0
- oscura/workflows/digital.py +272 -0
- oscura/workflows/multi_trace.py +502 -0
- oscura/workflows/power.py +178 -0
- oscura/workflows/protocol.py +492 -0
- oscura/workflows/reverse_engineering.py +639 -0
- oscura/workflows/signal_integrity.py +227 -0
- oscura-0.1.0.dist-info/METADATA +300 -0
- oscura-0.1.0.dist-info/RECORD +463 -0
- oscura-0.1.0.dist-info/entry_points.txt +2 -0
- {oscura-0.0.1.dist-info → oscura-0.1.0.dist-info}/licenses/LICENSE +1 -1
- oscura-0.0.1.dist-info/METADATA +0 -63
- oscura-0.0.1.dist-info/RECORD +0 -5
- {oscura-0.0.1.dist-info → oscura-0.1.0.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,797 @@
|
|
|
1
|
+
"""Pattern learning and automatic discovery from binary data.
|
|
2
|
+
|
|
3
|
+
- RE-PAT-004: Pattern Learning and Discovery
|
|
4
|
+
|
|
5
|
+
This module provides machine learning inspired approaches for discovering
|
|
6
|
+
patterns in binary data without prior knowledge, including entropy-based
|
|
7
|
+
segmentation, frequency analysis, and structural inference.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from collections import Counter, defaultdict
|
|
13
|
+
from collections.abc import Sequence
|
|
14
|
+
from dataclasses import dataclass, field
|
|
15
|
+
|
|
16
|
+
import numpy as np
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass
class LearnedPattern:
    """A pattern discovered through learning.

    Implements RE-PAT-004: Learned pattern representation.

    Attributes:
        pattern: The pattern bytes.
        frequency: Number of occurrences.
        confidence: Confidence score (0-1).
        positions: List of positions where found. These are byte offsets
            within individual samples; the owning sample index is not
            retained.
        context_before: Common bytes appearing before pattern.
        context_after: Common bytes appearing after pattern.
        is_structural: Whether pattern appears to be structural.
        is_delimiter: Whether pattern appears to be a delimiter.
    """

    pattern: bytes
    frequency: int
    confidence: float
    positions: list[int] = field(default_factory=list)
    context_before: bytes = b""
    context_after: bytes = b""
    is_structural: bool = False
    is_delimiter: bool = False
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@dataclass
class StructureHypothesis:
    """Hypothesis about data structure.

    Implements RE-PAT-004: Structure hypothesis.

    Attributes:
        field_boundaries: Detected field boundaries (byte offsets).
        field_types: Inferred field types, one per detected field.
        header_size: Estimated header size in bytes.
        record_size: Estimated record size, or None when no fixed
            record size was detected.
        delimiters: Detected delimiter byte sequences.
        confidence: Overall confidence (0-1); 0.0 when no samples
            were available to analyze.
    """

    field_boundaries: list[int]
    field_types: list[str]
    header_size: int
    record_size: int | None
    delimiters: list[bytes]
    confidence: float
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
@dataclass
class NgramModel:
    """N-gram language model for binary data.

    Implements RE-PAT-004: N-gram modeling.

    Attributes:
        n: N-gram size in bytes.
        counts: N-gram frequency counts, keyed by the n-gram bytes.
        total: Total n-grams observed (sum of all counts).
        vocabulary_size: Number of unique n-grams (equals len(counts)).
    """

    n: int
    counts: dict[bytes, int] = field(default_factory=dict)
    total: int = 0
    vocabulary_size: int = 0
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
class PatternLearner:
|
|
89
|
+
"""Learn patterns from binary data samples.
|
|
90
|
+
|
|
91
|
+
Implements RE-PAT-004: Pattern Learning and Discovery.
|
|
92
|
+
|
|
93
|
+
Uses entropy analysis, n-gram frequency, and positional statistics
|
|
94
|
+
to discover recurring patterns without prior knowledge.
|
|
95
|
+
|
|
96
|
+
Example:
|
|
97
|
+
>>> learner = PatternLearner()
|
|
98
|
+
>>> learner.add_sample(data1)
|
|
99
|
+
>>> learner.add_sample(data2)
|
|
100
|
+
>>> patterns = learner.learn_patterns()
|
|
101
|
+
"""
|
|
102
|
+
|
|
103
|
+
    def __init__(
        self,
        min_pattern_length: int = 2,
        max_pattern_length: int = 16,
        min_frequency: int = 3,
        min_confidence: float = 0.5,
    ) -> None:
        """Initialize pattern learner.

        Args:
            min_pattern_length: Minimum pattern length to consider.
            max_pattern_length: Maximum pattern length to consider.
            min_frequency: Minimum occurrences to consider pattern.
            min_confidence: Minimum confidence threshold; lower-scoring
                patterns are discarded during learning.
        """
        # Search bounds and acceptance thresholds for pattern discovery.
        self.min_pattern_length = min_pattern_length
        self.max_pattern_length = max_pattern_length
        self.min_frequency = min_frequency
        self.min_confidence = min_confidence

        # Accumulated raw samples and statistics derived from them.
        self._samples: list[bytes] = []
        self._ngram_models: dict[int, NgramModel] = {}  # keyed by n-gram size
        self._position_stats: dict[bytes, list[int]] = defaultdict(list)
|
|
126
|
+
|
|
127
|
+
    def add_sample(self, data: bytes) -> None:
        """Add a data sample for learning.

        Samples are only stored; analysis is deferred until
        learn_patterns() or learn_structure() is called.

        Args:
            data: Binary data sample.
        """
        self._samples.append(data)
|
|
134
|
+
|
|
135
|
+
    def add_samples(self, samples: Sequence[bytes]) -> None:
        """Add multiple data samples.

        Equivalent to calling add_sample() for each element.

        Args:
            samples: List of binary data samples.
        """
        self._samples.extend(samples)
|
|
142
|
+
|
|
143
|
+
def learn_patterns(self, top_k: int = 20) -> list[LearnedPattern]:
|
|
144
|
+
"""Learn patterns from accumulated samples.
|
|
145
|
+
|
|
146
|
+
Implements RE-PAT-004: Pattern discovery workflow.
|
|
147
|
+
|
|
148
|
+
Args:
|
|
149
|
+
top_k: Maximum number of patterns to return.
|
|
150
|
+
|
|
151
|
+
Returns:
|
|
152
|
+
List of discovered patterns, sorted by confidence.
|
|
153
|
+
"""
|
|
154
|
+
if not self._samples:
|
|
155
|
+
return []
|
|
156
|
+
|
|
157
|
+
# Build n-gram models
|
|
158
|
+
self._build_ngram_models()
|
|
159
|
+
|
|
160
|
+
# Find candidate patterns
|
|
161
|
+
candidates = self._find_candidates()
|
|
162
|
+
|
|
163
|
+
# Score and filter patterns
|
|
164
|
+
scored = self._score_patterns(candidates)
|
|
165
|
+
|
|
166
|
+
# Sort by confidence and return top K
|
|
167
|
+
scored.sort(key=lambda p: -p.confidence)
|
|
168
|
+
return scored[:top_k]
|
|
169
|
+
|
|
170
|
+
def learn_structure(self) -> StructureHypothesis:
|
|
171
|
+
"""Learn structural patterns from samples.
|
|
172
|
+
|
|
173
|
+
Implements RE-PAT-004: Structure inference.
|
|
174
|
+
|
|
175
|
+
Returns:
|
|
176
|
+
StructureHypothesis about data organization.
|
|
177
|
+
"""
|
|
178
|
+
if not self._samples:
|
|
179
|
+
return StructureHypothesis(
|
|
180
|
+
field_boundaries=[],
|
|
181
|
+
field_types=[],
|
|
182
|
+
header_size=0,
|
|
183
|
+
record_size=None,
|
|
184
|
+
delimiters=[],
|
|
185
|
+
confidence=0.0,
|
|
186
|
+
)
|
|
187
|
+
|
|
188
|
+
# Analyze entropy profile for field boundaries
|
|
189
|
+
boundaries = self._detect_field_boundaries()
|
|
190
|
+
|
|
191
|
+
# Infer field types
|
|
192
|
+
field_types = self._infer_field_types(boundaries)
|
|
193
|
+
|
|
194
|
+
# Estimate header size
|
|
195
|
+
header_size = self._estimate_header_size(boundaries)
|
|
196
|
+
|
|
197
|
+
# Check for fixed record size
|
|
198
|
+
record_size = self._detect_record_size()
|
|
199
|
+
|
|
200
|
+
# Find delimiters
|
|
201
|
+
delimiters = self._find_delimiters()
|
|
202
|
+
|
|
203
|
+
# Calculate confidence
|
|
204
|
+
confidence = self._calculate_structure_confidence(
|
|
205
|
+
boundaries, field_types, record_size, delimiters
|
|
206
|
+
)
|
|
207
|
+
|
|
208
|
+
return StructureHypothesis(
|
|
209
|
+
field_boundaries=boundaries,
|
|
210
|
+
field_types=field_types,
|
|
211
|
+
header_size=header_size,
|
|
212
|
+
record_size=record_size,
|
|
213
|
+
delimiters=delimiters,
|
|
214
|
+
confidence=confidence,
|
|
215
|
+
)
|
|
216
|
+
|
|
217
|
+
def predict_next_bytes(
|
|
218
|
+
self, context: bytes, n_predictions: int = 5
|
|
219
|
+
) -> list[tuple[bytes, float]]:
|
|
220
|
+
"""Predict likely next bytes given context.
|
|
221
|
+
|
|
222
|
+
Implements RE-PAT-004: Byte prediction using n-gram models.
|
|
223
|
+
|
|
224
|
+
Args:
|
|
225
|
+
context: Context bytes.
|
|
226
|
+
n_predictions: Number of predictions to return.
|
|
227
|
+
|
|
228
|
+
Returns:
|
|
229
|
+
List of (next_byte, probability) tuples.
|
|
230
|
+
"""
|
|
231
|
+
predictions = []
|
|
232
|
+
|
|
233
|
+
# Use largest n-gram model that fits context
|
|
234
|
+
for n in range(min(len(context) + 1, self.max_pattern_length), 0, -1):
|
|
235
|
+
if n not in self._ngram_models:
|
|
236
|
+
continue
|
|
237
|
+
|
|
238
|
+
model = self._ngram_models[n]
|
|
239
|
+
prefix = context[-(n - 1) :] if n > 1 else b""
|
|
240
|
+
|
|
241
|
+
# Find matching prefixes
|
|
242
|
+
matching = {}
|
|
243
|
+
for ngram, count in model.counts.items():
|
|
244
|
+
if ngram[:-1] == prefix:
|
|
245
|
+
matching[ngram[-1:]] = count
|
|
246
|
+
|
|
247
|
+
if matching:
|
|
248
|
+
total = sum(matching.values())
|
|
249
|
+
for byte_val, count in matching.items():
|
|
250
|
+
prob = count / total
|
|
251
|
+
predictions.append((byte_val, prob))
|
|
252
|
+
break
|
|
253
|
+
|
|
254
|
+
# Sort by probability
|
|
255
|
+
predictions.sort(key=lambda x: -x[1])
|
|
256
|
+
return predictions[:n_predictions]
|
|
257
|
+
|
|
258
|
+
def build_ngram_model(self, n: int) -> NgramModel:
|
|
259
|
+
"""Build n-gram model from samples.
|
|
260
|
+
|
|
261
|
+
Args:
|
|
262
|
+
n: N-gram size.
|
|
263
|
+
|
|
264
|
+
Returns:
|
|
265
|
+
NgramModel with frequency statistics.
|
|
266
|
+
"""
|
|
267
|
+
model = NgramModel(n=n)
|
|
268
|
+
|
|
269
|
+
for sample in self._samples:
|
|
270
|
+
for i in range(len(sample) - n + 1):
|
|
271
|
+
ngram = sample[i : i + n]
|
|
272
|
+
if ngram not in model.counts:
|
|
273
|
+
model.counts[ngram] = 0
|
|
274
|
+
model.vocabulary_size += 1
|
|
275
|
+
model.counts[ngram] += 1
|
|
276
|
+
model.total += 1
|
|
277
|
+
|
|
278
|
+
self._ngram_models[n] = model
|
|
279
|
+
return model
|
|
280
|
+
|
|
281
|
+
    def _build_ngram_models(self) -> None:
        """Build n-gram models for all sizes.

        Populates ``self._ngram_models`` for every n from
        ``min_pattern_length`` through ``max_pattern_length`` inclusive.
        """
        for n in range(self.min_pattern_length, self.max_pattern_length + 1):
            self.build_ngram_model(n)
|
|
285
|
+
|
|
286
|
+
def _find_candidates(self) -> dict[bytes, int]:
|
|
287
|
+
"""Find candidate patterns based on frequency.
|
|
288
|
+
|
|
289
|
+
Returns:
|
|
290
|
+
Dictionary mapping patterns to frequencies.
|
|
291
|
+
"""
|
|
292
|
+
candidates = {}
|
|
293
|
+
|
|
294
|
+
for n in range(self.min_pattern_length, self.max_pattern_length + 1):
|
|
295
|
+
if n not in self._ngram_models:
|
|
296
|
+
continue
|
|
297
|
+
|
|
298
|
+
model = self._ngram_models[n]
|
|
299
|
+
for pattern, count in model.counts.items():
|
|
300
|
+
if count >= self.min_frequency:
|
|
301
|
+
candidates[pattern] = count
|
|
302
|
+
|
|
303
|
+
return candidates
|
|
304
|
+
|
|
305
|
+
def _score_patterns(self, candidates: dict[bytes, int]) -> list[LearnedPattern]:
|
|
306
|
+
"""Score candidate patterns.
|
|
307
|
+
|
|
308
|
+
Args:
|
|
309
|
+
candidates: Dictionary of pattern -> frequency.
|
|
310
|
+
|
|
311
|
+
Returns:
|
|
312
|
+
List of scored LearnedPattern objects.
|
|
313
|
+
"""
|
|
314
|
+
patterns = []
|
|
315
|
+
|
|
316
|
+
for pattern, frequency in candidates.items():
|
|
317
|
+
# Find all positions across samples
|
|
318
|
+
positions = []
|
|
319
|
+
for sample_idx, sample in enumerate(self._samples):
|
|
320
|
+
start = 0
|
|
321
|
+
while True:
|
|
322
|
+
pos = sample.find(pattern, start)
|
|
323
|
+
if pos == -1:
|
|
324
|
+
break
|
|
325
|
+
positions.append((sample_idx, pos))
|
|
326
|
+
start = pos + 1
|
|
327
|
+
|
|
328
|
+
# Calculate confidence based on distribution
|
|
329
|
+
confidence = self._calculate_pattern_confidence(pattern, positions)
|
|
330
|
+
|
|
331
|
+
if confidence < self.min_confidence:
|
|
332
|
+
continue
|
|
333
|
+
|
|
334
|
+
# Get context
|
|
335
|
+
context_before, context_after = self._get_context(pattern, positions)
|
|
336
|
+
|
|
337
|
+
# Check if structural
|
|
338
|
+
is_structural = self._is_structural(pattern, positions)
|
|
339
|
+
|
|
340
|
+
# Check if delimiter
|
|
341
|
+
is_delimiter = self._is_delimiter(pattern, positions)
|
|
342
|
+
|
|
343
|
+
patterns.append(
|
|
344
|
+
LearnedPattern(
|
|
345
|
+
pattern=pattern,
|
|
346
|
+
frequency=frequency,
|
|
347
|
+
confidence=confidence,
|
|
348
|
+
positions=[p for _, p in positions],
|
|
349
|
+
context_before=context_before,
|
|
350
|
+
context_after=context_after,
|
|
351
|
+
is_structural=is_structural,
|
|
352
|
+
is_delimiter=is_delimiter,
|
|
353
|
+
)
|
|
354
|
+
)
|
|
355
|
+
|
|
356
|
+
return patterns
|
|
357
|
+
|
|
358
|
+
def _calculate_pattern_confidence(
|
|
359
|
+
self, pattern: bytes, positions: list[tuple[int, int]]
|
|
360
|
+
) -> float:
|
|
361
|
+
"""Calculate confidence score for pattern.
|
|
362
|
+
|
|
363
|
+
Args:
|
|
364
|
+
pattern: The pattern.
|
|
365
|
+
positions: List of (sample_idx, position) tuples.
|
|
366
|
+
|
|
367
|
+
Returns:
|
|
368
|
+
Confidence score (0-1).
|
|
369
|
+
"""
|
|
370
|
+
if not positions:
|
|
371
|
+
return 0.0
|
|
372
|
+
|
|
373
|
+
# Factor 1: Frequency across samples
|
|
374
|
+
samples_with_pattern = len({p[0] for p in positions})
|
|
375
|
+
sample_coverage = samples_with_pattern / len(self._samples)
|
|
376
|
+
|
|
377
|
+
# Factor 2: Positional consistency
|
|
378
|
+
position_offsets = [p[1] for p in positions]
|
|
379
|
+
if len(position_offsets) > 1:
|
|
380
|
+
variance = float(np.var(position_offsets))
|
|
381
|
+
max_pos = max(max(len(s) for s in self._samples), 1)
|
|
382
|
+
position_consistency = 1.0 / (1.0 + variance / (max_pos**2))
|
|
383
|
+
else:
|
|
384
|
+
position_consistency = 0.5
|
|
385
|
+
|
|
386
|
+
# Factor 3: Pattern complexity (non-trivial patterns)
|
|
387
|
+
unique_bytes = len(set(pattern))
|
|
388
|
+
complexity = unique_bytes / len(pattern) if pattern else 0
|
|
389
|
+
|
|
390
|
+
# Combined score
|
|
391
|
+
confidence = 0.4 * sample_coverage + 0.3 * position_consistency + 0.3 * complexity
|
|
392
|
+
|
|
393
|
+
return float(min(1.0, confidence))
|
|
394
|
+
|
|
395
|
+
def _get_context(self, pattern: bytes, positions: list[tuple[int, int]]) -> tuple[bytes, bytes]:
|
|
396
|
+
"""Get common context before and after pattern.
|
|
397
|
+
|
|
398
|
+
Args:
|
|
399
|
+
pattern: The pattern.
|
|
400
|
+
positions: List of (sample_idx, position) tuples.
|
|
401
|
+
|
|
402
|
+
Returns:
|
|
403
|
+
Tuple of (context_before, context_after).
|
|
404
|
+
"""
|
|
405
|
+
before_bytes = []
|
|
406
|
+
after_bytes = []
|
|
407
|
+
|
|
408
|
+
context_len = min(4, self.min_pattern_length)
|
|
409
|
+
|
|
410
|
+
for sample_idx, pos in positions[:100]: # Limit samples
|
|
411
|
+
sample = self._samples[sample_idx]
|
|
412
|
+
|
|
413
|
+
# Bytes before
|
|
414
|
+
if pos >= context_len:
|
|
415
|
+
before_bytes.append(sample[pos - context_len : pos])
|
|
416
|
+
|
|
417
|
+
# Bytes after
|
|
418
|
+
end_pos = pos + len(pattern)
|
|
419
|
+
if end_pos + context_len <= len(sample):
|
|
420
|
+
after_bytes.append(sample[end_pos : end_pos + context_len])
|
|
421
|
+
|
|
422
|
+
# Find most common
|
|
423
|
+
context_before = b""
|
|
424
|
+
context_after = b""
|
|
425
|
+
|
|
426
|
+
if before_bytes:
|
|
427
|
+
counter = Counter(before_bytes)
|
|
428
|
+
most_common = counter.most_common(1)
|
|
429
|
+
if most_common and most_common[0][1] >= 2:
|
|
430
|
+
context_before = most_common[0][0]
|
|
431
|
+
|
|
432
|
+
if after_bytes:
|
|
433
|
+
counter = Counter(after_bytes)
|
|
434
|
+
most_common = counter.most_common(1)
|
|
435
|
+
if most_common and most_common[0][1] >= 2:
|
|
436
|
+
context_after = most_common[0][0]
|
|
437
|
+
|
|
438
|
+
return context_before, context_after
|
|
439
|
+
|
|
440
|
+
def _is_structural(self, pattern: bytes, positions: list[tuple[int, int]]) -> bool:
|
|
441
|
+
"""Check if pattern appears structural.
|
|
442
|
+
|
|
443
|
+
Args:
|
|
444
|
+
pattern: The pattern.
|
|
445
|
+
positions: List of positions.
|
|
446
|
+
|
|
447
|
+
Returns:
|
|
448
|
+
True if pattern appears structural.
|
|
449
|
+
"""
|
|
450
|
+
if not positions:
|
|
451
|
+
return False
|
|
452
|
+
|
|
453
|
+
# Structural patterns tend to appear at consistent offsets
|
|
454
|
+
offsets = [p[1] for p in positions]
|
|
455
|
+
if len(set(offsets)) == 1:
|
|
456
|
+
return True
|
|
457
|
+
|
|
458
|
+
# Or at regular intervals
|
|
459
|
+
if len(offsets) > 2:
|
|
460
|
+
diffs = [
|
|
461
|
+
offsets[i + 1] - offsets[i]
|
|
462
|
+
for i in range(len(offsets) - 1)
|
|
463
|
+
if offsets[i + 1] > offsets[i]
|
|
464
|
+
]
|
|
465
|
+
if diffs and len(set(diffs)) == 1:
|
|
466
|
+
return True
|
|
467
|
+
|
|
468
|
+
return False
|
|
469
|
+
|
|
470
|
+
def _is_delimiter(self, pattern: bytes, positions: list[tuple[int, int]]) -> bool:
|
|
471
|
+
"""Check if pattern appears to be a delimiter.
|
|
472
|
+
|
|
473
|
+
Args:
|
|
474
|
+
pattern: The pattern.
|
|
475
|
+
positions: List of positions.
|
|
476
|
+
|
|
477
|
+
Returns:
|
|
478
|
+
True if pattern appears to be a delimiter.
|
|
479
|
+
"""
|
|
480
|
+
# Delimiters often have regular spacing
|
|
481
|
+
if not positions:
|
|
482
|
+
return False
|
|
483
|
+
|
|
484
|
+
# Group by sample
|
|
485
|
+
by_sample = defaultdict(list)
|
|
486
|
+
for sample_idx, pos in positions:
|
|
487
|
+
by_sample[sample_idx].append(pos)
|
|
488
|
+
|
|
489
|
+
regular_count = 0
|
|
490
|
+
for sample_positions in by_sample.values():
|
|
491
|
+
if len(sample_positions) >= 3:
|
|
492
|
+
diffs = [
|
|
493
|
+
sample_positions[i + 1] - sample_positions[i]
|
|
494
|
+
for i in range(len(sample_positions) - 1)
|
|
495
|
+
]
|
|
496
|
+
# Check for regular intervals
|
|
497
|
+
if len(set(diffs)) == 1 or (diffs and max(diffs) - min(diffs) < 4):
|
|
498
|
+
regular_count += 1
|
|
499
|
+
|
|
500
|
+
return regular_count >= len(by_sample) * 0.5
|
|
501
|
+
|
|
502
|
+
def _detect_field_boundaries(self) -> list[int]:
    """Detect field boundaries using entropy transitions."""
    if not self._samples:
        return []

    # Concatenate up to ten samples so transitions reflect typical content.
    blob = b"".join(self._samples[:10])

    from oscura.analyzers.statistical.entropy import detect_entropy_transitions

    try:
        found = detect_entropy_transitions(blob, window=64, threshold=0.8, min_gap=4)
        return [transition.offset for transition in found]
    except ValueError:
        # Input too small (or otherwise rejected) for transition detection.
        return []
|
|
517
|
+
|
|
518
|
+
def _infer_field_types(self, boundaries: list[int]) -> list[str]:
    """Infer field types based on content patterns.

    Args:
        boundaries: Field boundary offsets.

    Returns:
        List of inferred field types.
    """
    if not boundaries or not self._samples:
        return []

    # Classification is done on the first sample only.
    reference = self._samples[0]
    # Bracket the boundaries with the start and end of the reference sample.
    edges = [0, *boundaries, len(reference)]

    return [
        self._classify_field(reference[edges[i] : min(edges[i + 1], len(reference))])
        for i in range(len(edges) - 1)
    ]
|
|
543
|
+
|
|
544
|
+
def _classify_field(self, data: bytes) -> str:
|
|
545
|
+
"""Classify a field based on its content.
|
|
546
|
+
|
|
547
|
+
Args:
|
|
548
|
+
data: Field data.
|
|
549
|
+
|
|
550
|
+
Returns:
|
|
551
|
+
Field type string.
|
|
552
|
+
"""
|
|
553
|
+
if not data:
|
|
554
|
+
return "empty"
|
|
555
|
+
|
|
556
|
+
# Check for constant
|
|
557
|
+
if len(set(data)) == 1:
|
|
558
|
+
return "constant"
|
|
559
|
+
|
|
560
|
+
# Check for counter (monotonic)
|
|
561
|
+
if len(data) <= 4:
|
|
562
|
+
values = list(data)
|
|
563
|
+
if all(values[i] <= values[i + 1] for i in range(len(values) - 1)):
|
|
564
|
+
return "counter"
|
|
565
|
+
|
|
566
|
+
# Check for printable text
|
|
567
|
+
printable = sum(1 for b in data if 32 <= b <= 126)
|
|
568
|
+
if printable / len(data) > 0.8:
|
|
569
|
+
return "text"
|
|
570
|
+
|
|
571
|
+
# Check for high entropy (random/encrypted)
|
|
572
|
+
from oscura.analyzers.statistical.entropy import shannon_entropy
|
|
573
|
+
|
|
574
|
+
entropy = shannon_entropy(data)
|
|
575
|
+
if entropy > 7.0:
|
|
576
|
+
return "random"
|
|
577
|
+
elif entropy > 5.0:
|
|
578
|
+
return "binary"
|
|
579
|
+
|
|
580
|
+
return "structured"
|
|
581
|
+
|
|
582
|
+
def _estimate_header_size(self, boundaries: list[int]) -> int:
|
|
583
|
+
"""Estimate header size from boundaries.
|
|
584
|
+
|
|
585
|
+
Args:
|
|
586
|
+
boundaries: Field boundary offsets.
|
|
587
|
+
|
|
588
|
+
Returns:
|
|
589
|
+
Estimated header size.
|
|
590
|
+
"""
|
|
591
|
+
if not boundaries:
|
|
592
|
+
return 0
|
|
593
|
+
|
|
594
|
+
# Header typically ends at first high-entropy transition
|
|
595
|
+
for b in boundaries:
|
|
596
|
+
if b > 0:
|
|
597
|
+
return b
|
|
598
|
+
|
|
599
|
+
return boundaries[0] if boundaries else 0
|
|
600
|
+
|
|
601
|
+
def _detect_record_size(self) -> int | None:
|
|
602
|
+
"""Detect fixed record size if present.
|
|
603
|
+
|
|
604
|
+
Returns:
|
|
605
|
+
Record size or None if variable.
|
|
606
|
+
"""
|
|
607
|
+
if len(self._samples) < 2:
|
|
608
|
+
return None
|
|
609
|
+
|
|
610
|
+
# Check if all samples have same length
|
|
611
|
+
lengths = [len(s) for s in self._samples]
|
|
612
|
+
if len(set(lengths)) == 1:
|
|
613
|
+
return lengths[0]
|
|
614
|
+
|
|
615
|
+
# Check for GCD of lengths (might indicate record size)
|
|
616
|
+
from functools import reduce
|
|
617
|
+
from math import gcd
|
|
618
|
+
|
|
619
|
+
if all(length > 0 for length in lengths):
|
|
620
|
+
common_div = reduce(gcd, lengths)
|
|
621
|
+
if common_div > 1 and common_div != min(lengths):
|
|
622
|
+
return common_div
|
|
623
|
+
|
|
624
|
+
return None
|
|
625
|
+
|
|
626
|
+
def _find_delimiters(self) -> list[bytes]:
    """Find delimiter patterns.

    Returns:
        List of likely delimiter bytes.
    """
    # Learn a generous set of patterns, then keep only the delimiter-like
    # ones, capped at five.
    learned = self.learn_patterns(top_k=50)
    delimiters = [candidate.pattern for candidate in learned if candidate.is_delimiter]
    return delimiters[:5]
|
|
634
|
+
|
|
635
|
+
def _calculate_structure_confidence(
|
|
636
|
+
self,
|
|
637
|
+
boundaries: list[int],
|
|
638
|
+
field_types: list[str],
|
|
639
|
+
record_size: int | None,
|
|
640
|
+
delimiters: list[bytes],
|
|
641
|
+
) -> float:
|
|
642
|
+
"""Calculate confidence in structure hypothesis.
|
|
643
|
+
|
|
644
|
+
Args:
|
|
645
|
+
boundaries: Detected boundaries.
|
|
646
|
+
field_types: Inferred types.
|
|
647
|
+
record_size: Detected record size.
|
|
648
|
+
delimiters: Found delimiters.
|
|
649
|
+
|
|
650
|
+
Returns:
|
|
651
|
+
Confidence score (0-1).
|
|
652
|
+
"""
|
|
653
|
+
score = 0.0
|
|
654
|
+
|
|
655
|
+
# Having boundaries adds confidence
|
|
656
|
+
if boundaries:
|
|
657
|
+
score += 0.3
|
|
658
|
+
|
|
659
|
+
# Having non-unknown field types adds confidence
|
|
660
|
+
known_types = sum(1 for t in field_types if t != "structured")
|
|
661
|
+
if field_types:
|
|
662
|
+
score += 0.2 * (known_types / len(field_types))
|
|
663
|
+
|
|
664
|
+
# Fixed record size adds confidence
|
|
665
|
+
if record_size is not None:
|
|
666
|
+
score += 0.2
|
|
667
|
+
|
|
668
|
+
# Delimiters add confidence
|
|
669
|
+
if delimiters:
|
|
670
|
+
score += 0.2
|
|
671
|
+
|
|
672
|
+
# Multiple samples add confidence
|
|
673
|
+
if len(self._samples) > 5:
|
|
674
|
+
score += 0.1
|
|
675
|
+
|
|
676
|
+
return min(1.0, score)
|
|
677
|
+
|
|
678
|
+
|
|
679
|
+
def learn_patterns_from_data(
    data: bytes | Sequence[bytes],
    min_length: int = 2,
    max_length: int = 16,
    min_frequency: int = 3,
    top_k: int = 20,
) -> list[LearnedPattern]:
    """Learn patterns from binary data.

    Implements RE-PAT-004: Pattern Learning and Discovery.

    Args:
        data: Single data sample (bytes or bytearray) or list of samples.
        min_length: Minimum pattern length.
        max_length: Maximum pattern length.
        min_frequency: Minimum occurrences.
        top_k: Number of patterns to return.

    Returns:
        List of discovered patterns.

    Example:
        >>> patterns = learn_patterns_from_data(binary_data)
        >>> for p in patterns:
        ...     print(f"Pattern: {p.pattern.hex()}, freq: {p.frequency}")
    """
    learner = PatternLearner(
        min_pattern_length=min_length,
        max_pattern_length=max_length,
        min_frequency=min_frequency,
    )

    # Fix: a bytearray is a single bytes-like sample; the previous
    # `isinstance(data, bytes)` check sent it down the add_samples branch,
    # where iterating it yields ints instead of byte strings.
    if isinstance(data, (bytes, bytearray)):
        learner.add_sample(bytes(data))
    else:
        learner.add_samples(data)

    return learner.learn_patterns(top_k=top_k)
|
|
717
|
+
|
|
718
|
+
|
|
719
|
+
def infer_structure(samples: Sequence[bytes]) -> StructureHypothesis:
    """Infer data structure from samples.

    Implements RE-PAT-004: Structure inference.

    Args:
        samples: List of binary data samples.

    Returns:
        StructureHypothesis about data organization.

    Example:
        >>> hypothesis = infer_structure(packet_samples)
        >>> print(f"Header size: {hypothesis.header_size}")
    """
    # Delegate to a default-configured learner fed with all samples.
    analyzer = PatternLearner()
    analyzer.add_samples(samples)
    return analyzer.learn_structure()
|
|
737
|
+
|
|
738
|
+
|
|
739
|
+
def find_recurring_structures(
|
|
740
|
+
data: bytes,
|
|
741
|
+
min_size: int = 8,
|
|
742
|
+
max_size: int = 256,
|
|
743
|
+
) -> list[tuple[int, int, float]]:
|
|
744
|
+
"""Find recurring fixed-size structures in data.
|
|
745
|
+
|
|
746
|
+
Implements RE-PAT-004: Structure detection.
|
|
747
|
+
|
|
748
|
+
Args:
|
|
749
|
+
data: Binary data.
|
|
750
|
+
min_size: Minimum structure size.
|
|
751
|
+
max_size: Maximum structure size.
|
|
752
|
+
|
|
753
|
+
Returns:
|
|
754
|
+
List of (size, offset, confidence) tuples for detected structures.
|
|
755
|
+
"""
|
|
756
|
+
results = []
|
|
757
|
+
|
|
758
|
+
for size in range(min_size, min(max_size, len(data) // 2) + 1):
|
|
759
|
+
# Check if data divides evenly
|
|
760
|
+
if len(data) % size != 0:
|
|
761
|
+
continue
|
|
762
|
+
|
|
763
|
+
num_records = len(data) // size
|
|
764
|
+
if num_records < 2:
|
|
765
|
+
continue
|
|
766
|
+
|
|
767
|
+
# Compare records for similarity
|
|
768
|
+
records = [data[i * size : (i + 1) * size] for i in range(num_records)]
|
|
769
|
+
|
|
770
|
+
# Calculate similarity between consecutive records
|
|
771
|
+
similarities = []
|
|
772
|
+
for i in range(len(records) - 1):
|
|
773
|
+
matching = sum(a == b for a, b in zip(records[i], records[i + 1], strict=True))
|
|
774
|
+
similarities.append(matching / size)
|
|
775
|
+
|
|
776
|
+
if similarities:
|
|
777
|
+
avg_similarity = sum(similarities) / len(similarities)
|
|
778
|
+
if avg_similarity > 0.3: # Some structural similarity
|
|
779
|
+
results.append((size, 0, avg_similarity))
|
|
780
|
+
|
|
781
|
+
# Sort by confidence
|
|
782
|
+
results.sort(key=lambda x: -x[2])
|
|
783
|
+
return results[:5]
|
|
784
|
+
|
|
785
|
+
|
|
786
|
+
# Public API of this module, grouped by kind.
__all__ = [
    # Data classes
    "LearnedPattern",
    "NgramModel",
    # Classes
    "PatternLearner",
    "StructureHypothesis",
    # Functions
    "find_recurring_structures",
    "infer_structure",
    "learn_patterns_from_data",
]
|