oscura 0.0.1__py3-none-any.whl → 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- oscura/__init__.py +813 -8
- oscura/__main__.py +392 -0
- oscura/analyzers/__init__.py +37 -0
- oscura/analyzers/digital/__init__.py +177 -0
- oscura/analyzers/digital/bus.py +691 -0
- oscura/analyzers/digital/clock.py +805 -0
- oscura/analyzers/digital/correlation.py +720 -0
- oscura/analyzers/digital/edges.py +632 -0
- oscura/analyzers/digital/extraction.py +413 -0
- oscura/analyzers/digital/quality.py +878 -0
- oscura/analyzers/digital/signal_quality.py +877 -0
- oscura/analyzers/digital/thresholds.py +708 -0
- oscura/analyzers/digital/timing.py +1104 -0
- oscura/analyzers/eye/__init__.py +46 -0
- oscura/analyzers/eye/diagram.py +434 -0
- oscura/analyzers/eye/metrics.py +555 -0
- oscura/analyzers/jitter/__init__.py +83 -0
- oscura/analyzers/jitter/ber.py +333 -0
- oscura/analyzers/jitter/decomposition.py +759 -0
- oscura/analyzers/jitter/measurements.py +413 -0
- oscura/analyzers/jitter/spectrum.py +220 -0
- oscura/analyzers/measurements.py +40 -0
- oscura/analyzers/packet/__init__.py +171 -0
- oscura/analyzers/packet/daq.py +1077 -0
- oscura/analyzers/packet/metrics.py +437 -0
- oscura/analyzers/packet/parser.py +327 -0
- oscura/analyzers/packet/payload.py +2156 -0
- oscura/analyzers/packet/payload_analysis.py +1312 -0
- oscura/analyzers/packet/payload_extraction.py +236 -0
- oscura/analyzers/packet/payload_patterns.py +670 -0
- oscura/analyzers/packet/stream.py +359 -0
- oscura/analyzers/patterns/__init__.py +266 -0
- oscura/analyzers/patterns/clustering.py +1036 -0
- oscura/analyzers/patterns/discovery.py +539 -0
- oscura/analyzers/patterns/learning.py +797 -0
- oscura/analyzers/patterns/matching.py +1091 -0
- oscura/analyzers/patterns/periodic.py +650 -0
- oscura/analyzers/patterns/sequences.py +767 -0
- oscura/analyzers/power/__init__.py +116 -0
- oscura/analyzers/power/ac_power.py +391 -0
- oscura/analyzers/power/basic.py +383 -0
- oscura/analyzers/power/conduction.py +314 -0
- oscura/analyzers/power/efficiency.py +297 -0
- oscura/analyzers/power/ripple.py +356 -0
- oscura/analyzers/power/soa.py +372 -0
- oscura/analyzers/power/switching.py +479 -0
- oscura/analyzers/protocol/__init__.py +150 -0
- oscura/analyzers/protocols/__init__.py +150 -0
- oscura/analyzers/protocols/base.py +500 -0
- oscura/analyzers/protocols/can.py +620 -0
- oscura/analyzers/protocols/can_fd.py +448 -0
- oscura/analyzers/protocols/flexray.py +405 -0
- oscura/analyzers/protocols/hdlc.py +399 -0
- oscura/analyzers/protocols/i2c.py +368 -0
- oscura/analyzers/protocols/i2s.py +296 -0
- oscura/analyzers/protocols/jtag.py +393 -0
- oscura/analyzers/protocols/lin.py +445 -0
- oscura/analyzers/protocols/manchester.py +333 -0
- oscura/analyzers/protocols/onewire.py +501 -0
- oscura/analyzers/protocols/spi.py +334 -0
- oscura/analyzers/protocols/swd.py +325 -0
- oscura/analyzers/protocols/uart.py +393 -0
- oscura/analyzers/protocols/usb.py +495 -0
- oscura/analyzers/signal_integrity/__init__.py +63 -0
- oscura/analyzers/signal_integrity/embedding.py +294 -0
- oscura/analyzers/signal_integrity/equalization.py +370 -0
- oscura/analyzers/signal_integrity/sparams.py +484 -0
- oscura/analyzers/spectral/__init__.py +53 -0
- oscura/analyzers/spectral/chunked.py +273 -0
- oscura/analyzers/spectral/chunked_fft.py +571 -0
- oscura/analyzers/spectral/chunked_wavelet.py +391 -0
- oscura/analyzers/spectral/fft.py +92 -0
- oscura/analyzers/statistical/__init__.py +250 -0
- oscura/analyzers/statistical/checksum.py +923 -0
- oscura/analyzers/statistical/chunked_corr.py +228 -0
- oscura/analyzers/statistical/classification.py +778 -0
- oscura/analyzers/statistical/entropy.py +1113 -0
- oscura/analyzers/statistical/ngrams.py +614 -0
- oscura/analyzers/statistics/__init__.py +119 -0
- oscura/analyzers/statistics/advanced.py +885 -0
- oscura/analyzers/statistics/basic.py +263 -0
- oscura/analyzers/statistics/correlation.py +630 -0
- oscura/analyzers/statistics/distribution.py +298 -0
- oscura/analyzers/statistics/outliers.py +463 -0
- oscura/analyzers/statistics/streaming.py +93 -0
- oscura/analyzers/statistics/trend.py +520 -0
- oscura/analyzers/validation.py +598 -0
- oscura/analyzers/waveform/__init__.py +36 -0
- oscura/analyzers/waveform/measurements.py +943 -0
- oscura/analyzers/waveform/measurements_with_uncertainty.py +371 -0
- oscura/analyzers/waveform/spectral.py +1689 -0
- oscura/analyzers/waveform/wavelets.py +298 -0
- oscura/api/__init__.py +62 -0
- oscura/api/dsl.py +538 -0
- oscura/api/fluent.py +571 -0
- oscura/api/operators.py +498 -0
- oscura/api/optimization.py +392 -0
- oscura/api/profiling.py +396 -0
- oscura/automotive/__init__.py +73 -0
- oscura/automotive/can/__init__.py +52 -0
- oscura/automotive/can/analysis.py +356 -0
- oscura/automotive/can/checksum.py +250 -0
- oscura/automotive/can/correlation.py +212 -0
- oscura/automotive/can/discovery.py +355 -0
- oscura/automotive/can/message_wrapper.py +375 -0
- oscura/automotive/can/models.py +385 -0
- oscura/automotive/can/patterns.py +381 -0
- oscura/automotive/can/session.py +452 -0
- oscura/automotive/can/state_machine.py +300 -0
- oscura/automotive/can/stimulus_response.py +461 -0
- oscura/automotive/dbc/__init__.py +15 -0
- oscura/automotive/dbc/generator.py +156 -0
- oscura/automotive/dbc/parser.py +146 -0
- oscura/automotive/dtc/__init__.py +30 -0
- oscura/automotive/dtc/database.py +3036 -0
- oscura/automotive/j1939/__init__.py +14 -0
- oscura/automotive/j1939/decoder.py +745 -0
- oscura/automotive/loaders/__init__.py +35 -0
- oscura/automotive/loaders/asc.py +98 -0
- oscura/automotive/loaders/blf.py +77 -0
- oscura/automotive/loaders/csv_can.py +136 -0
- oscura/automotive/loaders/dispatcher.py +136 -0
- oscura/automotive/loaders/mdf.py +331 -0
- oscura/automotive/loaders/pcap.py +132 -0
- oscura/automotive/obd/__init__.py +14 -0
- oscura/automotive/obd/decoder.py +707 -0
- oscura/automotive/uds/__init__.py +48 -0
- oscura/automotive/uds/decoder.py +265 -0
- oscura/automotive/uds/models.py +64 -0
- oscura/automotive/visualization.py +369 -0
- oscura/batch/__init__.py +55 -0
- oscura/batch/advanced.py +627 -0
- oscura/batch/aggregate.py +300 -0
- oscura/batch/analyze.py +139 -0
- oscura/batch/logging.py +487 -0
- oscura/batch/metrics.py +556 -0
- oscura/builders/__init__.py +41 -0
- oscura/builders/signal_builder.py +1131 -0
- oscura/cli/__init__.py +14 -0
- oscura/cli/batch.py +339 -0
- oscura/cli/characterize.py +273 -0
- oscura/cli/compare.py +775 -0
- oscura/cli/decode.py +551 -0
- oscura/cli/main.py +247 -0
- oscura/cli/shell.py +350 -0
- oscura/comparison/__init__.py +66 -0
- oscura/comparison/compare.py +397 -0
- oscura/comparison/golden.py +487 -0
- oscura/comparison/limits.py +391 -0
- oscura/comparison/mask.py +434 -0
- oscura/comparison/trace_diff.py +30 -0
- oscura/comparison/visualization.py +481 -0
- oscura/compliance/__init__.py +70 -0
- oscura/compliance/advanced.py +756 -0
- oscura/compliance/masks.py +363 -0
- oscura/compliance/reporting.py +483 -0
- oscura/compliance/testing.py +298 -0
- oscura/component/__init__.py +38 -0
- oscura/component/impedance.py +365 -0
- oscura/component/reactive.py +598 -0
- oscura/component/transmission_line.py +312 -0
- oscura/config/__init__.py +191 -0
- oscura/config/defaults.py +254 -0
- oscura/config/loader.py +348 -0
- oscura/config/memory.py +271 -0
- oscura/config/migration.py +458 -0
- oscura/config/pipeline.py +1077 -0
- oscura/config/preferences.py +530 -0
- oscura/config/protocol.py +875 -0
- oscura/config/schema.py +713 -0
- oscura/config/settings.py +420 -0
- oscura/config/thresholds.py +599 -0
- oscura/convenience.py +457 -0
- oscura/core/__init__.py +299 -0
- oscura/core/audit.py +457 -0
- oscura/core/backend_selector.py +405 -0
- oscura/core/cache.py +590 -0
- oscura/core/cancellation.py +439 -0
- oscura/core/confidence.py +225 -0
- oscura/core/config.py +506 -0
- oscura/core/correlation.py +216 -0
- oscura/core/cross_domain.py +422 -0
- oscura/core/debug.py +301 -0
- oscura/core/edge_cases.py +541 -0
- oscura/core/exceptions.py +535 -0
- oscura/core/gpu_backend.py +523 -0
- oscura/core/lazy.py +832 -0
- oscura/core/log_query.py +540 -0
- oscura/core/logging.py +931 -0
- oscura/core/logging_advanced.py +952 -0
- oscura/core/memoize.py +171 -0
- oscura/core/memory_check.py +274 -0
- oscura/core/memory_guard.py +290 -0
- oscura/core/memory_limits.py +336 -0
- oscura/core/memory_monitor.py +453 -0
- oscura/core/memory_progress.py +465 -0
- oscura/core/memory_warnings.py +315 -0
- oscura/core/numba_backend.py +362 -0
- oscura/core/performance.py +352 -0
- oscura/core/progress.py +524 -0
- oscura/core/provenance.py +358 -0
- oscura/core/results.py +331 -0
- oscura/core/types.py +504 -0
- oscura/core/uncertainty.py +383 -0
- oscura/discovery/__init__.py +52 -0
- oscura/discovery/anomaly_detector.py +672 -0
- oscura/discovery/auto_decoder.py +415 -0
- oscura/discovery/comparison.py +497 -0
- oscura/discovery/quality_validator.py +528 -0
- oscura/discovery/signal_detector.py +769 -0
- oscura/dsl/__init__.py +73 -0
- oscura/dsl/commands.py +246 -0
- oscura/dsl/interpreter.py +455 -0
- oscura/dsl/parser.py +689 -0
- oscura/dsl/repl.py +172 -0
- oscura/exceptions.py +59 -0
- oscura/exploratory/__init__.py +111 -0
- oscura/exploratory/error_recovery.py +642 -0
- oscura/exploratory/fuzzy.py +513 -0
- oscura/exploratory/fuzzy_advanced.py +786 -0
- oscura/exploratory/legacy.py +831 -0
- oscura/exploratory/parse.py +358 -0
- oscura/exploratory/recovery.py +275 -0
- oscura/exploratory/sync.py +382 -0
- oscura/exploratory/unknown.py +707 -0
- oscura/export/__init__.py +25 -0
- oscura/export/wireshark/README.md +265 -0
- oscura/export/wireshark/__init__.py +47 -0
- oscura/export/wireshark/generator.py +312 -0
- oscura/export/wireshark/lua_builder.py +159 -0
- oscura/export/wireshark/templates/dissector.lua.j2 +92 -0
- oscura/export/wireshark/type_mapping.py +165 -0
- oscura/export/wireshark/validator.py +105 -0
- oscura/exporters/__init__.py +94 -0
- oscura/exporters/csv.py +303 -0
- oscura/exporters/exporters.py +44 -0
- oscura/exporters/hdf5.py +219 -0
- oscura/exporters/html_export.py +701 -0
- oscura/exporters/json_export.py +291 -0
- oscura/exporters/markdown_export.py +367 -0
- oscura/exporters/matlab_export.py +354 -0
- oscura/exporters/npz_export.py +219 -0
- oscura/exporters/spice_export.py +210 -0
- oscura/extensibility/__init__.py +131 -0
- oscura/extensibility/docs.py +752 -0
- oscura/extensibility/extensions.py +1125 -0
- oscura/extensibility/logging.py +259 -0
- oscura/extensibility/measurements.py +485 -0
- oscura/extensibility/plugins.py +414 -0
- oscura/extensibility/registry.py +346 -0
- oscura/extensibility/templates.py +913 -0
- oscura/extensibility/validation.py +651 -0
- oscura/filtering/__init__.py +89 -0
- oscura/filtering/base.py +563 -0
- oscura/filtering/convenience.py +564 -0
- oscura/filtering/design.py +725 -0
- oscura/filtering/filters.py +32 -0
- oscura/filtering/introspection.py +605 -0
- oscura/guidance/__init__.py +24 -0
- oscura/guidance/recommender.py +429 -0
- oscura/guidance/wizard.py +518 -0
- oscura/inference/__init__.py +251 -0
- oscura/inference/active_learning/README.md +153 -0
- oscura/inference/active_learning/__init__.py +38 -0
- oscura/inference/active_learning/lstar.py +257 -0
- oscura/inference/active_learning/observation_table.py +230 -0
- oscura/inference/active_learning/oracle.py +78 -0
- oscura/inference/active_learning/teachers/__init__.py +15 -0
- oscura/inference/active_learning/teachers/simulator.py +192 -0
- oscura/inference/adaptive_tuning.py +453 -0
- oscura/inference/alignment.py +653 -0
- oscura/inference/bayesian.py +943 -0
- oscura/inference/binary.py +1016 -0
- oscura/inference/crc_reverse.py +711 -0
- oscura/inference/logic.py +288 -0
- oscura/inference/message_format.py +1305 -0
- oscura/inference/protocol.py +417 -0
- oscura/inference/protocol_dsl.py +1084 -0
- oscura/inference/protocol_library.py +1230 -0
- oscura/inference/sequences.py +809 -0
- oscura/inference/signal_intelligence.py +1509 -0
- oscura/inference/spectral.py +215 -0
- oscura/inference/state_machine.py +634 -0
- oscura/inference/stream.py +918 -0
- oscura/integrations/__init__.py +59 -0
- oscura/integrations/llm.py +1827 -0
- oscura/jupyter/__init__.py +32 -0
- oscura/jupyter/display.py +268 -0
- oscura/jupyter/magic.py +334 -0
- oscura/loaders/__init__.py +526 -0
- oscura/loaders/binary.py +69 -0
- oscura/loaders/configurable.py +1255 -0
- oscura/loaders/csv.py +26 -0
- oscura/loaders/csv_loader.py +473 -0
- oscura/loaders/hdf5.py +9 -0
- oscura/loaders/hdf5_loader.py +510 -0
- oscura/loaders/lazy.py +370 -0
- oscura/loaders/mmap_loader.py +583 -0
- oscura/loaders/numpy_loader.py +436 -0
- oscura/loaders/pcap.py +432 -0
- oscura/loaders/preprocessing.py +368 -0
- oscura/loaders/rigol.py +287 -0
- oscura/loaders/sigrok.py +321 -0
- oscura/loaders/tdms.py +367 -0
- oscura/loaders/tektronix.py +711 -0
- oscura/loaders/validation.py +584 -0
- oscura/loaders/vcd.py +464 -0
- oscura/loaders/wav.py +233 -0
- oscura/math/__init__.py +45 -0
- oscura/math/arithmetic.py +824 -0
- oscura/math/interpolation.py +413 -0
- oscura/onboarding/__init__.py +39 -0
- oscura/onboarding/help.py +498 -0
- oscura/onboarding/tutorials.py +405 -0
- oscura/onboarding/wizard.py +466 -0
- oscura/optimization/__init__.py +19 -0
- oscura/optimization/parallel.py +440 -0
- oscura/optimization/search.py +532 -0
- oscura/pipeline/__init__.py +43 -0
- oscura/pipeline/base.py +338 -0
- oscura/pipeline/composition.py +242 -0
- oscura/pipeline/parallel.py +448 -0
- oscura/pipeline/pipeline.py +375 -0
- oscura/pipeline/reverse_engineering.py +1119 -0
- oscura/plugins/__init__.py +122 -0
- oscura/plugins/base.py +272 -0
- oscura/plugins/cli.py +497 -0
- oscura/plugins/discovery.py +411 -0
- oscura/plugins/isolation.py +418 -0
- oscura/plugins/lifecycle.py +959 -0
- oscura/plugins/manager.py +493 -0
- oscura/plugins/registry.py +421 -0
- oscura/plugins/versioning.py +372 -0
- oscura/py.typed +0 -0
- oscura/quality/__init__.py +65 -0
- oscura/quality/ensemble.py +740 -0
- oscura/quality/explainer.py +338 -0
- oscura/quality/scoring.py +616 -0
- oscura/quality/warnings.py +456 -0
- oscura/reporting/__init__.py +248 -0
- oscura/reporting/advanced.py +1234 -0
- oscura/reporting/analyze.py +448 -0
- oscura/reporting/argument_preparer.py +596 -0
- oscura/reporting/auto_report.py +507 -0
- oscura/reporting/batch.py +615 -0
- oscura/reporting/chart_selection.py +223 -0
- oscura/reporting/comparison.py +330 -0
- oscura/reporting/config.py +615 -0
- oscura/reporting/content/__init__.py +39 -0
- oscura/reporting/content/executive.py +127 -0
- oscura/reporting/content/filtering.py +191 -0
- oscura/reporting/content/minimal.py +257 -0
- oscura/reporting/content/verbosity.py +162 -0
- oscura/reporting/core.py +508 -0
- oscura/reporting/core_formats/__init__.py +17 -0
- oscura/reporting/core_formats/multi_format.py +210 -0
- oscura/reporting/engine.py +836 -0
- oscura/reporting/export.py +366 -0
- oscura/reporting/formatting/__init__.py +129 -0
- oscura/reporting/formatting/emphasis.py +81 -0
- oscura/reporting/formatting/numbers.py +403 -0
- oscura/reporting/formatting/standards.py +55 -0
- oscura/reporting/formatting.py +466 -0
- oscura/reporting/html.py +578 -0
- oscura/reporting/index.py +590 -0
- oscura/reporting/multichannel.py +296 -0
- oscura/reporting/output.py +379 -0
- oscura/reporting/pdf.py +373 -0
- oscura/reporting/plots.py +731 -0
- oscura/reporting/pptx_export.py +360 -0
- oscura/reporting/renderers/__init__.py +11 -0
- oscura/reporting/renderers/pdf.py +94 -0
- oscura/reporting/sections.py +471 -0
- oscura/reporting/standards.py +680 -0
- oscura/reporting/summary_generator.py +368 -0
- oscura/reporting/tables.py +397 -0
- oscura/reporting/template_system.py +724 -0
- oscura/reporting/templates/__init__.py +15 -0
- oscura/reporting/templates/definition.py +205 -0
- oscura/reporting/templates/index.html +649 -0
- oscura/reporting/templates/index.md +173 -0
- oscura/schemas/__init__.py +158 -0
- oscura/schemas/bus_configuration.json +322 -0
- oscura/schemas/device_mapping.json +182 -0
- oscura/schemas/packet_format.json +418 -0
- oscura/schemas/protocol_definition.json +363 -0
- oscura/search/__init__.py +16 -0
- oscura/search/anomaly.py +292 -0
- oscura/search/context.py +149 -0
- oscura/search/pattern.py +160 -0
- oscura/session/__init__.py +34 -0
- oscura/session/annotations.py +289 -0
- oscura/session/history.py +313 -0
- oscura/session/session.py +445 -0
- oscura/streaming/__init__.py +43 -0
- oscura/streaming/chunked.py +611 -0
- oscura/streaming/progressive.py +393 -0
- oscura/streaming/realtime.py +622 -0
- oscura/testing/__init__.py +54 -0
- oscura/testing/synthetic.py +808 -0
- oscura/triggering/__init__.py +68 -0
- oscura/triggering/base.py +229 -0
- oscura/triggering/edge.py +353 -0
- oscura/triggering/pattern.py +344 -0
- oscura/triggering/pulse.py +581 -0
- oscura/triggering/window.py +453 -0
- oscura/ui/__init__.py +48 -0
- oscura/ui/formatters.py +526 -0
- oscura/ui/progressive_display.py +340 -0
- oscura/utils/__init__.py +99 -0
- oscura/utils/autodetect.py +338 -0
- oscura/utils/buffer.py +389 -0
- oscura/utils/lazy.py +407 -0
- oscura/utils/lazy_imports.py +147 -0
- oscura/utils/memory.py +836 -0
- oscura/utils/memory_advanced.py +1326 -0
- oscura/utils/memory_extensions.py +465 -0
- oscura/utils/progressive.py +352 -0
- oscura/utils/windowing.py +362 -0
- oscura/visualization/__init__.py +321 -0
- oscura/visualization/accessibility.py +526 -0
- oscura/visualization/annotations.py +374 -0
- oscura/visualization/axis_scaling.py +305 -0
- oscura/visualization/colors.py +453 -0
- oscura/visualization/digital.py +337 -0
- oscura/visualization/eye.py +420 -0
- oscura/visualization/histogram.py +281 -0
- oscura/visualization/interactive.py +858 -0
- oscura/visualization/jitter.py +702 -0
- oscura/visualization/keyboard.py +394 -0
- oscura/visualization/layout.py +365 -0
- oscura/visualization/optimization.py +1028 -0
- oscura/visualization/palettes.py +446 -0
- oscura/visualization/plot.py +92 -0
- oscura/visualization/power.py +290 -0
- oscura/visualization/power_extended.py +626 -0
- oscura/visualization/presets.py +467 -0
- oscura/visualization/protocols.py +932 -0
- oscura/visualization/render.py +207 -0
- oscura/visualization/rendering.py +444 -0
- oscura/visualization/reverse_engineering.py +791 -0
- oscura/visualization/signal_integrity.py +808 -0
- oscura/visualization/specialized.py +553 -0
- oscura/visualization/spectral.py +811 -0
- oscura/visualization/styles.py +381 -0
- oscura/visualization/thumbnails.py +311 -0
- oscura/visualization/time_axis.py +351 -0
- oscura/visualization/waveform.py +367 -0
- oscura/workflow/__init__.py +13 -0
- oscura/workflow/dag.py +377 -0
- oscura/workflows/__init__.py +58 -0
- oscura/workflows/compliance.py +280 -0
- oscura/workflows/digital.py +272 -0
- oscura/workflows/multi_trace.py +502 -0
- oscura/workflows/power.py +178 -0
- oscura/workflows/protocol.py +492 -0
- oscura/workflows/reverse_engineering.py +639 -0
- oscura/workflows/signal_integrity.py +227 -0
- oscura-0.1.0.dist-info/METADATA +300 -0
- oscura-0.1.0.dist-info/RECORD +463 -0
- oscura-0.1.0.dist-info/entry_points.txt +2 -0
- {oscura-0.0.1.dist-info → oscura-0.1.0.dist-info}/licenses/LICENSE +1 -1
- oscura-0.0.1.dist-info/METADATA +0 -63
- oscura-0.0.1.dist-info/RECORD +0 -5
- {oscura-0.0.1.dist-info → oscura-0.1.0.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,778 @@
|
|
|
1
|
+
"""Statistical data type classification.
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
This module provides tools for classifying binary data regions as text,
|
|
5
|
+
binary, compressed, encrypted, or padding using multiple statistical tests
|
|
6
|
+
and heuristics.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from dataclasses import dataclass, field
|
|
10
|
+
from typing import Any, Literal, Union
|
|
11
|
+
|
|
12
|
+
import numpy as np
|
|
13
|
+
|
|
14
|
+
from .entropy import shannon_entropy
|
|
15
|
+
|
|
16
|
+
# Type alias for accepted input forms: raw bytes, a mutable bytearray, or a
# numpy array (uint8 arrays are used directly; other dtypes via raw buffer).
DataType = Union[bytes, bytearray, "np.ndarray[Any, Any]"]

# Magic-byte prefixes of common compression container formats, mapped to a
# short format name.  Matched with a simple prefix comparison against the
# start of a buffer.
COMPRESSION_SIGNATURES = {
    b"\x1f\x8b": "gzip",
    # Full bzip2 magic is "BZh" (0x42 0x5A 0x68); the bare prefix b"BZ" would
    # also match ordinary text that happens to start with "BZ".
    b"BZh": "bzip2",
    b"\x50\x4b\x03\x04": "zip",  # local file header
    b"\x50\x4b\x05\x06": "zip",  # end of central directory (empty archive)
    b"\x50\x4b\x07\x08": "zip",  # spanned archive
    b"\xfd7zXZ\x00": "xz",
    b"\x28\xb5\x2f\xfd": "zstd",
    b"\x04\x22\x4d\x18": "lz4",
}

# Magic-byte prefixes of common executable/binary formats.
BINARY_SIGNATURES = {
    b"\x7fELF": "elf",  # ELF executable
    b"MZ": "pe",  # Windows PE/DOS executable
    b"\xca\xfe\xba\xbe": "macho_fat",  # Mach-O fat binary
    b"\xfe\xed\xfa\xce": "macho_32",  # Mach-O 32-bit
    b"\xfe\xed\xfa\xcf": "macho_64",  # Mach-O 64-bit
    b"\xcf\xfa\xed\xfe": "macho_64_le",  # Mach-O 64-bit little endian
    b"\xce\xfa\xed\xfe": "macho_32_le",  # Mach-O 32-bit little endian
}
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@dataclass
class ClassificationResult:
    """Outcome of classifying a block of bytes.

    Attributes:
        primary_type: Best-guess category for the data.
        confidence: Confidence in the classification, in [0, 1].
        entropy: Shannon entropy of the data.
        printable_ratio: Fraction of bytes that are printable ASCII.
        null_ratio: Fraction of bytes equal to 0x00.
        byte_variance: Statistical variance of the raw byte values.
        details: Extra metadata explaining how the decision was reached.
    """

    primary_type: Literal["text", "binary", "compressed", "encrypted", "padding", "mixed"]
    confidence: float
    entropy: float
    printable_ratio: float
    null_ratio: float
    byte_variance: float
    details: dict[str, Any] = field(default_factory=dict)

    @property
    def data_type(self) -> str:
        """Backward-compatible alias for ``primary_type`` (kept for tests)."""
        return self.primary_type
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
@dataclass
class RegionClassification:
    """Classification of a contiguous region within a larger buffer.

    Produced by the region-scanning helpers (e.g. text/encrypted region
    detection) to tie a byte range to its classification result.

    Attributes:
        start: Start offset in bytes (inclusive).
        end: End offset in bytes (exclusive).
        length: Region length in bytes (``end - start``).
        classification: Classification result for this region's bytes.
    """

    # Byte range [start, end) within the scanned buffer.
    start: int
    end: int
    # Stored separately from start/end for caller convenience.
    length: int
    classification: ClassificationResult
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def classify_data_type(data: DataType) -> ClassificationResult:
    """Classify binary data type using multiple heuristics.

    Combines entropy analysis, printable-character ratio, byte-value
    variance, and magic-byte signature detection.

    Decision order:
        1. Padding/null (null_ratio > 0.9)
        2. Executable/binary signatures (ELF, PE, Mach-O)
        3. Compression signatures (gzip, zip, xz, ...)
        4. Text (printable_ratio > 0.75 and entropy < 6.5)
        5. Encrypted/random (entropy > 7.5 and byte_variance > 5000)
        6. Compressed by entropy range (6.5 <= entropy <= 7.5)
        7. Fallback: generic binary/structured

    Args:
        data: Input data as bytes, bytearray, or numpy array

    Returns:
        ClassificationResult with type and confidence

    Raises:
        ValueError: If data is empty

    Example:
        >>> result = classify_data_type(b'Hello, World!')
        >>> result.primary_type
        'text'
    """
    if isinstance(data, np.ndarray):
        # uint8 arrays map directly to bytes; other dtypes use their raw buffer.
        data = data.tobytes() if data.dtype == np.uint8 else bytes(data.flatten())

    if not data:
        raise ValueError("Cannot classify empty data")

    # Statistics shared by every classification branch below.
    entropy_val = shannon_entropy(data)
    # Printable ASCII: 0x20-0x7E plus tab, newline, carriage return.
    printable_ratio = sum(1 for b in data if 32 <= b <= 126 or b in (9, 10, 13)) / len(data)
    null_ratio = data.count(0) / len(data)
    byte_variance = float(np.var(np.frombuffer(data, dtype=np.uint8)))

    def _result(
        primary_type: Literal["text", "binary", "compressed", "encrypted", "padding", "mixed"],
        confidence: float,
        details: dict[str, Any],
    ) -> ClassificationResult:
        # Every branch returns the same shared statistics; only the type,
        # confidence, and details vary, so build the result in one place.
        return ClassificationResult(
            primary_type=primary_type,
            confidence=confidence,
            entropy=entropy_val,
            printable_ratio=printable_ratio,
            null_ratio=null_ratio,
            byte_variance=byte_variance,
            details=details,
        )

    # 1. Padding/null regions.
    if null_ratio > 0.9:
        return _result("padding", min(1.0, null_ratio), {"reason": "high_null_ratio"})

    # 2. Executable/binary signatures (checked BEFORE compression and entropy
    #    tests, since executables can contain high-entropy sections).
    for sig, bin_type in BINARY_SIGNATURES.items():
        if data[: len(sig)] == sig:
            return _result("binary", 0.95, {"binary_type": bin_type})

    # 3. Compression signatures.
    for sig, comp_type in COMPRESSION_SIGNATURES.items():
        if data[: len(sig)] == sig:
            return _result("compressed", 0.95, {"compression_type": comp_type})

    # 4. Text: checked BEFORE entropy-based classification so dense text is
    #    not mistaken for compressed data.
    if printable_ratio > 0.75 and entropy_val < 6.5:
        return _result("text", min(1.0, printable_ratio * 0.95), {"reason": "high_printable_ratio"})

    # 5. Encrypted/random: high entropy with high variance suggests no structure.
    if entropy_val > 7.5 and byte_variance > 5000:
        confidence = min(1.0, (entropy_val - 7.5) / 0.5 + 0.7)
        return _result("encrypted", confidence, {"reason": "high_entropy_and_variance"})

    # 6. Compressed: high entropy but below the encrypted threshold.
    if 6.5 <= entropy_val <= 7.5:
        return _result("compressed", 0.7, {"reason": "compression_entropy_range"})

    # 7. Default to binary/structured.
    return _result("binary", 0.6, {"reason": "default_binary"})
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def detect_text_regions(
    data: DataType, min_length: int = 8, min_printable: float = 0.8
) -> list[RegionClassification]:
    """Detect ASCII/UTF-8 text regions.

    Scans for contiguous regions with high printable character ratio using a
    sliding window of ``min_length`` bytes.

    Args:
        data: Input data as bytes, bytearray, or numpy array
        min_length: Minimum region length in bytes (default: 8)
        min_printable: Minimum printable ratio to consider text (default: 0.8)

    Returns:
        List of detected text regions

    Example:
        >>> data = b'\\x00' * 100 + b'Hello World' + b'\\x00' * 100
        >>> regions = detect_text_regions(data)
        >>> len(regions) > 0
        True
    """
    if isinstance(data, np.ndarray):
        data = data.tobytes() if data.dtype == np.uint8 else bytes(data.flatten())

    def _printable_count(window: DataType) -> int:
        # Printable ASCII (0x20-0x7E) plus tab, newline, carriage return.
        return sum(1 for b in window if 32 <= b <= 126 or b in (9, 10, 13))

    regions: list[RegionClassification] = []
    in_region = False
    region_start = 0
    window_size = min_length

    for i in range(len(data)):
        if not in_region:
            # Look for the start of a text region: the trailing window of
            # window_size bytes must be mostly printable.
            if i >= window_size - 1:
                window = data[i - window_size + 1 : i + 1]
                if _printable_count(window) / window_size >= min_printable:
                    in_region = True
                    region_start = i - window_size + 1
        else:
            # In a text region: end it when the sliding window's printable
            # ratio drops below the threshold.
            if i >= region_start + window_size:
                window = data[i - window_size + 1 : i + 1]
                if _printable_count(window) / window_size < min_printable:
                    region_end = i - window_size + 1
                    region_data = data[region_start:region_end]
                    if len(region_data) >= min_length:
                        regions.append(
                            RegionClassification(
                                start=region_start,
                                end=region_end,
                                length=len(region_data),
                                classification=classify_data_type(region_data),
                            )
                        )
                    in_region = False

    # Handle a region extending to the end of the data.
    if in_region:
        region_data = data[region_start:]
        if len(region_data) >= min_length:
            regions.append(
                RegionClassification(
                    start=region_start,
                    end=len(data),
                    length=len(region_data),
                    classification=classify_data_type(region_data),
                )
            )

    return regions
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
def detect_encrypted_regions(
    data: DataType, min_length: int = 64, min_entropy: float = 7.5
) -> list[RegionClassification]:
    """Detect potentially encrypted regions (high entropy, no structure).

    : Statistical Data Type Classification

    Identifies regions with very high entropy and uniform byte distribution,
    characteristic of encrypted or cryptographically random data.

    Args:
        data: Input data as bytes, bytearray, or numpy array
        min_length: Minimum region length in bytes (default: 64)
        min_entropy: Minimum entropy threshold in bits/byte (default: 7.5)

    Returns:
        List of detected encrypted regions

    Example:
        >>> import os
        >>> random_data = os.urandom(100)
        >>> regions = detect_encrypted_regions(random_data)
        >>> len(regions) >= 0
        True
    """
    if isinstance(data, np.ndarray):
        data = data.tobytes() if data.dtype == np.uint8 else bytes(data.flatten())

    if len(data) < min_length:
        return []

    regions = []
    window_size = min_length
    # Clamp to >= 1: window_size // 4 is 0 for min_length < 4, which would
    # make the scan loop below spin forever without advancing.
    step = max(1, window_size // 4)

    i = 0
    # Use <= so an input of exactly window_size bytes is still scanned
    # (a strict < skipped the final window even though the length guard
    # above already accepted the data).
    while i <= len(data) - window_size:
        window = data[i : i + window_size]
        entropy_val = shannon_entropy(window)

        if entropy_val >= min_entropy:
            # Found potential encrypted region, extend it
            region_start = i
            region_end = i + window_size

            # Extend forward in `step` increments while a full lookahead
            # window keeps its entropy above the threshold.
            while region_end < len(data):
                next_window = data[region_end : region_end + window_size]
                if len(next_window) < window_size:
                    break
                if shannon_entropy(next_window) >= min_entropy:
                    region_end += step
                else:
                    break

            # Create region
            region_data = data[region_start:region_end]
            classification = classify_data_type(region_data)
            regions.append(
                RegionClassification(
                    start=region_start,
                    end=region_end,
                    length=len(region_data),
                    classification=classification,
                )
            )

            # Resume scanning past the region just emitted.
            i = region_end
        else:
            i += step

    return regions
|
396
|
+
def detect_compressed_regions(data: DataType, min_length: int = 64) -> list[RegionClassification]:
    """Detect compressed data regions (signatures + high entropy).

    : Statistical Data Type Classification

    Scans for known compression format signatures and extends each hit
    while the following data keeps a compression-like entropy profile.

    Args:
        data: Input data as bytes, bytearray, or numpy array
        min_length: Minimum region length in bytes (default: 64)

    Returns:
        List of detected compressed regions

    Example:
        >>> import gzip
        >>> compressed = gzip.compress(b'Hello World' * 100)
        >>> regions = detect_compressed_regions(compressed)
        >>> len(regions) > 0
        True
    """
    if isinstance(data, np.ndarray):
        data = data.tobytes() if data.dtype == np.uint8 else bytes(data.flatten())

    found: list[RegionClassification] = []
    lookahead_size = 256  # chunk used to grow a region past its signature

    for signature, fmt_name in COMPRESSION_SIGNATURES.items():
        cursor = 0
        while (hit := data.find(signature, cursor)) != -1:
            # Heuristic sizing: we do not parse the container format, so
            # start with a minimal span and grow it by entropy.
            start = hit
            end = min(hit + min_length, len(data))

            # Grow while each full lookahead chunk still looks compressed
            # (entropy >= 6.0 bits/byte); stop at a short tail chunk.
            while end < len(data):
                chunk = data[end : end + lookahead_size]
                if len(chunk) < lookahead_size or shannon_entropy(chunk) < 6.0:
                    break
                end += lookahead_size

            if end - start >= min_length:
                info = classify_data_type(data[start:end])
                info.details["compression_signature"] = fmt_name
                found.append(
                    RegionClassification(
                        start=start,
                        end=end,
                        length=end - start,
                        classification=info,
                    )
                )

            # Continue the search after the region just examined.
            cursor = end

    return found
|
467
|
+
def _padding_classification(padding_byte: int) -> ClassificationResult:
    """Build the fixed ClassificationResult used for a uniform padding run."""
    return ClassificationResult(
        primary_type="padding",
        confidence=1.0,
        entropy=0.0,
        printable_ratio=0.0,
        null_ratio=1.0 if padding_byte == 0 else 0.0,
        byte_variance=0.0,
        details={"padding_byte": f"0x{padding_byte:02X}"},
    )


def detect_padding_regions(data: DataType, min_length: int = 4) -> list[RegionClassification]:
    """Detect padding/null regions.

    : Statistical Data Type Classification

    Identifies contiguous runs of 0x00 or 0xFF bytes used as padding.

    Args:
        data: Input data as bytes, bytearray, or numpy array
        min_length: Minimum region length in bytes (default: 4)

    Returns:
        List of detected padding regions

    Example:
        >>> data = b'DATA' + b'\\x00' * 100 + b'DATA'
        >>> regions = detect_padding_regions(data)
        >>> len(regions) > 0
        True
    """
    if isinstance(data, np.ndarray):
        data = data.tobytes() if data.dtype == np.uint8 else bytes(data.flatten())

    regions = []
    in_padding = False
    padding_start = 0
    padding_byte = None

    for i, byte in enumerate(data):
        if not in_padding:
            # Check if this could be start of padding
            if byte == 0 or byte == 0xFF:
                in_padding = True
                padding_start = i
                padding_byte = byte
        else:
            # In padding region
            if byte != padding_byte:
                # End of current run
                length = i - padding_start
                if length >= min_length:
                    regions.append(
                        RegionClassification(
                            start=padding_start,
                            end=i,
                            length=length,
                            classification=_padding_classification(padding_byte),
                        )
                    )
                # Fix: the byte that ended this run may itself start a new
                # padding run (e.g. a 0x00 run immediately followed by a
                # 0xFF run). Previously this byte was discarded, shifting
                # the next run's start by one and dropping runs of exactly
                # min_length bytes.
                if byte == 0 or byte == 0xFF:
                    padding_start = i
                    padding_byte = byte
                else:
                    in_padding = False

    # Handle padding extending to end
    if in_padding:
        length = len(data) - padding_start
        if length >= min_length:
            regions.append(
                RegionClassification(
                    start=padding_start,
                    end=len(data),
                    length=length,
                    classification=_padding_classification(padding_byte),
                )
            )

    return regions
|
548
|
+
def segment_by_type(data: DataType, min_segment: int = 32) -> list[RegionClassification]:
    """Segment data into regions by type.

    : Statistical Data Type Classification

    Divides data into homogeneous regions using a sliding window approach
    and entropy-based segmentation.

    Args:
        data: Input data as bytes, bytearray, or numpy array
        min_segment: Minimum segment size in bytes (default: 32)

    Returns:
        List of classified regions covering the entire input

    Example:
        >>> data = b'Hello' + b'\\x00' * 50 + bytes(range(256))
        >>> segments = segment_by_type(data)
        >>> len(segments) >= 1
        True
    """
    if isinstance(data, np.ndarray):
        data = data.tobytes() if data.dtype == np.uint8 else bytes(data.flatten())

    if len(data) < min_segment:
        # Single segment: input is too short to subdivide further.
        classification = classify_data_type(data)
        return [
            RegionClassification(
                start=0, end=len(data), length=len(data), classification=classification
            )
        ]

    segments = []
    window_size = min_segment
    # Windows overlap by half so a type change inside a window is caught
    # on the next step rather than missed between disjoint windows.
    step = window_size // 2

    current_type = None
    segment_start = 0

    i = 0
    while i < len(data):
        window_end = min(i + window_size, len(data))
        window = data[i:window_end]

        if len(window) < min_segment and i > 0:
            # Last small fragment, merge with previous segment
            break

        # Classify the current window; a change in primary_type marks a
        # candidate segment boundary at this window's start.
        classification = classify_data_type(window)
        detected_type = classification.primary_type

        if current_type is None:
            current_type = detected_type
            segment_start = i
        elif detected_type != current_type:
            # Type changed, finalize previous segment.
            # NOTE(review): if the previous segment is shorter than
            # min_segment it is silently dropped while segment_start still
            # advances, so the returned list can have gaps — this appears
            # to conflict with the docstring's "covering the entire input";
            # confirm intended behavior before changing.
            segment_data = data[segment_start:i]
            if len(segment_data) >= min_segment:
                # Re-classify the whole segment (not just the last window)
                # so the stored classification reflects all of its bytes.
                seg_classification = classify_data_type(segment_data)
                segments.append(
                    RegionClassification(
                        start=segment_start,
                        end=i,
                        length=len(segment_data),
                        classification=seg_classification,
                    )
                )
            current_type = detected_type
            segment_start = i

        i += step

    # Finalize last segment (also absorbs any short tail fragment skipped
    # by the break above).
    segment_data = data[segment_start:]
    if len(segment_data) > 0:
        seg_classification = classify_data_type(segment_data)
        segments.append(
            RegionClassification(
                start=segment_start,
                end=len(data),
                length=len(segment_data),
                classification=seg_classification,
            )
        )

    return segments
|
637
|
+
class DataClassifier:
    """Object-oriented wrapper for data type classification.

    Provides a class-based interface for data classification operations,
    wrapping the functional API for consistency with test expectations.

    Example:
        >>> classifier = DataClassifier()
        >>> data_type = classifier.classify(b'Hello, World!')
        >>> data_type
        'text'
    """

    def __init__(self, min_segment_size: int = 32):
        """Initialize data classifier.

        Args:
            min_segment_size: Minimum segment size for region detection.
        """
        self.min_segment_size = min_segment_size

    def classify(self, data: DataType) -> str:
        """Classify binary data type.

        Returns the primary type as a string for test compatibility.

        Args:
            data: Input data as bytes, bytearray, or numpy array.

        Returns:
            String data type classification ('text', 'binary', 'compressed',
            'encrypted', 'padding', or 'mixed').

        Example:
            >>> classifier = DataClassifier()
            >>> classifier.classify(b'Hello')
            'text'
        """
        result = classify_data_type(data)
        return result.primary_type

    def classify_detailed(self, data: DataType) -> ClassificationResult:
        """Classify binary data type with full details.

        Args:
            data: Input data as bytes, bytearray, or numpy array.

        Returns:
            ClassificationResult with type, confidence, and metadata.

        Example:
            >>> classifier = DataClassifier()
            >>> result = classifier.classify_detailed(b'Hello')
            >>> result.primary_type == 'text'
            True
        """
        # Fixed docstring example above: the result field is `primary_type`
        # (as used by classify()), not `data_type`.
        return classify_data_type(data)

    def detect_text_regions(
        self, data: DataType, min_length: int = 8, min_printable: float = 0.8
    ) -> list[RegionClassification]:
        """Detect text regions in data.

        Args:
            data: Input data.
            min_length: Minimum region length.
            min_printable: Minimum printable ratio.

        Returns:
            List of text region classifications.
        """
        return detect_text_regions(data, min_length, min_printable)

    def detect_encrypted_regions(
        self, data: DataType, min_length: int = 64, min_entropy: float = 7.5
    ) -> list[RegionClassification]:
        """Detect encrypted regions in data.

        Args:
            data: Input data.
            min_length: Minimum region length.
            min_entropy: Minimum entropy threshold.

        Returns:
            List of encrypted region classifications.
        """
        return detect_encrypted_regions(data, min_length, min_entropy)

    def detect_compressed_regions(
        self, data: DataType, min_length: int = 64
    ) -> list[RegionClassification]:
        """Detect compressed regions in data.

        Args:
            data: Input data.
            min_length: Minimum region length.

        Returns:
            List of compressed region classifications.
        """
        return detect_compressed_regions(data, min_length)

    def detect_padding_regions(
        self, data: DataType, min_length: int = 4
    ) -> list[RegionClassification]:
        """Detect padding regions in data.

        Args:
            data: Input data.
            min_length: Minimum region length.

        Returns:
            List of padding region classifications.
        """
        return detect_padding_regions(data, min_length)

    def segment(self, data: DataType) -> list[RegionClassification]:
        """Segment data by type.

        Args:
            data: Input data.

        Returns:
            List of classified segments.
        """
        return segment_by_type(data, self.min_segment_size)
|
767
|
+
# Public API of this module: classification result types plus the
# functional and class-based classification entry points.
__all__ = [
    "ClassificationResult",
    "DataClassifier",
    "DataType",
    "RegionClassification",
    "classify_data_type",
    "detect_compressed_regions",
    "detect_encrypted_regions",
    "detect_padding_regions",
    "detect_text_regions",
    "segment_by_type",
]