oscura 0.0.1__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- oscura/__init__.py +813 -8
- oscura/__main__.py +392 -0
- oscura/analyzers/__init__.py +37 -0
- oscura/analyzers/digital/__init__.py +177 -0
- oscura/analyzers/digital/bus.py +691 -0
- oscura/analyzers/digital/clock.py +805 -0
- oscura/analyzers/digital/correlation.py +720 -0
- oscura/analyzers/digital/edges.py +632 -0
- oscura/analyzers/digital/extraction.py +413 -0
- oscura/analyzers/digital/quality.py +878 -0
- oscura/analyzers/digital/signal_quality.py +877 -0
- oscura/analyzers/digital/thresholds.py +708 -0
- oscura/analyzers/digital/timing.py +1104 -0
- oscura/analyzers/eye/__init__.py +46 -0
- oscura/analyzers/eye/diagram.py +434 -0
- oscura/analyzers/eye/metrics.py +555 -0
- oscura/analyzers/jitter/__init__.py +83 -0
- oscura/analyzers/jitter/ber.py +333 -0
- oscura/analyzers/jitter/decomposition.py +759 -0
- oscura/analyzers/jitter/measurements.py +413 -0
- oscura/analyzers/jitter/spectrum.py +220 -0
- oscura/analyzers/measurements.py +40 -0
- oscura/analyzers/packet/__init__.py +171 -0
- oscura/analyzers/packet/daq.py +1077 -0
- oscura/analyzers/packet/metrics.py +437 -0
- oscura/analyzers/packet/parser.py +327 -0
- oscura/analyzers/packet/payload.py +2156 -0
- oscura/analyzers/packet/payload_analysis.py +1312 -0
- oscura/analyzers/packet/payload_extraction.py +236 -0
- oscura/analyzers/packet/payload_patterns.py +670 -0
- oscura/analyzers/packet/stream.py +359 -0
- oscura/analyzers/patterns/__init__.py +266 -0
- oscura/analyzers/patterns/clustering.py +1036 -0
- oscura/analyzers/patterns/discovery.py +539 -0
- oscura/analyzers/patterns/learning.py +797 -0
- oscura/analyzers/patterns/matching.py +1091 -0
- oscura/analyzers/patterns/periodic.py +650 -0
- oscura/analyzers/patterns/sequences.py +767 -0
- oscura/analyzers/power/__init__.py +116 -0
- oscura/analyzers/power/ac_power.py +391 -0
- oscura/analyzers/power/basic.py +383 -0
- oscura/analyzers/power/conduction.py +314 -0
- oscura/analyzers/power/efficiency.py +297 -0
- oscura/analyzers/power/ripple.py +356 -0
- oscura/analyzers/power/soa.py +372 -0
- oscura/analyzers/power/switching.py +479 -0
- oscura/analyzers/protocol/__init__.py +150 -0
- oscura/analyzers/protocols/__init__.py +150 -0
- oscura/analyzers/protocols/base.py +500 -0
- oscura/analyzers/protocols/can.py +620 -0
- oscura/analyzers/protocols/can_fd.py +448 -0
- oscura/analyzers/protocols/flexray.py +405 -0
- oscura/analyzers/protocols/hdlc.py +399 -0
- oscura/analyzers/protocols/i2c.py +368 -0
- oscura/analyzers/protocols/i2s.py +296 -0
- oscura/analyzers/protocols/jtag.py +393 -0
- oscura/analyzers/protocols/lin.py +445 -0
- oscura/analyzers/protocols/manchester.py +333 -0
- oscura/analyzers/protocols/onewire.py +501 -0
- oscura/analyzers/protocols/spi.py +334 -0
- oscura/analyzers/protocols/swd.py +325 -0
- oscura/analyzers/protocols/uart.py +393 -0
- oscura/analyzers/protocols/usb.py +495 -0
- oscura/analyzers/signal_integrity/__init__.py +63 -0
- oscura/analyzers/signal_integrity/embedding.py +294 -0
- oscura/analyzers/signal_integrity/equalization.py +370 -0
- oscura/analyzers/signal_integrity/sparams.py +484 -0
- oscura/analyzers/spectral/__init__.py +53 -0
- oscura/analyzers/spectral/chunked.py +273 -0
- oscura/analyzers/spectral/chunked_fft.py +571 -0
- oscura/analyzers/spectral/chunked_wavelet.py +391 -0
- oscura/analyzers/spectral/fft.py +92 -0
- oscura/analyzers/statistical/__init__.py +250 -0
- oscura/analyzers/statistical/checksum.py +923 -0
- oscura/analyzers/statistical/chunked_corr.py +228 -0
- oscura/analyzers/statistical/classification.py +778 -0
- oscura/analyzers/statistical/entropy.py +1113 -0
- oscura/analyzers/statistical/ngrams.py +614 -0
- oscura/analyzers/statistics/__init__.py +119 -0
- oscura/analyzers/statistics/advanced.py +885 -0
- oscura/analyzers/statistics/basic.py +263 -0
- oscura/analyzers/statistics/correlation.py +630 -0
- oscura/analyzers/statistics/distribution.py +298 -0
- oscura/analyzers/statistics/outliers.py +463 -0
- oscura/analyzers/statistics/streaming.py +93 -0
- oscura/analyzers/statistics/trend.py +520 -0
- oscura/analyzers/validation.py +598 -0
- oscura/analyzers/waveform/__init__.py +36 -0
- oscura/analyzers/waveform/measurements.py +943 -0
- oscura/analyzers/waveform/measurements_with_uncertainty.py +371 -0
- oscura/analyzers/waveform/spectral.py +1689 -0
- oscura/analyzers/waveform/wavelets.py +298 -0
- oscura/api/__init__.py +62 -0
- oscura/api/dsl.py +538 -0
- oscura/api/fluent.py +571 -0
- oscura/api/operators.py +498 -0
- oscura/api/optimization.py +392 -0
- oscura/api/profiling.py +396 -0
- oscura/automotive/__init__.py +73 -0
- oscura/automotive/can/__init__.py +52 -0
- oscura/automotive/can/analysis.py +356 -0
- oscura/automotive/can/checksum.py +250 -0
- oscura/automotive/can/correlation.py +212 -0
- oscura/automotive/can/discovery.py +355 -0
- oscura/automotive/can/message_wrapper.py +375 -0
- oscura/automotive/can/models.py +385 -0
- oscura/automotive/can/patterns.py +381 -0
- oscura/automotive/can/session.py +452 -0
- oscura/automotive/can/state_machine.py +300 -0
- oscura/automotive/can/stimulus_response.py +461 -0
- oscura/automotive/dbc/__init__.py +15 -0
- oscura/automotive/dbc/generator.py +156 -0
- oscura/automotive/dbc/parser.py +146 -0
- oscura/automotive/dtc/__init__.py +30 -0
- oscura/automotive/dtc/database.py +3036 -0
- oscura/automotive/j1939/__init__.py +14 -0
- oscura/automotive/j1939/decoder.py +745 -0
- oscura/automotive/loaders/__init__.py +35 -0
- oscura/automotive/loaders/asc.py +98 -0
- oscura/automotive/loaders/blf.py +77 -0
- oscura/automotive/loaders/csv_can.py +136 -0
- oscura/automotive/loaders/dispatcher.py +136 -0
- oscura/automotive/loaders/mdf.py +331 -0
- oscura/automotive/loaders/pcap.py +132 -0
- oscura/automotive/obd/__init__.py +14 -0
- oscura/automotive/obd/decoder.py +707 -0
- oscura/automotive/uds/__init__.py +48 -0
- oscura/automotive/uds/decoder.py +265 -0
- oscura/automotive/uds/models.py +64 -0
- oscura/automotive/visualization.py +369 -0
- oscura/batch/__init__.py +55 -0
- oscura/batch/advanced.py +627 -0
- oscura/batch/aggregate.py +300 -0
- oscura/batch/analyze.py +139 -0
- oscura/batch/logging.py +487 -0
- oscura/batch/metrics.py +556 -0
- oscura/builders/__init__.py +41 -0
- oscura/builders/signal_builder.py +1131 -0
- oscura/cli/__init__.py +14 -0
- oscura/cli/batch.py +339 -0
- oscura/cli/characterize.py +273 -0
- oscura/cli/compare.py +775 -0
- oscura/cli/decode.py +551 -0
- oscura/cli/main.py +247 -0
- oscura/cli/shell.py +350 -0
- oscura/comparison/__init__.py +66 -0
- oscura/comparison/compare.py +397 -0
- oscura/comparison/golden.py +487 -0
- oscura/comparison/limits.py +391 -0
- oscura/comparison/mask.py +434 -0
- oscura/comparison/trace_diff.py +30 -0
- oscura/comparison/visualization.py +481 -0
- oscura/compliance/__init__.py +70 -0
- oscura/compliance/advanced.py +756 -0
- oscura/compliance/masks.py +363 -0
- oscura/compliance/reporting.py +483 -0
- oscura/compliance/testing.py +298 -0
- oscura/component/__init__.py +38 -0
- oscura/component/impedance.py +365 -0
- oscura/component/reactive.py +598 -0
- oscura/component/transmission_line.py +312 -0
- oscura/config/__init__.py +191 -0
- oscura/config/defaults.py +254 -0
- oscura/config/loader.py +348 -0
- oscura/config/memory.py +271 -0
- oscura/config/migration.py +458 -0
- oscura/config/pipeline.py +1077 -0
- oscura/config/preferences.py +530 -0
- oscura/config/protocol.py +875 -0
- oscura/config/schema.py +713 -0
- oscura/config/settings.py +420 -0
- oscura/config/thresholds.py +599 -0
- oscura/convenience.py +457 -0
- oscura/core/__init__.py +299 -0
- oscura/core/audit.py +457 -0
- oscura/core/backend_selector.py +405 -0
- oscura/core/cache.py +590 -0
- oscura/core/cancellation.py +439 -0
- oscura/core/confidence.py +225 -0
- oscura/core/config.py +506 -0
- oscura/core/correlation.py +216 -0
- oscura/core/cross_domain.py +422 -0
- oscura/core/debug.py +301 -0
- oscura/core/edge_cases.py +541 -0
- oscura/core/exceptions.py +535 -0
- oscura/core/gpu_backend.py +523 -0
- oscura/core/lazy.py +832 -0
- oscura/core/log_query.py +540 -0
- oscura/core/logging.py +931 -0
- oscura/core/logging_advanced.py +952 -0
- oscura/core/memoize.py +171 -0
- oscura/core/memory_check.py +274 -0
- oscura/core/memory_guard.py +290 -0
- oscura/core/memory_limits.py +336 -0
- oscura/core/memory_monitor.py +453 -0
- oscura/core/memory_progress.py +465 -0
- oscura/core/memory_warnings.py +315 -0
- oscura/core/numba_backend.py +362 -0
- oscura/core/performance.py +352 -0
- oscura/core/progress.py +524 -0
- oscura/core/provenance.py +358 -0
- oscura/core/results.py +331 -0
- oscura/core/types.py +504 -0
- oscura/core/uncertainty.py +383 -0
- oscura/discovery/__init__.py +52 -0
- oscura/discovery/anomaly_detector.py +672 -0
- oscura/discovery/auto_decoder.py +415 -0
- oscura/discovery/comparison.py +497 -0
- oscura/discovery/quality_validator.py +528 -0
- oscura/discovery/signal_detector.py +769 -0
- oscura/dsl/__init__.py +73 -0
- oscura/dsl/commands.py +246 -0
- oscura/dsl/interpreter.py +455 -0
- oscura/dsl/parser.py +689 -0
- oscura/dsl/repl.py +172 -0
- oscura/exceptions.py +59 -0
- oscura/exploratory/__init__.py +111 -0
- oscura/exploratory/error_recovery.py +642 -0
- oscura/exploratory/fuzzy.py +513 -0
- oscura/exploratory/fuzzy_advanced.py +786 -0
- oscura/exploratory/legacy.py +831 -0
- oscura/exploratory/parse.py +358 -0
- oscura/exploratory/recovery.py +275 -0
- oscura/exploratory/sync.py +382 -0
- oscura/exploratory/unknown.py +707 -0
- oscura/export/__init__.py +25 -0
- oscura/export/wireshark/README.md +265 -0
- oscura/export/wireshark/__init__.py +47 -0
- oscura/export/wireshark/generator.py +312 -0
- oscura/export/wireshark/lua_builder.py +159 -0
- oscura/export/wireshark/templates/dissector.lua.j2 +92 -0
- oscura/export/wireshark/type_mapping.py +165 -0
- oscura/export/wireshark/validator.py +105 -0
- oscura/exporters/__init__.py +94 -0
- oscura/exporters/csv.py +303 -0
- oscura/exporters/exporters.py +44 -0
- oscura/exporters/hdf5.py +219 -0
- oscura/exporters/html_export.py +701 -0
- oscura/exporters/json_export.py +291 -0
- oscura/exporters/markdown_export.py +367 -0
- oscura/exporters/matlab_export.py +354 -0
- oscura/exporters/npz_export.py +219 -0
- oscura/exporters/spice_export.py +210 -0
- oscura/extensibility/__init__.py +131 -0
- oscura/extensibility/docs.py +752 -0
- oscura/extensibility/extensions.py +1125 -0
- oscura/extensibility/logging.py +259 -0
- oscura/extensibility/measurements.py +485 -0
- oscura/extensibility/plugins.py +414 -0
- oscura/extensibility/registry.py +346 -0
- oscura/extensibility/templates.py +913 -0
- oscura/extensibility/validation.py +651 -0
- oscura/filtering/__init__.py +89 -0
- oscura/filtering/base.py +563 -0
- oscura/filtering/convenience.py +564 -0
- oscura/filtering/design.py +725 -0
- oscura/filtering/filters.py +32 -0
- oscura/filtering/introspection.py +605 -0
- oscura/guidance/__init__.py +24 -0
- oscura/guidance/recommender.py +429 -0
- oscura/guidance/wizard.py +518 -0
- oscura/inference/__init__.py +251 -0
- oscura/inference/active_learning/README.md +153 -0
- oscura/inference/active_learning/__init__.py +38 -0
- oscura/inference/active_learning/lstar.py +257 -0
- oscura/inference/active_learning/observation_table.py +230 -0
- oscura/inference/active_learning/oracle.py +78 -0
- oscura/inference/active_learning/teachers/__init__.py +15 -0
- oscura/inference/active_learning/teachers/simulator.py +192 -0
- oscura/inference/adaptive_tuning.py +453 -0
- oscura/inference/alignment.py +653 -0
- oscura/inference/bayesian.py +943 -0
- oscura/inference/binary.py +1016 -0
- oscura/inference/crc_reverse.py +711 -0
- oscura/inference/logic.py +288 -0
- oscura/inference/message_format.py +1305 -0
- oscura/inference/protocol.py +417 -0
- oscura/inference/protocol_dsl.py +1084 -0
- oscura/inference/protocol_library.py +1230 -0
- oscura/inference/sequences.py +809 -0
- oscura/inference/signal_intelligence.py +1509 -0
- oscura/inference/spectral.py +215 -0
- oscura/inference/state_machine.py +634 -0
- oscura/inference/stream.py +918 -0
- oscura/integrations/__init__.py +59 -0
- oscura/integrations/llm.py +1827 -0
- oscura/jupyter/__init__.py +32 -0
- oscura/jupyter/display.py +268 -0
- oscura/jupyter/magic.py +334 -0
- oscura/loaders/__init__.py +526 -0
- oscura/loaders/binary.py +69 -0
- oscura/loaders/configurable.py +1255 -0
- oscura/loaders/csv.py +26 -0
- oscura/loaders/csv_loader.py +473 -0
- oscura/loaders/hdf5.py +9 -0
- oscura/loaders/hdf5_loader.py +510 -0
- oscura/loaders/lazy.py +370 -0
- oscura/loaders/mmap_loader.py +583 -0
- oscura/loaders/numpy_loader.py +436 -0
- oscura/loaders/pcap.py +432 -0
- oscura/loaders/preprocessing.py +368 -0
- oscura/loaders/rigol.py +287 -0
- oscura/loaders/sigrok.py +321 -0
- oscura/loaders/tdms.py +367 -0
- oscura/loaders/tektronix.py +711 -0
- oscura/loaders/validation.py +584 -0
- oscura/loaders/vcd.py +464 -0
- oscura/loaders/wav.py +233 -0
- oscura/math/__init__.py +45 -0
- oscura/math/arithmetic.py +824 -0
- oscura/math/interpolation.py +413 -0
- oscura/onboarding/__init__.py +39 -0
- oscura/onboarding/help.py +498 -0
- oscura/onboarding/tutorials.py +405 -0
- oscura/onboarding/wizard.py +466 -0
- oscura/optimization/__init__.py +19 -0
- oscura/optimization/parallel.py +440 -0
- oscura/optimization/search.py +532 -0
- oscura/pipeline/__init__.py +43 -0
- oscura/pipeline/base.py +338 -0
- oscura/pipeline/composition.py +242 -0
- oscura/pipeline/parallel.py +448 -0
- oscura/pipeline/pipeline.py +375 -0
- oscura/pipeline/reverse_engineering.py +1119 -0
- oscura/plugins/__init__.py +122 -0
- oscura/plugins/base.py +272 -0
- oscura/plugins/cli.py +497 -0
- oscura/plugins/discovery.py +411 -0
- oscura/plugins/isolation.py +418 -0
- oscura/plugins/lifecycle.py +959 -0
- oscura/plugins/manager.py +493 -0
- oscura/plugins/registry.py +421 -0
- oscura/plugins/versioning.py +372 -0
- oscura/py.typed +0 -0
- oscura/quality/__init__.py +65 -0
- oscura/quality/ensemble.py +740 -0
- oscura/quality/explainer.py +338 -0
- oscura/quality/scoring.py +616 -0
- oscura/quality/warnings.py +456 -0
- oscura/reporting/__init__.py +248 -0
- oscura/reporting/advanced.py +1234 -0
- oscura/reporting/analyze.py +448 -0
- oscura/reporting/argument_preparer.py +596 -0
- oscura/reporting/auto_report.py +507 -0
- oscura/reporting/batch.py +615 -0
- oscura/reporting/chart_selection.py +223 -0
- oscura/reporting/comparison.py +330 -0
- oscura/reporting/config.py +615 -0
- oscura/reporting/content/__init__.py +39 -0
- oscura/reporting/content/executive.py +127 -0
- oscura/reporting/content/filtering.py +191 -0
- oscura/reporting/content/minimal.py +257 -0
- oscura/reporting/content/verbosity.py +162 -0
- oscura/reporting/core.py +508 -0
- oscura/reporting/core_formats/__init__.py +17 -0
- oscura/reporting/core_formats/multi_format.py +210 -0
- oscura/reporting/engine.py +836 -0
- oscura/reporting/export.py +366 -0
- oscura/reporting/formatting/__init__.py +129 -0
- oscura/reporting/formatting/emphasis.py +81 -0
- oscura/reporting/formatting/numbers.py +403 -0
- oscura/reporting/formatting/standards.py +55 -0
- oscura/reporting/formatting.py +466 -0
- oscura/reporting/html.py +578 -0
- oscura/reporting/index.py +590 -0
- oscura/reporting/multichannel.py +296 -0
- oscura/reporting/output.py +379 -0
- oscura/reporting/pdf.py +373 -0
- oscura/reporting/plots.py +731 -0
- oscura/reporting/pptx_export.py +360 -0
- oscura/reporting/renderers/__init__.py +11 -0
- oscura/reporting/renderers/pdf.py +94 -0
- oscura/reporting/sections.py +471 -0
- oscura/reporting/standards.py +680 -0
- oscura/reporting/summary_generator.py +368 -0
- oscura/reporting/tables.py +397 -0
- oscura/reporting/template_system.py +724 -0
- oscura/reporting/templates/__init__.py +15 -0
- oscura/reporting/templates/definition.py +205 -0
- oscura/reporting/templates/index.html +649 -0
- oscura/reporting/templates/index.md +173 -0
- oscura/schemas/__init__.py +158 -0
- oscura/schemas/bus_configuration.json +322 -0
- oscura/schemas/device_mapping.json +182 -0
- oscura/schemas/packet_format.json +418 -0
- oscura/schemas/protocol_definition.json +363 -0
- oscura/search/__init__.py +16 -0
- oscura/search/anomaly.py +292 -0
- oscura/search/context.py +149 -0
- oscura/search/pattern.py +160 -0
- oscura/session/__init__.py +34 -0
- oscura/session/annotations.py +289 -0
- oscura/session/history.py +313 -0
- oscura/session/session.py +445 -0
- oscura/streaming/__init__.py +43 -0
- oscura/streaming/chunked.py +611 -0
- oscura/streaming/progressive.py +393 -0
- oscura/streaming/realtime.py +622 -0
- oscura/testing/__init__.py +54 -0
- oscura/testing/synthetic.py +808 -0
- oscura/triggering/__init__.py +68 -0
- oscura/triggering/base.py +229 -0
- oscura/triggering/edge.py +353 -0
- oscura/triggering/pattern.py +344 -0
- oscura/triggering/pulse.py +581 -0
- oscura/triggering/window.py +453 -0
- oscura/ui/__init__.py +48 -0
- oscura/ui/formatters.py +526 -0
- oscura/ui/progressive_display.py +340 -0
- oscura/utils/__init__.py +99 -0
- oscura/utils/autodetect.py +338 -0
- oscura/utils/buffer.py +389 -0
- oscura/utils/lazy.py +407 -0
- oscura/utils/lazy_imports.py +147 -0
- oscura/utils/memory.py +836 -0
- oscura/utils/memory_advanced.py +1326 -0
- oscura/utils/memory_extensions.py +465 -0
- oscura/utils/progressive.py +352 -0
- oscura/utils/windowing.py +362 -0
- oscura/visualization/__init__.py +321 -0
- oscura/visualization/accessibility.py +526 -0
- oscura/visualization/annotations.py +374 -0
- oscura/visualization/axis_scaling.py +305 -0
- oscura/visualization/colors.py +453 -0
- oscura/visualization/digital.py +337 -0
- oscura/visualization/eye.py +420 -0
- oscura/visualization/histogram.py +281 -0
- oscura/visualization/interactive.py +858 -0
- oscura/visualization/jitter.py +702 -0
- oscura/visualization/keyboard.py +394 -0
- oscura/visualization/layout.py +365 -0
- oscura/visualization/optimization.py +1028 -0
- oscura/visualization/palettes.py +446 -0
- oscura/visualization/plot.py +92 -0
- oscura/visualization/power.py +290 -0
- oscura/visualization/power_extended.py +626 -0
- oscura/visualization/presets.py +467 -0
- oscura/visualization/protocols.py +932 -0
- oscura/visualization/render.py +207 -0
- oscura/visualization/rendering.py +444 -0
- oscura/visualization/reverse_engineering.py +791 -0
- oscura/visualization/signal_integrity.py +808 -0
- oscura/visualization/specialized.py +553 -0
- oscura/visualization/spectral.py +811 -0
- oscura/visualization/styles.py +381 -0
- oscura/visualization/thumbnails.py +311 -0
- oscura/visualization/time_axis.py +351 -0
- oscura/visualization/waveform.py +367 -0
- oscura/workflow/__init__.py +13 -0
- oscura/workflow/dag.py +377 -0
- oscura/workflows/__init__.py +58 -0
- oscura/workflows/compliance.py +280 -0
- oscura/workflows/digital.py +272 -0
- oscura/workflows/multi_trace.py +502 -0
- oscura/workflows/power.py +178 -0
- oscura/workflows/protocol.py +492 -0
- oscura/workflows/reverse_engineering.py +639 -0
- oscura/workflows/signal_integrity.py +227 -0
- oscura-0.1.1.dist-info/METADATA +300 -0
- oscura-0.1.1.dist-info/RECORD +463 -0
- oscura-0.1.1.dist-info/entry_points.txt +2 -0
- {oscura-0.0.1.dist-info → oscura-0.1.1.dist-info}/licenses/LICENSE +1 -1
- oscura-0.0.1.dist-info/METADATA +0 -63
- oscura-0.0.1.dist-info/RECORD +0 -5
- {oscura-0.0.1.dist-info → oscura-0.1.1.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,614 @@
|
|
|
1
|
+
"""N-gram frequency analysis for protocol fingerprinting.
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
This module provides tools for analyzing n-gram (byte sequence) frequencies
|
|
5
|
+
in binary data, useful for pattern identification, data characterization,
|
|
6
|
+
and protocol fingerprinting.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from collections import Counter
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
from typing import TYPE_CHECKING, Any, Union
|
|
12
|
+
|
|
13
|
+
import numpy as np
|
|
14
|
+
|
|
15
|
+
if TYPE_CHECKING:
|
|
16
|
+
from numpy.typing import NDArray
|
|
17
|
+
|
|
18
|
+
# Type alias for input data
|
|
19
|
+
# NDArray is written as a string because it is imported only under
# TYPE_CHECKING and therefore is not available as a name at runtime.
DataType = Union[bytes, bytearray, "NDArray[Any]"]
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass
class NgramProfile:
    """Frequency statistics collected for n-grams of one fixed size.

    Attributes:
        n: Length of every n-gram, in bytes.
        frequencies: Occurrence count keyed by n-gram.
        total_ngrams: How many n-grams were extracted in total.
        unique_ngrams: How many distinct n-grams were seen.
        top_k: Most frequent n-grams as (ngram, count, frequency) tuples.
        entropy: Shannon entropy of the n-gram distribution.
    """

    n: int
    frequencies: dict[bytes, int]
    total_ngrams: int
    unique_ngrams: int
    # Each entry is (ngram, count, frequency).
    top_k: list[tuple[bytes, int, float]]
    entropy: float
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@dataclass
class NgramComparison:
    """Similarity metrics between the n-gram profiles of two datasets.

    Attributes:
        similarity: Jaccard similarity coefficient (0-1, 1 = identical).
        cosine_similarity: Cosine similarity of the frequency vectors (0-1).
        chi_square: Chi-square distance between the two distributions.
        common_ngrams: Count of n-grams that occur in both profiles.
        unique_to_a: Count of n-grams that occur only in the first profile.
        unique_to_b: Count of n-grams that occur only in the second profile.
    """

    # Set-based and frequency-based similarity scores.
    similarity: float
    cosine_similarity: float
    chi_square: float
    # Overlap bookkeeping.
    common_ngrams: int
    unique_to_a: int
    unique_to_b: int
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def ngram_frequency(data: DataType, n: int = 2, overlap: bool = True) -> NgramProfile:
    """Build the frequency profile of the n-grams found in ``data``.

    Windows of ``n`` bytes are read from the input and the occurrences of
    each distinct window are counted. The window advances one byte at a time
    when ``overlap`` is true and ``n`` bytes at a time otherwise.

    Args:
        data: Input data as bytes, bytearray, or numpy array. Arrays are
            converted to a bytes object before processing.
        n: N-gram size in bytes (default: 2 for bigrams)
        overlap: If True, use overlapping n-grams; if False, non-overlapping
            (default: True)

    Returns:
        NgramProfile holding the counts, at most the 100 most frequent
        n-grams, and the Shannon entropy of the distribution. Input shorter
        than ``n`` produces an empty profile with entropy 0.0.

    Raises:
        ValueError: If n < 1

    Example:
        >>> profile = ngram_frequency(b'ABCABC', n=2)
        >>> profile.n
        2
        >>> profile.total_ngrams
        5
    """
    if isinstance(data, np.ndarray):
        if data.dtype == np.uint8:
            data = data.tobytes()
        else:
            data = bytes(data.flatten())

    if n < 1:
        raise ValueError(f"N-gram size must be >= 1, got {n}")

    # Not even one full window fits: nothing to count.
    if len(data) < n:
        return NgramProfile(
            n=n,
            frequencies={},
            total_ngrams=0,
            unique_ngrams=0,
            top_k=[],
            entropy=0.0,
        )

    # Start offsets of every window; the stride decides whether they overlap.
    stride = 1 if overlap else n
    starts = range(0, len(data) - n + 1, stride)
    counts = Counter(bytes(data[pos : pos + n]) for pos in starts)
    total = len(starts)

    # Rank by descending count; ties are broken by the n-gram bytes so the
    # ordering is deterministic. Only the first 100 entries are kept.
    ranked = sorted(counts.items(), key=lambda item: (-item[1], item[0]))
    top_k = [(gram, hits, hits / total) for gram, hits in ranked[:100]]

    # Shannon entropy (bits) of the empirical n-gram distribution.
    entropy_bits = 0.0
    for hits in counts.values():
        p = hits / total
        entropy_bits -= p * np.log2(p)

    return NgramProfile(
        n=n,
        frequencies=dict(counts),
        total_ngrams=total,
        unique_ngrams=len(counts),
        top_k=top_k,
        entropy=float(entropy_bits),
    )
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def ngram_entropy(data: DataType, n: int = 2) -> float:
    """Return the Shannon entropy of the overlapping n-gram distribution.

    The value is taken from the profile computed by ``ngram_frequency`` with
    overlapping windows, and measures how predictable the n-gram sequences
    are.

    Args:
        data: Input data as bytes, bytearray, or numpy array
        n: N-gram size in bytes (default: 2)

    Returns:
        N-gram entropy in bits (0.0 if data is too short)

    Example:
        >>> entropy = ngram_entropy(b'AAAA', n=2)
        >>> entropy
        0.0
    """
    return ngram_frequency(data, n=n, overlap=True).entropy
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def compare_ngram_profiles(data_a: DataType, data_b: DataType, n: int = 2) -> NgramComparison:
    """Compare the overlapping n-gram distributions of two datasets.

    Both inputs are profiled with ``ngram_frequency`` and three metrics are
    derived from the resulting counts: Jaccard similarity of the n-gram sets,
    cosine similarity of the count vectors, and a chi-square distance between
    the normalized frequencies.

    Args:
        data_a: First dataset
        data_b: Second dataset
        n: N-gram size in bytes (default: 2)

    Returns:
        NgramComparison with similarity metrics. When neither input yields
        any n-gram, both similarities are 1.0 and the distance is 0.0.

    Example:
        >>> comp = compare_ngram_profiles(b'ABCABC', b'ABCABC', n=2)
        >>> comp.similarity
        1.0
    """
    prof_a = ngram_frequency(data_a, n=n, overlap=True)
    prof_b = ngram_frequency(data_b, n=n, overlap=True)
    counts_a = prof_a.frequencies
    counts_b = prof_b.frequencies

    union = set(counts_a.keys()) | set(counts_b.keys())
    shared = set(counts_a.keys()) & set(counts_b.keys())

    # No n-grams on either side: the two (empty) profiles count as identical.
    if not union:
        return NgramComparison(
            similarity=1.0,
            cosine_similarity=1.0,
            chi_square=0.0,
            common_ngrams=len(shared),
            unique_to_a=len(counts_a) - len(shared),
            unique_to_b=len(counts_b) - len(shared),
        )

    # Jaccard similarity: shared n-grams over all n-grams seen.
    jaccard = len(shared) / len(union)

    # Cosine similarity over count vectors aligned on the union of n-grams.
    vec_a = np.array([counts_a.get(gram, 0) for gram in union])
    vec_b = np.array([counts_b.get(gram, 0) for gram in union])
    norm_a = np.linalg.norm(vec_a)
    norm_b = np.linalg.norm(vec_b)
    if norm_a > 0 and norm_b > 0:
        cosine = np.dot(vec_a, vec_b) / (norm_a * norm_b)
    elif norm_a == norm_b:
        cosine = 1.0
    else:
        cosine = 0.0

    # Chi-square distance on relative frequencies, using the mean of the two
    # frequencies as the expected value for each n-gram.
    total_a = prof_a.total_ngrams
    total_b = prof_b.total_ngrams
    chi = 0.0
    for gram in union:
        share_a = counts_a.get(gram, 0) / total_a if total_a > 0 else 0
        share_b = counts_b.get(gram, 0) / total_b if total_b > 0 else 0
        midpoint = (share_a + share_b) / 2
        if midpoint > 0:
            chi += ((share_a - midpoint) ** 2 + (share_b - midpoint) ** 2) / midpoint

    return NgramComparison(
        similarity=float(jaccard),
        cosine_similarity=float(cosine),
        chi_square=float(chi),
        common_ngrams=len(shared),
        unique_to_a=len(counts_a) - len(shared),
        unique_to_b=len(counts_b) - len(shared),
    )
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def find_unusual_ngrams(
    data: DataType, baseline: NgramProfile | None = None, n: int = 2, z_threshold: float = 3.0
) -> list[tuple[bytes, float]]:
    """Find unusually frequent or rare n-grams.

    Each n-gram's observed count in ``data`` is compared with an expected
    count and turned into a z-score, ``(observed - expected) / sqrt(expected)``
    (Poisson approximation for count data). N-grams whose absolute z-score
    reaches ``z_threshold`` are reported.

    The expected count comes from ``baseline`` when the n-gram is present
    there (its baseline frequency scaled to the size of ``data``). Any other
    n-gram falls back to a uniform distribution over all ``256**n`` possible
    n-grams.

    When a baseline is given, n-grams that the baseline contains but that
    never occur in ``data`` are scored as well, with an observed count of 0.
    Without this, an n-gram that is expected often but is completely missing
    could never be reported as rare.

    Args:
        data: Input data to analyze
        baseline: Baseline n-gram profile for comparison (None = use uniform)
        n: N-gram size in bytes (default: 2)
        z_threshold: Z-score threshold for unusual classification (default: 3.0)

    Returns:
        List of (ngram, z_score) tuples for unusual n-grams, sorted by
        |z_score| in descending order. Empty if ``data`` yields no n-grams.

    Raises:
        ValueError: If baseline n-gram size doesn't match requested size.

    Example:
        >>> unusual = find_unusual_ngrams(b'AAABBBCCC', n=1)
        >>> len(unusual) >= 0
        True
    """
    # ngram_frequency() converts numpy input to bytes itself, so no separate
    # conversion is needed here.
    profile = ngram_frequency(data, n=n, overlap=True)

    if profile.total_ngrams == 0:
        return []

    # Expected count of any single n-gram under a uniform distribution.
    uniform_expected = profile.total_ngrams / (256**n)

    if baseline is None:
        # Empty mapping: every n-gram uses the uniform expectation.
        baseline_expected: dict[bytes, float] = {}
    else:
        if baseline.n != n:
            raise ValueError(f"Baseline n-gram size ({baseline.n}) != requested size ({n})")
        # Rescale baseline counts to the number of n-grams in the current data.
        baseline_expected = {
            ng: count / baseline.total_ngrams * profile.total_ngrams
            for ng, count in baseline.frequencies.items()
        }

    observed_counts = profile.frequencies

    # Score everything that was observed, plus baseline n-grams that are
    # absent from the data (observed count 0) so missing n-grams can be
    # flagged as unusually rare.
    candidates = list(observed_counts)
    candidates.extend(ng for ng in baseline_expected if ng not in observed_counts)

    unusual = []
    for ngram in candidates:
        observed = observed_counts.get(ngram, 0)
        expected = baseline_expected.get(ngram, uniform_expected)

        # A non-positive expectation has no defined z-score; skip it.
        if expected <= 0:
            continue

        z_score = (observed - expected) / np.sqrt(expected)
        if abs(z_score) >= z_threshold:
            unusual.append((ngram, float(z_score)))

    # Strongest deviations first.
    unusual.sort(key=lambda item: abs(item[1]), reverse=True)

    return unusual
|
|
311
|
+
|
|
312
|
+
|
|
313
|
+
def ngram_heatmap(data: DataType, n: int = 2) -> "NDArray[np.float64]":
    """Build a normalized 256x256 bigram co-occurrence heatmap.

    Entry ``[i, j]`` is the number of times byte value ``i`` is immediately
    followed by byte value ``j`` in ``data`` (overlapping bigrams), divided
    by the largest count in the matrix, so the most frequent bigram maps
    to 1.0.

    Args:
        data: Input data as bytes, bytearray, or numpy array. A numpy array
            whose dtype is not ``uint8`` is flattened and converted with
            ``bytes(...)``, i.e. its raw memory bytes are analyzed rather
            than its element values.
        n: N-gram size in bytes (must be 2 for heatmap visualization).

    Returns:
        Float64 numpy array of shape (256, 256), normalized to [0, 1]. The
        array is all zeros when ``data`` contains fewer than two bytes.

    Raises:
        ValueError: If n != 2 (only bigrams supported for heatmap).

    Example:
        >>> heatmap = ngram_heatmap(b'ABAB', n=2)
        >>> heatmap.shape
        (256, 256)
    """
    if n != 2:
        raise ValueError(f"Heatmap only supported for bigrams (n=2), got n={n}")

    if isinstance(data, np.ndarray):
        data = data.tobytes() if data.dtype == np.uint8 else bytes(data.flatten())

    # View the input as a flat array of byte values so counting can be
    # done by numpy instead of a per-byte Python loop.
    values = np.frombuffer(bytes(data), dtype=np.uint8)
    if values.size < 2:
        # No bigram can be formed; nothing to count or normalize.
        return np.zeros((256, 256), dtype=np.float64)

    # Encode every (first, second) pair as the flat index first * 256 + second.
    # Widen before multiplying: uint8 arithmetic would wrap around.
    flat_index = values[:-1].astype(np.intp) * 256 + values[1:]
    counts = np.bincount(flat_index, minlength=256 * 256)
    heatmap = counts.reshape(256, 256).astype(np.float64)

    # At least one bigram exists here, so the maximum is strictly positive.
    return heatmap / heatmap.max()
|
|
358
|
+
|
|
359
|
+
|
|
360
|
+
def extract_ngrams(data: DataType, n: int = 2, overlap: bool = True) -> list[bytes]:
    """Return every n-gram of ``data`` as a list of ``bytes`` objects.

    N-grams are produced in order of their start offset. With ``overlap``
    enabled the window advances one byte at a time; otherwise it advances
    ``n`` bytes, and a trailing fragment shorter than ``n`` is dropped.

    Args:
        data: Input data as bytes, bytearray, or numpy array
        n: N-gram size in bytes (default: 2)
        overlap: If True, use overlapping n-grams; if False, non-overlapping (default: True)

    Returns:
        List of bytes objects, each representing an n-gram. Empty when
        ``data`` is shorter than ``n``.

    Raises:
        ValueError: If n < 1

    Example:
        >>> ngrams = extract_ngrams(b'ABCABC', n=2)
        >>> len(ngrams)
        5
        >>> ngrams[0]
        b'AB'
    """
    if isinstance(data, np.ndarray):
        if data.dtype == np.uint8:
            data = data.tobytes()
        else:
            data = bytes(data.flatten())

    if n < 1:
        raise ValueError(f"N-gram size must be >= 1, got {n}")

    # Window start offsets: every byte when overlapping, every n-th otherwise.
    stride = 1 if overlap else n
    window_starts = range(0, len(data) - n + 1, stride)

    return [bytes(data[start : start + n]) for start in window_starts]
|
|
400
|
+
|
|
401
|
+
|
|
402
|
+
def ngram_frequencies(data: DataType, n: int = 2, overlap: bool = True) -> dict[bytes, int]:
    """Count how often each n-gram occurs in ``data``.

    The n-grams are obtained from ``extract_ngrams`` and tallied; the result
    is returned as a plain ``dict``.

    Args:
        data: Input data as bytes, bytearray, or numpy array
        n: N-gram size in bytes (default: 2)
        overlap: If True, use overlapping n-grams; if False, non-overlapping (default: True)

    Returns:
        Dictionary mapping n-grams (bytes) to their counts (int)

    Raises:
        ValueError: If n < 1

    Example:
        >>> freqs = ngram_frequencies(b'ABCABC', n=2)
        >>> freqs[b'AB']
        2
        >>> freqs[b'BC']
        2
    """
    if isinstance(data, np.ndarray):
        if data.dtype == np.uint8:
            data = data.tobytes()
        else:
            data = bytes(data.flatten())

    if n < 1:
        raise ValueError(f"N-gram size must be >= 1, got {n}")

    # Tally the extracted n-grams, then hand back an ordinary dict.
    tally = Counter(extract_ngrams(data, n=n, overlap=overlap))
    return dict(tally)
|
|
437
|
+
|
|
438
|
+
|
|
439
|
+
def find_common_ngrams(data_a: DataType, data_b: DataType, n: int = 2) -> set[bytes]:
    """Return the n-grams that occur in both ``data_a`` and ``data_b``.

    Both inputs are scanned with overlapping n-grams; how often an n-gram
    occurs does not matter, only that it appears at least once in each.

    Args:
        data_a: First dataset
        data_b: Second dataset
        n: N-gram size in bytes (default: 2)

    Returns:
        Set of n-grams present in both datasets

    Example:
        >>> common = find_common_ngrams(b'ABCABC', b'BCABC', n=2)
        >>> b'BC' in common
        True
    """
    seen_in_a = ngram_frequencies(data_a, n=n, overlap=True).keys()
    seen_in_b = ngram_frequencies(data_b, n=n, overlap=True).keys()

    # Intersecting two dict key views yields a new set.
    return seen_in_a & seen_in_b
|
|
463
|
+
|
|
464
|
+
|
|
465
|
+
class NGramAnalyzer:
    """Class-based facade over the module's n-gram functions.

    An instance stores the n-gram size and the overlap mode once and passes
    them on to the functional API. Note that ``overlap`` is forwarded only by
    ``analyze`` and ``analyze_profile``; the other methods pass just ``n``.

    Example:
        >>> analyzer = NGramAnalyzer(n=2)
        >>> frequencies = analyzer.analyze(b'ABCABC')
        >>> frequencies[b'AB']  # Direct dict access
        2
    """

    def __init__(self, n: int = 2, overlap: bool = True):
        """Store the analysis configuration.

        Args:
            n: N-gram size in bytes.
            overlap: Whether to use overlapping n-grams.
        """
        # Most recent profile produced by analyze() / analyze_profile().
        self._last_profile: NgramProfile | None = None
        self.overlap = overlap
        self.n = n

    def analyze(self, data: DataType) -> dict[bytes, int]:
        """Compute n-gram counts for ``data`` and return them as a dict.

        The full profile is remembered on the instance; the returned mapping
        is that profile's own ``frequencies`` attribute, not a copy.
        Expected to be empty when ``data`` is shorter than ``n`` (that
        behaviour comes from ``ngram_frequency``).

        Args:
            data: Input data as bytes, bytearray, or numpy array.

        Returns:
            Dictionary mapping n-grams (bytes) to their counts (int).

        Example:
            >>> analyzer = NGramAnalyzer(n=2)
            >>> frequencies = analyzer.analyze(b'ABCABC')
            >>> frequencies[b'AB']
            2
        """
        profile = ngram_frequency(data, n=self.n, overlap=self.overlap)
        self._last_profile = profile
        return profile.frequencies

    def analyze_profile(self, data: DataType) -> NgramProfile:
        """Compute the n-gram profile of ``data`` and return it whole.

        The profile is also remembered on the instance.

        Args:
            data: Input data as bytes, bytearray, or numpy array.

        Returns:
            NgramProfile with full frequency statistics.

        Example:
            >>> analyzer = NGramAnalyzer(n=2)
            >>> profile = analyzer.analyze_profile(b'ABCDEF')
            >>> profile.n == 2
            True
        """
        profile = ngram_frequency(data, n=self.n, overlap=self.overlap)
        self._last_profile = profile
        return profile

    def get_distribution(self, frequencies: dict[bytes, int]) -> dict[bytes, float]:
        """Turn raw counts into relative frequencies.

        Each count is divided by the sum of all counts. When that sum is
        zero (including an empty mapping) an empty dict is returned.

        Args:
            frequencies: Dictionary mapping n-grams to counts.

        Returns:
            Dictionary mapping n-grams to normalized frequencies (0-1).

        Example:
            >>> analyzer = NGramAnalyzer(n=2)
            >>> freqs = analyzer.analyze(b'ABAB')
            >>> dist = analyzer.get_distribution(freqs)
            >>> sum(dist.values())  # Should sum to 1.0
            1.0
        """
        denominator = sum(frequencies.values())
        distribution: dict[bytes, float] = {}
        if denominator != 0:
            for ngram, count in frequencies.items():
                distribution[ngram] = count / denominator
        return distribution

    def entropy(self, data: DataType) -> float:
        """Compute the n-gram entropy of ``data`` for this analyzer's ``n``.

        Args:
            data: Input data.

        Returns:
            Entropy in bits.
        """
        size = self.n
        return ngram_entropy(data, n=size)

    def compare(self, data_a: DataType, data_b: DataType) -> NgramComparison:
        """Compare the n-gram profiles of two datasets using this ``n``.

        Args:
            data_a: First dataset.
            data_b: Second dataset.

        Returns:
            NgramComparison with similarity metrics.
        """
        size = self.n
        return compare_ngram_profiles(data_a, data_b, n=size)

    def find_unusual(
        self, data: DataType, baseline: NgramProfile | None = None, z_threshold: float = 3.0
    ) -> list[tuple[bytes, float]]:
        """Report n-grams in ``data`` whose frequency is statistically unusual.

        Args:
            data: Input data.
            baseline: Baseline profile for comparison.
            z_threshold: Z-score threshold.

        Returns:
            List of (ngram, z_score) tuples.
        """
        return find_unusual_ngrams(
            data,
            baseline=baseline,
            n=self.n,
            z_threshold=z_threshold,
        )

    def heatmap(self, data: DataType) -> "NDArray[np.float64]":
        """Generate the bigram heatmap of ``data``.

        Delegates to ``ngram_heatmap`` with this analyzer's ``n``; that
        function raises ``ValueError`` unless ``n`` is 2.

        Args:
            data: Input data.

        Returns:
            256x256 heatmap array.
        """
        size = self.n
        return ngram_heatmap(data, n=size)
|
|
599
|
+
|
|
600
|
+
|
|
601
|
+
# Public API of this module: the names exported by a wildcard import.
__all__ = [
    "DataType",
    "NGramAnalyzer",
    "NgramComparison",
    "NgramProfile",
    "compare_ngram_profiles",
    "extract_ngrams",
    "find_common_ngrams",
    "find_unusual_ngrams",
    "ngram_entropy",
    "ngram_frequencies",
    "ngram_frequency",
    "ngram_heatmap",
]
|