oscura-0.0.1-py3-none-any.whl → oscura-0.1.1-py3-none-any.whl
This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- oscura/__init__.py +813 -8
- oscura/__main__.py +392 -0
- oscura/analyzers/__init__.py +37 -0
- oscura/analyzers/digital/__init__.py +177 -0
- oscura/analyzers/digital/bus.py +691 -0
- oscura/analyzers/digital/clock.py +805 -0
- oscura/analyzers/digital/correlation.py +720 -0
- oscura/analyzers/digital/edges.py +632 -0
- oscura/analyzers/digital/extraction.py +413 -0
- oscura/analyzers/digital/quality.py +878 -0
- oscura/analyzers/digital/signal_quality.py +877 -0
- oscura/analyzers/digital/thresholds.py +708 -0
- oscura/analyzers/digital/timing.py +1104 -0
- oscura/analyzers/eye/__init__.py +46 -0
- oscura/analyzers/eye/diagram.py +434 -0
- oscura/analyzers/eye/metrics.py +555 -0
- oscura/analyzers/jitter/__init__.py +83 -0
- oscura/analyzers/jitter/ber.py +333 -0
- oscura/analyzers/jitter/decomposition.py +759 -0
- oscura/analyzers/jitter/measurements.py +413 -0
- oscura/analyzers/jitter/spectrum.py +220 -0
- oscura/analyzers/measurements.py +40 -0
- oscura/analyzers/packet/__init__.py +171 -0
- oscura/analyzers/packet/daq.py +1077 -0
- oscura/analyzers/packet/metrics.py +437 -0
- oscura/analyzers/packet/parser.py +327 -0
- oscura/analyzers/packet/payload.py +2156 -0
- oscura/analyzers/packet/payload_analysis.py +1312 -0
- oscura/analyzers/packet/payload_extraction.py +236 -0
- oscura/analyzers/packet/payload_patterns.py +670 -0
- oscura/analyzers/packet/stream.py +359 -0
- oscura/analyzers/patterns/__init__.py +266 -0
- oscura/analyzers/patterns/clustering.py +1036 -0
- oscura/analyzers/patterns/discovery.py +539 -0
- oscura/analyzers/patterns/learning.py +797 -0
- oscura/analyzers/patterns/matching.py +1091 -0
- oscura/analyzers/patterns/periodic.py +650 -0
- oscura/analyzers/patterns/sequences.py +767 -0
- oscura/analyzers/power/__init__.py +116 -0
- oscura/analyzers/power/ac_power.py +391 -0
- oscura/analyzers/power/basic.py +383 -0
- oscura/analyzers/power/conduction.py +314 -0
- oscura/analyzers/power/efficiency.py +297 -0
- oscura/analyzers/power/ripple.py +356 -0
- oscura/analyzers/power/soa.py +372 -0
- oscura/analyzers/power/switching.py +479 -0
- oscura/analyzers/protocol/__init__.py +150 -0
- oscura/analyzers/protocols/__init__.py +150 -0
- oscura/analyzers/protocols/base.py +500 -0
- oscura/analyzers/protocols/can.py +620 -0
- oscura/analyzers/protocols/can_fd.py +448 -0
- oscura/analyzers/protocols/flexray.py +405 -0
- oscura/analyzers/protocols/hdlc.py +399 -0
- oscura/analyzers/protocols/i2c.py +368 -0
- oscura/analyzers/protocols/i2s.py +296 -0
- oscura/analyzers/protocols/jtag.py +393 -0
- oscura/analyzers/protocols/lin.py +445 -0
- oscura/analyzers/protocols/manchester.py +333 -0
- oscura/analyzers/protocols/onewire.py +501 -0
- oscura/analyzers/protocols/spi.py +334 -0
- oscura/analyzers/protocols/swd.py +325 -0
- oscura/analyzers/protocols/uart.py +393 -0
- oscura/analyzers/protocols/usb.py +495 -0
- oscura/analyzers/signal_integrity/__init__.py +63 -0
- oscura/analyzers/signal_integrity/embedding.py +294 -0
- oscura/analyzers/signal_integrity/equalization.py +370 -0
- oscura/analyzers/signal_integrity/sparams.py +484 -0
- oscura/analyzers/spectral/__init__.py +53 -0
- oscura/analyzers/spectral/chunked.py +273 -0
- oscura/analyzers/spectral/chunked_fft.py +571 -0
- oscura/analyzers/spectral/chunked_wavelet.py +391 -0
- oscura/analyzers/spectral/fft.py +92 -0
- oscura/analyzers/statistical/__init__.py +250 -0
- oscura/analyzers/statistical/checksum.py +923 -0
- oscura/analyzers/statistical/chunked_corr.py +228 -0
- oscura/analyzers/statistical/classification.py +778 -0
- oscura/analyzers/statistical/entropy.py +1113 -0
- oscura/analyzers/statistical/ngrams.py +614 -0
- oscura/analyzers/statistics/__init__.py +119 -0
- oscura/analyzers/statistics/advanced.py +885 -0
- oscura/analyzers/statistics/basic.py +263 -0
- oscura/analyzers/statistics/correlation.py +630 -0
- oscura/analyzers/statistics/distribution.py +298 -0
- oscura/analyzers/statistics/outliers.py +463 -0
- oscura/analyzers/statistics/streaming.py +93 -0
- oscura/analyzers/statistics/trend.py +520 -0
- oscura/analyzers/validation.py +598 -0
- oscura/analyzers/waveform/__init__.py +36 -0
- oscura/analyzers/waveform/measurements.py +943 -0
- oscura/analyzers/waveform/measurements_with_uncertainty.py +371 -0
- oscura/analyzers/waveform/spectral.py +1689 -0
- oscura/analyzers/waveform/wavelets.py +298 -0
- oscura/api/__init__.py +62 -0
- oscura/api/dsl.py +538 -0
- oscura/api/fluent.py +571 -0
- oscura/api/operators.py +498 -0
- oscura/api/optimization.py +392 -0
- oscura/api/profiling.py +396 -0
- oscura/automotive/__init__.py +73 -0
- oscura/automotive/can/__init__.py +52 -0
- oscura/automotive/can/analysis.py +356 -0
- oscura/automotive/can/checksum.py +250 -0
- oscura/automotive/can/correlation.py +212 -0
- oscura/automotive/can/discovery.py +355 -0
- oscura/automotive/can/message_wrapper.py +375 -0
- oscura/automotive/can/models.py +385 -0
- oscura/automotive/can/patterns.py +381 -0
- oscura/automotive/can/session.py +452 -0
- oscura/automotive/can/state_machine.py +300 -0
- oscura/automotive/can/stimulus_response.py +461 -0
- oscura/automotive/dbc/__init__.py +15 -0
- oscura/automotive/dbc/generator.py +156 -0
- oscura/automotive/dbc/parser.py +146 -0
- oscura/automotive/dtc/__init__.py +30 -0
- oscura/automotive/dtc/database.py +3036 -0
- oscura/automotive/j1939/__init__.py +14 -0
- oscura/automotive/j1939/decoder.py +745 -0
- oscura/automotive/loaders/__init__.py +35 -0
- oscura/automotive/loaders/asc.py +98 -0
- oscura/automotive/loaders/blf.py +77 -0
- oscura/automotive/loaders/csv_can.py +136 -0
- oscura/automotive/loaders/dispatcher.py +136 -0
- oscura/automotive/loaders/mdf.py +331 -0
- oscura/automotive/loaders/pcap.py +132 -0
- oscura/automotive/obd/__init__.py +14 -0
- oscura/automotive/obd/decoder.py +707 -0
- oscura/automotive/uds/__init__.py +48 -0
- oscura/automotive/uds/decoder.py +265 -0
- oscura/automotive/uds/models.py +64 -0
- oscura/automotive/visualization.py +369 -0
- oscura/batch/__init__.py +55 -0
- oscura/batch/advanced.py +627 -0
- oscura/batch/aggregate.py +300 -0
- oscura/batch/analyze.py +139 -0
- oscura/batch/logging.py +487 -0
- oscura/batch/metrics.py +556 -0
- oscura/builders/__init__.py +41 -0
- oscura/builders/signal_builder.py +1131 -0
- oscura/cli/__init__.py +14 -0
- oscura/cli/batch.py +339 -0
- oscura/cli/characterize.py +273 -0
- oscura/cli/compare.py +775 -0
- oscura/cli/decode.py +551 -0
- oscura/cli/main.py +247 -0
- oscura/cli/shell.py +350 -0
- oscura/comparison/__init__.py +66 -0
- oscura/comparison/compare.py +397 -0
- oscura/comparison/golden.py +487 -0
- oscura/comparison/limits.py +391 -0
- oscura/comparison/mask.py +434 -0
- oscura/comparison/trace_diff.py +30 -0
- oscura/comparison/visualization.py +481 -0
- oscura/compliance/__init__.py +70 -0
- oscura/compliance/advanced.py +756 -0
- oscura/compliance/masks.py +363 -0
- oscura/compliance/reporting.py +483 -0
- oscura/compliance/testing.py +298 -0
- oscura/component/__init__.py +38 -0
- oscura/component/impedance.py +365 -0
- oscura/component/reactive.py +598 -0
- oscura/component/transmission_line.py +312 -0
- oscura/config/__init__.py +191 -0
- oscura/config/defaults.py +254 -0
- oscura/config/loader.py +348 -0
- oscura/config/memory.py +271 -0
- oscura/config/migration.py +458 -0
- oscura/config/pipeline.py +1077 -0
- oscura/config/preferences.py +530 -0
- oscura/config/protocol.py +875 -0
- oscura/config/schema.py +713 -0
- oscura/config/settings.py +420 -0
- oscura/config/thresholds.py +599 -0
- oscura/convenience.py +457 -0
- oscura/core/__init__.py +299 -0
- oscura/core/audit.py +457 -0
- oscura/core/backend_selector.py +405 -0
- oscura/core/cache.py +590 -0
- oscura/core/cancellation.py +439 -0
- oscura/core/confidence.py +225 -0
- oscura/core/config.py +506 -0
- oscura/core/correlation.py +216 -0
- oscura/core/cross_domain.py +422 -0
- oscura/core/debug.py +301 -0
- oscura/core/edge_cases.py +541 -0
- oscura/core/exceptions.py +535 -0
- oscura/core/gpu_backend.py +523 -0
- oscura/core/lazy.py +832 -0
- oscura/core/log_query.py +540 -0
- oscura/core/logging.py +931 -0
- oscura/core/logging_advanced.py +952 -0
- oscura/core/memoize.py +171 -0
- oscura/core/memory_check.py +274 -0
- oscura/core/memory_guard.py +290 -0
- oscura/core/memory_limits.py +336 -0
- oscura/core/memory_monitor.py +453 -0
- oscura/core/memory_progress.py +465 -0
- oscura/core/memory_warnings.py +315 -0
- oscura/core/numba_backend.py +362 -0
- oscura/core/performance.py +352 -0
- oscura/core/progress.py +524 -0
- oscura/core/provenance.py +358 -0
- oscura/core/results.py +331 -0
- oscura/core/types.py +504 -0
- oscura/core/uncertainty.py +383 -0
- oscura/discovery/__init__.py +52 -0
- oscura/discovery/anomaly_detector.py +672 -0
- oscura/discovery/auto_decoder.py +415 -0
- oscura/discovery/comparison.py +497 -0
- oscura/discovery/quality_validator.py +528 -0
- oscura/discovery/signal_detector.py +769 -0
- oscura/dsl/__init__.py +73 -0
- oscura/dsl/commands.py +246 -0
- oscura/dsl/interpreter.py +455 -0
- oscura/dsl/parser.py +689 -0
- oscura/dsl/repl.py +172 -0
- oscura/exceptions.py +59 -0
- oscura/exploratory/__init__.py +111 -0
- oscura/exploratory/error_recovery.py +642 -0
- oscura/exploratory/fuzzy.py +513 -0
- oscura/exploratory/fuzzy_advanced.py +786 -0
- oscura/exploratory/legacy.py +831 -0
- oscura/exploratory/parse.py +358 -0
- oscura/exploratory/recovery.py +275 -0
- oscura/exploratory/sync.py +382 -0
- oscura/exploratory/unknown.py +707 -0
- oscura/export/__init__.py +25 -0
- oscura/export/wireshark/README.md +265 -0
- oscura/export/wireshark/__init__.py +47 -0
- oscura/export/wireshark/generator.py +312 -0
- oscura/export/wireshark/lua_builder.py +159 -0
- oscura/export/wireshark/templates/dissector.lua.j2 +92 -0
- oscura/export/wireshark/type_mapping.py +165 -0
- oscura/export/wireshark/validator.py +105 -0
- oscura/exporters/__init__.py +94 -0
- oscura/exporters/csv.py +303 -0
- oscura/exporters/exporters.py +44 -0
- oscura/exporters/hdf5.py +219 -0
- oscura/exporters/html_export.py +701 -0
- oscura/exporters/json_export.py +291 -0
- oscura/exporters/markdown_export.py +367 -0
- oscura/exporters/matlab_export.py +354 -0
- oscura/exporters/npz_export.py +219 -0
- oscura/exporters/spice_export.py +210 -0
- oscura/extensibility/__init__.py +131 -0
- oscura/extensibility/docs.py +752 -0
- oscura/extensibility/extensions.py +1125 -0
- oscura/extensibility/logging.py +259 -0
- oscura/extensibility/measurements.py +485 -0
- oscura/extensibility/plugins.py +414 -0
- oscura/extensibility/registry.py +346 -0
- oscura/extensibility/templates.py +913 -0
- oscura/extensibility/validation.py +651 -0
- oscura/filtering/__init__.py +89 -0
- oscura/filtering/base.py +563 -0
- oscura/filtering/convenience.py +564 -0
- oscura/filtering/design.py +725 -0
- oscura/filtering/filters.py +32 -0
- oscura/filtering/introspection.py +605 -0
- oscura/guidance/__init__.py +24 -0
- oscura/guidance/recommender.py +429 -0
- oscura/guidance/wizard.py +518 -0
- oscura/inference/__init__.py +251 -0
- oscura/inference/active_learning/README.md +153 -0
- oscura/inference/active_learning/__init__.py +38 -0
- oscura/inference/active_learning/lstar.py +257 -0
- oscura/inference/active_learning/observation_table.py +230 -0
- oscura/inference/active_learning/oracle.py +78 -0
- oscura/inference/active_learning/teachers/__init__.py +15 -0
- oscura/inference/active_learning/teachers/simulator.py +192 -0
- oscura/inference/adaptive_tuning.py +453 -0
- oscura/inference/alignment.py +653 -0
- oscura/inference/bayesian.py +943 -0
- oscura/inference/binary.py +1016 -0
- oscura/inference/crc_reverse.py +711 -0
- oscura/inference/logic.py +288 -0
- oscura/inference/message_format.py +1305 -0
- oscura/inference/protocol.py +417 -0
- oscura/inference/protocol_dsl.py +1084 -0
- oscura/inference/protocol_library.py +1230 -0
- oscura/inference/sequences.py +809 -0
- oscura/inference/signal_intelligence.py +1509 -0
- oscura/inference/spectral.py +215 -0
- oscura/inference/state_machine.py +634 -0
- oscura/inference/stream.py +918 -0
- oscura/integrations/__init__.py +59 -0
- oscura/integrations/llm.py +1827 -0
- oscura/jupyter/__init__.py +32 -0
- oscura/jupyter/display.py +268 -0
- oscura/jupyter/magic.py +334 -0
- oscura/loaders/__init__.py +526 -0
- oscura/loaders/binary.py +69 -0
- oscura/loaders/configurable.py +1255 -0
- oscura/loaders/csv.py +26 -0
- oscura/loaders/csv_loader.py +473 -0
- oscura/loaders/hdf5.py +9 -0
- oscura/loaders/hdf5_loader.py +510 -0
- oscura/loaders/lazy.py +370 -0
- oscura/loaders/mmap_loader.py +583 -0
- oscura/loaders/numpy_loader.py +436 -0
- oscura/loaders/pcap.py +432 -0
- oscura/loaders/preprocessing.py +368 -0
- oscura/loaders/rigol.py +287 -0
- oscura/loaders/sigrok.py +321 -0
- oscura/loaders/tdms.py +367 -0
- oscura/loaders/tektronix.py +711 -0
- oscura/loaders/validation.py +584 -0
- oscura/loaders/vcd.py +464 -0
- oscura/loaders/wav.py +233 -0
- oscura/math/__init__.py +45 -0
- oscura/math/arithmetic.py +824 -0
- oscura/math/interpolation.py +413 -0
- oscura/onboarding/__init__.py +39 -0
- oscura/onboarding/help.py +498 -0
- oscura/onboarding/tutorials.py +405 -0
- oscura/onboarding/wizard.py +466 -0
- oscura/optimization/__init__.py +19 -0
- oscura/optimization/parallel.py +440 -0
- oscura/optimization/search.py +532 -0
- oscura/pipeline/__init__.py +43 -0
- oscura/pipeline/base.py +338 -0
- oscura/pipeline/composition.py +242 -0
- oscura/pipeline/parallel.py +448 -0
- oscura/pipeline/pipeline.py +375 -0
- oscura/pipeline/reverse_engineering.py +1119 -0
- oscura/plugins/__init__.py +122 -0
- oscura/plugins/base.py +272 -0
- oscura/plugins/cli.py +497 -0
- oscura/plugins/discovery.py +411 -0
- oscura/plugins/isolation.py +418 -0
- oscura/plugins/lifecycle.py +959 -0
- oscura/plugins/manager.py +493 -0
- oscura/plugins/registry.py +421 -0
- oscura/plugins/versioning.py +372 -0
- oscura/py.typed +0 -0
- oscura/quality/__init__.py +65 -0
- oscura/quality/ensemble.py +740 -0
- oscura/quality/explainer.py +338 -0
- oscura/quality/scoring.py +616 -0
- oscura/quality/warnings.py +456 -0
- oscura/reporting/__init__.py +248 -0
- oscura/reporting/advanced.py +1234 -0
- oscura/reporting/analyze.py +448 -0
- oscura/reporting/argument_preparer.py +596 -0
- oscura/reporting/auto_report.py +507 -0
- oscura/reporting/batch.py +615 -0
- oscura/reporting/chart_selection.py +223 -0
- oscura/reporting/comparison.py +330 -0
- oscura/reporting/config.py +615 -0
- oscura/reporting/content/__init__.py +39 -0
- oscura/reporting/content/executive.py +127 -0
- oscura/reporting/content/filtering.py +191 -0
- oscura/reporting/content/minimal.py +257 -0
- oscura/reporting/content/verbosity.py +162 -0
- oscura/reporting/core.py +508 -0
- oscura/reporting/core_formats/__init__.py +17 -0
- oscura/reporting/core_formats/multi_format.py +210 -0
- oscura/reporting/engine.py +836 -0
- oscura/reporting/export.py +366 -0
- oscura/reporting/formatting/__init__.py +129 -0
- oscura/reporting/formatting/emphasis.py +81 -0
- oscura/reporting/formatting/numbers.py +403 -0
- oscura/reporting/formatting/standards.py +55 -0
- oscura/reporting/formatting.py +466 -0
- oscura/reporting/html.py +578 -0
- oscura/reporting/index.py +590 -0
- oscura/reporting/multichannel.py +296 -0
- oscura/reporting/output.py +379 -0
- oscura/reporting/pdf.py +373 -0
- oscura/reporting/plots.py +731 -0
- oscura/reporting/pptx_export.py +360 -0
- oscura/reporting/renderers/__init__.py +11 -0
- oscura/reporting/renderers/pdf.py +94 -0
- oscura/reporting/sections.py +471 -0
- oscura/reporting/standards.py +680 -0
- oscura/reporting/summary_generator.py +368 -0
- oscura/reporting/tables.py +397 -0
- oscura/reporting/template_system.py +724 -0
- oscura/reporting/templates/__init__.py +15 -0
- oscura/reporting/templates/definition.py +205 -0
- oscura/reporting/templates/index.html +649 -0
- oscura/reporting/templates/index.md +173 -0
- oscura/schemas/__init__.py +158 -0
- oscura/schemas/bus_configuration.json +322 -0
- oscura/schemas/device_mapping.json +182 -0
- oscura/schemas/packet_format.json +418 -0
- oscura/schemas/protocol_definition.json +363 -0
- oscura/search/__init__.py +16 -0
- oscura/search/anomaly.py +292 -0
- oscura/search/context.py +149 -0
- oscura/search/pattern.py +160 -0
- oscura/session/__init__.py +34 -0
- oscura/session/annotations.py +289 -0
- oscura/session/history.py +313 -0
- oscura/session/session.py +445 -0
- oscura/streaming/__init__.py +43 -0
- oscura/streaming/chunked.py +611 -0
- oscura/streaming/progressive.py +393 -0
- oscura/streaming/realtime.py +622 -0
- oscura/testing/__init__.py +54 -0
- oscura/testing/synthetic.py +808 -0
- oscura/triggering/__init__.py +68 -0
- oscura/triggering/base.py +229 -0
- oscura/triggering/edge.py +353 -0
- oscura/triggering/pattern.py +344 -0
- oscura/triggering/pulse.py +581 -0
- oscura/triggering/window.py +453 -0
- oscura/ui/__init__.py +48 -0
- oscura/ui/formatters.py +526 -0
- oscura/ui/progressive_display.py +340 -0
- oscura/utils/__init__.py +99 -0
- oscura/utils/autodetect.py +338 -0
- oscura/utils/buffer.py +389 -0
- oscura/utils/lazy.py +407 -0
- oscura/utils/lazy_imports.py +147 -0
- oscura/utils/memory.py +836 -0
- oscura/utils/memory_advanced.py +1326 -0
- oscura/utils/memory_extensions.py +465 -0
- oscura/utils/progressive.py +352 -0
- oscura/utils/windowing.py +362 -0
- oscura/visualization/__init__.py +321 -0
- oscura/visualization/accessibility.py +526 -0
- oscura/visualization/annotations.py +374 -0
- oscura/visualization/axis_scaling.py +305 -0
- oscura/visualization/colors.py +453 -0
- oscura/visualization/digital.py +337 -0
- oscura/visualization/eye.py +420 -0
- oscura/visualization/histogram.py +281 -0
- oscura/visualization/interactive.py +858 -0
- oscura/visualization/jitter.py +702 -0
- oscura/visualization/keyboard.py +394 -0
- oscura/visualization/layout.py +365 -0
- oscura/visualization/optimization.py +1028 -0
- oscura/visualization/palettes.py +446 -0
- oscura/visualization/plot.py +92 -0
- oscura/visualization/power.py +290 -0
- oscura/visualization/power_extended.py +626 -0
- oscura/visualization/presets.py +467 -0
- oscura/visualization/protocols.py +932 -0
- oscura/visualization/render.py +207 -0
- oscura/visualization/rendering.py +444 -0
- oscura/visualization/reverse_engineering.py +791 -0
- oscura/visualization/signal_integrity.py +808 -0
- oscura/visualization/specialized.py +553 -0
- oscura/visualization/spectral.py +811 -0
- oscura/visualization/styles.py +381 -0
- oscura/visualization/thumbnails.py +311 -0
- oscura/visualization/time_axis.py +351 -0
- oscura/visualization/waveform.py +367 -0
- oscura/workflow/__init__.py +13 -0
- oscura/workflow/dag.py +377 -0
- oscura/workflows/__init__.py +58 -0
- oscura/workflows/compliance.py +280 -0
- oscura/workflows/digital.py +272 -0
- oscura/workflows/multi_trace.py +502 -0
- oscura/workflows/power.py +178 -0
- oscura/workflows/protocol.py +492 -0
- oscura/workflows/reverse_engineering.py +639 -0
- oscura/workflows/signal_integrity.py +227 -0
- oscura-0.1.1.dist-info/METADATA +300 -0
- oscura-0.1.1.dist-info/RECORD +463 -0
- oscura-0.1.1.dist-info/entry_points.txt +2 -0
- {oscura-0.0.1.dist-info → oscura-0.1.1.dist-info}/licenses/LICENSE +1 -1
- oscura-0.0.1.dist-info/METADATA +0 -63
- oscura-0.0.1.dist-info/RECORD +0 -5
- {oscura-0.0.1.dist-info → oscura-0.1.1.dist-info}/WHEEL +0 -0
oscura/analyzers/patterns/clustering.py (new file)
@@ -0,0 +1,1036 @@
"""Pattern clustering by similarity.

This module implements algorithms for clustering similar patterns/messages
using various distance metrics and clustering approaches.


Author: Oscura Development Team
"""

from dataclasses import dataclass
from typing import Literal

import numpy as np


def cluster_messages(
    data: np.ndarray[tuple[int, int], np.dtype[np.float64]],
    n_clusters: int = 3,
    method: str = "kmeans",
    random_state: int | None = None,
) -> np.ndarray[tuple[int], np.dtype[np.int_]]:
    """Cluster data points using the K-means algorithm.

    Groups data points into n_clusters clusters using K-means clustering.
    Supports deterministic clustering with random_state.

    Args:
        data: Data points as (n_points, dimensions) array
        n_clusters: Number of clusters to create
        method: Clustering method (only 'kmeans' is currently implemented)
        random_state: Random seed for deterministic results

    Returns:
        Array of cluster labels (one per data point), in range [0, n_clusters)

    Raises:
        ValueError: If n_clusters is invalid or data shape is incorrect

    Examples:
        >>> data = np.random.randn(20, 2)
        >>> labels = cluster_messages(data, n_clusters=3, random_state=42)
        >>> assert len(labels) == 20
        >>> assert np.all((labels >= 0) & (labels < 3))
    """
    if data.ndim != 2:
        raise ValueError(f"Expected 2D data array, got shape {data.shape}")

    if n_clusters < 1:
        raise ValueError(f"n_clusters must be >= 1, got {n_clusters}")

    n_points = data.shape[0]
    if n_clusters > n_points:
        raise ValueError(f"n_clusters ({n_clusters}) cannot exceed n_points ({n_points})")

    # Use K-means clustering
    return _kmeans_clustering(data, n_clusters=n_clusters, random_state=random_state)


def _kmeans_clustering(
    data: np.ndarray[tuple[int, int], np.dtype[np.float64]],
    n_clusters: int,
    random_state: int | None = None,
    max_iterations: int = 100,
    tolerance: float = 1e-4,
) -> np.ndarray[tuple[int], np.dtype[np.int_]]:
    """K-means clustering implementation.

    Args:
        data: Input data points (n_points, dimensions)
        n_clusters: Number of clusters
        random_state: Random seed
        max_iterations: Maximum iterations
        tolerance: Convergence tolerance (currently unused; convergence is
            detected by exact label equality between iterations)

    Returns:
        Cluster labels for each point
    """
    if random_state is not None:
        np.random.seed(random_state)

    n_points = data.shape[0]

    # Initialize centroids randomly from data points
    initial_indices = np.random.choice(n_points, size=n_clusters, replace=False)
    centroids = data[initial_indices].copy()

    labels = np.zeros(n_points, dtype=int)

    for _iteration in range(max_iterations):
        # Assign points to nearest centroid
        distances = np.zeros((n_points, n_clusters))
        for k in range(n_clusters):
            distances[:, k] = np.linalg.norm(data - centroids[k], axis=1)

        new_labels = np.argmin(distances, axis=1)

        # Check for convergence
        if np.array_equal(new_labels, labels):
            break

        labels = new_labels

        # Update centroids
        for k in range(n_clusters):
            cluster_points = data[labels == k]
            if len(cluster_points) > 0:
                centroids[k] = np.mean(cluster_points, axis=0)

    return labels


@dataclass
class ClusterResult:
    """Result of pattern clustering.

    Attributes:
        cluster_id: Unique cluster identifier
        patterns: List of patterns in this cluster
        centroid: Representative pattern (centroid)
        size: Number of patterns in cluster
        variance: Within-cluster variance
        common_bytes: Byte positions that are constant across all patterns
        variable_bytes: Byte positions that vary across patterns
    """

    cluster_id: int
    patterns: list[bytes | np.ndarray[tuple[int], np.dtype[np.uint8]]]
    centroid: bytes | np.ndarray[tuple[int], np.dtype[np.uint8]]
    size: int
    variance: float
    common_bytes: list[int]
    variable_bytes: list[int]

    def __post_init__(self) -> None:
        """Validate cluster result."""
        if self.cluster_id < 0:
            raise ValueError("cluster_id must be non-negative")
        if self.size < 0:
            raise ValueError("size must be non-negative")
        if len(self.patterns) != self.size:
            raise ValueError("patterns length must match size")


@dataclass
class ClusteringResult:
    """Complete clustering result.

    Attributes:
        clusters: List of ClusterResult objects
        labels: Cluster assignment for each input pattern
        num_clusters: Total number of clusters
        silhouette_score: Clustering quality metric (-1 to 1, higher = better)
    """

    clusters: list[ClusterResult]
    labels: np.ndarray[tuple[int], np.dtype[np.int_]]
    num_clusters: int
    silhouette_score: float

    def __post_init__(self) -> None:
        """Validate clustering result."""
        if self.num_clusters != len(self.clusters):
            raise ValueError("num_clusters must match clusters length")


def cluster_by_hamming(
    patterns: list[bytes | np.ndarray[tuple[int], np.dtype[np.uint8]]],
    threshold: float = 0.2,
    min_cluster_size: int = 2,
) -> ClusteringResult:
    """Cluster fixed-length patterns by Hamming distance.

    Groups patterns that differ in at most threshold * pattern_length byte
    positions. Efficient for fixed-length binary patterns.

    Args:
        patterns: List of patterns (all must have same length)
        threshold: Maximum normalized Hamming distance within cluster (0-1)
        min_cluster_size: Minimum patterns per cluster

    Returns:
        ClusteringResult with cluster assignments

    Raises:
        ValueError: If patterns have different lengths or invalid parameters

    Examples:
        >>> patterns = [b"ABCD", b"ABCE", b"ABCF", b"XYZA"]
        >>> result = cluster_by_hamming(patterns, threshold=0.3)
        >>> assert result.num_clusters >= 1
    """
    if not patterns:
        return ClusteringResult(
            clusters=[], labels=np.array([], dtype=int), num_clusters=0, silhouette_score=0.0
        )

    # Validate all patterns have same length
    pattern_length = len(patterns[0])
    for i, p in enumerate(patterns):
        if len(p) != pattern_length:
            raise ValueError(f"Pattern {i} has length {len(p)}, expected {pattern_length}")

    # Convert to numpy arrays for efficient computation
    pattern_arrays = [_to_array(p) for p in patterns]
    n = len(pattern_arrays)

    # Compute distance matrix
    dist_matrix = compute_distance_matrix(patterns, metric="hamming")

    # Perform clustering using a simple threshold-based approach
    labels = np.full(n, -1, dtype=int)
    cluster_id = 0

    for i in range(n):
        if labels[i] != -1:
            continue  # Already assigned

        # Start new cluster
        cluster_members = [i]
        labels[i] = cluster_id

        # Find all patterns within threshold
        for j in range(i + 1, n):
            if labels[j] != -1:
                continue

            # Check if j is close to all members of current cluster
            max_dist = max(dist_matrix[j, m] for m in cluster_members)
            if max_dist <= threshold:
                cluster_members.append(j)
                labels[j] = cluster_id

        # Only keep cluster if large enough
        if len(cluster_members) < min_cluster_size:
            for m in cluster_members:
                labels[m] = -1
        else:
            cluster_id += 1

    # Patterns in undersized clusters remain in the noise cluster (-1)
    num_clusters = cluster_id

    # Build cluster results
    clusters = []
    for cid in range(num_clusters):
        cluster_indices = np.where(labels == cid)[0]
        cluster_patterns = [patterns[i] for i in cluster_indices]

        # Compute centroid (majority vote per byte)
        centroid = _compute_centroid_hamming([pattern_arrays[i] for i in cluster_indices])

        # Analyze common vs variable bytes
        common, variable = _analyze_pattern_variance([pattern_arrays[i] for i in cluster_indices])

        # Compute within-cluster variance
        variance = (
            np.mean([dist_matrix[i, j] for i in cluster_indices for j in cluster_indices if i < j])
            if len(cluster_indices) > 1
            else 0.0
        )

        clusters.append(
            ClusterResult(
                cluster_id=cid,
                patterns=cluster_patterns,
                centroid=bytes(centroid) if isinstance(patterns[0], bytes) else centroid,
                size=len(cluster_patterns),
                variance=float(variance),
                common_bytes=common,
                variable_bytes=variable,
            )
        )

    # Compute silhouette score
    silhouette = _compute_silhouette_score(dist_matrix, labels) if num_clusters > 1 else 0.0

    return ClusteringResult(
        clusters=clusters, labels=labels, num_clusters=num_clusters, silhouette_score=silhouette
    )


def cluster_by_edit_distance(
    patterns: list[bytes | np.ndarray[tuple[int], np.dtype[np.uint8]]],
    threshold: float = 0.3,
    min_cluster_size: int = 2,
) -> ClusteringResult:
    """Cluster variable-length patterns by edit distance.

    Groups patterns with normalized edit distance <= threshold.
    Works with variable-length patterns.

    Args:
        patterns: List of patterns (can have different lengths)
        threshold: Maximum normalized edit distance (0-1)
        min_cluster_size: Minimum patterns per cluster

    Returns:
        ClusteringResult with cluster assignments

    Examples:
        >>> patterns = [b"ABCD", b"ABCDE", b"ABCDF", b"XYZ"]
        >>> result = cluster_by_edit_distance(patterns, threshold=0.4)
    """
    if not patterns:
        return ClusteringResult(
            clusters=[], labels=np.array([], dtype=int), num_clusters=0, silhouette_score=0.0
        )

    n = len(patterns)

    # Compute distance matrix
    dist_matrix = compute_distance_matrix(patterns, metric="levenshtein")

    # Threshold-based clustering
    labels = np.full(n, -1, dtype=int)
    cluster_id = 0

    for i in range(n):
        if labels[i] != -1:
            continue

        # Start new cluster
        cluster_members = [i]
        labels[i] = cluster_id

        # Find similar patterns
        for j in range(i + 1, n):
            if labels[j] != -1:
                continue

            # Check distance to cluster members
            max_dist = max(dist_matrix[j, m] for m in cluster_members)
            if max_dist <= threshold:
                cluster_members.append(j)
                labels[j] = cluster_id

        # Keep cluster if large enough
        if len(cluster_members) < min_cluster_size:
            for m in cluster_members:
                labels[m] = -1
        else:
            cluster_id += 1

    num_clusters = cluster_id

    # Build cluster results
    clusters = []
    for cid in range(num_clusters):
        cluster_indices = np.where(labels == cid)[0]
        cluster_patterns = [patterns[i] for i in cluster_indices]

        # Use most common pattern as centroid
        centroid = _compute_centroid_edit(cluster_patterns)

        # For variable-length patterns, analysis is limited;
        # pad to a common length for analysis
        max_len = max(len(p) for p in cluster_patterns)
        padded = [_to_array(p, target_length=max_len) for p in cluster_patterns]
        common, variable = _analyze_pattern_variance(padded)

        # Compute variance
        variance = (
            np.mean([dist_matrix[i, j] for i in cluster_indices for j in cluster_indices if i < j])
            if len(cluster_indices) > 1
            else 0.0
        )

        clusters.append(
            ClusterResult(
                cluster_id=cid,
                patterns=cluster_patterns,
                centroid=centroid,
                size=len(cluster_patterns),
                variance=float(variance),
                common_bytes=common,
                variable_bytes=variable,
            )
        )

    # Compute silhouette score
    silhouette = _compute_silhouette_score(dist_matrix, labels) if num_clusters > 1 else 0.0

    return ClusteringResult(
        clusters=clusters, labels=labels, num_clusters=num_clusters, silhouette_score=silhouette
    )


def cluster_hierarchical(
    patterns: list[bytes | np.ndarray[tuple[int], np.dtype[np.uint8]]],
    method: Literal["single", "complete", "average", "upgma"] = "upgma",
    num_clusters: int | None = None,
    distance_threshold: float | None = None,
) -> ClusteringResult:
    """Hierarchical clustering of patterns.

    Uses agglomerative hierarchical clustering with various linkage methods.

    Args:
        patterns: List of patterns
        method: Linkage method ('single', 'complete', 'average', 'upgma')
        num_clusters: Desired number of clusters (if None, use distance_threshold)
        distance_threshold: Distance threshold for cutting the dendrogram

    Returns:
        ClusteringResult with cluster assignments

    Raises:
        ValueError: If neither num_clusters nor distance_threshold is specified

    Examples:
        >>> patterns = [b"AAA", b"AAB", b"BBB", b"BBC"]
        >>> result = cluster_hierarchical(patterns, method='average', num_clusters=2)
    """
    if num_clusters is None and distance_threshold is None:
        raise ValueError("Must specify either num_clusters or distance_threshold")

    if not patterns:
        return ClusteringResult(
            clusters=[], labels=np.array([], dtype=int), num_clusters=0, silhouette_score=0.0
        )

    # Normalize method name ('upgma' is average linkage)
    if method == "upgma":
        method = "average"

    _n = len(patterns)

    # Compute distance matrix
    dist_matrix = compute_distance_matrix(patterns, metric="hamming")

    # Perform hierarchical clustering
    labels = _hierarchical_clustering(
        dist_matrix, method=method, num_clusters=num_clusters, distance_threshold=distance_threshold
    )

    # Count actual clusters
    unique_labels = set(labels[labels >= 0])
    num_clusters_actual = len(unique_labels)

    # Build cluster results
    clusters = []
    for cid in sorted(unique_labels):
        cluster_indices = np.where(labels == cid)[0]
        cluster_patterns = [patterns[i] for i in cluster_indices]

        # Compute centroid
        pattern_arrays = [_to_array(p) for p in cluster_patterns]
        if len({len(p) for p in pattern_arrays}) == 1:
            # Fixed length - use majority vote
            centroid_array = _compute_centroid_hamming(pattern_arrays)
            centroid = bytes(centroid_array) if isinstance(patterns[0], bytes) else centroid_array
        else:
            # Variable length - use most common
            centroid = _compute_centroid_edit(cluster_patterns)

        # Analyze variance
        max_len = max(len(p) for p in pattern_arrays)
        padded = [_to_array(p, target_length=max_len) for p in pattern_arrays]
        common, variable = _analyze_pattern_variance(padded)

        # Variance
        variance = (
            np.mean([dist_matrix[i, j] for i in cluster_indices for j in cluster_indices if i < j])
            if len(cluster_indices) > 1
            else 0.0
        )

        clusters.append(
            ClusterResult(
                cluster_id=cid,
                patterns=cluster_patterns,
                centroid=centroid,
                size=len(cluster_patterns),
                variance=float(variance),
                common_bytes=common,
                variable_bytes=variable,
            )
        )

    # Silhouette score
    silhouette = _compute_silhouette_score(dist_matrix, labels) if num_clusters_actual > 1 else 0.0

    return ClusteringResult(
        clusters=clusters,
        labels=labels,
        num_clusters=num_clusters_actual,
        silhouette_score=silhouette,
    )


def analyze_cluster(cluster: ClusterResult) -> dict[str, list[int] | list[float] | bytes]:
    """Analyze a cluster to find common vs variable regions.

    Performs detailed analysis of a cluster to identify byte positions
    that are constant vs. those that vary.

    Args:
        cluster: ClusterResult to analyze

    Returns:
        Dictionary with analysis results including:
        - common_bytes: List of byte positions that are constant
        - variable_bytes: List of byte positions that vary
        - entropy_per_byte: Entropy at each byte position
        - consensus: Majority-vote consensus pattern (most common byte at each position)

    Examples:
        >>> # Assume we have a cluster
        >>> analysis = analyze_cluster(cluster)
        >>> print(f"Common positions: {analysis['common_bytes']}")
    """
    if cluster.size == 0:
        return {"common_bytes": [], "variable_bytes": [], "entropy_per_byte": [], "consensus": b""}

    # Convert patterns to arrays
    pattern_arrays = [_to_array(p) for p in cluster.patterns]

    # Pad to same length
    max_len = max(len(p) for p in pattern_arrays)
    padded = [_to_array(p, target_length=max_len) for p in pattern_arrays]

    # Compute entropy per byte position
    entropy_per_byte = []
    for pos in range(max_len):
        byte_values = [p[pos] for p in padded]
        entropy = _compute_byte_entropy(byte_values)
        entropy_per_byte.append(entropy)

    # Threshold for "common" (low entropy)
    common_threshold = 0.1
    common_bytes = [i for i, e in enumerate(entropy_per_byte) if e < common_threshold]
    variable_bytes = [i for i, e in enumerate(entropy_per_byte) if e >= common_threshold]

    # Build consensus pattern
    consensus = np.zeros(max_len, dtype=np.uint8)
    for pos in range(max_len):
        byte_values = [p[pos] for p in padded]
        # Use most common byte
        consensus[pos] = max(set(byte_values), key=byte_values.count)

    return {
        "common_bytes": common_bytes,
        "variable_bytes": variable_bytes,
        "entropy_per_byte": entropy_per_byte,
        "consensus": bytes(consensus),
    }


def compute_distance_matrix(
    patterns: list[bytes | np.ndarray[tuple[int], np.dtype[np.uint8]]],
    metric: Literal["hamming", "levenshtein", "jaccard"] = "hamming",
) -> np.ndarray[tuple[int, int], np.dtype[np.float64]]:
    """Compute pairwise distance matrix.

    Computes all pairwise distances between patterns using the specified metric.

    Args:
        patterns: List of patterns
        metric: Distance metric ('hamming', 'levenshtein', 'jaccard')

    Returns:
        Symmetric distance matrix (n x n)

    Raises:
        ValueError: If unknown metric is specified

    Examples:
        >>> patterns = [b"ABC", b"ABD", b"XYZ"]
        >>> dist = compute_distance_matrix(patterns, metric='hamming')
        >>> assert dist.shape == (3, 3)
    """
    n = len(patterns)
    dist_matrix = np.zeros((n, n), dtype=float)

    for i in range(n):
        for j in range(i + 1, n):
            if metric == "hamming":
                dist = _hamming_distance(patterns[i], patterns[j])
            elif metric == "levenshtein":
                dist = _edit_distance(patterns[i], patterns[j])
            elif metric == "jaccard":
                dist = _jaccard_distance(patterns[i], patterns[j])
            else:
                raise ValueError(f"Unknown metric: {metric}")

            dist_matrix[i, j] = dist
            dist_matrix[j, i] = dist

    return dist_matrix


# Helper functions


def _to_array(
    data: bytes | np.ndarray[tuple[int], np.dtype[np.uint8]] | memoryview | bytearray,
    target_length: int | None = None,
) -> np.ndarray[tuple[int], np.dtype[np.uint8]]:
    """Convert to numpy array, optionally padding to target length.

    Args:
        data: Input data (bytes, bytearray, memoryview, or numpy array)
        target_length: If specified, pad to this length

    Returns:
        Numpy array of uint8

    Raises:
        TypeError: If data type is not supported
    """
    if isinstance(data, bytes):
        arr = np.frombuffer(data, dtype=np.uint8)
    elif isinstance(data, bytearray | memoryview):
        arr = np.frombuffer(bytes(data), dtype=np.uint8)
    elif isinstance(data, np.ndarray):
        arr = data.astype(np.uint8)
    else:
        raise TypeError(f"Unsupported type: {type(data)}")

    if target_length is not None and len(arr) < target_length:
        # Pad with zeros
        padded = np.zeros(target_length, dtype=np.uint8)
        padded[: len(arr)] = arr
        return padded

    return arr


def _hamming_distance(
    a: bytes | np.ndarray[tuple[int], np.dtype[np.uint8]],
    b: bytes | np.ndarray[tuple[int], np.dtype[np.uint8]],
) -> float:
    """Compute normalized Hamming distance."""
    arr_a = _to_array(a)
    arr_b = _to_array(b)

    if len(arr_a) != len(arr_b):
        # Pad shorter to match longer
        max_len = max(len(arr_a), len(arr_b))
        arr_a = _to_array(a, target_length=max_len)
        arr_b = _to_array(b, target_length=max_len)

    # Count differing byte positions
    differences = np.sum(arr_a != arr_b)
    return float(differences) / len(arr_a)


def _edit_distance(
    a: bytes | np.ndarray[tuple[int], np.dtype[np.uint8]],
    b: bytes | np.ndarray[tuple[int], np.dtype[np.uint8]],
) -> float:
    """Compute normalized Levenshtein edit distance."""
    bytes_a = bytes(a) if isinstance(a, np.ndarray) else a
    bytes_b = bytes(b) if isinstance(b, np.ndarray) else b

    m, n = len(bytes_a), len(bytes_b)

    if m == 0 and n == 0:
        return 0.0
    if m == 0:
        return 1.0
    if n == 0:
        return 1.0

    # DP table (two-row rolling form)
    prev_row = list(range(n + 1))
    curr_row = [0] * (n + 1)

    for i in range(1, m + 1):
        curr_row[0] = i
        for j in range(1, n + 1):
            if bytes_a[i - 1] == bytes_b[j - 1]:
                curr_row[j] = prev_row[j - 1]
            else:
                curr_row[j] = 1 + min(prev_row[j], curr_row[j - 1], prev_row[j - 1])
        prev_row, curr_row = curr_row, prev_row

    # Normalize by max length
    return prev_row[n] / max(m, n)


def _jaccard_distance(
    a: bytes | np.ndarray[tuple[int], np.dtype[np.uint8]],
    b: bytes | np.ndarray[tuple[int], np.dtype[np.uint8]],
) -> float:
    """Compute Jaccard distance based on byte sets."""
    set_a = set(_to_array(a))
    set_b = set(_to_array(b))

    if len(set_a) == 0 and len(set_b) == 0:
        return 0.0

    intersection = len(set_a & set_b)
    union = len(set_a | set_b)

    if union == 0:
        return 0.0

    # Jaccard distance = 1 - Jaccard similarity
    return 1.0 - (intersection / union)


def _compute_centroid_hamming(
    patterns: list[np.ndarray[tuple[int], np.dtype[np.uint8]]],
) -> np.ndarray[tuple[int], np.dtype[np.uint8]]:
    """Compute centroid using majority vote (for fixed-length patterns)."""
    if not patterns:
        return np.array([], dtype=np.uint8)

    _n = len(patterns)
    length = len(patterns[0])

    centroid = np.zeros(length, dtype=np.uint8)
    for pos in range(length):
        bytes_at_pos = [p[pos] for p in patterns]
        # Most common byte
        centroid[pos] = max(set(bytes_at_pos), key=bytes_at_pos.count)

    return centroid


def _compute_centroid_edit(
    patterns: list[bytes | np.ndarray[tuple[int], np.dtype[np.uint8]]],
) -> bytes | np.ndarray[tuple[int], np.dtype[np.uint8]]:
    """Compute centroid for variable-length patterns (most central pattern)."""
    if not patterns:
        # Nothing to vote on; return empty bytes (indexing patterns[0] here
        # would raise IndexError on an empty list)
        return b""

    # Use most common pattern as centroid
    from collections import Counter

    pattern_counts = Counter(bytes(p) if isinstance(p, np.ndarray) else p for p in patterns)
    most_common = pattern_counts.most_common(1)[0][0]

    # Return in original type
    if isinstance(patterns[0], bytes):
        return most_common
    else:
        return np.frombuffer(most_common, dtype=np.uint8)


def _analyze_pattern_variance(
    patterns: list[np.ndarray[tuple[int], np.dtype[np.uint8]]],
) -> tuple[list[int], list[int]]:
    """Analyze which byte positions are common vs variable."""
    if not patterns:
        return [], []

    length = len(patterns[0])
    common_bytes = []
    variable_bytes = []

    for pos in range(length):
        bytes_at_pos = [p[pos] for p in patterns]
        unique_values = len(set(bytes_at_pos))

        if unique_values == 1:
            common_bytes.append(pos)
        else:
            variable_bytes.append(pos)

    return common_bytes, variable_bytes


def _compute_byte_entropy(byte_values: list[int]) -> float:
    """Compute Shannon entropy of byte values."""
    if not byte_values:
        return 0.0

    from collections import Counter

    counts = Counter(byte_values)
    n = len(byte_values)

    entropy = 0.0
    for count in counts.values():
        if count > 0:
            prob = count / n
            entropy -= prob * np.log2(prob)

    return entropy


def _compute_silhouette_score(
    dist_matrix: np.ndarray[tuple[int, int], np.dtype[np.float64]],
    labels: np.ndarray[tuple[int], np.dtype[np.int_]],
) -> float:
    """Compute average silhouette score for clustering quality."""
    n = len(labels)
    if n <= 1:
        return 0.0

    # Filter out noise points (-1 labels)
    valid_mask = labels >= 0
    if np.sum(valid_mask) <= 1:
        return 0.0

    unique_labels = set(labels[valid_mask])
    if len(unique_labels) <= 1:
        return 0.0

    silhouette_scores = []

    for i in range(n):
        if labels[i] == -1:
            continue

        # a(i): average distance to points in same cluster
        same_cluster = (labels == labels[i]) & (np.arange(n) != i)
        if np.sum(same_cluster) == 0:
            continue

        a_i = np.mean(dist_matrix[i, same_cluster])

        # b(i): minimum average distance to points in other clusters
        b_i = float("inf")
        for other_label in unique_labels:
            if other_label == labels[i]:
                continue

            other_cluster = labels == other_label
            if np.sum(other_cluster) > 0:
                avg_dist = np.mean(dist_matrix[i, other_cluster])
                b_i = min(b_i, avg_dist)

        # Silhouette coefficient
        if b_i == float("inf"):
            s_i = 0.0
        else:
            s_i = (b_i - a_i) / max(a_i, b_i)

        silhouette_scores.append(s_i)

    return float(np.mean(silhouette_scores)) if silhouette_scores else 0.0


def _hierarchical_clustering(
    dist_matrix: np.ndarray[tuple[int, int], np.dtype[np.float64]],
    method: str,
    num_clusters: int | None,
    distance_threshold: float | None,
) -> np.ndarray[tuple[int], np.dtype[np.int_]]:
    """Perform agglomerative hierarchical clustering."""
    n = dist_matrix.shape[0]

    # Initialize: each point is its own cluster
    clusters = [[i] for i in range(n)]
    _cluster_distances = dist_matrix.copy()

    # Merge until desired number of clusters
    while len(clusters) > 1:
        if num_clusters is not None and len(clusters) <= num_clusters:
            break

        # Find closest pair of clusters
        min_dist = float("inf")
        merge_i, merge_j = -1, -1

        for i in range(len(clusters)):
            for j in range(i + 1, len(clusters)):
                # Compute inter-cluster distance
                dist = _linkage_distance(clusters[i], clusters[j], dist_matrix, method)

                if dist < min_dist:
                    min_dist = dist
                    merge_i, merge_j = i, j

        # Check distance threshold
        if distance_threshold is not None and min_dist > distance_threshold:
            break

        # Merge clusters
        if merge_i >= 0 and merge_j >= 0:
            clusters[merge_i].extend(clusters[merge_j])
            del clusters[merge_j]

    # Assign labels
    labels = np.full(n, -1, dtype=int)
    for cid, cluster in enumerate(clusters):
        for idx in cluster:
            labels[idx] = cid

    return labels


def _linkage_distance(
    cluster_a: list[int],
    cluster_b: list[int],
    dist_matrix: np.ndarray[tuple[int, int], np.dtype[np.float64]],
    method: str,
) -> float:
    """Compute distance between two clusters using linkage method."""
    distances = [dist_matrix[i, j] for i in cluster_a for j in cluster_b]

    if not distances:
        return 0.0

    if method == "single":
        return float(min(distances))
    elif method == "complete":
        return float(max(distances))
    elif method == "average":
        return float(np.mean(distances))
    else:
        return float(np.mean(distances))  # Default to average


class PatternClusterer:
    """Object-oriented wrapper for pattern clustering functionality.

    Provides a class-based interface for clustering operations,
    wrapping the functional API for consistency with test expectations.

    Example:
        >>> clusterer = PatternClusterer(n_clusters=3)
        >>> labels = clusterer.cluster(messages)
    """

    def __init__(
        self,
        n_clusters: int = 3,
        method: Literal["hamming", "edit", "hierarchical"] = "hamming",
        distance_metric: Literal["hamming", "levenshtein", "jaccard"] = "hamming",
        threshold: float = 0.3,
        min_cluster_size: int = 2,
    ):
        """Initialize pattern clusterer.

        Args:
            n_clusters: Desired number of clusters.
            method: Clustering method ('hamming', 'edit', or 'hierarchical').
            distance_metric: Distance metric to use.
            threshold: Distance threshold for clustering.
            min_cluster_size: Minimum patterns per cluster.
        """
        self.n_clusters = n_clusters
        self.method = method
        self.distance_metric = distance_metric
        self.threshold = threshold
        self.min_cluster_size = min_cluster_size
        self.result_: ClusteringResult | None = None

    def cluster(
        self, patterns: list[bytes | np.ndarray[tuple[int], np.dtype[np.uint8]]]
    ) -> np.ndarray[tuple[int], np.dtype[np.int_]]:
        """Cluster patterns and return labels.

        Args:
            patterns: List of patterns to cluster.

        Returns:
            Array of cluster labels (one per pattern).

        Example:
            >>> clusterer = PatternClusterer(n_clusters=3)
            >>> labels = clusterer.cluster(messages)
        """
        if self.method == "hamming":
            self.result_ = cluster_by_hamming(
                patterns, threshold=self.threshold, min_cluster_size=self.min_cluster_size
            )
        elif self.method == "edit":
            self.result_ = cluster_by_edit_distance(
                patterns, threshold=self.threshold, min_cluster_size=self.min_cluster_size
            )
        else:  # hierarchical or default
            self.result_ = cluster_hierarchical(
                patterns, method="average", num_clusters=self.n_clusters
            )

        return self.result_.labels

    def fit(
        self, patterns: list[bytes | np.ndarray[tuple[int], np.dtype[np.uint8]]]
    ) -> "PatternClusterer":
        """Fit the clusterer to patterns (sklearn-style interface).

        Args:
            patterns: List of patterns to cluster.

        Returns:
            Self (for method chaining).
        """
        self.cluster(patterns)
        return self

    def fit_predict(
        self, patterns: list[bytes | np.ndarray[tuple[int], np.dtype[np.uint8]]]
    ) -> np.ndarray[tuple[int], np.dtype[np.int_]]:
        """Fit and return cluster labels (sklearn-style interface).

        Args:
            patterns: List of patterns to cluster.

        Returns:
            Array of cluster labels.
        """
        return self.cluster(patterns)

    def get_clusters(self) -> list[ClusterResult]:
        """Get detailed cluster results.

        Returns:
            List of ClusterResult objects with full cluster analysis.

        Raises:
            ValueError: If cluster() hasn't been called yet.
        """
        if self.result_ is None:
            raise ValueError("Must call cluster() before get_clusters()")
        return self.result_.clusters

    def get_silhouette_score(self) -> float:
        """Get silhouette score for clustering quality.

        Returns:
            Silhouette score (-1 to 1, higher is better).

        Raises:
            ValueError: If cluster() hasn't been called yet.
        """
        if self.result_ is None:
            raise ValueError("Must call cluster() before get_silhouette_score()")
        return self.result_.silhouette_score