truthound 1.0.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- truthound/__init__.py +162 -0
- truthound/adapters.py +100 -0
- truthound/api.py +365 -0
- truthound/audit/__init__.py +248 -0
- truthound/audit/core.py +967 -0
- truthound/audit/filters.py +620 -0
- truthound/audit/formatters.py +707 -0
- truthound/audit/logger.py +902 -0
- truthound/audit/middleware.py +571 -0
- truthound/audit/storage.py +1083 -0
- truthound/benchmark/__init__.py +123 -0
- truthound/benchmark/base.py +757 -0
- truthound/benchmark/comparison.py +635 -0
- truthound/benchmark/generators.py +706 -0
- truthound/benchmark/reporters.py +718 -0
- truthound/benchmark/runner.py +635 -0
- truthound/benchmark/scenarios.py +712 -0
- truthound/cache.py +252 -0
- truthound/checkpoint/__init__.py +136 -0
- truthound/checkpoint/actions/__init__.py +164 -0
- truthound/checkpoint/actions/base.py +324 -0
- truthound/checkpoint/actions/custom.py +234 -0
- truthound/checkpoint/actions/discord_notify.py +290 -0
- truthound/checkpoint/actions/email_notify.py +405 -0
- truthound/checkpoint/actions/github_action.py +406 -0
- truthound/checkpoint/actions/opsgenie.py +1499 -0
- truthound/checkpoint/actions/pagerduty.py +226 -0
- truthound/checkpoint/actions/slack_notify.py +233 -0
- truthound/checkpoint/actions/store_result.py +249 -0
- truthound/checkpoint/actions/teams_notify.py +1570 -0
- truthound/checkpoint/actions/telegram_notify.py +419 -0
- truthound/checkpoint/actions/update_docs.py +552 -0
- truthound/checkpoint/actions/webhook.py +293 -0
- truthound/checkpoint/analytics/__init__.py +147 -0
- truthound/checkpoint/analytics/aggregations/__init__.py +23 -0
- truthound/checkpoint/analytics/aggregations/rollup.py +481 -0
- truthound/checkpoint/analytics/aggregations/time_bucket.py +306 -0
- truthound/checkpoint/analytics/analyzers/__init__.py +17 -0
- truthound/checkpoint/analytics/analyzers/anomaly.py +386 -0
- truthound/checkpoint/analytics/analyzers/base.py +270 -0
- truthound/checkpoint/analytics/analyzers/forecast.py +421 -0
- truthound/checkpoint/analytics/analyzers/trend.py +314 -0
- truthound/checkpoint/analytics/models.py +292 -0
- truthound/checkpoint/analytics/protocols.py +549 -0
- truthound/checkpoint/analytics/service.py +718 -0
- truthound/checkpoint/analytics/stores/__init__.py +16 -0
- truthound/checkpoint/analytics/stores/base.py +306 -0
- truthound/checkpoint/analytics/stores/memory_store.py +353 -0
- truthound/checkpoint/analytics/stores/sqlite_store.py +557 -0
- truthound/checkpoint/analytics/stores/timescale_store.py +501 -0
- truthound/checkpoint/async_actions.py +794 -0
- truthound/checkpoint/async_base.py +708 -0
- truthound/checkpoint/async_checkpoint.py +617 -0
- truthound/checkpoint/async_runner.py +639 -0
- truthound/checkpoint/checkpoint.py +527 -0
- truthound/checkpoint/ci/__init__.py +61 -0
- truthound/checkpoint/ci/detector.py +355 -0
- truthound/checkpoint/ci/reporter.py +436 -0
- truthound/checkpoint/ci/templates.py +454 -0
- truthound/checkpoint/circuitbreaker/__init__.py +133 -0
- truthound/checkpoint/circuitbreaker/breaker.py +542 -0
- truthound/checkpoint/circuitbreaker/core.py +252 -0
- truthound/checkpoint/circuitbreaker/detection.py +459 -0
- truthound/checkpoint/circuitbreaker/middleware.py +389 -0
- truthound/checkpoint/circuitbreaker/registry.py +357 -0
- truthound/checkpoint/distributed/__init__.py +139 -0
- truthound/checkpoint/distributed/backends/__init__.py +35 -0
- truthound/checkpoint/distributed/backends/celery_backend.py +503 -0
- truthound/checkpoint/distributed/backends/kubernetes_backend.py +696 -0
- truthound/checkpoint/distributed/backends/local_backend.py +397 -0
- truthound/checkpoint/distributed/backends/ray_backend.py +625 -0
- truthound/checkpoint/distributed/base.py +774 -0
- truthound/checkpoint/distributed/orchestrator.py +765 -0
- truthound/checkpoint/distributed/protocols.py +842 -0
- truthound/checkpoint/distributed/registry.py +449 -0
- truthound/checkpoint/idempotency/__init__.py +120 -0
- truthound/checkpoint/idempotency/core.py +295 -0
- truthound/checkpoint/idempotency/fingerprint.py +454 -0
- truthound/checkpoint/idempotency/locking.py +604 -0
- truthound/checkpoint/idempotency/service.py +592 -0
- truthound/checkpoint/idempotency/stores.py +653 -0
- truthound/checkpoint/monitoring/__init__.py +134 -0
- truthound/checkpoint/monitoring/aggregators/__init__.py +15 -0
- truthound/checkpoint/monitoring/aggregators/base.py +372 -0
- truthound/checkpoint/monitoring/aggregators/realtime.py +300 -0
- truthound/checkpoint/monitoring/aggregators/window.py +493 -0
- truthound/checkpoint/monitoring/collectors/__init__.py +17 -0
- truthound/checkpoint/monitoring/collectors/base.py +257 -0
- truthound/checkpoint/monitoring/collectors/memory_collector.py +617 -0
- truthound/checkpoint/monitoring/collectors/prometheus_collector.py +451 -0
- truthound/checkpoint/monitoring/collectors/redis_collector.py +518 -0
- truthound/checkpoint/monitoring/events.py +410 -0
- truthound/checkpoint/monitoring/protocols.py +636 -0
- truthound/checkpoint/monitoring/service.py +578 -0
- truthound/checkpoint/monitoring/views/__init__.py +17 -0
- truthound/checkpoint/monitoring/views/base.py +172 -0
- truthound/checkpoint/monitoring/views/queue_view.py +220 -0
- truthound/checkpoint/monitoring/views/task_view.py +240 -0
- truthound/checkpoint/monitoring/views/worker_view.py +263 -0
- truthound/checkpoint/registry.py +337 -0
- truthound/checkpoint/runner.py +356 -0
- truthound/checkpoint/transaction/__init__.py +133 -0
- truthound/checkpoint/transaction/base.py +389 -0
- truthound/checkpoint/transaction/compensatable.py +537 -0
- truthound/checkpoint/transaction/coordinator.py +576 -0
- truthound/checkpoint/transaction/executor.py +622 -0
- truthound/checkpoint/transaction/idempotency.py +534 -0
- truthound/checkpoint/transaction/saga/__init__.py +143 -0
- truthound/checkpoint/transaction/saga/builder.py +584 -0
- truthound/checkpoint/transaction/saga/definition.py +515 -0
- truthound/checkpoint/transaction/saga/event_store.py +542 -0
- truthound/checkpoint/transaction/saga/patterns.py +833 -0
- truthound/checkpoint/transaction/saga/runner.py +718 -0
- truthound/checkpoint/transaction/saga/state_machine.py +793 -0
- truthound/checkpoint/transaction/saga/strategies.py +780 -0
- truthound/checkpoint/transaction/saga/testing.py +886 -0
- truthound/checkpoint/triggers/__init__.py +58 -0
- truthound/checkpoint/triggers/base.py +237 -0
- truthound/checkpoint/triggers/event.py +385 -0
- truthound/checkpoint/triggers/schedule.py +355 -0
- truthound/cli.py +2358 -0
- truthound/cli_modules/__init__.py +124 -0
- truthound/cli_modules/advanced/__init__.py +45 -0
- truthound/cli_modules/advanced/benchmark.py +343 -0
- truthound/cli_modules/advanced/docs.py +225 -0
- truthound/cli_modules/advanced/lineage.py +209 -0
- truthound/cli_modules/advanced/ml.py +320 -0
- truthound/cli_modules/advanced/realtime.py +196 -0
- truthound/cli_modules/checkpoint/__init__.py +46 -0
- truthound/cli_modules/checkpoint/init.py +114 -0
- truthound/cli_modules/checkpoint/list.py +71 -0
- truthound/cli_modules/checkpoint/run.py +159 -0
- truthound/cli_modules/checkpoint/validate.py +67 -0
- truthound/cli_modules/common/__init__.py +71 -0
- truthound/cli_modules/common/errors.py +414 -0
- truthound/cli_modules/common/options.py +419 -0
- truthound/cli_modules/common/output.py +507 -0
- truthound/cli_modules/common/protocol.py +552 -0
- truthound/cli_modules/core/__init__.py +48 -0
- truthound/cli_modules/core/check.py +123 -0
- truthound/cli_modules/core/compare.py +104 -0
- truthound/cli_modules/core/learn.py +57 -0
- truthound/cli_modules/core/mask.py +77 -0
- truthound/cli_modules/core/profile.py +65 -0
- truthound/cli_modules/core/scan.py +61 -0
- truthound/cli_modules/profiler/__init__.py +51 -0
- truthound/cli_modules/profiler/auto_profile.py +175 -0
- truthound/cli_modules/profiler/metadata.py +107 -0
- truthound/cli_modules/profiler/suite.py +283 -0
- truthound/cli_modules/registry.py +431 -0
- truthound/cli_modules/scaffolding/__init__.py +89 -0
- truthound/cli_modules/scaffolding/base.py +631 -0
- truthound/cli_modules/scaffolding/commands.py +545 -0
- truthound/cli_modules/scaffolding/plugins.py +1072 -0
- truthound/cli_modules/scaffolding/reporters.py +594 -0
- truthound/cli_modules/scaffolding/validators.py +1127 -0
- truthound/common/__init__.py +18 -0
- truthound/common/resilience/__init__.py +130 -0
- truthound/common/resilience/bulkhead.py +266 -0
- truthound/common/resilience/circuit_breaker.py +516 -0
- truthound/common/resilience/composite.py +332 -0
- truthound/common/resilience/config.py +292 -0
- truthound/common/resilience/protocols.py +217 -0
- truthound/common/resilience/rate_limiter.py +404 -0
- truthound/common/resilience/retry.py +341 -0
- truthound/datadocs/__init__.py +260 -0
- truthound/datadocs/base.py +571 -0
- truthound/datadocs/builder.py +761 -0
- truthound/datadocs/charts.py +764 -0
- truthound/datadocs/dashboard/__init__.py +63 -0
- truthound/datadocs/dashboard/app.py +576 -0
- truthound/datadocs/dashboard/components.py +584 -0
- truthound/datadocs/dashboard/state.py +240 -0
- truthound/datadocs/engine/__init__.py +46 -0
- truthound/datadocs/engine/context.py +376 -0
- truthound/datadocs/engine/pipeline.py +618 -0
- truthound/datadocs/engine/registry.py +469 -0
- truthound/datadocs/exporters/__init__.py +49 -0
- truthound/datadocs/exporters/base.py +198 -0
- truthound/datadocs/exporters/html.py +178 -0
- truthound/datadocs/exporters/json_exporter.py +253 -0
- truthound/datadocs/exporters/markdown.py +284 -0
- truthound/datadocs/exporters/pdf.py +392 -0
- truthound/datadocs/i18n/__init__.py +86 -0
- truthound/datadocs/i18n/catalog.py +960 -0
- truthound/datadocs/i18n/formatting.py +505 -0
- truthound/datadocs/i18n/loader.py +256 -0
- truthound/datadocs/i18n/plurals.py +378 -0
- truthound/datadocs/renderers/__init__.py +42 -0
- truthound/datadocs/renderers/base.py +401 -0
- truthound/datadocs/renderers/custom.py +342 -0
- truthound/datadocs/renderers/jinja.py +697 -0
- truthound/datadocs/sections.py +736 -0
- truthound/datadocs/styles.py +931 -0
- truthound/datadocs/themes/__init__.py +101 -0
- truthound/datadocs/themes/base.py +336 -0
- truthound/datadocs/themes/default.py +417 -0
- truthound/datadocs/themes/enterprise.py +419 -0
- truthound/datadocs/themes/loader.py +336 -0
- truthound/datadocs/themes.py +301 -0
- truthound/datadocs/transformers/__init__.py +57 -0
- truthound/datadocs/transformers/base.py +268 -0
- truthound/datadocs/transformers/enrichers.py +544 -0
- truthound/datadocs/transformers/filters.py +447 -0
- truthound/datadocs/transformers/i18n.py +468 -0
- truthound/datadocs/versioning/__init__.py +62 -0
- truthound/datadocs/versioning/diff.py +639 -0
- truthound/datadocs/versioning/storage.py +497 -0
- truthound/datadocs/versioning/version.py +358 -0
- truthound/datasources/__init__.py +223 -0
- truthound/datasources/_async_protocols.py +222 -0
- truthound/datasources/_protocols.py +159 -0
- truthound/datasources/adapters.py +428 -0
- truthound/datasources/async_base.py +599 -0
- truthound/datasources/async_factory.py +511 -0
- truthound/datasources/base.py +516 -0
- truthound/datasources/factory.py +433 -0
- truthound/datasources/nosql/__init__.py +47 -0
- truthound/datasources/nosql/base.py +487 -0
- truthound/datasources/nosql/elasticsearch.py +801 -0
- truthound/datasources/nosql/mongodb.py +636 -0
- truthound/datasources/pandas_optimized.py +582 -0
- truthound/datasources/pandas_source.py +216 -0
- truthound/datasources/polars_source.py +395 -0
- truthound/datasources/spark_source.py +479 -0
- truthound/datasources/sql/__init__.py +154 -0
- truthound/datasources/sql/base.py +710 -0
- truthound/datasources/sql/bigquery.py +410 -0
- truthound/datasources/sql/cloud_base.py +199 -0
- truthound/datasources/sql/databricks.py +471 -0
- truthound/datasources/sql/mysql.py +316 -0
- truthound/datasources/sql/oracle.py +427 -0
- truthound/datasources/sql/postgresql.py +321 -0
- truthound/datasources/sql/redshift.py +479 -0
- truthound/datasources/sql/snowflake.py +439 -0
- truthound/datasources/sql/sqlite.py +286 -0
- truthound/datasources/sql/sqlserver.py +437 -0
- truthound/datasources/streaming/__init__.py +47 -0
- truthound/datasources/streaming/base.py +350 -0
- truthound/datasources/streaming/kafka.py +670 -0
- truthound/decorators.py +98 -0
- truthound/docs/__init__.py +69 -0
- truthound/docs/extractor.py +971 -0
- truthound/docs/generator.py +601 -0
- truthound/docs/parser.py +1037 -0
- truthound/docs/renderer.py +999 -0
- truthound/drift/__init__.py +22 -0
- truthound/drift/compare.py +189 -0
- truthound/drift/detectors.py +464 -0
- truthound/drift/report.py +160 -0
- truthound/execution/__init__.py +65 -0
- truthound/execution/_protocols.py +324 -0
- truthound/execution/base.py +576 -0
- truthound/execution/distributed/__init__.py +179 -0
- truthound/execution/distributed/aggregations.py +731 -0
- truthound/execution/distributed/arrow_bridge.py +817 -0
- truthound/execution/distributed/base.py +550 -0
- truthound/execution/distributed/dask_engine.py +976 -0
- truthound/execution/distributed/mixins.py +766 -0
- truthound/execution/distributed/protocols.py +756 -0
- truthound/execution/distributed/ray_engine.py +1127 -0
- truthound/execution/distributed/registry.py +446 -0
- truthound/execution/distributed/spark_engine.py +1011 -0
- truthound/execution/distributed/validator_adapter.py +682 -0
- truthound/execution/pandas_engine.py +401 -0
- truthound/execution/polars_engine.py +497 -0
- truthound/execution/pushdown/__init__.py +230 -0
- truthound/execution/pushdown/ast.py +1550 -0
- truthound/execution/pushdown/builder.py +1550 -0
- truthound/execution/pushdown/dialects.py +1072 -0
- truthound/execution/pushdown/executor.py +829 -0
- truthound/execution/pushdown/optimizer.py +1041 -0
- truthound/execution/sql_engine.py +518 -0
- truthound/infrastructure/__init__.py +189 -0
- truthound/infrastructure/audit.py +1515 -0
- truthound/infrastructure/config.py +1133 -0
- truthound/infrastructure/encryption.py +1132 -0
- truthound/infrastructure/logging.py +1503 -0
- truthound/infrastructure/metrics.py +1220 -0
- truthound/lineage/__init__.py +89 -0
- truthound/lineage/base.py +746 -0
- truthound/lineage/impact_analysis.py +474 -0
- truthound/lineage/integrations/__init__.py +22 -0
- truthound/lineage/integrations/openlineage.py +548 -0
- truthound/lineage/tracker.py +512 -0
- truthound/lineage/visualization/__init__.py +33 -0
- truthound/lineage/visualization/protocols.py +145 -0
- truthound/lineage/visualization/renderers/__init__.py +20 -0
- truthound/lineage/visualization/renderers/cytoscape.py +329 -0
- truthound/lineage/visualization/renderers/d3.py +331 -0
- truthound/lineage/visualization/renderers/graphviz.py +276 -0
- truthound/lineage/visualization/renderers/mermaid.py +308 -0
- truthound/maskers.py +113 -0
- truthound/ml/__init__.py +124 -0
- truthound/ml/anomaly_models/__init__.py +31 -0
- truthound/ml/anomaly_models/ensemble.py +362 -0
- truthound/ml/anomaly_models/isolation_forest.py +444 -0
- truthound/ml/anomaly_models/statistical.py +392 -0
- truthound/ml/base.py +1178 -0
- truthound/ml/drift_detection/__init__.py +26 -0
- truthound/ml/drift_detection/concept.py +381 -0
- truthound/ml/drift_detection/distribution.py +361 -0
- truthound/ml/drift_detection/feature.py +442 -0
- truthound/ml/drift_detection/multivariate.py +495 -0
- truthound/ml/monitoring/__init__.py +88 -0
- truthound/ml/monitoring/alerting/__init__.py +33 -0
- truthound/ml/monitoring/alerting/handlers.py +427 -0
- truthound/ml/monitoring/alerting/rules.py +508 -0
- truthound/ml/monitoring/collectors/__init__.py +19 -0
- truthound/ml/monitoring/collectors/composite.py +105 -0
- truthound/ml/monitoring/collectors/drift.py +324 -0
- truthound/ml/monitoring/collectors/performance.py +179 -0
- truthound/ml/monitoring/collectors/quality.py +369 -0
- truthound/ml/monitoring/monitor.py +536 -0
- truthound/ml/monitoring/protocols.py +451 -0
- truthound/ml/monitoring/stores/__init__.py +15 -0
- truthound/ml/monitoring/stores/memory.py +201 -0
- truthound/ml/monitoring/stores/prometheus.py +296 -0
- truthound/ml/rule_learning/__init__.py +25 -0
- truthound/ml/rule_learning/constraint_miner.py +443 -0
- truthound/ml/rule_learning/pattern_learner.py +499 -0
- truthound/ml/rule_learning/profile_learner.py +462 -0
- truthound/multitenancy/__init__.py +326 -0
- truthound/multitenancy/core.py +852 -0
- truthound/multitenancy/integration.py +597 -0
- truthound/multitenancy/isolation.py +630 -0
- truthound/multitenancy/manager.py +770 -0
- truthound/multitenancy/middleware.py +765 -0
- truthound/multitenancy/quota.py +537 -0
- truthound/multitenancy/resolvers.py +603 -0
- truthound/multitenancy/storage.py +703 -0
- truthound/observability/__init__.py +307 -0
- truthound/observability/context.py +531 -0
- truthound/observability/instrumentation.py +611 -0
- truthound/observability/logging.py +887 -0
- truthound/observability/metrics.py +1157 -0
- truthound/observability/tracing/__init__.py +178 -0
- truthound/observability/tracing/baggage.py +310 -0
- truthound/observability/tracing/config.py +426 -0
- truthound/observability/tracing/exporter.py +787 -0
- truthound/observability/tracing/integration.py +1018 -0
- truthound/observability/tracing/otel/__init__.py +146 -0
- truthound/observability/tracing/otel/adapter.py +982 -0
- truthound/observability/tracing/otel/bridge.py +1177 -0
- truthound/observability/tracing/otel/compat.py +681 -0
- truthound/observability/tracing/otel/config.py +691 -0
- truthound/observability/tracing/otel/detection.py +327 -0
- truthound/observability/tracing/otel/protocols.py +426 -0
- truthound/observability/tracing/processor.py +561 -0
- truthound/observability/tracing/propagator.py +757 -0
- truthound/observability/tracing/provider.py +569 -0
- truthound/observability/tracing/resource.py +515 -0
- truthound/observability/tracing/sampler.py +487 -0
- truthound/observability/tracing/span.py +676 -0
- truthound/plugins/__init__.py +198 -0
- truthound/plugins/base.py +599 -0
- truthound/plugins/cli.py +680 -0
- truthound/plugins/dependencies/__init__.py +42 -0
- truthound/plugins/dependencies/graph.py +422 -0
- truthound/plugins/dependencies/resolver.py +417 -0
- truthound/plugins/discovery.py +379 -0
- truthound/plugins/docs/__init__.py +46 -0
- truthound/plugins/docs/extractor.py +444 -0
- truthound/plugins/docs/renderer.py +499 -0
- truthound/plugins/enterprise_manager.py +877 -0
- truthound/plugins/examples/__init__.py +19 -0
- truthound/plugins/examples/custom_validators.py +317 -0
- truthound/plugins/examples/slack_notifier.py +312 -0
- truthound/plugins/examples/xml_reporter.py +254 -0
- truthound/plugins/hooks.py +558 -0
- truthound/plugins/lifecycle/__init__.py +43 -0
- truthound/plugins/lifecycle/hot_reload.py +402 -0
- truthound/plugins/lifecycle/manager.py +371 -0
- truthound/plugins/manager.py +736 -0
- truthound/plugins/registry.py +338 -0
- truthound/plugins/security/__init__.py +93 -0
- truthound/plugins/security/exceptions.py +332 -0
- truthound/plugins/security/policies.py +348 -0
- truthound/plugins/security/protocols.py +643 -0
- truthound/plugins/security/sandbox/__init__.py +45 -0
- truthound/plugins/security/sandbox/context.py +158 -0
- truthound/plugins/security/sandbox/engines/__init__.py +19 -0
- truthound/plugins/security/sandbox/engines/container.py +379 -0
- truthound/plugins/security/sandbox/engines/noop.py +144 -0
- truthound/plugins/security/sandbox/engines/process.py +336 -0
- truthound/plugins/security/sandbox/factory.py +211 -0
- truthound/plugins/security/signing/__init__.py +57 -0
- truthound/plugins/security/signing/service.py +330 -0
- truthound/plugins/security/signing/trust_store.py +368 -0
- truthound/plugins/security/signing/verifier.py +459 -0
- truthound/plugins/versioning/__init__.py +41 -0
- truthound/plugins/versioning/constraints.py +297 -0
- truthound/plugins/versioning/resolver.py +329 -0
- truthound/profiler/__init__.py +1729 -0
- truthound/profiler/_lazy.py +452 -0
- truthound/profiler/ab_testing/__init__.py +80 -0
- truthound/profiler/ab_testing/analysis.py +449 -0
- truthound/profiler/ab_testing/base.py +257 -0
- truthound/profiler/ab_testing/experiment.py +395 -0
- truthound/profiler/ab_testing/tracking.py +368 -0
- truthound/profiler/auto_threshold.py +1170 -0
- truthound/profiler/base.py +579 -0
- truthound/profiler/cache_patterns.py +911 -0
- truthound/profiler/caching.py +1303 -0
- truthound/profiler/column_profiler.py +712 -0
- truthound/profiler/comparison.py +1007 -0
- truthound/profiler/custom_patterns.py +1170 -0
- truthound/profiler/dashboard/__init__.py +50 -0
- truthound/profiler/dashboard/app.py +476 -0
- truthound/profiler/dashboard/components.py +457 -0
- truthound/profiler/dashboard/config.py +72 -0
- truthound/profiler/distributed/__init__.py +83 -0
- truthound/profiler/distributed/base.py +281 -0
- truthound/profiler/distributed/dask_backend.py +498 -0
- truthound/profiler/distributed/local_backend.py +293 -0
- truthound/profiler/distributed/profiler.py +304 -0
- truthound/profiler/distributed/ray_backend.py +374 -0
- truthound/profiler/distributed/spark_backend.py +375 -0
- truthound/profiler/distributed.py +1366 -0
- truthound/profiler/enterprise_sampling.py +1065 -0
- truthound/profiler/errors.py +488 -0
- truthound/profiler/evolution/__init__.py +91 -0
- truthound/profiler/evolution/alerts.py +426 -0
- truthound/profiler/evolution/changes.py +206 -0
- truthound/profiler/evolution/compatibility.py +365 -0
- truthound/profiler/evolution/detector.py +372 -0
- truthound/profiler/evolution/protocols.py +121 -0
- truthound/profiler/generators/__init__.py +48 -0
- truthound/profiler/generators/base.py +384 -0
- truthound/profiler/generators/ml_rules.py +375 -0
- truthound/profiler/generators/pattern_rules.py +384 -0
- truthound/profiler/generators/schema_rules.py +267 -0
- truthound/profiler/generators/stats_rules.py +324 -0
- truthound/profiler/generators/suite_generator.py +857 -0
- truthound/profiler/i18n.py +1542 -0
- truthound/profiler/incremental.py +554 -0
- truthound/profiler/incremental_validation.py +1710 -0
- truthound/profiler/integration/__init__.py +73 -0
- truthound/profiler/integration/adapters.py +345 -0
- truthound/profiler/integration/context.py +371 -0
- truthound/profiler/integration/executor.py +527 -0
- truthound/profiler/integration/naming.py +75 -0
- truthound/profiler/integration/protocols.py +243 -0
- truthound/profiler/memory.py +1185 -0
- truthound/profiler/migration/__init__.py +60 -0
- truthound/profiler/migration/base.py +345 -0
- truthound/profiler/migration/manager.py +444 -0
- truthound/profiler/migration/v1_0_to_v1_1.py +484 -0
- truthound/profiler/ml/__init__.py +73 -0
- truthound/profiler/ml/base.py +244 -0
- truthound/profiler/ml/classifier.py +507 -0
- truthound/profiler/ml/feature_extraction.py +604 -0
- truthound/profiler/ml/pretrained.py +448 -0
- truthound/profiler/ml_inference.py +1276 -0
- truthound/profiler/native_patterns.py +815 -0
- truthound/profiler/observability.py +1184 -0
- truthound/profiler/process_timeout.py +1566 -0
- truthound/profiler/progress.py +568 -0
- truthound/profiler/progress_callbacks.py +1734 -0
- truthound/profiler/quality.py +1345 -0
- truthound/profiler/resilience.py +1180 -0
- truthound/profiler/sampled_matcher.py +794 -0
- truthound/profiler/sampling.py +1288 -0
- truthound/profiler/scheduling/__init__.py +82 -0
- truthound/profiler/scheduling/protocols.py +214 -0
- truthound/profiler/scheduling/scheduler.py +474 -0
- truthound/profiler/scheduling/storage.py +457 -0
- truthound/profiler/scheduling/triggers.py +449 -0
- truthound/profiler/schema.py +603 -0
- truthound/profiler/streaming.py +685 -0
- truthound/profiler/streaming_patterns.py +1354 -0
- truthound/profiler/suite_cli.py +625 -0
- truthound/profiler/suite_config.py +789 -0
- truthound/profiler/suite_export.py +1268 -0
- truthound/profiler/table_profiler.py +547 -0
- truthound/profiler/timeout.py +565 -0
- truthound/profiler/validation.py +1532 -0
- truthound/profiler/visualization/__init__.py +118 -0
- truthound/profiler/visualization/base.py +346 -0
- truthound/profiler/visualization/generator.py +1259 -0
- truthound/profiler/visualization/plotly_renderer.py +811 -0
- truthound/profiler/visualization/renderers.py +669 -0
- truthound/profiler/visualization/sections.py +540 -0
- truthound/profiler/visualization.py +2122 -0
- truthound/profiler/yaml_validation.py +1151 -0
- truthound/py.typed +0 -0
- truthound/ratelimit/__init__.py +248 -0
- truthound/ratelimit/algorithms.py +1108 -0
- truthound/ratelimit/core.py +573 -0
- truthound/ratelimit/integration.py +532 -0
- truthound/ratelimit/limiter.py +663 -0
- truthound/ratelimit/middleware.py +700 -0
- truthound/ratelimit/policy.py +792 -0
- truthound/ratelimit/storage.py +763 -0
- truthound/rbac/__init__.py +340 -0
- truthound/rbac/core.py +976 -0
- truthound/rbac/integration.py +760 -0
- truthound/rbac/manager.py +1052 -0
- truthound/rbac/middleware.py +842 -0
- truthound/rbac/policy.py +954 -0
- truthound/rbac/storage.py +878 -0
- truthound/realtime/__init__.py +141 -0
- truthound/realtime/adapters/__init__.py +43 -0
- truthound/realtime/adapters/base.py +533 -0
- truthound/realtime/adapters/kafka.py +487 -0
- truthound/realtime/adapters/kinesis.py +479 -0
- truthound/realtime/adapters/mock.py +243 -0
- truthound/realtime/base.py +553 -0
- truthound/realtime/factory.py +382 -0
- truthound/realtime/incremental.py +660 -0
- truthound/realtime/processing/__init__.py +67 -0
- truthound/realtime/processing/exactly_once.py +575 -0
- truthound/realtime/processing/state.py +547 -0
- truthound/realtime/processing/windows.py +647 -0
- truthound/realtime/protocols.py +569 -0
- truthound/realtime/streaming.py +605 -0
- truthound/realtime/testing/__init__.py +32 -0
- truthound/realtime/testing/containers.py +615 -0
- truthound/realtime/testing/fixtures.py +484 -0
- truthound/report.py +280 -0
- truthound/reporters/__init__.py +46 -0
- truthound/reporters/_protocols.py +30 -0
- truthound/reporters/base.py +324 -0
- truthound/reporters/ci/__init__.py +66 -0
- truthound/reporters/ci/azure.py +436 -0
- truthound/reporters/ci/base.py +509 -0
- truthound/reporters/ci/bitbucket.py +567 -0
- truthound/reporters/ci/circleci.py +547 -0
- truthound/reporters/ci/detection.py +364 -0
- truthound/reporters/ci/factory.py +182 -0
- truthound/reporters/ci/github.py +388 -0
- truthound/reporters/ci/gitlab.py +471 -0
- truthound/reporters/ci/jenkins.py +525 -0
- truthound/reporters/console_reporter.py +299 -0
- truthound/reporters/factory.py +211 -0
- truthound/reporters/html_reporter.py +524 -0
- truthound/reporters/json_reporter.py +256 -0
- truthound/reporters/markdown_reporter.py +280 -0
- truthound/reporters/sdk/__init__.py +174 -0
- truthound/reporters/sdk/builder.py +558 -0
- truthound/reporters/sdk/mixins.py +1150 -0
- truthound/reporters/sdk/schema.py +1493 -0
- truthound/reporters/sdk/templates.py +666 -0
- truthound/reporters/sdk/testing.py +968 -0
- truthound/scanners.py +170 -0
- truthound/scheduling/__init__.py +122 -0
- truthound/scheduling/cron.py +1136 -0
- truthound/scheduling/presets.py +212 -0
- truthound/schema.py +275 -0
- truthound/secrets/__init__.py +173 -0
- truthound/secrets/base.py +618 -0
- truthound/secrets/cloud.py +682 -0
- truthound/secrets/integration.py +507 -0
- truthound/secrets/manager.py +633 -0
- truthound/secrets/oidc/__init__.py +172 -0
- truthound/secrets/oidc/base.py +902 -0
- truthound/secrets/oidc/credential_provider.py +623 -0
- truthound/secrets/oidc/exchangers.py +1001 -0
- truthound/secrets/oidc/github/__init__.py +110 -0
- truthound/secrets/oidc/github/claims.py +718 -0
- truthound/secrets/oidc/github/enhanced_provider.py +693 -0
- truthound/secrets/oidc/github/trust_policy.py +742 -0
- truthound/secrets/oidc/github/verification.py +723 -0
- truthound/secrets/oidc/github/workflow.py +691 -0
- truthound/secrets/oidc/providers.py +825 -0
- truthound/secrets/providers.py +506 -0
- truthound/secrets/resolver.py +495 -0
- truthound/stores/__init__.py +177 -0
- truthound/stores/backends/__init__.py +18 -0
- truthound/stores/backends/_protocols.py +340 -0
- truthound/stores/backends/azure_blob.py +530 -0
- truthound/stores/backends/concurrent_filesystem.py +915 -0
- truthound/stores/backends/connection_pool.py +1365 -0
- truthound/stores/backends/database.py +743 -0
- truthound/stores/backends/filesystem.py +538 -0
- truthound/stores/backends/gcs.py +399 -0
- truthound/stores/backends/memory.py +354 -0
- truthound/stores/backends/s3.py +434 -0
- truthound/stores/backpressure/__init__.py +84 -0
- truthound/stores/backpressure/base.py +375 -0
- truthound/stores/backpressure/circuit_breaker.py +434 -0
- truthound/stores/backpressure/monitor.py +376 -0
- truthound/stores/backpressure/strategies.py +677 -0
- truthound/stores/base.py +551 -0
- truthound/stores/batching/__init__.py +65 -0
- truthound/stores/batching/base.py +305 -0
- truthound/stores/batching/buffer.py +370 -0
- truthound/stores/batching/store.py +248 -0
- truthound/stores/batching/writer.py +521 -0
- truthound/stores/caching/__init__.py +60 -0
- truthound/stores/caching/backends.py +684 -0
- truthound/stores/caching/base.py +356 -0
- truthound/stores/caching/store.py +305 -0
- truthound/stores/compression/__init__.py +193 -0
- truthound/stores/compression/adaptive.py +694 -0
- truthound/stores/compression/base.py +514 -0
- truthound/stores/compression/pipeline.py +868 -0
- truthound/stores/compression/providers.py +672 -0
- truthound/stores/compression/streaming.py +832 -0
- truthound/stores/concurrency/__init__.py +81 -0
- truthound/stores/concurrency/atomic.py +556 -0
- truthound/stores/concurrency/index.py +775 -0
- truthound/stores/concurrency/locks.py +576 -0
- truthound/stores/concurrency/manager.py +482 -0
- truthound/stores/encryption/__init__.py +297 -0
- truthound/stores/encryption/base.py +952 -0
- truthound/stores/encryption/keys.py +1191 -0
- truthound/stores/encryption/pipeline.py +903 -0
- truthound/stores/encryption/providers.py +953 -0
- truthound/stores/encryption/streaming.py +950 -0
- truthound/stores/expectations.py +227 -0
- truthound/stores/factory.py +246 -0
- truthound/stores/migration/__init__.py +75 -0
- truthound/stores/migration/base.py +480 -0
- truthound/stores/migration/manager.py +347 -0
- truthound/stores/migration/registry.py +382 -0
- truthound/stores/migration/store.py +559 -0
- truthound/stores/observability/__init__.py +106 -0
- truthound/stores/observability/audit.py +718 -0
- truthound/stores/observability/config.py +270 -0
- truthound/stores/observability/factory.py +208 -0
- truthound/stores/observability/metrics.py +636 -0
- truthound/stores/observability/protocols.py +410 -0
- truthound/stores/observability/store.py +570 -0
- truthound/stores/observability/tracing.py +784 -0
- truthound/stores/replication/__init__.py +76 -0
- truthound/stores/replication/base.py +260 -0
- truthound/stores/replication/monitor.py +269 -0
- truthound/stores/replication/store.py +439 -0
- truthound/stores/replication/syncer.py +391 -0
- truthound/stores/results.py +359 -0
- truthound/stores/retention/__init__.py +77 -0
- truthound/stores/retention/base.py +378 -0
- truthound/stores/retention/policies.py +621 -0
- truthound/stores/retention/scheduler.py +279 -0
- truthound/stores/retention/store.py +526 -0
- truthound/stores/streaming/__init__.py +138 -0
- truthound/stores/streaming/base.py +801 -0
- truthound/stores/streaming/database.py +984 -0
- truthound/stores/streaming/filesystem.py +719 -0
- truthound/stores/streaming/reader.py +629 -0
- truthound/stores/streaming/s3.py +843 -0
- truthound/stores/streaming/writer.py +790 -0
- truthound/stores/tiering/__init__.py +108 -0
- truthound/stores/tiering/base.py +462 -0
- truthound/stores/tiering/manager.py +249 -0
- truthound/stores/tiering/policies.py +692 -0
- truthound/stores/tiering/store.py +526 -0
- truthound/stores/versioning/__init__.py +56 -0
- truthound/stores/versioning/base.py +376 -0
- truthound/stores/versioning/store.py +660 -0
- truthound/stores/versioning/strategies.py +353 -0
- truthound/types.py +56 -0
- truthound/validators/__init__.py +774 -0
- truthound/validators/aggregate/__init__.py +27 -0
- truthound/validators/aggregate/central.py +116 -0
- truthound/validators/aggregate/extremes.py +116 -0
- truthound/validators/aggregate/spread.py +118 -0
- truthound/validators/aggregate/sum.py +64 -0
- truthound/validators/aggregate/type.py +78 -0
- truthound/validators/anomaly/__init__.py +93 -0
- truthound/validators/anomaly/base.py +431 -0
- truthound/validators/anomaly/ml_based.py +1190 -0
- truthound/validators/anomaly/multivariate.py +647 -0
- truthound/validators/anomaly/statistical.py +599 -0
- truthound/validators/base.py +1089 -0
- truthound/validators/business_rule/__init__.py +46 -0
- truthound/validators/business_rule/base.py +147 -0
- truthound/validators/business_rule/checksum.py +509 -0
- truthound/validators/business_rule/financial.py +526 -0
- truthound/validators/cache.py +733 -0
- truthound/validators/completeness/__init__.py +39 -0
- truthound/validators/completeness/conditional.py +73 -0
- truthound/validators/completeness/default.py +98 -0
- truthound/validators/completeness/empty.py +103 -0
- truthound/validators/completeness/nan.py +337 -0
- truthound/validators/completeness/null.py +152 -0
- truthound/validators/cross_table/__init__.py +17 -0
- truthound/validators/cross_table/aggregate.py +333 -0
- truthound/validators/cross_table/row_count.py +122 -0
- truthound/validators/datetime/__init__.py +29 -0
- truthound/validators/datetime/format.py +78 -0
- truthound/validators/datetime/freshness.py +269 -0
- truthound/validators/datetime/order.py +73 -0
- truthound/validators/datetime/parseable.py +185 -0
- truthound/validators/datetime/range.py +202 -0
- truthound/validators/datetime/timezone.py +69 -0
- truthound/validators/distribution/__init__.py +49 -0
- truthound/validators/distribution/distribution.py +128 -0
- truthound/validators/distribution/monotonic.py +119 -0
- truthound/validators/distribution/outlier.py +178 -0
- truthound/validators/distribution/quantile.py +80 -0
- truthound/validators/distribution/range.py +254 -0
- truthound/validators/distribution/set.py +125 -0
- truthound/validators/distribution/statistical.py +459 -0
- truthound/validators/drift/__init__.py +79 -0
- truthound/validators/drift/base.py +427 -0
- truthound/validators/drift/multi_feature.py +401 -0
- truthound/validators/drift/numeric.py +395 -0
- truthound/validators/drift/psi.py +446 -0
- truthound/validators/drift/statistical.py +510 -0
- truthound/validators/enterprise.py +1658 -0
- truthound/validators/geospatial/__init__.py +80 -0
- truthound/validators/geospatial/base.py +97 -0
- truthound/validators/geospatial/boundary.py +238 -0
- truthound/validators/geospatial/coordinate.py +351 -0
- truthound/validators/geospatial/distance.py +399 -0
- truthound/validators/geospatial/polygon.py +665 -0
- truthound/validators/i18n/__init__.py +308 -0
- truthound/validators/i18n/bidi.py +571 -0
- truthound/validators/i18n/catalogs.py +570 -0
- truthound/validators/i18n/dialects.py +763 -0
- truthound/validators/i18n/extended_catalogs.py +549 -0
- truthound/validators/i18n/formatting.py +1434 -0
- truthound/validators/i18n/loader.py +1020 -0
- truthound/validators/i18n/messages.py +521 -0
- truthound/validators/i18n/plural.py +683 -0
- truthound/validators/i18n/protocols.py +855 -0
- truthound/validators/i18n/tms.py +1162 -0
- truthound/validators/localization/__init__.py +53 -0
- truthound/validators/localization/base.py +122 -0
- truthound/validators/localization/chinese.py +362 -0
- truthound/validators/localization/japanese.py +275 -0
- truthound/validators/localization/korean.py +524 -0
- truthound/validators/memory/__init__.py +94 -0
- truthound/validators/memory/approximate_knn.py +506 -0
- truthound/validators/memory/base.py +547 -0
- truthound/validators/memory/sgd_online.py +719 -0
- truthound/validators/memory/streaming_ecdf.py +753 -0
- truthound/validators/ml_feature/__init__.py +54 -0
- truthound/validators/ml_feature/base.py +249 -0
- truthound/validators/ml_feature/correlation.py +299 -0
- truthound/validators/ml_feature/leakage.py +344 -0
- truthound/validators/ml_feature/null_impact.py +270 -0
- truthound/validators/ml_feature/scale.py +264 -0
- truthound/validators/multi_column/__init__.py +89 -0
- truthound/validators/multi_column/arithmetic.py +284 -0
- truthound/validators/multi_column/base.py +231 -0
- truthound/validators/multi_column/comparison.py +273 -0
- truthound/validators/multi_column/consistency.py +312 -0
- truthound/validators/multi_column/statistical.py +299 -0
- truthound/validators/optimization/__init__.py +164 -0
- truthound/validators/optimization/aggregation.py +563 -0
- truthound/validators/optimization/covariance.py +556 -0
- truthound/validators/optimization/geo.py +626 -0
- truthound/validators/optimization/graph.py +587 -0
- truthound/validators/optimization/orchestrator.py +970 -0
- truthound/validators/optimization/profiling.py +1312 -0
- truthound/validators/privacy/__init__.py +223 -0
- truthound/validators/privacy/base.py +635 -0
- truthound/validators/privacy/ccpa.py +670 -0
- truthound/validators/privacy/gdpr.py +728 -0
- truthound/validators/privacy/global_patterns.py +604 -0
- truthound/validators/privacy/plugins.py +867 -0
- truthound/validators/profiling/__init__.py +52 -0
- truthound/validators/profiling/base.py +175 -0
- truthound/validators/profiling/cardinality.py +312 -0
- truthound/validators/profiling/entropy.py +391 -0
- truthound/validators/profiling/frequency.py +455 -0
- truthound/validators/pushdown_support.py +660 -0
- truthound/validators/query/__init__.py +91 -0
- truthound/validators/query/aggregate.py +346 -0
- truthound/validators/query/base.py +246 -0
- truthound/validators/query/column.py +249 -0
- truthound/validators/query/expression.py +274 -0
- truthound/validators/query/result.py +323 -0
- truthound/validators/query/row_count.py +264 -0
- truthound/validators/referential/__init__.py +80 -0
- truthound/validators/referential/base.py +395 -0
- truthound/validators/referential/cascade.py +391 -0
- truthound/validators/referential/circular.py +563 -0
- truthound/validators/referential/foreign_key.py +624 -0
- truthound/validators/referential/orphan.py +485 -0
- truthound/validators/registry.py +112 -0
- truthound/validators/schema/__init__.py +41 -0
- truthound/validators/schema/column_count.py +142 -0
- truthound/validators/schema/column_exists.py +80 -0
- truthound/validators/schema/column_order.py +82 -0
- truthound/validators/schema/column_pair.py +85 -0
- truthound/validators/schema/column_pair_set.py +195 -0
- truthound/validators/schema/column_type.py +94 -0
- truthound/validators/schema/multi_column.py +53 -0
- truthound/validators/schema/multi_column_aggregate.py +175 -0
- truthound/validators/schema/referential.py +274 -0
- truthound/validators/schema/table_schema.py +91 -0
- truthound/validators/schema_validator.py +219 -0
- truthound/validators/sdk/__init__.py +250 -0
- truthound/validators/sdk/builder.py +680 -0
- truthound/validators/sdk/decorators.py +474 -0
- truthound/validators/sdk/enterprise/__init__.py +211 -0
- truthound/validators/sdk/enterprise/docs.py +725 -0
- truthound/validators/sdk/enterprise/fuzzing.py +659 -0
- truthound/validators/sdk/enterprise/licensing.py +709 -0
- truthound/validators/sdk/enterprise/manager.py +543 -0
- truthound/validators/sdk/enterprise/resources.py +628 -0
- truthound/validators/sdk/enterprise/sandbox.py +766 -0
- truthound/validators/sdk/enterprise/signing.py +603 -0
- truthound/validators/sdk/enterprise/templates.py +865 -0
- truthound/validators/sdk/enterprise/versioning.py +659 -0
- truthound/validators/sdk/templates.py +757 -0
- truthound/validators/sdk/testing.py +807 -0
- truthound/validators/security/__init__.py +181 -0
- truthound/validators/security/redos/__init__.py +182 -0
- truthound/validators/security/redos/core.py +861 -0
- truthound/validators/security/redos/cpu_monitor.py +593 -0
- truthound/validators/security/redos/cve_database.py +791 -0
- truthound/validators/security/redos/ml/__init__.py +155 -0
- truthound/validators/security/redos/ml/base.py +785 -0
- truthound/validators/security/redos/ml/datasets.py +618 -0
- truthound/validators/security/redos/ml/features.py +359 -0
- truthound/validators/security/redos/ml/models.py +1000 -0
- truthound/validators/security/redos/ml/predictor.py +507 -0
- truthound/validators/security/redos/ml/storage.py +632 -0
- truthound/validators/security/redos/ml/training.py +571 -0
- truthound/validators/security/redos/ml_analyzer.py +937 -0
- truthound/validators/security/redos/optimizer.py +674 -0
- truthound/validators/security/redos/profiler.py +682 -0
- truthound/validators/security/redos/re2_engine.py +709 -0
- truthound/validators/security/redos.py +886 -0
- truthound/validators/security/sql_security.py +1247 -0
- truthound/validators/streaming/__init__.py +126 -0
- truthound/validators/streaming/base.py +292 -0
- truthound/validators/streaming/completeness.py +210 -0
- truthound/validators/streaming/mixin.py +575 -0
- truthound/validators/streaming/range.py +308 -0
- truthound/validators/streaming/sources.py +846 -0
- truthound/validators/string/__init__.py +57 -0
- truthound/validators/string/casing.py +158 -0
- truthound/validators/string/charset.py +96 -0
- truthound/validators/string/format.py +501 -0
- truthound/validators/string/json.py +77 -0
- truthound/validators/string/json_schema.py +184 -0
- truthound/validators/string/length.py +104 -0
- truthound/validators/string/like_pattern.py +237 -0
- truthound/validators/string/regex.py +202 -0
- truthound/validators/string/regex_extended.py +435 -0
- truthound/validators/table/__init__.py +88 -0
- truthound/validators/table/base.py +78 -0
- truthound/validators/table/column_count.py +198 -0
- truthound/validators/table/freshness.py +362 -0
- truthound/validators/table/row_count.py +251 -0
- truthound/validators/table/schema.py +333 -0
- truthound/validators/table/size.py +285 -0
- truthound/validators/timeout/__init__.py +102 -0
- truthound/validators/timeout/advanced/__init__.py +247 -0
- truthound/validators/timeout/advanced/circuit_breaker.py +675 -0
- truthound/validators/timeout/advanced/prediction.py +773 -0
- truthound/validators/timeout/advanced/priority.py +618 -0
- truthound/validators/timeout/advanced/redis_backend.py +770 -0
- truthound/validators/timeout/advanced/retry.py +721 -0
- truthound/validators/timeout/advanced/sampling.py +788 -0
- truthound/validators/timeout/advanced/sla.py +661 -0
- truthound/validators/timeout/advanced/telemetry.py +804 -0
- truthound/validators/timeout/cascade.py +477 -0
- truthound/validators/timeout/deadline.py +657 -0
- truthound/validators/timeout/degradation.py +525 -0
- truthound/validators/timeout/distributed.py +597 -0
- truthound/validators/timeseries/__init__.py +89 -0
- truthound/validators/timeseries/base.py +326 -0
- truthound/validators/timeseries/completeness.py +617 -0
- truthound/validators/timeseries/gap.py +485 -0
- truthound/validators/timeseries/monotonic.py +310 -0
- truthound/validators/timeseries/seasonality.py +422 -0
- truthound/validators/timeseries/trend.py +510 -0
- truthound/validators/uniqueness/__init__.py +59 -0
- truthound/validators/uniqueness/approximate.py +475 -0
- truthound/validators/uniqueness/distinct_values.py +253 -0
- truthound/validators/uniqueness/duplicate.py +118 -0
- truthound/validators/uniqueness/primary_key.py +140 -0
- truthound/validators/uniqueness/unique.py +191 -0
- truthound/validators/uniqueness/within_record.py +599 -0
- truthound/validators/utils.py +756 -0
- truthound-1.0.8.dist-info/METADATA +474 -0
- truthound-1.0.8.dist-info/RECORD +877 -0
- truthound-1.0.8.dist-info/WHEEL +4 -0
- truthound-1.0.8.dist-info/entry_points.txt +2 -0
- truthound-1.0.8.dist-info/licenses/LICENSE +190 -0
|
@@ -0,0 +1,1354 @@
|
|
|
1
|
+
"""Streaming Pattern Matching with Chunk Integration.
|
|
2
|
+
|
|
3
|
+
This module provides chunk-aware pattern matching for streaming data processing.
|
|
4
|
+
It solves the problem of pattern detection across chunk boundaries by maintaining
|
|
5
|
+
state and aggregating pattern statistics across multiple chunks.
|
|
6
|
+
|
|
7
|
+
Key features:
|
|
8
|
+
- Chunk-aware pattern state management
|
|
9
|
+
- Pluggable aggregation strategies
|
|
10
|
+
- Cross-chunk pattern boundary detection
|
|
11
|
+
- Statistical confidence tracking across chunks
|
|
12
|
+
- Memory-efficient incremental processing
|
|
13
|
+
- Integration with existing streaming profiler
|
|
14
|
+
|
|
15
|
+
Design Principles:
|
|
16
|
+
- Strategy Pattern: Aggregation strategies are pluggable
|
|
17
|
+
- Observer Pattern: Callbacks for pattern events
|
|
18
|
+
- State Pattern: Chunk state management
|
|
19
|
+
- Template Method: Customizable aggregation pipeline
|
|
20
|
+
|
|
21
|
+
Example:
|
|
22
|
+
from truthound.profiler.streaming_patterns import (
|
|
23
|
+
StreamingPatternMatcher,
|
|
24
|
+
IncrementalAggregation,
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
matcher = StreamingPatternMatcher(
|
|
28
|
+
aggregation_strategy=IncrementalAggregation(),
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
# Process chunks
|
|
32
|
+
for chunk in chunks:
|
|
33
|
+
matcher.process_chunk(chunk, "column_name")
|
|
34
|
+
|
|
35
|
+
# Get final results
|
|
36
|
+
results = matcher.finalize()
|
|
37
|
+
"""
|
|
38
|
+
|
|
39
|
+
from __future__ import annotations
|
|
40
|
+
|
|
41
|
+
import logging
|
|
42
|
+
import threading
|
|
43
|
+
import time
|
|
44
|
+
from abc import ABC, abstractmethod
|
|
45
|
+
from dataclasses import dataclass, field
|
|
46
|
+
from datetime import datetime
|
|
47
|
+
from enum import Enum
|
|
48
|
+
from typing import (
|
|
49
|
+
Any,
|
|
50
|
+
Callable,
|
|
51
|
+
Generic,
|
|
52
|
+
Iterator,
|
|
53
|
+
Protocol,
|
|
54
|
+
Sequence,
|
|
55
|
+
TypeVar,
|
|
56
|
+
)
|
|
57
|
+
|
|
58
|
+
import polars as pl
|
|
59
|
+
|
|
60
|
+
from truthound.profiler.base import DataType, PatternMatch
|
|
61
|
+
from truthound.profiler.native_patterns import (
|
|
62
|
+
BUILTIN_PATTERNS,
|
|
63
|
+
NativePatternMatcher,
|
|
64
|
+
PatternMatchResult,
|
|
65
|
+
PatternRegistry,
|
|
66
|
+
PatternSpec,
|
|
67
|
+
)
|
|
68
|
+
from truthound.profiler.sampling import (
|
|
69
|
+
SamplingConfig,
|
|
70
|
+
SamplingMetrics,
|
|
71
|
+
)
|
|
72
|
+
|
|
73
|
+
logger = logging.getLogger(__name__)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
# =============================================================================
|
|
77
|
+
# Types and Enums
|
|
78
|
+
# =============================================================================
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
class AggregationMethod(str, Enum):
    """Names of the available cross-chunk aggregation methods.

    Members also subclass ``str``, so each one compares equal to its
    plain string value.
    """

    # Running totals
    INCREMENTAL = "incremental"
    # Size-weighted averages
    WEIGHTED = "weighted"
    # Recent chunks only
    SLIDING_WINDOW = "sliding_window"
    # Exponential moving average
    EXPONENTIAL = "exponential"
    # Reservoir-based sampling
    RESERVOIR = "reservoir"
    # Agreement across chunks
    CONSENSUS = "consensus"
    # Auto-select based on data
    ADAPTIVE = "adaptive"
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
class ChunkProcessingStatus(str, Enum):
    """Lifecycle states a chunk can be in while it is processed.

    Members also subclass ``str``, so each one compares equal to its
    plain string value.
    """

    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"
    SKIPPED = "skipped"
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
# =============================================================================
|
|
104
|
+
# Pattern State Management
|
|
105
|
+
# =============================================================================
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
@dataclass(frozen=True)
class PatternChunkStats:
    """Statistics for a pattern in a single chunk.

    This is the basic unit of pattern statistics captured per chunk.
    The dataclass is frozen, so the immutability this class promises is
    enforced: assigning to a field after construction raises
    ``dataclasses.FrozenInstanceError`` (an ``AttributeError`` subclass).
    """

    pattern_name: str
    match_count: int
    total_count: int
    chunk_index: int
    processing_time_ms: float = 0.0

    @property
    def match_ratio(self) -> float:
        """Return ``match_count / total_count``, or 0.0 when ``total_count`` is not positive."""
        if self.total_count > 0:
            return self.match_count / self.total_count
        return 0.0

    def to_dict(self) -> dict[str, Any]:
        """Return every field plus the derived ``match_ratio`` as a plain dict."""
        return {
            "pattern_name": self.pattern_name,
            "match_count": self.match_count,
            "total_count": self.total_count,
            "match_ratio": self.match_ratio,
            "chunk_index": self.chunk_index,
            "processing_time_ms": self.processing_time_ms,
        }
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
@dataclass
class PatternState:
    """Accumulated, mutable statistics for one pattern over all chunks.

    Keeps the per-chunk history alongside running totals so the
    aggregation strategies can work from either.
    """

    pattern: PatternSpec
    chunk_stats: list[PatternChunkStats] = field(default_factory=list)

    # Running totals
    total_matches: int = 0
    total_rows: int = 0
    chunks_processed: int = 0

    # Sample collection
    sample_matches: list[str] = field(default_factory=list)
    max_samples: int = 10

    # Timing
    total_processing_time_ms: float = 0.0

    def add_chunk_stats(self, stats: PatternChunkStats) -> None:
        """Record one chunk's statistics and fold them into the running totals."""
        self.chunk_stats.append(stats)
        self.total_matches = self.total_matches + stats.match_count
        self.total_rows = self.total_rows + stats.total_count
        self.chunks_processed = self.chunks_processed + 1
        self.total_processing_time_ms = (
            self.total_processing_time_ms + stats.processing_time_ms
        )

    def add_samples(self, samples: Sequence[str]) -> None:
        """Append samples until ``max_samples`` are stored; extras are dropped."""
        room = self.max_samples - len(self.sample_matches)
        if room <= 0:
            return
        self.sample_matches.extend(samples[:room])

    @property
    def overall_match_ratio(self) -> float:
        """Total matches divided by total rows, or 0.0 when no rows were seen."""
        if self.total_rows > 0:
            return self.total_matches / self.total_rows
        return 0.0

    @property
    def chunk_ratios(self) -> list[float]:
        """Match ratio of each recorded chunk, in recording order."""
        return [entry.match_ratio for entry in self.chunk_stats]

    @property
    def variance(self) -> float:
        """Sample variance (n - 1 denominator) of the per-chunk match ratios.

        Returns 0.0 when fewer than two chunks have been recorded.
        """
        if len(self.chunk_stats) < 2:
            return 0.0
        values = self.chunk_ratios
        count = len(values)
        centre = sum(values) / count
        squared_error = sum((value - centre) ** 2 for value in values)
        return squared_error / (count - 1)

    @property
    def std_deviation(self) -> float:
        """Square root of ``variance``."""
        spread = self.variance
        return spread ** 0.5

    @property
    def is_consistent(self) -> bool:
        """True with fewer than two chunks, or when the std deviation is below 0.1."""
        return len(self.chunk_stats) < 2 or self.std_deviation < 0.1

    def to_pattern_match(self) -> PatternMatch:
        """Build a legacy ``PatternMatch`` from the accumulated totals."""
        spec = self.pattern
        return PatternMatch(
            pattern=spec.name,
            regex=spec.regex,
            match_ratio=self.overall_match_ratio,
            sample_matches=tuple(self.sample_matches),
        )
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
@dataclass
class ColumnPatternState:
    """Everything tracked for one column: per-pattern state, counters and timing."""

    column_name: str
    pattern_states: dict[str, PatternState] = field(default_factory=dict)
    chunks_processed: int = 0
    total_rows: int = 0
    started_at: datetime = field(default_factory=datetime.now)
    completed_at: datetime | None = None

    def get_or_create_pattern_state(self, pattern: PatternSpec) -> PatternState:
        """Return the state stored under ``pattern.name``, creating it on first use."""
        key = pattern.name
        try:
            return self.pattern_states[key]
        except KeyError:
            pass
        self.pattern_states[key] = PatternState(pattern=pattern)
        return self.pattern_states[key]

    def add_chunk(self, chunk_rows: int) -> None:
        """Count one more processed chunk and add its row count to the total."""
        self.chunks_processed = self.chunks_processed + 1
        self.total_rows = self.total_rows + chunk_rows

    def finalize(self) -> None:
        """Stamp ``completed_at`` with the current time."""
        self.completed_at = datetime.now()

    @property
    def processing_duration_ms(self) -> float:
        """Milliseconds from ``started_at`` to ``completed_at``.

        While ``completed_at`` is unset, the current time is used as the end.
        """
        finished = self.completed_at or datetime.now()
        elapsed = finished - self.started_at
        return elapsed.total_seconds() * 1000
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
# =============================================================================
|
|
250
|
+
# Aggregation Strategies
|
|
251
|
+
# =============================================================================
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
class AggregationStrategy(ABC):
    """Interface every pattern aggregation strategy implements.

    A strategy decides how the statistics gathered from many chunks are
    merged into one final result. Subclass it and implement both
    abstract methods to plug in custom behavior.
    """

    name: str = "base"

    @abstractmethod
    def aggregate(
        self,
        state: PatternState,
        min_match_ratio: float = 0.8,
    ) -> PatternMatchResult | None:
        """Combine a pattern's chunk statistics into a final result.

        Args:
            state: Pattern state holding all chunk statistics
            min_match_ratio: Minimum ratio for the pattern to count as a match

        Returns:
            The aggregated PatternMatchResult, or None when the pattern
            does not match
        """

    @abstractmethod
    def should_include_pattern(
        self,
        state: PatternState,
        min_match_ratio: float,
    ) -> bool:
        """Decide whether the pattern belongs in the results.

        Args:
            state: Pattern state
            min_match_ratio: Minimum ratio threshold

        Returns:
            True if the pattern should be included
        """
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
class IncrementalAggregation(AggregationStrategy):
    """Aggregation driven purely by the running totals.

    All matches are summed and divided by all rows seen. This is the
    simplest strategy and suits uniformly distributed data.
    """

    name = "incremental"

    def aggregate(
        self,
        state: PatternState,
        min_match_ratio: float = 0.8,
    ) -> PatternMatchResult | None:
        """Return a result built from the running totals, or None below the threshold."""
        if self.should_include_pattern(state, min_match_ratio):
            return PatternMatchResult(
                pattern=state.pattern,
                match_count=state.total_matches,
                total_count=state.total_rows,
                match_ratio=state.overall_match_ratio,
                sample_matches=tuple(state.sample_matches),
            )
        return None

    def should_include_pattern(
        self,
        state: PatternState,
        min_match_ratio: float,
    ) -> bool:
        """True when the overall match ratio reaches ``min_match_ratio``."""
        overall = state.overall_match_ratio
        return overall >= min_match_ratio
|
|
333
|
+
|
|
334
|
+
|
|
335
|
+
class WeightedAggregation(AggregationStrategy):
    """Aggregation that weights each chunk by its size.

    Larger chunks contribute more, which helps when chunk sizes differ
    a lot.
    """

    name = "weighted"

    def aggregate(
        self,
        state: PatternState,
        min_match_ratio: float = 0.8,
    ) -> PatternMatchResult | None:
        """Return the size-weighted result, or None below the threshold."""
        qualifies = self.should_include_pattern(state, min_match_ratio)
        if not qualifies:
            return None

        # With weights proportional to row counts, the weighted average
        # equals the plain total ratio, so the running totals are reused.
        result_fields = {
            "pattern": state.pattern,
            "match_count": state.total_matches,
            "total_count": state.total_rows,
            "match_ratio": state.overall_match_ratio,
            "sample_matches": tuple(state.sample_matches),
        }
        return PatternMatchResult(**result_fields)

    def should_include_pattern(
        self,
        state: PatternState,
        min_match_ratio: float,
    ) -> bool:
        """True when the weighted (overall) ratio reaches ``min_match_ratio``."""
        weighted_ratio = state.overall_match_ratio
        return weighted_ratio >= min_match_ratio
|
|
370
|
+
|
|
371
|
+
|
|
372
|
+
class SlidingWindowAggregation(AggregationStrategy):
    """Aggregation using only recent chunks.

    Useful for detecting patterns in recent data when older
    data may have different characteristics.
    """

    name = "sliding_window"

    def __init__(self, window_size: int = 5):
        """Initialize with window size.

        Args:
            window_size: Number of recent chunks to consider (at least 1)

        Raises:
            ValueError: If ``window_size`` is smaller than 1.
        """
        # A zero window would slice as ``chunk_stats[-0:]`` and select every
        # chunk; a negative one would drop the oldest chunks instead of
        # keeping the newest. Reject both up front.
        if window_size < 1:
            raise ValueError(f"window_size must be at least 1, got {window_size}")
        self.window_size = window_size

    def _window_totals(self, state: PatternState) -> tuple[int, int] | None:
        """Sum matches and rows over the last ``window_size`` chunks.

        Returns:
            ``(total_matches, total_rows)``, or None when no chunk has
            been recorded yet.
        """
        recent = state.chunk_stats[-self.window_size:]
        if not recent:
            return None
        total_matches = sum(s.match_count for s in recent)
        total_rows = sum(s.total_count for s in recent)
        return total_matches, total_rows

    def aggregate(
        self,
        state: PatternState,
        min_match_ratio: float = 0.8,
    ) -> PatternMatchResult | None:
        """Aggregate using recent chunks only.

        Args:
            state: Pattern state with all chunk statistics
            min_match_ratio: Minimum windowed ratio to consider a match

        Returns:
            A PatternMatchResult whose counts and ratio cover only the
            window, or None if the pattern does not qualify.
        """
        # Keep the overridable inclusion hook as the gate so subclasses
        # that customize it are still honored.
        if not self.should_include_pattern(state, min_match_ratio):
            return None

        totals = self._window_totals(state)
        if totals is None:
            return None

        total_matches, total_rows = totals
        match_ratio = total_matches / total_rows if total_rows > 0 else 0.0

        return PatternMatchResult(
            pattern=state.pattern,
            match_count=total_matches,
            total_count=total_rows,
            match_ratio=match_ratio,
            sample_matches=tuple(state.sample_matches),
        )

    def should_include_pattern(
        self,
        state: PatternState,
        min_match_ratio: float,
    ) -> bool:
        """Include when the windowed match ratio reaches ``min_match_ratio``.

        Returns False when no chunk has been recorded yet.
        """
        totals = self._window_totals(state)
        if totals is None:
            return False

        total_matches, total_rows = totals
        ratio = total_matches / total_rows if total_rows > 0 else 0.0
        return ratio >= min_match_ratio
|
|
429
|
+
|
|
430
|
+
|
|
431
|
+
class ExponentialAggregation(AggregationStrategy):
    """Aggregation based on an exponential moving average of chunk ratios.

    Later chunks weigh exponentially more than earlier ones; ``alpha``
    sets the decay (a higher value favors recent chunks more).
    """

    name = "exponential"

    def __init__(self, alpha: float = 0.3):
        """Store the smoothing factor.

        Args:
            alpha: Smoothing factor, greater than 0 and at most 1.
                Higher = more weight to recent.

        Raises:
            ValueError: If ``alpha`` is outside that range.
        """
        in_range = alpha > 0 and alpha <= 1
        if not in_range:
            raise ValueError(f"alpha must be between 0 and 1, got {alpha}")
        self.alpha = alpha

    def aggregate(
        self,
        state: PatternState,
        min_match_ratio: float = 0.8,
    ) -> PatternMatchResult | None:
        """Return a result whose ratio is the EMA of the chunk ratios.

        The match and row counts still come from the running totals.
        Returns None when the pattern does not qualify.
        """
        if self.should_include_pattern(state, min_match_ratio):
            smoothed_ratio = self._calculate_ema(state.chunk_ratios)
            return PatternMatchResult(
                pattern=state.pattern,
                match_count=state.total_matches,
                total_count=state.total_rows,
                match_ratio=smoothed_ratio,
                sample_matches=tuple(state.sample_matches),
            )
        return None

    def _calculate_ema(self, ratios: list[float]) -> float:
        """Fold the ratios, oldest first, into an exponential moving average.

        The first ratio seeds the average; an empty list yields 0.0.
        """
        if not ratios:
            return 0.0

        remaining = iter(ratios)
        smoothed = next(remaining)
        for current in remaining:
            smoothed = self.alpha * current + (1 - self.alpha) * smoothed
        return smoothed

    def should_include_pattern(
        self,
        state: PatternState,
        min_match_ratio: float,
    ) -> bool:
        """True when the EMA of the chunk ratios reaches ``min_match_ratio``.

        Returns False when there are no chunk ratios.
        """
        ratios = state.chunk_ratios
        if not ratios:
            return False
        return self._calculate_ema(ratios) >= min_match_ratio
|
|
489
|
+
|
|
490
|
+
|
|
491
|
+
class ConsensusAggregation(AggregationStrategy):
    """Aggregation that demands agreement between chunks.

    A pattern qualifies only when a minimum fraction of the chunks
    individually reach the match threshold, which favors patterns that
    hold consistently.
    """

    name = "consensus"

    def __init__(self, consensus_threshold: float = 0.8):
        """Store the consensus threshold.

        Args:
            consensus_threshold: Fraction of chunks that must match,
                greater than 0 and at most 1

        Raises:
            ValueError: If ``consensus_threshold`` is outside that range.
        """
        in_range = consensus_threshold > 0 and consensus_threshold <= 1
        if not in_range:
            raise ValueError(
                f"consensus_threshold must be between 0 and 1, got {consensus_threshold}"
            )
        self.consensus_threshold = consensus_threshold

    def aggregate(
        self,
        state: PatternState,
        min_match_ratio: float = 0.8,
    ) -> PatternMatchResult | None:
        """Return a totals-based result when consensus is reached, else None."""
        if self.should_include_pattern(state, min_match_ratio):
            return PatternMatchResult(
                pattern=state.pattern,
                match_count=state.total_matches,
                total_count=state.total_rows,
                match_ratio=state.overall_match_ratio,
                sample_matches=tuple(state.sample_matches),
            )
        return None

    def should_include_pattern(
        self,
        state: PatternState,
        min_match_ratio: float,
    ) -> bool:
        """True when enough chunks individually reach ``min_match_ratio``.

        Returns False when no chunk has been recorded.
        """
        chunks = state.chunk_stats
        if not chunks:
            return False

        # Tally the chunks whose own ratio clears the threshold.
        agreeing = 0
        for chunk in chunks:
            if chunk.match_ratio >= min_match_ratio:
                agreeing += 1

        return agreeing / len(chunks) >= self.consensus_threshold
|
|
545
|
+
|
|
546
|
+
|
|
547
|
+
class AdaptiveAggregation(AggregationStrategy):
    """Aggregation that delegates to a sub-strategy chosen per pattern.

    The choice is driven by the pattern state: how many chunks were seen,
    whether the state reports itself as consistent, and the standard
    deviation of its ratios.
    """

    name = "adaptive"

    def __init__(self) -> None:
        """Create the three sub-strategies this class can delegate to."""
        incremental = IncrementalAggregation()
        exponential = ExponentialAggregation(alpha=0.3)
        consensus = ConsensusAggregation(consensus_threshold=0.7)
        self._strategies = {
            "incremental": incremental,
            "exponential": exponential,
            "consensus": consensus,
        }

    def aggregate(
        self,
        state: PatternState,
        min_match_ratio: float = 0.8,
    ) -> PatternMatchResult | None:
        """Aggregate ``state`` with the strategy selected for it.

        Args:
            state: Accumulated statistics for one pattern.
            min_match_ratio: Threshold forwarded to the selected strategy.

        Returns:
            Whatever the selected strategy's ``aggregate`` returns.
        """
        strategy = self._select_strategy(state)
        logger.debug(
            f"Adaptive aggregation selected '{strategy.name}' for pattern '{state.pattern.name}'"
        )
        return strategy.aggregate(state, min_match_ratio)

    def should_include_pattern(
        self,
        state: PatternState,
        min_match_ratio: float,
    ) -> bool:
        """Ask the strategy selected for ``state`` whether to include it."""
        return self._select_strategy(state).should_include_pattern(
            state, min_match_ratio
        )

    def _select_strategy(self, state: PatternState) -> AggregationStrategy:
        """Pick the sub-strategy matching the characteristics of ``state``.

        Fewer than three chunks, or a consistent state, use the incremental
        strategy; a standard deviation above 0.2 uses consensus; everything
        else uses the exponential strategy.
        """
        if len(state.chunk_stats) < 3 or state.is_consistent:
            # Too little data, or stable data: plain aggregation suffices.
            key = "incremental"
        elif state.std_deviation > 0.2:
            # Noisy ratios: insist on agreement between chunks.
            key = "consensus"
        else:
            key = "exponential"
        return self._strategies[key]
|
|
603
|
+
|
|
604
|
+
|
|
605
|
+
# =============================================================================
|
|
606
|
+
# Aggregation Strategy Registry
|
|
607
|
+
# =============================================================================
|
|
608
|
+
|
|
609
|
+
|
|
610
|
+
class AggregationStrategyRegistry:
    """Name-indexed collection of aggregation strategies.

    Built-in strategies are registered on construction; further strategies
    can be added with ``register`` and fetched by name.
    """

    def __init__(self) -> None:
        """Create the storage, its lock, and register the built-ins."""
        self._strategies: dict[str, AggregationStrategy] = {}
        self._lock = threading.RLock()
        self._register_defaults()

    def _register_defaults(self) -> None:
        """Register one default-constructed instance of each built-in."""
        builtin_types = (
            IncrementalAggregation,
            WeightedAggregation,
            SlidingWindowAggregation,
            ExponentialAggregation,
            ConsensusAggregation,
            AdaptiveAggregation,
        )
        for strategy_type in builtin_types:
            self.register(strategy_type())

    def register(self, strategy: AggregationStrategy) -> None:
        """Store ``strategy`` under its ``name``, replacing any previous entry."""
        key = strategy.name
        with self._lock:
            self._strategies[key] = strategy
        logger.debug(f"Registered aggregation strategy: {strategy.name}")

    def get(self, name: str) -> AggregationStrategy:
        """Return the strategy registered under ``name``.

        Raises:
            KeyError: If no strategy has that name; the message lists the
                registered names.
        """
        with self._lock:
            registered = self._strategies
            if name in registered:
                return registered[name]
            available = list(registered.keys())
            raise KeyError(
                f"Unknown aggregation strategy: '{name}'. Available: {available}"
            )

    def get_or_default(
        self,
        name: str,
        default: AggregationStrategy | None = None,
    ) -> AggregationStrategy:
        """Return the named strategy, falling back when it is unknown.

        Args:
            name: Strategy name to look up.
            default: Fallback for unknown names; when falsy, a new
                ``AdaptiveAggregation`` is created instead.
        """
        try:
            strategy = self.get(name)
        except KeyError:
            strategy = default or AdaptiveAggregation()
        return strategy

    def list_strategies(self) -> list[str]:
        """Return the registered strategy names as a new list."""
        with self._lock:
            return list(self._strategies)

    def create_from_method(self, method: AggregationMethod) -> AggregationStrategy:
        """Look up the strategy whose name equals ``method.value``."""
        strategy_name = method.value
        return self.get(strategy_name)
|
|
665
|
+
|
|
666
|
+
|
|
667
|
+
# Global registry instance
|
|
668
|
+
# Shared module-level registry; constructing it registers the built-in
# strategies. StreamingPatternMatcher and get_available_aggregation_methods
# read from this instance.
aggregation_strategy_registry = AggregationStrategyRegistry()
|
|
669
|
+
|
|
670
|
+
|
|
671
|
+
# =============================================================================
|
|
672
|
+
# Streaming Pattern Matcher Result
|
|
673
|
+
# =============================================================================
|
|
674
|
+
|
|
675
|
+
|
|
676
|
+
@dataclass
class StreamingPatternResult:
    """Aggregated outcome of streaming pattern matching for one column.

    Attributes:
        column: Name of the analysed column.
        matches: Aggregated pattern results that passed the strategy.
        chunks_processed: Number of chunks that contributed.
        total_rows: Number of rows seen across those chunks.
        processing_time_ms: Processing duration in milliseconds.
        aggregation_method: Name of the aggregation strategy used.
        inferred_type: Data type taken from the best match, if any.
        pattern_stats: Optional per-pattern statistics keyed by pattern name.
    """

    column: str
    matches: list[PatternMatchResult]
    chunks_processed: int
    total_rows: int
    processing_time_ms: float
    aggregation_method: str
    inferred_type: DataType | None = None

    # Per-pattern statistics
    pattern_stats: dict[str, dict[str, Any]] = field(default_factory=dict)

    @property
    def has_matches(self) -> bool:
        """Whether at least one pattern matched."""
        return bool(self.matches)

    @property
    def best_match(self) -> PatternMatchResult | None:
        """First entry of ``matches``, or None when there are none.

        This is the highest-ratio match only if ``matches`` is ordered by
        descending ratio, which this property does not enforce.
        """
        if not self.matches:
            return None
        return self.matches[0]

    def to_dict(self) -> dict[str, Any]:
        """Return a plain-dictionary view of this result."""
        serialized_matches = []
        for match in self.matches:
            serialized_matches.append(
                {
                    "pattern_name": match.pattern.name,
                    "match_ratio": match.match_ratio,
                    "match_count": match.match_count,
                    "total_count": match.total_count,
                }
            )

        type_value = None
        if self.inferred_type:
            type_value = self.inferred_type.value

        return {
            "column": self.column,
            "matches": serialized_matches,
            "chunks_processed": self.chunks_processed,
            "total_rows": self.total_rows,
            "processing_time_ms": self.processing_time_ms,
            "aggregation_method": self.aggregation_method,
            "inferred_type": type_value,
            "pattern_stats": self.pattern_stats,
        }
|
|
725
|
+
|
|
726
|
+
|
|
727
|
+
# =============================================================================
|
|
728
|
+
# Pattern Event Callbacks
|
|
729
|
+
# =============================================================================
|
|
730
|
+
|
|
731
|
+
|
|
732
|
+
@dataclass
class PatternEvent:
    """Event emitted during pattern processing.

    Instances are created by the matcher's event emitter and passed to the
    registered event callback.
    """

    # Kind of event. The matcher in this module emits "chunk_processed",
    # "chunk_skipped", "chunk_failed" and "processing_complete".
    # NOTE(review): "pattern_detected" was listed here previously but is not
    # emitted by the code visible in this section - confirm whether it exists.
    event_type: str
    # Name of the column the event refers to.
    column: str
    # Index of the chunk; for "processing_complete" the matcher passes the
    # number of chunks processed instead.
    chunk_index: int
    # Event-specific payload; defaults to a fresh empty dict per instance.
    data: dict[str, Any] = field(default_factory=dict)
    # Creation time from datetime.now() (no timezone attached).
    timestamp: datetime = field(default_factory=datetime.now)
|
|
741
|
+
|
|
742
|
+
|
|
743
|
+
# Signature of an event listener: it receives one PatternEvent and its
# return value is ignored.
PatternEventCallback = Callable[[PatternEvent], None]
|
|
744
|
+
|
|
745
|
+
|
|
746
|
+
# =============================================================================
|
|
747
|
+
# Streaming Pattern Matcher Configuration
|
|
748
|
+
# =============================================================================
|
|
749
|
+
|
|
750
|
+
|
|
751
|
+
@dataclass
class StreamingPatternConfig:
    """Settings that control streaming pattern matching.

    Attributes:
        aggregation_method: Method for aggregating chunk statistics.
        min_match_ratio: Minimum ratio for a pattern to count as matched;
            validated to lie in ``[0, 1]``.
        sample_size_per_chunk: Maximum samples collected per chunk;
            validated to be non-negative.
        patterns: Pattern registry to use, or None for the matcher's default.
        enable_early_termination: Stop once every pattern is clearly
            matched or clearly not matched.
        early_termination_chunks: Number of chunks after which early
            termination starts being checked.
        early_termination_confidence: Confidence level for early
            termination. NOTE(review): no code visible in this section
            reads this value - confirm where it takes effect.
        collect_statistics: Collect detailed per-pattern statistics.
    """

    aggregation_method: AggregationMethod = AggregationMethod.ADAPTIVE
    min_match_ratio: float = 0.8
    sample_size_per_chunk: int = 3
    patterns: PatternRegistry | None = None
    enable_early_termination: bool = True
    early_termination_chunks: int = 3
    early_termination_confidence: float = 0.95
    collect_statistics: bool = True

    def __post_init__(self) -> None:
        """Reject out-of-range values right after construction.

        Raises:
            ValueError: If ``min_match_ratio`` is outside ``[0, 1]`` or
                ``sample_size_per_chunk`` is negative.
        """
        ratio_in_range = 0.0 <= self.min_match_ratio <= 1.0
        if not ratio_in_range:
            raise ValueError(
                f"min_match_ratio must be between 0 and 1, got {self.min_match_ratio}"
            )

        negative_sample_size = self.sample_size_per_chunk < 0
        if negative_sample_size:
            raise ValueError(
                f"sample_size_per_chunk must be non-negative, got {self.sample_size_per_chunk}"
            )

    @classmethod
    def fast(cls) -> "StreamingPatternConfig":
        """Return a preset favouring speed (incremental, early stop after 2 chunks)."""
        options = {
            "aggregation_method": AggregationMethod.INCREMENTAL,
            "min_match_ratio": 0.7,
            "sample_size_per_chunk": 2,
            "enable_early_termination": True,
            "early_termination_chunks": 2,
            "collect_statistics": False,
        }
        return cls(**options)

    @classmethod
    def accurate(cls) -> "StreamingPatternConfig":
        """Return a preset favouring accuracy (consensus, no early stop)."""
        options = {
            "aggregation_method": AggregationMethod.CONSENSUS,
            "min_match_ratio": 0.85,
            "sample_size_per_chunk": 5,
            "enable_early_termination": False,
            "collect_statistics": True,
        }
        return cls(**options)

    @classmethod
    def balanced(cls) -> "StreamingPatternConfig":
        """Return the balanced preset, spelled out with explicit values."""
        options = {
            "aggregation_method": AggregationMethod.ADAPTIVE,
            "min_match_ratio": 0.8,
            "sample_size_per_chunk": 3,
            "enable_early_termination": True,
            "early_termination_chunks": 3,
            "collect_statistics": True,
        }
        return cls(**options)
|
|
819
|
+
|
|
820
|
+
|
|
821
|
+
# =============================================================================
|
|
822
|
+
# Streaming Pattern Matcher
|
|
823
|
+
# =============================================================================
|
|
824
|
+
|
|
825
|
+
|
|
826
|
+
class StreamingPatternMatcher:
    """Chunk-aware pattern matcher for streaming data.

    This is the main interface for streaming pattern matching. It keeps
    per-column state across chunks and turns the accumulated statistics
    into aggregated results using a configurable strategy.

    Example:
        # Basic usage
        matcher = StreamingPatternMatcher()

        for chunk in data_chunks:
            matcher.process_chunk(chunk, "column_name")

        result = matcher.finalize("column_name")
        for match in result.matches:
            print(f"{match.pattern.name}: {match.match_ratio:.2%}")

        # With configuration
        config = StreamingPatternConfig(
            aggregation_method=AggregationMethod.CONSENSUS,
            min_match_ratio=0.9,
        )
        matcher = StreamingPatternMatcher(config=config)

        # Process multiple columns
        for chunk in data_chunks:
            for col in ["email", "phone", "id"]:
                matcher.process_chunk(chunk, col)

        # Get all results
        results = matcher.finalize_all()
    """

    def __init__(
        self,
        config: StreamingPatternConfig | None = None,
        aggregation_strategy: AggregationStrategy | None = None,
        patterns: PatternRegistry | None = None,
        event_callback: PatternEventCallback | None = None,
    ):
        """Initialize the streaming pattern matcher.

        Args:
            config: Configuration for pattern matching; the balanced preset
                is used when omitted.
            aggregation_strategy: Strategy overriding the one named by
                ``config.aggregation_method``.
            patterns: Pattern registry overriding ``config.patterns``.
            event_callback: Callback invoked with each emitted PatternEvent.
        """
        self.config = config or StreamingPatternConfig.balanced()

        # An explicit strategy wins over the configured method. Compare with
        # None so that a strategy object which happens to be falsy is kept.
        if aggregation_strategy is not None:
            self._aggregation = aggregation_strategy
        else:
            self._aggregation = aggregation_strategy_registry.create_from_method(
                self.config.aggregation_method
            )

        self._patterns = patterns or self.config.patterns or BUILTIN_PATTERNS
        self._event_callback = event_callback

        # Internal matcher for per-chunk pattern detection
        self._chunk_matcher = NativePatternMatcher(
            patterns=self._patterns,
            min_match_ratio=0.0,  # We'll filter ourselves after aggregation
            sample_size=self.config.sample_size_per_chunk,
        )

        # State management
        self._column_states: dict[str, ColumnPatternState] = {}
        self._lock = threading.RLock()

    @property
    def patterns(self) -> PatternRegistry:
        """Get the pattern registry."""
        return self._patterns

    @property
    def aggregation_strategy(self) -> AggregationStrategy:
        """Get the current aggregation strategy."""
        return self._aggregation

    def process_chunk(
        self,
        chunk: pl.LazyFrame | pl.DataFrame,
        column: str,
        chunk_index: int | None = None,
    ) -> ChunkProcessingStatus:
        """Process a single chunk for pattern matching.

        Updates the internal per-column state with the pattern statistics
        measured on this chunk.

        Args:
            chunk: DataFrame or LazyFrame chunk to process.
            column: Column name to analyze.
            chunk_index: Optional chunk index; defaults to the number of
                chunks already processed for the column.

        Returns:
            ``SKIPPED`` when early termination has been triggered,
            ``COMPLETED`` when the chunk was recorded (including empty
            chunks), or ``FAILED`` when processing raised. Failures are
            logged and reported through a "chunk_failed" event rather than
            propagated.
        """
        start_time = time.perf_counter()

        # Ensure LazyFrame
        lf = chunk.lazy() if isinstance(chunk, pl.DataFrame) else chunk

        with self._lock:
            # Get or create column state
            if column not in self._column_states:
                self._column_states[column] = ColumnPatternState(column_name=column)

            col_state = self._column_states[column]
            idx = chunk_index if chunk_index is not None else col_state.chunks_processed

            # Check early termination
            if self._should_terminate_early(col_state):
                self._emit_event("chunk_skipped", column, idx, {"reason": "early_termination"})
                return ChunkProcessingStatus.SKIPPED

        try:
            # Get chunk row count
            chunk_rows = lf.select(pl.len()).collect().item()
            if chunk_rows == 0:
                with self._lock:
                    col_state.add_chunk(0)
                return ChunkProcessingStatus.COMPLETED

            # Run pattern matching on chunk
            chunk_results = self._chunk_matcher.match_column(lf, column)

            # Get total non-null count
            total_count = (
                lf.select(pl.col(column).is_not_null().sum())
                .collect()
                .item()
            )

            elapsed_ms = (time.perf_counter() - start_time) * 1000

            with self._lock:
                self._record_chunk_results(
                    col_state, chunk_results, total_count, idx, elapsed_ms
                )
                col_state.add_chunk(chunk_rows)

            self._emit_event("chunk_processed", column, idx, {
                "rows": chunk_rows,
                "patterns_detected": len(chunk_results),
                "processing_time_ms": elapsed_ms,
            })

            return ChunkProcessingStatus.COMPLETED

        except Exception as e:
            # Deliberate best-effort boundary: one bad chunk must not abort
            # the stream, so the failure is logged and reported as a status.
            logger.error(f"Failed to process chunk {idx} for column '{column}': {e}")
            self._emit_event("chunk_failed", column, idx, {"error": str(e)})
            return ChunkProcessingStatus.FAILED

    def _record_chunk_results(
        self,
        col_state: ColumnPatternState,
        chunk_results: Any,
        total_count: int,
        idx: int,
        elapsed_ms: float,
    ) -> None:
        """Store one chunk's statistics for every registered pattern.

        Patterns without a result in ``chunk_results`` are recorded with
        zero matches out of ``total_count`` values. The caller is expected
        to hold ``self._lock``.
        """
        registered = list(self._patterns)
        if not registered:
            return

        # The elapsed time is split evenly; compute the share once instead
        # of re-materialising the registry for every pattern.
        per_pattern_ms = elapsed_ms / len(registered)

        # Index results by pattern name; the first result for a name wins.
        first_result_by_name: dict[str, Any] = {}
        for chunk_result in chunk_results:
            first_result_by_name.setdefault(chunk_result.pattern.name, chunk_result)

        for pattern in registered:
            pattern_state = col_state.get_or_create_pattern_state(pattern)
            result = first_result_by_name.get(pattern.name)

            if result is not None:
                # Pattern was found in this chunk
                match_count = result.match_count
                pattern_total = result.total_count
            else:
                # Pattern not found - record zero matches
                match_count = 0
                pattern_total = total_count

            pattern_state.add_chunk_stats(
                PatternChunkStats(
                    pattern_name=pattern.name,
                    match_count=match_count,
                    total_count=pattern_total,
                    chunk_index=idx,
                    processing_time_ms=per_pattern_ms,
                )
            )
            if result is not None:
                pattern_state.add_samples(result.sample_matches)

    def process_chunks(
        self,
        chunks: Iterator[pl.LazyFrame | pl.DataFrame],
        column: str,
    ) -> list[ChunkProcessingStatus]:
        """Process multiple chunks in sequence.

        Iteration stops after the first chunk that is skipped because of
        early termination.

        Args:
            chunks: Iterator of chunks to process.
            column: Column name to analyze.

        Returns:
            List of processing statuses, one per chunk that was attempted.
        """
        statuses = []
        for chunk in chunks:
            status = self.process_chunk(chunk, column)
            statuses.append(status)
            if status == ChunkProcessingStatus.SKIPPED:
                # Early termination triggered
                break
        return statuses

    def finalize(self, column: str) -> StreamingPatternResult:
        """Finalize pattern matching for a column.

        Aggregates all chunk statistics into final results and emits a
        "processing_complete" event.

        Args:
            column: Column name to finalize.

        Returns:
            StreamingPatternResult with aggregated matches sorted by
            descending match ratio, then descending pattern priority. An
            empty result is returned for a column that was never processed.
        """
        with self._lock:
            if column not in self._column_states:
                return StreamingPatternResult(
                    column=column,
                    matches=[],
                    chunks_processed=0,
                    total_rows=0,
                    processing_time_ms=0.0,
                    aggregation_method=self._aggregation.name,
                )

            col_state = self._column_states[column]
            col_state.finalize()

            # Aggregate pattern statistics
            matches = []
            pattern_stats = {}

            for pattern_name, pattern_state in col_state.pattern_states.items():
                result = self._aggregation.aggregate(
                    pattern_state,
                    self.config.min_match_ratio,
                )

                if result is not None:
                    matches.append(result)

                if self.config.collect_statistics:
                    pattern_stats[pattern_name] = {
                        "total_matches": pattern_state.total_matches,
                        "total_rows": pattern_state.total_rows,
                        "overall_ratio": pattern_state.overall_match_ratio,
                        "chunks_with_matches": sum(
                            1 for s in pattern_state.chunk_stats if s.match_count > 0
                        ),
                        "variance": pattern_state.variance,
                        "is_consistent": pattern_state.is_consistent,
                    }

            # Sort by match ratio
            matches.sort(key=lambda r: (-r.match_ratio, -r.pattern.priority))

            # Infer type from best match
            inferred_type = matches[0].pattern.data_type if matches else None

            self._emit_event("processing_complete", column, col_state.chunks_processed, {
                "matches": len(matches),
                "total_rows": col_state.total_rows,
            })

            return StreamingPatternResult(
                column=column,
                matches=matches,
                chunks_processed=col_state.chunks_processed,
                total_rows=col_state.total_rows,
                processing_time_ms=col_state.processing_duration_ms,
                aggregation_method=self._aggregation.name,
                inferred_type=inferred_type,
                pattern_stats=pattern_stats,
            )

    def finalize_all(self) -> dict[str, StreamingPatternResult]:
        """Finalize pattern matching for all processed columns.

        Returns:
            Dictionary mapping column names to their results.
        """
        with self._lock:
            columns = list(self._column_states.keys())

        return {column: self.finalize(column) for column in columns}

    def reset(self, column: str | None = None) -> None:
        """Reset state for a column or all columns.

        Args:
            column: Column to reset, or None to reset all. Unknown column
                names are ignored.
        """
        with self._lock:
            if column is None:
                self._column_states.clear()
            elif column in self._column_states:
                del self._column_states[column]

    def get_current_state(self, column: str) -> ColumnPatternState | None:
        """Get current state for a column (for monitoring).

        Args:
            column: Column name.

        Returns:
            Current column state, or None if the column is unknown.
        """
        with self._lock:
            return self._column_states.get(column)

    def _should_terminate_early(self, state: ColumnPatternState) -> bool:
        """Check if early termination should be triggered.

        Termination requires that early termination is enabled, that the
        column has reached ``early_termination_chunks`` chunks, and that
        every tracked pattern is decided: its overall ratio lies more than
        two standard deviations above or below ``min_match_ratio``.

        A column with no pattern statistics yet (for example, only empty
        chunks so far), or a pattern observed in fewer than two chunks,
        counts as undecided, so processing continues instead of skipping
        data without evidence.
        """
        if not self.config.enable_early_termination:
            return False

        if state.chunks_processed < self.config.early_termination_chunks:
            return False

        pattern_states = list(state.pattern_states.values())
        if not pattern_states:
            # Nothing has been measured, so nothing can be "determined".
            return False

        threshold = self.config.min_match_ratio
        for pattern_state in pattern_states:
            if pattern_state.chunks_processed < 2:
                # Too few observations for a meaningful spread estimate.
                return False

            ratio = pattern_state.overall_match_ratio
            margin = 2 * pattern_state.std_deviation

            clearly_matching = ratio - margin > threshold
            clearly_not_matching = ratio + margin < threshold
            if not (clearly_matching or clearly_not_matching):
                # Pattern is uncertain - continue processing
                return False

        # All patterns are determined
        return True

    def _emit_event(
        self,
        event_type: str,
        column: str,
        chunk_index: int,
        data: dict[str, Any],
    ) -> None:
        """Emit a pattern event to the callback, if one is registered.

        Exceptions raised by the callback are logged as warnings and
        suppressed so that a faulty listener cannot break processing.
        """
        if self._event_callback:
            event = PatternEvent(
                event_type=event_type,
                column=column,
                chunk_index=chunk_index,
                data=data,
            )
            try:
                self._event_callback(event)
            except Exception as e:
                logger.warning(f"Event callback failed: {e}")
|
|
1196
|
+
|
|
1197
|
+
|
|
1198
|
+
# =============================================================================
|
|
1199
|
+
# Integration with StreamingProfiler
|
|
1200
|
+
# =============================================================================
|
|
1201
|
+
|
|
1202
|
+
|
|
1203
|
+
class StreamingPatternIntegration:
    """Adapter exposing streaming pattern matching to StreamingProfiler.

    It owns a ``StreamingPatternMatcher`` and forwards the per-chunk and
    per-column calls described on the methods below.
    """

    def __init__(
        self,
        config: StreamingPatternConfig | None = None,
        patterns: PatternRegistry | None = None,
    ):
        """Create the wrapped matcher.

        Args:
            config: Pattern matching configuration; the balanced preset is
                used when omitted.
            patterns: Pattern registry handed to the matcher.
        """
        effective_config = config or StreamingPatternConfig.balanced()
        self.config = effective_config
        self.matcher = StreamingPatternMatcher(
            config=effective_config,
            patterns=patterns,
        )

    def process_column_chunk(
        self,
        chunk: pl.LazyFrame | pl.DataFrame,
        column: str,
        chunk_index: int,
    ) -> None:
        """Feed one column of one chunk to the matcher.

        Intended to be called by StreamingProfiler for each chunk. The
        matcher's processing status is discarded.

        Args:
            chunk: Data chunk.
            column: Column name.
            chunk_index: Index of this chunk.
        """
        self.matcher.process_chunk(chunk, column, chunk_index)

    def get_column_patterns(self, column: str) -> tuple[PatternMatch, ...]:
        """Return the detected patterns for ``column``.

        Intended to be called by StreamingProfiler when building a
        ColumnProfile. Each call finalizes the column on the matcher.

        Args:
            column: Column name.

        Returns:
            Tuple with one ``PatternMatch`` per aggregated match, in the
            order the matcher reports them.
        """
        finalized = self.matcher.finalize(column)
        converted = [match.to_pattern_match() for match in finalized.matches]
        return tuple(converted)

    def get_inferred_type(self, column: str) -> DataType | None:
        """Return the type the matcher inferred for ``column``.

        Each call finalizes the column on the matcher.

        Args:
            column: Column name.

        Returns:
            Inferred DataType, or None when nothing was inferred.
        """
        finalized = self.matcher.finalize(column)
        return finalized.inferred_type

    def reset(self) -> None:
        """Discard all state held by the wrapped matcher."""
        self.matcher.reset()
|
|
1273
|
+
|
|
1274
|
+
|
|
1275
|
+
# =============================================================================
|
|
1276
|
+
# Convenience Functions
|
|
1277
|
+
# =============================================================================
|
|
1278
|
+
|
|
1279
|
+
|
|
1280
|
+
def create_streaming_matcher(
    aggregation: str | AggregationMethod = "adaptive",
    min_match_ratio: float = 0.8,
    **kwargs: Any,
) -> StreamingPatternMatcher:
    """Build a streaming pattern matcher from a few common options.

    Args:
        aggregation: Aggregation method, given as its name or as the enum.
            Strings are converted with ``AggregationMethod(...)``.
        min_match_ratio: Minimum match ratio threshold.
        **kwargs: Further ``StreamingPatternConfig`` fields.

    Returns:
        A ``StreamingPatternMatcher`` using the resulting configuration.

    Example:
        matcher = create_streaming_matcher(
            aggregation="consensus",
            min_match_ratio=0.9,
        )
    """
    method = (
        AggregationMethod(aggregation)
        if isinstance(aggregation, str)
        else aggregation
    )
    settings = StreamingPatternConfig(
        aggregation_method=method,
        min_match_ratio=min_match_ratio,
        **kwargs,
    )
    return StreamingPatternMatcher(config=settings)
|
|
1311
|
+
|
|
1312
|
+
|
|
1313
|
+
def stream_match_patterns(
    chunks: Iterator[pl.LazyFrame | pl.DataFrame],
    column: str,
    *,
    aggregation: str = "adaptive",
    min_ratio: float = 0.8,
) -> StreamingPatternResult:
    """Run streaming pattern matching over ``chunks`` in one call.

    A fresh matcher is created, fed every chunk (stopping early if the
    matcher decides to), and then finalized for ``column``.

    Args:
        chunks: Iterator of data chunks.
        column: Column to analyze.
        aggregation: Name of the aggregation method.
        min_ratio: Minimum match ratio.

    Returns:
        The finalized ``StreamingPatternResult`` for ``column``.

    Example:
        from truthound.profiler.streaming_patterns import stream_match_patterns

        # From file chunks
        result = stream_match_patterns(
            file_chunk_iterator("data.csv"),
            "email_column",
        )

        if result.best_match is not None:
            print(f"Best match: {result.best_match.pattern.name}")
        print(f"Chunks processed: {result.chunks_processed}")
    """
    streaming_matcher = create_streaming_matcher(
        aggregation=aggregation,
        min_match_ratio=min_ratio,
    )
    # The per-chunk statuses are not needed here; only the final result is.
    streaming_matcher.process_chunks(chunks, column)
    final_result = streaming_matcher.finalize(column)
    return final_result
|
|
1350
|
+
|
|
1351
|
+
|
|
1352
|
+
def get_available_aggregation_methods() -> list[str]:
    """Return the names of all strategies in the module-level registry."""
    registered_names = aggregation_strategy_registry.list_strategies()
    return registered_names
|