truthound-1.0.8-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- truthound/__init__.py +162 -0
- truthound/adapters.py +100 -0
- truthound/api.py +365 -0
- truthound/audit/__init__.py +248 -0
- truthound/audit/core.py +967 -0
- truthound/audit/filters.py +620 -0
- truthound/audit/formatters.py +707 -0
- truthound/audit/logger.py +902 -0
- truthound/audit/middleware.py +571 -0
- truthound/audit/storage.py +1083 -0
- truthound/benchmark/__init__.py +123 -0
- truthound/benchmark/base.py +757 -0
- truthound/benchmark/comparison.py +635 -0
- truthound/benchmark/generators.py +706 -0
- truthound/benchmark/reporters.py +718 -0
- truthound/benchmark/runner.py +635 -0
- truthound/benchmark/scenarios.py +712 -0
- truthound/cache.py +252 -0
- truthound/checkpoint/__init__.py +136 -0
- truthound/checkpoint/actions/__init__.py +164 -0
- truthound/checkpoint/actions/base.py +324 -0
- truthound/checkpoint/actions/custom.py +234 -0
- truthound/checkpoint/actions/discord_notify.py +290 -0
- truthound/checkpoint/actions/email_notify.py +405 -0
- truthound/checkpoint/actions/github_action.py +406 -0
- truthound/checkpoint/actions/opsgenie.py +1499 -0
- truthound/checkpoint/actions/pagerduty.py +226 -0
- truthound/checkpoint/actions/slack_notify.py +233 -0
- truthound/checkpoint/actions/store_result.py +249 -0
- truthound/checkpoint/actions/teams_notify.py +1570 -0
- truthound/checkpoint/actions/telegram_notify.py +419 -0
- truthound/checkpoint/actions/update_docs.py +552 -0
- truthound/checkpoint/actions/webhook.py +293 -0
- truthound/checkpoint/analytics/__init__.py +147 -0
- truthound/checkpoint/analytics/aggregations/__init__.py +23 -0
- truthound/checkpoint/analytics/aggregations/rollup.py +481 -0
- truthound/checkpoint/analytics/aggregations/time_bucket.py +306 -0
- truthound/checkpoint/analytics/analyzers/__init__.py +17 -0
- truthound/checkpoint/analytics/analyzers/anomaly.py +386 -0
- truthound/checkpoint/analytics/analyzers/base.py +270 -0
- truthound/checkpoint/analytics/analyzers/forecast.py +421 -0
- truthound/checkpoint/analytics/analyzers/trend.py +314 -0
- truthound/checkpoint/analytics/models.py +292 -0
- truthound/checkpoint/analytics/protocols.py +549 -0
- truthound/checkpoint/analytics/service.py +718 -0
- truthound/checkpoint/analytics/stores/__init__.py +16 -0
- truthound/checkpoint/analytics/stores/base.py +306 -0
- truthound/checkpoint/analytics/stores/memory_store.py +353 -0
- truthound/checkpoint/analytics/stores/sqlite_store.py +557 -0
- truthound/checkpoint/analytics/stores/timescale_store.py +501 -0
- truthound/checkpoint/async_actions.py +794 -0
- truthound/checkpoint/async_base.py +708 -0
- truthound/checkpoint/async_checkpoint.py +617 -0
- truthound/checkpoint/async_runner.py +639 -0
- truthound/checkpoint/checkpoint.py +527 -0
- truthound/checkpoint/ci/__init__.py +61 -0
- truthound/checkpoint/ci/detector.py +355 -0
- truthound/checkpoint/ci/reporter.py +436 -0
- truthound/checkpoint/ci/templates.py +454 -0
- truthound/checkpoint/circuitbreaker/__init__.py +133 -0
- truthound/checkpoint/circuitbreaker/breaker.py +542 -0
- truthound/checkpoint/circuitbreaker/core.py +252 -0
- truthound/checkpoint/circuitbreaker/detection.py +459 -0
- truthound/checkpoint/circuitbreaker/middleware.py +389 -0
- truthound/checkpoint/circuitbreaker/registry.py +357 -0
- truthound/checkpoint/distributed/__init__.py +139 -0
- truthound/checkpoint/distributed/backends/__init__.py +35 -0
- truthound/checkpoint/distributed/backends/celery_backend.py +503 -0
- truthound/checkpoint/distributed/backends/kubernetes_backend.py +696 -0
- truthound/checkpoint/distributed/backends/local_backend.py +397 -0
- truthound/checkpoint/distributed/backends/ray_backend.py +625 -0
- truthound/checkpoint/distributed/base.py +774 -0
- truthound/checkpoint/distributed/orchestrator.py +765 -0
- truthound/checkpoint/distributed/protocols.py +842 -0
- truthound/checkpoint/distributed/registry.py +449 -0
- truthound/checkpoint/idempotency/__init__.py +120 -0
- truthound/checkpoint/idempotency/core.py +295 -0
- truthound/checkpoint/idempotency/fingerprint.py +454 -0
- truthound/checkpoint/idempotency/locking.py +604 -0
- truthound/checkpoint/idempotency/service.py +592 -0
- truthound/checkpoint/idempotency/stores.py +653 -0
- truthound/checkpoint/monitoring/__init__.py +134 -0
- truthound/checkpoint/monitoring/aggregators/__init__.py +15 -0
- truthound/checkpoint/monitoring/aggregators/base.py +372 -0
- truthound/checkpoint/monitoring/aggregators/realtime.py +300 -0
- truthound/checkpoint/monitoring/aggregators/window.py +493 -0
- truthound/checkpoint/monitoring/collectors/__init__.py +17 -0
- truthound/checkpoint/monitoring/collectors/base.py +257 -0
- truthound/checkpoint/monitoring/collectors/memory_collector.py +617 -0
- truthound/checkpoint/monitoring/collectors/prometheus_collector.py +451 -0
- truthound/checkpoint/monitoring/collectors/redis_collector.py +518 -0
- truthound/checkpoint/monitoring/events.py +410 -0
- truthound/checkpoint/monitoring/protocols.py +636 -0
- truthound/checkpoint/monitoring/service.py +578 -0
- truthound/checkpoint/monitoring/views/__init__.py +17 -0
- truthound/checkpoint/monitoring/views/base.py +172 -0
- truthound/checkpoint/monitoring/views/queue_view.py +220 -0
- truthound/checkpoint/monitoring/views/task_view.py +240 -0
- truthound/checkpoint/monitoring/views/worker_view.py +263 -0
- truthound/checkpoint/registry.py +337 -0
- truthound/checkpoint/runner.py +356 -0
- truthound/checkpoint/transaction/__init__.py +133 -0
- truthound/checkpoint/transaction/base.py +389 -0
- truthound/checkpoint/transaction/compensatable.py +537 -0
- truthound/checkpoint/transaction/coordinator.py +576 -0
- truthound/checkpoint/transaction/executor.py +622 -0
- truthound/checkpoint/transaction/idempotency.py +534 -0
- truthound/checkpoint/transaction/saga/__init__.py +143 -0
- truthound/checkpoint/transaction/saga/builder.py +584 -0
- truthound/checkpoint/transaction/saga/definition.py +515 -0
- truthound/checkpoint/transaction/saga/event_store.py +542 -0
- truthound/checkpoint/transaction/saga/patterns.py +833 -0
- truthound/checkpoint/transaction/saga/runner.py +718 -0
- truthound/checkpoint/transaction/saga/state_machine.py +793 -0
- truthound/checkpoint/transaction/saga/strategies.py +780 -0
- truthound/checkpoint/transaction/saga/testing.py +886 -0
- truthound/checkpoint/triggers/__init__.py +58 -0
- truthound/checkpoint/triggers/base.py +237 -0
- truthound/checkpoint/triggers/event.py +385 -0
- truthound/checkpoint/triggers/schedule.py +355 -0
- truthound/cli.py +2358 -0
- truthound/cli_modules/__init__.py +124 -0
- truthound/cli_modules/advanced/__init__.py +45 -0
- truthound/cli_modules/advanced/benchmark.py +343 -0
- truthound/cli_modules/advanced/docs.py +225 -0
- truthound/cli_modules/advanced/lineage.py +209 -0
- truthound/cli_modules/advanced/ml.py +320 -0
- truthound/cli_modules/advanced/realtime.py +196 -0
- truthound/cli_modules/checkpoint/__init__.py +46 -0
- truthound/cli_modules/checkpoint/init.py +114 -0
- truthound/cli_modules/checkpoint/list.py +71 -0
- truthound/cli_modules/checkpoint/run.py +159 -0
- truthound/cli_modules/checkpoint/validate.py +67 -0
- truthound/cli_modules/common/__init__.py +71 -0
- truthound/cli_modules/common/errors.py +414 -0
- truthound/cli_modules/common/options.py +419 -0
- truthound/cli_modules/common/output.py +507 -0
- truthound/cli_modules/common/protocol.py +552 -0
- truthound/cli_modules/core/__init__.py +48 -0
- truthound/cli_modules/core/check.py +123 -0
- truthound/cli_modules/core/compare.py +104 -0
- truthound/cli_modules/core/learn.py +57 -0
- truthound/cli_modules/core/mask.py +77 -0
- truthound/cli_modules/core/profile.py +65 -0
- truthound/cli_modules/core/scan.py +61 -0
- truthound/cli_modules/profiler/__init__.py +51 -0
- truthound/cli_modules/profiler/auto_profile.py +175 -0
- truthound/cli_modules/profiler/metadata.py +107 -0
- truthound/cli_modules/profiler/suite.py +283 -0
- truthound/cli_modules/registry.py +431 -0
- truthound/cli_modules/scaffolding/__init__.py +89 -0
- truthound/cli_modules/scaffolding/base.py +631 -0
- truthound/cli_modules/scaffolding/commands.py +545 -0
- truthound/cli_modules/scaffolding/plugins.py +1072 -0
- truthound/cli_modules/scaffolding/reporters.py +594 -0
- truthound/cli_modules/scaffolding/validators.py +1127 -0
- truthound/common/__init__.py +18 -0
- truthound/common/resilience/__init__.py +130 -0
- truthound/common/resilience/bulkhead.py +266 -0
- truthound/common/resilience/circuit_breaker.py +516 -0
- truthound/common/resilience/composite.py +332 -0
- truthound/common/resilience/config.py +292 -0
- truthound/common/resilience/protocols.py +217 -0
- truthound/common/resilience/rate_limiter.py +404 -0
- truthound/common/resilience/retry.py +341 -0
- truthound/datadocs/__init__.py +260 -0
- truthound/datadocs/base.py +571 -0
- truthound/datadocs/builder.py +761 -0
- truthound/datadocs/charts.py +764 -0
- truthound/datadocs/dashboard/__init__.py +63 -0
- truthound/datadocs/dashboard/app.py +576 -0
- truthound/datadocs/dashboard/components.py +584 -0
- truthound/datadocs/dashboard/state.py +240 -0
- truthound/datadocs/engine/__init__.py +46 -0
- truthound/datadocs/engine/context.py +376 -0
- truthound/datadocs/engine/pipeline.py +618 -0
- truthound/datadocs/engine/registry.py +469 -0
- truthound/datadocs/exporters/__init__.py +49 -0
- truthound/datadocs/exporters/base.py +198 -0
- truthound/datadocs/exporters/html.py +178 -0
- truthound/datadocs/exporters/json_exporter.py +253 -0
- truthound/datadocs/exporters/markdown.py +284 -0
- truthound/datadocs/exporters/pdf.py +392 -0
- truthound/datadocs/i18n/__init__.py +86 -0
- truthound/datadocs/i18n/catalog.py +960 -0
- truthound/datadocs/i18n/formatting.py +505 -0
- truthound/datadocs/i18n/loader.py +256 -0
- truthound/datadocs/i18n/plurals.py +378 -0
- truthound/datadocs/renderers/__init__.py +42 -0
- truthound/datadocs/renderers/base.py +401 -0
- truthound/datadocs/renderers/custom.py +342 -0
- truthound/datadocs/renderers/jinja.py +697 -0
- truthound/datadocs/sections.py +736 -0
- truthound/datadocs/styles.py +931 -0
- truthound/datadocs/themes/__init__.py +101 -0
- truthound/datadocs/themes/base.py +336 -0
- truthound/datadocs/themes/default.py +417 -0
- truthound/datadocs/themes/enterprise.py +419 -0
- truthound/datadocs/themes/loader.py +336 -0
- truthound/datadocs/themes.py +301 -0
- truthound/datadocs/transformers/__init__.py +57 -0
- truthound/datadocs/transformers/base.py +268 -0
- truthound/datadocs/transformers/enrichers.py +544 -0
- truthound/datadocs/transformers/filters.py +447 -0
- truthound/datadocs/transformers/i18n.py +468 -0
- truthound/datadocs/versioning/__init__.py +62 -0
- truthound/datadocs/versioning/diff.py +639 -0
- truthound/datadocs/versioning/storage.py +497 -0
- truthound/datadocs/versioning/version.py +358 -0
- truthound/datasources/__init__.py +223 -0
- truthound/datasources/_async_protocols.py +222 -0
- truthound/datasources/_protocols.py +159 -0
- truthound/datasources/adapters.py +428 -0
- truthound/datasources/async_base.py +599 -0
- truthound/datasources/async_factory.py +511 -0
- truthound/datasources/base.py +516 -0
- truthound/datasources/factory.py +433 -0
- truthound/datasources/nosql/__init__.py +47 -0
- truthound/datasources/nosql/base.py +487 -0
- truthound/datasources/nosql/elasticsearch.py +801 -0
- truthound/datasources/nosql/mongodb.py +636 -0
- truthound/datasources/pandas_optimized.py +582 -0
- truthound/datasources/pandas_source.py +216 -0
- truthound/datasources/polars_source.py +395 -0
- truthound/datasources/spark_source.py +479 -0
- truthound/datasources/sql/__init__.py +154 -0
- truthound/datasources/sql/base.py +710 -0
- truthound/datasources/sql/bigquery.py +410 -0
- truthound/datasources/sql/cloud_base.py +199 -0
- truthound/datasources/sql/databricks.py +471 -0
- truthound/datasources/sql/mysql.py +316 -0
- truthound/datasources/sql/oracle.py +427 -0
- truthound/datasources/sql/postgresql.py +321 -0
- truthound/datasources/sql/redshift.py +479 -0
- truthound/datasources/sql/snowflake.py +439 -0
- truthound/datasources/sql/sqlite.py +286 -0
- truthound/datasources/sql/sqlserver.py +437 -0
- truthound/datasources/streaming/__init__.py +47 -0
- truthound/datasources/streaming/base.py +350 -0
- truthound/datasources/streaming/kafka.py +670 -0
- truthound/decorators.py +98 -0
- truthound/docs/__init__.py +69 -0
- truthound/docs/extractor.py +971 -0
- truthound/docs/generator.py +601 -0
- truthound/docs/parser.py +1037 -0
- truthound/docs/renderer.py +999 -0
- truthound/drift/__init__.py +22 -0
- truthound/drift/compare.py +189 -0
- truthound/drift/detectors.py +464 -0
- truthound/drift/report.py +160 -0
- truthound/execution/__init__.py +65 -0
- truthound/execution/_protocols.py +324 -0
- truthound/execution/base.py +576 -0
- truthound/execution/distributed/__init__.py +179 -0
- truthound/execution/distributed/aggregations.py +731 -0
- truthound/execution/distributed/arrow_bridge.py +817 -0
- truthound/execution/distributed/base.py +550 -0
- truthound/execution/distributed/dask_engine.py +976 -0
- truthound/execution/distributed/mixins.py +766 -0
- truthound/execution/distributed/protocols.py +756 -0
- truthound/execution/distributed/ray_engine.py +1127 -0
- truthound/execution/distributed/registry.py +446 -0
- truthound/execution/distributed/spark_engine.py +1011 -0
- truthound/execution/distributed/validator_adapter.py +682 -0
- truthound/execution/pandas_engine.py +401 -0
- truthound/execution/polars_engine.py +497 -0
- truthound/execution/pushdown/__init__.py +230 -0
- truthound/execution/pushdown/ast.py +1550 -0
- truthound/execution/pushdown/builder.py +1550 -0
- truthound/execution/pushdown/dialects.py +1072 -0
- truthound/execution/pushdown/executor.py +829 -0
- truthound/execution/pushdown/optimizer.py +1041 -0
- truthound/execution/sql_engine.py +518 -0
- truthound/infrastructure/__init__.py +189 -0
- truthound/infrastructure/audit.py +1515 -0
- truthound/infrastructure/config.py +1133 -0
- truthound/infrastructure/encryption.py +1132 -0
- truthound/infrastructure/logging.py +1503 -0
- truthound/infrastructure/metrics.py +1220 -0
- truthound/lineage/__init__.py +89 -0
- truthound/lineage/base.py +746 -0
- truthound/lineage/impact_analysis.py +474 -0
- truthound/lineage/integrations/__init__.py +22 -0
- truthound/lineage/integrations/openlineage.py +548 -0
- truthound/lineage/tracker.py +512 -0
- truthound/lineage/visualization/__init__.py +33 -0
- truthound/lineage/visualization/protocols.py +145 -0
- truthound/lineage/visualization/renderers/__init__.py +20 -0
- truthound/lineage/visualization/renderers/cytoscape.py +329 -0
- truthound/lineage/visualization/renderers/d3.py +331 -0
- truthound/lineage/visualization/renderers/graphviz.py +276 -0
- truthound/lineage/visualization/renderers/mermaid.py +308 -0
- truthound/maskers.py +113 -0
- truthound/ml/__init__.py +124 -0
- truthound/ml/anomaly_models/__init__.py +31 -0
- truthound/ml/anomaly_models/ensemble.py +362 -0
- truthound/ml/anomaly_models/isolation_forest.py +444 -0
- truthound/ml/anomaly_models/statistical.py +392 -0
- truthound/ml/base.py +1178 -0
- truthound/ml/drift_detection/__init__.py +26 -0
- truthound/ml/drift_detection/concept.py +381 -0
- truthound/ml/drift_detection/distribution.py +361 -0
- truthound/ml/drift_detection/feature.py +442 -0
- truthound/ml/drift_detection/multivariate.py +495 -0
- truthound/ml/monitoring/__init__.py +88 -0
- truthound/ml/monitoring/alerting/__init__.py +33 -0
- truthound/ml/monitoring/alerting/handlers.py +427 -0
- truthound/ml/monitoring/alerting/rules.py +508 -0
- truthound/ml/monitoring/collectors/__init__.py +19 -0
- truthound/ml/monitoring/collectors/composite.py +105 -0
- truthound/ml/monitoring/collectors/drift.py +324 -0
- truthound/ml/monitoring/collectors/performance.py +179 -0
- truthound/ml/monitoring/collectors/quality.py +369 -0
- truthound/ml/monitoring/monitor.py +536 -0
- truthound/ml/monitoring/protocols.py +451 -0
- truthound/ml/monitoring/stores/__init__.py +15 -0
- truthound/ml/monitoring/stores/memory.py +201 -0
- truthound/ml/monitoring/stores/prometheus.py +296 -0
- truthound/ml/rule_learning/__init__.py +25 -0
- truthound/ml/rule_learning/constraint_miner.py +443 -0
- truthound/ml/rule_learning/pattern_learner.py +499 -0
- truthound/ml/rule_learning/profile_learner.py +462 -0
- truthound/multitenancy/__init__.py +326 -0
- truthound/multitenancy/core.py +852 -0
- truthound/multitenancy/integration.py +597 -0
- truthound/multitenancy/isolation.py +630 -0
- truthound/multitenancy/manager.py +770 -0
- truthound/multitenancy/middleware.py +765 -0
- truthound/multitenancy/quota.py +537 -0
- truthound/multitenancy/resolvers.py +603 -0
- truthound/multitenancy/storage.py +703 -0
- truthound/observability/__init__.py +307 -0
- truthound/observability/context.py +531 -0
- truthound/observability/instrumentation.py +611 -0
- truthound/observability/logging.py +887 -0
- truthound/observability/metrics.py +1157 -0
- truthound/observability/tracing/__init__.py +178 -0
- truthound/observability/tracing/baggage.py +310 -0
- truthound/observability/tracing/config.py +426 -0
- truthound/observability/tracing/exporter.py +787 -0
- truthound/observability/tracing/integration.py +1018 -0
- truthound/observability/tracing/otel/__init__.py +146 -0
- truthound/observability/tracing/otel/adapter.py +982 -0
- truthound/observability/tracing/otel/bridge.py +1177 -0
- truthound/observability/tracing/otel/compat.py +681 -0
- truthound/observability/tracing/otel/config.py +691 -0
- truthound/observability/tracing/otel/detection.py +327 -0
- truthound/observability/tracing/otel/protocols.py +426 -0
- truthound/observability/tracing/processor.py +561 -0
- truthound/observability/tracing/propagator.py +757 -0
- truthound/observability/tracing/provider.py +569 -0
- truthound/observability/tracing/resource.py +515 -0
- truthound/observability/tracing/sampler.py +487 -0
- truthound/observability/tracing/span.py +676 -0
- truthound/plugins/__init__.py +198 -0
- truthound/plugins/base.py +599 -0
- truthound/plugins/cli.py +680 -0
- truthound/plugins/dependencies/__init__.py +42 -0
- truthound/plugins/dependencies/graph.py +422 -0
- truthound/plugins/dependencies/resolver.py +417 -0
- truthound/plugins/discovery.py +379 -0
- truthound/plugins/docs/__init__.py +46 -0
- truthound/plugins/docs/extractor.py +444 -0
- truthound/plugins/docs/renderer.py +499 -0
- truthound/plugins/enterprise_manager.py +877 -0
- truthound/plugins/examples/__init__.py +19 -0
- truthound/plugins/examples/custom_validators.py +317 -0
- truthound/plugins/examples/slack_notifier.py +312 -0
- truthound/plugins/examples/xml_reporter.py +254 -0
- truthound/plugins/hooks.py +558 -0
- truthound/plugins/lifecycle/__init__.py +43 -0
- truthound/plugins/lifecycle/hot_reload.py +402 -0
- truthound/plugins/lifecycle/manager.py +371 -0
- truthound/plugins/manager.py +736 -0
- truthound/plugins/registry.py +338 -0
- truthound/plugins/security/__init__.py +93 -0
- truthound/plugins/security/exceptions.py +332 -0
- truthound/plugins/security/policies.py +348 -0
- truthound/plugins/security/protocols.py +643 -0
- truthound/plugins/security/sandbox/__init__.py +45 -0
- truthound/plugins/security/sandbox/context.py +158 -0
- truthound/plugins/security/sandbox/engines/__init__.py +19 -0
- truthound/plugins/security/sandbox/engines/container.py +379 -0
- truthound/plugins/security/sandbox/engines/noop.py +144 -0
- truthound/plugins/security/sandbox/engines/process.py +336 -0
- truthound/plugins/security/sandbox/factory.py +211 -0
- truthound/plugins/security/signing/__init__.py +57 -0
- truthound/plugins/security/signing/service.py +330 -0
- truthound/plugins/security/signing/trust_store.py +368 -0
- truthound/plugins/security/signing/verifier.py +459 -0
- truthound/plugins/versioning/__init__.py +41 -0
- truthound/plugins/versioning/constraints.py +297 -0
- truthound/plugins/versioning/resolver.py +329 -0
- truthound/profiler/__init__.py +1729 -0
- truthound/profiler/_lazy.py +452 -0
- truthound/profiler/ab_testing/__init__.py +80 -0
- truthound/profiler/ab_testing/analysis.py +449 -0
- truthound/profiler/ab_testing/base.py +257 -0
- truthound/profiler/ab_testing/experiment.py +395 -0
- truthound/profiler/ab_testing/tracking.py +368 -0
- truthound/profiler/auto_threshold.py +1170 -0
- truthound/profiler/base.py +579 -0
- truthound/profiler/cache_patterns.py +911 -0
- truthound/profiler/caching.py +1303 -0
- truthound/profiler/column_profiler.py +712 -0
- truthound/profiler/comparison.py +1007 -0
- truthound/profiler/custom_patterns.py +1170 -0
- truthound/profiler/dashboard/__init__.py +50 -0
- truthound/profiler/dashboard/app.py +476 -0
- truthound/profiler/dashboard/components.py +457 -0
- truthound/profiler/dashboard/config.py +72 -0
- truthound/profiler/distributed/__init__.py +83 -0
- truthound/profiler/distributed/base.py +281 -0
- truthound/profiler/distributed/dask_backend.py +498 -0
- truthound/profiler/distributed/local_backend.py +293 -0
- truthound/profiler/distributed/profiler.py +304 -0
- truthound/profiler/distributed/ray_backend.py +374 -0
- truthound/profiler/distributed/spark_backend.py +375 -0
- truthound/profiler/distributed.py +1366 -0
- truthound/profiler/enterprise_sampling.py +1065 -0
- truthound/profiler/errors.py +488 -0
- truthound/profiler/evolution/__init__.py +91 -0
- truthound/profiler/evolution/alerts.py +426 -0
- truthound/profiler/evolution/changes.py +206 -0
- truthound/profiler/evolution/compatibility.py +365 -0
- truthound/profiler/evolution/detector.py +372 -0
- truthound/profiler/evolution/protocols.py +121 -0
- truthound/profiler/generators/__init__.py +48 -0
- truthound/profiler/generators/base.py +384 -0
- truthound/profiler/generators/ml_rules.py +375 -0
- truthound/profiler/generators/pattern_rules.py +384 -0
- truthound/profiler/generators/schema_rules.py +267 -0
- truthound/profiler/generators/stats_rules.py +324 -0
- truthound/profiler/generators/suite_generator.py +857 -0
- truthound/profiler/i18n.py +1542 -0
- truthound/profiler/incremental.py +554 -0
- truthound/profiler/incremental_validation.py +1710 -0
- truthound/profiler/integration/__init__.py +73 -0
- truthound/profiler/integration/adapters.py +345 -0
- truthound/profiler/integration/context.py +371 -0
- truthound/profiler/integration/executor.py +527 -0
- truthound/profiler/integration/naming.py +75 -0
- truthound/profiler/integration/protocols.py +243 -0
- truthound/profiler/memory.py +1185 -0
- truthound/profiler/migration/__init__.py +60 -0
- truthound/profiler/migration/base.py +345 -0
- truthound/profiler/migration/manager.py +444 -0
- truthound/profiler/migration/v1_0_to_v1_1.py +484 -0
- truthound/profiler/ml/__init__.py +73 -0
- truthound/profiler/ml/base.py +244 -0
- truthound/profiler/ml/classifier.py +507 -0
- truthound/profiler/ml/feature_extraction.py +604 -0
- truthound/profiler/ml/pretrained.py +448 -0
- truthound/profiler/ml_inference.py +1276 -0
- truthound/profiler/native_patterns.py +815 -0
- truthound/profiler/observability.py +1184 -0
- truthound/profiler/process_timeout.py +1566 -0
- truthound/profiler/progress.py +568 -0
- truthound/profiler/progress_callbacks.py +1734 -0
- truthound/profiler/quality.py +1345 -0
- truthound/profiler/resilience.py +1180 -0
- truthound/profiler/sampled_matcher.py +794 -0
- truthound/profiler/sampling.py +1288 -0
- truthound/profiler/scheduling/__init__.py +82 -0
- truthound/profiler/scheduling/protocols.py +214 -0
- truthound/profiler/scheduling/scheduler.py +474 -0
- truthound/profiler/scheduling/storage.py +457 -0
- truthound/profiler/scheduling/triggers.py +449 -0
- truthound/profiler/schema.py +603 -0
- truthound/profiler/streaming.py +685 -0
- truthound/profiler/streaming_patterns.py +1354 -0
- truthound/profiler/suite_cli.py +625 -0
- truthound/profiler/suite_config.py +789 -0
- truthound/profiler/suite_export.py +1268 -0
- truthound/profiler/table_profiler.py +547 -0
- truthound/profiler/timeout.py +565 -0
- truthound/profiler/validation.py +1532 -0
- truthound/profiler/visualization/__init__.py +118 -0
- truthound/profiler/visualization/base.py +346 -0
- truthound/profiler/visualization/generator.py +1259 -0
- truthound/profiler/visualization/plotly_renderer.py +811 -0
- truthound/profiler/visualization/renderers.py +669 -0
- truthound/profiler/visualization/sections.py +540 -0
- truthound/profiler/visualization.py +2122 -0
- truthound/profiler/yaml_validation.py +1151 -0
- truthound/py.typed +0 -0
- truthound/ratelimit/__init__.py +248 -0
- truthound/ratelimit/algorithms.py +1108 -0
- truthound/ratelimit/core.py +573 -0
- truthound/ratelimit/integration.py +532 -0
- truthound/ratelimit/limiter.py +663 -0
- truthound/ratelimit/middleware.py +700 -0
- truthound/ratelimit/policy.py +792 -0
- truthound/ratelimit/storage.py +763 -0
- truthound/rbac/__init__.py +340 -0
- truthound/rbac/core.py +976 -0
- truthound/rbac/integration.py +760 -0
- truthound/rbac/manager.py +1052 -0
- truthound/rbac/middleware.py +842 -0
- truthound/rbac/policy.py +954 -0
- truthound/rbac/storage.py +878 -0
- truthound/realtime/__init__.py +141 -0
- truthound/realtime/adapters/__init__.py +43 -0
- truthound/realtime/adapters/base.py +533 -0
- truthound/realtime/adapters/kafka.py +487 -0
- truthound/realtime/adapters/kinesis.py +479 -0
- truthound/realtime/adapters/mock.py +243 -0
- truthound/realtime/base.py +553 -0
- truthound/realtime/factory.py +382 -0
- truthound/realtime/incremental.py +660 -0
- truthound/realtime/processing/__init__.py +67 -0
- truthound/realtime/processing/exactly_once.py +575 -0
- truthound/realtime/processing/state.py +547 -0
- truthound/realtime/processing/windows.py +647 -0
- truthound/realtime/protocols.py +569 -0
- truthound/realtime/streaming.py +605 -0
- truthound/realtime/testing/__init__.py +32 -0
- truthound/realtime/testing/containers.py +615 -0
- truthound/realtime/testing/fixtures.py +484 -0
- truthound/report.py +280 -0
- truthound/reporters/__init__.py +46 -0
- truthound/reporters/_protocols.py +30 -0
- truthound/reporters/base.py +324 -0
- truthound/reporters/ci/__init__.py +66 -0
- truthound/reporters/ci/azure.py +436 -0
- truthound/reporters/ci/base.py +509 -0
- truthound/reporters/ci/bitbucket.py +567 -0
- truthound/reporters/ci/circleci.py +547 -0
- truthound/reporters/ci/detection.py +364 -0
- truthound/reporters/ci/factory.py +182 -0
- truthound/reporters/ci/github.py +388 -0
- truthound/reporters/ci/gitlab.py +471 -0
- truthound/reporters/ci/jenkins.py +525 -0
- truthound/reporters/console_reporter.py +299 -0
- truthound/reporters/factory.py +211 -0
- truthound/reporters/html_reporter.py +524 -0
- truthound/reporters/json_reporter.py +256 -0
- truthound/reporters/markdown_reporter.py +280 -0
- truthound/reporters/sdk/__init__.py +174 -0
- truthound/reporters/sdk/builder.py +558 -0
- truthound/reporters/sdk/mixins.py +1150 -0
- truthound/reporters/sdk/schema.py +1493 -0
- truthound/reporters/sdk/templates.py +666 -0
- truthound/reporters/sdk/testing.py +968 -0
- truthound/scanners.py +170 -0
- truthound/scheduling/__init__.py +122 -0
- truthound/scheduling/cron.py +1136 -0
- truthound/scheduling/presets.py +212 -0
- truthound/schema.py +275 -0
- truthound/secrets/__init__.py +173 -0
- truthound/secrets/base.py +618 -0
- truthound/secrets/cloud.py +682 -0
- truthound/secrets/integration.py +507 -0
- truthound/secrets/manager.py +633 -0
- truthound/secrets/oidc/__init__.py +172 -0
- truthound/secrets/oidc/base.py +902 -0
- truthound/secrets/oidc/credential_provider.py +623 -0
- truthound/secrets/oidc/exchangers.py +1001 -0
- truthound/secrets/oidc/github/__init__.py +110 -0
- truthound/secrets/oidc/github/claims.py +718 -0
- truthound/secrets/oidc/github/enhanced_provider.py +693 -0
- truthound/secrets/oidc/github/trust_policy.py +742 -0
- truthound/secrets/oidc/github/verification.py +723 -0
- truthound/secrets/oidc/github/workflow.py +691 -0
- truthound/secrets/oidc/providers.py +825 -0
- truthound/secrets/providers.py +506 -0
- truthound/secrets/resolver.py +495 -0
- truthound/stores/__init__.py +177 -0
- truthound/stores/backends/__init__.py +18 -0
- truthound/stores/backends/_protocols.py +340 -0
- truthound/stores/backends/azure_blob.py +530 -0
- truthound/stores/backends/concurrent_filesystem.py +915 -0
- truthound/stores/backends/connection_pool.py +1365 -0
- truthound/stores/backends/database.py +743 -0
- truthound/stores/backends/filesystem.py +538 -0
- truthound/stores/backends/gcs.py +399 -0
- truthound/stores/backends/memory.py +354 -0
- truthound/stores/backends/s3.py +434 -0
- truthound/stores/backpressure/__init__.py +84 -0
- truthound/stores/backpressure/base.py +375 -0
- truthound/stores/backpressure/circuit_breaker.py +434 -0
- truthound/stores/backpressure/monitor.py +376 -0
- truthound/stores/backpressure/strategies.py +677 -0
- truthound/stores/base.py +551 -0
- truthound/stores/batching/__init__.py +65 -0
- truthound/stores/batching/base.py +305 -0
- truthound/stores/batching/buffer.py +370 -0
- truthound/stores/batching/store.py +248 -0
- truthound/stores/batching/writer.py +521 -0
- truthound/stores/caching/__init__.py +60 -0
- truthound/stores/caching/backends.py +684 -0
- truthound/stores/caching/base.py +356 -0
- truthound/stores/caching/store.py +305 -0
- truthound/stores/compression/__init__.py +193 -0
- truthound/stores/compression/adaptive.py +694 -0
- truthound/stores/compression/base.py +514 -0
- truthound/stores/compression/pipeline.py +868 -0
- truthound/stores/compression/providers.py +672 -0
- truthound/stores/compression/streaming.py +832 -0
- truthound/stores/concurrency/__init__.py +81 -0
- truthound/stores/concurrency/atomic.py +556 -0
- truthound/stores/concurrency/index.py +775 -0
- truthound/stores/concurrency/locks.py +576 -0
- truthound/stores/concurrency/manager.py +482 -0
- truthound/stores/encryption/__init__.py +297 -0
- truthound/stores/encryption/base.py +952 -0
- truthound/stores/encryption/keys.py +1191 -0
- truthound/stores/encryption/pipeline.py +903 -0
- truthound/stores/encryption/providers.py +953 -0
- truthound/stores/encryption/streaming.py +950 -0
- truthound/stores/expectations.py +227 -0
- truthound/stores/factory.py +246 -0
- truthound/stores/migration/__init__.py +75 -0
- truthound/stores/migration/base.py +480 -0
- truthound/stores/migration/manager.py +347 -0
- truthound/stores/migration/registry.py +382 -0
- truthound/stores/migration/store.py +559 -0
- truthound/stores/observability/__init__.py +106 -0
- truthound/stores/observability/audit.py +718 -0
- truthound/stores/observability/config.py +270 -0
- truthound/stores/observability/factory.py +208 -0
- truthound/stores/observability/metrics.py +636 -0
- truthound/stores/observability/protocols.py +410 -0
- truthound/stores/observability/store.py +570 -0
- truthound/stores/observability/tracing.py +784 -0
- truthound/stores/replication/__init__.py +76 -0
- truthound/stores/replication/base.py +260 -0
- truthound/stores/replication/monitor.py +269 -0
- truthound/stores/replication/store.py +439 -0
- truthound/stores/replication/syncer.py +391 -0
- truthound/stores/results.py +359 -0
- truthound/stores/retention/__init__.py +77 -0
- truthound/stores/retention/base.py +378 -0
- truthound/stores/retention/policies.py +621 -0
- truthound/stores/retention/scheduler.py +279 -0
- truthound/stores/retention/store.py +526 -0
- truthound/stores/streaming/__init__.py +138 -0
- truthound/stores/streaming/base.py +801 -0
- truthound/stores/streaming/database.py +984 -0
- truthound/stores/streaming/filesystem.py +719 -0
- truthound/stores/streaming/reader.py +629 -0
- truthound/stores/streaming/s3.py +843 -0
- truthound/stores/streaming/writer.py +790 -0
- truthound/stores/tiering/__init__.py +108 -0
- truthound/stores/tiering/base.py +462 -0
- truthound/stores/tiering/manager.py +249 -0
- truthound/stores/tiering/policies.py +692 -0
- truthound/stores/tiering/store.py +526 -0
- truthound/stores/versioning/__init__.py +56 -0
- truthound/stores/versioning/base.py +376 -0
- truthound/stores/versioning/store.py +660 -0
- truthound/stores/versioning/strategies.py +353 -0
- truthound/types.py +56 -0
- truthound/validators/__init__.py +774 -0
- truthound/validators/aggregate/__init__.py +27 -0
- truthound/validators/aggregate/central.py +116 -0
- truthound/validators/aggregate/extremes.py +116 -0
- truthound/validators/aggregate/spread.py +118 -0
- truthound/validators/aggregate/sum.py +64 -0
- truthound/validators/aggregate/type.py +78 -0
- truthound/validators/anomaly/__init__.py +93 -0
- truthound/validators/anomaly/base.py +431 -0
- truthound/validators/anomaly/ml_based.py +1190 -0
- truthound/validators/anomaly/multivariate.py +647 -0
- truthound/validators/anomaly/statistical.py +599 -0
- truthound/validators/base.py +1089 -0
- truthound/validators/business_rule/__init__.py +46 -0
- truthound/validators/business_rule/base.py +147 -0
- truthound/validators/business_rule/checksum.py +509 -0
- truthound/validators/business_rule/financial.py +526 -0
- truthound/validators/cache.py +733 -0
- truthound/validators/completeness/__init__.py +39 -0
- truthound/validators/completeness/conditional.py +73 -0
- truthound/validators/completeness/default.py +98 -0
- truthound/validators/completeness/empty.py +103 -0
- truthound/validators/completeness/nan.py +337 -0
- truthound/validators/completeness/null.py +152 -0
- truthound/validators/cross_table/__init__.py +17 -0
- truthound/validators/cross_table/aggregate.py +333 -0
- truthound/validators/cross_table/row_count.py +122 -0
- truthound/validators/datetime/__init__.py +29 -0
- truthound/validators/datetime/format.py +78 -0
- truthound/validators/datetime/freshness.py +269 -0
- truthound/validators/datetime/order.py +73 -0
- truthound/validators/datetime/parseable.py +185 -0
- truthound/validators/datetime/range.py +202 -0
- truthound/validators/datetime/timezone.py +69 -0
- truthound/validators/distribution/__init__.py +49 -0
- truthound/validators/distribution/distribution.py +128 -0
- truthound/validators/distribution/monotonic.py +119 -0
- truthound/validators/distribution/outlier.py +178 -0
- truthound/validators/distribution/quantile.py +80 -0
- truthound/validators/distribution/range.py +254 -0
- truthound/validators/distribution/set.py +125 -0
- truthound/validators/distribution/statistical.py +459 -0
- truthound/validators/drift/__init__.py +79 -0
- truthound/validators/drift/base.py +427 -0
- truthound/validators/drift/multi_feature.py +401 -0
- truthound/validators/drift/numeric.py +395 -0
- truthound/validators/drift/psi.py +446 -0
- truthound/validators/drift/statistical.py +510 -0
- truthound/validators/enterprise.py +1658 -0
- truthound/validators/geospatial/__init__.py +80 -0
- truthound/validators/geospatial/base.py +97 -0
- truthound/validators/geospatial/boundary.py +238 -0
- truthound/validators/geospatial/coordinate.py +351 -0
- truthound/validators/geospatial/distance.py +399 -0
- truthound/validators/geospatial/polygon.py +665 -0
- truthound/validators/i18n/__init__.py +308 -0
- truthound/validators/i18n/bidi.py +571 -0
- truthound/validators/i18n/catalogs.py +570 -0
- truthound/validators/i18n/dialects.py +763 -0
- truthound/validators/i18n/extended_catalogs.py +549 -0
- truthound/validators/i18n/formatting.py +1434 -0
- truthound/validators/i18n/loader.py +1020 -0
- truthound/validators/i18n/messages.py +521 -0
- truthound/validators/i18n/plural.py +683 -0
- truthound/validators/i18n/protocols.py +855 -0
- truthound/validators/i18n/tms.py +1162 -0
- truthound/validators/localization/__init__.py +53 -0
- truthound/validators/localization/base.py +122 -0
- truthound/validators/localization/chinese.py +362 -0
- truthound/validators/localization/japanese.py +275 -0
- truthound/validators/localization/korean.py +524 -0
- truthound/validators/memory/__init__.py +94 -0
- truthound/validators/memory/approximate_knn.py +506 -0
- truthound/validators/memory/base.py +547 -0
- truthound/validators/memory/sgd_online.py +719 -0
- truthound/validators/memory/streaming_ecdf.py +753 -0
- truthound/validators/ml_feature/__init__.py +54 -0
- truthound/validators/ml_feature/base.py +249 -0
- truthound/validators/ml_feature/correlation.py +299 -0
- truthound/validators/ml_feature/leakage.py +344 -0
- truthound/validators/ml_feature/null_impact.py +270 -0
- truthound/validators/ml_feature/scale.py +264 -0
- truthound/validators/multi_column/__init__.py +89 -0
- truthound/validators/multi_column/arithmetic.py +284 -0
- truthound/validators/multi_column/base.py +231 -0
- truthound/validators/multi_column/comparison.py +273 -0
- truthound/validators/multi_column/consistency.py +312 -0
- truthound/validators/multi_column/statistical.py +299 -0
- truthound/validators/optimization/__init__.py +164 -0
- truthound/validators/optimization/aggregation.py +563 -0
- truthound/validators/optimization/covariance.py +556 -0
- truthound/validators/optimization/geo.py +626 -0
- truthound/validators/optimization/graph.py +587 -0
- truthound/validators/optimization/orchestrator.py +970 -0
- truthound/validators/optimization/profiling.py +1312 -0
- truthound/validators/privacy/__init__.py +223 -0
- truthound/validators/privacy/base.py +635 -0
- truthound/validators/privacy/ccpa.py +670 -0
- truthound/validators/privacy/gdpr.py +728 -0
- truthound/validators/privacy/global_patterns.py +604 -0
- truthound/validators/privacy/plugins.py +867 -0
- truthound/validators/profiling/__init__.py +52 -0
- truthound/validators/profiling/base.py +175 -0
- truthound/validators/profiling/cardinality.py +312 -0
- truthound/validators/profiling/entropy.py +391 -0
- truthound/validators/profiling/frequency.py +455 -0
- truthound/validators/pushdown_support.py +660 -0
- truthound/validators/query/__init__.py +91 -0
- truthound/validators/query/aggregate.py +346 -0
- truthound/validators/query/base.py +246 -0
- truthound/validators/query/column.py +249 -0
- truthound/validators/query/expression.py +274 -0
- truthound/validators/query/result.py +323 -0
- truthound/validators/query/row_count.py +264 -0
- truthound/validators/referential/__init__.py +80 -0
- truthound/validators/referential/base.py +395 -0
- truthound/validators/referential/cascade.py +391 -0
- truthound/validators/referential/circular.py +563 -0
- truthound/validators/referential/foreign_key.py +624 -0
- truthound/validators/referential/orphan.py +485 -0
- truthound/validators/registry.py +112 -0
- truthound/validators/schema/__init__.py +41 -0
- truthound/validators/schema/column_count.py +142 -0
- truthound/validators/schema/column_exists.py +80 -0
- truthound/validators/schema/column_order.py +82 -0
- truthound/validators/schema/column_pair.py +85 -0
- truthound/validators/schema/column_pair_set.py +195 -0
- truthound/validators/schema/column_type.py +94 -0
- truthound/validators/schema/multi_column.py +53 -0
- truthound/validators/schema/multi_column_aggregate.py +175 -0
- truthound/validators/schema/referential.py +274 -0
- truthound/validators/schema/table_schema.py +91 -0
- truthound/validators/schema_validator.py +219 -0
- truthound/validators/sdk/__init__.py +250 -0
- truthound/validators/sdk/builder.py +680 -0
- truthound/validators/sdk/decorators.py +474 -0
- truthound/validators/sdk/enterprise/__init__.py +211 -0
- truthound/validators/sdk/enterprise/docs.py +725 -0
- truthound/validators/sdk/enterprise/fuzzing.py +659 -0
- truthound/validators/sdk/enterprise/licensing.py +709 -0
- truthound/validators/sdk/enterprise/manager.py +543 -0
- truthound/validators/sdk/enterprise/resources.py +628 -0
- truthound/validators/sdk/enterprise/sandbox.py +766 -0
- truthound/validators/sdk/enterprise/signing.py +603 -0
- truthound/validators/sdk/enterprise/templates.py +865 -0
- truthound/validators/sdk/enterprise/versioning.py +659 -0
- truthound/validators/sdk/templates.py +757 -0
- truthound/validators/sdk/testing.py +807 -0
- truthound/validators/security/__init__.py +181 -0
- truthound/validators/security/redos/__init__.py +182 -0
- truthound/validators/security/redos/core.py +861 -0
- truthound/validators/security/redos/cpu_monitor.py +593 -0
- truthound/validators/security/redos/cve_database.py +791 -0
- truthound/validators/security/redos/ml/__init__.py +155 -0
- truthound/validators/security/redos/ml/base.py +785 -0
- truthound/validators/security/redos/ml/datasets.py +618 -0
- truthound/validators/security/redos/ml/features.py +359 -0
- truthound/validators/security/redos/ml/models.py +1000 -0
- truthound/validators/security/redos/ml/predictor.py +507 -0
- truthound/validators/security/redos/ml/storage.py +632 -0
- truthound/validators/security/redos/ml/training.py +571 -0
- truthound/validators/security/redos/ml_analyzer.py +937 -0
- truthound/validators/security/redos/optimizer.py +674 -0
- truthound/validators/security/redos/profiler.py +682 -0
- truthound/validators/security/redos/re2_engine.py +709 -0
- truthound/validators/security/redos.py +886 -0
- truthound/validators/security/sql_security.py +1247 -0
- truthound/validators/streaming/__init__.py +126 -0
- truthound/validators/streaming/base.py +292 -0
- truthound/validators/streaming/completeness.py +210 -0
- truthound/validators/streaming/mixin.py +575 -0
- truthound/validators/streaming/range.py +308 -0
- truthound/validators/streaming/sources.py +846 -0
- truthound/validators/string/__init__.py +57 -0
- truthound/validators/string/casing.py +158 -0
- truthound/validators/string/charset.py +96 -0
- truthound/validators/string/format.py +501 -0
- truthound/validators/string/json.py +77 -0
- truthound/validators/string/json_schema.py +184 -0
- truthound/validators/string/length.py +104 -0
- truthound/validators/string/like_pattern.py +237 -0
- truthound/validators/string/regex.py +202 -0
- truthound/validators/string/regex_extended.py +435 -0
- truthound/validators/table/__init__.py +88 -0
- truthound/validators/table/base.py +78 -0
- truthound/validators/table/column_count.py +198 -0
- truthound/validators/table/freshness.py +362 -0
- truthound/validators/table/row_count.py +251 -0
- truthound/validators/table/schema.py +333 -0
- truthound/validators/table/size.py +285 -0
- truthound/validators/timeout/__init__.py +102 -0
- truthound/validators/timeout/advanced/__init__.py +247 -0
- truthound/validators/timeout/advanced/circuit_breaker.py +675 -0
- truthound/validators/timeout/advanced/prediction.py +773 -0
- truthound/validators/timeout/advanced/priority.py +618 -0
- truthound/validators/timeout/advanced/redis_backend.py +770 -0
- truthound/validators/timeout/advanced/retry.py +721 -0
- truthound/validators/timeout/advanced/sampling.py +788 -0
- truthound/validators/timeout/advanced/sla.py +661 -0
- truthound/validators/timeout/advanced/telemetry.py +804 -0
- truthound/validators/timeout/cascade.py +477 -0
- truthound/validators/timeout/deadline.py +657 -0
- truthound/validators/timeout/degradation.py +525 -0
- truthound/validators/timeout/distributed.py +597 -0
- truthound/validators/timeseries/__init__.py +89 -0
- truthound/validators/timeseries/base.py +326 -0
- truthound/validators/timeseries/completeness.py +617 -0
- truthound/validators/timeseries/gap.py +485 -0
- truthound/validators/timeseries/monotonic.py +310 -0
- truthound/validators/timeseries/seasonality.py +422 -0
- truthound/validators/timeseries/trend.py +510 -0
- truthound/validators/uniqueness/__init__.py +59 -0
- truthound/validators/uniqueness/approximate.py +475 -0
- truthound/validators/uniqueness/distinct_values.py +253 -0
- truthound/validators/uniqueness/duplicate.py +118 -0
- truthound/validators/uniqueness/primary_key.py +140 -0
- truthound/validators/uniqueness/unique.py +191 -0
- truthound/validators/uniqueness/within_record.py +599 -0
- truthound/validators/utils.py +756 -0
- truthound-1.0.8.dist-info/METADATA +474 -0
- truthound-1.0.8.dist-info/RECORD +877 -0
- truthound-1.0.8.dist-info/WHEEL +4 -0
- truthound-1.0.8.dist-info/entry_points.txt +2 -0
- truthound-1.0.8.dist-info/licenses/LICENSE +190 -0
@@ -0,0 +1,1190 @@

"""Machine learning based anomaly detection validators.

These validators use scikit-learn for advanced anomaly detection.
Requires: pip install truthound[anomaly] (includes scikit-learn)

Memory Optimization:
    These validators now support automatic sampling for large datasets:

    # Memory-efficient usage for large datasets:
    validator = IsolationForestValidator(
        columns=["col1", "col2"],
        sample_size=100000,  # Sample if data exceeds this
        batch_size=50000,    # Process in batches for scoring
    )

    # Or use auto-sampling based on available memory:
    validator = IsolationForestValidator(
        columns=["col1", "col2"],
        auto_sample=True,  # Auto-detect optimal sample size
    )
"""

from typing import Any

import polars as pl
import numpy as np

from truthound.types import Severity
from truthound.validators.base import ValidationIssue
from truthound.validators.registry import register_validator
from truthound.validators.anomaly.base import (
    AnomalyValidator,
    MLAnomalyMixin,
)


# Default thresholds for memory-efficient processing
DEFAULT_SAMPLE_SIZE = 100000  # Default max samples for training
DEFAULT_BATCH_SIZE = 50000  # Default batch size for scoring
MEMORY_THRESHOLD_MB = 500  # Auto-sample when data exceeds this


def _check_sklearn_available() -> None:
    """Check if scikit-learn is available."""
    try:
        import sklearn  # noqa: F401
    except ImportError:
        raise ImportError(
            "scikit-learn is required for ML-based anomaly detection. "
            "Install with: pip install truthound[anomaly]"
        )


def _estimate_data_memory_mb(n_rows: int, n_cols: int) -> float:
    """Estimate memory usage for numpy array in MB."""
    # Assuming float64 (8 bytes per element)
    bytes_needed = n_rows * n_cols * 8
    return bytes_needed / (1024 * 1024)


def _compute_optimal_sample_size(
    n_rows: int,
    n_cols: int,
    max_memory_mb: float = MEMORY_THRESHOLD_MB,
) -> int:
    """Compute optimal sample size based on memory constraints.

    Args:
        n_rows: Total number of rows
        n_cols: Number of columns
        max_memory_mb: Maximum memory to use

    Returns:
        Optimal sample size
    """
    # Calculate max rows that fit in memory
    bytes_per_row = n_cols * 8  # float64
    max_rows = int((max_memory_mb * 1024 * 1024) / bytes_per_row)

    # Apply a safety margin and cap
    safe_rows = int(max_rows * 0.8)
    return min(n_rows, max(safe_rows, 1000))  # At least 1000 samples


class LargeDatasetMixin:
    """Mixin providing large dataset handling utilities for ML validators.

    Provides:
    - Automatic sampling for training
    - Mini-batch scoring for prediction
    - Memory-aware data loading
    """

    def _smart_sample_lazyframe(
        self,
        lf: pl.LazyFrame,
        columns: list[str],
        sample_size: int | None = None,
        random_state: int = 42,
    ) -> tuple[np.ndarray, int, bool]:
        """Efficiently sample data from LazyFrame.

        Uses Polars lazy evaluation to avoid loading full dataset.

        Args:
            lf: Input LazyFrame
            columns: Columns to select
            sample_size: Max samples (None = load all)
            random_state: Random seed

        Returns:
            Tuple of (data_array, original_count, was_sampled)
        """
        # First, get count efficiently
        count_result = lf.select(pl.len()).collect()
        total_count = count_result.item()

        if total_count == 0:
            return np.array([]).reshape(0, len(columns)), 0, False

        # Determine if sampling is needed
        effective_sample_size = sample_size
        should_sample = sample_size is not None and total_count > sample_size

        if should_sample:
            # Collect data first, then sample (more reliable approach)
            # For very large data, we use slice-based sampling
            df = lf.select([pl.col(c) for c in columns]).drop_nulls().collect()

            if len(df) > effective_sample_size:
                # Random sampling from collected dataframe
                df = df.sample(n=effective_sample_size, seed=random_state)
        else:
            df = lf.select([pl.col(c) for c in columns]).drop_nulls().collect()

        if len(df) == 0:
            return np.array([]).reshape(0, len(columns)), total_count, should_sample

        data = df.to_numpy()
        return data, total_count, should_sample

    def _batch_predict(
        self,
        model: Any,
        data: np.ndarray,
        batch_size: int = DEFAULT_BATCH_SIZE,
        predict_method: str = "predict",
    ) -> np.ndarray:
        """Predict in batches to reduce memory usage.

        Args:
            model: Fitted sklearn model
            data: Input data array
            batch_size: Size of each batch
            predict_method: Method to call on model ('predict' or 'decision_function')

        Returns:
            Concatenated predictions
        """
        n_samples = len(data)
        if n_samples <= batch_size:
            method = getattr(model, predict_method)
            return method(data)

        predictions = []
        for start_idx in range(0, n_samples, batch_size):
            end_idx = min(start_idx + batch_size, n_samples)
            batch = data[start_idx:end_idx]
            method = getattr(model, predict_method)
            batch_pred = method(batch)
            predictions.append(batch_pred)

        return np.concatenate(predictions)

    def _streaming_score(
        self,
        lf: pl.LazyFrame,
        columns: list[str],
        model: Any,
        medians: np.ndarray,
        iqrs: np.ndarray,
        batch_size: int = DEFAULT_BATCH_SIZE,
    ) -> tuple[np.ndarray, np.ndarray]:
        """Stream data and score in batches for very large datasets.

        This method processes data in chunks without loading the entire
        dataset into memory at once.

        Args:
            lf: LazyFrame with data
            columns: Columns to process
            model: Fitted sklearn model (must have predict() and optionally decision_function())
            medians: Normalization medians
            iqrs: Normalization IQRs
            batch_size: Size of each batch

        Returns:
            Tuple of (predictions_array, scores_array or empty array)
        """
        # Get total count
        total_count = lf.select(pl.len()).collect().item()

        if total_count == 0:
            return np.array([]), np.array([])

        all_predictions = []
        all_scores = []
        has_decision_function = hasattr(model, 'decision_function')

        # Process in streaming batches
        for offset in range(0, total_count, batch_size):
            # Fetch batch using slice
            batch_lf = (
                lf.select([pl.col(c) for c in columns])
                .slice(offset, batch_size)
                .drop_nulls()
            )
            batch_df = batch_lf.collect()

            if len(batch_df) == 0:
                continue

            batch_data = batch_df.to_numpy()

            # Normalize using training stats
            normalized_batch = (batch_data - medians) / np.where(iqrs == 0, 1, iqrs)

            # Predict
            batch_preds = model.predict(normalized_batch)
            all_predictions.append(batch_preds)

            if has_decision_function:
                batch_scores = model.decision_function(normalized_batch)
                all_scores.append(batch_scores)

        if not all_predictions:
            return np.array([]), np.array([])

        predictions = np.concatenate(all_predictions)
        scores = np.concatenate(all_scores) if all_scores else np.array([])

        return predictions, scores


@register_validator
class IsolationForestValidator(AnomalyValidator, MLAnomalyMixin, LargeDatasetMixin):
    """Isolation Forest anomaly detection.

    Isolation Forest isolates anomalies by randomly selecting a feature
    and then randomly selecting a split value. Anomalies are easier to
    isolate, so they have shorter path lengths in the tree.

    This is efficient for high-dimensional data and doesn't assume
    any particular distribution.

    Memory Optimization:
        For large datasets, use sample_size and batch_size parameters:

        # Memory-efficient for 10M+ rows:
        validator = IsolationForestValidator(
            columns=["col1", "col2"],
            sample_size=100000,  # Train on 100k samples
            batch_size=50000,    # Score in 50k batches
            auto_sample=True,    # Or let it auto-detect
        )

    Example:
        # Detect anomalies in multiple columns
        validator = IsolationForestValidator(
            columns=["feature1", "feature2", "feature3"],
            contamination=0.05,  # Expected 5% anomalies
        )

        # Auto-detect contamination
        validator = IsolationForestValidator(
            columns=["col1", "col2"],
            contamination="auto",
        )
    """

    name = "isolation_forest"

    def __init__(
        self,
        columns: list[str] | None = None,
        contamination: float | str = "auto",
        n_estimators: int = 100,
        max_samples: int | float | str = "auto",
        random_state: int | None = 42,
        max_anomaly_ratio: float = 0.1,
        sample_size: int | None = None,
        batch_size: int = DEFAULT_BATCH_SIZE,
        auto_sample: bool = False,
        max_memory_mb: float = MEMORY_THRESHOLD_MB,
        **kwargs: Any,
    ):
        """Initialize Isolation Forest validator.

        Args:
            columns: Columns to use for detection. If None, uses all numeric.
            contamination: Expected proportion of outliers ("auto" or 0.0-0.5)
            n_estimators: Number of trees in the forest
            max_samples: Number of samples for each tree
            random_state: Random seed for reproducibility
            max_anomaly_ratio: Maximum acceptable ratio of anomalies
            sample_size: Max samples for training (None = use all data)
            batch_size: Batch size for scoring large datasets
            auto_sample: If True, automatically determine sample_size
            max_memory_mb: Max memory (MB) for auto_sample mode
            **kwargs: Additional config
        """
        super().__init__(columns=columns, max_anomaly_ratio=max_anomaly_ratio, **kwargs)
        self.contamination = contamination
        self.n_estimators = n_estimators
        self.max_samples = max_samples
        self.random_state = random_state
        self._sample_size = sample_size
        self._batch_size = batch_size
        self._auto_sample = auto_sample
        self._max_memory_mb = max_memory_mb

    def detect_anomalies(
        self, data: np.ndarray, column_names: list[str]
    ) -> tuple[np.ndarray, dict[str, Any]]:
        """Detect anomalies using Isolation Forest."""
        _check_sklearn_available()
        from sklearn.ensemble import IsolationForest

        # Normalize data for better performance
        normalized_data, medians, iqrs = self.normalize_data(data)

        # Create and fit model
        model = IsolationForest(
            contamination=self.contamination,
            n_estimators=self.n_estimators,
            max_samples=self.max_samples,
            random_state=self.random_state,
            n_jobs=-1,  # Use all cores
        )

        # Predict: -1 for anomalies, 1 for normal
        predictions = model.fit_predict(normalized_data)
        anomaly_mask = predictions == -1

        # Get anomaly scores (lower = more anomalous)
        scores = model.decision_function(normalized_data)

        return anomaly_mask, {
            "n_features": data.shape[1],
            "n_samples": data.shape[0],
            "min_score": float(np.min(scores)),
            "max_score": float(np.max(scores)),
            "threshold": float(model.offset_),
        }

    def validate(self, lf: pl.LazyFrame) -> list[ValidationIssue]:
        issues: list[ValidationIssue] = []

        columns = self._get_anomaly_columns(lf)
        if not columns:
            return issues

        # Determine sample size
        sample_size = self._sample_size
        if self._auto_sample and sample_size is None:
            # Get row count first
            total_count = lf.select(pl.len()).collect().item()
            sample_size = _compute_optimal_sample_size(
                total_count, len(columns), self._max_memory_mb
            )
            self.logger.debug(f"Auto-sample: using {sample_size} samples from {total_count}")

        # Smart sampling from LazyFrame
        data, original_count, was_sampled = self._smart_sample_lazyframe(
            lf, columns, sample_size, self.random_state or 42
        )

        if len(data) < 10:
            return issues

        # Detect anomalies on (possibly sampled) data
        anomaly_mask, info = self.detect_anomalies(data, columns)

        # If we sampled, we need to report based on sample
        # For large datasets, we train on sample but can optionally score all data
        if was_sampled and len(data) < original_count:
            # For very large datasets, we estimate anomaly ratio from sample
            sample_anomaly_count = int(anomaly_mask.sum())
            sample_anomaly_ratio = sample_anomaly_count / len(data)
            # Extrapolate to full dataset
            estimated_total_anomalies = int(sample_anomaly_ratio * original_count)
            anomaly_count = estimated_total_anomalies
            anomaly_ratio = sample_anomaly_ratio
            info["sampled"] = True
            info["sample_size"] = len(data)
            info["original_count"] = original_count
        else:
            anomaly_count = int(anomaly_mask.sum())
            anomaly_ratio = anomaly_count / len(data)
            info["sampled"] = False

        if anomaly_ratio > self.max_anomaly_ratio:
            severity = self._calculate_severity(anomaly_ratio)

            sample_note = ""
            if info.get("sampled"):
                sample_note = f" (estimated from {info['sample_size']:,} samples)"
|
|
408
|
+
|
|
409
|
+
issues.append(
|
|
410
|
+
ValidationIssue(
|
|
411
|
+
column=", ".join(columns),
|
|
412
|
+
issue_type="isolation_forest_anomaly",
|
|
413
|
+
count=anomaly_count,
|
|
414
|
+
severity=severity,
|
|
415
|
+
details=(
|
|
416
|
+
f"Isolation Forest detected {anomaly_count:,} anomalies "
|
|
417
|
+
f"({anomaly_ratio:.2%}) across {info['n_features']} features{sample_note}. "
|
|
418
|
+
f"Score range: [{info['min_score']:.4f}, {info['max_score']:.4f}]"
|
|
419
|
+
),
|
|
420
|
+
expected=f"Anomaly ratio <= {self.max_anomaly_ratio:.2%}",
|
|
421
|
+
)
|
|
422
|
+
)
|
|
423
|
+
|
|
424
|
+
return issues
|
|
425
|
+
|
|
426
|
+
|
|
427
|
+
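
# Usage sketch for IsolationForestValidator: builds a small synthetic LazyFrame
# with an injected cluster of outliers and runs validate() on it. The helper
# name, synthetic columns, and thresholds are illustrative assumptions, not
# part of the packaged API; `np` and `pl` are the module-level imports used above.
def _example_isolation_forest_usage() -> None:
    rng = np.random.default_rng(0)
    lf = pl.DataFrame(
        {
            # ~5% of rows come from a shifted distribution (the "anomalies")
            "feature1": np.concatenate([rng.normal(0, 1, 1_000), rng.normal(8, 1, 50)]),
            "feature2": np.concatenate([rng.normal(0, 1, 1_000), rng.normal(-8, 1, 50)]),
        }
    ).lazy()

    validator = IsolationForestValidator(
        columns=["feature1", "feature2"],
        contamination=0.05,      # expect roughly 5% outliers
        max_anomaly_ratio=0.02,  # report an issue if more than 2% are flagged
    )
    for issue in validator.validate(lf):
        print(issue)
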
@register_validator
class LOFValidator(AnomalyValidator, MLAnomalyMixin, LargeDatasetMixin):
    """Local Outlier Factor (LOF) anomaly detection.

    LOF measures the local density deviation of a point with respect to
    its neighbors. Points with substantially lower density than their
    neighbors are considered outliers.

    Best for detecting local anomalies in clustered data.

    Memory Optimization:
        LOF is memory-intensive due to distance computations.
        For large datasets, use sampling:

        validator = LOFValidator(
            columns=["x", "y"],
            n_neighbors=20,
            sample_size=50000,  # Sample for large datasets
        )

    Example:
        validator = LOFValidator(
            columns=["x", "y"],
            n_neighbors=20,
            contamination=0.05,
        )
    """

    name = "lof"

    def __init__(
        self,
        columns: list[str] | None = None,
        n_neighbors: int = 20,
        contamination: float | str = "auto",
        metric: str = "minkowski",
        max_anomaly_ratio: float = 0.1,
        sample_size: int | None = None,
        auto_sample: bool = False,
        max_memory_mb: float = MEMORY_THRESHOLD_MB,
        **kwargs: Any,
    ):
        """Initialize LOF validator.

        Args:
            columns: Columns to use for detection. If None, uses all numeric.
            n_neighbors: Number of neighbors for LOF calculation
            contamination: Expected proportion of outliers
            metric: Distance metric to use
            max_anomaly_ratio: Maximum acceptable ratio of anomalies
            sample_size: Max samples for training (None = use all data)
            auto_sample: If True, automatically determine sample_size
            max_memory_mb: Max memory (MB) for auto_sample mode
            **kwargs: Additional config
        """
        super().__init__(columns=columns, max_anomaly_ratio=max_anomaly_ratio, **kwargs)
        self.n_neighbors = n_neighbors
        self.contamination = contamination
        self.metric = metric
        self._sample_size = sample_size
        self._auto_sample = auto_sample
        self._max_memory_mb = max_memory_mb

    def detect_anomalies(
        self, data: np.ndarray, column_names: list[str]
    ) -> tuple[np.ndarray, dict[str, Any]]:
        """Detect anomalies using LOF."""
        _check_sklearn_available()
        from sklearn.neighbors import LocalOutlierFactor

        # Normalize data
        normalized_data, _, _ = self.normalize_data(data)

        # Adjust n_neighbors if needed
        n_neighbors = min(self.n_neighbors, len(data) - 1)

        model = LocalOutlierFactor(
            n_neighbors=n_neighbors,
            contamination=self.contamination,
            metric=self.metric,
            n_jobs=-1,
        )

        # Predict: -1 for anomalies, 1 for normal
        predictions = model.fit_predict(normalized_data)
        anomaly_mask = predictions == -1

        # Get LOF scores (higher = more anomalous)
        lof_scores = -model.negative_outlier_factor_

        return anomaly_mask, {
            "n_neighbors": n_neighbors,
            "min_lof": float(np.min(lof_scores)),
            "max_lof": float(np.max(lof_scores)),
            "mean_lof": float(np.mean(lof_scores)),
        }

    def validate(self, lf: pl.LazyFrame) -> list[ValidationIssue]:
        issues: list[ValidationIssue] = []

        columns = self._get_anomaly_columns(lf)
        if not columns:
            return issues

        # Determine sample size (LOF is O(n^2) memory, so sampling is critical)
        sample_size = self._sample_size
        if self._auto_sample and sample_size is None:
            total_count = lf.select(pl.len()).collect().item()
            # LOF needs distance matrix, so use more aggressive sampling
            # Memory: O(n^2) for distance matrix
            sample_size = _compute_optimal_sample_size(
                total_count, len(columns), self._max_memory_mb / 2  # More conservative
            )
            # Cap at reasonable limit for LOF
            sample_size = min(sample_size, 50000)
            self.logger.debug(f"Auto-sample (LOF): using {sample_size} samples from {total_count}")

        # Smart sampling from LazyFrame
        data, original_count, was_sampled = self._smart_sample_lazyframe(
            lf, columns, sample_size, 42
        )

        if len(data) < self.n_neighbors + 1:
            return issues

        anomaly_mask, info = self.detect_anomalies(data, columns)

        # Handle sampled results
        if was_sampled and len(data) < original_count:
            sample_anomaly_count = int(anomaly_mask.sum())
            sample_anomaly_ratio = sample_anomaly_count / len(data)
            estimated_total_anomalies = int(sample_anomaly_ratio * original_count)
            anomaly_count = estimated_total_anomalies
            anomaly_ratio = sample_anomaly_ratio
            info["sampled"] = True
            info["sample_size"] = len(data)
            info["original_count"] = original_count
        else:
            anomaly_count = int(anomaly_mask.sum())
            anomaly_ratio = anomaly_count / len(data)
            info["sampled"] = False

        if anomaly_ratio > self.max_anomaly_ratio:
            severity = self._calculate_severity(anomaly_ratio)

            sample_note = ""
            if info.get("sampled"):
                sample_note = f" (estimated from {info['sample_size']:,} samples)"

            issues.append(
                ValidationIssue(
                    column=", ".join(columns),
                    issue_type="lof_anomaly",
                    count=anomaly_count,
                    severity=severity,
                    details=(
                        f"LOF (k={info['n_neighbors']}) detected {anomaly_count:,} anomalies "
                        f"({anomaly_ratio:.2%}){sample_note}. LOF scores: mean={info['mean_lof']:.2f}, "
                        f"max={info['max_lof']:.2f}"
                    ),
                    expected=f"Anomaly ratio <= {self.max_anomaly_ratio:.2%}",
                )
            )

        return issues

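
# Usage sketch for LOFValidator: LOF needs at least n_neighbors + 1 rows, so the
# synthetic frame below has a few hundred points plus a handful of far-away ones.
# The column names, sample_size, and data split are illustrative assumptions.
def _example_lof_usage() -> None:
    rng = np.random.default_rng(1)
    lf = pl.DataFrame(
        {
            # one tight cluster plus a small group of distant points
            "x": np.concatenate([rng.normal(0, 0.5, 500), rng.uniform(10, 20, 10)]),
            "y": np.concatenate([rng.normal(0, 0.5, 500), rng.uniform(10, 20, 10)]),
        }
    ).lazy()

    validator = LOFValidator(
        columns=["x", "y"],
        n_neighbors=20,
        contamination=0.05,
        sample_size=50_000,  # no-op for this small frame, but caps memory on large inputs
    )
    for issue in validator.validate(lf):
        print(issue)
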
@register_validator
class OneClassSVMValidator(AnomalyValidator, MLAnomalyMixin, LargeDatasetMixin):
    """One-Class SVM for anomaly detection.

    One-Class SVM learns a decision boundary around normal data.
    Points outside this boundary are classified as anomalies.

    Works well for high-dimensional data but can be slower than
    tree-based methods.

    Memory Optimization:
        SVM training is O(n^2) to O(n^3), so sampling is essential:

        validator = OneClassSVMValidator(
            columns=["feature1", "feature2"],
            nu=0.05,
            sample_size=10000,  # Train on smaller sample
            batch_size=50000,   # Score in batches
        )

    Example:
        validator = OneClassSVMValidator(
            columns=["feature1", "feature2"],
            nu=0.05,  # Upper bound on fraction of anomalies
            kernel="rbf",
        )
    """

    name = "one_class_svm"

    def __init__(
        self,
        columns: list[str] | None = None,
        kernel: str = "rbf",
        nu: float = 0.05,
        gamma: str | float = "scale",
        max_anomaly_ratio: float = 0.1,
        sample_size: int | None = None,
        batch_size: int = DEFAULT_BATCH_SIZE,
        auto_sample: bool = False,
        max_memory_mb: float = MEMORY_THRESHOLD_MB,
        **kwargs: Any,
    ):
        """Initialize One-Class SVM validator.

        Args:
            columns: Columns to use for detection
            kernel: Kernel type ('rbf', 'linear', 'poly', 'sigmoid')
            nu: Upper bound on fraction of training errors and support vectors
            gamma: Kernel coefficient
            max_anomaly_ratio: Maximum acceptable ratio of anomalies
            sample_size: Max samples for training (None = use all data)
            batch_size: Batch size for scoring large datasets
            auto_sample: If True, automatically determine sample_size
            max_memory_mb: Max memory (MB) for auto_sample mode
            **kwargs: Additional config
        """
        super().__init__(columns=columns, max_anomaly_ratio=max_anomaly_ratio, **kwargs)
        self.kernel = kernel
        self.nu = nu
        self.gamma = gamma
        self._sample_size = sample_size
        self._batch_size = batch_size
        self._auto_sample = auto_sample
        self._max_memory_mb = max_memory_mb

    def detect_anomalies(
        self, data: np.ndarray, column_names: list[str]
    ) -> tuple[np.ndarray, dict[str, Any]]:
        """Detect anomalies using One-Class SVM."""
        _check_sklearn_available()
        from sklearn.svm import OneClassSVM

        # Normalize data
        normalized_data, _, _ = self.normalize_data(data)

        model = OneClassSVM(
            kernel=self.kernel,
            nu=self.nu,
            gamma=self.gamma,
        )

        predictions = model.fit_predict(normalized_data)
        anomaly_mask = predictions == -1

        # Get decision function scores
        scores = model.decision_function(normalized_data)

        return anomaly_mask, {
            "kernel": self.kernel,
            "nu": self.nu,
            "n_support": len(model.support_),
            "min_score": float(np.min(scores)),
            "max_score": float(np.max(scores)),
        }

    def validate(self, lf: pl.LazyFrame) -> list[ValidationIssue]:
        issues: list[ValidationIssue] = []

        columns = self._get_anomaly_columns(lf)
        if not columns:
            return issues

        # Determine sample size (SVM is O(n^2)-O(n^3), very memory intensive)
        sample_size = self._sample_size
        if self._auto_sample and sample_size is None:
            total_count = lf.select(pl.len()).collect().item()
            # SVM is very expensive, use aggressive sampling
            sample_size = _compute_optimal_sample_size(
                total_count, len(columns), self._max_memory_mb / 4  # Very conservative
            )
            # Cap at reasonable limit for SVM
            sample_size = min(sample_size, 20000)
            self.logger.debug(f"Auto-sample (SVM): using {sample_size} samples from {total_count}")

        # Smart sampling from LazyFrame
        data, original_count, was_sampled = self._smart_sample_lazyframe(
            lf, columns, sample_size, 42
        )

        if len(data) < 10:
            return issues

        anomaly_mask, info = self.detect_anomalies(data, columns)

        # Handle sampled results
        if was_sampled and len(data) < original_count:
            sample_anomaly_count = int(anomaly_mask.sum())
            sample_anomaly_ratio = sample_anomaly_count / len(data)
            estimated_total_anomalies = int(sample_anomaly_ratio * original_count)
            anomaly_count = estimated_total_anomalies
            anomaly_ratio = sample_anomaly_ratio
            info["sampled"] = True
            info["sample_size"] = len(data)
            info["original_count"] = original_count
        else:
            anomaly_count = int(anomaly_mask.sum())
            anomaly_ratio = anomaly_count / len(data)
            info["sampled"] = False

        if anomaly_ratio > self.max_anomaly_ratio:
            severity = self._calculate_severity(anomaly_ratio)

            sample_note = ""
            if info.get("sampled"):
                sample_note = f" (estimated from {info['sample_size']:,} samples)"

            issues.append(
                ValidationIssue(
                    column=", ".join(columns),
                    issue_type="svm_anomaly",
                    count=anomaly_count,
                    severity=severity,
                    details=(
                        f"One-Class SVM ({info['kernel']}, nu={info['nu']}) detected "
                        f"{anomaly_count:,} anomalies ({anomaly_ratio:.2%}){sample_note}. "
                        f"Support vectors: {info['n_support']}"
                    ),
                    expected=f"Anomaly ratio <= {self.max_anomaly_ratio:.2%}",
                )
            )

        return issues

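
# Usage sketch for OneClassSVMValidator: because SVM training scales poorly with
# row count, the sketch mirrors the docstring and caps training data via
# sample_size. The columns, nu value, and data sizes are illustrative assumptions.
def _example_one_class_svm_usage() -> None:
    rng = np.random.default_rng(2)
    lf = pl.DataFrame(
        {
            "feature1": rng.normal(0, 1, 5_000),
            "feature2": rng.normal(0, 1, 5_000),
        }
    ).lazy()

    validator = OneClassSVMValidator(
        columns=["feature1", "feature2"],
        nu=0.05,             # upper bound on the fraction of flagged points
        kernel="rbf",
        sample_size=10_000,  # train on at most 10k rows
    )
    for issue in validator.validate(lf):
        print(issue)
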
@register_validator
class MemoryEfficientLOFValidator(AnomalyValidator, MLAnomalyMixin):
    """Memory-efficient LOF using approximate k-NN.

    This validator uses approximate nearest neighbor algorithms (BallTree, Annoy, HNSW)
    to compute LOF scores without building a full O(n²) distance matrix.

    Memory Complexity:
        - Standard LOF: O(n²) for distance matrix
        - This implementation: O(n) with O(log n) query time

    Use this for datasets > 50,000 rows where standard LOF would run out of memory.

    Example:
        # For large datasets (100k+ rows)
        validator = MemoryEfficientLOFValidator(
            columns=["feature1", "feature2"],
            n_neighbors=20,
            knn_backend="balltree",  # or "annoy", "hnsw" if installed
        )
    """

    name = "memory_efficient_lof"

    def __init__(
        self,
        columns: list[str] | None = None,
        n_neighbors: int = 20,
        contamination: float = 0.1,
        max_anomaly_ratio: float = 0.1,
        knn_backend: str = "auto",
        sample_size: int | None = None,
        **kwargs: Any,
    ):
        """Initialize memory-efficient LOF validator.

        Args:
            columns: Columns to use for detection
            n_neighbors: Number of neighbors for LOF
            contamination: Expected proportion of outliers
            max_anomaly_ratio: Maximum acceptable ratio of anomalies
            knn_backend: k-NN backend ('auto', 'balltree', 'kdtree', 'annoy', 'hnsw')
            sample_size: Optional sample size for very large datasets
            **kwargs: Additional config
        """
        super().__init__(columns=columns, max_anomaly_ratio=max_anomaly_ratio, **kwargs)
        self.n_neighbors = n_neighbors
        self.contamination = contamination
        self.knn_backend = knn_backend
        self._sample_size = sample_size

        # Import the mixin at runtime to avoid circular imports
        from truthound.validators.memory import ApproximateKNNMixin
        self._knn_mixin = ApproximateKNNMixin()

    def detect_anomalies(
        self, data: np.ndarray, column_names: list[str]
    ) -> tuple[np.ndarray, dict[str, Any]]:
        """Detect anomalies using approximate LOF."""
        # Normalize data
        normalized_data, _, _ = self.normalize_data(data)

        n_neighbors = min(self.n_neighbors, len(data) - 1)

        # Build approximate index
        backend = self.knn_backend if self.knn_backend != "auto" else None
        self._knn_mixin.build_approximate_index(
            normalized_data,
            backend=backend,
            metric="euclidean",
        )

        # Compute LOF scores using approximate k-NN
        lof_scores = self._knn_mixin.compute_local_outlier_factor(
            normalized_data, k=n_neighbors
        )

        # Determine threshold based on contamination
        if isinstance(self.contamination, float) and 0 < self.contamination < 0.5:
            threshold = np.percentile(lof_scores, 100 * (1 - self.contamination))
        else:
            # Auto: use 1.5 as threshold (common LOF threshold)
            threshold = 1.5

        anomaly_mask = lof_scores > threshold

        # Clear index to free memory
        self._knn_mixin.clear_index()

        return anomaly_mask, {
            "n_neighbors": n_neighbors,
            "min_lof": float(np.min(lof_scores)),
            "max_lof": float(np.max(lof_scores)),
            "mean_lof": float(np.mean(lof_scores)),
            "threshold": float(threshold),
            "backend": str(self._knn_mixin._knn_backend),
        }

    def validate(self, lf: pl.LazyFrame) -> list[ValidationIssue]:
        issues: list[ValidationIssue] = []

        columns = self._get_anomaly_columns(lf)
        if not columns:
            return issues

        # Sample if needed
        if self._sample_size:
            sample_lf = lf.head(self._sample_size)
            df = sample_lf.select([pl.col(c) for c in columns]).drop_nulls().collect()
        else:
            df = lf.select([pl.col(c) for c in columns]).drop_nulls().collect()

        if len(df) < self.n_neighbors + 1:
            return issues

        data = df.to_numpy()
        anomaly_mask, info = self.detect_anomalies(data, columns)

        anomaly_count = int(anomaly_mask.sum())
        anomaly_ratio = anomaly_count / len(data)

        if anomaly_ratio > self.max_anomaly_ratio:
            severity = self._calculate_severity(anomaly_ratio)

            issues.append(
                ValidationIssue(
                    column=", ".join(columns),
                    issue_type="memory_efficient_lof_anomaly",
                    count=anomaly_count,
                    severity=severity,
                    details=(
                        f"Approximate LOF (k={info['n_neighbors']}, backend={info['backend']}) "
                        f"detected {anomaly_count:,} anomalies ({anomaly_ratio:.2%}). "
                        f"LOF scores: mean={info['mean_lof']:.2f}, max={info['max_lof']:.2f}"
                    ),
                    expected=f"Anomaly ratio <= {self.max_anomaly_ratio:.2%}",
                )
            )

        return issues

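
# Usage sketch for MemoryEfficientLOFValidator: the approximate-k-NN path targets
# inputs where standard LOF would not fit in memory. The backend choice, columns,
# and row counts are illustrative assumptions; which backends are actually
# available depends on the environment, per the docstring above.
def _example_memory_efficient_lof_usage() -> None:
    rng = np.random.default_rng(3)
    lf = pl.DataFrame(
        {
            "feature1": rng.normal(0, 1, 100_000),
            "feature2": rng.normal(0, 1, 100_000),
        }
    ).lazy()

    validator = MemoryEfficientLOFValidator(
        columns=["feature1", "feature2"],
        n_neighbors=20,
        knn_backend="balltree",
        sample_size=100_000,  # optional extra cap for very large inputs
    )
    for issue in validator.validate(lf):
        print(issue)
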
@register_validator
class OnlineSVMValidator(AnomalyValidator, MLAnomalyMixin):
    """Online One-Class SVM using SGD for memory-efficient training.

    This validator uses kernel approximation (Nystroem) and SGD optimization
    to train One-Class SVM incrementally, avoiding the O(n²) kernel matrix.

    Memory Complexity:
        - Standard SVM: O(n²) for kernel matrix
        - This implementation: O(n_components × n_features) constant

    Use this for datasets > 20,000 rows where standard SVM would run out of memory.

    Example:
        # For large datasets
        validator = OnlineSVMValidator(
            columns=["feature1", "feature2"],
            nu=0.05,
            n_components=100,  # Kernel approximation components
        )
    """

    name = "online_svm"

    def __init__(
        self,
        columns: list[str] | None = None,
        nu: float = 0.05,
        n_components: int = 100,
        kernel_approx: str = "nystroem",
        max_anomaly_ratio: float = 0.1,
        n_iterations: int = 3,
        batch_size: int = 1000,
        **kwargs: Any,
    ):
        """Initialize online SVM validator.

        Args:
            columns: Columns to use for detection
            nu: Upper bound on fraction of outliers
            n_components: Number of kernel approximation components
            kernel_approx: Kernel approximation method ('nystroem' or 'rbf_sampler')
            max_anomaly_ratio: Maximum acceptable ratio of anomalies
            n_iterations: Number of passes through data
            batch_size: Mini-batch size for training
            **kwargs: Additional config
        """
        super().__init__(columns=columns, max_anomaly_ratio=max_anomaly_ratio, **kwargs)
        self.nu = nu
        self.n_components = n_components
        self.kernel_approx = kernel_approx
        self.n_iterations = n_iterations
        self.batch_size = batch_size

    def detect_anomalies(
        self, data: np.ndarray, column_names: list[str]
    ) -> tuple[np.ndarray, dict[str, Any]]:
        """Detect anomalies using online SVM."""
        from truthound.validators.memory import SGDOneClassSVM

        # Normalize data
        normalized_data, _, _ = self.normalize_data(data)

        # Create online SVM
        model = SGDOneClassSVM(
            nu=self.nu,
            n_components=min(self.n_components, len(data)),
            kernel_approx=self.kernel_approx,
        )

        # Train incrementally
        n_samples = len(normalized_data)
        for _ in range(self.n_iterations):
            for start in range(0, n_samples, self.batch_size):
                end = min(start + self.batch_size, n_samples)
                model.partial_fit(normalized_data[start:end])

        # Predict
        predictions = model.predict(normalized_data)
        anomaly_mask = predictions == -1

        # Get decision scores
        scores = model.decision_function(normalized_data)

        return anomaly_mask, {
            "nu": self.nu,
            "n_components": self.n_components,
            "min_score": float(np.min(scores)),
            "max_score": float(np.max(scores)),
        }

    def validate(self, lf: pl.LazyFrame) -> list[ValidationIssue]:
        issues: list[ValidationIssue] = []

        columns = self._get_anomaly_columns(lf)
        if not columns:
            return issues

        df = lf.select([pl.col(c) for c in columns]).drop_nulls().collect()

        if len(df) < 10:
            return issues

        data = df.to_numpy()
        anomaly_mask, info = self.detect_anomalies(data, columns)

        anomaly_count = int(anomaly_mask.sum())
        anomaly_ratio = anomaly_count / len(data)

        if anomaly_ratio > self.max_anomaly_ratio:
            severity = self._calculate_severity(anomaly_ratio)

            issues.append(
                ValidationIssue(
                    column=", ".join(columns),
                    issue_type="online_svm_anomaly",
                    count=anomaly_count,
                    severity=severity,
                    details=(
                        f"Online SVM (nu={info['nu']}, components={info['n_components']}) "
                        f"detected {anomaly_count:,} anomalies ({anomaly_ratio:.2%}). "
                        f"Score range: [{info['min_score']:.4f}, {info['max_score']:.4f}]"
                    ),
                    expected=f"Anomaly ratio <= {self.max_anomaly_ratio:.2%}",
                )
            )

        return issues

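
# Usage sketch for OnlineSVMValidator: data is streamed through partial_fit in
# mini-batches, so training memory stays bounded regardless of row count. The
# component count, batch size, and columns below are illustrative assumptions.
def _example_online_svm_usage() -> None:
    rng = np.random.default_rng(4)
    lf = pl.DataFrame(
        {
            "feature1": rng.normal(0, 1, 50_000),
            "feature2": rng.normal(0, 1, 50_000),
        }
    ).lazy()

    validator = OnlineSVMValidator(
        columns=["feature1", "feature2"],
        nu=0.05,
        n_components=100,  # size of the kernel approximation
        batch_size=1_000,  # rows per partial_fit call
    )
    for issue in validator.validate(lf):
        print(issue)
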
@register_validator
class DBSCANAnomalyValidator(AnomalyValidator, MLAnomalyMixin, LargeDatasetMixin):
    """DBSCAN-based anomaly detection.

    DBSCAN (Density-Based Spatial Clustering of Applications with Noise)
    identifies outliers as noise points that don't belong to any cluster.

    Best for discovering clusters of arbitrary shape while identifying
    noise points as anomalies.

    Memory Optimization:
        DBSCAN requires pairwise distance computation. For large datasets:

        validator = DBSCANAnomalyValidator(
            columns=["x", "y"],
            eps=0.5,
            sample_size=50000,  # Sample for large datasets
        )

    Example:
        validator = DBSCANAnomalyValidator(
            columns=["x", "y"],
            eps=0.5,        # Maximum distance between points
            min_samples=5,  # Minimum cluster size
        )
    """

    name = "dbscan_anomaly"

    def __init__(
        self,
        columns: list[str] | None = None,
        eps: float = 0.5,
        min_samples: int = 5,
        metric: str = "euclidean",
        max_anomaly_ratio: float = 0.1,
        sample_size: int | None = None,
        auto_sample: bool = False,
        max_memory_mb: float = MEMORY_THRESHOLD_MB,
        **kwargs: Any,
    ):
        """Initialize DBSCAN anomaly validator.

        Args:
            columns: Columns to use for detection
            eps: Maximum distance between points in a cluster
            min_samples: Minimum number of points for a core point
            metric: Distance metric
            max_anomaly_ratio: Maximum acceptable ratio of anomalies
            sample_size: Max samples for processing (None = use all data)
            auto_sample: If True, automatically determine sample_size
            max_memory_mb: Max memory (MB) for auto_sample mode
            **kwargs: Additional config
        """
        super().__init__(columns=columns, max_anomaly_ratio=max_anomaly_ratio, **kwargs)
        self.eps = eps
        self.min_samples = min_samples
        self.metric = metric
        self._sample_size = sample_size
        self._auto_sample = auto_sample
        self._max_memory_mb = max_memory_mb

    def detect_anomalies(
        self, data: np.ndarray, column_names: list[str]
    ) -> tuple[np.ndarray, dict[str, Any]]:
        """Detect anomalies using DBSCAN."""
        _check_sklearn_available()
        from sklearn.cluster import DBSCAN

        # Normalize data
        normalized_data, _, _ = self.normalize_data(data)

        model = DBSCAN(
            eps=self.eps,
            min_samples=self.min_samples,
            metric=self.metric,
            n_jobs=-1,
        )

        labels = model.fit_predict(normalized_data)

        # -1 label indicates noise (anomaly)
        anomaly_mask = labels == -1

        # Count clusters
        n_clusters = len(set(labels)) - (1 if -1 in labels else 0)

        return anomaly_mask, {
            "n_clusters": n_clusters,
            "eps": self.eps,
            "min_samples": self.min_samples,
        }

    def validate(self, lf: pl.LazyFrame) -> list[ValidationIssue]:
        issues: list[ValidationIssue] = []

        columns = self._get_anomaly_columns(lf)
        if not columns:
            return issues

        # Determine sample size (DBSCAN needs O(n^2) distance computations)
        sample_size = self._sample_size
        if self._auto_sample and sample_size is None:
            total_count = lf.select(pl.len()).collect().item()
            # DBSCAN is memory intensive, use conservative sampling
            sample_size = _compute_optimal_sample_size(
                total_count, len(columns), self._max_memory_mb / 2
            )
            # Cap at reasonable limit
            sample_size = min(sample_size, 50000)
            self.logger.debug(f"Auto-sample (DBSCAN): using {sample_size} samples from {total_count}")

        # Smart sampling from LazyFrame
        data, original_count, was_sampled = self._smart_sample_lazyframe(
            lf, columns, sample_size, 42
        )

        if len(data) < self.min_samples:
            return issues

        anomaly_mask, info = self.detect_anomalies(data, columns)

        # Handle sampled results
        if was_sampled and len(data) < original_count:
            sample_anomaly_count = int(anomaly_mask.sum())
            sample_anomaly_ratio = sample_anomaly_count / len(data)
            estimated_total_anomalies = int(sample_anomaly_ratio * original_count)
            anomaly_count = estimated_total_anomalies
            anomaly_ratio = sample_anomaly_ratio
            info["sampled"] = True
            info["sample_size"] = len(data)
            info["original_count"] = original_count
        else:
            anomaly_count = int(anomaly_mask.sum())
            anomaly_ratio = anomaly_count / len(data)
            info["sampled"] = False

        if anomaly_ratio > self.max_anomaly_ratio:
            severity = self._calculate_severity(anomaly_ratio)

            sample_note = ""
            if info.get("sampled"):
                sample_note = f" (estimated from {info['sample_size']:,} samples)"

            issues.append(
                ValidationIssue(
                    column=", ".join(columns),
                    issue_type="dbscan_anomaly",
                    count=anomaly_count,
                    severity=severity,
                    details=(
                        f"DBSCAN (eps={info['eps']}, min_samples={info['min_samples']}) "
                        f"found {info['n_clusters']} clusters and {anomaly_count:,} noise points "
                        f"({anomaly_ratio:.2%}){sample_note}"
                    ),
                    expected=f"Anomaly ratio <= {self.max_anomaly_ratio:.2%}",
                )
            )

        return issues
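

# Usage sketch for DBSCANAnomalyValidator: note that eps is applied after
# normalize_data runs, so it is roughly in scaled units rather than raw column
# units. The eps/min_samples values, columns, and data below are illustrative
# assumptions.
def _example_dbscan_usage() -> None:
    rng = np.random.default_rng(5)
    lf = pl.DataFrame(
        {
            # a dense cluster plus a sprinkling of scattered points (likely noise)
            "x": np.concatenate([rng.normal(0, 0.3, 1_000), rng.uniform(-10, 10, 30)]),
            "y": np.concatenate([rng.normal(0, 0.3, 1_000), rng.uniform(-10, 10, 30)]),
        }
    ).lazy()

    validator = DBSCANAnomalyValidator(
        columns=["x", "y"],
        eps=0.5,
        min_samples=5,
        max_anomaly_ratio=0.02,
    )
    for issue in validator.validate(lf):
        print(issue)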