truthound 1.0.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- truthound/__init__.py +162 -0
- truthound/adapters.py +100 -0
- truthound/api.py +365 -0
- truthound/audit/__init__.py +248 -0
- truthound/audit/core.py +967 -0
- truthound/audit/filters.py +620 -0
- truthound/audit/formatters.py +707 -0
- truthound/audit/logger.py +902 -0
- truthound/audit/middleware.py +571 -0
- truthound/audit/storage.py +1083 -0
- truthound/benchmark/__init__.py +123 -0
- truthound/benchmark/base.py +757 -0
- truthound/benchmark/comparison.py +635 -0
- truthound/benchmark/generators.py +706 -0
- truthound/benchmark/reporters.py +718 -0
- truthound/benchmark/runner.py +635 -0
- truthound/benchmark/scenarios.py +712 -0
- truthound/cache.py +252 -0
- truthound/checkpoint/__init__.py +136 -0
- truthound/checkpoint/actions/__init__.py +164 -0
- truthound/checkpoint/actions/base.py +324 -0
- truthound/checkpoint/actions/custom.py +234 -0
- truthound/checkpoint/actions/discord_notify.py +290 -0
- truthound/checkpoint/actions/email_notify.py +405 -0
- truthound/checkpoint/actions/github_action.py +406 -0
- truthound/checkpoint/actions/opsgenie.py +1499 -0
- truthound/checkpoint/actions/pagerduty.py +226 -0
- truthound/checkpoint/actions/slack_notify.py +233 -0
- truthound/checkpoint/actions/store_result.py +249 -0
- truthound/checkpoint/actions/teams_notify.py +1570 -0
- truthound/checkpoint/actions/telegram_notify.py +419 -0
- truthound/checkpoint/actions/update_docs.py +552 -0
- truthound/checkpoint/actions/webhook.py +293 -0
- truthound/checkpoint/analytics/__init__.py +147 -0
- truthound/checkpoint/analytics/aggregations/__init__.py +23 -0
- truthound/checkpoint/analytics/aggregations/rollup.py +481 -0
- truthound/checkpoint/analytics/aggregations/time_bucket.py +306 -0
- truthound/checkpoint/analytics/analyzers/__init__.py +17 -0
- truthound/checkpoint/analytics/analyzers/anomaly.py +386 -0
- truthound/checkpoint/analytics/analyzers/base.py +270 -0
- truthound/checkpoint/analytics/analyzers/forecast.py +421 -0
- truthound/checkpoint/analytics/analyzers/trend.py +314 -0
- truthound/checkpoint/analytics/models.py +292 -0
- truthound/checkpoint/analytics/protocols.py +549 -0
- truthound/checkpoint/analytics/service.py +718 -0
- truthound/checkpoint/analytics/stores/__init__.py +16 -0
- truthound/checkpoint/analytics/stores/base.py +306 -0
- truthound/checkpoint/analytics/stores/memory_store.py +353 -0
- truthound/checkpoint/analytics/stores/sqlite_store.py +557 -0
- truthound/checkpoint/analytics/stores/timescale_store.py +501 -0
- truthound/checkpoint/async_actions.py +794 -0
- truthound/checkpoint/async_base.py +708 -0
- truthound/checkpoint/async_checkpoint.py +617 -0
- truthound/checkpoint/async_runner.py +639 -0
- truthound/checkpoint/checkpoint.py +527 -0
- truthound/checkpoint/ci/__init__.py +61 -0
- truthound/checkpoint/ci/detector.py +355 -0
- truthound/checkpoint/ci/reporter.py +436 -0
- truthound/checkpoint/ci/templates.py +454 -0
- truthound/checkpoint/circuitbreaker/__init__.py +133 -0
- truthound/checkpoint/circuitbreaker/breaker.py +542 -0
- truthound/checkpoint/circuitbreaker/core.py +252 -0
- truthound/checkpoint/circuitbreaker/detection.py +459 -0
- truthound/checkpoint/circuitbreaker/middleware.py +389 -0
- truthound/checkpoint/circuitbreaker/registry.py +357 -0
- truthound/checkpoint/distributed/__init__.py +139 -0
- truthound/checkpoint/distributed/backends/__init__.py +35 -0
- truthound/checkpoint/distributed/backends/celery_backend.py +503 -0
- truthound/checkpoint/distributed/backends/kubernetes_backend.py +696 -0
- truthound/checkpoint/distributed/backends/local_backend.py +397 -0
- truthound/checkpoint/distributed/backends/ray_backend.py +625 -0
- truthound/checkpoint/distributed/base.py +774 -0
- truthound/checkpoint/distributed/orchestrator.py +765 -0
- truthound/checkpoint/distributed/protocols.py +842 -0
- truthound/checkpoint/distributed/registry.py +449 -0
- truthound/checkpoint/idempotency/__init__.py +120 -0
- truthound/checkpoint/idempotency/core.py +295 -0
- truthound/checkpoint/idempotency/fingerprint.py +454 -0
- truthound/checkpoint/idempotency/locking.py +604 -0
- truthound/checkpoint/idempotency/service.py +592 -0
- truthound/checkpoint/idempotency/stores.py +653 -0
- truthound/checkpoint/monitoring/__init__.py +134 -0
- truthound/checkpoint/monitoring/aggregators/__init__.py +15 -0
- truthound/checkpoint/monitoring/aggregators/base.py +372 -0
- truthound/checkpoint/monitoring/aggregators/realtime.py +300 -0
- truthound/checkpoint/monitoring/aggregators/window.py +493 -0
- truthound/checkpoint/monitoring/collectors/__init__.py +17 -0
- truthound/checkpoint/monitoring/collectors/base.py +257 -0
- truthound/checkpoint/monitoring/collectors/memory_collector.py +617 -0
- truthound/checkpoint/monitoring/collectors/prometheus_collector.py +451 -0
- truthound/checkpoint/monitoring/collectors/redis_collector.py +518 -0
- truthound/checkpoint/monitoring/events.py +410 -0
- truthound/checkpoint/monitoring/protocols.py +636 -0
- truthound/checkpoint/monitoring/service.py +578 -0
- truthound/checkpoint/monitoring/views/__init__.py +17 -0
- truthound/checkpoint/monitoring/views/base.py +172 -0
- truthound/checkpoint/monitoring/views/queue_view.py +220 -0
- truthound/checkpoint/monitoring/views/task_view.py +240 -0
- truthound/checkpoint/monitoring/views/worker_view.py +263 -0
- truthound/checkpoint/registry.py +337 -0
- truthound/checkpoint/runner.py +356 -0
- truthound/checkpoint/transaction/__init__.py +133 -0
- truthound/checkpoint/transaction/base.py +389 -0
- truthound/checkpoint/transaction/compensatable.py +537 -0
- truthound/checkpoint/transaction/coordinator.py +576 -0
- truthound/checkpoint/transaction/executor.py +622 -0
- truthound/checkpoint/transaction/idempotency.py +534 -0
- truthound/checkpoint/transaction/saga/__init__.py +143 -0
- truthound/checkpoint/transaction/saga/builder.py +584 -0
- truthound/checkpoint/transaction/saga/definition.py +515 -0
- truthound/checkpoint/transaction/saga/event_store.py +542 -0
- truthound/checkpoint/transaction/saga/patterns.py +833 -0
- truthound/checkpoint/transaction/saga/runner.py +718 -0
- truthound/checkpoint/transaction/saga/state_machine.py +793 -0
- truthound/checkpoint/transaction/saga/strategies.py +780 -0
- truthound/checkpoint/transaction/saga/testing.py +886 -0
- truthound/checkpoint/triggers/__init__.py +58 -0
- truthound/checkpoint/triggers/base.py +237 -0
- truthound/checkpoint/triggers/event.py +385 -0
- truthound/checkpoint/triggers/schedule.py +355 -0
- truthound/cli.py +2358 -0
- truthound/cli_modules/__init__.py +124 -0
- truthound/cli_modules/advanced/__init__.py +45 -0
- truthound/cli_modules/advanced/benchmark.py +343 -0
- truthound/cli_modules/advanced/docs.py +225 -0
- truthound/cli_modules/advanced/lineage.py +209 -0
- truthound/cli_modules/advanced/ml.py +320 -0
- truthound/cli_modules/advanced/realtime.py +196 -0
- truthound/cli_modules/checkpoint/__init__.py +46 -0
- truthound/cli_modules/checkpoint/init.py +114 -0
- truthound/cli_modules/checkpoint/list.py +71 -0
- truthound/cli_modules/checkpoint/run.py +159 -0
- truthound/cli_modules/checkpoint/validate.py +67 -0
- truthound/cli_modules/common/__init__.py +71 -0
- truthound/cli_modules/common/errors.py +414 -0
- truthound/cli_modules/common/options.py +419 -0
- truthound/cli_modules/common/output.py +507 -0
- truthound/cli_modules/common/protocol.py +552 -0
- truthound/cli_modules/core/__init__.py +48 -0
- truthound/cli_modules/core/check.py +123 -0
- truthound/cli_modules/core/compare.py +104 -0
- truthound/cli_modules/core/learn.py +57 -0
- truthound/cli_modules/core/mask.py +77 -0
- truthound/cli_modules/core/profile.py +65 -0
- truthound/cli_modules/core/scan.py +61 -0
- truthound/cli_modules/profiler/__init__.py +51 -0
- truthound/cli_modules/profiler/auto_profile.py +175 -0
- truthound/cli_modules/profiler/metadata.py +107 -0
- truthound/cli_modules/profiler/suite.py +283 -0
- truthound/cli_modules/registry.py +431 -0
- truthound/cli_modules/scaffolding/__init__.py +89 -0
- truthound/cli_modules/scaffolding/base.py +631 -0
- truthound/cli_modules/scaffolding/commands.py +545 -0
- truthound/cli_modules/scaffolding/plugins.py +1072 -0
- truthound/cli_modules/scaffolding/reporters.py +594 -0
- truthound/cli_modules/scaffolding/validators.py +1127 -0
- truthound/common/__init__.py +18 -0
- truthound/common/resilience/__init__.py +130 -0
- truthound/common/resilience/bulkhead.py +266 -0
- truthound/common/resilience/circuit_breaker.py +516 -0
- truthound/common/resilience/composite.py +332 -0
- truthound/common/resilience/config.py +292 -0
- truthound/common/resilience/protocols.py +217 -0
- truthound/common/resilience/rate_limiter.py +404 -0
- truthound/common/resilience/retry.py +341 -0
- truthound/datadocs/__init__.py +260 -0
- truthound/datadocs/base.py +571 -0
- truthound/datadocs/builder.py +761 -0
- truthound/datadocs/charts.py +764 -0
- truthound/datadocs/dashboard/__init__.py +63 -0
- truthound/datadocs/dashboard/app.py +576 -0
- truthound/datadocs/dashboard/components.py +584 -0
- truthound/datadocs/dashboard/state.py +240 -0
- truthound/datadocs/engine/__init__.py +46 -0
- truthound/datadocs/engine/context.py +376 -0
- truthound/datadocs/engine/pipeline.py +618 -0
- truthound/datadocs/engine/registry.py +469 -0
- truthound/datadocs/exporters/__init__.py +49 -0
- truthound/datadocs/exporters/base.py +198 -0
- truthound/datadocs/exporters/html.py +178 -0
- truthound/datadocs/exporters/json_exporter.py +253 -0
- truthound/datadocs/exporters/markdown.py +284 -0
- truthound/datadocs/exporters/pdf.py +392 -0
- truthound/datadocs/i18n/__init__.py +86 -0
- truthound/datadocs/i18n/catalog.py +960 -0
- truthound/datadocs/i18n/formatting.py +505 -0
- truthound/datadocs/i18n/loader.py +256 -0
- truthound/datadocs/i18n/plurals.py +378 -0
- truthound/datadocs/renderers/__init__.py +42 -0
- truthound/datadocs/renderers/base.py +401 -0
- truthound/datadocs/renderers/custom.py +342 -0
- truthound/datadocs/renderers/jinja.py +697 -0
- truthound/datadocs/sections.py +736 -0
- truthound/datadocs/styles.py +931 -0
- truthound/datadocs/themes/__init__.py +101 -0
- truthound/datadocs/themes/base.py +336 -0
- truthound/datadocs/themes/default.py +417 -0
- truthound/datadocs/themes/enterprise.py +419 -0
- truthound/datadocs/themes/loader.py +336 -0
- truthound/datadocs/themes.py +301 -0
- truthound/datadocs/transformers/__init__.py +57 -0
- truthound/datadocs/transformers/base.py +268 -0
- truthound/datadocs/transformers/enrichers.py +544 -0
- truthound/datadocs/transformers/filters.py +447 -0
- truthound/datadocs/transformers/i18n.py +468 -0
- truthound/datadocs/versioning/__init__.py +62 -0
- truthound/datadocs/versioning/diff.py +639 -0
- truthound/datadocs/versioning/storage.py +497 -0
- truthound/datadocs/versioning/version.py +358 -0
- truthound/datasources/__init__.py +223 -0
- truthound/datasources/_async_protocols.py +222 -0
- truthound/datasources/_protocols.py +159 -0
- truthound/datasources/adapters.py +428 -0
- truthound/datasources/async_base.py +599 -0
- truthound/datasources/async_factory.py +511 -0
- truthound/datasources/base.py +516 -0
- truthound/datasources/factory.py +433 -0
- truthound/datasources/nosql/__init__.py +47 -0
- truthound/datasources/nosql/base.py +487 -0
- truthound/datasources/nosql/elasticsearch.py +801 -0
- truthound/datasources/nosql/mongodb.py +636 -0
- truthound/datasources/pandas_optimized.py +582 -0
- truthound/datasources/pandas_source.py +216 -0
- truthound/datasources/polars_source.py +395 -0
- truthound/datasources/spark_source.py +479 -0
- truthound/datasources/sql/__init__.py +154 -0
- truthound/datasources/sql/base.py +710 -0
- truthound/datasources/sql/bigquery.py +410 -0
- truthound/datasources/sql/cloud_base.py +199 -0
- truthound/datasources/sql/databricks.py +471 -0
- truthound/datasources/sql/mysql.py +316 -0
- truthound/datasources/sql/oracle.py +427 -0
- truthound/datasources/sql/postgresql.py +321 -0
- truthound/datasources/sql/redshift.py +479 -0
- truthound/datasources/sql/snowflake.py +439 -0
- truthound/datasources/sql/sqlite.py +286 -0
- truthound/datasources/sql/sqlserver.py +437 -0
- truthound/datasources/streaming/__init__.py +47 -0
- truthound/datasources/streaming/base.py +350 -0
- truthound/datasources/streaming/kafka.py +670 -0
- truthound/decorators.py +98 -0
- truthound/docs/__init__.py +69 -0
- truthound/docs/extractor.py +971 -0
- truthound/docs/generator.py +601 -0
- truthound/docs/parser.py +1037 -0
- truthound/docs/renderer.py +999 -0
- truthound/drift/__init__.py +22 -0
- truthound/drift/compare.py +189 -0
- truthound/drift/detectors.py +464 -0
- truthound/drift/report.py +160 -0
- truthound/execution/__init__.py +65 -0
- truthound/execution/_protocols.py +324 -0
- truthound/execution/base.py +576 -0
- truthound/execution/distributed/__init__.py +179 -0
- truthound/execution/distributed/aggregations.py +731 -0
- truthound/execution/distributed/arrow_bridge.py +817 -0
- truthound/execution/distributed/base.py +550 -0
- truthound/execution/distributed/dask_engine.py +976 -0
- truthound/execution/distributed/mixins.py +766 -0
- truthound/execution/distributed/protocols.py +756 -0
- truthound/execution/distributed/ray_engine.py +1127 -0
- truthound/execution/distributed/registry.py +446 -0
- truthound/execution/distributed/spark_engine.py +1011 -0
- truthound/execution/distributed/validator_adapter.py +682 -0
- truthound/execution/pandas_engine.py +401 -0
- truthound/execution/polars_engine.py +497 -0
- truthound/execution/pushdown/__init__.py +230 -0
- truthound/execution/pushdown/ast.py +1550 -0
- truthound/execution/pushdown/builder.py +1550 -0
- truthound/execution/pushdown/dialects.py +1072 -0
- truthound/execution/pushdown/executor.py +829 -0
- truthound/execution/pushdown/optimizer.py +1041 -0
- truthound/execution/sql_engine.py +518 -0
- truthound/infrastructure/__init__.py +189 -0
- truthound/infrastructure/audit.py +1515 -0
- truthound/infrastructure/config.py +1133 -0
- truthound/infrastructure/encryption.py +1132 -0
- truthound/infrastructure/logging.py +1503 -0
- truthound/infrastructure/metrics.py +1220 -0
- truthound/lineage/__init__.py +89 -0
- truthound/lineage/base.py +746 -0
- truthound/lineage/impact_analysis.py +474 -0
- truthound/lineage/integrations/__init__.py +22 -0
- truthound/lineage/integrations/openlineage.py +548 -0
- truthound/lineage/tracker.py +512 -0
- truthound/lineage/visualization/__init__.py +33 -0
- truthound/lineage/visualization/protocols.py +145 -0
- truthound/lineage/visualization/renderers/__init__.py +20 -0
- truthound/lineage/visualization/renderers/cytoscape.py +329 -0
- truthound/lineage/visualization/renderers/d3.py +331 -0
- truthound/lineage/visualization/renderers/graphviz.py +276 -0
- truthound/lineage/visualization/renderers/mermaid.py +308 -0
- truthound/maskers.py +113 -0
- truthound/ml/__init__.py +124 -0
- truthound/ml/anomaly_models/__init__.py +31 -0
- truthound/ml/anomaly_models/ensemble.py +362 -0
- truthound/ml/anomaly_models/isolation_forest.py +444 -0
- truthound/ml/anomaly_models/statistical.py +392 -0
- truthound/ml/base.py +1178 -0
- truthound/ml/drift_detection/__init__.py +26 -0
- truthound/ml/drift_detection/concept.py +381 -0
- truthound/ml/drift_detection/distribution.py +361 -0
- truthound/ml/drift_detection/feature.py +442 -0
- truthound/ml/drift_detection/multivariate.py +495 -0
- truthound/ml/monitoring/__init__.py +88 -0
- truthound/ml/monitoring/alerting/__init__.py +33 -0
- truthound/ml/monitoring/alerting/handlers.py +427 -0
- truthound/ml/monitoring/alerting/rules.py +508 -0
- truthound/ml/monitoring/collectors/__init__.py +19 -0
- truthound/ml/monitoring/collectors/composite.py +105 -0
- truthound/ml/monitoring/collectors/drift.py +324 -0
- truthound/ml/monitoring/collectors/performance.py +179 -0
- truthound/ml/monitoring/collectors/quality.py +369 -0
- truthound/ml/monitoring/monitor.py +536 -0
- truthound/ml/monitoring/protocols.py +451 -0
- truthound/ml/monitoring/stores/__init__.py +15 -0
- truthound/ml/monitoring/stores/memory.py +201 -0
- truthound/ml/monitoring/stores/prometheus.py +296 -0
- truthound/ml/rule_learning/__init__.py +25 -0
- truthound/ml/rule_learning/constraint_miner.py +443 -0
- truthound/ml/rule_learning/pattern_learner.py +499 -0
- truthound/ml/rule_learning/profile_learner.py +462 -0
- truthound/multitenancy/__init__.py +326 -0
- truthound/multitenancy/core.py +852 -0
- truthound/multitenancy/integration.py +597 -0
- truthound/multitenancy/isolation.py +630 -0
- truthound/multitenancy/manager.py +770 -0
- truthound/multitenancy/middleware.py +765 -0
- truthound/multitenancy/quota.py +537 -0
- truthound/multitenancy/resolvers.py +603 -0
- truthound/multitenancy/storage.py +703 -0
- truthound/observability/__init__.py +307 -0
- truthound/observability/context.py +531 -0
- truthound/observability/instrumentation.py +611 -0
- truthound/observability/logging.py +887 -0
- truthound/observability/metrics.py +1157 -0
- truthound/observability/tracing/__init__.py +178 -0
- truthound/observability/tracing/baggage.py +310 -0
- truthound/observability/tracing/config.py +426 -0
- truthound/observability/tracing/exporter.py +787 -0
- truthound/observability/tracing/integration.py +1018 -0
- truthound/observability/tracing/otel/__init__.py +146 -0
- truthound/observability/tracing/otel/adapter.py +982 -0
- truthound/observability/tracing/otel/bridge.py +1177 -0
- truthound/observability/tracing/otel/compat.py +681 -0
- truthound/observability/tracing/otel/config.py +691 -0
- truthound/observability/tracing/otel/detection.py +327 -0
- truthound/observability/tracing/otel/protocols.py +426 -0
- truthound/observability/tracing/processor.py +561 -0
- truthound/observability/tracing/propagator.py +757 -0
- truthound/observability/tracing/provider.py +569 -0
- truthound/observability/tracing/resource.py +515 -0
- truthound/observability/tracing/sampler.py +487 -0
- truthound/observability/tracing/span.py +676 -0
- truthound/plugins/__init__.py +198 -0
- truthound/plugins/base.py +599 -0
- truthound/plugins/cli.py +680 -0
- truthound/plugins/dependencies/__init__.py +42 -0
- truthound/plugins/dependencies/graph.py +422 -0
- truthound/plugins/dependencies/resolver.py +417 -0
- truthound/plugins/discovery.py +379 -0
- truthound/plugins/docs/__init__.py +46 -0
- truthound/plugins/docs/extractor.py +444 -0
- truthound/plugins/docs/renderer.py +499 -0
- truthound/plugins/enterprise_manager.py +877 -0
- truthound/plugins/examples/__init__.py +19 -0
- truthound/plugins/examples/custom_validators.py +317 -0
- truthound/plugins/examples/slack_notifier.py +312 -0
- truthound/plugins/examples/xml_reporter.py +254 -0
- truthound/plugins/hooks.py +558 -0
- truthound/plugins/lifecycle/__init__.py +43 -0
- truthound/plugins/lifecycle/hot_reload.py +402 -0
- truthound/plugins/lifecycle/manager.py +371 -0
- truthound/plugins/manager.py +736 -0
- truthound/plugins/registry.py +338 -0
- truthound/plugins/security/__init__.py +93 -0
- truthound/plugins/security/exceptions.py +332 -0
- truthound/plugins/security/policies.py +348 -0
- truthound/plugins/security/protocols.py +643 -0
- truthound/plugins/security/sandbox/__init__.py +45 -0
- truthound/plugins/security/sandbox/context.py +158 -0
- truthound/plugins/security/sandbox/engines/__init__.py +19 -0
- truthound/plugins/security/sandbox/engines/container.py +379 -0
- truthound/plugins/security/sandbox/engines/noop.py +144 -0
- truthound/plugins/security/sandbox/engines/process.py +336 -0
- truthound/plugins/security/sandbox/factory.py +211 -0
- truthound/plugins/security/signing/__init__.py +57 -0
- truthound/plugins/security/signing/service.py +330 -0
- truthound/plugins/security/signing/trust_store.py +368 -0
- truthound/plugins/security/signing/verifier.py +459 -0
- truthound/plugins/versioning/__init__.py +41 -0
- truthound/plugins/versioning/constraints.py +297 -0
- truthound/plugins/versioning/resolver.py +329 -0
- truthound/profiler/__init__.py +1729 -0
- truthound/profiler/_lazy.py +452 -0
- truthound/profiler/ab_testing/__init__.py +80 -0
- truthound/profiler/ab_testing/analysis.py +449 -0
- truthound/profiler/ab_testing/base.py +257 -0
- truthound/profiler/ab_testing/experiment.py +395 -0
- truthound/profiler/ab_testing/tracking.py +368 -0
- truthound/profiler/auto_threshold.py +1170 -0
- truthound/profiler/base.py +579 -0
- truthound/profiler/cache_patterns.py +911 -0
- truthound/profiler/caching.py +1303 -0
- truthound/profiler/column_profiler.py +712 -0
- truthound/profiler/comparison.py +1007 -0
- truthound/profiler/custom_patterns.py +1170 -0
- truthound/profiler/dashboard/__init__.py +50 -0
- truthound/profiler/dashboard/app.py +476 -0
- truthound/profiler/dashboard/components.py +457 -0
- truthound/profiler/dashboard/config.py +72 -0
- truthound/profiler/distributed/__init__.py +83 -0
- truthound/profiler/distributed/base.py +281 -0
- truthound/profiler/distributed/dask_backend.py +498 -0
- truthound/profiler/distributed/local_backend.py +293 -0
- truthound/profiler/distributed/profiler.py +304 -0
- truthound/profiler/distributed/ray_backend.py +374 -0
- truthound/profiler/distributed/spark_backend.py +375 -0
- truthound/profiler/distributed.py +1366 -0
- truthound/profiler/enterprise_sampling.py +1065 -0
- truthound/profiler/errors.py +488 -0
- truthound/profiler/evolution/__init__.py +91 -0
- truthound/profiler/evolution/alerts.py +426 -0
- truthound/profiler/evolution/changes.py +206 -0
- truthound/profiler/evolution/compatibility.py +365 -0
- truthound/profiler/evolution/detector.py +372 -0
- truthound/profiler/evolution/protocols.py +121 -0
- truthound/profiler/generators/__init__.py +48 -0
- truthound/profiler/generators/base.py +384 -0
- truthound/profiler/generators/ml_rules.py +375 -0
- truthound/profiler/generators/pattern_rules.py +384 -0
- truthound/profiler/generators/schema_rules.py +267 -0
- truthound/profiler/generators/stats_rules.py +324 -0
- truthound/profiler/generators/suite_generator.py +857 -0
- truthound/profiler/i18n.py +1542 -0
- truthound/profiler/incremental.py +554 -0
- truthound/profiler/incremental_validation.py +1710 -0
- truthound/profiler/integration/__init__.py +73 -0
- truthound/profiler/integration/adapters.py +345 -0
- truthound/profiler/integration/context.py +371 -0
- truthound/profiler/integration/executor.py +527 -0
- truthound/profiler/integration/naming.py +75 -0
- truthound/profiler/integration/protocols.py +243 -0
- truthound/profiler/memory.py +1185 -0
- truthound/profiler/migration/__init__.py +60 -0
- truthound/profiler/migration/base.py +345 -0
- truthound/profiler/migration/manager.py +444 -0
- truthound/profiler/migration/v1_0_to_v1_1.py +484 -0
- truthound/profiler/ml/__init__.py +73 -0
- truthound/profiler/ml/base.py +244 -0
- truthound/profiler/ml/classifier.py +507 -0
- truthound/profiler/ml/feature_extraction.py +604 -0
- truthound/profiler/ml/pretrained.py +448 -0
- truthound/profiler/ml_inference.py +1276 -0
- truthound/profiler/native_patterns.py +815 -0
- truthound/profiler/observability.py +1184 -0
- truthound/profiler/process_timeout.py +1566 -0
- truthound/profiler/progress.py +568 -0
- truthound/profiler/progress_callbacks.py +1734 -0
- truthound/profiler/quality.py +1345 -0
- truthound/profiler/resilience.py +1180 -0
- truthound/profiler/sampled_matcher.py +794 -0
- truthound/profiler/sampling.py +1288 -0
- truthound/profiler/scheduling/__init__.py +82 -0
- truthound/profiler/scheduling/protocols.py +214 -0
- truthound/profiler/scheduling/scheduler.py +474 -0
- truthound/profiler/scheduling/storage.py +457 -0
- truthound/profiler/scheduling/triggers.py +449 -0
- truthound/profiler/schema.py +603 -0
- truthound/profiler/streaming.py +685 -0
- truthound/profiler/streaming_patterns.py +1354 -0
- truthound/profiler/suite_cli.py +625 -0
- truthound/profiler/suite_config.py +789 -0
- truthound/profiler/suite_export.py +1268 -0
- truthound/profiler/table_profiler.py +547 -0
- truthound/profiler/timeout.py +565 -0
- truthound/profiler/validation.py +1532 -0
- truthound/profiler/visualization/__init__.py +118 -0
- truthound/profiler/visualization/base.py +346 -0
- truthound/profiler/visualization/generator.py +1259 -0
- truthound/profiler/visualization/plotly_renderer.py +811 -0
- truthound/profiler/visualization/renderers.py +669 -0
- truthound/profiler/visualization/sections.py +540 -0
- truthound/profiler/visualization.py +2122 -0
- truthound/profiler/yaml_validation.py +1151 -0
- truthound/py.typed +0 -0
- truthound/ratelimit/__init__.py +248 -0
- truthound/ratelimit/algorithms.py +1108 -0
- truthound/ratelimit/core.py +573 -0
- truthound/ratelimit/integration.py +532 -0
- truthound/ratelimit/limiter.py +663 -0
- truthound/ratelimit/middleware.py +700 -0
- truthound/ratelimit/policy.py +792 -0
- truthound/ratelimit/storage.py +763 -0
- truthound/rbac/__init__.py +340 -0
- truthound/rbac/core.py +976 -0
- truthound/rbac/integration.py +760 -0
- truthound/rbac/manager.py +1052 -0
- truthound/rbac/middleware.py +842 -0
- truthound/rbac/policy.py +954 -0
- truthound/rbac/storage.py +878 -0
- truthound/realtime/__init__.py +141 -0
- truthound/realtime/adapters/__init__.py +43 -0
- truthound/realtime/adapters/base.py +533 -0
- truthound/realtime/adapters/kafka.py +487 -0
- truthound/realtime/adapters/kinesis.py +479 -0
- truthound/realtime/adapters/mock.py +243 -0
- truthound/realtime/base.py +553 -0
- truthound/realtime/factory.py +382 -0
- truthound/realtime/incremental.py +660 -0
- truthound/realtime/processing/__init__.py +67 -0
- truthound/realtime/processing/exactly_once.py +575 -0
- truthound/realtime/processing/state.py +547 -0
- truthound/realtime/processing/windows.py +647 -0
- truthound/realtime/protocols.py +569 -0
- truthound/realtime/streaming.py +605 -0
- truthound/realtime/testing/__init__.py +32 -0
- truthound/realtime/testing/containers.py +615 -0
- truthound/realtime/testing/fixtures.py +484 -0
- truthound/report.py +280 -0
- truthound/reporters/__init__.py +46 -0
- truthound/reporters/_protocols.py +30 -0
- truthound/reporters/base.py +324 -0
- truthound/reporters/ci/__init__.py +66 -0
- truthound/reporters/ci/azure.py +436 -0
- truthound/reporters/ci/base.py +509 -0
- truthound/reporters/ci/bitbucket.py +567 -0
- truthound/reporters/ci/circleci.py +547 -0
- truthound/reporters/ci/detection.py +364 -0
- truthound/reporters/ci/factory.py +182 -0
- truthound/reporters/ci/github.py +388 -0
- truthound/reporters/ci/gitlab.py +471 -0
- truthound/reporters/ci/jenkins.py +525 -0
- truthound/reporters/console_reporter.py +299 -0
- truthound/reporters/factory.py +211 -0
- truthound/reporters/html_reporter.py +524 -0
- truthound/reporters/json_reporter.py +256 -0
- truthound/reporters/markdown_reporter.py +280 -0
- truthound/reporters/sdk/__init__.py +174 -0
- truthound/reporters/sdk/builder.py +558 -0
- truthound/reporters/sdk/mixins.py +1150 -0
- truthound/reporters/sdk/schema.py +1493 -0
- truthound/reporters/sdk/templates.py +666 -0
- truthound/reporters/sdk/testing.py +968 -0
- truthound/scanners.py +170 -0
- truthound/scheduling/__init__.py +122 -0
- truthound/scheduling/cron.py +1136 -0
- truthound/scheduling/presets.py +212 -0
- truthound/schema.py +275 -0
- truthound/secrets/__init__.py +173 -0
- truthound/secrets/base.py +618 -0
- truthound/secrets/cloud.py +682 -0
- truthound/secrets/integration.py +507 -0
- truthound/secrets/manager.py +633 -0
- truthound/secrets/oidc/__init__.py +172 -0
- truthound/secrets/oidc/base.py +902 -0
- truthound/secrets/oidc/credential_provider.py +623 -0
- truthound/secrets/oidc/exchangers.py +1001 -0
- truthound/secrets/oidc/github/__init__.py +110 -0
- truthound/secrets/oidc/github/claims.py +718 -0
- truthound/secrets/oidc/github/enhanced_provider.py +693 -0
- truthound/secrets/oidc/github/trust_policy.py +742 -0
- truthound/secrets/oidc/github/verification.py +723 -0
- truthound/secrets/oidc/github/workflow.py +691 -0
- truthound/secrets/oidc/providers.py +825 -0
- truthound/secrets/providers.py +506 -0
- truthound/secrets/resolver.py +495 -0
- truthound/stores/__init__.py +177 -0
- truthound/stores/backends/__init__.py +18 -0
- truthound/stores/backends/_protocols.py +340 -0
- truthound/stores/backends/azure_blob.py +530 -0
- truthound/stores/backends/concurrent_filesystem.py +915 -0
- truthound/stores/backends/connection_pool.py +1365 -0
- truthound/stores/backends/database.py +743 -0
- truthound/stores/backends/filesystem.py +538 -0
- truthound/stores/backends/gcs.py +399 -0
- truthound/stores/backends/memory.py +354 -0
- truthound/stores/backends/s3.py +434 -0
- truthound/stores/backpressure/__init__.py +84 -0
- truthound/stores/backpressure/base.py +375 -0
- truthound/stores/backpressure/circuit_breaker.py +434 -0
- truthound/stores/backpressure/monitor.py +376 -0
- truthound/stores/backpressure/strategies.py +677 -0
- truthound/stores/base.py +551 -0
- truthound/stores/batching/__init__.py +65 -0
- truthound/stores/batching/base.py +305 -0
- truthound/stores/batching/buffer.py +370 -0
- truthound/stores/batching/store.py +248 -0
- truthound/stores/batching/writer.py +521 -0
- truthound/stores/caching/__init__.py +60 -0
- truthound/stores/caching/backends.py +684 -0
- truthound/stores/caching/base.py +356 -0
- truthound/stores/caching/store.py +305 -0
- truthound/stores/compression/__init__.py +193 -0
- truthound/stores/compression/adaptive.py +694 -0
- truthound/stores/compression/base.py +514 -0
- truthound/stores/compression/pipeline.py +868 -0
- truthound/stores/compression/providers.py +672 -0
- truthound/stores/compression/streaming.py +832 -0
- truthound/stores/concurrency/__init__.py +81 -0
- truthound/stores/concurrency/atomic.py +556 -0
- truthound/stores/concurrency/index.py +775 -0
- truthound/stores/concurrency/locks.py +576 -0
- truthound/stores/concurrency/manager.py +482 -0
- truthound/stores/encryption/__init__.py +297 -0
- truthound/stores/encryption/base.py +952 -0
- truthound/stores/encryption/keys.py +1191 -0
- truthound/stores/encryption/pipeline.py +903 -0
- truthound/stores/encryption/providers.py +953 -0
- truthound/stores/encryption/streaming.py +950 -0
- truthound/stores/expectations.py +227 -0
- truthound/stores/factory.py +246 -0
- truthound/stores/migration/__init__.py +75 -0
- truthound/stores/migration/base.py +480 -0
- truthound/stores/migration/manager.py +347 -0
- truthound/stores/migration/registry.py +382 -0
- truthound/stores/migration/store.py +559 -0
- truthound/stores/observability/__init__.py +106 -0
- truthound/stores/observability/audit.py +718 -0
- truthound/stores/observability/config.py +270 -0
- truthound/stores/observability/factory.py +208 -0
- truthound/stores/observability/metrics.py +636 -0
- truthound/stores/observability/protocols.py +410 -0
- truthound/stores/observability/store.py +570 -0
- truthound/stores/observability/tracing.py +784 -0
- truthound/stores/replication/__init__.py +76 -0
- truthound/stores/replication/base.py +260 -0
- truthound/stores/replication/monitor.py +269 -0
- truthound/stores/replication/store.py +439 -0
- truthound/stores/replication/syncer.py +391 -0
- truthound/stores/results.py +359 -0
- truthound/stores/retention/__init__.py +77 -0
- truthound/stores/retention/base.py +378 -0
- truthound/stores/retention/policies.py +621 -0
- truthound/stores/retention/scheduler.py +279 -0
- truthound/stores/retention/store.py +526 -0
- truthound/stores/streaming/__init__.py +138 -0
- truthound/stores/streaming/base.py +801 -0
- truthound/stores/streaming/database.py +984 -0
- truthound/stores/streaming/filesystem.py +719 -0
- truthound/stores/streaming/reader.py +629 -0
- truthound/stores/streaming/s3.py +843 -0
- truthound/stores/streaming/writer.py +790 -0
- truthound/stores/tiering/__init__.py +108 -0
- truthound/stores/tiering/base.py +462 -0
- truthound/stores/tiering/manager.py +249 -0
- truthound/stores/tiering/policies.py +692 -0
- truthound/stores/tiering/store.py +526 -0
- truthound/stores/versioning/__init__.py +56 -0
- truthound/stores/versioning/base.py +376 -0
- truthound/stores/versioning/store.py +660 -0
- truthound/stores/versioning/strategies.py +353 -0
- truthound/types.py +56 -0
- truthound/validators/__init__.py +774 -0
- truthound/validators/aggregate/__init__.py +27 -0
- truthound/validators/aggregate/central.py +116 -0
- truthound/validators/aggregate/extremes.py +116 -0
- truthound/validators/aggregate/spread.py +118 -0
- truthound/validators/aggregate/sum.py +64 -0
- truthound/validators/aggregate/type.py +78 -0
- truthound/validators/anomaly/__init__.py +93 -0
- truthound/validators/anomaly/base.py +431 -0
- truthound/validators/anomaly/ml_based.py +1190 -0
- truthound/validators/anomaly/multivariate.py +647 -0
- truthound/validators/anomaly/statistical.py +599 -0
- truthound/validators/base.py +1089 -0
- truthound/validators/business_rule/__init__.py +46 -0
- truthound/validators/business_rule/base.py +147 -0
- truthound/validators/business_rule/checksum.py +509 -0
- truthound/validators/business_rule/financial.py +526 -0
- truthound/validators/cache.py +733 -0
- truthound/validators/completeness/__init__.py +39 -0
- truthound/validators/completeness/conditional.py +73 -0
- truthound/validators/completeness/default.py +98 -0
- truthound/validators/completeness/empty.py +103 -0
- truthound/validators/completeness/nan.py +337 -0
- truthound/validators/completeness/null.py +152 -0
- truthound/validators/cross_table/__init__.py +17 -0
- truthound/validators/cross_table/aggregate.py +333 -0
- truthound/validators/cross_table/row_count.py +122 -0
- truthound/validators/datetime/__init__.py +29 -0
- truthound/validators/datetime/format.py +78 -0
- truthound/validators/datetime/freshness.py +269 -0
- truthound/validators/datetime/order.py +73 -0
- truthound/validators/datetime/parseable.py +185 -0
- truthound/validators/datetime/range.py +202 -0
- truthound/validators/datetime/timezone.py +69 -0
- truthound/validators/distribution/__init__.py +49 -0
- truthound/validators/distribution/distribution.py +128 -0
- truthound/validators/distribution/monotonic.py +119 -0
- truthound/validators/distribution/outlier.py +178 -0
- truthound/validators/distribution/quantile.py +80 -0
- truthound/validators/distribution/range.py +254 -0
- truthound/validators/distribution/set.py +125 -0
- truthound/validators/distribution/statistical.py +459 -0
- truthound/validators/drift/__init__.py +79 -0
- truthound/validators/drift/base.py +427 -0
- truthound/validators/drift/multi_feature.py +401 -0
- truthound/validators/drift/numeric.py +395 -0
- truthound/validators/drift/psi.py +446 -0
- truthound/validators/drift/statistical.py +510 -0
- truthound/validators/enterprise.py +1658 -0
- truthound/validators/geospatial/__init__.py +80 -0
- truthound/validators/geospatial/base.py +97 -0
- truthound/validators/geospatial/boundary.py +238 -0
- truthound/validators/geospatial/coordinate.py +351 -0
- truthound/validators/geospatial/distance.py +399 -0
- truthound/validators/geospatial/polygon.py +665 -0
- truthound/validators/i18n/__init__.py +308 -0
- truthound/validators/i18n/bidi.py +571 -0
- truthound/validators/i18n/catalogs.py +570 -0
- truthound/validators/i18n/dialects.py +763 -0
- truthound/validators/i18n/extended_catalogs.py +549 -0
- truthound/validators/i18n/formatting.py +1434 -0
- truthound/validators/i18n/loader.py +1020 -0
- truthound/validators/i18n/messages.py +521 -0
- truthound/validators/i18n/plural.py +683 -0
- truthound/validators/i18n/protocols.py +855 -0
- truthound/validators/i18n/tms.py +1162 -0
- truthound/validators/localization/__init__.py +53 -0
- truthound/validators/localization/base.py +122 -0
- truthound/validators/localization/chinese.py +362 -0
- truthound/validators/localization/japanese.py +275 -0
- truthound/validators/localization/korean.py +524 -0
- truthound/validators/memory/__init__.py +94 -0
- truthound/validators/memory/approximate_knn.py +506 -0
- truthound/validators/memory/base.py +547 -0
- truthound/validators/memory/sgd_online.py +719 -0
- truthound/validators/memory/streaming_ecdf.py +753 -0
- truthound/validators/ml_feature/__init__.py +54 -0
- truthound/validators/ml_feature/base.py +249 -0
- truthound/validators/ml_feature/correlation.py +299 -0
- truthound/validators/ml_feature/leakage.py +344 -0
- truthound/validators/ml_feature/null_impact.py +270 -0
- truthound/validators/ml_feature/scale.py +264 -0
- truthound/validators/multi_column/__init__.py +89 -0
- truthound/validators/multi_column/arithmetic.py +284 -0
- truthound/validators/multi_column/base.py +231 -0
- truthound/validators/multi_column/comparison.py +273 -0
- truthound/validators/multi_column/consistency.py +312 -0
- truthound/validators/multi_column/statistical.py +299 -0
- truthound/validators/optimization/__init__.py +164 -0
- truthound/validators/optimization/aggregation.py +563 -0
- truthound/validators/optimization/covariance.py +556 -0
- truthound/validators/optimization/geo.py +626 -0
- truthound/validators/optimization/graph.py +587 -0
- truthound/validators/optimization/orchestrator.py +970 -0
- truthound/validators/optimization/profiling.py +1312 -0
- truthound/validators/privacy/__init__.py +223 -0
- truthound/validators/privacy/base.py +635 -0
- truthound/validators/privacy/ccpa.py +670 -0
- truthound/validators/privacy/gdpr.py +728 -0
- truthound/validators/privacy/global_patterns.py +604 -0
- truthound/validators/privacy/plugins.py +867 -0
- truthound/validators/profiling/__init__.py +52 -0
- truthound/validators/profiling/base.py +175 -0
- truthound/validators/profiling/cardinality.py +312 -0
- truthound/validators/profiling/entropy.py +391 -0
- truthound/validators/profiling/frequency.py +455 -0
- truthound/validators/pushdown_support.py +660 -0
- truthound/validators/query/__init__.py +91 -0
- truthound/validators/query/aggregate.py +346 -0
- truthound/validators/query/base.py +246 -0
- truthound/validators/query/column.py +249 -0
- truthound/validators/query/expression.py +274 -0
- truthound/validators/query/result.py +323 -0
- truthound/validators/query/row_count.py +264 -0
- truthound/validators/referential/__init__.py +80 -0
- truthound/validators/referential/base.py +395 -0
- truthound/validators/referential/cascade.py +391 -0
- truthound/validators/referential/circular.py +563 -0
- truthound/validators/referential/foreign_key.py +624 -0
- truthound/validators/referential/orphan.py +485 -0
- truthound/validators/registry.py +112 -0
- truthound/validators/schema/__init__.py +41 -0
- truthound/validators/schema/column_count.py +142 -0
- truthound/validators/schema/column_exists.py +80 -0
- truthound/validators/schema/column_order.py +82 -0
- truthound/validators/schema/column_pair.py +85 -0
- truthound/validators/schema/column_pair_set.py +195 -0
- truthound/validators/schema/column_type.py +94 -0
- truthound/validators/schema/multi_column.py +53 -0
- truthound/validators/schema/multi_column_aggregate.py +175 -0
- truthound/validators/schema/referential.py +274 -0
- truthound/validators/schema/table_schema.py +91 -0
- truthound/validators/schema_validator.py +219 -0
- truthound/validators/sdk/__init__.py +250 -0
- truthound/validators/sdk/builder.py +680 -0
- truthound/validators/sdk/decorators.py +474 -0
- truthound/validators/sdk/enterprise/__init__.py +211 -0
- truthound/validators/sdk/enterprise/docs.py +725 -0
- truthound/validators/sdk/enterprise/fuzzing.py +659 -0
- truthound/validators/sdk/enterprise/licensing.py +709 -0
- truthound/validators/sdk/enterprise/manager.py +543 -0
- truthound/validators/sdk/enterprise/resources.py +628 -0
- truthound/validators/sdk/enterprise/sandbox.py +766 -0
- truthound/validators/sdk/enterprise/signing.py +603 -0
- truthound/validators/sdk/enterprise/templates.py +865 -0
- truthound/validators/sdk/enterprise/versioning.py +659 -0
- truthound/validators/sdk/templates.py +757 -0
- truthound/validators/sdk/testing.py +807 -0
- truthound/validators/security/__init__.py +181 -0
- truthound/validators/security/redos/__init__.py +182 -0
- truthound/validators/security/redos/core.py +861 -0
- truthound/validators/security/redos/cpu_monitor.py +593 -0
- truthound/validators/security/redos/cve_database.py +791 -0
- truthound/validators/security/redos/ml/__init__.py +155 -0
- truthound/validators/security/redos/ml/base.py +785 -0
- truthound/validators/security/redos/ml/datasets.py +618 -0
- truthound/validators/security/redos/ml/features.py +359 -0
- truthound/validators/security/redos/ml/models.py +1000 -0
- truthound/validators/security/redos/ml/predictor.py +507 -0
- truthound/validators/security/redos/ml/storage.py +632 -0
- truthound/validators/security/redos/ml/training.py +571 -0
- truthound/validators/security/redos/ml_analyzer.py +937 -0
- truthound/validators/security/redos/optimizer.py +674 -0
- truthound/validators/security/redos/profiler.py +682 -0
- truthound/validators/security/redos/re2_engine.py +709 -0
- truthound/validators/security/redos.py +886 -0
- truthound/validators/security/sql_security.py +1247 -0
- truthound/validators/streaming/__init__.py +126 -0
- truthound/validators/streaming/base.py +292 -0
- truthound/validators/streaming/completeness.py +210 -0
- truthound/validators/streaming/mixin.py +575 -0
- truthound/validators/streaming/range.py +308 -0
- truthound/validators/streaming/sources.py +846 -0
- truthound/validators/string/__init__.py +57 -0
- truthound/validators/string/casing.py +158 -0
- truthound/validators/string/charset.py +96 -0
- truthound/validators/string/format.py +501 -0
- truthound/validators/string/json.py +77 -0
- truthound/validators/string/json_schema.py +184 -0
- truthound/validators/string/length.py +104 -0
- truthound/validators/string/like_pattern.py +237 -0
- truthound/validators/string/regex.py +202 -0
- truthound/validators/string/regex_extended.py +435 -0
- truthound/validators/table/__init__.py +88 -0
- truthound/validators/table/base.py +78 -0
- truthound/validators/table/column_count.py +198 -0
- truthound/validators/table/freshness.py +362 -0
- truthound/validators/table/row_count.py +251 -0
- truthound/validators/table/schema.py +333 -0
- truthound/validators/table/size.py +285 -0
- truthound/validators/timeout/__init__.py +102 -0
- truthound/validators/timeout/advanced/__init__.py +247 -0
- truthound/validators/timeout/advanced/circuit_breaker.py +675 -0
- truthound/validators/timeout/advanced/prediction.py +773 -0
- truthound/validators/timeout/advanced/priority.py +618 -0
- truthound/validators/timeout/advanced/redis_backend.py +770 -0
- truthound/validators/timeout/advanced/retry.py +721 -0
- truthound/validators/timeout/advanced/sampling.py +788 -0
- truthound/validators/timeout/advanced/sla.py +661 -0
- truthound/validators/timeout/advanced/telemetry.py +804 -0
- truthound/validators/timeout/cascade.py +477 -0
- truthound/validators/timeout/deadline.py +657 -0
- truthound/validators/timeout/degradation.py +525 -0
- truthound/validators/timeout/distributed.py +597 -0
- truthound/validators/timeseries/__init__.py +89 -0
- truthound/validators/timeseries/base.py +326 -0
- truthound/validators/timeseries/completeness.py +617 -0
- truthound/validators/timeseries/gap.py +485 -0
- truthound/validators/timeseries/monotonic.py +310 -0
- truthound/validators/timeseries/seasonality.py +422 -0
- truthound/validators/timeseries/trend.py +510 -0
- truthound/validators/uniqueness/__init__.py +59 -0
- truthound/validators/uniqueness/approximate.py +475 -0
- truthound/validators/uniqueness/distinct_values.py +253 -0
- truthound/validators/uniqueness/duplicate.py +118 -0
- truthound/validators/uniqueness/primary_key.py +140 -0
- truthound/validators/uniqueness/unique.py +191 -0
- truthound/validators/uniqueness/within_record.py +599 -0
- truthound/validators/utils.py +756 -0
- truthound-1.0.8.dist-info/METADATA +474 -0
- truthound-1.0.8.dist-info/RECORD +877 -0
- truthound-1.0.8.dist-info/WHEEL +4 -0
- truthound-1.0.8.dist-info/entry_points.txt +2 -0
- truthound-1.0.8.dist-info/licenses/LICENSE +190 -0
|
@@ -0,0 +1,719 @@
|
|
|
1
|
+
"""SGD-based online learning for memory-efficient model training.
|
|
2
|
+
|
|
3
|
+
This module provides incremental/online learning implementations for
|
|
4
|
+
algorithms that traditionally require O(n²) or O(n³) memory for training.
|
|
5
|
+
|
|
6
|
+
Key Algorithms:
|
|
7
|
+
- SGDOneClassSVM: Online One-Class SVM using SGD
|
|
8
|
+
- IncrementalPCA: Streaming PCA for dimensionality reduction
|
|
9
|
+
- OnlineIsolationForest: Incremental tree building
|
|
10
|
+
|
|
11
|
+
Memory Complexity:
|
|
12
|
+
- Traditional SVM: O(n²) for kernel matrix
|
|
13
|
+
- SGD SVM: O(1) per sample, O(d) for model weights
|
|
14
|
+
|
|
15
|
+
Usage:
|
|
16
|
+
class MemoryEfficientSVM(AnomalyValidator, SGDOnlineMixin):
|
|
17
|
+
def validate(self, lf):
|
|
18
|
+
# Stream data through online learner
|
|
19
|
+
model = self.create_online_svm()
|
|
20
|
+
for chunk in self.iterate_chunks(lf):
|
|
21
|
+
model.partial_fit(chunk)
|
|
22
|
+
|
|
23
|
+
# Predict on new data
|
|
24
|
+
predictions = model.predict(current_data)
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
from __future__ import annotations
|
|
28
|
+
|
|
29
|
+
from abc import ABC, abstractmethod
|
|
30
|
+
from dataclasses import dataclass, field
|
|
31
|
+
from enum import Enum, auto
|
|
32
|
+
from typing import Any, Iterator, Protocol, TYPE_CHECKING
|
|
33
|
+
import warnings
|
|
34
|
+
|
|
35
|
+
import numpy as np
|
|
36
|
+
|
|
37
|
+
if TYPE_CHECKING:
|
|
38
|
+
import polars as pl
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class OnlineLearnerType(Enum):
    """Enumerates the kinds of online learning algorithm."""

    # One-Class SVM trained with SGD.
    SGD_SVM = auto()
    # Streaming PCA.
    INCREMENTAL_PCA = auto()
    # Mini-batch K-Means.
    MINI_BATCH_KMEANS = auto()
    # Streaming covariance estimation.
    ONLINE_COVARIANCE = auto()
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@dataclass
class OnlineLearnerConfig:
    """Settings bundle for the online learning algorithms.

    Attributes:
        learning_rate: Initial learning rate
        learning_rate_schedule: Schedule type ('constant', 'optimal', 'invscaling')
        n_iterations: Number of passes through data
        batch_size: Mini-batch size for partial_fit
        regularization: L2 regularization strength
        random_state: Random seed
        warm_start: Whether to continue from previous fit
        tol: Tolerance for convergence
        nu: SVM-specific upper bound on the outlier fraction
        kernel_approx: SVM-specific kernel approximation method
        n_components: SVM-specific number of kernel components
    """

    # General optimisation settings.
    learning_rate: float = 1e-3
    learning_rate_schedule: str = "optimal"
    n_iterations: int = 5
    batch_size: int = 1000
    regularization: float = 1e-4
    random_state: int = 42
    warm_start: bool = True
    tol: float = 0.0001

    # Settings that only concern the SVM learner.
    nu: float = 0.1
    kernel_approx: str = "nystroem"
    n_components: int = 100
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
class IncrementalModel(Protocol):
    """Structural interface for models that learn one batch at a time."""

    def partial_fit(self, X: np.ndarray) -> "IncrementalModel":
        """Fit on a single batch of data."""
        ...

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Produce predictions for the given data."""
        ...
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
class OnlineStatistics:
    """Welford's online algorithm for computing running statistics.

    Tracks per-feature mean, variance, standard deviation, minimum and
    maximum in a single pass, keeping only a fixed number of accumulators
    per feature.

    Example:
        stats = OnlineStatistics(n_features=10)
        for batch in data_stream:
            stats.update(batch)
        mean, std = stats.mean, stats.std
    """

    def __init__(self, n_features: int):
        """Initialize online statistics tracker.

        Args:
            n_features: Number of features
        """
        self.n_features = n_features
        self.n_samples = 0
        self._mean = np.zeros(n_features)
        self._M2 = np.zeros(n_features)  # Sum of squared differences
        self._min = np.full(n_features, np.inf)
        self._max = np.full(n_features, -np.inf)

    def _as_batch(self, X: np.ndarray) -> np.ndarray:
        """Return X as a float64 (n_samples, n_features) array.

        A 1-D input is treated as a single sample.

        Raises:
            ValueError: If the feature count does not match ``n_features``.
        """
        # float64 keeps the accumulators from inheriting a narrower dtype
        # (e.g. float32) from whichever batch happens to arrive first.
        batch = np.asarray(X, dtype=np.float64)
        if batch.ndim == 1:
            batch = batch.reshape(1, -1)
        if batch.ndim != 2 or batch.shape[1] != self.n_features:
            raise ValueError(
                f"Expected data with {self.n_features} features, "
                f"got array of shape {batch.shape}"
            )
        return batch

    def update(self, X: np.ndarray) -> None:
        """Update statistics with new batch, one sample at a time.

        Args:
            X: Data batch (n_samples, n_features)

        Raises:
            ValueError: If the feature count does not match ``n_features``.
        """
        batch = self._as_batch(X)
        if batch.shape[0] == 0:
            # Nothing to accumulate; state is left untouched.
            return

        for row in batch:
            self.n_samples += 1
            delta = row - self._mean
            self._mean += delta / self.n_samples
            # Second factor uses the *updated* mean (Welford's recurrence).
            self._M2 += delta * (row - self._mean)

        # Extremes do not depend on sample order, so one reduction per
        # batch gives the same result as a per-row comparison.
        self._min = np.minimum(self._min, batch.min(axis=0))
        self._max = np.maximum(self._max, batch.max(axis=0))

    def update_batch(self, X: np.ndarray) -> None:
        """Batch update using the parallel (pairwise merge) algorithm.

        More efficient than individual updates for large batches.

        Args:
            X: Data batch (n_samples, n_features)

        Raises:
            ValueError: If the feature count does not match ``n_features``.
        """
        batch = self._as_batch(X)
        n_batch = batch.shape[0]
        if n_batch == 0:
            # An empty batch would otherwise write NaN into the mean and
            # then fail in the min/max reduction.
            return

        batch_mean = batch.mean(axis=0)
        batch_m2 = batch.var(axis=0, ddof=0) * n_batch

        if self.n_samples == 0:
            self._mean = batch_mean
            self._M2 = batch_m2
        else:
            n_total = self.n_samples + n_batch
            delta = batch_mean - self._mean

            self._mean = (self.n_samples * self._mean + n_batch * batch_mean) / n_total
            # Within-batch scatter plus the between-group correction term.
            self._M2 += batch_m2 + delta**2 * self.n_samples * n_batch / n_total

        self.n_samples += n_batch
        self._min = np.minimum(self._min, batch.min(axis=0))
        self._max = np.maximum(self._max, batch.max(axis=0))

    @property
    def mean(self) -> np.ndarray:
        """Get a copy of the current mean."""
        return self._mean.copy()

    @property
    def variance(self) -> np.ndarray:
        """Get current sample variance (zeros until two samples are seen)."""
        if self.n_samples < 2:
            return np.zeros(self.n_features)
        return self._M2 / (self.n_samples - 1)

    @property
    def std(self) -> np.ndarray:
        """Get current standard deviation."""
        return np.sqrt(self.variance)

    @property
    def min(self) -> np.ndarray:
        """Get a copy of the minimum values."""
        return self._min.copy()

    @property
    def max(self) -> np.ndarray:
        """Get a copy of the maximum values."""
        return self._max.copy()
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
class OnlineScaler:
    """Standardization scaler whose parameters are learned incrementally.

    Each call to ``partial_fit`` folds a batch into running statistics;
    ``transform`` then centers and/or scales data with the running mean
    and standard deviation.

    Example:
        scaler = OnlineScaler()
        for batch in training_data:
            scaler.partial_fit(batch)
        scaled = scaler.transform(new_data)
    """

    def __init__(self, with_mean: bool = True, with_std: bool = True):
        """Create an unfitted scaler.

        Args:
            with_mean: Whether to center data
            with_std: Whether to scale by std
        """
        self._stats: OnlineStatistics | None = None
        self.with_std = with_std
        self.with_mean = with_mean

    def partial_fit(self, X: np.ndarray) -> "OnlineScaler":
        """Fold a batch into the running statistics.

        Args:
            X: Data batch (n_samples, n_features); a 1-D array is treated
                as a single sample

        Returns:
            self
        """
        batch = X.reshape(1, -1) if X.ndim == 1 else X

        # The tracker is created lazily so the feature count can be taken
        # from the first batch.
        if self._stats is None:
            self._stats = OnlineStatistics(batch.shape[1])

        self._stats.update_batch(batch)
        return self

    def transform(self, X: np.ndarray) -> np.ndarray:
        """Apply the learned centering and scaling.

        Args:
            X: Data to transform

        Returns:
            Transformed data (always a new array)

        Raises:
            RuntimeError: If ``partial_fit`` has not been called yet.
        """
        stats = self._stats
        if stats is None:
            raise RuntimeError("Scaler not fitted. Call partial_fit first.")

        scaled = X.copy()
        if self.with_mean:
            scaled = scaled - stats.mean
        if self.with_std:
            sigma = stats.std
            # Features with zero spread are divided by 1 instead of 0.
            scaled = scaled / np.where(sigma == 0, 1, sigma)
        return scaled

    def fit_transform(self, X: np.ndarray) -> np.ndarray:
        """Update the scaler with X, then return X transformed."""
        return self.partial_fit(X).transform(X)

    @property
    def mean_(self) -> np.ndarray:
        """Learned per-feature mean.

        Raises:
            RuntimeError: If the scaler has not been fitted.
        """
        stats = self._stats
        if stats is None:
            raise RuntimeError("Scaler not fitted")
        return stats.mean

    @property
    def scale_(self) -> np.ndarray:
        """Learned per-feature scale (standard deviation).

        Raises:
            RuntimeError: If the scaler has not been fitted.
        """
        stats = self._stats
        if stats is None:
            raise RuntimeError("Scaler not fitted")
        return stats.std
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
class SGDOneClassSVM:
|
|
275
|
+
"""SGD-based One-Class SVM for online anomaly detection.
|
|
276
|
+
|
|
277
|
+
This implementation uses:
|
|
278
|
+
1. Kernel approximation (Nystroem or RBF Sampler) for scalability
|
|
279
|
+
2. SGD optimization for online learning
|
|
280
|
+
3. Linear SVM in the approximated feature space
|
|
281
|
+
|
|
282
|
+
Memory: O(n_components × n_features) instead of O(n_samples²)
|
|
283
|
+
|
|
284
|
+
Example:
|
|
285
|
+
model = SGDOneClassSVM(nu=0.05, n_components=100)
|
|
286
|
+
for batch in data_stream:
|
|
287
|
+
model.partial_fit(batch)
|
|
288
|
+
predictions = model.predict(test_data) # -1 for outliers
|
|
289
|
+
"""
|
|
290
|
+
|
|
291
|
+
def __init__(
|
|
292
|
+
self,
|
|
293
|
+
nu: float = 0.1,
|
|
294
|
+
kernel_approx: str = "nystroem",
|
|
295
|
+
n_components: int = 100,
|
|
296
|
+
gamma: float | str = "scale",
|
|
297
|
+
learning_rate: str = "optimal",
|
|
298
|
+
eta0: float = 0.01,
|
|
299
|
+
random_state: int = 42,
|
|
300
|
+
max_iter: int = 1000,
|
|
301
|
+
tol: float = 1e-4,
|
|
302
|
+
):
|
|
303
|
+
"""Initialize SGD One-Class SVM.
|
|
304
|
+
|
|
305
|
+
Args:
|
|
306
|
+
nu: Upper bound on fraction of outliers (0 < nu <= 0.5)
|
|
307
|
+
kernel_approx: Kernel approximation ('nystroem' or 'rbf_sampler')
|
|
308
|
+
n_components: Number of kernel components
|
|
309
|
+
gamma: Kernel coefficient ('scale', 'auto', or float)
|
|
310
|
+
learning_rate: Learning rate schedule
|
|
311
|
+
eta0: Initial learning rate
|
|
312
|
+
random_state: Random seed
|
|
313
|
+
max_iter: Maximum iterations for SGD
|
|
314
|
+
tol: Tolerance for convergence
|
|
315
|
+
"""
|
|
316
|
+
self.nu = nu
|
|
317
|
+
self.kernel_approx = kernel_approx
|
|
318
|
+
self.n_components = n_components
|
|
319
|
+
self.gamma = gamma
|
|
320
|
+
self.learning_rate = learning_rate
|
|
321
|
+
self.eta0 = eta0
|
|
322
|
+
self.random_state = random_state
|
|
323
|
+
self.max_iter = max_iter
|
|
324
|
+
self.tol = tol
|
|
325
|
+
|
|
326
|
+
self._kernel_transformer = None
|
|
327
|
+
self._sgd_classifier = None
|
|
328
|
+
self._scaler = None
|
|
329
|
+
self._is_fitted = False
|
|
330
|
+
self._n_features = None
|
|
331
|
+
|
|
332
|
+
def _init_models(self, X: np.ndarray) -> None:
|
|
333
|
+
"""Initialize internal models on first fit."""
|
|
334
|
+
from sklearn.kernel_approximation import Nystroem, RBFSampler
|
|
335
|
+
from sklearn.linear_model import SGDClassifier
|
|
336
|
+
|
|
337
|
+
self._n_features = X.shape[1]
|
|
338
|
+
|
|
339
|
+
# Compute gamma if needed
|
|
340
|
+
gamma = self.gamma
|
|
341
|
+
if gamma == "scale":
|
|
342
|
+
gamma = 1.0 / (self._n_features * X.var())
|
|
343
|
+
elif gamma == "auto":
|
|
344
|
+
gamma = 1.0 / self._n_features
|
|
345
|
+
|
|
346
|
+
# Initialize kernel approximation
|
|
347
|
+
if self.kernel_approx == "nystroem":
|
|
348
|
+
self._kernel_transformer = Nystroem(
|
|
349
|
+
kernel="rbf",
|
|
350
|
+
gamma=gamma,
|
|
351
|
+
n_components=min(self.n_components, len(X)),
|
|
352
|
+
random_state=self.random_state,
|
|
353
|
+
)
|
|
354
|
+
else:
|
|
355
|
+
self._kernel_transformer = RBFSampler(
|
|
356
|
+
gamma=gamma,
|
|
357
|
+
n_components=self.n_components,
|
|
358
|
+
random_state=self.random_state,
|
|
359
|
+
)
|
|
360
|
+
|
|
361
|
+
# Initialize SGD classifier
|
|
362
|
+
# Use hinge loss for SVM-like behavior
|
|
363
|
+
self._sgd_classifier = SGDClassifier(
|
|
364
|
+
loss="hinge",
|
|
365
|
+
penalty="l2",
|
|
366
|
+
alpha=0.0001,
|
|
367
|
+
learning_rate=self.learning_rate,
|
|
368
|
+
eta0=self.eta0,
|
|
369
|
+
random_state=self.random_state,
|
|
370
|
+
max_iter=self.max_iter,
|
|
371
|
+
tol=self.tol,
|
|
372
|
+
warm_start=True,
|
|
373
|
+
)
|
|
374
|
+
|
|
375
|
+
# Initialize online scaler
|
|
376
|
+
self._scaler = OnlineScaler()
|
|
377
|
+
|
|
378
|
+
def partial_fit(self, X: np.ndarray) -> "SGDOneClassSVM":
|
|
379
|
+
"""Incrementally fit the model on a batch.
|
|
380
|
+
|
|
381
|
+
For One-Class SVM, we generate synthetic outliers and train
|
|
382
|
+
a binary classifier to separate normal data from outliers.
|
|
383
|
+
|
|
384
|
+
Args:
|
|
385
|
+
X: Training data (n_samples, n_features)
|
|
386
|
+
|
|
387
|
+
Returns:
|
|
388
|
+
self
|
|
389
|
+
"""
|
|
390
|
+
if X.ndim == 1:
|
|
391
|
+
X = X.reshape(1, -1)
|
|
392
|
+
|
|
393
|
+
# Initialize on first call
|
|
394
|
+
if not self._is_fitted:
|
|
395
|
+
self._init_models(X)
|
|
396
|
+
# Fit kernel transformer on first batch
|
|
397
|
+
self._kernel_transformer.fit(X)
|
|
398
|
+
|
|
399
|
+
# Update scaler
|
|
400
|
+
self._scaler.partial_fit(X)
|
|
401
|
+
X_scaled = self._scaler.transform(X)
|
|
402
|
+
|
|
403
|
+
# Transform to kernel space
|
|
404
|
+
X_kernel = self._kernel_transformer.transform(X_scaled)
|
|
405
|
+
|
|
406
|
+
# Generate synthetic outliers
|
|
407
|
+
n_outliers = max(1, int(len(X) * self.nu / (1 - self.nu)))
|
|
408
|
+
outliers = self._generate_outliers(X_scaled, n_outliers)
|
|
409
|
+
outliers_kernel = self._kernel_transformer.transform(outliers)
|
|
410
|
+
|
|
411
|
+
# Combine normal and outliers
|
|
412
|
+
X_combined = np.vstack([X_kernel, outliers_kernel])
|
|
413
|
+
y_combined = np.array([1] * len(X) + [-1] * n_outliers)
|
|
414
|
+
|
|
415
|
+
# Partial fit SGD classifier
|
|
416
|
+
self._sgd_classifier.partial_fit(X_combined, y_combined, classes=[-1, 1])
|
|
417
|
+
self._is_fitted = True
|
|
418
|
+
|
|
419
|
+
return self
|
|
420
|
+
|
|
421
|
+
def _generate_outliers(self, X: np.ndarray, n_outliers: int) -> np.ndarray:
|
|
422
|
+
"""Generate synthetic outliers for training.
|
|
423
|
+
|
|
424
|
+
Uses uniform sampling in an expanded bounding box around the data.
|
|
425
|
+
"""
|
|
426
|
+
rng = np.random.default_rng(self.random_state)
|
|
427
|
+
|
|
428
|
+
# Expand bounding box
|
|
429
|
+
min_vals = X.min(axis=0)
|
|
430
|
+
max_vals = X.max(axis=0)
|
|
431
|
+
ranges = max_vals - min_vals
|
|
432
|
+
ranges = np.where(ranges == 0, 1, ranges)
|
|
433
|
+
|
|
434
|
+
# Sample from expanded box
|
|
435
|
+
expansion = 1.5
|
|
436
|
+
outliers = rng.uniform(
|
|
437
|
+
min_vals - expansion * ranges,
|
|
438
|
+
max_vals + expansion * ranges,
|
|
439
|
+
size=(n_outliers, X.shape[1]),
|
|
440
|
+
)
|
|
441
|
+
|
|
442
|
+
return outliers
|
|
443
|
+
|
|
444
|
+
def predict(self, X: np.ndarray) -> np.ndarray:
|
|
445
|
+
"""Predict if samples are outliers.
|
|
446
|
+
|
|
447
|
+
Args:
|
|
448
|
+
X: Test data (n_samples, n_features)
|
|
449
|
+
|
|
450
|
+
Returns:
|
|
451
|
+
Array of predictions: 1 for normal, -1 for outlier
|
|
452
|
+
"""
|
|
453
|
+
if not self._is_fitted:
|
|
454
|
+
raise RuntimeError("Model not fitted. Call partial_fit first.")
|
|
455
|
+
|
|
456
|
+
if X.ndim == 1:
|
|
457
|
+
X = X.reshape(1, -1)
|
|
458
|
+
|
|
459
|
+
X_scaled = self._scaler.transform(X)
|
|
460
|
+
X_kernel = self._kernel_transformer.transform(X_scaled)
|
|
461
|
+
|
|
462
|
+
return self._sgd_classifier.predict(X_kernel)
|
|
463
|
+
|
|
464
|
+
def decision_function(self, X: np.ndarray) -> np.ndarray:
|
|
465
|
+
"""Compute decision function values.
|
|
466
|
+
|
|
467
|
+
Args:
|
|
468
|
+
X: Test data
|
|
469
|
+
|
|
470
|
+
Returns:
|
|
471
|
+
Decision function values (positive = normal, negative = outlier)
|
|
472
|
+
"""
|
|
473
|
+
if not self._is_fitted:
|
|
474
|
+
raise RuntimeError("Model not fitted. Call partial_fit first.")
|
|
475
|
+
|
|
476
|
+
if X.ndim == 1:
|
|
477
|
+
X = X.reshape(1, -1)
|
|
478
|
+
|
|
479
|
+
X_scaled = self._scaler.transform(X)
|
|
480
|
+
X_kernel = self._kernel_transformer.transform(X_scaled)
|
|
481
|
+
|
|
482
|
+
return self._sgd_classifier.decision_function(X_kernel)
|
|
483
|
+
|
|
484
|
+
def fit(self, X: np.ndarray) -> "SGDOneClassSVM":
|
|
485
|
+
"""Fit the model on entire dataset at once.
|
|
486
|
+
|
|
487
|
+
For compatibility with sklearn API.
|
|
488
|
+
"""
|
|
489
|
+
return self.partial_fit(X)
|
|
490
|
+
|
|
491
|
+
|
|
492
|
+
class IncrementalMahalanobis:
    """Incremental Mahalanobis distance computation.

    Maintains running mean and covariance matrix for computing
    Mahalanobis distances without storing all data.

    Memory: O(d²) for d features instead of O(n × d) for n samples.

    Example:
        detector = IncrementalMahalanobis()
        for batch in training_data:
            detector.partial_fit(batch)
        distances = detector.mahalanobis(test_data)
    """

    def __init__(self, regularization: float = 1e-6):
        """Initialize detector.

        Args:
            regularization: Value added to the covariance diagonal before
                inversion
        """
        self.regularization = regularization
        self._n_samples = 0
        self._mean = None
        self._cov_sum = None  # Scatter matrix: sum of (x-mean)(x-mean)^T
        self._inv_cov = None  # Cached inverse; cleared on every update

    def partial_fit(self, X: np.ndarray) -> "IncrementalMahalanobis":
        """Update with new batch.

        Merges the batch's mean and scatter matrix into the running
        totals with the pairwise update, so the result matches a single
        pass over all data seen so far (up to rounding).

        Args:
            X: Data batch (n_samples, n_features)

        Returns:
            self
        """
        if X.ndim == 1:
            X = X.reshape(1, -1)

        n_batch = len(X)
        if n_batch == 0:
            # An empty batch carries no information and would give a NaN mean.
            return self

        batch_mean = X.mean(axis=0)
        centered = X - batch_mean
        batch_scatter = centered.T @ centered

        if self._n_samples == 0:
            self._mean = batch_mean
            self._cov_sum = batch_scatter
        else:
            total = self._n_samples + n_batch
            delta = batch_mean - self._mean

            # Scatter about the merged mean = old scatter + batch scatter
            # + a between-group term. Centering only the new batch on the
            # merged mean would ignore the shift of the earlier samples.
            weight = self._n_samples * n_batch / total
            self._cov_sum = self._cov_sum + batch_scatter + weight * np.outer(delta, delta)
            self._mean = (self._n_samples * self._mean + n_batch * batch_mean) / total

        self._n_samples += n_batch

        # Invalidate cached inverse
        self._inv_cov = None

        return self

    @property
    def covariance(self) -> np.ndarray:
        """Get current sample covariance matrix.

        Raises:
            RuntimeError: If fewer than 2 samples have been seen.
        """
        if self._n_samples < 2:
            raise RuntimeError("Need at least 2 samples for covariance")
        return self._cov_sum / (self._n_samples - 1)

    def _compute_inverse_covariance(self) -> np.ndarray:
        """Compute and cache the inverse of the regularized covariance."""
        if self._inv_cov is None:
            cov = self.covariance
            cov_reg = cov + self.regularization * np.eye(cov.shape[0])
            self._inv_cov = np.linalg.inv(cov_reg)
        return self._inv_cov

    def mahalanobis(self, X: np.ndarray) -> np.ndarray:
        """Compute Mahalanobis distances.

        Args:
            X: Test data (n_samples, n_features)

        Returns:
            Array of Mahalanobis distances

        Raises:
            RuntimeError: If fewer than 2 samples have been seen.
        """
        if self._n_samples < 2:
            raise RuntimeError("Model not fitted with enough samples")

        if X.ndim == 1:
            X = X.reshape(1, -1)

        inv_cov = self._compute_inverse_covariance()
        centered = X - self._mean

        # Mahalanobis distance: sqrt((x-μ)ᵀ Σ⁻¹ (x-μ))
        squared = np.sum((centered @ inv_cov) * centered, axis=1)
        # Rounding can push a near-zero quadratic form slightly negative.
        return np.sqrt(np.maximum(squared, 0.0))

    def predict(self, X: np.ndarray, threshold: float = 3.0) -> np.ndarray:
        """Predict outliers based on Mahalanobis distance.

        Args:
            X: Test data
            threshold: Distance threshold for outlier detection

        Returns:
            Array of predictions: 1 for normal, -1 for outlier
        """
        distances = self.mahalanobis(X)
        return np.where(distances > threshold, -1, 1)
|
|
604
|
+
|
|
605
|
+
|
|
606
|
+
class SGDOnlineMixin:
    """Mixin that adds SGD-based online learning helpers to a host class.

    It offers factories for the incremental models used by this module and
    a driver that feeds a LazyFrame to a model chunk by chunk, so models
    that traditionally need the full data set in memory (SVM,
    covariance-based methods) can be trained in a streaming fashion.

    Example:
        class MemoryEfficientSVMValidator(AnomalyValidator, SGDOnlineMixin):
            def validate(self, lf):
                # Create online SVM
                model = self.create_online_svm(nu=0.05)

                # Stream training data
                for chunk in self.iterate_training_chunks(lf):
                    model.partial_fit(chunk)

                # Predict
                predictions = model.predict(current_data)
    """

    # Class-level defaults; instances get their own value on first assignment.
    _online_config: OnlineLearnerConfig = None
    _online_models: dict[str, IncrementalModel] = None

    def get_online_config(self) -> OnlineLearnerConfig:
        """Return the online learning configuration.

        A default ``OnlineLearnerConfig()`` is created and stored on the
        instance the first time this is called with no configuration set.
        """
        current = self._online_config
        if current is None:
            current = OnlineLearnerConfig()
            self._online_config = current
        return current

    def set_online_config(self, config: OnlineLearnerConfig) -> None:
        """Replace the online learning configuration with ``config``."""
        self._online_config = config

    def create_online_svm(
        self,
        nu: float | None = None,
        n_components: int | None = None,
        **kwargs: Any,
    ) -> SGDOneClassSVM:
        """Build an SGD-based One-Class SVM from the current configuration.

        Args:
            nu: Upper bound on outlier fraction; a falsy value falls back
                to the configured ``nu``.
            n_components: Number of kernel components; a falsy value falls
                back to the configured ``n_components``.
            **kwargs: Additional parameters forwarded to ``SGDOneClassSVM``.

        Returns:
            SGDOneClassSVM instance
        """
        cfg = self.get_online_config()

        # Falsy arguments (None, 0, 0.0) defer to the configuration.
        effective_nu = nu or cfg.nu
        effective_components = n_components or cfg.n_components

        return SGDOneClassSVM(
            nu=effective_nu,
            n_components=effective_components,
            kernel_approx=cfg.kernel_approx,
            learning_rate=cfg.learning_rate_schedule,
            eta0=cfg.learning_rate,
            random_state=cfg.random_state,
            tol=cfg.tol,
            **kwargs,
        )

    def create_online_scaler(self) -> OnlineScaler:
        """Return a new ``OnlineScaler``."""
        scaler = OnlineScaler()
        return scaler

    def create_online_statistics(self, n_features: int) -> OnlineStatistics:
        """Return a new ``OnlineStatistics`` tracker for ``n_features``."""
        tracker = OnlineStatistics(n_features)
        return tracker

    def create_mahalanobis_detector(
        self,
        regularization: float = 1e-6,
    ) -> IncrementalMahalanobis:
        """Return a new ``IncrementalMahalanobis`` detector.

        Args:
            regularization: Passed through to the detector unchanged.
        """
        detector = IncrementalMahalanobis(regularization=regularization)
        return detector

    def train_incrementally(
        self,
        lf: "pl.LazyFrame",
        columns: list[str],
        model: IncrementalModel,
        n_iterations: int | None = None,
    ) -> IncrementalModel:
        """Fit ``model`` by streaming ``lf`` through ``partial_fit``.

        Args:
            lf: Input LazyFrame
            columns: Columns handed to the chunker
            model: Incremental model exposing ``partial_fit``
            n_iterations: Number of passes through the data; a falsy value
                falls back to the configured ``n_iterations``.

        Returns:
            The same ``model`` object after training.
        """
        from truthound.validators.memory.base import DataChunker

        cfg = self.get_online_config()
        if not n_iterations:
            n_iterations = cfg.n_iterations

        chunker = DataChunker(
            chunk_size=cfg.batch_size,
            columns=columns,
            drop_nulls=True,
        )

        for iteration in range(n_iterations):
            # One full pass over the frame, one numpy chunk at a time.
            for batch in chunker.iterate(lf, as_numpy=True):
                model.partial_fit(batch)

            # Logging is optional: the host class may not define a logger.
            if hasattr(self, "logger"):
                self.logger.debug(f"Completed iteration {iteration + 1}/{n_iterations}")

        return model
|