truthound 1.0.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- truthound/__init__.py +162 -0
- truthound/adapters.py +100 -0
- truthound/api.py +365 -0
- truthound/audit/__init__.py +248 -0
- truthound/audit/core.py +967 -0
- truthound/audit/filters.py +620 -0
- truthound/audit/formatters.py +707 -0
- truthound/audit/logger.py +902 -0
- truthound/audit/middleware.py +571 -0
- truthound/audit/storage.py +1083 -0
- truthound/benchmark/__init__.py +123 -0
- truthound/benchmark/base.py +757 -0
- truthound/benchmark/comparison.py +635 -0
- truthound/benchmark/generators.py +706 -0
- truthound/benchmark/reporters.py +718 -0
- truthound/benchmark/runner.py +635 -0
- truthound/benchmark/scenarios.py +712 -0
- truthound/cache.py +252 -0
- truthound/checkpoint/__init__.py +136 -0
- truthound/checkpoint/actions/__init__.py +164 -0
- truthound/checkpoint/actions/base.py +324 -0
- truthound/checkpoint/actions/custom.py +234 -0
- truthound/checkpoint/actions/discord_notify.py +290 -0
- truthound/checkpoint/actions/email_notify.py +405 -0
- truthound/checkpoint/actions/github_action.py +406 -0
- truthound/checkpoint/actions/opsgenie.py +1499 -0
- truthound/checkpoint/actions/pagerduty.py +226 -0
- truthound/checkpoint/actions/slack_notify.py +233 -0
- truthound/checkpoint/actions/store_result.py +249 -0
- truthound/checkpoint/actions/teams_notify.py +1570 -0
- truthound/checkpoint/actions/telegram_notify.py +419 -0
- truthound/checkpoint/actions/update_docs.py +552 -0
- truthound/checkpoint/actions/webhook.py +293 -0
- truthound/checkpoint/analytics/__init__.py +147 -0
- truthound/checkpoint/analytics/aggregations/__init__.py +23 -0
- truthound/checkpoint/analytics/aggregations/rollup.py +481 -0
- truthound/checkpoint/analytics/aggregations/time_bucket.py +306 -0
- truthound/checkpoint/analytics/analyzers/__init__.py +17 -0
- truthound/checkpoint/analytics/analyzers/anomaly.py +386 -0
- truthound/checkpoint/analytics/analyzers/base.py +270 -0
- truthound/checkpoint/analytics/analyzers/forecast.py +421 -0
- truthound/checkpoint/analytics/analyzers/trend.py +314 -0
- truthound/checkpoint/analytics/models.py +292 -0
- truthound/checkpoint/analytics/protocols.py +549 -0
- truthound/checkpoint/analytics/service.py +718 -0
- truthound/checkpoint/analytics/stores/__init__.py +16 -0
- truthound/checkpoint/analytics/stores/base.py +306 -0
- truthound/checkpoint/analytics/stores/memory_store.py +353 -0
- truthound/checkpoint/analytics/stores/sqlite_store.py +557 -0
- truthound/checkpoint/analytics/stores/timescale_store.py +501 -0
- truthound/checkpoint/async_actions.py +794 -0
- truthound/checkpoint/async_base.py +708 -0
- truthound/checkpoint/async_checkpoint.py +617 -0
- truthound/checkpoint/async_runner.py +639 -0
- truthound/checkpoint/checkpoint.py +527 -0
- truthound/checkpoint/ci/__init__.py +61 -0
- truthound/checkpoint/ci/detector.py +355 -0
- truthound/checkpoint/ci/reporter.py +436 -0
- truthound/checkpoint/ci/templates.py +454 -0
- truthound/checkpoint/circuitbreaker/__init__.py +133 -0
- truthound/checkpoint/circuitbreaker/breaker.py +542 -0
- truthound/checkpoint/circuitbreaker/core.py +252 -0
- truthound/checkpoint/circuitbreaker/detection.py +459 -0
- truthound/checkpoint/circuitbreaker/middleware.py +389 -0
- truthound/checkpoint/circuitbreaker/registry.py +357 -0
- truthound/checkpoint/distributed/__init__.py +139 -0
- truthound/checkpoint/distributed/backends/__init__.py +35 -0
- truthound/checkpoint/distributed/backends/celery_backend.py +503 -0
- truthound/checkpoint/distributed/backends/kubernetes_backend.py +696 -0
- truthound/checkpoint/distributed/backends/local_backend.py +397 -0
- truthound/checkpoint/distributed/backends/ray_backend.py +625 -0
- truthound/checkpoint/distributed/base.py +774 -0
- truthound/checkpoint/distributed/orchestrator.py +765 -0
- truthound/checkpoint/distributed/protocols.py +842 -0
- truthound/checkpoint/distributed/registry.py +449 -0
- truthound/checkpoint/idempotency/__init__.py +120 -0
- truthound/checkpoint/idempotency/core.py +295 -0
- truthound/checkpoint/idempotency/fingerprint.py +454 -0
- truthound/checkpoint/idempotency/locking.py +604 -0
- truthound/checkpoint/idempotency/service.py +592 -0
- truthound/checkpoint/idempotency/stores.py +653 -0
- truthound/checkpoint/monitoring/__init__.py +134 -0
- truthound/checkpoint/monitoring/aggregators/__init__.py +15 -0
- truthound/checkpoint/monitoring/aggregators/base.py +372 -0
- truthound/checkpoint/monitoring/aggregators/realtime.py +300 -0
- truthound/checkpoint/monitoring/aggregators/window.py +493 -0
- truthound/checkpoint/monitoring/collectors/__init__.py +17 -0
- truthound/checkpoint/monitoring/collectors/base.py +257 -0
- truthound/checkpoint/monitoring/collectors/memory_collector.py +617 -0
- truthound/checkpoint/monitoring/collectors/prometheus_collector.py +451 -0
- truthound/checkpoint/monitoring/collectors/redis_collector.py +518 -0
- truthound/checkpoint/monitoring/events.py +410 -0
- truthound/checkpoint/monitoring/protocols.py +636 -0
- truthound/checkpoint/monitoring/service.py +578 -0
- truthound/checkpoint/monitoring/views/__init__.py +17 -0
- truthound/checkpoint/monitoring/views/base.py +172 -0
- truthound/checkpoint/monitoring/views/queue_view.py +220 -0
- truthound/checkpoint/monitoring/views/task_view.py +240 -0
- truthound/checkpoint/monitoring/views/worker_view.py +263 -0
- truthound/checkpoint/registry.py +337 -0
- truthound/checkpoint/runner.py +356 -0
- truthound/checkpoint/transaction/__init__.py +133 -0
- truthound/checkpoint/transaction/base.py +389 -0
- truthound/checkpoint/transaction/compensatable.py +537 -0
- truthound/checkpoint/transaction/coordinator.py +576 -0
- truthound/checkpoint/transaction/executor.py +622 -0
- truthound/checkpoint/transaction/idempotency.py +534 -0
- truthound/checkpoint/transaction/saga/__init__.py +143 -0
- truthound/checkpoint/transaction/saga/builder.py +584 -0
- truthound/checkpoint/transaction/saga/definition.py +515 -0
- truthound/checkpoint/transaction/saga/event_store.py +542 -0
- truthound/checkpoint/transaction/saga/patterns.py +833 -0
- truthound/checkpoint/transaction/saga/runner.py +718 -0
- truthound/checkpoint/transaction/saga/state_machine.py +793 -0
- truthound/checkpoint/transaction/saga/strategies.py +780 -0
- truthound/checkpoint/transaction/saga/testing.py +886 -0
- truthound/checkpoint/triggers/__init__.py +58 -0
- truthound/checkpoint/triggers/base.py +237 -0
- truthound/checkpoint/triggers/event.py +385 -0
- truthound/checkpoint/triggers/schedule.py +355 -0
- truthound/cli.py +2358 -0
- truthound/cli_modules/__init__.py +124 -0
- truthound/cli_modules/advanced/__init__.py +45 -0
- truthound/cli_modules/advanced/benchmark.py +343 -0
- truthound/cli_modules/advanced/docs.py +225 -0
- truthound/cli_modules/advanced/lineage.py +209 -0
- truthound/cli_modules/advanced/ml.py +320 -0
- truthound/cli_modules/advanced/realtime.py +196 -0
- truthound/cli_modules/checkpoint/__init__.py +46 -0
- truthound/cli_modules/checkpoint/init.py +114 -0
- truthound/cli_modules/checkpoint/list.py +71 -0
- truthound/cli_modules/checkpoint/run.py +159 -0
- truthound/cli_modules/checkpoint/validate.py +67 -0
- truthound/cli_modules/common/__init__.py +71 -0
- truthound/cli_modules/common/errors.py +414 -0
- truthound/cli_modules/common/options.py +419 -0
- truthound/cli_modules/common/output.py +507 -0
- truthound/cli_modules/common/protocol.py +552 -0
- truthound/cli_modules/core/__init__.py +48 -0
- truthound/cli_modules/core/check.py +123 -0
- truthound/cli_modules/core/compare.py +104 -0
- truthound/cli_modules/core/learn.py +57 -0
- truthound/cli_modules/core/mask.py +77 -0
- truthound/cli_modules/core/profile.py +65 -0
- truthound/cli_modules/core/scan.py +61 -0
- truthound/cli_modules/profiler/__init__.py +51 -0
- truthound/cli_modules/profiler/auto_profile.py +175 -0
- truthound/cli_modules/profiler/metadata.py +107 -0
- truthound/cli_modules/profiler/suite.py +283 -0
- truthound/cli_modules/registry.py +431 -0
- truthound/cli_modules/scaffolding/__init__.py +89 -0
- truthound/cli_modules/scaffolding/base.py +631 -0
- truthound/cli_modules/scaffolding/commands.py +545 -0
- truthound/cli_modules/scaffolding/plugins.py +1072 -0
- truthound/cli_modules/scaffolding/reporters.py +594 -0
- truthound/cli_modules/scaffolding/validators.py +1127 -0
- truthound/common/__init__.py +18 -0
- truthound/common/resilience/__init__.py +130 -0
- truthound/common/resilience/bulkhead.py +266 -0
- truthound/common/resilience/circuit_breaker.py +516 -0
- truthound/common/resilience/composite.py +332 -0
- truthound/common/resilience/config.py +292 -0
- truthound/common/resilience/protocols.py +217 -0
- truthound/common/resilience/rate_limiter.py +404 -0
- truthound/common/resilience/retry.py +341 -0
- truthound/datadocs/__init__.py +260 -0
- truthound/datadocs/base.py +571 -0
- truthound/datadocs/builder.py +761 -0
- truthound/datadocs/charts.py +764 -0
- truthound/datadocs/dashboard/__init__.py +63 -0
- truthound/datadocs/dashboard/app.py +576 -0
- truthound/datadocs/dashboard/components.py +584 -0
- truthound/datadocs/dashboard/state.py +240 -0
- truthound/datadocs/engine/__init__.py +46 -0
- truthound/datadocs/engine/context.py +376 -0
- truthound/datadocs/engine/pipeline.py +618 -0
- truthound/datadocs/engine/registry.py +469 -0
- truthound/datadocs/exporters/__init__.py +49 -0
- truthound/datadocs/exporters/base.py +198 -0
- truthound/datadocs/exporters/html.py +178 -0
- truthound/datadocs/exporters/json_exporter.py +253 -0
- truthound/datadocs/exporters/markdown.py +284 -0
- truthound/datadocs/exporters/pdf.py +392 -0
- truthound/datadocs/i18n/__init__.py +86 -0
- truthound/datadocs/i18n/catalog.py +960 -0
- truthound/datadocs/i18n/formatting.py +505 -0
- truthound/datadocs/i18n/loader.py +256 -0
- truthound/datadocs/i18n/plurals.py +378 -0
- truthound/datadocs/renderers/__init__.py +42 -0
- truthound/datadocs/renderers/base.py +401 -0
- truthound/datadocs/renderers/custom.py +342 -0
- truthound/datadocs/renderers/jinja.py +697 -0
- truthound/datadocs/sections.py +736 -0
- truthound/datadocs/styles.py +931 -0
- truthound/datadocs/themes/__init__.py +101 -0
- truthound/datadocs/themes/base.py +336 -0
- truthound/datadocs/themes/default.py +417 -0
- truthound/datadocs/themes/enterprise.py +419 -0
- truthound/datadocs/themes/loader.py +336 -0
- truthound/datadocs/themes.py +301 -0
- truthound/datadocs/transformers/__init__.py +57 -0
- truthound/datadocs/transformers/base.py +268 -0
- truthound/datadocs/transformers/enrichers.py +544 -0
- truthound/datadocs/transformers/filters.py +447 -0
- truthound/datadocs/transformers/i18n.py +468 -0
- truthound/datadocs/versioning/__init__.py +62 -0
- truthound/datadocs/versioning/diff.py +639 -0
- truthound/datadocs/versioning/storage.py +497 -0
- truthound/datadocs/versioning/version.py +358 -0
- truthound/datasources/__init__.py +223 -0
- truthound/datasources/_async_protocols.py +222 -0
- truthound/datasources/_protocols.py +159 -0
- truthound/datasources/adapters.py +428 -0
- truthound/datasources/async_base.py +599 -0
- truthound/datasources/async_factory.py +511 -0
- truthound/datasources/base.py +516 -0
- truthound/datasources/factory.py +433 -0
- truthound/datasources/nosql/__init__.py +47 -0
- truthound/datasources/nosql/base.py +487 -0
- truthound/datasources/nosql/elasticsearch.py +801 -0
- truthound/datasources/nosql/mongodb.py +636 -0
- truthound/datasources/pandas_optimized.py +582 -0
- truthound/datasources/pandas_source.py +216 -0
- truthound/datasources/polars_source.py +395 -0
- truthound/datasources/spark_source.py +479 -0
- truthound/datasources/sql/__init__.py +154 -0
- truthound/datasources/sql/base.py +710 -0
- truthound/datasources/sql/bigquery.py +410 -0
- truthound/datasources/sql/cloud_base.py +199 -0
- truthound/datasources/sql/databricks.py +471 -0
- truthound/datasources/sql/mysql.py +316 -0
- truthound/datasources/sql/oracle.py +427 -0
- truthound/datasources/sql/postgresql.py +321 -0
- truthound/datasources/sql/redshift.py +479 -0
- truthound/datasources/sql/snowflake.py +439 -0
- truthound/datasources/sql/sqlite.py +286 -0
- truthound/datasources/sql/sqlserver.py +437 -0
- truthound/datasources/streaming/__init__.py +47 -0
- truthound/datasources/streaming/base.py +350 -0
- truthound/datasources/streaming/kafka.py +670 -0
- truthound/decorators.py +98 -0
- truthound/docs/__init__.py +69 -0
- truthound/docs/extractor.py +971 -0
- truthound/docs/generator.py +601 -0
- truthound/docs/parser.py +1037 -0
- truthound/docs/renderer.py +999 -0
- truthound/drift/__init__.py +22 -0
- truthound/drift/compare.py +189 -0
- truthound/drift/detectors.py +464 -0
- truthound/drift/report.py +160 -0
- truthound/execution/__init__.py +65 -0
- truthound/execution/_protocols.py +324 -0
- truthound/execution/base.py +576 -0
- truthound/execution/distributed/__init__.py +179 -0
- truthound/execution/distributed/aggregations.py +731 -0
- truthound/execution/distributed/arrow_bridge.py +817 -0
- truthound/execution/distributed/base.py +550 -0
- truthound/execution/distributed/dask_engine.py +976 -0
- truthound/execution/distributed/mixins.py +766 -0
- truthound/execution/distributed/protocols.py +756 -0
- truthound/execution/distributed/ray_engine.py +1127 -0
- truthound/execution/distributed/registry.py +446 -0
- truthound/execution/distributed/spark_engine.py +1011 -0
- truthound/execution/distributed/validator_adapter.py +682 -0
- truthound/execution/pandas_engine.py +401 -0
- truthound/execution/polars_engine.py +497 -0
- truthound/execution/pushdown/__init__.py +230 -0
- truthound/execution/pushdown/ast.py +1550 -0
- truthound/execution/pushdown/builder.py +1550 -0
- truthound/execution/pushdown/dialects.py +1072 -0
- truthound/execution/pushdown/executor.py +829 -0
- truthound/execution/pushdown/optimizer.py +1041 -0
- truthound/execution/sql_engine.py +518 -0
- truthound/infrastructure/__init__.py +189 -0
- truthound/infrastructure/audit.py +1515 -0
- truthound/infrastructure/config.py +1133 -0
- truthound/infrastructure/encryption.py +1132 -0
- truthound/infrastructure/logging.py +1503 -0
- truthound/infrastructure/metrics.py +1220 -0
- truthound/lineage/__init__.py +89 -0
- truthound/lineage/base.py +746 -0
- truthound/lineage/impact_analysis.py +474 -0
- truthound/lineage/integrations/__init__.py +22 -0
- truthound/lineage/integrations/openlineage.py +548 -0
- truthound/lineage/tracker.py +512 -0
- truthound/lineage/visualization/__init__.py +33 -0
- truthound/lineage/visualization/protocols.py +145 -0
- truthound/lineage/visualization/renderers/__init__.py +20 -0
- truthound/lineage/visualization/renderers/cytoscape.py +329 -0
- truthound/lineage/visualization/renderers/d3.py +331 -0
- truthound/lineage/visualization/renderers/graphviz.py +276 -0
- truthound/lineage/visualization/renderers/mermaid.py +308 -0
- truthound/maskers.py +113 -0
- truthound/ml/__init__.py +124 -0
- truthound/ml/anomaly_models/__init__.py +31 -0
- truthound/ml/anomaly_models/ensemble.py +362 -0
- truthound/ml/anomaly_models/isolation_forest.py +444 -0
- truthound/ml/anomaly_models/statistical.py +392 -0
- truthound/ml/base.py +1178 -0
- truthound/ml/drift_detection/__init__.py +26 -0
- truthound/ml/drift_detection/concept.py +381 -0
- truthound/ml/drift_detection/distribution.py +361 -0
- truthound/ml/drift_detection/feature.py +442 -0
- truthound/ml/drift_detection/multivariate.py +495 -0
- truthound/ml/monitoring/__init__.py +88 -0
- truthound/ml/monitoring/alerting/__init__.py +33 -0
- truthound/ml/monitoring/alerting/handlers.py +427 -0
- truthound/ml/monitoring/alerting/rules.py +508 -0
- truthound/ml/monitoring/collectors/__init__.py +19 -0
- truthound/ml/monitoring/collectors/composite.py +105 -0
- truthound/ml/monitoring/collectors/drift.py +324 -0
- truthound/ml/monitoring/collectors/performance.py +179 -0
- truthound/ml/monitoring/collectors/quality.py +369 -0
- truthound/ml/monitoring/monitor.py +536 -0
- truthound/ml/monitoring/protocols.py +451 -0
- truthound/ml/monitoring/stores/__init__.py +15 -0
- truthound/ml/monitoring/stores/memory.py +201 -0
- truthound/ml/monitoring/stores/prometheus.py +296 -0
- truthound/ml/rule_learning/__init__.py +25 -0
- truthound/ml/rule_learning/constraint_miner.py +443 -0
- truthound/ml/rule_learning/pattern_learner.py +499 -0
- truthound/ml/rule_learning/profile_learner.py +462 -0
- truthound/multitenancy/__init__.py +326 -0
- truthound/multitenancy/core.py +852 -0
- truthound/multitenancy/integration.py +597 -0
- truthound/multitenancy/isolation.py +630 -0
- truthound/multitenancy/manager.py +770 -0
- truthound/multitenancy/middleware.py +765 -0
- truthound/multitenancy/quota.py +537 -0
- truthound/multitenancy/resolvers.py +603 -0
- truthound/multitenancy/storage.py +703 -0
- truthound/observability/__init__.py +307 -0
- truthound/observability/context.py +531 -0
- truthound/observability/instrumentation.py +611 -0
- truthound/observability/logging.py +887 -0
- truthound/observability/metrics.py +1157 -0
- truthound/observability/tracing/__init__.py +178 -0
- truthound/observability/tracing/baggage.py +310 -0
- truthound/observability/tracing/config.py +426 -0
- truthound/observability/tracing/exporter.py +787 -0
- truthound/observability/tracing/integration.py +1018 -0
- truthound/observability/tracing/otel/__init__.py +146 -0
- truthound/observability/tracing/otel/adapter.py +982 -0
- truthound/observability/tracing/otel/bridge.py +1177 -0
- truthound/observability/tracing/otel/compat.py +681 -0
- truthound/observability/tracing/otel/config.py +691 -0
- truthound/observability/tracing/otel/detection.py +327 -0
- truthound/observability/tracing/otel/protocols.py +426 -0
- truthound/observability/tracing/processor.py +561 -0
- truthound/observability/tracing/propagator.py +757 -0
- truthound/observability/tracing/provider.py +569 -0
- truthound/observability/tracing/resource.py +515 -0
- truthound/observability/tracing/sampler.py +487 -0
- truthound/observability/tracing/span.py +676 -0
- truthound/plugins/__init__.py +198 -0
- truthound/plugins/base.py +599 -0
- truthound/plugins/cli.py +680 -0
- truthound/plugins/dependencies/__init__.py +42 -0
- truthound/plugins/dependencies/graph.py +422 -0
- truthound/plugins/dependencies/resolver.py +417 -0
- truthound/plugins/discovery.py +379 -0
- truthound/plugins/docs/__init__.py +46 -0
- truthound/plugins/docs/extractor.py +444 -0
- truthound/plugins/docs/renderer.py +499 -0
- truthound/plugins/enterprise_manager.py +877 -0
- truthound/plugins/examples/__init__.py +19 -0
- truthound/plugins/examples/custom_validators.py +317 -0
- truthound/plugins/examples/slack_notifier.py +312 -0
- truthound/plugins/examples/xml_reporter.py +254 -0
- truthound/plugins/hooks.py +558 -0
- truthound/plugins/lifecycle/__init__.py +43 -0
- truthound/plugins/lifecycle/hot_reload.py +402 -0
- truthound/plugins/lifecycle/manager.py +371 -0
- truthound/plugins/manager.py +736 -0
- truthound/plugins/registry.py +338 -0
- truthound/plugins/security/__init__.py +93 -0
- truthound/plugins/security/exceptions.py +332 -0
- truthound/plugins/security/policies.py +348 -0
- truthound/plugins/security/protocols.py +643 -0
- truthound/plugins/security/sandbox/__init__.py +45 -0
- truthound/plugins/security/sandbox/context.py +158 -0
- truthound/plugins/security/sandbox/engines/__init__.py +19 -0
- truthound/plugins/security/sandbox/engines/container.py +379 -0
- truthound/plugins/security/sandbox/engines/noop.py +144 -0
- truthound/plugins/security/sandbox/engines/process.py +336 -0
- truthound/plugins/security/sandbox/factory.py +211 -0
- truthound/plugins/security/signing/__init__.py +57 -0
- truthound/plugins/security/signing/service.py +330 -0
- truthound/plugins/security/signing/trust_store.py +368 -0
- truthound/plugins/security/signing/verifier.py +459 -0
- truthound/plugins/versioning/__init__.py +41 -0
- truthound/plugins/versioning/constraints.py +297 -0
- truthound/plugins/versioning/resolver.py +329 -0
- truthound/profiler/__init__.py +1729 -0
- truthound/profiler/_lazy.py +452 -0
- truthound/profiler/ab_testing/__init__.py +80 -0
- truthound/profiler/ab_testing/analysis.py +449 -0
- truthound/profiler/ab_testing/base.py +257 -0
- truthound/profiler/ab_testing/experiment.py +395 -0
- truthound/profiler/ab_testing/tracking.py +368 -0
- truthound/profiler/auto_threshold.py +1170 -0
- truthound/profiler/base.py +579 -0
- truthound/profiler/cache_patterns.py +911 -0
- truthound/profiler/caching.py +1303 -0
- truthound/profiler/column_profiler.py +712 -0
- truthound/profiler/comparison.py +1007 -0
- truthound/profiler/custom_patterns.py +1170 -0
- truthound/profiler/dashboard/__init__.py +50 -0
- truthound/profiler/dashboard/app.py +476 -0
- truthound/profiler/dashboard/components.py +457 -0
- truthound/profiler/dashboard/config.py +72 -0
- truthound/profiler/distributed/__init__.py +83 -0
- truthound/profiler/distributed/base.py +281 -0
- truthound/profiler/distributed/dask_backend.py +498 -0
- truthound/profiler/distributed/local_backend.py +293 -0
- truthound/profiler/distributed/profiler.py +304 -0
- truthound/profiler/distributed/ray_backend.py +374 -0
- truthound/profiler/distributed/spark_backend.py +375 -0
- truthound/profiler/distributed.py +1366 -0
- truthound/profiler/enterprise_sampling.py +1065 -0
- truthound/profiler/errors.py +488 -0
- truthound/profiler/evolution/__init__.py +91 -0
- truthound/profiler/evolution/alerts.py +426 -0
- truthound/profiler/evolution/changes.py +206 -0
- truthound/profiler/evolution/compatibility.py +365 -0
- truthound/profiler/evolution/detector.py +372 -0
- truthound/profiler/evolution/protocols.py +121 -0
- truthound/profiler/generators/__init__.py +48 -0
- truthound/profiler/generators/base.py +384 -0
- truthound/profiler/generators/ml_rules.py +375 -0
- truthound/profiler/generators/pattern_rules.py +384 -0
- truthound/profiler/generators/schema_rules.py +267 -0
- truthound/profiler/generators/stats_rules.py +324 -0
- truthound/profiler/generators/suite_generator.py +857 -0
- truthound/profiler/i18n.py +1542 -0
- truthound/profiler/incremental.py +554 -0
- truthound/profiler/incremental_validation.py +1710 -0
- truthound/profiler/integration/__init__.py +73 -0
- truthound/profiler/integration/adapters.py +345 -0
- truthound/profiler/integration/context.py +371 -0
- truthound/profiler/integration/executor.py +527 -0
- truthound/profiler/integration/naming.py +75 -0
- truthound/profiler/integration/protocols.py +243 -0
- truthound/profiler/memory.py +1185 -0
- truthound/profiler/migration/__init__.py +60 -0
- truthound/profiler/migration/base.py +345 -0
- truthound/profiler/migration/manager.py +444 -0
- truthound/profiler/migration/v1_0_to_v1_1.py +484 -0
- truthound/profiler/ml/__init__.py +73 -0
- truthound/profiler/ml/base.py +244 -0
- truthound/profiler/ml/classifier.py +507 -0
- truthound/profiler/ml/feature_extraction.py +604 -0
- truthound/profiler/ml/pretrained.py +448 -0
- truthound/profiler/ml_inference.py +1276 -0
- truthound/profiler/native_patterns.py +815 -0
- truthound/profiler/observability.py +1184 -0
- truthound/profiler/process_timeout.py +1566 -0
- truthound/profiler/progress.py +568 -0
- truthound/profiler/progress_callbacks.py +1734 -0
- truthound/profiler/quality.py +1345 -0
- truthound/profiler/resilience.py +1180 -0
- truthound/profiler/sampled_matcher.py +794 -0
- truthound/profiler/sampling.py +1288 -0
- truthound/profiler/scheduling/__init__.py +82 -0
- truthound/profiler/scheduling/protocols.py +214 -0
- truthound/profiler/scheduling/scheduler.py +474 -0
- truthound/profiler/scheduling/storage.py +457 -0
- truthound/profiler/scheduling/triggers.py +449 -0
- truthound/profiler/schema.py +603 -0
- truthound/profiler/streaming.py +685 -0
- truthound/profiler/streaming_patterns.py +1354 -0
- truthound/profiler/suite_cli.py +625 -0
- truthound/profiler/suite_config.py +789 -0
- truthound/profiler/suite_export.py +1268 -0
- truthound/profiler/table_profiler.py +547 -0
- truthound/profiler/timeout.py +565 -0
- truthound/profiler/validation.py +1532 -0
- truthound/profiler/visualization/__init__.py +118 -0
- truthound/profiler/visualization/base.py +346 -0
- truthound/profiler/visualization/generator.py +1259 -0
- truthound/profiler/visualization/plotly_renderer.py +811 -0
- truthound/profiler/visualization/renderers.py +669 -0
- truthound/profiler/visualization/sections.py +540 -0
- truthound/profiler/visualization.py +2122 -0
- truthound/profiler/yaml_validation.py +1151 -0
- truthound/py.typed +0 -0
- truthound/ratelimit/__init__.py +248 -0
- truthound/ratelimit/algorithms.py +1108 -0
- truthound/ratelimit/core.py +573 -0
- truthound/ratelimit/integration.py +532 -0
- truthound/ratelimit/limiter.py +663 -0
- truthound/ratelimit/middleware.py +700 -0
- truthound/ratelimit/policy.py +792 -0
- truthound/ratelimit/storage.py +763 -0
- truthound/rbac/__init__.py +340 -0
- truthound/rbac/core.py +976 -0
- truthound/rbac/integration.py +760 -0
- truthound/rbac/manager.py +1052 -0
- truthound/rbac/middleware.py +842 -0
- truthound/rbac/policy.py +954 -0
- truthound/rbac/storage.py +878 -0
- truthound/realtime/__init__.py +141 -0
- truthound/realtime/adapters/__init__.py +43 -0
- truthound/realtime/adapters/base.py +533 -0
- truthound/realtime/adapters/kafka.py +487 -0
- truthound/realtime/adapters/kinesis.py +479 -0
- truthound/realtime/adapters/mock.py +243 -0
- truthound/realtime/base.py +553 -0
- truthound/realtime/factory.py +382 -0
- truthound/realtime/incremental.py +660 -0
- truthound/realtime/processing/__init__.py +67 -0
- truthound/realtime/processing/exactly_once.py +575 -0
- truthound/realtime/processing/state.py +547 -0
- truthound/realtime/processing/windows.py +647 -0
- truthound/realtime/protocols.py +569 -0
- truthound/realtime/streaming.py +605 -0
- truthound/realtime/testing/__init__.py +32 -0
- truthound/realtime/testing/containers.py +615 -0
- truthound/realtime/testing/fixtures.py +484 -0
- truthound/report.py +280 -0
- truthound/reporters/__init__.py +46 -0
- truthound/reporters/_protocols.py +30 -0
- truthound/reporters/base.py +324 -0
- truthound/reporters/ci/__init__.py +66 -0
- truthound/reporters/ci/azure.py +436 -0
- truthound/reporters/ci/base.py +509 -0
- truthound/reporters/ci/bitbucket.py +567 -0
- truthound/reporters/ci/circleci.py +547 -0
- truthound/reporters/ci/detection.py +364 -0
- truthound/reporters/ci/factory.py +182 -0
- truthound/reporters/ci/github.py +388 -0
- truthound/reporters/ci/gitlab.py +471 -0
- truthound/reporters/ci/jenkins.py +525 -0
- truthound/reporters/console_reporter.py +299 -0
- truthound/reporters/factory.py +211 -0
- truthound/reporters/html_reporter.py +524 -0
- truthound/reporters/json_reporter.py +256 -0
- truthound/reporters/markdown_reporter.py +280 -0
- truthound/reporters/sdk/__init__.py +174 -0
- truthound/reporters/sdk/builder.py +558 -0
- truthound/reporters/sdk/mixins.py +1150 -0
- truthound/reporters/sdk/schema.py +1493 -0
- truthound/reporters/sdk/templates.py +666 -0
- truthound/reporters/sdk/testing.py +968 -0
- truthound/scanners.py +170 -0
- truthound/scheduling/__init__.py +122 -0
- truthound/scheduling/cron.py +1136 -0
- truthound/scheduling/presets.py +212 -0
- truthound/schema.py +275 -0
- truthound/secrets/__init__.py +173 -0
- truthound/secrets/base.py +618 -0
- truthound/secrets/cloud.py +682 -0
- truthound/secrets/integration.py +507 -0
- truthound/secrets/manager.py +633 -0
- truthound/secrets/oidc/__init__.py +172 -0
- truthound/secrets/oidc/base.py +902 -0
- truthound/secrets/oidc/credential_provider.py +623 -0
- truthound/secrets/oidc/exchangers.py +1001 -0
- truthound/secrets/oidc/github/__init__.py +110 -0
- truthound/secrets/oidc/github/claims.py +718 -0
- truthound/secrets/oidc/github/enhanced_provider.py +693 -0
- truthound/secrets/oidc/github/trust_policy.py +742 -0
- truthound/secrets/oidc/github/verification.py +723 -0
- truthound/secrets/oidc/github/workflow.py +691 -0
- truthound/secrets/oidc/providers.py +825 -0
- truthound/secrets/providers.py +506 -0
- truthound/secrets/resolver.py +495 -0
- truthound/stores/__init__.py +177 -0
- truthound/stores/backends/__init__.py +18 -0
- truthound/stores/backends/_protocols.py +340 -0
- truthound/stores/backends/azure_blob.py +530 -0
- truthound/stores/backends/concurrent_filesystem.py +915 -0
- truthound/stores/backends/connection_pool.py +1365 -0
- truthound/stores/backends/database.py +743 -0
- truthound/stores/backends/filesystem.py +538 -0
- truthound/stores/backends/gcs.py +399 -0
- truthound/stores/backends/memory.py +354 -0
- truthound/stores/backends/s3.py +434 -0
- truthound/stores/backpressure/__init__.py +84 -0
- truthound/stores/backpressure/base.py +375 -0
- truthound/stores/backpressure/circuit_breaker.py +434 -0
- truthound/stores/backpressure/monitor.py +376 -0
- truthound/stores/backpressure/strategies.py +677 -0
- truthound/stores/base.py +551 -0
- truthound/stores/batching/__init__.py +65 -0
- truthound/stores/batching/base.py +305 -0
- truthound/stores/batching/buffer.py +370 -0
- truthound/stores/batching/store.py +248 -0
- truthound/stores/batching/writer.py +521 -0
- truthound/stores/caching/__init__.py +60 -0
- truthound/stores/caching/backends.py +684 -0
- truthound/stores/caching/base.py +356 -0
- truthound/stores/caching/store.py +305 -0
- truthound/stores/compression/__init__.py +193 -0
- truthound/stores/compression/adaptive.py +694 -0
- truthound/stores/compression/base.py +514 -0
- truthound/stores/compression/pipeline.py +868 -0
- truthound/stores/compression/providers.py +672 -0
- truthound/stores/compression/streaming.py +832 -0
- truthound/stores/concurrency/__init__.py +81 -0
- truthound/stores/concurrency/atomic.py +556 -0
- truthound/stores/concurrency/index.py +775 -0
- truthound/stores/concurrency/locks.py +576 -0
- truthound/stores/concurrency/manager.py +482 -0
- truthound/stores/encryption/__init__.py +297 -0
- truthound/stores/encryption/base.py +952 -0
- truthound/stores/encryption/keys.py +1191 -0
- truthound/stores/encryption/pipeline.py +903 -0
- truthound/stores/encryption/providers.py +953 -0
- truthound/stores/encryption/streaming.py +950 -0
- truthound/stores/expectations.py +227 -0
- truthound/stores/factory.py +246 -0
- truthound/stores/migration/__init__.py +75 -0
- truthound/stores/migration/base.py +480 -0
- truthound/stores/migration/manager.py +347 -0
- truthound/stores/migration/registry.py +382 -0
- truthound/stores/migration/store.py +559 -0
- truthound/stores/observability/__init__.py +106 -0
- truthound/stores/observability/audit.py +718 -0
- truthound/stores/observability/config.py +270 -0
- truthound/stores/observability/factory.py +208 -0
- truthound/stores/observability/metrics.py +636 -0
- truthound/stores/observability/protocols.py +410 -0
- truthound/stores/observability/store.py +570 -0
- truthound/stores/observability/tracing.py +784 -0
- truthound/stores/replication/__init__.py +76 -0
- truthound/stores/replication/base.py +260 -0
- truthound/stores/replication/monitor.py +269 -0
- truthound/stores/replication/store.py +439 -0
- truthound/stores/replication/syncer.py +391 -0
- truthound/stores/results.py +359 -0
- truthound/stores/retention/__init__.py +77 -0
- truthound/stores/retention/base.py +378 -0
- truthound/stores/retention/policies.py +621 -0
- truthound/stores/retention/scheduler.py +279 -0
- truthound/stores/retention/store.py +526 -0
- truthound/stores/streaming/__init__.py +138 -0
- truthound/stores/streaming/base.py +801 -0
- truthound/stores/streaming/database.py +984 -0
- truthound/stores/streaming/filesystem.py +719 -0
- truthound/stores/streaming/reader.py +629 -0
- truthound/stores/streaming/s3.py +843 -0
- truthound/stores/streaming/writer.py +790 -0
- truthound/stores/tiering/__init__.py +108 -0
- truthound/stores/tiering/base.py +462 -0
- truthound/stores/tiering/manager.py +249 -0
- truthound/stores/tiering/policies.py +692 -0
- truthound/stores/tiering/store.py +526 -0
- truthound/stores/versioning/__init__.py +56 -0
- truthound/stores/versioning/base.py +376 -0
- truthound/stores/versioning/store.py +660 -0
- truthound/stores/versioning/strategies.py +353 -0
- truthound/types.py +56 -0
- truthound/validators/__init__.py +774 -0
- truthound/validators/aggregate/__init__.py +27 -0
- truthound/validators/aggregate/central.py +116 -0
- truthound/validators/aggregate/extremes.py +116 -0
- truthound/validators/aggregate/spread.py +118 -0
- truthound/validators/aggregate/sum.py +64 -0
- truthound/validators/aggregate/type.py +78 -0
- truthound/validators/anomaly/__init__.py +93 -0
- truthound/validators/anomaly/base.py +431 -0
- truthound/validators/anomaly/ml_based.py +1190 -0
- truthound/validators/anomaly/multivariate.py +647 -0
- truthound/validators/anomaly/statistical.py +599 -0
- truthound/validators/base.py +1089 -0
- truthound/validators/business_rule/__init__.py +46 -0
- truthound/validators/business_rule/base.py +147 -0
- truthound/validators/business_rule/checksum.py +509 -0
- truthound/validators/business_rule/financial.py +526 -0
- truthound/validators/cache.py +733 -0
- truthound/validators/completeness/__init__.py +39 -0
- truthound/validators/completeness/conditional.py +73 -0
- truthound/validators/completeness/default.py +98 -0
- truthound/validators/completeness/empty.py +103 -0
- truthound/validators/completeness/nan.py +337 -0
- truthound/validators/completeness/null.py +152 -0
- truthound/validators/cross_table/__init__.py +17 -0
- truthound/validators/cross_table/aggregate.py +333 -0
- truthound/validators/cross_table/row_count.py +122 -0
- truthound/validators/datetime/__init__.py +29 -0
- truthound/validators/datetime/format.py +78 -0
- truthound/validators/datetime/freshness.py +269 -0
- truthound/validators/datetime/order.py +73 -0
- truthound/validators/datetime/parseable.py +185 -0
- truthound/validators/datetime/range.py +202 -0
- truthound/validators/datetime/timezone.py +69 -0
- truthound/validators/distribution/__init__.py +49 -0
- truthound/validators/distribution/distribution.py +128 -0
- truthound/validators/distribution/monotonic.py +119 -0
- truthound/validators/distribution/outlier.py +178 -0
- truthound/validators/distribution/quantile.py +80 -0
- truthound/validators/distribution/range.py +254 -0
- truthound/validators/distribution/set.py +125 -0
- truthound/validators/distribution/statistical.py +459 -0
- truthound/validators/drift/__init__.py +79 -0
- truthound/validators/drift/base.py +427 -0
- truthound/validators/drift/multi_feature.py +401 -0
- truthound/validators/drift/numeric.py +395 -0
- truthound/validators/drift/psi.py +446 -0
- truthound/validators/drift/statistical.py +510 -0
- truthound/validators/enterprise.py +1658 -0
- truthound/validators/geospatial/__init__.py +80 -0
- truthound/validators/geospatial/base.py +97 -0
- truthound/validators/geospatial/boundary.py +238 -0
- truthound/validators/geospatial/coordinate.py +351 -0
- truthound/validators/geospatial/distance.py +399 -0
- truthound/validators/geospatial/polygon.py +665 -0
- truthound/validators/i18n/__init__.py +308 -0
- truthound/validators/i18n/bidi.py +571 -0
- truthound/validators/i18n/catalogs.py +570 -0
- truthound/validators/i18n/dialects.py +763 -0
- truthound/validators/i18n/extended_catalogs.py +549 -0
- truthound/validators/i18n/formatting.py +1434 -0
- truthound/validators/i18n/loader.py +1020 -0
- truthound/validators/i18n/messages.py +521 -0
- truthound/validators/i18n/plural.py +683 -0
- truthound/validators/i18n/protocols.py +855 -0
- truthound/validators/i18n/tms.py +1162 -0
- truthound/validators/localization/__init__.py +53 -0
- truthound/validators/localization/base.py +122 -0
- truthound/validators/localization/chinese.py +362 -0
- truthound/validators/localization/japanese.py +275 -0
- truthound/validators/localization/korean.py +524 -0
- truthound/validators/memory/__init__.py +94 -0
- truthound/validators/memory/approximate_knn.py +506 -0
- truthound/validators/memory/base.py +547 -0
- truthound/validators/memory/sgd_online.py +719 -0
- truthound/validators/memory/streaming_ecdf.py +753 -0
- truthound/validators/ml_feature/__init__.py +54 -0
- truthound/validators/ml_feature/base.py +249 -0
- truthound/validators/ml_feature/correlation.py +299 -0
- truthound/validators/ml_feature/leakage.py +344 -0
- truthound/validators/ml_feature/null_impact.py +270 -0
- truthound/validators/ml_feature/scale.py +264 -0
- truthound/validators/multi_column/__init__.py +89 -0
- truthound/validators/multi_column/arithmetic.py +284 -0
- truthound/validators/multi_column/base.py +231 -0
- truthound/validators/multi_column/comparison.py +273 -0
- truthound/validators/multi_column/consistency.py +312 -0
- truthound/validators/multi_column/statistical.py +299 -0
- truthound/validators/optimization/__init__.py +164 -0
- truthound/validators/optimization/aggregation.py +563 -0
- truthound/validators/optimization/covariance.py +556 -0
- truthound/validators/optimization/geo.py +626 -0
- truthound/validators/optimization/graph.py +587 -0
- truthound/validators/optimization/orchestrator.py +970 -0
- truthound/validators/optimization/profiling.py +1312 -0
- truthound/validators/privacy/__init__.py +223 -0
- truthound/validators/privacy/base.py +635 -0
- truthound/validators/privacy/ccpa.py +670 -0
- truthound/validators/privacy/gdpr.py +728 -0
- truthound/validators/privacy/global_patterns.py +604 -0
- truthound/validators/privacy/plugins.py +867 -0
- truthound/validators/profiling/__init__.py +52 -0
- truthound/validators/profiling/base.py +175 -0
- truthound/validators/profiling/cardinality.py +312 -0
- truthound/validators/profiling/entropy.py +391 -0
- truthound/validators/profiling/frequency.py +455 -0
- truthound/validators/pushdown_support.py +660 -0
- truthound/validators/query/__init__.py +91 -0
- truthound/validators/query/aggregate.py +346 -0
- truthound/validators/query/base.py +246 -0
- truthound/validators/query/column.py +249 -0
- truthound/validators/query/expression.py +274 -0
- truthound/validators/query/result.py +323 -0
- truthound/validators/query/row_count.py +264 -0
- truthound/validators/referential/__init__.py +80 -0
- truthound/validators/referential/base.py +395 -0
- truthound/validators/referential/cascade.py +391 -0
- truthound/validators/referential/circular.py +563 -0
- truthound/validators/referential/foreign_key.py +624 -0
- truthound/validators/referential/orphan.py +485 -0
- truthound/validators/registry.py +112 -0
- truthound/validators/schema/__init__.py +41 -0
- truthound/validators/schema/column_count.py +142 -0
- truthound/validators/schema/column_exists.py +80 -0
- truthound/validators/schema/column_order.py +82 -0
- truthound/validators/schema/column_pair.py +85 -0
- truthound/validators/schema/column_pair_set.py +195 -0
- truthound/validators/schema/column_type.py +94 -0
- truthound/validators/schema/multi_column.py +53 -0
- truthound/validators/schema/multi_column_aggregate.py +175 -0
- truthound/validators/schema/referential.py +274 -0
- truthound/validators/schema/table_schema.py +91 -0
- truthound/validators/schema_validator.py +219 -0
- truthound/validators/sdk/__init__.py +250 -0
- truthound/validators/sdk/builder.py +680 -0
- truthound/validators/sdk/decorators.py +474 -0
- truthound/validators/sdk/enterprise/__init__.py +211 -0
- truthound/validators/sdk/enterprise/docs.py +725 -0
- truthound/validators/sdk/enterprise/fuzzing.py +659 -0
- truthound/validators/sdk/enterprise/licensing.py +709 -0
- truthound/validators/sdk/enterprise/manager.py +543 -0
- truthound/validators/sdk/enterprise/resources.py +628 -0
- truthound/validators/sdk/enterprise/sandbox.py +766 -0
- truthound/validators/sdk/enterprise/signing.py +603 -0
- truthound/validators/sdk/enterprise/templates.py +865 -0
- truthound/validators/sdk/enterprise/versioning.py +659 -0
- truthound/validators/sdk/templates.py +757 -0
- truthound/validators/sdk/testing.py +807 -0
- truthound/validators/security/__init__.py +181 -0
- truthound/validators/security/redos/__init__.py +182 -0
- truthound/validators/security/redos/core.py +861 -0
- truthound/validators/security/redos/cpu_monitor.py +593 -0
- truthound/validators/security/redos/cve_database.py +791 -0
- truthound/validators/security/redos/ml/__init__.py +155 -0
- truthound/validators/security/redos/ml/base.py +785 -0
- truthound/validators/security/redos/ml/datasets.py +618 -0
- truthound/validators/security/redos/ml/features.py +359 -0
- truthound/validators/security/redos/ml/models.py +1000 -0
- truthound/validators/security/redos/ml/predictor.py +507 -0
- truthound/validators/security/redos/ml/storage.py +632 -0
- truthound/validators/security/redos/ml/training.py +571 -0
- truthound/validators/security/redos/ml_analyzer.py +937 -0
- truthound/validators/security/redos/optimizer.py +674 -0
- truthound/validators/security/redos/profiler.py +682 -0
- truthound/validators/security/redos/re2_engine.py +709 -0
- truthound/validators/security/redos.py +886 -0
- truthound/validators/security/sql_security.py +1247 -0
- truthound/validators/streaming/__init__.py +126 -0
- truthound/validators/streaming/base.py +292 -0
- truthound/validators/streaming/completeness.py +210 -0
- truthound/validators/streaming/mixin.py +575 -0
- truthound/validators/streaming/range.py +308 -0
- truthound/validators/streaming/sources.py +846 -0
- truthound/validators/string/__init__.py +57 -0
- truthound/validators/string/casing.py +158 -0
- truthound/validators/string/charset.py +96 -0
- truthound/validators/string/format.py +501 -0
- truthound/validators/string/json.py +77 -0
- truthound/validators/string/json_schema.py +184 -0
- truthound/validators/string/length.py +104 -0
- truthound/validators/string/like_pattern.py +237 -0
- truthound/validators/string/regex.py +202 -0
- truthound/validators/string/regex_extended.py +435 -0
- truthound/validators/table/__init__.py +88 -0
- truthound/validators/table/base.py +78 -0
- truthound/validators/table/column_count.py +198 -0
- truthound/validators/table/freshness.py +362 -0
- truthound/validators/table/row_count.py +251 -0
- truthound/validators/table/schema.py +333 -0
- truthound/validators/table/size.py +285 -0
- truthound/validators/timeout/__init__.py +102 -0
- truthound/validators/timeout/advanced/__init__.py +247 -0
- truthound/validators/timeout/advanced/circuit_breaker.py +675 -0
- truthound/validators/timeout/advanced/prediction.py +773 -0
- truthound/validators/timeout/advanced/priority.py +618 -0
- truthound/validators/timeout/advanced/redis_backend.py +770 -0
- truthound/validators/timeout/advanced/retry.py +721 -0
- truthound/validators/timeout/advanced/sampling.py +788 -0
- truthound/validators/timeout/advanced/sla.py +661 -0
- truthound/validators/timeout/advanced/telemetry.py +804 -0
- truthound/validators/timeout/cascade.py +477 -0
- truthound/validators/timeout/deadline.py +657 -0
- truthound/validators/timeout/degradation.py +525 -0
- truthound/validators/timeout/distributed.py +597 -0
- truthound/validators/timeseries/__init__.py +89 -0
- truthound/validators/timeseries/base.py +326 -0
- truthound/validators/timeseries/completeness.py +617 -0
- truthound/validators/timeseries/gap.py +485 -0
- truthound/validators/timeseries/monotonic.py +310 -0
- truthound/validators/timeseries/seasonality.py +422 -0
- truthound/validators/timeseries/trend.py +510 -0
- truthound/validators/uniqueness/__init__.py +59 -0
- truthound/validators/uniqueness/approximate.py +475 -0
- truthound/validators/uniqueness/distinct_values.py +253 -0
- truthound/validators/uniqueness/duplicate.py +118 -0
- truthound/validators/uniqueness/primary_key.py +140 -0
- truthound/validators/uniqueness/unique.py +191 -0
- truthound/validators/uniqueness/within_record.py +599 -0
- truthound/validators/utils.py +756 -0
- truthound-1.0.8.dist-info/METADATA +474 -0
- truthound-1.0.8.dist-info/RECORD +877 -0
- truthound-1.0.8.dist-info/WHEEL +4 -0
- truthound-1.0.8.dist-info/entry_points.txt +2 -0
- truthound-1.0.8.dist-info/licenses/LICENSE +190 -0
|
@@ -0,0 +1,1345 @@
|
|
|
1
|
+
"""Rule quality scoring with precision/recall estimation.
|
|
2
|
+
|
|
3
|
+
This module provides comprehensive quality metrics for generated rules:
|
|
4
|
+
- Precision and recall estimation
|
|
5
|
+
- F1 score calculation
|
|
6
|
+
- Confidence scoring
|
|
7
|
+
- Rule validation against sample data
|
|
8
|
+
- Quality trend analysis
|
|
9
|
+
|
|
10
|
+
Key features:
|
|
11
|
+
- Pluggable quality estimator architecture
|
|
12
|
+
- Statistical sampling for large datasets
|
|
13
|
+
- Historical quality tracking
|
|
14
|
+
- Feedback loop integration
|
|
15
|
+
|
|
16
|
+
Example:
|
|
17
|
+
from truthound.profiler.quality import (
|
|
18
|
+
RuleQualityScorer,
|
|
19
|
+
QualityMetrics,
|
|
20
|
+
estimate_quality,
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
# Score a rule
|
|
24
|
+
scorer = RuleQualityScorer()
|
|
25
|
+
metrics = scorer.score(rule, data)
|
|
26
|
+
|
|
27
|
+
print(f"Precision: {metrics.precision:.2%}")
|
|
28
|
+
print(f"Recall: {metrics.recall:.2%}")
|
|
29
|
+
print(f"F1 Score: {metrics.f1_score:.2%}")
|
|
30
|
+
"""
|
|
31
|
+
|
|
32
|
+
from __future__ import annotations
|
|
33
|
+
|
|
34
|
+
import hashlib
|
|
35
|
+
import json
|
|
36
|
+
import math
|
|
37
|
+
import random
|
|
38
|
+
import re
|
|
39
|
+
import threading
|
|
40
|
+
from abc import ABC, abstractmethod
|
|
41
|
+
from collections import defaultdict
|
|
42
|
+
from dataclasses import dataclass, field
|
|
43
|
+
from datetime import datetime, timedelta
|
|
44
|
+
from enum import Enum
|
|
45
|
+
from pathlib import Path
|
|
46
|
+
from typing import Any, Callable, Generic, Protocol, TypeVar
|
|
47
|
+
|
|
48
|
+
import polars as pl
|
|
49
|
+
|
|
50
|
+
from truthound.profiler.base import ColumnProfile, TableProfile, DataType
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
# =============================================================================
|
|
54
|
+
# Types and Enums
|
|
55
|
+
# =============================================================================
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
class QualityLevel(str, Enum):
    """Coarse quality band derived from a rule's F1 score.

    Members subclass ``str``; each value is the lowercase band name.
    """

    EXCELLENT = "excellent"        # F1 >= 0.95
    GOOD = "good"                  # 0.85 <= F1 < 0.95
    ACCEPTABLE = "acceptable"      # 0.70 <= F1 < 0.85
    POOR = "poor"                  # 0.50 <= F1 < 0.70
    UNACCEPTABLE = "unacceptable"  # F1 < 0.50

    @classmethod
    def from_f1(cls, f1_score: float) -> "QualityLevel":
        """Return the band whose lower bound ``f1_score`` meets.

        Args:
            f1_score: F1 score to classify.

        Returns:
            The highest band whose threshold is <= ``f1_score``;
            ``UNACCEPTABLE`` when no threshold is met.
        """
        # Ordered from strictest to loosest so the first hit wins.
        bands = (
            (0.95, cls.EXCELLENT),
            (0.85, cls.GOOD),
            (0.70, cls.ACCEPTABLE),
            (0.50, cls.POOR),
        )
        for floor, level in bands:
            if f1_score >= floor:
                return level
        return cls.UNACCEPTABLE
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
class RuleType(str, Enum):
    """Categories a validation rule can be tagged with.

    Members subclass ``str``; each value is the lowercase category name.
    """

    SCHEMA = "schema"
    FORMAT = "format"
    RANGE = "range"
    UNIQUENESS = "uniqueness"
    COMPLETENESS = "completeness"
    PATTERN = "pattern"
    CUSTOM = "custom"
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
# =============================================================================
|
|
95
|
+
# Quality Metrics
|
|
96
|
+
# =============================================================================
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
@dataclass(frozen=True)
class ConfusionMatrix:
    """Immutable 2x2 tally of rule outcomes against reference labels.

    Every derived ratio evaluates to ``0.0`` when its denominator is zero.
    """

    true_positives: int = 0
    true_negatives: int = 0
    false_positives: int = 0
    false_negatives: int = 0

    @staticmethod
    def _ratio(numerator: float, denominator: float) -> float:
        """Divide, yielding 0.0 instead of failing on a zero denominator."""
        return 0.0 if denominator == 0 else numerator / denominator

    @property
    def total(self) -> int:
        """Number of observations across all four cells."""
        positives = self.true_positives + self.true_negatives
        return positives + self.false_positives + self.false_negatives

    @property
    def accuracy(self) -> float:
        """Share of observations that were classified correctly."""
        correct = self.true_positives + self.true_negatives
        return self._ratio(correct, self.total)

    @property
    def precision(self) -> float:
        """Positive predictive value: TP / (TP + FP)."""
        flagged = self.true_positives + self.false_positives
        return self._ratio(self.true_positives, flagged)

    @property
    def recall(self) -> float:
        """Sensitivity / true-positive rate: TP / (TP + FN)."""
        actual_positive = self.true_positives + self.false_negatives
        return self._ratio(self.true_positives, actual_positive)

    @property
    def specificity(self) -> float:
        """True-negative rate: TN / (TN + FP)."""
        actual_negative = self.true_negatives + self.false_positives
        return self._ratio(self.true_negatives, actual_negative)

    @property
    def f1_score(self) -> float:
        """Harmonic mean of precision and recall."""
        precision, recall = self.precision, self.recall
        combined = precision + recall
        if combined == 0:
            return 0.0
        return 2 * (precision * recall) / combined

    @property
    def f_beta(self) -> Callable[[float], float]:
        """Return a function computing the F-beta score for a given beta.

        The property itself takes no argument; call the returned function,
        e.g. ``matrix.f_beta(2.0)``.
        """
        def score_for(beta: float) -> float:
            precision, recall = self.precision, self.recall
            if precision + recall == 0:
                return 0.0
            weight = beta ** 2
            return (1 + weight) * (precision * recall) / (weight * precision + recall)

        return score_for

    @property
    def mcc(self) -> float:
        """Matthews Correlation Coefficient of the four cells."""
        tp, tn = self.true_positives, self.true_negatives
        fp, fn = self.false_positives, self.false_negatives

        scale = math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
        return self._ratio(tp * tn - fp * fn, scale)

    def to_dict(self) -> dict[str, Any]:
        """Return the raw counts followed by every derived metric."""
        exported = (
            "true_positives",
            "true_negatives",
            "false_positives",
            "false_negatives",
            "accuracy",
            "precision",
            "recall",
            "specificity",
            "f1_score",
            "mcc",
        )
        return {key: getattr(self, key) for key in exported}
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
@dataclass
class QualityMetrics:
    """Bundle of quality figures computed for a single rule.

    Holds the headline ratios (precision, recall, F1, accuracy), secondary
    ratios, interval estimates, sample bookkeeping and evaluation metadata.
    """

    # Headline ratios
    precision: float = 0.0
    recall: float = 0.0
    f1_score: float = 0.0
    accuracy: float = 0.0

    # Secondary ratios
    specificity: float = 0.0
    mcc: float = 0.0  # Matthews Correlation Coefficient

    # Interval estimates as (lower, upper); the default z of 1.96 gives 95%.
    # NOTE: from_confusion_matrix() fills precision_ci and recall_ci only;
    # f1_ci keeps its default unless the caller sets it.
    precision_ci: tuple[float, float] = (0.0, 0.0)
    recall_ci: tuple[float, float] = (0.0, 0.0)
    f1_ci: tuple[float, float] = (0.0, 0.0)

    # Sample bookkeeping
    sample_size: int = 0
    population_size: int = 0

    # Overall assessment
    quality_level: QualityLevel = QualityLevel.UNACCEPTABLE
    confidence: float = 0.0  # Trust in the figures above, in [0, 1]

    # Source counts, when available
    confusion_matrix: ConfusionMatrix | None = None

    # Evaluation metadata
    evaluated_at: datetime = field(default_factory=datetime.now)
    evaluation_duration_ms: float = 0.0

    @classmethod
    def from_confusion_matrix(
        cls,
        matrix: ConfusionMatrix,
        sample_size: int = 0,
        population_size: int = 0,
    ) -> "QualityMetrics":
        """Build metrics from the counts in a confusion matrix.

        Args:
            matrix: Source counts.
            sample_size: Number of rows that were evaluated.
            population_size: Number of rows in the full dataset.

        Returns:
            Populated metrics. Intervals and ``confidence`` are only
            computed when ``sample_size`` is positive.
        """
        f1 = matrix.f1_score
        metrics = cls(
            precision=matrix.precision,
            recall=matrix.recall,
            f1_score=f1,
            accuracy=matrix.accuracy,
            specificity=matrix.specificity,
            mcc=matrix.mcc,
            sample_size=sample_size,
            population_size=population_size,
            quality_level=QualityLevel.from_f1(f1),
            confusion_matrix=matrix,
        )

        if not sample_size > 0:
            return metrics

        hits = matrix.true_positives
        metrics.precision_ci = cls._wilson_ci(hits, hits + matrix.false_positives)
        metrics.recall_ci = cls._wilson_ci(hits, hits + matrix.false_negatives)

        # Confidence is the sampled fraction of the population, capped at 1.
        metrics.confidence = min(1.0, sample_size / max(population_size, 1))
        return metrics

    @staticmethod
    def _wilson_ci(successes: int, trials: int, z: float = 1.96) -> tuple[float, float]:
        """Wilson score interval for a binomial proportion.

        Args:
            successes: Number of successful trials.
            trials: Total number of trials.
            z: Normal quantile for the desired coverage.

        Returns:
            ``(lower, upper)`` clamped to [0, 1]; ``(0.0, 0.0)`` when there
            are no trials.
        """
        if trials == 0:
            return (0.0, 0.0)

        p_hat = successes / trials
        z_sq = z * z
        scale = 1 + z_sq / trials
        midpoint = p_hat + z_sq / (2 * trials)
        spread = z * math.sqrt((p_hat * (1 - p_hat) + z_sq / (4 * trials)) / trials)

        return (
            max(0.0, (midpoint - spread) / scale),
            min(1.0, (midpoint + spread) / scale),
        )

    def to_dict(self) -> dict[str, Any]:
        """Return the metrics as a plain dictionary.

        ``quality_level`` is exported as its string value and
        ``evaluated_at`` as an ISO-8601 string. The ``confusion_matrix`` key
        is present only when a matrix is attached.
        """
        payload: dict[str, Any] = {}
        payload["precision"] = self.precision
        payload["recall"] = self.recall
        payload["f1_score"] = self.f1_score
        payload["accuracy"] = self.accuracy
        payload["specificity"] = self.specificity
        payload["mcc"] = self.mcc
        payload["precision_ci"] = self.precision_ci
        payload["recall_ci"] = self.recall_ci
        payload["f1_ci"] = self.f1_ci
        payload["sample_size"] = self.sample_size
        payload["population_size"] = self.population_size
        payload["quality_level"] = self.quality_level.value
        payload["confidence"] = self.confidence
        payload["evaluated_at"] = self.evaluated_at.isoformat()
        payload["evaluation_duration_ms"] = self.evaluation_duration_ms

        matrix = self.confusion_matrix
        if matrix:
            payload["confusion_matrix"] = matrix.to_dict()

        return payload
|
|
311
|
+
|
|
312
|
+
|
|
313
|
+
# =============================================================================
|
|
314
|
+
# Rule Protocol
|
|
315
|
+
# =============================================================================
|
|
316
|
+
|
|
317
|
+
|
|
318
|
+
class RuleProtocol(Protocol):
    """Structural interface that quality estimators expect of a rule.

    Any object exposing these attributes and methods satisfies the protocol.
    """

    name: str
    rule_type: RuleType
    column: str | None

    def validate(self, value: Any) -> bool:
        """Report whether one scalar value satisfies the rule."""
        ...

    def validate_column(self, df: pl.DataFrame, column: str) -> pl.Series:
        """Check a column of ``df``, returning a boolean series."""
        ...
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
@dataclass
class ValidationRule:
    """Simple declarative validation rule.

    A value is checked against the first applicable criterion, in this order:

    1. ``None`` is accepted only when ``nullable`` is true.
    2. ``validate_fn``, when set, decides alone.
    3. ``pattern``, when set, decides alone.
    4. Otherwise ``min_value``, ``max_value`` and ``allowed_values`` must all
       hold (each one is skipped when it is ``None``).
    """

    name: str
    rule_type: RuleType
    column: str | None = None
    pattern: str | None = None
    min_value: float | None = None
    max_value: float | None = None
    allowed_values: set[Any] | None = None
    nullable: bool = True
    validate_fn: Callable[[Any], bool] | None = None

    def validate(self, value: Any) -> bool:
        """Validate a single value.

        Args:
            value: Value to check.

        Returns:
            True when the value satisfies the rule.
        """
        if value is None:
            return self.nullable

        if self.validate_fn:
            return self.validate_fn(value)

        if self.pattern:
            # Only strings can match; re.match anchors at the start.
            if not isinstance(value, str):
                return False
            return bool(re.match(self.pattern, value))

        if self.min_value is not None and value < self.min_value:
            return False

        if self.max_value is not None and value > self.max_value:
            return False

        if self.allowed_values is not None and value not in self.allowed_values:
            return False

        return True

    def validate_column(self, df: pl.DataFrame, column: str) -> pl.Series:
        """Validate a column, returning a boolean series.

        Applies the same precedence as :meth:`validate`. Null entries are
        valid exactly when ``nullable`` is true; non-null entries are valid
        exactly when they pass the rule's criterion.

        Args:
            df: Frame holding the column.
            column: Name of the column to check.

        Returns:
            Boolean series with one entry per row, True meaning valid.
        """
        col = df.get_column(column)
        is_null = col.is_null()
        non_null = ~is_null

        # `passes` answers "is this a non-null value that meets the
        # criterion?". Null handling is applied once at the end, so it can
        # no longer mask or override the criterion.
        if self.validate_fn:
            passes = pl.Series(
                col.name,
                [
                    item is not None and bool(self.validate_fn(item))
                    for item in col.to_list()
                ],
                dtype=pl.Boolean,
            )
        elif self.pattern:
            # NOTE(review): str.contains is an unanchored search in polars'
            # regex engine, whereas validate() uses re.match (anchored at
            # the start). Kept as-is; confirm which one is intended.
            passes = non_null & col.cast(pl.Utf8).str.contains(self.pattern)
        else:
            passes = non_null
            if self.min_value is not None:
                passes = passes & (col >= self.min_value)
            if self.max_value is not None:
                passes = passes & (col <= self.max_value)
            if self.allowed_values is not None:
                passes = passes & col.is_in(list(self.allowed_values))

        # Comparisons against null rows may yield null; treat as "not passed".
        passes = passes.fill_null(False)

        if self.nullable:
            return is_null | passes
        return passes
|
|
399
|
+
|
|
400
|
+
|
|
401
|
+
# =============================================================================
|
|
402
|
+
# Quality Estimator Protocol
|
|
403
|
+
# =============================================================================
|
|
404
|
+
|
|
405
|
+
|
|
406
|
+
class QualityEstimator(ABC):
    """Common interface for strategies that score a rule's quality.

    Concrete subclasses choose how the score is obtained (sampling,
    heuristics, etc.) and identify themselves through ``name``.
    """

    name: str = "base"

    @abstractmethod
    def estimate(
        self,
        rule: RuleProtocol,
        data: pl.DataFrame,
        ground_truth: pl.Series | None = None,
    ) -> QualityMetrics:
        """Produce quality metrics for ``rule`` evaluated on ``data``.

        Args:
            rule: Rule under evaluation.
            data: Frame the rule is evaluated against.
            ground_truth: Reference labels, when available.

        Returns:
            Quality metrics for the rule.
        """
|
|
433
|
+
|
|
434
|
+
|
|
435
|
+
class SamplingQualityEstimator(QualityEstimator):
    """Estimates quality using statistical sampling.

    Evaluates the rule on a random subset of rows (or on every row when the
    data is no larger than ``sample_size``) and derives precision and recall
    with confidence intervals from the resulting confusion matrix.
    """

    name = "sampling"

    def __init__(
        self,
        sample_size: int = 1000,
        confidence_level: float = 0.95,
        random_seed: int | None = None,
    ):
        """Configure the estimator.

        Args:
            sample_size: Maximum number of rows to evaluate.
            confidence_level: Stored on the instance; not consulted by the
                code in this class.
            random_seed: Seed for reproducible sampling. ``None`` draws from
                the module-level ``random`` generator.
        """
        self.sample_size = sample_size
        self.confidence_level = confidence_level
        self.random_seed = random_seed

    def estimate(
        self,
        rule: RuleProtocol,
        data: pl.DataFrame,
        ground_truth: pl.Series | None = None,
    ) -> QualityMetrics:
        """Estimate quality via sampling.

        Args:
            rule: Rule to evaluate.
            data: Data to evaluate against.
            ground_truth: Optional labels, one per row of ``data``.

        Returns:
            Quality metrics; default (all-zero) metrics when the rule has no
            column or the column is absent from ``data``.

        Raises:
            ValueError: If ``ground_truth`` does not have one entry per row
                of ``data``.
        """
        start_time = datetime.now()

        column = rule.column
        if column is None or column not in data.columns:
            return QualityMetrics()

        population_size = len(data)
        if ground_truth is not None and len(ground_truth) != population_size:
            # Labels are matched to rows by position, so lengths must agree.
            raise ValueError(
                f"ground_truth has {len(ground_truth)} entries but data has "
                f"{population_size} rows"
            )

        # Sample data if needed. `indices` stays None when every row is used.
        indices: list[int] | None = None
        if population_size > self.sample_size:
            # A private generator gives the same draw as random.seed(seed)
            # followed by random.sample, without reseeding the global RNG.
            rng = random if self.random_seed is None else random.Random(self.random_seed)
            indices = rng.sample(range(population_size), self.sample_size)
            sample = data[indices]
            sample_size = self.sample_size
        else:
            sample = data
            sample_size = population_size

        # Validate sample
        predictions = rule.validate_column(sample, column)

        if ground_truth is not None:
            # Pick the labels of exactly the rows that were sampled.
            gt_sample = ground_truth if indices is None else ground_truth[indices]
            matrix = self._calculate_confusion_matrix(predictions, gt_sample)
        else:
            # Without ground truth, estimate based on data patterns
            matrix = self._estimate_confusion_matrix(predictions, sample, column)

        duration_ms = (datetime.now() - start_time).total_seconds() * 1000

        metrics = QualityMetrics.from_confusion_matrix(
            matrix,
            sample_size=sample_size,
            population_size=population_size,
        )
        metrics.evaluation_duration_ms = duration_ms

        return metrics

    def _calculate_confusion_matrix(
        self,
        predictions: pl.Series,
        ground_truth: pl.Series,
    ) -> ConfusionMatrix:
        """Calculate confusion matrix from predictions and ground truth.

        Args:
            predictions: Rule verdict per row.
            ground_truth: Reference label per row, same length and order.

        Returns:
            Counts of the four prediction/label combinations.
        """
        pred_array = predictions.to_numpy()
        truth_array = ground_truth.to_numpy()

        # Element-wise comparison against True/False is intentional: these
        # are arrays, so `is True` or plain truthiness would not apply per
        # element.
        pred_true, pred_false = pred_array == True, pred_array == False  # noqa: E712
        truth_true, truth_false = truth_array == True, truth_array == False  # noqa: E712

        return ConfusionMatrix(
            true_positives=int((pred_true & truth_true).sum()),
            true_negatives=int((pred_false & truth_false).sum()),
            false_positives=int((pred_true & truth_false).sum()),
            false_negatives=int((pred_false & truth_true).sum()),
        )

    def _estimate_confusion_matrix(
        self,
        predictions: pl.Series,
        data: pl.DataFrame,
        column: str,
    ) -> ConfusionMatrix:
        """Estimate confusion matrix without ground truth.

        Splits the rule's own verdicts with fixed assumed error rates: most
        "valid" verdicts are counted as true positives and most "invalid"
        verdicts as true negatives. This is a simplification; actual FP/FN
        rates depend on the rule.

        Args:
            predictions: Rule verdict per row.
            data: Unused; kept for interface compatibility.
            column: Unused; kept for interface compatibility.

        Returns:
            Estimated counts whose cells sum to ``len(predictions)``.
        """
        valid_count = int(predictions.sum() or 0)
        invalid_count = len(predictions) - valid_count

        estimated_fp_rate = 0.02  # Conservative estimate
        estimated_fn_rate = 0.05  # Conservative estimate

        # Truncate only the minority cell and give the remainder to the
        # majority cell, so no observation is lost to rounding.
        fp = int(valid_count * estimated_fp_rate)
        tp = valid_count - fp
        fn = int(invalid_count * estimated_fn_rate)
        tn = invalid_count - fn

        return ConfusionMatrix(
            true_positives=tp,
            true_negatives=tn,
            false_positives=fp,
            false_negatives=fn,
        )
|
|
559
|
+
|
|
560
|
+
|
|
561
|
+
class HeuristicQualityEstimator(QualityEstimator):
    """Estimates quality using heuristics and data patterns.

    Useful when ground truth is not available and sampling
    is not practical. Precision/recall are derived from simple ratios
    (share of valid rows, nulls, distinct values) with fixed constants per
    rule type; they are rough estimates, not measurements.
    """

    name = "heuristic"

    def __init__(self, strictness: float = 0.5):
        """Store the strictness setting.

        Args:
            strictness: 0.0 = loose, 1.0 = strict. The value is stored but
                not read by any method of this class.
        """
        self.strictness = strictness  # 0.0 = loose, 1.0 = strict

    @staticmethod
    def _f1(precision: float, recall: float) -> float:
        """Return the harmonic mean of precision and recall (0.0 if both are 0)."""
        denominator = precision + recall
        return 2 * precision * recall / denominator if denominator > 0 else 0.0

    def estimate(
        self,
        rule: RuleProtocol,
        data: pl.DataFrame,
        ground_truth: pl.Series | None = None,
    ) -> QualityMetrics:
        """Estimate quality using heuristics.

        Args:
            rule: Rule to evaluate; its ``column`` and ``rule_type`` are read
                and ``validate_column`` is called on it.
            data: Frame to evaluate against.
            ground_truth: Accepted for interface compatibility; not used.

        Returns:
            Estimated metrics. Default ``QualityMetrics()`` is returned when
            the rule has no column, the column is missing, or there are no
            rows to evaluate.
        """
        start_time = datetime.now()

        column = rule.column
        if column is None or column not in data.columns:
            return QualityMetrics()

        col = data.get_column(column)
        # Every ratio below divides by the row count; an empty frame used to
        # raise ZeroDivisionError, so treat it like the "nothing to score" case.
        if len(col) == 0:
            return QualityMetrics()

        predictions = rule.validate_column(data, column)
        if len(predictions) == 0:
            return QualityMetrics()

        # Calculate base metrics
        valid_ratio = predictions.sum() / len(predictions)
        null_ratio = col.null_count() / len(col)
        unique_ratio = col.n_unique() / len(col)

        # Heuristic quality estimation based on rule type
        if rule.rule_type == RuleType.PATTERN:
            metrics = self._estimate_pattern_quality(
                valid_ratio, null_ratio, unique_ratio
            )
        elif rule.rule_type == RuleType.RANGE:
            metrics = self._estimate_range_quality(valid_ratio, null_ratio, col)
        elif rule.rule_type == RuleType.UNIQUENESS:
            metrics = self._estimate_uniqueness_quality(valid_ratio, unique_ratio)
        else:
            metrics = self._estimate_general_quality(valid_ratio, null_ratio)

        duration_ms = (datetime.now() - start_time).total_seconds() * 1000
        metrics.sample_size = len(data)
        metrics.population_size = len(data)
        metrics.evaluation_duration_ms = duration_ms
        metrics.quality_level = QualityLevel.from_f1(metrics.f1_score)

        return metrics

    def _estimate_pattern_quality(
        self,
        valid_ratio: float,
        null_ratio: float,
        unique_ratio: float,
    ) -> QualityMetrics:
        """Estimate quality for pattern rules.

        Only ``valid_ratio`` drives the result; ``null_ratio`` and
        ``unique_ratio`` are accepted but not used.
        """
        if valid_ratio > 0.95:
            # Very high match - might be too loose, so precision is
            # penalised the closer the match ratio gets to 100%.
            precision = 0.85 - (valid_ratio - 0.95) * 2
            recall = 0.95
        elif valid_ratio > 0.80:
            # Good match ratio
            precision = 0.90
            recall = valid_ratio
        else:
            # Low match - might be too strict or wrong pattern
            precision = 0.95
            recall = valid_ratio

        return QualityMetrics(
            precision=precision,
            recall=recall,
            f1_score=self._f1(precision, recall),
            confidence=0.7,  # Heuristic confidence
        )

    def _estimate_range_quality(
        self,
        valid_ratio: float,
        null_ratio: float,
        col: pl.Series,
    ) -> QualityMetrics:
        """Estimate quality for range rules.

        Precision is chosen from the coefficient of variation (std / mean)
        of the non-null values when the column has one of the listed numeric
        dtypes; otherwise a fixed fallback is used. ``null_ratio`` is not used.
        """
        fallback_precision = 0.90
        precision = fallback_precision
        try:
            if col.dtype in [pl.Float32, pl.Float64, pl.Int32, pl.Int64]:
                non_null = col.drop_nulls()
                if len(non_null) > 0:
                    std = non_null.std()
                    mean = non_null.mean()
                    # If std is large relative to mean, more uncertainty
                    cv = abs(std / mean) if mean != 0 else 0
                    precision = 0.95 if cv < 0.5 else 0.85
        except Exception:
            # Deliberate best-effort: any failure while computing the
            # statistics (e.g. a statistic that is not a number) falls back
            # to the default precision instead of aborting the estimate.
            precision = fallback_precision

        recall = valid_ratio

        return QualityMetrics(
            precision=precision,
            recall=recall,
            f1_score=self._f1(precision, recall),
            confidence=0.8,
        )

    def _estimate_uniqueness_quality(
        self,
        valid_ratio: float,
        unique_ratio: float,
    ) -> QualityMetrics:
        """Estimate quality for uniqueness rules.

        The result depends only on ``unique_ratio``; ``valid_ratio`` is
        accepted but not used.
        """
        if unique_ratio > 0.99:
            precision = 0.98
            recall = 0.95
        elif unique_ratio > 0.95:
            precision = 0.90
            recall = 0.90
        else:
            precision = 0.80
            recall = unique_ratio

        return QualityMetrics(
            precision=precision,
            recall=recall,
            f1_score=self._f1(precision, recall),
            confidence=0.85,
        )

    def _estimate_general_quality(
        self,
        valid_ratio: float,
        null_ratio: float,
    ) -> QualityMetrics:
        """Estimate quality for rule types without a dedicated heuristic."""
        # Default estimation: fixed precision, recall discounted by nulls.
        precision = 0.90
        recall = valid_ratio * (1 - null_ratio)

        return QualityMetrics(
            precision=precision,
            recall=recall,
            f1_score=self._f1(precision, recall),
            confidence=0.6,
        )
|
|
733
|
+
|
|
734
|
+
|
|
735
|
+
class CrossValidationEstimator(QualityEstimator):
    """Estimates quality using cross-validation.

    Splits the rows into shuffled folds, applies the rule to each fold and
    uses the spread of the per-fold valid ratios as a consistency signal.
    """

    name = "cross_validation"

    def __init__(
        self,
        n_folds: int = 5,
        random_seed: int | None = None,
    ):
        """Store fold configuration.

        Args:
            n_folds: Number of folds to split the data into.
            random_seed: Seed for the shuffle; ``None`` gives a
                non-reproducible shuffle.
        """
        self.n_folds = n_folds
        self.random_seed = random_seed

    def estimate(
        self,
        rule: RuleProtocol,
        data: pl.DataFrame,
        ground_truth: pl.Series | None = None,
    ) -> QualityMetrics:
        """Estimate quality via cross-validation.

        Args:
            rule: Rule to evaluate; its ``column`` is read and
                ``validate_column`` is called once per fold.
            data: Frame to evaluate against.
            ground_truth: Accepted for interface compatibility; not used.

        Returns:
            Estimated metrics. Default ``QualityMetrics()`` is returned when
            the rule has no column, the column is missing, or ``data`` has
            no rows.

        Raises:
            ValueError: If ``n_folds`` is smaller than 1.
        """
        start_time = datetime.now()

        column = rule.column
        if column is None or column not in data.columns:
            return QualityMetrics()

        if self.n_folds < 1:
            raise ValueError(f"n_folds must be >= 1, got {self.n_folds}")

        n = len(data)
        if n == 0:
            return QualityMetrics()

        # Never create more folds than rows: with fold_size == 0 the leading
        # folds would be empty and their valid ratio would divide by zero.
        n_folds = min(self.n_folds, n)
        fold_size = n // n_folds

        # Use a private generator so that seeding does not reset the
        # process-wide ``random`` state for unrelated code. For a given seed
        # this produces the same permutation as seeding the module generator.
        rng = random.Random(self.random_seed)
        indices = list(range(n))
        rng.shuffle(indices)

        # Evaluate on each fold; the last fold absorbs the remainder rows.
        fold_metrics: list[float] = []
        for i in range(n_folds):
            start_idx = i * fold_size
            end_idx = start_idx + fold_size if i < n_folds - 1 else n
            fold_data = data[indices[start_idx:end_idx]]

            predictions = rule.validate_column(fold_data, column)
            fold_metrics.append(predictions.sum() / len(predictions))

        # Calculate consistency across folds (population standard deviation)
        mean_valid = sum(fold_metrics) / len(fold_metrics)
        variance = sum((x - mean_valid) ** 2 for x in fold_metrics) / len(fold_metrics)
        std_valid = variance ** 0.5

        # Low variance = high consistency = likely high precision
        consistency = 1.0 - min(1.0, std_valid * 5)

        precision = 0.85 + consistency * 0.10
        recall = mean_valid
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

        duration_ms = (datetime.now() - start_time).total_seconds() * 1000

        return QualityMetrics(
            precision=precision,
            recall=recall,
            f1_score=f1,
            confidence=consistency,
            sample_size=n,
            population_size=n,
            quality_level=QualityLevel.from_f1(f1),
            evaluation_duration_ms=duration_ms,
        )
|
|
809
|
+
|
|
810
|
+
|
|
811
|
+
# =============================================================================
|
|
812
|
+
# Quality Estimator Registry
|
|
813
|
+
# =============================================================================
|
|
814
|
+
|
|
815
|
+
|
|
816
|
+
class QualityEstimatorRegistry:
    """Name-to-class lookup table used to build quality estimators."""

    def __init__(self) -> None:
        # Maps the public estimator name to the class that implements it.
        self._estimators: dict[str, type[QualityEstimator]] = {}

    def register(
        self,
        name: str,
        estimator_class: type[QualityEstimator],
    ) -> None:
        """Associate ``name`` with ``estimator_class``.

        Registering an existing name replaces the previous class.
        """
        self._estimators[name] = estimator_class

    def create(self, name: str, **kwargs: Any) -> QualityEstimator:
        """Instantiate the estimator registered under ``name``.

        Args:
            name: Registered estimator name.
            **kwargs: Passed unchanged to the estimator's constructor.

        Raises:
            KeyError: If no estimator is registered under ``name``.
        """
        if name in self._estimators:
            estimator_class = self._estimators[name]
            return estimator_class(**kwargs)

        available = list(self._estimators.keys())
        raise KeyError(
            f"Unknown estimator: {name}. "
            f"Available: {available}"
        )

    def list_estimators(self) -> list[str]:
        """Return the registered names in registration order."""
        return list(self._estimators)
|
|
842
|
+
|
|
843
|
+
|
|
844
|
+
# Global registry
|
|
845
|
+
quality_estimator_registry = QualityEstimatorRegistry()
# Built-in estimators, registered in this order under their public names.
for _estimator_name, _estimator_cls in (
    ("sampling", SamplingQualityEstimator),
    ("heuristic", HeuristicQualityEstimator),
    ("cross_validation", CrossValidationEstimator),
):
    quality_estimator_registry.register(_estimator_name, _estimator_cls)
# Do not leave the loop temporaries behind as module attributes.
del _estimator_name, _estimator_cls
|
|
849
|
+
|
|
850
|
+
|
|
851
|
+
# =============================================================================
|
|
852
|
+
# Rule Quality Scorer
|
|
853
|
+
# =============================================================================
|
|
854
|
+
|
|
855
|
+
|
|
856
|
+
@dataclass
class ScoringConfig:
    """Settings bundle for quality scoring."""

    # Name of the estimator to use.
    estimator: str = "sampling"
    # Extra options for the estimator; a fresh dict per instance.
    estimator_options: dict[str, Any] = field(default_factory=dict)
    # Minimum sample size.
    min_sample_size: int = 100
    # Minimum confidence.
    min_confidence: float = 0.5
    # Whether results should be cached.
    cache_results: bool = True
    # Cache time-to-live, in seconds.
    cache_ttl_seconds: int = 3600
|
|
866
|
+
|
|
867
|
+
|
|
868
|
+
@dataclass
class RuleQualityScore:
    """Scoring outcome for one rule: metrics plus a usage recommendation."""

    rule_name: str
    rule_type: RuleType
    column: str | None
    metrics: QualityMetrics
    recommendation: str
    should_use: bool
    # Names of alternatives; empty unless supplied by the caller.
    alternatives: list[str] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Return a plain-dict view of the score.

        ``rule_type`` is represented by its ``value`` and ``metrics`` by the
        result of its own ``to_dict()``.
        """
        return dict(
            rule_name=self.rule_name,
            rule_type=self.rule_type.value,
            column=self.column,
            metrics=self.metrics.to_dict(),
            recommendation=self.recommendation,
            should_use=self.should_use,
            alternatives=self.alternatives,
        )
|
|
891
|
+
|
|
892
|
+
|
|
893
|
+
class RuleQualityScorer:
    """Main interface for scoring rule quality.

    Evaluates rules against data and provides quality recommendations.
    Scores are cached in memory per scorer instance.

    NOTE: the cache fingerprint of the data covers only the row count and
    the column names, so two frames with the same shape and columns share a
    cache entry. Call ``clear_cache()`` or pass ``use_cache=False`` when the
    contents of the data change.

    Example:
        scorer = RuleQualityScorer()

        score = scorer.score(rule, data)
        print(f"Should use: {score.should_use}")
        print(f"Recommendation: {score.recommendation}")
    """

    def __init__(
        self,
        estimator: str | QualityEstimator = "sampling",
        estimator_options: dict[str, Any] | None = None,
        min_confidence: float = 0.5,
        quality_threshold: float = 0.70,
    ):
        """Initialize scorer.

        Args:
            estimator: Estimator name or instance
            estimator_options: Options for estimator construction (only used
                when ``estimator`` is a name)
            min_confidence: Minimum confidence for recommendations
            quality_threshold: Minimum F1 score for rule acceptance
        """
        self.min_confidence = min_confidence
        self.quality_threshold = quality_threshold
        self._cache: dict[str, RuleQualityScore] = {}
        self._lock = threading.Lock()

        if isinstance(estimator, QualityEstimator):
            self._estimator = estimator
        else:
            options = estimator_options or {}
            self._estimator = quality_estimator_registry.create(estimator, **options)

    def score(
        self,
        rule: RuleProtocol | ValidationRule,
        data: pl.DataFrame,
        ground_truth: pl.Series | None = None,
        use_cache: bool = True,
    ) -> RuleQualityScore:
        """Score a rule's quality.

        Args:
            rule: Rule to score
            data: Data to evaluate against
            ground_truth: Optional ground truth labels
            use_cache: Whether to read from and write to the score cache

        Returns:
            Complete quality score
        """
        # The key distinguishes calls with and without ground truth, so an
        # estimate made without labels is never returned for a labelled call.
        cache_key = self._make_cache_key(rule, data, ground_truth)
        if use_cache:
            with self._lock:
                if cache_key in self._cache:
                    return self._cache[cache_key]

        # Estimate metrics (outside the lock: this is the expensive part)
        metrics = self._estimator.estimate(rule, data, ground_truth)

        # Generate recommendation
        recommendation, should_use = self._generate_recommendation(metrics, rule)

        score = RuleQualityScore(
            rule_name=rule.name,
            rule_type=rule.rule_type,
            column=rule.column,
            metrics=metrics,
            recommendation=recommendation,
            should_use=should_use,
        )

        if use_cache:
            with self._lock:
                self._cache[cache_key] = score

        return score

    def score_all(
        self,
        rules: list[RuleProtocol | ValidationRule],
        data: pl.DataFrame,
        ground_truth: pl.Series | None = None,
    ) -> list[RuleQualityScore]:
        """Score multiple rules.

        Args:
            rules: Rules to score
            data: Data to evaluate against
            ground_truth: Optional ground truth labels

        Returns:
            List of quality scores, in the order of ``rules``
        """
        return [self.score(rule, data, ground_truth) for rule in rules]

    def compare(
        self,
        rules: list[RuleProtocol | ValidationRule],
        data: pl.DataFrame,
        ground_truth: pl.Series | None = None,
    ) -> list[RuleQualityScore]:
        """Compare multiple rules and rank by quality.

        Args:
            rules: Rules to compare
            data: Data to evaluate against
            ground_truth: Optional ground truth labels, forwarded to scoring

        Returns:
            Scores sorted by F1 score (best first)
        """
        scores = self.score_all(rules, data, ground_truth)
        return sorted(scores, key=lambda s: s.metrics.f1_score, reverse=True)

    def _generate_recommendation(
        self,
        metrics: QualityMetrics,
        rule: RuleProtocol | ValidationRule,
    ) -> tuple[str, bool]:
        """Generate recommendation based on metrics.

        Args:
            metrics: Estimated metrics (F1, precision, recall, confidence)
            rule: Rule being scored (not used by the current logic)

        Returns:
            ``(message, should_use)``.
        """
        f1 = metrics.f1_score
        precision = metrics.precision
        recall = metrics.recall
        confidence = metrics.confidence

        # Low confidence overrides everything else: the numbers are not
        # trustworthy enough to recommend the rule.
        if confidence < self.min_confidence:
            return (
                f"Low confidence ({confidence:.0%}). "
                "Consider collecting more data or using ground truth validation.",
                False,
            )

        # Check quality threshold
        if f1 >= self.quality_threshold:
            if f1 >= 0.95:
                return f"Excellent rule quality (F1={f1:.2%}). Safe to use.", True
            if f1 >= 0.85:
                return f"Good rule quality (F1={f1:.2%}). Recommended for use.", True
            return f"Acceptable quality (F1={f1:.2%}). Monitor for issues.", True

        # Below threshold - provide specific advice on the weaker metric
        if precision < recall:
            return (
                f"Low precision ({precision:.0%}). Rule may be too permissive. "
                "Consider stricter constraints.",
                False,
            )
        if recall < precision:
            return (
                f"Low recall ({recall:.0%}). Rule may be too strict. "
                "Consider relaxing constraints or checking for edge cases.",
                False,
            )
        return (
            f"Poor overall quality (F1={f1:.2%}). "
            "Consider redesigning the rule or checking data quality.",
            False,
        )

    def _make_cache_key(
        self,
        rule: RuleProtocol | ValidationRule,
        data: pl.DataFrame,
        ground_truth: pl.Series | None = None,
    ) -> str:
        """Create cache key for rule + data (+ ground truth) combination.

        The data part is a short hash of the row count and column names only;
        it does not depend on the cell values. When ground truth is supplied
        a suffix with its length is appended so labelled and unlabelled
        scores are cached separately; without ground truth the key is the
        same as before this suffix existed.
        """
        rule_str = f"{rule.name}:{rule.rule_type}:{rule.column}"
        data_hash = hashlib.sha256(
            f"{len(data)}:{data.columns}".encode()
        ).hexdigest()[:16]
        key = f"{rule_str}:{data_hash}"
        if ground_truth is not None:
            key = f"{key}:gt={len(ground_truth)}"
        return key

    def clear_cache(self) -> None:
        """Clear the score cache."""
        with self._lock:
            self._cache.clear()
|
|
1079
|
+
|
|
1080
|
+
|
|
1081
|
+
# =============================================================================
|
|
1082
|
+
# Quality Trend Analyzer
|
|
1083
|
+
# =============================================================================
|
|
1084
|
+
|
|
1085
|
+
|
|
1086
|
+
@dataclass
class QualityTrendPoint:
    """One recorded quality measurement in a rule's trend history."""

    # When the measurement was taken.
    timestamp: datetime
    # Metrics captured at that moment.
    metrics: QualityMetrics
    # Size of the data that was evaluated.
    data_size: int
    # Optional free-form annotation.
    notes: str = ""
|
|
1094
|
+
|
|
1095
|
+
|
|
1096
|
+
class QualityTrendAnalyzer:
    """Analyzes quality trends over time.

    Tracks how rule quality changes as data evolves. At most the last 100
    measurements are kept per rule. When a storage path is given, the full
    history is written to it as JSON after every ``record`` call and read
    back on construction.

    Example:
        analyzer = QualityTrendAnalyzer()

        # Record quality over time
        analyzer.record(rule_name, metrics1, datetime.now())
        analyzer.record(rule_name, metrics2, datetime.now())

        # Analyze trend
        trend = analyzer.analyze_trend(rule_name)
        print(f"Quality is {trend['trend']}")
    """

    def __init__(self, storage_path: str | Path | None = None):
        """Create the analyzer and load any existing history.

        Args:
            storage_path: Optional JSON file used to persist the history.
        """
        self.storage_path = Path(storage_path) if storage_path else None
        self._trends: dict[str, list[QualityTrendPoint]] = defaultdict(list)
        self._lock = threading.Lock()

        if self.storage_path and self.storage_path.exists():
            self._load()

    def record(
        self,
        rule_name: str,
        metrics: QualityMetrics,
        timestamp: datetime | None = None,
        data_size: int = 0,
        notes: str = "",
    ) -> None:
        """Record a quality measurement.

        Args:
            rule_name: Name of the rule
            metrics: Quality metrics
            timestamp: When measured (defaults to now)
            data_size: Size of data evaluated
            notes: Optional notes
        """
        point = QualityTrendPoint(
            timestamp=timestamp or datetime.now(),
            metrics=metrics,
            data_size=data_size,
            notes=notes,
        )

        with self._lock:
            history = self._trends[rule_name]
            history.append(point)
            # Keep last 100 points per rule
            if len(history) > 100:
                self._trends[rule_name] = history[-100:]

        # Persist after releasing the lock: _save() acquires it itself and
        # the lock is not re-entrant.
        if self.storage_path:
            self._save()

    def analyze_trend(
        self,
        rule_name: str,
        window_days: int = 30,
    ) -> dict[str, Any]:
        """Analyze quality trend for a rule.

        The F1 scores inside the window are split into an older and a newer
        half; the difference of the two means decides the direction
        (more than +/-0.05 counts as improving / degrading).

        Args:
            rule_name: Name of the rule
            window_days: Days to analyze, counted back from now

        Returns:
            Trend analysis results. ``{"error": ...}`` when nothing was
            recorded for the rule, and a reduced dict with
            ``"trend": "insufficient_data"`` when fewer than two points fall
            inside the window.
        """
        with self._lock:
            # Snapshot so later record() calls cannot change what we analyze.
            points = list(self._trends.get(rule_name, []))

        if not points:
            return {"error": "No data available"}

        # Filter to window
        cutoff = datetime.now() - timedelta(days=window_days)
        recent = [p for p in points if p.timestamp > cutoff]

        if len(recent) < 2:
            return {
                "current": points[-1].metrics.to_dict(),
                "trend": "insufficient_data",
            }

        # Calculate trend: mean of the newer half minus mean of the older half.
        f1_values = [p.metrics.f1_score for p in recent]
        midpoint = len(f1_values) // 2  # >= 1 because len(recent) >= 2
        older, newer = f1_values[:midpoint], f1_values[midpoint:]
        first_half = sum(older) / len(older)
        second_half = sum(newer) / len(newer)

        change = second_half - first_half
        if change > 0.05:
            direction = "improving"
        elif change < -0.05:
            direction = "degrading"
        else:
            direction = "stable"

        return {
            "current": recent[-1].metrics.to_dict(),
            "trend": direction,
            "change": change,
            "points_analyzed": len(recent),
            "oldest_point": recent[0].timestamp.isoformat(),
            "newest_point": recent[-1].timestamp.isoformat(),
            "f1_min": min(f1_values),
            "f1_max": max(f1_values),
            "f1_mean": sum(f1_values) / len(f1_values),
        }

    def get_history(
        self,
        rule_name: str,
        limit: int = 50,
    ) -> list[dict[str, Any]]:
        """Get quality history for a rule.

        Args:
            rule_name: Name of the rule
            limit: Maximum points to return (most recent ones); a value of
                zero or less returns an empty list

        Returns:
            List of historical measurements, oldest first
        """
        # ``points[-0:]`` is the whole list, so a zero limit must be handled
        # explicitly instead of being passed to the slice.
        if limit <= 0:
            return []

        with self._lock:
            points = self._trends.get(rule_name, [])[-limit:]

        return [
            {
                "timestamp": p.timestamp.isoformat(),
                "metrics": p.metrics.to_dict(),
                "data_size": p.data_size,
                "notes": p.notes,
            }
            for p in points
        ]

    def _save(self) -> None:
        """Save trends to storage (no-op without a storage path)."""
        if not self.storage_path:
            return

        with self._lock:
            data = {
                rule_name: [
                    {
                        "timestamp": p.timestamp.isoformat(),
                        "metrics": p.metrics.to_dict(),
                        "data_size": p.data_size,
                        "notes": p.notes,
                    }
                    for p in points
                ]
                for rule_name, points in self._trends.items()
            }

        with open(self.storage_path, "w", encoding="utf-8") as f:
            json.dump(data, f)

    def _load(self) -> None:
        """Load trends from storage.

        Loading is best-effort: an unreadable or malformed file does not
        raise, but the failure is logged. Rules parsed before the failure
        stay loaded. Stored ``confusion_matrix`` entries and keys that are
        not ``QualityMetrics`` fields are ignored.
        """
        import logging

        if not self.storage_path or not self.storage_path.exists():
            return

        try:
            with open(self.storage_path, encoding="utf-8") as f:
                data = json.load(f)

            for rule_name, points in data.items():
                self._trends[rule_name] = [
                    QualityTrendPoint(
                        timestamp=datetime.fromisoformat(p["timestamp"]),
                        metrics=QualityMetrics(**{
                            k: v for k, v in p["metrics"].items()
                            if k in QualityMetrics.__dataclass_fields__
                            and k != "confusion_matrix"
                        }),
                        data_size=p.get("data_size", 0),
                        notes=p.get("notes", ""),
                    )
                    for p in points
                ]
        except (OSError, ValueError, KeyError, TypeError, AttributeError) as exc:
            # json.JSONDecodeError is a ValueError. Keep the analyzer usable
            # with whatever was loaded, but do not hide the problem.
            logging.getLogger(__name__).warning(
                "Could not load quality trends from %s: %s",
                self.storage_path,
                exc,
            )
|
|
1282
|
+
|
|
1283
|
+
|
|
1284
|
+
# =============================================================================
|
|
1285
|
+
# Convenience Functions
|
|
1286
|
+
# =============================================================================
|
|
1287
|
+
|
|
1288
|
+
|
|
1289
|
+
def estimate_quality(
    rule: RuleProtocol | ValidationRule,
    data: pl.DataFrame,
    estimator: str = "sampling",
    **kwargs: Any,
) -> QualityMetrics:
    """Build an estimator by name and run it once on ``rule`` and ``data``.

    Args:
        rule: Rule to evaluate
        data: Data to evaluate against
        estimator: Name under which the estimator is registered
        **kwargs: Forwarded to the estimator's constructor

    Returns:
        Quality metrics produced by the estimator (no ground truth is passed)
    """
    return quality_estimator_registry.create(estimator, **kwargs).estimate(rule, data)
|
|
1308
|
+
|
|
1309
|
+
|
|
1310
|
+
def score_rule(
    rule: RuleProtocol | ValidationRule,
    data: pl.DataFrame,
    **kwargs: Any,
) -> RuleQualityScore:
    """Score one rule with a freshly constructed ``RuleQualityScorer``.

    Args:
        rule: Rule to score
        data: Data to evaluate against
        **kwargs: Forwarded to the ``RuleQualityScorer`` constructor

    Returns:
        Complete quality score
    """
    return RuleQualityScorer(**kwargs).score(rule, data)
|
|
1327
|
+
|
|
1328
|
+
|
|
1329
|
+
def compare_rules(
    rules: list[RuleProtocol | ValidationRule],
    data: pl.DataFrame,
    **kwargs: Any,
) -> list[RuleQualityScore]:
    """Rank several rules with a freshly constructed ``RuleQualityScorer``.

    Args:
        rules: Rules to compare
        data: Data to evaluate against
        **kwargs: Forwarded to the ``RuleQualityScorer`` constructor

    Returns:
        Scores as ordered by ``RuleQualityScorer.compare`` (best first)
    """
    return RuleQualityScorer(**kwargs).compare(rules, data)
|