truthound 1.0.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- truthound/__init__.py +162 -0
- truthound/adapters.py +100 -0
- truthound/api.py +365 -0
- truthound/audit/__init__.py +248 -0
- truthound/audit/core.py +967 -0
- truthound/audit/filters.py +620 -0
- truthound/audit/formatters.py +707 -0
- truthound/audit/logger.py +902 -0
- truthound/audit/middleware.py +571 -0
- truthound/audit/storage.py +1083 -0
- truthound/benchmark/__init__.py +123 -0
- truthound/benchmark/base.py +757 -0
- truthound/benchmark/comparison.py +635 -0
- truthound/benchmark/generators.py +706 -0
- truthound/benchmark/reporters.py +718 -0
- truthound/benchmark/runner.py +635 -0
- truthound/benchmark/scenarios.py +712 -0
- truthound/cache.py +252 -0
- truthound/checkpoint/__init__.py +136 -0
- truthound/checkpoint/actions/__init__.py +164 -0
- truthound/checkpoint/actions/base.py +324 -0
- truthound/checkpoint/actions/custom.py +234 -0
- truthound/checkpoint/actions/discord_notify.py +290 -0
- truthound/checkpoint/actions/email_notify.py +405 -0
- truthound/checkpoint/actions/github_action.py +406 -0
- truthound/checkpoint/actions/opsgenie.py +1499 -0
- truthound/checkpoint/actions/pagerduty.py +226 -0
- truthound/checkpoint/actions/slack_notify.py +233 -0
- truthound/checkpoint/actions/store_result.py +249 -0
- truthound/checkpoint/actions/teams_notify.py +1570 -0
- truthound/checkpoint/actions/telegram_notify.py +419 -0
- truthound/checkpoint/actions/update_docs.py +552 -0
- truthound/checkpoint/actions/webhook.py +293 -0
- truthound/checkpoint/analytics/__init__.py +147 -0
- truthound/checkpoint/analytics/aggregations/__init__.py +23 -0
- truthound/checkpoint/analytics/aggregations/rollup.py +481 -0
- truthound/checkpoint/analytics/aggregations/time_bucket.py +306 -0
- truthound/checkpoint/analytics/analyzers/__init__.py +17 -0
- truthound/checkpoint/analytics/analyzers/anomaly.py +386 -0
- truthound/checkpoint/analytics/analyzers/base.py +270 -0
- truthound/checkpoint/analytics/analyzers/forecast.py +421 -0
- truthound/checkpoint/analytics/analyzers/trend.py +314 -0
- truthound/checkpoint/analytics/models.py +292 -0
- truthound/checkpoint/analytics/protocols.py +549 -0
- truthound/checkpoint/analytics/service.py +718 -0
- truthound/checkpoint/analytics/stores/__init__.py +16 -0
- truthound/checkpoint/analytics/stores/base.py +306 -0
- truthound/checkpoint/analytics/stores/memory_store.py +353 -0
- truthound/checkpoint/analytics/stores/sqlite_store.py +557 -0
- truthound/checkpoint/analytics/stores/timescale_store.py +501 -0
- truthound/checkpoint/async_actions.py +794 -0
- truthound/checkpoint/async_base.py +708 -0
- truthound/checkpoint/async_checkpoint.py +617 -0
- truthound/checkpoint/async_runner.py +639 -0
- truthound/checkpoint/checkpoint.py +527 -0
- truthound/checkpoint/ci/__init__.py +61 -0
- truthound/checkpoint/ci/detector.py +355 -0
- truthound/checkpoint/ci/reporter.py +436 -0
- truthound/checkpoint/ci/templates.py +454 -0
- truthound/checkpoint/circuitbreaker/__init__.py +133 -0
- truthound/checkpoint/circuitbreaker/breaker.py +542 -0
- truthound/checkpoint/circuitbreaker/core.py +252 -0
- truthound/checkpoint/circuitbreaker/detection.py +459 -0
- truthound/checkpoint/circuitbreaker/middleware.py +389 -0
- truthound/checkpoint/circuitbreaker/registry.py +357 -0
- truthound/checkpoint/distributed/__init__.py +139 -0
- truthound/checkpoint/distributed/backends/__init__.py +35 -0
- truthound/checkpoint/distributed/backends/celery_backend.py +503 -0
- truthound/checkpoint/distributed/backends/kubernetes_backend.py +696 -0
- truthound/checkpoint/distributed/backends/local_backend.py +397 -0
- truthound/checkpoint/distributed/backends/ray_backend.py +625 -0
- truthound/checkpoint/distributed/base.py +774 -0
- truthound/checkpoint/distributed/orchestrator.py +765 -0
- truthound/checkpoint/distributed/protocols.py +842 -0
- truthound/checkpoint/distributed/registry.py +449 -0
- truthound/checkpoint/idempotency/__init__.py +120 -0
- truthound/checkpoint/idempotency/core.py +295 -0
- truthound/checkpoint/idempotency/fingerprint.py +454 -0
- truthound/checkpoint/idempotency/locking.py +604 -0
- truthound/checkpoint/idempotency/service.py +592 -0
- truthound/checkpoint/idempotency/stores.py +653 -0
- truthound/checkpoint/monitoring/__init__.py +134 -0
- truthound/checkpoint/monitoring/aggregators/__init__.py +15 -0
- truthound/checkpoint/monitoring/aggregators/base.py +372 -0
- truthound/checkpoint/monitoring/aggregators/realtime.py +300 -0
- truthound/checkpoint/monitoring/aggregators/window.py +493 -0
- truthound/checkpoint/monitoring/collectors/__init__.py +17 -0
- truthound/checkpoint/monitoring/collectors/base.py +257 -0
- truthound/checkpoint/monitoring/collectors/memory_collector.py +617 -0
- truthound/checkpoint/monitoring/collectors/prometheus_collector.py +451 -0
- truthound/checkpoint/monitoring/collectors/redis_collector.py +518 -0
- truthound/checkpoint/monitoring/events.py +410 -0
- truthound/checkpoint/monitoring/protocols.py +636 -0
- truthound/checkpoint/monitoring/service.py +578 -0
- truthound/checkpoint/monitoring/views/__init__.py +17 -0
- truthound/checkpoint/monitoring/views/base.py +172 -0
- truthound/checkpoint/monitoring/views/queue_view.py +220 -0
- truthound/checkpoint/monitoring/views/task_view.py +240 -0
- truthound/checkpoint/monitoring/views/worker_view.py +263 -0
- truthound/checkpoint/registry.py +337 -0
- truthound/checkpoint/runner.py +356 -0
- truthound/checkpoint/transaction/__init__.py +133 -0
- truthound/checkpoint/transaction/base.py +389 -0
- truthound/checkpoint/transaction/compensatable.py +537 -0
- truthound/checkpoint/transaction/coordinator.py +576 -0
- truthound/checkpoint/transaction/executor.py +622 -0
- truthound/checkpoint/transaction/idempotency.py +534 -0
- truthound/checkpoint/transaction/saga/__init__.py +143 -0
- truthound/checkpoint/transaction/saga/builder.py +584 -0
- truthound/checkpoint/transaction/saga/definition.py +515 -0
- truthound/checkpoint/transaction/saga/event_store.py +542 -0
- truthound/checkpoint/transaction/saga/patterns.py +833 -0
- truthound/checkpoint/transaction/saga/runner.py +718 -0
- truthound/checkpoint/transaction/saga/state_machine.py +793 -0
- truthound/checkpoint/transaction/saga/strategies.py +780 -0
- truthound/checkpoint/transaction/saga/testing.py +886 -0
- truthound/checkpoint/triggers/__init__.py +58 -0
- truthound/checkpoint/triggers/base.py +237 -0
- truthound/checkpoint/triggers/event.py +385 -0
- truthound/checkpoint/triggers/schedule.py +355 -0
- truthound/cli.py +2358 -0
- truthound/cli_modules/__init__.py +124 -0
- truthound/cli_modules/advanced/__init__.py +45 -0
- truthound/cli_modules/advanced/benchmark.py +343 -0
- truthound/cli_modules/advanced/docs.py +225 -0
- truthound/cli_modules/advanced/lineage.py +209 -0
- truthound/cli_modules/advanced/ml.py +320 -0
- truthound/cli_modules/advanced/realtime.py +196 -0
- truthound/cli_modules/checkpoint/__init__.py +46 -0
- truthound/cli_modules/checkpoint/init.py +114 -0
- truthound/cli_modules/checkpoint/list.py +71 -0
- truthound/cli_modules/checkpoint/run.py +159 -0
- truthound/cli_modules/checkpoint/validate.py +67 -0
- truthound/cli_modules/common/__init__.py +71 -0
- truthound/cli_modules/common/errors.py +414 -0
- truthound/cli_modules/common/options.py +419 -0
- truthound/cli_modules/common/output.py +507 -0
- truthound/cli_modules/common/protocol.py +552 -0
- truthound/cli_modules/core/__init__.py +48 -0
- truthound/cli_modules/core/check.py +123 -0
- truthound/cli_modules/core/compare.py +104 -0
- truthound/cli_modules/core/learn.py +57 -0
- truthound/cli_modules/core/mask.py +77 -0
- truthound/cli_modules/core/profile.py +65 -0
- truthound/cli_modules/core/scan.py +61 -0
- truthound/cli_modules/profiler/__init__.py +51 -0
- truthound/cli_modules/profiler/auto_profile.py +175 -0
- truthound/cli_modules/profiler/metadata.py +107 -0
- truthound/cli_modules/profiler/suite.py +283 -0
- truthound/cli_modules/registry.py +431 -0
- truthound/cli_modules/scaffolding/__init__.py +89 -0
- truthound/cli_modules/scaffolding/base.py +631 -0
- truthound/cli_modules/scaffolding/commands.py +545 -0
- truthound/cli_modules/scaffolding/plugins.py +1072 -0
- truthound/cli_modules/scaffolding/reporters.py +594 -0
- truthound/cli_modules/scaffolding/validators.py +1127 -0
- truthound/common/__init__.py +18 -0
- truthound/common/resilience/__init__.py +130 -0
- truthound/common/resilience/bulkhead.py +266 -0
- truthound/common/resilience/circuit_breaker.py +516 -0
- truthound/common/resilience/composite.py +332 -0
- truthound/common/resilience/config.py +292 -0
- truthound/common/resilience/protocols.py +217 -0
- truthound/common/resilience/rate_limiter.py +404 -0
- truthound/common/resilience/retry.py +341 -0
- truthound/datadocs/__init__.py +260 -0
- truthound/datadocs/base.py +571 -0
- truthound/datadocs/builder.py +761 -0
- truthound/datadocs/charts.py +764 -0
- truthound/datadocs/dashboard/__init__.py +63 -0
- truthound/datadocs/dashboard/app.py +576 -0
- truthound/datadocs/dashboard/components.py +584 -0
- truthound/datadocs/dashboard/state.py +240 -0
- truthound/datadocs/engine/__init__.py +46 -0
- truthound/datadocs/engine/context.py +376 -0
- truthound/datadocs/engine/pipeline.py +618 -0
- truthound/datadocs/engine/registry.py +469 -0
- truthound/datadocs/exporters/__init__.py +49 -0
- truthound/datadocs/exporters/base.py +198 -0
- truthound/datadocs/exporters/html.py +178 -0
- truthound/datadocs/exporters/json_exporter.py +253 -0
- truthound/datadocs/exporters/markdown.py +284 -0
- truthound/datadocs/exporters/pdf.py +392 -0
- truthound/datadocs/i18n/__init__.py +86 -0
- truthound/datadocs/i18n/catalog.py +960 -0
- truthound/datadocs/i18n/formatting.py +505 -0
- truthound/datadocs/i18n/loader.py +256 -0
- truthound/datadocs/i18n/plurals.py +378 -0
- truthound/datadocs/renderers/__init__.py +42 -0
- truthound/datadocs/renderers/base.py +401 -0
- truthound/datadocs/renderers/custom.py +342 -0
- truthound/datadocs/renderers/jinja.py +697 -0
- truthound/datadocs/sections.py +736 -0
- truthound/datadocs/styles.py +931 -0
- truthound/datadocs/themes/__init__.py +101 -0
- truthound/datadocs/themes/base.py +336 -0
- truthound/datadocs/themes/default.py +417 -0
- truthound/datadocs/themes/enterprise.py +419 -0
- truthound/datadocs/themes/loader.py +336 -0
- truthound/datadocs/themes.py +301 -0
- truthound/datadocs/transformers/__init__.py +57 -0
- truthound/datadocs/transformers/base.py +268 -0
- truthound/datadocs/transformers/enrichers.py +544 -0
- truthound/datadocs/transformers/filters.py +447 -0
- truthound/datadocs/transformers/i18n.py +468 -0
- truthound/datadocs/versioning/__init__.py +62 -0
- truthound/datadocs/versioning/diff.py +639 -0
- truthound/datadocs/versioning/storage.py +497 -0
- truthound/datadocs/versioning/version.py +358 -0
- truthound/datasources/__init__.py +223 -0
- truthound/datasources/_async_protocols.py +222 -0
- truthound/datasources/_protocols.py +159 -0
- truthound/datasources/adapters.py +428 -0
- truthound/datasources/async_base.py +599 -0
- truthound/datasources/async_factory.py +511 -0
- truthound/datasources/base.py +516 -0
- truthound/datasources/factory.py +433 -0
- truthound/datasources/nosql/__init__.py +47 -0
- truthound/datasources/nosql/base.py +487 -0
- truthound/datasources/nosql/elasticsearch.py +801 -0
- truthound/datasources/nosql/mongodb.py +636 -0
- truthound/datasources/pandas_optimized.py +582 -0
- truthound/datasources/pandas_source.py +216 -0
- truthound/datasources/polars_source.py +395 -0
- truthound/datasources/spark_source.py +479 -0
- truthound/datasources/sql/__init__.py +154 -0
- truthound/datasources/sql/base.py +710 -0
- truthound/datasources/sql/bigquery.py +410 -0
- truthound/datasources/sql/cloud_base.py +199 -0
- truthound/datasources/sql/databricks.py +471 -0
- truthound/datasources/sql/mysql.py +316 -0
- truthound/datasources/sql/oracle.py +427 -0
- truthound/datasources/sql/postgresql.py +321 -0
- truthound/datasources/sql/redshift.py +479 -0
- truthound/datasources/sql/snowflake.py +439 -0
- truthound/datasources/sql/sqlite.py +286 -0
- truthound/datasources/sql/sqlserver.py +437 -0
- truthound/datasources/streaming/__init__.py +47 -0
- truthound/datasources/streaming/base.py +350 -0
- truthound/datasources/streaming/kafka.py +670 -0
- truthound/decorators.py +98 -0
- truthound/docs/__init__.py +69 -0
- truthound/docs/extractor.py +971 -0
- truthound/docs/generator.py +601 -0
- truthound/docs/parser.py +1037 -0
- truthound/docs/renderer.py +999 -0
- truthound/drift/__init__.py +22 -0
- truthound/drift/compare.py +189 -0
- truthound/drift/detectors.py +464 -0
- truthound/drift/report.py +160 -0
- truthound/execution/__init__.py +65 -0
- truthound/execution/_protocols.py +324 -0
- truthound/execution/base.py +576 -0
- truthound/execution/distributed/__init__.py +179 -0
- truthound/execution/distributed/aggregations.py +731 -0
- truthound/execution/distributed/arrow_bridge.py +817 -0
- truthound/execution/distributed/base.py +550 -0
- truthound/execution/distributed/dask_engine.py +976 -0
- truthound/execution/distributed/mixins.py +766 -0
- truthound/execution/distributed/protocols.py +756 -0
- truthound/execution/distributed/ray_engine.py +1127 -0
- truthound/execution/distributed/registry.py +446 -0
- truthound/execution/distributed/spark_engine.py +1011 -0
- truthound/execution/distributed/validator_adapter.py +682 -0
- truthound/execution/pandas_engine.py +401 -0
- truthound/execution/polars_engine.py +497 -0
- truthound/execution/pushdown/__init__.py +230 -0
- truthound/execution/pushdown/ast.py +1550 -0
- truthound/execution/pushdown/builder.py +1550 -0
- truthound/execution/pushdown/dialects.py +1072 -0
- truthound/execution/pushdown/executor.py +829 -0
- truthound/execution/pushdown/optimizer.py +1041 -0
- truthound/execution/sql_engine.py +518 -0
- truthound/infrastructure/__init__.py +189 -0
- truthound/infrastructure/audit.py +1515 -0
- truthound/infrastructure/config.py +1133 -0
- truthound/infrastructure/encryption.py +1132 -0
- truthound/infrastructure/logging.py +1503 -0
- truthound/infrastructure/metrics.py +1220 -0
- truthound/lineage/__init__.py +89 -0
- truthound/lineage/base.py +746 -0
- truthound/lineage/impact_analysis.py +474 -0
- truthound/lineage/integrations/__init__.py +22 -0
- truthound/lineage/integrations/openlineage.py +548 -0
- truthound/lineage/tracker.py +512 -0
- truthound/lineage/visualization/__init__.py +33 -0
- truthound/lineage/visualization/protocols.py +145 -0
- truthound/lineage/visualization/renderers/__init__.py +20 -0
- truthound/lineage/visualization/renderers/cytoscape.py +329 -0
- truthound/lineage/visualization/renderers/d3.py +331 -0
- truthound/lineage/visualization/renderers/graphviz.py +276 -0
- truthound/lineage/visualization/renderers/mermaid.py +308 -0
- truthound/maskers.py +113 -0
- truthound/ml/__init__.py +124 -0
- truthound/ml/anomaly_models/__init__.py +31 -0
- truthound/ml/anomaly_models/ensemble.py +362 -0
- truthound/ml/anomaly_models/isolation_forest.py +444 -0
- truthound/ml/anomaly_models/statistical.py +392 -0
- truthound/ml/base.py +1178 -0
- truthound/ml/drift_detection/__init__.py +26 -0
- truthound/ml/drift_detection/concept.py +381 -0
- truthound/ml/drift_detection/distribution.py +361 -0
- truthound/ml/drift_detection/feature.py +442 -0
- truthound/ml/drift_detection/multivariate.py +495 -0
- truthound/ml/monitoring/__init__.py +88 -0
- truthound/ml/monitoring/alerting/__init__.py +33 -0
- truthound/ml/monitoring/alerting/handlers.py +427 -0
- truthound/ml/monitoring/alerting/rules.py +508 -0
- truthound/ml/monitoring/collectors/__init__.py +19 -0
- truthound/ml/monitoring/collectors/composite.py +105 -0
- truthound/ml/monitoring/collectors/drift.py +324 -0
- truthound/ml/monitoring/collectors/performance.py +179 -0
- truthound/ml/monitoring/collectors/quality.py +369 -0
- truthound/ml/monitoring/monitor.py +536 -0
- truthound/ml/monitoring/protocols.py +451 -0
- truthound/ml/monitoring/stores/__init__.py +15 -0
- truthound/ml/monitoring/stores/memory.py +201 -0
- truthound/ml/monitoring/stores/prometheus.py +296 -0
- truthound/ml/rule_learning/__init__.py +25 -0
- truthound/ml/rule_learning/constraint_miner.py +443 -0
- truthound/ml/rule_learning/pattern_learner.py +499 -0
- truthound/ml/rule_learning/profile_learner.py +462 -0
- truthound/multitenancy/__init__.py +326 -0
- truthound/multitenancy/core.py +852 -0
- truthound/multitenancy/integration.py +597 -0
- truthound/multitenancy/isolation.py +630 -0
- truthound/multitenancy/manager.py +770 -0
- truthound/multitenancy/middleware.py +765 -0
- truthound/multitenancy/quota.py +537 -0
- truthound/multitenancy/resolvers.py +603 -0
- truthound/multitenancy/storage.py +703 -0
- truthound/observability/__init__.py +307 -0
- truthound/observability/context.py +531 -0
- truthound/observability/instrumentation.py +611 -0
- truthound/observability/logging.py +887 -0
- truthound/observability/metrics.py +1157 -0
- truthound/observability/tracing/__init__.py +178 -0
- truthound/observability/tracing/baggage.py +310 -0
- truthound/observability/tracing/config.py +426 -0
- truthound/observability/tracing/exporter.py +787 -0
- truthound/observability/tracing/integration.py +1018 -0
- truthound/observability/tracing/otel/__init__.py +146 -0
- truthound/observability/tracing/otel/adapter.py +982 -0
- truthound/observability/tracing/otel/bridge.py +1177 -0
- truthound/observability/tracing/otel/compat.py +681 -0
- truthound/observability/tracing/otel/config.py +691 -0
- truthound/observability/tracing/otel/detection.py +327 -0
- truthound/observability/tracing/otel/protocols.py +426 -0
- truthound/observability/tracing/processor.py +561 -0
- truthound/observability/tracing/propagator.py +757 -0
- truthound/observability/tracing/provider.py +569 -0
- truthound/observability/tracing/resource.py +515 -0
- truthound/observability/tracing/sampler.py +487 -0
- truthound/observability/tracing/span.py +676 -0
- truthound/plugins/__init__.py +198 -0
- truthound/plugins/base.py +599 -0
- truthound/plugins/cli.py +680 -0
- truthound/plugins/dependencies/__init__.py +42 -0
- truthound/plugins/dependencies/graph.py +422 -0
- truthound/plugins/dependencies/resolver.py +417 -0
- truthound/plugins/discovery.py +379 -0
- truthound/plugins/docs/__init__.py +46 -0
- truthound/plugins/docs/extractor.py +444 -0
- truthound/plugins/docs/renderer.py +499 -0
- truthound/plugins/enterprise_manager.py +877 -0
- truthound/plugins/examples/__init__.py +19 -0
- truthound/plugins/examples/custom_validators.py +317 -0
- truthound/plugins/examples/slack_notifier.py +312 -0
- truthound/plugins/examples/xml_reporter.py +254 -0
- truthound/plugins/hooks.py +558 -0
- truthound/plugins/lifecycle/__init__.py +43 -0
- truthound/plugins/lifecycle/hot_reload.py +402 -0
- truthound/plugins/lifecycle/manager.py +371 -0
- truthound/plugins/manager.py +736 -0
- truthound/plugins/registry.py +338 -0
- truthound/plugins/security/__init__.py +93 -0
- truthound/plugins/security/exceptions.py +332 -0
- truthound/plugins/security/policies.py +348 -0
- truthound/plugins/security/protocols.py +643 -0
- truthound/plugins/security/sandbox/__init__.py +45 -0
- truthound/plugins/security/sandbox/context.py +158 -0
- truthound/plugins/security/sandbox/engines/__init__.py +19 -0
- truthound/plugins/security/sandbox/engines/container.py +379 -0
- truthound/plugins/security/sandbox/engines/noop.py +144 -0
- truthound/plugins/security/sandbox/engines/process.py +336 -0
- truthound/plugins/security/sandbox/factory.py +211 -0
- truthound/plugins/security/signing/__init__.py +57 -0
- truthound/plugins/security/signing/service.py +330 -0
- truthound/plugins/security/signing/trust_store.py +368 -0
- truthound/plugins/security/signing/verifier.py +459 -0
- truthound/plugins/versioning/__init__.py +41 -0
- truthound/plugins/versioning/constraints.py +297 -0
- truthound/plugins/versioning/resolver.py +329 -0
- truthound/profiler/__init__.py +1729 -0
- truthound/profiler/_lazy.py +452 -0
- truthound/profiler/ab_testing/__init__.py +80 -0
- truthound/profiler/ab_testing/analysis.py +449 -0
- truthound/profiler/ab_testing/base.py +257 -0
- truthound/profiler/ab_testing/experiment.py +395 -0
- truthound/profiler/ab_testing/tracking.py +368 -0
- truthound/profiler/auto_threshold.py +1170 -0
- truthound/profiler/base.py +579 -0
- truthound/profiler/cache_patterns.py +911 -0
- truthound/profiler/caching.py +1303 -0
- truthound/profiler/column_profiler.py +712 -0
- truthound/profiler/comparison.py +1007 -0
- truthound/profiler/custom_patterns.py +1170 -0
- truthound/profiler/dashboard/__init__.py +50 -0
- truthound/profiler/dashboard/app.py +476 -0
- truthound/profiler/dashboard/components.py +457 -0
- truthound/profiler/dashboard/config.py +72 -0
- truthound/profiler/distributed/__init__.py +83 -0
- truthound/profiler/distributed/base.py +281 -0
- truthound/profiler/distributed/dask_backend.py +498 -0
- truthound/profiler/distributed/local_backend.py +293 -0
- truthound/profiler/distributed/profiler.py +304 -0
- truthound/profiler/distributed/ray_backend.py +374 -0
- truthound/profiler/distributed/spark_backend.py +375 -0
- truthound/profiler/distributed.py +1366 -0
- truthound/profiler/enterprise_sampling.py +1065 -0
- truthound/profiler/errors.py +488 -0
- truthound/profiler/evolution/__init__.py +91 -0
- truthound/profiler/evolution/alerts.py +426 -0
- truthound/profiler/evolution/changes.py +206 -0
- truthound/profiler/evolution/compatibility.py +365 -0
- truthound/profiler/evolution/detector.py +372 -0
- truthound/profiler/evolution/protocols.py +121 -0
- truthound/profiler/generators/__init__.py +48 -0
- truthound/profiler/generators/base.py +384 -0
- truthound/profiler/generators/ml_rules.py +375 -0
- truthound/profiler/generators/pattern_rules.py +384 -0
- truthound/profiler/generators/schema_rules.py +267 -0
- truthound/profiler/generators/stats_rules.py +324 -0
- truthound/profiler/generators/suite_generator.py +857 -0
- truthound/profiler/i18n.py +1542 -0
- truthound/profiler/incremental.py +554 -0
- truthound/profiler/incremental_validation.py +1710 -0
- truthound/profiler/integration/__init__.py +73 -0
- truthound/profiler/integration/adapters.py +345 -0
- truthound/profiler/integration/context.py +371 -0
- truthound/profiler/integration/executor.py +527 -0
- truthound/profiler/integration/naming.py +75 -0
- truthound/profiler/integration/protocols.py +243 -0
- truthound/profiler/memory.py +1185 -0
- truthound/profiler/migration/__init__.py +60 -0
- truthound/profiler/migration/base.py +345 -0
- truthound/profiler/migration/manager.py +444 -0
- truthound/profiler/migration/v1_0_to_v1_1.py +484 -0
- truthound/profiler/ml/__init__.py +73 -0
- truthound/profiler/ml/base.py +244 -0
- truthound/profiler/ml/classifier.py +507 -0
- truthound/profiler/ml/feature_extraction.py +604 -0
- truthound/profiler/ml/pretrained.py +448 -0
- truthound/profiler/ml_inference.py +1276 -0
- truthound/profiler/native_patterns.py +815 -0
- truthound/profiler/observability.py +1184 -0
- truthound/profiler/process_timeout.py +1566 -0
- truthound/profiler/progress.py +568 -0
- truthound/profiler/progress_callbacks.py +1734 -0
- truthound/profiler/quality.py +1345 -0
- truthound/profiler/resilience.py +1180 -0
- truthound/profiler/sampled_matcher.py +794 -0
- truthound/profiler/sampling.py +1288 -0
- truthound/profiler/scheduling/__init__.py +82 -0
- truthound/profiler/scheduling/protocols.py +214 -0
- truthound/profiler/scheduling/scheduler.py +474 -0
- truthound/profiler/scheduling/storage.py +457 -0
- truthound/profiler/scheduling/triggers.py +449 -0
- truthound/profiler/schema.py +603 -0
- truthound/profiler/streaming.py +685 -0
- truthound/profiler/streaming_patterns.py +1354 -0
- truthound/profiler/suite_cli.py +625 -0
- truthound/profiler/suite_config.py +789 -0
- truthound/profiler/suite_export.py +1268 -0
- truthound/profiler/table_profiler.py +547 -0
- truthound/profiler/timeout.py +565 -0
- truthound/profiler/validation.py +1532 -0
- truthound/profiler/visualization/__init__.py +118 -0
- truthound/profiler/visualization/base.py +346 -0
- truthound/profiler/visualization/generator.py +1259 -0
- truthound/profiler/visualization/plotly_renderer.py +811 -0
- truthound/profiler/visualization/renderers.py +669 -0
- truthound/profiler/visualization/sections.py +540 -0
- truthound/profiler/visualization.py +2122 -0
- truthound/profiler/yaml_validation.py +1151 -0
- truthound/py.typed +0 -0
- truthound/ratelimit/__init__.py +248 -0
- truthound/ratelimit/algorithms.py +1108 -0
- truthound/ratelimit/core.py +573 -0
- truthound/ratelimit/integration.py +532 -0
- truthound/ratelimit/limiter.py +663 -0
- truthound/ratelimit/middleware.py +700 -0
- truthound/ratelimit/policy.py +792 -0
- truthound/ratelimit/storage.py +763 -0
- truthound/rbac/__init__.py +340 -0
- truthound/rbac/core.py +976 -0
- truthound/rbac/integration.py +760 -0
- truthound/rbac/manager.py +1052 -0
- truthound/rbac/middleware.py +842 -0
- truthound/rbac/policy.py +954 -0
- truthound/rbac/storage.py +878 -0
- truthound/realtime/__init__.py +141 -0
- truthound/realtime/adapters/__init__.py +43 -0
- truthound/realtime/adapters/base.py +533 -0
- truthound/realtime/adapters/kafka.py +487 -0
- truthound/realtime/adapters/kinesis.py +479 -0
- truthound/realtime/adapters/mock.py +243 -0
- truthound/realtime/base.py +553 -0
- truthound/realtime/factory.py +382 -0
- truthound/realtime/incremental.py +660 -0
- truthound/realtime/processing/__init__.py +67 -0
- truthound/realtime/processing/exactly_once.py +575 -0
- truthound/realtime/processing/state.py +547 -0
- truthound/realtime/processing/windows.py +647 -0
- truthound/realtime/protocols.py +569 -0
- truthound/realtime/streaming.py +605 -0
- truthound/realtime/testing/__init__.py +32 -0
- truthound/realtime/testing/containers.py +615 -0
- truthound/realtime/testing/fixtures.py +484 -0
- truthound/report.py +280 -0
- truthound/reporters/__init__.py +46 -0
- truthound/reporters/_protocols.py +30 -0
- truthound/reporters/base.py +324 -0
- truthound/reporters/ci/__init__.py +66 -0
- truthound/reporters/ci/azure.py +436 -0
- truthound/reporters/ci/base.py +509 -0
- truthound/reporters/ci/bitbucket.py +567 -0
- truthound/reporters/ci/circleci.py +547 -0
- truthound/reporters/ci/detection.py +364 -0
- truthound/reporters/ci/factory.py +182 -0
- truthound/reporters/ci/github.py +388 -0
- truthound/reporters/ci/gitlab.py +471 -0
- truthound/reporters/ci/jenkins.py +525 -0
- truthound/reporters/console_reporter.py +299 -0
- truthound/reporters/factory.py +211 -0
- truthound/reporters/html_reporter.py +524 -0
- truthound/reporters/json_reporter.py +256 -0
- truthound/reporters/markdown_reporter.py +280 -0
- truthound/reporters/sdk/__init__.py +174 -0
- truthound/reporters/sdk/builder.py +558 -0
- truthound/reporters/sdk/mixins.py +1150 -0
- truthound/reporters/sdk/schema.py +1493 -0
- truthound/reporters/sdk/templates.py +666 -0
- truthound/reporters/sdk/testing.py +968 -0
- truthound/scanners.py +170 -0
- truthound/scheduling/__init__.py +122 -0
- truthound/scheduling/cron.py +1136 -0
- truthound/scheduling/presets.py +212 -0
- truthound/schema.py +275 -0
- truthound/secrets/__init__.py +173 -0
- truthound/secrets/base.py +618 -0
- truthound/secrets/cloud.py +682 -0
- truthound/secrets/integration.py +507 -0
- truthound/secrets/manager.py +633 -0
- truthound/secrets/oidc/__init__.py +172 -0
- truthound/secrets/oidc/base.py +902 -0
- truthound/secrets/oidc/credential_provider.py +623 -0
- truthound/secrets/oidc/exchangers.py +1001 -0
- truthound/secrets/oidc/github/__init__.py +110 -0
- truthound/secrets/oidc/github/claims.py +718 -0
- truthound/secrets/oidc/github/enhanced_provider.py +693 -0
- truthound/secrets/oidc/github/trust_policy.py +742 -0
- truthound/secrets/oidc/github/verification.py +723 -0
- truthound/secrets/oidc/github/workflow.py +691 -0
- truthound/secrets/oidc/providers.py +825 -0
- truthound/secrets/providers.py +506 -0
- truthound/secrets/resolver.py +495 -0
- truthound/stores/__init__.py +177 -0
- truthound/stores/backends/__init__.py +18 -0
- truthound/stores/backends/_protocols.py +340 -0
- truthound/stores/backends/azure_blob.py +530 -0
- truthound/stores/backends/concurrent_filesystem.py +915 -0
- truthound/stores/backends/connection_pool.py +1365 -0
- truthound/stores/backends/database.py +743 -0
- truthound/stores/backends/filesystem.py +538 -0
- truthound/stores/backends/gcs.py +399 -0
- truthound/stores/backends/memory.py +354 -0
- truthound/stores/backends/s3.py +434 -0
- truthound/stores/backpressure/__init__.py +84 -0
- truthound/stores/backpressure/base.py +375 -0
- truthound/stores/backpressure/circuit_breaker.py +434 -0
- truthound/stores/backpressure/monitor.py +376 -0
- truthound/stores/backpressure/strategies.py +677 -0
- truthound/stores/base.py +551 -0
- truthound/stores/batching/__init__.py +65 -0
- truthound/stores/batching/base.py +305 -0
- truthound/stores/batching/buffer.py +370 -0
- truthound/stores/batching/store.py +248 -0
- truthound/stores/batching/writer.py +521 -0
- truthound/stores/caching/__init__.py +60 -0
- truthound/stores/caching/backends.py +684 -0
- truthound/stores/caching/base.py +356 -0
- truthound/stores/caching/store.py +305 -0
- truthound/stores/compression/__init__.py +193 -0
- truthound/stores/compression/adaptive.py +694 -0
- truthound/stores/compression/base.py +514 -0
- truthound/stores/compression/pipeline.py +868 -0
- truthound/stores/compression/providers.py +672 -0
- truthound/stores/compression/streaming.py +832 -0
- truthound/stores/concurrency/__init__.py +81 -0
- truthound/stores/concurrency/atomic.py +556 -0
- truthound/stores/concurrency/index.py +775 -0
- truthound/stores/concurrency/locks.py +576 -0
- truthound/stores/concurrency/manager.py +482 -0
- truthound/stores/encryption/__init__.py +297 -0
- truthound/stores/encryption/base.py +952 -0
- truthound/stores/encryption/keys.py +1191 -0
- truthound/stores/encryption/pipeline.py +903 -0
- truthound/stores/encryption/providers.py +953 -0
- truthound/stores/encryption/streaming.py +950 -0
- truthound/stores/expectations.py +227 -0
- truthound/stores/factory.py +246 -0
- truthound/stores/migration/__init__.py +75 -0
- truthound/stores/migration/base.py +480 -0
- truthound/stores/migration/manager.py +347 -0
- truthound/stores/migration/registry.py +382 -0
- truthound/stores/migration/store.py +559 -0
- truthound/stores/observability/__init__.py +106 -0
- truthound/stores/observability/audit.py +718 -0
- truthound/stores/observability/config.py +270 -0
- truthound/stores/observability/factory.py +208 -0
- truthound/stores/observability/metrics.py +636 -0
- truthound/stores/observability/protocols.py +410 -0
- truthound/stores/observability/store.py +570 -0
- truthound/stores/observability/tracing.py +784 -0
- truthound/stores/replication/__init__.py +76 -0
- truthound/stores/replication/base.py +260 -0
- truthound/stores/replication/monitor.py +269 -0
- truthound/stores/replication/store.py +439 -0
- truthound/stores/replication/syncer.py +391 -0
- truthound/stores/results.py +359 -0
- truthound/stores/retention/__init__.py +77 -0
- truthound/stores/retention/base.py +378 -0
- truthound/stores/retention/policies.py +621 -0
- truthound/stores/retention/scheduler.py +279 -0
- truthound/stores/retention/store.py +526 -0
- truthound/stores/streaming/__init__.py +138 -0
- truthound/stores/streaming/base.py +801 -0
- truthound/stores/streaming/database.py +984 -0
- truthound/stores/streaming/filesystem.py +719 -0
- truthound/stores/streaming/reader.py +629 -0
- truthound/stores/streaming/s3.py +843 -0
- truthound/stores/streaming/writer.py +790 -0
- truthound/stores/tiering/__init__.py +108 -0
- truthound/stores/tiering/base.py +462 -0
- truthound/stores/tiering/manager.py +249 -0
- truthound/stores/tiering/policies.py +692 -0
- truthound/stores/tiering/store.py +526 -0
- truthound/stores/versioning/__init__.py +56 -0
- truthound/stores/versioning/base.py +376 -0
- truthound/stores/versioning/store.py +660 -0
- truthound/stores/versioning/strategies.py +353 -0
- truthound/types.py +56 -0
- truthound/validators/__init__.py +774 -0
- truthound/validators/aggregate/__init__.py +27 -0
- truthound/validators/aggregate/central.py +116 -0
- truthound/validators/aggregate/extremes.py +116 -0
- truthound/validators/aggregate/spread.py +118 -0
- truthound/validators/aggregate/sum.py +64 -0
- truthound/validators/aggregate/type.py +78 -0
- truthound/validators/anomaly/__init__.py +93 -0
- truthound/validators/anomaly/base.py +431 -0
- truthound/validators/anomaly/ml_based.py +1190 -0
- truthound/validators/anomaly/multivariate.py +647 -0
- truthound/validators/anomaly/statistical.py +599 -0
- truthound/validators/base.py +1089 -0
- truthound/validators/business_rule/__init__.py +46 -0
- truthound/validators/business_rule/base.py +147 -0
- truthound/validators/business_rule/checksum.py +509 -0
- truthound/validators/business_rule/financial.py +526 -0
- truthound/validators/cache.py +733 -0
- truthound/validators/completeness/__init__.py +39 -0
- truthound/validators/completeness/conditional.py +73 -0
- truthound/validators/completeness/default.py +98 -0
- truthound/validators/completeness/empty.py +103 -0
- truthound/validators/completeness/nan.py +337 -0
- truthound/validators/completeness/null.py +152 -0
- truthound/validators/cross_table/__init__.py +17 -0
- truthound/validators/cross_table/aggregate.py +333 -0
- truthound/validators/cross_table/row_count.py +122 -0
- truthound/validators/datetime/__init__.py +29 -0
- truthound/validators/datetime/format.py +78 -0
- truthound/validators/datetime/freshness.py +269 -0
- truthound/validators/datetime/order.py +73 -0
- truthound/validators/datetime/parseable.py +185 -0
- truthound/validators/datetime/range.py +202 -0
- truthound/validators/datetime/timezone.py +69 -0
- truthound/validators/distribution/__init__.py +49 -0
- truthound/validators/distribution/distribution.py +128 -0
- truthound/validators/distribution/monotonic.py +119 -0
- truthound/validators/distribution/outlier.py +178 -0
- truthound/validators/distribution/quantile.py +80 -0
- truthound/validators/distribution/range.py +254 -0
- truthound/validators/distribution/set.py +125 -0
- truthound/validators/distribution/statistical.py +459 -0
- truthound/validators/drift/__init__.py +79 -0
- truthound/validators/drift/base.py +427 -0
- truthound/validators/drift/multi_feature.py +401 -0
- truthound/validators/drift/numeric.py +395 -0
- truthound/validators/drift/psi.py +446 -0
- truthound/validators/drift/statistical.py +510 -0
- truthound/validators/enterprise.py +1658 -0
- truthound/validators/geospatial/__init__.py +80 -0
- truthound/validators/geospatial/base.py +97 -0
- truthound/validators/geospatial/boundary.py +238 -0
- truthound/validators/geospatial/coordinate.py +351 -0
- truthound/validators/geospatial/distance.py +399 -0
- truthound/validators/geospatial/polygon.py +665 -0
- truthound/validators/i18n/__init__.py +308 -0
- truthound/validators/i18n/bidi.py +571 -0
- truthound/validators/i18n/catalogs.py +570 -0
- truthound/validators/i18n/dialects.py +763 -0
- truthound/validators/i18n/extended_catalogs.py +549 -0
- truthound/validators/i18n/formatting.py +1434 -0
- truthound/validators/i18n/loader.py +1020 -0
- truthound/validators/i18n/messages.py +521 -0
- truthound/validators/i18n/plural.py +683 -0
- truthound/validators/i18n/protocols.py +855 -0
- truthound/validators/i18n/tms.py +1162 -0
- truthound/validators/localization/__init__.py +53 -0
- truthound/validators/localization/base.py +122 -0
- truthound/validators/localization/chinese.py +362 -0
- truthound/validators/localization/japanese.py +275 -0
- truthound/validators/localization/korean.py +524 -0
- truthound/validators/memory/__init__.py +94 -0
- truthound/validators/memory/approximate_knn.py +506 -0
- truthound/validators/memory/base.py +547 -0
- truthound/validators/memory/sgd_online.py +719 -0
- truthound/validators/memory/streaming_ecdf.py +753 -0
- truthound/validators/ml_feature/__init__.py +54 -0
- truthound/validators/ml_feature/base.py +249 -0
- truthound/validators/ml_feature/correlation.py +299 -0
- truthound/validators/ml_feature/leakage.py +344 -0
- truthound/validators/ml_feature/null_impact.py +270 -0
- truthound/validators/ml_feature/scale.py +264 -0
- truthound/validators/multi_column/__init__.py +89 -0
- truthound/validators/multi_column/arithmetic.py +284 -0
- truthound/validators/multi_column/base.py +231 -0
- truthound/validators/multi_column/comparison.py +273 -0
- truthound/validators/multi_column/consistency.py +312 -0
- truthound/validators/multi_column/statistical.py +299 -0
- truthound/validators/optimization/__init__.py +164 -0
- truthound/validators/optimization/aggregation.py +563 -0
- truthound/validators/optimization/covariance.py +556 -0
- truthound/validators/optimization/geo.py +626 -0
- truthound/validators/optimization/graph.py +587 -0
- truthound/validators/optimization/orchestrator.py +970 -0
- truthound/validators/optimization/profiling.py +1312 -0
- truthound/validators/privacy/__init__.py +223 -0
- truthound/validators/privacy/base.py +635 -0
- truthound/validators/privacy/ccpa.py +670 -0
- truthound/validators/privacy/gdpr.py +728 -0
- truthound/validators/privacy/global_patterns.py +604 -0
- truthound/validators/privacy/plugins.py +867 -0
- truthound/validators/profiling/__init__.py +52 -0
- truthound/validators/profiling/base.py +175 -0
- truthound/validators/profiling/cardinality.py +312 -0
- truthound/validators/profiling/entropy.py +391 -0
- truthound/validators/profiling/frequency.py +455 -0
- truthound/validators/pushdown_support.py +660 -0
- truthound/validators/query/__init__.py +91 -0
- truthound/validators/query/aggregate.py +346 -0
- truthound/validators/query/base.py +246 -0
- truthound/validators/query/column.py +249 -0
- truthound/validators/query/expression.py +274 -0
- truthound/validators/query/result.py +323 -0
- truthound/validators/query/row_count.py +264 -0
- truthound/validators/referential/__init__.py +80 -0
- truthound/validators/referential/base.py +395 -0
- truthound/validators/referential/cascade.py +391 -0
- truthound/validators/referential/circular.py +563 -0
- truthound/validators/referential/foreign_key.py +624 -0
- truthound/validators/referential/orphan.py +485 -0
- truthound/validators/registry.py +112 -0
- truthound/validators/schema/__init__.py +41 -0
- truthound/validators/schema/column_count.py +142 -0
- truthound/validators/schema/column_exists.py +80 -0
- truthound/validators/schema/column_order.py +82 -0
- truthound/validators/schema/column_pair.py +85 -0
- truthound/validators/schema/column_pair_set.py +195 -0
- truthound/validators/schema/column_type.py +94 -0
- truthound/validators/schema/multi_column.py +53 -0
- truthound/validators/schema/multi_column_aggregate.py +175 -0
- truthound/validators/schema/referential.py +274 -0
- truthound/validators/schema/table_schema.py +91 -0
- truthound/validators/schema_validator.py +219 -0
- truthound/validators/sdk/__init__.py +250 -0
- truthound/validators/sdk/builder.py +680 -0
- truthound/validators/sdk/decorators.py +474 -0
- truthound/validators/sdk/enterprise/__init__.py +211 -0
- truthound/validators/sdk/enterprise/docs.py +725 -0
- truthound/validators/sdk/enterprise/fuzzing.py +659 -0
- truthound/validators/sdk/enterprise/licensing.py +709 -0
- truthound/validators/sdk/enterprise/manager.py +543 -0
- truthound/validators/sdk/enterprise/resources.py +628 -0
- truthound/validators/sdk/enterprise/sandbox.py +766 -0
- truthound/validators/sdk/enterprise/signing.py +603 -0
- truthound/validators/sdk/enterprise/templates.py +865 -0
- truthound/validators/sdk/enterprise/versioning.py +659 -0
- truthound/validators/sdk/templates.py +757 -0
- truthound/validators/sdk/testing.py +807 -0
- truthound/validators/security/__init__.py +181 -0
- truthound/validators/security/redos/__init__.py +182 -0
- truthound/validators/security/redos/core.py +861 -0
- truthound/validators/security/redos/cpu_monitor.py +593 -0
- truthound/validators/security/redos/cve_database.py +791 -0
- truthound/validators/security/redos/ml/__init__.py +155 -0
- truthound/validators/security/redos/ml/base.py +785 -0
- truthound/validators/security/redos/ml/datasets.py +618 -0
- truthound/validators/security/redos/ml/features.py +359 -0
- truthound/validators/security/redos/ml/models.py +1000 -0
- truthound/validators/security/redos/ml/predictor.py +507 -0
- truthound/validators/security/redos/ml/storage.py +632 -0
- truthound/validators/security/redos/ml/training.py +571 -0
- truthound/validators/security/redos/ml_analyzer.py +937 -0
- truthound/validators/security/redos/optimizer.py +674 -0
- truthound/validators/security/redos/profiler.py +682 -0
- truthound/validators/security/redos/re2_engine.py +709 -0
- truthound/validators/security/redos.py +886 -0
- truthound/validators/security/sql_security.py +1247 -0
- truthound/validators/streaming/__init__.py +126 -0
- truthound/validators/streaming/base.py +292 -0
- truthound/validators/streaming/completeness.py +210 -0
- truthound/validators/streaming/mixin.py +575 -0
- truthound/validators/streaming/range.py +308 -0
- truthound/validators/streaming/sources.py +846 -0
- truthound/validators/string/__init__.py +57 -0
- truthound/validators/string/casing.py +158 -0
- truthound/validators/string/charset.py +96 -0
- truthound/validators/string/format.py +501 -0
- truthound/validators/string/json.py +77 -0
- truthound/validators/string/json_schema.py +184 -0
- truthound/validators/string/length.py +104 -0
- truthound/validators/string/like_pattern.py +237 -0
- truthound/validators/string/regex.py +202 -0
- truthound/validators/string/regex_extended.py +435 -0
- truthound/validators/table/__init__.py +88 -0
- truthound/validators/table/base.py +78 -0
- truthound/validators/table/column_count.py +198 -0
- truthound/validators/table/freshness.py +362 -0
- truthound/validators/table/row_count.py +251 -0
- truthound/validators/table/schema.py +333 -0
- truthound/validators/table/size.py +285 -0
- truthound/validators/timeout/__init__.py +102 -0
- truthound/validators/timeout/advanced/__init__.py +247 -0
- truthound/validators/timeout/advanced/circuit_breaker.py +675 -0
- truthound/validators/timeout/advanced/prediction.py +773 -0
- truthound/validators/timeout/advanced/priority.py +618 -0
- truthound/validators/timeout/advanced/redis_backend.py +770 -0
- truthound/validators/timeout/advanced/retry.py +721 -0
- truthound/validators/timeout/advanced/sampling.py +788 -0
- truthound/validators/timeout/advanced/sla.py +661 -0
- truthound/validators/timeout/advanced/telemetry.py +804 -0
- truthound/validators/timeout/cascade.py +477 -0
- truthound/validators/timeout/deadline.py +657 -0
- truthound/validators/timeout/degradation.py +525 -0
- truthound/validators/timeout/distributed.py +597 -0
- truthound/validators/timeseries/__init__.py +89 -0
- truthound/validators/timeseries/base.py +326 -0
- truthound/validators/timeseries/completeness.py +617 -0
- truthound/validators/timeseries/gap.py +485 -0
- truthound/validators/timeseries/monotonic.py +310 -0
- truthound/validators/timeseries/seasonality.py +422 -0
- truthound/validators/timeseries/trend.py +510 -0
- truthound/validators/uniqueness/__init__.py +59 -0
- truthound/validators/uniqueness/approximate.py +475 -0
- truthound/validators/uniqueness/distinct_values.py +253 -0
- truthound/validators/uniqueness/duplicate.py +118 -0
- truthound/validators/uniqueness/primary_key.py +140 -0
- truthound/validators/uniqueness/unique.py +191 -0
- truthound/validators/uniqueness/within_record.py +599 -0
- truthound/validators/utils.py +756 -0
- truthound-1.0.8.dist-info/METADATA +474 -0
- truthound-1.0.8.dist-info/RECORD +877 -0
- truthound-1.0.8.dist-info/WHEEL +4 -0
- truthound-1.0.8.dist-info/entry_points.txt +2 -0
- truthound-1.0.8.dist-info/licenses/LICENSE +190 -0
|
@@ -0,0 +1,1089 @@
|
|
|
1
|
+
"""Base classes for validators.
|
|
2
|
+
|
|
3
|
+
Features:
|
|
4
|
+
- Immutable configuration (thread-safe)
|
|
5
|
+
- Timeout mechanism
|
|
6
|
+
- Type-safe column filtering
|
|
7
|
+
- ReDoS protection for regex patterns
|
|
8
|
+
- Graceful degradation on errors
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from abc import ABC, abstractmethod
|
|
12
|
+
from dataclasses import dataclass
|
|
13
|
+
from typing import Any, Callable
|
|
14
|
+
import re
|
|
15
|
+
import signal
|
|
16
|
+
import threading
|
|
17
|
+
import logging
|
|
18
|
+
import time
|
|
19
|
+
from functools import wraps
|
|
20
|
+
from enum import Enum
|
|
21
|
+
|
|
22
|
+
import polars as pl
|
|
23
|
+
|
|
24
|
+
from truthound.types import Severity
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
# ============================================================================
|
|
28
|
+
# Logging - Uses standard Python logging directly
|
|
29
|
+
# ============================================================================
|
|
30
|
+
|
|
31
|
+
def _get_logger(name: str) -> logging.Logger:
    """Return the standard-library logger named ``truthound.<name>``.

    Args:
        name: Validator name used as the suffix of the logger name.
    """
    qualified_name = f"truthound.{name}"
    return logging.getLogger(qualified_name)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# ============================================================================
|
|
37
|
+
# Error Types
|
|
38
|
+
# ============================================================================
|
|
39
|
+
|
|
40
|
+
class RegexValidationError(ValueError):
    """Signals that a regex pattern was rejected as invalid.

    Attributes:
        pattern: The offending pattern string.
        error: Description of why the pattern was rejected.
    """

    def __init__(self, pattern: str, error: str):
        description = f"Invalid regex pattern '{pattern}': {error}"
        super().__init__(description)
        # Keep the raw pieces so callers can inspect them programmatically.
        self.pattern = pattern
        self.error = error
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class ValidationTimeoutError(Exception):
    """Signals that a validation ran longer than its configured timeout.

    Attributes:
        timeout_seconds: The timeout that was exceeded, in seconds.
        validator_name: Name of the validator that timed out ("" if unknown).
    """

    def __init__(self, timeout_seconds: float, validator_name: str = ""):
        self.timeout_seconds = timeout_seconds
        self.validator_name = validator_name

        base = f"Validation timed out after {timeout_seconds}s"
        # Prefix with the validator name only when one was supplied.
        text = f"[{validator_name}] {base}" if validator_name else base
        super().__init__(text)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class ColumnNotFoundError(Exception):
    """Signals that a required column is absent from the schema.

    Attributes:
        column: Name of the column that could not be found.
        available_columns: Column names that were available.
    """

    def __init__(self, column: str, available_columns: list[str]):
        self.column = column
        self.available_columns = available_columns

        # Show at most ten available columns; mark truncation with "...".
        preview = available_columns[:10]
        suffix = "..." if len(available_columns) > 10 else ""
        super().__init__(
            f"Column '{column}' not found. Available: {preview}" + suffix
        )
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
# ============================================================================
|
|
74
|
+
# ReDoS Protection (simplified)
|
|
75
|
+
# ============================================================================
|
|
76
|
+
|
|
77
|
+
class RegexSafetyChecker:
|
|
78
|
+
"""Detects ReDoS vulnerabilities in regex patterns.
|
|
79
|
+
|
|
80
|
+
Checks for common dangerous patterns that could cause exponential backtracking.
|
|
81
|
+
"""
|
|
82
|
+
|
|
83
|
+
REDOS_PATTERNS = [
|
|
84
|
+
r"\(.+\)\+\+", # Nested quantifiers: (a+)+
|
|
85
|
+
r"\(.+\)\*\*", # Nested quantifiers: (a*)*
|
|
86
|
+
r"\(.+\)\{\d+,\}", # Nested with unbounded repetition
|
|
87
|
+
r"\(.+\|.+\)\+", # Alternation in quantified group
|
|
88
|
+
]
|
|
89
|
+
|
|
90
|
+
MAX_PATTERN_LENGTH = 1000
|
|
91
|
+
|
|
92
|
+
@classmethod
|
|
93
|
+
def check_pattern(cls, pattern: str) -> tuple[bool, str | None]:
|
|
94
|
+
"""Check if a pattern is potentially vulnerable to ReDoS."""
|
|
95
|
+
if len(pattern) > cls.MAX_PATTERN_LENGTH:
|
|
96
|
+
return False, f"Pattern too long ({len(pattern)} > {cls.MAX_PATTERN_LENGTH})"
|
|
97
|
+
|
|
98
|
+
for redos_pattern in cls.REDOS_PATTERNS:
|
|
99
|
+
if re.search(redos_pattern, pattern):
|
|
100
|
+
return False, f"Potentially vulnerable to ReDoS: matches {redos_pattern}"
|
|
101
|
+
|
|
102
|
+
return True, None
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
# ============================================================================
|
|
106
|
+
# Safe Sampling
|
|
107
|
+
# ============================================================================
|
|
108
|
+
|
|
109
|
+
class SafeSampler:
    """Row-limited sampling helpers built on Polars lazy queries.

    Every helper applies ``head(n)`` to the lazy query before collecting with
    the streaming engine.
    """

    @staticmethod
    def _select_existing(
        query: pl.LazyFrame,
        source: pl.LazyFrame,
        columns: list[str] | None,
    ) -> pl.LazyFrame:
        """Project ``query`` onto the requested columns present in ``source``.

        Returns ``query`` unchanged when no columns are requested or when none
        of the requested columns exist in the schema of ``source``.
        """
        if not columns:
            return query
        known = source.collect_schema().names()
        present = [name for name in columns if name in known]
        return query.select(present) if present else query

    @staticmethod
    def safe_head(
        lf: pl.LazyFrame,
        n: int,
        columns: list[str] | None = None,
    ) -> pl.DataFrame:
        """Collect the first ``n`` rows, optionally restricted to ``columns``.

        Requested columns missing from the schema are silently dropped.
        """
        projected = SafeSampler._select_existing(lf, lf, columns)
        return projected.head(n).collect(engine="streaming")

    @staticmethod
    def safe_sample(
        lf: pl.LazyFrame,
        n: int,
        columns: list[str] | None = None,
        seed: int | None = None,
    ) -> pl.DataFrame:
        """Return ``n`` rows by delegating to :meth:`safe_head`.

        ``seed`` is accepted but not used: the result is the first ``n`` rows.
        """
        return SafeSampler.safe_head(lf, n, columns)

    @staticmethod
    def safe_filter_sample(
        lf: pl.LazyFrame,
        filter_expr: pl.Expr,
        n: int,
        columns: list[str] | None = None,
    ) -> pl.DataFrame:
        """Collect the first ``n`` rows matching ``filter_expr``.

        The filter is applied before the optional column projection, and the
        projection is resolved against the schema of the unfiltered ``lf``.
        """
        matching = lf.filter(filter_expr)
        projected = SafeSampler._select_existing(matching, lf, columns)
        return projected.head(n).collect(engine="streaming")
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
# ============================================================================
|
|
155
|
+
# Memory Tracking (stub for compatibility)
|
|
156
|
+
# ============================================================================
|
|
157
|
+
|
|
158
|
+
class MemoryTracker:
|
|
159
|
+
"""Stub for backward compatibility. Memory tracking is not enforced."""
|
|
160
|
+
|
|
161
|
+
def __init__(self, limit_mb: float | None = None):
|
|
162
|
+
self.limit_mb = limit_mb
|
|
163
|
+
self.peak_mb: float = 0.0
|
|
164
|
+
|
|
165
|
+
def get_current_mb(self) -> float:
|
|
166
|
+
return 0.0
|
|
167
|
+
|
|
168
|
+
def start(self) -> None:
|
|
169
|
+
pass
|
|
170
|
+
|
|
171
|
+
def check(self) -> None:
|
|
172
|
+
pass
|
|
173
|
+
|
|
174
|
+
def get_delta_mb(self) -> float:
|
|
175
|
+
return 0.0
|
|
176
|
+
|
|
177
|
+
def __enter__(self) -> "MemoryTracker":
|
|
178
|
+
return self
|
|
179
|
+
|
|
180
|
+
def __exit__(self, *args: Any) -> None:
|
|
181
|
+
pass
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
# ============================================================================
|
|
185
|
+
# Graceful Degradation (#8, #13)
|
|
186
|
+
# ============================================================================
|
|
187
|
+
|
|
188
|
+
class ValidationResult(Enum):
    """Outcome status of one validation operation."""

    # Finished normally.
    SUCCESS = "success"
    # Completed with some issues.
    PARTIAL = "partial"
    # Skipped due to missing columns, etc.
    SKIPPED = "skipped"
    # Unrecoverable error.
    FAILED = "failed"
    # Exceeded time limit.
    TIMEOUT = "timeout"
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
@dataclass
class ErrorContext:
    """Minimal description of an error raised during validation."""

    error_type: str  # name of the error class
    message: str  # error message text

    def to_dict(self) -> dict[str, Any]:
        """Return the two fields as a plain dictionary."""
        return dict(error_type=self.error_type, message=self.message)
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
@dataclass
class ValidatorExecutionResult:
    """Outcome of running one validator, including any captured error."""

    validator_name: str
    status: ValidationResult
    issues: list["ValidationIssue"]
    error_message: str | None = None
    error_context: ErrorContext | None = None
    execution_time_ms: float = 0.0

    def to_dict(self) -> dict[str, Any]:
        """Summarise the result as a dictionary.

        Issues are reported as a count only; the error context, when present,
        is expanded through its own ``to_dict``.
        """
        error_payload = None
        if self.error_context:
            error_payload = self.error_context.to_dict()

        summary: dict[str, Any] = {}
        summary["validator"] = self.validator_name
        summary["status"] = self.status.value
        summary["issue_count"] = len(self.issues)
        summary["execution_time_ms"] = self.execution_time_ms
        summary["error"] = error_payload
        return summary
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
def _validate_safe(
    validator: "Validator",
    lf: pl.LazyFrame,
    skip_on_error: bool = True,
    log_errors: bool = True,
) -> ValidatorExecutionResult:
    """Run ``validator.validate(lf)`` and convert failures into a result.

    Args:
        validator: Validator to execute.
        lf: LazyFrame passed to the validator.
        skip_on_error: If True, unexpected exceptions are reported as a
            FAILED result; if False they are re-raised.
        log_errors: If True, caught errors are logged on the validator's
            logger.

    Returns:
        ValidatorExecutionResult with status SUCCESS, SKIPPED (missing
        column), TIMEOUT or FAILED, plus any issues found.

    Raises:
        Exception: Any unexpected exception from the validator when
            ``skip_on_error`` is False.
    """
    # perf_counter is monotonic, so the measured duration cannot be skewed
    # (or go negative) when the system wall clock is adjusted mid-run.
    started = time.perf_counter()
    logger = _get_logger(validator.name)

    def _result(
        status: ValidationResult,
        issues: list["ValidationIssue"],
        error_type: str | None = None,
        error: BaseException | None = None,
    ) -> ValidatorExecutionResult:
        """Build the execution result, stamping the elapsed time."""
        message = None if error is None else str(error)
        context = None if error is None else ErrorContext(error_type, message)
        return ValidatorExecutionResult(
            validator_name=validator.name,
            status=status,
            issues=issues,
            error_message=message,
            error_context=context,
            execution_time_ms=(time.perf_counter() - started) * 1000,
        )

    try:
        return _result(ValidationResult.SUCCESS, validator.validate(lf))

    except ColumnNotFoundError as e:
        if log_errors:
            logger.warning("Column not found: %s", e.column)
        return _result(ValidationResult.SKIPPED, [], "ColumnNotFoundError", e)

    except ValidationTimeoutError as e:
        if log_errors:
            logger.warning("Validation timed out: %ss", e.timeout_seconds)
        return _result(ValidationResult.TIMEOUT, [], "ValidationTimeoutError", e)

    except Exception as e:
        if log_errors:
            logger.exception("Error in %s: %s", validator.name, e)
        if skip_on_error:
            return _result(ValidationResult.FAILED, [], type(e).__name__, e)
        raise
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
class GracefulValidator:
    """Backward-compatibility wrapper; prefer ``validator.validate_safe()``.

    Stores a validator together with the error-handling options and forwards
    them to ``_validate_safe``.
    """

    def __init__(
        self,
        validator: "Validator",
        skip_on_error: bool = True,
        log_errors: bool = True,
    ):
        self.log_errors = log_errors
        self.skip_on_error = skip_on_error
        self.validator = validator

    def validate(self, lf: pl.LazyFrame) -> ValidatorExecutionResult:
        """Run the wrapped validator on ``lf`` with error handling."""
        return _validate_safe(
            self.validator,
            lf,
            skip_on_error=self.skip_on_error,
            log_errors=self.log_errors,
        )
|
|
306
|
+
|
|
307
|
+
|
|
308
|
+
# ============================================================================
|
|
309
|
+
# Schema Resilience (#9)
|
|
310
|
+
# ============================================================================
|
|
311
|
+
|
|
312
|
+
class SchemaValidator:
    """Validates schema compatibility before running validators.

    Prevents runtime errors from missing columns by pre-checking schema.
    """

    @staticmethod
    def check_columns_exist(
        lf: pl.LazyFrame,
        required_columns: list[str],
        raise_on_missing: bool = True,
    ) -> tuple[bool, list[str]]:
        """Check if required columns exist in the LazyFrame.

        Args:
            lf: LazyFrame to check
            required_columns: List of required column names
            raise_on_missing: If True, raise ColumnNotFoundError

        Returns:
            Tuple of (all_exist, missing_columns)

        Raises:
            ColumnNotFoundError: For the first missing column, when
                ``raise_on_missing`` is True.
        """
        names = list(lf.collect_schema().names())
        available = set(names)  # O(1) membership tests
        missing = [c for c in required_columns if c not in available]

        if missing and raise_on_missing:
            # Report the available columns in schema order; iterating the set
            # would make the (truncated) error message order arbitrary.
            raise ColumnNotFoundError(missing[0], names)

        return not missing, missing

    @staticmethod
    def get_safe_columns(
        lf: pl.LazyFrame,
        requested_columns: list[str] | None,
        dtype_filter: set[type] | None = None,
    ) -> list[str]:
        """Get columns that exist and match type filter.

        Args:
            lf: LazyFrame to check
            requested_columns: Requested columns (None or empty = all)
            dtype_filter: Optional set of allowed types

        Returns:
            List of valid column names, in requested order when columns were
            requested, otherwise in schema order.
        """
        schema = lf.collect_schema()
        names = list(schema.names())

        if requested_columns:
            known = set(names)
            columns = [c for c in requested_columns if c in known]
        else:
            columns = names

        if dtype_filter:
            # Compares the class of each schema entry against the filter.
            columns = [c for c in columns if type(schema[c]) in dtype_filter]

        return columns
|
|
371
|
+
|
|
372
|
+
|
|
373
|
+
# ============================================================================
|
|
374
|
+
# Configuration
|
|
375
|
+
# ============================================================================
|
|
376
|
+
|
|
377
|
+
@dataclass(frozen=True)
class ValidatorConfig:
    """Frozen (immutable) settings shared by validators.

    Being a frozen dataclass with tuple-valued column fields, instances can
    be used as dict keys.
    """

    columns: tuple[str, ...] | None = None
    exclude_columns: tuple[str, ...] | None = None
    severity_override: Severity | None = None
    sample_size: int = 5
    mostly: float | None = None  # Fraction of rows that must pass (0.0 to 1.0)
    timeout_seconds: float | None = 300.0
    graceful_degradation: bool = True
    log_errors: bool = True

    def __post_init__(self) -> None:
        """Reject out-of-range values.

        Raises:
            ValueError: If ``sample_size`` is negative, ``mostly`` lies
                outside [0.0, 1.0], or ``timeout_seconds`` is not positive.
        """
        if self.sample_size < 0:
            raise ValueError(f"sample_size must be >= 0, got {self.sample_size}")
        if self.mostly is not None:
            if not (0.0 <= self.mostly <= 1.0):
                raise ValueError(f"mostly must be in [0.0, 1.0], got {self.mostly}")
        if self.timeout_seconds is not None:
            if self.timeout_seconds <= 0:
                raise ValueError(f"timeout_seconds must be > 0, got {self.timeout_seconds}")

    def replace(self, **kwargs: Any) -> "ValidatorConfig":
        """Return a new config with the given fields overridden."""
        from dataclasses import asdict

        merged = {**asdict(self), **kwargs}
        # The column fields must be tuples; accept lists for convenience.
        for key in ("columns", "exclude_columns"):
            if isinstance(merged.get(key), list):
                merged[key] = tuple(merged[key])
        return ValidatorConfig(**merged)

    @classmethod
    def from_kwargs(cls, **kwargs: Any) -> "ValidatorConfig":
        """Build a config from keyword arguments.

        Unknown keys are ignored, and list values for the column fields are
        converted to tuples.
        """
        valid_fields = {
            "columns", "exclude_columns", "severity_override", "sample_size",
            "mostly", "timeout_seconds", "graceful_degradation", "log_errors"
        }
        filtered: dict[str, Any] = {}
        for key, value in kwargs.items():
            if key not in valid_fields:
                continue
            if key in ("columns", "exclude_columns") and isinstance(value, list):
                value = tuple(value)
            filtered[key] = value
        return cls(**filtered)
|
|
427
|
+
|
|
428
|
+
|
|
429
|
+
# ============================================================================
|
|
430
|
+
# Timeout Handler
|
|
431
|
+
# ============================================================================
|
|
432
|
+
|
|
433
|
+
class TimeoutHandler:
|
|
434
|
+
"""Thread-safe timeout handler for validation operations."""
|
|
435
|
+
|
|
436
|
+
def __init__(self, timeout_seconds: float | None, validator_name: str = ""):
|
|
437
|
+
self.timeout_seconds = timeout_seconds
|
|
438
|
+
self.validator_name = validator_name
|
|
439
|
+
self._old_handler = None
|
|
440
|
+
|
|
441
|
+
def _timeout_handler(self, signum: int, frame: Any) -> None:
|
|
442
|
+
raise ValidationTimeoutError(self.timeout_seconds or 0, self.validator_name)
|
|
443
|
+
|
|
444
|
+
def __enter__(self) -> "TimeoutHandler":
|
|
445
|
+
if self.timeout_seconds is None:
|
|
446
|
+
return self
|
|
447
|
+
|
|
448
|
+
try:
|
|
449
|
+
if threading.current_thread() is threading.main_thread():
|
|
450
|
+
self._old_handler = signal.signal(signal.SIGALRM, self._timeout_handler)
|
|
451
|
+
signal.setitimer(signal.ITIMER_REAL, self.timeout_seconds)
|
|
452
|
+
except (AttributeError, ValueError):
|
|
453
|
+
pass
|
|
454
|
+
|
|
455
|
+
return self
|
|
456
|
+
|
|
457
|
+
def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> bool:
|
|
458
|
+
if self.timeout_seconds is None:
|
|
459
|
+
return False
|
|
460
|
+
|
|
461
|
+
try:
|
|
462
|
+
if threading.current_thread() is threading.main_thread():
|
|
463
|
+
signal.setitimer(signal.ITIMER_REAL, 0)
|
|
464
|
+
if self._old_handler is not None:
|
|
465
|
+
signal.signal(signal.SIGALRM, self._old_handler)
|
|
466
|
+
except (AttributeError, ValueError):
|
|
467
|
+
pass
|
|
468
|
+
|
|
469
|
+
return False
|
|
470
|
+
|
|
471
|
+
|
|
472
|
+
def with_timeout(func: Callable) -> Callable:
    """Wrap a validator method so it runs inside a ``TimeoutHandler``.

    The timeout is read from ``self.config.timeout_seconds`` at call time.
    The validator name comes from ``self.name``, falling back to the class
    name.
    """

    @wraps(func)
    def wrapper(self: "Validator", *args: Any, **kwargs: Any) -> Any:
        handler = TimeoutHandler(
            self.config.timeout_seconds,
            getattr(self, "name", self.__class__.__name__),
        )
        with handler:
            return func(self, *args, **kwargs)

    return wrapper
|
|
483
|
+
|
|
484
|
+
|
|
485
|
+
# ============================================================================
|
|
486
|
+
# ValidationIssue
|
|
487
|
+
# ============================================================================
|
|
488
|
+
|
|
489
|
+
@dataclass
class ValidationIssue:
    """One data quality problem reported by a validator."""

    column: str
    issue_type: str
    count: int
    severity: Severity
    details: str | None = None
    expected: Any | None = None
    actual: Any | None = None
    sample_values: list[Any] | None = None

    def to_dict(self) -> dict:
        """Return the issue as a dictionary for JSON serialization.

        ``expected``, ``actual`` and ``sample_values`` are included only when
        they are not None; ``severity`` is emitted as its ``.value``.
        """
        payload = {
            "column": self.column,
            "issue_type": self.issue_type,
            "count": self.count,
            "severity": self.severity.value,
            "details": self.details,
        }
        for key in ("expected", "actual", "sample_values"):
            value = getattr(self, key)
            if value is not None:
                payload[key] = value
        return payload
|
|
518
|
+
|
|
519
|
+
|
|
520
|
+
# ============================================================================
|
|
521
|
+
# Type Filters
|
|
522
|
+
# ============================================================================
|
|
523
|
+
|
|
524
|
+
# Dtype groups passed as ``dtype_filter`` by the column-selection mixins.
FLOAT_TYPES: set[type[pl.DataType]] = {pl.Float32, pl.Float64}

# Signed and unsigned integers plus the floating-point dtypes above.
NUMERIC_TYPES: set[type[pl.DataType]] = {
    pl.Int8,
    pl.Int16,
    pl.Int32,
    pl.Int64,
    pl.UInt8,
    pl.UInt16,
    pl.UInt32,
    pl.UInt64,
} | FLOAT_TYPES

STRING_TYPES: set[type[pl.DataType]] = {pl.String, pl.Utf8}

DATETIME_TYPES: set[type[pl.DataType]] = {
    pl.Date,
    pl.Datetime,
    pl.Time,
    pl.Duration,
}
|
|
535
|
+
|
|
536
|
+
|
|
537
|
+
# ============================================================================
|
|
538
|
+
# Base Validator
|
|
539
|
+
# ============================================================================
|
|
540
|
+
|
|
541
|
+
class Validator(ABC):
    """Abstract base class for all validators.

    Features:
        - Immutable ValidatorConfig (thread-safe)
        - Timeout support
        - Schema validation
        - Graceful degradation on errors
        - Dependency-aware execution ordering

    Data Type Support:
        Validators ONLY accept Polars LazyFrame (pl.LazyFrame) directly.
        For other data types, use the public API (th.check()) which handles
        conversion:

        - th.check("data.csv") → Automatically converts to LazyFrame
        - th.check(pl.DataFrame()) → Converts DataFrame to LazyFrame
        - th.check(pd.DataFrame()) → Converts pandas DataFrame to LazyFrame
        - th.check({"col": [1,2]}) → Converts dict to LazyFrame

        If using validators directly, convert data first::

            import polars as pl
            from truthound.adapters import to_lazyframe

            # Option 1: Use the adapter
            lf = to_lazyframe(your_data)
            issues = NullValidator().validate(lf)

            # Option 2: Convert manually
            lf = pl.DataFrame(your_data).lazy()
            issues = NullValidator().validate(lf)

    Class Attributes:
        name: Unique identifier for this validator
        category: Validator category (schema, completeness, uniqueness, etc.)
        dependencies: Set of validator names that must run before this one
        provides: Set of capabilities this validator provides
        priority: Execution priority within phase (lower = earlier)

    Example:
        class MyValidator(Validator):
            name = "my_validator"
            category = "custom"
            dependencies = {"null", "schema"}  # Runs after null and schema
            provides = {"my_check"}  # Other validators can depend on this

            def validate(self, lf):
                ...
    """

    name: str = "base"
    category: str = "general"

    # DAG execution metadata.
    # NOTE: these sets are class-level objects shared by every subclass that
    # does not override them; rebind them in subclasses, never mutate in place.
    dependencies: set[str] = set()  # Validators that must run before this
    provides: set[str] = set()  # Capabilities this validator provides
    priority: int = 100  # Lower = runs earlier within same phase

    def __init__(self, config: ValidatorConfig | None = None, **kwargs: Any):
        """Initialize the validator.

        Args:
            config: Immutable validator configuration. When given together
                with ``kwargs``, a copy produced by ``config.replace(**kwargs)``
                is used.
            **kwargs: Additional config options (merged into config, or used
                to build one via ``ValidatorConfig.from_kwargs``).
        """
        if config is not None:
            self.config = config.replace(**kwargs) if kwargs else config
        else:
            self.config = ValidatorConfig.from_kwargs(**kwargs)
        self.logger = _get_logger(self.name)

    @abstractmethod
    def validate(self, lf: pl.LazyFrame) -> list[ValidationIssue]:
        """Run validation on the given LazyFrame."""
        pass

    def validate_safe(self, lf: pl.LazyFrame) -> ValidatorExecutionResult:
        """Run validation with graceful error handling.

        Delegates to ``_validate_safe`` using the ``graceful_degradation``
        and ``log_errors`` settings from this validator's config.
        """
        return _validate_safe(
            self,
            lf,
            skip_on_error=self.config.graceful_degradation,
            log_errors=self.config.log_errors,
        )

    def validate_with_timeout(self, lf: pl.LazyFrame) -> list[ValidationIssue]:
        """Run ``validate`` inside a ``TimeoutHandler``.

        The limit comes from ``config.timeout_seconds``; the handler is
        labelled with ``name`` (falling back to the class name).
        """
        timeout = self.config.timeout_seconds
        validator_name = getattr(self, "name", self.__class__.__name__)

        with TimeoutHandler(timeout, validator_name):
            return self.validate(lf)

    def _get_target_columns(
        self,
        lf: pl.LazyFrame,
        dtype_filter: set[type[pl.DataType]] | None = None,
    ) -> list[str]:
        """Get columns to validate based on config and dtype filter.

        Uses SchemaValidator for safe column resolution, then removes any
        column listed in ``config.exclude_columns``.

        Args:
            lf: Frame whose columns are being selected.
            dtype_filter: Optional dtype set forwarded to
                ``SchemaValidator.get_safe_columns``.

        Returns:
            Column names in the order returned by the schema validator.
        """
        requested = list(self.config.columns) if self.config.columns else None
        # A set gives constant-time membership tests in the filter below.
        exclude = set(self.config.exclude_columns) if self.config.exclude_columns else set()

        columns = SchemaValidator.get_safe_columns(lf, requested, dtype_filter)

        if exclude:
            columns = [c for c in columns if c not in exclude]

        return columns

    def _calculate_severity(
        self,
        ratio: float,
        thresholds: tuple[float, float, float] = (0.5, 0.2, 0.05),
    ) -> Severity:
        """Calculate severity based on ratio and thresholds.

        Args:
            ratio: Value compared against the thresholds.
            thresholds: ``(critical, high, medium)`` cut-offs; a ratio must be
                strictly greater than a cut-off to reach that level.

        Returns:
            ``config.severity_override`` when it is set (truthy); otherwise
            the first level whose cut-off the ratio exceeds, else ``LOW``.
        """
        if self.config.severity_override:
            return self.config.severity_override

        critical_th, high_th, medium_th = thresholds
        if ratio > critical_th:
            return Severity.CRITICAL
        elif ratio > high_th:
            return Severity.HIGH
        elif ratio > medium_th:
            return Severity.MEDIUM
        return Severity.LOW

    def _passes_mostly(self, failure_count: int, total_count: int) -> bool:
        """Check if validation passes based on mostly threshold.

        Returns ``False`` when no ``mostly`` threshold is configured, ``True``
        for an empty dataset, and otherwise whether the passing fraction is
        at least ``config.mostly``.
        """
        if self.config.mostly is None:
            return False

        if total_count == 0:
            return True

        pass_ratio = 1 - (failure_count / total_count)
        return pass_ratio >= self.config.mostly

    def _get_mostly_adjusted_severity(
        self,
        failure_count: int,
        total_count: int,
        base_severity: Severity,
    ) -> Severity | None:
        """Get severity adjusted for mostly threshold.

        Returns:
            ``None`` when the ``mostly`` threshold is satisfied, otherwise
            ``base_severity`` unchanged.
        """
        if self._passes_mostly(failure_count, total_count):
            return None
        return base_severity

    def _safe_sample(
        self,
        lf: pl.LazyFrame,
        filter_expr: pl.Expr,
        columns: list[str] | None = None,
    ) -> list[Any]:
        """Safely get sample values.

        Args:
            lf: Frame to sample from.
            filter_expr: Expression selecting the rows of interest.
            columns: Optional column subset forwarded to the sampler.

        Returns:
            The sampled rows as dicts, or an empty list when nothing matched
            or sampling failed (the failure is logged as a warning).

        Raises:
            ValidationTimeoutError: Propagated unchanged so a timeout that
                fires during sampling still aborts the validation.
        """
        try:
            df = SafeSampler.safe_filter_sample(
                lf, filter_expr, self.config.sample_size, columns
            )
            return df.to_dicts() if len(df) > 0 else []
        except ValidationTimeoutError:
            # Must not be downgraded to "no samples": the alarm is one-shot,
            # so swallowing it here would let the run continue unbounded.
            raise
        except Exception as e:
            self.logger.warning(f"Failed to collect samples: {e}")
            return []
|
|
708
|
+
|
|
709
|
+
|
|
710
|
+
# ============================================================================
|
|
711
|
+
# Mixins
|
|
712
|
+
# ============================================================================
|
|
713
|
+
|
|
714
|
+
class NumericValidatorMixin:
    """Adds a numeric-column selector to a validator class."""

    def _get_numeric_columns(self, lf: pl.LazyFrame) -> list[str]:
        """Return the target columns whose dtype is in ``NUMERIC_TYPES``.

        Relies on ``_get_target_columns``, which this mixin does not define
        and so must be provided by the host class.
        """
        numeric_only = self._get_target_columns(lf, dtype_filter=NUMERIC_TYPES)  # type: ignore
        return numeric_only
|
|
719
|
+
|
|
720
|
+
|
|
721
|
+
class StringValidatorMixin:
    """Adds a string-column selector to a validator class."""

    def _get_string_columns(self, lf: pl.LazyFrame) -> list[str]:
        """Return the target columns whose dtype is in ``STRING_TYPES``.

        Relies on ``_get_target_columns``, which this mixin does not define
        and so must be provided by the host class.
        """
        string_only = self._get_target_columns(lf, dtype_filter=STRING_TYPES)  # type: ignore
        return string_only
|
|
726
|
+
|
|
727
|
+
|
|
728
|
+
class DatetimeValidatorMixin:
    """Adds a temporal-column selector to a validator class."""

    def _get_datetime_columns(self, lf: pl.LazyFrame) -> list[str]:
        """Return the target columns whose dtype is in ``DATETIME_TYPES``.

        Relies on ``_get_target_columns``, which this mixin does not define
        and so must be provided by the host class.
        """
        temporal_only = self._get_target_columns(lf, dtype_filter=DATETIME_TYPES)  # type: ignore
        return temporal_only
|
|
733
|
+
|
|
734
|
+
|
|
735
|
+
class FloatValidatorMixin:
    """Adds a float-column selector to a validator class."""

    def _get_float_columns(self, lf: pl.LazyFrame) -> list[str]:
        """Return the target columns whose dtype is in ``FLOAT_TYPES``.

        Relies on ``_get_target_columns``, which this mixin does not define
        and so must be provided by the host class.
        """
        float_only = self._get_target_columns(lf, dtype_filter=FLOAT_TYPES)  # type: ignore
        return float_only
|
|
740
|
+
|
|
741
|
+
|
|
742
|
+
class RegexValidatorMixin:
    """Regex helpers that vet patterns with ``RegexSafetyChecker`` before compiling."""

    @staticmethod
    def validate_pattern(pattern: str, flags: int = 0) -> re.Pattern[str]:
        """Vet and compile a single regex pattern.

        Args:
            pattern: Pattern source text.
            flags: ``re`` flags forwarded to ``re.compile``.

        Returns:
            The compiled pattern.

        Raises:
            RegexValidationError: If the pattern is ``None``, is reported
                unsafe by ``RegexSafetyChecker.check_pattern``, or fails to
                compile.
        """
        if pattern is None:
            raise RegexValidationError("None", "Pattern cannot be None")

        verdict = RegexSafetyChecker.check_pattern(pattern)
        is_safe, warning = verdict
        if not is_safe:
            raise RegexValidationError(pattern, f"ReDoS risk: {warning}")

        try:
            compiled = re.compile(pattern, flags)
        except re.error as e:
            raise RegexValidationError(pattern, str(e)) from e
        return compiled

    @staticmethod
    def validate_patterns(patterns: list[str], flags: int = 0) -> list[re.Pattern[str]]:
        """Vet and compile each pattern in order, stopping at the first failure."""
        compiled_patterns: list[re.Pattern[str]] = []
        for source in patterns:
            compiled_patterns.append(RegexValidatorMixin.validate_pattern(source, flags))
        return compiled_patterns
|
|
764
|
+
|
|
765
|
+
|
|
766
|
+
class StreamingValidatorMixin:
    """Mixin for validators that support streaming/chunked processing."""

    default_chunk_size: int = 100_000

    def _validate_streaming(
        self,
        lf: pl.LazyFrame,
        chunk_size: int | None = None,
        validate_chunk: Callable[[pl.LazyFrame], list["ValidationIssue"]] | None = None,
    ) -> list["ValidationIssue"]:
        """Process validation in streaming chunks.

        Args:
            lf: Frame to validate.
            chunk_size: Rows per chunk; ``None`` or ``0`` selects
                ``default_chunk_size``.
            validate_chunk: Callable applied to each chunk; defaults to
                ``self.validate`` (expected to come from the host class).

        Returns:
            Issues merged across chunks by ``(column, issue_type)``. The
            first issue seen for a key is kept and its ``count`` is
            increased in place by later chunks; the other fields of later
            issues are discarded.

        Raises:
            ValueError: If the resolved chunk size is not positive. A
                negative size used to produce an empty chunk range and
                therefore a silent "no issues" result.
        """
        chunk_size = chunk_size or self.default_chunk_size
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")
        validate_fn = validate_chunk or self.validate  # type: ignore

        total_rows = lf.select(pl.len()).collect().item()
        if total_rows == 0:
            return []
        if total_rows <= chunk_size:
            # Single chunk: no merging needed.
            return validate_fn(lf)

        all_issues: dict[tuple[str, str], "ValidationIssue"] = {}
        for offset in range(0, total_rows, chunk_size):
            chunk_lf = lf.slice(offset, chunk_size)
            for issue in validate_fn(chunk_lf):
                key = (issue.column, issue.issue_type)
                if key in all_issues:
                    all_issues[key].count += issue.count
                else:
                    all_issues[key] = issue
        return list(all_issues.values())
|
|
797
|
+
|
|
798
|
+
|
|
799
|
+
class EnterpriseScaleSamplingMixin:
    """Mixin for validators that support enterprise-scale sampling.

    Provides automatic sampling for large datasets (100M+ rows) with
    statistical quality guarantees.

    Features:
        - Automatic scale detection and strategy selection
        - Memory-aware sampling with backpressure
        - Statistical confidence bounds on results
        - Time-budget aware processing

    Usage:
        class MyValidator(Validator, EnterpriseScaleSamplingMixin):
            # Enable sampling for datasets > 10M rows
            sampling_threshold: int = 10_000_000
            sampling_target_rows: int = 100_000
            sampling_quality: str = "standard"

            def validate(self, lf: pl.LazyFrame) -> list[ValidationIssue]:
                # Automatically sample if dataset is large
                sampled_lf, metrics = self._sample_for_validation(lf)

                # Validate on sampled data
                issues = self._do_validation(sampled_lf)

                # Extrapolate counts if sampled
                if metrics.is_sampled:
                    issues = self._extrapolate_issues(issues, metrics)

                return issues
    """

    # Sampling configuration (override in subclass)
    sampling_threshold: int = 10_000_000  # 10M rows
    sampling_target_rows: int = 100_000  # Target sample size
    sampling_quality: str = "standard"  # Quality level
    sampling_confidence: float = 0.95  # Confidence level
    sampling_margin_of_error: float = 0.05  # Acceptable error

    def _sample_for_validation(
        self,
        lf: pl.LazyFrame,
        target_rows: int | None = None,
    ) -> tuple[pl.LazyFrame, "SamplingInfo"]:
        """Sample data if it exceeds threshold.

        Rows are selected by hashing a row index and keeping those whose
        hash falls into the first ``threshold`` of 10000 buckets, so the
        achievable sampling ratios are multiples of 1/10000 (minimum
        1/10000).

        Args:
            lf: Input LazyFrame
            target_rows: Override target sample size (``None`` or ``0`` uses
                ``sampling_target_rows``)

        Returns:
            Tuple of (sampled LazyFrame, sampling info). When sampling is
            applied, ``sampling_ratio`` is the ratio the filter actually
            uses and ``sampled_rows`` is the expected (not counted) number
            of rows kept at that ratio.
        """
        # Get row count
        total_rows = lf.select(pl.len()).collect().item()

        # Check if sampling needed
        if total_rows <= self.sampling_threshold:
            return lf, SamplingInfo(
                is_sampled=False,
                original_rows=total_rows,
                sampled_rows=total_rows,
                sampling_ratio=1.0,
                confidence_level=1.0,
                margin_of_error=0.0,
            )

        # Determine target
        target = target_rows or self.sampling_target_rows
        target = min(target, total_rows)

        # Requested sampling ratio
        sample_ratio = target / total_rows

        # Apply sampling
        seed = getattr(self, "_sampling_seed", 42)
        buckets = 10000
        threshold = max(1, int(sample_ratio * buckets))

        # The filter keeps ``threshold`` of ``buckets`` hash buckets, which can
        # differ from the requested ratio (truncation, and the floor of one
        # bucket). Record what is actually applied so that extrapolated
        # counts are not biased.
        effective_ratio = threshold / buckets

        sampled_lf = (
            lf.with_row_index("__sample_idx")
            .filter(pl.col("__sample_idx").hash(seed) % buckets < threshold)
            .drop("__sample_idx")
        )

        return sampled_lf, SamplingInfo(
            is_sampled=True,
            original_rows=total_rows,
            sampled_rows=int(total_rows * effective_ratio),
            sampling_ratio=effective_ratio,
            confidence_level=self.sampling_confidence,
            margin_of_error=self.sampling_margin_of_error,
        )

    def _extrapolate_issues(
        self,
        issues: list["ValidationIssue"],
        sampling_info: "SamplingInfo",
    ) -> list["ValidationIssue"]:
        """Extrapolate issue counts from sample to population.

        The issues are modified in place: ``count`` is scaled (truncated to
        an int) and, when ``details`` is non-empty, a sampling note is
        appended to it.

        Args:
            issues: Issues found in sample
            sampling_info: Sampling information

        Returns:
            The same list, with extrapolated counts
        """
        if not sampling_info.is_sampled:
            return issues

        # Use the guarded property rather than dividing here, so a
        # non-positive ratio yields a factor of 1.0 instead of an error.
        extrapolation_factor = sampling_info.extrapolation_factor

        for issue in issues:
            # Extrapolate count
            original_count = issue.count
            extrapolated_count = int(original_count * extrapolation_factor)
            issue.count = extrapolated_count

            # Add sampling note to details
            if issue.details:
                issue.details = (
                    f"{issue.details} "
                    f"[sampled: {original_count} → estimated: {extrapolated_count}, "
                    f"confidence: {sampling_info.confidence_level:.0%}]"
                )

        return issues

    def _get_sampling_strategy(self, total_rows: int) -> str:
        """Get recommended sampling strategy for data size.

        Returns:
            ``"none"`` below 1M rows, ``"systematic"`` below 10M,
            ``"block"`` below 100M, otherwise ``"multi_stage"``.
        """
        if total_rows < 1_000_000:
            return "none"
        elif total_rows < 10_000_000:
            return "systematic"
        elif total_rows < 100_000_000:
            return "block"
        else:
            return "multi_stage"
|
|
938
|
+
|
|
939
|
+
|
|
940
|
+
@dataclass
class SamplingInfo:
    """Record of the sampling, if any, applied before validation.

    Attributes:
        is_sampled: Whether sampling was applied
        original_rows: Row count before sampling
        sampled_rows: Row count after sampling
        sampling_ratio: Sample size divided by original size
        confidence_level: Statistical confidence
        margin_of_error: Error margin
    """

    is_sampled: bool
    original_rows: int
    sampled_rows: int
    sampling_ratio: float
    confidence_level: float
    margin_of_error: float

    @property
    def extrapolation_factor(self) -> float:
        """Multiplier that scales a sample count up to a population estimate.

        Falls back to ``1.0`` when the ratio is zero or negative.
        """
        return 1.0 if self.sampling_ratio <= 0 else 1.0 / self.sampling_ratio

    def to_dict(self) -> dict[str, Any]:
        """Return all six fields as a plain dict, in declaration order."""
        field_names = (
            "is_sampled",
            "original_rows",
            "sampled_rows",
            "sampling_ratio",
            "confidence_level",
            "margin_of_error",
        )
        return {key: getattr(self, key) for key in field_names}
|
|
975
|
+
|
|
976
|
+
|
|
977
|
+
# ============================================================================
|
|
978
|
+
# Template Validators
|
|
979
|
+
# ============================================================================
|
|
980
|
+
|
|
981
|
+
class ColumnValidator(Validator):
    """Template for column-level validation.

    Subclasses implement ``check_column``; ``validate`` drives it over every
    target column.
    """

    @abstractmethod
    def check_column(
        self,
        lf: pl.LazyFrame,
        col: str,
        total_rows: int,
    ) -> ValidationIssue | None:
        """Check a single column. Implement in subclass.

        Args:
            lf: Frame being validated.
            col: Name of the column to check.
            total_rows: Row count of ``lf``, computed once by ``validate``.

        Returns:
            An issue, or ``None`` when the column is fine.
        """
        pass

    def validate(self, lf: pl.LazyFrame) -> list[ValidationIssue]:
        """Run ``check_column`` on each target column and collect the issues.

        An empty frame yields no issues. When ``config.graceful_degradation``
        is on, an error in one column is logged as a warning and the
        remaining columns are still checked; otherwise it propagates.

        Raises:
            ValidationTimeoutError: Always propagated, regardless of
                ``graceful_degradation``.
        """
        issues: list[ValidationIssue] = []
        columns = self._get_target_columns(lf)

        total_rows = lf.select(pl.len()).collect().item()

        if total_rows == 0:
            return issues

        for col in columns:
            try:
                issue = self.check_column(lf, col, total_rows)
                if issue:
                    issues.append(issue)
            except ValidationTimeoutError:
                # A timeout is not a per-column failure: swallowing it would
                # let the remaining columns run with no time limit at all.
                raise
            except Exception as e:
                if self.config.graceful_degradation:
                    self.logger.warning(f"Error checking column {col}: {e}")
                else:
                    raise

        return issues
|
|
1015
|
+
|
|
1016
|
+
|
|
1017
|
+
class AggregateValidator(Validator, NumericValidatorMixin):
    """Template for aggregate statistics validation.

    Subclasses implement ``check_aggregate``; ``validate`` computes the
    statistics for all numeric target columns and feeds them to it.
    """

    @abstractmethod
    def check_aggregate(
        self,
        col: str,
        stats: dict[str, Any],
        total_rows: int,
    ) -> ValidationIssue | None:
        """Check aggregate stats for a column. Implement in subclass.

        Args:
            col: Column name.
            stats: That column's statistics, keyed ``mean``, ``std``, ``min``,
                ``max``, ``sum``, ``median`` and ``count``.
            total_rows: Total row count of the frame.

        Returns:
            An issue, or ``None`` when the column is fine.
        """
        pass

    def _compute_stats(
        self,
        lf: pl.LazyFrame,
        columns: list[str],
    ) -> tuple[int, dict[str, dict[str, Any]]]:
        """Compute statistics for all columns in single query.

        Returns:
            ``(total_rows, stats)`` where ``stats[col]`` maps each statistic
            name to its value for that column.
        """
        exprs: list[pl.Expr] = [pl.len().alias("_total")]

        for col in columns:
            exprs.extend([
                pl.col(col).mean().alias(f"_mean_{col}"),
                pl.col(col).std().alias(f"_std_{col}"),
                pl.col(col).min().alias(f"_min_{col}"),
                pl.col(col).max().alias(f"_max_{col}"),
                pl.col(col).sum().alias(f"_sum_{col}"),
                pl.col(col).median().alias(f"_median_{col}"),
                pl.col(col).count().alias(f"_count_{col}"),
            ])

        # One collect for every aggregate of every column.
        result = lf.select(exprs).collect()
        total = result["_total"][0]

        stats: dict[str, dict[str, Any]] = {}
        for col in columns:
            stats[col] = {
                "mean": result[f"_mean_{col}"][0],
                "std": result[f"_std_{col}"][0],
                "min": result[f"_min_{col}"][0],
                "max": result[f"_max_{col}"][0],
                "sum": result[f"_sum_{col}"][0],
                "median": result[f"_median_{col}"][0],
                "count": result[f"_count_{col}"][0],
            }

        return total, stats

    def validate(self, lf: pl.LazyFrame) -> list[ValidationIssue]:
        """Run ``check_aggregate`` on each numeric target column.

        No numeric columns, or an empty frame, yields no issues. When
        ``config.graceful_degradation`` is on, an error in one column is
        logged as a warning and the remaining columns are still checked;
        otherwise it propagates.

        Raises:
            ValidationTimeoutError: Always propagated, regardless of
                ``graceful_degradation``.
        """
        issues: list[ValidationIssue] = []
        columns = self._get_numeric_columns(lf)

        if not columns:
            return issues

        total_rows, all_stats = self._compute_stats(lf, columns)

        if total_rows == 0:
            return issues

        for col in columns:
            try:
                issue = self.check_aggregate(col, all_stats[col], total_rows)
                if issue:
                    issues.append(issue)
            except ValidationTimeoutError:
                # A timeout is not a per-column failure: swallowing it would
                # let the remaining columns run with no time limit at all.
                raise
            except Exception as e:
                if self.config.graceful_degradation:
                    self.logger.warning(f"Error checking aggregate for {col}: {e}")
                else:
                    raise

        return issues
|