truthound 1.0.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- truthound/__init__.py +162 -0
- truthound/adapters.py +100 -0
- truthound/api.py +365 -0
- truthound/audit/__init__.py +248 -0
- truthound/audit/core.py +967 -0
- truthound/audit/filters.py +620 -0
- truthound/audit/formatters.py +707 -0
- truthound/audit/logger.py +902 -0
- truthound/audit/middleware.py +571 -0
- truthound/audit/storage.py +1083 -0
- truthound/benchmark/__init__.py +123 -0
- truthound/benchmark/base.py +757 -0
- truthound/benchmark/comparison.py +635 -0
- truthound/benchmark/generators.py +706 -0
- truthound/benchmark/reporters.py +718 -0
- truthound/benchmark/runner.py +635 -0
- truthound/benchmark/scenarios.py +712 -0
- truthound/cache.py +252 -0
- truthound/checkpoint/__init__.py +136 -0
- truthound/checkpoint/actions/__init__.py +164 -0
- truthound/checkpoint/actions/base.py +324 -0
- truthound/checkpoint/actions/custom.py +234 -0
- truthound/checkpoint/actions/discord_notify.py +290 -0
- truthound/checkpoint/actions/email_notify.py +405 -0
- truthound/checkpoint/actions/github_action.py +406 -0
- truthound/checkpoint/actions/opsgenie.py +1499 -0
- truthound/checkpoint/actions/pagerduty.py +226 -0
- truthound/checkpoint/actions/slack_notify.py +233 -0
- truthound/checkpoint/actions/store_result.py +249 -0
- truthound/checkpoint/actions/teams_notify.py +1570 -0
- truthound/checkpoint/actions/telegram_notify.py +419 -0
- truthound/checkpoint/actions/update_docs.py +552 -0
- truthound/checkpoint/actions/webhook.py +293 -0
- truthound/checkpoint/analytics/__init__.py +147 -0
- truthound/checkpoint/analytics/aggregations/__init__.py +23 -0
- truthound/checkpoint/analytics/aggregations/rollup.py +481 -0
- truthound/checkpoint/analytics/aggregations/time_bucket.py +306 -0
- truthound/checkpoint/analytics/analyzers/__init__.py +17 -0
- truthound/checkpoint/analytics/analyzers/anomaly.py +386 -0
- truthound/checkpoint/analytics/analyzers/base.py +270 -0
- truthound/checkpoint/analytics/analyzers/forecast.py +421 -0
- truthound/checkpoint/analytics/analyzers/trend.py +314 -0
- truthound/checkpoint/analytics/models.py +292 -0
- truthound/checkpoint/analytics/protocols.py +549 -0
- truthound/checkpoint/analytics/service.py +718 -0
- truthound/checkpoint/analytics/stores/__init__.py +16 -0
- truthound/checkpoint/analytics/stores/base.py +306 -0
- truthound/checkpoint/analytics/stores/memory_store.py +353 -0
- truthound/checkpoint/analytics/stores/sqlite_store.py +557 -0
- truthound/checkpoint/analytics/stores/timescale_store.py +501 -0
- truthound/checkpoint/async_actions.py +794 -0
- truthound/checkpoint/async_base.py +708 -0
- truthound/checkpoint/async_checkpoint.py +617 -0
- truthound/checkpoint/async_runner.py +639 -0
- truthound/checkpoint/checkpoint.py +527 -0
- truthound/checkpoint/ci/__init__.py +61 -0
- truthound/checkpoint/ci/detector.py +355 -0
- truthound/checkpoint/ci/reporter.py +436 -0
- truthound/checkpoint/ci/templates.py +454 -0
- truthound/checkpoint/circuitbreaker/__init__.py +133 -0
- truthound/checkpoint/circuitbreaker/breaker.py +542 -0
- truthound/checkpoint/circuitbreaker/core.py +252 -0
- truthound/checkpoint/circuitbreaker/detection.py +459 -0
- truthound/checkpoint/circuitbreaker/middleware.py +389 -0
- truthound/checkpoint/circuitbreaker/registry.py +357 -0
- truthound/checkpoint/distributed/__init__.py +139 -0
- truthound/checkpoint/distributed/backends/__init__.py +35 -0
- truthound/checkpoint/distributed/backends/celery_backend.py +503 -0
- truthound/checkpoint/distributed/backends/kubernetes_backend.py +696 -0
- truthound/checkpoint/distributed/backends/local_backend.py +397 -0
- truthound/checkpoint/distributed/backends/ray_backend.py +625 -0
- truthound/checkpoint/distributed/base.py +774 -0
- truthound/checkpoint/distributed/orchestrator.py +765 -0
- truthound/checkpoint/distributed/protocols.py +842 -0
- truthound/checkpoint/distributed/registry.py +449 -0
- truthound/checkpoint/idempotency/__init__.py +120 -0
- truthound/checkpoint/idempotency/core.py +295 -0
- truthound/checkpoint/idempotency/fingerprint.py +454 -0
- truthound/checkpoint/idempotency/locking.py +604 -0
- truthound/checkpoint/idempotency/service.py +592 -0
- truthound/checkpoint/idempotency/stores.py +653 -0
- truthound/checkpoint/monitoring/__init__.py +134 -0
- truthound/checkpoint/monitoring/aggregators/__init__.py +15 -0
- truthound/checkpoint/monitoring/aggregators/base.py +372 -0
- truthound/checkpoint/monitoring/aggregators/realtime.py +300 -0
- truthound/checkpoint/monitoring/aggregators/window.py +493 -0
- truthound/checkpoint/monitoring/collectors/__init__.py +17 -0
- truthound/checkpoint/monitoring/collectors/base.py +257 -0
- truthound/checkpoint/monitoring/collectors/memory_collector.py +617 -0
- truthound/checkpoint/monitoring/collectors/prometheus_collector.py +451 -0
- truthound/checkpoint/monitoring/collectors/redis_collector.py +518 -0
- truthound/checkpoint/monitoring/events.py +410 -0
- truthound/checkpoint/monitoring/protocols.py +636 -0
- truthound/checkpoint/monitoring/service.py +578 -0
- truthound/checkpoint/monitoring/views/__init__.py +17 -0
- truthound/checkpoint/monitoring/views/base.py +172 -0
- truthound/checkpoint/monitoring/views/queue_view.py +220 -0
- truthound/checkpoint/monitoring/views/task_view.py +240 -0
- truthound/checkpoint/monitoring/views/worker_view.py +263 -0
- truthound/checkpoint/registry.py +337 -0
- truthound/checkpoint/runner.py +356 -0
- truthound/checkpoint/transaction/__init__.py +133 -0
- truthound/checkpoint/transaction/base.py +389 -0
- truthound/checkpoint/transaction/compensatable.py +537 -0
- truthound/checkpoint/transaction/coordinator.py +576 -0
- truthound/checkpoint/transaction/executor.py +622 -0
- truthound/checkpoint/transaction/idempotency.py +534 -0
- truthound/checkpoint/transaction/saga/__init__.py +143 -0
- truthound/checkpoint/transaction/saga/builder.py +584 -0
- truthound/checkpoint/transaction/saga/definition.py +515 -0
- truthound/checkpoint/transaction/saga/event_store.py +542 -0
- truthound/checkpoint/transaction/saga/patterns.py +833 -0
- truthound/checkpoint/transaction/saga/runner.py +718 -0
- truthound/checkpoint/transaction/saga/state_machine.py +793 -0
- truthound/checkpoint/transaction/saga/strategies.py +780 -0
- truthound/checkpoint/transaction/saga/testing.py +886 -0
- truthound/checkpoint/triggers/__init__.py +58 -0
- truthound/checkpoint/triggers/base.py +237 -0
- truthound/checkpoint/triggers/event.py +385 -0
- truthound/checkpoint/triggers/schedule.py +355 -0
- truthound/cli.py +2358 -0
- truthound/cli_modules/__init__.py +124 -0
- truthound/cli_modules/advanced/__init__.py +45 -0
- truthound/cli_modules/advanced/benchmark.py +343 -0
- truthound/cli_modules/advanced/docs.py +225 -0
- truthound/cli_modules/advanced/lineage.py +209 -0
- truthound/cli_modules/advanced/ml.py +320 -0
- truthound/cli_modules/advanced/realtime.py +196 -0
- truthound/cli_modules/checkpoint/__init__.py +46 -0
- truthound/cli_modules/checkpoint/init.py +114 -0
- truthound/cli_modules/checkpoint/list.py +71 -0
- truthound/cli_modules/checkpoint/run.py +159 -0
- truthound/cli_modules/checkpoint/validate.py +67 -0
- truthound/cli_modules/common/__init__.py +71 -0
- truthound/cli_modules/common/errors.py +414 -0
- truthound/cli_modules/common/options.py +419 -0
- truthound/cli_modules/common/output.py +507 -0
- truthound/cli_modules/common/protocol.py +552 -0
- truthound/cli_modules/core/__init__.py +48 -0
- truthound/cli_modules/core/check.py +123 -0
- truthound/cli_modules/core/compare.py +104 -0
- truthound/cli_modules/core/learn.py +57 -0
- truthound/cli_modules/core/mask.py +77 -0
- truthound/cli_modules/core/profile.py +65 -0
- truthound/cli_modules/core/scan.py +61 -0
- truthound/cli_modules/profiler/__init__.py +51 -0
- truthound/cli_modules/profiler/auto_profile.py +175 -0
- truthound/cli_modules/profiler/metadata.py +107 -0
- truthound/cli_modules/profiler/suite.py +283 -0
- truthound/cli_modules/registry.py +431 -0
- truthound/cli_modules/scaffolding/__init__.py +89 -0
- truthound/cli_modules/scaffolding/base.py +631 -0
- truthound/cli_modules/scaffolding/commands.py +545 -0
- truthound/cli_modules/scaffolding/plugins.py +1072 -0
- truthound/cli_modules/scaffolding/reporters.py +594 -0
- truthound/cli_modules/scaffolding/validators.py +1127 -0
- truthound/common/__init__.py +18 -0
- truthound/common/resilience/__init__.py +130 -0
- truthound/common/resilience/bulkhead.py +266 -0
- truthound/common/resilience/circuit_breaker.py +516 -0
- truthound/common/resilience/composite.py +332 -0
- truthound/common/resilience/config.py +292 -0
- truthound/common/resilience/protocols.py +217 -0
- truthound/common/resilience/rate_limiter.py +404 -0
- truthound/common/resilience/retry.py +341 -0
- truthound/datadocs/__init__.py +260 -0
- truthound/datadocs/base.py +571 -0
- truthound/datadocs/builder.py +761 -0
- truthound/datadocs/charts.py +764 -0
- truthound/datadocs/dashboard/__init__.py +63 -0
- truthound/datadocs/dashboard/app.py +576 -0
- truthound/datadocs/dashboard/components.py +584 -0
- truthound/datadocs/dashboard/state.py +240 -0
- truthound/datadocs/engine/__init__.py +46 -0
- truthound/datadocs/engine/context.py +376 -0
- truthound/datadocs/engine/pipeline.py +618 -0
- truthound/datadocs/engine/registry.py +469 -0
- truthound/datadocs/exporters/__init__.py +49 -0
- truthound/datadocs/exporters/base.py +198 -0
- truthound/datadocs/exporters/html.py +178 -0
- truthound/datadocs/exporters/json_exporter.py +253 -0
- truthound/datadocs/exporters/markdown.py +284 -0
- truthound/datadocs/exporters/pdf.py +392 -0
- truthound/datadocs/i18n/__init__.py +86 -0
- truthound/datadocs/i18n/catalog.py +960 -0
- truthound/datadocs/i18n/formatting.py +505 -0
- truthound/datadocs/i18n/loader.py +256 -0
- truthound/datadocs/i18n/plurals.py +378 -0
- truthound/datadocs/renderers/__init__.py +42 -0
- truthound/datadocs/renderers/base.py +401 -0
- truthound/datadocs/renderers/custom.py +342 -0
- truthound/datadocs/renderers/jinja.py +697 -0
- truthound/datadocs/sections.py +736 -0
- truthound/datadocs/styles.py +931 -0
- truthound/datadocs/themes/__init__.py +101 -0
- truthound/datadocs/themes/base.py +336 -0
- truthound/datadocs/themes/default.py +417 -0
- truthound/datadocs/themes/enterprise.py +419 -0
- truthound/datadocs/themes/loader.py +336 -0
- truthound/datadocs/themes.py +301 -0
- truthound/datadocs/transformers/__init__.py +57 -0
- truthound/datadocs/transformers/base.py +268 -0
- truthound/datadocs/transformers/enrichers.py +544 -0
- truthound/datadocs/transformers/filters.py +447 -0
- truthound/datadocs/transformers/i18n.py +468 -0
- truthound/datadocs/versioning/__init__.py +62 -0
- truthound/datadocs/versioning/diff.py +639 -0
- truthound/datadocs/versioning/storage.py +497 -0
- truthound/datadocs/versioning/version.py +358 -0
- truthound/datasources/__init__.py +223 -0
- truthound/datasources/_async_protocols.py +222 -0
- truthound/datasources/_protocols.py +159 -0
- truthound/datasources/adapters.py +428 -0
- truthound/datasources/async_base.py +599 -0
- truthound/datasources/async_factory.py +511 -0
- truthound/datasources/base.py +516 -0
- truthound/datasources/factory.py +433 -0
- truthound/datasources/nosql/__init__.py +47 -0
- truthound/datasources/nosql/base.py +487 -0
- truthound/datasources/nosql/elasticsearch.py +801 -0
- truthound/datasources/nosql/mongodb.py +636 -0
- truthound/datasources/pandas_optimized.py +582 -0
- truthound/datasources/pandas_source.py +216 -0
- truthound/datasources/polars_source.py +395 -0
- truthound/datasources/spark_source.py +479 -0
- truthound/datasources/sql/__init__.py +154 -0
- truthound/datasources/sql/base.py +710 -0
- truthound/datasources/sql/bigquery.py +410 -0
- truthound/datasources/sql/cloud_base.py +199 -0
- truthound/datasources/sql/databricks.py +471 -0
- truthound/datasources/sql/mysql.py +316 -0
- truthound/datasources/sql/oracle.py +427 -0
- truthound/datasources/sql/postgresql.py +321 -0
- truthound/datasources/sql/redshift.py +479 -0
- truthound/datasources/sql/snowflake.py +439 -0
- truthound/datasources/sql/sqlite.py +286 -0
- truthound/datasources/sql/sqlserver.py +437 -0
- truthound/datasources/streaming/__init__.py +47 -0
- truthound/datasources/streaming/base.py +350 -0
- truthound/datasources/streaming/kafka.py +670 -0
- truthound/decorators.py +98 -0
- truthound/docs/__init__.py +69 -0
- truthound/docs/extractor.py +971 -0
- truthound/docs/generator.py +601 -0
- truthound/docs/parser.py +1037 -0
- truthound/docs/renderer.py +999 -0
- truthound/drift/__init__.py +22 -0
- truthound/drift/compare.py +189 -0
- truthound/drift/detectors.py +464 -0
- truthound/drift/report.py +160 -0
- truthound/execution/__init__.py +65 -0
- truthound/execution/_protocols.py +324 -0
- truthound/execution/base.py +576 -0
- truthound/execution/distributed/__init__.py +179 -0
- truthound/execution/distributed/aggregations.py +731 -0
- truthound/execution/distributed/arrow_bridge.py +817 -0
- truthound/execution/distributed/base.py +550 -0
- truthound/execution/distributed/dask_engine.py +976 -0
- truthound/execution/distributed/mixins.py +766 -0
- truthound/execution/distributed/protocols.py +756 -0
- truthound/execution/distributed/ray_engine.py +1127 -0
- truthound/execution/distributed/registry.py +446 -0
- truthound/execution/distributed/spark_engine.py +1011 -0
- truthound/execution/distributed/validator_adapter.py +682 -0
- truthound/execution/pandas_engine.py +401 -0
- truthound/execution/polars_engine.py +497 -0
- truthound/execution/pushdown/__init__.py +230 -0
- truthound/execution/pushdown/ast.py +1550 -0
- truthound/execution/pushdown/builder.py +1550 -0
- truthound/execution/pushdown/dialects.py +1072 -0
- truthound/execution/pushdown/executor.py +829 -0
- truthound/execution/pushdown/optimizer.py +1041 -0
- truthound/execution/sql_engine.py +518 -0
- truthound/infrastructure/__init__.py +189 -0
- truthound/infrastructure/audit.py +1515 -0
- truthound/infrastructure/config.py +1133 -0
- truthound/infrastructure/encryption.py +1132 -0
- truthound/infrastructure/logging.py +1503 -0
- truthound/infrastructure/metrics.py +1220 -0
- truthound/lineage/__init__.py +89 -0
- truthound/lineage/base.py +746 -0
- truthound/lineage/impact_analysis.py +474 -0
- truthound/lineage/integrations/__init__.py +22 -0
- truthound/lineage/integrations/openlineage.py +548 -0
- truthound/lineage/tracker.py +512 -0
- truthound/lineage/visualization/__init__.py +33 -0
- truthound/lineage/visualization/protocols.py +145 -0
- truthound/lineage/visualization/renderers/__init__.py +20 -0
- truthound/lineage/visualization/renderers/cytoscape.py +329 -0
- truthound/lineage/visualization/renderers/d3.py +331 -0
- truthound/lineage/visualization/renderers/graphviz.py +276 -0
- truthound/lineage/visualization/renderers/mermaid.py +308 -0
- truthound/maskers.py +113 -0
- truthound/ml/__init__.py +124 -0
- truthound/ml/anomaly_models/__init__.py +31 -0
- truthound/ml/anomaly_models/ensemble.py +362 -0
- truthound/ml/anomaly_models/isolation_forest.py +444 -0
- truthound/ml/anomaly_models/statistical.py +392 -0
- truthound/ml/base.py +1178 -0
- truthound/ml/drift_detection/__init__.py +26 -0
- truthound/ml/drift_detection/concept.py +381 -0
- truthound/ml/drift_detection/distribution.py +361 -0
- truthound/ml/drift_detection/feature.py +442 -0
- truthound/ml/drift_detection/multivariate.py +495 -0
- truthound/ml/monitoring/__init__.py +88 -0
- truthound/ml/monitoring/alerting/__init__.py +33 -0
- truthound/ml/monitoring/alerting/handlers.py +427 -0
- truthound/ml/monitoring/alerting/rules.py +508 -0
- truthound/ml/monitoring/collectors/__init__.py +19 -0
- truthound/ml/monitoring/collectors/composite.py +105 -0
- truthound/ml/monitoring/collectors/drift.py +324 -0
- truthound/ml/monitoring/collectors/performance.py +179 -0
- truthound/ml/monitoring/collectors/quality.py +369 -0
- truthound/ml/monitoring/monitor.py +536 -0
- truthound/ml/monitoring/protocols.py +451 -0
- truthound/ml/monitoring/stores/__init__.py +15 -0
- truthound/ml/monitoring/stores/memory.py +201 -0
- truthound/ml/monitoring/stores/prometheus.py +296 -0
- truthound/ml/rule_learning/__init__.py +25 -0
- truthound/ml/rule_learning/constraint_miner.py +443 -0
- truthound/ml/rule_learning/pattern_learner.py +499 -0
- truthound/ml/rule_learning/profile_learner.py +462 -0
- truthound/multitenancy/__init__.py +326 -0
- truthound/multitenancy/core.py +852 -0
- truthound/multitenancy/integration.py +597 -0
- truthound/multitenancy/isolation.py +630 -0
- truthound/multitenancy/manager.py +770 -0
- truthound/multitenancy/middleware.py +765 -0
- truthound/multitenancy/quota.py +537 -0
- truthound/multitenancy/resolvers.py +603 -0
- truthound/multitenancy/storage.py +703 -0
- truthound/observability/__init__.py +307 -0
- truthound/observability/context.py +531 -0
- truthound/observability/instrumentation.py +611 -0
- truthound/observability/logging.py +887 -0
- truthound/observability/metrics.py +1157 -0
- truthound/observability/tracing/__init__.py +178 -0
- truthound/observability/tracing/baggage.py +310 -0
- truthound/observability/tracing/config.py +426 -0
- truthound/observability/tracing/exporter.py +787 -0
- truthound/observability/tracing/integration.py +1018 -0
- truthound/observability/tracing/otel/__init__.py +146 -0
- truthound/observability/tracing/otel/adapter.py +982 -0
- truthound/observability/tracing/otel/bridge.py +1177 -0
- truthound/observability/tracing/otel/compat.py +681 -0
- truthound/observability/tracing/otel/config.py +691 -0
- truthound/observability/tracing/otel/detection.py +327 -0
- truthound/observability/tracing/otel/protocols.py +426 -0
- truthound/observability/tracing/processor.py +561 -0
- truthound/observability/tracing/propagator.py +757 -0
- truthound/observability/tracing/provider.py +569 -0
- truthound/observability/tracing/resource.py +515 -0
- truthound/observability/tracing/sampler.py +487 -0
- truthound/observability/tracing/span.py +676 -0
- truthound/plugins/__init__.py +198 -0
- truthound/plugins/base.py +599 -0
- truthound/plugins/cli.py +680 -0
- truthound/plugins/dependencies/__init__.py +42 -0
- truthound/plugins/dependencies/graph.py +422 -0
- truthound/plugins/dependencies/resolver.py +417 -0
- truthound/plugins/discovery.py +379 -0
- truthound/plugins/docs/__init__.py +46 -0
- truthound/plugins/docs/extractor.py +444 -0
- truthound/plugins/docs/renderer.py +499 -0
- truthound/plugins/enterprise_manager.py +877 -0
- truthound/plugins/examples/__init__.py +19 -0
- truthound/plugins/examples/custom_validators.py +317 -0
- truthound/plugins/examples/slack_notifier.py +312 -0
- truthound/plugins/examples/xml_reporter.py +254 -0
- truthound/plugins/hooks.py +558 -0
- truthound/plugins/lifecycle/__init__.py +43 -0
- truthound/plugins/lifecycle/hot_reload.py +402 -0
- truthound/plugins/lifecycle/manager.py +371 -0
- truthound/plugins/manager.py +736 -0
- truthound/plugins/registry.py +338 -0
- truthound/plugins/security/__init__.py +93 -0
- truthound/plugins/security/exceptions.py +332 -0
- truthound/plugins/security/policies.py +348 -0
- truthound/plugins/security/protocols.py +643 -0
- truthound/plugins/security/sandbox/__init__.py +45 -0
- truthound/plugins/security/sandbox/context.py +158 -0
- truthound/plugins/security/sandbox/engines/__init__.py +19 -0
- truthound/plugins/security/sandbox/engines/container.py +379 -0
- truthound/plugins/security/sandbox/engines/noop.py +144 -0
- truthound/plugins/security/sandbox/engines/process.py +336 -0
- truthound/plugins/security/sandbox/factory.py +211 -0
- truthound/plugins/security/signing/__init__.py +57 -0
- truthound/plugins/security/signing/service.py +330 -0
- truthound/plugins/security/signing/trust_store.py +368 -0
- truthound/plugins/security/signing/verifier.py +459 -0
- truthound/plugins/versioning/__init__.py +41 -0
- truthound/plugins/versioning/constraints.py +297 -0
- truthound/plugins/versioning/resolver.py +329 -0
- truthound/profiler/__init__.py +1729 -0
- truthound/profiler/_lazy.py +452 -0
- truthound/profiler/ab_testing/__init__.py +80 -0
- truthound/profiler/ab_testing/analysis.py +449 -0
- truthound/profiler/ab_testing/base.py +257 -0
- truthound/profiler/ab_testing/experiment.py +395 -0
- truthound/profiler/ab_testing/tracking.py +368 -0
- truthound/profiler/auto_threshold.py +1170 -0
- truthound/profiler/base.py +579 -0
- truthound/profiler/cache_patterns.py +911 -0
- truthound/profiler/caching.py +1303 -0
- truthound/profiler/column_profiler.py +712 -0
- truthound/profiler/comparison.py +1007 -0
- truthound/profiler/custom_patterns.py +1170 -0
- truthound/profiler/dashboard/__init__.py +50 -0
- truthound/profiler/dashboard/app.py +476 -0
- truthound/profiler/dashboard/components.py +457 -0
- truthound/profiler/dashboard/config.py +72 -0
- truthound/profiler/distributed/__init__.py +83 -0
- truthound/profiler/distributed/base.py +281 -0
- truthound/profiler/distributed/dask_backend.py +498 -0
- truthound/profiler/distributed/local_backend.py +293 -0
- truthound/profiler/distributed/profiler.py +304 -0
- truthound/profiler/distributed/ray_backend.py +374 -0
- truthound/profiler/distributed/spark_backend.py +375 -0
- truthound/profiler/distributed.py +1366 -0
- truthound/profiler/enterprise_sampling.py +1065 -0
- truthound/profiler/errors.py +488 -0
- truthound/profiler/evolution/__init__.py +91 -0
- truthound/profiler/evolution/alerts.py +426 -0
- truthound/profiler/evolution/changes.py +206 -0
- truthound/profiler/evolution/compatibility.py +365 -0
- truthound/profiler/evolution/detector.py +372 -0
- truthound/profiler/evolution/protocols.py +121 -0
- truthound/profiler/generators/__init__.py +48 -0
- truthound/profiler/generators/base.py +384 -0
- truthound/profiler/generators/ml_rules.py +375 -0
- truthound/profiler/generators/pattern_rules.py +384 -0
- truthound/profiler/generators/schema_rules.py +267 -0
- truthound/profiler/generators/stats_rules.py +324 -0
- truthound/profiler/generators/suite_generator.py +857 -0
- truthound/profiler/i18n.py +1542 -0
- truthound/profiler/incremental.py +554 -0
- truthound/profiler/incremental_validation.py +1710 -0
- truthound/profiler/integration/__init__.py +73 -0
- truthound/profiler/integration/adapters.py +345 -0
- truthound/profiler/integration/context.py +371 -0
- truthound/profiler/integration/executor.py +527 -0
- truthound/profiler/integration/naming.py +75 -0
- truthound/profiler/integration/protocols.py +243 -0
- truthound/profiler/memory.py +1185 -0
- truthound/profiler/migration/__init__.py +60 -0
- truthound/profiler/migration/base.py +345 -0
- truthound/profiler/migration/manager.py +444 -0
- truthound/profiler/migration/v1_0_to_v1_1.py +484 -0
- truthound/profiler/ml/__init__.py +73 -0
- truthound/profiler/ml/base.py +244 -0
- truthound/profiler/ml/classifier.py +507 -0
- truthound/profiler/ml/feature_extraction.py +604 -0
- truthound/profiler/ml/pretrained.py +448 -0
- truthound/profiler/ml_inference.py +1276 -0
- truthound/profiler/native_patterns.py +815 -0
- truthound/profiler/observability.py +1184 -0
- truthound/profiler/process_timeout.py +1566 -0
- truthound/profiler/progress.py +568 -0
- truthound/profiler/progress_callbacks.py +1734 -0
- truthound/profiler/quality.py +1345 -0
- truthound/profiler/resilience.py +1180 -0
- truthound/profiler/sampled_matcher.py +794 -0
- truthound/profiler/sampling.py +1288 -0
- truthound/profiler/scheduling/__init__.py +82 -0
- truthound/profiler/scheduling/protocols.py +214 -0
- truthound/profiler/scheduling/scheduler.py +474 -0
- truthound/profiler/scheduling/storage.py +457 -0
- truthound/profiler/scheduling/triggers.py +449 -0
- truthound/profiler/schema.py +603 -0
- truthound/profiler/streaming.py +685 -0
- truthound/profiler/streaming_patterns.py +1354 -0
- truthound/profiler/suite_cli.py +625 -0
- truthound/profiler/suite_config.py +789 -0
- truthound/profiler/suite_export.py +1268 -0
- truthound/profiler/table_profiler.py +547 -0
- truthound/profiler/timeout.py +565 -0
- truthound/profiler/validation.py +1532 -0
- truthound/profiler/visualization/__init__.py +118 -0
- truthound/profiler/visualization/base.py +346 -0
- truthound/profiler/visualization/generator.py +1259 -0
- truthound/profiler/visualization/plotly_renderer.py +811 -0
- truthound/profiler/visualization/renderers.py +669 -0
- truthound/profiler/visualization/sections.py +540 -0
- truthound/profiler/visualization.py +2122 -0
- truthound/profiler/yaml_validation.py +1151 -0
- truthound/py.typed +0 -0
- truthound/ratelimit/__init__.py +248 -0
- truthound/ratelimit/algorithms.py +1108 -0
- truthound/ratelimit/core.py +573 -0
- truthound/ratelimit/integration.py +532 -0
- truthound/ratelimit/limiter.py +663 -0
- truthound/ratelimit/middleware.py +700 -0
- truthound/ratelimit/policy.py +792 -0
- truthound/ratelimit/storage.py +763 -0
- truthound/rbac/__init__.py +340 -0
- truthound/rbac/core.py +976 -0
- truthound/rbac/integration.py +760 -0
- truthound/rbac/manager.py +1052 -0
- truthound/rbac/middleware.py +842 -0
- truthound/rbac/policy.py +954 -0
- truthound/rbac/storage.py +878 -0
- truthound/realtime/__init__.py +141 -0
- truthound/realtime/adapters/__init__.py +43 -0
- truthound/realtime/adapters/base.py +533 -0
- truthound/realtime/adapters/kafka.py +487 -0
- truthound/realtime/adapters/kinesis.py +479 -0
- truthound/realtime/adapters/mock.py +243 -0
- truthound/realtime/base.py +553 -0
- truthound/realtime/factory.py +382 -0
- truthound/realtime/incremental.py +660 -0
- truthound/realtime/processing/__init__.py +67 -0
- truthound/realtime/processing/exactly_once.py +575 -0
- truthound/realtime/processing/state.py +547 -0
- truthound/realtime/processing/windows.py +647 -0
- truthound/realtime/protocols.py +569 -0
- truthound/realtime/streaming.py +605 -0
- truthound/realtime/testing/__init__.py +32 -0
- truthound/realtime/testing/containers.py +615 -0
- truthound/realtime/testing/fixtures.py +484 -0
- truthound/report.py +280 -0
- truthound/reporters/__init__.py +46 -0
- truthound/reporters/_protocols.py +30 -0
- truthound/reporters/base.py +324 -0
- truthound/reporters/ci/__init__.py +66 -0
- truthound/reporters/ci/azure.py +436 -0
- truthound/reporters/ci/base.py +509 -0
- truthound/reporters/ci/bitbucket.py +567 -0
- truthound/reporters/ci/circleci.py +547 -0
- truthound/reporters/ci/detection.py +364 -0
- truthound/reporters/ci/factory.py +182 -0
- truthound/reporters/ci/github.py +388 -0
- truthound/reporters/ci/gitlab.py +471 -0
- truthound/reporters/ci/jenkins.py +525 -0
- truthound/reporters/console_reporter.py +299 -0
- truthound/reporters/factory.py +211 -0
- truthound/reporters/html_reporter.py +524 -0
- truthound/reporters/json_reporter.py +256 -0
- truthound/reporters/markdown_reporter.py +280 -0
- truthound/reporters/sdk/__init__.py +174 -0
- truthound/reporters/sdk/builder.py +558 -0
- truthound/reporters/sdk/mixins.py +1150 -0
- truthound/reporters/sdk/schema.py +1493 -0
- truthound/reporters/sdk/templates.py +666 -0
- truthound/reporters/sdk/testing.py +968 -0
- truthound/scanners.py +170 -0
- truthound/scheduling/__init__.py +122 -0
- truthound/scheduling/cron.py +1136 -0
- truthound/scheduling/presets.py +212 -0
- truthound/schema.py +275 -0
- truthound/secrets/__init__.py +173 -0
- truthound/secrets/base.py +618 -0
- truthound/secrets/cloud.py +682 -0
- truthound/secrets/integration.py +507 -0
- truthound/secrets/manager.py +633 -0
- truthound/secrets/oidc/__init__.py +172 -0
- truthound/secrets/oidc/base.py +902 -0
- truthound/secrets/oidc/credential_provider.py +623 -0
- truthound/secrets/oidc/exchangers.py +1001 -0
- truthound/secrets/oidc/github/__init__.py +110 -0
- truthound/secrets/oidc/github/claims.py +718 -0
- truthound/secrets/oidc/github/enhanced_provider.py +693 -0
- truthound/secrets/oidc/github/trust_policy.py +742 -0
- truthound/secrets/oidc/github/verification.py +723 -0
- truthound/secrets/oidc/github/workflow.py +691 -0
- truthound/secrets/oidc/providers.py +825 -0
- truthound/secrets/providers.py +506 -0
- truthound/secrets/resolver.py +495 -0
- truthound/stores/__init__.py +177 -0
- truthound/stores/backends/__init__.py +18 -0
- truthound/stores/backends/_protocols.py +340 -0
- truthound/stores/backends/azure_blob.py +530 -0
- truthound/stores/backends/concurrent_filesystem.py +915 -0
- truthound/stores/backends/connection_pool.py +1365 -0
- truthound/stores/backends/database.py +743 -0
- truthound/stores/backends/filesystem.py +538 -0
- truthound/stores/backends/gcs.py +399 -0
- truthound/stores/backends/memory.py +354 -0
- truthound/stores/backends/s3.py +434 -0
- truthound/stores/backpressure/__init__.py +84 -0
- truthound/stores/backpressure/base.py +375 -0
- truthound/stores/backpressure/circuit_breaker.py +434 -0
- truthound/stores/backpressure/monitor.py +376 -0
- truthound/stores/backpressure/strategies.py +677 -0
- truthound/stores/base.py +551 -0
- truthound/stores/batching/__init__.py +65 -0
- truthound/stores/batching/base.py +305 -0
- truthound/stores/batching/buffer.py +370 -0
- truthound/stores/batching/store.py +248 -0
- truthound/stores/batching/writer.py +521 -0
- truthound/stores/caching/__init__.py +60 -0
- truthound/stores/caching/backends.py +684 -0
- truthound/stores/caching/base.py +356 -0
- truthound/stores/caching/store.py +305 -0
- truthound/stores/compression/__init__.py +193 -0
- truthound/stores/compression/adaptive.py +694 -0
- truthound/stores/compression/base.py +514 -0
- truthound/stores/compression/pipeline.py +868 -0
- truthound/stores/compression/providers.py +672 -0
- truthound/stores/compression/streaming.py +832 -0
- truthound/stores/concurrency/__init__.py +81 -0
- truthound/stores/concurrency/atomic.py +556 -0
- truthound/stores/concurrency/index.py +775 -0
- truthound/stores/concurrency/locks.py +576 -0
- truthound/stores/concurrency/manager.py +482 -0
- truthound/stores/encryption/__init__.py +297 -0
- truthound/stores/encryption/base.py +952 -0
- truthound/stores/encryption/keys.py +1191 -0
- truthound/stores/encryption/pipeline.py +903 -0
- truthound/stores/encryption/providers.py +953 -0
- truthound/stores/encryption/streaming.py +950 -0
- truthound/stores/expectations.py +227 -0
- truthound/stores/factory.py +246 -0
- truthound/stores/migration/__init__.py +75 -0
- truthound/stores/migration/base.py +480 -0
- truthound/stores/migration/manager.py +347 -0
- truthound/stores/migration/registry.py +382 -0
- truthound/stores/migration/store.py +559 -0
- truthound/stores/observability/__init__.py +106 -0
- truthound/stores/observability/audit.py +718 -0
- truthound/stores/observability/config.py +270 -0
- truthound/stores/observability/factory.py +208 -0
- truthound/stores/observability/metrics.py +636 -0
- truthound/stores/observability/protocols.py +410 -0
- truthound/stores/observability/store.py +570 -0
- truthound/stores/observability/tracing.py +784 -0
- truthound/stores/replication/__init__.py +76 -0
- truthound/stores/replication/base.py +260 -0
- truthound/stores/replication/monitor.py +269 -0
- truthound/stores/replication/store.py +439 -0
- truthound/stores/replication/syncer.py +391 -0
- truthound/stores/results.py +359 -0
- truthound/stores/retention/__init__.py +77 -0
- truthound/stores/retention/base.py +378 -0
- truthound/stores/retention/policies.py +621 -0
- truthound/stores/retention/scheduler.py +279 -0
- truthound/stores/retention/store.py +526 -0
- truthound/stores/streaming/__init__.py +138 -0
- truthound/stores/streaming/base.py +801 -0
- truthound/stores/streaming/database.py +984 -0
- truthound/stores/streaming/filesystem.py +719 -0
- truthound/stores/streaming/reader.py +629 -0
- truthound/stores/streaming/s3.py +843 -0
- truthound/stores/streaming/writer.py +790 -0
- truthound/stores/tiering/__init__.py +108 -0
- truthound/stores/tiering/base.py +462 -0
- truthound/stores/tiering/manager.py +249 -0
- truthound/stores/tiering/policies.py +692 -0
- truthound/stores/tiering/store.py +526 -0
- truthound/stores/versioning/__init__.py +56 -0
- truthound/stores/versioning/base.py +376 -0
- truthound/stores/versioning/store.py +660 -0
- truthound/stores/versioning/strategies.py +353 -0
- truthound/types.py +56 -0
- truthound/validators/__init__.py +774 -0
- truthound/validators/aggregate/__init__.py +27 -0
- truthound/validators/aggregate/central.py +116 -0
- truthound/validators/aggregate/extremes.py +116 -0
- truthound/validators/aggregate/spread.py +118 -0
- truthound/validators/aggregate/sum.py +64 -0
- truthound/validators/aggregate/type.py +78 -0
- truthound/validators/anomaly/__init__.py +93 -0
- truthound/validators/anomaly/base.py +431 -0
- truthound/validators/anomaly/ml_based.py +1190 -0
- truthound/validators/anomaly/multivariate.py +647 -0
- truthound/validators/anomaly/statistical.py +599 -0
- truthound/validators/base.py +1089 -0
- truthound/validators/business_rule/__init__.py +46 -0
- truthound/validators/business_rule/base.py +147 -0
- truthound/validators/business_rule/checksum.py +509 -0
- truthound/validators/business_rule/financial.py +526 -0
- truthound/validators/cache.py +733 -0
- truthound/validators/completeness/__init__.py +39 -0
- truthound/validators/completeness/conditional.py +73 -0
- truthound/validators/completeness/default.py +98 -0
- truthound/validators/completeness/empty.py +103 -0
- truthound/validators/completeness/nan.py +337 -0
- truthound/validators/completeness/null.py +152 -0
- truthound/validators/cross_table/__init__.py +17 -0
- truthound/validators/cross_table/aggregate.py +333 -0
- truthound/validators/cross_table/row_count.py +122 -0
- truthound/validators/datetime/__init__.py +29 -0
- truthound/validators/datetime/format.py +78 -0
- truthound/validators/datetime/freshness.py +269 -0
- truthound/validators/datetime/order.py +73 -0
- truthound/validators/datetime/parseable.py +185 -0
- truthound/validators/datetime/range.py +202 -0
- truthound/validators/datetime/timezone.py +69 -0
- truthound/validators/distribution/__init__.py +49 -0
- truthound/validators/distribution/distribution.py +128 -0
- truthound/validators/distribution/monotonic.py +119 -0
- truthound/validators/distribution/outlier.py +178 -0
- truthound/validators/distribution/quantile.py +80 -0
- truthound/validators/distribution/range.py +254 -0
- truthound/validators/distribution/set.py +125 -0
- truthound/validators/distribution/statistical.py +459 -0
- truthound/validators/drift/__init__.py +79 -0
- truthound/validators/drift/base.py +427 -0
- truthound/validators/drift/multi_feature.py +401 -0
- truthound/validators/drift/numeric.py +395 -0
- truthound/validators/drift/psi.py +446 -0
- truthound/validators/drift/statistical.py +510 -0
- truthound/validators/enterprise.py +1658 -0
- truthound/validators/geospatial/__init__.py +80 -0
- truthound/validators/geospatial/base.py +97 -0
- truthound/validators/geospatial/boundary.py +238 -0
- truthound/validators/geospatial/coordinate.py +351 -0
- truthound/validators/geospatial/distance.py +399 -0
- truthound/validators/geospatial/polygon.py +665 -0
- truthound/validators/i18n/__init__.py +308 -0
- truthound/validators/i18n/bidi.py +571 -0
- truthound/validators/i18n/catalogs.py +570 -0
- truthound/validators/i18n/dialects.py +763 -0
- truthound/validators/i18n/extended_catalogs.py +549 -0
- truthound/validators/i18n/formatting.py +1434 -0
- truthound/validators/i18n/loader.py +1020 -0
- truthound/validators/i18n/messages.py +521 -0
- truthound/validators/i18n/plural.py +683 -0
- truthound/validators/i18n/protocols.py +855 -0
- truthound/validators/i18n/tms.py +1162 -0
- truthound/validators/localization/__init__.py +53 -0
- truthound/validators/localization/base.py +122 -0
- truthound/validators/localization/chinese.py +362 -0
- truthound/validators/localization/japanese.py +275 -0
- truthound/validators/localization/korean.py +524 -0
- truthound/validators/memory/__init__.py +94 -0
- truthound/validators/memory/approximate_knn.py +506 -0
- truthound/validators/memory/base.py +547 -0
- truthound/validators/memory/sgd_online.py +719 -0
- truthound/validators/memory/streaming_ecdf.py +753 -0
- truthound/validators/ml_feature/__init__.py +54 -0
- truthound/validators/ml_feature/base.py +249 -0
- truthound/validators/ml_feature/correlation.py +299 -0
- truthound/validators/ml_feature/leakage.py +344 -0
- truthound/validators/ml_feature/null_impact.py +270 -0
- truthound/validators/ml_feature/scale.py +264 -0
- truthound/validators/multi_column/__init__.py +89 -0
- truthound/validators/multi_column/arithmetic.py +284 -0
- truthound/validators/multi_column/base.py +231 -0
- truthound/validators/multi_column/comparison.py +273 -0
- truthound/validators/multi_column/consistency.py +312 -0
- truthound/validators/multi_column/statistical.py +299 -0
- truthound/validators/optimization/__init__.py +164 -0
- truthound/validators/optimization/aggregation.py +563 -0
- truthound/validators/optimization/covariance.py +556 -0
- truthound/validators/optimization/geo.py +626 -0
- truthound/validators/optimization/graph.py +587 -0
- truthound/validators/optimization/orchestrator.py +970 -0
- truthound/validators/optimization/profiling.py +1312 -0
- truthound/validators/privacy/__init__.py +223 -0
- truthound/validators/privacy/base.py +635 -0
- truthound/validators/privacy/ccpa.py +670 -0
- truthound/validators/privacy/gdpr.py +728 -0
- truthound/validators/privacy/global_patterns.py +604 -0
- truthound/validators/privacy/plugins.py +867 -0
- truthound/validators/profiling/__init__.py +52 -0
- truthound/validators/profiling/base.py +175 -0
- truthound/validators/profiling/cardinality.py +312 -0
- truthound/validators/profiling/entropy.py +391 -0
- truthound/validators/profiling/frequency.py +455 -0
- truthound/validators/pushdown_support.py +660 -0
- truthound/validators/query/__init__.py +91 -0
- truthound/validators/query/aggregate.py +346 -0
- truthound/validators/query/base.py +246 -0
- truthound/validators/query/column.py +249 -0
- truthound/validators/query/expression.py +274 -0
- truthound/validators/query/result.py +323 -0
- truthound/validators/query/row_count.py +264 -0
- truthound/validators/referential/__init__.py +80 -0
- truthound/validators/referential/base.py +395 -0
- truthound/validators/referential/cascade.py +391 -0
- truthound/validators/referential/circular.py +563 -0
- truthound/validators/referential/foreign_key.py +624 -0
- truthound/validators/referential/orphan.py +485 -0
- truthound/validators/registry.py +112 -0
- truthound/validators/schema/__init__.py +41 -0
- truthound/validators/schema/column_count.py +142 -0
- truthound/validators/schema/column_exists.py +80 -0
- truthound/validators/schema/column_order.py +82 -0
- truthound/validators/schema/column_pair.py +85 -0
- truthound/validators/schema/column_pair_set.py +195 -0
- truthound/validators/schema/column_type.py +94 -0
- truthound/validators/schema/multi_column.py +53 -0
- truthound/validators/schema/multi_column_aggregate.py +175 -0
- truthound/validators/schema/referential.py +274 -0
- truthound/validators/schema/table_schema.py +91 -0
- truthound/validators/schema_validator.py +219 -0
- truthound/validators/sdk/__init__.py +250 -0
- truthound/validators/sdk/builder.py +680 -0
- truthound/validators/sdk/decorators.py +474 -0
- truthound/validators/sdk/enterprise/__init__.py +211 -0
- truthound/validators/sdk/enterprise/docs.py +725 -0
- truthound/validators/sdk/enterprise/fuzzing.py +659 -0
- truthound/validators/sdk/enterprise/licensing.py +709 -0
- truthound/validators/sdk/enterprise/manager.py +543 -0
- truthound/validators/sdk/enterprise/resources.py +628 -0
- truthound/validators/sdk/enterprise/sandbox.py +766 -0
- truthound/validators/sdk/enterprise/signing.py +603 -0
- truthound/validators/sdk/enterprise/templates.py +865 -0
- truthound/validators/sdk/enterprise/versioning.py +659 -0
- truthound/validators/sdk/templates.py +757 -0
- truthound/validators/sdk/testing.py +807 -0
- truthound/validators/security/__init__.py +181 -0
- truthound/validators/security/redos/__init__.py +182 -0
- truthound/validators/security/redos/core.py +861 -0
- truthound/validators/security/redos/cpu_monitor.py +593 -0
- truthound/validators/security/redos/cve_database.py +791 -0
- truthound/validators/security/redos/ml/__init__.py +155 -0
- truthound/validators/security/redos/ml/base.py +785 -0
- truthound/validators/security/redos/ml/datasets.py +618 -0
- truthound/validators/security/redos/ml/features.py +359 -0
- truthound/validators/security/redos/ml/models.py +1000 -0
- truthound/validators/security/redos/ml/predictor.py +507 -0
- truthound/validators/security/redos/ml/storage.py +632 -0
- truthound/validators/security/redos/ml/training.py +571 -0
- truthound/validators/security/redos/ml_analyzer.py +937 -0
- truthound/validators/security/redos/optimizer.py +674 -0
- truthound/validators/security/redos/profiler.py +682 -0
- truthound/validators/security/redos/re2_engine.py +709 -0
- truthound/validators/security/redos.py +886 -0
- truthound/validators/security/sql_security.py +1247 -0
- truthound/validators/streaming/__init__.py +126 -0
- truthound/validators/streaming/base.py +292 -0
- truthound/validators/streaming/completeness.py +210 -0
- truthound/validators/streaming/mixin.py +575 -0
- truthound/validators/streaming/range.py +308 -0
- truthound/validators/streaming/sources.py +846 -0
- truthound/validators/string/__init__.py +57 -0
- truthound/validators/string/casing.py +158 -0
- truthound/validators/string/charset.py +96 -0
- truthound/validators/string/format.py +501 -0
- truthound/validators/string/json.py +77 -0
- truthound/validators/string/json_schema.py +184 -0
- truthound/validators/string/length.py +104 -0
- truthound/validators/string/like_pattern.py +237 -0
- truthound/validators/string/regex.py +202 -0
- truthound/validators/string/regex_extended.py +435 -0
- truthound/validators/table/__init__.py +88 -0
- truthound/validators/table/base.py +78 -0
- truthound/validators/table/column_count.py +198 -0
- truthound/validators/table/freshness.py +362 -0
- truthound/validators/table/row_count.py +251 -0
- truthound/validators/table/schema.py +333 -0
- truthound/validators/table/size.py +285 -0
- truthound/validators/timeout/__init__.py +102 -0
- truthound/validators/timeout/advanced/__init__.py +247 -0
- truthound/validators/timeout/advanced/circuit_breaker.py +675 -0
- truthound/validators/timeout/advanced/prediction.py +773 -0
- truthound/validators/timeout/advanced/priority.py +618 -0
- truthound/validators/timeout/advanced/redis_backend.py +770 -0
- truthound/validators/timeout/advanced/retry.py +721 -0
- truthound/validators/timeout/advanced/sampling.py +788 -0
- truthound/validators/timeout/advanced/sla.py +661 -0
- truthound/validators/timeout/advanced/telemetry.py +804 -0
- truthound/validators/timeout/cascade.py +477 -0
- truthound/validators/timeout/deadline.py +657 -0
- truthound/validators/timeout/degradation.py +525 -0
- truthound/validators/timeout/distributed.py +597 -0
- truthound/validators/timeseries/__init__.py +89 -0
- truthound/validators/timeseries/base.py +326 -0
- truthound/validators/timeseries/completeness.py +617 -0
- truthound/validators/timeseries/gap.py +485 -0
- truthound/validators/timeseries/monotonic.py +310 -0
- truthound/validators/timeseries/seasonality.py +422 -0
- truthound/validators/timeseries/trend.py +510 -0
- truthound/validators/uniqueness/__init__.py +59 -0
- truthound/validators/uniqueness/approximate.py +475 -0
- truthound/validators/uniqueness/distinct_values.py +253 -0
- truthound/validators/uniqueness/duplicate.py +118 -0
- truthound/validators/uniqueness/primary_key.py +140 -0
- truthound/validators/uniqueness/unique.py +191 -0
- truthound/validators/uniqueness/within_record.py +599 -0
- truthound/validators/utils.py +756 -0
- truthound-1.0.8.dist-info/METADATA +474 -0
- truthound-1.0.8.dist-info/RECORD +877 -0
- truthound-1.0.8.dist-info/WHEEL +4 -0
- truthound-1.0.8.dist-info/entry_points.txt +2 -0
- truthound-1.0.8.dist-info/licenses/LICENSE +190 -0
|
@@ -0,0 +1,1710 @@
|
|
|
1
|
+
"""Incremental profiling validation framework.
|
|
2
|
+
|
|
3
|
+
This module provides comprehensive validation for incremental profiling:
|
|
4
|
+
- Change detection accuracy validation
|
|
5
|
+
- Fingerprint consistency validation
|
|
6
|
+
- Profile merge correctness validation
|
|
7
|
+
- Performance regression validation
|
|
8
|
+
- Data integrity validation
|
|
9
|
+
|
|
10
|
+
The framework is designed for high extensibility and maintainability:
|
|
11
|
+
- Protocol-based validators for easy extension
|
|
12
|
+
- Registry pattern for validator discovery
|
|
13
|
+
- Configurable validation strategies
|
|
14
|
+
- Detailed validation results with recommendations
|
|
15
|
+
|
|
16
|
+
Example:
|
|
17
|
+
from truthound.profiler.incremental_validation import (
|
|
18
|
+
IncrementalValidator,
|
|
19
|
+
ValidationRunner,
|
|
20
|
+
ValidationConfig,
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
# Create validator
|
|
24
|
+
validator = IncrementalValidator()
|
|
25
|
+
|
|
26
|
+
# Run validation
|
|
27
|
+
result = validator.validate(
|
|
28
|
+
original_profile=profile1,
|
|
29
|
+
incremental_profile=profile2,
|
|
30
|
+
data=df,
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
# Check results
|
|
34
|
+
if result.passed:
|
|
35
|
+
print("Validation passed!")
|
|
36
|
+
else:
|
|
37
|
+
for issue in result.issues:
|
|
38
|
+
print(f"{issue.severity}: {issue.message}")
|
|
39
|
+
"""
|
|
40
|
+
|
|
41
|
+
from __future__ import annotations
|
|
42
|
+
|
|
43
|
+
import hashlib
|
|
44
|
+
import logging
|
|
45
|
+
import time
|
|
46
|
+
from abc import ABC, abstractmethod
|
|
47
|
+
from dataclasses import dataclass, field
|
|
48
|
+
from datetime import datetime, timedelta
|
|
49
|
+
from enum import Enum
|
|
50
|
+
from typing import Any, Callable, Protocol, Sequence, TypeVar
|
|
51
|
+
|
|
52
|
+
import polars as pl
|
|
53
|
+
|
|
54
|
+
from truthound.profiler.base import (
|
|
55
|
+
ColumnProfile,
|
|
56
|
+
TableProfile,
|
|
57
|
+
ProfilerConfig,
|
|
58
|
+
)
|
|
59
|
+
from truthound.profiler.incremental import (
|
|
60
|
+
ChangeReason,
|
|
61
|
+
ColumnFingerprint,
|
|
62
|
+
ChangeDetectionResult,
|
|
63
|
+
FingerprintCalculator,
|
|
64
|
+
IncrementalConfig,
|
|
65
|
+
IncrementalProfiler,
|
|
66
|
+
ProfileMerger,
|
|
67
|
+
)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
# Set up logging
|
|
71
|
+
logger = logging.getLogger("truthound.profiler.incremental_validation")
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
# =============================================================================
|
|
75
|
+
# Validation Types
|
|
76
|
+
# =============================================================================
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
class ValidationSeverity(str, Enum):
    """How serious a validation issue is.

    Members subclass ``str``, so each one compares equal to its value.
    """

    # Informational message; nothing is wrong.
    INFO = "info"
    # Potential issue that may deserve a look.
    WARNING = "warning"
    # Definite problem.
    ERROR = "error"
    # Validation cannot proceed.
    CRITICAL = "critical"
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
class ValidationCategory(str, Enum):
    """Area of incremental profiling that a validation check belongs to.

    Members subclass ``str``, so each one compares equal to its value.
    """

    CHANGE_DETECTION = "change_detection"
    FINGERPRINT = "fingerprint"
    PROFILE_MERGE = "profile_merge"
    DATA_INTEGRITY = "data_integrity"
    PERFORMANCE = "performance"
    CONSISTENCY = "consistency"
    SCHEMA = "schema"
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
class ValidationType(str, Enum):
    """Scope of a validation run.

    Members subclass ``str``, so each one compares equal to its value.
    """

    # Complete validation.
    FULL = "full"
    # Fast, essential checks only.
    QUICK = "quick"
    # Change detection checks only.
    CHANGE_ONLY = "change_only"
    # Merge validation checks only.
    MERGE_ONLY = "merge_only"
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
# =============================================================================
|
|
110
|
+
# Validation Results
|
|
111
|
+
# =============================================================================
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
@dataclass
class ValidationIssue:
    """A single validation issue found during checks.

    Attributes:
        category: Category of the issue.
        severity: How severe the issue is.
        message: Human-readable description.
        column_name: Affected column, if applicable.
        expected: Expected value.
        actual: Actual value found.
        recommendation: Suggested fix.
        metadata: Additional context.
    """

    category: ValidationCategory
    severity: ValidationSeverity
    message: str
    column_name: str | None = None
    expected: Any = None
    actual: Any = None
    recommendation: str | None = None
    metadata: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Convert the issue to a plain dictionary.

        ``category`` and ``severity`` are emitted as their ``.value``;
        ``expected`` and ``actual`` are passed through ``str()`` unless they
        are ``None``.

        Returns:
            A new dictionary describing this issue. The ``metadata`` entry is
            a shallow copy of the issue's metadata.
        """
        return {
            "category": self.category.value,
            "severity": self.severity.value,
            "message": self.message,
            "column_name": self.column_name,
            "expected": str(self.expected) if self.expected is not None else None,
            "actual": str(self.actual) if self.actual is not None else None,
            "recommendation": self.recommendation,
            # Copy so that editing the serialized form cannot silently
            # mutate this issue's own metadata.
            "metadata": dict(self.metadata),
        }
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
@dataclass
class ValidationMetrics:
    """Metrics from a validation run.

    Attributes:
        total_checks: Total number of checks performed.
        passed_checks: Number of checks that passed.
        failed_checks: Number of checks that failed.
        skipped_checks: Number of checks skipped.
        duration_ms: Total validation time.
        columns_validated: Number of columns validated.
        changes_detected: Number of changes detected.
        false_positives: Estimated false positive count.
        false_negatives: Estimated false negative count.
    """

    total_checks: int = 0
    passed_checks: int = 0
    failed_checks: int = 0
    skipped_checks: int = 0
    duration_ms: float = 0.0
    columns_validated: int = 0
    changes_detected: int = 0
    false_positives: int = 0
    false_negatives: int = 0

    @property
    def pass_rate(self) -> float:
        """Fraction of checks that passed; 1.0 when no checks were run."""
        if self.total_checks == 0:
            return 1.0
        return self.passed_checks / self.total_checks

    @property
    def accuracy(self) -> float:
        """Change detection accuracy; 1.0 when no columns were validated.

        Computed as the share of validated columns not counted as a false
        positive or false negative. The error counts are estimates, so they
        may exceed ``columns_validated``; the result is floored at 0.0
        rather than reported as a negative accuracy.
        """
        total = self.columns_validated
        if total == 0:
            return 1.0
        errors = self.false_positives + self.false_negatives
        return max(0.0, (total - errors) / total)

    def to_dict(self) -> dict[str, Any]:
        """Convert to a dictionary, including the derived rate properties."""
        return {
            "total_checks": self.total_checks,
            "passed_checks": self.passed_checks,
            "failed_checks": self.failed_checks,
            "skipped_checks": self.skipped_checks,
            "pass_rate": self.pass_rate,
            "duration_ms": self.duration_ms,
            "columns_validated": self.columns_validated,
            "changes_detected": self.changes_detected,
            "false_positives": self.false_positives,
            "false_negatives": self.false_negatives,
            "accuracy": self.accuracy,
        }
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
@dataclass
class ValidationResult:
    """Complete result of a validation run.

    Attributes:
        passed: Whether validation passed overall.
        validation_type: Type of validation performed.
        issues: List of issues found.
        metrics: Validation metrics.
        validated_at: When the result was created; defaults to
            ``datetime.now()``.
        config: Configuration used.
        details: Additional details per category.
    """

    passed: bool
    validation_type: ValidationType
    issues: list[ValidationIssue] = field(default_factory=list)
    metrics: ValidationMetrics = field(default_factory=ValidationMetrics)
    validated_at: datetime = field(default_factory=datetime.now)
    config: dict[str, Any] = field(default_factory=dict)
    details: dict[ValidationCategory, dict[str, Any]] = field(default_factory=dict)

    @property
    def error_count(self) -> int:
        """Count of error-level issues."""
        return len(self.get_issues_by_severity(ValidationSeverity.ERROR))

    @property
    def warning_count(self) -> int:
        """Count of warning-level issues."""
        return len(self.get_issues_by_severity(ValidationSeverity.WARNING))

    @property
    def critical_count(self) -> int:
        """Count of critical issues."""
        return len(self.get_issues_by_severity(ValidationSeverity.CRITICAL))

    def get_issues_by_category(
        self,
        category: ValidationCategory,
    ) -> list[ValidationIssue]:
        """Return the issues whose category equals ``category``."""
        return [i for i in self.issues if i.category == category]

    def get_issues_by_severity(
        self,
        severity: ValidationSeverity,
    ) -> list[ValidationIssue]:
        """Return the issues whose severity equals ``severity``."""
        return [i for i in self.issues if i.severity == severity]

    def to_dict(self) -> dict[str, Any]:
        """Convert to a dictionary.

        Issues and metrics are converted with their own ``to_dict``;
        ``validated_at`` is emitted in ISO format and ``details`` is re-keyed
        by each category's ``.value``.
        """
        return {
            "passed": self.passed,
            "validation_type": self.validation_type.value,
            "error_count": self.error_count,
            "warning_count": self.warning_count,
            "critical_count": self.critical_count,
            "issues": [i.to_dict() for i in self.issues],
            "metrics": self.metrics.to_dict(),
            "validated_at": self.validated_at.isoformat(),
            "config": self.config,
            "details": {k.value: v for k, v in self.details.items()},
        }

    def to_markdown(self) -> str:
        """Generate a markdown report.

        The report has a status header, a summary of the metrics and, when
        there are issues, one section per severity (critical first) that
        lists each issue with its optional recommendation nested beneath it.
        """
        lines = [
            "# Incremental Profiling Validation Report",
            "",
            f"**Status**: {'✅ PASSED' if self.passed else '❌ FAILED'}",
            f"**Validation Type**: {self.validation_type.value}",
            f"**Validated At**: {self.validated_at.isoformat()}",
            "",
            "## Summary",
            "",
            f"- Total Checks: {self.metrics.total_checks}",
            f"- Passed: {self.metrics.passed_checks}",
            f"- Failed: {self.metrics.failed_checks}",
            f"- Pass Rate: {self.metrics.pass_rate:.1%}",
            f"- Duration: {self.metrics.duration_ms:.1f}ms",
            "",
        ]

        if self.issues:
            lines.extend([
                "## Issues Found",
                "",
            ])

            # Most severe first, so the report leads with what matters.
            for severity in (
                ValidationSeverity.CRITICAL,
                ValidationSeverity.ERROR,
                ValidationSeverity.WARNING,
                ValidationSeverity.INFO,
            ):
                issues = self.get_issues_by_severity(severity)
                if not issues:
                    continue
                lines.append(f"### {severity.value.title()} ({len(issues)})")
                lines.append("")
                for issue in issues:
                    col_info = f" [{issue.column_name}]" if issue.column_name else ""
                    lines.append(f"- **{issue.category.value}**{col_info}: {issue.message}")
                    if issue.recommendation:
                        # Two-space indent: a sub-item must be indented to
                        # the parent's content column to nest in Markdown;
                        # with a single space it renders as a sibling bullet.
                        lines.append(f"  - *Recommendation*: {issue.recommendation}")
                lines.append("")

        return "\n".join(lines)
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
# =============================================================================
|
|
323
|
+
# Validation Protocol and Base Classes
|
|
324
|
+
# =============================================================================
|
|
325
|
+
|
|
326
|
+
|
|
327
|
+
class ValidatorProtocol(Protocol):
    """Structural interface that a validator is expected to satisfy."""

    @property
    def name(self) -> str:
        """Name identifying the validator."""
        ...

    @property
    def category(self) -> ValidationCategory:
        """Category under which the validator's issues are reported."""
        ...

    def validate(
        self,
        context: "ValidationContext",
    ) -> list[ValidationIssue]:
        """Run the validator against ``context``.

        Args:
            context: Validation context with data and profiles.

        Returns:
            The issues found; an empty list means validation passed.
        """
        ...
|
|
353
|
+
|
|
354
|
+
|
|
355
|
+
@dataclass
class ValidationContext:
    """Bundle of inputs handed to every validator.

    Only ``data`` is required; every other field defaults to ``None`` or to
    a fresh empty container.
    """

    # --- Data under validation ---
    data: pl.LazyFrame | pl.DataFrame

    # --- Profiles ---
    original_profile: TableProfile | None = None
    incremental_profile: TableProfile | None = None
    # Full re-profile used as the comparison baseline.
    full_profile: TableProfile | None = None

    # --- Fingerprints, keyed by string ---
    original_fingerprints: dict[str, ColumnFingerprint] = field(default_factory=dict)
    current_fingerprints: dict[str, ColumnFingerprint] = field(default_factory=dict)

    # --- Change detection results ---
    change_results: dict[str, ChangeDetectionResult] = field(default_factory=dict)

    # --- Profiling metadata ---
    profiled_columns: set[str] = field(default_factory=set)
    skipped_columns: set[str] = field(default_factory=set)
    change_reasons: dict[str, ChangeReason] = field(default_factory=dict)

    # --- Configuration ---
    config: "ValidationConfig | None" = None
|
|
384
|
+
|
|
385
|
+
|
|
386
|
+
class BaseValidator(ABC):
    """Abstract foundation shared by validators.

    Keeps an ``enabled`` flag and a private list of collected issues.
    Subclasses supply ``name``, ``category`` and ``validate``.
    """

    def __init__(self, enabled: bool = True):
        self._issues: list[ValidationIssue] = []
        self.enabled = enabled

    @property
    @abstractmethod
    def name(self) -> str:
        """Validator name."""

    @property
    @abstractmethod
    def category(self) -> ValidationCategory:
        """Validation category."""

    @abstractmethod
    def validate(self, context: ValidationContext) -> list[ValidationIssue]:
        """Perform validation."""

    def add_issue(
        self,
        severity: ValidationSeverity,
        message: str,
        column_name: str | None = None,
        expected: Any = None,
        actual: Any = None,
        recommendation: str | None = None,
        **metadata: Any,
    ) -> None:
        """Record an issue under this validator's category.

        Extra keyword arguments are stored as the issue's metadata.
        """
        issue = ValidationIssue(
            category=self.category,
            severity=severity,
            message=message,
            column_name=column_name,
            expected=expected,
            actual=actual,
            recommendation=recommendation,
            metadata=metadata,
        )
        self._issues.append(issue)

    def reset(self) -> None:
        """Discard collected issues by rebinding to a fresh list."""
        self._issues = []
|
|
438
|
+
|
|
439
|
+
|
|
440
|
+
# =============================================================================
|
|
441
|
+
# Change Detection Validators
|
|
442
|
+
# =============================================================================
|
|
443
|
+
|
|
444
|
+
|
|
445
|
+
class ChangeDetectionAccuracyValidator(BaseValidator):
|
|
446
|
+
"""Validates accuracy of change detection.
|
|
447
|
+
|
|
448
|
+
Compares incremental change detection against full re-profiling
|
|
449
|
+
to identify false positives and false negatives.
|
|
450
|
+
"""
|
|
451
|
+
|
|
452
|
+
@property
def name(self) -> str:
    """Return this validator's fixed name."""
    return "change_detection_accuracy"
|
|
455
|
+
|
|
456
|
+
@property
def category(self) -> ValidationCategory:
    """Return the category this validator reports issues under."""
    return ValidationCategory.CHANGE_DETECTION
|
|
459
|
+
|
|
460
|
+
def __init__(self, tolerance: float = 0.01, enabled: bool = True):
    """Set up the validator.

    Args:
        tolerance: Tolerance for numerical comparisons.
        enabled: Whether the validator is enabled.
    """
    super().__init__(enabled)
    self.tolerance = tolerance
|
|
473
|
+
|
|
474
|
+
def validate(self, context: ValidationContext) -> list[ValidationIssue]:
    """Validate change detection accuracy.

    Clears previously collected issues, then compares the columns of the
    incremental profile against those of the full profile.

    Args:
        context: Validation context; ``incremental_profile`` and
            ``full_profile`` are the profiles compared here.

    Returns:
        The issues collected during this call. A missing incremental
        profile yields a single error and a missing full profile a single
        warning; in both cases no column comparison is attempted.
    """
    self.reset()

    if context.incremental_profile is None:
        self.add_issue(
            ValidationSeverity.ERROR,
            "No incremental profile provided for validation",
            recommendation="Provide an incremental profile to validate",
        )
        return self._issues

    if context.full_profile is None:
        self.add_issue(
            ValidationSeverity.WARNING,
            "No full profile for comparison; accuracy cannot be fully validated",
            recommendation="Provide a full profile for accurate comparison",
        )
        return self._issues

    inc_columns = {col.name: col for col in context.incremental_profile.columns}
    full_columns = {col.name: col for col in context.full_profile.columns}

    # Columns the full profile has but the incremental one lacks.
    for col_name in full_columns:
        if col_name not in inc_columns:
            self.add_issue(
                ValidationSeverity.ERROR,
                "Column missing from incremental profile",
                column_name=col_name,
                recommendation="Check if column was incorrectly skipped",
            )

    # Columns the incremental profile has but the full one lacks.
    for col_name in inc_columns:
        if col_name not in full_columns:
            self.add_issue(
                ValidationSeverity.WARNING,
                "Column in incremental but not in full profile",
                column_name=col_name,
            )

    # Compare the columns present in both profiles. Kept as a separate
    # pass so every missing/extra-column issue precedes comparison issues.
    for col_name, inc_col in inc_columns.items():
        if col_name not in full_columns:
            continue

        full_col = full_columns[col_name]
        self._compare_columns(col_name, inc_col, full_col, context)

    return self._issues
|
|
525
|
+
|
|
526
|
+
def _compare_columns(
    self,
    col_name: str,
    inc_col: ColumnProfile,
    full_col: ColumnProfile,
    context: ValidationContext,
) -> None:
    """Compare one column of the incremental profile with the full profile.

    Two kinds of findings are recorded through ``add_issue``:

    * false negatives: the column was skipped, yet its statistics differ
      from the full profile (or its row count differs from the original
      profile);
    * potential false positives: the column was re-profiled for a reason
      other than ``NEW_COLUMN``/``FORCED`` although no statistic differs.

    Args:
        col_name: Name of the column being compared.
        inc_col: Column profile taken from the incremental profile.
        full_col: Column profile taken from the full profile.
        context: Validation context providing ``skipped_columns``,
            ``profiled_columns``, ``original_profile`` and ``change_reasons``.
    """
    skipped = col_name in context.skipped_columns
    reprofiled = col_name in context.profiled_columns

    # A skipped column is carried over, so it must still match the original.
    if skipped and context.original_profile:
        originals = {c.name: c for c in context.original_profile.columns}
        if col_name in originals:
            original_col = originals[col_name]
            if inc_col.row_count != original_col.row_count:
                self.add_issue(
                    ValidationSeverity.ERROR,
                    "Skipped column has different row count from original",
                    column_name=col_name,
                    expected=original_col.row_count,
                    actual=inc_col.row_count,
                    recommendation="Column may have changed but was not detected",
                )

    # Collect (field, incremental value, full value) for every statistic
    # that disagrees with the full profile.
    mismatches = []

    for exact_field in ("row_count", "null_count"):
        inc_value = getattr(inc_col, exact_field)
        full_value = getattr(full_col, exact_field)
        if inc_value != full_value:
            mismatches.append((exact_field, inc_value, full_value))

    # The null ratio is compared with a tolerance rather than exactly.
    if abs(inc_col.null_ratio - full_col.null_ratio) > self.tolerance:
        mismatches.append(("null_ratio", inc_col.null_ratio, full_col.null_ratio))

    if inc_col.distinct_count != full_col.distinct_count:
        mismatches.append(
            ("distinct_count", inc_col.distinct_count, full_col.distinct_count)
        )

    if mismatches and skipped:
        # False negative: the data differs but the column was not re-profiled.
        for field_name, inc_value, full_value in mismatches:
            self.add_issue(
                ValidationSeverity.ERROR,
                f"Change in {field_name} not detected (false negative)",
                column_name=col_name,
                expected=full_value,
                actual=inc_value,
                recommendation="Increase change detection sensitivity",
                false_negative=True,
            )
        return

    if mismatches or not reprofiled:
        return

    if col_name not in context.change_reasons:
        return

    reason = context.change_reasons[col_name]
    if reason in (ChangeReason.NEW_COLUMN, ChangeReason.FORCED):
        return

    # Re-profiling unchanged data is safe, so this is informational only.
    self.add_issue(
        ValidationSeverity.INFO,
        "Column was re-profiled but no differences found (potential false positive)",
        column_name=col_name,
        metadata={"reason": reason.value},
        false_positive=True,
    )
|
|
598
|
+
|
|
599
|
+
|
|
600
|
+
class SchemaChangeValidator(BaseValidator):
    """Checks that dtype changes were flagged as schema changes.

    For each column of the current data that also exists in the original
    profile, the current dtype (rendered with ``str``) is compared with the
    profiled ``physical_type``.
    """

    @property
    def name(self) -> str:
        """Name of this validator."""
        return "schema_change"

    @property
    def category(self) -> ValidationCategory:
        """Validation category this validator reports under."""
        return ValidationCategory.SCHEMA

    def validate(self, context: ValidationContext) -> list[ValidationIssue]:
        """Report dtype changes and whether change detection noticed them.

        Args:
            context: Validation context; ``data`` must be a polars
                ``DataFrame`` or ``LazyFrame``.

        Returns:
            The issues recorded during this run (``self._issues``). A changed
            dtype yields an INFO issue when the column's change reason is
            ``SCHEMA_CHANGED`` and an ERROR issue otherwise.
        """
        self.reset()

        data = context.data
        if not isinstance(data, (pl.LazyFrame, pl.DataFrame)):
            self.add_issue(
                ValidationSeverity.ERROR,
                "Invalid data type for validation",
            )
            return self._issues

        frame = data.lazy() if isinstance(data, pl.DataFrame) else data
        current_schema = frame.collect_schema()

        original = context.original_profile
        if not original:
            # No baseline profile, so there is nothing to compare dtypes with.
            return self._issues

        profiled = {column.name: column for column in original.columns}

        for column_name, dtype in current_schema.items():
            if column_name not in profiled:
                continue

            profiled_type = profiled[column_name].physical_type
            observed_type = str(dtype)
            if profiled_type == observed_type:
                continue

            # The dtype differs from the profiled one: was it flagged?
            reasons = context.change_reasons
            flagged = (
                column_name in reasons
                and reasons[column_name] == ChangeReason.SCHEMA_CHANGED
            )

            if flagged:
                self.add_issue(
                    ValidationSeverity.INFO,
                    "Schema change correctly detected",
                    column_name=column_name,
                    expected=profiled_type,
                    actual=observed_type,
                )
            else:
                self.add_issue(
                    ValidationSeverity.ERROR,
                    "Schema change not detected",
                    column_name=column_name,
                    expected=profiled_type,
                    actual=observed_type,
                    recommendation="Enable schema change detection",
                )

        return self._issues
|
|
659
|
+
|
|
660
|
+
|
|
661
|
+
class StalenessValidator(BaseValidator):
    """Validates staleness detection.

    A column of the original profile is stale when its ``profiled_at``
    timestamp is older than the configured maximum age. Stale columns whose
    change reason is not ``STALE`` are reported.
    """

    @property
    def name(self) -> str:
        """Name of this validator."""
        return "staleness"

    @property
    def category(self) -> ValidationCategory:
        """Validation category this validator reports under."""
        return ValidationCategory.CHANGE_DETECTION

    def __init__(
        self,
        max_age: timedelta | None = None,
        enabled: bool = True,
    ):
        """Initialize validator.

        Args:
            max_age: Maximum profile age used when the context's config does
                not supply ``max_profile_age``
            enabled: Whether validator is enabled
        """
        super().__init__(enabled)
        self.max_age = max_age

    def validate(self, context: ValidationContext) -> list[ValidationIssue]:
        """Validate staleness detection.

        The maximum age comes from ``context.config.max_profile_age``; when
        there is no config, or the config leaves the value unset, the
        validator's own ``max_age`` is used. With no maximum age at all the
        check is skipped.

        Args:
            context: Validation context providing ``original_profile``,
                ``config`` and ``change_reasons``.

        Returns:
            The issues recorded during this run (``self._issues``).
        """
        self.reset()

        if context.original_profile is None:
            return self._issues

        config = context.config
        max_age = config.max_profile_age if config else None
        if max_age is None:
            # A config that leaves max_profile_age unset must not mask the
            # limit this validator was constructed with.
            max_age = self.max_age

        if max_age is None:
            return self._issues

        for col in context.original_profile.columns:
            profiled_at = col.profiled_at
            # Use the timestamp's own tzinfo so naive and timezone-aware
            # profiles can both be aged (naive stays naive local time).
            age = datetime.now(tz=profiled_at.tzinfo) - profiled_at
            is_stale = age > max_age

            was_detected = (
                col.name in context.change_reasons
                and context.change_reasons[col.name] == ChangeReason.STALE
            )

            if is_stale and not was_detected:
                self.add_issue(
                    ValidationSeverity.WARNING,
                    "Stale column not re-profiled",
                    column_name=col.name,
                    expected=f"Re-profile after {max_age}",
                    actual=f"Age: {age}",
                    recommendation="Check staleness configuration",
                )

        return self._issues
|
|
715
|
+
|
|
716
|
+
|
|
717
|
+
# =============================================================================
|
|
718
|
+
# Fingerprint Validators
|
|
719
|
+
# =============================================================================
|
|
720
|
+
|
|
721
|
+
|
|
722
|
+
class FingerprintConsistencyValidator(BaseValidator):
    """Checks that stored fingerprints can be reproduced from the data.

    Each fingerprint in ``context.current_fingerprints`` is recalculated with
    a fresh ``FingerprintCalculator`` and compared with the stored one.
    """

    @property
    def name(self) -> str:
        """Name of this validator."""
        return "fingerprint_consistency"

    @property
    def category(self) -> ValidationCategory:
        """Validation category this validator reports under."""
        return ValidationCategory.FINGERPRINT

    def validate(self, context: ValidationContext) -> list[ValidationIssue]:
        """Recalculate every current fingerprint and compare it.

        Args:
            context: Validation context providing ``current_fingerprints``
                and ``data``.

        Returns:
            The issues recorded during this run (``self._issues``): an INFO
            issue when there are no fingerprints, otherwise one ERROR per
            differing sample hash, differing row count, or failed
            recalculation.
        """
        self.reset()

        stored_fingerprints = context.current_fingerprints
        if not stored_fingerprints:
            self.add_issue(
                ValidationSeverity.INFO,
                "No fingerprints to validate",
            )
            return self._issues

        # Same data should always produce the same fingerprint.
        calculator = FingerprintCalculator()

        data = context.data
        frame = data.lazy() if isinstance(data, pl.DataFrame) else data

        for column_name, stored in stored_fingerprints.items():
            try:
                fresh = calculator.calculate(frame, column_name)

                if fresh.sample_hash != stored.sample_hash:
                    self.add_issue(
                        ValidationSeverity.ERROR,
                        "Fingerprint not stable (different hashes for same data)",
                        column_name=column_name,
                        expected=stored.sample_hash,
                        actual=fresh.sample_hash,
                        recommendation="Check fingerprint calculation determinism",
                    )

                if fresh.row_count != stored.row_count:
                    self.add_issue(
                        ValidationSeverity.ERROR,
                        "Fingerprint row count mismatch",
                        column_name=column_name,
                        expected=stored.row_count,
                        actual=fresh.row_count,
                    )

            except Exception as e:
                # A failure on one column is reported and must not prevent
                # the remaining columns from being checked.
                self.add_issue(
                    ValidationSeverity.ERROR,
                    f"Failed to calculate fingerprint: {e}",
                    column_name=column_name,
                )

        return self._issues
|
|
785
|
+
|
|
786
|
+
|
|
787
|
+
class FingerprintSensitivityValidator(BaseValidator):
    """Checks that columns whose fingerprint changed were re-profiled."""

    @property
    def name(self) -> str:
        """Name of this validator."""
        return "fingerprint_sensitivity"

    @property
    def category(self) -> ValidationCategory:
        """Validation category this validator reports under."""
        return ValidationCategory.FINGERPRINT

    def __init__(
        self,
        min_change_detection_rate: float = 0.95,
        enabled: bool = True,
    ):
        """Initialize validator.

        Args:
            min_change_detection_rate: Lowest acceptable share of changed
                columns that were re-profiled
            enabled: Whether validator is enabled
        """
        super().__init__(enabled)
        self.min_change_detection_rate = min_change_detection_rate

    def validate(self, context: ValidationContext) -> list[ValidationIssue]:
        """Measure how many changed columns were picked up for re-profiling.

        A column counts as changed when the row count, null count or sample
        hash differs between its original and current fingerprint. It counts
        as detected when it is listed in ``context.profiled_columns``.

        Args:
            context: Validation context providing ``original_fingerprints``,
                ``current_fingerprints`` and ``profiled_columns``.

        Returns:
            The issues recorded during this run (``self._issues``): a WARNING
            per undetected change, plus an ERROR when the detection rate is
            below ``min_change_detection_rate``.
        """
        self.reset()

        originals = context.original_fingerprints
        currents = context.current_fingerprints
        if not originals or not currents:
            self.add_issue(
                ValidationSeverity.INFO,
                "Need both original and current fingerprints for sensitivity validation",
            )
            return self._issues

        changed_total = 0
        changed_and_detected = 0

        for column_name, current_fp in currents.items():
            if column_name not in originals:
                continue

            original_fp = originals[column_name]

            data_changed = (
                original_fp.row_count != current_fp.row_count
                or original_fp.null_count != current_fp.null_count
                or original_fp.sample_hash != current_fp.sample_hash
            )
            if not data_changed:
                continue

            changed_total += 1

            if column_name in context.profiled_columns:
                changed_and_detected += 1
                continue

            self.add_issue(
                ValidationSeverity.WARNING,
                "Data change not detected by fingerprint",
                column_name=column_name,
                metadata={
                    "orig_hash": original_fp.sample_hash,
                    "curr_hash": current_fp.sample_hash,
                },
            )

        if changed_total == 0:
            # No column changed, so a detection rate is undefined.
            return self._issues

        detection_rate = changed_and_detected / changed_total
        if detection_rate < self.min_change_detection_rate:
            self.add_issue(
                ValidationSeverity.ERROR,
                "Change detection rate below threshold",
                expected=f">= {self.min_change_detection_rate:.0%}",
                actual=f"{detection_rate:.0%}",
                recommendation="Adjust fingerprint sensitivity settings",
            )

        return self._issues
|
|
871
|
+
|
|
872
|
+
|
|
873
|
+
# =============================================================================
|
|
874
|
+
# Profile Merge Validators
|
|
875
|
+
# =============================================================================
|
|
876
|
+
|
|
877
|
+
|
|
878
|
+
class ProfileMergeValidator(BaseValidator):
    """Checks the result of merging several table profiles.

    The context must carry ``merge_inputs`` and ``merge_output`` attributes;
    when either is absent or empty the validator reports nothing.
    """

    @property
    def name(self) -> str:
        """Name of this validator."""
        return "profile_merge"

    @property
    def category(self) -> ValidationCategory:
        """Validation category this validator reports under."""
        return ValidationCategory.PROFILE_MERGE

    def validate(self, context: ValidationContext) -> list[ValidationIssue]:
        """Run all merge checks when merge test data is available.

        Args:
            context: Validation context, optionally carrying
                ``merge_inputs`` and ``merge_output``.

        Returns:
            The issues recorded during this run (``self._issues``).
        """
        self.reset()

        # The merge data is optional on the context, hence the sentinel.
        absent = object()
        merge_inputs = getattr(context, 'merge_inputs', absent)
        merge_output = getattr(context, 'merge_output', absent)
        if merge_inputs is absent or merge_output is absent:
            return self._issues

        if not merge_inputs or merge_output is None:
            return self._issues

        for check in (
            self._validate_column_preservation,
            self._validate_row_count,
            self._validate_latest_wins,
        ):
            check(merge_inputs, merge_output)

        return self._issues

    def _validate_column_preservation(
        self,
        inputs: list[TableProfile],
        output: TableProfile,
    ) -> None:
        """Report input columns that are absent from the merged profile."""
        input_names = set()
        for table_profile in inputs:
            for column in table_profile.columns:
                input_names.add(column.name)

        merged_names = {column.name for column in output.columns}

        missing = input_names - merged_names
        if not missing:
            return

        self.add_issue(
            ValidationSeverity.ERROR,
            f"Columns lost during merge: {missing}",
            recommendation="Check merge logic for column preservation",
        )

    def _validate_row_count(
        self,
        inputs: list[TableProfile],
        output: TableProfile,
    ) -> None:
        """Warn when the merged row count differs from the sum of the inputs."""
        expected_rows = sum(table_profile.row_count for table_profile in inputs)
        if output.row_count == expected_rows:
            return

        self.add_issue(
            ValidationSeverity.WARNING,
            "Merged row count doesn't match sum of inputs",
            expected=expected_rows,
            actual=output.row_count,
            recommendation="This may be expected if merging overlapping data",
        )

    def _validate_latest_wins(
        self,
        inputs: list[TableProfile],
        output: TableProfile,
    ) -> None:
        """Warn when a merged column is not the one from the latest input.

        Inputs are ordered by their table-level ``profiled_at``; for a column
        name that occurs in several inputs the last one in that order is the
        expected winner. Columns are matched on their own ``profiled_at``.
        """
        oldest_first = sorted(inputs, key=lambda p: p.profiled_at)

        # Later profiles overwrite earlier ones for the same column name.
        expected_by_name = {
            column.name: column
            for table_profile in oldest_first
            for column in table_profile.columns
        }

        merged_by_name = {column.name: column for column in output.columns}

        for column_name, expected_column in expected_by_name.items():
            if column_name not in merged_by_name:
                continue

            merged_column = merged_by_name[column_name]
            if merged_column.profiled_at == expected_column.profiled_at:
                continue

            self.add_issue(
                ValidationSeverity.WARNING,
                "Merged column doesn't use latest profile",
                column_name=column_name,
                expected=expected_column.profiled_at.isoformat(),
                actual=merged_column.profiled_at.isoformat(),
            )
|
|
979
|
+
|
|
980
|
+
|
|
981
|
+
# =============================================================================
|
|
982
|
+
# Data Integrity Validators
|
|
983
|
+
# =============================================================================
|
|
984
|
+
|
|
985
|
+
|
|
986
|
+
class DataIntegrityValidator(BaseValidator):
    """Validates data integrity in profiles.

    The profile's row count, column count and per-column null counts are
    recomputed from ``context.data`` and compared with the profiled values.
    """

    @property
    def name(self) -> str:
        """Name of this validator."""
        return "data_integrity"

    @property
    def category(self) -> ValidationCategory:
        """Validation category this validator reports under."""
        return ValidationCategory.DATA_INTEGRITY

    def validate(self, context: ValidationContext) -> list[ValidationIssue]:
        """Validate data integrity.

        The incremental profile is checked when present, otherwise the full
        profile. In every reported issue ``expected`` is the value measured
        on the data and ``actual`` is the value stored in the profile.

        Args:
            context: Validation context providing the profile(s) and ``data``.

        Returns:
            The issues recorded during this run (``self._issues``).
        """
        self.reset()

        profile = context.incremental_profile or context.full_profile
        if profile is None:
            self.add_issue(
                ValidationSeverity.ERROR,
                "No profile to validate",
            )
            return self._issues

        lf = (
            context.data.lazy()
            if isinstance(context.data, pl.DataFrame)
            else context.data
        )

        actual_row_count = lf.select(pl.len()).collect().item()

        # Row count check
        if profile.row_count != actual_row_count:
            self.add_issue(
                ValidationSeverity.ERROR,
                "Profile row count doesn't match data",
                expected=actual_row_count,
                actual=profile.row_count,
                recommendation="Profile may be stale",
            )

        # Column count check
        schema = lf.collect_schema()
        if profile.column_count != len(schema):
            self.add_issue(
                ValidationSeverity.ERROR,
                "Profile column count doesn't match data",
                expected=len(schema),
                actual=profile.column_count,
            )

        # Null counts for all profiled columns that exist in the data are
        # fetched in ONE query instead of one collect() per column.
        # dict.fromkeys de-duplicates names while keeping their order, since
        # a select cannot produce two output columns with the same name.
        present_names = list(
            dict.fromkeys(col.name for col in profile.columns if col.name in schema)
        )
        if present_names:
            null_counts = (
                lf.select([pl.col(name).null_count() for name in present_names])
                .collect()
                .row(0, named=True)
            )
        else:
            null_counts = {}

        # Check each column
        for col in profile.columns:
            if col.name not in schema:
                self.add_issue(
                    ValidationSeverity.ERROR,
                    "Profile column not found in data",
                    column_name=col.name,
                    recommendation="Schema may have changed",
                )
                continue

            # Verify null count
            actual_nulls = null_counts[col.name]
            if col.null_count != actual_nulls:
                self.add_issue(
                    ValidationSeverity.ERROR,
                    "Null count mismatch",
                    column_name=col.name,
                    expected=actual_nulls,
                    actual=col.null_count,
                )

        return self._issues
|
|
1060
|
+
|
|
1061
|
+
|
|
1062
|
+
# =============================================================================
|
|
1063
|
+
# Performance Validators
|
|
1064
|
+
# =============================================================================
|
|
1065
|
+
|
|
1066
|
+
|
|
1067
|
+
class PerformanceValidator(BaseValidator):
    """Validates performance improvements from incremental profiling.

    Compares ``profile_duration_ms`` of the incremental profile with that of
    the full profile.
    """

    @property
    def name(self) -> str:
        """Name of this validator."""
        return "performance"

    @property
    def category(self) -> ValidationCategory:
        """Validation category this validator reports under."""
        return ValidationCategory.PERFORMANCE

    def __init__(
        self,
        min_speedup: float = 1.0,
        max_overhead: float = 0.2,
        enabled: bool = True,
    ):
        """Initialize validator.

        Args:
            min_speedup: Minimum expected speedup ratio
            max_overhead: Maximum acceptable overhead ratio
            enabled: Whether validator is enabled
        """
        super().__init__(enabled)
        self.min_speedup = min_speedup
        self.max_overhead = max_overhead

    def validate(self, context: ValidationContext) -> list[ValidationIssue]:
        """Validate performance.

        When columns were skipped, the speedup (full duration divided by
        incremental duration) must reach ``min_speedup``. When nothing was
        skipped, the relative extra time of the incremental run must not
        exceed ``max_overhead``. Nothing is checked when either profile is
        missing or the full profile's duration is zero.

        Args:
            context: Validation context providing both profiles and
                ``skipped_columns``.

        Returns:
            The issues recorded during this run (``self._issues``).
        """
        self.reset()

        inc_profile = context.incremental_profile
        full_profile = context.full_profile

        if inc_profile is None or full_profile is None:
            return self._issues

        inc_duration = inc_profile.profile_duration_ms
        full_duration = full_profile.profile_duration_ms

        if full_duration == 0:
            # No baseline duration to compare against.
            return self._issues

        columns_skipped = len(context.skipped_columns)

        if columns_skipped > 0:
            # Should see improvement if columns were skipped
            speedup = full_duration / inc_duration if inc_duration > 0 else float('inf')
            if speedup < self.min_speedup:
                self.add_issue(
                    ValidationSeverity.WARNING,
                    "Incremental profiling slower than expected",
                    expected=f">= {self.min_speedup:.1f}x speedup",
                    actual=f"{speedup:.2f}x",
                    metadata={
                        "columns_skipped": columns_skipped,
                        "incremental_ms": inc_duration,
                        "full_ms": full_duration,
                    },
                )
        else:
            # If nothing skipped, check overhead isn't too high
            overhead = (inc_duration - full_duration) / full_duration if full_duration > 0 else 0
            if overhead > self.max_overhead:
                self.add_issue(
                    ValidationSeverity.WARNING,
                    "Incremental overhead too high when nothing skipped",
                    expected=f"<= {self.max_overhead:.0%} overhead",
                    actual=f"{overhead:.0%}",
                    recommendation="Check fingerprint calculation efficiency",
                )

        return self._issues
|
|
1142
|
+
|
|
1143
|
+
|
|
1144
|
+
# =============================================================================
|
|
1145
|
+
# Validator Registry
|
|
1146
|
+
# =============================================================================
|
|
1147
|
+
|
|
1148
|
+
|
|
1149
|
+
class ValidatorRegistry:
    """Registry for validators.

    Allows dynamic registration and discovery of validators. Classes are
    registered by name; one instance per name is created on first request
    and cached for later requests.
    """

    def __init__(self):
        # name -> registered validator class
        self._validators: dict[str, type[BaseValidator]] = {}
        # name -> cached instance, created lazily by get()
        self._instances: dict[str, BaseValidator] = {}

    def register(
        self,
        name: str,
        validator_class: type[BaseValidator],
    ) -> None:
        """Register a validator class.

        Registering a different class under an existing name discards the
        instance cached for that name, so the next ``get`` builds the new
        class instead of returning an instance of the replaced one.

        Args:
            name: Name to register the class under.
            validator_class: Validator class to instantiate for ``name``.
        """
        if self._validators.get(name) is not validator_class:
            self._instances.pop(name, None)
        self._validators[name] = validator_class

    def get(self, name: str, **kwargs: Any) -> BaseValidator:
        """Get or create a validator instance.

        Args:
            name: Registered validator name.
            **kwargs: Constructor arguments. They are used only when the
                instance is created; a cached instance is returned as is.

        Returns:
            The cached instance for ``name``.

        Raises:
            KeyError: If no validator is registered under ``name``.
        """
        if name not in self._instances:
            if name not in self._validators:
                raise KeyError(f"Unknown validator: {name}")
            self._instances[name] = self._validators[name](**kwargs)
        return self._instances[name]

    def get_all(self, **kwargs: Any) -> list[BaseValidator]:
        """Get all registered validators.

        Args:
            **kwargs: Passed to ``get`` for every registered name.

        Returns:
            One instance per registered name, in registration order.
        """
        return [self.get(name, **kwargs) for name in self._validators]

    def get_by_category(
        self,
        category: ValidationCategory,
        **kwargs: Any,
    ) -> list[BaseValidator]:
        """Get validators for a category.

        Args:
            category: Category the returned validators must report.
            **kwargs: Passed to ``get_all``.

        Returns:
            The validators whose ``category`` equals ``category``.
        """
        return [v for v in self.get_all(**kwargs) if v.category == category]

    @property
    def registered_names(self) -> list[str]:
        """Get names of registered validators."""
        return list(self._validators)
|
|
1197
|
+
|
|
1198
|
+
|
|
1199
|
+
# Module-level registry; ValidationRunner falls back to it when it is not
# given a registry of its own.
validator_registry = ValidatorRegistry()

# Built-in validators, registered in the order listed here.
for _builtin_name, _builtin_class in (
    ("change_detection_accuracy", ChangeDetectionAccuracyValidator),
    ("schema_change", SchemaChangeValidator),
    ("staleness", StalenessValidator),
    ("fingerprint_consistency", FingerprintConsistencyValidator),
    ("fingerprint_sensitivity", FingerprintSensitivityValidator),
    ("profile_merge", ProfileMergeValidator),
    ("data_integrity", DataIntegrityValidator),
    ("performance", PerformanceValidator),
):
    validator_registry.register(_builtin_name, _builtin_class)
# Keep the loop variables out of the module namespace.
del _builtin_name, _builtin_class
|
|
1211
|
+
|
|
1212
|
+
|
|
1213
|
+
def register_validator(name: str) -> Callable[[type[BaseValidator]], type[BaseValidator]]:
    """Build a class decorator that adds a validator to ``validator_registry``.

    Args:
        name: Name to register the decorated class under.

    Returns:
        A decorator that registers the class and returns it unchanged.
    """
    def _register(validator_class: type[BaseValidator]) -> type[BaseValidator]:
        validator_registry.register(name, validator_class)
        return validator_class

    return _register
|
|
1219
|
+
|
|
1220
|
+
|
|
1221
|
+
# =============================================================================
|
|
1222
|
+
# Validation Configuration
|
|
1223
|
+
# =============================================================================
|
|
1224
|
+
|
|
1225
|
+
|
|
1226
|
+
@dataclass
class ValidationConfig:
    """Settings that control a validation run.

    Attributes:
        validation_type: Kind of validation being performed
        enabled_validators: Names of the validators to run
        disabled_validators: Names of the validators to leave out
        max_profile_age: Age limit used by staleness checks
        tolerance: Allowed deviation in numerical comparisons
        fail_on_warning: Treat warnings as a failed validation
        fail_on_error: Treat errors as a failed validation
        collect_all_issues: Gather every issue rather than stopping on the
            first failure
    """

    validation_type: ValidationType = ValidationType.FULL
    enabled_validators: set[str] | None = None
    disabled_validators: set[str] = field(default_factory=set)
    max_profile_age: timedelta | None = None
    tolerance: float = 0.01
    fail_on_warning: bool = False
    fail_on_error: bool = True
    collect_all_issues: bool = True

    @classmethod
    def quick(cls) -> "ValidationConfig":
        """Build a config that runs only the accuracy and integrity checks."""
        selected = {
            "change_detection_accuracy",
            "data_integrity",
        }
        return cls(
            validation_type=ValidationType.QUICK,
            enabled_validators=selected,
        )

    @classmethod
    def strict(cls) -> "ValidationConfig":
        """Build a full-validation config that fails on warnings.

        The numerical tolerance is tightened to 0.001.
        """
        return cls(
            validation_type=ValidationType.FULL,
            tolerance=0.001,
            fail_on_warning=True,
        )

    @classmethod
    def change_detection_only(cls) -> "ValidationConfig":
        """Build a config limited to the change-detection validators."""
        selected = {
            "change_detection_accuracy",
            "schema_change",
            "staleness",
            "fingerprint_consistency",
            "fingerprint_sensitivity",
        }
        return cls(
            validation_type=ValidationType.CHANGE_ONLY,
            enabled_validators=selected,
        )
|
|
1283
|
+
|
|
1284
|
+
|
|
1285
|
+
# =============================================================================
|
|
1286
|
+
# Validation Runner
|
|
1287
|
+
# =============================================================================
|
|
1288
|
+
|
|
1289
|
+
|
|
1290
|
+
class ValidationRunner:
    """Runs validation checks.

    Orchestrates validators and collects results.

    Example:
        runner = ValidationRunner(ValidationConfig.strict())
        result = runner.run(context)
        if not result.passed:
            print(result.to_markdown())
    """

    def __init__(
        self,
        config: ValidationConfig | None = None,
        registry: ValidatorRegistry | None = None,
    ):
        """Initialize runner.

        Args:
            config: Validation configuration; a default ``ValidationConfig``
                is used when omitted
            registry: Validator registry; the module-level
                ``validator_registry`` is used when omitted
        """
        self.config = config or ValidationConfig()
        self.registry = registry or validator_registry

    def run(self, context: ValidationContext) -> ValidationResult:
        """Run validation.

        Every selected, enabled validator is executed. A validator that
        raises is reported as a CRITICAL issue and counted as a failed check.
        With ``collect_all_issues`` disabled, the run stops after the first
        validator that produced a CRITICAL issue or raised.

        Args:
            context: Validation context. Its ``config`` attribute is
                overwritten with this runner's config.

        Returns:
            Validation result
        """
        start_time = time.perf_counter()
        context.config = self.config

        all_issues: list[ValidationIssue] = []
        metrics = ValidationMetrics()
        details: dict[ValidationCategory, dict[str, Any]] = {}

        failing_severities = (ValidationSeverity.ERROR, ValidationSeverity.CRITICAL)

        for validator in self._get_validators():
            if not validator.enabled:
                metrics.skipped_checks += 1
                continue

            # Count the check before running it, so a validator that crashes
            # is part of the total and failed_checks can never exceed it.
            metrics.total_checks += 1

            try:
                issues = validator.validate(context)
                all_issues.extend(issues)

                # A check fails only on ERROR/CRITICAL; INFO and WARNING
                # issues still count as passed here.
                if any(i.severity in failing_severities for i in issues):
                    metrics.failed_checks += 1
                else:
                    metrics.passed_checks += 1

                # Track details per category
                category_details = details.setdefault(
                    validator.category, {"validators_run": []}
                )
                category_details["validators_run"].append(validator.name)

                if not self.config.collect_all_issues and any(
                    i.severity == ValidationSeverity.CRITICAL for i in issues
                ):
                    break

            except Exception as e:
                logger.exception(f"Validator {validator.name} failed: {e}")
                all_issues.append(ValidationIssue(
                    category=validator.category,
                    severity=ValidationSeverity.CRITICAL,
                    message=f"Validator {validator.name} raised exception: {e}",
                ))
                metrics.failed_checks += 1

                # The crash was just recorded as CRITICAL, so it ends a
                # stop-on-first-failure run like any other CRITICAL issue.
                if not self.config.collect_all_issues:
                    break

        # Calculate additional metrics
        metrics.duration_ms = (time.perf_counter() - start_time) * 1000
        metrics.columns_validated = len(context.profiled_columns | context.skipped_columns)
        metrics.changes_detected = len(context.profiled_columns)

        # Count false positives/negatives
        for issue in all_issues:
            if issue.metadata.get("false_positive"):
                metrics.false_positives += 1
            if issue.metadata.get("false_negative"):
                metrics.false_negatives += 1

        return ValidationResult(
            passed=self._determine_passed(all_issues),
            validation_type=self.config.validation_type,
            issues=all_issues,
            metrics=metrics,
            config={"tolerance": self.config.tolerance},
            details=details,
        )

    def _get_validators(self) -> list[BaseValidator]:
        """Get validators to run based on config.

        Returns:
            The registry's validators restricted to ``enabled_validators``
            (when that set is non-empty; ``None`` or an empty set selects
            all) and with ``disabled_validators`` removed.
        """
        validators = self.registry.get_all()

        enabled_names = self.config.enabled_validators
        if enabled_names:
            validators = [v for v in validators if v.name in enabled_names]

        # Remove disabled
        disabled_names = self.config.disabled_validators
        return [v for v in validators if v.name not in disabled_names]

    def _determine_passed(self, issues: list[ValidationIssue]) -> bool:
        """Determine if validation passed.

        Args:
            issues: All issues collected during the run.

        Returns:
            False if any issue is CRITICAL, or is an ERROR while
            ``fail_on_error`` is set, or is a WARNING while
            ``fail_on_warning`` is set; True otherwise.
        """
        for issue in issues:
            if issue.severity == ValidationSeverity.CRITICAL:
                return False
            if issue.severity == ValidationSeverity.ERROR and self.config.fail_on_error:
                return False
            if issue.severity == ValidationSeverity.WARNING and self.config.fail_on_warning:
                return False
        return True
|
|
1433
|
+
|
|
1434
|
+
|
|
1435
|
+
# =============================================================================
|
|
1436
|
+
# Main Validator Class
|
|
1437
|
+
# =============================================================================
|
|
1438
|
+
|
|
1439
|
+
|
|
1440
|
+
class IncrementalValidator:
    """Main validator for incremental profiling.

    Provides a high-level interface for validation. ``validate`` and
    ``validate_full`` use the runner built from the configuration given to
    ``__init__``; ``validate_change_detection`` and ``validate_merge`` build
    their own dedicated configuration and runner and do not use
    ``self.config``.

    Example:
        validator = IncrementalValidator()

        # Validate change detection
        result = validator.validate_change_detection(
            data=df,
            original_profile=profile1,
            incremental_profile=profile2,
        )

        # Validate profile merge
        result = validator.validate_merge(
            profiles=[profile1, profile2],
            merged_profile=merged,
        )

        # Full validation with profiling
        result = validator.validate_full(
            data=df,
            original_profile=profile1,
        )
    """

    def __init__(
        self,
        config: ValidationConfig | None = None,
    ):
        """Initialize validator.

        Args:
            config: Validation configuration. A default ``ValidationConfig()``
                is created when this is None (or otherwise falsy).
        """
        self.config = config or ValidationConfig()
        self.runner = ValidationRunner(self.config)

    def validate(
        self,
        data: pl.LazyFrame | pl.DataFrame,
        *,
        original_profile: TableProfile | None = None,
        incremental_profile: TableProfile | None = None,
        full_profile: TableProfile | None = None,
        profiled_columns: set[str] | None = None,
        skipped_columns: set[str] | None = None,
        change_reasons: dict[str, ChangeReason] | None = None,
    ) -> ValidationResult:
        """Validate incremental profiling results.

        Packs the arguments into a ``ValidationContext`` and runs it through
        the runner created in ``__init__``.

        Args:
            data: Data that was profiled
            original_profile: Previous profile
            incremental_profile: Incremental profile to validate
            full_profile: Full profile for comparison
            profiled_columns: Columns that were re-profiled; an empty set is
                used when omitted
            skipped_columns: Columns that were skipped; an empty set is used
                when omitted
            change_reasons: Reasons for re-profiling; an empty dict is used
                when omitted

        Returns:
            Validation result
        """
        context = ValidationContext(
            data=data,
            original_profile=original_profile,
            incremental_profile=incremental_profile,
            full_profile=full_profile,
            profiled_columns=profiled_columns or set(),
            skipped_columns=skipped_columns or set(),
            change_reasons=change_reasons or {},
        )

        return self.runner.run(context)

    def validate_change_detection(
        self,
        data: pl.LazyFrame | pl.DataFrame,
        original_profile: TableProfile,
        incremental_profile: TableProfile,
        *,
        full_profile: TableProfile | None = None,
    ) -> ValidationResult:
        """Validate change detection specifically.

        Focuses on accuracy of change detection. Uses
        ``ValidationConfig.change_detection_only()`` with a fresh runner, so
        the configuration passed to ``__init__`` is not applied here.

        Args:
            data: Current data
            original_profile: Previous profile
            incremental_profile: New incremental profile
            full_profile: Optional full profile for comparison

        Returns:
            Validation result
        """
        config = ValidationConfig.change_detection_only()
        runner = ValidationRunner(config)

        context = ValidationContext(
            data=data,
            original_profile=original_profile,
            incremental_profile=incremental_profile,
            full_profile=full_profile,
        )

        return runner.run(context)

    def validate_merge(
        self,
        profiles: list[TableProfile],
        merged_profile: TableProfile,
    ) -> ValidationResult:
        """Validate profile merge.

        Runs only the ``profile_merge`` validator with a fresh
        ``MERGE_ONLY`` configuration; the configuration passed to
        ``__init__`` is not applied here.

        Args:
            profiles: Input profiles
            merged_profile: Merged output profile

        Returns:
            Validation result
        """
        config = ValidationConfig(
            validation_type=ValidationType.MERGE_ONLY,
            enabled_validators={"profile_merge"},
        )
        runner = ValidationRunner(config)

        # Create context with merge data
        context = ValidationContext(
            data=pl.DataFrame(),  # Empty, not needed for merge validation
        )
        # The merge inputs/outputs are attached after construction rather
        # than passed to the constructor.
        # NOTE(review): presumably read back by the profile_merge validator
        # via these attribute names - confirm against that validator.
        setattr(context, 'merge_inputs', profiles)
        setattr(context, 'merge_output', merged_profile)

        return runner.run(context)

    def validate_full(
        self,
        data: pl.LazyFrame | pl.DataFrame,
        original_profile: TableProfile,
        *,
        incremental_config: IncrementalConfig | None = None,
    ) -> ValidationResult:
        """Full validation with actual profiling.

        Performs incremental profiling, full profiling, and compares results.
        Fingerprints are calculated per column on a best-effort basis: a
        column whose fingerprint cannot be calculated is logged and left out
        of ``current_fingerprints`` instead of aborting the validation.

        Args:
            data: Data to profile
            original_profile: Previous profile
            incremental_config: Incremental profiling configuration

        Returns:
            Validation result
        """
        # Perform incremental profiling
        inc_profiler = IncrementalProfiler(config=incremental_config)
        inc_profile = inc_profiler.profile(data, previous=original_profile)

        # Perform full profiling for comparison.
        # NOTE(review): imported inside the method as in the original -
        # presumably to avoid an import cycle; confirm before hoisting.
        from truthound.profiler.table_profiler import DataProfiler
        full_profiler = DataProfiler()

        lf = data.lazy() if isinstance(data, pl.DataFrame) else data
        full_profile = full_profiler.profile(lf)

        # Calculate fingerprints
        fp_calculator = FingerprintCalculator()
        current_fps = {}

        schema = lf.collect_schema()
        for col_name in schema.names():
            try:
                current_fps[col_name] = fp_calculator.calculate(lf, col_name)
            except Exception as exc:
                # Best effort: keep validating the remaining columns, but
                # record the failure so a missing fingerprint is traceable.
                logger.warning(
                    "Fingerprint calculation failed for column %s: %s",
                    col_name,
                    exc,
                )

        context = ValidationContext(
            data=data,
            original_profile=original_profile,
            incremental_profile=inc_profile,
            full_profile=full_profile,
            current_fingerprints=current_fps,
            # Bookkeeping read from the profiler after profile() has run.
            profiled_columns=inc_profiler.last_profiled_columns,
            skipped_columns=inc_profiler.last_skipped_columns,
            change_reasons=inc_profiler.last_change_reasons,
        )

        return self.runner.run(context)
|
|
1632
|
+
|
|
1633
|
+
|
|
1634
|
+
# =============================================================================
|
|
1635
|
+
# Convenience Functions
|
|
1636
|
+
# =============================================================================
|
|
1637
|
+
|
|
1638
|
+
|
|
1639
|
+
def validate_incremental(
    data: pl.LazyFrame | pl.DataFrame,
    original_profile: TableProfile,
    incremental_profile: TableProfile,
    *,
    full_profile: TableProfile | None = None,
    strict: bool = False,
) -> ValidationResult:
    """Validate an incremental profile in a single call.

    Builds an ``IncrementalValidator`` with either the strict or the default
    configuration and delegates to its ``validate`` method.

    Args:
        data: Data that was profiled
        original_profile: Previous profile
        incremental_profile: New incremental profile
        full_profile: Optional full profile for comparison
        strict: When true, ``ValidationConfig.strict()`` is used instead of
            the default ``ValidationConfig()``

    Returns:
        Validation result
    """
    if strict:
        chosen_config = ValidationConfig.strict()
    else:
        chosen_config = ValidationConfig()

    return IncrementalValidator(chosen_config).validate(
        data=data,
        original_profile=original_profile,
        incremental_profile=incremental_profile,
        full_profile=full_profile,
    )
|
|
1668
|
+
|
|
1669
|
+
|
|
1670
|
+
def validate_merge(
    profiles: list[TableProfile],
    merged_profile: TableProfile,
) -> ValidationResult:
    """Validate a profile merge in a single call.

    Delegates to ``IncrementalValidator.validate_merge`` on a validator
    created with its default configuration.

    Args:
        profiles: Input profiles
        merged_profile: Merged output

    Returns:
        Validation result
    """
    return IncrementalValidator().validate_merge(
        profiles=profiles,
        merged_profile=merged_profile,
    )
|
|
1685
|
+
|
|
1686
|
+
|
|
1687
|
+
def validate_fingerprints(
    data: pl.LazyFrame | pl.DataFrame,
    fingerprints: dict[str, ColumnFingerprint],
) -> ValidationResult:
    """Validate fingerprint consistency.

    Runs a ``ValidationRunner`` restricted to the
    ``fingerprint_consistency`` validator over a context holding ``data``
    and ``fingerprints``.

    Args:
        data: Data to check fingerprints against
        fingerprints: Fingerprints to validate, passed to the context as
            ``current_fingerprints``

    Returns:
        Validation result
    """
    fingerprint_only = ValidationConfig(enabled_validators={"fingerprint_consistency"})
    fingerprint_runner = ValidationRunner(fingerprint_only)
    ctx = ValidationContext(data=data, current_fingerprints=fingerprints)
    return fingerprint_runner.run(ctx)
|