truthound 1.0.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- truthound/__init__.py +162 -0
- truthound/adapters.py +100 -0
- truthound/api.py +365 -0
- truthound/audit/__init__.py +248 -0
- truthound/audit/core.py +967 -0
- truthound/audit/filters.py +620 -0
- truthound/audit/formatters.py +707 -0
- truthound/audit/logger.py +902 -0
- truthound/audit/middleware.py +571 -0
- truthound/audit/storage.py +1083 -0
- truthound/benchmark/__init__.py +123 -0
- truthound/benchmark/base.py +757 -0
- truthound/benchmark/comparison.py +635 -0
- truthound/benchmark/generators.py +706 -0
- truthound/benchmark/reporters.py +718 -0
- truthound/benchmark/runner.py +635 -0
- truthound/benchmark/scenarios.py +712 -0
- truthound/cache.py +252 -0
- truthound/checkpoint/__init__.py +136 -0
- truthound/checkpoint/actions/__init__.py +164 -0
- truthound/checkpoint/actions/base.py +324 -0
- truthound/checkpoint/actions/custom.py +234 -0
- truthound/checkpoint/actions/discord_notify.py +290 -0
- truthound/checkpoint/actions/email_notify.py +405 -0
- truthound/checkpoint/actions/github_action.py +406 -0
- truthound/checkpoint/actions/opsgenie.py +1499 -0
- truthound/checkpoint/actions/pagerduty.py +226 -0
- truthound/checkpoint/actions/slack_notify.py +233 -0
- truthound/checkpoint/actions/store_result.py +249 -0
- truthound/checkpoint/actions/teams_notify.py +1570 -0
- truthound/checkpoint/actions/telegram_notify.py +419 -0
- truthound/checkpoint/actions/update_docs.py +552 -0
- truthound/checkpoint/actions/webhook.py +293 -0
- truthound/checkpoint/analytics/__init__.py +147 -0
- truthound/checkpoint/analytics/aggregations/__init__.py +23 -0
- truthound/checkpoint/analytics/aggregations/rollup.py +481 -0
- truthound/checkpoint/analytics/aggregations/time_bucket.py +306 -0
- truthound/checkpoint/analytics/analyzers/__init__.py +17 -0
- truthound/checkpoint/analytics/analyzers/anomaly.py +386 -0
- truthound/checkpoint/analytics/analyzers/base.py +270 -0
- truthound/checkpoint/analytics/analyzers/forecast.py +421 -0
- truthound/checkpoint/analytics/analyzers/trend.py +314 -0
- truthound/checkpoint/analytics/models.py +292 -0
- truthound/checkpoint/analytics/protocols.py +549 -0
- truthound/checkpoint/analytics/service.py +718 -0
- truthound/checkpoint/analytics/stores/__init__.py +16 -0
- truthound/checkpoint/analytics/stores/base.py +306 -0
- truthound/checkpoint/analytics/stores/memory_store.py +353 -0
- truthound/checkpoint/analytics/stores/sqlite_store.py +557 -0
- truthound/checkpoint/analytics/stores/timescale_store.py +501 -0
- truthound/checkpoint/async_actions.py +794 -0
- truthound/checkpoint/async_base.py +708 -0
- truthound/checkpoint/async_checkpoint.py +617 -0
- truthound/checkpoint/async_runner.py +639 -0
- truthound/checkpoint/checkpoint.py +527 -0
- truthound/checkpoint/ci/__init__.py +61 -0
- truthound/checkpoint/ci/detector.py +355 -0
- truthound/checkpoint/ci/reporter.py +436 -0
- truthound/checkpoint/ci/templates.py +454 -0
- truthound/checkpoint/circuitbreaker/__init__.py +133 -0
- truthound/checkpoint/circuitbreaker/breaker.py +542 -0
- truthound/checkpoint/circuitbreaker/core.py +252 -0
- truthound/checkpoint/circuitbreaker/detection.py +459 -0
- truthound/checkpoint/circuitbreaker/middleware.py +389 -0
- truthound/checkpoint/circuitbreaker/registry.py +357 -0
- truthound/checkpoint/distributed/__init__.py +139 -0
- truthound/checkpoint/distributed/backends/__init__.py +35 -0
- truthound/checkpoint/distributed/backends/celery_backend.py +503 -0
- truthound/checkpoint/distributed/backends/kubernetes_backend.py +696 -0
- truthound/checkpoint/distributed/backends/local_backend.py +397 -0
- truthound/checkpoint/distributed/backends/ray_backend.py +625 -0
- truthound/checkpoint/distributed/base.py +774 -0
- truthound/checkpoint/distributed/orchestrator.py +765 -0
- truthound/checkpoint/distributed/protocols.py +842 -0
- truthound/checkpoint/distributed/registry.py +449 -0
- truthound/checkpoint/idempotency/__init__.py +120 -0
- truthound/checkpoint/idempotency/core.py +295 -0
- truthound/checkpoint/idempotency/fingerprint.py +454 -0
- truthound/checkpoint/idempotency/locking.py +604 -0
- truthound/checkpoint/idempotency/service.py +592 -0
- truthound/checkpoint/idempotency/stores.py +653 -0
- truthound/checkpoint/monitoring/__init__.py +134 -0
- truthound/checkpoint/monitoring/aggregators/__init__.py +15 -0
- truthound/checkpoint/monitoring/aggregators/base.py +372 -0
- truthound/checkpoint/monitoring/aggregators/realtime.py +300 -0
- truthound/checkpoint/monitoring/aggregators/window.py +493 -0
- truthound/checkpoint/monitoring/collectors/__init__.py +17 -0
- truthound/checkpoint/monitoring/collectors/base.py +257 -0
- truthound/checkpoint/monitoring/collectors/memory_collector.py +617 -0
- truthound/checkpoint/monitoring/collectors/prometheus_collector.py +451 -0
- truthound/checkpoint/monitoring/collectors/redis_collector.py +518 -0
- truthound/checkpoint/monitoring/events.py +410 -0
- truthound/checkpoint/monitoring/protocols.py +636 -0
- truthound/checkpoint/monitoring/service.py +578 -0
- truthound/checkpoint/monitoring/views/__init__.py +17 -0
- truthound/checkpoint/monitoring/views/base.py +172 -0
- truthound/checkpoint/monitoring/views/queue_view.py +220 -0
- truthound/checkpoint/monitoring/views/task_view.py +240 -0
- truthound/checkpoint/monitoring/views/worker_view.py +263 -0
- truthound/checkpoint/registry.py +337 -0
- truthound/checkpoint/runner.py +356 -0
- truthound/checkpoint/transaction/__init__.py +133 -0
- truthound/checkpoint/transaction/base.py +389 -0
- truthound/checkpoint/transaction/compensatable.py +537 -0
- truthound/checkpoint/transaction/coordinator.py +576 -0
- truthound/checkpoint/transaction/executor.py +622 -0
- truthound/checkpoint/transaction/idempotency.py +534 -0
- truthound/checkpoint/transaction/saga/__init__.py +143 -0
- truthound/checkpoint/transaction/saga/builder.py +584 -0
- truthound/checkpoint/transaction/saga/definition.py +515 -0
- truthound/checkpoint/transaction/saga/event_store.py +542 -0
- truthound/checkpoint/transaction/saga/patterns.py +833 -0
- truthound/checkpoint/transaction/saga/runner.py +718 -0
- truthound/checkpoint/transaction/saga/state_machine.py +793 -0
- truthound/checkpoint/transaction/saga/strategies.py +780 -0
- truthound/checkpoint/transaction/saga/testing.py +886 -0
- truthound/checkpoint/triggers/__init__.py +58 -0
- truthound/checkpoint/triggers/base.py +237 -0
- truthound/checkpoint/triggers/event.py +385 -0
- truthound/checkpoint/triggers/schedule.py +355 -0
- truthound/cli.py +2358 -0
- truthound/cli_modules/__init__.py +124 -0
- truthound/cli_modules/advanced/__init__.py +45 -0
- truthound/cli_modules/advanced/benchmark.py +343 -0
- truthound/cli_modules/advanced/docs.py +225 -0
- truthound/cli_modules/advanced/lineage.py +209 -0
- truthound/cli_modules/advanced/ml.py +320 -0
- truthound/cli_modules/advanced/realtime.py +196 -0
- truthound/cli_modules/checkpoint/__init__.py +46 -0
- truthound/cli_modules/checkpoint/init.py +114 -0
- truthound/cli_modules/checkpoint/list.py +71 -0
- truthound/cli_modules/checkpoint/run.py +159 -0
- truthound/cli_modules/checkpoint/validate.py +67 -0
- truthound/cli_modules/common/__init__.py +71 -0
- truthound/cli_modules/common/errors.py +414 -0
- truthound/cli_modules/common/options.py +419 -0
- truthound/cli_modules/common/output.py +507 -0
- truthound/cli_modules/common/protocol.py +552 -0
- truthound/cli_modules/core/__init__.py +48 -0
- truthound/cli_modules/core/check.py +123 -0
- truthound/cli_modules/core/compare.py +104 -0
- truthound/cli_modules/core/learn.py +57 -0
- truthound/cli_modules/core/mask.py +77 -0
- truthound/cli_modules/core/profile.py +65 -0
- truthound/cli_modules/core/scan.py +61 -0
- truthound/cli_modules/profiler/__init__.py +51 -0
- truthound/cli_modules/profiler/auto_profile.py +175 -0
- truthound/cli_modules/profiler/metadata.py +107 -0
- truthound/cli_modules/profiler/suite.py +283 -0
- truthound/cli_modules/registry.py +431 -0
- truthound/cli_modules/scaffolding/__init__.py +89 -0
- truthound/cli_modules/scaffolding/base.py +631 -0
- truthound/cli_modules/scaffolding/commands.py +545 -0
- truthound/cli_modules/scaffolding/plugins.py +1072 -0
- truthound/cli_modules/scaffolding/reporters.py +594 -0
- truthound/cli_modules/scaffolding/validators.py +1127 -0
- truthound/common/__init__.py +18 -0
- truthound/common/resilience/__init__.py +130 -0
- truthound/common/resilience/bulkhead.py +266 -0
- truthound/common/resilience/circuit_breaker.py +516 -0
- truthound/common/resilience/composite.py +332 -0
- truthound/common/resilience/config.py +292 -0
- truthound/common/resilience/protocols.py +217 -0
- truthound/common/resilience/rate_limiter.py +404 -0
- truthound/common/resilience/retry.py +341 -0
- truthound/datadocs/__init__.py +260 -0
- truthound/datadocs/base.py +571 -0
- truthound/datadocs/builder.py +761 -0
- truthound/datadocs/charts.py +764 -0
- truthound/datadocs/dashboard/__init__.py +63 -0
- truthound/datadocs/dashboard/app.py +576 -0
- truthound/datadocs/dashboard/components.py +584 -0
- truthound/datadocs/dashboard/state.py +240 -0
- truthound/datadocs/engine/__init__.py +46 -0
- truthound/datadocs/engine/context.py +376 -0
- truthound/datadocs/engine/pipeline.py +618 -0
- truthound/datadocs/engine/registry.py +469 -0
- truthound/datadocs/exporters/__init__.py +49 -0
- truthound/datadocs/exporters/base.py +198 -0
- truthound/datadocs/exporters/html.py +178 -0
- truthound/datadocs/exporters/json_exporter.py +253 -0
- truthound/datadocs/exporters/markdown.py +284 -0
- truthound/datadocs/exporters/pdf.py +392 -0
- truthound/datadocs/i18n/__init__.py +86 -0
- truthound/datadocs/i18n/catalog.py +960 -0
- truthound/datadocs/i18n/formatting.py +505 -0
- truthound/datadocs/i18n/loader.py +256 -0
- truthound/datadocs/i18n/plurals.py +378 -0
- truthound/datadocs/renderers/__init__.py +42 -0
- truthound/datadocs/renderers/base.py +401 -0
- truthound/datadocs/renderers/custom.py +342 -0
- truthound/datadocs/renderers/jinja.py +697 -0
- truthound/datadocs/sections.py +736 -0
- truthound/datadocs/styles.py +931 -0
- truthound/datadocs/themes/__init__.py +101 -0
- truthound/datadocs/themes/base.py +336 -0
- truthound/datadocs/themes/default.py +417 -0
- truthound/datadocs/themes/enterprise.py +419 -0
- truthound/datadocs/themes/loader.py +336 -0
- truthound/datadocs/themes.py +301 -0
- truthound/datadocs/transformers/__init__.py +57 -0
- truthound/datadocs/transformers/base.py +268 -0
- truthound/datadocs/transformers/enrichers.py +544 -0
- truthound/datadocs/transformers/filters.py +447 -0
- truthound/datadocs/transformers/i18n.py +468 -0
- truthound/datadocs/versioning/__init__.py +62 -0
- truthound/datadocs/versioning/diff.py +639 -0
- truthound/datadocs/versioning/storage.py +497 -0
- truthound/datadocs/versioning/version.py +358 -0
- truthound/datasources/__init__.py +223 -0
- truthound/datasources/_async_protocols.py +222 -0
- truthound/datasources/_protocols.py +159 -0
- truthound/datasources/adapters.py +428 -0
- truthound/datasources/async_base.py +599 -0
- truthound/datasources/async_factory.py +511 -0
- truthound/datasources/base.py +516 -0
- truthound/datasources/factory.py +433 -0
- truthound/datasources/nosql/__init__.py +47 -0
- truthound/datasources/nosql/base.py +487 -0
- truthound/datasources/nosql/elasticsearch.py +801 -0
- truthound/datasources/nosql/mongodb.py +636 -0
- truthound/datasources/pandas_optimized.py +582 -0
- truthound/datasources/pandas_source.py +216 -0
- truthound/datasources/polars_source.py +395 -0
- truthound/datasources/spark_source.py +479 -0
- truthound/datasources/sql/__init__.py +154 -0
- truthound/datasources/sql/base.py +710 -0
- truthound/datasources/sql/bigquery.py +410 -0
- truthound/datasources/sql/cloud_base.py +199 -0
- truthound/datasources/sql/databricks.py +471 -0
- truthound/datasources/sql/mysql.py +316 -0
- truthound/datasources/sql/oracle.py +427 -0
- truthound/datasources/sql/postgresql.py +321 -0
- truthound/datasources/sql/redshift.py +479 -0
- truthound/datasources/sql/snowflake.py +439 -0
- truthound/datasources/sql/sqlite.py +286 -0
- truthound/datasources/sql/sqlserver.py +437 -0
- truthound/datasources/streaming/__init__.py +47 -0
- truthound/datasources/streaming/base.py +350 -0
- truthound/datasources/streaming/kafka.py +670 -0
- truthound/decorators.py +98 -0
- truthound/docs/__init__.py +69 -0
- truthound/docs/extractor.py +971 -0
- truthound/docs/generator.py +601 -0
- truthound/docs/parser.py +1037 -0
- truthound/docs/renderer.py +999 -0
- truthound/drift/__init__.py +22 -0
- truthound/drift/compare.py +189 -0
- truthound/drift/detectors.py +464 -0
- truthound/drift/report.py +160 -0
- truthound/execution/__init__.py +65 -0
- truthound/execution/_protocols.py +324 -0
- truthound/execution/base.py +576 -0
- truthound/execution/distributed/__init__.py +179 -0
- truthound/execution/distributed/aggregations.py +731 -0
- truthound/execution/distributed/arrow_bridge.py +817 -0
- truthound/execution/distributed/base.py +550 -0
- truthound/execution/distributed/dask_engine.py +976 -0
- truthound/execution/distributed/mixins.py +766 -0
- truthound/execution/distributed/protocols.py +756 -0
- truthound/execution/distributed/ray_engine.py +1127 -0
- truthound/execution/distributed/registry.py +446 -0
- truthound/execution/distributed/spark_engine.py +1011 -0
- truthound/execution/distributed/validator_adapter.py +682 -0
- truthound/execution/pandas_engine.py +401 -0
- truthound/execution/polars_engine.py +497 -0
- truthound/execution/pushdown/__init__.py +230 -0
- truthound/execution/pushdown/ast.py +1550 -0
- truthound/execution/pushdown/builder.py +1550 -0
- truthound/execution/pushdown/dialects.py +1072 -0
- truthound/execution/pushdown/executor.py +829 -0
- truthound/execution/pushdown/optimizer.py +1041 -0
- truthound/execution/sql_engine.py +518 -0
- truthound/infrastructure/__init__.py +189 -0
- truthound/infrastructure/audit.py +1515 -0
- truthound/infrastructure/config.py +1133 -0
- truthound/infrastructure/encryption.py +1132 -0
- truthound/infrastructure/logging.py +1503 -0
- truthound/infrastructure/metrics.py +1220 -0
- truthound/lineage/__init__.py +89 -0
- truthound/lineage/base.py +746 -0
- truthound/lineage/impact_analysis.py +474 -0
- truthound/lineage/integrations/__init__.py +22 -0
- truthound/lineage/integrations/openlineage.py +548 -0
- truthound/lineage/tracker.py +512 -0
- truthound/lineage/visualization/__init__.py +33 -0
- truthound/lineage/visualization/protocols.py +145 -0
- truthound/lineage/visualization/renderers/__init__.py +20 -0
- truthound/lineage/visualization/renderers/cytoscape.py +329 -0
- truthound/lineage/visualization/renderers/d3.py +331 -0
- truthound/lineage/visualization/renderers/graphviz.py +276 -0
- truthound/lineage/visualization/renderers/mermaid.py +308 -0
- truthound/maskers.py +113 -0
- truthound/ml/__init__.py +124 -0
- truthound/ml/anomaly_models/__init__.py +31 -0
- truthound/ml/anomaly_models/ensemble.py +362 -0
- truthound/ml/anomaly_models/isolation_forest.py +444 -0
- truthound/ml/anomaly_models/statistical.py +392 -0
- truthound/ml/base.py +1178 -0
- truthound/ml/drift_detection/__init__.py +26 -0
- truthound/ml/drift_detection/concept.py +381 -0
- truthound/ml/drift_detection/distribution.py +361 -0
- truthound/ml/drift_detection/feature.py +442 -0
- truthound/ml/drift_detection/multivariate.py +495 -0
- truthound/ml/monitoring/__init__.py +88 -0
- truthound/ml/monitoring/alerting/__init__.py +33 -0
- truthound/ml/monitoring/alerting/handlers.py +427 -0
- truthound/ml/monitoring/alerting/rules.py +508 -0
- truthound/ml/monitoring/collectors/__init__.py +19 -0
- truthound/ml/monitoring/collectors/composite.py +105 -0
- truthound/ml/monitoring/collectors/drift.py +324 -0
- truthound/ml/monitoring/collectors/performance.py +179 -0
- truthound/ml/monitoring/collectors/quality.py +369 -0
- truthound/ml/monitoring/monitor.py +536 -0
- truthound/ml/monitoring/protocols.py +451 -0
- truthound/ml/monitoring/stores/__init__.py +15 -0
- truthound/ml/monitoring/stores/memory.py +201 -0
- truthound/ml/monitoring/stores/prometheus.py +296 -0
- truthound/ml/rule_learning/__init__.py +25 -0
- truthound/ml/rule_learning/constraint_miner.py +443 -0
- truthound/ml/rule_learning/pattern_learner.py +499 -0
- truthound/ml/rule_learning/profile_learner.py +462 -0
- truthound/multitenancy/__init__.py +326 -0
- truthound/multitenancy/core.py +852 -0
- truthound/multitenancy/integration.py +597 -0
- truthound/multitenancy/isolation.py +630 -0
- truthound/multitenancy/manager.py +770 -0
- truthound/multitenancy/middleware.py +765 -0
- truthound/multitenancy/quota.py +537 -0
- truthound/multitenancy/resolvers.py +603 -0
- truthound/multitenancy/storage.py +703 -0
- truthound/observability/__init__.py +307 -0
- truthound/observability/context.py +531 -0
- truthound/observability/instrumentation.py +611 -0
- truthound/observability/logging.py +887 -0
- truthound/observability/metrics.py +1157 -0
- truthound/observability/tracing/__init__.py +178 -0
- truthound/observability/tracing/baggage.py +310 -0
- truthound/observability/tracing/config.py +426 -0
- truthound/observability/tracing/exporter.py +787 -0
- truthound/observability/tracing/integration.py +1018 -0
- truthound/observability/tracing/otel/__init__.py +146 -0
- truthound/observability/tracing/otel/adapter.py +982 -0
- truthound/observability/tracing/otel/bridge.py +1177 -0
- truthound/observability/tracing/otel/compat.py +681 -0
- truthound/observability/tracing/otel/config.py +691 -0
- truthound/observability/tracing/otel/detection.py +327 -0
- truthound/observability/tracing/otel/protocols.py +426 -0
- truthound/observability/tracing/processor.py +561 -0
- truthound/observability/tracing/propagator.py +757 -0
- truthound/observability/tracing/provider.py +569 -0
- truthound/observability/tracing/resource.py +515 -0
- truthound/observability/tracing/sampler.py +487 -0
- truthound/observability/tracing/span.py +676 -0
- truthound/plugins/__init__.py +198 -0
- truthound/plugins/base.py +599 -0
- truthound/plugins/cli.py +680 -0
- truthound/plugins/dependencies/__init__.py +42 -0
- truthound/plugins/dependencies/graph.py +422 -0
- truthound/plugins/dependencies/resolver.py +417 -0
- truthound/plugins/discovery.py +379 -0
- truthound/plugins/docs/__init__.py +46 -0
- truthound/plugins/docs/extractor.py +444 -0
- truthound/plugins/docs/renderer.py +499 -0
- truthound/plugins/enterprise_manager.py +877 -0
- truthound/plugins/examples/__init__.py +19 -0
- truthound/plugins/examples/custom_validators.py +317 -0
- truthound/plugins/examples/slack_notifier.py +312 -0
- truthound/plugins/examples/xml_reporter.py +254 -0
- truthound/plugins/hooks.py +558 -0
- truthound/plugins/lifecycle/__init__.py +43 -0
- truthound/plugins/lifecycle/hot_reload.py +402 -0
- truthound/plugins/lifecycle/manager.py +371 -0
- truthound/plugins/manager.py +736 -0
- truthound/plugins/registry.py +338 -0
- truthound/plugins/security/__init__.py +93 -0
- truthound/plugins/security/exceptions.py +332 -0
- truthound/plugins/security/policies.py +348 -0
- truthound/plugins/security/protocols.py +643 -0
- truthound/plugins/security/sandbox/__init__.py +45 -0
- truthound/plugins/security/sandbox/context.py +158 -0
- truthound/plugins/security/sandbox/engines/__init__.py +19 -0
- truthound/plugins/security/sandbox/engines/container.py +379 -0
- truthound/plugins/security/sandbox/engines/noop.py +144 -0
- truthound/plugins/security/sandbox/engines/process.py +336 -0
- truthound/plugins/security/sandbox/factory.py +211 -0
- truthound/plugins/security/signing/__init__.py +57 -0
- truthound/plugins/security/signing/service.py +330 -0
- truthound/plugins/security/signing/trust_store.py +368 -0
- truthound/plugins/security/signing/verifier.py +459 -0
- truthound/plugins/versioning/__init__.py +41 -0
- truthound/plugins/versioning/constraints.py +297 -0
- truthound/plugins/versioning/resolver.py +329 -0
- truthound/profiler/__init__.py +1729 -0
- truthound/profiler/_lazy.py +452 -0
- truthound/profiler/ab_testing/__init__.py +80 -0
- truthound/profiler/ab_testing/analysis.py +449 -0
- truthound/profiler/ab_testing/base.py +257 -0
- truthound/profiler/ab_testing/experiment.py +395 -0
- truthound/profiler/ab_testing/tracking.py +368 -0
- truthound/profiler/auto_threshold.py +1170 -0
- truthound/profiler/base.py +579 -0
- truthound/profiler/cache_patterns.py +911 -0
- truthound/profiler/caching.py +1303 -0
- truthound/profiler/column_profiler.py +712 -0
- truthound/profiler/comparison.py +1007 -0
- truthound/profiler/custom_patterns.py +1170 -0
- truthound/profiler/dashboard/__init__.py +50 -0
- truthound/profiler/dashboard/app.py +476 -0
- truthound/profiler/dashboard/components.py +457 -0
- truthound/profiler/dashboard/config.py +72 -0
- truthound/profiler/distributed/__init__.py +83 -0
- truthound/profiler/distributed/base.py +281 -0
- truthound/profiler/distributed/dask_backend.py +498 -0
- truthound/profiler/distributed/local_backend.py +293 -0
- truthound/profiler/distributed/profiler.py +304 -0
- truthound/profiler/distributed/ray_backend.py +374 -0
- truthound/profiler/distributed/spark_backend.py +375 -0
- truthound/profiler/distributed.py +1366 -0
- truthound/profiler/enterprise_sampling.py +1065 -0
- truthound/profiler/errors.py +488 -0
- truthound/profiler/evolution/__init__.py +91 -0
- truthound/profiler/evolution/alerts.py +426 -0
- truthound/profiler/evolution/changes.py +206 -0
- truthound/profiler/evolution/compatibility.py +365 -0
- truthound/profiler/evolution/detector.py +372 -0
- truthound/profiler/evolution/protocols.py +121 -0
- truthound/profiler/generators/__init__.py +48 -0
- truthound/profiler/generators/base.py +384 -0
- truthound/profiler/generators/ml_rules.py +375 -0
- truthound/profiler/generators/pattern_rules.py +384 -0
- truthound/profiler/generators/schema_rules.py +267 -0
- truthound/profiler/generators/stats_rules.py +324 -0
- truthound/profiler/generators/suite_generator.py +857 -0
- truthound/profiler/i18n.py +1542 -0
- truthound/profiler/incremental.py +554 -0
- truthound/profiler/incremental_validation.py +1710 -0
- truthound/profiler/integration/__init__.py +73 -0
- truthound/profiler/integration/adapters.py +345 -0
- truthound/profiler/integration/context.py +371 -0
- truthound/profiler/integration/executor.py +527 -0
- truthound/profiler/integration/naming.py +75 -0
- truthound/profiler/integration/protocols.py +243 -0
- truthound/profiler/memory.py +1185 -0
- truthound/profiler/migration/__init__.py +60 -0
- truthound/profiler/migration/base.py +345 -0
- truthound/profiler/migration/manager.py +444 -0
- truthound/profiler/migration/v1_0_to_v1_1.py +484 -0
- truthound/profiler/ml/__init__.py +73 -0
- truthound/profiler/ml/base.py +244 -0
- truthound/profiler/ml/classifier.py +507 -0
- truthound/profiler/ml/feature_extraction.py +604 -0
- truthound/profiler/ml/pretrained.py +448 -0
- truthound/profiler/ml_inference.py +1276 -0
- truthound/profiler/native_patterns.py +815 -0
- truthound/profiler/observability.py +1184 -0
- truthound/profiler/process_timeout.py +1566 -0
- truthound/profiler/progress.py +568 -0
- truthound/profiler/progress_callbacks.py +1734 -0
- truthound/profiler/quality.py +1345 -0
- truthound/profiler/resilience.py +1180 -0
- truthound/profiler/sampled_matcher.py +794 -0
- truthound/profiler/sampling.py +1288 -0
- truthound/profiler/scheduling/__init__.py +82 -0
- truthound/profiler/scheduling/protocols.py +214 -0
- truthound/profiler/scheduling/scheduler.py +474 -0
- truthound/profiler/scheduling/storage.py +457 -0
- truthound/profiler/scheduling/triggers.py +449 -0
- truthound/profiler/schema.py +603 -0
- truthound/profiler/streaming.py +685 -0
- truthound/profiler/streaming_patterns.py +1354 -0
- truthound/profiler/suite_cli.py +625 -0
- truthound/profiler/suite_config.py +789 -0
- truthound/profiler/suite_export.py +1268 -0
- truthound/profiler/table_profiler.py +547 -0
- truthound/profiler/timeout.py +565 -0
- truthound/profiler/validation.py +1532 -0
- truthound/profiler/visualization/__init__.py +118 -0
- truthound/profiler/visualization/base.py +346 -0
- truthound/profiler/visualization/generator.py +1259 -0
- truthound/profiler/visualization/plotly_renderer.py +811 -0
- truthound/profiler/visualization/renderers.py +669 -0
- truthound/profiler/visualization/sections.py +540 -0
- truthound/profiler/visualization.py +2122 -0
- truthound/profiler/yaml_validation.py +1151 -0
- truthound/py.typed +0 -0
- truthound/ratelimit/__init__.py +248 -0
- truthound/ratelimit/algorithms.py +1108 -0
- truthound/ratelimit/core.py +573 -0
- truthound/ratelimit/integration.py +532 -0
- truthound/ratelimit/limiter.py +663 -0
- truthound/ratelimit/middleware.py +700 -0
- truthound/ratelimit/policy.py +792 -0
- truthound/ratelimit/storage.py +763 -0
- truthound/rbac/__init__.py +340 -0
- truthound/rbac/core.py +976 -0
- truthound/rbac/integration.py +760 -0
- truthound/rbac/manager.py +1052 -0
- truthound/rbac/middleware.py +842 -0
- truthound/rbac/policy.py +954 -0
- truthound/rbac/storage.py +878 -0
- truthound/realtime/__init__.py +141 -0
- truthound/realtime/adapters/__init__.py +43 -0
- truthound/realtime/adapters/base.py +533 -0
- truthound/realtime/adapters/kafka.py +487 -0
- truthound/realtime/adapters/kinesis.py +479 -0
- truthound/realtime/adapters/mock.py +243 -0
- truthound/realtime/base.py +553 -0
- truthound/realtime/factory.py +382 -0
- truthound/realtime/incremental.py +660 -0
- truthound/realtime/processing/__init__.py +67 -0
- truthound/realtime/processing/exactly_once.py +575 -0
- truthound/realtime/processing/state.py +547 -0
- truthound/realtime/processing/windows.py +647 -0
- truthound/realtime/protocols.py +569 -0
- truthound/realtime/streaming.py +605 -0
- truthound/realtime/testing/__init__.py +32 -0
- truthound/realtime/testing/containers.py +615 -0
- truthound/realtime/testing/fixtures.py +484 -0
- truthound/report.py +280 -0
- truthound/reporters/__init__.py +46 -0
- truthound/reporters/_protocols.py +30 -0
- truthound/reporters/base.py +324 -0
- truthound/reporters/ci/__init__.py +66 -0
- truthound/reporters/ci/azure.py +436 -0
- truthound/reporters/ci/base.py +509 -0
- truthound/reporters/ci/bitbucket.py +567 -0
- truthound/reporters/ci/circleci.py +547 -0
- truthound/reporters/ci/detection.py +364 -0
- truthound/reporters/ci/factory.py +182 -0
- truthound/reporters/ci/github.py +388 -0
- truthound/reporters/ci/gitlab.py +471 -0
- truthound/reporters/ci/jenkins.py +525 -0
- truthound/reporters/console_reporter.py +299 -0
- truthound/reporters/factory.py +211 -0
- truthound/reporters/html_reporter.py +524 -0
- truthound/reporters/json_reporter.py +256 -0
- truthound/reporters/markdown_reporter.py +280 -0
- truthound/reporters/sdk/__init__.py +174 -0
- truthound/reporters/sdk/builder.py +558 -0
- truthound/reporters/sdk/mixins.py +1150 -0
- truthound/reporters/sdk/schema.py +1493 -0
- truthound/reporters/sdk/templates.py +666 -0
- truthound/reporters/sdk/testing.py +968 -0
- truthound/scanners.py +170 -0
- truthound/scheduling/__init__.py +122 -0
- truthound/scheduling/cron.py +1136 -0
- truthound/scheduling/presets.py +212 -0
- truthound/schema.py +275 -0
- truthound/secrets/__init__.py +173 -0
- truthound/secrets/base.py +618 -0
- truthound/secrets/cloud.py +682 -0
- truthound/secrets/integration.py +507 -0
- truthound/secrets/manager.py +633 -0
- truthound/secrets/oidc/__init__.py +172 -0
- truthound/secrets/oidc/base.py +902 -0
- truthound/secrets/oidc/credential_provider.py +623 -0
- truthound/secrets/oidc/exchangers.py +1001 -0
- truthound/secrets/oidc/github/__init__.py +110 -0
- truthound/secrets/oidc/github/claims.py +718 -0
- truthound/secrets/oidc/github/enhanced_provider.py +693 -0
- truthound/secrets/oidc/github/trust_policy.py +742 -0
- truthound/secrets/oidc/github/verification.py +723 -0
- truthound/secrets/oidc/github/workflow.py +691 -0
- truthound/secrets/oidc/providers.py +825 -0
- truthound/secrets/providers.py +506 -0
- truthound/secrets/resolver.py +495 -0
- truthound/stores/__init__.py +177 -0
- truthound/stores/backends/__init__.py +18 -0
- truthound/stores/backends/_protocols.py +340 -0
- truthound/stores/backends/azure_blob.py +530 -0
- truthound/stores/backends/concurrent_filesystem.py +915 -0
- truthound/stores/backends/connection_pool.py +1365 -0
- truthound/stores/backends/database.py +743 -0
- truthound/stores/backends/filesystem.py +538 -0
- truthound/stores/backends/gcs.py +399 -0
- truthound/stores/backends/memory.py +354 -0
- truthound/stores/backends/s3.py +434 -0
- truthound/stores/backpressure/__init__.py +84 -0
- truthound/stores/backpressure/base.py +375 -0
- truthound/stores/backpressure/circuit_breaker.py +434 -0
- truthound/stores/backpressure/monitor.py +376 -0
- truthound/stores/backpressure/strategies.py +677 -0
- truthound/stores/base.py +551 -0
- truthound/stores/batching/__init__.py +65 -0
- truthound/stores/batching/base.py +305 -0
- truthound/stores/batching/buffer.py +370 -0
- truthound/stores/batching/store.py +248 -0
- truthound/stores/batching/writer.py +521 -0
- truthound/stores/caching/__init__.py +60 -0
- truthound/stores/caching/backends.py +684 -0
- truthound/stores/caching/base.py +356 -0
- truthound/stores/caching/store.py +305 -0
- truthound/stores/compression/__init__.py +193 -0
- truthound/stores/compression/adaptive.py +694 -0
- truthound/stores/compression/base.py +514 -0
- truthound/stores/compression/pipeline.py +868 -0
- truthound/stores/compression/providers.py +672 -0
- truthound/stores/compression/streaming.py +832 -0
- truthound/stores/concurrency/__init__.py +81 -0
- truthound/stores/concurrency/atomic.py +556 -0
- truthound/stores/concurrency/index.py +775 -0
- truthound/stores/concurrency/locks.py +576 -0
- truthound/stores/concurrency/manager.py +482 -0
- truthound/stores/encryption/__init__.py +297 -0
- truthound/stores/encryption/base.py +952 -0
- truthound/stores/encryption/keys.py +1191 -0
- truthound/stores/encryption/pipeline.py +903 -0
- truthound/stores/encryption/providers.py +953 -0
- truthound/stores/encryption/streaming.py +950 -0
- truthound/stores/expectations.py +227 -0
- truthound/stores/factory.py +246 -0
- truthound/stores/migration/__init__.py +75 -0
- truthound/stores/migration/base.py +480 -0
- truthound/stores/migration/manager.py +347 -0
- truthound/stores/migration/registry.py +382 -0
- truthound/stores/migration/store.py +559 -0
- truthound/stores/observability/__init__.py +106 -0
- truthound/stores/observability/audit.py +718 -0
- truthound/stores/observability/config.py +270 -0
- truthound/stores/observability/factory.py +208 -0
- truthound/stores/observability/metrics.py +636 -0
- truthound/stores/observability/protocols.py +410 -0
- truthound/stores/observability/store.py +570 -0
- truthound/stores/observability/tracing.py +784 -0
- truthound/stores/replication/__init__.py +76 -0
- truthound/stores/replication/base.py +260 -0
- truthound/stores/replication/monitor.py +269 -0
- truthound/stores/replication/store.py +439 -0
- truthound/stores/replication/syncer.py +391 -0
- truthound/stores/results.py +359 -0
- truthound/stores/retention/__init__.py +77 -0
- truthound/stores/retention/base.py +378 -0
- truthound/stores/retention/policies.py +621 -0
- truthound/stores/retention/scheduler.py +279 -0
- truthound/stores/retention/store.py +526 -0
- truthound/stores/streaming/__init__.py +138 -0
- truthound/stores/streaming/base.py +801 -0
- truthound/stores/streaming/database.py +984 -0
- truthound/stores/streaming/filesystem.py +719 -0
- truthound/stores/streaming/reader.py +629 -0
- truthound/stores/streaming/s3.py +843 -0
- truthound/stores/streaming/writer.py +790 -0
- truthound/stores/tiering/__init__.py +108 -0
- truthound/stores/tiering/base.py +462 -0
- truthound/stores/tiering/manager.py +249 -0
- truthound/stores/tiering/policies.py +692 -0
- truthound/stores/tiering/store.py +526 -0
- truthound/stores/versioning/__init__.py +56 -0
- truthound/stores/versioning/base.py +376 -0
- truthound/stores/versioning/store.py +660 -0
- truthound/stores/versioning/strategies.py +353 -0
- truthound/types.py +56 -0
- truthound/validators/__init__.py +774 -0
- truthound/validators/aggregate/__init__.py +27 -0
- truthound/validators/aggregate/central.py +116 -0
- truthound/validators/aggregate/extremes.py +116 -0
- truthound/validators/aggregate/spread.py +118 -0
- truthound/validators/aggregate/sum.py +64 -0
- truthound/validators/aggregate/type.py +78 -0
- truthound/validators/anomaly/__init__.py +93 -0
- truthound/validators/anomaly/base.py +431 -0
- truthound/validators/anomaly/ml_based.py +1190 -0
- truthound/validators/anomaly/multivariate.py +647 -0
- truthound/validators/anomaly/statistical.py +599 -0
- truthound/validators/base.py +1089 -0
- truthound/validators/business_rule/__init__.py +46 -0
- truthound/validators/business_rule/base.py +147 -0
- truthound/validators/business_rule/checksum.py +509 -0
- truthound/validators/business_rule/financial.py +526 -0
- truthound/validators/cache.py +733 -0
- truthound/validators/completeness/__init__.py +39 -0
- truthound/validators/completeness/conditional.py +73 -0
- truthound/validators/completeness/default.py +98 -0
- truthound/validators/completeness/empty.py +103 -0
- truthound/validators/completeness/nan.py +337 -0
- truthound/validators/completeness/null.py +152 -0
- truthound/validators/cross_table/__init__.py +17 -0
- truthound/validators/cross_table/aggregate.py +333 -0
- truthound/validators/cross_table/row_count.py +122 -0
- truthound/validators/datetime/__init__.py +29 -0
- truthound/validators/datetime/format.py +78 -0
- truthound/validators/datetime/freshness.py +269 -0
- truthound/validators/datetime/order.py +73 -0
- truthound/validators/datetime/parseable.py +185 -0
- truthound/validators/datetime/range.py +202 -0
- truthound/validators/datetime/timezone.py +69 -0
- truthound/validators/distribution/__init__.py +49 -0
- truthound/validators/distribution/distribution.py +128 -0
- truthound/validators/distribution/monotonic.py +119 -0
- truthound/validators/distribution/outlier.py +178 -0
- truthound/validators/distribution/quantile.py +80 -0
- truthound/validators/distribution/range.py +254 -0
- truthound/validators/distribution/set.py +125 -0
- truthound/validators/distribution/statistical.py +459 -0
- truthound/validators/drift/__init__.py +79 -0
- truthound/validators/drift/base.py +427 -0
- truthound/validators/drift/multi_feature.py +401 -0
- truthound/validators/drift/numeric.py +395 -0
- truthound/validators/drift/psi.py +446 -0
- truthound/validators/drift/statistical.py +510 -0
- truthound/validators/enterprise.py +1658 -0
- truthound/validators/geospatial/__init__.py +80 -0
- truthound/validators/geospatial/base.py +97 -0
- truthound/validators/geospatial/boundary.py +238 -0
- truthound/validators/geospatial/coordinate.py +351 -0
- truthound/validators/geospatial/distance.py +399 -0
- truthound/validators/geospatial/polygon.py +665 -0
- truthound/validators/i18n/__init__.py +308 -0
- truthound/validators/i18n/bidi.py +571 -0
- truthound/validators/i18n/catalogs.py +570 -0
- truthound/validators/i18n/dialects.py +763 -0
- truthound/validators/i18n/extended_catalogs.py +549 -0
- truthound/validators/i18n/formatting.py +1434 -0
- truthound/validators/i18n/loader.py +1020 -0
- truthound/validators/i18n/messages.py +521 -0
- truthound/validators/i18n/plural.py +683 -0
- truthound/validators/i18n/protocols.py +855 -0
- truthound/validators/i18n/tms.py +1162 -0
- truthound/validators/localization/__init__.py +53 -0
- truthound/validators/localization/base.py +122 -0
- truthound/validators/localization/chinese.py +362 -0
- truthound/validators/localization/japanese.py +275 -0
- truthound/validators/localization/korean.py +524 -0
- truthound/validators/memory/__init__.py +94 -0
- truthound/validators/memory/approximate_knn.py +506 -0
- truthound/validators/memory/base.py +547 -0
- truthound/validators/memory/sgd_online.py +719 -0
- truthound/validators/memory/streaming_ecdf.py +753 -0
- truthound/validators/ml_feature/__init__.py +54 -0
- truthound/validators/ml_feature/base.py +249 -0
- truthound/validators/ml_feature/correlation.py +299 -0
- truthound/validators/ml_feature/leakage.py +344 -0
- truthound/validators/ml_feature/null_impact.py +270 -0
- truthound/validators/ml_feature/scale.py +264 -0
- truthound/validators/multi_column/__init__.py +89 -0
- truthound/validators/multi_column/arithmetic.py +284 -0
- truthound/validators/multi_column/base.py +231 -0
- truthound/validators/multi_column/comparison.py +273 -0
- truthound/validators/multi_column/consistency.py +312 -0
- truthound/validators/multi_column/statistical.py +299 -0
- truthound/validators/optimization/__init__.py +164 -0
- truthound/validators/optimization/aggregation.py +563 -0
- truthound/validators/optimization/covariance.py +556 -0
- truthound/validators/optimization/geo.py +626 -0
- truthound/validators/optimization/graph.py +587 -0
- truthound/validators/optimization/orchestrator.py +970 -0
- truthound/validators/optimization/profiling.py +1312 -0
- truthound/validators/privacy/__init__.py +223 -0
- truthound/validators/privacy/base.py +635 -0
- truthound/validators/privacy/ccpa.py +670 -0
- truthound/validators/privacy/gdpr.py +728 -0
- truthound/validators/privacy/global_patterns.py +604 -0
- truthound/validators/privacy/plugins.py +867 -0
- truthound/validators/profiling/__init__.py +52 -0
- truthound/validators/profiling/base.py +175 -0
- truthound/validators/profiling/cardinality.py +312 -0
- truthound/validators/profiling/entropy.py +391 -0
- truthound/validators/profiling/frequency.py +455 -0
- truthound/validators/pushdown_support.py +660 -0
- truthound/validators/query/__init__.py +91 -0
- truthound/validators/query/aggregate.py +346 -0
- truthound/validators/query/base.py +246 -0
- truthound/validators/query/column.py +249 -0
- truthound/validators/query/expression.py +274 -0
- truthound/validators/query/result.py +323 -0
- truthound/validators/query/row_count.py +264 -0
- truthound/validators/referential/__init__.py +80 -0
- truthound/validators/referential/base.py +395 -0
- truthound/validators/referential/cascade.py +391 -0
- truthound/validators/referential/circular.py +563 -0
- truthound/validators/referential/foreign_key.py +624 -0
- truthound/validators/referential/orphan.py +485 -0
- truthound/validators/registry.py +112 -0
- truthound/validators/schema/__init__.py +41 -0
- truthound/validators/schema/column_count.py +142 -0
- truthound/validators/schema/column_exists.py +80 -0
- truthound/validators/schema/column_order.py +82 -0
- truthound/validators/schema/column_pair.py +85 -0
- truthound/validators/schema/column_pair_set.py +195 -0
- truthound/validators/schema/column_type.py +94 -0
- truthound/validators/schema/multi_column.py +53 -0
- truthound/validators/schema/multi_column_aggregate.py +175 -0
- truthound/validators/schema/referential.py +274 -0
- truthound/validators/schema/table_schema.py +91 -0
- truthound/validators/schema_validator.py +219 -0
- truthound/validators/sdk/__init__.py +250 -0
- truthound/validators/sdk/builder.py +680 -0
- truthound/validators/sdk/decorators.py +474 -0
- truthound/validators/sdk/enterprise/__init__.py +211 -0
- truthound/validators/sdk/enterprise/docs.py +725 -0
- truthound/validators/sdk/enterprise/fuzzing.py +659 -0
- truthound/validators/sdk/enterprise/licensing.py +709 -0
- truthound/validators/sdk/enterprise/manager.py +543 -0
- truthound/validators/sdk/enterprise/resources.py +628 -0
- truthound/validators/sdk/enterprise/sandbox.py +766 -0
- truthound/validators/sdk/enterprise/signing.py +603 -0
- truthound/validators/sdk/enterprise/templates.py +865 -0
- truthound/validators/sdk/enterprise/versioning.py +659 -0
- truthound/validators/sdk/templates.py +757 -0
- truthound/validators/sdk/testing.py +807 -0
- truthound/validators/security/__init__.py +181 -0
- truthound/validators/security/redos/__init__.py +182 -0
- truthound/validators/security/redos/core.py +861 -0
- truthound/validators/security/redos/cpu_monitor.py +593 -0
- truthound/validators/security/redos/cve_database.py +791 -0
- truthound/validators/security/redos/ml/__init__.py +155 -0
- truthound/validators/security/redos/ml/base.py +785 -0
- truthound/validators/security/redos/ml/datasets.py +618 -0
- truthound/validators/security/redos/ml/features.py +359 -0
- truthound/validators/security/redos/ml/models.py +1000 -0
- truthound/validators/security/redos/ml/predictor.py +507 -0
- truthound/validators/security/redos/ml/storage.py +632 -0
- truthound/validators/security/redos/ml/training.py +571 -0
- truthound/validators/security/redos/ml_analyzer.py +937 -0
- truthound/validators/security/redos/optimizer.py +674 -0
- truthound/validators/security/redos/profiler.py +682 -0
- truthound/validators/security/redos/re2_engine.py +709 -0
- truthound/validators/security/redos.py +886 -0
- truthound/validators/security/sql_security.py +1247 -0
- truthound/validators/streaming/__init__.py +126 -0
- truthound/validators/streaming/base.py +292 -0
- truthound/validators/streaming/completeness.py +210 -0
- truthound/validators/streaming/mixin.py +575 -0
- truthound/validators/streaming/range.py +308 -0
- truthound/validators/streaming/sources.py +846 -0
- truthound/validators/string/__init__.py +57 -0
- truthound/validators/string/casing.py +158 -0
- truthound/validators/string/charset.py +96 -0
- truthound/validators/string/format.py +501 -0
- truthound/validators/string/json.py +77 -0
- truthound/validators/string/json_schema.py +184 -0
- truthound/validators/string/length.py +104 -0
- truthound/validators/string/like_pattern.py +237 -0
- truthound/validators/string/regex.py +202 -0
- truthound/validators/string/regex_extended.py +435 -0
- truthound/validators/table/__init__.py +88 -0
- truthound/validators/table/base.py +78 -0
- truthound/validators/table/column_count.py +198 -0
- truthound/validators/table/freshness.py +362 -0
- truthound/validators/table/row_count.py +251 -0
- truthound/validators/table/schema.py +333 -0
- truthound/validators/table/size.py +285 -0
- truthound/validators/timeout/__init__.py +102 -0
- truthound/validators/timeout/advanced/__init__.py +247 -0
- truthound/validators/timeout/advanced/circuit_breaker.py +675 -0
- truthound/validators/timeout/advanced/prediction.py +773 -0
- truthound/validators/timeout/advanced/priority.py +618 -0
- truthound/validators/timeout/advanced/redis_backend.py +770 -0
- truthound/validators/timeout/advanced/retry.py +721 -0
- truthound/validators/timeout/advanced/sampling.py +788 -0
- truthound/validators/timeout/advanced/sla.py +661 -0
- truthound/validators/timeout/advanced/telemetry.py +804 -0
- truthound/validators/timeout/cascade.py +477 -0
- truthound/validators/timeout/deadline.py +657 -0
- truthound/validators/timeout/degradation.py +525 -0
- truthound/validators/timeout/distributed.py +597 -0
- truthound/validators/timeseries/__init__.py +89 -0
- truthound/validators/timeseries/base.py +326 -0
- truthound/validators/timeseries/completeness.py +617 -0
- truthound/validators/timeseries/gap.py +485 -0
- truthound/validators/timeseries/monotonic.py +310 -0
- truthound/validators/timeseries/seasonality.py +422 -0
- truthound/validators/timeseries/trend.py +510 -0
- truthound/validators/uniqueness/__init__.py +59 -0
- truthound/validators/uniqueness/approximate.py +475 -0
- truthound/validators/uniqueness/distinct_values.py +253 -0
- truthound/validators/uniqueness/duplicate.py +118 -0
- truthound/validators/uniqueness/primary_key.py +140 -0
- truthound/validators/uniqueness/unique.py +191 -0
- truthound/validators/uniqueness/within_record.py +599 -0
- truthound/validators/utils.py +756 -0
- truthound-1.0.8.dist-info/METADATA +474 -0
- truthound-1.0.8.dist-info/RECORD +877 -0
- truthound-1.0.8.dist-info/WHEEL +4 -0
- truthound-1.0.8.dist-info/entry_points.txt +2 -0
- truthound-1.0.8.dist-info/licenses/LICENSE +190 -0
|
@@ -0,0 +1,599 @@
|
|
|
1
|
+
"""Within-record uniqueness validators.
|
|
2
|
+
|
|
3
|
+
Provides vectorized validation for checking uniqueness within each row,
|
|
4
|
+
using Polars' horizontal operations for optimal performance.
|
|
5
|
+
|
|
6
|
+
Type Safety:
|
|
7
|
+
- Pairwise strategy: Native type comparison (no casting)
|
|
8
|
+
- Horizontal strategy: Type-grouped comparison for accuracy
|
|
9
|
+
|
|
10
|
+
Performance Notes:
|
|
11
|
+
- Vectorized approach: ~100x faster than row-by-row iteration
|
|
12
|
+
- Scales linearly with row count, not column count
|
|
13
|
+
- Memory efficient: no Python list creation per row
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from typing import Any
|
|
17
|
+
|
|
18
|
+
import polars as pl
|
|
19
|
+
|
|
20
|
+
from truthound.types import Severity
|
|
21
|
+
from truthound.validators.base import ValidationIssue, Validator
|
|
22
|
+
from truthound.validators.registry import register_validator
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _get_column_type_groups(
    lf: pl.LazyFrame,
    columns: list[str],
) -> dict[str, list[str]]:
    """Group columns by dtype family so only comparable columns are compared.

    Callers compare values only between columns that share a group. This
    prevents false positives such as numeric 1 matching string "1", and keeps
    unrelated dtypes from ever being handed to Polars in the same comparison.

    Groups produced:
        - "numeric": Int8..Int64, UInt8..UInt64, Float32, Float64
        - "string": String / Utf8
        - "datetime": Date and Datetime
        - "time": Time
        - "duration": Duration
        - "boolean": Boolean
        - "other:<BaseType>": any remaining dtype, one group per base type
          (for example "other:List", "other:Struct")

    Args:
        lf: LazyFrame whose schema supplies the column dtypes.
        columns: Names of the columns to classify; each must be present in
            the schema of ``lf``.

    Returns:
        Dict mapping group name to the column names in that group, in the
        order they appear in ``columns``. Groups with no columns are omitted.
    """
    schema = lf.collect_schema()

    # Base dtype class -> family name. Time and Duration get their own
    # families: a time-of-day or an elapsed span is never "the same value" as
    # a calendar date, so they must not share a bucket with Date/Datetime.
    family_by_base_type = {
        pl.Int8: "numeric",
        pl.Int16: "numeric",
        pl.Int32: "numeric",
        pl.Int64: "numeric",
        pl.UInt8: "numeric",
        pl.UInt16: "numeric",
        pl.UInt32: "numeric",
        pl.UInt64: "numeric",
        pl.Float32: "numeric",
        pl.Float64: "numeric",
        pl.String: "string",
        pl.Utf8: "string",
        pl.Date: "datetime",
        pl.Datetime: "datetime",
        pl.Time: "time",
        pl.Duration: "duration",
        pl.Boolean: "boolean",
    }

    type_groups: dict[str, list[str]] = {}
    for col in columns:
        # base_type() is a classmethod, so it resolves the dtype class whether
        # the schema holds a dtype class or a (possibly parametrised) instance.
        base = schema[col].base_type()
        family = family_by_base_type.get(base)
        if family is None:
            # Unrecognised dtypes are only comparable with columns of the same
            # base type (List with List, Struct with Struct, ...).
            family = f"other:{base.__name__}"
        type_groups.setdefault(family, []).append(col)

    return type_groups
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _build_pairwise_equality_expr(
    lf: pl.LazyFrame,
    columns: list[str],
    ignore_nulls: bool = True,
) -> pl.Expr:
    """Build a vectorized expression that flags rows containing duplicates.

    Strategy: columns are grouped with ``_get_column_type_groups`` and every
    unordered pair of columns *within the same group* is tested for equality.
    Columns in different groups are never compared, so numeric 1 is not
    matched against string "1", while int 1 and float 1.0 (same group) are
    compared using Polars' native comparison.

    Args:
        lf: LazyFrame used only to read the schema.
        columns: Names of the columns to check.
        ignore_nulls: If True, a pair counts as equal only when both values
            are non-null and equal. If False, two nulls also count as equal.

    Returns:
        A Polars expression that is True for rows with at least one equal
        pair. ``pl.lit(False)`` is returned when fewer than two columns are
        given or when no group holds two or more columns.
    """
    from itertools import combinations

    if len(columns) < 2:
        # Need at least 2 columns to have duplicates
        return pl.lit(False)

    equality_checks: list[pl.Expr] = []
    for group_cols in _get_column_type_groups(lf, columns).values():
        # combinations() yields nothing for groups with fewer than 2 columns,
        # and visits pairs in the same (i < j) order as nested index loops.
        for name_a, name_b in combinations(group_cols, 2):
            left, right = pl.col(name_a), pl.col(name_b)
            if ignore_nulls:
                # Only consider equal if both are non-null AND equal
                pair_equal = (
                    left.is_not_null() & right.is_not_null() & (left == right)
                )
            else:
                # Consider equal including null == null
                pair_equal = (left == right) | (left.is_null() & right.is_null())
            equality_checks.append(pair_equal)

    if not equality_checks:
        # No columns of the same type to compare
        return pl.lit(False)

    # OR all pair checks together: any equal pair marks the row as duplicated.
    result = equality_checks[0]
    for check in equality_checks[1:]:
        result = result | check
    return result
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def _build_type_safe_horizontal_expr(
    lf: pl.LazyFrame,
    columns: list[str],
    ignore_nulls: bool = True,
) -> pl.Expr:
    """Flag rows with duplicate values using per-type-group list operations.

    Columns are partitioned with ``_get_column_type_groups``. For every group
    holding two or more columns, the row's values are gathered into a list
    (no casting) and the row is flagged when the list has fewer unique
    entries than entries. The per-group flags are OR-ed together.

    Args:
        lf: LazyFrame used to read the schema.
        columns: Names of the columns to check.
        ignore_nulls: If True, nulls are dropped from each row's list before
            counting, so they never contribute to a duplicate.

    Returns:
        Expression that is True for rows with duplicates; ``pl.lit(False)``
        when there are fewer than two columns or no group has two or more.
    """
    if len(columns) < 2:
        return pl.lit(False)

    group_flags: list[pl.Expr] = []
    for members in _get_column_type_groups(lf, columns).values():
        if len(members) < 2:
            continue

        # One list per row holding this group's values, native dtypes kept.
        row_values = pl.concat_list([pl.col(name) for name in members])
        if ignore_nulls:
            # Count only non-null entries on both sides of the comparison.
            row_values = row_values.list.eval(pl.element().drop_nulls())

        group_flags.append(row_values.list.n_unique() < row_values.list.len())

    if not group_flags:
        return pl.lit(False)

    # A duplicate in any single group is enough to flag the row.
    combined = group_flags[0]
    for flag in group_flags[1:]:
        combined = combined | flag
    return combined
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def _build_horizontal_n_unique_expr(
    columns: list[str],
    ignore_nulls: bool = True,
) -> pl.Expr:
    """Flag rows with duplicates after casting every column to string.

    Legacy, type-unsafe variant: because all values are compared as strings,
    values of different dtypes with the same string form (numeric 1 and
    string "1") are reported as duplicates. Prefer
    ``_build_type_safe_horizontal_expr`` unless that coercion is wanted.

    Args:
        columns: Names of the columns to check.
        ignore_nulls: If True, nulls are dropped from each row's list before
            counting, so they never contribute to a duplicate.

    Returns:
        Expression that is True for rows with duplicates; ``pl.lit(False)``
        when fewer than two columns are given.
    """
    if len(columns) < 2:
        return pl.lit(False)

    # One list of string-cast values per row.
    row_values = pl.concat_list([pl.col(name).cast(pl.Utf8) for name in columns])

    if ignore_nulls:
        # Compare unique count against the non-null count only.
        row_values = row_values.list.eval(pl.element().drop_nulls())

    return row_values.list.n_unique() < row_values.list.len()
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
@register_validator
class UniqueWithinRecordValidator(Validator):
    """Validates that specified columns have unique values within each row.

    Duplicate detection is expressed as a single vectorized Polars expression
    rather than row-by-row iteration.

    Type Safety:
        - "pairwise" strategy: native Polars comparison within type groups
        - "horizontal" strategy: type-grouped list comparison
        - "horizontal_legacy": string casting (may have false positives)

    Strategy Selection (auto mode):
        - Up to ``PAIRWISE_THRESHOLD`` columns: pairwise equality
        - More columns: type-safe horizontal n_unique

    Example:
        # Primary and secondary contacts should be different
        validator = UniqueWithinRecordValidator(
            columns=["primary_contact", "secondary_contact"],
        )

        # All three choice fields should be unique
        validator = UniqueWithinRecordValidator(
            columns=["choice_1", "choice_2", "choice_3"],
        )
    """

    name = "unique_within_record"
    category = "uniqueness"

    # Largest column count for which "auto" picks the pairwise strategy
    # (6 columns = 15 pairs). Subclasses may override it.
    PAIRWISE_THRESHOLD = 6

    def __init__(
        self,
        columns: list[str],
        ignore_nulls: bool = True,
        strategy: str = "auto",
        **kwargs: Any,
    ):
        """Initialize the validator.

        Args:
            columns: List of columns to check for uniqueness within each row
            ignore_nulls: If True, null values are excluded from comparison
            strategy: "auto" (default), "pairwise", "horizontal", or
                "horizontal_legacy". Any other value is treated like
                "horizontal" by ``validate``.
            **kwargs: Additional validator configuration, forwarded to
                ``Validator.__init__``

        Raises:
            ValueError: If fewer than 2 columns are given.
        """
        super().__init__(**kwargs)
        # Copy into a list: protects against later mutation by the caller and
        # lets tuples work with the list concatenation used for sampling.
        self.check_columns = list(columns)
        self.ignore_nulls = ignore_nulls
        self.strategy = strategy

        if len(self.check_columns) < 2:
            raise ValueError("At least 2 columns required for within-record uniqueness")

    def _get_strategy(self) -> str:
        """Return the configured strategy, resolving "auto" by column count."""
        if self.strategy != "auto":
            return self.strategy

        # Pairwise needs n*(n-1)/2 comparisons, so it is only chosen while the
        # column count stays at or below the class threshold.
        if len(self.check_columns) <= self.PAIRWISE_THRESHOLD:
            return "pairwise"
        return "horizontal"

    def validate(self, lf: pl.LazyFrame) -> list[ValidationIssue]:
        """Check ``lf`` for rows whose checked columns contain duplicates.

        Args:
            lf: LazyFrame to validate.

        Returns:
            An empty list when the frame is empty, no row has duplicates, or
            the inherited ``_passes_mostly`` check accepts the duplicate
            count; otherwise a single ``ValidationIssue`` with the count,
            severity and sample rows.
        """
        issues: list[ValidationIssue] = []

        # Build the duplicate detection expression
        strategy = self._get_strategy()
        if strategy == "pairwise":
            has_dup_expr = _build_pairwise_equality_expr(
                lf, self.check_columns, self.ignore_nulls
            )
        elif strategy == "horizontal_legacy":
            # Legacy mode with string casting (for backwards compatibility)
            has_dup_expr = _build_horizontal_n_unique_expr(
                self.check_columns, self.ignore_nulls
            )
        else:
            # Default: type-safe horizontal comparison
            has_dup_expr = _build_type_safe_horizontal_expr(
                lf, self.check_columns, self.ignore_nulls
            )

        # Execute vectorized query
        result = lf.select(
            pl.len().alias("_total"),
            has_dup_expr.sum().alias("_dup_count"),
        ).collect()

        total_rows = result["_total"][0]
        if total_rows == 0:
            return issues

        # Treat a null sum as zero so the comparison below cannot fail.
        duplicate_rows = result["_dup_count"][0] or 0
        if duplicate_rows <= 0:
            return issues
        if self._passes_mostly(duplicate_rows, total_rows):
            return issues

        ratio = duplicate_rows / total_rows
        col_desc = ", ".join(self.check_columns)

        # Samples need a second collect, so fetch them only on failure.
        samples = self._get_duplicate_samples(lf, has_dup_expr)

        issues.append(
            ValidationIssue(
                column=f"[{col_desc}]",
                issue_type="duplicate_within_record",
                count=duplicate_rows,
                severity=self._calculate_severity(ratio),
                details=f"{duplicate_rows} rows have duplicate values across columns",
                expected="Unique values within each row",
                sample_values=samples,
            )
        )
        return issues

    def _get_duplicate_samples(
        self,
        lf: pl.LazyFrame,
        has_dup_expr: pl.Expr,
    ) -> list[str]:
        """Get sample rows with duplicates for error reporting.

        Args:
            lf: LazyFrame being validated.
            has_dup_expr: Boolean expression marking rows with duplicates.

        Returns:
            Up to ``self.config.sample_size`` strings of the form
            ``"row <index>: [<values>]"``, with the row index taken before
            filtering.
        """
        sample_df = (
            lf.with_row_index("_row_idx")
            .filter(has_dup_expr)
            .select(["_row_idx"] + self.check_columns)
            .head(self.config.sample_size)
            .collect()
        )

        samples = []
        for row in sample_df.iter_rows(named=True):
            idx = row["_row_idx"]
            vals = [str(row[c]) for c in self.check_columns]
            samples.append(f"row {idx}: [{', '.join(vals)}]")

        return samples
|
|
403
|
+
|
|
404
|
+
|
|
405
|
+
@register_validator
class AllColumnsUniqueWithinRecordValidator(Validator):
    """Validates that all non-null values in each row are unique.

    Uses vectorized horizontal list operations. With ``type_safe=True`` (the
    default) only values in the same type group are compared; with
    ``type_safe=False`` all values are compared after casting to string.

    The columns to check come from the inherited ``_get_target_columns``
    (defined on ``Validator``, outside this file section).

    Example:
        # Each row's values should all be different
        validator = AllColumnsUniqueWithinRecordValidator()

        # Check only specific columns
        validator = AllColumnsUniqueWithinRecordValidator(
            columns=["field_a", "field_b", "field_c"],
        )
    """

    name = "all_columns_unique_within_record"
    category = "uniqueness"

    def __init__(
        self,
        ignore_nulls: bool = True,
        type_safe: bool = True,
        **kwargs: Any,
    ):
        """Initialize the validator.

        Args:
            ignore_nulls: If True, null values are excluded from comparison
            type_safe: If True, only compare values of the same type
            **kwargs: Additional validator configuration, forwarded to
                ``Validator.__init__``
        """
        super().__init__(**kwargs)
        self.ignore_nulls = ignore_nulls
        self.type_safe = type_safe

    def validate(self, lf: pl.LazyFrame) -> list[ValidationIssue]:
        """Check ``lf`` for rows whose target columns contain duplicates.

        Args:
            lf: LazyFrame to validate.

        Returns:
            An empty list when fewer than two target columns exist, the frame
            is empty, no row has duplicates, or the inherited
            ``_passes_mostly`` check accepts the duplicate count; otherwise a
            single ``ValidationIssue`` with the count, severity and samples.
        """
        issues: list[ValidationIssue] = []

        columns = self._get_target_columns(lf)
        if len(columns) < 2:
            return issues

        # Use type-safe or legacy strategy based on setting
        if self.type_safe:
            has_dup_expr = _build_type_safe_horizontal_expr(
                lf, columns, self.ignore_nulls
            )
        else:
            has_dup_expr = _build_horizontal_n_unique_expr(columns, self.ignore_nulls)

        result = lf.select(
            pl.len().alias("_total"),
            has_dup_expr.sum().alias("_dup_count"),
        ).collect()

        total_rows = result["_total"][0]
        if total_rows == 0:
            return issues

        # Treat a null sum as zero so the comparison below cannot fail.
        duplicate_rows = result["_dup_count"][0] or 0
        if duplicate_rows <= 0:
            return issues
        if self._passes_mostly(duplicate_rows, total_rows):
            return issues

        ratio = duplicate_rows / total_rows

        # Samples need a second collect, so fetch them only on failure.
        samples = self._get_duplicate_samples(lf, list(columns), has_dup_expr)

        issues.append(
            ValidationIssue(
                column="_all_columns",
                issue_type="duplicate_values_in_record",
                count=duplicate_rows,
                severity=self._calculate_severity(ratio),
                details=f"{duplicate_rows} rows have duplicate values",
                expected="All column values unique within each row",
                sample_values=samples,
            )
        )
        return issues

    def _get_duplicate_samples(
        self,
        lf: pl.LazyFrame,
        columns: list[str],
        has_dup_expr: pl.Expr,
    ) -> list[str]:
        """Get sample rows with duplicates for error reporting.

        Args:
            lf: LazyFrame being validated.
            columns: Columns whose values are shown for each sampled row.
            has_dup_expr: Boolean expression marking rows with duplicates.

        Returns:
            Up to ``self.config.sample_size`` strings of the form
            ``"row <index>: [<values>]"``, with the row index taken before
            filtering.
        """
        sample_df = (
            lf.with_row_index("_row_idx")
            .filter(has_dup_expr)
            .select(["_row_idx"] + columns)
            .head(self.config.sample_size)
            .collect()
        )

        samples = []
        for row in sample_df.iter_rows(named=True):
            idx = row["_row_idx"]
            vals = [str(row[c]) for c in columns]
            samples.append(f"row {idx}: [{', '.join(vals)}]")

        return samples
|
|
490
|
+
|
|
491
|
+
|
|
492
|
+
@register_validator
class ColumnPairUniqueValidator(Validator):
    """Flag rows in which two named columns hold the same value.

    This is the two-column special case of a within-record uniqueness
    check: a single boolean expression comparing ``column_a`` with
    ``column_b`` is evaluated once over the frame and its true values
    are counted.

    Comparison semantics:
        - The columns are compared with the Polars ``==`` operator; no
          casting or string conversion is applied in this class, so
          values are never matched through their textual form.
        - NOTE(review): how Polars treats a comparison between columns
          of different dtypes is not visible here -- confirm against
          the Polars version in use.

    Example:
        # Sender and receiver should never be the same
        validator = ColumnPairUniqueValidator(
            column_a="sender_id",
            column_b="receiver_id",
        )
    """

    name = "column_pair_unique"
    category = "uniqueness"

    def __init__(
        self,
        column_a: str,
        column_b: str,
        ignore_nulls: bool = True,
        **kwargs: Any,
    ):
        """Store the column pair and the null-handling mode.

        Args:
            column_a: Name of the first column to compare.
            column_b: Name of the second column to compare.
            ignore_nulls: When True, a row is only counted if both
                values are non-null and equal. When False, a row in
                which both values are null is counted as equal too.
            **kwargs: Forwarded unchanged to the base ``Validator``.
        """
        super().__init__(**kwargs)
        self.column_a, self.column_b = column_a, column_b
        self.ignore_nulls = ignore_nulls

    def validate(self, lf: pl.LazyFrame) -> list[ValidationIssue]:
        """Count rows where the two columns are equal and report them.

        Args:
            lf: Frame to check; it must contain both configured columns.

        Returns:
            An empty list when the frame has no rows, when no row has
            equal values, or when ``self._passes_mostly`` accepts the
            observed count. Otherwise a one-element list holding a
            ``column_pair_not_unique`` issue.
        """
        left = pl.col(self.column_a)
        right = pl.col(self.column_b)
        values_match = left == right

        if self.ignore_nulls:
            # Require both sides to be present before comparing them.
            equal_expr = left.is_not_null() & right.is_not_null() & values_match
        else:
            # A pair of nulls is treated as a match in this mode.
            equal_expr = values_match | (left.is_null() & right.is_null())

        # One pass over the data yields both the row total and match count.
        counts = lf.select(
            pl.len().alias("_total"),
            equal_expr.sum().alias("_equal_count"),
        ).collect()

        total_rows = counts["_total"][0]
        if total_rows == 0:
            return []

        matching_rows = counts["_equal_count"][0]
        if matching_rows <= 0:
            return []

        # Tolerance check supplied by the base class (not defined here).
        if self._passes_mostly(matching_rows, total_rows):
            return []

        ratio = matching_rows / total_rows
        samples = self._get_equal_samples(lf, equal_expr)

        return [
            ValidationIssue(
                column=f"[{self.column_a}, {self.column_b}]",
                issue_type="column_pair_not_unique",
                count=matching_rows,
                severity=self._calculate_severity(ratio),
                details=f"{matching_rows} rows have equal values in both columns",
                expected=f"{self.column_a} != {self.column_b}",
                sample_values=samples,
            )
        ]

    def _get_equal_samples(self, lf: pl.LazyFrame, equal_expr: pl.Expr) -> list[str]:
        """Return formatted examples of rows matched by ``equal_expr``.

        At most ``self.config.sample_size`` rows are collected. Only the
        value of ``column_a`` is rendered for each row, formatted as
        ``"<column_a>=<value>"``.

        Args:
            lf: Frame to draw the examples from.
            equal_expr: Boolean expression selecting the offending rows.

        Returns:
            One formatted string per sampled row.
        """
        pair_columns = [self.column_a, self.column_b]
        matches = (
            lf.filter(equal_expr)
            .select(pair_columns)
            .head(self.config.sample_size)
            .collect()
        )
        return [
            f"{self.column_a}={record[self.column_a]}"
            for record in matches.iter_rows(named=True)
        ]
|