truthound 1.0.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- truthound/__init__.py +162 -0
- truthound/adapters.py +100 -0
- truthound/api.py +365 -0
- truthound/audit/__init__.py +248 -0
- truthound/audit/core.py +967 -0
- truthound/audit/filters.py +620 -0
- truthound/audit/formatters.py +707 -0
- truthound/audit/logger.py +902 -0
- truthound/audit/middleware.py +571 -0
- truthound/audit/storage.py +1083 -0
- truthound/benchmark/__init__.py +123 -0
- truthound/benchmark/base.py +757 -0
- truthound/benchmark/comparison.py +635 -0
- truthound/benchmark/generators.py +706 -0
- truthound/benchmark/reporters.py +718 -0
- truthound/benchmark/runner.py +635 -0
- truthound/benchmark/scenarios.py +712 -0
- truthound/cache.py +252 -0
- truthound/checkpoint/__init__.py +136 -0
- truthound/checkpoint/actions/__init__.py +164 -0
- truthound/checkpoint/actions/base.py +324 -0
- truthound/checkpoint/actions/custom.py +234 -0
- truthound/checkpoint/actions/discord_notify.py +290 -0
- truthound/checkpoint/actions/email_notify.py +405 -0
- truthound/checkpoint/actions/github_action.py +406 -0
- truthound/checkpoint/actions/opsgenie.py +1499 -0
- truthound/checkpoint/actions/pagerduty.py +226 -0
- truthound/checkpoint/actions/slack_notify.py +233 -0
- truthound/checkpoint/actions/store_result.py +249 -0
- truthound/checkpoint/actions/teams_notify.py +1570 -0
- truthound/checkpoint/actions/telegram_notify.py +419 -0
- truthound/checkpoint/actions/update_docs.py +552 -0
- truthound/checkpoint/actions/webhook.py +293 -0
- truthound/checkpoint/analytics/__init__.py +147 -0
- truthound/checkpoint/analytics/aggregations/__init__.py +23 -0
- truthound/checkpoint/analytics/aggregations/rollup.py +481 -0
- truthound/checkpoint/analytics/aggregations/time_bucket.py +306 -0
- truthound/checkpoint/analytics/analyzers/__init__.py +17 -0
- truthound/checkpoint/analytics/analyzers/anomaly.py +386 -0
- truthound/checkpoint/analytics/analyzers/base.py +270 -0
- truthound/checkpoint/analytics/analyzers/forecast.py +421 -0
- truthound/checkpoint/analytics/analyzers/trend.py +314 -0
- truthound/checkpoint/analytics/models.py +292 -0
- truthound/checkpoint/analytics/protocols.py +549 -0
- truthound/checkpoint/analytics/service.py +718 -0
- truthound/checkpoint/analytics/stores/__init__.py +16 -0
- truthound/checkpoint/analytics/stores/base.py +306 -0
- truthound/checkpoint/analytics/stores/memory_store.py +353 -0
- truthound/checkpoint/analytics/stores/sqlite_store.py +557 -0
- truthound/checkpoint/analytics/stores/timescale_store.py +501 -0
- truthound/checkpoint/async_actions.py +794 -0
- truthound/checkpoint/async_base.py +708 -0
- truthound/checkpoint/async_checkpoint.py +617 -0
- truthound/checkpoint/async_runner.py +639 -0
- truthound/checkpoint/checkpoint.py +527 -0
- truthound/checkpoint/ci/__init__.py +61 -0
- truthound/checkpoint/ci/detector.py +355 -0
- truthound/checkpoint/ci/reporter.py +436 -0
- truthound/checkpoint/ci/templates.py +454 -0
- truthound/checkpoint/circuitbreaker/__init__.py +133 -0
- truthound/checkpoint/circuitbreaker/breaker.py +542 -0
- truthound/checkpoint/circuitbreaker/core.py +252 -0
- truthound/checkpoint/circuitbreaker/detection.py +459 -0
- truthound/checkpoint/circuitbreaker/middleware.py +389 -0
- truthound/checkpoint/circuitbreaker/registry.py +357 -0
- truthound/checkpoint/distributed/__init__.py +139 -0
- truthound/checkpoint/distributed/backends/__init__.py +35 -0
- truthound/checkpoint/distributed/backends/celery_backend.py +503 -0
- truthound/checkpoint/distributed/backends/kubernetes_backend.py +696 -0
- truthound/checkpoint/distributed/backends/local_backend.py +397 -0
- truthound/checkpoint/distributed/backends/ray_backend.py +625 -0
- truthound/checkpoint/distributed/base.py +774 -0
- truthound/checkpoint/distributed/orchestrator.py +765 -0
- truthound/checkpoint/distributed/protocols.py +842 -0
- truthound/checkpoint/distributed/registry.py +449 -0
- truthound/checkpoint/idempotency/__init__.py +120 -0
- truthound/checkpoint/idempotency/core.py +295 -0
- truthound/checkpoint/idempotency/fingerprint.py +454 -0
- truthound/checkpoint/idempotency/locking.py +604 -0
- truthound/checkpoint/idempotency/service.py +592 -0
- truthound/checkpoint/idempotency/stores.py +653 -0
- truthound/checkpoint/monitoring/__init__.py +134 -0
- truthound/checkpoint/monitoring/aggregators/__init__.py +15 -0
- truthound/checkpoint/monitoring/aggregators/base.py +372 -0
- truthound/checkpoint/monitoring/aggregators/realtime.py +300 -0
- truthound/checkpoint/monitoring/aggregators/window.py +493 -0
- truthound/checkpoint/monitoring/collectors/__init__.py +17 -0
- truthound/checkpoint/monitoring/collectors/base.py +257 -0
- truthound/checkpoint/monitoring/collectors/memory_collector.py +617 -0
- truthound/checkpoint/monitoring/collectors/prometheus_collector.py +451 -0
- truthound/checkpoint/monitoring/collectors/redis_collector.py +518 -0
- truthound/checkpoint/monitoring/events.py +410 -0
- truthound/checkpoint/monitoring/protocols.py +636 -0
- truthound/checkpoint/monitoring/service.py +578 -0
- truthound/checkpoint/monitoring/views/__init__.py +17 -0
- truthound/checkpoint/monitoring/views/base.py +172 -0
- truthound/checkpoint/monitoring/views/queue_view.py +220 -0
- truthound/checkpoint/monitoring/views/task_view.py +240 -0
- truthound/checkpoint/monitoring/views/worker_view.py +263 -0
- truthound/checkpoint/registry.py +337 -0
- truthound/checkpoint/runner.py +356 -0
- truthound/checkpoint/transaction/__init__.py +133 -0
- truthound/checkpoint/transaction/base.py +389 -0
- truthound/checkpoint/transaction/compensatable.py +537 -0
- truthound/checkpoint/transaction/coordinator.py +576 -0
- truthound/checkpoint/transaction/executor.py +622 -0
- truthound/checkpoint/transaction/idempotency.py +534 -0
- truthound/checkpoint/transaction/saga/__init__.py +143 -0
- truthound/checkpoint/transaction/saga/builder.py +584 -0
- truthound/checkpoint/transaction/saga/definition.py +515 -0
- truthound/checkpoint/transaction/saga/event_store.py +542 -0
- truthound/checkpoint/transaction/saga/patterns.py +833 -0
- truthound/checkpoint/transaction/saga/runner.py +718 -0
- truthound/checkpoint/transaction/saga/state_machine.py +793 -0
- truthound/checkpoint/transaction/saga/strategies.py +780 -0
- truthound/checkpoint/transaction/saga/testing.py +886 -0
- truthound/checkpoint/triggers/__init__.py +58 -0
- truthound/checkpoint/triggers/base.py +237 -0
- truthound/checkpoint/triggers/event.py +385 -0
- truthound/checkpoint/triggers/schedule.py +355 -0
- truthound/cli.py +2358 -0
- truthound/cli_modules/__init__.py +124 -0
- truthound/cli_modules/advanced/__init__.py +45 -0
- truthound/cli_modules/advanced/benchmark.py +343 -0
- truthound/cli_modules/advanced/docs.py +225 -0
- truthound/cli_modules/advanced/lineage.py +209 -0
- truthound/cli_modules/advanced/ml.py +320 -0
- truthound/cli_modules/advanced/realtime.py +196 -0
- truthound/cli_modules/checkpoint/__init__.py +46 -0
- truthound/cli_modules/checkpoint/init.py +114 -0
- truthound/cli_modules/checkpoint/list.py +71 -0
- truthound/cli_modules/checkpoint/run.py +159 -0
- truthound/cli_modules/checkpoint/validate.py +67 -0
- truthound/cli_modules/common/__init__.py +71 -0
- truthound/cli_modules/common/errors.py +414 -0
- truthound/cli_modules/common/options.py +419 -0
- truthound/cli_modules/common/output.py +507 -0
- truthound/cli_modules/common/protocol.py +552 -0
- truthound/cli_modules/core/__init__.py +48 -0
- truthound/cli_modules/core/check.py +123 -0
- truthound/cli_modules/core/compare.py +104 -0
- truthound/cli_modules/core/learn.py +57 -0
- truthound/cli_modules/core/mask.py +77 -0
- truthound/cli_modules/core/profile.py +65 -0
- truthound/cli_modules/core/scan.py +61 -0
- truthound/cli_modules/profiler/__init__.py +51 -0
- truthound/cli_modules/profiler/auto_profile.py +175 -0
- truthound/cli_modules/profiler/metadata.py +107 -0
- truthound/cli_modules/profiler/suite.py +283 -0
- truthound/cli_modules/registry.py +431 -0
- truthound/cli_modules/scaffolding/__init__.py +89 -0
- truthound/cli_modules/scaffolding/base.py +631 -0
- truthound/cli_modules/scaffolding/commands.py +545 -0
- truthound/cli_modules/scaffolding/plugins.py +1072 -0
- truthound/cli_modules/scaffolding/reporters.py +594 -0
- truthound/cli_modules/scaffolding/validators.py +1127 -0
- truthound/common/__init__.py +18 -0
- truthound/common/resilience/__init__.py +130 -0
- truthound/common/resilience/bulkhead.py +266 -0
- truthound/common/resilience/circuit_breaker.py +516 -0
- truthound/common/resilience/composite.py +332 -0
- truthound/common/resilience/config.py +292 -0
- truthound/common/resilience/protocols.py +217 -0
- truthound/common/resilience/rate_limiter.py +404 -0
- truthound/common/resilience/retry.py +341 -0
- truthound/datadocs/__init__.py +260 -0
- truthound/datadocs/base.py +571 -0
- truthound/datadocs/builder.py +761 -0
- truthound/datadocs/charts.py +764 -0
- truthound/datadocs/dashboard/__init__.py +63 -0
- truthound/datadocs/dashboard/app.py +576 -0
- truthound/datadocs/dashboard/components.py +584 -0
- truthound/datadocs/dashboard/state.py +240 -0
- truthound/datadocs/engine/__init__.py +46 -0
- truthound/datadocs/engine/context.py +376 -0
- truthound/datadocs/engine/pipeline.py +618 -0
- truthound/datadocs/engine/registry.py +469 -0
- truthound/datadocs/exporters/__init__.py +49 -0
- truthound/datadocs/exporters/base.py +198 -0
- truthound/datadocs/exporters/html.py +178 -0
- truthound/datadocs/exporters/json_exporter.py +253 -0
- truthound/datadocs/exporters/markdown.py +284 -0
- truthound/datadocs/exporters/pdf.py +392 -0
- truthound/datadocs/i18n/__init__.py +86 -0
- truthound/datadocs/i18n/catalog.py +960 -0
- truthound/datadocs/i18n/formatting.py +505 -0
- truthound/datadocs/i18n/loader.py +256 -0
- truthound/datadocs/i18n/plurals.py +378 -0
- truthound/datadocs/renderers/__init__.py +42 -0
- truthound/datadocs/renderers/base.py +401 -0
- truthound/datadocs/renderers/custom.py +342 -0
- truthound/datadocs/renderers/jinja.py +697 -0
- truthound/datadocs/sections.py +736 -0
- truthound/datadocs/styles.py +931 -0
- truthound/datadocs/themes/__init__.py +101 -0
- truthound/datadocs/themes/base.py +336 -0
- truthound/datadocs/themes/default.py +417 -0
- truthound/datadocs/themes/enterprise.py +419 -0
- truthound/datadocs/themes/loader.py +336 -0
- truthound/datadocs/themes.py +301 -0
- truthound/datadocs/transformers/__init__.py +57 -0
- truthound/datadocs/transformers/base.py +268 -0
- truthound/datadocs/transformers/enrichers.py +544 -0
- truthound/datadocs/transformers/filters.py +447 -0
- truthound/datadocs/transformers/i18n.py +468 -0
- truthound/datadocs/versioning/__init__.py +62 -0
- truthound/datadocs/versioning/diff.py +639 -0
- truthound/datadocs/versioning/storage.py +497 -0
- truthound/datadocs/versioning/version.py +358 -0
- truthound/datasources/__init__.py +223 -0
- truthound/datasources/_async_protocols.py +222 -0
- truthound/datasources/_protocols.py +159 -0
- truthound/datasources/adapters.py +428 -0
- truthound/datasources/async_base.py +599 -0
- truthound/datasources/async_factory.py +511 -0
- truthound/datasources/base.py +516 -0
- truthound/datasources/factory.py +433 -0
- truthound/datasources/nosql/__init__.py +47 -0
- truthound/datasources/nosql/base.py +487 -0
- truthound/datasources/nosql/elasticsearch.py +801 -0
- truthound/datasources/nosql/mongodb.py +636 -0
- truthound/datasources/pandas_optimized.py +582 -0
- truthound/datasources/pandas_source.py +216 -0
- truthound/datasources/polars_source.py +395 -0
- truthound/datasources/spark_source.py +479 -0
- truthound/datasources/sql/__init__.py +154 -0
- truthound/datasources/sql/base.py +710 -0
- truthound/datasources/sql/bigquery.py +410 -0
- truthound/datasources/sql/cloud_base.py +199 -0
- truthound/datasources/sql/databricks.py +471 -0
- truthound/datasources/sql/mysql.py +316 -0
- truthound/datasources/sql/oracle.py +427 -0
- truthound/datasources/sql/postgresql.py +321 -0
- truthound/datasources/sql/redshift.py +479 -0
- truthound/datasources/sql/snowflake.py +439 -0
- truthound/datasources/sql/sqlite.py +286 -0
- truthound/datasources/sql/sqlserver.py +437 -0
- truthound/datasources/streaming/__init__.py +47 -0
- truthound/datasources/streaming/base.py +350 -0
- truthound/datasources/streaming/kafka.py +670 -0
- truthound/decorators.py +98 -0
- truthound/docs/__init__.py +69 -0
- truthound/docs/extractor.py +971 -0
- truthound/docs/generator.py +601 -0
- truthound/docs/parser.py +1037 -0
- truthound/docs/renderer.py +999 -0
- truthound/drift/__init__.py +22 -0
- truthound/drift/compare.py +189 -0
- truthound/drift/detectors.py +464 -0
- truthound/drift/report.py +160 -0
- truthound/execution/__init__.py +65 -0
- truthound/execution/_protocols.py +324 -0
- truthound/execution/base.py +576 -0
- truthound/execution/distributed/__init__.py +179 -0
- truthound/execution/distributed/aggregations.py +731 -0
- truthound/execution/distributed/arrow_bridge.py +817 -0
- truthound/execution/distributed/base.py +550 -0
- truthound/execution/distributed/dask_engine.py +976 -0
- truthound/execution/distributed/mixins.py +766 -0
- truthound/execution/distributed/protocols.py +756 -0
- truthound/execution/distributed/ray_engine.py +1127 -0
- truthound/execution/distributed/registry.py +446 -0
- truthound/execution/distributed/spark_engine.py +1011 -0
- truthound/execution/distributed/validator_adapter.py +682 -0
- truthound/execution/pandas_engine.py +401 -0
- truthound/execution/polars_engine.py +497 -0
- truthound/execution/pushdown/__init__.py +230 -0
- truthound/execution/pushdown/ast.py +1550 -0
- truthound/execution/pushdown/builder.py +1550 -0
- truthound/execution/pushdown/dialects.py +1072 -0
- truthound/execution/pushdown/executor.py +829 -0
- truthound/execution/pushdown/optimizer.py +1041 -0
- truthound/execution/sql_engine.py +518 -0
- truthound/infrastructure/__init__.py +189 -0
- truthound/infrastructure/audit.py +1515 -0
- truthound/infrastructure/config.py +1133 -0
- truthound/infrastructure/encryption.py +1132 -0
- truthound/infrastructure/logging.py +1503 -0
- truthound/infrastructure/metrics.py +1220 -0
- truthound/lineage/__init__.py +89 -0
- truthound/lineage/base.py +746 -0
- truthound/lineage/impact_analysis.py +474 -0
- truthound/lineage/integrations/__init__.py +22 -0
- truthound/lineage/integrations/openlineage.py +548 -0
- truthound/lineage/tracker.py +512 -0
- truthound/lineage/visualization/__init__.py +33 -0
- truthound/lineage/visualization/protocols.py +145 -0
- truthound/lineage/visualization/renderers/__init__.py +20 -0
- truthound/lineage/visualization/renderers/cytoscape.py +329 -0
- truthound/lineage/visualization/renderers/d3.py +331 -0
- truthound/lineage/visualization/renderers/graphviz.py +276 -0
- truthound/lineage/visualization/renderers/mermaid.py +308 -0
- truthound/maskers.py +113 -0
- truthound/ml/__init__.py +124 -0
- truthound/ml/anomaly_models/__init__.py +31 -0
- truthound/ml/anomaly_models/ensemble.py +362 -0
- truthound/ml/anomaly_models/isolation_forest.py +444 -0
- truthound/ml/anomaly_models/statistical.py +392 -0
- truthound/ml/base.py +1178 -0
- truthound/ml/drift_detection/__init__.py +26 -0
- truthound/ml/drift_detection/concept.py +381 -0
- truthound/ml/drift_detection/distribution.py +361 -0
- truthound/ml/drift_detection/feature.py +442 -0
- truthound/ml/drift_detection/multivariate.py +495 -0
- truthound/ml/monitoring/__init__.py +88 -0
- truthound/ml/monitoring/alerting/__init__.py +33 -0
- truthound/ml/monitoring/alerting/handlers.py +427 -0
- truthound/ml/monitoring/alerting/rules.py +508 -0
- truthound/ml/monitoring/collectors/__init__.py +19 -0
- truthound/ml/monitoring/collectors/composite.py +105 -0
- truthound/ml/monitoring/collectors/drift.py +324 -0
- truthound/ml/monitoring/collectors/performance.py +179 -0
- truthound/ml/monitoring/collectors/quality.py +369 -0
- truthound/ml/monitoring/monitor.py +536 -0
- truthound/ml/monitoring/protocols.py +451 -0
- truthound/ml/monitoring/stores/__init__.py +15 -0
- truthound/ml/monitoring/stores/memory.py +201 -0
- truthound/ml/monitoring/stores/prometheus.py +296 -0
- truthound/ml/rule_learning/__init__.py +25 -0
- truthound/ml/rule_learning/constraint_miner.py +443 -0
- truthound/ml/rule_learning/pattern_learner.py +499 -0
- truthound/ml/rule_learning/profile_learner.py +462 -0
- truthound/multitenancy/__init__.py +326 -0
- truthound/multitenancy/core.py +852 -0
- truthound/multitenancy/integration.py +597 -0
- truthound/multitenancy/isolation.py +630 -0
- truthound/multitenancy/manager.py +770 -0
- truthound/multitenancy/middleware.py +765 -0
- truthound/multitenancy/quota.py +537 -0
- truthound/multitenancy/resolvers.py +603 -0
- truthound/multitenancy/storage.py +703 -0
- truthound/observability/__init__.py +307 -0
- truthound/observability/context.py +531 -0
- truthound/observability/instrumentation.py +611 -0
- truthound/observability/logging.py +887 -0
- truthound/observability/metrics.py +1157 -0
- truthound/observability/tracing/__init__.py +178 -0
- truthound/observability/tracing/baggage.py +310 -0
- truthound/observability/tracing/config.py +426 -0
- truthound/observability/tracing/exporter.py +787 -0
- truthound/observability/tracing/integration.py +1018 -0
- truthound/observability/tracing/otel/__init__.py +146 -0
- truthound/observability/tracing/otel/adapter.py +982 -0
- truthound/observability/tracing/otel/bridge.py +1177 -0
- truthound/observability/tracing/otel/compat.py +681 -0
- truthound/observability/tracing/otel/config.py +691 -0
- truthound/observability/tracing/otel/detection.py +327 -0
- truthound/observability/tracing/otel/protocols.py +426 -0
- truthound/observability/tracing/processor.py +561 -0
- truthound/observability/tracing/propagator.py +757 -0
- truthound/observability/tracing/provider.py +569 -0
- truthound/observability/tracing/resource.py +515 -0
- truthound/observability/tracing/sampler.py +487 -0
- truthound/observability/tracing/span.py +676 -0
- truthound/plugins/__init__.py +198 -0
- truthound/plugins/base.py +599 -0
- truthound/plugins/cli.py +680 -0
- truthound/plugins/dependencies/__init__.py +42 -0
- truthound/plugins/dependencies/graph.py +422 -0
- truthound/plugins/dependencies/resolver.py +417 -0
- truthound/plugins/discovery.py +379 -0
- truthound/plugins/docs/__init__.py +46 -0
- truthound/plugins/docs/extractor.py +444 -0
- truthound/plugins/docs/renderer.py +499 -0
- truthound/plugins/enterprise_manager.py +877 -0
- truthound/plugins/examples/__init__.py +19 -0
- truthound/plugins/examples/custom_validators.py +317 -0
- truthound/plugins/examples/slack_notifier.py +312 -0
- truthound/plugins/examples/xml_reporter.py +254 -0
- truthound/plugins/hooks.py +558 -0
- truthound/plugins/lifecycle/__init__.py +43 -0
- truthound/plugins/lifecycle/hot_reload.py +402 -0
- truthound/plugins/lifecycle/manager.py +371 -0
- truthound/plugins/manager.py +736 -0
- truthound/plugins/registry.py +338 -0
- truthound/plugins/security/__init__.py +93 -0
- truthound/plugins/security/exceptions.py +332 -0
- truthound/plugins/security/policies.py +348 -0
- truthound/plugins/security/protocols.py +643 -0
- truthound/plugins/security/sandbox/__init__.py +45 -0
- truthound/plugins/security/sandbox/context.py +158 -0
- truthound/plugins/security/sandbox/engines/__init__.py +19 -0
- truthound/plugins/security/sandbox/engines/container.py +379 -0
- truthound/plugins/security/sandbox/engines/noop.py +144 -0
- truthound/plugins/security/sandbox/engines/process.py +336 -0
- truthound/plugins/security/sandbox/factory.py +211 -0
- truthound/plugins/security/signing/__init__.py +57 -0
- truthound/plugins/security/signing/service.py +330 -0
- truthound/plugins/security/signing/trust_store.py +368 -0
- truthound/plugins/security/signing/verifier.py +459 -0
- truthound/plugins/versioning/__init__.py +41 -0
- truthound/plugins/versioning/constraints.py +297 -0
- truthound/plugins/versioning/resolver.py +329 -0
- truthound/profiler/__init__.py +1729 -0
- truthound/profiler/_lazy.py +452 -0
- truthound/profiler/ab_testing/__init__.py +80 -0
- truthound/profiler/ab_testing/analysis.py +449 -0
- truthound/profiler/ab_testing/base.py +257 -0
- truthound/profiler/ab_testing/experiment.py +395 -0
- truthound/profiler/ab_testing/tracking.py +368 -0
- truthound/profiler/auto_threshold.py +1170 -0
- truthound/profiler/base.py +579 -0
- truthound/profiler/cache_patterns.py +911 -0
- truthound/profiler/caching.py +1303 -0
- truthound/profiler/column_profiler.py +712 -0
- truthound/profiler/comparison.py +1007 -0
- truthound/profiler/custom_patterns.py +1170 -0
- truthound/profiler/dashboard/__init__.py +50 -0
- truthound/profiler/dashboard/app.py +476 -0
- truthound/profiler/dashboard/components.py +457 -0
- truthound/profiler/dashboard/config.py +72 -0
- truthound/profiler/distributed/__init__.py +83 -0
- truthound/profiler/distributed/base.py +281 -0
- truthound/profiler/distributed/dask_backend.py +498 -0
- truthound/profiler/distributed/local_backend.py +293 -0
- truthound/profiler/distributed/profiler.py +304 -0
- truthound/profiler/distributed/ray_backend.py +374 -0
- truthound/profiler/distributed/spark_backend.py +375 -0
- truthound/profiler/distributed.py +1366 -0
- truthound/profiler/enterprise_sampling.py +1065 -0
- truthound/profiler/errors.py +488 -0
- truthound/profiler/evolution/__init__.py +91 -0
- truthound/profiler/evolution/alerts.py +426 -0
- truthound/profiler/evolution/changes.py +206 -0
- truthound/profiler/evolution/compatibility.py +365 -0
- truthound/profiler/evolution/detector.py +372 -0
- truthound/profiler/evolution/protocols.py +121 -0
- truthound/profiler/generators/__init__.py +48 -0
- truthound/profiler/generators/base.py +384 -0
- truthound/profiler/generators/ml_rules.py +375 -0
- truthound/profiler/generators/pattern_rules.py +384 -0
- truthound/profiler/generators/schema_rules.py +267 -0
- truthound/profiler/generators/stats_rules.py +324 -0
- truthound/profiler/generators/suite_generator.py +857 -0
- truthound/profiler/i18n.py +1542 -0
- truthound/profiler/incremental.py +554 -0
- truthound/profiler/incremental_validation.py +1710 -0
- truthound/profiler/integration/__init__.py +73 -0
- truthound/profiler/integration/adapters.py +345 -0
- truthound/profiler/integration/context.py +371 -0
- truthound/profiler/integration/executor.py +527 -0
- truthound/profiler/integration/naming.py +75 -0
- truthound/profiler/integration/protocols.py +243 -0
- truthound/profiler/memory.py +1185 -0
- truthound/profiler/migration/__init__.py +60 -0
- truthound/profiler/migration/base.py +345 -0
- truthound/profiler/migration/manager.py +444 -0
- truthound/profiler/migration/v1_0_to_v1_1.py +484 -0
- truthound/profiler/ml/__init__.py +73 -0
- truthound/profiler/ml/base.py +244 -0
- truthound/profiler/ml/classifier.py +507 -0
- truthound/profiler/ml/feature_extraction.py +604 -0
- truthound/profiler/ml/pretrained.py +448 -0
- truthound/profiler/ml_inference.py +1276 -0
- truthound/profiler/native_patterns.py +815 -0
- truthound/profiler/observability.py +1184 -0
- truthound/profiler/process_timeout.py +1566 -0
- truthound/profiler/progress.py +568 -0
- truthound/profiler/progress_callbacks.py +1734 -0
- truthound/profiler/quality.py +1345 -0
- truthound/profiler/resilience.py +1180 -0
- truthound/profiler/sampled_matcher.py +794 -0
- truthound/profiler/sampling.py +1288 -0
- truthound/profiler/scheduling/__init__.py +82 -0
- truthound/profiler/scheduling/protocols.py +214 -0
- truthound/profiler/scheduling/scheduler.py +474 -0
- truthound/profiler/scheduling/storage.py +457 -0
- truthound/profiler/scheduling/triggers.py +449 -0
- truthound/profiler/schema.py +603 -0
- truthound/profiler/streaming.py +685 -0
- truthound/profiler/streaming_patterns.py +1354 -0
- truthound/profiler/suite_cli.py +625 -0
- truthound/profiler/suite_config.py +789 -0
- truthound/profiler/suite_export.py +1268 -0
- truthound/profiler/table_profiler.py +547 -0
- truthound/profiler/timeout.py +565 -0
- truthound/profiler/validation.py +1532 -0
- truthound/profiler/visualization/__init__.py +118 -0
- truthound/profiler/visualization/base.py +346 -0
- truthound/profiler/visualization/generator.py +1259 -0
- truthound/profiler/visualization/plotly_renderer.py +811 -0
- truthound/profiler/visualization/renderers.py +669 -0
- truthound/profiler/visualization/sections.py +540 -0
- truthound/profiler/visualization.py +2122 -0
- truthound/profiler/yaml_validation.py +1151 -0
- truthound/py.typed +0 -0
- truthound/ratelimit/__init__.py +248 -0
- truthound/ratelimit/algorithms.py +1108 -0
- truthound/ratelimit/core.py +573 -0
- truthound/ratelimit/integration.py +532 -0
- truthound/ratelimit/limiter.py +663 -0
- truthound/ratelimit/middleware.py +700 -0
- truthound/ratelimit/policy.py +792 -0
- truthound/ratelimit/storage.py +763 -0
- truthound/rbac/__init__.py +340 -0
- truthound/rbac/core.py +976 -0
- truthound/rbac/integration.py +760 -0
- truthound/rbac/manager.py +1052 -0
- truthound/rbac/middleware.py +842 -0
- truthound/rbac/policy.py +954 -0
- truthound/rbac/storage.py +878 -0
- truthound/realtime/__init__.py +141 -0
- truthound/realtime/adapters/__init__.py +43 -0
- truthound/realtime/adapters/base.py +533 -0
- truthound/realtime/adapters/kafka.py +487 -0
- truthound/realtime/adapters/kinesis.py +479 -0
- truthound/realtime/adapters/mock.py +243 -0
- truthound/realtime/base.py +553 -0
- truthound/realtime/factory.py +382 -0
- truthound/realtime/incremental.py +660 -0
- truthound/realtime/processing/__init__.py +67 -0
- truthound/realtime/processing/exactly_once.py +575 -0
- truthound/realtime/processing/state.py +547 -0
- truthound/realtime/processing/windows.py +647 -0
- truthound/realtime/protocols.py +569 -0
- truthound/realtime/streaming.py +605 -0
- truthound/realtime/testing/__init__.py +32 -0
- truthound/realtime/testing/containers.py +615 -0
- truthound/realtime/testing/fixtures.py +484 -0
- truthound/report.py +280 -0
- truthound/reporters/__init__.py +46 -0
- truthound/reporters/_protocols.py +30 -0
- truthound/reporters/base.py +324 -0
- truthound/reporters/ci/__init__.py +66 -0
- truthound/reporters/ci/azure.py +436 -0
- truthound/reporters/ci/base.py +509 -0
- truthound/reporters/ci/bitbucket.py +567 -0
- truthound/reporters/ci/circleci.py +547 -0
- truthound/reporters/ci/detection.py +364 -0
- truthound/reporters/ci/factory.py +182 -0
- truthound/reporters/ci/github.py +388 -0
- truthound/reporters/ci/gitlab.py +471 -0
- truthound/reporters/ci/jenkins.py +525 -0
- truthound/reporters/console_reporter.py +299 -0
- truthound/reporters/factory.py +211 -0
- truthound/reporters/html_reporter.py +524 -0
- truthound/reporters/json_reporter.py +256 -0
- truthound/reporters/markdown_reporter.py +280 -0
- truthound/reporters/sdk/__init__.py +174 -0
- truthound/reporters/sdk/builder.py +558 -0
- truthound/reporters/sdk/mixins.py +1150 -0
- truthound/reporters/sdk/schema.py +1493 -0
- truthound/reporters/sdk/templates.py +666 -0
- truthound/reporters/sdk/testing.py +968 -0
- truthound/scanners.py +170 -0
- truthound/scheduling/__init__.py +122 -0
- truthound/scheduling/cron.py +1136 -0
- truthound/scheduling/presets.py +212 -0
- truthound/schema.py +275 -0
- truthound/secrets/__init__.py +173 -0
- truthound/secrets/base.py +618 -0
- truthound/secrets/cloud.py +682 -0
- truthound/secrets/integration.py +507 -0
- truthound/secrets/manager.py +633 -0
- truthound/secrets/oidc/__init__.py +172 -0
- truthound/secrets/oidc/base.py +902 -0
- truthound/secrets/oidc/credential_provider.py +623 -0
- truthound/secrets/oidc/exchangers.py +1001 -0
- truthound/secrets/oidc/github/__init__.py +110 -0
- truthound/secrets/oidc/github/claims.py +718 -0
- truthound/secrets/oidc/github/enhanced_provider.py +693 -0
- truthound/secrets/oidc/github/trust_policy.py +742 -0
- truthound/secrets/oidc/github/verification.py +723 -0
- truthound/secrets/oidc/github/workflow.py +691 -0
- truthound/secrets/oidc/providers.py +825 -0
- truthound/secrets/providers.py +506 -0
- truthound/secrets/resolver.py +495 -0
- truthound/stores/__init__.py +177 -0
- truthound/stores/backends/__init__.py +18 -0
- truthound/stores/backends/_protocols.py +340 -0
- truthound/stores/backends/azure_blob.py +530 -0
- truthound/stores/backends/concurrent_filesystem.py +915 -0
- truthound/stores/backends/connection_pool.py +1365 -0
- truthound/stores/backends/database.py +743 -0
- truthound/stores/backends/filesystem.py +538 -0
- truthound/stores/backends/gcs.py +399 -0
- truthound/stores/backends/memory.py +354 -0
- truthound/stores/backends/s3.py +434 -0
- truthound/stores/backpressure/__init__.py +84 -0
- truthound/stores/backpressure/base.py +375 -0
- truthound/stores/backpressure/circuit_breaker.py +434 -0
- truthound/stores/backpressure/monitor.py +376 -0
- truthound/stores/backpressure/strategies.py +677 -0
- truthound/stores/base.py +551 -0
- truthound/stores/batching/__init__.py +65 -0
- truthound/stores/batching/base.py +305 -0
- truthound/stores/batching/buffer.py +370 -0
- truthound/stores/batching/store.py +248 -0
- truthound/stores/batching/writer.py +521 -0
- truthound/stores/caching/__init__.py +60 -0
- truthound/stores/caching/backends.py +684 -0
- truthound/stores/caching/base.py +356 -0
- truthound/stores/caching/store.py +305 -0
- truthound/stores/compression/__init__.py +193 -0
- truthound/stores/compression/adaptive.py +694 -0
- truthound/stores/compression/base.py +514 -0
- truthound/stores/compression/pipeline.py +868 -0
- truthound/stores/compression/providers.py +672 -0
- truthound/stores/compression/streaming.py +832 -0
- truthound/stores/concurrency/__init__.py +81 -0
- truthound/stores/concurrency/atomic.py +556 -0
- truthound/stores/concurrency/index.py +775 -0
- truthound/stores/concurrency/locks.py +576 -0
- truthound/stores/concurrency/manager.py +482 -0
- truthound/stores/encryption/__init__.py +297 -0
- truthound/stores/encryption/base.py +952 -0
- truthound/stores/encryption/keys.py +1191 -0
- truthound/stores/encryption/pipeline.py +903 -0
- truthound/stores/encryption/providers.py +953 -0
- truthound/stores/encryption/streaming.py +950 -0
- truthound/stores/expectations.py +227 -0
- truthound/stores/factory.py +246 -0
- truthound/stores/migration/__init__.py +75 -0
- truthound/stores/migration/base.py +480 -0
- truthound/stores/migration/manager.py +347 -0
- truthound/stores/migration/registry.py +382 -0
- truthound/stores/migration/store.py +559 -0
- truthound/stores/observability/__init__.py +106 -0
- truthound/stores/observability/audit.py +718 -0
- truthound/stores/observability/config.py +270 -0
- truthound/stores/observability/factory.py +208 -0
- truthound/stores/observability/metrics.py +636 -0
- truthound/stores/observability/protocols.py +410 -0
- truthound/stores/observability/store.py +570 -0
- truthound/stores/observability/tracing.py +784 -0
- truthound/stores/replication/__init__.py +76 -0
- truthound/stores/replication/base.py +260 -0
- truthound/stores/replication/monitor.py +269 -0
- truthound/stores/replication/store.py +439 -0
- truthound/stores/replication/syncer.py +391 -0
- truthound/stores/results.py +359 -0
- truthound/stores/retention/__init__.py +77 -0
- truthound/stores/retention/base.py +378 -0
- truthound/stores/retention/policies.py +621 -0
- truthound/stores/retention/scheduler.py +279 -0
- truthound/stores/retention/store.py +526 -0
- truthound/stores/streaming/__init__.py +138 -0
- truthound/stores/streaming/base.py +801 -0
- truthound/stores/streaming/database.py +984 -0
- truthound/stores/streaming/filesystem.py +719 -0
- truthound/stores/streaming/reader.py +629 -0
- truthound/stores/streaming/s3.py +843 -0
- truthound/stores/streaming/writer.py +790 -0
- truthound/stores/tiering/__init__.py +108 -0
- truthound/stores/tiering/base.py +462 -0
- truthound/stores/tiering/manager.py +249 -0
- truthound/stores/tiering/policies.py +692 -0
- truthound/stores/tiering/store.py +526 -0
- truthound/stores/versioning/__init__.py +56 -0
- truthound/stores/versioning/base.py +376 -0
- truthound/stores/versioning/store.py +660 -0
- truthound/stores/versioning/strategies.py +353 -0
- truthound/types.py +56 -0
- truthound/validators/__init__.py +774 -0
- truthound/validators/aggregate/__init__.py +27 -0
- truthound/validators/aggregate/central.py +116 -0
- truthound/validators/aggregate/extremes.py +116 -0
- truthound/validators/aggregate/spread.py +118 -0
- truthound/validators/aggregate/sum.py +64 -0
- truthound/validators/aggregate/type.py +78 -0
- truthound/validators/anomaly/__init__.py +93 -0
- truthound/validators/anomaly/base.py +431 -0
- truthound/validators/anomaly/ml_based.py +1190 -0
- truthound/validators/anomaly/multivariate.py +647 -0
- truthound/validators/anomaly/statistical.py +599 -0
- truthound/validators/base.py +1089 -0
- truthound/validators/business_rule/__init__.py +46 -0
- truthound/validators/business_rule/base.py +147 -0
- truthound/validators/business_rule/checksum.py +509 -0
- truthound/validators/business_rule/financial.py +526 -0
- truthound/validators/cache.py +733 -0
- truthound/validators/completeness/__init__.py +39 -0
- truthound/validators/completeness/conditional.py +73 -0
- truthound/validators/completeness/default.py +98 -0
- truthound/validators/completeness/empty.py +103 -0
- truthound/validators/completeness/nan.py +337 -0
- truthound/validators/completeness/null.py +152 -0
- truthound/validators/cross_table/__init__.py +17 -0
- truthound/validators/cross_table/aggregate.py +333 -0
- truthound/validators/cross_table/row_count.py +122 -0
- truthound/validators/datetime/__init__.py +29 -0
- truthound/validators/datetime/format.py +78 -0
- truthound/validators/datetime/freshness.py +269 -0
- truthound/validators/datetime/order.py +73 -0
- truthound/validators/datetime/parseable.py +185 -0
- truthound/validators/datetime/range.py +202 -0
- truthound/validators/datetime/timezone.py +69 -0
- truthound/validators/distribution/__init__.py +49 -0
- truthound/validators/distribution/distribution.py +128 -0
- truthound/validators/distribution/monotonic.py +119 -0
- truthound/validators/distribution/outlier.py +178 -0
- truthound/validators/distribution/quantile.py +80 -0
- truthound/validators/distribution/range.py +254 -0
- truthound/validators/distribution/set.py +125 -0
- truthound/validators/distribution/statistical.py +459 -0
- truthound/validators/drift/__init__.py +79 -0
- truthound/validators/drift/base.py +427 -0
- truthound/validators/drift/multi_feature.py +401 -0
- truthound/validators/drift/numeric.py +395 -0
- truthound/validators/drift/psi.py +446 -0
- truthound/validators/drift/statistical.py +510 -0
- truthound/validators/enterprise.py +1658 -0
- truthound/validators/geospatial/__init__.py +80 -0
- truthound/validators/geospatial/base.py +97 -0
- truthound/validators/geospatial/boundary.py +238 -0
- truthound/validators/geospatial/coordinate.py +351 -0
- truthound/validators/geospatial/distance.py +399 -0
- truthound/validators/geospatial/polygon.py +665 -0
- truthound/validators/i18n/__init__.py +308 -0
- truthound/validators/i18n/bidi.py +571 -0
- truthound/validators/i18n/catalogs.py +570 -0
- truthound/validators/i18n/dialects.py +763 -0
- truthound/validators/i18n/extended_catalogs.py +549 -0
- truthound/validators/i18n/formatting.py +1434 -0
- truthound/validators/i18n/loader.py +1020 -0
- truthound/validators/i18n/messages.py +521 -0
- truthound/validators/i18n/plural.py +683 -0
- truthound/validators/i18n/protocols.py +855 -0
- truthound/validators/i18n/tms.py +1162 -0
- truthound/validators/localization/__init__.py +53 -0
- truthound/validators/localization/base.py +122 -0
- truthound/validators/localization/chinese.py +362 -0
- truthound/validators/localization/japanese.py +275 -0
- truthound/validators/localization/korean.py +524 -0
- truthound/validators/memory/__init__.py +94 -0
- truthound/validators/memory/approximate_knn.py +506 -0
- truthound/validators/memory/base.py +547 -0
- truthound/validators/memory/sgd_online.py +719 -0
- truthound/validators/memory/streaming_ecdf.py +753 -0
- truthound/validators/ml_feature/__init__.py +54 -0
- truthound/validators/ml_feature/base.py +249 -0
- truthound/validators/ml_feature/correlation.py +299 -0
- truthound/validators/ml_feature/leakage.py +344 -0
- truthound/validators/ml_feature/null_impact.py +270 -0
- truthound/validators/ml_feature/scale.py +264 -0
- truthound/validators/multi_column/__init__.py +89 -0
- truthound/validators/multi_column/arithmetic.py +284 -0
- truthound/validators/multi_column/base.py +231 -0
- truthound/validators/multi_column/comparison.py +273 -0
- truthound/validators/multi_column/consistency.py +312 -0
- truthound/validators/multi_column/statistical.py +299 -0
- truthound/validators/optimization/__init__.py +164 -0
- truthound/validators/optimization/aggregation.py +563 -0
- truthound/validators/optimization/covariance.py +556 -0
- truthound/validators/optimization/geo.py +626 -0
- truthound/validators/optimization/graph.py +587 -0
- truthound/validators/optimization/orchestrator.py +970 -0
- truthound/validators/optimization/profiling.py +1312 -0
- truthound/validators/privacy/__init__.py +223 -0
- truthound/validators/privacy/base.py +635 -0
- truthound/validators/privacy/ccpa.py +670 -0
- truthound/validators/privacy/gdpr.py +728 -0
- truthound/validators/privacy/global_patterns.py +604 -0
- truthound/validators/privacy/plugins.py +867 -0
- truthound/validators/profiling/__init__.py +52 -0
- truthound/validators/profiling/base.py +175 -0
- truthound/validators/profiling/cardinality.py +312 -0
- truthound/validators/profiling/entropy.py +391 -0
- truthound/validators/profiling/frequency.py +455 -0
- truthound/validators/pushdown_support.py +660 -0
- truthound/validators/query/__init__.py +91 -0
- truthound/validators/query/aggregate.py +346 -0
- truthound/validators/query/base.py +246 -0
- truthound/validators/query/column.py +249 -0
- truthound/validators/query/expression.py +274 -0
- truthound/validators/query/result.py +323 -0
- truthound/validators/query/row_count.py +264 -0
- truthound/validators/referential/__init__.py +80 -0
- truthound/validators/referential/base.py +395 -0
- truthound/validators/referential/cascade.py +391 -0
- truthound/validators/referential/circular.py +563 -0
- truthound/validators/referential/foreign_key.py +624 -0
- truthound/validators/referential/orphan.py +485 -0
- truthound/validators/registry.py +112 -0
- truthound/validators/schema/__init__.py +41 -0
- truthound/validators/schema/column_count.py +142 -0
- truthound/validators/schema/column_exists.py +80 -0
- truthound/validators/schema/column_order.py +82 -0
- truthound/validators/schema/column_pair.py +85 -0
- truthound/validators/schema/column_pair_set.py +195 -0
- truthound/validators/schema/column_type.py +94 -0
- truthound/validators/schema/multi_column.py +53 -0
- truthound/validators/schema/multi_column_aggregate.py +175 -0
- truthound/validators/schema/referential.py +274 -0
- truthound/validators/schema/table_schema.py +91 -0
- truthound/validators/schema_validator.py +219 -0
- truthound/validators/sdk/__init__.py +250 -0
- truthound/validators/sdk/builder.py +680 -0
- truthound/validators/sdk/decorators.py +474 -0
- truthound/validators/sdk/enterprise/__init__.py +211 -0
- truthound/validators/sdk/enterprise/docs.py +725 -0
- truthound/validators/sdk/enterprise/fuzzing.py +659 -0
- truthound/validators/sdk/enterprise/licensing.py +709 -0
- truthound/validators/sdk/enterprise/manager.py +543 -0
- truthound/validators/sdk/enterprise/resources.py +628 -0
- truthound/validators/sdk/enterprise/sandbox.py +766 -0
- truthound/validators/sdk/enterprise/signing.py +603 -0
- truthound/validators/sdk/enterprise/templates.py +865 -0
- truthound/validators/sdk/enterprise/versioning.py +659 -0
- truthound/validators/sdk/templates.py +757 -0
- truthound/validators/sdk/testing.py +807 -0
- truthound/validators/security/__init__.py +181 -0
- truthound/validators/security/redos/__init__.py +182 -0
- truthound/validators/security/redos/core.py +861 -0
- truthound/validators/security/redos/cpu_monitor.py +593 -0
- truthound/validators/security/redos/cve_database.py +791 -0
- truthound/validators/security/redos/ml/__init__.py +155 -0
- truthound/validators/security/redos/ml/base.py +785 -0
- truthound/validators/security/redos/ml/datasets.py +618 -0
- truthound/validators/security/redos/ml/features.py +359 -0
- truthound/validators/security/redos/ml/models.py +1000 -0
- truthound/validators/security/redos/ml/predictor.py +507 -0
- truthound/validators/security/redos/ml/storage.py +632 -0
- truthound/validators/security/redos/ml/training.py +571 -0
- truthound/validators/security/redos/ml_analyzer.py +937 -0
- truthound/validators/security/redos/optimizer.py +674 -0
- truthound/validators/security/redos/profiler.py +682 -0
- truthound/validators/security/redos/re2_engine.py +709 -0
- truthound/validators/security/redos.py +886 -0
- truthound/validators/security/sql_security.py +1247 -0
- truthound/validators/streaming/__init__.py +126 -0
- truthound/validators/streaming/base.py +292 -0
- truthound/validators/streaming/completeness.py +210 -0
- truthound/validators/streaming/mixin.py +575 -0
- truthound/validators/streaming/range.py +308 -0
- truthound/validators/streaming/sources.py +846 -0
- truthound/validators/string/__init__.py +57 -0
- truthound/validators/string/casing.py +158 -0
- truthound/validators/string/charset.py +96 -0
- truthound/validators/string/format.py +501 -0
- truthound/validators/string/json.py +77 -0
- truthound/validators/string/json_schema.py +184 -0
- truthound/validators/string/length.py +104 -0
- truthound/validators/string/like_pattern.py +237 -0
- truthound/validators/string/regex.py +202 -0
- truthound/validators/string/regex_extended.py +435 -0
- truthound/validators/table/__init__.py +88 -0
- truthound/validators/table/base.py +78 -0
- truthound/validators/table/column_count.py +198 -0
- truthound/validators/table/freshness.py +362 -0
- truthound/validators/table/row_count.py +251 -0
- truthound/validators/table/schema.py +333 -0
- truthound/validators/table/size.py +285 -0
- truthound/validators/timeout/__init__.py +102 -0
- truthound/validators/timeout/advanced/__init__.py +247 -0
- truthound/validators/timeout/advanced/circuit_breaker.py +675 -0
- truthound/validators/timeout/advanced/prediction.py +773 -0
- truthound/validators/timeout/advanced/priority.py +618 -0
- truthound/validators/timeout/advanced/redis_backend.py +770 -0
- truthound/validators/timeout/advanced/retry.py +721 -0
- truthound/validators/timeout/advanced/sampling.py +788 -0
- truthound/validators/timeout/advanced/sla.py +661 -0
- truthound/validators/timeout/advanced/telemetry.py +804 -0
- truthound/validators/timeout/cascade.py +477 -0
- truthound/validators/timeout/deadline.py +657 -0
- truthound/validators/timeout/degradation.py +525 -0
- truthound/validators/timeout/distributed.py +597 -0
- truthound/validators/timeseries/__init__.py +89 -0
- truthound/validators/timeseries/base.py +326 -0
- truthound/validators/timeseries/completeness.py +617 -0
- truthound/validators/timeseries/gap.py +485 -0
- truthound/validators/timeseries/monotonic.py +310 -0
- truthound/validators/timeseries/seasonality.py +422 -0
- truthound/validators/timeseries/trend.py +510 -0
- truthound/validators/uniqueness/__init__.py +59 -0
- truthound/validators/uniqueness/approximate.py +475 -0
- truthound/validators/uniqueness/distinct_values.py +253 -0
- truthound/validators/uniqueness/duplicate.py +118 -0
- truthound/validators/uniqueness/primary_key.py +140 -0
- truthound/validators/uniqueness/unique.py +191 -0
- truthound/validators/uniqueness/within_record.py +599 -0
- truthound/validators/utils.py +756 -0
- truthound-1.0.8.dist-info/METADATA +474 -0
- truthound-1.0.8.dist-info/RECORD +877 -0
- truthound-1.0.8.dist-info/WHEEL +4 -0
- truthound-1.0.8.dist-info/entry_points.txt +2 -0
- truthound-1.0.8.dist-info/licenses/LICENSE +190 -0
|
@@ -0,0 +1,794 @@
|
|
|
1
|
+
"""Memory-safe pattern matcher with integrated sampling.
|
|
2
|
+
|
|
3
|
+
This module provides a pattern matcher that integrates sampling strategies
|
|
4
|
+
to prevent OOM errors while maintaining statistical accuracy.
|
|
5
|
+
|
|
6
|
+
Key features:
|
|
7
|
+
- Configurable sampling strategies
|
|
8
|
+
- Memory-aware processing
|
|
9
|
+
- Statistical confidence reporting
|
|
10
|
+
- Graceful degradation on failures
|
|
11
|
+
- Telemetry integration
|
|
12
|
+
|
|
13
|
+
Example:
|
|
14
|
+
from truthound.profiler.sampled_matcher import (
|
|
15
|
+
SampledPatternMatcher,
|
|
16
|
+
SampledMatcherConfig,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
matcher = SampledPatternMatcher(
|
|
20
|
+
sampling_config=SamplingConfig.for_accuracy("high"),
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
results = matcher.match_column(lf, "email")
|
|
24
|
+
print(f"Confidence: {results.best_match.confidence:.2%}")
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
from __future__ import annotations
|
|
28
|
+
|
|
29
|
+
import logging
|
|
30
|
+
import time
|
|
31
|
+
from dataclasses import dataclass, field
|
|
32
|
+
from typing import Any, Callable, Sequence
|
|
33
|
+
|
|
34
|
+
import polars as pl
|
|
35
|
+
|
|
36
|
+
from truthound.profiler.base import DataType, PatternMatch
|
|
37
|
+
from truthound.profiler.native_patterns import (
|
|
38
|
+
BUILTIN_PATTERNS,
|
|
39
|
+
NativePatternMatcher,
|
|
40
|
+
PatternMatchResult,
|
|
41
|
+
PatternRegistry,
|
|
42
|
+
PatternSpec,
|
|
43
|
+
)
|
|
44
|
+
from truthound.profiler.sampling import (
|
|
45
|
+
DEFAULT_SAMPLING_CONFIG,
|
|
46
|
+
DataSizeEstimator,
|
|
47
|
+
Sampler,
|
|
48
|
+
SamplingConfig,
|
|
49
|
+
SamplingMetrics,
|
|
50
|
+
SamplingMethod,
|
|
51
|
+
SamplingResult,
|
|
52
|
+
)
|
|
53
|
+
|
|
54
|
+
logger = logging.getLogger(__name__)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
# =============================================================================
|
|
58
|
+
# Sampled Match Result
|
|
59
|
+
# =============================================================================
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
@dataclass
class SampledPatternMatchResult:
    """Pattern match result with sampling metadata.

    Holds the counts measured for one pattern on the (possibly sampled)
    data plus statistics derived from them. ``estimated_population_matches``
    and ``confidence_interval`` are recomputed in ``__post_init__``, so
    values passed for them by the caller are overwritten (the estimate only
    when sampling metrics with a positive ``original_size`` are present).

    Attributes:
        pattern: The matched pattern specification
        match_count: Number of matches in sample
        total_count: Total non-null values in sample
        match_ratio: Ratio of matches in sample
        sample_matches: Example matching values
        sample_non_matches: Example non-matching values
        sampling_metrics: Metrics from sampling operation
        estimated_population_matches: Extrapolated matches in full data
        confidence_interval: (lower, upper) bounds for match ratio
    """

    pattern: PatternSpec
    match_count: int
    total_count: int
    match_ratio: float
    sample_matches: tuple[str, ...] = field(default_factory=tuple)
    sample_non_matches: tuple[str, ...] = field(default_factory=tuple)
    sampling_metrics: SamplingMetrics | None = None
    estimated_population_matches: int = 0
    confidence_interval: tuple[float, float] = (0.0, 1.0)

    def __post_init__(self) -> None:
        """Calculate derived fields."""
        metrics = self.sampling_metrics
        if metrics and metrics.original_size > 0:
            # Extrapolate the sample ratio to the full population.
            self.estimated_population_matches = int(
                self.match_ratio * metrics.original_size
            )

        self._calculate_confidence_interval()

    def _calculate_confidence_interval(self) -> None:
        """Calculate the Wilson score confidence interval for ``match_ratio``.

        With no observations the interval is the uninformative ``(0.0, 1.0)``.
        """
        n = self.total_count
        if n <= 0:
            self.confidence_interval = (0.0, 1.0)
            return

        # Clamp the proportion: a ratio outside [0, 1] would make p * (1 - p)
        # negative, and a negative float ** 0.5 yields a complex number that
        # max()/min() below cannot order. NaN is deliberately left untouched.
        p = self.match_ratio
        if p < 0.0:
            p = 0.0
        elif p > 1.0:
            p = 1.0

        # Z-score for the confidence level (95% when no metrics are attached).
        z = 1.96
        if self.sampling_metrics:
            z = self._z_from_confidence(self.sampling_metrics.confidence_level)

        # Wilson score interval.
        z_sq = z * z
        denominator = 1 + z_sq / n
        center = (p + z_sq / (2 * n)) / denominator
        spread = z * ((p * (1 - p) / n + z_sq / (4 * n * n)) ** 0.5) / denominator

        self.confidence_interval = (
            max(0.0, center - spread),
            min(1.0, center + spread),
        )

    @staticmethod
    def _z_from_confidence(confidence: float) -> float:
        """Get the two-sided Z-score for a confidence level.

        Common levels use the conventional rounded table values. Any other
        level strictly between 0 and 1 is computed from the standard normal
        quantile function instead of silently being treated as 95%. Values
        outside (0, 1) fall back to 1.96.
        """
        from statistics import NormalDist

        z_scores = {
            0.90: 1.645,
            0.95: 1.96,
            0.99: 2.576,
            0.999: 3.291,
        }
        tabulated = z_scores.get(round(confidence, 3))
        if tabulated is not None:
            return tabulated
        if 0.0 < confidence < 1.0:
            # Two-sided interval: split the remaining mass over both tails.
            return NormalDist().inv_cdf((1.0 + confidence) / 2.0)
        return 1.96

    @property
    def confidence(self) -> float:
        """Get confidence in the match ratio estimate."""
        if self.sampling_metrics:
            return self.sampling_metrics.confidence_level
        return 1.0  # No sampling = full confidence

    @property
    def is_sampled(self) -> bool:
        """Check if result is from sampled data."""
        metrics = self.sampling_metrics
        # Truthiness (not identity with False) so falsy non-bool flags count
        # as "not a full scan", matching SampledColumnMatchResult.is_sampled.
        return metrics is not None and not metrics.is_full_scan

    @property
    def margin_of_error(self) -> float:
        """Get margin of error for match ratio (half the interval width)."""
        lower, upper = self.confidence_interval
        return (upper - lower) / 2

    def to_pattern_match(self) -> PatternMatch:
        """Convert to legacy PatternMatch format."""
        return PatternMatch(
            pattern=self.pattern.name,
            regex=self.pattern.regex,
            match_ratio=self.match_ratio,
            sample_matches=self.sample_matches,
        )

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary for serialization."""
        return {
            "pattern_name": self.pattern.name,
            "pattern_regex": self.pattern.regex,
            "match_count": self.match_count,
            "total_count": self.total_count,
            "match_ratio": self.match_ratio,
            "confidence": self.confidence,
            "confidence_interval": list(self.confidence_interval),
            "margin_of_error": self.margin_of_error,
            "is_sampled": self.is_sampled,
            "estimated_population_matches": self.estimated_population_matches,
            "sample_matches": list(self.sample_matches),
            "sampling_metrics": (
                self.sampling_metrics.to_dict() if self.sampling_metrics else None
            ),
        }
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
@dataclass
class SampledColumnMatchResult:
    """Everything produced for one column: pattern matches plus run metadata."""

    column: str
    matches: list[SampledPatternMatchResult]
    sampling_metrics: SamplingMetrics | None
    processing_time_ms: float
    inferred_type: DataType | None = None

    @property
    def has_matches(self) -> bool:
        """Tell whether at least one pattern matched."""
        return bool(self.matches)

    @property
    def best_match(self) -> SampledPatternMatchResult | None:
        """Return the first entry of ``matches``, or None when there is none.

        NOTE(review): this relies on the producer ordering ``matches``
        best-first; no sorting happens here.
        """
        return self.matches[0] if self.matches else None

    @property
    def is_sampled(self) -> bool:
        """Tell whether the column was analysed on a sample, not a full scan."""
        metrics = self.sampling_metrics
        if metrics is None:
            return False
        return not metrics.is_full_scan

    def to_dict(self) -> dict[str, Any]:
        """Serialize the result into a plain dictionary."""
        metrics = self.sampling_metrics
        type_tag = self.inferred_type

        payload: dict[str, Any] = {}
        payload["column"] = self.column
        payload["matches"] = [match.to_dict() for match in self.matches]
        payload["sampling_metrics"] = metrics.to_dict() if metrics else None
        payload["processing_time_ms"] = self.processing_time_ms
        payload["inferred_type"] = type_tag.value if type_tag else None
        payload["is_sampled"] = self.is_sampled
        return payload
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
# =============================================================================
|
|
229
|
+
# Sampled Pattern Matcher Configuration
|
|
230
|
+
# =============================================================================
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
@dataclass
class SampledMatcherConfig:
    """Settings consumed by SampledPatternMatcher.

    Attributes:
        sampling_config: How the data is sampled before matching
        patterns: Pattern registry to match against (None = matcher default)
        min_match_ratio: Lowest match ratio that still counts as a match
        sample_size: How many example values to collect
        include_non_matches: Also collect examples that did not match
        parallel_threshold: Row count above which to use parallel processing
        fallback_on_error: Retry with the fallback strategy if sampling fails
        cache_sampling_decisions: Cache sampling decisions for same data
    """

    sampling_config: SamplingConfig = field(default_factory=lambda: DEFAULT_SAMPLING_CONFIG)
    patterns: PatternRegistry | None = None
    min_match_ratio: float = 0.8
    sample_size: int = 5
    include_non_matches: bool = False
    parallel_threshold: int = 100_000
    fallback_on_error: bool = True
    cache_sampling_decisions: bool = True

    def __post_init__(self) -> None:
        """Reject out-of-range settings.

        Raises:
            ValueError: If ``min_match_ratio`` is not within [0, 1] or
                ``sample_size`` is negative.
        """
        ratio_in_range = 0.0 <= self.min_match_ratio <= 1.0
        if not ratio_in_range:
            message = f"min_match_ratio must be between 0 and 1, got {self.min_match_ratio}"
            raise ValueError(message)

        if self.sample_size < 0:
            message = f"sample_size must be non-negative, got {self.sample_size}"
            raise ValueError(message)

    @classmethod
    def fast(cls) -> "SampledMatcherConfig":
        """Build a preset that favours speed."""
        speed_sampling = SamplingConfig.for_speed()
        return cls(sampling_config=speed_sampling, min_match_ratio=0.7, sample_size=3)

    @classmethod
    def accurate(cls) -> "SampledMatcherConfig":
        """Build a preset that favours accuracy."""
        precise_sampling = SamplingConfig.for_accuracy("high")
        return cls(sampling_config=precise_sampling, min_match_ratio=0.85, sample_size=10)

    @classmethod
    def balanced(cls) -> "SampledMatcherConfig":
        """Build the balanced preset (the matcher's default)."""
        medium_sampling = SamplingConfig.for_accuracy("medium")
        return cls(sampling_config=medium_sampling, min_match_ratio=0.8, sample_size=5)
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
# =============================================================================
|
|
295
|
+
# Sampled Pattern Matcher
|
|
296
|
+
# =============================================================================
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
class SampledPatternMatcher:
|
|
300
|
+
"""Memory-safe pattern matcher with integrated sampling.
|
|
301
|
+
|
|
302
|
+
This is the recommended pattern matcher for production use.
|
|
303
|
+
It automatically samples large datasets to prevent OOM errors
|
|
304
|
+
while providing statistical confidence metrics.
|
|
305
|
+
|
|
306
|
+
Example:
|
|
307
|
+
# Basic usage
|
|
308
|
+
matcher = SampledPatternMatcher()
|
|
309
|
+
results = matcher.match_column(lf, "email")
|
|
310
|
+
|
|
311
|
+
for result in results.matches:
|
|
312
|
+
print(f"{result.pattern.name}: {result.match_ratio:.2%} "
|
|
313
|
+
f"(±{result.margin_of_error:.2%})")
|
|
314
|
+
|
|
315
|
+
# Custom configuration
|
|
316
|
+
config = SampledMatcherConfig(
|
|
317
|
+
sampling_config=SamplingConfig(
|
|
318
|
+
strategy=SamplingMethod.RANDOM,
|
|
319
|
+
max_rows=50_000,
|
|
320
|
+
confidence_level=0.99,
|
|
321
|
+
),
|
|
322
|
+
min_match_ratio=0.9,
|
|
323
|
+
)
|
|
324
|
+
matcher = SampledPatternMatcher(config=config)
|
|
325
|
+
|
|
326
|
+
# Memory-constrained environment
|
|
327
|
+
matcher = SampledPatternMatcher(
|
|
328
|
+
config=SampledMatcherConfig(
|
|
329
|
+
sampling_config=SamplingConfig.for_memory(max_memory_mb=100)
|
|
330
|
+
)
|
|
331
|
+
)
|
|
332
|
+
"""
|
|
333
|
+
|
|
334
|
+
def __init__(
|
|
335
|
+
self,
|
|
336
|
+
config: SampledMatcherConfig | None = None,
|
|
337
|
+
sampling_config: SamplingConfig | None = None,
|
|
338
|
+
patterns: PatternRegistry | None = None,
|
|
339
|
+
):
|
|
340
|
+
"""Initialize the sampled pattern matcher.
|
|
341
|
+
|
|
342
|
+
Args:
|
|
343
|
+
config: Full matcher configuration
|
|
344
|
+
sampling_config: Override sampling config (convenience)
|
|
345
|
+
patterns: Override pattern registry (convenience)
|
|
346
|
+
"""
|
|
347
|
+
self.config = config or SampledMatcherConfig.balanced()
|
|
348
|
+
|
|
349
|
+
# Allow convenience overrides
|
|
350
|
+
if sampling_config is not None:
|
|
351
|
+
self.config.sampling_config = sampling_config
|
|
352
|
+
if patterns is not None:
|
|
353
|
+
self.config.patterns = patterns
|
|
354
|
+
|
|
355
|
+
# Initialize components
|
|
356
|
+
self._sampler = Sampler(self.config.sampling_config)
|
|
357
|
+
self._size_estimator = DataSizeEstimator()
|
|
358
|
+
self._patterns = self.config.patterns or BUILTIN_PATTERNS
|
|
359
|
+
|
|
360
|
+
# Internal matcher for actual pattern matching
|
|
361
|
+
self._matcher = NativePatternMatcher(
|
|
362
|
+
patterns=self._patterns,
|
|
363
|
+
min_match_ratio=self.config.min_match_ratio,
|
|
364
|
+
sample_size=self.config.sample_size,
|
|
365
|
+
include_non_matches=self.config.include_non_matches,
|
|
366
|
+
)
|
|
367
|
+
|
|
368
|
+
@property
|
|
369
|
+
def patterns(self) -> PatternRegistry:
|
|
370
|
+
"""Get the pattern registry."""
|
|
371
|
+
return self._patterns
|
|
372
|
+
|
|
373
|
+
@property
|
|
374
|
+
def sampling_config(self) -> SamplingConfig:
|
|
375
|
+
"""Get the sampling configuration."""
|
|
376
|
+
return self.config.sampling_config
|
|
377
|
+
|
|
378
|
+
def match_column(
|
|
379
|
+
self,
|
|
380
|
+
lf: pl.LazyFrame,
|
|
381
|
+
column: str,
|
|
382
|
+
*,
|
|
383
|
+
patterns: Sequence[PatternSpec] | None = None,
|
|
384
|
+
sampling_config: SamplingConfig | None = None,
|
|
385
|
+
) -> SampledColumnMatchResult:
|
|
386
|
+
"""Match patterns against a column with automatic sampling.
|
|
387
|
+
|
|
388
|
+
This is the main entry point. It will:
|
|
389
|
+
1. Estimate data size
|
|
390
|
+
2. Apply appropriate sampling strategy
|
|
391
|
+
3. Run pattern matching on sample
|
|
392
|
+
4. Calculate statistical confidence
|
|
393
|
+
|
|
394
|
+
Args:
|
|
395
|
+
lf: LazyFrame containing the data
|
|
396
|
+
column: Column name to analyze
|
|
397
|
+
patterns: Optional specific patterns to check
|
|
398
|
+
sampling_config: Override sampling config for this call
|
|
399
|
+
|
|
400
|
+
Returns:
|
|
401
|
+
SampledColumnMatchResult with matches and metrics
|
|
402
|
+
"""
|
|
403
|
+
start_time = time.perf_counter()
|
|
404
|
+
|
|
405
|
+
# Use override or default config
|
|
406
|
+
config = sampling_config or self.config.sampling_config
|
|
407
|
+
|
|
408
|
+
# Step 1: Sample the data
|
|
409
|
+
try:
|
|
410
|
+
sampling_result = self._sample_column(lf, column, config)
|
|
411
|
+
except Exception as e:
|
|
412
|
+
logger.error(f"Sampling failed for column '{column}': {e}")
|
|
413
|
+
if self.config.fallback_on_error:
|
|
414
|
+
# Fallback to simple head sampling
|
|
415
|
+
sampling_result = self._fallback_sample(lf, column, config)
|
|
416
|
+
else:
|
|
417
|
+
raise
|
|
418
|
+
|
|
419
|
+
# Step 2: Run pattern matching on sampled data
|
|
420
|
+
try:
|
|
421
|
+
pattern_results = self._match_on_sample(
|
|
422
|
+
sampling_result.data,
|
|
423
|
+
column,
|
|
424
|
+
patterns,
|
|
425
|
+
)
|
|
426
|
+
except Exception as e:
|
|
427
|
+
logger.error(f"Pattern matching failed for column '{column}': {e}")
|
|
428
|
+
pattern_results = []
|
|
429
|
+
|
|
430
|
+
# Step 3: Convert to sampled results with confidence
|
|
431
|
+
sampled_results = self._enhance_results(
|
|
432
|
+
pattern_results,
|
|
433
|
+
sampling_result.metrics,
|
|
434
|
+
)
|
|
435
|
+
|
|
436
|
+
# Step 4: Infer type from best match
|
|
437
|
+
inferred_type = None
|
|
438
|
+
if sampled_results:
|
|
439
|
+
inferred_type = sampled_results[0].pattern.data_type
|
|
440
|
+
|
|
441
|
+
elapsed_ms = (time.perf_counter() - start_time) * 1000
|
|
442
|
+
|
|
443
|
+
return SampledColumnMatchResult(
|
|
444
|
+
column=column,
|
|
445
|
+
matches=sampled_results,
|
|
446
|
+
sampling_metrics=sampling_result.metrics,
|
|
447
|
+
processing_time_ms=elapsed_ms,
|
|
448
|
+
inferred_type=inferred_type,
|
|
449
|
+
)
|
|
450
|
+
|
|
451
|
+
def match_all_columns(
|
|
452
|
+
self,
|
|
453
|
+
lf: pl.LazyFrame,
|
|
454
|
+
*,
|
|
455
|
+
string_columns_only: bool = True,
|
|
456
|
+
sampling_config: SamplingConfig | None = None,
|
|
457
|
+
) -> dict[str, SampledColumnMatchResult]:
|
|
458
|
+
"""Match patterns against all applicable columns.
|
|
459
|
+
|
|
460
|
+
Args:
|
|
461
|
+
lf: LazyFrame to analyze
|
|
462
|
+
string_columns_only: Only analyze string columns
|
|
463
|
+
sampling_config: Override sampling configuration
|
|
464
|
+
|
|
465
|
+
Returns:
|
|
466
|
+
Dictionary mapping column names to their results
|
|
467
|
+
"""
|
|
468
|
+
schema = lf.collect_schema()
|
|
469
|
+
results: dict[str, SampledColumnMatchResult] = {}
|
|
470
|
+
|
|
471
|
+
for col_name, dtype in schema.items():
|
|
472
|
+
if string_columns_only:
|
|
473
|
+
if dtype not in {pl.String, pl.Utf8}:
|
|
474
|
+
continue
|
|
475
|
+
|
|
476
|
+
result = self.match_column(
|
|
477
|
+
lf,
|
|
478
|
+
col_name,
|
|
479
|
+
sampling_config=sampling_config,
|
|
480
|
+
)
|
|
481
|
+
|
|
482
|
+
if result.has_matches:
|
|
483
|
+
results[col_name] = result
|
|
484
|
+
|
|
485
|
+
return results
|
|
486
|
+
|
|
487
|
+
def infer_type(
|
|
488
|
+
self,
|
|
489
|
+
lf: pl.LazyFrame,
|
|
490
|
+
column: str,
|
|
491
|
+
*,
|
|
492
|
+
min_match_ratio: float | None = None,
|
|
493
|
+
) -> DataType | None:
|
|
494
|
+
"""Infer semantic type for a column.
|
|
495
|
+
|
|
496
|
+
Args:
|
|
497
|
+
lf: LazyFrame containing the data
|
|
498
|
+
column: Column name to analyze
|
|
499
|
+
min_match_ratio: Override minimum match ratio
|
|
500
|
+
|
|
501
|
+
Returns:
|
|
502
|
+
Inferred DataType or None
|
|
503
|
+
"""
|
|
504
|
+
result = self.match_column(lf, column)
|
|
505
|
+
return result.inferred_type
|
|
506
|
+
|
|
507
|
+
def _sample_column(
|
|
508
|
+
self,
|
|
509
|
+
lf: pl.LazyFrame,
|
|
510
|
+
column: str,
|
|
511
|
+
config: SamplingConfig,
|
|
512
|
+
) -> SamplingResult:
|
|
513
|
+
"""Sample a single column for pattern matching."""
|
|
514
|
+
# Select only the needed column for efficiency
|
|
515
|
+
column_lf = lf.select(pl.col(column))
|
|
516
|
+
return self._sampler.sample(column_lf, config)
|
|
517
|
+
|
|
518
|
+
def _fallback_sample(
    self,
    lf: pl.LazyFrame,
    column: str,
    config: SamplingConfig,
) -> SamplingResult:
    """Re-sample a column using the configuration's fallback strategy.

    A new SamplingConfig is derived from ``config``: the strategy becomes
    ``config.fallback_strategy``, ``max_rows`` defaults to 10,000 when the
    original value is falsy, and the confidence level and margin of error
    are carried over unchanged.

    Args:
        lf: LazyFrame holding the column.
        column: Name of the column to sample.
        config: Sampling configuration the fallback is derived from.

    Returns:
        The sampler's result for the single-column frame.
    """
    safe_config = SamplingConfig(
        strategy=config.fallback_strategy,
        max_rows=config.max_rows or 10_000,
        confidence_level=config.confidence_level,
        margin_of_error=config.margin_of_error,
    )
    single_column = lf.select(pl.col(column))
    sampled = self._sampler.sample(single_column, safe_config)
    return sampled
|
|
533
|
+
|
|
534
|
+
def _match_on_sample(
|
|
535
|
+
self,
|
|
536
|
+
sampled_lf: pl.LazyFrame,
|
|
537
|
+
column: str,
|
|
538
|
+
patterns: Sequence[PatternSpec] | None,
|
|
539
|
+
) -> list[PatternMatchResult]:
|
|
540
|
+
"""Run pattern matching on sampled data."""
|
|
541
|
+
return self._matcher.match_column(
|
|
542
|
+
sampled_lf,
|
|
543
|
+
column,
|
|
544
|
+
patterns=patterns,
|
|
545
|
+
limit=None, # Already sampled
|
|
546
|
+
)
|
|
547
|
+
|
|
548
|
+
def _enhance_results(
    self,
    results: list[PatternMatchResult],
    sampling_metrics: SamplingMetrics,
) -> list[SampledPatternMatchResult]:
    """Wrap plain match results together with the sampling metrics.

    Args:
        results: Pattern match results to convert.
        sampling_metrics: Metrics object attached to every converted result.

    Returns:
        One SampledPatternMatchResult per input result, in the same order,
        with the original fields copied across.
    """
    return [
        SampledPatternMatchResult(
            pattern=item.pattern,
            match_count=item.match_count,
            total_count=item.total_count,
            match_ratio=item.match_ratio,
            sample_matches=item.sample_matches,
            sample_non_matches=item.sample_non_matches,
            sampling_metrics=sampling_metrics,
        )
        for item in results
    ]
|
|
570
|
+
|
|
571
|
+
|
|
572
|
+
# =============================================================================
|
|
573
|
+
# Factory Functions
|
|
574
|
+
# =============================================================================
|
|
575
|
+
|
|
576
|
+
|
|
577
|
+
def create_sampled_matcher(
    strategy: str | SamplingMethod = "adaptive",
    max_rows: int = 100_000,
    min_match_ratio: float = 0.8,
    **kwargs: Any,
) -> SampledPatternMatcher:
    """Build a SampledPatternMatcher from a handful of common options.

    Args:
        strategy: Sampling strategy, either a SamplingMethod or a string
            that is converted with ``SamplingMethod(strategy)``.
        max_rows: Maximum rows to sample.
        min_match_ratio: Minimum match ratio threshold.
        **kwargs: Extra keyword arguments forwarded to SamplingConfig.

    Returns:
        Configured SampledPatternMatcher.

    Example:
        matcher = create_sampled_matcher(
            strategy="random",
            max_rows=50_000,
            confidence_level=0.99,
        )
    """
    method = SamplingMethod(strategy) if isinstance(strategy, str) else strategy

    matcher_config = SampledMatcherConfig(
        sampling_config=SamplingConfig(
            strategy=method,
            max_rows=max_rows,
            **kwargs,
        ),
        min_match_ratio=min_match_ratio,
    )
    return SampledPatternMatcher(config=matcher_config)
|
|
616
|
+
|
|
617
|
+
|
|
618
|
+
def match_patterns_safe(
    data: pl.LazyFrame | pl.DataFrame,
    column: str,
    *,
    max_rows: int = 100_000,
    min_ratio: float = 0.8,
) -> SampledColumnMatchResult:
    """Match patterns on one column through a sampled matcher.

    The matcher is created with ``create_sampled_matcher`` using its default
    strategy, so matching runs on a sample bounded by ``max_rows``.

    Args:
        data: DataFrame or LazyFrame; a DataFrame is converted with ``.lazy()``.
        column: Column name to analyze.
        max_rows: Maximum rows to sample.
        min_ratio: Minimum match ratio.

    Returns:
        SampledColumnMatchResult

    Example:
        import polars as pl
        from truthound.profiler.sampled_matcher import match_patterns_safe

        df = pl.read_parquet("large_file.parquet")
        result = match_patterns_safe(df.lazy(), "email_column")

        print(f"Best match: {result.best_match.pattern.name}")
        print(f"Confidence: {result.best_match.confidence:.2%}")
    """
    lazy_data = data.lazy() if isinstance(data, pl.DataFrame) else data
    sampled_matcher = create_sampled_matcher(
        max_rows=max_rows,
        min_match_ratio=min_ratio,
    )
    return sampled_matcher.match_column(lazy_data, column)
|
|
657
|
+
|
|
658
|
+
|
|
659
|
+
def infer_column_type_safe(
    data: pl.LazyFrame | pl.DataFrame,
    column: str,
    *,
    max_rows: int = 100_000,
    min_ratio: float = 0.9,
) -> DataType | None:
    """Infer a column's semantic type through a sampled matcher.

    Args:
        data: DataFrame or LazyFrame; a DataFrame is converted with ``.lazy()``.
        column: Column name.
        max_rows: Maximum rows to sample.
        min_ratio: Minimum match ratio, passed to the matcher as its
            ``min_match_ratio``.

    Returns:
        Inferred DataType or None

    Example:
        from truthound.profiler.sampled_matcher import infer_column_type_safe

        dtype = infer_column_type_safe(df, "mystery_column")
        if dtype:
            print(f"Detected type: {dtype.value}")
    """
    lazy_data = data.lazy() if isinstance(data, pl.DataFrame) else data
    sampled_matcher = create_sampled_matcher(
        max_rows=max_rows,
        min_match_ratio=min_ratio,
    )
    return sampled_matcher.infer_type(lazy_data, column)
|
|
693
|
+
|
|
694
|
+
|
|
695
|
+
# =============================================================================
|
|
696
|
+
# Integration with NativePatternMatcher (Backward Compatibility)
|
|
697
|
+
# =============================================================================
|
|
698
|
+
|
|
699
|
+
|
|
700
|
+
class SafeNativePatternMatcher(NativePatternMatcher):
    """NativePatternMatcher variant that samples before matching.

    The constructor takes the parent's options plus sampling settings, and
    ``match_column`` runs the parent's matching on the sampler's output
    instead of the full input frame.

    Example:
        # Drop-in replacement
        matcher = SafeNativePatternMatcher(max_rows=50_000)
        results = matcher.match_column(lf, "email")

        # Same API as before, but now memory-safe
    """

    def __init__(
        self,
        patterns: PatternRegistry | None = None,
        *,
        min_match_ratio: float = 0.8,
        sample_size: int = 5,
        include_non_matches: bool = False,
        # New sampling options
        max_rows: int = 100_000,
        sampling_strategy: SamplingMethod = SamplingMethod.ADAPTIVE,
        confidence_level: float = 0.95,
    ):
        """Set up the parent matcher and the sampler.

        Args:
            patterns: Pattern registry, forwarded to the parent.
            min_match_ratio: Minimum match ratio, forwarded to the parent.
            sample_size: Number of sample values, forwarded to the parent.
            include_non_matches: Include non-matching samples, forwarded
                to the parent.
            max_rows: Maximum rows to process (sampling configuration).
            sampling_strategy: Sampling strategy to use.
            confidence_level: Statistical confidence level for sampling.
        """
        super().__init__(
            patterns=patterns,
            min_match_ratio=min_match_ratio,
            sample_size=sample_size,
            include_non_matches=include_non_matches,
        )

        sampling = SamplingConfig(
            strategy=sampling_strategy,
            max_rows=max_rows,
            confidence_level=confidence_level,
        )
        self._sampling_config = sampling
        self._sampler = Sampler(sampling)

    def match_column(
        self,
        lf: pl.LazyFrame,
        column: str,
        *,
        patterns: Sequence[PatternSpec] | None = None,
        limit: int | None = None,  # Now uses sampling instead
    ) -> list[PatternMatchResult]:
        """Sample ``column`` and run the parent's matching on the sample.

        Args:
            lf: LazyFrame containing the data.
            column: Column name to analyze.
            patterns: Optional patterns to check.
            limit: Ignored; the sampler decides how many rows are used.

        Returns:
            List of PatternMatchResult from the parent implementation.
        """
        single_column = lf.select(pl.col(column))
        sampled = self._sampler.sample(single_column)

        # Record how much of the column was actually kept.
        if sampled.is_sampled:
            stats = sampled.metrics
            logger.debug(
                f"Sampled column '{column}': "
                f"{stats.sample_size:,} of "
                f"{stats.original_size:,} rows "
                f"({stats.sampling_ratio:.1%})"
            )

        # limit=None because the sampler has already reduced the input.
        return super().match_column(
            sampled.data,
            column,
            patterns=patterns,
            limit=None,
        )
|