truthound 1.0.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- truthound/__init__.py +162 -0
- truthound/adapters.py +100 -0
- truthound/api.py +365 -0
- truthound/audit/__init__.py +248 -0
- truthound/audit/core.py +967 -0
- truthound/audit/filters.py +620 -0
- truthound/audit/formatters.py +707 -0
- truthound/audit/logger.py +902 -0
- truthound/audit/middleware.py +571 -0
- truthound/audit/storage.py +1083 -0
- truthound/benchmark/__init__.py +123 -0
- truthound/benchmark/base.py +757 -0
- truthound/benchmark/comparison.py +635 -0
- truthound/benchmark/generators.py +706 -0
- truthound/benchmark/reporters.py +718 -0
- truthound/benchmark/runner.py +635 -0
- truthound/benchmark/scenarios.py +712 -0
- truthound/cache.py +252 -0
- truthound/checkpoint/__init__.py +136 -0
- truthound/checkpoint/actions/__init__.py +164 -0
- truthound/checkpoint/actions/base.py +324 -0
- truthound/checkpoint/actions/custom.py +234 -0
- truthound/checkpoint/actions/discord_notify.py +290 -0
- truthound/checkpoint/actions/email_notify.py +405 -0
- truthound/checkpoint/actions/github_action.py +406 -0
- truthound/checkpoint/actions/opsgenie.py +1499 -0
- truthound/checkpoint/actions/pagerduty.py +226 -0
- truthound/checkpoint/actions/slack_notify.py +233 -0
- truthound/checkpoint/actions/store_result.py +249 -0
- truthound/checkpoint/actions/teams_notify.py +1570 -0
- truthound/checkpoint/actions/telegram_notify.py +419 -0
- truthound/checkpoint/actions/update_docs.py +552 -0
- truthound/checkpoint/actions/webhook.py +293 -0
- truthound/checkpoint/analytics/__init__.py +147 -0
- truthound/checkpoint/analytics/aggregations/__init__.py +23 -0
- truthound/checkpoint/analytics/aggregations/rollup.py +481 -0
- truthound/checkpoint/analytics/aggregations/time_bucket.py +306 -0
- truthound/checkpoint/analytics/analyzers/__init__.py +17 -0
- truthound/checkpoint/analytics/analyzers/anomaly.py +386 -0
- truthound/checkpoint/analytics/analyzers/base.py +270 -0
- truthound/checkpoint/analytics/analyzers/forecast.py +421 -0
- truthound/checkpoint/analytics/analyzers/trend.py +314 -0
- truthound/checkpoint/analytics/models.py +292 -0
- truthound/checkpoint/analytics/protocols.py +549 -0
- truthound/checkpoint/analytics/service.py +718 -0
- truthound/checkpoint/analytics/stores/__init__.py +16 -0
- truthound/checkpoint/analytics/stores/base.py +306 -0
- truthound/checkpoint/analytics/stores/memory_store.py +353 -0
- truthound/checkpoint/analytics/stores/sqlite_store.py +557 -0
- truthound/checkpoint/analytics/stores/timescale_store.py +501 -0
- truthound/checkpoint/async_actions.py +794 -0
- truthound/checkpoint/async_base.py +708 -0
- truthound/checkpoint/async_checkpoint.py +617 -0
- truthound/checkpoint/async_runner.py +639 -0
- truthound/checkpoint/checkpoint.py +527 -0
- truthound/checkpoint/ci/__init__.py +61 -0
- truthound/checkpoint/ci/detector.py +355 -0
- truthound/checkpoint/ci/reporter.py +436 -0
- truthound/checkpoint/ci/templates.py +454 -0
- truthound/checkpoint/circuitbreaker/__init__.py +133 -0
- truthound/checkpoint/circuitbreaker/breaker.py +542 -0
- truthound/checkpoint/circuitbreaker/core.py +252 -0
- truthound/checkpoint/circuitbreaker/detection.py +459 -0
- truthound/checkpoint/circuitbreaker/middleware.py +389 -0
- truthound/checkpoint/circuitbreaker/registry.py +357 -0
- truthound/checkpoint/distributed/__init__.py +139 -0
- truthound/checkpoint/distributed/backends/__init__.py +35 -0
- truthound/checkpoint/distributed/backends/celery_backend.py +503 -0
- truthound/checkpoint/distributed/backends/kubernetes_backend.py +696 -0
- truthound/checkpoint/distributed/backends/local_backend.py +397 -0
- truthound/checkpoint/distributed/backends/ray_backend.py +625 -0
- truthound/checkpoint/distributed/base.py +774 -0
- truthound/checkpoint/distributed/orchestrator.py +765 -0
- truthound/checkpoint/distributed/protocols.py +842 -0
- truthound/checkpoint/distributed/registry.py +449 -0
- truthound/checkpoint/idempotency/__init__.py +120 -0
- truthound/checkpoint/idempotency/core.py +295 -0
- truthound/checkpoint/idempotency/fingerprint.py +454 -0
- truthound/checkpoint/idempotency/locking.py +604 -0
- truthound/checkpoint/idempotency/service.py +592 -0
- truthound/checkpoint/idempotency/stores.py +653 -0
- truthound/checkpoint/monitoring/__init__.py +134 -0
- truthound/checkpoint/monitoring/aggregators/__init__.py +15 -0
- truthound/checkpoint/monitoring/aggregators/base.py +372 -0
- truthound/checkpoint/monitoring/aggregators/realtime.py +300 -0
- truthound/checkpoint/monitoring/aggregators/window.py +493 -0
- truthound/checkpoint/monitoring/collectors/__init__.py +17 -0
- truthound/checkpoint/monitoring/collectors/base.py +257 -0
- truthound/checkpoint/monitoring/collectors/memory_collector.py +617 -0
- truthound/checkpoint/monitoring/collectors/prometheus_collector.py +451 -0
- truthound/checkpoint/monitoring/collectors/redis_collector.py +518 -0
- truthound/checkpoint/monitoring/events.py +410 -0
- truthound/checkpoint/monitoring/protocols.py +636 -0
- truthound/checkpoint/monitoring/service.py +578 -0
- truthound/checkpoint/monitoring/views/__init__.py +17 -0
- truthound/checkpoint/monitoring/views/base.py +172 -0
- truthound/checkpoint/monitoring/views/queue_view.py +220 -0
- truthound/checkpoint/monitoring/views/task_view.py +240 -0
- truthound/checkpoint/monitoring/views/worker_view.py +263 -0
- truthound/checkpoint/registry.py +337 -0
- truthound/checkpoint/runner.py +356 -0
- truthound/checkpoint/transaction/__init__.py +133 -0
- truthound/checkpoint/transaction/base.py +389 -0
- truthound/checkpoint/transaction/compensatable.py +537 -0
- truthound/checkpoint/transaction/coordinator.py +576 -0
- truthound/checkpoint/transaction/executor.py +622 -0
- truthound/checkpoint/transaction/idempotency.py +534 -0
- truthound/checkpoint/transaction/saga/__init__.py +143 -0
- truthound/checkpoint/transaction/saga/builder.py +584 -0
- truthound/checkpoint/transaction/saga/definition.py +515 -0
- truthound/checkpoint/transaction/saga/event_store.py +542 -0
- truthound/checkpoint/transaction/saga/patterns.py +833 -0
- truthound/checkpoint/transaction/saga/runner.py +718 -0
- truthound/checkpoint/transaction/saga/state_machine.py +793 -0
- truthound/checkpoint/transaction/saga/strategies.py +780 -0
- truthound/checkpoint/transaction/saga/testing.py +886 -0
- truthound/checkpoint/triggers/__init__.py +58 -0
- truthound/checkpoint/triggers/base.py +237 -0
- truthound/checkpoint/triggers/event.py +385 -0
- truthound/checkpoint/triggers/schedule.py +355 -0
- truthound/cli.py +2358 -0
- truthound/cli_modules/__init__.py +124 -0
- truthound/cli_modules/advanced/__init__.py +45 -0
- truthound/cli_modules/advanced/benchmark.py +343 -0
- truthound/cli_modules/advanced/docs.py +225 -0
- truthound/cli_modules/advanced/lineage.py +209 -0
- truthound/cli_modules/advanced/ml.py +320 -0
- truthound/cli_modules/advanced/realtime.py +196 -0
- truthound/cli_modules/checkpoint/__init__.py +46 -0
- truthound/cli_modules/checkpoint/init.py +114 -0
- truthound/cli_modules/checkpoint/list.py +71 -0
- truthound/cli_modules/checkpoint/run.py +159 -0
- truthound/cli_modules/checkpoint/validate.py +67 -0
- truthound/cli_modules/common/__init__.py +71 -0
- truthound/cli_modules/common/errors.py +414 -0
- truthound/cli_modules/common/options.py +419 -0
- truthound/cli_modules/common/output.py +507 -0
- truthound/cli_modules/common/protocol.py +552 -0
- truthound/cli_modules/core/__init__.py +48 -0
- truthound/cli_modules/core/check.py +123 -0
- truthound/cli_modules/core/compare.py +104 -0
- truthound/cli_modules/core/learn.py +57 -0
- truthound/cli_modules/core/mask.py +77 -0
- truthound/cli_modules/core/profile.py +65 -0
- truthound/cli_modules/core/scan.py +61 -0
- truthound/cli_modules/profiler/__init__.py +51 -0
- truthound/cli_modules/profiler/auto_profile.py +175 -0
- truthound/cli_modules/profiler/metadata.py +107 -0
- truthound/cli_modules/profiler/suite.py +283 -0
- truthound/cli_modules/registry.py +431 -0
- truthound/cli_modules/scaffolding/__init__.py +89 -0
- truthound/cli_modules/scaffolding/base.py +631 -0
- truthound/cli_modules/scaffolding/commands.py +545 -0
- truthound/cli_modules/scaffolding/plugins.py +1072 -0
- truthound/cli_modules/scaffolding/reporters.py +594 -0
- truthound/cli_modules/scaffolding/validators.py +1127 -0
- truthound/common/__init__.py +18 -0
- truthound/common/resilience/__init__.py +130 -0
- truthound/common/resilience/bulkhead.py +266 -0
- truthound/common/resilience/circuit_breaker.py +516 -0
- truthound/common/resilience/composite.py +332 -0
- truthound/common/resilience/config.py +292 -0
- truthound/common/resilience/protocols.py +217 -0
- truthound/common/resilience/rate_limiter.py +404 -0
- truthound/common/resilience/retry.py +341 -0
- truthound/datadocs/__init__.py +260 -0
- truthound/datadocs/base.py +571 -0
- truthound/datadocs/builder.py +761 -0
- truthound/datadocs/charts.py +764 -0
- truthound/datadocs/dashboard/__init__.py +63 -0
- truthound/datadocs/dashboard/app.py +576 -0
- truthound/datadocs/dashboard/components.py +584 -0
- truthound/datadocs/dashboard/state.py +240 -0
- truthound/datadocs/engine/__init__.py +46 -0
- truthound/datadocs/engine/context.py +376 -0
- truthound/datadocs/engine/pipeline.py +618 -0
- truthound/datadocs/engine/registry.py +469 -0
- truthound/datadocs/exporters/__init__.py +49 -0
- truthound/datadocs/exporters/base.py +198 -0
- truthound/datadocs/exporters/html.py +178 -0
- truthound/datadocs/exporters/json_exporter.py +253 -0
- truthound/datadocs/exporters/markdown.py +284 -0
- truthound/datadocs/exporters/pdf.py +392 -0
- truthound/datadocs/i18n/__init__.py +86 -0
- truthound/datadocs/i18n/catalog.py +960 -0
- truthound/datadocs/i18n/formatting.py +505 -0
- truthound/datadocs/i18n/loader.py +256 -0
- truthound/datadocs/i18n/plurals.py +378 -0
- truthound/datadocs/renderers/__init__.py +42 -0
- truthound/datadocs/renderers/base.py +401 -0
- truthound/datadocs/renderers/custom.py +342 -0
- truthound/datadocs/renderers/jinja.py +697 -0
- truthound/datadocs/sections.py +736 -0
- truthound/datadocs/styles.py +931 -0
- truthound/datadocs/themes/__init__.py +101 -0
- truthound/datadocs/themes/base.py +336 -0
- truthound/datadocs/themes/default.py +417 -0
- truthound/datadocs/themes/enterprise.py +419 -0
- truthound/datadocs/themes/loader.py +336 -0
- truthound/datadocs/themes.py +301 -0
- truthound/datadocs/transformers/__init__.py +57 -0
- truthound/datadocs/transformers/base.py +268 -0
- truthound/datadocs/transformers/enrichers.py +544 -0
- truthound/datadocs/transformers/filters.py +447 -0
- truthound/datadocs/transformers/i18n.py +468 -0
- truthound/datadocs/versioning/__init__.py +62 -0
- truthound/datadocs/versioning/diff.py +639 -0
- truthound/datadocs/versioning/storage.py +497 -0
- truthound/datadocs/versioning/version.py +358 -0
- truthound/datasources/__init__.py +223 -0
- truthound/datasources/_async_protocols.py +222 -0
- truthound/datasources/_protocols.py +159 -0
- truthound/datasources/adapters.py +428 -0
- truthound/datasources/async_base.py +599 -0
- truthound/datasources/async_factory.py +511 -0
- truthound/datasources/base.py +516 -0
- truthound/datasources/factory.py +433 -0
- truthound/datasources/nosql/__init__.py +47 -0
- truthound/datasources/nosql/base.py +487 -0
- truthound/datasources/nosql/elasticsearch.py +801 -0
- truthound/datasources/nosql/mongodb.py +636 -0
- truthound/datasources/pandas_optimized.py +582 -0
- truthound/datasources/pandas_source.py +216 -0
- truthound/datasources/polars_source.py +395 -0
- truthound/datasources/spark_source.py +479 -0
- truthound/datasources/sql/__init__.py +154 -0
- truthound/datasources/sql/base.py +710 -0
- truthound/datasources/sql/bigquery.py +410 -0
- truthound/datasources/sql/cloud_base.py +199 -0
- truthound/datasources/sql/databricks.py +471 -0
- truthound/datasources/sql/mysql.py +316 -0
- truthound/datasources/sql/oracle.py +427 -0
- truthound/datasources/sql/postgresql.py +321 -0
- truthound/datasources/sql/redshift.py +479 -0
- truthound/datasources/sql/snowflake.py +439 -0
- truthound/datasources/sql/sqlite.py +286 -0
- truthound/datasources/sql/sqlserver.py +437 -0
- truthound/datasources/streaming/__init__.py +47 -0
- truthound/datasources/streaming/base.py +350 -0
- truthound/datasources/streaming/kafka.py +670 -0
- truthound/decorators.py +98 -0
- truthound/docs/__init__.py +69 -0
- truthound/docs/extractor.py +971 -0
- truthound/docs/generator.py +601 -0
- truthound/docs/parser.py +1037 -0
- truthound/docs/renderer.py +999 -0
- truthound/drift/__init__.py +22 -0
- truthound/drift/compare.py +189 -0
- truthound/drift/detectors.py +464 -0
- truthound/drift/report.py +160 -0
- truthound/execution/__init__.py +65 -0
- truthound/execution/_protocols.py +324 -0
- truthound/execution/base.py +576 -0
- truthound/execution/distributed/__init__.py +179 -0
- truthound/execution/distributed/aggregations.py +731 -0
- truthound/execution/distributed/arrow_bridge.py +817 -0
- truthound/execution/distributed/base.py +550 -0
- truthound/execution/distributed/dask_engine.py +976 -0
- truthound/execution/distributed/mixins.py +766 -0
- truthound/execution/distributed/protocols.py +756 -0
- truthound/execution/distributed/ray_engine.py +1127 -0
- truthound/execution/distributed/registry.py +446 -0
- truthound/execution/distributed/spark_engine.py +1011 -0
- truthound/execution/distributed/validator_adapter.py +682 -0
- truthound/execution/pandas_engine.py +401 -0
- truthound/execution/polars_engine.py +497 -0
- truthound/execution/pushdown/__init__.py +230 -0
- truthound/execution/pushdown/ast.py +1550 -0
- truthound/execution/pushdown/builder.py +1550 -0
- truthound/execution/pushdown/dialects.py +1072 -0
- truthound/execution/pushdown/executor.py +829 -0
- truthound/execution/pushdown/optimizer.py +1041 -0
- truthound/execution/sql_engine.py +518 -0
- truthound/infrastructure/__init__.py +189 -0
- truthound/infrastructure/audit.py +1515 -0
- truthound/infrastructure/config.py +1133 -0
- truthound/infrastructure/encryption.py +1132 -0
- truthound/infrastructure/logging.py +1503 -0
- truthound/infrastructure/metrics.py +1220 -0
- truthound/lineage/__init__.py +89 -0
- truthound/lineage/base.py +746 -0
- truthound/lineage/impact_analysis.py +474 -0
- truthound/lineage/integrations/__init__.py +22 -0
- truthound/lineage/integrations/openlineage.py +548 -0
- truthound/lineage/tracker.py +512 -0
- truthound/lineage/visualization/__init__.py +33 -0
- truthound/lineage/visualization/protocols.py +145 -0
- truthound/lineage/visualization/renderers/__init__.py +20 -0
- truthound/lineage/visualization/renderers/cytoscape.py +329 -0
- truthound/lineage/visualization/renderers/d3.py +331 -0
- truthound/lineage/visualization/renderers/graphviz.py +276 -0
- truthound/lineage/visualization/renderers/mermaid.py +308 -0
- truthound/maskers.py +113 -0
- truthound/ml/__init__.py +124 -0
- truthound/ml/anomaly_models/__init__.py +31 -0
- truthound/ml/anomaly_models/ensemble.py +362 -0
- truthound/ml/anomaly_models/isolation_forest.py +444 -0
- truthound/ml/anomaly_models/statistical.py +392 -0
- truthound/ml/base.py +1178 -0
- truthound/ml/drift_detection/__init__.py +26 -0
- truthound/ml/drift_detection/concept.py +381 -0
- truthound/ml/drift_detection/distribution.py +361 -0
- truthound/ml/drift_detection/feature.py +442 -0
- truthound/ml/drift_detection/multivariate.py +495 -0
- truthound/ml/monitoring/__init__.py +88 -0
- truthound/ml/monitoring/alerting/__init__.py +33 -0
- truthound/ml/monitoring/alerting/handlers.py +427 -0
- truthound/ml/monitoring/alerting/rules.py +508 -0
- truthound/ml/monitoring/collectors/__init__.py +19 -0
- truthound/ml/monitoring/collectors/composite.py +105 -0
- truthound/ml/monitoring/collectors/drift.py +324 -0
- truthound/ml/monitoring/collectors/performance.py +179 -0
- truthound/ml/monitoring/collectors/quality.py +369 -0
- truthound/ml/monitoring/monitor.py +536 -0
- truthound/ml/monitoring/protocols.py +451 -0
- truthound/ml/monitoring/stores/__init__.py +15 -0
- truthound/ml/monitoring/stores/memory.py +201 -0
- truthound/ml/monitoring/stores/prometheus.py +296 -0
- truthound/ml/rule_learning/__init__.py +25 -0
- truthound/ml/rule_learning/constraint_miner.py +443 -0
- truthound/ml/rule_learning/pattern_learner.py +499 -0
- truthound/ml/rule_learning/profile_learner.py +462 -0
- truthound/multitenancy/__init__.py +326 -0
- truthound/multitenancy/core.py +852 -0
- truthound/multitenancy/integration.py +597 -0
- truthound/multitenancy/isolation.py +630 -0
- truthound/multitenancy/manager.py +770 -0
- truthound/multitenancy/middleware.py +765 -0
- truthound/multitenancy/quota.py +537 -0
- truthound/multitenancy/resolvers.py +603 -0
- truthound/multitenancy/storage.py +703 -0
- truthound/observability/__init__.py +307 -0
- truthound/observability/context.py +531 -0
- truthound/observability/instrumentation.py +611 -0
- truthound/observability/logging.py +887 -0
- truthound/observability/metrics.py +1157 -0
- truthound/observability/tracing/__init__.py +178 -0
- truthound/observability/tracing/baggage.py +310 -0
- truthound/observability/tracing/config.py +426 -0
- truthound/observability/tracing/exporter.py +787 -0
- truthound/observability/tracing/integration.py +1018 -0
- truthound/observability/tracing/otel/__init__.py +146 -0
- truthound/observability/tracing/otel/adapter.py +982 -0
- truthound/observability/tracing/otel/bridge.py +1177 -0
- truthound/observability/tracing/otel/compat.py +681 -0
- truthound/observability/tracing/otel/config.py +691 -0
- truthound/observability/tracing/otel/detection.py +327 -0
- truthound/observability/tracing/otel/protocols.py +426 -0
- truthound/observability/tracing/processor.py +561 -0
- truthound/observability/tracing/propagator.py +757 -0
- truthound/observability/tracing/provider.py +569 -0
- truthound/observability/tracing/resource.py +515 -0
- truthound/observability/tracing/sampler.py +487 -0
- truthound/observability/tracing/span.py +676 -0
- truthound/plugins/__init__.py +198 -0
- truthound/plugins/base.py +599 -0
- truthound/plugins/cli.py +680 -0
- truthound/plugins/dependencies/__init__.py +42 -0
- truthound/plugins/dependencies/graph.py +422 -0
- truthound/plugins/dependencies/resolver.py +417 -0
- truthound/plugins/discovery.py +379 -0
- truthound/plugins/docs/__init__.py +46 -0
- truthound/plugins/docs/extractor.py +444 -0
- truthound/plugins/docs/renderer.py +499 -0
- truthound/plugins/enterprise_manager.py +877 -0
- truthound/plugins/examples/__init__.py +19 -0
- truthound/plugins/examples/custom_validators.py +317 -0
- truthound/plugins/examples/slack_notifier.py +312 -0
- truthound/plugins/examples/xml_reporter.py +254 -0
- truthound/plugins/hooks.py +558 -0
- truthound/plugins/lifecycle/__init__.py +43 -0
- truthound/plugins/lifecycle/hot_reload.py +402 -0
- truthound/plugins/lifecycle/manager.py +371 -0
- truthound/plugins/manager.py +736 -0
- truthound/plugins/registry.py +338 -0
- truthound/plugins/security/__init__.py +93 -0
- truthound/plugins/security/exceptions.py +332 -0
- truthound/plugins/security/policies.py +348 -0
- truthound/plugins/security/protocols.py +643 -0
- truthound/plugins/security/sandbox/__init__.py +45 -0
- truthound/plugins/security/sandbox/context.py +158 -0
- truthound/plugins/security/sandbox/engines/__init__.py +19 -0
- truthound/plugins/security/sandbox/engines/container.py +379 -0
- truthound/plugins/security/sandbox/engines/noop.py +144 -0
- truthound/plugins/security/sandbox/engines/process.py +336 -0
- truthound/plugins/security/sandbox/factory.py +211 -0
- truthound/plugins/security/signing/__init__.py +57 -0
- truthound/plugins/security/signing/service.py +330 -0
- truthound/plugins/security/signing/trust_store.py +368 -0
- truthound/plugins/security/signing/verifier.py +459 -0
- truthound/plugins/versioning/__init__.py +41 -0
- truthound/plugins/versioning/constraints.py +297 -0
- truthound/plugins/versioning/resolver.py +329 -0
- truthound/profiler/__init__.py +1729 -0
- truthound/profiler/_lazy.py +452 -0
- truthound/profiler/ab_testing/__init__.py +80 -0
- truthound/profiler/ab_testing/analysis.py +449 -0
- truthound/profiler/ab_testing/base.py +257 -0
- truthound/profiler/ab_testing/experiment.py +395 -0
- truthound/profiler/ab_testing/tracking.py +368 -0
- truthound/profiler/auto_threshold.py +1170 -0
- truthound/profiler/base.py +579 -0
- truthound/profiler/cache_patterns.py +911 -0
- truthound/profiler/caching.py +1303 -0
- truthound/profiler/column_profiler.py +712 -0
- truthound/profiler/comparison.py +1007 -0
- truthound/profiler/custom_patterns.py +1170 -0
- truthound/profiler/dashboard/__init__.py +50 -0
- truthound/profiler/dashboard/app.py +476 -0
- truthound/profiler/dashboard/components.py +457 -0
- truthound/profiler/dashboard/config.py +72 -0
- truthound/profiler/distributed/__init__.py +83 -0
- truthound/profiler/distributed/base.py +281 -0
- truthound/profiler/distributed/dask_backend.py +498 -0
- truthound/profiler/distributed/local_backend.py +293 -0
- truthound/profiler/distributed/profiler.py +304 -0
- truthound/profiler/distributed/ray_backend.py +374 -0
- truthound/profiler/distributed/spark_backend.py +375 -0
- truthound/profiler/distributed.py +1366 -0
- truthound/profiler/enterprise_sampling.py +1065 -0
- truthound/profiler/errors.py +488 -0
- truthound/profiler/evolution/__init__.py +91 -0
- truthound/profiler/evolution/alerts.py +426 -0
- truthound/profiler/evolution/changes.py +206 -0
- truthound/profiler/evolution/compatibility.py +365 -0
- truthound/profiler/evolution/detector.py +372 -0
- truthound/profiler/evolution/protocols.py +121 -0
- truthound/profiler/generators/__init__.py +48 -0
- truthound/profiler/generators/base.py +384 -0
- truthound/profiler/generators/ml_rules.py +375 -0
- truthound/profiler/generators/pattern_rules.py +384 -0
- truthound/profiler/generators/schema_rules.py +267 -0
- truthound/profiler/generators/stats_rules.py +324 -0
- truthound/profiler/generators/suite_generator.py +857 -0
- truthound/profiler/i18n.py +1542 -0
- truthound/profiler/incremental.py +554 -0
- truthound/profiler/incremental_validation.py +1710 -0
- truthound/profiler/integration/__init__.py +73 -0
- truthound/profiler/integration/adapters.py +345 -0
- truthound/profiler/integration/context.py +371 -0
- truthound/profiler/integration/executor.py +527 -0
- truthound/profiler/integration/naming.py +75 -0
- truthound/profiler/integration/protocols.py +243 -0
- truthound/profiler/memory.py +1185 -0
- truthound/profiler/migration/__init__.py +60 -0
- truthound/profiler/migration/base.py +345 -0
- truthound/profiler/migration/manager.py +444 -0
- truthound/profiler/migration/v1_0_to_v1_1.py +484 -0
- truthound/profiler/ml/__init__.py +73 -0
- truthound/profiler/ml/base.py +244 -0
- truthound/profiler/ml/classifier.py +507 -0
- truthound/profiler/ml/feature_extraction.py +604 -0
- truthound/profiler/ml/pretrained.py +448 -0
- truthound/profiler/ml_inference.py +1276 -0
- truthound/profiler/native_patterns.py +815 -0
- truthound/profiler/observability.py +1184 -0
- truthound/profiler/process_timeout.py +1566 -0
- truthound/profiler/progress.py +568 -0
- truthound/profiler/progress_callbacks.py +1734 -0
- truthound/profiler/quality.py +1345 -0
- truthound/profiler/resilience.py +1180 -0
- truthound/profiler/sampled_matcher.py +794 -0
- truthound/profiler/sampling.py +1288 -0
- truthound/profiler/scheduling/__init__.py +82 -0
- truthound/profiler/scheduling/protocols.py +214 -0
- truthound/profiler/scheduling/scheduler.py +474 -0
- truthound/profiler/scheduling/storage.py +457 -0
- truthound/profiler/scheduling/triggers.py +449 -0
- truthound/profiler/schema.py +603 -0
- truthound/profiler/streaming.py +685 -0
- truthound/profiler/streaming_patterns.py +1354 -0
- truthound/profiler/suite_cli.py +625 -0
- truthound/profiler/suite_config.py +789 -0
- truthound/profiler/suite_export.py +1268 -0
- truthound/profiler/table_profiler.py +547 -0
- truthound/profiler/timeout.py +565 -0
- truthound/profiler/validation.py +1532 -0
- truthound/profiler/visualization/__init__.py +118 -0
- truthound/profiler/visualization/base.py +346 -0
- truthound/profiler/visualization/generator.py +1259 -0
- truthound/profiler/visualization/plotly_renderer.py +811 -0
- truthound/profiler/visualization/renderers.py +669 -0
- truthound/profiler/visualization/sections.py +540 -0
- truthound/profiler/visualization.py +2122 -0
- truthound/profiler/yaml_validation.py +1151 -0
- truthound/py.typed +0 -0
- truthound/ratelimit/__init__.py +248 -0
- truthound/ratelimit/algorithms.py +1108 -0
- truthound/ratelimit/core.py +573 -0
- truthound/ratelimit/integration.py +532 -0
- truthound/ratelimit/limiter.py +663 -0
- truthound/ratelimit/middleware.py +700 -0
- truthound/ratelimit/policy.py +792 -0
- truthound/ratelimit/storage.py +763 -0
- truthound/rbac/__init__.py +340 -0
- truthound/rbac/core.py +976 -0
- truthound/rbac/integration.py +760 -0
- truthound/rbac/manager.py +1052 -0
- truthound/rbac/middleware.py +842 -0
- truthound/rbac/policy.py +954 -0
- truthound/rbac/storage.py +878 -0
- truthound/realtime/__init__.py +141 -0
- truthound/realtime/adapters/__init__.py +43 -0
- truthound/realtime/adapters/base.py +533 -0
- truthound/realtime/adapters/kafka.py +487 -0
- truthound/realtime/adapters/kinesis.py +479 -0
- truthound/realtime/adapters/mock.py +243 -0
- truthound/realtime/base.py +553 -0
- truthound/realtime/factory.py +382 -0
- truthound/realtime/incremental.py +660 -0
- truthound/realtime/processing/__init__.py +67 -0
- truthound/realtime/processing/exactly_once.py +575 -0
- truthound/realtime/processing/state.py +547 -0
- truthound/realtime/processing/windows.py +647 -0
- truthound/realtime/protocols.py +569 -0
- truthound/realtime/streaming.py +605 -0
- truthound/realtime/testing/__init__.py +32 -0
- truthound/realtime/testing/containers.py +615 -0
- truthound/realtime/testing/fixtures.py +484 -0
- truthound/report.py +280 -0
- truthound/reporters/__init__.py +46 -0
- truthound/reporters/_protocols.py +30 -0
- truthound/reporters/base.py +324 -0
- truthound/reporters/ci/__init__.py +66 -0
- truthound/reporters/ci/azure.py +436 -0
- truthound/reporters/ci/base.py +509 -0
- truthound/reporters/ci/bitbucket.py +567 -0
- truthound/reporters/ci/circleci.py +547 -0
- truthound/reporters/ci/detection.py +364 -0
- truthound/reporters/ci/factory.py +182 -0
- truthound/reporters/ci/github.py +388 -0
- truthound/reporters/ci/gitlab.py +471 -0
- truthound/reporters/ci/jenkins.py +525 -0
- truthound/reporters/console_reporter.py +299 -0
- truthound/reporters/factory.py +211 -0
- truthound/reporters/html_reporter.py +524 -0
- truthound/reporters/json_reporter.py +256 -0
- truthound/reporters/markdown_reporter.py +280 -0
- truthound/reporters/sdk/__init__.py +174 -0
- truthound/reporters/sdk/builder.py +558 -0
- truthound/reporters/sdk/mixins.py +1150 -0
- truthound/reporters/sdk/schema.py +1493 -0
- truthound/reporters/sdk/templates.py +666 -0
- truthound/reporters/sdk/testing.py +968 -0
- truthound/scanners.py +170 -0
- truthound/scheduling/__init__.py +122 -0
- truthound/scheduling/cron.py +1136 -0
- truthound/scheduling/presets.py +212 -0
- truthound/schema.py +275 -0
- truthound/secrets/__init__.py +173 -0
- truthound/secrets/base.py +618 -0
- truthound/secrets/cloud.py +682 -0
- truthound/secrets/integration.py +507 -0
- truthound/secrets/manager.py +633 -0
- truthound/secrets/oidc/__init__.py +172 -0
- truthound/secrets/oidc/base.py +902 -0
- truthound/secrets/oidc/credential_provider.py +623 -0
- truthound/secrets/oidc/exchangers.py +1001 -0
- truthound/secrets/oidc/github/__init__.py +110 -0
- truthound/secrets/oidc/github/claims.py +718 -0
- truthound/secrets/oidc/github/enhanced_provider.py +693 -0
- truthound/secrets/oidc/github/trust_policy.py +742 -0
- truthound/secrets/oidc/github/verification.py +723 -0
- truthound/secrets/oidc/github/workflow.py +691 -0
- truthound/secrets/oidc/providers.py +825 -0
- truthound/secrets/providers.py +506 -0
- truthound/secrets/resolver.py +495 -0
- truthound/stores/__init__.py +177 -0
- truthound/stores/backends/__init__.py +18 -0
- truthound/stores/backends/_protocols.py +340 -0
- truthound/stores/backends/azure_blob.py +530 -0
- truthound/stores/backends/concurrent_filesystem.py +915 -0
- truthound/stores/backends/connection_pool.py +1365 -0
- truthound/stores/backends/database.py +743 -0
- truthound/stores/backends/filesystem.py +538 -0
- truthound/stores/backends/gcs.py +399 -0
- truthound/stores/backends/memory.py +354 -0
- truthound/stores/backends/s3.py +434 -0
- truthound/stores/backpressure/__init__.py +84 -0
- truthound/stores/backpressure/base.py +375 -0
- truthound/stores/backpressure/circuit_breaker.py +434 -0
- truthound/stores/backpressure/monitor.py +376 -0
- truthound/stores/backpressure/strategies.py +677 -0
- truthound/stores/base.py +551 -0
- truthound/stores/batching/__init__.py +65 -0
- truthound/stores/batching/base.py +305 -0
- truthound/stores/batching/buffer.py +370 -0
- truthound/stores/batching/store.py +248 -0
- truthound/stores/batching/writer.py +521 -0
- truthound/stores/caching/__init__.py +60 -0
- truthound/stores/caching/backends.py +684 -0
- truthound/stores/caching/base.py +356 -0
- truthound/stores/caching/store.py +305 -0
- truthound/stores/compression/__init__.py +193 -0
- truthound/stores/compression/adaptive.py +694 -0
- truthound/stores/compression/base.py +514 -0
- truthound/stores/compression/pipeline.py +868 -0
- truthound/stores/compression/providers.py +672 -0
- truthound/stores/compression/streaming.py +832 -0
- truthound/stores/concurrency/__init__.py +81 -0
- truthound/stores/concurrency/atomic.py +556 -0
- truthound/stores/concurrency/index.py +775 -0
- truthound/stores/concurrency/locks.py +576 -0
- truthound/stores/concurrency/manager.py +482 -0
- truthound/stores/encryption/__init__.py +297 -0
- truthound/stores/encryption/base.py +952 -0
- truthound/stores/encryption/keys.py +1191 -0
- truthound/stores/encryption/pipeline.py +903 -0
- truthound/stores/encryption/providers.py +953 -0
- truthound/stores/encryption/streaming.py +950 -0
- truthound/stores/expectations.py +227 -0
- truthound/stores/factory.py +246 -0
- truthound/stores/migration/__init__.py +75 -0
- truthound/stores/migration/base.py +480 -0
- truthound/stores/migration/manager.py +347 -0
- truthound/stores/migration/registry.py +382 -0
- truthound/stores/migration/store.py +559 -0
- truthound/stores/observability/__init__.py +106 -0
- truthound/stores/observability/audit.py +718 -0
- truthound/stores/observability/config.py +270 -0
- truthound/stores/observability/factory.py +208 -0
- truthound/stores/observability/metrics.py +636 -0
- truthound/stores/observability/protocols.py +410 -0
- truthound/stores/observability/store.py +570 -0
- truthound/stores/observability/tracing.py +784 -0
- truthound/stores/replication/__init__.py +76 -0
- truthound/stores/replication/base.py +260 -0
- truthound/stores/replication/monitor.py +269 -0
- truthound/stores/replication/store.py +439 -0
- truthound/stores/replication/syncer.py +391 -0
- truthound/stores/results.py +359 -0
- truthound/stores/retention/__init__.py +77 -0
- truthound/stores/retention/base.py +378 -0
- truthound/stores/retention/policies.py +621 -0
- truthound/stores/retention/scheduler.py +279 -0
- truthound/stores/retention/store.py +526 -0
- truthound/stores/streaming/__init__.py +138 -0
- truthound/stores/streaming/base.py +801 -0
- truthound/stores/streaming/database.py +984 -0
- truthound/stores/streaming/filesystem.py +719 -0
- truthound/stores/streaming/reader.py +629 -0
- truthound/stores/streaming/s3.py +843 -0
- truthound/stores/streaming/writer.py +790 -0
- truthound/stores/tiering/__init__.py +108 -0
- truthound/stores/tiering/base.py +462 -0
- truthound/stores/tiering/manager.py +249 -0
- truthound/stores/tiering/policies.py +692 -0
- truthound/stores/tiering/store.py +526 -0
- truthound/stores/versioning/__init__.py +56 -0
- truthound/stores/versioning/base.py +376 -0
- truthound/stores/versioning/store.py +660 -0
- truthound/stores/versioning/strategies.py +353 -0
- truthound/types.py +56 -0
- truthound/validators/__init__.py +774 -0
- truthound/validators/aggregate/__init__.py +27 -0
- truthound/validators/aggregate/central.py +116 -0
- truthound/validators/aggregate/extremes.py +116 -0
- truthound/validators/aggregate/spread.py +118 -0
- truthound/validators/aggregate/sum.py +64 -0
- truthound/validators/aggregate/type.py +78 -0
- truthound/validators/anomaly/__init__.py +93 -0
- truthound/validators/anomaly/base.py +431 -0
- truthound/validators/anomaly/ml_based.py +1190 -0
- truthound/validators/anomaly/multivariate.py +647 -0
- truthound/validators/anomaly/statistical.py +599 -0
- truthound/validators/base.py +1089 -0
- truthound/validators/business_rule/__init__.py +46 -0
- truthound/validators/business_rule/base.py +147 -0
- truthound/validators/business_rule/checksum.py +509 -0
- truthound/validators/business_rule/financial.py +526 -0
- truthound/validators/cache.py +733 -0
- truthound/validators/completeness/__init__.py +39 -0
- truthound/validators/completeness/conditional.py +73 -0
- truthound/validators/completeness/default.py +98 -0
- truthound/validators/completeness/empty.py +103 -0
- truthound/validators/completeness/nan.py +337 -0
- truthound/validators/completeness/null.py +152 -0
- truthound/validators/cross_table/__init__.py +17 -0
- truthound/validators/cross_table/aggregate.py +333 -0
- truthound/validators/cross_table/row_count.py +122 -0
- truthound/validators/datetime/__init__.py +29 -0
- truthound/validators/datetime/format.py +78 -0
- truthound/validators/datetime/freshness.py +269 -0
- truthound/validators/datetime/order.py +73 -0
- truthound/validators/datetime/parseable.py +185 -0
- truthound/validators/datetime/range.py +202 -0
- truthound/validators/datetime/timezone.py +69 -0
- truthound/validators/distribution/__init__.py +49 -0
- truthound/validators/distribution/distribution.py +128 -0
- truthound/validators/distribution/monotonic.py +119 -0
- truthound/validators/distribution/outlier.py +178 -0
- truthound/validators/distribution/quantile.py +80 -0
- truthound/validators/distribution/range.py +254 -0
- truthound/validators/distribution/set.py +125 -0
- truthound/validators/distribution/statistical.py +459 -0
- truthound/validators/drift/__init__.py +79 -0
- truthound/validators/drift/base.py +427 -0
- truthound/validators/drift/multi_feature.py +401 -0
- truthound/validators/drift/numeric.py +395 -0
- truthound/validators/drift/psi.py +446 -0
- truthound/validators/drift/statistical.py +510 -0
- truthound/validators/enterprise.py +1658 -0
- truthound/validators/geospatial/__init__.py +80 -0
- truthound/validators/geospatial/base.py +97 -0
- truthound/validators/geospatial/boundary.py +238 -0
- truthound/validators/geospatial/coordinate.py +351 -0
- truthound/validators/geospatial/distance.py +399 -0
- truthound/validators/geospatial/polygon.py +665 -0
- truthound/validators/i18n/__init__.py +308 -0
- truthound/validators/i18n/bidi.py +571 -0
- truthound/validators/i18n/catalogs.py +570 -0
- truthound/validators/i18n/dialects.py +763 -0
- truthound/validators/i18n/extended_catalogs.py +549 -0
- truthound/validators/i18n/formatting.py +1434 -0
- truthound/validators/i18n/loader.py +1020 -0
- truthound/validators/i18n/messages.py +521 -0
- truthound/validators/i18n/plural.py +683 -0
- truthound/validators/i18n/protocols.py +855 -0
- truthound/validators/i18n/tms.py +1162 -0
- truthound/validators/localization/__init__.py +53 -0
- truthound/validators/localization/base.py +122 -0
- truthound/validators/localization/chinese.py +362 -0
- truthound/validators/localization/japanese.py +275 -0
- truthound/validators/localization/korean.py +524 -0
- truthound/validators/memory/__init__.py +94 -0
- truthound/validators/memory/approximate_knn.py +506 -0
- truthound/validators/memory/base.py +547 -0
- truthound/validators/memory/sgd_online.py +719 -0
- truthound/validators/memory/streaming_ecdf.py +753 -0
- truthound/validators/ml_feature/__init__.py +54 -0
- truthound/validators/ml_feature/base.py +249 -0
- truthound/validators/ml_feature/correlation.py +299 -0
- truthound/validators/ml_feature/leakage.py +344 -0
- truthound/validators/ml_feature/null_impact.py +270 -0
- truthound/validators/ml_feature/scale.py +264 -0
- truthound/validators/multi_column/__init__.py +89 -0
- truthound/validators/multi_column/arithmetic.py +284 -0
- truthound/validators/multi_column/base.py +231 -0
- truthound/validators/multi_column/comparison.py +273 -0
- truthound/validators/multi_column/consistency.py +312 -0
- truthound/validators/multi_column/statistical.py +299 -0
- truthound/validators/optimization/__init__.py +164 -0
- truthound/validators/optimization/aggregation.py +563 -0
- truthound/validators/optimization/covariance.py +556 -0
- truthound/validators/optimization/geo.py +626 -0
- truthound/validators/optimization/graph.py +587 -0
- truthound/validators/optimization/orchestrator.py +970 -0
- truthound/validators/optimization/profiling.py +1312 -0
- truthound/validators/privacy/__init__.py +223 -0
- truthound/validators/privacy/base.py +635 -0
- truthound/validators/privacy/ccpa.py +670 -0
- truthound/validators/privacy/gdpr.py +728 -0
- truthound/validators/privacy/global_patterns.py +604 -0
- truthound/validators/privacy/plugins.py +867 -0
- truthound/validators/profiling/__init__.py +52 -0
- truthound/validators/profiling/base.py +175 -0
- truthound/validators/profiling/cardinality.py +312 -0
- truthound/validators/profiling/entropy.py +391 -0
- truthound/validators/profiling/frequency.py +455 -0
- truthound/validators/pushdown_support.py +660 -0
- truthound/validators/query/__init__.py +91 -0
- truthound/validators/query/aggregate.py +346 -0
- truthound/validators/query/base.py +246 -0
- truthound/validators/query/column.py +249 -0
- truthound/validators/query/expression.py +274 -0
- truthound/validators/query/result.py +323 -0
- truthound/validators/query/row_count.py +264 -0
- truthound/validators/referential/__init__.py +80 -0
- truthound/validators/referential/base.py +395 -0
- truthound/validators/referential/cascade.py +391 -0
- truthound/validators/referential/circular.py +563 -0
- truthound/validators/referential/foreign_key.py +624 -0
- truthound/validators/referential/orphan.py +485 -0
- truthound/validators/registry.py +112 -0
- truthound/validators/schema/__init__.py +41 -0
- truthound/validators/schema/column_count.py +142 -0
- truthound/validators/schema/column_exists.py +80 -0
- truthound/validators/schema/column_order.py +82 -0
- truthound/validators/schema/column_pair.py +85 -0
- truthound/validators/schema/column_pair_set.py +195 -0
- truthound/validators/schema/column_type.py +94 -0
- truthound/validators/schema/multi_column.py +53 -0
- truthound/validators/schema/multi_column_aggregate.py +175 -0
- truthound/validators/schema/referential.py +274 -0
- truthound/validators/schema/table_schema.py +91 -0
- truthound/validators/schema_validator.py +219 -0
- truthound/validators/sdk/__init__.py +250 -0
- truthound/validators/sdk/builder.py +680 -0
- truthound/validators/sdk/decorators.py +474 -0
- truthound/validators/sdk/enterprise/__init__.py +211 -0
- truthound/validators/sdk/enterprise/docs.py +725 -0
- truthound/validators/sdk/enterprise/fuzzing.py +659 -0
- truthound/validators/sdk/enterprise/licensing.py +709 -0
- truthound/validators/sdk/enterprise/manager.py +543 -0
- truthound/validators/sdk/enterprise/resources.py +628 -0
- truthound/validators/sdk/enterprise/sandbox.py +766 -0
- truthound/validators/sdk/enterprise/signing.py +603 -0
- truthound/validators/sdk/enterprise/templates.py +865 -0
- truthound/validators/sdk/enterprise/versioning.py +659 -0
- truthound/validators/sdk/templates.py +757 -0
- truthound/validators/sdk/testing.py +807 -0
- truthound/validators/security/__init__.py +181 -0
- truthound/validators/security/redos/__init__.py +182 -0
- truthound/validators/security/redos/core.py +861 -0
- truthound/validators/security/redos/cpu_monitor.py +593 -0
- truthound/validators/security/redos/cve_database.py +791 -0
- truthound/validators/security/redos/ml/__init__.py +155 -0
- truthound/validators/security/redos/ml/base.py +785 -0
- truthound/validators/security/redos/ml/datasets.py +618 -0
- truthound/validators/security/redos/ml/features.py +359 -0
- truthound/validators/security/redos/ml/models.py +1000 -0
- truthound/validators/security/redos/ml/predictor.py +507 -0
- truthound/validators/security/redos/ml/storage.py +632 -0
- truthound/validators/security/redos/ml/training.py +571 -0
- truthound/validators/security/redos/ml_analyzer.py +937 -0
- truthound/validators/security/redos/optimizer.py +674 -0
- truthound/validators/security/redos/profiler.py +682 -0
- truthound/validators/security/redos/re2_engine.py +709 -0
- truthound/validators/security/redos.py +886 -0
- truthound/validators/security/sql_security.py +1247 -0
- truthound/validators/streaming/__init__.py +126 -0
- truthound/validators/streaming/base.py +292 -0
- truthound/validators/streaming/completeness.py +210 -0
- truthound/validators/streaming/mixin.py +575 -0
- truthound/validators/streaming/range.py +308 -0
- truthound/validators/streaming/sources.py +846 -0
- truthound/validators/string/__init__.py +57 -0
- truthound/validators/string/casing.py +158 -0
- truthound/validators/string/charset.py +96 -0
- truthound/validators/string/format.py +501 -0
- truthound/validators/string/json.py +77 -0
- truthound/validators/string/json_schema.py +184 -0
- truthound/validators/string/length.py +104 -0
- truthound/validators/string/like_pattern.py +237 -0
- truthound/validators/string/regex.py +202 -0
- truthound/validators/string/regex_extended.py +435 -0
- truthound/validators/table/__init__.py +88 -0
- truthound/validators/table/base.py +78 -0
- truthound/validators/table/column_count.py +198 -0
- truthound/validators/table/freshness.py +362 -0
- truthound/validators/table/row_count.py +251 -0
- truthound/validators/table/schema.py +333 -0
- truthound/validators/table/size.py +285 -0
- truthound/validators/timeout/__init__.py +102 -0
- truthound/validators/timeout/advanced/__init__.py +247 -0
- truthound/validators/timeout/advanced/circuit_breaker.py +675 -0
- truthound/validators/timeout/advanced/prediction.py +773 -0
- truthound/validators/timeout/advanced/priority.py +618 -0
- truthound/validators/timeout/advanced/redis_backend.py +770 -0
- truthound/validators/timeout/advanced/retry.py +721 -0
- truthound/validators/timeout/advanced/sampling.py +788 -0
- truthound/validators/timeout/advanced/sla.py +661 -0
- truthound/validators/timeout/advanced/telemetry.py +804 -0
- truthound/validators/timeout/cascade.py +477 -0
- truthound/validators/timeout/deadline.py +657 -0
- truthound/validators/timeout/degradation.py +525 -0
- truthound/validators/timeout/distributed.py +597 -0
- truthound/validators/timeseries/__init__.py +89 -0
- truthound/validators/timeseries/base.py +326 -0
- truthound/validators/timeseries/completeness.py +617 -0
- truthound/validators/timeseries/gap.py +485 -0
- truthound/validators/timeseries/monotonic.py +310 -0
- truthound/validators/timeseries/seasonality.py +422 -0
- truthound/validators/timeseries/trend.py +510 -0
- truthound/validators/uniqueness/__init__.py +59 -0
- truthound/validators/uniqueness/approximate.py +475 -0
- truthound/validators/uniqueness/distinct_values.py +253 -0
- truthound/validators/uniqueness/duplicate.py +118 -0
- truthound/validators/uniqueness/primary_key.py +140 -0
- truthound/validators/uniqueness/unique.py +191 -0
- truthound/validators/uniqueness/within_record.py +599 -0
- truthound/validators/utils.py +756 -0
- truthound-1.0.8.dist-info/METADATA +474 -0
- truthound-1.0.8.dist-info/RECORD +877 -0
- truthound-1.0.8.dist-info/WHEEL +4 -0
- truthound-1.0.8.dist-info/entry_points.txt +2 -0
- truthound-1.0.8.dist-info/licenses/LICENSE +190 -0
|
@@ -0,0 +1,976 @@
|
|
|
1
|
+
"""Dask-native execution engine for distributed data validation.
|
|
2
|
+
|
|
3
|
+
This module provides a Dask-native execution engine that:
|
|
4
|
+
- Executes validation operations directly on Dask DataFrames
|
|
5
|
+
- Avoids Polars conversion overhead for distributed operations
|
|
6
|
+
- Uses Arrow for efficient data transfer when conversion is needed
|
|
7
|
+
- Supports distributed aggregations with proper reduce semantics
|
|
8
|
+
|
|
9
|
+
Architecture:
|
|
10
|
+
┌─────────────────────────────────────────────────────────────────┐
|
|
11
|
+
│ DaskExecutionEngine │
|
|
12
|
+
│ │
|
|
13
|
+
│ ┌──────────────────────────────────────────────────────────┐ │
|
|
14
|
+
│ │ Native Dask Operations │ │
|
|
15
|
+
│ │ (count, aggregate, filter - no conversion overhead) │ │
|
|
16
|
+
│ └──────────────────────────────────────────────────────────┘ │
|
|
17
|
+
│ │ │
|
|
18
|
+
│ ▼ │
|
|
19
|
+
│ ┌──────────────────────────────────────────────────────────┐ │
|
|
20
|
+
│ │ Arrow Bridge (when needed) │ │
|
|
21
|
+
│ │ (zero-copy conversion to Polars for ML validators) │ │
|
|
22
|
+
│ └──────────────────────────────────────────────────────────┘ │
|
|
23
|
+
│ │ │
|
|
24
|
+
│ ▼ │
|
|
25
|
+
│ ┌──────────────────────────────────────────────────────────┐ │
|
|
26
|
+
│ │ Polars LazyFrame (fallback) │ │
|
|
27
|
+
│ │ (only for validators that require Polars operations) │ │
|
|
28
|
+
│ └──────────────────────────────────────────────────────────┘ │
|
|
29
|
+
│ │
|
|
30
|
+
└─────────────────────────────────────────────────────────────────┘
|
|
31
|
+
|
|
32
|
+
Example:
|
|
33
|
+
>>> import dask.dataframe as dd
|
|
34
|
+
>>> from truthound.execution.distributed import DaskExecutionEngine
|
|
35
|
+
>>>
|
|
36
|
+
>>> ddf = dd.read_parquet("large_data.parquet")
|
|
37
|
+
>>>
|
|
38
|
+
>>> # Create native Dask engine
|
|
39
|
+
>>> engine = DaskExecutionEngine.from_dataframe(ddf)
|
|
40
|
+
>>>
|
|
41
|
+
>>> # Native Dask operations (no conversion overhead)
|
|
42
|
+
>>> row_count = engine.count_rows()
|
|
43
|
+
>>> null_counts = engine.count_nulls_all()
|
|
44
|
+
>>> stats = engine.get_stats("price")
|
|
45
|
+
>>>
|
|
46
|
+
>>> # Convert to Polars only when needed (via Arrow)
|
|
47
|
+
>>> lf = engine.to_polars_lazyframe()
|
|
48
|
+
"""
|
|
49
|
+
|
|
50
|
+
from __future__ import annotations
|
|
51
|
+
|
|
52
|
+
import logging
|
|
53
|
+
import time
|
|
54
|
+
from dataclasses import dataclass, field
|
|
55
|
+
from functools import reduce
|
|
56
|
+
from typing import TYPE_CHECKING, Any, Callable, Iterator
|
|
57
|
+
|
|
58
|
+
from truthound.execution.distributed.base import (
|
|
59
|
+
BaseDistributedEngine,
|
|
60
|
+
DistributedEngineConfig,
|
|
61
|
+
ExecutionMetrics,
|
|
62
|
+
)
|
|
63
|
+
from truthound.execution.distributed.protocols import (
|
|
64
|
+
AggregationScope,
|
|
65
|
+
AggregationSpec,
|
|
66
|
+
ComputeBackend,
|
|
67
|
+
DistributedResult,
|
|
68
|
+
PartitionInfo,
|
|
69
|
+
PartitionStrategy,
|
|
70
|
+
get_aggregator,
|
|
71
|
+
)
|
|
72
|
+
|
|
73
|
+
if TYPE_CHECKING:
|
|
74
|
+
import dask.dataframe as dd
|
|
75
|
+
import pandas as pd
|
|
76
|
+
import pyarrow as pa
|
|
77
|
+
from distributed import Client
|
|
78
|
+
|
|
79
|
+
logger = logging.getLogger(__name__)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
# =============================================================================
|
|
83
|
+
# Configuration
|
|
84
|
+
# =============================================================================
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
@dataclass
class DaskEngineConfig(DistributedEngineConfig):
    """Configuration for Dask execution engine.

    Adds Dask-specific options on top of ``DistributedEngineConfig``.

    Attributes:
        scheduler: Dask scheduler to use ('distributed', 'threads', 'synchronous').
            ``DaskExecutionEngine._setup_client`` only creates a client when
            this is 'distributed' and no client was supplied.
        client_address: Address of distributed scheduler (for distributed mode).
            When set, it is the only argument passed to ``distributed.Client``.
        n_workers: Number of workers (for local cluster).
        threads_per_worker: Threads per worker.
        memory_per_worker: Memory limit per worker (passed to the client as
            ``memory_limit``).
        processes: Use processes instead of threads.
        dashboard_address: Dashboard address (for distributed mode).
        blocksize: Block size for reading files. Forwarded as ``blocksize`` by
            ``DaskExecutionEngine.from_parquet`` and ``from_csv``.
        persist_intermediate: Persist intermediate results.
    """

    scheduler: str = "threads"  # 'distributed', 'threads', 'synchronous'
    # Optional address of an existing scheduler; None means "no address given".
    client_address: str | None = None
    # The next five options are passed to distributed.Client only when
    # client_address is empty (see DaskExecutionEngine._setup_client).
    n_workers: int | None = None
    threads_per_worker: int = 2
    memory_per_worker: str = "2GB"
    processes: bool = False
    dashboard_address: str = ":8787"
    # Used by the file-based factory methods when reading Parquet/CSV.
    blocksize: str = "128MB"
    # NOTE(review): not read anywhere in the visible part of this module;
    # confirm where intermediate persistence is actually applied.
    persist_intermediate: bool = False
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def _check_dask_available() -> None:
|
|
115
|
+
"""Check if Dask is available."""
|
|
116
|
+
try:
|
|
117
|
+
import dask.dataframe # noqa: F401
|
|
118
|
+
except ImportError:
|
|
119
|
+
raise ImportError(
|
|
120
|
+
"dask is required for DaskExecutionEngine. "
|
|
121
|
+
"Install with: pip install dask[dataframe] distributed"
|
|
122
|
+
)
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
# =============================================================================
|
|
126
|
+
# Dask Execution Engine
|
|
127
|
+
# =============================================================================
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
class DaskExecutionEngine(BaseDistributedEngine[DaskEngineConfig]):
|
|
131
|
+
"""Dask-native execution engine for distributed validation.
|
|
132
|
+
|
|
133
|
+
This engine executes validation operations directly on Dask DataFrames,
|
|
134
|
+
avoiding the overhead of converting to Polars for operations that can
|
|
135
|
+
be performed natively in Dask.
|
|
136
|
+
|
|
137
|
+
Key Features:
|
|
138
|
+
- Native Dask aggregations (count, sum, mean, min, max, etc.)
|
|
139
|
+
- Distributed null/duplicate checking
|
|
140
|
+
- Arrow-based zero-copy conversion to Polars when needed
|
|
141
|
+
- Partition-aware operations
|
|
142
|
+
- Lazy evaluation with optimized task graphs
|
|
143
|
+
|
|
144
|
+
Example:
|
|
145
|
+
>>> engine = DaskExecutionEngine.from_dataframe(dask_df)
|
|
146
|
+
>>> null_counts = engine.count_nulls_all() # Native Dask
|
|
147
|
+
>>> lf = engine.to_polars_lazyframe() # Arrow-based conversion
|
|
148
|
+
"""
|
|
149
|
+
|
|
150
|
+
engine_type = "dask"
|
|
151
|
+
|
|
152
|
+
def __init__(
    self,
    dask_df: "dd.DataFrame",
    config: DaskEngineConfig | None = None,
    client: "Client | None" = None,
) -> None:
    """Build an engine around an existing Dask DataFrame.

    Args:
        dask_df: Dask DataFrame to validate.
        config: Engine configuration; forwarded unchanged to the base class.
        client: Dask distributed client to reuse, if the caller has one.
    """
    # Fail early with a helpful message when dask is missing.
    _check_dask_available()
    super().__init__(config)

    # Data source and the (possibly absent) caller-supplied client.
    self._ddf, self._client = dask_df, client

    # Snapshot of the schema as plain Python containers.
    self._columns = list(dask_df.columns)
    self._dtypes = dict(dask_df.dtypes)

    # Row count cache; starts unset.
    self._cached_row_count: int | None = None

    # Must run last: it reads the config and may replace self._client.
    self._setup_client()
|
|
176
|
+
|
|
177
|
+
@classmethod
def _default_config(cls) -> DaskEngineConfig:
    """Return a ``DaskEngineConfig`` populated with its default values."""
    default_cfg = DaskEngineConfig()
    return default_cfg
|
|
181
|
+
|
|
182
|
+
def _setup_client(self) -> None:
    """Create a ``distributed.Client`` when the configuration asks for one.

    Nothing happens unless the configured scheduler is ``"distributed"`` and
    no client has been provided yet. If the ``distributed`` package cannot be
    imported, a warning is logged and the engine keeps running without a
    client.
    """
    nothing_to_do = (
        self._config.scheduler != "distributed" or self._client is not None
    )
    if nothing_to_do:
        return

    try:
        from distributed import Client

        if not self._config.client_address:
            # No scheduler address: hand the worker options to the client.
            local_options = {
                "n_workers": self._config.n_workers,
                "threads_per_worker": self._config.threads_per_worker,
                "memory_limit": self._config.memory_per_worker,
                "processes": self._config.processes,
                "dashboard_address": self._config.dashboard_address,
            }
            self._client = Client(**local_options)
        else:
            # Connect to the scheduler at the configured address.
            self._client = Client(self._config.client_address)
    except ImportError:
        logger.warning(
            "distributed not installed. Using default scheduler."
        )
|
|
202
|
+
|
|
203
|
+
# -------------------------------------------------------------------------
|
|
204
|
+
# Factory Methods
|
|
205
|
+
# -------------------------------------------------------------------------
|
|
206
|
+
|
|
207
|
+
@classmethod
def from_dataframe(
    cls,
    ddf: "dd.DataFrame",
    config: DaskEngineConfig | None = None,
    client: "Client | None" = None,
) -> "DaskExecutionEngine":
    """Wrap an already-constructed Dask DataFrame in an engine.

    Args:
        ddf: Dask DataFrame.
        config: Optional configuration.
        client: Optional distributed client.

    Returns:
        A new engine instance built from the given arguments.
    """
    engine = cls(ddf, config, client)
    return engine
|
|
225
|
+
|
|
226
|
+
@classmethod
def from_parquet(
    cls,
    path: str,
    config: DaskEngineConfig | None = None,
    client: "Client | None" = None,
    **read_kwargs: Any,
) -> "DaskExecutionEngine":
    """Create engine from Parquet files.

    Args:
        path: Path to Parquet files (can use glob patterns).
        config: Optional configuration. Its ``blocksize`` is used as the
            default block size for reading.
        client: Optional distributed client.
        **read_kwargs: Additional arguments for read_parquet. A
            ``blocksize`` given here takes precedence over
            ``config.blocksize``.

    Returns:
        DaskExecutionEngine instance.

    Raises:
        ImportError: If dask is not installed.
    """
    _check_dask_available()
    import dask.dataframe as dd

    cfg = config or DaskEngineConfig()
    # Passing blocksize both explicitly and through **read_kwargs would raise
    # "got multiple values for keyword argument"; let the caller's value win
    # and fall back to the configured one otherwise.
    read_kwargs.setdefault("blocksize", cfg.blocksize)
    ddf = dd.read_parquet(path, **read_kwargs)

    # The original (possibly None) config is forwarded, as before, so the
    # base class applies its own defaulting.
    return cls(ddf, config, client)
|
|
252
|
+
|
|
253
|
+
@classmethod
def from_csv(
    cls,
    path: str,
    config: DaskEngineConfig | None = None,
    client: "Client | None" = None,
    **read_kwargs: Any,
) -> "DaskExecutionEngine":
    """Create engine from CSV files.

    Args:
        path: Path to CSV files (can use glob patterns).
        config: Optional configuration. Its ``blocksize`` is used as the
            default block size for reading.
        client: Optional distributed client.
        **read_kwargs: Additional arguments for read_csv. A ``blocksize``
            given here takes precedence over ``config.blocksize``.

    Returns:
        DaskExecutionEngine instance.

    Raises:
        ImportError: If dask is not installed.
    """
    _check_dask_available()
    import dask.dataframe as dd

    cfg = config or DaskEngineConfig()
    # Passing blocksize both explicitly and through **read_kwargs would raise
    # "got multiple values for keyword argument"; let the caller's value win
    # and fall back to the configured one otherwise.
    read_kwargs.setdefault("blocksize", cfg.blocksize)
    ddf = dd.read_csv(path, **read_kwargs)

    # The original (possibly None) config is forwarded, as before, so the
    # base class applies its own defaulting.
    return cls(ddf, config, client)
|
|
279
|
+
|
|
280
|
+
@classmethod
def from_pandas(
    cls,
    pdf: "pd.DataFrame",
    npartitions: int = 4,
    config: DaskEngineConfig | None = None,
    client: "Client | None" = None,
) -> "DaskExecutionEngine":
    """Build an engine from an in-memory Pandas DataFrame.

    The frame is converted to a Dask DataFrame with the requested number of
    partitions before the engine is constructed.

    Args:
        pdf: Pandas DataFrame.
        npartitions: Number of partitions.
        config: Optional configuration.
        client: Optional distributed client.

    Returns:
        DaskExecutionEngine instance.
    """
    _check_dask_available()
    import dask.dataframe as dd

    return cls(dd.from_pandas(pdf, npartitions=npartitions), config, client)
|
|
305
|
+
|
|
306
|
+
# -------------------------------------------------------------------------
|
|
307
|
+
# Properties
|
|
308
|
+
# -------------------------------------------------------------------------
|
|
309
|
+
|
|
310
|
+
@property
def backend_type(self) -> ComputeBackend:
    """Compute backend identifier for this engine (always Dask)."""
    backend = ComputeBackend.DASK
    return backend
|
|
314
|
+
|
|
315
|
+
@property
def dask_dataframe(self) -> "dd.DataFrame":
    """The Dask DataFrame this engine was constructed with."""
    frame = self._ddf
    return frame
|
|
319
|
+
|
|
320
|
+
@property
def client(self) -> "Client | None":
    """The distributed client in use, or ``None`` when there is none."""
    active_client = self._client
    return active_client
|
|
324
|
+
|
|
325
|
+
@property
def supports_sql_pushdown(self) -> bool:
    """Whether SQL pushdown is available; this engine always reports ``False``.

    Dask has limited SQL pushdown support via dask-sql.
    """
    pushdown_available = False
    return pushdown_available
|
|
329
|
+
|
|
330
|
+
# -------------------------------------------------------------------------
|
|
331
|
+
# Abstract Method Implementations
|
|
332
|
+
# -------------------------------------------------------------------------
|
|
333
|
+
|
|
334
|
+
def _get_partition_count(self) -> int:
    """Return ``npartitions`` of the underlying Dask DataFrame."""
    partition_total = self._ddf.npartitions
    return partition_total
|
|
337
|
+
|
|
338
|
+
def _get_partition_info(self) -> list[PartitionInfo]:
    """Describe every partition of the underlying DataFrame.

    Note: Dask doesn't expose partition boundaries easily, so each entry
    carries only the partition index, the total partition count and the
    column names.

    Returns:
        One ``PartitionInfo`` per partition, in partition order.
    """
    total = self._get_partition_count()
    column_names = tuple(self._columns)

    def describe(index: int) -> PartitionInfo:
        return PartitionInfo(
            partition_id=index,
            total_partitions=total,
            columns=column_names,
        )

    return list(map(describe, range(total)))
|
|
355
|
+
|
|
356
|
+
def _execute_on_partitions(
    self,
    operation: str,
    func: Callable[[Any], dict[str, Any]],
    columns: list[str] | None = None,
) -> list[DistributedResult]:
    """Execute function on all partitions using map_partitions.

    Each partition is passed to ``func`` as a pandas DataFrame. The returned
    dict is read for the keys ``"value"``, ``"errors"`` and ``"metadata"``
    and wrapped, together with the partition's row count and the time spent
    in ``func``, into one ``DistributedResult``.

    Args:
        operation: Operation name for metrics.
        func: Function to apply to each partition (receives pandas DataFrame).
        columns: Columns to include (None or empty = all).

    Returns:
        Results from all partitions.

    Raises:
        Exception: Any error raised while building or computing the graph is
            recorded on the metrics object and re-raised unchanged.
    """
    import pandas as pd

    metrics = self._start_metrics(operation)

    try:
        ddf = self._ddf
        if columns:
            ddf = ddf[columns]

        # Map partitions - func receives pandas DataFrame.
        # The parameter must keep the name ``partition_info``: Dask fills it
        # in by keyword when the mapped function declares it.
        def wrapped_func(pdf: pd.DataFrame, partition_info: dict | None = None) -> pd.DataFrame:
            # perf_counter is monotonic, so the measured duration cannot be
            # skewed (or go negative) when the wall clock is adjusted.
            started = time.perf_counter()
            result = func(pdf)
            duration_ms = (time.perf_counter() - started) * 1000

            partition_id = 0
            if partition_info:
                partition_id = partition_info.get("number", 0)

            # One summary row per partition.
            return pd.DataFrame([{
                "partition_id": partition_id,
                "value": result.get("value"),
                "row_count": len(pdf),
                "duration_ms": duration_ms,
                "errors": result.get("errors", []),
                "metadata": result.get("metadata", {}),
            }])

        # Explicit meta describing the summary frame's columns and dtypes.
        results_ddf = ddf.map_partitions(
            wrapped_func,
            meta=pd.DataFrame({
                "partition_id": pd.Series(dtype=int),
                "value": pd.Series(dtype=object),
                "row_count": pd.Series(dtype=int),
                "duration_ms": pd.Series(dtype=float),
                "errors": pd.Series(dtype=object),
                "metadata": pd.Series(dtype=object),
            }),
        )

        results_pdf = results_ddf.compute()

        results = []
        total_rows = 0
        for _, row in results_pdf.iterrows():
            row_count = row["row_count"]
            total_rows += row_count
            results.append(
                DistributedResult(
                    partition_id=row["partition_id"],
                    operation=operation,
                    value=row["value"],
                    row_count=row_count,
                    duration_ms=row["duration_ms"],
                    # Normalise falsy cells to fresh empty containers.
                    errors=row["errors"] if row["errors"] else [],
                    metadata=row["metadata"] if row["metadata"] else {},
                )
            )

        metrics.partitions_processed = len(results)
        metrics.rows_processed = total_rows

        return results

    except Exception as e:
        metrics.errors.append(str(e))
        raise
    finally:
        self._end_metrics(metrics)
|
|
441
|
+
|
|
442
|
+
def _aggregate_distributed(
    self,
    spec: AggregationSpec,
) -> dict[str, Any]:
    """Perform distributed aggregation using native Dask operations.

    Every entry of ``spec.aggregations`` is evaluated against the engine's
    Dask DataFrame and stored under its alias. Operations without a native
    branch here are delegated to ``_aggregate_with_aggregator``.

    Args:
        spec: Aggregation specification.

    Returns:
        Aggregated results keyed by alias. ``minmax`` yields
        ``{"min": ..., "max": ...}`` and ``null_count`` yields
        ``{"null_count": ..., "total_count": ...}``.

    Raises:
        Exception: Any error raised while aggregating is recorded on the
            metrics object and re-raised unchanged.
    """
    metrics = self._start_metrics("aggregate")

    try:
        results = {}

        for agg in spec.aggregations:
            column = agg.column
            operation = agg.operation
            alias = agg.alias
            params = agg.params

            if operation == "count":
                if column == "*":
                    # Row count of the whole frame.
                    value = len(self._ddf)
                else:
                    value = self._ddf[column].count().compute()
                results[alias] = value

            elif operation in ("sum", "mean", "min", "max"):
                # Plain reductions share one shape: series.<op>().compute().
                reducer = getattr(self._ddf[column], operation)
                results[alias] = reducer().compute()

            elif operation in ("std", "var"):
                ddof = params.get("ddof", 1)
                reducer = getattr(self._ddf[column], operation)
                results[alias] = reducer(ddof=ddof).compute()

            elif operation == "minmax":
                import dask

                # Evaluate both reductions in a single graph execution
                # instead of running two separate passes over the data.
                series = self._ddf[column]
                min_val, max_val = dask.compute(series.min(), series.max())
                results[alias] = {"min": min_val, "max": max_val}

            elif operation == "null_count":
                null_count = self._ddf[column].isna().sum().compute()
                total_count = len(self._ddf)
                results[alias] = {
                    "null_count": int(null_count),
                    "total_count": total_count,
                }

            elif operation == "distinct_count":
                value = self._ddf[column].nunique().compute()
                results[alias] = int(value)

            else:
                # Use custom aggregator via map-reduce
                result = self._aggregate_with_aggregator(agg)
                results[alias] = result

        return results

    except Exception as e:
        metrics.errors.append(str(e))
        raise
    finally:
        self._end_metrics(metrics)
|
|
527
|
+
|
|
528
|
+
def _aggregate_with_aggregator(
    self,
    agg: Any,
) -> Any:
    """Run a custom aggregator over the Dask DataFrame in map-reduce style.

    Every partition is folded into one partial aggregator state, the
    partial states are collected, merged pairwise and then finalized.

    Args:
        agg: Aggregation specification. Its ``operation``, ``params`` and
            ``column`` attributes are read.

    Returns:
        The value produced by the aggregator's ``finalize``.
    """
    import pandas as pd

    aggregator = get_aggregator(agg.operation, **agg.params)
    target_column = agg.column

    def fold_partition(partition: pd.DataFrame) -> pd.DataFrame:
        # Map phase: accumulate every value of this partition into one state.
        initial_state = aggregator.initialize()
        partition_state = reduce(
            aggregator.accumulate,
            partition[target_column],
            initial_state,
        )
        return pd.DataFrame([{"state": partition_state}])

    # One object-typed "state" column describes the per-partition output.
    state_meta = pd.DataFrame({"state": pd.Series(dtype=object)})
    collected = self._ddf.map_partitions(fold_partition, meta=state_meta).compute()
    partial_states = collected["state"].tolist()

    # Reduce phase: merge the partial states, or finalize a fresh state when
    # no partial state came back at all.
    if partial_states:
        return aggregator.finalize(reduce(aggregator.merge, partial_states))
    return aggregator.finalize(aggregator.initialize())
|
|
565
|
+
|
|
566
|
+
def _to_arrow_batches(
    self,
    batch_size: int | None = None,
) -> list["pa.RecordBatch"]:
    """Convert the Dask DataFrame to a list of Arrow record batches.

    The Dask graph is materialized exactly once. If the computed frame
    offers a ``to_arrow`` method it is used; otherwise the frame is
    converted with ``pyarrow.Table.from_pandas``.

    Args:
        batch_size: Maximum number of rows per batch. When falsy
            (``None`` or ``0``), ``self._config.arrow_batch_size`` is used.

    Returns:
        List of Arrow record batches.
    """
    import pyarrow as pa

    batch_size = batch_size or self._config.arrow_batch_size

    # Materialize once, outside the try block: the fallback must not re-run
    # the whole graph, and an AttributeError raised while computing must not
    # be mistaken for "to_arrow is unavailable".
    computed = self._ddf.compute()

    try:
        table = computed.to_arrow()
    except AttributeError:
        # The computed frame has no usable to_arrow (e.g. a plain pandas
        # DataFrame), so convert through pyarrow directly.
        logger.debug("Falling back to Pandas-based Arrow conversion")
        table = pa.Table.from_pandas(computed)

    return table.to_batches(max_chunksize=batch_size)
|
|
593
|
+
|
|
594
|
+
def _repartition(self, num_partitions: int) -> "DaskExecutionEngine":
    """Build a new engine whose DataFrame has been repartitioned.

    Args:
        num_partitions: Partition count passed to the DataFrame's
            ``repartition`` as ``npartitions``.

    Returns:
        A new engine wrapping the repartitioned DataFrame, created with
        this engine's config and client.
    """
    return DaskExecutionEngine(
        self._ddf.repartition(npartitions=num_partitions),
        self._config,
        self._client,
    )
|
|
605
|
+
|
|
606
|
+
def coalesce(self, num_partitions: int) -> "DaskExecutionEngine":
    """Return a new engine with the data coalesced into ``num_partitions``.

    Args:
        num_partitions: New number of partitions.

    Returns:
        A new engine wrapping the coalesced DataFrame, created with this
        engine's config and client.
    """
    # There is no dedicated coalesce call here; the DataFrame's repartition
    # with the requested partition count serves as the equivalent.
    merged_frame = self._ddf.repartition(npartitions=num_partitions)
    engine = DaskExecutionEngine(merged_frame, self._config, self._client)
    return engine
|
|
618
|
+
|
|
619
|
+
# -------------------------------------------------------------------------
|
|
620
|
+
# Core Operation Overrides (Native Dask)
|
|
621
|
+
# -------------------------------------------------------------------------
|
|
622
|
+
|
|
623
|
+
def count_rows(self) -> int:
    """Return the row count, taking ``len`` of the DataFrame at most once.

    The instance-level ``_cached_row_count`` is consulted first, then the
    keyed cache; only when both miss is the length of the DataFrame taken
    and stored in both places.

    Returns:
        Number of rows.
    """
    row_count = self._cached_row_count
    if row_count is not None:
        return row_count

    key = self._cache_key("count_rows")
    row_count = self._get_cached(key)
    if row_count is None:
        # Both caches missed: count for real and remember the answer.
        row_count = len(self._ddf)
        self._cached_row_count = row_count
        self._set_cached(key, row_count)
    return row_count
|
|
637
|
+
|
|
638
|
+
def get_columns(self) -> list[str]:
    """Return the column names stored on the engine (``self._columns``)."""
    column_names = self._columns
    return column_names
|
|
641
|
+
|
|
642
|
+
def count_nulls(self, column: str) -> int:
    """Count null values in ``column``, caching the result per column.

    Args:
        column: Name of the column to inspect.

    Returns:
        Number of entries flagged by ``isna`` in the column.
    """
    key = self._cache_key("count_nulls", column)
    hit = self._get_cached(key)
    if hit is not None:
        return hit

    null_mask = self._ddf[column].isna()
    null_total = int(null_mask.sum().compute())
    self._set_cached(key, null_total)
    return null_total
|
|
652
|
+
|
|
653
|
+
def count_nulls_all(self) -> dict[str, int]:
    """Count nulls for every column with a single ``dask.compute`` call.

    Returns:
        Mapping of column name to its null count; the mapping is cached.
    """
    key = self._cache_key("count_nulls_all")
    hit = self._get_cached(key)
    if hit is not None:
        return hit

    # Build one lazy null-count per column; nothing is evaluated yet.
    lazy_counts = {name: self._ddf[name].isna().sum() for name in self._columns}

    import dask

    # Hand the whole mapping to dask so all counts are computed together.
    evaluated = dask.compute(lazy_counts)[0]
    null_counts = {name: int(total) for name, total in evaluated.items()}

    self._set_cached(key, null_counts)
    return null_counts
|
|
673
|
+
|
|
674
|
+
def count_distinct(self, column: str) -> int:
    """Count distinct values in ``column`` via ``nunique``, with caching.

    Args:
        column: Name of the column to inspect.

    Returns:
        Number of distinct values reported by ``nunique``.
    """
    key = self._cache_key("count_distinct", column)
    hit = self._get_cached(key)
    if hit is not None:
        return hit

    lazy_unique = self._ddf[column].nunique()
    distinct_total = int(lazy_unique.compute())
    self._set_cached(key, distinct_total)
    return distinct_total
|
|
684
|
+
|
|
685
|
+
def get_stats(self, column: str) -> dict[str, Any]:
    """Collect summary statistics for ``column`` in one ``dask.compute`` call.

    Args:
        column: Name of the column to summarize.

    Returns:
        Dict with keys ``count``, ``null_count``, ``mean``, ``std``,
        ``min`` and ``max``. ``count`` and ``null_count`` are converted to
        ``int``; ``mean`` and ``std`` to ``float`` unless they are ``None``;
        ``min`` and ``max`` are passed through as computed. The dict is
        cached per column.
    """
    key = self._cache_key("get_stats", column)
    hit = self._get_cached(key)
    if hit is not None:
        return hit

    def optional_float(value: Any) -> float | None:
        return None if value is None else float(value)

    # Declare every statistic lazily so they can be evaluated together.
    series = self._ddf[column]
    lazy_stats = {
        "count": series.count(),
        "null_count": series.isna().sum(),
        "mean": series.mean(),
        "std": series.std(),
        "min": series.min(),
        "max": series.max(),
    }

    import dask

    evaluated = dask.compute(lazy_stats)[0]

    summary = {
        "count": int(evaluated["count"]),
        "null_count": int(evaluated["null_count"]),
        "mean": optional_float(evaluated["mean"]),
        "std": optional_float(evaluated["std"]),
        "min": evaluated["min"],
        "max": evaluated["max"],
    }

    self._set_cached(key, summary)
    return summary
|
|
718
|
+
|
|
719
|
+
def get_quantiles(
    self,
    column: str,
    quantiles: list[float],
) -> list[float]:
    """Compute the requested quantiles of ``column``, with caching.

    Args:
        column: Name of the column to inspect.
        quantiles: Quantile levels forwarded to the column's ``quantile``.

    Returns:
        The computed quantile values as a list. The cache key includes the
        requested levels as a tuple.
    """
    key = self._cache_key("get_quantiles", column, tuple(quantiles))
    hit = self._get_cached(key)
    if hit is not None:
        return hit

    lazy_quantiles = self._ddf[column].quantile(quantiles)
    values = list(lazy_quantiles.compute())

    self._set_cached(key, values)
    return values
|
|
735
|
+
|
|
736
|
+
def get_value_counts(
|
|
737
|
+
self,
|
|
738
|
+
column: str,
|
|
739
|
+
limit: int | None = None,
|
|
740
|
+
) -> dict[Any, int]:
|
|
741
|
+
"""Get value counts using native Dask value_counts."""
|
|
742
|
+
cache_key = self._cache_key("get_value_counts", column, limit)
|
|
743
|
+
cached = self._get_cached(cache_key)
|
|
744
|
+
if cached is not None:
|
|
745
|
+
return cached
|
|
746
|
+
|
|
747
|
+
counts = self._ddf[column].value_counts()
|
|
748
|
+
|
|
749
|
+
if limit:
|
|
750
|
+
counts = counts.head(limit, npartitions=-1, compute=False)
|
|
751
|
+
|
|
752
|
+
result_series = counts.compute()
|
|
753
|
+
result = dict(result_series)
|
|
754
|
+
|
|
755
|
+
self._set_cached(cache_key, result)
|
|
756
|
+
return result
|
|
757
|
+
|
|
758
|
+
def count_duplicates(self, columns: list[str]) -> int:
    """Count rows that duplicate another row on the given columns.

    Args:
        columns: Columns that define row identity.

    Returns:
        Total row count minus the number of rows left after
        ``drop_duplicates`` on the selected columns. The result is cached.
    """
    key = self._cache_key("count_duplicates", tuple(columns))
    hit = self._get_cached(key)
    if hit is not None:
        return hit

    total_rows = self.count_rows()
    deduplicated = self._ddf[columns].drop_duplicates()
    duplicate_total = total_rows - len(deduplicated)

    self._set_cached(key, duplicate_total)
    return duplicate_total
|
|
771
|
+
|
|
772
|
+
def count_matching_regex(self, column: str, pattern: str) -> int:
    """Count values of ``column`` that match ``pattern``, with caching.

    Args:
        column: Name of the column to inspect.
        pattern: Regular expression passed to ``.str.match``.

    Returns:
        Number of matching values; ``na=False`` is passed to the match call.
    """
    key = self._cache_key("count_matching_regex", column, pattern)
    hit = self._get_cached(key)
    if hit is not None:
        return hit

    match_mask = self._ddf[column].str.match(pattern, na=False)
    match_total = int(match_mask.sum().compute())

    self._set_cached(key, match_total)
    return match_total
|
|
788
|
+
|
|
789
|
+
def count_in_range(
|
|
790
|
+
self,
|
|
791
|
+
column: str,
|
|
792
|
+
min_value: Any | None = None,
|
|
793
|
+
max_value: Any | None = None,
|
|
794
|
+
inclusive: bool = True,
|
|
795
|
+
) -> int:
|
|
796
|
+
"""Count values in range using native Dask filter."""
|
|
797
|
+
cache_key = self._cache_key(
|
|
798
|
+
"count_in_range", column, min_value, max_value, inclusive
|
|
799
|
+
)
|
|
800
|
+
cached = self._get_cached(cache_key)
|
|
801
|
+
if cached is not None:
|
|
802
|
+
return cached
|
|
803
|
+
|
|
804
|
+
series = self._ddf[column]
|
|
805
|
+
mask = None
|
|
806
|
+
|
|
807
|
+
if min_value is not None:
|
|
808
|
+
if inclusive:
|
|
809
|
+
mask = series >= min_value
|
|
810
|
+
else:
|
|
811
|
+
mask = series > min_value
|
|
812
|
+
|
|
813
|
+
if max_value is not None:
|
|
814
|
+
max_mask = series <= max_value if inclusive else series < max_value
|
|
815
|
+
mask = mask & max_mask if mask is not None else max_mask
|
|
816
|
+
|
|
817
|
+
if mask is None:
|
|
818
|
+
count = self.count_rows()
|
|
819
|
+
else:
|
|
820
|
+
count = int(mask.sum().compute())
|
|
821
|
+
|
|
822
|
+
self._set_cached(cache_key, count)
|
|
823
|
+
return count
|
|
824
|
+
|
|
825
|
+
def count_in_set(self, column: str, values: set[Any]) -> int:
    """Count values of ``column`` that belong to ``values``, with caching.

    Args:
        column: Name of the column to inspect.
        values: Allowed values; passed to ``isin`` as a list and used in
            the cache key as a ``frozenset``.

    Returns:
        Number of column values contained in ``values``.
    """
    key = self._cache_key("count_in_set", column, frozenset(values))
    hit = self._get_cached(key)
    if hit is not None:
        return hit

    membership_mask = self._ddf[column].isin(list(values))
    member_total = int(membership_mask.sum().compute())
    self._set_cached(key, member_total)
    return member_total
|
|
835
|
+
|
|
836
|
+
# -------------------------------------------------------------------------
|
|
837
|
+
# Sampling
|
|
838
|
+
# -------------------------------------------------------------------------
|
|
839
|
+
|
|
840
|
+
def sample(
|
|
841
|
+
self,
|
|
842
|
+
n: int = 1000,
|
|
843
|
+
seed: int | None = None,
|
|
844
|
+
) -> "DaskExecutionEngine":
|
|
845
|
+
"""Create sampled engine using Dask's native sampling.
|
|
846
|
+
|
|
847
|
+
Args:
|
|
848
|
+
n: Target number of rows.
|
|
849
|
+
seed: Random seed.
|
|
850
|
+
|
|
851
|
+
Returns:
|
|
852
|
+
New engine with sampled data.
|
|
853
|
+
"""
|
|
854
|
+
row_count = self.count_rows()
|
|
855
|
+
|
|
856
|
+
if row_count <= n:
|
|
857
|
+
return self
|
|
858
|
+
|
|
859
|
+
fraction = min((n * 1.1) / row_count, 1.0)
|
|
860
|
+
|
|
861
|
+
sampled = self._ddf.sample(
|
|
862
|
+
frac=fraction,
|
|
863
|
+
random_state=seed,
|
|
864
|
+
)
|
|
865
|
+
|
|
866
|
+
# Limit to exact n rows
|
|
867
|
+
sampled = sampled.head(n, npartitions=-1, compute=False)
|
|
868
|
+
|
|
869
|
+
return DaskExecutionEngine(sampled, self._config, self._client)
|
|
870
|
+
|
|
871
|
+
# -------------------------------------------------------------------------
|
|
872
|
+
# Dask-Specific Methods
|
|
873
|
+
# -------------------------------------------------------------------------
|
|
874
|
+
|
|
875
|
+
def persist(self) -> "DaskExecutionEngine":
    """Persist the underlying DataFrame and keep the persisted handle.

    Returns:
        This same engine, whose ``_ddf`` now refers to the result of
        ``persist()``.
    """
    persisted_frame = self._ddf.persist()
    self._ddf = persisted_frame
    return self
|
|
883
|
+
|
|
884
|
+
def compute(self) -> "pd.DataFrame":
    """Materialize the underlying DataFrame.

    Returns:
        The result of calling ``compute()`` on the wrapped DataFrame.
    """
    materialized = self._ddf.compute()
    return materialized
|
|
891
|
+
|
|
892
|
+
def visualize(
    self,
    filename: str = "dask_graph",
    format: str = "png",
) -> str:
    """Render the task graph of the underlying DataFrame.

    Args:
        filename: Output filename (without extension).
        format: Output format (png, svg, pdf).

    Returns:
        Whatever the DataFrame's own ``visualize`` call returns.
        NOTE(review): the annotation says ``str``, but the value is
        forwarded unchanged - confirm what the underlying call yields.
    """
    render_options = {"filename": filename, "format": format}
    return self._ddf.visualize(**render_options)
|
|
907
|
+
|
|
908
|
+
def filter(self, condition: str) -> "DaskExecutionEngine":
    """Return a new engine restricted to rows matching ``condition``.

    Args:
        condition: Query string passed to the DataFrame's ``query``.

    Returns:
        A new engine over the filtered data, created with this engine's
        config and client.
    """
    matching_rows = self._ddf.query(condition)
    engine = DaskExecutionEngine(matching_rows, self._config, self._client)
    return engine
|
|
919
|
+
|
|
920
|
+
def select(self, columns: list[str]) -> "DaskExecutionEngine":
    """Return a new engine that exposes only the requested columns.

    Args:
        columns: Names of the columns to keep.

    Returns:
        A new engine over the column subset, created with this engine's
        config and client.
    """
    return DaskExecutionEngine(
        self._ddf[columns],
        self._config,
        self._client,
    )
|
|
931
|
+
|
|
932
|
+
def head(self, n: int = 5) -> "pd.DataFrame":
    """Return the leading rows of the underlying DataFrame.

    Args:
        n: Number of rows requested.

    Returns:
        The result of the DataFrame's ``head(n)`` call.
    """
    leading_rows = self._ddf.head(n)
    return leading_rows
|
|
942
|
+
|
|
943
|
+
def tail(self, n: int = 5) -> "pd.DataFrame":
    """Return the trailing rows of the underlying DataFrame.

    Args:
        n: Number of rows requested.

    Returns:
        The result of the DataFrame's ``tail(n)`` call.
    """
    trailing_rows = self._ddf.tail(n)
    return trailing_rows
|
|
953
|
+
|
|
954
|
+
def describe(self) -> "pd.DataFrame":
    """Compute descriptive statistics of the underlying DataFrame.

    Returns:
        The computed result of the DataFrame's ``describe()``.
    """
    lazy_summary = self._ddf.describe()
    return lazy_summary.compute()
|
|
961
|
+
|
|
962
|
+
# -------------------------------------------------------------------------
|
|
963
|
+
# Context Manager
|
|
964
|
+
# -------------------------------------------------------------------------
|
|
965
|
+
|
|
966
|
+
def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
    """Leave the context by delegating to the parent class's ``__exit__``.

    Args:
        exc_type: Exception type, forwarded unchanged.
        exc_val: Exception value, forwarded unchanged.
        exc_tb: Traceback, forwarded unchanged.
    """
    # The client is deliberately left open here: it may be shared, so its
    # lifecycle is the caller's responsibility.
    super().__exit__(exc_type, exc_val, exc_tb)
|
|
971
|
+
|
|
972
|
+
def close(self) -> None:
    """Close the distributed client, if one is attached, and forget it."""
    if self._client is None:
        return
    self._client.close()
    self._client = None
|