truthound 1.0.8__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- truthound/__init__.py +162 -0
- truthound/adapters.py +100 -0
- truthound/api.py +365 -0
- truthound/audit/__init__.py +248 -0
- truthound/audit/core.py +967 -0
- truthound/audit/filters.py +620 -0
- truthound/audit/formatters.py +707 -0
- truthound/audit/logger.py +902 -0
- truthound/audit/middleware.py +571 -0
- truthound/audit/storage.py +1083 -0
- truthound/benchmark/__init__.py +123 -0
- truthound/benchmark/base.py +757 -0
- truthound/benchmark/comparison.py +635 -0
- truthound/benchmark/generators.py +706 -0
- truthound/benchmark/reporters.py +718 -0
- truthound/benchmark/runner.py +635 -0
- truthound/benchmark/scenarios.py +712 -0
- truthound/cache.py +252 -0
- truthound/checkpoint/__init__.py +136 -0
- truthound/checkpoint/actions/__init__.py +164 -0
- truthound/checkpoint/actions/base.py +324 -0
- truthound/checkpoint/actions/custom.py +234 -0
- truthound/checkpoint/actions/discord_notify.py +290 -0
- truthound/checkpoint/actions/email_notify.py +405 -0
- truthound/checkpoint/actions/github_action.py +406 -0
- truthound/checkpoint/actions/opsgenie.py +1499 -0
- truthound/checkpoint/actions/pagerduty.py +226 -0
- truthound/checkpoint/actions/slack_notify.py +233 -0
- truthound/checkpoint/actions/store_result.py +249 -0
- truthound/checkpoint/actions/teams_notify.py +1570 -0
- truthound/checkpoint/actions/telegram_notify.py +419 -0
- truthound/checkpoint/actions/update_docs.py +552 -0
- truthound/checkpoint/actions/webhook.py +293 -0
- truthound/checkpoint/analytics/__init__.py +147 -0
- truthound/checkpoint/analytics/aggregations/__init__.py +23 -0
- truthound/checkpoint/analytics/aggregations/rollup.py +481 -0
- truthound/checkpoint/analytics/aggregations/time_bucket.py +306 -0
- truthound/checkpoint/analytics/analyzers/__init__.py +17 -0
- truthound/checkpoint/analytics/analyzers/anomaly.py +386 -0
- truthound/checkpoint/analytics/analyzers/base.py +270 -0
- truthound/checkpoint/analytics/analyzers/forecast.py +421 -0
- truthound/checkpoint/analytics/analyzers/trend.py +314 -0
- truthound/checkpoint/analytics/models.py +292 -0
- truthound/checkpoint/analytics/protocols.py +549 -0
- truthound/checkpoint/analytics/service.py +718 -0
- truthound/checkpoint/analytics/stores/__init__.py +16 -0
- truthound/checkpoint/analytics/stores/base.py +306 -0
- truthound/checkpoint/analytics/stores/memory_store.py +353 -0
- truthound/checkpoint/analytics/stores/sqlite_store.py +557 -0
- truthound/checkpoint/analytics/stores/timescale_store.py +501 -0
- truthound/checkpoint/async_actions.py +794 -0
- truthound/checkpoint/async_base.py +708 -0
- truthound/checkpoint/async_checkpoint.py +617 -0
- truthound/checkpoint/async_runner.py +639 -0
- truthound/checkpoint/checkpoint.py +527 -0
- truthound/checkpoint/ci/__init__.py +61 -0
- truthound/checkpoint/ci/detector.py +355 -0
- truthound/checkpoint/ci/reporter.py +436 -0
- truthound/checkpoint/ci/templates.py +454 -0
- truthound/checkpoint/circuitbreaker/__init__.py +133 -0
- truthound/checkpoint/circuitbreaker/breaker.py +542 -0
- truthound/checkpoint/circuitbreaker/core.py +252 -0
- truthound/checkpoint/circuitbreaker/detection.py +459 -0
- truthound/checkpoint/circuitbreaker/middleware.py +389 -0
- truthound/checkpoint/circuitbreaker/registry.py +357 -0
- truthound/checkpoint/distributed/__init__.py +139 -0
- truthound/checkpoint/distributed/backends/__init__.py +35 -0
- truthound/checkpoint/distributed/backends/celery_backend.py +503 -0
- truthound/checkpoint/distributed/backends/kubernetes_backend.py +696 -0
- truthound/checkpoint/distributed/backends/local_backend.py +397 -0
- truthound/checkpoint/distributed/backends/ray_backend.py +625 -0
- truthound/checkpoint/distributed/base.py +774 -0
- truthound/checkpoint/distributed/orchestrator.py +765 -0
- truthound/checkpoint/distributed/protocols.py +842 -0
- truthound/checkpoint/distributed/registry.py +449 -0
- truthound/checkpoint/idempotency/__init__.py +120 -0
- truthound/checkpoint/idempotency/core.py +295 -0
- truthound/checkpoint/idempotency/fingerprint.py +454 -0
- truthound/checkpoint/idempotency/locking.py +604 -0
- truthound/checkpoint/idempotency/service.py +592 -0
- truthound/checkpoint/idempotency/stores.py +653 -0
- truthound/checkpoint/monitoring/__init__.py +134 -0
- truthound/checkpoint/monitoring/aggregators/__init__.py +15 -0
- truthound/checkpoint/monitoring/aggregators/base.py +372 -0
- truthound/checkpoint/monitoring/aggregators/realtime.py +300 -0
- truthound/checkpoint/monitoring/aggregators/window.py +493 -0
- truthound/checkpoint/monitoring/collectors/__init__.py +17 -0
- truthound/checkpoint/monitoring/collectors/base.py +257 -0
- truthound/checkpoint/monitoring/collectors/memory_collector.py +617 -0
- truthound/checkpoint/monitoring/collectors/prometheus_collector.py +451 -0
- truthound/checkpoint/monitoring/collectors/redis_collector.py +518 -0
- truthound/checkpoint/monitoring/events.py +410 -0
- truthound/checkpoint/monitoring/protocols.py +636 -0
- truthound/checkpoint/monitoring/service.py +578 -0
- truthound/checkpoint/monitoring/views/__init__.py +17 -0
- truthound/checkpoint/monitoring/views/base.py +172 -0
- truthound/checkpoint/monitoring/views/queue_view.py +220 -0
- truthound/checkpoint/monitoring/views/task_view.py +240 -0
- truthound/checkpoint/monitoring/views/worker_view.py +263 -0
- truthound/checkpoint/registry.py +337 -0
- truthound/checkpoint/runner.py +356 -0
- truthound/checkpoint/transaction/__init__.py +133 -0
- truthound/checkpoint/transaction/base.py +389 -0
- truthound/checkpoint/transaction/compensatable.py +537 -0
- truthound/checkpoint/transaction/coordinator.py +576 -0
- truthound/checkpoint/transaction/executor.py +622 -0
- truthound/checkpoint/transaction/idempotency.py +534 -0
- truthound/checkpoint/transaction/saga/__init__.py +143 -0
- truthound/checkpoint/transaction/saga/builder.py +584 -0
- truthound/checkpoint/transaction/saga/definition.py +515 -0
- truthound/checkpoint/transaction/saga/event_store.py +542 -0
- truthound/checkpoint/transaction/saga/patterns.py +833 -0
- truthound/checkpoint/transaction/saga/runner.py +718 -0
- truthound/checkpoint/transaction/saga/state_machine.py +793 -0
- truthound/checkpoint/transaction/saga/strategies.py +780 -0
- truthound/checkpoint/transaction/saga/testing.py +886 -0
- truthound/checkpoint/triggers/__init__.py +58 -0
- truthound/checkpoint/triggers/base.py +237 -0
- truthound/checkpoint/triggers/event.py +385 -0
- truthound/checkpoint/triggers/schedule.py +355 -0
- truthound/cli.py +2358 -0
- truthound/cli_modules/__init__.py +124 -0
- truthound/cli_modules/advanced/__init__.py +45 -0
- truthound/cli_modules/advanced/benchmark.py +343 -0
- truthound/cli_modules/advanced/docs.py +225 -0
- truthound/cli_modules/advanced/lineage.py +209 -0
- truthound/cli_modules/advanced/ml.py +320 -0
- truthound/cli_modules/advanced/realtime.py +196 -0
- truthound/cli_modules/checkpoint/__init__.py +46 -0
- truthound/cli_modules/checkpoint/init.py +114 -0
- truthound/cli_modules/checkpoint/list.py +71 -0
- truthound/cli_modules/checkpoint/run.py +159 -0
- truthound/cli_modules/checkpoint/validate.py +67 -0
- truthound/cli_modules/common/__init__.py +71 -0
- truthound/cli_modules/common/errors.py +414 -0
- truthound/cli_modules/common/options.py +419 -0
- truthound/cli_modules/common/output.py +507 -0
- truthound/cli_modules/common/protocol.py +552 -0
- truthound/cli_modules/core/__init__.py +48 -0
- truthound/cli_modules/core/check.py +123 -0
- truthound/cli_modules/core/compare.py +104 -0
- truthound/cli_modules/core/learn.py +57 -0
- truthound/cli_modules/core/mask.py +77 -0
- truthound/cli_modules/core/profile.py +65 -0
- truthound/cli_modules/core/scan.py +61 -0
- truthound/cli_modules/profiler/__init__.py +51 -0
- truthound/cli_modules/profiler/auto_profile.py +175 -0
- truthound/cli_modules/profiler/metadata.py +107 -0
- truthound/cli_modules/profiler/suite.py +283 -0
- truthound/cli_modules/registry.py +431 -0
- truthound/cli_modules/scaffolding/__init__.py +89 -0
- truthound/cli_modules/scaffolding/base.py +631 -0
- truthound/cli_modules/scaffolding/commands.py +545 -0
- truthound/cli_modules/scaffolding/plugins.py +1072 -0
- truthound/cli_modules/scaffolding/reporters.py +594 -0
- truthound/cli_modules/scaffolding/validators.py +1127 -0
- truthound/common/__init__.py +18 -0
- truthound/common/resilience/__init__.py +130 -0
- truthound/common/resilience/bulkhead.py +266 -0
- truthound/common/resilience/circuit_breaker.py +516 -0
- truthound/common/resilience/composite.py +332 -0
- truthound/common/resilience/config.py +292 -0
- truthound/common/resilience/protocols.py +217 -0
- truthound/common/resilience/rate_limiter.py +404 -0
- truthound/common/resilience/retry.py +341 -0
- truthound/datadocs/__init__.py +260 -0
- truthound/datadocs/base.py +571 -0
- truthound/datadocs/builder.py +761 -0
- truthound/datadocs/charts.py +764 -0
- truthound/datadocs/dashboard/__init__.py +63 -0
- truthound/datadocs/dashboard/app.py +576 -0
- truthound/datadocs/dashboard/components.py +584 -0
- truthound/datadocs/dashboard/state.py +240 -0
- truthound/datadocs/engine/__init__.py +46 -0
- truthound/datadocs/engine/context.py +376 -0
- truthound/datadocs/engine/pipeline.py +618 -0
- truthound/datadocs/engine/registry.py +469 -0
- truthound/datadocs/exporters/__init__.py +49 -0
- truthound/datadocs/exporters/base.py +198 -0
- truthound/datadocs/exporters/html.py +178 -0
- truthound/datadocs/exporters/json_exporter.py +253 -0
- truthound/datadocs/exporters/markdown.py +284 -0
- truthound/datadocs/exporters/pdf.py +392 -0
- truthound/datadocs/i18n/__init__.py +86 -0
- truthound/datadocs/i18n/catalog.py +960 -0
- truthound/datadocs/i18n/formatting.py +505 -0
- truthound/datadocs/i18n/loader.py +256 -0
- truthound/datadocs/i18n/plurals.py +378 -0
- truthound/datadocs/renderers/__init__.py +42 -0
- truthound/datadocs/renderers/base.py +401 -0
- truthound/datadocs/renderers/custom.py +342 -0
- truthound/datadocs/renderers/jinja.py +697 -0
- truthound/datadocs/sections.py +736 -0
- truthound/datadocs/styles.py +931 -0
- truthound/datadocs/themes/__init__.py +101 -0
- truthound/datadocs/themes/base.py +336 -0
- truthound/datadocs/themes/default.py +417 -0
- truthound/datadocs/themes/enterprise.py +419 -0
- truthound/datadocs/themes/loader.py +336 -0
- truthound/datadocs/themes.py +301 -0
- truthound/datadocs/transformers/__init__.py +57 -0
- truthound/datadocs/transformers/base.py +268 -0
- truthound/datadocs/transformers/enrichers.py +544 -0
- truthound/datadocs/transformers/filters.py +447 -0
- truthound/datadocs/transformers/i18n.py +468 -0
- truthound/datadocs/versioning/__init__.py +62 -0
- truthound/datadocs/versioning/diff.py +639 -0
- truthound/datadocs/versioning/storage.py +497 -0
- truthound/datadocs/versioning/version.py +358 -0
- truthound/datasources/__init__.py +223 -0
- truthound/datasources/_async_protocols.py +222 -0
- truthound/datasources/_protocols.py +159 -0
- truthound/datasources/adapters.py +428 -0
- truthound/datasources/async_base.py +599 -0
- truthound/datasources/async_factory.py +511 -0
- truthound/datasources/base.py +516 -0
- truthound/datasources/factory.py +433 -0
- truthound/datasources/nosql/__init__.py +47 -0
- truthound/datasources/nosql/base.py +487 -0
- truthound/datasources/nosql/elasticsearch.py +801 -0
- truthound/datasources/nosql/mongodb.py +636 -0
- truthound/datasources/pandas_optimized.py +582 -0
- truthound/datasources/pandas_source.py +216 -0
- truthound/datasources/polars_source.py +395 -0
- truthound/datasources/spark_source.py +479 -0
- truthound/datasources/sql/__init__.py +154 -0
- truthound/datasources/sql/base.py +710 -0
- truthound/datasources/sql/bigquery.py +410 -0
- truthound/datasources/sql/cloud_base.py +199 -0
- truthound/datasources/sql/databricks.py +471 -0
- truthound/datasources/sql/mysql.py +316 -0
- truthound/datasources/sql/oracle.py +427 -0
- truthound/datasources/sql/postgresql.py +321 -0
- truthound/datasources/sql/redshift.py +479 -0
- truthound/datasources/sql/snowflake.py +439 -0
- truthound/datasources/sql/sqlite.py +286 -0
- truthound/datasources/sql/sqlserver.py +437 -0
- truthound/datasources/streaming/__init__.py +47 -0
- truthound/datasources/streaming/base.py +350 -0
- truthound/datasources/streaming/kafka.py +670 -0
- truthound/decorators.py +98 -0
- truthound/docs/__init__.py +69 -0
- truthound/docs/extractor.py +971 -0
- truthound/docs/generator.py +601 -0
- truthound/docs/parser.py +1037 -0
- truthound/docs/renderer.py +999 -0
- truthound/drift/__init__.py +22 -0
- truthound/drift/compare.py +189 -0
- truthound/drift/detectors.py +464 -0
- truthound/drift/report.py +160 -0
- truthound/execution/__init__.py +65 -0
- truthound/execution/_protocols.py +324 -0
- truthound/execution/base.py +576 -0
- truthound/execution/distributed/__init__.py +179 -0
- truthound/execution/distributed/aggregations.py +731 -0
- truthound/execution/distributed/arrow_bridge.py +817 -0
- truthound/execution/distributed/base.py +550 -0
- truthound/execution/distributed/dask_engine.py +976 -0
- truthound/execution/distributed/mixins.py +766 -0
- truthound/execution/distributed/protocols.py +756 -0
- truthound/execution/distributed/ray_engine.py +1127 -0
- truthound/execution/distributed/registry.py +446 -0
- truthound/execution/distributed/spark_engine.py +1011 -0
- truthound/execution/distributed/validator_adapter.py +682 -0
- truthound/execution/pandas_engine.py +401 -0
- truthound/execution/polars_engine.py +497 -0
- truthound/execution/pushdown/__init__.py +230 -0
- truthound/execution/pushdown/ast.py +1550 -0
- truthound/execution/pushdown/builder.py +1550 -0
- truthound/execution/pushdown/dialects.py +1072 -0
- truthound/execution/pushdown/executor.py +829 -0
- truthound/execution/pushdown/optimizer.py +1041 -0
- truthound/execution/sql_engine.py +518 -0
- truthound/infrastructure/__init__.py +189 -0
- truthound/infrastructure/audit.py +1515 -0
- truthound/infrastructure/config.py +1133 -0
- truthound/infrastructure/encryption.py +1132 -0
- truthound/infrastructure/logging.py +1503 -0
- truthound/infrastructure/metrics.py +1220 -0
- truthound/lineage/__init__.py +89 -0
- truthound/lineage/base.py +746 -0
- truthound/lineage/impact_analysis.py +474 -0
- truthound/lineage/integrations/__init__.py +22 -0
- truthound/lineage/integrations/openlineage.py +548 -0
- truthound/lineage/tracker.py +512 -0
- truthound/lineage/visualization/__init__.py +33 -0
- truthound/lineage/visualization/protocols.py +145 -0
- truthound/lineage/visualization/renderers/__init__.py +20 -0
- truthound/lineage/visualization/renderers/cytoscape.py +329 -0
- truthound/lineage/visualization/renderers/d3.py +331 -0
- truthound/lineage/visualization/renderers/graphviz.py +276 -0
- truthound/lineage/visualization/renderers/mermaid.py +308 -0
- truthound/maskers.py +113 -0
- truthound/ml/__init__.py +124 -0
- truthound/ml/anomaly_models/__init__.py +31 -0
- truthound/ml/anomaly_models/ensemble.py +362 -0
- truthound/ml/anomaly_models/isolation_forest.py +444 -0
- truthound/ml/anomaly_models/statistical.py +392 -0
- truthound/ml/base.py +1178 -0
- truthound/ml/drift_detection/__init__.py +26 -0
- truthound/ml/drift_detection/concept.py +381 -0
- truthound/ml/drift_detection/distribution.py +361 -0
- truthound/ml/drift_detection/feature.py +442 -0
- truthound/ml/drift_detection/multivariate.py +495 -0
- truthound/ml/monitoring/__init__.py +88 -0
- truthound/ml/monitoring/alerting/__init__.py +33 -0
- truthound/ml/monitoring/alerting/handlers.py +427 -0
- truthound/ml/monitoring/alerting/rules.py +508 -0
- truthound/ml/monitoring/collectors/__init__.py +19 -0
- truthound/ml/monitoring/collectors/composite.py +105 -0
- truthound/ml/monitoring/collectors/drift.py +324 -0
- truthound/ml/monitoring/collectors/performance.py +179 -0
- truthound/ml/monitoring/collectors/quality.py +369 -0
- truthound/ml/monitoring/monitor.py +536 -0
- truthound/ml/monitoring/protocols.py +451 -0
- truthound/ml/monitoring/stores/__init__.py +15 -0
- truthound/ml/monitoring/stores/memory.py +201 -0
- truthound/ml/monitoring/stores/prometheus.py +296 -0
- truthound/ml/rule_learning/__init__.py +25 -0
- truthound/ml/rule_learning/constraint_miner.py +443 -0
- truthound/ml/rule_learning/pattern_learner.py +499 -0
- truthound/ml/rule_learning/profile_learner.py +462 -0
- truthound/multitenancy/__init__.py +326 -0
- truthound/multitenancy/core.py +852 -0
- truthound/multitenancy/integration.py +597 -0
- truthound/multitenancy/isolation.py +630 -0
- truthound/multitenancy/manager.py +770 -0
- truthound/multitenancy/middleware.py +765 -0
- truthound/multitenancy/quota.py +537 -0
- truthound/multitenancy/resolvers.py +603 -0
- truthound/multitenancy/storage.py +703 -0
- truthound/observability/__init__.py +307 -0
- truthound/observability/context.py +531 -0
- truthound/observability/instrumentation.py +611 -0
- truthound/observability/logging.py +887 -0
- truthound/observability/metrics.py +1157 -0
- truthound/observability/tracing/__init__.py +178 -0
- truthound/observability/tracing/baggage.py +310 -0
- truthound/observability/tracing/config.py +426 -0
- truthound/observability/tracing/exporter.py +787 -0
- truthound/observability/tracing/integration.py +1018 -0
- truthound/observability/tracing/otel/__init__.py +146 -0
- truthound/observability/tracing/otel/adapter.py +982 -0
- truthound/observability/tracing/otel/bridge.py +1177 -0
- truthound/observability/tracing/otel/compat.py +681 -0
- truthound/observability/tracing/otel/config.py +691 -0
- truthound/observability/tracing/otel/detection.py +327 -0
- truthound/observability/tracing/otel/protocols.py +426 -0
- truthound/observability/tracing/processor.py +561 -0
- truthound/observability/tracing/propagator.py +757 -0
- truthound/observability/tracing/provider.py +569 -0
- truthound/observability/tracing/resource.py +515 -0
- truthound/observability/tracing/sampler.py +487 -0
- truthound/observability/tracing/span.py +676 -0
- truthound/plugins/__init__.py +198 -0
- truthound/plugins/base.py +599 -0
- truthound/plugins/cli.py +680 -0
- truthound/plugins/dependencies/__init__.py +42 -0
- truthound/plugins/dependencies/graph.py +422 -0
- truthound/plugins/dependencies/resolver.py +417 -0
- truthound/plugins/discovery.py +379 -0
- truthound/plugins/docs/__init__.py +46 -0
- truthound/plugins/docs/extractor.py +444 -0
- truthound/plugins/docs/renderer.py +499 -0
- truthound/plugins/enterprise_manager.py +877 -0
- truthound/plugins/examples/__init__.py +19 -0
- truthound/plugins/examples/custom_validators.py +317 -0
- truthound/plugins/examples/slack_notifier.py +312 -0
- truthound/plugins/examples/xml_reporter.py +254 -0
- truthound/plugins/hooks.py +558 -0
- truthound/plugins/lifecycle/__init__.py +43 -0
- truthound/plugins/lifecycle/hot_reload.py +402 -0
- truthound/plugins/lifecycle/manager.py +371 -0
- truthound/plugins/manager.py +736 -0
- truthound/plugins/registry.py +338 -0
- truthound/plugins/security/__init__.py +93 -0
- truthound/plugins/security/exceptions.py +332 -0
- truthound/plugins/security/policies.py +348 -0
- truthound/plugins/security/protocols.py +643 -0
- truthound/plugins/security/sandbox/__init__.py +45 -0
- truthound/plugins/security/sandbox/context.py +158 -0
- truthound/plugins/security/sandbox/engines/__init__.py +19 -0
- truthound/plugins/security/sandbox/engines/container.py +379 -0
- truthound/plugins/security/sandbox/engines/noop.py +144 -0
- truthound/plugins/security/sandbox/engines/process.py +336 -0
- truthound/plugins/security/sandbox/factory.py +211 -0
- truthound/plugins/security/signing/__init__.py +57 -0
- truthound/plugins/security/signing/service.py +330 -0
- truthound/plugins/security/signing/trust_store.py +368 -0
- truthound/plugins/security/signing/verifier.py +459 -0
- truthound/plugins/versioning/__init__.py +41 -0
- truthound/plugins/versioning/constraints.py +297 -0
- truthound/plugins/versioning/resolver.py +329 -0
- truthound/profiler/__init__.py +1729 -0
- truthound/profiler/_lazy.py +452 -0
- truthound/profiler/ab_testing/__init__.py +80 -0
- truthound/profiler/ab_testing/analysis.py +449 -0
- truthound/profiler/ab_testing/base.py +257 -0
- truthound/profiler/ab_testing/experiment.py +395 -0
- truthound/profiler/ab_testing/tracking.py +368 -0
- truthound/profiler/auto_threshold.py +1170 -0
- truthound/profiler/base.py +579 -0
- truthound/profiler/cache_patterns.py +911 -0
- truthound/profiler/caching.py +1303 -0
- truthound/profiler/column_profiler.py +712 -0
- truthound/profiler/comparison.py +1007 -0
- truthound/profiler/custom_patterns.py +1170 -0
- truthound/profiler/dashboard/__init__.py +50 -0
- truthound/profiler/dashboard/app.py +476 -0
- truthound/profiler/dashboard/components.py +457 -0
- truthound/profiler/dashboard/config.py +72 -0
- truthound/profiler/distributed/__init__.py +83 -0
- truthound/profiler/distributed/base.py +281 -0
- truthound/profiler/distributed/dask_backend.py +498 -0
- truthound/profiler/distributed/local_backend.py +293 -0
- truthound/profiler/distributed/profiler.py +304 -0
- truthound/profiler/distributed/ray_backend.py +374 -0
- truthound/profiler/distributed/spark_backend.py +375 -0
- truthound/profiler/distributed.py +1366 -0
- truthound/profiler/enterprise_sampling.py +1065 -0
- truthound/profiler/errors.py +488 -0
- truthound/profiler/evolution/__init__.py +91 -0
- truthound/profiler/evolution/alerts.py +426 -0
- truthound/profiler/evolution/changes.py +206 -0
- truthound/profiler/evolution/compatibility.py +365 -0
- truthound/profiler/evolution/detector.py +372 -0
- truthound/profiler/evolution/protocols.py +121 -0
- truthound/profiler/generators/__init__.py +48 -0
- truthound/profiler/generators/base.py +384 -0
- truthound/profiler/generators/ml_rules.py +375 -0
- truthound/profiler/generators/pattern_rules.py +384 -0
- truthound/profiler/generators/schema_rules.py +267 -0
- truthound/profiler/generators/stats_rules.py +324 -0
- truthound/profiler/generators/suite_generator.py +857 -0
- truthound/profiler/i18n.py +1542 -0
- truthound/profiler/incremental.py +554 -0
- truthound/profiler/incremental_validation.py +1710 -0
- truthound/profiler/integration/__init__.py +73 -0
- truthound/profiler/integration/adapters.py +345 -0
- truthound/profiler/integration/context.py +371 -0
- truthound/profiler/integration/executor.py +527 -0
- truthound/profiler/integration/naming.py +75 -0
- truthound/profiler/integration/protocols.py +243 -0
- truthound/profiler/memory.py +1185 -0
- truthound/profiler/migration/__init__.py +60 -0
- truthound/profiler/migration/base.py +345 -0
- truthound/profiler/migration/manager.py +444 -0
- truthound/profiler/migration/v1_0_to_v1_1.py +484 -0
- truthound/profiler/ml/__init__.py +73 -0
- truthound/profiler/ml/base.py +244 -0
- truthound/profiler/ml/classifier.py +507 -0
- truthound/profiler/ml/feature_extraction.py +604 -0
- truthound/profiler/ml/pretrained.py +448 -0
- truthound/profiler/ml_inference.py +1276 -0
- truthound/profiler/native_patterns.py +815 -0
- truthound/profiler/observability.py +1184 -0
- truthound/profiler/process_timeout.py +1566 -0
- truthound/profiler/progress.py +568 -0
- truthound/profiler/progress_callbacks.py +1734 -0
- truthound/profiler/quality.py +1345 -0
- truthound/profiler/resilience.py +1180 -0
- truthound/profiler/sampled_matcher.py +794 -0
- truthound/profiler/sampling.py +1288 -0
- truthound/profiler/scheduling/__init__.py +82 -0
- truthound/profiler/scheduling/protocols.py +214 -0
- truthound/profiler/scheduling/scheduler.py +474 -0
- truthound/profiler/scheduling/storage.py +457 -0
- truthound/profiler/scheduling/triggers.py +449 -0
- truthound/profiler/schema.py +603 -0
- truthound/profiler/streaming.py +685 -0
- truthound/profiler/streaming_patterns.py +1354 -0
- truthound/profiler/suite_cli.py +625 -0
- truthound/profiler/suite_config.py +789 -0
- truthound/profiler/suite_export.py +1268 -0
- truthound/profiler/table_profiler.py +547 -0
- truthound/profiler/timeout.py +565 -0
- truthound/profiler/validation.py +1532 -0
- truthound/profiler/visualization/__init__.py +118 -0
- truthound/profiler/visualization/base.py +346 -0
- truthound/profiler/visualization/generator.py +1259 -0
- truthound/profiler/visualization/plotly_renderer.py +811 -0
- truthound/profiler/visualization/renderers.py +669 -0
- truthound/profiler/visualization/sections.py +540 -0
- truthound/profiler/visualization.py +2122 -0
- truthound/profiler/yaml_validation.py +1151 -0
- truthound/py.typed +0 -0
- truthound/ratelimit/__init__.py +248 -0
- truthound/ratelimit/algorithms.py +1108 -0
- truthound/ratelimit/core.py +573 -0
- truthound/ratelimit/integration.py +532 -0
- truthound/ratelimit/limiter.py +663 -0
- truthound/ratelimit/middleware.py +700 -0
- truthound/ratelimit/policy.py +792 -0
- truthound/ratelimit/storage.py +763 -0
- truthound/rbac/__init__.py +340 -0
- truthound/rbac/core.py +976 -0
- truthound/rbac/integration.py +760 -0
- truthound/rbac/manager.py +1052 -0
- truthound/rbac/middleware.py +842 -0
- truthound/rbac/policy.py +954 -0
- truthound/rbac/storage.py +878 -0
- truthound/realtime/__init__.py +141 -0
- truthound/realtime/adapters/__init__.py +43 -0
- truthound/realtime/adapters/base.py +533 -0
- truthound/realtime/adapters/kafka.py +487 -0
- truthound/realtime/adapters/kinesis.py +479 -0
- truthound/realtime/adapters/mock.py +243 -0
- truthound/realtime/base.py +553 -0
- truthound/realtime/factory.py +382 -0
- truthound/realtime/incremental.py +660 -0
- truthound/realtime/processing/__init__.py +67 -0
- truthound/realtime/processing/exactly_once.py +575 -0
- truthound/realtime/processing/state.py +547 -0
- truthound/realtime/processing/windows.py +647 -0
- truthound/realtime/protocols.py +569 -0
- truthound/realtime/streaming.py +605 -0
- truthound/realtime/testing/__init__.py +32 -0
- truthound/realtime/testing/containers.py +615 -0
- truthound/realtime/testing/fixtures.py +484 -0
- truthound/report.py +280 -0
- truthound/reporters/__init__.py +46 -0
- truthound/reporters/_protocols.py +30 -0
- truthound/reporters/base.py +324 -0
- truthound/reporters/ci/__init__.py +66 -0
- truthound/reporters/ci/azure.py +436 -0
- truthound/reporters/ci/base.py +509 -0
- truthound/reporters/ci/bitbucket.py +567 -0
- truthound/reporters/ci/circleci.py +547 -0
- truthound/reporters/ci/detection.py +364 -0
- truthound/reporters/ci/factory.py +182 -0
- truthound/reporters/ci/github.py +388 -0
- truthound/reporters/ci/gitlab.py +471 -0
- truthound/reporters/ci/jenkins.py +525 -0
- truthound/reporters/console_reporter.py +299 -0
- truthound/reporters/factory.py +211 -0
- truthound/reporters/html_reporter.py +524 -0
- truthound/reporters/json_reporter.py +256 -0
- truthound/reporters/markdown_reporter.py +280 -0
- truthound/reporters/sdk/__init__.py +174 -0
- truthound/reporters/sdk/builder.py +558 -0
- truthound/reporters/sdk/mixins.py +1150 -0
- truthound/reporters/sdk/schema.py +1493 -0
- truthound/reporters/sdk/templates.py +666 -0
- truthound/reporters/sdk/testing.py +968 -0
- truthound/scanners.py +170 -0
- truthound/scheduling/__init__.py +122 -0
- truthound/scheduling/cron.py +1136 -0
- truthound/scheduling/presets.py +212 -0
- truthound/schema.py +275 -0
- truthound/secrets/__init__.py +173 -0
- truthound/secrets/base.py +618 -0
- truthound/secrets/cloud.py +682 -0
- truthound/secrets/integration.py +507 -0
- truthound/secrets/manager.py +633 -0
- truthound/secrets/oidc/__init__.py +172 -0
- truthound/secrets/oidc/base.py +902 -0
- truthound/secrets/oidc/credential_provider.py +623 -0
- truthound/secrets/oidc/exchangers.py +1001 -0
- truthound/secrets/oidc/github/__init__.py +110 -0
- truthound/secrets/oidc/github/claims.py +718 -0
- truthound/secrets/oidc/github/enhanced_provider.py +693 -0
- truthound/secrets/oidc/github/trust_policy.py +742 -0
- truthound/secrets/oidc/github/verification.py +723 -0
- truthound/secrets/oidc/github/workflow.py +691 -0
- truthound/secrets/oidc/providers.py +825 -0
- truthound/secrets/providers.py +506 -0
- truthound/secrets/resolver.py +495 -0
- truthound/stores/__init__.py +177 -0
- truthound/stores/backends/__init__.py +18 -0
- truthound/stores/backends/_protocols.py +340 -0
- truthound/stores/backends/azure_blob.py +530 -0
- truthound/stores/backends/concurrent_filesystem.py +915 -0
- truthound/stores/backends/connection_pool.py +1365 -0
- truthound/stores/backends/database.py +743 -0
- truthound/stores/backends/filesystem.py +538 -0
- truthound/stores/backends/gcs.py +399 -0
- truthound/stores/backends/memory.py +354 -0
- truthound/stores/backends/s3.py +434 -0
- truthound/stores/backpressure/__init__.py +84 -0
- truthound/stores/backpressure/base.py +375 -0
- truthound/stores/backpressure/circuit_breaker.py +434 -0
- truthound/stores/backpressure/monitor.py +376 -0
- truthound/stores/backpressure/strategies.py +677 -0
- truthound/stores/base.py +551 -0
- truthound/stores/batching/__init__.py +65 -0
- truthound/stores/batching/base.py +305 -0
- truthound/stores/batching/buffer.py +370 -0
- truthound/stores/batching/store.py +248 -0
- truthound/stores/batching/writer.py +521 -0
- truthound/stores/caching/__init__.py +60 -0
- truthound/stores/caching/backends.py +684 -0
- truthound/stores/caching/base.py +356 -0
- truthound/stores/caching/store.py +305 -0
- truthound/stores/compression/__init__.py +193 -0
- truthound/stores/compression/adaptive.py +694 -0
- truthound/stores/compression/base.py +514 -0
- truthound/stores/compression/pipeline.py +868 -0
- truthound/stores/compression/providers.py +672 -0
- truthound/stores/compression/streaming.py +832 -0
- truthound/stores/concurrency/__init__.py +81 -0
- truthound/stores/concurrency/atomic.py +556 -0
- truthound/stores/concurrency/index.py +775 -0
- truthound/stores/concurrency/locks.py +576 -0
- truthound/stores/concurrency/manager.py +482 -0
- truthound/stores/encryption/__init__.py +297 -0
- truthound/stores/encryption/base.py +952 -0
- truthound/stores/encryption/keys.py +1191 -0
- truthound/stores/encryption/pipeline.py +903 -0
- truthound/stores/encryption/providers.py +953 -0
- truthound/stores/encryption/streaming.py +950 -0
- truthound/stores/expectations.py +227 -0
- truthound/stores/factory.py +246 -0
- truthound/stores/migration/__init__.py +75 -0
- truthound/stores/migration/base.py +480 -0
- truthound/stores/migration/manager.py +347 -0
- truthound/stores/migration/registry.py +382 -0
- truthound/stores/migration/store.py +559 -0
- truthound/stores/observability/__init__.py +106 -0
- truthound/stores/observability/audit.py +718 -0
- truthound/stores/observability/config.py +270 -0
- truthound/stores/observability/factory.py +208 -0
- truthound/stores/observability/metrics.py +636 -0
- truthound/stores/observability/protocols.py +410 -0
- truthound/stores/observability/store.py +570 -0
- truthound/stores/observability/tracing.py +784 -0
- truthound/stores/replication/__init__.py +76 -0
- truthound/stores/replication/base.py +260 -0
- truthound/stores/replication/monitor.py +269 -0
- truthound/stores/replication/store.py +439 -0
- truthound/stores/replication/syncer.py +391 -0
- truthound/stores/results.py +359 -0
- truthound/stores/retention/__init__.py +77 -0
- truthound/stores/retention/base.py +378 -0
- truthound/stores/retention/policies.py +621 -0
- truthound/stores/retention/scheduler.py +279 -0
- truthound/stores/retention/store.py +526 -0
- truthound/stores/streaming/__init__.py +138 -0
- truthound/stores/streaming/base.py +801 -0
- truthound/stores/streaming/database.py +984 -0
- truthound/stores/streaming/filesystem.py +719 -0
- truthound/stores/streaming/reader.py +629 -0
- truthound/stores/streaming/s3.py +843 -0
- truthound/stores/streaming/writer.py +790 -0
- truthound/stores/tiering/__init__.py +108 -0
- truthound/stores/tiering/base.py +462 -0
- truthound/stores/tiering/manager.py +249 -0
- truthound/stores/tiering/policies.py +692 -0
- truthound/stores/tiering/store.py +526 -0
- truthound/stores/versioning/__init__.py +56 -0
- truthound/stores/versioning/base.py +376 -0
- truthound/stores/versioning/store.py +660 -0
- truthound/stores/versioning/strategies.py +353 -0
- truthound/types.py +56 -0
- truthound/validators/__init__.py +774 -0
- truthound/validators/aggregate/__init__.py +27 -0
- truthound/validators/aggregate/central.py +116 -0
- truthound/validators/aggregate/extremes.py +116 -0
- truthound/validators/aggregate/spread.py +118 -0
- truthound/validators/aggregate/sum.py +64 -0
- truthound/validators/aggregate/type.py +78 -0
- truthound/validators/anomaly/__init__.py +93 -0
- truthound/validators/anomaly/base.py +431 -0
- truthound/validators/anomaly/ml_based.py +1190 -0
- truthound/validators/anomaly/multivariate.py +647 -0
- truthound/validators/anomaly/statistical.py +599 -0
- truthound/validators/base.py +1089 -0
- truthound/validators/business_rule/__init__.py +46 -0
- truthound/validators/business_rule/base.py +147 -0
- truthound/validators/business_rule/checksum.py +509 -0
- truthound/validators/business_rule/financial.py +526 -0
- truthound/validators/cache.py +733 -0
- truthound/validators/completeness/__init__.py +39 -0
- truthound/validators/completeness/conditional.py +73 -0
- truthound/validators/completeness/default.py +98 -0
- truthound/validators/completeness/empty.py +103 -0
- truthound/validators/completeness/nan.py +337 -0
- truthound/validators/completeness/null.py +152 -0
- truthound/validators/cross_table/__init__.py +17 -0
- truthound/validators/cross_table/aggregate.py +333 -0
- truthound/validators/cross_table/row_count.py +122 -0
- truthound/validators/datetime/__init__.py +29 -0
- truthound/validators/datetime/format.py +78 -0
- truthound/validators/datetime/freshness.py +269 -0
- truthound/validators/datetime/order.py +73 -0
- truthound/validators/datetime/parseable.py +185 -0
- truthound/validators/datetime/range.py +202 -0
- truthound/validators/datetime/timezone.py +69 -0
- truthound/validators/distribution/__init__.py +49 -0
- truthound/validators/distribution/distribution.py +128 -0
- truthound/validators/distribution/monotonic.py +119 -0
- truthound/validators/distribution/outlier.py +178 -0
- truthound/validators/distribution/quantile.py +80 -0
- truthound/validators/distribution/range.py +254 -0
- truthound/validators/distribution/set.py +125 -0
- truthound/validators/distribution/statistical.py +459 -0
- truthound/validators/drift/__init__.py +79 -0
- truthound/validators/drift/base.py +427 -0
- truthound/validators/drift/multi_feature.py +401 -0
- truthound/validators/drift/numeric.py +395 -0
- truthound/validators/drift/psi.py +446 -0
- truthound/validators/drift/statistical.py +510 -0
- truthound/validators/enterprise.py +1658 -0
- truthound/validators/geospatial/__init__.py +80 -0
- truthound/validators/geospatial/base.py +97 -0
- truthound/validators/geospatial/boundary.py +238 -0
- truthound/validators/geospatial/coordinate.py +351 -0
- truthound/validators/geospatial/distance.py +399 -0
- truthound/validators/geospatial/polygon.py +665 -0
- truthound/validators/i18n/__init__.py +308 -0
- truthound/validators/i18n/bidi.py +571 -0
- truthound/validators/i18n/catalogs.py +570 -0
- truthound/validators/i18n/dialects.py +763 -0
- truthound/validators/i18n/extended_catalogs.py +549 -0
- truthound/validators/i18n/formatting.py +1434 -0
- truthound/validators/i18n/loader.py +1020 -0
- truthound/validators/i18n/messages.py +521 -0
- truthound/validators/i18n/plural.py +683 -0
- truthound/validators/i18n/protocols.py +855 -0
- truthound/validators/i18n/tms.py +1162 -0
- truthound/validators/localization/__init__.py +53 -0
- truthound/validators/localization/base.py +122 -0
- truthound/validators/localization/chinese.py +362 -0
- truthound/validators/localization/japanese.py +275 -0
- truthound/validators/localization/korean.py +524 -0
- truthound/validators/memory/__init__.py +94 -0
- truthound/validators/memory/approximate_knn.py +506 -0
- truthound/validators/memory/base.py +547 -0
- truthound/validators/memory/sgd_online.py +719 -0
- truthound/validators/memory/streaming_ecdf.py +753 -0
- truthound/validators/ml_feature/__init__.py +54 -0
- truthound/validators/ml_feature/base.py +249 -0
- truthound/validators/ml_feature/correlation.py +299 -0
- truthound/validators/ml_feature/leakage.py +344 -0
- truthound/validators/ml_feature/null_impact.py +270 -0
- truthound/validators/ml_feature/scale.py +264 -0
- truthound/validators/multi_column/__init__.py +89 -0
- truthound/validators/multi_column/arithmetic.py +284 -0
- truthound/validators/multi_column/base.py +231 -0
- truthound/validators/multi_column/comparison.py +273 -0
- truthound/validators/multi_column/consistency.py +312 -0
- truthound/validators/multi_column/statistical.py +299 -0
- truthound/validators/optimization/__init__.py +164 -0
- truthound/validators/optimization/aggregation.py +563 -0
- truthound/validators/optimization/covariance.py +556 -0
- truthound/validators/optimization/geo.py +626 -0
- truthound/validators/optimization/graph.py +587 -0
- truthound/validators/optimization/orchestrator.py +970 -0
- truthound/validators/optimization/profiling.py +1312 -0
- truthound/validators/privacy/__init__.py +223 -0
- truthound/validators/privacy/base.py +635 -0
- truthound/validators/privacy/ccpa.py +670 -0
- truthound/validators/privacy/gdpr.py +728 -0
- truthound/validators/privacy/global_patterns.py +604 -0
- truthound/validators/privacy/plugins.py +867 -0
- truthound/validators/profiling/__init__.py +52 -0
- truthound/validators/profiling/base.py +175 -0
- truthound/validators/profiling/cardinality.py +312 -0
- truthound/validators/profiling/entropy.py +391 -0
- truthound/validators/profiling/frequency.py +455 -0
- truthound/validators/pushdown_support.py +660 -0
- truthound/validators/query/__init__.py +91 -0
- truthound/validators/query/aggregate.py +346 -0
- truthound/validators/query/base.py +246 -0
- truthound/validators/query/column.py +249 -0
- truthound/validators/query/expression.py +274 -0
- truthound/validators/query/result.py +323 -0
- truthound/validators/query/row_count.py +264 -0
- truthound/validators/referential/__init__.py +80 -0
- truthound/validators/referential/base.py +395 -0
- truthound/validators/referential/cascade.py +391 -0
- truthound/validators/referential/circular.py +563 -0
- truthound/validators/referential/foreign_key.py +624 -0
- truthound/validators/referential/orphan.py +485 -0
- truthound/validators/registry.py +112 -0
- truthound/validators/schema/__init__.py +41 -0
- truthound/validators/schema/column_count.py +142 -0
- truthound/validators/schema/column_exists.py +80 -0
- truthound/validators/schema/column_order.py +82 -0
- truthound/validators/schema/column_pair.py +85 -0
- truthound/validators/schema/column_pair_set.py +195 -0
- truthound/validators/schema/column_type.py +94 -0
- truthound/validators/schema/multi_column.py +53 -0
- truthound/validators/schema/multi_column_aggregate.py +175 -0
- truthound/validators/schema/referential.py +274 -0
- truthound/validators/schema/table_schema.py +91 -0
- truthound/validators/schema_validator.py +219 -0
- truthound/validators/sdk/__init__.py +250 -0
- truthound/validators/sdk/builder.py +680 -0
- truthound/validators/sdk/decorators.py +474 -0
- truthound/validators/sdk/enterprise/__init__.py +211 -0
- truthound/validators/sdk/enterprise/docs.py +725 -0
- truthound/validators/sdk/enterprise/fuzzing.py +659 -0
- truthound/validators/sdk/enterprise/licensing.py +709 -0
- truthound/validators/sdk/enterprise/manager.py +543 -0
- truthound/validators/sdk/enterprise/resources.py +628 -0
- truthound/validators/sdk/enterprise/sandbox.py +766 -0
- truthound/validators/sdk/enterprise/signing.py +603 -0
- truthound/validators/sdk/enterprise/templates.py +865 -0
- truthound/validators/sdk/enterprise/versioning.py +659 -0
- truthound/validators/sdk/templates.py +757 -0
- truthound/validators/sdk/testing.py +807 -0
- truthound/validators/security/__init__.py +181 -0
- truthound/validators/security/redos/__init__.py +182 -0
- truthound/validators/security/redos/core.py +861 -0
- truthound/validators/security/redos/cpu_monitor.py +593 -0
- truthound/validators/security/redos/cve_database.py +791 -0
- truthound/validators/security/redos/ml/__init__.py +155 -0
- truthound/validators/security/redos/ml/base.py +785 -0
- truthound/validators/security/redos/ml/datasets.py +618 -0
- truthound/validators/security/redos/ml/features.py +359 -0
- truthound/validators/security/redos/ml/models.py +1000 -0
- truthound/validators/security/redos/ml/predictor.py +507 -0
- truthound/validators/security/redos/ml/storage.py +632 -0
- truthound/validators/security/redos/ml/training.py +571 -0
- truthound/validators/security/redos/ml_analyzer.py +937 -0
- truthound/validators/security/redos/optimizer.py +674 -0
- truthound/validators/security/redos/profiler.py +682 -0
- truthound/validators/security/redos/re2_engine.py +709 -0
- truthound/validators/security/redos.py +886 -0
- truthound/validators/security/sql_security.py +1247 -0
- truthound/validators/streaming/__init__.py +126 -0
- truthound/validators/streaming/base.py +292 -0
- truthound/validators/streaming/completeness.py +210 -0
- truthound/validators/streaming/mixin.py +575 -0
- truthound/validators/streaming/range.py +308 -0
- truthound/validators/streaming/sources.py +846 -0
- truthound/validators/string/__init__.py +57 -0
- truthound/validators/string/casing.py +158 -0
- truthound/validators/string/charset.py +96 -0
- truthound/validators/string/format.py +501 -0
- truthound/validators/string/json.py +77 -0
- truthound/validators/string/json_schema.py +184 -0
- truthound/validators/string/length.py +104 -0
- truthound/validators/string/like_pattern.py +237 -0
- truthound/validators/string/regex.py +202 -0
- truthound/validators/string/regex_extended.py +435 -0
- truthound/validators/table/__init__.py +88 -0
- truthound/validators/table/base.py +78 -0
- truthound/validators/table/column_count.py +198 -0
- truthound/validators/table/freshness.py +362 -0
- truthound/validators/table/row_count.py +251 -0
- truthound/validators/table/schema.py +333 -0
- truthound/validators/table/size.py +285 -0
- truthound/validators/timeout/__init__.py +102 -0
- truthound/validators/timeout/advanced/__init__.py +247 -0
- truthound/validators/timeout/advanced/circuit_breaker.py +675 -0
- truthound/validators/timeout/advanced/prediction.py +773 -0
- truthound/validators/timeout/advanced/priority.py +618 -0
- truthound/validators/timeout/advanced/redis_backend.py +770 -0
- truthound/validators/timeout/advanced/retry.py +721 -0
- truthound/validators/timeout/advanced/sampling.py +788 -0
- truthound/validators/timeout/advanced/sla.py +661 -0
- truthound/validators/timeout/advanced/telemetry.py +804 -0
- truthound/validators/timeout/cascade.py +477 -0
- truthound/validators/timeout/deadline.py +657 -0
- truthound/validators/timeout/degradation.py +525 -0
- truthound/validators/timeout/distributed.py +597 -0
- truthound/validators/timeseries/__init__.py +89 -0
- truthound/validators/timeseries/base.py +326 -0
- truthound/validators/timeseries/completeness.py +617 -0
- truthound/validators/timeseries/gap.py +485 -0
- truthound/validators/timeseries/monotonic.py +310 -0
- truthound/validators/timeseries/seasonality.py +422 -0
- truthound/validators/timeseries/trend.py +510 -0
- truthound/validators/uniqueness/__init__.py +59 -0
- truthound/validators/uniqueness/approximate.py +475 -0
- truthound/validators/uniqueness/distinct_values.py +253 -0
- truthound/validators/uniqueness/duplicate.py +118 -0
- truthound/validators/uniqueness/primary_key.py +140 -0
- truthound/validators/uniqueness/unique.py +191 -0
- truthound/validators/uniqueness/within_record.py +599 -0
- truthound/validators/utils.py +756 -0
- truthound-1.0.8.dist-info/METADATA +474 -0
- truthound-1.0.8.dist-info/RECORD +877 -0
- truthound-1.0.8.dist-info/WHEEL +4 -0
- truthound-1.0.8.dist-info/entry_points.txt +2 -0
- truthound-1.0.8.dist-info/licenses/LICENSE +190 -0
|
@@ -0,0 +1,1011 @@
|
|
|
1
|
+
"""Spark-native execution engine for distributed data validation.
|
|
2
|
+
|
|
3
|
+
This module provides a Spark-native execution engine that:
|
|
4
|
+
- Executes validation operations directly on Spark DataFrames
|
|
5
|
+
- Avoids Polars conversion overhead for distributed operations
|
|
6
|
+
- Uses Arrow for efficient data transfer when conversion is needed
|
|
7
|
+
- Supports distributed aggregations with proper reduce semantics
|
|
8
|
+
|
|
9
|
+
Architecture:
|
|
10
|
+
┌─────────────────────────────────────────────────────────────────┐
|
|
11
|
+
│ SparkExecutionEngine │
|
|
12
|
+
│ │
|
|
13
|
+
│ ┌──────────────────────────────────────────────────────────┐ │
|
|
14
|
+
│ │ Native Spark Operations │ │
|
|
15
|
+
│ │ (count, aggregate, filter - no conversion overhead) │ │
|
|
16
|
+
│ └──────────────────────────────────────────────────────────┘ │
|
|
17
|
+
│ │ │
|
|
18
|
+
│ ▼ │
|
|
19
|
+
│ ┌──────────────────────────────────────────────────────────┐ │
|
|
20
|
+
│ │ Arrow Bridge (when needed) │ │
|
|
21
|
+
│ │ (zero-copy conversion to Polars for ML validators) │ │
|
|
22
|
+
│ └──────────────────────────────────────────────────────────┘ │
|
|
23
|
+
│ │ │
|
|
24
|
+
│ ▼ │
|
|
25
|
+
│ ┌──────────────────────────────────────────────────────────┐ │
|
|
26
|
+
│ │ Polars LazyFrame (fallback) │ │
|
|
27
|
+
│ │ (only for validators that require Polars operations) │ │
|
|
28
|
+
│ └──────────────────────────────────────────────────────────┘ │
|
|
29
|
+
│ │
|
|
30
|
+
└─────────────────────────────────────────────────────────────────┘
|
|
31
|
+
|
|
32
|
+
Example:
|
|
33
|
+
>>> from pyspark.sql import SparkSession
|
|
34
|
+
>>> from truthound.execution.distributed import SparkExecutionEngine
|
|
35
|
+
>>>
|
|
36
|
+
>>> spark = SparkSession.builder.getOrCreate()
|
|
37
|
+
>>> df = spark.read.parquet("large_data.parquet")
|
|
38
|
+
>>>
|
|
39
|
+
>>> # Create native Spark engine
|
|
40
|
+
>>> engine = SparkExecutionEngine.from_dataframe(df)
|
|
41
|
+
>>>
|
|
42
|
+
>>> # Native Spark operations (no conversion overhead)
|
|
43
|
+
>>> row_count = engine.count_rows()
|
|
44
|
+
>>> null_counts = engine.count_nulls_all()
|
|
45
|
+
>>> stats = engine.get_stats("price")
|
|
46
|
+
>>>
|
|
47
|
+
>>> # Convert to Polars only when needed (via Arrow)
|
|
48
|
+
>>> lf = engine.to_polars_lazyframe()
|
|
49
|
+
"""
|
|
50
|
+
|
|
51
|
+
from __future__ import annotations
|
|
52
|
+
|
|
53
|
+
import logging
|
|
54
|
+
import time
|
|
55
|
+
from dataclasses import dataclass, field
|
|
56
|
+
from typing import TYPE_CHECKING, Any, Callable, Iterator
|
|
57
|
+
|
|
58
|
+
from truthound.execution.distributed.base import (
|
|
59
|
+
BaseDistributedEngine,
|
|
60
|
+
DistributedEngineConfig,
|
|
61
|
+
ExecutionMetrics,
|
|
62
|
+
)
|
|
63
|
+
from truthound.execution.distributed.protocols import (
|
|
64
|
+
AggregationScope,
|
|
65
|
+
AggregationSpec,
|
|
66
|
+
ComputeBackend,
|
|
67
|
+
DistributedResult,
|
|
68
|
+
PartitionInfo,
|
|
69
|
+
PartitionStrategy,
|
|
70
|
+
get_aggregator,
|
|
71
|
+
)
|
|
72
|
+
|
|
73
|
+
if TYPE_CHECKING:
|
|
74
|
+
import pyarrow as pa
|
|
75
|
+
from pyspark.sql import DataFrame as SparkDataFrame
|
|
76
|
+
from pyspark.sql import SparkSession
|
|
77
|
+
|
|
78
|
+
logger = logging.getLogger(__name__)
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
# =============================================================================
|
|
82
|
+
# Configuration
|
|
83
|
+
# =============================================================================
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
@dataclass
class SparkEngineConfig(DistributedEngineConfig):
    """Settings used by the Spark execution engine.

    Attributes:
        app_name: Name of the Spark application.
        master: URL of the Spark master; empty means an existing session
            is used.
        executor_memory: Amount of memory for each executor.
        driver_memory: Amount of memory for the driver.
        executor_cores: Number of cores for each executor.
        arrow_enabled: Whether Arrow optimization is turned on.
        adaptive_enabled: Whether adaptive query execution is turned on.
        broadcast_threshold: Threshold for broadcast joins, in bytes.
        shuffle_partitions: How many shuffle partitions to use.
        extra_spark_conf: Extra Spark configuration key/value pairs.
    """

    # Session identity
    app_name: str = "truthound-spark"
    master: str = ""  # Empty = use existing session

    # Resource sizing
    executor_memory: str = "4g"
    driver_memory: str = "2g"
    executor_cores: int = 2

    # Optimizer switches
    arrow_enabled: bool = True
    adaptive_enabled: bool = True

    # Join / shuffle tuning
    broadcast_threshold: int = 10 * 1024**2  # 10MB
    shuffle_partitions: int = 200

    # Free-form additions; a factory gives every instance its own dict
    extra_spark_conf: dict[str, str] = field(default_factory=dict)
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def _check_pyspark_available() -> None:
|
|
115
|
+
"""Check if PySpark is available."""
|
|
116
|
+
try:
|
|
117
|
+
import pyspark # noqa: F401
|
|
118
|
+
except ImportError:
|
|
119
|
+
raise ImportError(
|
|
120
|
+
"pyspark is required for SparkExecutionEngine. "
|
|
121
|
+
"Install with: pip install pyspark"
|
|
122
|
+
)
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
# =============================================================================
|
|
126
|
+
# Spark Execution Engine
|
|
127
|
+
# =============================================================================
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
class SparkExecutionEngine(BaseDistributedEngine[SparkEngineConfig]):
|
|
131
|
+
"""Spark-native execution engine for distributed validation.
|
|
132
|
+
|
|
133
|
+
This engine executes validation operations directly on Spark DataFrames,
|
|
134
|
+
avoiding the overhead of converting to Polars for operations that can
|
|
135
|
+
be performed natively in Spark.
|
|
136
|
+
|
|
137
|
+
Key Features:
|
|
138
|
+
- Native Spark aggregations (count, sum, avg, min, max, etc.)
|
|
139
|
+
- Distributed null/duplicate checking
|
|
140
|
+
- Arrow-based zero-copy conversion to Polars when needed
|
|
141
|
+
- Partition-aware operations
|
|
142
|
+
- Checkpoint support for fault tolerance
|
|
143
|
+
|
|
144
|
+
Example:
|
|
145
|
+
>>> engine = SparkExecutionEngine.from_dataframe(spark_df)
|
|
146
|
+
>>> null_counts = engine.count_nulls_all() # Native Spark
|
|
147
|
+
>>> lf = engine.to_polars_lazyframe() # Arrow-based conversion
|
|
148
|
+
"""
|
|
149
|
+
|
|
150
|
+
engine_type = "spark"
|
|
151
|
+
|
|
152
|
+
def __init__(
    self,
    spark_df: "SparkDataFrame",
    config: SparkEngineConfig | None = None,
    spark_session: "SparkSession | None" = None,
) -> None:
    """Set up the engine around a Spark DataFrame.

    Args:
        spark_df: The PySpark DataFrame to operate on.
        config: Engine configuration, if any.
        spark_session: Session to use; when not given, the DataFrame's
            own session is used.
    """
    _check_pyspark_available()
    super().__init__(config)

    # Keep the frame together with a snapshot of its schema and columns.
    self._df = spark_df
    self._schema = spark_df.schema
    self._columns = spark_df.columns

    # A supplied session takes precedence over the one attached to the frame.
    self._spark = spark_session if spark_session else spark_df.sparkSession

    # No row count is known yet.
    self._cached_row_count: int | None = None

    # Apply the engine configuration to the session.
    self._configure_spark()
|
|
176
|
+
|
|
177
|
+
@classmethod
def _default_config(cls) -> SparkEngineConfig:
    """Build a configuration object populated with default values."""
    defaults = SparkEngineConfig()
    return defaults
|
|
181
|
+
|
|
182
|
+
def _configure_spark(self) -> None:
|
|
183
|
+
"""Configure Spark session for optimal performance."""
|
|
184
|
+
if self._config.arrow_enabled:
|
|
185
|
+
self._spark.conf.set(
|
|
186
|
+
"spark.sql.execution.arrow.pyspark.enabled",
|
|
187
|
+
"true",
|
|
188
|
+
)
|
|
189
|
+
self._spark.conf.set(
|
|
190
|
+
"spark.sql.execution.arrow.pyspark.fallback.enabled",
|
|
191
|
+
"true",
|
|
192
|
+
)
|
|
193
|
+
|
|
194
|
+
if self._config.adaptive_enabled:
|
|
195
|
+
self._spark.conf.set(
|
|
196
|
+
"spark.sql.adaptive.enabled",
|
|
197
|
+
"true",
|
|
198
|
+
)
|
|
199
|
+
|
|
200
|
+
self._spark.conf.set(
|
|
201
|
+
"spark.sql.autoBroadcastJoinThreshold",
|
|
202
|
+
str(self._config.broadcast_threshold),
|
|
203
|
+
)
|
|
204
|
+
|
|
205
|
+
self._spark.conf.set(
|
|
206
|
+
"spark.sql.shuffle.partitions",
|
|
207
|
+
str(self._config.shuffle_partitions),
|
|
208
|
+
)
|
|
209
|
+
|
|
210
|
+
# Apply extra configurations
|
|
211
|
+
for key, value in self._config.extra_spark_conf.items():
|
|
212
|
+
self._spark.conf.set(key, value)
|
|
213
|
+
|
|
214
|
+
# -------------------------------------------------------------------------
|
|
215
|
+
# Factory Methods
|
|
216
|
+
# -------------------------------------------------------------------------
|
|
217
|
+
|
|
218
|
+
@classmethod
def from_dataframe(
    cls,
    df: "SparkDataFrame",
    config: SparkEngineConfig | None = None,
) -> "SparkExecutionEngine":
    """Build an engine around a Spark DataFrame that already exists.

    Args:
        df: The PySpark DataFrame to wrap.
        config: Engine configuration, if any.

    Returns:
        The newly constructed engine.
    """
    engine = cls(df, config)
    return engine
|
|
234
|
+
|
|
235
|
+
@classmethod
def from_table(
    cls,
    spark: "SparkSession",
    table_name: str,
    database: str | None = None,
    config: SparkEngineConfig | None = None,
) -> "SparkExecutionEngine":
    """Build an engine that reads from a Spark table.

    Args:
        spark: Session used to look the table up.
        table_name: Name of the table.
        database: Database name; when given it is prefixed to the table
            name with a dot.
        config: Engine configuration, if any.

    Returns:
        The newly constructed engine.
    """
    _check_pyspark_available()

    # Qualify the table with its database only when one was supplied.
    if database:
        qualified = f"{database}.{table_name}"
    else:
        qualified = table_name

    return cls(spark.table(qualified), config, spark)
|
|
260
|
+
|
|
261
|
+
@classmethod
def from_parquet(
    cls,
    spark: "SparkSession",
    path: str,
    config: SparkEngineConfig | None = None,
) -> "SparkExecutionEngine":
    """Build an engine that reads Parquet data.

    Args:
        spark: Session used to read the files.
        path: Location of the Parquet files.
        config: Engine configuration, if any.

    Returns:
        The newly constructed engine.
    """
    _check_pyspark_available()

    reader = spark.read
    parquet_df = reader.parquet(path)
    return cls(parquet_df, config, spark)
|
|
282
|
+
|
|
283
|
+
# -------------------------------------------------------------------------
|
|
284
|
+
# Properties
|
|
285
|
+
# -------------------------------------------------------------------------
|
|
286
|
+
|
|
287
|
+
@property
def backend_type(self) -> ComputeBackend:
    """Compute backend used by this engine (always ``ComputeBackend.SPARK``)."""
    backend = ComputeBackend.SPARK
    return backend
|
|
291
|
+
|
|
292
|
+
@property
def spark_dataframe(self) -> "SparkDataFrame":
    """Spark DataFrame wrapped by this engine."""
    frame = self._df
    return frame
|
|
296
|
+
|
|
297
|
+
@property
def spark_session(self) -> "SparkSession":
    """Spark session used by this engine."""
    session = self._spark
    return session
|
|
301
|
+
|
|
302
|
+
@property
def supports_sql_pushdown(self) -> bool:
    """Whether SQL pushdown is supported; always ``True`` for this engine."""
    pushdown_supported = True
    return pushdown_supported
|
|
306
|
+
|
|
307
|
+
# -------------------------------------------------------------------------
|
|
308
|
+
# Abstract Method Implementations
|
|
309
|
+
# -------------------------------------------------------------------------
|
|
310
|
+
|
|
311
|
+
def _get_partition_count(self) -> int:
|
|
312
|
+
"""Get number of data partitions."""
|
|
313
|
+
return self._df.rdd.getNumPartitions()
|
|
314
|
+
|
|
315
|
+
def _get_partition_info(self) -> list[PartitionInfo]:
    """Describe every partition of the data.

    Returns:
        One ``PartitionInfo`` per partition, each carrying its index, the
        total partition count, and the column names as a tuple.
    """
    total = self._get_partition_count()
    column_names = tuple(self._columns)

    infos = []
    for index in range(total):
        info = PartitionInfo(
            partition_id=index,
            total_partitions=total,
            columns=column_names,
        )
        infos.append(info)
    return infos
|
|
328
|
+
|
|
329
|
+
def _execute_on_partitions(
    self,
    operation: str,
    func: Callable[[Iterator[Any]], Iterator[dict[str, Any]]],
    columns: list[str] | None = None,
) -> list[DistributedResult]:
    """Execute function on all partitions using mapPartitions.

    Each dict yielded by ``func`` is wrapped in a ``DistributedResult``. The
    keys read from it are ``value``, ``row_count``, ``duration_ms``,
    ``errors`` and ``metadata``; missing keys fall back to ``None``, ``0``,
    ``0``, ``[]`` and ``{}`` respectively.

    Args:
        operation: Operation name for metrics.
        func: Function to apply to each partition.
        columns: Columns to include (None or empty = all).

    Returns:
        Results from all partitions.

    Raises:
        Exception: Any error raised while running the operation is recorded
            on the metrics object and re-raised unchanged.
    """
    metrics = self._start_metrics(operation)

    try:
        df = self._df
        if columns:
            df = df.select(*columns)

        # Execute on partitions and bring every yielded dict to the driver.
        raw_results = df.rdd.mapPartitions(func).collect()

        results = []
        total_rows = 0
        # NOTE: ``partition_id`` is the position of the dict in the collected
        # list. It matches the real partition index only when ``func`` yields
        # exactly one dict per partition.
        for i, result_dict in enumerate(raw_results):
            row_count = result_dict.get("row_count", 0)
            total_rows += row_count
            results.append(
                DistributedResult(
                    partition_id=i,
                    operation=operation,
                    value=result_dict.get("value"),
                    row_count=row_count,
                    duration_ms=result_dict.get("duration_ms", 0),
                    errors=result_dict.get("errors", []),
                    metadata=result_dict.get("metadata", {}),
                )
            )

        metrics.partitions_processed = len(results)
        metrics.rows_processed = total_rows

        return results

    except Exception as e:
        metrics.errors.append(str(e))
        raise
    finally:
        # Metrics are closed on both the success and the failure path.
        self._end_metrics(metrics)
|
|
385
|
+
|
|
386
|
+
def _aggregate_distributed(
    self,
    spec: AggregationSpec,
) -> dict[str, Any]:
    """Perform distributed aggregation using native Spark operations.

    Aggregations are split into three groups:

    * operations with a direct Spark function (count, sum, mean, min, max,
      std, var), evaluated together in a single ``agg`` call;
    * ``minmax``, evaluated natively and reported as a
      ``{"min": ..., "max": ...}`` dict;
    * everything else: ``null_count`` and ``distinct_count`` use native
      Spark calls, any other operation goes through
      ``_aggregate_with_aggregator``.

    Args:
        spec: Aggregation specification.

    Returns:
        Mapping from each aggregation's alias to its result.

    Raises:
        Exception: Any error raised while aggregating is recorded on the
            metrics object and re-raised unchanged.
    """
    from pyspark.sql import functions as F

    metrics = self._start_metrics("aggregate")

    try:
        results: dict[str, Any] = {}

        spark_agg_funcs = {
            "count": lambda c: F.count(F.lit(1)) if c == "*" else F.count(c),
            "sum": F.sum,
            "mean": F.avg,
            "min": F.min,
            "max": F.max,
            "std": F.stddev,
            "var": F.variance,
        }

        # Group aggregations by how they are evaluated. "minmax" gets its own
        # bucket so it is computed exactly once; leaving it in the custom
        # bucket would recompute it via map-reduce and overwrite the native
        # result.
        spark_aggs = []
        minmax_aggs = []
        custom_aggs = []
        for agg in spec.aggregations:
            if agg.operation in spark_agg_funcs:
                spark_aggs.append(agg)
            elif agg.operation == "minmax":
                minmax_aggs.append(agg)
            else:
                custom_aggs.append(agg)

        # Execute native Spark aggregations in one batch.
        if spark_aggs:
            exprs = [
                spark_agg_funcs[agg.operation](agg.column).alias(agg.alias)
                for agg in spark_aggs
            ]

            if spec.group_by:
                agg_df = self._df.groupBy(*spec.group_by).agg(*exprs)
            else:
                agg_df = self._df.agg(*exprs)

            # NOTE(review): with ``group_by`` only the first collected group
            # row is reported; confirm whether per-group results are wanted.
            rows = agg_df.collect()
            # A grouped aggregation over empty data yields no rows at all;
            # report None instead of failing with IndexError.
            row = rows[0] if rows else None
            for agg in spark_aggs:
                results[agg.alias] = row[agg.alias] if row is not None else None

        # minmax: fetch both bounds in a single Spark job.
        for agg in minmax_aggs:
            bounds = self._df.agg(
                F.min(agg.column),
                F.max(agg.column),
            ).collect()[0]
            results[agg.alias] = {"min": bounds[0], "max": bounds[1]}

        for agg in custom_aggs:
            if agg.operation == "null_count":
                # Native Spark null count
                null_count = self._df.filter(F.col(agg.column).isNull()).count()
                total_count = self._df.count()
                results[agg.alias] = {
                    "null_count": null_count,
                    "total_count": total_count,
                }
            elif agg.operation == "distinct_count":
                # Native Spark distinct count
                distinct_count = self._df.select(agg.column).distinct().count()
                results[agg.alias] = distinct_count
            else:
                # Use custom aggregator via map-reduce
                results[agg.alias] = self._aggregate_with_aggregator(agg)

        return results

    except Exception as e:
        metrics.errors.append(str(e))
        raise
    finally:
        # Metrics are closed on both the success and the failure path.
        self._end_metrics(metrics)
|
|
480
|
+
|
|
481
|
+
def _aggregate_with_aggregator(
    self,
    agg: Any,
) -> Any:
    """Perform aggregation using custom aggregator via map-reduce.

    The aggregator is obtained from ``get_aggregator`` using the
    aggregation's ``operation`` and ``params``. Each partition folds its rows
    into one partial state; the partial states are then merged on the driver
    and finalized.

    Args:
        agg: Aggregation specification; ``operation``, ``params`` and
            ``column`` are read from it.

    Returns:
        Aggregated result, as produced by the aggregator's ``finalize``.
    """
    aggregator = get_aggregator(agg.operation, **agg.params)
    column = agg.column

    # Every row shares the DataFrame's field names, so decide once on the
    # driver whether the column exists instead of building ``row.asDict()``
    # for each row. Only plain locals are captured below, so the engine
    # itself is not pulled into the closure.
    has_column = column in self._df.columns

    # Map phase: compute partial aggregates per partition
    def map_partition(iterator: Iterator) -> Iterator:
        state = aggregator.initialize()
        for row in iterator:
            # A column that is not present contributes None for every row.
            value = row[column] if has_column else None
            state = aggregator.accumulate(state, value)
        yield state

    partial_results = self._df.rdd.mapPartitions(map_partition).collect()

    # Reduce phase: merge all partial results
    if not partial_results:
        # Nothing came back, so finalize a fresh, untouched state.
        return aggregator.finalize(aggregator.initialize())

    final_state = partial_results[0]
    for state in partial_results[1:]:
        final_state = aggregator.merge(final_state, state)

    return aggregator.finalize(final_state)
|
|
515
|
+
|
|
516
|
+
def _to_arrow_batches(
    self,
    batch_size: int | None = None,
) -> list["pa.RecordBatch"]:
    """Turn the Spark DataFrame into Arrow record batches.

    Three strategies are tried in order:

    1. ``DataFrame._collect_as_arrow()``; its result is returned as-is, so
       ``batch_size`` is not applied on this path.
    2. If that attribute is missing, ``toPandas()`` followed by
       ``pa.Table.from_pandas`` and ``to_batches``.
    3. If step 2 fails for any reason, ``_manual_arrow_conversion``.

    Args:
        batch_size: Maximum rows per batch for the fallback paths; when
            falsy, ``arrow_batch_size`` from the config is used.

    Returns:
        The Arrow record batches.
    """
    import pyarrow as pa

    chunk_rows = batch_size or self._config.arrow_batch_size

    # Preferred path: let Spark hand over Arrow data directly (Spark 3.0+).
    try:
        return self._df._collect_as_arrow()
    except AttributeError:
        logger.debug("Falling back to Pandas-based Arrow conversion")

    # Second choice: go through a Pandas frame.
    try:
        pandas_df = self._df.toPandas()
        arrow_table = pa.Table.from_pandas(pandas_df)
        return arrow_table.to_batches(max_chunksize=chunk_rows)
    except Exception as e:
        logger.warning(f"Arrow conversion failed: {e}")
        # Last resort: assemble the batches by hand.
        return self._manual_arrow_conversion(chunk_rows)
|
|
553
|
+
|
|
554
|
+
def _manual_arrow_conversion(
    self,
    batch_size: int,
) -> list["pa.RecordBatch"]:
    """Build Arrow record batches row by row.

    All partitions are collected, and each non-empty partition is cut into
    slices of at most ``batch_size`` rows. Every slice becomes one record
    batch typed with the schema from ``_infer_arrow_schema``.

    Args:
        batch_size: Maximum number of rows per record batch. A falsy or
            non-positive value keeps each partition as a single batch.

    Returns:
        List of Arrow record batches (empty partitions produce none).
    """
    import pyarrow as pa

    schema = self._infer_arrow_schema()
    columns = self._columns
    batches = []

    # One Python list of rows per partition.
    partitions = self._df.rdd.mapPartitions(lambda it: [list(it)]).collect()

    for partition in partitions:
        if not partition:
            continue

        # Previously batch_size was ignored and a partition of any size
        # became one batch; honor the requested size here.
        if batch_size and batch_size > 0:
            step = batch_size
        else:
            step = len(partition)

        for start in range(0, len(partition), step):
            row_dicts = [row.asDict() for row in partition[start:start + step]]
            # Column-oriented layout; a missing key becomes None.
            data = {col: [d.get(col) for d in row_dicts] for col in columns}
            batches.append(pa.RecordBatch.from_pydict(data, schema=schema))

    return batches
|
|
590
|
+
|
|
591
|
+
def _infer_arrow_schema(self) -> "pa.Schema":
    """Translate the engine's Spark schema into an Arrow schema.

    Decimal fields keep their precision and scale. Spark types without an
    entry in the lookup table are mapped to Arrow strings. Field names and
    nullability are carried over unchanged.

    Returns:
        Arrow schema with one field per field of ``self._schema``.
    """
    import pyarrow as pa
    from pyspark.sql.types import (
        BooleanType,
        ByteType,
        DateType,
        DecimalType,
        DoubleType,
        FloatType,
        IntegerType,
        LongType,
        ShortType,
        StringType,
        TimestampType,
    )

    primitive_types = {
        BooleanType: pa.bool_(),
        ByteType: pa.int8(),
        ShortType: pa.int16(),
        IntegerType: pa.int32(),
        LongType: pa.int64(),
        FloatType: pa.float32(),
        DoubleType: pa.float64(),
        StringType: pa.string(),
        DateType: pa.date32(),
        TimestampType: pa.timestamp("us"),
    }

    def to_arrow(spark_type):
        # Decimals are parameterized, so they cannot live in the table.
        if isinstance(spark_type, DecimalType):
            return pa.decimal128(spark_type.precision, spark_type.scale)
        return primitive_types.get(type(spark_type), pa.string())

    return pa.schema(
        [
            pa.field(spark_field.name, to_arrow(spark_field.dataType), nullable=spark_field.nullable)
            for spark_field in self._schema.fields
        ]
    )
|
|
632
|
+
|
|
633
|
+
def _repartition(self, num_partitions: int) -> "SparkExecutionEngine":
    """Return an engine over a repartitioned copy of the DataFrame.

    Args:
        num_partitions: Partition count passed to ``DataFrame.repartition``.

    Returns:
        New engine wrapping the repartitioned DataFrame, with this
        engine's config and Spark session.
    """
    return SparkExecutionEngine(
        self._df.repartition(num_partitions),
        self._config,
        self._spark,
    )
|
|
644
|
+
|
|
645
|
+
def coalesce(self, num_partitions: int) -> "SparkExecutionEngine":
    """Return an engine over a coalesced copy of the DataFrame.

    Args:
        num_partitions: Partition count passed to ``DataFrame.coalesce``.

    Returns:
        New engine wrapping the coalesced DataFrame, with this engine's
        config and Spark session.
    """
    return SparkExecutionEngine(
        self._df.coalesce(num_partitions),
        self._config,
        self._spark,
    )
|
|
656
|
+
|
|
657
|
+
# -------------------------------------------------------------------------
|
|
658
|
+
# Core Operation Overrides (Native Spark)
|
|
659
|
+
# -------------------------------------------------------------------------
|
|
660
|
+
|
|
661
|
+
def count_rows(self) -> int:
    """Return the DataFrame's row count, computing it at most once.

    The instance attribute ``_cached_row_count`` is consulted first, then
    the engine cache; only when both miss is ``DataFrame.count`` run, and
    its result is stored in both places.

    Returns:
        Number of rows.
    """
    memoized = self._cached_row_count
    if memoized is not None:
        return memoized

    key = self._cache_key("count_rows")
    hit = self._get_cached(key)
    if hit is not None:
        return hit

    total = self._df.count()
    self._cached_row_count = total
    self._set_cached(key, total)
    return total
|
|
675
|
+
|
|
676
|
+
def get_columns(self) -> list[str]:
    """Return the engine's column names.

    Returns:
        The list stored in ``self._columns`` (the same object, not a copy).
    """
    names = self._columns
    return names
|
|
679
|
+
|
|
680
|
+
def count_nulls(self, column: str) -> int:
    """Count the rows whose value in ``column`` is null.

    The result is cached per column.

    Args:
        column: Name of the column to inspect.

    Returns:
        Number of null values in the column.
    """
    from pyspark.sql import functions as F

    key = self._cache_key("count_nulls", column)
    nulls = self._get_cached(key)
    if nulls is None:
        is_null = F.col(column).isNull()
        nulls = self._df.filter(is_null).count()
        self._set_cached(key, nulls)
    return nulls
|
|
692
|
+
|
|
693
|
+
def count_nulls_all(self) -> dict[str, int]:
    """Count nulls in every column with a single aggregation.

    One ``sum(when(isNull, 1).otherwise(0))`` expression is built per
    column and all of them are evaluated in one ``agg`` call. The result
    is cached.

    Returns:
        Mapping of column name to its null count. Empty when the engine
        has no columns.
    """
    cache_key = self._cache_key("count_nulls_all")
    cached = self._get_cached(cache_key)
    if cached is not None:
        return cached

    if not self._columns:
        # agg() requires at least one expression, so a column-less frame
        # must be answered without touching Spark.
        return {}

    # Imported only once Spark work is actually needed.
    from pyspark.sql import functions as F

    exprs = [
        F.sum(F.when(F.col(col).isNull(), 1).otherwise(0)).alias(f"{col}_nulls")
        for col in self._columns
    ]

    row = self._df.agg(*exprs).collect()[0]

    # sum() over zero rows yields null; report that as 0.
    result = {
        col: row[f"{col}_nulls"] or 0
        for col in self._columns
    }

    self._set_cached(cache_key, result)
    return result
|
|
717
|
+
|
|
718
|
+
def count_distinct(self, column: str) -> int:
    """Count the distinct values of ``column`` via ``countDistinct``.

    The result is cached per column.

    Args:
        column: Name of the column to inspect.

    Returns:
        Value of the single ``countDistinct`` aggregate.
    """
    from pyspark.sql import functions as F

    key = self._cache_key("count_distinct", column)
    distinct = self._get_cached(key)
    if distinct is None:
        first_row = self._df.select(F.countDistinct(column)).collect()[0]
        distinct = first_row[0]
        self._set_cached(key, distinct)
    return distinct
|
|
730
|
+
|
|
731
|
+
def get_stats(self, column: str) -> dict[str, Any]:
    """Compute summary statistics for ``column`` in one aggregation.

    The result is cached per column.

    Args:
        column: Name of the column to summarize.

    Returns:
        Dict with the keys ``count``, ``null_count``, ``mean``, ``std``,
        ``min`` and ``max``. A null ``null_count`` is reported as 0.
    """
    from pyspark.sql import functions as F

    key = self._cache_key("get_stats", column)
    hit = self._get_cached(key)
    if hit is not None:
        return hit

    null_flag = F.when(F.col(column).isNull(), 1).otherwise(0)
    aggregates = [
        F.count(column).alias("count"),
        F.sum(null_flag).alias("null_count"),
        F.avg(column).alias("mean"),
        F.stddev(column).alias("std"),
        F.min(column).alias("min"),
        F.max(column).alias("max"),
    ]
    summary = self._df.agg(*aggregates).collect()[0]

    stat_names = ("count", "null_count", "mean", "std", "min", "max")
    stats = {name: summary[name] for name in stat_names}
    stats["null_count"] = stats["null_count"] or 0

    self._set_cached(key, stats)
    return stats
|
|
761
|
+
|
|
762
|
+
def get_quantiles(
    self,
    column: str,
    quantiles: list[float],
) -> list[float]:
    """Approximate the requested quantiles of ``column``.

    Delegates to ``DataFrame.approxQuantile`` with a relative error of
    0.01 and caches the result per (column, quantiles) pair.

    Args:
        column: Name of the column to inspect.
        quantiles: Probabilities to evaluate.

    Returns:
        Whatever ``approxQuantile`` returns for those probabilities.
    """
    key = self._cache_key("get_quantiles", column, tuple(quantiles))
    values = self._get_cached(key)
    if values is None:
        values = self._df.approxQuantile(column, quantiles, 0.01)
        self._set_cached(key, values)
    return values
|
|
777
|
+
|
|
778
|
+
def get_value_counts(
    self,
    column: str,
    limit: int | None = None,
) -> dict[Any, int]:
    """Count occurrences of each value in ``column``.

    Rows are grouped by the column, counted, and ordered by descending
    count. The result is cached per (column, limit) pair.

    Args:
        column: Name of the column to group by.
        limit: Maximum number of groups to return. A falsy value
            (``None`` or 0) applies no limit.

    Returns:
        Mapping of value to its occurrence count.
    """
    from pyspark.sql import functions as F

    key = self._cache_key("get_value_counts", column, limit)
    hit = self._get_cached(key)
    if hit is not None:
        return hit

    grouped = self._df.groupBy(column).agg(F.count("*").alias("count"))
    ordered = grouped.orderBy(F.desc("count"))
    if limit:
        ordered = ordered.limit(limit)

    result = {}
    for record in ordered.collect():
        result[record[column]] = record["count"]

    self._set_cached(key, result)
    return result
|
|
805
|
+
|
|
806
|
+
def count_duplicates(self, columns: list[str]) -> int:
    """Count rows that repeat an earlier combination of ``columns``.

    Computed as the total row count minus the number of distinct
    combinations of the given columns. The result is cached per column
    tuple.

    Args:
        columns: Columns that together define a duplicate.

    Returns:
        Total rows minus distinct rows over ``columns``.
    """
    # The function-local pyspark.sql.functions import was unused and has
    # been removed; everything here goes through DataFrame methods.
    cache_key = self._cache_key("count_duplicates", tuple(columns))
    cached = self._get_cached(cache_key)
    if cached is not None:
        return cached

    total = self.count_rows()
    unique = self._df.select(columns).distinct().count()
    duplicates = total - unique

    self._set_cached(cache_key, duplicates)
    return duplicates
|
|
821
|
+
|
|
822
|
+
def count_matching_regex(self, column: str, pattern: str) -> int:
    """Count the rows whose ``column`` value matches ``pattern``.

    Matching uses the column's ``rlike`` operator. The result is cached
    per (column, pattern) pair.

    Args:
        column: Name of the column to test.
        pattern: Regular expression handed to ``rlike``.

    Returns:
        Number of matching rows.
    """
    from pyspark.sql import functions as F

    key = self._cache_key("count_matching_regex", column, pattern)
    matches = self._get_cached(key)
    if matches is None:
        predicate = F.col(column).rlike(pattern)
        matches = self._df.filter(predicate).count()
        self._set_cached(key, matches)
    return matches
|
|
834
|
+
|
|
835
|
+
def count_in_range(
    self,
    column: str,
    min_value: Any | None = None,
    max_value: Any | None = None,
    inclusive: bool = True,
) -> int:
    """Count the rows whose ``column`` value lies within the given bounds.

    The result is cached per argument combination.

    Args:
        column: Name of the column to test.
        min_value: Lower bound, or ``None`` for no lower bound.
        max_value: Upper bound, or ``None`` for no upper bound.
        inclusive: Whether both bounds are inclusive (``>=`` / ``<=``)
            rather than strict (``>`` / ``<``).

    Returns:
        Number of rows satisfying every supplied bound; the plain row
        count when neither bound is given.
    """
    from pyspark.sql import functions as F

    key = self._cache_key(
        "count_in_range", column, min_value, max_value, inclusive
    )
    hit = self._get_cached(key)
    if hit is not None:
        return hit

    # Collect one predicate per supplied bound, lower bound first.
    bounds = []
    if min_value is not None:
        if inclusive:
            bounds.append(F.col(column) >= min_value)
        else:
            bounds.append(F.col(column) > min_value)
    if max_value is not None:
        if inclusive:
            bounds.append(F.col(column) <= max_value)
        else:
            bounds.append(F.col(column) < max_value)

    if not bounds:
        count = self.count_rows()
    else:
        predicate = bounds[0]
        for extra in bounds[1:]:
            predicate = predicate & extra
        count = self._df.filter(predicate).count()

    self._set_cached(key, count)
    return count
|
|
875
|
+
|
|
876
|
+
def count_in_set(self, column: str, values: set[Any]) -> int:
    """Count the rows whose ``column`` value is one of ``values``.

    Membership is tested with the column's ``isin`` operator. The result
    is cached per (column, frozen value set) pair.

    Args:
        column: Name of the column to test.
        values: Allowed values.

    Returns:
        Number of rows whose value is in the set.
    """
    from pyspark.sql import functions as F

    key = self._cache_key("count_in_set", column, frozenset(values))
    members = self._get_cached(key)
    if members is None:
        predicate = F.col(column).isin(list(values))
        members = self._df.filter(predicate).count()
        self._set_cached(key, members)
    return members
|
|
888
|
+
|
|
889
|
+
# -------------------------------------------------------------------------
|
|
890
|
+
# Sampling
|
|
891
|
+
# -------------------------------------------------------------------------
|
|
892
|
+
|
|
893
|
+
def sample(
|
|
894
|
+
self,
|
|
895
|
+
n: int = 1000,
|
|
896
|
+
seed: int | None = None,
|
|
897
|
+
) -> "SparkExecutionEngine":
|
|
898
|
+
"""Create sampled engine using Spark's native sampling.
|
|
899
|
+
|
|
900
|
+
Args:
|
|
901
|
+
n: Target number of rows.
|
|
902
|
+
seed: Random seed.
|
|
903
|
+
|
|
904
|
+
Returns:
|
|
905
|
+
New engine with sampled data.
|
|
906
|
+
"""
|
|
907
|
+
row_count = self.count_rows()
|
|
908
|
+
|
|
909
|
+
if row_count <= n:
|
|
910
|
+
return self
|
|
911
|
+
|
|
912
|
+
fraction = min((n * 1.1) / row_count, 1.0)
|
|
913
|
+
|
|
914
|
+
if seed is not None:
|
|
915
|
+
sampled = self._df.sample(
|
|
916
|
+
withReplacement=False,
|
|
917
|
+
fraction=fraction,
|
|
918
|
+
seed=seed,
|
|
919
|
+
)
|
|
920
|
+
else:
|
|
921
|
+
sampled = self._df.sample(withReplacement=False, fraction=fraction)
|
|
922
|
+
|
|
923
|
+
sampled = sampled.limit(n)
|
|
924
|
+
|
|
925
|
+
return SparkExecutionEngine(sampled, self._config, self._spark)
|
|
926
|
+
|
|
927
|
+
# -------------------------------------------------------------------------
|
|
928
|
+
# Spark-Specific Methods
|
|
929
|
+
# -------------------------------------------------------------------------
|
|
930
|
+
|
|
931
|
+
def persist(self, storage_level: str = "MEMORY_AND_DISK") -> "SparkExecutionEngine":
    """Persist the DataFrame at the requested storage level.

    Recognized names are ``MEMORY_ONLY``, ``MEMORY_AND_DISK``,
    ``DISK_ONLY`` and ``MEMORY_ONLY_SER``; any other value falls back to
    ``MEMORY_AND_DISK``.

    Args:
        storage_level: Name of the Spark storage level.

    Returns:
        Self after persisting.
    """
    from pyspark import StorageLevel

    # Levels are resolved lazily instead of through an eagerly built table:
    # a PySpark release that lacks StorageLevel.MEMORY_ONLY_SER would
    # otherwise raise AttributeError on every call, whatever was requested.
    if storage_level == "MEMORY_ONLY_SER":
        # Where the constant is missing, use MEMORY_ONLY in its place.
        level = getattr(StorageLevel, "MEMORY_ONLY_SER", StorageLevel.MEMORY_ONLY)
    elif storage_level in ("MEMORY_ONLY", "DISK_ONLY"):
        level = getattr(StorageLevel, storage_level)
    else:
        level = StorageLevel.MEMORY_AND_DISK

    self._df.persist(level)
    return self
|
|
952
|
+
|
|
953
|
+
def unpersist(self) -> "SparkExecutionEngine":
    """Release the DataFrame's persisted data.

    Returns:
        Self, after calling ``unpersist`` on the DataFrame.
    """
    frame = self._df
    frame.unpersist()
    return self
|
|
961
|
+
|
|
962
|
+
def checkpoint(self) -> "SparkExecutionEngine":
    """Checkpoint the DataFrame and wrap the result in a new engine.

    When the config names a checkpoint directory, it is registered on the
    Spark context before checkpointing.

    Returns:
        New engine wrapping the checkpointed DataFrame.
    """
    directory = self._config.checkpoint_dir
    if directory:
        self._spark.sparkContext.setCheckpointDir(self._config.checkpoint_dir)

    return SparkExecutionEngine(
        self._df.checkpoint(),
        self._config,
        self._spark,
    )
|
|
973
|
+
|
|
974
|
+
def explain(self, extended: bool = False) -> str:
    """Return the DataFrame's execution plan as text.

    ``DataFrame.explain`` is called with standard output temporarily
    redirected into an in-memory buffer; the captured text is returned.

    Args:
        extended: Passed through to ``DataFrame.explain``.

    Returns:
        Everything ``explain`` wrote to standard output.
    """
    import contextlib
    import io

    buffer = io.StringIO()
    # redirect_stdout restores the previous stream on exit, even when
    # explain() raises.
    with contextlib.redirect_stdout(buffer):
        self._df.explain(extended=extended)
    return buffer.getvalue()
|
|
993
|
+
|
|
994
|
+
def sql(self, query: str) -> "SparkExecutionEngine":
    """Run a SQL query against this DataFrame.

    The DataFrame is registered as a temporary view named after its
    ``id()``, the query's ``{table}`` placeholder is filled in with that
    name via ``str.format``, and the view is dropped again whether or not
    the query succeeds.

    Args:
        query: SQL text containing a ``{table}`` placeholder.

    Returns:
        New engine wrapping the query's result DataFrame.
    """
    view_name = f"truthound_temp_{id(self._df)}"
    self._df.createOrReplaceTempView(view_name)

    try:
        rendered = query.format(table=view_name)
        outcome = self._spark.sql(rendered)
        return SparkExecutionEngine(outcome, self._config, self._spark)
    finally:
        # Always remove the temporary view.
        self._spark.catalog.dropTempView(view_name)
|