truthound 1.0.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- truthound/__init__.py +162 -0
- truthound/adapters.py +100 -0
- truthound/api.py +365 -0
- truthound/audit/__init__.py +248 -0
- truthound/audit/core.py +967 -0
- truthound/audit/filters.py +620 -0
- truthound/audit/formatters.py +707 -0
- truthound/audit/logger.py +902 -0
- truthound/audit/middleware.py +571 -0
- truthound/audit/storage.py +1083 -0
- truthound/benchmark/__init__.py +123 -0
- truthound/benchmark/base.py +757 -0
- truthound/benchmark/comparison.py +635 -0
- truthound/benchmark/generators.py +706 -0
- truthound/benchmark/reporters.py +718 -0
- truthound/benchmark/runner.py +635 -0
- truthound/benchmark/scenarios.py +712 -0
- truthound/cache.py +252 -0
- truthound/checkpoint/__init__.py +136 -0
- truthound/checkpoint/actions/__init__.py +164 -0
- truthound/checkpoint/actions/base.py +324 -0
- truthound/checkpoint/actions/custom.py +234 -0
- truthound/checkpoint/actions/discord_notify.py +290 -0
- truthound/checkpoint/actions/email_notify.py +405 -0
- truthound/checkpoint/actions/github_action.py +406 -0
- truthound/checkpoint/actions/opsgenie.py +1499 -0
- truthound/checkpoint/actions/pagerduty.py +226 -0
- truthound/checkpoint/actions/slack_notify.py +233 -0
- truthound/checkpoint/actions/store_result.py +249 -0
- truthound/checkpoint/actions/teams_notify.py +1570 -0
- truthound/checkpoint/actions/telegram_notify.py +419 -0
- truthound/checkpoint/actions/update_docs.py +552 -0
- truthound/checkpoint/actions/webhook.py +293 -0
- truthound/checkpoint/analytics/__init__.py +147 -0
- truthound/checkpoint/analytics/aggregations/__init__.py +23 -0
- truthound/checkpoint/analytics/aggregations/rollup.py +481 -0
- truthound/checkpoint/analytics/aggregations/time_bucket.py +306 -0
- truthound/checkpoint/analytics/analyzers/__init__.py +17 -0
- truthound/checkpoint/analytics/analyzers/anomaly.py +386 -0
- truthound/checkpoint/analytics/analyzers/base.py +270 -0
- truthound/checkpoint/analytics/analyzers/forecast.py +421 -0
- truthound/checkpoint/analytics/analyzers/trend.py +314 -0
- truthound/checkpoint/analytics/models.py +292 -0
- truthound/checkpoint/analytics/protocols.py +549 -0
- truthound/checkpoint/analytics/service.py +718 -0
- truthound/checkpoint/analytics/stores/__init__.py +16 -0
- truthound/checkpoint/analytics/stores/base.py +306 -0
- truthound/checkpoint/analytics/stores/memory_store.py +353 -0
- truthound/checkpoint/analytics/stores/sqlite_store.py +557 -0
- truthound/checkpoint/analytics/stores/timescale_store.py +501 -0
- truthound/checkpoint/async_actions.py +794 -0
- truthound/checkpoint/async_base.py +708 -0
- truthound/checkpoint/async_checkpoint.py +617 -0
- truthound/checkpoint/async_runner.py +639 -0
- truthound/checkpoint/checkpoint.py +527 -0
- truthound/checkpoint/ci/__init__.py +61 -0
- truthound/checkpoint/ci/detector.py +355 -0
- truthound/checkpoint/ci/reporter.py +436 -0
- truthound/checkpoint/ci/templates.py +454 -0
- truthound/checkpoint/circuitbreaker/__init__.py +133 -0
- truthound/checkpoint/circuitbreaker/breaker.py +542 -0
- truthound/checkpoint/circuitbreaker/core.py +252 -0
- truthound/checkpoint/circuitbreaker/detection.py +459 -0
- truthound/checkpoint/circuitbreaker/middleware.py +389 -0
- truthound/checkpoint/circuitbreaker/registry.py +357 -0
- truthound/checkpoint/distributed/__init__.py +139 -0
- truthound/checkpoint/distributed/backends/__init__.py +35 -0
- truthound/checkpoint/distributed/backends/celery_backend.py +503 -0
- truthound/checkpoint/distributed/backends/kubernetes_backend.py +696 -0
- truthound/checkpoint/distributed/backends/local_backend.py +397 -0
- truthound/checkpoint/distributed/backends/ray_backend.py +625 -0
- truthound/checkpoint/distributed/base.py +774 -0
- truthound/checkpoint/distributed/orchestrator.py +765 -0
- truthound/checkpoint/distributed/protocols.py +842 -0
- truthound/checkpoint/distributed/registry.py +449 -0
- truthound/checkpoint/idempotency/__init__.py +120 -0
- truthound/checkpoint/idempotency/core.py +295 -0
- truthound/checkpoint/idempotency/fingerprint.py +454 -0
- truthound/checkpoint/idempotency/locking.py +604 -0
- truthound/checkpoint/idempotency/service.py +592 -0
- truthound/checkpoint/idempotency/stores.py +653 -0
- truthound/checkpoint/monitoring/__init__.py +134 -0
- truthound/checkpoint/monitoring/aggregators/__init__.py +15 -0
- truthound/checkpoint/monitoring/aggregators/base.py +372 -0
- truthound/checkpoint/monitoring/aggregators/realtime.py +300 -0
- truthound/checkpoint/monitoring/aggregators/window.py +493 -0
- truthound/checkpoint/monitoring/collectors/__init__.py +17 -0
- truthound/checkpoint/monitoring/collectors/base.py +257 -0
- truthound/checkpoint/monitoring/collectors/memory_collector.py +617 -0
- truthound/checkpoint/monitoring/collectors/prometheus_collector.py +451 -0
- truthound/checkpoint/monitoring/collectors/redis_collector.py +518 -0
- truthound/checkpoint/monitoring/events.py +410 -0
- truthound/checkpoint/monitoring/protocols.py +636 -0
- truthound/checkpoint/monitoring/service.py +578 -0
- truthound/checkpoint/monitoring/views/__init__.py +17 -0
- truthound/checkpoint/monitoring/views/base.py +172 -0
- truthound/checkpoint/monitoring/views/queue_view.py +220 -0
- truthound/checkpoint/monitoring/views/task_view.py +240 -0
- truthound/checkpoint/monitoring/views/worker_view.py +263 -0
- truthound/checkpoint/registry.py +337 -0
- truthound/checkpoint/runner.py +356 -0
- truthound/checkpoint/transaction/__init__.py +133 -0
- truthound/checkpoint/transaction/base.py +389 -0
- truthound/checkpoint/transaction/compensatable.py +537 -0
- truthound/checkpoint/transaction/coordinator.py +576 -0
- truthound/checkpoint/transaction/executor.py +622 -0
- truthound/checkpoint/transaction/idempotency.py +534 -0
- truthound/checkpoint/transaction/saga/__init__.py +143 -0
- truthound/checkpoint/transaction/saga/builder.py +584 -0
- truthound/checkpoint/transaction/saga/definition.py +515 -0
- truthound/checkpoint/transaction/saga/event_store.py +542 -0
- truthound/checkpoint/transaction/saga/patterns.py +833 -0
- truthound/checkpoint/transaction/saga/runner.py +718 -0
- truthound/checkpoint/transaction/saga/state_machine.py +793 -0
- truthound/checkpoint/transaction/saga/strategies.py +780 -0
- truthound/checkpoint/transaction/saga/testing.py +886 -0
- truthound/checkpoint/triggers/__init__.py +58 -0
- truthound/checkpoint/triggers/base.py +237 -0
- truthound/checkpoint/triggers/event.py +385 -0
- truthound/checkpoint/triggers/schedule.py +355 -0
- truthound/cli.py +2358 -0
- truthound/cli_modules/__init__.py +124 -0
- truthound/cli_modules/advanced/__init__.py +45 -0
- truthound/cli_modules/advanced/benchmark.py +343 -0
- truthound/cli_modules/advanced/docs.py +225 -0
- truthound/cli_modules/advanced/lineage.py +209 -0
- truthound/cli_modules/advanced/ml.py +320 -0
- truthound/cli_modules/advanced/realtime.py +196 -0
- truthound/cli_modules/checkpoint/__init__.py +46 -0
- truthound/cli_modules/checkpoint/init.py +114 -0
- truthound/cli_modules/checkpoint/list.py +71 -0
- truthound/cli_modules/checkpoint/run.py +159 -0
- truthound/cli_modules/checkpoint/validate.py +67 -0
- truthound/cli_modules/common/__init__.py +71 -0
- truthound/cli_modules/common/errors.py +414 -0
- truthound/cli_modules/common/options.py +419 -0
- truthound/cli_modules/common/output.py +507 -0
- truthound/cli_modules/common/protocol.py +552 -0
- truthound/cli_modules/core/__init__.py +48 -0
- truthound/cli_modules/core/check.py +123 -0
- truthound/cli_modules/core/compare.py +104 -0
- truthound/cli_modules/core/learn.py +57 -0
- truthound/cli_modules/core/mask.py +77 -0
- truthound/cli_modules/core/profile.py +65 -0
- truthound/cli_modules/core/scan.py +61 -0
- truthound/cli_modules/profiler/__init__.py +51 -0
- truthound/cli_modules/profiler/auto_profile.py +175 -0
- truthound/cli_modules/profiler/metadata.py +107 -0
- truthound/cli_modules/profiler/suite.py +283 -0
- truthound/cli_modules/registry.py +431 -0
- truthound/cli_modules/scaffolding/__init__.py +89 -0
- truthound/cli_modules/scaffolding/base.py +631 -0
- truthound/cli_modules/scaffolding/commands.py +545 -0
- truthound/cli_modules/scaffolding/plugins.py +1072 -0
- truthound/cli_modules/scaffolding/reporters.py +594 -0
- truthound/cli_modules/scaffolding/validators.py +1127 -0
- truthound/common/__init__.py +18 -0
- truthound/common/resilience/__init__.py +130 -0
- truthound/common/resilience/bulkhead.py +266 -0
- truthound/common/resilience/circuit_breaker.py +516 -0
- truthound/common/resilience/composite.py +332 -0
- truthound/common/resilience/config.py +292 -0
- truthound/common/resilience/protocols.py +217 -0
- truthound/common/resilience/rate_limiter.py +404 -0
- truthound/common/resilience/retry.py +341 -0
- truthound/datadocs/__init__.py +260 -0
- truthound/datadocs/base.py +571 -0
- truthound/datadocs/builder.py +761 -0
- truthound/datadocs/charts.py +764 -0
- truthound/datadocs/dashboard/__init__.py +63 -0
- truthound/datadocs/dashboard/app.py +576 -0
- truthound/datadocs/dashboard/components.py +584 -0
- truthound/datadocs/dashboard/state.py +240 -0
- truthound/datadocs/engine/__init__.py +46 -0
- truthound/datadocs/engine/context.py +376 -0
- truthound/datadocs/engine/pipeline.py +618 -0
- truthound/datadocs/engine/registry.py +469 -0
- truthound/datadocs/exporters/__init__.py +49 -0
- truthound/datadocs/exporters/base.py +198 -0
- truthound/datadocs/exporters/html.py +178 -0
- truthound/datadocs/exporters/json_exporter.py +253 -0
- truthound/datadocs/exporters/markdown.py +284 -0
- truthound/datadocs/exporters/pdf.py +392 -0
- truthound/datadocs/i18n/__init__.py +86 -0
- truthound/datadocs/i18n/catalog.py +960 -0
- truthound/datadocs/i18n/formatting.py +505 -0
- truthound/datadocs/i18n/loader.py +256 -0
- truthound/datadocs/i18n/plurals.py +378 -0
- truthound/datadocs/renderers/__init__.py +42 -0
- truthound/datadocs/renderers/base.py +401 -0
- truthound/datadocs/renderers/custom.py +342 -0
- truthound/datadocs/renderers/jinja.py +697 -0
- truthound/datadocs/sections.py +736 -0
- truthound/datadocs/styles.py +931 -0
- truthound/datadocs/themes/__init__.py +101 -0
- truthound/datadocs/themes/base.py +336 -0
- truthound/datadocs/themes/default.py +417 -0
- truthound/datadocs/themes/enterprise.py +419 -0
- truthound/datadocs/themes/loader.py +336 -0
- truthound/datadocs/themes.py +301 -0
- truthound/datadocs/transformers/__init__.py +57 -0
- truthound/datadocs/transformers/base.py +268 -0
- truthound/datadocs/transformers/enrichers.py +544 -0
- truthound/datadocs/transformers/filters.py +447 -0
- truthound/datadocs/transformers/i18n.py +468 -0
- truthound/datadocs/versioning/__init__.py +62 -0
- truthound/datadocs/versioning/diff.py +639 -0
- truthound/datadocs/versioning/storage.py +497 -0
- truthound/datadocs/versioning/version.py +358 -0
- truthound/datasources/__init__.py +223 -0
- truthound/datasources/_async_protocols.py +222 -0
- truthound/datasources/_protocols.py +159 -0
- truthound/datasources/adapters.py +428 -0
- truthound/datasources/async_base.py +599 -0
- truthound/datasources/async_factory.py +511 -0
- truthound/datasources/base.py +516 -0
- truthound/datasources/factory.py +433 -0
- truthound/datasources/nosql/__init__.py +47 -0
- truthound/datasources/nosql/base.py +487 -0
- truthound/datasources/nosql/elasticsearch.py +801 -0
- truthound/datasources/nosql/mongodb.py +636 -0
- truthound/datasources/pandas_optimized.py +582 -0
- truthound/datasources/pandas_source.py +216 -0
- truthound/datasources/polars_source.py +395 -0
- truthound/datasources/spark_source.py +479 -0
- truthound/datasources/sql/__init__.py +154 -0
- truthound/datasources/sql/base.py +710 -0
- truthound/datasources/sql/bigquery.py +410 -0
- truthound/datasources/sql/cloud_base.py +199 -0
- truthound/datasources/sql/databricks.py +471 -0
- truthound/datasources/sql/mysql.py +316 -0
- truthound/datasources/sql/oracle.py +427 -0
- truthound/datasources/sql/postgresql.py +321 -0
- truthound/datasources/sql/redshift.py +479 -0
- truthound/datasources/sql/snowflake.py +439 -0
- truthound/datasources/sql/sqlite.py +286 -0
- truthound/datasources/sql/sqlserver.py +437 -0
- truthound/datasources/streaming/__init__.py +47 -0
- truthound/datasources/streaming/base.py +350 -0
- truthound/datasources/streaming/kafka.py +670 -0
- truthound/decorators.py +98 -0
- truthound/docs/__init__.py +69 -0
- truthound/docs/extractor.py +971 -0
- truthound/docs/generator.py +601 -0
- truthound/docs/parser.py +1037 -0
- truthound/docs/renderer.py +999 -0
- truthound/drift/__init__.py +22 -0
- truthound/drift/compare.py +189 -0
- truthound/drift/detectors.py +464 -0
- truthound/drift/report.py +160 -0
- truthound/execution/__init__.py +65 -0
- truthound/execution/_protocols.py +324 -0
- truthound/execution/base.py +576 -0
- truthound/execution/distributed/__init__.py +179 -0
- truthound/execution/distributed/aggregations.py +731 -0
- truthound/execution/distributed/arrow_bridge.py +817 -0
- truthound/execution/distributed/base.py +550 -0
- truthound/execution/distributed/dask_engine.py +976 -0
- truthound/execution/distributed/mixins.py +766 -0
- truthound/execution/distributed/protocols.py +756 -0
- truthound/execution/distributed/ray_engine.py +1127 -0
- truthound/execution/distributed/registry.py +446 -0
- truthound/execution/distributed/spark_engine.py +1011 -0
- truthound/execution/distributed/validator_adapter.py +682 -0
- truthound/execution/pandas_engine.py +401 -0
- truthound/execution/polars_engine.py +497 -0
- truthound/execution/pushdown/__init__.py +230 -0
- truthound/execution/pushdown/ast.py +1550 -0
- truthound/execution/pushdown/builder.py +1550 -0
- truthound/execution/pushdown/dialects.py +1072 -0
- truthound/execution/pushdown/executor.py +829 -0
- truthound/execution/pushdown/optimizer.py +1041 -0
- truthound/execution/sql_engine.py +518 -0
- truthound/infrastructure/__init__.py +189 -0
- truthound/infrastructure/audit.py +1515 -0
- truthound/infrastructure/config.py +1133 -0
- truthound/infrastructure/encryption.py +1132 -0
- truthound/infrastructure/logging.py +1503 -0
- truthound/infrastructure/metrics.py +1220 -0
- truthound/lineage/__init__.py +89 -0
- truthound/lineage/base.py +746 -0
- truthound/lineage/impact_analysis.py +474 -0
- truthound/lineage/integrations/__init__.py +22 -0
- truthound/lineage/integrations/openlineage.py +548 -0
- truthound/lineage/tracker.py +512 -0
- truthound/lineage/visualization/__init__.py +33 -0
- truthound/lineage/visualization/protocols.py +145 -0
- truthound/lineage/visualization/renderers/__init__.py +20 -0
- truthound/lineage/visualization/renderers/cytoscape.py +329 -0
- truthound/lineage/visualization/renderers/d3.py +331 -0
- truthound/lineage/visualization/renderers/graphviz.py +276 -0
- truthound/lineage/visualization/renderers/mermaid.py +308 -0
- truthound/maskers.py +113 -0
- truthound/ml/__init__.py +124 -0
- truthound/ml/anomaly_models/__init__.py +31 -0
- truthound/ml/anomaly_models/ensemble.py +362 -0
- truthound/ml/anomaly_models/isolation_forest.py +444 -0
- truthound/ml/anomaly_models/statistical.py +392 -0
- truthound/ml/base.py +1178 -0
- truthound/ml/drift_detection/__init__.py +26 -0
- truthound/ml/drift_detection/concept.py +381 -0
- truthound/ml/drift_detection/distribution.py +361 -0
- truthound/ml/drift_detection/feature.py +442 -0
- truthound/ml/drift_detection/multivariate.py +495 -0
- truthound/ml/monitoring/__init__.py +88 -0
- truthound/ml/monitoring/alerting/__init__.py +33 -0
- truthound/ml/monitoring/alerting/handlers.py +427 -0
- truthound/ml/monitoring/alerting/rules.py +508 -0
- truthound/ml/monitoring/collectors/__init__.py +19 -0
- truthound/ml/monitoring/collectors/composite.py +105 -0
- truthound/ml/monitoring/collectors/drift.py +324 -0
- truthound/ml/monitoring/collectors/performance.py +179 -0
- truthound/ml/monitoring/collectors/quality.py +369 -0
- truthound/ml/monitoring/monitor.py +536 -0
- truthound/ml/monitoring/protocols.py +451 -0
- truthound/ml/monitoring/stores/__init__.py +15 -0
- truthound/ml/monitoring/stores/memory.py +201 -0
- truthound/ml/monitoring/stores/prometheus.py +296 -0
- truthound/ml/rule_learning/__init__.py +25 -0
- truthound/ml/rule_learning/constraint_miner.py +443 -0
- truthound/ml/rule_learning/pattern_learner.py +499 -0
- truthound/ml/rule_learning/profile_learner.py +462 -0
- truthound/multitenancy/__init__.py +326 -0
- truthound/multitenancy/core.py +852 -0
- truthound/multitenancy/integration.py +597 -0
- truthound/multitenancy/isolation.py +630 -0
- truthound/multitenancy/manager.py +770 -0
- truthound/multitenancy/middleware.py +765 -0
- truthound/multitenancy/quota.py +537 -0
- truthound/multitenancy/resolvers.py +603 -0
- truthound/multitenancy/storage.py +703 -0
- truthound/observability/__init__.py +307 -0
- truthound/observability/context.py +531 -0
- truthound/observability/instrumentation.py +611 -0
- truthound/observability/logging.py +887 -0
- truthound/observability/metrics.py +1157 -0
- truthound/observability/tracing/__init__.py +178 -0
- truthound/observability/tracing/baggage.py +310 -0
- truthound/observability/tracing/config.py +426 -0
- truthound/observability/tracing/exporter.py +787 -0
- truthound/observability/tracing/integration.py +1018 -0
- truthound/observability/tracing/otel/__init__.py +146 -0
- truthound/observability/tracing/otel/adapter.py +982 -0
- truthound/observability/tracing/otel/bridge.py +1177 -0
- truthound/observability/tracing/otel/compat.py +681 -0
- truthound/observability/tracing/otel/config.py +691 -0
- truthound/observability/tracing/otel/detection.py +327 -0
- truthound/observability/tracing/otel/protocols.py +426 -0
- truthound/observability/tracing/processor.py +561 -0
- truthound/observability/tracing/propagator.py +757 -0
- truthound/observability/tracing/provider.py +569 -0
- truthound/observability/tracing/resource.py +515 -0
- truthound/observability/tracing/sampler.py +487 -0
- truthound/observability/tracing/span.py +676 -0
- truthound/plugins/__init__.py +198 -0
- truthound/plugins/base.py +599 -0
- truthound/plugins/cli.py +680 -0
- truthound/plugins/dependencies/__init__.py +42 -0
- truthound/plugins/dependencies/graph.py +422 -0
- truthound/plugins/dependencies/resolver.py +417 -0
- truthound/plugins/discovery.py +379 -0
- truthound/plugins/docs/__init__.py +46 -0
- truthound/plugins/docs/extractor.py +444 -0
- truthound/plugins/docs/renderer.py +499 -0
- truthound/plugins/enterprise_manager.py +877 -0
- truthound/plugins/examples/__init__.py +19 -0
- truthound/plugins/examples/custom_validators.py +317 -0
- truthound/plugins/examples/slack_notifier.py +312 -0
- truthound/plugins/examples/xml_reporter.py +254 -0
- truthound/plugins/hooks.py +558 -0
- truthound/plugins/lifecycle/__init__.py +43 -0
- truthound/plugins/lifecycle/hot_reload.py +402 -0
- truthound/plugins/lifecycle/manager.py +371 -0
- truthound/plugins/manager.py +736 -0
- truthound/plugins/registry.py +338 -0
- truthound/plugins/security/__init__.py +93 -0
- truthound/plugins/security/exceptions.py +332 -0
- truthound/plugins/security/policies.py +348 -0
- truthound/plugins/security/protocols.py +643 -0
- truthound/plugins/security/sandbox/__init__.py +45 -0
- truthound/plugins/security/sandbox/context.py +158 -0
- truthound/plugins/security/sandbox/engines/__init__.py +19 -0
- truthound/plugins/security/sandbox/engines/container.py +379 -0
- truthound/plugins/security/sandbox/engines/noop.py +144 -0
- truthound/plugins/security/sandbox/engines/process.py +336 -0
- truthound/plugins/security/sandbox/factory.py +211 -0
- truthound/plugins/security/signing/__init__.py +57 -0
- truthound/plugins/security/signing/service.py +330 -0
- truthound/plugins/security/signing/trust_store.py +368 -0
- truthound/plugins/security/signing/verifier.py +459 -0
- truthound/plugins/versioning/__init__.py +41 -0
- truthound/plugins/versioning/constraints.py +297 -0
- truthound/plugins/versioning/resolver.py +329 -0
- truthound/profiler/__init__.py +1729 -0
- truthound/profiler/_lazy.py +452 -0
- truthound/profiler/ab_testing/__init__.py +80 -0
- truthound/profiler/ab_testing/analysis.py +449 -0
- truthound/profiler/ab_testing/base.py +257 -0
- truthound/profiler/ab_testing/experiment.py +395 -0
- truthound/profiler/ab_testing/tracking.py +368 -0
- truthound/profiler/auto_threshold.py +1170 -0
- truthound/profiler/base.py +579 -0
- truthound/profiler/cache_patterns.py +911 -0
- truthound/profiler/caching.py +1303 -0
- truthound/profiler/column_profiler.py +712 -0
- truthound/profiler/comparison.py +1007 -0
- truthound/profiler/custom_patterns.py +1170 -0
- truthound/profiler/dashboard/__init__.py +50 -0
- truthound/profiler/dashboard/app.py +476 -0
- truthound/profiler/dashboard/components.py +457 -0
- truthound/profiler/dashboard/config.py +72 -0
- truthound/profiler/distributed/__init__.py +83 -0
- truthound/profiler/distributed/base.py +281 -0
- truthound/profiler/distributed/dask_backend.py +498 -0
- truthound/profiler/distributed/local_backend.py +293 -0
- truthound/profiler/distributed/profiler.py +304 -0
- truthound/profiler/distributed/ray_backend.py +374 -0
- truthound/profiler/distributed/spark_backend.py +375 -0
- truthound/profiler/distributed.py +1366 -0
- truthound/profiler/enterprise_sampling.py +1065 -0
- truthound/profiler/errors.py +488 -0
- truthound/profiler/evolution/__init__.py +91 -0
- truthound/profiler/evolution/alerts.py +426 -0
- truthound/profiler/evolution/changes.py +206 -0
- truthound/profiler/evolution/compatibility.py +365 -0
- truthound/profiler/evolution/detector.py +372 -0
- truthound/profiler/evolution/protocols.py +121 -0
- truthound/profiler/generators/__init__.py +48 -0
- truthound/profiler/generators/base.py +384 -0
- truthound/profiler/generators/ml_rules.py +375 -0
- truthound/profiler/generators/pattern_rules.py +384 -0
- truthound/profiler/generators/schema_rules.py +267 -0
- truthound/profiler/generators/stats_rules.py +324 -0
- truthound/profiler/generators/suite_generator.py +857 -0
- truthound/profiler/i18n.py +1542 -0
- truthound/profiler/incremental.py +554 -0
- truthound/profiler/incremental_validation.py +1710 -0
- truthound/profiler/integration/__init__.py +73 -0
- truthound/profiler/integration/adapters.py +345 -0
- truthound/profiler/integration/context.py +371 -0
- truthound/profiler/integration/executor.py +527 -0
- truthound/profiler/integration/naming.py +75 -0
- truthound/profiler/integration/protocols.py +243 -0
- truthound/profiler/memory.py +1185 -0
- truthound/profiler/migration/__init__.py +60 -0
- truthound/profiler/migration/base.py +345 -0
- truthound/profiler/migration/manager.py +444 -0
- truthound/profiler/migration/v1_0_to_v1_1.py +484 -0
- truthound/profiler/ml/__init__.py +73 -0
- truthound/profiler/ml/base.py +244 -0
- truthound/profiler/ml/classifier.py +507 -0
- truthound/profiler/ml/feature_extraction.py +604 -0
- truthound/profiler/ml/pretrained.py +448 -0
- truthound/profiler/ml_inference.py +1276 -0
- truthound/profiler/native_patterns.py +815 -0
- truthound/profiler/observability.py +1184 -0
- truthound/profiler/process_timeout.py +1566 -0
- truthound/profiler/progress.py +568 -0
- truthound/profiler/progress_callbacks.py +1734 -0
- truthound/profiler/quality.py +1345 -0
- truthound/profiler/resilience.py +1180 -0
- truthound/profiler/sampled_matcher.py +794 -0
- truthound/profiler/sampling.py +1288 -0
- truthound/profiler/scheduling/__init__.py +82 -0
- truthound/profiler/scheduling/protocols.py +214 -0
- truthound/profiler/scheduling/scheduler.py +474 -0
- truthound/profiler/scheduling/storage.py +457 -0
- truthound/profiler/scheduling/triggers.py +449 -0
- truthound/profiler/schema.py +603 -0
- truthound/profiler/streaming.py +685 -0
- truthound/profiler/streaming_patterns.py +1354 -0
- truthound/profiler/suite_cli.py +625 -0
- truthound/profiler/suite_config.py +789 -0
- truthound/profiler/suite_export.py +1268 -0
- truthound/profiler/table_profiler.py +547 -0
- truthound/profiler/timeout.py +565 -0
- truthound/profiler/validation.py +1532 -0
- truthound/profiler/visualization/__init__.py +118 -0
- truthound/profiler/visualization/base.py +346 -0
- truthound/profiler/visualization/generator.py +1259 -0
- truthound/profiler/visualization/plotly_renderer.py +811 -0
- truthound/profiler/visualization/renderers.py +669 -0
- truthound/profiler/visualization/sections.py +540 -0
- truthound/profiler/visualization.py +2122 -0
- truthound/profiler/yaml_validation.py +1151 -0
- truthound/py.typed +0 -0
- truthound/ratelimit/__init__.py +248 -0
- truthound/ratelimit/algorithms.py +1108 -0
- truthound/ratelimit/core.py +573 -0
- truthound/ratelimit/integration.py +532 -0
- truthound/ratelimit/limiter.py +663 -0
- truthound/ratelimit/middleware.py +700 -0
- truthound/ratelimit/policy.py +792 -0
- truthound/ratelimit/storage.py +763 -0
- truthound/rbac/__init__.py +340 -0
- truthound/rbac/core.py +976 -0
- truthound/rbac/integration.py +760 -0
- truthound/rbac/manager.py +1052 -0
- truthound/rbac/middleware.py +842 -0
- truthound/rbac/policy.py +954 -0
- truthound/rbac/storage.py +878 -0
- truthound/realtime/__init__.py +141 -0
- truthound/realtime/adapters/__init__.py +43 -0
- truthound/realtime/adapters/base.py +533 -0
- truthound/realtime/adapters/kafka.py +487 -0
- truthound/realtime/adapters/kinesis.py +479 -0
- truthound/realtime/adapters/mock.py +243 -0
- truthound/realtime/base.py +553 -0
- truthound/realtime/factory.py +382 -0
- truthound/realtime/incremental.py +660 -0
- truthound/realtime/processing/__init__.py +67 -0
- truthound/realtime/processing/exactly_once.py +575 -0
- truthound/realtime/processing/state.py +547 -0
- truthound/realtime/processing/windows.py +647 -0
- truthound/realtime/protocols.py +569 -0
- truthound/realtime/streaming.py +605 -0
- truthound/realtime/testing/__init__.py +32 -0
- truthound/realtime/testing/containers.py +615 -0
- truthound/realtime/testing/fixtures.py +484 -0
- truthound/report.py +280 -0
- truthound/reporters/__init__.py +46 -0
- truthound/reporters/_protocols.py +30 -0
- truthound/reporters/base.py +324 -0
- truthound/reporters/ci/__init__.py +66 -0
- truthound/reporters/ci/azure.py +436 -0
- truthound/reporters/ci/base.py +509 -0
- truthound/reporters/ci/bitbucket.py +567 -0
- truthound/reporters/ci/circleci.py +547 -0
- truthound/reporters/ci/detection.py +364 -0
- truthound/reporters/ci/factory.py +182 -0
- truthound/reporters/ci/github.py +388 -0
- truthound/reporters/ci/gitlab.py +471 -0
- truthound/reporters/ci/jenkins.py +525 -0
- truthound/reporters/console_reporter.py +299 -0
- truthound/reporters/factory.py +211 -0
- truthound/reporters/html_reporter.py +524 -0
- truthound/reporters/json_reporter.py +256 -0
- truthound/reporters/markdown_reporter.py +280 -0
- truthound/reporters/sdk/__init__.py +174 -0
- truthound/reporters/sdk/builder.py +558 -0
- truthound/reporters/sdk/mixins.py +1150 -0
- truthound/reporters/sdk/schema.py +1493 -0
- truthound/reporters/sdk/templates.py +666 -0
- truthound/reporters/sdk/testing.py +968 -0
- truthound/scanners.py +170 -0
- truthound/scheduling/__init__.py +122 -0
- truthound/scheduling/cron.py +1136 -0
- truthound/scheduling/presets.py +212 -0
- truthound/schema.py +275 -0
- truthound/secrets/__init__.py +173 -0
- truthound/secrets/base.py +618 -0
- truthound/secrets/cloud.py +682 -0
- truthound/secrets/integration.py +507 -0
- truthound/secrets/manager.py +633 -0
- truthound/secrets/oidc/__init__.py +172 -0
- truthound/secrets/oidc/base.py +902 -0
- truthound/secrets/oidc/credential_provider.py +623 -0
- truthound/secrets/oidc/exchangers.py +1001 -0
- truthound/secrets/oidc/github/__init__.py +110 -0
- truthound/secrets/oidc/github/claims.py +718 -0
- truthound/secrets/oidc/github/enhanced_provider.py +693 -0
- truthound/secrets/oidc/github/trust_policy.py +742 -0
- truthound/secrets/oidc/github/verification.py +723 -0
- truthound/secrets/oidc/github/workflow.py +691 -0
- truthound/secrets/oidc/providers.py +825 -0
- truthound/secrets/providers.py +506 -0
- truthound/secrets/resolver.py +495 -0
- truthound/stores/__init__.py +177 -0
- truthound/stores/backends/__init__.py +18 -0
- truthound/stores/backends/_protocols.py +340 -0
- truthound/stores/backends/azure_blob.py +530 -0
- truthound/stores/backends/concurrent_filesystem.py +915 -0
- truthound/stores/backends/connection_pool.py +1365 -0
- truthound/stores/backends/database.py +743 -0
- truthound/stores/backends/filesystem.py +538 -0
- truthound/stores/backends/gcs.py +399 -0
- truthound/stores/backends/memory.py +354 -0
- truthound/stores/backends/s3.py +434 -0
- truthound/stores/backpressure/__init__.py +84 -0
- truthound/stores/backpressure/base.py +375 -0
- truthound/stores/backpressure/circuit_breaker.py +434 -0
- truthound/stores/backpressure/monitor.py +376 -0
- truthound/stores/backpressure/strategies.py +677 -0
- truthound/stores/base.py +551 -0
- truthound/stores/batching/__init__.py +65 -0
- truthound/stores/batching/base.py +305 -0
- truthound/stores/batching/buffer.py +370 -0
- truthound/stores/batching/store.py +248 -0
- truthound/stores/batching/writer.py +521 -0
- truthound/stores/caching/__init__.py +60 -0
- truthound/stores/caching/backends.py +684 -0
- truthound/stores/caching/base.py +356 -0
- truthound/stores/caching/store.py +305 -0
- truthound/stores/compression/__init__.py +193 -0
- truthound/stores/compression/adaptive.py +694 -0
- truthound/stores/compression/base.py +514 -0
- truthound/stores/compression/pipeline.py +868 -0
- truthound/stores/compression/providers.py +672 -0
- truthound/stores/compression/streaming.py +832 -0
- truthound/stores/concurrency/__init__.py +81 -0
- truthound/stores/concurrency/atomic.py +556 -0
- truthound/stores/concurrency/index.py +775 -0
- truthound/stores/concurrency/locks.py +576 -0
- truthound/stores/concurrency/manager.py +482 -0
- truthound/stores/encryption/__init__.py +297 -0
- truthound/stores/encryption/base.py +952 -0
- truthound/stores/encryption/keys.py +1191 -0
- truthound/stores/encryption/pipeline.py +903 -0
- truthound/stores/encryption/providers.py +953 -0
- truthound/stores/encryption/streaming.py +950 -0
- truthound/stores/expectations.py +227 -0
- truthound/stores/factory.py +246 -0
- truthound/stores/migration/__init__.py +75 -0
- truthound/stores/migration/base.py +480 -0
- truthound/stores/migration/manager.py +347 -0
- truthound/stores/migration/registry.py +382 -0
- truthound/stores/migration/store.py +559 -0
- truthound/stores/observability/__init__.py +106 -0
- truthound/stores/observability/audit.py +718 -0
- truthound/stores/observability/config.py +270 -0
- truthound/stores/observability/factory.py +208 -0
- truthound/stores/observability/metrics.py +636 -0
- truthound/stores/observability/protocols.py +410 -0
- truthound/stores/observability/store.py +570 -0
- truthound/stores/observability/tracing.py +784 -0
- truthound/stores/replication/__init__.py +76 -0
- truthound/stores/replication/base.py +260 -0
- truthound/stores/replication/monitor.py +269 -0
- truthound/stores/replication/store.py +439 -0
- truthound/stores/replication/syncer.py +391 -0
- truthound/stores/results.py +359 -0
- truthound/stores/retention/__init__.py +77 -0
- truthound/stores/retention/base.py +378 -0
- truthound/stores/retention/policies.py +621 -0
- truthound/stores/retention/scheduler.py +279 -0
- truthound/stores/retention/store.py +526 -0
- truthound/stores/streaming/__init__.py +138 -0
- truthound/stores/streaming/base.py +801 -0
- truthound/stores/streaming/database.py +984 -0
- truthound/stores/streaming/filesystem.py +719 -0
- truthound/stores/streaming/reader.py +629 -0
- truthound/stores/streaming/s3.py +843 -0
- truthound/stores/streaming/writer.py +790 -0
- truthound/stores/tiering/__init__.py +108 -0
- truthound/stores/tiering/base.py +462 -0
- truthound/stores/tiering/manager.py +249 -0
- truthound/stores/tiering/policies.py +692 -0
- truthound/stores/tiering/store.py +526 -0
- truthound/stores/versioning/__init__.py +56 -0
- truthound/stores/versioning/base.py +376 -0
- truthound/stores/versioning/store.py +660 -0
- truthound/stores/versioning/strategies.py +353 -0
- truthound/types.py +56 -0
- truthound/validators/__init__.py +774 -0
- truthound/validators/aggregate/__init__.py +27 -0
- truthound/validators/aggregate/central.py +116 -0
- truthound/validators/aggregate/extremes.py +116 -0
- truthound/validators/aggregate/spread.py +118 -0
- truthound/validators/aggregate/sum.py +64 -0
- truthound/validators/aggregate/type.py +78 -0
- truthound/validators/anomaly/__init__.py +93 -0
- truthound/validators/anomaly/base.py +431 -0
- truthound/validators/anomaly/ml_based.py +1190 -0
- truthound/validators/anomaly/multivariate.py +647 -0
- truthound/validators/anomaly/statistical.py +599 -0
- truthound/validators/base.py +1089 -0
- truthound/validators/business_rule/__init__.py +46 -0
- truthound/validators/business_rule/base.py +147 -0
- truthound/validators/business_rule/checksum.py +509 -0
- truthound/validators/business_rule/financial.py +526 -0
- truthound/validators/cache.py +733 -0
- truthound/validators/completeness/__init__.py +39 -0
- truthound/validators/completeness/conditional.py +73 -0
- truthound/validators/completeness/default.py +98 -0
- truthound/validators/completeness/empty.py +103 -0
- truthound/validators/completeness/nan.py +337 -0
- truthound/validators/completeness/null.py +152 -0
- truthound/validators/cross_table/__init__.py +17 -0
- truthound/validators/cross_table/aggregate.py +333 -0
- truthound/validators/cross_table/row_count.py +122 -0
- truthound/validators/datetime/__init__.py +29 -0
- truthound/validators/datetime/format.py +78 -0
- truthound/validators/datetime/freshness.py +269 -0
- truthound/validators/datetime/order.py +73 -0
- truthound/validators/datetime/parseable.py +185 -0
- truthound/validators/datetime/range.py +202 -0
- truthound/validators/datetime/timezone.py +69 -0
- truthound/validators/distribution/__init__.py +49 -0
- truthound/validators/distribution/distribution.py +128 -0
- truthound/validators/distribution/monotonic.py +119 -0
- truthound/validators/distribution/outlier.py +178 -0
- truthound/validators/distribution/quantile.py +80 -0
- truthound/validators/distribution/range.py +254 -0
- truthound/validators/distribution/set.py +125 -0
- truthound/validators/distribution/statistical.py +459 -0
- truthound/validators/drift/__init__.py +79 -0
- truthound/validators/drift/base.py +427 -0
- truthound/validators/drift/multi_feature.py +401 -0
- truthound/validators/drift/numeric.py +395 -0
- truthound/validators/drift/psi.py +446 -0
- truthound/validators/drift/statistical.py +510 -0
- truthound/validators/enterprise.py +1658 -0
- truthound/validators/geospatial/__init__.py +80 -0
- truthound/validators/geospatial/base.py +97 -0
- truthound/validators/geospatial/boundary.py +238 -0
- truthound/validators/geospatial/coordinate.py +351 -0
- truthound/validators/geospatial/distance.py +399 -0
- truthound/validators/geospatial/polygon.py +665 -0
- truthound/validators/i18n/__init__.py +308 -0
- truthound/validators/i18n/bidi.py +571 -0
- truthound/validators/i18n/catalogs.py +570 -0
- truthound/validators/i18n/dialects.py +763 -0
- truthound/validators/i18n/extended_catalogs.py +549 -0
- truthound/validators/i18n/formatting.py +1434 -0
- truthound/validators/i18n/loader.py +1020 -0
- truthound/validators/i18n/messages.py +521 -0
- truthound/validators/i18n/plural.py +683 -0
- truthound/validators/i18n/protocols.py +855 -0
- truthound/validators/i18n/tms.py +1162 -0
- truthound/validators/localization/__init__.py +53 -0
- truthound/validators/localization/base.py +122 -0
- truthound/validators/localization/chinese.py +362 -0
- truthound/validators/localization/japanese.py +275 -0
- truthound/validators/localization/korean.py +524 -0
- truthound/validators/memory/__init__.py +94 -0
- truthound/validators/memory/approximate_knn.py +506 -0
- truthound/validators/memory/base.py +547 -0
- truthound/validators/memory/sgd_online.py +719 -0
- truthound/validators/memory/streaming_ecdf.py +753 -0
- truthound/validators/ml_feature/__init__.py +54 -0
- truthound/validators/ml_feature/base.py +249 -0
- truthound/validators/ml_feature/correlation.py +299 -0
- truthound/validators/ml_feature/leakage.py +344 -0
- truthound/validators/ml_feature/null_impact.py +270 -0
- truthound/validators/ml_feature/scale.py +264 -0
- truthound/validators/multi_column/__init__.py +89 -0
- truthound/validators/multi_column/arithmetic.py +284 -0
- truthound/validators/multi_column/base.py +231 -0
- truthound/validators/multi_column/comparison.py +273 -0
- truthound/validators/multi_column/consistency.py +312 -0
- truthound/validators/multi_column/statistical.py +299 -0
- truthound/validators/optimization/__init__.py +164 -0
- truthound/validators/optimization/aggregation.py +563 -0
- truthound/validators/optimization/covariance.py +556 -0
- truthound/validators/optimization/geo.py +626 -0
- truthound/validators/optimization/graph.py +587 -0
- truthound/validators/optimization/orchestrator.py +970 -0
- truthound/validators/optimization/profiling.py +1312 -0
- truthound/validators/privacy/__init__.py +223 -0
- truthound/validators/privacy/base.py +635 -0
- truthound/validators/privacy/ccpa.py +670 -0
- truthound/validators/privacy/gdpr.py +728 -0
- truthound/validators/privacy/global_patterns.py +604 -0
- truthound/validators/privacy/plugins.py +867 -0
- truthound/validators/profiling/__init__.py +52 -0
- truthound/validators/profiling/base.py +175 -0
- truthound/validators/profiling/cardinality.py +312 -0
- truthound/validators/profiling/entropy.py +391 -0
- truthound/validators/profiling/frequency.py +455 -0
- truthound/validators/pushdown_support.py +660 -0
- truthound/validators/query/__init__.py +91 -0
- truthound/validators/query/aggregate.py +346 -0
- truthound/validators/query/base.py +246 -0
- truthound/validators/query/column.py +249 -0
- truthound/validators/query/expression.py +274 -0
- truthound/validators/query/result.py +323 -0
- truthound/validators/query/row_count.py +264 -0
- truthound/validators/referential/__init__.py +80 -0
- truthound/validators/referential/base.py +395 -0
- truthound/validators/referential/cascade.py +391 -0
- truthound/validators/referential/circular.py +563 -0
- truthound/validators/referential/foreign_key.py +624 -0
- truthound/validators/referential/orphan.py +485 -0
- truthound/validators/registry.py +112 -0
- truthound/validators/schema/__init__.py +41 -0
- truthound/validators/schema/column_count.py +142 -0
- truthound/validators/schema/column_exists.py +80 -0
- truthound/validators/schema/column_order.py +82 -0
- truthound/validators/schema/column_pair.py +85 -0
- truthound/validators/schema/column_pair_set.py +195 -0
- truthound/validators/schema/column_type.py +94 -0
- truthound/validators/schema/multi_column.py +53 -0
- truthound/validators/schema/multi_column_aggregate.py +175 -0
- truthound/validators/schema/referential.py +274 -0
- truthound/validators/schema/table_schema.py +91 -0
- truthound/validators/schema_validator.py +219 -0
- truthound/validators/sdk/__init__.py +250 -0
- truthound/validators/sdk/builder.py +680 -0
- truthound/validators/sdk/decorators.py +474 -0
- truthound/validators/sdk/enterprise/__init__.py +211 -0
- truthound/validators/sdk/enterprise/docs.py +725 -0
- truthound/validators/sdk/enterprise/fuzzing.py +659 -0
- truthound/validators/sdk/enterprise/licensing.py +709 -0
- truthound/validators/sdk/enterprise/manager.py +543 -0
- truthound/validators/sdk/enterprise/resources.py +628 -0
- truthound/validators/sdk/enterprise/sandbox.py +766 -0
- truthound/validators/sdk/enterprise/signing.py +603 -0
- truthound/validators/sdk/enterprise/templates.py +865 -0
- truthound/validators/sdk/enterprise/versioning.py +659 -0
- truthound/validators/sdk/templates.py +757 -0
- truthound/validators/sdk/testing.py +807 -0
- truthound/validators/security/__init__.py +181 -0
- truthound/validators/security/redos/__init__.py +182 -0
- truthound/validators/security/redos/core.py +861 -0
- truthound/validators/security/redos/cpu_monitor.py +593 -0
- truthound/validators/security/redos/cve_database.py +791 -0
- truthound/validators/security/redos/ml/__init__.py +155 -0
- truthound/validators/security/redos/ml/base.py +785 -0
- truthound/validators/security/redos/ml/datasets.py +618 -0
- truthound/validators/security/redos/ml/features.py +359 -0
- truthound/validators/security/redos/ml/models.py +1000 -0
- truthound/validators/security/redos/ml/predictor.py +507 -0
- truthound/validators/security/redos/ml/storage.py +632 -0
- truthound/validators/security/redos/ml/training.py +571 -0
- truthound/validators/security/redos/ml_analyzer.py +937 -0
- truthound/validators/security/redos/optimizer.py +674 -0
- truthound/validators/security/redos/profiler.py +682 -0
- truthound/validators/security/redos/re2_engine.py +709 -0
- truthound/validators/security/redos.py +886 -0
- truthound/validators/security/sql_security.py +1247 -0
- truthound/validators/streaming/__init__.py +126 -0
- truthound/validators/streaming/base.py +292 -0
- truthound/validators/streaming/completeness.py +210 -0
- truthound/validators/streaming/mixin.py +575 -0
- truthound/validators/streaming/range.py +308 -0
- truthound/validators/streaming/sources.py +846 -0
- truthound/validators/string/__init__.py +57 -0
- truthound/validators/string/casing.py +158 -0
- truthound/validators/string/charset.py +96 -0
- truthound/validators/string/format.py +501 -0
- truthound/validators/string/json.py +77 -0
- truthound/validators/string/json_schema.py +184 -0
- truthound/validators/string/length.py +104 -0
- truthound/validators/string/like_pattern.py +237 -0
- truthound/validators/string/regex.py +202 -0
- truthound/validators/string/regex_extended.py +435 -0
- truthound/validators/table/__init__.py +88 -0
- truthound/validators/table/base.py +78 -0
- truthound/validators/table/column_count.py +198 -0
- truthound/validators/table/freshness.py +362 -0
- truthound/validators/table/row_count.py +251 -0
- truthound/validators/table/schema.py +333 -0
- truthound/validators/table/size.py +285 -0
- truthound/validators/timeout/__init__.py +102 -0
- truthound/validators/timeout/advanced/__init__.py +247 -0
- truthound/validators/timeout/advanced/circuit_breaker.py +675 -0
- truthound/validators/timeout/advanced/prediction.py +773 -0
- truthound/validators/timeout/advanced/priority.py +618 -0
- truthound/validators/timeout/advanced/redis_backend.py +770 -0
- truthound/validators/timeout/advanced/retry.py +721 -0
- truthound/validators/timeout/advanced/sampling.py +788 -0
- truthound/validators/timeout/advanced/sla.py +661 -0
- truthound/validators/timeout/advanced/telemetry.py +804 -0
- truthound/validators/timeout/cascade.py +477 -0
- truthound/validators/timeout/deadline.py +657 -0
- truthound/validators/timeout/degradation.py +525 -0
- truthound/validators/timeout/distributed.py +597 -0
- truthound/validators/timeseries/__init__.py +89 -0
- truthound/validators/timeseries/base.py +326 -0
- truthound/validators/timeseries/completeness.py +617 -0
- truthound/validators/timeseries/gap.py +485 -0
- truthound/validators/timeseries/monotonic.py +310 -0
- truthound/validators/timeseries/seasonality.py +422 -0
- truthound/validators/timeseries/trend.py +510 -0
- truthound/validators/uniqueness/__init__.py +59 -0
- truthound/validators/uniqueness/approximate.py +475 -0
- truthound/validators/uniqueness/distinct_values.py +253 -0
- truthound/validators/uniqueness/duplicate.py +118 -0
- truthound/validators/uniqueness/primary_key.py +140 -0
- truthound/validators/uniqueness/unique.py +191 -0
- truthound/validators/uniqueness/within_record.py +599 -0
- truthound/validators/utils.py +756 -0
- truthound-1.0.8.dist-info/METADATA +474 -0
- truthound-1.0.8.dist-info/RECORD +877 -0
- truthound-1.0.8.dist-info/WHEEL +4 -0
- truthound-1.0.8.dist-info/entry_points.txt +2 -0
- truthound-1.0.8.dist-info/licenses/LICENSE +190 -0
|
@@ -0,0 +1,756 @@
|
|
|
1
|
+
"""Protocol definitions for distributed execution engines.
|
|
2
|
+
|
|
3
|
+
This module defines the structural typing protocols that all distributed
|
|
4
|
+
execution engine implementations should follow. These protocols enable:
|
|
5
|
+
- Type-safe distributed operations
|
|
6
|
+
- Backend-agnostic interfaces
|
|
7
|
+
- Extensibility for custom backends
|
|
8
|
+
|
|
9
|
+
Design Principles:
|
|
10
|
+
1. Protocol-first: Define interfaces before implementations
|
|
11
|
+
2. Composable: Small, focused protocols that can be combined
|
|
12
|
+
3. Backend-agnostic: Same interface for Spark, Dask, Ray, etc.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
from abc import ABC, abstractmethod
|
|
18
|
+
from dataclasses import dataclass, field
|
|
19
|
+
from enum import Enum, auto
|
|
20
|
+
from typing import TYPE_CHECKING, Any, Callable, Generic, Iterator, Protocol, TypeVar, runtime_checkable
|
|
21
|
+
|
|
22
|
+
if TYPE_CHECKING:
|
|
23
|
+
import polars as pl
|
|
24
|
+
import pyarrow as pa
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
# =============================================================================
|
|
28
|
+
# Enums
|
|
29
|
+
# =============================================================================
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class ExecutionMode(str, Enum):
    """How a distributed operation is scheduled for execution."""

    # Run the operation immediately.
    EAGER = "eager"
    # Only build an execution plan; the work happens on collect.
    LAZY = "lazy"
    # Process the data in a streaming fashion.
    STREAMING = "streaming"
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class PartitionStrategy(str, Enum):
    """Available ways of splitting data into partitions."""

    # Hash-based row partitioning.
    ROW_HASH = "row_hash"
    # Range-based row partitioning.
    ROW_RANGE = "row_range"
    # Partition by columns.
    COLUMN = "column"
    # Round-robin distribution of rows.
    ROUND_ROBIN = "round_robin"
    # Partitioning driven by a custom function.
    CUSTOM = "custom"
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class AggregationScope(str, Enum):
    """Level at which an aggregation is computed."""

    # Aggregate across all partitions.
    GLOBAL = "global"
    # Aggregate within a single partition.
    PARTITION = "partition"
    # Aggregate per column.
    COLUMN = "column"
    # Aggregate by group key.
    GROUPED = "grouped"
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class ComputeBackend(str, Enum):
    """Identifiers of the supported distributed compute backends."""

    SPARK = "spark"
    DASK = "dask"
    RAY = "ray"
    LOCAL = "local"
    # NOTE(review): presumably "select a backend automatically" - confirm
    # against the code that resolves this value.
    AUTO = "auto"
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
# =============================================================================
|
|
70
|
+
# Data Classes
|
|
71
|
+
# =============================================================================
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
@dataclass(frozen=True)
class PartitionInfo:
    """Immutable descriptor of a single data partition.

    Attributes:
        partition_id: Unique identifier of the partition.
        total_partitions: Total number of partitions.
        row_start: Index of the first row (inclusive).
        row_end: Index one past the last row (exclusive).
        columns: Names of the columns held by the partition.
        size_bytes: Estimated partition size in bytes.
        host: Host on which the partition resides.
        metadata: Additional free-form partition metadata.
    """

    partition_id: int
    total_partitions: int
    row_start: int = 0
    row_end: int = 0
    columns: tuple[str, ...] = field(default_factory=tuple)
    size_bytes: int = 0
    host: str = ""
    metadata: dict[str, Any] = field(default_factory=dict)

    @property
    def row_count(self) -> int:
        """Return the length of the half-open range ``[row_start, row_end)``."""
        first, past_last = self.row_start, self.row_end
        return past_last - first
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
@dataclass
class DistributedResult:
    """Outcome of one distributed operation.

    Attributes:
        partition_id: ID of the partition the result came from.
        operation: Name of the operation that produced the result.
        value: The computed value.
        row_count: Number of rows processed.
        duration_ms: Processing time in milliseconds.
        errors: Errors encountered while processing.
        warnings: Warnings raised while processing.
        metadata: Additional result metadata.
    """

    partition_id: int
    operation: str
    value: Any
    row_count: int = 0
    duration_ms: float = 0.0
    errors: list[str] = field(default_factory=list)
    warnings: list[str] = field(default_factory=list)
    metadata: dict[str, Any] = field(default_factory=dict)

    @property
    def success(self) -> bool:
        """Return ``True`` when no errors were recorded."""
        return not self.errors
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
@dataclass
class DistributedAggregation:
    """Description of a single aggregation over one column.

    Attributes:
        column: Column to aggregate.
        operation: Name of the aggregation ("count", "sum", "mean", "min",
            "max", "std", "var", etc.).
        params: Extra parameters for the operation.
        alias: Name of the result column; derived from ``column`` and
            ``operation`` when left empty.
    """

    column: str
    operation: str
    params: dict[str, Any] = field(default_factory=dict)
    alias: str = ""

    def __post_init__(self) -> None:
        """Fill in the default alias ``"<column>_<operation>"`` if none was given."""
        if self.alias:
            return
        self.alias = f"{self.column}_{self.operation}"
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
@dataclass
class AggregationSpec:
    """A batch of aggregations together with grouping keys and a scope.

    Attributes:
        aggregations: The individual aggregations, in insertion order.
        group_by: Names of the grouping columns.
        scope: Scope of the aggregation; defaults to ``AggregationScope.GLOBAL``.
    """

    aggregations: list[DistributedAggregation] = field(default_factory=list)
    group_by: list[str] = field(default_factory=list)
    scope: AggregationScope = AggregationScope.GLOBAL

    def add(
        self,
        column: str,
        operation: str,
        alias: str = "",
        **params: Any,
    ) -> "AggregationSpec":
        """Append one aggregation and return ``self`` so calls can be chained.

        Args:
            column: Column to aggregate.
            operation: Aggregation operation name.
            alias: Result column alias; empty lets the aggregation pick
                its default.
            **params: Extra keyword parameters stored on the aggregation.

        Returns:
            This spec, after the new aggregation has been appended.
        """
        entry = DistributedAggregation(
            column=column,
            operation=operation,
            params=params,
            alias=alias,
        )
        self.aggregations.append(entry)
        return self
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
# =============================================================================
|
|
183
|
+
# Protocols
|
|
184
|
+
# =============================================================================
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
# Generic type parameters shared by the generic protocol and base classes
# declared in this module.
T = TypeVar("T")
ResultT = TypeVar("ResultT")
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
@runtime_checkable
class DistributedDataProtocol(Protocol):
    """Structural interface for a distributed data set.

    Abstracts over Spark DataFrames, Dask DataFrames, Ray Datasets, etc.
    """

    @property
    def columns(self) -> list[str]:
        """Column names of the data set."""

    @property
    def num_partitions(self) -> int:
        """Number of partitions the data set is split into."""

    def repartition(self, num_partitions: int) -> "DistributedDataProtocol":
        """Repartition the data into ``num_partitions`` partitions."""

    def get_partition_info(self) -> list[PartitionInfo]:
        """Return information about all partitions."""
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
@runtime_checkable
class MapReduceProtocol(Protocol[T, ResultT]):
    """Structural interface for map-reduce style operations.

    Type Parameters:
        T: Input type for the map function.
        ResultT: Output type of mapping and of the reduce function.
    """

    def map_partitions(
        self,
        func: Callable[[Iterator[T]], Iterator[ResultT]],
    ) -> "MapReduceProtocol[ResultT, ResultT]":
        """Apply ``func`` to each partition."""

    def reduce(
        self,
        func: Callable[[ResultT, ResultT], ResultT],
    ) -> ResultT:
        """Reduce all partitions to a single value using ``func``."""

    def collect(self) -> list[ResultT]:
        """Collect all results to the driver as a list."""
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
@runtime_checkable
class DistributedAggregatorProtocol(Protocol):
    """Structural interface for distributed aggregation operations.

    Aggregators must support:
    - Partial aggregation (per-partition)
    - Final aggregation (cross-partition merge)
    - Incremental updates
    """

    def initialize(self) -> Any:
        """Create the initial accumulator state."""

    def accumulate(self, state: Any, value: Any) -> Any:
        """Add ``value`` to the accumulator ``state``."""

    def merge(self, state1: Any, state2: Any) -> Any:
        """Merge two accumulator states."""

    def finalize(self, state: Any) -> Any:
        """Turn the accumulator ``state`` into the final result."""
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
@runtime_checkable
class DistributedBackendProtocol(Protocol):
    """Structural interface for distributed computing backends.

    All distributed backends (Spark, Dask, Ray) must implement this
    protocol to be usable with Truthound's distributed execution framework.
    """

    @property
    def backend_type(self) -> ComputeBackend:
        """The kind of backend this object represents."""

    @property
    def is_available(self) -> bool:
        """Whether the backend is available."""

    def initialize(self) -> None:
        """Initialize the backend (connect, start cluster, etc.)."""

    def shutdown(self) -> None:
        """Shut the backend down (disconnect, stop cluster, etc.)."""

    def distribute_data(
        self,
        data: Any,
        num_partitions: int | None = None,
        strategy: PartitionStrategy = PartitionStrategy.ROW_HASH,
    ) -> DistributedDataProtocol:
        """Distribute ``data`` across the cluster.

        Args:
            data: The data to distribute.
            num_partitions: Requested partition count, or ``None``.
            strategy: Partitioning strategy; defaults to ``ROW_HASH``.
        """

    def map_partitions(
        self,
        data: DistributedDataProtocol,
        func: Callable[[Any], DistributedResult],
    ) -> list[DistributedResult]:
        """Execute ``func`` on each partition of ``data``."""

    def aggregate(
        self,
        data: DistributedDataProtocol,
        spec: AggregationSpec,
    ) -> dict[str, Any]:
        """Perform the distributed aggregation described by ``spec``."""

    def collect(
        self,
        data: DistributedDataProtocol,
        limit: int | None = None,
    ) -> Any:
        """Collect distributed ``data`` to local, optionally given a ``limit``."""
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
@runtime_checkable
class ArrowConvertibleProtocol(Protocol):
    """Structural interface for types convertible to and from Arrow.

    Arrow is used as the zero-copy interchange format between different
    compute backends.
    """

    def to_arrow(self) -> "pa.Table":
        """Convert this object to a PyArrow Table."""

    @classmethod
    def from_arrow(cls, table: "pa.Table") -> Any:
        """Create an object from a PyArrow Table."""
|
|
350
|
+
|
|
351
|
+
|
|
352
|
+
@runtime_checkable
class DistributedExecutionProtocol(Protocol):
    """Structural interface for distributed execution engines.

    Execution engines provide the high-level interface for running
    validations in a distributed manner.
    """

    @property
    def backend(self) -> DistributedBackendProtocol:
        """The underlying backend."""

    def count_rows(self) -> int:
        """Count the total number of rows (distributed)."""

    def count_nulls(self, column: str) -> int:
        """Count nulls in ``column`` (distributed)."""

    def count_nulls_all(self) -> dict[str, int]:
        """Count nulls in all columns (distributed)."""

    def count_distinct(self, column: str) -> int:
        """Count distinct values in ``column`` (distributed)."""

    def get_stats(self, column: str) -> dict[str, Any]:
        """Get statistics for ``column`` (distributed)."""

    def aggregate(self, spec: AggregationSpec) -> dict[str, Any]:
        """Perform the distributed aggregation described by ``spec``."""

    def to_polars_lazyframe(self) -> "pl.LazyFrame":
        """Convert to a Polars LazyFrame (via Arrow)."""
|
|
392
|
+
|
|
393
|
+
|
|
394
|
+
# =============================================================================
|
|
395
|
+
# Abstract Base Classes
|
|
396
|
+
# =============================================================================
|
|
397
|
+
|
|
398
|
+
|
|
399
|
+
class BaseAggregator(ABC, Generic[T]):
    """Abstract base class for distributed aggregators.

    Aggregators implement the map-reduce pattern for computing aggregate
    statistics across partitions: a state is initialized, fed values,
    merged with other states and finally turned into a result.

    Type Parameters:
        T: Type of the accumulated state.
    """

    # Short identifier of the aggregator; subclasses override it.
    name: str = "base"

    @abstractmethod
    def initialize(self) -> T:
        """Create the initial accumulator state."""

    @abstractmethod
    def accumulate(self, state: T, value: Any) -> T:
        """Add ``value`` to the accumulator ``state``."""

    @abstractmethod
    def merge(self, state1: T, state2: T) -> T:
        """Merge two accumulator states."""

    @abstractmethod
    def finalize(self, state: T) -> Any:
        """Convert the accumulator ``state`` to the final result."""
|
|
430
|
+
|
|
431
|
+
|
|
432
|
+
# =============================================================================
|
|
433
|
+
# Built-in Aggregators
|
|
434
|
+
# =============================================================================
|
|
435
|
+
|
|
436
|
+
|
|
437
|
+
@dataclass
class CountState:
    """Accumulator state of the count aggregator."""

    # Running count; starts at zero.
    count: int = 0
|
|
442
|
+
|
|
443
|
+
|
|
444
|
+
class CountAggregator(BaseAggregator[CountState]):
    """Distributed aggregator that counts accumulated values."""

    name = "count"

    def initialize(self) -> CountState:
        """Return a fresh state with the default (zero) count."""
        return CountState()

    def accumulate(self, state: CountState, value: Any) -> CountState:
        """Increment the count in place and return ``state``.

        ``value`` is not inspected, so ``None`` values are counted too.
        """
        state.count = state.count + 1
        return state

    def merge(self, state1: CountState, state2: CountState) -> CountState:
        """Return a new state holding the sum of both counts."""
        combined = state1.count + state2.count
        return CountState(count=combined)

    def finalize(self, state: CountState) -> int:
        """Return the accumulated count."""
        return state.count
|
|
461
|
+
|
|
462
|
+
|
|
463
|
+
@dataclass
class SumState:
    """Accumulator state of the sum aggregator."""

    # Running total; starts at 0.0.
    total: float = 0.0
|
|
468
|
+
|
|
469
|
+
|
|
470
|
+
class SumAggregator(BaseAggregator[SumState]):
    """Distributed aggregator that sums accumulated values."""

    name = "sum"

    def initialize(self) -> SumState:
        """Return a fresh state with the default (0.0) total."""
        return SumState()

    def accumulate(self, state: SumState, value: Any) -> SumState:
        """Add ``float(value)`` to the total in place; ``None`` is skipped."""
        if value is None:
            return state
        state.total += float(value)
        return state

    def merge(self, state1: SumState, state2: SumState) -> SumState:
        """Return a new state holding the sum of both totals."""
        combined = state1.total + state2.total
        return SumState(total=combined)

    def finalize(self, state: SumState) -> float:
        """Return the accumulated total."""
        return state.total
|
|
488
|
+
|
|
489
|
+
|
|
490
|
+
@dataclass
class MeanState:
    """Accumulator state of the mean aggregator (Welford's online algorithm)."""

    # Number of values accumulated so far.
    count: int = 0
    # Running mean of the accumulated values.
    mean: float = 0.0
    # Sum of squared differences from the mean.
    m2: float = 0.0
|
|
497
|
+
|
|
498
|
+
|
|
499
|
+
class MeanAggregator(BaseAggregator[MeanState]):
    """Distributed mean aggregator using parallel Welford's algorithm."""

    name = "mean"

    def initialize(self) -> MeanState:
        """Return a fresh, empty state."""
        return MeanState()

    def accumulate(self, state: MeanState, value: Any) -> MeanState:
        """Fold ``float(value)`` into ``state`` in place; ``None`` is skipped.

        Returns:
            The same ``state`` object, updated.
        """
        if value is None:
            return state
        x = float(value)
        state.count += 1
        delta = x - state.mean
        state.mean += delta / state.count
        # The second residual deliberately uses the *updated* mean.
        delta2 = x - state.mean
        state.m2 += delta * delta2
        return state

    def merge(self, state1: MeanState, state2: MeanState) -> MeanState:
        """Merge two states using parallel Welford's algorithm.

        Always returns a new ``MeanState``. ``accumulate`` mutates its
        state in place, so handing back one of the inputs would let later
        accumulation into the merged result modify that input as well.
        """
        if state1.count == 0:
            return MeanState(count=state2.count, mean=state2.mean, m2=state2.m2)
        if state2.count == 0:
            return MeanState(count=state1.count, mean=state1.mean, m2=state1.m2)

        count = state1.count + state2.count
        delta = state2.mean - state1.mean
        mean = state1.mean + delta * state2.count / count
        m2 = (
            state1.m2
            + state2.m2
            + delta**2 * state1.count * state2.count / count
        )
        return MeanState(count=count, mean=mean, m2=m2)

    def finalize(self, state: MeanState) -> float:
        """Return the mean, or ``0.0`` when nothing was accumulated."""
        return state.mean if state.count > 0 else 0.0
|
|
537
|
+
|
|
538
|
+
|
|
539
|
+
@dataclass
class StdState(MeanState):
    """Accumulator state of the standard deviation aggregator.

    Adds no fields of its own; everything is inherited from ``MeanState``.
    """
|
|
544
|
+
|
|
545
|
+
|
|
546
|
+
class StdAggregator(BaseAggregator[StdState]):
    """Distributed standard deviation aggregator.

    Args:
        ddof: Delta degrees of freedom; the variance divisor is
            ``count - ddof``. Defaults to 1.
    """

    name = "std"

    def __init__(self, ddof: int = 1) -> None:
        self.ddof = ddof  # Delta degrees of freedom

    def initialize(self) -> StdState:
        """Return a fresh, empty state."""
        return StdState()

    def accumulate(self, state: StdState, value: Any) -> StdState:
        """Fold ``float(value)`` into ``state`` in place; ``None`` is skipped.

        Returns:
            The same ``state`` object, updated.
        """
        if value is None:
            return state
        x = float(value)
        state.count += 1
        delta = x - state.mean
        state.mean += delta / state.count
        # The second residual deliberately uses the *updated* mean.
        delta2 = x - state.mean
        state.m2 += delta * delta2
        return state

    def merge(self, state1: StdState, state2: StdState) -> StdState:
        """Merge two states by combining their count, mean and ``m2``.

        Always returns a new ``StdState``. ``accumulate`` mutates its
        state in place, so handing back one of the inputs would let later
        accumulation into the merged result modify that input as well.
        """
        if state1.count == 0:
            return StdState(count=state2.count, mean=state2.mean, m2=state2.m2)
        if state2.count == 0:
            return StdState(count=state1.count, mean=state1.mean, m2=state1.m2)

        count = state1.count + state2.count
        delta = state2.mean - state1.mean
        mean = state1.mean + delta * state2.count / count
        m2 = (
            state1.m2
            + state2.m2
            + delta**2 * state1.count * state2.count / count
        )
        return StdState(count=count, mean=mean, m2=m2)

    def finalize(self, state: StdState) -> float:
        """Return the standard deviation.

        Returns ``0.0`` when ``count <= ddof``, i.e. when the divisor
        ``count - ddof`` would not be positive.
        """
        if state.count <= self.ddof:
            return 0.0
        variance = state.m2 / (state.count - self.ddof)
        return variance**0.5
|
|
589
|
+
|
|
590
|
+
|
|
591
|
+
@dataclass
class MinMaxState:
    """State for min/max aggregator.

    Both fields default to ``None``, which marks "nothing recorded yet".
    """

    # Smallest value recorded so far, or None when nothing has been recorded.
    min_value: float | None = None
    # Largest value recorded so far, or None when nothing has been recorded.
    max_value: float | None = None
|
|
597
|
+
|
|
598
|
+
|
|
599
|
+
class MinMaxAggregator(BaseAggregator[MinMaxState]):
    """Distributed min/max aggregator.

    Tracks the smallest and largest non-null value seen, each stored as a
    float. A bound of ``None`` means no value has been recorded for it.
    """

    name = "minmax"

    def initialize(self) -> MinMaxState:
        """Return an empty state with both bounds unset."""
        return MinMaxState()

    def accumulate(self, state: MinMaxState, value: Any) -> MinMaxState:
        """Fold one value into ``state`` in place and return it.

        ``None`` is skipped; any other value is converted with ``float()``
        and widens the recorded bounds if it lies outside them.
        """
        if value is None:
            return state
        x = float(value)
        if state.min_value is None or x < state.min_value:
            state.min_value = x
        if state.max_value is None or x > state.max_value:
            state.max_value = x
        return state

    def merge(self, state1: MinMaxState, state2: MinMaxState) -> MinMaxState:
        """Combine two partial states into a new ``MinMaxState``.

        A bound that is ``None`` on one side is taken from the other side;
        when both sides have it, the more extreme of the two is kept.
        """
        return MinMaxState(
            min_value=self._combine(min, state1.min_value, state2.min_value),
            max_value=self._combine(max, state1.max_value, state2.max_value),
        )

    @staticmethod
    def _combine(
        chooser: Any, left: float | None, right: float | None
    ) -> float | None:
        """Pick between two optional bounds using ``chooser`` (``min``/``max``).

        The checks are explicitly ``is None`` rather than truthiness so that
        a legitimate bound of ``0.0`` is never mistaken for "missing".
        """
        if left is None:
            return right
        if right is None:
            return left
        return chooser(left, right)

    def finalize(self, state: MinMaxState) -> dict[str, float | None]:
        """Return the bounds as ``{"min": ..., "max": ...}``."""
        return {"min": state.min_value, "max": state.max_value}
|
|
635
|
+
|
|
636
|
+
|
|
637
|
+
@dataclass
class NullCountState:
    """State for null count aggregator."""

    # Number of accumulated values that were None.
    null_count: int = 0
    # Number of accumulated values overall, None included.
    total_count: int = 0
|
|
643
|
+
|
|
644
|
+
|
|
645
|
+
class NullCountAggregator(BaseAggregator[NullCountState]):
    """Distributed null count aggregator.

    Counts how many accumulated values were ``None`` alongside the total
    number of accumulated values.
    """

    name = "null_count"

    def initialize(self) -> NullCountState:
        """Return a state with both counters at zero."""
        return NullCountState()

    def accumulate(self, state: NullCountState, value: Any) -> NullCountState:
        """Record one value in ``state`` in place and return it.

        Every call bumps ``total_count``; only ``None`` bumps ``null_count``.
        """
        if value is None:
            state.null_count += 1
        state.total_count += 1
        return state

    def merge(self, state1: NullCountState, state2: NullCountState) -> NullCountState:
        """Return a new state whose counters are the sums of both inputs."""
        nulls = state1.null_count + state2.null_count
        total = state1.total_count + state2.total_count
        return NullCountState(null_count=nulls, total_count=total)

    def finalize(self, state: NullCountState) -> dict[str, int]:
        """Return both counters as a ``null_count`` / ``total_count`` dict."""
        summary = {
            "null_count": state.null_count,
            "total_count": state.total_count,
        }
        return summary
|
|
667
|
+
|
|
668
|
+
|
|
669
|
+
@dataclass
class DistinctState:
    """State for the distinct count aggregator.

    This is a plain in-memory set, not a HyperLogLog sketch.
    """

    # Simplified version - for production use HyperLogLog
    # One entry per distinct value recorded; populated by the aggregator.
    seen: set = field(default_factory=set)
    # Running tally maintained by the aggregator; summed when states merge.
    count: int = 0
|
|
676
|
+
|
|
677
|
+
|
|
678
|
+
class DistinctCountAggregator(BaseAggregator[DistinctState]):
    """Distributed distinct count aggregator.

    Keeps one set entry per distinct non-null value and reports the size of
    that set. A state stops accepting new entries once its set holds
    ``max_sample`` items, so for high-cardinality data the result is a lower
    bound rather than an exact count.

    Note: For very large cardinalities, consider using HyperLogLog.
    """

    name = "distinct_count"

    def __init__(self, max_sample: int = 100_000) -> None:
        """Remember the per-state cap.

        Args:
            max_sample: Maximum number of entries ``accumulate`` will let a
                state's ``seen`` set grow to.
        """
        self.max_sample = max_sample

    def initialize(self) -> DistinctState:
        """Return an empty state."""
        return DistinctState()

    def accumulate(self, state: DistinctState, value: Any) -> DistinctState:
        """Record one value in ``state`` in place and return it.

        ``None`` is never recorded, and nothing is recorded once the set has
        reached ``max_sample`` entries.
        """
        if value is not None and len(state.seen) < self.max_sample:
            # Store the value itself rather than ``hash(value)``: builtin
            # hashes collide (CPython has hash(-1) == hash(-2)) and str/bytes
            # hashes are salted per process, so hashes recorded by different
            # worker processes cannot be compared after a merge. This costs
            # more memory for large values; the cap above bounds it.
            try:
                state.seen.add(value)
            except TypeError:
                # Unhashable values (lists, dicts, ...) are keyed by their text.
                state.seen.add(str(value))
        # NOTE(review): the original nesting of this statement could not be
        # recovered from the flattened source; it is assumed to run for every
        # call. Confirm against the upstream file.
        state.count += 1
        return state

    def merge(self, state1: DistinctState, state2: DistinctState) -> DistinctState:
        """Return a new state holding the union of both sets and summed counts.

        The union is not truncated to ``max_sample``.
        """
        return DistinctState(
            seen=state1.seen | state2.seen,
            count=state1.count + state2.count,
        )

    def finalize(self, state: DistinctState) -> int:
        """Return the number of distinct entries recorded in ``state``."""
        return len(state.seen)
|
|
710
|
+
|
|
711
|
+
|
|
712
|
+
# =============================================================================
|
|
713
|
+
# Aggregator Registry
|
|
714
|
+
# =============================================================================
|
|
715
|
+
|
|
716
|
+
|
|
717
|
+
# Aggregator classes keyed by the name that ``get_aggregator`` looks up.
# ``register_aggregator`` writes into this same dict at runtime.
AGGREGATOR_REGISTRY: dict[str, type[BaseAggregator]] = {
    "count": CountAggregator,
    "sum": SumAggregator,
    "mean": MeanAggregator,
    "std": StdAggregator,
    "minmax": MinMaxAggregator,
    "null_count": NullCountAggregator,
    "distinct_count": DistinctCountAggregator,
}
|
|
726
|
+
|
|
727
|
+
|
|
728
|
+
def get_aggregator(name: str, **kwargs: Any) -> BaseAggregator:
    """Instantiate the aggregator registered under ``name``.

    Args:
        name: Key to look up in ``AGGREGATOR_REGISTRY``.
        **kwargs: Forwarded unchanged to the aggregator class constructor.

    Returns:
        A new instance of the registered aggregator class.

    Raises:
        KeyError: If ``name`` is not registered; the message lists the
            names that are.
    """
    if name in AGGREGATOR_REGISTRY:
        factory = AGGREGATOR_REGISTRY[name]
        return factory(**kwargs)

    known = list(AGGREGATOR_REGISTRY.keys())
    raise KeyError(f"Unknown aggregator: {name}. Available: {known}")
|
|
747
|
+
|
|
748
|
+
|
|
749
|
+
def register_aggregator(name: str, aggregator_class: type[BaseAggregator]) -> None:
    """Register a custom aggregator.

    The class is stored in ``AGGREGATOR_REGISTRY`` under ``name``. An entry
    already stored under that name is replaced without warning.

    Args:
        name: Name to register under.
        aggregator_class: Aggregator class to register.
    """
    # Plain dict assignment: no validation and no duplicate check.
    AGGREGATOR_REGISTRY[name] = aggregator_class
|