truthound 1.0.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- truthound/__init__.py +162 -0
- truthound/adapters.py +100 -0
- truthound/api.py +365 -0
- truthound/audit/__init__.py +248 -0
- truthound/audit/core.py +967 -0
- truthound/audit/filters.py +620 -0
- truthound/audit/formatters.py +707 -0
- truthound/audit/logger.py +902 -0
- truthound/audit/middleware.py +571 -0
- truthound/audit/storage.py +1083 -0
- truthound/benchmark/__init__.py +123 -0
- truthound/benchmark/base.py +757 -0
- truthound/benchmark/comparison.py +635 -0
- truthound/benchmark/generators.py +706 -0
- truthound/benchmark/reporters.py +718 -0
- truthound/benchmark/runner.py +635 -0
- truthound/benchmark/scenarios.py +712 -0
- truthound/cache.py +252 -0
- truthound/checkpoint/__init__.py +136 -0
- truthound/checkpoint/actions/__init__.py +164 -0
- truthound/checkpoint/actions/base.py +324 -0
- truthound/checkpoint/actions/custom.py +234 -0
- truthound/checkpoint/actions/discord_notify.py +290 -0
- truthound/checkpoint/actions/email_notify.py +405 -0
- truthound/checkpoint/actions/github_action.py +406 -0
- truthound/checkpoint/actions/opsgenie.py +1499 -0
- truthound/checkpoint/actions/pagerduty.py +226 -0
- truthound/checkpoint/actions/slack_notify.py +233 -0
- truthound/checkpoint/actions/store_result.py +249 -0
- truthound/checkpoint/actions/teams_notify.py +1570 -0
- truthound/checkpoint/actions/telegram_notify.py +419 -0
- truthound/checkpoint/actions/update_docs.py +552 -0
- truthound/checkpoint/actions/webhook.py +293 -0
- truthound/checkpoint/analytics/__init__.py +147 -0
- truthound/checkpoint/analytics/aggregations/__init__.py +23 -0
- truthound/checkpoint/analytics/aggregations/rollup.py +481 -0
- truthound/checkpoint/analytics/aggregations/time_bucket.py +306 -0
- truthound/checkpoint/analytics/analyzers/__init__.py +17 -0
- truthound/checkpoint/analytics/analyzers/anomaly.py +386 -0
- truthound/checkpoint/analytics/analyzers/base.py +270 -0
- truthound/checkpoint/analytics/analyzers/forecast.py +421 -0
- truthound/checkpoint/analytics/analyzers/trend.py +314 -0
- truthound/checkpoint/analytics/models.py +292 -0
- truthound/checkpoint/analytics/protocols.py +549 -0
- truthound/checkpoint/analytics/service.py +718 -0
- truthound/checkpoint/analytics/stores/__init__.py +16 -0
- truthound/checkpoint/analytics/stores/base.py +306 -0
- truthound/checkpoint/analytics/stores/memory_store.py +353 -0
- truthound/checkpoint/analytics/stores/sqlite_store.py +557 -0
- truthound/checkpoint/analytics/stores/timescale_store.py +501 -0
- truthound/checkpoint/async_actions.py +794 -0
- truthound/checkpoint/async_base.py +708 -0
- truthound/checkpoint/async_checkpoint.py +617 -0
- truthound/checkpoint/async_runner.py +639 -0
- truthound/checkpoint/checkpoint.py +527 -0
- truthound/checkpoint/ci/__init__.py +61 -0
- truthound/checkpoint/ci/detector.py +355 -0
- truthound/checkpoint/ci/reporter.py +436 -0
- truthound/checkpoint/ci/templates.py +454 -0
- truthound/checkpoint/circuitbreaker/__init__.py +133 -0
- truthound/checkpoint/circuitbreaker/breaker.py +542 -0
- truthound/checkpoint/circuitbreaker/core.py +252 -0
- truthound/checkpoint/circuitbreaker/detection.py +459 -0
- truthound/checkpoint/circuitbreaker/middleware.py +389 -0
- truthound/checkpoint/circuitbreaker/registry.py +357 -0
- truthound/checkpoint/distributed/__init__.py +139 -0
- truthound/checkpoint/distributed/backends/__init__.py +35 -0
- truthound/checkpoint/distributed/backends/celery_backend.py +503 -0
- truthound/checkpoint/distributed/backends/kubernetes_backend.py +696 -0
- truthound/checkpoint/distributed/backends/local_backend.py +397 -0
- truthound/checkpoint/distributed/backends/ray_backend.py +625 -0
- truthound/checkpoint/distributed/base.py +774 -0
- truthound/checkpoint/distributed/orchestrator.py +765 -0
- truthound/checkpoint/distributed/protocols.py +842 -0
- truthound/checkpoint/distributed/registry.py +449 -0
- truthound/checkpoint/idempotency/__init__.py +120 -0
- truthound/checkpoint/idempotency/core.py +295 -0
- truthound/checkpoint/idempotency/fingerprint.py +454 -0
- truthound/checkpoint/idempotency/locking.py +604 -0
- truthound/checkpoint/idempotency/service.py +592 -0
- truthound/checkpoint/idempotency/stores.py +653 -0
- truthound/checkpoint/monitoring/__init__.py +134 -0
- truthound/checkpoint/monitoring/aggregators/__init__.py +15 -0
- truthound/checkpoint/monitoring/aggregators/base.py +372 -0
- truthound/checkpoint/monitoring/aggregators/realtime.py +300 -0
- truthound/checkpoint/monitoring/aggregators/window.py +493 -0
- truthound/checkpoint/monitoring/collectors/__init__.py +17 -0
- truthound/checkpoint/monitoring/collectors/base.py +257 -0
- truthound/checkpoint/monitoring/collectors/memory_collector.py +617 -0
- truthound/checkpoint/monitoring/collectors/prometheus_collector.py +451 -0
- truthound/checkpoint/monitoring/collectors/redis_collector.py +518 -0
- truthound/checkpoint/monitoring/events.py +410 -0
- truthound/checkpoint/monitoring/protocols.py +636 -0
- truthound/checkpoint/monitoring/service.py +578 -0
- truthound/checkpoint/monitoring/views/__init__.py +17 -0
- truthound/checkpoint/monitoring/views/base.py +172 -0
- truthound/checkpoint/monitoring/views/queue_view.py +220 -0
- truthound/checkpoint/monitoring/views/task_view.py +240 -0
- truthound/checkpoint/monitoring/views/worker_view.py +263 -0
- truthound/checkpoint/registry.py +337 -0
- truthound/checkpoint/runner.py +356 -0
- truthound/checkpoint/transaction/__init__.py +133 -0
- truthound/checkpoint/transaction/base.py +389 -0
- truthound/checkpoint/transaction/compensatable.py +537 -0
- truthound/checkpoint/transaction/coordinator.py +576 -0
- truthound/checkpoint/transaction/executor.py +622 -0
- truthound/checkpoint/transaction/idempotency.py +534 -0
- truthound/checkpoint/transaction/saga/__init__.py +143 -0
- truthound/checkpoint/transaction/saga/builder.py +584 -0
- truthound/checkpoint/transaction/saga/definition.py +515 -0
- truthound/checkpoint/transaction/saga/event_store.py +542 -0
- truthound/checkpoint/transaction/saga/patterns.py +833 -0
- truthound/checkpoint/transaction/saga/runner.py +718 -0
- truthound/checkpoint/transaction/saga/state_machine.py +793 -0
- truthound/checkpoint/transaction/saga/strategies.py +780 -0
- truthound/checkpoint/transaction/saga/testing.py +886 -0
- truthound/checkpoint/triggers/__init__.py +58 -0
- truthound/checkpoint/triggers/base.py +237 -0
- truthound/checkpoint/triggers/event.py +385 -0
- truthound/checkpoint/triggers/schedule.py +355 -0
- truthound/cli.py +2358 -0
- truthound/cli_modules/__init__.py +124 -0
- truthound/cli_modules/advanced/__init__.py +45 -0
- truthound/cli_modules/advanced/benchmark.py +343 -0
- truthound/cli_modules/advanced/docs.py +225 -0
- truthound/cli_modules/advanced/lineage.py +209 -0
- truthound/cli_modules/advanced/ml.py +320 -0
- truthound/cli_modules/advanced/realtime.py +196 -0
- truthound/cli_modules/checkpoint/__init__.py +46 -0
- truthound/cli_modules/checkpoint/init.py +114 -0
- truthound/cli_modules/checkpoint/list.py +71 -0
- truthound/cli_modules/checkpoint/run.py +159 -0
- truthound/cli_modules/checkpoint/validate.py +67 -0
- truthound/cli_modules/common/__init__.py +71 -0
- truthound/cli_modules/common/errors.py +414 -0
- truthound/cli_modules/common/options.py +419 -0
- truthound/cli_modules/common/output.py +507 -0
- truthound/cli_modules/common/protocol.py +552 -0
- truthound/cli_modules/core/__init__.py +48 -0
- truthound/cli_modules/core/check.py +123 -0
- truthound/cli_modules/core/compare.py +104 -0
- truthound/cli_modules/core/learn.py +57 -0
- truthound/cli_modules/core/mask.py +77 -0
- truthound/cli_modules/core/profile.py +65 -0
- truthound/cli_modules/core/scan.py +61 -0
- truthound/cli_modules/profiler/__init__.py +51 -0
- truthound/cli_modules/profiler/auto_profile.py +175 -0
- truthound/cli_modules/profiler/metadata.py +107 -0
- truthound/cli_modules/profiler/suite.py +283 -0
- truthound/cli_modules/registry.py +431 -0
- truthound/cli_modules/scaffolding/__init__.py +89 -0
- truthound/cli_modules/scaffolding/base.py +631 -0
- truthound/cli_modules/scaffolding/commands.py +545 -0
- truthound/cli_modules/scaffolding/plugins.py +1072 -0
- truthound/cli_modules/scaffolding/reporters.py +594 -0
- truthound/cli_modules/scaffolding/validators.py +1127 -0
- truthound/common/__init__.py +18 -0
- truthound/common/resilience/__init__.py +130 -0
- truthound/common/resilience/bulkhead.py +266 -0
- truthound/common/resilience/circuit_breaker.py +516 -0
- truthound/common/resilience/composite.py +332 -0
- truthound/common/resilience/config.py +292 -0
- truthound/common/resilience/protocols.py +217 -0
- truthound/common/resilience/rate_limiter.py +404 -0
- truthound/common/resilience/retry.py +341 -0
- truthound/datadocs/__init__.py +260 -0
- truthound/datadocs/base.py +571 -0
- truthound/datadocs/builder.py +761 -0
- truthound/datadocs/charts.py +764 -0
- truthound/datadocs/dashboard/__init__.py +63 -0
- truthound/datadocs/dashboard/app.py +576 -0
- truthound/datadocs/dashboard/components.py +584 -0
- truthound/datadocs/dashboard/state.py +240 -0
- truthound/datadocs/engine/__init__.py +46 -0
- truthound/datadocs/engine/context.py +376 -0
- truthound/datadocs/engine/pipeline.py +618 -0
- truthound/datadocs/engine/registry.py +469 -0
- truthound/datadocs/exporters/__init__.py +49 -0
- truthound/datadocs/exporters/base.py +198 -0
- truthound/datadocs/exporters/html.py +178 -0
- truthound/datadocs/exporters/json_exporter.py +253 -0
- truthound/datadocs/exporters/markdown.py +284 -0
- truthound/datadocs/exporters/pdf.py +392 -0
- truthound/datadocs/i18n/__init__.py +86 -0
- truthound/datadocs/i18n/catalog.py +960 -0
- truthound/datadocs/i18n/formatting.py +505 -0
- truthound/datadocs/i18n/loader.py +256 -0
- truthound/datadocs/i18n/plurals.py +378 -0
- truthound/datadocs/renderers/__init__.py +42 -0
- truthound/datadocs/renderers/base.py +401 -0
- truthound/datadocs/renderers/custom.py +342 -0
- truthound/datadocs/renderers/jinja.py +697 -0
- truthound/datadocs/sections.py +736 -0
- truthound/datadocs/styles.py +931 -0
- truthound/datadocs/themes/__init__.py +101 -0
- truthound/datadocs/themes/base.py +336 -0
- truthound/datadocs/themes/default.py +417 -0
- truthound/datadocs/themes/enterprise.py +419 -0
- truthound/datadocs/themes/loader.py +336 -0
- truthound/datadocs/themes.py +301 -0
- truthound/datadocs/transformers/__init__.py +57 -0
- truthound/datadocs/transformers/base.py +268 -0
- truthound/datadocs/transformers/enrichers.py +544 -0
- truthound/datadocs/transformers/filters.py +447 -0
- truthound/datadocs/transformers/i18n.py +468 -0
- truthound/datadocs/versioning/__init__.py +62 -0
- truthound/datadocs/versioning/diff.py +639 -0
- truthound/datadocs/versioning/storage.py +497 -0
- truthound/datadocs/versioning/version.py +358 -0
- truthound/datasources/__init__.py +223 -0
- truthound/datasources/_async_protocols.py +222 -0
- truthound/datasources/_protocols.py +159 -0
- truthound/datasources/adapters.py +428 -0
- truthound/datasources/async_base.py +599 -0
- truthound/datasources/async_factory.py +511 -0
- truthound/datasources/base.py +516 -0
- truthound/datasources/factory.py +433 -0
- truthound/datasources/nosql/__init__.py +47 -0
- truthound/datasources/nosql/base.py +487 -0
- truthound/datasources/nosql/elasticsearch.py +801 -0
- truthound/datasources/nosql/mongodb.py +636 -0
- truthound/datasources/pandas_optimized.py +582 -0
- truthound/datasources/pandas_source.py +216 -0
- truthound/datasources/polars_source.py +395 -0
- truthound/datasources/spark_source.py +479 -0
- truthound/datasources/sql/__init__.py +154 -0
- truthound/datasources/sql/base.py +710 -0
- truthound/datasources/sql/bigquery.py +410 -0
- truthound/datasources/sql/cloud_base.py +199 -0
- truthound/datasources/sql/databricks.py +471 -0
- truthound/datasources/sql/mysql.py +316 -0
- truthound/datasources/sql/oracle.py +427 -0
- truthound/datasources/sql/postgresql.py +321 -0
- truthound/datasources/sql/redshift.py +479 -0
- truthound/datasources/sql/snowflake.py +439 -0
- truthound/datasources/sql/sqlite.py +286 -0
- truthound/datasources/sql/sqlserver.py +437 -0
- truthound/datasources/streaming/__init__.py +47 -0
- truthound/datasources/streaming/base.py +350 -0
- truthound/datasources/streaming/kafka.py +670 -0
- truthound/decorators.py +98 -0
- truthound/docs/__init__.py +69 -0
- truthound/docs/extractor.py +971 -0
- truthound/docs/generator.py +601 -0
- truthound/docs/parser.py +1037 -0
- truthound/docs/renderer.py +999 -0
- truthound/drift/__init__.py +22 -0
- truthound/drift/compare.py +189 -0
- truthound/drift/detectors.py +464 -0
- truthound/drift/report.py +160 -0
- truthound/execution/__init__.py +65 -0
- truthound/execution/_protocols.py +324 -0
- truthound/execution/base.py +576 -0
- truthound/execution/distributed/__init__.py +179 -0
- truthound/execution/distributed/aggregations.py +731 -0
- truthound/execution/distributed/arrow_bridge.py +817 -0
- truthound/execution/distributed/base.py +550 -0
- truthound/execution/distributed/dask_engine.py +976 -0
- truthound/execution/distributed/mixins.py +766 -0
- truthound/execution/distributed/protocols.py +756 -0
- truthound/execution/distributed/ray_engine.py +1127 -0
- truthound/execution/distributed/registry.py +446 -0
- truthound/execution/distributed/spark_engine.py +1011 -0
- truthound/execution/distributed/validator_adapter.py +682 -0
- truthound/execution/pandas_engine.py +401 -0
- truthound/execution/polars_engine.py +497 -0
- truthound/execution/pushdown/__init__.py +230 -0
- truthound/execution/pushdown/ast.py +1550 -0
- truthound/execution/pushdown/builder.py +1550 -0
- truthound/execution/pushdown/dialects.py +1072 -0
- truthound/execution/pushdown/executor.py +829 -0
- truthound/execution/pushdown/optimizer.py +1041 -0
- truthound/execution/sql_engine.py +518 -0
- truthound/infrastructure/__init__.py +189 -0
- truthound/infrastructure/audit.py +1515 -0
- truthound/infrastructure/config.py +1133 -0
- truthound/infrastructure/encryption.py +1132 -0
- truthound/infrastructure/logging.py +1503 -0
- truthound/infrastructure/metrics.py +1220 -0
- truthound/lineage/__init__.py +89 -0
- truthound/lineage/base.py +746 -0
- truthound/lineage/impact_analysis.py +474 -0
- truthound/lineage/integrations/__init__.py +22 -0
- truthound/lineage/integrations/openlineage.py +548 -0
- truthound/lineage/tracker.py +512 -0
- truthound/lineage/visualization/__init__.py +33 -0
- truthound/lineage/visualization/protocols.py +145 -0
- truthound/lineage/visualization/renderers/__init__.py +20 -0
- truthound/lineage/visualization/renderers/cytoscape.py +329 -0
- truthound/lineage/visualization/renderers/d3.py +331 -0
- truthound/lineage/visualization/renderers/graphviz.py +276 -0
- truthound/lineage/visualization/renderers/mermaid.py +308 -0
- truthound/maskers.py +113 -0
- truthound/ml/__init__.py +124 -0
- truthound/ml/anomaly_models/__init__.py +31 -0
- truthound/ml/anomaly_models/ensemble.py +362 -0
- truthound/ml/anomaly_models/isolation_forest.py +444 -0
- truthound/ml/anomaly_models/statistical.py +392 -0
- truthound/ml/base.py +1178 -0
- truthound/ml/drift_detection/__init__.py +26 -0
- truthound/ml/drift_detection/concept.py +381 -0
- truthound/ml/drift_detection/distribution.py +361 -0
- truthound/ml/drift_detection/feature.py +442 -0
- truthound/ml/drift_detection/multivariate.py +495 -0
- truthound/ml/monitoring/__init__.py +88 -0
- truthound/ml/monitoring/alerting/__init__.py +33 -0
- truthound/ml/monitoring/alerting/handlers.py +427 -0
- truthound/ml/monitoring/alerting/rules.py +508 -0
- truthound/ml/monitoring/collectors/__init__.py +19 -0
- truthound/ml/monitoring/collectors/composite.py +105 -0
- truthound/ml/monitoring/collectors/drift.py +324 -0
- truthound/ml/monitoring/collectors/performance.py +179 -0
- truthound/ml/monitoring/collectors/quality.py +369 -0
- truthound/ml/monitoring/monitor.py +536 -0
- truthound/ml/monitoring/protocols.py +451 -0
- truthound/ml/monitoring/stores/__init__.py +15 -0
- truthound/ml/monitoring/stores/memory.py +201 -0
- truthound/ml/monitoring/stores/prometheus.py +296 -0
- truthound/ml/rule_learning/__init__.py +25 -0
- truthound/ml/rule_learning/constraint_miner.py +443 -0
- truthound/ml/rule_learning/pattern_learner.py +499 -0
- truthound/ml/rule_learning/profile_learner.py +462 -0
- truthound/multitenancy/__init__.py +326 -0
- truthound/multitenancy/core.py +852 -0
- truthound/multitenancy/integration.py +597 -0
- truthound/multitenancy/isolation.py +630 -0
- truthound/multitenancy/manager.py +770 -0
- truthound/multitenancy/middleware.py +765 -0
- truthound/multitenancy/quota.py +537 -0
- truthound/multitenancy/resolvers.py +603 -0
- truthound/multitenancy/storage.py +703 -0
- truthound/observability/__init__.py +307 -0
- truthound/observability/context.py +531 -0
- truthound/observability/instrumentation.py +611 -0
- truthound/observability/logging.py +887 -0
- truthound/observability/metrics.py +1157 -0
- truthound/observability/tracing/__init__.py +178 -0
- truthound/observability/tracing/baggage.py +310 -0
- truthound/observability/tracing/config.py +426 -0
- truthound/observability/tracing/exporter.py +787 -0
- truthound/observability/tracing/integration.py +1018 -0
- truthound/observability/tracing/otel/__init__.py +146 -0
- truthound/observability/tracing/otel/adapter.py +982 -0
- truthound/observability/tracing/otel/bridge.py +1177 -0
- truthound/observability/tracing/otel/compat.py +681 -0
- truthound/observability/tracing/otel/config.py +691 -0
- truthound/observability/tracing/otel/detection.py +327 -0
- truthound/observability/tracing/otel/protocols.py +426 -0
- truthound/observability/tracing/processor.py +561 -0
- truthound/observability/tracing/propagator.py +757 -0
- truthound/observability/tracing/provider.py +569 -0
- truthound/observability/tracing/resource.py +515 -0
- truthound/observability/tracing/sampler.py +487 -0
- truthound/observability/tracing/span.py +676 -0
- truthound/plugins/__init__.py +198 -0
- truthound/plugins/base.py +599 -0
- truthound/plugins/cli.py +680 -0
- truthound/plugins/dependencies/__init__.py +42 -0
- truthound/plugins/dependencies/graph.py +422 -0
- truthound/plugins/dependencies/resolver.py +417 -0
- truthound/plugins/discovery.py +379 -0
- truthound/plugins/docs/__init__.py +46 -0
- truthound/plugins/docs/extractor.py +444 -0
- truthound/plugins/docs/renderer.py +499 -0
- truthound/plugins/enterprise_manager.py +877 -0
- truthound/plugins/examples/__init__.py +19 -0
- truthound/plugins/examples/custom_validators.py +317 -0
- truthound/plugins/examples/slack_notifier.py +312 -0
- truthound/plugins/examples/xml_reporter.py +254 -0
- truthound/plugins/hooks.py +558 -0
- truthound/plugins/lifecycle/__init__.py +43 -0
- truthound/plugins/lifecycle/hot_reload.py +402 -0
- truthound/plugins/lifecycle/manager.py +371 -0
- truthound/plugins/manager.py +736 -0
- truthound/plugins/registry.py +338 -0
- truthound/plugins/security/__init__.py +93 -0
- truthound/plugins/security/exceptions.py +332 -0
- truthound/plugins/security/policies.py +348 -0
- truthound/plugins/security/protocols.py +643 -0
- truthound/plugins/security/sandbox/__init__.py +45 -0
- truthound/plugins/security/sandbox/context.py +158 -0
- truthound/plugins/security/sandbox/engines/__init__.py +19 -0
- truthound/plugins/security/sandbox/engines/container.py +379 -0
- truthound/plugins/security/sandbox/engines/noop.py +144 -0
- truthound/plugins/security/sandbox/engines/process.py +336 -0
- truthound/plugins/security/sandbox/factory.py +211 -0
- truthound/plugins/security/signing/__init__.py +57 -0
- truthound/plugins/security/signing/service.py +330 -0
- truthound/plugins/security/signing/trust_store.py +368 -0
- truthound/plugins/security/signing/verifier.py +459 -0
- truthound/plugins/versioning/__init__.py +41 -0
- truthound/plugins/versioning/constraints.py +297 -0
- truthound/plugins/versioning/resolver.py +329 -0
- truthound/profiler/__init__.py +1729 -0
- truthound/profiler/_lazy.py +452 -0
- truthound/profiler/ab_testing/__init__.py +80 -0
- truthound/profiler/ab_testing/analysis.py +449 -0
- truthound/profiler/ab_testing/base.py +257 -0
- truthound/profiler/ab_testing/experiment.py +395 -0
- truthound/profiler/ab_testing/tracking.py +368 -0
- truthound/profiler/auto_threshold.py +1170 -0
- truthound/profiler/base.py +579 -0
- truthound/profiler/cache_patterns.py +911 -0
- truthound/profiler/caching.py +1303 -0
- truthound/profiler/column_profiler.py +712 -0
- truthound/profiler/comparison.py +1007 -0
- truthound/profiler/custom_patterns.py +1170 -0
- truthound/profiler/dashboard/__init__.py +50 -0
- truthound/profiler/dashboard/app.py +476 -0
- truthound/profiler/dashboard/components.py +457 -0
- truthound/profiler/dashboard/config.py +72 -0
- truthound/profiler/distributed/__init__.py +83 -0
- truthound/profiler/distributed/base.py +281 -0
- truthound/profiler/distributed/dask_backend.py +498 -0
- truthound/profiler/distributed/local_backend.py +293 -0
- truthound/profiler/distributed/profiler.py +304 -0
- truthound/profiler/distributed/ray_backend.py +374 -0
- truthound/profiler/distributed/spark_backend.py +375 -0
- truthound/profiler/distributed.py +1366 -0
- truthound/profiler/enterprise_sampling.py +1065 -0
- truthound/profiler/errors.py +488 -0
- truthound/profiler/evolution/__init__.py +91 -0
- truthound/profiler/evolution/alerts.py +426 -0
- truthound/profiler/evolution/changes.py +206 -0
- truthound/profiler/evolution/compatibility.py +365 -0
- truthound/profiler/evolution/detector.py +372 -0
- truthound/profiler/evolution/protocols.py +121 -0
- truthound/profiler/generators/__init__.py +48 -0
- truthound/profiler/generators/base.py +384 -0
- truthound/profiler/generators/ml_rules.py +375 -0
- truthound/profiler/generators/pattern_rules.py +384 -0
- truthound/profiler/generators/schema_rules.py +267 -0
- truthound/profiler/generators/stats_rules.py +324 -0
- truthound/profiler/generators/suite_generator.py +857 -0
- truthound/profiler/i18n.py +1542 -0
- truthound/profiler/incremental.py +554 -0
- truthound/profiler/incremental_validation.py +1710 -0
- truthound/profiler/integration/__init__.py +73 -0
- truthound/profiler/integration/adapters.py +345 -0
- truthound/profiler/integration/context.py +371 -0
- truthound/profiler/integration/executor.py +527 -0
- truthound/profiler/integration/naming.py +75 -0
- truthound/profiler/integration/protocols.py +243 -0
- truthound/profiler/memory.py +1185 -0
- truthound/profiler/migration/__init__.py +60 -0
- truthound/profiler/migration/base.py +345 -0
- truthound/profiler/migration/manager.py +444 -0
- truthound/profiler/migration/v1_0_to_v1_1.py +484 -0
- truthound/profiler/ml/__init__.py +73 -0
- truthound/profiler/ml/base.py +244 -0
- truthound/profiler/ml/classifier.py +507 -0
- truthound/profiler/ml/feature_extraction.py +604 -0
- truthound/profiler/ml/pretrained.py +448 -0
- truthound/profiler/ml_inference.py +1276 -0
- truthound/profiler/native_patterns.py +815 -0
- truthound/profiler/observability.py +1184 -0
- truthound/profiler/process_timeout.py +1566 -0
- truthound/profiler/progress.py +568 -0
- truthound/profiler/progress_callbacks.py +1734 -0
- truthound/profiler/quality.py +1345 -0
- truthound/profiler/resilience.py +1180 -0
- truthound/profiler/sampled_matcher.py +794 -0
- truthound/profiler/sampling.py +1288 -0
- truthound/profiler/scheduling/__init__.py +82 -0
- truthound/profiler/scheduling/protocols.py +214 -0
- truthound/profiler/scheduling/scheduler.py +474 -0
- truthound/profiler/scheduling/storage.py +457 -0
- truthound/profiler/scheduling/triggers.py +449 -0
- truthound/profiler/schema.py +603 -0
- truthound/profiler/streaming.py +685 -0
- truthound/profiler/streaming_patterns.py +1354 -0
- truthound/profiler/suite_cli.py +625 -0
- truthound/profiler/suite_config.py +789 -0
- truthound/profiler/suite_export.py +1268 -0
- truthound/profiler/table_profiler.py +547 -0
- truthound/profiler/timeout.py +565 -0
- truthound/profiler/validation.py +1532 -0
- truthound/profiler/visualization/__init__.py +118 -0
- truthound/profiler/visualization/base.py +346 -0
- truthound/profiler/visualization/generator.py +1259 -0
- truthound/profiler/visualization/plotly_renderer.py +811 -0
- truthound/profiler/visualization/renderers.py +669 -0
- truthound/profiler/visualization/sections.py +540 -0
- truthound/profiler/visualization.py +2122 -0
- truthound/profiler/yaml_validation.py +1151 -0
- truthound/py.typed +0 -0
- truthound/ratelimit/__init__.py +248 -0
- truthound/ratelimit/algorithms.py +1108 -0
- truthound/ratelimit/core.py +573 -0
- truthound/ratelimit/integration.py +532 -0
- truthound/ratelimit/limiter.py +663 -0
- truthound/ratelimit/middleware.py +700 -0
- truthound/ratelimit/policy.py +792 -0
- truthound/ratelimit/storage.py +763 -0
- truthound/rbac/__init__.py +340 -0
- truthound/rbac/core.py +976 -0
- truthound/rbac/integration.py +760 -0
- truthound/rbac/manager.py +1052 -0
- truthound/rbac/middleware.py +842 -0
- truthound/rbac/policy.py +954 -0
- truthound/rbac/storage.py +878 -0
- truthound/realtime/__init__.py +141 -0
- truthound/realtime/adapters/__init__.py +43 -0
- truthound/realtime/adapters/base.py +533 -0
- truthound/realtime/adapters/kafka.py +487 -0
- truthound/realtime/adapters/kinesis.py +479 -0
- truthound/realtime/adapters/mock.py +243 -0
- truthound/realtime/base.py +553 -0
- truthound/realtime/factory.py +382 -0
- truthound/realtime/incremental.py +660 -0
- truthound/realtime/processing/__init__.py +67 -0
- truthound/realtime/processing/exactly_once.py +575 -0
- truthound/realtime/processing/state.py +547 -0
- truthound/realtime/processing/windows.py +647 -0
- truthound/realtime/protocols.py +569 -0
- truthound/realtime/streaming.py +605 -0
- truthound/realtime/testing/__init__.py +32 -0
- truthound/realtime/testing/containers.py +615 -0
- truthound/realtime/testing/fixtures.py +484 -0
- truthound/report.py +280 -0
- truthound/reporters/__init__.py +46 -0
- truthound/reporters/_protocols.py +30 -0
- truthound/reporters/base.py +324 -0
- truthound/reporters/ci/__init__.py +66 -0
- truthound/reporters/ci/azure.py +436 -0
- truthound/reporters/ci/base.py +509 -0
- truthound/reporters/ci/bitbucket.py +567 -0
- truthound/reporters/ci/circleci.py +547 -0
- truthound/reporters/ci/detection.py +364 -0
- truthound/reporters/ci/factory.py +182 -0
- truthound/reporters/ci/github.py +388 -0
- truthound/reporters/ci/gitlab.py +471 -0
- truthound/reporters/ci/jenkins.py +525 -0
- truthound/reporters/console_reporter.py +299 -0
- truthound/reporters/factory.py +211 -0
- truthound/reporters/html_reporter.py +524 -0
- truthound/reporters/json_reporter.py +256 -0
- truthound/reporters/markdown_reporter.py +280 -0
- truthound/reporters/sdk/__init__.py +174 -0
- truthound/reporters/sdk/builder.py +558 -0
- truthound/reporters/sdk/mixins.py +1150 -0
- truthound/reporters/sdk/schema.py +1493 -0
- truthound/reporters/sdk/templates.py +666 -0
- truthound/reporters/sdk/testing.py +968 -0
- truthound/scanners.py +170 -0
- truthound/scheduling/__init__.py +122 -0
- truthound/scheduling/cron.py +1136 -0
- truthound/scheduling/presets.py +212 -0
- truthound/schema.py +275 -0
- truthound/secrets/__init__.py +173 -0
- truthound/secrets/base.py +618 -0
- truthound/secrets/cloud.py +682 -0
- truthound/secrets/integration.py +507 -0
- truthound/secrets/manager.py +633 -0
- truthound/secrets/oidc/__init__.py +172 -0
- truthound/secrets/oidc/base.py +902 -0
- truthound/secrets/oidc/credential_provider.py +623 -0
- truthound/secrets/oidc/exchangers.py +1001 -0
- truthound/secrets/oidc/github/__init__.py +110 -0
- truthound/secrets/oidc/github/claims.py +718 -0
- truthound/secrets/oidc/github/enhanced_provider.py +693 -0
- truthound/secrets/oidc/github/trust_policy.py +742 -0
- truthound/secrets/oidc/github/verification.py +723 -0
- truthound/secrets/oidc/github/workflow.py +691 -0
- truthound/secrets/oidc/providers.py +825 -0
- truthound/secrets/providers.py +506 -0
- truthound/secrets/resolver.py +495 -0
- truthound/stores/__init__.py +177 -0
- truthound/stores/backends/__init__.py +18 -0
- truthound/stores/backends/_protocols.py +340 -0
- truthound/stores/backends/azure_blob.py +530 -0
- truthound/stores/backends/concurrent_filesystem.py +915 -0
- truthound/stores/backends/connection_pool.py +1365 -0
- truthound/stores/backends/database.py +743 -0
- truthound/stores/backends/filesystem.py +538 -0
- truthound/stores/backends/gcs.py +399 -0
- truthound/stores/backends/memory.py +354 -0
- truthound/stores/backends/s3.py +434 -0
- truthound/stores/backpressure/__init__.py +84 -0
- truthound/stores/backpressure/base.py +375 -0
- truthound/stores/backpressure/circuit_breaker.py +434 -0
- truthound/stores/backpressure/monitor.py +376 -0
- truthound/stores/backpressure/strategies.py +677 -0
- truthound/stores/base.py +551 -0
- truthound/stores/batching/__init__.py +65 -0
- truthound/stores/batching/base.py +305 -0
- truthound/stores/batching/buffer.py +370 -0
- truthound/stores/batching/store.py +248 -0
- truthound/stores/batching/writer.py +521 -0
- truthound/stores/caching/__init__.py +60 -0
- truthound/stores/caching/backends.py +684 -0
- truthound/stores/caching/base.py +356 -0
- truthound/stores/caching/store.py +305 -0
- truthound/stores/compression/__init__.py +193 -0
- truthound/stores/compression/adaptive.py +694 -0
- truthound/stores/compression/base.py +514 -0
- truthound/stores/compression/pipeline.py +868 -0
- truthound/stores/compression/providers.py +672 -0
- truthound/stores/compression/streaming.py +832 -0
- truthound/stores/concurrency/__init__.py +81 -0
- truthound/stores/concurrency/atomic.py +556 -0
- truthound/stores/concurrency/index.py +775 -0
- truthound/stores/concurrency/locks.py +576 -0
- truthound/stores/concurrency/manager.py +482 -0
- truthound/stores/encryption/__init__.py +297 -0
- truthound/stores/encryption/base.py +952 -0
- truthound/stores/encryption/keys.py +1191 -0
- truthound/stores/encryption/pipeline.py +903 -0
- truthound/stores/encryption/providers.py +953 -0
- truthound/stores/encryption/streaming.py +950 -0
- truthound/stores/expectations.py +227 -0
- truthound/stores/factory.py +246 -0
- truthound/stores/migration/__init__.py +75 -0
- truthound/stores/migration/base.py +480 -0
- truthound/stores/migration/manager.py +347 -0
- truthound/stores/migration/registry.py +382 -0
- truthound/stores/migration/store.py +559 -0
- truthound/stores/observability/__init__.py +106 -0
- truthound/stores/observability/audit.py +718 -0
- truthound/stores/observability/config.py +270 -0
- truthound/stores/observability/factory.py +208 -0
- truthound/stores/observability/metrics.py +636 -0
- truthound/stores/observability/protocols.py +410 -0
- truthound/stores/observability/store.py +570 -0
- truthound/stores/observability/tracing.py +784 -0
- truthound/stores/replication/__init__.py +76 -0
- truthound/stores/replication/base.py +260 -0
- truthound/stores/replication/monitor.py +269 -0
- truthound/stores/replication/store.py +439 -0
- truthound/stores/replication/syncer.py +391 -0
- truthound/stores/results.py +359 -0
- truthound/stores/retention/__init__.py +77 -0
- truthound/stores/retention/base.py +378 -0
- truthound/stores/retention/policies.py +621 -0
- truthound/stores/retention/scheduler.py +279 -0
- truthound/stores/retention/store.py +526 -0
- truthound/stores/streaming/__init__.py +138 -0
- truthound/stores/streaming/base.py +801 -0
- truthound/stores/streaming/database.py +984 -0
- truthound/stores/streaming/filesystem.py +719 -0
- truthound/stores/streaming/reader.py +629 -0
- truthound/stores/streaming/s3.py +843 -0
- truthound/stores/streaming/writer.py +790 -0
- truthound/stores/tiering/__init__.py +108 -0
- truthound/stores/tiering/base.py +462 -0
- truthound/stores/tiering/manager.py +249 -0
- truthound/stores/tiering/policies.py +692 -0
- truthound/stores/tiering/store.py +526 -0
- truthound/stores/versioning/__init__.py +56 -0
- truthound/stores/versioning/base.py +376 -0
- truthound/stores/versioning/store.py +660 -0
- truthound/stores/versioning/strategies.py +353 -0
- truthound/types.py +56 -0
- truthound/validators/__init__.py +774 -0
- truthound/validators/aggregate/__init__.py +27 -0
- truthound/validators/aggregate/central.py +116 -0
- truthound/validators/aggregate/extremes.py +116 -0
- truthound/validators/aggregate/spread.py +118 -0
- truthound/validators/aggregate/sum.py +64 -0
- truthound/validators/aggregate/type.py +78 -0
- truthound/validators/anomaly/__init__.py +93 -0
- truthound/validators/anomaly/base.py +431 -0
- truthound/validators/anomaly/ml_based.py +1190 -0
- truthound/validators/anomaly/multivariate.py +647 -0
- truthound/validators/anomaly/statistical.py +599 -0
- truthound/validators/base.py +1089 -0
- truthound/validators/business_rule/__init__.py +46 -0
- truthound/validators/business_rule/base.py +147 -0
- truthound/validators/business_rule/checksum.py +509 -0
- truthound/validators/business_rule/financial.py +526 -0
- truthound/validators/cache.py +733 -0
- truthound/validators/completeness/__init__.py +39 -0
- truthound/validators/completeness/conditional.py +73 -0
- truthound/validators/completeness/default.py +98 -0
- truthound/validators/completeness/empty.py +103 -0
- truthound/validators/completeness/nan.py +337 -0
- truthound/validators/completeness/null.py +152 -0
- truthound/validators/cross_table/__init__.py +17 -0
- truthound/validators/cross_table/aggregate.py +333 -0
- truthound/validators/cross_table/row_count.py +122 -0
- truthound/validators/datetime/__init__.py +29 -0
- truthound/validators/datetime/format.py +78 -0
- truthound/validators/datetime/freshness.py +269 -0
- truthound/validators/datetime/order.py +73 -0
- truthound/validators/datetime/parseable.py +185 -0
- truthound/validators/datetime/range.py +202 -0
- truthound/validators/datetime/timezone.py +69 -0
- truthound/validators/distribution/__init__.py +49 -0
- truthound/validators/distribution/distribution.py +128 -0
- truthound/validators/distribution/monotonic.py +119 -0
- truthound/validators/distribution/outlier.py +178 -0
- truthound/validators/distribution/quantile.py +80 -0
- truthound/validators/distribution/range.py +254 -0
- truthound/validators/distribution/set.py +125 -0
- truthound/validators/distribution/statistical.py +459 -0
- truthound/validators/drift/__init__.py +79 -0
- truthound/validators/drift/base.py +427 -0
- truthound/validators/drift/multi_feature.py +401 -0
- truthound/validators/drift/numeric.py +395 -0
- truthound/validators/drift/psi.py +446 -0
- truthound/validators/drift/statistical.py +510 -0
- truthound/validators/enterprise.py +1658 -0
- truthound/validators/geospatial/__init__.py +80 -0
- truthound/validators/geospatial/base.py +97 -0
- truthound/validators/geospatial/boundary.py +238 -0
- truthound/validators/geospatial/coordinate.py +351 -0
- truthound/validators/geospatial/distance.py +399 -0
- truthound/validators/geospatial/polygon.py +665 -0
- truthound/validators/i18n/__init__.py +308 -0
- truthound/validators/i18n/bidi.py +571 -0
- truthound/validators/i18n/catalogs.py +570 -0
- truthound/validators/i18n/dialects.py +763 -0
- truthound/validators/i18n/extended_catalogs.py +549 -0
- truthound/validators/i18n/formatting.py +1434 -0
- truthound/validators/i18n/loader.py +1020 -0
- truthound/validators/i18n/messages.py +521 -0
- truthound/validators/i18n/plural.py +683 -0
- truthound/validators/i18n/protocols.py +855 -0
- truthound/validators/i18n/tms.py +1162 -0
- truthound/validators/localization/__init__.py +53 -0
- truthound/validators/localization/base.py +122 -0
- truthound/validators/localization/chinese.py +362 -0
- truthound/validators/localization/japanese.py +275 -0
- truthound/validators/localization/korean.py +524 -0
- truthound/validators/memory/__init__.py +94 -0
- truthound/validators/memory/approximate_knn.py +506 -0
- truthound/validators/memory/base.py +547 -0
- truthound/validators/memory/sgd_online.py +719 -0
- truthound/validators/memory/streaming_ecdf.py +753 -0
- truthound/validators/ml_feature/__init__.py +54 -0
- truthound/validators/ml_feature/base.py +249 -0
- truthound/validators/ml_feature/correlation.py +299 -0
- truthound/validators/ml_feature/leakage.py +344 -0
- truthound/validators/ml_feature/null_impact.py +270 -0
- truthound/validators/ml_feature/scale.py +264 -0
- truthound/validators/multi_column/__init__.py +89 -0
- truthound/validators/multi_column/arithmetic.py +284 -0
- truthound/validators/multi_column/base.py +231 -0
- truthound/validators/multi_column/comparison.py +273 -0
- truthound/validators/multi_column/consistency.py +312 -0
- truthound/validators/multi_column/statistical.py +299 -0
- truthound/validators/optimization/__init__.py +164 -0
- truthound/validators/optimization/aggregation.py +563 -0
- truthound/validators/optimization/covariance.py +556 -0
- truthound/validators/optimization/geo.py +626 -0
- truthound/validators/optimization/graph.py +587 -0
- truthound/validators/optimization/orchestrator.py +970 -0
- truthound/validators/optimization/profiling.py +1312 -0
- truthound/validators/privacy/__init__.py +223 -0
- truthound/validators/privacy/base.py +635 -0
- truthound/validators/privacy/ccpa.py +670 -0
- truthound/validators/privacy/gdpr.py +728 -0
- truthound/validators/privacy/global_patterns.py +604 -0
- truthound/validators/privacy/plugins.py +867 -0
- truthound/validators/profiling/__init__.py +52 -0
- truthound/validators/profiling/base.py +175 -0
- truthound/validators/profiling/cardinality.py +312 -0
- truthound/validators/profiling/entropy.py +391 -0
- truthound/validators/profiling/frequency.py +455 -0
- truthound/validators/pushdown_support.py +660 -0
- truthound/validators/query/__init__.py +91 -0
- truthound/validators/query/aggregate.py +346 -0
- truthound/validators/query/base.py +246 -0
- truthound/validators/query/column.py +249 -0
- truthound/validators/query/expression.py +274 -0
- truthound/validators/query/result.py +323 -0
- truthound/validators/query/row_count.py +264 -0
- truthound/validators/referential/__init__.py +80 -0
- truthound/validators/referential/base.py +395 -0
- truthound/validators/referential/cascade.py +391 -0
- truthound/validators/referential/circular.py +563 -0
- truthound/validators/referential/foreign_key.py +624 -0
- truthound/validators/referential/orphan.py +485 -0
- truthound/validators/registry.py +112 -0
- truthound/validators/schema/__init__.py +41 -0
- truthound/validators/schema/column_count.py +142 -0
- truthound/validators/schema/column_exists.py +80 -0
- truthound/validators/schema/column_order.py +82 -0
- truthound/validators/schema/column_pair.py +85 -0
- truthound/validators/schema/column_pair_set.py +195 -0
- truthound/validators/schema/column_type.py +94 -0
- truthound/validators/schema/multi_column.py +53 -0
- truthound/validators/schema/multi_column_aggregate.py +175 -0
- truthound/validators/schema/referential.py +274 -0
- truthound/validators/schema/table_schema.py +91 -0
- truthound/validators/schema_validator.py +219 -0
- truthound/validators/sdk/__init__.py +250 -0
- truthound/validators/sdk/builder.py +680 -0
- truthound/validators/sdk/decorators.py +474 -0
- truthound/validators/sdk/enterprise/__init__.py +211 -0
- truthound/validators/sdk/enterprise/docs.py +725 -0
- truthound/validators/sdk/enterprise/fuzzing.py +659 -0
- truthound/validators/sdk/enterprise/licensing.py +709 -0
- truthound/validators/sdk/enterprise/manager.py +543 -0
- truthound/validators/sdk/enterprise/resources.py +628 -0
- truthound/validators/sdk/enterprise/sandbox.py +766 -0
- truthound/validators/sdk/enterprise/signing.py +603 -0
- truthound/validators/sdk/enterprise/templates.py +865 -0
- truthound/validators/sdk/enterprise/versioning.py +659 -0
- truthound/validators/sdk/templates.py +757 -0
- truthound/validators/sdk/testing.py +807 -0
- truthound/validators/security/__init__.py +181 -0
- truthound/validators/security/redos/__init__.py +182 -0
- truthound/validators/security/redos/core.py +861 -0
- truthound/validators/security/redos/cpu_monitor.py +593 -0
- truthound/validators/security/redos/cve_database.py +791 -0
- truthound/validators/security/redos/ml/__init__.py +155 -0
- truthound/validators/security/redos/ml/base.py +785 -0
- truthound/validators/security/redos/ml/datasets.py +618 -0
- truthound/validators/security/redos/ml/features.py +359 -0
- truthound/validators/security/redos/ml/models.py +1000 -0
- truthound/validators/security/redos/ml/predictor.py +507 -0
- truthound/validators/security/redos/ml/storage.py +632 -0
- truthound/validators/security/redos/ml/training.py +571 -0
- truthound/validators/security/redos/ml_analyzer.py +937 -0
- truthound/validators/security/redos/optimizer.py +674 -0
- truthound/validators/security/redos/profiler.py +682 -0
- truthound/validators/security/redos/re2_engine.py +709 -0
- truthound/validators/security/redos.py +886 -0
- truthound/validators/security/sql_security.py +1247 -0
- truthound/validators/streaming/__init__.py +126 -0
- truthound/validators/streaming/base.py +292 -0
- truthound/validators/streaming/completeness.py +210 -0
- truthound/validators/streaming/mixin.py +575 -0
- truthound/validators/streaming/range.py +308 -0
- truthound/validators/streaming/sources.py +846 -0
- truthound/validators/string/__init__.py +57 -0
- truthound/validators/string/casing.py +158 -0
- truthound/validators/string/charset.py +96 -0
- truthound/validators/string/format.py +501 -0
- truthound/validators/string/json.py +77 -0
- truthound/validators/string/json_schema.py +184 -0
- truthound/validators/string/length.py +104 -0
- truthound/validators/string/like_pattern.py +237 -0
- truthound/validators/string/regex.py +202 -0
- truthound/validators/string/regex_extended.py +435 -0
- truthound/validators/table/__init__.py +88 -0
- truthound/validators/table/base.py +78 -0
- truthound/validators/table/column_count.py +198 -0
- truthound/validators/table/freshness.py +362 -0
- truthound/validators/table/row_count.py +251 -0
- truthound/validators/table/schema.py +333 -0
- truthound/validators/table/size.py +285 -0
- truthound/validators/timeout/__init__.py +102 -0
- truthound/validators/timeout/advanced/__init__.py +247 -0
- truthound/validators/timeout/advanced/circuit_breaker.py +675 -0
- truthound/validators/timeout/advanced/prediction.py +773 -0
- truthound/validators/timeout/advanced/priority.py +618 -0
- truthound/validators/timeout/advanced/redis_backend.py +770 -0
- truthound/validators/timeout/advanced/retry.py +721 -0
- truthound/validators/timeout/advanced/sampling.py +788 -0
- truthound/validators/timeout/advanced/sla.py +661 -0
- truthound/validators/timeout/advanced/telemetry.py +804 -0
- truthound/validators/timeout/cascade.py +477 -0
- truthound/validators/timeout/deadline.py +657 -0
- truthound/validators/timeout/degradation.py +525 -0
- truthound/validators/timeout/distributed.py +597 -0
- truthound/validators/timeseries/__init__.py +89 -0
- truthound/validators/timeseries/base.py +326 -0
- truthound/validators/timeseries/completeness.py +617 -0
- truthound/validators/timeseries/gap.py +485 -0
- truthound/validators/timeseries/monotonic.py +310 -0
- truthound/validators/timeseries/seasonality.py +422 -0
- truthound/validators/timeseries/trend.py +510 -0
- truthound/validators/uniqueness/__init__.py +59 -0
- truthound/validators/uniqueness/approximate.py +475 -0
- truthound/validators/uniqueness/distinct_values.py +253 -0
- truthound/validators/uniqueness/duplicate.py +118 -0
- truthound/validators/uniqueness/primary_key.py +140 -0
- truthound/validators/uniqueness/unique.py +191 -0
- truthound/validators/uniqueness/within_record.py +599 -0
- truthound/validators/utils.py +756 -0
- truthound-1.0.8.dist-info/METADATA +474 -0
- truthound-1.0.8.dist-info/RECORD +877 -0
- truthound-1.0.8.dist-info/WHEEL +4 -0
- truthound-1.0.8.dist-info/entry_points.txt +2 -0
- truthound-1.0.8.dist-info/licenses/LICENSE +190 -0
|
@@ -0,0 +1,1127 @@
|
|
|
1
|
+
"""Ray-native execution engine for distributed data validation.
|
|
2
|
+
|
|
3
|
+
This module provides a Ray-native execution engine that:
|
|
4
|
+
- Executes validation operations directly on Ray Datasets
|
|
5
|
+
- Avoids Polars conversion overhead for distributed operations
|
|
6
|
+
- Uses Arrow for efficient data transfer when conversion is needed
|
|
7
|
+
- Supports distributed aggregations with proper reduce semantics
|
|
8
|
+
|
|
9
|
+
Architecture:
|
|
10
|
+
┌─────────────────────────────────────────────────────────────────┐
|
|
11
|
+
│ RayExecutionEngine │
|
|
12
|
+
│ │
|
|
13
|
+
│ ┌──────────────────────────────────────────────────────────┐ │
|
|
14
|
+
│ │ Native Ray Operations │ │
|
|
15
|
+
│ │ (count, aggregate, filter - no conversion overhead) │ │
|
|
16
|
+
│ └──────────────────────────────────────────────────────────┘ │
|
|
17
|
+
│ │ │
|
|
18
|
+
│ ▼ │
|
|
19
|
+
│ ┌──────────────────────────────────────────────────────────┐ │
|
|
20
|
+
│ │ Arrow Bridge (when needed) │ │
|
|
21
|
+
│ │ (zero-copy conversion to Polars for ML validators) │ │
|
|
22
|
+
│ └──────────────────────────────────────────────────────────┘ │
|
|
23
|
+
│ │ │
|
|
24
|
+
│ ▼ │
|
|
25
|
+
│ ┌──────────────────────────────────────────────────────────┐ │
|
|
26
|
+
│ │ Polars LazyFrame (fallback) │ │
|
|
27
|
+
│ │ (only for validators that require Polars operations) │ │
|
|
28
|
+
│ └──────────────────────────────────────────────────────────┘ │
|
|
29
|
+
│ │
|
|
30
|
+
└─────────────────────────────────────────────────────────────────┘
|
|
31
|
+
|
|
32
|
+
Example:
|
|
33
|
+
>>> import ray
|
|
34
|
+
>>> from truthound.execution.distributed import RayExecutionEngine
|
|
35
|
+
>>>
|
|
36
|
+
>>> ray.init()
|
|
37
|
+
>>> ds = ray.data.read_parquet("large_data.parquet")
|
|
38
|
+
>>>
|
|
39
|
+
>>> # Create native Ray engine
|
|
40
|
+
>>> engine = RayExecutionEngine.from_dataset(ds)
|
|
41
|
+
>>>
|
|
42
|
+
>>> # Native Ray operations (no conversion overhead)
|
|
43
|
+
>>> row_count = engine.count_rows()
|
|
44
|
+
>>> null_counts = engine.count_nulls_all()
|
|
45
|
+
>>> stats = engine.get_stats("price")
|
|
46
|
+
>>>
|
|
47
|
+
>>> # Convert to Polars only when needed (via Arrow)
|
|
48
|
+
>>> lf = engine.to_polars_lazyframe()
|
|
49
|
+
"""
|
|
50
|
+
|
|
51
|
+
from __future__ import annotations
|
|
52
|
+
|
|
53
|
+
import logging
|
|
54
|
+
import time
|
|
55
|
+
from dataclasses import dataclass, field
|
|
56
|
+
from functools import reduce
|
|
57
|
+
from typing import TYPE_CHECKING, Any, Callable, Iterator
|
|
58
|
+
|
|
59
|
+
from truthound.execution.distributed.base import (
|
|
60
|
+
BaseDistributedEngine,
|
|
61
|
+
DistributedEngineConfig,
|
|
62
|
+
ExecutionMetrics,
|
|
63
|
+
)
|
|
64
|
+
from truthound.execution.distributed.protocols import (
|
|
65
|
+
AggregationScope,
|
|
66
|
+
AggregationSpec,
|
|
67
|
+
ComputeBackend,
|
|
68
|
+
DistributedResult,
|
|
69
|
+
PartitionInfo,
|
|
70
|
+
PartitionStrategy,
|
|
71
|
+
get_aggregator,
|
|
72
|
+
)
|
|
73
|
+
|
|
74
|
+
if TYPE_CHECKING:
|
|
75
|
+
import pyarrow as pa
|
|
76
|
+
import ray
|
|
77
|
+
from ray.data import Dataset
|
|
78
|
+
|
|
79
|
+
logger = logging.getLogger(__name__)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
# =============================================================================
|
|
83
|
+
# Configuration
|
|
84
|
+
# =============================================================================
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
@dataclass
class RayEngineConfig(DistributedEngineConfig):
    """Ray-specific settings layered on top of ``DistributedEngineConfig``.

    The first four fields are forwarded to ``ray.init`` by
    ``_ensure_ray_initialized`` when Ray is not yet running; ``batch_size``
    is passed to the engine's ``map_batches`` / ``iter_batches`` calls.

    Attributes:
        ray_address: Address of the Ray cluster to join (None = local).
        num_cpus: CPU count handed to Ray.
        num_gpus: GPU count handed to Ray.
        object_store_memory: Size of the object store, in bytes.
        batch_size: Rows per batch when iterating over data.
        prefetch_batches: How many batches to prefetch.
        concurrency: Concurrent task count for map operations.
        use_actors: Whether to use an actor pool for better resource
            utilization.
        actor_pool_size: Number of actors in that pool.
        target_max_block_size: Target upper bound for block size, in bytes.
    """

    # --- Cluster connection / resources ---
    ray_address: str | None = None
    num_cpus: int | None = None
    num_gpus: int | None = None
    object_store_memory: int | None = None

    # --- Batch iteration ---
    batch_size: int = 4096
    prefetch_batches: int = 2

    # --- Map-operation parallelism ---
    concurrency: int | None = None
    use_actors: bool = False
    actor_pool_size: int = 4

    # --- Block sizing ---
    target_max_block_size: int = 128 * 1024**2  # 128MB
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def _check_ray_available() -> None:
|
|
117
|
+
"""Check if Ray is available."""
|
|
118
|
+
try:
|
|
119
|
+
import ray # noqa: F401
|
|
120
|
+
import ray.data # noqa: F401
|
|
121
|
+
except ImportError:
|
|
122
|
+
raise ImportError(
|
|
123
|
+
"ray is required for RayExecutionEngine. "
|
|
124
|
+
"Install with: pip install 'ray[data]'"
|
|
125
|
+
)
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def _ensure_ray_initialized(config: RayEngineConfig) -> None:
|
|
129
|
+
"""Ensure Ray is initialized."""
|
|
130
|
+
import ray
|
|
131
|
+
|
|
132
|
+
if not ray.is_initialized():
|
|
133
|
+
init_kwargs = {}
|
|
134
|
+
if config.ray_address:
|
|
135
|
+
init_kwargs["address"] = config.ray_address
|
|
136
|
+
if config.num_cpus:
|
|
137
|
+
init_kwargs["num_cpus"] = config.num_cpus
|
|
138
|
+
if config.num_gpus:
|
|
139
|
+
init_kwargs["num_gpus"] = config.num_gpus
|
|
140
|
+
if config.object_store_memory:
|
|
141
|
+
init_kwargs["object_store_memory"] = config.object_store_memory
|
|
142
|
+
|
|
143
|
+
ray.init(**init_kwargs)
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
# =============================================================================
|
|
147
|
+
# Ray Execution Engine
|
|
148
|
+
# =============================================================================
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
class RayExecutionEngine(BaseDistributedEngine[RayEngineConfig]):
|
|
152
|
+
"""Ray-native execution engine for distributed validation.
|
|
153
|
+
|
|
154
|
+
This engine executes validation operations directly on Ray Datasets,
|
|
155
|
+
avoiding the overhead of converting to Polars for operations that can
|
|
156
|
+
be performed natively in Ray.
|
|
157
|
+
|
|
158
|
+
Key Features:
|
|
159
|
+
- Native Ray aggregations (count, sum, mean, min, max, etc.)
|
|
160
|
+
- Distributed null/duplicate checking
|
|
161
|
+
- Arrow-based zero-copy conversion to Polars when needed
|
|
162
|
+
- Block-aware operations
|
|
163
|
+
- Automatic scaling and fault tolerance
|
|
164
|
+
|
|
165
|
+
Example:
|
|
166
|
+
>>> engine = RayExecutionEngine.from_dataset(ray_dataset)
|
|
167
|
+
>>> null_counts = engine.count_nulls_all() # Native Ray
|
|
168
|
+
>>> lf = engine.to_polars_lazyframe() # Arrow-based conversion
|
|
169
|
+
"""
|
|
170
|
+
|
|
171
|
+
engine_type = "ray"
|
|
172
|
+
|
|
173
|
+
def __init__(
    self,
    dataset: "Dataset",
    config: RayEngineConfig | None = None,
) -> None:
    """Bind the engine to a Ray Dataset.

    Checks that Ray is importable, runs the base-class initializer with
    *config*, makes sure Ray is initialized using ``self._config``, and
    then records the dataset, its schema and its column names.

    Args:
        dataset: Ray Dataset to validate.
        config: Optional configuration; passed through to the base class.
    """
    _check_ray_available()
    super().__init__(config)

    # self._config is expected to be set by the base initializer above.
    _ensure_ray_initialized(self._config)

    self._ds = dataset

    schema = dataset.schema()
    self._schema = schema
    # A falsy schema yields an empty column list.
    if schema:
        self._columns = list(schema.names)
    else:
        self._columns = []

    self._cached_row_count: int | None = None
|
|
193
|
+
|
|
194
|
+
@classmethod
def _default_config(cls) -> RayEngineConfig:
    """Return a ``RayEngineConfig`` populated with its field defaults."""
    defaults = RayEngineConfig()
    return defaults
|
|
198
|
+
|
|
199
|
+
# -------------------------------------------------------------------------
|
|
200
|
+
# Factory Methods
|
|
201
|
+
# -------------------------------------------------------------------------
|
|
202
|
+
|
|
203
|
+
@classmethod
def from_dataset(
    cls,
    dataset: "Dataset",
    config: RayEngineConfig | None = None,
) -> "RayExecutionEngine":
    """Wrap an already-built Ray Dataset in an engine.

    Equivalent to calling the class constructor directly.

    Args:
        dataset: Ray Dataset.
        config: Optional configuration.

    Returns:
        RayExecutionEngine instance.
    """
    engine = cls(dataset, config)
    return engine
|
|
219
|
+
|
|
220
|
+
@classmethod
def from_parquet(
    cls,
    path: str,
    config: RayEngineConfig | None = None,
    **read_kwargs: Any,
) -> "RayExecutionEngine":
    """Build an engine by reading Parquet data with Ray.

    Ray is initialized first (using *config*, or a default
    ``RayEngineConfig`` when none is given), then the files are read with
    ``ray.data.read_parquet``.

    Args:
        path: Path to Parquet files (can use glob patterns).
        config: Optional configuration.
        **read_kwargs: Additional arguments for read_parquet.

    Returns:
        RayExecutionEngine instance.
    """
    _check_ray_available()
    import ray.data as ray_data

    startup_cfg = config
    if not startup_cfg:
        startup_cfg = RayEngineConfig()
    _ensure_ray_initialized(startup_cfg)

    loaded = ray_data.read_parquet(path, **read_kwargs)

    # The caller's original ``config`` (possibly None) goes to the constructor.
    return cls(loaded, config)
|
|
246
|
+
|
|
247
|
+
@classmethod
def from_csv(
    cls,
    path: str,
    config: RayEngineConfig | None = None,
    **read_kwargs: Any,
) -> "RayExecutionEngine":
    """Build an engine by reading CSV data with Ray.

    Ray is initialized first (using *config*, or a default
    ``RayEngineConfig`` when none is given), then the files are read with
    ``ray.data.read_csv``.

    Args:
        path: Path to CSV files (can use glob patterns).
        config: Optional configuration.
        **read_kwargs: Additional arguments for read_csv.

    Returns:
        RayExecutionEngine instance.
    """
    _check_ray_available()
    import ray.data as ray_data

    startup_cfg = config
    if not startup_cfg:
        startup_cfg = RayEngineConfig()
    _ensure_ray_initialized(startup_cfg)

    loaded = ray_data.read_csv(path, **read_kwargs)

    # The caller's original ``config`` (possibly None) goes to the constructor.
    return cls(loaded, config)
|
|
273
|
+
|
|
274
|
+
@classmethod
def from_pandas(
    cls,
    df: Any,
    config: RayEngineConfig | None = None,
) -> "RayExecutionEngine":
    """Build an engine from a Pandas DataFrame.

    Ray is initialized first (using *config*, or a default
    ``RayEngineConfig`` when none is given), then the frame is converted
    with ``ray.data.from_pandas``.

    Args:
        df: Pandas DataFrame.
        config: Optional configuration.

    Returns:
        RayExecutionEngine instance.
    """
    _check_ray_available()
    import ray.data as ray_data

    startup_cfg = config
    if not startup_cfg:
        startup_cfg = RayEngineConfig()
    _ensure_ray_initialized(startup_cfg)

    converted = ray_data.from_pandas(df)

    # The caller's original ``config`` (possibly None) goes to the constructor.
    return cls(converted, config)
|
|
298
|
+
|
|
299
|
+
@classmethod
def from_arrow(
    cls,
    table: "pa.Table",
    config: RayEngineConfig | None = None,
) -> "RayExecutionEngine":
    """Build an engine from a PyArrow Table.

    Ray is initialized first (using *config*, or a default
    ``RayEngineConfig`` when none is given), then the table is converted
    with ``ray.data.from_arrow``.

    Args:
        table: PyArrow Table.
        config: Optional configuration.

    Returns:
        RayExecutionEngine instance.
    """
    _check_ray_available()
    import ray.data as ray_data

    startup_cfg = config
    if not startup_cfg:
        startup_cfg = RayEngineConfig()
    _ensure_ray_initialized(startup_cfg)

    converted = ray_data.from_arrow(table)

    # The caller's original ``config`` (possibly None) goes to the constructor.
    return cls(converted, config)
|
|
323
|
+
|
|
324
|
+
@classmethod
def from_items(
    cls,
    items: list[dict[str, Any]],
    config: RayEngineConfig | None = None,
) -> "RayExecutionEngine":
    """Build an engine from an in-memory list of row dictionaries.

    Ray is initialized first (using *config*, or a default
    ``RayEngineConfig`` when none is given), then the rows are converted
    with ``ray.data.from_items``.

    Args:
        items: List of row dictionaries.
        config: Optional configuration.

    Returns:
        RayExecutionEngine instance.
    """
    _check_ray_available()
    import ray.data as ray_data

    startup_cfg = config
    if not startup_cfg:
        startup_cfg = RayEngineConfig()
    _ensure_ray_initialized(startup_cfg)

    converted = ray_data.from_items(items)

    # The caller's original ``config`` (possibly None) goes to the constructor.
    return cls(converted, config)
|
|
348
|
+
|
|
349
|
+
# -------------------------------------------------------------------------
|
|
350
|
+
# Properties
|
|
351
|
+
# -------------------------------------------------------------------------
|
|
352
|
+
|
|
353
|
+
@property
def backend_type(self) -> ComputeBackend:
    """Compute backend of this engine; always ``ComputeBackend.RAY``."""
    backend = ComputeBackend.RAY
    return backend
|
|
357
|
+
|
|
358
|
+
@property
def dataset(self) -> "Dataset":
    """The Ray Dataset this engine was constructed with."""
    underlying = self._ds
    return underlying
|
|
362
|
+
|
|
363
|
+
@property
def schema(self) -> Any:
    """The schema object captured from the dataset at construction time."""
    captured = self._schema
    return captured
|
|
367
|
+
|
|
368
|
+
@property
def supports_sql_pushdown(self) -> bool:
    """Whether SQL pushdown is available; always False for the Ray engine."""
    pushdown_available = False
    return pushdown_available
|
|
372
|
+
|
|
373
|
+
# -------------------------------------------------------------------------
|
|
374
|
+
# Abstract Method Implementations
|
|
375
|
+
# -------------------------------------------------------------------------
|
|
376
|
+
|
|
377
|
+
def _get_partition_count(self) -> int:
|
|
378
|
+
"""Get number of data blocks (partitions)."""
|
|
379
|
+
return self._ds.num_blocks()
|
|
380
|
+
|
|
381
|
+
def _get_partition_info(self) -> list[PartitionInfo]:
    """Describe every block of the dataset as a ``PartitionInfo``.

    Only the block index, the total block count and the column names are
    filled in; every entry shares the same column tuple.

    Returns:
        One ``PartitionInfo`` per block, ordered by block index.
    """
    total = self._get_partition_count()
    column_names = tuple(self._columns)

    infos = []
    for block_index in range(total):
        infos.append(
            PartitionInfo(
                partition_id=block_index,
                total_partitions=total,
                columns=column_names,
            )
        )
    return infos
|
|
394
|
+
|
|
395
|
+
def _execute_on_partitions(
    self,
    operation: str,
    func: Callable[[Any], dict[str, Any]],
    columns: list[str] | None = None,
) -> list[DistributedResult]:
    """Run *func* over the dataset batch by batch and collect the results.

    Each input batch is passed to *func*; its returned dict is reduced to a
    single output row holding ``value``, ``errors`` and ``metadata`` plus
    the batch's row count and the call's duration. Those rows are then
    read back and converted to ``DistributedResult`` objects.

    Args:
        operation: Operation name, used for metrics and stamped on results.
        func: Callable applied to each batch dict; must return a dict.
        columns: Columns to keep before mapping (None or empty = all).

    Returns:
        One ``DistributedResult`` per processed batch. ``partition_id`` is
        the index of the *collected output batch* the row was read from.

    Raises:
        Exception: Any error from the pipeline is recorded on the metrics
            object and re-raised unchanged.
    """
    import ray  # noqa: F401  (not referenced below; kept as in the original)

    metrics = self._start_metrics(operation)

    try:
        source = self._ds.select_columns(columns) if columns else self._ds

        # Adapter handed to map_batches: times the user function and wraps
        # its output into single-element columns (one row per input batch).
        def timed_call(batch: dict[str, Any]) -> dict[str, Any]:
            began = time.time()
            outcome = func(batch)
            elapsed_ms = (time.time() - began) * 1000

            # The length of the first column is taken as the row count.
            if batch:
                n_rows = len(next(iter(batch.values())))
            else:
                n_rows = 0

            return {
                "value": [outcome.get("value")],
                "row_count": [n_rows],
                "duration_ms": [elapsed_ms],
                "errors": [outcome.get("errors", [])],
                "metadata": [outcome.get("metadata", {})],
            }

        # NOTE(review): confirm the installed Ray version accepts
        # batch_format="pydict" here and in iter_batches below.
        mapped = source.map_batches(
            timed_call,
            batch_format="pydict",
            batch_size=self._config.batch_size,
        )

        gathered = []
        for batch_index, out in enumerate(mapped.iter_batches(batch_format="pydict")):
            for row in range(len(out["value"])):
                value = out["value"][row]
                n_rows = out["row_count"][row]
                elapsed_ms = out["duration_ms"][row]
                errs = out["errors"][row]
                meta = out["metadata"][row]
                gathered.append(
                    DistributedResult(
                        partition_id=batch_index,
                        operation=operation,
                        value=value,
                        row_count=n_rows,
                        duration_ms=elapsed_ms,
                        # Falsy values are normalized to fresh empties.
                        errors=errs or [],
                        metadata=meta or {},
                    )
                )

        rows_seen = sum(item.row_count for item in gathered)
        metrics.partitions_processed = len(gathered)
        metrics.rows_processed = rows_seen

        return gathered

    except Exception as exc:
        metrics.errors.append(str(exc))
        raise
    finally:
        # Metrics are always closed, on success and on failure.
        self._end_metrics(metrics)
|
|
470
|
+
|
|
471
|
+
def _aggregate_distributed(
|
|
472
|
+
self,
|
|
473
|
+
spec: AggregationSpec,
|
|
474
|
+
) -> dict[str, Any]:
|
|
475
|
+
"""Perform distributed aggregation using native Ray operations.
|
|
476
|
+
|
|
477
|
+
Args:
|
|
478
|
+
spec: Aggregation specification.
|
|
479
|
+
|
|
480
|
+
Returns:
|
|
481
|
+
Aggregated results.
|
|
482
|
+
"""
|
|
483
|
+
import ray
|
|
484
|
+
|
|
485
|
+
metrics = self._start_metrics("aggregate")
|
|
486
|
+
|
|
487
|
+
try:
|
|
488
|
+
results = {}
|
|
489
|
+
|
|
490
|
+
for agg in spec.aggregations:
|
|
491
|
+
column = agg.column
|
|
492
|
+
operation = agg.operation
|
|
493
|
+
alias = agg.alias
|
|
494
|
+
params = agg.params
|
|
495
|
+
|
|
496
|
+
if operation == "count":
|
|
497
|
+
if column == "*":
|
|
498
|
+
value = self._ds.count()
|
|
499
|
+
else:
|
|
500
|
+
# Count non-null values
|
|
501
|
+
value = self._count_non_null(column)
|
|
502
|
+
results[alias] = value
|
|
503
|
+
|
|
504
|
+
elif operation == "sum":
|
|
505
|
+
value = self._ds.sum(column)
|
|
506
|
+
results[alias] = value
|
|
507
|
+
|
|
508
|
+
elif operation == "mean":
|
|
509
|
+
value = self._ds.mean(column)
|
|
510
|
+
results[alias] = value
|
|
511
|
+
|
|
512
|
+
elif operation == "min":
|
|
513
|
+
value = self._ds.min(column)
|
|
514
|
+
results[alias] = value
|
|
515
|
+
|
|
516
|
+
elif operation == "max":
|
|
517
|
+
value = self._ds.max(column)
|
|
518
|
+
results[alias] = value
|
|
519
|
+
|
|
520
|
+
elif operation == "std":
|
|
521
|
+
value = self._ds.std(column)
|
|
522
|
+
results[alias] = value
|
|
523
|
+
|
|
524
|
+
elif operation == "var":
|
|
525
|
+
# Ray doesn't have built-in var, compute from std
|
|
526
|
+
std = self._ds.std(column)
|
|
527
|
+
value = std ** 2 if std is not None else None
|
|
528
|
+
results[alias] = value
|
|
529
|
+
|
|
530
|
+
elif operation == "minmax":
|
|
531
|
+
min_val = self._ds.min(column)
|
|
532
|
+
max_val = self._ds.max(column)
|
|
533
|
+
results[alias] = {"min": min_val, "max": max_val}
|
|
534
|
+
|
|
535
|
+
elif operation == "null_count":
|
|
536
|
+
null_count = self._count_nulls_column(column)
|
|
537
|
+
total_count = self._ds.count()
|
|
538
|
+
results[alias] = {
|
|
539
|
+
"null_count": null_count,
|
|
540
|
+
"total_count": total_count,
|
|
541
|
+
}
|
|
542
|
+
|
|
543
|
+
elif operation == "distinct_count":
|
|
544
|
+
value = self._count_distinct_column(column)
|
|
545
|
+
results[alias] = value
|
|
546
|
+
|
|
547
|
+
else:
|
|
548
|
+
# Use custom aggregator via map-reduce
|
|
549
|
+
result = self._aggregate_with_aggregator(agg)
|
|
550
|
+
results[alias] = result
|
|
551
|
+
|
|
552
|
+
return results
|
|
553
|
+
|
|
554
|
+
except Exception as e:
|
|
555
|
+
metrics.errors.append(str(e))
|
|
556
|
+
raise
|
|
557
|
+
finally:
|
|
558
|
+
self._end_metrics(metrics)
|
|
559
|
+
|
|
560
|
+
def _count_non_null(self, column: str) -> int:
    """Return how many rows hold a non-null value in ``column``.

    Args:
        column: Name of the column to inspect.

    Returns:
        The dataset row count minus the column's null count.
    """
    row_total = self._ds.count()
    missing = self._count_nulls_column(column)
    return row_total - missing
|
|
565
|
+
|
|
566
|
+
def _count_nulls_column(self, column: str) -> int:
    """Count the ``None`` entries of one column, one remote task per batch.

    Args:
        column: Name of the column to inspect.

    Returns:
        Sum of the per-batch null counts.
    """
    import ray

    @ray.remote
    def count_nulls_batch(batch: dict) -> int:
        # A batch that lacks the column contributes zero.
        return sum(1 for item in batch.get(column, []) if item is None)

    # NOTE(review): confirm "pydict" is a batch_format accepted by the
    # installed Ray version.
    pending = [
        count_nulls_batch.remote(chunk)
        for chunk in self._ds.iter_batches(
            batch_format="pydict",
            batch_size=self._config.batch_size,
        )
    ]

    return sum(ray.get(pending))
|
|
584
|
+
|
|
585
|
+
def _count_distinct_column(self, column: str) -> int:
    """Count the distinct values of a column.

    Args:
        column: Name of the column to inspect.

    Returns:
        Number of distinct values reported by the dataset's ``unique()``.
    """
    distinct = self._ds.unique(column)

    # Ray's Dataset.unique() is documented to return a plain list of values;
    # a list has no zero-argument count(), so measure it with len().
    if isinstance(distinct, (list, tuple, set, frozenset)):
        return len(distinct)

    # Fallback for a backend that hands back a dataset-like object.
    return distinct.count()
|
|
590
|
+
|
|
591
|
+
def _aggregate_with_aggregator(
    self,
    agg: Any,
) -> Any:
    """Run a custom aggregator over the dataset as a map-reduce job.

    Args:
        agg: Aggregation specification; its ``operation``, ``params`` and
            ``column`` attributes are read.

    Returns:
        The aggregator's finalized result.
    """
    import ray

    aggregator = get_aggregator(agg.operation, **agg.params)
    column = agg.column

    @ray.remote
    def map_batch(batch: dict) -> Any:
        # Fold every value of this batch into a fresh partial state.
        partial = aggregator.initialize()
        for item in batch.get(column, []):
            partial = aggregator.accumulate(partial, item)
        return partial

    # Map phase: one remote task per batch, each yielding a partial state.
    # NOTE(review): confirm "pydict" is a batch_format accepted by the
    # installed Ray version.
    pending = [
        map_batch.remote(chunk)
        for chunk in self._ds.iter_batches(
            batch_format="pydict",
            batch_size=self._config.batch_size,
        )
    ]
    partials = ray.get(pending)

    # Reduce phase: merge the partial states; with no batches at all,
    # finalize a freshly initialized state instead.
    if partials:
        merged = reduce(aggregator.merge, partials)
    else:
        merged = aggregator.initialize()
    return aggregator.finalize(merged)
|
|
633
|
+
|
|
634
|
+
def _to_arrow_batches(
    self,
    batch_size: int | None = None,
) -> list["pa.RecordBatch"]:
    """Collect the dataset as a list of Arrow record batches.

    Args:
        batch_size: Rows per batch; a falsy value falls back to the
            configured ``arrow_batch_size``.

    Returns:
        List of Arrow record batches.
    """
    import pyarrow as pa

    chunk_rows = batch_size or self._config.arrow_batch_size

    collected = []
    for chunk in self._ds.iter_batches(
        batch_format="pyarrow",
        batch_size=chunk_rows,
    ):
        if isinstance(chunk, pa.RecordBatch):
            collected.append(chunk)
            continue
        if isinstance(chunk, pa.Table):
            # Tables are split into record batches of at most chunk_rows rows.
            collected.extend(chunk.to_batches(max_chunksize=chunk_rows))
        # A batch of any other type is skipped.

    return collected
|
|
664
|
+
|
|
665
|
+
def _repartition(self, num_partitions: int) -> "RayExecutionEngine":
    """Return a new engine over the dataset split into ``num_partitions`` blocks.

    Args:
        num_partitions: Number of partitions (blocks) to request.

    Returns:
        A new engine wrapping the repartitioned dataset, sharing this
        engine's configuration.
    """
    return RayExecutionEngine(
        self._ds.repartition(num_partitions),
        self._config,
    )
|
|
676
|
+
|
|
677
|
+
def coalesce(self, num_partitions: int) -> "RayExecutionEngine":
    """Return a new engine whose dataset has ``num_partitions`` blocks.

    Args:
        num_partitions: Number of partitions (blocks) to request.

    Returns:
        A new engine wrapping the coalesced dataset, sharing this engine's
        configuration.
    """
    # Implemented with the dataset's repartition(); the original author notes
    # that Ray can reduce partitions this way without a full shuffle.
    return RayExecutionEngine(
        self._ds.repartition(num_partitions),
        self._config,
    )
|
|
689
|
+
|
|
690
|
+
# -------------------------------------------------------------------------
|
|
691
|
+
# Core Operation Overrides (Native Ray)
|
|
692
|
+
# -------------------------------------------------------------------------
|
|
693
|
+
|
|
694
|
+
def count_rows(self) -> int:
    """Return the dataset's row count, computing it at most once.

    The instance-level memo is consulted first, then the keyed cache; only
    when both miss is the dataset counted and both caches filled.

    Returns:
        Number of rows in the dataset.
    """
    memo = self._cached_row_count
    if memo is not None:
        return memo

    key = self._cache_key("count_rows")
    stored = self._get_cached(key)
    if stored is not None:
        return stored

    total = self._ds.count()
    self._cached_row_count = total
    self._set_cached(key, total)
    return total
|
|
708
|
+
|
|
709
|
+
def get_columns(self) -> list[str]:
    """Return the engine's stored column names.

    Returns:
        The ``_columns`` list itself (not a copy).
    """
    names = self._columns
    return names
|
|
712
|
+
|
|
713
|
+
def count_nulls(self, column: str) -> int:
    """Return the cached null count of ``column``, computing it on a miss.

    Args:
        column: Name of the column to inspect.

    Returns:
        Number of null values in the column.
    """
    key = self._cache_key("count_nulls", column)
    found = self._get_cached(key)
    if found is None:
        # Cache miss: compute the count and remember it.
        found = self._count_nulls_column(column)
        self._set_cached(key, found)
    return found
|
|
723
|
+
|
|
724
|
+
def count_nulls_all(self) -> dict[str, int]:
    """Count the ``None`` entries of every column, one remote task per batch.

    Returns:
        Mapping of each column name in ``_columns`` to its null count; the
        mapping is cached for later calls.
    """
    import ray

    key = self._cache_key("count_nulls_all")
    stored = self._get_cached(key)
    if stored is not None:
        return stored

    @ray.remote
    def count_batch_nulls(batch: dict, columns: list) -> dict[str, int]:
        # Per-column null tally for a single batch; a missing column counts 0.
        return {
            name: sum(1 for item in batch.get(name, []) if item is None)
            for name in columns
        }

    # NOTE(review): confirm "pydict" is a batch_format accepted by the
    # installed Ray version.
    pending = [
        count_batch_nulls.remote(chunk, self._columns)
        for chunk in self._ds.iter_batches(
            batch_format="pydict",
            batch_size=self._config.batch_size,
        )
    ]

    # Start every column at zero so columns survive even with no batches.
    totals = dict.fromkeys(self._columns, 0)
    for per_batch in ray.get(pending):
        for name, missing in per_batch.items():
            totals[name] += missing

    self._set_cached(key, totals)
    return totals
|
|
760
|
+
|
|
761
|
+
def count_distinct(self, column: str) -> int:
    """Return the cached distinct count of ``column``, computing it on a miss.

    Args:
        column: Name of the column to inspect.

    Returns:
        Number of distinct values in the column.
    """
    key = self._cache_key("count_distinct", column)
    found = self._get_cached(key)
    if found is None:
        # Cache miss: compute the count and remember it.
        found = self._count_distinct_column(column)
        self._set_cached(key, found)
    return found
|
|
771
|
+
|
|
772
|
+
def get_stats(self, column: str) -> dict[str, Any]:
    """Return summary statistics for ``column``, cached per column.

    Args:
        column: Name of the column to summarize.

    Returns:
        Dict with the keys ``count``, ``null_count``, ``mean``, ``std``,
        ``min`` and ``max``.
    """
    cache_key = self._cache_key("get_stats", column)
    cached = self._get_cached(cache_key)
    if cached is not None:
        return cached

    stats = {
        # Go through count_rows() so an already-known row count is reused
        # instead of counting the whole dataset again.
        "count": self.count_rows(),
        "null_count": self._count_nulls_column(column),
        "mean": self._ds.mean(column),
        "std": self._ds.std(column),
        "min": self._ds.min(column),
        "max": self._ds.max(column),
    }

    self._set_cached(cache_key, stats)
    return stats
|
|
791
|
+
|
|
792
|
+
def get_value_counts(
|
|
793
|
+
self,
|
|
794
|
+
column: str,
|
|
795
|
+
limit: int | None = None,
|
|
796
|
+
) -> dict[Any, int]:
|
|
797
|
+
"""Get value counts."""
|
|
798
|
+
import ray
|
|
799
|
+
|
|
800
|
+
cache_key = self._cache_key("get_value_counts", column, limit)
|
|
801
|
+
cached = self._get_cached(cache_key)
|
|
802
|
+
if cached is not None:
|
|
803
|
+
return cached
|
|
804
|
+
|
|
805
|
+
# Use groupby with count
|
|
806
|
+
grouped = self._ds.groupby(column).count()
|
|
807
|
+
|
|
808
|
+
# Collect and sort
|
|
809
|
+
counts = {}
|
|
810
|
+
for batch in grouped.iter_batches(batch_format="pydict"):
|
|
811
|
+
for i in range(len(batch[column])):
|
|
812
|
+
value = batch[column][i]
|
|
813
|
+
count = batch["count()"][i]
|
|
814
|
+
counts[value] = count
|
|
815
|
+
|
|
816
|
+
# Sort by count descending
|
|
817
|
+
sorted_counts = dict(
|
|
818
|
+
sorted(counts.items(), key=lambda x: x[1], reverse=True)
|
|
819
|
+
)
|
|
820
|
+
|
|
821
|
+
if limit:
|
|
822
|
+
sorted_counts = dict(list(sorted_counts.items())[:limit])
|
|
823
|
+
|
|
824
|
+
self._set_cached(cache_key, sorted_counts)
|
|
825
|
+
return sorted_counts
|
|
826
|
+
|
|
827
|
+
def count_duplicates(self, columns: list[str]) -> int:
    """Count rows that repeat an earlier combination of ``columns``.

    Args:
        columns: Column names that together identify a row.

    Returns:
        Total row count minus the number of distinct value combinations.
        The result is cached per column tuple.
    """
    cache_key = self._cache_key("count_duplicates", tuple(columns))
    cached = self._get_cached(cache_key)
    if cached is not None:
        return cached

    total = self.count_rows()

    if len(columns) == 1:
        distinct = self._ds.unique(columns[0])
        # Ray's Dataset.unique() is documented to return a plain list, which
        # has no zero-argument count(); measure it with len().
        if isinstance(distinct, (list, tuple, set, frozenset)):
            unique_count = len(distinct)
        else:
            # Fallback for a backend that hands back a dataset-like object.
            unique_count = distinct.count()
    else:
        # One grouped row per distinct combination of the key columns.
        unique_count = self._ds.groupby(columns).count().count()

    duplicates = total - unique_count
    self._set_cached(cache_key, duplicates)
    return duplicates
|
|
852
|
+
|
|
853
|
+
def count_matching_regex(self, column: str, pattern: str) -> int:
    """Count the non-null values of ``column`` that match ``pattern``.

    Values are converted with ``str()`` and tested with ``re.match``, so the
    pattern must match from the start of the string.

    Args:
        column: Name of the column to inspect.
        pattern: Regular expression source.

    Returns:
        Number of matching values; cached per ``(column, pattern)``.
    """
    import ray
    import re

    key = self._cache_key("count_matching_regex", column, pattern)
    stored = self._get_cached(key)
    if stored is not None:
        return stored

    # Compile once; the compiled pattern is captured by the remote task.
    compiled = re.compile(pattern)

    @ray.remote
    def count_matches_batch(batch: dict) -> int:
        hits = 0
        for item in batch.get(column, []):
            if item is None:
                continue
            if compiled.match(str(item)):
                hits += 1
        return hits

    # NOTE(review): confirm "pydict" is a batch_format accepted by the
    # installed Ray version.
    pending = [
        count_matches_batch.remote(chunk)
        for chunk in self._ds.iter_batches(
            batch_format="pydict",
            batch_size=self._config.batch_size,
        )
    ]
    total = sum(ray.get(pending))

    self._set_cached(key, total)
    return total
|
|
885
|
+
|
|
886
|
+
def count_in_range(
    self,
    column: str,
    min_value: Any | None = None,
    max_value: Any | None = None,
    inclusive: bool = True,
) -> int:
    """Count the non-null values of ``column`` that fall within a range.

    Args:
        column: Name of the column to inspect.
        min_value: Lower bound, or ``None`` for no lower bound.
        max_value: Upper bound, or ``None`` for no upper bound.
        inclusive: Whether values equal to a bound count as in range.

    Returns:
        Number of values in range; cached per argument combination.
    """
    import ray

    key = self._cache_key(
        "count_in_range", column, min_value, max_value, inclusive
    )
    stored = self._get_cached(key)
    if stored is not None:
        return stored

    @ray.remote
    def count_range_batch(batch: dict) -> int:
        hits = 0
        for item in batch.get(column, []):
            if item is None:
                continue
            if min_value is not None:
                above = item >= min_value if inclusive else item > min_value
                if not above:
                    continue
            if max_value is not None:
                below = item <= max_value if inclusive else item < max_value
                if not below:
                    continue
            hits += 1
        return hits

    # NOTE(review): confirm "pydict" is a batch_format accepted by the
    # installed Ray version.
    pending = [
        count_range_batch.remote(chunk)
        for chunk in self._ds.iter_batches(
            batch_format="pydict",
            batch_size=self._config.batch_size,
        )
    ]
    total = sum(ray.get(pending))

    self._set_cached(key, total)
    return total
|
|
931
|
+
|
|
932
|
+
def count_in_set(self, column: str, values: set[Any]) -> int:
    """Count the values of ``column`` that are members of ``values``.

    Args:
        column: Name of the column to inspect.
        values: Allowed values.

    Returns:
        Number of column values found in the set; cached per
        ``(column, frozenset(values))``.
    """
    import ray

    key = self._cache_key("count_in_set", column, frozenset(values))
    stored = self._get_cached(key)
    if stored is not None:
        return stored

    # Private copy of the allowed values, captured by the remote task.
    values_set = set(values)

    @ray.remote
    def count_in_set_batch(batch: dict) -> int:
        hits = 0
        for item in batch.get(column, []):
            if item in values_set:
                hits += 1
        return hits

    # NOTE(review): confirm "pydict" is a batch_format accepted by the
    # installed Ray version.
    pending = [
        count_in_set_batch.remote(chunk)
        for chunk in self._ds.iter_batches(
            batch_format="pydict",
            batch_size=self._config.batch_size,
        )
    ]
    total = sum(ray.get(pending))

    self._set_cached(key, total)
    return total
|
|
960
|
+
|
|
961
|
+
# -------------------------------------------------------------------------
|
|
962
|
+
# Sampling
|
|
963
|
+
# -------------------------------------------------------------------------
|
|
964
|
+
|
|
965
|
+
def sample(
|
|
966
|
+
self,
|
|
967
|
+
n: int = 1000,
|
|
968
|
+
seed: int | None = None,
|
|
969
|
+
) -> "RayExecutionEngine":
|
|
970
|
+
"""Create sampled engine using Ray's native sampling.
|
|
971
|
+
|
|
972
|
+
Args:
|
|
973
|
+
n: Target number of rows.
|
|
974
|
+
seed: Random seed.
|
|
975
|
+
|
|
976
|
+
Returns:
|
|
977
|
+
New engine with sampled data.
|
|
978
|
+
"""
|
|
979
|
+
row_count = self.count_rows()
|
|
980
|
+
|
|
981
|
+
if row_count <= n:
|
|
982
|
+
return self
|
|
983
|
+
|
|
984
|
+
fraction = min((n * 1.1) / row_count, 1.0)
|
|
985
|
+
|
|
986
|
+
# Ray's random_sample method
|
|
987
|
+
sampled = self._ds.random_sample(fraction, seed=seed)
|
|
988
|
+
|
|
989
|
+
# Limit to exact n rows
|
|
990
|
+
sampled = sampled.limit(n)
|
|
991
|
+
|
|
992
|
+
return RayExecutionEngine(sampled, self._config)
|
|
993
|
+
|
|
994
|
+
# -------------------------------------------------------------------------
|
|
995
|
+
# Ray-Specific Methods
|
|
996
|
+
# -------------------------------------------------------------------------
|
|
997
|
+
|
|
998
|
+
def materialize(self) -> "RayExecutionEngine":
    """Replace the wrapped dataset with its materialized form.

    Returns:
        This same engine, now holding the result of the dataset's
        ``materialize()`` call.
    """
    materialized = self._ds.materialize()
    self._ds = materialized
    return self
|
|
1006
|
+
|
|
1007
|
+
def filter(
    self,
    fn: Callable[[dict[str, Any]], bool],
) -> "RayExecutionEngine":
    """Return a new engine over the rows accepted by ``fn``.

    Args:
        fn: Predicate taking a row dict and returning a bool; it is passed
            unchanged to the dataset's ``filter``.

    Returns:
        A new engine over the filtered dataset, sharing this engine's
        configuration.
    """
    return RayExecutionEngine(self._ds.filter(fn), self._config)
|
|
1021
|
+
|
|
1022
|
+
def select_columns(self, columns: list[str]) -> "RayExecutionEngine":
    """Return a new engine restricted to the given columns.

    Args:
        columns: Names of the columns to select.

    Returns:
        A new engine over the projected dataset, sharing this engine's
        configuration.
    """
    return RayExecutionEngine(
        self._ds.select_columns(columns),
        self._config,
    )
|
|
1033
|
+
|
|
1034
|
+
def take(self, n: int = 5) -> list[dict[str, Any]]:
    """Return the result of the dataset's ``take(n)``.

    Args:
        n: Number of rows to request.

    Returns:
        List of row dictionaries.
    """
    rows = self._ds.take(n)
    return rows
|
|
1044
|
+
|
|
1045
|
+
def take_all(self) -> list[dict[str, Any]]:
    """Return the result of the dataset's ``take_all()``.

    Returns:
        List of row dictionaries.
    """
    rows = self._ds.take_all()
    return rows
|
|
1052
|
+
|
|
1053
|
+
def show(self, n: int = 20) -> None:
    """Display rows by delegating to the dataset's ``show``.

    Args:
        n: Number of rows to show.
    """
    display = self._ds.show
    display(n)
|
|
1060
|
+
|
|
1061
|
+
def to_pandas(self) -> Any:
    """Return the result of the dataset's ``to_pandas()``.

    Returns:
        Pandas DataFrame.
    """
    frame = self._ds.to_pandas()
    return frame
|
|
1068
|
+
|
|
1069
|
+
def to_arrow(self) -> "pa.Table":
    """Convert the dataset to a single Arrow table.

    A native ``to_arrow()`` on the wrapped dataset is used when one exists.
    Otherwise the table is assembled from ``to_arrow_refs()``, the Arrow
    export that Ray's Dataset documents.

    Returns:
        PyArrow Table.
    """
    native = getattr(self._ds, "to_arrow", None)
    if callable(native):
        return native()

    import pyarrow as pa
    import ray

    # to_arrow_refs() yields one object reference per Arrow table block.
    tables = ray.get(self._ds.to_arrow_refs())
    if not tables:
        # concat_tables() rejects an empty list; return an empty table.
        return pa.table({})
    return pa.concat_tables(tables)
|
|
1076
|
+
|
|
1077
|
+
def write_parquet(self, path: str, **kwargs: Any) -> None:
    """Write the dataset as Parquet by delegating to the dataset's writer.

    Args:
        path: Output path.
        **kwargs: Extra keyword arguments passed through unchanged to the
            dataset's ``write_parquet``.
    """
    writer = self._ds.write_parquet
    writer(path, **kwargs)
|
|
1085
|
+
|
|
1086
|
+
def write_csv(self, path: str, **kwargs: Any) -> None:
    """Write the dataset as CSV by delegating to the dataset's writer.

    Args:
        path: Output path.
        **kwargs: Extra keyword arguments passed through unchanged to the
            dataset's ``write_csv``.
    """
    writer = self._ds.write_csv
    writer(path, **kwargs)
|
|
1094
|
+
|
|
1095
|
+
def stats(self) -> str:
    """Return the result of the dataset's ``stats()``.

    Returns:
        Statistics string.
    """
    report = self._ds.stats()
    return report
|
|
1102
|
+
|
|
1103
|
+
def schema_str(self) -> str:
    """Return the stored schema rendered with ``str()``.

    Returns:
        Schema string.
    """
    schema = self._schema
    return str(schema)
|
|
1110
|
+
|
|
1111
|
+
# -------------------------------------------------------------------------
|
|
1112
|
+
# Context Manager
|
|
1113
|
+
# -------------------------------------------------------------------------
|
|
1114
|
+
|
|
1115
|
+
def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
    """Leave the context by delegating to the parent class.

    Args:
        exc_type: Exception type, forwarded to the parent ``__exit__``.
        exc_val: Exception value, forwarded to the parent ``__exit__``.
        exc_tb: Traceback, forwarded to the parent ``__exit__``.
    """
    # Ray is intentionally left running here because the runtime might be
    # shared; users are expected to manage the Ray lifecycle separately.
    super().__exit__(exc_type, exc_val, exc_tb)
|
|
1120
|
+
|
|
1121
|
+
@staticmethod
def shutdown() -> None:
    """Shut down Ray if it is currently initialized; otherwise do nothing."""
    import ray

    if not ray.is_initialized():
        return
    ray.shutdown()
|