truthound 1.0.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- truthound/__init__.py +162 -0
- truthound/adapters.py +100 -0
- truthound/api.py +365 -0
- truthound/audit/__init__.py +248 -0
- truthound/audit/core.py +967 -0
- truthound/audit/filters.py +620 -0
- truthound/audit/formatters.py +707 -0
- truthound/audit/logger.py +902 -0
- truthound/audit/middleware.py +571 -0
- truthound/audit/storage.py +1083 -0
- truthound/benchmark/__init__.py +123 -0
- truthound/benchmark/base.py +757 -0
- truthound/benchmark/comparison.py +635 -0
- truthound/benchmark/generators.py +706 -0
- truthound/benchmark/reporters.py +718 -0
- truthound/benchmark/runner.py +635 -0
- truthound/benchmark/scenarios.py +712 -0
- truthound/cache.py +252 -0
- truthound/checkpoint/__init__.py +136 -0
- truthound/checkpoint/actions/__init__.py +164 -0
- truthound/checkpoint/actions/base.py +324 -0
- truthound/checkpoint/actions/custom.py +234 -0
- truthound/checkpoint/actions/discord_notify.py +290 -0
- truthound/checkpoint/actions/email_notify.py +405 -0
- truthound/checkpoint/actions/github_action.py +406 -0
- truthound/checkpoint/actions/opsgenie.py +1499 -0
- truthound/checkpoint/actions/pagerduty.py +226 -0
- truthound/checkpoint/actions/slack_notify.py +233 -0
- truthound/checkpoint/actions/store_result.py +249 -0
- truthound/checkpoint/actions/teams_notify.py +1570 -0
- truthound/checkpoint/actions/telegram_notify.py +419 -0
- truthound/checkpoint/actions/update_docs.py +552 -0
- truthound/checkpoint/actions/webhook.py +293 -0
- truthound/checkpoint/analytics/__init__.py +147 -0
- truthound/checkpoint/analytics/aggregations/__init__.py +23 -0
- truthound/checkpoint/analytics/aggregations/rollup.py +481 -0
- truthound/checkpoint/analytics/aggregations/time_bucket.py +306 -0
- truthound/checkpoint/analytics/analyzers/__init__.py +17 -0
- truthound/checkpoint/analytics/analyzers/anomaly.py +386 -0
- truthound/checkpoint/analytics/analyzers/base.py +270 -0
- truthound/checkpoint/analytics/analyzers/forecast.py +421 -0
- truthound/checkpoint/analytics/analyzers/trend.py +314 -0
- truthound/checkpoint/analytics/models.py +292 -0
- truthound/checkpoint/analytics/protocols.py +549 -0
- truthound/checkpoint/analytics/service.py +718 -0
- truthound/checkpoint/analytics/stores/__init__.py +16 -0
- truthound/checkpoint/analytics/stores/base.py +306 -0
- truthound/checkpoint/analytics/stores/memory_store.py +353 -0
- truthound/checkpoint/analytics/stores/sqlite_store.py +557 -0
- truthound/checkpoint/analytics/stores/timescale_store.py +501 -0
- truthound/checkpoint/async_actions.py +794 -0
- truthound/checkpoint/async_base.py +708 -0
- truthound/checkpoint/async_checkpoint.py +617 -0
- truthound/checkpoint/async_runner.py +639 -0
- truthound/checkpoint/checkpoint.py +527 -0
- truthound/checkpoint/ci/__init__.py +61 -0
- truthound/checkpoint/ci/detector.py +355 -0
- truthound/checkpoint/ci/reporter.py +436 -0
- truthound/checkpoint/ci/templates.py +454 -0
- truthound/checkpoint/circuitbreaker/__init__.py +133 -0
- truthound/checkpoint/circuitbreaker/breaker.py +542 -0
- truthound/checkpoint/circuitbreaker/core.py +252 -0
- truthound/checkpoint/circuitbreaker/detection.py +459 -0
- truthound/checkpoint/circuitbreaker/middleware.py +389 -0
- truthound/checkpoint/circuitbreaker/registry.py +357 -0
- truthound/checkpoint/distributed/__init__.py +139 -0
- truthound/checkpoint/distributed/backends/__init__.py +35 -0
- truthound/checkpoint/distributed/backends/celery_backend.py +503 -0
- truthound/checkpoint/distributed/backends/kubernetes_backend.py +696 -0
- truthound/checkpoint/distributed/backends/local_backend.py +397 -0
- truthound/checkpoint/distributed/backends/ray_backend.py +625 -0
- truthound/checkpoint/distributed/base.py +774 -0
- truthound/checkpoint/distributed/orchestrator.py +765 -0
- truthound/checkpoint/distributed/protocols.py +842 -0
- truthound/checkpoint/distributed/registry.py +449 -0
- truthound/checkpoint/idempotency/__init__.py +120 -0
- truthound/checkpoint/idempotency/core.py +295 -0
- truthound/checkpoint/idempotency/fingerprint.py +454 -0
- truthound/checkpoint/idempotency/locking.py +604 -0
- truthound/checkpoint/idempotency/service.py +592 -0
- truthound/checkpoint/idempotency/stores.py +653 -0
- truthound/checkpoint/monitoring/__init__.py +134 -0
- truthound/checkpoint/monitoring/aggregators/__init__.py +15 -0
- truthound/checkpoint/monitoring/aggregators/base.py +372 -0
- truthound/checkpoint/monitoring/aggregators/realtime.py +300 -0
- truthound/checkpoint/monitoring/aggregators/window.py +493 -0
- truthound/checkpoint/monitoring/collectors/__init__.py +17 -0
- truthound/checkpoint/monitoring/collectors/base.py +257 -0
- truthound/checkpoint/monitoring/collectors/memory_collector.py +617 -0
- truthound/checkpoint/monitoring/collectors/prometheus_collector.py +451 -0
- truthound/checkpoint/monitoring/collectors/redis_collector.py +518 -0
- truthound/checkpoint/monitoring/events.py +410 -0
- truthound/checkpoint/monitoring/protocols.py +636 -0
- truthound/checkpoint/monitoring/service.py +578 -0
- truthound/checkpoint/monitoring/views/__init__.py +17 -0
- truthound/checkpoint/monitoring/views/base.py +172 -0
- truthound/checkpoint/monitoring/views/queue_view.py +220 -0
- truthound/checkpoint/monitoring/views/task_view.py +240 -0
- truthound/checkpoint/monitoring/views/worker_view.py +263 -0
- truthound/checkpoint/registry.py +337 -0
- truthound/checkpoint/runner.py +356 -0
- truthound/checkpoint/transaction/__init__.py +133 -0
- truthound/checkpoint/transaction/base.py +389 -0
- truthound/checkpoint/transaction/compensatable.py +537 -0
- truthound/checkpoint/transaction/coordinator.py +576 -0
- truthound/checkpoint/transaction/executor.py +622 -0
- truthound/checkpoint/transaction/idempotency.py +534 -0
- truthound/checkpoint/transaction/saga/__init__.py +143 -0
- truthound/checkpoint/transaction/saga/builder.py +584 -0
- truthound/checkpoint/transaction/saga/definition.py +515 -0
- truthound/checkpoint/transaction/saga/event_store.py +542 -0
- truthound/checkpoint/transaction/saga/patterns.py +833 -0
- truthound/checkpoint/transaction/saga/runner.py +718 -0
- truthound/checkpoint/transaction/saga/state_machine.py +793 -0
- truthound/checkpoint/transaction/saga/strategies.py +780 -0
- truthound/checkpoint/transaction/saga/testing.py +886 -0
- truthound/checkpoint/triggers/__init__.py +58 -0
- truthound/checkpoint/triggers/base.py +237 -0
- truthound/checkpoint/triggers/event.py +385 -0
- truthound/checkpoint/triggers/schedule.py +355 -0
- truthound/cli.py +2358 -0
- truthound/cli_modules/__init__.py +124 -0
- truthound/cli_modules/advanced/__init__.py +45 -0
- truthound/cli_modules/advanced/benchmark.py +343 -0
- truthound/cli_modules/advanced/docs.py +225 -0
- truthound/cli_modules/advanced/lineage.py +209 -0
- truthound/cli_modules/advanced/ml.py +320 -0
- truthound/cli_modules/advanced/realtime.py +196 -0
- truthound/cli_modules/checkpoint/__init__.py +46 -0
- truthound/cli_modules/checkpoint/init.py +114 -0
- truthound/cli_modules/checkpoint/list.py +71 -0
- truthound/cli_modules/checkpoint/run.py +159 -0
- truthound/cli_modules/checkpoint/validate.py +67 -0
- truthound/cli_modules/common/__init__.py +71 -0
- truthound/cli_modules/common/errors.py +414 -0
- truthound/cli_modules/common/options.py +419 -0
- truthound/cli_modules/common/output.py +507 -0
- truthound/cli_modules/common/protocol.py +552 -0
- truthound/cli_modules/core/__init__.py +48 -0
- truthound/cli_modules/core/check.py +123 -0
- truthound/cli_modules/core/compare.py +104 -0
- truthound/cli_modules/core/learn.py +57 -0
- truthound/cli_modules/core/mask.py +77 -0
- truthound/cli_modules/core/profile.py +65 -0
- truthound/cli_modules/core/scan.py +61 -0
- truthound/cli_modules/profiler/__init__.py +51 -0
- truthound/cli_modules/profiler/auto_profile.py +175 -0
- truthound/cli_modules/profiler/metadata.py +107 -0
- truthound/cli_modules/profiler/suite.py +283 -0
- truthound/cli_modules/registry.py +431 -0
- truthound/cli_modules/scaffolding/__init__.py +89 -0
- truthound/cli_modules/scaffolding/base.py +631 -0
- truthound/cli_modules/scaffolding/commands.py +545 -0
- truthound/cli_modules/scaffolding/plugins.py +1072 -0
- truthound/cli_modules/scaffolding/reporters.py +594 -0
- truthound/cli_modules/scaffolding/validators.py +1127 -0
- truthound/common/__init__.py +18 -0
- truthound/common/resilience/__init__.py +130 -0
- truthound/common/resilience/bulkhead.py +266 -0
- truthound/common/resilience/circuit_breaker.py +516 -0
- truthound/common/resilience/composite.py +332 -0
- truthound/common/resilience/config.py +292 -0
- truthound/common/resilience/protocols.py +217 -0
- truthound/common/resilience/rate_limiter.py +404 -0
- truthound/common/resilience/retry.py +341 -0
- truthound/datadocs/__init__.py +260 -0
- truthound/datadocs/base.py +571 -0
- truthound/datadocs/builder.py +761 -0
- truthound/datadocs/charts.py +764 -0
- truthound/datadocs/dashboard/__init__.py +63 -0
- truthound/datadocs/dashboard/app.py +576 -0
- truthound/datadocs/dashboard/components.py +584 -0
- truthound/datadocs/dashboard/state.py +240 -0
- truthound/datadocs/engine/__init__.py +46 -0
- truthound/datadocs/engine/context.py +376 -0
- truthound/datadocs/engine/pipeline.py +618 -0
- truthound/datadocs/engine/registry.py +469 -0
- truthound/datadocs/exporters/__init__.py +49 -0
- truthound/datadocs/exporters/base.py +198 -0
- truthound/datadocs/exporters/html.py +178 -0
- truthound/datadocs/exporters/json_exporter.py +253 -0
- truthound/datadocs/exporters/markdown.py +284 -0
- truthound/datadocs/exporters/pdf.py +392 -0
- truthound/datadocs/i18n/__init__.py +86 -0
- truthound/datadocs/i18n/catalog.py +960 -0
- truthound/datadocs/i18n/formatting.py +505 -0
- truthound/datadocs/i18n/loader.py +256 -0
- truthound/datadocs/i18n/plurals.py +378 -0
- truthound/datadocs/renderers/__init__.py +42 -0
- truthound/datadocs/renderers/base.py +401 -0
- truthound/datadocs/renderers/custom.py +342 -0
- truthound/datadocs/renderers/jinja.py +697 -0
- truthound/datadocs/sections.py +736 -0
- truthound/datadocs/styles.py +931 -0
- truthound/datadocs/themes/__init__.py +101 -0
- truthound/datadocs/themes/base.py +336 -0
- truthound/datadocs/themes/default.py +417 -0
- truthound/datadocs/themes/enterprise.py +419 -0
- truthound/datadocs/themes/loader.py +336 -0
- truthound/datadocs/themes.py +301 -0
- truthound/datadocs/transformers/__init__.py +57 -0
- truthound/datadocs/transformers/base.py +268 -0
- truthound/datadocs/transformers/enrichers.py +544 -0
- truthound/datadocs/transformers/filters.py +447 -0
- truthound/datadocs/transformers/i18n.py +468 -0
- truthound/datadocs/versioning/__init__.py +62 -0
- truthound/datadocs/versioning/diff.py +639 -0
- truthound/datadocs/versioning/storage.py +497 -0
- truthound/datadocs/versioning/version.py +358 -0
- truthound/datasources/__init__.py +223 -0
- truthound/datasources/_async_protocols.py +222 -0
- truthound/datasources/_protocols.py +159 -0
- truthound/datasources/adapters.py +428 -0
- truthound/datasources/async_base.py +599 -0
- truthound/datasources/async_factory.py +511 -0
- truthound/datasources/base.py +516 -0
- truthound/datasources/factory.py +433 -0
- truthound/datasources/nosql/__init__.py +47 -0
- truthound/datasources/nosql/base.py +487 -0
- truthound/datasources/nosql/elasticsearch.py +801 -0
- truthound/datasources/nosql/mongodb.py +636 -0
- truthound/datasources/pandas_optimized.py +582 -0
- truthound/datasources/pandas_source.py +216 -0
- truthound/datasources/polars_source.py +395 -0
- truthound/datasources/spark_source.py +479 -0
- truthound/datasources/sql/__init__.py +154 -0
- truthound/datasources/sql/base.py +710 -0
- truthound/datasources/sql/bigquery.py +410 -0
- truthound/datasources/sql/cloud_base.py +199 -0
- truthound/datasources/sql/databricks.py +471 -0
- truthound/datasources/sql/mysql.py +316 -0
- truthound/datasources/sql/oracle.py +427 -0
- truthound/datasources/sql/postgresql.py +321 -0
- truthound/datasources/sql/redshift.py +479 -0
- truthound/datasources/sql/snowflake.py +439 -0
- truthound/datasources/sql/sqlite.py +286 -0
- truthound/datasources/sql/sqlserver.py +437 -0
- truthound/datasources/streaming/__init__.py +47 -0
- truthound/datasources/streaming/base.py +350 -0
- truthound/datasources/streaming/kafka.py +670 -0
- truthound/decorators.py +98 -0
- truthound/docs/__init__.py +69 -0
- truthound/docs/extractor.py +971 -0
- truthound/docs/generator.py +601 -0
- truthound/docs/parser.py +1037 -0
- truthound/docs/renderer.py +999 -0
- truthound/drift/__init__.py +22 -0
- truthound/drift/compare.py +189 -0
- truthound/drift/detectors.py +464 -0
- truthound/drift/report.py +160 -0
- truthound/execution/__init__.py +65 -0
- truthound/execution/_protocols.py +324 -0
- truthound/execution/base.py +576 -0
- truthound/execution/distributed/__init__.py +179 -0
- truthound/execution/distributed/aggregations.py +731 -0
- truthound/execution/distributed/arrow_bridge.py +817 -0
- truthound/execution/distributed/base.py +550 -0
- truthound/execution/distributed/dask_engine.py +976 -0
- truthound/execution/distributed/mixins.py +766 -0
- truthound/execution/distributed/protocols.py +756 -0
- truthound/execution/distributed/ray_engine.py +1127 -0
- truthound/execution/distributed/registry.py +446 -0
- truthound/execution/distributed/spark_engine.py +1011 -0
- truthound/execution/distributed/validator_adapter.py +682 -0
- truthound/execution/pandas_engine.py +401 -0
- truthound/execution/polars_engine.py +497 -0
- truthound/execution/pushdown/__init__.py +230 -0
- truthound/execution/pushdown/ast.py +1550 -0
- truthound/execution/pushdown/builder.py +1550 -0
- truthound/execution/pushdown/dialects.py +1072 -0
- truthound/execution/pushdown/executor.py +829 -0
- truthound/execution/pushdown/optimizer.py +1041 -0
- truthound/execution/sql_engine.py +518 -0
- truthound/infrastructure/__init__.py +189 -0
- truthound/infrastructure/audit.py +1515 -0
- truthound/infrastructure/config.py +1133 -0
- truthound/infrastructure/encryption.py +1132 -0
- truthound/infrastructure/logging.py +1503 -0
- truthound/infrastructure/metrics.py +1220 -0
- truthound/lineage/__init__.py +89 -0
- truthound/lineage/base.py +746 -0
- truthound/lineage/impact_analysis.py +474 -0
- truthound/lineage/integrations/__init__.py +22 -0
- truthound/lineage/integrations/openlineage.py +548 -0
- truthound/lineage/tracker.py +512 -0
- truthound/lineage/visualization/__init__.py +33 -0
- truthound/lineage/visualization/protocols.py +145 -0
- truthound/lineage/visualization/renderers/__init__.py +20 -0
- truthound/lineage/visualization/renderers/cytoscape.py +329 -0
- truthound/lineage/visualization/renderers/d3.py +331 -0
- truthound/lineage/visualization/renderers/graphviz.py +276 -0
- truthound/lineage/visualization/renderers/mermaid.py +308 -0
- truthound/maskers.py +113 -0
- truthound/ml/__init__.py +124 -0
- truthound/ml/anomaly_models/__init__.py +31 -0
- truthound/ml/anomaly_models/ensemble.py +362 -0
- truthound/ml/anomaly_models/isolation_forest.py +444 -0
- truthound/ml/anomaly_models/statistical.py +392 -0
- truthound/ml/base.py +1178 -0
- truthound/ml/drift_detection/__init__.py +26 -0
- truthound/ml/drift_detection/concept.py +381 -0
- truthound/ml/drift_detection/distribution.py +361 -0
- truthound/ml/drift_detection/feature.py +442 -0
- truthound/ml/drift_detection/multivariate.py +495 -0
- truthound/ml/monitoring/__init__.py +88 -0
- truthound/ml/monitoring/alerting/__init__.py +33 -0
- truthound/ml/monitoring/alerting/handlers.py +427 -0
- truthound/ml/monitoring/alerting/rules.py +508 -0
- truthound/ml/monitoring/collectors/__init__.py +19 -0
- truthound/ml/monitoring/collectors/composite.py +105 -0
- truthound/ml/monitoring/collectors/drift.py +324 -0
- truthound/ml/monitoring/collectors/performance.py +179 -0
- truthound/ml/monitoring/collectors/quality.py +369 -0
- truthound/ml/monitoring/monitor.py +536 -0
- truthound/ml/monitoring/protocols.py +451 -0
- truthound/ml/monitoring/stores/__init__.py +15 -0
- truthound/ml/monitoring/stores/memory.py +201 -0
- truthound/ml/monitoring/stores/prometheus.py +296 -0
- truthound/ml/rule_learning/__init__.py +25 -0
- truthound/ml/rule_learning/constraint_miner.py +443 -0
- truthound/ml/rule_learning/pattern_learner.py +499 -0
- truthound/ml/rule_learning/profile_learner.py +462 -0
- truthound/multitenancy/__init__.py +326 -0
- truthound/multitenancy/core.py +852 -0
- truthound/multitenancy/integration.py +597 -0
- truthound/multitenancy/isolation.py +630 -0
- truthound/multitenancy/manager.py +770 -0
- truthound/multitenancy/middleware.py +765 -0
- truthound/multitenancy/quota.py +537 -0
- truthound/multitenancy/resolvers.py +603 -0
- truthound/multitenancy/storage.py +703 -0
- truthound/observability/__init__.py +307 -0
- truthound/observability/context.py +531 -0
- truthound/observability/instrumentation.py +611 -0
- truthound/observability/logging.py +887 -0
- truthound/observability/metrics.py +1157 -0
- truthound/observability/tracing/__init__.py +178 -0
- truthound/observability/tracing/baggage.py +310 -0
- truthound/observability/tracing/config.py +426 -0
- truthound/observability/tracing/exporter.py +787 -0
- truthound/observability/tracing/integration.py +1018 -0
- truthound/observability/tracing/otel/__init__.py +146 -0
- truthound/observability/tracing/otel/adapter.py +982 -0
- truthound/observability/tracing/otel/bridge.py +1177 -0
- truthound/observability/tracing/otel/compat.py +681 -0
- truthound/observability/tracing/otel/config.py +691 -0
- truthound/observability/tracing/otel/detection.py +327 -0
- truthound/observability/tracing/otel/protocols.py +426 -0
- truthound/observability/tracing/processor.py +561 -0
- truthound/observability/tracing/propagator.py +757 -0
- truthound/observability/tracing/provider.py +569 -0
- truthound/observability/tracing/resource.py +515 -0
- truthound/observability/tracing/sampler.py +487 -0
- truthound/observability/tracing/span.py +676 -0
- truthound/plugins/__init__.py +198 -0
- truthound/plugins/base.py +599 -0
- truthound/plugins/cli.py +680 -0
- truthound/plugins/dependencies/__init__.py +42 -0
- truthound/plugins/dependencies/graph.py +422 -0
- truthound/plugins/dependencies/resolver.py +417 -0
- truthound/plugins/discovery.py +379 -0
- truthound/plugins/docs/__init__.py +46 -0
- truthound/plugins/docs/extractor.py +444 -0
- truthound/plugins/docs/renderer.py +499 -0
- truthound/plugins/enterprise_manager.py +877 -0
- truthound/plugins/examples/__init__.py +19 -0
- truthound/plugins/examples/custom_validators.py +317 -0
- truthound/plugins/examples/slack_notifier.py +312 -0
- truthound/plugins/examples/xml_reporter.py +254 -0
- truthound/plugins/hooks.py +558 -0
- truthound/plugins/lifecycle/__init__.py +43 -0
- truthound/plugins/lifecycle/hot_reload.py +402 -0
- truthound/plugins/lifecycle/manager.py +371 -0
- truthound/plugins/manager.py +736 -0
- truthound/plugins/registry.py +338 -0
- truthound/plugins/security/__init__.py +93 -0
- truthound/plugins/security/exceptions.py +332 -0
- truthound/plugins/security/policies.py +348 -0
- truthound/plugins/security/protocols.py +643 -0
- truthound/plugins/security/sandbox/__init__.py +45 -0
- truthound/plugins/security/sandbox/context.py +158 -0
- truthound/plugins/security/sandbox/engines/__init__.py +19 -0
- truthound/plugins/security/sandbox/engines/container.py +379 -0
- truthound/plugins/security/sandbox/engines/noop.py +144 -0
- truthound/plugins/security/sandbox/engines/process.py +336 -0
- truthound/plugins/security/sandbox/factory.py +211 -0
- truthound/plugins/security/signing/__init__.py +57 -0
- truthound/plugins/security/signing/service.py +330 -0
- truthound/plugins/security/signing/trust_store.py +368 -0
- truthound/plugins/security/signing/verifier.py +459 -0
- truthound/plugins/versioning/__init__.py +41 -0
- truthound/plugins/versioning/constraints.py +297 -0
- truthound/plugins/versioning/resolver.py +329 -0
- truthound/profiler/__init__.py +1729 -0
- truthound/profiler/_lazy.py +452 -0
- truthound/profiler/ab_testing/__init__.py +80 -0
- truthound/profiler/ab_testing/analysis.py +449 -0
- truthound/profiler/ab_testing/base.py +257 -0
- truthound/profiler/ab_testing/experiment.py +395 -0
- truthound/profiler/ab_testing/tracking.py +368 -0
- truthound/profiler/auto_threshold.py +1170 -0
- truthound/profiler/base.py +579 -0
- truthound/profiler/cache_patterns.py +911 -0
- truthound/profiler/caching.py +1303 -0
- truthound/profiler/column_profiler.py +712 -0
- truthound/profiler/comparison.py +1007 -0
- truthound/profiler/custom_patterns.py +1170 -0
- truthound/profiler/dashboard/__init__.py +50 -0
- truthound/profiler/dashboard/app.py +476 -0
- truthound/profiler/dashboard/components.py +457 -0
- truthound/profiler/dashboard/config.py +72 -0
- truthound/profiler/distributed/__init__.py +83 -0
- truthound/profiler/distributed/base.py +281 -0
- truthound/profiler/distributed/dask_backend.py +498 -0
- truthound/profiler/distributed/local_backend.py +293 -0
- truthound/profiler/distributed/profiler.py +304 -0
- truthound/profiler/distributed/ray_backend.py +374 -0
- truthound/profiler/distributed/spark_backend.py +375 -0
- truthound/profiler/distributed.py +1366 -0
- truthound/profiler/enterprise_sampling.py +1065 -0
- truthound/profiler/errors.py +488 -0
- truthound/profiler/evolution/__init__.py +91 -0
- truthound/profiler/evolution/alerts.py +426 -0
- truthound/profiler/evolution/changes.py +206 -0
- truthound/profiler/evolution/compatibility.py +365 -0
- truthound/profiler/evolution/detector.py +372 -0
- truthound/profiler/evolution/protocols.py +121 -0
- truthound/profiler/generators/__init__.py +48 -0
- truthound/profiler/generators/base.py +384 -0
- truthound/profiler/generators/ml_rules.py +375 -0
- truthound/profiler/generators/pattern_rules.py +384 -0
- truthound/profiler/generators/schema_rules.py +267 -0
- truthound/profiler/generators/stats_rules.py +324 -0
- truthound/profiler/generators/suite_generator.py +857 -0
- truthound/profiler/i18n.py +1542 -0
- truthound/profiler/incremental.py +554 -0
- truthound/profiler/incremental_validation.py +1710 -0
- truthound/profiler/integration/__init__.py +73 -0
- truthound/profiler/integration/adapters.py +345 -0
- truthound/profiler/integration/context.py +371 -0
- truthound/profiler/integration/executor.py +527 -0
- truthound/profiler/integration/naming.py +75 -0
- truthound/profiler/integration/protocols.py +243 -0
- truthound/profiler/memory.py +1185 -0
- truthound/profiler/migration/__init__.py +60 -0
- truthound/profiler/migration/base.py +345 -0
- truthound/profiler/migration/manager.py +444 -0
- truthound/profiler/migration/v1_0_to_v1_1.py +484 -0
- truthound/profiler/ml/__init__.py +73 -0
- truthound/profiler/ml/base.py +244 -0
- truthound/profiler/ml/classifier.py +507 -0
- truthound/profiler/ml/feature_extraction.py +604 -0
- truthound/profiler/ml/pretrained.py +448 -0
- truthound/profiler/ml_inference.py +1276 -0
- truthound/profiler/native_patterns.py +815 -0
- truthound/profiler/observability.py +1184 -0
- truthound/profiler/process_timeout.py +1566 -0
- truthound/profiler/progress.py +568 -0
- truthound/profiler/progress_callbacks.py +1734 -0
- truthound/profiler/quality.py +1345 -0
- truthound/profiler/resilience.py +1180 -0
- truthound/profiler/sampled_matcher.py +794 -0
- truthound/profiler/sampling.py +1288 -0
- truthound/profiler/scheduling/__init__.py +82 -0
- truthound/profiler/scheduling/protocols.py +214 -0
- truthound/profiler/scheduling/scheduler.py +474 -0
- truthound/profiler/scheduling/storage.py +457 -0
- truthound/profiler/scheduling/triggers.py +449 -0
- truthound/profiler/schema.py +603 -0
- truthound/profiler/streaming.py +685 -0
- truthound/profiler/streaming_patterns.py +1354 -0
- truthound/profiler/suite_cli.py +625 -0
- truthound/profiler/suite_config.py +789 -0
- truthound/profiler/suite_export.py +1268 -0
- truthound/profiler/table_profiler.py +547 -0
- truthound/profiler/timeout.py +565 -0
- truthound/profiler/validation.py +1532 -0
- truthound/profiler/visualization/__init__.py +118 -0
- truthound/profiler/visualization/base.py +346 -0
- truthound/profiler/visualization/generator.py +1259 -0
- truthound/profiler/visualization/plotly_renderer.py +811 -0
- truthound/profiler/visualization/renderers.py +669 -0
- truthound/profiler/visualization/sections.py +540 -0
- truthound/profiler/visualization.py +2122 -0
- truthound/profiler/yaml_validation.py +1151 -0
- truthound/py.typed +0 -0
- truthound/ratelimit/__init__.py +248 -0
- truthound/ratelimit/algorithms.py +1108 -0
- truthound/ratelimit/core.py +573 -0
- truthound/ratelimit/integration.py +532 -0
- truthound/ratelimit/limiter.py +663 -0
- truthound/ratelimit/middleware.py +700 -0
- truthound/ratelimit/policy.py +792 -0
- truthound/ratelimit/storage.py +763 -0
- truthound/rbac/__init__.py +340 -0
- truthound/rbac/core.py +976 -0
- truthound/rbac/integration.py +760 -0
- truthound/rbac/manager.py +1052 -0
- truthound/rbac/middleware.py +842 -0
- truthound/rbac/policy.py +954 -0
- truthound/rbac/storage.py +878 -0
- truthound/realtime/__init__.py +141 -0
- truthound/realtime/adapters/__init__.py +43 -0
- truthound/realtime/adapters/base.py +533 -0
- truthound/realtime/adapters/kafka.py +487 -0
- truthound/realtime/adapters/kinesis.py +479 -0
- truthound/realtime/adapters/mock.py +243 -0
- truthound/realtime/base.py +553 -0
- truthound/realtime/factory.py +382 -0
- truthound/realtime/incremental.py +660 -0
- truthound/realtime/processing/__init__.py +67 -0
- truthound/realtime/processing/exactly_once.py +575 -0
- truthound/realtime/processing/state.py +547 -0
- truthound/realtime/processing/windows.py +647 -0
- truthound/realtime/protocols.py +569 -0
- truthound/realtime/streaming.py +605 -0
- truthound/realtime/testing/__init__.py +32 -0
- truthound/realtime/testing/containers.py +615 -0
- truthound/realtime/testing/fixtures.py +484 -0
- truthound/report.py +280 -0
- truthound/reporters/__init__.py +46 -0
- truthound/reporters/_protocols.py +30 -0
- truthound/reporters/base.py +324 -0
- truthound/reporters/ci/__init__.py +66 -0
- truthound/reporters/ci/azure.py +436 -0
- truthound/reporters/ci/base.py +509 -0
- truthound/reporters/ci/bitbucket.py +567 -0
- truthound/reporters/ci/circleci.py +547 -0
- truthound/reporters/ci/detection.py +364 -0
- truthound/reporters/ci/factory.py +182 -0
- truthound/reporters/ci/github.py +388 -0
- truthound/reporters/ci/gitlab.py +471 -0
- truthound/reporters/ci/jenkins.py +525 -0
- truthound/reporters/console_reporter.py +299 -0
- truthound/reporters/factory.py +211 -0
- truthound/reporters/html_reporter.py +524 -0
- truthound/reporters/json_reporter.py +256 -0
- truthound/reporters/markdown_reporter.py +280 -0
- truthound/reporters/sdk/__init__.py +174 -0
- truthound/reporters/sdk/builder.py +558 -0
- truthound/reporters/sdk/mixins.py +1150 -0
- truthound/reporters/sdk/schema.py +1493 -0
- truthound/reporters/sdk/templates.py +666 -0
- truthound/reporters/sdk/testing.py +968 -0
- truthound/scanners.py +170 -0
- truthound/scheduling/__init__.py +122 -0
- truthound/scheduling/cron.py +1136 -0
- truthound/scheduling/presets.py +212 -0
- truthound/schema.py +275 -0
- truthound/secrets/__init__.py +173 -0
- truthound/secrets/base.py +618 -0
- truthound/secrets/cloud.py +682 -0
- truthound/secrets/integration.py +507 -0
- truthound/secrets/manager.py +633 -0
- truthound/secrets/oidc/__init__.py +172 -0
- truthound/secrets/oidc/base.py +902 -0
- truthound/secrets/oidc/credential_provider.py +623 -0
- truthound/secrets/oidc/exchangers.py +1001 -0
- truthound/secrets/oidc/github/__init__.py +110 -0
- truthound/secrets/oidc/github/claims.py +718 -0
- truthound/secrets/oidc/github/enhanced_provider.py +693 -0
- truthound/secrets/oidc/github/trust_policy.py +742 -0
- truthound/secrets/oidc/github/verification.py +723 -0
- truthound/secrets/oidc/github/workflow.py +691 -0
- truthound/secrets/oidc/providers.py +825 -0
- truthound/secrets/providers.py +506 -0
- truthound/secrets/resolver.py +495 -0
- truthound/stores/__init__.py +177 -0
- truthound/stores/backends/__init__.py +18 -0
- truthound/stores/backends/_protocols.py +340 -0
- truthound/stores/backends/azure_blob.py +530 -0
- truthound/stores/backends/concurrent_filesystem.py +915 -0
- truthound/stores/backends/connection_pool.py +1365 -0
- truthound/stores/backends/database.py +743 -0
- truthound/stores/backends/filesystem.py +538 -0
- truthound/stores/backends/gcs.py +399 -0
- truthound/stores/backends/memory.py +354 -0
- truthound/stores/backends/s3.py +434 -0
- truthound/stores/backpressure/__init__.py +84 -0
- truthound/stores/backpressure/base.py +375 -0
- truthound/stores/backpressure/circuit_breaker.py +434 -0
- truthound/stores/backpressure/monitor.py +376 -0
- truthound/stores/backpressure/strategies.py +677 -0
- truthound/stores/base.py +551 -0
- truthound/stores/batching/__init__.py +65 -0
- truthound/stores/batching/base.py +305 -0
- truthound/stores/batching/buffer.py +370 -0
- truthound/stores/batching/store.py +248 -0
- truthound/stores/batching/writer.py +521 -0
- truthound/stores/caching/__init__.py +60 -0
- truthound/stores/caching/backends.py +684 -0
- truthound/stores/caching/base.py +356 -0
- truthound/stores/caching/store.py +305 -0
- truthound/stores/compression/__init__.py +193 -0
- truthound/stores/compression/adaptive.py +694 -0
- truthound/stores/compression/base.py +514 -0
- truthound/stores/compression/pipeline.py +868 -0
- truthound/stores/compression/providers.py +672 -0
- truthound/stores/compression/streaming.py +832 -0
- truthound/stores/concurrency/__init__.py +81 -0
- truthound/stores/concurrency/atomic.py +556 -0
- truthound/stores/concurrency/index.py +775 -0
- truthound/stores/concurrency/locks.py +576 -0
- truthound/stores/concurrency/manager.py +482 -0
- truthound/stores/encryption/__init__.py +297 -0
- truthound/stores/encryption/base.py +952 -0
- truthound/stores/encryption/keys.py +1191 -0
- truthound/stores/encryption/pipeline.py +903 -0
- truthound/stores/encryption/providers.py +953 -0
- truthound/stores/encryption/streaming.py +950 -0
- truthound/stores/expectations.py +227 -0
- truthound/stores/factory.py +246 -0
- truthound/stores/migration/__init__.py +75 -0
- truthound/stores/migration/base.py +480 -0
- truthound/stores/migration/manager.py +347 -0
- truthound/stores/migration/registry.py +382 -0
- truthound/stores/migration/store.py +559 -0
- truthound/stores/observability/__init__.py +106 -0
- truthound/stores/observability/audit.py +718 -0
- truthound/stores/observability/config.py +270 -0
- truthound/stores/observability/factory.py +208 -0
- truthound/stores/observability/metrics.py +636 -0
- truthound/stores/observability/protocols.py +410 -0
- truthound/stores/observability/store.py +570 -0
- truthound/stores/observability/tracing.py +784 -0
- truthound/stores/replication/__init__.py +76 -0
- truthound/stores/replication/base.py +260 -0
- truthound/stores/replication/monitor.py +269 -0
- truthound/stores/replication/store.py +439 -0
- truthound/stores/replication/syncer.py +391 -0
- truthound/stores/results.py +359 -0
- truthound/stores/retention/__init__.py +77 -0
- truthound/stores/retention/base.py +378 -0
- truthound/stores/retention/policies.py +621 -0
- truthound/stores/retention/scheduler.py +279 -0
- truthound/stores/retention/store.py +526 -0
- truthound/stores/streaming/__init__.py +138 -0
- truthound/stores/streaming/base.py +801 -0
- truthound/stores/streaming/database.py +984 -0
- truthound/stores/streaming/filesystem.py +719 -0
- truthound/stores/streaming/reader.py +629 -0
- truthound/stores/streaming/s3.py +843 -0
- truthound/stores/streaming/writer.py +790 -0
- truthound/stores/tiering/__init__.py +108 -0
- truthound/stores/tiering/base.py +462 -0
- truthound/stores/tiering/manager.py +249 -0
- truthound/stores/tiering/policies.py +692 -0
- truthound/stores/tiering/store.py +526 -0
- truthound/stores/versioning/__init__.py +56 -0
- truthound/stores/versioning/base.py +376 -0
- truthound/stores/versioning/store.py +660 -0
- truthound/stores/versioning/strategies.py +353 -0
- truthound/types.py +56 -0
- truthound/validators/__init__.py +774 -0
- truthound/validators/aggregate/__init__.py +27 -0
- truthound/validators/aggregate/central.py +116 -0
- truthound/validators/aggregate/extremes.py +116 -0
- truthound/validators/aggregate/spread.py +118 -0
- truthound/validators/aggregate/sum.py +64 -0
- truthound/validators/aggregate/type.py +78 -0
- truthound/validators/anomaly/__init__.py +93 -0
- truthound/validators/anomaly/base.py +431 -0
- truthound/validators/anomaly/ml_based.py +1190 -0
- truthound/validators/anomaly/multivariate.py +647 -0
- truthound/validators/anomaly/statistical.py +599 -0
- truthound/validators/base.py +1089 -0
- truthound/validators/business_rule/__init__.py +46 -0
- truthound/validators/business_rule/base.py +147 -0
- truthound/validators/business_rule/checksum.py +509 -0
- truthound/validators/business_rule/financial.py +526 -0
- truthound/validators/cache.py +733 -0
- truthound/validators/completeness/__init__.py +39 -0
- truthound/validators/completeness/conditional.py +73 -0
- truthound/validators/completeness/default.py +98 -0
- truthound/validators/completeness/empty.py +103 -0
- truthound/validators/completeness/nan.py +337 -0
- truthound/validators/completeness/null.py +152 -0
- truthound/validators/cross_table/__init__.py +17 -0
- truthound/validators/cross_table/aggregate.py +333 -0
- truthound/validators/cross_table/row_count.py +122 -0
- truthound/validators/datetime/__init__.py +29 -0
- truthound/validators/datetime/format.py +78 -0
- truthound/validators/datetime/freshness.py +269 -0
- truthound/validators/datetime/order.py +73 -0
- truthound/validators/datetime/parseable.py +185 -0
- truthound/validators/datetime/range.py +202 -0
- truthound/validators/datetime/timezone.py +69 -0
- truthound/validators/distribution/__init__.py +49 -0
- truthound/validators/distribution/distribution.py +128 -0
- truthound/validators/distribution/monotonic.py +119 -0
- truthound/validators/distribution/outlier.py +178 -0
- truthound/validators/distribution/quantile.py +80 -0
- truthound/validators/distribution/range.py +254 -0
- truthound/validators/distribution/set.py +125 -0
- truthound/validators/distribution/statistical.py +459 -0
- truthound/validators/drift/__init__.py +79 -0
- truthound/validators/drift/base.py +427 -0
- truthound/validators/drift/multi_feature.py +401 -0
- truthound/validators/drift/numeric.py +395 -0
- truthound/validators/drift/psi.py +446 -0
- truthound/validators/drift/statistical.py +510 -0
- truthound/validators/enterprise.py +1658 -0
- truthound/validators/geospatial/__init__.py +80 -0
- truthound/validators/geospatial/base.py +97 -0
- truthound/validators/geospatial/boundary.py +238 -0
- truthound/validators/geospatial/coordinate.py +351 -0
- truthound/validators/geospatial/distance.py +399 -0
- truthound/validators/geospatial/polygon.py +665 -0
- truthound/validators/i18n/__init__.py +308 -0
- truthound/validators/i18n/bidi.py +571 -0
- truthound/validators/i18n/catalogs.py +570 -0
- truthound/validators/i18n/dialects.py +763 -0
- truthound/validators/i18n/extended_catalogs.py +549 -0
- truthound/validators/i18n/formatting.py +1434 -0
- truthound/validators/i18n/loader.py +1020 -0
- truthound/validators/i18n/messages.py +521 -0
- truthound/validators/i18n/plural.py +683 -0
- truthound/validators/i18n/protocols.py +855 -0
- truthound/validators/i18n/tms.py +1162 -0
- truthound/validators/localization/__init__.py +53 -0
- truthound/validators/localization/base.py +122 -0
- truthound/validators/localization/chinese.py +362 -0
- truthound/validators/localization/japanese.py +275 -0
- truthound/validators/localization/korean.py +524 -0
- truthound/validators/memory/__init__.py +94 -0
- truthound/validators/memory/approximate_knn.py +506 -0
- truthound/validators/memory/base.py +547 -0
- truthound/validators/memory/sgd_online.py +719 -0
- truthound/validators/memory/streaming_ecdf.py +753 -0
- truthound/validators/ml_feature/__init__.py +54 -0
- truthound/validators/ml_feature/base.py +249 -0
- truthound/validators/ml_feature/correlation.py +299 -0
- truthound/validators/ml_feature/leakage.py +344 -0
- truthound/validators/ml_feature/null_impact.py +270 -0
- truthound/validators/ml_feature/scale.py +264 -0
- truthound/validators/multi_column/__init__.py +89 -0
- truthound/validators/multi_column/arithmetic.py +284 -0
- truthound/validators/multi_column/base.py +231 -0
- truthound/validators/multi_column/comparison.py +273 -0
- truthound/validators/multi_column/consistency.py +312 -0
- truthound/validators/multi_column/statistical.py +299 -0
- truthound/validators/optimization/__init__.py +164 -0
- truthound/validators/optimization/aggregation.py +563 -0
- truthound/validators/optimization/covariance.py +556 -0
- truthound/validators/optimization/geo.py +626 -0
- truthound/validators/optimization/graph.py +587 -0
- truthound/validators/optimization/orchestrator.py +970 -0
- truthound/validators/optimization/profiling.py +1312 -0
- truthound/validators/privacy/__init__.py +223 -0
- truthound/validators/privacy/base.py +635 -0
- truthound/validators/privacy/ccpa.py +670 -0
- truthound/validators/privacy/gdpr.py +728 -0
- truthound/validators/privacy/global_patterns.py +604 -0
- truthound/validators/privacy/plugins.py +867 -0
- truthound/validators/profiling/__init__.py +52 -0
- truthound/validators/profiling/base.py +175 -0
- truthound/validators/profiling/cardinality.py +312 -0
- truthound/validators/profiling/entropy.py +391 -0
- truthound/validators/profiling/frequency.py +455 -0
- truthound/validators/pushdown_support.py +660 -0
- truthound/validators/query/__init__.py +91 -0
- truthound/validators/query/aggregate.py +346 -0
- truthound/validators/query/base.py +246 -0
- truthound/validators/query/column.py +249 -0
- truthound/validators/query/expression.py +274 -0
- truthound/validators/query/result.py +323 -0
- truthound/validators/query/row_count.py +264 -0
- truthound/validators/referential/__init__.py +80 -0
- truthound/validators/referential/base.py +395 -0
- truthound/validators/referential/cascade.py +391 -0
- truthound/validators/referential/circular.py +563 -0
- truthound/validators/referential/foreign_key.py +624 -0
- truthound/validators/referential/orphan.py +485 -0
- truthound/validators/registry.py +112 -0
- truthound/validators/schema/__init__.py +41 -0
- truthound/validators/schema/column_count.py +142 -0
- truthound/validators/schema/column_exists.py +80 -0
- truthound/validators/schema/column_order.py +82 -0
- truthound/validators/schema/column_pair.py +85 -0
- truthound/validators/schema/column_pair_set.py +195 -0
- truthound/validators/schema/column_type.py +94 -0
- truthound/validators/schema/multi_column.py +53 -0
- truthound/validators/schema/multi_column_aggregate.py +175 -0
- truthound/validators/schema/referential.py +274 -0
- truthound/validators/schema/table_schema.py +91 -0
- truthound/validators/schema_validator.py +219 -0
- truthound/validators/sdk/__init__.py +250 -0
- truthound/validators/sdk/builder.py +680 -0
- truthound/validators/sdk/decorators.py +474 -0
- truthound/validators/sdk/enterprise/__init__.py +211 -0
- truthound/validators/sdk/enterprise/docs.py +725 -0
- truthound/validators/sdk/enterprise/fuzzing.py +659 -0
- truthound/validators/sdk/enterprise/licensing.py +709 -0
- truthound/validators/sdk/enterprise/manager.py +543 -0
- truthound/validators/sdk/enterprise/resources.py +628 -0
- truthound/validators/sdk/enterprise/sandbox.py +766 -0
- truthound/validators/sdk/enterprise/signing.py +603 -0
- truthound/validators/sdk/enterprise/templates.py +865 -0
- truthound/validators/sdk/enterprise/versioning.py +659 -0
- truthound/validators/sdk/templates.py +757 -0
- truthound/validators/sdk/testing.py +807 -0
- truthound/validators/security/__init__.py +181 -0
- truthound/validators/security/redos/__init__.py +182 -0
- truthound/validators/security/redos/core.py +861 -0
- truthound/validators/security/redos/cpu_monitor.py +593 -0
- truthound/validators/security/redos/cve_database.py +791 -0
- truthound/validators/security/redos/ml/__init__.py +155 -0
- truthound/validators/security/redos/ml/base.py +785 -0
- truthound/validators/security/redos/ml/datasets.py +618 -0
- truthound/validators/security/redos/ml/features.py +359 -0
- truthound/validators/security/redos/ml/models.py +1000 -0
- truthound/validators/security/redos/ml/predictor.py +507 -0
- truthound/validators/security/redos/ml/storage.py +632 -0
- truthound/validators/security/redos/ml/training.py +571 -0
- truthound/validators/security/redos/ml_analyzer.py +937 -0
- truthound/validators/security/redos/optimizer.py +674 -0
- truthound/validators/security/redos/profiler.py +682 -0
- truthound/validators/security/redos/re2_engine.py +709 -0
- truthound/validators/security/redos.py +886 -0
- truthound/validators/security/sql_security.py +1247 -0
- truthound/validators/streaming/__init__.py +126 -0
- truthound/validators/streaming/base.py +292 -0
- truthound/validators/streaming/completeness.py +210 -0
- truthound/validators/streaming/mixin.py +575 -0
- truthound/validators/streaming/range.py +308 -0
- truthound/validators/streaming/sources.py +846 -0
- truthound/validators/string/__init__.py +57 -0
- truthound/validators/string/casing.py +158 -0
- truthound/validators/string/charset.py +96 -0
- truthound/validators/string/format.py +501 -0
- truthound/validators/string/json.py +77 -0
- truthound/validators/string/json_schema.py +184 -0
- truthound/validators/string/length.py +104 -0
- truthound/validators/string/like_pattern.py +237 -0
- truthound/validators/string/regex.py +202 -0
- truthound/validators/string/regex_extended.py +435 -0
- truthound/validators/table/__init__.py +88 -0
- truthound/validators/table/base.py +78 -0
- truthound/validators/table/column_count.py +198 -0
- truthound/validators/table/freshness.py +362 -0
- truthound/validators/table/row_count.py +251 -0
- truthound/validators/table/schema.py +333 -0
- truthound/validators/table/size.py +285 -0
- truthound/validators/timeout/__init__.py +102 -0
- truthound/validators/timeout/advanced/__init__.py +247 -0
- truthound/validators/timeout/advanced/circuit_breaker.py +675 -0
- truthound/validators/timeout/advanced/prediction.py +773 -0
- truthound/validators/timeout/advanced/priority.py +618 -0
- truthound/validators/timeout/advanced/redis_backend.py +770 -0
- truthound/validators/timeout/advanced/retry.py +721 -0
- truthound/validators/timeout/advanced/sampling.py +788 -0
- truthound/validators/timeout/advanced/sla.py +661 -0
- truthound/validators/timeout/advanced/telemetry.py +804 -0
- truthound/validators/timeout/cascade.py +477 -0
- truthound/validators/timeout/deadline.py +657 -0
- truthound/validators/timeout/degradation.py +525 -0
- truthound/validators/timeout/distributed.py +597 -0
- truthound/validators/timeseries/__init__.py +89 -0
- truthound/validators/timeseries/base.py +326 -0
- truthound/validators/timeseries/completeness.py +617 -0
- truthound/validators/timeseries/gap.py +485 -0
- truthound/validators/timeseries/monotonic.py +310 -0
- truthound/validators/timeseries/seasonality.py +422 -0
- truthound/validators/timeseries/trend.py +510 -0
- truthound/validators/uniqueness/__init__.py +59 -0
- truthound/validators/uniqueness/approximate.py +475 -0
- truthound/validators/uniqueness/distinct_values.py +253 -0
- truthound/validators/uniqueness/duplicate.py +118 -0
- truthound/validators/uniqueness/primary_key.py +140 -0
- truthound/validators/uniqueness/unique.py +191 -0
- truthound/validators/uniqueness/within_record.py +599 -0
- truthound/validators/utils.py +756 -0
- truthound-1.0.8.dist-info/METADATA +474 -0
- truthound-1.0.8.dist-info/RECORD +877 -0
- truthound-1.0.8.dist-info/WHEEL +4 -0
- truthound-1.0.8.dist-info/entry_points.txt +2 -0
- truthound-1.0.8.dist-info/licenses/LICENSE +190 -0
|
@@ -0,0 +1,1065 @@
|
|
|
1
|
+
"""Enterprise-grade sampling strategies for 100M+ scale datasets.
|
|
2
|
+
|
|
3
|
+
This module extends the base sampling framework with optimizations for
|
|
4
|
+
extremely large datasets that cannot fit in memory.
|
|
5
|
+
|
|
6
|
+
Key Features:
|
|
7
|
+
- Block-based parallel sampling for distributed processing
|
|
8
|
+
- Memory-aware adaptive sampling with backpressure
|
|
9
|
+
- Multi-stage sampling for ultra-large datasets
|
|
10
|
+
- Statistical quality guarantees with confidence bounds
|
|
11
|
+
- Time-budget aware sampling
|
|
12
|
+
- Column-type aware optimization
|
|
13
|
+
|
|
14
|
+
Design Principles:
|
|
15
|
+
- O(1) memory footprint regardless of data size
|
|
16
|
+
- Streaming-first architecture
|
|
17
|
+
- Progressive refinement (quick estimates → accurate results)
|
|
18
|
+
- Fail-safe with graceful degradation
|
|
19
|
+
|
|
20
|
+
Scale Targets:
|
|
21
|
+
- 100M+ rows: Block-based sampling
|
|
22
|
+
- 1B+ rows: Multi-stage hierarchical sampling
|
|
23
|
+
- 10B+ rows: Probabilistic sketches (HyperLogLog, Count-Min)
|
|
24
|
+
|
|
25
|
+
Usage:
|
|
26
|
+
from truthound.profiler.enterprise_sampling import (
|
|
27
|
+
EnterpriseScaleConfig, EnterpriseScaleSampler,
|
|
28
|
+
BlockSamplingStrategy,
|
|
29
|
+
MemoryBudgetConfig,
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
# For 100M+ rows
|
|
33
|
+
config = EnterpriseScaleConfig(
|
|
34
|
+
target_rows=100_000,
|
|
35
|
+
memory_budget=MemoryBudgetConfig(max_memory_mb=512),
|
|
36
|
+
time_budget_seconds=60,
|
|
37
|
+
)
|
|
38
|
+
sampler = EnterpriseScaleSampler(config)
|
|
39
|
+
result = sampler.sample(lf)
|
|
40
|
+
"""
|
|
41
|
+
|
|
42
|
+
from __future__ import annotations
|
|
43
|
+
|
|
44
|
+
import gc
|
|
45
|
+
import logging
|
|
46
|
+
import math
|
|
47
|
+
import os
|
|
48
|
+
import random
|
|
49
|
+
import time
|
|
50
|
+
import threading
|
|
51
|
+
from abc import ABC, abstractmethod
|
|
52
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
53
|
+
from dataclasses import dataclass, field
|
|
54
|
+
from enum import Enum, auto
|
|
55
|
+
from typing import Any, Callable, Iterator, TypeVar, Generic
|
|
56
|
+
|
|
57
|
+
import polars as pl
|
|
58
|
+
|
|
59
|
+
from truthound.profiler.sampling import (
|
|
60
|
+
SamplingConfig,
|
|
61
|
+
SamplingMetrics,
|
|
62
|
+
SamplingResult,
|
|
63
|
+
SamplingStrategy,
|
|
64
|
+
SamplingMethod,
|
|
65
|
+
DEFAULT_SAMPLING_CONFIG,
|
|
66
|
+
)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
logger = logging.getLogger(__name__)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
# =============================================================================
|
|
73
|
+
# Constants and Configuration
|
|
74
|
+
# =============================================================================
|
|
75
|
+
|
|
76
|
+
# Row-count boundaries between scale categories
LARGE_SCALE_THRESHOLD = 10**7  # 10M rows
XLARGE_SCALE_THRESHOLD = 10**8  # 100M rows
XXLARGE_SCALE_THRESHOLD = 10**9  # 1B rows

# Rows handled per block when the block size is auto-selected
DEFAULT_BLOCK_SIZE_LARGE = 10**6  # 1M rows per block
DEFAULT_BLOCK_SIZE_XLARGE = 5 * 10**6  # 5M rows per block

# Sizing helpers for memory estimates
BYTES_PER_ROW_ESTIMATE = 200  # Conservative per-row guess
MB = 1 << 20  # bytes in a mebibyte
GB = MB << 10  # bytes in a gibibyte
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
class ScaleCategory(Enum):
    """Size buckets a dataset can fall into, smallest first."""

    # Values are spelled out; they match what ``auto()`` would assign (1..5).
    SMALL = 1  # < 1M rows
    MEDIUM = 2  # 1M - 10M rows
    LARGE = 3  # 10M - 100M rows
    XLARGE = 4  # 100M - 1B rows
    XXLARGE = 5  # > 1B rows
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
class SamplingQuality(Enum):
    """Quality tiers a caller can request, from cheapest to exact."""

    # Values are spelled out; they match what ``auto()`` would assign (1..5).
    SKETCH = 1  # Fast approximation (HyperLogLog-level)
    QUICK = 2  # Quick estimate (90% confidence)
    STANDARD = 3  # Standard quality (95% confidence)
    HIGH = 4  # High quality (99% confidence)
    EXACT = 5  # Full scan (100% accuracy)
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
# =============================================================================
|
|
110
|
+
# Memory Budget Configuration
|
|
111
|
+
# =============================================================================
|
|
112
|
+
|
|
113
|
+
@dataclass
class MemoryBudgetConfig:
    """Configuration for memory-aware sampling.

    Attributes:
        max_memory_mb: Maximum memory to use, in MB.
        reserved_memory_mb: Memory (MB) held back from ``max_memory_mb``;
            the remainder is what ``available_memory_mb`` reports.
        gc_threshold_mb: Usage level (MB) above which GC should be triggered.
        enable_monitoring: Enable continuous memory monitoring.
        backpressure_enabled: Enable backpressure when memory is low.
    """

    max_memory_mb: int = 1024  # 1GB default
    reserved_memory_mb: int = 256
    gc_threshold_mb: int = 768
    enable_monitoring: bool = True
    backpressure_enabled: bool = True

    @property
    def available_memory_mb(self) -> int:
        """Return the memory (MB) left for sampling after the reserve."""
        return self.max_memory_mb - self.reserved_memory_mb

    @classmethod
    def _for_max_memory(cls, max_mb: int) -> "MemoryBudgetConfig":
        """Build a config whose reserve and GC threshold scale with ``max_mb``.

        Uses the same proportions as the class defaults (reserve = 25%,
        GC threshold = 75% of the maximum), so ``max_mb=1024`` reproduces
        the default configuration exactly.
        """
        return cls(
            max_memory_mb=max_mb,
            reserved_memory_mb=max_mb // 4,
            gc_threshold_mb=int(max_mb * 0.75),
        )

    @classmethod
    def auto_detect(cls) -> "MemoryBudgetConfig":
        """Derive a budget from currently available system memory.

        Requires ``psutil``; when it is not installed the conservative class
        defaults are returned instead.

        Returns:
            A config capped at 25% of available memory, at most 4 GB.
        """
        try:
            import psutil
        except ImportError:
            # Fallback to conservative defaults
            return cls()

        available_mb = psutil.virtual_memory().available // MB
        # Use 25% of available memory, max 4GB
        return cls._for_max_memory(min(available_mb // 4, 4096))

    @classmethod
    def for_scale(cls, scale: ScaleCategory) -> "MemoryBudgetConfig":
        """Create a config appropriate for the given data scale.

        The reserve and GC threshold are scaled together with the maximum.
        Overriding only ``max_memory_mb`` would leave the 256 MB reserve and
        768 MB GC threshold in place, which for the SMALL budget means zero
        available memory and a GC threshold above the maximum itself.

        Args:
            scale: Dataset scale category.

        Returns:
            The matching config, or the class defaults for an unknown scale.
        """
        budget_by_scale = {
            ScaleCategory.SMALL: 256,
            ScaleCategory.MEDIUM: 512,
            ScaleCategory.LARGE: 1024,
            ScaleCategory.XLARGE: 2048,
            ScaleCategory.XXLARGE: 4096,
        }
        max_mb = budget_by_scale.get(scale)
        if max_mb is None:
            return cls()
        return cls._for_max_memory(max_mb)
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
# =============================================================================
|
|
168
|
+
# Enterprise Scale Configuration
|
|
169
|
+
# =============================================================================
|
|
170
|
+
|
|
171
|
+
@dataclass
class EnterpriseScaleConfig:
    """Configuration for enterprise-scale sampling.

    Attributes:
        target_rows: Target number of rows to sample. Must be positive,
            except that ``0`` is accepted together with
            ``SamplingQuality.EXACT`` to mean "no cap" (full scan).
        memory_budget: Memory budget configuration.
        time_budget_seconds: Maximum time for sampling (0 = unlimited).
        quality: Desired sampling quality.
        block_size: Rows per processing block (0 = auto).
        max_parallel_blocks: Maximum parallel block processing.
        enable_progressive: Enable progressive refinement.
        seed: Random seed for reproducibility.
        confidence_level: Statistical confidence level of the sample.
        margin_of_error: Accepted margin of error of the sample.
        min_sample_ratio: Lower bound on the sampled fraction.
        max_sample_ratio: Upper bound on the sampled fraction.
    """

    target_rows: int = 100_000
    memory_budget: MemoryBudgetConfig = field(default_factory=MemoryBudgetConfig)
    time_budget_seconds: float = 0.0  # 0 = unlimited
    quality: SamplingQuality = SamplingQuality.STANDARD
    block_size: int = 0  # 0 = auto-detect
    max_parallel_blocks: int = 4
    enable_progressive: bool = True
    seed: int | None = None

    # Statistical parameters
    confidence_level: float = 0.95
    margin_of_error: float = 0.05

    # Adaptive parameters
    min_sample_ratio: float = 0.001  # At least 0.1%
    max_sample_ratio: float = 0.10  # At most 10%

    def __post_init__(self) -> None:
        """Validate the row target and the time budget.

        Raises:
            ValueError: If ``target_rows`` is not positive (other than the
                ``0`` + EXACT combination) or ``time_budget_seconds`` is
                negative.
        """
        # for_quality("exact") builds its config with target_rows=0; rejecting
        # that here made the "exact" preset impossible to construct.
        uncapped_exact = (
            self.target_rows == 0 and self.quality is SamplingQuality.EXACT
        )
        if self.target_rows <= 0 and not uncapped_exact:
            raise ValueError(f"target_rows must be positive, got {self.target_rows}")
        if self.time_budget_seconds < 0:
            raise ValueError("time_budget_seconds must be non-negative")

    def get_block_size(self, total_rows: int) -> int:
        """Return the block size to use for a dataset of ``total_rows`` rows.

        An explicit ``block_size`` wins; otherwise the size is picked from
        the dataset's scale category. The result is never below 1 so callers
        can safely divide by it.
        """
        if self.block_size > 0:
            return self.block_size

        # Auto-detect based on scale
        scale = self.classify_scale(total_rows)
        if scale in (ScaleCategory.SMALL, ScaleCategory.MEDIUM):
            # Small inputs fit in a single block; never report a 0-row block.
            return max(1, min(total_rows, 1_000_000))
        if scale == ScaleCategory.LARGE:
            return DEFAULT_BLOCK_SIZE_LARGE
        return DEFAULT_BLOCK_SIZE_XLARGE

    @staticmethod
    def classify_scale(total_rows: int) -> ScaleCategory:
        """Map a row count to its ``ScaleCategory``."""
        if total_rows < 1_000_000:
            return ScaleCategory.SMALL
        if total_rows < LARGE_SCALE_THRESHOLD:
            return ScaleCategory.MEDIUM
        if total_rows < XLARGE_SCALE_THRESHOLD:
            return ScaleCategory.LARGE
        if total_rows < XXLARGE_SCALE_THRESHOLD:
            return ScaleCategory.XLARGE
        return ScaleCategory.XXLARGE

    @classmethod
    def for_quality(cls, quality: str) -> "EnterpriseScaleConfig":
        """Create a config preset for a named quality level.

        Args:
            quality: One of ``"sketch"``, ``"quick"``, ``"standard"``,
                ``"high"`` or ``"exact"``. Any other value falls back to the
                ``"standard"`` preset.

        Returns:
            A config with the preset's target rows, quality, confidence
            level and margin of error. The ``"exact"`` preset uses
            ``target_rows=0`` (no cap).
        """
        # name -> (quality, target_rows, confidence_level, margin_of_error)
        quality_map = {
            "sketch": (SamplingQuality.SKETCH, 10_000, 0.90, 0.15),
            "quick": (SamplingQuality.QUICK, 50_000, 0.90, 0.10),
            "standard": (SamplingQuality.STANDARD, 100_000, 0.95, 0.05),
            "high": (SamplingQuality.HIGH, 500_000, 0.99, 0.02),
            "exact": (SamplingQuality.EXACT, 0, 1.0, 0.0),
        }
        q, target, conf, margin = quality_map.get(quality, quality_map["standard"])
        return cls(
            target_rows=target,
            quality=q,
            confidence_level=conf,
            margin_of_error=margin,
        )
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
# =============================================================================
|
|
256
|
+
# Block Sampling Result
|
|
257
|
+
# =============================================================================
|
|
258
|
+
|
|
259
|
+
@dataclass(frozen=True)
class BlockSamplingMetrics(SamplingMetrics):
    """Sampling metrics extended with per-block bookkeeping."""

    blocks_processed: int = 0
    blocks_skipped: int = 0
    parallel_efficiency: float = 1.0
    memory_peak_mb: float = 0.0
    time_per_block_ms: float = 0.0

    def to_dict(self) -> dict[str, Any]:
        """Return the base metrics dict with the block fields added to it."""
        payload = super().to_dict()
        block_fields = (
            "blocks_processed",
            "blocks_skipped",
            "parallel_efficiency",
            "memory_peak_mb",
            "time_per_block_ms",
        )
        for key in block_fields:
            payload[key] = getattr(self, key)
        return payload
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
@dataclass
class ProgressiveResult:
    """Snapshot of a progressive sampling run after some refinement stages."""

    current_estimate: SamplingResult
    stages_completed: int
    total_stages: int
    converged: bool
    convergence_delta: float

    @property
    def is_final(self) -> bool:
        """Whether no further refinement will follow.

        True once every stage has run; otherwise the ``converged`` flag
        decides.
        """
        if self.stages_completed >= self.total_stages:
            return True
        return self.converged
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
# =============================================================================
|
|
295
|
+
# Memory Monitor
|
|
296
|
+
# =============================================================================
|
|
297
|
+
|
|
298
|
+
class MemoryMonitor:
    """Tracks process memory and tells callers when to GC or slow down."""

    def __init__(self, config: MemoryBudgetConfig):
        self.config = config
        self._lock = threading.Lock()
        # Last reading and highest reading seen so far, both in MB.
        self._current_mb: float = 0.0
        self._peak_mb: float = 0.0

    def get_current_mb(self) -> float:
        """Return this process's resident memory in MB (0.0 without psutil)."""
        try:
            import psutil
        except ImportError:
            return 0.0
        rss_bytes = psutil.Process(os.getpid()).memory_info().rss
        return rss_bytes / MB

    def update(self) -> None:
        """Take a fresh reading and raise the recorded peak if exceeded."""
        with self._lock:
            reading = self.get_current_mb()
            self._current_mb = reading
            if reading > self._peak_mb:
                self._peak_mb = reading

    def should_gc(self) -> bool:
        """Refresh the reading and report whether it is above the GC threshold."""
        self.update()
        limit = self.config.gc_threshold_mb
        return self._current_mb > limit

    def should_backpressure(self) -> bool:
        """Report whether usage exceeds the available budget.

        Always False (and no reading is taken) when backpressure is disabled.
        """
        if self.config.backpressure_enabled:
            self.update()
            return self._current_mb > self.config.available_memory_mb
        return False

    def trigger_gc(self) -> None:
        """Run a garbage collection pass, then refresh the reading."""
        gc.collect()
        self.update()

    @property
    def peak_mb(self) -> float:
        """Highest reading recorded so far, in MB."""
        return self._peak_mb

    @property
    def current_mb(self) -> float:
        """Most recent reading, in MB."""
        return self._current_mb
|
|
346
|
+
|
|
347
|
+
|
|
348
|
+
# =============================================================================
|
|
349
|
+
# Time Budget Manager
|
|
350
|
+
# =============================================================================
|
|
351
|
+
|
|
352
|
+
class TimeBudgetManager:
    """Tracks elapsed time against an optional wall-clock budget.

    A budget of zero or less means "no limit".
    """

    def __init__(self, budget_seconds: float):
        self._checkpoints: list[tuple[str, float]] = []
        self.budget_seconds = budget_seconds
        self.start_time = time.perf_counter()

    @property
    def _unlimited(self) -> bool:
        # Non-positive budgets disable every limit check below.
        return self.budget_seconds <= 0

    @property
    def elapsed_seconds(self) -> float:
        """Seconds since this manager was created."""
        now = time.perf_counter()
        return now - self.start_time

    @property
    def remaining_seconds(self) -> float:
        """Seconds left in the budget, floored at zero; infinite if unlimited."""
        if self._unlimited:
            return float("inf")
        left = self.budget_seconds - self.elapsed_seconds
        return max(0, left)

    @property
    def is_expired(self) -> bool:
        """True once elapsed time reaches the budget; never when unlimited."""
        if self._unlimited:
            return False
        return self.elapsed_seconds >= self.budget_seconds

    @property
    def budget_ratio_used(self) -> float:
        """Fraction of the budget consumed, capped at 1.0 (0.0 if unlimited)."""
        if self._unlimited:
            return 0.0
        fraction = self.elapsed_seconds / self.budget_seconds
        return min(1.0, fraction)

    def checkpoint(self, name: str) -> None:
        """Record ``name`` together with the elapsed time at this moment."""
        entry = (name, self.elapsed_seconds)
        self._checkpoints.append(entry)

    def can_process_block(self, estimated_block_time: float) -> bool:
        """Report whether another block fits, with a 1.5x safety margin."""
        if self._unlimited:
            return True
        needed = estimated_block_time * 1.5
        return self.remaining_seconds > needed
|
|
390
|
+
|
|
391
|
+
|
|
392
|
+
# =============================================================================
|
|
393
|
+
# Block-Based Sampling Strategy
|
|
394
|
+
# =============================================================================
|
|
395
|
+
|
|
396
|
+
class BlockSamplingStrategy(SamplingStrategy):
|
|
397
|
+
"""Block-based sampling for very large datasets.
|
|
398
|
+
|
|
399
|
+
Divides data into blocks and samples from each block proportionally.
|
|
400
|
+
This ensures memory-bounded processing and even coverage.
|
|
401
|
+
|
|
402
|
+
Algorithm:
|
|
403
|
+
1. Divide data into N blocks of fixed size
|
|
404
|
+
2. Calculate samples needed per block (proportional allocation)
|
|
405
|
+
3. Process blocks in parallel (respecting memory budget)
|
|
406
|
+
4. Merge samples from all blocks
|
|
407
|
+
"""
|
|
408
|
+
|
|
409
|
+
name = "block"
|
|
410
|
+
|
|
411
|
+
def __init__(
|
|
412
|
+
self,
|
|
413
|
+
config: EnterpriseScaleConfig | None = None,
|
|
414
|
+
):
|
|
415
|
+
self.config = config or EnterpriseScaleConfig()
|
|
416
|
+
self.memory_monitor = MemoryMonitor(self.config.memory_budget)
|
|
417
|
+
|
|
418
|
+
def sample(
|
|
419
|
+
self,
|
|
420
|
+
lf: pl.LazyFrame,
|
|
421
|
+
config: SamplingConfig,
|
|
422
|
+
total_rows: int | None = None,
|
|
423
|
+
) -> SamplingResult:
|
|
424
|
+
"""Block-based sampling."""
|
|
425
|
+
start_time = time.perf_counter()
|
|
426
|
+
time_budget = TimeBudgetManager(self.config.time_budget_seconds)
|
|
427
|
+
|
|
428
|
+
if total_rows is None:
|
|
429
|
+
total_rows = self.estimate_row_count(lf)
|
|
430
|
+
|
|
431
|
+
# Calculate target sample size
|
|
432
|
+
target_samples = min(
|
|
433
|
+
self.config.target_rows,
|
|
434
|
+
config.calculate_required_sample_size(total_rows),
|
|
435
|
+
)
|
|
436
|
+
|
|
437
|
+
if target_samples >= total_rows:
|
|
438
|
+
# No sampling needed
|
|
439
|
+
return self._create_full_scan_result(lf, total_rows, config, start_time)
|
|
440
|
+
|
|
441
|
+
# Calculate block parameters
|
|
442
|
+
block_size = self.config.get_block_size(total_rows)
|
|
443
|
+
num_blocks = math.ceil(total_rows / block_size)
|
|
444
|
+
samples_per_block = math.ceil(target_samples / num_blocks)
|
|
445
|
+
|
|
446
|
+
logger.debug(
|
|
447
|
+
f"Block sampling: {total_rows:,} rows → {num_blocks} blocks × "
|
|
448
|
+
f"{samples_per_block:,} samples/block"
|
|
449
|
+
)
|
|
450
|
+
|
|
451
|
+
# Process blocks
|
|
452
|
+
sampled_frames: list[pl.LazyFrame] = []
|
|
453
|
+
blocks_processed = 0
|
|
454
|
+
blocks_skipped = 0
|
|
455
|
+
|
|
456
|
+
seed = self.config.seed or random.randint(0, 2**32 - 1)
|
|
457
|
+
|
|
458
|
+
for block_idx in range(num_blocks):
|
|
459
|
+
# Check time budget
|
|
460
|
+
if time_budget.is_expired:
|
|
461
|
+
logger.warning(f"Time budget expired after {blocks_processed} blocks")
|
|
462
|
+
break
|
|
463
|
+
|
|
464
|
+
# Check memory
|
|
465
|
+
if self.memory_monitor.should_backpressure():
|
|
466
|
+
logger.warning("Memory pressure detected, triggering GC")
|
|
467
|
+
self.memory_monitor.trigger_gc()
|
|
468
|
+
|
|
469
|
+
# Calculate block range
|
|
470
|
+
block_start = block_idx * block_size
|
|
471
|
+
block_end = min(block_start + block_size, total_rows)
|
|
472
|
+
actual_block_size = block_end - block_start
|
|
473
|
+
|
|
474
|
+
# Sample from this block
|
|
475
|
+
block_samples = min(samples_per_block, actual_block_size)
|
|
476
|
+
sample_rate = block_samples / actual_block_size
|
|
477
|
+
|
|
478
|
+
# Use hash-based deterministic sampling for reproducibility
|
|
479
|
+
block_seed = seed + block_idx
|
|
480
|
+
threshold = int(sample_rate * 10000)
|
|
481
|
+
|
|
482
|
+
block_lf = (
|
|
483
|
+
lf.slice(block_start, actual_block_size)
|
|
484
|
+
.with_row_index("__block_idx")
|
|
485
|
+
.filter(pl.col("__block_idx").hash(block_seed) % 10000 < threshold)
|
|
486
|
+
.drop("__block_idx")
|
|
487
|
+
)
|
|
488
|
+
|
|
489
|
+
sampled_frames.append(block_lf)
|
|
490
|
+
blocks_processed += 1
|
|
491
|
+
|
|
492
|
+
# Merge all block samples
|
|
493
|
+
if sampled_frames:
|
|
494
|
+
merged_lf = pl.concat(sampled_frames)
|
|
495
|
+
else:
|
|
496
|
+
merged_lf = lf.head(0)
|
|
497
|
+
|
|
498
|
+
elapsed_ms = (time.perf_counter() - start_time) * 1000
|
|
499
|
+
|
|
500
|
+
return SamplingResult(
|
|
501
|
+
data=merged_lf,
|
|
502
|
+
metrics=BlockSamplingMetrics(
|
|
503
|
+
original_size=total_rows,
|
|
504
|
+
sample_size=target_samples,
|
|
505
|
+
sampling_ratio=target_samples / total_rows,
|
|
506
|
+
confidence_level=self.config.confidence_level,
|
|
507
|
+
margin_of_error=self.config.margin_of_error,
|
|
508
|
+
strategy_used="block",
|
|
509
|
+
sampling_time_ms=elapsed_ms,
|
|
510
|
+
memory_saved_estimate_mb=(total_rows - target_samples) * BYTES_PER_ROW_ESTIMATE / MB,
|
|
511
|
+
blocks_processed=blocks_processed,
|
|
512
|
+
blocks_skipped=blocks_skipped,
|
|
513
|
+
time_per_block_ms=elapsed_ms / max(1, blocks_processed),
|
|
514
|
+
memory_peak_mb=self.memory_monitor.peak_mb,
|
|
515
|
+
),
|
|
516
|
+
is_sampled=True,
|
|
517
|
+
)
|
|
518
|
+
|
|
519
|
+
def _create_full_scan_result(
    self,
    lf: pl.LazyFrame,
    total_rows: int,
    config: SamplingConfig,
    start_time: float,
) -> SamplingResult:
    """Wrap the untouched frame in a result that describes a full scan.

    Used when no reduction is needed: the frame is returned as-is and the
    metrics state that every row was kept.

    Args:
        lf: Frame to hand back unchanged.
        total_rows: Row count, reported as both original and sample size.
        config: Accepted for signature parity; not read by this method.
        start_time: ``time.perf_counter()`` reading taken when sampling began.

    Returns:
        A ``SamplingResult`` with ``is_sampled=False``.
    """
    duration_ms = 1000 * (time.perf_counter() - start_time)

    # A full scan keeps every row, so ratio/confidence are exact and the
    # margin of error is zero.
    full_scan_metrics = BlockSamplingMetrics(
        original_size=total_rows,
        sample_size=total_rows,
        sampling_ratio=1.0,
        confidence_level=1.0,
        margin_of_error=0.0,
        strategy_used="block(full_scan)",
        sampling_time_ms=duration_ms,
    )
    return SamplingResult(
        data=lf,
        metrics=full_scan_metrics,
        is_sampled=False,
    )
|
|
541
|
+
|
|
542
|
+
|
|
543
|
+
# =============================================================================
|
|
544
|
+
# Multi-Stage Hierarchical Sampling
|
|
545
|
+
# =============================================================================
|
|
546
|
+
|
|
547
|
+
class MultiStageSamplingStrategy(SamplingStrategy):
    """Multi-stage hierarchical sampling for very large datasets.

    The reduction from ``total_rows`` down to the target sample size is
    split into ``num_stages`` successive filters, each shrinking the data
    by the same geometric factor:

    1. Stage 1: coarse reduction of the full frame.
    2. Intermediate stages: further reduction of the previous stage.
    3. Final stage: pinned to the target sample size.

    Every stage is a lazy, hash-based row filter; nothing is collected
    here, so the number of rows eventually materialised is approximate.
    """

    name = "multi_stage"

    def __init__(
        self,
        config: EnterpriseScaleConfig | None = None,
        num_stages: int = 3,
    ):
        """Store the configuration and the number of reduction stages.

        Args:
            config: Enterprise-scale settings; a default
                ``EnterpriseScaleConfig`` is created when omitted.
            num_stages: Number of successive filters. Must be at least 1
                whenever ``sample`` actually has to reduce the data.
        """
        self.config = config or EnterpriseScaleConfig()
        self.num_stages = num_stages

    def sample(
        self,
        lf: pl.LazyFrame,
        config: SamplingConfig,
        total_rows: int | None = None,
    ) -> SamplingResult:
        """Reduce ``lf`` to roughly the target size through chained filters.

        Args:
            lf: Source frame.
            config: Supplies ``calculate_required_sample_size``.
            total_rows: Known row count; estimated via
                ``estimate_row_count`` when ``None``.

        Returns:
            A ``SamplingResult`` holding the lazily filtered frame. The
            metrics report the *target* sample size, not a measured count.

        Raises:
            ValueError: If a reduction is required and ``num_stages`` is
                smaller than 1.
        """
        start_time = time.perf_counter()

        if total_rows is None:
            total_rows = self.estimate_row_count(lf)

        target_samples = min(
            self.config.target_rows,
            config.calculate_required_sample_size(total_rows),
        )

        # Nothing to reduce: hand the frame back untouched.
        if target_samples >= total_rows:
            return self._create_full_result(lf, total_rows, config, start_time)

        # Fail with a clear message instead of a ZeroDivisionError /
        # IndexError further down when no stage can be planned.
        if self.num_stages < 1:
            raise ValueError(
                f"num_stages must be >= 1, got {self.num_stages}"
            )

        # Each stage divides the row count by the same factor so that the
        # product over all stages equals total_rows / target_samples.
        reduction_factor = (total_rows / target_samples) ** (1 / self.num_stages)
        stage_sizes = []
        current_size = total_rows

        for _ in range(self.num_stages):
            current_size = int(current_size / reduction_factor)
            # Integer truncation must never push a stage below the target.
            stage_sizes.append(max(current_size, target_samples))

        # Ensure the final stage hits the target exactly.
        stage_sizes[-1] = target_samples

        logger.debug(f"Multi-stage sampling: stages={stage_sizes}")

        # Only fall back to 42 when no seed was configured; an explicit
        # seed of 0 is a valid seed and must be honoured.
        configured_seed = self.config.seed
        base_seed = 42 if configured_seed is None else configured_seed

        current_lf = lf
        current_rows = total_rows

        for stage_idx, stage_target in enumerate(stage_sizes):
            # Fraction of the previous stage's rows to keep.
            sample_rate = stage_target / current_rows
            # A distinct seed per stage so stages do not select the same rows.
            seed = base_seed + stage_idx

            # Rows are bucketed into 10000 hash buckets; keep at least one
            # bucket so a tiny rate never filters out everything.
            threshold = max(1, int(sample_rate * 10000))
            current_lf = (
                current_lf.with_row_index("__stage_idx")
                .filter(pl.col("__stage_idx").hash(seed) % 10000 < threshold)
                .drop("__stage_idx")
            )
            current_rows = stage_target

        elapsed_ms = (time.perf_counter() - start_time) * 1000

        return SamplingResult(
            data=current_lf,
            metrics=SamplingMetrics(
                original_size=total_rows,
                sample_size=target_samples,
                sampling_ratio=target_samples / total_rows,
                confidence_level=self.config.confidence_level,
                margin_of_error=self.config.margin_of_error,
                strategy_used=f"multi_stage({self.num_stages})",
                sampling_time_ms=elapsed_ms,
                memory_saved_estimate_mb=(total_rows - target_samples) * BYTES_PER_ROW_ESTIMATE / MB,
            ),
            is_sampled=True,
        )

    def _create_full_result(
        self,
        lf: pl.LazyFrame,
        total_rows: int,
        config: SamplingConfig,
        start_time: float,
    ) -> SamplingResult:
        """Build the result returned when the whole frame is kept.

        Args:
            lf: Frame to return unchanged.
            total_rows: Row count, reported as both original and sample size.
            config: Accepted for signature parity; not read here.
            start_time: ``time.perf_counter()`` reading from the start of
                ``sample``.

        Returns:
            A ``SamplingResult`` with ``is_sampled=False``.
        """
        elapsed_ms = (time.perf_counter() - start_time) * 1000
        return SamplingResult(
            data=lf,
            metrics=SamplingMetrics(
                original_size=total_rows,
                sample_size=total_rows,
                sampling_ratio=1.0,
                confidence_level=1.0,
                margin_of_error=0.0,
                strategy_used="multi_stage(full_scan)",
                sampling_time_ms=elapsed_ms,
            ),
            is_sampled=False,
        )
|
|
659
|
+
|
|
660
|
+
|
|
661
|
+
# =============================================================================
|
|
662
|
+
# Column-Aware Sampling Strategy
|
|
663
|
+
# =============================================================================
|
|
664
|
+
|
|
665
|
+
class ColumnAwareSamplingStrategy(SamplingStrategy):
    """Sampling whose sample size is scaled by the frame's column types.

    Each column's dtype is mapped to a complexity weight (nested types
    highest, then strings, plain/numeric types, and categoricals lowest).
    The statistically required sample size is multiplied by the average
    weight, then capped by the configured target and the row count. One
    hash-based row filter is applied to the whole frame; rows are not
    sampled differently per column.
    """

    name = "column_aware"

    def __init__(
        self,
        config: EnterpriseScaleConfig | None = None,
    ):
        """Store the configuration.

        Args:
            config: Enterprise-scale settings; a default
                ``EnterpriseScaleConfig`` is created when omitted.
        """
        self.config = config or EnterpriseScaleConfig()

    def sample(
        self,
        lf: pl.LazyFrame,
        config: SamplingConfig,
        total_rows: int | None = None,
    ) -> SamplingResult:
        """Sample ``lf`` with a size adjusted for column complexity.

        Args:
            lf: Source frame.
            config: Supplies ``calculate_required_sample_size``.
            total_rows: Known row count; estimated via
                ``estimate_row_count`` when ``None``.

        Returns:
            A ``SamplingResult``. When the adjusted target covers every
            row the original frame is returned with ``is_sampled=False``;
            otherwise a lazily filtered frame is returned and the metrics
            report the *target* size rather than a measured count.
        """
        start_time = time.perf_counter()

        if total_rows is None:
            total_rows = self.estimate_row_count(lf)

        # Derive per-column complexity weights from the schema.
        schema = lf.collect_schema()
        column_info = self._analyze_columns(schema)

        # Scale the statistical sample size by the average complexity.
        base_sample_size = config.calculate_required_sample_size(total_rows)
        adjusted_sample_size = self._adjust_for_columns(base_sample_size, column_info)

        target_samples = min(
            adjusted_sample_size,
            self.config.target_rows,
            total_rows,
        )

        if target_samples >= total_rows:
            elapsed_ms = (time.perf_counter() - start_time) * 1000
            return SamplingResult(
                data=lf,
                metrics=SamplingMetrics(
                    original_size=total_rows,
                    sample_size=total_rows,
                    sampling_ratio=1.0,
                    confidence_level=1.0,
                    margin_of_error=0.0,
                    strategy_used="column_aware(full)",
                    sampling_time_ms=elapsed_ms,
                ),
                is_sampled=False,
            )

        sample_rate = target_samples / total_rows

        # Draw a random seed only when none is configured; an explicit
        # seed of 0 is valid and must stay reproducible.
        seed = self.config.seed
        if seed is None:
            seed = random.randint(0, 2**32 - 1)

        # Rows fall into 10000 hash buckets; keep at least one bucket.
        threshold = max(1, int(sample_rate * 10000))

        sampled_lf = (
            lf.with_row_index("__col_aware_idx")
            .filter(pl.col("__col_aware_idx").hash(seed) % 10000 < threshold)
            .drop("__col_aware_idx")
        )

        elapsed_ms = (time.perf_counter() - start_time) * 1000

        return SamplingResult(
            data=sampled_lf,
            metrics=SamplingMetrics(
                original_size=total_rows,
                sample_size=target_samples,
                sampling_ratio=target_samples / total_rows,
                confidence_level=self.config.confidence_level,
                margin_of_error=self.config.margin_of_error,
                strategy_used="column_aware",
                sampling_time_ms=elapsed_ms,
                memory_saved_estimate_mb=(total_rows - target_samples) * BYTES_PER_ROW_ESTIMATE / MB,
            ),
            is_sampled=True,
        )

    def _analyze_columns(self, schema: dict) -> dict[str, dict]:
        """Describe every column by substring checks on its dtype's text.

        Args:
            schema: Mapping of column name to dtype.

        Returns:
            Mapping of column name to a dict with the keys ``type``,
            ``is_numeric``, ``is_string``, ``is_categorical`` and
            ``complexity``.
        """
        column_info = {}
        for col_name, col_type in schema.items():
            type_str = str(col_type)
            column_info[col_name] = {
                "type": type_str,
                "is_numeric": "Int" in type_str or "Float" in type_str,
                "is_string": "String" in type_str or "Utf8" in type_str,
                "is_categorical": "Categorical" in type_str or "Enum" in type_str,
                "complexity": self._estimate_complexity(type_str),
            }
        return column_info

    def _estimate_complexity(self, type_str: str) -> float:
        """Return the sample-size weight for a dtype's string form.

        Nested types are tested first because their text also contains
        the inner dtype (for example ``List(String)``); testing the inner
        names first would misclassify them as plain strings/categoricals.

        Args:
            type_str: ``str()`` of a column dtype.

        Returns:
            3.0 for List/Struct, 2.0 for strings, 0.5 for
            Categorical/Enum, and 1.0 for everything else.
        """
        if "List" in type_str or "Struct" in type_str:
            return 3.0  # Complex types need larger samples
        if "String" in type_str or "Utf8" in type_str:
            return 2.0  # Strings typically need larger samples
        if "Categorical" in type_str or "Enum" in type_str:
            return 0.5  # Categoricals can use smaller samples
        return 1.0  # Default for numeric and other types

    def _adjust_for_columns(
        self,
        base_size: int,
        column_info: dict[str, dict],
    ) -> int:
        """Scale ``base_size`` by the mean column complexity.

        Args:
            base_size: Statistically required sample size.
            column_info: Output of ``_analyze_columns``.

        Returns:
            ``base_size`` unchanged when there are no columns; otherwise
            the scaled size, never below a tenth of the configured
            ``target_rows``.
        """
        if not column_info:
            return base_size

        complexities = [info["complexity"] for info in column_info.values()]
        avg_complexity = sum(complexities) / len(complexities)

        adjusted = int(base_size * avg_complexity)
        # Floor at 10% of the target so very simple schemas still get a
        # reasonably sized sample.
        return max(self.config.target_rows // 10, adjusted)
|
|
796
|
+
|
|
797
|
+
|
|
798
|
+
# =============================================================================
|
|
799
|
+
# Progressive Sampling Strategy
|
|
800
|
+
# =============================================================================
|
|
801
|
+
|
|
802
|
+
class ProgressiveSamplingStrategy(SamplingStrategy):
    """Sampling strategy registered under the name ``progressive``.

    NOTE(review): the implementation below performs a single hash-based
    filter at the final target rate. ``convergence_threshold`` is stored
    but never read, and ``max_stages`` is only used to label the result;
    no staged refinement or convergence check is carried out here.
    """

    name = "progressive"

    def __init__(
        self,
        config: EnterpriseScaleConfig | None = None,
        convergence_threshold: float = 0.01,
        max_stages: int = 5,
    ):
        """Store the configuration and the progressive-sampling settings.

        Args:
            config: Enterprise-scale settings; a default
                ``EnterpriseScaleConfig`` is created when omitted.
            convergence_threshold: Stored on the instance; not consulted
                by ``sample``.
            max_stages: Stored on the instance; appears in the
                ``strategy_used`` label of sampled results.
        """
        self.config = config or EnterpriseScaleConfig()
        self.convergence_threshold = convergence_threshold
        self.max_stages = max_stages

    def sample(
        self,
        lf: pl.LazyFrame,
        config: SamplingConfig,
        total_rows: int | None = None,
    ) -> SamplingResult:
        """Filter ``lf`` down to roughly the target sample size.

        Args:
            lf: Source frame.
            config: Supplies ``calculate_required_sample_size``.
            total_rows: Known row count; estimated via
                ``estimate_row_count`` when ``None``.

        Returns:
            A ``SamplingResult``. When the target covers every row the
            original frame is returned with ``is_sampled=False``;
            otherwise a lazily filtered frame is returned and the metrics
            report the *target* size rather than a measured count.
        """
        start_time = time.perf_counter()

        if total_rows is None:
            total_rows = self.estimate_row_count(lf)

        target_samples = min(
            self.config.target_rows,
            config.calculate_required_sample_size(total_rows),
        )

        if target_samples >= total_rows:
            elapsed_ms = (time.perf_counter() - start_time) * 1000
            return SamplingResult(
                data=lf,
                metrics=SamplingMetrics(
                    original_size=total_rows,
                    sample_size=total_rows,
                    sampling_ratio=1.0,
                    confidence_level=1.0,
                    margin_of_error=0.0,
                    strategy_used="progressive(full)",
                    sampling_time_ms=elapsed_ms,
                ),
                is_sampled=False,
            )

        # Draw a random seed only when none is configured; an explicit
        # seed of 0 is valid and must stay reproducible.
        seed = self.config.seed
        if seed is None:
            seed = random.randint(0, 2**32 - 1)

        # Rows fall into 10000 hash buckets; keep at least one bucket so a
        # very small rate never filters out everything.
        final_threshold = max(1, int((target_samples / total_rows) * 10000))

        sampled_lf = (
            lf.with_row_index("__prog_idx")
            .filter(pl.col("__prog_idx").hash(seed) % 10000 < final_threshold)
            .drop("__prog_idx")
        )

        elapsed_ms = (time.perf_counter() - start_time) * 1000

        return SamplingResult(
            data=sampled_lf,
            metrics=SamplingMetrics(
                original_size=total_rows,
                sample_size=target_samples,
                sampling_ratio=target_samples / total_rows,
                confidence_level=self.config.confidence_level,
                margin_of_error=self.config.margin_of_error,
                strategy_used=f"progressive({self.max_stages})",
                sampling_time_ms=elapsed_ms,
                memory_saved_estimate_mb=(total_rows - target_samples) * BYTES_PER_ROW_ESTIMATE / MB,
            ),
            is_sampled=True,
        )
|
|
893
|
+
|
|
894
|
+
|
|
895
|
+
# =============================================================================
|
|
896
|
+
# Enterprise Scale Sampler
|
|
897
|
+
# =============================================================================
|
|
898
|
+
|
|
899
|
+
class EnterpriseScaleSampler:
    """Front door for enterprise-scale sampling.

    Holds one instance of each registered strategy and either runs the
    one requested by name or picks one from the dataset's scale category
    (as classified by the configuration from the row count).

    Example:
        config = EnterpriseScaleConfig(
            target_rows=100_000,
            memory_budget=MemoryBudgetConfig(max_memory_mb=1024),
            time_budget_seconds=60,
            quality=SamplingQuality.STANDARD,
        )
        sampler = EnterpriseScaleSampler(config)
        result = sampler.sample(lf)

        print(f"Sampled {result.metrics.sample_size:,} rows")
        print(f"Strategy: {result.metrics.strategy_used}")
    """

    def __init__(
        self,
        config: EnterpriseScaleConfig | None = None,
    ):
        """Build the strategy registry around a shared configuration.

        Args:
            config: Enterprise-scale settings; a default
                ``EnterpriseScaleConfig`` is created when omitted.
        """
        self.config = config or EnterpriseScaleConfig()
        shared = self.config
        # Every registered strategy receives the same configuration object.
        self._strategies = {
            "block": BlockSamplingStrategy(shared),
            "multi_stage": MultiStageSamplingStrategy(shared),
            "column_aware": ColumnAwareSamplingStrategy(shared),
            "progressive": ProgressiveSamplingStrategy(shared),
        }

    def sample(
        self,
        lf: pl.LazyFrame,
        strategy: str | None = None,
    ) -> SamplingResult:
        """Sample ``lf`` with a named or automatically chosen strategy.

        Args:
            lf: Source LazyFrame.
            strategy: Registry key of the strategy to run. A falsy value
                (``None`` or ``""``) triggers automatic selection.

        Returns:
            The ``SamplingResult`` produced by the chosen strategy.

        Raises:
            ValueError: If ``strategy`` is given but not registered.
        """
        # The row count is collected up front; it drives the scale
        # classification and is passed on to the strategy.
        row_total = lf.select(pl.len()).collect().item()
        scale = self.config.classify_scale(row_total)

        settings = self.config
        base_config = SamplingConfig(
            strategy=SamplingMethod.ADAPTIVE,
            max_rows=settings.target_rows,
            confidence_level=settings.confidence_level,
            margin_of_error=settings.margin_of_error,
            seed=settings.seed,
        )

        if not strategy:
            chosen = self._select_strategy(scale)
        else:
            chosen = self._strategies.get(strategy)
            if not chosen:
                raise ValueError(f"Unknown strategy: {strategy}")

        message = (
            f"Enterprise sampling: {row_total:,} rows ({scale.name}) → "
            f"strategy={chosen.name}"
        )
        logger.info(message)

        return chosen.sample(lf, base_config, row_total)

    def _select_strategy(self, scale: ScaleCategory) -> SamplingStrategy:
        """Map a scale category to the strategy that should handle it.

        SMALL and MEDIUM use ``column_aware``, LARGE uses ``block``,
        XLARGE uses ``multi_stage``; any other category gets a fresh
        five-stage multi-stage strategy.
        """
        registry = self._strategies
        if scale in (ScaleCategory.SMALL, ScaleCategory.MEDIUM):
            return registry["column_aware"]
        if scale == ScaleCategory.LARGE:
            return registry["block"]
        if scale == ScaleCategory.XLARGE:
            return registry["multi_stage"]
        # XXLARGE: Use multi-stage with more stages
        return MultiStageSamplingStrategy(self.config, num_stages=5)

    def list_strategies(self) -> list[str]:
        """Return the registry keys of all available strategies."""
        return list(self._strategies)
|
|
991
|
+
|
|
992
|
+
|
|
993
|
+
# =============================================================================
|
|
994
|
+
# Convenience Functions
|
|
995
|
+
# =============================================================================
|
|
996
|
+
|
|
997
|
+
def sample_large_dataset(
    lf: pl.LazyFrame,
    target_rows: int = 100_000,
    quality: str = "standard",
    time_budget_seconds: float = 0.0,
) -> SamplingResult:
    """Sample a large LazyFrame in one call.

    A preset is looked up with ``EnterpriseScaleConfig.for_quality`` and a
    new configuration is built from its memory budget, quality,
    confidence level and margin of error, combined with the
    ``target_rows`` and ``time_budget_seconds`` given here.

    Args:
        lf: LazyFrame to sample.
        target_rows: Target number of rows.
        quality: Quality level ("sketch", "quick", "standard", "high").
        time_budget_seconds: Max time for sampling.

    Returns:
        SamplingResult with sampled data.

    Example:
        result = sample_large_dataset(lf, target_rows=50_000, quality="high")
        sampled_df = result.data.collect()
    """
    preset = EnterpriseScaleConfig.for_quality(quality)

    # Only these four preset fields are carried over; everything else
    # comes from the arguments or the constructor defaults.
    effective_config = EnterpriseScaleConfig(
        target_rows=target_rows,
        memory_budget=preset.memory_budget,
        time_budget_seconds=time_budget_seconds,
        quality=preset.quality,
        confidence_level=preset.confidence_level,
        margin_of_error=preset.margin_of_error,
    )
    return EnterpriseScaleSampler(effective_config).sample(lf)
|
|
1029
|
+
|
|
1030
|
+
|
|
1031
|
+
def estimate_optimal_sample_size(
    total_rows: int,
    confidence_level: float = 0.95,
    margin_of_error: float = 0.05,
    max_rows: int = 1_000_000,
) -> int:
    """Recommend a sample size for the requested statistical accuracy.

    Args:
        total_rows: Total population size.
        confidence_level: Desired confidence (0.90, 0.95, 0.99).
        margin_of_error: Acceptable error margin.
        max_rows: Maximum sample size cap.

    Returns:
        The size computed by ``SamplingConfig.calculate_required_sample_size``,
        limited to at most ``max_rows`` and at most ``total_rows``.
    """
    sizing = SamplingConfig(
        confidence_level=confidence_level,
        margin_of_error=margin_of_error,
    )
    needed = sizing.calculate_required_sample_size(total_rows)

    # Never recommend more rows than the cap or than actually exist.
    return min(needed, max_rows, total_rows)
|
|
1054
|
+
|
|
1055
|
+
|
|
1056
|
+
def classify_dataset_scale(total_rows: int) -> ScaleCategory:
    """Classify a dataset by its row count.

    Thin wrapper that forwards to ``EnterpriseScaleConfig.classify_scale``,
    invoked on the class itself rather than on an instance.

    Args:
        total_rows: Number of rows.

    Returns:
        ScaleCategory enum value.
    """
    category = EnterpriseScaleConfig.classify_scale(total_rows)
    return category
|