truthound 1.0.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- truthound/__init__.py +162 -0
- truthound/adapters.py +100 -0
- truthound/api.py +365 -0
- truthound/audit/__init__.py +248 -0
- truthound/audit/core.py +967 -0
- truthound/audit/filters.py +620 -0
- truthound/audit/formatters.py +707 -0
- truthound/audit/logger.py +902 -0
- truthound/audit/middleware.py +571 -0
- truthound/audit/storage.py +1083 -0
- truthound/benchmark/__init__.py +123 -0
- truthound/benchmark/base.py +757 -0
- truthound/benchmark/comparison.py +635 -0
- truthound/benchmark/generators.py +706 -0
- truthound/benchmark/reporters.py +718 -0
- truthound/benchmark/runner.py +635 -0
- truthound/benchmark/scenarios.py +712 -0
- truthound/cache.py +252 -0
- truthound/checkpoint/__init__.py +136 -0
- truthound/checkpoint/actions/__init__.py +164 -0
- truthound/checkpoint/actions/base.py +324 -0
- truthound/checkpoint/actions/custom.py +234 -0
- truthound/checkpoint/actions/discord_notify.py +290 -0
- truthound/checkpoint/actions/email_notify.py +405 -0
- truthound/checkpoint/actions/github_action.py +406 -0
- truthound/checkpoint/actions/opsgenie.py +1499 -0
- truthound/checkpoint/actions/pagerduty.py +226 -0
- truthound/checkpoint/actions/slack_notify.py +233 -0
- truthound/checkpoint/actions/store_result.py +249 -0
- truthound/checkpoint/actions/teams_notify.py +1570 -0
- truthound/checkpoint/actions/telegram_notify.py +419 -0
- truthound/checkpoint/actions/update_docs.py +552 -0
- truthound/checkpoint/actions/webhook.py +293 -0
- truthound/checkpoint/analytics/__init__.py +147 -0
- truthound/checkpoint/analytics/aggregations/__init__.py +23 -0
- truthound/checkpoint/analytics/aggregations/rollup.py +481 -0
- truthound/checkpoint/analytics/aggregations/time_bucket.py +306 -0
- truthound/checkpoint/analytics/analyzers/__init__.py +17 -0
- truthound/checkpoint/analytics/analyzers/anomaly.py +386 -0
- truthound/checkpoint/analytics/analyzers/base.py +270 -0
- truthound/checkpoint/analytics/analyzers/forecast.py +421 -0
- truthound/checkpoint/analytics/analyzers/trend.py +314 -0
- truthound/checkpoint/analytics/models.py +292 -0
- truthound/checkpoint/analytics/protocols.py +549 -0
- truthound/checkpoint/analytics/service.py +718 -0
- truthound/checkpoint/analytics/stores/__init__.py +16 -0
- truthound/checkpoint/analytics/stores/base.py +306 -0
- truthound/checkpoint/analytics/stores/memory_store.py +353 -0
- truthound/checkpoint/analytics/stores/sqlite_store.py +557 -0
- truthound/checkpoint/analytics/stores/timescale_store.py +501 -0
- truthound/checkpoint/async_actions.py +794 -0
- truthound/checkpoint/async_base.py +708 -0
- truthound/checkpoint/async_checkpoint.py +617 -0
- truthound/checkpoint/async_runner.py +639 -0
- truthound/checkpoint/checkpoint.py +527 -0
- truthound/checkpoint/ci/__init__.py +61 -0
- truthound/checkpoint/ci/detector.py +355 -0
- truthound/checkpoint/ci/reporter.py +436 -0
- truthound/checkpoint/ci/templates.py +454 -0
- truthound/checkpoint/circuitbreaker/__init__.py +133 -0
- truthound/checkpoint/circuitbreaker/breaker.py +542 -0
- truthound/checkpoint/circuitbreaker/core.py +252 -0
- truthound/checkpoint/circuitbreaker/detection.py +459 -0
- truthound/checkpoint/circuitbreaker/middleware.py +389 -0
- truthound/checkpoint/circuitbreaker/registry.py +357 -0
- truthound/checkpoint/distributed/__init__.py +139 -0
- truthound/checkpoint/distributed/backends/__init__.py +35 -0
- truthound/checkpoint/distributed/backends/celery_backend.py +503 -0
- truthound/checkpoint/distributed/backends/kubernetes_backend.py +696 -0
- truthound/checkpoint/distributed/backends/local_backend.py +397 -0
- truthound/checkpoint/distributed/backends/ray_backend.py +625 -0
- truthound/checkpoint/distributed/base.py +774 -0
- truthound/checkpoint/distributed/orchestrator.py +765 -0
- truthound/checkpoint/distributed/protocols.py +842 -0
- truthound/checkpoint/distributed/registry.py +449 -0
- truthound/checkpoint/idempotency/__init__.py +120 -0
- truthound/checkpoint/idempotency/core.py +295 -0
- truthound/checkpoint/idempotency/fingerprint.py +454 -0
- truthound/checkpoint/idempotency/locking.py +604 -0
- truthound/checkpoint/idempotency/service.py +592 -0
- truthound/checkpoint/idempotency/stores.py +653 -0
- truthound/checkpoint/monitoring/__init__.py +134 -0
- truthound/checkpoint/monitoring/aggregators/__init__.py +15 -0
- truthound/checkpoint/monitoring/aggregators/base.py +372 -0
- truthound/checkpoint/monitoring/aggregators/realtime.py +300 -0
- truthound/checkpoint/monitoring/aggregators/window.py +493 -0
- truthound/checkpoint/monitoring/collectors/__init__.py +17 -0
- truthound/checkpoint/monitoring/collectors/base.py +257 -0
- truthound/checkpoint/monitoring/collectors/memory_collector.py +617 -0
- truthound/checkpoint/monitoring/collectors/prometheus_collector.py +451 -0
- truthound/checkpoint/monitoring/collectors/redis_collector.py +518 -0
- truthound/checkpoint/monitoring/events.py +410 -0
- truthound/checkpoint/monitoring/protocols.py +636 -0
- truthound/checkpoint/monitoring/service.py +578 -0
- truthound/checkpoint/monitoring/views/__init__.py +17 -0
- truthound/checkpoint/monitoring/views/base.py +172 -0
- truthound/checkpoint/monitoring/views/queue_view.py +220 -0
- truthound/checkpoint/monitoring/views/task_view.py +240 -0
- truthound/checkpoint/monitoring/views/worker_view.py +263 -0
- truthound/checkpoint/registry.py +337 -0
- truthound/checkpoint/runner.py +356 -0
- truthound/checkpoint/transaction/__init__.py +133 -0
- truthound/checkpoint/transaction/base.py +389 -0
- truthound/checkpoint/transaction/compensatable.py +537 -0
- truthound/checkpoint/transaction/coordinator.py +576 -0
- truthound/checkpoint/transaction/executor.py +622 -0
- truthound/checkpoint/transaction/idempotency.py +534 -0
- truthound/checkpoint/transaction/saga/__init__.py +143 -0
- truthound/checkpoint/transaction/saga/builder.py +584 -0
- truthound/checkpoint/transaction/saga/definition.py +515 -0
- truthound/checkpoint/transaction/saga/event_store.py +542 -0
- truthound/checkpoint/transaction/saga/patterns.py +833 -0
- truthound/checkpoint/transaction/saga/runner.py +718 -0
- truthound/checkpoint/transaction/saga/state_machine.py +793 -0
- truthound/checkpoint/transaction/saga/strategies.py +780 -0
- truthound/checkpoint/transaction/saga/testing.py +886 -0
- truthound/checkpoint/triggers/__init__.py +58 -0
- truthound/checkpoint/triggers/base.py +237 -0
- truthound/checkpoint/triggers/event.py +385 -0
- truthound/checkpoint/triggers/schedule.py +355 -0
- truthound/cli.py +2358 -0
- truthound/cli_modules/__init__.py +124 -0
- truthound/cli_modules/advanced/__init__.py +45 -0
- truthound/cli_modules/advanced/benchmark.py +343 -0
- truthound/cli_modules/advanced/docs.py +225 -0
- truthound/cli_modules/advanced/lineage.py +209 -0
- truthound/cli_modules/advanced/ml.py +320 -0
- truthound/cli_modules/advanced/realtime.py +196 -0
- truthound/cli_modules/checkpoint/__init__.py +46 -0
- truthound/cli_modules/checkpoint/init.py +114 -0
- truthound/cli_modules/checkpoint/list.py +71 -0
- truthound/cli_modules/checkpoint/run.py +159 -0
- truthound/cli_modules/checkpoint/validate.py +67 -0
- truthound/cli_modules/common/__init__.py +71 -0
- truthound/cli_modules/common/errors.py +414 -0
- truthound/cli_modules/common/options.py +419 -0
- truthound/cli_modules/common/output.py +507 -0
- truthound/cli_modules/common/protocol.py +552 -0
- truthound/cli_modules/core/__init__.py +48 -0
- truthound/cli_modules/core/check.py +123 -0
- truthound/cli_modules/core/compare.py +104 -0
- truthound/cli_modules/core/learn.py +57 -0
- truthound/cli_modules/core/mask.py +77 -0
- truthound/cli_modules/core/profile.py +65 -0
- truthound/cli_modules/core/scan.py +61 -0
- truthound/cli_modules/profiler/__init__.py +51 -0
- truthound/cli_modules/profiler/auto_profile.py +175 -0
- truthound/cli_modules/profiler/metadata.py +107 -0
- truthound/cli_modules/profiler/suite.py +283 -0
- truthound/cli_modules/registry.py +431 -0
- truthound/cli_modules/scaffolding/__init__.py +89 -0
- truthound/cli_modules/scaffolding/base.py +631 -0
- truthound/cli_modules/scaffolding/commands.py +545 -0
- truthound/cli_modules/scaffolding/plugins.py +1072 -0
- truthound/cli_modules/scaffolding/reporters.py +594 -0
- truthound/cli_modules/scaffolding/validators.py +1127 -0
- truthound/common/__init__.py +18 -0
- truthound/common/resilience/__init__.py +130 -0
- truthound/common/resilience/bulkhead.py +266 -0
- truthound/common/resilience/circuit_breaker.py +516 -0
- truthound/common/resilience/composite.py +332 -0
- truthound/common/resilience/config.py +292 -0
- truthound/common/resilience/protocols.py +217 -0
- truthound/common/resilience/rate_limiter.py +404 -0
- truthound/common/resilience/retry.py +341 -0
- truthound/datadocs/__init__.py +260 -0
- truthound/datadocs/base.py +571 -0
- truthound/datadocs/builder.py +761 -0
- truthound/datadocs/charts.py +764 -0
- truthound/datadocs/dashboard/__init__.py +63 -0
- truthound/datadocs/dashboard/app.py +576 -0
- truthound/datadocs/dashboard/components.py +584 -0
- truthound/datadocs/dashboard/state.py +240 -0
- truthound/datadocs/engine/__init__.py +46 -0
- truthound/datadocs/engine/context.py +376 -0
- truthound/datadocs/engine/pipeline.py +618 -0
- truthound/datadocs/engine/registry.py +469 -0
- truthound/datadocs/exporters/__init__.py +49 -0
- truthound/datadocs/exporters/base.py +198 -0
- truthound/datadocs/exporters/html.py +178 -0
- truthound/datadocs/exporters/json_exporter.py +253 -0
- truthound/datadocs/exporters/markdown.py +284 -0
- truthound/datadocs/exporters/pdf.py +392 -0
- truthound/datadocs/i18n/__init__.py +86 -0
- truthound/datadocs/i18n/catalog.py +960 -0
- truthound/datadocs/i18n/formatting.py +505 -0
- truthound/datadocs/i18n/loader.py +256 -0
- truthound/datadocs/i18n/plurals.py +378 -0
- truthound/datadocs/renderers/__init__.py +42 -0
- truthound/datadocs/renderers/base.py +401 -0
- truthound/datadocs/renderers/custom.py +342 -0
- truthound/datadocs/renderers/jinja.py +697 -0
- truthound/datadocs/sections.py +736 -0
- truthound/datadocs/styles.py +931 -0
- truthound/datadocs/themes/__init__.py +101 -0
- truthound/datadocs/themes/base.py +336 -0
- truthound/datadocs/themes/default.py +417 -0
- truthound/datadocs/themes/enterprise.py +419 -0
- truthound/datadocs/themes/loader.py +336 -0
- truthound/datadocs/themes.py +301 -0
- truthound/datadocs/transformers/__init__.py +57 -0
- truthound/datadocs/transformers/base.py +268 -0
- truthound/datadocs/transformers/enrichers.py +544 -0
- truthound/datadocs/transformers/filters.py +447 -0
- truthound/datadocs/transformers/i18n.py +468 -0
- truthound/datadocs/versioning/__init__.py +62 -0
- truthound/datadocs/versioning/diff.py +639 -0
- truthound/datadocs/versioning/storage.py +497 -0
- truthound/datadocs/versioning/version.py +358 -0
- truthound/datasources/__init__.py +223 -0
- truthound/datasources/_async_protocols.py +222 -0
- truthound/datasources/_protocols.py +159 -0
- truthound/datasources/adapters.py +428 -0
- truthound/datasources/async_base.py +599 -0
- truthound/datasources/async_factory.py +511 -0
- truthound/datasources/base.py +516 -0
- truthound/datasources/factory.py +433 -0
- truthound/datasources/nosql/__init__.py +47 -0
- truthound/datasources/nosql/base.py +487 -0
- truthound/datasources/nosql/elasticsearch.py +801 -0
- truthound/datasources/nosql/mongodb.py +636 -0
- truthound/datasources/pandas_optimized.py +582 -0
- truthound/datasources/pandas_source.py +216 -0
- truthound/datasources/polars_source.py +395 -0
- truthound/datasources/spark_source.py +479 -0
- truthound/datasources/sql/__init__.py +154 -0
- truthound/datasources/sql/base.py +710 -0
- truthound/datasources/sql/bigquery.py +410 -0
- truthound/datasources/sql/cloud_base.py +199 -0
- truthound/datasources/sql/databricks.py +471 -0
- truthound/datasources/sql/mysql.py +316 -0
- truthound/datasources/sql/oracle.py +427 -0
- truthound/datasources/sql/postgresql.py +321 -0
- truthound/datasources/sql/redshift.py +479 -0
- truthound/datasources/sql/snowflake.py +439 -0
- truthound/datasources/sql/sqlite.py +286 -0
- truthound/datasources/sql/sqlserver.py +437 -0
- truthound/datasources/streaming/__init__.py +47 -0
- truthound/datasources/streaming/base.py +350 -0
- truthound/datasources/streaming/kafka.py +670 -0
- truthound/decorators.py +98 -0
- truthound/docs/__init__.py +69 -0
- truthound/docs/extractor.py +971 -0
- truthound/docs/generator.py +601 -0
- truthound/docs/parser.py +1037 -0
- truthound/docs/renderer.py +999 -0
- truthound/drift/__init__.py +22 -0
- truthound/drift/compare.py +189 -0
- truthound/drift/detectors.py +464 -0
- truthound/drift/report.py +160 -0
- truthound/execution/__init__.py +65 -0
- truthound/execution/_protocols.py +324 -0
- truthound/execution/base.py +576 -0
- truthound/execution/distributed/__init__.py +179 -0
- truthound/execution/distributed/aggregations.py +731 -0
- truthound/execution/distributed/arrow_bridge.py +817 -0
- truthound/execution/distributed/base.py +550 -0
- truthound/execution/distributed/dask_engine.py +976 -0
- truthound/execution/distributed/mixins.py +766 -0
- truthound/execution/distributed/protocols.py +756 -0
- truthound/execution/distributed/ray_engine.py +1127 -0
- truthound/execution/distributed/registry.py +446 -0
- truthound/execution/distributed/spark_engine.py +1011 -0
- truthound/execution/distributed/validator_adapter.py +682 -0
- truthound/execution/pandas_engine.py +401 -0
- truthound/execution/polars_engine.py +497 -0
- truthound/execution/pushdown/__init__.py +230 -0
- truthound/execution/pushdown/ast.py +1550 -0
- truthound/execution/pushdown/builder.py +1550 -0
- truthound/execution/pushdown/dialects.py +1072 -0
- truthound/execution/pushdown/executor.py +829 -0
- truthound/execution/pushdown/optimizer.py +1041 -0
- truthound/execution/sql_engine.py +518 -0
- truthound/infrastructure/__init__.py +189 -0
- truthound/infrastructure/audit.py +1515 -0
- truthound/infrastructure/config.py +1133 -0
- truthound/infrastructure/encryption.py +1132 -0
- truthound/infrastructure/logging.py +1503 -0
- truthound/infrastructure/metrics.py +1220 -0
- truthound/lineage/__init__.py +89 -0
- truthound/lineage/base.py +746 -0
- truthound/lineage/impact_analysis.py +474 -0
- truthound/lineage/integrations/__init__.py +22 -0
- truthound/lineage/integrations/openlineage.py +548 -0
- truthound/lineage/tracker.py +512 -0
- truthound/lineage/visualization/__init__.py +33 -0
- truthound/lineage/visualization/protocols.py +145 -0
- truthound/lineage/visualization/renderers/__init__.py +20 -0
- truthound/lineage/visualization/renderers/cytoscape.py +329 -0
- truthound/lineage/visualization/renderers/d3.py +331 -0
- truthound/lineage/visualization/renderers/graphviz.py +276 -0
- truthound/lineage/visualization/renderers/mermaid.py +308 -0
- truthound/maskers.py +113 -0
- truthound/ml/__init__.py +124 -0
- truthound/ml/anomaly_models/__init__.py +31 -0
- truthound/ml/anomaly_models/ensemble.py +362 -0
- truthound/ml/anomaly_models/isolation_forest.py +444 -0
- truthound/ml/anomaly_models/statistical.py +392 -0
- truthound/ml/base.py +1178 -0
- truthound/ml/drift_detection/__init__.py +26 -0
- truthound/ml/drift_detection/concept.py +381 -0
- truthound/ml/drift_detection/distribution.py +361 -0
- truthound/ml/drift_detection/feature.py +442 -0
- truthound/ml/drift_detection/multivariate.py +495 -0
- truthound/ml/monitoring/__init__.py +88 -0
- truthound/ml/monitoring/alerting/__init__.py +33 -0
- truthound/ml/monitoring/alerting/handlers.py +427 -0
- truthound/ml/monitoring/alerting/rules.py +508 -0
- truthound/ml/monitoring/collectors/__init__.py +19 -0
- truthound/ml/monitoring/collectors/composite.py +105 -0
- truthound/ml/monitoring/collectors/drift.py +324 -0
- truthound/ml/monitoring/collectors/performance.py +179 -0
- truthound/ml/monitoring/collectors/quality.py +369 -0
- truthound/ml/monitoring/monitor.py +536 -0
- truthound/ml/monitoring/protocols.py +451 -0
- truthound/ml/monitoring/stores/__init__.py +15 -0
- truthound/ml/monitoring/stores/memory.py +201 -0
- truthound/ml/monitoring/stores/prometheus.py +296 -0
- truthound/ml/rule_learning/__init__.py +25 -0
- truthound/ml/rule_learning/constraint_miner.py +443 -0
- truthound/ml/rule_learning/pattern_learner.py +499 -0
- truthound/ml/rule_learning/profile_learner.py +462 -0
- truthound/multitenancy/__init__.py +326 -0
- truthound/multitenancy/core.py +852 -0
- truthound/multitenancy/integration.py +597 -0
- truthound/multitenancy/isolation.py +630 -0
- truthound/multitenancy/manager.py +770 -0
- truthound/multitenancy/middleware.py +765 -0
- truthound/multitenancy/quota.py +537 -0
- truthound/multitenancy/resolvers.py +603 -0
- truthound/multitenancy/storage.py +703 -0
- truthound/observability/__init__.py +307 -0
- truthound/observability/context.py +531 -0
- truthound/observability/instrumentation.py +611 -0
- truthound/observability/logging.py +887 -0
- truthound/observability/metrics.py +1157 -0
- truthound/observability/tracing/__init__.py +178 -0
- truthound/observability/tracing/baggage.py +310 -0
- truthound/observability/tracing/config.py +426 -0
- truthound/observability/tracing/exporter.py +787 -0
- truthound/observability/tracing/integration.py +1018 -0
- truthound/observability/tracing/otel/__init__.py +146 -0
- truthound/observability/tracing/otel/adapter.py +982 -0
- truthound/observability/tracing/otel/bridge.py +1177 -0
- truthound/observability/tracing/otel/compat.py +681 -0
- truthound/observability/tracing/otel/config.py +691 -0
- truthound/observability/tracing/otel/detection.py +327 -0
- truthound/observability/tracing/otel/protocols.py +426 -0
- truthound/observability/tracing/processor.py +561 -0
- truthound/observability/tracing/propagator.py +757 -0
- truthound/observability/tracing/provider.py +569 -0
- truthound/observability/tracing/resource.py +515 -0
- truthound/observability/tracing/sampler.py +487 -0
- truthound/observability/tracing/span.py +676 -0
- truthound/plugins/__init__.py +198 -0
- truthound/plugins/base.py +599 -0
- truthound/plugins/cli.py +680 -0
- truthound/plugins/dependencies/__init__.py +42 -0
- truthound/plugins/dependencies/graph.py +422 -0
- truthound/plugins/dependencies/resolver.py +417 -0
- truthound/plugins/discovery.py +379 -0
- truthound/plugins/docs/__init__.py +46 -0
- truthound/plugins/docs/extractor.py +444 -0
- truthound/plugins/docs/renderer.py +499 -0
- truthound/plugins/enterprise_manager.py +877 -0
- truthound/plugins/examples/__init__.py +19 -0
- truthound/plugins/examples/custom_validators.py +317 -0
- truthound/plugins/examples/slack_notifier.py +312 -0
- truthound/plugins/examples/xml_reporter.py +254 -0
- truthound/plugins/hooks.py +558 -0
- truthound/plugins/lifecycle/__init__.py +43 -0
- truthound/plugins/lifecycle/hot_reload.py +402 -0
- truthound/plugins/lifecycle/manager.py +371 -0
- truthound/plugins/manager.py +736 -0
- truthound/plugins/registry.py +338 -0
- truthound/plugins/security/__init__.py +93 -0
- truthound/plugins/security/exceptions.py +332 -0
- truthound/plugins/security/policies.py +348 -0
- truthound/plugins/security/protocols.py +643 -0
- truthound/plugins/security/sandbox/__init__.py +45 -0
- truthound/plugins/security/sandbox/context.py +158 -0
- truthound/plugins/security/sandbox/engines/__init__.py +19 -0
- truthound/plugins/security/sandbox/engines/container.py +379 -0
- truthound/plugins/security/sandbox/engines/noop.py +144 -0
- truthound/plugins/security/sandbox/engines/process.py +336 -0
- truthound/plugins/security/sandbox/factory.py +211 -0
- truthound/plugins/security/signing/__init__.py +57 -0
- truthound/plugins/security/signing/service.py +330 -0
- truthound/plugins/security/signing/trust_store.py +368 -0
- truthound/plugins/security/signing/verifier.py +459 -0
- truthound/plugins/versioning/__init__.py +41 -0
- truthound/plugins/versioning/constraints.py +297 -0
- truthound/plugins/versioning/resolver.py +329 -0
- truthound/profiler/__init__.py +1729 -0
- truthound/profiler/_lazy.py +452 -0
- truthound/profiler/ab_testing/__init__.py +80 -0
- truthound/profiler/ab_testing/analysis.py +449 -0
- truthound/profiler/ab_testing/base.py +257 -0
- truthound/profiler/ab_testing/experiment.py +395 -0
- truthound/profiler/ab_testing/tracking.py +368 -0
- truthound/profiler/auto_threshold.py +1170 -0
- truthound/profiler/base.py +579 -0
- truthound/profiler/cache_patterns.py +911 -0
- truthound/profiler/caching.py +1303 -0
- truthound/profiler/column_profiler.py +712 -0
- truthound/profiler/comparison.py +1007 -0
- truthound/profiler/custom_patterns.py +1170 -0
- truthound/profiler/dashboard/__init__.py +50 -0
- truthound/profiler/dashboard/app.py +476 -0
- truthound/profiler/dashboard/components.py +457 -0
- truthound/profiler/dashboard/config.py +72 -0
- truthound/profiler/distributed/__init__.py +83 -0
- truthound/profiler/distributed/base.py +281 -0
- truthound/profiler/distributed/dask_backend.py +498 -0
- truthound/profiler/distributed/local_backend.py +293 -0
- truthound/profiler/distributed/profiler.py +304 -0
- truthound/profiler/distributed/ray_backend.py +374 -0
- truthound/profiler/distributed/spark_backend.py +375 -0
- truthound/profiler/distributed.py +1366 -0
- truthound/profiler/enterprise_sampling.py +1065 -0
- truthound/profiler/errors.py +488 -0
- truthound/profiler/evolution/__init__.py +91 -0
- truthound/profiler/evolution/alerts.py +426 -0
- truthound/profiler/evolution/changes.py +206 -0
- truthound/profiler/evolution/compatibility.py +365 -0
- truthound/profiler/evolution/detector.py +372 -0
- truthound/profiler/evolution/protocols.py +121 -0
- truthound/profiler/generators/__init__.py +48 -0
- truthound/profiler/generators/base.py +384 -0
- truthound/profiler/generators/ml_rules.py +375 -0
- truthound/profiler/generators/pattern_rules.py +384 -0
- truthound/profiler/generators/schema_rules.py +267 -0
- truthound/profiler/generators/stats_rules.py +324 -0
- truthound/profiler/generators/suite_generator.py +857 -0
- truthound/profiler/i18n.py +1542 -0
- truthound/profiler/incremental.py +554 -0
- truthound/profiler/incremental_validation.py +1710 -0
- truthound/profiler/integration/__init__.py +73 -0
- truthound/profiler/integration/adapters.py +345 -0
- truthound/profiler/integration/context.py +371 -0
- truthound/profiler/integration/executor.py +527 -0
- truthound/profiler/integration/naming.py +75 -0
- truthound/profiler/integration/protocols.py +243 -0
- truthound/profiler/memory.py +1185 -0
- truthound/profiler/migration/__init__.py +60 -0
- truthound/profiler/migration/base.py +345 -0
- truthound/profiler/migration/manager.py +444 -0
- truthound/profiler/migration/v1_0_to_v1_1.py +484 -0
- truthound/profiler/ml/__init__.py +73 -0
- truthound/profiler/ml/base.py +244 -0
- truthound/profiler/ml/classifier.py +507 -0
- truthound/profiler/ml/feature_extraction.py +604 -0
- truthound/profiler/ml/pretrained.py +448 -0
- truthound/profiler/ml_inference.py +1276 -0
- truthound/profiler/native_patterns.py +815 -0
- truthound/profiler/observability.py +1184 -0
- truthound/profiler/process_timeout.py +1566 -0
- truthound/profiler/progress.py +568 -0
- truthound/profiler/progress_callbacks.py +1734 -0
- truthound/profiler/quality.py +1345 -0
- truthound/profiler/resilience.py +1180 -0
- truthound/profiler/sampled_matcher.py +794 -0
- truthound/profiler/sampling.py +1288 -0
- truthound/profiler/scheduling/__init__.py +82 -0
- truthound/profiler/scheduling/protocols.py +214 -0
- truthound/profiler/scheduling/scheduler.py +474 -0
- truthound/profiler/scheduling/storage.py +457 -0
- truthound/profiler/scheduling/triggers.py +449 -0
- truthound/profiler/schema.py +603 -0
- truthound/profiler/streaming.py +685 -0
- truthound/profiler/streaming_patterns.py +1354 -0
- truthound/profiler/suite_cli.py +625 -0
- truthound/profiler/suite_config.py +789 -0
- truthound/profiler/suite_export.py +1268 -0
- truthound/profiler/table_profiler.py +547 -0
- truthound/profiler/timeout.py +565 -0
- truthound/profiler/validation.py +1532 -0
- truthound/profiler/visualization/__init__.py +118 -0
- truthound/profiler/visualization/base.py +346 -0
- truthound/profiler/visualization/generator.py +1259 -0
- truthound/profiler/visualization/plotly_renderer.py +811 -0
- truthound/profiler/visualization/renderers.py +669 -0
- truthound/profiler/visualization/sections.py +540 -0
- truthound/profiler/visualization.py +2122 -0
- truthound/profiler/yaml_validation.py +1151 -0
- truthound/py.typed +0 -0
- truthound/ratelimit/__init__.py +248 -0
- truthound/ratelimit/algorithms.py +1108 -0
- truthound/ratelimit/core.py +573 -0
- truthound/ratelimit/integration.py +532 -0
- truthound/ratelimit/limiter.py +663 -0
- truthound/ratelimit/middleware.py +700 -0
- truthound/ratelimit/policy.py +792 -0
- truthound/ratelimit/storage.py +763 -0
- truthound/rbac/__init__.py +340 -0
- truthound/rbac/core.py +976 -0
- truthound/rbac/integration.py +760 -0
- truthound/rbac/manager.py +1052 -0
- truthound/rbac/middleware.py +842 -0
- truthound/rbac/policy.py +954 -0
- truthound/rbac/storage.py +878 -0
- truthound/realtime/__init__.py +141 -0
- truthound/realtime/adapters/__init__.py +43 -0
- truthound/realtime/adapters/base.py +533 -0
- truthound/realtime/adapters/kafka.py +487 -0
- truthound/realtime/adapters/kinesis.py +479 -0
- truthound/realtime/adapters/mock.py +243 -0
- truthound/realtime/base.py +553 -0
- truthound/realtime/factory.py +382 -0
- truthound/realtime/incremental.py +660 -0
- truthound/realtime/processing/__init__.py +67 -0
- truthound/realtime/processing/exactly_once.py +575 -0
- truthound/realtime/processing/state.py +547 -0
- truthound/realtime/processing/windows.py +647 -0
- truthound/realtime/protocols.py +569 -0
- truthound/realtime/streaming.py +605 -0
- truthound/realtime/testing/__init__.py +32 -0
- truthound/realtime/testing/containers.py +615 -0
- truthound/realtime/testing/fixtures.py +484 -0
- truthound/report.py +280 -0
- truthound/reporters/__init__.py +46 -0
- truthound/reporters/_protocols.py +30 -0
- truthound/reporters/base.py +324 -0
- truthound/reporters/ci/__init__.py +66 -0
- truthound/reporters/ci/azure.py +436 -0
- truthound/reporters/ci/base.py +509 -0
- truthound/reporters/ci/bitbucket.py +567 -0
- truthound/reporters/ci/circleci.py +547 -0
- truthound/reporters/ci/detection.py +364 -0
- truthound/reporters/ci/factory.py +182 -0
- truthound/reporters/ci/github.py +388 -0
- truthound/reporters/ci/gitlab.py +471 -0
- truthound/reporters/ci/jenkins.py +525 -0
- truthound/reporters/console_reporter.py +299 -0
- truthound/reporters/factory.py +211 -0
- truthound/reporters/html_reporter.py +524 -0
- truthound/reporters/json_reporter.py +256 -0
- truthound/reporters/markdown_reporter.py +280 -0
- truthound/reporters/sdk/__init__.py +174 -0
- truthound/reporters/sdk/builder.py +558 -0
- truthound/reporters/sdk/mixins.py +1150 -0
- truthound/reporters/sdk/schema.py +1493 -0
- truthound/reporters/sdk/templates.py +666 -0
- truthound/reporters/sdk/testing.py +968 -0
- truthound/scanners.py +170 -0
- truthound/scheduling/__init__.py +122 -0
- truthound/scheduling/cron.py +1136 -0
- truthound/scheduling/presets.py +212 -0
- truthound/schema.py +275 -0
- truthound/secrets/__init__.py +173 -0
- truthound/secrets/base.py +618 -0
- truthound/secrets/cloud.py +682 -0
- truthound/secrets/integration.py +507 -0
- truthound/secrets/manager.py +633 -0
- truthound/secrets/oidc/__init__.py +172 -0
- truthound/secrets/oidc/base.py +902 -0
- truthound/secrets/oidc/credential_provider.py +623 -0
- truthound/secrets/oidc/exchangers.py +1001 -0
- truthound/secrets/oidc/github/__init__.py +110 -0
- truthound/secrets/oidc/github/claims.py +718 -0
- truthound/secrets/oidc/github/enhanced_provider.py +693 -0
- truthound/secrets/oidc/github/trust_policy.py +742 -0
- truthound/secrets/oidc/github/verification.py +723 -0
- truthound/secrets/oidc/github/workflow.py +691 -0
- truthound/secrets/oidc/providers.py +825 -0
- truthound/secrets/providers.py +506 -0
- truthound/secrets/resolver.py +495 -0
- truthound/stores/__init__.py +177 -0
- truthound/stores/backends/__init__.py +18 -0
- truthound/stores/backends/_protocols.py +340 -0
- truthound/stores/backends/azure_blob.py +530 -0
- truthound/stores/backends/concurrent_filesystem.py +915 -0
- truthound/stores/backends/connection_pool.py +1365 -0
- truthound/stores/backends/database.py +743 -0
- truthound/stores/backends/filesystem.py +538 -0
- truthound/stores/backends/gcs.py +399 -0
- truthound/stores/backends/memory.py +354 -0
- truthound/stores/backends/s3.py +434 -0
- truthound/stores/backpressure/__init__.py +84 -0
- truthound/stores/backpressure/base.py +375 -0
- truthound/stores/backpressure/circuit_breaker.py +434 -0
- truthound/stores/backpressure/monitor.py +376 -0
- truthound/stores/backpressure/strategies.py +677 -0
- truthound/stores/base.py +551 -0
- truthound/stores/batching/__init__.py +65 -0
- truthound/stores/batching/base.py +305 -0
- truthound/stores/batching/buffer.py +370 -0
- truthound/stores/batching/store.py +248 -0
- truthound/stores/batching/writer.py +521 -0
- truthound/stores/caching/__init__.py +60 -0
- truthound/stores/caching/backends.py +684 -0
- truthound/stores/caching/base.py +356 -0
- truthound/stores/caching/store.py +305 -0
- truthound/stores/compression/__init__.py +193 -0
- truthound/stores/compression/adaptive.py +694 -0
- truthound/stores/compression/base.py +514 -0
- truthound/stores/compression/pipeline.py +868 -0
- truthound/stores/compression/providers.py +672 -0
- truthound/stores/compression/streaming.py +832 -0
- truthound/stores/concurrency/__init__.py +81 -0
- truthound/stores/concurrency/atomic.py +556 -0
- truthound/stores/concurrency/index.py +775 -0
- truthound/stores/concurrency/locks.py +576 -0
- truthound/stores/concurrency/manager.py +482 -0
- truthound/stores/encryption/__init__.py +297 -0
- truthound/stores/encryption/base.py +952 -0
- truthound/stores/encryption/keys.py +1191 -0
- truthound/stores/encryption/pipeline.py +903 -0
- truthound/stores/encryption/providers.py +953 -0
- truthound/stores/encryption/streaming.py +950 -0
- truthound/stores/expectations.py +227 -0
- truthound/stores/factory.py +246 -0
- truthound/stores/migration/__init__.py +75 -0
- truthound/stores/migration/base.py +480 -0
- truthound/stores/migration/manager.py +347 -0
- truthound/stores/migration/registry.py +382 -0
- truthound/stores/migration/store.py +559 -0
- truthound/stores/observability/__init__.py +106 -0
- truthound/stores/observability/audit.py +718 -0
- truthound/stores/observability/config.py +270 -0
- truthound/stores/observability/factory.py +208 -0
- truthound/stores/observability/metrics.py +636 -0
- truthound/stores/observability/protocols.py +410 -0
- truthound/stores/observability/store.py +570 -0
- truthound/stores/observability/tracing.py +784 -0
- truthound/stores/replication/__init__.py +76 -0
- truthound/stores/replication/base.py +260 -0
- truthound/stores/replication/monitor.py +269 -0
- truthound/stores/replication/store.py +439 -0
- truthound/stores/replication/syncer.py +391 -0
- truthound/stores/results.py +359 -0
- truthound/stores/retention/__init__.py +77 -0
- truthound/stores/retention/base.py +378 -0
- truthound/stores/retention/policies.py +621 -0
- truthound/stores/retention/scheduler.py +279 -0
- truthound/stores/retention/store.py +526 -0
- truthound/stores/streaming/__init__.py +138 -0
- truthound/stores/streaming/base.py +801 -0
- truthound/stores/streaming/database.py +984 -0
- truthound/stores/streaming/filesystem.py +719 -0
- truthound/stores/streaming/reader.py +629 -0
- truthound/stores/streaming/s3.py +843 -0
- truthound/stores/streaming/writer.py +790 -0
- truthound/stores/tiering/__init__.py +108 -0
- truthound/stores/tiering/base.py +462 -0
- truthound/stores/tiering/manager.py +249 -0
- truthound/stores/tiering/policies.py +692 -0
- truthound/stores/tiering/store.py +526 -0
- truthound/stores/versioning/__init__.py +56 -0
- truthound/stores/versioning/base.py +376 -0
- truthound/stores/versioning/store.py +660 -0
- truthound/stores/versioning/strategies.py +353 -0
- truthound/types.py +56 -0
- truthound/validators/__init__.py +774 -0
- truthound/validators/aggregate/__init__.py +27 -0
- truthound/validators/aggregate/central.py +116 -0
- truthound/validators/aggregate/extremes.py +116 -0
- truthound/validators/aggregate/spread.py +118 -0
- truthound/validators/aggregate/sum.py +64 -0
- truthound/validators/aggregate/type.py +78 -0
- truthound/validators/anomaly/__init__.py +93 -0
- truthound/validators/anomaly/base.py +431 -0
- truthound/validators/anomaly/ml_based.py +1190 -0
- truthound/validators/anomaly/multivariate.py +647 -0
- truthound/validators/anomaly/statistical.py +599 -0
- truthound/validators/base.py +1089 -0
- truthound/validators/business_rule/__init__.py +46 -0
- truthound/validators/business_rule/base.py +147 -0
- truthound/validators/business_rule/checksum.py +509 -0
- truthound/validators/business_rule/financial.py +526 -0
- truthound/validators/cache.py +733 -0
- truthound/validators/completeness/__init__.py +39 -0
- truthound/validators/completeness/conditional.py +73 -0
- truthound/validators/completeness/default.py +98 -0
- truthound/validators/completeness/empty.py +103 -0
- truthound/validators/completeness/nan.py +337 -0
- truthound/validators/completeness/null.py +152 -0
- truthound/validators/cross_table/__init__.py +17 -0
- truthound/validators/cross_table/aggregate.py +333 -0
- truthound/validators/cross_table/row_count.py +122 -0
- truthound/validators/datetime/__init__.py +29 -0
- truthound/validators/datetime/format.py +78 -0
- truthound/validators/datetime/freshness.py +269 -0
- truthound/validators/datetime/order.py +73 -0
- truthound/validators/datetime/parseable.py +185 -0
- truthound/validators/datetime/range.py +202 -0
- truthound/validators/datetime/timezone.py +69 -0
- truthound/validators/distribution/__init__.py +49 -0
- truthound/validators/distribution/distribution.py +128 -0
- truthound/validators/distribution/monotonic.py +119 -0
- truthound/validators/distribution/outlier.py +178 -0
- truthound/validators/distribution/quantile.py +80 -0
- truthound/validators/distribution/range.py +254 -0
- truthound/validators/distribution/set.py +125 -0
- truthound/validators/distribution/statistical.py +459 -0
- truthound/validators/drift/__init__.py +79 -0
- truthound/validators/drift/base.py +427 -0
- truthound/validators/drift/multi_feature.py +401 -0
- truthound/validators/drift/numeric.py +395 -0
- truthound/validators/drift/psi.py +446 -0
- truthound/validators/drift/statistical.py +510 -0
- truthound/validators/enterprise.py +1658 -0
- truthound/validators/geospatial/__init__.py +80 -0
- truthound/validators/geospatial/base.py +97 -0
- truthound/validators/geospatial/boundary.py +238 -0
- truthound/validators/geospatial/coordinate.py +351 -0
- truthound/validators/geospatial/distance.py +399 -0
- truthound/validators/geospatial/polygon.py +665 -0
- truthound/validators/i18n/__init__.py +308 -0
- truthound/validators/i18n/bidi.py +571 -0
- truthound/validators/i18n/catalogs.py +570 -0
- truthound/validators/i18n/dialects.py +763 -0
- truthound/validators/i18n/extended_catalogs.py +549 -0
- truthound/validators/i18n/formatting.py +1434 -0
- truthound/validators/i18n/loader.py +1020 -0
- truthound/validators/i18n/messages.py +521 -0
- truthound/validators/i18n/plural.py +683 -0
- truthound/validators/i18n/protocols.py +855 -0
- truthound/validators/i18n/tms.py +1162 -0
- truthound/validators/localization/__init__.py +53 -0
- truthound/validators/localization/base.py +122 -0
- truthound/validators/localization/chinese.py +362 -0
- truthound/validators/localization/japanese.py +275 -0
- truthound/validators/localization/korean.py +524 -0
- truthound/validators/memory/__init__.py +94 -0
- truthound/validators/memory/approximate_knn.py +506 -0
- truthound/validators/memory/base.py +547 -0
- truthound/validators/memory/sgd_online.py +719 -0
- truthound/validators/memory/streaming_ecdf.py +753 -0
- truthound/validators/ml_feature/__init__.py +54 -0
- truthound/validators/ml_feature/base.py +249 -0
- truthound/validators/ml_feature/correlation.py +299 -0
- truthound/validators/ml_feature/leakage.py +344 -0
- truthound/validators/ml_feature/null_impact.py +270 -0
- truthound/validators/ml_feature/scale.py +264 -0
- truthound/validators/multi_column/__init__.py +89 -0
- truthound/validators/multi_column/arithmetic.py +284 -0
- truthound/validators/multi_column/base.py +231 -0
- truthound/validators/multi_column/comparison.py +273 -0
- truthound/validators/multi_column/consistency.py +312 -0
- truthound/validators/multi_column/statistical.py +299 -0
- truthound/validators/optimization/__init__.py +164 -0
- truthound/validators/optimization/aggregation.py +563 -0
- truthound/validators/optimization/covariance.py +556 -0
- truthound/validators/optimization/geo.py +626 -0
- truthound/validators/optimization/graph.py +587 -0
- truthound/validators/optimization/orchestrator.py +970 -0
- truthound/validators/optimization/profiling.py +1312 -0
- truthound/validators/privacy/__init__.py +223 -0
- truthound/validators/privacy/base.py +635 -0
- truthound/validators/privacy/ccpa.py +670 -0
- truthound/validators/privacy/gdpr.py +728 -0
- truthound/validators/privacy/global_patterns.py +604 -0
- truthound/validators/privacy/plugins.py +867 -0
- truthound/validators/profiling/__init__.py +52 -0
- truthound/validators/profiling/base.py +175 -0
- truthound/validators/profiling/cardinality.py +312 -0
- truthound/validators/profiling/entropy.py +391 -0
- truthound/validators/profiling/frequency.py +455 -0
- truthound/validators/pushdown_support.py +660 -0
- truthound/validators/query/__init__.py +91 -0
- truthound/validators/query/aggregate.py +346 -0
- truthound/validators/query/base.py +246 -0
- truthound/validators/query/column.py +249 -0
- truthound/validators/query/expression.py +274 -0
- truthound/validators/query/result.py +323 -0
- truthound/validators/query/row_count.py +264 -0
- truthound/validators/referential/__init__.py +80 -0
- truthound/validators/referential/base.py +395 -0
- truthound/validators/referential/cascade.py +391 -0
- truthound/validators/referential/circular.py +563 -0
- truthound/validators/referential/foreign_key.py +624 -0
- truthound/validators/referential/orphan.py +485 -0
- truthound/validators/registry.py +112 -0
- truthound/validators/schema/__init__.py +41 -0
- truthound/validators/schema/column_count.py +142 -0
- truthound/validators/schema/column_exists.py +80 -0
- truthound/validators/schema/column_order.py +82 -0
- truthound/validators/schema/column_pair.py +85 -0
- truthound/validators/schema/column_pair_set.py +195 -0
- truthound/validators/schema/column_type.py +94 -0
- truthound/validators/schema/multi_column.py +53 -0
- truthound/validators/schema/multi_column_aggregate.py +175 -0
- truthound/validators/schema/referential.py +274 -0
- truthound/validators/schema/table_schema.py +91 -0
- truthound/validators/schema_validator.py +219 -0
- truthound/validators/sdk/__init__.py +250 -0
- truthound/validators/sdk/builder.py +680 -0
- truthound/validators/sdk/decorators.py +474 -0
- truthound/validators/sdk/enterprise/__init__.py +211 -0
- truthound/validators/sdk/enterprise/docs.py +725 -0
- truthound/validators/sdk/enterprise/fuzzing.py +659 -0
- truthound/validators/sdk/enterprise/licensing.py +709 -0
- truthound/validators/sdk/enterprise/manager.py +543 -0
- truthound/validators/sdk/enterprise/resources.py +628 -0
- truthound/validators/sdk/enterprise/sandbox.py +766 -0
- truthound/validators/sdk/enterprise/signing.py +603 -0
- truthound/validators/sdk/enterprise/templates.py +865 -0
- truthound/validators/sdk/enterprise/versioning.py +659 -0
- truthound/validators/sdk/templates.py +757 -0
- truthound/validators/sdk/testing.py +807 -0
- truthound/validators/security/__init__.py +181 -0
- truthound/validators/security/redos/__init__.py +182 -0
- truthound/validators/security/redos/core.py +861 -0
- truthound/validators/security/redos/cpu_monitor.py +593 -0
- truthound/validators/security/redos/cve_database.py +791 -0
- truthound/validators/security/redos/ml/__init__.py +155 -0
- truthound/validators/security/redos/ml/base.py +785 -0
- truthound/validators/security/redos/ml/datasets.py +618 -0
- truthound/validators/security/redos/ml/features.py +359 -0
- truthound/validators/security/redos/ml/models.py +1000 -0
- truthound/validators/security/redos/ml/predictor.py +507 -0
- truthound/validators/security/redos/ml/storage.py +632 -0
- truthound/validators/security/redos/ml/training.py +571 -0
- truthound/validators/security/redos/ml_analyzer.py +937 -0
- truthound/validators/security/redos/optimizer.py +674 -0
- truthound/validators/security/redos/profiler.py +682 -0
- truthound/validators/security/redos/re2_engine.py +709 -0
- truthound/validators/security/redos.py +886 -0
- truthound/validators/security/sql_security.py +1247 -0
- truthound/validators/streaming/__init__.py +126 -0
- truthound/validators/streaming/base.py +292 -0
- truthound/validators/streaming/completeness.py +210 -0
- truthound/validators/streaming/mixin.py +575 -0
- truthound/validators/streaming/range.py +308 -0
- truthound/validators/streaming/sources.py +846 -0
- truthound/validators/string/__init__.py +57 -0
- truthound/validators/string/casing.py +158 -0
- truthound/validators/string/charset.py +96 -0
- truthound/validators/string/format.py +501 -0
- truthound/validators/string/json.py +77 -0
- truthound/validators/string/json_schema.py +184 -0
- truthound/validators/string/length.py +104 -0
- truthound/validators/string/like_pattern.py +237 -0
- truthound/validators/string/regex.py +202 -0
- truthound/validators/string/regex_extended.py +435 -0
- truthound/validators/table/__init__.py +88 -0
- truthound/validators/table/base.py +78 -0
- truthound/validators/table/column_count.py +198 -0
- truthound/validators/table/freshness.py +362 -0
- truthound/validators/table/row_count.py +251 -0
- truthound/validators/table/schema.py +333 -0
- truthound/validators/table/size.py +285 -0
- truthound/validators/timeout/__init__.py +102 -0
- truthound/validators/timeout/advanced/__init__.py +247 -0
- truthound/validators/timeout/advanced/circuit_breaker.py +675 -0
- truthound/validators/timeout/advanced/prediction.py +773 -0
- truthound/validators/timeout/advanced/priority.py +618 -0
- truthound/validators/timeout/advanced/redis_backend.py +770 -0
- truthound/validators/timeout/advanced/retry.py +721 -0
- truthound/validators/timeout/advanced/sampling.py +788 -0
- truthound/validators/timeout/advanced/sla.py +661 -0
- truthound/validators/timeout/advanced/telemetry.py +804 -0
- truthound/validators/timeout/cascade.py +477 -0
- truthound/validators/timeout/deadline.py +657 -0
- truthound/validators/timeout/degradation.py +525 -0
- truthound/validators/timeout/distributed.py +597 -0
- truthound/validators/timeseries/__init__.py +89 -0
- truthound/validators/timeseries/base.py +326 -0
- truthound/validators/timeseries/completeness.py +617 -0
- truthound/validators/timeseries/gap.py +485 -0
- truthound/validators/timeseries/monotonic.py +310 -0
- truthound/validators/timeseries/seasonality.py +422 -0
- truthound/validators/timeseries/trend.py +510 -0
- truthound/validators/uniqueness/__init__.py +59 -0
- truthound/validators/uniqueness/approximate.py +475 -0
- truthound/validators/uniqueness/distinct_values.py +253 -0
- truthound/validators/uniqueness/duplicate.py +118 -0
- truthound/validators/uniqueness/primary_key.py +140 -0
- truthound/validators/uniqueness/unique.py +191 -0
- truthound/validators/uniqueness/within_record.py +599 -0
- truthound/validators/utils.py +756 -0
- truthound-1.0.8.dist-info/METADATA +474 -0
- truthound-1.0.8.dist-info/RECORD +877 -0
- truthound-1.0.8.dist-info/WHEEL +4 -0
- truthound-1.0.8.dist-info/entry_points.txt +2 -0
- truthound-1.0.8.dist-info/licenses/LICENSE +190 -0
|
@@ -0,0 +1,1288 @@
|
|
|
1
|
+
"""Enterprise-grade sampling strategies for memory-efficient pattern matching.
|
|
2
|
+
|
|
3
|
+
This module provides a comprehensive sampling framework that prevents OOM errors
|
|
4
|
+
when processing large datasets while maintaining statistical accuracy.
|
|
5
|
+
|
|
6
|
+
Key features:
|
|
7
|
+
- Pluggable sampling strategy architecture
|
|
8
|
+
- Memory-aware adaptive sampling
|
|
9
|
+
- Statistical confidence estimation
|
|
10
|
+
- Stratified sampling for skewed distributions
|
|
11
|
+
- Reservoir sampling for streaming data
|
|
12
|
+
|
|
13
|
+
Design Principles:
|
|
14
|
+
- Open/Closed: New strategies can be added without modifying existing code
|
|
15
|
+
- Single Responsibility: Each strategy handles one sampling approach
|
|
16
|
+
- Dependency Inversion: High-level modules depend on abstractions
|
|
17
|
+
|
|
18
|
+
Example:
|
|
19
|
+
from truthound.profiler.sampling import (
|
|
20
|
+
SampledPatternMatcher,
|
|
21
|
+
SamplingConfig,
|
|
22
|
+
AdaptiveSamplingStrategy,
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
# Use adaptive sampling based on data size
|
|
26
|
+
config = SamplingConfig(
|
|
27
|
+
strategy="adaptive",
|
|
28
|
+
max_rows=100_000,
|
|
29
|
+
confidence_level=0.95,
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
matcher = SampledPatternMatcher(sampling_config=config)
|
|
33
|
+
results = matcher.match_column(lf, "email")
|
|
34
|
+
"""
|
|
35
|
+
|
|
36
|
+
from __future__ import annotations
|
|
37
|
+
|
|
38
|
+
import hashlib
|
|
39
|
+
import logging
|
|
40
|
+
import math
|
|
41
|
+
import random
|
|
42
|
+
import sys
|
|
43
|
+
import threading
|
|
44
|
+
import time
|
|
45
|
+
from abc import ABC, abstractmethod
|
|
46
|
+
from dataclasses import dataclass, field
|
|
47
|
+
from datetime import datetime
|
|
48
|
+
from enum import Enum
|
|
49
|
+
from typing import (
|
|
50
|
+
TYPE_CHECKING,
|
|
51
|
+
Any,
|
|
52
|
+
Callable,
|
|
53
|
+
Generic,
|
|
54
|
+
Iterator,
|
|
55
|
+
Protocol,
|
|
56
|
+
Sequence,
|
|
57
|
+
TypeVar,
|
|
58
|
+
)
|
|
59
|
+
|
|
60
|
+
import polars as pl
|
|
61
|
+
|
|
62
|
+
logger = logging.getLogger(__name__)
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
# =============================================================================
|
|
66
|
+
# Types and Enums
|
|
67
|
+
# =============================================================================
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
class SamplingMethod(str, Enum):
    """Identifiers for the sampling approaches this module knows about.

    Every member is also a plain ``str``, so it compares equal to its value.
    """

    # Use the complete dataset; nothing is sampled.
    NONE = "none"
    # Simple random sampling.
    RANDOM = "random"
    # Take every n-th row.
    SYSTEMATIC = "systematic"
    # Sample so that the distribution is preserved.
    STRATIFIED = "stratified"
    # Reservoir sampling for streaming data.
    RESERVOIR = "reservoir"
    # Pick a method automatically from the data size.
    ADAPTIVE = "adaptive"
    # Leading n rows: fastest, least accurate.
    HEAD = "head"
    # Deterministic, hash-based sampling.
    HASH = "hash"
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
class ConfidenceLevel(float, Enum):
    """Frequently used statistical confidence levels.

    Every member is also a ``float`` and can be used directly in arithmetic.
    """

    # 90% confidence.
    LOW = 0.90
    # 95% confidence (the default elsewhere in this module).
    MEDIUM = 0.95
    # 99% confidence.
    HIGH = 0.99
    # 99.9% confidence.
    VERY_HIGH = 0.999
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
# =============================================================================
|
|
93
|
+
# Sampling Result
|
|
94
|
+
# =============================================================================
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
@dataclass(frozen=True)
class SamplingMetrics:
    """Immutable summary of one sampling operation.

    Attributes:
        original_size: Row count of the dataset before sampling
        sample_size: Row count actually kept
        sampling_ratio: Share of the data that was kept
        confidence_level: Statistical confidence level
        margin_of_error: Estimated margin of error
        strategy_used: Name of the strategy that produced the sample
        sampling_time_ms: Duration of the sampling step in milliseconds
        memory_saved_estimate_mb: Estimated memory saved, in megabytes
    """

    original_size: int
    sample_size: int
    sampling_ratio: float
    confidence_level: float
    margin_of_error: float
    strategy_used: str
    sampling_time_ms: float = 0.0
    memory_saved_estimate_mb: float = 0.0

    @property
    def is_full_scan(self) -> bool:
        """True when the sampling ratio is at least 1.0 (nothing dropped)."""
        return 1.0 <= self.sampling_ratio

    @property
    def reduction_factor(self) -> float:
        """Ratio ``original_size / sample_size``; 0.0 for an empty sample."""
        kept = self.sample_size
        return self.original_size / kept if kept != 0 else 0.0

    def to_dict(self) -> dict[str, Any]:
        """Return every field plus the derived properties as a plain dict."""
        stored_fields = (
            "original_size",
            "sample_size",
            "sampling_ratio",
            "confidence_level",
            "margin_of_error",
            "strategy_used",
            "sampling_time_ms",
            "memory_saved_estimate_mb",
        )
        payload: dict[str, Any] = {
            field_name: getattr(self, field_name) for field_name in stored_fields
        }
        # Derived values go last, keeping the established key order.
        payload["is_full_scan"] = self.is_full_scan
        payload["reduction_factor"] = self.reduction_factor
        return payload
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
# A TypeVar has to be bound to a module-level name of the same spelling for
# static type checkers to accept it; creating it inline inside ``Generic[...]``
# only works at runtime.
_SampledT = TypeVar("_SampledT")


@dataclass
class SamplingResult(Generic[_SampledT]):
    """Result of a sampling operation.

    The class stays subscriptable (``SamplingResult[X]``) for backward
    compatibility, although no field currently uses the type parameter.

    Attributes:
        data: The sampled LazyFrame
        metrics: Sampling metrics
        is_sampled: Whether sampling was applied
    """

    data: pl.LazyFrame
    metrics: SamplingMetrics
    is_sampled: bool = True

    def __post_init__(self) -> None:
        """Log a warning when sampling was applied but produced no rows."""
        if self.metrics.sample_size == 0 and self.is_sampled:
            logger.warning("Sampling resulted in zero rows")
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
# =============================================================================
|
|
170
|
+
# Sampling Configuration
|
|
171
|
+
# =============================================================================
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
@dataclass
class SamplingConfig:
    """Configuration for sampling behavior.

    This configuration controls how data is sampled for pattern matching.
    It supports both explicit size limits and statistical parameters.

    Attributes:
        strategy: Sampling strategy to use. A plain string naming a
            ``SamplingMethod`` value (e.g. ``"adaptive"``) is converted to
            the enum member; any other value is stored unchanged.
        max_rows: Upper bound on the sample size (0 = no cap)
        max_memory_mb: Memory budget in megabytes for sampling
        confidence_level: Statistical confidence level (0.0 to 1.0, exclusive)
        margin_of_error: Acceptable margin of error (0.0 to 1.0, exclusive)
        seed: Random seed for reproducibility (None = random)
        min_sample_size: Minimum sample size regardless of calculations
        enable_caching: Cache sampling decisions for same data
        fallback_strategy: Strategy to use if primary fails (converted the
            same way as ``strategy``)
        small_dataset_threshold: Size threshold for the adaptive strategy
        medium_dataset_threshold: Size threshold for the adaptive strategy
        large_dataset_threshold: Size threshold for the adaptive strategy
    """

    strategy: SamplingMethod = SamplingMethod.ADAPTIVE
    max_rows: int = 100_000
    # NOTE(review): earlier documentation disagreed on whether 0 means
    # "unlimited" or "auto (10% of available memory)"; this class does not
    # interpret the value itself - confirm against the code that consumes it.
    max_memory_mb: int = 0
    confidence_level: float = 0.95
    margin_of_error: float = 0.05
    seed: int | None = None
    min_sample_size: int = 1000
    enable_caching: bool = True
    fallback_strategy: SamplingMethod = SamplingMethod.HEAD

    # Size thresholds for adaptive strategy
    small_dataset_threshold: int = 10_000
    medium_dataset_threshold: int = 100_000
    large_dataset_threshold: int = 1_000_000

    def __post_init__(self) -> None:
        """Normalize strategy values and validate the numeric settings.

        Raises:
            ValueError: If ``confidence_level`` or ``margin_of_error`` is not
                strictly between 0 and 1, ``max_rows`` is negative, or
                ``min_sample_size`` is smaller than 1.
        """
        # Accept the string spelling of a method so that later attribute
        # access such as ``.value`` works.
        self.strategy = self._coerce_method(self.strategy)
        self.fallback_strategy = self._coerce_method(self.fallback_strategy)

        if not 0.0 < self.confidence_level < 1.0:
            raise ValueError(
                f"confidence_level must be between 0 and 1, got {self.confidence_level}"
            )
        if not 0.0 < self.margin_of_error < 1.0:
            raise ValueError(
                f"margin_of_error must be between 0 and 1, got {self.margin_of_error}"
            )
        if self.max_rows < 0:
            raise ValueError(f"max_rows must be non-negative, got {self.max_rows}")
        if self.min_sample_size < 1:
            raise ValueError(
                f"min_sample_size must be positive, got {self.min_sample_size}"
            )

    @staticmethod
    def _coerce_method(value: Any) -> Any:
        """Return the ``SamplingMethod`` member for *value* when one exists.

        Values that do not name a known method are returned unchanged, so
        construction never fails because of the strategy name alone.
        """
        try:
            return SamplingMethod(value)
        except ValueError:
            return value

    def calculate_required_sample_size(
        self,
        population_size: int,
        expected_proportion: float = 0.5,
    ) -> int:
        """Calculate statistically required sample size.

        Uses Cochran's formula with finite population correction, then
        clamps the result: at least ``min_sample_size``, at most
        ``population_size`` and, when ``max_rows`` is positive, at most
        ``max_rows``.

        Args:
            population_size: Total population size
            expected_proportion: Expected proportion (0.5 = maximum variance)

        Returns:
            Required sample size for desired confidence/margin; 0 when
            ``population_size`` is not positive
        """
        if population_size <= 0:
            return 0

        # Conventional z-scores for the common confidence levels.
        z_scores = {
            0.90: 1.645,
            0.95: 1.96,
            0.99: 2.576,
            0.999: 3.291,
        }
        z = z_scores.get(round(self.confidence_level, 3))
        if z is None:
            # Only compute the quantile when the table has no entry.
            z = self._z_score_from_confidence(self.confidence_level)

        p = expected_proportion
        e = self.margin_of_error

        # Cochran's formula for infinite population
        n0 = (z ** 2 * p * (1 - p)) / (e ** 2)

        # Finite population correction. The denominator is exactly zero when
        # n0 == 0 and population_size == 1 (e.g. expected_proportion of 0 or
        # 1); treat that as "no rows required" instead of dividing by zero.
        denominator = 1 + (n0 - 1) / population_size
        n = 0.0 if denominator == 0 else n0 / denominator

        # Apply bounds
        sample_size = int(math.ceil(n))
        sample_size = max(sample_size, self.min_sample_size)
        sample_size = min(sample_size, population_size)

        if self.max_rows > 0:
            sample_size = min(sample_size, self.max_rows)

        return sample_size

    @staticmethod
    def _z_score_from_confidence(confidence: float) -> float:
        """Return the two-sided z-score for a confidence level.

        For a confidence strictly between 0 and 1 this is the standard
        normal quantile at ``(1 + confidence) / 2``. Inputs outside that
        range keep the historical coarse answers (3.3 for ``>= 1``,
        otherwise 1.0) so that direct callers never see an exception.

        Args:
            confidence: Confidence level, normally in (0, 1)

        Returns:
            The z-score to use in margin-of-error formulas
        """
        # Local import: the module does not import ``statistics`` globally.
        from statistics import NormalDist

        if not 0.0 < confidence < 1.0:
            return 3.3 if confidence >= 1.0 else 1.0
        # (1 + c) / 2 avoids the rounding error of computing 1 - c first.
        return NormalDist().inv_cdf((1.0 + confidence) / 2.0)

    @classmethod
    def for_accuracy(cls, accuracy: str = "medium") -> "SamplingConfig":
        """Create config optimized for accuracy level.

        Args:
            accuracy: "low", "medium", "high", or "maximum"; any other
                value falls back to the "medium" preset

        Returns:
            Configured SamplingConfig
        """
        presets: dict[str, dict[str, Any]] = {
            "low": {
                "strategy": SamplingMethod.HEAD,
                "max_rows": 10_000,
                "confidence_level": 0.90,
                "margin_of_error": 0.10,
            },
            "medium": {
                "strategy": SamplingMethod.ADAPTIVE,
                "max_rows": 100_000,
                "confidence_level": 0.95,
                "margin_of_error": 0.05,
            },
            "high": {
                "strategy": SamplingMethod.RANDOM,
                "max_rows": 500_000,
                "confidence_level": 0.99,
                "margin_of_error": 0.02,
            },
            "maximum": {
                "strategy": SamplingMethod.NONE,
                "max_rows": 0,
                "confidence_level": 0.999,
                "margin_of_error": 0.01,
            },
        }
        # Build only the requested preset instead of all four.
        return cls(**presets.get(accuracy, presets["medium"]))

    @classmethod
    def for_speed(cls) -> "SamplingConfig":
        """Create config optimized for speed."""
        return cls(
            strategy=SamplingMethod.HEAD,
            max_rows=10_000,
            confidence_level=0.90,
            margin_of_error=0.10,
        )

    @classmethod
    def for_memory(cls, max_memory_mb: int = 100) -> "SamplingConfig":
        """Create config optimized for memory efficiency.

        Args:
            max_memory_mb: Memory budget in megabytes stored on the config
        """
        return cls(
            strategy=SamplingMethod.RESERVOIR,
            max_rows=50_000,
            max_memory_mb=max_memory_mb,
            confidence_level=0.95,
            margin_of_error=0.05,
        )

    def to_dict(self) -> dict[str, Any]:
        """Convert the main settings to a dictionary.

        The strategy is emitted as its string value; a strategy that is not
        a ``SamplingMethod`` member is emitted as stored.
        """
        return {
            "strategy": getattr(self.strategy, "value", self.strategy),
            "max_rows": self.max_rows,
            "max_memory_mb": self.max_memory_mb,
            "confidence_level": self.confidence_level,
            "margin_of_error": self.margin_of_error,
            "seed": self.seed,
            "min_sample_size": self.min_sample_size,
        }
|
|
363
|
+
|
|
364
|
+
|
|
365
|
+
# Default configuration
|
|
366
|
+
DEFAULT_SAMPLING_CONFIG = SamplingConfig()
|
|
367
|
+
|
|
368
|
+
|
|
369
|
+
# =============================================================================
|
|
370
|
+
# Sampling Strategy Protocol
|
|
371
|
+
# =============================================================================
|
|
372
|
+
|
|
373
|
+
|
|
374
|
+
class SamplingStrategy(ABC):
    """Abstract base class for sampling strategies.

    All sampling strategies must implement this interface.
    This enables the Strategy pattern for flexible sampling behavior.

    Example:
        class MyCustomStrategy(SamplingStrategy):
            name = "custom"

            def sample(self, lf, config):
                # Custom sampling logic
                ...
    """

    name: str = "base"

    @abstractmethod
    def sample(
        self,
        lf: pl.LazyFrame,
        config: SamplingConfig,
        total_rows: int | None = None,
    ) -> SamplingResult:
        """Sample data from the LazyFrame.

        Args:
            lf: Source LazyFrame
            config: Sampling configuration
            total_rows: Pre-computed total rows (optional, for efficiency)

        Returns:
            SamplingResult with sampled data and metrics
        """
        pass

    def estimate_row_count(self, lf: pl.LazyFrame) -> int:
        """Return the row count of *lf*.

        The default implementation computes an exact count by collecting
        ``pl.len()``, which can be expensive. Override it when a cheaper
        estimate is available.

        Args:
            lf: LazyFrame to count

        Returns:
            Row count
        """
        return lf.select(pl.len()).collect().item()

    @staticmethod
    def _estimate_margin_of_error(
        original_size: int,
        sample_size: int,
        confidence_level: float,
    ) -> float:
        """Estimate the margin of error of a proportion for a given sample.

        Assumes maximum variance (p = 0.5) and applies the finite population
        correction when the sample is smaller than the population.

        Args:
            original_size: Population size
            sample_size: Number of rows in the sample
            confidence_level: Confidence level the margin should refer to

        Returns:
            Margin of error capped at 1.0; exactly 1.0 when either size is
            not positive
        """
        # Local import: the module does not import ``statistics`` globally.
        from statistics import NormalDist

        if sample_size <= 0 or original_size <= 0:
            return 1.0

        if 0.0 < confidence_level < 1.0:
            # Two-sided z-score for the requested confidence. Rounding to
            # three decimals yields the conventional table values (1.645,
            # 1.96, 2.576, 3.291), so a 95% level still uses exactly 1.96.
            z = round(NormalDist().inv_cdf((1.0 + confidence_level) / 2.0), 3)
        else:
            # Out-of-range level: fall back to the 95% z-score.
            z = 1.96

        p = 0.5  # Maximum variance
        margin = z * math.sqrt(p * (1 - p) / sample_size)

        # Finite population correction. sample_size >= 1 here, so
        # original_size >= 2 inside the branch and the divisor is non-zero.
        if sample_size < original_size:
            margin *= math.sqrt((original_size - sample_size) / (original_size - 1))

        return min(margin, 1.0)

    def _create_metrics(
        self,
        original_size: int,
        sample_size: int,
        config: SamplingConfig,
        sampling_time_ms: float = 0.0,
    ) -> SamplingMetrics:
        """Build the ``SamplingMetrics`` record for a finished sampling step.

        Args:
            original_size: Row count before sampling
            sample_size: Row count after sampling
            config: Configuration whose confidence level is reported and
                used for the margin-of-error estimate
            sampling_time_ms: Time spent sampling, in milliseconds

        Returns:
            Metrics labelled with this strategy's ``name``
        """
        sampling_ratio = sample_size / original_size if original_size > 0 else 0.0

        # The margin is computed for the same confidence level that is
        # reported, so the two values are consistent with each other.
        margin = self._estimate_margin_of_error(
            original_size, sample_size, config.confidence_level
        )

        # Estimate memory saved (rough: 100 bytes per row average)
        rows_saved = original_size - sample_size
        memory_saved_mb = (rows_saved * 100) / (1024 * 1024)

        return SamplingMetrics(
            original_size=original_size,
            sample_size=sample_size,
            sampling_ratio=sampling_ratio,
            confidence_level=config.confidence_level,
            margin_of_error=margin,
            strategy_used=self.name,
            sampling_time_ms=sampling_time_ms,
            # Float literal keeps the field a float when nothing was saved.
            memory_saved_estimate_mb=max(0.0, memory_saved_mb),
        )
|
|
461
|
+
|
|
462
|
+
|
|
463
|
+
# =============================================================================
|
|
464
|
+
# Concrete Sampling Strategies
|
|
465
|
+
# =============================================================================
|
|
466
|
+
|
|
467
|
+
|
|
468
|
+
class NoSamplingStrategy(SamplingStrategy):
    """Pass-through strategy: the complete dataset is used as the sample.

    Intended for cases where accuracy matters most and memory is not a
    limiting factor.
    """

    name = "none"

    def sample(
        self,
        lf: pl.LazyFrame,
        config: SamplingConfig,
        total_rows: int | None = None,
    ) -> SamplingResult:
        """Wrap *lf* unchanged in a result flagged as not sampled.

        Args:
            lf: Source LazyFrame, returned as-is
            config: Sampling configuration (used for the metrics only)
            total_rows: Known row count; counted via ``estimate_row_count``
                when omitted

        Returns:
            SamplingResult whose original and sample sizes are equal
        """
        started_at = time.perf_counter()

        # Counting is the only potentially costly step of this strategy.
        row_count = self.estimate_row_count(lf) if total_rows is None else total_rows

        duration_ms = (time.perf_counter() - started_at) * 1000

        full_metrics = self._create_metrics(
            original_size=row_count,
            sample_size=row_count,
            config=config,
            sampling_time_ms=duration_ms,
        )
        return SamplingResult(data=lf, metrics=full_metrics, is_sampled=False)
|
|
500
|
+
|
|
501
|
+
|
|
502
|
+
class HeadSamplingStrategy(SamplingStrategy):
    """Strategy that keeps only the leading rows of the dataset.

    This is the cheapest way to sample, but the result is only
    representative when the row order carries no bias.
    """

    name = "head"

    def sample(
        self,
        lf: pl.LazyFrame,
        config: SamplingConfig,
        total_rows: int | None = None,
    ) -> SamplingResult:
        """Return the first N rows, with N derived from *config*.

        Args:
            lf: Source LazyFrame
            config: Sampling configuration supplying the required sample
                size and the optional ``max_rows`` cap
            total_rows: Known row count; counted via ``estimate_row_count``
                when omitted

        Returns:
            SamplingResult holding ``lf.head(N)``, or the unsampled result
            of ``NoSamplingStrategy`` when N covers the whole dataset
        """
        started_at = time.perf_counter()

        row_count = total_rows if total_rows is not None else self.estimate_row_count(lf)

        # Required size from the config, additionally capped by max_rows
        # whenever a positive cap is configured.
        wanted = config.calculate_required_sample_size(row_count)
        row_cap = config.max_rows
        if row_cap > 0 and row_cap < wanted:
            wanted = row_cap

        # The sample would contain everything: delegate to the pass-through.
        if wanted >= row_count:
            return NoSamplingStrategy().sample(lf, config, row_count)

        head_lf = lf.head(wanted)
        duration_ms = (time.perf_counter() - started_at) * 1000

        head_metrics = self._create_metrics(
            original_size=row_count,
            sample_size=wanted,
            config=config,
            sampling_time_ms=duration_ms,
        )
        return SamplingResult(data=head_lf, metrics=head_metrics, is_sampled=True)
|
|
546
|
+
|
|
547
|
+
|
|
548
|
+
class RandomSamplingStrategy(SamplingStrategy):
    """Simple random sampling implemented as a lazy, hash-based row filter.

    Rows are selected by hashing a temporary row index with a seed, so the
    frame never has to be collected. The selection is unbiased, but rare
    patterns are not guaranteed to survive.
    """

    name = "random"

    def sample(
        self,
        lf: pl.LazyFrame,
        config: SamplingConfig,
        total_rows: int | None = None,
    ) -> SamplingResult:
        """Keep roughly N pseudo-randomly chosen rows.

        Args:
            lf: Source LazyFrame.
            config: Sampling configuration (target size, ``max_rows``, seed).
            total_rows: Known row count; estimated from ``lf`` when omitted.

        Returns:
            SamplingResult with the filtered frame. The reported
            ``sample_size`` is the target; the realised row count depends
            on how the hashes fall.
        """
        start_time = time.perf_counter()

        row_total = self.estimate_row_count(lf) if total_rows is None else total_rows

        # Target size requested by the config, capped by max_rows if enabled.
        target = config.calculate_required_sample_size(row_total)
        if config.max_rows > 0 and config.max_rows < target:
            target = config.max_rows

        # The target already covers every row: return the data unsampled.
        if target >= row_total:
            return NoSamplingStrategy().sample(lf, config, row_total)

        keep_fraction = target / row_total

        # A configured seed makes the selection reproducible; otherwise a
        # fresh one is drawn for this call.
        seed = config.seed
        if seed is None:
            seed = random.randint(0, 2**32 - 1)

        # Hash buckets run 0..9999; rows whose bucket falls below the
        # threshold are kept. The floor of 1 stops tiny fractions from
        # selecting nothing at all.
        threshold = max(1, int(keep_fraction * 10000))
        indexed_lf = lf.with_row_index("__sample_idx")
        selected_lf = indexed_lf.filter(
            pl.col("__sample_idx").hash(seed) % 10000 < threshold
        )
        sampled_lf = selected_lf.drop("__sample_idx")

        duration_ms = (time.perf_counter() - start_time) * 1000

        # Reported size is the target, bounded by the population.
        reported_size = min(target, row_total)

        metrics = self._create_metrics(
            original_size=row_total,
            sample_size=reported_size,
            config=config,
            sampling_time_ms=duration_ms,
        )
        return SamplingResult(data=sampled_lf, metrics=metrics, is_sampled=True)
|
|
609
|
+
|
|
610
|
+
|
|
611
|
+
class SystematicSamplingStrategy(SamplingStrategy):
    """Strategy for systematic sampling (every Nth row).

    Efficient and ensures even coverage across data.
    May miss periodic patterns if data has periodicity.
    """

    name = "systematic"

    def sample(
        self,
        lf: pl.LazyFrame,
        config: SamplingConfig,
        total_rows: int | None = None,
    ) -> SamplingResult:
        """Take every Nth row, starting from a seeded random offset.

        Args:
            lf: Source LazyFrame.
            config: Sampling configuration (target size, ``max_rows``, seed).
            total_rows: Known row count; estimated from ``lf`` when omitted.

        Returns:
            SamplingResult with the thinned frame. Because the interval is
            floored, the frame can hold slightly more rows than the target;
            the reported ``sample_size`` is capped at the target.
        """
        start_time = time.perf_counter()

        if total_rows is None:
            total_rows = self.estimate_row_count(lf)

        # Calculate sample size and interval
        sample_size = config.calculate_required_sample_size(total_rows)
        if config.max_rows > 0:
            sample_size = min(sample_size, config.max_rows)

        if sample_size >= total_rows:
            return NoSamplingStrategy().sample(lf, config, total_rows)

        # Calculate sampling interval
        interval = max(1, total_rows // sample_size)

        # Random start offset for unbiased sampling. A private Random
        # instance yields the same offset as seeding the module-level
        # generator would, without clobbering the process-wide RNG state
        # that unrelated code may depend on.
        seed = config.seed if config.seed is not None else random.randint(0, 2**32 - 1)
        offset = random.Random(seed).randint(0, interval - 1)

        # Apply systematic sampling. Since 0 <= offset < interval,
        # "idx % interval == offset" selects exactly the rows offset,
        # offset + interval, ...; unlike "(idx - offset) % interval == 0"
        # it never subtracts from the unsigned row index, so rows before
        # the offset cannot underflow and match by accident.
        sampled_lf = (
            lf.with_row_index("__sample_idx")
            .filter(pl.col("__sample_idx") % interval == offset)
            .drop("__sample_idx")
        )

        elapsed_ms = (time.perf_counter() - start_time) * 1000
        # Number of indices in [0, total_rows) congruent to offset.
        actual_sample_size = (total_rows - offset + interval - 1) // interval

        return SamplingResult(
            data=sampled_lf,
            metrics=self._create_metrics(
                original_size=total_rows,
                sample_size=min(actual_sample_size, sample_size),
                config=config,
                sampling_time_ms=elapsed_ms,
            ),
            is_sampled=True,
        )
|
|
668
|
+
|
|
669
|
+
|
|
670
|
+
class HashSamplingStrategy(SamplingStrategy):
    """Deterministic sampling driven by a seeded hash.

    Hashes either a chosen column or a temporary row index and keeps the
    rows whose hash bucket falls under a threshold, so repeated runs with
    the same seed select the same rows.
    """

    name = "hash"

    def __init__(self, hash_column: str | None = None):
        """Create the strategy.

        Args:
            hash_column: Name of the column to hash; when empty or None a
                temporary row index is hashed instead.
        """
        self.hash_column = hash_column

    def sample(
        self,
        lf: pl.LazyFrame,
        config: SamplingConfig,
        total_rows: int | None = None,
    ) -> SamplingResult:
        """Filter rows by hash bucket.

        Args:
            lf: Source LazyFrame.
            config: Sampling configuration (target size, ``max_rows``, seed).
            total_rows: Known row count; estimated from ``lf`` when omitted.

        Returns:
            SamplingResult with the filtered frame; the reported
            ``sample_size`` is the target size.
        """
        start_time = time.perf_counter()

        row_total = self.estimate_row_count(lf) if total_rows is None else total_rows

        target = config.calculate_required_sample_size(row_total)
        if config.max_rows > 0 and config.max_rows < target:
            target = config.max_rows

        if target >= row_total:
            return NoSamplingStrategy().sample(lf, config, row_total)

        # Buckets run 0..9999; the floor of 1 keeps very small ratios from
        # producing an empty selection.
        threshold = max(1, int((target / row_total) * 10000))

        # Fixed default seed so the selection is stable without configuration.
        seed = 42 if config.seed is None else config.seed

        if self.hash_column:
            # Hash the caller-chosen column directly.
            in_bucket = pl.col(self.hash_column).hash(seed) % 10000 < threshold
            sampled_lf = lf.filter(in_bucket)
        else:
            # No column given: hash a temporary row index and drop it again.
            in_bucket = pl.col("__hash_idx").hash(seed) % 10000 < threshold
            sampled_lf = lf.with_row_index("__hash_idx").filter(in_bucket).drop("__hash_idx")

        duration_ms = (time.perf_counter() - start_time) * 1000

        metrics = self._create_metrics(
            original_size=row_total,
            sample_size=target,
            config=config,
            sampling_time_ms=duration_ms,
        )
        return SamplingResult(data=sampled_lf, metrics=metrics, is_sampled=True)
|
|
736
|
+
|
|
737
|
+
|
|
738
|
+
class StratifiedSamplingStrategy(SamplingStrategy):
    """Strategy for stratified sampling.

    Preserves distribution of a stratification column.
    Useful when data has important categorical groupings.
    """

    name = "stratified"

    def __init__(self, stratify_column: str | None = None):
        """Initialize stratified sampling.

        Args:
            stratify_column: Column to stratify by. When empty or None,
                sampling falls back to RandomSamplingStrategy.
        """
        self.stratify_column = stratify_column

    def sample(
        self,
        lf: pl.LazyFrame,
        config: SamplingConfig,
        total_rows: int | None = None,
    ) -> SamplingResult:
        """Stratified sampling preserving group proportions.

        Each stratum (distinct value of ``stratify_column``) contributes
        ``ceil(fraction * stratum_size)`` rows, chosen in seeded pseudo-random
        order. Every stratum therefore keeps at least one row, and the
        realised row count can exceed the target by up to one row per
        stratum; the reported ``sample_size`` is the target.

        Args:
            lf: Source LazyFrame.
            config: Sampling configuration (target size, ``max_rows``, seed).
            total_rows: Known row count; estimated from ``lf`` when omitted.

        Returns:
            SamplingResult with the stratified sample, the untouched frame
            when the target covers every row, or a random sample when no
            stratification column is configured.
        """
        start_time = time.perf_counter()

        if total_rows is None:
            total_rows = self.estimate_row_count(lf)

        sample_size = config.calculate_required_sample_size(total_rows)
        if config.max_rows > 0:
            sample_size = min(sample_size, config.max_rows)

        if sample_size >= total_rows:
            return NoSamplingStrategy().sample(lf, config, total_rows)

        if not self.stratify_column:
            # Fallback to random sampling if no stratify column
            return RandomSamplingStrategy().sample(lf, config, total_rows)

        fraction = sample_size / total_rows
        seed = config.seed if config.seed is not None else random.randint(0, 2**32 - 1)
        stratum = self.stratify_column

        # Sample within each stratum: order the rows of every stratum by a
        # seeded hash of their row index, then keep the leading share.
        # Ranking per stratum (instead of thinning on a global hash) is what
        # guarantees each group is represented in proportion to its size.
        sampled_lf = (
            lf.with_row_index("__strat_idx")
            .with_columns(
                pl.col("__strat_idx")
                .hash(seed)
                .rank(method="ordinal")
                .over(stratum)
                .alias("__strat_rank"),
                pl.len().over(stratum).alias("__strat_len"),
            )
            .filter(
                pl.col("__strat_rank") <= (pl.col("__strat_len") * fraction).ceil()
            )
            .drop(["__strat_idx", "__strat_rank", "__strat_len"])
        )

        elapsed_ms = (time.perf_counter() - start_time) * 1000

        return SamplingResult(
            data=sampled_lf,
            metrics=self._create_metrics(
                original_size=total_rows,
                sample_size=sample_size,
                config=config,
                sampling_time_ms=elapsed_ms,
            ),
            is_sampled=True,
        )
|
|
805
|
+
|
|
806
|
+
|
|
807
|
+
class ReservoirSamplingStrategy(SamplingStrategy):
    """Reservoir-style sampling approximated with hash priorities.

    Each row receives a priority derived from a seeded hash of its row
    index, and the rows that sort first by that priority are kept. This
    approximates reservoir sampling rather than implementing the
    streaming algorithm itself.
    """

    name = "reservoir"

    def sample(
        self,
        lf: pl.LazyFrame,
        config: SamplingConfig,
        total_rows: int | None = None,
    ) -> SamplingResult:
        """Keep the N rows with the best hash-derived priority.

        Args:
            lf: Source LazyFrame.
            config: Sampling configuration (target size, ``max_rows``, seed).
            total_rows: Known row count; estimated from ``lf`` when omitted.

        Returns:
            SamplingResult with N rows, or the untouched frame when N
            already covers every row.
        """
        start_time = time.perf_counter()

        row_total = self.estimate_row_count(lf) if total_rows is None else total_rows

        target = config.calculate_required_sample_size(row_total)
        if config.max_rows > 0 and config.max_rows < target:
            target = config.max_rows

        if target >= row_total:
            return NoSamplingStrategy().sample(lf, config, row_total)

        seed = config.seed
        if seed is None:
            seed = random.randint(0, 2**32 - 1)

        # Priority is the negated log of the row-index hash; an ascending
        # sort followed by head(target) keeps the rows with the smallest
        # priority values.
        priority = (-pl.col("__res_idx").hash(seed).log()).alias("__priority")
        prioritised_lf = lf.with_row_index("__res_idx").with_columns(priority)
        chosen_lf = prioritised_lf.sort("__priority").head(target)
        sampled_lf = chosen_lf.drop(["__res_idx", "__priority"])

        duration_ms = (time.perf_counter() - start_time) * 1000

        metrics = self._create_metrics(
            original_size=row_total,
            sample_size=target,
            config=config,
            sampling_time_ms=duration_ms,
        )
        return SamplingResult(data=sampled_lf, metrics=metrics, is_sampled=True)
|
|
864
|
+
|
|
865
|
+
|
|
866
|
+
class AdaptiveSamplingStrategy(SamplingStrategy):
    """Meta-strategy that picks a concrete strategy from the dataset size.

    The row count is compared against the small / medium / large dataset
    thresholds of the config and the work is delegated to the matching
    sub-strategy. Described upstream as the recommended default.
    """

    name = "adaptive"

    def __init__(self) -> None:
        """Instantiate the sub-strategies this class can delegate to."""
        self._strategies: dict[str, SamplingStrategy] = {
            "none": NoSamplingStrategy(),
            "head": HeadSamplingStrategy(),
            "random": RandomSamplingStrategy(),
            "systematic": SystematicSamplingStrategy(),
            "reservoir": ReservoirSamplingStrategy(),
        }

    def sample(
        self,
        lf: pl.LazyFrame,
        config: SamplingConfig,
        total_rows: int | None = None,
    ) -> SamplingResult:
        """Choose a sub-strategy for the data size and run it.

        Args:
            lf: Source LazyFrame.
            config: Sampling configuration (thresholds, target size, seed).
            total_rows: Known row count; estimated from ``lf`` when omitted.

        Returns:
            The delegate's result, re-wrapped so that ``strategy_used``
            reads ``adaptive(<delegate name>)`` and the timing covers the
            selection step as well.
        """
        start_time = time.perf_counter()

        row_total = self.estimate_row_count(lf) if total_rows is None else total_rows

        delegate = self._select_strategy(row_total, config)

        logger.debug(
            f"Adaptive sampling selected '{delegate.name}' "
            f"for {row_total:,} rows"
        )

        inner = delegate.sample(lf, config, row_total)
        inner_metrics = inner.metrics

        duration_ms = (time.perf_counter() - start_time) * 1000

        # Copy the delegate's metrics, replacing only the strategy label
        # and the elapsed time.
        metrics = SamplingMetrics(
            original_size=inner_metrics.original_size,
            sample_size=inner_metrics.sample_size,
            sampling_ratio=inner_metrics.sampling_ratio,
            confidence_level=inner_metrics.confidence_level,
            margin_of_error=inner_metrics.margin_of_error,
            strategy_used=f"adaptive({delegate.name})",
            sampling_time_ms=duration_ms,
            memory_saved_estimate_mb=inner_metrics.memory_saved_estimate_mb,
        )
        return SamplingResult(
            data=inner.data,
            metrics=metrics,
            is_sampled=inner.is_sampled,
        )

    def _select_strategy(
        self,
        total_rows: int,
        config: SamplingConfig,
    ) -> SamplingStrategy:
        """Map a row count onto one of the registered sub-strategies.

        Size tiers, checked in ascending order:
        up to ``small_dataset_threshold`` -> no sampling,
        up to ``medium_dataset_threshold`` -> systematic,
        up to ``large_dataset_threshold`` -> random,
        anything larger -> reservoir.
        """
        tiers = (
            (config.small_dataset_threshold, "none"),
            (config.medium_dataset_threshold, "systematic"),
            (config.large_dataset_threshold, "random"),
        )
        for upper_bound, key in tiers:
            if total_rows <= upper_bound:
                return self._strategies[key]

        return self._strategies["reservoir"]
|
|
950
|
+
|
|
951
|
+
|
|
952
|
+
# =============================================================================
|
|
953
|
+
# Sampling Strategy Registry
|
|
954
|
+
# =============================================================================
|
|
955
|
+
|
|
956
|
+
|
|
957
|
+
class SamplingStrategyRegistry:
    """Name-keyed collection of sampling strategies.

    Built-in strategies are registered on construction; custom ones can
    be added and are then retrievable by their ``name``.

    Example:
        registry = SamplingStrategyRegistry()
        registry.register(MyCustomStrategy())
        strategy = registry.get("custom")
    """

    def __init__(self) -> None:
        self._strategies: dict[str, SamplingStrategy] = {}
        # Re-entrant lock guarding every access to the strategy mapping.
        self._lock = threading.RLock()
        self._register_defaults()

    def _register_defaults(self) -> None:
        """Register one instance of every built-in strategy."""
        builtin_types = (
            NoSamplingStrategy,
            HeadSamplingStrategy,
            RandomSamplingStrategy,
            SystematicSamplingStrategy,
            HashSamplingStrategy,
            StratifiedSamplingStrategy,
            ReservoirSamplingStrategy,
            AdaptiveSamplingStrategy,
        )
        for strategy_type in builtin_types:
            self.register(strategy_type())

    def register(self, strategy: SamplingStrategy) -> None:
        """Store ``strategy`` under its ``name``, replacing any previous entry."""
        key = strategy.name
        with self._lock:
            self._strategies[key] = strategy
            logger.debug(f"Registered sampling strategy: {strategy.name}")

    def get(self, name: str) -> SamplingStrategy:
        """Look up a strategy by name.

        Args:
            name: Strategy name

        Returns:
            The strategy registered under ``name``

        Raises:
            KeyError: If no strategy is registered under ``name``; the
                message lists the available names.
        """
        with self._lock:
            if name in self._strategies:
                return self._strategies[name]

            known = list(self._strategies.keys())
            raise KeyError(
                f"Unknown sampling strategy: '{name}'. "
                f"Available: {known}"
            )

    def get_or_default(
        self,
        name: str,
        default: SamplingStrategy | None = None,
    ) -> SamplingStrategy:
        """Look up a strategy, falling back instead of raising.

        Returns ``default`` when the name is unknown, or a new
        AdaptiveSamplingStrategy when no usable default was supplied.
        """
        try:
            return self.get(name)
        except KeyError:
            if default:
                return default
            return AdaptiveSamplingStrategy()

    def list_strategies(self) -> list[str]:
        """Return the names of all registered strategies."""
        with self._lock:
            return [*self._strategies.keys()]

    def create_from_method(self, method: SamplingMethod) -> SamplingStrategy:
        """Resolve a SamplingMethod enum member via its ``value``."""
        strategy_name = method.value
        return self.get(strategy_name)
|
|
1030
|
+
|
|
1031
|
+
|
|
1032
|
+
# Global registry instance
|
|
1033
|
+
# Shared module-level registry, pre-populated with the built-in strategies
# by SamplingStrategyRegistry.__init__; Sampler uses it whenever no registry
# is passed to its constructor.
sampling_strategy_registry = SamplingStrategyRegistry()
|
|
1034
|
+
|
|
1035
|
+
|
|
1036
|
+
# =============================================================================
|
|
1037
|
+
# Data Size Estimator
|
|
1038
|
+
# =============================================================================
|
|
1039
|
+
|
|
1040
|
+
|
|
1041
|
+
class DataSizeEstimator:
    """Size estimation helpers used when deciding how to sample.

    All helpers are static. The row count is currently exact; the
    per-row memory figure is extrapolated from the leading rows.
    """

    @staticmethod
    def estimate_row_count(lf: pl.LazyFrame) -> int:
        """Return the number of rows in ``lf``.

        Args:
            lf: LazyFrame to measure

        Returns:
            Row count, obtained by collecting a length aggregation
        """
        # Exact count for now; a cheaper metadata-based estimate (e.g. for
        # parquet) is a possible future improvement.
        length_frame = lf.select(pl.len()).collect()
        return length_frame.item()

    @staticmethod
    def estimate_memory_bytes(lf: pl.LazyFrame, sample_rows: int = 1000) -> int:
        """Estimate the memory footprint of a single row.

        Args:
            lf: LazyFrame to measure
            sample_rows: How many leading rows to collect for the estimate

        Returns:
            Estimated bytes per row; 0 for an empty frame, or a default
            of 100 when the estimate cannot be computed.
        """
        try:
            probe = lf.head(sample_rows).collect()
            probe_rows = len(probe)
            if probe_rows == 0:
                return 0

            return probe.estimated_size() // probe_rows
        except Exception:
            # Best effort: assume 100 bytes per row when probing fails.
            return 100

    @staticmethod
    def estimate_total_memory_mb(
        lf: pl.LazyFrame,
        row_count: int | None = None,
    ) -> float:
        """Estimate the memory needed for the complete data.

        Args:
            lf: LazyFrame to measure
            row_count: Row count if already known; computed otherwise

        Returns:
            Estimated size in MB (row count times bytes per row)
        """
        rows = DataSizeEstimator.estimate_row_count(lf) if row_count is None else row_count
        per_row = DataSizeEstimator.estimate_memory_bytes(lf)
        return (rows * per_row) / (1024 * 1024)
|
|
1104
|
+
|
|
1105
|
+
|
|
1106
|
+
# =============================================================================
|
|
1107
|
+
# Sampler (Main Interface)
|
|
1108
|
+
# =============================================================================
|
|
1109
|
+
|
|
1110
|
+
|
|
1111
|
+
class Sampler:
    """Front door for sampling a LazyFrame.

    Resolves the configured strategy through a registry, measures the
    data once, runs the strategy and falls back to the configured
    fallback strategy if the primary one raises.

    Example:
        sampler = Sampler(SamplingConfig.for_accuracy("high"))
        result = sampler.sample(lf)

        print(f"Sampled {result.metrics.sample_size:,} of "
              f"{result.metrics.original_size:,} rows")
        print(f"Strategy: {result.metrics.strategy_used}")
    """

    def __init__(
        self,
        config: SamplingConfig | None = None,
        registry: SamplingStrategyRegistry | None = None,
    ):
        """Create a sampler.

        Args:
            config: Sampling configuration; the module default is used
                when omitted
            registry: Strategy registry; the global registry is used when
                omitted
        """
        self.config = config if config else DEFAULT_SAMPLING_CONFIG
        self.registry = registry if registry else sampling_strategy_registry
        self._size_estimator = DataSizeEstimator()

    def sample(
        self,
        lf: pl.LazyFrame,
        config: SamplingConfig | None = None,
    ) -> SamplingResult:
        """Sample rows from ``lf``.

        Args:
            lf: Source LazyFrame
            config: Configuration for this call only; the sampler's own
                configuration is used when omitted

        Returns:
            SamplingResult with sampled data and metrics
        """
        active_config = config if config else self.config

        # Resolve the strategy first, then measure the data once so the
        # strategy (and a possible fallback) can reuse the row count.
        strategy = self.registry.create_from_method(active_config.strategy)
        row_total = self._size_estimator.estimate_row_count(lf)

        try:
            return strategy.sample(lf, active_config, row_total)
        except Exception as exc:
            logger.warning(
                f"Sampling strategy '{strategy.name}' failed: {exc}. "
                f"Falling back to '{active_config.fallback_strategy.value}'"
            )
            # A failure of the fallback strategy propagates to the caller.
            fallback = self.registry.create_from_method(active_config.fallback_strategy)
            return fallback.sample(lf, active_config, row_total)

    def sample_column(
        self,
        lf: pl.LazyFrame,
        column: str,
        config: SamplingConfig | None = None,
    ) -> SamplingResult:
        """Sample a single column of ``lf``.

        Args:
            lf: Source LazyFrame
            column: Name of the column to sample
            config: Configuration for this call only

        Returns:
            SamplingResult whose data contains only ``column``
        """
        # Project down to the one column before sampling so the rest of
        # the frame is never carried along.
        single_column_lf = lf.select(pl.col(column))
        return self.sample(single_column_lf, config)
|
|
1194
|
+
|
|
1195
|
+
|
|
1196
|
+
# =============================================================================
|
|
1197
|
+
# Convenience Functions
|
|
1198
|
+
# =============================================================================
|
|
1199
|
+
|
|
1200
|
+
|
|
1201
|
+
def create_sampler(
    strategy: str | SamplingMethod = "adaptive",
    max_rows: int = 100_000,
    confidence_level: float = 0.95,
    **kwargs: Any,
) -> Sampler:
    """Build a Sampler from a handful of common options.

    Args:
        strategy: Strategy given as a SamplingMethod member or its string value
        max_rows: Upper bound on the number of sampled rows
        confidence_level: Statistical confidence level
        **kwargs: Extra keyword arguments forwarded to SamplingConfig

    Returns:
        Sampler bound to the resulting configuration

    Example:
        sampler = create_sampler(strategy="random", max_rows=50_000)
        result = sampler.sample(lf)
    """
    # String input is converted to the enum; an unknown value makes the
    # enum constructor raise.
    method = SamplingMethod(strategy) if isinstance(strategy, str) else strategy

    return Sampler(
        SamplingConfig(
            strategy=method,
            max_rows=max_rows,
            confidence_level=confidence_level,
            **kwargs,
        )
    )
|
|
1233
|
+
|
|
1234
|
+
|
|
1235
|
+
def sample_data(
    lf: pl.LazyFrame,
    max_rows: int = 100_000,
    strategy: str = "adaptive",
) -> SamplingResult:
    """One-call shortcut: build a sampler and sample ``lf`` with it.

    Args:
        lf: LazyFrame to sample
        max_rows: Upper bound on the number of sampled rows
        strategy: Name of the strategy to use

    Returns:
        SamplingResult

    Example:
        result = sample_data(lf, max_rows=50_000)
        sampled_lf = result.data
    """
    return create_sampler(strategy=strategy, max_rows=max_rows).sample(lf)
|
|
1256
|
+
|
|
1257
|
+
|
|
1258
|
+
def calculate_sample_size(
    population_size: int,
    confidence_level: float = 0.95,
    margin_of_error: float = 0.05,
    min_sample_size: int = 1,
) -> int:
    """Compute the sample size needed for the given statistical targets.

    Thin wrapper around ``SamplingConfig.calculate_required_sample_size``,
    which is described upstream as Cochran's formula with finite
    population correction. With the default ``min_sample_size`` of 1 the
    result is intended to be the plain statistical figure, without an
    additional minimum-size floor.

    Args:
        population_size: Total population
        confidence_level: Confidence level (0-1)
        margin_of_error: Margin of error (0-1)
        min_sample_size: Minimum sample size passed to the config

    Returns:
        Required sample size

    Example:
        n = calculate_sample_size(1_000_000, confidence_level=0.99)
        print(f"Need {n:,} samples for 99% confidence")
    """
    sizing_config = SamplingConfig(
        confidence_level=confidence_level,
        margin_of_error=margin_of_error,
        min_sample_size=min_sample_size,
    )
    required = sizing_config.calculate_required_sample_size(population_size)
    return required
|