truthound 1.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (877)
  1. truthound/__init__.py +162 -0
  2. truthound/adapters.py +100 -0
  3. truthound/api.py +365 -0
  4. truthound/audit/__init__.py +248 -0
  5. truthound/audit/core.py +967 -0
  6. truthound/audit/filters.py +620 -0
  7. truthound/audit/formatters.py +707 -0
  8. truthound/audit/logger.py +902 -0
  9. truthound/audit/middleware.py +571 -0
  10. truthound/audit/storage.py +1083 -0
  11. truthound/benchmark/__init__.py +123 -0
  12. truthound/benchmark/base.py +757 -0
  13. truthound/benchmark/comparison.py +635 -0
  14. truthound/benchmark/generators.py +706 -0
  15. truthound/benchmark/reporters.py +718 -0
  16. truthound/benchmark/runner.py +635 -0
  17. truthound/benchmark/scenarios.py +712 -0
  18. truthound/cache.py +252 -0
  19. truthound/checkpoint/__init__.py +136 -0
  20. truthound/checkpoint/actions/__init__.py +164 -0
  21. truthound/checkpoint/actions/base.py +324 -0
  22. truthound/checkpoint/actions/custom.py +234 -0
  23. truthound/checkpoint/actions/discord_notify.py +290 -0
  24. truthound/checkpoint/actions/email_notify.py +405 -0
  25. truthound/checkpoint/actions/github_action.py +406 -0
  26. truthound/checkpoint/actions/opsgenie.py +1499 -0
  27. truthound/checkpoint/actions/pagerduty.py +226 -0
  28. truthound/checkpoint/actions/slack_notify.py +233 -0
  29. truthound/checkpoint/actions/store_result.py +249 -0
  30. truthound/checkpoint/actions/teams_notify.py +1570 -0
  31. truthound/checkpoint/actions/telegram_notify.py +419 -0
  32. truthound/checkpoint/actions/update_docs.py +552 -0
  33. truthound/checkpoint/actions/webhook.py +293 -0
  34. truthound/checkpoint/analytics/__init__.py +147 -0
  35. truthound/checkpoint/analytics/aggregations/__init__.py +23 -0
  36. truthound/checkpoint/analytics/aggregations/rollup.py +481 -0
  37. truthound/checkpoint/analytics/aggregations/time_bucket.py +306 -0
  38. truthound/checkpoint/analytics/analyzers/__init__.py +17 -0
  39. truthound/checkpoint/analytics/analyzers/anomaly.py +386 -0
  40. truthound/checkpoint/analytics/analyzers/base.py +270 -0
  41. truthound/checkpoint/analytics/analyzers/forecast.py +421 -0
  42. truthound/checkpoint/analytics/analyzers/trend.py +314 -0
  43. truthound/checkpoint/analytics/models.py +292 -0
  44. truthound/checkpoint/analytics/protocols.py +549 -0
  45. truthound/checkpoint/analytics/service.py +718 -0
  46. truthound/checkpoint/analytics/stores/__init__.py +16 -0
  47. truthound/checkpoint/analytics/stores/base.py +306 -0
  48. truthound/checkpoint/analytics/stores/memory_store.py +353 -0
  49. truthound/checkpoint/analytics/stores/sqlite_store.py +557 -0
  50. truthound/checkpoint/analytics/stores/timescale_store.py +501 -0
  51. truthound/checkpoint/async_actions.py +794 -0
  52. truthound/checkpoint/async_base.py +708 -0
  53. truthound/checkpoint/async_checkpoint.py +617 -0
  54. truthound/checkpoint/async_runner.py +639 -0
  55. truthound/checkpoint/checkpoint.py +527 -0
  56. truthound/checkpoint/ci/__init__.py +61 -0
  57. truthound/checkpoint/ci/detector.py +355 -0
  58. truthound/checkpoint/ci/reporter.py +436 -0
  59. truthound/checkpoint/ci/templates.py +454 -0
  60. truthound/checkpoint/circuitbreaker/__init__.py +133 -0
  61. truthound/checkpoint/circuitbreaker/breaker.py +542 -0
  62. truthound/checkpoint/circuitbreaker/core.py +252 -0
  63. truthound/checkpoint/circuitbreaker/detection.py +459 -0
  64. truthound/checkpoint/circuitbreaker/middleware.py +389 -0
  65. truthound/checkpoint/circuitbreaker/registry.py +357 -0
  66. truthound/checkpoint/distributed/__init__.py +139 -0
  67. truthound/checkpoint/distributed/backends/__init__.py +35 -0
  68. truthound/checkpoint/distributed/backends/celery_backend.py +503 -0
  69. truthound/checkpoint/distributed/backends/kubernetes_backend.py +696 -0
  70. truthound/checkpoint/distributed/backends/local_backend.py +397 -0
  71. truthound/checkpoint/distributed/backends/ray_backend.py +625 -0
  72. truthound/checkpoint/distributed/base.py +774 -0
  73. truthound/checkpoint/distributed/orchestrator.py +765 -0
  74. truthound/checkpoint/distributed/protocols.py +842 -0
  75. truthound/checkpoint/distributed/registry.py +449 -0
  76. truthound/checkpoint/idempotency/__init__.py +120 -0
  77. truthound/checkpoint/idempotency/core.py +295 -0
  78. truthound/checkpoint/idempotency/fingerprint.py +454 -0
  79. truthound/checkpoint/idempotency/locking.py +604 -0
  80. truthound/checkpoint/idempotency/service.py +592 -0
  81. truthound/checkpoint/idempotency/stores.py +653 -0
  82. truthound/checkpoint/monitoring/__init__.py +134 -0
  83. truthound/checkpoint/monitoring/aggregators/__init__.py +15 -0
  84. truthound/checkpoint/monitoring/aggregators/base.py +372 -0
  85. truthound/checkpoint/monitoring/aggregators/realtime.py +300 -0
  86. truthound/checkpoint/monitoring/aggregators/window.py +493 -0
  87. truthound/checkpoint/monitoring/collectors/__init__.py +17 -0
  88. truthound/checkpoint/monitoring/collectors/base.py +257 -0
  89. truthound/checkpoint/monitoring/collectors/memory_collector.py +617 -0
  90. truthound/checkpoint/monitoring/collectors/prometheus_collector.py +451 -0
  91. truthound/checkpoint/monitoring/collectors/redis_collector.py +518 -0
  92. truthound/checkpoint/monitoring/events.py +410 -0
  93. truthound/checkpoint/monitoring/protocols.py +636 -0
  94. truthound/checkpoint/monitoring/service.py +578 -0
  95. truthound/checkpoint/monitoring/views/__init__.py +17 -0
  96. truthound/checkpoint/monitoring/views/base.py +172 -0
  97. truthound/checkpoint/monitoring/views/queue_view.py +220 -0
  98. truthound/checkpoint/monitoring/views/task_view.py +240 -0
  99. truthound/checkpoint/monitoring/views/worker_view.py +263 -0
  100. truthound/checkpoint/registry.py +337 -0
  101. truthound/checkpoint/runner.py +356 -0
  102. truthound/checkpoint/transaction/__init__.py +133 -0
  103. truthound/checkpoint/transaction/base.py +389 -0
  104. truthound/checkpoint/transaction/compensatable.py +537 -0
  105. truthound/checkpoint/transaction/coordinator.py +576 -0
  106. truthound/checkpoint/transaction/executor.py +622 -0
  107. truthound/checkpoint/transaction/idempotency.py +534 -0
  108. truthound/checkpoint/transaction/saga/__init__.py +143 -0
  109. truthound/checkpoint/transaction/saga/builder.py +584 -0
  110. truthound/checkpoint/transaction/saga/definition.py +515 -0
  111. truthound/checkpoint/transaction/saga/event_store.py +542 -0
  112. truthound/checkpoint/transaction/saga/patterns.py +833 -0
  113. truthound/checkpoint/transaction/saga/runner.py +718 -0
  114. truthound/checkpoint/transaction/saga/state_machine.py +793 -0
  115. truthound/checkpoint/transaction/saga/strategies.py +780 -0
  116. truthound/checkpoint/transaction/saga/testing.py +886 -0
  117. truthound/checkpoint/triggers/__init__.py +58 -0
  118. truthound/checkpoint/triggers/base.py +237 -0
  119. truthound/checkpoint/triggers/event.py +385 -0
  120. truthound/checkpoint/triggers/schedule.py +355 -0
  121. truthound/cli.py +2358 -0
  122. truthound/cli_modules/__init__.py +124 -0
  123. truthound/cli_modules/advanced/__init__.py +45 -0
  124. truthound/cli_modules/advanced/benchmark.py +343 -0
  125. truthound/cli_modules/advanced/docs.py +225 -0
  126. truthound/cli_modules/advanced/lineage.py +209 -0
  127. truthound/cli_modules/advanced/ml.py +320 -0
  128. truthound/cli_modules/advanced/realtime.py +196 -0
  129. truthound/cli_modules/checkpoint/__init__.py +46 -0
  130. truthound/cli_modules/checkpoint/init.py +114 -0
  131. truthound/cli_modules/checkpoint/list.py +71 -0
  132. truthound/cli_modules/checkpoint/run.py +159 -0
  133. truthound/cli_modules/checkpoint/validate.py +67 -0
  134. truthound/cli_modules/common/__init__.py +71 -0
  135. truthound/cli_modules/common/errors.py +414 -0
  136. truthound/cli_modules/common/options.py +419 -0
  137. truthound/cli_modules/common/output.py +507 -0
  138. truthound/cli_modules/common/protocol.py +552 -0
  139. truthound/cli_modules/core/__init__.py +48 -0
  140. truthound/cli_modules/core/check.py +123 -0
  141. truthound/cli_modules/core/compare.py +104 -0
  142. truthound/cli_modules/core/learn.py +57 -0
  143. truthound/cli_modules/core/mask.py +77 -0
  144. truthound/cli_modules/core/profile.py +65 -0
  145. truthound/cli_modules/core/scan.py +61 -0
  146. truthound/cli_modules/profiler/__init__.py +51 -0
  147. truthound/cli_modules/profiler/auto_profile.py +175 -0
  148. truthound/cli_modules/profiler/metadata.py +107 -0
  149. truthound/cli_modules/profiler/suite.py +283 -0
  150. truthound/cli_modules/registry.py +431 -0
  151. truthound/cli_modules/scaffolding/__init__.py +89 -0
  152. truthound/cli_modules/scaffolding/base.py +631 -0
  153. truthound/cli_modules/scaffolding/commands.py +545 -0
  154. truthound/cli_modules/scaffolding/plugins.py +1072 -0
  155. truthound/cli_modules/scaffolding/reporters.py +594 -0
  156. truthound/cli_modules/scaffolding/validators.py +1127 -0
  157. truthound/common/__init__.py +18 -0
  158. truthound/common/resilience/__init__.py +130 -0
  159. truthound/common/resilience/bulkhead.py +266 -0
  160. truthound/common/resilience/circuit_breaker.py +516 -0
  161. truthound/common/resilience/composite.py +332 -0
  162. truthound/common/resilience/config.py +292 -0
  163. truthound/common/resilience/protocols.py +217 -0
  164. truthound/common/resilience/rate_limiter.py +404 -0
  165. truthound/common/resilience/retry.py +341 -0
  166. truthound/datadocs/__init__.py +260 -0
  167. truthound/datadocs/base.py +571 -0
  168. truthound/datadocs/builder.py +761 -0
  169. truthound/datadocs/charts.py +764 -0
  170. truthound/datadocs/dashboard/__init__.py +63 -0
  171. truthound/datadocs/dashboard/app.py +576 -0
  172. truthound/datadocs/dashboard/components.py +584 -0
  173. truthound/datadocs/dashboard/state.py +240 -0
  174. truthound/datadocs/engine/__init__.py +46 -0
  175. truthound/datadocs/engine/context.py +376 -0
  176. truthound/datadocs/engine/pipeline.py +618 -0
  177. truthound/datadocs/engine/registry.py +469 -0
  178. truthound/datadocs/exporters/__init__.py +49 -0
  179. truthound/datadocs/exporters/base.py +198 -0
  180. truthound/datadocs/exporters/html.py +178 -0
  181. truthound/datadocs/exporters/json_exporter.py +253 -0
  182. truthound/datadocs/exporters/markdown.py +284 -0
  183. truthound/datadocs/exporters/pdf.py +392 -0
  184. truthound/datadocs/i18n/__init__.py +86 -0
  185. truthound/datadocs/i18n/catalog.py +960 -0
  186. truthound/datadocs/i18n/formatting.py +505 -0
  187. truthound/datadocs/i18n/loader.py +256 -0
  188. truthound/datadocs/i18n/plurals.py +378 -0
  189. truthound/datadocs/renderers/__init__.py +42 -0
  190. truthound/datadocs/renderers/base.py +401 -0
  191. truthound/datadocs/renderers/custom.py +342 -0
  192. truthound/datadocs/renderers/jinja.py +697 -0
  193. truthound/datadocs/sections.py +736 -0
  194. truthound/datadocs/styles.py +931 -0
  195. truthound/datadocs/themes/__init__.py +101 -0
  196. truthound/datadocs/themes/base.py +336 -0
  197. truthound/datadocs/themes/default.py +417 -0
  198. truthound/datadocs/themes/enterprise.py +419 -0
  199. truthound/datadocs/themes/loader.py +336 -0
  200. truthound/datadocs/themes.py +301 -0
  201. truthound/datadocs/transformers/__init__.py +57 -0
  202. truthound/datadocs/transformers/base.py +268 -0
  203. truthound/datadocs/transformers/enrichers.py +544 -0
  204. truthound/datadocs/transformers/filters.py +447 -0
  205. truthound/datadocs/transformers/i18n.py +468 -0
  206. truthound/datadocs/versioning/__init__.py +62 -0
  207. truthound/datadocs/versioning/diff.py +639 -0
  208. truthound/datadocs/versioning/storage.py +497 -0
  209. truthound/datadocs/versioning/version.py +358 -0
  210. truthound/datasources/__init__.py +223 -0
  211. truthound/datasources/_async_protocols.py +222 -0
  212. truthound/datasources/_protocols.py +159 -0
  213. truthound/datasources/adapters.py +428 -0
  214. truthound/datasources/async_base.py +599 -0
  215. truthound/datasources/async_factory.py +511 -0
  216. truthound/datasources/base.py +516 -0
  217. truthound/datasources/factory.py +433 -0
  218. truthound/datasources/nosql/__init__.py +47 -0
  219. truthound/datasources/nosql/base.py +487 -0
  220. truthound/datasources/nosql/elasticsearch.py +801 -0
  221. truthound/datasources/nosql/mongodb.py +636 -0
  222. truthound/datasources/pandas_optimized.py +582 -0
  223. truthound/datasources/pandas_source.py +216 -0
  224. truthound/datasources/polars_source.py +395 -0
  225. truthound/datasources/spark_source.py +479 -0
  226. truthound/datasources/sql/__init__.py +154 -0
  227. truthound/datasources/sql/base.py +710 -0
  228. truthound/datasources/sql/bigquery.py +410 -0
  229. truthound/datasources/sql/cloud_base.py +199 -0
  230. truthound/datasources/sql/databricks.py +471 -0
  231. truthound/datasources/sql/mysql.py +316 -0
  232. truthound/datasources/sql/oracle.py +427 -0
  233. truthound/datasources/sql/postgresql.py +321 -0
  234. truthound/datasources/sql/redshift.py +479 -0
  235. truthound/datasources/sql/snowflake.py +439 -0
  236. truthound/datasources/sql/sqlite.py +286 -0
  237. truthound/datasources/sql/sqlserver.py +437 -0
  238. truthound/datasources/streaming/__init__.py +47 -0
  239. truthound/datasources/streaming/base.py +350 -0
  240. truthound/datasources/streaming/kafka.py +670 -0
  241. truthound/decorators.py +98 -0
  242. truthound/docs/__init__.py +69 -0
  243. truthound/docs/extractor.py +971 -0
  244. truthound/docs/generator.py +601 -0
  245. truthound/docs/parser.py +1037 -0
  246. truthound/docs/renderer.py +999 -0
  247. truthound/drift/__init__.py +22 -0
  248. truthound/drift/compare.py +189 -0
  249. truthound/drift/detectors.py +464 -0
  250. truthound/drift/report.py +160 -0
  251. truthound/execution/__init__.py +65 -0
  252. truthound/execution/_protocols.py +324 -0
  253. truthound/execution/base.py +576 -0
  254. truthound/execution/distributed/__init__.py +179 -0
  255. truthound/execution/distributed/aggregations.py +731 -0
  256. truthound/execution/distributed/arrow_bridge.py +817 -0
  257. truthound/execution/distributed/base.py +550 -0
  258. truthound/execution/distributed/dask_engine.py +976 -0
  259. truthound/execution/distributed/mixins.py +766 -0
  260. truthound/execution/distributed/protocols.py +756 -0
  261. truthound/execution/distributed/ray_engine.py +1127 -0
  262. truthound/execution/distributed/registry.py +446 -0
  263. truthound/execution/distributed/spark_engine.py +1011 -0
  264. truthound/execution/distributed/validator_adapter.py +682 -0
  265. truthound/execution/pandas_engine.py +401 -0
  266. truthound/execution/polars_engine.py +497 -0
  267. truthound/execution/pushdown/__init__.py +230 -0
  268. truthound/execution/pushdown/ast.py +1550 -0
  269. truthound/execution/pushdown/builder.py +1550 -0
  270. truthound/execution/pushdown/dialects.py +1072 -0
  271. truthound/execution/pushdown/executor.py +829 -0
  272. truthound/execution/pushdown/optimizer.py +1041 -0
  273. truthound/execution/sql_engine.py +518 -0
  274. truthound/infrastructure/__init__.py +189 -0
  275. truthound/infrastructure/audit.py +1515 -0
  276. truthound/infrastructure/config.py +1133 -0
  277. truthound/infrastructure/encryption.py +1132 -0
  278. truthound/infrastructure/logging.py +1503 -0
  279. truthound/infrastructure/metrics.py +1220 -0
  280. truthound/lineage/__init__.py +89 -0
  281. truthound/lineage/base.py +746 -0
  282. truthound/lineage/impact_analysis.py +474 -0
  283. truthound/lineage/integrations/__init__.py +22 -0
  284. truthound/lineage/integrations/openlineage.py +548 -0
  285. truthound/lineage/tracker.py +512 -0
  286. truthound/lineage/visualization/__init__.py +33 -0
  287. truthound/lineage/visualization/protocols.py +145 -0
  288. truthound/lineage/visualization/renderers/__init__.py +20 -0
  289. truthound/lineage/visualization/renderers/cytoscape.py +329 -0
  290. truthound/lineage/visualization/renderers/d3.py +331 -0
  291. truthound/lineage/visualization/renderers/graphviz.py +276 -0
  292. truthound/lineage/visualization/renderers/mermaid.py +308 -0
  293. truthound/maskers.py +113 -0
  294. truthound/ml/__init__.py +124 -0
  295. truthound/ml/anomaly_models/__init__.py +31 -0
  296. truthound/ml/anomaly_models/ensemble.py +362 -0
  297. truthound/ml/anomaly_models/isolation_forest.py +444 -0
  298. truthound/ml/anomaly_models/statistical.py +392 -0
  299. truthound/ml/base.py +1178 -0
  300. truthound/ml/drift_detection/__init__.py +26 -0
  301. truthound/ml/drift_detection/concept.py +381 -0
  302. truthound/ml/drift_detection/distribution.py +361 -0
  303. truthound/ml/drift_detection/feature.py +442 -0
  304. truthound/ml/drift_detection/multivariate.py +495 -0
  305. truthound/ml/monitoring/__init__.py +88 -0
  306. truthound/ml/monitoring/alerting/__init__.py +33 -0
  307. truthound/ml/monitoring/alerting/handlers.py +427 -0
  308. truthound/ml/monitoring/alerting/rules.py +508 -0
  309. truthound/ml/monitoring/collectors/__init__.py +19 -0
  310. truthound/ml/monitoring/collectors/composite.py +105 -0
  311. truthound/ml/monitoring/collectors/drift.py +324 -0
  312. truthound/ml/monitoring/collectors/performance.py +179 -0
  313. truthound/ml/monitoring/collectors/quality.py +369 -0
  314. truthound/ml/monitoring/monitor.py +536 -0
  315. truthound/ml/monitoring/protocols.py +451 -0
  316. truthound/ml/monitoring/stores/__init__.py +15 -0
  317. truthound/ml/monitoring/stores/memory.py +201 -0
  318. truthound/ml/monitoring/stores/prometheus.py +296 -0
  319. truthound/ml/rule_learning/__init__.py +25 -0
  320. truthound/ml/rule_learning/constraint_miner.py +443 -0
  321. truthound/ml/rule_learning/pattern_learner.py +499 -0
  322. truthound/ml/rule_learning/profile_learner.py +462 -0
  323. truthound/multitenancy/__init__.py +326 -0
  324. truthound/multitenancy/core.py +852 -0
  325. truthound/multitenancy/integration.py +597 -0
  326. truthound/multitenancy/isolation.py +630 -0
  327. truthound/multitenancy/manager.py +770 -0
  328. truthound/multitenancy/middleware.py +765 -0
  329. truthound/multitenancy/quota.py +537 -0
  330. truthound/multitenancy/resolvers.py +603 -0
  331. truthound/multitenancy/storage.py +703 -0
  332. truthound/observability/__init__.py +307 -0
  333. truthound/observability/context.py +531 -0
  334. truthound/observability/instrumentation.py +611 -0
  335. truthound/observability/logging.py +887 -0
  336. truthound/observability/metrics.py +1157 -0
  337. truthound/observability/tracing/__init__.py +178 -0
  338. truthound/observability/tracing/baggage.py +310 -0
  339. truthound/observability/tracing/config.py +426 -0
  340. truthound/observability/tracing/exporter.py +787 -0
  341. truthound/observability/tracing/integration.py +1018 -0
  342. truthound/observability/tracing/otel/__init__.py +146 -0
  343. truthound/observability/tracing/otel/adapter.py +982 -0
  344. truthound/observability/tracing/otel/bridge.py +1177 -0
  345. truthound/observability/tracing/otel/compat.py +681 -0
  346. truthound/observability/tracing/otel/config.py +691 -0
  347. truthound/observability/tracing/otel/detection.py +327 -0
  348. truthound/observability/tracing/otel/protocols.py +426 -0
  349. truthound/observability/tracing/processor.py +561 -0
  350. truthound/observability/tracing/propagator.py +757 -0
  351. truthound/observability/tracing/provider.py +569 -0
  352. truthound/observability/tracing/resource.py +515 -0
  353. truthound/observability/tracing/sampler.py +487 -0
  354. truthound/observability/tracing/span.py +676 -0
  355. truthound/plugins/__init__.py +198 -0
  356. truthound/plugins/base.py +599 -0
  357. truthound/plugins/cli.py +680 -0
  358. truthound/plugins/dependencies/__init__.py +42 -0
  359. truthound/plugins/dependencies/graph.py +422 -0
  360. truthound/plugins/dependencies/resolver.py +417 -0
  361. truthound/plugins/discovery.py +379 -0
  362. truthound/plugins/docs/__init__.py +46 -0
  363. truthound/plugins/docs/extractor.py +444 -0
  364. truthound/plugins/docs/renderer.py +499 -0
  365. truthound/plugins/enterprise_manager.py +877 -0
  366. truthound/plugins/examples/__init__.py +19 -0
  367. truthound/plugins/examples/custom_validators.py +317 -0
  368. truthound/plugins/examples/slack_notifier.py +312 -0
  369. truthound/plugins/examples/xml_reporter.py +254 -0
  370. truthound/plugins/hooks.py +558 -0
  371. truthound/plugins/lifecycle/__init__.py +43 -0
  372. truthound/plugins/lifecycle/hot_reload.py +402 -0
  373. truthound/plugins/lifecycle/manager.py +371 -0
  374. truthound/plugins/manager.py +736 -0
  375. truthound/plugins/registry.py +338 -0
  376. truthound/plugins/security/__init__.py +93 -0
  377. truthound/plugins/security/exceptions.py +332 -0
  378. truthound/plugins/security/policies.py +348 -0
  379. truthound/plugins/security/protocols.py +643 -0
  380. truthound/plugins/security/sandbox/__init__.py +45 -0
  381. truthound/plugins/security/sandbox/context.py +158 -0
  382. truthound/plugins/security/sandbox/engines/__init__.py +19 -0
  383. truthound/plugins/security/sandbox/engines/container.py +379 -0
  384. truthound/plugins/security/sandbox/engines/noop.py +144 -0
  385. truthound/plugins/security/sandbox/engines/process.py +336 -0
  386. truthound/plugins/security/sandbox/factory.py +211 -0
  387. truthound/plugins/security/signing/__init__.py +57 -0
  388. truthound/plugins/security/signing/service.py +330 -0
  389. truthound/plugins/security/signing/trust_store.py +368 -0
  390. truthound/plugins/security/signing/verifier.py +459 -0
  391. truthound/plugins/versioning/__init__.py +41 -0
  392. truthound/plugins/versioning/constraints.py +297 -0
  393. truthound/plugins/versioning/resolver.py +329 -0
  394. truthound/profiler/__init__.py +1729 -0
  395. truthound/profiler/_lazy.py +452 -0
  396. truthound/profiler/ab_testing/__init__.py +80 -0
  397. truthound/profiler/ab_testing/analysis.py +449 -0
  398. truthound/profiler/ab_testing/base.py +257 -0
  399. truthound/profiler/ab_testing/experiment.py +395 -0
  400. truthound/profiler/ab_testing/tracking.py +368 -0
  401. truthound/profiler/auto_threshold.py +1170 -0
  402. truthound/profiler/base.py +579 -0
  403. truthound/profiler/cache_patterns.py +911 -0
  404. truthound/profiler/caching.py +1303 -0
  405. truthound/profiler/column_profiler.py +712 -0
  406. truthound/profiler/comparison.py +1007 -0
  407. truthound/profiler/custom_patterns.py +1170 -0
  408. truthound/profiler/dashboard/__init__.py +50 -0
  409. truthound/profiler/dashboard/app.py +476 -0
  410. truthound/profiler/dashboard/components.py +457 -0
  411. truthound/profiler/dashboard/config.py +72 -0
  412. truthound/profiler/distributed/__init__.py +83 -0
  413. truthound/profiler/distributed/base.py +281 -0
  414. truthound/profiler/distributed/dask_backend.py +498 -0
  415. truthound/profiler/distributed/local_backend.py +293 -0
  416. truthound/profiler/distributed/profiler.py +304 -0
  417. truthound/profiler/distributed/ray_backend.py +374 -0
  418. truthound/profiler/distributed/spark_backend.py +375 -0
  419. truthound/profiler/distributed.py +1366 -0
  420. truthound/profiler/enterprise_sampling.py +1065 -0
  421. truthound/profiler/errors.py +488 -0
  422. truthound/profiler/evolution/__init__.py +91 -0
  423. truthound/profiler/evolution/alerts.py +426 -0
  424. truthound/profiler/evolution/changes.py +206 -0
  425. truthound/profiler/evolution/compatibility.py +365 -0
  426. truthound/profiler/evolution/detector.py +372 -0
  427. truthound/profiler/evolution/protocols.py +121 -0
  428. truthound/profiler/generators/__init__.py +48 -0
  429. truthound/profiler/generators/base.py +384 -0
  430. truthound/profiler/generators/ml_rules.py +375 -0
  431. truthound/profiler/generators/pattern_rules.py +384 -0
  432. truthound/profiler/generators/schema_rules.py +267 -0
  433. truthound/profiler/generators/stats_rules.py +324 -0
  434. truthound/profiler/generators/suite_generator.py +857 -0
  435. truthound/profiler/i18n.py +1542 -0
  436. truthound/profiler/incremental.py +554 -0
  437. truthound/profiler/incremental_validation.py +1710 -0
  438. truthound/profiler/integration/__init__.py +73 -0
  439. truthound/profiler/integration/adapters.py +345 -0
  440. truthound/profiler/integration/context.py +371 -0
  441. truthound/profiler/integration/executor.py +527 -0
  442. truthound/profiler/integration/naming.py +75 -0
  443. truthound/profiler/integration/protocols.py +243 -0
  444. truthound/profiler/memory.py +1185 -0
  445. truthound/profiler/migration/__init__.py +60 -0
  446. truthound/profiler/migration/base.py +345 -0
  447. truthound/profiler/migration/manager.py +444 -0
  448. truthound/profiler/migration/v1_0_to_v1_1.py +484 -0
  449. truthound/profiler/ml/__init__.py +73 -0
  450. truthound/profiler/ml/base.py +244 -0
  451. truthound/profiler/ml/classifier.py +507 -0
  452. truthound/profiler/ml/feature_extraction.py +604 -0
  453. truthound/profiler/ml/pretrained.py +448 -0
  454. truthound/profiler/ml_inference.py +1276 -0
  455. truthound/profiler/native_patterns.py +815 -0
  456. truthound/profiler/observability.py +1184 -0
  457. truthound/profiler/process_timeout.py +1566 -0
  458. truthound/profiler/progress.py +568 -0
  459. truthound/profiler/progress_callbacks.py +1734 -0
  460. truthound/profiler/quality.py +1345 -0
  461. truthound/profiler/resilience.py +1180 -0
  462. truthound/profiler/sampled_matcher.py +794 -0
  463. truthound/profiler/sampling.py +1288 -0
  464. truthound/profiler/scheduling/__init__.py +82 -0
  465. truthound/profiler/scheduling/protocols.py +214 -0
  466. truthound/profiler/scheduling/scheduler.py +474 -0
  467. truthound/profiler/scheduling/storage.py +457 -0
  468. truthound/profiler/scheduling/triggers.py +449 -0
  469. truthound/profiler/schema.py +603 -0
  470. truthound/profiler/streaming.py +685 -0
  471. truthound/profiler/streaming_patterns.py +1354 -0
  472. truthound/profiler/suite_cli.py +625 -0
  473. truthound/profiler/suite_config.py +789 -0
  474. truthound/profiler/suite_export.py +1268 -0
  475. truthound/profiler/table_profiler.py +547 -0
  476. truthound/profiler/timeout.py +565 -0
  477. truthound/profiler/validation.py +1532 -0
  478. truthound/profiler/visualization/__init__.py +118 -0
  479. truthound/profiler/visualization/base.py +346 -0
  480. truthound/profiler/visualization/generator.py +1259 -0
  481. truthound/profiler/visualization/plotly_renderer.py +811 -0
  482. truthound/profiler/visualization/renderers.py +669 -0
  483. truthound/profiler/visualization/sections.py +540 -0
  484. truthound/profiler/visualization.py +2122 -0
  485. truthound/profiler/yaml_validation.py +1151 -0
  486. truthound/py.typed +0 -0
  487. truthound/ratelimit/__init__.py +248 -0
  488. truthound/ratelimit/algorithms.py +1108 -0
  489. truthound/ratelimit/core.py +573 -0
  490. truthound/ratelimit/integration.py +532 -0
  491. truthound/ratelimit/limiter.py +663 -0
  492. truthound/ratelimit/middleware.py +700 -0
  493. truthound/ratelimit/policy.py +792 -0
  494. truthound/ratelimit/storage.py +763 -0
  495. truthound/rbac/__init__.py +340 -0
  496. truthound/rbac/core.py +976 -0
  497. truthound/rbac/integration.py +760 -0
  498. truthound/rbac/manager.py +1052 -0
  499. truthound/rbac/middleware.py +842 -0
  500. truthound/rbac/policy.py +954 -0
  501. truthound/rbac/storage.py +878 -0
  502. truthound/realtime/__init__.py +141 -0
  503. truthound/realtime/adapters/__init__.py +43 -0
  504. truthound/realtime/adapters/base.py +533 -0
  505. truthound/realtime/adapters/kafka.py +487 -0
  506. truthound/realtime/adapters/kinesis.py +479 -0
  507. truthound/realtime/adapters/mock.py +243 -0
  508. truthound/realtime/base.py +553 -0
  509. truthound/realtime/factory.py +382 -0
  510. truthound/realtime/incremental.py +660 -0
  511. truthound/realtime/processing/__init__.py +67 -0
  512. truthound/realtime/processing/exactly_once.py +575 -0
  513. truthound/realtime/processing/state.py +547 -0
  514. truthound/realtime/processing/windows.py +647 -0
  515. truthound/realtime/protocols.py +569 -0
  516. truthound/realtime/streaming.py +605 -0
  517. truthound/realtime/testing/__init__.py +32 -0
  518. truthound/realtime/testing/containers.py +615 -0
  519. truthound/realtime/testing/fixtures.py +484 -0
  520. truthound/report.py +280 -0
  521. truthound/reporters/__init__.py +46 -0
  522. truthound/reporters/_protocols.py +30 -0
  523. truthound/reporters/base.py +324 -0
  524. truthound/reporters/ci/__init__.py +66 -0
  525. truthound/reporters/ci/azure.py +436 -0
  526. truthound/reporters/ci/base.py +509 -0
  527. truthound/reporters/ci/bitbucket.py +567 -0
  528. truthound/reporters/ci/circleci.py +547 -0
  529. truthound/reporters/ci/detection.py +364 -0
  530. truthound/reporters/ci/factory.py +182 -0
  531. truthound/reporters/ci/github.py +388 -0
  532. truthound/reporters/ci/gitlab.py +471 -0
  533. truthound/reporters/ci/jenkins.py +525 -0
  534. truthound/reporters/console_reporter.py +299 -0
  535. truthound/reporters/factory.py +211 -0
  536. truthound/reporters/html_reporter.py +524 -0
  537. truthound/reporters/json_reporter.py +256 -0
  538. truthound/reporters/markdown_reporter.py +280 -0
  539. truthound/reporters/sdk/__init__.py +174 -0
  540. truthound/reporters/sdk/builder.py +558 -0
  541. truthound/reporters/sdk/mixins.py +1150 -0
  542. truthound/reporters/sdk/schema.py +1493 -0
  543. truthound/reporters/sdk/templates.py +666 -0
  544. truthound/reporters/sdk/testing.py +968 -0
  545. truthound/scanners.py +170 -0
  546. truthound/scheduling/__init__.py +122 -0
  547. truthound/scheduling/cron.py +1136 -0
  548. truthound/scheduling/presets.py +212 -0
  549. truthound/schema.py +275 -0
  550. truthound/secrets/__init__.py +173 -0
  551. truthound/secrets/base.py +618 -0
  552. truthound/secrets/cloud.py +682 -0
  553. truthound/secrets/integration.py +507 -0
  554. truthound/secrets/manager.py +633 -0
  555. truthound/secrets/oidc/__init__.py +172 -0
  556. truthound/secrets/oidc/base.py +902 -0
  557. truthound/secrets/oidc/credential_provider.py +623 -0
  558. truthound/secrets/oidc/exchangers.py +1001 -0
  559. truthound/secrets/oidc/github/__init__.py +110 -0
  560. truthound/secrets/oidc/github/claims.py +718 -0
  561. truthound/secrets/oidc/github/enhanced_provider.py +693 -0
  562. truthound/secrets/oidc/github/trust_policy.py +742 -0
  563. truthound/secrets/oidc/github/verification.py +723 -0
  564. truthound/secrets/oidc/github/workflow.py +691 -0
  565. truthound/secrets/oidc/providers.py +825 -0
  566. truthound/secrets/providers.py +506 -0
  567. truthound/secrets/resolver.py +495 -0
  568. truthound/stores/__init__.py +177 -0
  569. truthound/stores/backends/__init__.py +18 -0
  570. truthound/stores/backends/_protocols.py +340 -0
  571. truthound/stores/backends/azure_blob.py +530 -0
  572. truthound/stores/backends/concurrent_filesystem.py +915 -0
  573. truthound/stores/backends/connection_pool.py +1365 -0
  574. truthound/stores/backends/database.py +743 -0
  575. truthound/stores/backends/filesystem.py +538 -0
  576. truthound/stores/backends/gcs.py +399 -0
  577. truthound/stores/backends/memory.py +354 -0
  578. truthound/stores/backends/s3.py +434 -0
  579. truthound/stores/backpressure/__init__.py +84 -0
  580. truthound/stores/backpressure/base.py +375 -0
  581. truthound/stores/backpressure/circuit_breaker.py +434 -0
  582. truthound/stores/backpressure/monitor.py +376 -0
  583. truthound/stores/backpressure/strategies.py +677 -0
  584. truthound/stores/base.py +551 -0
  585. truthound/stores/batching/__init__.py +65 -0
  586. truthound/stores/batching/base.py +305 -0
  587. truthound/stores/batching/buffer.py +370 -0
  588. truthound/stores/batching/store.py +248 -0
  589. truthound/stores/batching/writer.py +521 -0
  590. truthound/stores/caching/__init__.py +60 -0
  591. truthound/stores/caching/backends.py +684 -0
  592. truthound/stores/caching/base.py +356 -0
  593. truthound/stores/caching/store.py +305 -0
  594. truthound/stores/compression/__init__.py +193 -0
  595. truthound/stores/compression/adaptive.py +694 -0
  596. truthound/stores/compression/base.py +514 -0
  597. truthound/stores/compression/pipeline.py +868 -0
  598. truthound/stores/compression/providers.py +672 -0
  599. truthound/stores/compression/streaming.py +832 -0
  600. truthound/stores/concurrency/__init__.py +81 -0
  601. truthound/stores/concurrency/atomic.py +556 -0
  602. truthound/stores/concurrency/index.py +775 -0
  603. truthound/stores/concurrency/locks.py +576 -0
  604. truthound/stores/concurrency/manager.py +482 -0
  605. truthound/stores/encryption/__init__.py +297 -0
  606. truthound/stores/encryption/base.py +952 -0
  607. truthound/stores/encryption/keys.py +1191 -0
  608. truthound/stores/encryption/pipeline.py +903 -0
  609. truthound/stores/encryption/providers.py +953 -0
  610. truthound/stores/encryption/streaming.py +950 -0
  611. truthound/stores/expectations.py +227 -0
  612. truthound/stores/factory.py +246 -0
  613. truthound/stores/migration/__init__.py +75 -0
  614. truthound/stores/migration/base.py +480 -0
  615. truthound/stores/migration/manager.py +347 -0
  616. truthound/stores/migration/registry.py +382 -0
  617. truthound/stores/migration/store.py +559 -0
  618. truthound/stores/observability/__init__.py +106 -0
  619. truthound/stores/observability/audit.py +718 -0
  620. truthound/stores/observability/config.py +270 -0
  621. truthound/stores/observability/factory.py +208 -0
  622. truthound/stores/observability/metrics.py +636 -0
  623. truthound/stores/observability/protocols.py +410 -0
  624. truthound/stores/observability/store.py +570 -0
  625. truthound/stores/observability/tracing.py +784 -0
  626. truthound/stores/replication/__init__.py +76 -0
  627. truthound/stores/replication/base.py +260 -0
  628. truthound/stores/replication/monitor.py +269 -0
  629. truthound/stores/replication/store.py +439 -0
  630. truthound/stores/replication/syncer.py +391 -0
  631. truthound/stores/results.py +359 -0
  632. truthound/stores/retention/__init__.py +77 -0
  633. truthound/stores/retention/base.py +378 -0
  634. truthound/stores/retention/policies.py +621 -0
  635. truthound/stores/retention/scheduler.py +279 -0
  636. truthound/stores/retention/store.py +526 -0
  637. truthound/stores/streaming/__init__.py +138 -0
  638. truthound/stores/streaming/base.py +801 -0
  639. truthound/stores/streaming/database.py +984 -0
  640. truthound/stores/streaming/filesystem.py +719 -0
  641. truthound/stores/streaming/reader.py +629 -0
  642. truthound/stores/streaming/s3.py +843 -0
  643. truthound/stores/streaming/writer.py +790 -0
  644. truthound/stores/tiering/__init__.py +108 -0
  645. truthound/stores/tiering/base.py +462 -0
  646. truthound/stores/tiering/manager.py +249 -0
  647. truthound/stores/tiering/policies.py +692 -0
  648. truthound/stores/tiering/store.py +526 -0
  649. truthound/stores/versioning/__init__.py +56 -0
  650. truthound/stores/versioning/base.py +376 -0
  651. truthound/stores/versioning/store.py +660 -0
  652. truthound/stores/versioning/strategies.py +353 -0
  653. truthound/types.py +56 -0
  654. truthound/validators/__init__.py +774 -0
  655. truthound/validators/aggregate/__init__.py +27 -0
  656. truthound/validators/aggregate/central.py +116 -0
  657. truthound/validators/aggregate/extremes.py +116 -0
  658. truthound/validators/aggregate/spread.py +118 -0
  659. truthound/validators/aggregate/sum.py +64 -0
  660. truthound/validators/aggregate/type.py +78 -0
  661. truthound/validators/anomaly/__init__.py +93 -0
  662. truthound/validators/anomaly/base.py +431 -0
  663. truthound/validators/anomaly/ml_based.py +1190 -0
  664. truthound/validators/anomaly/multivariate.py +647 -0
  665. truthound/validators/anomaly/statistical.py +599 -0
  666. truthound/validators/base.py +1089 -0
  667. truthound/validators/business_rule/__init__.py +46 -0
  668. truthound/validators/business_rule/base.py +147 -0
  669. truthound/validators/business_rule/checksum.py +509 -0
  670. truthound/validators/business_rule/financial.py +526 -0
  671. truthound/validators/cache.py +733 -0
  672. truthound/validators/completeness/__init__.py +39 -0
  673. truthound/validators/completeness/conditional.py +73 -0
  674. truthound/validators/completeness/default.py +98 -0
  675. truthound/validators/completeness/empty.py +103 -0
  676. truthound/validators/completeness/nan.py +337 -0
  677. truthound/validators/completeness/null.py +152 -0
  678. truthound/validators/cross_table/__init__.py +17 -0
  679. truthound/validators/cross_table/aggregate.py +333 -0
  680. truthound/validators/cross_table/row_count.py +122 -0
  681. truthound/validators/datetime/__init__.py +29 -0
  682. truthound/validators/datetime/format.py +78 -0
  683. truthound/validators/datetime/freshness.py +269 -0
  684. truthound/validators/datetime/order.py +73 -0
  685. truthound/validators/datetime/parseable.py +185 -0
  686. truthound/validators/datetime/range.py +202 -0
  687. truthound/validators/datetime/timezone.py +69 -0
  688. truthound/validators/distribution/__init__.py +49 -0
  689. truthound/validators/distribution/distribution.py +128 -0
  690. truthound/validators/distribution/monotonic.py +119 -0
  691. truthound/validators/distribution/outlier.py +178 -0
  692. truthound/validators/distribution/quantile.py +80 -0
  693. truthound/validators/distribution/range.py +254 -0
  694. truthound/validators/distribution/set.py +125 -0
  695. truthound/validators/distribution/statistical.py +459 -0
  696. truthound/validators/drift/__init__.py +79 -0
  697. truthound/validators/drift/base.py +427 -0
  698. truthound/validators/drift/multi_feature.py +401 -0
  699. truthound/validators/drift/numeric.py +395 -0
  700. truthound/validators/drift/psi.py +446 -0
  701. truthound/validators/drift/statistical.py +510 -0
  702. truthound/validators/enterprise.py +1658 -0
  703. truthound/validators/geospatial/__init__.py +80 -0
  704. truthound/validators/geospatial/base.py +97 -0
  705. truthound/validators/geospatial/boundary.py +238 -0
  706. truthound/validators/geospatial/coordinate.py +351 -0
  707. truthound/validators/geospatial/distance.py +399 -0
  708. truthound/validators/geospatial/polygon.py +665 -0
  709. truthound/validators/i18n/__init__.py +308 -0
  710. truthound/validators/i18n/bidi.py +571 -0
  711. truthound/validators/i18n/catalogs.py +570 -0
  712. truthound/validators/i18n/dialects.py +763 -0
  713. truthound/validators/i18n/extended_catalogs.py +549 -0
  714. truthound/validators/i18n/formatting.py +1434 -0
  715. truthound/validators/i18n/loader.py +1020 -0
  716. truthound/validators/i18n/messages.py +521 -0
  717. truthound/validators/i18n/plural.py +683 -0
  718. truthound/validators/i18n/protocols.py +855 -0
  719. truthound/validators/i18n/tms.py +1162 -0
  720. truthound/validators/localization/__init__.py +53 -0
  721. truthound/validators/localization/base.py +122 -0
  722. truthound/validators/localization/chinese.py +362 -0
  723. truthound/validators/localization/japanese.py +275 -0
  724. truthound/validators/localization/korean.py +524 -0
  725. truthound/validators/memory/__init__.py +94 -0
  726. truthound/validators/memory/approximate_knn.py +506 -0
  727. truthound/validators/memory/base.py +547 -0
  728. truthound/validators/memory/sgd_online.py +719 -0
  729. truthound/validators/memory/streaming_ecdf.py +753 -0
  730. truthound/validators/ml_feature/__init__.py +54 -0
  731. truthound/validators/ml_feature/base.py +249 -0
  732. truthound/validators/ml_feature/correlation.py +299 -0
  733. truthound/validators/ml_feature/leakage.py +344 -0
  734. truthound/validators/ml_feature/null_impact.py +270 -0
  735. truthound/validators/ml_feature/scale.py +264 -0
  736. truthound/validators/multi_column/__init__.py +89 -0
  737. truthound/validators/multi_column/arithmetic.py +284 -0
  738. truthound/validators/multi_column/base.py +231 -0
  739. truthound/validators/multi_column/comparison.py +273 -0
  740. truthound/validators/multi_column/consistency.py +312 -0
  741. truthound/validators/multi_column/statistical.py +299 -0
  742. truthound/validators/optimization/__init__.py +164 -0
  743. truthound/validators/optimization/aggregation.py +563 -0
  744. truthound/validators/optimization/covariance.py +556 -0
  745. truthound/validators/optimization/geo.py +626 -0
  746. truthound/validators/optimization/graph.py +587 -0
  747. truthound/validators/optimization/orchestrator.py +970 -0
  748. truthound/validators/optimization/profiling.py +1312 -0
  749. truthound/validators/privacy/__init__.py +223 -0
  750. truthound/validators/privacy/base.py +635 -0
  751. truthound/validators/privacy/ccpa.py +670 -0
  752. truthound/validators/privacy/gdpr.py +728 -0
  753. truthound/validators/privacy/global_patterns.py +604 -0
  754. truthound/validators/privacy/plugins.py +867 -0
  755. truthound/validators/profiling/__init__.py +52 -0
  756. truthound/validators/profiling/base.py +175 -0
  757. truthound/validators/profiling/cardinality.py +312 -0
  758. truthound/validators/profiling/entropy.py +391 -0
  759. truthound/validators/profiling/frequency.py +455 -0
  760. truthound/validators/pushdown_support.py +660 -0
  761. truthound/validators/query/__init__.py +91 -0
  762. truthound/validators/query/aggregate.py +346 -0
  763. truthound/validators/query/base.py +246 -0
  764. truthound/validators/query/column.py +249 -0
  765. truthound/validators/query/expression.py +274 -0
  766. truthound/validators/query/result.py +323 -0
  767. truthound/validators/query/row_count.py +264 -0
  768. truthound/validators/referential/__init__.py +80 -0
  769. truthound/validators/referential/base.py +395 -0
  770. truthound/validators/referential/cascade.py +391 -0
  771. truthound/validators/referential/circular.py +563 -0
  772. truthound/validators/referential/foreign_key.py +624 -0
  773. truthound/validators/referential/orphan.py +485 -0
  774. truthound/validators/registry.py +112 -0
  775. truthound/validators/schema/__init__.py +41 -0
  776. truthound/validators/schema/column_count.py +142 -0
  777. truthound/validators/schema/column_exists.py +80 -0
  778. truthound/validators/schema/column_order.py +82 -0
  779. truthound/validators/schema/column_pair.py +85 -0
  780. truthound/validators/schema/column_pair_set.py +195 -0
  781. truthound/validators/schema/column_type.py +94 -0
  782. truthound/validators/schema/multi_column.py +53 -0
  783. truthound/validators/schema/multi_column_aggregate.py +175 -0
  784. truthound/validators/schema/referential.py +274 -0
  785. truthound/validators/schema/table_schema.py +91 -0
  786. truthound/validators/schema_validator.py +219 -0
  787. truthound/validators/sdk/__init__.py +250 -0
  788. truthound/validators/sdk/builder.py +680 -0
  789. truthound/validators/sdk/decorators.py +474 -0
  790. truthound/validators/sdk/enterprise/__init__.py +211 -0
  791. truthound/validators/sdk/enterprise/docs.py +725 -0
  792. truthound/validators/sdk/enterprise/fuzzing.py +659 -0
  793. truthound/validators/sdk/enterprise/licensing.py +709 -0
  794. truthound/validators/sdk/enterprise/manager.py +543 -0
  795. truthound/validators/sdk/enterprise/resources.py +628 -0
  796. truthound/validators/sdk/enterprise/sandbox.py +766 -0
  797. truthound/validators/sdk/enterprise/signing.py +603 -0
  798. truthound/validators/sdk/enterprise/templates.py +865 -0
  799. truthound/validators/sdk/enterprise/versioning.py +659 -0
  800. truthound/validators/sdk/templates.py +757 -0
  801. truthound/validators/sdk/testing.py +807 -0
  802. truthound/validators/security/__init__.py +181 -0
  803. truthound/validators/security/redos/__init__.py +182 -0
  804. truthound/validators/security/redos/core.py +861 -0
  805. truthound/validators/security/redos/cpu_monitor.py +593 -0
  806. truthound/validators/security/redos/cve_database.py +791 -0
  807. truthound/validators/security/redos/ml/__init__.py +155 -0
  808. truthound/validators/security/redos/ml/base.py +785 -0
  809. truthound/validators/security/redos/ml/datasets.py +618 -0
  810. truthound/validators/security/redos/ml/features.py +359 -0
  811. truthound/validators/security/redos/ml/models.py +1000 -0
  812. truthound/validators/security/redos/ml/predictor.py +507 -0
  813. truthound/validators/security/redos/ml/storage.py +632 -0
  814. truthound/validators/security/redos/ml/training.py +571 -0
  815. truthound/validators/security/redos/ml_analyzer.py +937 -0
  816. truthound/validators/security/redos/optimizer.py +674 -0
  817. truthound/validators/security/redos/profiler.py +682 -0
  818. truthound/validators/security/redos/re2_engine.py +709 -0
  819. truthound/validators/security/redos.py +886 -0
  820. truthound/validators/security/sql_security.py +1247 -0
  821. truthound/validators/streaming/__init__.py +126 -0
  822. truthound/validators/streaming/base.py +292 -0
  823. truthound/validators/streaming/completeness.py +210 -0
  824. truthound/validators/streaming/mixin.py +575 -0
  825. truthound/validators/streaming/range.py +308 -0
  826. truthound/validators/streaming/sources.py +846 -0
  827. truthound/validators/string/__init__.py +57 -0
  828. truthound/validators/string/casing.py +158 -0
  829. truthound/validators/string/charset.py +96 -0
  830. truthound/validators/string/format.py +501 -0
  831. truthound/validators/string/json.py +77 -0
  832. truthound/validators/string/json_schema.py +184 -0
  833. truthound/validators/string/length.py +104 -0
  834. truthound/validators/string/like_pattern.py +237 -0
  835. truthound/validators/string/regex.py +202 -0
  836. truthound/validators/string/regex_extended.py +435 -0
  837. truthound/validators/table/__init__.py +88 -0
  838. truthound/validators/table/base.py +78 -0
  839. truthound/validators/table/column_count.py +198 -0
  840. truthound/validators/table/freshness.py +362 -0
  841. truthound/validators/table/row_count.py +251 -0
  842. truthound/validators/table/schema.py +333 -0
  843. truthound/validators/table/size.py +285 -0
  844. truthound/validators/timeout/__init__.py +102 -0
  845. truthound/validators/timeout/advanced/__init__.py +247 -0
  846. truthound/validators/timeout/advanced/circuit_breaker.py +675 -0
  847. truthound/validators/timeout/advanced/prediction.py +773 -0
  848. truthound/validators/timeout/advanced/priority.py +618 -0
  849. truthound/validators/timeout/advanced/redis_backend.py +770 -0
  850. truthound/validators/timeout/advanced/retry.py +721 -0
  851. truthound/validators/timeout/advanced/sampling.py +788 -0
  852. truthound/validators/timeout/advanced/sla.py +661 -0
  853. truthound/validators/timeout/advanced/telemetry.py +804 -0
  854. truthound/validators/timeout/cascade.py +477 -0
  855. truthound/validators/timeout/deadline.py +657 -0
  856. truthound/validators/timeout/degradation.py +525 -0
  857. truthound/validators/timeout/distributed.py +597 -0
  858. truthound/validators/timeseries/__init__.py +89 -0
  859. truthound/validators/timeseries/base.py +326 -0
  860. truthound/validators/timeseries/completeness.py +617 -0
  861. truthound/validators/timeseries/gap.py +485 -0
  862. truthound/validators/timeseries/monotonic.py +310 -0
  863. truthound/validators/timeseries/seasonality.py +422 -0
  864. truthound/validators/timeseries/trend.py +510 -0
  865. truthound/validators/uniqueness/__init__.py +59 -0
  866. truthound/validators/uniqueness/approximate.py +475 -0
  867. truthound/validators/uniqueness/distinct_values.py +253 -0
  868. truthound/validators/uniqueness/duplicate.py +118 -0
  869. truthound/validators/uniqueness/primary_key.py +140 -0
  870. truthound/validators/uniqueness/unique.py +191 -0
  871. truthound/validators/uniqueness/within_record.py +599 -0
  872. truthound/validators/utils.py +756 -0
  873. truthound-1.0.8.dist-info/METADATA +474 -0
  874. truthound-1.0.8.dist-info/RECORD +877 -0
  875. truthound-1.0.8.dist-info/WHEEL +4 -0
  876. truthound-1.0.8.dist-info/entry_points.txt +2 -0
  877. truthound-1.0.8.dist-info/licenses/LICENSE +190 -0
@@ -0,0 +1,1288 @@
1
+ """Enterprise-grade sampling strategies for memory-efficient pattern matching.
2
+
3
+ This module provides a comprehensive sampling framework that prevents OOM errors
4
+ when processing large datasets while maintaining statistical accuracy.
5
+
6
+ Key features:
7
+ - Pluggable sampling strategy architecture
8
+ - Memory-aware adaptive sampling
9
+ - Statistical confidence estimation
10
+ - Stratified sampling for skewed distributions
11
+ - Reservoir sampling for streaming data
12
+
13
+ Design Principles:
14
+ - Open/Closed: New strategies can be added without modifying existing code
15
+ - Single Responsibility: Each strategy handles one sampling approach
16
+ - Dependency Inversion: High-level modules depend on abstractions
17
+
18
+ Example:
19
+ from truthound.profiler.sampling import (
20
+ SampledPatternMatcher,
21
+ SamplingConfig,
22
+ AdaptiveSamplingStrategy,
23
+ )
24
+
25
+ # Use adaptive sampling based on data size
26
+ config = SamplingConfig(
27
+ strategy="adaptive",
28
+ max_rows=100_000,
29
+ confidence_level=0.95,
30
+ )
31
+
32
+ matcher = SampledPatternMatcher(sampling_config=config)
33
+ results = matcher.match_column(lf, "email")
34
+ """
35
+
36
+ from __future__ import annotations
37
+
38
+ import hashlib
39
+ import logging
40
+ import math
41
+ import random
42
+ import sys
43
+ import threading
44
+ import time
45
+ from abc import ABC, abstractmethod
46
+ from dataclasses import dataclass, field
47
+ from datetime import datetime
48
+ from enum import Enum
49
+ from typing import (
50
+ TYPE_CHECKING,
51
+ Any,
52
+ Callable,
53
+ Generic,
54
+ Iterator,
55
+ Protocol,
56
+ Sequence,
57
+ TypeVar,
58
+ )
59
+
60
+ import polars as pl
61
+
62
+ logger = logging.getLogger(__name__)
63
+
64
+
65
+ # =============================================================================
66
+ # Types and Enums
67
+ # =============================================================================
68
+
69
+
70
class SamplingMethod(str, Enum):
    """Identifiers for the sampling approaches a configuration can select.

    Members are ``str`` subclasses, so each one compares equal to its
    lowercase string value (for example ``SamplingMethod.HEAD == "head"``).
    """

    # No sampling (use all data)
    NONE = "none"
    # Simple random sampling
    RANDOM = "random"
    # Every nth row
    SYSTEMATIC = "systematic"
    # Preserve distribution
    STRATIFIED = "stratified"
    # Streaming reservoir sampling
    RESERVOIR = "reservoir"
    # Auto-select based on data size
    ADAPTIVE = "adaptive"
    # First n rows (fastest, least accurate)
    HEAD = "head"
    # Deterministic hash-based sampling
    HASH = "hash"
81
+
82
+
83
class ConfidenceLevel(float, Enum):
    """Named presets for commonly used statistical confidence levels.

    Members are ``float`` subclasses, so they can be used directly wherever a
    numeric confidence level is expected.
    """

    # 90% confidence
    LOW = 0.90
    # 95% confidence (default)
    MEDIUM = 0.95
    # 99% confidence
    HIGH = 0.99
    # 99.9% confidence
    VERY_HIGH = 0.999
90
+
91
+
92
+ # =============================================================================
93
+ # Sampling Result
94
+ # =============================================================================
95
+
96
+
97
@dataclass(frozen=True)
class SamplingMetrics:
    """Immutable record describing one sampling operation.

    Attributes:
        original_size: Row count of the dataset before sampling.
        sample_size: Row count actually retained.
        sampling_ratio: Fraction of the data that was sampled.
        confidence_level: Statistical confidence level associated with the run.
        margin_of_error: Estimated margin of error for the sample.
        strategy_used: Name of the strategy that produced the sample.
        sampling_time_ms: Time spent sampling, in milliseconds.
        memory_saved_estimate_mb: Estimated memory saved, in megabytes.
    """

    original_size: int
    sample_size: int
    sampling_ratio: float
    confidence_level: float
    margin_of_error: float
    strategy_used: str
    sampling_time_ms: float = 0.0
    memory_saved_estimate_mb: float = 0.0

    @property
    def is_full_scan(self) -> bool:
        """True when the ratio is at least 1.0, i.e. nothing was dropped."""
        return self.sampling_ratio >= 1.0

    @property
    def reduction_factor(self) -> float:
        """Ratio ``original_size / sample_size`` (1.0 means no reduction).

        Returns 0.0 for an empty sample rather than dividing by zero.
        """
        return 0.0 if self.sample_size == 0 else self.original_size / self.sample_size

    def to_dict(self) -> dict[str, Any]:
        """Return every field plus both derived properties as a plain dict."""
        # Key order is part of the output: stored fields first, then the two
        # computed properties.
        exported = (
            "original_size",
            "sample_size",
            "sampling_ratio",
            "confidence_level",
            "margin_of_error",
            "strategy_used",
            "sampling_time_ms",
            "memory_saved_estimate_mb",
            "is_full_scan",
            "reduction_factor",
        )
        return {key: getattr(self, key) for key in exported}
147
+
148
+
149
@dataclass
class SamplingResult(Generic[TypeVar("T")]):
    """Outcome of a sampling operation: the data plus its metrics.

    The generic base uses an anonymous type variable that no field refers to;
    ``data`` is always annotated as a LazyFrame.

    Attributes:
        data: The sampled LazyFrame.
        metrics: Sampling metrics for the operation.
        is_sampled: Whether sampling was applied.
    """

    data: pl.LazyFrame
    metrics: SamplingMetrics
    is_sampled: bool = True

    def __post_init__(self) -> None:
        """Log a warning when sampling was applied but produced no rows."""
        if self.metrics.sample_size != 0:
            return
        if self.is_sampled:
            logger.warning("Sampling resulted in zero rows")
167
+
168
+
169
+ # =============================================================================
170
+ # Sampling Configuration
171
+ # =============================================================================
172
+
173
+
174
@dataclass
class SamplingConfig:
    """Configuration for sampling behavior.

    Controls how data is sampled for pattern matching. Supports both explicit
    size limits and statistical parameters (confidence level, margin of error).

    Attributes:
        strategy: Sampling strategy to use.
        max_rows: Upper bound on sampled rows. 0 disables the cap applied in
            ``calculate_required_sample_size``.
        max_memory_mb: Memory budget in megabytes. 0 is a sentinel value.
            NOTE(review): earlier documentation disagreed on whether 0 means
            "unlimited" or "auto (use 10% of available)"; confirm against the
            code that consumes this field.
        confidence_level: Statistical confidence level, strictly between 0 and 1.
        margin_of_error: Acceptable margin of error, strictly between 0 and 1.
        seed: Random seed for reproducibility (None = random).
        min_sample_size: Lower bound applied to the computed sample size
            (still capped by the population size and ``max_rows``).
        enable_caching: Cache sampling decisions for same data.
        fallback_strategy: Strategy to use if primary fails.
        small_dataset_threshold: Size threshold for the adaptive strategy.
        medium_dataset_threshold: Size threshold for the adaptive strategy.
        large_dataset_threshold: Size threshold for the adaptive strategy.
    """

    strategy: SamplingMethod = SamplingMethod.ADAPTIVE
    max_rows: int = 100_000
    max_memory_mb: int = 0  # 0 = sentinel; see class docstring
    confidence_level: float = 0.95
    margin_of_error: float = 0.05
    seed: int | None = None
    min_sample_size: int = 1000
    enable_caching: bool = True
    fallback_strategy: SamplingMethod = SamplingMethod.HEAD

    # Size thresholds for adaptive strategy
    small_dataset_threshold: int = 10_000
    medium_dataset_threshold: int = 100_000
    large_dataset_threshold: int = 1_000_000

    def __post_init__(self) -> None:
        """Validate configuration.

        Raises:
            ValueError: If ``confidence_level`` or ``margin_of_error`` is not
                strictly between 0 and 1, ``max_rows`` is negative, or
                ``min_sample_size`` is below 1.
        """
        if not 0.0 < self.confidence_level < 1.0:
            raise ValueError(
                f"confidence_level must be between 0 and 1, got {self.confidence_level}"
            )
        if not 0.0 < self.margin_of_error < 1.0:
            raise ValueError(
                f"margin_of_error must be between 0 and 1, got {self.margin_of_error}"
            )
        if self.max_rows < 0:
            raise ValueError(f"max_rows must be non-negative, got {self.max_rows}")
        if self.min_sample_size < 1:
            raise ValueError(
                f"min_sample_size must be positive, got {self.min_sample_size}"
            )

    def calculate_required_sample_size(
        self,
        population_size: int,
        expected_proportion: float = 0.5,
    ) -> int:
        """Calculate statistically required sample size.

        Uses Cochran's formula with finite population correction, then clamps
        the result to ``min_sample_size``, the population size and ``max_rows``
        (in that order, so the caps win over the minimum).

        Args:
            population_size: Total population size.
            expected_proportion: Expected proportion (0.5 = maximum variance).

        Returns:
            Required sample size for the configured confidence and margin, or
            0 when ``population_size`` is not positive.
        """
        if population_size <= 0:
            return 0

        # Conventional rounded z-scores are kept for the common levels so that
        # results for those levels stay exactly as before.
        conventional_z = {
            0.90: 1.645,
            0.95: 1.96,
            0.99: 2.576,
            0.999: 3.291,
        }
        z = conventional_z.get(round(self.confidence_level, 3))
        if z is None:
            # Only compute the quantile when the table has no entry.
            z = self._z_score_from_confidence(self.confidence_level)

        p = expected_proportion
        e = self.margin_of_error

        # Cochran's formula for infinite population
        n0 = (z ** 2 * p * (1 - p)) / (e ** 2)

        # Finite population correction
        n = n0 / (1 + (n0 - 1) / population_size)

        # Apply bounds
        sample_size = int(math.ceil(n))
        sample_size = max(sample_size, self.min_sample_size)
        sample_size = min(sample_size, population_size)

        if self.max_rows > 0:
            sample_size = min(sample_size, self.max_rows)

        return sample_size

    @staticmethod
    def _z_score_from_confidence(confidence: float) -> float:
        """Return the two-sided z-score for a confidence level.

        Computes the standard normal quantile at ``(1 + confidence) / 2``. The
        previous step-function approximation under-estimated z for levels
        between the tabulated ones (for example 0.98 mapped to 2.0 instead of
        about 2.326), which produced samples that were too small.

        Args:
            confidence: Confidence level, expected strictly between 0 and 1.

        Returns:
            The z-score. For inputs outside the open interval (0, 1) the
            quantile is undefined, so the legacy coarse bounds are returned
            instead of raising: 3.3 for levels at or above 1, otherwise 1.0.
        """
        # Local import keeps this block self-contained; statistics is stdlib.
        from statistics import NormalDist

        if not 0.0 < confidence < 1.0:
            return 3.3 if confidence >= 1.0 else 1.0
        return NormalDist().inv_cdf((1.0 + confidence) / 2.0)

    @classmethod
    def for_accuracy(cls, accuracy: str = "medium") -> "SamplingConfig":
        """Create config optimized for accuracy level.

        Args:
            accuracy: "low", "medium", "high", or "maximum". Any other value
                falls back to the "medium" preset.

        Returns:
            Configured SamplingConfig.
        """
        # Presets are stored as keyword arguments so only the selected
        # configuration is actually constructed.
        presets = {
            "low": {
                "strategy": SamplingMethod.HEAD,
                "max_rows": 10_000,
                "confidence_level": 0.90,
                "margin_of_error": 0.10,
            },
            "medium": {
                "strategy": SamplingMethod.ADAPTIVE,
                "max_rows": 100_000,
                "confidence_level": 0.95,
                "margin_of_error": 0.05,
            },
            "high": {
                "strategy": SamplingMethod.RANDOM,
                "max_rows": 500_000,
                "confidence_level": 0.99,
                "margin_of_error": 0.02,
            },
            "maximum": {
                "strategy": SamplingMethod.NONE,
                "max_rows": 0,
                "confidence_level": 0.999,
                "margin_of_error": 0.01,
            },
        }
        return cls(**presets.get(accuracy, presets["medium"]))

    @classmethod
    def for_speed(cls) -> "SamplingConfig":
        """Create config optimized for speed (head sampling, 10,000 rows)."""
        return cls(
            strategy=SamplingMethod.HEAD,
            max_rows=10_000,
            confidence_level=0.90,
            margin_of_error=0.10,
        )

    @classmethod
    def for_memory(cls, max_memory_mb: int = 100) -> "SamplingConfig":
        """Create config optimized for memory efficiency.

        Args:
            max_memory_mb: Memory budget in megabytes stored on the config.

        Returns:
            A reservoir-sampling configuration capped at 50,000 rows.
        """
        return cls(
            strategy=SamplingMethod.RESERVOIR,
            max_rows=50_000,
            max_memory_mb=max_memory_mb,
            confidence_level=0.95,
            margin_of_error=0.05,
        )

    def to_dict(self) -> dict[str, Any]:
        """Convert the core settings to a dictionary.

        Only the strategy, row/memory limits, statistical parameters, seed and
        minimum sample size are exported; caching, fallback and threshold
        fields are not included.
        """
        return {
            "strategy": self.strategy.value,
            "max_rows": self.max_rows,
            "max_memory_mb": self.max_memory_mb,
            "confidence_level": self.confidence_level,
            "margin_of_error": self.margin_of_error,
            "seed": self.seed,
            "min_sample_size": self.min_sample_size,
        }
363
+
364
+
365
# Default configuration: a SamplingConfig built with every field at its
# declared default. SamplingConfig is a non-frozen dataclass, so this single
# module-level instance is shared, mutable state.
DEFAULT_SAMPLING_CONFIG = SamplingConfig()
367
+
368
+
369
+ # =============================================================================
370
+ # Sampling Strategy Protocol
371
+ # =============================================================================
372
+
373
+
374
class SamplingStrategy(ABC):
    """Abstract base class for sampling strategies.

    All sampling strategies must implement this interface.
    This enables the Strategy pattern for flexible sampling behavior.

    Example:
        class MyCustomStrategy(SamplingStrategy):
            name = "custom"

            def sample(self, lf, config, total_rows=None):
                # Custom sampling logic
                ...
    """

    # Identifier recorded in SamplingMetrics.strategy_used.
    name: str = "base"

    @abstractmethod
    def sample(
        self,
        lf: pl.LazyFrame,
        config: SamplingConfig,
        total_rows: int | None = None,
    ) -> SamplingResult:
        """Sample data from the LazyFrame.

        Args:
            lf: Source LazyFrame.
            config: Sampling configuration.
            total_rows: Pre-computed total rows (optional, for efficiency).

        Returns:
            SamplingResult with sampled data and metrics.
        """
        pass

    def estimate_row_count(self, lf: pl.LazyFrame) -> int:
        """Return the row count of ``lf``.

        The default implementation collects an exact count, which can be
        expensive; override for a cheaper estimate.

        Args:
            lf: LazyFrame to count.

        Returns:
            Row count.
        """
        return lf.select(pl.len()).collect().item()

    @staticmethod
    def _z_score(confidence_level: float) -> float:
        """Return the two-sided z-score for ``confidence_level``.

        Common levels use the conventional rounded constants (so a 95% level
        still yields 1.96); other levels in (0, 1) use the exact normal
        quantile. Values outside (0, 1) fall back to 1.96, the constant this
        class previously used unconditionally.
        """
        # Local import keeps this block self-contained; statistics is stdlib.
        from statistics import NormalDist

        conventional = {
            0.90: 1.645,
            0.95: 1.96,
            0.99: 2.576,
            0.999: 3.291,
        }
        z = conventional.get(round(confidence_level, 3))
        if z is not None:
            return z
        if not 0.0 < confidence_level < 1.0:
            return 1.96
        return NormalDist().inv_cdf((1.0 + confidence_level) / 2.0)

    def _create_metrics(
        self,
        original_size: int,
        sample_size: int,
        config: SamplingConfig,
        sampling_time_ms: float = 0.0,
    ) -> SamplingMetrics:
        """Build the SamplingMetrics record for a finished sampling run.

        Args:
            original_size: Row count before sampling.
            sample_size: Row count after sampling.
            config: Configuration whose confidence level is reported and used
                for the margin-of-error estimate.
            sampling_time_ms: Elapsed sampling time in milliseconds.

        Returns:
            Metrics with the sampling ratio, an estimated margin of error
            (capped at 1.0) and a rough memory-saved estimate.
        """
        sampling_ratio = sample_size / original_size if original_size > 0 else 0.0

        # Estimate margin of error for the actual sample.
        if sample_size > 0 and original_size > 0:
            # Use the configured confidence level so the reported margin
            # matches the confidence level stored next to it in the metrics.
            z = self._z_score(config.confidence_level)
            p = 0.5  # Maximum variance
            margin = z * math.sqrt(p * (1 - p) / sample_size)
            # Finite population correction; only applied when strictly
            # sub-sampling, which also keeps the denominator non-zero.
            if sample_size < original_size:
                fpc = math.sqrt((original_size - sample_size) / (original_size - 1))
                margin *= fpc
        else:
            margin = 1.0

        # Estimate memory saved (rough: 100 bytes per row average)
        rows_saved = original_size - sample_size
        memory_saved_mb = (rows_saved * 100) / (1024 * 1024)

        return SamplingMetrics(
            original_size=original_size,
            sample_size=sample_size,
            sampling_ratio=sampling_ratio,
            confidence_level=config.confidence_level,
            margin_of_error=min(margin, 1.0),
            strategy_used=self.name,
            sampling_time_ms=sampling_time_ms,
            memory_saved_estimate_mb=max(0.0, memory_saved_mb),
        )
461
+
462
+
463
+ # =============================================================================
464
+ # Concrete Sampling Strategies
465
+ # =============================================================================
466
+
467
+
468
class NoSamplingStrategy(SamplingStrategy):
    """Pass-through strategy: the input frame is returned untouched.

    Use when accuracy is paramount and memory is not a concern.
    """

    name = "none"

    def sample(
        self,
        lf: pl.LazyFrame,
        config: SamplingConfig,
        total_rows: int | None = None,
    ) -> SamplingResult:
        """Return ``lf`` unchanged with metrics describing a full scan.

        Args:
            lf: Source LazyFrame.
            config: Sampling configuration (used only for the metrics).
            total_rows: Pre-computed row count; counted when omitted.

        Returns:
            SamplingResult wrapping the original frame, with
            ``is_sampled=False`` and sample size equal to the row count.
        """
        started = time.perf_counter()

        # Counting (when needed) is the only work timed here.
        row_count = self.estimate_row_count(lf) if total_rows is None else total_rows

        duration_ms = (time.perf_counter() - started) * 1000

        full_scan_metrics = self._create_metrics(
            original_size=row_count,
            sample_size=row_count,
            config=config,
            sampling_time_ms=duration_ms,
        )
        return SamplingResult(data=lf, metrics=full_scan_metrics, is_sampled=False)
500
+
501
+
502
class HeadSamplingStrategy(SamplingStrategy):
    """Prefix strategy that keeps only the leading rows of the frame.

    The cheapest strategy, but the sample is skewed whenever row order
    correlates with the values being examined.
    """

    name = "head"

    def sample(
        self,
        lf: pl.LazyFrame,
        config: SamplingConfig,
        total_rows: int | None = None,
    ) -> SamplingResult:
        """Return the first ``sample_size`` rows of ``lf``."""
        started = time.perf_counter()

        row_count = self.estimate_row_count(lf) if total_rows is None else total_rows

        # Size requested by the config, capped by max_rows when that cap is active.
        wanted = config.calculate_required_sample_size(row_count)
        if config.max_rows > 0:
            wanted = min(wanted, config.max_rows)

        # The sample would cover every row: delegate to the pass-through strategy.
        if wanted >= row_count:
            return NoSamplingStrategy().sample(lf, config, row_count)

        head_lf = lf.head(wanted)
        duration_ms = 1000 * (time.perf_counter() - started)

        head_metrics = self._create_metrics(
            original_size=row_count,
            sample_size=wanted,
            config=config,
            sampling_time_ms=duration_ms,
        )
        return SamplingResult(data=head_lf, metrics=head_metrics, is_sampled=True)
546
+
547
+
548
class RandomSamplingStrategy(SamplingStrategy):
    """Simple random sampling expressed as a lazy hash filter.

    Each row index is hashed with the seed and the row is kept when the hash
    falls into the selected share of 10000 buckets, so the frame never has to
    be collected. Rare patterns are not specifically preserved.
    """

    name = "random"

    def sample(
        self,
        lf: pl.LazyFrame,
        config: SamplingConfig,
        total_rows: int | None = None,
    ) -> SamplingResult:
        """Keep a pseudo-random subset of roughly ``sample_size`` rows."""
        started = time.perf_counter()

        row_count = self.estimate_row_count(lf) if total_rows is None else total_rows

        # Size requested by the config, capped by max_rows when that cap is active.
        wanted = config.calculate_required_sample_size(row_count)
        if config.max_rows > 0:
            wanted = min(wanted, config.max_rows)

        # The sample would cover every row: delegate to the pass-through strategy.
        if wanted >= row_count:
            return NoSamplingStrategy().sample(lf, config, row_count)

        share = wanted / row_count

        # A configured seed makes the selection reproducible; otherwise draw one.
        if config.seed is None:
            seed = random.randint(0, 2**32 - 1)
        else:
            seed = config.seed

        # 10000 buckets; at least one bucket is always kept so that very small
        # shares do not round down to an empty selection.
        cutoff = max(1, int(share * 10000))
        keep_row = pl.col("__sample_idx").hash(seed) % 10000 < cutoff
        indexed_lf = lf.with_row_index("__sample_idx")
        picked_lf = indexed_lf.filter(keep_row).drop("__sample_idx")

        duration_ms = 1000 * (time.perf_counter() - started)

        # The figure reported below is the target size; the number of rows the
        # hash filter actually lets through is not measured here.
        reported_size = min(wanted, row_count)

        random_metrics = self._create_metrics(
            original_size=row_count,
            sample_size=reported_size,
            config=config,
            sampling_time_ms=duration_ms,
        )
        return SamplingResult(data=picked_lf, metrics=random_metrics, is_sampled=True)
609
+
610
+
611
class SystematicSamplingStrategy(SamplingStrategy):
    """Strategy for systematic sampling (every Nth row).

    Rows are picked at a fixed interval starting from a random offset, which
    spreads the sample evenly over the frame. Data whose periodicity lines up
    with the interval can be misrepresented.
    """

    name = "systematic"

    def sample(
        self,
        lf: pl.LazyFrame,
        config: SamplingConfig,
        total_rows: int | None = None,
    ) -> SamplingResult:
        """Keep every ``interval``-th row, starting at a seeded random offset.

        Args:
            lf: Source LazyFrame.
            config: Sampling configuration (target size, ``max_rows``, ``seed``).
            total_rows: Row count of ``lf`` if already known; estimated otherwise.

        Returns:
            SamplingResult with the filtered LazyFrame and its metrics, or the
            pass-through result when the target size covers every row.
        """
        start_time = time.perf_counter()

        if total_rows is None:
            total_rows = self.estimate_row_count(lf)

        # Calculate sample size, capped by max_rows when that cap is active
        sample_size = config.calculate_required_sample_size(total_rows)
        if config.max_rows > 0:
            sample_size = min(sample_size, config.max_rows)

        if sample_size >= total_rows:
            return NoSamplingStrategy().sample(lf, config, total_rows)

        # Sampling interval; the inner max() guards against a zero target size.
        interval = max(1, total_rows // max(1, sample_size))

        # Random start offset for unbiased sampling. A private Random instance
        # yields the same offset for a given seed as seeding the module-level
        # generator would, without resetting global random state for callers.
        seed = config.seed if config.seed is not None else random.randint(0, 2**32 - 1)
        offset = random.Random(seed).randint(0, interval - 1)

        # Compare the remainder with the offset instead of subtracting the
        # offset from the row index: the index is an unsigned integer, so the
        # subtraction could wrap around for rows located before the offset.
        sampled_lf = (
            lf.with_row_index("__sample_idx")
            .filter(pl.col("__sample_idx") % interval == offset)
            .drop("__sample_idx")
        )

        elapsed_ms = (time.perf_counter() - start_time) * 1000

        # Number of indices in [0, total_rows) congruent to offset mod interval.
        # Because the interval is floored this can exceed the target, so the
        # reported size is capped at the target.
        actual_sample_size = (total_rows - offset + interval - 1) // interval

        return SamplingResult(
            data=sampled_lf,
            metrics=self._create_metrics(
                original_size=total_rows,
                sample_size=min(actual_sample_size, sample_size),
                config=config,
                sampling_time_ms=elapsed_ms,
            ),
            is_sampled=True,
        )
668
+
669
+
670
class HashSamplingStrategy(SamplingStrategy):
    """Deterministic sampling driven by a seeded hash.

    A row is kept when the hash of its key (a chosen column, or the row index
    when no column is given) falls into the selected share of 10000 buckets.
    With no seed configured a fixed seed of 42 is used.
    """

    name = "hash"

    def __init__(self, hash_column: str | None = None):
        """Create the strategy.

        Args:
            hash_column: Column whose values are hashed (None = use row index)
        """
        self.hash_column = hash_column

    def sample(
        self,
        lf: pl.LazyFrame,
        config: SamplingConfig,
        total_rows: int | None = None,
    ) -> SamplingResult:
        """Filter ``lf`` down to the rows whose hashed key is below the cutoff."""
        started = time.perf_counter()

        row_count = self.estimate_row_count(lf) if total_rows is None else total_rows

        wanted = config.calculate_required_sample_size(row_count)
        if config.max_rows > 0:
            wanted = min(wanted, config.max_rows)

        # The sample would cover every row: delegate to the pass-through strategy.
        if wanted >= row_count:
            return NoSamplingStrategy().sample(lf, config, row_count)

        # 10000 buckets; at least one bucket is always kept so that very small
        # ratios do not round down to an empty selection.
        cutoff = max(1, int((wanted / row_count) * 10000))
        seed = 42 if config.seed is None else config.seed

        def keep_by(key_column: str) -> pl.Expr:
            """Build the bucket predicate for the given key column."""
            return pl.col(key_column).hash(seed) % 10000 < cutoff

        if not self.hash_column:
            # No column configured: hash a temporary row index instead.
            picked_lf = (
                lf.with_row_index("__hash_idx")
                .filter(keep_by("__hash_idx"))
                .drop("__hash_idx")
            )
        else:
            picked_lf = lf.filter(keep_by(self.hash_column))

        duration_ms = 1000 * (time.perf_counter() - started)

        hash_metrics = self._create_metrics(
            original_size=row_count,
            sample_size=wanted,
            config=config,
            sampling_time_ms=duration_ms,
        )
        return SamplingResult(data=picked_lf, metrics=hash_metrics, is_sampled=True)
736
+
737
+
738
class StratifiedSamplingStrategy(SamplingStrategy):
    """Strategy for stratified sampling.

    Samples each group of the stratification column separately so that the
    groups keep their relative proportions. Useful when data has important
    categorical groupings.

    Every stratum contributes at least one row, so the result can exceed the
    target size by up to one row per stratum. Stratifying on a column with
    very many distinct values therefore weakens the size cap.
    """

    name = "stratified"

    def __init__(self, stratify_column: str | None = None):
        """Initialize stratified sampling.

        Args:
            stratify_column: Column to stratify by
        """
        self.stratify_column = stratify_column

    def sample(
        self,
        lf: pl.LazyFrame,
        config: SamplingConfig,
        total_rows: int | None = None,
    ) -> SamplingResult:
        """Sample proportionally within each stratum.

        Args:
            lf: Source LazyFrame.
            config: Sampling configuration (target size, ``max_rows``, ``seed``).
            total_rows: Row count of ``lf`` if already known; estimated otherwise.

        Returns:
            SamplingResult with the per-stratum sample. Falls back to the
            pass-through result when the target covers every row, and to
            RandomSamplingStrategy when no stratify column is configured.
        """
        start_time = time.perf_counter()

        if total_rows is None:
            total_rows = self.estimate_row_count(lf)

        sample_size = config.calculate_required_sample_size(total_rows)
        if config.max_rows > 0:
            sample_size = min(sample_size, config.max_rows)

        if sample_size >= total_rows:
            return NoSamplingStrategy().sample(lf, config, total_rows)

        if not self.stratify_column:
            # Fallback to random sampling if no stratify column
            return RandomSamplingStrategy().sample(lf, config, total_rows)

        seed = config.seed if config.seed is not None else random.randint(0, 2**32 - 1)

        # Per-stratum quota = ceil(stratum_size * sample_size / total_rows),
        # computed in 64-bit integers to avoid float rounding and overflow.
        quota = (
            pl.col("__strat_size").cast(pl.Int64) * sample_size + (total_rows - 1)
        ) // total_rows

        # Previously the stratify column was never consulted: rows were filtered
        # by a global hash of the row index. Now the seeded hash orders the rows
        # pseudo-randomly inside each stratum and the first `quota` are kept.
        sampled_lf = (
            lf.with_row_index("__strat_idx")
            .with_columns(
                pl.col("__strat_idx")
                .hash(seed)
                .rank(method="ordinal")
                .over(self.stratify_column)
                .alias("__strat_rank"),
                pl.len().over(self.stratify_column).alias("__strat_size"),
            )
            .filter(pl.col("__strat_rank") <= quota)
            .drop(["__strat_idx", "__strat_rank", "__strat_size"])
        )

        elapsed_ms = (time.perf_counter() - start_time) * 1000

        # The reported sample size is the target; see the class docstring for
        # the bounded overshoot caused by per-stratum rounding.
        return SamplingResult(
            data=sampled_lf,
            metrics=self._create_metrics(
                original_size=total_rows,
                sample_size=sample_size,
                config=config,
                sampling_time_ms=elapsed_ms,
            ),
            is_sampled=True,
        )
805
+
806
+
807
class ReservoirSamplingStrategy(SamplingStrategy):
    """Approximation of reservoir sampling on a LazyFrame.

    Each row receives a priority derived from a seeded hash of its index; the
    frame is sorted by that priority and the first ``sample_size`` rows kept.
    """

    name = "reservoir"

    def sample(
        self,
        lf: pl.LazyFrame,
        config: SamplingConfig,
        total_rows: int | None = None,
    ) -> SamplingResult:
        """Keep the ``sample_size`` rows with the smallest hash-based priority."""
        started = time.perf_counter()

        # Polars has no native reservoir sampling, so the selection below is an
        # approximation built from a hash, a sort and a head.
        row_count = self.estimate_row_count(lf) if total_rows is None else total_rows

        wanted = config.calculate_required_sample_size(row_count)
        if config.max_rows > 0:
            wanted = min(wanted, config.max_rows)

        # The sample would cover every row: delegate to the pass-through strategy.
        if wanted >= row_count:
            return NoSamplingStrategy().sample(lf, config, row_count)

        if config.seed is None:
            seed = random.randint(0, 2**32 - 1)
        else:
            seed = config.seed

        # Priority is the negated logarithm of the hashed row index.
        priority = (-pl.col("__res_idx").hash(seed).log()).alias("__priority")
        ranked_lf = lf.with_row_index("__res_idx").with_columns(priority)
        picked_lf = (
            ranked_lf.sort("__priority")
            .head(wanted)
            .drop(["__res_idx", "__priority"])
        )

        duration_ms = 1000 * (time.perf_counter() - started)

        reservoir_metrics = self._create_metrics(
            original_size=row_count,
            sample_size=wanted,
            config=config,
            sampling_time_ms=duration_ms,
        )
        return SamplingResult(
            data=picked_lf, metrics=reservoir_metrics, is_sampled=True
        )
864
+
865
+
866
class AdaptiveSamplingStrategy(SamplingStrategy):
    """Meta-strategy that delegates to another strategy chosen by row count.

    The row count is compared against the small / medium / large dataset
    thresholds of the config:
    - up to the small threshold: no sampling
    - up to the medium threshold: systematic
    - up to the large threshold: random
    - above that: reservoir

    This is the recommended default strategy.
    """

    name = "adaptive"

    def __init__(self) -> None:
        """Instantiate the sub-strategies, keyed by their ``name``."""
        candidates = (
            NoSamplingStrategy(),
            HeadSamplingStrategy(),
            RandomSamplingStrategy(),
            SystematicSamplingStrategy(),
            ReservoirSamplingStrategy(),
        )
        self._strategies: dict[str, SamplingStrategy] = {
            candidate.name: candidate for candidate in candidates
        }

    def sample(
        self,
        lf: pl.LazyFrame,
        config: SamplingConfig,
        total_rows: int | None = None,
    ) -> SamplingResult:
        """Choose a sub-strategy, run it, and relabel its metrics."""
        started = time.perf_counter()

        row_count = self.estimate_row_count(lf) if total_rows is None else total_rows

        chosen = self._select_strategy(row_count, config)

        logger.debug(
            f"Adaptive sampling selected '{chosen.name}' "
            f"for {row_count:,} rows"
        )

        inner = chosen.sample(lf, config, row_count)

        # Timing covers selection plus the delegated call.
        duration_ms = 1000 * (time.perf_counter() - started)

        # Copy the delegated metrics, replacing only the strategy label and
        # the elapsed time.
        inner_metrics = inner.metrics
        relabelled = SamplingMetrics(
            original_size=inner_metrics.original_size,
            sample_size=inner_metrics.sample_size,
            sampling_ratio=inner_metrics.sampling_ratio,
            confidence_level=inner_metrics.confidence_level,
            margin_of_error=inner_metrics.margin_of_error,
            strategy_used=f"adaptive({chosen.name})",
            sampling_time_ms=duration_ms,
            memory_saved_estimate_mb=inner_metrics.memory_saved_estimate_mb,
        )
        return SamplingResult(
            data=inner.data,
            metrics=relabelled,
            is_sampled=inner.is_sampled,
        )

    def _select_strategy(
        self,
        total_rows: int,
        config: SamplingConfig,
    ) -> SamplingStrategy:
        """Map the row count onto one of the registered sub-strategies."""
        if total_rows <= config.small_dataset_threshold:
            # Small datasets: no sampling needed
            key = "none"
        elif total_rows <= config.medium_dataset_threshold:
            # Medium datasets: systematic
            key = "systematic"
        elif total_rows <= config.large_dataset_threshold:
            # Large datasets: random
            key = "random"
        else:
            # Very large datasets: reservoir
            key = "reservoir"
        return self._strategies[key]
950
+
951
+
952
+ # =============================================================================
953
+ # Sampling Strategy Registry
954
+ # =============================================================================
955
+
956
+
957
class SamplingStrategyRegistry:
    """Name-to-strategy lookup table guarded by a re-entrant lock.

    The built-in strategies are registered on construction; further
    strategies can be added under their ``name`` attribute.

    Example:
        registry = SamplingStrategyRegistry()
        registry.register(MyCustomStrategy())
        strategy = registry.get("custom")
    """

    def __init__(self) -> None:
        self._strategies: dict[str, SamplingStrategy] = {}
        self._lock = threading.RLock()
        self._register_defaults()

    def _register_defaults(self) -> None:
        """Register one instance of every built-in strategy."""
        builtin_classes = (
            NoSamplingStrategy,
            HeadSamplingStrategy,
            RandomSamplingStrategy,
            SystematicSamplingStrategy,
            HashSamplingStrategy,
            StratifiedSamplingStrategy,
            ReservoirSamplingStrategy,
            AdaptiveSamplingStrategy,
        )
        for strategy_cls in builtin_classes:
            self.register(strategy_cls())

    def register(self, strategy: SamplingStrategy) -> None:
        """Store ``strategy`` under its ``name``, replacing any previous entry."""
        with self._lock:
            self._strategies[strategy.name] = strategy
            logger.debug(f"Registered sampling strategy: {strategy.name}")

    def get(self, name: str) -> SamplingStrategy:
        """Look up a strategy by name.

        Args:
            name: Strategy name

        Returns:
            The strategy registered under ``name``

        Raises:
            KeyError: If no strategy is registered under ``name``
        """
        with self._lock:
            found = self._strategies.get(name)
            if found is None:
                available = list(self._strategies)
                raise KeyError(
                    f"Unknown sampling strategy: '{name}'. "
                    f"Available: {available}"
                )
            return found

    def get_or_default(
        self,
        name: str,
        default: SamplingStrategy | None = None,
    ) -> SamplingStrategy:
        """Look up a strategy, falling back to ``default`` or an adaptive one."""
        try:
            return self.get(name)
        except KeyError:
            if default:
                return default
            return AdaptiveSamplingStrategy()

    def list_strategies(self) -> list[str]:
        """Return the names of all registered strategies."""
        with self._lock:
            return [*self._strategies]

    def create_from_method(self, method: SamplingMethod) -> SamplingStrategy:
        """Resolve a SamplingMethod enum member through its ``value``."""
        return self.get(method.value)
1030
+
1031
+
1032
# Global registry instance, created at import time with the built-in
# strategies registered. Sampler uses it when no registry is passed in.
sampling_strategy_registry = SamplingStrategyRegistry()
1034
+
1035
+
1036
+ # =============================================================================
1037
+ # Data Size Estimator
1038
+ # =============================================================================
1039
+
1040
+
1041
class DataSizeEstimator:
    """Estimates data size for sampling decisions.

    Row counts are currently exact (a full count is collected); memory
    figures are extrapolated from the first rows of the frame.
    """

    @staticmethod
    def estimate_row_count(lf: pl.LazyFrame) -> int:
        """Return the row count of ``lf``.

        Args:
            lf: LazyFrame to estimate

        Returns:
            Row count obtained by collecting ``pl.len()``
        """
        # For now, use exact count
        # Future: Use file metadata for parquet, etc.
        return lf.select(pl.len()).collect().item()

    @staticmethod
    def estimate_memory_bytes(lf: pl.LazyFrame, sample_rows: int = 1000) -> int:
        """Estimate memory usage per row.

        Args:
            lf: LazyFrame to estimate
            sample_rows: Number of leading rows collected for the estimate

        Returns:
            Estimated bytes per row; 0 for an empty frame, and 100 when the
            estimate itself fails
        """
        try:
            sample = lf.head(sample_rows).collect()
            if len(sample) == 0:
                return 0

            return sample.estimated_size() // len(sample)
        except Exception as exc:
            # Best-effort estimate: keep the default of 100 bytes per row, but
            # record why the real estimate was unavailable instead of hiding it.
            logger.debug(
                "Row size estimation failed (%s); assuming 100 bytes per row", exc
            )
            return 100

    @staticmethod
    def estimate_total_memory_mb(
        lf: pl.LazyFrame,
        row_count: int | None = None,
    ) -> float:
        """Estimate total memory for full data.

        Args:
            lf: LazyFrame to estimate
            row_count: Pre-computed row count; counted from ``lf`` when omitted

        Returns:
            Estimated total memory in MB (row count times bytes per row)
        """
        if row_count is None:
            row_count = DataSizeEstimator.estimate_row_count(lf)

        bytes_per_row = DataSizeEstimator.estimate_memory_bytes(lf)
        total_bytes = row_count * bytes_per_row
        return total_bytes / (1024 * 1024)
1104
+
1105
+
1106
+ # =============================================================================
1107
+ # Sampler (Main Interface)
1108
+ # =============================================================================
1109
+
1110
+
1111
class Sampler:
    """Front door for sampling a LazyFrame.

    Resolves the strategy named by the configuration through a registry,
    counts the rows once, and runs the strategy. If the strategy raises, the
    configured fallback strategy is run instead.

    Example:
        sampler = Sampler(SamplingConfig.for_accuracy("high"))
        result = sampler.sample(lf)

        print(f"Sampled {result.metrics.sample_size:,} of "
              f"{result.metrics.original_size:,} rows")
        print(f"Strategy: {result.metrics.strategy_used}")
    """

    def __init__(
        self,
        config: SamplingConfig | None = None,
        registry: SamplingStrategyRegistry | None = None,
    ):
        """Create a sampler.

        Args:
            config: Sampling configuration (module default when omitted)
            registry: Strategy registry (global registry when omitted)
        """
        self.config = config if config else DEFAULT_SAMPLING_CONFIG
        self.registry = registry if registry else sampling_strategy_registry
        self._size_estimator = DataSizeEstimator()

    def sample(
        self,
        lf: pl.LazyFrame,
        config: SamplingConfig | None = None,
    ) -> SamplingResult:
        """Sample ``lf`` with the configured strategy.

        Args:
            lf: Source LazyFrame
            config: Configuration used for this call instead of the sampler's own

        Returns:
            SamplingResult with sampled data and metrics
        """
        config = config if config else self.config

        # Resolve the strategy first, then count the rows once for it.
        strategy = self.registry.create_from_method(config.strategy)
        row_count = self._size_estimator.estimate_row_count(lf)

        try:
            return strategy.sample(lf, config, row_count)
        except Exception as e:
            logger.warning(
                f"Sampling strategy '{strategy.name}' failed: {e}. "
                f"Falling back to '{config.fallback_strategy.value}'"
            )
            # Second attempt with the fallback strategy; its errors propagate.
            backup = self.registry.create_from_method(config.fallback_strategy)
            return backup.sample(lf, config, row_count)

    def sample_column(
        self,
        lf: pl.LazyFrame,
        column: str,
        config: SamplingConfig | None = None,
    ) -> SamplingResult:
        """Sample a single column of ``lf``.

        Args:
            lf: Source LazyFrame
            column: Name of the column to keep
            config: Configuration used for this call instead of the sampler's own

        Returns:
            SamplingResult whose data contains only ``column``
        """
        # Project to the one column before sampling.
        return self.sample(lf.select(pl.col(column)), config)
1194
+
1195
+
1196
+ # =============================================================================
1197
+ # Convenience Functions
1198
+ # =============================================================================
1199
+
1200
+
1201
def create_sampler(
    strategy: str | SamplingMethod = "adaptive",
    max_rows: int = 100_000,
    confidence_level: float = 0.95,
    **kwargs: Any,
) -> Sampler:
    """Build a Sampler from a handful of common settings.

    Args:
        strategy: Sampling strategy, as an enum member or its string value
        max_rows: Upper bound on the number of sampled rows
        confidence_level: Statistical confidence level
        **kwargs: Further keyword arguments forwarded to SamplingConfig

    Returns:
        Sampler wrapping the assembled configuration

    Example:
        sampler = create_sampler(strategy="random", max_rows=50_000)
        result = sampler.sample(lf)
    """
    # Strings are converted to the enum; enum members pass through unchanged.
    method = SamplingMethod(strategy) if isinstance(strategy, str) else strategy

    return Sampler(
        SamplingConfig(
            strategy=method,
            max_rows=max_rows,
            confidence_level=confidence_level,
            **kwargs,
        )
    )
1233
+
1234
+
1235
def sample_data(
    lf: pl.LazyFrame,
    max_rows: int = 100_000,
    strategy: str = "adaptive",
) -> SamplingResult:
    """One-call shortcut: build a sampler and apply it to ``lf``.

    Args:
        lf: LazyFrame to sample
        max_rows: Upper bound on the number of sampled rows
        strategy: Strategy name

    Returns:
        SamplingResult produced by the sampler

    Example:
        result = sample_data(lf, max_rows=50_000)
        sampled_lf = result.data
    """
    return create_sampler(strategy=strategy, max_rows=max_rows).sample(lf)
1256
+
1257
+
1258
def calculate_sample_size(
    population_size: int,
    confidence_level: float = 0.95,
    margin_of_error: float = 0.05,
    min_sample_size: int = 1,
) -> int:
    """Return the sample size required for the given statistical parameters.

    Builds a SamplingConfig from the arguments and delegates to its
    ``calculate_required_sample_size``. The default minimum of 1 leaves the
    statistical result effectively unconstrained from below.

    Args:
        population_size: Total population
        confidence_level: Confidence level (0-1)
        margin_of_error: Margin of error (0-1)
        min_sample_size: Minimum sample size (default 1 for pure statistical result)

    Returns:
        Required sample size

    Example:
        n = calculate_sample_size(1_000_000, confidence_level=0.99)
        print(f"Need {n:,} samples for 99% confidence")
    """
    settings = {
        "confidence_level": confidence_level,
        "margin_of_error": margin_of_error,
        "min_sample_size": min_sample_size,
    }
    return SamplingConfig(**settings).calculate_required_sample_size(population_size)