truthound 1.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (877) hide show
  1. truthound/__init__.py +162 -0
  2. truthound/adapters.py +100 -0
  3. truthound/api.py +365 -0
  4. truthound/audit/__init__.py +248 -0
  5. truthound/audit/core.py +967 -0
  6. truthound/audit/filters.py +620 -0
  7. truthound/audit/formatters.py +707 -0
  8. truthound/audit/logger.py +902 -0
  9. truthound/audit/middleware.py +571 -0
  10. truthound/audit/storage.py +1083 -0
  11. truthound/benchmark/__init__.py +123 -0
  12. truthound/benchmark/base.py +757 -0
  13. truthound/benchmark/comparison.py +635 -0
  14. truthound/benchmark/generators.py +706 -0
  15. truthound/benchmark/reporters.py +718 -0
  16. truthound/benchmark/runner.py +635 -0
  17. truthound/benchmark/scenarios.py +712 -0
  18. truthound/cache.py +252 -0
  19. truthound/checkpoint/__init__.py +136 -0
  20. truthound/checkpoint/actions/__init__.py +164 -0
  21. truthound/checkpoint/actions/base.py +324 -0
  22. truthound/checkpoint/actions/custom.py +234 -0
  23. truthound/checkpoint/actions/discord_notify.py +290 -0
  24. truthound/checkpoint/actions/email_notify.py +405 -0
  25. truthound/checkpoint/actions/github_action.py +406 -0
  26. truthound/checkpoint/actions/opsgenie.py +1499 -0
  27. truthound/checkpoint/actions/pagerduty.py +226 -0
  28. truthound/checkpoint/actions/slack_notify.py +233 -0
  29. truthound/checkpoint/actions/store_result.py +249 -0
  30. truthound/checkpoint/actions/teams_notify.py +1570 -0
  31. truthound/checkpoint/actions/telegram_notify.py +419 -0
  32. truthound/checkpoint/actions/update_docs.py +552 -0
  33. truthound/checkpoint/actions/webhook.py +293 -0
  34. truthound/checkpoint/analytics/__init__.py +147 -0
  35. truthound/checkpoint/analytics/aggregations/__init__.py +23 -0
  36. truthound/checkpoint/analytics/aggregations/rollup.py +481 -0
  37. truthound/checkpoint/analytics/aggregations/time_bucket.py +306 -0
  38. truthound/checkpoint/analytics/analyzers/__init__.py +17 -0
  39. truthound/checkpoint/analytics/analyzers/anomaly.py +386 -0
  40. truthound/checkpoint/analytics/analyzers/base.py +270 -0
  41. truthound/checkpoint/analytics/analyzers/forecast.py +421 -0
  42. truthound/checkpoint/analytics/analyzers/trend.py +314 -0
  43. truthound/checkpoint/analytics/models.py +292 -0
  44. truthound/checkpoint/analytics/protocols.py +549 -0
  45. truthound/checkpoint/analytics/service.py +718 -0
  46. truthound/checkpoint/analytics/stores/__init__.py +16 -0
  47. truthound/checkpoint/analytics/stores/base.py +306 -0
  48. truthound/checkpoint/analytics/stores/memory_store.py +353 -0
  49. truthound/checkpoint/analytics/stores/sqlite_store.py +557 -0
  50. truthound/checkpoint/analytics/stores/timescale_store.py +501 -0
  51. truthound/checkpoint/async_actions.py +794 -0
  52. truthound/checkpoint/async_base.py +708 -0
  53. truthound/checkpoint/async_checkpoint.py +617 -0
  54. truthound/checkpoint/async_runner.py +639 -0
  55. truthound/checkpoint/checkpoint.py +527 -0
  56. truthound/checkpoint/ci/__init__.py +61 -0
  57. truthound/checkpoint/ci/detector.py +355 -0
  58. truthound/checkpoint/ci/reporter.py +436 -0
  59. truthound/checkpoint/ci/templates.py +454 -0
  60. truthound/checkpoint/circuitbreaker/__init__.py +133 -0
  61. truthound/checkpoint/circuitbreaker/breaker.py +542 -0
  62. truthound/checkpoint/circuitbreaker/core.py +252 -0
  63. truthound/checkpoint/circuitbreaker/detection.py +459 -0
  64. truthound/checkpoint/circuitbreaker/middleware.py +389 -0
  65. truthound/checkpoint/circuitbreaker/registry.py +357 -0
  66. truthound/checkpoint/distributed/__init__.py +139 -0
  67. truthound/checkpoint/distributed/backends/__init__.py +35 -0
  68. truthound/checkpoint/distributed/backends/celery_backend.py +503 -0
  69. truthound/checkpoint/distributed/backends/kubernetes_backend.py +696 -0
  70. truthound/checkpoint/distributed/backends/local_backend.py +397 -0
  71. truthound/checkpoint/distributed/backends/ray_backend.py +625 -0
  72. truthound/checkpoint/distributed/base.py +774 -0
  73. truthound/checkpoint/distributed/orchestrator.py +765 -0
  74. truthound/checkpoint/distributed/protocols.py +842 -0
  75. truthound/checkpoint/distributed/registry.py +449 -0
  76. truthound/checkpoint/idempotency/__init__.py +120 -0
  77. truthound/checkpoint/idempotency/core.py +295 -0
  78. truthound/checkpoint/idempotency/fingerprint.py +454 -0
  79. truthound/checkpoint/idempotency/locking.py +604 -0
  80. truthound/checkpoint/idempotency/service.py +592 -0
  81. truthound/checkpoint/idempotency/stores.py +653 -0
  82. truthound/checkpoint/monitoring/__init__.py +134 -0
  83. truthound/checkpoint/monitoring/aggregators/__init__.py +15 -0
  84. truthound/checkpoint/monitoring/aggregators/base.py +372 -0
  85. truthound/checkpoint/monitoring/aggregators/realtime.py +300 -0
  86. truthound/checkpoint/monitoring/aggregators/window.py +493 -0
  87. truthound/checkpoint/monitoring/collectors/__init__.py +17 -0
  88. truthound/checkpoint/monitoring/collectors/base.py +257 -0
  89. truthound/checkpoint/monitoring/collectors/memory_collector.py +617 -0
  90. truthound/checkpoint/monitoring/collectors/prometheus_collector.py +451 -0
  91. truthound/checkpoint/monitoring/collectors/redis_collector.py +518 -0
  92. truthound/checkpoint/monitoring/events.py +410 -0
  93. truthound/checkpoint/monitoring/protocols.py +636 -0
  94. truthound/checkpoint/monitoring/service.py +578 -0
  95. truthound/checkpoint/monitoring/views/__init__.py +17 -0
  96. truthound/checkpoint/monitoring/views/base.py +172 -0
  97. truthound/checkpoint/monitoring/views/queue_view.py +220 -0
  98. truthound/checkpoint/monitoring/views/task_view.py +240 -0
  99. truthound/checkpoint/monitoring/views/worker_view.py +263 -0
  100. truthound/checkpoint/registry.py +337 -0
  101. truthound/checkpoint/runner.py +356 -0
  102. truthound/checkpoint/transaction/__init__.py +133 -0
  103. truthound/checkpoint/transaction/base.py +389 -0
  104. truthound/checkpoint/transaction/compensatable.py +537 -0
  105. truthound/checkpoint/transaction/coordinator.py +576 -0
  106. truthound/checkpoint/transaction/executor.py +622 -0
  107. truthound/checkpoint/transaction/idempotency.py +534 -0
  108. truthound/checkpoint/transaction/saga/__init__.py +143 -0
  109. truthound/checkpoint/transaction/saga/builder.py +584 -0
  110. truthound/checkpoint/transaction/saga/definition.py +515 -0
  111. truthound/checkpoint/transaction/saga/event_store.py +542 -0
  112. truthound/checkpoint/transaction/saga/patterns.py +833 -0
  113. truthound/checkpoint/transaction/saga/runner.py +718 -0
  114. truthound/checkpoint/transaction/saga/state_machine.py +793 -0
  115. truthound/checkpoint/transaction/saga/strategies.py +780 -0
  116. truthound/checkpoint/transaction/saga/testing.py +886 -0
  117. truthound/checkpoint/triggers/__init__.py +58 -0
  118. truthound/checkpoint/triggers/base.py +237 -0
  119. truthound/checkpoint/triggers/event.py +385 -0
  120. truthound/checkpoint/triggers/schedule.py +355 -0
  121. truthound/cli.py +2358 -0
  122. truthound/cli_modules/__init__.py +124 -0
  123. truthound/cli_modules/advanced/__init__.py +45 -0
  124. truthound/cli_modules/advanced/benchmark.py +343 -0
  125. truthound/cli_modules/advanced/docs.py +225 -0
  126. truthound/cli_modules/advanced/lineage.py +209 -0
  127. truthound/cli_modules/advanced/ml.py +320 -0
  128. truthound/cli_modules/advanced/realtime.py +196 -0
  129. truthound/cli_modules/checkpoint/__init__.py +46 -0
  130. truthound/cli_modules/checkpoint/init.py +114 -0
  131. truthound/cli_modules/checkpoint/list.py +71 -0
  132. truthound/cli_modules/checkpoint/run.py +159 -0
  133. truthound/cli_modules/checkpoint/validate.py +67 -0
  134. truthound/cli_modules/common/__init__.py +71 -0
  135. truthound/cli_modules/common/errors.py +414 -0
  136. truthound/cli_modules/common/options.py +419 -0
  137. truthound/cli_modules/common/output.py +507 -0
  138. truthound/cli_modules/common/protocol.py +552 -0
  139. truthound/cli_modules/core/__init__.py +48 -0
  140. truthound/cli_modules/core/check.py +123 -0
  141. truthound/cli_modules/core/compare.py +104 -0
  142. truthound/cli_modules/core/learn.py +57 -0
  143. truthound/cli_modules/core/mask.py +77 -0
  144. truthound/cli_modules/core/profile.py +65 -0
  145. truthound/cli_modules/core/scan.py +61 -0
  146. truthound/cli_modules/profiler/__init__.py +51 -0
  147. truthound/cli_modules/profiler/auto_profile.py +175 -0
  148. truthound/cli_modules/profiler/metadata.py +107 -0
  149. truthound/cli_modules/profiler/suite.py +283 -0
  150. truthound/cli_modules/registry.py +431 -0
  151. truthound/cli_modules/scaffolding/__init__.py +89 -0
  152. truthound/cli_modules/scaffolding/base.py +631 -0
  153. truthound/cli_modules/scaffolding/commands.py +545 -0
  154. truthound/cli_modules/scaffolding/plugins.py +1072 -0
  155. truthound/cli_modules/scaffolding/reporters.py +594 -0
  156. truthound/cli_modules/scaffolding/validators.py +1127 -0
  157. truthound/common/__init__.py +18 -0
  158. truthound/common/resilience/__init__.py +130 -0
  159. truthound/common/resilience/bulkhead.py +266 -0
  160. truthound/common/resilience/circuit_breaker.py +516 -0
  161. truthound/common/resilience/composite.py +332 -0
  162. truthound/common/resilience/config.py +292 -0
  163. truthound/common/resilience/protocols.py +217 -0
  164. truthound/common/resilience/rate_limiter.py +404 -0
  165. truthound/common/resilience/retry.py +341 -0
  166. truthound/datadocs/__init__.py +260 -0
  167. truthound/datadocs/base.py +571 -0
  168. truthound/datadocs/builder.py +761 -0
  169. truthound/datadocs/charts.py +764 -0
  170. truthound/datadocs/dashboard/__init__.py +63 -0
  171. truthound/datadocs/dashboard/app.py +576 -0
  172. truthound/datadocs/dashboard/components.py +584 -0
  173. truthound/datadocs/dashboard/state.py +240 -0
  174. truthound/datadocs/engine/__init__.py +46 -0
  175. truthound/datadocs/engine/context.py +376 -0
  176. truthound/datadocs/engine/pipeline.py +618 -0
  177. truthound/datadocs/engine/registry.py +469 -0
  178. truthound/datadocs/exporters/__init__.py +49 -0
  179. truthound/datadocs/exporters/base.py +198 -0
  180. truthound/datadocs/exporters/html.py +178 -0
  181. truthound/datadocs/exporters/json_exporter.py +253 -0
  182. truthound/datadocs/exporters/markdown.py +284 -0
  183. truthound/datadocs/exporters/pdf.py +392 -0
  184. truthound/datadocs/i18n/__init__.py +86 -0
  185. truthound/datadocs/i18n/catalog.py +960 -0
  186. truthound/datadocs/i18n/formatting.py +505 -0
  187. truthound/datadocs/i18n/loader.py +256 -0
  188. truthound/datadocs/i18n/plurals.py +378 -0
  189. truthound/datadocs/renderers/__init__.py +42 -0
  190. truthound/datadocs/renderers/base.py +401 -0
  191. truthound/datadocs/renderers/custom.py +342 -0
  192. truthound/datadocs/renderers/jinja.py +697 -0
  193. truthound/datadocs/sections.py +736 -0
  194. truthound/datadocs/styles.py +931 -0
  195. truthound/datadocs/themes/__init__.py +101 -0
  196. truthound/datadocs/themes/base.py +336 -0
  197. truthound/datadocs/themes/default.py +417 -0
  198. truthound/datadocs/themes/enterprise.py +419 -0
  199. truthound/datadocs/themes/loader.py +336 -0
  200. truthound/datadocs/themes.py +301 -0
  201. truthound/datadocs/transformers/__init__.py +57 -0
  202. truthound/datadocs/transformers/base.py +268 -0
  203. truthound/datadocs/transformers/enrichers.py +544 -0
  204. truthound/datadocs/transformers/filters.py +447 -0
  205. truthound/datadocs/transformers/i18n.py +468 -0
  206. truthound/datadocs/versioning/__init__.py +62 -0
  207. truthound/datadocs/versioning/diff.py +639 -0
  208. truthound/datadocs/versioning/storage.py +497 -0
  209. truthound/datadocs/versioning/version.py +358 -0
  210. truthound/datasources/__init__.py +223 -0
  211. truthound/datasources/_async_protocols.py +222 -0
  212. truthound/datasources/_protocols.py +159 -0
  213. truthound/datasources/adapters.py +428 -0
  214. truthound/datasources/async_base.py +599 -0
  215. truthound/datasources/async_factory.py +511 -0
  216. truthound/datasources/base.py +516 -0
  217. truthound/datasources/factory.py +433 -0
  218. truthound/datasources/nosql/__init__.py +47 -0
  219. truthound/datasources/nosql/base.py +487 -0
  220. truthound/datasources/nosql/elasticsearch.py +801 -0
  221. truthound/datasources/nosql/mongodb.py +636 -0
  222. truthound/datasources/pandas_optimized.py +582 -0
  223. truthound/datasources/pandas_source.py +216 -0
  224. truthound/datasources/polars_source.py +395 -0
  225. truthound/datasources/spark_source.py +479 -0
  226. truthound/datasources/sql/__init__.py +154 -0
  227. truthound/datasources/sql/base.py +710 -0
  228. truthound/datasources/sql/bigquery.py +410 -0
  229. truthound/datasources/sql/cloud_base.py +199 -0
  230. truthound/datasources/sql/databricks.py +471 -0
  231. truthound/datasources/sql/mysql.py +316 -0
  232. truthound/datasources/sql/oracle.py +427 -0
  233. truthound/datasources/sql/postgresql.py +321 -0
  234. truthound/datasources/sql/redshift.py +479 -0
  235. truthound/datasources/sql/snowflake.py +439 -0
  236. truthound/datasources/sql/sqlite.py +286 -0
  237. truthound/datasources/sql/sqlserver.py +437 -0
  238. truthound/datasources/streaming/__init__.py +47 -0
  239. truthound/datasources/streaming/base.py +350 -0
  240. truthound/datasources/streaming/kafka.py +670 -0
  241. truthound/decorators.py +98 -0
  242. truthound/docs/__init__.py +69 -0
  243. truthound/docs/extractor.py +971 -0
  244. truthound/docs/generator.py +601 -0
  245. truthound/docs/parser.py +1037 -0
  246. truthound/docs/renderer.py +999 -0
  247. truthound/drift/__init__.py +22 -0
  248. truthound/drift/compare.py +189 -0
  249. truthound/drift/detectors.py +464 -0
  250. truthound/drift/report.py +160 -0
  251. truthound/execution/__init__.py +65 -0
  252. truthound/execution/_protocols.py +324 -0
  253. truthound/execution/base.py +576 -0
  254. truthound/execution/distributed/__init__.py +179 -0
  255. truthound/execution/distributed/aggregations.py +731 -0
  256. truthound/execution/distributed/arrow_bridge.py +817 -0
  257. truthound/execution/distributed/base.py +550 -0
  258. truthound/execution/distributed/dask_engine.py +976 -0
  259. truthound/execution/distributed/mixins.py +766 -0
  260. truthound/execution/distributed/protocols.py +756 -0
  261. truthound/execution/distributed/ray_engine.py +1127 -0
  262. truthound/execution/distributed/registry.py +446 -0
  263. truthound/execution/distributed/spark_engine.py +1011 -0
  264. truthound/execution/distributed/validator_adapter.py +682 -0
  265. truthound/execution/pandas_engine.py +401 -0
  266. truthound/execution/polars_engine.py +497 -0
  267. truthound/execution/pushdown/__init__.py +230 -0
  268. truthound/execution/pushdown/ast.py +1550 -0
  269. truthound/execution/pushdown/builder.py +1550 -0
  270. truthound/execution/pushdown/dialects.py +1072 -0
  271. truthound/execution/pushdown/executor.py +829 -0
  272. truthound/execution/pushdown/optimizer.py +1041 -0
  273. truthound/execution/sql_engine.py +518 -0
  274. truthound/infrastructure/__init__.py +189 -0
  275. truthound/infrastructure/audit.py +1515 -0
  276. truthound/infrastructure/config.py +1133 -0
  277. truthound/infrastructure/encryption.py +1132 -0
  278. truthound/infrastructure/logging.py +1503 -0
  279. truthound/infrastructure/metrics.py +1220 -0
  280. truthound/lineage/__init__.py +89 -0
  281. truthound/lineage/base.py +746 -0
  282. truthound/lineage/impact_analysis.py +474 -0
  283. truthound/lineage/integrations/__init__.py +22 -0
  284. truthound/lineage/integrations/openlineage.py +548 -0
  285. truthound/lineage/tracker.py +512 -0
  286. truthound/lineage/visualization/__init__.py +33 -0
  287. truthound/lineage/visualization/protocols.py +145 -0
  288. truthound/lineage/visualization/renderers/__init__.py +20 -0
  289. truthound/lineage/visualization/renderers/cytoscape.py +329 -0
  290. truthound/lineage/visualization/renderers/d3.py +331 -0
  291. truthound/lineage/visualization/renderers/graphviz.py +276 -0
  292. truthound/lineage/visualization/renderers/mermaid.py +308 -0
  293. truthound/maskers.py +113 -0
  294. truthound/ml/__init__.py +124 -0
  295. truthound/ml/anomaly_models/__init__.py +31 -0
  296. truthound/ml/anomaly_models/ensemble.py +362 -0
  297. truthound/ml/anomaly_models/isolation_forest.py +444 -0
  298. truthound/ml/anomaly_models/statistical.py +392 -0
  299. truthound/ml/base.py +1178 -0
  300. truthound/ml/drift_detection/__init__.py +26 -0
  301. truthound/ml/drift_detection/concept.py +381 -0
  302. truthound/ml/drift_detection/distribution.py +361 -0
  303. truthound/ml/drift_detection/feature.py +442 -0
  304. truthound/ml/drift_detection/multivariate.py +495 -0
  305. truthound/ml/monitoring/__init__.py +88 -0
  306. truthound/ml/monitoring/alerting/__init__.py +33 -0
  307. truthound/ml/monitoring/alerting/handlers.py +427 -0
  308. truthound/ml/monitoring/alerting/rules.py +508 -0
  309. truthound/ml/monitoring/collectors/__init__.py +19 -0
  310. truthound/ml/monitoring/collectors/composite.py +105 -0
  311. truthound/ml/monitoring/collectors/drift.py +324 -0
  312. truthound/ml/monitoring/collectors/performance.py +179 -0
  313. truthound/ml/monitoring/collectors/quality.py +369 -0
  314. truthound/ml/monitoring/monitor.py +536 -0
  315. truthound/ml/monitoring/protocols.py +451 -0
  316. truthound/ml/monitoring/stores/__init__.py +15 -0
  317. truthound/ml/monitoring/stores/memory.py +201 -0
  318. truthound/ml/monitoring/stores/prometheus.py +296 -0
  319. truthound/ml/rule_learning/__init__.py +25 -0
  320. truthound/ml/rule_learning/constraint_miner.py +443 -0
  321. truthound/ml/rule_learning/pattern_learner.py +499 -0
  322. truthound/ml/rule_learning/profile_learner.py +462 -0
  323. truthound/multitenancy/__init__.py +326 -0
  324. truthound/multitenancy/core.py +852 -0
  325. truthound/multitenancy/integration.py +597 -0
  326. truthound/multitenancy/isolation.py +630 -0
  327. truthound/multitenancy/manager.py +770 -0
  328. truthound/multitenancy/middleware.py +765 -0
  329. truthound/multitenancy/quota.py +537 -0
  330. truthound/multitenancy/resolvers.py +603 -0
  331. truthound/multitenancy/storage.py +703 -0
  332. truthound/observability/__init__.py +307 -0
  333. truthound/observability/context.py +531 -0
  334. truthound/observability/instrumentation.py +611 -0
  335. truthound/observability/logging.py +887 -0
  336. truthound/observability/metrics.py +1157 -0
  337. truthound/observability/tracing/__init__.py +178 -0
  338. truthound/observability/tracing/baggage.py +310 -0
  339. truthound/observability/tracing/config.py +426 -0
  340. truthound/observability/tracing/exporter.py +787 -0
  341. truthound/observability/tracing/integration.py +1018 -0
  342. truthound/observability/tracing/otel/__init__.py +146 -0
  343. truthound/observability/tracing/otel/adapter.py +982 -0
  344. truthound/observability/tracing/otel/bridge.py +1177 -0
  345. truthound/observability/tracing/otel/compat.py +681 -0
  346. truthound/observability/tracing/otel/config.py +691 -0
  347. truthound/observability/tracing/otel/detection.py +327 -0
  348. truthound/observability/tracing/otel/protocols.py +426 -0
  349. truthound/observability/tracing/processor.py +561 -0
  350. truthound/observability/tracing/propagator.py +757 -0
  351. truthound/observability/tracing/provider.py +569 -0
  352. truthound/observability/tracing/resource.py +515 -0
  353. truthound/observability/tracing/sampler.py +487 -0
  354. truthound/observability/tracing/span.py +676 -0
  355. truthound/plugins/__init__.py +198 -0
  356. truthound/plugins/base.py +599 -0
  357. truthound/plugins/cli.py +680 -0
  358. truthound/plugins/dependencies/__init__.py +42 -0
  359. truthound/plugins/dependencies/graph.py +422 -0
  360. truthound/plugins/dependencies/resolver.py +417 -0
  361. truthound/plugins/discovery.py +379 -0
  362. truthound/plugins/docs/__init__.py +46 -0
  363. truthound/plugins/docs/extractor.py +444 -0
  364. truthound/plugins/docs/renderer.py +499 -0
  365. truthound/plugins/enterprise_manager.py +877 -0
  366. truthound/plugins/examples/__init__.py +19 -0
  367. truthound/plugins/examples/custom_validators.py +317 -0
  368. truthound/plugins/examples/slack_notifier.py +312 -0
  369. truthound/plugins/examples/xml_reporter.py +254 -0
  370. truthound/plugins/hooks.py +558 -0
  371. truthound/plugins/lifecycle/__init__.py +43 -0
  372. truthound/plugins/lifecycle/hot_reload.py +402 -0
  373. truthound/plugins/lifecycle/manager.py +371 -0
  374. truthound/plugins/manager.py +736 -0
  375. truthound/plugins/registry.py +338 -0
  376. truthound/plugins/security/__init__.py +93 -0
  377. truthound/plugins/security/exceptions.py +332 -0
  378. truthound/plugins/security/policies.py +348 -0
  379. truthound/plugins/security/protocols.py +643 -0
  380. truthound/plugins/security/sandbox/__init__.py +45 -0
  381. truthound/plugins/security/sandbox/context.py +158 -0
  382. truthound/plugins/security/sandbox/engines/__init__.py +19 -0
  383. truthound/plugins/security/sandbox/engines/container.py +379 -0
  384. truthound/plugins/security/sandbox/engines/noop.py +144 -0
  385. truthound/plugins/security/sandbox/engines/process.py +336 -0
  386. truthound/plugins/security/sandbox/factory.py +211 -0
  387. truthound/plugins/security/signing/__init__.py +57 -0
  388. truthound/plugins/security/signing/service.py +330 -0
  389. truthound/plugins/security/signing/trust_store.py +368 -0
  390. truthound/plugins/security/signing/verifier.py +459 -0
  391. truthound/plugins/versioning/__init__.py +41 -0
  392. truthound/plugins/versioning/constraints.py +297 -0
  393. truthound/plugins/versioning/resolver.py +329 -0
  394. truthound/profiler/__init__.py +1729 -0
  395. truthound/profiler/_lazy.py +452 -0
  396. truthound/profiler/ab_testing/__init__.py +80 -0
  397. truthound/profiler/ab_testing/analysis.py +449 -0
  398. truthound/profiler/ab_testing/base.py +257 -0
  399. truthound/profiler/ab_testing/experiment.py +395 -0
  400. truthound/profiler/ab_testing/tracking.py +368 -0
  401. truthound/profiler/auto_threshold.py +1170 -0
  402. truthound/profiler/base.py +579 -0
  403. truthound/profiler/cache_patterns.py +911 -0
  404. truthound/profiler/caching.py +1303 -0
  405. truthound/profiler/column_profiler.py +712 -0
  406. truthound/profiler/comparison.py +1007 -0
  407. truthound/profiler/custom_patterns.py +1170 -0
  408. truthound/profiler/dashboard/__init__.py +50 -0
  409. truthound/profiler/dashboard/app.py +476 -0
  410. truthound/profiler/dashboard/components.py +457 -0
  411. truthound/profiler/dashboard/config.py +72 -0
  412. truthound/profiler/distributed/__init__.py +83 -0
  413. truthound/profiler/distributed/base.py +281 -0
  414. truthound/profiler/distributed/dask_backend.py +498 -0
  415. truthound/profiler/distributed/local_backend.py +293 -0
  416. truthound/profiler/distributed/profiler.py +304 -0
  417. truthound/profiler/distributed/ray_backend.py +374 -0
  418. truthound/profiler/distributed/spark_backend.py +375 -0
  419. truthound/profiler/distributed.py +1366 -0
  420. truthound/profiler/enterprise_sampling.py +1065 -0
  421. truthound/profiler/errors.py +488 -0
  422. truthound/profiler/evolution/__init__.py +91 -0
  423. truthound/profiler/evolution/alerts.py +426 -0
  424. truthound/profiler/evolution/changes.py +206 -0
  425. truthound/profiler/evolution/compatibility.py +365 -0
  426. truthound/profiler/evolution/detector.py +372 -0
  427. truthound/profiler/evolution/protocols.py +121 -0
  428. truthound/profiler/generators/__init__.py +48 -0
  429. truthound/profiler/generators/base.py +384 -0
  430. truthound/profiler/generators/ml_rules.py +375 -0
  431. truthound/profiler/generators/pattern_rules.py +384 -0
  432. truthound/profiler/generators/schema_rules.py +267 -0
  433. truthound/profiler/generators/stats_rules.py +324 -0
  434. truthound/profiler/generators/suite_generator.py +857 -0
  435. truthound/profiler/i18n.py +1542 -0
  436. truthound/profiler/incremental.py +554 -0
  437. truthound/profiler/incremental_validation.py +1710 -0
  438. truthound/profiler/integration/__init__.py +73 -0
  439. truthound/profiler/integration/adapters.py +345 -0
  440. truthound/profiler/integration/context.py +371 -0
  441. truthound/profiler/integration/executor.py +527 -0
  442. truthound/profiler/integration/naming.py +75 -0
  443. truthound/profiler/integration/protocols.py +243 -0
  444. truthound/profiler/memory.py +1185 -0
  445. truthound/profiler/migration/__init__.py +60 -0
  446. truthound/profiler/migration/base.py +345 -0
  447. truthound/profiler/migration/manager.py +444 -0
  448. truthound/profiler/migration/v1_0_to_v1_1.py +484 -0
  449. truthound/profiler/ml/__init__.py +73 -0
  450. truthound/profiler/ml/base.py +244 -0
  451. truthound/profiler/ml/classifier.py +507 -0
  452. truthound/profiler/ml/feature_extraction.py +604 -0
  453. truthound/profiler/ml/pretrained.py +448 -0
  454. truthound/profiler/ml_inference.py +1276 -0
  455. truthound/profiler/native_patterns.py +815 -0
  456. truthound/profiler/observability.py +1184 -0
  457. truthound/profiler/process_timeout.py +1566 -0
  458. truthound/profiler/progress.py +568 -0
  459. truthound/profiler/progress_callbacks.py +1734 -0
  460. truthound/profiler/quality.py +1345 -0
  461. truthound/profiler/resilience.py +1180 -0
  462. truthound/profiler/sampled_matcher.py +794 -0
  463. truthound/profiler/sampling.py +1288 -0
  464. truthound/profiler/scheduling/__init__.py +82 -0
  465. truthound/profiler/scheduling/protocols.py +214 -0
  466. truthound/profiler/scheduling/scheduler.py +474 -0
  467. truthound/profiler/scheduling/storage.py +457 -0
  468. truthound/profiler/scheduling/triggers.py +449 -0
  469. truthound/profiler/schema.py +603 -0
  470. truthound/profiler/streaming.py +685 -0
  471. truthound/profiler/streaming_patterns.py +1354 -0
  472. truthound/profiler/suite_cli.py +625 -0
  473. truthound/profiler/suite_config.py +789 -0
  474. truthound/profiler/suite_export.py +1268 -0
  475. truthound/profiler/table_profiler.py +547 -0
  476. truthound/profiler/timeout.py +565 -0
  477. truthound/profiler/validation.py +1532 -0
  478. truthound/profiler/visualization/__init__.py +118 -0
  479. truthound/profiler/visualization/base.py +346 -0
  480. truthound/profiler/visualization/generator.py +1259 -0
  481. truthound/profiler/visualization/plotly_renderer.py +811 -0
  482. truthound/profiler/visualization/renderers.py +669 -0
  483. truthound/profiler/visualization/sections.py +540 -0
  484. truthound/profiler/visualization.py +2122 -0
  485. truthound/profiler/yaml_validation.py +1151 -0
  486. truthound/py.typed +0 -0
  487. truthound/ratelimit/__init__.py +248 -0
  488. truthound/ratelimit/algorithms.py +1108 -0
  489. truthound/ratelimit/core.py +573 -0
  490. truthound/ratelimit/integration.py +532 -0
  491. truthound/ratelimit/limiter.py +663 -0
  492. truthound/ratelimit/middleware.py +700 -0
  493. truthound/ratelimit/policy.py +792 -0
  494. truthound/ratelimit/storage.py +763 -0
  495. truthound/rbac/__init__.py +340 -0
  496. truthound/rbac/core.py +976 -0
  497. truthound/rbac/integration.py +760 -0
  498. truthound/rbac/manager.py +1052 -0
  499. truthound/rbac/middleware.py +842 -0
  500. truthound/rbac/policy.py +954 -0
  501. truthound/rbac/storage.py +878 -0
  502. truthound/realtime/__init__.py +141 -0
  503. truthound/realtime/adapters/__init__.py +43 -0
  504. truthound/realtime/adapters/base.py +533 -0
  505. truthound/realtime/adapters/kafka.py +487 -0
  506. truthound/realtime/adapters/kinesis.py +479 -0
  507. truthound/realtime/adapters/mock.py +243 -0
  508. truthound/realtime/base.py +553 -0
  509. truthound/realtime/factory.py +382 -0
  510. truthound/realtime/incremental.py +660 -0
  511. truthound/realtime/processing/__init__.py +67 -0
  512. truthound/realtime/processing/exactly_once.py +575 -0
  513. truthound/realtime/processing/state.py +547 -0
  514. truthound/realtime/processing/windows.py +647 -0
  515. truthound/realtime/protocols.py +569 -0
  516. truthound/realtime/streaming.py +605 -0
  517. truthound/realtime/testing/__init__.py +32 -0
  518. truthound/realtime/testing/containers.py +615 -0
  519. truthound/realtime/testing/fixtures.py +484 -0
  520. truthound/report.py +280 -0
  521. truthound/reporters/__init__.py +46 -0
  522. truthound/reporters/_protocols.py +30 -0
  523. truthound/reporters/base.py +324 -0
  524. truthound/reporters/ci/__init__.py +66 -0
  525. truthound/reporters/ci/azure.py +436 -0
  526. truthound/reporters/ci/base.py +509 -0
  527. truthound/reporters/ci/bitbucket.py +567 -0
  528. truthound/reporters/ci/circleci.py +547 -0
  529. truthound/reporters/ci/detection.py +364 -0
  530. truthound/reporters/ci/factory.py +182 -0
  531. truthound/reporters/ci/github.py +388 -0
  532. truthound/reporters/ci/gitlab.py +471 -0
  533. truthound/reporters/ci/jenkins.py +525 -0
  534. truthound/reporters/console_reporter.py +299 -0
  535. truthound/reporters/factory.py +211 -0
  536. truthound/reporters/html_reporter.py +524 -0
  537. truthound/reporters/json_reporter.py +256 -0
  538. truthound/reporters/markdown_reporter.py +280 -0
  539. truthound/reporters/sdk/__init__.py +174 -0
  540. truthound/reporters/sdk/builder.py +558 -0
  541. truthound/reporters/sdk/mixins.py +1150 -0
  542. truthound/reporters/sdk/schema.py +1493 -0
  543. truthound/reporters/sdk/templates.py +666 -0
  544. truthound/reporters/sdk/testing.py +968 -0
  545. truthound/scanners.py +170 -0
  546. truthound/scheduling/__init__.py +122 -0
  547. truthound/scheduling/cron.py +1136 -0
  548. truthound/scheduling/presets.py +212 -0
  549. truthound/schema.py +275 -0
  550. truthound/secrets/__init__.py +173 -0
  551. truthound/secrets/base.py +618 -0
  552. truthound/secrets/cloud.py +682 -0
  553. truthound/secrets/integration.py +507 -0
  554. truthound/secrets/manager.py +633 -0
  555. truthound/secrets/oidc/__init__.py +172 -0
  556. truthound/secrets/oidc/base.py +902 -0
  557. truthound/secrets/oidc/credential_provider.py +623 -0
  558. truthound/secrets/oidc/exchangers.py +1001 -0
  559. truthound/secrets/oidc/github/__init__.py +110 -0
  560. truthound/secrets/oidc/github/claims.py +718 -0
  561. truthound/secrets/oidc/github/enhanced_provider.py +693 -0
  562. truthound/secrets/oidc/github/trust_policy.py +742 -0
  563. truthound/secrets/oidc/github/verification.py +723 -0
  564. truthound/secrets/oidc/github/workflow.py +691 -0
  565. truthound/secrets/oidc/providers.py +825 -0
  566. truthound/secrets/providers.py +506 -0
  567. truthound/secrets/resolver.py +495 -0
  568. truthound/stores/__init__.py +177 -0
  569. truthound/stores/backends/__init__.py +18 -0
  570. truthound/stores/backends/_protocols.py +340 -0
  571. truthound/stores/backends/azure_blob.py +530 -0
  572. truthound/stores/backends/concurrent_filesystem.py +915 -0
  573. truthound/stores/backends/connection_pool.py +1365 -0
  574. truthound/stores/backends/database.py +743 -0
  575. truthound/stores/backends/filesystem.py +538 -0
  576. truthound/stores/backends/gcs.py +399 -0
  577. truthound/stores/backends/memory.py +354 -0
  578. truthound/stores/backends/s3.py +434 -0
  579. truthound/stores/backpressure/__init__.py +84 -0
  580. truthound/stores/backpressure/base.py +375 -0
  581. truthound/stores/backpressure/circuit_breaker.py +434 -0
  582. truthound/stores/backpressure/monitor.py +376 -0
  583. truthound/stores/backpressure/strategies.py +677 -0
  584. truthound/stores/base.py +551 -0
  585. truthound/stores/batching/__init__.py +65 -0
  586. truthound/stores/batching/base.py +305 -0
  587. truthound/stores/batching/buffer.py +370 -0
  588. truthound/stores/batching/store.py +248 -0
  589. truthound/stores/batching/writer.py +521 -0
  590. truthound/stores/caching/__init__.py +60 -0
  591. truthound/stores/caching/backends.py +684 -0
  592. truthound/stores/caching/base.py +356 -0
  593. truthound/stores/caching/store.py +305 -0
  594. truthound/stores/compression/__init__.py +193 -0
  595. truthound/stores/compression/adaptive.py +694 -0
  596. truthound/stores/compression/base.py +514 -0
  597. truthound/stores/compression/pipeline.py +868 -0
  598. truthound/stores/compression/providers.py +672 -0
  599. truthound/stores/compression/streaming.py +832 -0
  600. truthound/stores/concurrency/__init__.py +81 -0
  601. truthound/stores/concurrency/atomic.py +556 -0
  602. truthound/stores/concurrency/index.py +775 -0
  603. truthound/stores/concurrency/locks.py +576 -0
  604. truthound/stores/concurrency/manager.py +482 -0
  605. truthound/stores/encryption/__init__.py +297 -0
  606. truthound/stores/encryption/base.py +952 -0
  607. truthound/stores/encryption/keys.py +1191 -0
  608. truthound/stores/encryption/pipeline.py +903 -0
  609. truthound/stores/encryption/providers.py +953 -0
  610. truthound/stores/encryption/streaming.py +950 -0
  611. truthound/stores/expectations.py +227 -0
  612. truthound/stores/factory.py +246 -0
  613. truthound/stores/migration/__init__.py +75 -0
  614. truthound/stores/migration/base.py +480 -0
  615. truthound/stores/migration/manager.py +347 -0
  616. truthound/stores/migration/registry.py +382 -0
  617. truthound/stores/migration/store.py +559 -0
  618. truthound/stores/observability/__init__.py +106 -0
  619. truthound/stores/observability/audit.py +718 -0
  620. truthound/stores/observability/config.py +270 -0
  621. truthound/stores/observability/factory.py +208 -0
  622. truthound/stores/observability/metrics.py +636 -0
  623. truthound/stores/observability/protocols.py +410 -0
  624. truthound/stores/observability/store.py +570 -0
  625. truthound/stores/observability/tracing.py +784 -0
  626. truthound/stores/replication/__init__.py +76 -0
  627. truthound/stores/replication/base.py +260 -0
  628. truthound/stores/replication/monitor.py +269 -0
  629. truthound/stores/replication/store.py +439 -0
  630. truthound/stores/replication/syncer.py +391 -0
  631. truthound/stores/results.py +359 -0
  632. truthound/stores/retention/__init__.py +77 -0
  633. truthound/stores/retention/base.py +378 -0
  634. truthound/stores/retention/policies.py +621 -0
  635. truthound/stores/retention/scheduler.py +279 -0
  636. truthound/stores/retention/store.py +526 -0
  637. truthound/stores/streaming/__init__.py +138 -0
  638. truthound/stores/streaming/base.py +801 -0
  639. truthound/stores/streaming/database.py +984 -0
  640. truthound/stores/streaming/filesystem.py +719 -0
  641. truthound/stores/streaming/reader.py +629 -0
  642. truthound/stores/streaming/s3.py +843 -0
  643. truthound/stores/streaming/writer.py +790 -0
  644. truthound/stores/tiering/__init__.py +108 -0
  645. truthound/stores/tiering/base.py +462 -0
  646. truthound/stores/tiering/manager.py +249 -0
  647. truthound/stores/tiering/policies.py +692 -0
  648. truthound/stores/tiering/store.py +526 -0
  649. truthound/stores/versioning/__init__.py +56 -0
  650. truthound/stores/versioning/base.py +376 -0
  651. truthound/stores/versioning/store.py +660 -0
  652. truthound/stores/versioning/strategies.py +353 -0
  653. truthound/types.py +56 -0
  654. truthound/validators/__init__.py +774 -0
  655. truthound/validators/aggregate/__init__.py +27 -0
  656. truthound/validators/aggregate/central.py +116 -0
  657. truthound/validators/aggregate/extremes.py +116 -0
  658. truthound/validators/aggregate/spread.py +118 -0
  659. truthound/validators/aggregate/sum.py +64 -0
  660. truthound/validators/aggregate/type.py +78 -0
  661. truthound/validators/anomaly/__init__.py +93 -0
  662. truthound/validators/anomaly/base.py +431 -0
  663. truthound/validators/anomaly/ml_based.py +1190 -0
  664. truthound/validators/anomaly/multivariate.py +647 -0
  665. truthound/validators/anomaly/statistical.py +599 -0
  666. truthound/validators/base.py +1089 -0
  667. truthound/validators/business_rule/__init__.py +46 -0
  668. truthound/validators/business_rule/base.py +147 -0
  669. truthound/validators/business_rule/checksum.py +509 -0
  670. truthound/validators/business_rule/financial.py +526 -0
  671. truthound/validators/cache.py +733 -0
  672. truthound/validators/completeness/__init__.py +39 -0
  673. truthound/validators/completeness/conditional.py +73 -0
  674. truthound/validators/completeness/default.py +98 -0
  675. truthound/validators/completeness/empty.py +103 -0
  676. truthound/validators/completeness/nan.py +337 -0
  677. truthound/validators/completeness/null.py +152 -0
  678. truthound/validators/cross_table/__init__.py +17 -0
  679. truthound/validators/cross_table/aggregate.py +333 -0
  680. truthound/validators/cross_table/row_count.py +122 -0
  681. truthound/validators/datetime/__init__.py +29 -0
  682. truthound/validators/datetime/format.py +78 -0
  683. truthound/validators/datetime/freshness.py +269 -0
  684. truthound/validators/datetime/order.py +73 -0
  685. truthound/validators/datetime/parseable.py +185 -0
  686. truthound/validators/datetime/range.py +202 -0
  687. truthound/validators/datetime/timezone.py +69 -0
  688. truthound/validators/distribution/__init__.py +49 -0
  689. truthound/validators/distribution/distribution.py +128 -0
  690. truthound/validators/distribution/monotonic.py +119 -0
  691. truthound/validators/distribution/outlier.py +178 -0
  692. truthound/validators/distribution/quantile.py +80 -0
  693. truthound/validators/distribution/range.py +254 -0
  694. truthound/validators/distribution/set.py +125 -0
  695. truthound/validators/distribution/statistical.py +459 -0
  696. truthound/validators/drift/__init__.py +79 -0
  697. truthound/validators/drift/base.py +427 -0
  698. truthound/validators/drift/multi_feature.py +401 -0
  699. truthound/validators/drift/numeric.py +395 -0
  700. truthound/validators/drift/psi.py +446 -0
  701. truthound/validators/drift/statistical.py +510 -0
  702. truthound/validators/enterprise.py +1658 -0
  703. truthound/validators/geospatial/__init__.py +80 -0
  704. truthound/validators/geospatial/base.py +97 -0
  705. truthound/validators/geospatial/boundary.py +238 -0
  706. truthound/validators/geospatial/coordinate.py +351 -0
  707. truthound/validators/geospatial/distance.py +399 -0
  708. truthound/validators/geospatial/polygon.py +665 -0
  709. truthound/validators/i18n/__init__.py +308 -0
  710. truthound/validators/i18n/bidi.py +571 -0
  711. truthound/validators/i18n/catalogs.py +570 -0
  712. truthound/validators/i18n/dialects.py +763 -0
  713. truthound/validators/i18n/extended_catalogs.py +549 -0
  714. truthound/validators/i18n/formatting.py +1434 -0
  715. truthound/validators/i18n/loader.py +1020 -0
  716. truthound/validators/i18n/messages.py +521 -0
  717. truthound/validators/i18n/plural.py +683 -0
  718. truthound/validators/i18n/protocols.py +855 -0
  719. truthound/validators/i18n/tms.py +1162 -0
  720. truthound/validators/localization/__init__.py +53 -0
  721. truthound/validators/localization/base.py +122 -0
  722. truthound/validators/localization/chinese.py +362 -0
  723. truthound/validators/localization/japanese.py +275 -0
  724. truthound/validators/localization/korean.py +524 -0
  725. truthound/validators/memory/__init__.py +94 -0
  726. truthound/validators/memory/approximate_knn.py +506 -0
  727. truthound/validators/memory/base.py +547 -0
  728. truthound/validators/memory/sgd_online.py +719 -0
  729. truthound/validators/memory/streaming_ecdf.py +753 -0
  730. truthound/validators/ml_feature/__init__.py +54 -0
  731. truthound/validators/ml_feature/base.py +249 -0
  732. truthound/validators/ml_feature/correlation.py +299 -0
  733. truthound/validators/ml_feature/leakage.py +344 -0
  734. truthound/validators/ml_feature/null_impact.py +270 -0
  735. truthound/validators/ml_feature/scale.py +264 -0
  736. truthound/validators/multi_column/__init__.py +89 -0
  737. truthound/validators/multi_column/arithmetic.py +284 -0
  738. truthound/validators/multi_column/base.py +231 -0
  739. truthound/validators/multi_column/comparison.py +273 -0
  740. truthound/validators/multi_column/consistency.py +312 -0
  741. truthound/validators/multi_column/statistical.py +299 -0
  742. truthound/validators/optimization/__init__.py +164 -0
  743. truthound/validators/optimization/aggregation.py +563 -0
  744. truthound/validators/optimization/covariance.py +556 -0
  745. truthound/validators/optimization/geo.py +626 -0
  746. truthound/validators/optimization/graph.py +587 -0
  747. truthound/validators/optimization/orchestrator.py +970 -0
  748. truthound/validators/optimization/profiling.py +1312 -0
  749. truthound/validators/privacy/__init__.py +223 -0
  750. truthound/validators/privacy/base.py +635 -0
  751. truthound/validators/privacy/ccpa.py +670 -0
  752. truthound/validators/privacy/gdpr.py +728 -0
  753. truthound/validators/privacy/global_patterns.py +604 -0
  754. truthound/validators/privacy/plugins.py +867 -0
  755. truthound/validators/profiling/__init__.py +52 -0
  756. truthound/validators/profiling/base.py +175 -0
  757. truthound/validators/profiling/cardinality.py +312 -0
  758. truthound/validators/profiling/entropy.py +391 -0
  759. truthound/validators/profiling/frequency.py +455 -0
  760. truthound/validators/pushdown_support.py +660 -0
  761. truthound/validators/query/__init__.py +91 -0
  762. truthound/validators/query/aggregate.py +346 -0
  763. truthound/validators/query/base.py +246 -0
  764. truthound/validators/query/column.py +249 -0
  765. truthound/validators/query/expression.py +274 -0
  766. truthound/validators/query/result.py +323 -0
  767. truthound/validators/query/row_count.py +264 -0
  768. truthound/validators/referential/__init__.py +80 -0
  769. truthound/validators/referential/base.py +395 -0
  770. truthound/validators/referential/cascade.py +391 -0
  771. truthound/validators/referential/circular.py +563 -0
  772. truthound/validators/referential/foreign_key.py +624 -0
  773. truthound/validators/referential/orphan.py +485 -0
  774. truthound/validators/registry.py +112 -0
  775. truthound/validators/schema/__init__.py +41 -0
  776. truthound/validators/schema/column_count.py +142 -0
  777. truthound/validators/schema/column_exists.py +80 -0
  778. truthound/validators/schema/column_order.py +82 -0
  779. truthound/validators/schema/column_pair.py +85 -0
  780. truthound/validators/schema/column_pair_set.py +195 -0
  781. truthound/validators/schema/column_type.py +94 -0
  782. truthound/validators/schema/multi_column.py +53 -0
  783. truthound/validators/schema/multi_column_aggregate.py +175 -0
  784. truthound/validators/schema/referential.py +274 -0
  785. truthound/validators/schema/table_schema.py +91 -0
  786. truthound/validators/schema_validator.py +219 -0
  787. truthound/validators/sdk/__init__.py +250 -0
  788. truthound/validators/sdk/builder.py +680 -0
  789. truthound/validators/sdk/decorators.py +474 -0
  790. truthound/validators/sdk/enterprise/__init__.py +211 -0
  791. truthound/validators/sdk/enterprise/docs.py +725 -0
  792. truthound/validators/sdk/enterprise/fuzzing.py +659 -0
  793. truthound/validators/sdk/enterprise/licensing.py +709 -0
  794. truthound/validators/sdk/enterprise/manager.py +543 -0
  795. truthound/validators/sdk/enterprise/resources.py +628 -0
  796. truthound/validators/sdk/enterprise/sandbox.py +766 -0
  797. truthound/validators/sdk/enterprise/signing.py +603 -0
  798. truthound/validators/sdk/enterprise/templates.py +865 -0
  799. truthound/validators/sdk/enterprise/versioning.py +659 -0
  800. truthound/validators/sdk/templates.py +757 -0
  801. truthound/validators/sdk/testing.py +807 -0
  802. truthound/validators/security/__init__.py +181 -0
  803. truthound/validators/security/redos/__init__.py +182 -0
  804. truthound/validators/security/redos/core.py +861 -0
  805. truthound/validators/security/redos/cpu_monitor.py +593 -0
  806. truthound/validators/security/redos/cve_database.py +791 -0
  807. truthound/validators/security/redos/ml/__init__.py +155 -0
  808. truthound/validators/security/redos/ml/base.py +785 -0
  809. truthound/validators/security/redos/ml/datasets.py +618 -0
  810. truthound/validators/security/redos/ml/features.py +359 -0
  811. truthound/validators/security/redos/ml/models.py +1000 -0
  812. truthound/validators/security/redos/ml/predictor.py +507 -0
  813. truthound/validators/security/redos/ml/storage.py +632 -0
  814. truthound/validators/security/redos/ml/training.py +571 -0
  815. truthound/validators/security/redos/ml_analyzer.py +937 -0
  816. truthound/validators/security/redos/optimizer.py +674 -0
  817. truthound/validators/security/redos/profiler.py +682 -0
  818. truthound/validators/security/redos/re2_engine.py +709 -0
  819. truthound/validators/security/redos.py +886 -0
  820. truthound/validators/security/sql_security.py +1247 -0
  821. truthound/validators/streaming/__init__.py +126 -0
  822. truthound/validators/streaming/base.py +292 -0
  823. truthound/validators/streaming/completeness.py +210 -0
  824. truthound/validators/streaming/mixin.py +575 -0
  825. truthound/validators/streaming/range.py +308 -0
  826. truthound/validators/streaming/sources.py +846 -0
  827. truthound/validators/string/__init__.py +57 -0
  828. truthound/validators/string/casing.py +158 -0
  829. truthound/validators/string/charset.py +96 -0
  830. truthound/validators/string/format.py +501 -0
  831. truthound/validators/string/json.py +77 -0
  832. truthound/validators/string/json_schema.py +184 -0
  833. truthound/validators/string/length.py +104 -0
  834. truthound/validators/string/like_pattern.py +237 -0
  835. truthound/validators/string/regex.py +202 -0
  836. truthound/validators/string/regex_extended.py +435 -0
  837. truthound/validators/table/__init__.py +88 -0
  838. truthound/validators/table/base.py +78 -0
  839. truthound/validators/table/column_count.py +198 -0
  840. truthound/validators/table/freshness.py +362 -0
  841. truthound/validators/table/row_count.py +251 -0
  842. truthound/validators/table/schema.py +333 -0
  843. truthound/validators/table/size.py +285 -0
  844. truthound/validators/timeout/__init__.py +102 -0
  845. truthound/validators/timeout/advanced/__init__.py +247 -0
  846. truthound/validators/timeout/advanced/circuit_breaker.py +675 -0
  847. truthound/validators/timeout/advanced/prediction.py +773 -0
  848. truthound/validators/timeout/advanced/priority.py +618 -0
  849. truthound/validators/timeout/advanced/redis_backend.py +770 -0
  850. truthound/validators/timeout/advanced/retry.py +721 -0
  851. truthound/validators/timeout/advanced/sampling.py +788 -0
  852. truthound/validators/timeout/advanced/sla.py +661 -0
  853. truthound/validators/timeout/advanced/telemetry.py +804 -0
  854. truthound/validators/timeout/cascade.py +477 -0
  855. truthound/validators/timeout/deadline.py +657 -0
  856. truthound/validators/timeout/degradation.py +525 -0
  857. truthound/validators/timeout/distributed.py +597 -0
  858. truthound/validators/timeseries/__init__.py +89 -0
  859. truthound/validators/timeseries/base.py +326 -0
  860. truthound/validators/timeseries/completeness.py +617 -0
  861. truthound/validators/timeseries/gap.py +485 -0
  862. truthound/validators/timeseries/monotonic.py +310 -0
  863. truthound/validators/timeseries/seasonality.py +422 -0
  864. truthound/validators/timeseries/trend.py +510 -0
  865. truthound/validators/uniqueness/__init__.py +59 -0
  866. truthound/validators/uniqueness/approximate.py +475 -0
  867. truthound/validators/uniqueness/distinct_values.py +253 -0
  868. truthound/validators/uniqueness/duplicate.py +118 -0
  869. truthound/validators/uniqueness/primary_key.py +140 -0
  870. truthound/validators/uniqueness/unique.py +191 -0
  871. truthound/validators/uniqueness/within_record.py +599 -0
  872. truthound/validators/utils.py +756 -0
  873. truthound-1.0.8.dist-info/METADATA +474 -0
  874. truthound-1.0.8.dist-info/RECORD +877 -0
  875. truthound-1.0.8.dist-info/WHEEL +4 -0
  876. truthound-1.0.8.dist-info/entry_points.txt +2 -0
  877. truthound-1.0.8.dist-info/licenses/LICENSE +190 -0
@@ -0,0 +1,1065 @@
1
+ """Enterprise-grade sampling strategies for 100M+ scale datasets.
2
+
3
+ This module extends the base sampling framework with optimizations for
4
+ extremely large datasets that cannot fit in memory.
5
+
6
+ Key Features:
7
+ - Block-based parallel sampling for distributed processing
8
+ - Memory-aware adaptive sampling with backpressure
9
+ - Multi-stage sampling for ultra-large datasets
10
+ - Statistical quality guarantees with confidence bounds
11
+ - Time-budget aware sampling
12
+ - Column-type aware optimization
13
+
14
+ Design Principles:
15
+ - O(1) memory footprint regardless of data size
16
+ - Streaming-first architecture
17
+ - Progressive refinement (quick estimates → accurate results)
18
+ - Fail-safe with graceful degradation
19
+
20
+ Scale Targets:
21
+ - 100M+ rows: Block-based sampling
22
+ - 1B+ rows: Multi-stage hierarchical sampling
23
+ - 10B+ rows: Probabilistic sketches (HyperLogLog, Count-Min)
24
+
25
+ Usage:
26
+ from truthound.profiler.enterprise_sampling import (
27
+ EnterpriseScaleSampler,
28
+ BlockSamplingStrategy,
29
+ EnterpriseScaleConfig, MemoryBudgetConfig,
30
+ )
31
+
32
+ # For 100M+ rows
33
+ config = EnterpriseScaleConfig(
34
+ target_rows=100_000,
35
+ memory_budget=MemoryBudgetConfig(max_memory_mb=512),
36
+ time_budget_seconds=60,
37
+ )
38
+ sampler = EnterpriseScaleSampler(config)
39
+ result = sampler.sample(lf)
40
+ """
41
+
42
+ from __future__ import annotations
43
+
44
+ import gc
45
+ import logging
46
+ import math
47
+ import os
48
+ import random
49
+ import time
50
+ import threading
51
+ from abc import ABC, abstractmethod
52
+ from concurrent.futures import ThreadPoolExecutor, as_completed
53
+ from dataclasses import dataclass, field
54
+ from enum import Enum, auto
55
+ from typing import Any, Callable, Iterator, TypeVar, Generic
56
+
57
+ import polars as pl
58
+
59
+ from truthound.profiler.sampling import (
60
+ SamplingConfig,
61
+ SamplingMetrics,
62
+ SamplingResult,
63
+ SamplingStrategy,
64
+ SamplingMethod,
65
+ DEFAULT_SAMPLING_CONFIG,
66
+ )
67
+
68
+
69
+ logger = logging.getLogger(__name__)
70
+
71
+
72
+ # =============================================================================
73
+ # Constants and Configuration
74
+ # =============================================================================
75
+
76
# Row-count boundaries used to classify dataset scale
LARGE_SCALE_THRESHOLD = 10 ** 7    # 10M rows
XLARGE_SCALE_THRESHOLD = 10 ** 8   # 100M rows
XXLARGE_SCALE_THRESHOLD = 10 ** 9  # 1B rows

# Default rows-per-block for the large and extra-large scales
DEFAULT_BLOCK_SIZE_LARGE = 10 ** 6       # 1M rows per block
DEFAULT_BLOCK_SIZE_XLARGE = 5 * 10 ** 6  # 5M rows per block

# Memory estimation constants
BYTES_PER_ROW_ESTIMATE = 200  # Conservative estimate
MB = 2 ** 20  # bytes per mebibyte (1024 * 1024)
GB = 2 ** 30  # bytes per gibibyte (1024 * MB)
89
+
90
+
91
class ScaleCategory(Enum):
    """Size bucket a dataset falls into, ordered from smallest to largest."""

    # Explicit values match what auto() assigns (1, 2, 3, ...).
    SMALL = 1    # fewer than 1M rows
    MEDIUM = 2   # 1M up to 10M rows
    LARGE = 3    # 10M up to 100M rows
    XLARGE = 4   # 100M up to 1B rows
    XXLARGE = 5  # 1B rows and above
98
+
99
+
100
class SamplingQuality(Enum):
    """Quality tier requested from a sampling run, cheapest first."""

    # Explicit values match what auto() assigns (1, 2, 3, ...).
    SKETCH = 1    # Fast approximation (HyperLogLog-level)
    QUICK = 2     # Quick estimate (90% confidence)
    STANDARD = 3  # Standard quality (95% confidence)
    HIGH = 4      # High quality (99% confidence)
    EXACT = 5     # Full scan (100% accuracy)
107
+
108
+
109
+ # =============================================================================
110
+ # Memory Budget Configuration
111
+ # =============================================================================
112
+
113
@dataclass
class MemoryBudgetConfig:
    """Configuration for memory-aware sampling.

    Attributes:
        max_memory_mb: Maximum memory to use
        reserved_memory_mb: Memory to keep free for system
        gc_threshold_mb: Trigger GC when approaching this limit
        enable_monitoring: Enable continuous memory monitoring
        backpressure_enabled: Enable backpressure when memory is low
    """

    max_memory_mb: int = 1024  # 1GB default
    reserved_memory_mb: int = 256
    gc_threshold_mb: int = 768
    enable_monitoring: bool = True
    backpressure_enabled: bool = True

    @property
    def available_memory_mb(self) -> int:
        """Return the budget left for sampling: max minus reserved memory."""
        return self.max_memory_mb - self.reserved_memory_mb

    @classmethod
    def from_max_memory(cls, max_memory_mb: int) -> "MemoryBudgetConfig":
        """Build a config whose reserve and GC threshold scale with the budget.

        Uses the same proportions as the class defaults (1024 / 256 / 768):
        one quarter of the budget is reserved and GC triggers at 75% of it.

        Args:
            max_memory_mb: Total memory budget in MB.

        Returns:
            A config with proportional ``reserved_memory_mb`` and
            ``gc_threshold_mb``.
        """
        return cls(
            max_memory_mb=max_memory_mb,
            reserved_memory_mb=max_memory_mb // 4,
            gc_threshold_mb=int(max_memory_mb * 0.75),
        )

    @classmethod
    def auto_detect(cls) -> "MemoryBudgetConfig":
        """Auto-detect memory budget based on system resources.

        Returns:
            A config sized to 25% of currently available memory (capped at
            4096 MB), or the conservative defaults when ``psutil`` is not
            installed.
        """
        try:
            import psutil
        except ImportError:
            # Fallback to conservative defaults
            return cls()

        available_mb = psutil.virtual_memory().available // MB
        # Use 25% of available memory, max 4GB
        return cls.from_max_memory(min(available_mb // 4, 4096))

    @classmethod
    def for_scale(cls, scale: ScaleCategory) -> "MemoryBudgetConfig":
        """Create config appropriate for data scale.

        The reserve and GC threshold are derived from each scale's budget.
        Keeping the fixed defaults (reserved 256, GC 768) would leave SMALL
        with zero available memory and put the GC threshold above the whole
        budget for SMALL and MEDIUM.

        Args:
            scale: Dataset scale category.

        Returns:
            A proportional config for known scales; the defaults otherwise.
        """
        budgets_mb = {
            ScaleCategory.SMALL: 256,
            ScaleCategory.MEDIUM: 512,
            ScaleCategory.LARGE: 1024,
            ScaleCategory.XLARGE: 2048,
            ScaleCategory.XXLARGE: 4096,
        }
        max_mb = budgets_mb.get(scale)
        if max_mb is None:
            return cls()
        return cls.from_max_memory(max_mb)
165
+
166
+
167
+ # =============================================================================
168
+ # Enterprise Scale Configuration
169
+ # =============================================================================
170
+
171
@dataclass
class EnterpriseScaleConfig:
    """Configuration for enterprise-scale sampling.

    Attributes:
        target_rows: Target number of rows to sample. Must be positive,
            except that 0 is accepted together with ``SamplingQuality.EXACT``
            (the value ``for_quality("exact")`` produces).
        memory_budget: Memory budget configuration
        time_budget_seconds: Maximum time for sampling (0 = unlimited)
        quality: Desired sampling quality
        block_size: Rows per processing block (0 = auto)
        max_parallel_blocks: Maximum parallel block processing
        enable_progressive: Enable progressive refinement
        seed: Random seed for reproducibility
        confidence_level: Statistical confidence level (default 0.95)
        margin_of_error: Accepted margin of error (default 0.05)
        min_sample_ratio: Lower bound on the sampled fraction (default 0.1%)
        max_sample_ratio: Upper bound on the sampled fraction (default 10%)
    """

    target_rows: int = 100_000
    memory_budget: MemoryBudgetConfig = field(default_factory=MemoryBudgetConfig)
    time_budget_seconds: float = 0.0  # 0 = unlimited
    quality: SamplingQuality = SamplingQuality.STANDARD
    block_size: int = 0  # 0 = auto-detect
    max_parallel_blocks: int = 4
    enable_progressive: bool = True
    seed: int | None = None

    # Statistical parameters
    confidence_level: float = 0.95
    margin_of_error: float = 0.05

    # Adaptive parameters
    min_sample_ratio: float = 0.001  # At least 0.1%
    max_sample_ratio: float = 0.10  # At most 10%

    def __post_init__(self) -> None:
        """Validate field values.

        Raises:
            ValueError: If ``target_rows`` is negative, or zero for any
                quality other than EXACT, or if ``time_budget_seconds`` is
                negative.
        """
        # for_quality("exact") builds a config with target_rows=0; rejecting
        # it here made that factory call always raise.
        # NOTE(review): consumers are presumably expected to branch on
        # quality == EXACT rather than use target_rows=0 as a row count;
        # confirm against the samplers.
        full_scan = self.quality is SamplingQuality.EXACT
        if self.target_rows < 0 or (self.target_rows == 0 and not full_scan):
            raise ValueError(f"target_rows must be positive, got {self.target_rows}")
        if self.time_budget_seconds < 0:
            raise ValueError("time_budget_seconds must be non-negative")

    def get_block_size(self, total_rows: int) -> int:
        """Get optimal block size for given data size.

        Args:
            total_rows: Number of rows in the dataset.

        Returns:
            The configured ``block_size`` when it is positive; otherwise a
            size chosen from the dataset's scale. The auto-detected size is
            always at least 1 so callers can divide by it.
        """
        if self.block_size > 0:
            return self.block_size

        # Auto-detect based on scale
        scale = self.classify_scale(total_rows)
        if scale in (ScaleCategory.SMALL, ScaleCategory.MEDIUM):
            # Clamp so an empty dataset does not yield a zero block size.
            return max(1, min(total_rows, 1_000_000))
        if scale == ScaleCategory.LARGE:
            return DEFAULT_BLOCK_SIZE_LARGE
        return DEFAULT_BLOCK_SIZE_XLARGE

    @staticmethod
    def classify_scale(total_rows: int) -> ScaleCategory:
        """Classify data scale from a row count."""
        if total_rows < 1_000_000:
            return ScaleCategory.SMALL
        if total_rows < LARGE_SCALE_THRESHOLD:
            return ScaleCategory.MEDIUM
        if total_rows < XLARGE_SCALE_THRESHOLD:
            return ScaleCategory.LARGE
        if total_rows < XXLARGE_SCALE_THRESHOLD:
            return ScaleCategory.XLARGE
        return ScaleCategory.XXLARGE

    @classmethod
    def for_quality(cls, quality: str) -> "EnterpriseScaleConfig":
        """Create config for specific quality level.

        Args:
            quality: One of "sketch", "quick", "standard", "high", "exact".
                Any other value falls back to the "standard" preset.

        Returns:
            A config with the preset's target rows, quality, confidence
            level and margin of error; all other fields keep their defaults.
        """
        # (quality, target_rows, confidence_level, margin_of_error)
        quality_map = {
            "sketch": (SamplingQuality.SKETCH, 10_000, 0.90, 0.15),
            "quick": (SamplingQuality.QUICK, 50_000, 0.90, 0.10),
            "standard": (SamplingQuality.STANDARD, 100_000, 0.95, 0.05),
            "high": (SamplingQuality.HIGH, 500_000, 0.99, 0.02),
            "exact": (SamplingQuality.EXACT, 0, 1.0, 0.0),
        }
        q, target, conf, margin = quality_map.get(quality, quality_map["standard"])
        return cls(
            target_rows=target,
            quality=q,
            confidence_level=conf,
            margin_of_error=margin,
        )
253
+
254
+
255
+ # =============================================================================
256
+ # Block Sampling Result
257
+ # =============================================================================
258
+
259
@dataclass(frozen=True)
class BlockSamplingMetrics(SamplingMetrics):
    """SamplingMetrics extended with block-level fields for block sampling."""

    blocks_processed: int = 0
    blocks_skipped: int = 0
    parallel_efficiency: float = 1.0
    memory_peak_mb: float = 0.0
    time_per_block_ms: float = 0.0

    def to_dict(self) -> dict[str, Any]:
        """Return the parent's dict with the block-level fields added."""
        payload = super().to_dict()
        payload.update(
            blocks_processed=self.blocks_processed,
            blocks_skipped=self.blocks_skipped,
            parallel_efficiency=self.parallel_efficiency,
            memory_peak_mb=self.memory_peak_mb,
            time_per_block_ms=self.time_per_block_ms,
        )
        return payload
278
+
279
+
280
@dataclass
class ProgressiveResult:
    """Snapshot of a progressive sampling run after some refinement stages."""

    current_estimate: SamplingResult
    stages_completed: int
    total_stages: int
    converged: bool
    convergence_delta: float

    @property
    def is_final(self) -> bool:
        """True once every stage has run or the estimate has converged."""
        if self.stages_completed >= self.total_stages:
            return True
        return self.converged
292
+
293
+
294
+ # =============================================================================
295
+ # Memory Monitor
296
+ # =============================================================================
297
+
298
class MemoryMonitor:
    """Tracks process memory against a budget and signals GC/backpressure."""

    def __init__(self, config: MemoryBudgetConfig):
        self._current_mb: float = 0.0
        self._peak_mb: float = 0.0
        self._lock = threading.Lock()
        self.config = config

    def get_current_mb(self) -> float:
        """Return this process's resident set size in MB.

        Returns 0.0 when ``psutil`` cannot be imported.
        """
        try:
            import psutil
            rss_bytes = psutil.Process(os.getpid()).memory_info().rss
            return rss_bytes / MB
        except ImportError:
            return 0.0

    def update(self) -> None:
        """Take a fresh reading and raise the recorded peak if exceeded."""
        with self._lock:
            reading = self.get_current_mb()
            self._current_mb = reading
            if reading > self._peak_mb:
                self._peak_mb = reading

    def should_gc(self) -> bool:
        """Refresh the reading; True if it exceeds the GC threshold."""
        self.update()
        return self._current_mb > self.config.gc_threshold_mb

    def should_backpressure(self) -> bool:
        """Refresh the reading; True if it exceeds the available budget.

        Always False (without taking a reading) when backpressure is disabled.
        """
        if self.config.backpressure_enabled:
            self.update()
            return self._current_mb > self.config.available_memory_mb
        return False

    def trigger_gc(self) -> None:
        """Run a garbage collection pass, then refresh the reading."""
        gc.collect()
        self.update()

    @property
    def peak_mb(self) -> float:
        """Highest reading recorded so far, in MB."""
        return self._peak_mb

    @property
    def current_mb(self) -> float:
        """Most recent reading, in MB."""
        return self._current_mb
346
+
347
+
348
+ # =============================================================================
349
+ # Time Budget Manager
350
+ # =============================================================================
351
+
352
class TimeBudgetManager:
    """Manages time budget for sampling operations.

    The clock starts when the manager is constructed. A budget of zero or
    less means "unlimited": the budget never expires and every block may be
    processed.
    """

    def __init__(self, budget_seconds: float) -> None:
        """Start the clock.

        Args:
            budget_seconds: Time allowed in seconds; <= 0 disables the limit.
        """
        self.budget_seconds = budget_seconds
        self.start_time = time.perf_counter()
        # (name, elapsed seconds at the time of the checkpoint)
        self._checkpoints: list[tuple[str, float]] = []

    @property
    def elapsed_seconds(self) -> float:
        """Seconds elapsed since construction."""
        return time.perf_counter() - self.start_time

    @property
    def remaining_seconds(self) -> float:
        """Seconds left in the budget, never negative; ``inf`` if unlimited."""
        if self.budget_seconds <= 0:
            return float("inf")
        # 0.0 rather than 0 so the result is a float, as annotated.
        return max(0.0, self.budget_seconds - self.elapsed_seconds)

    @property
    def is_expired(self) -> bool:
        """True once elapsed time reaches the budget; never if unlimited."""
        if self.budget_seconds <= 0:
            return False
        return self.elapsed_seconds >= self.budget_seconds

    @property
    def budget_ratio_used(self) -> float:
        """Fraction of the budget consumed, capped at 1.0; 0.0 if unlimited."""
        if self.budget_seconds <= 0:
            return 0.0
        return min(1.0, self.elapsed_seconds / self.budget_seconds)

    def checkpoint(self, name: str) -> None:
        """Record ``name`` together with the current elapsed time."""
        self._checkpoints.append((name, self.elapsed_seconds))

    def can_process_block(self, estimated_block_time: float) -> bool:
        """Check if there's enough time budget to process another block.

        Args:
            estimated_block_time: Expected seconds needed for one block.

        Returns:
            True when unlimited, or when the remaining time exceeds the
            estimate with a 50% safety margin.
        """
        if self.budget_seconds <= 0:
            return True
        return self.remaining_seconds > estimated_block_time * 1.5
390
+
391
+
392
+ # =============================================================================
393
+ # Block-Based Sampling Strategy
394
+ # =============================================================================
395
+
396
class BlockSamplingStrategy(SamplingStrategy):
    """Block-based sampling for very large datasets.

    Divides data into blocks and samples from each block proportionally,
    which spreads the sample evenly over the input.

    Algorithm:
    1. Divide data into N blocks of fixed size
    2. Calculate samples needed per block (proportional allocation)
    3. For each block, build a lazy slice filtered by a seeded hash of the
       in-block row index (the time budget and memory pressure are checked
       before each block)
    4. Concatenate the per-block lazy frames
    """

    name = "block"

    def __init__(
        self,
        config: EnterpriseScaleConfig | None = None,
    ):
        """Store the configuration and create a memory monitor from it.

        Args:
            config: Scale configuration; a default ``EnterpriseScaleConfig``
                is created when omitted.
        """
        self.config = config or EnterpriseScaleConfig()
        self.memory_monitor = MemoryMonitor(self.config.memory_budget)

    def sample(
        self,
        lf: pl.LazyFrame,
        config: SamplingConfig,
        total_rows: int | None = None,
    ) -> SamplingResult:
        """Build a lazily evaluated block sample of ``lf``.

        Args:
            lf: Source LazyFrame.
            config: Sampling config used to compute the statistically
                required sample size.
            total_rows: Row count of ``lf``; estimated via
                ``estimate_row_count`` when ``None``.

        Returns:
            SamplingResult whose ``data`` is a LazyFrame. The reported
            ``sample_size`` is the target size, not a measured count,
            because nothing is collected here.
        """
        start_time = time.perf_counter()
        time_budget = TimeBudgetManager(self.config.time_budget_seconds)

        if total_rows is None:
            total_rows = self.estimate_row_count(lf)

        # Calculate target sample size
        target_samples = min(
            self.config.target_rows,
            config.calculate_required_sample_size(total_rows),
        )

        if target_samples >= total_rows:
            # No sampling needed
            return self._create_full_scan_result(lf, total_rows, config, start_time)

        # Calculate block parameters
        block_size = self.config.get_block_size(total_rows)
        num_blocks = math.ceil(total_rows / block_size)
        samples_per_block = math.ceil(target_samples / num_blocks)

        logger.debug(
            f"Block sampling: {total_rows:,} rows → {num_blocks} blocks × "
            f"{samples_per_block:,} samples/block"
        )

        sampled_frames: list[pl.LazyFrame] = []
        blocks_processed = 0

        # Compare against None so that an explicit seed of 0 stays reproducible.
        seed = self.config.seed
        if seed is None:
            seed = random.randint(0, 2**32 - 1)

        for block_idx in range(num_blocks):
            # Check time budget
            if time_budget.is_expired:
                logger.warning(f"Time budget expired after {blocks_processed} blocks")
                break

            # Check memory
            if self.memory_monitor.should_backpressure():
                logger.warning("Memory pressure detected, triggering GC")
                self.memory_monitor.trigger_gc()

            # Calculate block range
            block_start = block_idx * block_size
            block_end = min(block_start + block_size, total_rows)
            actual_block_size = block_end - block_start

            # Sample from this block
            block_samples = min(samples_per_block, actual_block_size)
            sample_rate = block_samples / actual_block_size

            # Hash-based deterministic sampling: keep rows whose seeded hash
            # falls into the first `threshold` of 10000 buckets.
            block_seed = seed + block_idx
            threshold = int(sample_rate * 10000)
            if block_samples > 0:
                # Rates below 1/10000 would truncate to 0 and silently drop
                # the whole block; keep at least one bucket, as the other
                # strategies in this module do.
                threshold = max(1, threshold)

            block_lf = (
                lf.slice(block_start, actual_block_size)
                .with_row_index("__block_idx")
                .filter(pl.col("__block_idx").hash(block_seed) % 10000 < threshold)
                .drop("__block_idx")
            )

            sampled_frames.append(block_lf)
            blocks_processed += 1

        # Blocks not reached because the time budget expired.
        blocks_skipped = num_blocks - blocks_processed

        # Merge all block samples
        if sampled_frames:
            merged_lf = pl.concat(sampled_frames)
        else:
            merged_lf = lf.head(0)

        elapsed_ms = (time.perf_counter() - start_time) * 1000

        return SamplingResult(
            data=merged_lf,
            metrics=BlockSamplingMetrics(
                original_size=total_rows,
                sample_size=target_samples,
                sampling_ratio=target_samples / total_rows,
                confidence_level=self.config.confidence_level,
                margin_of_error=self.config.margin_of_error,
                strategy_used="block",
                sampling_time_ms=elapsed_ms,
                memory_saved_estimate_mb=(total_rows - target_samples) * BYTES_PER_ROW_ESTIMATE / MB,
                blocks_processed=blocks_processed,
                blocks_skipped=blocks_skipped,
                time_per_block_ms=elapsed_ms / max(1, blocks_processed),
                memory_peak_mb=self.memory_monitor.peak_mb,
            ),
            is_sampled=True,
        )

    def _create_full_scan_result(
        self,
        lf: pl.LazyFrame,
        total_rows: int,
        config: SamplingConfig,
        start_time: float,
    ) -> SamplingResult:
        """Create result for full scan (no sampling needed).

        Args:
            lf: Source LazyFrame, returned unchanged as the result data.
            total_rows: Row count reported as both original and sample size.
            config: Unused; kept for signature compatibility.
            start_time: ``time.perf_counter()`` value taken when sampling began.
        """
        elapsed_ms = (time.perf_counter() - start_time) * 1000
        return SamplingResult(
            data=lf,
            metrics=BlockSamplingMetrics(
                original_size=total_rows,
                sample_size=total_rows,
                sampling_ratio=1.0,
                confidence_level=1.0,
                margin_of_error=0.0,
                strategy_used="block(full_scan)",
                sampling_time_ms=elapsed_ms,
            ),
            is_sampled=False,
        )
541
+
542
+
543
+ # =============================================================================
544
+ # Multi-Stage Hierarchical Sampling
545
+ # =============================================================================
546
+
547
class MultiStageSamplingStrategy(SamplingStrategy):
    """Multi-stage hierarchical sampling for billion-row datasets.

    The data is reduced through a chain of hash filters:
    1. Stage 1: Coarse sampling of the full input
    2. Stage 2..N-1: Further sampling of the previous stage's output
    3. Stage N: Final reduction to the target sample size

    Every stage shrinks the data by the same factor, so that after
    ``num_stages`` stages the expected size equals the target.
    """

    name = "multi_stage"

    def __init__(
        self,
        config: EnterpriseScaleConfig | None = None,
        num_stages: int = 3,
    ):
        """Store the configuration and the requested number of stages.

        Args:
            config: Scale configuration; a default ``EnterpriseScaleConfig``
                is created when omitted.
            num_stages: Number of successive filtering stages.
        """
        self.config = config or EnterpriseScaleConfig()
        self.num_stages = num_stages

    def sample(
        self,
        lf: pl.LazyFrame,
        config: SamplingConfig,
        total_rows: int | None = None,
    ) -> SamplingResult:
        """Build a lazily evaluated multi-stage sample of ``lf``.

        Args:
            lf: Source LazyFrame.
            config: Sampling config used to compute the statistically
                required sample size.
            total_rows: Row count of ``lf``; estimated via
                ``estimate_row_count`` when ``None``.

        Returns:
            SamplingResult whose ``data`` is a LazyFrame. The reported
            ``sample_size`` is the target size, not a measured count.
        """
        start_time = time.perf_counter()

        if total_rows is None:
            total_rows = self.estimate_row_count(lf)

        target_samples = min(
            self.config.target_rows,
            config.calculate_required_sample_size(total_rows),
        )

        if target_samples >= total_rows:
            return self._create_full_result(lf, total_rows, config, start_time)

        # A stage count below 1 would divide by zero and leave no final
        # stage to pin to the target, so always run at least one stage.
        num_stages = max(1, self.num_stages)

        # Each stage reduces the row count by the same factor.
        reduction_factor = (total_rows / target_samples) ** (1 / num_stages)
        stage_sizes = []
        current_size = total_rows

        for _ in range(num_stages):
            current_size = int(current_size / reduction_factor)
            stage_sizes.append(max(current_size, target_samples))

        # Ensure final stage hits target (integer truncation may drift).
        stage_sizes[-1] = target_samples

        logger.debug(f"Multi-stage sampling: stages={stage_sizes}")

        # Compare against None so that an explicit seed of 0 is honoured
        # instead of being replaced by the default of 42.
        base_seed = self.config.seed if self.config.seed is not None else 42

        # Execute stages
        current_lf = lf
        current_rows = total_rows

        for stage_idx, stage_target in enumerate(stage_sizes):
            # Sample rate for this stage, relative to the previous stage.
            sample_rate = stage_target / current_rows
            seed = base_seed + stage_idx

            # Keep rows whose seeded hash falls into the first `threshold`
            # of 10000 buckets; the row index is recomputed per stage.
            threshold = max(1, int(sample_rate * 10000))
            current_lf = (
                current_lf.with_row_index("__stage_idx")
                .filter(pl.col("__stage_idx").hash(seed) % 10000 < threshold)
                .drop("__stage_idx")
            )
            current_rows = stage_target

        elapsed_ms = (time.perf_counter() - start_time) * 1000

        return SamplingResult(
            data=current_lf,
            metrics=SamplingMetrics(
                original_size=total_rows,
                sample_size=target_samples,
                sampling_ratio=target_samples / total_rows,
                confidence_level=self.config.confidence_level,
                margin_of_error=self.config.margin_of_error,
                strategy_used=f"multi_stage({num_stages})",
                sampling_time_ms=elapsed_ms,
                memory_saved_estimate_mb=(total_rows - target_samples) * BYTES_PER_ROW_ESTIMATE / MB,
            ),
            is_sampled=True,
        )

    def _create_full_result(
        self,
        lf: pl.LazyFrame,
        total_rows: int,
        config: SamplingConfig,
        start_time: float,
    ) -> SamplingResult:
        """Create the result for the no-sampling case (data returned as-is).

        Args:
            lf: Source LazyFrame, returned unchanged as the result data.
            total_rows: Row count reported as both original and sample size.
            config: Unused; kept for signature compatibility.
            start_time: ``time.perf_counter()`` value taken when sampling began.
        """
        elapsed_ms = (time.perf_counter() - start_time) * 1000
        return SamplingResult(
            data=lf,
            metrics=SamplingMetrics(
                original_size=total_rows,
                sample_size=total_rows,
                sampling_ratio=1.0,
                confidence_level=1.0,
                margin_of_error=0.0,
                strategy_used="multi_stage(full_scan)",
                sampling_time_ms=elapsed_ms,
            ),
            is_sampled=False,
        )
659
+
660
+
661
+ # =============================================================================
662
+ # Column-Aware Sampling Strategy
663
+ # =============================================================================
664
+
665
class ColumnAwareSamplingStrategy(SamplingStrategy):
    """Column-type aware sampling that sizes the sample from the schema.

    Each column gets a complexity weight derived from its dtype name:
    - List/Struct (nested) columns: 3.0
    - String columns: 2.0
    - Categorical/Enum columns: 0.5
    - Everything else (e.g. numeric): 1.0

    The statistically required sample size is scaled by the average weight,
    then a single hash-based row filter is applied to the whole frame.
    """

    name = "column_aware"

    def __init__(
        self,
        config: EnterpriseScaleConfig | None = None,
    ):
        """Store the configuration.

        Args:
            config: Scale configuration; a default ``EnterpriseScaleConfig``
                is created when omitted.
        """
        self.config = config or EnterpriseScaleConfig()

    def sample(
        self,
        lf: pl.LazyFrame,
        config: SamplingConfig,
        total_rows: int | None = None,
    ) -> SamplingResult:
        """Build a lazily evaluated, schema-weighted sample of ``lf``.

        Args:
            lf: Source LazyFrame.
            config: Sampling config used to compute the base sample size.
            total_rows: Row count of ``lf``; estimated via
                ``estimate_row_count`` when ``None``.

        Returns:
            SamplingResult whose ``data`` is a LazyFrame. The reported
            ``sample_size`` is the target size, not a measured count.
        """
        start_time = time.perf_counter()

        if total_rows is None:
            total_rows = self.estimate_row_count(lf)

        # Analyze column types
        schema = lf.collect_schema()
        column_info = self._analyze_columns(schema)

        # Determine optimal sample size based on column complexity
        base_sample_size = config.calculate_required_sample_size(total_rows)
        adjusted_sample_size = self._adjust_for_columns(base_sample_size, column_info)

        target_samples = min(
            adjusted_sample_size,
            self.config.target_rows,
            total_rows,
        )

        if target_samples >= total_rows:
            # The sample would cover everything: return the input unchanged.
            elapsed_ms = (time.perf_counter() - start_time) * 1000
            return SamplingResult(
                data=lf,
                metrics=SamplingMetrics(
                    original_size=total_rows,
                    sample_size=total_rows,
                    sampling_ratio=1.0,
                    confidence_level=1.0,
                    margin_of_error=0.0,
                    strategy_used="column_aware(full)",
                    sampling_time_ms=elapsed_ms,
                ),
                is_sampled=False,
            )

        # Apply sampling
        sample_rate = target_samples / total_rows
        # Compare against None so that an explicit seed of 0 stays reproducible.
        seed = self.config.seed
        if seed is None:
            seed = random.randint(0, 2**32 - 1)
        threshold = max(1, int(sample_rate * 10000))

        sampled_lf = (
            lf.with_row_index("__col_aware_idx")
            .filter(pl.col("__col_aware_idx").hash(seed) % 10000 < threshold)
            .drop("__col_aware_idx")
        )

        elapsed_ms = (time.perf_counter() - start_time) * 1000

        return SamplingResult(
            data=sampled_lf,
            metrics=SamplingMetrics(
                original_size=total_rows,
                sample_size=target_samples,
                sampling_ratio=target_samples / total_rows,
                confidence_level=self.config.confidence_level,
                margin_of_error=self.config.margin_of_error,
                strategy_used="column_aware",
                sampling_time_ms=elapsed_ms,
                memory_saved_estimate_mb=(total_rows - target_samples) * BYTES_PER_ROW_ESTIMATE / MB,
            ),
            is_sampled=True,
        )

    def _analyze_columns(self, schema: dict) -> dict[str, dict]:
        """Describe every column of ``schema`` by its dtype name.

        Args:
            schema: Mapping of column name to dtype.

        Returns:
            Mapping of column name to a dict with the keys ``type``,
            ``is_numeric``, ``is_string``, ``is_categorical`` and
            ``complexity``. The flags are substring matches on ``str(dtype)``.
        """
        column_info = {}
        for col_name, col_type in schema.items():
            type_str = str(col_type)
            column_info[col_name] = {
                "type": type_str,
                "is_numeric": "Int" in type_str or "Float" in type_str,
                "is_string": "String" in type_str or "Utf8" in type_str,
                "is_categorical": "Categorical" in type_str or "Enum" in type_str,
                "complexity": self._estimate_complexity(type_str),
            }
        return column_info

    def _estimate_complexity(self, type_str: str) -> float:
        """Return the sampling weight for a dtype name.

        Args:
            type_str: String form of the column dtype.

        Returns:
            3.0 for List/Struct, 2.0 for String/Utf8, 0.5 for
            Categorical/Enum, 1.0 otherwise.
        """
        # Nested types must be tested first: their string form embeds the
        # inner dtype (e.g. "List(String)"), which would otherwise match the
        # String or Categorical branch and under-weight the column.
        if "List" in type_str or "Struct" in type_str:
            return 3.0  # Complex types need larger samples
        if "String" in type_str or "Utf8" in type_str:
            return 2.0  # Strings typically need larger samples
        if "Categorical" in type_str or "Enum" in type_str:
            return 0.5  # Categoricals can use smaller samples
        return 1.0  # Default for numeric types

    def _adjust_for_columns(
        self,
        base_size: int,
        column_info: dict[str, dict],
    ) -> int:
        """Scale ``base_size`` by the average column complexity.

        Args:
            base_size: Statistically required sample size.
            column_info: Output of ``_analyze_columns``.

        Returns:
            ``base_size`` unchanged when there are no columns; otherwise the
            scaled size, floored at one tenth of ``config.target_rows``.
        """
        if not column_info:
            return base_size

        # Calculate average complexity
        complexities = [info["complexity"] for info in column_info.values()]
        avg_complexity = sum(complexities) / len(complexities)

        # Adjust sample size
        adjusted = int(base_size * avg_complexity)
        return max(self.config.target_rows // 10, adjusted)
796
+
797
+
798
+ # =============================================================================
799
+ # Progressive Sampling Strategy
800
+ # =============================================================================
801
+
802
class ProgressiveSamplingStrategy(SamplingStrategy):
    """Progressive sampling strategy.

    Intended for exploratory analysis where quick estimates are refined
    over time.

    NOTE: the current implementation applies one hash-based filter at the
    final target rate. ``convergence_threshold`` is stored but no per-stage
    convergence check or early stop is performed; ``max_stages`` only
    appears in the reported ``strategy_used`` label.
    """

    name = "progressive"

    def __init__(
        self,
        config: EnterpriseScaleConfig | None = None,
        convergence_threshold: float = 0.01,
        max_stages: int = 5,
    ):
        """Store the configuration and the progressive-sampling parameters.

        Args:
            config: Scale configuration; a default ``EnterpriseScaleConfig``
                is created when omitted.
            convergence_threshold: Stored on the instance; not read by
                ``sample``.
            max_stages: Stage count reported in ``strategy_used``.
        """
        self.config = config or EnterpriseScaleConfig()
        self.convergence_threshold = convergence_threshold
        self.max_stages = max_stages

    def sample(
        self,
        lf: pl.LazyFrame,
        config: SamplingConfig,
        total_rows: int | None = None,
    ) -> SamplingResult:
        """Build a lazily evaluated sample of ``lf`` at the target rate.

        Args:
            lf: Source LazyFrame.
            config: Sampling config used to compute the statistically
                required sample size.
            total_rows: Row count of ``lf``; estimated via
                ``estimate_row_count`` when ``None``.

        Returns:
            SamplingResult whose ``data`` is a LazyFrame. The reported
            ``sample_size`` is the target size, not a measured count.
        """
        start_time = time.perf_counter()

        if total_rows is None:
            total_rows = self.estimate_row_count(lf)

        target_samples = min(
            self.config.target_rows,
            config.calculate_required_sample_size(total_rows),
        )

        if target_samples >= total_rows:
            # The sample would cover everything: return the input unchanged.
            elapsed_ms = (time.perf_counter() - start_time) * 1000
            return SamplingResult(
                data=lf,
                metrics=SamplingMetrics(
                    original_size=total_rows,
                    sample_size=total_rows,
                    sampling_ratio=1.0,
                    confidence_level=1.0,
                    margin_of_error=0.0,
                    strategy_used="progressive(full)",
                    sampling_time_ms=elapsed_ms,
                ),
                is_sampled=False,
            )

        # The per-stage size list that used to be built here was never read
        # and raised IndexError for max_stages == 0, so it has been removed.

        # Compare against None so that an explicit seed of 0 stays reproducible.
        seed = self.config.seed
        if seed is None:
            seed = random.randint(0, 2**32 - 1)

        # Keep rows whose seeded hash falls into the first `final_threshold`
        # of 10000 buckets (at least one bucket, so the sample is not empty).
        final_threshold = max(1, int((target_samples / total_rows) * 10000))

        sampled_lf = (
            lf.with_row_index("__prog_idx")
            .filter(pl.col("__prog_idx").hash(seed) % 10000 < final_threshold)
            .drop("__prog_idx")
        )

        elapsed_ms = (time.perf_counter() - start_time) * 1000

        return SamplingResult(
            data=sampled_lf,
            metrics=SamplingMetrics(
                original_size=total_rows,
                sample_size=target_samples,
                sampling_ratio=target_samples / total_rows,
                confidence_level=self.config.confidence_level,
                margin_of_error=self.config.margin_of_error,
                strategy_used=f"progressive({self.max_stages})",
                sampling_time_ms=elapsed_ms,
                memory_saved_estimate_mb=(total_rows - target_samples) * BYTES_PER_ROW_ESTIMATE / MB,
            ),
            is_sampled=True,
        )
893
+
894
+
895
+ # =============================================================================
896
+ # Enterprise Scale Sampler
897
+ # =============================================================================
898
+
899
class EnterpriseScaleSampler:
    """Front door for enterprise-scale sampling.

    Holds one instance of each named strategy ("block", "multi_stage",
    "column_aware", "progressive"). When the caller does not name a
    strategy, one is picked from the scale category that the configuration
    assigns to the row count.

    Example:
        config = EnterpriseScaleConfig(
            target_rows=100_000,
            memory_budget=MemoryBudgetConfig(max_memory_mb=1024),
            time_budget_seconds=60,
            quality=SamplingQuality.STANDARD,
        )
        sampler = EnterpriseScaleSampler(config)
        result = sampler.sample(lf)

        print(f"Sampled {result.metrics.sample_size:,} rows")
        print(f"Strategy: {result.metrics.strategy_used}")
    """

    def __init__(
        self,
        config: EnterpriseScaleConfig | None = None,
    ):
        """Create the sampler and its strategy registry.

        Args:
            config: Scale configuration shared by all strategies; a default
                ``EnterpriseScaleConfig`` is created when omitted.
        """
        self.config = config or EnterpriseScaleConfig()
        shared = self.config
        self._strategies = {
            "block": BlockSamplingStrategy(shared),
            "multi_stage": MultiStageSamplingStrategy(shared),
            "column_aware": ColumnAwareSamplingStrategy(shared),
            "progressive": ProgressiveSamplingStrategy(shared),
        }

    def sample(
        self,
        lf: pl.LazyFrame,
        strategy: str | None = None,
    ) -> SamplingResult:
        """Sample ``lf`` with a named or automatically chosen strategy.

        Args:
            lf: Source LazyFrame
            strategy: Registry key of the strategy to use; a falsy value
                (``None`` or ``""``) means auto-select by scale.

        Returns:
            SamplingResult with sampled data and metrics

        Raises:
            ValueError: If ``strategy`` is given but not registered.
        """
        # The row count is computed by collecting a length query on the frame.
        row_count = lf.select(pl.len()).collect().item()
        scale = self.config.classify_scale(row_count)

        # Per-call sampling config derived from the sampler-wide settings.
        cfg = self.config
        base_config = SamplingConfig(
            strategy=SamplingMethod.ADAPTIVE,
            max_rows=cfg.target_rows,
            confidence_level=cfg.confidence_level,
            margin_of_error=cfg.margin_of_error,
            seed=cfg.seed,
        )

        if not strategy:
            chosen = self._select_strategy(scale)
        else:
            chosen = self._strategies.get(strategy)
            if not chosen:
                raise ValueError(f"Unknown strategy: {strategy}")

        logger.info(
            f"Enterprise sampling: {row_count:,} rows ({scale.name}) → "
            f"strategy={chosen.name}"
        )

        return chosen.sample(lf, base_config, row_count)

    def _select_strategy(self, scale: ScaleCategory) -> SamplingStrategy:
        """Map a scale category to the strategy used for it.

        SMALL/MEDIUM use column-aware sampling, LARGE uses block sampling,
        XLARGE uses the registered multi-stage strategy, and any other
        category gets a fresh five-stage multi-stage strategy.
        """
        if scale in (ScaleCategory.SMALL, ScaleCategory.MEDIUM):
            return self._strategies["column_aware"]
        if scale == ScaleCategory.LARGE:
            return self._strategies["block"]
        if scale == ScaleCategory.XLARGE:
            return self._strategies["multi_stage"]
        # XXLARGE: Use multi-stage with more stages
        return MultiStageSamplingStrategy(self.config, num_stages=5)

    def list_strategies(self) -> list[str]:
        """Return the registered strategy names in registration order."""
        return list(self._strategies)
991
+
992
+
993
+ # =============================================================================
994
+ # Convenience Functions
995
+ # =============================================================================
996
+
997
def sample_large_dataset(
    lf: pl.LazyFrame,
    target_rows: int = 100_000,
    quality: str = "standard",
    time_budget_seconds: float = 0.0,
) -> SamplingResult:
    """Sample a large dataset in one call.

    Looks up the preset for ``quality``, copies its memory budget, quality,
    confidence level and margin of error into a new configuration together
    with the caller's ``target_rows`` and ``time_budget_seconds``, and runs
    an auto-selecting ``EnterpriseScaleSampler`` on ``lf``.

    Args:
        lf: LazyFrame to sample
        target_rows: Target number of rows
        quality: Quality level ("sketch", "quick", "standard", "high")
        time_budget_seconds: Max time for sampling

    Returns:
        SamplingResult with sampled data

    Example:
        result = sample_large_dataset(lf, target_rows=50_000, quality="high")
        sampled_df = result.data.collect()
    """
    preset = EnterpriseScaleConfig.for_quality(quality)
    effective = EnterpriseScaleConfig(
        target_rows=target_rows,
        memory_budget=preset.memory_budget,
        time_budget_seconds=time_budget_seconds,
        quality=preset.quality,
        confidence_level=preset.confidence_level,
        margin_of_error=preset.margin_of_error,
    )
    return EnterpriseScaleSampler(effective).sample(lf)
1029
+
1030
+
1031
def estimate_optimal_sample_size(
    total_rows: int,
    confidence_level: float = 0.95,
    margin_of_error: float = 0.05,
    max_rows: int = 1_000_000,
) -> int:
    """Recommend a sample size for the given accuracy requirements.

    The size required by a ``SamplingConfig`` built from the confidence
    level and margin of error is capped by both ``max_rows`` and the
    population size.

    Args:
        total_rows: Total population size
        confidence_level: Desired confidence (0.90, 0.95, 0.99)
        margin_of_error: Acceptable error margin
        max_rows: Maximum sample size cap

    Returns:
        Recommended sample size
    """
    accuracy = SamplingConfig(
        confidence_level=confidence_level,
        margin_of_error=margin_of_error,
    )
    candidates = (
        accuracy.calculate_required_sample_size(total_rows),
        max_rows,
        total_rows,
    )
    return min(candidates)
1054
+
1055
+
1056
def classify_dataset_scale(total_rows: int) -> ScaleCategory:
    """Return the scale category for a dataset of ``total_rows`` rows.

    Thin wrapper that delegates to ``EnterpriseScaleConfig.classify_scale``.

    Args:
        total_rows: Number of rows

    Returns:
        ScaleCategory enum value
    """
    category = EnterpriseScaleConfig.classify_scale(total_rows)
    return category