truthound 1.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (877) hide show
  1. truthound/__init__.py +162 -0
  2. truthound/adapters.py +100 -0
  3. truthound/api.py +365 -0
  4. truthound/audit/__init__.py +248 -0
  5. truthound/audit/core.py +967 -0
  6. truthound/audit/filters.py +620 -0
  7. truthound/audit/formatters.py +707 -0
  8. truthound/audit/logger.py +902 -0
  9. truthound/audit/middleware.py +571 -0
  10. truthound/audit/storage.py +1083 -0
  11. truthound/benchmark/__init__.py +123 -0
  12. truthound/benchmark/base.py +757 -0
  13. truthound/benchmark/comparison.py +635 -0
  14. truthound/benchmark/generators.py +706 -0
  15. truthound/benchmark/reporters.py +718 -0
  16. truthound/benchmark/runner.py +635 -0
  17. truthound/benchmark/scenarios.py +712 -0
  18. truthound/cache.py +252 -0
  19. truthound/checkpoint/__init__.py +136 -0
  20. truthound/checkpoint/actions/__init__.py +164 -0
  21. truthound/checkpoint/actions/base.py +324 -0
  22. truthound/checkpoint/actions/custom.py +234 -0
  23. truthound/checkpoint/actions/discord_notify.py +290 -0
  24. truthound/checkpoint/actions/email_notify.py +405 -0
  25. truthound/checkpoint/actions/github_action.py +406 -0
  26. truthound/checkpoint/actions/opsgenie.py +1499 -0
  27. truthound/checkpoint/actions/pagerduty.py +226 -0
  28. truthound/checkpoint/actions/slack_notify.py +233 -0
  29. truthound/checkpoint/actions/store_result.py +249 -0
  30. truthound/checkpoint/actions/teams_notify.py +1570 -0
  31. truthound/checkpoint/actions/telegram_notify.py +419 -0
  32. truthound/checkpoint/actions/update_docs.py +552 -0
  33. truthound/checkpoint/actions/webhook.py +293 -0
  34. truthound/checkpoint/analytics/__init__.py +147 -0
  35. truthound/checkpoint/analytics/aggregations/__init__.py +23 -0
  36. truthound/checkpoint/analytics/aggregations/rollup.py +481 -0
  37. truthound/checkpoint/analytics/aggregations/time_bucket.py +306 -0
  38. truthound/checkpoint/analytics/analyzers/__init__.py +17 -0
  39. truthound/checkpoint/analytics/analyzers/anomaly.py +386 -0
  40. truthound/checkpoint/analytics/analyzers/base.py +270 -0
  41. truthound/checkpoint/analytics/analyzers/forecast.py +421 -0
  42. truthound/checkpoint/analytics/analyzers/trend.py +314 -0
  43. truthound/checkpoint/analytics/models.py +292 -0
  44. truthound/checkpoint/analytics/protocols.py +549 -0
  45. truthound/checkpoint/analytics/service.py +718 -0
  46. truthound/checkpoint/analytics/stores/__init__.py +16 -0
  47. truthound/checkpoint/analytics/stores/base.py +306 -0
  48. truthound/checkpoint/analytics/stores/memory_store.py +353 -0
  49. truthound/checkpoint/analytics/stores/sqlite_store.py +557 -0
  50. truthound/checkpoint/analytics/stores/timescale_store.py +501 -0
  51. truthound/checkpoint/async_actions.py +794 -0
  52. truthound/checkpoint/async_base.py +708 -0
  53. truthound/checkpoint/async_checkpoint.py +617 -0
  54. truthound/checkpoint/async_runner.py +639 -0
  55. truthound/checkpoint/checkpoint.py +527 -0
  56. truthound/checkpoint/ci/__init__.py +61 -0
  57. truthound/checkpoint/ci/detector.py +355 -0
  58. truthound/checkpoint/ci/reporter.py +436 -0
  59. truthound/checkpoint/ci/templates.py +454 -0
  60. truthound/checkpoint/circuitbreaker/__init__.py +133 -0
  61. truthound/checkpoint/circuitbreaker/breaker.py +542 -0
  62. truthound/checkpoint/circuitbreaker/core.py +252 -0
  63. truthound/checkpoint/circuitbreaker/detection.py +459 -0
  64. truthound/checkpoint/circuitbreaker/middleware.py +389 -0
  65. truthound/checkpoint/circuitbreaker/registry.py +357 -0
  66. truthound/checkpoint/distributed/__init__.py +139 -0
  67. truthound/checkpoint/distributed/backends/__init__.py +35 -0
  68. truthound/checkpoint/distributed/backends/celery_backend.py +503 -0
  69. truthound/checkpoint/distributed/backends/kubernetes_backend.py +696 -0
  70. truthound/checkpoint/distributed/backends/local_backend.py +397 -0
  71. truthound/checkpoint/distributed/backends/ray_backend.py +625 -0
  72. truthound/checkpoint/distributed/base.py +774 -0
  73. truthound/checkpoint/distributed/orchestrator.py +765 -0
  74. truthound/checkpoint/distributed/protocols.py +842 -0
  75. truthound/checkpoint/distributed/registry.py +449 -0
  76. truthound/checkpoint/idempotency/__init__.py +120 -0
  77. truthound/checkpoint/idempotency/core.py +295 -0
  78. truthound/checkpoint/idempotency/fingerprint.py +454 -0
  79. truthound/checkpoint/idempotency/locking.py +604 -0
  80. truthound/checkpoint/idempotency/service.py +592 -0
  81. truthound/checkpoint/idempotency/stores.py +653 -0
  82. truthound/checkpoint/monitoring/__init__.py +134 -0
  83. truthound/checkpoint/monitoring/aggregators/__init__.py +15 -0
  84. truthound/checkpoint/monitoring/aggregators/base.py +372 -0
  85. truthound/checkpoint/monitoring/aggregators/realtime.py +300 -0
  86. truthound/checkpoint/monitoring/aggregators/window.py +493 -0
  87. truthound/checkpoint/monitoring/collectors/__init__.py +17 -0
  88. truthound/checkpoint/monitoring/collectors/base.py +257 -0
  89. truthound/checkpoint/monitoring/collectors/memory_collector.py +617 -0
  90. truthound/checkpoint/monitoring/collectors/prometheus_collector.py +451 -0
  91. truthound/checkpoint/monitoring/collectors/redis_collector.py +518 -0
  92. truthound/checkpoint/monitoring/events.py +410 -0
  93. truthound/checkpoint/monitoring/protocols.py +636 -0
  94. truthound/checkpoint/monitoring/service.py +578 -0
  95. truthound/checkpoint/monitoring/views/__init__.py +17 -0
  96. truthound/checkpoint/monitoring/views/base.py +172 -0
  97. truthound/checkpoint/monitoring/views/queue_view.py +220 -0
  98. truthound/checkpoint/monitoring/views/task_view.py +240 -0
  99. truthound/checkpoint/monitoring/views/worker_view.py +263 -0
  100. truthound/checkpoint/registry.py +337 -0
  101. truthound/checkpoint/runner.py +356 -0
  102. truthound/checkpoint/transaction/__init__.py +133 -0
  103. truthound/checkpoint/transaction/base.py +389 -0
  104. truthound/checkpoint/transaction/compensatable.py +537 -0
  105. truthound/checkpoint/transaction/coordinator.py +576 -0
  106. truthound/checkpoint/transaction/executor.py +622 -0
  107. truthound/checkpoint/transaction/idempotency.py +534 -0
  108. truthound/checkpoint/transaction/saga/__init__.py +143 -0
  109. truthound/checkpoint/transaction/saga/builder.py +584 -0
  110. truthound/checkpoint/transaction/saga/definition.py +515 -0
  111. truthound/checkpoint/transaction/saga/event_store.py +542 -0
  112. truthound/checkpoint/transaction/saga/patterns.py +833 -0
  113. truthound/checkpoint/transaction/saga/runner.py +718 -0
  114. truthound/checkpoint/transaction/saga/state_machine.py +793 -0
  115. truthound/checkpoint/transaction/saga/strategies.py +780 -0
  116. truthound/checkpoint/transaction/saga/testing.py +886 -0
  117. truthound/checkpoint/triggers/__init__.py +58 -0
  118. truthound/checkpoint/triggers/base.py +237 -0
  119. truthound/checkpoint/triggers/event.py +385 -0
  120. truthound/checkpoint/triggers/schedule.py +355 -0
  121. truthound/cli.py +2358 -0
  122. truthound/cli_modules/__init__.py +124 -0
  123. truthound/cli_modules/advanced/__init__.py +45 -0
  124. truthound/cli_modules/advanced/benchmark.py +343 -0
  125. truthound/cli_modules/advanced/docs.py +225 -0
  126. truthound/cli_modules/advanced/lineage.py +209 -0
  127. truthound/cli_modules/advanced/ml.py +320 -0
  128. truthound/cli_modules/advanced/realtime.py +196 -0
  129. truthound/cli_modules/checkpoint/__init__.py +46 -0
  130. truthound/cli_modules/checkpoint/init.py +114 -0
  131. truthound/cli_modules/checkpoint/list.py +71 -0
  132. truthound/cli_modules/checkpoint/run.py +159 -0
  133. truthound/cli_modules/checkpoint/validate.py +67 -0
  134. truthound/cli_modules/common/__init__.py +71 -0
  135. truthound/cli_modules/common/errors.py +414 -0
  136. truthound/cli_modules/common/options.py +419 -0
  137. truthound/cli_modules/common/output.py +507 -0
  138. truthound/cli_modules/common/protocol.py +552 -0
  139. truthound/cli_modules/core/__init__.py +48 -0
  140. truthound/cli_modules/core/check.py +123 -0
  141. truthound/cli_modules/core/compare.py +104 -0
  142. truthound/cli_modules/core/learn.py +57 -0
  143. truthound/cli_modules/core/mask.py +77 -0
  144. truthound/cli_modules/core/profile.py +65 -0
  145. truthound/cli_modules/core/scan.py +61 -0
  146. truthound/cli_modules/profiler/__init__.py +51 -0
  147. truthound/cli_modules/profiler/auto_profile.py +175 -0
  148. truthound/cli_modules/profiler/metadata.py +107 -0
  149. truthound/cli_modules/profiler/suite.py +283 -0
  150. truthound/cli_modules/registry.py +431 -0
  151. truthound/cli_modules/scaffolding/__init__.py +89 -0
  152. truthound/cli_modules/scaffolding/base.py +631 -0
  153. truthound/cli_modules/scaffolding/commands.py +545 -0
  154. truthound/cli_modules/scaffolding/plugins.py +1072 -0
  155. truthound/cli_modules/scaffolding/reporters.py +594 -0
  156. truthound/cli_modules/scaffolding/validators.py +1127 -0
  157. truthound/common/__init__.py +18 -0
  158. truthound/common/resilience/__init__.py +130 -0
  159. truthound/common/resilience/bulkhead.py +266 -0
  160. truthound/common/resilience/circuit_breaker.py +516 -0
  161. truthound/common/resilience/composite.py +332 -0
  162. truthound/common/resilience/config.py +292 -0
  163. truthound/common/resilience/protocols.py +217 -0
  164. truthound/common/resilience/rate_limiter.py +404 -0
  165. truthound/common/resilience/retry.py +341 -0
  166. truthound/datadocs/__init__.py +260 -0
  167. truthound/datadocs/base.py +571 -0
  168. truthound/datadocs/builder.py +761 -0
  169. truthound/datadocs/charts.py +764 -0
  170. truthound/datadocs/dashboard/__init__.py +63 -0
  171. truthound/datadocs/dashboard/app.py +576 -0
  172. truthound/datadocs/dashboard/components.py +584 -0
  173. truthound/datadocs/dashboard/state.py +240 -0
  174. truthound/datadocs/engine/__init__.py +46 -0
  175. truthound/datadocs/engine/context.py +376 -0
  176. truthound/datadocs/engine/pipeline.py +618 -0
  177. truthound/datadocs/engine/registry.py +469 -0
  178. truthound/datadocs/exporters/__init__.py +49 -0
  179. truthound/datadocs/exporters/base.py +198 -0
  180. truthound/datadocs/exporters/html.py +178 -0
  181. truthound/datadocs/exporters/json_exporter.py +253 -0
  182. truthound/datadocs/exporters/markdown.py +284 -0
  183. truthound/datadocs/exporters/pdf.py +392 -0
  184. truthound/datadocs/i18n/__init__.py +86 -0
  185. truthound/datadocs/i18n/catalog.py +960 -0
  186. truthound/datadocs/i18n/formatting.py +505 -0
  187. truthound/datadocs/i18n/loader.py +256 -0
  188. truthound/datadocs/i18n/plurals.py +378 -0
  189. truthound/datadocs/renderers/__init__.py +42 -0
  190. truthound/datadocs/renderers/base.py +401 -0
  191. truthound/datadocs/renderers/custom.py +342 -0
  192. truthound/datadocs/renderers/jinja.py +697 -0
  193. truthound/datadocs/sections.py +736 -0
  194. truthound/datadocs/styles.py +931 -0
  195. truthound/datadocs/themes/__init__.py +101 -0
  196. truthound/datadocs/themes/base.py +336 -0
  197. truthound/datadocs/themes/default.py +417 -0
  198. truthound/datadocs/themes/enterprise.py +419 -0
  199. truthound/datadocs/themes/loader.py +336 -0
  200. truthound/datadocs/themes.py +301 -0
  201. truthound/datadocs/transformers/__init__.py +57 -0
  202. truthound/datadocs/transformers/base.py +268 -0
  203. truthound/datadocs/transformers/enrichers.py +544 -0
  204. truthound/datadocs/transformers/filters.py +447 -0
  205. truthound/datadocs/transformers/i18n.py +468 -0
  206. truthound/datadocs/versioning/__init__.py +62 -0
  207. truthound/datadocs/versioning/diff.py +639 -0
  208. truthound/datadocs/versioning/storage.py +497 -0
  209. truthound/datadocs/versioning/version.py +358 -0
  210. truthound/datasources/__init__.py +223 -0
  211. truthound/datasources/_async_protocols.py +222 -0
  212. truthound/datasources/_protocols.py +159 -0
  213. truthound/datasources/adapters.py +428 -0
  214. truthound/datasources/async_base.py +599 -0
  215. truthound/datasources/async_factory.py +511 -0
  216. truthound/datasources/base.py +516 -0
  217. truthound/datasources/factory.py +433 -0
  218. truthound/datasources/nosql/__init__.py +47 -0
  219. truthound/datasources/nosql/base.py +487 -0
  220. truthound/datasources/nosql/elasticsearch.py +801 -0
  221. truthound/datasources/nosql/mongodb.py +636 -0
  222. truthound/datasources/pandas_optimized.py +582 -0
  223. truthound/datasources/pandas_source.py +216 -0
  224. truthound/datasources/polars_source.py +395 -0
  225. truthound/datasources/spark_source.py +479 -0
  226. truthound/datasources/sql/__init__.py +154 -0
  227. truthound/datasources/sql/base.py +710 -0
  228. truthound/datasources/sql/bigquery.py +410 -0
  229. truthound/datasources/sql/cloud_base.py +199 -0
  230. truthound/datasources/sql/databricks.py +471 -0
  231. truthound/datasources/sql/mysql.py +316 -0
  232. truthound/datasources/sql/oracle.py +427 -0
  233. truthound/datasources/sql/postgresql.py +321 -0
  234. truthound/datasources/sql/redshift.py +479 -0
  235. truthound/datasources/sql/snowflake.py +439 -0
  236. truthound/datasources/sql/sqlite.py +286 -0
  237. truthound/datasources/sql/sqlserver.py +437 -0
  238. truthound/datasources/streaming/__init__.py +47 -0
  239. truthound/datasources/streaming/base.py +350 -0
  240. truthound/datasources/streaming/kafka.py +670 -0
  241. truthound/decorators.py +98 -0
  242. truthound/docs/__init__.py +69 -0
  243. truthound/docs/extractor.py +971 -0
  244. truthound/docs/generator.py +601 -0
  245. truthound/docs/parser.py +1037 -0
  246. truthound/docs/renderer.py +999 -0
  247. truthound/drift/__init__.py +22 -0
  248. truthound/drift/compare.py +189 -0
  249. truthound/drift/detectors.py +464 -0
  250. truthound/drift/report.py +160 -0
  251. truthound/execution/__init__.py +65 -0
  252. truthound/execution/_protocols.py +324 -0
  253. truthound/execution/base.py +576 -0
  254. truthound/execution/distributed/__init__.py +179 -0
  255. truthound/execution/distributed/aggregations.py +731 -0
  256. truthound/execution/distributed/arrow_bridge.py +817 -0
  257. truthound/execution/distributed/base.py +550 -0
  258. truthound/execution/distributed/dask_engine.py +976 -0
  259. truthound/execution/distributed/mixins.py +766 -0
  260. truthound/execution/distributed/protocols.py +756 -0
  261. truthound/execution/distributed/ray_engine.py +1127 -0
  262. truthound/execution/distributed/registry.py +446 -0
  263. truthound/execution/distributed/spark_engine.py +1011 -0
  264. truthound/execution/distributed/validator_adapter.py +682 -0
  265. truthound/execution/pandas_engine.py +401 -0
  266. truthound/execution/polars_engine.py +497 -0
  267. truthound/execution/pushdown/__init__.py +230 -0
  268. truthound/execution/pushdown/ast.py +1550 -0
  269. truthound/execution/pushdown/builder.py +1550 -0
  270. truthound/execution/pushdown/dialects.py +1072 -0
  271. truthound/execution/pushdown/executor.py +829 -0
  272. truthound/execution/pushdown/optimizer.py +1041 -0
  273. truthound/execution/sql_engine.py +518 -0
  274. truthound/infrastructure/__init__.py +189 -0
  275. truthound/infrastructure/audit.py +1515 -0
  276. truthound/infrastructure/config.py +1133 -0
  277. truthound/infrastructure/encryption.py +1132 -0
  278. truthound/infrastructure/logging.py +1503 -0
  279. truthound/infrastructure/metrics.py +1220 -0
  280. truthound/lineage/__init__.py +89 -0
  281. truthound/lineage/base.py +746 -0
  282. truthound/lineage/impact_analysis.py +474 -0
  283. truthound/lineage/integrations/__init__.py +22 -0
  284. truthound/lineage/integrations/openlineage.py +548 -0
  285. truthound/lineage/tracker.py +512 -0
  286. truthound/lineage/visualization/__init__.py +33 -0
  287. truthound/lineage/visualization/protocols.py +145 -0
  288. truthound/lineage/visualization/renderers/__init__.py +20 -0
  289. truthound/lineage/visualization/renderers/cytoscape.py +329 -0
  290. truthound/lineage/visualization/renderers/d3.py +331 -0
  291. truthound/lineage/visualization/renderers/graphviz.py +276 -0
  292. truthound/lineage/visualization/renderers/mermaid.py +308 -0
  293. truthound/maskers.py +113 -0
  294. truthound/ml/__init__.py +124 -0
  295. truthound/ml/anomaly_models/__init__.py +31 -0
  296. truthound/ml/anomaly_models/ensemble.py +362 -0
  297. truthound/ml/anomaly_models/isolation_forest.py +444 -0
  298. truthound/ml/anomaly_models/statistical.py +392 -0
  299. truthound/ml/base.py +1178 -0
  300. truthound/ml/drift_detection/__init__.py +26 -0
  301. truthound/ml/drift_detection/concept.py +381 -0
  302. truthound/ml/drift_detection/distribution.py +361 -0
  303. truthound/ml/drift_detection/feature.py +442 -0
  304. truthound/ml/drift_detection/multivariate.py +495 -0
  305. truthound/ml/monitoring/__init__.py +88 -0
  306. truthound/ml/monitoring/alerting/__init__.py +33 -0
  307. truthound/ml/monitoring/alerting/handlers.py +427 -0
  308. truthound/ml/monitoring/alerting/rules.py +508 -0
  309. truthound/ml/monitoring/collectors/__init__.py +19 -0
  310. truthound/ml/monitoring/collectors/composite.py +105 -0
  311. truthound/ml/monitoring/collectors/drift.py +324 -0
  312. truthound/ml/monitoring/collectors/performance.py +179 -0
  313. truthound/ml/monitoring/collectors/quality.py +369 -0
  314. truthound/ml/monitoring/monitor.py +536 -0
  315. truthound/ml/monitoring/protocols.py +451 -0
  316. truthound/ml/monitoring/stores/__init__.py +15 -0
  317. truthound/ml/monitoring/stores/memory.py +201 -0
  318. truthound/ml/monitoring/stores/prometheus.py +296 -0
  319. truthound/ml/rule_learning/__init__.py +25 -0
  320. truthound/ml/rule_learning/constraint_miner.py +443 -0
  321. truthound/ml/rule_learning/pattern_learner.py +499 -0
  322. truthound/ml/rule_learning/profile_learner.py +462 -0
  323. truthound/multitenancy/__init__.py +326 -0
  324. truthound/multitenancy/core.py +852 -0
  325. truthound/multitenancy/integration.py +597 -0
  326. truthound/multitenancy/isolation.py +630 -0
  327. truthound/multitenancy/manager.py +770 -0
  328. truthound/multitenancy/middleware.py +765 -0
  329. truthound/multitenancy/quota.py +537 -0
  330. truthound/multitenancy/resolvers.py +603 -0
  331. truthound/multitenancy/storage.py +703 -0
  332. truthound/observability/__init__.py +307 -0
  333. truthound/observability/context.py +531 -0
  334. truthound/observability/instrumentation.py +611 -0
  335. truthound/observability/logging.py +887 -0
  336. truthound/observability/metrics.py +1157 -0
  337. truthound/observability/tracing/__init__.py +178 -0
  338. truthound/observability/tracing/baggage.py +310 -0
  339. truthound/observability/tracing/config.py +426 -0
  340. truthound/observability/tracing/exporter.py +787 -0
  341. truthound/observability/tracing/integration.py +1018 -0
  342. truthound/observability/tracing/otel/__init__.py +146 -0
  343. truthound/observability/tracing/otel/adapter.py +982 -0
  344. truthound/observability/tracing/otel/bridge.py +1177 -0
  345. truthound/observability/tracing/otel/compat.py +681 -0
  346. truthound/observability/tracing/otel/config.py +691 -0
  347. truthound/observability/tracing/otel/detection.py +327 -0
  348. truthound/observability/tracing/otel/protocols.py +426 -0
  349. truthound/observability/tracing/processor.py +561 -0
  350. truthound/observability/tracing/propagator.py +757 -0
  351. truthound/observability/tracing/provider.py +569 -0
  352. truthound/observability/tracing/resource.py +515 -0
  353. truthound/observability/tracing/sampler.py +487 -0
  354. truthound/observability/tracing/span.py +676 -0
  355. truthound/plugins/__init__.py +198 -0
  356. truthound/plugins/base.py +599 -0
  357. truthound/plugins/cli.py +680 -0
  358. truthound/plugins/dependencies/__init__.py +42 -0
  359. truthound/plugins/dependencies/graph.py +422 -0
  360. truthound/plugins/dependencies/resolver.py +417 -0
  361. truthound/plugins/discovery.py +379 -0
  362. truthound/plugins/docs/__init__.py +46 -0
  363. truthound/plugins/docs/extractor.py +444 -0
  364. truthound/plugins/docs/renderer.py +499 -0
  365. truthound/plugins/enterprise_manager.py +877 -0
  366. truthound/plugins/examples/__init__.py +19 -0
  367. truthound/plugins/examples/custom_validators.py +317 -0
  368. truthound/plugins/examples/slack_notifier.py +312 -0
  369. truthound/plugins/examples/xml_reporter.py +254 -0
  370. truthound/plugins/hooks.py +558 -0
  371. truthound/plugins/lifecycle/__init__.py +43 -0
  372. truthound/plugins/lifecycle/hot_reload.py +402 -0
  373. truthound/plugins/lifecycle/manager.py +371 -0
  374. truthound/plugins/manager.py +736 -0
  375. truthound/plugins/registry.py +338 -0
  376. truthound/plugins/security/__init__.py +93 -0
  377. truthound/plugins/security/exceptions.py +332 -0
  378. truthound/plugins/security/policies.py +348 -0
  379. truthound/plugins/security/protocols.py +643 -0
  380. truthound/plugins/security/sandbox/__init__.py +45 -0
  381. truthound/plugins/security/sandbox/context.py +158 -0
  382. truthound/plugins/security/sandbox/engines/__init__.py +19 -0
  383. truthound/plugins/security/sandbox/engines/container.py +379 -0
  384. truthound/plugins/security/sandbox/engines/noop.py +144 -0
  385. truthound/plugins/security/sandbox/engines/process.py +336 -0
  386. truthound/plugins/security/sandbox/factory.py +211 -0
  387. truthound/plugins/security/signing/__init__.py +57 -0
  388. truthound/plugins/security/signing/service.py +330 -0
  389. truthound/plugins/security/signing/trust_store.py +368 -0
  390. truthound/plugins/security/signing/verifier.py +459 -0
  391. truthound/plugins/versioning/__init__.py +41 -0
  392. truthound/plugins/versioning/constraints.py +297 -0
  393. truthound/plugins/versioning/resolver.py +329 -0
  394. truthound/profiler/__init__.py +1729 -0
  395. truthound/profiler/_lazy.py +452 -0
  396. truthound/profiler/ab_testing/__init__.py +80 -0
  397. truthound/profiler/ab_testing/analysis.py +449 -0
  398. truthound/profiler/ab_testing/base.py +257 -0
  399. truthound/profiler/ab_testing/experiment.py +395 -0
  400. truthound/profiler/ab_testing/tracking.py +368 -0
  401. truthound/profiler/auto_threshold.py +1170 -0
  402. truthound/profiler/base.py +579 -0
  403. truthound/profiler/cache_patterns.py +911 -0
  404. truthound/profiler/caching.py +1303 -0
  405. truthound/profiler/column_profiler.py +712 -0
  406. truthound/profiler/comparison.py +1007 -0
  407. truthound/profiler/custom_patterns.py +1170 -0
  408. truthound/profiler/dashboard/__init__.py +50 -0
  409. truthound/profiler/dashboard/app.py +476 -0
  410. truthound/profiler/dashboard/components.py +457 -0
  411. truthound/profiler/dashboard/config.py +72 -0
  412. truthound/profiler/distributed/__init__.py +83 -0
  413. truthound/profiler/distributed/base.py +281 -0
  414. truthound/profiler/distributed/dask_backend.py +498 -0
  415. truthound/profiler/distributed/local_backend.py +293 -0
  416. truthound/profiler/distributed/profiler.py +304 -0
  417. truthound/profiler/distributed/ray_backend.py +374 -0
  418. truthound/profiler/distributed/spark_backend.py +375 -0
  419. truthound/profiler/distributed.py +1366 -0
  420. truthound/profiler/enterprise_sampling.py +1065 -0
  421. truthound/profiler/errors.py +488 -0
  422. truthound/profiler/evolution/__init__.py +91 -0
  423. truthound/profiler/evolution/alerts.py +426 -0
  424. truthound/profiler/evolution/changes.py +206 -0
  425. truthound/profiler/evolution/compatibility.py +365 -0
  426. truthound/profiler/evolution/detector.py +372 -0
  427. truthound/profiler/evolution/protocols.py +121 -0
  428. truthound/profiler/generators/__init__.py +48 -0
  429. truthound/profiler/generators/base.py +384 -0
  430. truthound/profiler/generators/ml_rules.py +375 -0
  431. truthound/profiler/generators/pattern_rules.py +384 -0
  432. truthound/profiler/generators/schema_rules.py +267 -0
  433. truthound/profiler/generators/stats_rules.py +324 -0
  434. truthound/profiler/generators/suite_generator.py +857 -0
  435. truthound/profiler/i18n.py +1542 -0
  436. truthound/profiler/incremental.py +554 -0
  437. truthound/profiler/incremental_validation.py +1710 -0
  438. truthound/profiler/integration/__init__.py +73 -0
  439. truthound/profiler/integration/adapters.py +345 -0
  440. truthound/profiler/integration/context.py +371 -0
  441. truthound/profiler/integration/executor.py +527 -0
  442. truthound/profiler/integration/naming.py +75 -0
  443. truthound/profiler/integration/protocols.py +243 -0
  444. truthound/profiler/memory.py +1185 -0
  445. truthound/profiler/migration/__init__.py +60 -0
  446. truthound/profiler/migration/base.py +345 -0
  447. truthound/profiler/migration/manager.py +444 -0
  448. truthound/profiler/migration/v1_0_to_v1_1.py +484 -0
  449. truthound/profiler/ml/__init__.py +73 -0
  450. truthound/profiler/ml/base.py +244 -0
  451. truthound/profiler/ml/classifier.py +507 -0
  452. truthound/profiler/ml/feature_extraction.py +604 -0
  453. truthound/profiler/ml/pretrained.py +448 -0
  454. truthound/profiler/ml_inference.py +1276 -0
  455. truthound/profiler/native_patterns.py +815 -0
  456. truthound/profiler/observability.py +1184 -0
  457. truthound/profiler/process_timeout.py +1566 -0
  458. truthound/profiler/progress.py +568 -0
  459. truthound/profiler/progress_callbacks.py +1734 -0
  460. truthound/profiler/quality.py +1345 -0
  461. truthound/profiler/resilience.py +1180 -0
  462. truthound/profiler/sampled_matcher.py +794 -0
  463. truthound/profiler/sampling.py +1288 -0
  464. truthound/profiler/scheduling/__init__.py +82 -0
  465. truthound/profiler/scheduling/protocols.py +214 -0
  466. truthound/profiler/scheduling/scheduler.py +474 -0
  467. truthound/profiler/scheduling/storage.py +457 -0
  468. truthound/profiler/scheduling/triggers.py +449 -0
  469. truthound/profiler/schema.py +603 -0
  470. truthound/profiler/streaming.py +685 -0
  471. truthound/profiler/streaming_patterns.py +1354 -0
  472. truthound/profiler/suite_cli.py +625 -0
  473. truthound/profiler/suite_config.py +789 -0
  474. truthound/profiler/suite_export.py +1268 -0
  475. truthound/profiler/table_profiler.py +547 -0
  476. truthound/profiler/timeout.py +565 -0
  477. truthound/profiler/validation.py +1532 -0
  478. truthound/profiler/visualization/__init__.py +118 -0
  479. truthound/profiler/visualization/base.py +346 -0
  480. truthound/profiler/visualization/generator.py +1259 -0
  481. truthound/profiler/visualization/plotly_renderer.py +811 -0
  482. truthound/profiler/visualization/renderers.py +669 -0
  483. truthound/profiler/visualization/sections.py +540 -0
  484. truthound/profiler/visualization.py +2122 -0
  485. truthound/profiler/yaml_validation.py +1151 -0
  486. truthound/py.typed +0 -0
  487. truthound/ratelimit/__init__.py +248 -0
  488. truthound/ratelimit/algorithms.py +1108 -0
  489. truthound/ratelimit/core.py +573 -0
  490. truthound/ratelimit/integration.py +532 -0
  491. truthound/ratelimit/limiter.py +663 -0
  492. truthound/ratelimit/middleware.py +700 -0
  493. truthound/ratelimit/policy.py +792 -0
  494. truthound/ratelimit/storage.py +763 -0
  495. truthound/rbac/__init__.py +340 -0
  496. truthound/rbac/core.py +976 -0
  497. truthound/rbac/integration.py +760 -0
  498. truthound/rbac/manager.py +1052 -0
  499. truthound/rbac/middleware.py +842 -0
  500. truthound/rbac/policy.py +954 -0
  501. truthound/rbac/storage.py +878 -0
  502. truthound/realtime/__init__.py +141 -0
  503. truthound/realtime/adapters/__init__.py +43 -0
  504. truthound/realtime/adapters/base.py +533 -0
  505. truthound/realtime/adapters/kafka.py +487 -0
  506. truthound/realtime/adapters/kinesis.py +479 -0
  507. truthound/realtime/adapters/mock.py +243 -0
  508. truthound/realtime/base.py +553 -0
  509. truthound/realtime/factory.py +382 -0
  510. truthound/realtime/incremental.py +660 -0
  511. truthound/realtime/processing/__init__.py +67 -0
  512. truthound/realtime/processing/exactly_once.py +575 -0
  513. truthound/realtime/processing/state.py +547 -0
  514. truthound/realtime/processing/windows.py +647 -0
  515. truthound/realtime/protocols.py +569 -0
  516. truthound/realtime/streaming.py +605 -0
  517. truthound/realtime/testing/__init__.py +32 -0
  518. truthound/realtime/testing/containers.py +615 -0
  519. truthound/realtime/testing/fixtures.py +484 -0
  520. truthound/report.py +280 -0
  521. truthound/reporters/__init__.py +46 -0
  522. truthound/reporters/_protocols.py +30 -0
  523. truthound/reporters/base.py +324 -0
  524. truthound/reporters/ci/__init__.py +66 -0
  525. truthound/reporters/ci/azure.py +436 -0
  526. truthound/reporters/ci/base.py +509 -0
  527. truthound/reporters/ci/bitbucket.py +567 -0
  528. truthound/reporters/ci/circleci.py +547 -0
  529. truthound/reporters/ci/detection.py +364 -0
  530. truthound/reporters/ci/factory.py +182 -0
  531. truthound/reporters/ci/github.py +388 -0
  532. truthound/reporters/ci/gitlab.py +471 -0
  533. truthound/reporters/ci/jenkins.py +525 -0
  534. truthound/reporters/console_reporter.py +299 -0
  535. truthound/reporters/factory.py +211 -0
  536. truthound/reporters/html_reporter.py +524 -0
  537. truthound/reporters/json_reporter.py +256 -0
  538. truthound/reporters/markdown_reporter.py +280 -0
  539. truthound/reporters/sdk/__init__.py +174 -0
  540. truthound/reporters/sdk/builder.py +558 -0
  541. truthound/reporters/sdk/mixins.py +1150 -0
  542. truthound/reporters/sdk/schema.py +1493 -0
  543. truthound/reporters/sdk/templates.py +666 -0
  544. truthound/reporters/sdk/testing.py +968 -0
  545. truthound/scanners.py +170 -0
  546. truthound/scheduling/__init__.py +122 -0
  547. truthound/scheduling/cron.py +1136 -0
  548. truthound/scheduling/presets.py +212 -0
  549. truthound/schema.py +275 -0
  550. truthound/secrets/__init__.py +173 -0
  551. truthound/secrets/base.py +618 -0
  552. truthound/secrets/cloud.py +682 -0
  553. truthound/secrets/integration.py +507 -0
  554. truthound/secrets/manager.py +633 -0
  555. truthound/secrets/oidc/__init__.py +172 -0
  556. truthound/secrets/oidc/base.py +902 -0
  557. truthound/secrets/oidc/credential_provider.py +623 -0
  558. truthound/secrets/oidc/exchangers.py +1001 -0
  559. truthound/secrets/oidc/github/__init__.py +110 -0
  560. truthound/secrets/oidc/github/claims.py +718 -0
  561. truthound/secrets/oidc/github/enhanced_provider.py +693 -0
  562. truthound/secrets/oidc/github/trust_policy.py +742 -0
  563. truthound/secrets/oidc/github/verification.py +723 -0
  564. truthound/secrets/oidc/github/workflow.py +691 -0
  565. truthound/secrets/oidc/providers.py +825 -0
  566. truthound/secrets/providers.py +506 -0
  567. truthound/secrets/resolver.py +495 -0
  568. truthound/stores/__init__.py +177 -0
  569. truthound/stores/backends/__init__.py +18 -0
  570. truthound/stores/backends/_protocols.py +340 -0
  571. truthound/stores/backends/azure_blob.py +530 -0
  572. truthound/stores/backends/concurrent_filesystem.py +915 -0
  573. truthound/stores/backends/connection_pool.py +1365 -0
  574. truthound/stores/backends/database.py +743 -0
  575. truthound/stores/backends/filesystem.py +538 -0
  576. truthound/stores/backends/gcs.py +399 -0
  577. truthound/stores/backends/memory.py +354 -0
  578. truthound/stores/backends/s3.py +434 -0
  579. truthound/stores/backpressure/__init__.py +84 -0
  580. truthound/stores/backpressure/base.py +375 -0
  581. truthound/stores/backpressure/circuit_breaker.py +434 -0
  582. truthound/stores/backpressure/monitor.py +376 -0
  583. truthound/stores/backpressure/strategies.py +677 -0
  584. truthound/stores/base.py +551 -0
  585. truthound/stores/batching/__init__.py +65 -0
  586. truthound/stores/batching/base.py +305 -0
  587. truthound/stores/batching/buffer.py +370 -0
  588. truthound/stores/batching/store.py +248 -0
  589. truthound/stores/batching/writer.py +521 -0
  590. truthound/stores/caching/__init__.py +60 -0
  591. truthound/stores/caching/backends.py +684 -0
  592. truthound/stores/caching/base.py +356 -0
  593. truthound/stores/caching/store.py +305 -0
  594. truthound/stores/compression/__init__.py +193 -0
  595. truthound/stores/compression/adaptive.py +694 -0
  596. truthound/stores/compression/base.py +514 -0
  597. truthound/stores/compression/pipeline.py +868 -0
  598. truthound/stores/compression/providers.py +672 -0
  599. truthound/stores/compression/streaming.py +832 -0
  600. truthound/stores/concurrency/__init__.py +81 -0
  601. truthound/stores/concurrency/atomic.py +556 -0
  602. truthound/stores/concurrency/index.py +775 -0
  603. truthound/stores/concurrency/locks.py +576 -0
  604. truthound/stores/concurrency/manager.py +482 -0
  605. truthound/stores/encryption/__init__.py +297 -0
  606. truthound/stores/encryption/base.py +952 -0
  607. truthound/stores/encryption/keys.py +1191 -0
  608. truthound/stores/encryption/pipeline.py +903 -0
  609. truthound/stores/encryption/providers.py +953 -0
  610. truthound/stores/encryption/streaming.py +950 -0
  611. truthound/stores/expectations.py +227 -0
  612. truthound/stores/factory.py +246 -0
  613. truthound/stores/migration/__init__.py +75 -0
  614. truthound/stores/migration/base.py +480 -0
  615. truthound/stores/migration/manager.py +347 -0
  616. truthound/stores/migration/registry.py +382 -0
  617. truthound/stores/migration/store.py +559 -0
  618. truthound/stores/observability/__init__.py +106 -0
  619. truthound/stores/observability/audit.py +718 -0
  620. truthound/stores/observability/config.py +270 -0
  621. truthound/stores/observability/factory.py +208 -0
  622. truthound/stores/observability/metrics.py +636 -0
  623. truthound/stores/observability/protocols.py +410 -0
  624. truthound/stores/observability/store.py +570 -0
  625. truthound/stores/observability/tracing.py +784 -0
  626. truthound/stores/replication/__init__.py +76 -0
  627. truthound/stores/replication/base.py +260 -0
  628. truthound/stores/replication/monitor.py +269 -0
  629. truthound/stores/replication/store.py +439 -0
  630. truthound/stores/replication/syncer.py +391 -0
  631. truthound/stores/results.py +359 -0
  632. truthound/stores/retention/__init__.py +77 -0
  633. truthound/stores/retention/base.py +378 -0
  634. truthound/stores/retention/policies.py +621 -0
  635. truthound/stores/retention/scheduler.py +279 -0
  636. truthound/stores/retention/store.py +526 -0
  637. truthound/stores/streaming/__init__.py +138 -0
  638. truthound/stores/streaming/base.py +801 -0
  639. truthound/stores/streaming/database.py +984 -0
  640. truthound/stores/streaming/filesystem.py +719 -0
  641. truthound/stores/streaming/reader.py +629 -0
  642. truthound/stores/streaming/s3.py +843 -0
  643. truthound/stores/streaming/writer.py +790 -0
  644. truthound/stores/tiering/__init__.py +108 -0
  645. truthound/stores/tiering/base.py +462 -0
  646. truthound/stores/tiering/manager.py +249 -0
  647. truthound/stores/tiering/policies.py +692 -0
  648. truthound/stores/tiering/store.py +526 -0
  649. truthound/stores/versioning/__init__.py +56 -0
  650. truthound/stores/versioning/base.py +376 -0
  651. truthound/stores/versioning/store.py +660 -0
  652. truthound/stores/versioning/strategies.py +353 -0
  653. truthound/types.py +56 -0
  654. truthound/validators/__init__.py +774 -0
  655. truthound/validators/aggregate/__init__.py +27 -0
  656. truthound/validators/aggregate/central.py +116 -0
  657. truthound/validators/aggregate/extremes.py +116 -0
  658. truthound/validators/aggregate/spread.py +118 -0
  659. truthound/validators/aggregate/sum.py +64 -0
  660. truthound/validators/aggregate/type.py +78 -0
  661. truthound/validators/anomaly/__init__.py +93 -0
  662. truthound/validators/anomaly/base.py +431 -0
  663. truthound/validators/anomaly/ml_based.py +1190 -0
  664. truthound/validators/anomaly/multivariate.py +647 -0
  665. truthound/validators/anomaly/statistical.py +599 -0
  666. truthound/validators/base.py +1089 -0
  667. truthound/validators/business_rule/__init__.py +46 -0
  668. truthound/validators/business_rule/base.py +147 -0
  669. truthound/validators/business_rule/checksum.py +509 -0
  670. truthound/validators/business_rule/financial.py +526 -0
  671. truthound/validators/cache.py +733 -0
  672. truthound/validators/completeness/__init__.py +39 -0
  673. truthound/validators/completeness/conditional.py +73 -0
  674. truthound/validators/completeness/default.py +98 -0
  675. truthound/validators/completeness/empty.py +103 -0
  676. truthound/validators/completeness/nan.py +337 -0
  677. truthound/validators/completeness/null.py +152 -0
  678. truthound/validators/cross_table/__init__.py +17 -0
  679. truthound/validators/cross_table/aggregate.py +333 -0
  680. truthound/validators/cross_table/row_count.py +122 -0
  681. truthound/validators/datetime/__init__.py +29 -0
  682. truthound/validators/datetime/format.py +78 -0
  683. truthound/validators/datetime/freshness.py +269 -0
  684. truthound/validators/datetime/order.py +73 -0
  685. truthound/validators/datetime/parseable.py +185 -0
  686. truthound/validators/datetime/range.py +202 -0
  687. truthound/validators/datetime/timezone.py +69 -0
  688. truthound/validators/distribution/__init__.py +49 -0
  689. truthound/validators/distribution/distribution.py +128 -0
  690. truthound/validators/distribution/monotonic.py +119 -0
  691. truthound/validators/distribution/outlier.py +178 -0
  692. truthound/validators/distribution/quantile.py +80 -0
  693. truthound/validators/distribution/range.py +254 -0
  694. truthound/validators/distribution/set.py +125 -0
  695. truthound/validators/distribution/statistical.py +459 -0
  696. truthound/validators/drift/__init__.py +79 -0
  697. truthound/validators/drift/base.py +427 -0
  698. truthound/validators/drift/multi_feature.py +401 -0
  699. truthound/validators/drift/numeric.py +395 -0
  700. truthound/validators/drift/psi.py +446 -0
  701. truthound/validators/drift/statistical.py +510 -0
  702. truthound/validators/enterprise.py +1658 -0
  703. truthound/validators/geospatial/__init__.py +80 -0
  704. truthound/validators/geospatial/base.py +97 -0
  705. truthound/validators/geospatial/boundary.py +238 -0
  706. truthound/validators/geospatial/coordinate.py +351 -0
  707. truthound/validators/geospatial/distance.py +399 -0
  708. truthound/validators/geospatial/polygon.py +665 -0
  709. truthound/validators/i18n/__init__.py +308 -0
  710. truthound/validators/i18n/bidi.py +571 -0
  711. truthound/validators/i18n/catalogs.py +570 -0
  712. truthound/validators/i18n/dialects.py +763 -0
  713. truthound/validators/i18n/extended_catalogs.py +549 -0
  714. truthound/validators/i18n/formatting.py +1434 -0
  715. truthound/validators/i18n/loader.py +1020 -0
  716. truthound/validators/i18n/messages.py +521 -0
  717. truthound/validators/i18n/plural.py +683 -0
  718. truthound/validators/i18n/protocols.py +855 -0
  719. truthound/validators/i18n/tms.py +1162 -0
  720. truthound/validators/localization/__init__.py +53 -0
  721. truthound/validators/localization/base.py +122 -0
  722. truthound/validators/localization/chinese.py +362 -0
  723. truthound/validators/localization/japanese.py +275 -0
  724. truthound/validators/localization/korean.py +524 -0
  725. truthound/validators/memory/__init__.py +94 -0
  726. truthound/validators/memory/approximate_knn.py +506 -0
  727. truthound/validators/memory/base.py +547 -0
  728. truthound/validators/memory/sgd_online.py +719 -0
  729. truthound/validators/memory/streaming_ecdf.py +753 -0
  730. truthound/validators/ml_feature/__init__.py +54 -0
  731. truthound/validators/ml_feature/base.py +249 -0
  732. truthound/validators/ml_feature/correlation.py +299 -0
  733. truthound/validators/ml_feature/leakage.py +344 -0
  734. truthound/validators/ml_feature/null_impact.py +270 -0
  735. truthound/validators/ml_feature/scale.py +264 -0
  736. truthound/validators/multi_column/__init__.py +89 -0
  737. truthound/validators/multi_column/arithmetic.py +284 -0
  738. truthound/validators/multi_column/base.py +231 -0
  739. truthound/validators/multi_column/comparison.py +273 -0
  740. truthound/validators/multi_column/consistency.py +312 -0
  741. truthound/validators/multi_column/statistical.py +299 -0
  742. truthound/validators/optimization/__init__.py +164 -0
  743. truthound/validators/optimization/aggregation.py +563 -0
  744. truthound/validators/optimization/covariance.py +556 -0
  745. truthound/validators/optimization/geo.py +626 -0
  746. truthound/validators/optimization/graph.py +587 -0
  747. truthound/validators/optimization/orchestrator.py +970 -0
  748. truthound/validators/optimization/profiling.py +1312 -0
  749. truthound/validators/privacy/__init__.py +223 -0
  750. truthound/validators/privacy/base.py +635 -0
  751. truthound/validators/privacy/ccpa.py +670 -0
  752. truthound/validators/privacy/gdpr.py +728 -0
  753. truthound/validators/privacy/global_patterns.py +604 -0
  754. truthound/validators/privacy/plugins.py +867 -0
  755. truthound/validators/profiling/__init__.py +52 -0
  756. truthound/validators/profiling/base.py +175 -0
  757. truthound/validators/profiling/cardinality.py +312 -0
  758. truthound/validators/profiling/entropy.py +391 -0
  759. truthound/validators/profiling/frequency.py +455 -0
  760. truthound/validators/pushdown_support.py +660 -0
  761. truthound/validators/query/__init__.py +91 -0
  762. truthound/validators/query/aggregate.py +346 -0
  763. truthound/validators/query/base.py +246 -0
  764. truthound/validators/query/column.py +249 -0
  765. truthound/validators/query/expression.py +274 -0
  766. truthound/validators/query/result.py +323 -0
  767. truthound/validators/query/row_count.py +264 -0
  768. truthound/validators/referential/__init__.py +80 -0
  769. truthound/validators/referential/base.py +395 -0
  770. truthound/validators/referential/cascade.py +391 -0
  771. truthound/validators/referential/circular.py +563 -0
  772. truthound/validators/referential/foreign_key.py +624 -0
  773. truthound/validators/referential/orphan.py +485 -0
  774. truthound/validators/registry.py +112 -0
  775. truthound/validators/schema/__init__.py +41 -0
  776. truthound/validators/schema/column_count.py +142 -0
  777. truthound/validators/schema/column_exists.py +80 -0
  778. truthound/validators/schema/column_order.py +82 -0
  779. truthound/validators/schema/column_pair.py +85 -0
  780. truthound/validators/schema/column_pair_set.py +195 -0
  781. truthound/validators/schema/column_type.py +94 -0
  782. truthound/validators/schema/multi_column.py +53 -0
  783. truthound/validators/schema/multi_column_aggregate.py +175 -0
  784. truthound/validators/schema/referential.py +274 -0
  785. truthound/validators/schema/table_schema.py +91 -0
  786. truthound/validators/schema_validator.py +219 -0
  787. truthound/validators/sdk/__init__.py +250 -0
  788. truthound/validators/sdk/builder.py +680 -0
  789. truthound/validators/sdk/decorators.py +474 -0
  790. truthound/validators/sdk/enterprise/__init__.py +211 -0
  791. truthound/validators/sdk/enterprise/docs.py +725 -0
  792. truthound/validators/sdk/enterprise/fuzzing.py +659 -0
  793. truthound/validators/sdk/enterprise/licensing.py +709 -0
  794. truthound/validators/sdk/enterprise/manager.py +543 -0
  795. truthound/validators/sdk/enterprise/resources.py +628 -0
  796. truthound/validators/sdk/enterprise/sandbox.py +766 -0
  797. truthound/validators/sdk/enterprise/signing.py +603 -0
  798. truthound/validators/sdk/enterprise/templates.py +865 -0
  799. truthound/validators/sdk/enterprise/versioning.py +659 -0
  800. truthound/validators/sdk/templates.py +757 -0
  801. truthound/validators/sdk/testing.py +807 -0
  802. truthound/validators/security/__init__.py +181 -0
  803. truthound/validators/security/redos/__init__.py +182 -0
  804. truthound/validators/security/redos/core.py +861 -0
  805. truthound/validators/security/redos/cpu_monitor.py +593 -0
  806. truthound/validators/security/redos/cve_database.py +791 -0
  807. truthound/validators/security/redos/ml/__init__.py +155 -0
  808. truthound/validators/security/redos/ml/base.py +785 -0
  809. truthound/validators/security/redos/ml/datasets.py +618 -0
  810. truthound/validators/security/redos/ml/features.py +359 -0
  811. truthound/validators/security/redos/ml/models.py +1000 -0
  812. truthound/validators/security/redos/ml/predictor.py +507 -0
  813. truthound/validators/security/redos/ml/storage.py +632 -0
  814. truthound/validators/security/redos/ml/training.py +571 -0
  815. truthound/validators/security/redos/ml_analyzer.py +937 -0
  816. truthound/validators/security/redos/optimizer.py +674 -0
  817. truthound/validators/security/redos/profiler.py +682 -0
  818. truthound/validators/security/redos/re2_engine.py +709 -0
  819. truthound/validators/security/redos.py +886 -0
  820. truthound/validators/security/sql_security.py +1247 -0
  821. truthound/validators/streaming/__init__.py +126 -0
  822. truthound/validators/streaming/base.py +292 -0
  823. truthound/validators/streaming/completeness.py +210 -0
  824. truthound/validators/streaming/mixin.py +575 -0
  825. truthound/validators/streaming/range.py +308 -0
  826. truthound/validators/streaming/sources.py +846 -0
  827. truthound/validators/string/__init__.py +57 -0
  828. truthound/validators/string/casing.py +158 -0
  829. truthound/validators/string/charset.py +96 -0
  830. truthound/validators/string/format.py +501 -0
  831. truthound/validators/string/json.py +77 -0
  832. truthound/validators/string/json_schema.py +184 -0
  833. truthound/validators/string/length.py +104 -0
  834. truthound/validators/string/like_pattern.py +237 -0
  835. truthound/validators/string/regex.py +202 -0
  836. truthound/validators/string/regex_extended.py +435 -0
  837. truthound/validators/table/__init__.py +88 -0
  838. truthound/validators/table/base.py +78 -0
  839. truthound/validators/table/column_count.py +198 -0
  840. truthound/validators/table/freshness.py +362 -0
  841. truthound/validators/table/row_count.py +251 -0
  842. truthound/validators/table/schema.py +333 -0
  843. truthound/validators/table/size.py +285 -0
  844. truthound/validators/timeout/__init__.py +102 -0
  845. truthound/validators/timeout/advanced/__init__.py +247 -0
  846. truthound/validators/timeout/advanced/circuit_breaker.py +675 -0
  847. truthound/validators/timeout/advanced/prediction.py +773 -0
  848. truthound/validators/timeout/advanced/priority.py +618 -0
  849. truthound/validators/timeout/advanced/redis_backend.py +770 -0
  850. truthound/validators/timeout/advanced/retry.py +721 -0
  851. truthound/validators/timeout/advanced/sampling.py +788 -0
  852. truthound/validators/timeout/advanced/sla.py +661 -0
  853. truthound/validators/timeout/advanced/telemetry.py +804 -0
  854. truthound/validators/timeout/cascade.py +477 -0
  855. truthound/validators/timeout/deadline.py +657 -0
  856. truthound/validators/timeout/degradation.py +525 -0
  857. truthound/validators/timeout/distributed.py +597 -0
  858. truthound/validators/timeseries/__init__.py +89 -0
  859. truthound/validators/timeseries/base.py +326 -0
  860. truthound/validators/timeseries/completeness.py +617 -0
  861. truthound/validators/timeseries/gap.py +485 -0
  862. truthound/validators/timeseries/monotonic.py +310 -0
  863. truthound/validators/timeseries/seasonality.py +422 -0
  864. truthound/validators/timeseries/trend.py +510 -0
  865. truthound/validators/uniqueness/__init__.py +59 -0
  866. truthound/validators/uniqueness/approximate.py +475 -0
  867. truthound/validators/uniqueness/distinct_values.py +253 -0
  868. truthound/validators/uniqueness/duplicate.py +118 -0
  869. truthound/validators/uniqueness/primary_key.py +140 -0
  870. truthound/validators/uniqueness/unique.py +191 -0
  871. truthound/validators/uniqueness/within_record.py +599 -0
  872. truthound/validators/utils.py +756 -0
  873. truthound-1.0.8.dist-info/METADATA +474 -0
  874. truthound-1.0.8.dist-info/RECORD +877 -0
  875. truthound-1.0.8.dist-info/WHEEL +4 -0
  876. truthound-1.0.8.dist-info/entry_points.txt +2 -0
  877. truthound-1.0.8.dist-info/licenses/LICENSE +190 -0
@@ -0,0 +1,976 @@
1
+ """Dask-native execution engine for distributed data validation.
2
+
3
+ This module provides a Dask-native execution engine that:
4
+ - Executes validation operations directly on Dask DataFrames
5
+ - Avoids Polars conversion overhead for distributed operations
6
+ - Uses Arrow for efficient data transfer when conversion is needed
7
+ - Supports distributed aggregations with proper reduce semantics
8
+
9
+ Architecture:
10
+ ┌─────────────────────────────────────────────────────────────────┐
11
+ │ DaskExecutionEngine │
12
+ │ │
13
+ │ ┌──────────────────────────────────────────────────────────┐ │
14
+ │ │ Native Dask Operations │ │
15
+ │ │ (count, aggregate, filter - no conversion overhead) │ │
16
+ │ └──────────────────────────────────────────────────────────┘ │
17
+ │ │ │
18
+ │ ▼ │
19
+ │ ┌──────────────────────────────────────────────────────────┐ │
20
+ │ │ Arrow Bridge (when needed) │ │
21
+ │ │ (zero-copy conversion to Polars for ML validators) │ │
22
+ │ └──────────────────────────────────────────────────────────┘ │
23
+ │ │ │
24
+ │ ▼ │
25
+ │ ┌──────────────────────────────────────────────────────────┐ │
26
+ │ │ Polars LazyFrame (fallback) │ │
27
+ │ │ (only for validators that require Polars operations) │ │
28
+ │ └──────────────────────────────────────────────────────────┘ │
29
+ │ │
30
+ └─────────────────────────────────────────────────────────────────┘
31
+
32
+ Example:
33
+ >>> import dask.dataframe as dd
34
+ >>> from truthound.execution.distributed import DaskExecutionEngine
35
+ >>>
36
+ >>> ddf = dd.read_parquet("large_data.parquet")
37
+ >>>
38
+ >>> # Create native Dask engine
39
+ >>> engine = DaskExecutionEngine.from_dataframe(ddf)
40
+ >>>
41
+ >>> # Native Dask operations (no conversion overhead)
42
+ >>> row_count = engine.count_rows()
43
+ >>> null_counts = engine.count_nulls_all()
44
+ >>> stats = engine.get_stats("price")
45
+ >>>
46
+ >>> # Convert to Polars only when needed (via Arrow)
47
+ >>> lf = engine.to_polars_lazyframe()
48
+ """
49
+
50
+ from __future__ import annotations
51
+
52
+ import logging
53
+ import time
54
+ from dataclasses import dataclass, field
55
+ from functools import reduce
56
+ from typing import TYPE_CHECKING, Any, Callable, Iterator
57
+
58
+ from truthound.execution.distributed.base import (
59
+ BaseDistributedEngine,
60
+ DistributedEngineConfig,
61
+ ExecutionMetrics,
62
+ )
63
+ from truthound.execution.distributed.protocols import (
64
+ AggregationScope,
65
+ AggregationSpec,
66
+ ComputeBackend,
67
+ DistributedResult,
68
+ PartitionInfo,
69
+ PartitionStrategy,
70
+ get_aggregator,
71
+ )
72
+
73
+ if TYPE_CHECKING:
74
+ import dask.dataframe as dd
75
+ import pandas as pd
76
+ import pyarrow as pa
77
+ from distributed import Client
78
+
79
+ logger = logging.getLogger(__name__)
80
+
81
+
82
+ # =============================================================================
83
+ # Configuration
84
+ # =============================================================================
85
+
86
+
87
@dataclass
class DaskEngineConfig(DistributedEngineConfig):
    """Dask-specific settings layered on top of the distributed engine config.

    Attributes:
        scheduler: Dask scheduler name: 'distributed', 'threads' or
            'synchronous'.
        client_address: Address of an existing distributed scheduler
            (distributed mode only).
        n_workers: Worker count for a local cluster.
        threads_per_worker: Number of threads each worker runs.
        memory_per_worker: Per-worker memory limit.
        processes: Run workers as processes rather than threads.
        dashboard_address: Dashboard address (distributed mode only).
        blocksize: Block size used when reading files.
        persist_intermediate: Persist intermediate results.
    """

    # --- scheduler selection -------------------------------------------------
    scheduler: str = "threads"
    client_address: str | None = None

    # --- local cluster sizing ------------------------------------------------
    n_workers: int | None = None
    threads_per_worker: int = 2
    memory_per_worker: str = "2GB"
    processes: bool = False
    dashboard_address: str = ":8787"

    # --- I/O and intermediate results ----------------------------------------
    blocksize: str = "128MB"
    persist_intermediate: bool = False
112
+
113
+
114
+ def _check_dask_available() -> None:
115
+ """Check if Dask is available."""
116
+ try:
117
+ import dask.dataframe # noqa: F401
118
+ except ImportError:
119
+ raise ImportError(
120
+ "dask is required for DaskExecutionEngine. "
121
+ "Install with: pip install dask[dataframe] distributed"
122
+ )
123
+
124
+
125
+ # =============================================================================
126
+ # Dask Execution Engine
127
+ # =============================================================================
128
+
129
+
130
+ class DaskExecutionEngine(BaseDistributedEngine[DaskEngineConfig]):
131
+ """Dask-native execution engine for distributed validation.
132
+
133
+ This engine executes validation operations directly on Dask DataFrames,
134
+ avoiding the overhead of converting to Polars for operations that can
135
+ be performed natively in Dask.
136
+
137
+ Key Features:
138
+ - Native Dask aggregations (count, sum, mean, min, max, etc.)
139
+ - Distributed null/duplicate checking
140
+ - Arrow-based zero-copy conversion to Polars when needed
141
+ - Partition-aware operations
142
+ - Lazy evaluation with optimized task graphs
143
+
144
+ Example:
145
+ >>> engine = DaskExecutionEngine.from_dataframe(dask_df)
146
+ >>> null_counts = engine.count_nulls_all() # Native Dask
147
+ >>> lf = engine.to_polars_lazyframe() # Arrow-based conversion
148
+ """
149
+
150
+ engine_type = "dask"
151
+
152
def __init__(
    self,
    dask_df: "dd.DataFrame",
    config: DaskEngineConfig | None = None,
    client: "Client | None" = None,
) -> None:
    """Wrap a Dask DataFrame in an execution engine.

    Args:
        dask_df: The Dask DataFrame to operate on.
        config: Engine configuration, or None to let the base class decide.
        client: An existing Dask distributed client, if the caller has one.
    """
    _check_dask_available()
    super().__init__(config)

    # Schema snapshot taken once from the frame at construction time.
    self._columns = [*dask_df.columns]
    self._dtypes = dict(dask_df.dtypes)

    self._ddf = dask_df
    self._cached_row_count: int | None = None

    # _setup_client() inspects self._client, so it must be assigned first.
    self._client = client
    self._setup_client()
176
+
177
@classmethod
def _default_config(cls) -> DaskEngineConfig:
    """Return a DaskEngineConfig carrying only its default field values."""
    default_config = DaskEngineConfig()
    return default_config
181
+
182
+ def _setup_client(self) -> None:
183
+ """Set up Dask distributed client if needed."""
184
+ if self._config.scheduler == "distributed" and self._client is None:
185
+ try:
186
+ from distributed import Client
187
+
188
+ if self._config.client_address:
189
+ self._client = Client(self._config.client_address)
190
+ else:
191
+ self._client = Client(
192
+ n_workers=self._config.n_workers,
193
+ threads_per_worker=self._config.threads_per_worker,
194
+ memory_limit=self._config.memory_per_worker,
195
+ processes=self._config.processes,
196
+ dashboard_address=self._config.dashboard_address,
197
+ )
198
+ except ImportError:
199
+ logger.warning(
200
+ "distributed not installed. Using default scheduler."
201
+ )
202
+
203
+ # -------------------------------------------------------------------------
204
+ # Factory Methods
205
+ # -------------------------------------------------------------------------
206
+
207
@classmethod
def from_dataframe(
    cls,
    ddf: "dd.DataFrame",
    config: DaskEngineConfig | None = None,
    client: "Client | None" = None,
) -> "DaskExecutionEngine":
    """Build an engine around a Dask DataFrame the caller already holds.

    Args:
        ddf: Dask DataFrame to wrap.
        config: Optional engine configuration.
        client: Optional distributed client.

    Returns:
        A new engine instance constructed from the three arguments.
    """
    engine = cls(ddf, config, client)
    return engine
225
+
226
@classmethod
def from_parquet(
    cls,
    path: str,
    config: DaskEngineConfig | None = None,
    client: "Client | None" = None,
    **read_kwargs: Any,
) -> "DaskExecutionEngine":
    """Create an engine from Parquet files.

    Args:
        path: Path to Parquet files (can use glob patterns).
        config: Optional configuration; its ``blocksize`` is used as the
            default block size for reading.
        client: Optional distributed client.
        **read_kwargs: Additional arguments for ``read_parquet``. An explicit
            ``blocksize`` here takes precedence over the configured one.

    Returns:
        DaskExecutionEngine instance.

    Raises:
        ImportError: If dask is not installed.
    """
    _check_dask_available()
    import dask.dataframe as dd

    cfg = config or DaskEngineConfig()
    # Passing blocksize both explicitly and through **read_kwargs would raise
    # "got multiple values for keyword argument"; let the caller's value win.
    read_kwargs.setdefault("blocksize", cfg.blocksize)
    ddf = dd.read_parquet(path, **read_kwargs)

    return cls(ddf, config, client)
252
+
253
@classmethod
def from_csv(
    cls,
    path: str,
    config: DaskEngineConfig | None = None,
    client: "Client | None" = None,
    **read_kwargs: Any,
) -> "DaskExecutionEngine":
    """Create an engine from CSV files.

    Args:
        path: Path to CSV files (can use glob patterns).
        config: Optional configuration; its ``blocksize`` is used as the
            default block size for reading.
        client: Optional distributed client.
        **read_kwargs: Additional arguments for ``read_csv``. An explicit
            ``blocksize`` here takes precedence over the configured one.

    Returns:
        DaskExecutionEngine instance.

    Raises:
        ImportError: If dask is not installed.
    """
    _check_dask_available()
    import dask.dataframe as dd

    cfg = config or DaskEngineConfig()
    # Passing blocksize both explicitly and through **read_kwargs would raise
    # "got multiple values for keyword argument"; let the caller's value win.
    read_kwargs.setdefault("blocksize", cfg.blocksize)
    ddf = dd.read_csv(path, **read_kwargs)

    return cls(ddf, config, client)
279
+
280
@classmethod
def from_pandas(
    cls,
    pdf: "pd.DataFrame",
    npartitions: int = 4,
    config: DaskEngineConfig | None = None,
    client: "Client | None" = None,
) -> "DaskExecutionEngine":
    """Build an engine by partitioning an in-memory Pandas DataFrame.

    Args:
        pdf: Pandas DataFrame to convert.
        npartitions: How many partitions the Dask DataFrame should have.
        config: Optional engine configuration.
        client: Optional distributed client.

    Returns:
        A new engine wrapping the converted Dask DataFrame.
    """
    _check_dask_available()
    import dask.dataframe as dd

    return cls(dd.from_pandas(pdf, npartitions=npartitions), config, client)
305
+
306
+ # -------------------------------------------------------------------------
307
+ # Properties
308
+ # -------------------------------------------------------------------------
309
+
310
@property
def backend_type(self) -> ComputeBackend:
    """Identify this engine's compute backend (always ``ComputeBackend.DASK``)."""
    backend = ComputeBackend.DASK
    return backend
314
+
315
@property
def dask_dataframe(self) -> "dd.DataFrame":
    """Expose the Dask DataFrame this engine wraps."""
    wrapped = self._ddf
    return wrapped
319
+
320
@property
def client(self) -> "Client | None":
    """Expose the distributed client held by the engine, or None if unset."""
    current = self._client
    return current
324
+
325
@property
def supports_sql_pushdown(self) -> bool:
    """Report whether SQL pushdown is available; this engine always says no."""
    pushdown_supported = False
    return pushdown_supported
329
+
330
+ # -------------------------------------------------------------------------
331
+ # Abstract Method Implementations
332
+ # -------------------------------------------------------------------------
333
+
334
+ def _get_partition_count(self) -> int:
335
+ """Get number of data partitions."""
336
+ return self._ddf.npartitions
337
+
338
def _get_partition_info(self) -> list[PartitionInfo]:
    """Describe every partition of the wrapped DataFrame.

    Each entry carries only the partition index, the total partition count
    and the column names; no per-partition boundaries are included.

    Returns:
        One PartitionInfo per partition, ordered by partition index.
    """
    total = self._get_partition_count()
    column_names = tuple(self._columns)

    infos: list[PartitionInfo] = []
    for index in range(total):
        infos.append(
            PartitionInfo(
                partition_id=index,
                total_partitions=total,
                columns=column_names,
            )
        )
    return infos
355
+
356
def _execute_on_partitions(
    self,
    operation: str,
    func: Callable[[Any], dict[str, Any]],
    columns: list[str] | None = None,
) -> list[DistributedResult]:
    """Run ``func`` on every partition via ``map_partitions``.

    Args:
        operation: Operation name, recorded in metrics and on each result.
        func: Callable applied to each partition's pandas DataFrame. It must
            return a dict; the keys ``"value"``, ``"errors"`` and
            ``"metadata"`` are read from it (missing ones default to
            ``None``, ``[]`` and ``{}``).
        columns: Columns to select before mapping. ``None`` or an empty list
            keeps all columns.

    Returns:
        One DistributedResult per partition.

    Raises:
        Exception: Whatever ``func`` or Dask raises is recorded on the
            metrics object and re-raised unchanged.
    """
    import pandas as pd

    metrics = self._start_metrics(operation)

    try:
        ddf = self._ddf
        if columns:
            ddf = ddf[columns]

        # Dask passes ``partition_info`` to the mapped function when it
        # declares that keyword; its "number" entry is the partition index.
        def wrapped_func(pdf: pd.DataFrame, partition_info: dict | None = None) -> pd.DataFrame:
            # perf_counter is monotonic, so the measured duration cannot go
            # negative or jump when the system clock is adjusted.
            started = time.perf_counter()
            result = func(pdf)
            duration_ms = (time.perf_counter() - started) * 1000

            partition_id = partition_info.get("number", 0) if partition_info else 0

            # One summary row per partition.
            return pd.DataFrame([{
                "partition_id": partition_id,
                "value": result.get("value"),
                "row_count": len(pdf),
                "duration_ms": duration_ms,
                "errors": result.get("errors", []),
                "metadata": result.get("metadata", {}),
            }])

        # Explicit meta describing the summary frame's columns and dtypes.
        results_ddf = ddf.map_partitions(
            wrapped_func,
            meta=pd.DataFrame({
                "partition_id": pd.Series(dtype=int),
                "value": pd.Series(dtype=object),
                "row_count": pd.Series(dtype=int),
                "duration_ms": pd.Series(dtype=float),
                "errors": pd.Series(dtype=object),
                "metadata": pd.Series(dtype=object),
            }),
        )

        results_pdf = results_ddf.compute()

        results: list[DistributedResult] = []
        total_rows = 0
        # itertuples avoids building a Series per row as iterrows does.
        for row in results_pdf.itertuples(index=False):
            # Normalise counters to builtin numbers for downstream consumers.
            row_count = int(row.row_count)
            total_rows += row_count
            results.append(
                DistributedResult(
                    partition_id=int(row.partition_id),
                    operation=operation,
                    value=row.value,
                    row_count=row_count,
                    duration_ms=float(row.duration_ms),
                    errors=row.errors if row.errors else [],
                    metadata=row.metadata if row.metadata else {},
                )
            )

        metrics.partitions_processed = len(results)
        metrics.rows_processed = total_rows

        return results

    except Exception as e:
        metrics.errors.append(str(e))
        raise
    finally:
        self._end_metrics(metrics)
441
+
442
def _aggregate_distributed(
    self,
    spec: AggregationSpec,
) -> dict[str, Any]:
    """Perform distributed aggregation using native Dask operations.

    All built-in reductions (count, sum, mean, min, max, std, var, minmax,
    null_count, distinct_count) are queued lazily and evaluated with a single
    ``dask.compute`` call, so Dask can share work between them instead of
    scanning the data once per aggregation. The total row count is computed
    at most once per call. Any other operation is delegated to
    ``_aggregate_with_aggregator``.

    Args:
        spec: Aggregation specification; each entry of ``spec.aggregations``
            provides ``column``, ``operation``, ``alias`` and ``params``.

    Returns:
        Mapping of alias to result, in the order of ``spec.aggregations``.
        ``minmax`` yields ``{"min", "max"}`` and ``null_count`` yields
        ``{"null_count", "total_count"}``.

    Raises:
        Exception: Any failure is recorded on the metrics object and
            re-raised unchanged.
    """
    import dask

    metrics = self._start_metrics("aggregate")

    try:
        # Lazy Dask reductions waiting for the combined compute() call.
        pending: list[Any] = []
        # (alias, kind, payload) in spec order. For deferred kinds the
        # payload is the index of the first related entry in ``pending``.
        plan: list[tuple[Any, str, Any]] = []
        # len(ddf) is eager in Dask, so evaluate it at most once.
        row_total: int | None = None

        def enqueue(*lazy_values: Any) -> int:
            first = len(pending)
            pending.extend(lazy_values)
            return first

        for agg in spec.aggregations:
            column = agg.column
            operation = agg.operation
            alias = agg.alias
            params = agg.params

            if operation == "count" and column == "*":
                if row_total is None:
                    row_total = len(self._ddf)
                plan.append((alias, "ready", row_total))

            elif operation in ("count", "sum", "mean", "min", "max"):
                reducer = getattr(self._ddf[column], operation)
                plan.append((alias, "scalar", enqueue(reducer())))

            elif operation in ("std", "var"):
                ddof = params.get("ddof", 1)
                reducer = getattr(self._ddf[column], operation)
                plan.append((alias, "scalar", enqueue(reducer(ddof=ddof))))

            elif operation == "minmax":
                series = self._ddf[column]
                plan.append(
                    (alias, "minmax", enqueue(series.min(), series.max()))
                )

            elif operation == "null_count":
                if row_total is None:
                    row_total = len(self._ddf)
                plan.append(
                    (alias, "null_count", enqueue(self._ddf[column].isna().sum()))
                )

            elif operation == "distinct_count":
                plan.append(
                    (alias, "int", enqueue(self._ddf[column].nunique()))
                )

            else:
                # Use custom aggregator via map-reduce (evaluated eagerly).
                plan.append(
                    (alias, "ready", self._aggregate_with_aggregator(agg))
                )

        computed = dask.compute(*pending) if pending else ()

        # Assemble in spec order so later duplicate aliases still win.
        results: dict[str, Any] = {}
        for alias, kind, payload in plan:
            if kind == "ready":
                results[alias] = payload
            elif kind == "scalar":
                results[alias] = computed[payload]
            elif kind == "int":
                results[alias] = int(computed[payload])
            elif kind == "minmax":
                results[alias] = {
                    "min": computed[payload],
                    "max": computed[payload + 1],
                }
            else:  # "null_count"
                results[alias] = {
                    "null_count": int(computed[payload]),
                    "total_count": row_total,
                }

        return results

    except Exception as e:
        metrics.errors.append(str(e))
        raise
    finally:
        self._end_metrics(metrics)
527
+
528
def _aggregate_with_aggregator(
    self,
    agg: Any,
) -> Any:
    """Run a custom aggregator over the data as a map-reduce.

    An aggregator is looked up via ``get_aggregator`` using the operation
    name and parameters on ``agg``. Each partition is folded into one
    partial state, the partial states are merged pairwise, and the merged
    state is finalized.

    Args:
        agg: Aggregation entry providing ``operation``, ``params`` and
            ``column``.

    Returns:
        Whatever the aggregator's ``finalize`` returns.
    """
    import pandas as pd

    aggregator = get_aggregator(agg.operation, **agg.params)
    target_column = agg.column

    def fold_partition(frame: pd.DataFrame) -> pd.DataFrame:
        # Map step: reduce one partition to a single-row frame holding its state.
        acc = aggregator.initialize()
        for item in frame[target_column]:
            acc = aggregator.accumulate(acc, item)
        return pd.DataFrame([{"state": acc}])

    state_meta = pd.DataFrame({"state": pd.Series(dtype=object)})
    per_partition = self._ddf.map_partitions(
        fold_partition,
        meta=state_meta,
    ).compute()
    partial_states = per_partition["state"].tolist()

    # Reduce step: merge the per-partition states, or finalize a fresh
    # state when no partial states came back.
    if partial_states:
        merged = reduce(aggregator.merge, partial_states)
        return aggregator.finalize(merged)
    return aggregator.finalize(aggregator.initialize())
565
+
566
def _to_arrow_batches(
    self,
    batch_size: int | None = None,
) -> list["pa.RecordBatch"]:
    """Materialize the Dask DataFrame and split it into Arrow record batches.

    Args:
        batch_size: Maximum number of rows per batch. A falsy value
            (``None`` or ``0``) selects ``self._config.arrow_batch_size``.

    Returns:
        List of Arrow record batches covering the whole DataFrame.
    """
    import pyarrow as pa

    batch_size = batch_size or self._config.arrow_batch_size

    # Compute exactly once. Previously the compute sat inside the ``try``,
    # so a result without ``to_arrow`` triggered a second full computation
    # in the fallback path.
    frame = self._ddf.compute()

    try:
        # Use the frame's own Arrow conversion when it offers one.
        table = frame.to_arrow()
    except AttributeError:
        # Fallback: convert the already-computed frame via pyarrow.
        logger.debug("Falling back to Pandas-based Arrow conversion")
        table = pa.Table.from_pandas(frame)

    return table.to_batches(max_chunksize=batch_size)
593
+
594
def _repartition(self, num_partitions: int) -> "DaskExecutionEngine":
    """Build a new engine over the data split into ``num_partitions`` partitions.

    Args:
        num_partitions: Desired partition count, passed to Dask's
            ``repartition`` as ``npartitions``.

    Returns:
        A new ``DaskExecutionEngine`` sharing this engine's config and client.
    """
    return DaskExecutionEngine(
        self._ddf.repartition(npartitions=num_partitions),
        self._config,
        self._client,
    )
605
+
606
def coalesce(self, num_partitions: int) -> "DaskExecutionEngine":
    """Return a new engine whose data has ``num_partitions`` partitions.

    Args:
        num_partitions: Target partition count.

    Returns:
        A new ``DaskExecutionEngine`` sharing this engine's config and client.
    """
    # Dask has no separate coalesce; repartition with the target count is
    # used as the equivalent.
    merged = self._ddf.repartition(npartitions=num_partitions)
    engine = DaskExecutionEngine(merged, self._config, self._client)
    return engine
618
+
619
+ # -------------------------------------------------------------------------
620
+ # Core Operation Overrides (Native Dask)
621
+ # -------------------------------------------------------------------------
622
+
623
def count_rows(self) -> int:
    """Return the number of rows, reusing a previously stored count if any.

    The instance-level ``_cached_row_count`` is checked first, then the
    keyed cache. On a miss, ``len`` of the Dask DataFrame is taken and
    stored in both places.

    Returns:
        Row count of the underlying DataFrame.
    """
    memo = self._cached_row_count
    if memo is not None:
        return memo

    key = self._cache_key("count_rows")
    hit = self._get_cached(key)
    if hit is not None:
        return hit

    total = len(self._ddf)
    self._cached_row_count = total
    self._set_cached(key, total)
    return total
637
+
638
def get_columns(self) -> list[str]:
    """Return the engine's column names.

    Returns:
        The ``_columns`` list held by the engine (the same object, not a copy).
    """
    column_names = self._columns
    return column_names
641
+
642
def count_nulls(self, column: str) -> int:
    """Return how many values in ``column`` are null, with caching.

    Args:
        column: Name of the column to inspect.

    Returns:
        Number of entries for which ``isna()`` is true.
    """
    key = self._cache_key("count_nulls", column)
    hit = self._get_cached(key)
    if hit is None:
        missing = self._ddf[column].isna().sum()
        hit = int(missing.compute())
        self._set_cached(key, hit)
    return hit
652
+
653
def count_nulls_all(self) -> dict[str, int]:
    """Return the null count of every column, evaluated in one Dask call.

    Returns:
        Mapping of column name to its number of null values.
    """
    key = self._cache_key("count_nulls_all")
    hit = self._get_cached(key)
    if hit is not None:
        return hit

    import dask

    # Build one lazy reduction per column, then evaluate them together.
    lazy_counts = {name: self._ddf[name].isna().sum() for name in self._columns}
    (materialised,) = dask.compute(lazy_counts)
    null_counts = {name: int(total) for name, total in materialised.items()}

    self._set_cached(key, null_counts)
    return null_counts
673
+
674
def count_distinct(self, column: str) -> int:
    """Return the number of distinct values in ``column``, with caching.

    Args:
        column: Name of the column to inspect.

    Returns:
        Result of Dask's ``nunique`` for the column, as an ``int``.
    """
    key = self._cache_key("count_distinct", column)
    hit = self._get_cached(key)
    if hit is None:
        distinct = self._ddf[column].nunique()
        hit = int(distinct.compute())
        self._set_cached(key, hit)
    return hit
684
+
685
def get_stats(self, column: str) -> dict[str, Any]:
    """Return summary statistics for ``column``, evaluated in one Dask call.

    Args:
        column: Name of the column to summarize.

    Returns:
        Dict with ``count``, ``null_count``, ``mean``, ``std``, ``min`` and
        ``max``. ``count`` and ``null_count`` are ints; ``mean`` and ``std``
        are floats unless the computed value is ``None``; ``min`` and
        ``max`` are returned as computed.
    """
    key = self._cache_key("get_stats", column)
    hit = self._get_cached(key)
    if hit is not None:
        return hit

    import dask

    series = self._ddf[column]
    # All six reductions go through a single dask.compute call.
    (raw,) = dask.compute(
        {
            "count": series.count(),
            "null_count": series.isna().sum(),
            "mean": series.mean(),
            "std": series.std(),
            "min": series.min(),
            "max": series.max(),
        }
    )

    def _float_or_none(value: Any) -> Any:
        return None if value is None else float(value)

    summary = {
        "count": int(raw["count"]),
        "null_count": int(raw["null_count"]),
        "mean": _float_or_none(raw["mean"]),
        "std": _float_or_none(raw["std"]),
        "min": raw["min"],
        "max": raw["max"],
    }

    self._set_cached(key, summary)
    return summary
718
+
719
def get_quantiles(
    self,
    column: str,
    quantiles: list[float],
) -> list[float]:
    """Return the requested quantiles of ``column``, with caching.

    Args:
        column: Name of the column to inspect.
        quantiles: Quantile levels passed to Dask's ``quantile``.

    Returns:
        The computed quantile values as a list.
    """
    # The list is turned into a tuple so it can form part of the cache key.
    key = self._cache_key("get_quantiles", column, tuple(quantiles))
    hit = self._get_cached(key)
    if hit is not None:
        return hit

    computed = self._ddf[column].quantile(quantiles).compute()
    values = list(computed)

    self._set_cached(key, values)
    return values
735
+
736
def get_value_counts(
    self,
    column: str,
    limit: int | None = None,
) -> dict[Any, int]:
    """Return value frequencies for ``column``, with caching.

    Args:
        column: Name of the column to inspect.
        limit: Maximum number of entries to keep. A falsy value (``None``
            or ``0``) applies no limit.

    Returns:
        Mapping of value to its count, built from the computed
        ``value_counts`` series.
    """
    key = self._cache_key("get_value_counts", column, limit)
    hit = self._get_cached(key)
    if hit is not None:
        return hit

    frequencies = self._ddf[column].value_counts()
    if limit:
        # Stay lazy while trimming; every partition is considered.
        frequencies = frequencies.head(limit, npartitions=-1, compute=False)

    tally = dict(frequencies.compute())

    self._set_cached(key, tally)
    return tally
757
+
758
def count_duplicates(self, columns: list[str]) -> int:
    """Return how many rows are duplicates with respect to ``columns``.

    Args:
        columns: Columns that together define a row's identity.

    Returns:
        Total row count minus the number of rows left after
        ``drop_duplicates`` on the selected columns.
    """
    key = self._cache_key("count_duplicates", tuple(columns))
    hit = self._get_cached(key)
    if hit is not None:
        return hit

    all_rows = self.count_rows()
    distinct_rows = len(self._ddf[columns].drop_duplicates())
    surplus = all_rows - distinct_rows

    self._set_cached(key, surplus)
    return surplus
771
+
772
def count_matching_regex(self, column: str, pattern: str) -> int:
    """Return how many values in ``column`` match ``pattern``, with caching.

    Args:
        column: Name of the column to inspect.
        pattern: Regular expression passed to ``str.match``.

    Returns:
        Number of matching values; missing values count as non-matching
        (``na=False``).
    """
    key = self._cache_key("count_matching_regex", column, pattern)
    hit = self._get_cached(key)
    if hit is not None:
        return hit

    is_match = self._ddf[column].str.match(pattern, na=False)
    matched = int(is_match.sum().compute())

    self._set_cached(key, matched)
    return matched
788
+
789
def count_in_range(
    self,
    column: str,
    min_value: Any | None = None,
    max_value: Any | None = None,
    inclusive: bool = True,
) -> int:
    """Return how many values in ``column`` fall within the given bounds.

    Args:
        column: Name of the column to inspect.
        min_value: Lower bound, or ``None`` for no lower bound.
        max_value: Upper bound, or ``None`` for no upper bound.
        inclusive: Whether the bounds themselves count as in range.

    Returns:
        Number of values satisfying every supplied bound. With no bounds
        at all, the total row count is returned.
    """
    key = self._cache_key(
        "count_in_range", column, min_value, max_value, inclusive
    )
    hit = self._get_cached(key)
    if hit is not None:
        return hit

    values = self._ddf[column]

    # Collect one boolean mask per supplied bound.
    bound_masks = []
    if min_value is not None:
        bound_masks.append(values >= min_value if inclusive else values > min_value)
    if max_value is not None:
        bound_masks.append(values <= max_value if inclusive else values < max_value)

    if not bound_masks:
        matched = self.count_rows()
    else:
        combined = bound_masks[0]
        for extra in bound_masks[1:]:
            combined = combined & extra
        matched = int(combined.sum().compute())

    self._set_cached(key, matched)
    return matched
824
+
825
def count_in_set(self, column: str, values: set[Any]) -> int:
    """Return how many entries of ``column`` are members of ``values``.

    Args:
        column: Name of the column to inspect.
        values: Allowed values.

    Returns:
        Number of entries for which ``isin`` is true.
    """
    # A frozenset is used so the value set can form part of the cache key.
    key = self._cache_key("count_in_set", column, frozenset(values))
    hit = self._get_cached(key)
    if hit is None:
        membership = self._ddf[column].isin(list(values))
        hit = int(membership.sum().compute())
        self._set_cached(key, hit)
    return hit
835
+
836
+ # -------------------------------------------------------------------------
837
+ # Sampling
838
+ # -------------------------------------------------------------------------
839
+
840
def sample(
    self,
    n: int = 1000,
    seed: int | None = None,
) -> "DaskExecutionEngine":
    """Return an engine over a random sample of at most ``n`` rows.

    Args:
        n: Target number of rows.
        seed: Random state passed to Dask's ``sample``.

    Returns:
        This same engine when it already holds ``n`` rows or fewer;
        otherwise a new engine over the sampled data.
    """
    total = self.count_rows()
    if total <= n:
        return self

    # Ask for 10% more than n (capped at the full data set); presumably so
    # the fractional sample rarely comes up short before trimming.
    oversampled_frac = min((n * 1.1) / total, 1.0)
    drawn = self._ddf.sample(frac=oversampled_frac, random_state=seed)

    # Trim to at most n rows, staying lazy.
    trimmed = drawn.head(n, npartitions=-1, compute=False)

    return DaskExecutionEngine(trimmed, self._config, self._client)
870
+
871
+ # -------------------------------------------------------------------------
872
+ # Dask-Specific Methods
873
+ # -------------------------------------------------------------------------
874
+
875
def persist(self) -> "DaskExecutionEngine":
    """Persist the underlying DataFrame and keep the persisted handle.

    Returns:
        This engine, whose ``_ddf`` now refers to the persisted DataFrame.
    """
    persisted = self._ddf.persist()
    self._ddf = persisted
    return self
883
+
884
def compute(self) -> "pd.DataFrame":
    """Materialize the Dask DataFrame.

    Returns:
        The result of calling ``compute()`` on the underlying DataFrame.
    """
    materialised = self._ddf.compute()
    return materialised
891
+
892
def visualize(
    self,
    filename: str = "dask_graph",
    format: str = "png",
) -> str:
    """Render the DataFrame's task graph via Dask's ``visualize``.

    Args:
        filename: Output filename (without extension).
        format: Output format (png, svg, pdf).

    Returns:
        Whatever ``self._ddf.visualize`` returns.
        NOTE(review): annotated as a path string, but this may be a
        display object or ``None`` depending on Dask; confirm before
        relying on it.
    """
    rendered = self._ddf.visualize(filename=filename, format=format)
    return rendered
907
+
908
def filter(self, condition: str) -> "DaskExecutionEngine":
    """Return a new engine over the rows selected by a query string.

    Args:
        condition: Expression passed unchanged to the DataFrame's ``query``.

    Returns:
        A new ``DaskExecutionEngine`` sharing this engine's config and client.
    """
    return DaskExecutionEngine(
        self._ddf.query(condition),
        self._config,
        self._client,
    )
919
+
920
def select(self, columns: list[str]) -> "DaskExecutionEngine":
    """Return a new engine restricted to the given columns.

    Args:
        columns: Names of the columns to keep.

    Returns:
        A new ``DaskExecutionEngine`` sharing this engine's config and client.
    """
    return DaskExecutionEngine(
        self._ddf[columns],
        self._config,
        self._client,
    )
931
+
932
def head(self, n: int = 5) -> "pd.DataFrame":
    """Return the leading rows of the data.

    Args:
        n: Number of rows requested.

    Returns:
        Result of the underlying DataFrame's ``head(n)``.
    """
    leading = self._ddf.head(n)
    return leading
942
+
943
def tail(self, n: int = 5) -> "pd.DataFrame":
    """Return the trailing rows of the data.

    Args:
        n: Number of rows requested.

    Returns:
        Result of the underlying DataFrame's ``tail(n)``.
    """
    trailing = self._ddf.tail(n)
    return trailing
953
+
954
def describe(self) -> "pd.DataFrame":
    """Compute descriptive statistics for the data.

    Returns:
        The computed result of the underlying DataFrame's ``describe()``.
    """
    summary = self._ddf.describe()
    return summary.compute()
961
+
962
+ # -------------------------------------------------------------------------
963
+ # Context Manager
964
+ # -------------------------------------------------------------------------
965
+
966
def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
    """Leave the context by delegating to the parent class's ``__exit__``.

    The Dask client is deliberately left open here.
    """
    # The client may be shared with other engines, so it is not closed on
    # exit; callers manage its lifecycle themselves (see ``close``).
    super().__exit__(exc_type, exc_val, exc_tb)
971
+
972
def close(self) -> None:
    """Close the distributed client, if one is attached, and forget it."""
    client = self._client
    if client is None:
        return
    client.close()
    self._client = None