truthound 1.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (877) hide show
  1. truthound/__init__.py +162 -0
  2. truthound/adapters.py +100 -0
  3. truthound/api.py +365 -0
  4. truthound/audit/__init__.py +248 -0
  5. truthound/audit/core.py +967 -0
  6. truthound/audit/filters.py +620 -0
  7. truthound/audit/formatters.py +707 -0
  8. truthound/audit/logger.py +902 -0
  9. truthound/audit/middleware.py +571 -0
  10. truthound/audit/storage.py +1083 -0
  11. truthound/benchmark/__init__.py +123 -0
  12. truthound/benchmark/base.py +757 -0
  13. truthound/benchmark/comparison.py +635 -0
  14. truthound/benchmark/generators.py +706 -0
  15. truthound/benchmark/reporters.py +718 -0
  16. truthound/benchmark/runner.py +635 -0
  17. truthound/benchmark/scenarios.py +712 -0
  18. truthound/cache.py +252 -0
  19. truthound/checkpoint/__init__.py +136 -0
  20. truthound/checkpoint/actions/__init__.py +164 -0
  21. truthound/checkpoint/actions/base.py +324 -0
  22. truthound/checkpoint/actions/custom.py +234 -0
  23. truthound/checkpoint/actions/discord_notify.py +290 -0
  24. truthound/checkpoint/actions/email_notify.py +405 -0
  25. truthound/checkpoint/actions/github_action.py +406 -0
  26. truthound/checkpoint/actions/opsgenie.py +1499 -0
  27. truthound/checkpoint/actions/pagerduty.py +226 -0
  28. truthound/checkpoint/actions/slack_notify.py +233 -0
  29. truthound/checkpoint/actions/store_result.py +249 -0
  30. truthound/checkpoint/actions/teams_notify.py +1570 -0
  31. truthound/checkpoint/actions/telegram_notify.py +419 -0
  32. truthound/checkpoint/actions/update_docs.py +552 -0
  33. truthound/checkpoint/actions/webhook.py +293 -0
  34. truthound/checkpoint/analytics/__init__.py +147 -0
  35. truthound/checkpoint/analytics/aggregations/__init__.py +23 -0
  36. truthound/checkpoint/analytics/aggregations/rollup.py +481 -0
  37. truthound/checkpoint/analytics/aggregations/time_bucket.py +306 -0
  38. truthound/checkpoint/analytics/analyzers/__init__.py +17 -0
  39. truthound/checkpoint/analytics/analyzers/anomaly.py +386 -0
  40. truthound/checkpoint/analytics/analyzers/base.py +270 -0
  41. truthound/checkpoint/analytics/analyzers/forecast.py +421 -0
  42. truthound/checkpoint/analytics/analyzers/trend.py +314 -0
  43. truthound/checkpoint/analytics/models.py +292 -0
  44. truthound/checkpoint/analytics/protocols.py +549 -0
  45. truthound/checkpoint/analytics/service.py +718 -0
  46. truthound/checkpoint/analytics/stores/__init__.py +16 -0
  47. truthound/checkpoint/analytics/stores/base.py +306 -0
  48. truthound/checkpoint/analytics/stores/memory_store.py +353 -0
  49. truthound/checkpoint/analytics/stores/sqlite_store.py +557 -0
  50. truthound/checkpoint/analytics/stores/timescale_store.py +501 -0
  51. truthound/checkpoint/async_actions.py +794 -0
  52. truthound/checkpoint/async_base.py +708 -0
  53. truthound/checkpoint/async_checkpoint.py +617 -0
  54. truthound/checkpoint/async_runner.py +639 -0
  55. truthound/checkpoint/checkpoint.py +527 -0
  56. truthound/checkpoint/ci/__init__.py +61 -0
  57. truthound/checkpoint/ci/detector.py +355 -0
  58. truthound/checkpoint/ci/reporter.py +436 -0
  59. truthound/checkpoint/ci/templates.py +454 -0
  60. truthound/checkpoint/circuitbreaker/__init__.py +133 -0
  61. truthound/checkpoint/circuitbreaker/breaker.py +542 -0
  62. truthound/checkpoint/circuitbreaker/core.py +252 -0
  63. truthound/checkpoint/circuitbreaker/detection.py +459 -0
  64. truthound/checkpoint/circuitbreaker/middleware.py +389 -0
  65. truthound/checkpoint/circuitbreaker/registry.py +357 -0
  66. truthound/checkpoint/distributed/__init__.py +139 -0
  67. truthound/checkpoint/distributed/backends/__init__.py +35 -0
  68. truthound/checkpoint/distributed/backends/celery_backend.py +503 -0
  69. truthound/checkpoint/distributed/backends/kubernetes_backend.py +696 -0
  70. truthound/checkpoint/distributed/backends/local_backend.py +397 -0
  71. truthound/checkpoint/distributed/backends/ray_backend.py +625 -0
  72. truthound/checkpoint/distributed/base.py +774 -0
  73. truthound/checkpoint/distributed/orchestrator.py +765 -0
  74. truthound/checkpoint/distributed/protocols.py +842 -0
  75. truthound/checkpoint/distributed/registry.py +449 -0
  76. truthound/checkpoint/idempotency/__init__.py +120 -0
  77. truthound/checkpoint/idempotency/core.py +295 -0
  78. truthound/checkpoint/idempotency/fingerprint.py +454 -0
  79. truthound/checkpoint/idempotency/locking.py +604 -0
  80. truthound/checkpoint/idempotency/service.py +592 -0
  81. truthound/checkpoint/idempotency/stores.py +653 -0
  82. truthound/checkpoint/monitoring/__init__.py +134 -0
  83. truthound/checkpoint/monitoring/aggregators/__init__.py +15 -0
  84. truthound/checkpoint/monitoring/aggregators/base.py +372 -0
  85. truthound/checkpoint/monitoring/aggregators/realtime.py +300 -0
  86. truthound/checkpoint/monitoring/aggregators/window.py +493 -0
  87. truthound/checkpoint/monitoring/collectors/__init__.py +17 -0
  88. truthound/checkpoint/monitoring/collectors/base.py +257 -0
  89. truthound/checkpoint/monitoring/collectors/memory_collector.py +617 -0
  90. truthound/checkpoint/monitoring/collectors/prometheus_collector.py +451 -0
  91. truthound/checkpoint/monitoring/collectors/redis_collector.py +518 -0
  92. truthound/checkpoint/monitoring/events.py +410 -0
  93. truthound/checkpoint/monitoring/protocols.py +636 -0
  94. truthound/checkpoint/monitoring/service.py +578 -0
  95. truthound/checkpoint/monitoring/views/__init__.py +17 -0
  96. truthound/checkpoint/monitoring/views/base.py +172 -0
  97. truthound/checkpoint/monitoring/views/queue_view.py +220 -0
  98. truthound/checkpoint/monitoring/views/task_view.py +240 -0
  99. truthound/checkpoint/monitoring/views/worker_view.py +263 -0
  100. truthound/checkpoint/registry.py +337 -0
  101. truthound/checkpoint/runner.py +356 -0
  102. truthound/checkpoint/transaction/__init__.py +133 -0
  103. truthound/checkpoint/transaction/base.py +389 -0
  104. truthound/checkpoint/transaction/compensatable.py +537 -0
  105. truthound/checkpoint/transaction/coordinator.py +576 -0
  106. truthound/checkpoint/transaction/executor.py +622 -0
  107. truthound/checkpoint/transaction/idempotency.py +534 -0
  108. truthound/checkpoint/transaction/saga/__init__.py +143 -0
  109. truthound/checkpoint/transaction/saga/builder.py +584 -0
  110. truthound/checkpoint/transaction/saga/definition.py +515 -0
  111. truthound/checkpoint/transaction/saga/event_store.py +542 -0
  112. truthound/checkpoint/transaction/saga/patterns.py +833 -0
  113. truthound/checkpoint/transaction/saga/runner.py +718 -0
  114. truthound/checkpoint/transaction/saga/state_machine.py +793 -0
  115. truthound/checkpoint/transaction/saga/strategies.py +780 -0
  116. truthound/checkpoint/transaction/saga/testing.py +886 -0
  117. truthound/checkpoint/triggers/__init__.py +58 -0
  118. truthound/checkpoint/triggers/base.py +237 -0
  119. truthound/checkpoint/triggers/event.py +385 -0
  120. truthound/checkpoint/triggers/schedule.py +355 -0
  121. truthound/cli.py +2358 -0
  122. truthound/cli_modules/__init__.py +124 -0
  123. truthound/cli_modules/advanced/__init__.py +45 -0
  124. truthound/cli_modules/advanced/benchmark.py +343 -0
  125. truthound/cli_modules/advanced/docs.py +225 -0
  126. truthound/cli_modules/advanced/lineage.py +209 -0
  127. truthound/cli_modules/advanced/ml.py +320 -0
  128. truthound/cli_modules/advanced/realtime.py +196 -0
  129. truthound/cli_modules/checkpoint/__init__.py +46 -0
  130. truthound/cli_modules/checkpoint/init.py +114 -0
  131. truthound/cli_modules/checkpoint/list.py +71 -0
  132. truthound/cli_modules/checkpoint/run.py +159 -0
  133. truthound/cli_modules/checkpoint/validate.py +67 -0
  134. truthound/cli_modules/common/__init__.py +71 -0
  135. truthound/cli_modules/common/errors.py +414 -0
  136. truthound/cli_modules/common/options.py +419 -0
  137. truthound/cli_modules/common/output.py +507 -0
  138. truthound/cli_modules/common/protocol.py +552 -0
  139. truthound/cli_modules/core/__init__.py +48 -0
  140. truthound/cli_modules/core/check.py +123 -0
  141. truthound/cli_modules/core/compare.py +104 -0
  142. truthound/cli_modules/core/learn.py +57 -0
  143. truthound/cli_modules/core/mask.py +77 -0
  144. truthound/cli_modules/core/profile.py +65 -0
  145. truthound/cli_modules/core/scan.py +61 -0
  146. truthound/cli_modules/profiler/__init__.py +51 -0
  147. truthound/cli_modules/profiler/auto_profile.py +175 -0
  148. truthound/cli_modules/profiler/metadata.py +107 -0
  149. truthound/cli_modules/profiler/suite.py +283 -0
  150. truthound/cli_modules/registry.py +431 -0
  151. truthound/cli_modules/scaffolding/__init__.py +89 -0
  152. truthound/cli_modules/scaffolding/base.py +631 -0
  153. truthound/cli_modules/scaffolding/commands.py +545 -0
  154. truthound/cli_modules/scaffolding/plugins.py +1072 -0
  155. truthound/cli_modules/scaffolding/reporters.py +594 -0
  156. truthound/cli_modules/scaffolding/validators.py +1127 -0
  157. truthound/common/__init__.py +18 -0
  158. truthound/common/resilience/__init__.py +130 -0
  159. truthound/common/resilience/bulkhead.py +266 -0
  160. truthound/common/resilience/circuit_breaker.py +516 -0
  161. truthound/common/resilience/composite.py +332 -0
  162. truthound/common/resilience/config.py +292 -0
  163. truthound/common/resilience/protocols.py +217 -0
  164. truthound/common/resilience/rate_limiter.py +404 -0
  165. truthound/common/resilience/retry.py +341 -0
  166. truthound/datadocs/__init__.py +260 -0
  167. truthound/datadocs/base.py +571 -0
  168. truthound/datadocs/builder.py +761 -0
  169. truthound/datadocs/charts.py +764 -0
  170. truthound/datadocs/dashboard/__init__.py +63 -0
  171. truthound/datadocs/dashboard/app.py +576 -0
  172. truthound/datadocs/dashboard/components.py +584 -0
  173. truthound/datadocs/dashboard/state.py +240 -0
  174. truthound/datadocs/engine/__init__.py +46 -0
  175. truthound/datadocs/engine/context.py +376 -0
  176. truthound/datadocs/engine/pipeline.py +618 -0
  177. truthound/datadocs/engine/registry.py +469 -0
  178. truthound/datadocs/exporters/__init__.py +49 -0
  179. truthound/datadocs/exporters/base.py +198 -0
  180. truthound/datadocs/exporters/html.py +178 -0
  181. truthound/datadocs/exporters/json_exporter.py +253 -0
  182. truthound/datadocs/exporters/markdown.py +284 -0
  183. truthound/datadocs/exporters/pdf.py +392 -0
  184. truthound/datadocs/i18n/__init__.py +86 -0
  185. truthound/datadocs/i18n/catalog.py +960 -0
  186. truthound/datadocs/i18n/formatting.py +505 -0
  187. truthound/datadocs/i18n/loader.py +256 -0
  188. truthound/datadocs/i18n/plurals.py +378 -0
  189. truthound/datadocs/renderers/__init__.py +42 -0
  190. truthound/datadocs/renderers/base.py +401 -0
  191. truthound/datadocs/renderers/custom.py +342 -0
  192. truthound/datadocs/renderers/jinja.py +697 -0
  193. truthound/datadocs/sections.py +736 -0
  194. truthound/datadocs/styles.py +931 -0
  195. truthound/datadocs/themes/__init__.py +101 -0
  196. truthound/datadocs/themes/base.py +336 -0
  197. truthound/datadocs/themes/default.py +417 -0
  198. truthound/datadocs/themes/enterprise.py +419 -0
  199. truthound/datadocs/themes/loader.py +336 -0
  200. truthound/datadocs/themes.py +301 -0
  201. truthound/datadocs/transformers/__init__.py +57 -0
  202. truthound/datadocs/transformers/base.py +268 -0
  203. truthound/datadocs/transformers/enrichers.py +544 -0
  204. truthound/datadocs/transformers/filters.py +447 -0
  205. truthound/datadocs/transformers/i18n.py +468 -0
  206. truthound/datadocs/versioning/__init__.py +62 -0
  207. truthound/datadocs/versioning/diff.py +639 -0
  208. truthound/datadocs/versioning/storage.py +497 -0
  209. truthound/datadocs/versioning/version.py +358 -0
  210. truthound/datasources/__init__.py +223 -0
  211. truthound/datasources/_async_protocols.py +222 -0
  212. truthound/datasources/_protocols.py +159 -0
  213. truthound/datasources/adapters.py +428 -0
  214. truthound/datasources/async_base.py +599 -0
  215. truthound/datasources/async_factory.py +511 -0
  216. truthound/datasources/base.py +516 -0
  217. truthound/datasources/factory.py +433 -0
  218. truthound/datasources/nosql/__init__.py +47 -0
  219. truthound/datasources/nosql/base.py +487 -0
  220. truthound/datasources/nosql/elasticsearch.py +801 -0
  221. truthound/datasources/nosql/mongodb.py +636 -0
  222. truthound/datasources/pandas_optimized.py +582 -0
  223. truthound/datasources/pandas_source.py +216 -0
  224. truthound/datasources/polars_source.py +395 -0
  225. truthound/datasources/spark_source.py +479 -0
  226. truthound/datasources/sql/__init__.py +154 -0
  227. truthound/datasources/sql/base.py +710 -0
  228. truthound/datasources/sql/bigquery.py +410 -0
  229. truthound/datasources/sql/cloud_base.py +199 -0
  230. truthound/datasources/sql/databricks.py +471 -0
  231. truthound/datasources/sql/mysql.py +316 -0
  232. truthound/datasources/sql/oracle.py +427 -0
  233. truthound/datasources/sql/postgresql.py +321 -0
  234. truthound/datasources/sql/redshift.py +479 -0
  235. truthound/datasources/sql/snowflake.py +439 -0
  236. truthound/datasources/sql/sqlite.py +286 -0
  237. truthound/datasources/sql/sqlserver.py +437 -0
  238. truthound/datasources/streaming/__init__.py +47 -0
  239. truthound/datasources/streaming/base.py +350 -0
  240. truthound/datasources/streaming/kafka.py +670 -0
  241. truthound/decorators.py +98 -0
  242. truthound/docs/__init__.py +69 -0
  243. truthound/docs/extractor.py +971 -0
  244. truthound/docs/generator.py +601 -0
  245. truthound/docs/parser.py +1037 -0
  246. truthound/docs/renderer.py +999 -0
  247. truthound/drift/__init__.py +22 -0
  248. truthound/drift/compare.py +189 -0
  249. truthound/drift/detectors.py +464 -0
  250. truthound/drift/report.py +160 -0
  251. truthound/execution/__init__.py +65 -0
  252. truthound/execution/_protocols.py +324 -0
  253. truthound/execution/base.py +576 -0
  254. truthound/execution/distributed/__init__.py +179 -0
  255. truthound/execution/distributed/aggregations.py +731 -0
  256. truthound/execution/distributed/arrow_bridge.py +817 -0
  257. truthound/execution/distributed/base.py +550 -0
  258. truthound/execution/distributed/dask_engine.py +976 -0
  259. truthound/execution/distributed/mixins.py +766 -0
  260. truthound/execution/distributed/protocols.py +756 -0
  261. truthound/execution/distributed/ray_engine.py +1127 -0
  262. truthound/execution/distributed/registry.py +446 -0
  263. truthound/execution/distributed/spark_engine.py +1011 -0
  264. truthound/execution/distributed/validator_adapter.py +682 -0
  265. truthound/execution/pandas_engine.py +401 -0
  266. truthound/execution/polars_engine.py +497 -0
  267. truthound/execution/pushdown/__init__.py +230 -0
  268. truthound/execution/pushdown/ast.py +1550 -0
  269. truthound/execution/pushdown/builder.py +1550 -0
  270. truthound/execution/pushdown/dialects.py +1072 -0
  271. truthound/execution/pushdown/executor.py +829 -0
  272. truthound/execution/pushdown/optimizer.py +1041 -0
  273. truthound/execution/sql_engine.py +518 -0
  274. truthound/infrastructure/__init__.py +189 -0
  275. truthound/infrastructure/audit.py +1515 -0
  276. truthound/infrastructure/config.py +1133 -0
  277. truthound/infrastructure/encryption.py +1132 -0
  278. truthound/infrastructure/logging.py +1503 -0
  279. truthound/infrastructure/metrics.py +1220 -0
  280. truthound/lineage/__init__.py +89 -0
  281. truthound/lineage/base.py +746 -0
  282. truthound/lineage/impact_analysis.py +474 -0
  283. truthound/lineage/integrations/__init__.py +22 -0
  284. truthound/lineage/integrations/openlineage.py +548 -0
  285. truthound/lineage/tracker.py +512 -0
  286. truthound/lineage/visualization/__init__.py +33 -0
  287. truthound/lineage/visualization/protocols.py +145 -0
  288. truthound/lineage/visualization/renderers/__init__.py +20 -0
  289. truthound/lineage/visualization/renderers/cytoscape.py +329 -0
  290. truthound/lineage/visualization/renderers/d3.py +331 -0
  291. truthound/lineage/visualization/renderers/graphviz.py +276 -0
  292. truthound/lineage/visualization/renderers/mermaid.py +308 -0
  293. truthound/maskers.py +113 -0
  294. truthound/ml/__init__.py +124 -0
  295. truthound/ml/anomaly_models/__init__.py +31 -0
  296. truthound/ml/anomaly_models/ensemble.py +362 -0
  297. truthound/ml/anomaly_models/isolation_forest.py +444 -0
  298. truthound/ml/anomaly_models/statistical.py +392 -0
  299. truthound/ml/base.py +1178 -0
  300. truthound/ml/drift_detection/__init__.py +26 -0
  301. truthound/ml/drift_detection/concept.py +381 -0
  302. truthound/ml/drift_detection/distribution.py +361 -0
  303. truthound/ml/drift_detection/feature.py +442 -0
  304. truthound/ml/drift_detection/multivariate.py +495 -0
  305. truthound/ml/monitoring/__init__.py +88 -0
  306. truthound/ml/monitoring/alerting/__init__.py +33 -0
  307. truthound/ml/monitoring/alerting/handlers.py +427 -0
  308. truthound/ml/monitoring/alerting/rules.py +508 -0
  309. truthound/ml/monitoring/collectors/__init__.py +19 -0
  310. truthound/ml/monitoring/collectors/composite.py +105 -0
  311. truthound/ml/monitoring/collectors/drift.py +324 -0
  312. truthound/ml/monitoring/collectors/performance.py +179 -0
  313. truthound/ml/monitoring/collectors/quality.py +369 -0
  314. truthound/ml/monitoring/monitor.py +536 -0
  315. truthound/ml/monitoring/protocols.py +451 -0
  316. truthound/ml/monitoring/stores/__init__.py +15 -0
  317. truthound/ml/monitoring/stores/memory.py +201 -0
  318. truthound/ml/monitoring/stores/prometheus.py +296 -0
  319. truthound/ml/rule_learning/__init__.py +25 -0
  320. truthound/ml/rule_learning/constraint_miner.py +443 -0
  321. truthound/ml/rule_learning/pattern_learner.py +499 -0
  322. truthound/ml/rule_learning/profile_learner.py +462 -0
  323. truthound/multitenancy/__init__.py +326 -0
  324. truthound/multitenancy/core.py +852 -0
  325. truthound/multitenancy/integration.py +597 -0
  326. truthound/multitenancy/isolation.py +630 -0
  327. truthound/multitenancy/manager.py +770 -0
  328. truthound/multitenancy/middleware.py +765 -0
  329. truthound/multitenancy/quota.py +537 -0
  330. truthound/multitenancy/resolvers.py +603 -0
  331. truthound/multitenancy/storage.py +703 -0
  332. truthound/observability/__init__.py +307 -0
  333. truthound/observability/context.py +531 -0
  334. truthound/observability/instrumentation.py +611 -0
  335. truthound/observability/logging.py +887 -0
  336. truthound/observability/metrics.py +1157 -0
  337. truthound/observability/tracing/__init__.py +178 -0
  338. truthound/observability/tracing/baggage.py +310 -0
  339. truthound/observability/tracing/config.py +426 -0
  340. truthound/observability/tracing/exporter.py +787 -0
  341. truthound/observability/tracing/integration.py +1018 -0
  342. truthound/observability/tracing/otel/__init__.py +146 -0
  343. truthound/observability/tracing/otel/adapter.py +982 -0
  344. truthound/observability/tracing/otel/bridge.py +1177 -0
  345. truthound/observability/tracing/otel/compat.py +681 -0
  346. truthound/observability/tracing/otel/config.py +691 -0
  347. truthound/observability/tracing/otel/detection.py +327 -0
  348. truthound/observability/tracing/otel/protocols.py +426 -0
  349. truthound/observability/tracing/processor.py +561 -0
  350. truthound/observability/tracing/propagator.py +757 -0
  351. truthound/observability/tracing/provider.py +569 -0
  352. truthound/observability/tracing/resource.py +515 -0
  353. truthound/observability/tracing/sampler.py +487 -0
  354. truthound/observability/tracing/span.py +676 -0
  355. truthound/plugins/__init__.py +198 -0
  356. truthound/plugins/base.py +599 -0
  357. truthound/plugins/cli.py +680 -0
  358. truthound/plugins/dependencies/__init__.py +42 -0
  359. truthound/plugins/dependencies/graph.py +422 -0
  360. truthound/plugins/dependencies/resolver.py +417 -0
  361. truthound/plugins/discovery.py +379 -0
  362. truthound/plugins/docs/__init__.py +46 -0
  363. truthound/plugins/docs/extractor.py +444 -0
  364. truthound/plugins/docs/renderer.py +499 -0
  365. truthound/plugins/enterprise_manager.py +877 -0
  366. truthound/plugins/examples/__init__.py +19 -0
  367. truthound/plugins/examples/custom_validators.py +317 -0
  368. truthound/plugins/examples/slack_notifier.py +312 -0
  369. truthound/plugins/examples/xml_reporter.py +254 -0
  370. truthound/plugins/hooks.py +558 -0
  371. truthound/plugins/lifecycle/__init__.py +43 -0
  372. truthound/plugins/lifecycle/hot_reload.py +402 -0
  373. truthound/plugins/lifecycle/manager.py +371 -0
  374. truthound/plugins/manager.py +736 -0
  375. truthound/plugins/registry.py +338 -0
  376. truthound/plugins/security/__init__.py +93 -0
  377. truthound/plugins/security/exceptions.py +332 -0
  378. truthound/plugins/security/policies.py +348 -0
  379. truthound/plugins/security/protocols.py +643 -0
  380. truthound/plugins/security/sandbox/__init__.py +45 -0
  381. truthound/plugins/security/sandbox/context.py +158 -0
  382. truthound/plugins/security/sandbox/engines/__init__.py +19 -0
  383. truthound/plugins/security/sandbox/engines/container.py +379 -0
  384. truthound/plugins/security/sandbox/engines/noop.py +144 -0
  385. truthound/plugins/security/sandbox/engines/process.py +336 -0
  386. truthound/plugins/security/sandbox/factory.py +211 -0
  387. truthound/plugins/security/signing/__init__.py +57 -0
  388. truthound/plugins/security/signing/service.py +330 -0
  389. truthound/plugins/security/signing/trust_store.py +368 -0
  390. truthound/plugins/security/signing/verifier.py +459 -0
  391. truthound/plugins/versioning/__init__.py +41 -0
  392. truthound/plugins/versioning/constraints.py +297 -0
  393. truthound/plugins/versioning/resolver.py +329 -0
  394. truthound/profiler/__init__.py +1729 -0
  395. truthound/profiler/_lazy.py +452 -0
  396. truthound/profiler/ab_testing/__init__.py +80 -0
  397. truthound/profiler/ab_testing/analysis.py +449 -0
  398. truthound/profiler/ab_testing/base.py +257 -0
  399. truthound/profiler/ab_testing/experiment.py +395 -0
  400. truthound/profiler/ab_testing/tracking.py +368 -0
  401. truthound/profiler/auto_threshold.py +1170 -0
  402. truthound/profiler/base.py +579 -0
  403. truthound/profiler/cache_patterns.py +911 -0
  404. truthound/profiler/caching.py +1303 -0
  405. truthound/profiler/column_profiler.py +712 -0
  406. truthound/profiler/comparison.py +1007 -0
  407. truthound/profiler/custom_patterns.py +1170 -0
  408. truthound/profiler/dashboard/__init__.py +50 -0
  409. truthound/profiler/dashboard/app.py +476 -0
  410. truthound/profiler/dashboard/components.py +457 -0
  411. truthound/profiler/dashboard/config.py +72 -0
  412. truthound/profiler/distributed/__init__.py +83 -0
  413. truthound/profiler/distributed/base.py +281 -0
  414. truthound/profiler/distributed/dask_backend.py +498 -0
  415. truthound/profiler/distributed/local_backend.py +293 -0
  416. truthound/profiler/distributed/profiler.py +304 -0
  417. truthound/profiler/distributed/ray_backend.py +374 -0
  418. truthound/profiler/distributed/spark_backend.py +375 -0
  419. truthound/profiler/distributed.py +1366 -0
  420. truthound/profiler/enterprise_sampling.py +1065 -0
  421. truthound/profiler/errors.py +488 -0
  422. truthound/profiler/evolution/__init__.py +91 -0
  423. truthound/profiler/evolution/alerts.py +426 -0
  424. truthound/profiler/evolution/changes.py +206 -0
  425. truthound/profiler/evolution/compatibility.py +365 -0
  426. truthound/profiler/evolution/detector.py +372 -0
  427. truthound/profiler/evolution/protocols.py +121 -0
  428. truthound/profiler/generators/__init__.py +48 -0
  429. truthound/profiler/generators/base.py +384 -0
  430. truthound/profiler/generators/ml_rules.py +375 -0
  431. truthound/profiler/generators/pattern_rules.py +384 -0
  432. truthound/profiler/generators/schema_rules.py +267 -0
  433. truthound/profiler/generators/stats_rules.py +324 -0
  434. truthound/profiler/generators/suite_generator.py +857 -0
  435. truthound/profiler/i18n.py +1542 -0
  436. truthound/profiler/incremental.py +554 -0
  437. truthound/profiler/incremental_validation.py +1710 -0
  438. truthound/profiler/integration/__init__.py +73 -0
  439. truthound/profiler/integration/adapters.py +345 -0
  440. truthound/profiler/integration/context.py +371 -0
  441. truthound/profiler/integration/executor.py +527 -0
  442. truthound/profiler/integration/naming.py +75 -0
  443. truthound/profiler/integration/protocols.py +243 -0
  444. truthound/profiler/memory.py +1185 -0
  445. truthound/profiler/migration/__init__.py +60 -0
  446. truthound/profiler/migration/base.py +345 -0
  447. truthound/profiler/migration/manager.py +444 -0
  448. truthound/profiler/migration/v1_0_to_v1_1.py +484 -0
  449. truthound/profiler/ml/__init__.py +73 -0
  450. truthound/profiler/ml/base.py +244 -0
  451. truthound/profiler/ml/classifier.py +507 -0
  452. truthound/profiler/ml/feature_extraction.py +604 -0
  453. truthound/profiler/ml/pretrained.py +448 -0
  454. truthound/profiler/ml_inference.py +1276 -0
  455. truthound/profiler/native_patterns.py +815 -0
  456. truthound/profiler/observability.py +1184 -0
  457. truthound/profiler/process_timeout.py +1566 -0
  458. truthound/profiler/progress.py +568 -0
  459. truthound/profiler/progress_callbacks.py +1734 -0
  460. truthound/profiler/quality.py +1345 -0
  461. truthound/profiler/resilience.py +1180 -0
  462. truthound/profiler/sampled_matcher.py +794 -0
  463. truthound/profiler/sampling.py +1288 -0
  464. truthound/profiler/scheduling/__init__.py +82 -0
  465. truthound/profiler/scheduling/protocols.py +214 -0
  466. truthound/profiler/scheduling/scheduler.py +474 -0
  467. truthound/profiler/scheduling/storage.py +457 -0
  468. truthound/profiler/scheduling/triggers.py +449 -0
  469. truthound/profiler/schema.py +603 -0
  470. truthound/profiler/streaming.py +685 -0
  471. truthound/profiler/streaming_patterns.py +1354 -0
  472. truthound/profiler/suite_cli.py +625 -0
  473. truthound/profiler/suite_config.py +789 -0
  474. truthound/profiler/suite_export.py +1268 -0
  475. truthound/profiler/table_profiler.py +547 -0
  476. truthound/profiler/timeout.py +565 -0
  477. truthound/profiler/validation.py +1532 -0
  478. truthound/profiler/visualization/__init__.py +118 -0
  479. truthound/profiler/visualization/base.py +346 -0
  480. truthound/profiler/visualization/generator.py +1259 -0
  481. truthound/profiler/visualization/plotly_renderer.py +811 -0
  482. truthound/profiler/visualization/renderers.py +669 -0
  483. truthound/profiler/visualization/sections.py +540 -0
  484. truthound/profiler/visualization.py +2122 -0
  485. truthound/profiler/yaml_validation.py +1151 -0
  486. truthound/py.typed +0 -0
  487. truthound/ratelimit/__init__.py +248 -0
  488. truthound/ratelimit/algorithms.py +1108 -0
  489. truthound/ratelimit/core.py +573 -0
  490. truthound/ratelimit/integration.py +532 -0
  491. truthound/ratelimit/limiter.py +663 -0
  492. truthound/ratelimit/middleware.py +700 -0
  493. truthound/ratelimit/policy.py +792 -0
  494. truthound/ratelimit/storage.py +763 -0
  495. truthound/rbac/__init__.py +340 -0
  496. truthound/rbac/core.py +976 -0
  497. truthound/rbac/integration.py +760 -0
  498. truthound/rbac/manager.py +1052 -0
  499. truthound/rbac/middleware.py +842 -0
  500. truthound/rbac/policy.py +954 -0
  501. truthound/rbac/storage.py +878 -0
  502. truthound/realtime/__init__.py +141 -0
  503. truthound/realtime/adapters/__init__.py +43 -0
  504. truthound/realtime/adapters/base.py +533 -0
  505. truthound/realtime/adapters/kafka.py +487 -0
  506. truthound/realtime/adapters/kinesis.py +479 -0
  507. truthound/realtime/adapters/mock.py +243 -0
  508. truthound/realtime/base.py +553 -0
  509. truthound/realtime/factory.py +382 -0
  510. truthound/realtime/incremental.py +660 -0
  511. truthound/realtime/processing/__init__.py +67 -0
  512. truthound/realtime/processing/exactly_once.py +575 -0
  513. truthound/realtime/processing/state.py +547 -0
  514. truthound/realtime/processing/windows.py +647 -0
  515. truthound/realtime/protocols.py +569 -0
  516. truthound/realtime/streaming.py +605 -0
  517. truthound/realtime/testing/__init__.py +32 -0
  518. truthound/realtime/testing/containers.py +615 -0
  519. truthound/realtime/testing/fixtures.py +484 -0
  520. truthound/report.py +280 -0
  521. truthound/reporters/__init__.py +46 -0
  522. truthound/reporters/_protocols.py +30 -0
  523. truthound/reporters/base.py +324 -0
  524. truthound/reporters/ci/__init__.py +66 -0
  525. truthound/reporters/ci/azure.py +436 -0
  526. truthound/reporters/ci/base.py +509 -0
  527. truthound/reporters/ci/bitbucket.py +567 -0
  528. truthound/reporters/ci/circleci.py +547 -0
  529. truthound/reporters/ci/detection.py +364 -0
  530. truthound/reporters/ci/factory.py +182 -0
  531. truthound/reporters/ci/github.py +388 -0
  532. truthound/reporters/ci/gitlab.py +471 -0
  533. truthound/reporters/ci/jenkins.py +525 -0
  534. truthound/reporters/console_reporter.py +299 -0
  535. truthound/reporters/factory.py +211 -0
  536. truthound/reporters/html_reporter.py +524 -0
  537. truthound/reporters/json_reporter.py +256 -0
  538. truthound/reporters/markdown_reporter.py +280 -0
  539. truthound/reporters/sdk/__init__.py +174 -0
  540. truthound/reporters/sdk/builder.py +558 -0
  541. truthound/reporters/sdk/mixins.py +1150 -0
  542. truthound/reporters/sdk/schema.py +1493 -0
  543. truthound/reporters/sdk/templates.py +666 -0
  544. truthound/reporters/sdk/testing.py +968 -0
  545. truthound/scanners.py +170 -0
  546. truthound/scheduling/__init__.py +122 -0
  547. truthound/scheduling/cron.py +1136 -0
  548. truthound/scheduling/presets.py +212 -0
  549. truthound/schema.py +275 -0
  550. truthound/secrets/__init__.py +173 -0
  551. truthound/secrets/base.py +618 -0
  552. truthound/secrets/cloud.py +682 -0
  553. truthound/secrets/integration.py +507 -0
  554. truthound/secrets/manager.py +633 -0
  555. truthound/secrets/oidc/__init__.py +172 -0
  556. truthound/secrets/oidc/base.py +902 -0
  557. truthound/secrets/oidc/credential_provider.py +623 -0
  558. truthound/secrets/oidc/exchangers.py +1001 -0
  559. truthound/secrets/oidc/github/__init__.py +110 -0
  560. truthound/secrets/oidc/github/claims.py +718 -0
  561. truthound/secrets/oidc/github/enhanced_provider.py +693 -0
  562. truthound/secrets/oidc/github/trust_policy.py +742 -0
  563. truthound/secrets/oidc/github/verification.py +723 -0
  564. truthound/secrets/oidc/github/workflow.py +691 -0
  565. truthound/secrets/oidc/providers.py +825 -0
  566. truthound/secrets/providers.py +506 -0
  567. truthound/secrets/resolver.py +495 -0
  568. truthound/stores/__init__.py +177 -0
  569. truthound/stores/backends/__init__.py +18 -0
  570. truthound/stores/backends/_protocols.py +340 -0
  571. truthound/stores/backends/azure_blob.py +530 -0
  572. truthound/stores/backends/concurrent_filesystem.py +915 -0
  573. truthound/stores/backends/connection_pool.py +1365 -0
  574. truthound/stores/backends/database.py +743 -0
  575. truthound/stores/backends/filesystem.py +538 -0
  576. truthound/stores/backends/gcs.py +399 -0
  577. truthound/stores/backends/memory.py +354 -0
  578. truthound/stores/backends/s3.py +434 -0
  579. truthound/stores/backpressure/__init__.py +84 -0
  580. truthound/stores/backpressure/base.py +375 -0
  581. truthound/stores/backpressure/circuit_breaker.py +434 -0
  582. truthound/stores/backpressure/monitor.py +376 -0
  583. truthound/stores/backpressure/strategies.py +677 -0
  584. truthound/stores/base.py +551 -0
  585. truthound/stores/batching/__init__.py +65 -0
  586. truthound/stores/batching/base.py +305 -0
  587. truthound/stores/batching/buffer.py +370 -0
  588. truthound/stores/batching/store.py +248 -0
  589. truthound/stores/batching/writer.py +521 -0
  590. truthound/stores/caching/__init__.py +60 -0
  591. truthound/stores/caching/backends.py +684 -0
  592. truthound/stores/caching/base.py +356 -0
  593. truthound/stores/caching/store.py +305 -0
  594. truthound/stores/compression/__init__.py +193 -0
  595. truthound/stores/compression/adaptive.py +694 -0
  596. truthound/stores/compression/base.py +514 -0
  597. truthound/stores/compression/pipeline.py +868 -0
  598. truthound/stores/compression/providers.py +672 -0
  599. truthound/stores/compression/streaming.py +832 -0
  600. truthound/stores/concurrency/__init__.py +81 -0
  601. truthound/stores/concurrency/atomic.py +556 -0
  602. truthound/stores/concurrency/index.py +775 -0
  603. truthound/stores/concurrency/locks.py +576 -0
  604. truthound/stores/concurrency/manager.py +482 -0
  605. truthound/stores/encryption/__init__.py +297 -0
  606. truthound/stores/encryption/base.py +952 -0
  607. truthound/stores/encryption/keys.py +1191 -0
  608. truthound/stores/encryption/pipeline.py +903 -0
  609. truthound/stores/encryption/providers.py +953 -0
  610. truthound/stores/encryption/streaming.py +950 -0
  611. truthound/stores/expectations.py +227 -0
  612. truthound/stores/factory.py +246 -0
  613. truthound/stores/migration/__init__.py +75 -0
  614. truthound/stores/migration/base.py +480 -0
  615. truthound/stores/migration/manager.py +347 -0
  616. truthound/stores/migration/registry.py +382 -0
  617. truthound/stores/migration/store.py +559 -0
  618. truthound/stores/observability/__init__.py +106 -0
  619. truthound/stores/observability/audit.py +718 -0
  620. truthound/stores/observability/config.py +270 -0
  621. truthound/stores/observability/factory.py +208 -0
  622. truthound/stores/observability/metrics.py +636 -0
  623. truthound/stores/observability/protocols.py +410 -0
  624. truthound/stores/observability/store.py +570 -0
  625. truthound/stores/observability/tracing.py +784 -0
  626. truthound/stores/replication/__init__.py +76 -0
  627. truthound/stores/replication/base.py +260 -0
  628. truthound/stores/replication/monitor.py +269 -0
  629. truthound/stores/replication/store.py +439 -0
  630. truthound/stores/replication/syncer.py +391 -0
  631. truthound/stores/results.py +359 -0
  632. truthound/stores/retention/__init__.py +77 -0
  633. truthound/stores/retention/base.py +378 -0
  634. truthound/stores/retention/policies.py +621 -0
  635. truthound/stores/retention/scheduler.py +279 -0
  636. truthound/stores/retention/store.py +526 -0
  637. truthound/stores/streaming/__init__.py +138 -0
  638. truthound/stores/streaming/base.py +801 -0
  639. truthound/stores/streaming/database.py +984 -0
  640. truthound/stores/streaming/filesystem.py +719 -0
  641. truthound/stores/streaming/reader.py +629 -0
  642. truthound/stores/streaming/s3.py +843 -0
  643. truthound/stores/streaming/writer.py +790 -0
  644. truthound/stores/tiering/__init__.py +108 -0
  645. truthound/stores/tiering/base.py +462 -0
  646. truthound/stores/tiering/manager.py +249 -0
  647. truthound/stores/tiering/policies.py +692 -0
  648. truthound/stores/tiering/store.py +526 -0
  649. truthound/stores/versioning/__init__.py +56 -0
  650. truthound/stores/versioning/base.py +376 -0
  651. truthound/stores/versioning/store.py +660 -0
  652. truthound/stores/versioning/strategies.py +353 -0
  653. truthound/types.py +56 -0
  654. truthound/validators/__init__.py +774 -0
  655. truthound/validators/aggregate/__init__.py +27 -0
  656. truthound/validators/aggregate/central.py +116 -0
  657. truthound/validators/aggregate/extremes.py +116 -0
  658. truthound/validators/aggregate/spread.py +118 -0
  659. truthound/validators/aggregate/sum.py +64 -0
  660. truthound/validators/aggregate/type.py +78 -0
  661. truthound/validators/anomaly/__init__.py +93 -0
  662. truthound/validators/anomaly/base.py +431 -0
  663. truthound/validators/anomaly/ml_based.py +1190 -0
  664. truthound/validators/anomaly/multivariate.py +647 -0
  665. truthound/validators/anomaly/statistical.py +599 -0
  666. truthound/validators/base.py +1089 -0
  667. truthound/validators/business_rule/__init__.py +46 -0
  668. truthound/validators/business_rule/base.py +147 -0
  669. truthound/validators/business_rule/checksum.py +509 -0
  670. truthound/validators/business_rule/financial.py +526 -0
  671. truthound/validators/cache.py +733 -0
  672. truthound/validators/completeness/__init__.py +39 -0
  673. truthound/validators/completeness/conditional.py +73 -0
  674. truthound/validators/completeness/default.py +98 -0
  675. truthound/validators/completeness/empty.py +103 -0
  676. truthound/validators/completeness/nan.py +337 -0
  677. truthound/validators/completeness/null.py +152 -0
  678. truthound/validators/cross_table/__init__.py +17 -0
  679. truthound/validators/cross_table/aggregate.py +333 -0
  680. truthound/validators/cross_table/row_count.py +122 -0
  681. truthound/validators/datetime/__init__.py +29 -0
  682. truthound/validators/datetime/format.py +78 -0
  683. truthound/validators/datetime/freshness.py +269 -0
  684. truthound/validators/datetime/order.py +73 -0
  685. truthound/validators/datetime/parseable.py +185 -0
  686. truthound/validators/datetime/range.py +202 -0
  687. truthound/validators/datetime/timezone.py +69 -0
  688. truthound/validators/distribution/__init__.py +49 -0
  689. truthound/validators/distribution/distribution.py +128 -0
  690. truthound/validators/distribution/monotonic.py +119 -0
  691. truthound/validators/distribution/outlier.py +178 -0
  692. truthound/validators/distribution/quantile.py +80 -0
  693. truthound/validators/distribution/range.py +254 -0
  694. truthound/validators/distribution/set.py +125 -0
  695. truthound/validators/distribution/statistical.py +459 -0
  696. truthound/validators/drift/__init__.py +79 -0
  697. truthound/validators/drift/base.py +427 -0
  698. truthound/validators/drift/multi_feature.py +401 -0
  699. truthound/validators/drift/numeric.py +395 -0
  700. truthound/validators/drift/psi.py +446 -0
  701. truthound/validators/drift/statistical.py +510 -0
  702. truthound/validators/enterprise.py +1658 -0
  703. truthound/validators/geospatial/__init__.py +80 -0
  704. truthound/validators/geospatial/base.py +97 -0
  705. truthound/validators/geospatial/boundary.py +238 -0
  706. truthound/validators/geospatial/coordinate.py +351 -0
  707. truthound/validators/geospatial/distance.py +399 -0
  708. truthound/validators/geospatial/polygon.py +665 -0
  709. truthound/validators/i18n/__init__.py +308 -0
  710. truthound/validators/i18n/bidi.py +571 -0
  711. truthound/validators/i18n/catalogs.py +570 -0
  712. truthound/validators/i18n/dialects.py +763 -0
  713. truthound/validators/i18n/extended_catalogs.py +549 -0
  714. truthound/validators/i18n/formatting.py +1434 -0
  715. truthound/validators/i18n/loader.py +1020 -0
  716. truthound/validators/i18n/messages.py +521 -0
  717. truthound/validators/i18n/plural.py +683 -0
  718. truthound/validators/i18n/protocols.py +855 -0
  719. truthound/validators/i18n/tms.py +1162 -0
  720. truthound/validators/localization/__init__.py +53 -0
  721. truthound/validators/localization/base.py +122 -0
  722. truthound/validators/localization/chinese.py +362 -0
  723. truthound/validators/localization/japanese.py +275 -0
  724. truthound/validators/localization/korean.py +524 -0
  725. truthound/validators/memory/__init__.py +94 -0
  726. truthound/validators/memory/approximate_knn.py +506 -0
  727. truthound/validators/memory/base.py +547 -0
  728. truthound/validators/memory/sgd_online.py +719 -0
  729. truthound/validators/memory/streaming_ecdf.py +753 -0
  730. truthound/validators/ml_feature/__init__.py +54 -0
  731. truthound/validators/ml_feature/base.py +249 -0
  732. truthound/validators/ml_feature/correlation.py +299 -0
  733. truthound/validators/ml_feature/leakage.py +344 -0
  734. truthound/validators/ml_feature/null_impact.py +270 -0
  735. truthound/validators/ml_feature/scale.py +264 -0
  736. truthound/validators/multi_column/__init__.py +89 -0
  737. truthound/validators/multi_column/arithmetic.py +284 -0
  738. truthound/validators/multi_column/base.py +231 -0
  739. truthound/validators/multi_column/comparison.py +273 -0
  740. truthound/validators/multi_column/consistency.py +312 -0
  741. truthound/validators/multi_column/statistical.py +299 -0
  742. truthound/validators/optimization/__init__.py +164 -0
  743. truthound/validators/optimization/aggregation.py +563 -0
  744. truthound/validators/optimization/covariance.py +556 -0
  745. truthound/validators/optimization/geo.py +626 -0
  746. truthound/validators/optimization/graph.py +587 -0
  747. truthound/validators/optimization/orchestrator.py +970 -0
  748. truthound/validators/optimization/profiling.py +1312 -0
  749. truthound/validators/privacy/__init__.py +223 -0
  750. truthound/validators/privacy/base.py +635 -0
  751. truthound/validators/privacy/ccpa.py +670 -0
  752. truthound/validators/privacy/gdpr.py +728 -0
  753. truthound/validators/privacy/global_patterns.py +604 -0
  754. truthound/validators/privacy/plugins.py +867 -0
  755. truthound/validators/profiling/__init__.py +52 -0
  756. truthound/validators/profiling/base.py +175 -0
  757. truthound/validators/profiling/cardinality.py +312 -0
  758. truthound/validators/profiling/entropy.py +391 -0
  759. truthound/validators/profiling/frequency.py +455 -0
  760. truthound/validators/pushdown_support.py +660 -0
  761. truthound/validators/query/__init__.py +91 -0
  762. truthound/validators/query/aggregate.py +346 -0
  763. truthound/validators/query/base.py +246 -0
  764. truthound/validators/query/column.py +249 -0
  765. truthound/validators/query/expression.py +274 -0
  766. truthound/validators/query/result.py +323 -0
  767. truthound/validators/query/row_count.py +264 -0
  768. truthound/validators/referential/__init__.py +80 -0
  769. truthound/validators/referential/base.py +395 -0
  770. truthound/validators/referential/cascade.py +391 -0
  771. truthound/validators/referential/circular.py +563 -0
  772. truthound/validators/referential/foreign_key.py +624 -0
  773. truthound/validators/referential/orphan.py +485 -0
  774. truthound/validators/registry.py +112 -0
  775. truthound/validators/schema/__init__.py +41 -0
  776. truthound/validators/schema/column_count.py +142 -0
  777. truthound/validators/schema/column_exists.py +80 -0
  778. truthound/validators/schema/column_order.py +82 -0
  779. truthound/validators/schema/column_pair.py +85 -0
  780. truthound/validators/schema/column_pair_set.py +195 -0
  781. truthound/validators/schema/column_type.py +94 -0
  782. truthound/validators/schema/multi_column.py +53 -0
  783. truthound/validators/schema/multi_column_aggregate.py +175 -0
  784. truthound/validators/schema/referential.py +274 -0
  785. truthound/validators/schema/table_schema.py +91 -0
  786. truthound/validators/schema_validator.py +219 -0
  787. truthound/validators/sdk/__init__.py +250 -0
  788. truthound/validators/sdk/builder.py +680 -0
  789. truthound/validators/sdk/decorators.py +474 -0
  790. truthound/validators/sdk/enterprise/__init__.py +211 -0
  791. truthound/validators/sdk/enterprise/docs.py +725 -0
  792. truthound/validators/sdk/enterprise/fuzzing.py +659 -0
  793. truthound/validators/sdk/enterprise/licensing.py +709 -0
  794. truthound/validators/sdk/enterprise/manager.py +543 -0
  795. truthound/validators/sdk/enterprise/resources.py +628 -0
  796. truthound/validators/sdk/enterprise/sandbox.py +766 -0
  797. truthound/validators/sdk/enterprise/signing.py +603 -0
  798. truthound/validators/sdk/enterprise/templates.py +865 -0
  799. truthound/validators/sdk/enterprise/versioning.py +659 -0
  800. truthound/validators/sdk/templates.py +757 -0
  801. truthound/validators/sdk/testing.py +807 -0
  802. truthound/validators/security/__init__.py +181 -0
  803. truthound/validators/security/redos/__init__.py +182 -0
  804. truthound/validators/security/redos/core.py +861 -0
  805. truthound/validators/security/redos/cpu_monitor.py +593 -0
  806. truthound/validators/security/redos/cve_database.py +791 -0
  807. truthound/validators/security/redos/ml/__init__.py +155 -0
  808. truthound/validators/security/redos/ml/base.py +785 -0
  809. truthound/validators/security/redos/ml/datasets.py +618 -0
  810. truthound/validators/security/redos/ml/features.py +359 -0
  811. truthound/validators/security/redos/ml/models.py +1000 -0
  812. truthound/validators/security/redos/ml/predictor.py +507 -0
  813. truthound/validators/security/redos/ml/storage.py +632 -0
  814. truthound/validators/security/redos/ml/training.py +571 -0
  815. truthound/validators/security/redos/ml_analyzer.py +937 -0
  816. truthound/validators/security/redos/optimizer.py +674 -0
  817. truthound/validators/security/redos/profiler.py +682 -0
  818. truthound/validators/security/redos/re2_engine.py +709 -0
  819. truthound/validators/security/redos.py +886 -0
  820. truthound/validators/security/sql_security.py +1247 -0
  821. truthound/validators/streaming/__init__.py +126 -0
  822. truthound/validators/streaming/base.py +292 -0
  823. truthound/validators/streaming/completeness.py +210 -0
  824. truthound/validators/streaming/mixin.py +575 -0
  825. truthound/validators/streaming/range.py +308 -0
  826. truthound/validators/streaming/sources.py +846 -0
  827. truthound/validators/string/__init__.py +57 -0
  828. truthound/validators/string/casing.py +158 -0
  829. truthound/validators/string/charset.py +96 -0
  830. truthound/validators/string/format.py +501 -0
  831. truthound/validators/string/json.py +77 -0
  832. truthound/validators/string/json_schema.py +184 -0
  833. truthound/validators/string/length.py +104 -0
  834. truthound/validators/string/like_pattern.py +237 -0
  835. truthound/validators/string/regex.py +202 -0
  836. truthound/validators/string/regex_extended.py +435 -0
  837. truthound/validators/table/__init__.py +88 -0
  838. truthound/validators/table/base.py +78 -0
  839. truthound/validators/table/column_count.py +198 -0
  840. truthound/validators/table/freshness.py +362 -0
  841. truthound/validators/table/row_count.py +251 -0
  842. truthound/validators/table/schema.py +333 -0
  843. truthound/validators/table/size.py +285 -0
  844. truthound/validators/timeout/__init__.py +102 -0
  845. truthound/validators/timeout/advanced/__init__.py +247 -0
  846. truthound/validators/timeout/advanced/circuit_breaker.py +675 -0
  847. truthound/validators/timeout/advanced/prediction.py +773 -0
  848. truthound/validators/timeout/advanced/priority.py +618 -0
  849. truthound/validators/timeout/advanced/redis_backend.py +770 -0
  850. truthound/validators/timeout/advanced/retry.py +721 -0
  851. truthound/validators/timeout/advanced/sampling.py +788 -0
  852. truthound/validators/timeout/advanced/sla.py +661 -0
  853. truthound/validators/timeout/advanced/telemetry.py +804 -0
  854. truthound/validators/timeout/cascade.py +477 -0
  855. truthound/validators/timeout/deadline.py +657 -0
  856. truthound/validators/timeout/degradation.py +525 -0
  857. truthound/validators/timeout/distributed.py +597 -0
  858. truthound/validators/timeseries/__init__.py +89 -0
  859. truthound/validators/timeseries/base.py +326 -0
  860. truthound/validators/timeseries/completeness.py +617 -0
  861. truthound/validators/timeseries/gap.py +485 -0
  862. truthound/validators/timeseries/monotonic.py +310 -0
  863. truthound/validators/timeseries/seasonality.py +422 -0
  864. truthound/validators/timeseries/trend.py +510 -0
  865. truthound/validators/uniqueness/__init__.py +59 -0
  866. truthound/validators/uniqueness/approximate.py +475 -0
  867. truthound/validators/uniqueness/distinct_values.py +253 -0
  868. truthound/validators/uniqueness/duplicate.py +118 -0
  869. truthound/validators/uniqueness/primary_key.py +140 -0
  870. truthound/validators/uniqueness/unique.py +191 -0
  871. truthound/validators/uniqueness/within_record.py +599 -0
  872. truthound/validators/utils.py +756 -0
  873. truthound-1.0.8.dist-info/METADATA +474 -0
  874. truthound-1.0.8.dist-info/RECORD +877 -0
  875. truthound-1.0.8.dist-info/WHEEL +4 -0
  876. truthound-1.0.8.dist-info/entry_points.txt +2 -0
  877. truthound-1.0.8.dist-info/licenses/LICENSE +190 -0
@@ -0,0 +1,1127 @@
1
+ """Ray-native execution engine for distributed data validation.
2
+
3
+ This module provides a Ray-native execution engine that:
4
+ - Executes validation operations directly on Ray Datasets
5
+ - Avoids Polars conversion overhead for distributed operations
6
+ - Uses Arrow for efficient data transfer when conversion is needed
7
+ - Supports distributed aggregations with proper reduce semantics
8
+
9
+ Architecture:
10
+ ┌─────────────────────────────────────────────────────────────────┐
11
+ │ RayExecutionEngine │
12
+ │ │
13
+ │ ┌──────────────────────────────────────────────────────────┐ │
14
+ │ │ Native Ray Operations │ │
15
+ │ │ (count, aggregate, filter - no conversion overhead) │ │
16
+ │ └──────────────────────────────────────────────────────────┘ │
17
+ │ │ │
18
+ │ ▼ │
19
+ │ ┌──────────────────────────────────────────────────────────┐ │
20
+ │ │ Arrow Bridge (when needed) │ │
21
+ │ │ (zero-copy conversion to Polars for ML validators) │ │
22
+ │ └──────────────────────────────────────────────────────────┘ │
23
+ │ │ │
24
+ │ ▼ │
25
+ │ ┌──────────────────────────────────────────────────────────┐ │
26
+ │ │ Polars LazyFrame (fallback) │ │
27
+ │ │ (only for validators that require Polars operations) │ │
28
+ │ └──────────────────────────────────────────────────────────┘ │
29
+ │ │
30
+ └─────────────────────────────────────────────────────────────────┘
31
+
32
+ Example:
33
+ >>> import ray
34
+ >>> from truthound.execution.distributed import RayExecutionEngine
35
+ >>>
36
+ >>> ray.init()
37
+ >>> ds = ray.data.read_parquet("large_data.parquet")
38
+ >>>
39
+ >>> # Create native Ray engine
40
+ >>> engine = RayExecutionEngine.from_dataset(ds)
41
+ >>>
42
+ >>> # Native Ray operations (no conversion overhead)
43
+ >>> row_count = engine.count_rows()
44
+ >>> null_counts = engine.count_nulls_all()
45
+ >>> stats = engine.get_stats("price")
46
+ >>>
47
+ >>> # Convert to Polars only when needed (via Arrow)
48
+ >>> lf = engine.to_polars_lazyframe()
49
+ """
50
+
51
+ from __future__ import annotations
52
+
53
+ import logging
54
+ import time
55
+ from dataclasses import dataclass, field
56
+ from functools import reduce
57
+ from typing import TYPE_CHECKING, Any, Callable, Iterator
58
+
59
+ from truthound.execution.distributed.base import (
60
+ BaseDistributedEngine,
61
+ DistributedEngineConfig,
62
+ ExecutionMetrics,
63
+ )
64
+ from truthound.execution.distributed.protocols import (
65
+ AggregationScope,
66
+ AggregationSpec,
67
+ ComputeBackend,
68
+ DistributedResult,
69
+ PartitionInfo,
70
+ PartitionStrategy,
71
+ get_aggregator,
72
+ )
73
+
74
+ if TYPE_CHECKING:
75
+ import pyarrow as pa
76
+ import ray
77
+ from ray.data import Dataset
78
+
79
+ logger = logging.getLogger(__name__)
80
+
81
+
82
+ # =============================================================================
83
+ # Configuration
84
+ # =============================================================================
85
+
86
+
87
@dataclass
class RayEngineConfig(DistributedEngineConfig):
    """Settings for the Ray execution engine.

    ``ray_address``, ``num_cpus``, ``num_gpus`` and ``object_store_memory``
    are forwarded to ``ray.init`` when this module has to start Ray itself.
    ``batch_size`` controls how many rows are handed over per batch when the
    engine maps over or iterates the dataset.

    Attributes:
        ray_address: Address of the Ray cluster to join (None = local).
        num_cpus: CPU count to request.
        num_gpus: GPU count to request.
        object_store_memory: Size of the object store, in bytes.
        batch_size: Rows per batch when iterating over data.
        prefetch_batches: How many batches to prefetch.
        concurrency: Concurrent task count for map operations.
        use_actors: Whether to use an actor pool.
        actor_pool_size: Number of actors in that pool.
        target_max_block_size: Target upper bound for a block, in bytes.
    """

    # Cluster / resource options
    ray_address: str | None = None
    num_cpus: int | None = None
    num_gpus: int | None = None
    object_store_memory: int | None = None

    # Batch iteration options
    batch_size: int = 4096
    prefetch_batches: int = 2
    concurrency: int | None = None

    # Actor pool options
    use_actors: bool = False
    actor_pool_size: int = 4

    # 128 MiB expressed in bytes
    target_max_block_size: int = 128 * 1024**2
114
+
115
+
116
+ def _check_ray_available() -> None:
117
+ """Check if Ray is available."""
118
+ try:
119
+ import ray # noqa: F401
120
+ import ray.data # noqa: F401
121
+ except ImportError:
122
+ raise ImportError(
123
+ "ray is required for RayExecutionEngine. "
124
+ "Install with: pip install 'ray[data]'"
125
+ )
126
+
127
+
128
def _ensure_ray_initialized(config: RayEngineConfig) -> None:
    """Start Ray with the options from ``config`` unless it is already running.

    Only options that were actually set are forwarded to ``ray.init``; when
    Ray is already initialized this function does nothing.

    Args:
        config: Engine configuration supplying ``ray_address``, ``num_cpus``,
            ``num_gpus`` and ``object_store_memory``.
    """
    import ray

    if ray.is_initialized():
        return

    init_kwargs: dict[str, Any] = {}
    if config.ray_address:
        init_kwargs["address"] = config.ray_address
    # Compare against None rather than testing truthiness: an explicit 0 is a
    # deliberate resource request and must not be dropped.
    if config.num_cpus is not None:
        init_kwargs["num_cpus"] = config.num_cpus
    if config.num_gpus is not None:
        init_kwargs["num_gpus"] = config.num_gpus
    if config.object_store_memory:
        init_kwargs["object_store_memory"] = config.object_store_memory

    ray.init(**init_kwargs)
144
+
145
+
146
+ # =============================================================================
147
+ # Ray Execution Engine
148
+ # =============================================================================
149
+
150
+
151
+ class RayExecutionEngine(BaseDistributedEngine[RayEngineConfig]):
152
+ """Ray-native execution engine for distributed validation.
153
+
154
+ This engine executes validation operations directly on Ray Datasets,
155
+ avoiding the overhead of converting to Polars for operations that can
156
+ be performed natively in Ray.
157
+
158
+ Key Features:
159
+ - Native Ray aggregations (count, sum, mean, min, max, etc.)
160
+ - Distributed null/duplicate checking
161
+ - Arrow-based zero-copy conversion to Polars when needed
162
+ - Block-aware operations
163
+ - Automatic scaling and fault tolerance
164
+
165
+ Example:
166
+ >>> engine = RayExecutionEngine.from_dataset(ray_dataset)
167
+ >>> null_counts = engine.count_nulls_all() # Native Ray
168
+ >>> lf = engine.to_polars_lazyframe() # Arrow-based conversion
169
+ """
170
+
171
+ engine_type = "ray"
172
+
173
def __init__(
    self,
    dataset: "Dataset",
    config: RayEngineConfig | None = None,
) -> None:
    """Bind the engine to a Ray Dataset.

    Ray's importability is checked first. After the base-class initializer
    has run, Ray is started (if not already running) using ``self._config``,
    and the dataset's schema and column names are captured.

    Args:
        dataset: Ray Dataset to operate on.
        config: Optional engine configuration.
    """
    _check_ray_available()
    super().__init__(config)
    _ensure_ray_initialized(self._config)

    schema = dataset.schema()

    self._ds = dataset
    self._schema = schema
    # A falsy schema yields an empty column list.
    self._columns = list(schema.names) if schema else []
    self._cached_row_count: int | None = None
193
+
194
@classmethod
def _default_config(cls) -> RayEngineConfig:
    """Return a ``RayEngineConfig`` built entirely from its field defaults."""
    default_config = RayEngineConfig()
    return default_config
198
+
199
+ # -------------------------------------------------------------------------
200
+ # Factory Methods
201
+ # -------------------------------------------------------------------------
202
+
203
@classmethod
def from_dataset(
    cls,
    dataset: "Dataset",
    config: RayEngineConfig | None = None,
) -> "RayExecutionEngine":
    """Wrap an already-constructed Ray Dataset in an engine.

    Args:
        dataset: Ray Dataset to wrap.
        config: Optional engine configuration.

    Returns:
        A new engine instance bound to ``dataset``.
    """
    engine = cls(dataset, config)
    return engine
219
+
220
@classmethod
def from_parquet(
    cls,
    path: str,
    config: RayEngineConfig | None = None,
    **read_kwargs: Any,
) -> "RayExecutionEngine":
    """Build an engine by reading Parquet data with ``ray.data.read_parquet``.

    Ray is initialized (if needed) before reading, using ``config`` or a
    default ``RayEngineConfig`` when none is supplied.

    Args:
        path: Location of the Parquet files (glob patterns allowed).
        config: Optional engine configuration.
        **read_kwargs: Extra keyword arguments passed to ``read_parquet``.

    Returns:
        A new engine instance over the loaded dataset.
    """
    _check_ray_available()
    import ray.data

    # Ray must be running before the reader is invoked.
    _ensure_ray_initialized(config or RayEngineConfig())

    loaded = ray.data.read_parquet(path, **read_kwargs)
    return cls(loaded, config)
246
+
247
@classmethod
def from_csv(
    cls,
    path: str,
    config: RayEngineConfig | None = None,
    **read_kwargs: Any,
) -> "RayExecutionEngine":
    """Build an engine by reading CSV data with ``ray.data.read_csv``.

    Ray is initialized (if needed) before reading, using ``config`` or a
    default ``RayEngineConfig`` when none is supplied.

    Args:
        path: Location of the CSV files (glob patterns allowed).
        config: Optional engine configuration.
        **read_kwargs: Extra keyword arguments passed to ``read_csv``.

    Returns:
        A new engine instance over the loaded dataset.
    """
    _check_ray_available()
    import ray.data

    # Ray must be running before the reader is invoked.
    _ensure_ray_initialized(config or RayEngineConfig())

    loaded = ray.data.read_csv(path, **read_kwargs)
    return cls(loaded, config)
273
+
274
@classmethod
def from_pandas(
    cls,
    df: Any,
    config: RayEngineConfig | None = None,
) -> "RayExecutionEngine":
    """Build an engine from a Pandas DataFrame via ``ray.data.from_pandas``.

    Ray is initialized (if needed) before conversion, using ``config`` or a
    default ``RayEngineConfig`` when none is supplied.

    Args:
        df: Pandas DataFrame to convert.
        config: Optional engine configuration.

    Returns:
        A new engine instance over the converted dataset.
    """
    _check_ray_available()
    import ray.data

    # Ray must be running before the dataset is created.
    _ensure_ray_initialized(config or RayEngineConfig())

    converted = ray.data.from_pandas(df)
    return cls(converted, config)
298
+
299
@classmethod
def from_arrow(
    cls,
    table: "pa.Table",
    config: RayEngineConfig | None = None,
) -> "RayExecutionEngine":
    """Build an engine from a PyArrow Table via ``ray.data.from_arrow``.

    Ray is initialized (if needed) before conversion, using ``config`` or a
    default ``RayEngineConfig`` when none is supplied.

    Args:
        table: PyArrow Table to convert.
        config: Optional engine configuration.

    Returns:
        A new engine instance over the converted dataset.
    """
    _check_ray_available()
    import ray.data

    # Ray must be running before the dataset is created.
    _ensure_ray_initialized(config or RayEngineConfig())

    converted = ray.data.from_arrow(table)
    return cls(converted, config)
323
+
324
@classmethod
def from_items(
    cls,
    items: list[dict[str, Any]],
    config: RayEngineConfig | None = None,
) -> "RayExecutionEngine":
    """Build an engine from row dictionaries via ``ray.data.from_items``.

    Ray is initialized (if needed) before conversion, using ``config`` or a
    default ``RayEngineConfig`` when none is supplied.

    Args:
        items: Rows, each given as a dictionary.
        config: Optional engine configuration.

    Returns:
        A new engine instance over the created dataset.
    """
    _check_ray_available()
    import ray.data

    # Ray must be running before the dataset is created.
    _ensure_ray_initialized(config or RayEngineConfig())

    created = ray.data.from_items(items)
    return cls(created, config)
348
+
349
+ # -------------------------------------------------------------------------
350
+ # Properties
351
+ # -------------------------------------------------------------------------
352
+
353
@property
def backend_type(self) -> ComputeBackend:
    """Compute backend this engine runs on; always ``ComputeBackend.RAY``."""
    backend = ComputeBackend.RAY
    return backend
357
+
358
+ @property
359
+ def dataset(self) -> "Dataset":
360
+ """Get the underlying Ray Dataset."""
361
+ return self._ds
362
+
363
+ @property
364
+ def schema(self) -> Any:
365
+ """Get the dataset schema."""
366
+ return self._schema
367
+
368
+ @property
369
+ def supports_sql_pushdown(self) -> bool:
370
+ """Ray doesn't have native SQL pushdown."""
371
+ return False
372
+
373
+ # -------------------------------------------------------------------------
374
+ # Abstract Method Implementations
375
+ # -------------------------------------------------------------------------
376
+
377
+ def _get_partition_count(self) -> int:
378
+ """Get number of data blocks (partitions)."""
379
+ return self._ds.num_blocks()
380
+
381
def _get_partition_info(self) -> list[PartitionInfo]:
    """Describe every block of the dataset as a ``PartitionInfo``.

    Returns:
        One entry per block, numbered from 0, each carrying the total block
        count and the engine's column names as a tuple.
    """
    total = self._get_partition_count()
    column_names = tuple(self._columns)

    infos = []
    for block_index in range(total):
        infos.append(
            PartitionInfo(
                partition_id=block_index,
                total_partitions=total,
                columns=column_names,
            )
        )
    return infos
394
+
395
def _execute_on_partitions(
    self,
    operation: str,
    func: Callable[[Any], dict[str, Any]],
    columns: list[str] | None = None,
) -> list[DistributedResult]:
    """Run ``func`` over the dataset in batches and collect one result per batch.

    The dataset (optionally narrowed to ``columns``) is processed with
    ``map_batches``; each call to ``func`` produces a single output row that
    holds its value, the batch's row count, the time spent in ``func``, and
    any errors/metadata it reported.

    Args:
        operation: Operation name, used for metrics and stored on each result.
        func: Function applied to each batch. It must return a mapping; the
            keys ``value``, ``errors`` and ``metadata`` are read from it.
        columns: Columns to include (None or empty = all).

    Returns:
        One ``DistributedResult`` per processed batch. ``partition_id`` is the
        result's position in the returned list, so ids are unique.

    Raises:
        Exception: Any failure is recorded on the metrics object and re-raised.
    """
    metrics = self._start_metrics(operation)

    try:
        ds = self._ds
        if columns:
            ds = ds.select_columns(columns)

        def wrapped_func(batch: dict[str, Any]) -> dict[str, Any]:
            # Time only the user function; perf_counter is monotonic, so the
            # measured duration cannot go negative on a clock adjustment.
            start_time = time.perf_counter()
            result = func(batch)
            duration_ms = (time.perf_counter() - start_time) * 1000

            # Assumes all columns of a batch share one length, so the first
            # column is enough to get the row count.
            row_count = len(next(iter(batch.values()))) if batch else 0

            # Single-row output batch: one entry per column.
            return {
                "value": [result.get("value")],
                "row_count": [row_count],
                "duration_ms": [duration_ms],
                "errors": [result.get("errors", [])],
                "metadata": [result.get("metadata", {})],
            }

        # NOTE(review): "pydict" does not appear among the batch formats that
        # Ray Data documents for map_batches / iter_batches - confirm against
        # the Ray version this package targets.
        results_ds = ds.map_batches(
            wrapped_func,
            batch_format="pydict",
            batch_size=self._config.batch_size,
        )

        collected: list[DistributedResult] = []
        for batch in results_ds.iter_batches(batch_format="pydict"):
            rows = zip(
                batch["value"],
                batch["row_count"],
                batch["duration_ms"],
                batch["errors"],
                batch["metadata"],
            )
            for value, row_count, duration_ms, errors, metadata in rows:
                collected.append(
                    DistributedResult(
                        # A collecting batch can hold many result rows, so the
                        # batch index is not a usable id; number results
                        # sequentially instead.
                        partition_id=len(collected),
                        operation=operation,
                        value=value,
                        row_count=row_count,
                        duration_ms=duration_ms,
                        errors=errors or [],
                        metadata=metadata or {},
                    )
                )

        metrics.partitions_processed = len(collected)
        metrics.rows_processed = sum(r.row_count for r in collected)

        return collected

    except Exception as e:
        metrics.errors.append(str(e))
        raise
    finally:
        self._end_metrics(metrics)
470
+
471
+ def _aggregate_distributed(
472
+ self,
473
+ spec: AggregationSpec,
474
+ ) -> dict[str, Any]:
475
+ """Perform distributed aggregation using native Ray operations.
476
+
477
+ Args:
478
+ spec: Aggregation specification.
479
+
480
+ Returns:
481
+ Aggregated results.
482
+ """
483
+ import ray
484
+
485
+ metrics = self._start_metrics("aggregate")
486
+
487
+ try:
488
+ results = {}
489
+
490
+ for agg in spec.aggregations:
491
+ column = agg.column
492
+ operation = agg.operation
493
+ alias = agg.alias
494
+ params = agg.params
495
+
496
+ if operation == "count":
497
+ if column == "*":
498
+ value = self._ds.count()
499
+ else:
500
+ # Count non-null values
501
+ value = self._count_non_null(column)
502
+ results[alias] = value
503
+
504
+ elif operation == "sum":
505
+ value = self._ds.sum(column)
506
+ results[alias] = value
507
+
508
+ elif operation == "mean":
509
+ value = self._ds.mean(column)
510
+ results[alias] = value
511
+
512
+ elif operation == "min":
513
+ value = self._ds.min(column)
514
+ results[alias] = value
515
+
516
+ elif operation == "max":
517
+ value = self._ds.max(column)
518
+ results[alias] = value
519
+
520
+ elif operation == "std":
521
+ value = self._ds.std(column)
522
+ results[alias] = value
523
+
524
+ elif operation == "var":
525
+ # Ray doesn't have built-in var, compute from std
526
+ std = self._ds.std(column)
527
+ value = std ** 2 if std is not None else None
528
+ results[alias] = value
529
+
530
+ elif operation == "minmax":
531
+ min_val = self._ds.min(column)
532
+ max_val = self._ds.max(column)
533
+ results[alias] = {"min": min_val, "max": max_val}
534
+
535
+ elif operation == "null_count":
536
+ null_count = self._count_nulls_column(column)
537
+ total_count = self._ds.count()
538
+ results[alias] = {
539
+ "null_count": null_count,
540
+ "total_count": total_count,
541
+ }
542
+
543
+ elif operation == "distinct_count":
544
+ value = self._count_distinct_column(column)
545
+ results[alias] = value
546
+
547
+ else:
548
+ # Use custom aggregator via map-reduce
549
+ result = self._aggregate_with_aggregator(agg)
550
+ results[alias] = result
551
+
552
+ return results
553
+
554
+ except Exception as e:
555
+ metrics.errors.append(str(e))
556
+ raise
557
+ finally:
558
+ self._end_metrics(metrics)
559
+
560
+ def _count_non_null(self, column: str) -> int:
561
+ """Count non-null values in a column."""
562
+ total = self._ds.count()
563
+ null_count = self._count_nulls_column(column)
564
+ return total - null_count
565
+
566
def _count_nulls_column(self, column: str) -> int:
    """Count null values in a column.

    One Ray task is submitted per batch yielded by ``iter_batches``; each
    task counts the ``None`` entries found under ``column`` and the
    per-batch counts are summed once all tasks have finished.

    Args:
        column: Name of the column to inspect.

    Returns:
        Total number of ``None`` values across all batches.
    """
    import ray

    @ray.remote
    def count_nulls_batch(batch: dict) -> int:
        # A batch that lacks the column contributes zero.
        return sum(1 for v in batch.get(column, []) if v is None)

    # NOTE(review): Ray's documented batch formats are "numpy", "pandas" and
    # "pyarrow" -- confirm that "pydict" is accepted by the targeted version.
    batch_iter = self._ds.iter_batches(
        batch_format="pydict",
        batch_size=self._config.batch_size,
    )
    pending = [count_nulls_batch.remote(batch) for batch in batch_iter]

    return sum(ray.get(pending))
584
+
585
+ def _count_distinct_column(self, column: str) -> int:
586
+ """Count distinct values in a column."""
587
+ # Use unique() which returns a dataset with unique values
588
+ unique_ds = self._ds.unique(column)
589
+ return unique_ds.count()
590
+
591
def _aggregate_with_aggregator(
    self,
    agg: Any,
) -> Any:
    """Run a custom aggregator over the dataset as a map-reduce job.

    Map phase: one Ray task per batch folds that batch's column values into
    a fresh aggregator state. Reduce phase: the partial states are merged
    pairwise and the merged state is finalized.

    Args:
        agg: Aggregation specification; its ``operation``, ``params`` and
            ``column`` attributes are read.

    Returns:
        Aggregated result as produced by the aggregator's ``finalize``.
    """
    import ray

    aggregator = get_aggregator(agg.operation, **agg.params)
    column = agg.column

    @ray.remote
    def map_batch(batch: dict) -> Any:
        partial = aggregator.initialize()
        # A batch that lacks the column leaves the initial state untouched.
        for item in batch.get(column, []):
            partial = aggregator.accumulate(partial, item)
        return partial

    batch_iter = self._ds.iter_batches(
        batch_format="pydict",
        batch_size=self._config.batch_size,
    )
    partial_states = ray.get([map_batch.remote(batch) for batch in batch_iter])

    if partial_states:
        merged = reduce(aggregator.merge, partial_states)
    else:
        # No batches at all: finalize an untouched initial state.
        merged = aggregator.initialize()

    return aggregator.finalize(merged)
633
+
634
def _to_arrow_batches(
    self,
    batch_size: int | None = None,
) -> list["pa.RecordBatch"]:
    """Collect the dataset as a list of Arrow record batches.

    Batches are requested from the dataset in ``pyarrow`` format. A yielded
    ``RecordBatch`` is kept as is; a yielded ``Table`` is split into record
    batches of at most ``batch_size`` rows. Any other type is ignored.

    Args:
        batch_size: Batch size for conversion; a falsy value selects the
            configured ``arrow_batch_size``.

    Returns:
        List of Arrow record batches.
    """
    import pyarrow as pa

    if not batch_size:
        batch_size = self._config.arrow_batch_size

    collected = []
    for chunk in self._ds.iter_batches(
        batch_format="pyarrow",
        batch_size=batch_size,
    ):
        if isinstance(chunk, pa.RecordBatch):
            collected.append(chunk)
            continue
        if isinstance(chunk, pa.Table):
            collected += chunk.to_batches(max_chunksize=batch_size)

    return collected
664
+
665
def _repartition(self, num_partitions: int) -> "RayExecutionEngine":
    """Build a new engine over a repartitioned copy of the dataset.

    Args:
        num_partitions: New number of partitions (blocks).

    Returns:
        New engine wrapping the repartitioned dataset, sharing this
        engine's configuration.
    """
    return RayExecutionEngine(
        self._ds.repartition(num_partitions),
        self._config,
    )
676
+
677
def coalesce(self, num_partitions: int) -> "RayExecutionEngine":
    """Build a new engine whose dataset has ``num_partitions`` blocks.

    Implemented with the dataset's ``repartition`` call, exactly like
    ``_repartition``.

    Args:
        num_partitions: New number of partitions.

    Returns:
        New engine wrapping the coalesced dataset, sharing this engine's
        configuration.
    """
    # Ray's repartition can reduce partitions without full shuffle
    return RayExecutionEngine(
        self._ds.repartition(num_partitions),
        self._config,
    )
689
+
690
+ # -------------------------------------------------------------------------
691
+ # Core Operation Overrides (Native Ray)
692
+ # -------------------------------------------------------------------------
693
+
694
def count_rows(self) -> int:
    """Return the dataset's row count, computing it at most once.

    Lookup order: the instance-level memo, then the keyed cache, then the
    dataset's own ``count()``. A freshly computed count is written to both
    the memo and the keyed cache.
    """
    memo = self._cached_row_count
    if memo is not None:
        return memo

    key = self._cache_key("count_rows")
    hit = self._get_cached(key)
    if hit is not None:
        return hit

    total = self._ds.count()
    self._cached_row_count = total
    self._set_cached(key, total)
    return total
708
+
709
def get_columns(self) -> list[str]:
    """Return the engine's stored column-name list (not a copy)."""
    column_names = self._columns
    return column_names
712
+
713
def count_nulls(self, column: str) -> int:
    """Return the cached null count for ``column``, computing it on a miss.

    Args:
        column: Name of the column to inspect.

    Returns:
        Number of null values in the column.
    """
    key = self._cache_key("count_nulls", column)
    hit = self._get_cached(key)
    if hit is None:
        hit = self._count_nulls_column(column)
        self._set_cached(key, hit)
    return hit
723
+
724
def count_nulls_all(self) -> dict[str, int]:
    """Count nulls in every known column.

    A cached result is returned when present. Otherwise one Ray task per
    batch counts the ``None`` entries of each column, and the per-batch
    counts are summed column by column.

    Returns:
        Mapping of column name to null count; columns start at zero.
    """
    import ray

    key = self._cache_key("count_nulls_all")
    hit = self._get_cached(key)
    if hit is not None:
        return hit

    # Compute null counts for all columns in parallel
    @ray.remote
    def count_batch_nulls(batch: dict, columns: list) -> dict[str, int]:
        return {
            col: sum(1 for v in batch.get(col, []) if v is None)
            for col in columns
        }

    batch_iter = self._ds.iter_batches(
        batch_format="pydict",
        batch_size=self._config.batch_size,
    )
    pending = [
        count_batch_nulls.remote(batch, self._columns)
        for batch in batch_iter
    ]

    # Merge results
    totals = dict.fromkeys(self._columns, 0)
    for partial in ray.get(pending):
        for col, nulls in partial.items():
            totals[col] += nulls

    self._set_cached(key, totals)
    return totals
760
+
761
def count_distinct(self, column: str) -> int:
    """Return the cached distinct count for ``column``, computing on a miss.

    Args:
        column: Name of the column to inspect.

    Returns:
        Number of distinct values in the column.
    """
    key = self._cache_key("count_distinct", column)
    hit = self._get_cached(key)
    if hit is None:
        hit = self._count_distinct_column(column)
        self._set_cached(key, hit)
    return hit
771
+
772
def get_stats(self, column: str) -> dict[str, Any]:
    """Return summary statistics for ``column``, cached per column.

    Args:
        column: Name of the column to summarize.

    Returns:
        Dict with the keys ``count`` (total dataset rows), ``null_count``,
        ``mean``, ``std``, ``min`` and ``max``.
    """
    key = self._cache_key("get_stats", column)
    hit = self._get_cached(key)
    if hit is not None:
        return hit

    # Each statistic is a separate call on the dataset, issued in key order.
    row_total = self._ds.count()
    nulls = self._count_nulls_column(column)
    mean = self._ds.mean(column)
    std = self._ds.std(column)
    lowest = self._ds.min(column)
    highest = self._ds.max(column)

    summary = {
        "count": row_total,
        "null_count": nulls,
        "mean": mean,
        "std": std,
        "min": lowest,
        "max": highest,
    }
    self._set_cached(key, summary)
    return summary
791
+
792
def get_value_counts(
    self,
    column: str,
    limit: int | None = None,
) -> dict[Any, int]:
    """Get value counts for a column, most frequent first.

    Args:
        column: Name of the column to group by.
        limit: Maximum number of entries to return. ``None`` or ``0``
            returns every value.

    Returns:
        Dict mapping each value to its count, ordered by descending count.
    """
    cache_key = self._cache_key("get_value_counts", column, limit)
    cached = self._get_cached(cache_key)
    if cached is not None:
        return cached

    # Use groupby with count
    grouped = self._ds.groupby(column).count()

    # The grouped result carries the counts under the "count()" key.
    counts: dict[Any, int] = {}
    for batch in grouped.iter_batches(batch_format="pydict"):
        counts.update(zip(batch[column], batch["count()"]))

    # Sort by count descending
    ranked = sorted(counts.items(), key=lambda x: x[1], reverse=True)

    if limit:
        ranked = ranked[:limit]

    sorted_counts = dict(ranked)
    self._set_cached(cache_key, sorted_counts)
    return sorted_counts
826
+
827
def count_duplicates(self, columns: list[str]) -> int:
    """Count rows that duplicate another row on the given columns.

    The result is the total row count minus the number of distinct value
    combinations across ``columns``.

    Args:
        columns: Column names that together define a row's identity.

    Returns:
        Number of duplicate rows.
    """
    cache_key = self._cache_key("count_duplicates", tuple(columns))
    cached = self._get_cached(cache_key)
    if cached is not None:
        return cached

    total = self.count_rows()

    if len(columns) == 1:
        # Ray's Dataset.unique is documented as returning a list, on which
        # a bare .count() call raises TypeError; measure its length instead.
        unique_values = self._ds.unique(columns[0])
        if isinstance(unique_values, (list, tuple, set, frozenset)):
            unique_count = len(unique_values)
        else:
            # Backward-compatible path for dataset-like results.
            unique_count = unique_values.count()
    else:
        # For multiple columns, use groupby: one output row per combination.
        unique_count = self._ds.groupby(columns).count().count()

    duplicates = total - unique_count
    self._set_cached(cache_key, duplicates)
    return duplicates
852
+
853
def count_matching_regex(self, column: str, pattern: str) -> int:
    """Count non-null values whose string form matches ``pattern``.

    Matching uses ``re.match``, so the pattern is anchored at the start of
    each value only. Results are cached per ``(column, pattern)``.

    Args:
        column: Name of the column to inspect.
        pattern: Regular expression source.

    Returns:
        Number of matching values across all batches.
    """
    import ray
    import re

    key = self._cache_key("count_matching_regex", column, pattern)
    hit = self._get_cached(key)
    if hit is not None:
        return hit

    # Compiled once here and captured by the task defined below.
    compiled = re.compile(pattern)

    @ray.remote
    def count_matches_batch(batch: dict) -> int:
        matched = 0
        for v in batch.get(column, []):
            if v is not None and compiled.match(str(v)):
                matched += 1
        return matched

    batch_iter = self._ds.iter_batches(
        batch_format="pydict",
        batch_size=self._config.batch_size,
    )
    pending = [count_matches_batch.remote(batch) for batch in batch_iter]
    total = sum(ray.get(pending))

    self._set_cached(key, total)
    return total
885
+
886
def count_in_range(
    self,
    column: str,
    min_value: Any | None = None,
    max_value: Any | None = None,
    inclusive: bool = True,
) -> int:
    """Count non-null values that lie between two optional bounds.

    Args:
        column: Name of the column to inspect.
        min_value: Lower bound; ``None`` leaves the range open below.
        max_value: Upper bound; ``None`` leaves the range open above.
        inclusive: Whether values equal to a bound count as inside.

    Returns:
        Number of values inside the range across all batches.
    """
    import ray

    key = self._cache_key(
        "count_in_range", column, min_value, max_value, inclusive
    )
    hit = self._get_cached(key)
    if hit is not None:
        return hit

    @ray.remote
    def count_range_batch(batch: dict) -> int:
        matched = 0
        for v in batch.get(column, []):
            if v is None:
                continue
            if min_value is not None:
                above = v >= min_value if inclusive else v > min_value
                if not above:
                    continue
            if max_value is not None:
                below = v <= max_value if inclusive else v < max_value
                if not below:
                    continue
            matched += 1
        return matched

    batch_iter = self._ds.iter_batches(
        batch_format="pydict",
        batch_size=self._config.batch_size,
    )
    pending = [count_range_batch.remote(batch) for batch in batch_iter]
    total = sum(ray.get(pending))

    self._set_cached(key, total)
    return total
931
+
932
def count_in_set(self, column: str, values: set[Any]) -> int:
    """Count column values that are members of ``values``.

    Results are cached per ``(column, frozenset(values))``.

    Args:
        column: Name of the column to inspect.
        values: Allowed values.

    Returns:
        Number of column entries found in ``values`` across all batches.
    """
    import ray

    key = self._cache_key("count_in_set", column, frozenset(values))
    hit = self._get_cached(key)
    if hit is not None:
        return hit

    # Private copy, so the tasks never capture the caller's own object.
    allowed = set(values)

    @ray.remote
    def count_in_set_batch(batch: dict) -> int:
        members = [v for v in batch.get(column, []) if v in allowed]
        return len(members)

    batch_iter = self._ds.iter_batches(
        batch_format="pydict",
        batch_size=self._config.batch_size,
    )
    pending = [count_in_set_batch.remote(batch) for batch in batch_iter]
    total = sum(ray.get(pending))

    self._set_cached(key, total)
    return total
960
+
961
+ # -------------------------------------------------------------------------
962
+ # Sampling
963
+ # -------------------------------------------------------------------------
964
+
965
def sample(
    self,
    n: int = 1000,
    seed: int | None = None,
) -> "RayExecutionEngine":
    """Create an engine over a random sample of at most ``n`` rows.

    When the dataset has no more than ``n`` rows, this engine itself is
    returned. Otherwise the dataset is sampled at a fraction 10% above
    ``n / row_count`` (capped at 1.0) and the result is limited to ``n``.

    Args:
        n: Target number of rows.
        seed: Random seed.

    Returns:
        This engine, or a new engine with sampled data.
    """
    total_rows = self.count_rows()
    if total_rows <= n:
        return self

    # Oversample slightly before the hard cap applied below.
    fraction = min((n * 1.1) / total_rows, 1.0)
    capped = self._ds.random_sample(fraction, seed=seed).limit(n)

    return RayExecutionEngine(capped, self._config)
993
+
994
+ # -------------------------------------------------------------------------
995
+ # Ray-Specific Methods
996
+ # -------------------------------------------------------------------------
997
+
998
def materialize(self) -> "RayExecutionEngine":
    """Replace the wrapped dataset with its materialized form.

    Returns:
        This same engine, now holding the dataset's ``materialize()`` result.
    """
    materialized = self._ds.materialize()
    self._ds = materialized
    return self
1006
+
1007
def filter(
    self,
    fn: Callable[[dict[str, Any]], bool],
) -> "RayExecutionEngine":
    """Build a new engine over the rows accepted by ``fn``.

    Args:
        fn: Filter function that takes a row dict and returns bool.

    Returns:
        New engine wrapping the filtered dataset, sharing this engine's
        configuration.
    """
    kept = self._ds.filter(fn)
    return RayExecutionEngine(kept, self._config)
1021
+
1022
def select_columns(self, columns: list[str]) -> "RayExecutionEngine":
    """Build a new engine restricted to the given columns.

    Args:
        columns: Columns to select.

    Returns:
        New engine wrapping the projected dataset, sharing this engine's
        configuration.
    """
    projected = self._ds.select_columns(columns)
    return RayExecutionEngine(projected, self._config)
1033
+
1034
def take(self, n: int = 5) -> list[dict[str, Any]]:
    """Return the dataset's ``take(n)`` result.

    Args:
        n: Number of rows.

    Returns:
        List of row dictionaries.
    """
    rows = self._ds.take(n)
    return rows
1044
+
1045
def take_all(self) -> list[dict[str, Any]]:
    """Return the dataset's ``take_all()`` result.

    Returns:
        List of row dictionaries.
    """
    rows = self._ds.take_all()
    return rows
1052
+
1053
def show(self, n: int = 20) -> None:
    """Delegate to the dataset's ``show`` to display the first rows.

    Args:
        n: Number of rows to show.
    """
    dataset = self._ds
    dataset.show(n)
1060
+
1061
def to_pandas(self) -> Any:
    """Return the dataset's ``to_pandas()`` result.

    Returns:
        Pandas DataFrame.
    """
    frame = self._ds.to_pandas()
    return frame
1068
+
1069
def to_arrow(self) -> "pa.Table":
    """Return the dataset's ``to_arrow()`` result.

    Returns:
        PyArrow Table.
    """
    table = self._ds.to_arrow()
    return table
1076
+
1077
def write_parquet(self, path: str, **kwargs: Any) -> None:
    """Delegate to the dataset's ``write_parquet``.

    Args:
        path: Output path.
        **kwargs: Extra keyword arguments forwarded unchanged.
    """
    dataset = self._ds
    dataset.write_parquet(path, **kwargs)
1085
+
1086
def write_csv(self, path: str, **kwargs: Any) -> None:
    """Delegate to the dataset's ``write_csv``.

    Args:
        path: Output path.
        **kwargs: Extra keyword arguments forwarded unchanged.
    """
    dataset = self._ds
    dataset.write_csv(path, **kwargs)
1094
+
1095
def stats(self) -> str:
    """Return the dataset's ``stats()`` result.

    Returns:
        Statistics string.
    """
    report = self._ds.stats()
    return report
1102
+
1103
def schema_str(self) -> str:
    """Return ``str()`` of the engine's stored schema.

    Returns:
        Schema string.
    """
    schema = self._schema
    return str(schema)
1110
+
1111
+ # -------------------------------------------------------------------------
1112
+ # Context Manager
1113
+ # -------------------------------------------------------------------------
1114
+
1115
def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
    """Leave the context by running the parent class's exit logic only.

    Args:
        exc_type: Exception type, as passed by the ``with`` statement.
        exc_val: Exception value.
        exc_tb: Exception traceback.
    """
    # Ray is deliberately left running: the runtime might be shared, so
    # users should manage the Ray lifecycle separately (see shutdown()).
    super().__exit__(exc_type, exc_val, exc_tb)
1120
+
1121
@staticmethod
def shutdown() -> None:
    """Shut down Ray if it is currently initialized; otherwise do nothing."""
    import ray

    if not ray.is_initialized():
        return
    ray.shutdown()