truthound 1.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (877)
  1. truthound/__init__.py +162 -0
  2. truthound/adapters.py +100 -0
  3. truthound/api.py +365 -0
  4. truthound/audit/__init__.py +248 -0
  5. truthound/audit/core.py +967 -0
  6. truthound/audit/filters.py +620 -0
  7. truthound/audit/formatters.py +707 -0
  8. truthound/audit/logger.py +902 -0
  9. truthound/audit/middleware.py +571 -0
  10. truthound/audit/storage.py +1083 -0
  11. truthound/benchmark/__init__.py +123 -0
  12. truthound/benchmark/base.py +757 -0
  13. truthound/benchmark/comparison.py +635 -0
  14. truthound/benchmark/generators.py +706 -0
  15. truthound/benchmark/reporters.py +718 -0
  16. truthound/benchmark/runner.py +635 -0
  17. truthound/benchmark/scenarios.py +712 -0
  18. truthound/cache.py +252 -0
  19. truthound/checkpoint/__init__.py +136 -0
  20. truthound/checkpoint/actions/__init__.py +164 -0
  21. truthound/checkpoint/actions/base.py +324 -0
  22. truthound/checkpoint/actions/custom.py +234 -0
  23. truthound/checkpoint/actions/discord_notify.py +290 -0
  24. truthound/checkpoint/actions/email_notify.py +405 -0
  25. truthound/checkpoint/actions/github_action.py +406 -0
  26. truthound/checkpoint/actions/opsgenie.py +1499 -0
  27. truthound/checkpoint/actions/pagerduty.py +226 -0
  28. truthound/checkpoint/actions/slack_notify.py +233 -0
  29. truthound/checkpoint/actions/store_result.py +249 -0
  30. truthound/checkpoint/actions/teams_notify.py +1570 -0
  31. truthound/checkpoint/actions/telegram_notify.py +419 -0
  32. truthound/checkpoint/actions/update_docs.py +552 -0
  33. truthound/checkpoint/actions/webhook.py +293 -0
  34. truthound/checkpoint/analytics/__init__.py +147 -0
  35. truthound/checkpoint/analytics/aggregations/__init__.py +23 -0
  36. truthound/checkpoint/analytics/aggregations/rollup.py +481 -0
  37. truthound/checkpoint/analytics/aggregations/time_bucket.py +306 -0
  38. truthound/checkpoint/analytics/analyzers/__init__.py +17 -0
  39. truthound/checkpoint/analytics/analyzers/anomaly.py +386 -0
  40. truthound/checkpoint/analytics/analyzers/base.py +270 -0
  41. truthound/checkpoint/analytics/analyzers/forecast.py +421 -0
  42. truthound/checkpoint/analytics/analyzers/trend.py +314 -0
  43. truthound/checkpoint/analytics/models.py +292 -0
  44. truthound/checkpoint/analytics/protocols.py +549 -0
  45. truthound/checkpoint/analytics/service.py +718 -0
  46. truthound/checkpoint/analytics/stores/__init__.py +16 -0
  47. truthound/checkpoint/analytics/stores/base.py +306 -0
  48. truthound/checkpoint/analytics/stores/memory_store.py +353 -0
  49. truthound/checkpoint/analytics/stores/sqlite_store.py +557 -0
  50. truthound/checkpoint/analytics/stores/timescale_store.py +501 -0
  51. truthound/checkpoint/async_actions.py +794 -0
  52. truthound/checkpoint/async_base.py +708 -0
  53. truthound/checkpoint/async_checkpoint.py +617 -0
  54. truthound/checkpoint/async_runner.py +639 -0
  55. truthound/checkpoint/checkpoint.py +527 -0
  56. truthound/checkpoint/ci/__init__.py +61 -0
  57. truthound/checkpoint/ci/detector.py +355 -0
  58. truthound/checkpoint/ci/reporter.py +436 -0
  59. truthound/checkpoint/ci/templates.py +454 -0
  60. truthound/checkpoint/circuitbreaker/__init__.py +133 -0
  61. truthound/checkpoint/circuitbreaker/breaker.py +542 -0
  62. truthound/checkpoint/circuitbreaker/core.py +252 -0
  63. truthound/checkpoint/circuitbreaker/detection.py +459 -0
  64. truthound/checkpoint/circuitbreaker/middleware.py +389 -0
  65. truthound/checkpoint/circuitbreaker/registry.py +357 -0
  66. truthound/checkpoint/distributed/__init__.py +139 -0
  67. truthound/checkpoint/distributed/backends/__init__.py +35 -0
  68. truthound/checkpoint/distributed/backends/celery_backend.py +503 -0
  69. truthound/checkpoint/distributed/backends/kubernetes_backend.py +696 -0
  70. truthound/checkpoint/distributed/backends/local_backend.py +397 -0
  71. truthound/checkpoint/distributed/backends/ray_backend.py +625 -0
  72. truthound/checkpoint/distributed/base.py +774 -0
  73. truthound/checkpoint/distributed/orchestrator.py +765 -0
  74. truthound/checkpoint/distributed/protocols.py +842 -0
  75. truthound/checkpoint/distributed/registry.py +449 -0
  76. truthound/checkpoint/idempotency/__init__.py +120 -0
  77. truthound/checkpoint/idempotency/core.py +295 -0
  78. truthound/checkpoint/idempotency/fingerprint.py +454 -0
  79. truthound/checkpoint/idempotency/locking.py +604 -0
  80. truthound/checkpoint/idempotency/service.py +592 -0
  81. truthound/checkpoint/idempotency/stores.py +653 -0
  82. truthound/checkpoint/monitoring/__init__.py +134 -0
  83. truthound/checkpoint/monitoring/aggregators/__init__.py +15 -0
  84. truthound/checkpoint/monitoring/aggregators/base.py +372 -0
  85. truthound/checkpoint/monitoring/aggregators/realtime.py +300 -0
  86. truthound/checkpoint/monitoring/aggregators/window.py +493 -0
  87. truthound/checkpoint/monitoring/collectors/__init__.py +17 -0
  88. truthound/checkpoint/monitoring/collectors/base.py +257 -0
  89. truthound/checkpoint/monitoring/collectors/memory_collector.py +617 -0
  90. truthound/checkpoint/monitoring/collectors/prometheus_collector.py +451 -0
  91. truthound/checkpoint/monitoring/collectors/redis_collector.py +518 -0
  92. truthound/checkpoint/monitoring/events.py +410 -0
  93. truthound/checkpoint/monitoring/protocols.py +636 -0
  94. truthound/checkpoint/monitoring/service.py +578 -0
  95. truthound/checkpoint/monitoring/views/__init__.py +17 -0
  96. truthound/checkpoint/monitoring/views/base.py +172 -0
  97. truthound/checkpoint/monitoring/views/queue_view.py +220 -0
  98. truthound/checkpoint/monitoring/views/task_view.py +240 -0
  99. truthound/checkpoint/monitoring/views/worker_view.py +263 -0
  100. truthound/checkpoint/registry.py +337 -0
  101. truthound/checkpoint/runner.py +356 -0
  102. truthound/checkpoint/transaction/__init__.py +133 -0
  103. truthound/checkpoint/transaction/base.py +389 -0
  104. truthound/checkpoint/transaction/compensatable.py +537 -0
  105. truthound/checkpoint/transaction/coordinator.py +576 -0
  106. truthound/checkpoint/transaction/executor.py +622 -0
  107. truthound/checkpoint/transaction/idempotency.py +534 -0
  108. truthound/checkpoint/transaction/saga/__init__.py +143 -0
  109. truthound/checkpoint/transaction/saga/builder.py +584 -0
  110. truthound/checkpoint/transaction/saga/definition.py +515 -0
  111. truthound/checkpoint/transaction/saga/event_store.py +542 -0
  112. truthound/checkpoint/transaction/saga/patterns.py +833 -0
  113. truthound/checkpoint/transaction/saga/runner.py +718 -0
  114. truthound/checkpoint/transaction/saga/state_machine.py +793 -0
  115. truthound/checkpoint/transaction/saga/strategies.py +780 -0
  116. truthound/checkpoint/transaction/saga/testing.py +886 -0
  117. truthound/checkpoint/triggers/__init__.py +58 -0
  118. truthound/checkpoint/triggers/base.py +237 -0
  119. truthound/checkpoint/triggers/event.py +385 -0
  120. truthound/checkpoint/triggers/schedule.py +355 -0
  121. truthound/cli.py +2358 -0
  122. truthound/cli_modules/__init__.py +124 -0
  123. truthound/cli_modules/advanced/__init__.py +45 -0
  124. truthound/cli_modules/advanced/benchmark.py +343 -0
  125. truthound/cli_modules/advanced/docs.py +225 -0
  126. truthound/cli_modules/advanced/lineage.py +209 -0
  127. truthound/cli_modules/advanced/ml.py +320 -0
  128. truthound/cli_modules/advanced/realtime.py +196 -0
  129. truthound/cli_modules/checkpoint/__init__.py +46 -0
  130. truthound/cli_modules/checkpoint/init.py +114 -0
  131. truthound/cli_modules/checkpoint/list.py +71 -0
  132. truthound/cli_modules/checkpoint/run.py +159 -0
  133. truthound/cli_modules/checkpoint/validate.py +67 -0
  134. truthound/cli_modules/common/__init__.py +71 -0
  135. truthound/cli_modules/common/errors.py +414 -0
  136. truthound/cli_modules/common/options.py +419 -0
  137. truthound/cli_modules/common/output.py +507 -0
  138. truthound/cli_modules/common/protocol.py +552 -0
  139. truthound/cli_modules/core/__init__.py +48 -0
  140. truthound/cli_modules/core/check.py +123 -0
  141. truthound/cli_modules/core/compare.py +104 -0
  142. truthound/cli_modules/core/learn.py +57 -0
  143. truthound/cli_modules/core/mask.py +77 -0
  144. truthound/cli_modules/core/profile.py +65 -0
  145. truthound/cli_modules/core/scan.py +61 -0
  146. truthound/cli_modules/profiler/__init__.py +51 -0
  147. truthound/cli_modules/profiler/auto_profile.py +175 -0
  148. truthound/cli_modules/profiler/metadata.py +107 -0
  149. truthound/cli_modules/profiler/suite.py +283 -0
  150. truthound/cli_modules/registry.py +431 -0
  151. truthound/cli_modules/scaffolding/__init__.py +89 -0
  152. truthound/cli_modules/scaffolding/base.py +631 -0
  153. truthound/cli_modules/scaffolding/commands.py +545 -0
  154. truthound/cli_modules/scaffolding/plugins.py +1072 -0
  155. truthound/cli_modules/scaffolding/reporters.py +594 -0
  156. truthound/cli_modules/scaffolding/validators.py +1127 -0
  157. truthound/common/__init__.py +18 -0
  158. truthound/common/resilience/__init__.py +130 -0
  159. truthound/common/resilience/bulkhead.py +266 -0
  160. truthound/common/resilience/circuit_breaker.py +516 -0
  161. truthound/common/resilience/composite.py +332 -0
  162. truthound/common/resilience/config.py +292 -0
  163. truthound/common/resilience/protocols.py +217 -0
  164. truthound/common/resilience/rate_limiter.py +404 -0
  165. truthound/common/resilience/retry.py +341 -0
  166. truthound/datadocs/__init__.py +260 -0
  167. truthound/datadocs/base.py +571 -0
  168. truthound/datadocs/builder.py +761 -0
  169. truthound/datadocs/charts.py +764 -0
  170. truthound/datadocs/dashboard/__init__.py +63 -0
  171. truthound/datadocs/dashboard/app.py +576 -0
  172. truthound/datadocs/dashboard/components.py +584 -0
  173. truthound/datadocs/dashboard/state.py +240 -0
  174. truthound/datadocs/engine/__init__.py +46 -0
  175. truthound/datadocs/engine/context.py +376 -0
  176. truthound/datadocs/engine/pipeline.py +618 -0
  177. truthound/datadocs/engine/registry.py +469 -0
  178. truthound/datadocs/exporters/__init__.py +49 -0
  179. truthound/datadocs/exporters/base.py +198 -0
  180. truthound/datadocs/exporters/html.py +178 -0
  181. truthound/datadocs/exporters/json_exporter.py +253 -0
  182. truthound/datadocs/exporters/markdown.py +284 -0
  183. truthound/datadocs/exporters/pdf.py +392 -0
  184. truthound/datadocs/i18n/__init__.py +86 -0
  185. truthound/datadocs/i18n/catalog.py +960 -0
  186. truthound/datadocs/i18n/formatting.py +505 -0
  187. truthound/datadocs/i18n/loader.py +256 -0
  188. truthound/datadocs/i18n/plurals.py +378 -0
  189. truthound/datadocs/renderers/__init__.py +42 -0
  190. truthound/datadocs/renderers/base.py +401 -0
  191. truthound/datadocs/renderers/custom.py +342 -0
  192. truthound/datadocs/renderers/jinja.py +697 -0
  193. truthound/datadocs/sections.py +736 -0
  194. truthound/datadocs/styles.py +931 -0
  195. truthound/datadocs/themes/__init__.py +101 -0
  196. truthound/datadocs/themes/base.py +336 -0
  197. truthound/datadocs/themes/default.py +417 -0
  198. truthound/datadocs/themes/enterprise.py +419 -0
  199. truthound/datadocs/themes/loader.py +336 -0
  200. truthound/datadocs/themes.py +301 -0
  201. truthound/datadocs/transformers/__init__.py +57 -0
  202. truthound/datadocs/transformers/base.py +268 -0
  203. truthound/datadocs/transformers/enrichers.py +544 -0
  204. truthound/datadocs/transformers/filters.py +447 -0
  205. truthound/datadocs/transformers/i18n.py +468 -0
  206. truthound/datadocs/versioning/__init__.py +62 -0
  207. truthound/datadocs/versioning/diff.py +639 -0
  208. truthound/datadocs/versioning/storage.py +497 -0
  209. truthound/datadocs/versioning/version.py +358 -0
  210. truthound/datasources/__init__.py +223 -0
  211. truthound/datasources/_async_protocols.py +222 -0
  212. truthound/datasources/_protocols.py +159 -0
  213. truthound/datasources/adapters.py +428 -0
  214. truthound/datasources/async_base.py +599 -0
  215. truthound/datasources/async_factory.py +511 -0
  216. truthound/datasources/base.py +516 -0
  217. truthound/datasources/factory.py +433 -0
  218. truthound/datasources/nosql/__init__.py +47 -0
  219. truthound/datasources/nosql/base.py +487 -0
  220. truthound/datasources/nosql/elasticsearch.py +801 -0
  221. truthound/datasources/nosql/mongodb.py +636 -0
  222. truthound/datasources/pandas_optimized.py +582 -0
  223. truthound/datasources/pandas_source.py +216 -0
  224. truthound/datasources/polars_source.py +395 -0
  225. truthound/datasources/spark_source.py +479 -0
  226. truthound/datasources/sql/__init__.py +154 -0
  227. truthound/datasources/sql/base.py +710 -0
  228. truthound/datasources/sql/bigquery.py +410 -0
  229. truthound/datasources/sql/cloud_base.py +199 -0
  230. truthound/datasources/sql/databricks.py +471 -0
  231. truthound/datasources/sql/mysql.py +316 -0
  232. truthound/datasources/sql/oracle.py +427 -0
  233. truthound/datasources/sql/postgresql.py +321 -0
  234. truthound/datasources/sql/redshift.py +479 -0
  235. truthound/datasources/sql/snowflake.py +439 -0
  236. truthound/datasources/sql/sqlite.py +286 -0
  237. truthound/datasources/sql/sqlserver.py +437 -0
  238. truthound/datasources/streaming/__init__.py +47 -0
  239. truthound/datasources/streaming/base.py +350 -0
  240. truthound/datasources/streaming/kafka.py +670 -0
  241. truthound/decorators.py +98 -0
  242. truthound/docs/__init__.py +69 -0
  243. truthound/docs/extractor.py +971 -0
  244. truthound/docs/generator.py +601 -0
  245. truthound/docs/parser.py +1037 -0
  246. truthound/docs/renderer.py +999 -0
  247. truthound/drift/__init__.py +22 -0
  248. truthound/drift/compare.py +189 -0
  249. truthound/drift/detectors.py +464 -0
  250. truthound/drift/report.py +160 -0
  251. truthound/execution/__init__.py +65 -0
  252. truthound/execution/_protocols.py +324 -0
  253. truthound/execution/base.py +576 -0
  254. truthound/execution/distributed/__init__.py +179 -0
  255. truthound/execution/distributed/aggregations.py +731 -0
  256. truthound/execution/distributed/arrow_bridge.py +817 -0
  257. truthound/execution/distributed/base.py +550 -0
  258. truthound/execution/distributed/dask_engine.py +976 -0
  259. truthound/execution/distributed/mixins.py +766 -0
  260. truthound/execution/distributed/protocols.py +756 -0
  261. truthound/execution/distributed/ray_engine.py +1127 -0
  262. truthound/execution/distributed/registry.py +446 -0
  263. truthound/execution/distributed/spark_engine.py +1011 -0
  264. truthound/execution/distributed/validator_adapter.py +682 -0
  265. truthound/execution/pandas_engine.py +401 -0
  266. truthound/execution/polars_engine.py +497 -0
  267. truthound/execution/pushdown/__init__.py +230 -0
  268. truthound/execution/pushdown/ast.py +1550 -0
  269. truthound/execution/pushdown/builder.py +1550 -0
  270. truthound/execution/pushdown/dialects.py +1072 -0
  271. truthound/execution/pushdown/executor.py +829 -0
  272. truthound/execution/pushdown/optimizer.py +1041 -0
  273. truthound/execution/sql_engine.py +518 -0
  274. truthound/infrastructure/__init__.py +189 -0
  275. truthound/infrastructure/audit.py +1515 -0
  276. truthound/infrastructure/config.py +1133 -0
  277. truthound/infrastructure/encryption.py +1132 -0
  278. truthound/infrastructure/logging.py +1503 -0
  279. truthound/infrastructure/metrics.py +1220 -0
  280. truthound/lineage/__init__.py +89 -0
  281. truthound/lineage/base.py +746 -0
  282. truthound/lineage/impact_analysis.py +474 -0
  283. truthound/lineage/integrations/__init__.py +22 -0
  284. truthound/lineage/integrations/openlineage.py +548 -0
  285. truthound/lineage/tracker.py +512 -0
  286. truthound/lineage/visualization/__init__.py +33 -0
  287. truthound/lineage/visualization/protocols.py +145 -0
  288. truthound/lineage/visualization/renderers/__init__.py +20 -0
  289. truthound/lineage/visualization/renderers/cytoscape.py +329 -0
  290. truthound/lineage/visualization/renderers/d3.py +331 -0
  291. truthound/lineage/visualization/renderers/graphviz.py +276 -0
  292. truthound/lineage/visualization/renderers/mermaid.py +308 -0
  293. truthound/maskers.py +113 -0
  294. truthound/ml/__init__.py +124 -0
  295. truthound/ml/anomaly_models/__init__.py +31 -0
  296. truthound/ml/anomaly_models/ensemble.py +362 -0
  297. truthound/ml/anomaly_models/isolation_forest.py +444 -0
  298. truthound/ml/anomaly_models/statistical.py +392 -0
  299. truthound/ml/base.py +1178 -0
  300. truthound/ml/drift_detection/__init__.py +26 -0
  301. truthound/ml/drift_detection/concept.py +381 -0
  302. truthound/ml/drift_detection/distribution.py +361 -0
  303. truthound/ml/drift_detection/feature.py +442 -0
  304. truthound/ml/drift_detection/multivariate.py +495 -0
  305. truthound/ml/monitoring/__init__.py +88 -0
  306. truthound/ml/monitoring/alerting/__init__.py +33 -0
  307. truthound/ml/monitoring/alerting/handlers.py +427 -0
  308. truthound/ml/monitoring/alerting/rules.py +508 -0
  309. truthound/ml/monitoring/collectors/__init__.py +19 -0
  310. truthound/ml/monitoring/collectors/composite.py +105 -0
  311. truthound/ml/monitoring/collectors/drift.py +324 -0
  312. truthound/ml/monitoring/collectors/performance.py +179 -0
  313. truthound/ml/monitoring/collectors/quality.py +369 -0
  314. truthound/ml/monitoring/monitor.py +536 -0
  315. truthound/ml/monitoring/protocols.py +451 -0
  316. truthound/ml/monitoring/stores/__init__.py +15 -0
  317. truthound/ml/monitoring/stores/memory.py +201 -0
  318. truthound/ml/monitoring/stores/prometheus.py +296 -0
  319. truthound/ml/rule_learning/__init__.py +25 -0
  320. truthound/ml/rule_learning/constraint_miner.py +443 -0
  321. truthound/ml/rule_learning/pattern_learner.py +499 -0
  322. truthound/ml/rule_learning/profile_learner.py +462 -0
  323. truthound/multitenancy/__init__.py +326 -0
  324. truthound/multitenancy/core.py +852 -0
  325. truthound/multitenancy/integration.py +597 -0
  326. truthound/multitenancy/isolation.py +630 -0
  327. truthound/multitenancy/manager.py +770 -0
  328. truthound/multitenancy/middleware.py +765 -0
  329. truthound/multitenancy/quota.py +537 -0
  330. truthound/multitenancy/resolvers.py +603 -0
  331. truthound/multitenancy/storage.py +703 -0
  332. truthound/observability/__init__.py +307 -0
  333. truthound/observability/context.py +531 -0
  334. truthound/observability/instrumentation.py +611 -0
  335. truthound/observability/logging.py +887 -0
  336. truthound/observability/metrics.py +1157 -0
  337. truthound/observability/tracing/__init__.py +178 -0
  338. truthound/observability/tracing/baggage.py +310 -0
  339. truthound/observability/tracing/config.py +426 -0
  340. truthound/observability/tracing/exporter.py +787 -0
  341. truthound/observability/tracing/integration.py +1018 -0
  342. truthound/observability/tracing/otel/__init__.py +146 -0
  343. truthound/observability/tracing/otel/adapter.py +982 -0
  344. truthound/observability/tracing/otel/bridge.py +1177 -0
  345. truthound/observability/tracing/otel/compat.py +681 -0
  346. truthound/observability/tracing/otel/config.py +691 -0
  347. truthound/observability/tracing/otel/detection.py +327 -0
  348. truthound/observability/tracing/otel/protocols.py +426 -0
  349. truthound/observability/tracing/processor.py +561 -0
  350. truthound/observability/tracing/propagator.py +757 -0
  351. truthound/observability/tracing/provider.py +569 -0
  352. truthound/observability/tracing/resource.py +515 -0
  353. truthound/observability/tracing/sampler.py +487 -0
  354. truthound/observability/tracing/span.py +676 -0
  355. truthound/plugins/__init__.py +198 -0
  356. truthound/plugins/base.py +599 -0
  357. truthound/plugins/cli.py +680 -0
  358. truthound/plugins/dependencies/__init__.py +42 -0
  359. truthound/plugins/dependencies/graph.py +422 -0
  360. truthound/plugins/dependencies/resolver.py +417 -0
  361. truthound/plugins/discovery.py +379 -0
  362. truthound/plugins/docs/__init__.py +46 -0
  363. truthound/plugins/docs/extractor.py +444 -0
  364. truthound/plugins/docs/renderer.py +499 -0
  365. truthound/plugins/enterprise_manager.py +877 -0
  366. truthound/plugins/examples/__init__.py +19 -0
  367. truthound/plugins/examples/custom_validators.py +317 -0
  368. truthound/plugins/examples/slack_notifier.py +312 -0
  369. truthound/plugins/examples/xml_reporter.py +254 -0
  370. truthound/plugins/hooks.py +558 -0
  371. truthound/plugins/lifecycle/__init__.py +43 -0
  372. truthound/plugins/lifecycle/hot_reload.py +402 -0
  373. truthound/plugins/lifecycle/manager.py +371 -0
  374. truthound/plugins/manager.py +736 -0
  375. truthound/plugins/registry.py +338 -0
  376. truthound/plugins/security/__init__.py +93 -0
  377. truthound/plugins/security/exceptions.py +332 -0
  378. truthound/plugins/security/policies.py +348 -0
  379. truthound/plugins/security/protocols.py +643 -0
  380. truthound/plugins/security/sandbox/__init__.py +45 -0
  381. truthound/plugins/security/sandbox/context.py +158 -0
  382. truthound/plugins/security/sandbox/engines/__init__.py +19 -0
  383. truthound/plugins/security/sandbox/engines/container.py +379 -0
  384. truthound/plugins/security/sandbox/engines/noop.py +144 -0
  385. truthound/plugins/security/sandbox/engines/process.py +336 -0
  386. truthound/plugins/security/sandbox/factory.py +211 -0
  387. truthound/plugins/security/signing/__init__.py +57 -0
  388. truthound/plugins/security/signing/service.py +330 -0
  389. truthound/plugins/security/signing/trust_store.py +368 -0
  390. truthound/plugins/security/signing/verifier.py +459 -0
  391. truthound/plugins/versioning/__init__.py +41 -0
  392. truthound/plugins/versioning/constraints.py +297 -0
  393. truthound/plugins/versioning/resolver.py +329 -0
  394. truthound/profiler/__init__.py +1729 -0
  395. truthound/profiler/_lazy.py +452 -0
  396. truthound/profiler/ab_testing/__init__.py +80 -0
  397. truthound/profiler/ab_testing/analysis.py +449 -0
  398. truthound/profiler/ab_testing/base.py +257 -0
  399. truthound/profiler/ab_testing/experiment.py +395 -0
  400. truthound/profiler/ab_testing/tracking.py +368 -0
  401. truthound/profiler/auto_threshold.py +1170 -0
  402. truthound/profiler/base.py +579 -0
  403. truthound/profiler/cache_patterns.py +911 -0
  404. truthound/profiler/caching.py +1303 -0
  405. truthound/profiler/column_profiler.py +712 -0
  406. truthound/profiler/comparison.py +1007 -0
  407. truthound/profiler/custom_patterns.py +1170 -0
  408. truthound/profiler/dashboard/__init__.py +50 -0
  409. truthound/profiler/dashboard/app.py +476 -0
  410. truthound/profiler/dashboard/components.py +457 -0
  411. truthound/profiler/dashboard/config.py +72 -0
  412. truthound/profiler/distributed/__init__.py +83 -0
  413. truthound/profiler/distributed/base.py +281 -0
  414. truthound/profiler/distributed/dask_backend.py +498 -0
  415. truthound/profiler/distributed/local_backend.py +293 -0
  416. truthound/profiler/distributed/profiler.py +304 -0
  417. truthound/profiler/distributed/ray_backend.py +374 -0
  418. truthound/profiler/distributed/spark_backend.py +375 -0
  419. truthound/profiler/distributed.py +1366 -0
  420. truthound/profiler/enterprise_sampling.py +1065 -0
  421. truthound/profiler/errors.py +488 -0
  422. truthound/profiler/evolution/__init__.py +91 -0
  423. truthound/profiler/evolution/alerts.py +426 -0
  424. truthound/profiler/evolution/changes.py +206 -0
  425. truthound/profiler/evolution/compatibility.py +365 -0
  426. truthound/profiler/evolution/detector.py +372 -0
  427. truthound/profiler/evolution/protocols.py +121 -0
  428. truthound/profiler/generators/__init__.py +48 -0
  429. truthound/profiler/generators/base.py +384 -0
  430. truthound/profiler/generators/ml_rules.py +375 -0
  431. truthound/profiler/generators/pattern_rules.py +384 -0
  432. truthound/profiler/generators/schema_rules.py +267 -0
  433. truthound/profiler/generators/stats_rules.py +324 -0
  434. truthound/profiler/generators/suite_generator.py +857 -0
  435. truthound/profiler/i18n.py +1542 -0
  436. truthound/profiler/incremental.py +554 -0
  437. truthound/profiler/incremental_validation.py +1710 -0
  438. truthound/profiler/integration/__init__.py +73 -0
  439. truthound/profiler/integration/adapters.py +345 -0
  440. truthound/profiler/integration/context.py +371 -0
  441. truthound/profiler/integration/executor.py +527 -0
  442. truthound/profiler/integration/naming.py +75 -0
  443. truthound/profiler/integration/protocols.py +243 -0
  444. truthound/profiler/memory.py +1185 -0
  445. truthound/profiler/migration/__init__.py +60 -0
  446. truthound/profiler/migration/base.py +345 -0
  447. truthound/profiler/migration/manager.py +444 -0
  448. truthound/profiler/migration/v1_0_to_v1_1.py +484 -0
  449. truthound/profiler/ml/__init__.py +73 -0
  450. truthound/profiler/ml/base.py +244 -0
  451. truthound/profiler/ml/classifier.py +507 -0
  452. truthound/profiler/ml/feature_extraction.py +604 -0
  453. truthound/profiler/ml/pretrained.py +448 -0
  454. truthound/profiler/ml_inference.py +1276 -0
  455. truthound/profiler/native_patterns.py +815 -0
  456. truthound/profiler/observability.py +1184 -0
  457. truthound/profiler/process_timeout.py +1566 -0
  458. truthound/profiler/progress.py +568 -0
  459. truthound/profiler/progress_callbacks.py +1734 -0
  460. truthound/profiler/quality.py +1345 -0
  461. truthound/profiler/resilience.py +1180 -0
  462. truthound/profiler/sampled_matcher.py +794 -0
  463. truthound/profiler/sampling.py +1288 -0
  464. truthound/profiler/scheduling/__init__.py +82 -0
  465. truthound/profiler/scheduling/protocols.py +214 -0
  466. truthound/profiler/scheduling/scheduler.py +474 -0
  467. truthound/profiler/scheduling/storage.py +457 -0
  468. truthound/profiler/scheduling/triggers.py +449 -0
  469. truthound/profiler/schema.py +603 -0
  470. truthound/profiler/streaming.py +685 -0
  471. truthound/profiler/streaming_patterns.py +1354 -0
  472. truthound/profiler/suite_cli.py +625 -0
  473. truthound/profiler/suite_config.py +789 -0
  474. truthound/profiler/suite_export.py +1268 -0
  475. truthound/profiler/table_profiler.py +547 -0
  476. truthound/profiler/timeout.py +565 -0
  477. truthound/profiler/validation.py +1532 -0
  478. truthound/profiler/visualization/__init__.py +118 -0
  479. truthound/profiler/visualization/base.py +346 -0
  480. truthound/profiler/visualization/generator.py +1259 -0
  481. truthound/profiler/visualization/plotly_renderer.py +811 -0
  482. truthound/profiler/visualization/renderers.py +669 -0
  483. truthound/profiler/visualization/sections.py +540 -0
  484. truthound/profiler/visualization.py +2122 -0
  485. truthound/profiler/yaml_validation.py +1151 -0
  486. truthound/py.typed +0 -0
  487. truthound/ratelimit/__init__.py +248 -0
  488. truthound/ratelimit/algorithms.py +1108 -0
  489. truthound/ratelimit/core.py +573 -0
  490. truthound/ratelimit/integration.py +532 -0
  491. truthound/ratelimit/limiter.py +663 -0
  492. truthound/ratelimit/middleware.py +700 -0
  493. truthound/ratelimit/policy.py +792 -0
  494. truthound/ratelimit/storage.py +763 -0
  495. truthound/rbac/__init__.py +340 -0
  496. truthound/rbac/core.py +976 -0
  497. truthound/rbac/integration.py +760 -0
  498. truthound/rbac/manager.py +1052 -0
  499. truthound/rbac/middleware.py +842 -0
  500. truthound/rbac/policy.py +954 -0
  501. truthound/rbac/storage.py +878 -0
  502. truthound/realtime/__init__.py +141 -0
  503. truthound/realtime/adapters/__init__.py +43 -0
  504. truthound/realtime/adapters/base.py +533 -0
  505. truthound/realtime/adapters/kafka.py +487 -0
  506. truthound/realtime/adapters/kinesis.py +479 -0
  507. truthound/realtime/adapters/mock.py +243 -0
  508. truthound/realtime/base.py +553 -0
  509. truthound/realtime/factory.py +382 -0
  510. truthound/realtime/incremental.py +660 -0
  511. truthound/realtime/processing/__init__.py +67 -0
  512. truthound/realtime/processing/exactly_once.py +575 -0
  513. truthound/realtime/processing/state.py +547 -0
  514. truthound/realtime/processing/windows.py +647 -0
  515. truthound/realtime/protocols.py +569 -0
  516. truthound/realtime/streaming.py +605 -0
  517. truthound/realtime/testing/__init__.py +32 -0
  518. truthound/realtime/testing/containers.py +615 -0
  519. truthound/realtime/testing/fixtures.py +484 -0
  520. truthound/report.py +280 -0
  521. truthound/reporters/__init__.py +46 -0
  522. truthound/reporters/_protocols.py +30 -0
  523. truthound/reporters/base.py +324 -0
  524. truthound/reporters/ci/__init__.py +66 -0
  525. truthound/reporters/ci/azure.py +436 -0
  526. truthound/reporters/ci/base.py +509 -0
  527. truthound/reporters/ci/bitbucket.py +567 -0
  528. truthound/reporters/ci/circleci.py +547 -0
  529. truthound/reporters/ci/detection.py +364 -0
  530. truthound/reporters/ci/factory.py +182 -0
  531. truthound/reporters/ci/github.py +388 -0
  532. truthound/reporters/ci/gitlab.py +471 -0
  533. truthound/reporters/ci/jenkins.py +525 -0
  534. truthound/reporters/console_reporter.py +299 -0
  535. truthound/reporters/factory.py +211 -0
  536. truthound/reporters/html_reporter.py +524 -0
  537. truthound/reporters/json_reporter.py +256 -0
  538. truthound/reporters/markdown_reporter.py +280 -0
  539. truthound/reporters/sdk/__init__.py +174 -0
  540. truthound/reporters/sdk/builder.py +558 -0
  541. truthound/reporters/sdk/mixins.py +1150 -0
  542. truthound/reporters/sdk/schema.py +1493 -0
  543. truthound/reporters/sdk/templates.py +666 -0
  544. truthound/reporters/sdk/testing.py +968 -0
  545. truthound/scanners.py +170 -0
  546. truthound/scheduling/__init__.py +122 -0
  547. truthound/scheduling/cron.py +1136 -0
  548. truthound/scheduling/presets.py +212 -0
  549. truthound/schema.py +275 -0
  550. truthound/secrets/__init__.py +173 -0
  551. truthound/secrets/base.py +618 -0
  552. truthound/secrets/cloud.py +682 -0
  553. truthound/secrets/integration.py +507 -0
  554. truthound/secrets/manager.py +633 -0
  555. truthound/secrets/oidc/__init__.py +172 -0
  556. truthound/secrets/oidc/base.py +902 -0
  557. truthound/secrets/oidc/credential_provider.py +623 -0
  558. truthound/secrets/oidc/exchangers.py +1001 -0
  559. truthound/secrets/oidc/github/__init__.py +110 -0
  560. truthound/secrets/oidc/github/claims.py +718 -0
  561. truthound/secrets/oidc/github/enhanced_provider.py +693 -0
  562. truthound/secrets/oidc/github/trust_policy.py +742 -0
  563. truthound/secrets/oidc/github/verification.py +723 -0
  564. truthound/secrets/oidc/github/workflow.py +691 -0
  565. truthound/secrets/oidc/providers.py +825 -0
  566. truthound/secrets/providers.py +506 -0
  567. truthound/secrets/resolver.py +495 -0
  568. truthound/stores/__init__.py +177 -0
  569. truthound/stores/backends/__init__.py +18 -0
  570. truthound/stores/backends/_protocols.py +340 -0
  571. truthound/stores/backends/azure_blob.py +530 -0
  572. truthound/stores/backends/concurrent_filesystem.py +915 -0
  573. truthound/stores/backends/connection_pool.py +1365 -0
  574. truthound/stores/backends/database.py +743 -0
  575. truthound/stores/backends/filesystem.py +538 -0
  576. truthound/stores/backends/gcs.py +399 -0
  577. truthound/stores/backends/memory.py +354 -0
  578. truthound/stores/backends/s3.py +434 -0
  579. truthound/stores/backpressure/__init__.py +84 -0
  580. truthound/stores/backpressure/base.py +375 -0
  581. truthound/stores/backpressure/circuit_breaker.py +434 -0
  582. truthound/stores/backpressure/monitor.py +376 -0
  583. truthound/stores/backpressure/strategies.py +677 -0
  584. truthound/stores/base.py +551 -0
  585. truthound/stores/batching/__init__.py +65 -0
  586. truthound/stores/batching/base.py +305 -0
  587. truthound/stores/batching/buffer.py +370 -0
  588. truthound/stores/batching/store.py +248 -0
  589. truthound/stores/batching/writer.py +521 -0
  590. truthound/stores/caching/__init__.py +60 -0
  591. truthound/stores/caching/backends.py +684 -0
  592. truthound/stores/caching/base.py +356 -0
  593. truthound/stores/caching/store.py +305 -0
  594. truthound/stores/compression/__init__.py +193 -0
  595. truthound/stores/compression/adaptive.py +694 -0
  596. truthound/stores/compression/base.py +514 -0
  597. truthound/stores/compression/pipeline.py +868 -0
  598. truthound/stores/compression/providers.py +672 -0
  599. truthound/stores/compression/streaming.py +832 -0
  600. truthound/stores/concurrency/__init__.py +81 -0
  601. truthound/stores/concurrency/atomic.py +556 -0
  602. truthound/stores/concurrency/index.py +775 -0
  603. truthound/stores/concurrency/locks.py +576 -0
  604. truthound/stores/concurrency/manager.py +482 -0
  605. truthound/stores/encryption/__init__.py +297 -0
  606. truthound/stores/encryption/base.py +952 -0
  607. truthound/stores/encryption/keys.py +1191 -0
  608. truthound/stores/encryption/pipeline.py +903 -0
  609. truthound/stores/encryption/providers.py +953 -0
  610. truthound/stores/encryption/streaming.py +950 -0
  611. truthound/stores/expectations.py +227 -0
  612. truthound/stores/factory.py +246 -0
  613. truthound/stores/migration/__init__.py +75 -0
  614. truthound/stores/migration/base.py +480 -0
  615. truthound/stores/migration/manager.py +347 -0
  616. truthound/stores/migration/registry.py +382 -0
  617. truthound/stores/migration/store.py +559 -0
  618. truthound/stores/observability/__init__.py +106 -0
  619. truthound/stores/observability/audit.py +718 -0
  620. truthound/stores/observability/config.py +270 -0
  621. truthound/stores/observability/factory.py +208 -0
  622. truthound/stores/observability/metrics.py +636 -0
  623. truthound/stores/observability/protocols.py +410 -0
  624. truthound/stores/observability/store.py +570 -0
  625. truthound/stores/observability/tracing.py +784 -0
  626. truthound/stores/replication/__init__.py +76 -0
  627. truthound/stores/replication/base.py +260 -0
  628. truthound/stores/replication/monitor.py +269 -0
  629. truthound/stores/replication/store.py +439 -0
  630. truthound/stores/replication/syncer.py +391 -0
  631. truthound/stores/results.py +359 -0
  632. truthound/stores/retention/__init__.py +77 -0
  633. truthound/stores/retention/base.py +378 -0
  634. truthound/stores/retention/policies.py +621 -0
  635. truthound/stores/retention/scheduler.py +279 -0
  636. truthound/stores/retention/store.py +526 -0
  637. truthound/stores/streaming/__init__.py +138 -0
  638. truthound/stores/streaming/base.py +801 -0
  639. truthound/stores/streaming/database.py +984 -0
  640. truthound/stores/streaming/filesystem.py +719 -0
  641. truthound/stores/streaming/reader.py +629 -0
  642. truthound/stores/streaming/s3.py +843 -0
  643. truthound/stores/streaming/writer.py +790 -0
  644. truthound/stores/tiering/__init__.py +108 -0
  645. truthound/stores/tiering/base.py +462 -0
  646. truthound/stores/tiering/manager.py +249 -0
  647. truthound/stores/tiering/policies.py +692 -0
  648. truthound/stores/tiering/store.py +526 -0
  649. truthound/stores/versioning/__init__.py +56 -0
  650. truthound/stores/versioning/base.py +376 -0
  651. truthound/stores/versioning/store.py +660 -0
  652. truthound/stores/versioning/strategies.py +353 -0
  653. truthound/types.py +56 -0
  654. truthound/validators/__init__.py +774 -0
  655. truthound/validators/aggregate/__init__.py +27 -0
  656. truthound/validators/aggregate/central.py +116 -0
  657. truthound/validators/aggregate/extremes.py +116 -0
  658. truthound/validators/aggregate/spread.py +118 -0
  659. truthound/validators/aggregate/sum.py +64 -0
  660. truthound/validators/aggregate/type.py +78 -0
  661. truthound/validators/anomaly/__init__.py +93 -0
  662. truthound/validators/anomaly/base.py +431 -0
  663. truthound/validators/anomaly/ml_based.py +1190 -0
  664. truthound/validators/anomaly/multivariate.py +647 -0
  665. truthound/validators/anomaly/statistical.py +599 -0
  666. truthound/validators/base.py +1089 -0
  667. truthound/validators/business_rule/__init__.py +46 -0
  668. truthound/validators/business_rule/base.py +147 -0
  669. truthound/validators/business_rule/checksum.py +509 -0
  670. truthound/validators/business_rule/financial.py +526 -0
  671. truthound/validators/cache.py +733 -0
  672. truthound/validators/completeness/__init__.py +39 -0
  673. truthound/validators/completeness/conditional.py +73 -0
  674. truthound/validators/completeness/default.py +98 -0
  675. truthound/validators/completeness/empty.py +103 -0
  676. truthound/validators/completeness/nan.py +337 -0
  677. truthound/validators/completeness/null.py +152 -0
  678. truthound/validators/cross_table/__init__.py +17 -0
  679. truthound/validators/cross_table/aggregate.py +333 -0
  680. truthound/validators/cross_table/row_count.py +122 -0
  681. truthound/validators/datetime/__init__.py +29 -0
  682. truthound/validators/datetime/format.py +78 -0
  683. truthound/validators/datetime/freshness.py +269 -0
  684. truthound/validators/datetime/order.py +73 -0
  685. truthound/validators/datetime/parseable.py +185 -0
  686. truthound/validators/datetime/range.py +202 -0
  687. truthound/validators/datetime/timezone.py +69 -0
  688. truthound/validators/distribution/__init__.py +49 -0
  689. truthound/validators/distribution/distribution.py +128 -0
  690. truthound/validators/distribution/monotonic.py +119 -0
  691. truthound/validators/distribution/outlier.py +178 -0
  692. truthound/validators/distribution/quantile.py +80 -0
  693. truthound/validators/distribution/range.py +254 -0
  694. truthound/validators/distribution/set.py +125 -0
  695. truthound/validators/distribution/statistical.py +459 -0
  696. truthound/validators/drift/__init__.py +79 -0
  697. truthound/validators/drift/base.py +427 -0
  698. truthound/validators/drift/multi_feature.py +401 -0
  699. truthound/validators/drift/numeric.py +395 -0
  700. truthound/validators/drift/psi.py +446 -0
  701. truthound/validators/drift/statistical.py +510 -0
  702. truthound/validators/enterprise.py +1658 -0
  703. truthound/validators/geospatial/__init__.py +80 -0
  704. truthound/validators/geospatial/base.py +97 -0
  705. truthound/validators/geospatial/boundary.py +238 -0
  706. truthound/validators/geospatial/coordinate.py +351 -0
  707. truthound/validators/geospatial/distance.py +399 -0
  708. truthound/validators/geospatial/polygon.py +665 -0
  709. truthound/validators/i18n/__init__.py +308 -0
  710. truthound/validators/i18n/bidi.py +571 -0
  711. truthound/validators/i18n/catalogs.py +570 -0
  712. truthound/validators/i18n/dialects.py +763 -0
  713. truthound/validators/i18n/extended_catalogs.py +549 -0
  714. truthound/validators/i18n/formatting.py +1434 -0
  715. truthound/validators/i18n/loader.py +1020 -0
  716. truthound/validators/i18n/messages.py +521 -0
  717. truthound/validators/i18n/plural.py +683 -0
  718. truthound/validators/i18n/protocols.py +855 -0
  719. truthound/validators/i18n/tms.py +1162 -0
  720. truthound/validators/localization/__init__.py +53 -0
  721. truthound/validators/localization/base.py +122 -0
  722. truthound/validators/localization/chinese.py +362 -0
  723. truthound/validators/localization/japanese.py +275 -0
  724. truthound/validators/localization/korean.py +524 -0
  725. truthound/validators/memory/__init__.py +94 -0
  726. truthound/validators/memory/approximate_knn.py +506 -0
  727. truthound/validators/memory/base.py +547 -0
  728. truthound/validators/memory/sgd_online.py +719 -0
  729. truthound/validators/memory/streaming_ecdf.py +753 -0
  730. truthound/validators/ml_feature/__init__.py +54 -0
  731. truthound/validators/ml_feature/base.py +249 -0
  732. truthound/validators/ml_feature/correlation.py +299 -0
  733. truthound/validators/ml_feature/leakage.py +344 -0
  734. truthound/validators/ml_feature/null_impact.py +270 -0
  735. truthound/validators/ml_feature/scale.py +264 -0
  736. truthound/validators/multi_column/__init__.py +89 -0
  737. truthound/validators/multi_column/arithmetic.py +284 -0
  738. truthound/validators/multi_column/base.py +231 -0
  739. truthound/validators/multi_column/comparison.py +273 -0
  740. truthound/validators/multi_column/consistency.py +312 -0
  741. truthound/validators/multi_column/statistical.py +299 -0
  742. truthound/validators/optimization/__init__.py +164 -0
  743. truthound/validators/optimization/aggregation.py +563 -0
  744. truthound/validators/optimization/covariance.py +556 -0
  745. truthound/validators/optimization/geo.py +626 -0
  746. truthound/validators/optimization/graph.py +587 -0
  747. truthound/validators/optimization/orchestrator.py +970 -0
  748. truthound/validators/optimization/profiling.py +1312 -0
  749. truthound/validators/privacy/__init__.py +223 -0
  750. truthound/validators/privacy/base.py +635 -0
  751. truthound/validators/privacy/ccpa.py +670 -0
  752. truthound/validators/privacy/gdpr.py +728 -0
  753. truthound/validators/privacy/global_patterns.py +604 -0
  754. truthound/validators/privacy/plugins.py +867 -0
  755. truthound/validators/profiling/__init__.py +52 -0
  756. truthound/validators/profiling/base.py +175 -0
  757. truthound/validators/profiling/cardinality.py +312 -0
  758. truthound/validators/profiling/entropy.py +391 -0
  759. truthound/validators/profiling/frequency.py +455 -0
  760. truthound/validators/pushdown_support.py +660 -0
  761. truthound/validators/query/__init__.py +91 -0
  762. truthound/validators/query/aggregate.py +346 -0
  763. truthound/validators/query/base.py +246 -0
  764. truthound/validators/query/column.py +249 -0
  765. truthound/validators/query/expression.py +274 -0
  766. truthound/validators/query/result.py +323 -0
  767. truthound/validators/query/row_count.py +264 -0
  768. truthound/validators/referential/__init__.py +80 -0
  769. truthound/validators/referential/base.py +395 -0
  770. truthound/validators/referential/cascade.py +391 -0
  771. truthound/validators/referential/circular.py +563 -0
  772. truthound/validators/referential/foreign_key.py +624 -0
  773. truthound/validators/referential/orphan.py +485 -0
  774. truthound/validators/registry.py +112 -0
  775. truthound/validators/schema/__init__.py +41 -0
  776. truthound/validators/schema/column_count.py +142 -0
  777. truthound/validators/schema/column_exists.py +80 -0
  778. truthound/validators/schema/column_order.py +82 -0
  779. truthound/validators/schema/column_pair.py +85 -0
  780. truthound/validators/schema/column_pair_set.py +195 -0
  781. truthound/validators/schema/column_type.py +94 -0
  782. truthound/validators/schema/multi_column.py +53 -0
  783. truthound/validators/schema/multi_column_aggregate.py +175 -0
  784. truthound/validators/schema/referential.py +274 -0
  785. truthound/validators/schema/table_schema.py +91 -0
  786. truthound/validators/schema_validator.py +219 -0
  787. truthound/validators/sdk/__init__.py +250 -0
  788. truthound/validators/sdk/builder.py +680 -0
  789. truthound/validators/sdk/decorators.py +474 -0
  790. truthound/validators/sdk/enterprise/__init__.py +211 -0
  791. truthound/validators/sdk/enterprise/docs.py +725 -0
  792. truthound/validators/sdk/enterprise/fuzzing.py +659 -0
  793. truthound/validators/sdk/enterprise/licensing.py +709 -0
  794. truthound/validators/sdk/enterprise/manager.py +543 -0
  795. truthound/validators/sdk/enterprise/resources.py +628 -0
  796. truthound/validators/sdk/enterprise/sandbox.py +766 -0
  797. truthound/validators/sdk/enterprise/signing.py +603 -0
  798. truthound/validators/sdk/enterprise/templates.py +865 -0
  799. truthound/validators/sdk/enterprise/versioning.py +659 -0
  800. truthound/validators/sdk/templates.py +757 -0
  801. truthound/validators/sdk/testing.py +807 -0
  802. truthound/validators/security/__init__.py +181 -0
  803. truthound/validators/security/redos/__init__.py +182 -0
  804. truthound/validators/security/redos/core.py +861 -0
  805. truthound/validators/security/redos/cpu_monitor.py +593 -0
  806. truthound/validators/security/redos/cve_database.py +791 -0
  807. truthound/validators/security/redos/ml/__init__.py +155 -0
  808. truthound/validators/security/redos/ml/base.py +785 -0
  809. truthound/validators/security/redos/ml/datasets.py +618 -0
  810. truthound/validators/security/redos/ml/features.py +359 -0
  811. truthound/validators/security/redos/ml/models.py +1000 -0
  812. truthound/validators/security/redos/ml/predictor.py +507 -0
  813. truthound/validators/security/redos/ml/storage.py +632 -0
  814. truthound/validators/security/redos/ml/training.py +571 -0
  815. truthound/validators/security/redos/ml_analyzer.py +937 -0
  816. truthound/validators/security/redos/optimizer.py +674 -0
  817. truthound/validators/security/redos/profiler.py +682 -0
  818. truthound/validators/security/redos/re2_engine.py +709 -0
  819. truthound/validators/security/redos.py +886 -0
  820. truthound/validators/security/sql_security.py +1247 -0
  821. truthound/validators/streaming/__init__.py +126 -0
  822. truthound/validators/streaming/base.py +292 -0
  823. truthound/validators/streaming/completeness.py +210 -0
  824. truthound/validators/streaming/mixin.py +575 -0
  825. truthound/validators/streaming/range.py +308 -0
  826. truthound/validators/streaming/sources.py +846 -0
  827. truthound/validators/string/__init__.py +57 -0
  828. truthound/validators/string/casing.py +158 -0
  829. truthound/validators/string/charset.py +96 -0
  830. truthound/validators/string/format.py +501 -0
  831. truthound/validators/string/json.py +77 -0
  832. truthound/validators/string/json_schema.py +184 -0
  833. truthound/validators/string/length.py +104 -0
  834. truthound/validators/string/like_pattern.py +237 -0
  835. truthound/validators/string/regex.py +202 -0
  836. truthound/validators/string/regex_extended.py +435 -0
  837. truthound/validators/table/__init__.py +88 -0
  838. truthound/validators/table/base.py +78 -0
  839. truthound/validators/table/column_count.py +198 -0
  840. truthound/validators/table/freshness.py +362 -0
  841. truthound/validators/table/row_count.py +251 -0
  842. truthound/validators/table/schema.py +333 -0
  843. truthound/validators/table/size.py +285 -0
  844. truthound/validators/timeout/__init__.py +102 -0
  845. truthound/validators/timeout/advanced/__init__.py +247 -0
  846. truthound/validators/timeout/advanced/circuit_breaker.py +675 -0
  847. truthound/validators/timeout/advanced/prediction.py +773 -0
  848. truthound/validators/timeout/advanced/priority.py +618 -0
  849. truthound/validators/timeout/advanced/redis_backend.py +770 -0
  850. truthound/validators/timeout/advanced/retry.py +721 -0
  851. truthound/validators/timeout/advanced/sampling.py +788 -0
  852. truthound/validators/timeout/advanced/sla.py +661 -0
  853. truthound/validators/timeout/advanced/telemetry.py +804 -0
  854. truthound/validators/timeout/cascade.py +477 -0
  855. truthound/validators/timeout/deadline.py +657 -0
  856. truthound/validators/timeout/degradation.py +525 -0
  857. truthound/validators/timeout/distributed.py +597 -0
  858. truthound/validators/timeseries/__init__.py +89 -0
  859. truthound/validators/timeseries/base.py +326 -0
  860. truthound/validators/timeseries/completeness.py +617 -0
  861. truthound/validators/timeseries/gap.py +485 -0
  862. truthound/validators/timeseries/monotonic.py +310 -0
  863. truthound/validators/timeseries/seasonality.py +422 -0
  864. truthound/validators/timeseries/trend.py +510 -0
  865. truthound/validators/uniqueness/__init__.py +59 -0
  866. truthound/validators/uniqueness/approximate.py +475 -0
  867. truthound/validators/uniqueness/distinct_values.py +253 -0
  868. truthound/validators/uniqueness/duplicate.py +118 -0
  869. truthound/validators/uniqueness/primary_key.py +140 -0
  870. truthound/validators/uniqueness/unique.py +191 -0
  871. truthound/validators/uniqueness/within_record.py +599 -0
  872. truthound/validators/utils.py +756 -0
  873. truthound-1.0.8.dist-info/METADATA +474 -0
  874. truthound-1.0.8.dist-info/RECORD +877 -0
  875. truthound-1.0.8.dist-info/WHEEL +4 -0
  876. truthound-1.0.8.dist-info/entry_points.txt +2 -0
  877. truthound-1.0.8.dist-info/licenses/LICENSE +190 -0
@@ -0,0 +1,1011 @@
1
+ """Spark-native execution engine for distributed data validation.
2
+
3
+ This module provides a Spark-native execution engine that:
4
+ - Executes validation operations directly on Spark DataFrames
5
+ - Avoids Polars conversion overhead for distributed operations
6
+ - Uses Arrow for efficient data transfer when conversion is needed
7
+ - Supports distributed aggregations with proper reduce semantics
8
+
9
+ Architecture:
10
+ ┌─────────────────────────────────────────────────────────────────┐
11
+ │ SparkExecutionEngine │
12
+ │ │
13
+ │ ┌──────────────────────────────────────────────────────────┐ │
14
+ │ │ Native Spark Operations │ │
15
+ │ │ (count, aggregate, filter - no conversion overhead) │ │
16
+ │ └──────────────────────────────────────────────────────────┘ │
17
+ │ │ │
18
+ │ ▼ │
19
+ │ ┌──────────────────────────────────────────────────────────┐ │
20
+ │ │ Arrow Bridge (when needed) │ │
21
+ │ │ (zero-copy conversion to Polars for ML validators) │ │
22
+ │ └──────────────────────────────────────────────────────────┘ │
23
+ │ │ │
24
+ │ ▼ │
25
+ │ ┌──────────────────────────────────────────────────────────┐ │
26
+ │ │ Polars LazyFrame (fallback) │ │
27
+ │ │ (only for validators that require Polars operations) │ │
28
+ │ └──────────────────────────────────────────────────────────┘ │
29
+ │ │
30
+ └─────────────────────────────────────────────────────────────────┘
31
+
32
+ Example:
33
+ >>> from pyspark.sql import SparkSession
34
+ >>> from truthound.execution.distributed import SparkExecutionEngine
35
+ >>>
36
+ >>> spark = SparkSession.builder.getOrCreate()
37
+ >>> df = spark.read.parquet("large_data.parquet")
38
+ >>>
39
+ >>> # Create native Spark engine
40
+ >>> engine = SparkExecutionEngine.from_dataframe(df)
41
+ >>>
42
+ >>> # Native Spark operations (no conversion overhead)
43
+ >>> row_count = engine.count_rows()
44
+ >>> null_counts = engine.count_nulls_all()
45
+ >>> stats = engine.get_stats("price")
46
+ >>>
47
+ >>> # Convert to Polars only when needed (via Arrow)
48
+ >>> lf = engine.to_polars_lazyframe()
49
+ """
50
+
51
+ from __future__ import annotations
52
+
53
+ import logging
54
+ import time
55
+ from dataclasses import dataclass, field
56
+ from typing import TYPE_CHECKING, Any, Callable, Iterator
57
+
58
+ from truthound.execution.distributed.base import (
59
+ BaseDistributedEngine,
60
+ DistributedEngineConfig,
61
+ ExecutionMetrics,
62
+ )
63
+ from truthound.execution.distributed.protocols import (
64
+ AggregationScope,
65
+ AggregationSpec,
66
+ ComputeBackend,
67
+ DistributedResult,
68
+ PartitionInfo,
69
+ PartitionStrategy,
70
+ get_aggregator,
71
+ )
72
+
73
+ if TYPE_CHECKING:
74
+ import pyarrow as pa
75
+ from pyspark.sql import DataFrame as SparkDataFrame
76
+ from pyspark.sql import SparkSession
77
+
78
+ logger = logging.getLogger(__name__)
79
+
80
+
81
+ # =============================================================================
82
+ # Configuration
83
+ # =============================================================================
84
+
85
+
86
@dataclass
class SparkEngineConfig(DistributedEngineConfig):
    """Configuration for Spark execution engine.

    Extends ``DistributedEngineConfig`` with Spark-specific settings.

    Attributes:
        app_name: Spark application name.
        master: Spark master URL.
        executor_memory: Memory per executor.
        driver_memory: Driver memory.
        executor_cores: Cores per executor.
        arrow_enabled: Enable Arrow optimization.
        adaptive_enabled: Enable adaptive query execution.
        broadcast_threshold: Broadcast join threshold in bytes.
        shuffle_partitions: Number of shuffle partitions.
        extra_spark_conf: Additional Spark configuration key/value pairs.
            ``SparkExecutionEngine._configure_spark`` applies these last via
            ``spark.conf.set``, so they can override the settings above.
    """

    app_name: str = "truthound-spark"
    master: str = ""  # Empty = use existing session
    executor_memory: str = "4g"
    driver_memory: str = "2g"
    executor_cores: int = 2
    arrow_enabled: bool = True
    adaptive_enabled: bool = True
    broadcast_threshold: int = 10 * 1024 * 1024  # 10MB
    shuffle_partitions: int = 200
    # Mutable default, so it is built per instance through default_factory.
    extra_spark_conf: dict[str, str] = field(default_factory=dict)
112
+
113
+
114
+ def _check_pyspark_available() -> None:
115
+ """Check if PySpark is available."""
116
+ try:
117
+ import pyspark # noqa: F401
118
+ except ImportError:
119
+ raise ImportError(
120
+ "pyspark is required for SparkExecutionEngine. "
121
+ "Install with: pip install pyspark"
122
+ )
123
+
124
+
125
+ # =============================================================================
126
+ # Spark Execution Engine
127
+ # =============================================================================
128
+
129
+
130
+ class SparkExecutionEngine(BaseDistributedEngine[SparkEngineConfig]):
131
+ """Spark-native execution engine for distributed validation.
132
+
133
+ This engine executes validation operations directly on Spark DataFrames,
134
+ avoiding the overhead of converting to Polars for operations that can
135
+ be performed natively in Spark.
136
+
137
+ Key Features:
138
+ - Native Spark aggregations (count, sum, avg, min, max, etc.)
139
+ - Distributed null/duplicate checking
140
+ - Arrow-based zero-copy conversion to Polars when needed
141
+ - Partition-aware operations
142
+ - Checkpoint support for fault tolerance
143
+
144
+ Example:
145
+ >>> engine = SparkExecutionEngine.from_dataframe(spark_df)
146
+ >>> null_counts = engine.count_nulls_all() # Native Spark
147
+ >>> lf = engine.to_polars_lazyframe() # Arrow-based conversion
148
+ """
149
+
150
+ engine_type = "spark"
151
+
152
def __init__(
    self,
    spark_df: "SparkDataFrame",
    config: SparkEngineConfig | None = None,
    spark_session: "SparkSession | None" = None,
) -> None:
    """Set up the engine around an existing Spark DataFrame.

    Args:
        spark_df: PySpark DataFrame to validate.
        config: Engine configuration; passed through to the base class.
        spark_session: Session to use. When omitted (or falsy), the
            session attached to ``spark_df`` is used instead.
    """
    _check_pyspark_available()
    super().__init__(config)

    self._df = spark_df
    # An explicitly supplied session wins over the DataFrame's own session.
    self._spark = spark_session if spark_session else spark_df.sparkSession
    self._schema = spark_df.schema
    self._columns = spark_df.columns
    self._cached_row_count: int | None = None

    # Push the configured options onto the session.
    self._configure_spark()
176
+
177
@classmethod
def _default_config(cls) -> SparkEngineConfig:
    """Return a ``SparkEngineConfig`` with every field at its default."""
    default_config = SparkEngineConfig()
    return default_config
181
+
182
+ def _configure_spark(self) -> None:
183
+ """Configure Spark session for optimal performance."""
184
+ if self._config.arrow_enabled:
185
+ self._spark.conf.set(
186
+ "spark.sql.execution.arrow.pyspark.enabled",
187
+ "true",
188
+ )
189
+ self._spark.conf.set(
190
+ "spark.sql.execution.arrow.pyspark.fallback.enabled",
191
+ "true",
192
+ )
193
+
194
+ if self._config.adaptive_enabled:
195
+ self._spark.conf.set(
196
+ "spark.sql.adaptive.enabled",
197
+ "true",
198
+ )
199
+
200
+ self._spark.conf.set(
201
+ "spark.sql.autoBroadcastJoinThreshold",
202
+ str(self._config.broadcast_threshold),
203
+ )
204
+
205
+ self._spark.conf.set(
206
+ "spark.sql.shuffle.partitions",
207
+ str(self._config.shuffle_partitions),
208
+ )
209
+
210
+ # Apply extra configurations
211
+ for key, value in self._config.extra_spark_conf.items():
212
+ self._spark.conf.set(key, value)
213
+
214
+ # -------------------------------------------------------------------------
215
+ # Factory Methods
216
+ # -------------------------------------------------------------------------
217
+
218
@classmethod
def from_dataframe(
    cls,
    df: "SparkDataFrame",
    config: SparkEngineConfig | None = None,
) -> "SparkExecutionEngine":
    """Wrap an existing Spark DataFrame in an engine.

    Args:
        df: PySpark DataFrame.
        config: Optional configuration.

    Returns:
        A new engine instance; the session is taken from ``df``.
    """
    engine = cls(df, config)
    return engine
234
+
235
@classmethod
def from_table(
    cls,
    spark: "SparkSession",
    table_name: str,
    database: str | None = None,
    config: SparkEngineConfig | None = None,
) -> "SparkExecutionEngine":
    """Build an engine over a table resolved through ``spark.table``.

    Args:
        spark: SparkSession used to resolve the table.
        table_name: Table name.
        database: Optional database name; when given, the table is looked
            up as ``"<database>.<table_name>"``.
        config: Optional configuration.

    Returns:
        A new engine instance bound to ``spark``.
    """
    _check_pyspark_available()

    qualified_name = table_name if not database else f"{database}.{table_name}"
    return cls(spark.table(qualified_name), config, spark)
260
+
261
@classmethod
def from_parquet(
    cls,
    spark: "SparkSession",
    path: str,
    config: SparkEngineConfig | None = None,
) -> "SparkExecutionEngine":
    """Build an engine over Parquet data read with ``spark.read.parquet``.

    Args:
        spark: SparkSession used to read the data.
        path: Path to Parquet files.
        config: Optional configuration.

    Returns:
        A new engine instance bound to ``spark``.
    """
    _check_pyspark_available()

    parquet_df = spark.read.parquet(path)
    engine = cls(parquet_df, config, spark)
    return engine
282
+
283
+ # -------------------------------------------------------------------------
284
+ # Properties
285
+ # -------------------------------------------------------------------------
286
+
287
@property
def backend_type(self) -> ComputeBackend:
    """Compute backend this engine runs on.

    Returns:
        Always ``ComputeBackend.SPARK``.
    """
    return ComputeBackend.SPARK
291
+
292
@property
def spark_dataframe(self) -> "SparkDataFrame":
    """The Spark DataFrame this engine was constructed with."""
    return self._df
296
+
297
@property
def spark_session(self) -> "SparkSession":
    """The Spark session this engine operates on.

    This is the session passed to the constructor, or the DataFrame's own
    session when none was passed.
    """
    return self._spark
301
+
302
@property
def supports_sql_pushdown(self) -> bool:
    """Whether SQL pushdown is supported; always ``True`` for this engine."""
    return True
306
+
307
+ # -------------------------------------------------------------------------
308
+ # Abstract Method Implementations
309
+ # -------------------------------------------------------------------------
310
+
311
+ def _get_partition_count(self) -> int:
312
+ """Get number of data partitions."""
313
+ return self._df.rdd.getNumPartitions()
314
+
315
def _get_partition_info(self) -> list[PartitionInfo]:
    """Describe every partition of the DataFrame.

    Returns:
        One ``PartitionInfo`` per partition, in partition-index order. Each
        entry carries its index, the total partition count and the
        DataFrame's column names as a tuple.
    """
    total = self._get_partition_count()
    column_names = tuple(self._columns)

    infos = []
    for index in range(total):
        infos.append(
            PartitionInfo(
                partition_id=index,
                total_partitions=total,
                columns=column_names,
            )
        )
    return infos
328
+
329
def _execute_on_partitions(
    self,
    operation: str,
    func: Callable[[Iterator[Any]], Iterator[dict[str, Any]]],
    columns: list[str] | None = None,
) -> list[DistributedResult]:
    """Execute function on all partitions using mapPartitions.

    Args:
        operation: Operation name for metrics.
        func: Function applied to each partition's row iterator. Each dict
            it yields may carry ``value``, ``row_count``, ``duration_ms``,
            ``errors`` and ``metadata`` keys; missing keys get defaults.
        columns: Columns to include (None or empty = all).

    Returns:
        One ``DistributedResult`` per dict collected from ``func``. The
        ``partition_id`` is the position in the collected list, so it
        matches the real partition index only when ``func`` yields exactly
        one dict per partition.

    Raises:
        Exception: Any error from Spark or ``func`` is recorded on the
            metrics object and re-raised unchanged.
    """
    metrics = self._start_metrics(operation)

    try:
        df = self._df
        if columns:
            df = df.select(*columns)

        # Run on the executors, then bring the per-partition dicts to the driver.
        raw_results = df.rdd.mapPartitions(func).collect()

        results = []
        total_rows = 0
        for i, result_dict in enumerate(raw_results):
            row_count = result_dict.get("row_count", 0)
            total_rows += row_count
            results.append(
                DistributedResult(
                    partition_id=i,
                    operation=operation,
                    value=result_dict.get("value"),
                    row_count=row_count,
                    duration_ms=result_dict.get("duration_ms", 0),
                    errors=result_dict.get("errors", []),
                    metadata=result_dict.get("metadata", {}),
                )
            )

        metrics.partitions_processed = len(results)
        metrics.rows_processed = total_rows

        return results

    except Exception as e:
        metrics.errors.append(str(e))
        raise
    finally:
        # Always close out the metrics, on success and on failure.
        self._end_metrics(metrics)
385
+
386
def _aggregate_distributed(
    self,
    spec: AggregationSpec,
) -> dict[str, Any]:
    """Perform distributed aggregation using native Spark operations.

    Aggregations are split into three groups:

    * built-in Spark aggregates (count, sum, mean, min, max, std, var),
      evaluated together in a single Spark job;
    * ``minmax``, evaluated natively with min and max computed in one job
      and returned as ``{"min": ..., "max": ...}``;
    * everything else: ``null_count`` and ``distinct_count`` use native
      Spark operations, any other operation goes through the registered
      aggregator in map-reduce style.

    Args:
        spec: Aggregation specification.

    Returns:
        Mapping from each aggregation's alias to its result.

    Raises:
        Exception: Any error is recorded on the metrics object and
            re-raised unchanged.
    """
    from pyspark.sql import functions as F

    metrics = self._start_metrics("aggregate")

    try:
        results: dict[str, Any] = {}

        spark_agg_funcs = {
            "count": lambda c: F.count(F.lit(1)) if c == "*" else F.count(c),
            "sum": F.sum,
            "mean": F.avg,
            "min": F.min,
            "max": F.max,
            "std": F.stddev,
            "var": F.variance,
        }

        # Partition the requested aggregations by execution strategy.
        # "minmax" gets its own bucket so it is not also routed through
        # the custom aggregator, which would overwrite the native result.
        spark_aggs = []
        minmax_aggs = []
        custom_aggs = []
        for agg in spec.aggregations:
            if agg.operation in spark_agg_funcs:
                spark_aggs.append(agg)
            elif agg.operation == "minmax":
                minmax_aggs.append(agg)
            else:
                custom_aggs.append(agg)

        # Execute native Spark aggregations in one batch.
        if spark_aggs:
            exprs = [
                spark_agg_funcs[agg.operation](agg.column).alias(agg.alias)
                for agg in spark_aggs
            ]

            if spec.group_by:
                agg_df = self._df.groupBy(*spec.group_by).agg(*exprs)
            else:
                agg_df = self._df.agg(*exprs)

            rows = agg_df.collect()
            # NOTE(review): only the first row is consumed, so with
            # group_by only one group's values are reported -- confirm
            # whether callers expect per-group results.
            # A grouped aggregation over empty data yields no rows at all;
            # report None instead of failing with IndexError.
            first_row = rows[0] if rows else None
            for agg in spark_aggs:
                results[agg.alias] = (
                    first_row[agg.alias] if first_row is not None else None
                )

        # minmax returns a dict; compute both bounds in a single job.
        for agg in minmax_aggs:
            min_val, max_val = self._df.agg(
                F.min(agg.column),
                F.max(agg.column),
            ).collect()[0]
            results[agg.alias] = {"min": min_val, "max": max_val}

        # Remaining operations: native where possible, else map-reduce.
        for agg in custom_aggs:
            if agg.operation == "null_count":
                null_count = self._df.filter(F.col(agg.column).isNull()).count()
                total_count = self._df.count()
                results[agg.alias] = {
                    "null_count": null_count,
                    "total_count": total_count,
                }
            elif agg.operation == "distinct_count":
                results[agg.alias] = (
                    self._df.select(agg.column).distinct().count()
                )
            else:
                results[agg.alias] = self._aggregate_with_aggregator(agg)

        return results

    except Exception as e:
        metrics.errors.append(str(e))
        raise
    finally:
        self._end_metrics(metrics)
480
+
481
def _aggregate_with_aggregator(
    self,
    agg: Any,
) -> Any:
    """Perform aggregation using custom aggregator via map-reduce.

    Each partition folds its rows into a partial state on the executors;
    the partial states are collected and merged on the driver.

    Args:
        agg: Aggregation specification; its ``operation``, ``params`` and
            ``column`` attributes are used.

    Returns:
        The aggregator's finalized result.
    """
    aggregator = get_aggregator(agg.operation, **agg.params)
    column = agg.column

    # Whether the column exists is the same for every row, so resolve it
    # once on the driver instead of building a dict per row. Keeping this
    # as a plain local also keeps ``self`` out of the shipped closure.
    has_column = column in self._df.columns

    # Map phase: compute one partial aggregate per partition.
    def map_partition(iterator: Iterator) -> Iterator:
        state = aggregator.initialize()
        for row in iterator:
            value = row[column] if has_column else None
            state = aggregator.accumulate(state, value)
        yield state

    partial_results = self._df.rdd.mapPartitions(map_partition).collect()

    # Reduce phase: merge all partial results.
    if not partial_results:
        return aggregator.finalize(aggregator.initialize())

    final_state = partial_results[0]
    for state in partial_results[1:]:
        final_state = aggregator.merge(final_state, state)

    return aggregator.finalize(final_state)
515
+
516
def _to_arrow_batches(
    self,
    batch_size: int | None = None,
) -> list["pa.RecordBatch"]:
    """Convert the Spark DataFrame into Arrow record batches.

    Three strategies are tried in order:

    1. The DataFrame's ``_collect_as_arrow()``; its result is returned
       as-is, so ``batch_size`` is not applied on this path.
    2. If that raises ``AttributeError``: ``toPandas()`` followed by
       ``pa.Table.from_pandas`` and ``to_batches(max_chunksize=...)``.
    3. If step 2 raises any ``Exception``: ``_manual_arrow_conversion``.

    Args:
        batch_size: Maximum rows per batch for the fallback paths. A falsy
            value selects ``self._config.arrow_batch_size``.

    Returns:
        List of Arrow record batches.
    """
    import pyarrow as pa

    chunk_rows = batch_size or self._config.arrow_batch_size

    try:
        # Preferred path: let Spark hand back Arrow batches directly.
        return self._df._collect_as_arrow()
    except AttributeError:
        logger.debug("Falling back to Pandas-based Arrow conversion")

    try:
        frame = self._df.toPandas()
        return pa.Table.from_pandas(frame).to_batches(max_chunksize=chunk_rows)
    except Exception as e:
        logger.warning(f"Arrow conversion failed: {e}")
        # Last resort: build the batches row by row.
        return self._manual_arrow_conversion(chunk_rows)
553
+
554
def _manual_arrow_conversion(
    self,
    batch_size: int,
) -> list["pa.RecordBatch"]:
    """Build Arrow record batches row by row from collected partitions.

    Each partition's rows are collected as one list, then sliced into
    chunks of at most ``batch_size`` rows; every chunk becomes one record
    batch using the schema from ``_infer_arrow_schema``. Empty partitions
    produce no batch.

    Args:
        batch_size: Maximum number of rows per record batch. A falsy or
            non-positive value keeps each partition as a single batch.

    Returns:
        List of Arrow record batches.
    """
    import pyarrow as pa

    schema = self._infer_arrow_schema()
    columns = self._columns
    batches = []

    partitions = self._df.rdd.mapPartitions(lambda it: [list(it)]).collect()
    for partition in partitions:
        if not partition:
            continue

        # Honor the requested batch size; previously it was ignored and a
        # whole partition always became one (possibly huge) batch.
        if batch_size and batch_size > 0:
            step = batch_size
        else:
            step = len(partition)

        for start in range(0, len(partition), step):
            chunk = [row.asDict() for row in partition[start:start + step]]
            # Column-oriented layout expected by RecordBatch.from_pydict;
            # columns missing from a row become None.
            data = {
                col: [row_dict.get(col) for row_dict in chunk]
                for col in columns
            }
            batches.append(pa.RecordBatch.from_pydict(data, schema=schema))

    return batches
590
+
591
def _infer_arrow_schema(self) -> "pa.Schema":
    """Derive an Arrow schema from ``self._schema`` (the Spark schema).

    Decimal fields map to ``decimal128`` with the field's precision and
    scale; the listed primitive types map to their Arrow counterparts;
    every other Spark type falls back to Arrow ``string``.
    """
    import pyarrow as pa
    from pyspark.sql.types import (
        BooleanType,
        ByteType,
        DateType,
        DecimalType,
        DoubleType,
        FloatType,
        IntegerType,
        LongType,
        ShortType,
        StringType,
        TimestampType,
    )

    primitive_types = {
        ByteType: pa.int8(),
        ShortType: pa.int16(),
        IntegerType: pa.int32(),
        LongType: pa.int64(),
        FloatType: pa.float32(),
        DoubleType: pa.float64(),
        StringType: pa.string(),
        BooleanType: pa.bool_(),
        DateType: pa.date32(),
        TimestampType: pa.timestamp("us"),
    }

    def to_arrow(spark_type):
        # Decimals carry per-field precision/scale, so they cannot live in
        # the static lookup table.
        if isinstance(spark_type, DecimalType):
            return pa.decimal128(spark_type.precision, spark_type.scale)
        return primitive_types.get(type(spark_type), pa.string())

    return pa.schema(
        [
            pa.field(spec.name, to_arrow(spec.dataType), nullable=spec.nullable)
            for spec in self._schema.fields
        ]
    )
632
+
633
def _repartition(self, num_partitions: int) -> "SparkExecutionEngine":
    """Return a new engine over the DataFrame repartitioned to a given size.

    Args:
        num_partitions: Partition count passed to ``DataFrame.repartition``.

    Returns:
        New engine wrapping the repartitioned DataFrame, sharing this
        engine's config and Spark session.
    """
    return SparkExecutionEngine(
        self._df.repartition(num_partitions),
        self._config,
        self._spark,
    )
644
+
645
def coalesce(self, num_partitions: int) -> "SparkExecutionEngine":
    """Return a new engine over the DataFrame coalesced to fewer partitions.

    Args:
        num_partitions: Partition count passed to ``DataFrame.coalesce``.

    Returns:
        New engine wrapping the coalesced DataFrame, sharing this engine's
        config and Spark session.
    """
    return SparkExecutionEngine(
        self._df.coalesce(num_partitions),
        self._config,
        self._spark,
    )
656
+
657
+ # -------------------------------------------------------------------------
658
+ # Core Operation Overrides (Native Spark)
659
+ # -------------------------------------------------------------------------
660
+
661
def count_rows(self) -> int:
    """Return the row count, computing it with ``DataFrame.count`` at most once.

    Lookup order: the ``_cached_row_count`` attribute, then the engine
    cache, then an actual Spark count whose result is stored in both.
    """
    memo = self._cached_row_count
    if memo is not None:
        return memo

    key = self._cache_key("count_rows")
    total = self._get_cached(key)
    if total is None:
        total = self._df.count()
        self._cached_row_count = total
        self._set_cached(key, total)
    return total
675
+
676
def get_columns(self) -> list[str]:
    """Get column names.

    Returns:
        A new list holding the names in ``self._columns``. A copy is
        returned so that callers mutating the result cannot alter the
        engine's own column list.
    """
    return list(self._columns)
679
+
680
def count_nulls(self, column: str) -> int:
    """Count rows where ``column`` is null, using a Spark filter.

    The result is stored in the engine cache under a key derived from the
    column name and reused on later calls.
    """
    from pyspark.sql import functions as F

    key = self._cache_key("count_nulls", column)
    nulls = self._get_cached(key)
    if nulls is not None:
        return nulls

    is_missing = F.col(column).isNull()
    nulls = self._df.filter(is_missing).count()
    self._set_cached(key, nulls)
    return nulls
692
+
693
def count_nulls_all(self) -> dict[str, int]:
    """Count nulls in every column with a single aggregation.

    One ``sum(when(isNull, 1).otherwise(0))`` expression is built per
    column and all of them are evaluated in one ``agg`` call. The result
    is stored in the engine cache.

    Returns:
        Mapping of column name to null count. An engine with no columns
        yields an empty mapping without touching Spark, because
        ``DataFrame.agg`` cannot be called with zero expressions.
    """
    cache_key = self._cache_key("count_nulls_all")
    cached = self._get_cached(cache_key)
    if cached is not None:
        return cached

    columns = list(self._columns)
    if not columns:
        result = {}
    else:
        # Imported only when Spark work is actually required.
        from pyspark.sql import functions as F

        exprs = [
            F.sum(F.when(F.col(col).isNull(), 1).otherwise(0)).alias(f"{col}_nulls")
            for col in columns
        ]
        row = self._df.agg(*exprs).collect()[0]
        # The sum is None when there are no rows; report that as 0.
        result = {col: row[f"{col}_nulls"] or 0 for col in columns}

    self._set_cached(cache_key, result)
    return result
717
+
718
def count_distinct(self, column: str) -> int:
    """Count distinct values of ``column`` via Spark's ``countDistinct``.

    The result is stored in the engine cache under a key derived from the
    column name and reused on later calls.
    """
    from pyspark.sql import functions as F

    key = self._cache_key("count_distinct", column)
    distinct = self._get_cached(key)
    if distinct is not None:
        return distinct

    first_row = self._df.select(F.countDistinct(column)).collect()[0]
    distinct = first_row[0]
    self._set_cached(key, distinct)
    return distinct
730
+
731
def get_stats(self, column: str) -> dict[str, Any]:
    """Compute summary statistics for ``column`` in one Spark aggregation.

    Returns:
        Dict with keys ``count``, ``null_count``, ``mean``, ``std``,
        ``min`` and ``max``. ``null_count`` is coerced to 0 when the
        aggregation yields a falsy value. The dict is stored in the engine
        cache and reused on later calls.
    """
    from pyspark.sql import functions as F

    key = self._cache_key("get_stats", column)
    hit = self._get_cached(key)
    if hit is not None:
        return hit

    null_flag = F.when(F.col(column).isNull(), 1).otherwise(0)
    aggregates = [
        F.count(column).alias("count"),
        F.sum(null_flag).alias("null_count"),
        F.avg(column).alias("mean"),
        F.stddev(column).alias("std"),
        F.min(column).alias("min"),
        F.max(column).alias("max"),
    ]
    row = self._df.agg(*aggregates).collect()[0]

    names = ("count", "null_count", "mean", "std", "min", "max")
    stats = {name: row[name] for name in names}
    stats["null_count"] = stats["null_count"] or 0

    self._set_cached(key, stats)
    return stats
761
+
762
def get_quantiles(
    self,
    column: str,
    quantiles: list[float],
    *,
    relative_error: float = 0.01,
) -> list[float]:
    """Get quantiles using Spark's ``approxQuantile``.

    Args:
        column: Column to compute quantiles for.
        quantiles: Probabilities to evaluate, passed through to
            ``approxQuantile``.
        relative_error: Relative error passed to ``approxQuantile``.
            Defaults to 0.01, the value that was previously hard-coded.

    Returns:
        The list returned by ``approxQuantile``. Results are stored in the
        engine cache under a key that includes the column, the requested
        quantiles and the relative error, so different precisions never
        share a cache entry.
    """
    cache_key = self._cache_key(
        "get_quantiles", column, tuple(quantiles), relative_error
    )
    cached = self._get_cached(cache_key)
    if cached is not None:
        return cached

    result = self._df.approxQuantile(column, quantiles, relative_error)
    self._set_cached(cache_key, result)
    return result
777
+
778
def get_value_counts(
    self,
    column: str,
    limit: int | None = None,
) -> dict[Any, int]:
    """Count occurrences of each value in ``column`` with a Spark groupBy.

    Args:
        column: Column to group on.
        limit: Maximum number of groups to return, taken from the top of
            the descending-count ordering. A falsy value (``None`` or 0)
            means no limit is applied.

    Returns:
        Dict mapping each value to its count, built in descending-count
        order. The dict is stored in the engine cache.
    """
    from pyspark.sql import functions as F

    key = self._cache_key("get_value_counts", column, limit)
    hit = self._get_cached(key)
    if hit is not None:
        return hit

    grouped = self._df.groupBy(column).agg(F.count("*").alias("count"))
    ranked = grouped.orderBy(F.desc("count"))
    if limit:
        ranked = ranked.limit(limit)

    tally = {}
    for record in ranked.collect():
        tally[record[column]] = record["count"]

    self._set_cached(key, tally)
    return tally
805
+
806
def count_duplicates(self, columns: list[str]) -> int:
    """Count duplicate rows with respect to ``columns``.

    Computed as the total row count (``count_rows``) minus the number of
    distinct combinations of the given columns.

    Args:
        columns: Column names that define row identity.

    Returns:
        Number of rows beyond the first for each distinct combination.
        The value is stored in the engine cache.
    """
    # No pyspark.sql.functions import here: the computation only uses
    # DataFrame methods, so the previous local import was dead code.
    cache_key = self._cache_key("count_duplicates", tuple(columns))
    cached = self._get_cached(cache_key)
    if cached is not None:
        return cached

    total = self.count_rows()
    unique = self._df.select(columns).distinct().count()
    duplicates = total - unique

    self._set_cached(cache_key, duplicates)
    return duplicates
821
+
822
def count_matching_regex(self, column: str, pattern: str) -> int:
    """Count rows whose ``column`` value matches ``pattern`` via ``rlike``.

    The result is stored in the engine cache under a key derived from the
    column and the pattern.
    """
    from pyspark.sql import functions as F

    key = self._cache_key("count_matching_regex", column, pattern)
    matches = self._get_cached(key)
    if matches is not None:
        return matches

    predicate = F.col(column).rlike(pattern)
    matches = self._df.filter(predicate).count()
    self._set_cached(key, matches)
    return matches
834
+
835
def count_in_range(
    self,
    column: str,
    min_value: Any | None = None,
    max_value: Any | None = None,
    inclusive: bool = True,
) -> int:
    """Count rows whose ``column`` value lies within the given bounds.

    Args:
        column: Column to test.
        min_value: Lower bound, or ``None`` for no lower bound.
        max_value: Upper bound, or ``None`` for no upper bound.
        inclusive: When true both bounds use ``>=`` / ``<=``; otherwise
            both use the strict ``>`` / ``<``.

    Returns:
        Number of rows passing the filter. When neither bound is given no
        filter is applied and the total from ``count_rows`` is returned.
        The value is stored in the engine cache.
    """
    from pyspark.sql import functions as F

    key = self._cache_key(
        "count_in_range", column, min_value, max_value, inclusive
    )
    hit = self._get_cached(key)
    if hit is not None:
        return hit

    predicate = None

    if min_value is not None:
        field = F.col(column)
        predicate = field >= min_value if inclusive else field > min_value

    if max_value is not None:
        field = F.col(column)
        upper = field <= max_value if inclusive else field < max_value
        predicate = upper if predicate is None else predicate & upper

    if predicate is not None:
        total = self._df.filter(predicate).count()
    else:
        total = self.count_rows()

    self._set_cached(key, total)
    return total
875
+
876
def count_in_set(self, column: str, values: set[Any]) -> int:
    """Count rows whose ``column`` value is one of ``values`` via ``isin``.

    The result is stored in the engine cache under a key derived from the
    column and a frozen copy of the value set.
    """
    from pyspark.sql import functions as F

    key = self._cache_key("count_in_set", column, frozenset(values))
    members = self._get_cached(key)
    if members is not None:
        return members

    allowed = list(values)
    members = self._df.filter(F.col(column).isin(allowed)).count()
    self._set_cached(key, members)
    return members
888
+
889
+ # -------------------------------------------------------------------------
890
+ # Sampling
891
+ # -------------------------------------------------------------------------
892
+
893
+ def sample(
894
+ self,
895
+ n: int = 1000,
896
+ seed: int | None = None,
897
+ ) -> "SparkExecutionEngine":
898
+ """Create sampled engine using Spark's native sampling.
899
+
900
+ Args:
901
+ n: Target number of rows.
902
+ seed: Random seed.
903
+
904
+ Returns:
905
+ New engine with sampled data.
906
+ """
907
+ row_count = self.count_rows()
908
+
909
+ if row_count <= n:
910
+ return self
911
+
912
+ fraction = min((n * 1.1) / row_count, 1.0)
913
+
914
+ if seed is not None:
915
+ sampled = self._df.sample(
916
+ withReplacement=False,
917
+ fraction=fraction,
918
+ seed=seed,
919
+ )
920
+ else:
921
+ sampled = self._df.sample(withReplacement=False, fraction=fraction)
922
+
923
+ sampled = sampled.limit(n)
924
+
925
+ return SparkExecutionEngine(sampled, self._config, self._spark)
926
+
927
+ # -------------------------------------------------------------------------
928
+ # Spark-Specific Methods
929
+ # -------------------------------------------------------------------------
930
+
931
def persist(self, storage_level: str = "MEMORY_AND_DISK") -> "SparkExecutionEngine":
    """Persist the DataFrame.

    Args:
        storage_level: Name of the Spark storage level. Recognized names
            are ``MEMORY_ONLY``, ``MEMORY_AND_DISK``, ``DISK_ONLY`` and
            ``MEMORY_ONLY_SER``; any other value falls back to
            ``MEMORY_AND_DISK``.

    Returns:
        Self after persisting.
    """
    from pyspark import StorageLevel

    recognized = ("MEMORY_ONLY", "MEMORY_AND_DISK", "DISK_ONLY", "MEMORY_ONLY_SER")

    level = None
    if storage_level in recognized:
        # Resolve lazily by name: building a dict that references every
        # constant up front raises AttributeError on PySpark builds whose
        # StorageLevel lacks one of them, breaking persist() for all
        # arguments.
        level = getattr(StorageLevel, storage_level, None)
        if level is None and storage_level == "MEMORY_ONLY_SER":
            # NOTE(review): Spark's docs state Python objects are always
            # stored serialized, so MEMORY_ONLY is used as the stand-in
            # when the *_SER constant is unavailable - confirm.
            level = StorageLevel.MEMORY_ONLY
    if level is None:
        level = StorageLevel.MEMORY_AND_DISK

    self._df.persist(level)
    return self
952
+
953
def unpersist(self) -> "SparkExecutionEngine":
    """Release the DataFrame's persisted storage.

    Returns:
        This same engine, so calls can be chained.
    """
    frame = self._df
    frame.unpersist()
    return self
961
+
962
def checkpoint(self) -> "SparkExecutionEngine":
    """Checkpoint the DataFrame and wrap the result in a new engine.

    If the config provides a non-empty ``checkpoint_dir``, it is first
    registered on the Spark context via ``setCheckpointDir``.

    Returns:
        New engine wrapping the checkpointed DataFrame.
    """
    if self._config.checkpoint_dir:
        context = self._spark.sparkContext
        context.setCheckpointDir(self._config.checkpoint_dir)

    return SparkExecutionEngine(
        self._df.checkpoint(),
        self._config,
        self._spark,
    )
973
+
974
def explain(self, extended: bool = False) -> str:
    """Get the execution plan.

    ``DataFrame.explain`` is called with standard output redirected to an
    in-memory buffer, and the buffer's contents are returned.

    Args:
        extended: Forwarded to ``DataFrame.explain`` as ``extended``.

    Returns:
        Everything written to standard output during the call.
    """
    import contextlib
    import io

    buffer = io.StringIO()
    # redirect_stdout restores the previous sys.stdout on exit, including
    # when explain() raises, replacing the manual save/restore.
    with contextlib.redirect_stdout(buffer):
        self._df.explain(extended=extended)
    return buffer.getvalue()
993
+
994
def sql(self, query: str) -> "SparkExecutionEngine":
    """Run a SQL query against this DataFrame.

    The DataFrame is registered as a temporary view named after its
    ``id()``, the view name is substituted into ``query`` with
    ``str.format(table=...)``, and the view is dropped again whether or
    not the query succeeds.

    Args:
        query: SQL text containing a ``{table}`` placeholder. Because
            ``str.format`` is used, any other literal braces in the query
            must be doubled (``{{`` / ``}}``).

    Returns:
        New engine wrapping the query result.
    """
    temp_view = f"truthound_temp_{id(self._df)}"
    self._df.createOrReplaceTempView(temp_view)

    try:
        rendered = query.format(table=temp_view)
        outcome = self._spark.sql(rendered)
        return SparkExecutionEngine(outcome, self._config, self._spark)
    finally:
        # Always remove the temporary view, even on failure.
        self._spark.catalog.dropTempView(temp_view)