truthound 1.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (877) hide show
  1. truthound/__init__.py +162 -0
  2. truthound/adapters.py +100 -0
  3. truthound/api.py +365 -0
  4. truthound/audit/__init__.py +248 -0
  5. truthound/audit/core.py +967 -0
  6. truthound/audit/filters.py +620 -0
  7. truthound/audit/formatters.py +707 -0
  8. truthound/audit/logger.py +902 -0
  9. truthound/audit/middleware.py +571 -0
  10. truthound/audit/storage.py +1083 -0
  11. truthound/benchmark/__init__.py +123 -0
  12. truthound/benchmark/base.py +757 -0
  13. truthound/benchmark/comparison.py +635 -0
  14. truthound/benchmark/generators.py +706 -0
  15. truthound/benchmark/reporters.py +718 -0
  16. truthound/benchmark/runner.py +635 -0
  17. truthound/benchmark/scenarios.py +712 -0
  18. truthound/cache.py +252 -0
  19. truthound/checkpoint/__init__.py +136 -0
  20. truthound/checkpoint/actions/__init__.py +164 -0
  21. truthound/checkpoint/actions/base.py +324 -0
  22. truthound/checkpoint/actions/custom.py +234 -0
  23. truthound/checkpoint/actions/discord_notify.py +290 -0
  24. truthound/checkpoint/actions/email_notify.py +405 -0
  25. truthound/checkpoint/actions/github_action.py +406 -0
  26. truthound/checkpoint/actions/opsgenie.py +1499 -0
  27. truthound/checkpoint/actions/pagerduty.py +226 -0
  28. truthound/checkpoint/actions/slack_notify.py +233 -0
  29. truthound/checkpoint/actions/store_result.py +249 -0
  30. truthound/checkpoint/actions/teams_notify.py +1570 -0
  31. truthound/checkpoint/actions/telegram_notify.py +419 -0
  32. truthound/checkpoint/actions/update_docs.py +552 -0
  33. truthound/checkpoint/actions/webhook.py +293 -0
  34. truthound/checkpoint/analytics/__init__.py +147 -0
  35. truthound/checkpoint/analytics/aggregations/__init__.py +23 -0
  36. truthound/checkpoint/analytics/aggregations/rollup.py +481 -0
  37. truthound/checkpoint/analytics/aggregations/time_bucket.py +306 -0
  38. truthound/checkpoint/analytics/analyzers/__init__.py +17 -0
  39. truthound/checkpoint/analytics/analyzers/anomaly.py +386 -0
  40. truthound/checkpoint/analytics/analyzers/base.py +270 -0
  41. truthound/checkpoint/analytics/analyzers/forecast.py +421 -0
  42. truthound/checkpoint/analytics/analyzers/trend.py +314 -0
  43. truthound/checkpoint/analytics/models.py +292 -0
  44. truthound/checkpoint/analytics/protocols.py +549 -0
  45. truthound/checkpoint/analytics/service.py +718 -0
  46. truthound/checkpoint/analytics/stores/__init__.py +16 -0
  47. truthound/checkpoint/analytics/stores/base.py +306 -0
  48. truthound/checkpoint/analytics/stores/memory_store.py +353 -0
  49. truthound/checkpoint/analytics/stores/sqlite_store.py +557 -0
  50. truthound/checkpoint/analytics/stores/timescale_store.py +501 -0
  51. truthound/checkpoint/async_actions.py +794 -0
  52. truthound/checkpoint/async_base.py +708 -0
  53. truthound/checkpoint/async_checkpoint.py +617 -0
  54. truthound/checkpoint/async_runner.py +639 -0
  55. truthound/checkpoint/checkpoint.py +527 -0
  56. truthound/checkpoint/ci/__init__.py +61 -0
  57. truthound/checkpoint/ci/detector.py +355 -0
  58. truthound/checkpoint/ci/reporter.py +436 -0
  59. truthound/checkpoint/ci/templates.py +454 -0
  60. truthound/checkpoint/circuitbreaker/__init__.py +133 -0
  61. truthound/checkpoint/circuitbreaker/breaker.py +542 -0
  62. truthound/checkpoint/circuitbreaker/core.py +252 -0
  63. truthound/checkpoint/circuitbreaker/detection.py +459 -0
  64. truthound/checkpoint/circuitbreaker/middleware.py +389 -0
  65. truthound/checkpoint/circuitbreaker/registry.py +357 -0
  66. truthound/checkpoint/distributed/__init__.py +139 -0
  67. truthound/checkpoint/distributed/backends/__init__.py +35 -0
  68. truthound/checkpoint/distributed/backends/celery_backend.py +503 -0
  69. truthound/checkpoint/distributed/backends/kubernetes_backend.py +696 -0
  70. truthound/checkpoint/distributed/backends/local_backend.py +397 -0
  71. truthound/checkpoint/distributed/backends/ray_backend.py +625 -0
  72. truthound/checkpoint/distributed/base.py +774 -0
  73. truthound/checkpoint/distributed/orchestrator.py +765 -0
  74. truthound/checkpoint/distributed/protocols.py +842 -0
  75. truthound/checkpoint/distributed/registry.py +449 -0
  76. truthound/checkpoint/idempotency/__init__.py +120 -0
  77. truthound/checkpoint/idempotency/core.py +295 -0
  78. truthound/checkpoint/idempotency/fingerprint.py +454 -0
  79. truthound/checkpoint/idempotency/locking.py +604 -0
  80. truthound/checkpoint/idempotency/service.py +592 -0
  81. truthound/checkpoint/idempotency/stores.py +653 -0
  82. truthound/checkpoint/monitoring/__init__.py +134 -0
  83. truthound/checkpoint/monitoring/aggregators/__init__.py +15 -0
  84. truthound/checkpoint/monitoring/aggregators/base.py +372 -0
  85. truthound/checkpoint/monitoring/aggregators/realtime.py +300 -0
  86. truthound/checkpoint/monitoring/aggregators/window.py +493 -0
  87. truthound/checkpoint/monitoring/collectors/__init__.py +17 -0
  88. truthound/checkpoint/monitoring/collectors/base.py +257 -0
  89. truthound/checkpoint/monitoring/collectors/memory_collector.py +617 -0
  90. truthound/checkpoint/monitoring/collectors/prometheus_collector.py +451 -0
  91. truthound/checkpoint/monitoring/collectors/redis_collector.py +518 -0
  92. truthound/checkpoint/monitoring/events.py +410 -0
  93. truthound/checkpoint/monitoring/protocols.py +636 -0
  94. truthound/checkpoint/monitoring/service.py +578 -0
  95. truthound/checkpoint/monitoring/views/__init__.py +17 -0
  96. truthound/checkpoint/monitoring/views/base.py +172 -0
  97. truthound/checkpoint/monitoring/views/queue_view.py +220 -0
  98. truthound/checkpoint/monitoring/views/task_view.py +240 -0
  99. truthound/checkpoint/monitoring/views/worker_view.py +263 -0
  100. truthound/checkpoint/registry.py +337 -0
  101. truthound/checkpoint/runner.py +356 -0
  102. truthound/checkpoint/transaction/__init__.py +133 -0
  103. truthound/checkpoint/transaction/base.py +389 -0
  104. truthound/checkpoint/transaction/compensatable.py +537 -0
  105. truthound/checkpoint/transaction/coordinator.py +576 -0
  106. truthound/checkpoint/transaction/executor.py +622 -0
  107. truthound/checkpoint/transaction/idempotency.py +534 -0
  108. truthound/checkpoint/transaction/saga/__init__.py +143 -0
  109. truthound/checkpoint/transaction/saga/builder.py +584 -0
  110. truthound/checkpoint/transaction/saga/definition.py +515 -0
  111. truthound/checkpoint/transaction/saga/event_store.py +542 -0
  112. truthound/checkpoint/transaction/saga/patterns.py +833 -0
  113. truthound/checkpoint/transaction/saga/runner.py +718 -0
  114. truthound/checkpoint/transaction/saga/state_machine.py +793 -0
  115. truthound/checkpoint/transaction/saga/strategies.py +780 -0
  116. truthound/checkpoint/transaction/saga/testing.py +886 -0
  117. truthound/checkpoint/triggers/__init__.py +58 -0
  118. truthound/checkpoint/triggers/base.py +237 -0
  119. truthound/checkpoint/triggers/event.py +385 -0
  120. truthound/checkpoint/triggers/schedule.py +355 -0
  121. truthound/cli.py +2358 -0
  122. truthound/cli_modules/__init__.py +124 -0
  123. truthound/cli_modules/advanced/__init__.py +45 -0
  124. truthound/cli_modules/advanced/benchmark.py +343 -0
  125. truthound/cli_modules/advanced/docs.py +225 -0
  126. truthound/cli_modules/advanced/lineage.py +209 -0
  127. truthound/cli_modules/advanced/ml.py +320 -0
  128. truthound/cli_modules/advanced/realtime.py +196 -0
  129. truthound/cli_modules/checkpoint/__init__.py +46 -0
  130. truthound/cli_modules/checkpoint/init.py +114 -0
  131. truthound/cli_modules/checkpoint/list.py +71 -0
  132. truthound/cli_modules/checkpoint/run.py +159 -0
  133. truthound/cli_modules/checkpoint/validate.py +67 -0
  134. truthound/cli_modules/common/__init__.py +71 -0
  135. truthound/cli_modules/common/errors.py +414 -0
  136. truthound/cli_modules/common/options.py +419 -0
  137. truthound/cli_modules/common/output.py +507 -0
  138. truthound/cli_modules/common/protocol.py +552 -0
  139. truthound/cli_modules/core/__init__.py +48 -0
  140. truthound/cli_modules/core/check.py +123 -0
  141. truthound/cli_modules/core/compare.py +104 -0
  142. truthound/cli_modules/core/learn.py +57 -0
  143. truthound/cli_modules/core/mask.py +77 -0
  144. truthound/cli_modules/core/profile.py +65 -0
  145. truthound/cli_modules/core/scan.py +61 -0
  146. truthound/cli_modules/profiler/__init__.py +51 -0
  147. truthound/cli_modules/profiler/auto_profile.py +175 -0
  148. truthound/cli_modules/profiler/metadata.py +107 -0
  149. truthound/cli_modules/profiler/suite.py +283 -0
  150. truthound/cli_modules/registry.py +431 -0
  151. truthound/cli_modules/scaffolding/__init__.py +89 -0
  152. truthound/cli_modules/scaffolding/base.py +631 -0
  153. truthound/cli_modules/scaffolding/commands.py +545 -0
  154. truthound/cli_modules/scaffolding/plugins.py +1072 -0
  155. truthound/cli_modules/scaffolding/reporters.py +594 -0
  156. truthound/cli_modules/scaffolding/validators.py +1127 -0
  157. truthound/common/__init__.py +18 -0
  158. truthound/common/resilience/__init__.py +130 -0
  159. truthound/common/resilience/bulkhead.py +266 -0
  160. truthound/common/resilience/circuit_breaker.py +516 -0
  161. truthound/common/resilience/composite.py +332 -0
  162. truthound/common/resilience/config.py +292 -0
  163. truthound/common/resilience/protocols.py +217 -0
  164. truthound/common/resilience/rate_limiter.py +404 -0
  165. truthound/common/resilience/retry.py +341 -0
  166. truthound/datadocs/__init__.py +260 -0
  167. truthound/datadocs/base.py +571 -0
  168. truthound/datadocs/builder.py +761 -0
  169. truthound/datadocs/charts.py +764 -0
  170. truthound/datadocs/dashboard/__init__.py +63 -0
  171. truthound/datadocs/dashboard/app.py +576 -0
  172. truthound/datadocs/dashboard/components.py +584 -0
  173. truthound/datadocs/dashboard/state.py +240 -0
  174. truthound/datadocs/engine/__init__.py +46 -0
  175. truthound/datadocs/engine/context.py +376 -0
  176. truthound/datadocs/engine/pipeline.py +618 -0
  177. truthound/datadocs/engine/registry.py +469 -0
  178. truthound/datadocs/exporters/__init__.py +49 -0
  179. truthound/datadocs/exporters/base.py +198 -0
  180. truthound/datadocs/exporters/html.py +178 -0
  181. truthound/datadocs/exporters/json_exporter.py +253 -0
  182. truthound/datadocs/exporters/markdown.py +284 -0
  183. truthound/datadocs/exporters/pdf.py +392 -0
  184. truthound/datadocs/i18n/__init__.py +86 -0
  185. truthound/datadocs/i18n/catalog.py +960 -0
  186. truthound/datadocs/i18n/formatting.py +505 -0
  187. truthound/datadocs/i18n/loader.py +256 -0
  188. truthound/datadocs/i18n/plurals.py +378 -0
  189. truthound/datadocs/renderers/__init__.py +42 -0
  190. truthound/datadocs/renderers/base.py +401 -0
  191. truthound/datadocs/renderers/custom.py +342 -0
  192. truthound/datadocs/renderers/jinja.py +697 -0
  193. truthound/datadocs/sections.py +736 -0
  194. truthound/datadocs/styles.py +931 -0
  195. truthound/datadocs/themes/__init__.py +101 -0
  196. truthound/datadocs/themes/base.py +336 -0
  197. truthound/datadocs/themes/default.py +417 -0
  198. truthound/datadocs/themes/enterprise.py +419 -0
  199. truthound/datadocs/themes/loader.py +336 -0
  200. truthound/datadocs/themes.py +301 -0
  201. truthound/datadocs/transformers/__init__.py +57 -0
  202. truthound/datadocs/transformers/base.py +268 -0
  203. truthound/datadocs/transformers/enrichers.py +544 -0
  204. truthound/datadocs/transformers/filters.py +447 -0
  205. truthound/datadocs/transformers/i18n.py +468 -0
  206. truthound/datadocs/versioning/__init__.py +62 -0
  207. truthound/datadocs/versioning/diff.py +639 -0
  208. truthound/datadocs/versioning/storage.py +497 -0
  209. truthound/datadocs/versioning/version.py +358 -0
  210. truthound/datasources/__init__.py +223 -0
  211. truthound/datasources/_async_protocols.py +222 -0
  212. truthound/datasources/_protocols.py +159 -0
  213. truthound/datasources/adapters.py +428 -0
  214. truthound/datasources/async_base.py +599 -0
  215. truthound/datasources/async_factory.py +511 -0
  216. truthound/datasources/base.py +516 -0
  217. truthound/datasources/factory.py +433 -0
  218. truthound/datasources/nosql/__init__.py +47 -0
  219. truthound/datasources/nosql/base.py +487 -0
  220. truthound/datasources/nosql/elasticsearch.py +801 -0
  221. truthound/datasources/nosql/mongodb.py +636 -0
  222. truthound/datasources/pandas_optimized.py +582 -0
  223. truthound/datasources/pandas_source.py +216 -0
  224. truthound/datasources/polars_source.py +395 -0
  225. truthound/datasources/spark_source.py +479 -0
  226. truthound/datasources/sql/__init__.py +154 -0
  227. truthound/datasources/sql/base.py +710 -0
  228. truthound/datasources/sql/bigquery.py +410 -0
  229. truthound/datasources/sql/cloud_base.py +199 -0
  230. truthound/datasources/sql/databricks.py +471 -0
  231. truthound/datasources/sql/mysql.py +316 -0
  232. truthound/datasources/sql/oracle.py +427 -0
  233. truthound/datasources/sql/postgresql.py +321 -0
  234. truthound/datasources/sql/redshift.py +479 -0
  235. truthound/datasources/sql/snowflake.py +439 -0
  236. truthound/datasources/sql/sqlite.py +286 -0
  237. truthound/datasources/sql/sqlserver.py +437 -0
  238. truthound/datasources/streaming/__init__.py +47 -0
  239. truthound/datasources/streaming/base.py +350 -0
  240. truthound/datasources/streaming/kafka.py +670 -0
  241. truthound/decorators.py +98 -0
  242. truthound/docs/__init__.py +69 -0
  243. truthound/docs/extractor.py +971 -0
  244. truthound/docs/generator.py +601 -0
  245. truthound/docs/parser.py +1037 -0
  246. truthound/docs/renderer.py +999 -0
  247. truthound/drift/__init__.py +22 -0
  248. truthound/drift/compare.py +189 -0
  249. truthound/drift/detectors.py +464 -0
  250. truthound/drift/report.py +160 -0
  251. truthound/execution/__init__.py +65 -0
  252. truthound/execution/_protocols.py +324 -0
  253. truthound/execution/base.py +576 -0
  254. truthound/execution/distributed/__init__.py +179 -0
  255. truthound/execution/distributed/aggregations.py +731 -0
  256. truthound/execution/distributed/arrow_bridge.py +817 -0
  257. truthound/execution/distributed/base.py +550 -0
  258. truthound/execution/distributed/dask_engine.py +976 -0
  259. truthound/execution/distributed/mixins.py +766 -0
  260. truthound/execution/distributed/protocols.py +756 -0
  261. truthound/execution/distributed/ray_engine.py +1127 -0
  262. truthound/execution/distributed/registry.py +446 -0
  263. truthound/execution/distributed/spark_engine.py +1011 -0
  264. truthound/execution/distributed/validator_adapter.py +682 -0
  265. truthound/execution/pandas_engine.py +401 -0
  266. truthound/execution/polars_engine.py +497 -0
  267. truthound/execution/pushdown/__init__.py +230 -0
  268. truthound/execution/pushdown/ast.py +1550 -0
  269. truthound/execution/pushdown/builder.py +1550 -0
  270. truthound/execution/pushdown/dialects.py +1072 -0
  271. truthound/execution/pushdown/executor.py +829 -0
  272. truthound/execution/pushdown/optimizer.py +1041 -0
  273. truthound/execution/sql_engine.py +518 -0
  274. truthound/infrastructure/__init__.py +189 -0
  275. truthound/infrastructure/audit.py +1515 -0
  276. truthound/infrastructure/config.py +1133 -0
  277. truthound/infrastructure/encryption.py +1132 -0
  278. truthound/infrastructure/logging.py +1503 -0
  279. truthound/infrastructure/metrics.py +1220 -0
  280. truthound/lineage/__init__.py +89 -0
  281. truthound/lineage/base.py +746 -0
  282. truthound/lineage/impact_analysis.py +474 -0
  283. truthound/lineage/integrations/__init__.py +22 -0
  284. truthound/lineage/integrations/openlineage.py +548 -0
  285. truthound/lineage/tracker.py +512 -0
  286. truthound/lineage/visualization/__init__.py +33 -0
  287. truthound/lineage/visualization/protocols.py +145 -0
  288. truthound/lineage/visualization/renderers/__init__.py +20 -0
  289. truthound/lineage/visualization/renderers/cytoscape.py +329 -0
  290. truthound/lineage/visualization/renderers/d3.py +331 -0
  291. truthound/lineage/visualization/renderers/graphviz.py +276 -0
  292. truthound/lineage/visualization/renderers/mermaid.py +308 -0
  293. truthound/maskers.py +113 -0
  294. truthound/ml/__init__.py +124 -0
  295. truthound/ml/anomaly_models/__init__.py +31 -0
  296. truthound/ml/anomaly_models/ensemble.py +362 -0
  297. truthound/ml/anomaly_models/isolation_forest.py +444 -0
  298. truthound/ml/anomaly_models/statistical.py +392 -0
  299. truthound/ml/base.py +1178 -0
  300. truthound/ml/drift_detection/__init__.py +26 -0
  301. truthound/ml/drift_detection/concept.py +381 -0
  302. truthound/ml/drift_detection/distribution.py +361 -0
  303. truthound/ml/drift_detection/feature.py +442 -0
  304. truthound/ml/drift_detection/multivariate.py +495 -0
  305. truthound/ml/monitoring/__init__.py +88 -0
  306. truthound/ml/monitoring/alerting/__init__.py +33 -0
  307. truthound/ml/monitoring/alerting/handlers.py +427 -0
  308. truthound/ml/monitoring/alerting/rules.py +508 -0
  309. truthound/ml/monitoring/collectors/__init__.py +19 -0
  310. truthound/ml/monitoring/collectors/composite.py +105 -0
  311. truthound/ml/monitoring/collectors/drift.py +324 -0
  312. truthound/ml/monitoring/collectors/performance.py +179 -0
  313. truthound/ml/monitoring/collectors/quality.py +369 -0
  314. truthound/ml/monitoring/monitor.py +536 -0
  315. truthound/ml/monitoring/protocols.py +451 -0
  316. truthound/ml/monitoring/stores/__init__.py +15 -0
  317. truthound/ml/monitoring/stores/memory.py +201 -0
  318. truthound/ml/monitoring/stores/prometheus.py +296 -0
  319. truthound/ml/rule_learning/__init__.py +25 -0
  320. truthound/ml/rule_learning/constraint_miner.py +443 -0
  321. truthound/ml/rule_learning/pattern_learner.py +499 -0
  322. truthound/ml/rule_learning/profile_learner.py +462 -0
  323. truthound/multitenancy/__init__.py +326 -0
  324. truthound/multitenancy/core.py +852 -0
  325. truthound/multitenancy/integration.py +597 -0
  326. truthound/multitenancy/isolation.py +630 -0
  327. truthound/multitenancy/manager.py +770 -0
  328. truthound/multitenancy/middleware.py +765 -0
  329. truthound/multitenancy/quota.py +537 -0
  330. truthound/multitenancy/resolvers.py +603 -0
  331. truthound/multitenancy/storage.py +703 -0
  332. truthound/observability/__init__.py +307 -0
  333. truthound/observability/context.py +531 -0
  334. truthound/observability/instrumentation.py +611 -0
  335. truthound/observability/logging.py +887 -0
  336. truthound/observability/metrics.py +1157 -0
  337. truthound/observability/tracing/__init__.py +178 -0
  338. truthound/observability/tracing/baggage.py +310 -0
  339. truthound/observability/tracing/config.py +426 -0
  340. truthound/observability/tracing/exporter.py +787 -0
  341. truthound/observability/tracing/integration.py +1018 -0
  342. truthound/observability/tracing/otel/__init__.py +146 -0
  343. truthound/observability/tracing/otel/adapter.py +982 -0
  344. truthound/observability/tracing/otel/bridge.py +1177 -0
  345. truthound/observability/tracing/otel/compat.py +681 -0
  346. truthound/observability/tracing/otel/config.py +691 -0
  347. truthound/observability/tracing/otel/detection.py +327 -0
  348. truthound/observability/tracing/otel/protocols.py +426 -0
  349. truthound/observability/tracing/processor.py +561 -0
  350. truthound/observability/tracing/propagator.py +757 -0
  351. truthound/observability/tracing/provider.py +569 -0
  352. truthound/observability/tracing/resource.py +515 -0
  353. truthound/observability/tracing/sampler.py +487 -0
  354. truthound/observability/tracing/span.py +676 -0
  355. truthound/plugins/__init__.py +198 -0
  356. truthound/plugins/base.py +599 -0
  357. truthound/plugins/cli.py +680 -0
  358. truthound/plugins/dependencies/__init__.py +42 -0
  359. truthound/plugins/dependencies/graph.py +422 -0
  360. truthound/plugins/dependencies/resolver.py +417 -0
  361. truthound/plugins/discovery.py +379 -0
  362. truthound/plugins/docs/__init__.py +46 -0
  363. truthound/plugins/docs/extractor.py +444 -0
  364. truthound/plugins/docs/renderer.py +499 -0
  365. truthound/plugins/enterprise_manager.py +877 -0
  366. truthound/plugins/examples/__init__.py +19 -0
  367. truthound/plugins/examples/custom_validators.py +317 -0
  368. truthound/plugins/examples/slack_notifier.py +312 -0
  369. truthound/plugins/examples/xml_reporter.py +254 -0
  370. truthound/plugins/hooks.py +558 -0
  371. truthound/plugins/lifecycle/__init__.py +43 -0
  372. truthound/plugins/lifecycle/hot_reload.py +402 -0
  373. truthound/plugins/lifecycle/manager.py +371 -0
  374. truthound/plugins/manager.py +736 -0
  375. truthound/plugins/registry.py +338 -0
  376. truthound/plugins/security/__init__.py +93 -0
  377. truthound/plugins/security/exceptions.py +332 -0
  378. truthound/plugins/security/policies.py +348 -0
  379. truthound/plugins/security/protocols.py +643 -0
  380. truthound/plugins/security/sandbox/__init__.py +45 -0
  381. truthound/plugins/security/sandbox/context.py +158 -0
  382. truthound/plugins/security/sandbox/engines/__init__.py +19 -0
  383. truthound/plugins/security/sandbox/engines/container.py +379 -0
  384. truthound/plugins/security/sandbox/engines/noop.py +144 -0
  385. truthound/plugins/security/sandbox/engines/process.py +336 -0
  386. truthound/plugins/security/sandbox/factory.py +211 -0
  387. truthound/plugins/security/signing/__init__.py +57 -0
  388. truthound/plugins/security/signing/service.py +330 -0
  389. truthound/plugins/security/signing/trust_store.py +368 -0
  390. truthound/plugins/security/signing/verifier.py +459 -0
  391. truthound/plugins/versioning/__init__.py +41 -0
  392. truthound/plugins/versioning/constraints.py +297 -0
  393. truthound/plugins/versioning/resolver.py +329 -0
  394. truthound/profiler/__init__.py +1729 -0
  395. truthound/profiler/_lazy.py +452 -0
  396. truthound/profiler/ab_testing/__init__.py +80 -0
  397. truthound/profiler/ab_testing/analysis.py +449 -0
  398. truthound/profiler/ab_testing/base.py +257 -0
  399. truthound/profiler/ab_testing/experiment.py +395 -0
  400. truthound/profiler/ab_testing/tracking.py +368 -0
  401. truthound/profiler/auto_threshold.py +1170 -0
  402. truthound/profiler/base.py +579 -0
  403. truthound/profiler/cache_patterns.py +911 -0
  404. truthound/profiler/caching.py +1303 -0
  405. truthound/profiler/column_profiler.py +712 -0
  406. truthound/profiler/comparison.py +1007 -0
  407. truthound/profiler/custom_patterns.py +1170 -0
  408. truthound/profiler/dashboard/__init__.py +50 -0
  409. truthound/profiler/dashboard/app.py +476 -0
  410. truthound/profiler/dashboard/components.py +457 -0
  411. truthound/profiler/dashboard/config.py +72 -0
  412. truthound/profiler/distributed/__init__.py +83 -0
  413. truthound/profiler/distributed/base.py +281 -0
  414. truthound/profiler/distributed/dask_backend.py +498 -0
  415. truthound/profiler/distributed/local_backend.py +293 -0
  416. truthound/profiler/distributed/profiler.py +304 -0
  417. truthound/profiler/distributed/ray_backend.py +374 -0
  418. truthound/profiler/distributed/spark_backend.py +375 -0
  419. truthound/profiler/distributed.py +1366 -0
  420. truthound/profiler/enterprise_sampling.py +1065 -0
  421. truthound/profiler/errors.py +488 -0
  422. truthound/profiler/evolution/__init__.py +91 -0
  423. truthound/profiler/evolution/alerts.py +426 -0
  424. truthound/profiler/evolution/changes.py +206 -0
  425. truthound/profiler/evolution/compatibility.py +365 -0
  426. truthound/profiler/evolution/detector.py +372 -0
  427. truthound/profiler/evolution/protocols.py +121 -0
  428. truthound/profiler/generators/__init__.py +48 -0
  429. truthound/profiler/generators/base.py +384 -0
  430. truthound/profiler/generators/ml_rules.py +375 -0
  431. truthound/profiler/generators/pattern_rules.py +384 -0
  432. truthound/profiler/generators/schema_rules.py +267 -0
  433. truthound/profiler/generators/stats_rules.py +324 -0
  434. truthound/profiler/generators/suite_generator.py +857 -0
  435. truthound/profiler/i18n.py +1542 -0
  436. truthound/profiler/incremental.py +554 -0
  437. truthound/profiler/incremental_validation.py +1710 -0
  438. truthound/profiler/integration/__init__.py +73 -0
  439. truthound/profiler/integration/adapters.py +345 -0
  440. truthound/profiler/integration/context.py +371 -0
  441. truthound/profiler/integration/executor.py +527 -0
  442. truthound/profiler/integration/naming.py +75 -0
  443. truthound/profiler/integration/protocols.py +243 -0
  444. truthound/profiler/memory.py +1185 -0
  445. truthound/profiler/migration/__init__.py +60 -0
  446. truthound/profiler/migration/base.py +345 -0
  447. truthound/profiler/migration/manager.py +444 -0
  448. truthound/profiler/migration/v1_0_to_v1_1.py +484 -0
  449. truthound/profiler/ml/__init__.py +73 -0
  450. truthound/profiler/ml/base.py +244 -0
  451. truthound/profiler/ml/classifier.py +507 -0
  452. truthound/profiler/ml/feature_extraction.py +604 -0
  453. truthound/profiler/ml/pretrained.py +448 -0
  454. truthound/profiler/ml_inference.py +1276 -0
  455. truthound/profiler/native_patterns.py +815 -0
  456. truthound/profiler/observability.py +1184 -0
  457. truthound/profiler/process_timeout.py +1566 -0
  458. truthound/profiler/progress.py +568 -0
  459. truthound/profiler/progress_callbacks.py +1734 -0
  460. truthound/profiler/quality.py +1345 -0
  461. truthound/profiler/resilience.py +1180 -0
  462. truthound/profiler/sampled_matcher.py +794 -0
  463. truthound/profiler/sampling.py +1288 -0
  464. truthound/profiler/scheduling/__init__.py +82 -0
  465. truthound/profiler/scheduling/protocols.py +214 -0
  466. truthound/profiler/scheduling/scheduler.py +474 -0
  467. truthound/profiler/scheduling/storage.py +457 -0
  468. truthound/profiler/scheduling/triggers.py +449 -0
  469. truthound/profiler/schema.py +603 -0
  470. truthound/profiler/streaming.py +685 -0
  471. truthound/profiler/streaming_patterns.py +1354 -0
  472. truthound/profiler/suite_cli.py +625 -0
  473. truthound/profiler/suite_config.py +789 -0
  474. truthound/profiler/suite_export.py +1268 -0
  475. truthound/profiler/table_profiler.py +547 -0
  476. truthound/profiler/timeout.py +565 -0
  477. truthound/profiler/validation.py +1532 -0
  478. truthound/profiler/visualization/__init__.py +118 -0
  479. truthound/profiler/visualization/base.py +346 -0
  480. truthound/profiler/visualization/generator.py +1259 -0
  481. truthound/profiler/visualization/plotly_renderer.py +811 -0
  482. truthound/profiler/visualization/renderers.py +669 -0
  483. truthound/profiler/visualization/sections.py +540 -0
  484. truthound/profiler/visualization.py +2122 -0
  485. truthound/profiler/yaml_validation.py +1151 -0
  486. truthound/py.typed +0 -0
  487. truthound/ratelimit/__init__.py +248 -0
  488. truthound/ratelimit/algorithms.py +1108 -0
  489. truthound/ratelimit/core.py +573 -0
  490. truthound/ratelimit/integration.py +532 -0
  491. truthound/ratelimit/limiter.py +663 -0
  492. truthound/ratelimit/middleware.py +700 -0
  493. truthound/ratelimit/policy.py +792 -0
  494. truthound/ratelimit/storage.py +763 -0
  495. truthound/rbac/__init__.py +340 -0
  496. truthound/rbac/core.py +976 -0
  497. truthound/rbac/integration.py +760 -0
  498. truthound/rbac/manager.py +1052 -0
  499. truthound/rbac/middleware.py +842 -0
  500. truthound/rbac/policy.py +954 -0
  501. truthound/rbac/storage.py +878 -0
  502. truthound/realtime/__init__.py +141 -0
  503. truthound/realtime/adapters/__init__.py +43 -0
  504. truthound/realtime/adapters/base.py +533 -0
  505. truthound/realtime/adapters/kafka.py +487 -0
  506. truthound/realtime/adapters/kinesis.py +479 -0
  507. truthound/realtime/adapters/mock.py +243 -0
  508. truthound/realtime/base.py +553 -0
  509. truthound/realtime/factory.py +382 -0
  510. truthound/realtime/incremental.py +660 -0
  511. truthound/realtime/processing/__init__.py +67 -0
  512. truthound/realtime/processing/exactly_once.py +575 -0
  513. truthound/realtime/processing/state.py +547 -0
  514. truthound/realtime/processing/windows.py +647 -0
  515. truthound/realtime/protocols.py +569 -0
  516. truthound/realtime/streaming.py +605 -0
  517. truthound/realtime/testing/__init__.py +32 -0
  518. truthound/realtime/testing/containers.py +615 -0
  519. truthound/realtime/testing/fixtures.py +484 -0
  520. truthound/report.py +280 -0
  521. truthound/reporters/__init__.py +46 -0
  522. truthound/reporters/_protocols.py +30 -0
  523. truthound/reporters/base.py +324 -0
  524. truthound/reporters/ci/__init__.py +66 -0
  525. truthound/reporters/ci/azure.py +436 -0
  526. truthound/reporters/ci/base.py +509 -0
  527. truthound/reporters/ci/bitbucket.py +567 -0
  528. truthound/reporters/ci/circleci.py +547 -0
  529. truthound/reporters/ci/detection.py +364 -0
  530. truthound/reporters/ci/factory.py +182 -0
  531. truthound/reporters/ci/github.py +388 -0
  532. truthound/reporters/ci/gitlab.py +471 -0
  533. truthound/reporters/ci/jenkins.py +525 -0
  534. truthound/reporters/console_reporter.py +299 -0
  535. truthound/reporters/factory.py +211 -0
  536. truthound/reporters/html_reporter.py +524 -0
  537. truthound/reporters/json_reporter.py +256 -0
  538. truthound/reporters/markdown_reporter.py +280 -0
  539. truthound/reporters/sdk/__init__.py +174 -0
  540. truthound/reporters/sdk/builder.py +558 -0
  541. truthound/reporters/sdk/mixins.py +1150 -0
  542. truthound/reporters/sdk/schema.py +1493 -0
  543. truthound/reporters/sdk/templates.py +666 -0
  544. truthound/reporters/sdk/testing.py +968 -0
  545. truthound/scanners.py +170 -0
  546. truthound/scheduling/__init__.py +122 -0
  547. truthound/scheduling/cron.py +1136 -0
  548. truthound/scheduling/presets.py +212 -0
  549. truthound/schema.py +275 -0
  550. truthound/secrets/__init__.py +173 -0
  551. truthound/secrets/base.py +618 -0
  552. truthound/secrets/cloud.py +682 -0
  553. truthound/secrets/integration.py +507 -0
  554. truthound/secrets/manager.py +633 -0
  555. truthound/secrets/oidc/__init__.py +172 -0
  556. truthound/secrets/oidc/base.py +902 -0
  557. truthound/secrets/oidc/credential_provider.py +623 -0
  558. truthound/secrets/oidc/exchangers.py +1001 -0
  559. truthound/secrets/oidc/github/__init__.py +110 -0
  560. truthound/secrets/oidc/github/claims.py +718 -0
  561. truthound/secrets/oidc/github/enhanced_provider.py +693 -0
  562. truthound/secrets/oidc/github/trust_policy.py +742 -0
  563. truthound/secrets/oidc/github/verification.py +723 -0
  564. truthound/secrets/oidc/github/workflow.py +691 -0
  565. truthound/secrets/oidc/providers.py +825 -0
  566. truthound/secrets/providers.py +506 -0
  567. truthound/secrets/resolver.py +495 -0
  568. truthound/stores/__init__.py +177 -0
  569. truthound/stores/backends/__init__.py +18 -0
  570. truthound/stores/backends/_protocols.py +340 -0
  571. truthound/stores/backends/azure_blob.py +530 -0
  572. truthound/stores/backends/concurrent_filesystem.py +915 -0
  573. truthound/stores/backends/connection_pool.py +1365 -0
  574. truthound/stores/backends/database.py +743 -0
  575. truthound/stores/backends/filesystem.py +538 -0
  576. truthound/stores/backends/gcs.py +399 -0
  577. truthound/stores/backends/memory.py +354 -0
  578. truthound/stores/backends/s3.py +434 -0
  579. truthound/stores/backpressure/__init__.py +84 -0
  580. truthound/stores/backpressure/base.py +375 -0
  581. truthound/stores/backpressure/circuit_breaker.py +434 -0
  582. truthound/stores/backpressure/monitor.py +376 -0
  583. truthound/stores/backpressure/strategies.py +677 -0
  584. truthound/stores/base.py +551 -0
  585. truthound/stores/batching/__init__.py +65 -0
  586. truthound/stores/batching/base.py +305 -0
  587. truthound/stores/batching/buffer.py +370 -0
  588. truthound/stores/batching/store.py +248 -0
  589. truthound/stores/batching/writer.py +521 -0
  590. truthound/stores/caching/__init__.py +60 -0
  591. truthound/stores/caching/backends.py +684 -0
  592. truthound/stores/caching/base.py +356 -0
  593. truthound/stores/caching/store.py +305 -0
  594. truthound/stores/compression/__init__.py +193 -0
  595. truthound/stores/compression/adaptive.py +694 -0
  596. truthound/stores/compression/base.py +514 -0
  597. truthound/stores/compression/pipeline.py +868 -0
  598. truthound/stores/compression/providers.py +672 -0
  599. truthound/stores/compression/streaming.py +832 -0
  600. truthound/stores/concurrency/__init__.py +81 -0
  601. truthound/stores/concurrency/atomic.py +556 -0
  602. truthound/stores/concurrency/index.py +775 -0
  603. truthound/stores/concurrency/locks.py +576 -0
  604. truthound/stores/concurrency/manager.py +482 -0
  605. truthound/stores/encryption/__init__.py +297 -0
  606. truthound/stores/encryption/base.py +952 -0
  607. truthound/stores/encryption/keys.py +1191 -0
  608. truthound/stores/encryption/pipeline.py +903 -0
  609. truthound/stores/encryption/providers.py +953 -0
  610. truthound/stores/encryption/streaming.py +950 -0
  611. truthound/stores/expectations.py +227 -0
  612. truthound/stores/factory.py +246 -0
  613. truthound/stores/migration/__init__.py +75 -0
  614. truthound/stores/migration/base.py +480 -0
  615. truthound/stores/migration/manager.py +347 -0
  616. truthound/stores/migration/registry.py +382 -0
  617. truthound/stores/migration/store.py +559 -0
  618. truthound/stores/observability/__init__.py +106 -0
  619. truthound/stores/observability/audit.py +718 -0
  620. truthound/stores/observability/config.py +270 -0
  621. truthound/stores/observability/factory.py +208 -0
  622. truthound/stores/observability/metrics.py +636 -0
  623. truthound/stores/observability/protocols.py +410 -0
  624. truthound/stores/observability/store.py +570 -0
  625. truthound/stores/observability/tracing.py +784 -0
  626. truthound/stores/replication/__init__.py +76 -0
  627. truthound/stores/replication/base.py +260 -0
  628. truthound/stores/replication/monitor.py +269 -0
  629. truthound/stores/replication/store.py +439 -0
  630. truthound/stores/replication/syncer.py +391 -0
  631. truthound/stores/results.py +359 -0
  632. truthound/stores/retention/__init__.py +77 -0
  633. truthound/stores/retention/base.py +378 -0
  634. truthound/stores/retention/policies.py +621 -0
  635. truthound/stores/retention/scheduler.py +279 -0
  636. truthound/stores/retention/store.py +526 -0
  637. truthound/stores/streaming/__init__.py +138 -0
  638. truthound/stores/streaming/base.py +801 -0
  639. truthound/stores/streaming/database.py +984 -0
  640. truthound/stores/streaming/filesystem.py +719 -0
  641. truthound/stores/streaming/reader.py +629 -0
  642. truthound/stores/streaming/s3.py +843 -0
  643. truthound/stores/streaming/writer.py +790 -0
  644. truthound/stores/tiering/__init__.py +108 -0
  645. truthound/stores/tiering/base.py +462 -0
  646. truthound/stores/tiering/manager.py +249 -0
  647. truthound/stores/tiering/policies.py +692 -0
  648. truthound/stores/tiering/store.py +526 -0
  649. truthound/stores/versioning/__init__.py +56 -0
  650. truthound/stores/versioning/base.py +376 -0
  651. truthound/stores/versioning/store.py +660 -0
  652. truthound/stores/versioning/strategies.py +353 -0
  653. truthound/types.py +56 -0
  654. truthound/validators/__init__.py +774 -0
  655. truthound/validators/aggregate/__init__.py +27 -0
  656. truthound/validators/aggregate/central.py +116 -0
  657. truthound/validators/aggregate/extremes.py +116 -0
  658. truthound/validators/aggregate/spread.py +118 -0
  659. truthound/validators/aggregate/sum.py +64 -0
  660. truthound/validators/aggregate/type.py +78 -0
  661. truthound/validators/anomaly/__init__.py +93 -0
  662. truthound/validators/anomaly/base.py +431 -0
  663. truthound/validators/anomaly/ml_based.py +1190 -0
  664. truthound/validators/anomaly/multivariate.py +647 -0
  665. truthound/validators/anomaly/statistical.py +599 -0
  666. truthound/validators/base.py +1089 -0
  667. truthound/validators/business_rule/__init__.py +46 -0
  668. truthound/validators/business_rule/base.py +147 -0
  669. truthound/validators/business_rule/checksum.py +509 -0
  670. truthound/validators/business_rule/financial.py +526 -0
  671. truthound/validators/cache.py +733 -0
  672. truthound/validators/completeness/__init__.py +39 -0
  673. truthound/validators/completeness/conditional.py +73 -0
  674. truthound/validators/completeness/default.py +98 -0
  675. truthound/validators/completeness/empty.py +103 -0
  676. truthound/validators/completeness/nan.py +337 -0
  677. truthound/validators/completeness/null.py +152 -0
  678. truthound/validators/cross_table/__init__.py +17 -0
  679. truthound/validators/cross_table/aggregate.py +333 -0
  680. truthound/validators/cross_table/row_count.py +122 -0
  681. truthound/validators/datetime/__init__.py +29 -0
  682. truthound/validators/datetime/format.py +78 -0
  683. truthound/validators/datetime/freshness.py +269 -0
  684. truthound/validators/datetime/order.py +73 -0
  685. truthound/validators/datetime/parseable.py +185 -0
  686. truthound/validators/datetime/range.py +202 -0
  687. truthound/validators/datetime/timezone.py +69 -0
  688. truthound/validators/distribution/__init__.py +49 -0
  689. truthound/validators/distribution/distribution.py +128 -0
  690. truthound/validators/distribution/monotonic.py +119 -0
  691. truthound/validators/distribution/outlier.py +178 -0
  692. truthound/validators/distribution/quantile.py +80 -0
  693. truthound/validators/distribution/range.py +254 -0
  694. truthound/validators/distribution/set.py +125 -0
  695. truthound/validators/distribution/statistical.py +459 -0
  696. truthound/validators/drift/__init__.py +79 -0
  697. truthound/validators/drift/base.py +427 -0
  698. truthound/validators/drift/multi_feature.py +401 -0
  699. truthound/validators/drift/numeric.py +395 -0
  700. truthound/validators/drift/psi.py +446 -0
  701. truthound/validators/drift/statistical.py +510 -0
  702. truthound/validators/enterprise.py +1658 -0
  703. truthound/validators/geospatial/__init__.py +80 -0
  704. truthound/validators/geospatial/base.py +97 -0
  705. truthound/validators/geospatial/boundary.py +238 -0
  706. truthound/validators/geospatial/coordinate.py +351 -0
  707. truthound/validators/geospatial/distance.py +399 -0
  708. truthound/validators/geospatial/polygon.py +665 -0
  709. truthound/validators/i18n/__init__.py +308 -0
  710. truthound/validators/i18n/bidi.py +571 -0
  711. truthound/validators/i18n/catalogs.py +570 -0
  712. truthound/validators/i18n/dialects.py +763 -0
  713. truthound/validators/i18n/extended_catalogs.py +549 -0
  714. truthound/validators/i18n/formatting.py +1434 -0
  715. truthound/validators/i18n/loader.py +1020 -0
  716. truthound/validators/i18n/messages.py +521 -0
  717. truthound/validators/i18n/plural.py +683 -0
  718. truthound/validators/i18n/protocols.py +855 -0
  719. truthound/validators/i18n/tms.py +1162 -0
  720. truthound/validators/localization/__init__.py +53 -0
  721. truthound/validators/localization/base.py +122 -0
  722. truthound/validators/localization/chinese.py +362 -0
  723. truthound/validators/localization/japanese.py +275 -0
  724. truthound/validators/localization/korean.py +524 -0
  725. truthound/validators/memory/__init__.py +94 -0
  726. truthound/validators/memory/approximate_knn.py +506 -0
  727. truthound/validators/memory/base.py +547 -0
  728. truthound/validators/memory/sgd_online.py +719 -0
  729. truthound/validators/memory/streaming_ecdf.py +753 -0
  730. truthound/validators/ml_feature/__init__.py +54 -0
  731. truthound/validators/ml_feature/base.py +249 -0
  732. truthound/validators/ml_feature/correlation.py +299 -0
  733. truthound/validators/ml_feature/leakage.py +344 -0
  734. truthound/validators/ml_feature/null_impact.py +270 -0
  735. truthound/validators/ml_feature/scale.py +264 -0
  736. truthound/validators/multi_column/__init__.py +89 -0
  737. truthound/validators/multi_column/arithmetic.py +284 -0
  738. truthound/validators/multi_column/base.py +231 -0
  739. truthound/validators/multi_column/comparison.py +273 -0
  740. truthound/validators/multi_column/consistency.py +312 -0
  741. truthound/validators/multi_column/statistical.py +299 -0
  742. truthound/validators/optimization/__init__.py +164 -0
  743. truthound/validators/optimization/aggregation.py +563 -0
  744. truthound/validators/optimization/covariance.py +556 -0
  745. truthound/validators/optimization/geo.py +626 -0
  746. truthound/validators/optimization/graph.py +587 -0
  747. truthound/validators/optimization/orchestrator.py +970 -0
  748. truthound/validators/optimization/profiling.py +1312 -0
  749. truthound/validators/privacy/__init__.py +223 -0
  750. truthound/validators/privacy/base.py +635 -0
  751. truthound/validators/privacy/ccpa.py +670 -0
  752. truthound/validators/privacy/gdpr.py +728 -0
  753. truthound/validators/privacy/global_patterns.py +604 -0
  754. truthound/validators/privacy/plugins.py +867 -0
  755. truthound/validators/profiling/__init__.py +52 -0
  756. truthound/validators/profiling/base.py +175 -0
  757. truthound/validators/profiling/cardinality.py +312 -0
  758. truthound/validators/profiling/entropy.py +391 -0
  759. truthound/validators/profiling/frequency.py +455 -0
  760. truthound/validators/pushdown_support.py +660 -0
  761. truthound/validators/query/__init__.py +91 -0
  762. truthound/validators/query/aggregate.py +346 -0
  763. truthound/validators/query/base.py +246 -0
  764. truthound/validators/query/column.py +249 -0
  765. truthound/validators/query/expression.py +274 -0
  766. truthound/validators/query/result.py +323 -0
  767. truthound/validators/query/row_count.py +264 -0
  768. truthound/validators/referential/__init__.py +80 -0
  769. truthound/validators/referential/base.py +395 -0
  770. truthound/validators/referential/cascade.py +391 -0
  771. truthound/validators/referential/circular.py +563 -0
  772. truthound/validators/referential/foreign_key.py +624 -0
  773. truthound/validators/referential/orphan.py +485 -0
  774. truthound/validators/registry.py +112 -0
  775. truthound/validators/schema/__init__.py +41 -0
  776. truthound/validators/schema/column_count.py +142 -0
  777. truthound/validators/schema/column_exists.py +80 -0
  778. truthound/validators/schema/column_order.py +82 -0
  779. truthound/validators/schema/column_pair.py +85 -0
  780. truthound/validators/schema/column_pair_set.py +195 -0
  781. truthound/validators/schema/column_type.py +94 -0
  782. truthound/validators/schema/multi_column.py +53 -0
  783. truthound/validators/schema/multi_column_aggregate.py +175 -0
  784. truthound/validators/schema/referential.py +274 -0
  785. truthound/validators/schema/table_schema.py +91 -0
  786. truthound/validators/schema_validator.py +219 -0
  787. truthound/validators/sdk/__init__.py +250 -0
  788. truthound/validators/sdk/builder.py +680 -0
  789. truthound/validators/sdk/decorators.py +474 -0
  790. truthound/validators/sdk/enterprise/__init__.py +211 -0
  791. truthound/validators/sdk/enterprise/docs.py +725 -0
  792. truthound/validators/sdk/enterprise/fuzzing.py +659 -0
  793. truthound/validators/sdk/enterprise/licensing.py +709 -0
  794. truthound/validators/sdk/enterprise/manager.py +543 -0
  795. truthound/validators/sdk/enterprise/resources.py +628 -0
  796. truthound/validators/sdk/enterprise/sandbox.py +766 -0
  797. truthound/validators/sdk/enterprise/signing.py +603 -0
  798. truthound/validators/sdk/enterprise/templates.py +865 -0
  799. truthound/validators/sdk/enterprise/versioning.py +659 -0
  800. truthound/validators/sdk/templates.py +757 -0
  801. truthound/validators/sdk/testing.py +807 -0
  802. truthound/validators/security/__init__.py +181 -0
  803. truthound/validators/security/redos/__init__.py +182 -0
  804. truthound/validators/security/redos/core.py +861 -0
  805. truthound/validators/security/redos/cpu_monitor.py +593 -0
  806. truthound/validators/security/redos/cve_database.py +791 -0
  807. truthound/validators/security/redos/ml/__init__.py +155 -0
  808. truthound/validators/security/redos/ml/base.py +785 -0
  809. truthound/validators/security/redos/ml/datasets.py +618 -0
  810. truthound/validators/security/redos/ml/features.py +359 -0
  811. truthound/validators/security/redos/ml/models.py +1000 -0
  812. truthound/validators/security/redos/ml/predictor.py +507 -0
  813. truthound/validators/security/redos/ml/storage.py +632 -0
  814. truthound/validators/security/redos/ml/training.py +571 -0
  815. truthound/validators/security/redos/ml_analyzer.py +937 -0
  816. truthound/validators/security/redos/optimizer.py +674 -0
  817. truthound/validators/security/redos/profiler.py +682 -0
  818. truthound/validators/security/redos/re2_engine.py +709 -0
  819. truthound/validators/security/redos.py +886 -0
  820. truthound/validators/security/sql_security.py +1247 -0
  821. truthound/validators/streaming/__init__.py +126 -0
  822. truthound/validators/streaming/base.py +292 -0
  823. truthound/validators/streaming/completeness.py +210 -0
  824. truthound/validators/streaming/mixin.py +575 -0
  825. truthound/validators/streaming/range.py +308 -0
  826. truthound/validators/streaming/sources.py +846 -0
  827. truthound/validators/string/__init__.py +57 -0
  828. truthound/validators/string/casing.py +158 -0
  829. truthound/validators/string/charset.py +96 -0
  830. truthound/validators/string/format.py +501 -0
  831. truthound/validators/string/json.py +77 -0
  832. truthound/validators/string/json_schema.py +184 -0
  833. truthound/validators/string/length.py +104 -0
  834. truthound/validators/string/like_pattern.py +237 -0
  835. truthound/validators/string/regex.py +202 -0
  836. truthound/validators/string/regex_extended.py +435 -0
  837. truthound/validators/table/__init__.py +88 -0
  838. truthound/validators/table/base.py +78 -0
  839. truthound/validators/table/column_count.py +198 -0
  840. truthound/validators/table/freshness.py +362 -0
  841. truthound/validators/table/row_count.py +251 -0
  842. truthound/validators/table/schema.py +333 -0
  843. truthound/validators/table/size.py +285 -0
  844. truthound/validators/timeout/__init__.py +102 -0
  845. truthound/validators/timeout/advanced/__init__.py +247 -0
  846. truthound/validators/timeout/advanced/circuit_breaker.py +675 -0
  847. truthound/validators/timeout/advanced/prediction.py +773 -0
  848. truthound/validators/timeout/advanced/priority.py +618 -0
  849. truthound/validators/timeout/advanced/redis_backend.py +770 -0
  850. truthound/validators/timeout/advanced/retry.py +721 -0
  851. truthound/validators/timeout/advanced/sampling.py +788 -0
  852. truthound/validators/timeout/advanced/sla.py +661 -0
  853. truthound/validators/timeout/advanced/telemetry.py +804 -0
  854. truthound/validators/timeout/cascade.py +477 -0
  855. truthound/validators/timeout/deadline.py +657 -0
  856. truthound/validators/timeout/degradation.py +525 -0
  857. truthound/validators/timeout/distributed.py +597 -0
  858. truthound/validators/timeseries/__init__.py +89 -0
  859. truthound/validators/timeseries/base.py +326 -0
  860. truthound/validators/timeseries/completeness.py +617 -0
  861. truthound/validators/timeseries/gap.py +485 -0
  862. truthound/validators/timeseries/monotonic.py +310 -0
  863. truthound/validators/timeseries/seasonality.py +422 -0
  864. truthound/validators/timeseries/trend.py +510 -0
  865. truthound/validators/uniqueness/__init__.py +59 -0
  866. truthound/validators/uniqueness/approximate.py +475 -0
  867. truthound/validators/uniqueness/distinct_values.py +253 -0
  868. truthound/validators/uniqueness/duplicate.py +118 -0
  869. truthound/validators/uniqueness/primary_key.py +140 -0
  870. truthound/validators/uniqueness/unique.py +191 -0
  871. truthound/validators/uniqueness/within_record.py +599 -0
  872. truthound/validators/utils.py +756 -0
  873. truthound-1.0.8.dist-info/METADATA +474 -0
  874. truthound-1.0.8.dist-info/RECORD +877 -0
  875. truthound-1.0.8.dist-info/WHEEL +4 -0
  876. truthound-1.0.8.dist-info/entry_points.txt +2 -0
  877. truthound-1.0.8.dist-info/licenses/LICENSE +190 -0
@@ -0,0 +1,1366 @@
1
+ """Distributed processing backends for large-scale data profiling.
2
+
3
+ This module provides pluggable distributed computing backends:
4
+ - Spark: Apache Spark for cluster computing
5
+ - Dask: Parallel computing with task scheduling
6
+ - Ray: Distributed computing framework
7
+
8
+ Key features:
9
+ - Unified API across all backends
10
+ - Automatic backend detection
11
+ - Fallback to local processing
12
+ - Resource-aware partitioning
13
+
14
+ Example:
15
+ from truthound.profiler.distributed import (
16
+ DistributedProfiler,
17
+ SparkBackend,
18
+ DaskBackend,
19
+ RayBackend,
20
+ )
21
+
22
+ # Auto-detect backend
23
+ profiler = DistributedProfiler.create(backend="auto")
24
+
25
+ # Or specify backend
26
+ profiler = DistributedProfiler.create(
27
+ backend="spark",
28
+ spark_config={"spark.executor.memory": "4g"}
29
+ )
30
+
31
+ # Profile large dataset
32
+ profile = profiler.profile("hdfs://data/large_dataset.parquet")
33
+ """
34
+
35
+ from __future__ import annotations
36
+
37
+ import hashlib
38
+ import logging
39
+ import os
40
+ import threading
41
+ import time
42
+ from abc import ABC, abstractmethod
43
+ from concurrent.futures import ThreadPoolExecutor, as_completed
44
+ from dataclasses import dataclass, field
45
+ from datetime import datetime
46
+ from enum import Enum
47
+ from pathlib import Path
48
+ from typing import (
49
+ TYPE_CHECKING,
50
+ Any,
51
+ Callable,
52
+ Dict,
53
+ Generic,
54
+ Iterator,
55
+ List,
56
+ Optional,
57
+ Protocol,
58
+ Tuple,
59
+ Type,
60
+ TypeVar,
61
+ Union,
62
+ )
63
+
64
+ import polars as pl
65
+
66
+ from truthound.profiler.base import (
67
+ ColumnProfile,
68
+ DataType,
69
+ DistributionStats,
70
+ TableProfile,
71
+ ValueFrequency,
72
+ )
73
+
74
+
75
+ logger = logging.getLogger(__name__)
76
+
77
+
78
+ # =============================================================================
79
+ # Types and Enums
80
+ # =============================================================================
81
+
82
+
83
class BackendType(str, Enum):
    """Identifiers for the execution backends the profiler can target.

    Because the enum also derives from ``str``, every member compares equal
    to its plain string value (``BackendType.SPARK == "spark"``) and can be
    looked up from that string (``BackendType("spark")``).
    """

    # In-process processing without a cluster framework.
    LOCAL = "local"
    # Apache Spark.
    SPARK = "spark"
    # Dask.
    DASK = "dask"
    # Ray.
    RAY = "ray"
    # Request that the backend be chosen automatically.
    AUTO = "auto"
91
+
92
+
93
class PartitionStrategy(str, Enum):
    """Ways a dataset can be carved into partitions for parallel profiling.

    Members double as plain strings, so ``PartitionStrategy.HASH == "hash"``.
    """

    # Split the data by row ranges.
    ROW_BASED = "row_based"
    # Profile columns in parallel.
    COLUMN_BASED = "column_based"
    # Combine the row-based and column-based strategies.
    HYBRID = "hybrid"
    # Hash-based partitioning.
    HASH = "hash"
100
+
101
+
102
@dataclass
class PartitionInfo:
    """Descriptor of a single slice of the dataset assigned to a worker.

    Only ``partition_id`` and ``total_partitions`` are required; every other
    field has a neutral default (zero or an empty container that is created
    fresh for each instance).
    """

    # Index of this partition.
    partition_id: int
    # Number of partitions the dataset was split into.
    total_partitions: int
    # Row bounds covered by the partition.
    # NOTE(review): producers in this module fill these as a half-open
    # [start_row, end_row) range - confirm consumers read them that way.
    start_row: int = 0
    end_row: int = 0
    # Names of the columns this partition is responsible for.
    columns: list[str] = field(default_factory=list)
    # Size of the partition in bytes (0 when not provided).
    size_bytes: int = 0
    # Free-form extra information attached to the partition.
    metadata: dict[str, Any] = field(default_factory=dict)
113
+
114
+
115
@dataclass
class WorkerResult:
    """Outcome reported by a worker after it processed one partition.

    ``errors`` and ``metadata`` default to fresh empty containers, so a
    successful result only needs the first four fields.
    """

    # Identifier of the partition that produced this result.
    partition_id: int
    # Per-column statistics, keyed by column name and then by statistic name.
    column_stats: dict[str, dict[str, Any]]
    # Number of rows the worker reports for the partition.
    row_count: int
    # Processing time in milliseconds.
    processing_time_ms: float
    # Error messages collected while processing; empty when nothing failed.
    errors: list[str] = field(default_factory=list)
    # Free-form extra information attached to the result.
    metadata: dict[str, Any] = field(default_factory=dict)
125
+
126
+
127
+ # =============================================================================
128
+ # Backend Configuration
129
+ # =============================================================================
130
+
131
+
132
@dataclass
class BackendConfig:
    """Settings shared by every distributed backend.

    Backend-specific configuration classes extend this one and override
    ``backend_type``.
    """

    # Which backend this configuration is meant for.
    backend_type: BackendType = BackendType.LOCAL
    # Number of workers; 0 means auto-detect.
    num_workers: int = 0
    # Memory budget per worker, as a size string.
    memory_per_worker: str = "2g"
    # Degree of parallelism; 0 means auto.
    parallelism: int = 0
    # Timeout in seconds.
    timeout_seconds: int = 3600
    # Retry count.
    retry_count: int = 3
    # Checkpointing switch and the directory that goes with it.
    checkpoint_enabled: bool = False
    checkpoint_dir: str = ""
    # Free-form extra settings.
    metadata: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Return the base settings as a plain dictionary.

        ``backend_type`` is emitted as its string value; every other entry is
        the attribute itself (``metadata`` is the same dict object, not a
        copy). Only the fields declared on ``BackendConfig`` are included.

        Returns:
            Mapping of field name to value, in declaration order.
        """
        passthrough = (
            "num_workers",
            "memory_per_worker",
            "parallelism",
            "timeout_seconds",
            "retry_count",
            "checkpoint_enabled",
            "checkpoint_dir",
            "metadata",
        )
        serialized: dict[str, Any] = {"backend_type": self.backend_type.value}
        for attr in passthrough:
            serialized[attr] = getattr(self, attr)
        return serialized
158
+
159
+
160
@dataclass
class SparkConfig(BackendConfig):
    """Configuration for the Spark backend, on top of the shared settings."""

    # Pins the inherited field to the Spark backend.
    backend_type: BackendType = BackendType.SPARK
    # Spark master URL; the default runs Spark locally on all cores.
    master: str = "local[*]"
    # Application name given to the Spark session.
    app_name: str = "truthound-profiler"
    # Value for "spark.executor.memory".
    executor_memory: str = "4g"
    # Value for "spark.driver.memory".
    driver_memory: str = "2g"
    # Value for "spark.executor.cores".
    executor_cores: int = 2
    # Number of executors.
    # NOTE(review): presumably 0 means "let Spark decide" - confirm.
    num_executors: int = 0
    # Additional Spark settings, applied to the session builder as given.
    spark_config: dict[str, str] = field(default_factory=dict)
    # Hadoop settings; keys are prefixed with "spark.hadoop." when applied.
    hadoop_config: dict[str, str] = field(default_factory=dict)
173
+
174
+
175
@dataclass
class DaskConfig(BackendConfig):
    """Configuration for the Dask backend, on top of the shared settings."""

    # Pins the inherited field to the Dask backend.
    backend_type: BackendType = BackendType.DASK
    # Scheduler kind: "threads", "processes" or "distributed".
    scheduler: str = "threads"
    # Scheduler address, used with the distributed scheduler.
    address: str = ""
    # Number of workers.
    # NOTE(review): presumably 0 means auto - confirm against the backend.
    n_workers: int = 0
    # Threads started per worker.
    threads_per_worker: int = 2
    # Memory limit setting.
    memory_limit: str = "auto"
    # Address of the Dask dashboard.
    dashboard_address: str = ":8787"
186
+
187
+
188
@dataclass
class RayConfig(BackendConfig):
    """Configuration for the Ray backend, on top of the shared settings."""

    # Pins the inherited field to the Ray backend.
    backend_type: BackendType = BackendType.RAY
    # Cluster address: empty string means local, "auto" means cluster.
    address: str = ""
    # CPU and GPU counts.
    # NOTE(review): presumably 0 leaves the choice to Ray - confirm.
    num_cpus: int = 0
    num_gpus: int = 0
    # Object store memory setting.
    # NOTE(review): unit and meaning of 0 are not visible here - confirm.
    object_store_memory: int = 0
    # Runtime environment description.
    runtime_env: dict[str, Any] = field(default_factory=dict)
198
+
199
+
200
+ # =============================================================================
201
+ # Distributed Backend Protocol
202
+ # =============================================================================
203
+
204
+
205
class DistributedBackend(ABC):
    """Contract that every distributed execution backend implements.

    Subclass this to plug in a custom backend. A backend is responsible for
    its own start-up and tear-down, for splitting data into partitions, for
    running a function over those partitions, and for merging the
    per-partition results.

    Instances are context managers: entering calls ``initialize()`` and
    leaving calls ``shutdown()``.
    """

    # Short identifier of the backend; subclasses override it.
    name: str = "base"
    # Class-level availability flag; the base class is never usable itself.
    available: bool = False

    def __init__(self, config: BackendConfig):
        """Store the configuration and set up the bookkeeping state.

        Args:
            config: Settings for this backend.
        """
        self._lock = threading.Lock()
        self._initialized = False
        self.config = config

    @abstractmethod
    def initialize(self) -> None:
        """Bring the backend up (connections, cluster, worker pools, ...)."""

    @abstractmethod
    def shutdown(self) -> None:
        """Tear the backend down and release whatever it holds."""

    @abstractmethod
    def is_available(self) -> bool:
        """Report whether the backend can be used.

        Returns:
            True if the backend's dependencies are installed.
        """

    @abstractmethod
    def distribute_data(
        self,
        data: pl.DataFrame | pl.LazyFrame | str,
        num_partitions: int | None = None,
        strategy: PartitionStrategy = PartitionStrategy.ROW_BASED,
    ) -> list[PartitionInfo]:
        """Split the data into partitions for the workers.

        Args:
            data: A DataFrame, a LazyFrame, or a path to the data.
            num_partitions: How many partitions to create; None lets the
                backend decide.
            strategy: How the data is partitioned.

        Returns:
            One PartitionInfo per partition.
        """

    @abstractmethod
    def map_partitions(
        self,
        func: Callable[[PartitionInfo, Any], WorkerResult],
        partitions: list[PartitionInfo],
        data: Any,
    ) -> list[WorkerResult]:
        """Run ``func`` once per partition.

        Args:
            func: Callable invoked with a partition and the data reference.
            partitions: The partitions to process.
            data: Reference to the distributed data.

        Returns:
            The results produced for the partitions.
        """

    @abstractmethod
    def aggregate_results(
        self,
        results: list[WorkerResult],
    ) -> dict[str, dict[str, Any]]:
        """Merge per-partition results into per-column statistics.

        Args:
            results: Output of ``map_partitions``.

        Returns:
            Aggregated statistics keyed by column name.
        """

    def __enter__(self) -> DistributedBackend:
        """Initialize the backend and hand it to the ``with`` block."""
        self.initialize()
        return self

    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        """Shut the backend down; exceptions from the block are not suppressed."""
        self.shutdown()
308
+
309
+
310
+ # =============================================================================
311
+ # Local Backend (Fallback)
312
+ # =============================================================================
313
+
314
+
315
class LocalBackend(DistributedBackend):
    """In-process, multi-threaded backend used as the fallback.

    Partitions are processed on a ``ThreadPoolExecutor``; no cluster
    framework is required, so this backend is always available.
    """

    name = "local"
    available = True

    def __init__(self, config: BackendConfig | None = None):
        """Create the backend.

        Args:
            config: Backend settings; a default ``BackendConfig`` is used
                when omitted.
        """
        super().__init__(config or BackendConfig())
        self._executor: ThreadPoolExecutor | None = None

    def initialize(self) -> None:
        """Start the worker thread pool.

        ``config.num_workers <= 0`` means auto-detect: the CPU count (4 when
        it cannot be determined), capped at 8. Calling this while a pool is
        already running is a no-op, so the existing pool is never leaked.
        """
        if self._executor is not None:
            return

        num_workers = self.config.num_workers
        if num_workers <= 0:
            num_workers = min(os.cpu_count() or 4, 8)

        self._executor = ThreadPoolExecutor(max_workers=num_workers)
        self._initialized = True
        logger.info(f"LocalBackend initialized with {num_workers} workers")

    def shutdown(self) -> None:
        """Stop the thread pool, waiting for running tasks to finish.

        The executor reference is cleared so that later ``map_partitions``
        calls fail with "Backend not initialized" and ``initialize`` can
        create a fresh pool.
        """
        if self._executor is not None:
            self._executor.shutdown(wait=True)
            self._executor = None
        self._initialized = False

    def is_available(self) -> bool:
        """Return True; the local backend has no optional dependencies."""
        return True

    @staticmethod
    def _split_range(total: int, parts: int) -> list[tuple[int, int]]:
        """Split ``range(total)`` into contiguous half-open ``(start, end)`` chunks.

        ``parts`` is clamped to ``[1, total]`` (and to 1 when ``total`` is 0)
        so no chunk is ever out of range or inverted. Every chunk except the
        last holds ``total // parts`` items; the last absorbs the remainder.
        """
        parts = max(1, min(parts, total))
        size = total // parts
        return [
            (i * size, (i + 1) * size if i < parts - 1 else total)
            for i in range(parts)
        ]

    def distribute_data(
        self,
        data: pl.DataFrame | pl.LazyFrame | str,
        num_partitions: int | None = None,
        strategy: PartitionStrategy = PartitionStrategy.COLUMN_BASED,
    ) -> list[PartitionInfo]:
        """Describe how the data is split into partitions.

        Note that the default strategy here is ``COLUMN_BASED``, unlike the
        abstract base method.

        Args:
            data: A DataFrame, a LazyFrame (collected here), or a path that
                is read as Parquet.
            num_partitions: Requested partition count. None or 0 means auto:
                one partition per column for ``COLUMN_BASED``, the CPU count
                (4 if unknown) otherwise. The count is capped at the number
                of columns / rows so that no empty or inverted partition is
                produced.
            strategy: ``COLUMN_BASED`` assigns a group of columns (over all
                rows) to each partition; every other strategy assigns a row
                range (over all columns).

        Returns:
            The partition descriptors. Empty for ``COLUMN_BASED`` when the
            frame has no columns; a single ``[0, 0)`` partition for the
            row-wise split of an empty frame.
        """
        # Materialize the input so column names and row count are known.
        if isinstance(data, str):
            df = pl.scan_parquet(data).collect()
        elif isinstance(data, pl.LazyFrame):
            df = data.collect()
        else:
            df = data

        columns = df.columns
        row_count = len(df)

        if strategy == PartitionStrategy.COLUMN_BASED:
            if not columns:
                # Nothing to profile; also avoids dividing by zero below.
                return []

            bounds = self._split_range(
                len(columns), num_partitions or len(columns)
            )
            return [
                PartitionInfo(
                    partition_id=i,
                    total_partitions=len(bounds),
                    start_row=0,
                    end_row=row_count,
                    columns=columns[start:end],
                )
                for i, (start, end) in enumerate(bounds)
            ]

        # Every other strategy (ROW_BASED, HYBRID, HASH) is split by rows.
        bounds = self._split_range(
            row_count, num_partitions or (os.cpu_count() or 4)
        )
        return [
            PartitionInfo(
                partition_id=i,
                total_partitions=len(bounds),
                start_row=start,
                end_row=end,
                columns=columns,
            )
            for i, (start, end) in enumerate(bounds)
        ]

    def map_partitions(
        self,
        func: Callable[[PartitionInfo, Any], WorkerResult],
        partitions: list[PartitionInfo],
        data: Any,
    ) -> list[WorkerResult]:
        """Run ``func(partition, data)`` for every partition on the pool.

        A partition whose call raises does not abort the run: the failure is
        logged and represented by an empty ``WorkerResult`` carrying the
        error message.

        Args:
            func: Worker function.
            partitions: Partitions to process.
            data: Object passed through to ``func`` unchanged.

        Returns:
            One result per partition, ordered by ``partition_id``.

        Raises:
            RuntimeError: If the backend has not been initialized.
        """
        if self._executor is None:
            raise RuntimeError("Backend not initialized")

        futures = {
            self._executor.submit(func, partition, data): partition
            for partition in partitions
        }

        results = []
        for future in as_completed(futures):
            partition = futures[future]
            try:
                results.append(future.result())
            except Exception as e:
                logger.error(f"Partition {partition.partition_id} failed: {e}")
                results.append(WorkerResult(
                    partition_id=partition.partition_id,
                    column_stats={},
                    row_count=0,
                    processing_time_ms=0.0,
                    errors=[str(e)],
                ))

        # Completion order is arbitrary; hand results back in partition order.
        return sorted(results, key=lambda r: r.partition_id)

    def aggregate_results(
        self,
        results: list[WorkerResult],
    ) -> dict[str, dict[str, Any]]:
        """Merge per-partition column statistics into one entry per column.

        Counts and sums are added up, min/max are folded, and distinct value
        sets are unioned and reported as ``distinct_count``. When a column
        has rows, ``mean`` and ``std`` are derived from the accumulated
        ``sum_value`` / ``sum_squared``. Statistics that a worker omitted or
        reported as ``None`` are treated as zero / absent instead of raising.

        Args:
            results: Output of ``map_partitions``.

        Returns:
            Aggregated statistics keyed by column name.
        """
        aggregated: dict[str, dict[str, Any]] = {}

        for result in results:
            for col_name, stats in result.column_stats.items():
                if col_name not in aggregated:
                    aggregated[col_name] = {
                        "row_count": 0,
                        "null_count": 0,
                        "distinct_values": set(),
                        "min_value": None,
                        "max_value": None,
                        "sum_value": 0,
                        "sum_squared": 0,
                    }

                agg = aggregated[col_name]
                # `or 0` also covers an explicit None from the worker.
                agg["row_count"] += stats.get("row_count") or 0
                agg["null_count"] += stats.get("null_count") or 0

                distinct = stats.get("distinct_values")
                if distinct:
                    agg["distinct_values"].update(distinct)

                # Min/Max
                low = stats.get("min_value")
                if low is not None and (
                    agg["min_value"] is None or low < agg["min_value"]
                ):
                    agg["min_value"] = low
                high = stats.get("max_value")
                if high is not None and (
                    agg["max_value"] is None or high > agg["max_value"]
                ):
                    agg["max_value"] = high

                # Running sums used for mean / variance below.
                agg["sum_value"] += stats.get("sum_value") or 0
                agg["sum_squared"] += stats.get("sum_squared") or 0

        # Finalize aggregations
        for agg in aggregated.values():
            agg["distinct_count"] = len(agg.pop("distinct_values", set()))
            n = agg["row_count"]
            if n > 0:
                # NOTE(review): n includes null rows; if workers sum only
                # non-null values the divisor should probably be
                # n - null_count - confirm against the worker function.
                mean = agg["sum_value"] / n
                agg["mean"] = mean
                variance = (agg["sum_squared"] / n) - (mean ** 2)
                # Rounding can push a zero variance slightly negative.
                agg["std"] = variance ** 0.5 if variance > 0 else 0.0

        return aggregated
480
+
481
+
482
+ # =============================================================================
483
+ # Spark Backend
484
+ # =============================================================================
485
+
486
+
487
class SparkBackend(DistributedBackend):
    """Apache Spark backend for cluster computing.

    Supports:
    - Local mode
    - Standalone cluster
    - YARN
    - Kubernetes
    - Databricks

    The Spark DataFrame produced by ``distribute_data`` is kept on the
    instance so that ``map_partitions`` can profile it; the reference is
    dropped again by ``shutdown``.
    """

    name = "spark"

    def __init__(self, config: SparkConfig | None = None):
        """Create the backend; the Spark session is only built by ``initialize``."""
        super().__init__(config or SparkConfig())
        self._spark = None
        self._sc = None
        # Set by distribute_data(); an explicit None (instead of a missing
        # attribute) lets shutdown() invalidate it.
        self._current_df = None

    @property
    def spark_config(self) -> SparkConfig:
        """Return ``self.config`` typed as ``SparkConfig``."""
        return self.config  # type: ignore

    def is_available(self) -> bool:
        """Return True if ``pyspark`` can be imported, False otherwise."""
        try:
            import pyspark  # noqa: F401
        except ImportError:
            return False
        return True

    def initialize(self) -> None:
        """Build (or reuse) a SparkSession from the backend configuration.

        Raises:
            ImportError: If PySpark is not installed.
        """
        if not self.is_available():
            raise ImportError(
                "PySpark is required for Spark backend. "
                "Install with: pip install pyspark"
            )

        from pyspark.sql import SparkSession

        builder = SparkSession.builder.appName(self.spark_config.app_name)

        if self.spark_config.master:
            builder = builder.master(self.spark_config.master)

        # Set memory configurations
        builder = builder.config(
            "spark.executor.memory", self.spark_config.executor_memory
        ).config(
            "spark.driver.memory", self.spark_config.driver_memory
        ).config(
            "spark.executor.cores", str(self.spark_config.executor_cores)
        )

        # Custom Spark configs
        for key, value in self.spark_config.spark_config.items():
            builder = builder.config(key, value)

        # Hadoop configs are passed through under the "spark.hadoop." prefix
        for key, value in self.spark_config.hadoop_config.items():
            builder = builder.config(f"spark.hadoop.{key}", value)

        self._spark = builder.getOrCreate()
        self._sc = self._spark.sparkContext
        self._initialized = True

        logger.info(f"SparkBackend initialized: {self._spark.version}")

    def shutdown(self) -> None:
        """Stop the Spark session and forget all session-bound state."""
        try:
            if self._spark:
                self._spark.stop()
        finally:
            # Reset even if stop() raises, and drop the distributed DataFrame
            # so a later map_partitions() cannot run against a stopped session.
            self._spark = None
            self._sc = None
            self._current_df = None
            self._initialized = False

    def distribute_data(
        self,
        data: pl.DataFrame | pl.LazyFrame | str,
        num_partitions: int | None = None,
        strategy: PartitionStrategy = PartitionStrategy.ROW_BASED,
    ) -> list[PartitionInfo]:
        """Load ``data`` into a Spark DataFrame and describe its partitions.

        Args:
            data: Path to a Parquet dataset, or a Polars DataFrame/LazyFrame
                (converted via pandas).
            num_partitions: Target partition count. When omitted, the
                partitioning Spark chose on load is kept as-is.
            strategy: Accepted for interface compatibility; not used here.

        Returns:
            One ``PartitionInfo`` per Spark partition.

        Raises:
            RuntimeError: If ``initialize`` has not been called.
            TypeError: If ``data`` is of an unsupported type.
        """
        if not self._spark:
            raise RuntimeError("Spark not initialized")

        # Load data into Spark DataFrame
        if isinstance(data, str):
            # Path to file
            spark_df = self._spark.read.parquet(data)
        elif isinstance(data, (pl.DataFrame, pl.LazyFrame)):
            # Convert Polars to Spark
            if isinstance(data, pl.LazyFrame):
                data = data.collect()
            spark_df = self._spark.createDataFrame(data.to_pandas())
        else:
            raise TypeError(f"Unsupported data type: {type(data)}")

        # Only repartition when a count was requested: repartitioning to the
        # current count would shuffle all rows without changing the layout.
        if num_partitions:
            num_parts = num_partitions
            spark_df = spark_df.repartition(num_parts)
        else:
            num_parts = spark_df.rdd.getNumPartitions()

        # Store for map_partitions()
        self._current_df = spark_df

        columns = spark_df.columns
        return [
            PartitionInfo(
                partition_id=i,
                total_partitions=num_parts,
                columns=columns,
            )
            for i in range(num_parts)
        ]

    def map_partitions(
        self,
        func: Callable[[PartitionInfo, Any], WorkerResult],
        partitions: list[PartitionInfo],
        data: Any,
    ) -> list[WorkerResult]:
        """Profile every partition of the previously distributed DataFrame.

        ``func``, ``partitions`` and ``data`` are accepted for interface
        compatibility; the statistics are computed by a built-in
        per-partition function executed through ``RDD.mapPartitions``.

        Returns:
            One ``WorkerResult`` per collected partition result, numbered in
            collection order.

        Raises:
            RuntimeError: If no data has been distributed on a live session.
        """
        if not self._spark or self._current_df is None:
            raise RuntimeError("No data distributed")

        spark_df = self._current_df
        columns = spark_df.columns

        def profile_partition(iterator: Iterator) -> Iterator:
            """Compute per-column statistics for the rows of one partition."""
            import time
            import pandas as pd

            start = time.time()
            rows = list(iterator)

            if not rows:
                yield {
                    "partition_id": 0,
                    "column_stats": {},
                    "row_count": 0,
                    "processing_time_ms": 0,
                }
                return

            pdf = pd.DataFrame(rows, columns=columns)
            stats = {}

            for col in columns:
                col_data = pdf[col]
                stats[col] = {
                    "row_count": len(col_data),
                    "null_count": int(col_data.isna().sum()),
                    "distinct_count": int(col_data.nunique()),
                }

                if pd.api.types.is_numeric_dtype(col_data):
                    # Same rule as the Dask and Ray workers: numeric keys are
                    # only emitted when there is at least one non-null value,
                    # so no None min/max reaches the aggregation step.
                    non_null = col_data.dropna()
                    if len(non_null) > 0:
                        stats[col].update({
                            "min_value": float(non_null.min()),
                            "max_value": float(non_null.max()),
                            "sum_value": float(non_null.sum()),
                            "sum_squared": float((non_null ** 2).sum()),
                        })

            elapsed = (time.time() - start) * 1000

            # The worker does not know its partition index; the driver assigns
            # the final partition_id below.
            yield {
                "partition_id": 0,
                "column_stats": stats,
                "row_count": len(pdf),
                "processing_time_ms": elapsed,
            }

        # Execute on partitions
        results = spark_df.rdd.mapPartitions(profile_partition).collect()

        return [
            WorkerResult(
                partition_id=i,
                column_stats=r["column_stats"],
                row_count=r["row_count"],
                processing_time_ms=r["processing_time_ms"],
            )
            for i, r in enumerate(results)
        ]

    def aggregate_results(
        self,
        results: list[WorkerResult],
    ) -> dict[str, dict[str, Any]]:
        """Merge worker results using ``LocalBackend``'s aggregation logic."""
        return LocalBackend(self.config).aggregate_results(results)
677
+
678
+
679
+ # =============================================================================
680
+ # Dask Backend
681
+ # =============================================================================
682
+
683
+
684
class DaskBackend(DistributedBackend):
    """Dask backend for parallel computing.

    Supports:
    - Threaded scheduler (single machine)
    - Process scheduler (single machine, multiprocessing)
    - Distributed scheduler (cluster)

    The Dask DataFrame produced by ``distribute_data`` is kept on the
    instance for ``map_partitions`` and released by ``shutdown``.
    """

    name = "dask"

    def __init__(self, config: DaskConfig | None = None):
        """Create the backend; clients/clusters are only created by ``initialize``."""
        super().__init__(config or DaskConfig())
        self._client = None
        self._cluster = None
        # Set by distribute_data(); None means "nothing distributed yet".
        self._current_ddf = None

    @property
    def dask_config(self) -> DaskConfig:
        """Return ``self.config`` typed as ``DaskConfig``."""
        return self.config  # type: ignore

    def is_available(self) -> bool:
        """Return True if ``dask`` can be imported, False otherwise."""
        try:
            import dask  # noqa: F401
        except ImportError:
            return False
        return True

    def initialize(self) -> None:
        """Set up the configured scheduler.

        For the ``"distributed"`` scheduler this connects to
        ``dask_config.address`` when given, otherwise it starts a
        ``LocalCluster``. Any other scheduler name is applied through
        ``dask.config.set``.

        Raises:
            ImportError: If Dask is not installed.
        """
        if not self.is_available():
            raise ImportError(
                "Dask is required for Dask backend. "
                "Install with: pip install dask[complete]"
            )

        import dask

        scheduler = self.dask_config.scheduler

        if scheduler == "distributed":
            from dask.distributed import Client, LocalCluster

            if self.dask_config.address:
                # Connect to existing cluster
                self._client = Client(self.dask_config.address)
            else:
                # Create local cluster
                n_workers = self.dask_config.n_workers or (os.cpu_count() or 4)
                self._cluster = LocalCluster(
                    n_workers=n_workers,
                    threads_per_worker=self.dask_config.threads_per_worker,
                    memory_limit=self.dask_config.memory_limit,
                    dashboard_address=self.dask_config.dashboard_address,
                )
                self._client = Client(self._cluster)

            logger.info(f"DaskBackend (distributed) initialized: {self._client}")
        else:
            # Use simple scheduler
            dask.config.set(scheduler=scheduler)
            logger.info(f"DaskBackend ({scheduler}) initialized")

        self._initialized = True

    def shutdown(self) -> None:
        """Close the client and cluster (if any) and reset backend state."""
        try:
            if self._client:
                self._client.close()
        finally:
            # Close the cluster even when closing the client failed, so a
            # locally started cluster is not leaked.
            try:
                if self._cluster:
                    self._cluster.close()
            finally:
                self._client = None
                self._cluster = None
                self._current_ddf = None
                self._initialized = False

    def distribute_data(
        self,
        data: pl.DataFrame | pl.LazyFrame | str,
        num_partitions: int | None = None,
        strategy: PartitionStrategy = PartitionStrategy.ROW_BASED,
    ) -> list[PartitionInfo]:
        """Load ``data`` as a Dask DataFrame and describe its partitions.

        Args:
            data: Path readable by ``dask.dataframe.read_parquet``, or a
                Polars DataFrame/LazyFrame (converted via pandas).
            num_partitions: Target partition count; for in-memory frames the
                default is the CPU count (4 if unknown).
            strategy: Accepted for interface compatibility; not used here.

        Returns:
            One ``PartitionInfo`` per Dask partition.

        Raises:
            TypeError: If ``data`` is of an unsupported type.
        """
        import dask.dataframe as dd

        # Load data as Dask DataFrame
        if isinstance(data, str):
            ddf = dd.read_parquet(data)
        elif isinstance(data, (pl.DataFrame, pl.LazyFrame)):
            if isinstance(data, pl.LazyFrame):
                data = data.collect()
            pdf = data.to_pandas()
            ddf = dd.from_pandas(pdf, npartitions=num_partitions or (os.cpu_count() or 4))
        else:
            raise TypeError(f"Unsupported data type: {type(data)}")

        # Repartition if needed
        if num_partitions:
            ddf = ddf.repartition(npartitions=num_partitions)

        self._current_ddf = ddf
        num_parts = ddf.npartitions
        columns = list(ddf.columns)

        return [
            PartitionInfo(
                partition_id=i,
                total_partitions=num_parts,
                columns=columns,
            )
            for i in range(num_parts)
        ]

    def map_partitions(
        self,
        func: Callable[[PartitionInfo, Any], WorkerResult],
        partitions: list[PartitionInfo],
        data: Any,
    ) -> list[WorkerResult]:
        """Profile every partition of the previously distributed DataFrame.

        ``func``, ``partitions`` and ``data`` are accepted for interface
        compatibility; the statistics are computed by a built-in
        per-partition function applied with ``DataFrame.map_partitions``.

        Returns:
            One ``WorkerResult`` per partition, numbered in result order.

        Raises:
            RuntimeError: If no data has been distributed.
        """
        if self._current_ddf is None:
            raise RuntimeError("No data distributed")

        import json

        import pandas as pd

        ddf = self._current_ddf

        def profile_partition(pdf: pd.DataFrame) -> pd.DataFrame:
            """Profile a single partition and return a one-row summary frame."""
            import json
            import time

            start = time.time()
            stats = {}

            for col in pdf.columns:
                col_data = pdf[col]
                stats[col] = {
                    "row_count": len(col_data),
                    "null_count": int(col_data.isna().sum()),
                    "distinct_count": int(col_data.nunique()),
                }

                if pd.api.types.is_numeric_dtype(col_data):
                    non_null = col_data.dropna()
                    if len(non_null) > 0:
                        stats[col].update({
                            "min_value": float(non_null.min()),
                            "max_value": float(non_null.max()),
                            "sum_value": float(non_null.sum()),
                            "sum_squared": float((non_null ** 2).sum()),
                        })

            # The nested stats dict travels as a JSON string so the result
            # fits a flat, fixed-schema frame.
            return pd.DataFrame([{
                "stats": json.dumps(stats),
                "row_count": len(pdf),
                "processing_time_ms": (time.time() - start) * 1000,
            }])

        # Apply to all partitions
        result_ddf = ddf.map_partitions(
            profile_partition,
            meta={"stats": str, "row_count": int, "processing_time_ms": float},
        )

        # Compute results
        results_pdf = result_ddf.compute()

        return [
            WorkerResult(
                partition_id=i,
                column_stats=json.loads(row["stats"]),
                row_count=int(row["row_count"]),
                processing_time_ms=float(row["processing_time_ms"]),
            )
            for i, (_, row) in enumerate(results_pdf.iterrows())
        ]

    def aggregate_results(
        self,
        results: list[WorkerResult],
    ) -> dict[str, dict[str, Any]]:
        """Merge worker results using ``LocalBackend``'s aggregation logic."""
        return LocalBackend(self.config).aggregate_results(results)
861
+
862
+
863
+ # =============================================================================
864
+ # Ray Backend
865
+ # =============================================================================
866
+
867
+
868
class RayBackend(DistributedBackend):
    """Ray backend for distributed computing.

    Features:
    - Automatic cluster management
    - Object store for shared data
    - Actor-based processing

    Partitions are placed in the Ray object store by ``distribute_data`` and
    profiled by remote tasks in ``map_partitions``.
    """

    name = "ray"

    def __init__(self, config: RayConfig | None = None):
        """Create the backend; Ray itself is only started by ``initialize``."""
        super().__init__(config or RayConfig())
        self._ray = None
        # True only when this backend called ray.init() itself.
        self._owns_runtime = False
        # Object refs of the distributed partitions; None until distributed.
        self._partitioned_data: list[Any] | None = None

    @property
    def ray_config(self) -> RayConfig:
        """Return ``self.config`` typed as ``RayConfig``."""
        return self.config  # type: ignore

    def is_available(self) -> bool:
        """Return True if ``ray`` can be imported, False otherwise."""
        try:
            import ray  # noqa: F401
        except ImportError:
            return False
        return True

    def initialize(self) -> None:
        """Start Ray with the configured options unless it is already running.

        Raises:
            ImportError: If Ray is not installed.
        """
        if not self.is_available():
            raise ImportError(
                "Ray is required for Ray backend. "
                "Install with: pip install ray"
            )

        import ray

        init_kwargs: dict[str, Any] = {}

        # Only forward options that were actually configured.
        if self.ray_config.address:
            init_kwargs["address"] = self.ray_config.address
        if self.ray_config.num_cpus:
            init_kwargs["num_cpus"] = self.ray_config.num_cpus
        if self.ray_config.num_gpus:
            init_kwargs["num_gpus"] = self.ray_config.num_gpus
        if self.ray_config.object_store_memory:
            init_kwargs["object_store_memory"] = self.ray_config.object_store_memory
        if self.ray_config.runtime_env:
            init_kwargs["runtime_env"] = self.ray_config.runtime_env

        if not ray.is_initialized():
            ray.init(**init_kwargs)
            self._owns_runtime = True

        self._ray = ray
        self._initialized = True
        logger.info("RayBackend initialized")

    def shutdown(self) -> None:
        """Release partition refs and stop Ray if this backend started it.

        A Ray runtime that was already initialized before ``initialize`` was
        called is left running, since other code in the process may use it.
        """
        try:
            if self._ray and self._owns_runtime and self._ray.is_initialized():
                self._ray.shutdown()
        finally:
            self._ray = None
            self._owns_runtime = False
            self._partitioned_data = None
            self._initialized = False

    def distribute_data(
        self,
        data: pl.DataFrame | pl.LazyFrame | str,
        num_partitions: int | None = None,
        strategy: PartitionStrategy = PartitionStrategy.ROW_BASED,
    ) -> list[PartitionInfo]:
        """Split ``data`` into row ranges and store each in the object store.

        Args:
            data: Parquet path, Polars LazyFrame, or Polars DataFrame.
            num_partitions: Requested partition count; defaults to the CPU
                count (4 if unknown). The effective count never exceeds the
                number of rows and is at least 1.
            strategy: Accepted for interface compatibility; not used here.

        Returns:
            One ``PartitionInfo`` per stored partition, with half-open
            ``[start_row, end_row)`` bounds.

        Raises:
            RuntimeError: If ``initialize`` has not been called.
        """
        if not self._ray:
            raise RuntimeError("Ray not initialized")

        # Load data
        if isinstance(data, str):
            df = pl.read_parquet(data)
        elif isinstance(data, pl.LazyFrame):
            df = data.collect()
        else:
            df = data

        total_rows = len(df)
        requested = num_partitions or (os.cpu_count() or 4)
        # Clamp to the row count: with more partitions than rows the trailing
        # partitions would start past the end of the frame and the last one
        # would get a negative slice length.
        num_parts = max(1, min(requested, total_rows))
        rows_per_part = total_rows // num_parts
        columns = df.columns

        # Create partitions and store in Ray object store
        self._partitioned_data = []
        partitions = []

        for i in range(num_parts):
            start = i * rows_per_part
            # The last partition absorbs the remainder rows.
            end = start + rows_per_part if i < num_parts - 1 else total_rows

            partition_df = df.slice(start, end - start)
            self._partitioned_data.append(self._ray.put(partition_df))

            partitions.append(PartitionInfo(
                partition_id=i,
                total_partitions=num_parts,
                start_row=start,
                end_row=end,
                columns=columns,
            ))

        return partitions

    def map_partitions(
        self,
        func: Callable[[PartitionInfo, Any], WorkerResult],
        partitions: list[PartitionInfo],
        data: Any,
    ) -> list[WorkerResult]:
        """Profile the given partitions with one Ray task each.

        ``func`` and ``data`` are accepted for interface compatibility; each
        task runs a built-in profiling function on the partition stored under
        ``partition_id`` by ``distribute_data``.

        Returns:
            One ``WorkerResult`` per entry of ``partitions``, in that order.

        Raises:
            RuntimeError: If Ray is not initialized or no data is distributed.
        """
        if not self._ray or self._partitioned_data is None:
            raise RuntimeError("No data distributed")

        ray = self._ray

        @ray.remote
        def profile_partition_remote(
            df: pl.DataFrame,
            partition_id: int,
        ) -> dict[str, Any]:
            """Remote function to profile a partition."""
            import time
            start = time.time()

            # All fixed-width integer and float dtypes get numeric statistics.
            numeric_dtypes = (
                pl.Int8, pl.Int16, pl.Int32, pl.Int64,
                pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64,
                pl.Float32, pl.Float64,
            )

            stats = {}
            for col in df.columns:
                col_data = df.get_column(col)

                stats[col] = {
                    "row_count": len(col_data),
                    "null_count": col_data.null_count(),
                    "distinct_count": col_data.n_unique(),
                }

                # Numeric stats
                if col_data.dtype in numeric_dtypes:
                    non_null = col_data.drop_nulls()
                    if len(non_null) > 0:
                        # Square in floating point so large integer values
                        # cannot overflow the column's integer type.
                        as_float = non_null.cast(pl.Float64)
                        stats[col].update({
                            "min_value": float(non_null.min()),
                            "max_value": float(non_null.max()),
                            "sum_value": float(non_null.sum()),
                            "sum_squared": float((as_float ** 2).sum()),
                        })

            elapsed = (time.time() - start) * 1000

            return {
                "partition_id": partition_id,
                "column_stats": stats,
                "row_count": len(df),
                "processing_time_ms": elapsed,
            }

        # Launch tasks
        futures = [
            profile_partition_remote.remote(
                self._partitioned_data[p.partition_id],
                p.partition_id,
            )
            for p in partitions
        ]

        # Collect results
        results_raw = ray.get(futures)

        return [
            WorkerResult(
                partition_id=r["partition_id"],
                column_stats=r["column_stats"],
                row_count=r["row_count"],
                processing_time_ms=r["processing_time_ms"],
            )
            for r in results_raw
        ]

    def aggregate_results(
        self,
        results: list[WorkerResult],
    ) -> dict[str, dict[str, Any]]:
        """Merge worker results using ``LocalBackend``'s aggregation logic."""
        return LocalBackend(self.config).aggregate_results(results)
1051
+
1052
+
1053
+ # =============================================================================
1054
+ # Backend Registry
1055
+ # =============================================================================
1056
+
1057
+
1058
class BackendRegistry:
    """Registry for distributed backends.

    Allows dynamic registration of custom backends. Backends are stored by
    name; registering an existing name replaces the previous class.
    """

    def __init__(self) -> None:
        self._backends: dict[str, type[DistributedBackend]] = {}

    def register(
        self,
        name: str,
        backend_class: type[DistributedBackend],
    ) -> None:
        """Register a backend class under ``name``."""
        self._backends[name] = backend_class

    def get(self, name: str) -> type[DistributedBackend]:
        """Get a registered backend class.

        Raises:
            KeyError: If no backend is registered under ``name``.
        """
        if name not in self._backends:
            raise KeyError(
                f"Unknown backend: {name}. "
                f"Available: {list(self._backends.keys())}"
            )
        return self._backends[name]

    def create(
        self,
        name: str,
        config: BackendConfig | None = None,
    ) -> DistributedBackend:
        """Create a backend instance, passing ``config`` to its constructor.

        Raises:
            KeyError: If no backend is registered under ``name``.
        """
        backend_class = self.get(name)
        return backend_class(config)

    def list_backends(self) -> list[str]:
        """List the names of all registered backends, in registration order."""
        return list(self._backends.keys())

    def get_available_backends(self) -> list[str]:
        """List backends with available dependencies.

        Each registered class is instantiated without arguments and asked via
        ``is_available()``. A backend whose probe raises is treated as
        unavailable; the failure is logged at debug level rather than being
        dropped silently.
        """
        import logging

        available = []
        for name, backend_class in self._backends.items():
            try:
                if backend_class().is_available():
                    available.append(name)
            except Exception:
                # Best-effort probe: a broken backend must not hide the others.
                logging.getLogger(__name__).debug(
                    "Availability check failed for backend %r", name, exc_info=True
                )
        return available
1108
+
1109
+
1110
# Global registry
# Module-level BackendRegistry instance used by DistributedProfiler.create()
# and get_available_backends(). The four built-in backends are registered at
# import time under the names accepted by the ``backend`` argument.
backend_registry = BackendRegistry()
backend_registry.register("local", LocalBackend)
backend_registry.register("spark", SparkBackend)
backend_registry.register("dask", DaskBackend)
backend_registry.register("ray", RayBackend)
1116
+
1117
+
1118
+ # =============================================================================
1119
+ # Distributed Profiler
1120
+ # =============================================================================
1121
+
1122
+
1123
@dataclass
class DistributedProfileConfig:
    """Configuration for distributed profiling.

    Field order is part of the constructor signature; add new fields at the
    end.
    """

    # Backend name; DistributedProfiler.create() stores the resolved name
    # here (after "auto" detection).
    backend: str = "auto"
    # Backend-specific configuration handed to the backend constructor.
    backend_config: BackendConfig | None = None
    # Passed to the backend's distribute_data() as ``strategy``.
    partition_strategy: PartitionStrategy = PartitionStrategy.ROW_BASED
    # Passed to the backend's distribute_data(); None lets the backend decide.
    num_partitions: int | None = None
    # NOTE(review): not read anywhere in the visible profiler code - confirm
    # where pattern detection is wired in.
    include_patterns: bool = True
    # NOTE(review): not read in the visible profiler code - presumably a row
    # sample limit; verify against callers.
    sample_size: int | None = None
    # NOTE(review): not read in the visible profiler code - presumably an
    # overall time limit in seconds; verify against callers.
    timeout_seconds: int = 3600
1134
+
1135
+
1136
class DistributedProfiler:
    """Facade that profiles large datasets through a distributed backend.

    The profiler delegates data distribution, per-partition profiling and
    aggregation to the backend it wraps, then assembles a ``TableProfile``.
    Used as a context manager it initializes the backend on entry and shuts
    it down on exit.

    Example:
        profiler = DistributedProfiler.create(backend="dask")

        with profiler:
            profile = profiler.profile("hdfs://data/large.parquet")
    """

    def __init__(
        self,
        backend: DistributedBackend,
        config: DistributedProfileConfig | None = None,
    ):
        self._backend = backend
        self._config = config if config else DistributedProfileConfig()

    @classmethod
    def create(
        cls,
        backend: str = "auto",
        backend_config: BackendConfig | None = None,
        **kwargs: Any,
    ) -> "DistributedProfiler":
        """Build a profiler for a named backend.

        Args:
            backend: Registered backend name, or "auto" to pick the first
                available one in the order ray, dask, spark, local.
            backend_config: Configuration forwarded to the backend.
            **kwargs: Extra ``DistributedProfileConfig`` fields.

        Returns:
            A profiler wrapping a freshly created backend instance.
        """
        if backend == "auto":
            usable = backend_registry.get_available_backends()
            preference = ["ray", "dask", "spark", "local"]
            # Fall back to "local" when nothing in the preference list is usable.
            backend = next(
                (candidate for candidate in preference if candidate in usable),
                "local",
            )
            logger.info(f"Auto-selected backend: {backend}")

        instance = backend_registry.create(backend, backend_config)
        profile_config = DistributedProfileConfig(
            backend=backend,
            backend_config=backend_config,
            **kwargs,
        )
        return cls(instance, profile_config)

    def __enter__(self) -> "DistributedProfiler":
        self._backend.initialize()
        return self

    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        self._backend.shutdown()

    @property
    def backend(self) -> DistributedBackend:
        """The backend instance this profiler drives."""
        return self._backend

    def profile(
        self,
        data: pl.DataFrame | pl.LazyFrame | str,
        name: str = "",
    ) -> TableProfile:
        """Run the distribute / map / aggregate pipeline on ``data``.

        Args:
            data: DataFrame, LazyFrame, or path to a data file.
            name: Profile name; "distributed_profile" is used when empty.

        Returns:
            A ``TableProfile`` with one column profile per aggregated column.
        """
        started = time.time()
        engine = self._backend

        # Step 1: hand the data to the backend.
        parts = engine.distribute_data(
            data,
            num_partitions=self._config.num_partitions,
            strategy=self._config.partition_strategy,
        )
        logger.info(f"Data distributed into {len(parts)} partitions")

        # Step 2: profile each partition.
        worker_results = engine.map_partitions(self._profile_partition, parts, data)
        logger.info(f"Collected results from {len(worker_results)} partitions")

        # Step 3: merge per-partition statistics.
        merged = engine.aggregate_results(worker_results)

        row_total = sum(result.row_count for result in worker_results)
        built = tuple(
            self._build_column_profile(column, column_stats)
            for column, column_stats in merged.items()
        )

        duration_ms = (time.time() - started) * 1000

        if isinstance(data, str):
            origin = str(data)
        else:
            origin = "dataframe"

        return TableProfile(
            name=name or "distributed_profile",
            row_count=row_total,
            column_count=len(merged),
            columns=built,
            source=origin,
            profile_duration_ms=duration_ms,
        )

    def _profile_partition(
        self,
        partition: PartitionInfo,
        data: Any,
    ) -> WorkerResult:
        """Return an empty result for ``partition``.

        Placeholder only: the backends compute the real statistics themselves.
        """
        began = time.time()

        empty_stats: dict[str, dict[str, Any]] = {}
        rows_seen = 0

        took_ms = (time.time() - began) * 1000

        return WorkerResult(
            partition_id=partition.partition_id,
            column_stats=empty_stats,
            row_count=rows_seen,
            processing_time_ms=took_ms,
        )

    def _build_column_profile(
        self,
        name: str,
        stats: dict[str, Any],
    ) -> ColumnProfile:
        """Turn one column's aggregated statistics into a ``ColumnProfile``.

        Distribution statistics are attached only when a mean was computed.
        """
        rows = stats.get("row_count", 0)
        nulls = stats.get("null_count", 0)
        distinct = stats.get("distinct_count", 0)

        # Guard the ratios against empty columns.
        if rows > 0:
            null_share = nulls / rows
            unique_share = distinct / rows
        else:
            null_share = 0
            unique_share = 0

        if "mean" in stats:
            dist = DistributionStats(
                mean=stats.get("mean"),
                std=stats.get("std"),
                min=stats.get("min_value"),
                max=stats.get("max_value"),
            )
        else:
            dist = None

        return ColumnProfile(
            name=name,
            physical_type="unknown",  # Would need type info
            row_count=rows,
            null_count=nulls,
            null_ratio=null_share,
            distinct_count=distinct,
            unique_ratio=unique_share,
            distribution=dist,
        )
1315
+
1316
+
1317
+ # =============================================================================
1318
+ # Convenience Functions
1319
+ # =============================================================================
1320
+
1321
+
1322
def create_distributed_profiler(
    backend: str = "auto",
    **kwargs: Any,
) -> DistributedProfiler:
    """Convenience wrapper around ``DistributedProfiler.create``.

    Args:
        backend: Backend name ("local", "spark", "dask", "ray", "auto")
        **kwargs: Forwarded unchanged to ``DistributedProfiler.create``

    Returns:
        Configured DistributedProfiler
    """
    new_profiler = DistributedProfiler.create(backend=backend, **kwargs)
    return new_profiler
1336
+
1337
+
1338
def profile_distributed(
    data: pl.DataFrame | pl.LazyFrame | str,
    backend: str = "auto",
    name: str = "",
    **kwargs: Any,
) -> TableProfile:
    """Create a profiler, profile ``data`` once, and shut the backend down.

    Args:
        data: Data to profile
        backend: Backend to use
        name: Profile name
        **kwargs: Forwarded to ``DistributedProfiler.create``

    Returns:
        TableProfile
    """
    one_shot = DistributedProfiler.create(backend=backend, **kwargs)
    # The context manager initializes the backend and guarantees shutdown.
    with one_shot:
        table_profile = one_shot.profile(data, name=name)
    return table_profile
1358
+
1359
+
1360
def get_available_backends() -> list[str]:
    """Report which registered backends can be used in this environment.

    Returns:
        Names of the backends in the global registry whose availability
        check succeeds
    """
    usable_names = backend_registry.get_available_backends()
    return usable_names