truthound 1.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (877) hide show
  1. truthound/__init__.py +162 -0
  2. truthound/adapters.py +100 -0
  3. truthound/api.py +365 -0
  4. truthound/audit/__init__.py +248 -0
  5. truthound/audit/core.py +967 -0
  6. truthound/audit/filters.py +620 -0
  7. truthound/audit/formatters.py +707 -0
  8. truthound/audit/logger.py +902 -0
  9. truthound/audit/middleware.py +571 -0
  10. truthound/audit/storage.py +1083 -0
  11. truthound/benchmark/__init__.py +123 -0
  12. truthound/benchmark/base.py +757 -0
  13. truthound/benchmark/comparison.py +635 -0
  14. truthound/benchmark/generators.py +706 -0
  15. truthound/benchmark/reporters.py +718 -0
  16. truthound/benchmark/runner.py +635 -0
  17. truthound/benchmark/scenarios.py +712 -0
  18. truthound/cache.py +252 -0
  19. truthound/checkpoint/__init__.py +136 -0
  20. truthound/checkpoint/actions/__init__.py +164 -0
  21. truthound/checkpoint/actions/base.py +324 -0
  22. truthound/checkpoint/actions/custom.py +234 -0
  23. truthound/checkpoint/actions/discord_notify.py +290 -0
  24. truthound/checkpoint/actions/email_notify.py +405 -0
  25. truthound/checkpoint/actions/github_action.py +406 -0
  26. truthound/checkpoint/actions/opsgenie.py +1499 -0
  27. truthound/checkpoint/actions/pagerduty.py +226 -0
  28. truthound/checkpoint/actions/slack_notify.py +233 -0
  29. truthound/checkpoint/actions/store_result.py +249 -0
  30. truthound/checkpoint/actions/teams_notify.py +1570 -0
  31. truthound/checkpoint/actions/telegram_notify.py +419 -0
  32. truthound/checkpoint/actions/update_docs.py +552 -0
  33. truthound/checkpoint/actions/webhook.py +293 -0
  34. truthound/checkpoint/analytics/__init__.py +147 -0
  35. truthound/checkpoint/analytics/aggregations/__init__.py +23 -0
  36. truthound/checkpoint/analytics/aggregations/rollup.py +481 -0
  37. truthound/checkpoint/analytics/aggregations/time_bucket.py +306 -0
  38. truthound/checkpoint/analytics/analyzers/__init__.py +17 -0
  39. truthound/checkpoint/analytics/analyzers/anomaly.py +386 -0
  40. truthound/checkpoint/analytics/analyzers/base.py +270 -0
  41. truthound/checkpoint/analytics/analyzers/forecast.py +421 -0
  42. truthound/checkpoint/analytics/analyzers/trend.py +314 -0
  43. truthound/checkpoint/analytics/models.py +292 -0
  44. truthound/checkpoint/analytics/protocols.py +549 -0
  45. truthound/checkpoint/analytics/service.py +718 -0
  46. truthound/checkpoint/analytics/stores/__init__.py +16 -0
  47. truthound/checkpoint/analytics/stores/base.py +306 -0
  48. truthound/checkpoint/analytics/stores/memory_store.py +353 -0
  49. truthound/checkpoint/analytics/stores/sqlite_store.py +557 -0
  50. truthound/checkpoint/analytics/stores/timescale_store.py +501 -0
  51. truthound/checkpoint/async_actions.py +794 -0
  52. truthound/checkpoint/async_base.py +708 -0
  53. truthound/checkpoint/async_checkpoint.py +617 -0
  54. truthound/checkpoint/async_runner.py +639 -0
  55. truthound/checkpoint/checkpoint.py +527 -0
  56. truthound/checkpoint/ci/__init__.py +61 -0
  57. truthound/checkpoint/ci/detector.py +355 -0
  58. truthound/checkpoint/ci/reporter.py +436 -0
  59. truthound/checkpoint/ci/templates.py +454 -0
  60. truthound/checkpoint/circuitbreaker/__init__.py +133 -0
  61. truthound/checkpoint/circuitbreaker/breaker.py +542 -0
  62. truthound/checkpoint/circuitbreaker/core.py +252 -0
  63. truthound/checkpoint/circuitbreaker/detection.py +459 -0
  64. truthound/checkpoint/circuitbreaker/middleware.py +389 -0
  65. truthound/checkpoint/circuitbreaker/registry.py +357 -0
  66. truthound/checkpoint/distributed/__init__.py +139 -0
  67. truthound/checkpoint/distributed/backends/__init__.py +35 -0
  68. truthound/checkpoint/distributed/backends/celery_backend.py +503 -0
  69. truthound/checkpoint/distributed/backends/kubernetes_backend.py +696 -0
  70. truthound/checkpoint/distributed/backends/local_backend.py +397 -0
  71. truthound/checkpoint/distributed/backends/ray_backend.py +625 -0
  72. truthound/checkpoint/distributed/base.py +774 -0
  73. truthound/checkpoint/distributed/orchestrator.py +765 -0
  74. truthound/checkpoint/distributed/protocols.py +842 -0
  75. truthound/checkpoint/distributed/registry.py +449 -0
  76. truthound/checkpoint/idempotency/__init__.py +120 -0
  77. truthound/checkpoint/idempotency/core.py +295 -0
  78. truthound/checkpoint/idempotency/fingerprint.py +454 -0
  79. truthound/checkpoint/idempotency/locking.py +604 -0
  80. truthound/checkpoint/idempotency/service.py +592 -0
  81. truthound/checkpoint/idempotency/stores.py +653 -0
  82. truthound/checkpoint/monitoring/__init__.py +134 -0
  83. truthound/checkpoint/monitoring/aggregators/__init__.py +15 -0
  84. truthound/checkpoint/monitoring/aggregators/base.py +372 -0
  85. truthound/checkpoint/monitoring/aggregators/realtime.py +300 -0
  86. truthound/checkpoint/monitoring/aggregators/window.py +493 -0
  87. truthound/checkpoint/monitoring/collectors/__init__.py +17 -0
  88. truthound/checkpoint/monitoring/collectors/base.py +257 -0
  89. truthound/checkpoint/monitoring/collectors/memory_collector.py +617 -0
  90. truthound/checkpoint/monitoring/collectors/prometheus_collector.py +451 -0
  91. truthound/checkpoint/monitoring/collectors/redis_collector.py +518 -0
  92. truthound/checkpoint/monitoring/events.py +410 -0
  93. truthound/checkpoint/monitoring/protocols.py +636 -0
  94. truthound/checkpoint/monitoring/service.py +578 -0
  95. truthound/checkpoint/monitoring/views/__init__.py +17 -0
  96. truthound/checkpoint/monitoring/views/base.py +172 -0
  97. truthound/checkpoint/monitoring/views/queue_view.py +220 -0
  98. truthound/checkpoint/monitoring/views/task_view.py +240 -0
  99. truthound/checkpoint/monitoring/views/worker_view.py +263 -0
  100. truthound/checkpoint/registry.py +337 -0
  101. truthound/checkpoint/runner.py +356 -0
  102. truthound/checkpoint/transaction/__init__.py +133 -0
  103. truthound/checkpoint/transaction/base.py +389 -0
  104. truthound/checkpoint/transaction/compensatable.py +537 -0
  105. truthound/checkpoint/transaction/coordinator.py +576 -0
  106. truthound/checkpoint/transaction/executor.py +622 -0
  107. truthound/checkpoint/transaction/idempotency.py +534 -0
  108. truthound/checkpoint/transaction/saga/__init__.py +143 -0
  109. truthound/checkpoint/transaction/saga/builder.py +584 -0
  110. truthound/checkpoint/transaction/saga/definition.py +515 -0
  111. truthound/checkpoint/transaction/saga/event_store.py +542 -0
  112. truthound/checkpoint/transaction/saga/patterns.py +833 -0
  113. truthound/checkpoint/transaction/saga/runner.py +718 -0
  114. truthound/checkpoint/transaction/saga/state_machine.py +793 -0
  115. truthound/checkpoint/transaction/saga/strategies.py +780 -0
  116. truthound/checkpoint/transaction/saga/testing.py +886 -0
  117. truthound/checkpoint/triggers/__init__.py +58 -0
  118. truthound/checkpoint/triggers/base.py +237 -0
  119. truthound/checkpoint/triggers/event.py +385 -0
  120. truthound/checkpoint/triggers/schedule.py +355 -0
  121. truthound/cli.py +2358 -0
  122. truthound/cli_modules/__init__.py +124 -0
  123. truthound/cli_modules/advanced/__init__.py +45 -0
  124. truthound/cli_modules/advanced/benchmark.py +343 -0
  125. truthound/cli_modules/advanced/docs.py +225 -0
  126. truthound/cli_modules/advanced/lineage.py +209 -0
  127. truthound/cli_modules/advanced/ml.py +320 -0
  128. truthound/cli_modules/advanced/realtime.py +196 -0
  129. truthound/cli_modules/checkpoint/__init__.py +46 -0
  130. truthound/cli_modules/checkpoint/init.py +114 -0
  131. truthound/cli_modules/checkpoint/list.py +71 -0
  132. truthound/cli_modules/checkpoint/run.py +159 -0
  133. truthound/cli_modules/checkpoint/validate.py +67 -0
  134. truthound/cli_modules/common/__init__.py +71 -0
  135. truthound/cli_modules/common/errors.py +414 -0
  136. truthound/cli_modules/common/options.py +419 -0
  137. truthound/cli_modules/common/output.py +507 -0
  138. truthound/cli_modules/common/protocol.py +552 -0
  139. truthound/cli_modules/core/__init__.py +48 -0
  140. truthound/cli_modules/core/check.py +123 -0
  141. truthound/cli_modules/core/compare.py +104 -0
  142. truthound/cli_modules/core/learn.py +57 -0
  143. truthound/cli_modules/core/mask.py +77 -0
  144. truthound/cli_modules/core/profile.py +65 -0
  145. truthound/cli_modules/core/scan.py +61 -0
  146. truthound/cli_modules/profiler/__init__.py +51 -0
  147. truthound/cli_modules/profiler/auto_profile.py +175 -0
  148. truthound/cli_modules/profiler/metadata.py +107 -0
  149. truthound/cli_modules/profiler/suite.py +283 -0
  150. truthound/cli_modules/registry.py +431 -0
  151. truthound/cli_modules/scaffolding/__init__.py +89 -0
  152. truthound/cli_modules/scaffolding/base.py +631 -0
  153. truthound/cli_modules/scaffolding/commands.py +545 -0
  154. truthound/cli_modules/scaffolding/plugins.py +1072 -0
  155. truthound/cli_modules/scaffolding/reporters.py +594 -0
  156. truthound/cli_modules/scaffolding/validators.py +1127 -0
  157. truthound/common/__init__.py +18 -0
  158. truthound/common/resilience/__init__.py +130 -0
  159. truthound/common/resilience/bulkhead.py +266 -0
  160. truthound/common/resilience/circuit_breaker.py +516 -0
  161. truthound/common/resilience/composite.py +332 -0
  162. truthound/common/resilience/config.py +292 -0
  163. truthound/common/resilience/protocols.py +217 -0
  164. truthound/common/resilience/rate_limiter.py +404 -0
  165. truthound/common/resilience/retry.py +341 -0
  166. truthound/datadocs/__init__.py +260 -0
  167. truthound/datadocs/base.py +571 -0
  168. truthound/datadocs/builder.py +761 -0
  169. truthound/datadocs/charts.py +764 -0
  170. truthound/datadocs/dashboard/__init__.py +63 -0
  171. truthound/datadocs/dashboard/app.py +576 -0
  172. truthound/datadocs/dashboard/components.py +584 -0
  173. truthound/datadocs/dashboard/state.py +240 -0
  174. truthound/datadocs/engine/__init__.py +46 -0
  175. truthound/datadocs/engine/context.py +376 -0
  176. truthound/datadocs/engine/pipeline.py +618 -0
  177. truthound/datadocs/engine/registry.py +469 -0
  178. truthound/datadocs/exporters/__init__.py +49 -0
  179. truthound/datadocs/exporters/base.py +198 -0
  180. truthound/datadocs/exporters/html.py +178 -0
  181. truthound/datadocs/exporters/json_exporter.py +253 -0
  182. truthound/datadocs/exporters/markdown.py +284 -0
  183. truthound/datadocs/exporters/pdf.py +392 -0
  184. truthound/datadocs/i18n/__init__.py +86 -0
  185. truthound/datadocs/i18n/catalog.py +960 -0
  186. truthound/datadocs/i18n/formatting.py +505 -0
  187. truthound/datadocs/i18n/loader.py +256 -0
  188. truthound/datadocs/i18n/plurals.py +378 -0
  189. truthound/datadocs/renderers/__init__.py +42 -0
  190. truthound/datadocs/renderers/base.py +401 -0
  191. truthound/datadocs/renderers/custom.py +342 -0
  192. truthound/datadocs/renderers/jinja.py +697 -0
  193. truthound/datadocs/sections.py +736 -0
  194. truthound/datadocs/styles.py +931 -0
  195. truthound/datadocs/themes/__init__.py +101 -0
  196. truthound/datadocs/themes/base.py +336 -0
  197. truthound/datadocs/themes/default.py +417 -0
  198. truthound/datadocs/themes/enterprise.py +419 -0
  199. truthound/datadocs/themes/loader.py +336 -0
  200. truthound/datadocs/themes.py +301 -0
  201. truthound/datadocs/transformers/__init__.py +57 -0
  202. truthound/datadocs/transformers/base.py +268 -0
  203. truthound/datadocs/transformers/enrichers.py +544 -0
  204. truthound/datadocs/transformers/filters.py +447 -0
  205. truthound/datadocs/transformers/i18n.py +468 -0
  206. truthound/datadocs/versioning/__init__.py +62 -0
  207. truthound/datadocs/versioning/diff.py +639 -0
  208. truthound/datadocs/versioning/storage.py +497 -0
  209. truthound/datadocs/versioning/version.py +358 -0
  210. truthound/datasources/__init__.py +223 -0
  211. truthound/datasources/_async_protocols.py +222 -0
  212. truthound/datasources/_protocols.py +159 -0
  213. truthound/datasources/adapters.py +428 -0
  214. truthound/datasources/async_base.py +599 -0
  215. truthound/datasources/async_factory.py +511 -0
  216. truthound/datasources/base.py +516 -0
  217. truthound/datasources/factory.py +433 -0
  218. truthound/datasources/nosql/__init__.py +47 -0
  219. truthound/datasources/nosql/base.py +487 -0
  220. truthound/datasources/nosql/elasticsearch.py +801 -0
  221. truthound/datasources/nosql/mongodb.py +636 -0
  222. truthound/datasources/pandas_optimized.py +582 -0
  223. truthound/datasources/pandas_source.py +216 -0
  224. truthound/datasources/polars_source.py +395 -0
  225. truthound/datasources/spark_source.py +479 -0
  226. truthound/datasources/sql/__init__.py +154 -0
  227. truthound/datasources/sql/base.py +710 -0
  228. truthound/datasources/sql/bigquery.py +410 -0
  229. truthound/datasources/sql/cloud_base.py +199 -0
  230. truthound/datasources/sql/databricks.py +471 -0
  231. truthound/datasources/sql/mysql.py +316 -0
  232. truthound/datasources/sql/oracle.py +427 -0
  233. truthound/datasources/sql/postgresql.py +321 -0
  234. truthound/datasources/sql/redshift.py +479 -0
  235. truthound/datasources/sql/snowflake.py +439 -0
  236. truthound/datasources/sql/sqlite.py +286 -0
  237. truthound/datasources/sql/sqlserver.py +437 -0
  238. truthound/datasources/streaming/__init__.py +47 -0
  239. truthound/datasources/streaming/base.py +350 -0
  240. truthound/datasources/streaming/kafka.py +670 -0
  241. truthound/decorators.py +98 -0
  242. truthound/docs/__init__.py +69 -0
  243. truthound/docs/extractor.py +971 -0
  244. truthound/docs/generator.py +601 -0
  245. truthound/docs/parser.py +1037 -0
  246. truthound/docs/renderer.py +999 -0
  247. truthound/drift/__init__.py +22 -0
  248. truthound/drift/compare.py +189 -0
  249. truthound/drift/detectors.py +464 -0
  250. truthound/drift/report.py +160 -0
  251. truthound/execution/__init__.py +65 -0
  252. truthound/execution/_protocols.py +324 -0
  253. truthound/execution/base.py +576 -0
  254. truthound/execution/distributed/__init__.py +179 -0
  255. truthound/execution/distributed/aggregations.py +731 -0
  256. truthound/execution/distributed/arrow_bridge.py +817 -0
  257. truthound/execution/distributed/base.py +550 -0
  258. truthound/execution/distributed/dask_engine.py +976 -0
  259. truthound/execution/distributed/mixins.py +766 -0
  260. truthound/execution/distributed/protocols.py +756 -0
  261. truthound/execution/distributed/ray_engine.py +1127 -0
  262. truthound/execution/distributed/registry.py +446 -0
  263. truthound/execution/distributed/spark_engine.py +1011 -0
  264. truthound/execution/distributed/validator_adapter.py +682 -0
  265. truthound/execution/pandas_engine.py +401 -0
  266. truthound/execution/polars_engine.py +497 -0
  267. truthound/execution/pushdown/__init__.py +230 -0
  268. truthound/execution/pushdown/ast.py +1550 -0
  269. truthound/execution/pushdown/builder.py +1550 -0
  270. truthound/execution/pushdown/dialects.py +1072 -0
  271. truthound/execution/pushdown/executor.py +829 -0
  272. truthound/execution/pushdown/optimizer.py +1041 -0
  273. truthound/execution/sql_engine.py +518 -0
  274. truthound/infrastructure/__init__.py +189 -0
  275. truthound/infrastructure/audit.py +1515 -0
  276. truthound/infrastructure/config.py +1133 -0
  277. truthound/infrastructure/encryption.py +1132 -0
  278. truthound/infrastructure/logging.py +1503 -0
  279. truthound/infrastructure/metrics.py +1220 -0
  280. truthound/lineage/__init__.py +89 -0
  281. truthound/lineage/base.py +746 -0
  282. truthound/lineage/impact_analysis.py +474 -0
  283. truthound/lineage/integrations/__init__.py +22 -0
  284. truthound/lineage/integrations/openlineage.py +548 -0
  285. truthound/lineage/tracker.py +512 -0
  286. truthound/lineage/visualization/__init__.py +33 -0
  287. truthound/lineage/visualization/protocols.py +145 -0
  288. truthound/lineage/visualization/renderers/__init__.py +20 -0
  289. truthound/lineage/visualization/renderers/cytoscape.py +329 -0
  290. truthound/lineage/visualization/renderers/d3.py +331 -0
  291. truthound/lineage/visualization/renderers/graphviz.py +276 -0
  292. truthound/lineage/visualization/renderers/mermaid.py +308 -0
  293. truthound/maskers.py +113 -0
  294. truthound/ml/__init__.py +124 -0
  295. truthound/ml/anomaly_models/__init__.py +31 -0
  296. truthound/ml/anomaly_models/ensemble.py +362 -0
  297. truthound/ml/anomaly_models/isolation_forest.py +444 -0
  298. truthound/ml/anomaly_models/statistical.py +392 -0
  299. truthound/ml/base.py +1178 -0
  300. truthound/ml/drift_detection/__init__.py +26 -0
  301. truthound/ml/drift_detection/concept.py +381 -0
  302. truthound/ml/drift_detection/distribution.py +361 -0
  303. truthound/ml/drift_detection/feature.py +442 -0
  304. truthound/ml/drift_detection/multivariate.py +495 -0
  305. truthound/ml/monitoring/__init__.py +88 -0
  306. truthound/ml/monitoring/alerting/__init__.py +33 -0
  307. truthound/ml/monitoring/alerting/handlers.py +427 -0
  308. truthound/ml/monitoring/alerting/rules.py +508 -0
  309. truthound/ml/monitoring/collectors/__init__.py +19 -0
  310. truthound/ml/monitoring/collectors/composite.py +105 -0
  311. truthound/ml/monitoring/collectors/drift.py +324 -0
  312. truthound/ml/monitoring/collectors/performance.py +179 -0
  313. truthound/ml/monitoring/collectors/quality.py +369 -0
  314. truthound/ml/monitoring/monitor.py +536 -0
  315. truthound/ml/monitoring/protocols.py +451 -0
  316. truthound/ml/monitoring/stores/__init__.py +15 -0
  317. truthound/ml/monitoring/stores/memory.py +201 -0
  318. truthound/ml/monitoring/stores/prometheus.py +296 -0
  319. truthound/ml/rule_learning/__init__.py +25 -0
  320. truthound/ml/rule_learning/constraint_miner.py +443 -0
  321. truthound/ml/rule_learning/pattern_learner.py +499 -0
  322. truthound/ml/rule_learning/profile_learner.py +462 -0
  323. truthound/multitenancy/__init__.py +326 -0
  324. truthound/multitenancy/core.py +852 -0
  325. truthound/multitenancy/integration.py +597 -0
  326. truthound/multitenancy/isolation.py +630 -0
  327. truthound/multitenancy/manager.py +770 -0
  328. truthound/multitenancy/middleware.py +765 -0
  329. truthound/multitenancy/quota.py +537 -0
  330. truthound/multitenancy/resolvers.py +603 -0
  331. truthound/multitenancy/storage.py +703 -0
  332. truthound/observability/__init__.py +307 -0
  333. truthound/observability/context.py +531 -0
  334. truthound/observability/instrumentation.py +611 -0
  335. truthound/observability/logging.py +887 -0
  336. truthound/observability/metrics.py +1157 -0
  337. truthound/observability/tracing/__init__.py +178 -0
  338. truthound/observability/tracing/baggage.py +310 -0
  339. truthound/observability/tracing/config.py +426 -0
  340. truthound/observability/tracing/exporter.py +787 -0
  341. truthound/observability/tracing/integration.py +1018 -0
  342. truthound/observability/tracing/otel/__init__.py +146 -0
  343. truthound/observability/tracing/otel/adapter.py +982 -0
  344. truthound/observability/tracing/otel/bridge.py +1177 -0
  345. truthound/observability/tracing/otel/compat.py +681 -0
  346. truthound/observability/tracing/otel/config.py +691 -0
  347. truthound/observability/tracing/otel/detection.py +327 -0
  348. truthound/observability/tracing/otel/protocols.py +426 -0
  349. truthound/observability/tracing/processor.py +561 -0
  350. truthound/observability/tracing/propagator.py +757 -0
  351. truthound/observability/tracing/provider.py +569 -0
  352. truthound/observability/tracing/resource.py +515 -0
  353. truthound/observability/tracing/sampler.py +487 -0
  354. truthound/observability/tracing/span.py +676 -0
  355. truthound/plugins/__init__.py +198 -0
  356. truthound/plugins/base.py +599 -0
  357. truthound/plugins/cli.py +680 -0
  358. truthound/plugins/dependencies/__init__.py +42 -0
  359. truthound/plugins/dependencies/graph.py +422 -0
  360. truthound/plugins/dependencies/resolver.py +417 -0
  361. truthound/plugins/discovery.py +379 -0
  362. truthound/plugins/docs/__init__.py +46 -0
  363. truthound/plugins/docs/extractor.py +444 -0
  364. truthound/plugins/docs/renderer.py +499 -0
  365. truthound/plugins/enterprise_manager.py +877 -0
  366. truthound/plugins/examples/__init__.py +19 -0
  367. truthound/plugins/examples/custom_validators.py +317 -0
  368. truthound/plugins/examples/slack_notifier.py +312 -0
  369. truthound/plugins/examples/xml_reporter.py +254 -0
  370. truthound/plugins/hooks.py +558 -0
  371. truthound/plugins/lifecycle/__init__.py +43 -0
  372. truthound/plugins/lifecycle/hot_reload.py +402 -0
  373. truthound/plugins/lifecycle/manager.py +371 -0
  374. truthound/plugins/manager.py +736 -0
  375. truthound/plugins/registry.py +338 -0
  376. truthound/plugins/security/__init__.py +93 -0
  377. truthound/plugins/security/exceptions.py +332 -0
  378. truthound/plugins/security/policies.py +348 -0
  379. truthound/plugins/security/protocols.py +643 -0
  380. truthound/plugins/security/sandbox/__init__.py +45 -0
  381. truthound/plugins/security/sandbox/context.py +158 -0
  382. truthound/plugins/security/sandbox/engines/__init__.py +19 -0
  383. truthound/plugins/security/sandbox/engines/container.py +379 -0
  384. truthound/plugins/security/sandbox/engines/noop.py +144 -0
  385. truthound/plugins/security/sandbox/engines/process.py +336 -0
  386. truthound/plugins/security/sandbox/factory.py +211 -0
  387. truthound/plugins/security/signing/__init__.py +57 -0
  388. truthound/plugins/security/signing/service.py +330 -0
  389. truthound/plugins/security/signing/trust_store.py +368 -0
  390. truthound/plugins/security/signing/verifier.py +459 -0
  391. truthound/plugins/versioning/__init__.py +41 -0
  392. truthound/plugins/versioning/constraints.py +297 -0
  393. truthound/plugins/versioning/resolver.py +329 -0
  394. truthound/profiler/__init__.py +1729 -0
  395. truthound/profiler/_lazy.py +452 -0
  396. truthound/profiler/ab_testing/__init__.py +80 -0
  397. truthound/profiler/ab_testing/analysis.py +449 -0
  398. truthound/profiler/ab_testing/base.py +257 -0
  399. truthound/profiler/ab_testing/experiment.py +395 -0
  400. truthound/profiler/ab_testing/tracking.py +368 -0
  401. truthound/profiler/auto_threshold.py +1170 -0
  402. truthound/profiler/base.py +579 -0
  403. truthound/profiler/cache_patterns.py +911 -0
  404. truthound/profiler/caching.py +1303 -0
  405. truthound/profiler/column_profiler.py +712 -0
  406. truthound/profiler/comparison.py +1007 -0
  407. truthound/profiler/custom_patterns.py +1170 -0
  408. truthound/profiler/dashboard/__init__.py +50 -0
  409. truthound/profiler/dashboard/app.py +476 -0
  410. truthound/profiler/dashboard/components.py +457 -0
  411. truthound/profiler/dashboard/config.py +72 -0
  412. truthound/profiler/distributed/__init__.py +83 -0
  413. truthound/profiler/distributed/base.py +281 -0
  414. truthound/profiler/distributed/dask_backend.py +498 -0
  415. truthound/profiler/distributed/local_backend.py +293 -0
  416. truthound/profiler/distributed/profiler.py +304 -0
  417. truthound/profiler/distributed/ray_backend.py +374 -0
  418. truthound/profiler/distributed/spark_backend.py +375 -0
  419. truthound/profiler/distributed.py +1366 -0
  420. truthound/profiler/enterprise_sampling.py +1065 -0
  421. truthound/profiler/errors.py +488 -0
  422. truthound/profiler/evolution/__init__.py +91 -0
  423. truthound/profiler/evolution/alerts.py +426 -0
  424. truthound/profiler/evolution/changes.py +206 -0
  425. truthound/profiler/evolution/compatibility.py +365 -0
  426. truthound/profiler/evolution/detector.py +372 -0
  427. truthound/profiler/evolution/protocols.py +121 -0
  428. truthound/profiler/generators/__init__.py +48 -0
  429. truthound/profiler/generators/base.py +384 -0
  430. truthound/profiler/generators/ml_rules.py +375 -0
  431. truthound/profiler/generators/pattern_rules.py +384 -0
  432. truthound/profiler/generators/schema_rules.py +267 -0
  433. truthound/profiler/generators/stats_rules.py +324 -0
  434. truthound/profiler/generators/suite_generator.py +857 -0
  435. truthound/profiler/i18n.py +1542 -0
  436. truthound/profiler/incremental.py +554 -0
  437. truthound/profiler/incremental_validation.py +1710 -0
  438. truthound/profiler/integration/__init__.py +73 -0
  439. truthound/profiler/integration/adapters.py +345 -0
  440. truthound/profiler/integration/context.py +371 -0
  441. truthound/profiler/integration/executor.py +527 -0
  442. truthound/profiler/integration/naming.py +75 -0
  443. truthound/profiler/integration/protocols.py +243 -0
  444. truthound/profiler/memory.py +1185 -0
  445. truthound/profiler/migration/__init__.py +60 -0
  446. truthound/profiler/migration/base.py +345 -0
  447. truthound/profiler/migration/manager.py +444 -0
  448. truthound/profiler/migration/v1_0_to_v1_1.py +484 -0
  449. truthound/profiler/ml/__init__.py +73 -0
  450. truthound/profiler/ml/base.py +244 -0
  451. truthound/profiler/ml/classifier.py +507 -0
  452. truthound/profiler/ml/feature_extraction.py +604 -0
  453. truthound/profiler/ml/pretrained.py +448 -0
  454. truthound/profiler/ml_inference.py +1276 -0
  455. truthound/profiler/native_patterns.py +815 -0
  456. truthound/profiler/observability.py +1184 -0
  457. truthound/profiler/process_timeout.py +1566 -0
  458. truthound/profiler/progress.py +568 -0
  459. truthound/profiler/progress_callbacks.py +1734 -0
  460. truthound/profiler/quality.py +1345 -0
  461. truthound/profiler/resilience.py +1180 -0
  462. truthound/profiler/sampled_matcher.py +794 -0
  463. truthound/profiler/sampling.py +1288 -0
  464. truthound/profiler/scheduling/__init__.py +82 -0
  465. truthound/profiler/scheduling/protocols.py +214 -0
  466. truthound/profiler/scheduling/scheduler.py +474 -0
  467. truthound/profiler/scheduling/storage.py +457 -0
  468. truthound/profiler/scheduling/triggers.py +449 -0
  469. truthound/profiler/schema.py +603 -0
  470. truthound/profiler/streaming.py +685 -0
  471. truthound/profiler/streaming_patterns.py +1354 -0
  472. truthound/profiler/suite_cli.py +625 -0
  473. truthound/profiler/suite_config.py +789 -0
  474. truthound/profiler/suite_export.py +1268 -0
  475. truthound/profiler/table_profiler.py +547 -0
  476. truthound/profiler/timeout.py +565 -0
  477. truthound/profiler/validation.py +1532 -0
  478. truthound/profiler/visualization/__init__.py +118 -0
  479. truthound/profiler/visualization/base.py +346 -0
  480. truthound/profiler/visualization/generator.py +1259 -0
  481. truthound/profiler/visualization/plotly_renderer.py +811 -0
  482. truthound/profiler/visualization/renderers.py +669 -0
  483. truthound/profiler/visualization/sections.py +540 -0
  484. truthound/profiler/visualization.py +2122 -0
  485. truthound/profiler/yaml_validation.py +1151 -0
  486. truthound/py.typed +0 -0
  487. truthound/ratelimit/__init__.py +248 -0
  488. truthound/ratelimit/algorithms.py +1108 -0
  489. truthound/ratelimit/core.py +573 -0
  490. truthound/ratelimit/integration.py +532 -0
  491. truthound/ratelimit/limiter.py +663 -0
  492. truthound/ratelimit/middleware.py +700 -0
  493. truthound/ratelimit/policy.py +792 -0
  494. truthound/ratelimit/storage.py +763 -0
  495. truthound/rbac/__init__.py +340 -0
  496. truthound/rbac/core.py +976 -0
  497. truthound/rbac/integration.py +760 -0
  498. truthound/rbac/manager.py +1052 -0
  499. truthound/rbac/middleware.py +842 -0
  500. truthound/rbac/policy.py +954 -0
  501. truthound/rbac/storage.py +878 -0
  502. truthound/realtime/__init__.py +141 -0
  503. truthound/realtime/adapters/__init__.py +43 -0
  504. truthound/realtime/adapters/base.py +533 -0
  505. truthound/realtime/adapters/kafka.py +487 -0
  506. truthound/realtime/adapters/kinesis.py +479 -0
  507. truthound/realtime/adapters/mock.py +243 -0
  508. truthound/realtime/base.py +553 -0
  509. truthound/realtime/factory.py +382 -0
  510. truthound/realtime/incremental.py +660 -0
  511. truthound/realtime/processing/__init__.py +67 -0
  512. truthound/realtime/processing/exactly_once.py +575 -0
  513. truthound/realtime/processing/state.py +547 -0
  514. truthound/realtime/processing/windows.py +647 -0
  515. truthound/realtime/protocols.py +569 -0
  516. truthound/realtime/streaming.py +605 -0
  517. truthound/realtime/testing/__init__.py +32 -0
  518. truthound/realtime/testing/containers.py +615 -0
  519. truthound/realtime/testing/fixtures.py +484 -0
  520. truthound/report.py +280 -0
  521. truthound/reporters/__init__.py +46 -0
  522. truthound/reporters/_protocols.py +30 -0
  523. truthound/reporters/base.py +324 -0
  524. truthound/reporters/ci/__init__.py +66 -0
  525. truthound/reporters/ci/azure.py +436 -0
  526. truthound/reporters/ci/base.py +509 -0
  527. truthound/reporters/ci/bitbucket.py +567 -0
  528. truthound/reporters/ci/circleci.py +547 -0
  529. truthound/reporters/ci/detection.py +364 -0
  530. truthound/reporters/ci/factory.py +182 -0
  531. truthound/reporters/ci/github.py +388 -0
  532. truthound/reporters/ci/gitlab.py +471 -0
  533. truthound/reporters/ci/jenkins.py +525 -0
  534. truthound/reporters/console_reporter.py +299 -0
  535. truthound/reporters/factory.py +211 -0
  536. truthound/reporters/html_reporter.py +524 -0
  537. truthound/reporters/json_reporter.py +256 -0
  538. truthound/reporters/markdown_reporter.py +280 -0
  539. truthound/reporters/sdk/__init__.py +174 -0
  540. truthound/reporters/sdk/builder.py +558 -0
  541. truthound/reporters/sdk/mixins.py +1150 -0
  542. truthound/reporters/sdk/schema.py +1493 -0
  543. truthound/reporters/sdk/templates.py +666 -0
  544. truthound/reporters/sdk/testing.py +968 -0
  545. truthound/scanners.py +170 -0
  546. truthound/scheduling/__init__.py +122 -0
  547. truthound/scheduling/cron.py +1136 -0
  548. truthound/scheduling/presets.py +212 -0
  549. truthound/schema.py +275 -0
  550. truthound/secrets/__init__.py +173 -0
  551. truthound/secrets/base.py +618 -0
  552. truthound/secrets/cloud.py +682 -0
  553. truthound/secrets/integration.py +507 -0
  554. truthound/secrets/manager.py +633 -0
  555. truthound/secrets/oidc/__init__.py +172 -0
  556. truthound/secrets/oidc/base.py +902 -0
  557. truthound/secrets/oidc/credential_provider.py +623 -0
  558. truthound/secrets/oidc/exchangers.py +1001 -0
  559. truthound/secrets/oidc/github/__init__.py +110 -0
  560. truthound/secrets/oidc/github/claims.py +718 -0
  561. truthound/secrets/oidc/github/enhanced_provider.py +693 -0
  562. truthound/secrets/oidc/github/trust_policy.py +742 -0
  563. truthound/secrets/oidc/github/verification.py +723 -0
  564. truthound/secrets/oidc/github/workflow.py +691 -0
  565. truthound/secrets/oidc/providers.py +825 -0
  566. truthound/secrets/providers.py +506 -0
  567. truthound/secrets/resolver.py +495 -0
  568. truthound/stores/__init__.py +177 -0
  569. truthound/stores/backends/__init__.py +18 -0
  570. truthound/stores/backends/_protocols.py +340 -0
  571. truthound/stores/backends/azure_blob.py +530 -0
  572. truthound/stores/backends/concurrent_filesystem.py +915 -0
  573. truthound/stores/backends/connection_pool.py +1365 -0
  574. truthound/stores/backends/database.py +743 -0
  575. truthound/stores/backends/filesystem.py +538 -0
  576. truthound/stores/backends/gcs.py +399 -0
  577. truthound/stores/backends/memory.py +354 -0
  578. truthound/stores/backends/s3.py +434 -0
  579. truthound/stores/backpressure/__init__.py +84 -0
  580. truthound/stores/backpressure/base.py +375 -0
  581. truthound/stores/backpressure/circuit_breaker.py +434 -0
  582. truthound/stores/backpressure/monitor.py +376 -0
  583. truthound/stores/backpressure/strategies.py +677 -0
  584. truthound/stores/base.py +551 -0
  585. truthound/stores/batching/__init__.py +65 -0
  586. truthound/stores/batching/base.py +305 -0
  587. truthound/stores/batching/buffer.py +370 -0
  588. truthound/stores/batching/store.py +248 -0
  589. truthound/stores/batching/writer.py +521 -0
  590. truthound/stores/caching/__init__.py +60 -0
  591. truthound/stores/caching/backends.py +684 -0
  592. truthound/stores/caching/base.py +356 -0
  593. truthound/stores/caching/store.py +305 -0
  594. truthound/stores/compression/__init__.py +193 -0
  595. truthound/stores/compression/adaptive.py +694 -0
  596. truthound/stores/compression/base.py +514 -0
  597. truthound/stores/compression/pipeline.py +868 -0
  598. truthound/stores/compression/providers.py +672 -0
  599. truthound/stores/compression/streaming.py +832 -0
  600. truthound/stores/concurrency/__init__.py +81 -0
  601. truthound/stores/concurrency/atomic.py +556 -0
  602. truthound/stores/concurrency/index.py +775 -0
  603. truthound/stores/concurrency/locks.py +576 -0
  604. truthound/stores/concurrency/manager.py +482 -0
  605. truthound/stores/encryption/__init__.py +297 -0
  606. truthound/stores/encryption/base.py +952 -0
  607. truthound/stores/encryption/keys.py +1191 -0
  608. truthound/stores/encryption/pipeline.py +903 -0
  609. truthound/stores/encryption/providers.py +953 -0
  610. truthound/stores/encryption/streaming.py +950 -0
  611. truthound/stores/expectations.py +227 -0
  612. truthound/stores/factory.py +246 -0
  613. truthound/stores/migration/__init__.py +75 -0
  614. truthound/stores/migration/base.py +480 -0
  615. truthound/stores/migration/manager.py +347 -0
  616. truthound/stores/migration/registry.py +382 -0
  617. truthound/stores/migration/store.py +559 -0
  618. truthound/stores/observability/__init__.py +106 -0
  619. truthound/stores/observability/audit.py +718 -0
  620. truthound/stores/observability/config.py +270 -0
  621. truthound/stores/observability/factory.py +208 -0
  622. truthound/stores/observability/metrics.py +636 -0
  623. truthound/stores/observability/protocols.py +410 -0
  624. truthound/stores/observability/store.py +570 -0
  625. truthound/stores/observability/tracing.py +784 -0
  626. truthound/stores/replication/__init__.py +76 -0
  627. truthound/stores/replication/base.py +260 -0
  628. truthound/stores/replication/monitor.py +269 -0
  629. truthound/stores/replication/store.py +439 -0
  630. truthound/stores/replication/syncer.py +391 -0
  631. truthound/stores/results.py +359 -0
  632. truthound/stores/retention/__init__.py +77 -0
  633. truthound/stores/retention/base.py +378 -0
  634. truthound/stores/retention/policies.py +621 -0
  635. truthound/stores/retention/scheduler.py +279 -0
  636. truthound/stores/retention/store.py +526 -0
  637. truthound/stores/streaming/__init__.py +138 -0
  638. truthound/stores/streaming/base.py +801 -0
  639. truthound/stores/streaming/database.py +984 -0
  640. truthound/stores/streaming/filesystem.py +719 -0
  641. truthound/stores/streaming/reader.py +629 -0
  642. truthound/stores/streaming/s3.py +843 -0
  643. truthound/stores/streaming/writer.py +790 -0
  644. truthound/stores/tiering/__init__.py +108 -0
  645. truthound/stores/tiering/base.py +462 -0
  646. truthound/stores/tiering/manager.py +249 -0
  647. truthound/stores/tiering/policies.py +692 -0
  648. truthound/stores/tiering/store.py +526 -0
  649. truthound/stores/versioning/__init__.py +56 -0
  650. truthound/stores/versioning/base.py +376 -0
  651. truthound/stores/versioning/store.py +660 -0
  652. truthound/stores/versioning/strategies.py +353 -0
  653. truthound/types.py +56 -0
  654. truthound/validators/__init__.py +774 -0
  655. truthound/validators/aggregate/__init__.py +27 -0
  656. truthound/validators/aggregate/central.py +116 -0
  657. truthound/validators/aggregate/extremes.py +116 -0
  658. truthound/validators/aggregate/spread.py +118 -0
  659. truthound/validators/aggregate/sum.py +64 -0
  660. truthound/validators/aggregate/type.py +78 -0
  661. truthound/validators/anomaly/__init__.py +93 -0
  662. truthound/validators/anomaly/base.py +431 -0
  663. truthound/validators/anomaly/ml_based.py +1190 -0
  664. truthound/validators/anomaly/multivariate.py +647 -0
  665. truthound/validators/anomaly/statistical.py +599 -0
  666. truthound/validators/base.py +1089 -0
  667. truthound/validators/business_rule/__init__.py +46 -0
  668. truthound/validators/business_rule/base.py +147 -0
  669. truthound/validators/business_rule/checksum.py +509 -0
  670. truthound/validators/business_rule/financial.py +526 -0
  671. truthound/validators/cache.py +733 -0
  672. truthound/validators/completeness/__init__.py +39 -0
  673. truthound/validators/completeness/conditional.py +73 -0
  674. truthound/validators/completeness/default.py +98 -0
  675. truthound/validators/completeness/empty.py +103 -0
  676. truthound/validators/completeness/nan.py +337 -0
  677. truthound/validators/completeness/null.py +152 -0
  678. truthound/validators/cross_table/__init__.py +17 -0
  679. truthound/validators/cross_table/aggregate.py +333 -0
  680. truthound/validators/cross_table/row_count.py +122 -0
  681. truthound/validators/datetime/__init__.py +29 -0
  682. truthound/validators/datetime/format.py +78 -0
  683. truthound/validators/datetime/freshness.py +269 -0
  684. truthound/validators/datetime/order.py +73 -0
  685. truthound/validators/datetime/parseable.py +185 -0
  686. truthound/validators/datetime/range.py +202 -0
  687. truthound/validators/datetime/timezone.py +69 -0
  688. truthound/validators/distribution/__init__.py +49 -0
  689. truthound/validators/distribution/distribution.py +128 -0
  690. truthound/validators/distribution/monotonic.py +119 -0
  691. truthound/validators/distribution/outlier.py +178 -0
  692. truthound/validators/distribution/quantile.py +80 -0
  693. truthound/validators/distribution/range.py +254 -0
  694. truthound/validators/distribution/set.py +125 -0
  695. truthound/validators/distribution/statistical.py +459 -0
  696. truthound/validators/drift/__init__.py +79 -0
  697. truthound/validators/drift/base.py +427 -0
  698. truthound/validators/drift/multi_feature.py +401 -0
  699. truthound/validators/drift/numeric.py +395 -0
  700. truthound/validators/drift/psi.py +446 -0
  701. truthound/validators/drift/statistical.py +510 -0
  702. truthound/validators/enterprise.py +1658 -0
  703. truthound/validators/geospatial/__init__.py +80 -0
  704. truthound/validators/geospatial/base.py +97 -0
  705. truthound/validators/geospatial/boundary.py +238 -0
  706. truthound/validators/geospatial/coordinate.py +351 -0
  707. truthound/validators/geospatial/distance.py +399 -0
  708. truthound/validators/geospatial/polygon.py +665 -0
  709. truthound/validators/i18n/__init__.py +308 -0
  710. truthound/validators/i18n/bidi.py +571 -0
  711. truthound/validators/i18n/catalogs.py +570 -0
  712. truthound/validators/i18n/dialects.py +763 -0
  713. truthound/validators/i18n/extended_catalogs.py +549 -0
  714. truthound/validators/i18n/formatting.py +1434 -0
  715. truthound/validators/i18n/loader.py +1020 -0
  716. truthound/validators/i18n/messages.py +521 -0
  717. truthound/validators/i18n/plural.py +683 -0
  718. truthound/validators/i18n/protocols.py +855 -0
  719. truthound/validators/i18n/tms.py +1162 -0
  720. truthound/validators/localization/__init__.py +53 -0
  721. truthound/validators/localization/base.py +122 -0
  722. truthound/validators/localization/chinese.py +362 -0
  723. truthound/validators/localization/japanese.py +275 -0
  724. truthound/validators/localization/korean.py +524 -0
  725. truthound/validators/memory/__init__.py +94 -0
  726. truthound/validators/memory/approximate_knn.py +506 -0
  727. truthound/validators/memory/base.py +547 -0
  728. truthound/validators/memory/sgd_online.py +719 -0
  729. truthound/validators/memory/streaming_ecdf.py +753 -0
  730. truthound/validators/ml_feature/__init__.py +54 -0
  731. truthound/validators/ml_feature/base.py +249 -0
  732. truthound/validators/ml_feature/correlation.py +299 -0
  733. truthound/validators/ml_feature/leakage.py +344 -0
  734. truthound/validators/ml_feature/null_impact.py +270 -0
  735. truthound/validators/ml_feature/scale.py +264 -0
  736. truthound/validators/multi_column/__init__.py +89 -0
  737. truthound/validators/multi_column/arithmetic.py +284 -0
  738. truthound/validators/multi_column/base.py +231 -0
  739. truthound/validators/multi_column/comparison.py +273 -0
  740. truthound/validators/multi_column/consistency.py +312 -0
  741. truthound/validators/multi_column/statistical.py +299 -0
  742. truthound/validators/optimization/__init__.py +164 -0
  743. truthound/validators/optimization/aggregation.py +563 -0
  744. truthound/validators/optimization/covariance.py +556 -0
  745. truthound/validators/optimization/geo.py +626 -0
  746. truthound/validators/optimization/graph.py +587 -0
  747. truthound/validators/optimization/orchestrator.py +970 -0
  748. truthound/validators/optimization/profiling.py +1312 -0
  749. truthound/validators/privacy/__init__.py +223 -0
  750. truthound/validators/privacy/base.py +635 -0
  751. truthound/validators/privacy/ccpa.py +670 -0
  752. truthound/validators/privacy/gdpr.py +728 -0
  753. truthound/validators/privacy/global_patterns.py +604 -0
  754. truthound/validators/privacy/plugins.py +867 -0
  755. truthound/validators/profiling/__init__.py +52 -0
  756. truthound/validators/profiling/base.py +175 -0
  757. truthound/validators/profiling/cardinality.py +312 -0
  758. truthound/validators/profiling/entropy.py +391 -0
  759. truthound/validators/profiling/frequency.py +455 -0
  760. truthound/validators/pushdown_support.py +660 -0
  761. truthound/validators/query/__init__.py +91 -0
  762. truthound/validators/query/aggregate.py +346 -0
  763. truthound/validators/query/base.py +246 -0
  764. truthound/validators/query/column.py +249 -0
  765. truthound/validators/query/expression.py +274 -0
  766. truthound/validators/query/result.py +323 -0
  767. truthound/validators/query/row_count.py +264 -0
  768. truthound/validators/referential/__init__.py +80 -0
  769. truthound/validators/referential/base.py +395 -0
  770. truthound/validators/referential/cascade.py +391 -0
  771. truthound/validators/referential/circular.py +563 -0
  772. truthound/validators/referential/foreign_key.py +624 -0
  773. truthound/validators/referential/orphan.py +485 -0
  774. truthound/validators/registry.py +112 -0
  775. truthound/validators/schema/__init__.py +41 -0
  776. truthound/validators/schema/column_count.py +142 -0
  777. truthound/validators/schema/column_exists.py +80 -0
  778. truthound/validators/schema/column_order.py +82 -0
  779. truthound/validators/schema/column_pair.py +85 -0
  780. truthound/validators/schema/column_pair_set.py +195 -0
  781. truthound/validators/schema/column_type.py +94 -0
  782. truthound/validators/schema/multi_column.py +53 -0
  783. truthound/validators/schema/multi_column_aggregate.py +175 -0
  784. truthound/validators/schema/referential.py +274 -0
  785. truthound/validators/schema/table_schema.py +91 -0
  786. truthound/validators/schema_validator.py +219 -0
  787. truthound/validators/sdk/__init__.py +250 -0
  788. truthound/validators/sdk/builder.py +680 -0
  789. truthound/validators/sdk/decorators.py +474 -0
  790. truthound/validators/sdk/enterprise/__init__.py +211 -0
  791. truthound/validators/sdk/enterprise/docs.py +725 -0
  792. truthound/validators/sdk/enterprise/fuzzing.py +659 -0
  793. truthound/validators/sdk/enterprise/licensing.py +709 -0
  794. truthound/validators/sdk/enterprise/manager.py +543 -0
  795. truthound/validators/sdk/enterprise/resources.py +628 -0
  796. truthound/validators/sdk/enterprise/sandbox.py +766 -0
  797. truthound/validators/sdk/enterprise/signing.py +603 -0
  798. truthound/validators/sdk/enterprise/templates.py +865 -0
  799. truthound/validators/sdk/enterprise/versioning.py +659 -0
  800. truthound/validators/sdk/templates.py +757 -0
  801. truthound/validators/sdk/testing.py +807 -0
  802. truthound/validators/security/__init__.py +181 -0
  803. truthound/validators/security/redos/__init__.py +182 -0
  804. truthound/validators/security/redos/core.py +861 -0
  805. truthound/validators/security/redos/cpu_monitor.py +593 -0
  806. truthound/validators/security/redos/cve_database.py +791 -0
  807. truthound/validators/security/redos/ml/__init__.py +155 -0
  808. truthound/validators/security/redos/ml/base.py +785 -0
  809. truthound/validators/security/redos/ml/datasets.py +618 -0
  810. truthound/validators/security/redos/ml/features.py +359 -0
  811. truthound/validators/security/redos/ml/models.py +1000 -0
  812. truthound/validators/security/redos/ml/predictor.py +507 -0
  813. truthound/validators/security/redos/ml/storage.py +632 -0
  814. truthound/validators/security/redos/ml/training.py +571 -0
  815. truthound/validators/security/redos/ml_analyzer.py +937 -0
  816. truthound/validators/security/redos/optimizer.py +674 -0
  817. truthound/validators/security/redos/profiler.py +682 -0
  818. truthound/validators/security/redos/re2_engine.py +709 -0
  819. truthound/validators/security/redos.py +886 -0
  820. truthound/validators/security/sql_security.py +1247 -0
  821. truthound/validators/streaming/__init__.py +126 -0
  822. truthound/validators/streaming/base.py +292 -0
  823. truthound/validators/streaming/completeness.py +210 -0
  824. truthound/validators/streaming/mixin.py +575 -0
  825. truthound/validators/streaming/range.py +308 -0
  826. truthound/validators/streaming/sources.py +846 -0
  827. truthound/validators/string/__init__.py +57 -0
  828. truthound/validators/string/casing.py +158 -0
  829. truthound/validators/string/charset.py +96 -0
  830. truthound/validators/string/format.py +501 -0
  831. truthound/validators/string/json.py +77 -0
  832. truthound/validators/string/json_schema.py +184 -0
  833. truthound/validators/string/length.py +104 -0
  834. truthound/validators/string/like_pattern.py +237 -0
  835. truthound/validators/string/regex.py +202 -0
  836. truthound/validators/string/regex_extended.py +435 -0
  837. truthound/validators/table/__init__.py +88 -0
  838. truthound/validators/table/base.py +78 -0
  839. truthound/validators/table/column_count.py +198 -0
  840. truthound/validators/table/freshness.py +362 -0
  841. truthound/validators/table/row_count.py +251 -0
  842. truthound/validators/table/schema.py +333 -0
  843. truthound/validators/table/size.py +285 -0
  844. truthound/validators/timeout/__init__.py +102 -0
  845. truthound/validators/timeout/advanced/__init__.py +247 -0
  846. truthound/validators/timeout/advanced/circuit_breaker.py +675 -0
  847. truthound/validators/timeout/advanced/prediction.py +773 -0
  848. truthound/validators/timeout/advanced/priority.py +618 -0
  849. truthound/validators/timeout/advanced/redis_backend.py +770 -0
  850. truthound/validators/timeout/advanced/retry.py +721 -0
  851. truthound/validators/timeout/advanced/sampling.py +788 -0
  852. truthound/validators/timeout/advanced/sla.py +661 -0
  853. truthound/validators/timeout/advanced/telemetry.py +804 -0
  854. truthound/validators/timeout/cascade.py +477 -0
  855. truthound/validators/timeout/deadline.py +657 -0
  856. truthound/validators/timeout/degradation.py +525 -0
  857. truthound/validators/timeout/distributed.py +597 -0
  858. truthound/validators/timeseries/__init__.py +89 -0
  859. truthound/validators/timeseries/base.py +326 -0
  860. truthound/validators/timeseries/completeness.py +617 -0
  861. truthound/validators/timeseries/gap.py +485 -0
  862. truthound/validators/timeseries/monotonic.py +310 -0
  863. truthound/validators/timeseries/seasonality.py +422 -0
  864. truthound/validators/timeseries/trend.py +510 -0
  865. truthound/validators/uniqueness/__init__.py +59 -0
  866. truthound/validators/uniqueness/approximate.py +475 -0
  867. truthound/validators/uniqueness/distinct_values.py +253 -0
  868. truthound/validators/uniqueness/duplicate.py +118 -0
  869. truthound/validators/uniqueness/primary_key.py +140 -0
  870. truthound/validators/uniqueness/unique.py +191 -0
  871. truthound/validators/uniqueness/within_record.py +599 -0
  872. truthound/validators/utils.py +756 -0
  873. truthound-1.0.8.dist-info/METADATA +474 -0
  874. truthound-1.0.8.dist-info/RECORD +877 -0
  875. truthound-1.0.8.dist-info/WHEEL +4 -0
  876. truthound-1.0.8.dist-info/entry_points.txt +2 -0
  877. truthound-1.0.8.dist-info/licenses/LICENSE +190 -0
@@ -0,0 +1,846 @@
1
+ """Streaming data sources for validators.
2
+
3
+ Provides various streaming sources for processing large datasets:
4
+ - File-based streaming (CSV, Parquet, JSON, Arrow IPC)
5
+ - Arrow Flight streaming for distributed processing
6
+ - Memory-mapped file streaming for low-memory processing
7
+
8
+ Memory Optimization:
9
+ These sources enable processing datasets larger than available memory
10
+ by reading data in chunks without loading the entire file.
11
+
12
+ # Stream through a 100GB Parquet file in 100K row chunks:
13
+ with ParquetStreamingSource("huge_data.parquet", chunk_size=100_000) as source:
14
+ for chunk_df in source:
15
+ issues = validator.validate(chunk_df.lazy())
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ from abc import ABC, abstractmethod
21
+ from dataclasses import dataclass, field
22
+ from pathlib import Path
23
+ from typing import Any, Iterator, TypeVar, Generic
24
+ import tempfile
25
+ import os
26
+
27
+ import polars as pl
28
+ import pyarrow as pa
29
+ import pyarrow.parquet as pq
30
+
31
+
32
+ T = TypeVar("T")
33
+
34
+
35
+ # =============================================================================
36
+ # Base Classes
37
+ # =============================================================================
38
+
39
+
40
@dataclass
class StreamingSourceConfig:
    """Settings shared by every streaming source.

    Attributes:
        chunk_size: How many rows each yielded chunk should hold.
        columns: Names of the columns to load; ``None`` loads all of them.
        skip_rows: How many leading rows to skip before reading.
        max_rows: Upper bound on the total rows read; ``None`` means no bound.
    """

    # Field order is part of the positional constructor signature.
    chunk_size: int = 100_000
    columns: list[str] | None = None
    skip_rows: int = 0
    max_rows: int | None = None
56
+
57
class StreamingSource(ABC, Generic[T]):
    """Common interface for sources that hand out data chunk by chunk.

    A streaming source is iterated to obtain DataFrame chunks, so a large
    dataset can be processed without holding all of it at once.

    Concrete subclasses have to provide:
        - __iter__(): yield DataFrame chunks
        - __len__(): report the total row count (if known)

    Example:
        with MyStreamingSource("data.parquet") as source:
            for chunk_df in source:
                process(chunk_df)
    """

    def __init__(self, config: StreamingSourceConfig | None = None, **kwargs: Any):
        # Only build a config from the keyword arguments when none was given;
        # otherwise the keyword arguments are not used.
        if not config:
            config = StreamingSourceConfig(**kwargs)
        self.config = config
        self._rows_read = 0
        self._is_open = False

    # -- state accessors ------------------------------------------------

    @property
    def is_open(self) -> bool:
        """Whether ``open()`` has been called without a later ``close()``."""
        return self._is_open

    @property
    def rows_read(self) -> int:
        """Row counter; reset to zero by ``open()``."""
        return self._rows_read

    # -- lifecycle --------------------------------------------------------

    def open(self) -> None:
        """Mark the source as open and reset the row counter."""
        self._rows_read = 0
        self._is_open = True

    def close(self) -> None:
        """Mark the source as closed."""
        self._is_open = False

    def __enter__(self) -> "StreamingSource":
        self.open()
        return self

    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        # Always closes; returns None so exceptions are not suppressed.
        self.close()

    # -- abstract interface ---------------------------------------------

    @abstractmethod
    def __iter__(self) -> Iterator[pl.DataFrame]:
        """Iterate over DataFrame chunks."""

    @abstractmethod
    def __len__(self) -> int:
        """Return total row count (may be estimated)."""
111
+
112
+
113
+ # =============================================================================
114
+ # File-Based Streaming Sources
115
+ # =============================================================================
116
+
117
+
118
@dataclass
class FileStreamingConfig(StreamingSourceConfig):
    """Streaming settings extended with file-specific options.

    Attributes:
        file_path: Location of the data file.
        use_mmap: Whether to memory-map the file for reduced memory usage.
    """

    # Declared after the inherited fields, so they follow them positionally.
    file_path: str = ""
    use_mmap: bool = True
130
+
131
class ParquetStreamingSource(StreamingSource):
    """Streaming source for Parquet files.

    Reads the file one row group at a time through ``pyarrow.parquet`` and
    re-slices each row group into chunks of at most ``chunk_size`` rows.

    Memory Optimization:
        - Uses row group streaming (no full file load)
        - Supports column projection
        - Optional memory mapping

    Example:
        source = ParquetStreamingSource(
            "huge_data.parquet",
            chunk_size=100_000,
            columns=["id", "value"],  # Only load these columns
        )
        with source:
            for chunk in source:
                validate(chunk)
    """

    def __init__(
        self,
        file_path: str | Path,
        chunk_size: int = 100_000,
        columns: list[str] | None = None,
        use_mmap: bool = True,
        **kwargs: Any,
    ):
        """Create the source.

        Args:
            file_path: Path to the Parquet file.
            chunk_size: Maximum number of rows per yielded chunk.
            columns: Columns to read; ``None`` reads all columns.
            use_mmap: Passed to ``pq.ParquetFile`` as ``memory_map``.
            **kwargs: Forwarded to ``FileStreamingConfig`` (for example
                ``skip_rows`` and ``max_rows``).
        """
        config = FileStreamingConfig(
            file_path=str(file_path),
            chunk_size=chunk_size,
            columns=columns,
            use_mmap=use_mmap,
            **kwargs,
        )
        super().__init__(config)
        self._file_path = Path(file_path)
        self._parquet_file: pq.ParquetFile | None = None
        self._total_rows: int = 0

    def open(self) -> None:
        """Open the Parquet file for streaming and cache its row count."""
        super().open()
        self._parquet_file = pq.ParquetFile(
            self._file_path,
            memory_map=self.config.use_mmap,
        )
        self._total_rows = self._parquet_file.metadata.num_rows

    def close(self) -> None:
        """Close the Parquet file and release its handle."""
        if self._parquet_file is not None:
            # Close explicitly instead of only dropping the reference.
            self._parquet_file.close()
            self._parquet_file = None
        super().close()

    def __len__(self) -> int:
        """Return the row count recorded in the Parquet metadata."""
        if self._parquet_file is not None:
            return self._total_rows
        # Open temporarily to get count
        with pq.ParquetFile(self._file_path) as pf:
            return pf.metadata.num_rows

    def __iter__(self) -> Iterator[pl.DataFrame]:
        """Yield chunks of at most ``chunk_size`` rows.

        The first ``skip_rows`` rows of the file are dropped (across row
        group boundaries) and at most ``max_rows`` rows are yielded after
        the skip.

        Raises:
            RuntimeError: If the source has not been opened.
        """
        if not self._is_open or self._parquet_file is None:
            raise RuntimeError("Source not open. Use 'with' statement or call open().")

        metadata = self._parquet_file.metadata
        chunk_size = self.config.chunk_size
        max_rows = self.config.max_rows
        # Rows still to be dropped from the front of the file. Tracked
        # separately from rows_yielded so skipped rows never count towards
        # max_rows and the skip can span several row groups.
        to_skip = self.config.skip_rows
        rows_yielded = 0

        for rg_idx in range(metadata.num_row_groups):
            if max_rows is not None and rows_yielded >= max_rows:
                break

            # Row groups that fall entirely inside the skipped prefix are
            # skipped using metadata only, without reading their data.
            rg_rows = metadata.row_group(rg_idx).num_rows
            if to_skip >= rg_rows:
                to_skip -= rg_rows
                continue

            table = self._parquet_file.read_row_group(
                rg_idx,
                columns=self.config.columns,
            )
            df = pl.from_arrow(table)

            # Drop the remainder of the skipped prefix inside this group.
            if to_skip > 0:
                df = df.slice(to_skip)
                to_skip = 0

            # Truncate to the remaining max_rows budget (after skipping).
            if max_rows is not None:
                df = df.head(max_rows - rows_yielded)

            for offset in range(0, len(df), chunk_size):
                chunk = df.slice(offset, chunk_size)
                self._rows_read += len(chunk)
                rows_yielded += len(chunk)
                yield chunk
240
+
241
+
242
class CSVStreamingSource(StreamingSource):
    """Streaming source for CSV files.

    Uses Polars' lazy scanning with slicing for memory-efficient
    CSV processing.

    Example:
        source = CSVStreamingSource(
            "large_data.csv",
            chunk_size=50_000,
            separator=",",
        )
        with source:
            for chunk in source:
                validate(chunk)
    """

    def __init__(
        self,
        file_path: str | Path,
        chunk_size: int = 100_000,
        columns: list[str] | None = None,
        separator: str = ",",
        has_header: bool = True,
        skip_rows: int = 0,
        max_rows: int | None = None,
        **kwargs: Any,
    ):
        """Create the source.

        Args:
            file_path: Path to the CSV file.
            chunk_size: Maximum number of rows per yielded chunk.
            columns: Columns to select; ``None`` keeps all columns.
            separator: Field separator passed to ``pl.scan_csv``.
            has_header: Passed to ``pl.scan_csv`` as ``has_header``.
            skip_rows: Passed to ``pl.scan_csv`` as ``skip_rows``.
            max_rows: Maximum total rows to yield; ``None`` means no limit.
            **kwargs: Accepted for signature compatibility; not used.
        """
        config = StreamingSourceConfig(
            chunk_size=chunk_size,
            columns=columns,
            skip_rows=skip_rows,
            max_rows=max_rows,
        )
        super().__init__(config)
        self._file_path = Path(file_path)
        self._separator = separator
        self._has_header = has_header
        self._total_rows: int | None = None
        self._lazy_frame: pl.LazyFrame | None = None

    def _scan(self) -> pl.LazyFrame:
        """Build the lazy scan with this source's parsing options applied."""
        lazy = pl.scan_csv(
            self._file_path,
            separator=self._separator,
            has_header=self._has_header,
            skip_rows=self.config.skip_rows,
        )
        # Select columns if specified
        if self.config.columns:
            lazy = lazy.select(self.config.columns)
        return lazy

    def open(self) -> None:
        """Open the CSV file for streaming."""
        super().open()
        self._lazy_frame = self._scan()

    def close(self) -> None:
        """Close the CSV source."""
        self._lazy_frame = None
        super().close()

    def __len__(self) -> int:
        """Return the row count, scanning the file once and caching the result.

        The count always uses the configured separator, header and
        ``skip_rows`` settings, whether or not the source is open, so the
        cached value stays valid for a later iteration.
        """
        if self._total_rows is None:
            lazy = self._lazy_frame if self._lazy_frame is not None else self._scan()
            # Count rows (this requires scanning the file)
            self._total_rows = lazy.select(pl.len()).collect().item()
        return self._total_rows

    def __iter__(self) -> Iterator[pl.DataFrame]:
        """Yield chunks of at most ``chunk_size`` rows.

        Raises:
            RuntimeError: If the source has not been opened.
        """
        if not self._is_open or self._lazy_frame is None:
            raise RuntimeError("Source not open. Use 'with' statement or call open().")

        chunk_size = self.config.chunk_size
        total_rows = len(self)
        # Compare against None explicitly so that max_rows=0 means
        # "yield nothing" rather than "no limit".
        max_rows = self.config.max_rows
        limit = total_rows if max_rows is None else max_rows
        rows_yielded = 0

        for offset in range(0, total_rows, chunk_size):
            if rows_yielded >= limit:
                break

            wanted = min(chunk_size, limit - rows_yielded)
            chunk = self._lazy_frame.slice(offset, wanted).collect()

            if len(chunk) == 0:
                break

            self._rows_read += len(chunk)
            rows_yielded += len(chunk)
            yield chunk
336
+
337
+
338
class JSONLStreamingSource(StreamingSource):
    """Streaming source for JSON Lines (JSONL/NDJSON) files.

    Scans the file lazily with ``pl.scan_ndjson`` and collects it slice by
    slice.

    Example:
        source = JSONLStreamingSource(
            "events.jsonl",
            chunk_size=10_000,
        )
        with source:
            for chunk in source:
                validate(chunk)
    """

    def __init__(
        self,
        file_path: str | Path,
        chunk_size: int = 100_000,
        columns: list[str] | None = None,
        **kwargs: Any,
    ):
        """Create the source.

        Args:
            file_path: Path to the JSON Lines file.
            chunk_size: Maximum number of rows per yielded chunk.
            columns: Columns to select; ``None`` keeps all columns.
            **kwargs: Forwarded to ``StreamingSourceConfig`` (for example
                ``skip_rows`` and ``max_rows``).
        """
        config = StreamingSourceConfig(
            chunk_size=chunk_size,
            columns=columns,
            **kwargs,
        )
        super().__init__(config)
        self._file_path = Path(file_path)
        self._lazy_frame: pl.LazyFrame | None = None

    def _scan(self) -> pl.LazyFrame:
        """Build the lazy scan with column selection and ``skip_rows`` applied."""
        lazy = pl.scan_ndjson(self._file_path)
        if self.config.columns:
            lazy = lazy.select(self.config.columns)
        # skip_rows is accepted through the config, so honour it here
        # instead of silently ignoring it.
        if self.config.skip_rows > 0:
            lazy = lazy.slice(self.config.skip_rows)
        return lazy

    def open(self) -> None:
        """Open the JSONL file for streaming."""
        super().open()
        self._lazy_frame = self._scan()

    def close(self) -> None:
        """Close the JSONL source."""
        self._lazy_frame = None
        super().close()

    def __len__(self) -> int:
        """Return the number of rows available after ``skip_rows``.

        The file is scanned on every call; the result is not cached.
        """
        lazy = self._lazy_frame if self._lazy_frame is not None else self._scan()
        return lazy.select(pl.len()).collect().item()

    def __iter__(self) -> Iterator[pl.DataFrame]:
        """Yield chunks of at most ``chunk_size`` rows.

        Raises:
            RuntimeError: If the source has not been opened.
        """
        if not self._is_open or self._lazy_frame is None:
            raise RuntimeError("Source not open. Use 'with' statement or call open().")

        chunk_size = self.config.chunk_size
        total_rows = len(self)
        # Compare against None explicitly so that max_rows=0 means
        # "yield nothing" rather than "no limit".
        max_rows = self.config.max_rows
        limit = total_rows if max_rows is None else max_rows
        rows_yielded = 0

        for offset in range(0, total_rows, chunk_size):
            if rows_yielded >= limit:
                break

            wanted = min(chunk_size, limit - rows_yielded)
            chunk = self._lazy_frame.slice(offset, wanted).collect()

            if len(chunk) == 0:
                break

            self._rows_read += len(chunk)
            rows_yielded += len(chunk)
            yield chunk
407
+
408
+
409
+ # =============================================================================
410
+ # Arrow IPC Streaming
411
+ # =============================================================================
412
+
413
+
414
class ArrowIPCStreamingSource(StreamingSource):
    """Streaming source for Arrow IPC files.

    Reads the record batches of an Arrow IPC file one by one and regroups
    them into chunks of ``chunk_size`` rows.

    Features:
        - Reads record batches through ``pa.ipc.open_file``
        - Optional column selection per batch
        - Supports memory mapping

    Example:
        source = ArrowIPCStreamingSource(
            "data.arrow",
            chunk_size=100_000,
        )
        with source:
            for chunk in source:
                validate(chunk)
    """

    def __init__(
        self,
        file_path: str | Path,
        chunk_size: int = 100_000,
        columns: list[str] | None = None,
        use_mmap: bool = True,
        **kwargs: Any,
    ):
        """Create the source.

        Args:
            file_path: Path to the Arrow IPC file.
            chunk_size: Number of rows per yielded chunk (the final chunk
                may be smaller).
            columns: Columns to select from each batch; ``None`` keeps all.
            use_mmap: Open the file through ``pa.memory_map`` when true.
            **kwargs: Forwarded to ``FileStreamingConfig`` (for example
                ``skip_rows`` and ``max_rows``).
        """
        config = FileStreamingConfig(
            file_path=str(file_path),
            chunk_size=chunk_size,
            columns=columns,
            use_mmap=use_mmap,
            **kwargs,
        )
        super().__init__(config)
        self._file_path = Path(file_path)
        self._reader: pa.RecordBatchFileReader | None = None
        self._source_file = None

    def open(self) -> None:
        """Open the Arrow IPC file."""
        super().open()

        if self.config.use_mmap:
            self._source_file = pa.memory_map(str(self._file_path), "r")
            self._reader = pa.ipc.open_file(self._source_file)
        else:
            self._reader = pa.ipc.open_file(str(self._file_path))

    def close(self) -> None:
        """Close the Arrow IPC file."""
        self._reader = None
        if self._source_file:
            self._source_file.close()
            self._source_file = None
        super().close()

    def __len__(self) -> int:
        """Return the total rows in the file by summing every record batch."""
        if self._reader:
            return sum(
                self._reader.get_batch(i).num_rows
                for i in range(self._reader.num_record_batches)
            )
        with pa.ipc.open_file(str(self._file_path)) as reader:
            return sum(
                reader.get_batch(i).num_rows
                for i in range(reader.num_record_batches)
            )

    def __iter__(self) -> Iterator[pl.DataFrame]:
        """Yield chunks of ``chunk_size`` rows; the final chunk may be smaller.

        The first ``skip_rows`` rows of the file are dropped and at most
        ``max_rows`` rows are yielded after the skip.

        Raises:
            RuntimeError: If the source has not been opened.
            ValueError: If ``chunk_size`` is not positive.
        """
        if not self._is_open or not self._reader:
            raise RuntimeError("Source not open. Use 'with' statement or call open().")

        chunk_size = self.config.chunk_size
        if chunk_size <= 0:
            # A non-positive chunk size would keep the re-chunking loop
            # below from ever terminating.
            raise ValueError("chunk_size must be a positive integer")

        max_rows = self.config.max_rows
        to_skip = self.config.skip_rows
        rows_yielded = 0
        pending: list[pa.RecordBatch] = []
        pending_rows = 0

        for batch_idx in range(self._reader.num_record_batches):
            if max_rows is not None and rows_yielded >= max_rows:
                break

            batch = self._reader.get_batch(batch_idx)

            # Honour skip_rows across batch boundaries.
            if to_skip > 0:
                if batch.num_rows <= to_skip:
                    to_skip -= batch.num_rows
                    continue
                batch = batch.slice(to_skip)
                to_skip = 0

            # Select columns if specified
            if self.config.columns:
                batch = batch.select(self.config.columns)

            pending.append(batch)
            pending_rows += batch.num_rows

            # Emit full chunks while enough rows are buffered.
            while pending_rows >= chunk_size:
                combined = pa.Table.from_batches(pending)
                chunk_df = pl.from_arrow(combined.slice(0, chunk_size))
                leftover = combined.slice(chunk_size)

                # Apply max_rows limit
                if max_rows is not None:
                    chunk_df = chunk_df.head(max_rows - rows_yielded)

                self._rows_read += len(chunk_df)
                rows_yielded += len(chunk_df)
                yield chunk_df

                # Keep every leftover batch so the buffer always matches
                # pending_rows.
                pending = leftover.to_batches() if leftover.num_rows > 0 else []
                pending_rows = leftover.num_rows

                if max_rows is not None and rows_yielded >= max_rows:
                    break

        # Yield whatever is left in the buffer as a final, smaller chunk.
        if pending and (max_rows is None or rows_yielded < max_rows):
            chunk_df = pl.from_arrow(pa.Table.from_batches(pending))

            if max_rows is not None:
                chunk_df = chunk_df.head(max_rows - rows_yielded)

            if len(chunk_df) > 0:
                self._rows_read += len(chunk_df)
                rows_yielded += len(chunk_df)
                yield chunk_df
550
+
551
+
552
+ # =============================================================================
553
+ # Arrow Flight Streaming (for distributed processing)
554
+ # =============================================================================
555
+
556
+
557
@dataclass
class ArrowFlightConfig(StreamingSourceConfig):
    """Configuration for Arrow Flight streaming.

    Attributes:
        host: Flight server host
        port: Flight server port
        ticket: Flight ticket for the data stream
        use_tls: Use TLS encryption
        token: Authentication token
        timeout_seconds: Timeout value in seconds.
            NOTE(review): no code in this section of the file reads this
            field - confirm where (or whether) it is enforced.
    """

    host: str = "localhost"
    port: int = 8815
    ticket: bytes = b""
    use_tls: bool = False
    token: str | None = None
    timeout_seconds: float = 60.0
575
+
576
+
577
class ArrowFlightStreamingSource(StreamingSource):
    """Streaming source using Arrow Flight protocol.

    Connects to a Flight server, redeems a ticket with ``do_get`` and
    re-chunks the incoming record batches into Polars DataFrames of
    ``chunk_size`` rows.

    Features:
        - Network streaming from remote servers
        - Optional TLS transport (``grpc+tls``)
        - Token authentication
        - Column selection and ``max_rows`` limiting

    Example:
        source = ArrowFlightStreamingSource(
            host="data-server.example.com",
            port=8815,
            ticket=b"dataset-1234",
        )
        with source:
            for chunk in source:
                validate(chunk)
    """

    def __init__(
        self,
        host: str = "localhost",
        port: int = 8815,
        ticket: bytes = b"",
        use_tls: bool = False,
        token: str | None = None,
        chunk_size: int = 100_000,
        columns: list[str] | None = None,
        **kwargs: Any,
    ):
        """Build the Flight configuration; no connection is made here.

        Args:
            host: Flight server host.
            port: Flight server port.
            ticket: Ticket bytes redeemed by ``open()``.
            use_tls: Connect with ``grpc+tls`` instead of ``grpc``.
            token: Optional authentication token.
            chunk_size: Rows per yielded DataFrame.
            columns: Columns to keep (``None`` keeps all).
            **kwargs: Extra fields forwarded to ``ArrowFlightConfig``.
        """
        config = ArrowFlightConfig(
            host=host,
            port=port,
            ticket=ticket,
            use_tls=use_tls,
            token=token,
            chunk_size=chunk_size,
            columns=columns,
            **kwargs,
        )
        super().__init__(config)
        self._client: Any = None
        self._reader: Any = None
        self._total_rows: int | None = None

    def _check_flight_available(self) -> None:
        """Check if Arrow Flight is available.

        Raises:
            ImportError: If ``pyarrow.flight`` cannot be imported.
        """
        try:
            import pyarrow.flight  # noqa: F401
        except ImportError as err:
            raise ImportError(
                "pyarrow.flight is required for Arrow Flight streaming. "
                "Install with: pip install pyarrow[flight]"
            ) from err

    def open(self) -> None:
        """Connect to the Flight server and start the ``do_get`` stream.

        When ``config.token`` is set it is sent as the password of a basic
        authentication handshake with an empty user name, and the header
        returned by that handshake is attached to the ``do_get`` call.

        Raises:
            ImportError: If ``pyarrow.flight`` is not installed.
        """
        # Fail on a missing dependency before the base class marks the
        # source as open.
        self._check_flight_available()

        import pyarrow.flight as flight

        super().open()

        try:
            # Build connection string
            scheme = "grpc+tls" if self.config.use_tls else "grpc"
            location = f"{scheme}://{self.config.host}:{self.config.port}"

            # Create client
            self._client = flight.connect(location)

            # Authenticate if token provided.  The handshake returns the
            # authorization header that later calls must carry, so it is
            # kept and passed along instead of being thrown away.
            call_options = None
            if self.config.token:
                auth_header = self._client.authenticate_basic_token(
                    "", self.config.token
                )
                call_options = flight.FlightCallOptions(headers=[auth_header])

            # Create reader from ticket
            ticket = flight.Ticket(self.config.ticket)
            self._reader = self._client.do_get(ticket, call_options)
        except Exception:
            # Do not leak the connection or stay flagged as open when the
            # connection setup fails half-way.
            self.close()
            raise

    def close(self) -> None:
        """Disconnect from the Flight server.

        Stops the stream reader, closes the client and finally calls the
        base class ``close()``; each later step runs even if an earlier
        one raises.
        """
        reader, self._reader = self._reader, None
        client, self._client = self._client, None
        try:
            if reader is not None:
                # Flight stream readers expose cancel(); other reader types
                # expose close().  Use whichever one is present.
                stop_reader = getattr(reader, "close", None) or getattr(
                    reader, "cancel", None
                )
                if stop_reader is not None:
                    stop_reader()
        finally:
            try:
                if client is not None:
                    client.close()
            finally:
                super().close()

    def __len__(self) -> int:
        """Return the number of rows in the stream, or -1 when unknown.

        The count only becomes known after ``__iter__`` has consumed the
        whole stream.  Note that the built-in ``len()`` rejects negative
        results with ``ValueError``, so the -1 sentinel is only visible to
        callers that invoke ``__len__()`` directly.
        """
        # Flight doesn't always provide row count upfront
        if self._total_rows is not None:
            return self._total_rows
        return -1  # Unknown

    def __iter__(self) -> Iterator[pl.DataFrame]:
        """Yield the stream as Polars DataFrames of ``chunk_size`` rows.

        Incoming record batches are optionally narrowed to
        ``config.columns`` and buffered until at least ``chunk_size`` rows
        are available.  Each yielded frame holds exactly ``chunk_size``
        rows, except that the last frame holds whatever is left and any
        frame may be trimmed so that no more than ``config.max_rows`` rows
        are yielded in total.  Once the stream has been read to its end the
        number of rows received is stored for ``__len__``.

        Raises:
            RuntimeError: If the source has not been opened.
            ValueError: If ``config.chunk_size`` is not positive.
        """
        if not self._is_open or not self._reader:
            raise RuntimeError("Source not open. Use 'with' statement or call open().")

        chunk_size = self.config.chunk_size
        if chunk_size <= 0:
            # A non-positive size would make the re-chunking loop below
            # spin forever on empty slices.
            raise ValueError(f"chunk_size must be positive, got {chunk_size}")

        max_rows = self.config.max_rows
        columns = self.config.columns
        rows_yielded = 0
        rows_received = 0
        stream_exhausted = False
        buffer: list[pa.RecordBatch] = []
        buffer_rows = 0

        # Stream record batches from Flight
        for item in self._reader:
            if max_rows is not None and rows_yielded >= max_rows:
                break

            # A Flight stream reader hands out chunk objects that carry the
            # record batch in ``.data`` (which may be None for
            # metadata-only messages); plain record batches pass through.
            batch = item if isinstance(item, pa.RecordBatch) else item.data
            if batch is None:
                continue
            rows_received += batch.num_rows

            # Select columns if specified
            if columns:
                batch = batch.select(columns)

            buffer.append(batch)
            buffer_rows += batch.num_rows

            # Emit full chunks for as long as the buffer can supply them.
            while buffer_rows >= chunk_size:
                combined = pa.Table.from_batches(buffer)
                chunk_df = pl.from_arrow(combined.slice(0, chunk_size))
                remaining_table = combined.slice(chunk_size)

                if max_rows is not None:
                    limit = max_rows - rows_yielded
                    if len(chunk_df) > limit:
                        chunk_df = chunk_df.head(limit)

                self._rows_read += len(chunk_df)
                rows_yielded += len(chunk_df)
                yield chunk_df

                # Carry over *every* leftover batch so that ``buffer`` and
                # ``buffer_rows`` always describe the same rows.
                if remaining_table.num_rows > 0:
                    buffer = remaining_table.to_batches()
                else:
                    buffer = []
                buffer_rows = remaining_table.num_rows

                if max_rows is not None and rows_yielded >= max_rows:
                    break
        else:
            # The for-loop ended without ``break``: the server sent
            # everything it had.
            stream_exhausted = True

        # Record the total before the last yield so it is set even when the
        # consumer stops right after the final chunk.  Only the first full
        # pass counts; a drained reader would otherwise reset it to 0.
        if stream_exhausted and self._total_rows is None:
            self._total_rows = rows_received

        # Yield remaining buffer
        if buffer and (max_rows is None or rows_yielded < max_rows):
            chunk_df = pl.from_arrow(pa.Table.from_batches(buffer))

            if max_rows is not None:
                limit = max_rows - rows_yielded
                if len(chunk_df) > limit:
                    chunk_df = chunk_df.head(limit)

            if len(chunk_df) > 0:
                self._rows_read += len(chunk_df)
                yield chunk_df
737
+
738
+
739
+ # =============================================================================
740
+ # Utility Functions
741
+ # =============================================================================
742
+
743
+
744
def create_streaming_source(
    source: str | Path | pl.LazyFrame,
    chunk_size: int = 100_000,
    columns: list[str] | None = None,
    **kwargs: Any,
) -> StreamingSource:
    """Create an appropriate streaming source based on input type.

    A LazyFrame is wrapped directly; anything else is converted with
    ``Path(source)`` and dispatched on its lower-cased file suffix.

    Args:
        source: File path or LazyFrame
        chunk_size: Rows per chunk
        columns: Columns to select (None = all)
        **kwargs: Additional source-specific options, forwarded to the
            selected source class.  For ``.csv``/``.tsv`` files an explicit
            ``separator`` takes precedence over the suffix default
            (tab for ``.tsv``, comma otherwise).

    Returns:
        Appropriate StreamingSource instance

    Raises:
        ValueError: If the file suffix is not one of the supported types.

    Example:
        # Automatically detects Parquet
        source = create_streaming_source("data.parquet")

        # Explicitly configure
        source = create_streaming_source(
            "data.csv",
            chunk_size=50_000,
            separator=";",
        )
    """
    if isinstance(source, pl.LazyFrame):
        # Forward the extra options too, so settings such as max_rows are
        # not silently dropped for LazyFrame inputs.
        return LazyFrameStreamingSource(
            source, chunk_size=chunk_size, columns=columns, **kwargs
        )

    path = Path(source)
    suffix = path.suffix.lower()

    if suffix in (".parquet", ".pq"):
        return ParquetStreamingSource(path, chunk_size=chunk_size, columns=columns, **kwargs)
    if suffix in (".csv", ".tsv"):
        # Always pop the option: leaving it in kwargs would pass
        # ``separator`` twice and raise TypeError for .tsv inputs.
        default_separator = "\t" if suffix == ".tsv" else ","
        separator = kwargs.pop("separator", default_separator)
        return CSVStreamingSource(
            path, chunk_size=chunk_size, columns=columns, separator=separator, **kwargs
        )
    if suffix in (".jsonl", ".ndjson"):
        return JSONLStreamingSource(path, chunk_size=chunk_size, columns=columns, **kwargs)
    if suffix in (".arrow", ".ipc", ".feather"):
        return ArrowIPCStreamingSource(path, chunk_size=chunk_size, columns=columns, **kwargs)
    raise ValueError(f"Unsupported file type: {suffix}")
792
+
793
+
794
+ class LazyFrameStreamingSource(StreamingSource):
795
+ """Streaming source wrapping a Polars LazyFrame.
796
+
797
+ Enables using the streaming interface with existing LazyFrames.
798
+
799
+ Example:
800
+ lf = pl.scan_parquet("data.parquet")
801
+ source = LazyFrameStreamingSource(lf, chunk_size=100_000)
802
+ with source:
803
+ for chunk in source:
804
+ validate(chunk)
805
+ """
806
+
807
def __init__(
    self,
    lazy_frame: pl.LazyFrame,
    chunk_size: int = 100_000,
    columns: list[str] | None = None,
    **kwargs: Any,
):
    """Wrap ``lazy_frame`` so it can be consumed chunk by chunk.

    Args:
        lazy_frame: The LazyFrame to stream from.
        chunk_size: Rows per chunk.
        columns: Columns to select (a falsy value keeps all columns).
        **kwargs: Extra fields forwarded to ``StreamingSourceConfig``.
    """
    super().__init__(
        StreamingSourceConfig(chunk_size=chunk_size, columns=columns, **kwargs)
    )
    # Apply the column projection once, up front, so every later query on
    # the stored frame already works on the narrowed frame.
    self._lazy_frame = lazy_frame.select(columns) if columns else lazy_frame
    # Row count cache; filled on the first __len__ call.
    self._total_rows: int | None = None
820
+
821
def __len__(self) -> int:
    """Return the LazyFrame's row count, computing and caching it on first use."""
    cached_total = self._total_rows
    if cached_total is not None:
        return cached_total

    # First call: run a count query against the lazy frame and remember it.
    cached_total = self._lazy_frame.select(pl.len()).collect().item()
    self._total_rows = cached_total
    return cached_total
825
+
826
+ def __iter__(self) -> Iterator[pl.DataFrame]:
827
+ if not self._is_open:
828
+ raise RuntimeError("Source not open. Use 'with' statement or call open().")
829
+
830
+ total_rows = len(self)
831
+ max_rows = self.config.max_rows or total_rows
832
+ rows_yielded = 0
833
+
834
+ for offset in range(0, total_rows, self.config.chunk_size):
835
+ if rows_yielded >= max_rows:
836
+ break
837
+
838
+ remaining = min(self.config.chunk_size, max_rows - rows_yielded)
839
+ chunk = self._lazy_frame.slice(offset, remaining).collect()
840
+
841
+ if len(chunk) == 0:
842
+ break
843
+
844
+ self._rows_read += len(chunk)
845
+ rows_yielded += len(chunk)
846
+ yield chunk