truthound-1.0.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (877)
  1. truthound/__init__.py +162 -0
  2. truthound/adapters.py +100 -0
  3. truthound/api.py +365 -0
  4. truthound/audit/__init__.py +248 -0
  5. truthound/audit/core.py +967 -0
  6. truthound/audit/filters.py +620 -0
  7. truthound/audit/formatters.py +707 -0
  8. truthound/audit/logger.py +902 -0
  9. truthound/audit/middleware.py +571 -0
  10. truthound/audit/storage.py +1083 -0
  11. truthound/benchmark/__init__.py +123 -0
  12. truthound/benchmark/base.py +757 -0
  13. truthound/benchmark/comparison.py +635 -0
  14. truthound/benchmark/generators.py +706 -0
  15. truthound/benchmark/reporters.py +718 -0
  16. truthound/benchmark/runner.py +635 -0
  17. truthound/benchmark/scenarios.py +712 -0
  18. truthound/cache.py +252 -0
  19. truthound/checkpoint/__init__.py +136 -0
  20. truthound/checkpoint/actions/__init__.py +164 -0
  21. truthound/checkpoint/actions/base.py +324 -0
  22. truthound/checkpoint/actions/custom.py +234 -0
  23. truthound/checkpoint/actions/discord_notify.py +290 -0
  24. truthound/checkpoint/actions/email_notify.py +405 -0
  25. truthound/checkpoint/actions/github_action.py +406 -0
  26. truthound/checkpoint/actions/opsgenie.py +1499 -0
  27. truthound/checkpoint/actions/pagerduty.py +226 -0
  28. truthound/checkpoint/actions/slack_notify.py +233 -0
  29. truthound/checkpoint/actions/store_result.py +249 -0
  30. truthound/checkpoint/actions/teams_notify.py +1570 -0
  31. truthound/checkpoint/actions/telegram_notify.py +419 -0
  32. truthound/checkpoint/actions/update_docs.py +552 -0
  33. truthound/checkpoint/actions/webhook.py +293 -0
  34. truthound/checkpoint/analytics/__init__.py +147 -0
  35. truthound/checkpoint/analytics/aggregations/__init__.py +23 -0
  36. truthound/checkpoint/analytics/aggregations/rollup.py +481 -0
  37. truthound/checkpoint/analytics/aggregations/time_bucket.py +306 -0
  38. truthound/checkpoint/analytics/analyzers/__init__.py +17 -0
  39. truthound/checkpoint/analytics/analyzers/anomaly.py +386 -0
  40. truthound/checkpoint/analytics/analyzers/base.py +270 -0
  41. truthound/checkpoint/analytics/analyzers/forecast.py +421 -0
  42. truthound/checkpoint/analytics/analyzers/trend.py +314 -0
  43. truthound/checkpoint/analytics/models.py +292 -0
  44. truthound/checkpoint/analytics/protocols.py +549 -0
  45. truthound/checkpoint/analytics/service.py +718 -0
  46. truthound/checkpoint/analytics/stores/__init__.py +16 -0
  47. truthound/checkpoint/analytics/stores/base.py +306 -0
  48. truthound/checkpoint/analytics/stores/memory_store.py +353 -0
  49. truthound/checkpoint/analytics/stores/sqlite_store.py +557 -0
  50. truthound/checkpoint/analytics/stores/timescale_store.py +501 -0
  51. truthound/checkpoint/async_actions.py +794 -0
  52. truthound/checkpoint/async_base.py +708 -0
  53. truthound/checkpoint/async_checkpoint.py +617 -0
  54. truthound/checkpoint/async_runner.py +639 -0
  55. truthound/checkpoint/checkpoint.py +527 -0
  56. truthound/checkpoint/ci/__init__.py +61 -0
  57. truthound/checkpoint/ci/detector.py +355 -0
  58. truthound/checkpoint/ci/reporter.py +436 -0
  59. truthound/checkpoint/ci/templates.py +454 -0
  60. truthound/checkpoint/circuitbreaker/__init__.py +133 -0
  61. truthound/checkpoint/circuitbreaker/breaker.py +542 -0
  62. truthound/checkpoint/circuitbreaker/core.py +252 -0
  63. truthound/checkpoint/circuitbreaker/detection.py +459 -0
  64. truthound/checkpoint/circuitbreaker/middleware.py +389 -0
  65. truthound/checkpoint/circuitbreaker/registry.py +357 -0
  66. truthound/checkpoint/distributed/__init__.py +139 -0
  67. truthound/checkpoint/distributed/backends/__init__.py +35 -0
  68. truthound/checkpoint/distributed/backends/celery_backend.py +503 -0
  69. truthound/checkpoint/distributed/backends/kubernetes_backend.py +696 -0
  70. truthound/checkpoint/distributed/backends/local_backend.py +397 -0
  71. truthound/checkpoint/distributed/backends/ray_backend.py +625 -0
  72. truthound/checkpoint/distributed/base.py +774 -0
  73. truthound/checkpoint/distributed/orchestrator.py +765 -0
  74. truthound/checkpoint/distributed/protocols.py +842 -0
  75. truthound/checkpoint/distributed/registry.py +449 -0
  76. truthound/checkpoint/idempotency/__init__.py +120 -0
  77. truthound/checkpoint/idempotency/core.py +295 -0
  78. truthound/checkpoint/idempotency/fingerprint.py +454 -0
  79. truthound/checkpoint/idempotency/locking.py +604 -0
  80. truthound/checkpoint/idempotency/service.py +592 -0
  81. truthound/checkpoint/idempotency/stores.py +653 -0
  82. truthound/checkpoint/monitoring/__init__.py +134 -0
  83. truthound/checkpoint/monitoring/aggregators/__init__.py +15 -0
  84. truthound/checkpoint/monitoring/aggregators/base.py +372 -0
  85. truthound/checkpoint/monitoring/aggregators/realtime.py +300 -0
  86. truthound/checkpoint/monitoring/aggregators/window.py +493 -0
  87. truthound/checkpoint/monitoring/collectors/__init__.py +17 -0
  88. truthound/checkpoint/monitoring/collectors/base.py +257 -0
  89. truthound/checkpoint/monitoring/collectors/memory_collector.py +617 -0
  90. truthound/checkpoint/monitoring/collectors/prometheus_collector.py +451 -0
  91. truthound/checkpoint/monitoring/collectors/redis_collector.py +518 -0
  92. truthound/checkpoint/monitoring/events.py +410 -0
  93. truthound/checkpoint/monitoring/protocols.py +636 -0
  94. truthound/checkpoint/monitoring/service.py +578 -0
  95. truthound/checkpoint/monitoring/views/__init__.py +17 -0
  96. truthound/checkpoint/monitoring/views/base.py +172 -0
  97. truthound/checkpoint/monitoring/views/queue_view.py +220 -0
  98. truthound/checkpoint/monitoring/views/task_view.py +240 -0
  99. truthound/checkpoint/monitoring/views/worker_view.py +263 -0
  100. truthound/checkpoint/registry.py +337 -0
  101. truthound/checkpoint/runner.py +356 -0
  102. truthound/checkpoint/transaction/__init__.py +133 -0
  103. truthound/checkpoint/transaction/base.py +389 -0
  104. truthound/checkpoint/transaction/compensatable.py +537 -0
  105. truthound/checkpoint/transaction/coordinator.py +576 -0
  106. truthound/checkpoint/transaction/executor.py +622 -0
  107. truthound/checkpoint/transaction/idempotency.py +534 -0
  108. truthound/checkpoint/transaction/saga/__init__.py +143 -0
  109. truthound/checkpoint/transaction/saga/builder.py +584 -0
  110. truthound/checkpoint/transaction/saga/definition.py +515 -0
  111. truthound/checkpoint/transaction/saga/event_store.py +542 -0
  112. truthound/checkpoint/transaction/saga/patterns.py +833 -0
  113. truthound/checkpoint/transaction/saga/runner.py +718 -0
  114. truthound/checkpoint/transaction/saga/state_machine.py +793 -0
  115. truthound/checkpoint/transaction/saga/strategies.py +780 -0
  116. truthound/checkpoint/transaction/saga/testing.py +886 -0
  117. truthound/checkpoint/triggers/__init__.py +58 -0
  118. truthound/checkpoint/triggers/base.py +237 -0
  119. truthound/checkpoint/triggers/event.py +385 -0
  120. truthound/checkpoint/triggers/schedule.py +355 -0
  121. truthound/cli.py +2358 -0
  122. truthound/cli_modules/__init__.py +124 -0
  123. truthound/cli_modules/advanced/__init__.py +45 -0
  124. truthound/cli_modules/advanced/benchmark.py +343 -0
  125. truthound/cli_modules/advanced/docs.py +225 -0
  126. truthound/cli_modules/advanced/lineage.py +209 -0
  127. truthound/cli_modules/advanced/ml.py +320 -0
  128. truthound/cli_modules/advanced/realtime.py +196 -0
  129. truthound/cli_modules/checkpoint/__init__.py +46 -0
  130. truthound/cli_modules/checkpoint/init.py +114 -0
  131. truthound/cli_modules/checkpoint/list.py +71 -0
  132. truthound/cli_modules/checkpoint/run.py +159 -0
  133. truthound/cli_modules/checkpoint/validate.py +67 -0
  134. truthound/cli_modules/common/__init__.py +71 -0
  135. truthound/cli_modules/common/errors.py +414 -0
  136. truthound/cli_modules/common/options.py +419 -0
  137. truthound/cli_modules/common/output.py +507 -0
  138. truthound/cli_modules/common/protocol.py +552 -0
  139. truthound/cli_modules/core/__init__.py +48 -0
  140. truthound/cli_modules/core/check.py +123 -0
  141. truthound/cli_modules/core/compare.py +104 -0
  142. truthound/cli_modules/core/learn.py +57 -0
  143. truthound/cli_modules/core/mask.py +77 -0
  144. truthound/cli_modules/core/profile.py +65 -0
  145. truthound/cli_modules/core/scan.py +61 -0
  146. truthound/cli_modules/profiler/__init__.py +51 -0
  147. truthound/cli_modules/profiler/auto_profile.py +175 -0
  148. truthound/cli_modules/profiler/metadata.py +107 -0
  149. truthound/cli_modules/profiler/suite.py +283 -0
  150. truthound/cli_modules/registry.py +431 -0
  151. truthound/cli_modules/scaffolding/__init__.py +89 -0
  152. truthound/cli_modules/scaffolding/base.py +631 -0
  153. truthound/cli_modules/scaffolding/commands.py +545 -0
  154. truthound/cli_modules/scaffolding/plugins.py +1072 -0
  155. truthound/cli_modules/scaffolding/reporters.py +594 -0
  156. truthound/cli_modules/scaffolding/validators.py +1127 -0
  157. truthound/common/__init__.py +18 -0
  158. truthound/common/resilience/__init__.py +130 -0
  159. truthound/common/resilience/bulkhead.py +266 -0
  160. truthound/common/resilience/circuit_breaker.py +516 -0
  161. truthound/common/resilience/composite.py +332 -0
  162. truthound/common/resilience/config.py +292 -0
  163. truthound/common/resilience/protocols.py +217 -0
  164. truthound/common/resilience/rate_limiter.py +404 -0
  165. truthound/common/resilience/retry.py +341 -0
  166. truthound/datadocs/__init__.py +260 -0
  167. truthound/datadocs/base.py +571 -0
  168. truthound/datadocs/builder.py +761 -0
  169. truthound/datadocs/charts.py +764 -0
  170. truthound/datadocs/dashboard/__init__.py +63 -0
  171. truthound/datadocs/dashboard/app.py +576 -0
  172. truthound/datadocs/dashboard/components.py +584 -0
  173. truthound/datadocs/dashboard/state.py +240 -0
  174. truthound/datadocs/engine/__init__.py +46 -0
  175. truthound/datadocs/engine/context.py +376 -0
  176. truthound/datadocs/engine/pipeline.py +618 -0
  177. truthound/datadocs/engine/registry.py +469 -0
  178. truthound/datadocs/exporters/__init__.py +49 -0
  179. truthound/datadocs/exporters/base.py +198 -0
  180. truthound/datadocs/exporters/html.py +178 -0
  181. truthound/datadocs/exporters/json_exporter.py +253 -0
  182. truthound/datadocs/exporters/markdown.py +284 -0
  183. truthound/datadocs/exporters/pdf.py +392 -0
  184. truthound/datadocs/i18n/__init__.py +86 -0
  185. truthound/datadocs/i18n/catalog.py +960 -0
  186. truthound/datadocs/i18n/formatting.py +505 -0
  187. truthound/datadocs/i18n/loader.py +256 -0
  188. truthound/datadocs/i18n/plurals.py +378 -0
  189. truthound/datadocs/renderers/__init__.py +42 -0
  190. truthound/datadocs/renderers/base.py +401 -0
  191. truthound/datadocs/renderers/custom.py +342 -0
  192. truthound/datadocs/renderers/jinja.py +697 -0
  193. truthound/datadocs/sections.py +736 -0
  194. truthound/datadocs/styles.py +931 -0
  195. truthound/datadocs/themes/__init__.py +101 -0
  196. truthound/datadocs/themes/base.py +336 -0
  197. truthound/datadocs/themes/default.py +417 -0
  198. truthound/datadocs/themes/enterprise.py +419 -0
  199. truthound/datadocs/themes/loader.py +336 -0
  200. truthound/datadocs/themes.py +301 -0
  201. truthound/datadocs/transformers/__init__.py +57 -0
  202. truthound/datadocs/transformers/base.py +268 -0
  203. truthound/datadocs/transformers/enrichers.py +544 -0
  204. truthound/datadocs/transformers/filters.py +447 -0
  205. truthound/datadocs/transformers/i18n.py +468 -0
  206. truthound/datadocs/versioning/__init__.py +62 -0
  207. truthound/datadocs/versioning/diff.py +639 -0
  208. truthound/datadocs/versioning/storage.py +497 -0
  209. truthound/datadocs/versioning/version.py +358 -0
  210. truthound/datasources/__init__.py +223 -0
  211. truthound/datasources/_async_protocols.py +222 -0
  212. truthound/datasources/_protocols.py +159 -0
  213. truthound/datasources/adapters.py +428 -0
  214. truthound/datasources/async_base.py +599 -0
  215. truthound/datasources/async_factory.py +511 -0
  216. truthound/datasources/base.py +516 -0
  217. truthound/datasources/factory.py +433 -0
  218. truthound/datasources/nosql/__init__.py +47 -0
  219. truthound/datasources/nosql/base.py +487 -0
  220. truthound/datasources/nosql/elasticsearch.py +801 -0
  221. truthound/datasources/nosql/mongodb.py +636 -0
  222. truthound/datasources/pandas_optimized.py +582 -0
  223. truthound/datasources/pandas_source.py +216 -0
  224. truthound/datasources/polars_source.py +395 -0
  225. truthound/datasources/spark_source.py +479 -0
  226. truthound/datasources/sql/__init__.py +154 -0
  227. truthound/datasources/sql/base.py +710 -0
  228. truthound/datasources/sql/bigquery.py +410 -0
  229. truthound/datasources/sql/cloud_base.py +199 -0
  230. truthound/datasources/sql/databricks.py +471 -0
  231. truthound/datasources/sql/mysql.py +316 -0
  232. truthound/datasources/sql/oracle.py +427 -0
  233. truthound/datasources/sql/postgresql.py +321 -0
  234. truthound/datasources/sql/redshift.py +479 -0
  235. truthound/datasources/sql/snowflake.py +439 -0
  236. truthound/datasources/sql/sqlite.py +286 -0
  237. truthound/datasources/sql/sqlserver.py +437 -0
  238. truthound/datasources/streaming/__init__.py +47 -0
  239. truthound/datasources/streaming/base.py +350 -0
  240. truthound/datasources/streaming/kafka.py +670 -0
  241. truthound/decorators.py +98 -0
  242. truthound/docs/__init__.py +69 -0
  243. truthound/docs/extractor.py +971 -0
  244. truthound/docs/generator.py +601 -0
  245. truthound/docs/parser.py +1037 -0
  246. truthound/docs/renderer.py +999 -0
  247. truthound/drift/__init__.py +22 -0
  248. truthound/drift/compare.py +189 -0
  249. truthound/drift/detectors.py +464 -0
  250. truthound/drift/report.py +160 -0
  251. truthound/execution/__init__.py +65 -0
  252. truthound/execution/_protocols.py +324 -0
  253. truthound/execution/base.py +576 -0
  254. truthound/execution/distributed/__init__.py +179 -0
  255. truthound/execution/distributed/aggregations.py +731 -0
  256. truthound/execution/distributed/arrow_bridge.py +817 -0
  257. truthound/execution/distributed/base.py +550 -0
  258. truthound/execution/distributed/dask_engine.py +976 -0
  259. truthound/execution/distributed/mixins.py +766 -0
  260. truthound/execution/distributed/protocols.py +756 -0
  261. truthound/execution/distributed/ray_engine.py +1127 -0
  262. truthound/execution/distributed/registry.py +446 -0
  263. truthound/execution/distributed/spark_engine.py +1011 -0
  264. truthound/execution/distributed/validator_adapter.py +682 -0
  265. truthound/execution/pandas_engine.py +401 -0
  266. truthound/execution/polars_engine.py +497 -0
  267. truthound/execution/pushdown/__init__.py +230 -0
  268. truthound/execution/pushdown/ast.py +1550 -0
  269. truthound/execution/pushdown/builder.py +1550 -0
  270. truthound/execution/pushdown/dialects.py +1072 -0
  271. truthound/execution/pushdown/executor.py +829 -0
  272. truthound/execution/pushdown/optimizer.py +1041 -0
  273. truthound/execution/sql_engine.py +518 -0
  274. truthound/infrastructure/__init__.py +189 -0
  275. truthound/infrastructure/audit.py +1515 -0
  276. truthound/infrastructure/config.py +1133 -0
  277. truthound/infrastructure/encryption.py +1132 -0
  278. truthound/infrastructure/logging.py +1503 -0
  279. truthound/infrastructure/metrics.py +1220 -0
  280. truthound/lineage/__init__.py +89 -0
  281. truthound/lineage/base.py +746 -0
  282. truthound/lineage/impact_analysis.py +474 -0
  283. truthound/lineage/integrations/__init__.py +22 -0
  284. truthound/lineage/integrations/openlineage.py +548 -0
  285. truthound/lineage/tracker.py +512 -0
  286. truthound/lineage/visualization/__init__.py +33 -0
  287. truthound/lineage/visualization/protocols.py +145 -0
  288. truthound/lineage/visualization/renderers/__init__.py +20 -0
  289. truthound/lineage/visualization/renderers/cytoscape.py +329 -0
  290. truthound/lineage/visualization/renderers/d3.py +331 -0
  291. truthound/lineage/visualization/renderers/graphviz.py +276 -0
  292. truthound/lineage/visualization/renderers/mermaid.py +308 -0
  293. truthound/maskers.py +113 -0
  294. truthound/ml/__init__.py +124 -0
  295. truthound/ml/anomaly_models/__init__.py +31 -0
  296. truthound/ml/anomaly_models/ensemble.py +362 -0
  297. truthound/ml/anomaly_models/isolation_forest.py +444 -0
  298. truthound/ml/anomaly_models/statistical.py +392 -0
  299. truthound/ml/base.py +1178 -0
  300. truthound/ml/drift_detection/__init__.py +26 -0
  301. truthound/ml/drift_detection/concept.py +381 -0
  302. truthound/ml/drift_detection/distribution.py +361 -0
  303. truthound/ml/drift_detection/feature.py +442 -0
  304. truthound/ml/drift_detection/multivariate.py +495 -0
  305. truthound/ml/monitoring/__init__.py +88 -0
  306. truthound/ml/monitoring/alerting/__init__.py +33 -0
  307. truthound/ml/monitoring/alerting/handlers.py +427 -0
  308. truthound/ml/monitoring/alerting/rules.py +508 -0
  309. truthound/ml/monitoring/collectors/__init__.py +19 -0
  310. truthound/ml/monitoring/collectors/composite.py +105 -0
  311. truthound/ml/monitoring/collectors/drift.py +324 -0
  312. truthound/ml/monitoring/collectors/performance.py +179 -0
  313. truthound/ml/monitoring/collectors/quality.py +369 -0
  314. truthound/ml/monitoring/monitor.py +536 -0
  315. truthound/ml/monitoring/protocols.py +451 -0
  316. truthound/ml/monitoring/stores/__init__.py +15 -0
  317. truthound/ml/monitoring/stores/memory.py +201 -0
  318. truthound/ml/monitoring/stores/prometheus.py +296 -0
  319. truthound/ml/rule_learning/__init__.py +25 -0
  320. truthound/ml/rule_learning/constraint_miner.py +443 -0
  321. truthound/ml/rule_learning/pattern_learner.py +499 -0
  322. truthound/ml/rule_learning/profile_learner.py +462 -0
  323. truthound/multitenancy/__init__.py +326 -0
  324. truthound/multitenancy/core.py +852 -0
  325. truthound/multitenancy/integration.py +597 -0
  326. truthound/multitenancy/isolation.py +630 -0
  327. truthound/multitenancy/manager.py +770 -0
  328. truthound/multitenancy/middleware.py +765 -0
  329. truthound/multitenancy/quota.py +537 -0
  330. truthound/multitenancy/resolvers.py +603 -0
  331. truthound/multitenancy/storage.py +703 -0
  332. truthound/observability/__init__.py +307 -0
  333. truthound/observability/context.py +531 -0
  334. truthound/observability/instrumentation.py +611 -0
  335. truthound/observability/logging.py +887 -0
  336. truthound/observability/metrics.py +1157 -0
  337. truthound/observability/tracing/__init__.py +178 -0
  338. truthound/observability/tracing/baggage.py +310 -0
  339. truthound/observability/tracing/config.py +426 -0
  340. truthound/observability/tracing/exporter.py +787 -0
  341. truthound/observability/tracing/integration.py +1018 -0
  342. truthound/observability/tracing/otel/__init__.py +146 -0
  343. truthound/observability/tracing/otel/adapter.py +982 -0
  344. truthound/observability/tracing/otel/bridge.py +1177 -0
  345. truthound/observability/tracing/otel/compat.py +681 -0
  346. truthound/observability/tracing/otel/config.py +691 -0
  347. truthound/observability/tracing/otel/detection.py +327 -0
  348. truthound/observability/tracing/otel/protocols.py +426 -0
  349. truthound/observability/tracing/processor.py +561 -0
  350. truthound/observability/tracing/propagator.py +757 -0
  351. truthound/observability/tracing/provider.py +569 -0
  352. truthound/observability/tracing/resource.py +515 -0
  353. truthound/observability/tracing/sampler.py +487 -0
  354. truthound/observability/tracing/span.py +676 -0
  355. truthound/plugins/__init__.py +198 -0
  356. truthound/plugins/base.py +599 -0
  357. truthound/plugins/cli.py +680 -0
  358. truthound/plugins/dependencies/__init__.py +42 -0
  359. truthound/plugins/dependencies/graph.py +422 -0
  360. truthound/plugins/dependencies/resolver.py +417 -0
  361. truthound/plugins/discovery.py +379 -0
  362. truthound/plugins/docs/__init__.py +46 -0
  363. truthound/plugins/docs/extractor.py +444 -0
  364. truthound/plugins/docs/renderer.py +499 -0
  365. truthound/plugins/enterprise_manager.py +877 -0
  366. truthound/plugins/examples/__init__.py +19 -0
  367. truthound/plugins/examples/custom_validators.py +317 -0
  368. truthound/plugins/examples/slack_notifier.py +312 -0
  369. truthound/plugins/examples/xml_reporter.py +254 -0
  370. truthound/plugins/hooks.py +558 -0
  371. truthound/plugins/lifecycle/__init__.py +43 -0
  372. truthound/plugins/lifecycle/hot_reload.py +402 -0
  373. truthound/plugins/lifecycle/manager.py +371 -0
  374. truthound/plugins/manager.py +736 -0
  375. truthound/plugins/registry.py +338 -0
  376. truthound/plugins/security/__init__.py +93 -0
  377. truthound/plugins/security/exceptions.py +332 -0
  378. truthound/plugins/security/policies.py +348 -0
  379. truthound/plugins/security/protocols.py +643 -0
  380. truthound/plugins/security/sandbox/__init__.py +45 -0
  381. truthound/plugins/security/sandbox/context.py +158 -0
  382. truthound/plugins/security/sandbox/engines/__init__.py +19 -0
  383. truthound/plugins/security/sandbox/engines/container.py +379 -0
  384. truthound/plugins/security/sandbox/engines/noop.py +144 -0
  385. truthound/plugins/security/sandbox/engines/process.py +336 -0
  386. truthound/plugins/security/sandbox/factory.py +211 -0
  387. truthound/plugins/security/signing/__init__.py +57 -0
  388. truthound/plugins/security/signing/service.py +330 -0
  389. truthound/plugins/security/signing/trust_store.py +368 -0
  390. truthound/plugins/security/signing/verifier.py +459 -0
  391. truthound/plugins/versioning/__init__.py +41 -0
  392. truthound/plugins/versioning/constraints.py +297 -0
  393. truthound/plugins/versioning/resolver.py +329 -0
  394. truthound/profiler/__init__.py +1729 -0
  395. truthound/profiler/_lazy.py +452 -0
  396. truthound/profiler/ab_testing/__init__.py +80 -0
  397. truthound/profiler/ab_testing/analysis.py +449 -0
  398. truthound/profiler/ab_testing/base.py +257 -0
  399. truthound/profiler/ab_testing/experiment.py +395 -0
  400. truthound/profiler/ab_testing/tracking.py +368 -0
  401. truthound/profiler/auto_threshold.py +1170 -0
  402. truthound/profiler/base.py +579 -0
  403. truthound/profiler/cache_patterns.py +911 -0
  404. truthound/profiler/caching.py +1303 -0
  405. truthound/profiler/column_profiler.py +712 -0
  406. truthound/profiler/comparison.py +1007 -0
  407. truthound/profiler/custom_patterns.py +1170 -0
  408. truthound/profiler/dashboard/__init__.py +50 -0
  409. truthound/profiler/dashboard/app.py +476 -0
  410. truthound/profiler/dashboard/components.py +457 -0
  411. truthound/profiler/dashboard/config.py +72 -0
  412. truthound/profiler/distributed/__init__.py +83 -0
  413. truthound/profiler/distributed/base.py +281 -0
  414. truthound/profiler/distributed/dask_backend.py +498 -0
  415. truthound/profiler/distributed/local_backend.py +293 -0
  416. truthound/profiler/distributed/profiler.py +304 -0
  417. truthound/profiler/distributed/ray_backend.py +374 -0
  418. truthound/profiler/distributed/spark_backend.py +375 -0
  419. truthound/profiler/distributed.py +1366 -0
  420. truthound/profiler/enterprise_sampling.py +1065 -0
  421. truthound/profiler/errors.py +488 -0
  422. truthound/profiler/evolution/__init__.py +91 -0
  423. truthound/profiler/evolution/alerts.py +426 -0
  424. truthound/profiler/evolution/changes.py +206 -0
  425. truthound/profiler/evolution/compatibility.py +365 -0
  426. truthound/profiler/evolution/detector.py +372 -0
  427. truthound/profiler/evolution/protocols.py +121 -0
  428. truthound/profiler/generators/__init__.py +48 -0
  429. truthound/profiler/generators/base.py +384 -0
  430. truthound/profiler/generators/ml_rules.py +375 -0
  431. truthound/profiler/generators/pattern_rules.py +384 -0
  432. truthound/profiler/generators/schema_rules.py +267 -0
  433. truthound/profiler/generators/stats_rules.py +324 -0
  434. truthound/profiler/generators/suite_generator.py +857 -0
  435. truthound/profiler/i18n.py +1542 -0
  436. truthound/profiler/incremental.py +554 -0
  437. truthound/profiler/incremental_validation.py +1710 -0
  438. truthound/profiler/integration/__init__.py +73 -0
  439. truthound/profiler/integration/adapters.py +345 -0
  440. truthound/profiler/integration/context.py +371 -0
  441. truthound/profiler/integration/executor.py +527 -0
  442. truthound/profiler/integration/naming.py +75 -0
  443. truthound/profiler/integration/protocols.py +243 -0
  444. truthound/profiler/memory.py +1185 -0
  445. truthound/profiler/migration/__init__.py +60 -0
  446. truthound/profiler/migration/base.py +345 -0
  447. truthound/profiler/migration/manager.py +444 -0
  448. truthound/profiler/migration/v1_0_to_v1_1.py +484 -0
  449. truthound/profiler/ml/__init__.py +73 -0
  450. truthound/profiler/ml/base.py +244 -0
  451. truthound/profiler/ml/classifier.py +507 -0
  452. truthound/profiler/ml/feature_extraction.py +604 -0
  453. truthound/profiler/ml/pretrained.py +448 -0
  454. truthound/profiler/ml_inference.py +1276 -0
  455. truthound/profiler/native_patterns.py +815 -0
  456. truthound/profiler/observability.py +1184 -0
  457. truthound/profiler/process_timeout.py +1566 -0
  458. truthound/profiler/progress.py +568 -0
  459. truthound/profiler/progress_callbacks.py +1734 -0
  460. truthound/profiler/quality.py +1345 -0
  461. truthound/profiler/resilience.py +1180 -0
  462. truthound/profiler/sampled_matcher.py +794 -0
  463. truthound/profiler/sampling.py +1288 -0
  464. truthound/profiler/scheduling/__init__.py +82 -0
  465. truthound/profiler/scheduling/protocols.py +214 -0
  466. truthound/profiler/scheduling/scheduler.py +474 -0
  467. truthound/profiler/scheduling/storage.py +457 -0
  468. truthound/profiler/scheduling/triggers.py +449 -0
  469. truthound/profiler/schema.py +603 -0
  470. truthound/profiler/streaming.py +685 -0
  471. truthound/profiler/streaming_patterns.py +1354 -0
  472. truthound/profiler/suite_cli.py +625 -0
  473. truthound/profiler/suite_config.py +789 -0
  474. truthound/profiler/suite_export.py +1268 -0
  475. truthound/profiler/table_profiler.py +547 -0
  476. truthound/profiler/timeout.py +565 -0
  477. truthound/profiler/validation.py +1532 -0
  478. truthound/profiler/visualization/__init__.py +118 -0
  479. truthound/profiler/visualization/base.py +346 -0
  480. truthound/profiler/visualization/generator.py +1259 -0
  481. truthound/profiler/visualization/plotly_renderer.py +811 -0
  482. truthound/profiler/visualization/renderers.py +669 -0
  483. truthound/profiler/visualization/sections.py +540 -0
  484. truthound/profiler/visualization.py +2122 -0
  485. truthound/profiler/yaml_validation.py +1151 -0
  486. truthound/py.typed +0 -0
  487. truthound/ratelimit/__init__.py +248 -0
  488. truthound/ratelimit/algorithms.py +1108 -0
  489. truthound/ratelimit/core.py +573 -0
  490. truthound/ratelimit/integration.py +532 -0
  491. truthound/ratelimit/limiter.py +663 -0
  492. truthound/ratelimit/middleware.py +700 -0
  493. truthound/ratelimit/policy.py +792 -0
  494. truthound/ratelimit/storage.py +763 -0
  495. truthound/rbac/__init__.py +340 -0
  496. truthound/rbac/core.py +976 -0
  497. truthound/rbac/integration.py +760 -0
  498. truthound/rbac/manager.py +1052 -0
  499. truthound/rbac/middleware.py +842 -0
  500. truthound/rbac/policy.py +954 -0
  501. truthound/rbac/storage.py +878 -0
  502. truthound/realtime/__init__.py +141 -0
  503. truthound/realtime/adapters/__init__.py +43 -0
  504. truthound/realtime/adapters/base.py +533 -0
  505. truthound/realtime/adapters/kafka.py +487 -0
  506. truthound/realtime/adapters/kinesis.py +479 -0
  507. truthound/realtime/adapters/mock.py +243 -0
  508. truthound/realtime/base.py +553 -0
  509. truthound/realtime/factory.py +382 -0
  510. truthound/realtime/incremental.py +660 -0
  511. truthound/realtime/processing/__init__.py +67 -0
  512. truthound/realtime/processing/exactly_once.py +575 -0
  513. truthound/realtime/processing/state.py +547 -0
  514. truthound/realtime/processing/windows.py +647 -0
  515. truthound/realtime/protocols.py +569 -0
  516. truthound/realtime/streaming.py +605 -0
  517. truthound/realtime/testing/__init__.py +32 -0
  518. truthound/realtime/testing/containers.py +615 -0
  519. truthound/realtime/testing/fixtures.py +484 -0
  520. truthound/report.py +280 -0
  521. truthound/reporters/__init__.py +46 -0
  522. truthound/reporters/_protocols.py +30 -0
  523. truthound/reporters/base.py +324 -0
  524. truthound/reporters/ci/__init__.py +66 -0
  525. truthound/reporters/ci/azure.py +436 -0
  526. truthound/reporters/ci/base.py +509 -0
  527. truthound/reporters/ci/bitbucket.py +567 -0
  528. truthound/reporters/ci/circleci.py +547 -0
  529. truthound/reporters/ci/detection.py +364 -0
  530. truthound/reporters/ci/factory.py +182 -0
  531. truthound/reporters/ci/github.py +388 -0
  532. truthound/reporters/ci/gitlab.py +471 -0
  533. truthound/reporters/ci/jenkins.py +525 -0
  534. truthound/reporters/console_reporter.py +299 -0
  535. truthound/reporters/factory.py +211 -0
  536. truthound/reporters/html_reporter.py +524 -0
  537. truthound/reporters/json_reporter.py +256 -0
  538. truthound/reporters/markdown_reporter.py +280 -0
  539. truthound/reporters/sdk/__init__.py +174 -0
  540. truthound/reporters/sdk/builder.py +558 -0
  541. truthound/reporters/sdk/mixins.py +1150 -0
  542. truthound/reporters/sdk/schema.py +1493 -0
  543. truthound/reporters/sdk/templates.py +666 -0
  544. truthound/reporters/sdk/testing.py +968 -0
  545. truthound/scanners.py +170 -0
  546. truthound/scheduling/__init__.py +122 -0
  547. truthound/scheduling/cron.py +1136 -0
  548. truthound/scheduling/presets.py +212 -0
  549. truthound/schema.py +275 -0
  550. truthound/secrets/__init__.py +173 -0
  551. truthound/secrets/base.py +618 -0
  552. truthound/secrets/cloud.py +682 -0
  553. truthound/secrets/integration.py +507 -0
  554. truthound/secrets/manager.py +633 -0
  555. truthound/secrets/oidc/__init__.py +172 -0
  556. truthound/secrets/oidc/base.py +902 -0
  557. truthound/secrets/oidc/credential_provider.py +623 -0
  558. truthound/secrets/oidc/exchangers.py +1001 -0
  559. truthound/secrets/oidc/github/__init__.py +110 -0
  560. truthound/secrets/oidc/github/claims.py +718 -0
  561. truthound/secrets/oidc/github/enhanced_provider.py +693 -0
  562. truthound/secrets/oidc/github/trust_policy.py +742 -0
  563. truthound/secrets/oidc/github/verification.py +723 -0
  564. truthound/secrets/oidc/github/workflow.py +691 -0
  565. truthound/secrets/oidc/providers.py +825 -0
  566. truthound/secrets/providers.py +506 -0
  567. truthound/secrets/resolver.py +495 -0
  568. truthound/stores/__init__.py +177 -0
  569. truthound/stores/backends/__init__.py +18 -0
  570. truthound/stores/backends/_protocols.py +340 -0
  571. truthound/stores/backends/azure_blob.py +530 -0
  572. truthound/stores/backends/concurrent_filesystem.py +915 -0
  573. truthound/stores/backends/connection_pool.py +1365 -0
  574. truthound/stores/backends/database.py +743 -0
  575. truthound/stores/backends/filesystem.py +538 -0
  576. truthound/stores/backends/gcs.py +399 -0
  577. truthound/stores/backends/memory.py +354 -0
  578. truthound/stores/backends/s3.py +434 -0
  579. truthound/stores/backpressure/__init__.py +84 -0
  580. truthound/stores/backpressure/base.py +375 -0
  581. truthound/stores/backpressure/circuit_breaker.py +434 -0
  582. truthound/stores/backpressure/monitor.py +376 -0
  583. truthound/stores/backpressure/strategies.py +677 -0
  584. truthound/stores/base.py +551 -0
  585. truthound/stores/batching/__init__.py +65 -0
  586. truthound/stores/batching/base.py +305 -0
  587. truthound/stores/batching/buffer.py +370 -0
  588. truthound/stores/batching/store.py +248 -0
  589. truthound/stores/batching/writer.py +521 -0
  590. truthound/stores/caching/__init__.py +60 -0
  591. truthound/stores/caching/backends.py +684 -0
  592. truthound/stores/caching/base.py +356 -0
  593. truthound/stores/caching/store.py +305 -0
  594. truthound/stores/compression/__init__.py +193 -0
  595. truthound/stores/compression/adaptive.py +694 -0
  596. truthound/stores/compression/base.py +514 -0
  597. truthound/stores/compression/pipeline.py +868 -0
  598. truthound/stores/compression/providers.py +672 -0
  599. truthound/stores/compression/streaming.py +832 -0
  600. truthound/stores/concurrency/__init__.py +81 -0
  601. truthound/stores/concurrency/atomic.py +556 -0
  602. truthound/stores/concurrency/index.py +775 -0
  603. truthound/stores/concurrency/locks.py +576 -0
  604. truthound/stores/concurrency/manager.py +482 -0
  605. truthound/stores/encryption/__init__.py +297 -0
  606. truthound/stores/encryption/base.py +952 -0
  607. truthound/stores/encryption/keys.py +1191 -0
  608. truthound/stores/encryption/pipeline.py +903 -0
  609. truthound/stores/encryption/providers.py +953 -0
  610. truthound/stores/encryption/streaming.py +950 -0
  611. truthound/stores/expectations.py +227 -0
  612. truthound/stores/factory.py +246 -0
  613. truthound/stores/migration/__init__.py +75 -0
  614. truthound/stores/migration/base.py +480 -0
  615. truthound/stores/migration/manager.py +347 -0
  616. truthound/stores/migration/registry.py +382 -0
  617. truthound/stores/migration/store.py +559 -0
  618. truthound/stores/observability/__init__.py +106 -0
  619. truthound/stores/observability/audit.py +718 -0
  620. truthound/stores/observability/config.py +270 -0
  621. truthound/stores/observability/factory.py +208 -0
  622. truthound/stores/observability/metrics.py +636 -0
  623. truthound/stores/observability/protocols.py +410 -0
  624. truthound/stores/observability/store.py +570 -0
  625. truthound/stores/observability/tracing.py +784 -0
  626. truthound/stores/replication/__init__.py +76 -0
  627. truthound/stores/replication/base.py +260 -0
  628. truthound/stores/replication/monitor.py +269 -0
  629. truthound/stores/replication/store.py +439 -0
  630. truthound/stores/replication/syncer.py +391 -0
  631. truthound/stores/results.py +359 -0
  632. truthound/stores/retention/__init__.py +77 -0
  633. truthound/stores/retention/base.py +378 -0
  634. truthound/stores/retention/policies.py +621 -0
  635. truthound/stores/retention/scheduler.py +279 -0
  636. truthound/stores/retention/store.py +526 -0
  637. truthound/stores/streaming/__init__.py +138 -0
  638. truthound/stores/streaming/base.py +801 -0
  639. truthound/stores/streaming/database.py +984 -0
  640. truthound/stores/streaming/filesystem.py +719 -0
  641. truthound/stores/streaming/reader.py +629 -0
  642. truthound/stores/streaming/s3.py +843 -0
  643. truthound/stores/streaming/writer.py +790 -0
  644. truthound/stores/tiering/__init__.py +108 -0
  645. truthound/stores/tiering/base.py +462 -0
  646. truthound/stores/tiering/manager.py +249 -0
  647. truthound/stores/tiering/policies.py +692 -0
  648. truthound/stores/tiering/store.py +526 -0
  649. truthound/stores/versioning/__init__.py +56 -0
  650. truthound/stores/versioning/base.py +376 -0
  651. truthound/stores/versioning/store.py +660 -0
  652. truthound/stores/versioning/strategies.py +353 -0
  653. truthound/types.py +56 -0
  654. truthound/validators/__init__.py +774 -0
  655. truthound/validators/aggregate/__init__.py +27 -0
  656. truthound/validators/aggregate/central.py +116 -0
  657. truthound/validators/aggregate/extremes.py +116 -0
  658. truthound/validators/aggregate/spread.py +118 -0
  659. truthound/validators/aggregate/sum.py +64 -0
  660. truthound/validators/aggregate/type.py +78 -0
  661. truthound/validators/anomaly/__init__.py +93 -0
  662. truthound/validators/anomaly/base.py +431 -0
  663. truthound/validators/anomaly/ml_based.py +1190 -0
  664. truthound/validators/anomaly/multivariate.py +647 -0
  665. truthound/validators/anomaly/statistical.py +599 -0
  666. truthound/validators/base.py +1089 -0
  667. truthound/validators/business_rule/__init__.py +46 -0
  668. truthound/validators/business_rule/base.py +147 -0
  669. truthound/validators/business_rule/checksum.py +509 -0
  670. truthound/validators/business_rule/financial.py +526 -0
  671. truthound/validators/cache.py +733 -0
  672. truthound/validators/completeness/__init__.py +39 -0
  673. truthound/validators/completeness/conditional.py +73 -0
  674. truthound/validators/completeness/default.py +98 -0
  675. truthound/validators/completeness/empty.py +103 -0
  676. truthound/validators/completeness/nan.py +337 -0
  677. truthound/validators/completeness/null.py +152 -0
  678. truthound/validators/cross_table/__init__.py +17 -0
  679. truthound/validators/cross_table/aggregate.py +333 -0
  680. truthound/validators/cross_table/row_count.py +122 -0
  681. truthound/validators/datetime/__init__.py +29 -0
  682. truthound/validators/datetime/format.py +78 -0
  683. truthound/validators/datetime/freshness.py +269 -0
  684. truthound/validators/datetime/order.py +73 -0
  685. truthound/validators/datetime/parseable.py +185 -0
  686. truthound/validators/datetime/range.py +202 -0
  687. truthound/validators/datetime/timezone.py +69 -0
  688. truthound/validators/distribution/__init__.py +49 -0
  689. truthound/validators/distribution/distribution.py +128 -0
  690. truthound/validators/distribution/monotonic.py +119 -0
  691. truthound/validators/distribution/outlier.py +178 -0
  692. truthound/validators/distribution/quantile.py +80 -0
  693. truthound/validators/distribution/range.py +254 -0
  694. truthound/validators/distribution/set.py +125 -0
  695. truthound/validators/distribution/statistical.py +459 -0
  696. truthound/validators/drift/__init__.py +79 -0
  697. truthound/validators/drift/base.py +427 -0
  698. truthound/validators/drift/multi_feature.py +401 -0
  699. truthound/validators/drift/numeric.py +395 -0
  700. truthound/validators/drift/psi.py +446 -0
  701. truthound/validators/drift/statistical.py +510 -0
  702. truthound/validators/enterprise.py +1658 -0
  703. truthound/validators/geospatial/__init__.py +80 -0
  704. truthound/validators/geospatial/base.py +97 -0
  705. truthound/validators/geospatial/boundary.py +238 -0
  706. truthound/validators/geospatial/coordinate.py +351 -0
  707. truthound/validators/geospatial/distance.py +399 -0
  708. truthound/validators/geospatial/polygon.py +665 -0
  709. truthound/validators/i18n/__init__.py +308 -0
  710. truthound/validators/i18n/bidi.py +571 -0
  711. truthound/validators/i18n/catalogs.py +570 -0
  712. truthound/validators/i18n/dialects.py +763 -0
  713. truthound/validators/i18n/extended_catalogs.py +549 -0
  714. truthound/validators/i18n/formatting.py +1434 -0
  715. truthound/validators/i18n/loader.py +1020 -0
  716. truthound/validators/i18n/messages.py +521 -0
  717. truthound/validators/i18n/plural.py +683 -0
  718. truthound/validators/i18n/protocols.py +855 -0
  719. truthound/validators/i18n/tms.py +1162 -0
  720. truthound/validators/localization/__init__.py +53 -0
  721. truthound/validators/localization/base.py +122 -0
  722. truthound/validators/localization/chinese.py +362 -0
  723. truthound/validators/localization/japanese.py +275 -0
  724. truthound/validators/localization/korean.py +524 -0
  725. truthound/validators/memory/__init__.py +94 -0
  726. truthound/validators/memory/approximate_knn.py +506 -0
  727. truthound/validators/memory/base.py +547 -0
  728. truthound/validators/memory/sgd_online.py +719 -0
  729. truthound/validators/memory/streaming_ecdf.py +753 -0
  730. truthound/validators/ml_feature/__init__.py +54 -0
  731. truthound/validators/ml_feature/base.py +249 -0
  732. truthound/validators/ml_feature/correlation.py +299 -0
  733. truthound/validators/ml_feature/leakage.py +344 -0
  734. truthound/validators/ml_feature/null_impact.py +270 -0
  735. truthound/validators/ml_feature/scale.py +264 -0
  736. truthound/validators/multi_column/__init__.py +89 -0
  737. truthound/validators/multi_column/arithmetic.py +284 -0
  738. truthound/validators/multi_column/base.py +231 -0
  739. truthound/validators/multi_column/comparison.py +273 -0
  740. truthound/validators/multi_column/consistency.py +312 -0
  741. truthound/validators/multi_column/statistical.py +299 -0
  742. truthound/validators/optimization/__init__.py +164 -0
  743. truthound/validators/optimization/aggregation.py +563 -0
  744. truthound/validators/optimization/covariance.py +556 -0
  745. truthound/validators/optimization/geo.py +626 -0
  746. truthound/validators/optimization/graph.py +587 -0
  747. truthound/validators/optimization/orchestrator.py +970 -0
  748. truthound/validators/optimization/profiling.py +1312 -0
  749. truthound/validators/privacy/__init__.py +223 -0
  750. truthound/validators/privacy/base.py +635 -0
  751. truthound/validators/privacy/ccpa.py +670 -0
  752. truthound/validators/privacy/gdpr.py +728 -0
  753. truthound/validators/privacy/global_patterns.py +604 -0
  754. truthound/validators/privacy/plugins.py +867 -0
  755. truthound/validators/profiling/__init__.py +52 -0
  756. truthound/validators/profiling/base.py +175 -0
  757. truthound/validators/profiling/cardinality.py +312 -0
  758. truthound/validators/profiling/entropy.py +391 -0
  759. truthound/validators/profiling/frequency.py +455 -0
  760. truthound/validators/pushdown_support.py +660 -0
  761. truthound/validators/query/__init__.py +91 -0
  762. truthound/validators/query/aggregate.py +346 -0
  763. truthound/validators/query/base.py +246 -0
  764. truthound/validators/query/column.py +249 -0
  765. truthound/validators/query/expression.py +274 -0
  766. truthound/validators/query/result.py +323 -0
  767. truthound/validators/query/row_count.py +264 -0
  768. truthound/validators/referential/__init__.py +80 -0
  769. truthound/validators/referential/base.py +395 -0
  770. truthound/validators/referential/cascade.py +391 -0
  771. truthound/validators/referential/circular.py +563 -0
  772. truthound/validators/referential/foreign_key.py +624 -0
  773. truthound/validators/referential/orphan.py +485 -0
  774. truthound/validators/registry.py +112 -0
  775. truthound/validators/schema/__init__.py +41 -0
  776. truthound/validators/schema/column_count.py +142 -0
  777. truthound/validators/schema/column_exists.py +80 -0
  778. truthound/validators/schema/column_order.py +82 -0
  779. truthound/validators/schema/column_pair.py +85 -0
  780. truthound/validators/schema/column_pair_set.py +195 -0
  781. truthound/validators/schema/column_type.py +94 -0
  782. truthound/validators/schema/multi_column.py +53 -0
  783. truthound/validators/schema/multi_column_aggregate.py +175 -0
  784. truthound/validators/schema/referential.py +274 -0
  785. truthound/validators/schema/table_schema.py +91 -0
  786. truthound/validators/schema_validator.py +219 -0
  787. truthound/validators/sdk/__init__.py +250 -0
  788. truthound/validators/sdk/builder.py +680 -0
  789. truthound/validators/sdk/decorators.py +474 -0
  790. truthound/validators/sdk/enterprise/__init__.py +211 -0
  791. truthound/validators/sdk/enterprise/docs.py +725 -0
  792. truthound/validators/sdk/enterprise/fuzzing.py +659 -0
  793. truthound/validators/sdk/enterprise/licensing.py +709 -0
  794. truthound/validators/sdk/enterprise/manager.py +543 -0
  795. truthound/validators/sdk/enterprise/resources.py +628 -0
  796. truthound/validators/sdk/enterprise/sandbox.py +766 -0
  797. truthound/validators/sdk/enterprise/signing.py +603 -0
  798. truthound/validators/sdk/enterprise/templates.py +865 -0
  799. truthound/validators/sdk/enterprise/versioning.py +659 -0
  800. truthound/validators/sdk/templates.py +757 -0
  801. truthound/validators/sdk/testing.py +807 -0
  802. truthound/validators/security/__init__.py +181 -0
  803. truthound/validators/security/redos/__init__.py +182 -0
  804. truthound/validators/security/redos/core.py +861 -0
  805. truthound/validators/security/redos/cpu_monitor.py +593 -0
  806. truthound/validators/security/redos/cve_database.py +791 -0
  807. truthound/validators/security/redos/ml/__init__.py +155 -0
  808. truthound/validators/security/redos/ml/base.py +785 -0
  809. truthound/validators/security/redos/ml/datasets.py +618 -0
  810. truthound/validators/security/redos/ml/features.py +359 -0
  811. truthound/validators/security/redos/ml/models.py +1000 -0
  812. truthound/validators/security/redos/ml/predictor.py +507 -0
  813. truthound/validators/security/redos/ml/storage.py +632 -0
  814. truthound/validators/security/redos/ml/training.py +571 -0
  815. truthound/validators/security/redos/ml_analyzer.py +937 -0
  816. truthound/validators/security/redos/optimizer.py +674 -0
  817. truthound/validators/security/redos/profiler.py +682 -0
  818. truthound/validators/security/redos/re2_engine.py +709 -0
  819. truthound/validators/security/redos.py +886 -0
  820. truthound/validators/security/sql_security.py +1247 -0
  821. truthound/validators/streaming/__init__.py +126 -0
  822. truthound/validators/streaming/base.py +292 -0
  823. truthound/validators/streaming/completeness.py +210 -0
  824. truthound/validators/streaming/mixin.py +575 -0
  825. truthound/validators/streaming/range.py +308 -0
  826. truthound/validators/streaming/sources.py +846 -0
  827. truthound/validators/string/__init__.py +57 -0
  828. truthound/validators/string/casing.py +158 -0
  829. truthound/validators/string/charset.py +96 -0
  830. truthound/validators/string/format.py +501 -0
  831. truthound/validators/string/json.py +77 -0
  832. truthound/validators/string/json_schema.py +184 -0
  833. truthound/validators/string/length.py +104 -0
  834. truthound/validators/string/like_pattern.py +237 -0
  835. truthound/validators/string/regex.py +202 -0
  836. truthound/validators/string/regex_extended.py +435 -0
  837. truthound/validators/table/__init__.py +88 -0
  838. truthound/validators/table/base.py +78 -0
  839. truthound/validators/table/column_count.py +198 -0
  840. truthound/validators/table/freshness.py +362 -0
  841. truthound/validators/table/row_count.py +251 -0
  842. truthound/validators/table/schema.py +333 -0
  843. truthound/validators/table/size.py +285 -0
  844. truthound/validators/timeout/__init__.py +102 -0
  845. truthound/validators/timeout/advanced/__init__.py +247 -0
  846. truthound/validators/timeout/advanced/circuit_breaker.py +675 -0
  847. truthound/validators/timeout/advanced/prediction.py +773 -0
  848. truthound/validators/timeout/advanced/priority.py +618 -0
  849. truthound/validators/timeout/advanced/redis_backend.py +770 -0
  850. truthound/validators/timeout/advanced/retry.py +721 -0
  851. truthound/validators/timeout/advanced/sampling.py +788 -0
  852. truthound/validators/timeout/advanced/sla.py +661 -0
  853. truthound/validators/timeout/advanced/telemetry.py +804 -0
  854. truthound/validators/timeout/cascade.py +477 -0
  855. truthound/validators/timeout/deadline.py +657 -0
  856. truthound/validators/timeout/degradation.py +525 -0
  857. truthound/validators/timeout/distributed.py +597 -0
  858. truthound/validators/timeseries/__init__.py +89 -0
  859. truthound/validators/timeseries/base.py +326 -0
  860. truthound/validators/timeseries/completeness.py +617 -0
  861. truthound/validators/timeseries/gap.py +485 -0
  862. truthound/validators/timeseries/monotonic.py +310 -0
  863. truthound/validators/timeseries/seasonality.py +422 -0
  864. truthound/validators/timeseries/trend.py +510 -0
  865. truthound/validators/uniqueness/__init__.py +59 -0
  866. truthound/validators/uniqueness/approximate.py +475 -0
  867. truthound/validators/uniqueness/distinct_values.py +253 -0
  868. truthound/validators/uniqueness/duplicate.py +118 -0
  869. truthound/validators/uniqueness/primary_key.py +140 -0
  870. truthound/validators/uniqueness/unique.py +191 -0
  871. truthound/validators/uniqueness/within_record.py +599 -0
  872. truthound/validators/utils.py +756 -0
  873. truthound-1.0.8.dist-info/METADATA +474 -0
  874. truthound-1.0.8.dist-info/RECORD +877 -0
  875. truthound-1.0.8.dist-info/WHEEL +4 -0
  876. truthound-1.0.8.dist-info/entry_points.txt +2 -0
  877. truthound-1.0.8.dist-info/licenses/LICENSE +190 -0
@@ -0,0 +1,1089 @@
1
+ """Base classes for validators.
2
+
3
+ Features:
4
+ - Immutable configuration (thread-safe)
5
+ - Timeout mechanism
6
+ - Type-safe column filtering
7
+ - ReDoS protection for regex patterns
8
+ - Graceful degradation on errors
9
+ """
10
+
11
+ from abc import ABC, abstractmethod
12
+ from dataclasses import dataclass
13
+ from typing import Any, Callable
14
+ import re
15
+ import signal
16
+ import threading
17
+ import logging
18
+ import time
19
+ from functools import wraps
20
+ from enum import Enum
21
+
22
+ import polars as pl
23
+
24
+ from truthound.types import Severity
25
+
26
+
27
+ # ============================================================================
28
+ # Logging - Uses standard Python logging directly
29
+ # ============================================================================
30
+
31
def _get_logger(name: str) -> logging.Logger:
    """Return the standard-library logger named ``truthound.<name>``.

    Args:
        name: Validator name appended to the ``truthound.`` prefix.

    Returns:
        The ``logging.Logger`` registered under the prefixed name.
    """
    qualified_name = "truthound.{}".format(name)
    return logging.getLogger(qualified_name)
34
+
35
+
36
+ # ============================================================================
37
+ # Error Types
38
+ # ============================================================================
39
+
40
class RegexValidationError(ValueError):
    """Error signalling that a regex pattern was rejected as invalid.

    The offending pattern and the reason are kept on the instance as
    ``pattern`` and ``error`` and are also combined into the message.
    """

    def __init__(self, pattern: str, error: str):
        message = f"Invalid regex pattern '{pattern}': {error}"
        super().__init__(message)
        self.pattern = pattern
        self.error = error
47
+
48
+
49
class ValidationTimeoutError(Exception):
    """Error raised when a validation runs longer than its timeout.

    ``timeout_seconds`` and ``validator_name`` are stored on the instance.
    When a validator name is given, the message is prefixed with it in
    square brackets.
    """

    def __init__(self, timeout_seconds: float, validator_name: str = ""):
        prefix = f"[{validator_name}] " if validator_name else ""
        super().__init__(f"{prefix}Validation timed out after {timeout_seconds}s")
        self.timeout_seconds = timeout_seconds
        self.validator_name = validator_name
59
+
60
+
61
class ColumnNotFoundError(Exception):
    """Error raised when a required column is absent from the schema.

    The message lists at most the first ten available columns and appends
    ``...`` when more exist. The full list stays on ``available_columns``.
    """

    def __init__(self, column: str, available_columns: list[str]):
        self.column = column
        self.available_columns = available_columns

        preview = available_columns[:10]
        ellipsis = "..." if len(available_columns) > 10 else ""
        super().__init__(
            f"Column '{column}' not found. Available: {preview}{ellipsis}"
        )
71
+
72
+
73
+ # ============================================================================
74
+ # ReDoS Protection (simplified)
75
+ # ============================================================================
76
+
77
+ class RegexSafetyChecker:
78
+ """Detects ReDoS vulnerabilities in regex patterns.
79
+
80
+ Checks for common dangerous patterns that could cause exponential backtracking.
81
+ """
82
+
83
+ REDOS_PATTERNS = [
84
+ r"\(.+\)\+\+", # Nested quantifiers: (a+)+
85
+ r"\(.+\)\*\*", # Nested quantifiers: (a*)*
86
+ r"\(.+\)\{\d+,\}", # Nested with unbounded repetition
87
+ r"\(.+\|.+\)\+", # Alternation in quantified group
88
+ ]
89
+
90
+ MAX_PATTERN_LENGTH = 1000
91
+
92
+ @classmethod
93
+ def check_pattern(cls, pattern: str) -> tuple[bool, str | None]:
94
+ """Check if a pattern is potentially vulnerable to ReDoS."""
95
+ if len(pattern) > cls.MAX_PATTERN_LENGTH:
96
+ return False, f"Pattern too long ({len(pattern)} > {cls.MAX_PATTERN_LENGTH})"
97
+
98
+ for redos_pattern in cls.REDOS_PATTERNS:
99
+ if re.search(redos_pattern, pattern):
100
+ return False, f"Potentially vulnerable to ReDoS: matches {redos_pattern}"
101
+
102
+ return True, None
103
+
104
+
105
+ # ============================================================================
106
+ # Safe Sampling
107
+ # ============================================================================
108
+
109
class SafeSampler:
    """Helpers that pull a bounded number of rows out of a LazyFrame.

    Every helper limits the query with ``head(n)`` before collecting and
    collects with the streaming engine.
    """

    @staticmethod
    def safe_head(
        lf: pl.LazyFrame,
        n: int,
        columns: list[str] | None = None,
    ) -> pl.DataFrame:
        """Collect the first ``n`` rows, optionally projecting columns.

        Requested columns that are not in the schema are dropped. When
        none of them exist, no projection is applied.
        """
        if not columns:
            return lf.head(n).collect(engine="streaming")

        known = lf.collect_schema().names()
        keep = [name for name in columns if name in known]
        projected = lf.select(keep) if keep else lf
        return projected.head(n).collect(engine="streaming")

    @staticmethod
    def safe_sample(
        lf: pl.LazyFrame,
        n: int,
        columns: list[str] | None = None,
        seed: int | None = None,
    ) -> pl.DataFrame:
        """Return ``n`` rows by delegating to :meth:`safe_head`.

        ``seed`` is accepted but not used; the rows returned are the
        first ``n`` rows rather than a random draw.
        """
        return SafeSampler.safe_head(lf=lf, n=n, columns=columns)

    @staticmethod
    def safe_filter_sample(
        lf: pl.LazyFrame,
        filter_expr: pl.Expr,
        n: int,
        columns: list[str] | None = None,
    ) -> pl.DataFrame:
        """Collect the first ``n`` rows that satisfy ``filter_expr``.

        The column projection follows the same rules as
        :meth:`safe_head` and is applied after the filter.
        """
        matching = lf.filter(filter_expr)
        if columns:
            known = lf.collect_schema().names()
            keep = [name for name in columns if name in known]
            if keep:
                matching = matching.select(keep)
        return matching.head(n).collect(engine="streaming")
152
+
153
+
154
+ # ============================================================================
155
+ # Memory Tracking (stub for compatibility)
156
+ # ============================================================================
157
+
158
+ class MemoryTracker:
159
+ """Stub for backward compatibility. Memory tracking is not enforced."""
160
+
161
+ def __init__(self, limit_mb: float | None = None):
162
+ self.limit_mb = limit_mb
163
+ self.peak_mb: float = 0.0
164
+
165
+ def get_current_mb(self) -> float:
166
+ return 0.0
167
+
168
+ def start(self) -> None:
169
+ pass
170
+
171
+ def check(self) -> None:
172
+ pass
173
+
174
+ def get_delta_mb(self) -> float:
175
+ return 0.0
176
+
177
+ def __enter__(self) -> "MemoryTracker":
178
+ return self
179
+
180
+ def __exit__(self, *args: Any) -> None:
181
+ pass
182
+
183
+
184
+ # ============================================================================
185
+ # Graceful Degradation (#8, #13)
186
+ # ============================================================================
187
+
188
class ValidationResult(Enum):
    """Outcome status of a single validation operation."""

    SUCCESS = "success"

    # Completed with some issues
    PARTIAL = "partial"

    # Skipped due to missing columns, etc.
    SKIPPED = "skipped"

    # Unrecoverable error
    FAILED = "failed"

    # Exceeded time limit
    TIMEOUT = "timeout"
195
+
196
+
197
@dataclass
class ErrorContext:
    """Minimal description of a validation failure."""

    # Name of the error category, e.g. an exception class name.
    error_type: str
    # Human-readable description of what went wrong.
    message: str

    def to_dict(self) -> dict[str, Any]:
        """Return both fields as a plain dictionary."""
        return dict(error_type=self.error_type, message=self.message)
205
+
206
+
207
@dataclass
class ValidatorExecutionResult:
    """Outcome of running one validator, including error details."""

    validator_name: str
    status: ValidationResult
    issues: list["ValidationIssue"]
    error_message: str | None = None
    error_context: ErrorContext | None = None
    execution_time_ms: float = 0.0

    def to_dict(self) -> dict[str, Any]:
        """Summarise the result as a dictionary.

        The issues are reported as a count only, and the error context
        is expanded via its own ``to_dict`` (``None`` when absent).
        """
        error_payload = None
        if self.error_context:
            error_payload = self.error_context.to_dict()

        summary: dict[str, Any] = {}
        summary["validator"] = self.validator_name
        summary["status"] = self.status.value
        summary["issue_count"] = len(self.issues)
        summary["execution_time_ms"] = self.execution_time_ms
        summary["error"] = error_payload
        return summary
225
+
226
+
227
def _validate_safe(
    validator: "Validator",
    lf: pl.LazyFrame,
    skip_on_error: bool = True,
    log_errors: bool = True,
) -> ValidatorExecutionResult:
    """Execute validation with error handling.

    Args:
        validator: Validator whose ``validate`` method is invoked.
        lf: LazyFrame passed to the validator.
        skip_on_error: If True, unexpected exceptions are converted into
            a FAILED result instead of propagating.
        log_errors: If True, failures are reported on the validator's
            logger.

    Returns:
        ValidatorExecutionResult with status and any issues found.
        Missing columns yield SKIPPED and timeouts yield TIMEOUT.

    Raises:
        Exception: Any unexpected exception from the validator when
            ``skip_on_error`` is False.
    """
    # perf_counter is monotonic, so the measured duration cannot be
    # distorted by wall-clock adjustments.
    start_time = time.perf_counter()
    logger = _get_logger(validator.name)

    def _failure(
        status: ValidationResult, error_type: str, error: Exception
    ) -> ValidatorExecutionResult:
        # Build the issue-less result shared by all error paths.
        return ValidatorExecutionResult(
            validator_name=validator.name,
            status=status,
            issues=[],
            error_message=str(error),
            error_context=ErrorContext(error_type, str(error)),
            execution_time_ms=(time.perf_counter() - start_time) * 1000,
        )

    try:
        issues = validator.validate(lf)
        return ValidatorExecutionResult(
            validator_name=validator.name,
            status=ValidationResult.SUCCESS,
            issues=issues,
            execution_time_ms=(time.perf_counter() - start_time) * 1000,
        )

    except ColumnNotFoundError as e:
        if log_errors:
            logger.warning("Column not found: %s", e.column)
        return _failure(ValidationResult.SKIPPED, "ColumnNotFoundError", e)

    except ValidationTimeoutError as e:
        if log_errors:
            logger.warning("Validation timed out: %ss", e.timeout_seconds)
        return _failure(ValidationResult.TIMEOUT, "ValidationTimeoutError", e)

    except Exception as e:
        if log_errors:
            logger.exception("Error in %s: %s", validator.name, e)
        if skip_on_error:
            return _failure(ValidationResult.FAILED, type(e).__name__, e)
        raise
287
+
288
+
289
class GracefulValidator:
    """Backward-compatibility wrapper around a validator.

    Prefer ``validator.validate_safe()``; this class only stores the
    wrapped validator plus the two error-handling flags and forwards
    them to ``_validate_safe``.
    """

    def __init__(
        self,
        validator: "Validator",
        skip_on_error: bool = True,
        log_errors: bool = True,
    ):
        self.validator = validator
        self.skip_on_error = skip_on_error
        self.log_errors = log_errors

    def validate(self, lf: pl.LazyFrame) -> ValidatorExecutionResult:
        """Run the wrapped validator on ``lf`` with error handling."""
        outcome = _validate_safe(
            self.validator,
            lf,
            skip_on_error=self.skip_on_error,
            log_errors=self.log_errors,
        )
        return outcome
306
+
307
+
308
+ # ============================================================================
309
+ # Schema Resilience (#9)
310
+ # ============================================================================
311
+
312
class SchemaValidator:
    """Validates schema compatibility before running validators.

    Prevents runtime errors from missing columns by pre-checking schema.
    """

    @staticmethod
    def check_columns_exist(
        lf: pl.LazyFrame,
        required_columns: list[str],
        raise_on_missing: bool = True,
    ) -> tuple[bool, list[str]]:
        """Check if required columns exist in the LazyFrame.

        Args:
            lf: LazyFrame to check
            required_columns: List of required column names
            raise_on_missing: If True, raise ColumnNotFoundError

        Returns:
            Tuple of (all_exist, missing_columns)

        Raises:
            ColumnNotFoundError: For the first missing column, when
                ``raise_on_missing`` is True.
        """
        names = list(lf.collect_schema().names())
        available = set(names)
        missing = [c for c in required_columns if c not in available]

        if missing and raise_on_missing:
            # Pass the names in schema order so the error message is
            # deterministic rather than following set iteration order.
            raise ColumnNotFoundError(missing[0], names)

        return not missing, missing

    @staticmethod
    def get_safe_columns(
        lf: pl.LazyFrame,
        requested_columns: list[str] | None,
        dtype_filter: set[type] | None = None,
    ) -> list[str]:
        """Get columns that exist and match type filter.

        Args:
            lf: LazyFrame to check
            requested_columns: Requested columns (None or empty = all)
            dtype_filter: Optional set of allowed types; a column is kept
                when the exact type of its schema entry is in the set

        Returns:
            List of valid column names, in requested order when columns
            were requested and in schema order otherwise
        """
        schema = lf.collect_schema()
        names = list(schema.names())

        if requested_columns:
            # Set lookup avoids rescanning the name list per column.
            available = set(names)
            columns = [c for c in requested_columns if c in available]
        else:
            columns = names

        if dtype_filter:
            columns = [c for c in columns if type(schema[c]) in dtype_filter]

        return columns
371
+
372
+
373
+ # ============================================================================
374
+ # Configuration
375
+ # ============================================================================
376
+
377
@dataclass(frozen=True)
class ValidatorConfig:
    """Immutable configuration for validators.

    Frozen dataclass. Instances are hashable, and so usable as dict
    keys, as long as the column fields hold tuples; ``replace`` and
    ``from_kwargs`` convert lists to tuples for that reason.
    """

    columns: tuple[str, ...] | None = None
    exclude_columns: tuple[str, ...] | None = None
    severity_override: Severity | None = None
    sample_size: int = 5
    mostly: float | None = None  # Fraction of rows that must pass (0.0 to 1.0)
    timeout_seconds: float | None = 300.0
    graceful_degradation: bool = True
    log_errors: bool = True

    # Fields whose list values are coerced to tuples. Deliberately left
    # unannotated so the dataclass does not treat it as a field.
    _TUPLE_FIELDS = ("columns", "exclude_columns")

    def __post_init__(self) -> None:
        """Validate configuration parameters.

        Raises:
            ValueError: If ``sample_size`` is negative, ``mostly`` is
                outside [0.0, 1.0], or ``timeout_seconds`` is not
                positive.
        """
        if self.sample_size < 0:
            raise ValueError(f"sample_size must be >= 0, got {self.sample_size}")
        if self.mostly is not None and not (0.0 <= self.mostly <= 1.0):
            raise ValueError(f"mostly must be in [0.0, 1.0], got {self.mostly}")
        if self.timeout_seconds is not None and self.timeout_seconds <= 0:
            raise ValueError(f"timeout_seconds must be > 0, got {self.timeout_seconds}")

    def replace(self, **kwargs: Any) -> "ValidatorConfig":
        """Create a new config with updated values.

        List values for the column fields are converted to tuples,
        whether they come from ``kwargs`` or from the current instance.
        The result has the same class as ``self``.
        """
        from dataclasses import replace as dc_replace

        changes = dict(kwargs)
        for field_name in self._TUPLE_FIELDS:
            value = changes.get(field_name, getattr(self, field_name))
            if isinstance(value, list):
                changes[field_name] = tuple(value)
        return dc_replace(self, **changes)

    @classmethod
    def from_kwargs(cls, **kwargs: Any) -> "ValidatorConfig":
        """Create config from kwargs, converting lists to tuples.

        Keys that are not fields of the dataclass are ignored.
        """
        from dataclasses import fields

        # Derive the accepted names from the dataclass itself so the
        # filter cannot drift from the field definitions.
        valid_fields = {f.name for f in fields(cls)}
        filtered = {k: v for k, v in kwargs.items() if k in valid_fields}
        for field_name in cls._TUPLE_FIELDS:
            if isinstance(filtered.get(field_name), list):
                filtered[field_name] = tuple(filtered[field_name])
        return cls(**filtered)
427
+
428
+
429
+ # ============================================================================
430
+ # Timeout Handler
431
+ # ============================================================================
432
+
433
class TimeoutHandler:
    """Context manager that bounds a validation with a SIGALRM-based timer.

    The timer is only armed when the block is entered on the main thread of a
    platform that provides ``signal.SIGALRM`` and ``signal.setitimer``. In any
    other situation (worker threads, platforms without those signals, or a
    ``timeout_seconds`` of ``None``) the handler is a no-op and the wrapped
    code runs without a time limit.

    Handlers may be nested: on exit, the timer that was running when this
    handler was entered is re-armed with the time it has left, so an inner
    handler never silently cancels an enclosing timeout.

    Args:
        timeout_seconds: Time limit in seconds, or ``None`` to disable.
        validator_name: Name reported in the raised ``ValidationTimeoutError``.
    """

    # Delay used to re-arm an enclosing timer whose deadline has already
    # passed; a delay of 0 would disable the timer instead of firing it.
    _MIN_REARM_DELAY = 1e-6

    def __init__(self, timeout_seconds: float | None, validator_name: str = ""):
        self.timeout_seconds = timeout_seconds
        self.validator_name = validator_name
        self._old_handler = None
        # (delay, interval) of the timer displaced by __enter__, if any.
        self._old_timer = None

    def _timeout_handler(self, signum: int, frame: Any) -> None:
        """Signal callback: turn the alarm into a ``ValidationTimeoutError``."""
        raise ValidationTimeoutError(self.timeout_seconds or 0, self.validator_name)

    def __enter__(self) -> "TimeoutHandler":
        if self.timeout_seconds is None:
            return self

        try:
            if threading.current_thread() is threading.main_thread():
                self._old_handler = signal.signal(signal.SIGALRM, self._timeout_handler)
                # setitimer returns the timer it replaced; keep it so that
                # __exit__ can hand the remaining time back to its owner.
                self._old_timer = signal.setitimer(
                    signal.ITIMER_REAL, self.timeout_seconds
                )
        except (AttributeError, ValueError):
            # No SIGALRM/setitimer on this platform, or signals cannot be
            # installed here: run without a time limit (best effort).
            pass

        return self

    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> bool:
        if self.timeout_seconds is None:
            return False

        try:
            if threading.current_thread() is threading.main_thread():
                # Disarm our timer; the return value is the time we did not use.
                unused_delay, _ = signal.setitimer(signal.ITIMER_REAL, 0)
                if self._old_handler is not None:
                    signal.signal(signal.SIGALRM, self._old_handler)
                # Re-arm only after the previous handler is back in place, so
                # the alarm can never be delivered to this (finished) handler.
                self._rearm_previous_timer(unused_delay)
        except (AttributeError, ValueError):
            pass
        finally:
            self._old_handler = None
            self._old_timer = None

        # Never suppress exceptions raised inside the block.
        return False

    def _rearm_previous_timer(self, unused_delay: float) -> None:
        """Restore the timer that was running before ``__enter__``, if any."""
        if self._old_timer is None:
            return
        previous_delay, previous_interval = self._old_timer
        if previous_delay <= 0:
            # Nothing was armed before us.
            return
        elapsed = max(self.timeout_seconds - unused_delay, 0.0)
        remaining = max(previous_delay - elapsed, self._MIN_REARM_DELAY)
        signal.setitimer(signal.ITIMER_REAL, remaining, previous_interval)
470
+
471
+
472
def with_timeout(func: Callable) -> Callable:
    """Wrap a validator method so that it runs inside a ``TimeoutHandler``.

    The limit is read from ``self.config.timeout_seconds`` on every call, and
    the validator's ``name`` attribute (falling back to its class name) is
    passed to the handler as the validator name.
    """

    @wraps(func)
    def wrapper(self: "Validator", *args: Any, **kwargs: Any) -> Any:
        limit = self.config.timeout_seconds
        label = getattr(self, "name", self.__class__.__name__)
        guard = TimeoutHandler(limit, label)
        with guard:
            return func(self, *args, **kwargs)

    return wrapper
483
+
484
+
485
+ # ============================================================================
486
+ # ValidationIssue
487
+ # ============================================================================
488
+
489
@dataclass
class ValidationIssue:
    """One data quality problem reported by a validator.

    ``column``, ``issue_type``, ``count`` and ``severity`` are required; the
    remaining fields carry optional context and default to ``None``.
    """

    column: str
    issue_type: str
    count: int
    severity: Severity
    details: str | None = None
    expected: Any | None = None
    actual: Any | None = None
    sample_values: list[Any] | None = None

    def to_dict(self) -> dict:
        """Return a plain-dict view of the issue for JSON serialization.

        ``severity`` is emitted as its ``.value``. ``details`` is always
        present, while ``expected``, ``actual`` and ``sample_values`` are
        only included when they are not ``None``.
        """
        payload = {
            "column": self.column,
            "issue_type": self.issue_type,
            "count": self.count,
            "severity": self.severity.value,
            "details": self.details,
        }
        optional_parts = (
            ("expected", self.expected),
            ("actual", self.actual),
            ("sample_values", self.sample_values),
        )
        for key, value in optional_parts:
            if value is not None:
                payload[key] = value
        return payload
518
+
519
+
520
+ # ============================================================================
521
+ # Type Filters
522
+ # ============================================================================
523
+
524
# Floating-point dtypes.
FLOAT_TYPES: set[type[pl.DataType]] = {pl.Float32, pl.Float64}

# Every numeric dtype: signed and unsigned integers of each width, plus the
# floating-point dtypes above.
NUMERIC_TYPES: set[type[pl.DataType]] = {
    pl.Int8,
    pl.Int16,
    pl.Int32,
    pl.Int64,
    pl.UInt8,
    pl.UInt16,
    pl.UInt32,
    pl.UInt64,
} | FLOAT_TYPES

# Text dtypes (both spellings exposed by polars).
STRING_TYPES: set[type[pl.DataType]] = {pl.String, pl.Utf8}

# Temporal dtypes, including durations.
DATETIME_TYPES: set[type[pl.DataType]] = {
    pl.Date,
    pl.Datetime,
    pl.Time,
    pl.Duration,
}
535
+
536
+
537
+ # ============================================================================
538
+ # Base Validator
539
+ # ============================================================================
540
+
541
class Validator(ABC):
    """Common base class that every validator derives from.

    What the base class provides:
        - A ``ValidatorConfig`` held on ``self.config`` (immutable, so it can
          be shared safely)
        - Timeout-protected execution
        - Schema-aware resolution of the columns to check
        - Optional graceful degradation when a check raises
        - Metadata for dependency-aware execution ordering

    Accepted input:
        ``validate`` takes a Polars ``pl.LazyFrame`` and nothing else. Other
        inputs should go through the public API (``th.check()``), which
        converts them first:

        - ``th.check("data.csv")``       -> file is loaded as a LazyFrame
        - ``th.check(pl.DataFrame())``   -> DataFrame is made lazy
        - ``th.check(pd.DataFrame())``   -> pandas DataFrame is converted
        - ``th.check({"col": [1, 2]})``  -> dict is converted

        When calling a validator directly, convert the data yourself::

            import polars as pl
            from truthound.adapters import to_lazyframe

            # Either go through the adapter ...
            lf = to_lazyframe(your_data)
            issues = NullValidator().validate(lf)

            # ... or build the LazyFrame by hand.
            lf = pl.DataFrame(your_data).lazy()
            issues = NullValidator().validate(lf)

    Class Attributes:
        name: Unique identifier of the validator
        category: Validator category (schema, completeness, uniqueness, etc.)
        dependencies: Names of validators that must run before this one
        provides: Capabilities this validator makes available to others
        priority: Ordering within a phase (lower runs earlier)

    Example:
        class MyValidator(Validator):
            name = "my_validator"
            category = "custom"
            dependencies = {"null", "schema"}  # run after null and schema
            provides = {"my_check"}            # others may depend on this

            def validate(self, lf):
                ...
    """

    name: str = "base"
    category: str = "general"

    # Metadata consumed by DAG-based execution ordering.
    dependencies: set[str] = set()  # validators that have to run first
    provides: set[str] = set()  # capabilities offered to other validators
    priority: int = 100  # lower values run earlier within the same phase

    def __init__(self, config: ValidatorConfig | None = None, **kwargs: Any):
        """Set up configuration and logger.

        Args:
            config: Existing configuration to use. When keyword options are
                also given, they are applied on top via ``config.replace``.
            **kwargs: Configuration options. Without ``config`` they are
                passed to ``ValidatorConfig.from_kwargs``.
        """
        if config is None:
            resolved = ValidatorConfig.from_kwargs(**kwargs)
        elif kwargs:
            resolved = config.replace(**kwargs)
        else:
            resolved = config
        self.config = resolved
        self.logger = _get_logger(self.name)

    @abstractmethod
    def validate(self, lf: pl.LazyFrame) -> list[ValidationIssue]:
        """Validate ``lf`` and return the issues found. Subclasses implement this."""

    def validate_safe(self, lf: pl.LazyFrame) -> ValidatorExecutionResult:
        """Run the validator through ``_validate_safe``.

        ``config.graceful_degradation`` is forwarded as ``skip_on_error`` and
        ``config.log_errors`` as ``log_errors``.
        """
        settings = self.config
        return _validate_safe(
            self,
            lf,
            skip_on_error=settings.graceful_degradation,
            log_errors=settings.log_errors,
        )

    def validate_with_timeout(self, lf: pl.LazyFrame) -> list[ValidationIssue]:
        """Run ``validate`` inside a ``TimeoutHandler`` using ``config.timeout_seconds``."""
        limit = self.config.timeout_seconds
        label = getattr(self, "name", self.__class__.__name__)
        guard = TimeoutHandler(limit, label)
        with guard:
            return self.validate(lf)

    def _get_target_columns(
        self,
        lf: pl.LazyFrame,
        dtype_filter: set[type[pl.DataType]] | None = None,
    ) -> list[str]:
        """Resolve the columns this validator should look at.

        ``config.columns`` (if set) and ``dtype_filter`` are handed to
        ``SchemaValidator.get_safe_columns``; anything listed in
        ``config.exclude_columns`` is then removed from the result.
        """
        settings = self.config
        wanted = list(settings.columns) if settings.columns else None
        resolved = SchemaValidator.get_safe_columns(lf, wanted, dtype_filter)

        skipped = list(settings.exclude_columns) if settings.exclude_columns else []
        if not skipped:
            return resolved
        return [name for name in resolved if name not in skipped]

    def _calculate_severity(
        self,
        ratio: float,
        thresholds: tuple[float, float, float] = (0.5, 0.2, 0.05),
    ) -> Severity:
        """Map a ratio onto a severity level.

        ``config.severity_override`` wins when set. Otherwise ``thresholds``
        is read as (critical, high, medium): the first bound that ``ratio``
        strictly exceeds decides the level, and ``Severity.LOW`` is returned
        when it exceeds none of them.
        """
        override = self.config.severity_override
        if override:
            return override

        critical_th, high_th, medium_th = thresholds
        ladder = (
            (critical_th, Severity.CRITICAL),
            (high_th, Severity.HIGH),
            (medium_th, Severity.MEDIUM),
        )
        for bound, level in ladder:
            if ratio > bound:
                return level
        return Severity.LOW

    def _passes_mostly(self, failure_count: int, total_count: int) -> bool:
        """Tell whether the pass ratio satisfies ``config.mostly``.

        Returns ``False`` when no ``mostly`` threshold is configured and
        ``True`` when there are no rows at all.
        """
        required = self.config.mostly
        if required is None:
            return False
        if total_count == 0:
            return True
        return 1 - (failure_count / total_count) >= required

    def _get_mostly_adjusted_severity(
        self,
        failure_count: int,
        total_count: int,
        base_severity: Severity,
    ) -> Severity | None:
        """Return ``None`` when the ``mostly`` threshold is met, else ``base_severity``."""
        return None if self._passes_mostly(failure_count, total_count) else base_severity

    def _safe_sample(
        self,
        lf: pl.LazyFrame,
        filter_expr: pl.Expr,
        columns: list[str] | None = None,
    ) -> list[Any]:
        """Collect up to ``config.sample_size`` matching rows as dicts.

        Sampling is best effort: any exception is logged as a warning and an
        empty list is returned instead.
        """
        try:
            sampled = SafeSampler.safe_filter_sample(
                lf, filter_expr, self.config.sample_size, columns
            )
            if len(sampled) > 0:
                return sampled.to_dicts()
        except Exception as exc:
            self.logger.warning(f"Failed to collect samples: {exc}")
        return []
708
+
709
+
710
+ # ============================================================================
711
+ # Mixins
712
+ # ============================================================================
713
+
714
class NumericValidatorMixin:
    """Adds a numeric-column selector to a validator.

    The host class is expected to supply ``_get_target_columns``.
    """

    def _get_numeric_columns(self, lf: pl.LazyFrame) -> list[str]:
        """Call ``_get_target_columns`` with ``NUMERIC_TYPES`` as the dtype filter."""
        numeric_only = NUMERIC_TYPES
        return self._get_target_columns(lf, dtype_filter=numeric_only)  # type: ignore
719
+
720
+
721
class StringValidatorMixin:
    """Adds a string-column selector to a validator.

    The host class is expected to supply ``_get_target_columns``.
    """

    def _get_string_columns(self, lf: pl.LazyFrame) -> list[str]:
        """Call ``_get_target_columns`` with ``STRING_TYPES`` as the dtype filter."""
        text_only = STRING_TYPES
        return self._get_target_columns(lf, dtype_filter=text_only)  # type: ignore
726
+
727
+
728
class DatetimeValidatorMixin:
    """Adds a datetime-column selector to a validator.

    The host class is expected to supply ``_get_target_columns``.
    """

    def _get_datetime_columns(self, lf: pl.LazyFrame) -> list[str]:
        """Call ``_get_target_columns`` with ``DATETIME_TYPES`` as the dtype filter."""
        temporal_only = DATETIME_TYPES
        return self._get_target_columns(lf, dtype_filter=temporal_only)  # type: ignore
733
+
734
+
735
class FloatValidatorMixin:
    """Adds a float-column selector to a validator.

    The host class is expected to supply ``_get_target_columns``.
    """

    def _get_float_columns(self, lf: pl.LazyFrame) -> list[str]:
        """Call ``_get_target_columns`` with ``FLOAT_TYPES`` as the dtype filter."""
        float_only = FLOAT_TYPES
        return self._get_target_columns(lf, dtype_filter=float_only)  # type: ignore
740
+
741
+
742
class RegexValidatorMixin:
    """Regex helpers for validators; patterns are screened for ReDoS risk before compiling."""

    @staticmethod
    def validate_pattern(pattern: str, flags: int = 0) -> re.Pattern[str]:
        """Screen ``pattern`` with ``RegexSafetyChecker`` and compile it.

        Raises:
            RegexValidationError: If ``pattern`` is ``None``, is reported as
                unsafe by the checker, or fails to compile.
        """
        if pattern is None:
            raise RegexValidationError("None", "Pattern cannot be None")

        safe, reason = RegexSafetyChecker.check_pattern(pattern)
        if safe:
            try:
                return re.compile(pattern, flags)
            except re.error as exc:
                raise RegexValidationError(pattern, str(exc)) from exc
        raise RegexValidationError(pattern, f"ReDoS risk: {reason}")

    @staticmethod
    def validate_patterns(patterns: list[str], flags: int = 0) -> list[re.Pattern[str]]:
        """Run ``validate_pattern`` over each entry and return the compiled patterns in order."""
        compiled = []
        for candidate in patterns:
            compiled.append(RegexValidatorMixin.validate_pattern(candidate, flags))
        return compiled
764
+
765
+
766
class StreamingValidatorMixin:
    """Chunked processing support for validators."""

    default_chunk_size: int = 100_000

    def _validate_streaming(
        self,
        lf: pl.LazyFrame,
        chunk_size: int | None = None,
        validate_chunk: Callable[[pl.LazyFrame], list["ValidationIssue"]] | None = None,
    ) -> list["ValidationIssue"]:
        """Validate ``lf`` slice by slice and merge the per-slice issues.

        Args:
            lf: Frame to validate.
            chunk_size: Rows per slice; a falsy value selects
                ``default_chunk_size``.
            validate_chunk: Callable applied to each slice; ``self.validate``
                is used when it is omitted.

        Returns:
            An empty list for an empty frame, the direct result of the
            callable when the frame fits in a single chunk, and otherwise one
            issue per ``(column, issue_type)`` pair. For merged issues the
            first occurrence is kept and its ``count`` is increased in place
            by the counts of later occurrences.
        """
        rows_per_chunk = chunk_size or self.default_chunk_size
        run_check = validate_chunk or self.validate  # type: ignore

        row_total = lf.select(pl.len()).collect().item()
        if row_total == 0:
            return []
        if row_total <= rows_per_chunk:
            return run_check(lf)

        merged: dict[tuple[str, str], "ValidationIssue"] = {}
        for start in range(0, row_total, rows_per_chunk):
            for found in run_check(lf.slice(start, rows_per_chunk)):
                identity = (found.column, found.issue_type)
                earlier = merged.get(identity)
                if earlier is None:
                    merged[identity] = found
                else:
                    earlier.count += found.count
        return list(merged.values())
797
+
798
+
799
class EnterpriseScaleSamplingMixin:
    """Mixin for validators that support enterprise-scale sampling.

    Large inputs are reduced to a hash-selected subset of rows before
    validation, and the issue counts found on that subset can then be scaled
    back up to the full dataset.

    Usage:
        class MyValidator(Validator, EnterpriseScaleSamplingMixin):
            # Enable sampling for datasets > 10M rows
            sampling_threshold: int = 10_000_000
            sampling_target_rows: int = 100_000
            sampling_quality: str = "standard"

            def validate(self, lf: pl.LazyFrame) -> list[ValidationIssue]:
                # Sample only if the dataset is large
                sampled_lf, metrics = self._sample_for_validation(lf)

                # Validate on the (possibly sampled) data
                issues = self._do_validation(sampled_lf)

                # Scale counts back up if sampling took place
                if metrics.is_sampled:
                    issues = self._extrapolate_issues(issues, metrics)

                return issues
    """

    # Sampling configuration (override in subclass)
    sampling_threshold: int = 10_000_000  # sample only above this row count
    sampling_target_rows: int = 100_000  # desired sample size
    sampling_quality: str = "standard"  # quality level label
    sampling_confidence: float = 0.95  # confidence level reported with results
    sampling_margin_of_error: float = 0.05  # margin of error reported with results

    def _sample_for_validation(
        self,
        lf: pl.LazyFrame,
        target_rows: int | None = None,
    ) -> tuple[pl.LazyFrame, "SamplingInfo"]:
        """Sample data if it exceeds ``sampling_threshold``.

        Rows are selected by hashing a row index (seeded with
        ``self._sampling_seed``, default 42) into 10,000 buckets and keeping
        the lowest ones. The selection ratio is therefore quantised to steps
        of 1/10,000 and never drops below one bucket.

        Args:
            lf: Input LazyFrame
            target_rows: Override for ``sampling_target_rows``

        Returns:
            Tuple of (LazyFrame to validate, sampling info). When sampling is
            applied, ``sampling_ratio`` is the ratio the filter really uses
            and ``sampled_rows`` is the row count expected at that ratio, so
            that extrapolation matches the rows that were kept.
        """
        total_rows = lf.select(pl.len()).collect().item()

        if total_rows <= self.sampling_threshold:
            return lf, SamplingInfo(
                is_sampled=False,
                original_rows=total_rows,
                sampled_rows=total_rows,
                sampling_ratio=1.0,
                confidence_level=1.0,
                margin_of_error=0.0,
            )

        target = target_rows or self.sampling_target_rows
        target = min(target, total_rows)
        requested_ratio = target / total_rows

        seed = getattr(self, "_sampling_seed", 42)
        threshold = max(1, int(requested_ratio * 10000))
        # The filter keeps `threshold` of 10000 hash buckets. Report that
        # ratio, not the requested one: the two differ through truncation and
        # through the one-bucket floor, and extrapolating with the requested
        # ratio would bias the estimated counts.
        effective_ratio = threshold / 10000

        sampled_lf = (
            lf.with_row_index("__sample_idx")
            .filter(pl.col("__sample_idx").hash(seed) % 10000 < threshold)
            .drop("__sample_idx")
        )

        return sampled_lf, SamplingInfo(
            is_sampled=True,
            original_rows=total_rows,
            sampled_rows=min(total_rows, round(total_rows * effective_ratio)),
            sampling_ratio=effective_ratio,
            confidence_level=self.sampling_confidence,
            margin_of_error=self.sampling_margin_of_error,
        )

    def _extrapolate_issues(
        self,
        issues: list["ValidationIssue"],
        sampling_info: "SamplingInfo",
    ) -> list["ValidationIssue"]:
        """Extrapolate issue counts from sample to population.

        The issues are updated in place: ``count`` is multiplied by
        ``1 / sampling_ratio`` (truncated to an int) and, when an issue has
        ``details``, a note with the sampled and estimated counts is appended.

        Args:
            issues: Issues found in sample
            sampling_info: Sampling information

        Returns:
            The same list, with extrapolated counts. It is returned untouched
            when no sampling was applied.
        """
        if not sampling_info.is_sampled:
            return issues

        ratio = sampling_info.sampling_ratio
        # A non-positive ratio cannot be inverted; fall back to a factor of
        # 1.0 instead of dividing by zero.
        extrapolation_factor = 1.0 / ratio if ratio > 0 else 1.0

        for issue in issues:
            original_count = issue.count
            extrapolated_count = int(original_count * extrapolation_factor)
            issue.count = extrapolated_count

            if issue.details:
                issue.details = (
                    f"{issue.details} "
                    f"[sampled: {original_count} → estimated: {extrapolated_count}, "
                    f"confidence: {sampling_info.confidence_level:.0%}]"
                )

        return issues

    def _get_sampling_strategy(self, total_rows: int) -> str:
        """Return the name of the recommended sampling strategy for ``total_rows``."""
        if total_rows < 1_000_000:
            return "none"
        elif total_rows < 10_000_000:
            return "systematic"
        elif total_rows < 100_000_000:
            return "block"
        else:
            return "multi_stage"
938
+
939
+
940
@dataclass
class SamplingInfo:
    """Record of the sampling that was applied before a validation run.

    Attributes:
        is_sampled: True when the data was actually sampled
        original_rows: Row count before sampling
        sampled_rows: Row count after sampling
        sampling_ratio: Sample size divided by original size
        confidence_level: Statistical confidence attached to the results
        margin_of_error: Error margin attached to the results
    """

    is_sampled: bool
    original_rows: int
    sampled_rows: int
    sampling_ratio: float
    confidence_level: float
    margin_of_error: float

    @property
    def extrapolation_factor(self) -> float:
        """Multiplier that turns a sample count into a population estimate.

        This is ``1 / sampling_ratio``, or ``1.0`` when the ratio is zero or
        negative.
        """
        ratio = self.sampling_ratio
        return 1.0 if ratio <= 0 else 1.0 / ratio

    def to_dict(self) -> dict[str, Any]:
        """Return all six fields as a plain dict, in declaration order."""
        exported = (
            "is_sampled",
            "original_rows",
            "sampled_rows",
            "sampling_ratio",
            "confidence_level",
            "margin_of_error",
        )
        return {field_name: getattr(self, field_name) for field_name in exported}
975
+
976
+
977
+ # ============================================================================
978
+ # Template Validators
979
+ # ============================================================================
980
+
981
class ColumnValidator(Validator):
    """Template validator that checks the target columns one at a time."""

    @abstractmethod
    def check_column(
        self,
        lf: pl.LazyFrame,
        col: str,
        total_rows: int,
    ) -> ValidationIssue | None:
        """Inspect one column; subclasses must override.

        Args:
            lf: Frame being validated.
            col: Name of the column to inspect.
            total_rows: Row count of ``lf``, computed once by ``validate``.

        Returns:
            An issue for the column, or ``None`` when there is nothing to report.
        """

    def validate(self, lf: pl.LazyFrame) -> list[ValidationIssue]:
        """Call ``check_column`` for every target column and gather the issues.

        An empty frame yields no issues. If a check raises and
        ``config.graceful_degradation`` is enabled, the error is logged as a
        warning and the column is skipped; otherwise it propagates.
        """
        targets = self._get_target_columns(lf)
        row_count = lf.select(pl.len()).collect().item()

        found: list[ValidationIssue] = []
        if row_count == 0:
            return found

        for column_name in targets:
            try:
                outcome = self.check_column(lf, column_name, row_count)
                if outcome:
                    found.append(outcome)
            except Exception as exc:
                if not self.config.graceful_degradation:
                    raise
                self.logger.warning(f"Error checking column {column_name}: {exc}")

        return found
1015
+
1016
+
1017
+ class AggregateValidator(Validator, NumericValidatorMixin):
1018
+ """Template for aggregate statistics validation."""
1019
+
1020
@abstractmethod
def check_aggregate(
    self,
    col: str,
    stats: dict[str, Any],
    total_rows: int,
) -> ValidationIssue | None:
    """Judge one column from its precomputed aggregates; subclasses must override.

    Args:
        col: Name of the column being checked.
        stats: Aggregates for ``col`` as produced by ``_compute_stats``
            (keys ``mean``, ``std``, ``min``, ``max``, ``sum``, ``median``
            and ``count``).
        total_rows: Total number of rows in the validated frame.

    Returns:
        An issue for the column, or ``None`` when there is nothing to report.
    """
1029
+
1030
def _compute_stats(
    self,
    lf: pl.LazyFrame,
    columns: list[str],
) -> tuple[int, dict[str, dict[str, Any]]]:
    """Collect the row count and per-column aggregates with a single query.

    Returns:
        ``(total_rows, stats)`` where ``stats[col]`` maps ``mean``, ``std``,
        ``min``, ``max``, ``sum``, ``median`` and ``count`` to the value
        computed for that column.
    """
    stat_names = ("mean", "std", "min", "max", "sum", "median", "count")

    # One aliased expression per (statistic, column) pair, plus the row count.
    selection: list[pl.Expr] = [pl.len().alias("_total")]
    for col in columns:
        selection.extend(
            getattr(pl.col(col), stat)().alias(f"_{stat}_{col}")
            for stat in stat_names
        )

    row = lf.select(selection).collect()
    total = row["_total"][0]

    stats: dict[str, dict[str, Any]] = {
        col: {stat: row[f"_{stat}_{col}"][0] for stat in stat_names}
        for col in columns
    }
    return total, stats
1065
+
1066
def validate(self, lf: pl.LazyFrame) -> list[ValidationIssue]:
    """Run ``check_aggregate`` on every numeric target column.

    Aggregates for all columns are computed up front by ``_compute_stats``.
    Nothing is reported when there are no numeric columns or no rows. If a
    check raises and ``config.graceful_degradation`` is enabled, the error
    is logged as a warning and the column is skipped; otherwise it
    propagates.
    """
    found: list[ValidationIssue] = []

    numeric_columns = self._get_numeric_columns(lf)
    if not numeric_columns:
        return found

    row_count, stats_by_column = self._compute_stats(lf, numeric_columns)
    if row_count == 0:
        return found

    for column_name in numeric_columns:
        try:
            outcome = self.check_aggregate(
                column_name, stats_by_column[column_name], row_count
            )
            if outcome:
                found.append(outcome)
        except Exception as exc:
            if not self.config.graceful_degradation:
                raise
            self.logger.warning(f"Error checking aggregate for {column_name}: {exc}")

    return found