truthound 1.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (877)
  1. truthound/__init__.py +162 -0
  2. truthound/adapters.py +100 -0
  3. truthound/api.py +365 -0
  4. truthound/audit/__init__.py +248 -0
  5. truthound/audit/core.py +967 -0
  6. truthound/audit/filters.py +620 -0
  7. truthound/audit/formatters.py +707 -0
  8. truthound/audit/logger.py +902 -0
  9. truthound/audit/middleware.py +571 -0
  10. truthound/audit/storage.py +1083 -0
  11. truthound/benchmark/__init__.py +123 -0
  12. truthound/benchmark/base.py +757 -0
  13. truthound/benchmark/comparison.py +635 -0
  14. truthound/benchmark/generators.py +706 -0
  15. truthound/benchmark/reporters.py +718 -0
  16. truthound/benchmark/runner.py +635 -0
  17. truthound/benchmark/scenarios.py +712 -0
  18. truthound/cache.py +252 -0
  19. truthound/checkpoint/__init__.py +136 -0
  20. truthound/checkpoint/actions/__init__.py +164 -0
  21. truthound/checkpoint/actions/base.py +324 -0
  22. truthound/checkpoint/actions/custom.py +234 -0
  23. truthound/checkpoint/actions/discord_notify.py +290 -0
  24. truthound/checkpoint/actions/email_notify.py +405 -0
  25. truthound/checkpoint/actions/github_action.py +406 -0
  26. truthound/checkpoint/actions/opsgenie.py +1499 -0
  27. truthound/checkpoint/actions/pagerduty.py +226 -0
  28. truthound/checkpoint/actions/slack_notify.py +233 -0
  29. truthound/checkpoint/actions/store_result.py +249 -0
  30. truthound/checkpoint/actions/teams_notify.py +1570 -0
  31. truthound/checkpoint/actions/telegram_notify.py +419 -0
  32. truthound/checkpoint/actions/update_docs.py +552 -0
  33. truthound/checkpoint/actions/webhook.py +293 -0
  34. truthound/checkpoint/analytics/__init__.py +147 -0
  35. truthound/checkpoint/analytics/aggregations/__init__.py +23 -0
  36. truthound/checkpoint/analytics/aggregations/rollup.py +481 -0
  37. truthound/checkpoint/analytics/aggregations/time_bucket.py +306 -0
  38. truthound/checkpoint/analytics/analyzers/__init__.py +17 -0
  39. truthound/checkpoint/analytics/analyzers/anomaly.py +386 -0
  40. truthound/checkpoint/analytics/analyzers/base.py +270 -0
  41. truthound/checkpoint/analytics/analyzers/forecast.py +421 -0
  42. truthound/checkpoint/analytics/analyzers/trend.py +314 -0
  43. truthound/checkpoint/analytics/models.py +292 -0
  44. truthound/checkpoint/analytics/protocols.py +549 -0
  45. truthound/checkpoint/analytics/service.py +718 -0
  46. truthound/checkpoint/analytics/stores/__init__.py +16 -0
  47. truthound/checkpoint/analytics/stores/base.py +306 -0
  48. truthound/checkpoint/analytics/stores/memory_store.py +353 -0
  49. truthound/checkpoint/analytics/stores/sqlite_store.py +557 -0
  50. truthound/checkpoint/analytics/stores/timescale_store.py +501 -0
  51. truthound/checkpoint/async_actions.py +794 -0
  52. truthound/checkpoint/async_base.py +708 -0
  53. truthound/checkpoint/async_checkpoint.py +617 -0
  54. truthound/checkpoint/async_runner.py +639 -0
  55. truthound/checkpoint/checkpoint.py +527 -0
  56. truthound/checkpoint/ci/__init__.py +61 -0
  57. truthound/checkpoint/ci/detector.py +355 -0
  58. truthound/checkpoint/ci/reporter.py +436 -0
  59. truthound/checkpoint/ci/templates.py +454 -0
  60. truthound/checkpoint/circuitbreaker/__init__.py +133 -0
  61. truthound/checkpoint/circuitbreaker/breaker.py +542 -0
  62. truthound/checkpoint/circuitbreaker/core.py +252 -0
  63. truthound/checkpoint/circuitbreaker/detection.py +459 -0
  64. truthound/checkpoint/circuitbreaker/middleware.py +389 -0
  65. truthound/checkpoint/circuitbreaker/registry.py +357 -0
  66. truthound/checkpoint/distributed/__init__.py +139 -0
  67. truthound/checkpoint/distributed/backends/__init__.py +35 -0
  68. truthound/checkpoint/distributed/backends/celery_backend.py +503 -0
  69. truthound/checkpoint/distributed/backends/kubernetes_backend.py +696 -0
  70. truthound/checkpoint/distributed/backends/local_backend.py +397 -0
  71. truthound/checkpoint/distributed/backends/ray_backend.py +625 -0
  72. truthound/checkpoint/distributed/base.py +774 -0
  73. truthound/checkpoint/distributed/orchestrator.py +765 -0
  74. truthound/checkpoint/distributed/protocols.py +842 -0
  75. truthound/checkpoint/distributed/registry.py +449 -0
  76. truthound/checkpoint/idempotency/__init__.py +120 -0
  77. truthound/checkpoint/idempotency/core.py +295 -0
  78. truthound/checkpoint/idempotency/fingerprint.py +454 -0
  79. truthound/checkpoint/idempotency/locking.py +604 -0
  80. truthound/checkpoint/idempotency/service.py +592 -0
  81. truthound/checkpoint/idempotency/stores.py +653 -0
  82. truthound/checkpoint/monitoring/__init__.py +134 -0
  83. truthound/checkpoint/monitoring/aggregators/__init__.py +15 -0
  84. truthound/checkpoint/monitoring/aggregators/base.py +372 -0
  85. truthound/checkpoint/monitoring/aggregators/realtime.py +300 -0
  86. truthound/checkpoint/monitoring/aggregators/window.py +493 -0
  87. truthound/checkpoint/monitoring/collectors/__init__.py +17 -0
  88. truthound/checkpoint/monitoring/collectors/base.py +257 -0
  89. truthound/checkpoint/monitoring/collectors/memory_collector.py +617 -0
  90. truthound/checkpoint/monitoring/collectors/prometheus_collector.py +451 -0
  91. truthound/checkpoint/monitoring/collectors/redis_collector.py +518 -0
  92. truthound/checkpoint/monitoring/events.py +410 -0
  93. truthound/checkpoint/monitoring/protocols.py +636 -0
  94. truthound/checkpoint/monitoring/service.py +578 -0
  95. truthound/checkpoint/monitoring/views/__init__.py +17 -0
  96. truthound/checkpoint/monitoring/views/base.py +172 -0
  97. truthound/checkpoint/monitoring/views/queue_view.py +220 -0
  98. truthound/checkpoint/monitoring/views/task_view.py +240 -0
  99. truthound/checkpoint/monitoring/views/worker_view.py +263 -0
  100. truthound/checkpoint/registry.py +337 -0
  101. truthound/checkpoint/runner.py +356 -0
  102. truthound/checkpoint/transaction/__init__.py +133 -0
  103. truthound/checkpoint/transaction/base.py +389 -0
  104. truthound/checkpoint/transaction/compensatable.py +537 -0
  105. truthound/checkpoint/transaction/coordinator.py +576 -0
  106. truthound/checkpoint/transaction/executor.py +622 -0
  107. truthound/checkpoint/transaction/idempotency.py +534 -0
  108. truthound/checkpoint/transaction/saga/__init__.py +143 -0
  109. truthound/checkpoint/transaction/saga/builder.py +584 -0
  110. truthound/checkpoint/transaction/saga/definition.py +515 -0
  111. truthound/checkpoint/transaction/saga/event_store.py +542 -0
  112. truthound/checkpoint/transaction/saga/patterns.py +833 -0
  113. truthound/checkpoint/transaction/saga/runner.py +718 -0
  114. truthound/checkpoint/transaction/saga/state_machine.py +793 -0
  115. truthound/checkpoint/transaction/saga/strategies.py +780 -0
  116. truthound/checkpoint/transaction/saga/testing.py +886 -0
  117. truthound/checkpoint/triggers/__init__.py +58 -0
  118. truthound/checkpoint/triggers/base.py +237 -0
  119. truthound/checkpoint/triggers/event.py +385 -0
  120. truthound/checkpoint/triggers/schedule.py +355 -0
  121. truthound/cli.py +2358 -0
  122. truthound/cli_modules/__init__.py +124 -0
  123. truthound/cli_modules/advanced/__init__.py +45 -0
  124. truthound/cli_modules/advanced/benchmark.py +343 -0
  125. truthound/cli_modules/advanced/docs.py +225 -0
  126. truthound/cli_modules/advanced/lineage.py +209 -0
  127. truthound/cli_modules/advanced/ml.py +320 -0
  128. truthound/cli_modules/advanced/realtime.py +196 -0
  129. truthound/cli_modules/checkpoint/__init__.py +46 -0
  130. truthound/cli_modules/checkpoint/init.py +114 -0
  131. truthound/cli_modules/checkpoint/list.py +71 -0
  132. truthound/cli_modules/checkpoint/run.py +159 -0
  133. truthound/cli_modules/checkpoint/validate.py +67 -0
  134. truthound/cli_modules/common/__init__.py +71 -0
  135. truthound/cli_modules/common/errors.py +414 -0
  136. truthound/cli_modules/common/options.py +419 -0
  137. truthound/cli_modules/common/output.py +507 -0
  138. truthound/cli_modules/common/protocol.py +552 -0
  139. truthound/cli_modules/core/__init__.py +48 -0
  140. truthound/cli_modules/core/check.py +123 -0
  141. truthound/cli_modules/core/compare.py +104 -0
  142. truthound/cli_modules/core/learn.py +57 -0
  143. truthound/cli_modules/core/mask.py +77 -0
  144. truthound/cli_modules/core/profile.py +65 -0
  145. truthound/cli_modules/core/scan.py +61 -0
  146. truthound/cli_modules/profiler/__init__.py +51 -0
  147. truthound/cli_modules/profiler/auto_profile.py +175 -0
  148. truthound/cli_modules/profiler/metadata.py +107 -0
  149. truthound/cli_modules/profiler/suite.py +283 -0
  150. truthound/cli_modules/registry.py +431 -0
  151. truthound/cli_modules/scaffolding/__init__.py +89 -0
  152. truthound/cli_modules/scaffolding/base.py +631 -0
  153. truthound/cli_modules/scaffolding/commands.py +545 -0
  154. truthound/cli_modules/scaffolding/plugins.py +1072 -0
  155. truthound/cli_modules/scaffolding/reporters.py +594 -0
  156. truthound/cli_modules/scaffolding/validators.py +1127 -0
  157. truthound/common/__init__.py +18 -0
  158. truthound/common/resilience/__init__.py +130 -0
  159. truthound/common/resilience/bulkhead.py +266 -0
  160. truthound/common/resilience/circuit_breaker.py +516 -0
  161. truthound/common/resilience/composite.py +332 -0
  162. truthound/common/resilience/config.py +292 -0
  163. truthound/common/resilience/protocols.py +217 -0
  164. truthound/common/resilience/rate_limiter.py +404 -0
  165. truthound/common/resilience/retry.py +341 -0
  166. truthound/datadocs/__init__.py +260 -0
  167. truthound/datadocs/base.py +571 -0
  168. truthound/datadocs/builder.py +761 -0
  169. truthound/datadocs/charts.py +764 -0
  170. truthound/datadocs/dashboard/__init__.py +63 -0
  171. truthound/datadocs/dashboard/app.py +576 -0
  172. truthound/datadocs/dashboard/components.py +584 -0
  173. truthound/datadocs/dashboard/state.py +240 -0
  174. truthound/datadocs/engine/__init__.py +46 -0
  175. truthound/datadocs/engine/context.py +376 -0
  176. truthound/datadocs/engine/pipeline.py +618 -0
  177. truthound/datadocs/engine/registry.py +469 -0
  178. truthound/datadocs/exporters/__init__.py +49 -0
  179. truthound/datadocs/exporters/base.py +198 -0
  180. truthound/datadocs/exporters/html.py +178 -0
  181. truthound/datadocs/exporters/json_exporter.py +253 -0
  182. truthound/datadocs/exporters/markdown.py +284 -0
  183. truthound/datadocs/exporters/pdf.py +392 -0
  184. truthound/datadocs/i18n/__init__.py +86 -0
  185. truthound/datadocs/i18n/catalog.py +960 -0
  186. truthound/datadocs/i18n/formatting.py +505 -0
  187. truthound/datadocs/i18n/loader.py +256 -0
  188. truthound/datadocs/i18n/plurals.py +378 -0
  189. truthound/datadocs/renderers/__init__.py +42 -0
  190. truthound/datadocs/renderers/base.py +401 -0
  191. truthound/datadocs/renderers/custom.py +342 -0
  192. truthound/datadocs/renderers/jinja.py +697 -0
  193. truthound/datadocs/sections.py +736 -0
  194. truthound/datadocs/styles.py +931 -0
  195. truthound/datadocs/themes/__init__.py +101 -0
  196. truthound/datadocs/themes/base.py +336 -0
  197. truthound/datadocs/themes/default.py +417 -0
  198. truthound/datadocs/themes/enterprise.py +419 -0
  199. truthound/datadocs/themes/loader.py +336 -0
  200. truthound/datadocs/themes.py +301 -0
  201. truthound/datadocs/transformers/__init__.py +57 -0
  202. truthound/datadocs/transformers/base.py +268 -0
  203. truthound/datadocs/transformers/enrichers.py +544 -0
  204. truthound/datadocs/transformers/filters.py +447 -0
  205. truthound/datadocs/transformers/i18n.py +468 -0
  206. truthound/datadocs/versioning/__init__.py +62 -0
  207. truthound/datadocs/versioning/diff.py +639 -0
  208. truthound/datadocs/versioning/storage.py +497 -0
  209. truthound/datadocs/versioning/version.py +358 -0
  210. truthound/datasources/__init__.py +223 -0
  211. truthound/datasources/_async_protocols.py +222 -0
  212. truthound/datasources/_protocols.py +159 -0
  213. truthound/datasources/adapters.py +428 -0
  214. truthound/datasources/async_base.py +599 -0
  215. truthound/datasources/async_factory.py +511 -0
  216. truthound/datasources/base.py +516 -0
  217. truthound/datasources/factory.py +433 -0
  218. truthound/datasources/nosql/__init__.py +47 -0
  219. truthound/datasources/nosql/base.py +487 -0
  220. truthound/datasources/nosql/elasticsearch.py +801 -0
  221. truthound/datasources/nosql/mongodb.py +636 -0
  222. truthound/datasources/pandas_optimized.py +582 -0
  223. truthound/datasources/pandas_source.py +216 -0
  224. truthound/datasources/polars_source.py +395 -0
  225. truthound/datasources/spark_source.py +479 -0
  226. truthound/datasources/sql/__init__.py +154 -0
  227. truthound/datasources/sql/base.py +710 -0
  228. truthound/datasources/sql/bigquery.py +410 -0
  229. truthound/datasources/sql/cloud_base.py +199 -0
  230. truthound/datasources/sql/databricks.py +471 -0
  231. truthound/datasources/sql/mysql.py +316 -0
  232. truthound/datasources/sql/oracle.py +427 -0
  233. truthound/datasources/sql/postgresql.py +321 -0
  234. truthound/datasources/sql/redshift.py +479 -0
  235. truthound/datasources/sql/snowflake.py +439 -0
  236. truthound/datasources/sql/sqlite.py +286 -0
  237. truthound/datasources/sql/sqlserver.py +437 -0
  238. truthound/datasources/streaming/__init__.py +47 -0
  239. truthound/datasources/streaming/base.py +350 -0
  240. truthound/datasources/streaming/kafka.py +670 -0
  241. truthound/decorators.py +98 -0
  242. truthound/docs/__init__.py +69 -0
  243. truthound/docs/extractor.py +971 -0
  244. truthound/docs/generator.py +601 -0
  245. truthound/docs/parser.py +1037 -0
  246. truthound/docs/renderer.py +999 -0
  247. truthound/drift/__init__.py +22 -0
  248. truthound/drift/compare.py +189 -0
  249. truthound/drift/detectors.py +464 -0
  250. truthound/drift/report.py +160 -0
  251. truthound/execution/__init__.py +65 -0
  252. truthound/execution/_protocols.py +324 -0
  253. truthound/execution/base.py +576 -0
  254. truthound/execution/distributed/__init__.py +179 -0
  255. truthound/execution/distributed/aggregations.py +731 -0
  256. truthound/execution/distributed/arrow_bridge.py +817 -0
  257. truthound/execution/distributed/base.py +550 -0
  258. truthound/execution/distributed/dask_engine.py +976 -0
  259. truthound/execution/distributed/mixins.py +766 -0
  260. truthound/execution/distributed/protocols.py +756 -0
  261. truthound/execution/distributed/ray_engine.py +1127 -0
  262. truthound/execution/distributed/registry.py +446 -0
  263. truthound/execution/distributed/spark_engine.py +1011 -0
  264. truthound/execution/distributed/validator_adapter.py +682 -0
  265. truthound/execution/pandas_engine.py +401 -0
  266. truthound/execution/polars_engine.py +497 -0
  267. truthound/execution/pushdown/__init__.py +230 -0
  268. truthound/execution/pushdown/ast.py +1550 -0
  269. truthound/execution/pushdown/builder.py +1550 -0
  270. truthound/execution/pushdown/dialects.py +1072 -0
  271. truthound/execution/pushdown/executor.py +829 -0
  272. truthound/execution/pushdown/optimizer.py +1041 -0
  273. truthound/execution/sql_engine.py +518 -0
  274. truthound/infrastructure/__init__.py +189 -0
  275. truthound/infrastructure/audit.py +1515 -0
  276. truthound/infrastructure/config.py +1133 -0
  277. truthound/infrastructure/encryption.py +1132 -0
  278. truthound/infrastructure/logging.py +1503 -0
  279. truthound/infrastructure/metrics.py +1220 -0
  280. truthound/lineage/__init__.py +89 -0
  281. truthound/lineage/base.py +746 -0
  282. truthound/lineage/impact_analysis.py +474 -0
  283. truthound/lineage/integrations/__init__.py +22 -0
  284. truthound/lineage/integrations/openlineage.py +548 -0
  285. truthound/lineage/tracker.py +512 -0
  286. truthound/lineage/visualization/__init__.py +33 -0
  287. truthound/lineage/visualization/protocols.py +145 -0
  288. truthound/lineage/visualization/renderers/__init__.py +20 -0
  289. truthound/lineage/visualization/renderers/cytoscape.py +329 -0
  290. truthound/lineage/visualization/renderers/d3.py +331 -0
  291. truthound/lineage/visualization/renderers/graphviz.py +276 -0
  292. truthound/lineage/visualization/renderers/mermaid.py +308 -0
  293. truthound/maskers.py +113 -0
  294. truthound/ml/__init__.py +124 -0
  295. truthound/ml/anomaly_models/__init__.py +31 -0
  296. truthound/ml/anomaly_models/ensemble.py +362 -0
  297. truthound/ml/anomaly_models/isolation_forest.py +444 -0
  298. truthound/ml/anomaly_models/statistical.py +392 -0
  299. truthound/ml/base.py +1178 -0
  300. truthound/ml/drift_detection/__init__.py +26 -0
  301. truthound/ml/drift_detection/concept.py +381 -0
  302. truthound/ml/drift_detection/distribution.py +361 -0
  303. truthound/ml/drift_detection/feature.py +442 -0
  304. truthound/ml/drift_detection/multivariate.py +495 -0
  305. truthound/ml/monitoring/__init__.py +88 -0
  306. truthound/ml/monitoring/alerting/__init__.py +33 -0
  307. truthound/ml/monitoring/alerting/handlers.py +427 -0
  308. truthound/ml/monitoring/alerting/rules.py +508 -0
  309. truthound/ml/monitoring/collectors/__init__.py +19 -0
  310. truthound/ml/monitoring/collectors/composite.py +105 -0
  311. truthound/ml/monitoring/collectors/drift.py +324 -0
  312. truthound/ml/monitoring/collectors/performance.py +179 -0
  313. truthound/ml/monitoring/collectors/quality.py +369 -0
  314. truthound/ml/monitoring/monitor.py +536 -0
  315. truthound/ml/monitoring/protocols.py +451 -0
  316. truthound/ml/monitoring/stores/__init__.py +15 -0
  317. truthound/ml/monitoring/stores/memory.py +201 -0
  318. truthound/ml/monitoring/stores/prometheus.py +296 -0
  319. truthound/ml/rule_learning/__init__.py +25 -0
  320. truthound/ml/rule_learning/constraint_miner.py +443 -0
  321. truthound/ml/rule_learning/pattern_learner.py +499 -0
  322. truthound/ml/rule_learning/profile_learner.py +462 -0
  323. truthound/multitenancy/__init__.py +326 -0
  324. truthound/multitenancy/core.py +852 -0
  325. truthound/multitenancy/integration.py +597 -0
  326. truthound/multitenancy/isolation.py +630 -0
  327. truthound/multitenancy/manager.py +770 -0
  328. truthound/multitenancy/middleware.py +765 -0
  329. truthound/multitenancy/quota.py +537 -0
  330. truthound/multitenancy/resolvers.py +603 -0
  331. truthound/multitenancy/storage.py +703 -0
  332. truthound/observability/__init__.py +307 -0
  333. truthound/observability/context.py +531 -0
  334. truthound/observability/instrumentation.py +611 -0
  335. truthound/observability/logging.py +887 -0
  336. truthound/observability/metrics.py +1157 -0
  337. truthound/observability/tracing/__init__.py +178 -0
  338. truthound/observability/tracing/baggage.py +310 -0
  339. truthound/observability/tracing/config.py +426 -0
  340. truthound/observability/tracing/exporter.py +787 -0
  341. truthound/observability/tracing/integration.py +1018 -0
  342. truthound/observability/tracing/otel/__init__.py +146 -0
  343. truthound/observability/tracing/otel/adapter.py +982 -0
  344. truthound/observability/tracing/otel/bridge.py +1177 -0
  345. truthound/observability/tracing/otel/compat.py +681 -0
  346. truthound/observability/tracing/otel/config.py +691 -0
  347. truthound/observability/tracing/otel/detection.py +327 -0
  348. truthound/observability/tracing/otel/protocols.py +426 -0
  349. truthound/observability/tracing/processor.py +561 -0
  350. truthound/observability/tracing/propagator.py +757 -0
  351. truthound/observability/tracing/provider.py +569 -0
  352. truthound/observability/tracing/resource.py +515 -0
  353. truthound/observability/tracing/sampler.py +487 -0
  354. truthound/observability/tracing/span.py +676 -0
  355. truthound/plugins/__init__.py +198 -0
  356. truthound/plugins/base.py +599 -0
  357. truthound/plugins/cli.py +680 -0
  358. truthound/plugins/dependencies/__init__.py +42 -0
  359. truthound/plugins/dependencies/graph.py +422 -0
  360. truthound/plugins/dependencies/resolver.py +417 -0
  361. truthound/plugins/discovery.py +379 -0
  362. truthound/plugins/docs/__init__.py +46 -0
  363. truthound/plugins/docs/extractor.py +444 -0
  364. truthound/plugins/docs/renderer.py +499 -0
  365. truthound/plugins/enterprise_manager.py +877 -0
  366. truthound/plugins/examples/__init__.py +19 -0
  367. truthound/plugins/examples/custom_validators.py +317 -0
  368. truthound/plugins/examples/slack_notifier.py +312 -0
  369. truthound/plugins/examples/xml_reporter.py +254 -0
  370. truthound/plugins/hooks.py +558 -0
  371. truthound/plugins/lifecycle/__init__.py +43 -0
  372. truthound/plugins/lifecycle/hot_reload.py +402 -0
  373. truthound/plugins/lifecycle/manager.py +371 -0
  374. truthound/plugins/manager.py +736 -0
  375. truthound/plugins/registry.py +338 -0
  376. truthound/plugins/security/__init__.py +93 -0
  377. truthound/plugins/security/exceptions.py +332 -0
  378. truthound/plugins/security/policies.py +348 -0
  379. truthound/plugins/security/protocols.py +643 -0
  380. truthound/plugins/security/sandbox/__init__.py +45 -0
  381. truthound/plugins/security/sandbox/context.py +158 -0
  382. truthound/plugins/security/sandbox/engines/__init__.py +19 -0
  383. truthound/plugins/security/sandbox/engines/container.py +379 -0
  384. truthound/plugins/security/sandbox/engines/noop.py +144 -0
  385. truthound/plugins/security/sandbox/engines/process.py +336 -0
  386. truthound/plugins/security/sandbox/factory.py +211 -0
  387. truthound/plugins/security/signing/__init__.py +57 -0
  388. truthound/plugins/security/signing/service.py +330 -0
  389. truthound/plugins/security/signing/trust_store.py +368 -0
  390. truthound/plugins/security/signing/verifier.py +459 -0
  391. truthound/plugins/versioning/__init__.py +41 -0
  392. truthound/plugins/versioning/constraints.py +297 -0
  393. truthound/plugins/versioning/resolver.py +329 -0
  394. truthound/profiler/__init__.py +1729 -0
  395. truthound/profiler/_lazy.py +452 -0
  396. truthound/profiler/ab_testing/__init__.py +80 -0
  397. truthound/profiler/ab_testing/analysis.py +449 -0
  398. truthound/profiler/ab_testing/base.py +257 -0
  399. truthound/profiler/ab_testing/experiment.py +395 -0
  400. truthound/profiler/ab_testing/tracking.py +368 -0
  401. truthound/profiler/auto_threshold.py +1170 -0
  402. truthound/profiler/base.py +579 -0
  403. truthound/profiler/cache_patterns.py +911 -0
  404. truthound/profiler/caching.py +1303 -0
  405. truthound/profiler/column_profiler.py +712 -0
  406. truthound/profiler/comparison.py +1007 -0
  407. truthound/profiler/custom_patterns.py +1170 -0
  408. truthound/profiler/dashboard/__init__.py +50 -0
  409. truthound/profiler/dashboard/app.py +476 -0
  410. truthound/profiler/dashboard/components.py +457 -0
  411. truthound/profiler/dashboard/config.py +72 -0
  412. truthound/profiler/distributed/__init__.py +83 -0
  413. truthound/profiler/distributed/base.py +281 -0
  414. truthound/profiler/distributed/dask_backend.py +498 -0
  415. truthound/profiler/distributed/local_backend.py +293 -0
  416. truthound/profiler/distributed/profiler.py +304 -0
  417. truthound/profiler/distributed/ray_backend.py +374 -0
  418. truthound/profiler/distributed/spark_backend.py +375 -0
  419. truthound/profiler/distributed.py +1366 -0
  420. truthound/profiler/enterprise_sampling.py +1065 -0
  421. truthound/profiler/errors.py +488 -0
  422. truthound/profiler/evolution/__init__.py +91 -0
  423. truthound/profiler/evolution/alerts.py +426 -0
  424. truthound/profiler/evolution/changes.py +206 -0
  425. truthound/profiler/evolution/compatibility.py +365 -0
  426. truthound/profiler/evolution/detector.py +372 -0
  427. truthound/profiler/evolution/protocols.py +121 -0
  428. truthound/profiler/generators/__init__.py +48 -0
  429. truthound/profiler/generators/base.py +384 -0
  430. truthound/profiler/generators/ml_rules.py +375 -0
  431. truthound/profiler/generators/pattern_rules.py +384 -0
  432. truthound/profiler/generators/schema_rules.py +267 -0
  433. truthound/profiler/generators/stats_rules.py +324 -0
  434. truthound/profiler/generators/suite_generator.py +857 -0
  435. truthound/profiler/i18n.py +1542 -0
  436. truthound/profiler/incremental.py +554 -0
  437. truthound/profiler/incremental_validation.py +1710 -0
  438. truthound/profiler/integration/__init__.py +73 -0
  439. truthound/profiler/integration/adapters.py +345 -0
  440. truthound/profiler/integration/context.py +371 -0
  441. truthound/profiler/integration/executor.py +527 -0
  442. truthound/profiler/integration/naming.py +75 -0
  443. truthound/profiler/integration/protocols.py +243 -0
  444. truthound/profiler/memory.py +1185 -0
  445. truthound/profiler/migration/__init__.py +60 -0
  446. truthound/profiler/migration/base.py +345 -0
  447. truthound/profiler/migration/manager.py +444 -0
  448. truthound/profiler/migration/v1_0_to_v1_1.py +484 -0
  449. truthound/profiler/ml/__init__.py +73 -0
  450. truthound/profiler/ml/base.py +244 -0
  451. truthound/profiler/ml/classifier.py +507 -0
  452. truthound/profiler/ml/feature_extraction.py +604 -0
  453. truthound/profiler/ml/pretrained.py +448 -0
  454. truthound/profiler/ml_inference.py +1276 -0
  455. truthound/profiler/native_patterns.py +815 -0
  456. truthound/profiler/observability.py +1184 -0
  457. truthound/profiler/process_timeout.py +1566 -0
  458. truthound/profiler/progress.py +568 -0
  459. truthound/profiler/progress_callbacks.py +1734 -0
  460. truthound/profiler/quality.py +1345 -0
  461. truthound/profiler/resilience.py +1180 -0
  462. truthound/profiler/sampled_matcher.py +794 -0
  463. truthound/profiler/sampling.py +1288 -0
  464. truthound/profiler/scheduling/__init__.py +82 -0
  465. truthound/profiler/scheduling/protocols.py +214 -0
  466. truthound/profiler/scheduling/scheduler.py +474 -0
  467. truthound/profiler/scheduling/storage.py +457 -0
  468. truthound/profiler/scheduling/triggers.py +449 -0
  469. truthound/profiler/schema.py +603 -0
  470. truthound/profiler/streaming.py +685 -0
  471. truthound/profiler/streaming_patterns.py +1354 -0
  472. truthound/profiler/suite_cli.py +625 -0
  473. truthound/profiler/suite_config.py +789 -0
  474. truthound/profiler/suite_export.py +1268 -0
  475. truthound/profiler/table_profiler.py +547 -0
  476. truthound/profiler/timeout.py +565 -0
  477. truthound/profiler/validation.py +1532 -0
  478. truthound/profiler/visualization/__init__.py +118 -0
  479. truthound/profiler/visualization/base.py +346 -0
  480. truthound/profiler/visualization/generator.py +1259 -0
  481. truthound/profiler/visualization/plotly_renderer.py +811 -0
  482. truthound/profiler/visualization/renderers.py +669 -0
  483. truthound/profiler/visualization/sections.py +540 -0
  484. truthound/profiler/visualization.py +2122 -0
  485. truthound/profiler/yaml_validation.py +1151 -0
  486. truthound/py.typed +0 -0
  487. truthound/ratelimit/__init__.py +248 -0
  488. truthound/ratelimit/algorithms.py +1108 -0
  489. truthound/ratelimit/core.py +573 -0
  490. truthound/ratelimit/integration.py +532 -0
  491. truthound/ratelimit/limiter.py +663 -0
  492. truthound/ratelimit/middleware.py +700 -0
  493. truthound/ratelimit/policy.py +792 -0
  494. truthound/ratelimit/storage.py +763 -0
  495. truthound/rbac/__init__.py +340 -0
  496. truthound/rbac/core.py +976 -0
  497. truthound/rbac/integration.py +760 -0
  498. truthound/rbac/manager.py +1052 -0
  499. truthound/rbac/middleware.py +842 -0
  500. truthound/rbac/policy.py +954 -0
  501. truthound/rbac/storage.py +878 -0
  502. truthound/realtime/__init__.py +141 -0
  503. truthound/realtime/adapters/__init__.py +43 -0
  504. truthound/realtime/adapters/base.py +533 -0
  505. truthound/realtime/adapters/kafka.py +487 -0
  506. truthound/realtime/adapters/kinesis.py +479 -0
  507. truthound/realtime/adapters/mock.py +243 -0
  508. truthound/realtime/base.py +553 -0
  509. truthound/realtime/factory.py +382 -0
  510. truthound/realtime/incremental.py +660 -0
  511. truthound/realtime/processing/__init__.py +67 -0
  512. truthound/realtime/processing/exactly_once.py +575 -0
  513. truthound/realtime/processing/state.py +547 -0
  514. truthound/realtime/processing/windows.py +647 -0
  515. truthound/realtime/protocols.py +569 -0
  516. truthound/realtime/streaming.py +605 -0
  517. truthound/realtime/testing/__init__.py +32 -0
  518. truthound/realtime/testing/containers.py +615 -0
  519. truthound/realtime/testing/fixtures.py +484 -0
  520. truthound/report.py +280 -0
  521. truthound/reporters/__init__.py +46 -0
  522. truthound/reporters/_protocols.py +30 -0
  523. truthound/reporters/base.py +324 -0
  524. truthound/reporters/ci/__init__.py +66 -0
  525. truthound/reporters/ci/azure.py +436 -0
  526. truthound/reporters/ci/base.py +509 -0
  527. truthound/reporters/ci/bitbucket.py +567 -0
  528. truthound/reporters/ci/circleci.py +547 -0
  529. truthound/reporters/ci/detection.py +364 -0
  530. truthound/reporters/ci/factory.py +182 -0
  531. truthound/reporters/ci/github.py +388 -0
  532. truthound/reporters/ci/gitlab.py +471 -0
  533. truthound/reporters/ci/jenkins.py +525 -0
  534. truthound/reporters/console_reporter.py +299 -0
  535. truthound/reporters/factory.py +211 -0
  536. truthound/reporters/html_reporter.py +524 -0
  537. truthound/reporters/json_reporter.py +256 -0
  538. truthound/reporters/markdown_reporter.py +280 -0
  539. truthound/reporters/sdk/__init__.py +174 -0
  540. truthound/reporters/sdk/builder.py +558 -0
  541. truthound/reporters/sdk/mixins.py +1150 -0
  542. truthound/reporters/sdk/schema.py +1493 -0
  543. truthound/reporters/sdk/templates.py +666 -0
  544. truthound/reporters/sdk/testing.py +968 -0
  545. truthound/scanners.py +170 -0
  546. truthound/scheduling/__init__.py +122 -0
  547. truthound/scheduling/cron.py +1136 -0
  548. truthound/scheduling/presets.py +212 -0
  549. truthound/schema.py +275 -0
  550. truthound/secrets/__init__.py +173 -0
  551. truthound/secrets/base.py +618 -0
  552. truthound/secrets/cloud.py +682 -0
  553. truthound/secrets/integration.py +507 -0
  554. truthound/secrets/manager.py +633 -0
  555. truthound/secrets/oidc/__init__.py +172 -0
  556. truthound/secrets/oidc/base.py +902 -0
  557. truthound/secrets/oidc/credential_provider.py +623 -0
  558. truthound/secrets/oidc/exchangers.py +1001 -0
  559. truthound/secrets/oidc/github/__init__.py +110 -0
  560. truthound/secrets/oidc/github/claims.py +718 -0
  561. truthound/secrets/oidc/github/enhanced_provider.py +693 -0
  562. truthound/secrets/oidc/github/trust_policy.py +742 -0
  563. truthound/secrets/oidc/github/verification.py +723 -0
  564. truthound/secrets/oidc/github/workflow.py +691 -0
  565. truthound/secrets/oidc/providers.py +825 -0
  566. truthound/secrets/providers.py +506 -0
  567. truthound/secrets/resolver.py +495 -0
  568. truthound/stores/__init__.py +177 -0
  569. truthound/stores/backends/__init__.py +18 -0
  570. truthound/stores/backends/_protocols.py +340 -0
  571. truthound/stores/backends/azure_blob.py +530 -0
  572. truthound/stores/backends/concurrent_filesystem.py +915 -0
  573. truthound/stores/backends/connection_pool.py +1365 -0
  574. truthound/stores/backends/database.py +743 -0
  575. truthound/stores/backends/filesystem.py +538 -0
  576. truthound/stores/backends/gcs.py +399 -0
  577. truthound/stores/backends/memory.py +354 -0
  578. truthound/stores/backends/s3.py +434 -0
  579. truthound/stores/backpressure/__init__.py +84 -0
  580. truthound/stores/backpressure/base.py +375 -0
  581. truthound/stores/backpressure/circuit_breaker.py +434 -0
  582. truthound/stores/backpressure/monitor.py +376 -0
  583. truthound/stores/backpressure/strategies.py +677 -0
  584. truthound/stores/base.py +551 -0
  585. truthound/stores/batching/__init__.py +65 -0
  586. truthound/stores/batching/base.py +305 -0
  587. truthound/stores/batching/buffer.py +370 -0
  588. truthound/stores/batching/store.py +248 -0
  589. truthound/stores/batching/writer.py +521 -0
  590. truthound/stores/caching/__init__.py +60 -0
  591. truthound/stores/caching/backends.py +684 -0
  592. truthound/stores/caching/base.py +356 -0
  593. truthound/stores/caching/store.py +305 -0
  594. truthound/stores/compression/__init__.py +193 -0
  595. truthound/stores/compression/adaptive.py +694 -0
  596. truthound/stores/compression/base.py +514 -0
  597. truthound/stores/compression/pipeline.py +868 -0
  598. truthound/stores/compression/providers.py +672 -0
  599. truthound/stores/compression/streaming.py +832 -0
  600. truthound/stores/concurrency/__init__.py +81 -0
  601. truthound/stores/concurrency/atomic.py +556 -0
  602. truthound/stores/concurrency/index.py +775 -0
  603. truthound/stores/concurrency/locks.py +576 -0
  604. truthound/stores/concurrency/manager.py +482 -0
  605. truthound/stores/encryption/__init__.py +297 -0
  606. truthound/stores/encryption/base.py +952 -0
  607. truthound/stores/encryption/keys.py +1191 -0
  608. truthound/stores/encryption/pipeline.py +903 -0
  609. truthound/stores/encryption/providers.py +953 -0
  610. truthound/stores/encryption/streaming.py +950 -0
  611. truthound/stores/expectations.py +227 -0
  612. truthound/stores/factory.py +246 -0
  613. truthound/stores/migration/__init__.py +75 -0
  614. truthound/stores/migration/base.py +480 -0
  615. truthound/stores/migration/manager.py +347 -0
  616. truthound/stores/migration/registry.py +382 -0
  617. truthound/stores/migration/store.py +559 -0
  618. truthound/stores/observability/__init__.py +106 -0
  619. truthound/stores/observability/audit.py +718 -0
  620. truthound/stores/observability/config.py +270 -0
  621. truthound/stores/observability/factory.py +208 -0
  622. truthound/stores/observability/metrics.py +636 -0
  623. truthound/stores/observability/protocols.py +410 -0
  624. truthound/stores/observability/store.py +570 -0
  625. truthound/stores/observability/tracing.py +784 -0
  626. truthound/stores/replication/__init__.py +76 -0
  627. truthound/stores/replication/base.py +260 -0
  628. truthound/stores/replication/monitor.py +269 -0
  629. truthound/stores/replication/store.py +439 -0
  630. truthound/stores/replication/syncer.py +391 -0
  631. truthound/stores/results.py +359 -0
  632. truthound/stores/retention/__init__.py +77 -0
  633. truthound/stores/retention/base.py +378 -0
  634. truthound/stores/retention/policies.py +621 -0
  635. truthound/stores/retention/scheduler.py +279 -0
  636. truthound/stores/retention/store.py +526 -0
  637. truthound/stores/streaming/__init__.py +138 -0
  638. truthound/stores/streaming/base.py +801 -0
  639. truthound/stores/streaming/database.py +984 -0
  640. truthound/stores/streaming/filesystem.py +719 -0
  641. truthound/stores/streaming/reader.py +629 -0
  642. truthound/stores/streaming/s3.py +843 -0
  643. truthound/stores/streaming/writer.py +790 -0
  644. truthound/stores/tiering/__init__.py +108 -0
  645. truthound/stores/tiering/base.py +462 -0
  646. truthound/stores/tiering/manager.py +249 -0
  647. truthound/stores/tiering/policies.py +692 -0
  648. truthound/stores/tiering/store.py +526 -0
  649. truthound/stores/versioning/__init__.py +56 -0
  650. truthound/stores/versioning/base.py +376 -0
  651. truthound/stores/versioning/store.py +660 -0
  652. truthound/stores/versioning/strategies.py +353 -0
  653. truthound/types.py +56 -0
  654. truthound/validators/__init__.py +774 -0
  655. truthound/validators/aggregate/__init__.py +27 -0
  656. truthound/validators/aggregate/central.py +116 -0
  657. truthound/validators/aggregate/extremes.py +116 -0
  658. truthound/validators/aggregate/spread.py +118 -0
  659. truthound/validators/aggregate/sum.py +64 -0
  660. truthound/validators/aggregate/type.py +78 -0
  661. truthound/validators/anomaly/__init__.py +93 -0
  662. truthound/validators/anomaly/base.py +431 -0
  663. truthound/validators/anomaly/ml_based.py +1190 -0
  664. truthound/validators/anomaly/multivariate.py +647 -0
  665. truthound/validators/anomaly/statistical.py +599 -0
  666. truthound/validators/base.py +1089 -0
  667. truthound/validators/business_rule/__init__.py +46 -0
  668. truthound/validators/business_rule/base.py +147 -0
  669. truthound/validators/business_rule/checksum.py +509 -0
  670. truthound/validators/business_rule/financial.py +526 -0
  671. truthound/validators/cache.py +733 -0
  672. truthound/validators/completeness/__init__.py +39 -0
  673. truthound/validators/completeness/conditional.py +73 -0
  674. truthound/validators/completeness/default.py +98 -0
  675. truthound/validators/completeness/empty.py +103 -0
  676. truthound/validators/completeness/nan.py +337 -0
  677. truthound/validators/completeness/null.py +152 -0
  678. truthound/validators/cross_table/__init__.py +17 -0
  679. truthound/validators/cross_table/aggregate.py +333 -0
  680. truthound/validators/cross_table/row_count.py +122 -0
  681. truthound/validators/datetime/__init__.py +29 -0
  682. truthound/validators/datetime/format.py +78 -0
  683. truthound/validators/datetime/freshness.py +269 -0
  684. truthound/validators/datetime/order.py +73 -0
  685. truthound/validators/datetime/parseable.py +185 -0
  686. truthound/validators/datetime/range.py +202 -0
  687. truthound/validators/datetime/timezone.py +69 -0
  688. truthound/validators/distribution/__init__.py +49 -0
  689. truthound/validators/distribution/distribution.py +128 -0
  690. truthound/validators/distribution/monotonic.py +119 -0
  691. truthound/validators/distribution/outlier.py +178 -0
  692. truthound/validators/distribution/quantile.py +80 -0
  693. truthound/validators/distribution/range.py +254 -0
  694. truthound/validators/distribution/set.py +125 -0
  695. truthound/validators/distribution/statistical.py +459 -0
  696. truthound/validators/drift/__init__.py +79 -0
  697. truthound/validators/drift/base.py +427 -0
  698. truthound/validators/drift/multi_feature.py +401 -0
  699. truthound/validators/drift/numeric.py +395 -0
  700. truthound/validators/drift/psi.py +446 -0
  701. truthound/validators/drift/statistical.py +510 -0
  702. truthound/validators/enterprise.py +1658 -0
  703. truthound/validators/geospatial/__init__.py +80 -0
  704. truthound/validators/geospatial/base.py +97 -0
  705. truthound/validators/geospatial/boundary.py +238 -0
  706. truthound/validators/geospatial/coordinate.py +351 -0
  707. truthound/validators/geospatial/distance.py +399 -0
  708. truthound/validators/geospatial/polygon.py +665 -0
  709. truthound/validators/i18n/__init__.py +308 -0
  710. truthound/validators/i18n/bidi.py +571 -0
  711. truthound/validators/i18n/catalogs.py +570 -0
  712. truthound/validators/i18n/dialects.py +763 -0
  713. truthound/validators/i18n/extended_catalogs.py +549 -0
  714. truthound/validators/i18n/formatting.py +1434 -0
  715. truthound/validators/i18n/loader.py +1020 -0
  716. truthound/validators/i18n/messages.py +521 -0
  717. truthound/validators/i18n/plural.py +683 -0
  718. truthound/validators/i18n/protocols.py +855 -0
  719. truthound/validators/i18n/tms.py +1162 -0
  720. truthound/validators/localization/__init__.py +53 -0
  721. truthound/validators/localization/base.py +122 -0
  722. truthound/validators/localization/chinese.py +362 -0
  723. truthound/validators/localization/japanese.py +275 -0
  724. truthound/validators/localization/korean.py +524 -0
  725. truthound/validators/memory/__init__.py +94 -0
  726. truthound/validators/memory/approximate_knn.py +506 -0
  727. truthound/validators/memory/base.py +547 -0
  728. truthound/validators/memory/sgd_online.py +719 -0
  729. truthound/validators/memory/streaming_ecdf.py +753 -0
  730. truthound/validators/ml_feature/__init__.py +54 -0
  731. truthound/validators/ml_feature/base.py +249 -0
  732. truthound/validators/ml_feature/correlation.py +299 -0
  733. truthound/validators/ml_feature/leakage.py +344 -0
  734. truthound/validators/ml_feature/null_impact.py +270 -0
  735. truthound/validators/ml_feature/scale.py +264 -0
  736. truthound/validators/multi_column/__init__.py +89 -0
  737. truthound/validators/multi_column/arithmetic.py +284 -0
  738. truthound/validators/multi_column/base.py +231 -0
  739. truthound/validators/multi_column/comparison.py +273 -0
  740. truthound/validators/multi_column/consistency.py +312 -0
  741. truthound/validators/multi_column/statistical.py +299 -0
  742. truthound/validators/optimization/__init__.py +164 -0
  743. truthound/validators/optimization/aggregation.py +563 -0
  744. truthound/validators/optimization/covariance.py +556 -0
  745. truthound/validators/optimization/geo.py +626 -0
  746. truthound/validators/optimization/graph.py +587 -0
  747. truthound/validators/optimization/orchestrator.py +970 -0
  748. truthound/validators/optimization/profiling.py +1312 -0
  749. truthound/validators/privacy/__init__.py +223 -0
  750. truthound/validators/privacy/base.py +635 -0
  751. truthound/validators/privacy/ccpa.py +670 -0
  752. truthound/validators/privacy/gdpr.py +728 -0
  753. truthound/validators/privacy/global_patterns.py +604 -0
  754. truthound/validators/privacy/plugins.py +867 -0
  755. truthound/validators/profiling/__init__.py +52 -0
  756. truthound/validators/profiling/base.py +175 -0
  757. truthound/validators/profiling/cardinality.py +312 -0
  758. truthound/validators/profiling/entropy.py +391 -0
  759. truthound/validators/profiling/frequency.py +455 -0
  760. truthound/validators/pushdown_support.py +660 -0
  761. truthound/validators/query/__init__.py +91 -0
  762. truthound/validators/query/aggregate.py +346 -0
  763. truthound/validators/query/base.py +246 -0
  764. truthound/validators/query/column.py +249 -0
  765. truthound/validators/query/expression.py +274 -0
  766. truthound/validators/query/result.py +323 -0
  767. truthound/validators/query/row_count.py +264 -0
  768. truthound/validators/referential/__init__.py +80 -0
  769. truthound/validators/referential/base.py +395 -0
  770. truthound/validators/referential/cascade.py +391 -0
  771. truthound/validators/referential/circular.py +563 -0
  772. truthound/validators/referential/foreign_key.py +624 -0
  773. truthound/validators/referential/orphan.py +485 -0
  774. truthound/validators/registry.py +112 -0
  775. truthound/validators/schema/__init__.py +41 -0
  776. truthound/validators/schema/column_count.py +142 -0
  777. truthound/validators/schema/column_exists.py +80 -0
  778. truthound/validators/schema/column_order.py +82 -0
  779. truthound/validators/schema/column_pair.py +85 -0
  780. truthound/validators/schema/column_pair_set.py +195 -0
  781. truthound/validators/schema/column_type.py +94 -0
  782. truthound/validators/schema/multi_column.py +53 -0
  783. truthound/validators/schema/multi_column_aggregate.py +175 -0
  784. truthound/validators/schema/referential.py +274 -0
  785. truthound/validators/schema/table_schema.py +91 -0
  786. truthound/validators/schema_validator.py +219 -0
  787. truthound/validators/sdk/__init__.py +250 -0
  788. truthound/validators/sdk/builder.py +680 -0
  789. truthound/validators/sdk/decorators.py +474 -0
  790. truthound/validators/sdk/enterprise/__init__.py +211 -0
  791. truthound/validators/sdk/enterprise/docs.py +725 -0
  792. truthound/validators/sdk/enterprise/fuzzing.py +659 -0
  793. truthound/validators/sdk/enterprise/licensing.py +709 -0
  794. truthound/validators/sdk/enterprise/manager.py +543 -0
  795. truthound/validators/sdk/enterprise/resources.py +628 -0
  796. truthound/validators/sdk/enterprise/sandbox.py +766 -0
  797. truthound/validators/sdk/enterprise/signing.py +603 -0
  798. truthound/validators/sdk/enterprise/templates.py +865 -0
  799. truthound/validators/sdk/enterprise/versioning.py +659 -0
  800. truthound/validators/sdk/templates.py +757 -0
  801. truthound/validators/sdk/testing.py +807 -0
  802. truthound/validators/security/__init__.py +181 -0
  803. truthound/validators/security/redos/__init__.py +182 -0
  804. truthound/validators/security/redos/core.py +861 -0
  805. truthound/validators/security/redos/cpu_monitor.py +593 -0
  806. truthound/validators/security/redos/cve_database.py +791 -0
  807. truthound/validators/security/redos/ml/__init__.py +155 -0
  808. truthound/validators/security/redos/ml/base.py +785 -0
  809. truthound/validators/security/redos/ml/datasets.py +618 -0
  810. truthound/validators/security/redos/ml/features.py +359 -0
  811. truthound/validators/security/redos/ml/models.py +1000 -0
  812. truthound/validators/security/redos/ml/predictor.py +507 -0
  813. truthound/validators/security/redos/ml/storage.py +632 -0
  814. truthound/validators/security/redos/ml/training.py +571 -0
  815. truthound/validators/security/redos/ml_analyzer.py +937 -0
  816. truthound/validators/security/redos/optimizer.py +674 -0
  817. truthound/validators/security/redos/profiler.py +682 -0
  818. truthound/validators/security/redos/re2_engine.py +709 -0
  819. truthound/validators/security/redos.py +886 -0
  820. truthound/validators/security/sql_security.py +1247 -0
  821. truthound/validators/streaming/__init__.py +126 -0
  822. truthound/validators/streaming/base.py +292 -0
  823. truthound/validators/streaming/completeness.py +210 -0
  824. truthound/validators/streaming/mixin.py +575 -0
  825. truthound/validators/streaming/range.py +308 -0
  826. truthound/validators/streaming/sources.py +846 -0
  827. truthound/validators/string/__init__.py +57 -0
  828. truthound/validators/string/casing.py +158 -0
  829. truthound/validators/string/charset.py +96 -0
  830. truthound/validators/string/format.py +501 -0
  831. truthound/validators/string/json.py +77 -0
  832. truthound/validators/string/json_schema.py +184 -0
  833. truthound/validators/string/length.py +104 -0
  834. truthound/validators/string/like_pattern.py +237 -0
  835. truthound/validators/string/regex.py +202 -0
  836. truthound/validators/string/regex_extended.py +435 -0
  837. truthound/validators/table/__init__.py +88 -0
  838. truthound/validators/table/base.py +78 -0
  839. truthound/validators/table/column_count.py +198 -0
  840. truthound/validators/table/freshness.py +362 -0
  841. truthound/validators/table/row_count.py +251 -0
  842. truthound/validators/table/schema.py +333 -0
  843. truthound/validators/table/size.py +285 -0
  844. truthound/validators/timeout/__init__.py +102 -0
  845. truthound/validators/timeout/advanced/__init__.py +247 -0
  846. truthound/validators/timeout/advanced/circuit_breaker.py +675 -0
  847. truthound/validators/timeout/advanced/prediction.py +773 -0
  848. truthound/validators/timeout/advanced/priority.py +618 -0
  849. truthound/validators/timeout/advanced/redis_backend.py +770 -0
  850. truthound/validators/timeout/advanced/retry.py +721 -0
  851. truthound/validators/timeout/advanced/sampling.py +788 -0
  852. truthound/validators/timeout/advanced/sla.py +661 -0
  853. truthound/validators/timeout/advanced/telemetry.py +804 -0
  854. truthound/validators/timeout/cascade.py +477 -0
  855. truthound/validators/timeout/deadline.py +657 -0
  856. truthound/validators/timeout/degradation.py +525 -0
  857. truthound/validators/timeout/distributed.py +597 -0
  858. truthound/validators/timeseries/__init__.py +89 -0
  859. truthound/validators/timeseries/base.py +326 -0
  860. truthound/validators/timeseries/completeness.py +617 -0
  861. truthound/validators/timeseries/gap.py +485 -0
  862. truthound/validators/timeseries/monotonic.py +310 -0
  863. truthound/validators/timeseries/seasonality.py +422 -0
  864. truthound/validators/timeseries/trend.py +510 -0
  865. truthound/validators/uniqueness/__init__.py +59 -0
  866. truthound/validators/uniqueness/approximate.py +475 -0
  867. truthound/validators/uniqueness/distinct_values.py +253 -0
  868. truthound/validators/uniqueness/duplicate.py +118 -0
  869. truthound/validators/uniqueness/primary_key.py +140 -0
  870. truthound/validators/uniqueness/unique.py +191 -0
  871. truthound/validators/uniqueness/within_record.py +599 -0
  872. truthound/validators/utils.py +756 -0
  873. truthound-1.0.8.dist-info/METADATA +474 -0
  874. truthound-1.0.8.dist-info/RECORD +877 -0
  875. truthound-1.0.8.dist-info/WHEEL +4 -0
  876. truthound-1.0.8.dist-info/entry_points.txt +2 -0
  877. truthound-1.0.8.dist-info/licenses/LICENSE +190 -0
@@ -0,0 +1,1170 @@
1
+ """Automatic threshold tuning based on data characteristics.
2
+
3
+ This module provides intelligent threshold tuning for validation rules:
4
+ - Analyzes data distribution to determine optimal thresholds
5
+ - Adapts strictness based on data quality
6
+ - Supports multiple tuning strategies
7
+ - Provides confidence-based recommendations
8
+
9
+ Key features:
10
+ - Statistical analysis for threshold determination
11
+ - Outlier detection for boundary setting
12
+ - Domain-aware defaults
13
+ - A/B testing support for threshold comparison
14
+
15
+ Example:
16
+ from truthound.profiler.auto_threshold import (
17
+ ThresholdTuner,
18
+ tune_thresholds,
19
+ TuningStrategy,
20
+ )
21
+
22
+ # Create tuner
23
+ tuner = ThresholdTuner(strategy="adaptive")
24
+
25
+ # Tune thresholds for a profile
26
+ thresholds = tuner.tune(profile)
27
+
28
+ print(f"Null threshold: {thresholds.null_threshold}")
29
+ print(f"Uniqueness threshold: {thresholds.uniqueness_threshold}")
30
+ """
31
+
32
+ from __future__ import annotations
33
+
34
+ import math
35
+ import statistics
36
+ from abc import ABC, abstractmethod
37
+ from collections import defaultdict
38
+ from dataclasses import dataclass, field
39
+ from datetime import datetime
40
+ from enum import Enum
41
+ from typing import Any, Callable, Dict, List, Optional, Tuple
42
+
43
+ import polars as pl
44
+
45
+ from truthound.profiler.base import (
46
+ ColumnProfile,
47
+ DataType,
48
+ DistributionStats,
49
+ Strictness,
50
+ TableProfile,
51
+ )
52
+
53
+
54
+ # =============================================================================
55
+ # Types and Enums
56
+ # =============================================================================
57
+
58
+
59
class TuningStrategy(str, Enum):
    """Names of the available threshold tuning strategies.

    The enum also derives from ``str``, so every member compares equal to
    its lowercase string value.
    """

    # Strict thresholds, fewer false positives
    CONSERVATIVE = "conservative"
    # Balance between precision and recall
    BALANCED = "balanced"
    # Loose thresholds, fewer false negatives
    PERMISSIVE = "permissive"
    # Adapt based on data characteristics
    ADAPTIVE = "adaptive"
    # Use statistical methods (percentiles, IQR)
    STATISTICAL = "statistical"
    # Use domain-specific knowledge
    DOMAIN_AWARE = "domain_aware"
68
+
69
+
70
class ThresholdType(str, Enum):
    """Identifiers for the kinds of threshold handled by this module.

    The enum also derives from ``str``, so every member compares equal to
    its lowercase string value.
    """

    # Ratio-style thresholds
    NULL_RATIO = "null_ratio"
    UNIQUENESS_RATIO = "uniqueness_ratio"

    # Value bounds
    MIN_VALUE = "min_value"
    MAX_VALUE = "max_value"

    # Length bounds
    MIN_LENGTH = "min_length"
    MAX_LENGTH = "max_length"

    # Remaining threshold kinds
    PATTERN_MATCH_RATIO = "pattern_match_ratio"
    OUTLIER_RATIO = "outlier_ratio"
    CARDINALITY = "cardinality"
82
+
83
+
84
+ # =============================================================================
85
+ # Threshold Configuration
86
+ # =============================================================================
87
+
88
+
89
@dataclass
class ColumnThresholds:
    """Thresholds for a single column.

    Optional fields default to ``None``, which means no threshold has been
    set for that aspect of the column.
    """

    column_name: str
    null_threshold: float = 0.0
    uniqueness_threshold: float | None = None
    min_value: float | None = None
    max_value: float | None = None
    min_length: int | None = None
    max_length: int | None = None
    pattern_match_threshold: float = 0.8
    allowed_values: set[Any] | None = None
    outlier_threshold: float = 0.01
    confidence: float = 0.5
    # Human-readable notes explaining how the thresholds were chosen.
    reasoning: list[str] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Return the thresholds as a plain dictionary.

        Returns:
            A dict with one key per field. ``allowed_values`` is converted
            to a list, or left as ``None`` when it is unset.
        """
        # Compare against None rather than relying on truthiness: an
        # explicitly empty set must serialize as [] and not be confused
        # with "no allowed-values constraint" (None).
        allowed = (
            list(self.allowed_values) if self.allowed_values is not None else None
        )
        return {
            "column_name": self.column_name,
            "null_threshold": self.null_threshold,
            "uniqueness_threshold": self.uniqueness_threshold,
            "min_value": self.min_value,
            "max_value": self.max_value,
            "min_length": self.min_length,
            "max_length": self.max_length,
            "pattern_match_threshold": self.pattern_match_threshold,
            "allowed_values": allowed,
            "outlier_threshold": self.outlier_threshold,
            "confidence": self.confidence,
            "reasoning": self.reasoning,
        }
121
+
122
+
123
@dataclass
class TableThresholds:
    """Thresholds for an entire table, including its per-column thresholds."""

    table_name: str
    # Per-column thresholds keyed by column name.
    columns: dict[str, ColumnThresholds] = field(default_factory=dict)
    duplicate_threshold: float = 0.0
    row_count_min: int | None = None
    row_count_max: int | None = None
    global_null_threshold: float = 0.1
    strategy_used: TuningStrategy = TuningStrategy.BALANCED
    # Filled in with datetime.now() when the instance is created.
    tuned_at: datetime = field(default_factory=datetime.now)
    metadata: dict[str, Any] = field(default_factory=dict)

    def get_column(self, name: str) -> ColumnThresholds | None:
        """Return the thresholds stored under ``name``, or ``None`` if absent."""
        return self.columns.get(name)

    def to_dict(self) -> dict[str, Any]:
        """Return the table thresholds as a plain dictionary.

        Each column entry is converted with its own ``to_dict()``; the
        strategy is emitted as its enum value and ``tuned_at`` as an ISO
        format string.
        """
        serialized_columns = {}
        for col_name, col_thresholds in self.columns.items():
            serialized_columns[col_name] = col_thresholds.to_dict()

        return dict(
            table_name=self.table_name,
            columns=serialized_columns,
            duplicate_threshold=self.duplicate_threshold,
            row_count_min=self.row_count_min,
            row_count_max=self.row_count_max,
            global_null_threshold=self.global_null_threshold,
            strategy_used=self.strategy_used.value,
            tuned_at=self.tuned_at.isoformat(),
            metadata=self.metadata,
        )
153
+
154
+
155
+ # =============================================================================
156
+ # Strictness Presets
157
+ # =============================================================================
158
+
159
+
160
@dataclass
class StrictnessPreset:
    """Bundle of threshold multipliers associated with a strictness level."""

    null_multiplier: float = 1.0
    range_buffer: float = 0.1  # 10% buffer on ranges
    pattern_threshold: float = 0.8
    uniqueness_tolerance: float = 0.05
    outlier_sensitivity: float = 1.0

    @classmethod
    def for_strictness(cls, strictness: Strictness) -> "StrictnessPreset":
        """Return the preset registered for ``strictness``.

        Args:
            strictness: Strictness level to look up.

        Returns:
            The matching preset, or a preset built from the field defaults
            when the level has no entry in the table.
        """
        # Tuple order follows the field order:
        # (null_multiplier, range_buffer, pattern_threshold,
        #  uniqueness_tolerance, outlier_sensitivity)
        table = {
            Strictness.LOOSE: (1.5, 0.2, 0.6, 0.1, 0.5),
            Strictness.MEDIUM: (1.0, 0.1, 0.8, 0.05, 1.0),
            Strictness.STRICT: (0.5, 0.05, 0.95, 0.01, 2.0),
        }
        values = table.get(strictness)
        if values is None:
            return cls()
        return cls(*values)
197
+
198
+
199
+ # =============================================================================
200
+ # Tuning Strategy Protocol
201
+ # =============================================================================
202
+
203
+
204
class TuningStrategyImpl(ABC):
    """Abstract interface that every tuning strategy implements.

    Subclasses must provide both ``tune_column`` and ``tune_table``.
    """

    name: str = "base"

    @abstractmethod
    def tune_column(
        self,
        profile: ColumnProfile,
        context: dict[str, Any],
    ) -> ColumnThresholds:
        """Produce thresholds for one column.

        Args:
            profile: Column profile
            context: Additional context

        Returns:
            Tuned thresholds
        """

    @abstractmethod
    def tune_table(
        self,
        profile: TableProfile,
        context: dict[str, Any],
    ) -> TableThresholds:
        """Produce thresholds for one table.

        Args:
            profile: Table profile
            context: Additional context

        Returns:
            Tuned thresholds
        """
242
+
243
+
244
class ConservativeStrategy(TuningStrategyImpl):
    """Conservative tuning - strict thresholds.

    Derives tight thresholds from the observed profile: half the observed
    null ratio, a 2% range buffer and a 95% pattern match requirement.
    """

    name = "conservative"

    def tune_column(
        self,
        profile: ColumnProfile,
        context: dict[str, Any],
    ) -> ColumnThresholds:
        """Build strict thresholds for one column.

        Args:
            profile: Column profile
            context: Additional context (not read by this strategy)

        Returns:
            Thresholds with confidence 0.8 and the reasoning notes attached.
        """
        result = ColumnThresholds(column_name=profile.name)
        notes = []

        # Nulls: allow only half of what was observed.
        result.null_threshold = max(0, profile.null_ratio * 0.5)
        notes.append(f"Null threshold set to half of observed ({profile.null_ratio:.1%})")

        # Range: observed min/max widened by 2% of the span on each side.
        dist = profile.distribution
        if dist and dist.min is not None and dist.max is not None:
            span = dist.max - dist.min
            pad = span * 0.02
            result.min_value = dist.min - pad
            result.max_value = dist.max + pad
            notes.append(f"Range set with 2% buffer: [{result.min_value:.2f}, {result.max_value:.2f}]")

        # Patterns: high match ratio required.
        result.pattern_match_threshold = 0.95
        notes.append("Pattern match threshold: 95%")

        # Uniqueness: a column flagged unique must stay fully unique.
        if profile.is_unique:
            result.uniqueness_threshold = 1.0
            notes.append("Column appears unique, requiring 100% uniqueness")

        result.confidence = 0.8
        result.reasoning = notes
        return result

    def tune_table(
        self,
        profile: TableProfile,
        context: dict[str, Any],
    ) -> TableThresholds:
        """Build strict thresholds for a table and each of its columns.

        Args:
            profile: Table profile
            context: Additional context, forwarded to ``tune_column``

        Returns:
            Table thresholds that allow no duplicates and at most 5% nulls
            overall.
        """
        result = TableThresholds(
            table_name=profile.name,
            strategy_used=TuningStrategy.CONSERVATIVE,
        )

        for column in profile.columns:
            result.columns[column.name] = self.tune_column(column, context)

        result.duplicate_threshold = 0.0  # No duplicates allowed
        result.global_null_threshold = 0.05  # Max 5% nulls overall
        return result
308
+
309
+
310
class BalancedStrategy(TuningStrategyImpl):
    """Balanced tuning - middle ground.

    Balances between precision and recall.
    """

    name = "balanced"

    def tune_column(
        self,
        profile: ColumnProfile,
        context: dict[str, Any],
    ) -> ColumnThresholds:
        """Build moderately buffered thresholds for one column.

        Args:
            profile: Column profile
            context: Additional context (not read by this strategy)

        Returns:
            Thresholds with confidence 0.7 and the reasoning notes attached.
        """
        thresholds = ColumnThresholds(column_name=profile.name)
        reasoning = []

        # Null threshold - match observed with small buffer. Clamped to 1.0
        # because a ratio threshold above 100% is meaningless; this mirrors
        # the clamp PermissiveStrategy applies.
        thresholds.null_threshold = min(1.0, profile.null_ratio * 1.2 + 0.01)
        reasoning.append(f"Null threshold: observed + 20% buffer = {thresholds.null_threshold:.1%}")

        # Range thresholds - moderate buffer
        if profile.distribution:
            dist = profile.distribution
            if dist.min is not None and dist.max is not None:
                range_size = dist.max - dist.min
                buffer = range_size * 0.1  # 10% buffer
                thresholds.min_value = dist.min - buffer
                thresholds.max_value = dist.max + buffer
                reasoning.append(f"Range with 10% buffer: [{thresholds.min_value:.2f}, {thresholds.max_value:.2f}]")

        # Length constraints
        if profile.min_length is not None:
            thresholds.min_length = max(0, profile.min_length - 1)
            # Test against None, not truthiness: an observed max length of 0
            # is a real observation and must still produce an upper bound.
            thresholds.max_length = (
                profile.max_length + 5 if profile.max_length is not None else None
            )
            reasoning.append(f"Length: [{thresholds.min_length}, {thresholds.max_length}]")

        # Pattern threshold
        thresholds.pattern_match_threshold = 0.8
        reasoning.append("Pattern match threshold: 80%")

        # Uniqueness
        if profile.is_unique:
            thresholds.uniqueness_threshold = 0.99  # Allow tiny margin
            reasoning.append("Near-unique required (99%)")
        elif profile.unique_ratio > 0.9:
            thresholds.uniqueness_threshold = 0.9
            reasoning.append("High uniqueness required (90%)")

        thresholds.confidence = 0.7
        thresholds.reasoning = reasoning

        return thresholds

    def tune_table(
        self,
        profile: TableProfile,
        context: dict[str, Any],
    ) -> TableThresholds:
        """Build balanced thresholds for a table and each of its columns.

        Args:
            profile: Table profile
            context: Additional context, forwarded to ``tune_column``

        Returns:
            Table thresholds whose duplicate threshold is the observed
            duplicate row ratio plus 10%, capped at 1.0, with a 10% global
            null threshold.
        """
        thresholds = TableThresholds(
            table_name=profile.name,
            strategy_used=TuningStrategy.BALANCED,
        )

        for col_profile in profile.columns:
            col_thresholds = self.tune_column(col_profile, context)
            thresholds.columns[col_profile.name] = col_thresholds

        # Table-level. The duplicate ratio threshold is capped at 1.0 so a
        # heavily duplicated table cannot yield a threshold above 100%.
        thresholds.duplicate_threshold = min(1.0, profile.duplicate_row_ratio * 1.1)
        thresholds.global_null_threshold = 0.1

        return thresholds
382
+
383
+
384
class PermissiveStrategy(TuningStrategyImpl):
    """Permissive tuning - loose thresholds.

    Derives generous thresholds from the observed profile: roughly double
    the observed null ratio, a 25% range buffer and a 60% pattern match
    requirement.
    """

    name = "permissive"

    def tune_column(
        self,
        profile: ColumnProfile,
        context: dict[str, Any],
    ) -> ColumnThresholds:
        """Build loose thresholds for one column.

        Args:
            profile: Column profile
            context: Additional context (not read by this strategy)

        Returns:
            Thresholds with confidence 0.6 and the reasoning notes attached.
        """
        result = ColumnThresholds(column_name=profile.name)
        notes = []

        # Nulls: twice the observed ratio plus a flat 0.05, capped at 1.0.
        # (The note below only mentions the 2x part.)
        generous_nulls = profile.null_ratio * 2 + 0.05
        result.null_threshold = min(1.0, generous_nulls)
        notes.append(f"Null threshold: 2x observed = {result.null_threshold:.1%}")

        # Range: observed min/max widened by 25% of the span on each side.
        dist = profile.distribution
        if dist and dist.min is not None and dist.max is not None:
            span = dist.max - dist.min
            pad = span * 0.25
            result.min_value = dist.min - pad
            result.max_value = dist.max + pad
            notes.append(f"Wide range: [{result.min_value:.2f}, {result.max_value:.2f}]")

        # Patterns: low match ratio required.
        result.pattern_match_threshold = 0.6
        notes.append("Pattern match threshold: 60%")

        result.confidence = 0.6
        result.reasoning = notes
        return result

    def tune_table(
        self,
        profile: TableProfile,
        context: dict[str, Any],
    ) -> TableThresholds:
        """Build loose thresholds for a table and each of its columns.

        Args:
            profile: Table profile
            context: Additional context, forwarded to ``tune_column``

        Returns:
            Table thresholds allowing 5% duplicates and 20% nulls overall.
        """
        result = TableThresholds(
            table_name=profile.name,
            strategy_used=TuningStrategy.PERMISSIVE,
        )

        for column in profile.columns:
            result.columns[column.name] = self.tune_column(column, context)

        result.duplicate_threshold = 0.05  # Allow some duplicates
        result.global_null_threshold = 0.2
        return result
441
+
442
+
443
+ class AdaptiveStrategy(TuningStrategyImpl):
444
+ """Adaptive tuning - adjusts based on data characteristics.
445
+
446
+ Analyzes data quality signals to choose appropriate thresholds.
447
+ """
448
+
449
+ name = "adaptive"
450
+
451
def tune_column(
    self,
    profile: ColumnProfile,
    context: dict[str, Any],
) -> ColumnThresholds:
    """Build thresholds whose strictness follows the column's quality score.

    The score from ``self._assess_quality(profile)`` selects one of three
    tiers (above 0.8, above 0.5, otherwise) that set the null multiplier,
    range buffer and pattern threshold.

    Args:
        profile: Column profile
        context: Additional context (not read by this method)

    Returns:
        Thresholds whose confidence equals the quality score.
    """
    thresholds = ColumnThresholds(column_name=profile.name)
    reasoning = []

    # Determine data quality score
    quality_score = self._assess_quality(profile)
    reasoning.append(f"Data quality score: {quality_score:.2f}")

    # Adjust strictness based on quality
    if quality_score > 0.8:
        # High quality - can be stricter
        null_mult = 0.8
        range_buffer = 0.05
        pattern_threshold = 0.9
    elif quality_score > 0.5:
        # Medium quality - balanced
        null_mult = 1.2
        range_buffer = 0.1
        pattern_threshold = 0.8
    else:
        # Low quality - be permissive
        null_mult = 1.5
        range_buffer = 0.2
        pattern_threshold = 0.6

    # Apply adjusted thresholds. The null threshold is a ratio, so it is
    # clamped to 1.0 (as PermissiveStrategy does); without the clamp a
    # mostly-null column yields a threshold above 100%.
    thresholds.null_threshold = min(1.0, profile.null_ratio * null_mult + 0.01)

    if profile.distribution:
        dist = profile.distribution
        if dist.min is not None and dist.max is not None:
            range_size = dist.max - dist.min
            buffer = range_size * range_buffer
            thresholds.min_value = dist.min - buffer
            thresholds.max_value = dist.max + buffer

    thresholds.pattern_match_threshold = pattern_threshold

    # Adaptive uniqueness
    if profile.is_unique:
        thresholds.uniqueness_threshold = 1.0 if quality_score > 0.7 else 0.99
    elif profile.unique_ratio > 0.9:
        thresholds.uniqueness_threshold = profile.unique_ratio * 0.95

    # Length constraints: buffer is 10% of the average length (at least 1),
    # or 2 when no average length is available.
    if profile.min_length is not None:
        length_buffer = max(1, int(profile.avg_length * 0.1)) if profile.avg_length else 2
        thresholds.min_length = max(0, profile.min_length - length_buffer)
        thresholds.max_length = (profile.max_length or 0) + length_buffer * 2

    thresholds.confidence = quality_score
    thresholds.reasoning = reasoning

    return thresholds
509
+
510
+ def tune_table(
511
+ self,
512
+ profile: TableProfile,
513
+ context: dict[str, Any],
514
+ ) -> TableThresholds:
515
+ thresholds = TableThresholds(
516
+ table_name=profile.name,
517
+ strategy_used=TuningStrategy.ADAPTIVE,
518
+ )
519
+
520
+ # Calculate overall quality
521
+ col_qualities = []
522
+ for col_profile in profile.columns:
523
+ quality = self._assess_quality(col_profile)
524
+ col_qualities.append(quality)
525
+ col_thresholds = self.tune_column(col_profile, context)
526
+ thresholds.columns[col_profile.name] = col_thresholds
527
+
528
+ avg_quality = sum(col_qualities) / len(col_qualities) if col_qualities else 0.5
529
+
530
+ # Adaptive table thresholds
531
+ if avg_quality > 0.8:
532
+ thresholds.duplicate_threshold = 0.0
533
+ thresholds.global_null_threshold = 0.05
534
+ elif avg_quality > 0.5:
535
+ thresholds.duplicate_threshold = profile.duplicate_row_ratio * 1.1
536
+ thresholds.global_null_threshold = 0.1
537
+ else:
538
+ thresholds.duplicate_threshold = 0.05
539
+ thresholds.global_null_threshold = 0.2
540
+
541
+ thresholds.metadata["overall_quality"] = avg_quality
542
+
543
+ return thresholds
544
+
545
+ def _assess_quality(self, profile: ColumnProfile) -> float:
546
+ """Assess data quality for a column."""
547
+ scores = []
548
+
549
+ # Completeness score (inverse of null ratio)
550
+ completeness = 1.0 - profile.null_ratio
551
+ scores.append(completeness)
552
+
553
+ # Consistency score (based on patterns)
554
+ if profile.detected_patterns:
555
+ best_match = max(p.match_ratio for p in profile.detected_patterns)
556
+ scores.append(best_match)
557
+
558
+ # Uniqueness appropriateness
559
+ if profile.is_unique or profile.unique_ratio > 0.9:
560
+ # High uniqueness is often good for IDs
561
+ scores.append(0.9)
562
+ elif profile.unique_ratio < 0.01:
563
+ # Very low uniqueness might be categorical (ok) or constant (suspicious)
564
+ scores.append(0.5 if not profile.is_constant else 0.3)
565
+ else:
566
+ scores.append(0.7)
567
+
568
+ return sum(scores) / len(scores) if scores else 0.5
569
+
570
+
571
class StatisticalStrategy(TuningStrategyImpl):
    """Statistical tuning - uses statistical methods.

    Uses a Wilson score interval for null ratios, the IQR rule (or a
    standard-deviation buffer) for value ranges, and a normal-approximation
    interval for the duplicate ratio.
    """

    name = "statistical"

    def __init__(
        self,
        percentile_low: float = 0.01,
        percentile_high: float = 0.99,
        iqr_multiplier: float = 1.5,
    ):
        """Store the tuning parameters.

        Args:
            percentile_low: Lower percentile setting (stored; not read by the
                methods of this class).
            percentile_high: Upper percentile setting (stored; not read by the
                methods of this class).
            iqr_multiplier: Number of IQRs added beyond Q1/Q3 for the range.
        """
        self.percentile_low = percentile_low
        self.percentile_high = percentile_high
        self.iqr_multiplier = iqr_multiplier

    def tune_column(
        self,
        profile: ColumnProfile,
        context: dict[str, Any],
    ) -> ColumnThresholds:
        """Derive thresholds for one column using statistical bounds.

        Args:
            profile: Column profile to tune.
            context: Accepted for interface parity; not used here.

        Returns:
            Column thresholds with a fixed confidence of 0.85 and a reasoning
            entry for each threshold that was derived.
        """
        thresholds = ColumnThresholds(column_name=profile.name)
        reasoning = []

        # Null threshold using binomial confidence interval
        n = profile.row_count
        p = profile.null_ratio
        if n > 0:
            # Wilson score interval (upper bound only)
            z = 2.576  # 99% confidence
            denominator = 1 + z * z / n
            centre = p + z * z / (2 * n)
            margin = z * math.sqrt((p * (1 - p) + z * z / (4 * n)) / n)
            upper_bound = min(1.0, (centre + margin) / denominator)
            thresholds.null_threshold = upper_bound
            reasoning.append(f"Null threshold from Wilson CI: {upper_bound:.3f}")

        # Range using percentiles or IQR
        if profile.distribution:
            dist = profile.distribution
            if dist.q1 is not None and dist.q3 is not None:
                # Use IQR method
                iqr = dist.q3 - dist.q1
                lower = dist.q1 - self.iqr_multiplier * iqr
                upper = dist.q3 + self.iqr_multiplier * iqr
                thresholds.min_value = lower
                thresholds.max_value = upper
                reasoning.append(f"Range from IQR ({self.iqr_multiplier}x): [{lower:.2f}, {upper:.2f}]")
            elif dist.min is not None and dist.max is not None:
                # Use min/max with buffer based on std
                if dist.std:
                    buffer = dist.std * 3  # 3 sigma
                    method = "3-sigma"
                else:
                    # No usable std: fall back to 10% of the observed range and
                    # say so, rather than mislabelling it as 3-sigma.
                    buffer = (dist.max - dist.min) * 0.1
                    method = "10% range buffer"
                thresholds.min_value = dist.min - buffer
                thresholds.max_value = dist.max + buffer
                reasoning.append(
                    f"Range from {method}: [{thresholds.min_value:.2f}, {thresholds.max_value:.2f}]"
                )

        # Pattern threshold based on distribution
        if profile.detected_patterns:
            match_ratios = [p.match_ratio for p in profile.detected_patterns]
            if len(match_ratios) > 1:
                # Roughly the 10th percentile; this is the minimum for fewer
                # than ten patterns because the index floors to 0.
                threshold = sorted(match_ratios)[len(match_ratios) // 10]
            else:
                threshold = match_ratios[0] * 0.9
            thresholds.pattern_match_threshold = threshold
            reasoning.append(f"Pattern threshold from distribution: {threshold:.2f}")

        thresholds.confidence = 0.85
        thresholds.reasoning = reasoning

        return thresholds

    def tune_table(
        self,
        profile: TableProfile,
        context: dict[str, Any],
    ) -> TableThresholds:
        """Tune every column and derive a statistical duplicate threshold.

        Args:
            profile: Table profile to tune.
            context: Passed through unchanged to ``tune_column``.

        Returns:
            Table thresholds tagged with ``TuningStrategy.STATISTICAL``.
        """
        thresholds = TableThresholds(
            table_name=profile.name,
            strategy_used=TuningStrategy.STATISTICAL,
        )

        for col_profile in profile.columns:
            thresholds.columns[col_profile.name] = self.tune_column(col_profile, context)

        # Statistical duplicate threshold
        n = profile.row_count
        p = profile.duplicate_row_ratio
        if n > 0 and p > 0:
            # Upper end of a normal-approximation interval at 99% confidence.
            z = 2.576
            margin = z * math.sqrt(p * (1 - p) / n)
            thresholds.duplicate_threshold = min(1.0, p + margin)
        else:
            # No rows or no observed duplicates: fixed 1% allowance.
            thresholds.duplicate_threshold = 0.01

        return thresholds
675
+
676
+
677
class DomainAwareStrategy(TuningStrategyImpl):
    """Domain-aware tuning - uses domain-specific knowledge.

    Applies the fixed defaults in ``DOMAIN_DEFAULTS`` for a column's inferred
    type and falls back to profile-derived values for anything not listed.
    """

    name = "domain_aware"

    # Domain-specific defaults
    DOMAIN_DEFAULTS: dict[DataType, dict[str, Any]] = {
        DataType.EMAIL: {
            "null_threshold": 0.1,
            "pattern_threshold": 0.95,
            "min_length": 5,
            "max_length": 254,
        },
        DataType.PHONE: {
            "null_threshold": 0.2,
            "pattern_threshold": 0.9,
            "min_length": 7,
            "max_length": 20,
        },
        DataType.UUID: {
            "null_threshold": 0.0,
            "pattern_threshold": 0.99,
            "uniqueness_threshold": 1.0,
            "min_length": 36,
            "max_length": 36,
        },
        DataType.DATE: {
            "null_threshold": 0.1,
            "pattern_threshold": 0.95,
        },
        DataType.IDENTIFIER: {
            "null_threshold": 0.0,
            "uniqueness_threshold": 1.0,
        },
        DataType.CATEGORICAL: {
            "null_threshold": 0.05,
            "max_cardinality": 100,
        },
        DataType.CURRENCY: {
            "null_threshold": 0.05,
            "min_value": 0.0,
        },
        DataType.PERCENTAGE: {
            "null_threshold": 0.05,
            "min_value": 0.0,
            "max_value": 100.0,
        },
        DataType.BOOLEAN: {
            "null_threshold": 0.0,
            "allowed_values": {True, False, 0, 1, "true", "false", "yes", "no"},
        },
        DataType.KOREAN_PHONE: {
            "null_threshold": 0.1,
            "pattern_threshold": 0.95,
            "min_length": 10,
            "max_length": 13,
        },
        DataType.KOREAN_RRN: {
            "null_threshold": 0.0,
            "pattern_threshold": 0.99,
            "min_length": 13,
            "max_length": 14,
        },
    }

    def tune_column(
        self,
        profile: ColumnProfile,
        context: dict[str, Any],
    ) -> ColumnThresholds:
        """Derive thresholds for one column from its inferred data type.

        Args:
            profile: Column profile to tune.
            context: Accepted for interface parity; not used here.

        Returns:
            Column thresholds with a fixed confidence of 0.75.
        """
        thresholds = ColumnThresholds(column_name=profile.name)

        # Get domain defaults for this type
        defaults = self.DOMAIN_DEFAULTS.get(profile.inferred_type, {})
        if defaults:
            reasoning = [f"Using domain defaults for {profile.inferred_type.value}"]
        else:
            # Do not claim domain defaults were applied when the type has none.
            reasoning = [
                f"No domain defaults for {profile.inferred_type.value}; "
                "using profile-derived thresholds"
            ]

        # Apply domain defaults
        if "null_threshold" in defaults:
            thresholds.null_threshold = defaults["null_threshold"]
        else:
            thresholds.null_threshold = profile.null_ratio * 1.2 + 0.01

        if "pattern_threshold" in defaults:
            thresholds.pattern_match_threshold = defaults["pattern_threshold"]

        if "min_length" in defaults:
            thresholds.min_length = defaults["min_length"]
        elif profile.min_length is not None:
            thresholds.min_length = profile.min_length

        if "max_length" in defaults:
            thresholds.max_length = defaults["max_length"]
        elif profile.max_length is not None:
            thresholds.max_length = profile.max_length

        if "uniqueness_threshold" in defaults:
            thresholds.uniqueness_threshold = defaults["uniqueness_threshold"]

        if "min_value" in defaults:
            thresholds.min_value = defaults["min_value"]

        if "max_value" in defaults:
            thresholds.max_value = defaults["max_value"]
        elif profile.distribution and profile.distribution.max is not None:
            observed_max = profile.distribution.max
            # Leave 10% headroom *above* the observed maximum. Multiplying a
            # negative maximum by 1.1 would move the bound below the data, and
            # a maximum of exactly 0 must not be skipped as falsy.
            thresholds.max_value = (
                observed_max * 1.1 if observed_max >= 0 else observed_max * 0.9
            )

        if "allowed_values" in defaults:
            # Copy so that callers mutating the result cannot corrupt the
            # class-level defaults shared by every tuned column.
            thresholds.allowed_values = set(defaults["allowed_values"])

        thresholds.confidence = 0.75
        thresholds.reasoning = reasoning

        return thresholds

    def tune_table(
        self,
        profile: TableProfile,
        context: dict[str, Any],
    ) -> TableThresholds:
        """Tune every column and set the table's duplicate threshold.

        Args:
            profile: Table profile to tune.
            context: Passed through unchanged to ``tune_column``.

        Returns:
            Table thresholds tagged with ``TuningStrategy.DOMAIN_AWARE``.
        """
        thresholds = TableThresholds(
            table_name=profile.name,
            strategy_used=TuningStrategy.DOMAIN_AWARE,
        )

        for col_profile in profile.columns:
            thresholds.columns[col_profile.name] = self.tune_column(col_profile, context)

        # Table-level thresholds
        # Check if any column is a unique identifier
        has_identifier = any(
            col.inferred_type == DataType.IDENTIFIER or col.is_unique
            for col in profile.columns
        )

        if has_identifier:
            # A unique column means no two rows can be fully identical.
            thresholds.duplicate_threshold = 0.0
        else:
            thresholds.duplicate_threshold = profile.duplicate_row_ratio * 1.1

        return thresholds
822
+
823
+
824
+ # =============================================================================
825
+ # Strategy Registry
826
+ # =============================================================================
827
+
828
+
829
class StrategyRegistry:
    """Name-keyed collection of tuning strategy instances."""

    def __init__(self) -> None:
        # Maps each strategy's ``name`` attribute to the registered instance.
        self._strategies: dict[str, TuningStrategyImpl] = {}

    def register(self, strategy: TuningStrategyImpl) -> None:
        """Store ``strategy`` under its ``name``, replacing any earlier entry."""
        key = strategy.name
        self._strategies[key] = strategy

    def get(self, name: str) -> TuningStrategyImpl:
        """Return the strategy registered as ``name``.

        Raises:
            KeyError: If no strategy with that name has been registered.
        """
        try:
            return self._strategies[name]
        except KeyError:
            raise KeyError(f"Unknown strategy: {name}") from None

    def list_strategies(self) -> list[str]:
        """Return the registered names in registration order."""
        return list(self._strategies)
848
+
849
+
850
# Global registry, populated at import time with one instance of each
# built-in strategy (registration order is preserved by the registry).
strategy_registry = StrategyRegistry()
for _builtin_strategy_cls in (
    ConservativeStrategy,
    BalancedStrategy,
    PermissiveStrategy,
    AdaptiveStrategy,
    StatisticalStrategy,
    DomainAwareStrategy,
):
    strategy_registry.register(_builtin_strategy_cls())
del _builtin_strategy_cls  # keep the module namespace unchanged
858
+
859
+
860
+ # =============================================================================
861
+ # Threshold Tuner
862
+ # =============================================================================
863
+
864
+
865
@dataclass
class TunerConfig:
    """Settings bundle accepted by ``ThresholdTuner``."""

    # Name of the tuning strategy (for example "adaptive").
    strategy: str = "adaptive"
    # Strictness level; the tuner maps it to a ``StrictnessPreset``.
    strictness: Strictness = Strictness.MEDIUM
    # NOTE(review): the three options below are not read anywhere in this
    # section of the file - confirm where (or whether) they take effect.
    use_domain_hints: bool = True
    min_confidence: float = 0.5
    combine_strategies: bool = False
874
+
875
+
876
class ThresholdTuner:
    """Main interface for threshold tuning.

    Analyzes data profiles and determines optimal thresholds.

    Example:
        tuner = ThresholdTuner(strategy="adaptive")
        thresholds = tuner.tune(profile)

        for col_name, col_thresh in thresholds.columns.items():
            print(f"{col_name}: null <= {col_thresh.null_threshold:.1%}")
    """

    def __init__(
        self,
        strategy: str | TuningStrategyImpl | None = None,
        config: TunerConfig | None = None,
    ):
        """Create a tuner.

        Args:
            strategy: Strategy instance or registered strategy name. When
                omitted, ``config.strategy`` is used (``"adaptive"`` for a
                default config), so a config-only construction is honoured
                instead of silently falling back to "adaptive".
            config: Tuner settings; a default ``TunerConfig`` when omitted.

        Raises:
            KeyError: If a strategy name is not present in the registry.
        """
        self.config = config or TunerConfig()

        if strategy is None:
            strategy = self.config.strategy

        if isinstance(strategy, TuningStrategyImpl):
            self._strategy = strategy
        else:
            self._strategy = strategy_registry.get(strategy)

    def tune(
        self,
        profile: TableProfile,
        context: dict[str, Any] | None = None,
    ) -> TableThresholds:
        """Tune thresholds for a table profile.

        Args:
            profile: Table profile to tune
            context: Additional context; it is copied, so the caller's dict
                is never modified

        Returns:
            Tuned thresholds
        """
        # Work on a copy: the preset is injected below and must not leak into
        # a dict owned by the caller.
        context = dict(context) if context else {}

        # Apply strictness preset
        preset = StrictnessPreset.for_strictness(self.config.strictness)
        context["strictness_preset"] = preset

        # Tune using strategy
        thresholds = self._strategy.tune_table(profile, context)

        # Apply strictness multipliers
        self._apply_strictness(thresholds, preset)

        return thresholds

    def tune_column(
        self,
        profile: ColumnProfile,
        context: dict[str, Any] | None = None,
    ) -> ColumnThresholds:
        """Tune thresholds for a single column.

        Args:
            profile: Column profile
            context: Additional context; it is copied, so the caller's dict
                is never modified

        Returns:
            Tuned thresholds
        """
        context = dict(context) if context else {}

        preset = StrictnessPreset.for_strictness(self.config.strictness)
        context["strictness_preset"] = preset

        thresholds = self._strategy.tune_column(profile, context)

        # Apply strictness
        thresholds.null_threshold *= preset.null_multiplier
        # NOTE(review): unlike ``_apply_strictness`` this moves the pattern
        # threshold part of the way towards 1.0 instead of flooring it at
        # ``preset.pattern_threshold``, and it leaves the value range alone.
        # Kept as-is; confirm whether the two paths are meant to differ.
        current = thresholds.pattern_match_threshold
        thresholds.pattern_match_threshold = min(
            1.0,
            current + (1 - current) * (1 - preset.pattern_threshold),
        )

        return thresholds

    def _apply_strictness(
        self,
        thresholds: TableThresholds,
        preset: StrictnessPreset,
    ) -> None:
        """Apply strictness preset to every column's thresholds, in place.

        Args:
            thresholds: Table thresholds whose columns are adjusted.
            preset: Supplies ``null_multiplier``, ``pattern_threshold`` and
                ``range_buffer``.
        """
        for col_thresholds in thresholds.columns.values():
            # Adjust null threshold
            col_thresholds.null_threshold *= preset.null_multiplier

            # Adjust pattern threshold (the preset acts as a floor)
            if col_thresholds.pattern_match_threshold < preset.pattern_threshold:
                col_thresholds.pattern_match_threshold = preset.pattern_threshold

            # Adjust range buffer (only when both bounds are known)
            if col_thresholds.min_value is not None and col_thresholds.max_value is not None:
                range_size = col_thresholds.max_value - col_thresholds.min_value
                buffer = range_size * preset.range_buffer

                col_thresholds.min_value -= buffer
                col_thresholds.max_value += buffer

    def compare_strategies(
        self,
        profile: TableProfile,
        strategies: list[str] | None = None,
    ) -> dict[str, TableThresholds]:
        """Compare thresholds from different strategies.

        Each strategy is run with an empty context and no strictness preset is
        applied. A strategy that raises is logged and left out of the result.

        Args:
            profile: Profile to analyze
            strategies: List of strategy names (None = all)

        Returns:
            Dictionary mapping strategy name to thresholds
        """
        if strategies is None:
            strategies = strategy_registry.list_strategies()

        results = {}
        for strategy_name in strategies:
            # Deliberately best-effort: one failing strategy must not prevent
            # the others from being compared.
            try:
                strategy = strategy_registry.get(strategy_name)
                results[strategy_name] = strategy.tune_table(profile, {})
            except Exception as e:
                logger.warning("Strategy %s failed: %s", strategy_name, e)

        return results
1009
+
1010
+
1011
+ # =============================================================================
1012
+ # A/B Testing Support
1013
+ # =============================================================================
1014
+
1015
+
1016
@dataclass
class ThresholdTestResult:
    """Outcome of comparing two threshold sets, labelled A and B."""

    # The two threshold configurations that were compared.
    threshold_a: TableThresholds
    threshold_b: TableThresholds
    # Violation totals counted for each configuration.
    violations_a: int
    violations_b: int
    # False-positive counts for each configuration.
    false_positives_a: int
    false_positives_b: int
    # Human-readable verdict.
    recommendation: str
    # Free-form extras; every instance gets its own empty dict by default.
    details: dict[str, Any] = field(default_factory=dict)
1028
+
1029
+
1030
class ThresholdTester:
    """A/B comparison of two threshold configurations on the same data.

    The configuration that produces fewer violations is recommended.
    """

    def __init__(self) -> None:
        """Create a tester; it holds no state."""

    def compare(
        self,
        data: pl.DataFrame,
        threshold_a: TableThresholds,
        threshold_b: TableThresholds,
        ground_truth: dict[str, bool] | None = None,
    ) -> ThresholdTestResult:
        """Count violations under both configurations and recommend one.

        Args:
            data: Data to validate
            threshold_a: First threshold set
            threshold_b: Second threshold set
            ground_truth: Optional ground truth (row valid/invalid); it is
                currently not evaluated, so false positives are reported as 0

        Returns:
            Comparison results
        """
        count_a = self._count_violations(data, threshold_a)
        count_b = self._count_violations(data, threshold_b)

        # False positives would need per-row tracking against ``ground_truth``;
        # that is not implemented, so both stay at zero either way.
        false_pos_a = 0
        false_pos_b = 0

        # "Strong" means the loser has more than 1.2x the winner's violations.
        if count_a == count_b:
            recommendation = "No significant difference"
        elif count_a < count_b:
            violations_a, violations_b = count_a, count_b
            if violations_a * 1.2 < violations_b:
                recommendation = f"Strong preference for A ({violations_a} vs {violations_b} violations)"
            else:
                recommendation = f"Slight preference for A ({violations_a} vs {violations_b} violations)"
        else:
            violations_a, violations_b = count_a, count_b
            if violations_b * 1.2 < violations_a:
                recommendation = f"Strong preference for B ({violations_b} vs {violations_a} violations)"
            else:
                recommendation = f"Slight preference for B ({violations_b} vs {violations_a} violations)"

        return ThresholdTestResult(
            threshold_a=threshold_a,
            threshold_b=threshold_b,
            violations_a=count_a,
            violations_b=count_b,
            false_positives_a=false_pos_a,
            false_positives_b=false_pos_b,
            recommendation=recommendation,
        )

    def _count_violations(
        self,
        data: pl.DataFrame,
        thresholds: TableThresholds,
    ) -> int:
        """Total the threshold violations found in ``data``.

        A column whose null ratio exceeds its null threshold adds one; every
        value below ``min_value`` or above ``max_value`` adds one each.
        Threshold entries for columns missing from ``data`` are ignored.
        """
        total = 0

        for name, limits in thresholds.columns.items():
            if name in data.columns:
                series = data.get_column(name)

                # Null check: at most one violation per column.
                row_total = len(series)
                null_ratio = series.null_count() / row_total if row_total > 0 else 0
                if null_ratio > limits.null_threshold:
                    total += 1

                # Range checks: one violation per out-of-range value.
                lower_bound = limits.min_value
                if lower_bound is not None:
                    total += (series < lower_bound).sum()

                upper_bound = limits.max_value
                if upper_bound is not None:
                    total += (series > upper_bound).sum()

        return total
1123
+
1124
+
1125
+ # =============================================================================
1126
+ # Convenience Functions
1127
+ # =============================================================================
1128
+
1129
+
1130
def tune_thresholds(
    profile: TableProfile,
    strategy: str = "adaptive",
    strictness: Strictness = Strictness.MEDIUM,
) -> TableThresholds:
    """One-call helper: build a tuner and tune ``profile`` with it.

    Args:
        profile: Profile to tune
        strategy: Name of the tuning strategy
        strictness: Desired strictness level

    Returns:
        Tuned thresholds
    """
    settings = TunerConfig(strategy=strategy, strictness=strictness)
    return ThresholdTuner(strategy=strategy, config=settings).tune(profile)
1148
+
1149
+
1150
def get_available_strategies() -> list[str]:
    """Return the strategy names known to the module-level registry."""
    available = strategy_registry.list_strategies()
    return available
1153
+
1154
+
1155
def create_tuner(
    strategy: str = "adaptive",
    strictness: str = "medium",
) -> ThresholdTuner:
    """Build a ``ThresholdTuner`` from plain string settings.

    Args:
        strategy: Strategy name
        strictness: Strictness level ("loose", "medium", "strict"); it is
            converted with ``Strictness(strictness)``

    Returns:
        Configured tuner
    """
    settings = TunerConfig(
        strategy=strategy,
        strictness=Strictness(strictness),
    )
    return ThresholdTuner(strategy=strategy, config=settings)