truthound 1.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (877) hide show
  1. truthound/__init__.py +162 -0
  2. truthound/adapters.py +100 -0
  3. truthound/api.py +365 -0
  4. truthound/audit/__init__.py +248 -0
  5. truthound/audit/core.py +967 -0
  6. truthound/audit/filters.py +620 -0
  7. truthound/audit/formatters.py +707 -0
  8. truthound/audit/logger.py +902 -0
  9. truthound/audit/middleware.py +571 -0
  10. truthound/audit/storage.py +1083 -0
  11. truthound/benchmark/__init__.py +123 -0
  12. truthound/benchmark/base.py +757 -0
  13. truthound/benchmark/comparison.py +635 -0
  14. truthound/benchmark/generators.py +706 -0
  15. truthound/benchmark/reporters.py +718 -0
  16. truthound/benchmark/runner.py +635 -0
  17. truthound/benchmark/scenarios.py +712 -0
  18. truthound/cache.py +252 -0
  19. truthound/checkpoint/__init__.py +136 -0
  20. truthound/checkpoint/actions/__init__.py +164 -0
  21. truthound/checkpoint/actions/base.py +324 -0
  22. truthound/checkpoint/actions/custom.py +234 -0
  23. truthound/checkpoint/actions/discord_notify.py +290 -0
  24. truthound/checkpoint/actions/email_notify.py +405 -0
  25. truthound/checkpoint/actions/github_action.py +406 -0
  26. truthound/checkpoint/actions/opsgenie.py +1499 -0
  27. truthound/checkpoint/actions/pagerduty.py +226 -0
  28. truthound/checkpoint/actions/slack_notify.py +233 -0
  29. truthound/checkpoint/actions/store_result.py +249 -0
  30. truthound/checkpoint/actions/teams_notify.py +1570 -0
  31. truthound/checkpoint/actions/telegram_notify.py +419 -0
  32. truthound/checkpoint/actions/update_docs.py +552 -0
  33. truthound/checkpoint/actions/webhook.py +293 -0
  34. truthound/checkpoint/analytics/__init__.py +147 -0
  35. truthound/checkpoint/analytics/aggregations/__init__.py +23 -0
  36. truthound/checkpoint/analytics/aggregations/rollup.py +481 -0
  37. truthound/checkpoint/analytics/aggregations/time_bucket.py +306 -0
  38. truthound/checkpoint/analytics/analyzers/__init__.py +17 -0
  39. truthound/checkpoint/analytics/analyzers/anomaly.py +386 -0
  40. truthound/checkpoint/analytics/analyzers/base.py +270 -0
  41. truthound/checkpoint/analytics/analyzers/forecast.py +421 -0
  42. truthound/checkpoint/analytics/analyzers/trend.py +314 -0
  43. truthound/checkpoint/analytics/models.py +292 -0
  44. truthound/checkpoint/analytics/protocols.py +549 -0
  45. truthound/checkpoint/analytics/service.py +718 -0
  46. truthound/checkpoint/analytics/stores/__init__.py +16 -0
  47. truthound/checkpoint/analytics/stores/base.py +306 -0
  48. truthound/checkpoint/analytics/stores/memory_store.py +353 -0
  49. truthound/checkpoint/analytics/stores/sqlite_store.py +557 -0
  50. truthound/checkpoint/analytics/stores/timescale_store.py +501 -0
  51. truthound/checkpoint/async_actions.py +794 -0
  52. truthound/checkpoint/async_base.py +708 -0
  53. truthound/checkpoint/async_checkpoint.py +617 -0
  54. truthound/checkpoint/async_runner.py +639 -0
  55. truthound/checkpoint/checkpoint.py +527 -0
  56. truthound/checkpoint/ci/__init__.py +61 -0
  57. truthound/checkpoint/ci/detector.py +355 -0
  58. truthound/checkpoint/ci/reporter.py +436 -0
  59. truthound/checkpoint/ci/templates.py +454 -0
  60. truthound/checkpoint/circuitbreaker/__init__.py +133 -0
  61. truthound/checkpoint/circuitbreaker/breaker.py +542 -0
  62. truthound/checkpoint/circuitbreaker/core.py +252 -0
  63. truthound/checkpoint/circuitbreaker/detection.py +459 -0
  64. truthound/checkpoint/circuitbreaker/middleware.py +389 -0
  65. truthound/checkpoint/circuitbreaker/registry.py +357 -0
  66. truthound/checkpoint/distributed/__init__.py +139 -0
  67. truthound/checkpoint/distributed/backends/__init__.py +35 -0
  68. truthound/checkpoint/distributed/backends/celery_backend.py +503 -0
  69. truthound/checkpoint/distributed/backends/kubernetes_backend.py +696 -0
  70. truthound/checkpoint/distributed/backends/local_backend.py +397 -0
  71. truthound/checkpoint/distributed/backends/ray_backend.py +625 -0
  72. truthound/checkpoint/distributed/base.py +774 -0
  73. truthound/checkpoint/distributed/orchestrator.py +765 -0
  74. truthound/checkpoint/distributed/protocols.py +842 -0
  75. truthound/checkpoint/distributed/registry.py +449 -0
  76. truthound/checkpoint/idempotency/__init__.py +120 -0
  77. truthound/checkpoint/idempotency/core.py +295 -0
  78. truthound/checkpoint/idempotency/fingerprint.py +454 -0
  79. truthound/checkpoint/idempotency/locking.py +604 -0
  80. truthound/checkpoint/idempotency/service.py +592 -0
  81. truthound/checkpoint/idempotency/stores.py +653 -0
  82. truthound/checkpoint/monitoring/__init__.py +134 -0
  83. truthound/checkpoint/monitoring/aggregators/__init__.py +15 -0
  84. truthound/checkpoint/monitoring/aggregators/base.py +372 -0
  85. truthound/checkpoint/monitoring/aggregators/realtime.py +300 -0
  86. truthound/checkpoint/monitoring/aggregators/window.py +493 -0
  87. truthound/checkpoint/monitoring/collectors/__init__.py +17 -0
  88. truthound/checkpoint/monitoring/collectors/base.py +257 -0
  89. truthound/checkpoint/monitoring/collectors/memory_collector.py +617 -0
  90. truthound/checkpoint/monitoring/collectors/prometheus_collector.py +451 -0
  91. truthound/checkpoint/monitoring/collectors/redis_collector.py +518 -0
  92. truthound/checkpoint/monitoring/events.py +410 -0
  93. truthound/checkpoint/monitoring/protocols.py +636 -0
  94. truthound/checkpoint/monitoring/service.py +578 -0
  95. truthound/checkpoint/monitoring/views/__init__.py +17 -0
  96. truthound/checkpoint/monitoring/views/base.py +172 -0
  97. truthound/checkpoint/monitoring/views/queue_view.py +220 -0
  98. truthound/checkpoint/monitoring/views/task_view.py +240 -0
  99. truthound/checkpoint/monitoring/views/worker_view.py +263 -0
  100. truthound/checkpoint/registry.py +337 -0
  101. truthound/checkpoint/runner.py +356 -0
  102. truthound/checkpoint/transaction/__init__.py +133 -0
  103. truthound/checkpoint/transaction/base.py +389 -0
  104. truthound/checkpoint/transaction/compensatable.py +537 -0
  105. truthound/checkpoint/transaction/coordinator.py +576 -0
  106. truthound/checkpoint/transaction/executor.py +622 -0
  107. truthound/checkpoint/transaction/idempotency.py +534 -0
  108. truthound/checkpoint/transaction/saga/__init__.py +143 -0
  109. truthound/checkpoint/transaction/saga/builder.py +584 -0
  110. truthound/checkpoint/transaction/saga/definition.py +515 -0
  111. truthound/checkpoint/transaction/saga/event_store.py +542 -0
  112. truthound/checkpoint/transaction/saga/patterns.py +833 -0
  113. truthound/checkpoint/transaction/saga/runner.py +718 -0
  114. truthound/checkpoint/transaction/saga/state_machine.py +793 -0
  115. truthound/checkpoint/transaction/saga/strategies.py +780 -0
  116. truthound/checkpoint/transaction/saga/testing.py +886 -0
  117. truthound/checkpoint/triggers/__init__.py +58 -0
  118. truthound/checkpoint/triggers/base.py +237 -0
  119. truthound/checkpoint/triggers/event.py +385 -0
  120. truthound/checkpoint/triggers/schedule.py +355 -0
  121. truthound/cli.py +2358 -0
  122. truthound/cli_modules/__init__.py +124 -0
  123. truthound/cli_modules/advanced/__init__.py +45 -0
  124. truthound/cli_modules/advanced/benchmark.py +343 -0
  125. truthound/cli_modules/advanced/docs.py +225 -0
  126. truthound/cli_modules/advanced/lineage.py +209 -0
  127. truthound/cli_modules/advanced/ml.py +320 -0
  128. truthound/cli_modules/advanced/realtime.py +196 -0
  129. truthound/cli_modules/checkpoint/__init__.py +46 -0
  130. truthound/cli_modules/checkpoint/init.py +114 -0
  131. truthound/cli_modules/checkpoint/list.py +71 -0
  132. truthound/cli_modules/checkpoint/run.py +159 -0
  133. truthound/cli_modules/checkpoint/validate.py +67 -0
  134. truthound/cli_modules/common/__init__.py +71 -0
  135. truthound/cli_modules/common/errors.py +414 -0
  136. truthound/cli_modules/common/options.py +419 -0
  137. truthound/cli_modules/common/output.py +507 -0
  138. truthound/cli_modules/common/protocol.py +552 -0
  139. truthound/cli_modules/core/__init__.py +48 -0
  140. truthound/cli_modules/core/check.py +123 -0
  141. truthound/cli_modules/core/compare.py +104 -0
  142. truthound/cli_modules/core/learn.py +57 -0
  143. truthound/cli_modules/core/mask.py +77 -0
  144. truthound/cli_modules/core/profile.py +65 -0
  145. truthound/cli_modules/core/scan.py +61 -0
  146. truthound/cli_modules/profiler/__init__.py +51 -0
  147. truthound/cli_modules/profiler/auto_profile.py +175 -0
  148. truthound/cli_modules/profiler/metadata.py +107 -0
  149. truthound/cli_modules/profiler/suite.py +283 -0
  150. truthound/cli_modules/registry.py +431 -0
  151. truthound/cli_modules/scaffolding/__init__.py +89 -0
  152. truthound/cli_modules/scaffolding/base.py +631 -0
  153. truthound/cli_modules/scaffolding/commands.py +545 -0
  154. truthound/cli_modules/scaffolding/plugins.py +1072 -0
  155. truthound/cli_modules/scaffolding/reporters.py +594 -0
  156. truthound/cli_modules/scaffolding/validators.py +1127 -0
  157. truthound/common/__init__.py +18 -0
  158. truthound/common/resilience/__init__.py +130 -0
  159. truthound/common/resilience/bulkhead.py +266 -0
  160. truthound/common/resilience/circuit_breaker.py +516 -0
  161. truthound/common/resilience/composite.py +332 -0
  162. truthound/common/resilience/config.py +292 -0
  163. truthound/common/resilience/protocols.py +217 -0
  164. truthound/common/resilience/rate_limiter.py +404 -0
  165. truthound/common/resilience/retry.py +341 -0
  166. truthound/datadocs/__init__.py +260 -0
  167. truthound/datadocs/base.py +571 -0
  168. truthound/datadocs/builder.py +761 -0
  169. truthound/datadocs/charts.py +764 -0
  170. truthound/datadocs/dashboard/__init__.py +63 -0
  171. truthound/datadocs/dashboard/app.py +576 -0
  172. truthound/datadocs/dashboard/components.py +584 -0
  173. truthound/datadocs/dashboard/state.py +240 -0
  174. truthound/datadocs/engine/__init__.py +46 -0
  175. truthound/datadocs/engine/context.py +376 -0
  176. truthound/datadocs/engine/pipeline.py +618 -0
  177. truthound/datadocs/engine/registry.py +469 -0
  178. truthound/datadocs/exporters/__init__.py +49 -0
  179. truthound/datadocs/exporters/base.py +198 -0
  180. truthound/datadocs/exporters/html.py +178 -0
  181. truthound/datadocs/exporters/json_exporter.py +253 -0
  182. truthound/datadocs/exporters/markdown.py +284 -0
  183. truthound/datadocs/exporters/pdf.py +392 -0
  184. truthound/datadocs/i18n/__init__.py +86 -0
  185. truthound/datadocs/i18n/catalog.py +960 -0
  186. truthound/datadocs/i18n/formatting.py +505 -0
  187. truthound/datadocs/i18n/loader.py +256 -0
  188. truthound/datadocs/i18n/plurals.py +378 -0
  189. truthound/datadocs/renderers/__init__.py +42 -0
  190. truthound/datadocs/renderers/base.py +401 -0
  191. truthound/datadocs/renderers/custom.py +342 -0
  192. truthound/datadocs/renderers/jinja.py +697 -0
  193. truthound/datadocs/sections.py +736 -0
  194. truthound/datadocs/styles.py +931 -0
  195. truthound/datadocs/themes/__init__.py +101 -0
  196. truthound/datadocs/themes/base.py +336 -0
  197. truthound/datadocs/themes/default.py +417 -0
  198. truthound/datadocs/themes/enterprise.py +419 -0
  199. truthound/datadocs/themes/loader.py +336 -0
  200. truthound/datadocs/themes.py +301 -0
  201. truthound/datadocs/transformers/__init__.py +57 -0
  202. truthound/datadocs/transformers/base.py +268 -0
  203. truthound/datadocs/transformers/enrichers.py +544 -0
  204. truthound/datadocs/transformers/filters.py +447 -0
  205. truthound/datadocs/transformers/i18n.py +468 -0
  206. truthound/datadocs/versioning/__init__.py +62 -0
  207. truthound/datadocs/versioning/diff.py +639 -0
  208. truthound/datadocs/versioning/storage.py +497 -0
  209. truthound/datadocs/versioning/version.py +358 -0
  210. truthound/datasources/__init__.py +223 -0
  211. truthound/datasources/_async_protocols.py +222 -0
  212. truthound/datasources/_protocols.py +159 -0
  213. truthound/datasources/adapters.py +428 -0
  214. truthound/datasources/async_base.py +599 -0
  215. truthound/datasources/async_factory.py +511 -0
  216. truthound/datasources/base.py +516 -0
  217. truthound/datasources/factory.py +433 -0
  218. truthound/datasources/nosql/__init__.py +47 -0
  219. truthound/datasources/nosql/base.py +487 -0
  220. truthound/datasources/nosql/elasticsearch.py +801 -0
  221. truthound/datasources/nosql/mongodb.py +636 -0
  222. truthound/datasources/pandas_optimized.py +582 -0
  223. truthound/datasources/pandas_source.py +216 -0
  224. truthound/datasources/polars_source.py +395 -0
  225. truthound/datasources/spark_source.py +479 -0
  226. truthound/datasources/sql/__init__.py +154 -0
  227. truthound/datasources/sql/base.py +710 -0
  228. truthound/datasources/sql/bigquery.py +410 -0
  229. truthound/datasources/sql/cloud_base.py +199 -0
  230. truthound/datasources/sql/databricks.py +471 -0
  231. truthound/datasources/sql/mysql.py +316 -0
  232. truthound/datasources/sql/oracle.py +427 -0
  233. truthound/datasources/sql/postgresql.py +321 -0
  234. truthound/datasources/sql/redshift.py +479 -0
  235. truthound/datasources/sql/snowflake.py +439 -0
  236. truthound/datasources/sql/sqlite.py +286 -0
  237. truthound/datasources/sql/sqlserver.py +437 -0
  238. truthound/datasources/streaming/__init__.py +47 -0
  239. truthound/datasources/streaming/base.py +350 -0
  240. truthound/datasources/streaming/kafka.py +670 -0
  241. truthound/decorators.py +98 -0
  242. truthound/docs/__init__.py +69 -0
  243. truthound/docs/extractor.py +971 -0
  244. truthound/docs/generator.py +601 -0
  245. truthound/docs/parser.py +1037 -0
  246. truthound/docs/renderer.py +999 -0
  247. truthound/drift/__init__.py +22 -0
  248. truthound/drift/compare.py +189 -0
  249. truthound/drift/detectors.py +464 -0
  250. truthound/drift/report.py +160 -0
  251. truthound/execution/__init__.py +65 -0
  252. truthound/execution/_protocols.py +324 -0
  253. truthound/execution/base.py +576 -0
  254. truthound/execution/distributed/__init__.py +179 -0
  255. truthound/execution/distributed/aggregations.py +731 -0
  256. truthound/execution/distributed/arrow_bridge.py +817 -0
  257. truthound/execution/distributed/base.py +550 -0
  258. truthound/execution/distributed/dask_engine.py +976 -0
  259. truthound/execution/distributed/mixins.py +766 -0
  260. truthound/execution/distributed/protocols.py +756 -0
  261. truthound/execution/distributed/ray_engine.py +1127 -0
  262. truthound/execution/distributed/registry.py +446 -0
  263. truthound/execution/distributed/spark_engine.py +1011 -0
  264. truthound/execution/distributed/validator_adapter.py +682 -0
  265. truthound/execution/pandas_engine.py +401 -0
  266. truthound/execution/polars_engine.py +497 -0
  267. truthound/execution/pushdown/__init__.py +230 -0
  268. truthound/execution/pushdown/ast.py +1550 -0
  269. truthound/execution/pushdown/builder.py +1550 -0
  270. truthound/execution/pushdown/dialects.py +1072 -0
  271. truthound/execution/pushdown/executor.py +829 -0
  272. truthound/execution/pushdown/optimizer.py +1041 -0
  273. truthound/execution/sql_engine.py +518 -0
  274. truthound/infrastructure/__init__.py +189 -0
  275. truthound/infrastructure/audit.py +1515 -0
  276. truthound/infrastructure/config.py +1133 -0
  277. truthound/infrastructure/encryption.py +1132 -0
  278. truthound/infrastructure/logging.py +1503 -0
  279. truthound/infrastructure/metrics.py +1220 -0
  280. truthound/lineage/__init__.py +89 -0
  281. truthound/lineage/base.py +746 -0
  282. truthound/lineage/impact_analysis.py +474 -0
  283. truthound/lineage/integrations/__init__.py +22 -0
  284. truthound/lineage/integrations/openlineage.py +548 -0
  285. truthound/lineage/tracker.py +512 -0
  286. truthound/lineage/visualization/__init__.py +33 -0
  287. truthound/lineage/visualization/protocols.py +145 -0
  288. truthound/lineage/visualization/renderers/__init__.py +20 -0
  289. truthound/lineage/visualization/renderers/cytoscape.py +329 -0
  290. truthound/lineage/visualization/renderers/d3.py +331 -0
  291. truthound/lineage/visualization/renderers/graphviz.py +276 -0
  292. truthound/lineage/visualization/renderers/mermaid.py +308 -0
  293. truthound/maskers.py +113 -0
  294. truthound/ml/__init__.py +124 -0
  295. truthound/ml/anomaly_models/__init__.py +31 -0
  296. truthound/ml/anomaly_models/ensemble.py +362 -0
  297. truthound/ml/anomaly_models/isolation_forest.py +444 -0
  298. truthound/ml/anomaly_models/statistical.py +392 -0
  299. truthound/ml/base.py +1178 -0
  300. truthound/ml/drift_detection/__init__.py +26 -0
  301. truthound/ml/drift_detection/concept.py +381 -0
  302. truthound/ml/drift_detection/distribution.py +361 -0
  303. truthound/ml/drift_detection/feature.py +442 -0
  304. truthound/ml/drift_detection/multivariate.py +495 -0
  305. truthound/ml/monitoring/__init__.py +88 -0
  306. truthound/ml/monitoring/alerting/__init__.py +33 -0
  307. truthound/ml/monitoring/alerting/handlers.py +427 -0
  308. truthound/ml/monitoring/alerting/rules.py +508 -0
  309. truthound/ml/monitoring/collectors/__init__.py +19 -0
  310. truthound/ml/monitoring/collectors/composite.py +105 -0
  311. truthound/ml/monitoring/collectors/drift.py +324 -0
  312. truthound/ml/monitoring/collectors/performance.py +179 -0
  313. truthound/ml/monitoring/collectors/quality.py +369 -0
  314. truthound/ml/monitoring/monitor.py +536 -0
  315. truthound/ml/monitoring/protocols.py +451 -0
  316. truthound/ml/monitoring/stores/__init__.py +15 -0
  317. truthound/ml/monitoring/stores/memory.py +201 -0
  318. truthound/ml/monitoring/stores/prometheus.py +296 -0
  319. truthound/ml/rule_learning/__init__.py +25 -0
  320. truthound/ml/rule_learning/constraint_miner.py +443 -0
  321. truthound/ml/rule_learning/pattern_learner.py +499 -0
  322. truthound/ml/rule_learning/profile_learner.py +462 -0
  323. truthound/multitenancy/__init__.py +326 -0
  324. truthound/multitenancy/core.py +852 -0
  325. truthound/multitenancy/integration.py +597 -0
  326. truthound/multitenancy/isolation.py +630 -0
  327. truthound/multitenancy/manager.py +770 -0
  328. truthound/multitenancy/middleware.py +765 -0
  329. truthound/multitenancy/quota.py +537 -0
  330. truthound/multitenancy/resolvers.py +603 -0
  331. truthound/multitenancy/storage.py +703 -0
  332. truthound/observability/__init__.py +307 -0
  333. truthound/observability/context.py +531 -0
  334. truthound/observability/instrumentation.py +611 -0
  335. truthound/observability/logging.py +887 -0
  336. truthound/observability/metrics.py +1157 -0
  337. truthound/observability/tracing/__init__.py +178 -0
  338. truthound/observability/tracing/baggage.py +310 -0
  339. truthound/observability/tracing/config.py +426 -0
  340. truthound/observability/tracing/exporter.py +787 -0
  341. truthound/observability/tracing/integration.py +1018 -0
  342. truthound/observability/tracing/otel/__init__.py +146 -0
  343. truthound/observability/tracing/otel/adapter.py +982 -0
  344. truthound/observability/tracing/otel/bridge.py +1177 -0
  345. truthound/observability/tracing/otel/compat.py +681 -0
  346. truthound/observability/tracing/otel/config.py +691 -0
  347. truthound/observability/tracing/otel/detection.py +327 -0
  348. truthound/observability/tracing/otel/protocols.py +426 -0
  349. truthound/observability/tracing/processor.py +561 -0
  350. truthound/observability/tracing/propagator.py +757 -0
  351. truthound/observability/tracing/provider.py +569 -0
  352. truthound/observability/tracing/resource.py +515 -0
  353. truthound/observability/tracing/sampler.py +487 -0
  354. truthound/observability/tracing/span.py +676 -0
  355. truthound/plugins/__init__.py +198 -0
  356. truthound/plugins/base.py +599 -0
  357. truthound/plugins/cli.py +680 -0
  358. truthound/plugins/dependencies/__init__.py +42 -0
  359. truthound/plugins/dependencies/graph.py +422 -0
  360. truthound/plugins/dependencies/resolver.py +417 -0
  361. truthound/plugins/discovery.py +379 -0
  362. truthound/plugins/docs/__init__.py +46 -0
  363. truthound/plugins/docs/extractor.py +444 -0
  364. truthound/plugins/docs/renderer.py +499 -0
  365. truthound/plugins/enterprise_manager.py +877 -0
  366. truthound/plugins/examples/__init__.py +19 -0
  367. truthound/plugins/examples/custom_validators.py +317 -0
  368. truthound/plugins/examples/slack_notifier.py +312 -0
  369. truthound/plugins/examples/xml_reporter.py +254 -0
  370. truthound/plugins/hooks.py +558 -0
  371. truthound/plugins/lifecycle/__init__.py +43 -0
  372. truthound/plugins/lifecycle/hot_reload.py +402 -0
  373. truthound/plugins/lifecycle/manager.py +371 -0
  374. truthound/plugins/manager.py +736 -0
  375. truthound/plugins/registry.py +338 -0
  376. truthound/plugins/security/__init__.py +93 -0
  377. truthound/plugins/security/exceptions.py +332 -0
  378. truthound/plugins/security/policies.py +348 -0
  379. truthound/plugins/security/protocols.py +643 -0
  380. truthound/plugins/security/sandbox/__init__.py +45 -0
  381. truthound/plugins/security/sandbox/context.py +158 -0
  382. truthound/plugins/security/sandbox/engines/__init__.py +19 -0
  383. truthound/plugins/security/sandbox/engines/container.py +379 -0
  384. truthound/plugins/security/sandbox/engines/noop.py +144 -0
  385. truthound/plugins/security/sandbox/engines/process.py +336 -0
  386. truthound/plugins/security/sandbox/factory.py +211 -0
  387. truthound/plugins/security/signing/__init__.py +57 -0
  388. truthound/plugins/security/signing/service.py +330 -0
  389. truthound/plugins/security/signing/trust_store.py +368 -0
  390. truthound/plugins/security/signing/verifier.py +459 -0
  391. truthound/plugins/versioning/__init__.py +41 -0
  392. truthound/plugins/versioning/constraints.py +297 -0
  393. truthound/plugins/versioning/resolver.py +329 -0
  394. truthound/profiler/__init__.py +1729 -0
  395. truthound/profiler/_lazy.py +452 -0
  396. truthound/profiler/ab_testing/__init__.py +80 -0
  397. truthound/profiler/ab_testing/analysis.py +449 -0
  398. truthound/profiler/ab_testing/base.py +257 -0
  399. truthound/profiler/ab_testing/experiment.py +395 -0
  400. truthound/profiler/ab_testing/tracking.py +368 -0
  401. truthound/profiler/auto_threshold.py +1170 -0
  402. truthound/profiler/base.py +579 -0
  403. truthound/profiler/cache_patterns.py +911 -0
  404. truthound/profiler/caching.py +1303 -0
  405. truthound/profiler/column_profiler.py +712 -0
  406. truthound/profiler/comparison.py +1007 -0
  407. truthound/profiler/custom_patterns.py +1170 -0
  408. truthound/profiler/dashboard/__init__.py +50 -0
  409. truthound/profiler/dashboard/app.py +476 -0
  410. truthound/profiler/dashboard/components.py +457 -0
  411. truthound/profiler/dashboard/config.py +72 -0
  412. truthound/profiler/distributed/__init__.py +83 -0
  413. truthound/profiler/distributed/base.py +281 -0
  414. truthound/profiler/distributed/dask_backend.py +498 -0
  415. truthound/profiler/distributed/local_backend.py +293 -0
  416. truthound/profiler/distributed/profiler.py +304 -0
  417. truthound/profiler/distributed/ray_backend.py +374 -0
  418. truthound/profiler/distributed/spark_backend.py +375 -0
  419. truthound/profiler/distributed.py +1366 -0
  420. truthound/profiler/enterprise_sampling.py +1065 -0
  421. truthound/profiler/errors.py +488 -0
  422. truthound/profiler/evolution/__init__.py +91 -0
  423. truthound/profiler/evolution/alerts.py +426 -0
  424. truthound/profiler/evolution/changes.py +206 -0
  425. truthound/profiler/evolution/compatibility.py +365 -0
  426. truthound/profiler/evolution/detector.py +372 -0
  427. truthound/profiler/evolution/protocols.py +121 -0
  428. truthound/profiler/generators/__init__.py +48 -0
  429. truthound/profiler/generators/base.py +384 -0
  430. truthound/profiler/generators/ml_rules.py +375 -0
  431. truthound/profiler/generators/pattern_rules.py +384 -0
  432. truthound/profiler/generators/schema_rules.py +267 -0
  433. truthound/profiler/generators/stats_rules.py +324 -0
  434. truthound/profiler/generators/suite_generator.py +857 -0
  435. truthound/profiler/i18n.py +1542 -0
  436. truthound/profiler/incremental.py +554 -0
  437. truthound/profiler/incremental_validation.py +1710 -0
  438. truthound/profiler/integration/__init__.py +73 -0
  439. truthound/profiler/integration/adapters.py +345 -0
  440. truthound/profiler/integration/context.py +371 -0
  441. truthound/profiler/integration/executor.py +527 -0
  442. truthound/profiler/integration/naming.py +75 -0
  443. truthound/profiler/integration/protocols.py +243 -0
  444. truthound/profiler/memory.py +1185 -0
  445. truthound/profiler/migration/__init__.py +60 -0
  446. truthound/profiler/migration/base.py +345 -0
  447. truthound/profiler/migration/manager.py +444 -0
  448. truthound/profiler/migration/v1_0_to_v1_1.py +484 -0
  449. truthound/profiler/ml/__init__.py +73 -0
  450. truthound/profiler/ml/base.py +244 -0
  451. truthound/profiler/ml/classifier.py +507 -0
  452. truthound/profiler/ml/feature_extraction.py +604 -0
  453. truthound/profiler/ml/pretrained.py +448 -0
  454. truthound/profiler/ml_inference.py +1276 -0
  455. truthound/profiler/native_patterns.py +815 -0
  456. truthound/profiler/observability.py +1184 -0
  457. truthound/profiler/process_timeout.py +1566 -0
  458. truthound/profiler/progress.py +568 -0
  459. truthound/profiler/progress_callbacks.py +1734 -0
  460. truthound/profiler/quality.py +1345 -0
  461. truthound/profiler/resilience.py +1180 -0
  462. truthound/profiler/sampled_matcher.py +794 -0
  463. truthound/profiler/sampling.py +1288 -0
  464. truthound/profiler/scheduling/__init__.py +82 -0
  465. truthound/profiler/scheduling/protocols.py +214 -0
  466. truthound/profiler/scheduling/scheduler.py +474 -0
  467. truthound/profiler/scheduling/storage.py +457 -0
  468. truthound/profiler/scheduling/triggers.py +449 -0
  469. truthound/profiler/schema.py +603 -0
  470. truthound/profiler/streaming.py +685 -0
  471. truthound/profiler/streaming_patterns.py +1354 -0
  472. truthound/profiler/suite_cli.py +625 -0
  473. truthound/profiler/suite_config.py +789 -0
  474. truthound/profiler/suite_export.py +1268 -0
  475. truthound/profiler/table_profiler.py +547 -0
  476. truthound/profiler/timeout.py +565 -0
  477. truthound/profiler/validation.py +1532 -0
  478. truthound/profiler/visualization/__init__.py +118 -0
  479. truthound/profiler/visualization/base.py +346 -0
  480. truthound/profiler/visualization/generator.py +1259 -0
  481. truthound/profiler/visualization/plotly_renderer.py +811 -0
  482. truthound/profiler/visualization/renderers.py +669 -0
  483. truthound/profiler/visualization/sections.py +540 -0
  484. truthound/profiler/visualization.py +2122 -0
  485. truthound/profiler/yaml_validation.py +1151 -0
  486. truthound/py.typed +0 -0
  487. truthound/ratelimit/__init__.py +248 -0
  488. truthound/ratelimit/algorithms.py +1108 -0
  489. truthound/ratelimit/core.py +573 -0
  490. truthound/ratelimit/integration.py +532 -0
  491. truthound/ratelimit/limiter.py +663 -0
  492. truthound/ratelimit/middleware.py +700 -0
  493. truthound/ratelimit/policy.py +792 -0
  494. truthound/ratelimit/storage.py +763 -0
  495. truthound/rbac/__init__.py +340 -0
  496. truthound/rbac/core.py +976 -0
  497. truthound/rbac/integration.py +760 -0
  498. truthound/rbac/manager.py +1052 -0
  499. truthound/rbac/middleware.py +842 -0
  500. truthound/rbac/policy.py +954 -0
  501. truthound/rbac/storage.py +878 -0
  502. truthound/realtime/__init__.py +141 -0
  503. truthound/realtime/adapters/__init__.py +43 -0
  504. truthound/realtime/adapters/base.py +533 -0
  505. truthound/realtime/adapters/kafka.py +487 -0
  506. truthound/realtime/adapters/kinesis.py +479 -0
  507. truthound/realtime/adapters/mock.py +243 -0
  508. truthound/realtime/base.py +553 -0
  509. truthound/realtime/factory.py +382 -0
  510. truthound/realtime/incremental.py +660 -0
  511. truthound/realtime/processing/__init__.py +67 -0
  512. truthound/realtime/processing/exactly_once.py +575 -0
  513. truthound/realtime/processing/state.py +547 -0
  514. truthound/realtime/processing/windows.py +647 -0
  515. truthound/realtime/protocols.py +569 -0
  516. truthound/realtime/streaming.py +605 -0
  517. truthound/realtime/testing/__init__.py +32 -0
  518. truthound/realtime/testing/containers.py +615 -0
  519. truthound/realtime/testing/fixtures.py +484 -0
  520. truthound/report.py +280 -0
  521. truthound/reporters/__init__.py +46 -0
  522. truthound/reporters/_protocols.py +30 -0
  523. truthound/reporters/base.py +324 -0
  524. truthound/reporters/ci/__init__.py +66 -0
  525. truthound/reporters/ci/azure.py +436 -0
  526. truthound/reporters/ci/base.py +509 -0
  527. truthound/reporters/ci/bitbucket.py +567 -0
  528. truthound/reporters/ci/circleci.py +547 -0
  529. truthound/reporters/ci/detection.py +364 -0
  530. truthound/reporters/ci/factory.py +182 -0
  531. truthound/reporters/ci/github.py +388 -0
  532. truthound/reporters/ci/gitlab.py +471 -0
  533. truthound/reporters/ci/jenkins.py +525 -0
  534. truthound/reporters/console_reporter.py +299 -0
  535. truthound/reporters/factory.py +211 -0
  536. truthound/reporters/html_reporter.py +524 -0
  537. truthound/reporters/json_reporter.py +256 -0
  538. truthound/reporters/markdown_reporter.py +280 -0
  539. truthound/reporters/sdk/__init__.py +174 -0
  540. truthound/reporters/sdk/builder.py +558 -0
  541. truthound/reporters/sdk/mixins.py +1150 -0
  542. truthound/reporters/sdk/schema.py +1493 -0
  543. truthound/reporters/sdk/templates.py +666 -0
  544. truthound/reporters/sdk/testing.py +968 -0
  545. truthound/scanners.py +170 -0
  546. truthound/scheduling/__init__.py +122 -0
  547. truthound/scheduling/cron.py +1136 -0
  548. truthound/scheduling/presets.py +212 -0
  549. truthound/schema.py +275 -0
  550. truthound/secrets/__init__.py +173 -0
  551. truthound/secrets/base.py +618 -0
  552. truthound/secrets/cloud.py +682 -0
  553. truthound/secrets/integration.py +507 -0
  554. truthound/secrets/manager.py +633 -0
  555. truthound/secrets/oidc/__init__.py +172 -0
  556. truthound/secrets/oidc/base.py +902 -0
  557. truthound/secrets/oidc/credential_provider.py +623 -0
  558. truthound/secrets/oidc/exchangers.py +1001 -0
  559. truthound/secrets/oidc/github/__init__.py +110 -0
  560. truthound/secrets/oidc/github/claims.py +718 -0
  561. truthound/secrets/oidc/github/enhanced_provider.py +693 -0
  562. truthound/secrets/oidc/github/trust_policy.py +742 -0
  563. truthound/secrets/oidc/github/verification.py +723 -0
  564. truthound/secrets/oidc/github/workflow.py +691 -0
  565. truthound/secrets/oidc/providers.py +825 -0
  566. truthound/secrets/providers.py +506 -0
  567. truthound/secrets/resolver.py +495 -0
  568. truthound/stores/__init__.py +177 -0
  569. truthound/stores/backends/__init__.py +18 -0
  570. truthound/stores/backends/_protocols.py +340 -0
  571. truthound/stores/backends/azure_blob.py +530 -0
  572. truthound/stores/backends/concurrent_filesystem.py +915 -0
  573. truthound/stores/backends/connection_pool.py +1365 -0
  574. truthound/stores/backends/database.py +743 -0
  575. truthound/stores/backends/filesystem.py +538 -0
  576. truthound/stores/backends/gcs.py +399 -0
  577. truthound/stores/backends/memory.py +354 -0
  578. truthound/stores/backends/s3.py +434 -0
  579. truthound/stores/backpressure/__init__.py +84 -0
  580. truthound/stores/backpressure/base.py +375 -0
  581. truthound/stores/backpressure/circuit_breaker.py +434 -0
  582. truthound/stores/backpressure/monitor.py +376 -0
  583. truthound/stores/backpressure/strategies.py +677 -0
  584. truthound/stores/base.py +551 -0
  585. truthound/stores/batching/__init__.py +65 -0
  586. truthound/stores/batching/base.py +305 -0
  587. truthound/stores/batching/buffer.py +370 -0
  588. truthound/stores/batching/store.py +248 -0
  589. truthound/stores/batching/writer.py +521 -0
  590. truthound/stores/caching/__init__.py +60 -0
  591. truthound/stores/caching/backends.py +684 -0
  592. truthound/stores/caching/base.py +356 -0
  593. truthound/stores/caching/store.py +305 -0
  594. truthound/stores/compression/__init__.py +193 -0
  595. truthound/stores/compression/adaptive.py +694 -0
  596. truthound/stores/compression/base.py +514 -0
  597. truthound/stores/compression/pipeline.py +868 -0
  598. truthound/stores/compression/providers.py +672 -0
  599. truthound/stores/compression/streaming.py +832 -0
  600. truthound/stores/concurrency/__init__.py +81 -0
  601. truthound/stores/concurrency/atomic.py +556 -0
  602. truthound/stores/concurrency/index.py +775 -0
  603. truthound/stores/concurrency/locks.py +576 -0
  604. truthound/stores/concurrency/manager.py +482 -0
  605. truthound/stores/encryption/__init__.py +297 -0
  606. truthound/stores/encryption/base.py +952 -0
  607. truthound/stores/encryption/keys.py +1191 -0
  608. truthound/stores/encryption/pipeline.py +903 -0
  609. truthound/stores/encryption/providers.py +953 -0
  610. truthound/stores/encryption/streaming.py +950 -0
  611. truthound/stores/expectations.py +227 -0
  612. truthound/stores/factory.py +246 -0
  613. truthound/stores/migration/__init__.py +75 -0
  614. truthound/stores/migration/base.py +480 -0
  615. truthound/stores/migration/manager.py +347 -0
  616. truthound/stores/migration/registry.py +382 -0
  617. truthound/stores/migration/store.py +559 -0
  618. truthound/stores/observability/__init__.py +106 -0
  619. truthound/stores/observability/audit.py +718 -0
  620. truthound/stores/observability/config.py +270 -0
  621. truthound/stores/observability/factory.py +208 -0
  622. truthound/stores/observability/metrics.py +636 -0
  623. truthound/stores/observability/protocols.py +410 -0
  624. truthound/stores/observability/store.py +570 -0
  625. truthound/stores/observability/tracing.py +784 -0
  626. truthound/stores/replication/__init__.py +76 -0
  627. truthound/stores/replication/base.py +260 -0
  628. truthound/stores/replication/monitor.py +269 -0
  629. truthound/stores/replication/store.py +439 -0
  630. truthound/stores/replication/syncer.py +391 -0
  631. truthound/stores/results.py +359 -0
  632. truthound/stores/retention/__init__.py +77 -0
  633. truthound/stores/retention/base.py +378 -0
  634. truthound/stores/retention/policies.py +621 -0
  635. truthound/stores/retention/scheduler.py +279 -0
  636. truthound/stores/retention/store.py +526 -0
  637. truthound/stores/streaming/__init__.py +138 -0
  638. truthound/stores/streaming/base.py +801 -0
  639. truthound/stores/streaming/database.py +984 -0
  640. truthound/stores/streaming/filesystem.py +719 -0
  641. truthound/stores/streaming/reader.py +629 -0
  642. truthound/stores/streaming/s3.py +843 -0
  643. truthound/stores/streaming/writer.py +790 -0
  644. truthound/stores/tiering/__init__.py +108 -0
  645. truthound/stores/tiering/base.py +462 -0
  646. truthound/stores/tiering/manager.py +249 -0
  647. truthound/stores/tiering/policies.py +692 -0
  648. truthound/stores/tiering/store.py +526 -0
  649. truthound/stores/versioning/__init__.py +56 -0
  650. truthound/stores/versioning/base.py +376 -0
  651. truthound/stores/versioning/store.py +660 -0
  652. truthound/stores/versioning/strategies.py +353 -0
  653. truthound/types.py +56 -0
  654. truthound/validators/__init__.py +774 -0
  655. truthound/validators/aggregate/__init__.py +27 -0
  656. truthound/validators/aggregate/central.py +116 -0
  657. truthound/validators/aggregate/extremes.py +116 -0
  658. truthound/validators/aggregate/spread.py +118 -0
  659. truthound/validators/aggregate/sum.py +64 -0
  660. truthound/validators/aggregate/type.py +78 -0
  661. truthound/validators/anomaly/__init__.py +93 -0
  662. truthound/validators/anomaly/base.py +431 -0
  663. truthound/validators/anomaly/ml_based.py +1190 -0
  664. truthound/validators/anomaly/multivariate.py +647 -0
  665. truthound/validators/anomaly/statistical.py +599 -0
  666. truthound/validators/base.py +1089 -0
  667. truthound/validators/business_rule/__init__.py +46 -0
  668. truthound/validators/business_rule/base.py +147 -0
  669. truthound/validators/business_rule/checksum.py +509 -0
  670. truthound/validators/business_rule/financial.py +526 -0
  671. truthound/validators/cache.py +733 -0
  672. truthound/validators/completeness/__init__.py +39 -0
  673. truthound/validators/completeness/conditional.py +73 -0
  674. truthound/validators/completeness/default.py +98 -0
  675. truthound/validators/completeness/empty.py +103 -0
  676. truthound/validators/completeness/nan.py +337 -0
  677. truthound/validators/completeness/null.py +152 -0
  678. truthound/validators/cross_table/__init__.py +17 -0
  679. truthound/validators/cross_table/aggregate.py +333 -0
  680. truthound/validators/cross_table/row_count.py +122 -0
  681. truthound/validators/datetime/__init__.py +29 -0
  682. truthound/validators/datetime/format.py +78 -0
  683. truthound/validators/datetime/freshness.py +269 -0
  684. truthound/validators/datetime/order.py +73 -0
  685. truthound/validators/datetime/parseable.py +185 -0
  686. truthound/validators/datetime/range.py +202 -0
  687. truthound/validators/datetime/timezone.py +69 -0
  688. truthound/validators/distribution/__init__.py +49 -0
  689. truthound/validators/distribution/distribution.py +128 -0
  690. truthound/validators/distribution/monotonic.py +119 -0
  691. truthound/validators/distribution/outlier.py +178 -0
  692. truthound/validators/distribution/quantile.py +80 -0
  693. truthound/validators/distribution/range.py +254 -0
  694. truthound/validators/distribution/set.py +125 -0
  695. truthound/validators/distribution/statistical.py +459 -0
  696. truthound/validators/drift/__init__.py +79 -0
  697. truthound/validators/drift/base.py +427 -0
  698. truthound/validators/drift/multi_feature.py +401 -0
  699. truthound/validators/drift/numeric.py +395 -0
  700. truthound/validators/drift/psi.py +446 -0
  701. truthound/validators/drift/statistical.py +510 -0
  702. truthound/validators/enterprise.py +1658 -0
  703. truthound/validators/geospatial/__init__.py +80 -0
  704. truthound/validators/geospatial/base.py +97 -0
  705. truthound/validators/geospatial/boundary.py +238 -0
  706. truthound/validators/geospatial/coordinate.py +351 -0
  707. truthound/validators/geospatial/distance.py +399 -0
  708. truthound/validators/geospatial/polygon.py +665 -0
  709. truthound/validators/i18n/__init__.py +308 -0
  710. truthound/validators/i18n/bidi.py +571 -0
  711. truthound/validators/i18n/catalogs.py +570 -0
  712. truthound/validators/i18n/dialects.py +763 -0
  713. truthound/validators/i18n/extended_catalogs.py +549 -0
  714. truthound/validators/i18n/formatting.py +1434 -0
  715. truthound/validators/i18n/loader.py +1020 -0
  716. truthound/validators/i18n/messages.py +521 -0
  717. truthound/validators/i18n/plural.py +683 -0
  718. truthound/validators/i18n/protocols.py +855 -0
  719. truthound/validators/i18n/tms.py +1162 -0
  720. truthound/validators/localization/__init__.py +53 -0
  721. truthound/validators/localization/base.py +122 -0
  722. truthound/validators/localization/chinese.py +362 -0
  723. truthound/validators/localization/japanese.py +275 -0
  724. truthound/validators/localization/korean.py +524 -0
  725. truthound/validators/memory/__init__.py +94 -0
  726. truthound/validators/memory/approximate_knn.py +506 -0
  727. truthound/validators/memory/base.py +547 -0
  728. truthound/validators/memory/sgd_online.py +719 -0
  729. truthound/validators/memory/streaming_ecdf.py +753 -0
  730. truthound/validators/ml_feature/__init__.py +54 -0
  731. truthound/validators/ml_feature/base.py +249 -0
  732. truthound/validators/ml_feature/correlation.py +299 -0
  733. truthound/validators/ml_feature/leakage.py +344 -0
  734. truthound/validators/ml_feature/null_impact.py +270 -0
  735. truthound/validators/ml_feature/scale.py +264 -0
  736. truthound/validators/multi_column/__init__.py +89 -0
  737. truthound/validators/multi_column/arithmetic.py +284 -0
  738. truthound/validators/multi_column/base.py +231 -0
  739. truthound/validators/multi_column/comparison.py +273 -0
  740. truthound/validators/multi_column/consistency.py +312 -0
  741. truthound/validators/multi_column/statistical.py +299 -0
  742. truthound/validators/optimization/__init__.py +164 -0
  743. truthound/validators/optimization/aggregation.py +563 -0
  744. truthound/validators/optimization/covariance.py +556 -0
  745. truthound/validators/optimization/geo.py +626 -0
  746. truthound/validators/optimization/graph.py +587 -0
  747. truthound/validators/optimization/orchestrator.py +970 -0
  748. truthound/validators/optimization/profiling.py +1312 -0
  749. truthound/validators/privacy/__init__.py +223 -0
  750. truthound/validators/privacy/base.py +635 -0
  751. truthound/validators/privacy/ccpa.py +670 -0
  752. truthound/validators/privacy/gdpr.py +728 -0
  753. truthound/validators/privacy/global_patterns.py +604 -0
  754. truthound/validators/privacy/plugins.py +867 -0
  755. truthound/validators/profiling/__init__.py +52 -0
  756. truthound/validators/profiling/base.py +175 -0
  757. truthound/validators/profiling/cardinality.py +312 -0
  758. truthound/validators/profiling/entropy.py +391 -0
  759. truthound/validators/profiling/frequency.py +455 -0
  760. truthound/validators/pushdown_support.py +660 -0
  761. truthound/validators/query/__init__.py +91 -0
  762. truthound/validators/query/aggregate.py +346 -0
  763. truthound/validators/query/base.py +246 -0
  764. truthound/validators/query/column.py +249 -0
  765. truthound/validators/query/expression.py +274 -0
  766. truthound/validators/query/result.py +323 -0
  767. truthound/validators/query/row_count.py +264 -0
  768. truthound/validators/referential/__init__.py +80 -0
  769. truthound/validators/referential/base.py +395 -0
  770. truthound/validators/referential/cascade.py +391 -0
  771. truthound/validators/referential/circular.py +563 -0
  772. truthound/validators/referential/foreign_key.py +624 -0
  773. truthound/validators/referential/orphan.py +485 -0
  774. truthound/validators/registry.py +112 -0
  775. truthound/validators/schema/__init__.py +41 -0
  776. truthound/validators/schema/column_count.py +142 -0
  777. truthound/validators/schema/column_exists.py +80 -0
  778. truthound/validators/schema/column_order.py +82 -0
  779. truthound/validators/schema/column_pair.py +85 -0
  780. truthound/validators/schema/column_pair_set.py +195 -0
  781. truthound/validators/schema/column_type.py +94 -0
  782. truthound/validators/schema/multi_column.py +53 -0
  783. truthound/validators/schema/multi_column_aggregate.py +175 -0
  784. truthound/validators/schema/referential.py +274 -0
  785. truthound/validators/schema/table_schema.py +91 -0
  786. truthound/validators/schema_validator.py +219 -0
  787. truthound/validators/sdk/__init__.py +250 -0
  788. truthound/validators/sdk/builder.py +680 -0
  789. truthound/validators/sdk/decorators.py +474 -0
  790. truthound/validators/sdk/enterprise/__init__.py +211 -0
  791. truthound/validators/sdk/enterprise/docs.py +725 -0
  792. truthound/validators/sdk/enterprise/fuzzing.py +659 -0
  793. truthound/validators/sdk/enterprise/licensing.py +709 -0
  794. truthound/validators/sdk/enterprise/manager.py +543 -0
  795. truthound/validators/sdk/enterprise/resources.py +628 -0
  796. truthound/validators/sdk/enterprise/sandbox.py +766 -0
  797. truthound/validators/sdk/enterprise/signing.py +603 -0
  798. truthound/validators/sdk/enterprise/templates.py +865 -0
  799. truthound/validators/sdk/enterprise/versioning.py +659 -0
  800. truthound/validators/sdk/templates.py +757 -0
  801. truthound/validators/sdk/testing.py +807 -0
  802. truthound/validators/security/__init__.py +181 -0
  803. truthound/validators/security/redos/__init__.py +182 -0
  804. truthound/validators/security/redos/core.py +861 -0
  805. truthound/validators/security/redos/cpu_monitor.py +593 -0
  806. truthound/validators/security/redos/cve_database.py +791 -0
  807. truthound/validators/security/redos/ml/__init__.py +155 -0
  808. truthound/validators/security/redos/ml/base.py +785 -0
  809. truthound/validators/security/redos/ml/datasets.py +618 -0
  810. truthound/validators/security/redos/ml/features.py +359 -0
  811. truthound/validators/security/redos/ml/models.py +1000 -0
  812. truthound/validators/security/redos/ml/predictor.py +507 -0
  813. truthound/validators/security/redos/ml/storage.py +632 -0
  814. truthound/validators/security/redos/ml/training.py +571 -0
  815. truthound/validators/security/redos/ml_analyzer.py +937 -0
  816. truthound/validators/security/redos/optimizer.py +674 -0
  817. truthound/validators/security/redos/profiler.py +682 -0
  818. truthound/validators/security/redos/re2_engine.py +709 -0
  819. truthound/validators/security/redos.py +886 -0
  820. truthound/validators/security/sql_security.py +1247 -0
  821. truthound/validators/streaming/__init__.py +126 -0
  822. truthound/validators/streaming/base.py +292 -0
  823. truthound/validators/streaming/completeness.py +210 -0
  824. truthound/validators/streaming/mixin.py +575 -0
  825. truthound/validators/streaming/range.py +308 -0
  826. truthound/validators/streaming/sources.py +846 -0
  827. truthound/validators/string/__init__.py +57 -0
  828. truthound/validators/string/casing.py +158 -0
  829. truthound/validators/string/charset.py +96 -0
  830. truthound/validators/string/format.py +501 -0
  831. truthound/validators/string/json.py +77 -0
  832. truthound/validators/string/json_schema.py +184 -0
  833. truthound/validators/string/length.py +104 -0
  834. truthound/validators/string/like_pattern.py +237 -0
  835. truthound/validators/string/regex.py +202 -0
  836. truthound/validators/string/regex_extended.py +435 -0
  837. truthound/validators/table/__init__.py +88 -0
  838. truthound/validators/table/base.py +78 -0
  839. truthound/validators/table/column_count.py +198 -0
  840. truthound/validators/table/freshness.py +362 -0
  841. truthound/validators/table/row_count.py +251 -0
  842. truthound/validators/table/schema.py +333 -0
  843. truthound/validators/table/size.py +285 -0
  844. truthound/validators/timeout/__init__.py +102 -0
  845. truthound/validators/timeout/advanced/__init__.py +247 -0
  846. truthound/validators/timeout/advanced/circuit_breaker.py +675 -0
  847. truthound/validators/timeout/advanced/prediction.py +773 -0
  848. truthound/validators/timeout/advanced/priority.py +618 -0
  849. truthound/validators/timeout/advanced/redis_backend.py +770 -0
  850. truthound/validators/timeout/advanced/retry.py +721 -0
  851. truthound/validators/timeout/advanced/sampling.py +788 -0
  852. truthound/validators/timeout/advanced/sla.py +661 -0
  853. truthound/validators/timeout/advanced/telemetry.py +804 -0
  854. truthound/validators/timeout/cascade.py +477 -0
  855. truthound/validators/timeout/deadline.py +657 -0
  856. truthound/validators/timeout/degradation.py +525 -0
  857. truthound/validators/timeout/distributed.py +597 -0
  858. truthound/validators/timeseries/__init__.py +89 -0
  859. truthound/validators/timeseries/base.py +326 -0
  860. truthound/validators/timeseries/completeness.py +617 -0
  861. truthound/validators/timeseries/gap.py +485 -0
  862. truthound/validators/timeseries/monotonic.py +310 -0
  863. truthound/validators/timeseries/seasonality.py +422 -0
  864. truthound/validators/timeseries/trend.py +510 -0
  865. truthound/validators/uniqueness/__init__.py +59 -0
  866. truthound/validators/uniqueness/approximate.py +475 -0
  867. truthound/validators/uniqueness/distinct_values.py +253 -0
  868. truthound/validators/uniqueness/duplicate.py +118 -0
  869. truthound/validators/uniqueness/primary_key.py +140 -0
  870. truthound/validators/uniqueness/unique.py +191 -0
  871. truthound/validators/uniqueness/within_record.py +599 -0
  872. truthound/validators/utils.py +756 -0
  873. truthound-1.0.8.dist-info/METADATA +474 -0
  874. truthound-1.0.8.dist-info/RECORD +877 -0
  875. truthound-1.0.8.dist-info/WHEEL +4 -0
  876. truthound-1.0.8.dist-info/entry_points.txt +2 -0
  877. truthound-1.0.8.dist-info/licenses/LICENSE +190 -0
@@ -0,0 +1,1532 @@
1
+ """Rule quality validation with labeled data support.
2
+
3
+ This module provides comprehensive quality validation using labeled datasets:
4
+ - Ground truth dataset management
5
+ - Statistical validation with confidence intervals
6
+ - Cross-validation and bootstrap methods
7
+ - A/B testing for rule comparison
8
+ - Labeled data collection and annotation
9
+
10
+ Key features:
11
+ - Pluggable validation strategy architecture
12
+ - Support for partial labeling (not all rows need labels)
13
+ - Integration with existing quality scoring
14
+ - Detailed validation reports with actionable insights
15
+
16
+ Example:
17
+ from truthound.profiler.validation import (
18
+ LabeledDataValidator,
19
+ ValidationDataset,
20
+ create_validation_suite,
21
+ )
22
+
23
+ # Create labeled dataset
24
+ dataset = ValidationDataset.from_csv(
25
+ "validation_data.csv",
26
+ label_column="is_valid",
27
+ )
28
+
29
+ # Validate rule quality
30
+ validator = LabeledDataValidator()
31
+ result = validator.validate(rule, dataset)
32
+
33
+ print(f"Precision: {result.precision:.2%}")
34
+ print(f"Recall: {result.recall:.2%}")
35
+ print(f"Confidence: {result.confidence:.2%}")
36
+ """
37
+
38
+ from __future__ import annotations
39
+
40
+ import hashlib
41
+ import json
42
+ import math
43
+ import random
44
+ import statistics
45
+ import threading
46
+ from abc import ABC, abstractmethod
47
+ from collections import defaultdict
48
+ from dataclasses import dataclass, field
49
+ from datetime import datetime
50
+ from enum import Enum
51
+ from pathlib import Path
52
+ from typing import Any, Callable, Generic, Iterator, Protocol, TypeVar
53
+
54
+ import polars as pl
55
+
56
+ from truthound.profiler.quality import (
57
+ ConfusionMatrix,
58
+ QualityLevel,
59
+ QualityMetrics,
60
+ RuleProtocol,
61
+ RuleType,
62
+ ValidationRule,
63
+ )
64
+
65
+
66
+ # =============================================================================
67
+ # Types and Enums
68
+ # =============================================================================
69
+
70
+
71
class LabelType(str, Enum):
    """Types of labels for validation data.

    Members also subclass ``str``, so each one compares equal to its
    string value (e.g. ``LabelType.BINARY == "binary"``).
    """

    BINARY = "binary"  # True/False for valid/invalid
    CATEGORICAL = "categorical"  # Multiple categories
    ORDINAL = "ordinal"  # Ordered categories (e.g., quality scores)
    CONFIDENCE = "confidence"  # Probability/confidence scores
78
+
79
+
80
class ValidationMethod(str, Enum):
    """Validation methods available.

    Members also subclass ``str``, so each one compares equal to its
    string value.
    """

    HOLDOUT = "holdout"  # Simple train/test split
    CROSS_VALIDATION = "cross_validation"  # K-fold cross-validation
    BOOTSTRAP = "bootstrap"  # Bootstrap resampling
    TEMPORAL = "temporal"  # Time-based validation
    STRATIFIED = "stratified"  # Stratified sampling
88
+
89
+
90
class ValidationStatus(str, Enum):
    """Status of validation result.

    Members also subclass ``str``, so each one compares equal to its
    string value.
    """

    PASSED = "passed"  # Rule meets quality threshold
    FAILED = "failed"  # Rule below quality threshold
    INCONCLUSIVE = "inconclusive"  # Not enough data/confidence
    DEGRADED = "degraded"  # Quality dropped from previous
97
+
98
+
99
+ # =============================================================================
100
+ # Labeled Data Management
101
+ # =============================================================================
102
+
103
+
104
@dataclass
class LabeledRow:
    """A single labeled data point.

    Attributes:
        row_id: Identifier of the row within its dataset.
        data: Mapping of column name to value for this row.
        label: Ground-truth label attached to the row.
        label_type: Kind of label stored in ``label``.
        confidence: Confidence in the label (defaults to 1.0).
        source: Where the label came from.
        annotated_at: Timestamp of the annotation; defaults to
            ``datetime.now()`` at construction time.
        annotated_by: Who produced the label.
        notes: Free-form notes about the row or its label.
    """

    row_id: str | int
    data: dict[str, Any]
    label: bool | str | float
    label_type: LabelType = LabelType.BINARY
    confidence: float = 1.0  # Confidence in the label
    source: str = ""  # Where the label came from
    annotated_at: datetime = field(default_factory=datetime.now)
    annotated_by: str = ""
    notes: str = ""
117
+
118
+
119
@dataclass
class ValidationDataset:
    """Dataset with labeled ground truth.

    Manages labeled data for validating rule quality.

    Attributes:
        name: Dataset name
        rows: Labeled data rows
        label_type: Type of labels
        label_column: Name of label column
        metadata: Additional metadata
        created_at: Creation timestamp (``datetime.now()`` by default)
        version: Version string of the dataset
    """

    name: str
    rows: list[LabeledRow] = field(default_factory=list)
    label_type: LabelType = LabelType.BINARY
    label_column: str = "is_valid"
    metadata: dict[str, Any] = field(default_factory=dict)
    created_at: datetime = field(default_factory=datetime.now)
    version: str = "1.0"

    def __len__(self) -> int:
        """Return the number of labeled rows."""
        return len(self.rows)

    def __iter__(self) -> Iterator[LabeledRow]:
        """Iterate over the labeled rows in stored order."""
        return iter(self.rows)

    @staticmethod
    def _make_rng(random_seed: int | None) -> Any:
        """Return the shuffle source used by ``split`` and ``get_folds``.

        With a seed, a private ``random.Random`` instance is returned so the
        process-wide generator is not reseeded as a side effect; it produces
        the same sequence that ``random.seed(seed)`` followed by module-level
        calls would. Without a seed, the shared ``random`` module is used.

        Args:
            random_seed: Seed for reproducibility, or None

        Returns:
            An object exposing ``shuffle``
        """
        if random_seed is None:
            return random
        return random.Random(random_seed)

    @classmethod
    def from_dataframe(
        cls,
        df: pl.DataFrame,
        label_column: str,
        name: str = "validation_set",
        id_column: str | None = None,
        label_type: LabelType = LabelType.BINARY,
    ) -> "ValidationDataset":
        """Create dataset from a Polars DataFrame.

        The label column is removed from each row's ``data``; the id column,
        when given, is left in place. If ``label_column`` is not present in
        the frame, every row gets a ``None`` label.

        Args:
            df: DataFrame with data and labels
            label_column: Column containing labels
            name: Dataset name
            id_column: Column to use as row ID (uses index if None)
            label_type: Type of labels

        Returns:
            ValidationDataset instance
        """
        rows = []
        for i, row in enumerate(df.iter_rows(named=True)):
            label = row.pop(label_column, None)
            # Fall back to the positional index when the id column is absent.
            row_id = row.get(id_column, i) if id_column else i

            rows.append(LabeledRow(
                row_id=row_id,
                data=row,
                label=label,
                label_type=label_type,
            ))

        return cls(
            name=name,
            rows=rows,
            label_type=label_type,
            label_column=label_column,
        )

    @classmethod
    def from_csv(
        cls,
        path: str | Path,
        label_column: str,
        name: str | None = None,
        **kwargs: Any,
    ) -> "ValidationDataset":
        """Load dataset from CSV file.

        Args:
            path: Path to CSV file
            label_column: Column containing labels
            name: Dataset name (uses filename stem if None)
            **kwargs: Additional arguments for from_dataframe

        Returns:
            ValidationDataset instance
        """
        path = Path(path)
        df = pl.read_csv(path)
        return cls.from_dataframe(
            df,
            label_column=label_column,
            name=name or path.stem,
            **kwargs,
        )

    @classmethod
    def from_json(
        cls,
        path: str | Path,
    ) -> "ValidationDataset":
        """Load dataset from JSON file.

        Each row entry must provide ``row_id``, ``data`` and ``label``; the
        remaining row and dataset keys are optional and fall back to defaults.

        Args:
            path: Path to JSON file

        Returns:
            ValidationDataset instance

        Raises:
            KeyError: If a row lacks ``row_id``, ``data`` or ``label``
        """
        path = Path(path)
        # Read as UTF-8 explicitly rather than the platform default encoding.
        with open(path, encoding="utf-8") as f:
            data = json.load(f)

        rows = [
            LabeledRow(
                row_id=r["row_id"],
                data=r["data"],
                label=r["label"],
                label_type=LabelType(r.get("label_type", "binary")),
                confidence=r.get("confidence", 1.0),
                source=r.get("source", ""),
                annotated_at=datetime.fromisoformat(r["annotated_at"])
                if "annotated_at" in r else datetime.now(),
                annotated_by=r.get("annotated_by", ""),
                notes=r.get("notes", ""),
            )
            for r in data.get("rows", [])
        ]

        return cls(
            name=data.get("name", path.stem),
            rows=rows,
            label_type=LabelType(data.get("label_type", "binary")),
            label_column=data.get("label_column", "is_valid"),
            metadata=data.get("metadata", {}),
            created_at=datetime.fromisoformat(data["created_at"])
            if "created_at" in data else datetime.now(),
            version=data.get("version", "1.0"),
        )

    def to_dataframe(self) -> pl.DataFrame:
        """Convert to Polars DataFrame with labels.

        Returns:
            DataFrame with data and label column (empty frame when the
            dataset has no rows)
        """
        if not self.rows:
            return pl.DataFrame()

        # Collect all data
        data_dicts = [row.data for row in self.rows]
        labels = [row.label for row in self.rows]

        # Create DataFrame
        df = pl.DataFrame(data_dicts)
        df = df.with_columns(pl.Series(self.label_column, labels))

        return df

    def to_json(self, path: str | Path) -> None:
        """Save dataset to JSON file.

        Parent directories are created if they do not exist.

        Args:
            path: Output path
        """
        data = {
            "name": self.name,
            "label_type": self.label_type.value,
            "label_column": self.label_column,
            "metadata": self.metadata,
            "created_at": self.created_at.isoformat(),
            "version": self.version,
            "rows": [
                {
                    "row_id": r.row_id,
                    "data": r.data,
                    "label": r.label,
                    "label_type": r.label_type.value,
                    "confidence": r.confidence,
                    "source": r.source,
                    "annotated_at": r.annotated_at.isoformat(),
                    "annotated_by": r.annotated_by,
                    "notes": r.notes,
                }
                for r in self.rows
            ],
        }

        path = Path(path)
        path.parent.mkdir(parents=True, exist_ok=True)
        # Write as UTF-8 explicitly so the file round-trips through from_json.
        with open(path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)

    def split(
        self,
        test_ratio: float = 0.2,
        random_seed: int | None = None,
        stratify: bool = True,
    ) -> tuple["ValidationDataset", "ValidationDataset"]:
        """Split dataset into training and test sets.

        Stratification only applies to binary-labeled datasets; rows are
        grouped by the truthiness of their label, so falsy labels (including
        ``None``) land in the negative group. The test side takes at least
        one row from each group (or from the whole dataset when not
        stratifying) whenever one is available.

        Args:
            test_ratio: Ratio of data for test set
            random_seed: Random seed for reproducibility
            stratify: Whether to stratify by label

        Returns:
            Tuple of (train_dataset, test_dataset)
        """
        # A private generator keeps the global random state untouched.
        rng = self._make_rng(random_seed)

        if stratify and self.label_type == LabelType.BINARY:
            # Stratified split
            positive = [r for r in self.rows if r.label]
            negative = [r for r in self.rows if not r.label]

            rng.shuffle(positive)
            rng.shuffle(negative)

            n_pos_test = max(1, int(len(positive) * test_ratio))
            n_neg_test = max(1, int(len(negative) * test_ratio))

            test_rows = positive[:n_pos_test] + negative[:n_neg_test]
            train_rows = positive[n_pos_test:] + negative[n_neg_test:]
        else:
            # Random split
            rows = list(self.rows)
            rng.shuffle(rows)
            n_test = max(1, int(len(rows) * test_ratio))
            test_rows = rows[:n_test]
            train_rows = rows[n_test:]

        train_ds = ValidationDataset(
            name=f"{self.name}_train",
            rows=train_rows,
            label_type=self.label_type,
            label_column=self.label_column,
            metadata={**self.metadata, "split": "train"},
        )

        test_ds = ValidationDataset(
            name=f"{self.name}_test",
            rows=test_rows,
            label_type=self.label_type,
            label_column=self.label_column,
            metadata={**self.metadata, "split": "test"},
        )

        return train_ds, test_ds

    def get_folds(
        self,
        n_folds: int = 5,
        random_seed: int | None = None,
    ) -> Iterator[tuple["ValidationDataset", "ValidationDataset"]]:
        """Generate k-fold cross-validation splits.

        Rows are shuffled once and cut into ``n_folds`` contiguous slices of
        ``len(rows) // n_folds`` rows; the last fold also absorbs any
        remainder. When ``n_folds`` exceeds the number of rows, the earlier
        folds therefore have empty test sets.

        Args:
            n_folds: Number of folds (must be at least 1)
            random_seed: Random seed for reproducibility

        Yields:
            Tuples of (train_fold, test_fold)

        Raises:
            ValueError: If ``n_folds`` is less than 1. Because this is a
                generator, the error surfaces on first iteration.
        """
        if n_folds < 1:
            raise ValueError(f"n_folds must be at least 1, got {n_folds}")

        # A private generator keeps the global random state untouched.
        rng = self._make_rng(random_seed)

        rows = list(self.rows)
        rng.shuffle(rows)

        fold_size = len(rows) // n_folds

        for i in range(n_folds):
            start = i * fold_size
            # The final fold runs to the end so no row is dropped.
            end = start + fold_size if i < n_folds - 1 else len(rows)

            test_rows = rows[start:end]
            train_rows = rows[:start] + rows[end:]

            train_ds = ValidationDataset(
                name=f"{self.name}_fold{i}_train",
                rows=train_rows,
                label_type=self.label_type,
                label_column=self.label_column,
            )

            test_ds = ValidationDataset(
                name=f"{self.name}_fold{i}_test",
                rows=test_rows,
                label_type=self.label_type,
                label_column=self.label_column,
            )

            yield train_ds, test_ds

    def filter_by_confidence(
        self,
        min_confidence: float = 0.8,
    ) -> "ValidationDataset":
        """Filter to high-confidence labels only.

        Args:
            min_confidence: Minimum label confidence (inclusive)

        Returns:
            Filtered dataset
        """
        filtered_rows = [r for r in self.rows if r.confidence >= min_confidence]
        return ValidationDataset(
            name=f"{self.name}_high_confidence",
            rows=filtered_rows,
            label_type=self.label_type,
            label_column=self.label_column,
            metadata={**self.metadata, "min_confidence": min_confidence},
        )

    def get_label_distribution(self) -> dict[Any, int]:
        """Get distribution of labels.

        Returns:
            Dictionary mapping labels to counts
        """
        distribution: dict[Any, int] = defaultdict(int)
        for row in self.rows:
            distribution[row.label] += 1
        return dict(distribution)
446
+
447
+
448
+ # =============================================================================
449
+ # Validation Results
450
+ # =============================================================================
451
+
452
+
453
+ @dataclass
454
+ class ValidationResult:
455
+ """Comprehensive validation result.
456
+
457
+ Contains detailed metrics, confidence intervals, and recommendations.
458
+ """
459
+
460
+ # Core metrics
461
+ precision: float = 0.0
462
+ recall: float = 0.0
463
+ f1_score: float = 0.0
464
+ accuracy: float = 0.0
465
+
466
+ # Confidence intervals (95%)
467
+ precision_ci: tuple[float, float] = (0.0, 1.0)
468
+ recall_ci: tuple[float, float] = (0.0, 1.0)
469
+ f1_ci: tuple[float, float] = (0.0, 1.0)
470
+
471
+ # Validation metadata
472
+ n_samples: int = 0
473
+ n_positive: int = 0
474
+ n_negative: int = 0
475
+ confidence: float = 0.0
476
+
477
+ # Confusion matrix
478
+ true_positives: int = 0
479
+ true_negatives: int = 0
480
+ false_positives: int = 0
481
+ false_negatives: int = 0
482
+
483
+ # Status and recommendations
484
+ status: ValidationStatus = ValidationStatus.INCONCLUSIVE
485
+ quality_level: QualityLevel = QualityLevel.UNACCEPTABLE
486
+ recommendation: str = ""
487
+ warnings: list[str] = field(default_factory=list)
488
+
489
+ # Method details
490
+ validation_method: ValidationMethod = ValidationMethod.HOLDOUT
491
+ method_details: dict[str, Any] = field(default_factory=dict)
492
+
493
+ # Timing
494
+ validated_at: datetime = field(default_factory=datetime.now)
495
+ duration_ms: float = 0.0
496
+
497
+ @property
498
+ def confusion_matrix(self) -> ConfusionMatrix:
499
+ """Get confusion matrix."""
500
+ return ConfusionMatrix(
501
+ true_positives=self.true_positives,
502
+ true_negatives=self.true_negatives,
503
+ false_positives=self.false_positives,
504
+ false_negatives=self.false_negatives,
505
+ )
506
+
507
+ @property
508
+ def specificity(self) -> float:
509
+ """Calculate specificity."""
510
+ if self.true_negatives + self.false_positives == 0:
511
+ return 0.0
512
+ return self.true_negatives / (self.true_negatives + self.false_positives)
513
+
514
+ @property
515
+ def mcc(self) -> float:
516
+ """Calculate Matthews Correlation Coefficient."""
517
+ return self.confusion_matrix.mcc
518
+
519
def to_dict(self) -> dict[str, Any]:
    """Serialize the result into a plain dictionary.

    Enum-valued fields are emitted through ``.value`` and the timestamp
    through ``isoformat()``; the four outcome counts are nested under
    ``"confusion_matrix"``.
    """
    payload: dict[str, Any] = {}

    # Scalar metrics, intervals and sample counts are copied verbatim.
    for attr in (
        "precision",
        "recall",
        "f1_score",
        "accuracy",
        "precision_ci",
        "recall_ci",
        "f1_ci",
        "n_samples",
        "n_positive",
        "n_negative",
        "confidence",
    ):
        payload[attr] = getattr(self, attr)

    matrix: dict[str, Any] = {}
    for attr in (
        "true_positives",
        "true_negatives",
        "false_positives",
        "false_negatives",
    ):
        matrix[attr] = getattr(self, attr)
    payload["confusion_matrix"] = matrix

    payload["status"] = self.status.value
    payload["quality_level"] = self.quality_level.value
    payload["recommendation"] = self.recommendation
    payload["warnings"] = self.warnings
    payload["validation_method"] = self.validation_method.value
    payload["method_details"] = self.method_details
    payload["validated_at"] = self.validated_at.isoformat()
    payload["duration_ms"] = self.duration_ms
    return payload
548
+
549
def to_quality_metrics(self) -> QualityMetrics:
    """Convert this result into a ``QualityMetrics`` object.

    The confusion matrix is handed to ``QualityMetrics.from_confusion_matrix``
    with ``n_samples`` used for both the sample size and the population size.
    """
    matrix = self.confusion_matrix
    return QualityMetrics.from_confusion_matrix(
        matrix,
        sample_size=self.n_samples,
        population_size=self.n_samples,
    )
556
+
557
+
558
@dataclass
class ValidationReport:
    """Aggregated validation outcome for a set of named rules."""

    rule_results: dict[str, ValidationResult] = field(default_factory=dict)
    overall_status: ValidationStatus = ValidationStatus.INCONCLUSIVE
    summary: str = ""
    recommendations: list[str] = field(default_factory=list)
    dataset_info: dict[str, Any] = field(default_factory=dict)
    generated_at: datetime = field(default_factory=datetime.now)

    def add_result(self, rule_name: str, result: ValidationResult) -> None:
        """Store ``result`` under ``rule_name`` and refresh the overall status."""
        self.rule_results[rule_name] = result
        self._update_overall_status()

    def _update_overall_status(self) -> None:
        """Recompute ``overall_status`` from the per-rule statuses.

        Precedence: all PASSED -> PASSED; otherwise any FAILED -> FAILED;
        otherwise any DEGRADED -> DEGRADED; otherwise (including the empty
        report) INCONCLUSIVE.
        """
        statuses = [entry.status for entry in self.rule_results.values()]

        if not statuses:
            overall = ValidationStatus.INCONCLUSIVE
        elif statuses.count(ValidationStatus.PASSED) == len(statuses):
            overall = ValidationStatus.PASSED
        elif ValidationStatus.FAILED in statuses:
            overall = ValidationStatus.FAILED
        elif ValidationStatus.DEGRADED in statuses:
            overall = ValidationStatus.DEGRADED
        else:
            overall = ValidationStatus.INCONCLUSIVE

        self.overall_status = overall

    def to_dict(self) -> dict[str, Any]:
        """Serialize the report, including every rule result, to a dictionary."""
        serialized_rules = {}
        for name, result in self.rule_results.items():
            serialized_rules[name] = result.to_dict()

        return {
            "rule_results": serialized_rules,
            "overall_status": self.overall_status.value,
            "summary": self.summary,
            "recommendations": self.recommendations,
            "dataset_info": self.dataset_info,
            "generated_at": self.generated_at.isoformat(),
        }
604
+
605
+
606
+ # =============================================================================
607
+ # Validation Strategies
608
+ # =============================================================================
609
+
610
+
611
class ValidationStrategy(ABC):
    """Interface that every rule-validation strategy implements."""

    name: str = "base"

    @abstractmethod
    def validate(
        self,
        rule: RuleProtocol | ValidationRule,
        dataset: ValidationDataset,
        column: str,
    ) -> ValidationResult:
        """Score ``rule`` against the labeled ``dataset``.

        Args:
            rule: Rule under evaluation
            dataset: Labeled validation dataset
            column: Name of the column the rule is applied to

        Returns:
            The validation result for this rule
        """
        ...
634
+
635
+
636
class HoldoutValidation(ValidationStrategy):
    """Holdout strategy: score the rule once on the full labeled dataset."""

    name = "holdout"

    def __init__(
        self,
        quality_threshold: float = 0.70,
        min_samples: int = 30,
    ):
        self.min_samples = min_samples
        self.quality_threshold = quality_threshold

    def validate(
        self,
        rule: RuleProtocol | ValidationRule,
        dataset: ValidationDataset,
        column: str,
    ) -> ValidationResult:
        """Run the rule over the dataset and compare against its labels.

        Returns an INCONCLUSIVE result without running the rule when the
        dataset holds fewer than ``min_samples`` rows; otherwise PASSED or
        FAILED depending on whether F1 reaches ``quality_threshold``.
        """
        started = datetime.now()

        # Guard: refuse to score on too little data.
        if len(dataset) < self.min_samples:
            return ValidationResult(
                status=ValidationStatus.INCONCLUSIVE,
                recommendation=f"Need at least {self.min_samples} samples, got {len(dataset)}",
                n_samples=len(dataset),
            )

        frame = dataset.to_dataframe()
        predicted = rule.validate_column(frame, column)
        labels = frame.get_column(dataset.label_column)

        result = self._calculate_metrics(predicted.to_list(), labels.to_list())

        if result.f1_score < self.quality_threshold:
            result.status = ValidationStatus.FAILED
            result.recommendation = (
                f"Rule below quality threshold (F1={result.f1_score:.2%} < {self.quality_threshold:.0%})"
            )
        else:
            result.status = ValidationStatus.PASSED
            result.recommendation = (
                f"Rule meets quality threshold (F1={result.f1_score:.2%} >= {self.quality_threshold:.0%})"
            )

        result.quality_level = QualityLevel.from_f1(result.f1_score)
        result.validation_method = ValidationMethod.HOLDOUT
        elapsed = datetime.now() - started
        result.duration_ms = elapsed.total_seconds() * 1000

        return result

    def _calculate_metrics(
        self,
        predictions: list[bool],
        ground_truth: list[bool],
    ) -> ValidationResult:
        """Derive confusion-matrix counts, P/R/F1/accuracy and Wilson intervals."""
        # Tally each (prediction, label) pair by its truthiness.
        tally = {
            (True, True): 0,
            (False, False): 0,
            (True, False): 0,
            (False, True): 0,
        }
        for pred, truth in zip(predictions, ground_truth):
            tally[(bool(pred), bool(truth))] += 1

        tp = tally[(True, True)]
        tn = tally[(False, False)]
        fp = tally[(True, False)]
        fn = tally[(False, True)]

        n = len(predictions)
        n_pos = len([label for label in ground_truth if label])
        n_neg = n - n_pos

        predicted_positive = tp + fp
        actual_positive = tp + fn

        precision = tp / predicted_positive if predicted_positive > 0 else 0.0
        recall = tp / actual_positive if actual_positive > 0 else 0.0
        if (precision + recall) > 0:
            f1 = 2 * precision * recall / (precision + recall)
        else:
            f1 = 0.0
        accuracy = (tp + tn) / n if n > 0 else 0.0

        return ValidationResult(
            precision=precision,
            recall=recall,
            f1_score=f1,
            accuracy=accuracy,
            # Wilson score intervals for the two proportions.
            precision_ci=self._wilson_ci(tp, predicted_positive),
            recall_ci=self._wilson_ci(tp, actual_positive),
            n_samples=n,
            n_positive=n_pos,
            n_negative=n_neg,
            true_positives=tp,
            true_negatives=tn,
            false_positives=fp,
            false_negatives=fn,
            # Confidence grows linearly with sample count and saturates at 100.
            confidence=min(1.0, n / 100),
        )

    @staticmethod
    def _wilson_ci(
        successes: int,
        trials: int,
        z: float = 1.96,
    ) -> tuple[float, float]:
        """Wilson score interval for ``successes`` out of ``trials``.

        Returns the uninformative interval (0.0, 1.0) when ``trials`` is zero;
        bounds are clamped to [0, 1].
        """
        if trials == 0:
            return (0.0, 1.0)

        z_sq = z * z
        p = successes / trials
        denominator = 1 + z_sq / trials
        centre = p + z_sq / (2 * trials)
        margin = z * math.sqrt((p * (1 - p) + z_sq / (4 * trials)) / trials)

        low = (centre - margin) / denominator
        high = (centre + margin) / denominator
        return (max(0.0, low), min(1.0, high))
765
+
766
+
767
class CrossValidationStrategy(ValidationStrategy):
    """K-fold cross-validation strategy.

    The rule is evaluated on the test split of every fold produced by
    ``dataset.get_folds``; per-fold precision/recall/F1 are averaged and the
    confusion-matrix counts are summed across folds.
    """

    name = "cross_validation"

    def __init__(
        self,
        n_folds: int = 5,
        quality_threshold: float = 0.70,
        random_seed: int | None = None,
    ):
        """Initialize the strategy.

        Args:
            n_folds: Number of folds requested from the dataset
            quality_threshold: Minimum mean F1 for a PASSED status
            random_seed: Seed forwarded to ``dataset.get_folds``
        """
        self.n_folds = n_folds
        self.quality_threshold = quality_threshold
        self.random_seed = random_seed

    def validate(
        self,
        rule: RuleProtocol | ValidationRule,
        dataset: ValidationDataset,
        column: str,
    ) -> ValidationResult:
        """Validate using k-fold cross-validation.

        Args:
            rule: Rule to validate
            dataset: Labeled validation dataset
            column: Column to validate

        Returns:
            Aggregated validation result. When the dataset yields no folds an
            INCONCLUSIVE result is returned instead of failing while averaging
            an empty list of fold metrics.
        """
        start_time = datetime.now()

        fold_metrics: list[dict[str, float]] = []

        # Only the test split is scored; the train split is unused here.
        for _train_ds, test_ds in dataset.get_folds(
            n_folds=self.n_folds,
            random_seed=self.random_seed,
        ):
            df = test_ds.to_dataframe()
            predictions = rule.validate_column(df, column)
            ground_truth = df.get_column(dataset.label_column)

            fold_metrics.append(
                self._calculate_fold_metrics(
                    predictions.to_list(),
                    ground_truth.to_list(),
                )
            )

        if not fold_metrics:
            # statistics.mean() raises on empty input, so report the situation
            # explicitly rather than crashing during aggregation.
            return ValidationResult(
                status=ValidationStatus.INCONCLUSIVE,
                recommendation="No cross-validation folds could be generated from the dataset",
                n_samples=len(dataset),
                validation_method=ValidationMethod.CROSS_VALIDATION,
                method_details={"n_folds": self.n_folds, "fold_f1_scores": []},
                duration_ms=(datetime.now() - start_time).total_seconds() * 1000,
            )

        # Aggregate across folds
        result = self._aggregate_folds(fold_metrics, len(dataset))

        # Determine status
        if result.f1_score >= self.quality_threshold:
            result.status = ValidationStatus.PASSED
        else:
            result.status = ValidationStatus.FAILED

        result.quality_level = QualityLevel.from_f1(result.f1_score)
        result.validation_method = ValidationMethod.CROSS_VALIDATION
        result.method_details = {
            "n_folds": self.n_folds,
            "fold_f1_scores": [m["f1"] for m in fold_metrics],
        }
        result.duration_ms = (datetime.now() - start_time).total_seconds() * 1000

        return result

    def _calculate_fold_metrics(
        self,
        predictions: list[bool],
        ground_truth: list[bool],
    ) -> dict[str, float]:
        """Calculate precision, recall, F1 and confusion counts for one fold.

        Ratios with a zero denominator are reported as 0.0.
        """
        tp = tn = fp = fn = 0

        for pred, truth in zip(predictions, ground_truth):
            if pred and truth:
                tp += 1
            elif not pred and not truth:
                tn += 1
            elif pred and not truth:
                fp += 1
            else:
                fn += 1

        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0

        return {
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "tp": tp,
            "tn": tn,
            "fp": fp,
            "fn": fn,
        }

    def _aggregate_folds(
        self,
        fold_metrics: list[dict[str, float]],
        total_samples: int,
    ) -> ValidationResult:
        """Aggregate metrics across folds.

        Args:
            fold_metrics: Per-fold dictionaries from ``_calculate_fold_metrics``;
                must contain at least one entry (``statistics.mean`` rejects
                empty input)
            total_samples: Value reported as ``n_samples``

        Returns:
            Result carrying the mean precision/recall/F1, normal-approximation
            95% intervals over the folds, and the summed confusion matrix.
        """
        precisions = [m["precision"] for m in fold_metrics]
        recalls = [m["recall"] for m in fold_metrics]
        f1s = [m["f1"] for m in fold_metrics]

        # Calculate mean and confidence intervals
        mean_precision = statistics.mean(precisions)
        mean_recall = statistics.mean(recalls)
        mean_f1 = statistics.mean(f1s)

        # stdev needs at least two data points; a single fold has no spread.
        std_precision = statistics.stdev(precisions) if len(precisions) > 1 else 0.0
        std_recall = statistics.stdev(recalls) if len(recalls) > 1 else 0.0
        std_f1 = statistics.stdev(f1s) if len(f1s) > 1 else 0.0

        # 95% CI = mean +/- 1.96 * std / sqrt(n)
        z = 1.96
        n = len(fold_metrics)
        margin_precision = z * std_precision / math.sqrt(n) if n > 0 else 0.0
        margin_recall = z * std_recall / math.sqrt(n) if n > 0 else 0.0
        margin_f1 = z * std_f1 / math.sqrt(n) if n > 0 else 0.0

        # Sum confusion matrix across folds
        total_tp = sum(int(m["tp"]) for m in fold_metrics)
        total_tn = sum(int(m["tn"]) for m in fold_metrics)
        total_fp = sum(int(m["fp"]) for m in fold_metrics)
        total_fn = sum(int(m["fn"]) for m in fold_metrics)
        total_evaluated = total_tp + total_tn + total_fp + total_fn

        # Calculate consistency-based confidence: the coefficient of variation
        # of F1 across folds, inverted so that stable folds score near 1.0.
        f1_cv = std_f1 / mean_f1 if mean_f1 > 0 else 1.0
        confidence = max(0.0, 1.0 - f1_cv)

        return ValidationResult(
            precision=mean_precision,
            recall=mean_recall,
            f1_score=mean_f1,
            accuracy=(total_tp + total_tn) / total_evaluated if total_evaluated > 0 else 0.0,
            precision_ci=(
                max(0.0, mean_precision - margin_precision),
                min(1.0, mean_precision + margin_precision),
            ),
            recall_ci=(
                max(0.0, mean_recall - margin_recall),
                min(1.0, mean_recall + margin_recall),
            ),
            f1_ci=(
                max(0.0, mean_f1 - margin_f1),
                min(1.0, mean_f1 + margin_f1),
            ),
            n_samples=total_samples,
            # Label counts follow from the summed confusion matrix. Leaving
            # them at their 0 default makes downstream label-balance checks
            # fire on every cross-validated result.
            n_positive=total_tp + total_fn,
            n_negative=total_tn + total_fp,
            true_positives=total_tp,
            true_negatives=total_tn,
            false_positives=total_fp,
            false_negatives=total_fn,
            confidence=confidence,
            recommendation=f"Cross-validated F1: {mean_f1:.2%} (±{margin_f1:.2%})",
        )
921
+
922
+
923
class BootstrapValidation(ValidationStrategy):
    """Bootstrap resampling validation strategy.

    Rows are repeatedly drawn with replacement, the rule is scored on each
    resample, and percentile intervals are taken over the resulting metrics.
    """

    name = "bootstrap"

    def __init__(
        self,
        n_iterations: int = 1000,
        sample_ratio: float = 0.8,
        quality_threshold: float = 0.70,
        random_seed: int | None = None,
    ):
        """Initialize the strategy.

        Args:
            n_iterations: Number of bootstrap resamples
            sample_ratio: Resample size as a fraction of the dataset size
            quality_threshold: Minimum mean F1 for a PASSED status
            random_seed: Seed for reproducible resampling
        """
        self.n_iterations = n_iterations
        self.sample_ratio = sample_ratio
        self.quality_threshold = quality_threshold
        self.random_seed = random_seed

    def validate(
        self,
        rule: RuleProtocol | ValidationRule,
        dataset: ValidationDataset,
        column: str,
    ) -> ValidationResult:
        """Validate using bootstrap resampling.

        Args:
            rule: Rule to validate
            dataset: Labeled validation dataset
            column: Column to validate

        Returns:
            Validation result with percentile 95% intervals. An INCONCLUSIVE
            result is returned when there is nothing to resample (empty
            dataset, zero-sized resample, or no iterations requested).
        """
        start_time = datetime.now()

        # Private generator: draws the same sequence as seeding the
        # module-level generator would, without reseeding process-wide state
        # that unrelated code may rely on.
        rng = random.Random(self.random_seed)

        df = dataset.to_dataframe()
        n = len(df)
        sample_size = int(n * self.sample_ratio)
        method_details = {
            "n_iterations": self.n_iterations,
            "sample_ratio": self.sample_ratio,
        }

        if n == 0 or sample_size == 0 or self.n_iterations <= 0:
            # randint(0, -1) and statistics.mean([]) would both raise here.
            return ValidationResult(
                status=ValidationStatus.INCONCLUSIVE,
                recommendation="Not enough data or iterations for bootstrap validation",
                n_samples=n,
                validation_method=ValidationMethod.BOOTSTRAP,
                method_details=method_details,
                duration_ms=(datetime.now() - start_time).total_seconds() * 1000,
            )

        # Label balance of the full dataset. Leaving these counts at their 0
        # default makes downstream label-balance checks fire on every
        # bootstrap result.
        labels = df.get_column(dataset.label_column).to_list()
        n_pos = sum(1 for label in labels if label)

        # Collect bootstrap samples
        f1_scores: list[float] = []
        precisions: list[float] = []
        recalls: list[float] = []

        for _ in range(self.n_iterations):
            # Sample with replacement
            indices = [rng.randint(0, n - 1) for _ in range(sample_size)]
            sample_df = df[indices]

            predictions = rule.validate_column(sample_df, column)
            ground_truth = sample_df.get_column(dataset.label_column)

            metrics = self._calculate_metrics(
                predictions.to_list(),
                ground_truth.to_list(),
            )

            f1_scores.append(metrics["f1"])
            precisions.append(metrics["precision"])
            recalls.append(metrics["recall"])

        # Calculate percentile confidence intervals
        f1_scores.sort()
        precisions.sort()
        recalls.sort()

        last_idx = self.n_iterations - 1
        lower_idx = min(int(0.025 * self.n_iterations), last_idx)
        upper_idx = min(int(0.975 * self.n_iterations), last_idx)

        result = ValidationResult(
            precision=statistics.mean(precisions),
            recall=statistics.mean(recalls),
            f1_score=statistics.mean(f1_scores),
            precision_ci=(precisions[lower_idx], precisions[upper_idx]),
            recall_ci=(recalls[lower_idx], recalls[upper_idx]),
            f1_ci=(f1_scores[lower_idx], f1_scores[upper_idx]),
            n_samples=n,
            n_positive=n_pos,
            n_negative=n - n_pos,
            # Spread of F1 across resamples; stdev needs two or more points.
            confidence=(1.0 - statistics.stdev(f1_scores)) if len(f1_scores) > 1 else 0.5,
            validation_method=ValidationMethod.BOOTSTRAP,
            method_details=method_details,
        )

        if result.f1_score >= self.quality_threshold:
            result.status = ValidationStatus.PASSED
        else:
            result.status = ValidationStatus.FAILED

        result.quality_level = QualityLevel.from_f1(result.f1_score)
        result.recommendation = (
            f"Bootstrap F1: {result.f1_score:.2%} "
            f"(95% CI: [{result.f1_ci[0]:.2%}, {result.f1_ci[1]:.2%}])"
        )
        result.duration_ms = (datetime.now() - start_time).total_seconds() * 1000

        return result

    def _calculate_metrics(
        self,
        predictions: list[bool],
        ground_truth: list[bool],
    ) -> dict[str, float]:
        """Calculate precision, recall and F1 for one bootstrap sample.

        Ratios with a zero denominator are reported as 0.0.
        """
        tp = fp = fn = 0

        # True negatives do not enter precision, recall or F1.
        for pred, truth in zip(predictions, ground_truth):
            if pred and truth:
                tp += 1
            elif pred:
                fp += 1
            elif truth:
                fn += 1

        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0

        return {"precision": precision, "recall": recall, "f1": f1}
1039
+
1040
+
1041
+ # =============================================================================
1042
+ # Strategy Registry
1043
+ # =============================================================================
1044
+
1045
+
1046
class ValidationStrategyRegistry:
    """Name-to-class lookup table for validation strategies."""

    def __init__(self) -> None:
        self._strategies: dict[str, type[ValidationStrategy]] = {}

    def register(
        self,
        name: str,
        strategy_class: type[ValidationStrategy],
    ) -> None:
        """Bind ``name`` to ``strategy_class``, replacing any earlier binding."""
        self._strategies[name] = strategy_class

    def create(self, name: str, **kwargs: Any) -> ValidationStrategy:
        """Instantiate the strategy registered under ``name``.

        Keyword arguments are forwarded to the strategy constructor.

        Raises:
            KeyError: If no strategy is registered under ``name``
        """
        if name in self._strategies:
            strategy_cls = self._strategies[name]
            return strategy_cls(**kwargs)

        raise KeyError(
            f"Unknown strategy: {name}. "
            f"Available: {list(self._strategies.keys())}"
        )

    def list_strategies(self) -> list[str]:
        """Return the registered names in registration order."""
        return [*self._strategies]
1072
+
1073
+
1074
# Global registry: a module-level instance pre-populated with the three
# built-in strategies, keyed by the names passed to
# ``validation_strategy_registry.create``.
validation_strategy_registry = ValidationStrategyRegistry()
validation_strategy_registry.register("holdout", HoldoutValidation)
validation_strategy_registry.register("cross_validation", CrossValidationStrategy)
validation_strategy_registry.register("bootstrap", BootstrapValidation)
1079
+
1080
+
1081
+ # =============================================================================
1082
+ # Main Validator
1083
+ # =============================================================================
1084
+
1085
+
1086
class LabeledDataValidator:
    """Main validator using labeled ground truth data.

    Provides comprehensive rule quality validation with:
    - Multiple validation strategies
    - Detailed confidence intervals
    - Actionable recommendations

    Results are memoized per (rule, column, dataset) key; access to the cache
    is guarded by a lock.

    Example:
        validator = LabeledDataValidator(strategy="cross_validation")

        # Validate single rule
        result = validator.validate(rule, dataset, "email_column")
        print(f"F1: {result.f1_score:.2%}")

        # Validate multiple rules
        report = validator.validate_all(rules, dataset)
        print(report.overall_status)
    """

    def __init__(
        self,
        strategy: str | ValidationStrategy = "cross_validation",
        strategy_options: dict[str, Any] | None = None,
        quality_threshold: float = 0.70,
        min_samples: int = 30,
    ):
        """Initialize validator.

        Args:
            strategy: Validation strategy name or instance
            strategy_options: Options for strategy construction (only used
                when ``strategy`` is a name); the mapping is not modified
            quality_threshold: Minimum F1 for passing
            min_samples: Minimum samples required
        """
        self.quality_threshold = quality_threshold
        self.min_samples = min_samples
        self._cache: dict[str, ValidationResult] = {}
        self._lock = threading.Lock()

        if isinstance(strategy, ValidationStrategy):
            self._strategy = strategy
        else:
            # Work on a copy: setdefault() mutates in place, and the caller's
            # dictionary must not silently gain a "quality_threshold" entry.
            options = dict(strategy_options or {})
            options.setdefault("quality_threshold", quality_threshold)
            self._strategy = validation_strategy_registry.create(strategy, **options)

    def validate(
        self,
        rule: RuleProtocol | ValidationRule,
        dataset: ValidationDataset,
        column: str | None = None,
        use_cache: bool = True,
    ) -> ValidationResult:
        """Validate a rule against labeled data.

        Args:
            rule: Rule to validate
            dataset: Labeled validation dataset
            column: Column to validate (uses rule.column if None)
            use_cache: Whether to use cached results

        Returns:
            Validation result. INCONCLUSIVE is returned, without running the
            strategy, when no column can be determined or the dataset holds
            fewer than ``min_samples`` rows.
        """
        column = column or rule.column
        if column is None:
            return ValidationResult(
                status=ValidationStatus.INCONCLUSIVE,
                recommendation="No column specified for validation",
            )

        # Check cache
        cache_key = self._make_cache_key(rule, dataset, column)
        if use_cache:
            with self._lock:
                if cache_key in self._cache:
                    return self._cache[cache_key]

        # Check sample size (these early results are not cached)
        if len(dataset) < self.min_samples:
            return ValidationResult(
                status=ValidationStatus.INCONCLUSIVE,
                recommendation=f"Need at least {self.min_samples} samples",
                n_samples=len(dataset),
                warnings=[f"Only {len(dataset)} samples available"],
            )

        # Validate
        result = self._strategy.validate(rule, dataset, column)

        # Add warnings for edge cases
        if result.n_positive < 10 or result.n_negative < 10:
            result.warnings.append(
                "Imbalanced labels may affect reliability"
            )

        if result.confidence < 0.7:
            result.warnings.append(
                "Low confidence - consider collecting more labels"
            )

        # Cache result
        if use_cache:
            with self._lock:
                self._cache[cache_key] = result

        return result

    def validate_all(
        self,
        rules: list[RuleProtocol | ValidationRule],
        dataset: ValidationDataset,
    ) -> ValidationReport:
        """Validate multiple rules.

        Each rule is validated on its own ``rule.column`` and stored in the
        report under ``rule.name``.

        Args:
            rules: Rules to validate
            dataset: Labeled validation dataset

        Returns:
            Complete validation report
        """
        report = ValidationReport(
            dataset_info={
                "name": dataset.name,
                "size": len(dataset),
                "label_distribution": dataset.get_label_distribution(),
            },
        )

        passed = 0
        failed = 0

        for rule in rules:
            result = self.validate(rule, dataset)
            report.add_result(rule.name, result)

            if result.status == ValidationStatus.PASSED:
                passed += 1
            elif result.status == ValidationStatus.FAILED:
                failed += 1

        # Generate summary; every status other than PASSED/FAILED is counted
        # in the "inconclusive" bucket.
        report.summary = (
            f"Validated {len(rules)} rules: "
            f"{passed} passed, {failed} failed, "
            f"{len(rules) - passed - failed} inconclusive"
        )

        # Generate recommendations
        if failed > 0:
            report.recommendations.append(
                f"{failed} rules failed validation - review and adjust thresholds or rule logic"
            )

        low_confidence = [
            name for name, result in report.rule_results.items()
            if result.confidence < 0.7
        ]
        if low_confidence:
            report.recommendations.append(
                f"Low confidence for rules: {', '.join(low_confidence)} - collect more labels"
            )

        return report

    def compare_rules(
        self,
        rules: list[RuleProtocol | ValidationRule],
        dataset: ValidationDataset,
        column: str,
    ) -> list[tuple[str, ValidationResult]]:
        """Compare multiple rules for the same column.

        Args:
            rules: Rules to compare
            dataset: Labeled validation dataset
            column: Column to validate

        Returns:
            Rules sorted by F1 score (best first)
        """
        results = [
            (rule.name, self.validate(rule, dataset, column))
            for rule in rules
        ]
        return sorted(results, key=lambda x: x[1].f1_score, reverse=True)

    def _make_cache_key(
        self,
        rule: RuleProtocol | ValidationRule,
        dataset: ValidationDataset,
        column: str,
    ) -> str:
        """Create cache key.

        The key combines the rule's name and type, the column, and a short
        SHA-256 digest of the dataset's name, length and version.
        """
        rule_str = f"{rule.name}:{rule.rule_type}"
        dataset_hash = hashlib.sha256(
            f"{dataset.name}:{len(dataset)}:{dataset.version}".encode()
        ).hexdigest()[:16]
        return f"{rule_str}:{column}:{dataset_hash}"

    def clear_cache(self) -> None:
        """Clear validation cache."""
        with self._lock:
            self._cache.clear()
1292
+
1293
+
1294
+ # =============================================================================
1295
+ # A/B Testing
1296
+ # =============================================================================
1297
+
1298
+
1299
@dataclass
class ABTestResult:
    """Outcome of comparing two rules in an A/B test."""

    rule_a_name: str
    rule_b_name: str
    rule_a_f1: float
    rule_b_f1: float
    difference: float
    p_value: float
    significant: bool
    winner: str | None
    confidence_level: float = 0.95
    recommendation: str = ""

    def to_dict(self) -> dict[str, Any]:
        """Return every field as a plain dictionary, in declaration order."""
        exported = (
            "rule_a_name",
            "rule_b_name",
            "rule_a_f1",
            "rule_b_f1",
            "difference",
            "p_value",
            "significant",
            "winner",
            "confidence_level",
            "recommendation",
        )
        return {key: getattr(self, key) for key in exported}
1328
+
1329
+
1330
class RuleABTester:
    """A/B testing for comparing rule quality.

    Performs statistical tests to determine if one rule
    is significantly better than another.

    Example:
        tester = RuleABTester()

        result = tester.test(rule_a, rule_b, dataset, "column")

        if result.significant:
            print(f"Winner: {result.winner}")
        else:
            print("No significant difference")
    """

    def __init__(
        self,
        n_iterations: int = 1000,
        confidence_level: float = 0.95,
        random_seed: int | None = None,
    ):
        """Initialize A/B tester.

        Args:
            n_iterations: Number of bootstrap iterations
            confidence_level: Confidence level for significance
            random_seed: Random seed for reproducibility
        """
        self.n_iterations = n_iterations
        self.confidence_level = confidence_level
        self.random_seed = random_seed

    def test(
        self,
        rule_a: RuleProtocol | ValidationRule,
        rule_b: RuleProtocol | ValidationRule,
        dataset: ValidationDataset,
        column: str,
    ) -> ABTestResult:
        """Perform A/B test between two rules.

        Both rules are scored on the same bootstrap resamples of the dataset
        and the distribution of F1 differences yields a two-tailed p-value.

        Args:
            rule_a: First rule
            rule_b: Second rule
            dataset: Labeled validation dataset
            column: Column to validate

        Returns:
            A/B test result. With an empty dataset or no iterations there is
            nothing to resample, so the result is reported as not significant
            (p-value 1.0).
        """
        # Private generator: draws the same sequence as seeding the
        # module-level generator would, without reseeding process-wide state
        # that unrelated code may rely on.
        rng = random.Random(self.random_seed)

        df = dataset.to_dataframe()
        n = len(df)

        # Calculate observed F1 scores
        pred_a = rule_a.validate_column(df, column)
        pred_b = rule_b.validate_column(df, column)
        ground_truth = df.get_column(dataset.label_column)

        f1_a = self._calculate_f1(pred_a.to_list(), ground_truth.to_list())
        f1_b = self._calculate_f1(pred_b.to_list(), ground_truth.to_list())
        observed_diff = f1_a - f1_b

        if n == 0 or self.n_iterations <= 0:
            # No resampling is possible: randint(0, -1) would raise and the
            # p-value division would be by zero. Report "no evidence".
            p_value = 1.0
        else:
            # Bootstrap test
            diff_samples = []
            for _ in range(self.n_iterations):
                indices = [rng.randint(0, n - 1) for _ in range(n)]
                sample_df = df[indices]

                sample_pred_a = rule_a.validate_column(sample_df, column)
                sample_pred_b = rule_b.validate_column(sample_df, column)
                sample_gt = sample_df.get_column(dataset.label_column).to_list()

                sample_f1_a = self._calculate_f1(sample_pred_a.to_list(), sample_gt)
                sample_f1_b = self._calculate_f1(sample_pred_b.to_list(), sample_gt)

                diff_samples.append(sample_f1_a - sample_f1_b)

            # Calculate p-value (two-tailed)
            # Under null hypothesis, difference centers at 0
            centered = [d - observed_diff for d in diff_samples]
            extreme_count = sum(1 for d in centered if abs(d) >= abs(observed_diff))
            p_value = extreme_count / self.n_iterations

        # Determine significance
        alpha = 1 - self.confidence_level
        significant = p_value < alpha

        # Determine winner
        if significant:
            winner = rule_a.name if observed_diff > 0 else rule_b.name
            recommendation = (
                f"{winner} is significantly better (p={p_value:.4f})"
            )
        else:
            winner = None
            recommendation = (
                f"No significant difference (p={p_value:.4f})"
            )

        return ABTestResult(
            rule_a_name=rule_a.name,
            rule_b_name=rule_b.name,
            rule_a_f1=f1_a,
            rule_b_f1=f1_b,
            difference=observed_diff,
            p_value=p_value,
            significant=significant,
            winner=winner,
            confidence_level=self.confidence_level,
            recommendation=recommendation,
        )

    def _calculate_f1(
        self,
        predictions: list[bool],
        ground_truth: list[bool],
    ) -> float:
        """Calculate F1 score.

        Pairs are consumed in a single pass; precision, recall and F1 fall
        back to 0.0 when their denominator is zero.
        """
        tp = fp = fn = 0
        for pred, truth in zip(predictions, ground_truth):
            if pred and truth:
                tp += 1
            elif pred:
                fp += 1
            elif truth:
                fn += 1

        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0

        return 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
1465
+
1466
+
1467
+ # =============================================================================
1468
+ # Convenience Functions
1469
+ # =============================================================================
1470
+
1471
+
1472
def validate_rule(
    rule: RuleProtocol | ValidationRule,
    dataset: ValidationDataset,
    column: str | None = None,
    strategy: str = "cross_validation",
    **kwargs: Any,
) -> ValidationResult:
    """One-shot helper: build a ``LabeledDataValidator`` and validate one rule.

    Args:
        rule: Rule to validate
        dataset: Labeled validation dataset
        column: Column to validate
        strategy: Name of the validation strategy
        **kwargs: Extra keyword arguments forwarded to the
            ``LabeledDataValidator`` constructor

    Returns:
        Validation result
    """
    return LabeledDataValidator(strategy=strategy, **kwargs).validate(
        rule,
        dataset,
        column,
    )
1493
+
1494
+
1495
def create_validation_suite(
    rules: list[RuleProtocol | ValidationRule],
    dataset: ValidationDataset,
    strategy: str = "cross_validation",
) -> ValidationReport:
    """One-shot helper: validate several rules and return the combined report.

    Args:
        rules: Rules to validate
        dataset: Labeled validation dataset
        strategy: Name of the validation strategy

    Returns:
        Complete validation report
    """
    return LabeledDataValidator(strategy=strategy).validate_all(rules, dataset)
1512
+
1513
+
1514
def compare_rule_quality(
    rule_a: RuleProtocol | ValidationRule,
    rule_b: RuleProtocol | ValidationRule,
    dataset: ValidationDataset,
    column: str,
) -> ABTestResult:
    """One-shot helper: A/B test two rules with a default ``RuleABTester``.

    Args:
        rule_a: First rule
        rule_b: Second rule
        dataset: Labeled validation dataset
        column: Column to validate

    Returns:
        A/B test result
    """
    return RuleABTester().test(rule_a, rule_b, dataset, column)