truthound 1.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (877) hide show
  1. truthound/__init__.py +162 -0
  2. truthound/adapters.py +100 -0
  3. truthound/api.py +365 -0
  4. truthound/audit/__init__.py +248 -0
  5. truthound/audit/core.py +967 -0
  6. truthound/audit/filters.py +620 -0
  7. truthound/audit/formatters.py +707 -0
  8. truthound/audit/logger.py +902 -0
  9. truthound/audit/middleware.py +571 -0
  10. truthound/audit/storage.py +1083 -0
  11. truthound/benchmark/__init__.py +123 -0
  12. truthound/benchmark/base.py +757 -0
  13. truthound/benchmark/comparison.py +635 -0
  14. truthound/benchmark/generators.py +706 -0
  15. truthound/benchmark/reporters.py +718 -0
  16. truthound/benchmark/runner.py +635 -0
  17. truthound/benchmark/scenarios.py +712 -0
  18. truthound/cache.py +252 -0
  19. truthound/checkpoint/__init__.py +136 -0
  20. truthound/checkpoint/actions/__init__.py +164 -0
  21. truthound/checkpoint/actions/base.py +324 -0
  22. truthound/checkpoint/actions/custom.py +234 -0
  23. truthound/checkpoint/actions/discord_notify.py +290 -0
  24. truthound/checkpoint/actions/email_notify.py +405 -0
  25. truthound/checkpoint/actions/github_action.py +406 -0
  26. truthound/checkpoint/actions/opsgenie.py +1499 -0
  27. truthound/checkpoint/actions/pagerduty.py +226 -0
  28. truthound/checkpoint/actions/slack_notify.py +233 -0
  29. truthound/checkpoint/actions/store_result.py +249 -0
  30. truthound/checkpoint/actions/teams_notify.py +1570 -0
  31. truthound/checkpoint/actions/telegram_notify.py +419 -0
  32. truthound/checkpoint/actions/update_docs.py +552 -0
  33. truthound/checkpoint/actions/webhook.py +293 -0
  34. truthound/checkpoint/analytics/__init__.py +147 -0
  35. truthound/checkpoint/analytics/aggregations/__init__.py +23 -0
  36. truthound/checkpoint/analytics/aggregations/rollup.py +481 -0
  37. truthound/checkpoint/analytics/aggregations/time_bucket.py +306 -0
  38. truthound/checkpoint/analytics/analyzers/__init__.py +17 -0
  39. truthound/checkpoint/analytics/analyzers/anomaly.py +386 -0
  40. truthound/checkpoint/analytics/analyzers/base.py +270 -0
  41. truthound/checkpoint/analytics/analyzers/forecast.py +421 -0
  42. truthound/checkpoint/analytics/analyzers/trend.py +314 -0
  43. truthound/checkpoint/analytics/models.py +292 -0
  44. truthound/checkpoint/analytics/protocols.py +549 -0
  45. truthound/checkpoint/analytics/service.py +718 -0
  46. truthound/checkpoint/analytics/stores/__init__.py +16 -0
  47. truthound/checkpoint/analytics/stores/base.py +306 -0
  48. truthound/checkpoint/analytics/stores/memory_store.py +353 -0
  49. truthound/checkpoint/analytics/stores/sqlite_store.py +557 -0
  50. truthound/checkpoint/analytics/stores/timescale_store.py +501 -0
  51. truthound/checkpoint/async_actions.py +794 -0
  52. truthound/checkpoint/async_base.py +708 -0
  53. truthound/checkpoint/async_checkpoint.py +617 -0
  54. truthound/checkpoint/async_runner.py +639 -0
  55. truthound/checkpoint/checkpoint.py +527 -0
  56. truthound/checkpoint/ci/__init__.py +61 -0
  57. truthound/checkpoint/ci/detector.py +355 -0
  58. truthound/checkpoint/ci/reporter.py +436 -0
  59. truthound/checkpoint/ci/templates.py +454 -0
  60. truthound/checkpoint/circuitbreaker/__init__.py +133 -0
  61. truthound/checkpoint/circuitbreaker/breaker.py +542 -0
  62. truthound/checkpoint/circuitbreaker/core.py +252 -0
  63. truthound/checkpoint/circuitbreaker/detection.py +459 -0
  64. truthound/checkpoint/circuitbreaker/middleware.py +389 -0
  65. truthound/checkpoint/circuitbreaker/registry.py +357 -0
  66. truthound/checkpoint/distributed/__init__.py +139 -0
  67. truthound/checkpoint/distributed/backends/__init__.py +35 -0
  68. truthound/checkpoint/distributed/backends/celery_backend.py +503 -0
  69. truthound/checkpoint/distributed/backends/kubernetes_backend.py +696 -0
  70. truthound/checkpoint/distributed/backends/local_backend.py +397 -0
  71. truthound/checkpoint/distributed/backends/ray_backend.py +625 -0
  72. truthound/checkpoint/distributed/base.py +774 -0
  73. truthound/checkpoint/distributed/orchestrator.py +765 -0
  74. truthound/checkpoint/distributed/protocols.py +842 -0
  75. truthound/checkpoint/distributed/registry.py +449 -0
  76. truthound/checkpoint/idempotency/__init__.py +120 -0
  77. truthound/checkpoint/idempotency/core.py +295 -0
  78. truthound/checkpoint/idempotency/fingerprint.py +454 -0
  79. truthound/checkpoint/idempotency/locking.py +604 -0
  80. truthound/checkpoint/idempotency/service.py +592 -0
  81. truthound/checkpoint/idempotency/stores.py +653 -0
  82. truthound/checkpoint/monitoring/__init__.py +134 -0
  83. truthound/checkpoint/monitoring/aggregators/__init__.py +15 -0
  84. truthound/checkpoint/monitoring/aggregators/base.py +372 -0
  85. truthound/checkpoint/monitoring/aggregators/realtime.py +300 -0
  86. truthound/checkpoint/monitoring/aggregators/window.py +493 -0
  87. truthound/checkpoint/monitoring/collectors/__init__.py +17 -0
  88. truthound/checkpoint/monitoring/collectors/base.py +257 -0
  89. truthound/checkpoint/monitoring/collectors/memory_collector.py +617 -0
  90. truthound/checkpoint/monitoring/collectors/prometheus_collector.py +451 -0
  91. truthound/checkpoint/monitoring/collectors/redis_collector.py +518 -0
  92. truthound/checkpoint/monitoring/events.py +410 -0
  93. truthound/checkpoint/monitoring/protocols.py +636 -0
  94. truthound/checkpoint/monitoring/service.py +578 -0
  95. truthound/checkpoint/monitoring/views/__init__.py +17 -0
  96. truthound/checkpoint/monitoring/views/base.py +172 -0
  97. truthound/checkpoint/monitoring/views/queue_view.py +220 -0
  98. truthound/checkpoint/monitoring/views/task_view.py +240 -0
  99. truthound/checkpoint/monitoring/views/worker_view.py +263 -0
  100. truthound/checkpoint/registry.py +337 -0
  101. truthound/checkpoint/runner.py +356 -0
  102. truthound/checkpoint/transaction/__init__.py +133 -0
  103. truthound/checkpoint/transaction/base.py +389 -0
  104. truthound/checkpoint/transaction/compensatable.py +537 -0
  105. truthound/checkpoint/transaction/coordinator.py +576 -0
  106. truthound/checkpoint/transaction/executor.py +622 -0
  107. truthound/checkpoint/transaction/idempotency.py +534 -0
  108. truthound/checkpoint/transaction/saga/__init__.py +143 -0
  109. truthound/checkpoint/transaction/saga/builder.py +584 -0
  110. truthound/checkpoint/transaction/saga/definition.py +515 -0
  111. truthound/checkpoint/transaction/saga/event_store.py +542 -0
  112. truthound/checkpoint/transaction/saga/patterns.py +833 -0
  113. truthound/checkpoint/transaction/saga/runner.py +718 -0
  114. truthound/checkpoint/transaction/saga/state_machine.py +793 -0
  115. truthound/checkpoint/transaction/saga/strategies.py +780 -0
  116. truthound/checkpoint/transaction/saga/testing.py +886 -0
  117. truthound/checkpoint/triggers/__init__.py +58 -0
  118. truthound/checkpoint/triggers/base.py +237 -0
  119. truthound/checkpoint/triggers/event.py +385 -0
  120. truthound/checkpoint/triggers/schedule.py +355 -0
  121. truthound/cli.py +2358 -0
  122. truthound/cli_modules/__init__.py +124 -0
  123. truthound/cli_modules/advanced/__init__.py +45 -0
  124. truthound/cli_modules/advanced/benchmark.py +343 -0
  125. truthound/cli_modules/advanced/docs.py +225 -0
  126. truthound/cli_modules/advanced/lineage.py +209 -0
  127. truthound/cli_modules/advanced/ml.py +320 -0
  128. truthound/cli_modules/advanced/realtime.py +196 -0
  129. truthound/cli_modules/checkpoint/__init__.py +46 -0
  130. truthound/cli_modules/checkpoint/init.py +114 -0
  131. truthound/cli_modules/checkpoint/list.py +71 -0
  132. truthound/cli_modules/checkpoint/run.py +159 -0
  133. truthound/cli_modules/checkpoint/validate.py +67 -0
  134. truthound/cli_modules/common/__init__.py +71 -0
  135. truthound/cli_modules/common/errors.py +414 -0
  136. truthound/cli_modules/common/options.py +419 -0
  137. truthound/cli_modules/common/output.py +507 -0
  138. truthound/cli_modules/common/protocol.py +552 -0
  139. truthound/cli_modules/core/__init__.py +48 -0
  140. truthound/cli_modules/core/check.py +123 -0
  141. truthound/cli_modules/core/compare.py +104 -0
  142. truthound/cli_modules/core/learn.py +57 -0
  143. truthound/cli_modules/core/mask.py +77 -0
  144. truthound/cli_modules/core/profile.py +65 -0
  145. truthound/cli_modules/core/scan.py +61 -0
  146. truthound/cli_modules/profiler/__init__.py +51 -0
  147. truthound/cli_modules/profiler/auto_profile.py +175 -0
  148. truthound/cli_modules/profiler/metadata.py +107 -0
  149. truthound/cli_modules/profiler/suite.py +283 -0
  150. truthound/cli_modules/registry.py +431 -0
  151. truthound/cli_modules/scaffolding/__init__.py +89 -0
  152. truthound/cli_modules/scaffolding/base.py +631 -0
  153. truthound/cli_modules/scaffolding/commands.py +545 -0
  154. truthound/cli_modules/scaffolding/plugins.py +1072 -0
  155. truthound/cli_modules/scaffolding/reporters.py +594 -0
  156. truthound/cli_modules/scaffolding/validators.py +1127 -0
  157. truthound/common/__init__.py +18 -0
  158. truthound/common/resilience/__init__.py +130 -0
  159. truthound/common/resilience/bulkhead.py +266 -0
  160. truthound/common/resilience/circuit_breaker.py +516 -0
  161. truthound/common/resilience/composite.py +332 -0
  162. truthound/common/resilience/config.py +292 -0
  163. truthound/common/resilience/protocols.py +217 -0
  164. truthound/common/resilience/rate_limiter.py +404 -0
  165. truthound/common/resilience/retry.py +341 -0
  166. truthound/datadocs/__init__.py +260 -0
  167. truthound/datadocs/base.py +571 -0
  168. truthound/datadocs/builder.py +761 -0
  169. truthound/datadocs/charts.py +764 -0
  170. truthound/datadocs/dashboard/__init__.py +63 -0
  171. truthound/datadocs/dashboard/app.py +576 -0
  172. truthound/datadocs/dashboard/components.py +584 -0
  173. truthound/datadocs/dashboard/state.py +240 -0
  174. truthound/datadocs/engine/__init__.py +46 -0
  175. truthound/datadocs/engine/context.py +376 -0
  176. truthound/datadocs/engine/pipeline.py +618 -0
  177. truthound/datadocs/engine/registry.py +469 -0
  178. truthound/datadocs/exporters/__init__.py +49 -0
  179. truthound/datadocs/exporters/base.py +198 -0
  180. truthound/datadocs/exporters/html.py +178 -0
  181. truthound/datadocs/exporters/json_exporter.py +253 -0
  182. truthound/datadocs/exporters/markdown.py +284 -0
  183. truthound/datadocs/exporters/pdf.py +392 -0
  184. truthound/datadocs/i18n/__init__.py +86 -0
  185. truthound/datadocs/i18n/catalog.py +960 -0
  186. truthound/datadocs/i18n/formatting.py +505 -0
  187. truthound/datadocs/i18n/loader.py +256 -0
  188. truthound/datadocs/i18n/plurals.py +378 -0
  189. truthound/datadocs/renderers/__init__.py +42 -0
  190. truthound/datadocs/renderers/base.py +401 -0
  191. truthound/datadocs/renderers/custom.py +342 -0
  192. truthound/datadocs/renderers/jinja.py +697 -0
  193. truthound/datadocs/sections.py +736 -0
  194. truthound/datadocs/styles.py +931 -0
  195. truthound/datadocs/themes/__init__.py +101 -0
  196. truthound/datadocs/themes/base.py +336 -0
  197. truthound/datadocs/themes/default.py +417 -0
  198. truthound/datadocs/themes/enterprise.py +419 -0
  199. truthound/datadocs/themes/loader.py +336 -0
  200. truthound/datadocs/themes.py +301 -0
  201. truthound/datadocs/transformers/__init__.py +57 -0
  202. truthound/datadocs/transformers/base.py +268 -0
  203. truthound/datadocs/transformers/enrichers.py +544 -0
  204. truthound/datadocs/transformers/filters.py +447 -0
  205. truthound/datadocs/transformers/i18n.py +468 -0
  206. truthound/datadocs/versioning/__init__.py +62 -0
  207. truthound/datadocs/versioning/diff.py +639 -0
  208. truthound/datadocs/versioning/storage.py +497 -0
  209. truthound/datadocs/versioning/version.py +358 -0
  210. truthound/datasources/__init__.py +223 -0
  211. truthound/datasources/_async_protocols.py +222 -0
  212. truthound/datasources/_protocols.py +159 -0
  213. truthound/datasources/adapters.py +428 -0
  214. truthound/datasources/async_base.py +599 -0
  215. truthound/datasources/async_factory.py +511 -0
  216. truthound/datasources/base.py +516 -0
  217. truthound/datasources/factory.py +433 -0
  218. truthound/datasources/nosql/__init__.py +47 -0
  219. truthound/datasources/nosql/base.py +487 -0
  220. truthound/datasources/nosql/elasticsearch.py +801 -0
  221. truthound/datasources/nosql/mongodb.py +636 -0
  222. truthound/datasources/pandas_optimized.py +582 -0
  223. truthound/datasources/pandas_source.py +216 -0
  224. truthound/datasources/polars_source.py +395 -0
  225. truthound/datasources/spark_source.py +479 -0
  226. truthound/datasources/sql/__init__.py +154 -0
  227. truthound/datasources/sql/base.py +710 -0
  228. truthound/datasources/sql/bigquery.py +410 -0
  229. truthound/datasources/sql/cloud_base.py +199 -0
  230. truthound/datasources/sql/databricks.py +471 -0
  231. truthound/datasources/sql/mysql.py +316 -0
  232. truthound/datasources/sql/oracle.py +427 -0
  233. truthound/datasources/sql/postgresql.py +321 -0
  234. truthound/datasources/sql/redshift.py +479 -0
  235. truthound/datasources/sql/snowflake.py +439 -0
  236. truthound/datasources/sql/sqlite.py +286 -0
  237. truthound/datasources/sql/sqlserver.py +437 -0
  238. truthound/datasources/streaming/__init__.py +47 -0
  239. truthound/datasources/streaming/base.py +350 -0
  240. truthound/datasources/streaming/kafka.py +670 -0
  241. truthound/decorators.py +98 -0
  242. truthound/docs/__init__.py +69 -0
  243. truthound/docs/extractor.py +971 -0
  244. truthound/docs/generator.py +601 -0
  245. truthound/docs/parser.py +1037 -0
  246. truthound/docs/renderer.py +999 -0
  247. truthound/drift/__init__.py +22 -0
  248. truthound/drift/compare.py +189 -0
  249. truthound/drift/detectors.py +464 -0
  250. truthound/drift/report.py +160 -0
  251. truthound/execution/__init__.py +65 -0
  252. truthound/execution/_protocols.py +324 -0
  253. truthound/execution/base.py +576 -0
  254. truthound/execution/distributed/__init__.py +179 -0
  255. truthound/execution/distributed/aggregations.py +731 -0
  256. truthound/execution/distributed/arrow_bridge.py +817 -0
  257. truthound/execution/distributed/base.py +550 -0
  258. truthound/execution/distributed/dask_engine.py +976 -0
  259. truthound/execution/distributed/mixins.py +766 -0
  260. truthound/execution/distributed/protocols.py +756 -0
  261. truthound/execution/distributed/ray_engine.py +1127 -0
  262. truthound/execution/distributed/registry.py +446 -0
  263. truthound/execution/distributed/spark_engine.py +1011 -0
  264. truthound/execution/distributed/validator_adapter.py +682 -0
  265. truthound/execution/pandas_engine.py +401 -0
  266. truthound/execution/polars_engine.py +497 -0
  267. truthound/execution/pushdown/__init__.py +230 -0
  268. truthound/execution/pushdown/ast.py +1550 -0
  269. truthound/execution/pushdown/builder.py +1550 -0
  270. truthound/execution/pushdown/dialects.py +1072 -0
  271. truthound/execution/pushdown/executor.py +829 -0
  272. truthound/execution/pushdown/optimizer.py +1041 -0
  273. truthound/execution/sql_engine.py +518 -0
  274. truthound/infrastructure/__init__.py +189 -0
  275. truthound/infrastructure/audit.py +1515 -0
  276. truthound/infrastructure/config.py +1133 -0
  277. truthound/infrastructure/encryption.py +1132 -0
  278. truthound/infrastructure/logging.py +1503 -0
  279. truthound/infrastructure/metrics.py +1220 -0
  280. truthound/lineage/__init__.py +89 -0
  281. truthound/lineage/base.py +746 -0
  282. truthound/lineage/impact_analysis.py +474 -0
  283. truthound/lineage/integrations/__init__.py +22 -0
  284. truthound/lineage/integrations/openlineage.py +548 -0
  285. truthound/lineage/tracker.py +512 -0
  286. truthound/lineage/visualization/__init__.py +33 -0
  287. truthound/lineage/visualization/protocols.py +145 -0
  288. truthound/lineage/visualization/renderers/__init__.py +20 -0
  289. truthound/lineage/visualization/renderers/cytoscape.py +329 -0
  290. truthound/lineage/visualization/renderers/d3.py +331 -0
  291. truthound/lineage/visualization/renderers/graphviz.py +276 -0
  292. truthound/lineage/visualization/renderers/mermaid.py +308 -0
  293. truthound/maskers.py +113 -0
  294. truthound/ml/__init__.py +124 -0
  295. truthound/ml/anomaly_models/__init__.py +31 -0
  296. truthound/ml/anomaly_models/ensemble.py +362 -0
  297. truthound/ml/anomaly_models/isolation_forest.py +444 -0
  298. truthound/ml/anomaly_models/statistical.py +392 -0
  299. truthound/ml/base.py +1178 -0
  300. truthound/ml/drift_detection/__init__.py +26 -0
  301. truthound/ml/drift_detection/concept.py +381 -0
  302. truthound/ml/drift_detection/distribution.py +361 -0
  303. truthound/ml/drift_detection/feature.py +442 -0
  304. truthound/ml/drift_detection/multivariate.py +495 -0
  305. truthound/ml/monitoring/__init__.py +88 -0
  306. truthound/ml/monitoring/alerting/__init__.py +33 -0
  307. truthound/ml/monitoring/alerting/handlers.py +427 -0
  308. truthound/ml/monitoring/alerting/rules.py +508 -0
  309. truthound/ml/monitoring/collectors/__init__.py +19 -0
  310. truthound/ml/monitoring/collectors/composite.py +105 -0
  311. truthound/ml/monitoring/collectors/drift.py +324 -0
  312. truthound/ml/monitoring/collectors/performance.py +179 -0
  313. truthound/ml/monitoring/collectors/quality.py +369 -0
  314. truthound/ml/monitoring/monitor.py +536 -0
  315. truthound/ml/monitoring/protocols.py +451 -0
  316. truthound/ml/monitoring/stores/__init__.py +15 -0
  317. truthound/ml/monitoring/stores/memory.py +201 -0
  318. truthound/ml/monitoring/stores/prometheus.py +296 -0
  319. truthound/ml/rule_learning/__init__.py +25 -0
  320. truthound/ml/rule_learning/constraint_miner.py +443 -0
  321. truthound/ml/rule_learning/pattern_learner.py +499 -0
  322. truthound/ml/rule_learning/profile_learner.py +462 -0
  323. truthound/multitenancy/__init__.py +326 -0
  324. truthound/multitenancy/core.py +852 -0
  325. truthound/multitenancy/integration.py +597 -0
  326. truthound/multitenancy/isolation.py +630 -0
  327. truthound/multitenancy/manager.py +770 -0
  328. truthound/multitenancy/middleware.py +765 -0
  329. truthound/multitenancy/quota.py +537 -0
  330. truthound/multitenancy/resolvers.py +603 -0
  331. truthound/multitenancy/storage.py +703 -0
  332. truthound/observability/__init__.py +307 -0
  333. truthound/observability/context.py +531 -0
  334. truthound/observability/instrumentation.py +611 -0
  335. truthound/observability/logging.py +887 -0
  336. truthound/observability/metrics.py +1157 -0
  337. truthound/observability/tracing/__init__.py +178 -0
  338. truthound/observability/tracing/baggage.py +310 -0
  339. truthound/observability/tracing/config.py +426 -0
  340. truthound/observability/tracing/exporter.py +787 -0
  341. truthound/observability/tracing/integration.py +1018 -0
  342. truthound/observability/tracing/otel/__init__.py +146 -0
  343. truthound/observability/tracing/otel/adapter.py +982 -0
  344. truthound/observability/tracing/otel/bridge.py +1177 -0
  345. truthound/observability/tracing/otel/compat.py +681 -0
  346. truthound/observability/tracing/otel/config.py +691 -0
  347. truthound/observability/tracing/otel/detection.py +327 -0
  348. truthound/observability/tracing/otel/protocols.py +426 -0
  349. truthound/observability/tracing/processor.py +561 -0
  350. truthound/observability/tracing/propagator.py +757 -0
  351. truthound/observability/tracing/provider.py +569 -0
  352. truthound/observability/tracing/resource.py +515 -0
  353. truthound/observability/tracing/sampler.py +487 -0
  354. truthound/observability/tracing/span.py +676 -0
  355. truthound/plugins/__init__.py +198 -0
  356. truthound/plugins/base.py +599 -0
  357. truthound/plugins/cli.py +680 -0
  358. truthound/plugins/dependencies/__init__.py +42 -0
  359. truthound/plugins/dependencies/graph.py +422 -0
  360. truthound/plugins/dependencies/resolver.py +417 -0
  361. truthound/plugins/discovery.py +379 -0
  362. truthound/plugins/docs/__init__.py +46 -0
  363. truthound/plugins/docs/extractor.py +444 -0
  364. truthound/plugins/docs/renderer.py +499 -0
  365. truthound/plugins/enterprise_manager.py +877 -0
  366. truthound/plugins/examples/__init__.py +19 -0
  367. truthound/plugins/examples/custom_validators.py +317 -0
  368. truthound/plugins/examples/slack_notifier.py +312 -0
  369. truthound/plugins/examples/xml_reporter.py +254 -0
  370. truthound/plugins/hooks.py +558 -0
  371. truthound/plugins/lifecycle/__init__.py +43 -0
  372. truthound/plugins/lifecycle/hot_reload.py +402 -0
  373. truthound/plugins/lifecycle/manager.py +371 -0
  374. truthound/plugins/manager.py +736 -0
  375. truthound/plugins/registry.py +338 -0
  376. truthound/plugins/security/__init__.py +93 -0
  377. truthound/plugins/security/exceptions.py +332 -0
  378. truthound/plugins/security/policies.py +348 -0
  379. truthound/plugins/security/protocols.py +643 -0
  380. truthound/plugins/security/sandbox/__init__.py +45 -0
  381. truthound/plugins/security/sandbox/context.py +158 -0
  382. truthound/plugins/security/sandbox/engines/__init__.py +19 -0
  383. truthound/plugins/security/sandbox/engines/container.py +379 -0
  384. truthound/plugins/security/sandbox/engines/noop.py +144 -0
  385. truthound/plugins/security/sandbox/engines/process.py +336 -0
  386. truthound/plugins/security/sandbox/factory.py +211 -0
  387. truthound/plugins/security/signing/__init__.py +57 -0
  388. truthound/plugins/security/signing/service.py +330 -0
  389. truthound/plugins/security/signing/trust_store.py +368 -0
  390. truthound/plugins/security/signing/verifier.py +459 -0
  391. truthound/plugins/versioning/__init__.py +41 -0
  392. truthound/plugins/versioning/constraints.py +297 -0
  393. truthound/plugins/versioning/resolver.py +329 -0
  394. truthound/profiler/__init__.py +1729 -0
  395. truthound/profiler/_lazy.py +452 -0
  396. truthound/profiler/ab_testing/__init__.py +80 -0
  397. truthound/profiler/ab_testing/analysis.py +449 -0
  398. truthound/profiler/ab_testing/base.py +257 -0
  399. truthound/profiler/ab_testing/experiment.py +395 -0
  400. truthound/profiler/ab_testing/tracking.py +368 -0
  401. truthound/profiler/auto_threshold.py +1170 -0
  402. truthound/profiler/base.py +579 -0
  403. truthound/profiler/cache_patterns.py +911 -0
  404. truthound/profiler/caching.py +1303 -0
  405. truthound/profiler/column_profiler.py +712 -0
  406. truthound/profiler/comparison.py +1007 -0
  407. truthound/profiler/custom_patterns.py +1170 -0
  408. truthound/profiler/dashboard/__init__.py +50 -0
  409. truthound/profiler/dashboard/app.py +476 -0
  410. truthound/profiler/dashboard/components.py +457 -0
  411. truthound/profiler/dashboard/config.py +72 -0
  412. truthound/profiler/distributed/__init__.py +83 -0
  413. truthound/profiler/distributed/base.py +281 -0
  414. truthound/profiler/distributed/dask_backend.py +498 -0
  415. truthound/profiler/distributed/local_backend.py +293 -0
  416. truthound/profiler/distributed/profiler.py +304 -0
  417. truthound/profiler/distributed/ray_backend.py +374 -0
  418. truthound/profiler/distributed/spark_backend.py +375 -0
  419. truthound/profiler/distributed.py +1366 -0
  420. truthound/profiler/enterprise_sampling.py +1065 -0
  421. truthound/profiler/errors.py +488 -0
  422. truthound/profiler/evolution/__init__.py +91 -0
  423. truthound/profiler/evolution/alerts.py +426 -0
  424. truthound/profiler/evolution/changes.py +206 -0
  425. truthound/profiler/evolution/compatibility.py +365 -0
  426. truthound/profiler/evolution/detector.py +372 -0
  427. truthound/profiler/evolution/protocols.py +121 -0
  428. truthound/profiler/generators/__init__.py +48 -0
  429. truthound/profiler/generators/base.py +384 -0
  430. truthound/profiler/generators/ml_rules.py +375 -0
  431. truthound/profiler/generators/pattern_rules.py +384 -0
  432. truthound/profiler/generators/schema_rules.py +267 -0
  433. truthound/profiler/generators/stats_rules.py +324 -0
  434. truthound/profiler/generators/suite_generator.py +857 -0
  435. truthound/profiler/i18n.py +1542 -0
  436. truthound/profiler/incremental.py +554 -0
  437. truthound/profiler/incremental_validation.py +1710 -0
  438. truthound/profiler/integration/__init__.py +73 -0
  439. truthound/profiler/integration/adapters.py +345 -0
  440. truthound/profiler/integration/context.py +371 -0
  441. truthound/profiler/integration/executor.py +527 -0
  442. truthound/profiler/integration/naming.py +75 -0
  443. truthound/profiler/integration/protocols.py +243 -0
  444. truthound/profiler/memory.py +1185 -0
  445. truthound/profiler/migration/__init__.py +60 -0
  446. truthound/profiler/migration/base.py +345 -0
  447. truthound/profiler/migration/manager.py +444 -0
  448. truthound/profiler/migration/v1_0_to_v1_1.py +484 -0
  449. truthound/profiler/ml/__init__.py +73 -0
  450. truthound/profiler/ml/base.py +244 -0
  451. truthound/profiler/ml/classifier.py +507 -0
  452. truthound/profiler/ml/feature_extraction.py +604 -0
  453. truthound/profiler/ml/pretrained.py +448 -0
  454. truthound/profiler/ml_inference.py +1276 -0
  455. truthound/profiler/native_patterns.py +815 -0
  456. truthound/profiler/observability.py +1184 -0
  457. truthound/profiler/process_timeout.py +1566 -0
  458. truthound/profiler/progress.py +568 -0
  459. truthound/profiler/progress_callbacks.py +1734 -0
  460. truthound/profiler/quality.py +1345 -0
  461. truthound/profiler/resilience.py +1180 -0
  462. truthound/profiler/sampled_matcher.py +794 -0
  463. truthound/profiler/sampling.py +1288 -0
  464. truthound/profiler/scheduling/__init__.py +82 -0
  465. truthound/profiler/scheduling/protocols.py +214 -0
  466. truthound/profiler/scheduling/scheduler.py +474 -0
  467. truthound/profiler/scheduling/storage.py +457 -0
  468. truthound/profiler/scheduling/triggers.py +449 -0
  469. truthound/profiler/schema.py +603 -0
  470. truthound/profiler/streaming.py +685 -0
  471. truthound/profiler/streaming_patterns.py +1354 -0
  472. truthound/profiler/suite_cli.py +625 -0
  473. truthound/profiler/suite_config.py +789 -0
  474. truthound/profiler/suite_export.py +1268 -0
  475. truthound/profiler/table_profiler.py +547 -0
  476. truthound/profiler/timeout.py +565 -0
  477. truthound/profiler/validation.py +1532 -0
  478. truthound/profiler/visualization/__init__.py +118 -0
  479. truthound/profiler/visualization/base.py +346 -0
  480. truthound/profiler/visualization/generator.py +1259 -0
  481. truthound/profiler/visualization/plotly_renderer.py +811 -0
  482. truthound/profiler/visualization/renderers.py +669 -0
  483. truthound/profiler/visualization/sections.py +540 -0
  484. truthound/profiler/visualization.py +2122 -0
  485. truthound/profiler/yaml_validation.py +1151 -0
  486. truthound/py.typed +0 -0
  487. truthound/ratelimit/__init__.py +248 -0
  488. truthound/ratelimit/algorithms.py +1108 -0
  489. truthound/ratelimit/core.py +573 -0
  490. truthound/ratelimit/integration.py +532 -0
  491. truthound/ratelimit/limiter.py +663 -0
  492. truthound/ratelimit/middleware.py +700 -0
  493. truthound/ratelimit/policy.py +792 -0
  494. truthound/ratelimit/storage.py +763 -0
  495. truthound/rbac/__init__.py +340 -0
  496. truthound/rbac/core.py +976 -0
  497. truthound/rbac/integration.py +760 -0
  498. truthound/rbac/manager.py +1052 -0
  499. truthound/rbac/middleware.py +842 -0
  500. truthound/rbac/policy.py +954 -0
  501. truthound/rbac/storage.py +878 -0
  502. truthound/realtime/__init__.py +141 -0
  503. truthound/realtime/adapters/__init__.py +43 -0
  504. truthound/realtime/adapters/base.py +533 -0
  505. truthound/realtime/adapters/kafka.py +487 -0
  506. truthound/realtime/adapters/kinesis.py +479 -0
  507. truthound/realtime/adapters/mock.py +243 -0
  508. truthound/realtime/base.py +553 -0
  509. truthound/realtime/factory.py +382 -0
  510. truthound/realtime/incremental.py +660 -0
  511. truthound/realtime/processing/__init__.py +67 -0
  512. truthound/realtime/processing/exactly_once.py +575 -0
  513. truthound/realtime/processing/state.py +547 -0
  514. truthound/realtime/processing/windows.py +647 -0
  515. truthound/realtime/protocols.py +569 -0
  516. truthound/realtime/streaming.py +605 -0
  517. truthound/realtime/testing/__init__.py +32 -0
  518. truthound/realtime/testing/containers.py +615 -0
  519. truthound/realtime/testing/fixtures.py +484 -0
  520. truthound/report.py +280 -0
  521. truthound/reporters/__init__.py +46 -0
  522. truthound/reporters/_protocols.py +30 -0
  523. truthound/reporters/base.py +324 -0
  524. truthound/reporters/ci/__init__.py +66 -0
  525. truthound/reporters/ci/azure.py +436 -0
  526. truthound/reporters/ci/base.py +509 -0
  527. truthound/reporters/ci/bitbucket.py +567 -0
  528. truthound/reporters/ci/circleci.py +547 -0
  529. truthound/reporters/ci/detection.py +364 -0
  530. truthound/reporters/ci/factory.py +182 -0
  531. truthound/reporters/ci/github.py +388 -0
  532. truthound/reporters/ci/gitlab.py +471 -0
  533. truthound/reporters/ci/jenkins.py +525 -0
  534. truthound/reporters/console_reporter.py +299 -0
  535. truthound/reporters/factory.py +211 -0
  536. truthound/reporters/html_reporter.py +524 -0
  537. truthound/reporters/json_reporter.py +256 -0
  538. truthound/reporters/markdown_reporter.py +280 -0
  539. truthound/reporters/sdk/__init__.py +174 -0
  540. truthound/reporters/sdk/builder.py +558 -0
  541. truthound/reporters/sdk/mixins.py +1150 -0
  542. truthound/reporters/sdk/schema.py +1493 -0
  543. truthound/reporters/sdk/templates.py +666 -0
  544. truthound/reporters/sdk/testing.py +968 -0
  545. truthound/scanners.py +170 -0
  546. truthound/scheduling/__init__.py +122 -0
  547. truthound/scheduling/cron.py +1136 -0
  548. truthound/scheduling/presets.py +212 -0
  549. truthound/schema.py +275 -0
  550. truthound/secrets/__init__.py +173 -0
  551. truthound/secrets/base.py +618 -0
  552. truthound/secrets/cloud.py +682 -0
  553. truthound/secrets/integration.py +507 -0
  554. truthound/secrets/manager.py +633 -0
  555. truthound/secrets/oidc/__init__.py +172 -0
  556. truthound/secrets/oidc/base.py +902 -0
  557. truthound/secrets/oidc/credential_provider.py +623 -0
  558. truthound/secrets/oidc/exchangers.py +1001 -0
  559. truthound/secrets/oidc/github/__init__.py +110 -0
  560. truthound/secrets/oidc/github/claims.py +718 -0
  561. truthound/secrets/oidc/github/enhanced_provider.py +693 -0
  562. truthound/secrets/oidc/github/trust_policy.py +742 -0
  563. truthound/secrets/oidc/github/verification.py +723 -0
  564. truthound/secrets/oidc/github/workflow.py +691 -0
  565. truthound/secrets/oidc/providers.py +825 -0
  566. truthound/secrets/providers.py +506 -0
  567. truthound/secrets/resolver.py +495 -0
  568. truthound/stores/__init__.py +177 -0
  569. truthound/stores/backends/__init__.py +18 -0
  570. truthound/stores/backends/_protocols.py +340 -0
  571. truthound/stores/backends/azure_blob.py +530 -0
  572. truthound/stores/backends/concurrent_filesystem.py +915 -0
  573. truthound/stores/backends/connection_pool.py +1365 -0
  574. truthound/stores/backends/database.py +743 -0
  575. truthound/stores/backends/filesystem.py +538 -0
  576. truthound/stores/backends/gcs.py +399 -0
  577. truthound/stores/backends/memory.py +354 -0
  578. truthound/stores/backends/s3.py +434 -0
  579. truthound/stores/backpressure/__init__.py +84 -0
  580. truthound/stores/backpressure/base.py +375 -0
  581. truthound/stores/backpressure/circuit_breaker.py +434 -0
  582. truthound/stores/backpressure/monitor.py +376 -0
  583. truthound/stores/backpressure/strategies.py +677 -0
  584. truthound/stores/base.py +551 -0
  585. truthound/stores/batching/__init__.py +65 -0
  586. truthound/stores/batching/base.py +305 -0
  587. truthound/stores/batching/buffer.py +370 -0
  588. truthound/stores/batching/store.py +248 -0
  589. truthound/stores/batching/writer.py +521 -0
  590. truthound/stores/caching/__init__.py +60 -0
  591. truthound/stores/caching/backends.py +684 -0
  592. truthound/stores/caching/base.py +356 -0
  593. truthound/stores/caching/store.py +305 -0
  594. truthound/stores/compression/__init__.py +193 -0
  595. truthound/stores/compression/adaptive.py +694 -0
  596. truthound/stores/compression/base.py +514 -0
  597. truthound/stores/compression/pipeline.py +868 -0
  598. truthound/stores/compression/providers.py +672 -0
  599. truthound/stores/compression/streaming.py +832 -0
  600. truthound/stores/concurrency/__init__.py +81 -0
  601. truthound/stores/concurrency/atomic.py +556 -0
  602. truthound/stores/concurrency/index.py +775 -0
  603. truthound/stores/concurrency/locks.py +576 -0
  604. truthound/stores/concurrency/manager.py +482 -0
  605. truthound/stores/encryption/__init__.py +297 -0
  606. truthound/stores/encryption/base.py +952 -0
  607. truthound/stores/encryption/keys.py +1191 -0
  608. truthound/stores/encryption/pipeline.py +903 -0
  609. truthound/stores/encryption/providers.py +953 -0
  610. truthound/stores/encryption/streaming.py +950 -0
  611. truthound/stores/expectations.py +227 -0
  612. truthound/stores/factory.py +246 -0
  613. truthound/stores/migration/__init__.py +75 -0
  614. truthound/stores/migration/base.py +480 -0
  615. truthound/stores/migration/manager.py +347 -0
  616. truthound/stores/migration/registry.py +382 -0
  617. truthound/stores/migration/store.py +559 -0
  618. truthound/stores/observability/__init__.py +106 -0
  619. truthound/stores/observability/audit.py +718 -0
  620. truthound/stores/observability/config.py +270 -0
  621. truthound/stores/observability/factory.py +208 -0
  622. truthound/stores/observability/metrics.py +636 -0
  623. truthound/stores/observability/protocols.py +410 -0
  624. truthound/stores/observability/store.py +570 -0
  625. truthound/stores/observability/tracing.py +784 -0
  626. truthound/stores/replication/__init__.py +76 -0
  627. truthound/stores/replication/base.py +260 -0
  628. truthound/stores/replication/monitor.py +269 -0
  629. truthound/stores/replication/store.py +439 -0
  630. truthound/stores/replication/syncer.py +391 -0
  631. truthound/stores/results.py +359 -0
  632. truthound/stores/retention/__init__.py +77 -0
  633. truthound/stores/retention/base.py +378 -0
  634. truthound/stores/retention/policies.py +621 -0
  635. truthound/stores/retention/scheduler.py +279 -0
  636. truthound/stores/retention/store.py +526 -0
  637. truthound/stores/streaming/__init__.py +138 -0
  638. truthound/stores/streaming/base.py +801 -0
  639. truthound/stores/streaming/database.py +984 -0
  640. truthound/stores/streaming/filesystem.py +719 -0
  641. truthound/stores/streaming/reader.py +629 -0
  642. truthound/stores/streaming/s3.py +843 -0
  643. truthound/stores/streaming/writer.py +790 -0
  644. truthound/stores/tiering/__init__.py +108 -0
  645. truthound/stores/tiering/base.py +462 -0
  646. truthound/stores/tiering/manager.py +249 -0
  647. truthound/stores/tiering/policies.py +692 -0
  648. truthound/stores/tiering/store.py +526 -0
  649. truthound/stores/versioning/__init__.py +56 -0
  650. truthound/stores/versioning/base.py +376 -0
  651. truthound/stores/versioning/store.py +660 -0
  652. truthound/stores/versioning/strategies.py +353 -0
  653. truthound/types.py +56 -0
  654. truthound/validators/__init__.py +774 -0
  655. truthound/validators/aggregate/__init__.py +27 -0
  656. truthound/validators/aggregate/central.py +116 -0
  657. truthound/validators/aggregate/extremes.py +116 -0
  658. truthound/validators/aggregate/spread.py +118 -0
  659. truthound/validators/aggregate/sum.py +64 -0
  660. truthound/validators/aggregate/type.py +78 -0
  661. truthound/validators/anomaly/__init__.py +93 -0
  662. truthound/validators/anomaly/base.py +431 -0
  663. truthound/validators/anomaly/ml_based.py +1190 -0
  664. truthound/validators/anomaly/multivariate.py +647 -0
  665. truthound/validators/anomaly/statistical.py +599 -0
  666. truthound/validators/base.py +1089 -0
  667. truthound/validators/business_rule/__init__.py +46 -0
  668. truthound/validators/business_rule/base.py +147 -0
  669. truthound/validators/business_rule/checksum.py +509 -0
  670. truthound/validators/business_rule/financial.py +526 -0
  671. truthound/validators/cache.py +733 -0
  672. truthound/validators/completeness/__init__.py +39 -0
  673. truthound/validators/completeness/conditional.py +73 -0
  674. truthound/validators/completeness/default.py +98 -0
  675. truthound/validators/completeness/empty.py +103 -0
  676. truthound/validators/completeness/nan.py +337 -0
  677. truthound/validators/completeness/null.py +152 -0
  678. truthound/validators/cross_table/__init__.py +17 -0
  679. truthound/validators/cross_table/aggregate.py +333 -0
  680. truthound/validators/cross_table/row_count.py +122 -0
  681. truthound/validators/datetime/__init__.py +29 -0
  682. truthound/validators/datetime/format.py +78 -0
  683. truthound/validators/datetime/freshness.py +269 -0
  684. truthound/validators/datetime/order.py +73 -0
  685. truthound/validators/datetime/parseable.py +185 -0
  686. truthound/validators/datetime/range.py +202 -0
  687. truthound/validators/datetime/timezone.py +69 -0
  688. truthound/validators/distribution/__init__.py +49 -0
  689. truthound/validators/distribution/distribution.py +128 -0
  690. truthound/validators/distribution/monotonic.py +119 -0
  691. truthound/validators/distribution/outlier.py +178 -0
  692. truthound/validators/distribution/quantile.py +80 -0
  693. truthound/validators/distribution/range.py +254 -0
  694. truthound/validators/distribution/set.py +125 -0
  695. truthound/validators/distribution/statistical.py +459 -0
  696. truthound/validators/drift/__init__.py +79 -0
  697. truthound/validators/drift/base.py +427 -0
  698. truthound/validators/drift/multi_feature.py +401 -0
  699. truthound/validators/drift/numeric.py +395 -0
  700. truthound/validators/drift/psi.py +446 -0
  701. truthound/validators/drift/statistical.py +510 -0
  702. truthound/validators/enterprise.py +1658 -0
  703. truthound/validators/geospatial/__init__.py +80 -0
  704. truthound/validators/geospatial/base.py +97 -0
  705. truthound/validators/geospatial/boundary.py +238 -0
  706. truthound/validators/geospatial/coordinate.py +351 -0
  707. truthound/validators/geospatial/distance.py +399 -0
  708. truthound/validators/geospatial/polygon.py +665 -0
  709. truthound/validators/i18n/__init__.py +308 -0
  710. truthound/validators/i18n/bidi.py +571 -0
  711. truthound/validators/i18n/catalogs.py +570 -0
  712. truthound/validators/i18n/dialects.py +763 -0
  713. truthound/validators/i18n/extended_catalogs.py +549 -0
  714. truthound/validators/i18n/formatting.py +1434 -0
  715. truthound/validators/i18n/loader.py +1020 -0
  716. truthound/validators/i18n/messages.py +521 -0
  717. truthound/validators/i18n/plural.py +683 -0
  718. truthound/validators/i18n/protocols.py +855 -0
  719. truthound/validators/i18n/tms.py +1162 -0
  720. truthound/validators/localization/__init__.py +53 -0
  721. truthound/validators/localization/base.py +122 -0
  722. truthound/validators/localization/chinese.py +362 -0
  723. truthound/validators/localization/japanese.py +275 -0
  724. truthound/validators/localization/korean.py +524 -0
  725. truthound/validators/memory/__init__.py +94 -0
  726. truthound/validators/memory/approximate_knn.py +506 -0
  727. truthound/validators/memory/base.py +547 -0
  728. truthound/validators/memory/sgd_online.py +719 -0
  729. truthound/validators/memory/streaming_ecdf.py +753 -0
  730. truthound/validators/ml_feature/__init__.py +54 -0
  731. truthound/validators/ml_feature/base.py +249 -0
  732. truthound/validators/ml_feature/correlation.py +299 -0
  733. truthound/validators/ml_feature/leakage.py +344 -0
  734. truthound/validators/ml_feature/null_impact.py +270 -0
  735. truthound/validators/ml_feature/scale.py +264 -0
  736. truthound/validators/multi_column/__init__.py +89 -0
  737. truthound/validators/multi_column/arithmetic.py +284 -0
  738. truthound/validators/multi_column/base.py +231 -0
  739. truthound/validators/multi_column/comparison.py +273 -0
  740. truthound/validators/multi_column/consistency.py +312 -0
  741. truthound/validators/multi_column/statistical.py +299 -0
  742. truthound/validators/optimization/__init__.py +164 -0
  743. truthound/validators/optimization/aggregation.py +563 -0
  744. truthound/validators/optimization/covariance.py +556 -0
  745. truthound/validators/optimization/geo.py +626 -0
  746. truthound/validators/optimization/graph.py +587 -0
  747. truthound/validators/optimization/orchestrator.py +970 -0
  748. truthound/validators/optimization/profiling.py +1312 -0
  749. truthound/validators/privacy/__init__.py +223 -0
  750. truthound/validators/privacy/base.py +635 -0
  751. truthound/validators/privacy/ccpa.py +670 -0
  752. truthound/validators/privacy/gdpr.py +728 -0
  753. truthound/validators/privacy/global_patterns.py +604 -0
  754. truthound/validators/privacy/plugins.py +867 -0
  755. truthound/validators/profiling/__init__.py +52 -0
  756. truthound/validators/profiling/base.py +175 -0
  757. truthound/validators/profiling/cardinality.py +312 -0
  758. truthound/validators/profiling/entropy.py +391 -0
  759. truthound/validators/profiling/frequency.py +455 -0
  760. truthound/validators/pushdown_support.py +660 -0
  761. truthound/validators/query/__init__.py +91 -0
  762. truthound/validators/query/aggregate.py +346 -0
  763. truthound/validators/query/base.py +246 -0
  764. truthound/validators/query/column.py +249 -0
  765. truthound/validators/query/expression.py +274 -0
  766. truthound/validators/query/result.py +323 -0
  767. truthound/validators/query/row_count.py +264 -0
  768. truthound/validators/referential/__init__.py +80 -0
  769. truthound/validators/referential/base.py +395 -0
  770. truthound/validators/referential/cascade.py +391 -0
  771. truthound/validators/referential/circular.py +563 -0
  772. truthound/validators/referential/foreign_key.py +624 -0
  773. truthound/validators/referential/orphan.py +485 -0
  774. truthound/validators/registry.py +112 -0
  775. truthound/validators/schema/__init__.py +41 -0
  776. truthound/validators/schema/column_count.py +142 -0
  777. truthound/validators/schema/column_exists.py +80 -0
  778. truthound/validators/schema/column_order.py +82 -0
  779. truthound/validators/schema/column_pair.py +85 -0
  780. truthound/validators/schema/column_pair_set.py +195 -0
  781. truthound/validators/schema/column_type.py +94 -0
  782. truthound/validators/schema/multi_column.py +53 -0
  783. truthound/validators/schema/multi_column_aggregate.py +175 -0
  784. truthound/validators/schema/referential.py +274 -0
  785. truthound/validators/schema/table_schema.py +91 -0
  786. truthound/validators/schema_validator.py +219 -0
  787. truthound/validators/sdk/__init__.py +250 -0
  788. truthound/validators/sdk/builder.py +680 -0
  789. truthound/validators/sdk/decorators.py +474 -0
  790. truthound/validators/sdk/enterprise/__init__.py +211 -0
  791. truthound/validators/sdk/enterprise/docs.py +725 -0
  792. truthound/validators/sdk/enterprise/fuzzing.py +659 -0
  793. truthound/validators/sdk/enterprise/licensing.py +709 -0
  794. truthound/validators/sdk/enterprise/manager.py +543 -0
  795. truthound/validators/sdk/enterprise/resources.py +628 -0
  796. truthound/validators/sdk/enterprise/sandbox.py +766 -0
  797. truthound/validators/sdk/enterprise/signing.py +603 -0
  798. truthound/validators/sdk/enterprise/templates.py +865 -0
  799. truthound/validators/sdk/enterprise/versioning.py +659 -0
  800. truthound/validators/sdk/templates.py +757 -0
  801. truthound/validators/sdk/testing.py +807 -0
  802. truthound/validators/security/__init__.py +181 -0
  803. truthound/validators/security/redos/__init__.py +182 -0
  804. truthound/validators/security/redos/core.py +861 -0
  805. truthound/validators/security/redos/cpu_monitor.py +593 -0
  806. truthound/validators/security/redos/cve_database.py +791 -0
  807. truthound/validators/security/redos/ml/__init__.py +155 -0
  808. truthound/validators/security/redos/ml/base.py +785 -0
  809. truthound/validators/security/redos/ml/datasets.py +618 -0
  810. truthound/validators/security/redos/ml/features.py +359 -0
  811. truthound/validators/security/redos/ml/models.py +1000 -0
  812. truthound/validators/security/redos/ml/predictor.py +507 -0
  813. truthound/validators/security/redos/ml/storage.py +632 -0
  814. truthound/validators/security/redos/ml/training.py +571 -0
  815. truthound/validators/security/redos/ml_analyzer.py +937 -0
  816. truthound/validators/security/redos/optimizer.py +674 -0
  817. truthound/validators/security/redos/profiler.py +682 -0
  818. truthound/validators/security/redos/re2_engine.py +709 -0
  819. truthound/validators/security/redos.py +886 -0
  820. truthound/validators/security/sql_security.py +1247 -0
  821. truthound/validators/streaming/__init__.py +126 -0
  822. truthound/validators/streaming/base.py +292 -0
  823. truthound/validators/streaming/completeness.py +210 -0
  824. truthound/validators/streaming/mixin.py +575 -0
  825. truthound/validators/streaming/range.py +308 -0
  826. truthound/validators/streaming/sources.py +846 -0
  827. truthound/validators/string/__init__.py +57 -0
  828. truthound/validators/string/casing.py +158 -0
  829. truthound/validators/string/charset.py +96 -0
  830. truthound/validators/string/format.py +501 -0
  831. truthound/validators/string/json.py +77 -0
  832. truthound/validators/string/json_schema.py +184 -0
  833. truthound/validators/string/length.py +104 -0
  834. truthound/validators/string/like_pattern.py +237 -0
  835. truthound/validators/string/regex.py +202 -0
  836. truthound/validators/string/regex_extended.py +435 -0
  837. truthound/validators/table/__init__.py +88 -0
  838. truthound/validators/table/base.py +78 -0
  839. truthound/validators/table/column_count.py +198 -0
  840. truthound/validators/table/freshness.py +362 -0
  841. truthound/validators/table/row_count.py +251 -0
  842. truthound/validators/table/schema.py +333 -0
  843. truthound/validators/table/size.py +285 -0
  844. truthound/validators/timeout/__init__.py +102 -0
  845. truthound/validators/timeout/advanced/__init__.py +247 -0
  846. truthound/validators/timeout/advanced/circuit_breaker.py +675 -0
  847. truthound/validators/timeout/advanced/prediction.py +773 -0
  848. truthound/validators/timeout/advanced/priority.py +618 -0
  849. truthound/validators/timeout/advanced/redis_backend.py +770 -0
  850. truthound/validators/timeout/advanced/retry.py +721 -0
  851. truthound/validators/timeout/advanced/sampling.py +788 -0
  852. truthound/validators/timeout/advanced/sla.py +661 -0
  853. truthound/validators/timeout/advanced/telemetry.py +804 -0
  854. truthound/validators/timeout/cascade.py +477 -0
  855. truthound/validators/timeout/deadline.py +657 -0
  856. truthound/validators/timeout/degradation.py +525 -0
  857. truthound/validators/timeout/distributed.py +597 -0
  858. truthound/validators/timeseries/__init__.py +89 -0
  859. truthound/validators/timeseries/base.py +326 -0
  860. truthound/validators/timeseries/completeness.py +617 -0
  861. truthound/validators/timeseries/gap.py +485 -0
  862. truthound/validators/timeseries/monotonic.py +310 -0
  863. truthound/validators/timeseries/seasonality.py +422 -0
  864. truthound/validators/timeseries/trend.py +510 -0
  865. truthound/validators/uniqueness/__init__.py +59 -0
  866. truthound/validators/uniqueness/approximate.py +475 -0
  867. truthound/validators/uniqueness/distinct_values.py +253 -0
  868. truthound/validators/uniqueness/duplicate.py +118 -0
  869. truthound/validators/uniqueness/primary_key.py +140 -0
  870. truthound/validators/uniqueness/unique.py +191 -0
  871. truthound/validators/uniqueness/within_record.py +599 -0
  872. truthound/validators/utils.py +756 -0
  873. truthound-1.0.8.dist-info/METADATA +474 -0
  874. truthound-1.0.8.dist-info/RECORD +877 -0
  875. truthound-1.0.8.dist-info/WHEEL +4 -0
  876. truthound-1.0.8.dist-info/entry_points.txt +2 -0
  877. truthound-1.0.8.dist-info/licenses/LICENSE +190 -0
@@ -0,0 +1,794 @@
1
+ """Memory-safe pattern matcher with integrated sampling.
2
+
3
+ This module provides a pattern matcher that integrates sampling strategies
4
+ to prevent OOM errors while maintaining statistical accuracy.
5
+
6
+ Key features:
7
+ - Configurable sampling strategies
8
+ - Memory-aware processing
9
+ - Statistical confidence reporting
10
+ - Graceful degradation on failures
11
+ - Telemetry integration
12
+
13
+ Example:
14
+ from truthound.profiler.sampled_matcher import (
15
+ SampledPatternMatcher,
16
+ SampledMatcherConfig,
17
+ )
18
+
19
+ matcher = SampledPatternMatcher(
20
+ sampling_config=SamplingConfig.for_accuracy("high"),
21
+ )
22
+
23
+ results = matcher.match_column(lf, "email")
24
+ print(f"Confidence: {results.best_match.confidence:.2%}")
25
+ """
26
+
27
+ from __future__ import annotations
28
+
29
+ import logging
30
+ import time
31
+ from dataclasses import dataclass, field
32
+ from typing import Any, Callable, Sequence
33
+
34
+ import polars as pl
35
+
36
+ from truthound.profiler.base import DataType, PatternMatch
37
+ from truthound.profiler.native_patterns import (
38
+ BUILTIN_PATTERNS,
39
+ NativePatternMatcher,
40
+ PatternMatchResult,
41
+ PatternRegistry,
42
+ PatternSpec,
43
+ )
44
+ from truthound.profiler.sampling import (
45
+ DEFAULT_SAMPLING_CONFIG,
46
+ DataSizeEstimator,
47
+ Sampler,
48
+ SamplingConfig,
49
+ SamplingMetrics,
50
+ SamplingMethod,
51
+ SamplingResult,
52
+ )
53
+
54
+ logger = logging.getLogger(__name__)
55
+
56
+
57
+ # =============================================================================
58
+ # Sampled Match Result
59
+ # =============================================================================
60
+
61
+
62
@dataclass
class SampledPatternMatchResult:
    """Result of testing one pattern on a (possibly sampled) column.

    Besides the raw counts measured on the sample, the result carries the
    statistical context needed to interpret them: an extrapolated match
    count for the full data and a Wilson score interval for the ratio.

    Attributes:
        pattern: The pattern specification that was tested.
        match_count: Number of matches in the sample.
        total_count: Total non-null values in the sample.
        match_ratio: Ratio of matches in the sample.
        sample_matches: Example matching values.
        sample_non_matches: Example non-matching values.
        sampling_metrics: Metrics from the sampling operation, if any.
        estimated_population_matches: Matches extrapolated to the full
            data; overwritten in ``__post_init__`` when the metrics report
            a positive ``original_size``.
        confidence_interval: ``(lower, upper)`` bounds for the match
            ratio; always recomputed in ``__post_init__``.
    """

    pattern: PatternSpec
    match_count: int
    total_count: int
    match_ratio: float
    sample_matches: tuple[str, ...] = field(default_factory=tuple)
    sample_non_matches: tuple[str, ...] = field(default_factory=tuple)
    sampling_metrics: SamplingMetrics | None = None
    estimated_population_matches: int = 0
    confidence_interval: tuple[float, float] = (0.0, 1.0)

    def __post_init__(self) -> None:
        """Fill in the extrapolated match count and the confidence interval."""
        if self.sampling_metrics and self.sampling_metrics.original_size > 0:
            # Scale the sample ratio up to the size of the original data.
            self.estimated_population_matches = int(
                self.match_ratio * self.sampling_metrics.original_size
            )

        self._calculate_confidence_interval()

    def _calculate_confidence_interval(self) -> None:
        """Store the Wilson score interval for ``match_ratio``.

        With an empty sample nothing is known, so the interval is the
        whole range ``(0.0, 1.0)``.
        """
        if self.total_count == 0:
            self.confidence_interval = (0.0, 1.0)
            return

        n = self.total_count
        p = self.match_ratio

        # 95% unless the sampling metrics request another level.
        z = 1.96
        if self.sampling_metrics:
            z = self._z_from_confidence(self.sampling_metrics.confidence_level)

        denominator = 1 + z * z / n
        center = (p + z * z / (2 * n)) / denominator
        spread = z * ((p * (1 - p) / n + z * z / (4 * n * n)) ** 0.5) / denominator

        # Clamp: a ratio can never leave [0, 1].
        lower = max(0.0, center - spread)
        upper = min(1.0, center + spread)

        self.confidence_interval = (lower, upper)

    @staticmethod
    def _z_from_confidence(confidence: float) -> float:
        """Return the two-sided z-score for a confidence level.

        The four commonly used levels keep their conventional rounded
        values. Any other level strictly between 0 and 1 is computed from
        the standard normal quantile function instead of silently being
        treated as 95%. Levels outside that range (or NaN) fall back to
        1.96.

        Args:
            confidence: Confidence level, e.g. ``0.95``.

        Returns:
            The z-score to use in the interval formula.
        """
        z_scores = {
            0.90: 1.645,
            0.95: 1.96,
            0.99: 2.576,
            0.999: 3.291,
        }
        tabulated = z_scores.get(round(confidence, 3))
        if tabulated is not None:
            return tabulated

        # Two-sided interval: half of the excluded mass sits in each tail.
        upper_tail = (1.0 + confidence) / 2.0
        if not 0.5 < upper_tail < 1.0:
            # Not a usable confidence level (<= 0, >= 1, or NaN).
            return 1.96

        # Local import: the module header does not import ``statistics``.
        from statistics import NormalDist

        return NormalDist().inv_cdf(upper_tail)

    @property
    def confidence(self) -> float:
        """Confidence level of the estimate; ``1.0`` when no metrics are attached."""
        if self.sampling_metrics:
            return self.sampling_metrics.confidence_level
        return 1.0  # No sampling = full confidence

    @property
    def is_sampled(self) -> bool:
        """Whether metrics are attached and do not report a full scan."""
        # Truthiness test, matching SampledColumnMatchResult.is_sampled.
        return self.sampling_metrics is not None and not self.sampling_metrics.is_full_scan

    @property
    def margin_of_error(self) -> float:
        """Half the width of the confidence interval."""
        lower, upper = self.confidence_interval
        return (upper - lower) / 2

    def to_pattern_match(self) -> PatternMatch:
        """Convert to the legacy ``PatternMatch`` format."""
        return PatternMatch(
            pattern=self.pattern.name,
            regex=self.pattern.regex,
            match_ratio=self.match_ratio,
            sample_matches=self.sample_matches,
        )

    def to_dict(self) -> dict[str, Any]:
        """Convert to a plain dictionary for serialization."""
        return {
            "pattern_name": self.pattern.name,
            "pattern_regex": self.pattern.regex,
            "match_count": self.match_count,
            "total_count": self.total_count,
            "match_ratio": self.match_ratio,
            "confidence": self.confidence,
            "confidence_interval": list(self.confidence_interval),
            "margin_of_error": self.margin_of_error,
            "is_sampled": self.is_sampled,
            "estimated_population_matches": self.estimated_population_matches,
            "sample_matches": list(self.sample_matches),
            "sampling_metrics": (
                self.sampling_metrics.to_dict() if self.sampling_metrics else None
            ),
        }
182
+
183
+
184
@dataclass
class SampledColumnMatchResult:
    """Everything produced for one column: its matches plus run metadata.

    Attributes:
        column: Name of the analysed column.
        matches: Per-pattern results for the column.
        sampling_metrics: Metrics from the sampling step, or ``None``.
        processing_time_ms: Elapsed processing time in milliseconds.
        inferred_type: Data type inferred for the column, if any.
    """

    column: str
    matches: list[SampledPatternMatchResult]
    sampling_metrics: SamplingMetrics | None
    processing_time_ms: float
    inferred_type: DataType | None = None

    @property
    def has_matches(self) -> bool:
        """Whether at least one pattern result is present."""
        return len(self.matches) >= 1

    @property
    def best_match(self) -> SampledPatternMatchResult | None:
        """First entry of ``matches``, or ``None`` when there are none.

        The list is not re-sorted here, so "best" relies on the order in
        which the matches were supplied.
        """
        return self.matches[0] if self.matches else None

    @property
    def is_sampled(self) -> bool:
        """Whether metrics are attached and do not report a full scan."""
        metrics = self.sampling_metrics
        if metrics is None:
            return False
        return not metrics.is_full_scan

    def to_dict(self) -> dict[str, Any]:
        """Convert to a plain dictionary, serializing nested results too."""
        serialized_matches = [match.to_dict() for match in self.matches]
        if self.sampling_metrics:
            serialized_metrics = self.sampling_metrics.to_dict()
        else:
            serialized_metrics = None
        if self.inferred_type:
            type_value = self.inferred_type.value
        else:
            type_value = None

        return {
            "column": self.column,
            "matches": serialized_matches,
            "sampling_metrics": serialized_metrics,
            "processing_time_ms": self.processing_time_ms,
            "inferred_type": type_value,
            "is_sampled": self.is_sampled,
        }
226
+
227
+
228
+ # =============================================================================
229
+ # Sampled Pattern Matcher Configuration
230
+ # =============================================================================
231
+
232
+
233
@dataclass
class SampledMatcherConfig:
    """Settings bundle for ``SampledPatternMatcher``.

    Attributes:
        sampling_config: Sampling configuration; defaults to the shared
            ``DEFAULT_SAMPLING_CONFIG`` object.
        patterns: Pattern registry to use, or ``None``.
        min_match_ratio: Minimum ratio to consider a match (0 to 1).
        sample_size: Number of sample values to collect (non-negative).
        include_non_matches: Whether to collect non-matching samples.
        parallel_threshold: Row count above which to use parallel processing.
        fallback_on_error: Whether to fall back to another sampling
            attempt when sampling fails.
        cache_sampling_decisions: Cache sampling decisions for same data.
    """

    sampling_config: SamplingConfig = field(default_factory=lambda: DEFAULT_SAMPLING_CONFIG)
    patterns: PatternRegistry | None = None
    min_match_ratio: float = 0.8
    sample_size: int = 5
    include_non_matches: bool = False
    parallel_threshold: int = 100_000
    fallback_on_error: bool = True
    cache_sampling_decisions: bool = True

    def __post_init__(self) -> None:
        """Reject out-of-range values.

        Raises:
            ValueError: If ``min_match_ratio`` is outside ``[0, 1]`` or
                ``sample_size`` is negative.
        """
        ratio = self.min_match_ratio
        ratio_in_range = 0.0 <= ratio <= 1.0
        if not ratio_in_range:
            raise ValueError(f"min_match_ratio must be between 0 and 1, got {ratio}")

        size = self.sample_size
        if size < 0:
            raise ValueError(f"sample_size must be non-negative, got {size}")

    @classmethod
    def fast(cls) -> "SampledMatcherConfig":
        """Preset favouring speed: speed-oriented sampling, looser ratio, fewer examples."""
        speed_sampling = SamplingConfig.for_speed()
        return cls(sampling_config=speed_sampling, min_match_ratio=0.7, sample_size=3)

    @classmethod
    def accurate(cls) -> "SampledMatcherConfig":
        """Preset favouring accuracy: "high" accuracy sampling, stricter ratio, more examples."""
        precise_sampling = SamplingConfig.for_accuracy("high")
        return cls(sampling_config=precise_sampling, min_match_ratio=0.85, sample_size=10)

    @classmethod
    def balanced(cls) -> "SampledMatcherConfig":
        """Middle-ground preset: "medium" accuracy sampling with the default ratio and examples."""
        medium_sampling = SamplingConfig.for_accuracy("medium")
        return cls(sampling_config=medium_sampling, min_match_ratio=0.8, sample_size=5)
292
+
293
+
294
+ # =============================================================================
295
+ # Sampled Pattern Matcher
296
+ # =============================================================================
297
+
298
+
299
+ class SampledPatternMatcher:
300
+ """Memory-safe pattern matcher with integrated sampling.
301
+
302
+ This is the recommended pattern matcher for production use.
303
+ It automatically samples large datasets to prevent OOM errors
304
+ while providing statistical confidence metrics.
305
+
306
+ Example:
307
+ # Basic usage
308
+ matcher = SampledPatternMatcher()
309
+ results = matcher.match_column(lf, "email")
310
+
311
+ for result in results.matches:
312
+ print(f"{result.pattern.name}: {result.match_ratio:.2%} "
313
+ f"(±{result.margin_of_error:.2%})")
314
+
315
+ # Custom configuration
316
+ config = SampledMatcherConfig(
317
+ sampling_config=SamplingConfig(
318
+ strategy=SamplingMethod.RANDOM,
319
+ max_rows=50_000,
320
+ confidence_level=0.99,
321
+ ),
322
+ min_match_ratio=0.9,
323
+ )
324
+ matcher = SampledPatternMatcher(config=config)
325
+
326
+ # Memory-constrained environment
327
+ matcher = SampledPatternMatcher(
328
+ config=SampledMatcherConfig(
329
+ sampling_config=SamplingConfig.for_memory(max_memory_mb=100)
330
+ )
331
+ )
332
+ """
333
+
334
def __init__(
    self,
    config: SampledMatcherConfig | None = None,
    sampling_config: SamplingConfig | None = None,
    patterns: PatternRegistry | None = None,
):
    """Initialize the sampled pattern matcher.

    The caller's ``config`` object is never modified. When a convenience
    override is given, the matcher works on its own shallow copy, so a
    config shared between several matchers keeps its original values.

    Args:
        config: Full matcher configuration; the balanced preset is used
            when omitted.
        sampling_config: Override sampling config (convenience).
        patterns: Override pattern registry (convenience).
    """
    # Local import: the module header does not import ``copy``.
    import copy

    resolved = config or SampledMatcherConfig.balanced()

    if sampling_config is not None or patterns is not None:
        # Shallow copy: only the two overridden attributes are rebound,
        # everything else is still shared with the original.
        resolved = copy.copy(resolved)
        if sampling_config is not None:
            resolved.sampling_config = sampling_config
        if patterns is not None:
            resolved.patterns = patterns

    self.config = resolved

    # Collaborators built from the resolved configuration.
    self._sampler = Sampler(self.config.sampling_config)
    self._size_estimator = DataSizeEstimator()
    self._patterns = self.config.patterns or BUILTIN_PATTERNS

    # Internal matcher that performs the actual pattern matching.
    self._matcher = NativePatternMatcher(
        patterns=self._patterns,
        min_match_ratio=self.config.min_match_ratio,
        sample_size=self.config.sample_size,
        include_non_matches=self.config.include_non_matches,
    )
367
+
368
+ @property
369
+ def patterns(self) -> PatternRegistry:
370
+ """Get the pattern registry."""
371
+ return self._patterns
372
+
373
+ @property
374
+ def sampling_config(self) -> SamplingConfig:
375
+ """Get the sampling configuration."""
376
+ return self.config.sampling_config
377
+
378
def match_column(
    self,
    lf: pl.LazyFrame,
    column: str,
    *,
    patterns: Sequence[PatternSpec] | None = None,
    sampling_config: SamplingConfig | None = None,
) -> SampledColumnMatchResult:
    """Sample a column, match patterns on the sample, and report the result.

    Steps:
        1. Sample the column, optionally retrying with the fallback
           sampler when sampling fails.
        2. Run pattern matching on the sampled data; a failure here is
           logged and treated as "no matches".
        3. Pass the raw matches and the sampling metrics to
           ``self._enhance_results``.
        4. Take the inferred type from the first enhanced result.

    Args:
        lf: LazyFrame containing the data.
        column: Column name to analyze.
        patterns: Optional specific patterns to check.
        sampling_config: Sampling config for this call only; the
            matcher-wide one is used when omitted.

    Returns:
        SampledColumnMatchResult with matches, metrics, and timing.

    Raises:
        Exception: Re-raised from sampling when ``fallback_on_error`` is
            disabled; errors from the fallback sampler also propagate.
    """
    started = time.perf_counter()

    # The per-call config wins when given (same truthiness rule as ``or``).
    active_config = sampling_config if sampling_config else self.config.sampling_config

    try:
        sample = self._sample_column(lf, column, active_config)
    except Exception as exc:
        logger.error(f"Sampling failed for column '{column}': {exc}")
        if not self.config.fallback_on_error:
            raise
        sample = self._fallback_sample(lf, column, active_config)

    try:
        raw_matches = self._match_on_sample(sample.data, column, patterns)
    except Exception as exc:
        logger.error(f"Pattern matching failed for column '{column}': {exc}")
        raw_matches = []

    enhanced = self._enhance_results(raw_matches, sample.metrics)

    # The first enhanced result, if any, decides the inferred type.
    top_type = enhanced[0].pattern.data_type if enhanced else None

    duration_ms = (time.perf_counter() - started) * 1000

    return SampledColumnMatchResult(
        column=column,
        matches=enhanced,
        sampling_metrics=sample.metrics,
        processing_time_ms=duration_ms,
        inferred_type=top_type,
    )
450
+
451
def match_all_columns(
    self,
    lf: pl.LazyFrame,
    *,
    string_columns_only: bool = True,
    sampling_config: SamplingConfig | None = None,
) -> dict[str, SampledColumnMatchResult]:
    """Run ``match_column`` over the frame's columns and keep the hits.

    Args:
        lf: LazyFrame to analyze.
        string_columns_only: Skip columns whose dtype is not
            ``pl.String`` / ``pl.Utf8``.
        sampling_config: Sampling configuration forwarded to each
            ``match_column`` call.

    Returns:
        Mapping of column name to result, containing only columns that
        produced at least one match.
    """
    matched: dict[str, SampledColumnMatchResult] = {}

    for name, dtype in lf.collect_schema().items():
        if string_columns_only and dtype not in {pl.String, pl.Utf8}:
            continue

        outcome = self.match_column(lf, name, sampling_config=sampling_config)
        if outcome.has_matches:
            matched[name] = outcome

    return matched
486
+
487
def infer_type(
    self,
    lf: pl.LazyFrame,
    column: str,
    *,
    min_match_ratio: float | None = None,
) -> DataType | None:
    """Infer semantic type for a column.

    Args:
        lf: LazyFrame containing the data
        column: Column name to analyze
        min_match_ratio: Optional stricter threshold. When given, the best
            match must reach this ratio for a type to be reported. The
            check is applied to the results ``match_column`` returns, so
            it can tighten the outcome but cannot bring back matches that
            were already dropped upstream.

    Returns:
        Inferred DataType, or None when nothing matched or the best match
        falls below ``min_match_ratio``.
    """
    result = self.match_column(lf, column)

    # No override (or nothing inferred): pass the result through unchanged.
    if min_match_ratio is None or result.inferred_type is None:
        return result.inferred_type

    # ``inferred_type`` is taken from the first match, so that is the one
    # the override has to be checked against.
    best = result.matches[0] if result.matches else None
    if best is None or best.match_ratio < min_match_ratio:
        return None

    return result.inferred_type
506
+
507
def _sample_column(
    self,
    lf: pl.LazyFrame,
    column: str,
    config: SamplingConfig,
) -> SamplingResult:
    """Draw a sample of one column using the given sampling configuration."""
    # Project down to the requested column before handing off to the sampler.
    projected = lf.select(pl.col(column))
    sampler = self._sampler
    return sampler.sample(projected, config)
517
+
518
def _fallback_sample(
    self,
    lf: pl.LazyFrame,
    column: str,
    config: SamplingConfig,
) -> SamplingResult:
    """Re-sample a column with the configuration's fallback strategy.

    A new ``SamplingConfig`` is built from ``config.fallback_strategy``,
    keeping the confidence level and margin of error. The row cap falls
    back to 10,000 when ``config.max_rows`` is falsy.
    """
    row_cap = config.max_rows or 10_000
    relaxed = SamplingConfig(
        strategy=config.fallback_strategy,
        max_rows=row_cap,
        confidence_level=config.confidence_level,
        margin_of_error=config.margin_of_error,
    )
    projected = lf.select(pl.col(column))
    return self._sampler.sample(projected, relaxed)
533
+
534
def _match_on_sample(
    self,
    sampled_lf: pl.LazyFrame,
    column: str,
    patterns: Sequence[PatternSpec] | None,
) -> list[PatternMatchResult]:
    """Delegate pattern matching for ``column`` to the wrapped matcher."""
    matcher = self._matcher
    # limit=None: the frame passed in is already the sample.
    return matcher.match_column(sampled_lf, column, patterns=patterns, limit=None)
547
+
548
def _enhance_results(
    self,
    results: list[PatternMatchResult],
    sampling_metrics: SamplingMetrics,
) -> list[SampledPatternMatchResult]:
    """Wrap each plain match result together with the sampling metrics.

    Every field of the incoming result is copied over unchanged; the same
    ``sampling_metrics`` object is attached to each wrapped result.
    """
    return [
        SampledPatternMatchResult(
            pattern=plain.pattern,
            match_count=plain.match_count,
            total_count=plain.total_count,
            match_ratio=plain.match_ratio,
            sample_matches=plain.sample_matches,
            sample_non_matches=plain.sample_non_matches,
            sampling_metrics=sampling_metrics,
        )
        for plain in results
    ]
570
+
571
+
572
+ # =============================================================================
573
+ # Factory Functions
574
+ # =============================================================================
575
+
576
+
577
def create_sampled_matcher(
    strategy: str | SamplingMethod = "adaptive",
    max_rows: int = 100_000,
    min_match_ratio: float = 0.8,
    **kwargs: Any,
) -> SampledPatternMatcher:
    """Build a ``SampledPatternMatcher`` from a handful of common options.

    Args:
        strategy: Sampling strategy, as a ``SamplingMethod`` or its string
            value (strings are converted with ``SamplingMethod(...)``).
        max_rows: Row cap passed to the sampling configuration.
        min_match_ratio: Match-ratio threshold stored on the matcher config.
        **kwargs: Extra keyword arguments forwarded to ``SamplingConfig``.

    Returns:
        A matcher constructed from the assembled configuration.

    Example:
        matcher = create_sampled_matcher(
            strategy="random",
            max_rows=50_000,
            confidence_level=0.99,
        )
    """
    method = SamplingMethod(strategy) if isinstance(strategy, str) else strategy

    return SampledPatternMatcher(
        config=SampledMatcherConfig(
            sampling_config=SamplingConfig(
                strategy=method,
                max_rows=max_rows,
                **kwargs,
            ),
            min_match_ratio=min_match_ratio,
        )
    )
616
+
617
+
618
def match_patterns_safe(
    data: pl.LazyFrame | pl.DataFrame,
    column: str,
    *,
    max_rows: int = 100_000,
    min_ratio: float = 0.8,
) -> SampledColumnMatchResult:
    """One-call pattern matching that always goes through a sampled matcher.

    Args:
        data: DataFrame or LazyFrame; a DataFrame is converted with ``.lazy()``.
        column: Name of the column to analyze.
        max_rows: Row cap handed to ``create_sampled_matcher``.
        min_ratio: Minimum match ratio handed to ``create_sampled_matcher``.

    Returns:
        The ``SampledColumnMatchResult`` produced for ``column``.

    Example:
        import polars as pl
        from truthound.profiler.sampled_matcher import match_patterns_safe

        # Scan lazily so the whole file is not loaded before sampling.
        lf = pl.scan_parquet("large_file.parquet")
        result = match_patterns_safe(lf, "email_column")

        print(f"Best match: {result.best_match.pattern.name}")
        print(f"Confidence: {result.best_match.confidence:.2%}")
    """
    lazy_frame = data.lazy() if isinstance(data, pl.DataFrame) else data
    sampled_matcher = create_sampled_matcher(
        max_rows=max_rows,
        min_match_ratio=min_ratio,
    )
    return sampled_matcher.match_column(lazy_frame, column)
657
+
658
+
659
def infer_column_type_safe(
    data: pl.LazyFrame | pl.DataFrame,
    column: str,
    *,
    max_rows: int = 100_000,
    min_ratio: float = 0.9,
) -> DataType | None:
    """One-call type inference that always goes through a sampled matcher.

    Args:
        data: DataFrame or LazyFrame; a DataFrame is converted with ``.lazy()``.
        column: Name of the column to inspect.
        max_rows: Row cap handed to ``create_sampled_matcher``.
        min_ratio: Minimum match ratio handed to ``create_sampled_matcher``.

    Returns:
        The inferred DataType, or None when the matcher reports none.

    Example:
        from truthound.profiler.sampled_matcher import infer_column_type_safe

        dtype = infer_column_type_safe(df, "mystery_column")
        if dtype:
            print(f"Detected type: {dtype.value}")
    """
    lazy_frame = data.lazy() if isinstance(data, pl.DataFrame) else data
    sampled_matcher = create_sampled_matcher(
        max_rows=max_rows,
        min_match_ratio=min_ratio,
    )
    return sampled_matcher.infer_type(lazy_frame, column)
693
+
694
+
695
+ # =============================================================================
696
+ # Integration with NativePatternMatcher (Backward Compatibility)
697
+ # =============================================================================
698
+
699
+
700
class SafeNativePatternMatcher(NativePatternMatcher):
    """NativePatternMatcher variant that samples a column before matching.

    The constructor accepts the parent's options plus three sampling
    options, and ``match_column`` keeps the parent's signature, so the
    class can be swapped in where a NativePatternMatcher was used.

    Example:
        # Drop-in replacement
        matcher = SafeNativePatternMatcher(max_rows=50_000)
        results = matcher.match_column(lf, "email")
    """

    def __init__(
        self,
        patterns: PatternRegistry | None = None,
        *,
        min_match_ratio: float = 0.8,
        sample_size: int = 5,
        include_non_matches: bool = False,
        # New sampling options
        max_rows: int = 100_000,
        sampling_strategy: SamplingMethod = SamplingMethod.ADAPTIVE,
        confidence_level: float = 0.95,
    ):
        """Set up the parent matcher and a sampler.

        Args:
            patterns: Pattern registry, forwarded to the parent.
            min_match_ratio: Minimum match ratio, forwarded to the parent.
            sample_size: Number of sample values, forwarded to the parent.
            include_non_matches: Whether to include non-matching samples,
                forwarded to the parent.
            max_rows: Row cap for the sampling configuration.
            sampling_strategy: Strategy for the sampling configuration.
            confidence_level: Confidence level for the sampling configuration.
        """
        super().__init__(
            patterns=patterns,
            min_match_ratio=min_match_ratio,
            sample_size=sample_size,
            include_non_matches=include_non_matches,
        )

        sampling_settings = SamplingConfig(
            strategy=sampling_strategy,
            max_rows=max_rows,
            confidence_level=confidence_level,
        )
        self._sampling_config = sampling_settings
        self._sampler = Sampler(sampling_settings)

    def match_column(
        self,
        lf: pl.LazyFrame,
        column: str,
        *,
        patterns: Sequence[PatternSpec] | None = None,
        limit: int | None = None,  # Now uses sampling instead
    ) -> list[PatternMatchResult]:
        """Sample ``column`` and run the parent's matching on the sample.

        Args:
            lf: LazyFrame containing the data.
            column: Name of the column to analyze.
            patterns: Optional patterns to check, forwarded to the parent.
            limit: Accepted for signature compatibility but not used; the
                parent is always called with ``limit=None``.

        Returns:
            The list of PatternMatchResult returned by the parent matcher.
        """
        sample = self._sampler.sample(lf.select(pl.col(column)))

        # Only report when the sampler actually reduced the data.
        if sample.is_sampled:
            metrics = sample.metrics
            logger.debug(
                f"Sampled column '{column}': "
                f"{metrics.sample_size:,} of "
                f"{metrics.original_size:,} rows "
                f"({metrics.sampling_ratio:.1%})"
            )

        # limit=None: the frame handed to the parent is already the sample.
        return super().match_column(
            sample.data,
            column,
            patterns=patterns,
            limit=None,
        )