truthound 1.0.8__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (877):
  1. truthound/__init__.py +162 -0
  2. truthound/adapters.py +100 -0
  3. truthound/api.py +365 -0
  4. truthound/audit/__init__.py +248 -0
  5. truthound/audit/core.py +967 -0
  6. truthound/audit/filters.py +620 -0
  7. truthound/audit/formatters.py +707 -0
  8. truthound/audit/logger.py +902 -0
  9. truthound/audit/middleware.py +571 -0
  10. truthound/audit/storage.py +1083 -0
  11. truthound/benchmark/__init__.py +123 -0
  12. truthound/benchmark/base.py +757 -0
  13. truthound/benchmark/comparison.py +635 -0
  14. truthound/benchmark/generators.py +706 -0
  15. truthound/benchmark/reporters.py +718 -0
  16. truthound/benchmark/runner.py +635 -0
  17. truthound/benchmark/scenarios.py +712 -0
  18. truthound/cache.py +252 -0
  19. truthound/checkpoint/__init__.py +136 -0
  20. truthound/checkpoint/actions/__init__.py +164 -0
  21. truthound/checkpoint/actions/base.py +324 -0
  22. truthound/checkpoint/actions/custom.py +234 -0
  23. truthound/checkpoint/actions/discord_notify.py +290 -0
  24. truthound/checkpoint/actions/email_notify.py +405 -0
  25. truthound/checkpoint/actions/github_action.py +406 -0
  26. truthound/checkpoint/actions/opsgenie.py +1499 -0
  27. truthound/checkpoint/actions/pagerduty.py +226 -0
  28. truthound/checkpoint/actions/slack_notify.py +233 -0
  29. truthound/checkpoint/actions/store_result.py +249 -0
  30. truthound/checkpoint/actions/teams_notify.py +1570 -0
  31. truthound/checkpoint/actions/telegram_notify.py +419 -0
  32. truthound/checkpoint/actions/update_docs.py +552 -0
  33. truthound/checkpoint/actions/webhook.py +293 -0
  34. truthound/checkpoint/analytics/__init__.py +147 -0
  35. truthound/checkpoint/analytics/aggregations/__init__.py +23 -0
  36. truthound/checkpoint/analytics/aggregations/rollup.py +481 -0
  37. truthound/checkpoint/analytics/aggregations/time_bucket.py +306 -0
  38. truthound/checkpoint/analytics/analyzers/__init__.py +17 -0
  39. truthound/checkpoint/analytics/analyzers/anomaly.py +386 -0
  40. truthound/checkpoint/analytics/analyzers/base.py +270 -0
  41. truthound/checkpoint/analytics/analyzers/forecast.py +421 -0
  42. truthound/checkpoint/analytics/analyzers/trend.py +314 -0
  43. truthound/checkpoint/analytics/models.py +292 -0
  44. truthound/checkpoint/analytics/protocols.py +549 -0
  45. truthound/checkpoint/analytics/service.py +718 -0
  46. truthound/checkpoint/analytics/stores/__init__.py +16 -0
  47. truthound/checkpoint/analytics/stores/base.py +306 -0
  48. truthound/checkpoint/analytics/stores/memory_store.py +353 -0
  49. truthound/checkpoint/analytics/stores/sqlite_store.py +557 -0
  50. truthound/checkpoint/analytics/stores/timescale_store.py +501 -0
  51. truthound/checkpoint/async_actions.py +794 -0
  52. truthound/checkpoint/async_base.py +708 -0
  53. truthound/checkpoint/async_checkpoint.py +617 -0
  54. truthound/checkpoint/async_runner.py +639 -0
  55. truthound/checkpoint/checkpoint.py +527 -0
  56. truthound/checkpoint/ci/__init__.py +61 -0
  57. truthound/checkpoint/ci/detector.py +355 -0
  58. truthound/checkpoint/ci/reporter.py +436 -0
  59. truthound/checkpoint/ci/templates.py +454 -0
  60. truthound/checkpoint/circuitbreaker/__init__.py +133 -0
  61. truthound/checkpoint/circuitbreaker/breaker.py +542 -0
  62. truthound/checkpoint/circuitbreaker/core.py +252 -0
  63. truthound/checkpoint/circuitbreaker/detection.py +459 -0
  64. truthound/checkpoint/circuitbreaker/middleware.py +389 -0
  65. truthound/checkpoint/circuitbreaker/registry.py +357 -0
  66. truthound/checkpoint/distributed/__init__.py +139 -0
  67. truthound/checkpoint/distributed/backends/__init__.py +35 -0
  68. truthound/checkpoint/distributed/backends/celery_backend.py +503 -0
  69. truthound/checkpoint/distributed/backends/kubernetes_backend.py +696 -0
  70. truthound/checkpoint/distributed/backends/local_backend.py +397 -0
  71. truthound/checkpoint/distributed/backends/ray_backend.py +625 -0
  72. truthound/checkpoint/distributed/base.py +774 -0
  73. truthound/checkpoint/distributed/orchestrator.py +765 -0
  74. truthound/checkpoint/distributed/protocols.py +842 -0
  75. truthound/checkpoint/distributed/registry.py +449 -0
  76. truthound/checkpoint/idempotency/__init__.py +120 -0
  77. truthound/checkpoint/idempotency/core.py +295 -0
  78. truthound/checkpoint/idempotency/fingerprint.py +454 -0
  79. truthound/checkpoint/idempotency/locking.py +604 -0
  80. truthound/checkpoint/idempotency/service.py +592 -0
  81. truthound/checkpoint/idempotency/stores.py +653 -0
  82. truthound/checkpoint/monitoring/__init__.py +134 -0
  83. truthound/checkpoint/monitoring/aggregators/__init__.py +15 -0
  84. truthound/checkpoint/monitoring/aggregators/base.py +372 -0
  85. truthound/checkpoint/monitoring/aggregators/realtime.py +300 -0
  86. truthound/checkpoint/monitoring/aggregators/window.py +493 -0
  87. truthound/checkpoint/monitoring/collectors/__init__.py +17 -0
  88. truthound/checkpoint/monitoring/collectors/base.py +257 -0
  89. truthound/checkpoint/monitoring/collectors/memory_collector.py +617 -0
  90. truthound/checkpoint/monitoring/collectors/prometheus_collector.py +451 -0
  91. truthound/checkpoint/monitoring/collectors/redis_collector.py +518 -0
  92. truthound/checkpoint/monitoring/events.py +410 -0
  93. truthound/checkpoint/monitoring/protocols.py +636 -0
  94. truthound/checkpoint/monitoring/service.py +578 -0
  95. truthound/checkpoint/monitoring/views/__init__.py +17 -0
  96. truthound/checkpoint/monitoring/views/base.py +172 -0
  97. truthound/checkpoint/monitoring/views/queue_view.py +220 -0
  98. truthound/checkpoint/monitoring/views/task_view.py +240 -0
  99. truthound/checkpoint/monitoring/views/worker_view.py +263 -0
  100. truthound/checkpoint/registry.py +337 -0
  101. truthound/checkpoint/runner.py +356 -0
  102. truthound/checkpoint/transaction/__init__.py +133 -0
  103. truthound/checkpoint/transaction/base.py +389 -0
  104. truthound/checkpoint/transaction/compensatable.py +537 -0
  105. truthound/checkpoint/transaction/coordinator.py +576 -0
  106. truthound/checkpoint/transaction/executor.py +622 -0
  107. truthound/checkpoint/transaction/idempotency.py +534 -0
  108. truthound/checkpoint/transaction/saga/__init__.py +143 -0
  109. truthound/checkpoint/transaction/saga/builder.py +584 -0
  110. truthound/checkpoint/transaction/saga/definition.py +515 -0
  111. truthound/checkpoint/transaction/saga/event_store.py +542 -0
  112. truthound/checkpoint/transaction/saga/patterns.py +833 -0
  113. truthound/checkpoint/transaction/saga/runner.py +718 -0
  114. truthound/checkpoint/transaction/saga/state_machine.py +793 -0
  115. truthound/checkpoint/transaction/saga/strategies.py +780 -0
  116. truthound/checkpoint/transaction/saga/testing.py +886 -0
  117. truthound/checkpoint/triggers/__init__.py +58 -0
  118. truthound/checkpoint/triggers/base.py +237 -0
  119. truthound/checkpoint/triggers/event.py +385 -0
  120. truthound/checkpoint/triggers/schedule.py +355 -0
  121. truthound/cli.py +2358 -0
  122. truthound/cli_modules/__init__.py +124 -0
  123. truthound/cli_modules/advanced/__init__.py +45 -0
  124. truthound/cli_modules/advanced/benchmark.py +343 -0
  125. truthound/cli_modules/advanced/docs.py +225 -0
  126. truthound/cli_modules/advanced/lineage.py +209 -0
  127. truthound/cli_modules/advanced/ml.py +320 -0
  128. truthound/cli_modules/advanced/realtime.py +196 -0
  129. truthound/cli_modules/checkpoint/__init__.py +46 -0
  130. truthound/cli_modules/checkpoint/init.py +114 -0
  131. truthound/cli_modules/checkpoint/list.py +71 -0
  132. truthound/cli_modules/checkpoint/run.py +159 -0
  133. truthound/cli_modules/checkpoint/validate.py +67 -0
  134. truthound/cli_modules/common/__init__.py +71 -0
  135. truthound/cli_modules/common/errors.py +414 -0
  136. truthound/cli_modules/common/options.py +419 -0
  137. truthound/cli_modules/common/output.py +507 -0
  138. truthound/cli_modules/common/protocol.py +552 -0
  139. truthound/cli_modules/core/__init__.py +48 -0
  140. truthound/cli_modules/core/check.py +123 -0
  141. truthound/cli_modules/core/compare.py +104 -0
  142. truthound/cli_modules/core/learn.py +57 -0
  143. truthound/cli_modules/core/mask.py +77 -0
  144. truthound/cli_modules/core/profile.py +65 -0
  145. truthound/cli_modules/core/scan.py +61 -0
  146. truthound/cli_modules/profiler/__init__.py +51 -0
  147. truthound/cli_modules/profiler/auto_profile.py +175 -0
  148. truthound/cli_modules/profiler/metadata.py +107 -0
  149. truthound/cli_modules/profiler/suite.py +283 -0
  150. truthound/cli_modules/registry.py +431 -0
  151. truthound/cli_modules/scaffolding/__init__.py +89 -0
  152. truthound/cli_modules/scaffolding/base.py +631 -0
  153. truthound/cli_modules/scaffolding/commands.py +545 -0
  154. truthound/cli_modules/scaffolding/plugins.py +1072 -0
  155. truthound/cli_modules/scaffolding/reporters.py +594 -0
  156. truthound/cli_modules/scaffolding/validators.py +1127 -0
  157. truthound/common/__init__.py +18 -0
  158. truthound/common/resilience/__init__.py +130 -0
  159. truthound/common/resilience/bulkhead.py +266 -0
  160. truthound/common/resilience/circuit_breaker.py +516 -0
  161. truthound/common/resilience/composite.py +332 -0
  162. truthound/common/resilience/config.py +292 -0
  163. truthound/common/resilience/protocols.py +217 -0
  164. truthound/common/resilience/rate_limiter.py +404 -0
  165. truthound/common/resilience/retry.py +341 -0
  166. truthound/datadocs/__init__.py +260 -0
  167. truthound/datadocs/base.py +571 -0
  168. truthound/datadocs/builder.py +761 -0
  169. truthound/datadocs/charts.py +764 -0
  170. truthound/datadocs/dashboard/__init__.py +63 -0
  171. truthound/datadocs/dashboard/app.py +576 -0
  172. truthound/datadocs/dashboard/components.py +584 -0
  173. truthound/datadocs/dashboard/state.py +240 -0
  174. truthound/datadocs/engine/__init__.py +46 -0
  175. truthound/datadocs/engine/context.py +376 -0
  176. truthound/datadocs/engine/pipeline.py +618 -0
  177. truthound/datadocs/engine/registry.py +469 -0
  178. truthound/datadocs/exporters/__init__.py +49 -0
  179. truthound/datadocs/exporters/base.py +198 -0
  180. truthound/datadocs/exporters/html.py +178 -0
  181. truthound/datadocs/exporters/json_exporter.py +253 -0
  182. truthound/datadocs/exporters/markdown.py +284 -0
  183. truthound/datadocs/exporters/pdf.py +392 -0
  184. truthound/datadocs/i18n/__init__.py +86 -0
  185. truthound/datadocs/i18n/catalog.py +960 -0
  186. truthound/datadocs/i18n/formatting.py +505 -0
  187. truthound/datadocs/i18n/loader.py +256 -0
  188. truthound/datadocs/i18n/plurals.py +378 -0
  189. truthound/datadocs/renderers/__init__.py +42 -0
  190. truthound/datadocs/renderers/base.py +401 -0
  191. truthound/datadocs/renderers/custom.py +342 -0
  192. truthound/datadocs/renderers/jinja.py +697 -0
  193. truthound/datadocs/sections.py +736 -0
  194. truthound/datadocs/styles.py +931 -0
  195. truthound/datadocs/themes/__init__.py +101 -0
  196. truthound/datadocs/themes/base.py +336 -0
  197. truthound/datadocs/themes/default.py +417 -0
  198. truthound/datadocs/themes/enterprise.py +419 -0
  199. truthound/datadocs/themes/loader.py +336 -0
  200. truthound/datadocs/themes.py +301 -0
  201. truthound/datadocs/transformers/__init__.py +57 -0
  202. truthound/datadocs/transformers/base.py +268 -0
  203. truthound/datadocs/transformers/enrichers.py +544 -0
  204. truthound/datadocs/transformers/filters.py +447 -0
  205. truthound/datadocs/transformers/i18n.py +468 -0
  206. truthound/datadocs/versioning/__init__.py +62 -0
  207. truthound/datadocs/versioning/diff.py +639 -0
  208. truthound/datadocs/versioning/storage.py +497 -0
  209. truthound/datadocs/versioning/version.py +358 -0
  210. truthound/datasources/__init__.py +223 -0
  211. truthound/datasources/_async_protocols.py +222 -0
  212. truthound/datasources/_protocols.py +159 -0
  213. truthound/datasources/adapters.py +428 -0
  214. truthound/datasources/async_base.py +599 -0
  215. truthound/datasources/async_factory.py +511 -0
  216. truthound/datasources/base.py +516 -0
  217. truthound/datasources/factory.py +433 -0
  218. truthound/datasources/nosql/__init__.py +47 -0
  219. truthound/datasources/nosql/base.py +487 -0
  220. truthound/datasources/nosql/elasticsearch.py +801 -0
  221. truthound/datasources/nosql/mongodb.py +636 -0
  222. truthound/datasources/pandas_optimized.py +582 -0
  223. truthound/datasources/pandas_source.py +216 -0
  224. truthound/datasources/polars_source.py +395 -0
  225. truthound/datasources/spark_source.py +479 -0
  226. truthound/datasources/sql/__init__.py +154 -0
  227. truthound/datasources/sql/base.py +710 -0
  228. truthound/datasources/sql/bigquery.py +410 -0
  229. truthound/datasources/sql/cloud_base.py +199 -0
  230. truthound/datasources/sql/databricks.py +471 -0
  231. truthound/datasources/sql/mysql.py +316 -0
  232. truthound/datasources/sql/oracle.py +427 -0
  233. truthound/datasources/sql/postgresql.py +321 -0
  234. truthound/datasources/sql/redshift.py +479 -0
  235. truthound/datasources/sql/snowflake.py +439 -0
  236. truthound/datasources/sql/sqlite.py +286 -0
  237. truthound/datasources/sql/sqlserver.py +437 -0
  238. truthound/datasources/streaming/__init__.py +47 -0
  239. truthound/datasources/streaming/base.py +350 -0
  240. truthound/datasources/streaming/kafka.py +670 -0
  241. truthound/decorators.py +98 -0
  242. truthound/docs/__init__.py +69 -0
  243. truthound/docs/extractor.py +971 -0
  244. truthound/docs/generator.py +601 -0
  245. truthound/docs/parser.py +1037 -0
  246. truthound/docs/renderer.py +999 -0
  247. truthound/drift/__init__.py +22 -0
  248. truthound/drift/compare.py +189 -0
  249. truthound/drift/detectors.py +464 -0
  250. truthound/drift/report.py +160 -0
  251. truthound/execution/__init__.py +65 -0
  252. truthound/execution/_protocols.py +324 -0
  253. truthound/execution/base.py +576 -0
  254. truthound/execution/distributed/__init__.py +179 -0
  255. truthound/execution/distributed/aggregations.py +731 -0
  256. truthound/execution/distributed/arrow_bridge.py +817 -0
  257. truthound/execution/distributed/base.py +550 -0
  258. truthound/execution/distributed/dask_engine.py +976 -0
  259. truthound/execution/distributed/mixins.py +766 -0
  260. truthound/execution/distributed/protocols.py +756 -0
  261. truthound/execution/distributed/ray_engine.py +1127 -0
  262. truthound/execution/distributed/registry.py +446 -0
  263. truthound/execution/distributed/spark_engine.py +1011 -0
  264. truthound/execution/distributed/validator_adapter.py +682 -0
  265. truthound/execution/pandas_engine.py +401 -0
  266. truthound/execution/polars_engine.py +497 -0
  267. truthound/execution/pushdown/__init__.py +230 -0
  268. truthound/execution/pushdown/ast.py +1550 -0
  269. truthound/execution/pushdown/builder.py +1550 -0
  270. truthound/execution/pushdown/dialects.py +1072 -0
  271. truthound/execution/pushdown/executor.py +829 -0
  272. truthound/execution/pushdown/optimizer.py +1041 -0
  273. truthound/execution/sql_engine.py +518 -0
  274. truthound/infrastructure/__init__.py +189 -0
  275. truthound/infrastructure/audit.py +1515 -0
  276. truthound/infrastructure/config.py +1133 -0
  277. truthound/infrastructure/encryption.py +1132 -0
  278. truthound/infrastructure/logging.py +1503 -0
  279. truthound/infrastructure/metrics.py +1220 -0
  280. truthound/lineage/__init__.py +89 -0
  281. truthound/lineage/base.py +746 -0
  282. truthound/lineage/impact_analysis.py +474 -0
  283. truthound/lineage/integrations/__init__.py +22 -0
  284. truthound/lineage/integrations/openlineage.py +548 -0
  285. truthound/lineage/tracker.py +512 -0
  286. truthound/lineage/visualization/__init__.py +33 -0
  287. truthound/lineage/visualization/protocols.py +145 -0
  288. truthound/lineage/visualization/renderers/__init__.py +20 -0
  289. truthound/lineage/visualization/renderers/cytoscape.py +329 -0
  290. truthound/lineage/visualization/renderers/d3.py +331 -0
  291. truthound/lineage/visualization/renderers/graphviz.py +276 -0
  292. truthound/lineage/visualization/renderers/mermaid.py +308 -0
  293. truthound/maskers.py +113 -0
  294. truthound/ml/__init__.py +124 -0
  295. truthound/ml/anomaly_models/__init__.py +31 -0
  296. truthound/ml/anomaly_models/ensemble.py +362 -0
  297. truthound/ml/anomaly_models/isolation_forest.py +444 -0
  298. truthound/ml/anomaly_models/statistical.py +392 -0
  299. truthound/ml/base.py +1178 -0
  300. truthound/ml/drift_detection/__init__.py +26 -0
  301. truthound/ml/drift_detection/concept.py +381 -0
  302. truthound/ml/drift_detection/distribution.py +361 -0
  303. truthound/ml/drift_detection/feature.py +442 -0
  304. truthound/ml/drift_detection/multivariate.py +495 -0
  305. truthound/ml/monitoring/__init__.py +88 -0
  306. truthound/ml/monitoring/alerting/__init__.py +33 -0
  307. truthound/ml/monitoring/alerting/handlers.py +427 -0
  308. truthound/ml/monitoring/alerting/rules.py +508 -0
  309. truthound/ml/monitoring/collectors/__init__.py +19 -0
  310. truthound/ml/monitoring/collectors/composite.py +105 -0
  311. truthound/ml/monitoring/collectors/drift.py +324 -0
  312. truthound/ml/monitoring/collectors/performance.py +179 -0
  313. truthound/ml/monitoring/collectors/quality.py +369 -0
  314. truthound/ml/monitoring/monitor.py +536 -0
  315. truthound/ml/monitoring/protocols.py +451 -0
  316. truthound/ml/monitoring/stores/__init__.py +15 -0
  317. truthound/ml/monitoring/stores/memory.py +201 -0
  318. truthound/ml/monitoring/stores/prometheus.py +296 -0
  319. truthound/ml/rule_learning/__init__.py +25 -0
  320. truthound/ml/rule_learning/constraint_miner.py +443 -0
  321. truthound/ml/rule_learning/pattern_learner.py +499 -0
  322. truthound/ml/rule_learning/profile_learner.py +462 -0
  323. truthound/multitenancy/__init__.py +326 -0
  324. truthound/multitenancy/core.py +852 -0
  325. truthound/multitenancy/integration.py +597 -0
  326. truthound/multitenancy/isolation.py +630 -0
  327. truthound/multitenancy/manager.py +770 -0
  328. truthound/multitenancy/middleware.py +765 -0
  329. truthound/multitenancy/quota.py +537 -0
  330. truthound/multitenancy/resolvers.py +603 -0
  331. truthound/multitenancy/storage.py +703 -0
  332. truthound/observability/__init__.py +307 -0
  333. truthound/observability/context.py +531 -0
  334. truthound/observability/instrumentation.py +611 -0
  335. truthound/observability/logging.py +887 -0
  336. truthound/observability/metrics.py +1157 -0
  337. truthound/observability/tracing/__init__.py +178 -0
  338. truthound/observability/tracing/baggage.py +310 -0
  339. truthound/observability/tracing/config.py +426 -0
  340. truthound/observability/tracing/exporter.py +787 -0
  341. truthound/observability/tracing/integration.py +1018 -0
  342. truthound/observability/tracing/otel/__init__.py +146 -0
  343. truthound/observability/tracing/otel/adapter.py +982 -0
  344. truthound/observability/tracing/otel/bridge.py +1177 -0
  345. truthound/observability/tracing/otel/compat.py +681 -0
  346. truthound/observability/tracing/otel/config.py +691 -0
  347. truthound/observability/tracing/otel/detection.py +327 -0
  348. truthound/observability/tracing/otel/protocols.py +426 -0
  349. truthound/observability/tracing/processor.py +561 -0
  350. truthound/observability/tracing/propagator.py +757 -0
  351. truthound/observability/tracing/provider.py +569 -0
  352. truthound/observability/tracing/resource.py +515 -0
  353. truthound/observability/tracing/sampler.py +487 -0
  354. truthound/observability/tracing/span.py +676 -0
  355. truthound/plugins/__init__.py +198 -0
  356. truthound/plugins/base.py +599 -0
  357. truthound/plugins/cli.py +680 -0
  358. truthound/plugins/dependencies/__init__.py +42 -0
  359. truthound/plugins/dependencies/graph.py +422 -0
  360. truthound/plugins/dependencies/resolver.py +417 -0
  361. truthound/plugins/discovery.py +379 -0
  362. truthound/plugins/docs/__init__.py +46 -0
  363. truthound/plugins/docs/extractor.py +444 -0
  364. truthound/plugins/docs/renderer.py +499 -0
  365. truthound/plugins/enterprise_manager.py +877 -0
  366. truthound/plugins/examples/__init__.py +19 -0
  367. truthound/plugins/examples/custom_validators.py +317 -0
  368. truthound/plugins/examples/slack_notifier.py +312 -0
  369. truthound/plugins/examples/xml_reporter.py +254 -0
  370. truthound/plugins/hooks.py +558 -0
  371. truthound/plugins/lifecycle/__init__.py +43 -0
  372. truthound/plugins/lifecycle/hot_reload.py +402 -0
  373. truthound/plugins/lifecycle/manager.py +371 -0
  374. truthound/plugins/manager.py +736 -0
  375. truthound/plugins/registry.py +338 -0
  376. truthound/plugins/security/__init__.py +93 -0
  377. truthound/plugins/security/exceptions.py +332 -0
  378. truthound/plugins/security/policies.py +348 -0
  379. truthound/plugins/security/protocols.py +643 -0
  380. truthound/plugins/security/sandbox/__init__.py +45 -0
  381. truthound/plugins/security/sandbox/context.py +158 -0
  382. truthound/plugins/security/sandbox/engines/__init__.py +19 -0
  383. truthound/plugins/security/sandbox/engines/container.py +379 -0
  384. truthound/plugins/security/sandbox/engines/noop.py +144 -0
  385. truthound/plugins/security/sandbox/engines/process.py +336 -0
  386. truthound/plugins/security/sandbox/factory.py +211 -0
  387. truthound/plugins/security/signing/__init__.py +57 -0
  388. truthound/plugins/security/signing/service.py +330 -0
  389. truthound/plugins/security/signing/trust_store.py +368 -0
  390. truthound/plugins/security/signing/verifier.py +459 -0
  391. truthound/plugins/versioning/__init__.py +41 -0
  392. truthound/plugins/versioning/constraints.py +297 -0
  393. truthound/plugins/versioning/resolver.py +329 -0
  394. truthound/profiler/__init__.py +1729 -0
  395. truthound/profiler/_lazy.py +452 -0
  396. truthound/profiler/ab_testing/__init__.py +80 -0
  397. truthound/profiler/ab_testing/analysis.py +449 -0
  398. truthound/profiler/ab_testing/base.py +257 -0
  399. truthound/profiler/ab_testing/experiment.py +395 -0
  400. truthound/profiler/ab_testing/tracking.py +368 -0
  401. truthound/profiler/auto_threshold.py +1170 -0
  402. truthound/profiler/base.py +579 -0
  403. truthound/profiler/cache_patterns.py +911 -0
  404. truthound/profiler/caching.py +1303 -0
  405. truthound/profiler/column_profiler.py +712 -0
  406. truthound/profiler/comparison.py +1007 -0
  407. truthound/profiler/custom_patterns.py +1170 -0
  408. truthound/profiler/dashboard/__init__.py +50 -0
  409. truthound/profiler/dashboard/app.py +476 -0
  410. truthound/profiler/dashboard/components.py +457 -0
  411. truthound/profiler/dashboard/config.py +72 -0
  412. truthound/profiler/distributed/__init__.py +83 -0
  413. truthound/profiler/distributed/base.py +281 -0
  414. truthound/profiler/distributed/dask_backend.py +498 -0
  415. truthound/profiler/distributed/local_backend.py +293 -0
  416. truthound/profiler/distributed/profiler.py +304 -0
  417. truthound/profiler/distributed/ray_backend.py +374 -0
  418. truthound/profiler/distributed/spark_backend.py +375 -0
  419. truthound/profiler/distributed.py +1366 -0
  420. truthound/profiler/enterprise_sampling.py +1065 -0
  421. truthound/profiler/errors.py +488 -0
  422. truthound/profiler/evolution/__init__.py +91 -0
  423. truthound/profiler/evolution/alerts.py +426 -0
  424. truthound/profiler/evolution/changes.py +206 -0
  425. truthound/profiler/evolution/compatibility.py +365 -0
  426. truthound/profiler/evolution/detector.py +372 -0
  427. truthound/profiler/evolution/protocols.py +121 -0
  428. truthound/profiler/generators/__init__.py +48 -0
  429. truthound/profiler/generators/base.py +384 -0
  430. truthound/profiler/generators/ml_rules.py +375 -0
  431. truthound/profiler/generators/pattern_rules.py +384 -0
  432. truthound/profiler/generators/schema_rules.py +267 -0
  433. truthound/profiler/generators/stats_rules.py +324 -0
  434. truthound/profiler/generators/suite_generator.py +857 -0
  435. truthound/profiler/i18n.py +1542 -0
  436. truthound/profiler/incremental.py +554 -0
  437. truthound/profiler/incremental_validation.py +1710 -0
  438. truthound/profiler/integration/__init__.py +73 -0
  439. truthound/profiler/integration/adapters.py +345 -0
  440. truthound/profiler/integration/context.py +371 -0
  441. truthound/profiler/integration/executor.py +527 -0
  442. truthound/profiler/integration/naming.py +75 -0
  443. truthound/profiler/integration/protocols.py +243 -0
  444. truthound/profiler/memory.py +1185 -0
  445. truthound/profiler/migration/__init__.py +60 -0
  446. truthound/profiler/migration/base.py +345 -0
  447. truthound/profiler/migration/manager.py +444 -0
  448. truthound/profiler/migration/v1_0_to_v1_1.py +484 -0
  449. truthound/profiler/ml/__init__.py +73 -0
  450. truthound/profiler/ml/base.py +244 -0
  451. truthound/profiler/ml/classifier.py +507 -0
  452. truthound/profiler/ml/feature_extraction.py +604 -0
  453. truthound/profiler/ml/pretrained.py +448 -0
  454. truthound/profiler/ml_inference.py +1276 -0
  455. truthound/profiler/native_patterns.py +815 -0
  456. truthound/profiler/observability.py +1184 -0
  457. truthound/profiler/process_timeout.py +1566 -0
  458. truthound/profiler/progress.py +568 -0
  459. truthound/profiler/progress_callbacks.py +1734 -0
  460. truthound/profiler/quality.py +1345 -0
  461. truthound/profiler/resilience.py +1180 -0
  462. truthound/profiler/sampled_matcher.py +794 -0
  463. truthound/profiler/sampling.py +1288 -0
  464. truthound/profiler/scheduling/__init__.py +82 -0
  465. truthound/profiler/scheduling/protocols.py +214 -0
  466. truthound/profiler/scheduling/scheduler.py +474 -0
  467. truthound/profiler/scheduling/storage.py +457 -0
  468. truthound/profiler/scheduling/triggers.py +449 -0
  469. truthound/profiler/schema.py +603 -0
  470. truthound/profiler/streaming.py +685 -0
  471. truthound/profiler/streaming_patterns.py +1354 -0
  472. truthound/profiler/suite_cli.py +625 -0
  473. truthound/profiler/suite_config.py +789 -0
  474. truthound/profiler/suite_export.py +1268 -0
  475. truthound/profiler/table_profiler.py +547 -0
  476. truthound/profiler/timeout.py +565 -0
  477. truthound/profiler/validation.py +1532 -0
  478. truthound/profiler/visualization/__init__.py +118 -0
  479. truthound/profiler/visualization/base.py +346 -0
  480. truthound/profiler/visualization/generator.py +1259 -0
  481. truthound/profiler/visualization/plotly_renderer.py +811 -0
  482. truthound/profiler/visualization/renderers.py +669 -0
  483. truthound/profiler/visualization/sections.py +540 -0
  484. truthound/profiler/visualization.py +2122 -0
  485. truthound/profiler/yaml_validation.py +1151 -0
  486. truthound/py.typed +0 -0
  487. truthound/ratelimit/__init__.py +248 -0
  488. truthound/ratelimit/algorithms.py +1108 -0
  489. truthound/ratelimit/core.py +573 -0
  490. truthound/ratelimit/integration.py +532 -0
  491. truthound/ratelimit/limiter.py +663 -0
  492. truthound/ratelimit/middleware.py +700 -0
  493. truthound/ratelimit/policy.py +792 -0
  494. truthound/ratelimit/storage.py +763 -0
  495. truthound/rbac/__init__.py +340 -0
  496. truthound/rbac/core.py +976 -0
  497. truthound/rbac/integration.py +760 -0
  498. truthound/rbac/manager.py +1052 -0
  499. truthound/rbac/middleware.py +842 -0
  500. truthound/rbac/policy.py +954 -0
  501. truthound/rbac/storage.py +878 -0
  502. truthound/realtime/__init__.py +141 -0
  503. truthound/realtime/adapters/__init__.py +43 -0
  504. truthound/realtime/adapters/base.py +533 -0
  505. truthound/realtime/adapters/kafka.py +487 -0
  506. truthound/realtime/adapters/kinesis.py +479 -0
  507. truthound/realtime/adapters/mock.py +243 -0
  508. truthound/realtime/base.py +553 -0
  509. truthound/realtime/factory.py +382 -0
  510. truthound/realtime/incremental.py +660 -0
  511. truthound/realtime/processing/__init__.py +67 -0
  512. truthound/realtime/processing/exactly_once.py +575 -0
  513. truthound/realtime/processing/state.py +547 -0
  514. truthound/realtime/processing/windows.py +647 -0
  515. truthound/realtime/protocols.py +569 -0
  516. truthound/realtime/streaming.py +605 -0
  517. truthound/realtime/testing/__init__.py +32 -0
  518. truthound/realtime/testing/containers.py +615 -0
  519. truthound/realtime/testing/fixtures.py +484 -0
  520. truthound/report.py +280 -0
  521. truthound/reporters/__init__.py +46 -0
  522. truthound/reporters/_protocols.py +30 -0
  523. truthound/reporters/base.py +324 -0
  524. truthound/reporters/ci/__init__.py +66 -0
  525. truthound/reporters/ci/azure.py +436 -0
  526. truthound/reporters/ci/base.py +509 -0
  527. truthound/reporters/ci/bitbucket.py +567 -0
  528. truthound/reporters/ci/circleci.py +547 -0
  529. truthound/reporters/ci/detection.py +364 -0
  530. truthound/reporters/ci/factory.py +182 -0
  531. truthound/reporters/ci/github.py +388 -0
  532. truthound/reporters/ci/gitlab.py +471 -0
  533. truthound/reporters/ci/jenkins.py +525 -0
  534. truthound/reporters/console_reporter.py +299 -0
  535. truthound/reporters/factory.py +211 -0
  536. truthound/reporters/html_reporter.py +524 -0
  537. truthound/reporters/json_reporter.py +256 -0
  538. truthound/reporters/markdown_reporter.py +280 -0
  539. truthound/reporters/sdk/__init__.py +174 -0
  540. truthound/reporters/sdk/builder.py +558 -0
  541. truthound/reporters/sdk/mixins.py +1150 -0
  542. truthound/reporters/sdk/schema.py +1493 -0
  543. truthound/reporters/sdk/templates.py +666 -0
  544. truthound/reporters/sdk/testing.py +968 -0
  545. truthound/scanners.py +170 -0
  546. truthound/scheduling/__init__.py +122 -0
  547. truthound/scheduling/cron.py +1136 -0
  548. truthound/scheduling/presets.py +212 -0
  549. truthound/schema.py +275 -0
  550. truthound/secrets/__init__.py +173 -0
  551. truthound/secrets/base.py +618 -0
  552. truthound/secrets/cloud.py +682 -0
  553. truthound/secrets/integration.py +507 -0
  554. truthound/secrets/manager.py +633 -0
  555. truthound/secrets/oidc/__init__.py +172 -0
  556. truthound/secrets/oidc/base.py +902 -0
  557. truthound/secrets/oidc/credential_provider.py +623 -0
  558. truthound/secrets/oidc/exchangers.py +1001 -0
  559. truthound/secrets/oidc/github/__init__.py +110 -0
  560. truthound/secrets/oidc/github/claims.py +718 -0
  561. truthound/secrets/oidc/github/enhanced_provider.py +693 -0
  562. truthound/secrets/oidc/github/trust_policy.py +742 -0
  563. truthound/secrets/oidc/github/verification.py +723 -0
  564. truthound/secrets/oidc/github/workflow.py +691 -0
  565. truthound/secrets/oidc/providers.py +825 -0
  566. truthound/secrets/providers.py +506 -0
  567. truthound/secrets/resolver.py +495 -0
  568. truthound/stores/__init__.py +177 -0
  569. truthound/stores/backends/__init__.py +18 -0
  570. truthound/stores/backends/_protocols.py +340 -0
  571. truthound/stores/backends/azure_blob.py +530 -0
  572. truthound/stores/backends/concurrent_filesystem.py +915 -0
  573. truthound/stores/backends/connection_pool.py +1365 -0
  574. truthound/stores/backends/database.py +743 -0
  575. truthound/stores/backends/filesystem.py +538 -0
  576. truthound/stores/backends/gcs.py +399 -0
  577. truthound/stores/backends/memory.py +354 -0
  578. truthound/stores/backends/s3.py +434 -0
  579. truthound/stores/backpressure/__init__.py +84 -0
  580. truthound/stores/backpressure/base.py +375 -0
  581. truthound/stores/backpressure/circuit_breaker.py +434 -0
  582. truthound/stores/backpressure/monitor.py +376 -0
  583. truthound/stores/backpressure/strategies.py +677 -0
  584. truthound/stores/base.py +551 -0
  585. truthound/stores/batching/__init__.py +65 -0
  586. truthound/stores/batching/base.py +305 -0
  587. truthound/stores/batching/buffer.py +370 -0
  588. truthound/stores/batching/store.py +248 -0
  589. truthound/stores/batching/writer.py +521 -0
  590. truthound/stores/caching/__init__.py +60 -0
  591. truthound/stores/caching/backends.py +684 -0
  592. truthound/stores/caching/base.py +356 -0
  593. truthound/stores/caching/store.py +305 -0
  594. truthound/stores/compression/__init__.py +193 -0
  595. truthound/stores/compression/adaptive.py +694 -0
  596. truthound/stores/compression/base.py +514 -0
  597. truthound/stores/compression/pipeline.py +868 -0
  598. truthound/stores/compression/providers.py +672 -0
  599. truthound/stores/compression/streaming.py +832 -0
  600. truthound/stores/concurrency/__init__.py +81 -0
  601. truthound/stores/concurrency/atomic.py +556 -0
  602. truthound/stores/concurrency/index.py +775 -0
  603. truthound/stores/concurrency/locks.py +576 -0
  604. truthound/stores/concurrency/manager.py +482 -0
  605. truthound/stores/encryption/__init__.py +297 -0
  606. truthound/stores/encryption/base.py +952 -0
  607. truthound/stores/encryption/keys.py +1191 -0
  608. truthound/stores/encryption/pipeline.py +903 -0
  609. truthound/stores/encryption/providers.py +953 -0
  610. truthound/stores/encryption/streaming.py +950 -0
  611. truthound/stores/expectations.py +227 -0
  612. truthound/stores/factory.py +246 -0
  613. truthound/stores/migration/__init__.py +75 -0
  614. truthound/stores/migration/base.py +480 -0
  615. truthound/stores/migration/manager.py +347 -0
  616. truthound/stores/migration/registry.py +382 -0
  617. truthound/stores/migration/store.py +559 -0
  618. truthound/stores/observability/__init__.py +106 -0
  619. truthound/stores/observability/audit.py +718 -0
  620. truthound/stores/observability/config.py +270 -0
  621. truthound/stores/observability/factory.py +208 -0
  622. truthound/stores/observability/metrics.py +636 -0
  623. truthound/stores/observability/protocols.py +410 -0
  624. truthound/stores/observability/store.py +570 -0
  625. truthound/stores/observability/tracing.py +784 -0
  626. truthound/stores/replication/__init__.py +76 -0
  627. truthound/stores/replication/base.py +260 -0
  628. truthound/stores/replication/monitor.py +269 -0
  629. truthound/stores/replication/store.py +439 -0
  630. truthound/stores/replication/syncer.py +391 -0
  631. truthound/stores/results.py +359 -0
  632. truthound/stores/retention/__init__.py +77 -0
  633. truthound/stores/retention/base.py +378 -0
  634. truthound/stores/retention/policies.py +621 -0
  635. truthound/stores/retention/scheduler.py +279 -0
  636. truthound/stores/retention/store.py +526 -0
  637. truthound/stores/streaming/__init__.py +138 -0
  638. truthound/stores/streaming/base.py +801 -0
  639. truthound/stores/streaming/database.py +984 -0
  640. truthound/stores/streaming/filesystem.py +719 -0
  641. truthound/stores/streaming/reader.py +629 -0
  642. truthound/stores/streaming/s3.py +843 -0
  643. truthound/stores/streaming/writer.py +790 -0
  644. truthound/stores/tiering/__init__.py +108 -0
  645. truthound/stores/tiering/base.py +462 -0
  646. truthound/stores/tiering/manager.py +249 -0
  647. truthound/stores/tiering/policies.py +692 -0
  648. truthound/stores/tiering/store.py +526 -0
  649. truthound/stores/versioning/__init__.py +56 -0
  650. truthound/stores/versioning/base.py +376 -0
  651. truthound/stores/versioning/store.py +660 -0
  652. truthound/stores/versioning/strategies.py +353 -0
  653. truthound/types.py +56 -0
  654. truthound/validators/__init__.py +774 -0
  655. truthound/validators/aggregate/__init__.py +27 -0
  656. truthound/validators/aggregate/central.py +116 -0
  657. truthound/validators/aggregate/extremes.py +116 -0
  658. truthound/validators/aggregate/spread.py +118 -0
  659. truthound/validators/aggregate/sum.py +64 -0
  660. truthound/validators/aggregate/type.py +78 -0
  661. truthound/validators/anomaly/__init__.py +93 -0
  662. truthound/validators/anomaly/base.py +431 -0
  663. truthound/validators/anomaly/ml_based.py +1190 -0
  664. truthound/validators/anomaly/multivariate.py +647 -0
  665. truthound/validators/anomaly/statistical.py +599 -0
  666. truthound/validators/base.py +1089 -0
  667. truthound/validators/business_rule/__init__.py +46 -0
  668. truthound/validators/business_rule/base.py +147 -0
  669. truthound/validators/business_rule/checksum.py +509 -0
  670. truthound/validators/business_rule/financial.py +526 -0
  671. truthound/validators/cache.py +733 -0
  672. truthound/validators/completeness/__init__.py +39 -0
  673. truthound/validators/completeness/conditional.py +73 -0
  674. truthound/validators/completeness/default.py +98 -0
  675. truthound/validators/completeness/empty.py +103 -0
  676. truthound/validators/completeness/nan.py +337 -0
  677. truthound/validators/completeness/null.py +152 -0
  678. truthound/validators/cross_table/__init__.py +17 -0
  679. truthound/validators/cross_table/aggregate.py +333 -0
  680. truthound/validators/cross_table/row_count.py +122 -0
  681. truthound/validators/datetime/__init__.py +29 -0
  682. truthound/validators/datetime/format.py +78 -0
  683. truthound/validators/datetime/freshness.py +269 -0
  684. truthound/validators/datetime/order.py +73 -0
  685. truthound/validators/datetime/parseable.py +185 -0
  686. truthound/validators/datetime/range.py +202 -0
  687. truthound/validators/datetime/timezone.py +69 -0
  688. truthound/validators/distribution/__init__.py +49 -0
  689. truthound/validators/distribution/distribution.py +128 -0
  690. truthound/validators/distribution/monotonic.py +119 -0
  691. truthound/validators/distribution/outlier.py +178 -0
  692. truthound/validators/distribution/quantile.py +80 -0
  693. truthound/validators/distribution/range.py +254 -0
  694. truthound/validators/distribution/set.py +125 -0
  695. truthound/validators/distribution/statistical.py +459 -0
  696. truthound/validators/drift/__init__.py +79 -0
  697. truthound/validators/drift/base.py +427 -0
  698. truthound/validators/drift/multi_feature.py +401 -0
  699. truthound/validators/drift/numeric.py +395 -0
  700. truthound/validators/drift/psi.py +446 -0
  701. truthound/validators/drift/statistical.py +510 -0
  702. truthound/validators/enterprise.py +1658 -0
  703. truthound/validators/geospatial/__init__.py +80 -0
  704. truthound/validators/geospatial/base.py +97 -0
  705. truthound/validators/geospatial/boundary.py +238 -0
  706. truthound/validators/geospatial/coordinate.py +351 -0
  707. truthound/validators/geospatial/distance.py +399 -0
  708. truthound/validators/geospatial/polygon.py +665 -0
  709. truthound/validators/i18n/__init__.py +308 -0
  710. truthound/validators/i18n/bidi.py +571 -0
  711. truthound/validators/i18n/catalogs.py +570 -0
  712. truthound/validators/i18n/dialects.py +763 -0
  713. truthound/validators/i18n/extended_catalogs.py +549 -0
  714. truthound/validators/i18n/formatting.py +1434 -0
  715. truthound/validators/i18n/loader.py +1020 -0
  716. truthound/validators/i18n/messages.py +521 -0
  717. truthound/validators/i18n/plural.py +683 -0
  718. truthound/validators/i18n/protocols.py +855 -0
  719. truthound/validators/i18n/tms.py +1162 -0
  720. truthound/validators/localization/__init__.py +53 -0
  721. truthound/validators/localization/base.py +122 -0
  722. truthound/validators/localization/chinese.py +362 -0
  723. truthound/validators/localization/japanese.py +275 -0
  724. truthound/validators/localization/korean.py +524 -0
  725. truthound/validators/memory/__init__.py +94 -0
  726. truthound/validators/memory/approximate_knn.py +506 -0
  727. truthound/validators/memory/base.py +547 -0
  728. truthound/validators/memory/sgd_online.py +719 -0
  729. truthound/validators/memory/streaming_ecdf.py +753 -0
  730. truthound/validators/ml_feature/__init__.py +54 -0
  731. truthound/validators/ml_feature/base.py +249 -0
  732. truthound/validators/ml_feature/correlation.py +299 -0
  733. truthound/validators/ml_feature/leakage.py +344 -0
  734. truthound/validators/ml_feature/null_impact.py +270 -0
  735. truthound/validators/ml_feature/scale.py +264 -0
  736. truthound/validators/multi_column/__init__.py +89 -0
  737. truthound/validators/multi_column/arithmetic.py +284 -0
  738. truthound/validators/multi_column/base.py +231 -0
  739. truthound/validators/multi_column/comparison.py +273 -0
  740. truthound/validators/multi_column/consistency.py +312 -0
  741. truthound/validators/multi_column/statistical.py +299 -0
  742. truthound/validators/optimization/__init__.py +164 -0
  743. truthound/validators/optimization/aggregation.py +563 -0
  744. truthound/validators/optimization/covariance.py +556 -0
  745. truthound/validators/optimization/geo.py +626 -0
  746. truthound/validators/optimization/graph.py +587 -0
  747. truthound/validators/optimization/orchestrator.py +970 -0
  748. truthound/validators/optimization/profiling.py +1312 -0
  749. truthound/validators/privacy/__init__.py +223 -0
  750. truthound/validators/privacy/base.py +635 -0
  751. truthound/validators/privacy/ccpa.py +670 -0
  752. truthound/validators/privacy/gdpr.py +728 -0
  753. truthound/validators/privacy/global_patterns.py +604 -0
  754. truthound/validators/privacy/plugins.py +867 -0
  755. truthound/validators/profiling/__init__.py +52 -0
  756. truthound/validators/profiling/base.py +175 -0
  757. truthound/validators/profiling/cardinality.py +312 -0
  758. truthound/validators/profiling/entropy.py +391 -0
  759. truthound/validators/profiling/frequency.py +455 -0
  760. truthound/validators/pushdown_support.py +660 -0
  761. truthound/validators/query/__init__.py +91 -0
  762. truthound/validators/query/aggregate.py +346 -0
  763. truthound/validators/query/base.py +246 -0
  764. truthound/validators/query/column.py +249 -0
  765. truthound/validators/query/expression.py +274 -0
  766. truthound/validators/query/result.py +323 -0
  767. truthound/validators/query/row_count.py +264 -0
  768. truthound/validators/referential/__init__.py +80 -0
  769. truthound/validators/referential/base.py +395 -0
  770. truthound/validators/referential/cascade.py +391 -0
  771. truthound/validators/referential/circular.py +563 -0
  772. truthound/validators/referential/foreign_key.py +624 -0
  773. truthound/validators/referential/orphan.py +485 -0
  774. truthound/validators/registry.py +112 -0
  775. truthound/validators/schema/__init__.py +41 -0
  776. truthound/validators/schema/column_count.py +142 -0
  777. truthound/validators/schema/column_exists.py +80 -0
  778. truthound/validators/schema/column_order.py +82 -0
  779. truthound/validators/schema/column_pair.py +85 -0
  780. truthound/validators/schema/column_pair_set.py +195 -0
  781. truthound/validators/schema/column_type.py +94 -0
  782. truthound/validators/schema/multi_column.py +53 -0
  783. truthound/validators/schema/multi_column_aggregate.py +175 -0
  784. truthound/validators/schema/referential.py +274 -0
  785. truthound/validators/schema/table_schema.py +91 -0
  786. truthound/validators/schema_validator.py +219 -0
  787. truthound/validators/sdk/__init__.py +250 -0
  788. truthound/validators/sdk/builder.py +680 -0
  789. truthound/validators/sdk/decorators.py +474 -0
  790. truthound/validators/sdk/enterprise/__init__.py +211 -0
  791. truthound/validators/sdk/enterprise/docs.py +725 -0
  792. truthound/validators/sdk/enterprise/fuzzing.py +659 -0
  793. truthound/validators/sdk/enterprise/licensing.py +709 -0
  794. truthound/validators/sdk/enterprise/manager.py +543 -0
  795. truthound/validators/sdk/enterprise/resources.py +628 -0
  796. truthound/validators/sdk/enterprise/sandbox.py +766 -0
  797. truthound/validators/sdk/enterprise/signing.py +603 -0
  798. truthound/validators/sdk/enterprise/templates.py +865 -0
  799. truthound/validators/sdk/enterprise/versioning.py +659 -0
  800. truthound/validators/sdk/templates.py +757 -0
  801. truthound/validators/sdk/testing.py +807 -0
  802. truthound/validators/security/__init__.py +181 -0
  803. truthound/validators/security/redos/__init__.py +182 -0
  804. truthound/validators/security/redos/core.py +861 -0
  805. truthound/validators/security/redos/cpu_monitor.py +593 -0
  806. truthound/validators/security/redos/cve_database.py +791 -0
  807. truthound/validators/security/redos/ml/__init__.py +155 -0
  808. truthound/validators/security/redos/ml/base.py +785 -0
  809. truthound/validators/security/redos/ml/datasets.py +618 -0
  810. truthound/validators/security/redos/ml/features.py +359 -0
  811. truthound/validators/security/redos/ml/models.py +1000 -0
  812. truthound/validators/security/redos/ml/predictor.py +507 -0
  813. truthound/validators/security/redos/ml/storage.py +632 -0
  814. truthound/validators/security/redos/ml/training.py +571 -0
  815. truthound/validators/security/redos/ml_analyzer.py +937 -0
  816. truthound/validators/security/redos/optimizer.py +674 -0
  817. truthound/validators/security/redos/profiler.py +682 -0
  818. truthound/validators/security/redos/re2_engine.py +709 -0
  819. truthound/validators/security/redos.py +886 -0
  820. truthound/validators/security/sql_security.py +1247 -0
  821. truthound/validators/streaming/__init__.py +126 -0
  822. truthound/validators/streaming/base.py +292 -0
  823. truthound/validators/streaming/completeness.py +210 -0
  824. truthound/validators/streaming/mixin.py +575 -0
  825. truthound/validators/streaming/range.py +308 -0
  826. truthound/validators/streaming/sources.py +846 -0
  827. truthound/validators/string/__init__.py +57 -0
  828. truthound/validators/string/casing.py +158 -0
  829. truthound/validators/string/charset.py +96 -0
  830. truthound/validators/string/format.py +501 -0
  831. truthound/validators/string/json.py +77 -0
  832. truthound/validators/string/json_schema.py +184 -0
  833. truthound/validators/string/length.py +104 -0
  834. truthound/validators/string/like_pattern.py +237 -0
  835. truthound/validators/string/regex.py +202 -0
  836. truthound/validators/string/regex_extended.py +435 -0
  837. truthound/validators/table/__init__.py +88 -0
  838. truthound/validators/table/base.py +78 -0
  839. truthound/validators/table/column_count.py +198 -0
  840. truthound/validators/table/freshness.py +362 -0
  841. truthound/validators/table/row_count.py +251 -0
  842. truthound/validators/table/schema.py +333 -0
  843. truthound/validators/table/size.py +285 -0
  844. truthound/validators/timeout/__init__.py +102 -0
  845. truthound/validators/timeout/advanced/__init__.py +247 -0
  846. truthound/validators/timeout/advanced/circuit_breaker.py +675 -0
  847. truthound/validators/timeout/advanced/prediction.py +773 -0
  848. truthound/validators/timeout/advanced/priority.py +618 -0
  849. truthound/validators/timeout/advanced/redis_backend.py +770 -0
  850. truthound/validators/timeout/advanced/retry.py +721 -0
  851. truthound/validators/timeout/advanced/sampling.py +788 -0
  852. truthound/validators/timeout/advanced/sla.py +661 -0
  853. truthound/validators/timeout/advanced/telemetry.py +804 -0
  854. truthound/validators/timeout/cascade.py +477 -0
  855. truthound/validators/timeout/deadline.py +657 -0
  856. truthound/validators/timeout/degradation.py +525 -0
  857. truthound/validators/timeout/distributed.py +597 -0
  858. truthound/validators/timeseries/__init__.py +89 -0
  859. truthound/validators/timeseries/base.py +326 -0
  860. truthound/validators/timeseries/completeness.py +617 -0
  861. truthound/validators/timeseries/gap.py +485 -0
  862. truthound/validators/timeseries/monotonic.py +310 -0
  863. truthound/validators/timeseries/seasonality.py +422 -0
  864. truthound/validators/timeseries/trend.py +510 -0
  865. truthound/validators/uniqueness/__init__.py +59 -0
  866. truthound/validators/uniqueness/approximate.py +475 -0
  867. truthound/validators/uniqueness/distinct_values.py +253 -0
  868. truthound/validators/uniqueness/duplicate.py +118 -0
  869. truthound/validators/uniqueness/primary_key.py +140 -0
  870. truthound/validators/uniqueness/unique.py +191 -0
  871. truthound/validators/uniqueness/within_record.py +599 -0
  872. truthound/validators/utils.py +756 -0
  873. truthound-1.0.8.dist-info/METADATA +474 -0
  874. truthound-1.0.8.dist-info/RECORD +877 -0
  875. truthound-1.0.8.dist-info/WHEEL +4 -0
  876. truthound-1.0.8.dist-info/entry_points.txt +2 -0
  877. truthound-1.0.8.dist-info/licenses/LICENSE +190 -0
@@ -0,0 +1,815 @@
1
+ """Native Polars pattern matching for high-performance profiling.
2
+
3
+ This module provides a Polars-native pattern matching engine that
4
+ achieves 10-50x performance improvement over Python regex loops.
5
+
6
+ Key features:
7
+ - Vectorized regex matching using Polars' Rust-based engine
8
+ - Lazy evaluation for memory efficiency
9
+ - Extensible pattern registry
10
+ - Caching for repeated pattern matches
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import re
16
+ from abc import ABC, abstractmethod
17
+ from dataclasses import dataclass, field
18
+ from enum import Enum
19
+ from functools import cached_property
20
+ from typing import Any, Callable, ClassVar, Iterator, Sequence
21
+
22
+ import polars as pl
23
+
24
+ from truthound.profiler.base import DataType, PatternMatch
25
+
26
+
27
+ # =============================================================================
28
+ # Pattern Definition System
29
+ # =============================================================================
30
+
31
+
32
class PatternPriority(int, Enum):
    """Ranking that controls the order in which patterns are tried.

    Members are integers; a larger value means the pattern is checked
    earlier.
    """

    # Country-specific formats (Korean RRN, phone, business number)
    HIGHEST = 100
    # Well-defined formats (UUID, email)
    HIGH = 80
    # Common formats (IP addresses, dates)
    MEDIUM = 60
    # Generic formats (phone)
    LOW = 40
    # Fallback patterns
    LOWEST = 20
40
+
41
+
42
@dataclass(frozen=True)
class PatternSpec:
    """Specification for a pattern to detect.

    This is the immutable definition of a pattern. Use the
    PatternBuilder for convenient construction.

    Attributes:
        name: Unique identifier for the pattern
        regex: Regular expression string
        data_type: Semantic data type this pattern represents
        priority: Matching priority (higher = checked first)
        description: Human-readable description
        examples: Sample values matching this pattern
        polars_compatible: Whether regex works with Polars' engine

    Raises:
        ValueError: If ``regex`` cannot be compiled by Python's ``re``.
    """

    name: str
    regex: str
    data_type: DataType
    priority: int = PatternPriority.MEDIUM
    description: str = ""
    examples: tuple[str, ...] = field(default_factory=tuple)
    polars_compatible: bool = True

    def __post_init__(self) -> None:
        """Validate regex is compilable."""
        try:
            re.compile(self.regex)
        except re.error as e:
            # Chain the original error so the regex position info is kept.
            raise ValueError(
                f"Invalid regex for pattern '{self.name}': {e}"
            ) from e

    @cached_property
    def compiled_regex(self) -> re.Pattern[str]:
        """Get compiled Python regex (for fallback)."""
        return re.compile(self.regex)

    def _anchored_regex(self) -> str:
        """Return ``regex`` constrained to match the entire value.

        A pattern that already starts with ``^`` and ends with an
        unescaped ``$`` is returned unchanged. Anything else is wrapped
        as ``^(?:...)$``.
        """
        pattern = self.regex
        anchored_start = pattern.startswith("^")
        anchored_end = False
        if pattern.endswith("$"):
            head = pattern[:-1]
            # An odd run of backslashes before "$" escapes it, making it a
            # literal dollar sign rather than an end anchor.
            escapes = len(head) - len(head.rstrip("\\"))
            anchored_end = escapes % 2 == 0
        if anchored_start and anchored_end:
            return pattern
        # Group before anchoring so "^"/"$" bind to the whole pattern and
        # not just the first/last branch of a top-level alternation.
        return f"^(?:{pattern})$"

    def to_polars_expr(self, column: str) -> pl.Expr:
        """Create a Polars expression for matching this pattern.

        Args:
            column: Name of the column to test.

        Returns:
            Expression that evaluates to True for matching values
        """
        return pl.col(column).str.contains(self._anchored_regex())
92
+
93
+
94
class PatternBuilder:
    """Chainable helper that assembles a PatternSpec step by step.

    Every setter returns the builder itself so calls can be chained;
    ``build()`` produces the final immutable spec.

    Example:
        pattern = (
            PatternBuilder("email")
            .regex(r"[a-z]+@[a-z]+\\.[a-z]{2,}")
            .data_type(DataType.EMAIL)
            .priority(PatternPriority.HIGH)
            .description("Email address format")
            .examples("user@example.com", "test@domain.org")
            .build()
        )
    """

    def __init__(self, name: str):
        self._name = name
        # Defaults used for anything the caller does not set explicitly.
        self._regex: str = ""
        self._data_type: DataType = DataType.UNKNOWN
        self._priority: int = PatternPriority.MEDIUM
        self._description: str = ""
        self._examples: list[str] = []
        self._polars_compatible: bool = True

    def regex(self, pattern: str) -> PatternBuilder:
        """Set the regular expression.

        ``^``/``$`` anchors may be omitted; PatternSpec.to_polars_expr
        adds the ones that are missing.
        """
        self._regex = pattern
        return self

    def data_type(self, dtype: DataType) -> PatternBuilder:
        """Set the semantic data type the pattern stands for."""
        self._data_type = dtype
        return self

    def priority(self, p: int | PatternPriority) -> PatternBuilder:
        """Set the matching priority; the value is stored as a plain int."""
        self._priority = int(p)
        return self

    def description(self, desc: str) -> PatternBuilder:
        """Set the human-readable description."""
        self._description = desc
        return self

    def examples(self, *values: str) -> PatternBuilder:
        """Append example values; repeated calls accumulate."""
        self._examples += values
        return self

    def polars_compatible(self, compatible: bool) -> PatternBuilder:
        """Record whether the regex is usable with Polars' engine."""
        self._polars_compatible = compatible
        return self

    def build(self) -> PatternSpec:
        """Create the immutable PatternSpec.

        Raises:
            ValueError: If no regex has been set.
        """
        if self._regex:
            return PatternSpec(
                name=self._name,
                regex=self._regex,
                data_type=self._data_type,
                priority=self._priority,
                description=self._description,
                examples=tuple(self._examples),
                polars_compatible=self._polars_compatible,
            )
        raise ValueError(f"Pattern '{self._name}' requires a regex")
161
+
162
+
163
+ # =============================================================================
164
+ # Pattern Registry
165
+ # =============================================================================
166
+
167
+
168
class PatternRegistry:
    """Name-keyed collection of PatternSpec objects.

    Iterating the registry yields patterns from highest to lowest
    priority, with ties broken by name. The sorted view is cached and
    rebuilt after the next registration or removal.

    Example:
        registry = PatternRegistry()
        registry.register(email_pattern)
        registry.register(uuid_pattern)

        # Iterate in priority order
        for pattern in registry:
            print(pattern.name)

        # Get by name
        email = registry.get("email")
    """

    def __init__(self) -> None:
        # name -> spec
        self._patterns: dict[str, PatternSpec] = {}
        # Cached priority-sorted list; None means it must be rebuilt.
        self._ordered: list[PatternSpec] | None = None

    def register(self, pattern: PatternSpec) -> None:
        """Add a pattern, replacing any existing one with the same name."""
        self._patterns[pattern.name] = pattern
        self._ordered = None  # sorted view is now stale

    def unregister(self, name: str) -> bool:
        """Remove the pattern called ``name``.

        Returns:
            True if a pattern was removed, False if none had that name.
        """
        try:
            del self._patterns[name]
        except KeyError:
            return False
        self._ordered = None
        return True

    def get(self, name: str) -> PatternSpec | None:
        """Return the pattern called ``name``, or None if absent."""
        return self._patterns.get(name)

    def has(self, name: str) -> bool:
        """Tell whether a pattern called ``name`` is registered."""
        return name in self._patterns

    def __iter__(self) -> Iterator[PatternSpec]:
        """Iterate patterns by descending priority, then by name."""
        ordered = self._ordered
        if ordered is None:
            ordered = sorted(
                self._patterns.values(),
                key=lambda spec: (-spec.priority, spec.name),
            )
            self._ordered = ordered
        return iter(ordered)

    def __len__(self) -> int:
        return len(self._patterns)

    def by_data_type(self, dtype: DataType) -> list[PatternSpec]:
        """Return every registered pattern whose data type equals ``dtype``."""
        return [spec for spec in self._patterns.values() if spec.data_type == dtype]

    def clone(self) -> "PatternRegistry":
        """Return a new registry holding the same patterns.

        The name mapping is copied, so later changes to either registry
        do not affect the other.
        """
        duplicate = PatternRegistry()
        duplicate._patterns = self._patterns.copy()
        return duplicate
236
+
237
+
238
+ # =============================================================================
239
+ # Built-in Patterns
240
+ # =============================================================================
241
+
242
+
243
def _create_builtin_patterns() -> PatternRegistry:
    """Build a registry pre-populated with the built-in patterns.

    Each row below is (name, regex, data type, priority, description,
    examples). Rows are registered in the order listed.
    """
    builtin_rows = (
        # Korean specific patterns (highest priority)
        (
            "korean_rrn",
            r"\d{6}-[1-4]\d{6}",
            DataType.KOREAN_RRN,
            PatternPriority.HIGHEST,
            "Korean Resident Registration Number",
            ("900101-1234567", "851231-2345678"),
        ),
        (
            "korean_phone",
            r"01[0-9]-\d{3,4}-\d{4}",
            DataType.KOREAN_PHONE,
            PatternPriority.HIGHEST,
            "Korean mobile phone number",
            ("010-1234-5678", "011-123-4567"),
        ),
        (
            "korean_business_number",
            r"\d{3}-\d{2}-\d{5}",
            DataType.KOREAN_BUSINESS_NUMBER,
            PatternPriority.HIGHEST,
            "Korean business registration number",
            ("123-45-67890", "987-65-43210"),
        ),
        # UUID (very specific format)
        (
            "uuid",
            r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}",
            DataType.UUID,
            PatternPriority.HIGH,
            "UUID/GUID format",
            ("550e8400-e29b-41d4-a716-446655440000",),
        ),
        # Email
        (
            "email",
            r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}",
            DataType.EMAIL,
            PatternPriority.HIGH,
            "Email address",
            ("user@example.com", "name.surname@domain.co.uk"),
        ),
        # URL
        (
            "url",
            r"https?://[^\s/$.?#][^\s]*",
            DataType.URL,
            PatternPriority.HIGH,
            "URL/URI format",
            ("https://example.com", "http://api.domain.org/path"),
        ),
        # IP Address (IPv4)
        (
            "ipv4",
            r"(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)",
            DataType.IP_ADDRESS,
            PatternPriority.MEDIUM,
            "IPv4 address",
            ("192.168.1.1", "10.0.0.255"),
        ),
        # IPv6 Address (simplified)
        (
            "ipv6",
            r"(?:[0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}",
            DataType.IP_ADDRESS,
            PatternPriority.MEDIUM,
            "IPv6 address (full format)",
            ("2001:0db8:85a3:0000:0000:8a2e:0370:7334",),
        ),
        # Phone (international format)
        (
            "phone_international",
            r"\+?[1-9]\d{1,14}",
            DataType.PHONE,
            PatternPriority.LOW,
            "International phone number (E.164)",
            ("+14155551234", "+821012345678"),
        ),
        # Credit card number (basic validation)
        (
            "credit_card",
            r"\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}",
            DataType.IDENTIFIER,
            PatternPriority.MEDIUM,
            "Credit card number format",
            ("4111-1111-1111-1111", "5500 0000 0000 0004"),
        ),
        # ISO 8601 Date
        (
            "iso_date",
            r"\d{4}-\d{2}-\d{2}",
            DataType.DATE,
            PatternPriority.MEDIUM,
            "ISO 8601 date format",
            ("2024-01-15", "2023-12-31"),
        ),
        # ISO 8601 DateTime
        (
            "iso_datetime",
            r"\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:?\d{2})?",
            DataType.DATETIME,
            PatternPriority.MEDIUM,
            "ISO 8601 datetime format",
            ("2024-01-15T10:30:00Z", "2023-12-31 23:59:59+09:00"),
        ),
        # JSON object/array
        (
            "json",
            r'[\[\{].*[\]\}]',
            DataType.JSON,
            PatternPriority.LOWEST,
            "JSON object or array",
            ('{"key": "value"}', '[1, 2, 3]'),
        ),
    )

    registry = PatternRegistry()
    for name, regex, dtype, priority, description, examples in builtin_rows:
        spec = (
            PatternBuilder(name)
            .regex(regex)
            .data_type(dtype)
            .priority(priority)
            .description(description)
            .examples(*examples)
            .build()
        )
        registry.register(spec)
    return registry
389
+
390
+
391
# Global built-in pattern registry, created once when the module is imported.
# NativePatternMatcher falls back to this object when it is not given a
# registry, so mutating it changes the default for every such matcher.
BUILTIN_PATTERNS: PatternRegistry = _create_builtin_patterns()
393
+
394
+
395
+ # =============================================================================
396
+ # Native Pattern Matcher
397
+ # =============================================================================
398
+
399
+
400
@dataclass
class PatternMatchResult:
    """Outcome of testing one pattern against one column.

    Attributes:
        pattern: The pattern that was matched
        match_count: How many values matched
        total_count: How many non-null values were examined
        match_ratio: Fraction of values that matched (0.0 to 1.0)
        sample_matches: A few of the matching values
        sample_non_matches: A few of the values that did not match
    """

    pattern: PatternSpec
    match_count: int
    total_count: int
    match_ratio: float
    sample_matches: tuple[str, ...] = field(default_factory=tuple)
    sample_non_matches: tuple[str, ...] = field(default_factory=tuple)

    def to_pattern_match(self) -> PatternMatch:
        """Build the legacy PatternMatch equivalent of this result.

        Only the pattern name, its regex, the match ratio and the matching
        samples are passed on.
        """
        spec = self.pattern
        return PatternMatch(
            pattern=spec.name,
            regex=spec.regex,
            match_ratio=self.match_ratio,
            sample_matches=self.sample_matches,
        )
428
+
429
+
430
class NativePatternMatcher:
    """High-performance pattern matcher using Polars native operations.

    This matcher achieves 10-50x performance improvement over Python
    regex loops by leveraging Polars' Rust-based string operations
    and lazy evaluation.

    Example:
        matcher = NativePatternMatcher()

        # Match single column
        result = matcher.match_column(lf, "email_column")

        # Match with custom patterns
        custom = PatternRegistry()
        custom.register(my_pattern)
        matcher = NativePatternMatcher(patterns=custom)

        # Match with minimum ratio (per-call override of the instance default)
        result = matcher.match_column(lf, "col", min_match_ratio=0.9)
    """

    def __init__(
        self,
        patterns: PatternRegistry | None = None,
        *,
        min_match_ratio: float = 0.8,
        sample_size: int = 5,
        include_non_matches: bool = False,
    ):
        """Initialize the pattern matcher.

        Args:
            patterns: Pattern registry to use (defaults to BUILTIN_PATTERNS
                when None; an explicitly supplied empty registry is kept)
            min_match_ratio: Minimum ratio to consider a pattern matched
            sample_size: Number of sample values to collect
            include_non_matches: Whether to collect non-matching samples
        """
        # Test against None rather than truthiness so that an explicitly
        # supplied (possibly empty) registry is never silently replaced.
        self.patterns = BUILTIN_PATTERNS if patterns is None else patterns
        self.min_match_ratio = min_match_ratio
        self.sample_size = sample_size
        self.include_non_matches = include_non_matches

    def match_column(
        self,
        lf: pl.LazyFrame,
        column: str,
        *,
        patterns: Sequence[PatternSpec] | None = None,
        limit: int | None = None,
        min_match_ratio: float | None = None,
    ) -> list[PatternMatchResult]:
        """Match patterns against a column using native Polars operations.

        This is the main entry point for pattern matching. All patterns are
        counted in a single query; if that query fails, each pattern is
        evaluated on its own via ``_match_column_sequential``.

        Args:
            lf: LazyFrame containing the data
            column: Column name to analyze
            patterns: Optional override of patterns to check. ``None`` means
                "use the matcher's registry"; an empty sequence means
                "check nothing" and yields an empty result.
            limit: Optional limit on number of rows to analyze. A falsy value
                (``None`` or ``0``) means no limit.
            min_match_ratio: Optional per-call override of the instance's
                ``min_match_ratio``

        Returns:
            List of matching patterns, sorted by match ratio descending and
            then by pattern priority descending
        """
        patterns_to_check = list(self.patterns if patterns is None else patterns)
        if not patterns_to_check:
            return []

        threshold = self.min_match_ratio if min_match_ratio is None else min_match_ratio

        # Apply limit if specified
        if limit:
            lf = lf.head(limit)

        # Build a single query that tests all patterns at once.
        # .sum() on a boolean expression counts the True values.
        count_exprs = [pl.col(column).is_not_null().sum().alias("__total_count")]
        count_exprs.extend(
            pattern.to_polars_expr(column).sum().alias(f"__pattern_{pattern.name}")
            for pattern in patterns_to_check
        )

        try:
            counts = (
                lf.select(pl.col(column))
                .filter(pl.col(column).is_not_null())
                .select(count_exprs)
                .collect()
            )
        except Exception:
            # Fallback to individual queries if the batch query fails.
            # ``lf`` is already truncated above, so no limit is passed on.
            return self._match_column_sequential(
                lf, column, patterns_to_check, None, min_match_ratio=threshold
            )

        total_count = counts["__total_count"][0]
        if not total_count:
            return []

        results: list[PatternMatchResult] = []
        for pattern in patterns_to_check:
            col_name = f"__pattern_{pattern.name}"
            if col_name not in counts.columns:
                continue
            result = self._build_result(
                lf, column, pattern, counts[col_name][0], total_count, threshold
            )
            if result is not None:
                results.append(result)

        # Best ratio first; higher priority wins ties
        results.sort(key=lambda r: (-r.match_ratio, -r.pattern.priority))
        return results

    def _build_result(
        self,
        lf: pl.LazyFrame,
        column: str,
        pattern: PatternSpec,
        match_count: int | None,
        total_count: int,
        threshold: float,
    ) -> PatternMatchResult | None:
        """Build the result for one pattern, or None if it is below threshold."""
        # Guard against an aggregate that comes back as null
        match_count = match_count or 0
        match_ratio = match_count / total_count
        if match_ratio < threshold:
            return None

        sample_matches = self._collect_samples(lf, column, pattern, self.sample_size)
        sample_non_matches: tuple[str, ...] = ()
        if self.include_non_matches:
            sample_non_matches = self._collect_samples(
                lf, column, pattern, self.sample_size, matching=False
            )

        return PatternMatchResult(
            pattern=pattern,
            match_count=match_count,
            total_count=total_count,
            match_ratio=match_ratio,
            sample_matches=sample_matches,
            sample_non_matches=sample_non_matches,
        )

    def _match_column_sequential(
        self,
        lf: pl.LazyFrame,
        column: str,
        patterns: list[PatternSpec],
        limit: int | None,
        min_match_ratio: float | None = None,
    ) -> list[PatternMatchResult]:
        """Fallback sequential matching for complex patterns.

        Runs one query per pattern so that a single failing pattern is
        skipped instead of failing the whole column.

        Args:
            lf: LazyFrame containing the data
            column: Column name to analyze
            patterns: Patterns to evaluate
            limit: Optional row limit (falsy means no limit)
            min_match_ratio: Optional override of the instance threshold

        Returns:
            List of matching patterns, sorted like ``match_column``
        """
        threshold = self.min_match_ratio if min_match_ratio is None else min_match_ratio

        if limit:
            lf = lf.head(limit)

        # Get total count of non-null values
        total_count = (
            lf.select(pl.col(column).is_not_null().sum())
            .collect()
            .item()
        )
        if not total_count:
            return []

        results: list[PatternMatchResult] = []
        for pattern in patterns:
            try:
                match_count = (
                    lf.select(pattern.to_polars_expr(column).sum())
                    .collect()
                    .item()
                )
                result = self._build_result(
                    lf, column, pattern, match_count, total_count, threshold
                )
            except Exception:
                # Skip patterns that fail
                continue
            if result is not None:
                results.append(result)

        results.sort(key=lambda r: (-r.match_ratio, -r.pattern.priority))
        return results

    def _collect_samples(
        self,
        lf: pl.LazyFrame,
        column: str,
        pattern: PatternSpec,
        n: int,
        *,
        matching: bool = True,
    ) -> tuple[str, ...]:
        """Collect up to ``n`` non-null sample values from the column.

        Args:
            lf: LazyFrame containing the data
            column: Column name to sample from
            pattern: Pattern whose expression selects the rows
            n: Maximum number of samples
            matching: If True (default) sample values that match the pattern,
                otherwise sample values that do not match

        Returns:
            Sample values converted to strings; empty tuple if sampling fails
            (sampling is best-effort and never raises)
        """
        try:
            predicate = pattern.to_polars_expr(column)
            if not matching:
                predicate = ~predicate
            samples = (
                lf.select(pl.col(column))
                .filter(pl.col(column).is_not_null())
                .filter(predicate)
                .head(n)
                .collect()
            )
            return tuple(str(v) for v in samples[column].to_list())
        except Exception:
            return ()

    def infer_type(
        self,
        lf: pl.LazyFrame,
        column: str,
        *,
        min_match_ratio: float | None = None,
    ) -> DataType | None:
        """Infer semantic type based on pattern matching.

        Returns the data type of the best matching pattern (highest match
        ratio, ties broken by higher priority), or None if no patterns match.

        Args:
            lf: LazyFrame containing the data
            column: Column name to analyze
            min_match_ratio: Override minimum match ratio

        Returns:
            Inferred DataType or None
        """
        # The override is passed per call, so instance state is never mutated.
        results = self.match_column(lf, column, min_match_ratio=min_match_ratio)
        if results:
            return results[0].pattern.data_type
        return None

    def match_all_columns(
        self,
        lf: pl.LazyFrame,
        *,
        string_columns_only: bool = True,
    ) -> dict[str, list[PatternMatchResult]]:
        """Match patterns against all applicable columns.

        Args:
            lf: LazyFrame to analyze
            string_columns_only: Only analyze string columns (recommended)

        Returns:
            Dictionary mapping column names to their pattern matches;
            columns without any match are omitted
        """
        schema = lf.collect_schema()
        results: dict[str, list[PatternMatchResult]] = {}

        for col_name, dtype in schema.items():
            # Polars dtype equality accepts both dtype classes and instances,
            # unlike a ``type(dtype)`` membership test.
            if string_columns_only and dtype not in (pl.String, pl.Utf8):
                continue

            col_results = self.match_column(lf, col_name)
            if col_results:
                results[col_name] = col_results

        return results
692
+
693
+
694
+ # =============================================================================
695
+ # Native Pattern Analyzer (Integration with profiler)
696
+ # =============================================================================
697
+
698
+
699
class NativePatternAnalyzer:
    """Column analyzer using native Polars pattern matching.

    This is a drop-in replacement for PatternAnalyzer that uses
    vectorized operations for much better performance. It wraps a
    NativePatternMatcher and converts its results to the legacy
    PatternMatch format.
    """

    name = "native_pattern"
    applicable_types = {pl.String, pl.Utf8}

    def __init__(
        self,
        patterns: PatternRegistry | None = None,
        min_match_ratio: float = 0.8,
        sample_size: int = 5,
    ):
        """Create the analyzer and its underlying matcher.

        Args:
            patterns: Pattern registry forwarded to NativePatternMatcher
            min_match_ratio: Minimum ratio to consider a pattern matched
            sample_size: Number of sample values to collect per pattern
        """
        self.matcher = NativePatternMatcher(
            patterns=patterns,
            min_match_ratio=min_match_ratio,
            sample_size=sample_size,
        )

    def is_applicable(self, dtype: pl.DataType) -> bool:
        """Check if this analyzer is applicable to the given dtype.

        Uses Polars dtype equality so that both dtype classes (``pl.String``)
        and dtype instances (``pl.String()``) are recognised.
        """
        return any(dtype == candidate for candidate in self.applicable_types)

    def analyze(
        self,
        column: str,
        lf: pl.LazyFrame,
        config: Any,
    ) -> dict[str, Any]:
        """Analyze patterns in the column.

        Args:
            column: Column name
            lf: LazyFrame containing the data
            config: Profiler configuration; its ``pattern_sample_size``
                attribute, when present, limits the rows analyzed
                (1000 is used when the attribute is missing)

        Returns:
            Dictionary with detected_patterns key holding a tuple of
            legacy PatternMatch objects
        """
        # Get limit from config if available
        limit = getattr(config, "pattern_sample_size", 1000)

        results = self.matcher.match_column(lf, column, limit=limit)

        # Convert to legacy PatternMatch format
        detected = tuple(r.to_pattern_match() for r in results)

        return {"detected_patterns": detected}
750
+
751
+
752
+ # =============================================================================
753
+ # Convenience Functions
754
+ # =============================================================================
755
+
756
+
757
def match_patterns(
    data: pl.LazyFrame | pl.DataFrame,
    column: str,
    *,
    min_ratio: float = 0.8,
) -> list[PatternMatchResult]:
    """Match the default patterns against one column of a frame.

    An eager DataFrame is converted to a LazyFrame first; the work is then
    delegated to a freshly created NativePatternMatcher.

    Args:
        data: DataFrame or LazyFrame
        column: Column name to analyze
        min_ratio: Minimum match ratio a pattern must reach to be reported

    Returns:
        List of matching patterns

    Example:
        import polars as pl
        from truthound.profiler.native_patterns import match_patterns

        df = pl.DataFrame({"email": ["user@example.com", "test@test.org"]})
        results = match_patterns(df, "email")
        for r in results:
            print(f"{r.pattern.name}: {r.match_ratio:.2%}")
    """
    lazy_frame = data.lazy() if isinstance(data, pl.DataFrame) else data
    return NativePatternMatcher(min_match_ratio=min_ratio).match_column(lazy_frame, column)
787
+
788
+
789
def infer_column_type(
    data: pl.LazyFrame | pl.DataFrame,
    column: str,
    *,
    min_ratio: float = 0.9,
) -> DataType | None:
    """Infer the semantic type of one column of a frame.

    An eager DataFrame is converted to a LazyFrame first; the inference is
    then delegated to ``NativePatternMatcher.infer_type``.

    Args:
        data: DataFrame or LazyFrame
        column: Column name to analyze
        min_ratio: Minimum match ratio for type inference

    Returns:
        Inferred DataType or None

    Example:
        import polars as pl
        from truthound.profiler.native_patterns import infer_column_type

        df = pl.DataFrame({"col": ["550e8400-e29b-41d4-a716-446655440000"]})
        dtype = infer_column_type(df, "col")  # Returns DataType.UUID
    """
    lazy_frame = data.lazy() if isinstance(data, pl.DataFrame) else data
    return NativePatternMatcher().infer_type(lazy_frame, column, min_match_ratio=min_ratio)