truthound 1.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (877) hide show
  1. truthound/__init__.py +162 -0
  2. truthound/adapters.py +100 -0
  3. truthound/api.py +365 -0
  4. truthound/audit/__init__.py +248 -0
  5. truthound/audit/core.py +967 -0
  6. truthound/audit/filters.py +620 -0
  7. truthound/audit/formatters.py +707 -0
  8. truthound/audit/logger.py +902 -0
  9. truthound/audit/middleware.py +571 -0
  10. truthound/audit/storage.py +1083 -0
  11. truthound/benchmark/__init__.py +123 -0
  12. truthound/benchmark/base.py +757 -0
  13. truthound/benchmark/comparison.py +635 -0
  14. truthound/benchmark/generators.py +706 -0
  15. truthound/benchmark/reporters.py +718 -0
  16. truthound/benchmark/runner.py +635 -0
  17. truthound/benchmark/scenarios.py +712 -0
  18. truthound/cache.py +252 -0
  19. truthound/checkpoint/__init__.py +136 -0
  20. truthound/checkpoint/actions/__init__.py +164 -0
  21. truthound/checkpoint/actions/base.py +324 -0
  22. truthound/checkpoint/actions/custom.py +234 -0
  23. truthound/checkpoint/actions/discord_notify.py +290 -0
  24. truthound/checkpoint/actions/email_notify.py +405 -0
  25. truthound/checkpoint/actions/github_action.py +406 -0
  26. truthound/checkpoint/actions/opsgenie.py +1499 -0
  27. truthound/checkpoint/actions/pagerduty.py +226 -0
  28. truthound/checkpoint/actions/slack_notify.py +233 -0
  29. truthound/checkpoint/actions/store_result.py +249 -0
  30. truthound/checkpoint/actions/teams_notify.py +1570 -0
  31. truthound/checkpoint/actions/telegram_notify.py +419 -0
  32. truthound/checkpoint/actions/update_docs.py +552 -0
  33. truthound/checkpoint/actions/webhook.py +293 -0
  34. truthound/checkpoint/analytics/__init__.py +147 -0
  35. truthound/checkpoint/analytics/aggregations/__init__.py +23 -0
  36. truthound/checkpoint/analytics/aggregations/rollup.py +481 -0
  37. truthound/checkpoint/analytics/aggregations/time_bucket.py +306 -0
  38. truthound/checkpoint/analytics/analyzers/__init__.py +17 -0
  39. truthound/checkpoint/analytics/analyzers/anomaly.py +386 -0
  40. truthound/checkpoint/analytics/analyzers/base.py +270 -0
  41. truthound/checkpoint/analytics/analyzers/forecast.py +421 -0
  42. truthound/checkpoint/analytics/analyzers/trend.py +314 -0
  43. truthound/checkpoint/analytics/models.py +292 -0
  44. truthound/checkpoint/analytics/protocols.py +549 -0
  45. truthound/checkpoint/analytics/service.py +718 -0
  46. truthound/checkpoint/analytics/stores/__init__.py +16 -0
  47. truthound/checkpoint/analytics/stores/base.py +306 -0
  48. truthound/checkpoint/analytics/stores/memory_store.py +353 -0
  49. truthound/checkpoint/analytics/stores/sqlite_store.py +557 -0
  50. truthound/checkpoint/analytics/stores/timescale_store.py +501 -0
  51. truthound/checkpoint/async_actions.py +794 -0
  52. truthound/checkpoint/async_base.py +708 -0
  53. truthound/checkpoint/async_checkpoint.py +617 -0
  54. truthound/checkpoint/async_runner.py +639 -0
  55. truthound/checkpoint/checkpoint.py +527 -0
  56. truthound/checkpoint/ci/__init__.py +61 -0
  57. truthound/checkpoint/ci/detector.py +355 -0
  58. truthound/checkpoint/ci/reporter.py +436 -0
  59. truthound/checkpoint/ci/templates.py +454 -0
  60. truthound/checkpoint/circuitbreaker/__init__.py +133 -0
  61. truthound/checkpoint/circuitbreaker/breaker.py +542 -0
  62. truthound/checkpoint/circuitbreaker/core.py +252 -0
  63. truthound/checkpoint/circuitbreaker/detection.py +459 -0
  64. truthound/checkpoint/circuitbreaker/middleware.py +389 -0
  65. truthound/checkpoint/circuitbreaker/registry.py +357 -0
  66. truthound/checkpoint/distributed/__init__.py +139 -0
  67. truthound/checkpoint/distributed/backends/__init__.py +35 -0
  68. truthound/checkpoint/distributed/backends/celery_backend.py +503 -0
  69. truthound/checkpoint/distributed/backends/kubernetes_backend.py +696 -0
  70. truthound/checkpoint/distributed/backends/local_backend.py +397 -0
  71. truthound/checkpoint/distributed/backends/ray_backend.py +625 -0
  72. truthound/checkpoint/distributed/base.py +774 -0
  73. truthound/checkpoint/distributed/orchestrator.py +765 -0
  74. truthound/checkpoint/distributed/protocols.py +842 -0
  75. truthound/checkpoint/distributed/registry.py +449 -0
  76. truthound/checkpoint/idempotency/__init__.py +120 -0
  77. truthound/checkpoint/idempotency/core.py +295 -0
  78. truthound/checkpoint/idempotency/fingerprint.py +454 -0
  79. truthound/checkpoint/idempotency/locking.py +604 -0
  80. truthound/checkpoint/idempotency/service.py +592 -0
  81. truthound/checkpoint/idempotency/stores.py +653 -0
  82. truthound/checkpoint/monitoring/__init__.py +134 -0
  83. truthound/checkpoint/monitoring/aggregators/__init__.py +15 -0
  84. truthound/checkpoint/monitoring/aggregators/base.py +372 -0
  85. truthound/checkpoint/monitoring/aggregators/realtime.py +300 -0
  86. truthound/checkpoint/monitoring/aggregators/window.py +493 -0
  87. truthound/checkpoint/monitoring/collectors/__init__.py +17 -0
  88. truthound/checkpoint/monitoring/collectors/base.py +257 -0
  89. truthound/checkpoint/monitoring/collectors/memory_collector.py +617 -0
  90. truthound/checkpoint/monitoring/collectors/prometheus_collector.py +451 -0
  91. truthound/checkpoint/monitoring/collectors/redis_collector.py +518 -0
  92. truthound/checkpoint/monitoring/events.py +410 -0
  93. truthound/checkpoint/monitoring/protocols.py +636 -0
  94. truthound/checkpoint/monitoring/service.py +578 -0
  95. truthound/checkpoint/monitoring/views/__init__.py +17 -0
  96. truthound/checkpoint/monitoring/views/base.py +172 -0
  97. truthound/checkpoint/monitoring/views/queue_view.py +220 -0
  98. truthound/checkpoint/monitoring/views/task_view.py +240 -0
  99. truthound/checkpoint/monitoring/views/worker_view.py +263 -0
  100. truthound/checkpoint/registry.py +337 -0
  101. truthound/checkpoint/runner.py +356 -0
  102. truthound/checkpoint/transaction/__init__.py +133 -0
  103. truthound/checkpoint/transaction/base.py +389 -0
  104. truthound/checkpoint/transaction/compensatable.py +537 -0
  105. truthound/checkpoint/transaction/coordinator.py +576 -0
  106. truthound/checkpoint/transaction/executor.py +622 -0
  107. truthound/checkpoint/transaction/idempotency.py +534 -0
  108. truthound/checkpoint/transaction/saga/__init__.py +143 -0
  109. truthound/checkpoint/transaction/saga/builder.py +584 -0
  110. truthound/checkpoint/transaction/saga/definition.py +515 -0
  111. truthound/checkpoint/transaction/saga/event_store.py +542 -0
  112. truthound/checkpoint/transaction/saga/patterns.py +833 -0
  113. truthound/checkpoint/transaction/saga/runner.py +718 -0
  114. truthound/checkpoint/transaction/saga/state_machine.py +793 -0
  115. truthound/checkpoint/transaction/saga/strategies.py +780 -0
  116. truthound/checkpoint/transaction/saga/testing.py +886 -0
  117. truthound/checkpoint/triggers/__init__.py +58 -0
  118. truthound/checkpoint/triggers/base.py +237 -0
  119. truthound/checkpoint/triggers/event.py +385 -0
  120. truthound/checkpoint/triggers/schedule.py +355 -0
  121. truthound/cli.py +2358 -0
  122. truthound/cli_modules/__init__.py +124 -0
  123. truthound/cli_modules/advanced/__init__.py +45 -0
  124. truthound/cli_modules/advanced/benchmark.py +343 -0
  125. truthound/cli_modules/advanced/docs.py +225 -0
  126. truthound/cli_modules/advanced/lineage.py +209 -0
  127. truthound/cli_modules/advanced/ml.py +320 -0
  128. truthound/cli_modules/advanced/realtime.py +196 -0
  129. truthound/cli_modules/checkpoint/__init__.py +46 -0
  130. truthound/cli_modules/checkpoint/init.py +114 -0
  131. truthound/cli_modules/checkpoint/list.py +71 -0
  132. truthound/cli_modules/checkpoint/run.py +159 -0
  133. truthound/cli_modules/checkpoint/validate.py +67 -0
  134. truthound/cli_modules/common/__init__.py +71 -0
  135. truthound/cli_modules/common/errors.py +414 -0
  136. truthound/cli_modules/common/options.py +419 -0
  137. truthound/cli_modules/common/output.py +507 -0
  138. truthound/cli_modules/common/protocol.py +552 -0
  139. truthound/cli_modules/core/__init__.py +48 -0
  140. truthound/cli_modules/core/check.py +123 -0
  141. truthound/cli_modules/core/compare.py +104 -0
  142. truthound/cli_modules/core/learn.py +57 -0
  143. truthound/cli_modules/core/mask.py +77 -0
  144. truthound/cli_modules/core/profile.py +65 -0
  145. truthound/cli_modules/core/scan.py +61 -0
  146. truthound/cli_modules/profiler/__init__.py +51 -0
  147. truthound/cli_modules/profiler/auto_profile.py +175 -0
  148. truthound/cli_modules/profiler/metadata.py +107 -0
  149. truthound/cli_modules/profiler/suite.py +283 -0
  150. truthound/cli_modules/registry.py +431 -0
  151. truthound/cli_modules/scaffolding/__init__.py +89 -0
  152. truthound/cli_modules/scaffolding/base.py +631 -0
  153. truthound/cli_modules/scaffolding/commands.py +545 -0
  154. truthound/cli_modules/scaffolding/plugins.py +1072 -0
  155. truthound/cli_modules/scaffolding/reporters.py +594 -0
  156. truthound/cli_modules/scaffolding/validators.py +1127 -0
  157. truthound/common/__init__.py +18 -0
  158. truthound/common/resilience/__init__.py +130 -0
  159. truthound/common/resilience/bulkhead.py +266 -0
  160. truthound/common/resilience/circuit_breaker.py +516 -0
  161. truthound/common/resilience/composite.py +332 -0
  162. truthound/common/resilience/config.py +292 -0
  163. truthound/common/resilience/protocols.py +217 -0
  164. truthound/common/resilience/rate_limiter.py +404 -0
  165. truthound/common/resilience/retry.py +341 -0
  166. truthound/datadocs/__init__.py +260 -0
  167. truthound/datadocs/base.py +571 -0
  168. truthound/datadocs/builder.py +761 -0
  169. truthound/datadocs/charts.py +764 -0
  170. truthound/datadocs/dashboard/__init__.py +63 -0
  171. truthound/datadocs/dashboard/app.py +576 -0
  172. truthound/datadocs/dashboard/components.py +584 -0
  173. truthound/datadocs/dashboard/state.py +240 -0
  174. truthound/datadocs/engine/__init__.py +46 -0
  175. truthound/datadocs/engine/context.py +376 -0
  176. truthound/datadocs/engine/pipeline.py +618 -0
  177. truthound/datadocs/engine/registry.py +469 -0
  178. truthound/datadocs/exporters/__init__.py +49 -0
  179. truthound/datadocs/exporters/base.py +198 -0
  180. truthound/datadocs/exporters/html.py +178 -0
  181. truthound/datadocs/exporters/json_exporter.py +253 -0
  182. truthound/datadocs/exporters/markdown.py +284 -0
  183. truthound/datadocs/exporters/pdf.py +392 -0
  184. truthound/datadocs/i18n/__init__.py +86 -0
  185. truthound/datadocs/i18n/catalog.py +960 -0
  186. truthound/datadocs/i18n/formatting.py +505 -0
  187. truthound/datadocs/i18n/loader.py +256 -0
  188. truthound/datadocs/i18n/plurals.py +378 -0
  189. truthound/datadocs/renderers/__init__.py +42 -0
  190. truthound/datadocs/renderers/base.py +401 -0
  191. truthound/datadocs/renderers/custom.py +342 -0
  192. truthound/datadocs/renderers/jinja.py +697 -0
  193. truthound/datadocs/sections.py +736 -0
  194. truthound/datadocs/styles.py +931 -0
  195. truthound/datadocs/themes/__init__.py +101 -0
  196. truthound/datadocs/themes/base.py +336 -0
  197. truthound/datadocs/themes/default.py +417 -0
  198. truthound/datadocs/themes/enterprise.py +419 -0
  199. truthound/datadocs/themes/loader.py +336 -0
  200. truthound/datadocs/themes.py +301 -0
  201. truthound/datadocs/transformers/__init__.py +57 -0
  202. truthound/datadocs/transformers/base.py +268 -0
  203. truthound/datadocs/transformers/enrichers.py +544 -0
  204. truthound/datadocs/transformers/filters.py +447 -0
  205. truthound/datadocs/transformers/i18n.py +468 -0
  206. truthound/datadocs/versioning/__init__.py +62 -0
  207. truthound/datadocs/versioning/diff.py +639 -0
  208. truthound/datadocs/versioning/storage.py +497 -0
  209. truthound/datadocs/versioning/version.py +358 -0
  210. truthound/datasources/__init__.py +223 -0
  211. truthound/datasources/_async_protocols.py +222 -0
  212. truthound/datasources/_protocols.py +159 -0
  213. truthound/datasources/adapters.py +428 -0
  214. truthound/datasources/async_base.py +599 -0
  215. truthound/datasources/async_factory.py +511 -0
  216. truthound/datasources/base.py +516 -0
  217. truthound/datasources/factory.py +433 -0
  218. truthound/datasources/nosql/__init__.py +47 -0
  219. truthound/datasources/nosql/base.py +487 -0
  220. truthound/datasources/nosql/elasticsearch.py +801 -0
  221. truthound/datasources/nosql/mongodb.py +636 -0
  222. truthound/datasources/pandas_optimized.py +582 -0
  223. truthound/datasources/pandas_source.py +216 -0
  224. truthound/datasources/polars_source.py +395 -0
  225. truthound/datasources/spark_source.py +479 -0
  226. truthound/datasources/sql/__init__.py +154 -0
  227. truthound/datasources/sql/base.py +710 -0
  228. truthound/datasources/sql/bigquery.py +410 -0
  229. truthound/datasources/sql/cloud_base.py +199 -0
  230. truthound/datasources/sql/databricks.py +471 -0
  231. truthound/datasources/sql/mysql.py +316 -0
  232. truthound/datasources/sql/oracle.py +427 -0
  233. truthound/datasources/sql/postgresql.py +321 -0
  234. truthound/datasources/sql/redshift.py +479 -0
  235. truthound/datasources/sql/snowflake.py +439 -0
  236. truthound/datasources/sql/sqlite.py +286 -0
  237. truthound/datasources/sql/sqlserver.py +437 -0
  238. truthound/datasources/streaming/__init__.py +47 -0
  239. truthound/datasources/streaming/base.py +350 -0
  240. truthound/datasources/streaming/kafka.py +670 -0
  241. truthound/decorators.py +98 -0
  242. truthound/docs/__init__.py +69 -0
  243. truthound/docs/extractor.py +971 -0
  244. truthound/docs/generator.py +601 -0
  245. truthound/docs/parser.py +1037 -0
  246. truthound/docs/renderer.py +999 -0
  247. truthound/drift/__init__.py +22 -0
  248. truthound/drift/compare.py +189 -0
  249. truthound/drift/detectors.py +464 -0
  250. truthound/drift/report.py +160 -0
  251. truthound/execution/__init__.py +65 -0
  252. truthound/execution/_protocols.py +324 -0
  253. truthound/execution/base.py +576 -0
  254. truthound/execution/distributed/__init__.py +179 -0
  255. truthound/execution/distributed/aggregations.py +731 -0
  256. truthound/execution/distributed/arrow_bridge.py +817 -0
  257. truthound/execution/distributed/base.py +550 -0
  258. truthound/execution/distributed/dask_engine.py +976 -0
  259. truthound/execution/distributed/mixins.py +766 -0
  260. truthound/execution/distributed/protocols.py +756 -0
  261. truthound/execution/distributed/ray_engine.py +1127 -0
  262. truthound/execution/distributed/registry.py +446 -0
  263. truthound/execution/distributed/spark_engine.py +1011 -0
  264. truthound/execution/distributed/validator_adapter.py +682 -0
  265. truthound/execution/pandas_engine.py +401 -0
  266. truthound/execution/polars_engine.py +497 -0
  267. truthound/execution/pushdown/__init__.py +230 -0
  268. truthound/execution/pushdown/ast.py +1550 -0
  269. truthound/execution/pushdown/builder.py +1550 -0
  270. truthound/execution/pushdown/dialects.py +1072 -0
  271. truthound/execution/pushdown/executor.py +829 -0
  272. truthound/execution/pushdown/optimizer.py +1041 -0
  273. truthound/execution/sql_engine.py +518 -0
  274. truthound/infrastructure/__init__.py +189 -0
  275. truthound/infrastructure/audit.py +1515 -0
  276. truthound/infrastructure/config.py +1133 -0
  277. truthound/infrastructure/encryption.py +1132 -0
  278. truthound/infrastructure/logging.py +1503 -0
  279. truthound/infrastructure/metrics.py +1220 -0
  280. truthound/lineage/__init__.py +89 -0
  281. truthound/lineage/base.py +746 -0
  282. truthound/lineage/impact_analysis.py +474 -0
  283. truthound/lineage/integrations/__init__.py +22 -0
  284. truthound/lineage/integrations/openlineage.py +548 -0
  285. truthound/lineage/tracker.py +512 -0
  286. truthound/lineage/visualization/__init__.py +33 -0
  287. truthound/lineage/visualization/protocols.py +145 -0
  288. truthound/lineage/visualization/renderers/__init__.py +20 -0
  289. truthound/lineage/visualization/renderers/cytoscape.py +329 -0
  290. truthound/lineage/visualization/renderers/d3.py +331 -0
  291. truthound/lineage/visualization/renderers/graphviz.py +276 -0
  292. truthound/lineage/visualization/renderers/mermaid.py +308 -0
  293. truthound/maskers.py +113 -0
  294. truthound/ml/__init__.py +124 -0
  295. truthound/ml/anomaly_models/__init__.py +31 -0
  296. truthound/ml/anomaly_models/ensemble.py +362 -0
  297. truthound/ml/anomaly_models/isolation_forest.py +444 -0
  298. truthound/ml/anomaly_models/statistical.py +392 -0
  299. truthound/ml/base.py +1178 -0
  300. truthound/ml/drift_detection/__init__.py +26 -0
  301. truthound/ml/drift_detection/concept.py +381 -0
  302. truthound/ml/drift_detection/distribution.py +361 -0
  303. truthound/ml/drift_detection/feature.py +442 -0
  304. truthound/ml/drift_detection/multivariate.py +495 -0
  305. truthound/ml/monitoring/__init__.py +88 -0
  306. truthound/ml/monitoring/alerting/__init__.py +33 -0
  307. truthound/ml/monitoring/alerting/handlers.py +427 -0
  308. truthound/ml/monitoring/alerting/rules.py +508 -0
  309. truthound/ml/monitoring/collectors/__init__.py +19 -0
  310. truthound/ml/monitoring/collectors/composite.py +105 -0
  311. truthound/ml/monitoring/collectors/drift.py +324 -0
  312. truthound/ml/monitoring/collectors/performance.py +179 -0
  313. truthound/ml/monitoring/collectors/quality.py +369 -0
  314. truthound/ml/monitoring/monitor.py +536 -0
  315. truthound/ml/monitoring/protocols.py +451 -0
  316. truthound/ml/monitoring/stores/__init__.py +15 -0
  317. truthound/ml/monitoring/stores/memory.py +201 -0
  318. truthound/ml/monitoring/stores/prometheus.py +296 -0
  319. truthound/ml/rule_learning/__init__.py +25 -0
  320. truthound/ml/rule_learning/constraint_miner.py +443 -0
  321. truthound/ml/rule_learning/pattern_learner.py +499 -0
  322. truthound/ml/rule_learning/profile_learner.py +462 -0
  323. truthound/multitenancy/__init__.py +326 -0
  324. truthound/multitenancy/core.py +852 -0
  325. truthound/multitenancy/integration.py +597 -0
  326. truthound/multitenancy/isolation.py +630 -0
  327. truthound/multitenancy/manager.py +770 -0
  328. truthound/multitenancy/middleware.py +765 -0
  329. truthound/multitenancy/quota.py +537 -0
  330. truthound/multitenancy/resolvers.py +603 -0
  331. truthound/multitenancy/storage.py +703 -0
  332. truthound/observability/__init__.py +307 -0
  333. truthound/observability/context.py +531 -0
  334. truthound/observability/instrumentation.py +611 -0
  335. truthound/observability/logging.py +887 -0
  336. truthound/observability/metrics.py +1157 -0
  337. truthound/observability/tracing/__init__.py +178 -0
  338. truthound/observability/tracing/baggage.py +310 -0
  339. truthound/observability/tracing/config.py +426 -0
  340. truthound/observability/tracing/exporter.py +787 -0
  341. truthound/observability/tracing/integration.py +1018 -0
  342. truthound/observability/tracing/otel/__init__.py +146 -0
  343. truthound/observability/tracing/otel/adapter.py +982 -0
  344. truthound/observability/tracing/otel/bridge.py +1177 -0
  345. truthound/observability/tracing/otel/compat.py +681 -0
  346. truthound/observability/tracing/otel/config.py +691 -0
  347. truthound/observability/tracing/otel/detection.py +327 -0
  348. truthound/observability/tracing/otel/protocols.py +426 -0
  349. truthound/observability/tracing/processor.py +561 -0
  350. truthound/observability/tracing/propagator.py +757 -0
  351. truthound/observability/tracing/provider.py +569 -0
  352. truthound/observability/tracing/resource.py +515 -0
  353. truthound/observability/tracing/sampler.py +487 -0
  354. truthound/observability/tracing/span.py +676 -0
  355. truthound/plugins/__init__.py +198 -0
  356. truthound/plugins/base.py +599 -0
  357. truthound/plugins/cli.py +680 -0
  358. truthound/plugins/dependencies/__init__.py +42 -0
  359. truthound/plugins/dependencies/graph.py +422 -0
  360. truthound/plugins/dependencies/resolver.py +417 -0
  361. truthound/plugins/discovery.py +379 -0
  362. truthound/plugins/docs/__init__.py +46 -0
  363. truthound/plugins/docs/extractor.py +444 -0
  364. truthound/plugins/docs/renderer.py +499 -0
  365. truthound/plugins/enterprise_manager.py +877 -0
  366. truthound/plugins/examples/__init__.py +19 -0
  367. truthound/plugins/examples/custom_validators.py +317 -0
  368. truthound/plugins/examples/slack_notifier.py +312 -0
  369. truthound/plugins/examples/xml_reporter.py +254 -0
  370. truthound/plugins/hooks.py +558 -0
  371. truthound/plugins/lifecycle/__init__.py +43 -0
  372. truthound/plugins/lifecycle/hot_reload.py +402 -0
  373. truthound/plugins/lifecycle/manager.py +371 -0
  374. truthound/plugins/manager.py +736 -0
  375. truthound/plugins/registry.py +338 -0
  376. truthound/plugins/security/__init__.py +93 -0
  377. truthound/plugins/security/exceptions.py +332 -0
  378. truthound/plugins/security/policies.py +348 -0
  379. truthound/plugins/security/protocols.py +643 -0
  380. truthound/plugins/security/sandbox/__init__.py +45 -0
  381. truthound/plugins/security/sandbox/context.py +158 -0
  382. truthound/plugins/security/sandbox/engines/__init__.py +19 -0
  383. truthound/plugins/security/sandbox/engines/container.py +379 -0
  384. truthound/plugins/security/sandbox/engines/noop.py +144 -0
  385. truthound/plugins/security/sandbox/engines/process.py +336 -0
  386. truthound/plugins/security/sandbox/factory.py +211 -0
  387. truthound/plugins/security/signing/__init__.py +57 -0
  388. truthound/plugins/security/signing/service.py +330 -0
  389. truthound/plugins/security/signing/trust_store.py +368 -0
  390. truthound/plugins/security/signing/verifier.py +459 -0
  391. truthound/plugins/versioning/__init__.py +41 -0
  392. truthound/plugins/versioning/constraints.py +297 -0
  393. truthound/plugins/versioning/resolver.py +329 -0
  394. truthound/profiler/__init__.py +1729 -0
  395. truthound/profiler/_lazy.py +452 -0
  396. truthound/profiler/ab_testing/__init__.py +80 -0
  397. truthound/profiler/ab_testing/analysis.py +449 -0
  398. truthound/profiler/ab_testing/base.py +257 -0
  399. truthound/profiler/ab_testing/experiment.py +395 -0
  400. truthound/profiler/ab_testing/tracking.py +368 -0
  401. truthound/profiler/auto_threshold.py +1170 -0
  402. truthound/profiler/base.py +579 -0
  403. truthound/profiler/cache_patterns.py +911 -0
  404. truthound/profiler/caching.py +1303 -0
  405. truthound/profiler/column_profiler.py +712 -0
  406. truthound/profiler/comparison.py +1007 -0
  407. truthound/profiler/custom_patterns.py +1170 -0
  408. truthound/profiler/dashboard/__init__.py +50 -0
  409. truthound/profiler/dashboard/app.py +476 -0
  410. truthound/profiler/dashboard/components.py +457 -0
  411. truthound/profiler/dashboard/config.py +72 -0
  412. truthound/profiler/distributed/__init__.py +83 -0
  413. truthound/profiler/distributed/base.py +281 -0
  414. truthound/profiler/distributed/dask_backend.py +498 -0
  415. truthound/profiler/distributed/local_backend.py +293 -0
  416. truthound/profiler/distributed/profiler.py +304 -0
  417. truthound/profiler/distributed/ray_backend.py +374 -0
  418. truthound/profiler/distributed/spark_backend.py +375 -0
  419. truthound/profiler/distributed.py +1366 -0
  420. truthound/profiler/enterprise_sampling.py +1065 -0
  421. truthound/profiler/errors.py +488 -0
  422. truthound/profiler/evolution/__init__.py +91 -0
  423. truthound/profiler/evolution/alerts.py +426 -0
  424. truthound/profiler/evolution/changes.py +206 -0
  425. truthound/profiler/evolution/compatibility.py +365 -0
  426. truthound/profiler/evolution/detector.py +372 -0
  427. truthound/profiler/evolution/protocols.py +121 -0
  428. truthound/profiler/generators/__init__.py +48 -0
  429. truthound/profiler/generators/base.py +384 -0
  430. truthound/profiler/generators/ml_rules.py +375 -0
  431. truthound/profiler/generators/pattern_rules.py +384 -0
  432. truthound/profiler/generators/schema_rules.py +267 -0
  433. truthound/profiler/generators/stats_rules.py +324 -0
  434. truthound/profiler/generators/suite_generator.py +857 -0
  435. truthound/profiler/i18n.py +1542 -0
  436. truthound/profiler/incremental.py +554 -0
  437. truthound/profiler/incremental_validation.py +1710 -0
  438. truthound/profiler/integration/__init__.py +73 -0
  439. truthound/profiler/integration/adapters.py +345 -0
  440. truthound/profiler/integration/context.py +371 -0
  441. truthound/profiler/integration/executor.py +527 -0
  442. truthound/profiler/integration/naming.py +75 -0
  443. truthound/profiler/integration/protocols.py +243 -0
  444. truthound/profiler/memory.py +1185 -0
  445. truthound/profiler/migration/__init__.py +60 -0
  446. truthound/profiler/migration/base.py +345 -0
  447. truthound/profiler/migration/manager.py +444 -0
  448. truthound/profiler/migration/v1_0_to_v1_1.py +484 -0
  449. truthound/profiler/ml/__init__.py +73 -0
  450. truthound/profiler/ml/base.py +244 -0
  451. truthound/profiler/ml/classifier.py +507 -0
  452. truthound/profiler/ml/feature_extraction.py +604 -0
  453. truthound/profiler/ml/pretrained.py +448 -0
  454. truthound/profiler/ml_inference.py +1276 -0
  455. truthound/profiler/native_patterns.py +815 -0
  456. truthound/profiler/observability.py +1184 -0
  457. truthound/profiler/process_timeout.py +1566 -0
  458. truthound/profiler/progress.py +568 -0
  459. truthound/profiler/progress_callbacks.py +1734 -0
  460. truthound/profiler/quality.py +1345 -0
  461. truthound/profiler/resilience.py +1180 -0
  462. truthound/profiler/sampled_matcher.py +794 -0
  463. truthound/profiler/sampling.py +1288 -0
  464. truthound/profiler/scheduling/__init__.py +82 -0
  465. truthound/profiler/scheduling/protocols.py +214 -0
  466. truthound/profiler/scheduling/scheduler.py +474 -0
  467. truthound/profiler/scheduling/storage.py +457 -0
  468. truthound/profiler/scheduling/triggers.py +449 -0
  469. truthound/profiler/schema.py +603 -0
  470. truthound/profiler/streaming.py +685 -0
  471. truthound/profiler/streaming_patterns.py +1354 -0
  472. truthound/profiler/suite_cli.py +625 -0
  473. truthound/profiler/suite_config.py +789 -0
  474. truthound/profiler/suite_export.py +1268 -0
  475. truthound/profiler/table_profiler.py +547 -0
  476. truthound/profiler/timeout.py +565 -0
  477. truthound/profiler/validation.py +1532 -0
  478. truthound/profiler/visualization/__init__.py +118 -0
  479. truthound/profiler/visualization/base.py +346 -0
  480. truthound/profiler/visualization/generator.py +1259 -0
  481. truthound/profiler/visualization/plotly_renderer.py +811 -0
  482. truthound/profiler/visualization/renderers.py +669 -0
  483. truthound/profiler/visualization/sections.py +540 -0
  484. truthound/profiler/visualization.py +2122 -0
  485. truthound/profiler/yaml_validation.py +1151 -0
  486. truthound/py.typed +0 -0
  487. truthound/ratelimit/__init__.py +248 -0
  488. truthound/ratelimit/algorithms.py +1108 -0
  489. truthound/ratelimit/core.py +573 -0
  490. truthound/ratelimit/integration.py +532 -0
  491. truthound/ratelimit/limiter.py +663 -0
  492. truthound/ratelimit/middleware.py +700 -0
  493. truthound/ratelimit/policy.py +792 -0
  494. truthound/ratelimit/storage.py +763 -0
  495. truthound/rbac/__init__.py +340 -0
  496. truthound/rbac/core.py +976 -0
  497. truthound/rbac/integration.py +760 -0
  498. truthound/rbac/manager.py +1052 -0
  499. truthound/rbac/middleware.py +842 -0
  500. truthound/rbac/policy.py +954 -0
  501. truthound/rbac/storage.py +878 -0
  502. truthound/realtime/__init__.py +141 -0
  503. truthound/realtime/adapters/__init__.py +43 -0
  504. truthound/realtime/adapters/base.py +533 -0
  505. truthound/realtime/adapters/kafka.py +487 -0
  506. truthound/realtime/adapters/kinesis.py +479 -0
  507. truthound/realtime/adapters/mock.py +243 -0
  508. truthound/realtime/base.py +553 -0
  509. truthound/realtime/factory.py +382 -0
  510. truthound/realtime/incremental.py +660 -0
  511. truthound/realtime/processing/__init__.py +67 -0
  512. truthound/realtime/processing/exactly_once.py +575 -0
  513. truthound/realtime/processing/state.py +547 -0
  514. truthound/realtime/processing/windows.py +647 -0
  515. truthound/realtime/protocols.py +569 -0
  516. truthound/realtime/streaming.py +605 -0
  517. truthound/realtime/testing/__init__.py +32 -0
  518. truthound/realtime/testing/containers.py +615 -0
  519. truthound/realtime/testing/fixtures.py +484 -0
  520. truthound/report.py +280 -0
  521. truthound/reporters/__init__.py +46 -0
  522. truthound/reporters/_protocols.py +30 -0
  523. truthound/reporters/base.py +324 -0
  524. truthound/reporters/ci/__init__.py +66 -0
  525. truthound/reporters/ci/azure.py +436 -0
  526. truthound/reporters/ci/base.py +509 -0
  527. truthound/reporters/ci/bitbucket.py +567 -0
  528. truthound/reporters/ci/circleci.py +547 -0
  529. truthound/reporters/ci/detection.py +364 -0
  530. truthound/reporters/ci/factory.py +182 -0
  531. truthound/reporters/ci/github.py +388 -0
  532. truthound/reporters/ci/gitlab.py +471 -0
  533. truthound/reporters/ci/jenkins.py +525 -0
  534. truthound/reporters/console_reporter.py +299 -0
  535. truthound/reporters/factory.py +211 -0
  536. truthound/reporters/html_reporter.py +524 -0
  537. truthound/reporters/json_reporter.py +256 -0
  538. truthound/reporters/markdown_reporter.py +280 -0
  539. truthound/reporters/sdk/__init__.py +174 -0
  540. truthound/reporters/sdk/builder.py +558 -0
  541. truthound/reporters/sdk/mixins.py +1150 -0
  542. truthound/reporters/sdk/schema.py +1493 -0
  543. truthound/reporters/sdk/templates.py +666 -0
  544. truthound/reporters/sdk/testing.py +968 -0
  545. truthound/scanners.py +170 -0
  546. truthound/scheduling/__init__.py +122 -0
  547. truthound/scheduling/cron.py +1136 -0
  548. truthound/scheduling/presets.py +212 -0
  549. truthound/schema.py +275 -0
  550. truthound/secrets/__init__.py +173 -0
  551. truthound/secrets/base.py +618 -0
  552. truthound/secrets/cloud.py +682 -0
  553. truthound/secrets/integration.py +507 -0
  554. truthound/secrets/manager.py +633 -0
  555. truthound/secrets/oidc/__init__.py +172 -0
  556. truthound/secrets/oidc/base.py +902 -0
  557. truthound/secrets/oidc/credential_provider.py +623 -0
  558. truthound/secrets/oidc/exchangers.py +1001 -0
  559. truthound/secrets/oidc/github/__init__.py +110 -0
  560. truthound/secrets/oidc/github/claims.py +718 -0
  561. truthound/secrets/oidc/github/enhanced_provider.py +693 -0
  562. truthound/secrets/oidc/github/trust_policy.py +742 -0
  563. truthound/secrets/oidc/github/verification.py +723 -0
  564. truthound/secrets/oidc/github/workflow.py +691 -0
  565. truthound/secrets/oidc/providers.py +825 -0
  566. truthound/secrets/providers.py +506 -0
  567. truthound/secrets/resolver.py +495 -0
  568. truthound/stores/__init__.py +177 -0
  569. truthound/stores/backends/__init__.py +18 -0
  570. truthound/stores/backends/_protocols.py +340 -0
  571. truthound/stores/backends/azure_blob.py +530 -0
  572. truthound/stores/backends/concurrent_filesystem.py +915 -0
  573. truthound/stores/backends/connection_pool.py +1365 -0
  574. truthound/stores/backends/database.py +743 -0
  575. truthound/stores/backends/filesystem.py +538 -0
  576. truthound/stores/backends/gcs.py +399 -0
  577. truthound/stores/backends/memory.py +354 -0
  578. truthound/stores/backends/s3.py +434 -0
  579. truthound/stores/backpressure/__init__.py +84 -0
  580. truthound/stores/backpressure/base.py +375 -0
  581. truthound/stores/backpressure/circuit_breaker.py +434 -0
  582. truthound/stores/backpressure/monitor.py +376 -0
  583. truthound/stores/backpressure/strategies.py +677 -0
  584. truthound/stores/base.py +551 -0
  585. truthound/stores/batching/__init__.py +65 -0
  586. truthound/stores/batching/base.py +305 -0
  587. truthound/stores/batching/buffer.py +370 -0
  588. truthound/stores/batching/store.py +248 -0
  589. truthound/stores/batching/writer.py +521 -0
  590. truthound/stores/caching/__init__.py +60 -0
  591. truthound/stores/caching/backends.py +684 -0
  592. truthound/stores/caching/base.py +356 -0
  593. truthound/stores/caching/store.py +305 -0
  594. truthound/stores/compression/__init__.py +193 -0
  595. truthound/stores/compression/adaptive.py +694 -0
  596. truthound/stores/compression/base.py +514 -0
  597. truthound/stores/compression/pipeline.py +868 -0
  598. truthound/stores/compression/providers.py +672 -0
  599. truthound/stores/compression/streaming.py +832 -0
  600. truthound/stores/concurrency/__init__.py +81 -0
  601. truthound/stores/concurrency/atomic.py +556 -0
  602. truthound/stores/concurrency/index.py +775 -0
  603. truthound/stores/concurrency/locks.py +576 -0
  604. truthound/stores/concurrency/manager.py +482 -0
  605. truthound/stores/encryption/__init__.py +297 -0
  606. truthound/stores/encryption/base.py +952 -0
  607. truthound/stores/encryption/keys.py +1191 -0
  608. truthound/stores/encryption/pipeline.py +903 -0
  609. truthound/stores/encryption/providers.py +953 -0
  610. truthound/stores/encryption/streaming.py +950 -0
  611. truthound/stores/expectations.py +227 -0
  612. truthound/stores/factory.py +246 -0
  613. truthound/stores/migration/__init__.py +75 -0
  614. truthound/stores/migration/base.py +480 -0
  615. truthound/stores/migration/manager.py +347 -0
  616. truthound/stores/migration/registry.py +382 -0
  617. truthound/stores/migration/store.py +559 -0
  618. truthound/stores/observability/__init__.py +106 -0
  619. truthound/stores/observability/audit.py +718 -0
  620. truthound/stores/observability/config.py +270 -0
  621. truthound/stores/observability/factory.py +208 -0
  622. truthound/stores/observability/metrics.py +636 -0
  623. truthound/stores/observability/protocols.py +410 -0
  624. truthound/stores/observability/store.py +570 -0
  625. truthound/stores/observability/tracing.py +784 -0
  626. truthound/stores/replication/__init__.py +76 -0
  627. truthound/stores/replication/base.py +260 -0
  628. truthound/stores/replication/monitor.py +269 -0
  629. truthound/stores/replication/store.py +439 -0
  630. truthound/stores/replication/syncer.py +391 -0
  631. truthound/stores/results.py +359 -0
  632. truthound/stores/retention/__init__.py +77 -0
  633. truthound/stores/retention/base.py +378 -0
  634. truthound/stores/retention/policies.py +621 -0
  635. truthound/stores/retention/scheduler.py +279 -0
  636. truthound/stores/retention/store.py +526 -0
  637. truthound/stores/streaming/__init__.py +138 -0
  638. truthound/stores/streaming/base.py +801 -0
  639. truthound/stores/streaming/database.py +984 -0
  640. truthound/stores/streaming/filesystem.py +719 -0
  641. truthound/stores/streaming/reader.py +629 -0
  642. truthound/stores/streaming/s3.py +843 -0
  643. truthound/stores/streaming/writer.py +790 -0
  644. truthound/stores/tiering/__init__.py +108 -0
  645. truthound/stores/tiering/base.py +462 -0
  646. truthound/stores/tiering/manager.py +249 -0
  647. truthound/stores/tiering/policies.py +692 -0
  648. truthound/stores/tiering/store.py +526 -0
  649. truthound/stores/versioning/__init__.py +56 -0
  650. truthound/stores/versioning/base.py +376 -0
  651. truthound/stores/versioning/store.py +660 -0
  652. truthound/stores/versioning/strategies.py +353 -0
  653. truthound/types.py +56 -0
  654. truthound/validators/__init__.py +774 -0
  655. truthound/validators/aggregate/__init__.py +27 -0
  656. truthound/validators/aggregate/central.py +116 -0
  657. truthound/validators/aggregate/extremes.py +116 -0
  658. truthound/validators/aggregate/spread.py +118 -0
  659. truthound/validators/aggregate/sum.py +64 -0
  660. truthound/validators/aggregate/type.py +78 -0
  661. truthound/validators/anomaly/__init__.py +93 -0
  662. truthound/validators/anomaly/base.py +431 -0
  663. truthound/validators/anomaly/ml_based.py +1190 -0
  664. truthound/validators/anomaly/multivariate.py +647 -0
  665. truthound/validators/anomaly/statistical.py +599 -0
  666. truthound/validators/base.py +1089 -0
  667. truthound/validators/business_rule/__init__.py +46 -0
  668. truthound/validators/business_rule/base.py +147 -0
  669. truthound/validators/business_rule/checksum.py +509 -0
  670. truthound/validators/business_rule/financial.py +526 -0
  671. truthound/validators/cache.py +733 -0
  672. truthound/validators/completeness/__init__.py +39 -0
  673. truthound/validators/completeness/conditional.py +73 -0
  674. truthound/validators/completeness/default.py +98 -0
  675. truthound/validators/completeness/empty.py +103 -0
  676. truthound/validators/completeness/nan.py +337 -0
  677. truthound/validators/completeness/null.py +152 -0
  678. truthound/validators/cross_table/__init__.py +17 -0
  679. truthound/validators/cross_table/aggregate.py +333 -0
  680. truthound/validators/cross_table/row_count.py +122 -0
  681. truthound/validators/datetime/__init__.py +29 -0
  682. truthound/validators/datetime/format.py +78 -0
  683. truthound/validators/datetime/freshness.py +269 -0
  684. truthound/validators/datetime/order.py +73 -0
  685. truthound/validators/datetime/parseable.py +185 -0
  686. truthound/validators/datetime/range.py +202 -0
  687. truthound/validators/datetime/timezone.py +69 -0
  688. truthound/validators/distribution/__init__.py +49 -0
  689. truthound/validators/distribution/distribution.py +128 -0
  690. truthound/validators/distribution/monotonic.py +119 -0
  691. truthound/validators/distribution/outlier.py +178 -0
  692. truthound/validators/distribution/quantile.py +80 -0
  693. truthound/validators/distribution/range.py +254 -0
  694. truthound/validators/distribution/set.py +125 -0
  695. truthound/validators/distribution/statistical.py +459 -0
  696. truthound/validators/drift/__init__.py +79 -0
  697. truthound/validators/drift/base.py +427 -0
  698. truthound/validators/drift/multi_feature.py +401 -0
  699. truthound/validators/drift/numeric.py +395 -0
  700. truthound/validators/drift/psi.py +446 -0
  701. truthound/validators/drift/statistical.py +510 -0
  702. truthound/validators/enterprise.py +1658 -0
  703. truthound/validators/geospatial/__init__.py +80 -0
  704. truthound/validators/geospatial/base.py +97 -0
  705. truthound/validators/geospatial/boundary.py +238 -0
  706. truthound/validators/geospatial/coordinate.py +351 -0
  707. truthound/validators/geospatial/distance.py +399 -0
  708. truthound/validators/geospatial/polygon.py +665 -0
  709. truthound/validators/i18n/__init__.py +308 -0
  710. truthound/validators/i18n/bidi.py +571 -0
  711. truthound/validators/i18n/catalogs.py +570 -0
  712. truthound/validators/i18n/dialects.py +763 -0
  713. truthound/validators/i18n/extended_catalogs.py +549 -0
  714. truthound/validators/i18n/formatting.py +1434 -0
  715. truthound/validators/i18n/loader.py +1020 -0
  716. truthound/validators/i18n/messages.py +521 -0
  717. truthound/validators/i18n/plural.py +683 -0
  718. truthound/validators/i18n/protocols.py +855 -0
  719. truthound/validators/i18n/tms.py +1162 -0
  720. truthound/validators/localization/__init__.py +53 -0
  721. truthound/validators/localization/base.py +122 -0
  722. truthound/validators/localization/chinese.py +362 -0
  723. truthound/validators/localization/japanese.py +275 -0
  724. truthound/validators/localization/korean.py +524 -0
  725. truthound/validators/memory/__init__.py +94 -0
  726. truthound/validators/memory/approximate_knn.py +506 -0
  727. truthound/validators/memory/base.py +547 -0
  728. truthound/validators/memory/sgd_online.py +719 -0
  729. truthound/validators/memory/streaming_ecdf.py +753 -0
  730. truthound/validators/ml_feature/__init__.py +54 -0
  731. truthound/validators/ml_feature/base.py +249 -0
  732. truthound/validators/ml_feature/correlation.py +299 -0
  733. truthound/validators/ml_feature/leakage.py +344 -0
  734. truthound/validators/ml_feature/null_impact.py +270 -0
  735. truthound/validators/ml_feature/scale.py +264 -0
  736. truthound/validators/multi_column/__init__.py +89 -0
  737. truthound/validators/multi_column/arithmetic.py +284 -0
  738. truthound/validators/multi_column/base.py +231 -0
  739. truthound/validators/multi_column/comparison.py +273 -0
  740. truthound/validators/multi_column/consistency.py +312 -0
  741. truthound/validators/multi_column/statistical.py +299 -0
  742. truthound/validators/optimization/__init__.py +164 -0
  743. truthound/validators/optimization/aggregation.py +563 -0
  744. truthound/validators/optimization/covariance.py +556 -0
  745. truthound/validators/optimization/geo.py +626 -0
  746. truthound/validators/optimization/graph.py +587 -0
  747. truthound/validators/optimization/orchestrator.py +970 -0
  748. truthound/validators/optimization/profiling.py +1312 -0
  749. truthound/validators/privacy/__init__.py +223 -0
  750. truthound/validators/privacy/base.py +635 -0
  751. truthound/validators/privacy/ccpa.py +670 -0
  752. truthound/validators/privacy/gdpr.py +728 -0
  753. truthound/validators/privacy/global_patterns.py +604 -0
  754. truthound/validators/privacy/plugins.py +867 -0
  755. truthound/validators/profiling/__init__.py +52 -0
  756. truthound/validators/profiling/base.py +175 -0
  757. truthound/validators/profiling/cardinality.py +312 -0
  758. truthound/validators/profiling/entropy.py +391 -0
  759. truthound/validators/profiling/frequency.py +455 -0
  760. truthound/validators/pushdown_support.py +660 -0
  761. truthound/validators/query/__init__.py +91 -0
  762. truthound/validators/query/aggregate.py +346 -0
  763. truthound/validators/query/base.py +246 -0
  764. truthound/validators/query/column.py +249 -0
  765. truthound/validators/query/expression.py +274 -0
  766. truthound/validators/query/result.py +323 -0
  767. truthound/validators/query/row_count.py +264 -0
  768. truthound/validators/referential/__init__.py +80 -0
  769. truthound/validators/referential/base.py +395 -0
  770. truthound/validators/referential/cascade.py +391 -0
  771. truthound/validators/referential/circular.py +563 -0
  772. truthound/validators/referential/foreign_key.py +624 -0
  773. truthound/validators/referential/orphan.py +485 -0
  774. truthound/validators/registry.py +112 -0
  775. truthound/validators/schema/__init__.py +41 -0
  776. truthound/validators/schema/column_count.py +142 -0
  777. truthound/validators/schema/column_exists.py +80 -0
  778. truthound/validators/schema/column_order.py +82 -0
  779. truthound/validators/schema/column_pair.py +85 -0
  780. truthound/validators/schema/column_pair_set.py +195 -0
  781. truthound/validators/schema/column_type.py +94 -0
  782. truthound/validators/schema/multi_column.py +53 -0
  783. truthound/validators/schema/multi_column_aggregate.py +175 -0
  784. truthound/validators/schema/referential.py +274 -0
  785. truthound/validators/schema/table_schema.py +91 -0
  786. truthound/validators/schema_validator.py +219 -0
  787. truthound/validators/sdk/__init__.py +250 -0
  788. truthound/validators/sdk/builder.py +680 -0
  789. truthound/validators/sdk/decorators.py +474 -0
  790. truthound/validators/sdk/enterprise/__init__.py +211 -0
  791. truthound/validators/sdk/enterprise/docs.py +725 -0
  792. truthound/validators/sdk/enterprise/fuzzing.py +659 -0
  793. truthound/validators/sdk/enterprise/licensing.py +709 -0
  794. truthound/validators/sdk/enterprise/manager.py +543 -0
  795. truthound/validators/sdk/enterprise/resources.py +628 -0
  796. truthound/validators/sdk/enterprise/sandbox.py +766 -0
  797. truthound/validators/sdk/enterprise/signing.py +603 -0
  798. truthound/validators/sdk/enterprise/templates.py +865 -0
  799. truthound/validators/sdk/enterprise/versioning.py +659 -0
  800. truthound/validators/sdk/templates.py +757 -0
  801. truthound/validators/sdk/testing.py +807 -0
  802. truthound/validators/security/__init__.py +181 -0
  803. truthound/validators/security/redos/__init__.py +182 -0
  804. truthound/validators/security/redos/core.py +861 -0
  805. truthound/validators/security/redos/cpu_monitor.py +593 -0
  806. truthound/validators/security/redos/cve_database.py +791 -0
  807. truthound/validators/security/redos/ml/__init__.py +155 -0
  808. truthound/validators/security/redos/ml/base.py +785 -0
  809. truthound/validators/security/redos/ml/datasets.py +618 -0
  810. truthound/validators/security/redos/ml/features.py +359 -0
  811. truthound/validators/security/redos/ml/models.py +1000 -0
  812. truthound/validators/security/redos/ml/predictor.py +507 -0
  813. truthound/validators/security/redos/ml/storage.py +632 -0
  814. truthound/validators/security/redos/ml/training.py +571 -0
  815. truthound/validators/security/redos/ml_analyzer.py +937 -0
  816. truthound/validators/security/redos/optimizer.py +674 -0
  817. truthound/validators/security/redos/profiler.py +682 -0
  818. truthound/validators/security/redos/re2_engine.py +709 -0
  819. truthound/validators/security/redos.py +886 -0
  820. truthound/validators/security/sql_security.py +1247 -0
  821. truthound/validators/streaming/__init__.py +126 -0
  822. truthound/validators/streaming/base.py +292 -0
  823. truthound/validators/streaming/completeness.py +210 -0
  824. truthound/validators/streaming/mixin.py +575 -0
  825. truthound/validators/streaming/range.py +308 -0
  826. truthound/validators/streaming/sources.py +846 -0
  827. truthound/validators/string/__init__.py +57 -0
  828. truthound/validators/string/casing.py +158 -0
  829. truthound/validators/string/charset.py +96 -0
  830. truthound/validators/string/format.py +501 -0
  831. truthound/validators/string/json.py +77 -0
  832. truthound/validators/string/json_schema.py +184 -0
  833. truthound/validators/string/length.py +104 -0
  834. truthound/validators/string/like_pattern.py +237 -0
  835. truthound/validators/string/regex.py +202 -0
  836. truthound/validators/string/regex_extended.py +435 -0
  837. truthound/validators/table/__init__.py +88 -0
  838. truthound/validators/table/base.py +78 -0
  839. truthound/validators/table/column_count.py +198 -0
  840. truthound/validators/table/freshness.py +362 -0
  841. truthound/validators/table/row_count.py +251 -0
  842. truthound/validators/table/schema.py +333 -0
  843. truthound/validators/table/size.py +285 -0
  844. truthound/validators/timeout/__init__.py +102 -0
  845. truthound/validators/timeout/advanced/__init__.py +247 -0
  846. truthound/validators/timeout/advanced/circuit_breaker.py +675 -0
  847. truthound/validators/timeout/advanced/prediction.py +773 -0
  848. truthound/validators/timeout/advanced/priority.py +618 -0
  849. truthound/validators/timeout/advanced/redis_backend.py +770 -0
  850. truthound/validators/timeout/advanced/retry.py +721 -0
  851. truthound/validators/timeout/advanced/sampling.py +788 -0
  852. truthound/validators/timeout/advanced/sla.py +661 -0
  853. truthound/validators/timeout/advanced/telemetry.py +804 -0
  854. truthound/validators/timeout/cascade.py +477 -0
  855. truthound/validators/timeout/deadline.py +657 -0
  856. truthound/validators/timeout/degradation.py +525 -0
  857. truthound/validators/timeout/distributed.py +597 -0
  858. truthound/validators/timeseries/__init__.py +89 -0
  859. truthound/validators/timeseries/base.py +326 -0
  860. truthound/validators/timeseries/completeness.py +617 -0
  861. truthound/validators/timeseries/gap.py +485 -0
  862. truthound/validators/timeseries/monotonic.py +310 -0
  863. truthound/validators/timeseries/seasonality.py +422 -0
  864. truthound/validators/timeseries/trend.py +510 -0
  865. truthound/validators/uniqueness/__init__.py +59 -0
  866. truthound/validators/uniqueness/approximate.py +475 -0
  867. truthound/validators/uniqueness/distinct_values.py +253 -0
  868. truthound/validators/uniqueness/duplicate.py +118 -0
  869. truthound/validators/uniqueness/primary_key.py +140 -0
  870. truthound/validators/uniqueness/unique.py +191 -0
  871. truthound/validators/uniqueness/within_record.py +599 -0
  872. truthound/validators/utils.py +756 -0
  873. truthound-1.0.8.dist-info/METADATA +474 -0
  874. truthound-1.0.8.dist-info/RECORD +877 -0
  875. truthound-1.0.8.dist-info/WHEEL +4 -0
  876. truthound-1.0.8.dist-info/entry_points.txt +2 -0
  877. truthound-1.0.8.dist-info/licenses/LICENSE +190 -0
@@ -0,0 +1,1354 @@
1
+ """Streaming Pattern Matching with Chunk Integration.
2
+
3
+ This module provides chunk-aware pattern matching for streaming data processing.
4
+ It solves the problem of pattern detection across chunk boundaries by maintaining
5
+ state and aggregating pattern statistics across multiple chunks.
6
+
7
+ Key features:
8
+ - Chunk-aware pattern state management
9
+ - Pluggable aggregation strategies
10
+ - Cross-chunk pattern boundary detection
11
+ - Statistical confidence tracking across chunks
12
+ - Memory-efficient incremental processing
13
+ - Integration with existing streaming profiler
14
+
15
+ Design Principles:
16
+ - Strategy Pattern: Aggregation strategies are pluggable
17
+ - Observer Pattern: Callbacks for pattern events
18
+ - State Pattern: Chunk state management
19
+ - Template Method: Customizable aggregation pipeline
20
+
21
+ Example:
22
+ from truthound.profiler.streaming_patterns import (
23
+ StreamingPatternMatcher,
24
+ IncrementalAggregation,
25
+ )
26
+
27
+ matcher = StreamingPatternMatcher(
28
+ aggregation_strategy=IncrementalAggregation(),
29
+ )
30
+
31
+ # Process chunks
32
+ for chunk in chunks:
33
+ matcher.process_chunk(chunk, "column_name")
34
+
35
+ # Get final results
36
+ results = matcher.finalize()
37
+ """
38
+
39
+ from __future__ import annotations
40
+
41
+ import logging
42
+ import threading
43
+ import time
44
+ from abc import ABC, abstractmethod
45
+ from dataclasses import dataclass, field
46
+ from datetime import datetime
47
+ from enum import Enum
48
+ from typing import (
49
+ Any,
50
+ Callable,
51
+ Generic,
52
+ Iterator,
53
+ Protocol,
54
+ Sequence,
55
+ TypeVar,
56
+ )
57
+
58
+ import polars as pl
59
+
60
+ from truthound.profiler.base import DataType, PatternMatch
61
+ from truthound.profiler.native_patterns import (
62
+ BUILTIN_PATTERNS,
63
+ NativePatternMatcher,
64
+ PatternMatchResult,
65
+ PatternRegistry,
66
+ PatternSpec,
67
+ )
68
+ from truthound.profiler.sampling import (
69
+ SamplingConfig,
70
+ SamplingMetrics,
71
+ )
72
+
73
+ logger = logging.getLogger(__name__)
74
+
75
+
76
+ # =============================================================================
77
+ # Types and Enums
78
+ # =============================================================================
79
+
80
+
81
class AggregationMethod(str, Enum):
    """Names of the available ways to combine per-chunk pattern statistics."""

    # Running totals
    INCREMENTAL = "incremental"
    # Size-weighted averages
    WEIGHTED = "weighted"
    # Recent chunks only
    SLIDING_WINDOW = "sliding_window"
    # Exponential moving average
    EXPONENTIAL = "exponential"
    # Reservoir-based sampling
    RESERVOIR = "reservoir"
    # Agreement across chunks
    CONSENSUS = "consensus"
    # Auto-select based on data
    ADAPTIVE = "adaptive"
91
+
92
+
93
class ChunkProcessingStatus(str, Enum):
    """Lifecycle states a chunk can be in while it is being processed."""

    PENDING = "pending"        # not started yet
    PROCESSING = "processing"  # currently being worked on
    COMPLETED = "completed"    # finished
    FAILED = "failed"          # ended with an error
    SKIPPED = "skipped"        # deliberately not processed
101
+
102
+
103
+ # =============================================================================
104
+ # Pattern State Management
105
+ # =============================================================================
106
+
107
+
108
@dataclass
class PatternChunkStats:
    """Match statistics for one pattern within one chunk.

    This is the smallest unit of bookkeeping: how many values of a single
    chunk matched a single pattern. Instances are intended to be treated as
    read-only once built; note that the dataclass is not frozen, so this is
    a convention rather than something the class enforces.
    """

    pattern_name: str
    match_count: int
    total_count: int
    chunk_index: int
    processing_time_ms: float = 0.0

    @property
    def match_ratio(self) -> float:
        """Fraction of the chunk that matched; 0.0 when the chunk has no rows."""
        if self.total_count > 0:
            return self.match_count / self.total_count
        return 0.0

    def to_dict(self) -> dict[str, Any]:
        """Return the stored fields plus the derived ``match_ratio`` as a dict."""
        payload: dict[str, Any] = dict(
            pattern_name=self.pattern_name,
            match_count=self.match_count,
            total_count=self.total_count,
            match_ratio=self.match_ratio,
            chunk_index=self.chunk_index,
            processing_time_ms=self.processing_time_ms,
        )
        return payload
137
+
138
+
139
@dataclass
class PatternState:
    """Accumulated, mutable bookkeeping for one pattern over every chunk seen.

    Keeps both the per-chunk history (``chunk_stats``) and running totals so
    that aggregation strategies can work from either.
    """

    pattern: PatternSpec
    chunk_stats: list[PatternChunkStats] = field(default_factory=list)

    # Running totals, updated by add_chunk_stats()
    total_matches: int = 0
    total_rows: int = 0
    chunks_processed: int = 0

    # Example matching values, capped at max_samples entries
    sample_matches: list[str] = field(default_factory=list)
    max_samples: int = 10

    # Summed per-chunk processing time
    total_processing_time_ms: float = 0.0

    def add_chunk_stats(self, stats: PatternChunkStats) -> None:
        """Record one chunk's statistics and fold them into the running totals."""
        self.chunk_stats.append(stats)
        self.chunks_processed += 1
        self.total_matches += stats.match_count
        self.total_rows += stats.total_count
        self.total_processing_time_ms += stats.processing_time_ms

    def add_samples(self, samples: Sequence[str]) -> None:
        """Append samples until ``max_samples`` is reached; extras are dropped."""
        free_slots = self.max_samples - len(self.sample_matches)
        if free_slots <= 0:
            return
        self.sample_matches.extend(samples[:free_slots])

    @property
    def overall_match_ratio(self) -> float:
        """Total matches divided by total rows; 0.0 when no rows were seen."""
        if self.total_rows > 0:
            return self.total_matches / self.total_rows
        return 0.0

    @property
    def chunk_ratios(self) -> list[float]:
        """Per-chunk match ratios, in the order the chunks were added."""
        return [entry.match_ratio for entry in self.chunk_stats]

    @property
    def variance(self) -> float:
        """Sample variance (n - 1 denominator) of the per-chunk match ratios.

        Returns 0.0 when fewer than two chunks have been recorded.
        """
        if len(self.chunk_stats) < 2:
            return 0.0
        values = self.chunk_ratios
        count = len(values)
        mean = sum(values) / count
        return sum((v - mean) ** 2 for v in values) / (count - 1)

    @property
    def std_deviation(self) -> float:
        """Square root of ``variance``."""
        return self.variance ** 0.5

    @property
    def is_consistent(self) -> bool:
        """True with fewer than two chunks, or when the std deviation is below 0.1."""
        return len(self.chunk_stats) < 2 or self.std_deviation < 0.1

    def to_pattern_match(self) -> PatternMatch:
        """Build a legacy ``PatternMatch`` from the accumulated totals."""
        spec = self.pattern
        return PatternMatch(
            pattern=spec.name,
            regex=spec.regex,
            match_ratio=self.overall_match_ratio,
            sample_matches=tuple(self.sample_matches),
        )
214
+
215
+
216
@dataclass
class ColumnPatternState:
    """All pattern bookkeeping for one column, plus chunk/row counters and timing."""

    column_name: str
    pattern_states: dict[str, PatternState] = field(default_factory=dict)
    chunks_processed: int = 0
    total_rows: int = 0
    started_at: datetime = field(default_factory=datetime.now)
    completed_at: datetime | None = None

    def get_or_create_pattern_state(self, pattern: PatternSpec) -> PatternState:
        """Return the state stored under ``pattern.name``, creating it on first use."""
        key = pattern.name
        states = self.pattern_states
        if key in states:
            return states[key]
        created = PatternState(pattern=pattern)
        states[key] = created
        return created

    def add_chunk(self, chunk_rows: int) -> None:
        """Count one more processed chunk and add its row count to the total."""
        self.total_rows += chunk_rows
        self.chunks_processed += 1

    def finalize(self) -> None:
        """Stamp ``completed_at`` with the current time."""
        self.completed_at = datetime.now()

    @property
    def processing_duration_ms(self) -> float:
        """Milliseconds from ``started_at`` to ``completed_at`` (or to now if unfinished)."""
        finished = self.completed_at if self.completed_at else datetime.now()
        elapsed = finished - self.started_at
        return elapsed.total_seconds() * 1000
247
+
248
+
249
+ # =============================================================================
250
+ # Aggregation Strategies
251
+ # =============================================================================
252
+
253
+
254
class AggregationStrategy(ABC):
    """Interface for turning accumulated per-chunk statistics into a final result.

    A strategy decides how the statistics gathered from several chunks are
    combined. Implement both abstract methods in a subclass to provide
    custom aggregation behavior.
    """

    # Short identifier for the strategy; subclasses override it.
    name: str = "base"

    @abstractmethod
    def aggregate(
        self,
        state: PatternState,
        min_match_ratio: float = 0.8,
    ) -> PatternMatchResult | None:
        """Combine the statistics held in ``state`` into one result.

        Args:
            state: Pattern state carrying every chunk's statistics.
            min_match_ratio: Lowest ratio that still counts as a match.

        Returns:
            The aggregated PatternMatchResult, or None when the pattern
            does not qualify.
        """

    @abstractmethod
    def should_include_pattern(
        self,
        state: PatternState,
        min_match_ratio: float,
    ) -> bool:
        """Decide whether the pattern belongs in the results.

        Args:
            state: Pattern state to judge.
            min_match_ratio: Ratio threshold to apply.

        Returns:
            True when the pattern should be reported.
        """
298
+
299
+
300
class IncrementalAggregation(AggregationStrategy):
    """Aggregation from running totals.

    The result ratio is simply all matches divided by all rows seen so far.
    """

    name = "incremental"

    def aggregate(
        self,
        state: PatternState,
        min_match_ratio: float = 0.8,
    ) -> PatternMatchResult | None:
        """Build a result from the running totals, or return None below the threshold."""
        if self.should_include_pattern(state, min_match_ratio):
            return PatternMatchResult(
                pattern=state.pattern,
                match_count=state.total_matches,
                total_count=state.total_rows,
                match_ratio=state.overall_match_ratio,
                sample_matches=tuple(state.sample_matches),
            )
        return None

    def should_include_pattern(
        self,
        state: PatternState,
        min_match_ratio: float,
    ) -> bool:
        """True when the overall match ratio reaches ``min_match_ratio``."""
        overall = state.overall_match_ratio
        return overall >= min_match_ratio
333
+
334
+
335
class WeightedAggregation(AggregationStrategy):
    """Chunk-size-weighted aggregation.

    Larger chunks contribute proportionally more to the final ratio.
    """

    name = "weighted"

    def aggregate(
        self,
        state: PatternState,
        min_match_ratio: float = 0.8,
    ) -> PatternMatchResult | None:
        """Build a size-weighted result, or return None below the threshold."""
        if self.should_include_pattern(state, min_match_ratio):
            # With weights proportional to row counts, the weighted mean of
            # the per-chunk ratios equals total matches / total rows, so the
            # running totals are used directly.
            return PatternMatchResult(
                pattern=state.pattern,
                match_count=state.total_matches,
                total_count=state.total_rows,
                match_ratio=state.overall_match_ratio,
                sample_matches=tuple(state.sample_matches),
            )
        return None

    def should_include_pattern(
        self,
        state: PatternState,
        min_match_ratio: float,
    ) -> bool:
        """True when the weighted (overall) ratio reaches ``min_match_ratio``."""
        weighted_ratio = state.overall_match_ratio
        return weighted_ratio >= min_match_ratio
370
+
371
+
372
class SlidingWindowAggregation(AggregationStrategy):
    """Aggregation using only recent chunks.

    Useful for detecting patterns in recent data when older
    data may have different characteristics.
    """

    name = "sliding_window"

    def __init__(self, window_size: int = 5):
        """Initialize with window size.

        Args:
            window_size: Number of recent chunks to consider (must be >= 1).

        Raises:
            ValueError: If ``window_size`` is smaller than 1.
        """
        # A size of 0 would make ``chunk_stats[-0:]`` select *every* chunk,
        # and a negative size would drop the oldest chunks instead of keeping
        # the newest ones, so reject both up front.
        if window_size < 1:
            raise ValueError(f"window_size must be at least 1, got {window_size}")
        self.window_size = window_size

    def _window_totals(self, state: PatternState) -> tuple[int, int, float] | None:
        """Sum matches and rows over the most recent ``window_size`` chunks.

        Returns:
            ``(match_count, total_count, match_ratio)`` for the window, or
            None when no chunk has been recorded yet. The ratio is 0.0 when
            the window contains no rows.
        """
        recent = state.chunk_stats[-self.window_size:]
        if not recent:
            return None

        total_matches = sum(s.match_count for s in recent)
        total_rows = sum(s.total_count for s in recent)
        match_ratio = total_matches / total_rows if total_rows > 0 else 0.0
        return total_matches, total_rows, match_ratio

    def aggregate(
        self,
        state: PatternState,
        min_match_ratio: float = 0.8,
    ) -> PatternMatchResult | None:
        """Aggregate using recent chunks only.

        Args:
            state: Pattern state with all chunk statistics
            min_match_ratio: Minimum ratio to consider a match

        Returns:
            PatternMatchResult whose counts and ratio cover the window only,
            or None if the pattern is not included. ``sample_matches`` is
            taken from the whole state, not just the window.
        """
        # Go through should_include_pattern so subclass overrides are honored.
        if not self.should_include_pattern(state, min_match_ratio):
            return None

        totals = self._window_totals(state)
        if totals is None:
            return None
        total_matches, total_rows, match_ratio = totals

        return PatternMatchResult(
            pattern=state.pattern,
            match_count=total_matches,
            total_count=total_rows,
            match_ratio=match_ratio,
            sample_matches=tuple(state.sample_matches),
        )

    def should_include_pattern(
        self,
        state: PatternState,
        min_match_ratio: float,
    ) -> bool:
        """Include based on recent chunks.

        Args:
            state: Pattern state
            min_match_ratio: Minimum ratio threshold

        Returns:
            True if the window's match ratio reaches ``min_match_ratio``;
            False when no chunk has been recorded.
        """
        totals = self._window_totals(state)
        if totals is None:
            return False
        return totals[2] >= min_match_ratio
429
+
430
+
431
class ExponentialAggregation(AggregationStrategy):
    """Aggregation by exponential moving average of the per-chunk ratios.

    Later chunks weigh more than earlier ones; ``alpha`` sets how quickly
    older chunks fade (a larger alpha favors recent chunks more strongly).
    """

    name = "exponential"

    def __init__(self, alpha: float = 0.3):
        """Store the smoothing factor.

        Args:
            alpha: Smoothing factor in the range (0, 1].

        Raises:
            ValueError: If ``alpha`` lies outside (0, 1].
        """
        if 0 < alpha <= 1:
            self.alpha = alpha
        else:
            raise ValueError(f"alpha must be between 0 and 1, got {alpha}")

    def aggregate(
        self,
        state: PatternState,
        min_match_ratio: float = 0.8,
    ) -> PatternMatchResult | None:
        """Return a result whose ratio is the EMA, or None below the threshold.

        The reported counts are still the overall running totals.
        """
        if self.should_include_pattern(state, min_match_ratio):
            smoothed_ratio = self._calculate_ema(state.chunk_ratios)
            return PatternMatchResult(
                pattern=state.pattern,
                match_count=state.total_matches,
                total_count=state.total_rows,
                match_ratio=smoothed_ratio,
                sample_matches=tuple(state.sample_matches),
            )
        return None

    def _calculate_ema(self, ratios: list[float]) -> float:
        """Fold ``ratios`` into an exponential moving average (0.0 if empty).

        The first ratio seeds the average; every later ratio is blended in
        with weight ``alpha``.
        """
        if not ratios:
            return 0.0

        seed, *later = ratios
        smoothed = seed
        for current in later:
            smoothed = self.alpha * current + (1 - self.alpha) * smoothed
        return smoothed

    def should_include_pattern(
        self,
        state: PatternState,
        min_match_ratio: float,
    ) -> bool:
        """True when at least one chunk exists and the EMA reaches the threshold."""
        ratios = state.chunk_ratios
        return bool(ratios) and self._calculate_ema(ratios) >= min_match_ratio
489
+
490
+
491
class ConsensusAggregation(AggregationStrategy):
    """Aggregation that demands agreement among chunks.

    A pattern qualifies only when a large enough share of the individual
    chunks each reach the match threshold on their own.
    """

    name = "consensus"

    def __init__(self, consensus_threshold: float = 0.8):
        """Store the required share of agreeing chunks.

        Args:
            consensus_threshold: Share of chunks that must match, in (0, 1].

        Raises:
            ValueError: If ``consensus_threshold`` lies outside (0, 1].
        """
        if 0 < consensus_threshold <= 1:
            self.consensus_threshold = consensus_threshold
        else:
            raise ValueError(
                f"consensus_threshold must be between 0 and 1, got {consensus_threshold}"
            )

    def aggregate(
        self,
        state: PatternState,
        min_match_ratio: float = 0.8,
    ) -> PatternMatchResult | None:
        """Return a totals-based result when consensus is reached, else None."""
        if self.should_include_pattern(state, min_match_ratio):
            return PatternMatchResult(
                pattern=state.pattern,
                match_count=state.total_matches,
                total_count=state.total_rows,
                match_ratio=state.overall_match_ratio,
                sample_matches=tuple(state.sample_matches),
            )
        return None

    def should_include_pattern(
        self,
        state: PatternState,
        min_match_ratio: float,
    ) -> bool:
        """True when enough chunks individually reach ``min_match_ratio``.

        Returns False when no chunk has been recorded.
        """
        chunks = state.chunk_stats
        if not chunks:
            return False

        # Chunks whose own ratio clears the threshold
        agreeing = [entry for entry in chunks if entry.match_ratio >= min_match_ratio]
        share = len(agreeing) / len(chunks)
        return share >= self.consensus_threshold
545
+
546
+
547
class AdaptiveAggregation(AggregationStrategy):
    """Delegate to another strategy chosen from the state's characteristics.

    The choice depends on how many chunks have been recorded, whether
    the state reports itself as consistent, and the standard deviation
    of its ratios (see ``_select_strategy``).
    """

    name = "adaptive"

    def __init__(self) -> None:
        """Create the sub-strategies this class can delegate to."""
        self._strategies = {}
        self._strategies["incremental"] = IncrementalAggregation()
        self._strategies["exponential"] = ExponentialAggregation(alpha=0.3)
        self._strategies["consensus"] = ConsensusAggregation(consensus_threshold=0.7)

    def aggregate(
        self,
        state: PatternState,
        min_match_ratio: float = 0.8,
    ) -> PatternMatchResult | None:
        """Aggregate ``state`` with the strategy selected for it."""
        chosen = self._select_strategy(state)
        logger.debug(
            f"Adaptive aggregation selected '{chosen.name}' for pattern '{state.pattern.name}'"
        )
        return chosen.aggregate(state, min_match_ratio)

    def should_include_pattern(
        self,
        state: PatternState,
        min_match_ratio: float,
    ) -> bool:
        """Apply the inclusion rule of the strategy selected for ``state``."""
        return self._select_strategy(state).should_include_pattern(
            state, min_match_ratio
        )

    def _select_strategy(self, state: PatternState) -> AggregationStrategy:
        """Pick the sub-strategy for ``state``.

        - fewer than 3 chunks, or a consistent state: incremental
        - standard deviation above 0.2: consensus
        - anything else: exponential
        """
        if len(state.chunk_stats) < 3 or state.is_consistent:
            # Too little data, or stable data: simple aggregation is enough.
            choice = "incremental"
        elif state.std_deviation > 0.2:
            # High spread between chunks: require agreement.
            choice = "consensus"
        else:
            choice = "exponential"
        return self._strategies[choice]
603
+
604
+
605
+ # =============================================================================
606
+ # Aggregation Strategy Registry
607
+ # =============================================================================
608
+
609
+
610
class AggregationStrategyRegistry:
    """Name-indexed registry of aggregation strategies.

    Built-in strategies are registered on construction; custom ones can
    be added with ``register`` and are looked up by their ``name``
    attribute. Access to the internal mapping is guarded by a
    re-entrant lock.
    """

    def __init__(self) -> None:
        """Create the registry and register the built-in strategies."""
        self._strategies: dict[str, AggregationStrategy] = {}
        self._lock = threading.RLock()
        self._register_defaults()

    def _register_defaults(self) -> None:
        """Register one instance of each built-in strategy."""
        for strategy in (
            IncrementalAggregation(),
            WeightedAggregation(),
            SlidingWindowAggregation(),
            ExponentialAggregation(),
            ConsensusAggregation(),
            AdaptiveAggregation(),
        ):
            self.register(strategy)

    def register(self, strategy: AggregationStrategy) -> None:
        """Register ``strategy`` under ``strategy.name``.

        An existing entry with the same name is replaced.
        """
        with self._lock:
            self._strategies[strategy.name] = strategy
            logger.debug(f"Registered aggregation strategy: {strategy.name}")

    def get(self, name: str) -> AggregationStrategy:
        """Return the strategy registered under ``name``.

        Raises:
            KeyError: If no strategy is registered under ``name``; the
                message lists the available names.
        """
        with self._lock:
            try:
                return self._strategies[name]
            except KeyError:
                available = list(self._strategies.keys())
                raise KeyError(
                    f"Unknown aggregation strategy: '{name}'. Available: {available}"
                ) from None

    def get_or_default(
        self,
        name: str,
        default: AggregationStrategy | None = None,
    ) -> AggregationStrategy:
        """Return the strategy registered under ``name``, or a fallback.

        Args:
            name: Strategy name to look up.
            default: Fallback returned when ``name`` is unknown. When it
                is ``None`` a new ``AdaptiveAggregation`` is created.

        Returns:
            The registered strategy, ``default``, or a fresh
            ``AdaptiveAggregation``.
        """
        try:
            return self.get(name)
        except KeyError:
            # Compare against None explicitly: a caller-supplied strategy
            # that happens to be falsy must still be honoured.
            if default is not None:
                return default
            return AdaptiveAggregation()

    def list_strategies(self) -> list[str]:
        """Return the names of all registered strategies."""
        with self._lock:
            return list(self._strategies)

    def create_from_method(self, method: AggregationMethod) -> AggregationStrategy:
        """Return the strategy registered under ``method.value``.

        Raises:
            KeyError: If no strategy is registered under that value.
        """
        return self.get(method.value)
665
+
666
+
667
# Module-level registry shared by this module. Constructing it registers the
# built-in strategies, so they can be resolved by name right after import.
aggregation_strategy_registry = AggregationStrategyRegistry()
669
+
670
+
671
+ # =============================================================================
672
+ # Streaming Pattern Matcher Result
673
+ # =============================================================================
674
+
675
+
676
@dataclass
class StreamingPatternResult:
    """Result of streaming pattern matching for one column.

    Holds the aggregated pattern matches together with metadata about
    the streaming run that produced them.
    """

    # Name of the analysed column.
    column: str
    # Aggregated matches that passed the aggregation strategy.
    matches: list[PatternMatchResult]
    # Number of chunks recorded for the column.
    chunks_processed: int
    # Number of rows recorded for the column.
    total_rows: int
    # Processing duration in milliseconds.
    processing_time_ms: float
    # Name of the aggregation strategy that produced ``matches``.
    aggregation_method: str
    # Data type taken from the best match, if any.
    inferred_type: DataType | None = None

    # Per-pattern statistics, keyed by pattern name.
    pattern_stats: dict[str, dict[str, Any]] = field(default_factory=dict)

    @property
    def has_matches(self) -> bool:
        """Return True when at least one pattern matched."""
        return bool(self.matches)

    @property
    def best_match(self) -> PatternMatchResult | None:
        """Return the match with the highest ratio, or None without matches.

        The maximum is computed here rather than taken from position 0,
        so the result is correct even when ``matches`` was not sorted by
        the caller. ``max`` returns the first of several equal maxima,
        so a list already sorted by descending ratio still yields its
        first element.
        """
        if not self.matches:
            return None
        return max(self.matches, key=lambda match: match.match_ratio)

    def to_dict(self) -> dict[str, Any]:
        """Return a plain-dictionary view of the result.

        Each match is reduced to its pattern name, ratio and counts;
        ``inferred_type`` is represented by its ``value`` (or ``None``).
        """
        serialized_matches = [
            {
                "pattern_name": match.pattern.name,
                "match_ratio": match.match_ratio,
                "match_count": match.match_count,
                "total_count": match.total_count,
            }
            for match in self.matches
        ]
        return {
            "column": self.column,
            "matches": serialized_matches,
            "chunks_processed": self.chunks_processed,
            "total_rows": self.total_rows,
            "processing_time_ms": self.processing_time_ms,
            "aggregation_method": self.aggregation_method,
            "inferred_type": self.inferred_type.value if self.inferred_type else None,
            "pattern_stats": self.pattern_stats,
        }
725
+
726
+
727
+ # =============================================================================
728
+ # Pattern Event Callbacks
729
+ # =============================================================================
730
+
731
+
732
@dataclass
class PatternEvent:
    """Notification passed to the event callback during pattern processing."""

    # Kind of event. The matcher in this module emits "chunk_processed",
    # "chunk_skipped", "chunk_failed" and "processing_complete".
    event_type: str
    # Column the event refers to.
    column: str
    # Index of the chunk the event refers to.
    chunk_index: int
    # Event-specific payload; a fresh empty dict when not supplied.
    data: dict[str, Any] = field(default_factory=dict)
    # Creation time, taken from ``datetime.now`` (naive local time).
    timestamp: datetime = field(default_factory=datetime.now)
741
+
742
+
743
# Signature of event listeners: called with one PatternEvent, result ignored.
PatternEventCallback = Callable[[PatternEvent], None]
744
+
745
+
746
+ # =============================================================================
747
+ # Streaming Pattern Matcher Configuration
748
+ # =============================================================================
749
+
750
+
751
@dataclass
class StreamingPatternConfig:
    """Settings that control streaming pattern matching.

    Attributes:
        aggregation_method: How per-chunk statistics are aggregated.
        min_match_ratio: Minimum ratio for a pattern to count as matched.
        sample_size_per_chunk: Maximum number of samples kept per chunk.
        patterns: Pattern registry to use, if any.
        enable_early_termination: Whether processing may stop once every
            pattern is clearly matched or clearly not matched.
        early_termination_chunks: Number of chunks after which early
            termination is considered.
        early_termination_confidence: Confidence level associated with
            early termination.
        collect_statistics: Whether detailed per-pattern statistics are
            gathered.
    """

    aggregation_method: AggregationMethod = AggregationMethod.ADAPTIVE
    min_match_ratio: float = 0.8
    sample_size_per_chunk: int = 3
    patterns: PatternRegistry | None = None
    enable_early_termination: bool = True
    early_termination_chunks: int = 3
    early_termination_confidence: float = 0.95
    collect_statistics: bool = True

    def __post_init__(self) -> None:
        """Validate the ratio and the sample size.

        Raises:
            ValueError: If ``min_match_ratio`` is outside ``[0, 1]`` or
                ``sample_size_per_chunk`` is negative.
        """
        # Chained comparison so that NaN is rejected as well.
        ratio_in_range = 0.0 <= self.min_match_ratio <= 1.0
        if not ratio_in_range:
            raise ValueError(
                f"min_match_ratio must be between 0 and 1, got {self.min_match_ratio}"
            )
        if self.sample_size_per_chunk < 0:
            raise ValueError(
                f"sample_size_per_chunk must be non-negative, got {self.sample_size_per_chunk}"
            )

    @classmethod
    def fast(cls) -> "StreamingPatternConfig":
        """Return a preset tuned for speed."""
        options = {
            "aggregation_method": AggregationMethod.INCREMENTAL,
            "min_match_ratio": 0.7,
            "sample_size_per_chunk": 2,
            "enable_early_termination": True,
            "early_termination_chunks": 2,
            "collect_statistics": False,
        }
        return cls(**options)

    @classmethod
    def accurate(cls) -> "StreamingPatternConfig":
        """Return a preset tuned for accuracy."""
        options = {
            "aggregation_method": AggregationMethod.CONSENSUS,
            "min_match_ratio": 0.85,
            "sample_size_per_chunk": 5,
            "enable_early_termination": False,
            "collect_statistics": True,
        }
        return cls(**options)

    @classmethod
    def balanced(cls) -> "StreamingPatternConfig":
        """Return the balanced preset (the default configuration)."""
        options = {
            "aggregation_method": AggregationMethod.ADAPTIVE,
            "min_match_ratio": 0.8,
            "sample_size_per_chunk": 3,
            "enable_early_termination": True,
            "early_termination_chunks": 3,
            "collect_statistics": True,
        }
        return cls(**options)
819
+
820
+
821
+ # =============================================================================
822
+ # Streaming Pattern Matcher
823
+ # =============================================================================
824
+
825
+
826
class StreamingPatternMatcher:
    """Chunk-aware pattern matcher for streaming data.

    This is the main interface for streaming pattern matching. It keeps
    per-column state across chunks and produces aggregated results with
    a configurable aggregation strategy.

    Example:
        # Basic usage
        matcher = StreamingPatternMatcher()

        for chunk in data_chunks:
            matcher.process_chunk(chunk, "column_name")

        result = matcher.finalize("column_name")
        for match in result.matches:
            print(f"{match.pattern.name}: {match.match_ratio:.2%}")

        # With configuration
        config = StreamingPatternConfig(
            aggregation_method=AggregationMethod.CONSENSUS,
            min_match_ratio=0.9,
        )
        matcher = StreamingPatternMatcher(config=config)

        # Process multiple columns
        for chunk in data_chunks:
            for col in ["email", "phone", "id"]:
                matcher.process_chunk(chunk, col)

        # Get all results
        results = matcher.finalize_all()
    """

    def __init__(
        self,
        config: StreamingPatternConfig | None = None,
        aggregation_strategy: AggregationStrategy | None = None,
        patterns: PatternRegistry | None = None,
        event_callback: PatternEventCallback | None = None,
    ):
        """Initialize the streaming pattern matcher.

        Args:
            config: Configuration for pattern matching; the balanced
                preset is used when omitted.
            aggregation_strategy: Strategy overriding the one implied by
                ``config.aggregation_method``.
            patterns: Pattern registry overriding ``config.patterns``;
                ``BUILTIN_PATTERNS`` is used when neither is given.
            event_callback: Callback invoked with pattern events.
        """
        self.config = config if config is not None else StreamingPatternConfig.balanced()

        # Overrides are compared against None explicitly, so an override
        # object that happens to be falsy (for example an empty registry,
        # if the registry type defines a length) is not silently replaced.
        if aggregation_strategy is not None:
            self._aggregation = aggregation_strategy
        else:
            self._aggregation = aggregation_strategy_registry.create_from_method(
                self.config.aggregation_method
            )

        if patterns is not None:
            self._patterns = patterns
        elif self.config.patterns is not None:
            self._patterns = self.config.patterns
        else:
            self._patterns = BUILTIN_PATTERNS
        self._event_callback = event_callback

        # Internal matcher for per-chunk pattern detection
        self._chunk_matcher = NativePatternMatcher(
            patterns=self._patterns,
            min_match_ratio=0.0,  # We'll filter ourselves after aggregation
            sample_size=self.config.sample_size_per_chunk,
        )

        # State management
        self._column_states: dict[str, ColumnPatternState] = {}
        self._lock = threading.RLock()

    @property
    def patterns(self) -> PatternRegistry:
        """Get the pattern registry."""
        return self._patterns

    @property
    def aggregation_strategy(self) -> AggregationStrategy:
        """Get the current aggregation strategy."""
        return self._aggregation

    def process_chunk(
        self,
        chunk: pl.LazyFrame | pl.DataFrame,
        column: str,
        chunk_index: int | None = None,
    ) -> ChunkProcessingStatus:
        """Process a single chunk for pattern matching.

        Updates the internal state of ``column`` with the pattern
        statistics found in the chunk.

        Args:
            chunk: DataFrame or LazyFrame chunk to process
            column: Column name to analyze
            chunk_index: Optional chunk index; defaults to the number of
                chunks already recorded for the column

        Returns:
            ``SKIPPED`` when early termination applies, ``FAILED`` when
            processing raised, ``COMPLETED`` otherwise.
        """
        start_time = time.perf_counter()

        # Ensure LazyFrame
        lf = chunk.lazy() if isinstance(chunk, pl.DataFrame) else chunk

        with self._lock:
            # Get or create column state
            if column not in self._column_states:
                self._column_states[column] = ColumnPatternState(column_name=column)

            col_state = self._column_states[column]
            idx = chunk_index if chunk_index is not None else col_state.chunks_processed

            # Check early termination
            if self._should_terminate_early(col_state):
                self._emit_event("chunk_skipped", column, idx, {"reason": "early_termination"})
                return ChunkProcessingStatus.SKIPPED

        # Broad catch is deliberate: a bad chunk is reported through the
        # returned status and a "chunk_failed" event instead of aborting
        # the whole stream.
        try:
            # Get chunk row count
            chunk_rows = lf.select(pl.len()).collect().item()
            if chunk_rows == 0:
                with self._lock:
                    col_state.add_chunk(0)
                return ChunkProcessingStatus.COMPLETED

            # Run pattern matching on chunk
            chunk_results = self._chunk_matcher.match_column(lf, column)

            # Get total non-null count
            total_count = (
                lf.select(pl.col(column).is_not_null().sum())
                .collect()
                .item()
            )

            elapsed_ms = (time.perf_counter() - start_time) * 1000

            with self._lock:
                self._record_pattern_stats(
                    col_state, chunk_results, total_count, idx, elapsed_ms
                )
                col_state.add_chunk(chunk_rows)

            self._emit_event("chunk_processed", column, idx, {
                "rows": chunk_rows,
                "patterns_detected": len(chunk_results),
                "processing_time_ms": elapsed_ms,
            })

            return ChunkProcessingStatus.COMPLETED

        except Exception as e:
            logger.error(f"Failed to process chunk {idx} for column '{column}': {e}")
            self._emit_event("chunk_failed", column, idx, {"error": str(e)})
            return ChunkProcessingStatus.FAILED

    def _record_pattern_stats(
        self,
        col_state: ColumnPatternState,
        chunk_results: list[PatternMatchResult],
        total_count: int,
        chunk_index: int,
        elapsed_ms: float,
    ) -> None:
        """Record one chunk's statistics for every registered pattern.

        Patterns present in ``chunk_results`` are recorded with their own
        counts and samples; all others are recorded with zero matches out
        of ``total_count`` values. The caller is expected to hold the lock.
        """
        registered = list(self._patterns)
        if not registered:
            return

        # The elapsed time is split evenly across patterns; computed once
        # instead of re-materialising the registry for every pattern.
        per_pattern_ms = elapsed_ms / len(registered)

        # Index results by pattern name; the first result per name wins,
        # matching a first-match linear scan.
        results_by_name: dict[str, Any] = {}
        for chunk_result in chunk_results:
            results_by_name.setdefault(chunk_result.pattern.name, chunk_result)

        for pattern in registered:
            pattern_state = col_state.get_or_create_pattern_state(pattern)
            result = results_by_name.get(pattern.name)

            if result is not None:
                # Pattern was found in this chunk
                match_count, rows = result.match_count, result.total_count
            else:
                # Pattern not found - record zero matches
                match_count, rows = 0, total_count

            pattern_state.add_chunk_stats(
                PatternChunkStats(
                    pattern_name=pattern.name,
                    match_count=match_count,
                    total_count=rows,
                    chunk_index=chunk_index,
                    processing_time_ms=per_pattern_ms,
                )
            )
            if result is not None:
                pattern_state.add_samples(result.sample_matches)

    def process_chunks(
        self,
        chunks: Iterator[pl.LazyFrame | pl.DataFrame],
        column: str,
    ) -> list[ChunkProcessingStatus]:
        """Process multiple chunks in sequence.

        Processing stops after the first chunk reported as ``SKIPPED``,
        i.e. once early termination has triggered.

        Args:
            chunks: Iterator of chunks to process
            column: Column name to analyze

        Returns:
            List of processing statuses, one per chunk consumed
        """
        statuses: list[ChunkProcessingStatus] = []
        for chunk in chunks:
            status = self.process_chunk(chunk, column)
            statuses.append(status)
            if status == ChunkProcessingStatus.SKIPPED:
                # Early termination triggered
                break
        return statuses

    def finalize(self, column: str) -> StreamingPatternResult:
        """Finalize pattern matching for a column.

        Aggregates all chunk statistics into final results. A column
        that was never processed yields an empty result.

        Args:
            column: Column name to finalize

        Returns:
            StreamingPatternResult with aggregated matches, sorted by
            descending match ratio and then descending pattern priority
        """
        with self._lock:
            if column not in self._column_states:
                return StreamingPatternResult(
                    column=column,
                    matches=[],
                    chunks_processed=0,
                    total_rows=0,
                    processing_time_ms=0.0,
                    aggregation_method=self._aggregation.name,
                )

            col_state = self._column_states[column]
            col_state.finalize()

            # Aggregate pattern statistics
            matches = []
            pattern_stats = {}

            for pattern_name, pattern_state in col_state.pattern_states.items():
                result = self._aggregation.aggregate(
                    pattern_state,
                    self.config.min_match_ratio,
                )

                if result is not None:
                    matches.append(result)

                if self.config.collect_statistics:
                    pattern_stats[pattern_name] = {
                        "total_matches": pattern_state.total_matches,
                        "total_rows": pattern_state.total_rows,
                        "overall_ratio": pattern_state.overall_match_ratio,
                        "chunks_with_matches": sum(
                            1 for s in pattern_state.chunk_stats if s.match_count > 0
                        ),
                        "variance": pattern_state.variance,
                        "is_consistent": pattern_state.is_consistent,
                    }

            # Sort by match ratio
            matches.sort(key=lambda r: (-r.match_ratio, -r.pattern.priority))

            # Infer type from best match
            inferred_type = matches[0].pattern.data_type if matches else None

            self._emit_event("processing_complete", column, col_state.chunks_processed, {
                "matches": len(matches),
                "total_rows": col_state.total_rows,
            })

            return StreamingPatternResult(
                column=column,
                matches=matches,
                chunks_processed=col_state.chunks_processed,
                total_rows=col_state.total_rows,
                processing_time_ms=col_state.processing_duration_ms,
                aggregation_method=self._aggregation.name,
                inferred_type=inferred_type,
                pattern_stats=pattern_stats,
            )

    def finalize_all(self) -> dict[str, StreamingPatternResult]:
        """Finalize pattern matching for all processed columns.

        Returns:
            Dictionary mapping column names to their results
        """
        with self._lock:
            columns = list(self._column_states.keys())

        return {column: self.finalize(column) for column in columns}

    def reset(self, column: str | None = None) -> None:
        """Reset state for a column or all columns.

        Args:
            column: Column to reset, or None to reset all
        """
        with self._lock:
            if column is None:
                self._column_states.clear()
            else:
                self._column_states.pop(column, None)

    def get_current_state(self, column: str) -> ColumnPatternState | None:
        """Get current state for a column (for monitoring).

        Args:
            column: Column name

        Returns:
            Current column state or None
        """
        with self._lock:
            return self._column_states.get(column)

    def _should_terminate_early(self, state: ColumnPatternState) -> bool:
        """Check if early termination should be triggered.

        Termination requires that early termination is enabled, that the
        column has recorded at least ``early_termination_chunks`` chunks,
        and that every pattern with at least two recorded chunks is
        clearly above or clearly below ``min_match_ratio`` (by more than
        two standard deviations).

        If no pattern has at least two recorded chunks there is no
        evidence to decide on, so processing continues.
        """
        if not self.config.enable_early_termination:
            return False

        if state.chunks_processed < self.config.early_termination_chunks:
            return False

        threshold = self.config.min_match_ratio
        evaluated = 0

        # Check if all patterns are clearly above or below threshold
        for pattern_state in state.pattern_states.values():
            if pattern_state.chunks_processed < 2:
                # Not enough chunks to estimate the spread for this pattern.
                continue
            evaluated += 1

            ratio = pattern_state.overall_match_ratio
            margin = 2 * pattern_state.std_deviation

            clearly_matching = ratio - margin > threshold
            clearly_not_matching = ratio + margin < threshold
            if not (clearly_matching or clearly_not_matching):
                # Pattern is uncertain - continue processing
                return False

        # Terminate only when at least one pattern was actually judged.
        return evaluated > 0

    def _emit_event(
        self,
        event_type: str,
        column: str,
        chunk_index: int,
        data: dict[str, Any],
    ) -> None:
        """Emit a pattern event to the callback, if one is configured.

        Exceptions raised by the callback are logged as warnings and
        swallowed so that a faulty listener cannot break processing.
        """
        if self._event_callback:
            event = PatternEvent(
                event_type=event_type,
                column=column,
                chunk_index=chunk_index,
                data=data,
            )
            try:
                self._event_callback(event)
            except Exception as e:
                logger.warning(f"Event callback failed: {e}")
1196
+
1197
+
1198
+ # =============================================================================
1199
+ # Integration with StreamingProfiler
1200
+ # =============================================================================
1201
+
1202
+
1203
class StreamingPatternIntegration:
    """Adapter between StreamingProfiler and the streaming pattern matcher.

    Wraps a ``StreamingPatternMatcher`` and exposes the calls the
    profiler needs: feed a column chunk, read back the detected
    patterns, read back the inferred type, and reset.
    """

    def __init__(
        self,
        config: StreamingPatternConfig | None = None,
        patterns: PatternRegistry | None = None,
    ):
        """Create the wrapped matcher.

        Args:
            config: Pattern matching configuration; the balanced preset
                is used when none is given.
            patterns: Pattern registry handed to the matcher.
        """
        effective_config = config or StreamingPatternConfig.balanced()
        self.config = effective_config
        self.matcher = StreamingPatternMatcher(
            config=effective_config,
            patterns=patterns,
        )

    def process_column_chunk(
        self,
        chunk: pl.LazyFrame | pl.DataFrame,
        column: str,
        chunk_index: int,
    ) -> None:
        """Feed one chunk of a column to the matcher.

        Intended to be called by StreamingProfiler for every chunk. The
        processing status returned by the matcher is discarded.

        Args:
            chunk: Data chunk
            column: Column name
            chunk_index: Index of this chunk
        """
        self.matcher.process_chunk(chunk, column, chunk_index)

    def get_column_patterns(self, column: str) -> tuple[PatternMatch, ...]:
        """Return the patterns detected for ``column``.

        Intended to be called by StreamingProfiler when building a
        ColumnProfile. Finalizes the column on every call.

        Args:
            column: Column name

        Returns:
            Tuple of PatternMatch objects, one per aggregated match
        """
        finalized = self.matcher.finalize(column)
        converted = [match.to_pattern_match() for match in finalized.matches]
        return tuple(converted)

    def get_inferred_type(self, column: str) -> DataType | None:
        """Return the type inferred for ``column``.

        Finalizes the column on every call.

        Args:
            column: Column name

        Returns:
            Inferred DataType or None
        """
        return self.matcher.finalize(column).inferred_type

    def reset(self) -> None:
        """Discard the matcher's state for all columns."""
        self.matcher.reset()
1273
+
1274
+
1275
+ # =============================================================================
1276
+ # Convenience Functions
1277
+ # =============================================================================
1278
+
1279
+
1280
def create_streaming_matcher(
    aggregation: str | AggregationMethod = "adaptive",
    min_match_ratio: float = 0.8,
    **kwargs: Any,
) -> StreamingPatternMatcher:
    """Build a ``StreamingPatternMatcher`` from a few common options.

    Args:
        aggregation: Aggregation method, either as an enum member or as
            its string value (converted with ``AggregationMethod(...)``).
        min_match_ratio: Minimum match ratio threshold.
        **kwargs: Further ``StreamingPatternConfig`` fields.

    Returns:
        A matcher configured with the resulting ``StreamingPatternConfig``.

    Example:
        matcher = create_streaming_matcher(
            aggregation="consensus",
            min_match_ratio=0.9,
        )
    """
    method = AggregationMethod(aggregation) if isinstance(aggregation, str) else aggregation
    return StreamingPatternMatcher(
        config=StreamingPatternConfig(
            aggregation_method=method,
            min_match_ratio=min_match_ratio,
            **kwargs,
        )
    )
1311
+
1312
+
1313
def stream_match_patterns(
    chunks: Iterator[pl.LazyFrame | pl.DataFrame],
    column: str,
    *,
    aggregation: str = "adaptive",
    min_ratio: float = 0.8,
) -> StreamingPatternResult:
    """Run streaming pattern matching over ``chunks`` in one call.

    Creates a matcher, feeds it every chunk (stopping early if the
    matcher skips one) and returns the finalized result for ``column``.

    Args:
        chunks: Iterator of data chunks
        column: Column to analyze
        aggregation: Aggregation method name
        min_ratio: Minimum match ratio

    Returns:
        StreamingPatternResult for ``column``

    Example:
        from truthound.profiler.streaming_patterns import stream_match_patterns

        # From file chunks
        result = stream_match_patterns(
            file_chunk_iterator("data.csv"),
            "email_column",
        )

        if result.best_match is not None:
            print(f"Best match: {result.best_match.pattern.name}")
        print(f"Chunks processed: {result.chunks_processed}")
    """
    streaming_matcher = create_streaming_matcher(
        aggregation=aggregation,
        min_match_ratio=min_ratio,
    )
    # The per-chunk statuses are not needed here; only the final result is.
    streaming_matcher.process_chunks(chunks, column)
    outcome = streaming_matcher.finalize(column)
    return outcome
1350
+
1351
+
1352
def get_available_aggregation_methods() -> list[str]:
    """Return the names of the strategies in the module-level registry."""
    registered_names = aggregation_strategy_registry.list_strategies()
    return registered_names