truthound 1.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (877)
  1. truthound/__init__.py +162 -0
  2. truthound/adapters.py +100 -0
  3. truthound/api.py +365 -0
  4. truthound/audit/__init__.py +248 -0
  5. truthound/audit/core.py +967 -0
  6. truthound/audit/filters.py +620 -0
  7. truthound/audit/formatters.py +707 -0
  8. truthound/audit/logger.py +902 -0
  9. truthound/audit/middleware.py +571 -0
  10. truthound/audit/storage.py +1083 -0
  11. truthound/benchmark/__init__.py +123 -0
  12. truthound/benchmark/base.py +757 -0
  13. truthound/benchmark/comparison.py +635 -0
  14. truthound/benchmark/generators.py +706 -0
  15. truthound/benchmark/reporters.py +718 -0
  16. truthound/benchmark/runner.py +635 -0
  17. truthound/benchmark/scenarios.py +712 -0
  18. truthound/cache.py +252 -0
  19. truthound/checkpoint/__init__.py +136 -0
  20. truthound/checkpoint/actions/__init__.py +164 -0
  21. truthound/checkpoint/actions/base.py +324 -0
  22. truthound/checkpoint/actions/custom.py +234 -0
  23. truthound/checkpoint/actions/discord_notify.py +290 -0
  24. truthound/checkpoint/actions/email_notify.py +405 -0
  25. truthound/checkpoint/actions/github_action.py +406 -0
  26. truthound/checkpoint/actions/opsgenie.py +1499 -0
  27. truthound/checkpoint/actions/pagerduty.py +226 -0
  28. truthound/checkpoint/actions/slack_notify.py +233 -0
  29. truthound/checkpoint/actions/store_result.py +249 -0
  30. truthound/checkpoint/actions/teams_notify.py +1570 -0
  31. truthound/checkpoint/actions/telegram_notify.py +419 -0
  32. truthound/checkpoint/actions/update_docs.py +552 -0
  33. truthound/checkpoint/actions/webhook.py +293 -0
  34. truthound/checkpoint/analytics/__init__.py +147 -0
  35. truthound/checkpoint/analytics/aggregations/__init__.py +23 -0
  36. truthound/checkpoint/analytics/aggregations/rollup.py +481 -0
  37. truthound/checkpoint/analytics/aggregations/time_bucket.py +306 -0
  38. truthound/checkpoint/analytics/analyzers/__init__.py +17 -0
  39. truthound/checkpoint/analytics/analyzers/anomaly.py +386 -0
  40. truthound/checkpoint/analytics/analyzers/base.py +270 -0
  41. truthound/checkpoint/analytics/analyzers/forecast.py +421 -0
  42. truthound/checkpoint/analytics/analyzers/trend.py +314 -0
  43. truthound/checkpoint/analytics/models.py +292 -0
  44. truthound/checkpoint/analytics/protocols.py +549 -0
  45. truthound/checkpoint/analytics/service.py +718 -0
  46. truthound/checkpoint/analytics/stores/__init__.py +16 -0
  47. truthound/checkpoint/analytics/stores/base.py +306 -0
  48. truthound/checkpoint/analytics/stores/memory_store.py +353 -0
  49. truthound/checkpoint/analytics/stores/sqlite_store.py +557 -0
  50. truthound/checkpoint/analytics/stores/timescale_store.py +501 -0
  51. truthound/checkpoint/async_actions.py +794 -0
  52. truthound/checkpoint/async_base.py +708 -0
  53. truthound/checkpoint/async_checkpoint.py +617 -0
  54. truthound/checkpoint/async_runner.py +639 -0
  55. truthound/checkpoint/checkpoint.py +527 -0
  56. truthound/checkpoint/ci/__init__.py +61 -0
  57. truthound/checkpoint/ci/detector.py +355 -0
  58. truthound/checkpoint/ci/reporter.py +436 -0
  59. truthound/checkpoint/ci/templates.py +454 -0
  60. truthound/checkpoint/circuitbreaker/__init__.py +133 -0
  61. truthound/checkpoint/circuitbreaker/breaker.py +542 -0
  62. truthound/checkpoint/circuitbreaker/core.py +252 -0
  63. truthound/checkpoint/circuitbreaker/detection.py +459 -0
  64. truthound/checkpoint/circuitbreaker/middleware.py +389 -0
  65. truthound/checkpoint/circuitbreaker/registry.py +357 -0
  66. truthound/checkpoint/distributed/__init__.py +139 -0
  67. truthound/checkpoint/distributed/backends/__init__.py +35 -0
  68. truthound/checkpoint/distributed/backends/celery_backend.py +503 -0
  69. truthound/checkpoint/distributed/backends/kubernetes_backend.py +696 -0
  70. truthound/checkpoint/distributed/backends/local_backend.py +397 -0
  71. truthound/checkpoint/distributed/backends/ray_backend.py +625 -0
  72. truthound/checkpoint/distributed/base.py +774 -0
  73. truthound/checkpoint/distributed/orchestrator.py +765 -0
  74. truthound/checkpoint/distributed/protocols.py +842 -0
  75. truthound/checkpoint/distributed/registry.py +449 -0
  76. truthound/checkpoint/idempotency/__init__.py +120 -0
  77. truthound/checkpoint/idempotency/core.py +295 -0
  78. truthound/checkpoint/idempotency/fingerprint.py +454 -0
  79. truthound/checkpoint/idempotency/locking.py +604 -0
  80. truthound/checkpoint/idempotency/service.py +592 -0
  81. truthound/checkpoint/idempotency/stores.py +653 -0
  82. truthound/checkpoint/monitoring/__init__.py +134 -0
  83. truthound/checkpoint/monitoring/aggregators/__init__.py +15 -0
  84. truthound/checkpoint/monitoring/aggregators/base.py +372 -0
  85. truthound/checkpoint/monitoring/aggregators/realtime.py +300 -0
  86. truthound/checkpoint/monitoring/aggregators/window.py +493 -0
  87. truthound/checkpoint/monitoring/collectors/__init__.py +17 -0
  88. truthound/checkpoint/monitoring/collectors/base.py +257 -0
  89. truthound/checkpoint/monitoring/collectors/memory_collector.py +617 -0
  90. truthound/checkpoint/monitoring/collectors/prometheus_collector.py +451 -0
  91. truthound/checkpoint/monitoring/collectors/redis_collector.py +518 -0
  92. truthound/checkpoint/monitoring/events.py +410 -0
  93. truthound/checkpoint/monitoring/protocols.py +636 -0
  94. truthound/checkpoint/monitoring/service.py +578 -0
  95. truthound/checkpoint/monitoring/views/__init__.py +17 -0
  96. truthound/checkpoint/monitoring/views/base.py +172 -0
  97. truthound/checkpoint/monitoring/views/queue_view.py +220 -0
  98. truthound/checkpoint/monitoring/views/task_view.py +240 -0
  99. truthound/checkpoint/monitoring/views/worker_view.py +263 -0
  100. truthound/checkpoint/registry.py +337 -0
  101. truthound/checkpoint/runner.py +356 -0
  102. truthound/checkpoint/transaction/__init__.py +133 -0
  103. truthound/checkpoint/transaction/base.py +389 -0
  104. truthound/checkpoint/transaction/compensatable.py +537 -0
  105. truthound/checkpoint/transaction/coordinator.py +576 -0
  106. truthound/checkpoint/transaction/executor.py +622 -0
  107. truthound/checkpoint/transaction/idempotency.py +534 -0
  108. truthound/checkpoint/transaction/saga/__init__.py +143 -0
  109. truthound/checkpoint/transaction/saga/builder.py +584 -0
  110. truthound/checkpoint/transaction/saga/definition.py +515 -0
  111. truthound/checkpoint/transaction/saga/event_store.py +542 -0
  112. truthound/checkpoint/transaction/saga/patterns.py +833 -0
  113. truthound/checkpoint/transaction/saga/runner.py +718 -0
  114. truthound/checkpoint/transaction/saga/state_machine.py +793 -0
  115. truthound/checkpoint/transaction/saga/strategies.py +780 -0
  116. truthound/checkpoint/transaction/saga/testing.py +886 -0
  117. truthound/checkpoint/triggers/__init__.py +58 -0
  118. truthound/checkpoint/triggers/base.py +237 -0
  119. truthound/checkpoint/triggers/event.py +385 -0
  120. truthound/checkpoint/triggers/schedule.py +355 -0
  121. truthound/cli.py +2358 -0
  122. truthound/cli_modules/__init__.py +124 -0
  123. truthound/cli_modules/advanced/__init__.py +45 -0
  124. truthound/cli_modules/advanced/benchmark.py +343 -0
  125. truthound/cli_modules/advanced/docs.py +225 -0
  126. truthound/cli_modules/advanced/lineage.py +209 -0
  127. truthound/cli_modules/advanced/ml.py +320 -0
  128. truthound/cli_modules/advanced/realtime.py +196 -0
  129. truthound/cli_modules/checkpoint/__init__.py +46 -0
  130. truthound/cli_modules/checkpoint/init.py +114 -0
  131. truthound/cli_modules/checkpoint/list.py +71 -0
  132. truthound/cli_modules/checkpoint/run.py +159 -0
  133. truthound/cli_modules/checkpoint/validate.py +67 -0
  134. truthound/cli_modules/common/__init__.py +71 -0
  135. truthound/cli_modules/common/errors.py +414 -0
  136. truthound/cli_modules/common/options.py +419 -0
  137. truthound/cli_modules/common/output.py +507 -0
  138. truthound/cli_modules/common/protocol.py +552 -0
  139. truthound/cli_modules/core/__init__.py +48 -0
  140. truthound/cli_modules/core/check.py +123 -0
  141. truthound/cli_modules/core/compare.py +104 -0
  142. truthound/cli_modules/core/learn.py +57 -0
  143. truthound/cli_modules/core/mask.py +77 -0
  144. truthound/cli_modules/core/profile.py +65 -0
  145. truthound/cli_modules/core/scan.py +61 -0
  146. truthound/cli_modules/profiler/__init__.py +51 -0
  147. truthound/cli_modules/profiler/auto_profile.py +175 -0
  148. truthound/cli_modules/profiler/metadata.py +107 -0
  149. truthound/cli_modules/profiler/suite.py +283 -0
  150. truthound/cli_modules/registry.py +431 -0
  151. truthound/cli_modules/scaffolding/__init__.py +89 -0
  152. truthound/cli_modules/scaffolding/base.py +631 -0
  153. truthound/cli_modules/scaffolding/commands.py +545 -0
  154. truthound/cli_modules/scaffolding/plugins.py +1072 -0
  155. truthound/cli_modules/scaffolding/reporters.py +594 -0
  156. truthound/cli_modules/scaffolding/validators.py +1127 -0
  157. truthound/common/__init__.py +18 -0
  158. truthound/common/resilience/__init__.py +130 -0
  159. truthound/common/resilience/bulkhead.py +266 -0
  160. truthound/common/resilience/circuit_breaker.py +516 -0
  161. truthound/common/resilience/composite.py +332 -0
  162. truthound/common/resilience/config.py +292 -0
  163. truthound/common/resilience/protocols.py +217 -0
  164. truthound/common/resilience/rate_limiter.py +404 -0
  165. truthound/common/resilience/retry.py +341 -0
  166. truthound/datadocs/__init__.py +260 -0
  167. truthound/datadocs/base.py +571 -0
  168. truthound/datadocs/builder.py +761 -0
  169. truthound/datadocs/charts.py +764 -0
  170. truthound/datadocs/dashboard/__init__.py +63 -0
  171. truthound/datadocs/dashboard/app.py +576 -0
  172. truthound/datadocs/dashboard/components.py +584 -0
  173. truthound/datadocs/dashboard/state.py +240 -0
  174. truthound/datadocs/engine/__init__.py +46 -0
  175. truthound/datadocs/engine/context.py +376 -0
  176. truthound/datadocs/engine/pipeline.py +618 -0
  177. truthound/datadocs/engine/registry.py +469 -0
  178. truthound/datadocs/exporters/__init__.py +49 -0
  179. truthound/datadocs/exporters/base.py +198 -0
  180. truthound/datadocs/exporters/html.py +178 -0
  181. truthound/datadocs/exporters/json_exporter.py +253 -0
  182. truthound/datadocs/exporters/markdown.py +284 -0
  183. truthound/datadocs/exporters/pdf.py +392 -0
  184. truthound/datadocs/i18n/__init__.py +86 -0
  185. truthound/datadocs/i18n/catalog.py +960 -0
  186. truthound/datadocs/i18n/formatting.py +505 -0
  187. truthound/datadocs/i18n/loader.py +256 -0
  188. truthound/datadocs/i18n/plurals.py +378 -0
  189. truthound/datadocs/renderers/__init__.py +42 -0
  190. truthound/datadocs/renderers/base.py +401 -0
  191. truthound/datadocs/renderers/custom.py +342 -0
  192. truthound/datadocs/renderers/jinja.py +697 -0
  193. truthound/datadocs/sections.py +736 -0
  194. truthound/datadocs/styles.py +931 -0
  195. truthound/datadocs/themes/__init__.py +101 -0
  196. truthound/datadocs/themes/base.py +336 -0
  197. truthound/datadocs/themes/default.py +417 -0
  198. truthound/datadocs/themes/enterprise.py +419 -0
  199. truthound/datadocs/themes/loader.py +336 -0
  200. truthound/datadocs/themes.py +301 -0
  201. truthound/datadocs/transformers/__init__.py +57 -0
  202. truthound/datadocs/transformers/base.py +268 -0
  203. truthound/datadocs/transformers/enrichers.py +544 -0
  204. truthound/datadocs/transformers/filters.py +447 -0
  205. truthound/datadocs/transformers/i18n.py +468 -0
  206. truthound/datadocs/versioning/__init__.py +62 -0
  207. truthound/datadocs/versioning/diff.py +639 -0
  208. truthound/datadocs/versioning/storage.py +497 -0
  209. truthound/datadocs/versioning/version.py +358 -0
  210. truthound/datasources/__init__.py +223 -0
  211. truthound/datasources/_async_protocols.py +222 -0
  212. truthound/datasources/_protocols.py +159 -0
  213. truthound/datasources/adapters.py +428 -0
  214. truthound/datasources/async_base.py +599 -0
  215. truthound/datasources/async_factory.py +511 -0
  216. truthound/datasources/base.py +516 -0
  217. truthound/datasources/factory.py +433 -0
  218. truthound/datasources/nosql/__init__.py +47 -0
  219. truthound/datasources/nosql/base.py +487 -0
  220. truthound/datasources/nosql/elasticsearch.py +801 -0
  221. truthound/datasources/nosql/mongodb.py +636 -0
  222. truthound/datasources/pandas_optimized.py +582 -0
  223. truthound/datasources/pandas_source.py +216 -0
  224. truthound/datasources/polars_source.py +395 -0
  225. truthound/datasources/spark_source.py +479 -0
  226. truthound/datasources/sql/__init__.py +154 -0
  227. truthound/datasources/sql/base.py +710 -0
  228. truthound/datasources/sql/bigquery.py +410 -0
  229. truthound/datasources/sql/cloud_base.py +199 -0
  230. truthound/datasources/sql/databricks.py +471 -0
  231. truthound/datasources/sql/mysql.py +316 -0
  232. truthound/datasources/sql/oracle.py +427 -0
  233. truthound/datasources/sql/postgresql.py +321 -0
  234. truthound/datasources/sql/redshift.py +479 -0
  235. truthound/datasources/sql/snowflake.py +439 -0
  236. truthound/datasources/sql/sqlite.py +286 -0
  237. truthound/datasources/sql/sqlserver.py +437 -0
  238. truthound/datasources/streaming/__init__.py +47 -0
  239. truthound/datasources/streaming/base.py +350 -0
  240. truthound/datasources/streaming/kafka.py +670 -0
  241. truthound/decorators.py +98 -0
  242. truthound/docs/__init__.py +69 -0
  243. truthound/docs/extractor.py +971 -0
  244. truthound/docs/generator.py +601 -0
  245. truthound/docs/parser.py +1037 -0
  246. truthound/docs/renderer.py +999 -0
  247. truthound/drift/__init__.py +22 -0
  248. truthound/drift/compare.py +189 -0
  249. truthound/drift/detectors.py +464 -0
  250. truthound/drift/report.py +160 -0
  251. truthound/execution/__init__.py +65 -0
  252. truthound/execution/_protocols.py +324 -0
  253. truthound/execution/base.py +576 -0
  254. truthound/execution/distributed/__init__.py +179 -0
  255. truthound/execution/distributed/aggregations.py +731 -0
  256. truthound/execution/distributed/arrow_bridge.py +817 -0
  257. truthound/execution/distributed/base.py +550 -0
  258. truthound/execution/distributed/dask_engine.py +976 -0
  259. truthound/execution/distributed/mixins.py +766 -0
  260. truthound/execution/distributed/protocols.py +756 -0
  261. truthound/execution/distributed/ray_engine.py +1127 -0
  262. truthound/execution/distributed/registry.py +446 -0
  263. truthound/execution/distributed/spark_engine.py +1011 -0
  264. truthound/execution/distributed/validator_adapter.py +682 -0
  265. truthound/execution/pandas_engine.py +401 -0
  266. truthound/execution/polars_engine.py +497 -0
  267. truthound/execution/pushdown/__init__.py +230 -0
  268. truthound/execution/pushdown/ast.py +1550 -0
  269. truthound/execution/pushdown/builder.py +1550 -0
  270. truthound/execution/pushdown/dialects.py +1072 -0
  271. truthound/execution/pushdown/executor.py +829 -0
  272. truthound/execution/pushdown/optimizer.py +1041 -0
  273. truthound/execution/sql_engine.py +518 -0
  274. truthound/infrastructure/__init__.py +189 -0
  275. truthound/infrastructure/audit.py +1515 -0
  276. truthound/infrastructure/config.py +1133 -0
  277. truthound/infrastructure/encryption.py +1132 -0
  278. truthound/infrastructure/logging.py +1503 -0
  279. truthound/infrastructure/metrics.py +1220 -0
  280. truthound/lineage/__init__.py +89 -0
  281. truthound/lineage/base.py +746 -0
  282. truthound/lineage/impact_analysis.py +474 -0
  283. truthound/lineage/integrations/__init__.py +22 -0
  284. truthound/lineage/integrations/openlineage.py +548 -0
  285. truthound/lineage/tracker.py +512 -0
  286. truthound/lineage/visualization/__init__.py +33 -0
  287. truthound/lineage/visualization/protocols.py +145 -0
  288. truthound/lineage/visualization/renderers/__init__.py +20 -0
  289. truthound/lineage/visualization/renderers/cytoscape.py +329 -0
  290. truthound/lineage/visualization/renderers/d3.py +331 -0
  291. truthound/lineage/visualization/renderers/graphviz.py +276 -0
  292. truthound/lineage/visualization/renderers/mermaid.py +308 -0
  293. truthound/maskers.py +113 -0
  294. truthound/ml/__init__.py +124 -0
  295. truthound/ml/anomaly_models/__init__.py +31 -0
  296. truthound/ml/anomaly_models/ensemble.py +362 -0
  297. truthound/ml/anomaly_models/isolation_forest.py +444 -0
  298. truthound/ml/anomaly_models/statistical.py +392 -0
  299. truthound/ml/base.py +1178 -0
  300. truthound/ml/drift_detection/__init__.py +26 -0
  301. truthound/ml/drift_detection/concept.py +381 -0
  302. truthound/ml/drift_detection/distribution.py +361 -0
  303. truthound/ml/drift_detection/feature.py +442 -0
  304. truthound/ml/drift_detection/multivariate.py +495 -0
  305. truthound/ml/monitoring/__init__.py +88 -0
  306. truthound/ml/monitoring/alerting/__init__.py +33 -0
  307. truthound/ml/monitoring/alerting/handlers.py +427 -0
  308. truthound/ml/monitoring/alerting/rules.py +508 -0
  309. truthound/ml/monitoring/collectors/__init__.py +19 -0
  310. truthound/ml/monitoring/collectors/composite.py +105 -0
  311. truthound/ml/monitoring/collectors/drift.py +324 -0
  312. truthound/ml/monitoring/collectors/performance.py +179 -0
  313. truthound/ml/monitoring/collectors/quality.py +369 -0
  314. truthound/ml/monitoring/monitor.py +536 -0
  315. truthound/ml/monitoring/protocols.py +451 -0
  316. truthound/ml/monitoring/stores/__init__.py +15 -0
  317. truthound/ml/monitoring/stores/memory.py +201 -0
  318. truthound/ml/monitoring/stores/prometheus.py +296 -0
  319. truthound/ml/rule_learning/__init__.py +25 -0
  320. truthound/ml/rule_learning/constraint_miner.py +443 -0
  321. truthound/ml/rule_learning/pattern_learner.py +499 -0
  322. truthound/ml/rule_learning/profile_learner.py +462 -0
  323. truthound/multitenancy/__init__.py +326 -0
  324. truthound/multitenancy/core.py +852 -0
  325. truthound/multitenancy/integration.py +597 -0
  326. truthound/multitenancy/isolation.py +630 -0
  327. truthound/multitenancy/manager.py +770 -0
  328. truthound/multitenancy/middleware.py +765 -0
  329. truthound/multitenancy/quota.py +537 -0
  330. truthound/multitenancy/resolvers.py +603 -0
  331. truthound/multitenancy/storage.py +703 -0
  332. truthound/observability/__init__.py +307 -0
  333. truthound/observability/context.py +531 -0
  334. truthound/observability/instrumentation.py +611 -0
  335. truthound/observability/logging.py +887 -0
  336. truthound/observability/metrics.py +1157 -0
  337. truthound/observability/tracing/__init__.py +178 -0
  338. truthound/observability/tracing/baggage.py +310 -0
  339. truthound/observability/tracing/config.py +426 -0
  340. truthound/observability/tracing/exporter.py +787 -0
  341. truthound/observability/tracing/integration.py +1018 -0
  342. truthound/observability/tracing/otel/__init__.py +146 -0
  343. truthound/observability/tracing/otel/adapter.py +982 -0
  344. truthound/observability/tracing/otel/bridge.py +1177 -0
  345. truthound/observability/tracing/otel/compat.py +681 -0
  346. truthound/observability/tracing/otel/config.py +691 -0
  347. truthound/observability/tracing/otel/detection.py +327 -0
  348. truthound/observability/tracing/otel/protocols.py +426 -0
  349. truthound/observability/tracing/processor.py +561 -0
  350. truthound/observability/tracing/propagator.py +757 -0
  351. truthound/observability/tracing/provider.py +569 -0
  352. truthound/observability/tracing/resource.py +515 -0
  353. truthound/observability/tracing/sampler.py +487 -0
  354. truthound/observability/tracing/span.py +676 -0
  355. truthound/plugins/__init__.py +198 -0
  356. truthound/plugins/base.py +599 -0
  357. truthound/plugins/cli.py +680 -0
  358. truthound/plugins/dependencies/__init__.py +42 -0
  359. truthound/plugins/dependencies/graph.py +422 -0
  360. truthound/plugins/dependencies/resolver.py +417 -0
  361. truthound/plugins/discovery.py +379 -0
  362. truthound/plugins/docs/__init__.py +46 -0
  363. truthound/plugins/docs/extractor.py +444 -0
  364. truthound/plugins/docs/renderer.py +499 -0
  365. truthound/plugins/enterprise_manager.py +877 -0
  366. truthound/plugins/examples/__init__.py +19 -0
  367. truthound/plugins/examples/custom_validators.py +317 -0
  368. truthound/plugins/examples/slack_notifier.py +312 -0
  369. truthound/plugins/examples/xml_reporter.py +254 -0
  370. truthound/plugins/hooks.py +558 -0
  371. truthound/plugins/lifecycle/__init__.py +43 -0
  372. truthound/plugins/lifecycle/hot_reload.py +402 -0
  373. truthound/plugins/lifecycle/manager.py +371 -0
  374. truthound/plugins/manager.py +736 -0
  375. truthound/plugins/registry.py +338 -0
  376. truthound/plugins/security/__init__.py +93 -0
  377. truthound/plugins/security/exceptions.py +332 -0
  378. truthound/plugins/security/policies.py +348 -0
  379. truthound/plugins/security/protocols.py +643 -0
  380. truthound/plugins/security/sandbox/__init__.py +45 -0
  381. truthound/plugins/security/sandbox/context.py +158 -0
  382. truthound/plugins/security/sandbox/engines/__init__.py +19 -0
  383. truthound/plugins/security/sandbox/engines/container.py +379 -0
  384. truthound/plugins/security/sandbox/engines/noop.py +144 -0
  385. truthound/plugins/security/sandbox/engines/process.py +336 -0
  386. truthound/plugins/security/sandbox/factory.py +211 -0
  387. truthound/plugins/security/signing/__init__.py +57 -0
  388. truthound/plugins/security/signing/service.py +330 -0
  389. truthound/plugins/security/signing/trust_store.py +368 -0
  390. truthound/plugins/security/signing/verifier.py +459 -0
  391. truthound/plugins/versioning/__init__.py +41 -0
  392. truthound/plugins/versioning/constraints.py +297 -0
  393. truthound/plugins/versioning/resolver.py +329 -0
  394. truthound/profiler/__init__.py +1729 -0
  395. truthound/profiler/_lazy.py +452 -0
  396. truthound/profiler/ab_testing/__init__.py +80 -0
  397. truthound/profiler/ab_testing/analysis.py +449 -0
  398. truthound/profiler/ab_testing/base.py +257 -0
  399. truthound/profiler/ab_testing/experiment.py +395 -0
  400. truthound/profiler/ab_testing/tracking.py +368 -0
  401. truthound/profiler/auto_threshold.py +1170 -0
  402. truthound/profiler/base.py +579 -0
  403. truthound/profiler/cache_patterns.py +911 -0
  404. truthound/profiler/caching.py +1303 -0
  405. truthound/profiler/column_profiler.py +712 -0
  406. truthound/profiler/comparison.py +1007 -0
  407. truthound/profiler/custom_patterns.py +1170 -0
  408. truthound/profiler/dashboard/__init__.py +50 -0
  409. truthound/profiler/dashboard/app.py +476 -0
  410. truthound/profiler/dashboard/components.py +457 -0
  411. truthound/profiler/dashboard/config.py +72 -0
  412. truthound/profiler/distributed/__init__.py +83 -0
  413. truthound/profiler/distributed/base.py +281 -0
  414. truthound/profiler/distributed/dask_backend.py +498 -0
  415. truthound/profiler/distributed/local_backend.py +293 -0
  416. truthound/profiler/distributed/profiler.py +304 -0
  417. truthound/profiler/distributed/ray_backend.py +374 -0
  418. truthound/profiler/distributed/spark_backend.py +375 -0
  419. truthound/profiler/distributed.py +1366 -0
  420. truthound/profiler/enterprise_sampling.py +1065 -0
  421. truthound/profiler/errors.py +488 -0
  422. truthound/profiler/evolution/__init__.py +91 -0
  423. truthound/profiler/evolution/alerts.py +426 -0
  424. truthound/profiler/evolution/changes.py +206 -0
  425. truthound/profiler/evolution/compatibility.py +365 -0
  426. truthound/profiler/evolution/detector.py +372 -0
  427. truthound/profiler/evolution/protocols.py +121 -0
  428. truthound/profiler/generators/__init__.py +48 -0
  429. truthound/profiler/generators/base.py +384 -0
  430. truthound/profiler/generators/ml_rules.py +375 -0
  431. truthound/profiler/generators/pattern_rules.py +384 -0
  432. truthound/profiler/generators/schema_rules.py +267 -0
  433. truthound/profiler/generators/stats_rules.py +324 -0
  434. truthound/profiler/generators/suite_generator.py +857 -0
  435. truthound/profiler/i18n.py +1542 -0
  436. truthound/profiler/incremental.py +554 -0
  437. truthound/profiler/incremental_validation.py +1710 -0
  438. truthound/profiler/integration/__init__.py +73 -0
  439. truthound/profiler/integration/adapters.py +345 -0
  440. truthound/profiler/integration/context.py +371 -0
  441. truthound/profiler/integration/executor.py +527 -0
  442. truthound/profiler/integration/naming.py +75 -0
  443. truthound/profiler/integration/protocols.py +243 -0
  444. truthound/profiler/memory.py +1185 -0
  445. truthound/profiler/migration/__init__.py +60 -0
  446. truthound/profiler/migration/base.py +345 -0
  447. truthound/profiler/migration/manager.py +444 -0
  448. truthound/profiler/migration/v1_0_to_v1_1.py +484 -0
  449. truthound/profiler/ml/__init__.py +73 -0
  450. truthound/profiler/ml/base.py +244 -0
  451. truthound/profiler/ml/classifier.py +507 -0
  452. truthound/profiler/ml/feature_extraction.py +604 -0
  453. truthound/profiler/ml/pretrained.py +448 -0
  454. truthound/profiler/ml_inference.py +1276 -0
  455. truthound/profiler/native_patterns.py +815 -0
  456. truthound/profiler/observability.py +1184 -0
  457. truthound/profiler/process_timeout.py +1566 -0
  458. truthound/profiler/progress.py +568 -0
  459. truthound/profiler/progress_callbacks.py +1734 -0
  460. truthound/profiler/quality.py +1345 -0
  461. truthound/profiler/resilience.py +1180 -0
  462. truthound/profiler/sampled_matcher.py +794 -0
  463. truthound/profiler/sampling.py +1288 -0
  464. truthound/profiler/scheduling/__init__.py +82 -0
  465. truthound/profiler/scheduling/protocols.py +214 -0
  466. truthound/profiler/scheduling/scheduler.py +474 -0
  467. truthound/profiler/scheduling/storage.py +457 -0
  468. truthound/profiler/scheduling/triggers.py +449 -0
  469. truthound/profiler/schema.py +603 -0
  470. truthound/profiler/streaming.py +685 -0
  471. truthound/profiler/streaming_patterns.py +1354 -0
  472. truthound/profiler/suite_cli.py +625 -0
  473. truthound/profiler/suite_config.py +789 -0
  474. truthound/profiler/suite_export.py +1268 -0
  475. truthound/profiler/table_profiler.py +547 -0
  476. truthound/profiler/timeout.py +565 -0
  477. truthound/profiler/validation.py +1532 -0
  478. truthound/profiler/visualization/__init__.py +118 -0
  479. truthound/profiler/visualization/base.py +346 -0
  480. truthound/profiler/visualization/generator.py +1259 -0
  481. truthound/profiler/visualization/plotly_renderer.py +811 -0
  482. truthound/profiler/visualization/renderers.py +669 -0
  483. truthound/profiler/visualization/sections.py +540 -0
  484. truthound/profiler/visualization.py +2122 -0
  485. truthound/profiler/yaml_validation.py +1151 -0
  486. truthound/py.typed +0 -0
  487. truthound/ratelimit/__init__.py +248 -0
  488. truthound/ratelimit/algorithms.py +1108 -0
  489. truthound/ratelimit/core.py +573 -0
  490. truthound/ratelimit/integration.py +532 -0
  491. truthound/ratelimit/limiter.py +663 -0
  492. truthound/ratelimit/middleware.py +700 -0
  493. truthound/ratelimit/policy.py +792 -0
  494. truthound/ratelimit/storage.py +763 -0
  495. truthound/rbac/__init__.py +340 -0
  496. truthound/rbac/core.py +976 -0
  497. truthound/rbac/integration.py +760 -0
  498. truthound/rbac/manager.py +1052 -0
  499. truthound/rbac/middleware.py +842 -0
  500. truthound/rbac/policy.py +954 -0
  501. truthound/rbac/storage.py +878 -0
  502. truthound/realtime/__init__.py +141 -0
  503. truthound/realtime/adapters/__init__.py +43 -0
  504. truthound/realtime/adapters/base.py +533 -0
  505. truthound/realtime/adapters/kafka.py +487 -0
  506. truthound/realtime/adapters/kinesis.py +479 -0
  507. truthound/realtime/adapters/mock.py +243 -0
  508. truthound/realtime/base.py +553 -0
  509. truthound/realtime/factory.py +382 -0
  510. truthound/realtime/incremental.py +660 -0
  511. truthound/realtime/processing/__init__.py +67 -0
  512. truthound/realtime/processing/exactly_once.py +575 -0
  513. truthound/realtime/processing/state.py +547 -0
  514. truthound/realtime/processing/windows.py +647 -0
  515. truthound/realtime/protocols.py +569 -0
  516. truthound/realtime/streaming.py +605 -0
  517. truthound/realtime/testing/__init__.py +32 -0
  518. truthound/realtime/testing/containers.py +615 -0
  519. truthound/realtime/testing/fixtures.py +484 -0
  520. truthound/report.py +280 -0
  521. truthound/reporters/__init__.py +46 -0
  522. truthound/reporters/_protocols.py +30 -0
  523. truthound/reporters/base.py +324 -0
  524. truthound/reporters/ci/__init__.py +66 -0
  525. truthound/reporters/ci/azure.py +436 -0
  526. truthound/reporters/ci/base.py +509 -0
  527. truthound/reporters/ci/bitbucket.py +567 -0
  528. truthound/reporters/ci/circleci.py +547 -0
  529. truthound/reporters/ci/detection.py +364 -0
  530. truthound/reporters/ci/factory.py +182 -0
  531. truthound/reporters/ci/github.py +388 -0
  532. truthound/reporters/ci/gitlab.py +471 -0
  533. truthound/reporters/ci/jenkins.py +525 -0
  534. truthound/reporters/console_reporter.py +299 -0
  535. truthound/reporters/factory.py +211 -0
  536. truthound/reporters/html_reporter.py +524 -0
  537. truthound/reporters/json_reporter.py +256 -0
  538. truthound/reporters/markdown_reporter.py +280 -0
  539. truthound/reporters/sdk/__init__.py +174 -0
  540. truthound/reporters/sdk/builder.py +558 -0
  541. truthound/reporters/sdk/mixins.py +1150 -0
  542. truthound/reporters/sdk/schema.py +1493 -0
  543. truthound/reporters/sdk/templates.py +666 -0
  544. truthound/reporters/sdk/testing.py +968 -0
  545. truthound/scanners.py +170 -0
  546. truthound/scheduling/__init__.py +122 -0
  547. truthound/scheduling/cron.py +1136 -0
  548. truthound/scheduling/presets.py +212 -0
  549. truthound/schema.py +275 -0
  550. truthound/secrets/__init__.py +173 -0
  551. truthound/secrets/base.py +618 -0
  552. truthound/secrets/cloud.py +682 -0
  553. truthound/secrets/integration.py +507 -0
  554. truthound/secrets/manager.py +633 -0
  555. truthound/secrets/oidc/__init__.py +172 -0
  556. truthound/secrets/oidc/base.py +902 -0
  557. truthound/secrets/oidc/credential_provider.py +623 -0
  558. truthound/secrets/oidc/exchangers.py +1001 -0
  559. truthound/secrets/oidc/github/__init__.py +110 -0
  560. truthound/secrets/oidc/github/claims.py +718 -0
  561. truthound/secrets/oidc/github/enhanced_provider.py +693 -0
  562. truthound/secrets/oidc/github/trust_policy.py +742 -0
  563. truthound/secrets/oidc/github/verification.py +723 -0
  564. truthound/secrets/oidc/github/workflow.py +691 -0
  565. truthound/secrets/oidc/providers.py +825 -0
  566. truthound/secrets/providers.py +506 -0
  567. truthound/secrets/resolver.py +495 -0
  568. truthound/stores/__init__.py +177 -0
  569. truthound/stores/backends/__init__.py +18 -0
  570. truthound/stores/backends/_protocols.py +340 -0
  571. truthound/stores/backends/azure_blob.py +530 -0
  572. truthound/stores/backends/concurrent_filesystem.py +915 -0
  573. truthound/stores/backends/connection_pool.py +1365 -0
  574. truthound/stores/backends/database.py +743 -0
  575. truthound/stores/backends/filesystem.py +538 -0
  576. truthound/stores/backends/gcs.py +399 -0
  577. truthound/stores/backends/memory.py +354 -0
  578. truthound/stores/backends/s3.py +434 -0
  579. truthound/stores/backpressure/__init__.py +84 -0
  580. truthound/stores/backpressure/base.py +375 -0
  581. truthound/stores/backpressure/circuit_breaker.py +434 -0
  582. truthound/stores/backpressure/monitor.py +376 -0
  583. truthound/stores/backpressure/strategies.py +677 -0
  584. truthound/stores/base.py +551 -0
  585. truthound/stores/batching/__init__.py +65 -0
  586. truthound/stores/batching/base.py +305 -0
  587. truthound/stores/batching/buffer.py +370 -0
  588. truthound/stores/batching/store.py +248 -0
  589. truthound/stores/batching/writer.py +521 -0
  590. truthound/stores/caching/__init__.py +60 -0
  591. truthound/stores/caching/backends.py +684 -0
  592. truthound/stores/caching/base.py +356 -0
  593. truthound/stores/caching/store.py +305 -0
  594. truthound/stores/compression/__init__.py +193 -0
  595. truthound/stores/compression/adaptive.py +694 -0
  596. truthound/stores/compression/base.py +514 -0
  597. truthound/stores/compression/pipeline.py +868 -0
  598. truthound/stores/compression/providers.py +672 -0
  599. truthound/stores/compression/streaming.py +832 -0
  600. truthound/stores/concurrency/__init__.py +81 -0
  601. truthound/stores/concurrency/atomic.py +556 -0
  602. truthound/stores/concurrency/index.py +775 -0
  603. truthound/stores/concurrency/locks.py +576 -0
  604. truthound/stores/concurrency/manager.py +482 -0
  605. truthound/stores/encryption/__init__.py +297 -0
  606. truthound/stores/encryption/base.py +952 -0
  607. truthound/stores/encryption/keys.py +1191 -0
  608. truthound/stores/encryption/pipeline.py +903 -0
  609. truthound/stores/encryption/providers.py +953 -0
  610. truthound/stores/encryption/streaming.py +950 -0
  611. truthound/stores/expectations.py +227 -0
  612. truthound/stores/factory.py +246 -0
  613. truthound/stores/migration/__init__.py +75 -0
  614. truthound/stores/migration/base.py +480 -0
  615. truthound/stores/migration/manager.py +347 -0
  616. truthound/stores/migration/registry.py +382 -0
  617. truthound/stores/migration/store.py +559 -0
  618. truthound/stores/observability/__init__.py +106 -0
  619. truthound/stores/observability/audit.py +718 -0
  620. truthound/stores/observability/config.py +270 -0
  621. truthound/stores/observability/factory.py +208 -0
  622. truthound/stores/observability/metrics.py +636 -0
  623. truthound/stores/observability/protocols.py +410 -0
  624. truthound/stores/observability/store.py +570 -0
  625. truthound/stores/observability/tracing.py +784 -0
  626. truthound/stores/replication/__init__.py +76 -0
  627. truthound/stores/replication/base.py +260 -0
  628. truthound/stores/replication/monitor.py +269 -0
  629. truthound/stores/replication/store.py +439 -0
  630. truthound/stores/replication/syncer.py +391 -0
  631. truthound/stores/results.py +359 -0
  632. truthound/stores/retention/__init__.py +77 -0
  633. truthound/stores/retention/base.py +378 -0
  634. truthound/stores/retention/policies.py +621 -0
  635. truthound/stores/retention/scheduler.py +279 -0
  636. truthound/stores/retention/store.py +526 -0
  637. truthound/stores/streaming/__init__.py +138 -0
  638. truthound/stores/streaming/base.py +801 -0
  639. truthound/stores/streaming/database.py +984 -0
  640. truthound/stores/streaming/filesystem.py +719 -0
  641. truthound/stores/streaming/reader.py +629 -0
  642. truthound/stores/streaming/s3.py +843 -0
  643. truthound/stores/streaming/writer.py +790 -0
  644. truthound/stores/tiering/__init__.py +108 -0
  645. truthound/stores/tiering/base.py +462 -0
  646. truthound/stores/tiering/manager.py +249 -0
  647. truthound/stores/tiering/policies.py +692 -0
  648. truthound/stores/tiering/store.py +526 -0
  649. truthound/stores/versioning/__init__.py +56 -0
  650. truthound/stores/versioning/base.py +376 -0
  651. truthound/stores/versioning/store.py +660 -0
  652. truthound/stores/versioning/strategies.py +353 -0
  653. truthound/types.py +56 -0
  654. truthound/validators/__init__.py +774 -0
  655. truthound/validators/aggregate/__init__.py +27 -0
  656. truthound/validators/aggregate/central.py +116 -0
  657. truthound/validators/aggregate/extremes.py +116 -0
  658. truthound/validators/aggregate/spread.py +118 -0
  659. truthound/validators/aggregate/sum.py +64 -0
  660. truthound/validators/aggregate/type.py +78 -0
  661. truthound/validators/anomaly/__init__.py +93 -0
  662. truthound/validators/anomaly/base.py +431 -0
  663. truthound/validators/anomaly/ml_based.py +1190 -0
  664. truthound/validators/anomaly/multivariate.py +647 -0
  665. truthound/validators/anomaly/statistical.py +599 -0
  666. truthound/validators/base.py +1089 -0
  667. truthound/validators/business_rule/__init__.py +46 -0
  668. truthound/validators/business_rule/base.py +147 -0
  669. truthound/validators/business_rule/checksum.py +509 -0
  670. truthound/validators/business_rule/financial.py +526 -0
  671. truthound/validators/cache.py +733 -0
  672. truthound/validators/completeness/__init__.py +39 -0
  673. truthound/validators/completeness/conditional.py +73 -0
  674. truthound/validators/completeness/default.py +98 -0
  675. truthound/validators/completeness/empty.py +103 -0
  676. truthound/validators/completeness/nan.py +337 -0
  677. truthound/validators/completeness/null.py +152 -0
  678. truthound/validators/cross_table/__init__.py +17 -0
  679. truthound/validators/cross_table/aggregate.py +333 -0
  680. truthound/validators/cross_table/row_count.py +122 -0
  681. truthound/validators/datetime/__init__.py +29 -0
  682. truthound/validators/datetime/format.py +78 -0
  683. truthound/validators/datetime/freshness.py +269 -0
  684. truthound/validators/datetime/order.py +73 -0
  685. truthound/validators/datetime/parseable.py +185 -0
  686. truthound/validators/datetime/range.py +202 -0
  687. truthound/validators/datetime/timezone.py +69 -0
  688. truthound/validators/distribution/__init__.py +49 -0
  689. truthound/validators/distribution/distribution.py +128 -0
  690. truthound/validators/distribution/monotonic.py +119 -0
  691. truthound/validators/distribution/outlier.py +178 -0
  692. truthound/validators/distribution/quantile.py +80 -0
  693. truthound/validators/distribution/range.py +254 -0
  694. truthound/validators/distribution/set.py +125 -0
  695. truthound/validators/distribution/statistical.py +459 -0
  696. truthound/validators/drift/__init__.py +79 -0
  697. truthound/validators/drift/base.py +427 -0
  698. truthound/validators/drift/multi_feature.py +401 -0
  699. truthound/validators/drift/numeric.py +395 -0
  700. truthound/validators/drift/psi.py +446 -0
  701. truthound/validators/drift/statistical.py +510 -0
  702. truthound/validators/enterprise.py +1658 -0
  703. truthound/validators/geospatial/__init__.py +80 -0
  704. truthound/validators/geospatial/base.py +97 -0
  705. truthound/validators/geospatial/boundary.py +238 -0
  706. truthound/validators/geospatial/coordinate.py +351 -0
  707. truthound/validators/geospatial/distance.py +399 -0
  708. truthound/validators/geospatial/polygon.py +665 -0
  709. truthound/validators/i18n/__init__.py +308 -0
  710. truthound/validators/i18n/bidi.py +571 -0
  711. truthound/validators/i18n/catalogs.py +570 -0
  712. truthound/validators/i18n/dialects.py +763 -0
  713. truthound/validators/i18n/extended_catalogs.py +549 -0
  714. truthound/validators/i18n/formatting.py +1434 -0
  715. truthound/validators/i18n/loader.py +1020 -0
  716. truthound/validators/i18n/messages.py +521 -0
  717. truthound/validators/i18n/plural.py +683 -0
  718. truthound/validators/i18n/protocols.py +855 -0
  719. truthound/validators/i18n/tms.py +1162 -0
  720. truthound/validators/localization/__init__.py +53 -0
  721. truthound/validators/localization/base.py +122 -0
  722. truthound/validators/localization/chinese.py +362 -0
  723. truthound/validators/localization/japanese.py +275 -0
  724. truthound/validators/localization/korean.py +524 -0
  725. truthound/validators/memory/__init__.py +94 -0
  726. truthound/validators/memory/approximate_knn.py +506 -0
  727. truthound/validators/memory/base.py +547 -0
  728. truthound/validators/memory/sgd_online.py +719 -0
  729. truthound/validators/memory/streaming_ecdf.py +753 -0
  730. truthound/validators/ml_feature/__init__.py +54 -0
  731. truthound/validators/ml_feature/base.py +249 -0
  732. truthound/validators/ml_feature/correlation.py +299 -0
  733. truthound/validators/ml_feature/leakage.py +344 -0
  734. truthound/validators/ml_feature/null_impact.py +270 -0
  735. truthound/validators/ml_feature/scale.py +264 -0
  736. truthound/validators/multi_column/__init__.py +89 -0
  737. truthound/validators/multi_column/arithmetic.py +284 -0
  738. truthound/validators/multi_column/base.py +231 -0
  739. truthound/validators/multi_column/comparison.py +273 -0
  740. truthound/validators/multi_column/consistency.py +312 -0
  741. truthound/validators/multi_column/statistical.py +299 -0
  742. truthound/validators/optimization/__init__.py +164 -0
  743. truthound/validators/optimization/aggregation.py +563 -0
  744. truthound/validators/optimization/covariance.py +556 -0
  745. truthound/validators/optimization/geo.py +626 -0
  746. truthound/validators/optimization/graph.py +587 -0
  747. truthound/validators/optimization/orchestrator.py +970 -0
  748. truthound/validators/optimization/profiling.py +1312 -0
  749. truthound/validators/privacy/__init__.py +223 -0
  750. truthound/validators/privacy/base.py +635 -0
  751. truthound/validators/privacy/ccpa.py +670 -0
  752. truthound/validators/privacy/gdpr.py +728 -0
  753. truthound/validators/privacy/global_patterns.py +604 -0
  754. truthound/validators/privacy/plugins.py +867 -0
  755. truthound/validators/profiling/__init__.py +52 -0
  756. truthound/validators/profiling/base.py +175 -0
  757. truthound/validators/profiling/cardinality.py +312 -0
  758. truthound/validators/profiling/entropy.py +391 -0
  759. truthound/validators/profiling/frequency.py +455 -0
  760. truthound/validators/pushdown_support.py +660 -0
  761. truthound/validators/query/__init__.py +91 -0
  762. truthound/validators/query/aggregate.py +346 -0
  763. truthound/validators/query/base.py +246 -0
  764. truthound/validators/query/column.py +249 -0
  765. truthound/validators/query/expression.py +274 -0
  766. truthound/validators/query/result.py +323 -0
  767. truthound/validators/query/row_count.py +264 -0
  768. truthound/validators/referential/__init__.py +80 -0
  769. truthound/validators/referential/base.py +395 -0
  770. truthound/validators/referential/cascade.py +391 -0
  771. truthound/validators/referential/circular.py +563 -0
  772. truthound/validators/referential/foreign_key.py +624 -0
  773. truthound/validators/referential/orphan.py +485 -0
  774. truthound/validators/registry.py +112 -0
  775. truthound/validators/schema/__init__.py +41 -0
  776. truthound/validators/schema/column_count.py +142 -0
  777. truthound/validators/schema/column_exists.py +80 -0
  778. truthound/validators/schema/column_order.py +82 -0
  779. truthound/validators/schema/column_pair.py +85 -0
  780. truthound/validators/schema/column_pair_set.py +195 -0
  781. truthound/validators/schema/column_type.py +94 -0
  782. truthound/validators/schema/multi_column.py +53 -0
  783. truthound/validators/schema/multi_column_aggregate.py +175 -0
  784. truthound/validators/schema/referential.py +274 -0
  785. truthound/validators/schema/table_schema.py +91 -0
  786. truthound/validators/schema_validator.py +219 -0
  787. truthound/validators/sdk/__init__.py +250 -0
  788. truthound/validators/sdk/builder.py +680 -0
  789. truthound/validators/sdk/decorators.py +474 -0
  790. truthound/validators/sdk/enterprise/__init__.py +211 -0
  791. truthound/validators/sdk/enterprise/docs.py +725 -0
  792. truthound/validators/sdk/enterprise/fuzzing.py +659 -0
  793. truthound/validators/sdk/enterprise/licensing.py +709 -0
  794. truthound/validators/sdk/enterprise/manager.py +543 -0
  795. truthound/validators/sdk/enterprise/resources.py +628 -0
  796. truthound/validators/sdk/enterprise/sandbox.py +766 -0
  797. truthound/validators/sdk/enterprise/signing.py +603 -0
  798. truthound/validators/sdk/enterprise/templates.py +865 -0
  799. truthound/validators/sdk/enterprise/versioning.py +659 -0
  800. truthound/validators/sdk/templates.py +757 -0
  801. truthound/validators/sdk/testing.py +807 -0
  802. truthound/validators/security/__init__.py +181 -0
  803. truthound/validators/security/redos/__init__.py +182 -0
  804. truthound/validators/security/redos/core.py +861 -0
  805. truthound/validators/security/redos/cpu_monitor.py +593 -0
  806. truthound/validators/security/redos/cve_database.py +791 -0
  807. truthound/validators/security/redos/ml/__init__.py +155 -0
  808. truthound/validators/security/redos/ml/base.py +785 -0
  809. truthound/validators/security/redos/ml/datasets.py +618 -0
  810. truthound/validators/security/redos/ml/features.py +359 -0
  811. truthound/validators/security/redos/ml/models.py +1000 -0
  812. truthound/validators/security/redos/ml/predictor.py +507 -0
  813. truthound/validators/security/redos/ml/storage.py +632 -0
  814. truthound/validators/security/redos/ml/training.py +571 -0
  815. truthound/validators/security/redos/ml_analyzer.py +937 -0
  816. truthound/validators/security/redos/optimizer.py +674 -0
  817. truthound/validators/security/redos/profiler.py +682 -0
  818. truthound/validators/security/redos/re2_engine.py +709 -0
  819. truthound/validators/security/redos.py +886 -0
  820. truthound/validators/security/sql_security.py +1247 -0
  821. truthound/validators/streaming/__init__.py +126 -0
  822. truthound/validators/streaming/base.py +292 -0
  823. truthound/validators/streaming/completeness.py +210 -0
  824. truthound/validators/streaming/mixin.py +575 -0
  825. truthound/validators/streaming/range.py +308 -0
  826. truthound/validators/streaming/sources.py +846 -0
  827. truthound/validators/string/__init__.py +57 -0
  828. truthound/validators/string/casing.py +158 -0
  829. truthound/validators/string/charset.py +96 -0
  830. truthound/validators/string/format.py +501 -0
  831. truthound/validators/string/json.py +77 -0
  832. truthound/validators/string/json_schema.py +184 -0
  833. truthound/validators/string/length.py +104 -0
  834. truthound/validators/string/like_pattern.py +237 -0
  835. truthound/validators/string/regex.py +202 -0
  836. truthound/validators/string/regex_extended.py +435 -0
  837. truthound/validators/table/__init__.py +88 -0
  838. truthound/validators/table/base.py +78 -0
  839. truthound/validators/table/column_count.py +198 -0
  840. truthound/validators/table/freshness.py +362 -0
  841. truthound/validators/table/row_count.py +251 -0
  842. truthound/validators/table/schema.py +333 -0
  843. truthound/validators/table/size.py +285 -0
  844. truthound/validators/timeout/__init__.py +102 -0
  845. truthound/validators/timeout/advanced/__init__.py +247 -0
  846. truthound/validators/timeout/advanced/circuit_breaker.py +675 -0
  847. truthound/validators/timeout/advanced/prediction.py +773 -0
  848. truthound/validators/timeout/advanced/priority.py +618 -0
  849. truthound/validators/timeout/advanced/redis_backend.py +770 -0
  850. truthound/validators/timeout/advanced/retry.py +721 -0
  851. truthound/validators/timeout/advanced/sampling.py +788 -0
  852. truthound/validators/timeout/advanced/sla.py +661 -0
  853. truthound/validators/timeout/advanced/telemetry.py +804 -0
  854. truthound/validators/timeout/cascade.py +477 -0
  855. truthound/validators/timeout/deadline.py +657 -0
  856. truthound/validators/timeout/degradation.py +525 -0
  857. truthound/validators/timeout/distributed.py +597 -0
  858. truthound/validators/timeseries/__init__.py +89 -0
  859. truthound/validators/timeseries/base.py +326 -0
  860. truthound/validators/timeseries/completeness.py +617 -0
  861. truthound/validators/timeseries/gap.py +485 -0
  862. truthound/validators/timeseries/monotonic.py +310 -0
  863. truthound/validators/timeseries/seasonality.py +422 -0
  864. truthound/validators/timeseries/trend.py +510 -0
  865. truthound/validators/uniqueness/__init__.py +59 -0
  866. truthound/validators/uniqueness/approximate.py +475 -0
  867. truthound/validators/uniqueness/distinct_values.py +253 -0
  868. truthound/validators/uniqueness/duplicate.py +118 -0
  869. truthound/validators/uniqueness/primary_key.py +140 -0
  870. truthound/validators/uniqueness/unique.py +191 -0
  871. truthound/validators/uniqueness/within_record.py +599 -0
  872. truthound/validators/utils.py +756 -0
  873. truthound-1.0.8.dist-info/METADATA +474 -0
  874. truthound-1.0.8.dist-info/RECORD +877 -0
  875. truthound-1.0.8.dist-info/WHEEL +4 -0
  876. truthound-1.0.8.dist-info/entry_points.txt +2 -0
  877. truthound-1.0.8.dist-info/licenses/LICENSE +190 -0
@@ -0,0 +1,1276 @@
1
+ """ML-based type inference beyond pattern matching.
2
+
3
+ This module provides machine learning based type inference that considers:
4
+ - Column context (name, position, neighboring columns)
5
+ - Value distribution patterns
6
+ - Semantic relationships
7
+ - Historical learning from user feedback
8
+
9
+ Key features:
10
+ - Pluggable model architecture
11
+ - Feature extraction pipeline
12
+ - Online learning support
13
+ - Confidence calibration
14
+
15
+ Example:
16
+ from truthound.profiler.ml_inference import (
17
+ MLTypeInferrer,
18
+ ContextFeatureExtractor,
19
+ create_inference_model,
20
+ )
21
+
22
+ # Create inferrer with default model
23
+ inferrer = MLTypeInferrer()
24
+
25
+ # Infer type with context
26
+ result = inferrer.infer(column, context={
27
+ "column_name": "email_address",
28
+ "table_name": "users",
29
+ "sample_values": ["a@b.com", "c@d.org"],
30
+ })
31
+
32
+ print(f"Type: {result.inferred_type}, Confidence: {result.confidence:.2%}")
33
+ """
34
+
35
+ from __future__ import annotations
36
+
37
+ import hashlib
38
+ import json
39
+ import logging
40
+ import math
41
+ import os
42
+ import pickle
43
+ import re
44
+ import threading
45
+ from abc import ABC, abstractmethod
46
+ from collections import Counter, defaultdict
47
+ from dataclasses import dataclass, field
48
+ from datetime import datetime
49
+ from enum import Enum
50
+ from pathlib import Path
51
+ from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Type, Union
52
+
53
+ import polars as pl
54
+
55
+ from truthound.profiler.base import DataType, ColumnProfile
56
+
57
+
58
+ logger = logging.getLogger(__name__)
59
+
60
+
61
+ # =============================================================================
62
+ # Feature Types
63
+ # =============================================================================
64
+
65
+
66
class FeatureType(str, Enum):
    """Categories a feature can belong to in the ML inference pipeline.

    Members subclass ``str``, so each one compares equal to its raw value.
    """

    # Derived from the column's name.
    NAME_BASED = "name_based"
    # Derived from the values stored in the column.
    VALUE_BASED = "value_based"
    # Derived from distribution statistics of the column.
    STATISTICAL = "statistical"
    # Derived from surrounding context such as the table or column position.
    CONTEXTUAL = "contextual"
    # Derived from pattern matching.
    PATTERN_BASED = "pattern_based"
74
+
75
+
76
@dataclass
class Feature:
    """One named numeric signal handed to the ML model."""

    # Identifier of the feature, e.g. "null_ratio".
    name: str
    # Numeric value of the feature.
    value: float
    # Category of the feature (see FeatureType).
    feature_type: FeatureType
    # Relative weight of the feature; defaults to 1.0.
    importance: float = field(default=1.0)
    # Free-form extra information; a fresh dict is created per instance.
    metadata: dict[str, Any] = field(default_factory=dict)
85
+
86
+
87
@dataclass
class FeatureVector:
    """Ordered collection of the features extracted for one column."""

    # Name of the column the features describe.
    column_name: str
    # Extracted features, in extraction order.
    features: list[Feature]
    # Optional raw inputs kept alongside the features.
    raw_values: dict[str, Any] = field(default_factory=dict)

    def to_array(self) -> list[float]:
        """Return the feature values, in order, as a flat numeric list."""
        return [feat.value for feat in self.features]

    def to_dict(self) -> dict[str, float]:
        """Return a ``name -> value`` mapping of the features.

        When two features share a name, the later one wins.
        """
        by_name: dict[str, float] = {}
        for feat in self.features:
            by_name[feat.name] = feat.value
        return by_name

    def get_feature(self, name: str) -> Feature | None:
        """Return the first feature called ``name``, or ``None`` if absent."""
        return next(
            (feat for feat in self.features if feat.name == name),
            None,
        )
109
+
110
+
111
+ # =============================================================================
112
+ # Inference Result
113
+ # =============================================================================
114
+
115
+
116
@dataclass
class InferenceResult:
    """Outcome of inferring the type of one column.

    Holds the chosen type and its confidence together with alternative
    types, textual reasoning, the names of the features consulted, the
    model version and the inference time in milliseconds.
    """

    column_name: str
    inferred_type: DataType
    confidence: float
    alternatives: list[tuple[DataType, float]] = field(default_factory=list)
    reasoning: list[str] = field(default_factory=list)
    features_used: list[str] = field(default_factory=list)
    model_version: str = "1.0"
    inference_time_ms: float = 0.0

    def to_dict(self) -> dict[str, Any]:
        """Return a plain-dict view with type objects replaced by ``.value``."""
        runner_ups = []
        for alt_type, alt_confidence in self.alternatives:
            runner_ups.append(
                {"type": alt_type.value, "confidence": alt_confidence}
            )

        return dict(
            column_name=self.column_name,
            inferred_type=self.inferred_type.value,
            confidence=self.confidence,
            alternatives=runner_ups,
            reasoning=self.reasoning,
            features_used=self.features_used,
            model_version=self.model_version,
            inference_time_ms=self.inference_time_ms,
        )
143
+
144
+
145
+ # =============================================================================
146
+ # Feature Extractor Protocol
147
+ # =============================================================================
148
+
149
+
150
class FeatureExtractor(ABC):
    """Common interface implemented by every feature extractor.

    Concrete extractors override ``name`` and ``feature_type`` and
    implement ``extract``.
    """

    # Short identifier of the extractor; subclasses override it.
    name: str = "base"
    # Category stamped on the features an extractor emits.
    feature_type: FeatureType = FeatureType.VALUE_BASED

    @abstractmethod
    def extract(self, column: pl.Series, context: dict[str, Any]) -> list[Feature]:
        """Derive features for a single column.

        Args:
            column: The column's data.
            context: Extra information about the column, such as its name
                or details of the table it belongs to.

        Returns:
            The features extracted from ``column``.
        """
        ...
172
+
173
+
174
class NameFeatureExtractor(FeatureExtractor):
    """Extract features from a column's name via keyword matching.

    For every data type in ``TYPE_KEYWORDS`` the lower-cased column name is
    searched for that type's keywords; the fraction of keywords found is
    emitted as a ``name_match_<type>`` feature.  Three general name features
    (length, underscore presence, digit presence) are always emitted.
    """

    name = "name_features"
    feature_type = FeatureType.NAME_BASED

    # Keywords associated with each type.  Matching is a plain substring
    # test, so short keywords such as "id" or "at" also hit longer words.
    TYPE_KEYWORDS: dict[DataType, list[str]] = {
        DataType.EMAIL: ["email", "mail", "e_mail", "correo"],
        DataType.PHONE: ["phone", "tel", "mobile", "cell", "fax", "telephone"],
        DataType.URL: ["url", "link", "href", "website", "uri", "endpoint"],
        DataType.UUID: ["uuid", "guid", "id", "identifier", "uid"],
        DataType.DATE: ["date", "day", "birth", "created", "updated", "modified"],
        DataType.DATETIME: ["datetime", "timestamp", "time", "at", "when"],
        DataType.INTEGER: ["count", "num", "qty", "quantity", "amount", "total", "id"],
        DataType.FLOAT: ["price", "rate", "ratio", "percent", "score", "value"],
        DataType.BOOLEAN: ["is_", "has_", "flag", "active", "enabled", "valid"],
        DataType.CURRENCY: ["price", "cost", "amount", "fee", "payment", "salary"],
        DataType.PERCENTAGE: ["percent", "pct", "ratio", "rate"],
        DataType.KOREAN_PHONE: ["phone", "hp", "tel", "mobile", "연락처", "전화"],
        DataType.KOREAN_RRN: ["rrn", "resident", "주민", "jumin"],
        DataType.KOREAN_BUSINESS_NUMBER: ["business", "사업자", "brn"],
        DataType.CATEGORICAL: ["type", "status", "category", "class", "kind", "level"],
        DataType.IDENTIFIER: ["id", "key", "code", "no", "number"],
    }

    def extract(
        self,
        column: pl.Series,
        context: dict[str, Any],
    ) -> list[Feature]:
        """Build name-derived features for ``column``.

        Args:
            column: Column data; only its ``name`` is read, and only when
                ``context`` supplies no usable ``"column_name"``.
            context: May carry ``"column_name"``.  A missing key or a
                ``None`` value falls back to ``column.name``.

        Returns:
            One ``name_match_<type>`` feature per data type with at least
            one keyword hit, followed by ``name_length``,
            ``name_has_underscore`` and ``name_has_number``.
        """
        # An explicit None used to reach .lower() and raise AttributeError;
        # treat it the same as a missing key.
        raw_name = context.get("column_name")
        if raw_name is None:
            raw_name = column.name or ""
        col_name = raw_name.lower()

        features: list[Feature] = []

        # Score each type by the share of its keywords found in the name.
        # A token-level fallback used to follow the substring test, but it
        # could never fire: every token is itself a substring of the name.
        for dtype, keywords in self.TYPE_KEYWORDS.items():
            hits = sum(1 for keyword in keywords if keyword in col_name)
            if hits:
                features.append(Feature(
                    name=f"name_match_{dtype.value}",
                    value=min(1.0, hits / len(keywords)),
                    feature_type=self.feature_type,
                ))

        # General name features.
        features.append(Feature(
            name="name_length",
            value=min(1.0, len(col_name) / 50),  # saturates at 50 characters
            feature_type=self.feature_type,
        ))
        features.append(Feature(
            name="name_has_underscore",
            value=1.0 if "_" in col_name else 0.0,
            feature_type=self.feature_type,
        ))
        features.append(Feature(
            name="name_has_number",
            value=1.0 if any(c.isdigit() for c in col_name) else 0.0,
            feature_type=self.feature_type,
        ))

        return features
251
+
252
+
253
class ValueFeatureExtractor(FeatureExtractor):
    """Extract features from a column's actual values.

    Up to 1000 non-null values are taken from the head of the column.
    ``pl.Utf8`` columns yield length, punctuation, digit and email/UUID
    pattern features; ``Int32``/``Int64``/``Float32``/``Float64`` columns
    yield range, sequence, percentage-range and decimal-place features;
    boolean columns yield an ``is_boolean`` flag.  ``null_ratio`` and
    ``unique_ratio`` are appended whenever at least one value is non-null.
    """

    name = "value_features"
    feature_type = FeatureType.VALUE_BASED

    # Compiled once at class creation instead of being re-resolved from
    # pattern strings for every sampled value.
    _EMAIL_RE = re.compile(r"^[^@]+@[^@]+\.[^@]+$")
    _UUID_RE = re.compile(
        r"^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$"
    )

    def _feature(self, name: str, value: float) -> Feature:
        """Create a feature tagged with this extractor's feature type."""
        return Feature(name=name, value=value, feature_type=self.feature_type)

    def extract(
        self,
        column: pl.Series,
        context: dict[str, Any],
    ) -> list[Feature]:
        """Build value-derived features for ``column``.

        Args:
            column: Column data to analyse.
            context: Not read by this extractor; accepted for interface
                compatibility.

        Returns:
            A single ``all_null`` feature when the column has no non-null
            values; otherwise the dtype-specific features followed by
            ``null_ratio`` and ``unique_ratio``.
        """
        # Sample values for analysis.
        sample = column.drop_nulls().head(min(1000, len(column)))

        if len(sample) == 0:
            return [self._feature("all_null", 1.0)]

        features: list[Feature] = []

        if column.dtype == pl.Utf8:
            features.extend(self._extract_string_features(sample))
        elif column.dtype in [pl.Int32, pl.Int64, pl.Float32, pl.Float64]:
            features.extend(self._extract_numeric_features(sample))
        elif column.dtype == pl.Boolean:
            features.append(self._feature("is_boolean", 1.0))

        # General features.  The sample is non-empty here, so the column
        # length is positive and the divisions are safe.
        total = len(column)
        features.append(self._feature("null_ratio", column.null_count() / total))
        # NOTE(review): n_unique() runs on the full column, nulls included;
        # confirm whether a null is meant to count as a distinct value.
        features.append(self._feature("unique_ratio", column.n_unique() / total))

        return features

    def _extract_string_features(self, sample: pl.Series) -> list[Feature]:
        """Extract features from string values.

        Args:
            sample: Non-null string values.

        Returns:
            Length statistics over the whole sample plus punctuation, digit
            and pattern ratios over its first 100 values.  An empty sample
            yields no features (it used to raise ``ZeroDivisionError``).
        """
        if len(sample) == 0:
            return []

        # Length statistics; mean()/std() may return None, hence "or 0".
        lengths = sample.str.len_chars()
        avg_len = lengths.mean() or 0
        std_len = lengths.std() or 0

        features = [
            self._feature("avg_string_length", min(1.0, avg_len / 100)),
            self._feature(
                "length_variance",
                min(1.0, std_len / avg_len) if avg_len > 0 else 0,
            ),
        ]

        # Per-value checks run on at most 100 values.  Slice before
        # converting so only those values are materialised.
        texts = [str(v) for v in sample.head(100).to_list()]
        n_texts = len(texts)

        # Share of values containing each marker character.
        for feat_name, marker in (
            ("has_at_sign", "@"),
            ("has_dot", "."),
            ("has_slash", "/"),
            ("has_dash", "-"),
            ("has_colon", ":"),
        ):
            share = sum(1 for text in texts if marker in text) / n_texts
            features.append(self._feature(feat_name, share))

        # Average share of digit characters, over non-empty values only.
        digit_ratios = [
            sum(c.isdigit() for c in text) / len(text)
            for text in texts
            if text
        ]
        avg_digit_ratio = sum(digit_ratios) / len(digit_ratios) if digit_ratios else 0
        features.append(self._feature("digit_ratio", avg_digit_ratio))

        # Share of values matching common patterns.
        email_share = sum(1 for text in texts if self._EMAIL_RE.match(text)) / n_texts
        uuid_share = sum(1 for text in texts if self._UUID_RE.match(text)) / n_texts
        features.append(self._feature("email_pattern_ratio", email_share))
        features.append(self._feature("uuid_pattern_ratio", uuid_share))

        return features

    def _extract_numeric_features(self, sample: pl.Series) -> list[Feature]:
        """Extract features from numeric values.

        Args:
            sample: Non-null numeric values.

        Returns:
            ``numeric_range_log``, then ``is_sequential`` for
            ``Int32``/``Int64`` input, then the two percentage-range
            features, then ``is_currency_like`` for ``Float32``/``Float64``
            input when any decimal counts could be measured.
        """
        features: list[Feature] = []

        # Only min and max are needed; min()/max() may return None.
        min_val = sample.min() or 0
        max_val = sample.max() or 0

        # Range feature, log-scaled and divided by 10 to normalise.
        range_val = max_val - min_val
        features.append(self._feature(
            "numeric_range_log",
            math.log10(range_val + 1) / 10,
        ))

        # Check if values look like IDs (sequential integers): the share of
        # neighbouring gaps, in sorted order, that equal exactly 1.
        if sample.dtype in [pl.Int32, pl.Int64]:
            diffs = sample.sort().diff().drop_nulls()
            is_sequential = (diffs == 1).mean() if len(diffs) > 0 else 0
            features.append(self._feature("is_sequential", is_sequential or 0))

        # Check for percentage-like values (0-100 or 0-1).
        in_0_1 = ((sample >= 0) & (sample <= 1)).mean()
        in_0_100 = ((sample >= 0) & (sample <= 100)).mean()
        features.append(self._feature("in_0_1_range", in_0_1 or 0))
        features.append(self._feature("in_0_100_range", in_0_100 or 0))

        # Check for currency-like values (about 2 decimal places), using the
        # first 100 values.
        if sample.dtype in [pl.Float32, pl.Float64]:
            decimal_places = []
            for v in sample.head(100).to_list():
                if v is not None:
                    # Fixed 10-digit rendering with trailing zeros removed;
                    # values rendered without a "." (nan, inf) are skipped.
                    s = f"{v:.10f}".rstrip("0")
                    if "." in s:
                        decimal_places.append(len(s.split(".")[1]))

            if decimal_places:
                avg_decimals = sum(decimal_places) / len(decimal_places)
                features.append(self._feature(
                    "is_currency_like",
                    1.0 if 1.5 <= avg_decimals <= 2.5 else 0.0,
                ))

        return features
433
+
434
+
435
class StatisticalFeatureExtractor(FeatureExtractor):
    """Extract statistical distribution features.

    Works on the non-null values of a column and reports cardinality,
    categorical/identifier flags, the share of the most frequent value and
    the normalised Shannon entropy of the value distribution.
    """

    name = "statistical_features"
    feature_type = FeatureType.STATISTICAL

    def extract(
        self,
        column: pl.Series,
        context: dict[str, Any],
    ) -> list[Feature]:
        """Build distribution features for ``column``.

        Args:
            column: Column data to analyse.
            context: Not read by this extractor; accepted for interface
                compatibility.

        Returns:
            An empty list when the column has no non-null values; otherwise
            ``cardinality``, ``is_categorical``, ``is_identifier``,
            ``max_frequency`` and ``normalized_entropy``.
        """
        non_null = column.drop_nulls()
        n_total = len(non_null)
        if n_total == 0:
            return []

        # Cardinality: distinct non-null values per non-null value.
        n_unique = non_null.n_unique()
        cardinality = n_unique / n_total

        features = [
            Feature(
                name="cardinality",
                value=cardinality,
                feature_type=self.feature_type,
            ),
            # Low cardinality suggests a categorical column.
            Feature(
                name="is_categorical",
                value=1.0 if n_unique < 20 and cardinality < 0.05 else 0.0,
                feature_type=self.feature_type,
            ),
            # Nearly all-distinct values suggest an identifier column.
            Feature(
                name="is_identifier",
                value=1.0 if cardinality > 0.95 else 0.0,
                feature_type=self.feature_type,
            ),
        ]

        # Value frequency distribution.  value_counts() adds a count column
        # next to the values, which clashes when the series itself carries
        # that column's name (e.g. a column called "count").  Alias the
        # series first, then read the counts by position so the lookup does
        # not depend on what the count column is called.
        value_counts = non_null.alias("__value__").value_counts()
        if len(value_counts) > 0:
            counts = value_counts.get_column(value_counts.columns[-1]).to_list()

            features.append(Feature(
                name="max_frequency",
                value=max(counts) / n_total,
                feature_type=self.feature_type,
            ))

            # Shannon entropy divided by its maximum, log2(n_unique); the
            # divisor is 1 for a single distinct value, where entropy is 0.
            probs = [c / n_total for c in counts]
            entropy = -sum(p * math.log2(p) for p in probs if p > 0)
            max_entropy = math.log2(n_unique) if n_unique > 1 else 1
            normalized_entropy = entropy / max_entropy if max_entropy > 0 else 0

            features.append(Feature(
                name="normalized_entropy",
                value=normalized_entropy,
                feature_type=self.feature_type,
            ))

        return features
503
+
504
+
505
class ContextFeatureExtractor(FeatureExtractor):
    """Extract features from column context."""

    name = "context_features"
    feature_type = FeatureType.CONTEXTUAL

    def _flag(self, feature_name: str, value: float = 1.0) -> Feature:
        """Build a contextual feature carrying ``value`` (1.0 by default)."""
        return Feature(
            name=feature_name,
            value=value,
            feature_type=self.feature_type,
        )

    def extract(
        self,
        column: pl.Series,
        context: dict[str, Any],
    ) -> list[Feature]:
        """Turn table/position/neighbour context into features.

        Args:
            column: Not read by this extractor; only ``context`` is used.
            context: Optional keys read here are ``table_name`` (str),
                ``column_index`` (int, default 0), ``total_columns``
                (int, default 1) and ``neighbor_columns`` (list of names).

        Returns:
            Contextual features. ``column_position`` is always present; the
            remaining flags are only emitted when their condition holds, and
            each neighbour flag is emitted at most once.
        """
        features = []

        # Table-level context. ``or ""`` also covers an explicit None value,
        # which would otherwise crash on ``.lower()``.
        table_name = (context.get("table_name") or "").lower()
        if table_name:
            # Check if table name gives hints
            if any(kw in table_name for kw in ["user", "customer", "member"]):
                features.append(self._flag("table_is_user_related"))

            if any(kw in table_name for kw in ["order", "transaction", "payment"]):
                features.append(self._flag("table_is_transaction_related"))

        # Column position
        col_index = context.get("column_index", 0)
        total_cols = context.get("total_columns", 1)
        position_ratio = col_index / total_cols if total_cols > 0 else 0
        features.append(self._flag("column_position", position_ratio))

        # First column is often ID
        if col_index == 0:
            features.append(self._flag("is_first_column"))

        # Neighboring columns: emit each flag once, in first-match order, so
        # several matching neighbours do not produce duplicate features.
        emitted: set[str] = set()
        for neighbor in context.get("neighbor_columns") or []:
            lowered = neighbor.lower()
            for keyword in ("email", "name"):
                if keyword in lowered and keyword not in emitted:
                    emitted.add(keyword)
                    features.append(self._flag(f"neighbor_has_{keyword}"))

        return features
573
+
574
+
575
+ # =============================================================================
576
+ # Feature Extractor Registry
577
+ # =============================================================================
578
+
579
+
580
class FeatureExtractorRegistry:
    """Name-indexed collection of feature extractors."""

    def __init__(self) -> None:
        # Keyed by each extractor's ``name`` attribute.
        self._extractors: dict[str, FeatureExtractor] = {}

    def register(self, extractor: FeatureExtractor) -> None:
        """Store ``extractor`` under its ``name``, replacing any previous entry."""
        self._extractors[extractor.name] = extractor

    def get(self, name: str) -> FeatureExtractor:
        """Return the extractor registered as ``name``.

        Raises:
            KeyError: If no extractor with that name is registered.
        """
        try:
            return self._extractors[name]
        except KeyError:
            raise KeyError(f"Unknown extractor: {name}") from None

    def list_extractors(self) -> list[str]:
        """Return the registered extractor names in registration order."""
        return [registered_name for registered_name in self._extractors]

    def extract_all(
        self,
        column: pl.Series,
        context: dict[str, Any],
    ) -> FeatureVector:
        """Run every registered extractor and pool the resulting features.

        An extractor that raises is logged at warning level and skipped, so
        one failing extractor does not abort the whole extraction.

        Args:
            column: Column handed to each extractor.
            context: Context handed to each extractor; its ``column_name``
                entry (falling back to ``column.name``) names the vector.

        Returns:
            A ``FeatureVector`` holding the features of all extractors that
            succeeded.
        """
        collected = []
        for extractor in self._extractors.values():
            try:
                collected.extend(extractor.extract(column, context))
            except Exception as e:
                logger.warning(f"Extractor {extractor.name} failed: {e}")

        vector_name = context.get("column_name", column.name or "")
        return FeatureVector(column_name=vector_name, features=collected)
618
+
619
+
620
# Module-wide registry, pre-populated with the built-in extractors
# (registration order: name, value, statistical, context).
feature_extractor_registry = FeatureExtractorRegistry()
for _default_extractor_class in (
    NameFeatureExtractor,
    ValueFeatureExtractor,
    StatisticalFeatureExtractor,
    ContextFeatureExtractor,
):
    feature_extractor_registry.register(_default_extractor_class())
del _default_extractor_class
626
+
627
+
628
+ # =============================================================================
629
+ # ML Model Protocol
630
+ # =============================================================================
631
+
632
+
633
class InferenceModel(ABC):
    """Interface shared by all type-inference models.

    Subclasses must implement ``predict`` and ``train``. ``save`` and
    ``load`` are optional hooks that do nothing unless overridden.
    """

    name: str = "base"
    version: str = "1.0"

    @abstractmethod
    def predict(
        self,
        features: FeatureVector,
    ) -> list[tuple[DataType, float]]:
        """Rank candidate types for a feature vector.

        Args:
            features: Extracted features

        Returns:
            List of (DataType, probability) sorted by probability
        """

    @abstractmethod
    def train(
        self,
        training_data: list[tuple[FeatureVector, DataType]],
    ) -> None:
        """Fit or update the model from labelled examples.

        Args:
            training_data: List of (features, true_type) pairs
        """

    def save(self, path: str | Path) -> None:
        """Persist the model to ``path``; the base implementation is a no-op."""
        return None

    def load(self, path: str | Path) -> None:
        """Restore the model from ``path``; the base implementation is a no-op."""
        return None
673
+
674
+
675
class RuleBasedModel(InferenceModel):
    """Inference model driven by a fixed table of weighted threshold rules.

    Each rule compares one feature value against a threshold; a satisfied
    rule adds ``weight * value`` to the score of its target type. No
    training data is required.
    """

    name = "rule_based"
    version = "1.0"

    def __init__(self) -> None:
        # Each rule is (feature_name, operator, threshold, type, weight).
        self.rules: list[tuple[str, str, float, DataType, float]] = [
            # Email rules
            ("email_pattern_ratio", ">=", 0.8, DataType.EMAIL, 0.9),
            ("has_at_sign", ">=", 0.9, DataType.EMAIL, 0.7),
            ("name_match_email", ">=", 0.5, DataType.EMAIL, 0.5),

            # UUID rules
            ("uuid_pattern_ratio", ">=", 0.8, DataType.UUID, 0.95),
            ("name_match_uuid", ">=", 0.5, DataType.UUID, 0.6),

            # Identifier rules
            ("is_identifier", ">=", 0.9, DataType.IDENTIFIER, 0.7),
            ("is_first_column", ">=", 0.5, DataType.IDENTIFIER, 0.3),
            ("name_match_identifier", ">=", 0.5, DataType.IDENTIFIER, 0.4),

            # Categorical rules
            ("is_categorical", ">=", 0.8, DataType.CATEGORICAL, 0.8),
            ("name_match_categorical", ">=", 0.5, DataType.CATEGORICAL, 0.4),

            # Date/DateTime rules
            ("name_match_date", ">=", 0.5, DataType.DATE, 0.5),
            ("name_match_datetime", ">=", 0.5, DataType.DATETIME, 0.5),

            # Numeric rules
            ("is_currency_like", ">=", 0.8, DataType.CURRENCY, 0.7),
            ("in_0_100_range", ">=", 0.9, DataType.PERCENTAGE, 0.5),
            ("in_0_1_range", ">=", 0.95, DataType.PERCENTAGE, 0.6),

            # Phone rules
            ("name_match_phone", ">=", 0.5, DataType.PHONE, 0.5),
            ("name_match_korean_phone", ">=", 0.5, DataType.KOREAN_PHONE, 0.6),

            # Boolean
            ("is_boolean", ">=", 0.9, DataType.BOOLEAN, 0.95),
        ]

    def predict(
        self,
        features: FeatureVector,
    ) -> list[tuple[DataType, float]]:
        """Score every type by its satisfied rules and normalise the scores.

        A feature missing from the vector counts as 0.0. A rule whose
        operator is not one of ``>=``, ``<=`` or ``==`` never matches.

        Returns:
            (DataType, share of total score) pairs, highest first, or
            ``[(DataType.STRING, 0.5)]`` when the total score is not positive.
        """
        # "==" is a tolerance comparison because feature values are floats.
        comparators = {
            ">=": lambda observed, limit: observed >= limit,
            "<=": lambda observed, limit: observed <= limit,
            "==": lambda observed, limit: abs(observed - limit) < 0.01,
        }

        observed_features = features.to_dict()
        scores: dict[DataType, float] = defaultdict(float)

        for feature_name, operator, threshold, dtype, weight in self.rules:
            value = observed_features.get(feature_name, 0.0)
            compare = comparators.get(operator)
            if compare is not None and compare(value, threshold):
                scores[dtype] += weight * value

        total = sum(scores.values())
        if total > 0:
            ranked = [(dtype, score / total) for dtype, score in scores.items()]
        else:
            # No rule contributed: fall back to a plain string guess.
            ranked = [(DataType.STRING, 0.5)]

        return sorted(ranked, key=lambda pair: pair[1], reverse=True)

    def train(
        self,
        training_data: list[tuple[FeatureVector, DataType]],
    ) -> None:
        """Accept training data without using it; the rule table is static."""
        return None
767
+
768
+
769
class NaiveBayesModel(InferenceModel):
    """Gaussian Naive Bayes classifier over extracted feature values.

    Stores a prior per type and a (mean, std) pair per feature and type.
    Until ``train`` has run (or a trained model has been loaded), ``predict``
    delegates to a fresh ``RuleBasedModel``.
    """

    name = "naive_bayes"
    version = "1.0"

    def __init__(self) -> None:
        self.class_priors: dict[DataType, float] = {}
        # feature name -> type -> (mean, std)
        self.feature_likelihoods: dict[str, dict[DataType, tuple[float, float]]] = {}
        self._trained = False

    def predict(
        self,
        features: FeatureVector,
    ) -> list[tuple[DataType, float]]:
        """Return types ranked by posterior probability.

        Returns:
            (DataType, probability) pairs, highest first. Falls back to the
            rule-based model's output while the model is untrained.
        """
        if not self._trained:
            return RuleBasedModel().predict(features)

        observed = features.to_dict()
        log_posteriors: dict[DataType, float] = {}

        for dtype, prior in self.class_priors.items():
            # The small epsilon keeps log() defined for a zero prior.
            score = math.log(prior + 1e-10)

            for feature_name, value in observed.items():
                per_class = self.feature_likelihoods.get(feature_name)
                if per_class is None:
                    # Feature never seen during training: ignore it.
                    continue

                # Types without statistics for this feature use a default
                # mean of 0.5 and std of 0.3.
                mean, std = per_class.get(dtype, (0.5, 0.3))
                if std > 0:
                    # Log of the Gaussian density at ``value``.
                    z = (value - mean) / std
                    score += -0.5 * z * z - math.log(std) - 0.5 * math.log(2 * math.pi)

            log_posteriors[dtype] = score

        # Subtract the maximum before exponentiating so exp() cannot overflow.
        peak = max(log_posteriors.values())
        unnormalised = {
            dtype: math.exp(value - peak)
            for dtype, value in log_posteriors.items()
        }
        total = sum(unnormalised.values())

        ranked = [(dtype, mass / total) for dtype, mass in unnormalised.items()]
        return sorted(ranked, key=lambda pair: pair[1], reverse=True)

    def train(
        self,
        training_data: list[tuple[FeatureVector, DataType]],
    ) -> None:
        """Estimate priors and per-feature Gaussians from labelled vectors.

        Priors are replaced on every call. Likelihoods are replaced for each
        feature present in ``training_data``. Empty input leaves the model
        untouched.

        Args:
            training_data: List of (features, true_type) pairs
        """
        if not training_data:
            return

        class_counts: dict[DataType, int] = Counter()
        samples: dict[str, dict[DataType, list[float]]] = {}

        for vector, dtype in training_data:
            class_counts[dtype] += 1
            for feature in vector.features:
                samples.setdefault(feature.name, {}).setdefault(dtype, []).append(
                    feature.value
                )

        n_examples = sum(class_counts.values())
        self.class_priors = {
            dtype: count / n_examples for dtype, count in class_counts.items()
        }

        for feature_name, by_class in samples.items():
            stats: dict[DataType, tuple[float, float]] = {}
            for dtype, values in by_class.items():
                if not values:
                    continue
                mean = sum(values) / len(values)
                variance = sum((v - mean) ** 2 for v in values) / len(values)
                # A zero variance gets a floor of 0.1 to keep the density finite.
                std = math.sqrt(variance) if variance > 0 else 0.1
                stats[dtype] = (mean, std)
            self.feature_likelihoods[feature_name] = stats

        self._trained = True

    def save(self, path: str | Path) -> None:
        """Write priors, likelihoods and the trained flag to ``path`` as JSON.

        Types are stored by their ``.value``.
        """
        likelihoods = {}
        for fname, class_stats in self.feature_likelihoods.items():
            likelihoods[fname] = {
                dtype.value: stats for dtype, stats in class_stats.items()
            }

        payload = {
            "class_priors": {k.value: v for k, v in self.class_priors.items()},
            "feature_likelihoods": likelihoods,
            "trained": self._trained,
        }
        with open(path, "w") as f:
            json.dump(payload, f)

    def load(self, path: str | Path) -> None:
        """Restore the state written by ``save`` from the JSON file at ``path``."""
        with open(path) as f:
            payload = json.load(f)

        self.class_priors = {
            DataType(k): v for k, v in payload["class_priors"].items()
        }

        restored = {}
        for fname, class_stats in payload["feature_likelihoods"].items():
            # JSON turns tuples into lists; convert them back.
            restored[fname] = {
                DataType(dtype): tuple(stats) for dtype, stats in class_stats.items()
            }
        self.feature_likelihoods = restored
        self._trained = payload["trained"]
891
+
892
+
893
class EnsembleModel(InferenceModel):
    """Weighted combination of several inference models.

    Each member's probabilities are scaled by its weight, summed per type and
    renormalised.
    """

    name = "ensemble"
    version = "1.0"

    def __init__(
        self,
        models: list[tuple[InferenceModel, float]] | None = None,
    ):
        """Set up the ensemble members.

        Args:
            models: List of (model, weight) tuples. When omitted or empty, a
                rule-based model (weight 0.6) and a Naive Bayes model
                (weight 0.4) are used.
        """
        if not models:
            models = [
                (RuleBasedModel(), 0.6),
                (NaiveBayesModel(), 0.4),
            ]
        self.models = models

    def predict(
        self,
        features: FeatureVector,
    ) -> list[tuple[DataType, float]]:
        """Merge the members' predictions into one ranked list.

        Returns:
            (DataType, probability) pairs, highest first, or
            ``[(DataType.STRING, 1.0)]`` when the combined score is not
            positive.
        """
        combined: dict[DataType, float] = defaultdict(float)
        for member, weight in self.models:
            for dtype, prob in member.predict(features):
                combined[dtype] += weight * prob

        total = sum(combined.values())
        if total > 0:
            ranked = [(dtype, score / total) for dtype, score in combined.items()]
        else:
            ranked = [(DataType.STRING, 1.0)]

        return sorted(ranked, key=lambda pair: pair[1], reverse=True)

    def train(
        self,
        training_data: list[tuple[FeatureVector, DataType]],
    ) -> None:
        """Pass the same training data to every member model."""
        for member, _weight in self.models:
            member.train(training_data)
948
+
949
+
950
+ # =============================================================================
951
+ # Model Registry
952
+ # =============================================================================
953
+
954
+
955
class ModelRegistry:
    """Maps model names to the classes used to build them."""

    def __init__(self) -> None:
        # Stores classes, not instances; ``create`` builds a new instance.
        self._models: dict[str, type[InferenceModel]] = {}

    def register(
        self,
        name: str,
        model_class: type[InferenceModel],
    ) -> None:
        """Associate ``name`` with ``model_class``, replacing any previous entry."""
        self._models[name] = model_class

    def create(self, name: str, **kwargs: Any) -> InferenceModel:
        """Instantiate the model registered as ``name``.

        Args:
            name: Registered model name.
            **kwargs: Forwarded to the model class constructor.

        Raises:
            KeyError: If ``name`` has not been registered.
        """
        try:
            model_class = self._models[name]
        except KeyError:
            raise KeyError(f"Unknown model: {name}") from None
        return model_class(**kwargs)

    def list_models(self) -> list[str]:
        """Return the registered model names in registration order."""
        return [registered_name for registered_name in self._models]
978
+
979
+
980
# Module-wide model registry, pre-populated with the built-in model classes.
model_registry = ModelRegistry()
for _model_name, _model_class in (
    ("rule_based", RuleBasedModel),
    ("naive_bayes", NaiveBayesModel),
    ("ensemble", EnsembleModel),
):
    model_registry.register(_model_name, _model_class)
del _model_name, _model_class
984
+
985
+
986
+ # =============================================================================
987
+ # ML Type Inferrer
988
+ # =============================================================================
989
+
990
+
991
+ @dataclass
992
+ class InferrerConfig:
993
+ """Configuration for ML type inferrer."""
994
+
995
+ model: str = "ensemble"
996
+ confidence_threshold: float = 0.5
997
+ use_caching: bool = True
998
+ cache_size: int = 1000
999
+ enable_learning: bool = True
1000
+ model_path: str | None = None
1001
+
1002
+
1003
class MLTypeInferrer:
    """ML-based type inferrer.

    Main interface for ML-powered type inference: extracts features from a
    column, asks the model for ranked predictions and wraps the best one in
    an ``InferenceResult``. Results are kept in a bounded in-memory cache and
    feedback is buffered for periodic retraining.

    Example:
        inferrer = MLTypeInferrer()

        result = inferrer.infer(column, context={
            "column_name": "email",
            "table_name": "users",
        })

        print(f"Inferred: {result.inferred_type} ({result.confidence:.0%})")
    """

    # Number of buffered feedback samples that triggers a retraining pass.
    _FEEDBACK_BATCH_SIZE = 100

    def __init__(
        self,
        model: str | InferenceModel = "ensemble",
        config: InferrerConfig | None = None,
    ):
        """Create an inferrer.

        Args:
            model: A ready model instance, or a name resolved through
                ``model_registry``.
            config: Inferrer settings; defaults to ``InferrerConfig()``.
                Note that ``config.model`` is not consulted here - the
                ``model`` argument alone selects the model.
        """
        self.config = config or InferrerConfig()

        if isinstance(model, InferenceModel):
            self._model = model
        else:
            self._model = model_registry.create(model)

        self._feature_registry = feature_extractor_registry
        self._cache: dict[str, InferenceResult] = {}
        self._feedback_buffer: list[tuple[FeatureVector, DataType]] = []
        self._lock = threading.Lock()

        # Load saved model if path provided
        if self.config.model_path and Path(self.config.model_path).exists():
            self._model.load(self.config.model_path)

    def infer(
        self,
        column: pl.Series,
        context: dict[str, Any] | None = None,
    ) -> InferenceResult:
        """Infer column type using ML.

        Args:
            column: Column data
            context: Additional context information. The dict is copied, so
                the caller's object is never modified.

        Returns:
            Inference result with type and confidence
        """
        import time

        # perf_counter is monotonic, so the elapsed time cannot go negative
        # when the wall clock is adjusted.
        start = time.perf_counter()

        context = dict(context) if context else {}
        context.setdefault("column_name", column.name or "")

        use_cache = self.config.use_caching
        cache_key = self._make_cache_key(column, context) if use_cache else None

        if use_cache:
            with self._lock:
                cached = self._cache.pop(cache_key, None)
                if cached is not None:
                    # Re-insert so this entry becomes the most recently used.
                    self._cache[cache_key] = cached
                    return cached

        # Extract features
        features = self._feature_registry.extract_all(column, context)

        # Get predictions
        predictions = self._model.predict(features)

        if not predictions:
            predictions = [(DataType.STRING, 0.5)]

        # Build result
        top_type, top_confidence = predictions[0]
        alternatives = predictions[1:5]  # Up to four runner-up candidates

        # Generate reasoning
        reasoning = self._generate_reasoning(features, predictions)

        elapsed_ms = (time.perf_counter() - start) * 1000

        result = InferenceResult(
            column_name=context["column_name"],
            inferred_type=top_type,
            confidence=top_confidence,
            alternatives=alternatives,
            reasoning=reasoning,
            features_used=[f.name for f in features.features[:10]],
            model_version=self._model.version,
            inference_time_ms=elapsed_ms,
        )

        if use_cache:
            with self._lock:
                # Drop any stale entry first so the key moves to the newest slot.
                self._cache.pop(cache_key, None)
                self._cache[cache_key] = result
                # Dicts keep insertion order, so the first key is the least
                # recently used entry.
                while self._cache and len(self._cache) > self.config.cache_size:
                    del self._cache[next(iter(self._cache))]

        return result

    def infer_table(
        self,
        df: pl.DataFrame,
        table_name: str = "",
    ) -> dict[str, InferenceResult]:
        """Infer types for all columns in a table.

        Each column is inferred with a context holding its name, the table
        name, its index, the column count and the names of its immediate
        left/right neighbours.

        Args:
            df: DataFrame to analyze
            table_name: Table name for context

        Returns:
            Dictionary mapping column names to results
        """
        results = {}
        columns = df.columns
        last_index = len(columns) - 1

        for i, col_name in enumerate(columns):
            # Build context with neighboring columns
            neighbors = []
            if i > 0:
                neighbors.append(columns[i - 1])
            if i < last_index:
                neighbors.append(columns[i + 1])

            context = {
                "column_name": col_name,
                "table_name": table_name,
                "column_index": i,
                "total_columns": len(columns),
                "neighbor_columns": neighbors,
            }

            results[col_name] = self.infer(df.get_column(col_name), context)

        return results

    def provide_feedback(
        self,
        column: pl.Series,
        true_type: DataType,
        context: dict[str, Any] | None = None,
    ) -> None:
        """Provide feedback for online learning.

        Does nothing when ``config.enable_learning`` is false. Otherwise the
        sample is buffered; once the buffer holds 100 samples the model is
        trained on them, the buffer is emptied and, if ``config.model_path``
        is set, the model is saved there.

        Args:
            column: Column that was classified
            true_type: The correct type
            context: Context used during inference. The dict is copied, so
                the caller's object is never modified.
        """
        if not self.config.enable_learning:
            return

        context = dict(context) if context else {}
        context.setdefault("column_name", column.name or "")

        features = self._feature_registry.extract_all(column, context)

        with self._lock:
            self._feedback_buffer.append((features, true_type))

            # Retrain when buffer is large enough
            if len(self._feedback_buffer) >= self._FEEDBACK_BATCH_SIZE:
                # Hand over a copy: the buffer is cleared right after, which
                # would otherwise empty a list the model may have kept.
                self._model.train(list(self._feedback_buffer))
                self._feedback_buffer.clear()

                # Save model if path configured
                if self.config.model_path:
                    self._model.save(self.config.model_path)

    def _make_cache_key(
        self,
        column: pl.Series,
        context: dict[str, Any],
    ) -> str:
        """Create cache key for column + context.

        The key covers the column name, the first ten values, the dtype, the
        column length and every context entry, so the same data inferred
        under a different table or position is not served a stale result.
        """
        sample = column.head(10).to_list()
        # Sort entries so the key does not depend on dict insertion order.
        context_part = repr(sorted(context.items(), key=lambda item: str(item[0])))
        key_data = (
            f"{context.get('column_name', '')}:{sample}:{column.dtype}"
            f":{len(column)}:{context_part}"
        )
        # md5 only serves as a compact fingerprint here, not for security.
        return hashlib.md5(key_data.encode()).hexdigest()

    def _generate_reasoning(
        self,
        features: FeatureVector,
        predictions: list[tuple[DataType, float]],
    ) -> list[str]:
        """Generate human-readable reasoning.

        Picks the five features whose values deviate most from 0.5, reports
        those above 0.7 as "High" and below 0.3 as "Low", then appends the
        best prediction.
        """
        reasoning = []

        # Get top features
        sorted_features = sorted(
            features.features,
            key=lambda f: abs(f.value - 0.5),  # Deviation from neutral
            reverse=True,
        )

        for f in sorted_features[:5]:
            if f.value > 0.7:
                reasoning.append(f"High {f.name}: {f.value:.2f}")
            elif f.value < 0.3:
                reasoning.append(f"Low {f.name}: {f.value:.2f}")

        if predictions:
            top_type, top_conf = predictions[0]
            reasoning.append(f"Best match: {top_type.value} ({top_conf:.0%})")

        return reasoning

    def clear_cache(self) -> None:
        """Clear inference cache."""
        with self._lock:
            self._cache.clear()
1218
+
1219
+
1220
+ # =============================================================================
1221
+ # Convenience Functions
1222
+ # =============================================================================
1223
+
1224
+
1225
def create_inference_model(
    model_type: str = "ensemble",
    **kwargs: Any,
) -> InferenceModel:
    """Build a model through the module-level ``model_registry``.

    Args:
        model_type: Model type name
        **kwargs: Forwarded to the model's constructor

    Returns:
        The newly created model
    """
    new_model = model_registry.create(model_type, **kwargs)
    return new_model
1239
+
1240
+
1241
def infer_column_type_ml(
    column: pl.Series,
    context: dict[str, Any] | None = None,
    model: str = "ensemble",
) -> InferenceResult:
    """Infer one column's type with a newly created ``MLTypeInferrer``.

    Args:
        column: Column to analyze
        context: Additional context
        model: Model to use

    Returns:
        Inference result
    """
    return MLTypeInferrer(model=model).infer(column, context)
1258
+
1259
+
1260
def infer_table_types_ml(
    df: pl.DataFrame,
    table_name: str = "",
    model: str = "ensemble",
) -> dict[str, InferenceResult]:
    """Infer every column's type with a newly created ``MLTypeInferrer``.

    Args:
        df: DataFrame to analyze
        table_name: Table name for context
        model: Model to use

    Returns:
        Dictionary of column results
    """
    return MLTypeInferrer(model=model).infer_table(df, table_name)