truthound 1.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (877)
  1. truthound/__init__.py +162 -0
  2. truthound/adapters.py +100 -0
  3. truthound/api.py +365 -0
  4. truthound/audit/__init__.py +248 -0
  5. truthound/audit/core.py +967 -0
  6. truthound/audit/filters.py +620 -0
  7. truthound/audit/formatters.py +707 -0
  8. truthound/audit/logger.py +902 -0
  9. truthound/audit/middleware.py +571 -0
  10. truthound/audit/storage.py +1083 -0
  11. truthound/benchmark/__init__.py +123 -0
  12. truthound/benchmark/base.py +757 -0
  13. truthound/benchmark/comparison.py +635 -0
  14. truthound/benchmark/generators.py +706 -0
  15. truthound/benchmark/reporters.py +718 -0
  16. truthound/benchmark/runner.py +635 -0
  17. truthound/benchmark/scenarios.py +712 -0
  18. truthound/cache.py +252 -0
  19. truthound/checkpoint/__init__.py +136 -0
  20. truthound/checkpoint/actions/__init__.py +164 -0
  21. truthound/checkpoint/actions/base.py +324 -0
  22. truthound/checkpoint/actions/custom.py +234 -0
  23. truthound/checkpoint/actions/discord_notify.py +290 -0
  24. truthound/checkpoint/actions/email_notify.py +405 -0
  25. truthound/checkpoint/actions/github_action.py +406 -0
  26. truthound/checkpoint/actions/opsgenie.py +1499 -0
  27. truthound/checkpoint/actions/pagerduty.py +226 -0
  28. truthound/checkpoint/actions/slack_notify.py +233 -0
  29. truthound/checkpoint/actions/store_result.py +249 -0
  30. truthound/checkpoint/actions/teams_notify.py +1570 -0
  31. truthound/checkpoint/actions/telegram_notify.py +419 -0
  32. truthound/checkpoint/actions/update_docs.py +552 -0
  33. truthound/checkpoint/actions/webhook.py +293 -0
  34. truthound/checkpoint/analytics/__init__.py +147 -0
  35. truthound/checkpoint/analytics/aggregations/__init__.py +23 -0
  36. truthound/checkpoint/analytics/aggregations/rollup.py +481 -0
  37. truthound/checkpoint/analytics/aggregations/time_bucket.py +306 -0
  38. truthound/checkpoint/analytics/analyzers/__init__.py +17 -0
  39. truthound/checkpoint/analytics/analyzers/anomaly.py +386 -0
  40. truthound/checkpoint/analytics/analyzers/base.py +270 -0
  41. truthound/checkpoint/analytics/analyzers/forecast.py +421 -0
  42. truthound/checkpoint/analytics/analyzers/trend.py +314 -0
  43. truthound/checkpoint/analytics/models.py +292 -0
  44. truthound/checkpoint/analytics/protocols.py +549 -0
  45. truthound/checkpoint/analytics/service.py +718 -0
  46. truthound/checkpoint/analytics/stores/__init__.py +16 -0
  47. truthound/checkpoint/analytics/stores/base.py +306 -0
  48. truthound/checkpoint/analytics/stores/memory_store.py +353 -0
  49. truthound/checkpoint/analytics/stores/sqlite_store.py +557 -0
  50. truthound/checkpoint/analytics/stores/timescale_store.py +501 -0
  51. truthound/checkpoint/async_actions.py +794 -0
  52. truthound/checkpoint/async_base.py +708 -0
  53. truthound/checkpoint/async_checkpoint.py +617 -0
  54. truthound/checkpoint/async_runner.py +639 -0
  55. truthound/checkpoint/checkpoint.py +527 -0
  56. truthound/checkpoint/ci/__init__.py +61 -0
  57. truthound/checkpoint/ci/detector.py +355 -0
  58. truthound/checkpoint/ci/reporter.py +436 -0
  59. truthound/checkpoint/ci/templates.py +454 -0
  60. truthound/checkpoint/circuitbreaker/__init__.py +133 -0
  61. truthound/checkpoint/circuitbreaker/breaker.py +542 -0
  62. truthound/checkpoint/circuitbreaker/core.py +252 -0
  63. truthound/checkpoint/circuitbreaker/detection.py +459 -0
  64. truthound/checkpoint/circuitbreaker/middleware.py +389 -0
  65. truthound/checkpoint/circuitbreaker/registry.py +357 -0
  66. truthound/checkpoint/distributed/__init__.py +139 -0
  67. truthound/checkpoint/distributed/backends/__init__.py +35 -0
  68. truthound/checkpoint/distributed/backends/celery_backend.py +503 -0
  69. truthound/checkpoint/distributed/backends/kubernetes_backend.py +696 -0
  70. truthound/checkpoint/distributed/backends/local_backend.py +397 -0
  71. truthound/checkpoint/distributed/backends/ray_backend.py +625 -0
  72. truthound/checkpoint/distributed/base.py +774 -0
  73. truthound/checkpoint/distributed/orchestrator.py +765 -0
  74. truthound/checkpoint/distributed/protocols.py +842 -0
  75. truthound/checkpoint/distributed/registry.py +449 -0
  76. truthound/checkpoint/idempotency/__init__.py +120 -0
  77. truthound/checkpoint/idempotency/core.py +295 -0
  78. truthound/checkpoint/idempotency/fingerprint.py +454 -0
  79. truthound/checkpoint/idempotency/locking.py +604 -0
  80. truthound/checkpoint/idempotency/service.py +592 -0
  81. truthound/checkpoint/idempotency/stores.py +653 -0
  82. truthound/checkpoint/monitoring/__init__.py +134 -0
  83. truthound/checkpoint/monitoring/aggregators/__init__.py +15 -0
  84. truthound/checkpoint/monitoring/aggregators/base.py +372 -0
  85. truthound/checkpoint/monitoring/aggregators/realtime.py +300 -0
  86. truthound/checkpoint/monitoring/aggregators/window.py +493 -0
  87. truthound/checkpoint/monitoring/collectors/__init__.py +17 -0
  88. truthound/checkpoint/monitoring/collectors/base.py +257 -0
  89. truthound/checkpoint/monitoring/collectors/memory_collector.py +617 -0
  90. truthound/checkpoint/monitoring/collectors/prometheus_collector.py +451 -0
  91. truthound/checkpoint/monitoring/collectors/redis_collector.py +518 -0
  92. truthound/checkpoint/monitoring/events.py +410 -0
  93. truthound/checkpoint/monitoring/protocols.py +636 -0
  94. truthound/checkpoint/monitoring/service.py +578 -0
  95. truthound/checkpoint/monitoring/views/__init__.py +17 -0
  96. truthound/checkpoint/monitoring/views/base.py +172 -0
  97. truthound/checkpoint/monitoring/views/queue_view.py +220 -0
  98. truthound/checkpoint/monitoring/views/task_view.py +240 -0
  99. truthound/checkpoint/monitoring/views/worker_view.py +263 -0
  100. truthound/checkpoint/registry.py +337 -0
  101. truthound/checkpoint/runner.py +356 -0
  102. truthound/checkpoint/transaction/__init__.py +133 -0
  103. truthound/checkpoint/transaction/base.py +389 -0
  104. truthound/checkpoint/transaction/compensatable.py +537 -0
  105. truthound/checkpoint/transaction/coordinator.py +576 -0
  106. truthound/checkpoint/transaction/executor.py +622 -0
  107. truthound/checkpoint/transaction/idempotency.py +534 -0
  108. truthound/checkpoint/transaction/saga/__init__.py +143 -0
  109. truthound/checkpoint/transaction/saga/builder.py +584 -0
  110. truthound/checkpoint/transaction/saga/definition.py +515 -0
  111. truthound/checkpoint/transaction/saga/event_store.py +542 -0
  112. truthound/checkpoint/transaction/saga/patterns.py +833 -0
  113. truthound/checkpoint/transaction/saga/runner.py +718 -0
  114. truthound/checkpoint/transaction/saga/state_machine.py +793 -0
  115. truthound/checkpoint/transaction/saga/strategies.py +780 -0
  116. truthound/checkpoint/transaction/saga/testing.py +886 -0
  117. truthound/checkpoint/triggers/__init__.py +58 -0
  118. truthound/checkpoint/triggers/base.py +237 -0
  119. truthound/checkpoint/triggers/event.py +385 -0
  120. truthound/checkpoint/triggers/schedule.py +355 -0
  121. truthound/cli.py +2358 -0
  122. truthound/cli_modules/__init__.py +124 -0
  123. truthound/cli_modules/advanced/__init__.py +45 -0
  124. truthound/cli_modules/advanced/benchmark.py +343 -0
  125. truthound/cli_modules/advanced/docs.py +225 -0
  126. truthound/cli_modules/advanced/lineage.py +209 -0
  127. truthound/cli_modules/advanced/ml.py +320 -0
  128. truthound/cli_modules/advanced/realtime.py +196 -0
  129. truthound/cli_modules/checkpoint/__init__.py +46 -0
  130. truthound/cli_modules/checkpoint/init.py +114 -0
  131. truthound/cli_modules/checkpoint/list.py +71 -0
  132. truthound/cli_modules/checkpoint/run.py +159 -0
  133. truthound/cli_modules/checkpoint/validate.py +67 -0
  134. truthound/cli_modules/common/__init__.py +71 -0
  135. truthound/cli_modules/common/errors.py +414 -0
  136. truthound/cli_modules/common/options.py +419 -0
  137. truthound/cli_modules/common/output.py +507 -0
  138. truthound/cli_modules/common/protocol.py +552 -0
  139. truthound/cli_modules/core/__init__.py +48 -0
  140. truthound/cli_modules/core/check.py +123 -0
  141. truthound/cli_modules/core/compare.py +104 -0
  142. truthound/cli_modules/core/learn.py +57 -0
  143. truthound/cli_modules/core/mask.py +77 -0
  144. truthound/cli_modules/core/profile.py +65 -0
  145. truthound/cli_modules/core/scan.py +61 -0
  146. truthound/cli_modules/profiler/__init__.py +51 -0
  147. truthound/cli_modules/profiler/auto_profile.py +175 -0
  148. truthound/cli_modules/profiler/metadata.py +107 -0
  149. truthound/cli_modules/profiler/suite.py +283 -0
  150. truthound/cli_modules/registry.py +431 -0
  151. truthound/cli_modules/scaffolding/__init__.py +89 -0
  152. truthound/cli_modules/scaffolding/base.py +631 -0
  153. truthound/cli_modules/scaffolding/commands.py +545 -0
  154. truthound/cli_modules/scaffolding/plugins.py +1072 -0
  155. truthound/cli_modules/scaffolding/reporters.py +594 -0
  156. truthound/cli_modules/scaffolding/validators.py +1127 -0
  157. truthound/common/__init__.py +18 -0
  158. truthound/common/resilience/__init__.py +130 -0
  159. truthound/common/resilience/bulkhead.py +266 -0
  160. truthound/common/resilience/circuit_breaker.py +516 -0
  161. truthound/common/resilience/composite.py +332 -0
  162. truthound/common/resilience/config.py +292 -0
  163. truthound/common/resilience/protocols.py +217 -0
  164. truthound/common/resilience/rate_limiter.py +404 -0
  165. truthound/common/resilience/retry.py +341 -0
  166. truthound/datadocs/__init__.py +260 -0
  167. truthound/datadocs/base.py +571 -0
  168. truthound/datadocs/builder.py +761 -0
  169. truthound/datadocs/charts.py +764 -0
  170. truthound/datadocs/dashboard/__init__.py +63 -0
  171. truthound/datadocs/dashboard/app.py +576 -0
  172. truthound/datadocs/dashboard/components.py +584 -0
  173. truthound/datadocs/dashboard/state.py +240 -0
  174. truthound/datadocs/engine/__init__.py +46 -0
  175. truthound/datadocs/engine/context.py +376 -0
  176. truthound/datadocs/engine/pipeline.py +618 -0
  177. truthound/datadocs/engine/registry.py +469 -0
  178. truthound/datadocs/exporters/__init__.py +49 -0
  179. truthound/datadocs/exporters/base.py +198 -0
  180. truthound/datadocs/exporters/html.py +178 -0
  181. truthound/datadocs/exporters/json_exporter.py +253 -0
  182. truthound/datadocs/exporters/markdown.py +284 -0
  183. truthound/datadocs/exporters/pdf.py +392 -0
  184. truthound/datadocs/i18n/__init__.py +86 -0
  185. truthound/datadocs/i18n/catalog.py +960 -0
  186. truthound/datadocs/i18n/formatting.py +505 -0
  187. truthound/datadocs/i18n/loader.py +256 -0
  188. truthound/datadocs/i18n/plurals.py +378 -0
  189. truthound/datadocs/renderers/__init__.py +42 -0
  190. truthound/datadocs/renderers/base.py +401 -0
  191. truthound/datadocs/renderers/custom.py +342 -0
  192. truthound/datadocs/renderers/jinja.py +697 -0
  193. truthound/datadocs/sections.py +736 -0
  194. truthound/datadocs/styles.py +931 -0
  195. truthound/datadocs/themes/__init__.py +101 -0
  196. truthound/datadocs/themes/base.py +336 -0
  197. truthound/datadocs/themes/default.py +417 -0
  198. truthound/datadocs/themes/enterprise.py +419 -0
  199. truthound/datadocs/themes/loader.py +336 -0
  200. truthound/datadocs/themes.py +301 -0
  201. truthound/datadocs/transformers/__init__.py +57 -0
  202. truthound/datadocs/transformers/base.py +268 -0
  203. truthound/datadocs/transformers/enrichers.py +544 -0
  204. truthound/datadocs/transformers/filters.py +447 -0
  205. truthound/datadocs/transformers/i18n.py +468 -0
  206. truthound/datadocs/versioning/__init__.py +62 -0
  207. truthound/datadocs/versioning/diff.py +639 -0
  208. truthound/datadocs/versioning/storage.py +497 -0
  209. truthound/datadocs/versioning/version.py +358 -0
  210. truthound/datasources/__init__.py +223 -0
  211. truthound/datasources/_async_protocols.py +222 -0
  212. truthound/datasources/_protocols.py +159 -0
  213. truthound/datasources/adapters.py +428 -0
  214. truthound/datasources/async_base.py +599 -0
  215. truthound/datasources/async_factory.py +511 -0
  216. truthound/datasources/base.py +516 -0
  217. truthound/datasources/factory.py +433 -0
  218. truthound/datasources/nosql/__init__.py +47 -0
  219. truthound/datasources/nosql/base.py +487 -0
  220. truthound/datasources/nosql/elasticsearch.py +801 -0
  221. truthound/datasources/nosql/mongodb.py +636 -0
  222. truthound/datasources/pandas_optimized.py +582 -0
  223. truthound/datasources/pandas_source.py +216 -0
  224. truthound/datasources/polars_source.py +395 -0
  225. truthound/datasources/spark_source.py +479 -0
  226. truthound/datasources/sql/__init__.py +154 -0
  227. truthound/datasources/sql/base.py +710 -0
  228. truthound/datasources/sql/bigquery.py +410 -0
  229. truthound/datasources/sql/cloud_base.py +199 -0
  230. truthound/datasources/sql/databricks.py +471 -0
  231. truthound/datasources/sql/mysql.py +316 -0
  232. truthound/datasources/sql/oracle.py +427 -0
  233. truthound/datasources/sql/postgresql.py +321 -0
  234. truthound/datasources/sql/redshift.py +479 -0
  235. truthound/datasources/sql/snowflake.py +439 -0
  236. truthound/datasources/sql/sqlite.py +286 -0
  237. truthound/datasources/sql/sqlserver.py +437 -0
  238. truthound/datasources/streaming/__init__.py +47 -0
  239. truthound/datasources/streaming/base.py +350 -0
  240. truthound/datasources/streaming/kafka.py +670 -0
  241. truthound/decorators.py +98 -0
  242. truthound/docs/__init__.py +69 -0
  243. truthound/docs/extractor.py +971 -0
  244. truthound/docs/generator.py +601 -0
  245. truthound/docs/parser.py +1037 -0
  246. truthound/docs/renderer.py +999 -0
  247. truthound/drift/__init__.py +22 -0
  248. truthound/drift/compare.py +189 -0
  249. truthound/drift/detectors.py +464 -0
  250. truthound/drift/report.py +160 -0
  251. truthound/execution/__init__.py +65 -0
  252. truthound/execution/_protocols.py +324 -0
  253. truthound/execution/base.py +576 -0
  254. truthound/execution/distributed/__init__.py +179 -0
  255. truthound/execution/distributed/aggregations.py +731 -0
  256. truthound/execution/distributed/arrow_bridge.py +817 -0
  257. truthound/execution/distributed/base.py +550 -0
  258. truthound/execution/distributed/dask_engine.py +976 -0
  259. truthound/execution/distributed/mixins.py +766 -0
  260. truthound/execution/distributed/protocols.py +756 -0
  261. truthound/execution/distributed/ray_engine.py +1127 -0
  262. truthound/execution/distributed/registry.py +446 -0
  263. truthound/execution/distributed/spark_engine.py +1011 -0
  264. truthound/execution/distributed/validator_adapter.py +682 -0
  265. truthound/execution/pandas_engine.py +401 -0
  266. truthound/execution/polars_engine.py +497 -0
  267. truthound/execution/pushdown/__init__.py +230 -0
  268. truthound/execution/pushdown/ast.py +1550 -0
  269. truthound/execution/pushdown/builder.py +1550 -0
  270. truthound/execution/pushdown/dialects.py +1072 -0
  271. truthound/execution/pushdown/executor.py +829 -0
  272. truthound/execution/pushdown/optimizer.py +1041 -0
  273. truthound/execution/sql_engine.py +518 -0
  274. truthound/infrastructure/__init__.py +189 -0
  275. truthound/infrastructure/audit.py +1515 -0
  276. truthound/infrastructure/config.py +1133 -0
  277. truthound/infrastructure/encryption.py +1132 -0
  278. truthound/infrastructure/logging.py +1503 -0
  279. truthound/infrastructure/metrics.py +1220 -0
  280. truthound/lineage/__init__.py +89 -0
  281. truthound/lineage/base.py +746 -0
  282. truthound/lineage/impact_analysis.py +474 -0
  283. truthound/lineage/integrations/__init__.py +22 -0
  284. truthound/lineage/integrations/openlineage.py +548 -0
  285. truthound/lineage/tracker.py +512 -0
  286. truthound/lineage/visualization/__init__.py +33 -0
  287. truthound/lineage/visualization/protocols.py +145 -0
  288. truthound/lineage/visualization/renderers/__init__.py +20 -0
  289. truthound/lineage/visualization/renderers/cytoscape.py +329 -0
  290. truthound/lineage/visualization/renderers/d3.py +331 -0
  291. truthound/lineage/visualization/renderers/graphviz.py +276 -0
  292. truthound/lineage/visualization/renderers/mermaid.py +308 -0
  293. truthound/maskers.py +113 -0
  294. truthound/ml/__init__.py +124 -0
  295. truthound/ml/anomaly_models/__init__.py +31 -0
  296. truthound/ml/anomaly_models/ensemble.py +362 -0
  297. truthound/ml/anomaly_models/isolation_forest.py +444 -0
  298. truthound/ml/anomaly_models/statistical.py +392 -0
  299. truthound/ml/base.py +1178 -0
  300. truthound/ml/drift_detection/__init__.py +26 -0
  301. truthound/ml/drift_detection/concept.py +381 -0
  302. truthound/ml/drift_detection/distribution.py +361 -0
  303. truthound/ml/drift_detection/feature.py +442 -0
  304. truthound/ml/drift_detection/multivariate.py +495 -0
  305. truthound/ml/monitoring/__init__.py +88 -0
  306. truthound/ml/monitoring/alerting/__init__.py +33 -0
  307. truthound/ml/monitoring/alerting/handlers.py +427 -0
  308. truthound/ml/monitoring/alerting/rules.py +508 -0
  309. truthound/ml/monitoring/collectors/__init__.py +19 -0
  310. truthound/ml/monitoring/collectors/composite.py +105 -0
  311. truthound/ml/monitoring/collectors/drift.py +324 -0
  312. truthound/ml/monitoring/collectors/performance.py +179 -0
  313. truthound/ml/monitoring/collectors/quality.py +369 -0
  314. truthound/ml/monitoring/monitor.py +536 -0
  315. truthound/ml/monitoring/protocols.py +451 -0
  316. truthound/ml/monitoring/stores/__init__.py +15 -0
  317. truthound/ml/monitoring/stores/memory.py +201 -0
  318. truthound/ml/monitoring/stores/prometheus.py +296 -0
  319. truthound/ml/rule_learning/__init__.py +25 -0
  320. truthound/ml/rule_learning/constraint_miner.py +443 -0
  321. truthound/ml/rule_learning/pattern_learner.py +499 -0
  322. truthound/ml/rule_learning/profile_learner.py +462 -0
  323. truthound/multitenancy/__init__.py +326 -0
  324. truthound/multitenancy/core.py +852 -0
  325. truthound/multitenancy/integration.py +597 -0
  326. truthound/multitenancy/isolation.py +630 -0
  327. truthound/multitenancy/manager.py +770 -0
  328. truthound/multitenancy/middleware.py +765 -0
  329. truthound/multitenancy/quota.py +537 -0
  330. truthound/multitenancy/resolvers.py +603 -0
  331. truthound/multitenancy/storage.py +703 -0
  332. truthound/observability/__init__.py +307 -0
  333. truthound/observability/context.py +531 -0
  334. truthound/observability/instrumentation.py +611 -0
  335. truthound/observability/logging.py +887 -0
  336. truthound/observability/metrics.py +1157 -0
  337. truthound/observability/tracing/__init__.py +178 -0
  338. truthound/observability/tracing/baggage.py +310 -0
  339. truthound/observability/tracing/config.py +426 -0
  340. truthound/observability/tracing/exporter.py +787 -0
  341. truthound/observability/tracing/integration.py +1018 -0
  342. truthound/observability/tracing/otel/__init__.py +146 -0
  343. truthound/observability/tracing/otel/adapter.py +982 -0
  344. truthound/observability/tracing/otel/bridge.py +1177 -0
  345. truthound/observability/tracing/otel/compat.py +681 -0
  346. truthound/observability/tracing/otel/config.py +691 -0
  347. truthound/observability/tracing/otel/detection.py +327 -0
  348. truthound/observability/tracing/otel/protocols.py +426 -0
  349. truthound/observability/tracing/processor.py +561 -0
  350. truthound/observability/tracing/propagator.py +757 -0
  351. truthound/observability/tracing/provider.py +569 -0
  352. truthound/observability/tracing/resource.py +515 -0
  353. truthound/observability/tracing/sampler.py +487 -0
  354. truthound/observability/tracing/span.py +676 -0
  355. truthound/plugins/__init__.py +198 -0
  356. truthound/plugins/base.py +599 -0
  357. truthound/plugins/cli.py +680 -0
  358. truthound/plugins/dependencies/__init__.py +42 -0
  359. truthound/plugins/dependencies/graph.py +422 -0
  360. truthound/plugins/dependencies/resolver.py +417 -0
  361. truthound/plugins/discovery.py +379 -0
  362. truthound/plugins/docs/__init__.py +46 -0
  363. truthound/plugins/docs/extractor.py +444 -0
  364. truthound/plugins/docs/renderer.py +499 -0
  365. truthound/plugins/enterprise_manager.py +877 -0
  366. truthound/plugins/examples/__init__.py +19 -0
  367. truthound/plugins/examples/custom_validators.py +317 -0
  368. truthound/plugins/examples/slack_notifier.py +312 -0
  369. truthound/plugins/examples/xml_reporter.py +254 -0
  370. truthound/plugins/hooks.py +558 -0
  371. truthound/plugins/lifecycle/__init__.py +43 -0
  372. truthound/plugins/lifecycle/hot_reload.py +402 -0
  373. truthound/plugins/lifecycle/manager.py +371 -0
  374. truthound/plugins/manager.py +736 -0
  375. truthound/plugins/registry.py +338 -0
  376. truthound/plugins/security/__init__.py +93 -0
  377. truthound/plugins/security/exceptions.py +332 -0
  378. truthound/plugins/security/policies.py +348 -0
  379. truthound/plugins/security/protocols.py +643 -0
  380. truthound/plugins/security/sandbox/__init__.py +45 -0
  381. truthound/plugins/security/sandbox/context.py +158 -0
  382. truthound/plugins/security/sandbox/engines/__init__.py +19 -0
  383. truthound/plugins/security/sandbox/engines/container.py +379 -0
  384. truthound/plugins/security/sandbox/engines/noop.py +144 -0
  385. truthound/plugins/security/sandbox/engines/process.py +336 -0
  386. truthound/plugins/security/sandbox/factory.py +211 -0
  387. truthound/plugins/security/signing/__init__.py +57 -0
  388. truthound/plugins/security/signing/service.py +330 -0
  389. truthound/plugins/security/signing/trust_store.py +368 -0
  390. truthound/plugins/security/signing/verifier.py +459 -0
  391. truthound/plugins/versioning/__init__.py +41 -0
  392. truthound/plugins/versioning/constraints.py +297 -0
  393. truthound/plugins/versioning/resolver.py +329 -0
  394. truthound/profiler/__init__.py +1729 -0
  395. truthound/profiler/_lazy.py +452 -0
  396. truthound/profiler/ab_testing/__init__.py +80 -0
  397. truthound/profiler/ab_testing/analysis.py +449 -0
  398. truthound/profiler/ab_testing/base.py +257 -0
  399. truthound/profiler/ab_testing/experiment.py +395 -0
  400. truthound/profiler/ab_testing/tracking.py +368 -0
  401. truthound/profiler/auto_threshold.py +1170 -0
  402. truthound/profiler/base.py +579 -0
  403. truthound/profiler/cache_patterns.py +911 -0
  404. truthound/profiler/caching.py +1303 -0
  405. truthound/profiler/column_profiler.py +712 -0
  406. truthound/profiler/comparison.py +1007 -0
  407. truthound/profiler/custom_patterns.py +1170 -0
  408. truthound/profiler/dashboard/__init__.py +50 -0
  409. truthound/profiler/dashboard/app.py +476 -0
  410. truthound/profiler/dashboard/components.py +457 -0
  411. truthound/profiler/dashboard/config.py +72 -0
  412. truthound/profiler/distributed/__init__.py +83 -0
  413. truthound/profiler/distributed/base.py +281 -0
  414. truthound/profiler/distributed/dask_backend.py +498 -0
  415. truthound/profiler/distributed/local_backend.py +293 -0
  416. truthound/profiler/distributed/profiler.py +304 -0
  417. truthound/profiler/distributed/ray_backend.py +374 -0
  418. truthound/profiler/distributed/spark_backend.py +375 -0
  419. truthound/profiler/distributed.py +1366 -0
  420. truthound/profiler/enterprise_sampling.py +1065 -0
  421. truthound/profiler/errors.py +488 -0
  422. truthound/profiler/evolution/__init__.py +91 -0
  423. truthound/profiler/evolution/alerts.py +426 -0
  424. truthound/profiler/evolution/changes.py +206 -0
  425. truthound/profiler/evolution/compatibility.py +365 -0
  426. truthound/profiler/evolution/detector.py +372 -0
  427. truthound/profiler/evolution/protocols.py +121 -0
  428. truthound/profiler/generators/__init__.py +48 -0
  429. truthound/profiler/generators/base.py +384 -0
  430. truthound/profiler/generators/ml_rules.py +375 -0
  431. truthound/profiler/generators/pattern_rules.py +384 -0
  432. truthound/profiler/generators/schema_rules.py +267 -0
  433. truthound/profiler/generators/stats_rules.py +324 -0
  434. truthound/profiler/generators/suite_generator.py +857 -0
  435. truthound/profiler/i18n.py +1542 -0
  436. truthound/profiler/incremental.py +554 -0
  437. truthound/profiler/incremental_validation.py +1710 -0
  438. truthound/profiler/integration/__init__.py +73 -0
  439. truthound/profiler/integration/adapters.py +345 -0
  440. truthound/profiler/integration/context.py +371 -0
  441. truthound/profiler/integration/executor.py +527 -0
  442. truthound/profiler/integration/naming.py +75 -0
  443. truthound/profiler/integration/protocols.py +243 -0
  444. truthound/profiler/memory.py +1185 -0
  445. truthound/profiler/migration/__init__.py +60 -0
  446. truthound/profiler/migration/base.py +345 -0
  447. truthound/profiler/migration/manager.py +444 -0
  448. truthound/profiler/migration/v1_0_to_v1_1.py +484 -0
  449. truthound/profiler/ml/__init__.py +73 -0
  450. truthound/profiler/ml/base.py +244 -0
  451. truthound/profiler/ml/classifier.py +507 -0
  452. truthound/profiler/ml/feature_extraction.py +604 -0
  453. truthound/profiler/ml/pretrained.py +448 -0
  454. truthound/profiler/ml_inference.py +1276 -0
  455. truthound/profiler/native_patterns.py +815 -0
  456. truthound/profiler/observability.py +1184 -0
  457. truthound/profiler/process_timeout.py +1566 -0
  458. truthound/profiler/progress.py +568 -0
  459. truthound/profiler/progress_callbacks.py +1734 -0
  460. truthound/profiler/quality.py +1345 -0
  461. truthound/profiler/resilience.py +1180 -0
  462. truthound/profiler/sampled_matcher.py +794 -0
  463. truthound/profiler/sampling.py +1288 -0
  464. truthound/profiler/scheduling/__init__.py +82 -0
  465. truthound/profiler/scheduling/protocols.py +214 -0
  466. truthound/profiler/scheduling/scheduler.py +474 -0
  467. truthound/profiler/scheduling/storage.py +457 -0
  468. truthound/profiler/scheduling/triggers.py +449 -0
  469. truthound/profiler/schema.py +603 -0
  470. truthound/profiler/streaming.py +685 -0
  471. truthound/profiler/streaming_patterns.py +1354 -0
  472. truthound/profiler/suite_cli.py +625 -0
  473. truthound/profiler/suite_config.py +789 -0
  474. truthound/profiler/suite_export.py +1268 -0
  475. truthound/profiler/table_profiler.py +547 -0
  476. truthound/profiler/timeout.py +565 -0
  477. truthound/profiler/validation.py +1532 -0
  478. truthound/profiler/visualization/__init__.py +118 -0
  479. truthound/profiler/visualization/base.py +346 -0
  480. truthound/profiler/visualization/generator.py +1259 -0
  481. truthound/profiler/visualization/plotly_renderer.py +811 -0
  482. truthound/profiler/visualization/renderers.py +669 -0
  483. truthound/profiler/visualization/sections.py +540 -0
  484. truthound/profiler/visualization.py +2122 -0
  485. truthound/profiler/yaml_validation.py +1151 -0
  486. truthound/py.typed +0 -0
  487. truthound/ratelimit/__init__.py +248 -0
  488. truthound/ratelimit/algorithms.py +1108 -0
  489. truthound/ratelimit/core.py +573 -0
  490. truthound/ratelimit/integration.py +532 -0
  491. truthound/ratelimit/limiter.py +663 -0
  492. truthound/ratelimit/middleware.py +700 -0
  493. truthound/ratelimit/policy.py +792 -0
  494. truthound/ratelimit/storage.py +763 -0
  495. truthound/rbac/__init__.py +340 -0
  496. truthound/rbac/core.py +976 -0
  497. truthound/rbac/integration.py +760 -0
  498. truthound/rbac/manager.py +1052 -0
  499. truthound/rbac/middleware.py +842 -0
  500. truthound/rbac/policy.py +954 -0
  501. truthound/rbac/storage.py +878 -0
  502. truthound/realtime/__init__.py +141 -0
  503. truthound/realtime/adapters/__init__.py +43 -0
  504. truthound/realtime/adapters/base.py +533 -0
  505. truthound/realtime/adapters/kafka.py +487 -0
  506. truthound/realtime/adapters/kinesis.py +479 -0
  507. truthound/realtime/adapters/mock.py +243 -0
  508. truthound/realtime/base.py +553 -0
  509. truthound/realtime/factory.py +382 -0
  510. truthound/realtime/incremental.py +660 -0
  511. truthound/realtime/processing/__init__.py +67 -0
  512. truthound/realtime/processing/exactly_once.py +575 -0
  513. truthound/realtime/processing/state.py +547 -0
  514. truthound/realtime/processing/windows.py +647 -0
  515. truthound/realtime/protocols.py +569 -0
  516. truthound/realtime/streaming.py +605 -0
  517. truthound/realtime/testing/__init__.py +32 -0
  518. truthound/realtime/testing/containers.py +615 -0
  519. truthound/realtime/testing/fixtures.py +484 -0
  520. truthound/report.py +280 -0
  521. truthound/reporters/__init__.py +46 -0
  522. truthound/reporters/_protocols.py +30 -0
  523. truthound/reporters/base.py +324 -0
  524. truthound/reporters/ci/__init__.py +66 -0
  525. truthound/reporters/ci/azure.py +436 -0
  526. truthound/reporters/ci/base.py +509 -0
  527. truthound/reporters/ci/bitbucket.py +567 -0
  528. truthound/reporters/ci/circleci.py +547 -0
  529. truthound/reporters/ci/detection.py +364 -0
  530. truthound/reporters/ci/factory.py +182 -0
  531. truthound/reporters/ci/github.py +388 -0
  532. truthound/reporters/ci/gitlab.py +471 -0
  533. truthound/reporters/ci/jenkins.py +525 -0
  534. truthound/reporters/console_reporter.py +299 -0
  535. truthound/reporters/factory.py +211 -0
  536. truthound/reporters/html_reporter.py +524 -0
  537. truthound/reporters/json_reporter.py +256 -0
  538. truthound/reporters/markdown_reporter.py +280 -0
  539. truthound/reporters/sdk/__init__.py +174 -0
  540. truthound/reporters/sdk/builder.py +558 -0
  541. truthound/reporters/sdk/mixins.py +1150 -0
  542. truthound/reporters/sdk/schema.py +1493 -0
  543. truthound/reporters/sdk/templates.py +666 -0
  544. truthound/reporters/sdk/testing.py +968 -0
  545. truthound/scanners.py +170 -0
  546. truthound/scheduling/__init__.py +122 -0
  547. truthound/scheduling/cron.py +1136 -0
  548. truthound/scheduling/presets.py +212 -0
  549. truthound/schema.py +275 -0
  550. truthound/secrets/__init__.py +173 -0
  551. truthound/secrets/base.py +618 -0
  552. truthound/secrets/cloud.py +682 -0
  553. truthound/secrets/integration.py +507 -0
  554. truthound/secrets/manager.py +633 -0
  555. truthound/secrets/oidc/__init__.py +172 -0
  556. truthound/secrets/oidc/base.py +902 -0
  557. truthound/secrets/oidc/credential_provider.py +623 -0
  558. truthound/secrets/oidc/exchangers.py +1001 -0
  559. truthound/secrets/oidc/github/__init__.py +110 -0
  560. truthound/secrets/oidc/github/claims.py +718 -0
  561. truthound/secrets/oidc/github/enhanced_provider.py +693 -0
  562. truthound/secrets/oidc/github/trust_policy.py +742 -0
  563. truthound/secrets/oidc/github/verification.py +723 -0
  564. truthound/secrets/oidc/github/workflow.py +691 -0
  565. truthound/secrets/oidc/providers.py +825 -0
  566. truthound/secrets/providers.py +506 -0
  567. truthound/secrets/resolver.py +495 -0
  568. truthound/stores/__init__.py +177 -0
  569. truthound/stores/backends/__init__.py +18 -0
  570. truthound/stores/backends/_protocols.py +340 -0
  571. truthound/stores/backends/azure_blob.py +530 -0
  572. truthound/stores/backends/concurrent_filesystem.py +915 -0
  573. truthound/stores/backends/connection_pool.py +1365 -0
  574. truthound/stores/backends/database.py +743 -0
  575. truthound/stores/backends/filesystem.py +538 -0
  576. truthound/stores/backends/gcs.py +399 -0
  577. truthound/stores/backends/memory.py +354 -0
  578. truthound/stores/backends/s3.py +434 -0
  579. truthound/stores/backpressure/__init__.py +84 -0
  580. truthound/stores/backpressure/base.py +375 -0
  581. truthound/stores/backpressure/circuit_breaker.py +434 -0
  582. truthound/stores/backpressure/monitor.py +376 -0
  583. truthound/stores/backpressure/strategies.py +677 -0
  584. truthound/stores/base.py +551 -0
  585. truthound/stores/batching/__init__.py +65 -0
  586. truthound/stores/batching/base.py +305 -0
  587. truthound/stores/batching/buffer.py +370 -0
  588. truthound/stores/batching/store.py +248 -0
  589. truthound/stores/batching/writer.py +521 -0
  590. truthound/stores/caching/__init__.py +60 -0
  591. truthound/stores/caching/backends.py +684 -0
  592. truthound/stores/caching/base.py +356 -0
  593. truthound/stores/caching/store.py +305 -0
  594. truthound/stores/compression/__init__.py +193 -0
  595. truthound/stores/compression/adaptive.py +694 -0
  596. truthound/stores/compression/base.py +514 -0
  597. truthound/stores/compression/pipeline.py +868 -0
  598. truthound/stores/compression/providers.py +672 -0
  599. truthound/stores/compression/streaming.py +832 -0
  600. truthound/stores/concurrency/__init__.py +81 -0
  601. truthound/stores/concurrency/atomic.py +556 -0
  602. truthound/stores/concurrency/index.py +775 -0
  603. truthound/stores/concurrency/locks.py +576 -0
  604. truthound/stores/concurrency/manager.py +482 -0
  605. truthound/stores/encryption/__init__.py +297 -0
  606. truthound/stores/encryption/base.py +952 -0
  607. truthound/stores/encryption/keys.py +1191 -0
  608. truthound/stores/encryption/pipeline.py +903 -0
  609. truthound/stores/encryption/providers.py +953 -0
  610. truthound/stores/encryption/streaming.py +950 -0
  611. truthound/stores/expectations.py +227 -0
  612. truthound/stores/factory.py +246 -0
  613. truthound/stores/migration/__init__.py +75 -0
  614. truthound/stores/migration/base.py +480 -0
  615. truthound/stores/migration/manager.py +347 -0
  616. truthound/stores/migration/registry.py +382 -0
  617. truthound/stores/migration/store.py +559 -0
  618. truthound/stores/observability/__init__.py +106 -0
  619. truthound/stores/observability/audit.py +718 -0
  620. truthound/stores/observability/config.py +270 -0
  621. truthound/stores/observability/factory.py +208 -0
  622. truthound/stores/observability/metrics.py +636 -0
  623. truthound/stores/observability/protocols.py +410 -0
  624. truthound/stores/observability/store.py +570 -0
  625. truthound/stores/observability/tracing.py +784 -0
  626. truthound/stores/replication/__init__.py +76 -0
  627. truthound/stores/replication/base.py +260 -0
  628. truthound/stores/replication/monitor.py +269 -0
  629. truthound/stores/replication/store.py +439 -0
  630. truthound/stores/replication/syncer.py +391 -0
  631. truthound/stores/results.py +359 -0
  632. truthound/stores/retention/__init__.py +77 -0
  633. truthound/stores/retention/base.py +378 -0
  634. truthound/stores/retention/policies.py +621 -0
  635. truthound/stores/retention/scheduler.py +279 -0
  636. truthound/stores/retention/store.py +526 -0
  637. truthound/stores/streaming/__init__.py +138 -0
  638. truthound/stores/streaming/base.py +801 -0
  639. truthound/stores/streaming/database.py +984 -0
  640. truthound/stores/streaming/filesystem.py +719 -0
  641. truthound/stores/streaming/reader.py +629 -0
  642. truthound/stores/streaming/s3.py +843 -0
  643. truthound/stores/streaming/writer.py +790 -0
  644. truthound/stores/tiering/__init__.py +108 -0
  645. truthound/stores/tiering/base.py +462 -0
  646. truthound/stores/tiering/manager.py +249 -0
  647. truthound/stores/tiering/policies.py +692 -0
  648. truthound/stores/tiering/store.py +526 -0
  649. truthound/stores/versioning/__init__.py +56 -0
  650. truthound/stores/versioning/base.py +376 -0
  651. truthound/stores/versioning/store.py +660 -0
  652. truthound/stores/versioning/strategies.py +353 -0
  653. truthound/types.py +56 -0
  654. truthound/validators/__init__.py +774 -0
  655. truthound/validators/aggregate/__init__.py +27 -0
  656. truthound/validators/aggregate/central.py +116 -0
  657. truthound/validators/aggregate/extremes.py +116 -0
  658. truthound/validators/aggregate/spread.py +118 -0
  659. truthound/validators/aggregate/sum.py +64 -0
  660. truthound/validators/aggregate/type.py +78 -0
  661. truthound/validators/anomaly/__init__.py +93 -0
  662. truthound/validators/anomaly/base.py +431 -0
  663. truthound/validators/anomaly/ml_based.py +1190 -0
  664. truthound/validators/anomaly/multivariate.py +647 -0
  665. truthound/validators/anomaly/statistical.py +599 -0
  666. truthound/validators/base.py +1089 -0
  667. truthound/validators/business_rule/__init__.py +46 -0
  668. truthound/validators/business_rule/base.py +147 -0
  669. truthound/validators/business_rule/checksum.py +509 -0
  670. truthound/validators/business_rule/financial.py +526 -0
  671. truthound/validators/cache.py +733 -0
  672. truthound/validators/completeness/__init__.py +39 -0
  673. truthound/validators/completeness/conditional.py +73 -0
  674. truthound/validators/completeness/default.py +98 -0
  675. truthound/validators/completeness/empty.py +103 -0
  676. truthound/validators/completeness/nan.py +337 -0
  677. truthound/validators/completeness/null.py +152 -0
  678. truthound/validators/cross_table/__init__.py +17 -0
  679. truthound/validators/cross_table/aggregate.py +333 -0
  680. truthound/validators/cross_table/row_count.py +122 -0
  681. truthound/validators/datetime/__init__.py +29 -0
  682. truthound/validators/datetime/format.py +78 -0
  683. truthound/validators/datetime/freshness.py +269 -0
  684. truthound/validators/datetime/order.py +73 -0
  685. truthound/validators/datetime/parseable.py +185 -0
  686. truthound/validators/datetime/range.py +202 -0
  687. truthound/validators/datetime/timezone.py +69 -0
  688. truthound/validators/distribution/__init__.py +49 -0
  689. truthound/validators/distribution/distribution.py +128 -0
  690. truthound/validators/distribution/monotonic.py +119 -0
  691. truthound/validators/distribution/outlier.py +178 -0
  692. truthound/validators/distribution/quantile.py +80 -0
  693. truthound/validators/distribution/range.py +254 -0
  694. truthound/validators/distribution/set.py +125 -0
  695. truthound/validators/distribution/statistical.py +459 -0
  696. truthound/validators/drift/__init__.py +79 -0
  697. truthound/validators/drift/base.py +427 -0
  698. truthound/validators/drift/multi_feature.py +401 -0
  699. truthound/validators/drift/numeric.py +395 -0
  700. truthound/validators/drift/psi.py +446 -0
  701. truthound/validators/drift/statistical.py +510 -0
  702. truthound/validators/enterprise.py +1658 -0
  703. truthound/validators/geospatial/__init__.py +80 -0
  704. truthound/validators/geospatial/base.py +97 -0
  705. truthound/validators/geospatial/boundary.py +238 -0
  706. truthound/validators/geospatial/coordinate.py +351 -0
  707. truthound/validators/geospatial/distance.py +399 -0
  708. truthound/validators/geospatial/polygon.py +665 -0
  709. truthound/validators/i18n/__init__.py +308 -0
  710. truthound/validators/i18n/bidi.py +571 -0
  711. truthound/validators/i18n/catalogs.py +570 -0
  712. truthound/validators/i18n/dialects.py +763 -0
  713. truthound/validators/i18n/extended_catalogs.py +549 -0
  714. truthound/validators/i18n/formatting.py +1434 -0
  715. truthound/validators/i18n/loader.py +1020 -0
  716. truthound/validators/i18n/messages.py +521 -0
  717. truthound/validators/i18n/plural.py +683 -0
  718. truthound/validators/i18n/protocols.py +855 -0
  719. truthound/validators/i18n/tms.py +1162 -0
  720. truthound/validators/localization/__init__.py +53 -0
  721. truthound/validators/localization/base.py +122 -0
  722. truthound/validators/localization/chinese.py +362 -0
  723. truthound/validators/localization/japanese.py +275 -0
  724. truthound/validators/localization/korean.py +524 -0
  725. truthound/validators/memory/__init__.py +94 -0
  726. truthound/validators/memory/approximate_knn.py +506 -0
  727. truthound/validators/memory/base.py +547 -0
  728. truthound/validators/memory/sgd_online.py +719 -0
  729. truthound/validators/memory/streaming_ecdf.py +753 -0
  730. truthound/validators/ml_feature/__init__.py +54 -0
  731. truthound/validators/ml_feature/base.py +249 -0
  732. truthound/validators/ml_feature/correlation.py +299 -0
  733. truthound/validators/ml_feature/leakage.py +344 -0
  734. truthound/validators/ml_feature/null_impact.py +270 -0
  735. truthound/validators/ml_feature/scale.py +264 -0
  736. truthound/validators/multi_column/__init__.py +89 -0
  737. truthound/validators/multi_column/arithmetic.py +284 -0
  738. truthound/validators/multi_column/base.py +231 -0
  739. truthound/validators/multi_column/comparison.py +273 -0
  740. truthound/validators/multi_column/consistency.py +312 -0
  741. truthound/validators/multi_column/statistical.py +299 -0
  742. truthound/validators/optimization/__init__.py +164 -0
  743. truthound/validators/optimization/aggregation.py +563 -0
  744. truthound/validators/optimization/covariance.py +556 -0
  745. truthound/validators/optimization/geo.py +626 -0
  746. truthound/validators/optimization/graph.py +587 -0
  747. truthound/validators/optimization/orchestrator.py +970 -0
  748. truthound/validators/optimization/profiling.py +1312 -0
  749. truthound/validators/privacy/__init__.py +223 -0
  750. truthound/validators/privacy/base.py +635 -0
  751. truthound/validators/privacy/ccpa.py +670 -0
  752. truthound/validators/privacy/gdpr.py +728 -0
  753. truthound/validators/privacy/global_patterns.py +604 -0
  754. truthound/validators/privacy/plugins.py +867 -0
  755. truthound/validators/profiling/__init__.py +52 -0
  756. truthound/validators/profiling/base.py +175 -0
  757. truthound/validators/profiling/cardinality.py +312 -0
  758. truthound/validators/profiling/entropy.py +391 -0
  759. truthound/validators/profiling/frequency.py +455 -0
  760. truthound/validators/pushdown_support.py +660 -0
  761. truthound/validators/query/__init__.py +91 -0
  762. truthound/validators/query/aggregate.py +346 -0
  763. truthound/validators/query/base.py +246 -0
  764. truthound/validators/query/column.py +249 -0
  765. truthound/validators/query/expression.py +274 -0
  766. truthound/validators/query/result.py +323 -0
  767. truthound/validators/query/row_count.py +264 -0
  768. truthound/validators/referential/__init__.py +80 -0
  769. truthound/validators/referential/base.py +395 -0
  770. truthound/validators/referential/cascade.py +391 -0
  771. truthound/validators/referential/circular.py +563 -0
  772. truthound/validators/referential/foreign_key.py +624 -0
  773. truthound/validators/referential/orphan.py +485 -0
  774. truthound/validators/registry.py +112 -0
  775. truthound/validators/schema/__init__.py +41 -0
  776. truthound/validators/schema/column_count.py +142 -0
  777. truthound/validators/schema/column_exists.py +80 -0
  778. truthound/validators/schema/column_order.py +82 -0
  779. truthound/validators/schema/column_pair.py +85 -0
  780. truthound/validators/schema/column_pair_set.py +195 -0
  781. truthound/validators/schema/column_type.py +94 -0
  782. truthound/validators/schema/multi_column.py +53 -0
  783. truthound/validators/schema/multi_column_aggregate.py +175 -0
  784. truthound/validators/schema/referential.py +274 -0
  785. truthound/validators/schema/table_schema.py +91 -0
  786. truthound/validators/schema_validator.py +219 -0
  787. truthound/validators/sdk/__init__.py +250 -0
  788. truthound/validators/sdk/builder.py +680 -0
  789. truthound/validators/sdk/decorators.py +474 -0
  790. truthound/validators/sdk/enterprise/__init__.py +211 -0
  791. truthound/validators/sdk/enterprise/docs.py +725 -0
  792. truthound/validators/sdk/enterprise/fuzzing.py +659 -0
  793. truthound/validators/sdk/enterprise/licensing.py +709 -0
  794. truthound/validators/sdk/enterprise/manager.py +543 -0
  795. truthound/validators/sdk/enterprise/resources.py +628 -0
  796. truthound/validators/sdk/enterprise/sandbox.py +766 -0
  797. truthound/validators/sdk/enterprise/signing.py +603 -0
  798. truthound/validators/sdk/enterprise/templates.py +865 -0
  799. truthound/validators/sdk/enterprise/versioning.py +659 -0
  800. truthound/validators/sdk/templates.py +757 -0
  801. truthound/validators/sdk/testing.py +807 -0
  802. truthound/validators/security/__init__.py +181 -0
  803. truthound/validators/security/redos/__init__.py +182 -0
  804. truthound/validators/security/redos/core.py +861 -0
  805. truthound/validators/security/redos/cpu_monitor.py +593 -0
  806. truthound/validators/security/redos/cve_database.py +791 -0
  807. truthound/validators/security/redos/ml/__init__.py +155 -0
  808. truthound/validators/security/redos/ml/base.py +785 -0
  809. truthound/validators/security/redos/ml/datasets.py +618 -0
  810. truthound/validators/security/redos/ml/features.py +359 -0
  811. truthound/validators/security/redos/ml/models.py +1000 -0
  812. truthound/validators/security/redos/ml/predictor.py +507 -0
  813. truthound/validators/security/redos/ml/storage.py +632 -0
  814. truthound/validators/security/redos/ml/training.py +571 -0
  815. truthound/validators/security/redos/ml_analyzer.py +937 -0
  816. truthound/validators/security/redos/optimizer.py +674 -0
  817. truthound/validators/security/redos/profiler.py +682 -0
  818. truthound/validators/security/redos/re2_engine.py +709 -0
  819. truthound/validators/security/redos.py +886 -0
  820. truthound/validators/security/sql_security.py +1247 -0
  821. truthound/validators/streaming/__init__.py +126 -0
  822. truthound/validators/streaming/base.py +292 -0
  823. truthound/validators/streaming/completeness.py +210 -0
  824. truthound/validators/streaming/mixin.py +575 -0
  825. truthound/validators/streaming/range.py +308 -0
  826. truthound/validators/streaming/sources.py +846 -0
  827. truthound/validators/string/__init__.py +57 -0
  828. truthound/validators/string/casing.py +158 -0
  829. truthound/validators/string/charset.py +96 -0
  830. truthound/validators/string/format.py +501 -0
  831. truthound/validators/string/json.py +77 -0
  832. truthound/validators/string/json_schema.py +184 -0
  833. truthound/validators/string/length.py +104 -0
  834. truthound/validators/string/like_pattern.py +237 -0
  835. truthound/validators/string/regex.py +202 -0
  836. truthound/validators/string/regex_extended.py +435 -0
  837. truthound/validators/table/__init__.py +88 -0
  838. truthound/validators/table/base.py +78 -0
  839. truthound/validators/table/column_count.py +198 -0
  840. truthound/validators/table/freshness.py +362 -0
  841. truthound/validators/table/row_count.py +251 -0
  842. truthound/validators/table/schema.py +333 -0
  843. truthound/validators/table/size.py +285 -0
  844. truthound/validators/timeout/__init__.py +102 -0
  845. truthound/validators/timeout/advanced/__init__.py +247 -0
  846. truthound/validators/timeout/advanced/circuit_breaker.py +675 -0
  847. truthound/validators/timeout/advanced/prediction.py +773 -0
  848. truthound/validators/timeout/advanced/priority.py +618 -0
  849. truthound/validators/timeout/advanced/redis_backend.py +770 -0
  850. truthound/validators/timeout/advanced/retry.py +721 -0
  851. truthound/validators/timeout/advanced/sampling.py +788 -0
  852. truthound/validators/timeout/advanced/sla.py +661 -0
  853. truthound/validators/timeout/advanced/telemetry.py +804 -0
  854. truthound/validators/timeout/cascade.py +477 -0
  855. truthound/validators/timeout/deadline.py +657 -0
  856. truthound/validators/timeout/degradation.py +525 -0
  857. truthound/validators/timeout/distributed.py +597 -0
  858. truthound/validators/timeseries/__init__.py +89 -0
  859. truthound/validators/timeseries/base.py +326 -0
  860. truthound/validators/timeseries/completeness.py +617 -0
  861. truthound/validators/timeseries/gap.py +485 -0
  862. truthound/validators/timeseries/monotonic.py +310 -0
  863. truthound/validators/timeseries/seasonality.py +422 -0
  864. truthound/validators/timeseries/trend.py +510 -0
  865. truthound/validators/uniqueness/__init__.py +59 -0
  866. truthound/validators/uniqueness/approximate.py +475 -0
  867. truthound/validators/uniqueness/distinct_values.py +253 -0
  868. truthound/validators/uniqueness/duplicate.py +118 -0
  869. truthound/validators/uniqueness/primary_key.py +140 -0
  870. truthound/validators/uniqueness/unique.py +191 -0
  871. truthound/validators/uniqueness/within_record.py +599 -0
  872. truthound/validators/utils.py +756 -0
  873. truthound-1.0.8.dist-info/METADATA +474 -0
  874. truthound-1.0.8.dist-info/RECORD +877 -0
  875. truthound-1.0.8.dist-info/WHEEL +4 -0
  876. truthound-1.0.8.dist-info/entry_points.txt +2 -0
  877. truthound-1.0.8.dist-info/licenses/LICENSE +190 -0
@@ -0,0 +1,1190 @@
1
+ """Machine learning based anomaly detection validators.
2
+
3
+ These validators use scikit-learn for advanced anomaly detection.
4
+ Requires: pip install truthound[anomaly] (includes scikit-learn)
5
+
6
+ Memory Optimization:
7
+ These validators now support automatic sampling for large datasets:
8
+
9
+ # Memory-efficient usage for large datasets:
10
+ validator = IsolationForestValidator(
11
+ columns=["col1", "col2"],
12
+ sample_size=100000, # Sample if data exceeds this
13
+ batch_size=50000, # Process in batches for scoring
14
+ )
15
+
16
+ # Or use auto-sampling based on available memory:
17
+ validator = IsolationForestValidator(
18
+ columns=["col1", "col2"],
19
+ auto_sample=True, # Auto-detect optimal sample size
20
+ )
21
+ """
22
+
23
+ from typing import Any
24
+
25
+ import polars as pl
26
+ import numpy as np
27
+
28
+ from truthound.types import Severity
29
+ from truthound.validators.base import ValidationIssue
30
+ from truthound.validators.registry import register_validator
31
+ from truthound.validators.anomaly.base import (
32
+ AnomalyValidator,
33
+ MLAnomalyMixin,
34
+ )
35
+
36
+
37
# Default limits used to keep the ML validators memory-efficient.
DEFAULT_SAMPLE_SIZE = 100_000  # Default maximum number of samples for training
DEFAULT_BATCH_SIZE = 50_000  # Default number of rows scored per batch
MEMORY_THRESHOLD_MB = 500  # Default memory budget (MB) used when auto-sampling
41
+
42
+
43
+ def _check_sklearn_available() -> None:
44
+ """Check if scikit-learn is available."""
45
+ try:
46
+ import sklearn # noqa: F401
47
+ except ImportError:
48
+ raise ImportError(
49
+ "scikit-learn is required for ML-based anomaly detection. "
50
+ "Install with: pip install truthound[anomaly]"
51
+ )
52
+
53
+
54
+ def _estimate_data_memory_mb(n_rows: int, n_cols: int) -> float:
55
+ """Estimate memory usage for numpy array in MB."""
56
+ # Assuming float64 (8 bytes per element)
57
+ bytes_needed = n_rows * n_cols * 8
58
+ return bytes_needed / (1024 * 1024)
59
+
60
+
61
def _compute_optimal_sample_size(
    n_rows: int,
    n_cols: int,
    max_memory_mb: float = MEMORY_THRESHOLD_MB,
) -> int:
    """Compute a sample size that fits within a memory budget.

    Rows are costed as ``n_cols`` float64 values (8 bytes each). The number of
    rows that fit in ``max_memory_mb`` is reduced by a 20% safety margin and
    then raised to a floor of 1000 rows, so for very wide data or a very small
    budget the result can exceed the budget. The result never exceeds
    ``n_rows``.

    Args:
        n_rows: Total number of rows
        n_cols: Number of columns
        max_memory_mb: Maximum memory to use, in MB

    Returns:
        Sample size, at most ``n_rows``
    """
    if n_cols <= 0:
        # Without columns a row has no cost; return every row rather than
        # dividing by zero below.
        return n_rows

    bytes_per_row = n_cols * 8  # float64
    max_rows = int((max_memory_mb * 1024 * 1024) / bytes_per_row)

    # Keep 20% headroom, but never train on fewer than 1000 rows
    # (or on more rows than actually exist).
    safe_rows = int(max_rows * 0.8)
    return min(n_rows, max(safe_rows, 1000))
83
+
84
+
85
class LargeDatasetMixin:
    """Mixin providing large dataset handling utilities for ML validators.

    Provides:
        - Down-sampling of training data to a maximum number of rows
        - Mini-batch prediction on an in-memory array
        - Slice-by-slice scoring of a LazyFrame
    """

    def _smart_sample_lazyframe(
        self,
        lf: pl.LazyFrame,
        columns: list[str],
        sample_size: int | None = None,
        random_state: int = 42,
    ) -> tuple[np.ndarray, int, bool]:
        """Load the selected columns and down-sample them if required.

        Rows containing a null in any selected column are dropped. Note that
        all remaining rows are collected before sampling, so peak memory is
        that of the full non-null selection, not of the sample.

        Args:
            lf: Input LazyFrame
            columns: Columns to select
            sample_size: Max samples (None = load all)
            random_state: Random seed used when sampling

        Returns:
            Tuple of (data_array, original_count, was_sampled).
            ``original_count`` is the row count of ``lf`` before nulls are
            dropped. ``was_sampled`` is True only when rows were actually
            discarded by random sampling.
        """
        total_count = lf.select(pl.len()).collect().item()
        n_cols = len(columns)

        if total_count == 0:
            return np.empty((0, n_cols)), 0, False

        df = lf.select([pl.col(c) for c in columns]).drop_nulls().collect()

        # Decide on the non-null row count: dropping nulls may already have
        # brought the data under the limit, in which case nothing is sampled.
        was_sampled = sample_size is not None and len(df) > sample_size
        if was_sampled:
            df = df.sample(n=sample_size, seed=random_state)

        if len(df) == 0:
            return np.empty((0, n_cols)), total_count, was_sampled

        return df.to_numpy(), total_count, was_sampled

    def _batch_predict(
        self,
        model: Any,
        data: np.ndarray,
        batch_size: int = DEFAULT_BATCH_SIZE,
        predict_method: str = "predict",
    ) -> np.ndarray:
        """Predict in batches to reduce memory usage.

        Args:
            model: Fitted model exposing ``predict_method``
            data: Input data array
            batch_size: Size of each batch
            predict_method: Method to call on model ('predict' or 'decision_function')

        Returns:
            Concatenated predictions

        Raises:
            ValueError: If ``data`` has to be split but ``batch_size`` is not
                positive.
        """
        predict = getattr(model, predict_method)
        n_samples = len(data)

        # Small inputs are handled in a single call.
        if n_samples <= batch_size:
            return predict(data)

        if batch_size <= 0:
            raise ValueError(f"batch_size must be positive, got {batch_size}")

        # Slicing clamps at the end of the array, so the last batch may be short.
        return np.concatenate(
            [
                predict(data[start:start + batch_size])
                for start in range(0, n_samples, batch_size)
            ]
        )

    def _streaming_score(
        self,
        lf: pl.LazyFrame,
        columns: list[str],
        model: Any,
        medians: np.ndarray,
        iqrs: np.ndarray,
        batch_size: int = DEFAULT_BATCH_SIZE,
    ) -> tuple[np.ndarray, np.ndarray]:
        """Score a LazyFrame slice by slice.

        Each batch is collected separately as a slice of ``lf``, so only one
        batch of rows is materialised by this method at a time. Rows with a
        null in any selected column are dropped after slicing, so the returned
        arrays cover non-null rows only and may be shorter than ``lf``.

        Args:
            lf: LazyFrame with data
            columns: Columns to process
            model: Fitted model (must have predict() and optionally decision_function())
            medians: Normalization medians
            iqrs: Normalization IQRs; zero entries are treated as 1
            batch_size: Number of rows per slice

        Returns:
            Tuple of (predictions_array, scores_array). The scores array is
            empty when the model has no ``decision_function``.

        Raises:
            ValueError: If ``lf`` is non-empty and ``batch_size`` is not positive.
        """
        total_count = lf.select(pl.len()).collect().item()

        if total_count == 0:
            return np.array([]), np.array([])

        if batch_size <= 0:
            raise ValueError(f"batch_size must be positive, got {batch_size}")

        # Loop-invariant pieces are prepared once.
        selected = lf.select([pl.col(c) for c in columns])
        safe_iqrs = np.where(iqrs == 0, 1, iqrs)  # avoid division by zero
        has_decision_function = hasattr(model, "decision_function")

        all_predictions = []
        all_scores = []

        for offset in range(0, total_count, batch_size):
            batch_df = selected.slice(offset, batch_size).drop_nulls().collect()

            if len(batch_df) == 0:
                continue

            # Normalize with the statistics supplied by the caller.
            normalized_batch = (batch_df.to_numpy() - medians) / safe_iqrs

            all_predictions.append(model.predict(normalized_batch))

            if has_decision_function:
                all_scores.append(model.decision_function(normalized_batch))

        if not all_predictions:
            return np.array([]), np.array([])

        predictions = np.concatenate(all_predictions)
        scores = np.concatenate(all_scores) if all_scores else np.array([])

        return predictions, scores
243
+
244
+
245
@register_validator
class IsolationForestValidator(AnomalyValidator, MLAnomalyMixin, LargeDatasetMixin):
    """Isolation Forest anomaly detection.

    Isolation Forest isolates anomalies by randomly selecting a feature
    and then randomly selecting a split value. Anomalies are easier to
    isolate, so they have shorter path lengths in the tree.

    This is efficient for high-dimensional data and doesn't assume
    any particular distribution.

    Memory Optimization:
        For large datasets, cap the training data with sample_size, or let
        the validator derive a cap from a memory budget with auto_sample.
        auto_sample only takes effect when sample_size is not given:

        # Train on at most 100k rows:
        validator = IsolationForestValidator(
            columns=["col1", "col2"],
            sample_size=100000,
        )

        # Or derive the cap from max_memory_mb:
        validator = IsolationForestValidator(
            columns=["col1", "col2"],
            auto_sample=True,
        )

    Example:
        # Detect anomalies in multiple columns
        validator = IsolationForestValidator(
            columns=["feature1", "feature2", "feature3"],
            contamination=0.05,  # Expected 5% anomalies
        )

        # Auto-detect contamination
        validator = IsolationForestValidator(
            columns=["col1", "col2"],
            contamination="auto",
        )
    """

    name = "isolation_forest"

    def __init__(
        self,
        columns: list[str] | None = None,
        contamination: float | str = "auto",
        n_estimators: int = 100,
        max_samples: int | float | str = "auto",
        random_state: int | None = 42,
        max_anomaly_ratio: float = 0.1,
        sample_size: int | None = None,
        batch_size: int = DEFAULT_BATCH_SIZE,
        auto_sample: bool = False,
        max_memory_mb: float = MEMORY_THRESHOLD_MB,
        **kwargs: Any,
    ):
        """Initialize Isolation Forest validator.

        Args:
            columns: Columns to use for detection. If None, uses all numeric.
            contamination: Expected proportion of outliers ("auto" or 0.0-0.5)
            n_estimators: Number of trees in the forest
            max_samples: Number of samples for each tree
            random_state: Random seed for reproducibility
            max_anomaly_ratio: Maximum acceptable ratio of anomalies
            sample_size: Max samples for training (None = use all data)
            batch_size: Batch size for scoring large datasets
            auto_sample: If True and sample_size is None, derive the sample
                size from max_memory_mb
            max_memory_mb: Max memory (MB) for auto_sample mode
            **kwargs: Additional config
        """
        super().__init__(columns=columns, max_anomaly_ratio=max_anomaly_ratio, **kwargs)
        self.contamination = contamination
        self.n_estimators = n_estimators
        self.max_samples = max_samples
        self.random_state = random_state
        self._sample_size = sample_size
        # Stored for batch scoring; not read by this class's own methods.
        self._batch_size = batch_size
        self._auto_sample = auto_sample
        self._max_memory_mb = max_memory_mb

    def detect_anomalies(
        self, data: np.ndarray, column_names: list[str]
    ) -> tuple[np.ndarray, dict[str, Any]]:
        """Detect anomalies using Isolation Forest.

        Args:
            data: 2-D array of samples (rows) by features (columns)
            column_names: Names of the feature columns (not used here)

        Returns:
            Tuple of (boolean anomaly mask, info dict). The info dict holds
            ``n_features``, ``n_samples``, ``min_score``, ``max_score`` and
            ``threshold`` (the fitted model's ``offset_``).

        Raises:
            ImportError: If scikit-learn is not installed.
        """
        _check_sklearn_available()
        from sklearn.ensemble import IsolationForest

        # Normalize data for better performance; the returned medians and
        # IQRs are not needed because the model is fitted and scored here.
        normalized_data, _, _ = self.normalize_data(data)

        model = IsolationForest(
            contamination=self.contamination,
            n_estimators=self.n_estimators,
            max_samples=self.max_samples,
            random_state=self.random_state,
            n_jobs=-1,  # Use all cores
        )

        # fit_predict yields -1 for anomalies, 1 for normal points
        predictions = model.fit_predict(normalized_data)
        anomaly_mask = predictions == -1

        # Anomaly scores (lower = more anomalous)
        scores = model.decision_function(normalized_data)

        return anomaly_mask, {
            "n_features": data.shape[1],
            "n_samples": data.shape[0],
            "min_score": float(np.min(scores)),
            "max_score": float(np.max(scores)),
            "threshold": float(model.offset_),
        }

    def validate(self, lf: pl.LazyFrame) -> list[ValidationIssue]:
        """Run Isolation Forest on ``lf`` and report an excessive anomaly ratio.

        The model is fitted on the (possibly down-sampled) non-null rows of
        the anomaly columns. When the data was sampled, the anomaly count is
        extrapolated from the sample ratio to the original row count.

        Args:
            lf: LazyFrame to validate

        Returns:
            A list with one issue when the anomaly ratio exceeds
            ``max_anomaly_ratio``; otherwise an empty list. The list is also
            empty when there are no anomaly columns or fewer than 10 usable
            rows.
        """
        issues: list[ValidationIssue] = []

        columns = self._get_anomaly_columns(lf)
        if not columns:
            return issues

        # Determine sample size; an explicit sample_size wins over auto_sample.
        sample_size = self._sample_size
        if self._auto_sample and sample_size is None:
            total_count = lf.select(pl.len()).collect().item()
            sample_size = _compute_optimal_sample_size(
                total_count, len(columns), self._max_memory_mb
            )
            self.logger.debug(f"Auto-sample: using {sample_size} samples from {total_count}")

        # Fall back to 42 only when no seed was configured. A plain
        # ``random_state or 42`` would also replace the valid seed 0.
        seed = 42 if self.random_state is None else self.random_state
        data, original_count, was_sampled = self._smart_sample_lazyframe(
            lf, columns, sample_size, seed
        )

        # Too few rows to fit a meaningful model.
        if len(data) < 10:
            return issues

        anomaly_mask, info = self.detect_anomalies(data, columns)

        flagged = int(anomaly_mask.sum())
        anomaly_ratio = flagged / len(data)

        if was_sampled and len(data) < original_count:
            # Extrapolate the sample's anomaly ratio to the full dataset.
            anomaly_count = int(anomaly_ratio * original_count)
            info["sampled"] = True
            info["sample_size"] = len(data)
            info["original_count"] = original_count
        else:
            anomaly_count = flagged
            info["sampled"] = False

        if anomaly_ratio > self.max_anomaly_ratio:
            severity = self._calculate_severity(anomaly_ratio)

            sample_note = ""
            if info.get("sampled"):
                sample_note = f" (estimated from {info['sample_size']:,} samples)"

            issues.append(
                ValidationIssue(
                    column=", ".join(columns),
                    issue_type="isolation_forest_anomaly",
                    count=anomaly_count,
                    severity=severity,
                    details=(
                        f"Isolation Forest detected {anomaly_count:,} anomalies "
                        f"({anomaly_ratio:.2%}) across {info['n_features']} features{sample_note}. "
                        f"Score range: [{info['min_score']:.4f}, {info['max_score']:.4f}]"
                    ),
                    expected=f"Anomaly ratio <= {self.max_anomaly_ratio:.2%}",
                )
            )

        return issues
425
+
426
+
427
@register_validator
class LOFValidator(AnomalyValidator, MLAnomalyMixin, LargeDatasetMixin):
    """Anomaly detection based on the Local Outlier Factor (LOF).

    LOF compares the local density around a point with the density around
    its neighbors. A point whose density is substantially lower than that
    of its neighbors is considered an outlier, which makes the method a
    good fit for local anomalies in clustered data.

    Memory Optimization:
        The distance computations behind LOF are memory-intensive, so
        large datasets should be sampled:

            validator = LOFValidator(
                columns=["x", "y"],
                n_neighbors=20,
                sample_size=50000,  # Sample for large datasets
            )

    Example:
        validator = LOFValidator(
            columns=["x", "y"],
            n_neighbors=20,
            contamination=0.05,
        )
    """

    name = "lof"

    def __init__(
        self,
        columns: list[str] | None = None,
        n_neighbors: int = 20,
        contamination: float | str = "auto",
        metric: str = "minkowski",
        max_anomaly_ratio: float = 0.1,
        sample_size: int | None = None,
        auto_sample: bool = False,
        max_memory_mb: float = MEMORY_THRESHOLD_MB,
        **kwargs: Any,
    ):
        """Store the LOF hyper-parameters and the sampling configuration.

        Args:
            columns: Columns used for detection. None means all numeric columns.
            n_neighbors: Neighbor count used when computing LOF.
            contamination: Expected share of outliers, forwarded to the model.
            metric: Name of the distance metric, forwarded to the model.
            max_anomaly_ratio: Largest anomaly ratio that is still acceptable.
            sample_size: Upper bound on rows used for fitting (None = all rows).
            auto_sample: When True and ``sample_size`` is None, a sample size
                is derived automatically during ``validate``.
            max_memory_mb: Memory budget (MB) used by the auto-sample mode.
            **kwargs: Extra configuration passed on to the parent class.
        """
        super().__init__(columns=columns, max_anomaly_ratio=max_anomaly_ratio, **kwargs)

        # Model hyper-parameters
        self.n_neighbors = n_neighbors
        self.contamination = contamination
        self.metric = metric

        # Sampling configuration
        self._sample_size = sample_size
        self._auto_sample = auto_sample
        self._max_memory_mb = max_memory_mb

    def detect_anomalies(
        self, data: np.ndarray, column_names: list[str]
    ) -> tuple[np.ndarray, dict[str, Any]]:
        """Fit LOF on ``data`` and flag the rows it labels as outliers.

        Args:
            data: Numeric array with one row per observation.
            column_names: Names of the columns in ``data`` (not used here).

        Returns:
            A boolean mask (True = anomaly) and a dict holding the effective
            ``n_neighbors`` plus the min / max / mean LOF score.
        """
        _check_sklearn_available()
        from sklearn.neighbors import LocalOutlierFactor

        scaled, _, _ = self.normalize_data(data)

        # The neighbor count is capped at one less than the number of rows
        k = min(self.n_neighbors, len(data) - 1)

        estimator = LocalOutlierFactor(
            n_neighbors=k,
            contamination=self.contamination,
            metric=self.metric,
            n_jobs=-1,
        )

        # fit_predict labels anomalies as -1 and normal points as 1
        is_outlier = estimator.fit_predict(scaled) == -1

        # Flip the sign so that a higher score means "more anomalous"
        lof = -estimator.negative_outlier_factor_

        stats: dict[str, Any] = {
            "n_neighbors": k,
            "min_lof": float(np.min(lof)),
            "max_lof": float(np.max(lof)),
            "mean_lof": float(np.mean(lof)),
        }
        return is_outlier, stats

    def validate(self, lf: pl.LazyFrame) -> list[ValidationIssue]:
        """Run LOF over the selected columns and report an excessive anomaly ratio.

        Args:
            lf: LazyFrame to validate.

        Returns:
            An empty list when there are no usable columns, too few rows, or
            the anomaly ratio is within ``max_anomaly_ratio``; otherwise a
            list containing a single ``lof_anomaly`` issue.
        """
        issues: list[ValidationIssue] = []

        columns = self._get_anomaly_columns(lf)
        if not columns:
            return issues

        # LOF is O(n^2) in memory, so an explicit or derived sample size matters
        sample_size = self._sample_size
        if self._auto_sample and sample_size is None:
            total_count = lf.select(pl.len()).collect().item()
            # Half of the memory budget is used (more conservative), and the
            # result is capped at 50000 rows for LOF
            sample_size = min(
                _compute_optimal_sample_size(
                    total_count, len(columns), self._max_memory_mb / 2
                ),
                50000,
            )
            self.logger.debug(f"Auto-sample (LOF): using {sample_size} samples from {total_count}")

        # 42 is passed as the fourth positional argument
        # (presumably the sampling seed; confirm against the mixin)
        data, original_count, was_sampled = self._smart_sample_lazyframe(
            lf, columns, sample_size, 42
        )

        # Need at least n_neighbors + 1 rows to proceed
        if len(data) < self.n_neighbors + 1:
            return issues

        anomaly_mask, info = self.detect_anomalies(data, columns)

        # The ratio always comes from the rows that were actually scored;
        # only the reported count is extrapolated when a sample was used
        flagged_in_data = int(anomaly_mask.sum())
        anomaly_ratio = flagged_in_data / len(data)

        if was_sampled and len(data) < original_count:
            anomaly_count = int(anomaly_ratio * original_count)
            info.update(
                sampled=True,
                sample_size=len(data),
                original_count=original_count,
            )
        else:
            anomaly_count = flagged_in_data
            info["sampled"] = False

        if anomaly_ratio > self.max_anomaly_ratio:
            severity = self._calculate_severity(anomaly_ratio)
            sample_note = (
                f" (estimated from {info['sample_size']:,} samples)"
                if info.get("sampled")
                else ""
            )
            issue = ValidationIssue(
                column=", ".join(columns),
                issue_type="lof_anomaly",
                count=anomaly_count,
                severity=severity,
                details=(
                    f"LOF (k={info['n_neighbors']}) detected {anomaly_count:,} anomalies "
                    f"({anomaly_ratio:.2%}){sample_note}. LOF scores: mean={info['mean_lof']:.2f}, "
                    f"max={info['max_lof']:.2f}"
                ),
                expected=f"Anomaly ratio <= {self.max_anomaly_ratio:.2%}",
            )
            issues.append(issue)

        return issues
592
+
593
+
594
@register_validator
class OneClassSVMValidator(AnomalyValidator, MLAnomalyMixin, LargeDatasetMixin):
    """Anomaly detection with a One-Class SVM.

    A One-Class SVM learns a decision boundary around the normal data;
    points that fall outside of that boundary are classified as anomalies.
    It works well for high-dimensional data but can be slower than
    tree-based methods.

    Memory Optimization:
        SVM training is O(n^2) to O(n^3), so sampling is essential:

            validator = OneClassSVMValidator(
                columns=["feature1", "feature2"],
                nu=0.05,
                sample_size=10000,  # Train on smaller sample
                batch_size=50000,  # Score in batches
            )

    Example:
        validator = OneClassSVMValidator(
            columns=["feature1", "feature2"],
            nu=0.05,  # Upper bound on fraction of anomalies
            kernel="rbf",
        )
    """

    name = "one_class_svm"

    def __init__(
        self,
        columns: list[str] | None = None,
        kernel: str = "rbf",
        nu: float = 0.05,
        gamma: str | float = "scale",
        max_anomaly_ratio: float = 0.1,
        sample_size: int | None = None,
        batch_size: int = DEFAULT_BATCH_SIZE,
        auto_sample: bool = False,
        max_memory_mb: float = MEMORY_THRESHOLD_MB,
        **kwargs: Any,
    ):
        """Store the SVM hyper-parameters and the sampling configuration.

        Args:
            columns: Columns used for detection.
            kernel: Kernel type ('rbf', 'linear', 'poly', 'sigmoid').
            nu: Upper bound on the fraction of training errors and
                support vectors.
            gamma: Kernel coefficient.
            max_anomaly_ratio: Largest anomaly ratio that is still acceptable.
            sample_size: Upper bound on rows used for training (None = all rows).
            batch_size: Batch size for scoring large datasets.
            auto_sample: When True and ``sample_size`` is None, a sample size
                is derived automatically during ``validate``.
            max_memory_mb: Memory budget (MB) used by the auto-sample mode.
            **kwargs: Extra configuration passed on to the parent class.
        """
        super().__init__(columns=columns, max_anomaly_ratio=max_anomaly_ratio, **kwargs)

        # Model hyper-parameters
        self.kernel = kernel
        self.nu = nu
        self.gamma = gamma

        # Sampling / batching configuration
        # NOTE(review): _batch_size is stored but not read by the methods of this class
        self._sample_size = sample_size
        self._batch_size = batch_size
        self._auto_sample = auto_sample
        self._max_memory_mb = max_memory_mb

    def detect_anomalies(
        self, data: np.ndarray, column_names: list[str]
    ) -> tuple[np.ndarray, dict[str, Any]]:
        """Fit a One-Class SVM on ``data`` and flag the rows it rejects.

        Args:
            data: Numeric array with one row per observation.
            column_names: Names of the columns in ``data`` (not used here).

        Returns:
            A boolean mask (True = anomaly) and a dict with the kernel, nu,
            the number of support vectors and the min / max decision score.
        """
        _check_sklearn_available()
        from sklearn.svm import OneClassSVM

        scaled, _, _ = self.normalize_data(data)

        svm = OneClassSVM(
            kernel=self.kernel,
            nu=self.nu,
            gamma=self.gamma,
        )

        # Rows predicted as -1 are the anomalies
        is_outlier = svm.fit_predict(scaled) == -1

        # Decision-function values of the fitted model for the same rows
        decision = svm.decision_function(scaled)

        stats: dict[str, Any] = {
            "kernel": self.kernel,
            "nu": self.nu,
            "n_support": len(svm.support_),
            "min_score": float(np.min(decision)),
            "max_score": float(np.max(decision)),
        }
        return is_outlier, stats

    def validate(self, lf: pl.LazyFrame) -> list[ValidationIssue]:
        """Run the One-Class SVM and report an excessive anomaly ratio.

        Args:
            lf: LazyFrame to validate.

        Returns:
            An empty list when there are no usable columns, fewer than 10
            rows, or the anomaly ratio is within ``max_anomaly_ratio``;
            otherwise a list containing a single ``svm_anomaly`` issue.
        """
        issues: list[ValidationIssue] = []

        columns = self._get_anomaly_columns(lf)
        if not columns:
            return issues

        # SVM is O(n^2)-O(n^3) and very memory intensive, hence the sampling
        sample_size = self._sample_size
        if self._auto_sample and sample_size is None:
            total_count = lf.select(pl.len()).collect().item()
            # A quarter of the memory budget is used (very conservative), and
            # the result is capped at 20000 rows for SVM
            sample_size = min(
                _compute_optimal_sample_size(
                    total_count, len(columns), self._max_memory_mb / 4
                ),
                20000,
            )
            self.logger.debug(f"Auto-sample (SVM): using {sample_size} samples from {total_count}")

        # 42 is passed as the fourth positional argument
        # (presumably the sampling seed; confirm against the mixin)
        data, original_count, was_sampled = self._smart_sample_lazyframe(
            lf, columns, sample_size, 42
        )

        # Fewer than 10 rows: skip detection entirely
        if len(data) < 10:
            return issues

        anomaly_mask, info = self.detect_anomalies(data, columns)

        # The ratio always comes from the rows that were actually scored;
        # only the reported count is extrapolated when a sample was used
        flagged_in_data = int(anomaly_mask.sum())
        anomaly_ratio = flagged_in_data / len(data)

        if was_sampled and len(data) < original_count:
            anomaly_count = int(anomaly_ratio * original_count)
            info.update(
                sampled=True,
                sample_size=len(data),
                original_count=original_count,
            )
        else:
            anomaly_count = flagged_in_data
            info["sampled"] = False

        if anomaly_ratio > self.max_anomaly_ratio:
            severity = self._calculate_severity(anomaly_ratio)
            sample_note = (
                f" (estimated from {info['sample_size']:,} samples)"
                if info.get("sampled")
                else ""
            )
            issue = ValidationIssue(
                column=", ".join(columns),
                issue_type="svm_anomaly",
                count=anomaly_count,
                severity=severity,
                details=(
                    f"One-Class SVM ({info['kernel']}, nu={info['nu']}) detected "
                    f"{anomaly_count:,} anomalies ({anomaly_ratio:.2%}){sample_note}. "
                    f"Support vectors: {info['n_support']}"
                ),
                expected=f"Anomaly ratio <= {self.max_anomaly_ratio:.2%}",
            )
            issues.append(issue)

        return issues
757
+
758
+
759
@register_validator
class MemoryEfficientLOFValidator(AnomalyValidator, MLAnomalyMixin):
    """Memory-efficient LOF using approximate k-NN.

    This validator uses approximate nearest neighbor algorithms (BallTree, Annoy, HNSW)
    to compute LOF scores without building a full O(n²) distance matrix.

    Memory Complexity:
        - Standard LOF: O(n²) for distance matrix
        - This implementation: O(n) with O(log n) query time

    Use this for datasets > 50,000 rows where standard LOF would run out of memory.

    Example:
        # For large datasets (100k+ rows)
        validator = MemoryEfficientLOFValidator(
            columns=["feature1", "feature2"],
            n_neighbors=20,
            knn_backend="balltree",  # or "annoy", "hnsw" if installed
        )
    """

    name = "memory_efficient_lof"

    def __init__(
        self,
        columns: list[str] | None = None,
        n_neighbors: int = 20,
        contamination: float = 0.1,
        max_anomaly_ratio: float = 0.1,
        knn_backend: str = "auto",
        sample_size: int | None = None,
        **kwargs: Any,
    ):
        """Initialize memory-efficient LOF validator.

        Args:
            columns: Columns to use for detection
            n_neighbors: Number of neighbors for LOF
            contamination: Expected proportion of outliers. A float in
                (0, 0.5] selects a percentile-based threshold; any other
                value falls back to a fixed LOF threshold of 1.5.
            max_anomaly_ratio: Maximum acceptable ratio of anomalies
            knn_backend: k-NN backend ('auto', 'balltree', 'kdtree', 'annoy', 'hnsw')
            sample_size: Optional row limit for very large datasets. Note
                that ``validate`` takes the first ``sample_size`` rows via
                ``head`` rather than drawing a random sample.
            **kwargs: Additional config
        """
        super().__init__(columns=columns, max_anomaly_ratio=max_anomaly_ratio, **kwargs)
        self.n_neighbors = n_neighbors
        self.contamination = contamination
        self.knn_backend = knn_backend
        self._sample_size = sample_size

        # Import the mixin at runtime to avoid circular imports
        from truthound.validators.memory import ApproximateKNNMixin
        self._knn_mixin = ApproximateKNNMixin()

    def detect_anomalies(
        self, data: np.ndarray, column_names: list[str]
    ) -> tuple[np.ndarray, dict[str, Any]]:
        """Detect anomalies using approximate LOF.

        Args:
            data: Numeric array with one row per observation.
            column_names: Names of the columns in ``data`` (not used here).

        Returns:
            A boolean mask (True = anomaly) and a dict with the effective
            ``n_neighbors``, the min / max / mean LOF score, the threshold
            that was applied and the name of the k-NN backend.
        """
        # Normalize data
        normalized_data, _, _ = self.normalize_data(data)

        n_neighbors = min(self.n_neighbors, len(data) - 1)

        # Build approximate index ("auto" is passed on as backend=None)
        backend = self.knn_backend if self.knn_backend != "auto" else None
        self._knn_mixin.build_approximate_index(
            normalized_data,
            backend=backend,
            metric="euclidean",
        )

        try:
            # Compute LOF scores using approximate k-NN
            lof_scores = self._knn_mixin.compute_local_outlier_factor(
                normalized_data, k=n_neighbors
            )
            # Read the backend name while the index is still alive, so the
            # reported value cannot be affected by the cleanup below
            backend_name = str(self._knn_mixin._knn_backend)
        finally:
            # Always release the index, even when scoring fails, so a failed
            # run does not keep the index memory alive on this validator
            self._knn_mixin.clear_index()

        # Determine threshold based on contamination. 0.5 is included as the
        # upper bound of the valid LOF contamination range.
        if isinstance(self.contamination, float) and 0 < self.contamination <= 0.5:
            threshold = np.percentile(lof_scores, 100 * (1 - self.contamination))
        else:
            # Auto: use 1.5 as threshold (common LOF threshold)
            threshold = 1.5

        anomaly_mask = lof_scores > threshold

        return anomaly_mask, {
            "n_neighbors": n_neighbors,
            "min_lof": float(np.min(lof_scores)),
            "max_lof": float(np.max(lof_scores)),
            "mean_lof": float(np.mean(lof_scores)),
            "threshold": float(threshold),
            "backend": backend_name,
        }

    def validate(self, lf: pl.LazyFrame) -> list[ValidationIssue]:
        """Run approximate LOF and report an excessive anomaly ratio.

        Args:
            lf: LazyFrame to validate.

        Returns:
            An empty list when there are no usable columns, too few rows, or
            the anomaly ratio is within ``max_anomaly_ratio``; otherwise a
            list containing a single ``memory_efficient_lof_anomaly`` issue.
        """
        issues: list[ValidationIssue] = []

        columns = self._get_anomaly_columns(lf)
        if not columns:
            return issues

        # Limit the input if requested. head() keeps the FIRST rows (this is
        # not a random sample), and nulls are dropped afterwards, so fewer
        # than sample_size rows may remain.
        source = lf.head(self._sample_size) if self._sample_size else lf
        df = source.select([pl.col(c) for c in columns]).drop_nulls().collect()

        # Need at least n_neighbors + 1 rows to proceed
        if len(df) < self.n_neighbors + 1:
            return issues

        data = df.to_numpy()
        anomaly_mask, info = self.detect_anomalies(data, columns)

        anomaly_count = int(anomaly_mask.sum())
        anomaly_ratio = anomaly_count / len(data)

        if anomaly_ratio > self.max_anomaly_ratio:
            severity = self._calculate_severity(anomaly_ratio)

            issues.append(
                ValidationIssue(
                    column=", ".join(columns),
                    issue_type="memory_efficient_lof_anomaly",
                    count=anomaly_count,
                    severity=severity,
                    details=(
                        f"Approximate LOF (k={info['n_neighbors']}, backend={info['backend']}) "
                        f"detected {anomaly_count:,} anomalies ({anomaly_ratio:.2%}). "
                        f"LOF scores: mean={info['mean_lof']:.2f}, max={info['max_lof']:.2f}"
                    ),
                    expected=f"Anomaly ratio <= {self.max_anomaly_ratio:.2%}",
                )
            )

        return issues
899
+
900
+
901
@register_validator
class OnlineSVMValidator(AnomalyValidator, MLAnomalyMixin):
    """Online One-Class SVM using SGD for memory-efficient training.

    This validator uses kernel approximation (Nystroem) and SGD optimization
    to train One-Class SVM incrementally, avoiding the O(n²) kernel matrix.

    Memory Complexity:
        - Standard SVM: O(n²) for kernel matrix
        - This implementation: O(n_components × n_features) constant

    Use this for datasets > 20,000 rows where standard SVM would run out of memory.

    Example:
        # For large datasets
        validator = OnlineSVMValidator(
            columns=["feature1", "feature2"],
            nu=0.05,
            n_components=100,  # Kernel approximation components
        )
    """

    name = "online_svm"

    def __init__(
        self,
        columns: list[str] | None = None,
        nu: float = 0.05,
        n_components: int = 100,
        kernel_approx: str = "nystroem",
        max_anomaly_ratio: float = 0.1,
        n_iterations: int = 3,
        batch_size: int = 1000,
        **kwargs: Any,
    ):
        """Initialize online SVM validator.

        Args:
            columns: Columns to use for detection
            nu: Upper bound on fraction of outliers
            n_components: Number of kernel approximation components. The
                value actually used is capped at the number of rows.
            kernel_approx: Kernel approximation method ('nystroem' or 'rbf_sampler')
            max_anomaly_ratio: Maximum acceptable ratio of anomalies
            n_iterations: Number of passes through data
            batch_size: Mini-batch size for training
            **kwargs: Additional config
        """
        super().__init__(columns=columns, max_anomaly_ratio=max_anomaly_ratio, **kwargs)
        self.nu = nu
        self.n_components = n_components
        self.kernel_approx = kernel_approx
        self.n_iterations = n_iterations
        self.batch_size = batch_size

    def detect_anomalies(
        self, data: np.ndarray, column_names: list[str]
    ) -> tuple[np.ndarray, dict[str, Any]]:
        """Detect anomalies using online SVM.

        Args:
            data: Numeric array with one row per observation.
            column_names: Names of the columns in ``data`` (not used here).

        Returns:
            A boolean mask (True = anomaly) and a dict with nu, the number
            of kernel approximation components actually used, and the
            min / max decision score.
        """
        from truthound.validators.memory import SGDOneClassSVM

        # Normalize data
        normalized_data, _, _ = self.normalize_data(data)

        # Cap the component count at the number of rows and remember the
        # effective value so the reported figure matches the fitted model
        effective_components = min(self.n_components, len(data))

        # Create online SVM
        model = SGDOneClassSVM(
            nu=self.nu,
            n_components=effective_components,
            kernel_approx=self.kernel_approx,
        )

        # Train incrementally: n_iterations passes over consecutive mini-batches
        n_samples = len(normalized_data)
        for _ in range(self.n_iterations):
            for start in range(0, n_samples, self.batch_size):
                end = min(start + self.batch_size, n_samples)
                model.partial_fit(normalized_data[start:end])

        # Predict (-1 marks an anomaly)
        predictions = model.predict(normalized_data)
        anomaly_mask = predictions == -1

        # Get decision scores
        scores = model.decision_function(normalized_data)

        return anomaly_mask, {
            "nu": self.nu,
            "n_components": effective_components,
            "min_score": float(np.min(scores)),
            "max_score": float(np.max(scores)),
        }

    def validate(self, lf: pl.LazyFrame) -> list[ValidationIssue]:
        """Run the online SVM and report an excessive anomaly ratio.

        Args:
            lf: LazyFrame to validate.

        Returns:
            An empty list when there are no usable columns, fewer than 10
            complete rows, or the anomaly ratio is within
            ``max_anomaly_ratio``; otherwise a list containing a single
            ``online_svm_anomaly`` issue.
        """
        issues: list[ValidationIssue] = []

        columns = self._get_anomaly_columns(lf)
        if not columns:
            return issues

        # Rows with a null in any selected column are dropped before fitting
        df = lf.select([pl.col(c) for c in columns]).drop_nulls().collect()

        if len(df) < 10:
            return issues

        data = df.to_numpy()
        anomaly_mask, info = self.detect_anomalies(data, columns)

        anomaly_count = int(anomaly_mask.sum())
        anomaly_ratio = anomaly_count / len(data)

        if anomaly_ratio > self.max_anomaly_ratio:
            severity = self._calculate_severity(anomaly_ratio)

            issues.append(
                ValidationIssue(
                    column=", ".join(columns),
                    issue_type="online_svm_anomaly",
                    count=anomaly_count,
                    severity=severity,
                    details=(
                        f"Online SVM (nu={info['nu']}, components={info['n_components']}) "
                        f"detected {anomaly_count:,} anomalies ({anomaly_ratio:.2%}). "
                        f"Score range: [{info['min_score']:.4f}, {info['max_score']:.4f}]"
                    ),
                    expected=f"Anomaly ratio <= {self.max_anomaly_ratio:.2%}",
                )
            )

        return issues
1029
+
1030
+
1031
@register_validator
class DBSCANAnomalyValidator(AnomalyValidator, MLAnomalyMixin, LargeDatasetMixin):
    """Anomaly detection built on DBSCAN clustering.

    DBSCAN (Density-Based Spatial Clustering of Applications with Noise)
    treats points that do not belong to any cluster as noise, and this
    validator reports those noise points as anomalies. It is best suited
    to discovering clusters of arbitrary shape while flagging the noise.

    Memory Optimization:
        DBSCAN requires pairwise distance computation. For large datasets:

            validator = DBSCANAnomalyValidator(
                columns=["x", "y"],
                eps=0.5,
                sample_size=50000,  # Sample for large datasets
            )

    Example:
        validator = DBSCANAnomalyValidator(
            columns=["x", "y"],
            eps=0.5,  # Maximum distance between points
            min_samples=5,  # Minimum cluster size
        )
    """

    name = "dbscan_anomaly"

    def __init__(
        self,
        columns: list[str] | None = None,
        eps: float = 0.5,
        min_samples: int = 5,
        metric: str = "euclidean",
        max_anomaly_ratio: float = 0.1,
        sample_size: int | None = None,
        auto_sample: bool = False,
        max_memory_mb: float = MEMORY_THRESHOLD_MB,
        **kwargs: Any,
    ):
        """Store the DBSCAN hyper-parameters and the sampling configuration.

        Args:
            columns: Columns used for detection.
            eps: Maximum distance between points in a cluster.
            min_samples: Minimum number of points for a core point.
            metric: Name of the distance metric, forwarded to the model.
            max_anomaly_ratio: Largest anomaly ratio that is still acceptable.
            sample_size: Upper bound on rows processed (None = all rows).
            auto_sample: When True and ``sample_size`` is None, a sample size
                is derived automatically during ``validate``.
            max_memory_mb: Memory budget (MB) used by the auto-sample mode.
            **kwargs: Extra configuration passed on to the parent class.
        """
        super().__init__(columns=columns, max_anomaly_ratio=max_anomaly_ratio, **kwargs)

        # Model hyper-parameters
        self.eps = eps
        self.min_samples = min_samples
        self.metric = metric

        # Sampling configuration
        self._sample_size = sample_size
        self._auto_sample = auto_sample
        self._max_memory_mb = max_memory_mb

    def detect_anomalies(
        self, data: np.ndarray, column_names: list[str]
    ) -> tuple[np.ndarray, dict[str, Any]]:
        """Cluster ``data`` with DBSCAN and flag the noise points.

        Args:
            data: Numeric array with one row per observation.
            column_names: Names of the columns in ``data`` (not used here).

        Returns:
            A boolean mask (True = noise point) and a dict with the number
            of clusters found plus the ``eps`` and ``min_samples`` settings.
        """
        _check_sklearn_available()
        from sklearn.cluster import DBSCAN

        scaled, _, _ = self.normalize_data(data)

        clusterer = DBSCAN(
            eps=self.eps,
            min_samples=self.min_samples,
            metric=self.metric,
            n_jobs=-1,
        )
        labels = clusterer.fit_predict(scaled)

        # The label -1 marks noise, which is what gets reported as anomalous
        is_noise = labels == -1

        # Every distinct label is a cluster, except for the noise label itself
        noise_label_present = 1 if -1 in labels else 0
        n_clusters = len(set(labels)) - noise_label_present

        summary: dict[str, Any] = {
            "n_clusters": n_clusters,
            "eps": self.eps,
            "min_samples": self.min_samples,
        }
        return is_noise, summary

    def validate(self, lf: pl.LazyFrame) -> list[ValidationIssue]:
        """Run DBSCAN over the selected columns and report an excessive noise ratio.

        Args:
            lf: LazyFrame to validate.

        Returns:
            An empty list when there are no usable columns, fewer rows than
            ``min_samples``, or the anomaly ratio is within
            ``max_anomaly_ratio``; otherwise a list containing a single
            ``dbscan_anomaly`` issue.
        """
        issues: list[ValidationIssue] = []

        columns = self._get_anomaly_columns(lf)
        if not columns:
            return issues

        # DBSCAN needs O(n^2) distance computations, hence the sampling
        sample_size = self._sample_size
        if self._auto_sample and sample_size is None:
            total_count = lf.select(pl.len()).collect().item()
            # Half of the memory budget is used (conservative), and the
            # result is capped at 50000 rows
            sample_size = min(
                _compute_optimal_sample_size(
                    total_count, len(columns), self._max_memory_mb / 2
                ),
                50000,
            )
            self.logger.debug(f"Auto-sample (DBSCAN): using {sample_size} samples from {total_count}")

        # 42 is passed as the fourth positional argument
        # (presumably the sampling seed; confirm against the mixin)
        data, original_count, was_sampled = self._smart_sample_lazyframe(
            lf, columns, sample_size, 42
        )

        # Fewer rows than min_samples: skip detection entirely
        if len(data) < self.min_samples:
            return issues

        anomaly_mask, info = self.detect_anomalies(data, columns)

        # The ratio always comes from the rows that were actually clustered;
        # only the reported count is extrapolated when a sample was used
        flagged_in_data = int(anomaly_mask.sum())
        anomaly_ratio = flagged_in_data / len(data)

        if was_sampled and len(data) < original_count:
            anomaly_count = int(anomaly_ratio * original_count)
            info.update(
                sampled=True,
                sample_size=len(data),
                original_count=original_count,
            )
        else:
            anomaly_count = flagged_in_data
            info["sampled"] = False

        if anomaly_ratio > self.max_anomaly_ratio:
            severity = self._calculate_severity(anomaly_ratio)
            sample_note = (
                f" (estimated from {info['sample_size']:,} samples)"
                if info.get("sampled")
                else ""
            )
            issue = ValidationIssue(
                column=", ".join(columns),
                issue_type="dbscan_anomaly",
                count=anomaly_count,
                severity=severity,
                details=(
                    f"DBSCAN (eps={info['eps']}, min_samples={info['min_samples']}) "
                    f"found {info['n_clusters']} clusters and {anomaly_count:,} noise points "
                    f"({anomaly_ratio:.2%}){sample_note}"
                ),
                expected=f"Anomaly ratio <= {self.max_anomaly_ratio:.2%}",
            )
            issues.append(issue)

        return issues