truthound-dashboard 1.4.4__py3-none-any.whl → 1.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (205)
  1. truthound_dashboard/api/alerts.py +75 -86
  2. truthound_dashboard/api/anomaly.py +7 -13
  3. truthound_dashboard/api/cross_alerts.py +38 -52
  4. truthound_dashboard/api/drift.py +49 -59
  5. truthound_dashboard/api/drift_monitor.py +234 -79
  6. truthound_dashboard/api/enterprise_sampling.py +498 -0
  7. truthound_dashboard/api/history.py +57 -5
  8. truthound_dashboard/api/lineage.py +3 -48
  9. truthound_dashboard/api/maintenance.py +104 -49
  10. truthound_dashboard/api/mask.py +1 -2
  11. truthound_dashboard/api/middleware.py +2 -1
  12. truthound_dashboard/api/model_monitoring.py +435 -311
  13. truthound_dashboard/api/notifications.py +227 -191
  14. truthound_dashboard/api/notifications_advanced.py +21 -20
  15. truthound_dashboard/api/observability.py +586 -0
  16. truthound_dashboard/api/plugins.py +2 -433
  17. truthound_dashboard/api/profile.py +199 -37
  18. truthound_dashboard/api/quality_reporter.py +701 -0
  19. truthound_dashboard/api/reports.py +7 -16
  20. truthound_dashboard/api/router.py +66 -0
  21. truthound_dashboard/api/rule_suggestions.py +5 -5
  22. truthound_dashboard/api/scan.py +17 -19
  23. truthound_dashboard/api/schedules.py +85 -50
  24. truthound_dashboard/api/schema_evolution.py +6 -6
  25. truthound_dashboard/api/schema_watcher.py +667 -0
  26. truthound_dashboard/api/sources.py +98 -27
  27. truthound_dashboard/api/tiering.py +1323 -0
  28. truthound_dashboard/api/triggers.py +14 -11
  29. truthound_dashboard/api/validations.py +12 -11
  30. truthound_dashboard/api/versioning.py +1 -6
  31. truthound_dashboard/core/__init__.py +129 -3
  32. truthound_dashboard/core/actions/__init__.py +62 -0
  33. truthound_dashboard/core/actions/custom.py +426 -0
  34. truthound_dashboard/core/actions/notifications.py +910 -0
  35. truthound_dashboard/core/actions/storage.py +472 -0
  36. truthound_dashboard/core/actions/webhook.py +281 -0
  37. truthound_dashboard/core/anomaly.py +262 -67
  38. truthound_dashboard/core/anomaly_explainer.py +4 -3
  39. truthound_dashboard/core/backends/__init__.py +67 -0
  40. truthound_dashboard/core/backends/base.py +299 -0
  41. truthound_dashboard/core/backends/errors.py +191 -0
  42. truthound_dashboard/core/backends/factory.py +423 -0
  43. truthound_dashboard/core/backends/mock_backend.py +451 -0
  44. truthound_dashboard/core/backends/truthound_backend.py +718 -0
  45. truthound_dashboard/core/checkpoint/__init__.py +87 -0
  46. truthound_dashboard/core/checkpoint/adapters.py +814 -0
  47. truthound_dashboard/core/checkpoint/checkpoint.py +491 -0
  48. truthound_dashboard/core/checkpoint/runner.py +270 -0
  49. truthound_dashboard/core/connections.py +645 -23
  50. truthound_dashboard/core/converters/__init__.py +14 -0
  51. truthound_dashboard/core/converters/truthound.py +620 -0
  52. truthound_dashboard/core/cross_alerts.py +540 -320
  53. truthound_dashboard/core/datasource_factory.py +1672 -0
  54. truthound_dashboard/core/drift_monitor.py +216 -20
  55. truthound_dashboard/core/enterprise_sampling.py +1291 -0
  56. truthound_dashboard/core/interfaces/__init__.py +225 -0
  57. truthound_dashboard/core/interfaces/actions.py +652 -0
  58. truthound_dashboard/core/interfaces/base.py +247 -0
  59. truthound_dashboard/core/interfaces/checkpoint.py +676 -0
  60. truthound_dashboard/core/interfaces/protocols.py +664 -0
  61. truthound_dashboard/core/interfaces/reporters.py +650 -0
  62. truthound_dashboard/core/interfaces/routing.py +646 -0
  63. truthound_dashboard/core/interfaces/triggers.py +619 -0
  64. truthound_dashboard/core/lineage.py +407 -71
  65. truthound_dashboard/core/model_monitoring.py +431 -3
  66. truthound_dashboard/core/notifications/base.py +4 -0
  67. truthound_dashboard/core/notifications/channels.py +501 -1203
  68. truthound_dashboard/core/notifications/deduplication/__init__.py +81 -115
  69. truthound_dashboard/core/notifications/deduplication/service.py +131 -348
  70. truthound_dashboard/core/notifications/dispatcher.py +202 -11
  71. truthound_dashboard/core/notifications/escalation/__init__.py +119 -106
  72. truthound_dashboard/core/notifications/escalation/engine.py +168 -358
  73. truthound_dashboard/core/notifications/routing/__init__.py +88 -128
  74. truthound_dashboard/core/notifications/routing/engine.py +90 -317
  75. truthound_dashboard/core/notifications/stats_aggregator.py +246 -1
  76. truthound_dashboard/core/notifications/throttling/__init__.py +67 -50
  77. truthound_dashboard/core/notifications/throttling/builder.py +117 -255
  78. truthound_dashboard/core/notifications/truthound_adapter.py +842 -0
  79. truthound_dashboard/core/phase5/collaboration.py +1 -1
  80. truthound_dashboard/core/plugins/lifecycle/__init__.py +0 -13
  81. truthound_dashboard/core/quality_reporter.py +1359 -0
  82. truthound_dashboard/core/report_history.py +0 -6
  83. truthound_dashboard/core/reporters/__init__.py +175 -14
  84. truthound_dashboard/core/reporters/adapters.py +943 -0
  85. truthound_dashboard/core/reporters/base.py +0 -3
  86. truthound_dashboard/core/reporters/builtin/__init__.py +18 -0
  87. truthound_dashboard/core/reporters/builtin/csv_reporter.py +111 -0
  88. truthound_dashboard/core/reporters/builtin/html_reporter.py +270 -0
  89. truthound_dashboard/core/reporters/builtin/json_reporter.py +127 -0
  90. truthound_dashboard/core/reporters/compat.py +266 -0
  91. truthound_dashboard/core/reporters/csv_reporter.py +2 -35
  92. truthound_dashboard/core/reporters/factory.py +526 -0
  93. truthound_dashboard/core/reporters/interfaces.py +745 -0
  94. truthound_dashboard/core/reporters/registry.py +1 -10
  95. truthound_dashboard/core/scheduler.py +165 -0
  96. truthound_dashboard/core/schema_evolution.py +3 -3
  97. truthound_dashboard/core/schema_watcher.py +1528 -0
  98. truthound_dashboard/core/services.py +595 -76
  99. truthound_dashboard/core/store_manager.py +810 -0
  100. truthound_dashboard/core/streaming_anomaly.py +169 -4
  101. truthound_dashboard/core/tiering.py +1309 -0
  102. truthound_dashboard/core/triggers/evaluators.py +178 -8
  103. truthound_dashboard/core/truthound_adapter.py +2620 -197
  104. truthound_dashboard/core/unified_alerts.py +23 -20
  105. truthound_dashboard/db/__init__.py +8 -0
  106. truthound_dashboard/db/database.py +8 -2
  107. truthound_dashboard/db/models.py +944 -25
  108. truthound_dashboard/db/repository.py +2 -0
  109. truthound_dashboard/main.py +15 -0
  110. truthound_dashboard/schemas/__init__.py +177 -16
  111. truthound_dashboard/schemas/base.py +44 -23
  112. truthound_dashboard/schemas/collaboration.py +19 -6
  113. truthound_dashboard/schemas/cross_alerts.py +19 -3
  114. truthound_dashboard/schemas/drift.py +61 -55
  115. truthound_dashboard/schemas/drift_monitor.py +67 -23
  116. truthound_dashboard/schemas/enterprise_sampling.py +653 -0
  117. truthound_dashboard/schemas/lineage.py +0 -33
  118. truthound_dashboard/schemas/mask.py +10 -8
  119. truthound_dashboard/schemas/model_monitoring.py +89 -10
  120. truthound_dashboard/schemas/notifications_advanced.py +13 -0
  121. truthound_dashboard/schemas/observability.py +453 -0
  122. truthound_dashboard/schemas/plugins.py +0 -280
  123. truthound_dashboard/schemas/profile.py +154 -247
  124. truthound_dashboard/schemas/quality_reporter.py +403 -0
  125. truthound_dashboard/schemas/reports.py +2 -2
  126. truthound_dashboard/schemas/rule_suggestion.py +8 -1
  127. truthound_dashboard/schemas/scan.py +4 -24
  128. truthound_dashboard/schemas/schedule.py +11 -3
  129. truthound_dashboard/schemas/schema_watcher.py +727 -0
  130. truthound_dashboard/schemas/source.py +17 -2
  131. truthound_dashboard/schemas/tiering.py +822 -0
  132. truthound_dashboard/schemas/triggers.py +16 -0
  133. truthound_dashboard/schemas/unified_alerts.py +7 -0
  134. truthound_dashboard/schemas/validation.py +0 -13
  135. truthound_dashboard/schemas/validators/base.py +41 -21
  136. truthound_dashboard/schemas/validators/business_rule_validators.py +244 -0
  137. truthound_dashboard/schemas/validators/localization_validators.py +273 -0
  138. truthound_dashboard/schemas/validators/ml_feature_validators.py +308 -0
  139. truthound_dashboard/schemas/validators/profiling_validators.py +275 -0
  140. truthound_dashboard/schemas/validators/referential_validators.py +312 -0
  141. truthound_dashboard/schemas/validators/registry.py +93 -8
  142. truthound_dashboard/schemas/validators/timeseries_validators.py +389 -0
  143. truthound_dashboard/schemas/versioning.py +1 -6
  144. truthound_dashboard/static/index.html +2 -2
  145. truthound_dashboard-1.5.1.dist-info/METADATA +312 -0
  146. {truthound_dashboard-1.4.4.dist-info → truthound_dashboard-1.5.1.dist-info}/RECORD +149 -148
  147. truthound_dashboard/core/plugins/hooks/__init__.py +0 -63
  148. truthound_dashboard/core/plugins/hooks/decorators.py +0 -367
  149. truthound_dashboard/core/plugins/hooks/manager.py +0 -403
  150. truthound_dashboard/core/plugins/hooks/protocols.py +0 -265
  151. truthound_dashboard/core/plugins/lifecycle/hot_reload.py +0 -584
  152. truthound_dashboard/core/reporters/junit_reporter.py +0 -233
  153. truthound_dashboard/core/reporters/markdown_reporter.py +0 -207
  154. truthound_dashboard/core/reporters/pdf_reporter.py +0 -209
  155. truthound_dashboard/static/assets/_baseUniq-BcrSP13d.js +0 -1
  156. truthound_dashboard/static/assets/arc-DlYjKwIL.js +0 -1
  157. truthound_dashboard/static/assets/architectureDiagram-VXUJARFQ-Bb2drbQM.js +0 -36
  158. truthound_dashboard/static/assets/blockDiagram-VD42YOAC-BlsPG1CH.js +0 -122
  159. truthound_dashboard/static/assets/c4Diagram-YG6GDRKO-B9JdUoaC.js +0 -10
  160. truthound_dashboard/static/assets/channel-Q6mHF1Hd.js +0 -1
  161. truthound_dashboard/static/assets/chunk-4BX2VUAB-DmyoPVuJ.js +0 -1
  162. truthound_dashboard/static/assets/chunk-55IACEB6-Bcz6Siv8.js +0 -1
  163. truthound_dashboard/static/assets/chunk-B4BG7PRW-Br3G5Rum.js +0 -165
  164. truthound_dashboard/static/assets/chunk-DI55MBZ5-DuM9c23u.js +0 -220
  165. truthound_dashboard/static/assets/chunk-FMBD7UC4-DNU-5mvT.js +0 -15
  166. truthound_dashboard/static/assets/chunk-QN33PNHL-Im2yNcmS.js +0 -1
  167. truthound_dashboard/static/assets/chunk-QZHKN3VN-kZr8XFm1.js +0 -1
  168. truthound_dashboard/static/assets/chunk-TZMSLE5B-Q__360q_.js +0 -1
  169. truthound_dashboard/static/assets/classDiagram-2ON5EDUG-vtixxUyK.js +0 -1
  170. truthound_dashboard/static/assets/classDiagram-v2-WZHVMYZB-vtixxUyK.js +0 -1
  171. truthound_dashboard/static/assets/clone-BOt2LwD0.js +0 -1
  172. truthound_dashboard/static/assets/cose-bilkent-S5V4N54A-CBDw6iac.js +0 -1
  173. truthound_dashboard/static/assets/dagre-6UL2VRFP-XdKqmmY9.js +0 -4
  174. truthound_dashboard/static/assets/diagram-PSM6KHXK-DAZ8nx9V.js +0 -24
  175. truthound_dashboard/static/assets/diagram-QEK2KX5R-BRvDTbGD.js +0 -43
  176. truthound_dashboard/static/assets/diagram-S2PKOQOG-bQcczUkl.js +0 -24
  177. truthound_dashboard/static/assets/erDiagram-Q2GNP2WA-DPje7VMN.js +0 -60
  178. truthound_dashboard/static/assets/flowDiagram-NV44I4VS-B7BVtFVS.js +0 -162
  179. truthound_dashboard/static/assets/ganttDiagram-JELNMOA3-D6WKSS7U.js +0 -267
  180. truthound_dashboard/static/assets/gitGraphDiagram-NY62KEGX-D3vtVd3y.js +0 -65
  181. truthound_dashboard/static/assets/graph-BKgNKZVp.js +0 -1
  182. truthound_dashboard/static/assets/index-C6JSrkHo.css +0 -1
  183. truthound_dashboard/static/assets/index-DkU82VsU.js +0 -1800
  184. truthound_dashboard/static/assets/infoDiagram-WHAUD3N6-DnNCT429.js +0 -2
  185. truthound_dashboard/static/assets/journeyDiagram-XKPGCS4Q-DGiMozqS.js +0 -139
  186. truthound_dashboard/static/assets/kanban-definition-3W4ZIXB7-BV2gUgli.js +0 -89
  187. truthound_dashboard/static/assets/katex-Cu_Erd72.js +0 -261
  188. truthound_dashboard/static/assets/layout-DI2MfQ5G.js +0 -1
  189. truthound_dashboard/static/assets/min-DYdgXVcT.js +0 -1
  190. truthound_dashboard/static/assets/mindmap-definition-VGOIOE7T-C7x4ruxz.js +0 -68
  191. truthound_dashboard/static/assets/pieDiagram-ADFJNKIX-CAJaAB9f.js +0 -30
  192. truthound_dashboard/static/assets/quadrantDiagram-AYHSOK5B-DeqwDI46.js +0 -7
  193. truthound_dashboard/static/assets/requirementDiagram-UZGBJVZJ-e3XDpZIM.js +0 -64
  194. truthound_dashboard/static/assets/sankeyDiagram-TZEHDZUN-CNnAv5Ux.js +0 -10
  195. truthound_dashboard/static/assets/sequenceDiagram-WL72ISMW-Dsne-Of3.js +0 -145
  196. truthound_dashboard/static/assets/stateDiagram-FKZM4ZOC-Ee0sQXyb.js +0 -1
  197. truthound_dashboard/static/assets/stateDiagram-v2-4FDKWEC3-B26KqW_W.js +0 -1
  198. truthound_dashboard/static/assets/timeline-definition-IT6M3QCI-DZYi2yl3.js +0 -61
  199. truthound_dashboard/static/assets/treemap-KMMF4GRG-CY3f8In2.js +0 -128
  200. truthound_dashboard/static/assets/unmerged_dictionaries-Dd7xcPWG.js +0 -1
  201. truthound_dashboard/static/assets/xychartDiagram-PRI3JC2R-CS7fydZZ.js +0 -7
  202. truthound_dashboard-1.4.4.dist-info/METADATA +0 -507
  203. {truthound_dashboard-1.4.4.dist-info → truthound_dashboard-1.5.1.dist-info}/WHEEL +0 -0
  204. {truthound_dashboard-1.4.4.dist-info → truthound_dashboard-1.5.1.dist-info}/entry_points.txt +0 -0
  205. {truthound_dashboard-1.4.4.dist-info → truthound_dashboard-1.5.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,498 @@
1
+ """Enterprise Sampling API endpoints.
2
+
3
+ This module provides REST API endpoints for truthound 1.2.10's enterprise-scale
4
+ sampling capabilities.
5
+
6
+ Endpoints:
7
+ - POST /api/v1/sampling/enterprise: Run enterprise sampling
8
+ - POST /api/v1/sampling/estimate-size: Estimate optimal sample size
9
+ - POST /api/v1/sampling/sketch: Run sketch-based estimation
10
+ - GET /api/v1/sampling/jobs: List sampling jobs
11
+ - GET /api/v1/sampling/jobs/{job_id}: Get job status
12
+ - POST /api/v1/sampling/jobs/{job_id}/cancel: Cancel job
13
+ - GET /api/v1/sampling/strategies: List available strategies
14
+ - GET /api/v1/sampling/quality-presets: List quality presets
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import logging
20
+ from typing import Any
21
+
22
+ from fastapi import APIRouter, Depends, HTTPException, Query, status
23
+ from sqlalchemy.ext.asyncio import AsyncSession
24
+
25
+ from truthound_dashboard.api.deps import get_session
26
+ from truthound_dashboard.core.enterprise_sampling import (
27
+ QUALITY_PRESETS,
28
+ SCALE_STRATEGY_MAP,
29
+ classify_dataset_scale,
30
+ get_enterprise_sampler,
31
+ get_sample_size_estimator,
32
+ get_sketch_estimator,
33
+ )
34
+ from sqlalchemy import select
35
+ from truthound_dashboard.db import Source
36
+ from truthound_dashboard.schemas.enterprise_sampling import (
37
+ BlockSamplingConfig,
38
+ ColumnAwareSamplingConfig,
39
+ EnterpriseSamplingRequest,
40
+ EnterpriseSamplingResponse,
41
+ EnterpriseSamplingStrategy,
42
+ MemoryBudgetConfig,
43
+ MultiStageSamplingConfig,
44
+ ParallelSamplingConfig,
45
+ ProgressiveSamplingConfig,
46
+ SampleSizeEstimateRequest,
47
+ SampleSizeEstimateResponse,
48
+ SamplingJobListResponse,
49
+ SamplingJobStatus,
50
+ SamplingQuality,
51
+ ScaleCategory,
52
+ SchedulingPolicy,
53
+ SketchConfig,
54
+ SketchEstimateRequest,
55
+ SketchEstimateResponse,
56
+ SketchType,
57
+ )
58
+
59
+ logger = logging.getLogger(__name__)
60
+
61
+ router = APIRouter(prefix="/sampling", tags=["Enterprise Sampling"])
62
+
63
+
64
+ # ============================================================================
65
+ # Response Models for API Documentation
66
+ # ============================================================================
67
+
68
+
69
class StrategyInfo:
    """Descriptive metadata for one enterprise sampling strategy.

    Plain value holder used to build the ``/strategies`` listing; it carries
    no behavior of its own.

    Attributes:
        name: Display name of the strategy.
        value: Identifier string of the strategy as exposed by the API.
        description: One-sentence explanation of what the strategy does.
        best_for: Short hint on when the strategy is a good fit.
        supports_parallel: Whether the strategy is flagged as parallel-capable.
        supports_streaming: Whether the strategy is flagged as streaming-capable.
    """

    def __init__(
        self,
        name: str,
        value: str,
        description: str,
        best_for: str,
        supports_parallel: bool = False,
        supports_streaming: bool = False,
    ) -> None:
        self.name = name
        self.value = value
        self.description = description
        self.best_for = best_for
        self.supports_parallel = supports_parallel
        self.supports_streaming = supports_streaming

    def __repr__(self) -> str:
        # Keep the repr short: the identifying fields plus the capability flags.
        return (
            f"{type(self).__name__}(name={self.name!r}, value={self.value!r}, "
            f"supports_parallel={self.supports_parallel!r}, "
            f"supports_streaming={self.supports_streaming!r})"
        )
87
+
88
+
89
# Documentation catalogue served by the ``/strategies`` endpoint, keyed by
# strategy enum member. Insertion order is the order the API returns them in.
# Only BLOCK is flagged as parallel-capable; no entry sets supports_streaming.
STRATEGY_DOCS: dict[EnterpriseSamplingStrategy, StrategyInfo] = {
    EnterpriseSamplingStrategy.NONE: StrategyInfo(
        "No Sampling",
        "none",
        "Use full dataset without sampling",
        "Datasets < 1M rows",
    ),
    EnterpriseSamplingStrategy.RANDOM: StrategyInfo(
        "Random Sampling",
        "random",
        "Simple random sampling without replacement",
        "General purpose, uniform distributions",
    ),
    EnterpriseSamplingStrategy.BLOCK: StrategyInfo(
        "Block Sampling",
        "block",
        "Divides data into blocks and samples proportionally from each",
        "10M-100M rows, when coverage across data is important",
        supports_parallel=True,
    ),
    EnterpriseSamplingStrategy.MULTI_STAGE: StrategyInfo(
        "Multi-Stage Sampling",
        "multi_stage",
        "Hierarchical sampling in multiple progressive passes",
        "100M-1B rows, when quick estimates are acceptable",
    ),
    EnterpriseSamplingStrategy.COLUMN_AWARE: StrategyInfo(
        "Column-Aware Sampling",
        "column_aware",
        "Adjusts sample size based on column type complexity",
        "Datasets with mixed column types",
    ),
    EnterpriseSamplingStrategy.PROGRESSIVE: StrategyInfo(
        "Progressive Sampling",
        "progressive",
        "Iteratively increases sample size until convergence",
        "Exploratory analysis, early stopping when possible",
    ),
    EnterpriseSamplingStrategy.ADAPTIVE: StrategyInfo(
        "Adaptive (Auto-Select)",
        "adaptive",
        "Automatically selects best strategy based on data characteristics",
        "When unsure which strategy to use",
    ),
}
134
+
135
+
136
+ # ============================================================================
137
+ # Endpoints
138
+ # ============================================================================
139
+
140
+
141
def _load_sampling_frame(data_path: str):
    """Open a source file as a polars LazyFrame and measure it.

    This function is synchronous because polars file I/O blocks; the endpoint
    runs it in a worker thread so the event loop stays responsive.

    Args:
        data_path: Path of the file backing the source. The format is chosen
            from the file extension, compared case-insensitively.

    Returns:
        A tuple ``(lazy_frame, row_count, column_count)``.

    Raises:
        HTTPException: 400 when the extension is not csv/parquet/json/jsonl/ndjson.
    """
    import polars as pl

    lowered = data_path.lower()
    if lowered.endswith(".csv"):
        lf = pl.scan_csv(data_path)
    elif lowered.endswith(".parquet"):
        lf = pl.scan_parquet(data_path)
    elif lowered.endswith(".json"):
        # No lazy scanner is used for plain JSON documents: read, then go lazy.
        lf = pl.read_json(data_path).lazy()
    elif lowered.endswith((".jsonl", ".ndjson")):
        # Newline-delimited JSON can be scanned lazily instead of read eagerly.
        lf = pl.scan_ndjson(data_path)
    else:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=f"Unsupported file format: {data_path}",
        )

    column_count = len(lf.collect_schema())

    try:
        row_count = lf.select(pl.len()).collect().item()
    except Exception:
        # Best-effort fallback kept from the original behavior, but no longer
        # silent: materializing the whole frame is expensive and worth a trace.
        logger.warning(
            "Lazy row count failed for %s; collecting the full frame",
            data_path,
            exc_info=True,
        )
        row_count = len(lf.collect())

    return lf, row_count, column_count


@router.post(
    "/enterprise",
    response_model=EnterpriseSamplingResponse,
    summary="Run enterprise-scale sampling",
    description="""
Execute enterprise-scale sampling on a data source.

Supports datasets from 100M to billions of rows with:
- Block sampling for parallel processing
- Multi-stage hierarchical sampling
- Column-aware adaptive sampling
- Progressive sampling with convergence detection

The response includes detailed metrics about the sampling operation.
""",
)
async def run_enterprise_sampling(
    request: EnterpriseSamplingRequest,
    db: AsyncSession = Depends(get_session),
) -> EnterpriseSamplingResponse:
    """Run enterprise-scale sampling on a data source.

    Args:
        request: Sampling configuration; ``request.source_id`` selects the source.
        db: Database session used to look up the source.

    Returns:
        The response produced by the enterprise sampler.

    Raises:
        HTTPException: 404 if the source does not exist, 400 for an unsupported
            file format, 500 for any other failure while loading or sampling.
    """
    import asyncio

    result = await db.execute(select(Source).where(Source.id == request.source_id))
    source = result.scalar_one_or_none()
    if not source:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Source not found: {request.source_id}",
        )

    try:
        # File opening and row counting block, so keep them off the event loop.
        lf, row_count, column_count = await asyncio.to_thread(
            _load_sampling_frame, source.path
        )

        sampler = get_enterprise_sampler()
        return await sampler.sample(
            config=request,
            data=lf,
            row_count=row_count,
            column_count=column_count,
        )

    except HTTPException:
        # Deliberate HTTP errors (e.g. the 400 above) pass through untouched.
        raise
    except Exception as e:
        # logger.exception records the traceback that logger.error dropped.
        logger.exception("Enterprise sampling failed: %s", e)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Sampling failed: {str(e)}",
        ) from e
220
+
221
+
222
@router.post(
    "/estimate-size",
    response_model=SampleSizeEstimateResponse,
    summary="Estimate optimal sample size",
    description="""
Calculate the optimal sample size for a given population using Cochran's formula.

Returns:
- Recommended sample size with statistical confidence
- Minimum and maximum useful sample sizes
- Estimated processing time and memory usage
- Recommended sampling strategy with rationale
""",
)
async def estimate_sample_size(
    request: SampleSizeEstimateRequest,
) -> SampleSizeEstimateResponse:
    """Estimate the optimal sample size for the requested confidence.

    Thin pass-through: the request is handed unchanged to the sample size
    estimator and its result is returned as the response.
    """
    return get_sample_size_estimator().estimate(request)
242
+
243
+
244
@router.post(
    "/sketch",
    response_model=SketchEstimateResponse,
    summary="Run sketch-based estimation",
    description="""
Use probabilistic data structures for O(1) memory aggregations on massive datasets.

Supported sketch types:
- **HyperLogLog**: Cardinality estimation (distinct count)
- **Count-Min Sketch**: Frequency estimation (heavy hitters)
- **Bloom Filter**: Membership testing

Ideal for datasets exceeding 10B rows where exact computation is impractical.
""",
)
async def run_sketch_estimation(
    request: SketchEstimateRequest,
    db: AsyncSession = Depends(get_session),
) -> SketchEstimateResponse:
    """Run sketch-based estimation using probabilistic data structures.

    Args:
        request: Sketch configuration; ``request.source_id`` selects the source
            and ``request.columns`` lists the columns to estimate on.
        db: Database session used to look up the source.

    Returns:
        The response produced by the sketch estimator.

    Raises:
        HTTPException: 404 if the source does not exist, 400 if a requested
            column is missing from the data, 500 for any other failure.
    """
    result = await db.execute(select(Source).where(Source.id == request.source_id))
    source = result.scalar_one_or_none()
    if not source:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Source not found: {request.source_id}",
        )

    try:
        import polars as pl

        # Pick the reader from the extension (case-insensitive). JSON and NDJSON
        # are handled explicitly so they are no longer misparsed as CSV,
        # matching the formats accepted by the /enterprise endpoint.
        data_path = source.path
        lowered = data_path.lower()
        if lowered.endswith(".csv"):
            lf = pl.scan_csv(data_path)
        elif lowered.endswith(".parquet"):
            lf = pl.scan_parquet(data_path)
        elif lowered.endswith(".json"):
            lf = pl.read_json(data_path).lazy()
        elif lowered.endswith((".jsonl", ".ndjson")):
            lf = pl.scan_ndjson(data_path)
        else:
            # Unknown extensions keep the original behavior: treat as CSV.
            lf = pl.read_csv(data_path).lazy()

        # Reject the request on the first requested column the data lacks.
        schema = lf.collect_schema()
        for col in request.columns:
            if col not in schema:
                raise HTTPException(
                    status_code=status.HTTP_400_BAD_REQUEST,
                    detail=f"Column not found: {col}",
                )

        estimator = get_sketch_estimator()
        return await estimator.estimate(request, lf)

    except HTTPException:
        # Deliberate HTTP errors (e.g. the 400 above) pass through untouched.
        raise
    except Exception as e:
        # logger.exception records the traceback that logger.error dropped.
        logger.exception("Sketch estimation failed: %s", e)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Sketch estimation failed: {str(e)}",
        ) from e
308
+
309
+
310
@router.get(
    "/jobs",
    response_model=SamplingJobListResponse,
    summary="List sampling jobs",
    description="List all active and recent sampling jobs.",
)
async def list_sampling_jobs(
    status_filter: str | None = Query(
        None,
        description="Filter by status: pending, running, completed, failed",
    ),
    limit: int = Query(50, ge=1, le=100, description="Maximum jobs to return"),
) -> SamplingJobListResponse:
    """Return the sampler's jobs, optionally narrowed to a single status.

    Args:
        status_filter: When given, only jobs whose ``status`` equals it are kept.
        limit: Maximum number of jobs placed in the ``jobs`` list.

    Returns:
        A response whose ``total`` counts every matching job (before the limit)
        and whose ``active_count`` counts the pending/running ones among them.
    """
    matching = get_enterprise_sampler().list_jobs()

    if status_filter:
        matching = [job for job in matching if job.status == status_filter]

    # NOTE(review): active_count is computed after the status filter, so it
    # reflects the filtered set rather than all jobs — confirm this is intended.
    active = [job for job in matching if job.status in ("pending", "running")]

    return SamplingJobListResponse(
        jobs=matching[:limit],
        total=len(matching),
        active_count=len(active),
    )
339
+
340
+
341
@router.get(
    "/jobs/{job_id}",
    response_model=SamplingJobStatus,
    summary="Get job status",
    description="Get the status of a specific sampling job.",
)
async def get_job_status(job_id: str) -> SamplingJobStatus:
    """Look up a single sampling job by its identifier.

    Raises:
        HTTPException: 404 when the sampler has no job for ``job_id``.
    """
    found = get_enterprise_sampler().get_job_status(job_id)
    if found:
        return found

    raise HTTPException(
        status_code=status.HTTP_404_NOT_FOUND,
        detail=f"Job not found: {job_id}",
    )
359
+
360
+
361
@router.post(
    "/jobs/{job_id}/cancel",
    summary="Cancel sampling job",
    description="Cancel an active sampling job.",
)
async def cancel_sampling_job(job_id: str) -> dict[str, Any]:
    """Mark a pending or running sampling job as cancelled.

    Raises:
        HTTPException: 404 when the job is unknown, 400 when its status is
            neither ``pending`` nor ``running``.
    """
    cancellable_states = ("pending", "running")
    target = get_enterprise_sampler().get_job_status(job_id)

    if not target:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Job not found: {job_id}",
        )

    if target.status not in cancellable_states:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=f"Job cannot be cancelled: status is {target.status}",
        )

    # Placeholder behavior: only the status attribute on the returned job
    # object is changed here; no running work is actually interrupted.
    target.status = "cancelled"

    return {
        "job_id": job_id,
        "status": "cancelled",
        "message": "Job cancellation requested",
    }
388
+
389
+
390
@router.get(
    "/strategies",
    summary="List available strategies",
    description="List all available enterprise sampling strategies with descriptions.",
)
async def list_strategies() -> list[dict[str, Any]]:
    """List available sampling strategies.

    Returns:
        One dict per ``STRATEGY_DOCS`` entry, in the catalogue's order, with the
        keys ``name``, ``value``, ``description``, ``best_for``,
        ``supports_parallel`` and ``supports_streaming``.
    """
    # Only the StrategyInfo values are needed; the enum keys were never used.
    return [
        {
            "name": info.name,
            "value": info.value,
            "description": info.description,
            "best_for": info.best_for,
            "supports_parallel": info.supports_parallel,
            "supports_streaming": info.supports_streaming,
        }
        for info in STRATEGY_DOCS.values()
    ]
410
+
411
+
412
@router.get(
    "/quality-presets",
    summary="List quality presets",
    description="List available sampling quality presets with their configurations.",
)
async def list_quality_presets() -> list[dict[str, Any]]:
    """Describe every entry of ``QUALITY_PRESETS``.

    Returns:
        One dict per preset with its name, a human-readable description (empty
        string when none is defined here) and the preset's ``target_rows``,
        ``confidence_level`` and ``margin_of_error``.
    """
    blurbs = {
        SamplingQuality.SKETCH: "Fast approximation using probabilistic structures",
        SamplingQuality.QUICK: "Quick estimates with 90% confidence",
        SamplingQuality.STANDARD: "Balanced sampling with 95% confidence (recommended)",
        SamplingQuality.HIGH: "High accuracy with 99% confidence",
        SamplingQuality.EXACT: "Full scan without sampling",
    }

    return [
        {
            "name": level.value,
            "description": blurbs.get(level, ""),
            "target_rows": settings["target_rows"],
            "confidence_level": settings["confidence_level"],
            "margin_of_error": settings["margin_of_error"],
        }
        for level, settings in QUALITY_PRESETS.items()
    ]
439
+
440
+
441
@router.get(
    "/scale-categories",
    summary="List scale categories",
    description="List dataset scale categories with recommended strategies.",
)
async def list_scale_categories() -> list[dict[str, Any]]:
    """Describe each dataset scale category, smallest first.

    Returns:
        One dict per category with its name, row-count range, the strategy
        ``SCALE_STRATEGY_MAP`` recommends for it, and a short description.
    """
    # (category, human-readable row range, description), in ascending size.
    catalogue = (
        (ScaleCategory.SMALL, "< 1M", "Small datasets that don't require sampling"),
        (
            ScaleCategory.MEDIUM,
            "1M - 10M",
            "Medium datasets suitable for column-aware sampling",
        ),
        (
            ScaleCategory.LARGE,
            "10M - 100M",
            "Large datasets requiring block-based parallel sampling",
        ),
        (
            ScaleCategory.XLARGE,
            "100M - 1B",
            "Extra-large datasets requiring multi-stage sampling",
        ),
        (
            ScaleCategory.XXLARGE,
            "> 1B",
            "Massive datasets requiring sketches and multi-stage sampling",
        ),
    )

    return [
        {
            "name": category.value,
            "row_count_range": row_range,
            "recommended_strategy": SCALE_STRATEGY_MAP[category].value,
            "description": blurb,
        }
        for category, row_range, blurb in catalogue
    ]
482
+
483
+
484
@router.post(
    "/classify-scale",
    summary="Classify dataset scale",
    description="Classify a dataset by row count into a scale category.",
)
async def classify_scale(row_count: int = Query(..., ge=0)) -> dict[str, Any]:
    """Map a row count to its scale category and recommended strategy.

    Falls back to the column-aware strategy when the category has no entry in
    ``SCALE_STRATEGY_MAP``.
    """
    category = classify_dataset_scale(row_count)
    recommended = SCALE_STRATEGY_MAP.get(
        category, EnterpriseSamplingStrategy.COLUMN_AWARE
    )

    return {
        "row_count": row_count,
        "scale_category": category.value,
        "recommended_strategy": recommended.value,
    }
@@ -1,13 +1,18 @@
1
1
  """Validation history API endpoints.
2
2
 
3
3
  Provides endpoints for validation history and trend analysis.
4
+
5
+ API Design: Direct Response Style
6
+ - Returns data directly without success wrapper
7
+ - Errors handled via HTTPException
4
8
  """
5
9
 
6
10
  from __future__ import annotations
7
11
 
8
- from typing import Annotated, Literal
12
+ from typing import Annotated, Any, Literal
9
13
 
10
14
  from fastapi import APIRouter, Depends, HTTPException, Query
15
+ from pydantic import BaseModel, Field
11
16
 
12
17
  from truthound_dashboard.core import HistoryService
13
18
 
@@ -24,9 +29,56 @@ async def get_history_service(session: SessionDep) -> HistoryService:
24
29
  HistoryServiceDep = Annotated[HistoryService, Depends(get_history_service)]
25
30
 
26
31
 
32
class HistorySummary(BaseModel):
    """Validation history summary."""

    # Aggregate counters for the requested period.
    # NOTE(review): presumably total_runs == passed_runs + failed_runs — confirm
    # against HistoryService.get_history.
    total_runs: int
    passed_runs: int
    failed_runs: int
    # NOTE(review): unit (0-1 fraction vs. 0-100 percentage) is not visible
    # here — confirm against the service.
    success_rate: float
39
+
40
+
41
class TrendDataPoint(BaseModel):
    """Single data point in trend."""

    # Bucket label as a string.
    # NOTE(review): format presumably depends on the requested granularity
    # (hourly/daily/weekly) — confirm against the service.
    date: str
    success_rate: float
    # Per-bucket run counters.
    run_count: int
    passed_count: int
    failed_count: int
49
+
50
+
51
class FailureFrequency(BaseModel):
    """Failure frequency item."""

    # Issue label and the number of times it was counted.
    issue: str
    count: int
56
+
57
+
58
class RecentValidation(BaseModel):
    """Recent validation item."""

    # Identity and outcome of one validation run.
    id: str
    status: str
    passed: bool
    # Severity flags and issue count for the run.
    has_critical: bool
    has_high: bool
    total_issues: int
    # Timestamp carried as a string.
    # NOTE(review): presumably ISO 8601 — confirm against the service.
    created_at: str
68
+
69
+
70
class HistoryResponse(BaseModel):
    """Validation history response."""

    # Top-level payload of the history endpoint. It is built with
    # HistoryResponse(**data) from the mapping the history service returns, so
    # these field names must match that mapping's keys.
    summary: HistorySummary
    trend: list[TrendDataPoint]
    failure_frequency: list[FailureFrequency]
    recent_validations: list[RecentValidation]
77
+
78
+
27
79
  @router.get(
28
80
  "/sources/{source_id}/history",
29
- response_model=dict,
81
+ response_model=HistoryResponse,
30
82
  summary="Get validation history",
31
83
  description="Get validation history with trend analysis for a source.",
32
84
  )
@@ -37,7 +89,7 @@ async def get_validation_history(
37
89
  granularity: Literal["hourly", "daily", "weekly"] = Query(
38
90
  "daily", description="Aggregation granularity"
39
91
  ),
40
- ) -> dict:
92
+ ) -> HistoryResponse:
41
93
  """Get validation history with trend data.
42
94
 
43
95
  Args:
@@ -47,7 +99,7 @@ async def get_validation_history(
47
99
  granularity: Aggregation granularity (hourly, daily, weekly).
48
100
 
49
101
  Returns:
50
- Dictionary with summary, trend, failure_frequency, and recent_validations.
102
+ History data with summary, trend, failure_frequency, and recent_validations.
51
103
  """
52
104
  try:
53
105
  data = await service.get_history(
@@ -55,7 +107,7 @@ async def get_validation_history(
55
107
  period=period,
56
108
  granularity=granularity,
57
109
  )
58
- return {"success": True, "data": data}
110
+ return HistoryResponse(**data)
59
111
  except ValueError as e:
60
112
  raise HTTPException(status_code=404, detail=str(e))
61
113
  except Exception as e:
@@ -12,8 +12,6 @@ from fastapi import APIRouter, HTTPException, Path, Query, Body
12
12
  from truthound_dashboard.schemas.lineage import (
13
13
  AnomalyImpactResponse,
14
14
  AnomalyStatus,
15
- AutoDiscoverRequest,
16
- AutoDiscoverResponse,
17
15
  ImpactAnalysisRequest,
18
16
  ImpactAnalysisResponse,
19
17
  ImpactDirection,
@@ -368,7 +366,7 @@ async def create_edge(
368
366
  HTTPException: 400 if nodes not found or edge already exists.
369
367
  """
370
368
  try:
371
- created = await service.create_edge(
369
+ created, source_node, target_node = await service.create_edge(
372
370
  source_node_id=edge.source_node_id,
373
371
  target_node_id=edge.target_node_id,
374
372
  edge_type=edge.edge_type,
@@ -378,8 +376,8 @@ async def create_edge(
378
376
  id=created.id,
379
377
  source_node_id=created.source_node_id,
380
378
  target_node_id=created.target_node_id,
381
- source_node_name=created.source_node.name if created.source_node else None,
382
- target_node_name=created.target_node.name if created.target_node else None,
379
+ source_node_name=source_node.name if source_node else None,
380
+ target_node_name=target_node.name if target_node else None,
383
381
  edge_type=created.edge_type,
384
382
  metadata=created.metadata_json,
385
383
  created_at=created.created_at.isoformat() if created.created_at else "",
@@ -609,49 +607,6 @@ async def get_anomaly_impact(
609
607
  raise HTTPException(status_code=404, detail=str(e))
610
608
 
611
609
 
612
- # =============================================================================
613
- # Auto-Discovery Endpoints
614
- # =============================================================================
615
-
616
-
617
- @router.post(
618
- "/auto-discover",
619
- response_model=AutoDiscoverResponse,
620
- summary="Auto-discover lineage",
621
- description="Auto-discover lineage from a data source",
622
- )
623
- async def auto_discover(
624
- service: LineageServiceDep,
625
- request: AutoDiscoverRequest,
626
- ) -> AutoDiscoverResponse:
627
- """Auto-discover lineage from a source.
628
-
629
- Args:
630
- service: Injected lineage service.
631
- request: Auto-discovery request.
632
-
633
- Returns:
634
- Discovery results.
635
-
636
- Raises:
637
- HTTPException: 404 if source not found.
638
- """
639
- try:
640
- result = await service.auto_discover(
641
- source_id=request.source_id,
642
- include_fk_relations=request.include_fk_relations,
643
- max_depth=request.max_depth,
644
- )
645
- return AutoDiscoverResponse(
646
- source_id=result["source_id"],
647
- discovered_nodes=result["discovered_nodes"],
648
- discovered_edges=result["discovered_edges"],
649
- graph=LineageGraphResponse(**result["graph"]),
650
- )
651
- except ValueError as e:
652
- raise HTTPException(status_code=404, detail=str(e))
653
-
654
-
655
610
  # =============================================================================
656
611
  # Position Update Endpoints
657
612
  # =============================================================================