truthound-dashboard 1.3.1__py3-none-any.whl → 1.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (169)
  1. truthound_dashboard/api/alerts.py +258 -0
  2. truthound_dashboard/api/anomaly.py +1302 -0
  3. truthound_dashboard/api/cross_alerts.py +352 -0
  4. truthound_dashboard/api/deps.py +143 -0
  5. truthound_dashboard/api/drift_monitor.py +540 -0
  6. truthound_dashboard/api/lineage.py +1151 -0
  7. truthound_dashboard/api/maintenance.py +363 -0
  8. truthound_dashboard/api/middleware.py +373 -1
  9. truthound_dashboard/api/model_monitoring.py +805 -0
  10. truthound_dashboard/api/notifications_advanced.py +2452 -0
  11. truthound_dashboard/api/plugins.py +2096 -0
  12. truthound_dashboard/api/profile.py +211 -14
  13. truthound_dashboard/api/reports.py +853 -0
  14. truthound_dashboard/api/router.py +147 -0
  15. truthound_dashboard/api/rule_suggestions.py +310 -0
  16. truthound_dashboard/api/schema_evolution.py +231 -0
  17. truthound_dashboard/api/sources.py +47 -3
  18. truthound_dashboard/api/triggers.py +190 -0
  19. truthound_dashboard/api/validations.py +13 -0
  20. truthound_dashboard/api/validators.py +333 -4
  21. truthound_dashboard/api/versioning.py +309 -0
  22. truthound_dashboard/api/websocket.py +301 -0
  23. truthound_dashboard/core/__init__.py +27 -0
  24. truthound_dashboard/core/anomaly.py +1395 -0
  25. truthound_dashboard/core/anomaly_explainer.py +633 -0
  26. truthound_dashboard/core/cache.py +206 -0
  27. truthound_dashboard/core/cached_services.py +422 -0
  28. truthound_dashboard/core/charts.py +352 -0
  29. truthound_dashboard/core/connections.py +1069 -42
  30. truthound_dashboard/core/cross_alerts.py +837 -0
  31. truthound_dashboard/core/drift_monitor.py +1477 -0
  32. truthound_dashboard/core/drift_sampling.py +669 -0
  33. truthound_dashboard/core/i18n/__init__.py +42 -0
  34. truthound_dashboard/core/i18n/detector.py +173 -0
  35. truthound_dashboard/core/i18n/messages.py +564 -0
  36. truthound_dashboard/core/lineage.py +971 -0
  37. truthound_dashboard/core/maintenance.py +443 -5
  38. truthound_dashboard/core/model_monitoring.py +1043 -0
  39. truthound_dashboard/core/notifications/channels.py +1020 -1
  40. truthound_dashboard/core/notifications/deduplication/__init__.py +143 -0
  41. truthound_dashboard/core/notifications/deduplication/policies.py +274 -0
  42. truthound_dashboard/core/notifications/deduplication/service.py +400 -0
  43. truthound_dashboard/core/notifications/deduplication/stores.py +2365 -0
  44. truthound_dashboard/core/notifications/deduplication/strategies.py +422 -0
  45. truthound_dashboard/core/notifications/dispatcher.py +43 -0
  46. truthound_dashboard/core/notifications/escalation/__init__.py +149 -0
  47. truthound_dashboard/core/notifications/escalation/backends.py +1384 -0
  48. truthound_dashboard/core/notifications/escalation/engine.py +429 -0
  49. truthound_dashboard/core/notifications/escalation/models.py +336 -0
  50. truthound_dashboard/core/notifications/escalation/scheduler.py +1187 -0
  51. truthound_dashboard/core/notifications/escalation/state_machine.py +330 -0
  52. truthound_dashboard/core/notifications/escalation/stores.py +2896 -0
  53. truthound_dashboard/core/notifications/events.py +49 -0
  54. truthound_dashboard/core/notifications/metrics/__init__.py +115 -0
  55. truthound_dashboard/core/notifications/metrics/base.py +528 -0
  56. truthound_dashboard/core/notifications/metrics/collectors.py +583 -0
  57. truthound_dashboard/core/notifications/routing/__init__.py +169 -0
  58. truthound_dashboard/core/notifications/routing/combinators.py +184 -0
  59. truthound_dashboard/core/notifications/routing/config.py +375 -0
  60. truthound_dashboard/core/notifications/routing/config_parser.py +867 -0
  61. truthound_dashboard/core/notifications/routing/engine.py +382 -0
  62. truthound_dashboard/core/notifications/routing/expression_engine.py +1269 -0
  63. truthound_dashboard/core/notifications/routing/jinja2_engine.py +774 -0
  64. truthound_dashboard/core/notifications/routing/rules.py +625 -0
  65. truthound_dashboard/core/notifications/routing/validator.py +678 -0
  66. truthound_dashboard/core/notifications/service.py +2 -0
  67. truthound_dashboard/core/notifications/stats_aggregator.py +850 -0
  68. truthound_dashboard/core/notifications/throttling/__init__.py +83 -0
  69. truthound_dashboard/core/notifications/throttling/builder.py +311 -0
  70. truthound_dashboard/core/notifications/throttling/stores.py +1859 -0
  71. truthound_dashboard/core/notifications/throttling/throttlers.py +633 -0
  72. truthound_dashboard/core/openlineage.py +1028 -0
  73. truthound_dashboard/core/plugins/__init__.py +39 -0
  74. truthound_dashboard/core/plugins/docs/__init__.py +39 -0
  75. truthound_dashboard/core/plugins/docs/extractor.py +703 -0
  76. truthound_dashboard/core/plugins/docs/renderers.py +804 -0
  77. truthound_dashboard/core/plugins/hooks/__init__.py +63 -0
  78. truthound_dashboard/core/plugins/hooks/decorators.py +367 -0
  79. truthound_dashboard/core/plugins/hooks/manager.py +403 -0
  80. truthound_dashboard/core/plugins/hooks/protocols.py +265 -0
  81. truthound_dashboard/core/plugins/lifecycle/__init__.py +41 -0
  82. truthound_dashboard/core/plugins/lifecycle/hot_reload.py +584 -0
  83. truthound_dashboard/core/plugins/lifecycle/machine.py +419 -0
  84. truthound_dashboard/core/plugins/lifecycle/states.py +266 -0
  85. truthound_dashboard/core/plugins/loader.py +504 -0
  86. truthound_dashboard/core/plugins/registry.py +810 -0
  87. truthound_dashboard/core/plugins/reporter_executor.py +588 -0
  88. truthound_dashboard/core/plugins/sandbox/__init__.py +59 -0
  89. truthound_dashboard/core/plugins/sandbox/code_validator.py +243 -0
  90. truthound_dashboard/core/plugins/sandbox/engines.py +770 -0
  91. truthound_dashboard/core/plugins/sandbox/protocols.py +194 -0
  92. truthound_dashboard/core/plugins/sandbox.py +617 -0
  93. truthound_dashboard/core/plugins/security/__init__.py +68 -0
  94. truthound_dashboard/core/plugins/security/analyzer.py +535 -0
  95. truthound_dashboard/core/plugins/security/policies.py +311 -0
  96. truthound_dashboard/core/plugins/security/protocols.py +296 -0
  97. truthound_dashboard/core/plugins/security/signing.py +842 -0
  98. truthound_dashboard/core/plugins/security.py +446 -0
  99. truthound_dashboard/core/plugins/validator_executor.py +401 -0
  100. truthound_dashboard/core/plugins/versioning/__init__.py +51 -0
  101. truthound_dashboard/core/plugins/versioning/constraints.py +377 -0
  102. truthound_dashboard/core/plugins/versioning/dependencies.py +541 -0
  103. truthound_dashboard/core/plugins/versioning/semver.py +266 -0
  104. truthound_dashboard/core/profile_comparison.py +601 -0
  105. truthound_dashboard/core/report_history.py +570 -0
  106. truthound_dashboard/core/reporters/__init__.py +57 -0
  107. truthound_dashboard/core/reporters/base.py +296 -0
  108. truthound_dashboard/core/reporters/csv_reporter.py +155 -0
  109. truthound_dashboard/core/reporters/html_reporter.py +598 -0
  110. truthound_dashboard/core/reporters/i18n/__init__.py +65 -0
  111. truthound_dashboard/core/reporters/i18n/base.py +494 -0
  112. truthound_dashboard/core/reporters/i18n/catalogs.py +930 -0
  113. truthound_dashboard/core/reporters/json_reporter.py +160 -0
  114. truthound_dashboard/core/reporters/junit_reporter.py +233 -0
  115. truthound_dashboard/core/reporters/markdown_reporter.py +207 -0
  116. truthound_dashboard/core/reporters/pdf_reporter.py +209 -0
  117. truthound_dashboard/core/reporters/registry.py +272 -0
  118. truthound_dashboard/core/rule_generator.py +2088 -0
  119. truthound_dashboard/core/scheduler.py +822 -12
  120. truthound_dashboard/core/schema_evolution.py +858 -0
  121. truthound_dashboard/core/services.py +152 -9
  122. truthound_dashboard/core/statistics.py +718 -0
  123. truthound_dashboard/core/streaming_anomaly.py +883 -0
  124. truthound_dashboard/core/triggers/__init__.py +45 -0
  125. truthound_dashboard/core/triggers/base.py +226 -0
  126. truthound_dashboard/core/triggers/evaluators.py +609 -0
  127. truthound_dashboard/core/triggers/factory.py +363 -0
  128. truthound_dashboard/core/unified_alerts.py +870 -0
  129. truthound_dashboard/core/validation_limits.py +509 -0
  130. truthound_dashboard/core/versioning.py +709 -0
  131. truthound_dashboard/core/websocket/__init__.py +59 -0
  132. truthound_dashboard/core/websocket/manager.py +512 -0
  133. truthound_dashboard/core/websocket/messages.py +130 -0
  134. truthound_dashboard/db/__init__.py +30 -0
  135. truthound_dashboard/db/models.py +3375 -3
  136. truthound_dashboard/main.py +22 -0
  137. truthound_dashboard/schemas/__init__.py +396 -1
  138. truthound_dashboard/schemas/anomaly.py +1258 -0
  139. truthound_dashboard/schemas/base.py +4 -0
  140. truthound_dashboard/schemas/cross_alerts.py +334 -0
  141. truthound_dashboard/schemas/drift_monitor.py +890 -0
  142. truthound_dashboard/schemas/lineage.py +428 -0
  143. truthound_dashboard/schemas/maintenance.py +154 -0
  144. truthound_dashboard/schemas/model_monitoring.py +374 -0
  145. truthound_dashboard/schemas/notifications_advanced.py +1363 -0
  146. truthound_dashboard/schemas/openlineage.py +704 -0
  147. truthound_dashboard/schemas/plugins.py +1293 -0
  148. truthound_dashboard/schemas/profile.py +420 -34
  149. truthound_dashboard/schemas/profile_comparison.py +242 -0
  150. truthound_dashboard/schemas/reports.py +285 -0
  151. truthound_dashboard/schemas/rule_suggestion.py +434 -0
  152. truthound_dashboard/schemas/schema_evolution.py +164 -0
  153. truthound_dashboard/schemas/source.py +117 -2
  154. truthound_dashboard/schemas/triggers.py +511 -0
  155. truthound_dashboard/schemas/unified_alerts.py +223 -0
  156. truthound_dashboard/schemas/validation.py +25 -1
  157. truthound_dashboard/schemas/validators/__init__.py +11 -0
  158. truthound_dashboard/schemas/validators/base.py +151 -0
  159. truthound_dashboard/schemas/versioning.py +152 -0
  160. truthound_dashboard/static/index.html +2 -2
  161. {truthound_dashboard-1.3.1.dist-info → truthound_dashboard-1.4.1.dist-info}/METADATA +147 -23
  162. truthound_dashboard-1.4.1.dist-info/RECORD +239 -0
  163. truthound_dashboard/static/assets/index-BZG20KuF.js +0 -586
  164. truthound_dashboard/static/assets/index-D_HyZ3pb.css +0 -1
  165. truthound_dashboard/static/assets/unmerged_dictionaries-CtpqQBm0.js +0 -1
  166. truthound_dashboard-1.3.1.dist-info/RECORD +0 -110
  167. {truthound_dashboard-1.3.1.dist-info → truthound_dashboard-1.4.1.dist-info}/WHEEL +0 -0
  168. {truthound_dashboard-1.3.1.dist-info → truthound_dashboard-1.4.1.dist-info}/entry_points.txt +0 -0
  169. {truthound_dashboard-1.3.1.dist-info → truthound_dashboard-1.4.1.dist-info}/licenses/LICENSE +0 -0
@@ -5,54 +5,341 @@ This module defines schemas for data profiling API operations.
5
5
 
6
6
  from __future__ import annotations
7
7
 
8
- from typing import Any
8
+ from enum import Enum
9
+ from typing import Any, Literal
9
10
 
10
11
  from pydantic import Field
11
12
 
12
13
  from .base import BaseSchema
13
14
 
14
15
 
16
+ # =============================================================================
17
+ # Sampling Strategy Enums and Types
18
+ # =============================================================================
19
+
20
+
21
+ class SamplingStrategy(str, Enum):
22
+ """Sampling strategies for data profiling.
23
+
24
+ Supports 8+ strategies from truthound profiler:
25
+ - NONE: Profile all data (for small datasets < 100K rows)
26
+ - HEAD: First N rows (for quick previews)
27
+ - RANDOM: Random sampling (general purpose)
28
+ - SYSTEMATIC: Every Nth row (for ordered data)
29
+ - STRATIFIED: Maintain distribution across categories
30
+ - RESERVOIR: Streaming-friendly sampling
31
+ - ADAPTIVE: Auto-select based on data characteristics (default)
32
+ - HASH: Deterministic sampling for reproducibility
33
+ """
34
+
35
+ NONE = "none"
36
+ HEAD = "head"
37
+ RANDOM = "random"
38
+ SYSTEMATIC = "systematic"
39
+ STRATIFIED = "stratified"
40
+ RESERVOIR = "reservoir"
41
+ ADAPTIVE = "adaptive"
42
+ HASH = "hash"
43
+
44
+
45
+ # Literal type for API validation
46
+ SamplingStrategyType = Literal[
47
+ "none", "head", "random", "systematic", "stratified", "reservoir", "adaptive", "hash"
48
+ ]
49
+
50
+
51
+ class SamplingConfig(BaseSchema):
52
+ """Advanced sampling configuration for profiling.
53
+
54
+ Provides fine-grained control over sampling behavior for large datasets.
55
+ """
56
+
57
+ strategy: SamplingStrategyType = Field(
58
+ default="adaptive",
59
+ description="Sampling strategy to use. 'adaptive' auto-selects based on data size.",
60
+ )
61
+ sample_size: int | None = Field(
62
+ default=None,
63
+ ge=100,
64
+ description="Target sample size. If None, auto-estimated based on confidence level.",
65
+ )
66
+ confidence_level: float = Field(
67
+ default=0.95,
68
+ ge=0.80,
69
+ le=0.99,
70
+ description="Statistical confidence level for sample size estimation (0.80-0.99).",
71
+ )
72
+ margin_of_error: float = Field(
73
+ default=0.03,
74
+ ge=0.01,
75
+ le=0.10,
76
+ description="Acceptable margin of error for statistical estimates (0.01-0.10).",
77
+ )
78
+ strata_column: str | None = Field(
79
+ default=None,
80
+ description="Column for stratified sampling to maintain distribution.",
81
+ )
82
+ seed: int | None = Field(
83
+ default=None,
84
+ description="Random seed for reproducible sampling results.",
85
+ )
86
+
87
+
88
+ # =============================================================================
89
+ # Pattern Detection Configuration
90
+ # =============================================================================
91
+
92
+
93
+ class PatternType(str, Enum):
94
+ """Supported data pattern types for detection."""
95
+
96
+ EMAIL = "email"
97
+ PHONE = "phone"
98
+ UUID = "uuid"
99
+ URL = "url"
100
+ IP_ADDRESS = "ip_address"
101
+ CREDIT_CARD = "credit_card"
102
+ DATE = "date"
103
+ DATETIME = "datetime"
104
+ KOREAN_RRN = "korean_rrn"
105
+ KOREAN_PHONE = "korean_phone"
106
+ SSN = "ssn"
107
+ POSTAL_CODE = "postal_code"
108
+ CURRENCY = "currency"
109
+ PERCENTAGE = "percentage"
110
+ CUSTOM = "custom"
111
+
112
+
113
+ class PatternDetectionConfig(BaseSchema):
114
+ """Configuration for pattern detection during profiling.
115
+
116
+ Enables automatic detection of common data patterns like
117
+ emails, phone numbers, UUIDs, etc.
118
+ """
119
+
120
+ enabled: bool = Field(
121
+ default=True,
122
+ description="Enable pattern detection during profiling.",
123
+ )
124
+ sample_size: int = Field(
125
+ default=1000,
126
+ ge=100,
127
+ le=100000,
128
+ description="Number of values to sample for pattern detection.",
129
+ )
130
+ min_confidence: float = Field(
131
+ default=0.8,
132
+ ge=0.5,
133
+ le=1.0,
134
+ description="Minimum confidence threshold for pattern matches (0.5-1.0).",
135
+ )
136
+ patterns_to_detect: list[str] | None = Field(
137
+ default=None,
138
+ description="Specific patterns to detect. If None, detects all supported patterns.",
139
+ )
140
+
141
+
142
+ # =============================================================================
143
+ # Profile Request Schema (Enhanced)
144
+ # =============================================================================
145
+
146
+
15
147
  class ProfileRequest(BaseSchema):
16
148
  """Request schema for data profiling.
17
149
 
18
- Provides optional configuration for profiling operations.
19
- All fields are optional with sensible defaults.
150
+ Provides comprehensive configuration for profiling operations including
151
+ sampling strategies, pattern detection, and statistical analysis options.
20
152
  """
21
153
 
154
+ # Basic sampling (backward compatible)
22
155
  sample_size: int | None = Field(
23
156
  default=None,
24
157
  ge=1,
25
158
  description="Maximum number of rows to sample for profiling. "
26
- "If None, profiles all data. Useful for large datasets.",
159
+ "If None, profiles all data. For advanced sampling, use 'sampling' config.",
27
160
  examples=[10000, 50000, 100000],
28
161
  )
29
162
 
163
+ # Advanced sampling configuration
164
+ sampling: SamplingConfig | None = Field(
165
+ default=None,
166
+ description="Advanced sampling configuration. If provided, overrides sample_size.",
167
+ )
168
+
169
+ # Pattern detection configuration
170
+ pattern_detection: PatternDetectionConfig | None = Field(
171
+ default=None,
172
+ description="Pattern detection configuration. If None, uses default settings.",
173
+ )
174
+
175
+ # Additional profiling options
176
+ include_histograms: bool = Field(
177
+ default=True,
178
+ description="Include value distribution histograms in the profile.",
179
+ )
180
+ include_correlations: bool = Field(
181
+ default=False,
182
+ description="Include column correlation analysis (increases processing time).",
183
+ )
184
+ include_cardinality: bool = Field(
185
+ default=True,
186
+ description="Include cardinality estimates for high-cardinality columns.",
187
+ )
188
+
189
+
190
+ # =============================================================================
191
+ # Pattern Detection Results
192
+ # =============================================================================
193
+
194
+
195
+ class DetectedPattern(BaseSchema):
196
+ """A detected data pattern in a column."""
197
+
198
+ pattern_type: str = Field(
199
+ ...,
200
+ description="Type of pattern detected (email, phone, uuid, etc.)",
201
+ )
202
+ confidence: float = Field(
203
+ ...,
204
+ ge=0.0,
205
+ le=1.0,
206
+ description="Confidence score of the pattern match (0-1).",
207
+ )
208
+ match_count: int = Field(
209
+ ...,
210
+ ge=0,
211
+ description="Number of values matching this pattern.",
212
+ )
213
+ match_percentage: float = Field(
214
+ ...,
215
+ ge=0.0,
216
+ le=100.0,
217
+ description="Percentage of non-null values matching this pattern.",
218
+ )
219
+ sample_matches: list[str] | None = Field(
220
+ default=None,
221
+ description="Sample values matching this pattern (masked for sensitive data).",
222
+ )
223
+
224
+
225
+ class HistogramBucket(BaseSchema):
226
+ """A bucket in a value distribution histogram."""
227
+
228
+ bucket: str = Field(..., description="Bucket label (range or category)")
229
+ count: int = Field(..., ge=0, description="Count of values in this bucket")
230
+ percentage: float = Field(..., ge=0.0, le=100.0, description="Percentage of total")
231
+
232
+
233
+ # =============================================================================
234
+ # Column Profile Schema (Enhanced)
235
+ # =============================================================================
236
+
30
237
 
31
238
  class ColumnProfile(BaseSchema):
32
- """Profile information for a single column."""
239
+ """Profile information for a single column.
240
+
241
+ Includes basic statistics, pattern detection results, and distribution data.
242
+ """
33
243
 
244
+ # Basic identification
34
245
  name: str = Field(..., description="Column name")
35
- dtype: str = Field(..., description="Data type")
246
+ dtype: str = Field(..., description="Physical data type (string, int64, float64, etc.)")
247
+
248
+ # Inferred semantic type (NEW)
249
+ inferred_type: str | None = Field(
250
+ default=None,
251
+ description="Inferred semantic type based on pattern detection "
252
+ "(email, phone, uuid, url, date, currency, etc.)",
253
+ )
254
+
255
+ # Completeness metrics
36
256
  null_pct: str = Field(default="0%", description="Percentage of null values")
257
+ null_count: int | None = Field(default=None, description="Count of null values")
258
+
259
+ # Uniqueness metrics
37
260
  unique_pct: str = Field(default="0%", description="Percentage of unique values")
261
+ distinct_count: int | None = Field(
262
+ default=None,
263
+ description="Count of distinct values",
264
+ )
265
+ is_unique: bool | None = Field(
266
+ default=None,
267
+ description="Whether all non-null values are unique",
268
+ )
269
+
270
+ # Value range (for numeric/date columns)
38
271
  min: Any | None = Field(default=None, description="Minimum value")
39
272
  max: Any | None = Field(default=None, description="Maximum value")
273
+
274
+ # Statistical measures (for numeric columns)
40
275
  mean: float | None = Field(default=None, description="Mean value (numeric columns)")
41
276
  std: float | None = Field(default=None, description="Standard deviation (numeric)")
277
+ median: float | None = Field(default=None, description="Median value (numeric)")
278
+ q1: float | None = Field(default=None, description="25th percentile (Q1)")
279
+ q3: float | None = Field(default=None, description="75th percentile (Q3)")
280
+ skewness: float | None = Field(default=None, description="Skewness of distribution")
281
+ kurtosis: float | None = Field(default=None, description="Kurtosis of distribution")
42
282
 
43
- # Additional statistics (optional)
44
- distinct_count: int | None = Field(
283
+ # String-specific metrics
284
+ min_length: int | None = Field(default=None, description="Minimum string length")
285
+ max_length: int | None = Field(default=None, description="Maximum string length")
286
+ avg_length: float | None = Field(default=None, description="Average string length")
287
+
288
+ # Pattern detection results (NEW)
289
+ patterns: list[DetectedPattern] | None = Field(
45
290
  default=None,
46
- description="Count of distinct values",
291
+ description="Detected data patterns (email, phone, uuid, etc.)",
292
+ )
293
+ primary_pattern: str | None = Field(
294
+ default=None,
295
+ description="The most prevalent detected pattern type",
47
296
  )
297
+
298
+ # Distribution data
48
299
  most_common: list[dict[str, Any]] | None = Field(
49
300
  default=None,
50
301
  description="Most common values with counts",
51
302
  )
303
+ histogram: list[HistogramBucket] | None = Field(
304
+ default=None,
305
+ description="Value distribution histogram",
306
+ )
307
+
308
+ # Cardinality estimate for high-cardinality columns
309
+ cardinality_estimate: int | None = Field(
310
+ default=None,
311
+ description="Estimated cardinality using HyperLogLog (for high-cardinality columns)",
312
+ )
313
+
314
+
315
+ # =============================================================================
316
+ # Sampling Metadata for Response
317
+ # =============================================================================
318
+
319
+
320
+ class SamplingMetadata(BaseSchema):
321
+ """Metadata about sampling used during profiling."""
322
+
323
+ strategy_used: str = Field(..., description="Sampling strategy that was applied")
324
+ sample_size: int = Field(..., description="Actual sample size used")
325
+ total_rows: int = Field(..., description="Total rows in the dataset")
326
+ sampling_ratio: float = Field(..., description="Ratio of sampled to total rows")
327
+ seed: int | None = Field(default=None, description="Random seed used (if applicable)")
328
+ confidence_level: float | None = Field(
329
+ default=None, description="Confidence level achieved"
330
+ )
331
+ margin_of_error: float | None = Field(
332
+ default=None, description="Estimated margin of error"
333
+ )
334
+
335
+
336
+ # =============================================================================
337
+ # Profile Response Schema (Enhanced)
338
+ # =============================================================================
52
339
 
53
340
 
54
341
  class ProfileResponse(BaseSchema):
55
- """Data profiling response."""
342
+ """Data profiling response with enhanced statistics and pattern detection."""
56
343
 
57
344
  source: str = Field(..., description="Source path/identifier")
58
345
  row_count: int = Field(..., ge=0, description="Total number of rows")
@@ -63,6 +350,28 @@ class ProfileResponse(BaseSchema):
63
350
  description="Profile for each column",
64
351
  )
65
352
 
353
+ # Sampling metadata (NEW)
354
+ sampling: SamplingMetadata | None = Field(
355
+ default=None,
356
+ description="Information about sampling applied during profiling",
357
+ )
358
+
359
+ # Pattern detection summary (NEW)
360
+ detected_patterns_summary: dict[str, int] | None = Field(
361
+ default=None,
362
+ description="Summary of detected patterns across all columns {pattern_type: count}",
363
+ )
364
+
365
+ # Profiling metadata (NEW)
366
+ profiled_at: str | None = Field(
367
+ default=None,
368
+ description="ISO timestamp when profiling was performed",
369
+ )
370
+ profiling_duration_ms: int | None = Field(
371
+ default=None,
372
+ description="Time taken to profile in milliseconds",
373
+ )
374
+
66
375
  # Computed properties
67
376
  @property
68
377
  def size_human(self) -> str:
@@ -74,6 +383,70 @@ class ProfileResponse(BaseSchema):
74
383
  size /= 1024
75
384
  return f"{size:.1f} PB"
76
385
 
386
+ @classmethod
387
+ def _build_column_profile(cls, col: dict[str, Any]) -> ColumnProfile:
388
+ """Build a ColumnProfile from column data dict.
389
+
390
+ Args:
391
+ col: Column data dictionary from adapter or database.
392
+
393
+ Returns:
394
+ ColumnProfile instance with all available fields.
395
+ """
396
+ # Build patterns list if present
397
+ patterns = None
398
+ if col.get("patterns"):
399
+ patterns = [
400
+ DetectedPattern(
401
+ pattern_type=p.get("pattern_type", p.get("type", "unknown")),
402
+ confidence=p.get("confidence", 0.0),
403
+ match_count=p.get("match_count", 0),
404
+ match_percentage=p.get("match_percentage", 0.0),
405
+ sample_matches=p.get("sample_matches"),
406
+ )
407
+ for p in col["patterns"]
408
+ ]
409
+
410
+ # Build histogram if present
411
+ histogram = None
412
+ if col.get("histogram"):
413
+ histogram = [
414
+ HistogramBucket(
415
+ bucket=h.get("bucket", ""),
416
+ count=h.get("count", 0),
417
+ percentage=h.get("percentage", 0.0),
418
+ )
419
+ for h in col["histogram"]
420
+ ]
421
+
422
+ return ColumnProfile(
423
+ name=col["name"],
424
+ dtype=col["dtype"],
425
+ inferred_type=col.get("inferred_type"),
426
+ null_pct=col.get("null_pct", "0%"),
427
+ null_count=col.get("null_count"),
428
+ unique_pct=col.get("unique_pct", "0%"),
429
+ distinct_count=col.get("distinct_count"),
430
+ is_unique=col.get("is_unique"),
431
+ min=col.get("min"),
432
+ max=col.get("max"),
433
+ mean=col.get("mean"),
434
+ std=col.get("std"),
435
+ median=col.get("median"),
436
+ q1=col.get("q1"),
437
+ q3=col.get("q3"),
438
+ skewness=col.get("skewness"),
439
+ kurtosis=col.get("kurtosis"),
440
+ min_length=col.get("min_length"),
441
+ max_length=col.get("max_length"),
442
+ avg_length=col.get("avg_length"),
443
+ patterns=patterns,
444
+ primary_pattern=col.get("primary_pattern"),
445
+ most_common=col.get("most_common"),
446
+ histogram=histogram,
447
+ cardinality_estimate=col.get("cardinality_estimate"),
448
+ )
449
+
77
450
  @classmethod
78
451
  def from_result(cls, result: Any) -> ProfileResponse:
79
452
  """Create response from adapter result or Profile model.
@@ -89,41 +462,50 @@ class ProfileResponse(BaseSchema):
89
462
  profile_json = result.profile_json
90
463
  source_name = profile_json.get("source", result.source_id)
91
464
  columns_data = profile_json.get("columns", [])
92
- columns = [
93
- ColumnProfile(
94
- name=col["name"],
95
- dtype=col["dtype"],
96
- null_pct=col.get("null_pct", "0%"),
97
- unique_pct=col.get("unique_pct", "0%"),
98
- min=col.get("min"),
99
- max=col.get("max"),
100
- mean=col.get("mean"),
101
- std=col.get("std"),
465
+ columns = [cls._build_column_profile(col) for col in columns_data]
466
+
467
+ # Build sampling metadata if present
468
+ sampling = None
469
+ if profile_json.get("sampling"):
470
+ s = profile_json["sampling"]
471
+ sampling = SamplingMetadata(
472
+ strategy_used=s.get("strategy_used", "none"),
473
+ sample_size=s.get("sample_size", result.row_count or 0),
474
+ total_rows=s.get("total_rows", result.row_count or 0),
475
+ sampling_ratio=s.get("sampling_ratio", 1.0),
476
+ seed=s.get("seed"),
477
+ confidence_level=s.get("confidence_level"),
478
+ margin_of_error=s.get("margin_of_error"),
102
479
  )
103
- for col in columns_data
104
- ]
480
+
105
481
  return cls(
106
482
  source=source_name,
107
483
  row_count=result.row_count or 0,
108
484
  column_count=result.column_count or 0,
109
485
  size_bytes=result.size_bytes or 0,
110
486
  columns=columns,
487
+ sampling=sampling,
488
+ detected_patterns_summary=profile_json.get("detected_patterns_summary"),
489
+ profiled_at=profile_json.get("profiled_at"),
490
+ profiling_duration_ms=profile_json.get("profiling_duration_ms"),
111
491
  )
112
492
 
113
493
  # Handle ProfileResult (from adapter)
114
- columns = [
115
- ColumnProfile(
116
- name=col["name"],
117
- dtype=col["dtype"],
118
- null_pct=col.get("null_pct", "0%"),
119
- unique_pct=col.get("unique_pct", "0%"),
120
- min=col.get("min"),
121
- max=col.get("max"),
122
- mean=col.get("mean"),
123
- std=col.get("std"),
494
+ columns = [cls._build_column_profile(col) for col in result.columns]
495
+
496
+ # Build sampling metadata if present
497
+ sampling = None
498
+ if hasattr(result, "sampling") and result.sampling:
499
+ s = result.sampling
500
+ sampling = SamplingMetadata(
501
+ strategy_used=getattr(s, "strategy_used", "none"),
502
+ sample_size=getattr(s, "sample_size", result.row_count),
503
+ total_rows=getattr(s, "total_rows", result.row_count),
504
+ sampling_ratio=getattr(s, "sampling_ratio", 1.0),
505
+ seed=getattr(s, "seed", None),
506
+ confidence_level=getattr(s, "confidence_level", None),
507
+ margin_of_error=getattr(s, "margin_of_error", None),
124
508
  )
125
- for col in result.columns
126
- ]
127
509
 
128
510
  return cls(
129
511
  source=result.source,
@@ -131,4 +513,8 @@ class ProfileResponse(BaseSchema):
131
513
  column_count=result.column_count,
132
514
  size_bytes=result.size_bytes,
133
515
  columns=columns,
516
+ sampling=sampling,
517
+ detected_patterns_summary=getattr(result, "detected_patterns_summary", None),
518
+ profiled_at=getattr(result, "profiled_at", None),
519
+ profiling_duration_ms=getattr(result, "profiling_duration_ms", None),
134
520
  )