truthound-dashboard 1.4.3__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (205)
  1. truthound_dashboard/api/alerts.py +75 -86
  2. truthound_dashboard/api/anomaly.py +7 -13
  3. truthound_dashboard/api/cross_alerts.py +38 -52
  4. truthound_dashboard/api/drift.py +49 -59
  5. truthound_dashboard/api/drift_monitor.py +234 -79
  6. truthound_dashboard/api/enterprise_sampling.py +498 -0
  7. truthound_dashboard/api/history.py +57 -5
  8. truthound_dashboard/api/lineage.py +3 -48
  9. truthound_dashboard/api/maintenance.py +104 -49
  10. truthound_dashboard/api/mask.py +1 -2
  11. truthound_dashboard/api/middleware.py +2 -1
  12. truthound_dashboard/api/model_monitoring.py +435 -311
  13. truthound_dashboard/api/notifications.py +227 -191
  14. truthound_dashboard/api/notifications_advanced.py +21 -20
  15. truthound_dashboard/api/observability.py +586 -0
  16. truthound_dashboard/api/plugins.py +2 -433
  17. truthound_dashboard/api/profile.py +199 -37
  18. truthound_dashboard/api/quality_reporter.py +701 -0
  19. truthound_dashboard/api/reports.py +7 -16
  20. truthound_dashboard/api/router.py +66 -0
  21. truthound_dashboard/api/rule_suggestions.py +5 -5
  22. truthound_dashboard/api/scan.py +17 -19
  23. truthound_dashboard/api/schedules.py +85 -50
  24. truthound_dashboard/api/schema_evolution.py +6 -6
  25. truthound_dashboard/api/schema_watcher.py +667 -0
  26. truthound_dashboard/api/sources.py +98 -27
  27. truthound_dashboard/api/tiering.py +1323 -0
  28. truthound_dashboard/api/triggers.py +14 -11
  29. truthound_dashboard/api/validations.py +12 -11
  30. truthound_dashboard/api/versioning.py +1 -6
  31. truthound_dashboard/core/__init__.py +129 -3
  32. truthound_dashboard/core/actions/__init__.py +62 -0
  33. truthound_dashboard/core/actions/custom.py +426 -0
  34. truthound_dashboard/core/actions/notifications.py +910 -0
  35. truthound_dashboard/core/actions/storage.py +472 -0
  36. truthound_dashboard/core/actions/webhook.py +281 -0
  37. truthound_dashboard/core/anomaly.py +262 -67
  38. truthound_dashboard/core/anomaly_explainer.py +4 -3
  39. truthound_dashboard/core/backends/__init__.py +67 -0
  40. truthound_dashboard/core/backends/base.py +299 -0
  41. truthound_dashboard/core/backends/errors.py +191 -0
  42. truthound_dashboard/core/backends/factory.py +423 -0
  43. truthound_dashboard/core/backends/mock_backend.py +451 -0
  44. truthound_dashboard/core/backends/truthound_backend.py +718 -0
  45. truthound_dashboard/core/checkpoint/__init__.py +87 -0
  46. truthound_dashboard/core/checkpoint/adapters.py +814 -0
  47. truthound_dashboard/core/checkpoint/checkpoint.py +491 -0
  48. truthound_dashboard/core/checkpoint/runner.py +270 -0
  49. truthound_dashboard/core/connections.py +437 -10
  50. truthound_dashboard/core/converters/__init__.py +14 -0
  51. truthound_dashboard/core/converters/truthound.py +620 -0
  52. truthound_dashboard/core/cross_alerts.py +540 -320
  53. truthound_dashboard/core/datasource_factory.py +1672 -0
  54. truthound_dashboard/core/drift_monitor.py +216 -20
  55. truthound_dashboard/core/enterprise_sampling.py +1291 -0
  56. truthound_dashboard/core/interfaces/__init__.py +225 -0
  57. truthound_dashboard/core/interfaces/actions.py +652 -0
  58. truthound_dashboard/core/interfaces/base.py +247 -0
  59. truthound_dashboard/core/interfaces/checkpoint.py +676 -0
  60. truthound_dashboard/core/interfaces/protocols.py +664 -0
  61. truthound_dashboard/core/interfaces/reporters.py +650 -0
  62. truthound_dashboard/core/interfaces/routing.py +646 -0
  63. truthound_dashboard/core/interfaces/triggers.py +619 -0
  64. truthound_dashboard/core/lineage.py +407 -71
  65. truthound_dashboard/core/model_monitoring.py +431 -3
  66. truthound_dashboard/core/notifications/base.py +4 -0
  67. truthound_dashboard/core/notifications/channels.py +501 -1203
  68. truthound_dashboard/core/notifications/deduplication/__init__.py +81 -115
  69. truthound_dashboard/core/notifications/deduplication/service.py +131 -348
  70. truthound_dashboard/core/notifications/dispatcher.py +202 -11
  71. truthound_dashboard/core/notifications/escalation/__init__.py +119 -106
  72. truthound_dashboard/core/notifications/escalation/engine.py +168 -358
  73. truthound_dashboard/core/notifications/routing/__init__.py +88 -128
  74. truthound_dashboard/core/notifications/routing/engine.py +90 -317
  75. truthound_dashboard/core/notifications/stats_aggregator.py +246 -1
  76. truthound_dashboard/core/notifications/throttling/__init__.py +67 -50
  77. truthound_dashboard/core/notifications/throttling/builder.py +117 -255
  78. truthound_dashboard/core/notifications/truthound_adapter.py +842 -0
  79. truthound_dashboard/core/phase5/collaboration.py +1 -1
  80. truthound_dashboard/core/plugins/lifecycle/__init__.py +0 -13
  81. truthound_dashboard/core/quality_reporter.py +1359 -0
  82. truthound_dashboard/core/report_history.py +0 -6
  83. truthound_dashboard/core/reporters/__init__.py +175 -14
  84. truthound_dashboard/core/reporters/adapters.py +943 -0
  85. truthound_dashboard/core/reporters/base.py +0 -3
  86. truthound_dashboard/core/reporters/builtin/__init__.py +18 -0
  87. truthound_dashboard/core/reporters/builtin/csv_reporter.py +111 -0
  88. truthound_dashboard/core/reporters/builtin/html_reporter.py +270 -0
  89. truthound_dashboard/core/reporters/builtin/json_reporter.py +127 -0
  90. truthound_dashboard/core/reporters/compat.py +266 -0
  91. truthound_dashboard/core/reporters/csv_reporter.py +2 -35
  92. truthound_dashboard/core/reporters/factory.py +526 -0
  93. truthound_dashboard/core/reporters/interfaces.py +745 -0
  94. truthound_dashboard/core/reporters/registry.py +1 -10
  95. truthound_dashboard/core/scheduler.py +165 -0
  96. truthound_dashboard/core/schema_evolution.py +3 -3
  97. truthound_dashboard/core/schema_watcher.py +1528 -0
  98. truthound_dashboard/core/services.py +595 -76
  99. truthound_dashboard/core/store_manager.py +810 -0
  100. truthound_dashboard/core/streaming_anomaly.py +169 -4
  101. truthound_dashboard/core/tiering.py +1309 -0
  102. truthound_dashboard/core/triggers/evaluators.py +178 -8
  103. truthound_dashboard/core/truthound_adapter.py +2620 -197
  104. truthound_dashboard/core/unified_alerts.py +23 -20
  105. truthound_dashboard/db/__init__.py +8 -0
  106. truthound_dashboard/db/database.py +8 -2
  107. truthound_dashboard/db/models.py +944 -25
  108. truthound_dashboard/db/repository.py +2 -0
  109. truthound_dashboard/main.py +11 -0
  110. truthound_dashboard/schemas/__init__.py +177 -16
  111. truthound_dashboard/schemas/base.py +44 -23
  112. truthound_dashboard/schemas/collaboration.py +19 -6
  113. truthound_dashboard/schemas/cross_alerts.py +19 -3
  114. truthound_dashboard/schemas/drift.py +61 -55
  115. truthound_dashboard/schemas/drift_monitor.py +67 -23
  116. truthound_dashboard/schemas/enterprise_sampling.py +653 -0
  117. truthound_dashboard/schemas/lineage.py +0 -33
  118. truthound_dashboard/schemas/mask.py +10 -8
  119. truthound_dashboard/schemas/model_monitoring.py +89 -10
  120. truthound_dashboard/schemas/notifications_advanced.py +13 -0
  121. truthound_dashboard/schemas/observability.py +453 -0
  122. truthound_dashboard/schemas/plugins.py +0 -280
  123. truthound_dashboard/schemas/profile.py +154 -247
  124. truthound_dashboard/schemas/quality_reporter.py +403 -0
  125. truthound_dashboard/schemas/reports.py +2 -2
  126. truthound_dashboard/schemas/rule_suggestion.py +8 -1
  127. truthound_dashboard/schemas/scan.py +4 -24
  128. truthound_dashboard/schemas/schedule.py +11 -3
  129. truthound_dashboard/schemas/schema_watcher.py +727 -0
  130. truthound_dashboard/schemas/source.py +17 -2
  131. truthound_dashboard/schemas/tiering.py +822 -0
  132. truthound_dashboard/schemas/triggers.py +16 -0
  133. truthound_dashboard/schemas/unified_alerts.py +7 -0
  134. truthound_dashboard/schemas/validation.py +0 -13
  135. truthound_dashboard/schemas/validators/base.py +41 -21
  136. truthound_dashboard/schemas/validators/business_rule_validators.py +244 -0
  137. truthound_dashboard/schemas/validators/localization_validators.py +273 -0
  138. truthound_dashboard/schemas/validators/ml_feature_validators.py +308 -0
  139. truthound_dashboard/schemas/validators/profiling_validators.py +275 -0
  140. truthound_dashboard/schemas/validators/referential_validators.py +312 -0
  141. truthound_dashboard/schemas/validators/registry.py +93 -8
  142. truthound_dashboard/schemas/validators/timeseries_validators.py +389 -0
  143. truthound_dashboard/schemas/versioning.py +1 -6
  144. truthound_dashboard/static/index.html +2 -2
  145. truthound_dashboard-1.5.0.dist-info/METADATA +309 -0
  146. {truthound_dashboard-1.4.3.dist-info → truthound_dashboard-1.5.0.dist-info}/RECORD +149 -148
  147. truthound_dashboard/core/plugins/hooks/__init__.py +0 -63
  148. truthound_dashboard/core/plugins/hooks/decorators.py +0 -367
  149. truthound_dashboard/core/plugins/hooks/manager.py +0 -403
  150. truthound_dashboard/core/plugins/hooks/protocols.py +0 -265
  151. truthound_dashboard/core/plugins/lifecycle/hot_reload.py +0 -584
  152. truthound_dashboard/core/reporters/junit_reporter.py +0 -233
  153. truthound_dashboard/core/reporters/markdown_reporter.py +0 -207
  154. truthound_dashboard/core/reporters/pdf_reporter.py +0 -209
  155. truthound_dashboard/static/assets/_baseUniq-BcrSP13d.js +0 -1
  156. truthound_dashboard/static/assets/arc-DlYjKwIL.js +0 -1
  157. truthound_dashboard/static/assets/architectureDiagram-VXUJARFQ-Bb2drbQM.js +0 -36
  158. truthound_dashboard/static/assets/blockDiagram-VD42YOAC-BlsPG1CH.js +0 -122
  159. truthound_dashboard/static/assets/c4Diagram-YG6GDRKO-B9JdUoaC.js +0 -10
  160. truthound_dashboard/static/assets/channel-Q6mHF1Hd.js +0 -1
  161. truthound_dashboard/static/assets/chunk-4BX2VUAB-DmyoPVuJ.js +0 -1
  162. truthound_dashboard/static/assets/chunk-55IACEB6-Bcz6Siv8.js +0 -1
  163. truthound_dashboard/static/assets/chunk-B4BG7PRW-Br3G5Rum.js +0 -165
  164. truthound_dashboard/static/assets/chunk-DI55MBZ5-DuM9c23u.js +0 -220
  165. truthound_dashboard/static/assets/chunk-FMBD7UC4-DNU-5mvT.js +0 -15
  166. truthound_dashboard/static/assets/chunk-QN33PNHL-Im2yNcmS.js +0 -1
  167. truthound_dashboard/static/assets/chunk-QZHKN3VN-kZr8XFm1.js +0 -1
  168. truthound_dashboard/static/assets/chunk-TZMSLE5B-Q__360q_.js +0 -1
  169. truthound_dashboard/static/assets/classDiagram-2ON5EDUG-vtixxUyK.js +0 -1
  170. truthound_dashboard/static/assets/classDiagram-v2-WZHVMYZB-vtixxUyK.js +0 -1
  171. truthound_dashboard/static/assets/clone-BOt2LwD0.js +0 -1
  172. truthound_dashboard/static/assets/cose-bilkent-S5V4N54A-CBDw6iac.js +0 -1
  173. truthound_dashboard/static/assets/dagre-6UL2VRFP-XdKqmmY9.js +0 -4
  174. truthound_dashboard/static/assets/diagram-PSM6KHXK-DAZ8nx9V.js +0 -24
  175. truthound_dashboard/static/assets/diagram-QEK2KX5R-BRvDTbGD.js +0 -43
  176. truthound_dashboard/static/assets/diagram-S2PKOQOG-bQcczUkl.js +0 -24
  177. truthound_dashboard/static/assets/erDiagram-Q2GNP2WA-DPje7VMN.js +0 -60
  178. truthound_dashboard/static/assets/flowDiagram-NV44I4VS-B7BVtFVS.js +0 -162
  179. truthound_dashboard/static/assets/ganttDiagram-JELNMOA3-D6WKSS7U.js +0 -267
  180. truthound_dashboard/static/assets/gitGraphDiagram-NY62KEGX-D3vtVd3y.js +0 -65
  181. truthound_dashboard/static/assets/graph-BKgNKZVp.js +0 -1
  182. truthound_dashboard/static/assets/index-C6JSrkHo.css +0 -1
  183. truthound_dashboard/static/assets/index-DkU82VsU.js +0 -1800
  184. truthound_dashboard/static/assets/infoDiagram-WHAUD3N6-DnNCT429.js +0 -2
  185. truthound_dashboard/static/assets/journeyDiagram-XKPGCS4Q-DGiMozqS.js +0 -139
  186. truthound_dashboard/static/assets/kanban-definition-3W4ZIXB7-BV2gUgli.js +0 -89
  187. truthound_dashboard/static/assets/katex-Cu_Erd72.js +0 -261
  188. truthound_dashboard/static/assets/layout-DI2MfQ5G.js +0 -1
  189. truthound_dashboard/static/assets/min-DYdgXVcT.js +0 -1
  190. truthound_dashboard/static/assets/mindmap-definition-VGOIOE7T-C7x4ruxz.js +0 -68
  191. truthound_dashboard/static/assets/pieDiagram-ADFJNKIX-CAJaAB9f.js +0 -30
  192. truthound_dashboard/static/assets/quadrantDiagram-AYHSOK5B-DeqwDI46.js +0 -7
  193. truthound_dashboard/static/assets/requirementDiagram-UZGBJVZJ-e3XDpZIM.js +0 -64
  194. truthound_dashboard/static/assets/sankeyDiagram-TZEHDZUN-CNnAv5Ux.js +0 -10
  195. truthound_dashboard/static/assets/sequenceDiagram-WL72ISMW-Dsne-Of3.js +0 -145
  196. truthound_dashboard/static/assets/stateDiagram-FKZM4ZOC-Ee0sQXyb.js +0 -1
  197. truthound_dashboard/static/assets/stateDiagram-v2-4FDKWEC3-B26KqW_W.js +0 -1
  198. truthound_dashboard/static/assets/timeline-definition-IT6M3QCI-DZYi2yl3.js +0 -61
  199. truthound_dashboard/static/assets/treemap-KMMF4GRG-CY3f8In2.js +0 -128
  200. truthound_dashboard/static/assets/unmerged_dictionaries-Dd7xcPWG.js +0 -1
  201. truthound_dashboard/static/assets/xychartDiagram-PRI3JC2R-CS7fydZZ.js +0 -7
  202. truthound_dashboard-1.4.3.dist-info/METADATA +0 -505
  203. {truthound_dashboard-1.4.3.dist-info → truthound_dashboard-1.5.0.dist-info}/WHEEL +0 -0
  204. {truthound_dashboard-1.4.3.dist-info → truthound_dashboard-1.5.0.dist-info}/entry_points.txt +0 -0
  205. {truthound_dashboard-1.4.3.dist-info → truthound_dashboard-1.5.0.dist-info}/licenses/LICENSE +0 -0
@@ -1,12 +1,15 @@
1
1
  """Profile-related Pydantic schemas.
2
2
 
3
3
  This module defines schemas for data profiling API operations.
4
+
5
+ Note: truthound's th.profile() only supports (data, source) parameters.
6
+ Advanced options like sampling strategies, pattern detection configuration,
7
+ and correlation analysis are NOT supported by the underlying library.
4
8
  """
5
9
 
6
10
  from __future__ import annotations
7
11
 
8
- from enum import Enum
9
- from typing import Any, Literal
12
+ from typing import Any
10
13
 
11
14
  from pydantic import Field
12
15
 
@@ -14,176 +17,79 @@ from .base import BaseSchema
14
17
 
15
18
 
16
19
  # =============================================================================
17
- # Sampling Strategy Enums and Types
20
+ # Profile Request Schema (Simplified)
18
21
  # =============================================================================
19
22
 
20
23
 
21
- class SamplingStrategy(str, Enum):
22
- """Sampling strategies for data profiling.
24
+ class ProfileRequest(BaseSchema):
25
+ """Request schema for basic data profiling.
23
26
 
24
- Supports 8+ strategies from truthound profiler:
25
- - NONE: Profile all data (for small datasets < 100K rows)
26
- - HEAD: First N rows (for quick previews)
27
- - RANDOM: Random sampling (general purpose)
28
- - SYSTEMATIC: Every Nth row (for ordered data)
29
- - STRATIFIED: Maintain distribution across categories
30
- - RESERVOIR: Streaming-friendly sampling
31
- - ADAPTIVE: Auto-select based on data characteristics (default)
32
- - HASH: Deterministic sampling for reproducibility
27
+ Note: truthound's th.profile() does not support advanced configuration.
28
+ This schema exists for API compatibility but options are not used.
29
+ For advanced profiling with configuration, use ProfileAdvancedRequest.
33
30
  """
34
31
 
35
- NONE = "none"
36
- HEAD = "head"
37
- RANDOM = "random"
38
- SYSTEMATIC = "systematic"
39
- STRATIFIED = "stratified"
40
- RESERVOIR = "reservoir"
41
- ADAPTIVE = "adaptive"
42
- HASH = "hash"
43
-
32
+ pass
44
33
 
45
- # Literal type for API validation
46
- SamplingStrategyType = Literal[
47
- "none", "head", "random", "systematic", "stratified", "reservoir", "adaptive", "hash"
48
- ]
49
34
 
35
+ class ProfileAdvancedRequest(BaseSchema):
36
+ """Request schema for advanced data profiling with ProfilerConfig options.
50
37
 
51
- class SamplingConfig(BaseSchema):
52
- """Advanced sampling configuration for profiling.
53
-
54
- Provides fine-grained control over sampling behavior for large datasets.
38
+ This schema maps to truthound's ProfilerConfig for fine-grained control
39
+ over profiling behavior.
55
40
  """
56
41
 
57
- strategy: SamplingStrategyType = Field(
58
- default="adaptive",
59
- description="Sampling strategy to use. 'adaptive' auto-selects based on data size.",
60
- )
61
42
  sample_size: int | None = Field(
62
43
  default=None,
63
44
  ge=100,
64
- description="Target sample size. If None, auto-estimated based on confidence level.",
45
+ description="Maximum rows to sample (None for all rows)",
65
46
  )
66
- confidence_level: float = Field(
67
- default=0.95,
68
- ge=0.80,
69
- le=0.99,
70
- description="Statistical confidence level for sample size estimation (0.80-0.99).",
71
- )
72
- margin_of_error: float = Field(
73
- default=0.03,
74
- ge=0.01,
75
- le=0.10,
76
- description="Acceptable margin of error for statistical estimates (0.01-0.10).",
47
+ random_seed: int = Field(
48
+ default=42,
49
+ ge=0,
50
+ description="Random seed for reproducible sampling",
77
51
  )
78
- strata_column: str | None = Field(
79
- default=None,
80
- description="Column for stratified sampling to maintain distribution.",
52
+ include_patterns: bool = Field(
53
+ default=True,
54
+ description="Enable pattern detection (email, phone, uuid, etc.)",
81
55
  )
82
- seed: int | None = Field(
83
- default=None,
84
- description="Random seed for reproducible sampling results.",
56
+ include_correlations: bool = Field(
57
+ default=False,
58
+ description="Calculate column correlations (can be slow for many columns)",
85
59
  )
86
-
87
-
88
- # =============================================================================
89
- # Pattern Detection Configuration
90
- # =============================================================================
91
-
92
-
93
- class PatternType(str, Enum):
94
- """Supported data pattern types for detection."""
95
-
96
- EMAIL = "email"
97
- PHONE = "phone"
98
- UUID = "uuid"
99
- URL = "url"
100
- IP_ADDRESS = "ip_address"
101
- CREDIT_CARD = "credit_card"
102
- DATE = "date"
103
- DATETIME = "datetime"
104
- KOREAN_RRN = "korean_rrn"
105
- KOREAN_PHONE = "korean_phone"
106
- SSN = "ssn"
107
- POSTAL_CODE = "postal_code"
108
- CURRENCY = "currency"
109
- PERCENTAGE = "percentage"
110
- CUSTOM = "custom"
111
-
112
-
113
- class PatternDetectionConfig(BaseSchema):
114
- """Configuration for pattern detection during profiling.
115
-
116
- Enables automatic detection of common data patterns like
117
- emails, phone numbers, UUIDs, etc.
118
- """
119
-
120
- enabled: bool = Field(
60
+ include_distributions: bool = Field(
121
61
  default=True,
122
- description="Enable pattern detection during profiling.",
62
+ description="Include value distribution histograms",
123
63
  )
124
- sample_size: int = Field(
64
+ top_n_values: int = Field(
65
+ default=10,
66
+ ge=1,
67
+ le=100,
68
+ description="Number of top values to return per column",
69
+ )
70
+ pattern_sample_size: int = Field(
125
71
  default=1000,
126
72
  ge=100,
127
- le=100000,
128
- description="Number of values to sample for pattern detection.",
73
+ le=10000,
74
+ description="Sample size for pattern detection",
75
+ )
76
+ correlation_threshold: float = Field(
77
+ default=0.7,
78
+ ge=0.0,
79
+ le=1.0,
80
+ description="Minimum correlation to report",
129
81
  )
130
- min_confidence: float = Field(
82
+ min_pattern_match_ratio: float = Field(
131
83
  default=0.8,
132
84
  ge=0.5,
133
85
  le=1.0,
134
- description="Minimum confidence threshold for pattern matches (0.5-1.0).",
86
+ description="Minimum match ratio to consider a pattern detected",
135
87
  )
136
- patterns_to_detect: list[str] | None = Field(
137
- default=None,
138
- description="Specific patterns to detect. If None, detects all supported patterns.",
139
- )
140
-
141
-
142
- # =============================================================================
143
- # Profile Request Schema (Enhanced)
144
- # =============================================================================
145
-
146
-
147
- class ProfileRequest(BaseSchema):
148
- """Request schema for data profiling.
149
-
150
- Provides comprehensive configuration for profiling operations including
151
- sampling strategies, pattern detection, and statistical analysis options.
152
- """
153
-
154
- # Basic sampling (backward compatible)
155
- sample_size: int | None = Field(
156
- default=None,
88
+ n_jobs: int = Field(
89
+ default=1,
157
90
  ge=1,
158
- description="Maximum number of rows to sample for profiling. "
159
- "If None, profiles all data. For advanced sampling, use 'sampling' config.",
160
- examples=[10000, 50000, 100000],
161
- )
162
-
163
- # Advanced sampling configuration
164
- sampling: SamplingConfig | None = Field(
165
- default=None,
166
- description="Advanced sampling configuration. If provided, overrides sample_size.",
167
- )
168
-
169
- # Pattern detection configuration
170
- pattern_detection: PatternDetectionConfig | None = Field(
171
- default=None,
172
- description="Pattern detection configuration. If None, uses default settings.",
173
- )
174
-
175
- # Additional profiling options
176
- include_histograms: bool = Field(
177
- default=True,
178
- description="Include value distribution histograms in the profile.",
179
- )
180
- include_correlations: bool = Field(
181
- default=False,
182
- description="Include column correlation analysis (increases processing time).",
183
- )
184
- include_cardinality: bool = Field(
185
- default=True,
186
- description="Include cardinality estimates for high-cardinality columns.",
91
+ le=16,
92
+ description="Number of parallel jobs for profiling",
187
93
  )
188
94
 
189
95
 
@@ -231,21 +137,21 @@ class HistogramBucket(BaseSchema):
231
137
 
232
138
 
233
139
  # =============================================================================
234
- # Column Profile Schema (Enhanced)
140
+ # Column Profile Schema
235
141
  # =============================================================================
236
142
 
237
143
 
238
144
  class ColumnProfile(BaseSchema):
239
145
  """Profile information for a single column.
240
146
 
241
- Includes basic statistics, pattern detection results, and distribution data.
147
+ Includes basic statistics and distribution data.
242
148
  """
243
149
 
244
150
  # Basic identification
245
151
  name: str = Field(..., description="Column name")
246
152
  dtype: str = Field(..., description="Physical data type (string, int64, float64, etc.)")
247
153
 
248
- # Inferred semantic type (NEW)
154
+ # Inferred semantic type
249
155
  inferred_type: str | None = Field(
250
156
  default=None,
251
157
  description="Inferred semantic type based on pattern detection "
@@ -285,7 +191,7 @@ class ColumnProfile(BaseSchema):
285
191
  max_length: int | None = Field(default=None, description="Maximum string length")
286
192
  avg_length: float | None = Field(default=None, description="Average string length")
287
193
 
288
- # Pattern detection results (NEW)
194
+ # Pattern detection results
289
195
  patterns: list[DetectedPattern] | None = Field(
290
196
  default=None,
291
197
  description="Detected data patterns (email, phone, uuid, etc.)",
@@ -313,33 +219,12 @@ class ColumnProfile(BaseSchema):
313
219
 
314
220
 
315
221
  # =============================================================================
316
- # Sampling Metadata for Response
317
- # =============================================================================
318
-
319
-
320
- class SamplingMetadata(BaseSchema):
321
- """Metadata about sampling used during profiling."""
322
-
323
- strategy_used: str = Field(..., description="Sampling strategy that was applied")
324
- sample_size: int = Field(..., description="Actual sample size used")
325
- total_rows: int = Field(..., description="Total rows in the dataset")
326
- sampling_ratio: float = Field(..., description="Ratio of sampled to total rows")
327
- seed: int | None = Field(default=None, description="Random seed used (if applicable)")
328
- confidence_level: float | None = Field(
329
- default=None, description="Confidence level achieved"
330
- )
331
- margin_of_error: float | None = Field(
332
- default=None, description="Estimated margin of error"
333
- )
334
-
335
-
336
- # =============================================================================
337
- # Profile Response Schema (Enhanced)
222
+ # Profile Response Schema
338
223
  # =============================================================================
339
224
 
340
225
 
341
226
  class ProfileResponse(BaseSchema):
342
- """Data profiling response with enhanced statistics and pattern detection."""
227
+ """Data profiling response with statistics."""
343
228
 
344
229
  source: str = Field(..., description="Source path/identifier")
345
230
  row_count: int = Field(..., ge=0, description="Total number of rows")
@@ -350,19 +235,13 @@ class ProfileResponse(BaseSchema):
350
235
  description="Profile for each column",
351
236
  )
352
237
 
353
- # Sampling metadata (NEW)
354
- sampling: SamplingMetadata | None = Field(
355
- default=None,
356
- description="Information about sampling applied during profiling",
357
- )
358
-
359
- # Pattern detection summary (NEW)
238
+ # Pattern detection summary
360
239
  detected_patterns_summary: dict[str, int] | None = Field(
361
240
  default=None,
362
241
  description="Summary of detected patterns across all columns {pattern_type: count}",
363
242
  )
364
243
 
365
- # Profiling metadata (NEW)
244
+ # Profiling metadata
366
245
  profiled_at: str | None = Field(
367
246
  default=None,
368
247
  description="ISO timestamp when profiling was performed",
@@ -384,67 +263,125 @@ class ProfileResponse(BaseSchema):
384
263
  return f"{size:.1f} PB"
385
264
 
386
265
  @classmethod
387
- def _build_column_profile(cls, col: dict[str, Any]) -> ColumnProfile:
388
- """Build a ColumnProfile from column data dict.
266
+ def _build_column_profile(cls, col: dict[str, Any] | Any) -> ColumnProfile:
267
+ """Build a ColumnProfile from column data dict or ColumnProfileResult object.
389
268
 
390
269
  Args:
391
- col: Column data dictionary from adapter or database.
270
+ col: Column data dictionary from adapter or database, or ColumnProfileResult object.
392
271
 
393
272
  Returns:
394
273
  ColumnProfile instance with all available fields.
395
274
  """
275
+ # Helper function to get attribute from dict or object
276
+ def get_val(key: str, default: Any = None) -> Any:
277
+ if isinstance(col, dict):
278
+ return col.get(key, default)
279
+ return getattr(col, key, default)
280
+
396
281
  # Build patterns list if present
397
282
  patterns = None
398
- if col.get("patterns"):
283
+ patterns_data = get_val("patterns") or get_val("detected_patterns")
284
+ if patterns_data:
399
285
  patterns = [
400
286
  DetectedPattern(
401
- pattern_type=p.get("pattern_type", p.get("type", "unknown")),
402
- confidence=p.get("confidence", 0.0),
403
- match_count=p.get("match_count", 0),
404
- match_percentage=p.get("match_percentage", 0.0),
405
- sample_matches=p.get("sample_matches"),
287
+ pattern_type=p.get("pattern_type", p.get("type", p.get("pattern", "unknown")))
288
+ if isinstance(p, dict)
289
+ else getattr(p, "pattern_type", getattr(p, "pattern", "unknown")),
290
+ confidence=p.get("confidence", 0.0)
291
+ if isinstance(p, dict)
292
+ else getattr(p, "confidence", getattr(p, "match_ratio", 0.0)),
293
+ match_count=p.get("match_count", 0) if isinstance(p, dict) else getattr(p, "match_count", 0),
294
+ match_percentage=p.get("match_percentage", 0.0)
295
+ if isinstance(p, dict)
296
+ else getattr(p, "match_percentage", getattr(p, "match_ratio", 0.0) * 100),
297
+ sample_matches=p.get("sample_matches") if isinstance(p, dict) else getattr(p, "sample_matches", None),
406
298
  )
407
- for p in col["patterns"]
299
+ for p in patterns_data
408
300
  ]
409
301
 
410
302
  # Build histogram if present
411
303
  histogram = None
412
- if col.get("histogram"):
304
+ histogram_data = get_val("histogram")
305
+ if histogram_data:
413
306
  histogram = [
414
307
  HistogramBucket(
415
- bucket=h.get("bucket", ""),
416
- count=h.get("count", 0),
417
- percentage=h.get("percentage", 0.0),
308
+ bucket=h.get("bucket", "") if isinstance(h, dict) else getattr(h, "bucket", ""),
309
+ count=h.get("count", 0) if isinstance(h, dict) else getattr(h, "count", 0),
310
+ percentage=h.get("percentage", 0.0) if isinstance(h, dict) else getattr(h, "percentage", 0.0),
418
311
  )
419
- for h in col["histogram"]
312
+ for h in histogram_data
420
313
  ]
421
314
 
315
+ # Get dtype from dict or object (physical_type for ColumnProfileResult)
316
+ dtype = get_val("dtype") or get_val("physical_type") or "unknown"
317
+
318
+ # Get null_pct - format from ratio if needed
319
+ null_pct = get_val("null_pct", "0%")
320
+ if null_pct == "0%" and get_val("null_ratio") is not None:
321
+ null_ratio = get_val("null_ratio", 0.0)
322
+ null_pct = f"{null_ratio * 100:.1f}%"
323
+
324
+ # Get unique_pct - format from ratio if needed
325
+ unique_pct = get_val("unique_pct", "0%")
326
+ if unique_pct == "0%" and get_val("unique_ratio") is not None:
327
+ unique_ratio = get_val("unique_ratio", 0.0)
328
+ unique_pct = f"{unique_ratio * 100:.1f}%"
329
+
330
+ # Get distribution stats
331
+ distribution = get_val("distribution")
332
+ mean = get_val("mean")
333
+ std = get_val("std")
334
+ median = get_val("median")
335
+ q1 = get_val("q1")
336
+ q3 = get_val("q3")
337
+ skewness = get_val("skewness")
338
+ kurtosis = get_val("kurtosis")
339
+ min_val = get_val("min")
340
+ max_val = get_val("max")
341
+
342
+ # Extract from distribution dict if present
343
+ if distribution and isinstance(distribution, dict):
344
+ mean = mean or distribution.get("mean")
345
+ std = std or distribution.get("std")
346
+ median = median or distribution.get("median")
347
+ q1 = q1 or distribution.get("q1")
348
+ q3 = q3 or distribution.get("q3")
349
+ skewness = skewness or distribution.get("skewness")
350
+ kurtosis = kurtosis or distribution.get("kurtosis")
351
+ min_val = min_val or distribution.get("min")
352
+ max_val = max_val or distribution.get("max")
353
+
354
+ # Get most_common from top_values if needed
355
+ most_common = get_val("most_common")
356
+ if not most_common and get_val("top_values"):
357
+ most_common = get_val("top_values")
358
+
422
359
  return ColumnProfile(
423
- name=col["name"],
424
- dtype=col["dtype"],
425
- inferred_type=col.get("inferred_type"),
426
- null_pct=col.get("null_pct", "0%"),
427
- null_count=col.get("null_count"),
428
- unique_pct=col.get("unique_pct", "0%"),
429
- distinct_count=col.get("distinct_count"),
430
- is_unique=col.get("is_unique"),
431
- min=col.get("min"),
432
- max=col.get("max"),
433
- mean=col.get("mean"),
434
- std=col.get("std"),
435
- median=col.get("median"),
436
- q1=col.get("q1"),
437
- q3=col.get("q3"),
438
- skewness=col.get("skewness"),
439
- kurtosis=col.get("kurtosis"),
440
- min_length=col.get("min_length"),
441
- max_length=col.get("max_length"),
442
- avg_length=col.get("avg_length"),
360
+ name=get_val("name"),
361
+ dtype=dtype,
362
+ inferred_type=get_val("inferred_type"),
363
+ null_pct=null_pct,
364
+ null_count=get_val("null_count"),
365
+ unique_pct=unique_pct,
366
+ distinct_count=get_val("distinct_count"),
367
+ is_unique=get_val("is_unique"),
368
+ min=min_val,
369
+ max=max_val,
370
+ mean=mean,
371
+ std=std,
372
+ median=median,
373
+ q1=q1,
374
+ q3=q3,
375
+ skewness=skewness,
376
+ kurtosis=kurtosis,
377
+ min_length=get_val("min_length"),
378
+ max_length=get_val("max_length"),
379
+ avg_length=get_val("avg_length"),
443
380
  patterns=patterns,
444
- primary_pattern=col.get("primary_pattern"),
445
- most_common=col.get("most_common"),
381
+ primary_pattern=get_val("primary_pattern"),
382
+ most_common=most_common,
446
383
  histogram=histogram,
447
- cardinality_estimate=col.get("cardinality_estimate"),
384
+ cardinality_estimate=get_val("cardinality_estimate"),
448
385
  )
449
386
 
450
387
  @classmethod
@@ -464,27 +401,12 @@ class ProfileResponse(BaseSchema):
464
401
  columns_data = profile_json.get("columns", [])
465
402
  columns = [cls._build_column_profile(col) for col in columns_data]
466
403
 
467
- # Build sampling metadata if present
468
- sampling = None
469
- if profile_json.get("sampling"):
470
- s = profile_json["sampling"]
471
- sampling = SamplingMetadata(
472
- strategy_used=s.get("strategy_used", "none"),
473
- sample_size=s.get("sample_size", result.row_count or 0),
474
- total_rows=s.get("total_rows", result.row_count or 0),
475
- sampling_ratio=s.get("sampling_ratio", 1.0),
476
- seed=s.get("seed"),
477
- confidence_level=s.get("confidence_level"),
478
- margin_of_error=s.get("margin_of_error"),
479
- )
480
-
481
404
  return cls(
482
405
  source=source_name,
483
406
  row_count=result.row_count or 0,
484
407
  column_count=result.column_count or 0,
485
408
  size_bytes=result.size_bytes or 0,
486
409
  columns=columns,
487
- sampling=sampling,
488
410
  detected_patterns_summary=profile_json.get("detected_patterns_summary"),
489
411
  profiled_at=profile_json.get("profiled_at"),
490
412
  profiling_duration_ms=profile_json.get("profiling_duration_ms"),
@@ -493,27 +415,12 @@ class ProfileResponse(BaseSchema):
493
415
  # Handle ProfileResult (from adapter)
494
416
  columns = [cls._build_column_profile(col) for col in result.columns]
495
417
 
496
- # Build sampling metadata if present
497
- sampling = None
498
- if hasattr(result, "sampling") and result.sampling:
499
- s = result.sampling
500
- sampling = SamplingMetadata(
501
- strategy_used=getattr(s, "strategy_used", "none"),
502
- sample_size=getattr(s, "sample_size", result.row_count),
503
- total_rows=getattr(s, "total_rows", result.row_count),
504
- sampling_ratio=getattr(s, "sampling_ratio", 1.0),
505
- seed=getattr(s, "seed", None),
506
- confidence_level=getattr(s, "confidence_level", None),
507
- margin_of_error=getattr(s, "margin_of_error", None),
508
- )
509
-
510
418
  return cls(
511
419
  source=result.source,
512
420
  row_count=result.row_count,
513
421
  column_count=result.column_count,
514
422
  size_bytes=result.size_bytes,
515
423
  columns=columns,
516
- sampling=sampling,
517
424
  detected_patterns_summary=getattr(result, "detected_patterns_summary", None),
518
425
  profiled_at=getattr(result, "profiled_at", None),
519
426
  profiling_duration_ms=getattr(result, "profiling_duration_ms", None),