truthound-dashboard 1.3.1__py3-none-any.whl → 1.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- truthound_dashboard/api/alerts.py +258 -0
- truthound_dashboard/api/anomaly.py +1302 -0
- truthound_dashboard/api/cross_alerts.py +352 -0
- truthound_dashboard/api/deps.py +143 -0
- truthound_dashboard/api/drift_monitor.py +540 -0
- truthound_dashboard/api/lineage.py +1151 -0
- truthound_dashboard/api/maintenance.py +363 -0
- truthound_dashboard/api/middleware.py +373 -1
- truthound_dashboard/api/model_monitoring.py +805 -0
- truthound_dashboard/api/notifications_advanced.py +2452 -0
- truthound_dashboard/api/plugins.py +2096 -0
- truthound_dashboard/api/profile.py +211 -14
- truthound_dashboard/api/reports.py +853 -0
- truthound_dashboard/api/router.py +147 -0
- truthound_dashboard/api/rule_suggestions.py +310 -0
- truthound_dashboard/api/schema_evolution.py +231 -0
- truthound_dashboard/api/sources.py +47 -3
- truthound_dashboard/api/triggers.py +190 -0
- truthound_dashboard/api/validations.py +13 -0
- truthound_dashboard/api/validators.py +333 -4
- truthound_dashboard/api/versioning.py +309 -0
- truthound_dashboard/api/websocket.py +301 -0
- truthound_dashboard/core/__init__.py +27 -0
- truthound_dashboard/core/anomaly.py +1395 -0
- truthound_dashboard/core/anomaly_explainer.py +633 -0
- truthound_dashboard/core/cache.py +206 -0
- truthound_dashboard/core/cached_services.py +422 -0
- truthound_dashboard/core/charts.py +352 -0
- truthound_dashboard/core/connections.py +1069 -42
- truthound_dashboard/core/cross_alerts.py +837 -0
- truthound_dashboard/core/drift_monitor.py +1477 -0
- truthound_dashboard/core/drift_sampling.py +669 -0
- truthound_dashboard/core/i18n/__init__.py +42 -0
- truthound_dashboard/core/i18n/detector.py +173 -0
- truthound_dashboard/core/i18n/messages.py +564 -0
- truthound_dashboard/core/lineage.py +971 -0
- truthound_dashboard/core/maintenance.py +443 -5
- truthound_dashboard/core/model_monitoring.py +1043 -0
- truthound_dashboard/core/notifications/channels.py +1020 -1
- truthound_dashboard/core/notifications/deduplication/__init__.py +143 -0
- truthound_dashboard/core/notifications/deduplication/policies.py +274 -0
- truthound_dashboard/core/notifications/deduplication/service.py +400 -0
- truthound_dashboard/core/notifications/deduplication/stores.py +2365 -0
- truthound_dashboard/core/notifications/deduplication/strategies.py +422 -0
- truthound_dashboard/core/notifications/dispatcher.py +43 -0
- truthound_dashboard/core/notifications/escalation/__init__.py +149 -0
- truthound_dashboard/core/notifications/escalation/backends.py +1384 -0
- truthound_dashboard/core/notifications/escalation/engine.py +429 -0
- truthound_dashboard/core/notifications/escalation/models.py +336 -0
- truthound_dashboard/core/notifications/escalation/scheduler.py +1187 -0
- truthound_dashboard/core/notifications/escalation/state_machine.py +330 -0
- truthound_dashboard/core/notifications/escalation/stores.py +2896 -0
- truthound_dashboard/core/notifications/events.py +49 -0
- truthound_dashboard/core/notifications/metrics/__init__.py +115 -0
- truthound_dashboard/core/notifications/metrics/base.py +528 -0
- truthound_dashboard/core/notifications/metrics/collectors.py +583 -0
- truthound_dashboard/core/notifications/routing/__init__.py +169 -0
- truthound_dashboard/core/notifications/routing/combinators.py +184 -0
- truthound_dashboard/core/notifications/routing/config.py +375 -0
- truthound_dashboard/core/notifications/routing/config_parser.py +867 -0
- truthound_dashboard/core/notifications/routing/engine.py +382 -0
- truthound_dashboard/core/notifications/routing/expression_engine.py +1269 -0
- truthound_dashboard/core/notifications/routing/jinja2_engine.py +774 -0
- truthound_dashboard/core/notifications/routing/rules.py +625 -0
- truthound_dashboard/core/notifications/routing/validator.py +678 -0
- truthound_dashboard/core/notifications/service.py +2 -0
- truthound_dashboard/core/notifications/stats_aggregator.py +850 -0
- truthound_dashboard/core/notifications/throttling/__init__.py +83 -0
- truthound_dashboard/core/notifications/throttling/builder.py +311 -0
- truthound_dashboard/core/notifications/throttling/stores.py +1859 -0
- truthound_dashboard/core/notifications/throttling/throttlers.py +633 -0
- truthound_dashboard/core/openlineage.py +1028 -0
- truthound_dashboard/core/plugins/__init__.py +39 -0
- truthound_dashboard/core/plugins/docs/__init__.py +39 -0
- truthound_dashboard/core/plugins/docs/extractor.py +703 -0
- truthound_dashboard/core/plugins/docs/renderers.py +804 -0
- truthound_dashboard/core/plugins/hooks/__init__.py +63 -0
- truthound_dashboard/core/plugins/hooks/decorators.py +367 -0
- truthound_dashboard/core/plugins/hooks/manager.py +403 -0
- truthound_dashboard/core/plugins/hooks/protocols.py +265 -0
- truthound_dashboard/core/plugins/lifecycle/__init__.py +41 -0
- truthound_dashboard/core/plugins/lifecycle/hot_reload.py +584 -0
- truthound_dashboard/core/plugins/lifecycle/machine.py +419 -0
- truthound_dashboard/core/plugins/lifecycle/states.py +266 -0
- truthound_dashboard/core/plugins/loader.py +504 -0
- truthound_dashboard/core/plugins/registry.py +810 -0
- truthound_dashboard/core/plugins/reporter_executor.py +588 -0
- truthound_dashboard/core/plugins/sandbox/__init__.py +59 -0
- truthound_dashboard/core/plugins/sandbox/code_validator.py +243 -0
- truthound_dashboard/core/plugins/sandbox/engines.py +770 -0
- truthound_dashboard/core/plugins/sandbox/protocols.py +194 -0
- truthound_dashboard/core/plugins/sandbox.py +617 -0
- truthound_dashboard/core/plugins/security/__init__.py +68 -0
- truthound_dashboard/core/plugins/security/analyzer.py +535 -0
- truthound_dashboard/core/plugins/security/policies.py +311 -0
- truthound_dashboard/core/plugins/security/protocols.py +296 -0
- truthound_dashboard/core/plugins/security/signing.py +842 -0
- truthound_dashboard/core/plugins/security.py +446 -0
- truthound_dashboard/core/plugins/validator_executor.py +401 -0
- truthound_dashboard/core/plugins/versioning/__init__.py +51 -0
- truthound_dashboard/core/plugins/versioning/constraints.py +377 -0
- truthound_dashboard/core/plugins/versioning/dependencies.py +541 -0
- truthound_dashboard/core/plugins/versioning/semver.py +266 -0
- truthound_dashboard/core/profile_comparison.py +601 -0
- truthound_dashboard/core/report_history.py +570 -0
- truthound_dashboard/core/reporters/__init__.py +57 -0
- truthound_dashboard/core/reporters/base.py +296 -0
- truthound_dashboard/core/reporters/csv_reporter.py +155 -0
- truthound_dashboard/core/reporters/html_reporter.py +598 -0
- truthound_dashboard/core/reporters/i18n/__init__.py +65 -0
- truthound_dashboard/core/reporters/i18n/base.py +494 -0
- truthound_dashboard/core/reporters/i18n/catalogs.py +930 -0
- truthound_dashboard/core/reporters/json_reporter.py +160 -0
- truthound_dashboard/core/reporters/junit_reporter.py +233 -0
- truthound_dashboard/core/reporters/markdown_reporter.py +207 -0
- truthound_dashboard/core/reporters/pdf_reporter.py +209 -0
- truthound_dashboard/core/reporters/registry.py +272 -0
- truthound_dashboard/core/rule_generator.py +2088 -0
- truthound_dashboard/core/scheduler.py +822 -12
- truthound_dashboard/core/schema_evolution.py +858 -0
- truthound_dashboard/core/services.py +152 -9
- truthound_dashboard/core/statistics.py +718 -0
- truthound_dashboard/core/streaming_anomaly.py +883 -0
- truthound_dashboard/core/triggers/__init__.py +45 -0
- truthound_dashboard/core/triggers/base.py +226 -0
- truthound_dashboard/core/triggers/evaluators.py +609 -0
- truthound_dashboard/core/triggers/factory.py +363 -0
- truthound_dashboard/core/unified_alerts.py +870 -0
- truthound_dashboard/core/validation_limits.py +509 -0
- truthound_dashboard/core/versioning.py +709 -0
- truthound_dashboard/core/websocket/__init__.py +59 -0
- truthound_dashboard/core/websocket/manager.py +512 -0
- truthound_dashboard/core/websocket/messages.py +130 -0
- truthound_dashboard/db/__init__.py +30 -0
- truthound_dashboard/db/models.py +3375 -3
- truthound_dashboard/main.py +22 -0
- truthound_dashboard/schemas/__init__.py +396 -1
- truthound_dashboard/schemas/anomaly.py +1258 -0
- truthound_dashboard/schemas/base.py +4 -0
- truthound_dashboard/schemas/cross_alerts.py +334 -0
- truthound_dashboard/schemas/drift_monitor.py +890 -0
- truthound_dashboard/schemas/lineage.py +428 -0
- truthound_dashboard/schemas/maintenance.py +154 -0
- truthound_dashboard/schemas/model_monitoring.py +374 -0
- truthound_dashboard/schemas/notifications_advanced.py +1363 -0
- truthound_dashboard/schemas/openlineage.py +704 -0
- truthound_dashboard/schemas/plugins.py +1293 -0
- truthound_dashboard/schemas/profile.py +420 -34
- truthound_dashboard/schemas/profile_comparison.py +242 -0
- truthound_dashboard/schemas/reports.py +285 -0
- truthound_dashboard/schemas/rule_suggestion.py +434 -0
- truthound_dashboard/schemas/schema_evolution.py +164 -0
- truthound_dashboard/schemas/source.py +117 -2
- truthound_dashboard/schemas/triggers.py +511 -0
- truthound_dashboard/schemas/unified_alerts.py +223 -0
- truthound_dashboard/schemas/validation.py +25 -1
- truthound_dashboard/schemas/validators/__init__.py +11 -0
- truthound_dashboard/schemas/validators/base.py +151 -0
- truthound_dashboard/schemas/versioning.py +152 -0
- truthound_dashboard/static/index.html +2 -2
- {truthound_dashboard-1.3.1.dist-info → truthound_dashboard-1.4.1.dist-info}/METADATA +147 -23
- truthound_dashboard-1.4.1.dist-info/RECORD +239 -0
- truthound_dashboard/static/assets/index-BZG20KuF.js +0 -586
- truthound_dashboard/static/assets/index-D_HyZ3pb.css +0 -1
- truthound_dashboard/static/assets/unmerged_dictionaries-CtpqQBm0.js +0 -1
- truthound_dashboard-1.3.1.dist-info/RECORD +0 -110
- {truthound_dashboard-1.3.1.dist-info → truthound_dashboard-1.4.1.dist-info}/WHEEL +0 -0
- {truthound_dashboard-1.3.1.dist-info → truthound_dashboard-1.4.1.dist-info}/entry_points.txt +0 -0
- {truthound_dashboard-1.3.1.dist-info → truthound_dashboard-1.4.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -5,54 +5,341 @@ This module defines schemas for data profiling API operations.
|
|
|
5
5
|
|
|
6
6
|
from __future__ import annotations
|
|
7
7
|
|
|
8
|
-
from
|
|
8
|
+
from enum import Enum
|
|
9
|
+
from typing import Any, Literal
|
|
9
10
|
|
|
10
11
|
from pydantic import Field
|
|
11
12
|
|
|
12
13
|
from .base import BaseSchema
|
|
13
14
|
|
|
14
15
|
|
|
16
|
+
# =============================================================================
|
|
17
|
+
# Sampling Strategy Enums and Types
|
|
18
|
+
# =============================================================================
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class SamplingStrategy(str, Enum):
|
|
22
|
+
"""Sampling strategies for data profiling.
|
|
23
|
+
|
|
24
|
+
Supports 8+ strategies from truthound profiler:
|
|
25
|
+
- NONE: Profile all data (for small datasets < 100K rows)
|
|
26
|
+
- HEAD: First N rows (for quick previews)
|
|
27
|
+
- RANDOM: Random sampling (general purpose)
|
|
28
|
+
- SYSTEMATIC: Every Nth row (for ordered data)
|
|
29
|
+
- STRATIFIED: Maintain distribution across categories
|
|
30
|
+
- RESERVOIR: Streaming-friendly sampling
|
|
31
|
+
- ADAPTIVE: Auto-select based on data characteristics (default)
|
|
32
|
+
- HASH: Deterministic sampling for reproducibility
|
|
33
|
+
"""
|
|
34
|
+
|
|
35
|
+
NONE = "none"
|
|
36
|
+
HEAD = "head"
|
|
37
|
+
RANDOM = "random"
|
|
38
|
+
SYSTEMATIC = "systematic"
|
|
39
|
+
STRATIFIED = "stratified"
|
|
40
|
+
RESERVOIR = "reservoir"
|
|
41
|
+
ADAPTIVE = "adaptive"
|
|
42
|
+
HASH = "hash"
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
# Literal type for API validation
|
|
46
|
+
SamplingStrategyType = Literal[
|
|
47
|
+
"none", "head", "random", "systematic", "stratified", "reservoir", "adaptive", "hash"
|
|
48
|
+
]
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class SamplingConfig(BaseSchema):
|
|
52
|
+
"""Advanced sampling configuration for profiling.
|
|
53
|
+
|
|
54
|
+
Provides fine-grained control over sampling behavior for large datasets.
|
|
55
|
+
"""
|
|
56
|
+
|
|
57
|
+
strategy: SamplingStrategyType = Field(
|
|
58
|
+
default="adaptive",
|
|
59
|
+
description="Sampling strategy to use. 'adaptive' auto-selects based on data size.",
|
|
60
|
+
)
|
|
61
|
+
sample_size: int | None = Field(
|
|
62
|
+
default=None,
|
|
63
|
+
ge=100,
|
|
64
|
+
description="Target sample size. If None, auto-estimated based on confidence level.",
|
|
65
|
+
)
|
|
66
|
+
confidence_level: float = Field(
|
|
67
|
+
default=0.95,
|
|
68
|
+
ge=0.80,
|
|
69
|
+
le=0.99,
|
|
70
|
+
description="Statistical confidence level for sample size estimation (0.80-0.99).",
|
|
71
|
+
)
|
|
72
|
+
margin_of_error: float = Field(
|
|
73
|
+
default=0.03,
|
|
74
|
+
ge=0.01,
|
|
75
|
+
le=0.10,
|
|
76
|
+
description="Acceptable margin of error for statistical estimates (0.01-0.10).",
|
|
77
|
+
)
|
|
78
|
+
strata_column: str | None = Field(
|
|
79
|
+
default=None,
|
|
80
|
+
description="Column for stratified sampling to maintain distribution.",
|
|
81
|
+
)
|
|
82
|
+
seed: int | None = Field(
|
|
83
|
+
default=None,
|
|
84
|
+
description="Random seed for reproducible sampling results.",
|
|
85
|
+
)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
# =============================================================================
|
|
89
|
+
# Pattern Detection Configuration
|
|
90
|
+
# =============================================================================
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
class PatternType(str, Enum):
|
|
94
|
+
"""Supported data pattern types for detection."""
|
|
95
|
+
|
|
96
|
+
EMAIL = "email"
|
|
97
|
+
PHONE = "phone"
|
|
98
|
+
UUID = "uuid"
|
|
99
|
+
URL = "url"
|
|
100
|
+
IP_ADDRESS = "ip_address"
|
|
101
|
+
CREDIT_CARD = "credit_card"
|
|
102
|
+
DATE = "date"
|
|
103
|
+
DATETIME = "datetime"
|
|
104
|
+
KOREAN_RRN = "korean_rrn"
|
|
105
|
+
KOREAN_PHONE = "korean_phone"
|
|
106
|
+
SSN = "ssn"
|
|
107
|
+
POSTAL_CODE = "postal_code"
|
|
108
|
+
CURRENCY = "currency"
|
|
109
|
+
PERCENTAGE = "percentage"
|
|
110
|
+
CUSTOM = "custom"
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
class PatternDetectionConfig(BaseSchema):
|
|
114
|
+
"""Configuration for pattern detection during profiling.
|
|
115
|
+
|
|
116
|
+
Enables automatic detection of common data patterns like
|
|
117
|
+
emails, phone numbers, UUIDs, etc.
|
|
118
|
+
"""
|
|
119
|
+
|
|
120
|
+
enabled: bool = Field(
|
|
121
|
+
default=True,
|
|
122
|
+
description="Enable pattern detection during profiling.",
|
|
123
|
+
)
|
|
124
|
+
sample_size: int = Field(
|
|
125
|
+
default=1000,
|
|
126
|
+
ge=100,
|
|
127
|
+
le=100000,
|
|
128
|
+
description="Number of values to sample for pattern detection.",
|
|
129
|
+
)
|
|
130
|
+
min_confidence: float = Field(
|
|
131
|
+
default=0.8,
|
|
132
|
+
ge=0.5,
|
|
133
|
+
le=1.0,
|
|
134
|
+
description="Minimum confidence threshold for pattern matches (0.5-1.0).",
|
|
135
|
+
)
|
|
136
|
+
patterns_to_detect: list[str] | None = Field(
|
|
137
|
+
default=None,
|
|
138
|
+
description="Specific patterns to detect. If None, detects all supported patterns.",
|
|
139
|
+
)
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
# =============================================================================
|
|
143
|
+
# Profile Request Schema (Enhanced)
|
|
144
|
+
# =============================================================================
|
|
145
|
+
|
|
146
|
+
|
|
15
147
|
class ProfileRequest(BaseSchema):
|
|
16
148
|
"""Request schema for data profiling.
|
|
17
149
|
|
|
18
|
-
Provides
|
|
19
|
-
|
|
150
|
+
Provides comprehensive configuration for profiling operations including
|
|
151
|
+
sampling strategies, pattern detection, and statistical analysis options.
|
|
20
152
|
"""
|
|
21
153
|
|
|
154
|
+
# Basic sampling (backward compatible)
|
|
22
155
|
sample_size: int | None = Field(
|
|
23
156
|
default=None,
|
|
24
157
|
ge=1,
|
|
25
158
|
description="Maximum number of rows to sample for profiling. "
|
|
26
|
-
"If None, profiles all data.
|
|
159
|
+
"If None, profiles all data. For advanced sampling, use 'sampling' config.",
|
|
27
160
|
examples=[10000, 50000, 100000],
|
|
28
161
|
)
|
|
29
162
|
|
|
163
|
+
# Advanced sampling configuration
|
|
164
|
+
sampling: SamplingConfig | None = Field(
|
|
165
|
+
default=None,
|
|
166
|
+
description="Advanced sampling configuration. If provided, overrides sample_size.",
|
|
167
|
+
)
|
|
168
|
+
|
|
169
|
+
# Pattern detection configuration
|
|
170
|
+
pattern_detection: PatternDetectionConfig | None = Field(
|
|
171
|
+
default=None,
|
|
172
|
+
description="Pattern detection configuration. If None, uses default settings.",
|
|
173
|
+
)
|
|
174
|
+
|
|
175
|
+
# Additional profiling options
|
|
176
|
+
include_histograms: bool = Field(
|
|
177
|
+
default=True,
|
|
178
|
+
description="Include value distribution histograms in the profile.",
|
|
179
|
+
)
|
|
180
|
+
include_correlations: bool = Field(
|
|
181
|
+
default=False,
|
|
182
|
+
description="Include column correlation analysis (increases processing time).",
|
|
183
|
+
)
|
|
184
|
+
include_cardinality: bool = Field(
|
|
185
|
+
default=True,
|
|
186
|
+
description="Include cardinality estimates for high-cardinality columns.",
|
|
187
|
+
)
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
# =============================================================================
|
|
191
|
+
# Pattern Detection Results
|
|
192
|
+
# =============================================================================
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
class DetectedPattern(BaseSchema):
|
|
196
|
+
"""A detected data pattern in a column."""
|
|
197
|
+
|
|
198
|
+
pattern_type: str = Field(
|
|
199
|
+
...,
|
|
200
|
+
description="Type of pattern detected (email, phone, uuid, etc.)",
|
|
201
|
+
)
|
|
202
|
+
confidence: float = Field(
|
|
203
|
+
...,
|
|
204
|
+
ge=0.0,
|
|
205
|
+
le=1.0,
|
|
206
|
+
description="Confidence score of the pattern match (0-1).",
|
|
207
|
+
)
|
|
208
|
+
match_count: int = Field(
|
|
209
|
+
...,
|
|
210
|
+
ge=0,
|
|
211
|
+
description="Number of values matching this pattern.",
|
|
212
|
+
)
|
|
213
|
+
match_percentage: float = Field(
|
|
214
|
+
...,
|
|
215
|
+
ge=0.0,
|
|
216
|
+
le=100.0,
|
|
217
|
+
description="Percentage of non-null values matching this pattern.",
|
|
218
|
+
)
|
|
219
|
+
sample_matches: list[str] | None = Field(
|
|
220
|
+
default=None,
|
|
221
|
+
description="Sample values matching this pattern (masked for sensitive data).",
|
|
222
|
+
)
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
class HistogramBucket(BaseSchema):
|
|
226
|
+
"""A bucket in a value distribution histogram."""
|
|
227
|
+
|
|
228
|
+
bucket: str = Field(..., description="Bucket label (range or category)")
|
|
229
|
+
count: int = Field(..., ge=0, description="Count of values in this bucket")
|
|
230
|
+
percentage: float = Field(..., ge=0.0, le=100.0, description="Percentage of total")
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
# =============================================================================
|
|
234
|
+
# Column Profile Schema (Enhanced)
|
|
235
|
+
# =============================================================================
|
|
236
|
+
|
|
30
237
|
|
|
31
238
|
class ColumnProfile(BaseSchema):
|
|
32
|
-
"""Profile information for a single column.
|
|
239
|
+
"""Profile information for a single column.
|
|
240
|
+
|
|
241
|
+
Includes basic statistics, pattern detection results, and distribution data.
|
|
242
|
+
"""
|
|
33
243
|
|
|
244
|
+
# Basic identification
|
|
34
245
|
name: str = Field(..., description="Column name")
|
|
35
|
-
dtype: str = Field(..., description="
|
|
246
|
+
dtype: str = Field(..., description="Physical data type (string, int64, float64, etc.)")
|
|
247
|
+
|
|
248
|
+
# Inferred semantic type (NEW)
|
|
249
|
+
inferred_type: str | None = Field(
|
|
250
|
+
default=None,
|
|
251
|
+
description="Inferred semantic type based on pattern detection "
|
|
252
|
+
"(email, phone, uuid, url, date, currency, etc.)",
|
|
253
|
+
)
|
|
254
|
+
|
|
255
|
+
# Completeness metrics
|
|
36
256
|
null_pct: str = Field(default="0%", description="Percentage of null values")
|
|
257
|
+
null_count: int | None = Field(default=None, description="Count of null values")
|
|
258
|
+
|
|
259
|
+
# Uniqueness metrics
|
|
37
260
|
unique_pct: str = Field(default="0%", description="Percentage of unique values")
|
|
261
|
+
distinct_count: int | None = Field(
|
|
262
|
+
default=None,
|
|
263
|
+
description="Count of distinct values",
|
|
264
|
+
)
|
|
265
|
+
is_unique: bool | None = Field(
|
|
266
|
+
default=None,
|
|
267
|
+
description="Whether all non-null values are unique",
|
|
268
|
+
)
|
|
269
|
+
|
|
270
|
+
# Value range (for numeric/date columns)
|
|
38
271
|
min: Any | None = Field(default=None, description="Minimum value")
|
|
39
272
|
max: Any | None = Field(default=None, description="Maximum value")
|
|
273
|
+
|
|
274
|
+
# Statistical measures (for numeric columns)
|
|
40
275
|
mean: float | None = Field(default=None, description="Mean value (numeric columns)")
|
|
41
276
|
std: float | None = Field(default=None, description="Standard deviation (numeric)")
|
|
277
|
+
median: float | None = Field(default=None, description="Median value (numeric)")
|
|
278
|
+
q1: float | None = Field(default=None, description="25th percentile (Q1)")
|
|
279
|
+
q3: float | None = Field(default=None, description="75th percentile (Q3)")
|
|
280
|
+
skewness: float | None = Field(default=None, description="Skewness of distribution")
|
|
281
|
+
kurtosis: float | None = Field(default=None, description="Kurtosis of distribution")
|
|
42
282
|
|
|
43
|
-
#
|
|
44
|
-
|
|
283
|
+
# String-specific metrics
|
|
284
|
+
min_length: int | None = Field(default=None, description="Minimum string length")
|
|
285
|
+
max_length: int | None = Field(default=None, description="Maximum string length")
|
|
286
|
+
avg_length: float | None = Field(default=None, description="Average string length")
|
|
287
|
+
|
|
288
|
+
# Pattern detection results (NEW)
|
|
289
|
+
patterns: list[DetectedPattern] | None = Field(
|
|
45
290
|
default=None,
|
|
46
|
-
description="
|
|
291
|
+
description="Detected data patterns (email, phone, uuid, etc.)",
|
|
292
|
+
)
|
|
293
|
+
primary_pattern: str | None = Field(
|
|
294
|
+
default=None,
|
|
295
|
+
description="The most prevalent detected pattern type",
|
|
47
296
|
)
|
|
297
|
+
|
|
298
|
+
# Distribution data
|
|
48
299
|
most_common: list[dict[str, Any]] | None = Field(
|
|
49
300
|
default=None,
|
|
50
301
|
description="Most common values with counts",
|
|
51
302
|
)
|
|
303
|
+
histogram: list[HistogramBucket] | None = Field(
|
|
304
|
+
default=None,
|
|
305
|
+
description="Value distribution histogram",
|
|
306
|
+
)
|
|
307
|
+
|
|
308
|
+
# Cardinality estimate for high-cardinality columns
|
|
309
|
+
cardinality_estimate: int | None = Field(
|
|
310
|
+
default=None,
|
|
311
|
+
description="Estimated cardinality using HyperLogLog (for high-cardinality columns)",
|
|
312
|
+
)
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
# =============================================================================
|
|
316
|
+
# Sampling Metadata for Response
|
|
317
|
+
# =============================================================================
|
|
318
|
+
|
|
319
|
+
|
|
320
|
+
class SamplingMetadata(BaseSchema):
|
|
321
|
+
"""Metadata about sampling used during profiling."""
|
|
322
|
+
|
|
323
|
+
strategy_used: str = Field(..., description="Sampling strategy that was applied")
|
|
324
|
+
sample_size: int = Field(..., description="Actual sample size used")
|
|
325
|
+
total_rows: int = Field(..., description="Total rows in the dataset")
|
|
326
|
+
sampling_ratio: float = Field(..., description="Ratio of sampled to total rows")
|
|
327
|
+
seed: int | None = Field(default=None, description="Random seed used (if applicable)")
|
|
328
|
+
confidence_level: float | None = Field(
|
|
329
|
+
default=None, description="Confidence level achieved"
|
|
330
|
+
)
|
|
331
|
+
margin_of_error: float | None = Field(
|
|
332
|
+
default=None, description="Estimated margin of error"
|
|
333
|
+
)
|
|
334
|
+
|
|
335
|
+
|
|
336
|
+
# =============================================================================
|
|
337
|
+
# Profile Response Schema (Enhanced)
|
|
338
|
+
# =============================================================================
|
|
52
339
|
|
|
53
340
|
|
|
54
341
|
class ProfileResponse(BaseSchema):
|
|
55
|
-
"""Data profiling response."""
|
|
342
|
+
"""Data profiling response with enhanced statistics and pattern detection."""
|
|
56
343
|
|
|
57
344
|
source: str = Field(..., description="Source path/identifier")
|
|
58
345
|
row_count: int = Field(..., ge=0, description="Total number of rows")
|
|
@@ -63,6 +350,28 @@ class ProfileResponse(BaseSchema):
|
|
|
63
350
|
description="Profile for each column",
|
|
64
351
|
)
|
|
65
352
|
|
|
353
|
+
# Sampling metadata (NEW)
|
|
354
|
+
sampling: SamplingMetadata | None = Field(
|
|
355
|
+
default=None,
|
|
356
|
+
description="Information about sampling applied during profiling",
|
|
357
|
+
)
|
|
358
|
+
|
|
359
|
+
# Pattern detection summary (NEW)
|
|
360
|
+
detected_patterns_summary: dict[str, int] | None = Field(
|
|
361
|
+
default=None,
|
|
362
|
+
description="Summary of detected patterns across all columns {pattern_type: count}",
|
|
363
|
+
)
|
|
364
|
+
|
|
365
|
+
# Profiling metadata (NEW)
|
|
366
|
+
profiled_at: str | None = Field(
|
|
367
|
+
default=None,
|
|
368
|
+
description="ISO timestamp when profiling was performed",
|
|
369
|
+
)
|
|
370
|
+
profiling_duration_ms: int | None = Field(
|
|
371
|
+
default=None,
|
|
372
|
+
description="Time taken to profile in milliseconds",
|
|
373
|
+
)
|
|
374
|
+
|
|
66
375
|
# Computed properties
|
|
67
376
|
@property
|
|
68
377
|
def size_human(self) -> str:
|
|
@@ -74,6 +383,70 @@ class ProfileResponse(BaseSchema):
|
|
|
74
383
|
size /= 1024
|
|
75
384
|
return f"{size:.1f} PB"
|
|
76
385
|
|
|
386
|
+
@classmethod
|
|
387
|
+
def _build_column_profile(cls, col: dict[str, Any]) -> ColumnProfile:
|
|
388
|
+
"""Build a ColumnProfile from column data dict.
|
|
389
|
+
|
|
390
|
+
Args:
|
|
391
|
+
col: Column data dictionary from adapter or database.
|
|
392
|
+
|
|
393
|
+
Returns:
|
|
394
|
+
ColumnProfile instance with all available fields.
|
|
395
|
+
"""
|
|
396
|
+
# Build patterns list if present
|
|
397
|
+
patterns = None
|
|
398
|
+
if col.get("patterns"):
|
|
399
|
+
patterns = [
|
|
400
|
+
DetectedPattern(
|
|
401
|
+
pattern_type=p.get("pattern_type", p.get("type", "unknown")),
|
|
402
|
+
confidence=p.get("confidence", 0.0),
|
|
403
|
+
match_count=p.get("match_count", 0),
|
|
404
|
+
match_percentage=p.get("match_percentage", 0.0),
|
|
405
|
+
sample_matches=p.get("sample_matches"),
|
|
406
|
+
)
|
|
407
|
+
for p in col["patterns"]
|
|
408
|
+
]
|
|
409
|
+
|
|
410
|
+
# Build histogram if present
|
|
411
|
+
histogram = None
|
|
412
|
+
if col.get("histogram"):
|
|
413
|
+
histogram = [
|
|
414
|
+
HistogramBucket(
|
|
415
|
+
bucket=h.get("bucket", ""),
|
|
416
|
+
count=h.get("count", 0),
|
|
417
|
+
percentage=h.get("percentage", 0.0),
|
|
418
|
+
)
|
|
419
|
+
for h in col["histogram"]
|
|
420
|
+
]
|
|
421
|
+
|
|
422
|
+
return ColumnProfile(
|
|
423
|
+
name=col["name"],
|
|
424
|
+
dtype=col["dtype"],
|
|
425
|
+
inferred_type=col.get("inferred_type"),
|
|
426
|
+
null_pct=col.get("null_pct", "0%"),
|
|
427
|
+
null_count=col.get("null_count"),
|
|
428
|
+
unique_pct=col.get("unique_pct", "0%"),
|
|
429
|
+
distinct_count=col.get("distinct_count"),
|
|
430
|
+
is_unique=col.get("is_unique"),
|
|
431
|
+
min=col.get("min"),
|
|
432
|
+
max=col.get("max"),
|
|
433
|
+
mean=col.get("mean"),
|
|
434
|
+
std=col.get("std"),
|
|
435
|
+
median=col.get("median"),
|
|
436
|
+
q1=col.get("q1"),
|
|
437
|
+
q3=col.get("q3"),
|
|
438
|
+
skewness=col.get("skewness"),
|
|
439
|
+
kurtosis=col.get("kurtosis"),
|
|
440
|
+
min_length=col.get("min_length"),
|
|
441
|
+
max_length=col.get("max_length"),
|
|
442
|
+
avg_length=col.get("avg_length"),
|
|
443
|
+
patterns=patterns,
|
|
444
|
+
primary_pattern=col.get("primary_pattern"),
|
|
445
|
+
most_common=col.get("most_common"),
|
|
446
|
+
histogram=histogram,
|
|
447
|
+
cardinality_estimate=col.get("cardinality_estimate"),
|
|
448
|
+
)
|
|
449
|
+
|
|
77
450
|
@classmethod
|
|
78
451
|
def from_result(cls, result: Any) -> ProfileResponse:
|
|
79
452
|
"""Create response from adapter result or Profile model.
|
|
@@ -89,41 +462,50 @@ class ProfileResponse(BaseSchema):
|
|
|
89
462
|
profile_json = result.profile_json
|
|
90
463
|
source_name = profile_json.get("source", result.source_id)
|
|
91
464
|
columns_data = profile_json.get("columns", [])
|
|
92
|
-
columns = [
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
465
|
+
columns = [cls._build_column_profile(col) for col in columns_data]
|
|
466
|
+
|
|
467
|
+
# Build sampling metadata if present
|
|
468
|
+
sampling = None
|
|
469
|
+
if profile_json.get("sampling"):
|
|
470
|
+
s = profile_json["sampling"]
|
|
471
|
+
sampling = SamplingMetadata(
|
|
472
|
+
strategy_used=s.get("strategy_used", "none"),
|
|
473
|
+
sample_size=s.get("sample_size", result.row_count or 0),
|
|
474
|
+
total_rows=s.get("total_rows", result.row_count or 0),
|
|
475
|
+
sampling_ratio=s.get("sampling_ratio", 1.0),
|
|
476
|
+
seed=s.get("seed"),
|
|
477
|
+
confidence_level=s.get("confidence_level"),
|
|
478
|
+
margin_of_error=s.get("margin_of_error"),
|
|
102
479
|
)
|
|
103
|
-
|
|
104
|
-
]
|
|
480
|
+
|
|
105
481
|
return cls(
|
|
106
482
|
source=source_name,
|
|
107
483
|
row_count=result.row_count or 0,
|
|
108
484
|
column_count=result.column_count or 0,
|
|
109
485
|
size_bytes=result.size_bytes or 0,
|
|
110
486
|
columns=columns,
|
|
487
|
+
sampling=sampling,
|
|
488
|
+
detected_patterns_summary=profile_json.get("detected_patterns_summary"),
|
|
489
|
+
profiled_at=profile_json.get("profiled_at"),
|
|
490
|
+
profiling_duration_ms=profile_json.get("profiling_duration_ms"),
|
|
111
491
|
)
|
|
112
492
|
|
|
113
493
|
# Handle ProfileResult (from adapter)
|
|
114
|
-
columns = [
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
494
|
+
columns = [cls._build_column_profile(col) for col in result.columns]
|
|
495
|
+
|
|
496
|
+
# Build sampling metadata if present
|
|
497
|
+
sampling = None
|
|
498
|
+
if hasattr(result, "sampling") and result.sampling:
|
|
499
|
+
s = result.sampling
|
|
500
|
+
sampling = SamplingMetadata(
|
|
501
|
+
strategy_used=getattr(s, "strategy_used", "none"),
|
|
502
|
+
sample_size=getattr(s, "sample_size", result.row_count),
|
|
503
|
+
total_rows=getattr(s, "total_rows", result.row_count),
|
|
504
|
+
sampling_ratio=getattr(s, "sampling_ratio", 1.0),
|
|
505
|
+
seed=getattr(s, "seed", None),
|
|
506
|
+
confidence_level=getattr(s, "confidence_level", None),
|
|
507
|
+
margin_of_error=getattr(s, "margin_of_error", None),
|
|
124
508
|
)
|
|
125
|
-
for col in result.columns
|
|
126
|
-
]
|
|
127
509
|
|
|
128
510
|
return cls(
|
|
129
511
|
source=result.source,
|
|
@@ -131,4 +513,8 @@ class ProfileResponse(BaseSchema):
|
|
|
131
513
|
column_count=result.column_count,
|
|
132
514
|
size_bytes=result.size_bytes,
|
|
133
515
|
columns=columns,
|
|
516
|
+
sampling=sampling,
|
|
517
|
+
detected_patterns_summary=getattr(result, "detected_patterns_summary", None),
|
|
518
|
+
profiled_at=getattr(result, "profiled_at", None),
|
|
519
|
+
profiling_duration_ms=getattr(result, "profiling_duration_ms", None),
|
|
134
520
|
)
|