truthound-dashboard 1.4.4__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- truthound_dashboard/api/alerts.py +75 -86
- truthound_dashboard/api/anomaly.py +7 -13
- truthound_dashboard/api/cross_alerts.py +38 -52
- truthound_dashboard/api/drift.py +49 -59
- truthound_dashboard/api/drift_monitor.py +234 -79
- truthound_dashboard/api/enterprise_sampling.py +498 -0
- truthound_dashboard/api/history.py +57 -5
- truthound_dashboard/api/lineage.py +3 -48
- truthound_dashboard/api/maintenance.py +104 -49
- truthound_dashboard/api/mask.py +1 -2
- truthound_dashboard/api/middleware.py +2 -1
- truthound_dashboard/api/model_monitoring.py +435 -311
- truthound_dashboard/api/notifications.py +227 -191
- truthound_dashboard/api/notifications_advanced.py +21 -20
- truthound_dashboard/api/observability.py +586 -0
- truthound_dashboard/api/plugins.py +2 -433
- truthound_dashboard/api/profile.py +199 -37
- truthound_dashboard/api/quality_reporter.py +701 -0
- truthound_dashboard/api/reports.py +7 -16
- truthound_dashboard/api/router.py +66 -0
- truthound_dashboard/api/rule_suggestions.py +5 -5
- truthound_dashboard/api/scan.py +17 -19
- truthound_dashboard/api/schedules.py +85 -50
- truthound_dashboard/api/schema_evolution.py +6 -6
- truthound_dashboard/api/schema_watcher.py +667 -0
- truthound_dashboard/api/sources.py +98 -27
- truthound_dashboard/api/tiering.py +1323 -0
- truthound_dashboard/api/triggers.py +14 -11
- truthound_dashboard/api/validations.py +12 -11
- truthound_dashboard/api/versioning.py +1 -6
- truthound_dashboard/core/__init__.py +129 -3
- truthound_dashboard/core/actions/__init__.py +62 -0
- truthound_dashboard/core/actions/custom.py +426 -0
- truthound_dashboard/core/actions/notifications.py +910 -0
- truthound_dashboard/core/actions/storage.py +472 -0
- truthound_dashboard/core/actions/webhook.py +281 -0
- truthound_dashboard/core/anomaly.py +262 -67
- truthound_dashboard/core/anomaly_explainer.py +4 -3
- truthound_dashboard/core/backends/__init__.py +67 -0
- truthound_dashboard/core/backends/base.py +299 -0
- truthound_dashboard/core/backends/errors.py +191 -0
- truthound_dashboard/core/backends/factory.py +423 -0
- truthound_dashboard/core/backends/mock_backend.py +451 -0
- truthound_dashboard/core/backends/truthound_backend.py +718 -0
- truthound_dashboard/core/checkpoint/__init__.py +87 -0
- truthound_dashboard/core/checkpoint/adapters.py +814 -0
- truthound_dashboard/core/checkpoint/checkpoint.py +491 -0
- truthound_dashboard/core/checkpoint/runner.py +270 -0
- truthound_dashboard/core/connections.py +437 -10
- truthound_dashboard/core/converters/__init__.py +14 -0
- truthound_dashboard/core/converters/truthound.py +620 -0
- truthound_dashboard/core/cross_alerts.py +540 -320
- truthound_dashboard/core/datasource_factory.py +1672 -0
- truthound_dashboard/core/drift_monitor.py +216 -20
- truthound_dashboard/core/enterprise_sampling.py +1291 -0
- truthound_dashboard/core/interfaces/__init__.py +225 -0
- truthound_dashboard/core/interfaces/actions.py +652 -0
- truthound_dashboard/core/interfaces/base.py +247 -0
- truthound_dashboard/core/interfaces/checkpoint.py +676 -0
- truthound_dashboard/core/interfaces/protocols.py +664 -0
- truthound_dashboard/core/interfaces/reporters.py +650 -0
- truthound_dashboard/core/interfaces/routing.py +646 -0
- truthound_dashboard/core/interfaces/triggers.py +619 -0
- truthound_dashboard/core/lineage.py +407 -71
- truthound_dashboard/core/model_monitoring.py +431 -3
- truthound_dashboard/core/notifications/base.py +4 -0
- truthound_dashboard/core/notifications/channels.py +501 -1203
- truthound_dashboard/core/notifications/deduplication/__init__.py +81 -115
- truthound_dashboard/core/notifications/deduplication/service.py +131 -348
- truthound_dashboard/core/notifications/dispatcher.py +202 -11
- truthound_dashboard/core/notifications/escalation/__init__.py +119 -106
- truthound_dashboard/core/notifications/escalation/engine.py +168 -358
- truthound_dashboard/core/notifications/routing/__init__.py +88 -128
- truthound_dashboard/core/notifications/routing/engine.py +90 -317
- truthound_dashboard/core/notifications/stats_aggregator.py +246 -1
- truthound_dashboard/core/notifications/throttling/__init__.py +67 -50
- truthound_dashboard/core/notifications/throttling/builder.py +117 -255
- truthound_dashboard/core/notifications/truthound_adapter.py +842 -0
- truthound_dashboard/core/phase5/collaboration.py +1 -1
- truthound_dashboard/core/plugins/lifecycle/__init__.py +0 -13
- truthound_dashboard/core/quality_reporter.py +1359 -0
- truthound_dashboard/core/report_history.py +0 -6
- truthound_dashboard/core/reporters/__init__.py +175 -14
- truthound_dashboard/core/reporters/adapters.py +943 -0
- truthound_dashboard/core/reporters/base.py +0 -3
- truthound_dashboard/core/reporters/builtin/__init__.py +18 -0
- truthound_dashboard/core/reporters/builtin/csv_reporter.py +111 -0
- truthound_dashboard/core/reporters/builtin/html_reporter.py +270 -0
- truthound_dashboard/core/reporters/builtin/json_reporter.py +127 -0
- truthound_dashboard/core/reporters/compat.py +266 -0
- truthound_dashboard/core/reporters/csv_reporter.py +2 -35
- truthound_dashboard/core/reporters/factory.py +526 -0
- truthound_dashboard/core/reporters/interfaces.py +745 -0
- truthound_dashboard/core/reporters/registry.py +1 -10
- truthound_dashboard/core/scheduler.py +165 -0
- truthound_dashboard/core/schema_evolution.py +3 -3
- truthound_dashboard/core/schema_watcher.py +1528 -0
- truthound_dashboard/core/services.py +595 -76
- truthound_dashboard/core/store_manager.py +810 -0
- truthound_dashboard/core/streaming_anomaly.py +169 -4
- truthound_dashboard/core/tiering.py +1309 -0
- truthound_dashboard/core/triggers/evaluators.py +178 -8
- truthound_dashboard/core/truthound_adapter.py +2620 -197
- truthound_dashboard/core/unified_alerts.py +23 -20
- truthound_dashboard/db/__init__.py +8 -0
- truthound_dashboard/db/database.py +8 -2
- truthound_dashboard/db/models.py +944 -25
- truthound_dashboard/db/repository.py +2 -0
- truthound_dashboard/main.py +11 -0
- truthound_dashboard/schemas/__init__.py +177 -16
- truthound_dashboard/schemas/base.py +44 -23
- truthound_dashboard/schemas/collaboration.py +19 -6
- truthound_dashboard/schemas/cross_alerts.py +19 -3
- truthound_dashboard/schemas/drift.py +61 -55
- truthound_dashboard/schemas/drift_monitor.py +67 -23
- truthound_dashboard/schemas/enterprise_sampling.py +653 -0
- truthound_dashboard/schemas/lineage.py +0 -33
- truthound_dashboard/schemas/mask.py +10 -8
- truthound_dashboard/schemas/model_monitoring.py +89 -10
- truthound_dashboard/schemas/notifications_advanced.py +13 -0
- truthound_dashboard/schemas/observability.py +453 -0
- truthound_dashboard/schemas/plugins.py +0 -280
- truthound_dashboard/schemas/profile.py +154 -247
- truthound_dashboard/schemas/quality_reporter.py +403 -0
- truthound_dashboard/schemas/reports.py +2 -2
- truthound_dashboard/schemas/rule_suggestion.py +8 -1
- truthound_dashboard/schemas/scan.py +4 -24
- truthound_dashboard/schemas/schedule.py +11 -3
- truthound_dashboard/schemas/schema_watcher.py +727 -0
- truthound_dashboard/schemas/source.py +17 -2
- truthound_dashboard/schemas/tiering.py +822 -0
- truthound_dashboard/schemas/triggers.py +16 -0
- truthound_dashboard/schemas/unified_alerts.py +7 -0
- truthound_dashboard/schemas/validation.py +0 -13
- truthound_dashboard/schemas/validators/base.py +41 -21
- truthound_dashboard/schemas/validators/business_rule_validators.py +244 -0
- truthound_dashboard/schemas/validators/localization_validators.py +273 -0
- truthound_dashboard/schemas/validators/ml_feature_validators.py +308 -0
- truthound_dashboard/schemas/validators/profiling_validators.py +275 -0
- truthound_dashboard/schemas/validators/referential_validators.py +312 -0
- truthound_dashboard/schemas/validators/registry.py +93 -8
- truthound_dashboard/schemas/validators/timeseries_validators.py +389 -0
- truthound_dashboard/schemas/versioning.py +1 -6
- truthound_dashboard/static/index.html +2 -2
- truthound_dashboard-1.5.0.dist-info/METADATA +309 -0
- {truthound_dashboard-1.4.4.dist-info → truthound_dashboard-1.5.0.dist-info}/RECORD +149 -148
- truthound_dashboard/core/plugins/hooks/__init__.py +0 -63
- truthound_dashboard/core/plugins/hooks/decorators.py +0 -367
- truthound_dashboard/core/plugins/hooks/manager.py +0 -403
- truthound_dashboard/core/plugins/hooks/protocols.py +0 -265
- truthound_dashboard/core/plugins/lifecycle/hot_reload.py +0 -584
- truthound_dashboard/core/reporters/junit_reporter.py +0 -233
- truthound_dashboard/core/reporters/markdown_reporter.py +0 -207
- truthound_dashboard/core/reporters/pdf_reporter.py +0 -209
- truthound_dashboard/static/assets/_baseUniq-BcrSP13d.js +0 -1
- truthound_dashboard/static/assets/arc-DlYjKwIL.js +0 -1
- truthound_dashboard/static/assets/architectureDiagram-VXUJARFQ-Bb2drbQM.js +0 -36
- truthound_dashboard/static/assets/blockDiagram-VD42YOAC-BlsPG1CH.js +0 -122
- truthound_dashboard/static/assets/c4Diagram-YG6GDRKO-B9JdUoaC.js +0 -10
- truthound_dashboard/static/assets/channel-Q6mHF1Hd.js +0 -1
- truthound_dashboard/static/assets/chunk-4BX2VUAB-DmyoPVuJ.js +0 -1
- truthound_dashboard/static/assets/chunk-55IACEB6-Bcz6Siv8.js +0 -1
- truthound_dashboard/static/assets/chunk-B4BG7PRW-Br3G5Rum.js +0 -165
- truthound_dashboard/static/assets/chunk-DI55MBZ5-DuM9c23u.js +0 -220
- truthound_dashboard/static/assets/chunk-FMBD7UC4-DNU-5mvT.js +0 -15
- truthound_dashboard/static/assets/chunk-QN33PNHL-Im2yNcmS.js +0 -1
- truthound_dashboard/static/assets/chunk-QZHKN3VN-kZr8XFm1.js +0 -1
- truthound_dashboard/static/assets/chunk-TZMSLE5B-Q__360q_.js +0 -1
- truthound_dashboard/static/assets/classDiagram-2ON5EDUG-vtixxUyK.js +0 -1
- truthound_dashboard/static/assets/classDiagram-v2-WZHVMYZB-vtixxUyK.js +0 -1
- truthound_dashboard/static/assets/clone-BOt2LwD0.js +0 -1
- truthound_dashboard/static/assets/cose-bilkent-S5V4N54A-CBDw6iac.js +0 -1
- truthound_dashboard/static/assets/dagre-6UL2VRFP-XdKqmmY9.js +0 -4
- truthound_dashboard/static/assets/diagram-PSM6KHXK-DAZ8nx9V.js +0 -24
- truthound_dashboard/static/assets/diagram-QEK2KX5R-BRvDTbGD.js +0 -43
- truthound_dashboard/static/assets/diagram-S2PKOQOG-bQcczUkl.js +0 -24
- truthound_dashboard/static/assets/erDiagram-Q2GNP2WA-DPje7VMN.js +0 -60
- truthound_dashboard/static/assets/flowDiagram-NV44I4VS-B7BVtFVS.js +0 -162
- truthound_dashboard/static/assets/ganttDiagram-JELNMOA3-D6WKSS7U.js +0 -267
- truthound_dashboard/static/assets/gitGraphDiagram-NY62KEGX-D3vtVd3y.js +0 -65
- truthound_dashboard/static/assets/graph-BKgNKZVp.js +0 -1
- truthound_dashboard/static/assets/index-C6JSrkHo.css +0 -1
- truthound_dashboard/static/assets/index-DkU82VsU.js +0 -1800
- truthound_dashboard/static/assets/infoDiagram-WHAUD3N6-DnNCT429.js +0 -2
- truthound_dashboard/static/assets/journeyDiagram-XKPGCS4Q-DGiMozqS.js +0 -139
- truthound_dashboard/static/assets/kanban-definition-3W4ZIXB7-BV2gUgli.js +0 -89
- truthound_dashboard/static/assets/katex-Cu_Erd72.js +0 -261
- truthound_dashboard/static/assets/layout-DI2MfQ5G.js +0 -1
- truthound_dashboard/static/assets/min-DYdgXVcT.js +0 -1
- truthound_dashboard/static/assets/mindmap-definition-VGOIOE7T-C7x4ruxz.js +0 -68
- truthound_dashboard/static/assets/pieDiagram-ADFJNKIX-CAJaAB9f.js +0 -30
- truthound_dashboard/static/assets/quadrantDiagram-AYHSOK5B-DeqwDI46.js +0 -7
- truthound_dashboard/static/assets/requirementDiagram-UZGBJVZJ-e3XDpZIM.js +0 -64
- truthound_dashboard/static/assets/sankeyDiagram-TZEHDZUN-CNnAv5Ux.js +0 -10
- truthound_dashboard/static/assets/sequenceDiagram-WL72ISMW-Dsne-Of3.js +0 -145
- truthound_dashboard/static/assets/stateDiagram-FKZM4ZOC-Ee0sQXyb.js +0 -1
- truthound_dashboard/static/assets/stateDiagram-v2-4FDKWEC3-B26KqW_W.js +0 -1
- truthound_dashboard/static/assets/timeline-definition-IT6M3QCI-DZYi2yl3.js +0 -61
- truthound_dashboard/static/assets/treemap-KMMF4GRG-CY3f8In2.js +0 -128
- truthound_dashboard/static/assets/unmerged_dictionaries-Dd7xcPWG.js +0 -1
- truthound_dashboard/static/assets/xychartDiagram-PRI3JC2R-CS7fydZZ.js +0 -7
- truthound_dashboard-1.4.4.dist-info/METADATA +0 -507
- {truthound_dashboard-1.4.4.dist-info → truthound_dashboard-1.5.0.dist-info}/WHEEL +0 -0
- {truthound_dashboard-1.4.4.dist-info → truthound_dashboard-1.5.0.dist-info}/entry_points.txt +0 -0
- {truthound_dashboard-1.4.4.dist-info → truthound_dashboard-1.5.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,12 +1,15 @@
|
|
|
1
1
|
"""Profile-related Pydantic schemas.
|
|
2
2
|
|
|
3
3
|
This module defines schemas for data profiling API operations.
|
|
4
|
+
|
|
5
|
+
Note: truthound's th.profile() only supports (data, source) parameters.
|
|
6
|
+
Advanced options like sampling strategies, pattern detection configuration,
|
|
7
|
+
and correlation analysis are NOT supported by the underlying library.
|
|
4
8
|
"""
|
|
5
9
|
|
|
6
10
|
from __future__ import annotations
|
|
7
11
|
|
|
8
|
-
from
|
|
9
|
-
from typing import Any, Literal
|
|
12
|
+
from typing import Any
|
|
10
13
|
|
|
11
14
|
from pydantic import Field
|
|
12
15
|
|
|
@@ -14,176 +17,79 @@ from .base import BaseSchema
|
|
|
14
17
|
|
|
15
18
|
|
|
16
19
|
# =============================================================================
|
|
17
|
-
#
|
|
20
|
+
# Profile Request Schema (Simplified)
|
|
18
21
|
# =============================================================================
|
|
19
22
|
|
|
20
23
|
|
|
21
|
-
class
|
|
22
|
-
"""
|
|
24
|
+
class ProfileRequest(BaseSchema):
|
|
25
|
+
"""Request schema for basic data profiling.
|
|
23
26
|
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
- RANDOM: Random sampling (general purpose)
|
|
28
|
-
- SYSTEMATIC: Every Nth row (for ordered data)
|
|
29
|
-
- STRATIFIED: Maintain distribution across categories
|
|
30
|
-
- RESERVOIR: Streaming-friendly sampling
|
|
31
|
-
- ADAPTIVE: Auto-select based on data characteristics (default)
|
|
32
|
-
- HASH: Deterministic sampling for reproducibility
|
|
27
|
+
Note: truthound's th.profile() does not support advanced configuration.
|
|
28
|
+
This schema exists for API compatibility but options are not used.
|
|
29
|
+
For advanced profiling with configuration, use ProfileAdvancedRequest.
|
|
33
30
|
"""
|
|
34
31
|
|
|
35
|
-
|
|
36
|
-
HEAD = "head"
|
|
37
|
-
RANDOM = "random"
|
|
38
|
-
SYSTEMATIC = "systematic"
|
|
39
|
-
STRATIFIED = "stratified"
|
|
40
|
-
RESERVOIR = "reservoir"
|
|
41
|
-
ADAPTIVE = "adaptive"
|
|
42
|
-
HASH = "hash"
|
|
43
|
-
|
|
32
|
+
pass
|
|
44
33
|
|
|
45
|
-
# Literal type for API validation
|
|
46
|
-
SamplingStrategyType = Literal[
|
|
47
|
-
"none", "head", "random", "systematic", "stratified", "reservoir", "adaptive", "hash"
|
|
48
|
-
]
|
|
49
34
|
|
|
35
|
+
class ProfileAdvancedRequest(BaseSchema):
|
|
36
|
+
"""Request schema for advanced data profiling with ProfilerConfig options.
|
|
50
37
|
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
Provides fine-grained control over sampling behavior for large datasets.
|
|
38
|
+
This schema maps to truthound's ProfilerConfig for fine-grained control
|
|
39
|
+
over profiling behavior.
|
|
55
40
|
"""
|
|
56
41
|
|
|
57
|
-
strategy: SamplingStrategyType = Field(
|
|
58
|
-
default="adaptive",
|
|
59
|
-
description="Sampling strategy to use. 'adaptive' auto-selects based on data size.",
|
|
60
|
-
)
|
|
61
42
|
sample_size: int | None = Field(
|
|
62
43
|
default=None,
|
|
63
44
|
ge=100,
|
|
64
|
-
description="
|
|
45
|
+
description="Maximum rows to sample (None for all rows)",
|
|
65
46
|
)
|
|
66
|
-
|
|
67
|
-
default=
|
|
68
|
-
ge=0
|
|
69
|
-
|
|
70
|
-
description="Statistical confidence level for sample size estimation (0.80-0.99).",
|
|
71
|
-
)
|
|
72
|
-
margin_of_error: float = Field(
|
|
73
|
-
default=0.03,
|
|
74
|
-
ge=0.01,
|
|
75
|
-
le=0.10,
|
|
76
|
-
description="Acceptable margin of error for statistical estimates (0.01-0.10).",
|
|
47
|
+
random_seed: int = Field(
|
|
48
|
+
default=42,
|
|
49
|
+
ge=0,
|
|
50
|
+
description="Random seed for reproducible sampling",
|
|
77
51
|
)
|
|
78
|
-
|
|
79
|
-
default=
|
|
80
|
-
description="
|
|
52
|
+
include_patterns: bool = Field(
|
|
53
|
+
default=True,
|
|
54
|
+
description="Enable pattern detection (email, phone, uuid, etc.)",
|
|
81
55
|
)
|
|
82
|
-
|
|
83
|
-
default=
|
|
84
|
-
description="
|
|
56
|
+
include_correlations: bool = Field(
|
|
57
|
+
default=False,
|
|
58
|
+
description="Calculate column correlations (can be slow for many columns)",
|
|
85
59
|
)
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
# =============================================================================
|
|
89
|
-
# Pattern Detection Configuration
|
|
90
|
-
# =============================================================================
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
class PatternType(str, Enum):
|
|
94
|
-
"""Supported data pattern types for detection."""
|
|
95
|
-
|
|
96
|
-
EMAIL = "email"
|
|
97
|
-
PHONE = "phone"
|
|
98
|
-
UUID = "uuid"
|
|
99
|
-
URL = "url"
|
|
100
|
-
IP_ADDRESS = "ip_address"
|
|
101
|
-
CREDIT_CARD = "credit_card"
|
|
102
|
-
DATE = "date"
|
|
103
|
-
DATETIME = "datetime"
|
|
104
|
-
KOREAN_RRN = "korean_rrn"
|
|
105
|
-
KOREAN_PHONE = "korean_phone"
|
|
106
|
-
SSN = "ssn"
|
|
107
|
-
POSTAL_CODE = "postal_code"
|
|
108
|
-
CURRENCY = "currency"
|
|
109
|
-
PERCENTAGE = "percentage"
|
|
110
|
-
CUSTOM = "custom"
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
class PatternDetectionConfig(BaseSchema):
|
|
114
|
-
"""Configuration for pattern detection during profiling.
|
|
115
|
-
|
|
116
|
-
Enables automatic detection of common data patterns like
|
|
117
|
-
emails, phone numbers, UUIDs, etc.
|
|
118
|
-
"""
|
|
119
|
-
|
|
120
|
-
enabled: bool = Field(
|
|
60
|
+
include_distributions: bool = Field(
|
|
121
61
|
default=True,
|
|
122
|
-
description="
|
|
62
|
+
description="Include value distribution histograms",
|
|
123
63
|
)
|
|
124
|
-
|
|
64
|
+
top_n_values: int = Field(
|
|
65
|
+
default=10,
|
|
66
|
+
ge=1,
|
|
67
|
+
le=100,
|
|
68
|
+
description="Number of top values to return per column",
|
|
69
|
+
)
|
|
70
|
+
pattern_sample_size: int = Field(
|
|
125
71
|
default=1000,
|
|
126
72
|
ge=100,
|
|
127
|
-
le=
|
|
128
|
-
description="
|
|
73
|
+
le=10000,
|
|
74
|
+
description="Sample size for pattern detection",
|
|
75
|
+
)
|
|
76
|
+
correlation_threshold: float = Field(
|
|
77
|
+
default=0.7,
|
|
78
|
+
ge=0.0,
|
|
79
|
+
le=1.0,
|
|
80
|
+
description="Minimum correlation to report",
|
|
129
81
|
)
|
|
130
|
-
|
|
82
|
+
min_pattern_match_ratio: float = Field(
|
|
131
83
|
default=0.8,
|
|
132
84
|
ge=0.5,
|
|
133
85
|
le=1.0,
|
|
134
|
-
description="Minimum
|
|
86
|
+
description="Minimum match ratio to consider a pattern detected",
|
|
135
87
|
)
|
|
136
|
-
|
|
137
|
-
default=
|
|
138
|
-
description="Specific patterns to detect. If None, detects all supported patterns.",
|
|
139
|
-
)
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
# =============================================================================
|
|
143
|
-
# Profile Request Schema (Enhanced)
|
|
144
|
-
# =============================================================================
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
class ProfileRequest(BaseSchema):
|
|
148
|
-
"""Request schema for data profiling.
|
|
149
|
-
|
|
150
|
-
Provides comprehensive configuration for profiling operations including
|
|
151
|
-
sampling strategies, pattern detection, and statistical analysis options.
|
|
152
|
-
"""
|
|
153
|
-
|
|
154
|
-
# Basic sampling (backward compatible)
|
|
155
|
-
sample_size: int | None = Field(
|
|
156
|
-
default=None,
|
|
88
|
+
n_jobs: int = Field(
|
|
89
|
+
default=1,
|
|
157
90
|
ge=1,
|
|
158
|
-
|
|
159
|
-
"
|
|
160
|
-
examples=[10000, 50000, 100000],
|
|
161
|
-
)
|
|
162
|
-
|
|
163
|
-
# Advanced sampling configuration
|
|
164
|
-
sampling: SamplingConfig | None = Field(
|
|
165
|
-
default=None,
|
|
166
|
-
description="Advanced sampling configuration. If provided, overrides sample_size.",
|
|
167
|
-
)
|
|
168
|
-
|
|
169
|
-
# Pattern detection configuration
|
|
170
|
-
pattern_detection: PatternDetectionConfig | None = Field(
|
|
171
|
-
default=None,
|
|
172
|
-
description="Pattern detection configuration. If None, uses default settings.",
|
|
173
|
-
)
|
|
174
|
-
|
|
175
|
-
# Additional profiling options
|
|
176
|
-
include_histograms: bool = Field(
|
|
177
|
-
default=True,
|
|
178
|
-
description="Include value distribution histograms in the profile.",
|
|
179
|
-
)
|
|
180
|
-
include_correlations: bool = Field(
|
|
181
|
-
default=False,
|
|
182
|
-
description="Include column correlation analysis (increases processing time).",
|
|
183
|
-
)
|
|
184
|
-
include_cardinality: bool = Field(
|
|
185
|
-
default=True,
|
|
186
|
-
description="Include cardinality estimates for high-cardinality columns.",
|
|
91
|
+
le=16,
|
|
92
|
+
description="Number of parallel jobs for profiling",
|
|
187
93
|
)
|
|
188
94
|
|
|
189
95
|
|
|
@@ -231,21 +137,21 @@ class HistogramBucket(BaseSchema):
|
|
|
231
137
|
|
|
232
138
|
|
|
233
139
|
# =============================================================================
|
|
234
|
-
# Column Profile Schema
|
|
140
|
+
# Column Profile Schema
|
|
235
141
|
# =============================================================================
|
|
236
142
|
|
|
237
143
|
|
|
238
144
|
class ColumnProfile(BaseSchema):
|
|
239
145
|
"""Profile information for a single column.
|
|
240
146
|
|
|
241
|
-
Includes basic statistics
|
|
147
|
+
Includes basic statistics and distribution data.
|
|
242
148
|
"""
|
|
243
149
|
|
|
244
150
|
# Basic identification
|
|
245
151
|
name: str = Field(..., description="Column name")
|
|
246
152
|
dtype: str = Field(..., description="Physical data type (string, int64, float64, etc.)")
|
|
247
153
|
|
|
248
|
-
# Inferred semantic type
|
|
154
|
+
# Inferred semantic type
|
|
249
155
|
inferred_type: str | None = Field(
|
|
250
156
|
default=None,
|
|
251
157
|
description="Inferred semantic type based on pattern detection "
|
|
@@ -285,7 +191,7 @@ class ColumnProfile(BaseSchema):
|
|
|
285
191
|
max_length: int | None = Field(default=None, description="Maximum string length")
|
|
286
192
|
avg_length: float | None = Field(default=None, description="Average string length")
|
|
287
193
|
|
|
288
|
-
# Pattern detection results
|
|
194
|
+
# Pattern detection results
|
|
289
195
|
patterns: list[DetectedPattern] | None = Field(
|
|
290
196
|
default=None,
|
|
291
197
|
description="Detected data patterns (email, phone, uuid, etc.)",
|
|
@@ -313,33 +219,12 @@ class ColumnProfile(BaseSchema):
|
|
|
313
219
|
|
|
314
220
|
|
|
315
221
|
# =============================================================================
|
|
316
|
-
#
|
|
317
|
-
# =============================================================================
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
class SamplingMetadata(BaseSchema):
|
|
321
|
-
"""Metadata about sampling used during profiling."""
|
|
322
|
-
|
|
323
|
-
strategy_used: str = Field(..., description="Sampling strategy that was applied")
|
|
324
|
-
sample_size: int = Field(..., description="Actual sample size used")
|
|
325
|
-
total_rows: int = Field(..., description="Total rows in the dataset")
|
|
326
|
-
sampling_ratio: float = Field(..., description="Ratio of sampled to total rows")
|
|
327
|
-
seed: int | None = Field(default=None, description="Random seed used (if applicable)")
|
|
328
|
-
confidence_level: float | None = Field(
|
|
329
|
-
default=None, description="Confidence level achieved"
|
|
330
|
-
)
|
|
331
|
-
margin_of_error: float | None = Field(
|
|
332
|
-
default=None, description="Estimated margin of error"
|
|
333
|
-
)
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
# =============================================================================
|
|
337
|
-
# Profile Response Schema (Enhanced)
|
|
222
|
+
# Profile Response Schema
|
|
338
223
|
# =============================================================================
|
|
339
224
|
|
|
340
225
|
|
|
341
226
|
class ProfileResponse(BaseSchema):
|
|
342
|
-
"""Data profiling response with
|
|
227
|
+
"""Data profiling response with statistics."""
|
|
343
228
|
|
|
344
229
|
source: str = Field(..., description="Source path/identifier")
|
|
345
230
|
row_count: int = Field(..., ge=0, description="Total number of rows")
|
|
@@ -350,19 +235,13 @@ class ProfileResponse(BaseSchema):
|
|
|
350
235
|
description="Profile for each column",
|
|
351
236
|
)
|
|
352
237
|
|
|
353
|
-
#
|
|
354
|
-
sampling: SamplingMetadata | None = Field(
|
|
355
|
-
default=None,
|
|
356
|
-
description="Information about sampling applied during profiling",
|
|
357
|
-
)
|
|
358
|
-
|
|
359
|
-
# Pattern detection summary (NEW)
|
|
238
|
+
# Pattern detection summary
|
|
360
239
|
detected_patterns_summary: dict[str, int] | None = Field(
|
|
361
240
|
default=None,
|
|
362
241
|
description="Summary of detected patterns across all columns {pattern_type: count}",
|
|
363
242
|
)
|
|
364
243
|
|
|
365
|
-
# Profiling metadata
|
|
244
|
+
# Profiling metadata
|
|
366
245
|
profiled_at: str | None = Field(
|
|
367
246
|
default=None,
|
|
368
247
|
description="ISO timestamp when profiling was performed",
|
|
@@ -384,67 +263,125 @@ class ProfileResponse(BaseSchema):
|
|
|
384
263
|
return f"{size:.1f} PB"
|
|
385
264
|
|
|
386
265
|
@classmethod
|
|
387
|
-
def _build_column_profile(cls, col: dict[str, Any]) -> ColumnProfile:
|
|
388
|
-
"""Build a ColumnProfile from column data dict.
|
|
266
|
+
def _build_column_profile(cls, col: dict[str, Any] | Any) -> ColumnProfile:
|
|
267
|
+
"""Build a ColumnProfile from column data dict or ColumnProfileResult object.
|
|
389
268
|
|
|
390
269
|
Args:
|
|
391
|
-
col: Column data dictionary from adapter or database.
|
|
270
|
+
col: Column data dictionary from adapter or database, or ColumnProfileResult object.
|
|
392
271
|
|
|
393
272
|
Returns:
|
|
394
273
|
ColumnProfile instance with all available fields.
|
|
395
274
|
"""
|
|
275
|
+
# Helper function to get attribute from dict or object
|
|
276
|
+
def get_val(key: str, default: Any = None) -> Any:
|
|
277
|
+
if isinstance(col, dict):
|
|
278
|
+
return col.get(key, default)
|
|
279
|
+
return getattr(col, key, default)
|
|
280
|
+
|
|
396
281
|
# Build patterns list if present
|
|
397
282
|
patterns = None
|
|
398
|
-
|
|
283
|
+
patterns_data = get_val("patterns") or get_val("detected_patterns")
|
|
284
|
+
if patterns_data:
|
|
399
285
|
patterns = [
|
|
400
286
|
DetectedPattern(
|
|
401
|
-
pattern_type=p.get("pattern_type", p.get("type", "unknown"))
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
287
|
+
pattern_type=p.get("pattern_type", p.get("type", p.get("pattern", "unknown")))
|
|
288
|
+
if isinstance(p, dict)
|
|
289
|
+
else getattr(p, "pattern_type", getattr(p, "pattern", "unknown")),
|
|
290
|
+
confidence=p.get("confidence", 0.0)
|
|
291
|
+
if isinstance(p, dict)
|
|
292
|
+
else getattr(p, "confidence", getattr(p, "match_ratio", 0.0)),
|
|
293
|
+
match_count=p.get("match_count", 0) if isinstance(p, dict) else getattr(p, "match_count", 0),
|
|
294
|
+
match_percentage=p.get("match_percentage", 0.0)
|
|
295
|
+
if isinstance(p, dict)
|
|
296
|
+
else getattr(p, "match_percentage", getattr(p, "match_ratio", 0.0) * 100),
|
|
297
|
+
sample_matches=p.get("sample_matches") if isinstance(p, dict) else getattr(p, "sample_matches", None),
|
|
406
298
|
)
|
|
407
|
-
for p in
|
|
299
|
+
for p in patterns_data
|
|
408
300
|
]
|
|
409
301
|
|
|
410
302
|
# Build histogram if present
|
|
411
303
|
histogram = None
|
|
412
|
-
|
|
304
|
+
histogram_data = get_val("histogram")
|
|
305
|
+
if histogram_data:
|
|
413
306
|
histogram = [
|
|
414
307
|
HistogramBucket(
|
|
415
|
-
bucket=h.get("bucket", ""),
|
|
416
|
-
count=h.get("count", 0),
|
|
417
|
-
percentage=h.get("percentage", 0.0),
|
|
308
|
+
bucket=h.get("bucket", "") if isinstance(h, dict) else getattr(h, "bucket", ""),
|
|
309
|
+
count=h.get("count", 0) if isinstance(h, dict) else getattr(h, "count", 0),
|
|
310
|
+
percentage=h.get("percentage", 0.0) if isinstance(h, dict) else getattr(h, "percentage", 0.0),
|
|
418
311
|
)
|
|
419
|
-
for h in
|
|
312
|
+
for h in histogram_data
|
|
420
313
|
]
|
|
421
314
|
|
|
315
|
+
# Get dtype from dict or object (physical_type for ColumnProfileResult)
|
|
316
|
+
dtype = get_val("dtype") or get_val("physical_type") or "unknown"
|
|
317
|
+
|
|
318
|
+
# Get null_pct - format from ratio if needed
|
|
319
|
+
null_pct = get_val("null_pct", "0%")
|
|
320
|
+
if null_pct == "0%" and get_val("null_ratio") is not None:
|
|
321
|
+
null_ratio = get_val("null_ratio", 0.0)
|
|
322
|
+
null_pct = f"{null_ratio * 100:.1f}%"
|
|
323
|
+
|
|
324
|
+
# Get unique_pct - format from ratio if needed
|
|
325
|
+
unique_pct = get_val("unique_pct", "0%")
|
|
326
|
+
if unique_pct == "0%" and get_val("unique_ratio") is not None:
|
|
327
|
+
unique_ratio = get_val("unique_ratio", 0.0)
|
|
328
|
+
unique_pct = f"{unique_ratio * 100:.1f}%"
|
|
329
|
+
|
|
330
|
+
# Get distribution stats
|
|
331
|
+
distribution = get_val("distribution")
|
|
332
|
+
mean = get_val("mean")
|
|
333
|
+
std = get_val("std")
|
|
334
|
+
median = get_val("median")
|
|
335
|
+
q1 = get_val("q1")
|
|
336
|
+
q3 = get_val("q3")
|
|
337
|
+
skewness = get_val("skewness")
|
|
338
|
+
kurtosis = get_val("kurtosis")
|
|
339
|
+
min_val = get_val("min")
|
|
340
|
+
max_val = get_val("max")
|
|
341
|
+
|
|
342
|
+
# Extract from distribution dict if present
|
|
343
|
+
if distribution and isinstance(distribution, dict):
|
|
344
|
+
mean = mean or distribution.get("mean")
|
|
345
|
+
std = std or distribution.get("std")
|
|
346
|
+
median = median or distribution.get("median")
|
|
347
|
+
q1 = q1 or distribution.get("q1")
|
|
348
|
+
q3 = q3 or distribution.get("q3")
|
|
349
|
+
skewness = skewness or distribution.get("skewness")
|
|
350
|
+
kurtosis = kurtosis or distribution.get("kurtosis")
|
|
351
|
+
min_val = min_val or distribution.get("min")
|
|
352
|
+
max_val = max_val or distribution.get("max")
|
|
353
|
+
|
|
354
|
+
# Get most_common from top_values if needed
|
|
355
|
+
most_common = get_val("most_common")
|
|
356
|
+
if not most_common and get_val("top_values"):
|
|
357
|
+
most_common = get_val("top_values")
|
|
358
|
+
|
|
422
359
|
return ColumnProfile(
|
|
423
|
-
name=
|
|
424
|
-
dtype=
|
|
425
|
-
inferred_type=
|
|
426
|
-
null_pct=
|
|
427
|
-
null_count=
|
|
428
|
-
unique_pct=
|
|
429
|
-
distinct_count=
|
|
430
|
-
is_unique=
|
|
431
|
-
min=
|
|
432
|
-
max=
|
|
433
|
-
mean=
|
|
434
|
-
std=
|
|
435
|
-
median=
|
|
436
|
-
q1=
|
|
437
|
-
q3=
|
|
438
|
-
skewness=
|
|
439
|
-
kurtosis=
|
|
440
|
-
min_length=
|
|
441
|
-
max_length=
|
|
442
|
-
avg_length=
|
|
360
|
+
name=get_val("name"),
|
|
361
|
+
dtype=dtype,
|
|
362
|
+
inferred_type=get_val("inferred_type"),
|
|
363
|
+
null_pct=null_pct,
|
|
364
|
+
null_count=get_val("null_count"),
|
|
365
|
+
unique_pct=unique_pct,
|
|
366
|
+
distinct_count=get_val("distinct_count"),
|
|
367
|
+
is_unique=get_val("is_unique"),
|
|
368
|
+
min=min_val,
|
|
369
|
+
max=max_val,
|
|
370
|
+
mean=mean,
|
|
371
|
+
std=std,
|
|
372
|
+
median=median,
|
|
373
|
+
q1=q1,
|
|
374
|
+
q3=q3,
|
|
375
|
+
skewness=skewness,
|
|
376
|
+
kurtosis=kurtosis,
|
|
377
|
+
min_length=get_val("min_length"),
|
|
378
|
+
max_length=get_val("max_length"),
|
|
379
|
+
avg_length=get_val("avg_length"),
|
|
443
380
|
patterns=patterns,
|
|
444
|
-
primary_pattern=
|
|
445
|
-
most_common=
|
|
381
|
+
primary_pattern=get_val("primary_pattern"),
|
|
382
|
+
most_common=most_common,
|
|
446
383
|
histogram=histogram,
|
|
447
|
-
cardinality_estimate=
|
|
384
|
+
cardinality_estimate=get_val("cardinality_estimate"),
|
|
448
385
|
)
|
|
449
386
|
|
|
450
387
|
@classmethod
|
|
@@ -464,27 +401,12 @@ class ProfileResponse(BaseSchema):
|
|
|
464
401
|
columns_data = profile_json.get("columns", [])
|
|
465
402
|
columns = [cls._build_column_profile(col) for col in columns_data]
|
|
466
403
|
|
|
467
|
-
# Build sampling metadata if present
|
|
468
|
-
sampling = None
|
|
469
|
-
if profile_json.get("sampling"):
|
|
470
|
-
s = profile_json["sampling"]
|
|
471
|
-
sampling = SamplingMetadata(
|
|
472
|
-
strategy_used=s.get("strategy_used", "none"),
|
|
473
|
-
sample_size=s.get("sample_size", result.row_count or 0),
|
|
474
|
-
total_rows=s.get("total_rows", result.row_count or 0),
|
|
475
|
-
sampling_ratio=s.get("sampling_ratio", 1.0),
|
|
476
|
-
seed=s.get("seed"),
|
|
477
|
-
confidence_level=s.get("confidence_level"),
|
|
478
|
-
margin_of_error=s.get("margin_of_error"),
|
|
479
|
-
)
|
|
480
|
-
|
|
481
404
|
return cls(
|
|
482
405
|
source=source_name,
|
|
483
406
|
row_count=result.row_count or 0,
|
|
484
407
|
column_count=result.column_count or 0,
|
|
485
408
|
size_bytes=result.size_bytes or 0,
|
|
486
409
|
columns=columns,
|
|
487
|
-
sampling=sampling,
|
|
488
410
|
detected_patterns_summary=profile_json.get("detected_patterns_summary"),
|
|
489
411
|
profiled_at=profile_json.get("profiled_at"),
|
|
490
412
|
profiling_duration_ms=profile_json.get("profiling_duration_ms"),
|
|
@@ -493,27 +415,12 @@ class ProfileResponse(BaseSchema):
|
|
|
493
415
|
# Handle ProfileResult (from adapter)
|
|
494
416
|
columns = [cls._build_column_profile(col) for col in result.columns]
|
|
495
417
|
|
|
496
|
-
# Build sampling metadata if present
|
|
497
|
-
sampling = None
|
|
498
|
-
if hasattr(result, "sampling") and result.sampling:
|
|
499
|
-
s = result.sampling
|
|
500
|
-
sampling = SamplingMetadata(
|
|
501
|
-
strategy_used=getattr(s, "strategy_used", "none"),
|
|
502
|
-
sample_size=getattr(s, "sample_size", result.row_count),
|
|
503
|
-
total_rows=getattr(s, "total_rows", result.row_count),
|
|
504
|
-
sampling_ratio=getattr(s, "sampling_ratio", 1.0),
|
|
505
|
-
seed=getattr(s, "seed", None),
|
|
506
|
-
confidence_level=getattr(s, "confidence_level", None),
|
|
507
|
-
margin_of_error=getattr(s, "margin_of_error", None),
|
|
508
|
-
)
|
|
509
|
-
|
|
510
418
|
return cls(
|
|
511
419
|
source=result.source,
|
|
512
420
|
row_count=result.row_count,
|
|
513
421
|
column_count=result.column_count,
|
|
514
422
|
size_bytes=result.size_bytes,
|
|
515
423
|
columns=columns,
|
|
516
|
-
sampling=sampling,
|
|
517
424
|
detected_patterns_summary=getattr(result, "detected_patterns_summary", None),
|
|
518
425
|
profiled_at=getattr(result, "profiled_at", None),
|
|
519
426
|
profiling_duration_ms=getattr(result, "profiling_duration_ms", None),
|