pvw-cli 1.2.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pvw-cli might be problematic. Click here for more details.
- purviewcli/__init__.py +27 -0
- purviewcli/__main__.py +15 -0
- purviewcli/cli/__init__.py +5 -0
- purviewcli/cli/account.py +199 -0
- purviewcli/cli/cli.py +170 -0
- purviewcli/cli/collections.py +502 -0
- purviewcli/cli/domain.py +361 -0
- purviewcli/cli/entity.py +2436 -0
- purviewcli/cli/glossary.py +533 -0
- purviewcli/cli/health.py +250 -0
- purviewcli/cli/insight.py +113 -0
- purviewcli/cli/lineage.py +1103 -0
- purviewcli/cli/management.py +141 -0
- purviewcli/cli/policystore.py +103 -0
- purviewcli/cli/relationship.py +75 -0
- purviewcli/cli/scan.py +357 -0
- purviewcli/cli/search.py +527 -0
- purviewcli/cli/share.py +478 -0
- purviewcli/cli/types.py +831 -0
- purviewcli/cli/unified_catalog.py +3540 -0
- purviewcli/cli/workflow.py +402 -0
- purviewcli/client/__init__.py +21 -0
- purviewcli/client/_account.py +1877 -0
- purviewcli/client/_collections.py +1761 -0
- purviewcli/client/_domain.py +414 -0
- purviewcli/client/_entity.py +3545 -0
- purviewcli/client/_glossary.py +3233 -0
- purviewcli/client/_health.py +501 -0
- purviewcli/client/_insight.py +2873 -0
- purviewcli/client/_lineage.py +2138 -0
- purviewcli/client/_management.py +2202 -0
- purviewcli/client/_policystore.py +2915 -0
- purviewcli/client/_relationship.py +1351 -0
- purviewcli/client/_scan.py +2607 -0
- purviewcli/client/_search.py +1472 -0
- purviewcli/client/_share.py +272 -0
- purviewcli/client/_types.py +2708 -0
- purviewcli/client/_unified_catalog.py +5112 -0
- purviewcli/client/_workflow.py +2734 -0
- purviewcli/client/api_client.py +1295 -0
- purviewcli/client/business_rules.py +675 -0
- purviewcli/client/config.py +231 -0
- purviewcli/client/data_quality.py +433 -0
- purviewcli/client/endpoint.py +123 -0
- purviewcli/client/endpoints.py +554 -0
- purviewcli/client/exceptions.py +38 -0
- purviewcli/client/lineage_visualization.py +797 -0
- purviewcli/client/monitoring_dashboard.py +712 -0
- purviewcli/client/rate_limiter.py +30 -0
- purviewcli/client/retry_handler.py +125 -0
- purviewcli/client/scanning_operations.py +523 -0
- purviewcli/client/settings.py +1 -0
- purviewcli/client/sync_client.py +250 -0
- purviewcli/plugins/__init__.py +1 -0
- purviewcli/plugins/plugin_system.py +709 -0
- pvw_cli-1.2.8.dist-info/METADATA +1618 -0
- pvw_cli-1.2.8.dist-info/RECORD +60 -0
- pvw_cli-1.2.8.dist-info/WHEEL +5 -0
- pvw_cli-1.2.8.dist-info/entry_points.txt +3 -0
- pvw_cli-1.2.8.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,231 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Configuration Management for Purview CLI
|
|
3
|
+
Handles environment configuration, profiles, and settings
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import os
|
|
7
|
+
import json
|
|
8
|
+
import yaml
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Dict, Any, Optional
|
|
11
|
+
from dataclasses import dataclass, asdict
|
|
12
|
+
from pydantic import BaseModel
|
|
13
|
+
import logging
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
@dataclass
class PurviewProfile:
    """Connection settings for one Purview account.

    ``name`` and ``account_name`` are required; every other field falls back
    to the default declared below.
    """

    name: str  # key under which the profile is stored
    account_name: str  # Purview account the profile points at
    tenant_id: Optional[str] = None
    client_id: Optional[str] = None
    azure_region: Optional[str] = None
    batch_size: int = 100
    max_retries: int = 3
    timeout: int = 30
    default_collection: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return the profile as a plain dict keyed by field name."""
        return asdict(self)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'PurviewProfile':
        """Build a profile from a mapping, ignoring keys that are not fields.

        Unknown keys are dropped instead of being forwarded to the
        constructor, so a mapping that carries extra entries (for example a
        profiles file written by a different version) still loads.

        Raises:
            TypeError: if ``data`` is not a mapping, or a required field
                (``name``, ``account_name``) is absent.
        """
        if not hasattr(data, 'keys'):
            raise TypeError(
                f"profile data must be a mapping, got {type(data).__name__}"
            )
        # __dataclass_fields__ is populated by @dataclass with every field name.
        known_fields = cls.__dataclass_fields__
        accepted = {key: data[key] for key in data.keys() if key in known_fields}
        return cls(**accepted)
|
|
36
|
+
|
|
37
|
+
class ConfigManager:
    """Manages CLI configuration and connection profiles stored as YAML.

    Two files live in the configuration directory: ``config.yaml`` (general
    settings) and ``profiles.yaml`` (one mapping per named profile). Both are
    read once when the manager is constructed and rewritten by the setters.
    """

    def __init__(self, config_dir: Optional[str] = None):
        """Load configuration from ``config_dir`` or the per-user default.

        The directory is created (including parents) when it does not exist.
        """
        self.config_dir = Path(config_dir) if config_dir else self._get_default_config_dir()
        self.config_dir.mkdir(parents=True, exist_ok=True)

        self.config_file = self.config_dir / 'config.yaml'
        self.profiles_file = self.config_dir / 'profiles.yaml'

        self._config = self._load_config()
        self._profiles = self._load_profiles()

    def _get_default_config_dir(self) -> Path:
        """Return the per-user default configuration directory."""
        if os.name == 'nt':  # Windows
            return Path.home() / 'AppData' / 'Local' / 'purviewcli'
        # Unix-like
        return Path.home() / '.config' / 'purviewcli'

    def _load_config(self) -> Dict[str, Any]:
        """Load ``config.yaml``; fall back to built-in defaults on any problem.

        A file whose top level is not a mapping is treated like an unreadable
        file, because every accessor relies on ``dict`` methods.
        """
        if self.config_file.exists():
            try:
                with open(self.config_file, 'r', encoding='utf-8') as f:
                    loaded = yaml.safe_load(f) or {}
                if isinstance(loaded, dict):
                    return loaded
                logger.warning(
                    "Failed to load config: expected a mapping, got %s",
                    type(loaded).__name__,
                )
            except Exception as e:
                logger.warning(f"Failed to load config: {e}")

        return {
            'default_profile': None,
            'debug': False,
            'output_format': 'json',
            'auto_update_check': True
        }

    def _load_profiles(self) -> Dict[str, PurviewProfile]:
        """Load ``profiles.yaml`` into a name -> profile mapping.

        Each entry is parsed independently: an invalid entry is logged and
        skipped so that it cannot hide the valid profiles around it.
        """
        profiles: Dict[str, PurviewProfile] = {}

        if not self.profiles_file.exists():
            return profiles

        try:
            with open(self.profiles_file, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f) or {}
        except Exception as e:
            logger.warning(f"Failed to load profiles: {e}")
            return profiles

        if not isinstance(data, dict):
            logger.warning(
                "Failed to load profiles: expected a mapping, got %s",
                type(data).__name__,
            )
            return profiles

        for name, profile_data in data.items():
            try:
                profiles[name] = PurviewProfile.from_dict(profile_data)
            except Exception as e:
                logger.warning("Skipping invalid profile '%s': %s", name, e)

        return profiles

    def save_config(self):
        """Write the in-memory configuration to ``config.yaml``.

        Failures are logged, not raised.
        """
        try:
            with open(self.config_file, 'w', encoding='utf-8') as f:
                yaml.dump(self._config, f, default_flow_style=False)
        except Exception as e:
            logger.error(f"Failed to save config: {e}")

    def save_profiles(self):
        """Write all profiles to ``profiles.yaml``.

        Failures are logged, not raised.
        """
        try:
            profiles_data = {name: profile.to_dict() for name, profile in self._profiles.items()}
            with open(self.profiles_file, 'w', encoding='utf-8') as f:
                yaml.dump(profiles_data, f, default_flow_style=False)
        except Exception as e:
            logger.error(f"Failed to save profiles: {e}")

    def add_profile(self, profile: PurviewProfile) -> bool:
        """Add or replace a profile and persist the profile list.

        The profile becomes the default when it is the only one stored.

        Returns:
            True on success, False if an exception was raised along the way.
        """
        try:
            self._profiles[profile.name] = profile
            self.save_profiles()

            # Set as default if it's the first profile
            if len(self._profiles) == 1:
                self.set_default_profile(profile.name)

            return True
        except Exception as e:
            logger.error(f"Failed to add profile: {e}")
            return False

    def remove_profile(self, name: str) -> bool:
        """Remove the named profile.

        Also clears ``default_profile`` when it pointed at the removed entry.

        Returns:
            True if the profile existed and was removed, False otherwise.
        """
        if name not in self._profiles:
            return False

        del self._profiles[name]
        self.save_profiles()

        # Clear default if this was the default profile
        if self._config.get('default_profile') == name:
            self._config['default_profile'] = None
            self.save_config()

        return True

    def get_profile(self, name: Optional[str] = None) -> Optional[PurviewProfile]:
        """Return the named profile, or the default profile when ``name`` is None.

        Returns None when no matching profile exists.
        """
        if name is None:
            name = self._config.get('default_profile')

        if name and name in self._profiles:
            return self._profiles[name]

        return None

    def list_profiles(self) -> Dict[str, PurviewProfile]:
        """Return a shallow copy of the name -> profile mapping."""
        return self._profiles.copy()

    def set_default_profile(self, name: str) -> bool:
        """Make ``name`` the default profile and persist the setting.

        Returns:
            True if the profile exists, False otherwise (nothing is changed).
        """
        if name not in self._profiles:
            return False

        self._config['default_profile'] = name
        self.save_config()
        return True

    def get_config(self, key: str, default=None):
        """Return the configuration value for ``key``, or ``default``."""
        return self._config.get(key, default)

    def set_config(self, key: str, value: Any):
        """Set a configuration value and persist the configuration."""
        self._config[key] = value
        self.save_config()

    def resolve_account_name(self, account_name: Optional[str] = None, profile_name: Optional[str] = None) -> Optional[str]:
        """Resolve the Purview account name.

        Precedence: explicit ``account_name`` argument, then the named (or
        default) profile, then the ``PURVIEW_ACCOUNT_NAME`` environment
        variable. Returns None when none of them yields a value.
        """
        # 1. Explicit parameter
        if account_name:
            return account_name

        # 2. Profile
        profile = self.get_profile(profile_name)
        if profile:
            return profile.account_name

        # 3. Environment variable
        env_account = os.environ.get('PURVIEW_ACCOUNT_NAME')
        if env_account:
            return env_account

        return None

    @staticmethod
    def _env_int(var_name: str, default: int) -> int:
        """Read an integer environment variable, falling back to ``default``.

        An unset, blank, or non-integer value yields ``default``; the
        non-integer case is logged so the misconfiguration is visible.
        """
        raw = os.environ.get(var_name)
        if raw is None or not raw.strip():
            return default
        try:
            return int(raw)
        except ValueError:
            logger.warning(
                "Ignoring non-integer value %r for %s; using %d",
                raw, var_name, default,
            )
            return default

    def create_profile_from_env(self, name: str = 'default') -> Optional[PurviewProfile]:
        """Build (but do not store) a profile from environment variables.

        Returns None when ``PURVIEW_ACCOUNT_NAME`` is unset or empty.
        """
        account_name = os.environ.get('PURVIEW_ACCOUNT_NAME')
        if not account_name:
            return None

        return PurviewProfile(
            name=name,
            account_name=account_name,
            tenant_id=os.environ.get('AZURE_TENANT_ID'),
            client_id=os.environ.get('AZURE_CLIENT_ID'),
            azure_region=os.environ.get('AZURE_REGION'),
            batch_size=self._env_int('PURVIEW_BATCH_SIZE', 100),
            max_retries=self._env_int('PURVIEW_MAX_RETRIES', 3),
            timeout=self._env_int('PURVIEW_TIMEOUT', 30)
        )
|
|
202
|
+
|
|
203
|
+
class EnvironmentHelper:
    """Static helpers that move settings between a profile and ``os.environ``."""

    @staticmethod
    def setup_environment(profile: PurviewProfile):
        """Export a profile's settings as process environment variables.

        ``PURVIEW_ACCOUNT_NAME`` is always written. Each Azure variable is
        written only when the matching profile field is truthy; otherwise
        whatever is already in the environment is left as it is.
        """
        os.environ['PURVIEW_ACCOUNT_NAME'] = profile.account_name

        optional_exports = (
            ('AZURE_TENANT_ID', 'tenant_id'),
            ('AZURE_CLIENT_ID', 'client_id'),
            ('AZURE_REGION', 'azure_region'),
        )
        for env_name, field_name in optional_exports:
            setting = getattr(profile, field_name)
            if setting:
                os.environ[env_name] = setting

    @staticmethod
    def get_auth_info() -> Dict[str, str]:
        """Report the authentication-related environment variables.

        Unset variables are reported as ``'Not set'``, except the region,
        which is reported as ``'public'``.
        """
        lookups = (
            ('tenant_id', 'AZURE_TENANT_ID', 'Not set'),
            ('client_id', 'AZURE_CLIENT_ID', 'Not set'),
            ('region', 'AZURE_REGION', 'public'),
            ('purview_account', 'PURVIEW_ACCOUNT_NAME', 'Not set'),
        )
        return {
            label: os.environ.get(env_name, fallback)
            for label, env_name, fallback in lookups
        }
|
|
229
|
+
|
|
230
|
+
# Global config manager instance
|
|
231
|
+
# Import-time side effect: constructing ConfigManager creates the
# configuration directory if it is missing and reads config.yaml and
# profiles.yaml from it.
config_manager = ConfigManager()
|
|
@@ -0,0 +1,433 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Data Quality and Validation Module
|
|
3
|
+
Provides data quality checks and validation for Purview operations
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import pandas as pd
|
|
7
|
+
import re
|
|
8
|
+
from typing import Any, Callable, Dict, List, Optional, Tuple
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
from enum import Enum
|
|
11
|
+
import logging
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
class ValidationSeverity(Enum):
    """Severity level attached to validation rules and their findings."""

    ERROR = "error"
    WARNING = "warning"
    INFO = "info"
|
|
19
|
+
|
|
20
|
+
@dataclass
class ValidationRule:
    """Definition of one data validation rule.

    A rule combines a name and severity with any number of optional checks.
    Checks left at their default are not applied.
    """

    name: str  # identifier used to look the rule up
    description: str  # human-readable summary of what the rule enforces
    severity: ValidationSeverity  # severity reported when the rule fails
    # NOTE(review): `column` is not read by the validator code in this module;
    # callers pass the column name separately - confirm intended use.
    column: Optional[str] = None
    pattern: Optional[str] = None  # regular expression the value must match
    min_length: Optional[int] = None  # lower bound on len(str(value))
    max_length: Optional[int] = None  # upper bound on len(str(value))
    required: bool = False  # when True, a missing value is reported
    allowed_values: Optional[List[str]] = None  # accepted str(value) results
    # Called with the raw value; a falsy return marks the value as invalid.
    custom_validator: Optional[Callable[[Any], bool]] = None
|
|
33
|
+
|
|
34
|
+
@dataclass
class ValidationResult:
    """One finding produced by a validation check."""

    rule_name: str  # name of the rule that produced the finding
    severity: ValidationSeverity  # severity of the finding
    message: str  # human-readable explanation
    row_index: Optional[int] = None  # row label, when the finding is per-row
    column: Optional[str] = None  # column or attribute name, when known
    value: Any = None  # the offending value, when there is one
|
|
43
|
+
|
|
44
|
+
class DataQualityValidator:
    """Validates data quality for Purview operations.

    The validator keeps a list of named ``ValidationRule`` objects. Rules are
    looked up by name and applied to DataFrame columns or entity attributes;
    every failed check is returned as a ``ValidationResult``.
    """

    def __init__(self):
        """Create a validator preloaded with the default rules."""
        self.rules = []
        self._setup_default_rules()

    def _setup_default_rules(self):
        """Register the built-in rules: GUID, email, qualified name, name length."""
        # GUID validation
        guid_pattern = r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$'
        self.add_rule(ValidationRule(
            name="valid_guid",
            description="GUID format validation",
            severity=ValidationSeverity.ERROR,
            pattern=guid_pattern
        ))

        # Email validation
        email_pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
        self.add_rule(ValidationRule(
            name="valid_email",
            description="Email format validation",
            severity=ValidationSeverity.WARNING,
            pattern=email_pattern
        ))

        # Qualified name validation
        self.add_rule(ValidationRule(
            name="qualified_name_format",
            description="Qualified name should contain @ symbol",
            severity=ValidationSeverity.ERROR,
            custom_validator=lambda x: '@' in str(x) if x else False
        ))

        # Name length validation
        self.add_rule(ValidationRule(
            name="name_length",
            description="Name should be between 1 and 100 characters",
            severity=ValidationSeverity.ERROR,
            min_length=1,
            max_length=100
        ))

    def add_rule(self, rule: ValidationRule):
        """Append a rule to the rule list."""
        self.rules.append(rule)

    def remove_rule(self, rule_name: str):
        """Remove every rule whose name equals ``rule_name``."""
        self.rules = [rule for rule in self.rules if rule.name != rule_name]

    def validate_dataframe(self, df: pd.DataFrame, column_rules: Optional[Dict[str, List[str]]] = None) -> List[ValidationResult]:
        """Validate a DataFrame.

        Structural checks (empty frame, duplicate rows) always run. When
        ``column_rules`` is given, it maps a column name to the names of the
        rules to apply to every value of that column. Columns absent from
        ``df`` are skipped; unknown rule names are logged and skipped.
        """
        results = list(self._validate_structure(df))

        for column, rule_names in (column_rules or {}).items():
            if column not in df.columns:
                continue
            for rule_name in rule_names:
                rule = self._get_rule(rule_name)
                if rule is None:
                    # A misspelled rule name would otherwise let data pass unchecked.
                    logger.warning(
                        "Unknown validation rule '%s' requested for column '%s'",
                        rule_name, column,
                    )
                    continue
                results.extend(self._validate_column(df, column, rule))

        return results

    def validate_entity_data(self, entity_data: Dict[str, Any]) -> List[ValidationResult]:
        """Validate an entity dict.

        Reports a missing ``typeName`` key and validates the ``attributes``
        mapping when it is present and non-empty.
        """
        results = []

        # Check required fields
        required_fields = ['typeName']
        for field in required_fields:
            if field not in entity_data:
                results.append(ValidationResult(
                    rule_name="required_field",
                    severity=ValidationSeverity.ERROR,
                    message=f"Required field '{field}' is missing",
                    column=field
                ))

        # Validate attributes
        attributes = entity_data.get('attributes', {})
        if attributes:
            results.extend(self._validate_entity_attributes(attributes))

        return results

    def _validate_structure(self, df: pd.DataFrame) -> List[ValidationResult]:
        """Run the frame-level checks: empty DataFrame and duplicate rows."""
        results = []

        # Check for empty DataFrame
        if df.empty:
            results.append(ValidationResult(
                rule_name="empty_dataframe",
                severity=ValidationSeverity.ERROR,
                message="DataFrame is empty"
            ))

        # Check for duplicate rows
        duplicates = df.duplicated()
        if duplicates.any():
            duplicate_indices = df[duplicates].index.tolist()
            results.append(ValidationResult(
                rule_name="duplicate_rows",
                severity=ValidationSeverity.WARNING,
                message=f"Found {len(duplicate_indices)} duplicate rows at indices: {duplicate_indices}"
            ))

        return results

    def _validate_column(self, df: pd.DataFrame, column: str, rule: ValidationRule) -> List[ValidationResult]:
        """Apply ``rule`` to every value of ``column``; return the failures."""
        results = []
        for index, value in df[column].items():
            result = self._validate_value(value, rule, index, column)
            if result is not None:
                results.append(result)
        return results

    def _validate_value(self, value: Any, rule: ValidationRule, row_index: Optional[int] = None, column: Optional[str] = None) -> Optional[ValidationResult]:
        """Check one value against one rule.

        Checks run in a fixed order (pattern, minimum length, maximum length,
        allowed values, custom validator) and the first failure is returned.
        A missing value is reported only when the rule is ``required``.

        Returns:
            A ``ValidationResult`` describing the first failed check, or None
            when the value passes.
        """

        def failure(message: str, severity: Optional[ValidationSeverity] = None) -> ValidationResult:
            # Every failure shares the same context; only message/severity vary.
            return ValidationResult(
                rule_name=rule.name,
                severity=rule.severity if severity is None else severity,
                message=message,
                row_index=row_index,
                column=column,
                value=value
            )

        # pd.isna() returns an array for list-like input, whose truth value is
        # ambiguous, so only scalar values can count as missing.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            if rule.required:
                return failure("Required value is missing")
            return None

        str_value = str(value)

        # Pattern validation
        if rule.pattern and not re.match(rule.pattern, str_value, re.IGNORECASE):
            return failure(f"Value '{value}' does not match pattern {rule.pattern}")

        # Length validation; compare against None so a bound of 0 is honoured.
        if rule.min_length is not None and len(str_value) < rule.min_length:
            return failure(f"Value '{value}' is too short (minimum {rule.min_length} characters)")

        if rule.max_length is not None and len(str_value) > rule.max_length:
            return failure(f"Value '{value}' is too long (maximum {rule.max_length} characters)")

        # Allowed values validation
        if rule.allowed_values and str_value not in rule.allowed_values:
            return failure(f"Value '{value}' is not in allowed values: {rule.allowed_values}")

        # Custom validator; an exception inside it is reported as an ERROR.
        if rule.custom_validator:
            try:
                passed = rule.custom_validator(value)
            except Exception as e:
                return failure(f"Custom validator error: {e}", ValidationSeverity.ERROR)
            if not passed:
                return failure(f"Value '{value}' failed custom validation")

        return None

    def _validate_entity_attributes(self, attributes: Dict[str, Any]) -> List[ValidationResult]:
        """Validate the ``qualifiedName`` and ``name`` entity attributes.

        An attribute that is absent (or None) is skipped. An empty string is
        validated like any other value, so it fails the rule the same way it
        does on the DataFrame path.
        """
        results = []

        checks = (
            ('qualifiedName', 'qualified_name_format'),
            ('name', 'name_length'),
        )
        for attribute_name, rule_name in checks:
            attribute_value = attributes.get(attribute_name)
            if attribute_value is None:
                continue
            rule = self._get_rule(rule_name)
            if rule is None:
                # The default rule may have been removed via remove_rule().
                continue
            result = self._validate_value(attribute_value, rule, column=attribute_name)
            if result is not None:
                results.append(result)

        return results

    def _get_rule(self, rule_name: str) -> Optional[ValidationRule]:
        """Return the first rule named ``rule_name``, or None if there is none."""
        return next((rule for rule in self.rules if rule.name == rule_name), None)
|
|
285
|
+
|
|
286
|
+
class DataQualityReport:
    """Builds summary reports from validation results."""

    @staticmethod
    def generate_report(validation_results: List[ValidationResult]) -> Dict[str, Any]:
        """Summarise validation results.

        The report holds a ``summary`` (counts per severity and a quality
        score), issue counts per rule and per column, and detail rows for
        errors and warnings. INFO findings are counted in the summary but get
        no detail rows.
        """

        def with_severity(level):
            return [item for item in validation_results if item.severity == level]

        def as_rows(findings):
            return [
                {
                    'rule': finding.rule_name,
                    'message': finding.message,
                    'row': finding.row_index,
                    'column': finding.column,
                    'value': finding.value
                }
                for finding in findings
            ]

        error_findings = with_severity(ValidationSeverity.ERROR)
        warning_findings = with_severity(ValidationSeverity.WARNING)
        info_findings = with_severity(ValidationSeverity.INFO)

        # Tally findings per rule, and per column where a column is recorded.
        issues_by_rule = {}
        issues_by_column = {}
        for finding in validation_results:
            issues_by_rule.setdefault(finding.rule_name, 0)
            issues_by_rule[finding.rule_name] += 1
            if finding.column:
                issues_by_column.setdefault(finding.column, 0)
                issues_by_column[finding.column] += 1

        summary = {
            'total_issues': len(validation_results),
            'errors': len(error_findings),
            'warnings': len(warning_findings),
            'info': len(info_findings),
            'data_quality_score': DataQualityReport._calculate_quality_score(validation_results)
        }

        return {
            'summary': summary,
            'issues_by_rule': issues_by_rule,
            'issues_by_column': issues_by_column,
            'error_details': as_rows(error_findings),
            'warning_details': as_rows(warning_findings)
        }

    @staticmethod
    def _calculate_quality_score(validation_results: List[ValidationResult]) -> float:
        """Return a 0-100 score: 100 minus 2 points per weighted issue.

        An ERROR weighs 3; any other severity (WARNING and INFO alike)
        weighs 1. The score is floored at 0 and rounded to one decimal.
        """
        if not validation_results:
            return 100.0

        weighted_issues = 0
        for finding in validation_results:
            if finding.severity == ValidationSeverity.ERROR:
                weighted_issues += 3
            else:
                # NOTE(review): INFO findings lower the score as much as
                # warnings do - confirm this is intended.
                weighted_issues += 1

        remaining = 100.0 - (weighted_issues * 2.0)
        if remaining < 0.0:
            remaining = 0.0
        return round(remaining, 1)

    @staticmethod
    def export_report_to_csv(report: Dict[str, Any], output_file: str):
        """Write the report's error and warning detail rows to a CSV file.

        Errors are written first, then warnings. A report without detail rows
        produces a file that contains only the header line.
        """
        csv_columns = ['severity', 'rule', 'message', 'row', 'column', 'value']
        sections = (
            ('ERROR', 'error_details'),
            ('WARNING', 'warning_details'),
        )

        rows = []
        for label, section_key in sections:
            for detail in report.get(section_key, []):
                rows.append({
                    'severity': label,
                    'rule': detail['rule'],
                    'message': detail['message'],
                    'row': detail['row'],
                    'column': detail['column'],
                    'value': detail['value']
                })

        if rows:
            frame = pd.DataFrame(rows)
        else:
            # No findings: keep the header so the file is still a valid CSV.
            frame = pd.DataFrame(columns=csv_columns)
        frame.to_csv(output_file, index=False)
|
|
394
|
+
|
|
395
|
+
# Predefined validation rule sets for common scenarios
|
|
396
|
+
# Entity validation rules mapping - maps entity types to validation rule names
|
|
397
|
+
# Entity type -> names of the validation rules associated with that type.
ENTITY_VALIDATION_RULES = {
    'dataset': ['name_length', 'qualified_name_format', 'valid_email', 'valid_guid'],
    'table': ['name_length', 'qualified_name_format', 'valid_email'],
    'glossary_term': ['name_length', 'valid_guid'],
}

# Field name -> rule names; the legacy field-based form, kept for backward
# compatibility.
LEGACY_VALIDATION_RULES = {
    'name': ['name_length'],
    'qualifiedName': ['qualified_name_format'],
    'owner': ['valid_email'],
    'guid': ['valid_guid'],
}

# Field name -> rule names for glossary terms.
GLOSSARY_TERM_VALIDATION_RULES = {
    'name': ['name_length'],
    'glossaryGuid': ['valid_guid'],
}

# Field name -> rule names for tables.
TABLE_VALIDATION_RULES = {
    'name': ['name_length'],
    'qualifiedName': ['qualified_name_format'],
    'db': ['name_length'],
    'owner': ['valid_email'],
}
|