pvw-cli 1.2.8__py3-none-any.whl
This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
Potentially problematic release.
This version of pvw-cli might be problematic.
- purviewcli/__init__.py +27 -0
- purviewcli/__main__.py +15 -0
- purviewcli/cli/__init__.py +5 -0
- purviewcli/cli/account.py +199 -0
- purviewcli/cli/cli.py +170 -0
- purviewcli/cli/collections.py +502 -0
- purviewcli/cli/domain.py +361 -0
- purviewcli/cli/entity.py +2436 -0
- purviewcli/cli/glossary.py +533 -0
- purviewcli/cli/health.py +250 -0
- purviewcli/cli/insight.py +113 -0
- purviewcli/cli/lineage.py +1103 -0
- purviewcli/cli/management.py +141 -0
- purviewcli/cli/policystore.py +103 -0
- purviewcli/cli/relationship.py +75 -0
- purviewcli/cli/scan.py +357 -0
- purviewcli/cli/search.py +527 -0
- purviewcli/cli/share.py +478 -0
- purviewcli/cli/types.py +831 -0
- purviewcli/cli/unified_catalog.py +3540 -0
- purviewcli/cli/workflow.py +402 -0
- purviewcli/client/__init__.py +21 -0
- purviewcli/client/_account.py +1877 -0
- purviewcli/client/_collections.py +1761 -0
- purviewcli/client/_domain.py +414 -0
- purviewcli/client/_entity.py +3545 -0
- purviewcli/client/_glossary.py +3233 -0
- purviewcli/client/_health.py +501 -0
- purviewcli/client/_insight.py +2873 -0
- purviewcli/client/_lineage.py +2138 -0
- purviewcli/client/_management.py +2202 -0
- purviewcli/client/_policystore.py +2915 -0
- purviewcli/client/_relationship.py +1351 -0
- purviewcli/client/_scan.py +2607 -0
- purviewcli/client/_search.py +1472 -0
- purviewcli/client/_share.py +272 -0
- purviewcli/client/_types.py +2708 -0
- purviewcli/client/_unified_catalog.py +5112 -0
- purviewcli/client/_workflow.py +2734 -0
- purviewcli/client/api_client.py +1295 -0
- purviewcli/client/business_rules.py +675 -0
- purviewcli/client/config.py +231 -0
- purviewcli/client/data_quality.py +433 -0
- purviewcli/client/endpoint.py +123 -0
- purviewcli/client/endpoints.py +554 -0
- purviewcli/client/exceptions.py +38 -0
- purviewcli/client/lineage_visualization.py +797 -0
- purviewcli/client/monitoring_dashboard.py +712 -0
- purviewcli/client/rate_limiter.py +30 -0
- purviewcli/client/retry_handler.py +125 -0
- purviewcli/client/scanning_operations.py +523 -0
- purviewcli/client/settings.py +1 -0
- purviewcli/client/sync_client.py +250 -0
- purviewcli/plugins/__init__.py +1 -0
- purviewcli/plugins/plugin_system.py +709 -0
- pvw_cli-1.2.8.dist-info/METADATA +1618 -0
- pvw_cli-1.2.8.dist-info/RECORD +60 -0
- pvw_cli-1.2.8.dist-info/WHEEL +5 -0
- pvw_cli-1.2.8.dist-info/entry_points.txt +3 -0
- pvw_cli-1.2.8.dist-info/top_level.txt +1 -0
purviewcli/client/rate_limiter.py
@@ -0,0 +1,30 @@
+import threading
+import time
+
+class RateLimiter:
+    """
+    Simple thread-safe rate limiter using the token bucket algorithm.
+    rate_limit_config example: { 'rate': 5, 'per': 1 }  # 5 requests per 1 second
+    """
+    def __init__(self, config=None):
+        config = config or {}
+        self.rate = config.get('rate', 10)  # default: 10 requests
+        self.per = config.get('per', 1)     # default: per 1 second
+        self.allowance = self.rate
+        self.last_check = time.monotonic()
+        self.lock = threading.Lock()
+
+    def wait(self):
+        with self.lock:
+            current = time.monotonic()
+            time_passed = current - self.last_check
+            self.last_check = current
+            self.allowance += time_passed * (self.rate / self.per)
+            if self.allowance > self.rate:
+                self.allowance = self.rate
+            if self.allowance < 1.0:
+                sleep_time = (1.0 - self.allowance) * (self.per / self.rate)
+                time.sleep(sleep_time)
+                self.allowance = 0
+            else:
+                self.allowance -= 1.0
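
For context, a minimal usage sketch of the token bucket above (not part of the diff). The import path follows the wheel's file list; fetch_entity is a hypothetical stand-in for a real API call. Note that wait() sleeps while still holding the lock, so concurrent callers are serialized, which is the intended throttling behavior.

import time
from purviewcli.client.rate_limiter import RateLimiter

limiter = RateLimiter({'rate': 5, 'per': 1})  # allow roughly 5 calls per second

def fetch_entity(guid):
    # Hypothetical stand-in for a real Purview API call.
    print(f"fetched {guid} at {time.monotonic():.2f}")

for guid in range(12):
    limiter.wait()  # blocks until the bucket has a token to spend
    fetch_entity(guid)
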
purviewcli/client/retry_handler.py
@@ -0,0 +1,125 @@
+"""
+Retry handler for Purview API operations
+"""
+
+import time
+import random
+import logging
+from typing import Callable, Any, Dict, Optional
+from .exceptions import PurviewAPIError, PurviewRateLimitError
+
+class RetryHandler:
+    """Handles retry logic for API operations with exponential backoff"""
+
+    def __init__(self, config: Optional[Dict] = None):
+        """
+        Initialize retry handler
+
+        Args:
+            config: Retry configuration dictionary
+        """
+        default_config = {
+            'max_retries': 3,
+            'base_delay': 1.0,
+            'max_delay': 60.0,
+            'exponential_base': 2,
+            'jitter': True,
+            'retry_on_status_codes': [429, 500, 502, 503, 504],
+            'retry_on_exceptions': [ConnectionError, TimeoutError]
+        }
+
+        self.config = {**default_config, **(config or {})}
+        self.logger = logging.getLogger(__name__)
+
+    def execute(self, operation: Callable, *args, **kwargs) -> Any:
+        """
+        Execute operation with retry logic
+
+        Args:
+            operation: Function to execute
+            *args: Positional arguments for operation
+            **kwargs: Keyword arguments for operation
+
+        Returns:
+            Result of operation
+
+        Raises:
+            Exception: If all retries exhausted
+        """
+        last_exception = None
+
+        for attempt in range(self.config['max_retries'] + 1):
+            try:
+                return operation(*args, **kwargs)
+
+            except Exception as e:
+                last_exception = e
+
+                if not self._should_retry(e, attempt):
+                    raise e
+
+                if attempt < self.config['max_retries']:
+                    delay = self._calculate_delay(attempt)
+                    self.logger.warning(
+                        f"Operation failed (attempt {attempt + 1}), retrying in {delay:.2f}s: {e}"
+                    )
+                    time.sleep(delay)
+                else:
+                    self.logger.error(f"Operation failed after {attempt + 1} attempts: {e}")
+                    raise e
+
+        # This should never be reached, but just in case
+        raise last_exception
+
+    def _should_retry(self, exception: Exception, attempt: int) -> bool:
+        """
+        Determine if operation should be retried
+
+        Args:
+            exception: Exception that occurred
+            attempt: Current attempt number
+
+        Returns:
+            True if should retry, False otherwise
+        """
+        if attempt >= self.config['max_retries']:
+            return False
+
+        # Check for specific exception types
+        if type(exception) in self.config['retry_on_exceptions']:
+            return True
+
+        # Check for API errors with specific status codes
+        if isinstance(exception, PurviewAPIError):
+            if hasattr(exception, 'status_code'):
+                return exception.status_code in self.config['retry_on_status_codes']
+
+        # Check for rate limit errors
+        if isinstance(exception, PurviewRateLimitError):
+            return True
+
+        return False
+
+    def _calculate_delay(self, attempt: int) -> float:
+        """
+        Calculate delay for retry attempt using exponential backoff
+
+        Args:
+            attempt: Current attempt number
+
+        Returns:
+            Delay in seconds
+        """
+        delay = self.config['base_delay'] * (
+            self.config['exponential_base'] ** attempt
+        )
+
+        # Apply maximum delay limit
+        delay = min(delay, self.config['max_delay'])
+
+        # Add jitter to prevent thundering herd
+        if self.config['jitter']:
+            jitter = random.uniform(0, 0.1) * delay
+            delay += jitter
+
+        return delay
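
With the defaults above, delays grow as base_delay * exponential_base ** attempt (1 s, 2 s, 4 s), capped at max_delay, with up to 10% jitter added when enabled. A minimal usage sketch, not part of the diff (flaky_call is a hypothetical stand-in; the import path follows the wheel's file list):

from purviewcli.client.retry_handler import RetryHandler

handler = RetryHandler({'max_retries': 4, 'base_delay': 0.5})

calls = {'count': 0}

def flaky_call():
    calls['count'] += 1
    if calls['count'] < 3:
        # ConnectionError is in the default retry_on_exceptions list, so it is retried.
        raise ConnectionError("transient network failure")
    return "ok"

print(handler.execute(flaky_call))  # succeeds on the third attempt
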
purviewcli/client/scanning_operations.py
@@ -0,0 +1,523 @@
+"""
+Scanning Operations Module for Microsoft Purview
+Provides comprehensive scanning automation and management capabilities
+"""
+
+import asyncio
+import json
+from datetime import datetime, timedelta
+from pathlib import Path
+from typing import Dict, List, Optional, Any, Callable
+from rich.console import Console
+from rich.table import Table
+from rich.progress import Progress, TaskID
+
+# Optional pandas dependency for report generation
+try:
+    import pandas as pd
+    PANDAS_AVAILABLE = True
+except ImportError:
+    pd = None
+    PANDAS_AVAILABLE = False
+    print("Warning: pandas not available. Report generation features will be limited.")
+
+from .api_client import PurviewClient
+
+console = Console()
+
+class ScanningManager:
+    """Advanced scanning operations and automation"""
+
+    def __init__(self, client: PurviewClient):
+        self.client = client
+        self.console = Console()
+
+    async def create_data_source(self, data_source_config: Dict) -> Dict:
+        """Create a new data source"""
+        endpoint = "/scan/datasources"
+        return await self.client._make_request('PUT', endpoint, json=data_source_config)
+
+    async def get_data_sources(self) -> List[Dict]:
+        """Get all data sources"""
+        endpoint = "/scan/datasources"
+        response = await self.client._make_request('GET', endpoint)
+        return response.get('value', [])
+
+    async def create_scan(self, data_source_name: str, scan_config: Dict) -> Dict:
+        """Create a new scan for a data source"""
+        endpoint = f"/scan/datasources/{data_source_name}/scans/{scan_config['name']}"
+        return await self.client._make_request('PUT', endpoint, json=scan_config)
+
+    async def run_scan(self, data_source_name: str, scan_name: str) -> Dict:
+        """Start a scan"""
+        endpoint = f"/scan/datasources/{data_source_name}/scans/{scan_name}/run"
+        return await self.client._make_request('POST', endpoint)
+
+    async def get_scan_status(self, data_source_name: str, scan_name: str, run_id: str) -> Dict:
+        """Get scan status"""
+        endpoint = f"/scan/datasources/{data_source_name}/scans/{scan_name}/runs/{run_id}"
+        return await self.client._make_request('GET', endpoint)
+
+    async def get_scan_history(self, data_source_name: str, scan_name: str) -> List[Dict]:
+        """Get scan run history"""
+        endpoint = f"/scan/datasources/{data_source_name}/scans/{scan_name}/runs"
+        response = await self.client._make_request('GET', endpoint)
+        return response.get('value', [])
+
+    async def bulk_create_data_sources(self, sources_config: List[Dict],
+                                       progress_callback: Optional[Callable] = None) -> Dict:
+        """Create multiple data sources from configuration"""
+        results = {'created': [], 'failed': [], 'errors': []}
+
+        with Progress() as progress:
+            task = progress.add_task("Creating data sources...", total=len(sources_config))
+
+            for i, source_config in enumerate(sources_config):
+                try:
+                    result = await self.create_data_source(source_config)
+                    results['created'].append({
+                        'name': source_config.get('name'),
+                        'type': source_config.get('kind'),
+                        'result': result
+                    })
+
+                except Exception as e:
+                    error_msg = f"Failed to create {source_config.get('name', 'unknown')}: {str(e)}"
+                    results['failed'].append(source_config.get('name', 'unknown'))
+                    results['errors'].append(error_msg)
+
+                progress.update(task, advance=1)
+                if progress_callback:
+                    progress_callback(i + 1, len(sources_config))
+
+        return results
+
+    async def bulk_run_scans(self, scan_configs: List[Dict],
+                             monitor_progress: bool = True) -> Dict:
+        """Run multiple scans and optionally monitor their progress"""
+        results = {'started': [], 'failed': [], 'completed': [], 'errors': []}
+
+        # Start all scans
+        scan_runs = []
+        for scan_config in scan_configs:
+            try:
+                data_source = scan_config['data_source']
+                scan_name = scan_config['scan_name']
+
+                result = await self.run_scan(data_source, scan_name)
+                run_id = result.get('runId')
+
+                if run_id:
+                    scan_runs.append({
+                        'data_source': data_source,
+                        'scan_name': scan_name,
+                        'run_id': run_id,
+                        'started_at': datetime.now()
+                    })
+                    results['started'].append(f"{data_source}/{scan_name}")
+
+            except Exception as e:
+                error_msg = f"Failed to start scan {scan_config}: {str(e)}"
+                results['failed'].append(str(scan_config))
+                results['errors'].append(error_msg)
+
+        # Monitor progress if requested
+        if monitor_progress and scan_runs:
+            await self._monitor_scan_progress(scan_runs, results)
+
+        return results
+
+    async def _monitor_scan_progress(self, scan_runs: List[Dict], results: Dict):
+        """Monitor the progress of running scans"""
+        pending_scans = scan_runs.copy()
+
+        with Progress() as progress:
+            # Create progress bars for each scan
+            scan_tasks = {}
+            for scan in pending_scans:
+                scan_id = f"{scan['data_source']}/{scan['scan_name']}"
+                task_id = progress.add_task(f"Scanning {scan_id}", total=100)
+                scan_tasks[scan_id] = task_id
+
+            while pending_scans:
+                completed_scans = []
+
+                for scan in pending_scans:
+                    try:
+                        status = await self.get_scan_status(
+                            scan['data_source'],
+                            scan['scan_name'],
+                            scan['run_id']
+                        )
+
+                        scan_state = status.get('status', 'Unknown')
+                        scan_id = f"{scan['data_source']}/{scan['scan_name']}"
+
+                        if scan_state in ['Succeeded', 'Failed', 'Canceled']:
+                            completed_scans.append(scan)
+                            progress.update(scan_tasks[scan_id], completed=100)
+
+                            if scan_state == 'Succeeded':
+                                results['completed'].append(scan_id)
+                            else:
+                                results['failed'].append(scan_id)
+                                results['errors'].append(f"Scan {scan_id} {scan_state}")
+
+                        elif scan_state == 'Running':
+                            # Update progress based on scan metrics if available
+                            scan_result = status.get('scanResultMetrics', {})
+                            if scan_result:
+                                processed = scan_result.get('processedCount', 0)
+                                total = scan_result.get('totalCount', 1)
+                                percentage = min((processed / total) * 100, 99) if total > 0 else 50
+                                progress.update(scan_tasks[scan_id], completed=percentage)
+
+                    except Exception as e:
+                        console.print(f"[red]Error monitoring scan {scan}: {str(e)}[/red]")
+
+                # Remove completed scans
+                for completed in completed_scans:
+                    pending_scans.remove(completed)
+
+                if pending_scans:
+                    await asyncio.sleep(30)  # Check every 30 seconds
+
+    async def generate_scan_report(self, output_file: str, days_back: int = 30) -> Dict:
+        """Generate comprehensive scanning report"""
+        end_date = datetime.now()
+        start_date = end_date - timedelta(days=days_back)
+
+        console.print(f"[blue]Generating scan report for last {days_back} days...[/blue]")
+
+        # Get all data sources
+        data_sources = await self.get_data_sources()
+
+        report_data = []
+        summary_stats = {
+            'total_sources': len(data_sources),
+            'scanned_sources': 0,
+            'successful_scans': 0,
+            'failed_scans': 0,
+            'total_assets_discovered': 0
+        }
+
+        for source in data_sources:
+            source_name = source.get('name')
+            source_type = source.get('kind')
+
+            try:
+                # Get scans for this data source
+                scans_endpoint = f"/scan/datasources/{source_name}/scans"
+                scans_response = await self.client._make_request('GET', scans_endpoint)
+                scans = scans_response.get('value', [])
+
+                for scan in scans:
+                    scan_name = scan.get('name')
+
+                    # Get scan history
+                    history = await self.get_scan_history(source_name, scan_name)
+
+                    for run in history:
+                        run_date = datetime.fromisoformat(run.get('startTime', '').replace('Z', '+00:00'))
+
+                        if start_date <= run_date <= end_date:
+                            summary_stats['scanned_sources'] += 1
+
+                            status = run.get('status', 'Unknown')
+                            if status == 'Succeeded':
+                                summary_stats['successful_scans'] += 1
+                            elif status == 'Failed':
+                                summary_stats['failed_scans'] += 1
+
+                            # Extract metrics
+                            metrics = run.get('scanResultMetrics', {})
+                            assets_discovered = metrics.get('processedCount', 0)
+                            summary_stats['total_assets_discovered'] += assets_discovered
+
+                            report_data.append({
+                                'data_source': source_name,
+                                'source_type': source_type,
+                                'scan_name': scan_name,
+                                'run_id': run.get('runId'),
+                                'status': status,
+                                'start_time': run.get('startTime'),
+                                'end_time': run.get('endTime'),
+                                'duration_minutes': self._calculate_duration(
+                                    run.get('startTime'), run.get('endTime')
+                                ),
+                                'assets_discovered': assets_discovered,
+                                'assets_classified': metrics.get('classifiedCount', 0),
+                                'error_message': run.get('error', {}).get('message', '')
+                            })
+
+            except Exception as e:
+                console.print(f"[yellow]Warning: Could not get scan data for {source_name}: {e}[/yellow]")
+
+        # Save report to CSV
+        # NOTE: pd is used unconditionally here even though the pandas import
+        # above is optional; without pandas installed this raises AttributeError.
+        df = pd.DataFrame(report_data)
+        df.to_csv(output_file, index=False)
+
+        # Generate summary
+        summary = {
+            'report_file': output_file,
+            'report_period': f"{start_date.date()} to {end_date.date()}",
+            'statistics': summary_stats,
+            'total_scan_runs': len(report_data)
+        }
+
+        console.print(f"[green]✓ Scan report saved to {output_file}[/green]")
+        console.print(f"[green]✓ Found {len(report_data)} scan runs across {summary_stats['total_sources']} data sources[/green]")
+
+        return summary
+
+    def _calculate_duration(self, start_time: str, end_time: str) -> float:
+        """Calculate scan duration in minutes"""
+        try:
+            if not start_time or not end_time:
+                return 0.0
+
+            start = datetime.fromisoformat(start_time.replace('Z', '+00:00'))
+            end = datetime.fromisoformat(end_time.replace('Z', '+00:00'))
+
+            duration = end - start
+            return duration.total_seconds() / 60
+
+        except Exception:
+            return 0.0
+
+    async def optimize_scan_schedules(self) -> Dict:
+        """Analyze scan patterns and suggest optimizations"""
+        console.print("[blue]Analyzing scan patterns for optimization recommendations...[/blue]")
+
+        # Get all data sources and their scan history
+        data_sources = await self.get_data_sources()
+        optimization_report = {
+            'recommendations': [],
+            'statistics': {},
+            'potential_savings': {}
+        }
+
+        for source in data_sources:
+            source_name = source.get('name')
+
+            try:
+                # Analyze scan frequency and success rates
+                scans_endpoint = f"/scan/datasources/{source_name}/scans"
+                scans_response = await self.client._make_request('GET', scans_endpoint)
+                scans = scans_response.get('value', [])
+
+                for scan in scans:
+                    scan_name = scan.get('name')
+                    history = await self.get_scan_history(source_name, scan_name)
+
+                    if len(history) >= 5:  # Need some history for analysis
+                        analysis = self._analyze_scan_pattern(history)
+
+                        if analysis['recommendations']:
+                            optimization_report['recommendations'].extend([
+                                {
+                                    'data_source': source_name,
+                                    'scan_name': scan_name,
+                                    'recommendation': rec
+                                }
+                                for rec in analysis['recommendations']
+                            ])
+
+            except Exception as e:
+                console.print(f"[yellow]Warning: Could not analyze {source_name}: {e}[/yellow]")
+
+        return optimization_report
+
+    def _analyze_scan_pattern(self, scan_history: List[Dict]) -> Dict:
+        """Analyze scan history to identify optimization opportunities"""
+        recommendations = []
+
+        # Calculate success rate
+        total_scans = len(scan_history)
+        successful_scans = sum(1 for run in scan_history if run.get('status') == 'Succeeded')
+        success_rate = successful_scans / total_scans if total_scans > 0 else 0
+
+        # Analyze scan frequency
+        scan_times = [
+            datetime.fromisoformat(run.get('startTime', '').replace('Z', '+00:00'))
+            for run in scan_history
+            if run.get('startTime')
+        ]
+
+        if len(scan_times) >= 2:
+            scan_times.sort()
+            intervals = [
+                (scan_times[i] - scan_times[i-1]).total_seconds() / 3600  # Hours
+                for i in range(1, len(scan_times))
+            ]
+            avg_interval = sum(intervals) / len(intervals) if intervals else 0
+
+        # Generate recommendations
+        # NOTE: avg_interval is unbound when fewer than two runs carry a
+        # startTime, so the two interval checks below can raise NameError.
+        if success_rate < 0.8:
+            recommendations.append(f"Low success rate ({success_rate:.1%}). Review scan configuration and data source connectivity.")
+
+        if avg_interval < 6:  # Less than 6 hours between scans
+            recommendations.append(f"Very frequent scanning (avg {avg_interval:.1f}h intervals). Consider reducing frequency if data doesn't change often.")
+
+        if avg_interval > 168:  # More than a week between scans
+            recommendations.append(f"Infrequent scanning (avg {avg_interval:.1f}h intervals). Consider more frequent scans for better data freshness.")
+
+        return {'recommendations': recommendations}
+
367
|
+
class ScanTemplateManager:
|
|
368
|
+
"""Manage scanning templates and configurations"""
|
|
369
|
+
|
|
370
|
+
def __init__(self):
|
|
371
|
+
self.templates = self._load_default_templates()
|
|
372
|
+
|
|
373
|
+
def _load_default_templates(self) -> Dict:
|
|
374
|
+
"""Load default scanning templates"""
|
|
375
|
+
return {
|
|
376
|
+
'azure_storage': {
|
|
377
|
+
'kind': 'AdlsGen2',
|
|
378
|
+
'properties': {
|
|
379
|
+
'subscriptionId': '',
|
|
380
|
+
'resourceGroup': '',
|
|
381
|
+
'location': '',
|
|
382
|
+
'endpoint': '',
|
|
383
|
+
'collection': {
|
|
384
|
+
'referenceName': 'default'
|
|
385
|
+
}
|
|
386
|
+
}
|
|
387
|
+
},
|
|
388
|
+
'sql_database': {
|
|
389
|
+
'kind': 'AzureSqlDatabase',
|
|
390
|
+
'properties': {
|
|
391
|
+
'serverEndpoint': '',
|
|
392
|
+
'databaseName': '',
|
|
393
|
+
'collection': {
|
|
394
|
+
'referenceName': 'default'
|
|
395
|
+
}
|
|
396
|
+
}
|
|
397
|
+
},
|
|
398
|
+
'synapse_workspace': {
|
|
399
|
+
'kind': 'AzureSynapseWorkspace',
|
|
400
|
+
'properties': {
|
|
401
|
+
'dedicatedSqlEndpoint': '',
|
|
402
|
+
'serverlessSqlEndpoint': '',
|
|
403
|
+
'collection': {
|
|
404
|
+
'referenceName': 'default'
|
|
405
|
+
}
|
|
406
|
+
}
|
|
407
|
+
}
|
|
408
|
+
}
|
|
409
|
+
|
|
410
|
+
def create_data_source_config(self, template_name: str, **kwargs) -> Dict:
|
|
411
|
+
"""Create data source configuration from template"""
|
|
412
|
+
if template_name not in self.templates:
|
|
413
|
+
raise ValueError(f"Unknown template: {template_name}")
|
|
414
|
+
|
|
415
|
+
config = self.templates[template_name].copy()
|
|
416
|
+
|
|
417
|
+
# Update properties with provided values
|
|
418
|
+
for key, value in kwargs.items():
|
|
419
|
+
if '.' in key:
|
|
420
|
+
# Handle nested properties like 'properties.endpoint'
|
|
421
|
+
parts = key.split('.')
|
|
422
|
+
current = config
|
|
423
|
+
for part in parts[:-1]:
|
|
424
|
+
if part not in current:
|
|
425
|
+
current[part] = {}
|
|
426
|
+
current = current[part]
|
|
427
|
+
current[parts[-1]] = value
|
|
428
|
+
else:
|
|
429
|
+
config[key] = value
|
|
430
|
+
|
|
431
|
+
return config
|
|
432
|
+
|
|
433
|
+
def create_scan_config(self, scan_name: str, scan_ruleset: str = None) -> Dict:
|
|
434
|
+
"""Create scan configuration"""
|
|
435
|
+
config = {
|
|
436
|
+
'name': scan_name,
|
|
437
|
+
'kind': 'AzureSqlDatabaseCredential',
|
|
438
|
+
'properties': {
|
|
439
|
+
'scanRulesetName': scan_ruleset or 'AzureSqlDatabase',
|
|
440
|
+
'scanRulesetType': 'System',
|
|
441
|
+
'collection': {
|
|
442
|
+
'referenceName': 'default'
|
|
443
|
+
}
|
|
444
|
+
}
|
|
445
|
+
}
|
|
446
|
+
|
|
447
|
+
return config
|
|
448
|
+
|
|
449
|
+
def save_template(self, name: str, template: Dict, file_path: str = None):
|
|
450
|
+
"""Save custom template"""
|
|
451
|
+
self.templates[name] = template
|
|
452
|
+
|
|
453
|
+
if file_path:
|
|
454
|
+
with open(file_path, 'w') as f:
|
|
455
|
+
json.dump({name: template}, f, indent=2)
|
|
456
|
+
|
|
457
|
+
def load_template_from_file(self, file_path: str) -> Dict:
|
|
458
|
+
"""Load template from file"""
|
|
459
|
+
with open(file_path, 'r') as f:
|
|
460
|
+
return json.load(f)
|
|
461
|
+
|
|
+# CLI Integration Functions
+# NOTE: `click` and `PurviewConfig` are referenced below but never imported in
+# this module; as the comments say, these commands are illustrative and fail
+# at call time as written.
+async def create_scanning_cli_commands():
+    """Create CLI commands for scanning operations"""
+    # This would integrate with the enhanced_cli.py
+    # Example implementation for demonstration
+
+    @click.group()
+    def scanning():
+        """Advanced scanning operations and automation"""
+        pass
+
+    @scanning.command()
+    @click.option('--config-file', required=True, help='Data source configuration file')
+    @click.option('--profile', default='default', help='Configuration profile')
+    async def create_sources(config_file, profile):
+        """Create multiple data sources from configuration file"""
+        config = PurviewConfig.load_profile(profile)
+
+        with open(config_file, 'r') as f:
+            sources_config = json.load(f)
+
+        async with PurviewClient(config) as client:
+            manager = ScanningManager(client)
+            results = await manager.bulk_create_data_sources(sources_config)
+
+            console.print(f"[green]✓ Created {len(results['created'])} data sources[/green]")
+            if results['failed']:
+                console.print(f"[red]✗ Failed to create {len(results['failed'])} data sources[/red]")
+
+    @scanning.command()
+    @click.option('--output-file', required=True, help='Output file for scan report')
+    @click.option('--days', default=30, help='Number of days to include in report')
+    @click.option('--profile', default='default', help='Configuration profile')
+    async def report(output_file, days, profile):
+        """Generate comprehensive scanning report"""
+        config = PurviewConfig.load_profile(profile)
+
+        async with PurviewClient(config) as client:
+            manager = ScanningManager(client)
+            report = await manager.generate_scan_report(output_file, days)
+
+            # Display summary
+            stats = report['statistics']
+            table = Table(title="Scan Report Summary")
+            table.add_column("Metric", style="cyan")
+            table.add_column("Value", style="green")
+
+            table.add_row("Total Data Sources", str(stats['total_sources']))
+            table.add_row("Successful Scans", str(stats['successful_scans']))
+            table.add_row("Failed Scans", str(stats['failed_scans']))
+            table.add_row("Assets Discovered", str(stats['total_assets_discovered']))
+
+            console.print(table)
+
+    return scanning
+
+# Export the main classes and functions
+__all__ = [
+    'ScanningManager',
+    'ScanTemplateManager',
+    'create_scanning_cli_commands'
+]
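
For context, a hedged end-to-end sketch tying ScanTemplateManager and ScanningManager together (not part of the diff). The PurviewConfig/PurviewClient construction mirrors the illustrative CLI commands above; the import location of PurviewConfig, the profile name, and the 'adls-prod'/'weekly-full' names are assumptions.

import asyncio
from purviewcli.client.api_client import PurviewClient
from purviewcli.client.config import PurviewConfig  # assumed home of PurviewConfig
from purviewcli.client.scanning_operations import ScanningManager, ScanTemplateManager

# Build a data source config from a template; dotted keys update nested fields.
templates = ScanTemplateManager()
source = templates.create_data_source_config(
    'azure_storage',
    name='adls-prod',  # hypothetical source name
    **{'properties.endpoint': 'https://adlsprod.dfs.core.windows.net/'},
)

async def main():
    config = PurviewConfig.load_profile('default')
    async with PurviewClient(config) as client:
        manager = ScanningManager(client)
        await manager.create_data_source(source)
        results = await manager.bulk_run_scans(
            [{'data_source': 'adls-prod', 'scan_name': 'weekly-full'}],
            monitor_progress=True,
        )
        print(results['completed'], results['errors'])

asyncio.run(main())
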
purviewcli/client/settings.py
@@ -0,0 +1 @@
+PURVIEW_ACCOUNT_NAME = None