pvw-cli 1.2.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pvw-cli might be problematic. Click here for more details.
- purviewcli/__init__.py +27 -0
- purviewcli/__main__.py +15 -0
- purviewcli/cli/__init__.py +5 -0
- purviewcli/cli/account.py +199 -0
- purviewcli/cli/cli.py +170 -0
- purviewcli/cli/collections.py +502 -0
- purviewcli/cli/domain.py +361 -0
- purviewcli/cli/entity.py +2436 -0
- purviewcli/cli/glossary.py +533 -0
- purviewcli/cli/health.py +250 -0
- purviewcli/cli/insight.py +113 -0
- purviewcli/cli/lineage.py +1103 -0
- purviewcli/cli/management.py +141 -0
- purviewcli/cli/policystore.py +103 -0
- purviewcli/cli/relationship.py +75 -0
- purviewcli/cli/scan.py +357 -0
- purviewcli/cli/search.py +527 -0
- purviewcli/cli/share.py +478 -0
- purviewcli/cli/types.py +831 -0
- purviewcli/cli/unified_catalog.py +3540 -0
- purviewcli/cli/workflow.py +402 -0
- purviewcli/client/__init__.py +21 -0
- purviewcli/client/_account.py +1877 -0
- purviewcli/client/_collections.py +1761 -0
- purviewcli/client/_domain.py +414 -0
- purviewcli/client/_entity.py +3545 -0
- purviewcli/client/_glossary.py +3233 -0
- purviewcli/client/_health.py +501 -0
- purviewcli/client/_insight.py +2873 -0
- purviewcli/client/_lineage.py +2138 -0
- purviewcli/client/_management.py +2202 -0
- purviewcli/client/_policystore.py +2915 -0
- purviewcli/client/_relationship.py +1351 -0
- purviewcli/client/_scan.py +2607 -0
- purviewcli/client/_search.py +1472 -0
- purviewcli/client/_share.py +272 -0
- purviewcli/client/_types.py +2708 -0
- purviewcli/client/_unified_catalog.py +5112 -0
- purviewcli/client/_workflow.py +2734 -0
- purviewcli/client/api_client.py +1295 -0
- purviewcli/client/business_rules.py +675 -0
- purviewcli/client/config.py +231 -0
- purviewcli/client/data_quality.py +433 -0
- purviewcli/client/endpoint.py +123 -0
- purviewcli/client/endpoints.py +554 -0
- purviewcli/client/exceptions.py +38 -0
- purviewcli/client/lineage_visualization.py +797 -0
- purviewcli/client/monitoring_dashboard.py +712 -0
- purviewcli/client/rate_limiter.py +30 -0
- purviewcli/client/retry_handler.py +125 -0
- purviewcli/client/scanning_operations.py +523 -0
- purviewcli/client/settings.py +1 -0
- purviewcli/client/sync_client.py +250 -0
- purviewcli/plugins/__init__.py +1 -0
- purviewcli/plugins/plugin_system.py +709 -0
- pvw_cli-1.2.8.dist-info/METADATA +1618 -0
- pvw_cli-1.2.8.dist-info/RECORD +60 -0
- pvw_cli-1.2.8.dist-info/WHEEL +5 -0
- pvw_cli-1.2.8.dist-info/entry_points.txt +3 -0
- pvw_cli-1.2.8.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,1295 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Microsoft Purview API Client
|
|
3
|
+
Supports the latest Microsoft Purview REST API specifications with comprehensive automation capabilities
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import json
|
|
7
|
+
import asyncio
|
|
8
|
+
try:
|
|
9
|
+
import aiohttp
|
|
10
|
+
except Exception:
|
|
11
|
+
aiohttp = None
|
|
12
|
+
import pandas as pd
|
|
13
|
+
from typing import Dict, List, Optional, Union, Any
|
|
14
|
+
from dataclasses import dataclass
|
|
15
|
+
from azure.identity.aio import DefaultAzureCredential
|
|
16
|
+
from azure.core.exceptions import ClientAuthenticationError
|
|
17
|
+
import logging
|
|
18
|
+
from datetime import datetime
|
|
19
|
+
import os
|
|
20
|
+
import sys
|
|
21
|
+
from .endpoints import ENDPOINTS, DATAMAP_API_VERSION, format_endpoint, get_api_version_params
|
|
22
|
+
|
|
23
|
+
logger = logging.getLogger(__name__)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass
class PurviewConfig:
    """Settings consumed by PurviewClient: target account, cloud and request tuning."""

    # Purview account name; becomes the host-name prefix of the account endpoint.
    account_name: str

    # Identity fields. They are carried on the config but are not read by the
    # client code visible in this part of the file.
    tenant_id: Optional[str] = None
    client_id: Optional[str] = None
    client_secret: Optional[str] = None

    # Cloud selector: "china" or "usgov" (case-insensitive) choose the sovereign
    # endpoints; any other value (or None) selects the public cloud.
    azure_region: Optional[str] = None

    # Number of attempts made for each HTTP request.
    max_retries: int = 3

    # Value passed as the total timeout of the aiohttp session.
    timeout: int = 30

    # Number of entities sent per bulk request.
    batch_size: int = 100
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class PurviewClient:
|
|
41
|
+
"""Purview API Client with comprehensive automation support"""
|
|
42
|
+
|
|
43
|
+
def __init__(self, config: PurviewConfig):
|
|
44
|
+
self.config = config
|
|
45
|
+
self._token = None
|
|
46
|
+
self._credential = None
|
|
47
|
+
self._session = None
|
|
48
|
+
self._setup_endpoints()
|
|
49
|
+
|
|
50
|
+
def _setup_endpoints(self):
|
|
51
|
+
"""Setup API endpoints based on Azure region"""
|
|
52
|
+
if self.config.azure_region and self.config.azure_region.lower() == "china":
|
|
53
|
+
self.purview_endpoint = f"https://{self.config.account_name}.purview.azure.cn"
|
|
54
|
+
self.management_endpoint = "https://management.chinacloudapi.cn"
|
|
55
|
+
self.auth_scope = "https://purview.azure.cn/.default"
|
|
56
|
+
elif self.config.azure_region and self.config.azure_region.lower() == "usgov":
|
|
57
|
+
self.purview_endpoint = f"https://{self.config.account_name}.purview.azure.us"
|
|
58
|
+
self.management_endpoint = "https://management.usgovcloudapi.net"
|
|
59
|
+
self.auth_scope = "https://purview.azure.us/.default"
|
|
60
|
+
else:
|
|
61
|
+
self.purview_endpoint = f"https://{self.config.account_name}.purview.azure.com"
|
|
62
|
+
self.management_endpoint = "https://management.azure.com"
|
|
63
|
+
self.auth_scope = "https://purview.azure.net/.default"
|
|
64
|
+
|
|
65
|
+
async def __aenter__(self):
    """Enter ``async with``: initialise the session, then return this client."""
    client = self
    await client._initialize_session()
    return client
|
|
69
|
+
|
|
70
|
+
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
|
71
|
+
"""Async context manager exit"""
|
|
72
|
+
if self._session:
|
|
73
|
+
await self._session.close()
|
|
74
|
+
if self._credential:
|
|
75
|
+
await self._credential.close()
|
|
76
|
+
|
|
77
|
+
async def _initialize_session(self):
    """Authenticate with Azure and open the shared aiohttp session.

    A token for ``self.auth_scope`` is requested from DefaultAzureCredential
    and installed as the session's default ``Authorization`` header.

    Raises:
        RuntimeError: If the optional ``aiohttp`` import failed at module load.
        ClientAuthenticationError: If the credential cannot obtain a token.
            It is logged and re-raised; any other error from the token
            request propagates unchanged.
    """
    if aiohttp is None:
        raise RuntimeError(
            "The 'aiohttp' package is required for Purview async operations. "
            "Install it in your environment (e.g. '.venv\\Scripts\\pip.exe install aiohttp' or 'pip install aiohttp')."
        )

    credential = DefaultAzureCredential()
    try:
        token = await credential.get_token(self.auth_scope)
    except Exception as e:
        if isinstance(e, ClientAuthenticationError):
            logger.error(f"Authentication failed: {e}")
        # __aexit__ is not invoked when __aenter__ fails, so the credential
        # has to be released here or it would leak.
        await credential.close()
        raise

    self._credential = credential
    self._token = token.token

    connector = aiohttp.TCPConnector(limit=100, limit_per_host=30)
    timeout = aiohttp.ClientTimeout(total=self.config.timeout)

    self._session = aiohttp.ClientSession(
        connector=connector,
        timeout=timeout,
        headers={
            "Authorization": f"Bearer {self._token}",
            "Content-Type": "application/json",
            "User-Agent": "pvw-cli/2.0",
        },
    )
|
|
105
|
+
|
|
106
|
+
async def _make_request(self, method: str, endpoint: str, **kwargs) -> Dict:
    """Send one request to the Purview endpoint, retrying transient failures.

    Args:
        method: HTTP verb such as "GET" or "POST".
        endpoint: Path appended to ``self.purview_endpoint``.
        **kwargs: Forwarded to ``aiohttp.ClientSession.request``. ``params``
            may carry an explicit ``api-version``; DATAMAP_API_VERSION is
            used only when the caller supplied none.

    Returns:
        The decoded JSON body, or an empty dict for a 204 (no content) reply.

    Raises:
        aiohttp.ClientError: When the last attempt fails, or immediately for
            a failure that a retry cannot fix (for example a 400 or 404).
    """
    url = f"{self.purview_endpoint}{endpoint}"

    # Copy so the caller's dict is never mutated, and tolerate params=None.
    params = dict(kwargs.get("params") or {})
    # Callers such as the account operations pass their own api-version;
    # only fill in the Data Map default when none was given.
    params.setdefault("api-version", DATAMAP_API_VERSION)
    kwargs["params"] = params

    # Always make at least one attempt, even if max_retries is 0 or negative.
    attempts = max(1, self.config.max_retries)

    for attempt in range(attempts):
        try:
            async with self._session.request(method, url, **kwargs) as response:
                response.raise_for_status()
                if response.status == 204:
                    # No body to decode.
                    return {}
                return await response.json()
        except aiohttp.ClientError as e:
            logger.error(f"Request failed on attempt {attempt + 1}: {e}")

            # Only response errors carry a status; connection-level errors do not.
            status = getattr(e, "status", None)
            retryable = status is None or status in (401, 408, 429) or status >= 500
            if attempt == attempts - 1 or not retryable:
                raise

            if status == 401:
                # The bearer token was rejected (typically expired): renew it and retry.
                await self._refresh_token()
            else:
                # Capped exponential backoff so a struggling service is not hammered.
                await asyncio.sleep(min(2 ** attempt, 10))
|
|
122
|
+
|
|
123
|
+
async def _refresh_token(self):
|
|
124
|
+
"""Refresh authentication token"""
|
|
125
|
+
token = await self._credential.get_token(self.auth_scope)
|
|
126
|
+
self._token = token.token
|
|
127
|
+
self._session.headers.update({"Authorization": f"Bearer {self._token}"})
|
|
128
|
+
|
|
129
|
+
# Data Map API Methods
|
|
130
|
+
async def get_entity(self, guid: str, **kwargs) -> Dict:
    """
    Fetch a single entity by its GUID.

    Args:
        guid: GUID substituted into the entity "get" endpoint template.
        **kwargs: Sent as query-string parameters
            (e.g., minExtInfo, ignoreRelationships).

    Returns:
        The decoded JSON response from the service.

    Raises:
        Whatever _make_request raises when the request fails.

    Example:
        entity = await client.get_entity("a1b2c3d4-e5f6-7890-abcd-ef1234567890")
        print(entity["attributes"]["name"])
    """
    path = format_endpoint(ENDPOINTS["entity"]["get"], guid=guid)
    query = kwargs
    response = await self._make_request("GET", path, params=query)
    return response
|
|
156
|
+
|
|
157
|
+
async def create_entity(self, entity_data: Dict) -> Dict:
    """
    Create an entity by POSTing it to the entity create-or-update endpoint.

    Args:
        entity_data: Entity payload, sent as the JSON request body unchanged.
            The payload normally carries:
            - typeName (str): Entity type (e.g., "azure_sql_table", "DataSet")
            - attributes (dict): name, qualifiedName and type-specific attributes
            - optionally classifications and relationshipAttributes

    Returns:
        The decoded JSON response from the service.

    Raises:
        Whatever _make_request raises when the request fails.

    Example:
        entity = await client.create_entity({
            "typeName": "DataSet",
            "attributes": {
                "name": "Sales Data",
                "qualifiedName": "sales_data@tenant",
                "description": "Monthly sales records"
            }
        })
    """
    path = ENDPOINTS["entity"]["create_or_update"]
    created = await self._make_request("POST", path, json=entity_data)
    return created
|
|
189
|
+
|
|
190
|
+
async def update_entity(self, entity_data: Dict) -> Dict:
    """
    Update an entity by PUTting it to the entity create-or-update endpoint.

    Args:
        entity_data: Entity payload, sent as the JSON request body unchanged.
            The payload normally carries:
            - guid (str): GUID of the entity to update
            - typeName (str): Entity type
            - attributes (dict): Updated attributes

    Returns:
        The decoded JSON response from the service.

    Raises:
        Whatever _make_request raises when the request fails.

    Example:
        updated = await client.update_entity({
            "guid": "a1b2c3d4-e5f6-7890-abcd-ef1234567890",
            "typeName": "DataSet",
            "attributes": {"description": "Updated description"}
        })
    """
    # NOTE(review): this targets the same endpoint as create_entity but with
    # PUT instead of POST - confirm the service accepts PUT on this path.
    path = ENDPOINTS["entity"]["create_or_update"]
    updated = await self._make_request("PUT", path, json=entity_data)
    return updated
|
|
216
|
+
|
|
217
|
+
async def delete_entity(self, guid: str) -> Dict:
    """
    Delete the entity identified by *guid*.

    Args:
        guid: GUID substituted into the entity "delete" endpoint template.

    Returns:
        The decoded JSON response from the service.

    Raises:
        Whatever _make_request raises when the request fails.

    Warning:
        Deleting an entity also affects its relationships and lineage.

    Example:
        result = await client.delete_entity("a1b2c3d4-e5f6-7890-abcd-ef1234567890")
    """
    path = format_endpoint(ENDPOINTS["entity"]["delete"], guid=guid)
    outcome = await self._make_request("DELETE", path)
    return outcome
|
|
238
|
+
|
|
239
|
+
async def search_entities(self, query: str, **kwargs) -> Dict:
    """
    Run a keyword search against the discovery query endpoint.

    Args:
        query: Search keywords, sent as the "keywords" field.
        **kwargs: Recognised options (any other keyword is ignored):
            - filter: Filter criteria (default: None)
            - facets: Facets for aggregation (default: None)
            - limit (int): Maximum results to return (default: 50)
            - offset (int): Pagination offset (default: 0)

    Returns:
        The decoded JSON response from the service. Callers in this file
        read the matching entities from its "value" key.

    Example:
        results = await client.search_entities(
            "sales",
            filter={"typeName": "azure_sql_table"},
            limit=100
        )
        for entity in results["value"]:
            print(entity["name"])
    """
    # Option name -> value used when the caller did not pass it.
    option_defaults = (("filter", None), ("facets", None), ("limit", 50), ("offset", 0))

    body = {"keywords": query}
    for option, fallback in option_defaults:
        body[option] = kwargs.get(option, fallback)

    return await self._make_request("POST", ENDPOINTS["discovery"]["query"], json=body)
|
|
276
|
+
|
|
277
|
+
# Batch Operations
|
|
278
|
+
async def batch_create_entities(
|
|
279
|
+
self, entities: List[Dict], progress_callback=None
|
|
280
|
+
) -> List[Dict]:
|
|
281
|
+
"""
|
|
282
|
+
Create multiple entities in batches to avoid API rate limiting and timeouts.
|
|
283
|
+
|
|
284
|
+
Args:
|
|
285
|
+
entities: List of entity dictionaries to create, each containing:
|
|
286
|
+
- typeName (str): Entity type (e.g., "DataSet", "azure_sql_table")
|
|
287
|
+
- attributes (dict): Entity attributes including name, qualifiedName, etc.
|
|
288
|
+
progress_callback: Optional callback function(processed: int, total: int) for progress tracking
|
|
289
|
+
|
|
290
|
+
Returns:
|
|
291
|
+
List of dictionaries containing created entities with assigned GUIDs and system attributes
|
|
292
|
+
|
|
293
|
+
Raises:
|
|
294
|
+
PurviewException: If batch creation fails due to API errors
|
|
295
|
+
ValueError: If entities contain invalid data or missing required fields
|
|
296
|
+
|
|
297
|
+
Example:
|
|
298
|
+
```python
|
|
299
|
+
entities = [
|
|
300
|
+
{"typeName": "DataSet", "attributes": {"name": "dataset1", "qualifiedName": "dataset1@purview"}},
|
|
301
|
+
{"typeName": "DataSet", "attributes": {"name": "dataset2", "qualifiedName": "dataset2@purview"}}
|
|
302
|
+
]
|
|
303
|
+
|
|
304
|
+
def progress(processed, total):
|
|
305
|
+
print(f"Progress: {processed}/{total}")
|
|
306
|
+
|
|
307
|
+
created = await client.batch_create_entities(entities, progress_callback=progress)
|
|
308
|
+
print(f"Created {len(created)} entities")
|
|
309
|
+
```
|
|
310
|
+
|
|
311
|
+
Use Cases:
|
|
312
|
+
- Import large datasets from external systems into Purview
|
|
313
|
+
- Bulk provisioning of data assets during migration
|
|
314
|
+
- Automated asset registration from data discovery tools
|
|
315
|
+
- Periodic synchronization of assets from source systems
|
|
316
|
+
"""
|
|
317
|
+
results = []
|
|
318
|
+
total = len(entities)
|
|
319
|
+
|
|
320
|
+
for i in range(0, total, self.config.batch_size):
|
|
321
|
+
batch = entities[i : i + self.config.batch_size]
|
|
322
|
+
batch_data = {"entities": batch}
|
|
323
|
+
|
|
324
|
+
try:
|
|
325
|
+
result = await self._make_request(
|
|
326
|
+
"POST", ENDPOINTS["entity"]["bulk_create_or_update"], json=batch_data
|
|
327
|
+
)
|
|
328
|
+
results.extend(result.get("mutatedEntities", {}).get("CREATE", []))
|
|
329
|
+
|
|
330
|
+
if progress_callback:
|
|
331
|
+
progress_callback(min(i + self.config.batch_size, total), total)
|
|
332
|
+
|
|
333
|
+
except Exception as e:
|
|
334
|
+
logger.error(f"Batch {i//self.config.batch_size + 1} failed: {e}")
|
|
335
|
+
continue
|
|
336
|
+
|
|
337
|
+
return results
|
|
338
|
+
|
|
339
|
+
async def batch_update_entities(
|
|
340
|
+
self, entities: List[Dict], progress_callback=None
|
|
341
|
+
) -> List[Dict]:
|
|
342
|
+
"""
|
|
343
|
+
Update multiple entities in batches to avoid API rate limiting and timeouts.
|
|
344
|
+
|
|
345
|
+
Args:
|
|
346
|
+
entities: List of entity dictionaries to update, each must include:
|
|
347
|
+
- guid (str): Entity GUID to update
|
|
348
|
+
- attributes (dict): Updated attributes (only changed fields needed)
|
|
349
|
+
progress_callback: Optional callback function(processed: int, total: int) for progress tracking
|
|
350
|
+
|
|
351
|
+
Returns:
|
|
352
|
+
List of dictionaries containing updated entities with modified attributes and timestamps
|
|
353
|
+
|
|
354
|
+
Raises:
|
|
355
|
+
PurviewException: If batch update fails due to API errors
|
|
356
|
+
ValueError: If entities missing GUID or contain invalid data
|
|
357
|
+
|
|
358
|
+
Example:
|
|
359
|
+
```python
|
|
360
|
+
entities = [
|
|
361
|
+
{"guid": "guid-1", "attributes": {"description": "Updated description"}},
|
|
362
|
+
{"guid": "guid-2", "attributes": {"owner": "newowner@company.com"}}
|
|
363
|
+
]
|
|
364
|
+
|
|
365
|
+
updated = await client.batch_update_entities(entities)
|
|
366
|
+
print(f"Updated {len(updated)} entities")
|
|
367
|
+
```
|
|
368
|
+
|
|
369
|
+
Use Cases:
|
|
370
|
+
- Bulk update entity metadata from external systems
|
|
371
|
+
- Apply classification or glossary terms to multiple assets
|
|
372
|
+
- Synchronize ownership or stewardship information
|
|
373
|
+
- Update descriptions and documentation across many entities
|
|
374
|
+
"""
|
|
375
|
+
results = []
|
|
376
|
+
total = len(entities)
|
|
377
|
+
|
|
378
|
+
for i in range(0, total, self.config.batch_size):
|
|
379
|
+
batch = entities[i : i + self.config.batch_size]
|
|
380
|
+
batch_data = {"entities": batch}
|
|
381
|
+
|
|
382
|
+
try:
|
|
383
|
+
result = await self._make_request(
|
|
384
|
+
"PUT", ENDPOINTS["entity"]["bulk_create_or_update"], json=batch_data
|
|
385
|
+
)
|
|
386
|
+
results.extend(result.get("mutatedEntities", {}).get("UPDATE", []))
|
|
387
|
+
|
|
388
|
+
if progress_callback:
|
|
389
|
+
progress_callback(min(i + self.config.batch_size, total), total)
|
|
390
|
+
|
|
391
|
+
except Exception as e:
|
|
392
|
+
logger.error(f"Batch {i//self.config.batch_size + 1} failed: {e}")
|
|
393
|
+
continue
|
|
394
|
+
|
|
395
|
+
return results
|
|
396
|
+
|
|
397
|
+
# CSV Import/Export Methods
|
|
398
|
+
async def import_entities_from_csv(self, csv_file_path: str, mapping_config: Dict) -> Dict:
|
|
399
|
+
"""
|
|
400
|
+
Import entities from CSV file using column-to-attribute mapping configuration.
|
|
401
|
+
|
|
402
|
+
Args:
|
|
403
|
+
csv_file_path: Path to CSV file containing entity data
|
|
404
|
+
mapping_config: Dictionary specifying how to map CSV columns to entity attributes:
|
|
405
|
+
- typeName (str): Entity type for all imported entities
|
|
406
|
+
- attributes (dict): Mapping of CSV column names to entity attribute names
|
|
407
|
+
|
|
408
|
+
Returns:
|
|
409
|
+
Dict containing import results with created entity GUIDs
|
|
410
|
+
|
|
411
|
+
Raises:
|
|
412
|
+
FileNotFoundError: If CSV file doesn't exist
|
|
413
|
+
ValueError: If mapping_config is invalid or CSV has missing required columns
|
|
414
|
+
|
|
415
|
+
Example:
|
|
416
|
+
```python
|
|
417
|
+
mapping = {
|
|
418
|
+
"typeName": "azure_sql_table",
|
|
419
|
+
"attributes": {
|
|
420
|
+
"table_name": "name",
|
|
421
|
+
"schema_name": "schema",
|
|
422
|
+
"table_description": "description"
|
|
423
|
+
}
|
|
424
|
+
}
|
|
425
|
+
results = await client.import_entities_from_csv("tables.csv", mapping)
|
|
426
|
+
print(f"Imported {len(results)} entities")
|
|
427
|
+
```
|
|
428
|
+
|
|
429
|
+
Use Cases:
|
|
430
|
+
- Bulk import assets from external catalogs or CMDBs
|
|
431
|
+
- Migrate metadata from legacy systems to Purview
|
|
432
|
+
- Load entity data from Excel/CSV exports
|
|
433
|
+
- Automate asset registration from data discovery tools
|
|
434
|
+
"""
|
|
435
|
+
df = pd.read_csv(csv_file_path)
|
|
436
|
+
entities = []
|
|
437
|
+
|
|
438
|
+
for _, row in df.iterrows():
|
|
439
|
+
entity = self._map_csv_row_to_entity(row, mapping_config)
|
|
440
|
+
if entity:
|
|
441
|
+
entities.append(entity)
|
|
442
|
+
|
|
443
|
+
return await self.batch_create_entities(entities)
|
|
444
|
+
|
|
445
|
+
async def export_entities_to_csv(
    self, query: str, csv_file_path: str, columns: Optional[List[str]] = None
) -> str:
    """
    Search for entities and write the flattened results to a CSV file.

    Only one search request with ``limit=1000`` is made, so at most the
    first 1000 matches are exported.

    Args:
        query: Search keywords passed to search_entities.
        csv_file_path: Output CSV file path.
        columns: Optional list of column names to write, in that order.
            A requested column that no result provides is written as an
            empty column (and logged), so the output schema is always
            exactly what was asked for. Default: all available columns.

    Returns:
        "No entities found" when the search returns nothing (no file is
        written), otherwise a message with the exported count and the path.

    Example:
        ```python
        message = await client.export_entities_to_csv(
            "type:DataSet",
            "datasets.csv",
            columns=["guid", "name", "typeName", "attr_owner"]
        )
        print(message)  # "Exported 150 entities to datasets.csv"
        ```
    """
    search_results = await self.search_entities(query, limit=1000)
    entities = search_results.get("value", [])

    if not entities:
        return "No entities found"

    # One flat dict per entity becomes one CSV row.
    df = pd.DataFrame([self._flatten_entity(entity) for entity in entities])

    if columns:
        missing = [col for col in columns if col not in df.columns]
        if missing:
            logger.warning(
                "Requested columns not present in the search results; exporting them empty: %s",
                ", ".join(missing),
            )
        # reindex keeps the requested order and adds absent columns as empty,
        # instead of silently exporting every column when one is missing.
        df = df.reindex(columns=columns)

    df.to_csv(csv_file_path, index=False)
    return f"Exported {len(entities)} entities to {csv_file_path}"
|
|
499
|
+
|
|
500
|
+
def _map_csv_row_to_entity(self, row: pd.Series, mapping_config: Dict) -> Dict:
|
|
501
|
+
"""Map CSV row to Purview entity format"""
|
|
502
|
+
try:
|
|
503
|
+
entity = {"typeName": mapping_config.get("typeName", "DataSet"), "attributes": {}}
|
|
504
|
+
|
|
505
|
+
# Map CSV columns to entity attributes
|
|
506
|
+
for csv_col, attr_name in mapping_config.get("attributes", {}).items():
|
|
507
|
+
if csv_col in row and pd.notna(row[csv_col]):
|
|
508
|
+
entity["attributes"][attr_name] = row[csv_col]
|
|
509
|
+
|
|
510
|
+
# Add required attributes if not present
|
|
511
|
+
if "name" not in entity["attributes"] and "name" in row:
|
|
512
|
+
entity["attributes"]["name"] = row["name"]
|
|
513
|
+
|
|
514
|
+
if "qualifiedName" not in entity["attributes"]:
|
|
515
|
+
entity["attributes"][
|
|
516
|
+
"qualifiedName"
|
|
517
|
+
] = f"{row.get('name', 'unnamed')}@{self.config.account_name}"
|
|
518
|
+
|
|
519
|
+
return entity
|
|
520
|
+
except Exception as e:
|
|
521
|
+
logger.error(f"Failed to map row to entity: {e}")
|
|
522
|
+
return None
|
|
523
|
+
|
|
524
|
+
def _flatten_entity(self, entity: Dict) -> Dict:
|
|
525
|
+
"""Flatten entity structure for CSV export"""
|
|
526
|
+
flat = {
|
|
527
|
+
"guid": entity.get("guid"),
|
|
528
|
+
"typeName": entity.get("typeName"),
|
|
529
|
+
"status": entity.get("status"),
|
|
530
|
+
}
|
|
531
|
+
|
|
532
|
+
# Flatten attributes
|
|
533
|
+
attributes = entity.get("attributes", {})
|
|
534
|
+
for key, value in attributes.items():
|
|
535
|
+
if isinstance(value, (str, int, float, bool)):
|
|
536
|
+
flat[f"attr_{key}"] = value
|
|
537
|
+
elif isinstance(value, list) and value:
|
|
538
|
+
flat[f"attr_{key}"] = ", ".join(str(v) for v in value)
|
|
539
|
+
|
|
540
|
+
return flat # Glossary Operations
|
|
541
|
+
|
|
542
|
+
async def get_glossary_terms(self, glossary_guid: str = None) -> List[Dict]:
    """
    Query the glossary endpoints.

    Args:
        glossary_guid: Optional glossary GUID. When given, the GUID is
            appended to the glossary "terms" endpoint; when omitted (or
            empty), the glossary "base" endpoint is queried instead.

    Returns:
        The decoded JSON response from the service.

    Example:
        # Query the base glossary endpoint
        all_terms = await client.get_glossary_terms()

        # Get terms from specific glossary
        glossary_terms = await client.get_glossary_terms("glossary-guid-123")
    """
    if not glossary_guid:
        path = ENDPOINTS["glossary"]["base"]
    else:
        path = f"{ENDPOINTS['glossary']['terms']}/{glossary_guid}"

    return await self._make_request("GET", path)
|
|
578
|
+
|
|
579
|
+
async def create_glossary_term(self, term_data: Dict) -> Dict:
    """
    Create a glossary term by POSTing it to the glossary "term" endpoint.

    Args:
        term_data: Term payload, sent as the JSON request body unchanged.
            The payload normally carries:
            - name (str): Term display name
            - glossaryGuid (str): Parent glossary GUID
            - optional fields such as qualifiedName, definition,
              abbreviation, status, nickName, examples, resources,
              contacts and attributes

    Returns:
        The decoded JSON response from the service.

    Raises:
        Whatever _make_request raises when the request fails.

    Example:
        term = await client.create_glossary_term({
            "name": "Customer",
            "glossaryGuid": "glossary-guid-123",
            "definition": "An individual or organization that purchases goods or services",
            "status": "Approved",
            "abbreviation": "CUST",
            "examples": ["Enterprise customer", "Retail customer"]
        })
        print(f"Created term: {term['guid']}")
    """
    path = ENDPOINTS["glossary"]["term"]
    created_term = await self._make_request("POST", path, json=term_data)
    return created_term
|
|
616
|
+
|
|
617
|
+
async def assign_term_to_entities(self, term_guid: str, entity_guids: List[str]) -> Dict:
    """
    Assign one glossary term to several entities.

    The term GUID is appended to the "term_assigned_entities" endpoint and
    the request body carries both the term GUID and the entity GUID list.

    Args:
        term_guid: GUID of the glossary term to assign.
        entity_guids: GUIDs of the entities to tag with the term.

    Returns:
        The decoded JSON response from the service.

    Raises:
        Whatever _make_request raises when the request fails.

    Example:
        result = await client.assign_term_to_entities(
            term_guid="term-guid-abc",
            entity_guids=["table-guid-1", "table-guid-2", "table-guid-3"]
        )
    """
    path = f"{ENDPOINTS['glossary']['term_assigned_entities']}/{term_guid}"
    payload = {"termGuid": term_guid, "entityGuids": entity_guids}
    return await self._make_request("POST", path, json=payload)
|
|
650
|
+
|
|
651
|
+
# Data Estate Insights
|
|
652
|
+
async def get_asset_distribution(self) -> Dict:
    """
    Fetch asset distribution data by issuing a GET to the browse endpoint.

    Returns:
        The decoded JSON response from the service.

    Raises:
        Whatever _make_request raises when the request fails.

    Example:
        ```python
        distribution = await client.get_asset_distribution()
        print(f"Total assets: {distribution.get('totalAssets', 0)}")
        ```
    """
    browse_path = "/mapanddiscover/api/browse"
    distribution = await self._make_request("GET", browse_path)
    return distribution
|
|
681
|
+
|
|
682
|
+
# === ACCOUNT MANAGEMENT (Official API Operations) === async def get_account_properties(self) -> Dict:
|
|
683
|
+
"""Get Account Properties - Official API Operation"""
|
|
684
|
+
params = get_api_version_params("account")
|
|
685
|
+
return await self._make_request("GET", ENDPOINTS["account"]["account"], params=params)
|
|
686
|
+
|
|
687
|
+
async def update_account_properties(self, account_data: Dict) -> Dict:
    """
    Patch the Microsoft Purview account with the supplied property values.

    The payload is forwarded as the JSON body of a ``PATCH`` request to
    ``ENDPOINTS["account"]["account_update"]``; no validation happens here.

    Args:
        account_data: Properties to change. Keys the service is expected to
            understand include ``friendlyName`` (str), ``publicNetworkAccess``
            ("Enabled" / "Disabled"), ``managedResourceGroupName`` (str) and
            ``tags`` (dict of Azure resource tags).

    Returns:
        Dict: The response from ``self._make_request`` (expected to describe
        the updated account: name, id, location, sku, properties, systemData).

    Raises:
        Any error raised by ``self._make_request`` propagates unchanged.

    Example:
        ```python
        updated = await client.update_account_properties({
            "friendlyName": "Production Data Catalog",
            "publicNetworkAccess": "Enabled",
            "tags": {"environment": "production", "department": "data"}
        })
        print(f"Account updated: {updated['properties']['friendlyName']}")
        ```
    """
    version_params = get_api_version_params("account")
    patch_endpoint = ENDPOINTS["account"]["account_update"]
    response = await self._make_request(
        "PATCH", patch_endpoint, json=account_data, params=version_params
    )
    return response
|
|
728
|
+
|
|
729
|
+
async def get_access_keys(self) -> Dict:
    """
    Request the access keys of the Purview account.

    Performs a body-less ``POST`` to ``ENDPOINTS["account"]["access_keys"]``
    with the account API-version query parameters.

    Returns:
        Dict: The response from ``self._make_request``. It is expected to
        carry ``atlasKafkaPrimaryEndpoint`` and ``atlasKafkaSecondaryEndpoint``
        entries; this method does not inspect them.

    Raises:
        Any error raised by ``self._make_request`` propagates unchanged.

    Example:
        ```python
        keys = await client.get_access_keys()
        primary_key = keys.get('atlasKafkaPrimaryEndpoint')
        print(f"Primary endpoint: {primary_key}")
        ```
    """
    version_params = get_api_version_params("account")
    keys_endpoint = ENDPOINTS["account"]["access_keys"]
    response = await self._make_request("POST", keys_endpoint, params=version_params)
    return response
|
|
757
|
+
|
|
758
|
+
async def regenerate_access_key(self, key_data: Dict) -> Dict:
    """
    Ask the service to regenerate one of the account's access keys.

    ``key_data`` is sent verbatim as the JSON body of a ``POST`` to
    ``ENDPOINTS["account"]["regenerate_access_key"]``.

    Args:
        key_data: Selector for the key to rotate, e.g.
            ``{"keyType": "PrimaryAtlasKafkaKey"}`` or
            ``{"keyType": "SecondaryAtlasKafkaKey"}``. The value is not
            validated locally.

    Returns:
        Dict: The response from ``self._make_request`` (expected to contain
        the regenerated key information).

    Raises:
        Any error raised by ``self._make_request`` propagates unchanged.

    Example:
        ```python
        new_key = await client.regenerate_access_key({
            "keyType": "PrimaryAtlasKafkaKey"
        })
        print(f"Primary key regenerated: {new_key['atlasKafkaPrimaryEndpoint']}")
        ```
    """
    version_params = get_api_version_params("account")
    regenerate_endpoint = ENDPOINTS["account"]["regenerate_access_key"]
    response = await self._make_request(
        "POST", regenerate_endpoint, json=key_data, params=version_params
    )
    return response
|
|
793
|
+
|
|
794
|
+
# === COLLECTIONS MANAGEMENT (Official API Operations) ===
|
|
795
|
+
|
|
796
|
+
async def list_collections(self) -> List[Dict]:
    """
    Retrieve the collections defined in the Purview account.

    Sends a ``GET`` to ``ENDPOINTS["collections"]["list"]`` with the
    collections API-version query parameters and returns the raw result.

    Returns:
        The response from ``self._make_request``. Each collection entry is
        expected to expose ``name``, ``friendlyName``, ``description``,
        ``collectionProvisioningState``, ``parentCollection`` and
        ``systemData``.
        NOTE(review): ``export_collections_to_csv`` in this file reads a
        ``"value"`` key from this result, which suggests the payload is a
        dict envelope rather than the annotated ``List[Dict]`` -- confirm.

    Example:
        collections = await client.list_collections()
        for col in collections:
            print(f"{col['friendlyName']} ({col['name']})")
    """
    version_params = get_api_version_params("collections")
    list_endpoint = ENDPOINTS["collections"]["list"]
    response = await self._make_request("GET", list_endpoint, params=version_params)
    return response
|
|
820
|
+
|
|
821
|
+
async def get_collection(self, collection_name: str) -> Dict:
    """
    Look up a single collection by its unique name.

    The name is substituted into ``ENDPOINTS["collections"]["get"]`` via
    ``format_endpoint`` and the resulting URL is fetched with ``GET``.

    Args:
        collection_name: Unique collection name (not the friendlyName).

    Returns:
        Dict: The response from ``self._make_request`` (expected to hold the
        collection's name, friendlyName, description, parent reference and
        provisioning state).

    Raises:
        Any error raised by ``self._make_request`` propagates unchanged.

    Example:
        collection = await client.get_collection("myorg-finance")
        print(f"Collection: {collection['friendlyName']}")
        print(f"Description: {collection.get('description', 'N/A')}")
    """
    url_template = ENDPOINTS["collections"]["get"]
    resolved_endpoint = format_endpoint(url_template, collectionName=collection_name)
    version_params = get_api_version_params("collections")
    response = await self._make_request("GET", resolved_endpoint, params=version_params)
    return response
|
|
843
|
+
|
|
844
|
+
async def create_collection(self, collection_name: str, collection_data: Dict) -> Dict:
    """
    Create a collection by PUT-ing its definition to the collections API.

    Uses the ``create_or_update`` endpoint template, so the request itself is
    an upsert: nothing in this method checks whether the name already exists.

    Args:
        collection_name: Unique collection name used in the URL (no spaces).
        collection_data: JSON body describing the collection, typically:
            - friendlyName (str): display name
            - description (str): optional description
            - parentCollection (dict): ``{"referenceName": "parent-name"}``

    Returns:
        Dict: The response from ``self._make_request`` (expected to be the
        stored collection including system-assigned properties).

    Raises:
        Any error raised by ``self._make_request`` propagates unchanged.

    Example:
        ```python
        collection = await client.create_collection("finance-data", {
            "friendlyName": "Finance Data Collection",
            "description": "All financial datasets and reports",
            "parentCollection": {"referenceName": "myorg"}
        })
        print(f"Created: {collection['name']}")
        ```
    """
    url_template = ENDPOINTS["collections"]["create_or_update"]
    resolved_endpoint = format_endpoint(url_template, collectionName=collection_name)
    version_params = get_api_version_params("collections")
    response = await self._make_request(
        "PUT", resolved_endpoint, json=collection_data, params=version_params
    )
    return response
|
|
883
|
+
|
|
884
|
+
async def update_collection(self, collection_name: str, collection_data: Dict) -> Dict:
    """
    Update a collection by PUT-ing new property values.

    Shares the ``create_or_update`` endpoint template with
    ``create_collection``; the request is therefore an upsert and this method
    does not verify that the collection exists beforehand.

    Args:
        collection_name: Unique name of the collection to update.
        collection_data: JSON body with the fields to store, typically:
            - friendlyName (str): new display name
            - description (str): updated description
            - parentCollection (dict): new parent when moving in the hierarchy

    Returns:
        Dict: The response from ``self._make_request`` (expected to be the
        updated collection).

    Raises:
        Any error raised by ``self._make_request`` propagates unchanged.

    Example:
        ```python
        updated = await client.update_collection("finance-data", {
            "friendlyName": "Finance & Accounting Data",
            "description": "Updated: All financial and accounting datasets"
        })
        print(f"Updated: {updated['friendlyName']}")
        ```
    """
    url_template = ENDPOINTS["collections"]["create_or_update"]
    resolved_endpoint = format_endpoint(url_template, collectionName=collection_name)
    version_params = get_api_version_params("collections")
    response = await self._make_request(
        "PUT", resolved_endpoint, json=collection_data, params=version_params
    )
    return response
|
|
922
|
+
|
|
923
|
+
async def create_or_update_collection(
    self, collection_name: str, collection_data: Dict
) -> Dict:
    """
    Upsert a collection: create it when absent, overwrite it when present.

    Args:
        collection_name: Unique name of the collection.
        collection_data: JSON body with the collection properties
            (friendlyName, description, parentCollection, ...).

    Returns:
        Dict: The response from ``self._make_request`` (expected to be the
        created or updated collection).

    Raises:
        Any error raised by ``self._make_request`` propagates unchanged.

    Example:
        ```python
        collection = await client.create_or_update_collection("finance-data", {
            "friendlyName": "Finance Data",
            "description": "Financial datasets"
        })
        ```
    """
    url_template = ENDPOINTS["collections"]["create_or_update"]
    resolved_endpoint = format_endpoint(url_template, collectionName=collection_name)
    version_params = get_api_version_params("collections")
    response = await self._make_request(
        "PUT", resolved_endpoint, json=collection_data, params=version_params
    )
    return response
|
|
959
|
+
|
|
960
|
+
async def delete_collection(self, collection_name: str) -> Dict:
    """
    Remove a collection from the Purview account.

    Resolves ``ENDPOINTS["collections"]["delete"]`` for the given name and
    issues a ``DELETE`` request against it.

    Args:
        collection_name: Unique name of the collection to delete.

    Returns:
        Dict: The response from ``self._make_request`` (typically empty on
        success, per the service's behaviour).

    Raises:
        Any error raised by ``self._make_request`` propagates unchanged.

    Example:
        ```python
        await client.delete_collection("finance-data")
        print("Collection deleted successfully")
        ```
    """
    url_template = ENDPOINTS["collections"]["delete"]
    resolved_endpoint = format_endpoint(url_template, collectionName=collection_name)
    version_params = get_api_version_params("collections")
    response = await self._make_request("DELETE", resolved_endpoint, params=version_params)
    return response
|
|
991
|
+
|
|
992
|
+
async def get_collection_path(self, collection_name: str) -> Dict:
    """
    Fetch the ancestry chain of a collection.

    Resolves ``ENDPOINTS["collections"]["get_collection_path"]`` for the
    given name and retrieves it with ``GET``.

    Args:
        collection_name: Unique name of the collection.

    Returns:
        Dict: The response from ``self._make_request``, expected to contain:
            - parentFriendlyNameChain (list): friendly names ordered from root
            - parentNameChain (list): collection names ordered from root

    Raises:
        Any error raised by ``self._make_request`` propagates unchanged.

    Example:
        ```python
        path = await client.get_collection_path("finance-reports")
        print(" > ".join(path['parentFriendlyNameChain']))
        # Output: "Root > Finance > Reports"
        ```
    """
    url_template = ENDPOINTS["collections"]["get_collection_path"]
    resolved_endpoint = format_endpoint(url_template, collectionName=collection_name)
    version_params = get_api_version_params("collections")
    response = await self._make_request("GET", resolved_endpoint, params=version_params)
    return response
|
|
1026
|
+
|
|
1027
|
+
async def get_child_collection_names(self, collection_name: str) -> List[str]:
    """
    Retrieve the immediate children of a collection.

    Resolves ``ENDPOINTS["collections"]["get_child_collection_names"]`` for
    the parent name and retrieves it with ``GET``.

    Args:
        collection_name: Unique name of the parent collection.

    Returns:
        The response from ``self._make_request``, returned as-is.
        NOTE(review): the annotation promises ``List[str]`` but nothing here
        unwraps the service payload -- confirm the actual shape with callers.

    Raises:
        Any error raised by ``self._make_request`` propagates unchanged.

    Example:
        ```python
        children = await client.get_child_collection_names("finance")
        for child in children:
            print(f"Child collection: {child}")
        ```
    """
    url_template = ENDPOINTS["collections"]["get_child_collection_names"]
    resolved_endpoint = format_endpoint(url_template, collectionName=collection_name)
    version_params = get_api_version_params("collections")
    response = await self._make_request("GET", resolved_endpoint, params=version_params)
    return response
|
|
1060
|
+
|
|
1061
|
+
# Lineage Operations
|
|
1062
|
+
async def get_lineage(self, guid: str, direction: str = "BOTH", depth: int = 3) -> Dict:
    """
    Retrieve the lineage graph around one entity.

    Appends ``guid`` to ``ENDPOINTS["lineage"]["lineage"]`` and issues a
    ``GET`` with ``direction`` and ``depth`` as query parameters. Neither
    argument is validated locally.

    Args:
        guid: GUID of the entity whose lineage is requested.
        direction: Which side of the graph to return:
            - "INPUT": upstream sources
            - "OUTPUT": downstream consumers
            - "BOTH": both sides (default)
        depth: Number of hops to traverse (default 3). Larger values can
            produce large responses.

    Returns:
        Dict: The response from ``self._make_request``, expected to include
        ``baseEntityGuid``, ``guidEntityMap``, ``relations``, ``widthCounts``,
        ``lineageDirection`` and ``lineageDepth``.

    Raises:
        Any error raised by ``self._make_request`` propagates unchanged.

    Example:
        lineage = await client.get_lineage(
            guid="table-guid-abc",
            direction="BOTH",
            depth=5
        )
        for rel in lineage["relations"]:
            source = lineage["guidEntityMap"][rel["fromEntityId"]]
            print(f"Source: {source['displayName']}")
    """
    query = {"direction": direction, "depth": depth}
    lineage_base = ENDPOINTS['lineage']['lineage']
    entity_endpoint = f"{lineage_base}/{guid}"
    response = await self._make_request("GET", entity_endpoint, params=query)
    return response
|
|
1109
|
+
|
|
1110
|
+
async def create_lineage(self, lineage_data: Dict) -> Dict:
    """
    Register a lineage (process) relationship between entities.

    ``lineage_data`` is posted unchanged as the JSON body to
    ``ENDPOINTS["lineage"]["lineage"]``. Useful for documenting custom data
    flows or ETL steps that scanners do not discover.

    Args:
        lineage_data: Process entity definition, typically:
            - typeName (str): process type, e.g. "Process"
            - attributes (dict): ``name``, ``qualifiedName``, ``inputs``
              (list of entity references) and ``outputs`` (list of entity
              references)

    Returns:
        Dict: The response from ``self._make_request`` (expected to describe
        the created lineage process entity).

    Raises:
        Any error raised by ``self._make_request`` propagates unchanged.

    Example:
        lineage = await client.create_lineage({
            "typeName": "Process",
            "attributes": {
                "name": "Daily Sales ETL",
                "qualifiedName": "etl_sales_daily@tenant",
                "inputs": [{"guid": "source-table-guid"}],
                "outputs": [{"guid": "target-table-guid"}]
            }
        })
    """
    lineage_endpoint = ENDPOINTS["lineage"]["lineage"]
    response = await self._make_request("POST", lineage_endpoint, json=lineage_data)
    return response
|
|
1146
|
+
|
|
1147
|
+
# === CSV IMPORT/EXPORT OPERATIONS ===
|
|
1148
|
+
|
|
1149
|
+
async def import_collections_from_csv(self, csv_file_path: str, progress_callback=None) -> Dict:
    """
    Import Collections from CSV file.

    Reads the CSV with pandas and calls ``self.create_collection`` once per
    row. Rows are processed sequentially and a failing row never aborts the
    import; its error is recorded in the returned summary instead.

    Expected columns:
        - collectionName (required): unique collection name
        - friendlyName (required column; an empty cell falls back to the
          collection name)
        - description (optional, defaults to "")
        - parentCollection (optional, defaults to "root")

    Args:
        csv_file_path: Path of the CSV file to read.
        progress_callback: Optional callable ``(processed, total)`` invoked
            after every row, whether the row succeeded or failed. Exceptions
            raised by the callback propagate to the caller.

    Returns:
        Dict with ``total_processed``, ``successful``, ``failed`` and
        ``details`` (one entry per row carrying ``row``, ``collectionName``,
        ``status`` and either ``result`` or ``error``).

    Raises:
        ValueError: If the file does not exist, cannot be parsed, or lacks a
            required column.
    """
    import pandas as pd

    if not os.path.exists(csv_file_path):
        raise ValueError(f"CSV file not found: {csv_file_path}")

    try:
        # Read every column as text: the values are names and descriptions,
        # and numpy numeric scalars would not survive JSON serialisation.
        df = pd.read_csv(csv_file_path, dtype=str)
    except Exception as e:
        raise ValueError(f"Failed to read CSV file: {str(e)}") from e

    # Validate required columns
    required_columns = ["collectionName", "friendlyName"]
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise ValueError(
            f"Missing required columns: {missing_columns}. Required: {required_columns}"
        )

    def _cell(row, column, default):
        # pandas reports an empty cell as NaN, so ``row.get`` alone would hand
        # NaN to the API instead of the intended default.
        value = row.get(column, default)
        return default if pd.isna(value) else value

    results = []
    total_rows = len(df)

    for row_number, (_, row) in enumerate(df.iterrows(), start=1):
        collection_name = _cell(row, "collectionName", None)
        try:
            if collection_name is None:
                raise ValueError("collectionName is empty")

            collection_data = {
                "friendlyName": _cell(row, "friendlyName", collection_name),
                "description": _cell(row, "description", ""),
                "parentCollection": {
                    "referenceName": _cell(row, "parentCollection", "root")
                },
            }

            # Create the collection
            result = await self.create_collection(collection_name, collection_data)
        except Exception as e:
            results.append(
                {
                    "row": row_number,
                    "collectionName": "unknown" if collection_name is None else collection_name,
                    "status": "error",
                    "error": str(e),
                }
            )
        else:
            results.append(
                {
                    "row": row_number,
                    "collectionName": collection_name,
                    "status": "success",
                    "result": result,
                }
            )

        # Report progress outside the try block so a callback failure is not
        # misrecorded as a row failure, and so failed rows still advance it.
        if progress_callback:
            progress_callback(row_number, total_rows)

    return {
        "total_processed": len(results),
        "successful": len([r for r in results if r["status"] == "success"]),
        "failed": len([r for r in results if r["status"] == "error"]),
        "details": results,
    }
|
|
1211
|
+
|
|
1212
|
+
async def export_collections_to_csv(
    self, csv_file_path: str, include_hierarchy: bool = True, include_metadata: bool = True
) -> str:
    """
    Export Collections to CSV file.

    Fetches all collections via ``self.list_collections`` and writes one CSV
    row per collection with pandas.

    Args:
        csv_file_path: Destination path of the CSV file.
        include_hierarchy: When True, adds ``collectionPath`` and ``level``
            columns derived from ``self.get_collection_path``. The lookup is
            best-effort: on failure the columns keep their defaults
            ("" and 0) and the export continues.
        include_metadata: When True, adds ``systemData_createdAt``,
            ``systemData_lastModifiedAt`` and ``systemData_createdBy``.

    Returns:
        str: A human-readable status message (success summary, or
        "No collections found to export").

    Raises:
        Exception: If listing the collections or writing the file fails; the
            original error is attached as ``__cause__``.
    """
    import pandas as pd

    try:
        # Get all collections
        collections_data = await self.list_collections()

        if not collections_data:
            return "No collections found to export"

        # list_collections is annotated as returning a list while the service
        # payload is a {"value": [...]} envelope; accept either shape.
        if isinstance(collections_data, list):
            collections = collections_data
        elif "value" in collections_data:
            collections = collections_data["value"]
        else:
            return "No collections found to export"

        export_data = []

        for collection in collections:
            # ``or {}`` also covers keys that are present but null.
            parent = collection.get("parentCollection") or {}
            system_data = collection.get("systemData") or {}

            row_data = {
                "collectionName": collection.get("name", ""),
                "friendlyName": collection.get("friendlyName", ""),
                "description": collection.get("description", ""),
                "parentCollection": parent.get("referenceName", "root"),
            }

            if include_hierarchy:
                # Defaults first, so every row has both columns even when the
                # collection has no name or the path lookup fails.
                row_data["collectionPath"] = ""
                row_data["level"] = 0
                if collection.get("name"):
                    try:
                        path_data = await self.get_collection_path(collection["name"])
                        # The path response documents the *Chain keys;
                        # "path" is kept as a fallback for older payloads.
                        chain = (
                            path_data.get("parentFriendlyNameChain")
                            or path_data.get("parentNameChain")
                            or path_data.get("path")
                            or []
                        )
                        row_data["collectionPath"] = " > ".join(chain)
                        # assumes the chain runs from the root down to this
                        # collection, so the root is level 0 -- TODO confirm
                        row_data["level"] = max(len(chain) - 1, 0)
                    except Exception:
                        # Best-effort only. ``Exception`` (not a bare except)
                        # lets task cancellation and Ctrl-C propagate.
                        row_data["collectionPath"] = ""
                        row_data["level"] = 0

            if include_metadata:
                row_data["systemData_createdAt"] = system_data.get("createdAt", "")
                row_data["systemData_lastModifiedAt"] = system_data.get("lastModifiedAt", "")
                row_data["systemData_createdBy"] = system_data.get("createdBy", "")

            export_data.append(row_data)

        # Create DataFrame and export to CSV
        df = pd.DataFrame(export_data)
        df.to_csv(csv_file_path, index=False)

        return f"Successfully exported {len(export_data)} collections to {csv_file_path}"

    except Exception as e:
        raise Exception(f"Failed to export collections to CSV: {str(e)}") from e
|
|
1270
|
+
|
|
1271
|
+
|
|
1272
|
+
class BatchOperationProgress:
    """Console progress reporter for batch operations.

    Keeps the latest processed/total counts and rewrites a single status line
    on standard output each time ``update`` is called.
    """

    def __init__(self, total: int, description: str = "Processing"):
        """Remember the expected item count and label, and start the clock."""
        self.description = description
        self.total = total
        self.processed = 0
        # Reference point for the elapsed time shown by update().
        self.start_time = datetime.now()

    def update(self, processed: int, total: int):
        """Update progress.

        Stores both counts, then prints ``<description>: p/t (x%) - Elapsed``
        prefixed with a carriage return so the line overwrites itself. A
        newline is emitted once ``processed`` reaches ``total``.
        """
        self.processed = processed
        self.total = total

        # A non-positive total would divide by zero; report 0% instead.
        if total > 0:
            percentage = (processed / total) * 100
        else:
            percentage = 0
        elapsed = datetime.now() - self.start_time

        status_line = (
            f"\r{self.description}: {processed}/{total} ({percentage:.1f}%) - Elapsed: {elapsed}"
        )
        print(status_line, end="", flush=True)

        if processed >= total:
            print()  # New line when complete
|