acryl-datahub 1.1.0.1rc6__py3-none-any.whl → 1.1.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.1.0.1rc6.dist-info → acryl_datahub-1.1.0.2.dist-info}/METADATA +2522 -2522
- {acryl_datahub-1.1.0.1rc6.dist-info → acryl_datahub-1.1.0.2.dist-info}/RECORD +25 -23
- datahub/_version.py +1 -1
- datahub/api/entities/external/lake_formation_external_entites.py +161 -0
- datahub/api/entities/external/restricted_text.py +10 -10
- datahub/api/entities/external/unity_catalog_external_entites.py +5 -2
- datahub/emitter/rest_emitter.py +29 -4
- datahub/ingestion/graph/client.py +2 -0
- datahub/ingestion/graph/config.py +1 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +20 -10
- datahub/ingestion/sink/datahub_rest.py +1 -0
- datahub/ingestion/source/aws/aws_common.py +4 -0
- datahub/ingestion/source/aws/glue.py +488 -243
- datahub/ingestion/source/aws/tag_entities.py +292 -0
- datahub/ingestion/source/sql/mssql/source.py +207 -18
- datahub/ingestion/source/unity/source.py +2 -3
- datahub/metadata/_internal_schema_classes.py +499 -499
- datahub/metadata/_urns/urn_defs.py +1766 -1766
- datahub/metadata/schema.avsc +17480 -17093
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/main_client.py +3 -3
- {acryl_datahub-1.1.0.1rc6.dist-info → acryl_datahub-1.1.0.2.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.0.1rc6.dist-info → acryl_datahub-1.1.0.2.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.1.0.1rc6.dist-info → acryl_datahub-1.1.0.2.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.0.1rc6.dist-info → acryl_datahub-1.1.0.2.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/aws/tag_entities.py (new file)
@@ -0,0 +1,292 @@
+import logging
+from typing import List, Optional
+
+from pydantic import BaseModel
+
+from datahub.api.entities.external.external_entities import (
+    ExternalEntity,
+    ExternalEntityId,
+    LinkedResourceSet,
+    PlatformResourceRepository,
+)
+from datahub.api.entities.external.lake_formation_external_entites import (
+    LakeFormationTag,
+)
+from datahub.api.entities.platformresource.platform_resource import (
+    PlatformResource,
+    PlatformResourceKey,
+    PlatformResourceSearchFields,
+)
+from datahub.metadata.urns import TagUrn
+from datahub.utilities.search_utils import ElasticDocumentQuery
+from datahub.utilities.urns.urn import Urn
+
+logger = logging.getLogger(__name__)
+
+
+class LakeFormationTagSyncContext(BaseModel):
+    platform_instance: Optional[str] = None
+    catalog: Optional[str] = None
+
+
+class LakeFormationTagPlatformResourceId(BaseModel, ExternalEntityId):
+    """
+    A LakeFormationTagPlatformResourceId is a unique identifier for a Lake Formation tag.
+    """
+
+    tag_key: str
+    tag_value: Optional[str] = None
+    platform_instance: Optional[str]
+    catalog: Optional[str] = None
+    exists_in_lake_formation: bool = False
+    persisted: bool = False
+
+    def __hash__(self) -> int:
+        return hash(self.to_platform_resource_key().id)
+
+    # this is a hack to make sure the property is a string and not a private pydantic field
+    @staticmethod
+    def _RESOURCE_TYPE() -> str:
+        return "LakeFormationTagPlatformResource"
+
+    def to_platform_resource_key(self) -> PlatformResourceKey:
+        return PlatformResourceKey(
+            platform="glue",
+            resource_type=str(LakeFormationTagPlatformResourceId._RESOURCE_TYPE()),
+            primary_key=f"{self.catalog}.{self.tag_key}:{self.tag_value}"
+            if self.catalog
+            else f"{self.tag_key}:{self.tag_value}",
+            platform_instance=self.platform_instance,
+        )
+
+    @classmethod
+    def from_tag(
+        cls,
+        tag: LakeFormationTag,
+        platform_instance: Optional[str],
+        platform_resource_repository: PlatformResourceRepository,
+        catalog: Optional[str] = None,
+        exists_in_lake_formation: bool = False,
+    ) -> "LakeFormationTagPlatformResourceId":
+        """
+        Creates a LakeFormationTagPlatformResourceId from a LakeFormationTag.
+        """
+
+        existing_platform_resource = cls.search_by_urn(
+            tag.to_datahub_tag_urn().urn(),
+            platform_resource_repository=platform_resource_repository,
+            tag_sync_context=LakeFormationTagSyncContext(
+                platform_instance=platform_instance,
+                catalog=catalog,
+            ),
+        )
+        if existing_platform_resource:
+            logger.info(
+                f"Found existing LakeFormationTagPlatformResourceId for tag {tag.key}: {existing_platform_resource}"
+            )
+            return existing_platform_resource
+
+        return LakeFormationTagPlatformResourceId(
+            tag_key=tag.key,
+            tag_value=tag.value if tag.value is not None else None,
+            platform_instance=platform_instance,
+            exists_in_lake_formation=exists_in_lake_formation,
+            catalog=catalog,
+            persisted=False,
+        )
+
+    @classmethod
+    def search_by_urn(
+        cls,
+        urn: str,
+        platform_resource_repository: PlatformResourceRepository,
+        tag_sync_context: LakeFormationTagSyncContext,
+    ) -> Optional["LakeFormationTagPlatformResourceId"]:
+        mapped_tags = [
+            t
+            for t in platform_resource_repository.search_by_filter(
+                ElasticDocumentQuery.create_from(
+                    (
+                        PlatformResourceSearchFields.RESOURCE_TYPE,
+                        str(LakeFormationTagPlatformResourceId._RESOURCE_TYPE()),
+                    ),
+                    (PlatformResourceSearchFields.SECONDARY_KEYS, urn),
+                )
+            )
+        ]
+        logger.info(
+            f"Found {len(mapped_tags)} mapped tags for URN {urn}. {mapped_tags}"
+        )
+        if len(mapped_tags) > 0:
+            for platform_resource in mapped_tags:
+                if (
+                    platform_resource.resource_info
+                    and platform_resource.resource_info.value
+                ):
+                    lake_formation_tag_platform_resource = (
+                        LakeFormationTagPlatformResource(
+                            **platform_resource.resource_info.value.as_pydantic_object(
+                                LakeFormationTagPlatformResource
+                            ).dict()
+                        )
+                    )
+                    if (
+                        lake_formation_tag_platform_resource.id.platform_instance
+                        == tag_sync_context.platform_instance
+                        and lake_formation_tag_platform_resource.id.catalog
+                        == tag_sync_context.catalog
+                    ):
+                        lake_formation_tag_id = lake_formation_tag_platform_resource.id
+                        lake_formation_tag_id.exists_in_lake_formation = True
+                        lake_formation_tag_id.persisted = True
+                        return lake_formation_tag_id
+                else:
+                    logger.warning(
+                        f"Platform resource {platform_resource} does not have a resource_info value"
+                    )
+                    continue
+
+        # If we reach here, it means we did not find a mapped tag for the URN
+        logger.info(
+            f"No mapped tag found for URN {urn} with platform instance {tag_sync_context.platform_instance}. Creating a new LakeFormationTagPlatformResourceId."
+        )
+        return None
+
+    @classmethod
+    def from_datahub_urn(
+        cls,
+        urn: str,
+        platform_resource_repository: PlatformResourceRepository,
+        tag_sync_context: LakeFormationTagSyncContext,
+    ) -> "LakeFormationTagPlatformResourceId":
+        """
+        Creates a LakeFormationTagPlatformResourceId from a DataHub URN.
+        """
+        # First we check if we already have a mapped platform resource for this
+        # urn that is of the type LakeFormationTagPlatformResource.
+        # If we do, we can use it to create the LakeFormationTagPlatformResourceId.
+        # Else, we need to generate a new LakeFormationTagPlatformResourceId.
+        existing_platform_resource_id = cls.search_by_urn(
+            urn, platform_resource_repository, tag_sync_context
+        )
+        if existing_platform_resource_id:
+            logger.info(
+                f"Found existing LakeFormationTagPlatformResourceId for URN {urn}: {existing_platform_resource_id}"
+            )
+            return existing_platform_resource_id
+
+        # Otherwise, we need to create a new LakeFormationTagPlatformResourceId
+        new_tag_id = cls.generate_tag_id(tag_sync_context, urn)
+        if new_tag_id:
+            # we then check if this tag has already been ingested as a platform
+            # resource in the platform resource repository
+            resource_key = platform_resource_repository.get(
+                new_tag_id.to_platform_resource_key()
+            )
+            if resource_key:
+                logger.info(
+                    f"Tag {new_tag_id} already exists in platform resource repository with {resource_key}"
+                )
+                new_tag_id.exists_in_lake_formation = (
+                    True  # TODO: Check if this is a safe assumption
+                )
+            return new_tag_id
+        raise ValueError(
+            f"Unable to create LakeFormationTagPlatformResourceId from DataHub URN: {urn}"
+        )
+
+    @classmethod
+    def generate_tag_id(
+        cls, tag_sync_context: LakeFormationTagSyncContext, urn: str
+    ) -> "LakeFormationTagPlatformResourceId":
+        parsed_urn = Urn.from_string(urn)
+        entity_type = parsed_urn.entity_type
+        if entity_type == "tag":
+            new_tag_id = LakeFormationTagPlatformResourceId.from_datahub_tag(
+                TagUrn.from_string(urn), tag_sync_context
+            )
+        else:
+            raise ValueError(f"Unsupported entity type {entity_type} for URN {urn}")
+        return new_tag_id
+
+    @classmethod
+    def from_datahub_tag(
+        cls, tag_urn: TagUrn, tag_sync_context: LakeFormationTagSyncContext
+    ) -> "LakeFormationTagPlatformResourceId":
+        tag = LakeFormationTag.from_urn(tag_urn)
+
+        return LakeFormationTagPlatformResourceId(
+            tag_key=str(tag.key),
+            tag_value=str(tag.value),
+            platform_instance=tag_sync_context.platform_instance,
+            catalog=tag_sync_context.catalog,
+            exists_in_lake_formation=False,
+        )
+
+
+class LakeFormationTagPlatformResource(BaseModel, ExternalEntity):
+    datahub_urns: LinkedResourceSet
+    managed_by_datahub: bool
+    id: LakeFormationTagPlatformResourceId
+    allowed_values: Optional[List[str]]
+
+    def get_id(self) -> ExternalEntityId:
+        return self.id
+
+    def is_managed_by_datahub(self) -> bool:
+        return self.managed_by_datahub
+
+    def datahub_linked_resources(self) -> LinkedResourceSet:
+        return self.datahub_urns
+
+    def as_platform_resource(self) -> PlatformResource:
+        return PlatformResource.create(
+            key=self.id.to_platform_resource_key(),
+            secondary_keys=[u for u in self.datahub_urns.urns],
+            value=self,
+        )
+
+    @classmethod
+    def get_from_datahub(
+        cls,
+        lake_formation_tag_id: LakeFormationTagPlatformResourceId,
+        platform_resource_repository: PlatformResourceRepository,
+        managed_by_datahub: bool = False,
+    ) -> "LakeFormationTagPlatformResource":
+        # Search for linked DataHub URNs
+        platform_resources = [
+            r
+            for r in platform_resource_repository.search_by_filter(
+                ElasticDocumentQuery.create_from(
+                    (
+                        PlatformResourceSearchFields.RESOURCE_TYPE,
+                        str(LakeFormationTagPlatformResourceId._RESOURCE_TYPE()),
+                    ),
+                    (
+                        PlatformResourceSearchFields.PRIMARY_KEY,
+                        f"{lake_formation_tag_id.tag_key}/{lake_formation_tag_id.tag_value}",
+                    ),
+                )
+            )
+        ]
+        for platform_resource in platform_resources:
+            if (
+                platform_resource.resource_info
+                and platform_resource.resource_info.value
+            ):
+                lf_tag = LakeFormationTagPlatformResource(
+                    **platform_resource.resource_info.value.as_pydantic_object(
+                        LakeFormationTagPlatformResource
+                    ).dict()
+                )
+                if (
+                    lf_tag.id.platform_instance
+                    == lake_formation_tag_id.platform_instance
+                    and lf_tag.id.catalog == lake_formation_tag_id.catalog
+                ):
+                    return lf_tag
+        return cls(
+            id=lake_formation_tag_id,
+            datahub_urns=LinkedResourceSet(urns=[]),
+            managed_by_datahub=managed_by_datahub,
+            allowed_values=None,
+        )
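
For orientation, here is a minimal, hypothetical usage sketch of the new classes (assuming the module path shown in the file list above and an already-constructed PlatformResourceRepository; the platform instance and catalog values are invented):

from datahub.api.entities.external.external_entities import PlatformResourceRepository
from datahub.ingestion.source.aws.tag_entities import (
    LakeFormationTagPlatformResource,
    LakeFormationTagPlatformResourceId,
    LakeFormationTagSyncContext,
)


def resolve_lf_tag(
    urn: str, repository: PlatformResourceRepository
) -> LakeFormationTagPlatformResource:
    # Map a DataHub tag URN to its Lake Formation tag id, reusing an existing
    # mapping from the platform resource repository when one is found.
    context = LakeFormationTagSyncContext(
        platform_instance="prod", catalog="123456789012"
    )
    tag_id = LakeFormationTagPlatformResourceId.from_datahub_urn(
        urn,
        platform_resource_repository=repository,
        tag_sync_context=context,
    )
    # Fetch the full platform resource, or a fresh, unlinked one if none exists yet.
    return LakeFormationTagPlatformResource.get_from_datahub(
        tag_id, platform_resource_repository=repository
    )
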
datahub/ingestion/source/sql/mssql/source.py
@@ -323,9 +323,11 @@ class SQLServerSource(SQLAlchemySource):
         try:
             yield from self.loop_jobs(inspector, self.config)
         except Exception as e:
-            self.report.
-                "jobs",
-
+            self.report.failure(
+                message="Failed to list jobs",
+                title="SQL Server Jobs Extraction",
+                context="Error occurred during database-level job extraction",
+                exc=e,
             )

     def get_schema_level_workunits(
@@ -343,12 +345,158 @@ class SQLServerSource(SQLAlchemySource):
         try:
             yield from self.loop_stored_procedures(inspector, schema, self.config)
         except Exception as e:
-            self.report.
-            "
-
+            self.report.failure(
+                message="Failed to list stored procedures",
+                title="SQL Server Stored Procedures Extraction",
+                context="Error occurred during schema-level stored procedure extraction",
+                exc=e,
             )

+    def _detect_rds_environment(self, conn: Connection) -> bool:
+        """
+        Detect if we're running in an RDS/managed environment vs on-premises.
+        Returns True if RDS/managed, False if on-premises.
+        """
+        try:
+            # Try to access system tables directly - this typically fails in RDS
+            conn.execute("SELECT TOP 1 * FROM msdb.dbo.sysjobs")
+            logger.debug(
+                "Direct table access successful - likely on-premises environment"
+            )
+            return False
+        except Exception:
+            logger.debug("Direct table access failed - likely RDS/managed environment")
+            return True
+
     def _get_jobs(self, conn: Connection, db_name: str) -> Dict[str, Dict[str, Any]]:
+        """
+        Get job information with environment detection to choose optimal method first.
+        """
+        jobs: Dict[str, Dict[str, Any]] = {}
+
+        # Detect environment to choose optimal method first
+        is_rds = self._detect_rds_environment(conn)
+
+        if is_rds:
+            # Managed environment - try stored procedures first
+            try:
+                jobs = self._get_jobs_via_stored_procedures(conn, db_name)
+                logger.info(
+                    "Successfully retrieved jobs using stored procedures (managed environment)"
+                )
+                return jobs
+            except Exception as sp_error:
+                logger.warning(
+                    f"Failed to retrieve jobs via stored procedures in managed environment: {sp_error}"
+                )
+                # Try direct query as fallback (might work in some managed environments)
+                try:
+                    jobs = self._get_jobs_via_direct_query(conn, db_name)
+                    logger.info(
+                        "Successfully retrieved jobs using direct query fallback in managed environment"
+                    )
+                    return jobs
+                except Exception as direct_error:
+                    self.report.failure(
+                        message="Failed to retrieve jobs in managed environment",
+                        title="SQL Server Jobs Extraction",
+                        context="Both stored procedures and direct query methods failed",
+                        exc=direct_error,
+                    )
+        else:
+            # On-premises environment - try direct query first (usually faster)
+            try:
+                jobs = self._get_jobs_via_direct_query(conn, db_name)
+                logger.info(
+                    "Successfully retrieved jobs using direct query (on-premises environment)"
+                )
+                return jobs
+            except Exception as direct_error:
+                logger.warning(
+                    f"Failed to retrieve jobs via direct query in on-premises environment: {direct_error}"
+                )
+                # Try stored procedures as fallback
+                try:
+                    jobs = self._get_jobs_via_stored_procedures(conn, db_name)
+                    logger.info(
+                        "Successfully retrieved jobs using stored procedures fallback in on-premises environment"
+                    )
+                    return jobs
+                except Exception as sp_error:
+                    self.report.failure(
+                        message="Failed to retrieve jobs in on-premises environment",
+                        title="SQL Server Jobs Extraction",
+                        context="Both direct query and stored procedures methods failed",
+                        exc=sp_error,
+                    )
+
+        return jobs
+
+    def _get_jobs_via_stored_procedures(
+        self, conn: Connection, db_name: str
+    ) -> Dict[str, Dict[str, Any]]:
+        jobs: Dict[str, Dict[str, Any]] = {}
+
+        # First, get all jobs
+        jobs_result = conn.execute("EXEC msdb.dbo.sp_help_job")
+        jobs_data = {}
+
+        for row in jobs_result:
+            job_id = str(row["job_id"])
+            jobs_data[job_id] = {
+                "job_id": job_id,
+                "name": row["name"],
+                "description": row.get("description", ""),
+                "date_created": row.get("date_created"),
+                "date_modified": row.get("date_modified"),
+                "enabled": row.get("enabled", 1),
+            }
+
+        # Now get job steps for each job, filtering by database
+        for job_id, job_info in jobs_data.items():
+            try:
+                # Get steps for this specific job
+                steps_result = conn.execute(
+                    f"EXEC msdb.dbo.sp_help_jobstep @job_id = '{job_id}'"
+                )
+
+                job_steps = {}
+                for step_row in steps_result:
+                    # Only include steps that run against our target database
+                    step_database = step_row.get("database_name", "")
+                    if step_database.lower() == db_name.lower() or not step_database:
+                        step_data = {
+                            "job_id": job_id,
+                            "job_name": job_info["name"],
+                            "description": job_info["description"],
+                            "date_created": job_info["date_created"],
+                            "date_modified": job_info["date_modified"],
+                            "step_id": step_row["step_id"],
+                            "step_name": step_row["step_name"],
+                            "subsystem": step_row.get("subsystem", ""),
+                            "command": step_row.get("command", ""),
+                            "database_name": step_database,
+                        }
+                        job_steps[step_row["step_id"]] = step_data
+
+                # Only add job if it has relevant steps
+                if job_steps:
+                    jobs[job_info["name"]] = job_steps
+
+            except Exception as step_error:
+                logger.warning(
+                    f"Failed to get steps for job {job_info['name']}: {step_error}"
+                )
+                continue
+
+        return jobs
+
+    def _get_jobs_via_direct_query(
+        self, conn: Connection, db_name: str
+    ) -> Dict[str, Dict[str, Any]]:
+        """
+        Original method using direct table access for on-premises SQL Server.
+        """
         jobs_data = conn.execute(
             f"""
             SELECT
@@ -371,6 +519,7 @@ class SQLServerSource(SQLAlchemySource):
             where database_name = '{db_name}'
             """
         )
+
         jobs: Dict[str, Dict[str, Any]] = {}
         for row in jobs_data:
             step_data = dict(
@@ -383,11 +532,13 @@ class SQLServerSource(SQLAlchemySource):
                 step_name=row["step_name"],
                 subsystem=row["subsystem"],
                 command=row["command"],
+                database_name=row["database_name"],
             )
             if row["name"] in jobs:
                 jobs[row["name"]][row["step_id"]] = step_data
             else:
                 jobs[row["name"]] = {row["step_id"]: step_data}
+
         return jobs

     def loop_jobs(
@@ -397,21 +548,59 @@ class SQLServerSource(SQLAlchemySource):
     ) -> Iterable[MetadataWorkUnit]:
         """
         Loop MS SQL jobs as dataFlow-s.
-
+        Now supports both managed and on-premises SQL Server.
         """
         db_name = self.get_db_name(inspector)
-
-
-
-
-
-
-
-
+
+        try:
+            with inspector.engine.connect() as conn:
+                jobs = self._get_jobs(conn, db_name)
+
+                if not jobs:
+                    logger.info(f"No jobs found for database: {db_name}")
+                    return
+
+                logger.info(f"Found {len(jobs)} jobs for database: {db_name}")
+
+                for job_name, job_steps in jobs.items():
+                    try:
+                        job = MSSQLJob(
+                            name=job_name,
+                            env=sql_config.env,
+                            db=db_name,
+                            platform_instance=sql_config.platform_instance,
+                        )
+                        data_flow = MSSQLDataFlow(entity=job)
+                        yield from self.construct_flow_workunits(data_flow=data_flow)
+                        yield from self.loop_job_steps(job, job_steps)
+
+                    except Exception as job_error:
+                        logger.warning(f"Failed to process job {job_name}: {job_error}")
+                        self.report.warning(
+                            message=f"Failed to process job {job_name}",
+                            title="SQL Server Jobs Extraction",
+                            context="Error occurred while processing individual job",
+                            exc=job_error,
+                        )
+                        continue
+
+        except Exception as e:
+            error_message = f"Failed to retrieve jobs for database {db_name}: {e}"
+            logger.error(error_message)
+
+            # Provide specific guidance for permission issues
+            if "permission" in str(e).lower() or "denied" in str(e).lower():
+                permission_guidance = (
+                    "For managed SQL Server services, ensure the following permissions are granted:\n"
+                    "GRANT EXECUTE ON msdb.dbo.sp_help_job TO datahub_read;\n"
+                    "GRANT EXECUTE ON msdb.dbo.sp_help_jobstep TO datahub_read;\n"
+                    "For on-premises SQL Server, you may also need:\n"
+                    "GRANT SELECT ON msdb.dbo.sysjobs TO datahub_read;\n"
+                    "GRANT SELECT ON msdb.dbo.sysjobsteps TO datahub_read;"
                 )
-
-
-
+                logger.info(permission_guidance)
+
+            raise e

     def loop_job_steps(
         self, job: MSSQLJob, job_steps: Dict[str, Any]
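
For reference, a small illustrative example (all values invented) of the nested mapping that _get_jobs returns and loop_jobs consumes — job name, then step id, then the step attributes collected by either retrieval method:

from typing import Any, Dict

# Illustrative only: the shape of the jobs mapping produced by _get_jobs().
jobs: Dict[str, Dict[str, Any]] = {
    "nightly_etl": {
        1: {
            "job_id": "8d2f1c4e-0a58-4b11-9c47-3f1d2b6a7e90",
            "job_name": "nightly_etl",
            "description": "Refresh reporting tables",
            "date_created": "2024-01-01 00:00:00",
            "date_modified": "2024-06-01 00:00:00",
            "step_id": 1,
            "step_name": "load_staging",
            "subsystem": "TSQL",
            "command": "EXEC dbo.refresh_reporting_tables",
            "database_name": "analytics",
        }
    }
}
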
datahub/ingestion/source/unity/source.py
@@ -785,7 +785,7 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
             description=schema.comment,
             owner_urn=self.get_owner_urn(schema.owner),
             external_url=f"{self.external_url_base}/{schema.catalog.name}/{schema.name}",
-            tags=[tag.to_datahub_tag_urn().
+            tags=[tag.to_datahub_tag_urn().name for tag in schema_tags]
             if schema_tags
             else None,
         )
@@ -830,7 +830,7 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
             description=catalog.comment,
             owner_urn=self.get_owner_urn(catalog.owner),
             external_url=f"{self.external_url_base}/{catalog.name}",
-            tags=[tag.to_datahub_tag_urn().
+            tags=[tag.to_datahub_tag_urn().name for tag in catalog_tags]
             if catalog_tags
             else None,
         )
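
The two Unity Catalog hunks above now pass the plain tag name (tag.to_datahub_tag_urn().name) for schema and catalog tags. A quick sketch of what TagUrn exposes, using an invented tag name:

from datahub.metadata.urns import TagUrn

# Illustrative: TagUrn.name recovers the plain tag name from a tag URN string.
tag_urn = TagUrn.from_string("urn:li:tag:pii")
assert tag_urn.name == "pii"
assert tag_urn.urn() == "urn:li:tag:pii"
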
@@ -1083,7 +1083,6 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
             )
         else:
             if tags is not None:
-                logger.debug(f"Column tags are: {tags}")
                 attribution = MetadataAttribution(
                     source="urn:li:dataPlatform:unity-catalog",
                     actor="urn:li:corpuser:datahub",