acryl-datahub 1.0.0.1rc2__py3-none-any.whl → 1.0.0.1rc4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
This release of acryl-datahub has been flagged as potentially problematic.
- {acryl_datahub-1.0.0.1rc2.dist-info → acryl_datahub-1.0.0.1rc4.dist-info}/METADATA +2569 -2569
- {acryl_datahub-1.0.0.1rc2.dist-info → acryl_datahub-1.0.0.1rc4.dist-info}/RECORD +37 -35
- datahub/_version.py +1 -1
- datahub/emitter/rest_emitter.py +2 -2
- datahub/ingestion/graph/client.py +6 -11
- datahub/ingestion/graph/filters.py +22 -2
- datahub/ingestion/source/common/subtypes.py +1 -1
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/ge_data_profiler.py +11 -1
- datahub/ingestion/source/mlflow.py +19 -1
- datahub/ingestion/source/redshift/lineage_v2.py +7 -0
- datahub/ingestion/source/redshift/query.py +1 -1
- datahub/ingestion/source/snowflake/constants.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +14 -1
- datahub/ingestion/source/snowflake/snowflake_query.py +17 -0
- datahub/ingestion/source/snowflake/snowflake_report.py +3 -0
- datahub/ingestion/source/snowflake/snowflake_schema.py +29 -0
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +112 -42
- datahub/ingestion/source/snowflake/snowflake_utils.py +25 -1
- datahub/ingestion/source/sql/mssql/job_models.py +15 -1
- datahub/ingestion/source/sql/mssql/source.py +8 -4
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +242 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +1 -29
- datahub/ingestion/source/superset.py +153 -13
- datahub/ingestion/source/vertexai/vertexai.py +1 -1
- datahub/metadata/schema.avsc +2 -0
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +2 -0
- datahub/sdk/__init__.py +1 -0
- datahub/sdk/main_client.py +2 -1
- datahub/sdk/search_filters.py +18 -23
- datahub/sql_parsing/split_statements.py +17 -3
- {acryl_datahub-1.0.0.1rc2.dist-info → acryl_datahub-1.0.0.1rc4.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.0.0.1rc2.dist-info → acryl_datahub-1.0.0.1rc4.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0.1rc2.dist-info → acryl_datahub-1.0.0.1rc4.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.0.0.1rc2.dist-info → acryl_datahub-1.0.0.1rc4.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py

@@ -12,32 +12,14 @@ from datahub.configuration import ConfigModel
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.source import SourceReport
 from datahub.ingestion.graph.client import DataHubGraph
-from datahub.ingestion.graph.filters import RemovedStatusFilter
+from datahub.ingestion.graph.filters import RemovedStatusFilter, SearchFilterRule
 from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.stats_collections import TopKDict
 from datahub.utilities.urns._urn_base import Urn
+from datahub.utilities.urns.error import InvalidUrnError

 logger = logging.getLogger(__name__)

-QUERY_ENTITIES = """
-query listEntities($input: ScrollAcrossEntitiesInput!) {
-  scrollAcrossEntities(input: $input) {
-    nextScrollId
-    count
-    searchResults {
-      entity {
-        ... on QueryEntity {
-          urn
-        }
-        ... on DataProcessInstance {
-          urn
-        }
-      }
-    }
-  }
-}
-"""
-

 class SoftDeletedEntitiesCleanupConfig(ConfigModel):
     enabled: bool = Field(

@@ -64,7 +46,33 @@ class SoftDeletedEntitiesCleanupConfig(ConfigModel):
     )

     entity_types: Optional[List[str]] = Field(
-        default
+        # A default value is required otherwise QUERY and DATAPROCESS_INSTANCE won't be included
+        default=[
+            "dataset",
+            "dashboard",
+            "chart",
+            "mlmodel",
+            "mlmodelGroup",
+            "mlfeatureTable",
+            "mlfeature",
+            "mlprimaryKey",
+            "dataFlow",
+            "dataJob",
+            "glossaryTerm",
+            "glossaryNode",
+            "tag",
+            "role",
+            "corpuser",
+            "corpGroup",
+            "container",
+            "domain",
+            "dataProduct",
+            "notebook",
+            "businessAttribute",
+            "schemaField",
+            "query",
+            "dataProcessInstance",
+        ],
         description="List of entity types to cleanup",
     )

@@ -103,6 +111,9 @@ class SoftDeletedEntitiesReport(SourceReport):
     num_entities_found: Dict[str, int] = field(default_factory=dict)
     num_soft_deleted_entity_processed: int = 0
     num_soft_deleted_retained_due_to_age: int = 0
+    num_soft_deleted_retained_due_to_age_by_type: TopKDict[str, int] = field(
+        default_factory=TopKDict
+    )
     num_soft_deleted_entity_removal_started: int = 0
     num_hard_deleted: int = 0
     num_hard_deleted_by_type: TopKDict[str, int] = field(default_factory=TopKDict)

@@ -111,6 +122,8 @@ class SoftDeletedEntitiesReport(SourceReport):
     )
     runtime_limit_reached: bool = False
     deletion_limit_reached: bool = False
+    num_soft_deleted_entity_found: int = 0
+    num_soft_deleted_entity_invalid_urn: int = 0


 class SoftDeletedEntitiesCleanup:

@@ -133,7 +146,7 @@ class SoftDeletedEntitiesCleanup:
         self.config = config
         self.report = report
         self.dry_run = dry_run
-        self.start_time =
+        self.start_time = time.time()
         self._report_lock: Lock = Lock()
         self.last_print_time = 0.0

@@ -142,6 +155,14 @@ class SoftDeletedEntitiesCleanup:
         with self._report_lock:
             self.report.num_soft_deleted_retained_due_to_age += 1

+    def _increment_retained_by_type(self, type: str) -> None:
+        """Thread-safe method to update report fields"""
+        with self._report_lock:
+            self.report.num_soft_deleted_retained_due_to_age_by_type[type] = (
+                self.report.num_soft_deleted_retained_due_to_age_by_type.get(type, 0)
+                + 1
+            )
+
     def _increment_removal_started_count(self) -> None:
         """Thread-safe method to update report fields"""
         with self._report_lock:

@@ -160,10 +181,9 @@ class SoftDeletedEntitiesCleanup:
            )
            self.report.sample_hard_deleted_aspects_by_type[entity_type].append(urn)

-    def delete_entity(self, urn:
+    def delete_entity(self, urn: Urn) -> None:
         assert self.ctx.graph

-        entity_urn = Urn.from_string(urn)
         if self.dry_run:
             logger.info(
                 f"Dry run is on otherwise it would have deleted {urn} with hard deletion"

@@ -172,14 +192,14 @@ class SoftDeletedEntitiesCleanup:
         if self._deletion_limit_reached() or self._times_up():
             return
         self._increment_removal_started_count()
-        self.ctx.graph.delete_entity(urn=urn, hard=True)
+        self.ctx.graph.delete_entity(urn=urn.urn(), hard=True)
         self.ctx.graph.delete_references_to_urn(
-            urn=urn,
+            urn=urn.urn(),
             dry_run=False,
         )
-        self._update_report(urn,
+        self._update_report(urn.urn(), urn.entity_type)

-    def delete_soft_deleted_entity(self, urn:
+    def delete_soft_deleted_entity(self, urn: Urn) -> None:
         assert self.ctx.graph

         retention_time = (

@@ -187,7 +207,7 @@ class SoftDeletedEntitiesCleanup:
             - self.config.retention_days * 24 * 60 * 60
         )

-        aspect = self.ctx.graph.get_entity_raw(entity_urn=urn, aspects=["status"])
+        aspect = self.ctx.graph.get_entity_raw(entity_urn=urn.urn(), aspects=["status"])
         if "status" in aspect["aspects"]:
             if aspect["aspects"]["status"]["value"]["removed"] and aspect["aspects"][
                 "status"

@@ -196,6 +216,7 @@ class SoftDeletedEntitiesCleanup:
                 self.delete_entity(urn)
             else:
                 self._increment_retained_count()
+                self._increment_retained_by_type(urn.entity_type)

     def _print_report(self) -> None:
         time_taken = round(time.time() - self.last_print_time, 1)

@@ -204,7 +225,7 @@ class SoftDeletedEntitiesCleanup:
             self.last_print_time = time.time()
             logger.info(f"\n{self.report.as_string()}")

-    def _process_futures(self, futures: Dict[Future,
+    def _process_futures(self, futures: Dict[Future, Urn]) -> Dict[Future, Urn]:
         done, not_done = wait(futures, return_when=FIRST_COMPLETED)
         futures = {future: urn for future, urn in futures.items() if future in not_done}

@@ -214,7 +235,7 @@ class SoftDeletedEntitiesCleanup:
                 self.report.failure(
                     title="Failed to delete entity",
                     message="Failed to delete entity",
-                    context=futures[future],
+                    context=futures[future].urn(),
                     exc=future.exception(),
                 )
             self.report.num_soft_deleted_entity_processed += 1

@@ -229,86 +250,52 @@ class SoftDeletedEntitiesCleanup:
                 time.sleep(self.config.delay)
         return futures

-    def
+    def _get_urns(self) -> Iterable[str]:
         assert self.ctx.graph
-
-
-
-
-
-        # to avoid a giant stacktrace by having a smaller batch size in first call
-        # This will be remove in future version after server with fix has been
-        # around for a while
-        batch_size = 10
-
-        while True:
-            try:
-                if entity_type not in self.report.num_calls_made:
-                    self.report.num_calls_made[entity_type] = 1
-                else:
-                    self.report.num_calls_made[entity_type] += 1
-                self._print_report()
-                result = self.ctx.graph.execute_graphql(
-                    graphql_query,
-                    {
-                        "input": {
-                            "types": [entity_type],
-                            "query": "*",
-                            "scrollId": scroll_id if scroll_id else None,
-                            "count": batch_size,
-                            "orFilters": [
-                                {
-                                    "and": [
-                                        {
-                                            "field": "removed",
-                                            "values": ["true"],
-                                            "condition": "EQUAL",
-                                        }
-                                    ]
-                                }
-                            ],
-                        }
-                    },
-                )
-            except Exception as e:
-                self.report.failure(
-                    f"While trying to get {entity_type} with {scroll_id}", exc=e
-                )
-                break
-            scroll_across_entities = result.get("scrollAcrossEntities")
-            if not scroll_across_entities:
-                break
-            search_results = scroll_across_entities.get("searchResults")
-            count = scroll_across_entities.get("count")
-            if not count or not search_results:
-                # Due to a server bug we cannot rely on just count as it was returning response like this
-                # {'count': 1, 'nextScrollId': None, 'searchResults': []}
-                break
-            if entity_type == "DATA_PROCESS_INSTANCE":
-                # Temp workaround. See note in beginning of the function
-                # We make the batch size = config after call has succeeded once
-                batch_size = self.config.batch_size
-            scroll_id = scroll_across_entities.get("nextScrollId")
-            if entity_type not in self.report.num_entities_found:
-                self.report.num_entities_found[entity_type] = 0
-            self.report.num_entities_found[entity_type] += scroll_across_entities.get(
-                "count"
+        # Entities created in the retention period are not considered for deletion
+        created_from = int(
+            (
+                datetime.now(timezone.utc).timestamp()
+                - self.config.retention_days * 24 * 60 * 60
             )
-
-
+            * 1000
+        )
+
+        entity_types = self.config.entity_types
+        # dataProcessInstance is a special case where we need to get the entities separately
+        # because we need to filter based on created time we don't stream to many dataProcessInstance entities at once
+        # Gc source soft-deletes dataProcessInstance entities which causes to have a lot of soft deleted entities
+        if (
+            self.config.entity_types
+            and "dataProcessInstance" in self.config.entity_types
+        ):
+            entity_types = self.config.entity_types.copy()
+            yield from self.ctx.graph.get_urns_by_filter(
+                entity_types=["dataProcessInstance"],
+                platform=self.config.platform,
+                env=self.config.env,
+                query=self.config.query,
+                status=RemovedStatusFilter.ONLY_SOFT_DELETED,
+                batch_size=self.config.batch_size,
+                extraFilters=[
+                    SearchFilterRule(
+                        field="created",
+                        condition="LESS_THAN",
+                        values=[f"{created_from}"],
+                    ).to_raw()
+                ],
+            )
+
+            entity_types.remove("dataProcessInstance")

-    def _get_urns(self) -> Iterable[str]:
-        assert self.ctx.graph
         yield from self.ctx.graph.get_urns_by_filter(
-            entity_types=
+            entity_types=entity_types,
             platform=self.config.platform,
             env=self.config.env,
             query=self.config.query,
             status=RemovedStatusFilter.ONLY_SOFT_DELETED,
             batch_size=self.config.batch_size,
         )
-        yield from self._get_soft_deleted(QUERY_ENTITIES, "QUERY")
-        yield from self._get_soft_deleted(QUERY_ENTITIES, "DATA_PROCESS_INSTANCE")

     def _times_up(self) -> bool:
         if (

@@ -335,16 +322,26 @@ class SoftDeletedEntitiesCleanup:
             return
         self.start_time = time.time()

-        futures: Dict[Future,
+        futures: Dict[Future, Urn] = dict()
         with ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
             for urn in self._get_urns():
+                try:
+                    self.report.num_soft_deleted_entity_found += 1
+                    soft_deleted_urn = Urn.from_string(urn)
+                except InvalidUrnError as e:
+                    logger.error(f"Failed to parse urn {urn} with error {e}")
+                    self.report.num_soft_deleted_entity_invalid_urn += 1
+                    continue
+
                 self._print_report()
                 while len(futures) >= self.config.futures_max_at_time:
                     futures = self._process_futures(futures)
                 if self._deletion_limit_reached() or self._times_up():
                     break
-                future = executor.submit(
-
+                future = executor.submit(
+                    self.delete_soft_deleted_entity, soft_deleted_urn
+                )
+                futures[future] = soft_deleted_urn

             logger.info(f"Waiting for {len(futures)} futures to complete")
             while len(futures) > 0:
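For orientation, the rework above drops the hand-written scrollAcrossEntities GraphQL pagination and instead pushes a "created LESS_THAN cutoff" filter into get_urns_by_filter for dataProcessInstance entities. A minimal sketch of the cutoff arithmetic and filter construction, using only the helpers visible in the diff (the retention value below is illustrative):

    from datetime import datetime, timezone

    from datahub.ingestion.graph.filters import SearchFilterRule

    retention_days = 10  # stand-in for config.retention_days

    # Cutoff in epoch milliseconds: entities created before this moment are old enough to delete.
    created_from = int(
        (datetime.now(timezone.utc).timestamp() - retention_days * 24 * 60 * 60) * 1000
    )

    # Raw filter passed via get_urns_by_filter(extraFilters=[...]), as in the diff above.
    created_filter = SearchFilterRule(
        field="created",
        condition="LESS_THAN",
        values=[str(created_from)],
    ).to_raw()

For a 10-day window this subtracts 864,000 seconds from the current UTC timestamp; the `* 1000` in the diff converts seconds to milliseconds.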
datahub/ingestion/source/ge_data_profiler.py

@@ -602,7 +602,7 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
         if not self.config.include_field_median_value:
             return
         try:
-            if self.dataset.engine.dialect.name.lower()
+            if self.dataset.engine.dialect.name.lower() == SNOWFLAKE:
                 column_profile.median = str(
                     self.dataset.engine.execute(
                         sa.select([sa.func.median(sa.column(column))]).select_from(

@@ -610,6 +610,16 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
                         )
                     ).scalar()
                 )
+            elif self.dataset.engine.dialect.name.lower() == DATABRICKS:
+                column_profile.median = str(
+                    self.dataset.engine.execute(
+                        sa.select(
+                            sa.text(
+                                f"approx_percentile(`{column}`, 0.5) as approx_median"
+                            )
+                        ).select_from(self.dataset._table)
+                    ).scalar()
+                )
             elif self.dataset.engine.dialect.name.lower() == BIGQUERY:
                 column_profile.median = str(
                     self.dataset.engine.execute(
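The new Databricks branch computes an approximate median with approx_percentile(col, 0.5) rather than an exact median(). A standalone sketch of the expression it builds, assuming a SQLAlchemy engine pointed at Databricks (table and column names are placeholders):

    import sqlalchemy as sa

    column = "amount"

    # Mirrors the diff: a textual approx_percentile projection selected from the profiled table.
    stmt = sa.select(
        sa.text(f"approx_percentile(`{column}`, 0.5) as approx_median")
    ).select_from(sa.table("orders"))

    # Renders roughly as:
    #   SELECT approx_percentile(`amount`, 0.5) as approx_median FROM orders
    # median_value = str(engine.execute(stmt).scalar())

An approximate percentile is far cheaper than an exact median on large Delta tables, which is presumably why the approximate form is used for this dialect.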
datahub/ingestion/source/mlflow.py

@@ -1,4 +1,5 @@
 import json
+import os
 import time
 from dataclasses import dataclass
 from typing import Any, Callable, Iterable, List, Optional, Tuple, TypeVar, Union

@@ -115,6 +116,13 @@ class MLflowConfig(StatefulIngestionConfigBase, EnvConfigMixin):
         default=None, description="Mapping of source type to datahub platform"
     )

+    username: Optional[str] = Field(
+        default=None, description="Username for MLflow authentication"
+    )
+    password: Optional[str] = Field(
+        default=None, description="Password for MLflow authentication"
+    )
+

 @dataclass
 class MLflowRegisteredModelStageInfo:

@@ -161,7 +169,17 @@ class MLflowSource(StatefulIngestionSourceBase):
         self.ctx = ctx
         self.config = config
         self.report = StaleEntityRemovalSourceReport()
-        self.client =
+        self.client = self._configure_client()
+
+    def _configure_client(self) -> MlflowClient:
+        if bool(self.config.username) != bool(self.config.password):
+            raise ValueError("Both username and password must be set together")
+
+        if self.config.username and self.config.password:
+            os.environ["MLFLOW_TRACKING_USERNAME"] = self.config.username
+            os.environ["MLFLOW_TRACKING_PASSWORD"] = self.config.password
+
+        return MlflowClient(
             tracking_uri=self.config.tracking_uri,
             registry_uri=self.config.registry_uri,
         )
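The MLflow change adds basic-auth support: username and password must be supplied together, and they reach MLflow through its standard MLFLOW_TRACKING_USERNAME / MLFLOW_TRACKING_PASSWORD environment variables before the client is built. A minimal sketch of that mechanism outside the source (URI and credentials are placeholders):

    import os

    from mlflow import MlflowClient

    # MLflow reads basic-auth credentials from these environment variables.
    os.environ["MLFLOW_TRACKING_USERNAME"] = "ingestion-bot"
    os.environ["MLFLOW_TRACKING_PASSWORD"] = "not-a-real-password"

    client = MlflowClient(tracking_uri="https://mlflow.example.com")
    # Subsequent calls such as client.search_registered_models() authenticate with basic auth.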
datahub/ingestion/source/redshift/lineage_v2.py

@@ -400,6 +400,10 @@ class RedshiftSqlLineageV2(Closeable):
         db_schemas: Dict[str, Dict[str, RedshiftSchema]],
     ) -> None:
         for schema_name, tables in all_tables[self.database].items():
+            logger.info(f"External table lineage: checking schema {schema_name}")
+            if not db_schemas[self.database].get(schema_name):
+                logger.warning(f"Schema {schema_name} not found")
+                continue
             for table in tables:
                 schema = db_schemas[self.database][schema_name]
                 if (

@@ -407,6 +411,9 @@ class RedshiftSqlLineageV2(Closeable):
                     and schema.is_external_schema()
                     and schema.external_platform
                 ):
+                    logger.info(
+                        f"External table lineage: processing table {schema_name}.{table.name}"
+                    )
                     # external_db_params = schema.option
                     upstream_platform = schema.external_platform.lower()

datahub/ingestion/source/redshift/query.py

@@ -44,7 +44,7 @@ class RedshiftCommonQuery:
        SELECT
            schema_name,
            schema_type,
-           schema_option,
+           cast(null as varchar(1024)) as schema_option,
            cast(null as varchar(256)) as external_platform,
            cast(null as varchar(256)) as external_database
        FROM svv_redshift_schemas
datahub/ingestion/source/snowflake/snowflake_config.py

@@ -100,7 +100,15 @@ class SnowflakeFilterConfig(SQLFilterConfig):

     stream_pattern: AllowDenyPattern = Field(
         default=AllowDenyPattern.allow_all(),
-        description="Regex patterns for streams to filter in ingestion.
+        description="Regex patterns for streams to filter in ingestion. Specify regex to match the entire view name in database.schema.view format. e.g. to match all views starting with customer in Customer database and public schema, use the regex 'Customer.public.customer.*'",
+    )
+
+    procedure_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="Regex patterns for procedures to filter in ingestion. "
+        "Specify regex to match the entire procedure name in database.schema.procedure format. "
+        "e.g. to match all procedures starting with customer in Customer database and public schema,"
+        " use the regex 'Customer.public.customer.*'",
     )

     match_fully_qualified_names: bool = Field(

@@ -284,6 +292,11 @@ class SnowflakeV2Config(
         description="If enabled, streams will be ingested as separate entities from tables/views.",
     )

+    include_procedures: bool = Field(
+        default=True,
+        description="If enabled, procedures will be ingested as pipelines/tasks.",
+    )
+
     structured_property_pattern: AllowDenyPattern = Field(
         default=AllowDenyPattern.allow_all(),
         description=(
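The new procedure_pattern follows the same convention as the other Snowflake filters: an AllowDenyPattern matched against the fully qualified database.schema.procedure name. A small illustration, assuming the usual AllowDenyPattern helper from datahub.configuration.common (pattern values are made up):

    from datahub.configuration.common import AllowDenyPattern

    procedure_pattern = AllowDenyPattern(
        allow=["ANALYTICS\\.PUBLIC\\..*"],  # keep procedures in ANALYTICS.PUBLIC
        deny=[".*\\.STAGING\\..*"],         # drop anything in a STAGING schema
    )

    assert procedure_pattern.allowed("ANALYTICS.PUBLIC.REFRESH_ORDERS")
    assert not procedure_pattern.allowed("ANALYTICS.STAGING.REFRESH_ORDERS")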
datahub/ingestion/source/snowflake/snowflake_query.py

@@ -164,6 +164,23 @@ class SnowflakeQuery:
        and table_type in ('BASE TABLE', 'EXTERNAL TABLE')
        order by table_schema, table_name"""

+    @staticmethod
+    def procedures_for_database(db_name: Optional[str]) -> str:
+        db_clause = f'"{db_name}".' if db_name is not None else ""
+        return f"""
+        SELECT procedure_catalog AS "PROCEDURE_CATALOG",
+          procedure_schema AS "PROCEDURE_SCHEMA",
+          procedure_name AS "PROCEDURE_NAME",
+          procedure_language AS "PROCEDURE_LANGUAGE",
+          argument_signature AS "ARGUMENT_SIGNATURE",
+          data_type AS "PROCEDURE_RETURN_TYPE",
+          procedure_definition AS "PROCEDURE_DEFINITION",
+          created AS "CREATED",
+          last_altered AS "LAST_ALTERED",
+          comment AS "COMMENT"
+        FROM {db_clause}information_schema.procedures
+        order by procedure_schema, procedure_name"""
+
     @staticmethod
     def get_all_tags():
         return """
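procedures_for_database only renders SQL text; the caller executes it. A quick usage sketch (the database name is illustrative):

    from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery

    sql = SnowflakeQuery.procedures_for_database("ANALYTICS")
    # Selects PROCEDURE_CATALOG, PROCEDURE_SCHEMA, PROCEDURE_NAME, argument signature,
    # return type, definition, timestamps and comment from
    # "ANALYTICS".information_schema.procedures, ordered by schema and name.
    # Passing None omits the database qualifier.
    print(sql)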
datahub/ingestion/source/snowflake/snowflake_report.py

@@ -105,6 +105,7 @@ class SnowflakeV2Report(
     databases_scanned: int = 0
     tags_scanned: int = 0
     streams_scanned: int = 0
+    procedures_scanned: int = 0

     include_usage_stats: bool = False
     include_operational_stats: bool = False

@@ -163,6 +164,8 @@ class SnowflakeV2Report(
             self.tags_scanned += 1
         elif ent_type == "stream":
             self.streams_scanned += 1
+        elif ent_type == "procedure":
+            self.procedures_scanned += 1
         else:
             raise KeyError(f"Unknown entity {ent_type}.")

datahub/ingestion/source/snowflake/snowflake_schema.py

@@ -14,6 +14,7 @@ from datahub.ingestion.source.snowflake.snowflake_query import (
     SnowflakeQuery,
 )
 from datahub.ingestion.source.sql.sql_generic import BaseColumn, BaseTable, BaseView
+from datahub.ingestion.source.sql.stored_procedures.base import BaseProcedure
 from datahub.utilities.file_backed_collections import FileBackedDict
 from datahub.utilities.prefix_batch_builder import PrefixGroup, build_prefix_batches
 from datahub.utilities.serialized_lru_cache import serialized_lru_cache

@@ -714,3 +715,31 @@ class SnowflakeDataDictionary(SupportsAsObj):
                 stream_pagination_marker = stream_name

         return streams
+
+    @serialized_lru_cache(maxsize=1)
+    def get_procedures_for_database(
+        self, db_name: str
+    ) -> Dict[str, List[BaseProcedure]]:
+        procedures: Dict[str, List[BaseProcedure]] = {}
+        cur = self.connection.query(
+            SnowflakeQuery.procedures_for_database(db_name),
+        )
+
+        for procedure in cur:
+            if procedure["PROCEDURE_SCHEMA"] not in procedures:
+                procedures[procedure["PROCEDURE_SCHEMA"]] = []
+
+            procedures[procedure["PROCEDURE_SCHEMA"]].append(
+                BaseProcedure(
+                    name=procedure["PROCEDURE_NAME"],
+                    language=procedure["PROCEDURE_LANGUAGE"],
+                    argument_signature=procedure["ARGUMENT_SIGNATURE"],
+                    return_type=procedure["PROCEDURE_RETURN_TYPE"],
+                    procedure_definition=procedure["PROCEDURE_DEFINITION"],
+                    created=procedure["CREATED"],
+                    last_altered=procedure["LAST_ALTERED"],
+                    comment=procedure["COMMENT"],
+                    extra_properties=None,
+                )
+            )
+        return procedures