acryl-datahub 1.0.0.1rc2__py3-none-any.whl → 1.0.0.1rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (37)
  1. {acryl_datahub-1.0.0.1rc2.dist-info → acryl_datahub-1.0.0.1rc4.dist-info}/METADATA +2569 -2569
  2. {acryl_datahub-1.0.0.1rc2.dist-info → acryl_datahub-1.0.0.1rc4.dist-info}/RECORD +37 -35
  3. datahub/_version.py +1 -1
  4. datahub/emitter/rest_emitter.py +2 -2
  5. datahub/ingestion/graph/client.py +6 -11
  6. datahub/ingestion/graph/filters.py +22 -2
  7. datahub/ingestion/source/common/subtypes.py +1 -1
  8. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  9. datahub/ingestion/source/ge_data_profiler.py +11 -1
  10. datahub/ingestion/source/mlflow.py +19 -1
  11. datahub/ingestion/source/redshift/lineage_v2.py +7 -0
  12. datahub/ingestion/source/redshift/query.py +1 -1
  13. datahub/ingestion/source/snowflake/constants.py +1 -0
  14. datahub/ingestion/source/snowflake/snowflake_config.py +14 -1
  15. datahub/ingestion/source/snowflake/snowflake_query.py +17 -0
  16. datahub/ingestion/source/snowflake/snowflake_report.py +3 -0
  17. datahub/ingestion/source/snowflake/snowflake_schema.py +29 -0
  18. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +112 -42
  19. datahub/ingestion/source/snowflake/snowflake_utils.py +25 -1
  20. datahub/ingestion/source/sql/mssql/job_models.py +15 -1
  21. datahub/ingestion/source/sql/mssql/source.py +8 -4
  22. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  23. datahub/ingestion/source/sql/stored_procedures/base.py +242 -0
  24. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +1 -29
  25. datahub/ingestion/source/superset.py +153 -13
  26. datahub/ingestion/source/vertexai/vertexai.py +1 -1
  27. datahub/metadata/schema.avsc +2 -0
  28. datahub/metadata/schemas/Deprecation.avsc +2 -0
  29. datahub/metadata/schemas/MetadataChangeEvent.avsc +2 -0
  30. datahub/sdk/__init__.py +1 -0
  31. datahub/sdk/main_client.py +2 -1
  32. datahub/sdk/search_filters.py +18 -23
  33. datahub/sql_parsing/split_statements.py +17 -3
  34. {acryl_datahub-1.0.0.1rc2.dist-info → acryl_datahub-1.0.0.1rc4.dist-info}/WHEEL +0 -0
  35. {acryl_datahub-1.0.0.1rc2.dist-info → acryl_datahub-1.0.0.1rc4.dist-info}/entry_points.txt +0 -0
  36. {acryl_datahub-1.0.0.1rc2.dist-info → acryl_datahub-1.0.0.1rc4.dist-info}/licenses/LICENSE +0 -0
  37. {acryl_datahub-1.0.0.1rc2.dist-info → acryl_datahub-1.0.0.1rc4.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py
@@ -12,32 +12,14 @@ from datahub.configuration import ConfigModel
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.source import SourceReport
 from datahub.ingestion.graph.client import DataHubGraph
-from datahub.ingestion.graph.filters import RemovedStatusFilter
+from datahub.ingestion.graph.filters import RemovedStatusFilter, SearchFilterRule
 from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.stats_collections import TopKDict
 from datahub.utilities.urns._urn_base import Urn
+from datahub.utilities.urns.error import InvalidUrnError

 logger = logging.getLogger(__name__)

-QUERY_ENTITIES = """
-query listEntities($input: ScrollAcrossEntitiesInput!) {
-  scrollAcrossEntities(input: $input) {
-    nextScrollId
-    count
-    searchResults {
-      entity {
-        ... on QueryEntity {
-          urn
-        }
-        ... on DataProcessInstance {
-          urn
-        }
-      }
-    }
-  }
-}
-"""
-

 class SoftDeletedEntitiesCleanupConfig(ConfigModel):
     enabled: bool = Field(
@@ -64,7 +46,33 @@ class SoftDeletedEntitiesCleanupConfig(ConfigModel):
     )

     entity_types: Optional[List[str]] = Field(
-        default=None,
+        # A default value is required otherwise QUERY and DATAPROCESS_INSTANCE won't be included
+        default=[
+            "dataset",
+            "dashboard",
+            "chart",
+            "mlmodel",
+            "mlmodelGroup",
+            "mlfeatureTable",
+            "mlfeature",
+            "mlprimaryKey",
+            "dataFlow",
+            "dataJob",
+            "glossaryTerm",
+            "glossaryNode",
+            "tag",
+            "role",
+            "corpuser",
+            "corpGroup",
+            "container",
+            "domain",
+            "dataProduct",
+            "notebook",
+            "businessAttribute",
+            "schemaField",
+            "query",
+            "dataProcessInstance",
+        ],
         description="List of entity types to cleanup",
     )

@@ -103,6 +111,9 @@ class SoftDeletedEntitiesReport(SourceReport):
     num_entities_found: Dict[str, int] = field(default_factory=dict)
     num_soft_deleted_entity_processed: int = 0
     num_soft_deleted_retained_due_to_age: int = 0
+    num_soft_deleted_retained_due_to_age_by_type: TopKDict[str, int] = field(
+        default_factory=TopKDict
+    )
     num_soft_deleted_entity_removal_started: int = 0
     num_hard_deleted: int = 0
     num_hard_deleted_by_type: TopKDict[str, int] = field(default_factory=TopKDict)
@@ -111,6 +122,8 @@ class SoftDeletedEntitiesReport(SourceReport):
     )
     runtime_limit_reached: bool = False
     deletion_limit_reached: bool = False
+    num_soft_deleted_entity_found: int = 0
+    num_soft_deleted_entity_invalid_urn: int = 0


 class SoftDeletedEntitiesCleanup:
@@ -133,7 +146,7 @@ class SoftDeletedEntitiesCleanup:
         self.config = config
         self.report = report
         self.dry_run = dry_run
-        self.start_time = 0.0
+        self.start_time = time.time()
         self._report_lock: Lock = Lock()
         self.last_print_time = 0.0

@@ -142,6 +155,14 @@ class SoftDeletedEntitiesCleanup:
         with self._report_lock:
             self.report.num_soft_deleted_retained_due_to_age += 1

+    def _increment_retained_by_type(self, type: str) -> None:
+        """Thread-safe method to update report fields"""
+        with self._report_lock:
+            self.report.num_soft_deleted_retained_due_to_age_by_type[type] = (
+                self.report.num_soft_deleted_retained_due_to_age_by_type.get(type, 0)
+                + 1
+            )
+
     def _increment_removal_started_count(self) -> None:
         """Thread-safe method to update report fields"""
         with self._report_lock:
@@ -160,10 +181,9 @@ class SoftDeletedEntitiesCleanup:
         )
         self.report.sample_hard_deleted_aspects_by_type[entity_type].append(urn)

-    def delete_entity(self, urn: str) -> None:
+    def delete_entity(self, urn: Urn) -> None:
         assert self.ctx.graph

-        entity_urn = Urn.from_string(urn)
         if self.dry_run:
             logger.info(
                 f"Dry run is on otherwise it would have deleted {urn} with hard deletion"
@@ -172,14 +192,14 @@ class SoftDeletedEntitiesCleanup:
         if self._deletion_limit_reached() or self._times_up():
             return
         self._increment_removal_started_count()
-        self.ctx.graph.delete_entity(urn=urn, hard=True)
+        self.ctx.graph.delete_entity(urn=urn.urn(), hard=True)
         self.ctx.graph.delete_references_to_urn(
-            urn=urn,
+            urn=urn.urn(),
             dry_run=False,
         )
-        self._update_report(urn, entity_urn.entity_type)
+        self._update_report(urn.urn(), urn.entity_type)

-    def delete_soft_deleted_entity(self, urn: str) -> None:
+    def delete_soft_deleted_entity(self, urn: Urn) -> None:
         assert self.ctx.graph

         retention_time = (
@@ -187,7 +207,7 @@ class SoftDeletedEntitiesCleanup:
             - self.config.retention_days * 24 * 60 * 60
         )

-        aspect = self.ctx.graph.get_entity_raw(entity_urn=urn, aspects=["status"])
+        aspect = self.ctx.graph.get_entity_raw(entity_urn=urn.urn(), aspects=["status"])
         if "status" in aspect["aspects"]:
             if aspect["aspects"]["status"]["value"]["removed"] and aspect["aspects"][
                 "status"
@@ -196,6 +216,7 @@ class SoftDeletedEntitiesCleanup:
                 self.delete_entity(urn)
             else:
                 self._increment_retained_count()
+                self._increment_retained_by_type(urn.entity_type)

     def _print_report(self) -> None:
         time_taken = round(time.time() - self.last_print_time, 1)
@@ -204,7 +225,7 @@ class SoftDeletedEntitiesCleanup:
             self.last_print_time = time.time()
             logger.info(f"\n{self.report.as_string()}")

-    def _process_futures(self, futures: Dict[Future, str]) -> Dict[Future, str]:
+    def _process_futures(self, futures: Dict[Future, Urn]) -> Dict[Future, Urn]:
         done, not_done = wait(futures, return_when=FIRST_COMPLETED)
         futures = {future: urn for future, urn in futures.items() if future in not_done}

@@ -214,7 +235,7 @@ class SoftDeletedEntitiesCleanup:
                 self.report.failure(
                     title="Failed to delete entity",
                     message="Failed to delete entity",
-                    context=futures[future],
+                    context=futures[future].urn(),
                     exc=future.exception(),
                 )
             self.report.num_soft_deleted_entity_processed += 1
@@ -229,86 +250,52 @@ class SoftDeletedEntitiesCleanup:
             time.sleep(self.config.delay)
         return futures

-    def _get_soft_deleted(self, graphql_query: str, entity_type: str) -> Iterable[str]:
+    def _get_urns(self) -> Iterable[str]:
         assert self.ctx.graph
-        scroll_id: Optional[str] = None
-
-        batch_size = self.config.batch_size
-        if entity_type == "DATA_PROCESS_INSTANCE":
-            # Due to a bug in Data process instance querying this is a temp workaround
-            # to avoid a giant stacktrace by having a smaller batch size in first call
-            # This will be remove in future version after server with fix has been
-            # around for a while
-            batch_size = 10
-
-        while True:
-            try:
-                if entity_type not in self.report.num_calls_made:
-                    self.report.num_calls_made[entity_type] = 1
-                else:
-                    self.report.num_calls_made[entity_type] += 1
-                self._print_report()
-                result = self.ctx.graph.execute_graphql(
-                    graphql_query,
-                    {
-                        "input": {
-                            "types": [entity_type],
-                            "query": "*",
-                            "scrollId": scroll_id if scroll_id else None,
-                            "count": batch_size,
-                            "orFilters": [
-                                {
-                                    "and": [
-                                        {
-                                            "field": "removed",
-                                            "values": ["true"],
-                                            "condition": "EQUAL",
-                                        }
-                                    ]
-                                }
-                            ],
-                        }
-                    },
-                )
-            except Exception as e:
-                self.report.failure(
-                    f"While trying to get {entity_type} with {scroll_id}", exc=e
-                )
-                break
-            scroll_across_entities = result.get("scrollAcrossEntities")
-            if not scroll_across_entities:
-                break
-            search_results = scroll_across_entities.get("searchResults")
-            count = scroll_across_entities.get("count")
-            if not count or not search_results:
-                # Due to a server bug we cannot rely on just count as it was returning response like this
-                # {'count': 1, 'nextScrollId': None, 'searchResults': []}
-                break
-            if entity_type == "DATA_PROCESS_INSTANCE":
-                # Temp workaround. See note in beginning of the function
-                # We make the batch size = config after call has succeeded once
-                batch_size = self.config.batch_size
-            scroll_id = scroll_across_entities.get("nextScrollId")
-            if entity_type not in self.report.num_entities_found:
-                self.report.num_entities_found[entity_type] = 0
-            self.report.num_entities_found[entity_type] += scroll_across_entities.get(
-                "count"
+        # Entities created in the retention period are not considered for deletion
+        created_from = int(
+            (
+                datetime.now(timezone.utc).timestamp()
+                - self.config.retention_days * 24 * 60 * 60
             )
-            for query in search_results:
-                yield query["entity"]["urn"]
+            * 1000
+        )
+
+        entity_types = self.config.entity_types
+        # dataProcessInstance is a special case where we need to get the entities separately
+        # because we need to filter based on created time we don't stream to many dataProcessInstance entities at once
+        # Gc source soft-deletes dataProcessInstance entities which causes to have a lot of soft deleted entities
+        if (
+            self.config.entity_types
+            and "dataProcessInstance" in self.config.entity_types
+        ):
+            entity_types = self.config.entity_types.copy()
+            yield from self.ctx.graph.get_urns_by_filter(
+                entity_types=["dataProcessInstance"],
+                platform=self.config.platform,
+                env=self.config.env,
+                query=self.config.query,
+                status=RemovedStatusFilter.ONLY_SOFT_DELETED,
+                batch_size=self.config.batch_size,
+                extraFilters=[
+                    SearchFilterRule(
+                        field="created",
+                        condition="LESS_THAN",
+                        values=[f"{created_from}"],
+                    ).to_raw()
+                ],
+            )
+
+            entity_types.remove("dataProcessInstance")

-    def _get_urns(self) -> Iterable[str]:
-        assert self.ctx.graph
         yield from self.ctx.graph.get_urns_by_filter(
-            entity_types=self.config.entity_types,
+            entity_types=entity_types,
             platform=self.config.platform,
             env=self.config.env,
             query=self.config.query,
             status=RemovedStatusFilter.ONLY_SOFT_DELETED,
             batch_size=self.config.batch_size,
         )
-        yield from self._get_soft_deleted(QUERY_ENTITIES, "QUERY")
-        yield from self._get_soft_deleted(QUERY_ENTITIES, "DATA_PROCESS_INSTANCE")

     def _times_up(self) -> bool:
         if (
@@ -335,16 +322,26 @@ class SoftDeletedEntitiesCleanup:
             return
         self.start_time = time.time()

-        futures: Dict[Future, str] = dict()
+        futures: Dict[Future, Urn] = dict()
         with ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
             for urn in self._get_urns():
+                try:
+                    self.report.num_soft_deleted_entity_found += 1
+                    soft_deleted_urn = Urn.from_string(urn)
+                except InvalidUrnError as e:
+                    logger.error(f"Failed to parse urn {urn} with error {e}")
+                    self.report.num_soft_deleted_entity_invalid_urn += 1
+                    continue
+
                 self._print_report()
                 while len(futures) >= self.config.futures_max_at_time:
                     futures = self._process_futures(futures)
                 if self._deletion_limit_reached() or self._times_up():
                     break
-                future = executor.submit(self.delete_soft_deleted_entity, urn)
-                futures[future] = urn
+                future = executor.submit(
+                    self.delete_soft_deleted_entity, soft_deleted_urn
+                )
+                futures[future] = soft_deleted_urn

         logger.info(f"Waiting for {len(futures)} futures to complete")
         while len(futures) > 0:
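
The reworked _get_urns keeps dataProcessInstance entities that are still inside the retention window out of deletion by comparing their created time against a cutoff expressed in epoch milliseconds. A minimal standalone sketch of that cutoff arithmetic, with an assumed retention_days value in place of the real cleanup config:

from datetime import datetime, timezone

retention_days = 10  # assumed value; the real cleanup reads config.retention_days

# Pin "now" so the example is deterministic; the source uses datetime.now(timezone.utc).
now = datetime(2024, 1, 11, tzinfo=timezone.utc)

# Cutoff in epoch milliseconds: entities created after this instant are retained.
created_from = int((now.timestamp() - retention_days * 24 * 60 * 60) * 1000)

print(created_from)  # 1704067200000, i.e. 2024-01-01T00:00:00Z in milliseconds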

datahub/ingestion/source/ge_data_profiler.py
@@ -602,7 +602,7 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
         if not self.config.include_field_median_value:
             return
         try:
-            if self.dataset.engine.dialect.name.lower() in [SNOWFLAKE, DATABRICKS]:
+            if self.dataset.engine.dialect.name.lower() == SNOWFLAKE:
                 column_profile.median = str(
                     self.dataset.engine.execute(
                         sa.select([sa.func.median(sa.column(column))]).select_from(
@@ -610,6 +610,16 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
                         )
                     ).scalar()
                 )
+            elif self.dataset.engine.dialect.name.lower() == DATABRICKS:
+                column_profile.median = str(
+                    self.dataset.engine.execute(
+                        sa.select(
+                            sa.text(
+                                f"approx_percentile(`{column}`, 0.5) as approx_median"
+                            )
+                        ).select_from(self.dataset._table)
+                    ).scalar()
+                )
             elif self.dataset.engine.dialect.name.lower() == BIGQUERY:
                 column_profile.median = str(
                     self.dataset.engine.execute(
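
For reference, the new Databricks branch above computes an approximate median via approx_percentile(col, 0.5) instead of the MEDIAN() aggregate kept for Snowflake. A rough sketch of the statement it builds, using made-up table and column names and rendered through a generic SQLAlchemy dialect rather than the profiler's live engine:

import sqlalchemy as sa

column = "amount"           # hypothetical column
table = sa.table("orders")  # hypothetical table

# Same shape as the profiler's query: a raw text expression wrapped in a SELECT.
stmt = sa.select(
    sa.text(f"approx_percentile(`{column}`, 0.5) as approx_median")
).select_from(table)

# Prints roughly: SELECT approx_percentile(`amount`, 0.5) as approx_median FROM orders
print(stmt)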

datahub/ingestion/source/mlflow.py
@@ -1,4 +1,5 @@
 import json
+import os
 import time
 from dataclasses import dataclass
 from typing import Any, Callable, Iterable, List, Optional, Tuple, TypeVar, Union
@@ -115,6 +116,13 @@ class MLflowConfig(StatefulIngestionConfigBase, EnvConfigMixin):
         default=None, description="Mapping of source type to datahub platform"
     )

+    username: Optional[str] = Field(
+        default=None, description="Username for MLflow authentication"
+    )
+    password: Optional[str] = Field(
+        default=None, description="Password for MLflow authentication"
+    )
+

 @dataclass
 class MLflowRegisteredModelStageInfo:
@@ -161,7 +169,17 @@ class MLflowSource(StatefulIngestionSourceBase):
         self.ctx = ctx
         self.config = config
         self.report = StaleEntityRemovalSourceReport()
-        self.client = MlflowClient(
+        self.client = self._configure_client()
+
+    def _configure_client(self) -> MlflowClient:
+        if bool(self.config.username) != bool(self.config.password):
+            raise ValueError("Both username and password must be set together")
+
+        if self.config.username and self.config.password:
+            os.environ["MLFLOW_TRACKING_USERNAME"] = self.config.username
+            os.environ["MLFLOW_TRACKING_PASSWORD"] = self.config.password
+
+        return MlflowClient(
             tracking_uri=self.config.tracking_uri,
             registry_uri=self.config.registry_uri,
         )
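
The new username/password options lean on MLflow's own credential resolution: the tracking client reads HTTP basic-auth credentials from the MLFLOW_TRACKING_USERNAME and MLFLOW_TRACKING_PASSWORD environment variables. A minimal sketch of that flow outside the ingestion source, with placeholder credentials and URI:

import os

from mlflow.tracking import MlflowClient

# Placeholder credentials; _configure_client sets these from the recipe config.
os.environ["MLFLOW_TRACKING_USERNAME"] = "ingestion-user"
os.environ["MLFLOW_TRACKING_PASSWORD"] = "ingestion-secret"

# MLflow picks up the variables above when the client calls the tracking server.
client = MlflowClient(tracking_uri="https://mlflow.example.com")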

datahub/ingestion/source/redshift/lineage_v2.py
@@ -400,6 +400,10 @@ class RedshiftSqlLineageV2(Closeable):
         db_schemas: Dict[str, Dict[str, RedshiftSchema]],
     ) -> None:
         for schema_name, tables in all_tables[self.database].items():
+            logger.info(f"External table lineage: checking schema {schema_name}")
+            if not db_schemas[self.database].get(schema_name):
+                logger.warning(f"Schema {schema_name} not found")
+                continue
             for table in tables:
                 schema = db_schemas[self.database][schema_name]
                 if (
@@ -407,6 +411,9 @@ class RedshiftSqlLineageV2(Closeable):
                     and schema.is_external_schema()
                     and schema.external_platform
                 ):
+                    logger.info(
+                        f"External table lineage: processing table {schema_name}.{table.name}"
+                    )
                     # external_db_params = schema.option
                     upstream_platform = schema.external_platform.lower()


datahub/ingestion/source/redshift/query.py
@@ -44,7 +44,7 @@ class RedshiftCommonQuery:
         SELECT
             schema_name,
             schema_type,
-            schema_option,
+            cast(null as varchar(1024)) as schema_option,
             cast(null as varchar(256)) as external_platform,
             cast(null as varchar(256)) as external_database
         FROM svv_redshift_schemas

datahub/ingestion/source/snowflake/constants.py
@@ -54,6 +54,7 @@ class SnowflakeObjectDomain(StrEnum):
     COLUMN = "column"
     ICEBERG_TABLE = "iceberg table"
     STREAM = "stream"
+    PROCEDURE = "procedure"


 GENERIC_PERMISSION_ERROR_KEY = "permission-error"

datahub/ingestion/source/snowflake/snowflake_config.py
@@ -100,7 +100,15 @@ class SnowflakeFilterConfig(SQLFilterConfig):

     stream_pattern: AllowDenyPattern = Field(
         default=AllowDenyPattern.allow_all(),
-        description="Regex patterns for streams to filter in ingestion. Note: Defaults to table_pattern if not specified. Specify regex to match the entire view name in database.schema.view format. e.g. to match all views starting with customer in Customer database and public schema, use the regex 'Customer.public.customer.*'",
+        description="Regex patterns for streams to filter in ingestion. Specify regex to match the entire view name in database.schema.view format. e.g. to match all views starting with customer in Customer database and public schema, use the regex 'Customer.public.customer.*'",
+    )
+
+    procedure_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="Regex patterns for procedures to filter in ingestion. "
+        "Specify regex to match the entire procedure name in database.schema.procedure format. "
+        "e.g. to match all procedures starting with customer in Customer database and public schema,"
+        " use the regex 'Customer.public.customer.*'",
     )

     match_fully_qualified_names: bool = Field(
@@ -284,6 +292,11 @@ class SnowflakeV2Config(
         description="If enabled, streams will be ingested as separate entities from tables/views.",
     )

+    include_procedures: bool = Field(
+        default=True,
+        description="If enabled, procedures will be ingested as pipelines/tasks.",
+    )
+
     structured_property_pattern: AllowDenyPattern = Field(
         default=AllowDenyPattern.allow_all(),
         description=(
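
The new procedure_pattern behaves like the other Snowflake filters: its regex is matched against the fully qualified database.schema.procedure name. A small sketch of those filtering semantics, assuming datahub's AllowDenyPattern helper and example object names:

from datahub.configuration.common import AllowDenyPattern

# Only procedures whose fully qualified name matches the allow regex are ingested.
procedure_pattern = AllowDenyPattern(allow=[r"ANALYTICS\.PUBLIC\.CUSTOMER_.*"])

print(procedure_pattern.allowed("ANALYTICS.PUBLIC.CUSTOMER_REFRESH"))  # True
print(procedure_pattern.allowed("ANALYTICS.PUBLIC.ORDERS_REFRESH"))    # False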

datahub/ingestion/source/snowflake/snowflake_query.py
@@ -164,6 +164,23 @@ class SnowflakeQuery:
         and table_type in ('BASE TABLE', 'EXTERNAL TABLE')
         order by table_schema, table_name"""

+    @staticmethod
+    def procedures_for_database(db_name: Optional[str]) -> str:
+        db_clause = f'"{db_name}".' if db_name is not None else ""
+        return f"""
+        SELECT procedure_catalog AS "PROCEDURE_CATALOG",
+          procedure_schema AS "PROCEDURE_SCHEMA",
+          procedure_name AS "PROCEDURE_NAME",
+          procedure_language AS "PROCEDURE_LANGUAGE",
+          argument_signature AS "ARGUMENT_SIGNATURE",
+          data_type AS "PROCEDURE_RETURN_TYPE",
+          procedure_definition AS "PROCEDURE_DEFINITION",
+          created AS "CREATED",
+          last_altered AS "LAST_ALTERED",
+          comment AS "COMMENT"
+        FROM {db_clause}information_schema.procedures
+        order by procedure_schema, procedure_name"""
+
     @staticmethod
     def get_all_tags():
         return """

datahub/ingestion/source/snowflake/snowflake_report.py
@@ -105,6 +105,7 @@ class SnowflakeV2Report(
     databases_scanned: int = 0
     tags_scanned: int = 0
     streams_scanned: int = 0
+    procedures_scanned: int = 0

     include_usage_stats: bool = False
     include_operational_stats: bool = False
@@ -163,6 +164,8 @@ class SnowflakeV2Report(
             self.tags_scanned += 1
         elif ent_type == "stream":
             self.streams_scanned += 1
+        elif ent_type == "procedure":
+            self.procedures_scanned += 1
         else:
             raise KeyError(f"Unknown entity {ent_type}.")


datahub/ingestion/source/snowflake/snowflake_schema.py
@@ -14,6 +14,7 @@ from datahub.ingestion.source.snowflake.snowflake_query import (
     SnowflakeQuery,
 )
 from datahub.ingestion.source.sql.sql_generic import BaseColumn, BaseTable, BaseView
+from datahub.ingestion.source.sql.stored_procedures.base import BaseProcedure
 from datahub.utilities.file_backed_collections import FileBackedDict
 from datahub.utilities.prefix_batch_builder import PrefixGroup, build_prefix_batches
 from datahub.utilities.serialized_lru_cache import serialized_lru_cache
@@ -714,3 +715,31 @@ class SnowflakeDataDictionary(SupportsAsObj):
                 stream_pagination_marker = stream_name

         return streams
+
+    @serialized_lru_cache(maxsize=1)
+    def get_procedures_for_database(
+        self, db_name: str
+    ) -> Dict[str, List[BaseProcedure]]:
+        procedures: Dict[str, List[BaseProcedure]] = {}
+        cur = self.connection.query(
+            SnowflakeQuery.procedures_for_database(db_name),
+        )
+
+        for procedure in cur:
+            if procedure["PROCEDURE_SCHEMA"] not in procedures:
+                procedures[procedure["PROCEDURE_SCHEMA"]] = []
+
+            procedures[procedure["PROCEDURE_SCHEMA"]].append(
+                BaseProcedure(
+                    name=procedure["PROCEDURE_NAME"],
+                    language=procedure["PROCEDURE_LANGUAGE"],
+                    argument_signature=procedure["ARGUMENT_SIGNATURE"],
+                    return_type=procedure["PROCEDURE_RETURN_TYPE"],
+                    procedure_definition=procedure["PROCEDURE_DEFINITION"],
+                    created=procedure["CREATED"],
+                    last_altered=procedure["LAST_ALTERED"],
+                    comment=procedure["COMMENT"],
+                    extra_properties=None,
+                )
+            )
+        return procedures