acryl-datahub 1.2.0.7rc2__py3-none-any.whl → 1.2.0.7rc3__py3-none-any.whl

This diff compares the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic.

Files changed (24)
  1. {acryl_datahub-1.2.0.7rc2.dist-info → acryl_datahub-1.2.0.7rc3.dist-info}/METADATA +2525 -2521
  2. {acryl_datahub-1.2.0.7rc2.dist-info → acryl_datahub-1.2.0.7rc3.dist-info}/RECORD +23 -24
  3. datahub/_version.py +1 -1
  4. datahub/ingestion/source/redshift/config.py +9 -6
  5. datahub/ingestion/source/redshift/lineage.py +386 -687
  6. datahub/ingestion/source/redshift/redshift.py +19 -106
  7. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +4 -1
  8. datahub/ingestion/source/snowflake/snowflake_v2.py +1 -0
  9. datahub/ingestion/source/sql/mssql/job_models.py +3 -1
  10. datahub/ingestion/source/sql/mssql/source.py +62 -3
  11. datahub/ingestion/source/unity/config.py +11 -0
  12. datahub/ingestion/source/unity/proxy.py +77 -0
  13. datahub/ingestion/source/unity/proxy_types.py +24 -0
  14. datahub/ingestion/source/unity/report.py +5 -0
  15. datahub/ingestion/source/unity/source.py +99 -1
  16. datahub/metadata/_internal_schema_classes.py +5 -5
  17. datahub/metadata/schema.avsc +66 -60
  18. datahub/metadata/schemas/LogicalParent.avsc +104 -100
  19. datahub/metadata/schemas/SchemaFieldKey.avsc +3 -1
  20. datahub/ingestion/source/redshift/lineage_v2.py +0 -466
  21. {acryl_datahub-1.2.0.7rc2.dist-info → acryl_datahub-1.2.0.7rc3.dist-info}/WHEEL +0 -0
  22. {acryl_datahub-1.2.0.7rc2.dist-info → acryl_datahub-1.2.0.7rc3.dist-info}/entry_points.txt +0 -0
  23. {acryl_datahub-1.2.0.7rc2.dist-info → acryl_datahub-1.2.0.7rc3.dist-info}/licenses/LICENSE +0 -0
  24. {acryl_datahub-1.2.0.7rc2.dist-info → acryl_datahub-1.2.0.7rc3.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/redshift/lineage_v2.py (deleted)
@@ -1,466 +0,0 @@
- import collections
- import logging
- from typing import Callable, Dict, Iterable, List, Optional, Set, Tuple, Union
-
- import redshift_connector
-
- from datahub.emitter import mce_builder
- from datahub.ingestion.api.closeable import Closeable
- from datahub.ingestion.api.common import PipelineContext
- from datahub.ingestion.api.workunit import MetadataWorkUnit
- from datahub.ingestion.source.redshift.config import LineageMode, RedshiftConfig
- from datahub.ingestion.source.redshift.lineage import (
-     LineageCollectorType,
-     RedshiftLineageExtractor,
- )
- from datahub.ingestion.source.redshift.query import (
-     RedshiftCommonQuery,
-     RedshiftProvisionedQuery,
-     RedshiftServerlessQuery,
- )
- from datahub.ingestion.source.redshift.redshift_schema import (
-     LineageRow,
-     RedshiftDataDictionary,
-     RedshiftSchema,
-     RedshiftTable,
-     RedshiftView,
- )
- from datahub.ingestion.source.redshift.report import RedshiftReport
- from datahub.ingestion.source.state.redundant_run_skip_handler import (
-     RedundantLineageRunSkipHandler,
- )
- from datahub.metadata.urns import DatasetUrn
- from datahub.sql_parsing.sql_parsing_aggregator import (
-     KnownQueryLineageInfo,
-     ObservedQuery,
-     SqlParsingAggregator,
- )
- from datahub.utilities.perf_timer import PerfTimer
-
- logger = logging.getLogger(__name__)
-
-
- class RedshiftSqlLineageV2(Closeable):
-     # does lineage and usage based on SQL parsing.
-
-     def __init__(
-         self,
-         config: RedshiftConfig,
-         report: RedshiftReport,
-         context: PipelineContext,
-         database: str,
-         redundant_run_skip_handler: Optional[RedundantLineageRunSkipHandler] = None,
-     ):
-         self.platform = "redshift"
-         self.config = config
-         self.report = report
-         self.context = context
-
-         self.database = database
-         self.known_urns: Set[str] = set()  # will be set later
-
-         self.aggregator = SqlParsingAggregator(
-             platform=self.platform,
-             platform_instance=self.config.platform_instance,
-             env=self.config.env,
-             generate_lineage=True,
-             generate_queries=self.config.lineage_v2_generate_queries,
-             generate_usage_statistics=False,
-             generate_operations=False,
-             usage_config=self.config,
-             graph=self.context.graph,
-             is_temp_table=self._is_temp_table,
-         )
-         self.report.sql_aggregator = self.aggregator.report
-
-         self.queries: RedshiftCommonQuery = RedshiftProvisionedQuery()
-         if self.config.is_serverless:
-             self.queries = RedshiftServerlessQuery()
-
-         self._lineage_v1 = RedshiftLineageExtractor(
-             config=config,
-             report=report,
-             context=context,
-             redundant_run_skip_handler=redundant_run_skip_handler,
-         )
-
-         self.start_time, self.end_time = (
-             self.report.lineage_start_time,
-             self.report.lineage_end_time,
-         ) = self._lineage_v1.get_time_window()
-
-     def _is_temp_table(self, name: str) -> bool:
-         return (
-             DatasetUrn.create_from_ids(
-                 self.platform,
-                 name,
-                 env=self.config.env,
-                 platform_instance=self.config.platform_instance,
-             ).urn()
-             not in self.known_urns
-         )
-
-     def build(
-         self,
-         connection: redshift_connector.Connection,
-         all_tables: Dict[str, Dict[str, List[Union[RedshiftView, RedshiftTable]]]],
-         db_schemas: Dict[str, Dict[str, RedshiftSchema]],
-     ) -> None:
-         # Assume things not in `all_tables` as temp tables.
-         self.known_urns = {
-             DatasetUrn.create_from_ids(
-                 self.platform,
-                 f"{db}.{schema}.{table.name}",
-                 env=self.config.env,
-                 platform_instance=self.config.platform_instance,
-             ).urn()
-             for db, schemas in all_tables.items()
-             for schema, tables in schemas.items()
-             for table in tables
-         }
-
-         # Handle all the temp tables up front.
-         if self.config.resolve_temp_table_in_lineage:
-             for temp_row in self._lineage_v1.get_temp_tables(connection=connection):
-                 self.aggregator.add_observed_query(
-                     ObservedQuery(
-                         query=temp_row.query_text,
-                         default_db=self.database,
-                         default_schema=self.config.default_schema,
-                         session_id=temp_row.session_id,
-                         timestamp=temp_row.start_time,
-                     ),
-                     # The "temp table" query actually returns all CREATE TABLE statements, even if they
-                     # aren't explicitly a temp table. As such, setting is_known_temp_table=True
-                     # would not be correct. We already have mechanisms to autodetect temp tables,
-                     # so we won't lose anything by not setting it.
-                     is_known_temp_table=False,
-                 )
-
-         populate_calls: List[Tuple[LineageCollectorType, str, Callable]] = []
-
-         if self.config.include_table_rename_lineage:
-             # Process all the ALTER TABLE RENAME statements
-             table_renames, _ = self._lineage_v1._process_table_renames(
-                 database=self.database,
-                 connection=connection,
-                 all_tables=collections.defaultdict(
-                     lambda: collections.defaultdict(set)
-                 ),
-             )
-             for entry in table_renames.values():
-                 self.aggregator.add_table_rename(entry)
-
-         if self.config.table_lineage_mode in {
-             LineageMode.SQL_BASED,
-             LineageMode.MIXED,
-         }:
-             # Populate lineage by parsing table creating sqls
-             query = self.queries.list_insert_create_queries_sql(
-                 db_name=self.database,
-                 start_time=self.start_time,
-                 end_time=self.end_time,
-             )
-             populate_calls.append(
-                 (
-                     LineageCollectorType.QUERY_SQL_PARSER,
-                     query,
-                     self._process_sql_parser_lineage,
-                 )
-             )
-         if self.config.table_lineage_mode in {
-             LineageMode.STL_SCAN_BASED,
-             LineageMode.MIXED,
-         }:
-             # Populate lineage by getting upstream tables from stl_scan redshift table
-             query = self.queries.stl_scan_based_lineage_query(
-                 self.database,
-                 self.start_time,
-                 self.end_time,
-             )
-             populate_calls.append(
-                 (LineageCollectorType.QUERY_SCAN, query, self._process_stl_scan_lineage)
-             )
-
-         if self.config.include_views and self.config.include_view_lineage:
-             # Populate lineage for views
-             query = self.queries.view_lineage_query()
-             populate_calls.append(
-                 (LineageCollectorType.VIEW, query, self._process_view_lineage)
-             )
-
-             # Populate lineage for late binding views
-             query = self.queries.list_late_view_ddls_query()
-             populate_calls.append(
-                 (
-                     LineageCollectorType.VIEW_DDL_SQL_PARSING,
-                     query,
-                     self._process_view_lineage,
-                 )
-             )
-
-         if self.config.include_copy_lineage:
-             # Populate lineage for copy commands.
-             query = self.queries.list_copy_commands_sql(
-                 db_name=self.database,
-                 start_time=self.start_time,
-                 end_time=self.end_time,
-             )
-             populate_calls.append(
-                 (LineageCollectorType.COPY, query, self._process_copy_command)
-             )
-
-         if self.config.include_unload_lineage:
-             # Populate lineage for unload commands.
-             query = self.queries.list_unload_commands_sql(
-                 db_name=self.database,
-                 start_time=self.start_time,
-                 end_time=self.end_time,
-             )
-             populate_calls.append(
-                 (LineageCollectorType.UNLOAD, query, self._process_unload_command)
-             )
-
-         for lineage_type, query, processor in populate_calls:
-             self._populate_lineage_agg(
-                 query=query,
-                 lineage_type=lineage_type,
-                 processor=processor,
-                 connection=connection,
-             )
-
-         # Populate lineage for external tables.
-         if not self.config.skip_external_tables:
-             self._process_external_tables(all_tables=all_tables, db_schemas=db_schemas)
-
-     def _populate_lineage_agg(
-         self,
-         query: str,
-         lineage_type: LineageCollectorType,
-         processor: Callable[[LineageRow], None],
-         connection: redshift_connector.Connection,
-     ) -> None:
-         logger.info(f"Extracting {lineage_type.name} lineage for db {self.database}")
-         try:
-             logger.debug(f"Processing {lineage_type.name} lineage query: {query}")
-
-             timer = self.report.lineage_phases_timer.setdefault(
-                 lineage_type.name, PerfTimer()
-             )
-             with timer:
-                 for lineage_row in RedshiftDataDictionary.get_lineage_rows(
-                     conn=connection, query=query
-                 ):
-                     processor(lineage_row)
-         except Exception as e:
-             self.report.warning(
-                 title="Failed to extract some lineage",
-                 message=f"Failed to extract lineage of type {lineage_type.name}",
-                 context=f"Query: '{query}'",
-                 exc=e,
-             )
-             self._lineage_v1.report_status(f"extract-{lineage_type.name}", False)
-
-     def _process_sql_parser_lineage(self, lineage_row: LineageRow) -> None:
-         ddl = lineage_row.ddl
-         if ddl is None:
-             return
-
-         # TODO actor
-
-         self.aggregator.add_observed_query(
-             ObservedQuery(
-                 query=ddl,
-                 default_db=self.database,
-                 default_schema=self.config.default_schema,
-                 timestamp=lineage_row.timestamp,
-                 session_id=lineage_row.session_id,
-             )
-         )
-
-     def _make_filtered_target(self, lineage_row: LineageRow) -> Optional[DatasetUrn]:
-         target = DatasetUrn.create_from_ids(
-             self.platform,
-             f"{self.database}.{lineage_row.target_schema}.{lineage_row.target_table}",
-             env=self.config.env,
-             platform_instance=self.config.platform_instance,
-         )
-         if target.urn() not in self.known_urns:
-             logger.debug(
-                 f"Skipping lineage for {target.urn()} as it is not in known_urns"
-             )
-             return None
-
-         return target
-
-     def _process_stl_scan_lineage(self, lineage_row: LineageRow) -> None:
-         target = self._make_filtered_target(lineage_row)
-         if not target:
-             return
-
-         source = DatasetUrn.create_from_ids(
-             self.platform,
-             f"{self.database}.{lineage_row.source_schema}.{lineage_row.source_table}",
-             env=self.config.env,
-             platform_instance=self.config.platform_instance,
-         )
-
-         if lineage_row.ddl is None:
-             logger.warning(
-                 f"stl scan entry is missing query text for {lineage_row.source_schema}.{lineage_row.source_table}"
-             )
-             return
-         self.aggregator.add_known_query_lineage(
-             KnownQueryLineageInfo(
-                 query_text=lineage_row.ddl,
-                 downstream=target.urn(),
-                 upstreams=[source.urn()],
-                 timestamp=lineage_row.timestamp,
-             ),
-             merge_lineage=True,
-         )
-
-     def _process_view_lineage(self, lineage_row: LineageRow) -> None:
-         ddl = lineage_row.ddl
-         if ddl is None:
-             return
-
-         target = self._make_filtered_target(lineage_row)
-         if not target:
-             return
-
-         self.aggregator.add_view_definition(
-             view_urn=target,
-             view_definition=ddl,
-             default_db=self.database,
-             default_schema=self.config.default_schema,
-         )
-
-     def _process_copy_command(self, lineage_row: LineageRow) -> None:
-         logger.debug(f"Processing COPY command for lineage row: {lineage_row}")
-         sources = self._lineage_v1._get_sources(
-             lineage_type=LineageCollectorType.COPY,
-             db_name=self.database,
-             source_schema=None,
-             source_table=None,
-             ddl=None,
-             filename=lineage_row.filename,
-         )
-         logger.debug(f"Recognized sources: {sources}")
-         source = sources[0]
-         if not source:
-             logger.debug("Ignoring command since couldn't recognize proper source")
-             return
-         s3_urn = source[0].urn
-         logger.debug(f"Recognized s3 dataset urn: {s3_urn}")
-         if not lineage_row.target_schema or not lineage_row.target_table:
-             logger.debug(
-                 f"Didn't find target schema (found: {lineage_row.target_schema}) or target table (found: {lineage_row.target_table})"
-             )
-             return
-         target = self._make_filtered_target(lineage_row)
-         if not target:
-             return
-
-         self.aggregator.add_known_lineage_mapping(
-             upstream_urn=s3_urn, downstream_urn=target.urn()
-         )
-
-     def _process_unload_command(self, lineage_row: LineageRow) -> None:
-         lineage_entry = self._lineage_v1._get_target_lineage(
-             alias_db_name=self.database,
-             lineage_row=lineage_row,
-             lineage_type=LineageCollectorType.UNLOAD,
-             all_tables_set={},
-         )
-         if not lineage_entry:
-             return
-         output_urn = lineage_entry.dataset.urn
-
-         if not lineage_row.source_schema or not lineage_row.source_table:
-             return
-         source = DatasetUrn.create_from_ids(
-             self.platform,
-             f"{self.database}.{lineage_row.source_schema}.{lineage_row.source_table}",
-             env=self.config.env,
-             platform_instance=self.config.platform_instance,
-         )
-         if source.urn() not in self.known_urns:
-             logger.debug(
-                 f"Skipping unload lineage for {source.urn()} as it is not in known_urns"
-             )
-             return
-
-         self.aggregator.add_known_lineage_mapping(
-             upstream_urn=source.urn(), downstream_urn=output_urn
-         )
-
-     def _process_external_tables(
-         self,
-         all_tables: Dict[str, Dict[str, List[Union[RedshiftView, RedshiftTable]]]],
-         db_schemas: Dict[str, Dict[str, RedshiftSchema]],
-     ) -> None:
-         for schema_name, tables in all_tables[self.database].items():
-             logger.info(f"External table lineage: checking schema {schema_name}")
-             if not db_schemas[self.database].get(schema_name):
-                 logger.warning(f"Schema {schema_name} not found")
-                 continue
-             for table in tables:
-                 schema = db_schemas[self.database][schema_name]
-                 if (
-                     table.is_external_table()
-                     and schema.is_external_schema()
-                     and schema.external_platform
-                 ):
-                     logger.info(
-                         f"External table lineage: processing table {schema_name}.{table.name}"
-                     )
-                     # external_db_params = schema.option
-                     upstream_platform = schema.external_platform.lower()
-
-                     table_urn = mce_builder.make_dataset_urn_with_platform_instance(
-                         self.platform,
-                         f"{self.database}.{schema_name}.{table.name}",
-                         platform_instance=self.config.platform_instance,
-                         env=self.config.env,
-                     )
-                     if upstream_platform == self.platform:
-                         upstream_schema = schema.get_upstream_schema_name() or "public"
-                         upstream_dataset_name = (
-                             f"{schema.external_database}.{upstream_schema}.{table.name}"
-                         )
-                         upstream_platform_instance = self.config.platform_instance
-                     else:
-                         upstream_dataset_name = (
-                             f"{schema.external_database}.{table.name}"
-                         )
-                         upstream_platform_instance = (
-                             self.config.platform_instance_map.get(upstream_platform)
-                             if self.config.platform_instance_map
-                             else None
-                         )
-
-                     upstream_urn = mce_builder.make_dataset_urn_with_platform_instance(
-                         upstream_platform,
-                         upstream_dataset_name,
-                         platform_instance=upstream_platform_instance,
-                         env=self.config.env,
-                     )
-
-                     self.aggregator.add_known_lineage_mapping(
-                         upstream_urn=upstream_urn,
-                         downstream_urn=table_urn,
-                     )
-
-     def generate(self) -> Iterable[MetadataWorkUnit]:
-         for mcp in self.aggregator.gen_metadata():
-             yield mcp.as_workunit()
-         if len(self.aggregator.report.observed_query_parse_failures) > 0:
-             self.report.report_warning(
-                 title="Failed to extract some SQL lineage",
-                 message="Unexpected error(s) while attempting to extract lineage from SQL queries. See the full logs for more details.",
-                 context=f"Query Parsing Failures: {self.aggregator.report.observed_query_parse_failures}",
-             )
-
-     def close(self) -> None:
-         self.aggregator.close()
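
The heart of the removed file is its temp-table heuristic: any table whose URN was not registered during schema extraction is treated as a temp table. Below is a minimal, self-contained sketch of that check; it is not part of the package diff, and the table names and the PROD environment are illustrative assumptions.

from typing import Set

from datahub.metadata.urns import DatasetUrn

# URNs collected during schema extraction; anything absent is assumed temporary.
# The example names here are hypothetical.
known_urns: Set[str] = {
    DatasetUrn.create_from_ids("redshift", "dev.public.orders", env="PROD").urn(),
}


def is_temp_table(name: str) -> bool:
    # Mirrors RedshiftSqlLineageV2._is_temp_table: a plain URN membership test.
    urn = DatasetUrn.create_from_ids("redshift", name, env="PROD").urn()
    return urn not in known_urns


print(is_temp_table("dev.public.orders"))       # False: seen during extraction
print(is_temp_table("dev.public.tmp_staging"))  # True: unseen, treated as temp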
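
For context, the removed class funneled every lineage signal into a SqlParsingAggregator and then drained it into workunits in generate(). The following condensed sketch of that pattern is likewise not part of the diff: the constructor keywords are limited to those visible above (others left at their defaults), and the query text is made up.

from datahub.sql_parsing.sql_parsing_aggregator import (
    ObservedQuery,
    SqlParsingAggregator,
)

# Same construction pattern as RedshiftSqlLineageV2.__init__, minus the
# pipeline-specific wiring (usage_config, graph, is_temp_table callback).
aggregator = SqlParsingAggregator(
    platform="redshift",
    generate_lineage=True,
    generate_queries=True,
    generate_usage_statistics=False,
    generate_operations=False,
)

# Each lineage row's SQL was wrapped in an ObservedQuery, as in
# _process_sql_parser_lineage; this statement is illustrative only.
aggregator.add_observed_query(
    ObservedQuery(
        query="CREATE TABLE public.dst AS SELECT * FROM public.src",
        default_db="dev",
        default_schema="public",
    )
)

# generate() then emitted each aggregated MCP as a workunit.
for mcp in aggregator.gen_metadata():
    print(mcp.as_workunit().id)

aggregator.close()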