acryl-datahub 0.15.0rc15__py3-none-any.whl → 0.15.0rc17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0rc15.dist-info → acryl_datahub-0.15.0rc17.dist-info}/METADATA +2485 -2501
- {acryl_datahub-0.15.0rc15.dist-info → acryl_datahub-0.15.0rc17.dist-info}/RECORD +49 -49
- datahub/__init__.py +1 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +7 -5
- datahub/cli/cli_utils.py +2 -0
- datahub/cli/delete_cli.py +66 -20
- datahub/configuration/common.py +3 -3
- datahub/ingestion/api/incremental_properties_helper.py +69 -0
- datahub/ingestion/api/source.py +5 -1
- datahub/ingestion/api/source_helpers.py +3 -1
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +2 -2
- datahub/ingestion/run/pipeline.py +1 -1
- datahub/ingestion/run/pipeline_config.py +6 -0
- datahub/ingestion/sink/datahub_rest.py +3 -3
- datahub/ingestion/source/abs/source.py +4 -0
- datahub/ingestion/source/gc/datahub_gc.py +5 -5
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +1 -1
- datahub/ingestion/source/kafka/kafka.py +18 -11
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -2
- datahub/ingestion/source/looker/view_upstream.py +65 -30
- datahub/ingestion/source/mode.py +0 -23
- datahub/ingestion/source/redash.py +13 -63
- datahub/ingestion/source/redshift/config.py +1 -0
- datahub/ingestion/source/redshift/redshift.py +2 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_query.py +6 -2
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_schema.py +12 -0
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +17 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +45 -5
- datahub/ingestion/source/snowflake/snowflake_v2.py +6 -0
- datahub/ingestion/source/state/redundant_run_skip_handler.py +1 -1
- datahub/ingestion/source/tableau/tableau.py +35 -16
- datahub/ingestion/source/tableau/tableau_common.py +0 -1
- datahub/ingestion/source/unity/source.py +2 -0
- datahub/ingestion/source/unity/usage.py +20 -11
- datahub/metadata/_schema_classes.py +122 -2
- datahub/metadata/com/linkedin/pegasus2avro/structured/__init__.py +2 -0
- datahub/metadata/schema.avsc +73 -1
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +1 -1
- datahub/metadata/schemas/StructuredPropertyKey.avsc +1 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +114 -0
- datahub/sql_parsing/schema_resolver.py +23 -0
- datahub/sql_parsing/sqlglot_lineage.py +48 -13
- datahub/testing/doctest.py +12 -0
- datahub/utilities/partition_executor.py +1 -1
- datahub/utilities/sql_lineage_parser_impl.py +0 -160
- datahub/utilities/sql_parser.py +0 -94
- datahub/utilities/sql_parser_base.py +0 -21
- {acryl_datahub-0.15.0rc15.dist-info → acryl_datahub-0.15.0rc17.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0rc15.dist-info → acryl_datahub-0.15.0rc17.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0rc15.dist-info → acryl_datahub-0.15.0rc17.dist-info}/top_level.txt +0 -0

datahub/ingestion/run/pipeline_config.py

@@ -117,3 +117,9 @@ class PipelineConfig(ConfigModel):
         config = cls.parse_obj(resolved_dict)
         config._raw_dict = raw_dict
         return config
+
+    def get_raw_dict(self) -> Dict:
+        result = self._raw_dict
+        if result is None:
+            result = self.dict()
+        return result
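
A minimal usage sketch of the new accessor (the recipe contents below are placeholders): when the config is parsed directly and no raw dict was stored on it, get_raw_dict() falls back to the pydantic dict() view.

    from datahub.ingestion.run.pipeline_config import PipelineConfig

    # Placeholder recipe; no raw dict is stashed on the model here, so
    # get_raw_dict() simply returns the parsed config as a dict.
    recipe = {
        "source": {"type": "file", "config": {"path": "metadata_events.json"}},
        "sink": {"type": "console"},
    }
    config = PipelineConfig.parse_obj(recipe)
    assert config.get_raw_dict() == config.dict()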

datahub/ingestion/sink/datahub_rest.py

@@ -65,11 +65,11 @@ class DatahubRestSinkConfig(DatahubClientConfig):
     mode: RestSinkMode = _DEFAULT_REST_SINK_MODE
 
     # These only apply in async modes.
-    max_threads:
-    max_pending_requests:
+    max_threads: pydantic.PositiveInt = _DEFAULT_REST_SINK_MAX_THREADS
+    max_pending_requests: pydantic.PositiveInt = 2000
 
     # Only applies in async batch mode.
-    max_per_batch:
+    max_per_batch: pydantic.PositiveInt = 100
 
 
 @dataclasses.dataclass
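
A small hedged check of what the switch to pydantic.PositiveInt implies: non-positive values for these sink settings should now be rejected when the config is parsed (the server URL is a placeholder; constructing the config does not connect anywhere).

    import pydantic

    from datahub.ingestion.sink.datahub_rest import DatahubRestSinkConfig

    try:
        DatahubRestSinkConfig(server="http://localhost:8080", max_threads=0)
    except pydantic.ValidationError as e:
        print(e)  # max_threads must be a positive integer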

datahub/ingestion/source/abs/source.py

@@ -201,6 +201,10 @@ class ABSSource(StatefulIngestionSourceBase):
                 ).infer_schema(file)
             elif extension == ".json":
                 fields = json.JsonInferrer().infer_schema(file)
+            elif extension == ".jsonl":
+                fields = json.JsonInferrer(
+                    max_rows=self.source_config.max_rows, format="jsonl"
+                ).infer_schema(file)
             elif extension == ".avro":
                 fields = avro.AvroInferrer().infer_schema(file)
             else:

datahub/ingestion/source/gc/datahub_gc.py

@@ -153,11 +153,6 @@ class DataHubGcSource(Source):
             self.truncate_indices()
         except Exception as e:
             self.report.failure("While trying to truncate indices ", exc=e)
-        if self.dataprocess_cleanup:
-            try:
-                yield from self.dataprocess_cleanup.get_workunits_internal()
-            except Exception as e:
-                self.report.failure("While trying to cleanup data process ", exc=e)
         if self.soft_deleted_entities_cleanup:
             try:
                 self.soft_deleted_entities_cleanup.cleanup_soft_deleted_entities()

@@ -170,6 +165,11 @@ class DataHubGcSource(Source):
             self.execution_request_cleanup.run()
         except Exception as e:
             self.report.failure("While trying to cleanup execution request ", exc=e)
+        if self.dataprocess_cleanup:
+            try:
+                yield from self.dataprocess_cleanup.get_workunits_internal()
+            except Exception as e:
+                self.report.failure("While trying to cleanup data process ", exc=e)
         yield from []
 
     def truncate_indices(self) -> None:

datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py

@@ -60,7 +60,7 @@ class SoftDeletedEntitiesCleanupConfig(ConfigModel):
         description="Query to filter entities",
     )
     limit_entities_delete: Optional[int] = Field(
-
+        25000, description="Max number of entities to delete."
     )
 
     runtime_limit_seconds: Optional[int] = Field(

datahub/ingestion/source/kafka/kafka.py

@@ -141,6 +141,10 @@ class KafkaSourceConfig(
         default=False,
         description="Disables the utilization of the TopicRecordNameStrategy for Schema Registry subjects. For more information, visit: https://docs.confluent.io/platform/current/schema-registry/serdes-develop/index.html#handling-differences-between-preregistered-and-client-derived-schemas:~:text=io.confluent.kafka.serializers.subject.TopicRecordNameStrategy",
     )
+    ingest_schemas_as_entities: bool = pydantic.Field(
+        default=False,
+        description="Enables ingesting schemas from schema registry as separate entities, in addition to the topics",
+    )
 
 
 def get_kafka_consumer(
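
For context, a hedged recipe sketch that exercises the new flag; the broker and DataHub server addresses are placeholders, and everything else relies on the source's defaults.

    from datahub.ingestion.run.pipeline import Pipeline

    pipeline = Pipeline.create(
        {
            "source": {
                "type": "kafka",
                "config": {
                    "connection": {"bootstrap": "localhost:9092"},
                    # New in this release: also emit schema registry subjects
                    # as their own dataset entities.
                    "ingest_schemas_as_entities": True,
                },
            },
            "sink": {
                "type": "datahub-rest",
                "config": {"server": "http://localhost:8080"},
            },
        }
    )
    pipeline.run()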

@@ -343,17 +347,20 @@ class KafkaSource(StatefulIngestionSourceBase, TestableSource):
             else:
                 self.report.report_dropped(topic)
 
-
-
-
-
-
-
-
-
-
-
-
+        if self.source_config.ingest_schemas_as_entities:
+            # Get all subjects from schema registry and ingest them as SCHEMA DatasetSubTypes
+            for subject in self.schema_registry_client.get_subjects():
+                try:
+                    yield from self._extract_record(
+                        subject, True, topic_detail=None, extra_topic_config=None
+                    )
+                except Exception as e:
+                    logger.warning(
+                        f"Failed to extract subject {subject}", exc_info=True
+                    )
+                    self.report.report_warning(
+                        "subject", f"Exception while extracting topic {subject}: {e}"
+                    )
 
     def _extract_record(
         self,

datahub/ingestion/source/looker/lookml_concept_context.py

@@ -88,8 +88,7 @@ class LookerFieldContext:
         for upstream_field_match in re.finditer(r"\${TABLE}\.[\"]*([\.\w]+)", sql):
             matched_field = upstream_field_match.group(1)
             # Remove quotes from field names
-            matched_field
-            column_names.append(matched_field)
+            column_names.append(matched_field.replace('"', "").replace("`", "").lower())
 
         return column_names
 

datahub/ingestion/source/looker/view_upstream.py

@@ -25,11 +25,13 @@ from datahub.ingestion.source.looker.lookml_config import (
     LookMLSourceReport,
 )
 from datahub.ingestion.source.looker.urn_functions import get_qualified_table_name
+from datahub.sql_parsing.schema_resolver import match_columns_to_schema
 from datahub.sql_parsing.sqlglot_lineage import (
     ColumnLineageInfo,
     ColumnRef,
     SqlParsingResult,
     Urn,
+    create_and_cache_schema_resolver,
     create_lineage_sql_parsed_result,
 )
 

@@ -200,7 +202,7 @@ def _generate_fully_qualified_name(
 class AbstractViewUpstream(ABC):
     """
     Implementation of this interface extracts the view upstream as per the way the view is bound to datasets.
-    For detail explanation please refer lookml_concept_context.LookerViewContext documentation.
+    For detail explanation, please refer lookml_concept_context.LookerViewContext documentation.
     """
 
     view_context: LookerViewContext

@@ -236,6 +238,47 @@ class AbstractViewUpstream(ABC):
     def create_fields(self) -> List[ViewField]:
         return []  # it is for the special case
 
+    def create_upstream_column_refs(
+        self, upstream_urn: str, downstream_looker_columns: List[str]
+    ) -> List[ColumnRef]:
+        """
+        - **`upstream_urn`**: The URN of the upstream dataset.
+
+        - **`expected_columns`**: These are the columns identified by the Looker connector as belonging to the `upstream_urn` dataset. However, there is potential for human error in specifying the columns of the upstream dataset. For example, a user might declare a column in lowercase, while on the actual platform, it may exist in uppercase, or vice versa.
+
+        - This function ensures consistency in column-level lineage by consulting GMS before creating the final `ColumnRef` instance, avoiding discrepancies.
+        """
+        schema_resolver = create_and_cache_schema_resolver(
+            platform=self.view_context.view_connection.platform,
+            platform_instance=self.view_context.view_connection.platform_instance,
+            env=self.view_context.view_connection.platform_env or self.config.env,
+            graph=self.ctx.graph,
+        )
+
+        urn, schema_info = schema_resolver.resolve_urn(urn=upstream_urn)
+
+        if schema_info:
+            actual_columns = match_columns_to_schema(
+                schema_info, downstream_looker_columns
+            )
+        else:
+            logger.info(
+                f"schema_info not found for dataset {urn} in GMS. Using expected_columns to form ColumnRef"
+            )
+            actual_columns = [column.lower() for column in downstream_looker_columns]
+
+        upstream_column_refs: List[ColumnRef] = []
+
+        for column in actual_columns:
+            upstream_column_refs.append(
+                ColumnRef(
+                    column=column,
+                    table=upstream_urn,
+                )
+            )
+
+        return upstream_column_refs
+
 
 class SqlBasedDerivedViewUpstream(AbstractViewUpstream, ABC):
     """
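
The docstring above describes normalizing Looker-declared column names against the schema registered in GMS. A purely illustrative, self-contained sketch of that case-insensitive matching idea (the helper name and sample data are made up; the real logic lives in match_columns_to_schema and create_upstream_column_refs):

    from typing import Dict, List

    def match_case_insensitively(schema_columns: List[str], looker_columns: List[str]) -> List[str]:
        # Prefer the casing reported by the platform; fall back to lowercase
        # when a declared column is not found in the resolved schema.
        by_lower: Dict[str, str] = {c.lower(): c for c in schema_columns}
        return [by_lower.get(c.lower(), c.lower()) for c in looker_columns]

    print(match_case_insensitively(["ORDER_ID", "Amount"], ["order_id", "AMOUNT", "missing"]))
    # ['ORDER_ID', 'Amount', 'missing']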

@@ -372,15 +415,12 @@ class SqlBasedDerivedViewUpstream(AbstractViewUpstream, ABC):
         # in-case of "select * from look_ml_view.SQL_TABLE_NAME" or extra field are defined in the looker view which is
         # referring to upstream table
         if self._get_upstream_dataset_urn() and not upstreams_column_refs:
-            upstreams_column_refs =
-
-
-
-
-
-                )
-                for column in field_context.column_name_in_sql_attribute()
-            ]
+            upstreams_column_refs = self.create_upstream_column_refs(
+                upstream_urn=self._get_upstream_dataset_urn()[
+                    0
+                ],  # 0th index has table of from clause,
+                downstream_looker_columns=field_context.column_name_in_sql_attribute(),
+            )
 
         # fix any derived view reference present in urn
         upstreams_column_refs = resolve_derived_view_urn_of_col_ref(

@@ -487,18 +527,18 @@ class NativeDerivedViewUpstream(AbstractViewUpstream):
             return upstream_column_refs
 
         explore_urn: str = self._get_upstream_dataset_urn()[0]
+        expected_columns: List[str] = []
 
         for column in field_context.column_name_in_sql_attribute():
             if column in self._get_explore_column_mapping():
                 explore_column: Dict = self._get_explore_column_mapping()[column]
-
-
-                        column=explore_column.get("field", explore_column[NAME]),
-                        table=explore_urn,
-                    )
+                expected_columns.append(
+                    explore_column.get("field", explore_column[NAME])
                 )
 
-        return
+        return self.create_upstream_column_refs(
+            upstream_urn=explore_urn, downstream_looker_columns=expected_columns
+        )
 
     def get_upstream_dataset_urn(self) -> List[Urn]:
         return self._get_upstream_dataset_urn()

@@ -548,14 +588,10 @@ class RegularViewUpstream(AbstractViewUpstream):
     def get_upstream_column_ref(
         self, field_context: LookerFieldContext
     ) -> List[ColumnRef]:
-
-
-
-
-                ColumnRef(table=self._get_upstream_dataset_urn(), column=column_name)
-            )
-
-        return upstream_column_ref
+        return self.create_upstream_column_refs(
+            upstream_urn=self._get_upstream_dataset_urn(),
+            downstream_looker_columns=field_context.column_name_in_sql_attribute(),
+        )
 
     def get_upstream_dataset_urn(self) -> List[Urn]:
         return [self._get_upstream_dataset_urn()]

@@ -609,15 +645,14 @@ class DotSqlTableNameViewUpstream(AbstractViewUpstream):
         self, field_context: LookerFieldContext
     ) -> List[ColumnRef]:
         upstream_column_ref: List[ColumnRef] = []
+
         if not self._get_upstream_dataset_urn():
             return upstream_column_ref
 
-
-
-
-
-
-        return upstream_column_ref
+        return self.create_upstream_column_refs(
+            upstream_urn=self._get_upstream_dataset_urn()[0],
+            downstream_looker_columns=field_context.column_name_in_sql_attribute(),
+        )
 
     def get_upstream_dataset_urn(self) -> List[Urn]:
         return self._get_upstream_dataset_urn()

datahub/ingestion/source/mode.py

@@ -18,7 +18,6 @@ from pydantic import Field, validator
 from requests.adapters import HTTPAdapter, Retry
 from requests.exceptions import ConnectionError
 from requests.models import HTTPBasicAuth, HTTPError
-from sqllineage.runner import LineageRunner
 from tenacity import retry_if_exception_type, stop_after_attempt, wait_exponential
 
 import datahub.emitter.mce_builder as builder

@@ -820,28 +819,6 @@ class ModeSource(StatefulIngestionSourceBase):
         )
         return None
 
-    @lru_cache(maxsize=None)
-    def _get_source_from_query(self, raw_query: str) -> set:
-        query = self._replace_definitions(raw_query)
-        parser = LineageRunner(query)
-        source_paths = set()
-        try:
-            for table in parser.source_tables:
-                sources = str(table).split(".")
-                source_schema, source_table = sources[-2], sources[-1]
-                if source_schema == "<default>":
-                    source_schema = str(self.config.default_schema)
-
-                source_paths.add(f"{source_schema}.{source_table}")
-        except Exception as e:
-            self.report.report_failure(
-                title="Failed to Extract Lineage From Query",
-                message="Unable to retrieve lineage from Mode query.",
-                context=f"Query: {raw_query}, Error: {str(e)}",
-            )
-
-        return source_paths
-
     def _get_datasource_urn(
         self,
         platform: str,

datahub/ingestion/source/redash.py

@@ -2,7 +2,7 @@ import logging
 import math
 import sys
 from dataclasses import dataclass, field
-from typing import Dict, Iterable, List, Optional, Set
+from typing import Dict, Iterable, List, Optional, Set
 
 import dateutil.parser as dp
 from packaging import version

@@ -22,7 +22,6 @@ from datahub.ingestion.api.decorators import ( # SourceCapability,; capability,
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.registry import import_path
 from datahub.ingestion.api.source import Source, SourceCapability, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.metadata.com.linkedin.pegasus2avro.common import (

@@ -39,9 +38,9 @@ from datahub.metadata.schema_classes import (
     ChartTypeClass,
     DashboardInfoClass,
 )
+from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result
 from datahub.utilities.lossy_collections import LossyDict, LossyList
 from datahub.utilities.perf_timer import PerfTimer
-from datahub.utilities.sql_parser_base import SQLParser
 from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor
 
 logger = logging.getLogger(__name__)

@@ -270,10 +269,6 @@ class RedashConfig(ConfigModel):
     parse_table_names_from_sql: bool = Field(
         default=False, description="See note below."
     )
-    sql_parser: str = Field(
-        default="datahub.utilities.sql_parser.DefaultSQLParser",
-        description="custom SQL parser. See note below for details.",
-    )
 
     env: str = Field(
         default=DEFAULT_ENV,

@@ -354,7 +349,6 @@ class RedashSource(Source):
         self.api_page_limit = self.config.api_page_limit or math.inf
 
         self.parse_table_names_from_sql = self.config.parse_table_names_from_sql
-        self.sql_parser_path = self.config.sql_parser
 
         logger.info(
             f"Running Redash ingestion with parse_table_names_from_sql={self.parse_table_names_from_sql}"

@@ -380,31 +374,6 @@ class RedashSource(Source):
         config = RedashConfig.parse_obj(config_dict)
         return cls(ctx, config)
 
-    @classmethod
-    def _import_sql_parser_cls(cls, sql_parser_path: str) -> Type[SQLParser]:
-        assert "." in sql_parser_path, "sql_parser-path must contain a ."
-        parser_cls = import_path(sql_parser_path)
-
-        if not issubclass(parser_cls, SQLParser):
-            raise ValueError(f"must be derived from {SQLParser}; got {parser_cls}")
-        return parser_cls
-
-    @classmethod
-    def _get_sql_table_names(cls, sql: str, sql_parser_path: str) -> List[str]:
-        parser_cls = cls._import_sql_parser_cls(sql_parser_path)
-
-        try:
-            sql_table_names: List[str] = parser_cls(sql).get_tables()
-        except Exception as e:
-            logger.warning(f"Sql parser failed on {sql} with {e}")
-            return []
-
-        # Remove quotes from table names
-        sql_table_names = [t.replace('"', "") for t in sql_table_names]
-        sql_table_names = [t.replace("`", "") for t in sql_table_names]
-
-        return sql_table_names
-
     def _get_chart_data_source(self, data_source_id: Optional[int] = None) -> Dict:
         url = f"/api/data_sources/{data_source_id}"
         resp = self.client._get(url).json()

@@ -441,14 +410,6 @@ class RedashSource(Source):
 
         return database_name
 
-    def _construct_datalineage_urn(
-        self, platform: str, database_name: str, sql_table_name: str
-    ) -> str:
-        full_dataset_name = get_full_qualified_name(
-            platform, database_name, sql_table_name
-        )
-        return builder.make_dataset_urn(platform, full_dataset_name, self.config.env)
-
     def _get_datasource_urns(
         self, data_source: Dict, sql_query_data: Dict = {}
     ) -> Optional[List[str]]:

@@ -464,34 +425,23 @@ class RedashSource(Source):
         # Getting table lineage from SQL parsing
         if self.parse_table_names_from_sql and data_source_syntax == "sql":
             dataset_urns = list()
-
-
-
-
-
+            sql_parser_in_tables = create_lineage_sql_parsed_result(
+                query=query,
+                platform=platform,
+                env=self.config.env,
+                platform_instance=None,
+                default_db=database_name,
+            )
+            # make sure dataset_urns is not empty list
+            dataset_urns = sql_parser_in_tables.in_tables
+            if sql_parser_in_tables.debug_info.table_error:
                 self.report.queries_problem_parsing.add(str(query_id))
                 self.error(
                     logger,
                     "sql-parsing",
-                    f"exception {
+                    f"exception {sql_parser_in_tables.debug_info.table_error} in parsing query-{query_id}-datasource-{data_source_id}",
                 )
-            sql_table_names = []
-            for sql_table_name in sql_table_names:
-                try:
-                    dataset_urns.append(
-                        self._construct_datalineage_urn(
-                            platform, database_name, sql_table_name
-                        )
-                    )
-                except Exception:
-                    self.report.queries_problem_parsing.add(str(query_id))
-                    self.warn(
-                        logger,
-                        "data-urn-invalid",
-                        f"Problem making URN for {sql_table_name} parsed from query {query_id}",
-                    )
 
-            # make sure dataset_urns is not empty list
             return dataset_urns if len(dataset_urns) > 0 else None
 
         else:
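
A minimal sketch of the sqlglot-based parser the Redash source now delegates to; the query, platform, and database name are placeholders, and no DataHub graph connection is needed for offline parsing.

    from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result

    result = create_lineage_sql_parsed_result(
        query="SELECT id, amount FROM payments WHERE amount > 0",
        platform="postgres",
        env="PROD",
        platform_instance=None,
        default_db="analytics",
    )
    print(result.in_tables)               # upstream dataset URNs parsed from the query
    print(result.debug_info.table_error)  # None unless table extraction failed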

datahub/ingestion/source/redshift/config.py

@@ -159,6 +159,7 @@ class RedshiftConfig(
         description="Whether to extract column level lineage. This config works with rest-sink only.",
     )
 
+    # TODO - use DatasetPropertiesConfigMixin instead
     patch_custom_properties: bool = Field(
         default=True,
         description="Whether to patch custom properties on existing datasets rather than replace.",

datahub/ingestion/source/redshift/redshift.py

@@ -831,6 +831,8 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
             customProperties=custom_properties,
         )
         if self.config.patch_custom_properties:
+            # TODO: use auto_incremental_properties workunit processor instead
+            # Deprecate use of patch_custom_properties
             patch_builder = create_dataset_props_patch_builder(
                 dataset_urn, dataset_properties
             )

datahub/ingestion/source/snowflake/snowflake_config.py

@@ -16,6 +16,9 @@ from datahub.configuration.source_common import (
 from datahub.configuration.time_window_config import BaseTimeWindowConfig
 from datahub.configuration.validate_field_removal import pydantic_removed_field
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
+from datahub.ingestion.api.incremental_properties_helper import (
+    IncrementalPropertiesConfigMixin,
+)
 from datahub.ingestion.glossary.classification_mixin import (
     ClassificationSourceConfigMixin,
 )

@@ -188,6 +191,7 @@ class SnowflakeV2Config(
     StatefulUsageConfigMixin,
     StatefulProfilingConfigMixin,
     ClassificationSourceConfigMixin,
+    IncrementalPropertiesConfigMixin,
 ):
     include_usage_stats: bool = Field(
         default=True,

datahub/ingestion/source/snowflake/snowflake_query.py

@@ -129,7 +129,9 @@ class SnowflakeQuery:
         row_count AS "ROW_COUNT",
         bytes AS "BYTES",
         clustering_key AS "CLUSTERING_KEY",
-        auto_clustering_on AS "AUTO_CLUSTERING_ON"
+        auto_clustering_on AS "AUTO_CLUSTERING_ON",
+        is_dynamic AS "IS_DYNAMIC",
+        is_iceberg AS "IS_ICEBERG"
         FROM {db_clause}information_schema.tables t
         WHERE table_schema != 'INFORMATION_SCHEMA'
         and table_type in ( 'BASE TABLE', 'EXTERNAL TABLE', 'HYBRID TABLE')

@@ -149,7 +151,9 @@ class SnowflakeQuery:
         row_count AS "ROW_COUNT",
         bytes AS "BYTES",
         clustering_key AS "CLUSTERING_KEY",
-        auto_clustering_on AS "AUTO_CLUSTERING_ON"
+        auto_clustering_on AS "AUTO_CLUSTERING_ON",
+        is_dynamic AS "IS_DYNAMIC",
+        is_iceberg AS "IS_ICEBERG"
         FROM {db_clause}information_schema.tables t
         where table_schema='{schema_name}'
         and table_type in ('BASE TABLE', 'EXTERNAL TABLE', 'HYBRID TABLE')

datahub/ingestion/source/snowflake/snowflake_report.py

@@ -113,6 +113,7 @@ class SnowflakeV2Report(
     external_lineage_queries_secs: float = -1
     num_tables_with_known_upstreams: int = 0
     num_upstream_lineage_edge_parsing_failed: int = 0
+    num_secure_views_missing_definition: int = 0
 
     data_dictionary_cache: Optional["SnowflakeDataDictionary"] = None
 

datahub/ingestion/source/snowflake/snowflake_schema.py

@@ -90,6 +90,12 @@ class SnowflakeTable(BaseTable):
     foreign_keys: List[SnowflakeFK] = field(default_factory=list)
     tags: Optional[List[SnowflakeTag]] = None
     column_tags: Dict[str, List[SnowflakeTag]] = field(default_factory=dict)
+    is_dynamic: bool = False
+    is_iceberg: bool = False
+
+    @property
+    def is_hybrid(self) -> bool:
+        return self.type is not None and self.type == "HYBRID TABLE"
 
 
 @dataclass

@@ -98,6 +104,7 @@ class SnowflakeView(BaseView):
     columns: List[SnowflakeColumn] = field(default_factory=list)
     tags: Optional[List[SnowflakeTag]] = None
     column_tags: Dict[str, List[SnowflakeTag]] = field(default_factory=dict)
+    is_secure: bool = False
 
 
 @dataclass

@@ -289,6 +296,8 @@ class SnowflakeDataDictionary(SupportsAsObj):
                     rows_count=table["ROW_COUNT"],
                     comment=table["COMMENT"],
                     clustering_key=table["CLUSTERING_KEY"],
+                    is_dynamic=table.get("IS_DYNAMIC", "NO").upper() == "YES",
+                    is_iceberg=table.get("IS_ICEBERG", "NO").upper() == "YES",
                 )
             )
         return tables

@@ -313,6 +322,8 @@ class SnowflakeDataDictionary(SupportsAsObj):
                     rows_count=table["ROW_COUNT"],
                     comment=table["COMMENT"],
                     clustering_key=table["CLUSTERING_KEY"],
+                    is_dynamic=table.get("IS_DYNAMIC", "NO").upper() == "YES",
+                    is_iceberg=table.get("IS_ICEBERG", "NO").upper() == "YES",
                 )
             )
         return tables

@@ -356,6 +367,7 @@ class SnowflakeDataDictionary(SupportsAsObj):
                     materialized=(
                         view.get("is_materialized", "false").lower() == "true"
                     ),
+                    is_secure=(view.get("is_secure", "false").lower() == "true"),
                 )
             )
 

datahub/ingestion/source/snowflake/snowflake_schema_gen.py

@@ -431,6 +431,8 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
                     default_db=db_name,
                     default_schema=schema_name,
                 )
+            elif view.is_secure:
+                self.report.num_secure_views_missing_definition += 1
 
         if self.config.include_technical_schema:
             for view in views:

@@ -749,8 +751,21 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
     ) -> DatasetProperties:
         custom_properties = {}
 
-        if isinstance(table, SnowflakeTable)
-
+        if isinstance(table, SnowflakeTable):
+            if table.clustering_key:
+                custom_properties["CLUSTERING_KEY"] = table.clustering_key
+
+            if table.is_hybrid:
+                custom_properties["IS_HYBRID"] = "true"
+
+            if table.is_dynamic:
+                custom_properties["IS_DYNAMIC"] = "true"
+
+            if table.is_iceberg:
+                custom_properties["IS_ICEBERG"] = "true"
+
+        if isinstance(table, SnowflakeView) and table.is_secure:
+            custom_properties["IS_SECURE"] = "true"
 
         return DatasetProperties(
             name=table.name,