acryl-datahub 0.14.1.13rc8__py3-none-any.whl → 0.15.0__py3-none-any.whl
This diff shows the changes between publicly released versions of the package, as published to their public registry, and is provided for informational purposes only.
Potentially problematic release.
This version of acryl-datahub has been flagged as potentially problematic; see the registry page for details.
- {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/METADATA +2506 -2456
- {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/RECORD +136 -131
- {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/entry_points.txt +2 -1
- datahub/__init__.py +1 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +123 -146
- datahub/cli/cli_utils.py +2 -0
- datahub/cli/delete_cli.py +103 -24
- datahub/cli/ingest_cli.py +110 -0
- datahub/cli/put_cli.py +1 -1
- datahub/cli/specific/dataproduct_cli.py +1 -1
- datahub/cli/specific/structuredproperties_cli.py +2 -1
- datahub/configuration/common.py +3 -3
- datahub/configuration/git.py +7 -1
- datahub/configuration/kafka_consumer_config.py +31 -1
- datahub/emitter/mcp_patch_builder.py +43 -0
- datahub/emitter/rest_emitter.py +17 -4
- datahub/ingestion/api/incremental_properties_helper.py +69 -0
- datahub/ingestion/api/source.py +6 -1
- datahub/ingestion/api/source_helpers.py +4 -2
- datahub/ingestion/graph/client.py +2 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +2 -2
- datahub/ingestion/run/pipeline.py +6 -5
- datahub/ingestion/run/pipeline_config.py +6 -0
- datahub/ingestion/sink/datahub_rest.py +15 -4
- datahub/ingestion/source/abs/source.py +4 -0
- datahub/ingestion/source/aws/aws_common.py +13 -1
- datahub/ingestion/source/aws/sagemaker.py +8 -0
- datahub/ingestion/source/aws/sagemaker_processors/common.py +6 -0
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +9 -4
- datahub/ingestion/source/aws/sagemaker_processors/jobs.py +12 -1
- datahub/ingestion/source/aws/sagemaker_processors/lineage.py +11 -4
- datahub/ingestion/source/aws/sagemaker_processors/models.py +30 -1
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +0 -1
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +0 -21
- datahub/ingestion/source/bigquery_v2/profiler.py +0 -6
- datahub/ingestion/source/common/subtypes.py +2 -0
- datahub/ingestion/source/csv_enricher.py +1 -1
- datahub/ingestion/source/datahub/datahub_database_reader.py +41 -21
- datahub/ingestion/source/datahub/datahub_source.py +8 -1
- datahub/ingestion/source/dbt/dbt_common.py +7 -61
- datahub/ingestion/source/dremio/dremio_api.py +204 -86
- datahub/ingestion/source/dremio/dremio_aspects.py +19 -15
- datahub/ingestion/source/dremio/dremio_config.py +5 -0
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +2 -0
- datahub/ingestion/source/dremio/dremio_entities.py +4 -0
- datahub/ingestion/source/dremio/dremio_reporting.py +15 -0
- datahub/ingestion/source/dremio/dremio_source.py +7 -2
- datahub/ingestion/source/elastic_search.py +1 -1
- datahub/ingestion/source/feast.py +97 -6
- datahub/ingestion/source/gc/datahub_gc.py +46 -35
- datahub/ingestion/source/gc/dataprocess_cleanup.py +110 -50
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +7 -2
- datahub/ingestion/source/ge_data_profiler.py +46 -9
- datahub/ingestion/source/ge_profiling_config.py +5 -0
- datahub/ingestion/source/iceberg/iceberg.py +12 -5
- datahub/ingestion/source/kafka/kafka.py +39 -19
- datahub/ingestion/source/kafka/kafka_connect.py +81 -51
- datahub/ingestion/source/looker/looker_liquid_tag.py +8 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -2
- datahub/ingestion/source/looker/view_upstream.py +65 -30
- datahub/ingestion/source/metadata/business_glossary.py +35 -18
- datahub/ingestion/source/mode.py +0 -23
- datahub/ingestion/source/neo4j/__init__.py +0 -0
- datahub/ingestion/source/neo4j/neo4j_source.py +331 -0
- datahub/ingestion/source/powerbi/__init__.py +0 -1
- datahub/ingestion/source/powerbi/config.py +3 -3
- datahub/ingestion/source/powerbi/m_query/data_classes.py +36 -15
- datahub/ingestion/source/powerbi/m_query/parser.py +6 -3
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +912 -0
- datahub/ingestion/source/powerbi/m_query/resolver.py +23 -947
- datahub/ingestion/source/powerbi/m_query/tree_function.py +3 -3
- datahub/ingestion/source/powerbi/m_query/validator.py +9 -3
- datahub/ingestion/source/powerbi/powerbi.py +12 -6
- datahub/ingestion/source/preset.py +1 -0
- datahub/ingestion/source/pulsar.py +21 -2
- datahub/ingestion/source/qlik_sense/data_classes.py +1 -0
- datahub/ingestion/source/redash.py +13 -63
- datahub/ingestion/source/redshift/config.py +1 -0
- datahub/ingestion/source/redshift/redshift.py +3 -0
- datahub/ingestion/source/s3/source.py +2 -3
- datahub/ingestion/source/sigma/data_classes.py +1 -0
- datahub/ingestion/source/sigma/sigma.py +101 -43
- datahub/ingestion/source/snowflake/snowflake_config.py +8 -3
- datahub/ingestion/source/snowflake/snowflake_connection.py +28 -0
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +6 -1
- datahub/ingestion/source/snowflake/snowflake_query.py +21 -4
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_schema.py +28 -0
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +41 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +46 -6
- datahub/ingestion/source/snowflake/snowflake_v2.py +6 -0
- datahub/ingestion/source/sql/athena.py +46 -22
- datahub/ingestion/source/sql/mssql/source.py +18 -6
- datahub/ingestion/source/sql/sql_common.py +34 -21
- datahub/ingestion/source/sql/sql_report.py +1 -0
- datahub/ingestion/source/sql/sql_types.py +85 -8
- datahub/ingestion/source/state/redundant_run_skip_handler.py +1 -1
- datahub/ingestion/source/superset.py +215 -65
- datahub/ingestion/source/tableau/tableau.py +237 -76
- datahub/ingestion/source/tableau/tableau_common.py +12 -6
- datahub/ingestion/source/tableau/tableau_constant.py +2 -0
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +33 -0
- datahub/ingestion/source/tableau/tableau_validation.py +48 -0
- datahub/ingestion/source/unity/proxy_types.py +1 -0
- datahub/ingestion/source/unity/source.py +4 -0
- datahub/ingestion/source/unity/usage.py +20 -11
- datahub/ingestion/transformer/add_dataset_tags.py +1 -1
- datahub/ingestion/transformer/generic_aspect_transformer.py +1 -1
- datahub/integrations/assertion/common.py +1 -1
- datahub/lite/duckdb_lite.py +12 -17
- datahub/metadata/_schema_classes.py +512 -392
- datahub/metadata/_urns/urn_defs.py +1355 -1355
- datahub/metadata/com/linkedin/pegasus2avro/structured/__init__.py +2 -0
- datahub/metadata/schema.avsc +17222 -17499
- datahub/metadata/schemas/FormInfo.avsc +4 -0
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +1 -1
- datahub/metadata/schemas/StructuredPropertyKey.avsc +1 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +114 -0
- datahub/specific/chart.py +0 -39
- datahub/specific/dashboard.py +0 -39
- datahub/specific/datajob.py +7 -57
- datahub/sql_parsing/schema_resolver.py +23 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +1 -2
- datahub/sql_parsing/sqlglot_lineage.py +55 -14
- datahub/sql_parsing/sqlglot_utils.py +8 -2
- datahub/telemetry/telemetry.py +23 -9
- datahub/testing/compare_metadata_json.py +1 -1
- datahub/testing/doctest.py +12 -0
- datahub/utilities/file_backed_collections.py +35 -2
- datahub/utilities/partition_executor.py +1 -1
- datahub/utilities/urn_encoder.py +2 -1
- datahub/utilities/urns/_urn_base.py +1 -1
- datahub/utilities/urns/structured_properties_urn.py +1 -1
- datahub/utilities/sql_lineage_parser_impl.py +0 -160
- datahub/utilities/sql_parser.py +0 -94
- datahub/utilities/sql_parser_base.py +0 -21
- {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/snowflake/snowflake_connection.py
@@ -43,6 +43,7 @@ _VALID_AUTH_TYPES: Dict[str, str] = {
     "EXTERNAL_BROWSER_AUTHENTICATOR": EXTERNAL_BROWSER_AUTHENTICATOR,
     "KEY_PAIR_AUTHENTICATOR": KEY_PAIR_AUTHENTICATOR,
     "OAUTH_AUTHENTICATOR": OAUTH_AUTHENTICATOR,
+    "OAUTH_AUTHENTICATOR_TOKEN": OAUTH_AUTHENTICATOR,
 }
 
 _SNOWFLAKE_HOST_SUFFIX = ".snowflakecomputing.com"
@@ -104,6 +105,10 @@ class SnowflakeConnectionConfig(ConfigModel):
         description="Connect args to pass to Snowflake SqlAlchemy driver",
         exclude=True,
     )
+    token: Optional[str] = pydantic.Field(
+        default=None,
+        description="OAuth token from external identity provider. Not recommended for most use cases because it will not be able to refresh once expired.",
+    )
 
     def get_account(self) -> str:
         assert self.account_id
@@ -148,6 +153,18 @@ class SnowflakeConnectionConfig(ConfigModel):
         logger.info(f"using authenticator type '{v}'")
         return v
 
+    @pydantic.validator("token", always=True)
+    def validate_token_oauth_config(cls, v, values):
+        auth_type = values.get("authentication_type")
+        if auth_type == "OAUTH_AUTHENTICATOR_TOKEN":
+            if not v:
+                raise ValueError("Token required for OAUTH_AUTHENTICATOR_TOKEN.")
+        elif v is not None:
+            raise ValueError(
+                "Token can only be provided when using OAUTH_AUTHENTICATOR_TOKEN"
+            )
+        return v
+
     @staticmethod
     def _check_oauth_config(oauth_config: Optional[OAuthConfiguration]) -> None:
         if oauth_config is None:
@@ -333,6 +350,17 @@ class SnowflakeConnectionConfig(ConfigModel):
                 application=_APPLICATION_NAME,
                 **connect_args,
             )
+        elif self.authentication_type == "OAUTH_AUTHENTICATOR_TOKEN":
+            return snowflake.connector.connect(
+                user=self.username,
+                account=self.account_id,
+                authenticator="oauth",
+                token=self.token,  # Token generated externally and provided directly to the recipe
+                warehouse=self.warehouse,
+                role=self.role,
+                application=_APPLICATION_NAME,
+                **connect_args,
+            )
         elif self.authentication_type == "OAUTH_AUTHENTICATOR":
             return self.get_oauth_connection()
         elif self.authentication_type == "KEY_PAIR_AUTHENTICATOR":
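The new OAUTH_AUTHENTICATOR_TOKEN type lets a recipe hand a pre-issued OAuth access token directly to the connector. Below is a minimal sketch of building the connection config this way; the field names (account_id, username, warehouse, role, authentication_type, token) come from the hunks above, while the concrete values, and the assumption that no other fields are required for this auth type, are illustrative only.

# Hedged sketch, not taken from the diff: exercise the new token-based auth type.
from datahub.ingestion.source.snowflake.snowflake_connection import (
    SnowflakeConnectionConfig,
)

config = SnowflakeConnectionConfig.parse_obj(
    {
        "account_id": "my_account",
        "username": "ingestion_user",
        "warehouse": "COMPUTE_WH",
        "role": "datahub_role",
        "authentication_type": "OAUTH_AUTHENTICATOR_TOKEN",
        # Token minted by the external IdP; per the field description above it is
        # used as-is and will not be refreshed once it expires.
        "token": "<externally-issued-access-token>",
    }
)

# The validator added above rejects a token supplied with any other
# authentication_type, and rejects OAUTH_AUTHENTICATOR_TOKEN without a token.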
datahub/ingestion/source/snowflake/snowflake_lineage_v2.py
@@ -413,9 +413,14 @@ class SnowflakeLineageExtractor(SnowflakeCommonMixin, Closeable):
             return UpstreamLineageEdge.parse_obj(db_row)
         except Exception as e:
             self.report.num_upstream_lineage_edge_parsing_failed += 1
+            upstream_tables = db_row.get("UPSTREAM_TABLES")
+            downstream_table = db_row.get("DOWNSTREAM_TABLE_NAME")
             self.structured_reporter.warning(
                 "Failed to parse lineage edge",
-
+                # Tricky: sometimes the full row data is too large, and so the context
+                # message gets truncated. By pulling out the upstreams and downstream
+                # list, we can at least get the important fields if truncation does occur.
+                context=f"Upstreams: {upstream_tables} Downstream: {downstream_table} Full row: {db_row}",
                 exc=e,
             )
             return None
datahub/ingestion/source/snowflake/snowflake_query.py
@@ -129,10 +129,12 @@ class SnowflakeQuery:
           row_count AS "ROW_COUNT",
           bytes AS "BYTES",
           clustering_key AS "CLUSTERING_KEY",
-          auto_clustering_on AS "AUTO_CLUSTERING_ON"
+          auto_clustering_on AS "AUTO_CLUSTERING_ON",
+          is_dynamic AS "IS_DYNAMIC",
+          is_iceberg AS "IS_ICEBERG"
         FROM {db_clause}information_schema.tables t
         WHERE table_schema != 'INFORMATION_SCHEMA'
-          and table_type in ( 'BASE TABLE', 'EXTERNAL TABLE')
+          and table_type in ( 'BASE TABLE', 'EXTERNAL TABLE', 'HYBRID TABLE')
         order by table_schema, table_name"""
 
     @staticmethod
@@ -149,10 +151,12 @@ class SnowflakeQuery:
           row_count AS "ROW_COUNT",
           bytes AS "BYTES",
           clustering_key AS "CLUSTERING_KEY",
-          auto_clustering_on AS "AUTO_CLUSTERING_ON"
+          auto_clustering_on AS "AUTO_CLUSTERING_ON",
+          is_dynamic AS "IS_DYNAMIC",
+          is_iceberg AS "IS_ICEBERG"
         FROM {db_clause}information_schema.tables t
         where table_schema='{schema_name}'
-          and table_type in ('BASE TABLE', 'EXTERNAL TABLE')
+          and table_type in ('BASE TABLE', 'EXTERNAL TABLE', 'HYBRID TABLE')
         order by table_schema, table_name"""
 
     @staticmethod
@@ -233,6 +237,19 @@ SHOW VIEWS IN DATABASE "{db_name}"
 LIMIT {limit} {from_clause};
 """
 
+    @staticmethod
+    def get_secure_view_definitions() -> str:
+        # https://docs.snowflake.com/en/sql-reference/account-usage/views
+        return """
+            SELECT
+                TABLE_CATALOG as "TABLE_CATALOG",
+                TABLE_SCHEMA as "TABLE_SCHEMA",
+                TABLE_NAME as "TABLE_NAME",
+                VIEW_DEFINITION as "VIEW_DEFINITION"
+            FROM SNOWFLAKE.ACCOUNT_USAGE.VIEWS
+            WHERE IS_SECURE = 'YES' AND VIEW_DEFINITION !='' AND DELETED IS NULL
+        """
+
     @staticmethod
     def columns_for_schema(
         schema_name: str,
datahub/ingestion/source/snowflake/snowflake_report.py
@@ -113,6 +113,7 @@ class SnowflakeV2Report(
     external_lineage_queries_secs: float = -1
     num_tables_with_known_upstreams: int = 0
     num_upstream_lineage_edge_parsing_failed: int = 0
+    num_secure_views_missing_definition: int = 0
 
     data_dictionary_cache: Optional["SnowflakeDataDictionary"] = None
 
datahub/ingestion/source/snowflake/snowflake_schema.py
@@ -90,6 +90,12 @@ class SnowflakeTable(BaseTable):
     foreign_keys: List[SnowflakeFK] = field(default_factory=list)
     tags: Optional[List[SnowflakeTag]] = None
     column_tags: Dict[str, List[SnowflakeTag]] = field(default_factory=dict)
+    is_dynamic: bool = False
+    is_iceberg: bool = False
+
+    @property
+    def is_hybrid(self) -> bool:
+        return self.type is not None and self.type == "HYBRID TABLE"
 
 
 @dataclass
@@ -98,6 +104,7 @@ class SnowflakeView(BaseView):
     columns: List[SnowflakeColumn] = field(default_factory=list)
     tags: Optional[List[SnowflakeTag]] = None
     column_tags: Dict[str, List[SnowflakeTag]] = field(default_factory=dict)
+    is_secure: bool = False
 
 
 @dataclass
@@ -259,6 +266,22 @@ class SnowflakeDataDictionary(SupportsAsObj):
             snowflake_schemas.append(snowflake_schema)
         return snowflake_schemas
 
+    @serialized_lru_cache(maxsize=1)
+    def get_secure_view_definitions(self) -> Dict[str, Dict[str, Dict[str, str]]]:
+        secure_view_definitions: Dict[str, Dict[str, Dict[str, str]]] = defaultdict(
+            lambda: defaultdict(lambda: defaultdict())
+        )
+        cur = self.connection.query(SnowflakeQuery.get_secure_view_definitions())
+        for view in cur:
+            db_name = view["TABLE_CATALOG"]
+            schema_name = view["TABLE_SCHEMA"]
+            view_name = view["TABLE_NAME"]
+            secure_view_definitions[db_name][schema_name][view_name] = view[
+                "VIEW_DEFINITION"
+            ]
+
+        return secure_view_definitions
+
     @serialized_lru_cache(maxsize=1)
     def get_tables_for_database(
         self, db_name: str
@@ -289,6 +312,8 @@ class SnowflakeDataDictionary(SupportsAsObj):
                     rows_count=table["ROW_COUNT"],
                     comment=table["COMMENT"],
                     clustering_key=table["CLUSTERING_KEY"],
+                    is_dynamic=table.get("IS_DYNAMIC", "NO").upper() == "YES",
+                    is_iceberg=table.get("IS_ICEBERG", "NO").upper() == "YES",
                 )
             )
         return tables
@@ -313,6 +338,8 @@ class SnowflakeDataDictionary(SupportsAsObj):
                     rows_count=table["ROW_COUNT"],
                     comment=table["COMMENT"],
                     clustering_key=table["CLUSTERING_KEY"],
+                    is_dynamic=table.get("IS_DYNAMIC", "NO").upper() == "YES",
+                    is_iceberg=table.get("IS_ICEBERG", "NO").upper() == "YES",
                 )
             )
         return tables
@@ -356,6 +383,7 @@ class SnowflakeDataDictionary(SupportsAsObj):
                     materialized=(
                         view.get("is_materialized", "false").lower() == "true"
                     ),
+                    is_secure=(view.get("is_secure", "false").lower() == "true"),
                 )
             )
 
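The new get_secure_view_definitions helper returns a three-level mapping of database -> schema -> view name -> view DDL, populated from the SNOWFLAKE.ACCOUNT_USAGE.VIEWS query added to SnowflakeQuery above. A small illustrative sketch of consuming it (the database, schema, and view names are invented, and data_dictionary is assumed to be an already-connected SnowflakeDataDictionary):

# Illustrative only: the nested lookup mirrors what fetch_secure_view_definition()
# in snowflake_schema_gen.py (further below) does for each secure view.
defs = data_dictionary.get_secure_view_definitions()

ddl = defs.get("ANALYTICS", {}).get("REPORTING", {}).get("SECURE_SALES_V")
if ddl is None:
    # Views without a usable definition are counted in
    # num_secure_views_missing_definition on the SnowflakeV2Report.
    print("secure view definition not available")
else:
    print(ddl)  # e.g. "create or replace secure view SECURE_SALES_V as select ..."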
datahub/ingestion/source/snowflake/snowflake_schema_gen.py
@@ -103,6 +103,7 @@ from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor
 logger = logging.getLogger(__name__)
 
 # https://docs.snowflake.com/en/sql-reference/intro-summary-data-types.html
+# TODO: Move to the standardized types in sql_types.py
 SNOWFLAKE_FIELD_TYPE_MAPPINGS = {
     "DATE": DateType,
     "BIGINT": NumberType,
@@ -423,6 +424,10 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
             view_identifier = self.identifiers.get_dataset_identifier(
                 view.name, schema_name, db_name
             )
+            if view.is_secure and not view.view_definition:
+                view.view_definition = self.fetch_secure_view_definition(
+                    view.name, schema_name, db_name
+                )
             if view.view_definition:
                 self.aggregator.add_view_definition(
                     view_urn=self.identifiers.gen_dataset_urn(view_identifier),
@@ -430,6 +435,8 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
                     default_db=db_name,
                     default_schema=schema_name,
                 )
+            elif view.is_secure:
+                self.report.num_secure_views_missing_definition += 1
 
         if self.config.include_technical_schema:
             for view in views:
@@ -446,6 +453,25 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
                 context=f"{db_name}.{schema_name}",
             )
 
+    def fetch_secure_view_definition(
+        self, table_name: str, schema_name: str, db_name: str
+    ) -> Optional[str]:
+        try:
+            view_definitions = self.data_dictionary.get_secure_view_definitions()
+            return view_definitions[db_name][schema_name][table_name]
+        except Exception as e:
+            if isinstance(e, SnowflakePermissionError):
+                error_msg = (
+                    "Failed to get secure views definitions. Please check permissions."
+                )
+            else:
+                error_msg = "Failed to get secure views definitions"
+            self.structured_reporter.warning(
+                error_msg,
+                exc=e,
+            )
+            return None
+
     def fetch_views_for_schema(
         self, snowflake_schema: SnowflakeSchema, db_name: str, schema_name: str
     ) -> List[SnowflakeView]:
@@ -748,8 +774,21 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
     ) -> DatasetProperties:
         custom_properties = {}
 
-        if isinstance(table, SnowflakeTable)
-
+        if isinstance(table, SnowflakeTable):
+            if table.clustering_key:
+                custom_properties["CLUSTERING_KEY"] = table.clustering_key
+
+            if table.is_hybrid:
+                custom_properties["IS_HYBRID"] = "true"
+
+            if table.is_dynamic:
+                custom_properties["IS_DYNAMIC"] = "true"
+
+            if table.is_iceberg:
+                custom_properties["IS_ICEBERG"] = "true"
+
+        if isinstance(table, SnowflakeView) and table.is_secure:
+            custom_properties["IS_SECURE"] = "true"
 
         return DatasetProperties(
             name=table.name,
datahub/ingestion/source/snowflake/snowflake_utils.py
@@ -1,6 +1,6 @@
 import abc
 from functools import cached_property
-from typing import ClassVar, Literal, Optional, Tuple
+from typing import ClassVar, List, Literal, Optional, Tuple
 
 from datahub.configuration.pattern_utils import is_schema_allowed
 from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance
@@ -119,7 +119,6 @@ class SnowflakeFilter:
     ) -> bool:
         if not dataset_type or not dataset_name:
             return True
-        dataset_params = dataset_name.split(".")
         if dataset_type.lower() not in (
             SnowflakeObjectDomain.TABLE,
             SnowflakeObjectDomain.EXTERNAL_TABLE,
@@ -131,6 +130,7 @@ class SnowflakeFilter:
         if _is_sys_table(dataset_name):
             return False
 
+        dataset_params = _split_qualified_name(dataset_name)
         if len(dataset_params) != 3:
             self.structured_reporter.info(
                 title="Unexpected dataset pattern",
@@ -184,6 +184,46 @@ def _is_sys_table(table_name: str) -> bool:
     return table_name.lower().startswith("sys$")
 
 
+def _split_qualified_name(qualified_name: str) -> List[str]:
+    """
+    Split a qualified name into its constituent parts.
+
+    >>> _split_qualified_name("db.my_schema.my_table")
+    ['db', 'my_schema', 'my_table']
+    >>> _split_qualified_name('"db"."my_schema"."my_table"')
+    ['db', 'my_schema', 'my_table']
+    >>> _split_qualified_name('TEST_DB.TEST_SCHEMA."TABLE.WITH.DOTS"')
+    ['TEST_DB', 'TEST_SCHEMA', 'TABLE.WITH.DOTS']
+    >>> _split_qualified_name('TEST_DB."SCHEMA.WITH.DOTS".MY_TABLE')
+    ['TEST_DB', 'SCHEMA.WITH.DOTS', 'MY_TABLE']
+    """
+
+    # Fast path - no quotes.
+    if '"' not in qualified_name:
+        return qualified_name.split(".")
+
+    # First pass - split on dots that are not inside quotes.
+    in_quote = False
+    parts: List[List[str]] = [[]]
+    for char in qualified_name:
+        if char == '"':
+            in_quote = not in_quote
+        elif char == "." and not in_quote:
+            parts.append([])
+        else:
+            parts[-1].append(char)
+
+    # Second pass - remove outer pairs of quotes.
+    result = []
+    for part in parts:
+        if len(part) > 2 and part[0] == '"' and part[-1] == '"':
+            part = part[1:-1]
+
+        result.append("".join(part))
+
+    return result
+
+
 # Qualified Object names from snowflake audit logs have quotes for for snowflake quoted identifiers,
 # For example "test-database"."test-schema".test_table
 # whereas we generate urns without quotes even for quoted identifiers for backward compatibility
@@ -192,7 +232,7 @@ def _is_sys_table(table_name: str) -> bool:
 def _cleanup_qualified_name(
     qualified_name: str, structured_reporter: SourceReport
 ) -> str:
-    name_parts = qualified_name
+    name_parts = _split_qualified_name(qualified_name)
     if len(name_parts) != 3:
         if not _is_sys_table(qualified_name):
             structured_reporter.info(
@@ -203,9 +243,9 @@ def _cleanup_qualified_name(
             )
         return qualified_name.replace('"', "")
     return _combine_identifier_parts(
-        db_name=name_parts[0]
-        schema_name=name_parts[1]
-        table_name=name_parts[2]
+        db_name=name_parts[0],
+        schema_name=name_parts[1],
+        table_name=name_parts[2],
    )
 
 
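The docstring examples added to _split_qualified_name are plain doctests, so they can be exercised with the standard library alone. A minimal sketch (this release also ships a datahub/testing/doctest.py helper, but its API is not shown in this diff, so the sketch sticks to doctest itself):

import doctest

from datahub.ingestion.source.snowflake import snowflake_utils

# Runs every doctest in the module, including the four examples on
# _split_qualified_name; a non-zero "failed" count means an example regressed.
results = doctest.testmod(snowflake_utils, verbose=False)
print(results)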
datahub/ingestion/source/snowflake/snowflake_v2.py
@@ -17,6 +17,9 @@ from datahub.ingestion.api.decorators import (
     support_status,
 )
 from datahub.ingestion.api.incremental_lineage_helper import auto_incremental_lineage
+from datahub.ingestion.api.incremental_properties_helper import (
+    auto_incremental_properties,
+)
 from datahub.ingestion.api.source import (
     CapabilityReport,
     MetadataWorkUnitProcessor,
@@ -446,6 +449,9 @@ class SnowflakeV2Source(
             functools.partial(
                 auto_incremental_lineage, self.config.incremental_lineage
             ),
+            functools.partial(
+                auto_incremental_properties, self.config.incremental_properties
+            ),
             StaleEntityRemovalHandler.create(
                 self, self.config, self.ctx
             ).workunit_processor,
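The new auto_incremental_properties processor is registered alongside auto_incremental_lineage and is gated on self.config.incremental_properties. A hedged sketch of enabling it in a Snowflake recipe, written as a Python dict; the flag name comes from the hunk above, but its exact placement and default value are assumptions not shown in this diff:

# Hypothetical recipe fragment; credentials and other required fields are omitted.
snowflake_source = {
    "type": "snowflake",
    "config": {
        "account_id": "my_account",
        # Assumed boolean flag, mirroring self.config.incremental_properties above;
        # when set, dataset properties flow through the new
        # auto_incremental_properties workunit processor.
        "incremental_properties": True,
    },
}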
datahub/ingestion/source/sql/athena.py
@@ -26,6 +26,7 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
+from datahub.ingestion.api.source import StructuredLogLevel
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.aws.s3_util import make_s3_urn
 from datahub.ingestion.source.common.subtypes import DatasetContainerSubTypes
@@ -35,6 +36,7 @@ from datahub.ingestion.source.sql.sql_common import (
     register_custom_type,
 )
 from datahub.ingestion.source.sql.sql_config import SQLCommonConfig, make_sqlalchemy_uri
+from datahub.ingestion.source.sql.sql_report import SQLSourceReport
 from datahub.ingestion.source.sql.sql_utils import (
     add_table_to_schema_container,
     gen_database_container,
@@ -48,6 +50,15 @@ from datahub.utilities.sqlalchemy_type_converter import (
     get_schema_fields_for_sqlalchemy_column,
 )
 
+try:
+    from typing_extensions import override
+except ImportError:
+    _F = typing.TypeVar("_F", bound=typing.Callable[..., typing.Any])
+
+    def override(f: _F, /) -> _F:  # noqa: F811
+        return f
+
+
 logger = logging.getLogger(__name__)
 
 assert STRUCT, "required type modules are not available"
@@ -322,12 +333,15 @@ class AthenaSource(SQLAlchemySource):
     - Profiling when enabled.
     """
 
-
+    config: AthenaConfig
+    report: SQLSourceReport
 
     def __init__(self, config, ctx):
         super().__init__(config, ctx, "athena")
         self.cursor: Optional[BaseCursor] = None
 
+        self.table_partition_cache: Dict[str, Dict[str, Partitionitem]] = {}
+
     @classmethod
     def create(cls, config_dict, ctx):
         config = AthenaConfig.parse_obj(config_dict)
@@ -452,6 +466,7 @@ class AthenaSource(SQLAlchemySource):
         )
 
     # It seems like database/schema filter in the connection string does not work and this to work around that
+    @override
     def get_schema_names(self, inspector: Inspector) -> List[str]:
         athena_config = typing.cast(AthenaConfig, self.config)
         schemas = inspector.get_schema_names()
@@ -459,34 +474,42 @@ class AthenaSource(SQLAlchemySource):
             return [schema for schema in schemas if schema == athena_config.database]
         return schemas
 
-
+    @classmethod
+    def _casted_partition_key(cls, key: str) -> str:
+        # We need to cast the partition keys to a VARCHAR, since otherwise
+        # Athena may throw an error during concatenation / comparison.
+        return f"CAST({key} as VARCHAR)"
+
+    @override
     def get_partitions(
         self, inspector: Inspector, schema: str, table: str
-    ) -> List[str]:
-
-
-        athena_config = typing.cast(AthenaConfig, self.config)
-
-        if not athena_config.extract_partitions:
-            return []
+    ) -> Optional[List[str]]:
+        if not self.config.extract_partitions:
+            return None
 
         if not self.cursor:
-            return
+            return None
 
         metadata: AthenaTableMetadata = self.cursor.get_table_metadata(
             table_name=table, schema_name=schema
         )
 
-
-
-
-
-
-
-            return []
+        partitions = []
+        for key in metadata.partition_keys:
+            if key.name:
+                partitions.append(key.name)
+        if not partitions:
+            return []
 
-
-
+        with self.report.report_exc(
+            message="Failed to extract partition details",
+            context=f"{schema}.{table}",
+            level=StructuredLogLevel.WARN,
+        ):
+            # We create an artifical concatenated partition key to be able to query max partition easier
+            part_concat = " || '-' || ".join(
+                self._casted_partition_key(key) for key in partitions
+            )
             max_partition_query = f'select {",".join(partitions)} from "{schema}"."{table}$partitions" where {part_concat} = (select max({part_concat}) from "{schema}"."{table}$partitions")'
             ret = self.cursor.execute(max_partition_query)
             max_partition: Dict[str, str] = {}
@@ -500,9 +523,8 @@ class AthenaSource(SQLAlchemySource):
                 partitions=partitions,
                 max_partition=max_partition,
             )
-            return partitions
 
-        return
+        return partitions
 
     # Overwrite to modify the creation of schema fields
     def get_schema_fields_for_column(
@@ -551,7 +573,9 @@ class AthenaSource(SQLAlchemySource):
         if partition and partition.max_partition:
            max_partition_filters = []
            for key, value in partition.max_partition.items():
-                max_partition_filters.append(
+                max_partition_filters.append(
+                    f"{self._casted_partition_key(key)} = '{value}'"
+                )
            max_partition = str(partition.max_partition)
            return (
                max_partition,
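To make the partition logic above concrete, here is a sketch of the SQL string that get_partitions now builds for a hypothetical table with partition keys year and month; the table, schema, and key names are invented, while the query shape and the casting helper are copied from the hunks above.

# Hypothetical illustration of the generated max-partition query.
partitions = ["year", "month"]
schema, table = "web", "events"


def _casted_partition_key(key: str) -> str:
    # Same casting as AthenaSource._casted_partition_key above.
    return f"CAST({key} as VARCHAR)"


part_concat = " || '-' || ".join(_casted_partition_key(key) for key in partitions)
max_partition_query = (
    f'select {",".join(partitions)} from "{schema}"."{table}$partitions" '
    f'where {part_concat} = (select max({part_concat}) from "{schema}"."{table}$partitions")'
)
print(max_partition_query)
# select year,month from "web"."events$partitions" where CAST(year as VARCHAR) || '-' ||
# CAST(month as VARCHAR) = (select max(CAST(year as VARCHAR) || '-' || CAST(month as VARCHAR))
# from "web"."events$partitions")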
datahub/ingestion/source/sql/mssql/source.py
@@ -5,8 +5,6 @@ from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
 
 import pydantic
 import sqlalchemy.dialects.mssql
-
-# This import verifies that the dependencies are available.
 from pydantic.fields import Field
 from sqlalchemy import create_engine, inspect
 from sqlalchemy.engine.base import Connection
@@ -50,6 +48,7 @@ from datahub.ingestion.source.sql.sql_config import (
     BasicSQLAlchemyConfig,
     make_sqlalchemy_uri,
 )
+from datahub.ingestion.source.sql.sql_report import SQLSourceReport
 from datahub.metadata.schema_classes import (
     BooleanTypeClass,
     NumberTypeClass,
@@ -78,6 +77,11 @@ class SQLServerConfig(BasicSQLAlchemyConfig):
     include_stored_procedures_code: bool = Field(
         default=True, description="Include information about object code."
     )
+    procedure_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="Regex patterns for stored procedures to filter in ingestion."
+        "Specify regex to match the entire procedure name in database.schema.procedure_name format. e.g. to match all procedures starting with customer in Customer database and public schema, use the regex 'Customer.public.customer.*'",
+    )
     include_jobs: bool = Field(
         default=True,
         description="Include ingest of MSSQL Jobs. Requires access to the 'msdb' and 'sys' schema.",
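The new procedure_pattern option is an AllowDenyPattern matched against the full database.schema.procedure_name, and the filtering loop further below drops anything for which pattern.allowed() returns False. A small sketch of the behaviour, using the example regex from the description above (the procedure names are made up):

from datahub.configuration.common import AllowDenyPattern

# Allow only procedures starting with "customer" in the Customer database's public schema.
pattern = AllowDenyPattern(allow=["Customer.public.customer.*"])

print(pattern.allowed("Customer.public.customer_refresh"))  # True -> ingested
print(pattern.allowed("Customer.public.orders_cleanup"))  # False -> reported as dropped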
@@ -164,6 +168,8 @@ class SQLServerSource(SQLAlchemySource):
     If you do use pyodbc, make sure to change the source type from `mssql` to `mssql-odbc` so that we pull in the right set of dependencies. This will be needed in most cases where encryption is required, such as managed SQL Server services in Azure.
     """
 
+    report: SQLSourceReport
+
     def __init__(self, config: SQLServerConfig, ctx: PipelineContext):
         super().__init__(config, ctx, "mssql")
         # Cache the table and column descriptions
@@ -416,10 +422,16 @@ class SQLServerSource(SQLAlchemySource):
         data_flow = MSSQLDataFlow(entity=mssql_default_job)
         with inspector.engine.connect() as conn:
             procedures_data_list = self._get_stored_procedures(conn, db_name, schema)
-            procedures = [
-
-
-
+            procedures: List[StoredProcedure] = []
+            for procedure_data in procedures_data_list:
+                procedure_full_name = f"{db_name}.{schema}.{procedure_data['name']}"
+                if not self.config.procedure_pattern.allowed(procedure_full_name):
+                    self.report.report_dropped(procedure_full_name)
+                    continue
+                procedures.append(
+                    StoredProcedure(flow=mssql_default_job, **procedure_data)
+                )
+
             if procedures:
                 yield from self.construct_flow_workunits(data_flow=data_flow)
                 for procedure in procedures: