acryl-datahub 0.14.1.13rc9__py3-none-any.whl → 0.15.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package as they appear in their public registries. It is provided for informational purposes only.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/METADATA +2348 -2298
- {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/RECORD +130 -125
- {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/entry_points.txt +2 -1
- datahub/__init__.py +1 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +123 -146
- datahub/cli/cli_utils.py +2 -0
- datahub/cli/delete_cli.py +103 -24
- datahub/cli/ingest_cli.py +110 -0
- datahub/cli/put_cli.py +1 -1
- datahub/cli/specific/dataproduct_cli.py +1 -1
- datahub/cli/specific/structuredproperties_cli.py +2 -1
- datahub/configuration/common.py +3 -3
- datahub/configuration/git.py +7 -1
- datahub/configuration/kafka_consumer_config.py +31 -1
- datahub/emitter/mcp_patch_builder.py +43 -0
- datahub/emitter/rest_emitter.py +17 -4
- datahub/ingestion/api/incremental_properties_helper.py +69 -0
- datahub/ingestion/api/source.py +6 -1
- datahub/ingestion/api/source_helpers.py +4 -2
- datahub/ingestion/graph/client.py +2 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +2 -2
- datahub/ingestion/run/pipeline.py +6 -5
- datahub/ingestion/run/pipeline_config.py +6 -0
- datahub/ingestion/sink/datahub_rest.py +15 -4
- datahub/ingestion/source/abs/source.py +4 -0
- datahub/ingestion/source/aws/aws_common.py +13 -1
- datahub/ingestion/source/aws/sagemaker.py +8 -0
- datahub/ingestion/source/aws/sagemaker_processors/common.py +6 -0
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +9 -4
- datahub/ingestion/source/aws/sagemaker_processors/jobs.py +12 -1
- datahub/ingestion/source/aws/sagemaker_processors/lineage.py +11 -4
- datahub/ingestion/source/aws/sagemaker_processors/models.py +30 -1
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
- datahub/ingestion/source/common/subtypes.py +2 -0
- datahub/ingestion/source/csv_enricher.py +1 -1
- datahub/ingestion/source/datahub/datahub_database_reader.py +41 -21
- datahub/ingestion/source/datahub/datahub_source.py +8 -1
- datahub/ingestion/source/dbt/dbt_common.py +7 -61
- datahub/ingestion/source/dremio/dremio_api.py +204 -86
- datahub/ingestion/source/dremio/dremio_aspects.py +19 -15
- datahub/ingestion/source/dremio/dremio_config.py +5 -0
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +2 -0
- datahub/ingestion/source/dremio/dremio_entities.py +4 -0
- datahub/ingestion/source/dremio/dremio_reporting.py +15 -0
- datahub/ingestion/source/dremio/dremio_source.py +7 -2
- datahub/ingestion/source/elastic_search.py +1 -1
- datahub/ingestion/source/feast.py +97 -6
- datahub/ingestion/source/gc/datahub_gc.py +46 -35
- datahub/ingestion/source/gc/dataprocess_cleanup.py +110 -50
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +7 -2
- datahub/ingestion/source/ge_data_profiler.py +23 -1
- datahub/ingestion/source/iceberg/iceberg.py +12 -5
- datahub/ingestion/source/kafka/kafka.py +39 -19
- datahub/ingestion/source/kafka/kafka_connect.py +81 -51
- datahub/ingestion/source/looker/looker_liquid_tag.py +8 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -2
- datahub/ingestion/source/looker/view_upstream.py +65 -30
- datahub/ingestion/source/metadata/business_glossary.py +35 -18
- datahub/ingestion/source/mode.py +0 -23
- datahub/ingestion/source/neo4j/__init__.py +0 -0
- datahub/ingestion/source/neo4j/neo4j_source.py +331 -0
- datahub/ingestion/source/powerbi/__init__.py +0 -1
- datahub/ingestion/source/powerbi/config.py +3 -3
- datahub/ingestion/source/powerbi/m_query/data_classes.py +36 -15
- datahub/ingestion/source/powerbi/m_query/parser.py +6 -3
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +912 -0
- datahub/ingestion/source/powerbi/m_query/resolver.py +23 -947
- datahub/ingestion/source/powerbi/m_query/tree_function.py +3 -3
- datahub/ingestion/source/powerbi/m_query/validator.py +9 -3
- datahub/ingestion/source/powerbi/powerbi.py +12 -6
- datahub/ingestion/source/preset.py +1 -0
- datahub/ingestion/source/pulsar.py +21 -2
- datahub/ingestion/source/qlik_sense/data_classes.py +1 -0
- datahub/ingestion/source/redash.py +13 -63
- datahub/ingestion/source/redshift/config.py +1 -0
- datahub/ingestion/source/redshift/redshift.py +3 -0
- datahub/ingestion/source/s3/source.py +2 -3
- datahub/ingestion/source/snowflake/snowflake_config.py +8 -3
- datahub/ingestion/source/snowflake/snowflake_connection.py +28 -0
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +6 -1
- datahub/ingestion/source/snowflake/snowflake_query.py +21 -4
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_schema.py +28 -0
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +41 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +46 -6
- datahub/ingestion/source/snowflake/snowflake_v2.py +6 -0
- datahub/ingestion/source/sql/athena.py +46 -22
- datahub/ingestion/source/sql/mssql/source.py +0 -2
- datahub/ingestion/source/sql/sql_common.py +34 -21
- datahub/ingestion/source/sql/sql_report.py +1 -0
- datahub/ingestion/source/sql/sql_types.py +85 -8
- datahub/ingestion/source/state/redundant_run_skip_handler.py +1 -1
- datahub/ingestion/source/superset.py +215 -65
- datahub/ingestion/source/tableau/tableau.py +237 -76
- datahub/ingestion/source/tableau/tableau_common.py +12 -6
- datahub/ingestion/source/tableau/tableau_constant.py +2 -0
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +33 -0
- datahub/ingestion/source/tableau/tableau_validation.py +48 -0
- datahub/ingestion/source/unity/proxy_types.py +1 -0
- datahub/ingestion/source/unity/source.py +4 -0
- datahub/ingestion/source/unity/usage.py +20 -11
- datahub/ingestion/transformer/add_dataset_tags.py +1 -1
- datahub/ingestion/transformer/generic_aspect_transformer.py +1 -1
- datahub/integrations/assertion/common.py +1 -1
- datahub/lite/duckdb_lite.py +12 -17
- datahub/metadata/_schema_classes.py +512 -392
- datahub/metadata/_urns/urn_defs.py +1355 -1355
- datahub/metadata/com/linkedin/pegasus2avro/structured/__init__.py +2 -0
- datahub/metadata/schema.avsc +17222 -17499
- datahub/metadata/schemas/FormInfo.avsc +4 -0
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +1 -1
- datahub/metadata/schemas/StructuredPropertyKey.avsc +1 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +114 -0
- datahub/specific/chart.py +0 -39
- datahub/specific/dashboard.py +0 -39
- datahub/specific/datajob.py +7 -57
- datahub/sql_parsing/schema_resolver.py +23 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +1 -2
- datahub/sql_parsing/sqlglot_lineage.py +55 -14
- datahub/sql_parsing/sqlglot_utils.py +8 -2
- datahub/telemetry/telemetry.py +23 -9
- datahub/testing/compare_metadata_json.py +1 -1
- datahub/testing/doctest.py +12 -0
- datahub/utilities/file_backed_collections.py +35 -2
- datahub/utilities/partition_executor.py +1 -1
- datahub/utilities/urn_encoder.py +2 -1
- datahub/utilities/urns/_urn_base.py +1 -1
- datahub/utilities/urns/structured_properties_urn.py +1 -1
- datahub/utilities/sql_lineage_parser_impl.py +0 -160
- datahub/utilities/sql_parser.py +0 -94
- datahub/utilities/sql_parser_base.py +0 -21
- {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/snowflake/snowflake_schema_gen.py (+41 -2)

@@ -103,6 +103,7 @@ from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor
 logger = logging.getLogger(__name__)
 
 # https://docs.snowflake.com/en/sql-reference/intro-summary-data-types.html
+# TODO: Move to the standardized types in sql_types.py
 SNOWFLAKE_FIELD_TYPE_MAPPINGS = {
     "DATE": DateType,
     "BIGINT": NumberType,
@@ -423,6 +424,10 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
         view_identifier = self.identifiers.get_dataset_identifier(
             view.name, schema_name, db_name
         )
+        if view.is_secure and not view.view_definition:
+            view.view_definition = self.fetch_secure_view_definition(
+                view.name, schema_name, db_name
+            )
         if view.view_definition:
             self.aggregator.add_view_definition(
                 view_urn=self.identifiers.gen_dataset_urn(view_identifier),
@@ -430,6 +435,8 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
                 default_db=db_name,
                 default_schema=schema_name,
             )
+        elif view.is_secure:
+            self.report.num_secure_views_missing_definition += 1
 
         if self.config.include_technical_schema:
             for view in views:
@@ -446,6 +453,25 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
                 context=f"{db_name}.{schema_name}",
             )
 
+    def fetch_secure_view_definition(
+        self, table_name: str, schema_name: str, db_name: str
+    ) -> Optional[str]:
+        try:
+            view_definitions = self.data_dictionary.get_secure_view_definitions()
+            return view_definitions[db_name][schema_name][table_name]
+        except Exception as e:
+            if isinstance(e, SnowflakePermissionError):
+                error_msg = (
+                    "Failed to get secure views definitions. Please check permissions."
+                )
+            else:
+                error_msg = "Failed to get secure views definitions"
+            self.structured_reporter.warning(
+                error_msg,
+                exc=e,
+            )
+            return None
+
     def fetch_views_for_schema(
         self, snowflake_schema: SnowflakeSchema, db_name: str, schema_name: str
     ) -> List[SnowflakeView]:
@@ -748,8 +774,21 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
     ) -> DatasetProperties:
         custom_properties = {}
 
-        if isinstance(table, SnowflakeTable)
-
+        if isinstance(table, SnowflakeTable):
+            if table.clustering_key:
+                custom_properties["CLUSTERING_KEY"] = table.clustering_key
+
+            if table.is_hybrid:
+                custom_properties["IS_HYBRID"] = "true"
+
+            if table.is_dynamic:
+                custom_properties["IS_DYNAMIC"] = "true"
+
+            if table.is_iceberg:
+                custom_properties["IS_ICEBERG"] = "true"
+
+        if isinstance(table, SnowflakeView) and table.is_secure:
+            custom_properties["IS_SECURE"] = "true"
 
         return DatasetProperties(
             name=table.name,
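
A minimal sketch of the nested mapping that fetch_secure_view_definition indexes above; the assumed shape is db name -> schema name -> view name -> view DDL, and the names and DDL below are illustrative, not from the package:

from typing import Dict

# Assumed shape of get_secure_view_definitions(): db -> schema -> view -> definition text.
SecureViewDefinitions = Dict[str, Dict[str, Dict[str, str]]]

definitions: SecureViewDefinitions = {
    "ANALYTICS_DB": {
        "PUBLIC": {
            "SECURE_SALES_V": "CREATE SECURE VIEW SECURE_SALES_V AS SELECT ...",
        },
    },
}
print(definitions["ANALYTICS_DB"]["PUBLIC"]["SECURE_SALES_V"])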
datahub/ingestion/source/snowflake/snowflake_utils.py (+46 -6)

@@ -1,6 +1,6 @@
 import abc
 from functools import cached_property
-from typing import ClassVar, Literal, Optional, Tuple
+from typing import ClassVar, List, Literal, Optional, Tuple
 
 from datahub.configuration.pattern_utils import is_schema_allowed
 from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance
@@ -119,7 +119,6 @@ class SnowflakeFilter:
     ) -> bool:
         if not dataset_type or not dataset_name:
             return True
-        dataset_params = dataset_name.split(".")
         if dataset_type.lower() not in (
             SnowflakeObjectDomain.TABLE,
             SnowflakeObjectDomain.EXTERNAL_TABLE,
@@ -131,6 +130,7 @@ class SnowflakeFilter:
         if _is_sys_table(dataset_name):
             return False
 
+        dataset_params = _split_qualified_name(dataset_name)
         if len(dataset_params) != 3:
             self.structured_reporter.info(
                 title="Unexpected dataset pattern",
@@ -184,6 +184,46 @@ def _is_sys_table(table_name: str) -> bool:
     return table_name.lower().startswith("sys$")
 
 
+def _split_qualified_name(qualified_name: str) -> List[str]:
+    """
+    Split a qualified name into its constituent parts.
+
+    >>> _split_qualified_name("db.my_schema.my_table")
+    ['db', 'my_schema', 'my_table']
+    >>> _split_qualified_name('"db"."my_schema"."my_table"')
+    ['db', 'my_schema', 'my_table']
+    >>> _split_qualified_name('TEST_DB.TEST_SCHEMA."TABLE.WITH.DOTS"')
+    ['TEST_DB', 'TEST_SCHEMA', 'TABLE.WITH.DOTS']
+    >>> _split_qualified_name('TEST_DB."SCHEMA.WITH.DOTS".MY_TABLE')
+    ['TEST_DB', 'SCHEMA.WITH.DOTS', 'MY_TABLE']
+    """
+
+    # Fast path - no quotes.
+    if '"' not in qualified_name:
+        return qualified_name.split(".")
+
+    # First pass - split on dots that are not inside quotes.
+    in_quote = False
+    parts: List[List[str]] = [[]]
+    for char in qualified_name:
+        if char == '"':
+            in_quote = not in_quote
+        elif char == "." and not in_quote:
+            parts.append([])
+        else:
+            parts[-1].append(char)
+
+    # Second pass - remove outer pairs of quotes.
+    result = []
+    for part in parts:
+        if len(part) > 2 and part[0] == '"' and part[-1] == '"':
+            part = part[1:-1]
+
+        result.append("".join(part))
+
+    return result
+
+
 # Qualified Object names from snowflake audit logs have quotes for for snowflake quoted identifiers,
 # For example "test-database"."test-schema".test_table
 # whereas we generate urns without quotes even for quoted identifiers for backward compatibility
@@ -192,7 +232,7 @@ def _is_sys_table(table_name: str) -> bool:
 def _cleanup_qualified_name(
     qualified_name: str, structured_reporter: SourceReport
 ) -> str:
-    name_parts = qualified_name
+    name_parts = _split_qualified_name(qualified_name)
     if len(name_parts) != 3:
         if not _is_sys_table(qualified_name):
             structured_reporter.info(
@@ -203,9 +243,9 @@ def _cleanup_qualified_name(
             )
         return qualified_name.replace('"', "")
     return _combine_identifier_parts(
-        db_name=name_parts[0]
-        schema_name=name_parts[1]
-        table_name=name_parts[2]
+        db_name=name_parts[0],
+        schema_name=name_parts[1],
+        table_name=name_parts[2],
     )
 
 
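
A minimal usage sketch of the quote-aware splitter added above; the import path follows the file listing, and _split_qualified_name is a private helper, so calling it directly is for illustration only:

from datahub.ingestion.source.snowflake.snowflake_utils import _split_qualified_name

# Dots inside quoted identifiers are preserved, matching the doctests in the diff.
print(_split_qualified_name('TEST_DB."SCHEMA.WITH.DOTS".MY_TABLE'))
# ['TEST_DB', 'SCHEMA.WITH.DOTS', 'MY_TABLE']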
datahub/ingestion/source/snowflake/snowflake_v2.py (+6 -0)

@@ -17,6 +17,9 @@ from datahub.ingestion.api.decorators import (
     support_status,
 )
 from datahub.ingestion.api.incremental_lineage_helper import auto_incremental_lineage
+from datahub.ingestion.api.incremental_properties_helper import (
+    auto_incremental_properties,
+)
 from datahub.ingestion.api.source import (
     CapabilityReport,
     MetadataWorkUnitProcessor,
@@ -446,6 +449,9 @@ class SnowflakeV2Source(
             functools.partial(
                 auto_incremental_lineage, self.config.incremental_lineage
            ),
+            functools.partial(
+                auto_incremental_properties, self.config.incremental_properties
+            ),
             StaleEntityRemovalHandler.create(
                 self, self.config, self.ctx
             ).workunit_processor,
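
The new auto_incremental_properties processor is gated by the source's incremental_properties setting (visible above as self.config.incremental_properties). A hedged sketch of the corresponding recipe fragment, written as a Python dict; everything except the incremental_properties key is an illustrative placeholder:

# Sketch only: not a complete or validated Snowflake recipe.
snowflake_source = {
    "type": "snowflake",
    "config": {
        "account_id": "example_account",  # placeholder
        "incremental_properties": True,   # field name taken from the diff above
    },
}
print(snowflake_source)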
datahub/ingestion/source/sql/athena.py (+46 -22)

@@ -26,6 +26,7 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
+from datahub.ingestion.api.source import StructuredLogLevel
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.aws.s3_util import make_s3_urn
 from datahub.ingestion.source.common.subtypes import DatasetContainerSubTypes
@@ -35,6 +36,7 @@ from datahub.ingestion.source.sql.sql_common import (
     register_custom_type,
 )
 from datahub.ingestion.source.sql.sql_config import SQLCommonConfig, make_sqlalchemy_uri
+from datahub.ingestion.source.sql.sql_report import SQLSourceReport
 from datahub.ingestion.source.sql.sql_utils import (
     add_table_to_schema_container,
     gen_database_container,
@@ -48,6 +50,15 @@ from datahub.utilities.sqlalchemy_type_converter import (
     get_schema_fields_for_sqlalchemy_column,
 )
 
+try:
+    from typing_extensions import override
+except ImportError:
+    _F = typing.TypeVar("_F", bound=typing.Callable[..., typing.Any])
+
+    def override(f: _F, /) -> _F:  # noqa: F811
+        return f
+
+
 logger = logging.getLogger(__name__)
 
 assert STRUCT, "required type modules are not available"
@@ -322,12 +333,15 @@ class AthenaSource(SQLAlchemySource):
     - Profiling when enabled.
     """
 
-
+    config: AthenaConfig
+    report: SQLSourceReport
 
     def __init__(self, config, ctx):
         super().__init__(config, ctx, "athena")
         self.cursor: Optional[BaseCursor] = None
 
+        self.table_partition_cache: Dict[str, Dict[str, Partitionitem]] = {}
+
     @classmethod
     def create(cls, config_dict, ctx):
         config = AthenaConfig.parse_obj(config_dict)
@@ -452,6 +466,7 @@ class AthenaSource(SQLAlchemySource):
         )
 
     # It seems like database/schema filter in the connection string does not work and this to work around that
+    @override
     def get_schema_names(self, inspector: Inspector) -> List[str]:
         athena_config = typing.cast(AthenaConfig, self.config)
         schemas = inspector.get_schema_names()
@@ -459,34 +474,42 @@ class AthenaSource(SQLAlchemySource):
             return [schema for schema in schemas if schema == athena_config.database]
         return schemas
 
-
+    @classmethod
+    def _casted_partition_key(cls, key: str) -> str:
+        # We need to cast the partition keys to a VARCHAR, since otherwise
+        # Athena may throw an error during concatenation / comparison.
+        return f"CAST({key} as VARCHAR)"
+
+    @override
     def get_partitions(
         self, inspector: Inspector, schema: str, table: str
-    ) -> List[str]:
-
-
-        athena_config = typing.cast(AthenaConfig, self.config)
-
-        if not athena_config.extract_partitions:
-            return []
+    ) -> Optional[List[str]]:
+        if not self.config.extract_partitions:
+            return None
 
         if not self.cursor:
-            return
+            return None
 
         metadata: AthenaTableMetadata = self.cursor.get_table_metadata(
             table_name=table, schema_name=schema
         )
 
-
-
-
-
-
-
-        return []
+        partitions = []
+        for key in metadata.partition_keys:
+            if key.name:
+                partitions.append(key.name)
+        if not partitions:
+            return []
 
-
-
+        with self.report.report_exc(
+            message="Failed to extract partition details",
+            context=f"{schema}.{table}",
+            level=StructuredLogLevel.WARN,
+        ):
+            # We create an artifical concatenated partition key to be able to query max partition easier
+            part_concat = " || '-' || ".join(
+                self._casted_partition_key(key) for key in partitions
+            )
             max_partition_query = f'select {",".join(partitions)} from "{schema}"."{table}$partitions" where {part_concat} = (select max({part_concat}) from "{schema}"."{table}$partitions")'
             ret = self.cursor.execute(max_partition_query)
             max_partition: Dict[str, str] = {}
@@ -500,9 +523,8 @@ class AthenaSource(SQLAlchemySource):
                 partitions=partitions,
                 max_partition=max_partition,
             )
-            return partitions
 
-        return
+        return partitions
 
     # Overwrite to modify the creation of schema fields
     def get_schema_fields_for_column(
@@ -551,7 +573,9 @@ class AthenaSource(SQLAlchemySource):
         if partition and partition.max_partition:
             max_partition_filters = []
             for key, value in partition.max_partition.items():
-                max_partition_filters.append(
+                max_partition_filters.append(
+                    f"{self._casted_partition_key(key)} = '{value}'"
+                )
             max_partition = str(partition.max_partition)
         return (
             max_partition,
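
A small standalone illustration (not package code) of the concatenated partition key that _casted_partition_key and get_partitions build above; the schema, table, and partition column names are made up:

def casted_partition_key(key: str) -> str:
    # Mirrors _casted_partition_key in the diff: cast each key to VARCHAR before concatenation.
    return f"CAST({key} as VARCHAR)"

partitions = ["year", "month"]   # illustrative partition columns
schema, table = "web", "events"  # illustrative names
part_concat = " || '-' || ".join(casted_partition_key(k) for k in partitions)
max_partition_query = (
    f'select {",".join(partitions)} from "{schema}"."{table}$partitions" '
    f'where {part_concat} = (select max({part_concat}) from "{schema}"."{table}$partitions")'
)
print(max_partition_query)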
datahub/ingestion/source/sql/sql_common.py (+34 -21)

@@ -5,8 +5,6 @@ from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
 
 import pydantic
 import sqlalchemy.dialects.mssql
-
-# This import verifies that the dependencies are available.
 from pydantic.fields import Field
 from sqlalchemy import create_engine, inspect
 from sqlalchemy.engine.base import Connection
@@ -582,6 +582,8 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             generate_operations=False,
         )
         for dataset_name in self._view_definition_cache.keys():
+            # TODO: Ensure that the lineage generated from the view definition
+            # matches the dataset_name.
             view_definition = self._view_definition_cache[dataset_name]
             result = self._run_sql_parser(
                 dataset_name,
@@ -1059,6 +1061,20 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
                 exc=e,
             )
 
+    def _get_view_definition(self, inspector: Inspector, schema: str, view: str) -> str:
+        try:
+            view_definition = inspector.get_view_definition(view, schema)
+            if view_definition is None:
+                view_definition = ""
+            else:
+                # Some dialects return a TextClause instead of a raw string,
+                # so we need to convert them to a string.
+                view_definition = str(view_definition)
+        except NotImplementedError:
+            view_definition = ""
+
+        return view_definition
+
     def _process_view(
         self,
         dataset_name: str,
@@ -1077,7 +1093,10 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             columns = inspector.get_columns(view, schema)
         except KeyError:
             # For certain types of views, we are unable to fetch the list of columns.
-            self.
+            self.report.warning(
+                message="Unable to get schema for a view",
+                context=f"{dataset_name}",
+            )
             schema_metadata = None
         else:
             schema_fields = self.get_schema_fields(dataset_name, columns, inspector)
@@ -1091,19 +1110,12 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
         if self._save_schema_to_resolver():
             self.schema_resolver.add_schema_metadata(dataset_urn, schema_metadata)
             self.discovered_datasets.add(dataset_name)
+
         description, properties, _ = self.get_table_properties(inspector, schema, view)
-        try:
-            view_definition = inspector.get_view_definition(view, schema)
-            if view_definition is None:
-                view_definition = ""
-            else:
-                # Some dialects return a TextClause instead of a raw string,
-                # so we need to convert them to a string.
-                view_definition = str(view_definition)
-        except NotImplementedError:
-            view_definition = ""
-        properties["view_definition"] = view_definition
         properties["is_view"] = "True"
+
+        view_definition = self._get_view_definition(inspector, schema, view)
+        properties["view_definition"] = view_definition
         if view_definition and self.config.include_view_lineage:
             self._view_definition_cache[dataset_name] = view_definition
 
@@ -1135,15 +1147,14 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             entityUrn=dataset_urn,
             aspect=SubTypesClass(typeNames=[DatasetSubTypes.VIEW]),
         ).as_workunit()
-
-
-
-
-
-
-
-
-        ).as_workunit()
+
+        view_properties_aspect = ViewPropertiesClass(
+            materialized=False, viewLanguage="SQL", viewLogic=view_definition
+        )
+        yield MetadataChangeProposalWrapper(
+            entityUrn=dataset_urn,
+            aspect=view_properties_aspect,
+        ).as_workunit()
 
         if self.config.domain and self.domain_registry:
             yield from get_domain_wu(
@@ -1197,6 +1208,8 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             )
         else:
             self.report.num_view_definitions_parsed += 1
+            if raw_lineage.out_tables != [view_urn]:
+                self.report.num_view_definitions_view_urn_mismatch += 1
         return view_definition_lineage_helper(raw_lineage, view_urn)
 
     def get_db_schema(self, dataset_identifier: str) -> Tuple[Optional[str], str]:
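
A minimal sketch of emitting the view-properties aspect the way the rewritten block in _process_view does above; the dataset URN and SQL text are placeholders, and the imports follow the standard DataHub SDK paths:

from datahub.emitter.mce_builder import make_dataset_urn
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.schema_classes import ViewPropertiesClass

dataset_urn = make_dataset_urn(platform="postgres", name="db.schema.my_view")  # placeholder
view_properties_aspect = ViewPropertiesClass(
    materialized=False, viewLanguage="SQL", viewLogic="select 1 as x"  # placeholder SQL
)
workunit = MetadataChangeProposalWrapper(
    entityUrn=dataset_urn,
    aspect=view_properties_aspect,
).as_workunit()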
datahub/ingestion/source/sql/sql_report.py (+1 -0)

@@ -48,6 +48,7 @@ class SQLSourceReport(
     query_combiner: Optional[SQLAlchemyQueryCombinerReport] = None
 
     num_view_definitions_parsed: int = 0
+    num_view_definitions_view_urn_mismatch: int = 0
     num_view_definitions_failed_parsing: int = 0
     num_view_definitions_failed_column_parsing: int = 0
     view_definitions_parsing_failures: LossyList[str] = field(default_factory=LossyList)
datahub/ingestion/source/sql/sql_types.py (+85 -8)

@@ -1,5 +1,5 @@
 import re
-from typing import Any, Dict, ValuesView
+from typing import Any, Dict, Optional, Type, Union, ValuesView
 
 from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     ArrayType,
@@ -16,14 +16,28 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     UnionType,
 )
 
-
-
-
+DATAHUB_FIELD_TYPE = Union[
+    ArrayType,
+    BooleanType,
+    BytesType,
+    DateType,
+    EnumType,
+    MapType,
+    NullType,
+    NumberType,
+    RecordType,
+    StringType,
+    TimeType,
+    UnionType,
+]
 
-# we map from format_type since this is what dbt uses
-# see https://github.com/fishtown-analytics/dbt/blob/master/plugins/postgres/dbt/include/postgres/macros/catalog.sql#L22
 
-#
+# These can be obtained by running `select format_type(oid, null),* from pg_type;`
+# We've omitted the types without a meaningful DataHub type (e.g. postgres-specific types, index vectors, etc.)
+# (run `\copy (select format_type(oid, null),* from pg_type) to 'pg_type.csv' csv header;` to get a CSV)
+# We map from format_type since this is what dbt uses.
+# See https://github.com/fishtown-analytics/dbt/blob/master/plugins/postgres/dbt/include/postgres/macros/catalog.sql#L22
+# See https://www.npgsql.org/dev/types.html for helpful type annotations
 POSTGRES_TYPES_MAP: Dict[str, Any] = {
     "boolean": BooleanType,
     "bytea": BytesType,
@@ -262,7 +276,6 @@ def resolve_vertica_modified_type(type_string: str) -> Any:
     return VERTICA_SQL_TYPES_MAP[type_string]
 
 
-# see https://docs.snowflake.com/en/sql-reference/intro-summary-data-types.html
 SNOWFLAKE_TYPES_MAP: Dict[str, Any] = {
     "NUMBER": NumberType,
     "DECIMAL": NumberType,
@@ -298,6 +311,18 @@ SNOWFLAKE_TYPES_MAP: Dict[str, Any] = {
     "GEOGRAPHY": None,
 }
 
+
+def resolve_snowflake_modified_type(type_string: str) -> Any:
+    # Match types with precision and scale, e.g., 'DECIMAL(38,0)'
+    match = re.match(r"([a-zA-Z_]+)\(\d+,\s\d+\)", type_string)
+    if match:
+        modified_type_base = match.group(1)  # Extract the base type
+        return SNOWFLAKE_TYPES_MAP.get(modified_type_base, None)
+
+    # Fallback for types without precision/scale
+    return SNOWFLAKE_TYPES_MAP.get(type_string, None)
+
+
 # see https://github.com/googleapis/python-bigquery-sqlalchemy/blob/main/sqlalchemy_bigquery/_types.py#L32
 BIGQUERY_TYPES_MAP: Dict[str, Any] = {
     "STRING": StringType,
@@ -366,6 +391,7 @@ TRINO_SQL_TYPES_MAP: Dict[str, Any] = {
     "row": RecordType,
     "map": MapType,
     "array": ArrayType,
+    "json": RecordType,
 }
 
 # https://docs.aws.amazon.com/athena/latest/ug/data-types.html
@@ -430,3 +456,54 @@ VERTICA_SQL_TYPES_MAP: Dict[str, Any] = {
     "geography": None,
     "uuid": StringType,
 }
+
+
+_merged_mapping = {
+    "boolean": BooleanType,
+    "date": DateType,
+    "time": TimeType,
+    "numeric": NumberType,
+    "text": StringType,
+    "timestamp with time zone": DateType,
+    "timestamp without time zone": DateType,
+    "integer": NumberType,
+    "float8": NumberType,
+    "struct": RecordType,
+    **POSTGRES_TYPES_MAP,
+    **SNOWFLAKE_TYPES_MAP,
+    **BIGQUERY_TYPES_MAP,
+    **SPARK_SQL_TYPES_MAP,
+    **TRINO_SQL_TYPES_MAP,
+    **ATHENA_SQL_TYPES_MAP,
+    **VERTICA_SQL_TYPES_MAP,
+}
+
+
+def resolve_sql_type(
+    column_type: Optional[str],
+    platform: Optional[str] = None,
+) -> Optional[DATAHUB_FIELD_TYPE]:
+    # In theory, we should use the platform-specific mapping where available.
+    # However, the types don't ever conflict, so the merged mapping is fine.
+    TypeClass: Optional[Type[DATAHUB_FIELD_TYPE]] = (
+        _merged_mapping.get(column_type) if column_type else None
+    )
+
+    if TypeClass is None and column_type:
+        # resolve a modified type
+        if platform == "trino":
+            TypeClass = resolve_trino_modified_type(column_type)
+        elif platform == "athena":
+            TypeClass = resolve_athena_modified_type(column_type)
+        elif platform == "postgres" or platform == "redshift":
+            # Redshift uses a variant of Postgres, so we can use the same logic.
+            TypeClass = resolve_postgres_modified_type(column_type)
+        elif platform == "vertica":
+            TypeClass = resolve_vertica_modified_type(column_type)
+        elif platform == "snowflake":
+            # Snowflake types are uppercase, so we check that.
+            TypeClass = resolve_snowflake_modified_type(column_type.upper())
+
+    if TypeClass:
+        return TypeClass()
+    return None
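
A minimal usage sketch of the new resolve_sql_type helper; the import path follows the file listing, and the sample inputs are illustrative:

from datahub.ingestion.source.sql.sql_types import resolve_sql_type

# "boolean" is in the merged mapping, so this returns a BooleanType instance.
print(resolve_sql_type("boolean"))

# Parametrized types fall back to the per-platform resolvers, e.g. the
# resolve_snowflake_modified_type added above (expected: a NumberType instance).
print(resolve_sql_type("DECIMAL(38, 0)", platform="snowflake"))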
datahub/ingestion/source/state/redundant_run_skip_handler.py (+1 -1)

@@ -69,7 +69,7 @@ class RedundantRunSkipHandler(
         platform: Optional[str] = None
         source_class = type(self.source)
         if hasattr(source_class, "get_platform_name"):
-            platform = source_class.get_platform_name()
+            platform = source_class.get_platform_name()
 
         # Default name for everything else
         job_name_suffix = self.get_job_name_suffix()