acryl-datahub 0.15.0rc5__py3-none-any.whl → 0.15.0rc7__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of acryl-datahub may be problematic.
- {acryl_datahub-0.15.0rc5.dist-info → acryl_datahub-0.15.0rc7.dist-info}/METADATA +2456 -2426
- {acryl_datahub-0.15.0rc5.dist-info → acryl_datahub-0.15.0rc7.dist-info}/RECORD +43 -41
- {acryl_datahub-0.15.0rc5.dist-info → acryl_datahub-0.15.0rc7.dist-info}/entry_points.txt +1 -0
- datahub/__init__.py +1 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +1 -1
- datahub/cli/put_cli.py +1 -1
- datahub/cli/specific/dataproduct_cli.py +1 -1
- datahub/emitter/mcp_patch_builder.py +43 -0
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +9 -4
- datahub/ingestion/source/aws/sagemaker_processors/models.py +30 -1
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
- datahub/ingestion/source/common/subtypes.py +2 -0
- datahub/ingestion/source/csv_enricher.py +1 -1
- datahub/ingestion/source/dbt/dbt_common.py +7 -61
- datahub/ingestion/source/dremio/dremio_api.py +11 -0
- datahub/ingestion/source/dremio/dremio_aspects.py +19 -15
- datahub/ingestion/source/dremio/dremio_config.py +5 -0
- datahub/ingestion/source/dremio/dremio_entities.py +4 -0
- datahub/ingestion/source/dremio/dremio_source.py +7 -2
- datahub/ingestion/source/elastic_search.py +1 -1
- datahub/ingestion/source/gc/dataprocess_cleanup.py +6 -1
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +1 -1
- datahub/ingestion/source/ge_data_profiler.py +23 -1
- datahub/ingestion/source/neo4j/__init__.py +0 -0
- datahub/ingestion/source/neo4j/neo4j_source.py +331 -0
- datahub/ingestion/source/qlik_sense/data_classes.py +1 -0
- datahub/ingestion/source/redshift/redshift.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +1 -0
- datahub/ingestion/source/sql/athena.py +46 -22
- datahub/ingestion/source/sql/sql_types.py +85 -8
- datahub/ingestion/source/unity/proxy_types.py +1 -0
- datahub/ingestion/transformer/add_dataset_tags.py +1 -1
- datahub/ingestion/transformer/generic_aspect_transformer.py +1 -1
- datahub/integrations/assertion/common.py +1 -1
- datahub/lite/duckdb_lite.py +12 -17
- datahub/specific/chart.py +0 -39
- datahub/specific/dashboard.py +0 -39
- datahub/specific/datajob.py +3 -47
- datahub/utilities/urn_encoder.py +2 -1
- datahub/utilities/urns/_urn_base.py +1 -1
- datahub/utilities/urns/structured_properties_urn.py +1 -1
- {acryl_datahub-0.15.0rc5.dist-info → acryl_datahub-0.15.0rc7.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0rc5.dist-info → acryl_datahub-0.15.0rc7.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/sql/athena.py
CHANGED

@@ -26,6 +26,7 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
+from datahub.ingestion.api.source import StructuredLogLevel
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.aws.s3_util import make_s3_urn
 from datahub.ingestion.source.common.subtypes import DatasetContainerSubTypes
@@ -35,6 +36,7 @@ from datahub.ingestion.source.sql.sql_common import (
     register_custom_type,
 )
 from datahub.ingestion.source.sql.sql_config import SQLCommonConfig, make_sqlalchemy_uri
+from datahub.ingestion.source.sql.sql_report import SQLSourceReport
 from datahub.ingestion.source.sql.sql_utils import (
     add_table_to_schema_container,
     gen_database_container,
@@ -48,6 +50,15 @@ from datahub.utilities.sqlalchemy_type_converter import (
     get_schema_fields_for_sqlalchemy_column,
 )

+try:
+    from typing_extensions import override
+except ImportError:
+    _F = typing.TypeVar("_F", bound=typing.Callable[..., typing.Any])
+
+    def override(f: _F, /) -> _F: # noqa: F811
+        return f
+
+
 logger = logging.getLogger(__name__)

 assert STRUCT, "required type modules are not available"
@@ -322,12 +333,15 @@ class AthenaSource(SQLAlchemySource):
     - Profiling when enabled.
     """

-
+    config: AthenaConfig
+    report: SQLSourceReport

     def __init__(self, config, ctx):
         super().__init__(config, ctx, "athena")
         self.cursor: Optional[BaseCursor] = None

+        self.table_partition_cache: Dict[str, Dict[str, Partitionitem]] = {}
+
     @classmethod
     def create(cls, config_dict, ctx):
         config = AthenaConfig.parse_obj(config_dict)
@@ -452,6 +466,7 @@ class AthenaSource(SQLAlchemySource):
         )

     # It seems like database/schema filter in the connection string does not work and this to work around that
+    @override
     def get_schema_names(self, inspector: Inspector) -> List[str]:
         athena_config = typing.cast(AthenaConfig, self.config)
         schemas = inspector.get_schema_names()
@@ -459,34 +474,42 @@ class AthenaSource(SQLAlchemySource):
             return [schema for schema in schemas if schema == athena_config.database]
         return schemas

-
+    @classmethod
+    def _casted_partition_key(cls, key: str) -> str:
+        # We need to cast the partition keys to a VARCHAR, since otherwise
+        # Athena may throw an error during concatenation / comparison.
+        return f"CAST({key} as VARCHAR)"
+
+    @override
     def get_partitions(
         self, inspector: Inspector, schema: str, table: str
-    ) -> List[str]:
-
-
-        athena_config = typing.cast(AthenaConfig, self.config)
-
-        if not athena_config.extract_partitions:
-            return []
+    ) -> Optional[List[str]]:
+        if not self.config.extract_partitions:
+            return None

         if not self.cursor:
-            return
+            return None

         metadata: AthenaTableMetadata = self.cursor.get_table_metadata(
             table_name=table, schema_name=schema
         )

-
-
-
-
-
-
-            return []
+        partitions = []
+        for key in metadata.partition_keys:
+            if key.name:
+                partitions.append(key.name)
+        if not partitions:
+            return []

-
-
+        with self.report.report_exc(
+            message="Failed to extract partition details",
+            context=f"{schema}.{table}",
+            level=StructuredLogLevel.WARN,
+        ):
+            # We create an artifical concatenated partition key to be able to query max partition easier
+            part_concat = " || '-' || ".join(
+                self._casted_partition_key(key) for key in partitions
+            )
             max_partition_query = f'select {",".join(partitions)} from "{schema}"."{table}$partitions" where {part_concat} = (select max({part_concat}) from "{schema}"."{table}$partitions")'
             ret = self.cursor.execute(max_partition_query)
             max_partition: Dict[str, str] = {}
@@ -500,9 +523,8 @@ class AthenaSource(SQLAlchemySource):
                 partitions=partitions,
                 max_partition=max_partition,
             )
-            return partitions

-        return
+        return partitions

     # Overwrite to modify the creation of schema fields
     def get_schema_fields_for_column(
@@ -551,7 +573,9 @@ class AthenaSource(SQLAlchemySource):
         if partition and partition.max_partition:
             max_partition_filters = []
             for key, value in partition.max_partition.items():
-                max_partition_filters.append(
+                max_partition_filters.append(
+                    f"{self._casted_partition_key(key)} = '{value}'"
+                )
             max_partition = str(partition.max_partition)
         return (
             max_partition,
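For illustration, the query that the new partition logic above assembles looks like the following. This is a standalone sketch, not part of the package; the schema, table, and key names are invented.

```python
# Illustrative sketch only: mirrors how AthenaSource builds its max-partition
# query from the casted, concatenated partition keys. All names are hypothetical.
from typing import List


def casted_partition_key(key: str) -> str:
    # Cast each key to VARCHAR so keys of different types can be concatenated and compared.
    return f"CAST({key} as VARCHAR)"


def build_max_partition_query(schema: str, table: str, partitions: List[str]) -> str:
    part_concat = " || '-' || ".join(casted_partition_key(k) for k in partitions)
    return (
        f'select {",".join(partitions)} from "{schema}"."{table}$partitions" '
        f"where {part_concat} = "
        f'(select max({part_concat}) from "{schema}"."{table}$partitions")'
    )


print(build_max_partition_query("analytics", "events", ["year", "month", "day"]))
```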
datahub/ingestion/source/sql/sql_types.py
CHANGED

@@ -1,5 +1,5 @@
 import re
-from typing import Any, Dict, ValuesView
+from typing import Any, Dict, Optional, Type, Union, ValuesView

 from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     ArrayType,
@@ -16,14 +16,28 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     UnionType,
 )

-
-
-
+DATAHUB_FIELD_TYPE = Union[
+    ArrayType,
+    BooleanType,
+    BytesType,
+    DateType,
+    EnumType,
+    MapType,
+    NullType,
+    NumberType,
+    RecordType,
+    StringType,
+    TimeType,
+    UnionType,
+]

-# we map from format_type since this is what dbt uses
-# see https://github.com/fishtown-analytics/dbt/blob/master/plugins/postgres/dbt/include/postgres/macros/catalog.sql#L22

-#
+# These can be obtained by running `select format_type(oid, null),* from pg_type;`
+# We've omitted the types without a meaningful DataHub type (e.g. postgres-specific types, index vectors, etc.)
+# (run `\copy (select format_type(oid, null),* from pg_type) to 'pg_type.csv' csv header;` to get a CSV)
+# We map from format_type since this is what dbt uses.
+# See https://github.com/fishtown-analytics/dbt/blob/master/plugins/postgres/dbt/include/postgres/macros/catalog.sql#L22
+# See https://www.npgsql.org/dev/types.html for helpful type annotations
 POSTGRES_TYPES_MAP: Dict[str, Any] = {
     "boolean": BooleanType,
     "bytea": BytesType,
@@ -262,7 +276,6 @@ def resolve_vertica_modified_type(type_string: str) -> Any:
         return VERTICA_SQL_TYPES_MAP[type_string]


-# see https://docs.snowflake.com/en/sql-reference/intro-summary-data-types.html
 SNOWFLAKE_TYPES_MAP: Dict[str, Any] = {
     "NUMBER": NumberType,
     "DECIMAL": NumberType,
@@ -298,6 +311,18 @@ SNOWFLAKE_TYPES_MAP: Dict[str, Any] = {
     "GEOGRAPHY": None,
 }

+
+def resolve_snowflake_modified_type(type_string: str) -> Any:
+    # Match types with precision and scale, e.g., 'DECIMAL(38,0)'
+    match = re.match(r"([a-zA-Z_]+)\(\d+,\s\d+\)", type_string)
+    if match:
+        modified_type_base = match.group(1)  # Extract the base type
+        return SNOWFLAKE_TYPES_MAP.get(modified_type_base, None)
+
+    # Fallback for types without precision/scale
+    return SNOWFLAKE_TYPES_MAP.get(type_string, None)
+
+
 # see https://github.com/googleapis/python-bigquery-sqlalchemy/blob/main/sqlalchemy_bigquery/_types.py#L32
 BIGQUERY_TYPES_MAP: Dict[str, Any] = {
     "STRING": StringType,
@@ -366,6 +391,7 @@ TRINO_SQL_TYPES_MAP: Dict[str, Any] = {
     "row": RecordType,
     "map": MapType,
     "array": ArrayType,
+    "json": RecordType,
 }

 # https://docs.aws.amazon.com/athena/latest/ug/data-types.html
@@ -430,3 +456,54 @@ VERTICA_SQL_TYPES_MAP: Dict[str, Any] = {
     "geography": None,
     "uuid": StringType,
 }
+
+
+_merged_mapping = {
+    "boolean": BooleanType,
+    "date": DateType,
+    "time": TimeType,
+    "numeric": NumberType,
+    "text": StringType,
+    "timestamp with time zone": DateType,
+    "timestamp without time zone": DateType,
+    "integer": NumberType,
+    "float8": NumberType,
+    "struct": RecordType,
+    **POSTGRES_TYPES_MAP,
+    **SNOWFLAKE_TYPES_MAP,
+    **BIGQUERY_TYPES_MAP,
+    **SPARK_SQL_TYPES_MAP,
+    **TRINO_SQL_TYPES_MAP,
+    **ATHENA_SQL_TYPES_MAP,
+    **VERTICA_SQL_TYPES_MAP,
+}
+
+
+def resolve_sql_type(
+    column_type: Optional[str],
+    platform: Optional[str] = None,
+) -> Optional[DATAHUB_FIELD_TYPE]:
+    # In theory, we should use the platform-specific mapping where available.
+    # However, the types don't ever conflict, so the merged mapping is fine.
+    TypeClass: Optional[Type[DATAHUB_FIELD_TYPE]] = (
+        _merged_mapping.get(column_type) if column_type else None
+    )
+
+    if TypeClass is None and column_type:
+        # resolve a modified type
+        if platform == "trino":
+            TypeClass = resolve_trino_modified_type(column_type)
+        elif platform == "athena":
+            TypeClass = resolve_athena_modified_type(column_type)
+        elif platform == "postgres" or platform == "redshift":
+            # Redshift uses a variant of Postgres, so we can use the same logic.
+            TypeClass = resolve_postgres_modified_type(column_type)
+        elif platform == "vertica":
+            TypeClass = resolve_vertica_modified_type(column_type)
+        elif platform == "snowflake":
+            # Snowflake types are uppercase, so we check that.
+            TypeClass = resolve_snowflake_modified_type(column_type.upper())
+
+    if TypeClass:
+        return TypeClass()
+    return None
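A brief usage sketch of the new `resolve_sql_type` helper added above, assuming it is importable from `datahub.ingestion.source.sql.sql_types` as the file list indicates; the column types passed in are illustrative.

```python
from datahub.ingestion.source.sql.sql_types import resolve_sql_type

# Plain lookup through the merged mapping.
print(resolve_sql_type("boolean"))                               # a BooleanType instance
# Platform-specific fallback for a "modified" type with precision/scale.
print(resolve_sql_type("DECIMAL(38, 0)", platform="snowflake"))  # a NumberType instance
# Unknown types resolve to None rather than raising.
print(resolve_sql_type("some_custom_type"))                      # None
```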
datahub/ingestion/source/unity/proxy_types.py
CHANGED

@@ -33,6 +33,7 @@ from datahub.metadata.schema_classes import (

 logger = logging.getLogger(__name__)

+# TODO: (maybe) Replace with standardized types in sql_types.py
 DATA_TYPE_REGISTRY: dict = {
     ColumnTypeName.BOOLEAN: BooleanTypeClass,
     ColumnTypeName.BYTE: BytesTypeClass,
datahub/ingestion/transformer/add_dataset_tags.py
CHANGED

@@ -74,7 +74,7 @@ class AddDatasetTags(DatasetTagsTransformer):
         logger.debug("Generating tags")

         for tag_association in self.processed_tags.values():
-            tag_urn = TagUrn.
+            tag_urn = TagUrn.from_string(tag_association.tag)
             mcps.append(
                 MetadataChangeProposalWrapper(
                     entityUrn=tag_urn.urn(),
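This hunk and several below migrate URN parsing to the `from_string` constructors. A minimal sketch of the helpers involved, using made-up URN values:

```python
from datahub.metadata.urns import TagUrn, Urn

tag_urn = TagUrn.from_string("urn:li:tag:Legacy")
print(tag_urn.urn())            # urn:li:tag:Legacy

generic = Urn.from_string("urn:li:corpuser:datahub")
print(generic.get_type())       # corpuser
print(generic.get_entity_id())  # ['datahub']
```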
datahub/ingestion/transformer/generic_aspect_transformer.py
CHANGED

@@ -100,7 +100,7 @@ class GenericAspectTransformer(
             )
             if transformed_aspect:
                 # for end of stream records, we modify the workunit-id
-                structured_urn = Urn.
+                structured_urn = Urn.from_string(urn)
                 simple_name = "-".join(structured_urn.get_entity_id())
                 record_metadata = envelope.metadata.copy()
                 record_metadata.update(
datahub/integrations/assertion/common.py
CHANGED

@@ -42,7 +42,7 @@ def get_entity_name(assertion: BaseEntityAssertion) -> Tuple[str, str, str]:
     if qualified_name is not None:
         parts = qualified_name.split(".")
     else:
-        urn_id = Urn.
+        urn_id = Urn.from_string(assertion.entity).entity_ids[1]
         parts = urn_id.split(".")
     if len(parts) > 3:
         parts = parts[-3:]
datahub/lite/duckdb_lite.py
CHANGED

@@ -609,7 +609,7 @@ class DuckDBLite(DataHubLiteLocal[DuckDBLiteConfig]):
             aspect_map, DataPlatformInstanceClass
         ) # type: ignore

-        needs_platform = Urn.
+        needs_platform = Urn.from_string(entity_urn).get_type() in [
             "dataset",
             "container",
             "chart",
@@ -617,7 +617,7 @@ class DuckDBLite(DataHubLiteLocal[DuckDBLiteConfig]):
             "dataFlow",
             "dataJob",
         ]
-        entity_urn_parsed = Urn.
+        entity_urn_parsed = Urn.from_string(entity_urn)
         if entity_urn_parsed.get_type() in ["dataFlow", "dataJob"]:
             self.add_edge(
                 entity_urn,
@@ -630,15 +630,12 @@ class DuckDBLite(DataHubLiteLocal[DuckDBLiteConfig]):
             # this is a top-level entity
             if not dpi:
                 logger.debug(f"No data platform instance for {entity_urn}")
-                maybe_parent_urn = Urn.
+                maybe_parent_urn = Urn.from_string(entity_urn).get_entity_id()[0]
                 needs_dpi = False
                 if maybe_parent_urn.startswith(Urn.URN_PREFIX):
                     parent_urn = maybe_parent_urn
-                    if (
-
-                        == "dataPlatform"
-                    ):
-                        data_platform_urn = DataPlatformUrn.create_from_string(
+                    if Urn.from_string(maybe_parent_urn).get_type() == "dataPlatform":
+                        data_platform_urn = DataPlatformUrn.from_string(
                             maybe_parent_urn
                         )
                         needs_dpi = True
@@ -660,7 +657,7 @@ class DuckDBLite(DataHubLiteLocal[DuckDBLiteConfig]):
                     logger.error(f"Failed to generate edges entity {entity_urn}", e)
                 parent_urn = str(data_platform_instance_urn)
             else:
-                data_platform_urn = DataPlatformUrn.
+                data_platform_urn = DataPlatformUrn.from_string(dpi.platform)
                 data_platform_instance = dpi.instance or "default"
                 data_platform_instance_urn = Urn(
                     entity_type="dataPlatformInstance",
@@ -673,9 +670,7 @@ class DuckDBLite(DataHubLiteLocal[DuckDBLiteConfig]):
             parent_urn = "__root__"

         types = (
-            subtypes.typeNames
-            if subtypes
-            else [Urn.create_from_string(entity_urn).get_type()]
+            subtypes.typeNames if subtypes else [Urn.from_string(entity_urn).get_type()]
         )
         for t in types:
             type_urn = Urn(entity_type="systemNode", entity_id=[parent_urn, t])
@@ -686,7 +681,7 @@ class DuckDBLite(DataHubLiteLocal[DuckDBLiteConfig]):
     def _create_edges_from_data_platform_instance(
         self, data_platform_instance_urn: Urn
     ) -> None:
-        data_platform_urn = DataPlatformUrn.
+        data_platform_urn = DataPlatformUrn.from_string(
             data_platform_instance_urn.get_entity_id()[0]
         )
         data_platform_instances_urn = Urn(
@@ -735,7 +730,7 @@ class DuckDBLite(DataHubLiteLocal[DuckDBLiteConfig]):
         if isinstance(aspect, DatasetPropertiesClass):
             dp: DatasetPropertiesClass = aspect
             if dp.name:
-                specific_urn = DatasetUrn.
+                specific_urn = DatasetUrn.from_string(entity_urn)
                 if (
                     specific_urn.get_data_platform_urn().get_entity_id_as_string()
                     == "looker"
@@ -755,7 +750,7 @@ class DuckDBLite(DataHubLiteLocal[DuckDBLiteConfig]):
             self.add_edge(entity_urn, "name", cp.name, remove_existing=True)
         elif isinstance(aspect, DataPlatformInstanceClass):
             dpi: DataPlatformInstanceClass = aspect
-            data_platform_urn = DataPlatformUrn.
+            data_platform_urn = DataPlatformUrn.from_string(dpi.platform)
             data_platform_instance = dpi.instance or "default"
             data_platform_instance_urn = Urn(
                 entity_type="dataPlatformInstance",
@@ -763,7 +758,7 @@ class DuckDBLite(DataHubLiteLocal[DuckDBLiteConfig]):
             )
             self._create_edges_from_data_platform_instance(data_platform_instance_urn)
         elif isinstance(aspect, ChartInfoClass):
-            urn = Urn.
+            urn = Urn.from_string(entity_urn)
             self.add_edge(
                 entity_urn,
                 "name",
@@ -771,7 +766,7 @@ class DuckDBLite(DataHubLiteLocal[DuckDBLiteConfig]):
                 remove_existing=True,
             )
         elif isinstance(aspect, DashboardInfoClass):
-            urn = Urn.
+            urn = Urn.from_string(entity_urn)
             self.add_edge(
                 entity_urn,
                 "name",
datahub/specific/chart.py
CHANGED

@@ -1,10 +1,8 @@
-import time
 from typing import Dict, List, Optional, Union

 from datahub.emitter.mcp_patch_builder import MetadataPatchProposal
 from datahub.metadata.schema_classes import (
     AccessLevelClass,
-    AuditStampClass,
     ChangeAuditStampsClass,
     ChartInfoClass as ChartInfo,
     ChartTypeClass,
@@ -47,43 +45,6 @@ class ChartPatchBuilder(MetadataPatchProposal):
         )
         self.ownership_patch_helper = OwnershipPatchHelper(self)

-    def _mint_auditstamp(self, message: Optional[str] = None) -> AuditStampClass:
-        """
-        Creates an AuditStampClass instance with the current timestamp and other default values.
-
-        Args:
-            message: The message associated with the audit stamp (optional).
-
-        Returns:
-            An instance of AuditStampClass.
-        """
-        return AuditStampClass(
-            time=int(time.time() * 1000.0),
-            actor="urn:li:corpuser:datahub",
-            message=message,
-        )
-
-    def _ensure_urn_type(
-        self, entity_type: str, edges: List[Edge], context: str
-    ) -> None:
-        """
-        Ensures that the destination URNs in the given edges have the specified entity type.
-
-        Args:
-            entity_type: The entity type to check against.
-            edges: A list of Edge objects.
-            context: The context or description of the operation.
-
-        Raises:
-            ValueError: If any of the destination URNs is not of the specified entity type.
-        """
-        for e in edges:
-            urn = Urn.create_from_string(e.destinationUrn)
-            if not urn.get_type() == entity_type:
-                raise ValueError(
-                    f"{context}: {e.destinationUrn} is not of type {entity_type}"
-                )
-
     def add_owner(self, owner: Owner) -> "ChartPatchBuilder":
         """
         Adds an owner to the ChartPatchBuilder.
datahub/specific/dashboard.py
CHANGED

@@ -1,10 +1,8 @@
-import time
 from typing import Dict, List, Optional, Union

 from datahub.emitter.mcp_patch_builder import MetadataPatchProposal
 from datahub.metadata.schema_classes import (
     AccessLevelClass,
-    AuditStampClass,
     ChangeAuditStampsClass,
     DashboardInfoClass as DashboardInfo,
     EdgeClass as Edge,
@@ -46,43 +44,6 @@ class DashboardPatchBuilder(MetadataPatchProposal):
         )
         self.ownership_patch_helper = OwnershipPatchHelper(self)

-    def _mint_auditstamp(self, message: Optional[str] = None) -> AuditStampClass:
-        """
-        Creates an AuditStampClass instance with the current timestamp and other default values.
-
-        Args:
-            message: The message associated with the audit stamp (optional).
-
-        Returns:
-            An instance of AuditStampClass.
-        """
-        return AuditStampClass(
-            time=int(time.time() * 1000.0),
-            actor="urn:li:corpuser:datahub",
-            message=message,
-        )
-
-    def _ensure_urn_type(
-        self, entity_type: str, edges: List[Edge], context: str
-    ) -> None:
-        """
-        Ensures that the destination URNs in the given edges have the specified entity type.
-
-        Args:
-            entity_type: The entity type to check against.
-            edges: A list of Edge objects.
-            context: The context or description of the operation.
-
-        Raises:
-            ValueError: If any of the destination URNs is not of the specified entity type.
-        """
-        for e in edges:
-            urn = Urn.create_from_string(e.destinationUrn)
-            if not urn.get_type() == entity_type:
-                raise ValueError(
-                    f"{context}: {e.destinationUrn} is not of type {entity_type}"
-                )
-
     def add_owner(self, owner: Owner) -> "DashboardPatchBuilder":
         """
         Adds an owner to the DashboardPatchBuilder.
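The `_mint_auditstamp` and `_ensure_urn_type` helpers removed from chart.py, dashboard.py, and (below) datajob.py are presumably now provided by the shared patch-builder base, given the 43 lines added to datahub/emitter/mcp_patch_builder.py in the file list above. Reconstructed from the removed code, the audit-stamp helper boils down to:

```python
# Sketch reconstructed from the removed helper; where it now lives is an assumption.
import time
from typing import Optional

from datahub.metadata.schema_classes import AuditStampClass


def mint_auditstamp(message: Optional[str] = None) -> AuditStampClass:
    # Current time in epoch milliseconds, attributed to the default datahub actor.
    return AuditStampClass(
        time=int(time.time() * 1000.0),
        actor="urn:li:corpuser:datahub",
        message=message,
    )
```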
datahub/specific/datajob.py
CHANGED

@@ -1,9 +1,7 @@
-import time
 from typing import Dict, List, Optional, Union

 from datahub.emitter.mcp_patch_builder import MetadataPatchProposal
 from datahub.metadata.schema_classes import (
-    AuditStampClass,
     DataJobInfoClass as DataJobInfo,
     DataJobInputOutputClass as DataJobInputOutput,
     EdgeClass as Edge,
@@ -16,10 +14,9 @@ from datahub.metadata.schema_classes import (
     SystemMetadataClass,
     TagAssociationClass as Tag,
 )
+from datahub.metadata.urns import SchemaFieldUrn, TagUrn, Urn
 from datahub.specific.custom_properties import CustomPropertiesPatchHelper
 from datahub.specific.ownership import OwnershipPatchHelper
-from datahub.utilities.urns.tag_urn import TagUrn
-from datahub.utilities.urns.urn import Urn


 class DataJobPatchBuilder(MetadataPatchProposal):
@@ -45,43 +42,6 @@ class DataJobPatchBuilder(MetadataPatchProposal):
         )
         self.ownership_patch_helper = OwnershipPatchHelper(self)

-    def _mint_auditstamp(self, message: Optional[str] = None) -> AuditStampClass:
-        """
-        Creates an AuditStampClass instance with the current timestamp and other default values.
-
-        Args:
-            message: The message associated with the audit stamp (optional).
-
-        Returns:
-            An instance of AuditStampClass.
-        """
-        return AuditStampClass(
-            time=int(time.time() * 1000.0),
-            actor="urn:li:corpuser:datahub",
-            message=message,
-        )
-
-    def _ensure_urn_type(
-        self, entity_type: str, edges: List[Edge], context: str
-    ) -> None:
-        """
-        Ensures that the destination URNs in the given edges have the specified entity type.
-
-        Args:
-            entity_type: The entity type to check against.
-            edges: A list of Edge objects.
-            context: The context or description of the operation.
-
-        Raises:
-            ValueError: If any of the destination URNs is not of the specified entity type.
-        """
-        for e in edges:
-            urn = Urn.create_from_string(e.destinationUrn)
-            if not urn.get_type() == entity_type:
-                raise ValueError(
-                    f"{context}: {e.destinationUrn} is not of type {entity_type}"
-                )
-
     def add_owner(self, owner: Owner) -> "DataJobPatchBuilder":
         """
         Adds an owner to the DataJobPatchBuilder.
@@ -392,9 +352,7 @@ class DataJobPatchBuilder(MetadataPatchProposal):
             ValueError: If the input is not a Schema Field urn.
         """
         input_urn = str(input)
-
-        if not urn.get_type() == "schemaField":
-            raise ValueError(f"Input {input} is not a Schema Field urn")
+        assert SchemaFieldUrn.from_string(input_urn)

         self._add_patch(
             DataJobInputOutput.ASPECT_NAME,
@@ -466,9 +424,7 @@ class DataJobPatchBuilder(MetadataPatchProposal):
             ValueError: If the output is not a Schema Field urn.
         """
         output_urn = str(output)
-
-        if not urn.get_type() == "schemaField":
-            raise ValueError(f"Input {output} is not a Schema Field urn")
+        assert SchemaFieldUrn.from_string(output_urn)

         self._add_patch(
             DataJobInputOutput.ASPECT_NAME,
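The `assert SchemaFieldUrn.from_string(...)` checks above replace the manual `get_type()` comparison. A small sketch of the behaviour; the URNs are illustrative, and it is assumed that `from_string` raises for a URN of the wrong entity type, which is how the typed URN classes generally behave:

```python
from datahub.metadata.urns import SchemaFieldUrn

field = SchemaFieldUrn.from_string(
    "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,db.table,PROD),user_id)"
)
print(field.urn())

try:
    SchemaFieldUrn.from_string("urn:li:tag:not-a-field")
except Exception as e:  # assumed: an invalid-URN error for the wrong entity type
    print(f"rejected: {e}")
```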
datahub/utilities/urn_encoder.py
CHANGED

@@ -4,7 +4,8 @@ from typing import List
 # NOTE: Frontend relies on encoding these three characters. Specifically, we decode and encode schema fields for column level lineage.
 # If this changes, make appropriate changes to datahub-web-react/src/app/lineage/utils/columnLineageUtils.ts
 # We also rely on encoding these exact three characters when generating schemaField urns in our graphQL layer. Update SchemaFieldUtils if this changes.
-
+# Also see https://datahubproject.io/docs/what/urn/#restrictions
+RESERVED_CHARS = {",", "(", ")", "␟"}
 RESERVED_CHARS_EXTENDED = RESERVED_CHARS.union({"%"})


datahub/utilities/urns/structured_properties_urn.py
CHANGED

@@ -4,4 +4,4 @@ __all__ = ["StructuredPropertyUrn", "make_structured_property_urn"]


 def make_structured_property_urn(structured_property_id: str) -> str:
-    return str(StructuredPropertyUrn.
+    return str(StructuredPropertyUrn.from_string(structured_property_id))