acryl-datahub 1.2.0.3rc1__py3-none-any.whl → 1.2.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.2.0.3rc1.dist-info → acryl_datahub-1.2.0.4.dist-info}/METADATA +2609 -2608
- {acryl_datahub-1.2.0.3rc1.dist-info → acryl_datahub-1.2.0.4.dist-info}/RECORD +74 -73
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +3 -3
- datahub/api/entities/external/external_tag.py +6 -4
- datahub/api/entities/external/lake_formation_external_entites.py +50 -49
- datahub/api/entities/external/restricted_text.py +105 -180
- datahub/api/entities/external/unity_catalog_external_entites.py +51 -52
- datahub/api/entities/forms/forms.py +3 -3
- datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
- datahub/cli/quickstart_versioning.py +1 -1
- datahub/cli/specific/assertions_cli.py +37 -2
- datahub/cli/specific/datacontract_cli.py +54 -4
- datahub/emitter/rest_emitter.py +18 -5
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +1 -1
- datahub/ingestion/api/report.py +21 -2
- datahub/ingestion/api/source.py +81 -7
- datahub/ingestion/autogenerated/capability_summary.json +47 -19
- datahub/ingestion/graph/client.py +19 -3
- datahub/ingestion/sink/datahub_rest.py +2 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +9 -0
- datahub/ingestion/source/aws/glue.py +18 -2
- datahub/ingestion/source/aws/tag_entities.py +4 -4
- datahub/ingestion/source/data_lake_common/path_spec.py +6 -3
- datahub/ingestion/source/datahub/datahub_source.py +8 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +6 -3
- datahub/ingestion/source/dbt/dbt_common.py +10 -0
- datahub/ingestion/source/delta_lake/source.py +8 -1
- datahub/ingestion/source/dremio/dremio_source.py +19 -2
- datahub/ingestion/source/fivetran/fivetran.py +9 -3
- datahub/ingestion/source/fivetran/fivetran_log_api.py +4 -3
- datahub/ingestion/source/ge_data_profiler.py +8 -0
- datahub/ingestion/source/grafana/models.py +6 -0
- datahub/ingestion/source/hex/hex.py +1 -1
- datahub/ingestion/source/hex/query_fetcher.py +1 -1
- datahub/ingestion/source/iceberg/iceberg.py +4 -4
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/mock_data/datahub_mock_data.py +26 -10
- datahub/ingestion/source/powerbi/powerbi.py +4 -1
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/redshift.py +1 -0
- datahub/ingestion/source/salesforce.py +8 -0
- datahub/ingestion/source/slack/slack.py +7 -14
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +4 -4
- datahub/ingestion/source/sql/athena_properties_extractor.py +2 -2
- datahub/ingestion/source/sql/hive_metastore.py +8 -0
- datahub/ingestion/source/sql/teradata.py +8 -1
- datahub/ingestion/source/sql/trino.py +9 -0
- datahub/ingestion/source/tableau/tableau.py +1 -1
- datahub/ingestion/source/unity/config.py +36 -1
- datahub/ingestion/source/unity/proxy.py +332 -46
- datahub/ingestion/source/unity/proxy_types.py +12 -2
- datahub/ingestion/source/unity/source.py +91 -34
- datahub/ingestion/source/unity/tag_entities.py +5 -5
- datahub/ingestion/source/usage/starburst_trino_usage.py +2 -2
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/metadata/_internal_schema_classes.py +513 -513
- datahub/metadata/_urns/urn_defs.py +1684 -1684
- datahub/metadata/schema.avsc +16745 -16348
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/sdk/entity_client.py +22 -7
- datahub/sdk/search_client.py +3 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataset.py +37 -59
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/server_config_util.py +2 -1
- {acryl_datahub-1.2.0.3rc1.dist-info → acryl_datahub-1.2.0.4.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.2.0.3rc1.dist-info → acryl_datahub-1.2.0.4.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.2.0.3rc1.dist-info → acryl_datahub-1.2.0.4.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.2.0.3rc1.dist-info → acryl_datahub-1.2.0.4.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/abs/datalake_profiler_config.py

@@ -72,7 +72,7 @@ class DataLakeProfilerConfig(ConfigModel):
         description="Whether to profile for the sample values for all columns.",
     )

-    @pydantic.root_validator()
+    @pydantic.root_validator(skip_on_failure=True)
     def ensure_field_level_settings_are_normalized(
         cls: "DataLakeProfilerConfig", values: Dict[str, Any]
     ) -> Dict[str, Any]:
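Several files in this release swap @pydantic.root_validator() for @pydantic.root_validator(skip_on_failure=True). A minimal sketch of why that flag matters (the model below is illustrative, not taken from the package): without it, the root validator still runs after a field validator has failed and sees a partial values dict, and pydantic v2's compatibility shim rejects the bare form outright.

import pydantic


class ProfileConfigSketch(pydantic.BaseModel):
    row_limit: int
    include_field_sample_values: bool = True

    # skip_on_failure=True: the root validator only runs if all field
    # validators passed, so `values` contains every validated field.
    # pydantic v2's deprecated root_validator shim also requires this flag.
    @pydantic.root_validator(skip_on_failure=True)
    def normalize(cls, values):
        values["include_field_sample_values"] = bool(
            values.get("include_field_sample_values", True)
        )
        return values


print(ProfileConfigSketch(row_limit=10).include_field_sample_values)  # True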
datahub/ingestion/source/abs/source.py

@@ -44,6 +44,7 @@ from datahub.ingestion.source.azure.abs_utils import (
     get_key_prefix,
     strip_abs_prefix,
 )
+from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
 from datahub.ingestion.source.data_lake_common.data_lake_utils import (
     ContainerWUCreator,
     add_partition_columns_to_schema,

@@ -128,6 +129,14 @@ class TableData:
 @support_status(SupportStatus.INCUBATING)
 @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
 @capability(SourceCapability.TAGS, "Can extract ABS object/container tags if enabled")
+@capability(
+    SourceCapability.CONTAINERS,
+    "Extract ABS containers and folders",
+    subtype_modifier=[
+        SourceCapabilityModifier.FOLDER,
+        SourceCapabilityModifier.ABS_CONTAINER,
+    ],
+)
 class ABSSource(StatefulIngestionSourceBase):
     source_config: DataLakeSourceConfig
     report: DataLakeSourceReport
datahub/ingestion/source/aws/glue.py

@@ -395,7 +395,7 @@ class GlueSource(StatefulIngestionSourceBase):
             t = LakeFormationTag(
                 key=tag_key,
                 value=tag_value,
-
+                catalog=catalog_id,
             )
             tags.append(t)
         return tags

@@ -438,7 +438,7 @@ class GlueSource(StatefulIngestionSourceBase):
             t = LakeFormationTag(
                 key=tag_key,
                 value=tag_value,
-
+                catalog=catalog_id,
             )
             tags.append(t)
         return tags

@@ -522,6 +522,14 @@ class GlueSource(StatefulIngestionSourceBase):
         bucket = url.netloc
         key = url.path[1:]

+        # validate that we have a non-empty key
+        if not key:
+            self.report.num_job_script_location_invalid += 1
+            logger.warning(
+                f"Error parsing DAG for Glue job. The script {script_path} is not a valid S3 path for flow urn: {flow_urn}."
+            )
+            return None
+
         # download the script contents
         # see https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.get_object
         try:

@@ -533,6 +541,14 @@ class GlueSource(StatefulIngestionSourceBase):
             )
             self.report.num_job_script_failed_download += 1
             return None
+        except botocore.exceptions.ParamValidationError as e:
+            self.report_warning(
+                flow_urn,
+                f"Invalid S3 path for Glue job script {script_path}: {e}",
+            )
+            self.report.num_job_script_location_invalid += 1
+            return None
+
         script = obj["Body"].read().decode("utf-8")

         try:
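The two new guards above cover a Glue job whose script location points at a bucket with no object key. A small, self-contained illustration (the helper name is mine, not from glue.py) of how urlparse yields an empty key in that case:

from urllib.parse import urlparse


def split_s3_script_path(script_path: str):
    """Return (bucket, key), or None if the path has no object key."""
    url = urlparse(script_path)
    bucket, key = url.netloc, url.path[1:]
    if not key:  # e.g. "s3://my-bucket" or "s3://my-bucket/"
        return None
    return bucket, key


print(split_s3_script_path("s3://my-bucket"))                     # None
print(split_s3_script_path("s3://my-bucket/scripts/etl_job.py"))  # ('my-bucket', 'scripts/etl_job.py')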
datahub/ingestion/source/aws/tag_entities.py

@@ -37,7 +37,7 @@ class LakeFormationTagPlatformResourceId(BaseModel, ExternalEntityId):

     tag_key: str
     tag_value: Optional[str] = None
-    platform_instance: Optional[str]
+    platform_instance: Optional[str] = None
     catalog: Optional[str] = None
     exists_in_lake_formation: bool = False
     persisted: bool = False

@@ -88,8 +88,8 @@ class LakeFormationTagPlatformResourceId(BaseModel, ExternalEntityId):
            return existing_platform_resource

        return LakeFormationTagPlatformResourceId(
-            tag_key=tag.key,
-            tag_value=tag.value if tag.value is not None else None,
+            tag_key=str(tag.key),
+            tag_value=str(tag.value) if tag.value is not None else None,
            platform_instance=platform_instance,
            exists_in_lake_formation=exists_in_lake_formation,
            catalog=catalog,

@@ -227,7 +227,7 @@ class LakeFormationTagPlatformResource(BaseModel, ExternalEntity):
     datahub_urns: LinkedResourceSet
     managed_by_datahub: bool
     id: LakeFormationTagPlatformResourceId
-    allowed_values: Optional[List[str]]
+    allowed_values: Optional[List[str]] = None

     def get_id(self) -> ExternalEntityId:
         return self.id
datahub/ingestion/source/data_lake_common/path_spec.py

@@ -62,7 +62,6 @@ class SortKey(ConfigModel):

     date_format: Optional[str] = Field(
         default=None,
-        type=str,
         description="The date format to use when sorting. This is used to parse the date from the key. The format should follow the java [SimpleDateFormat](https://docs.oracle.com/javase/8/docs/api/java/text/SimpleDateFormat.html) format.",
     )

@@ -260,7 +259,7 @@ class PathSpec(ConfigModel):
     ) -> Union[None, parse.Result, parse.Match]:
         return self.compiled_folder_include.parse(path)

-    @pydantic.root_validator()
+    @pydantic.root_validator(skip_on_failure=True)
     def validate_no_double_stars(cls, values: Dict) -> Dict:
         if "include" not in values:
             return values

@@ -456,7 +455,11 @@ class PathSpec(ConfigModel):
                 partition = partition.rsplit("/", 1)[0]
                 for partition_key in partition.split("/"):
                     if partition_key.find("=") != -1:
-
+                        key_value = partition_key.split(
+                            "=", 1
+                        )  # Split into at most 2 parts
+                        if len(key_value) == 2:
+                            partition_keys.append((key_value[0], key_value[1]))
             else:
                 partition_split = partition.rsplit("/", 1)
                 if len(partition_split) == 1:
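The new split("=", 1) keeps partition values that themselves contain "=" intact. A standalone restatement of that parsing step (the function name is mine):

def parse_partition_keys(partition: str) -> list[tuple[str, str]]:
    partition_keys: list[tuple[str, str]] = []
    for partition_key in partition.split("/"):
        if "=" in partition_key:
            key_value = partition_key.split("=", 1)  # split on the first "=" only
            if len(key_value) == 2:
                partition_keys.append((key_value[0], key_value[1]))
    return partition_keys


print(parse_partition_keys("year=2024/month=05/token=abc=def"))
# [('year', '2024'), ('month', '05'), ('token', 'abc=def')]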
datahub/ingestion/source/datahub/datahub_source.py

@@ -19,6 +19,7 @@ from datahub.ingestion.api.source_helpers import (
     auto_workunit_reporter,
 )
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
 from datahub.ingestion.source.datahub.config import DataHubSourceConfig
 from datahub.ingestion.source.datahub.datahub_api_reader import DataHubApiReader
 from datahub.ingestion.source.datahub.datahub_database_reader import (

@@ -39,7 +40,13 @@ logger = logging.getLogger(__name__)
 @platform_name("DataHub")
 @config_class(DataHubSourceConfig)
 @support_status(SupportStatus.TESTING)
-@capability(
+@capability(
+    SourceCapability.CONTAINERS,
+    "Enabled by default",
+    subtype_modifier=[
+        SourceCapabilityModifier.DATABASE,
+    ],
+)
 class DataHubSource(StatefulIngestionSourceBase):
     platform: str = "datahub"
datahub/ingestion/source/dbt/dbt_cloud.py

@@ -370,9 +370,12 @@ class DBTCloudSource(DBTSourceBase, TestableSource):
         name = node["alias"]

         comment = node.get("comment", "")
-
-
-
+
+        # In dbt sources, there are two types of descriptions:
+        # - description: table-level description (specific to the source table)
+        # - sourceDescription: schema-level description (describes the overall source schema)
+        # The table-level description should take precedence since it's more specific.
+        description = node["description"] or node.get("sourceDescription", "")

         if node["resourceType"] == "model":
             materialization = node["materializedType"]
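Restating the precedence rule the new comment block describes as a tiny self-contained check (the helper below is illustrative, not a function in dbt_cloud.py):

def pick_source_description(node: dict) -> str:
    # Table-level `description` wins; schema-level `sourceDescription` is the fallback.
    return node["description"] or node.get("sourceDescription", "")


assert pick_source_description(
    {"description": "Orders fact table", "sourceDescription": "Raw ecommerce schema"}
) == "Orders fact table"
assert pick_source_description(
    {"description": "", "sourceDescription": "Raw ecommerce schema"}
) == "Raw ecommerce schema"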
datahub/ingestion/source/dbt/dbt_common.py

@@ -120,6 +120,7 @@ logger = logging.getLogger(__name__)
 DBT_PLATFORM = "dbt"

 _DEFAULT_ACTOR = mce_builder.make_user_urn("unknown")
+_DBT_MAX_COMPILED_CODE_LENGTH = 1 * 1024 * 1024  # 1MB


 @dataclass

@@ -1684,6 +1685,12 @@ class DBTSourceBase(StatefulIngestionSourceBase):
     def get_external_url(self, node: DBTNode) -> Optional[str]:
         pass

+    @staticmethod
+    def _truncate_code(code: str, max_length: int) -> str:
+        if len(code) > max_length:
+            return code[:max_length] + "..."
+        return code
+
     def _create_view_properties_aspect(
         self, node: DBTNode
     ) -> Optional[ViewPropertiesClass]:

@@ -1695,6 +1702,9 @@ class DBTSourceBase(StatefulIngestionSourceBase):
             compiled_code = try_format_query(
                 node.compiled_code, platform=self.config.target_platform
             )
+            compiled_code = self._truncate_code(
+                compiled_code, _DBT_MAX_COMPILED_CODE_LENGTH
+            )

         materialized = node.materialization in {"table", "incremental", "snapshot"}
         view_properties = ViewPropertiesClass(
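A standalone restatement of the new truncation behaviour, using the same 1 MiB cap the diff introduces (the constant and helper are copied here for illustration only):

_DBT_MAX_COMPILED_CODE_LENGTH = 1 * 1024 * 1024  # 1MB


def truncate_code(code: str, max_length: int = _DBT_MAX_COMPILED_CODE_LENGTH) -> str:
    # Compiled SQL longer than the cap is cut and suffixed with "..." so the
    # view-properties aspect stays a reasonable size.
    if len(code) > max_length:
        return code[:max_length] + "..."
    return code


short_sql = "select 1"
huge_sql = "x" * (2 * 1024 * 1024)
assert truncate_code(short_sql) == short_sql
assert len(truncate_code(huge_sql)) == _DBT_MAX_COMPILED_CODE_LENGTH + 3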
datahub/ingestion/source/delta_lake/source.py

@@ -29,6 +29,7 @@ from datahub.ingestion.source.aws.s3_util import (
     get_key_prefix,
     strip_s3_prefix,
 )
+from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
 from datahub.ingestion.source.data_lake_common.data_lake_utils import ContainerWUCreator
 from datahub.ingestion.source.delta_lake.config import DeltaLakeSourceConfig
 from datahub.ingestion.source.delta_lake.delta_lake_utils import (

@@ -85,7 +86,13 @@ OPERATION_STATEMENT_TYPES = {
 @config_class(DeltaLakeSourceConfig)
 @support_status(SupportStatus.INCUBATING)
 @capability(SourceCapability.TAGS, "Can extract S3 object/bucket tags if enabled")
-@capability(
+@capability(
+    SourceCapability.CONTAINERS,
+    "Enabled by default",
+    subtype_modifier=[
+        SourceCapabilityModifier.FOLDER,
+    ],
+)
 class DeltaLakeSource(StatefulIngestionSourceBase):
     """
     This plugin extracts:
datahub/ingestion/source/dremio/dremio_source.py

@@ -22,6 +22,7 @@ from datahub.ingestion.api.source import (
     SourceReport,
 )
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
 from datahub.ingestion.source.dremio.dremio_api import (
     DremioAPIOperations,
     DremioEdition,

@@ -86,11 +87,27 @@ class DremioSourceMapEntry:
 @platform_name("Dremio")
 @config_class(DremioSourceConfig)
 @support_status(SupportStatus.CERTIFIED)
-@capability(
+@capability(
+    SourceCapability.CONTAINERS,
+    "Enabled by default",
+)
 @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
 @capability(SourceCapability.DESCRIPTIONS, "Enabled by default")
 @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
-@capability(
+@capability(
+    SourceCapability.LINEAGE_COARSE,
+    "Enabled by default",
+    subtype_modifier=[
+        SourceCapabilityModifier.TABLE,
+    ],
+)
+@capability(
+    SourceCapability.LINEAGE_FINE,
+    "Extract column-level lineage",
+    subtype_modifier=[
+        SourceCapabilityModifier.TABLE,
+    ],
+)
 @capability(SourceCapability.OWNERSHIP, "Enabled by default")
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
 @capability(SourceCapability.USAGE_STATS, "Enabled by default to get usage stats")
datahub/ingestion/source/fivetran/fivetran.py

@@ -16,7 +16,11 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import
+from datahub.ingestion.api.source import (
+    MetadataWorkUnitProcessor,
+    SourceReport,
+    StructuredLogCategory,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.fivetran.config import (
     KNOWN_DATA_PLATFORM_MAPPING,

@@ -96,8 +100,10 @@ class FivetranSource(StatefulIngestionSourceBase):
            self.report.info(
                title="Guessing source platform for lineage",
                message="We encountered a connector type that we don't fully support yet. "
-                "We will attempt to guess the platform based on the connector type."
-
+                "We will attempt to guess the platform based on the connector type. "
+                "Note that we use connector_id as the key not connector_name which you may see in the UI of Fivetran. ",
+                context=f"connector_name: {connector.connector_name} (connector_id: {connector.connector_id}, connector_type: {connector.connector_type})",
+                log_category=StructuredLogCategory.LINEAGE,
            )
            source_details.platform = connector.connector_type
datahub/ingestion/source/fivetran/fivetran_log_api.py

@@ -69,9 +69,10 @@ class FivetranLogAPI:
            fivetran_log_query.set_schema(bigquery_destination_config.dataset)

            # The "database" should be the BigQuery project name.
-
-
-
+            result = engine.execute("SELECT @@project_id").fetchone()
+            if result is None:
+                raise ValueError("Failed to retrieve BigQuery project ID")
+            fivetran_log_database = result[0]
        else:
            raise ConfigurationError(
                f"Destination platform '{destination_platform}' is not yet supported."
datahub/ingestion/source/ge_data_profiler.py

@@ -216,6 +216,14 @@ def get_column_unique_count_dh_patch(self: SqlAlchemyDataset, column: str) -> in
                )
            ).scalar()
        )
+    elif self.engine.dialect.name.lower() == DATABRICKS:
+        return convert_to_json_serializable(
+            self.engine.execute(
+                sa.select(sa.func.approx_count_distinct(sa.column(column))).select_from(
+                    self._table
+                )
+            ).scalar()
+        )
    return convert_to_json_serializable(
        self.engine.execute(
            sa.select([sa.func.count(sa.func.distinct(sa.column(column)))]).select_from(
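The Databricks branch swaps an exact COUNT(DISTINCT ...) for approx_count_distinct, which is much cheaper on large Delta tables. A hedged sketch of the SQL that expression builds (requires SQLAlchemy 1.4+; the table and column names are made up):

import sqlalchemy as sa

user_id = sa.column("user_id")
events = sa.table("events", user_id)

stmt = sa.select(sa.func.approx_count_distinct(user_id)).select_from(events)
print(stmt)
# Approximately:
# SELECT approx_count_distinct(user_id) AS approx_count_distinct_1
# FROM events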
datahub/ingestion/source/grafana/models.py

@@ -12,6 +12,7 @@ from typing import Any, Dict, List, Optional

 from pydantic import BaseModel, Field

+from datahub.configuration.pydantic_migration_helpers import PYDANTIC_VERSION_2
 from datahub.emitter.mcp_builder import ContainerKey

 # Grafana-specific type definitions for better type safety

@@ -106,6 +107,11 @@ class Folder(BaseModel):
     title: str
     description: Optional[str] = ""

+    if PYDANTIC_VERSION_2:
+        from pydantic import ConfigDict
+
+        model_config = ConfigDict(coerce_numbers_to_str=True)  # type: ignore
+

 class FolderKey(ContainerKey):
     """Key for identifying a Grafana folder."""
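On pydantic v2, coerce_numbers_to_str lets string-typed fields accept numeric JSON values instead of failing validation, which helps when an API returns ids as numbers. A small illustration (the model and field names are mine, and this only runs on pydantic v2):

from pydantic import BaseModel, ConfigDict


class FolderSketch(BaseModel):
    model_config = ConfigDict(coerce_numbers_to_str=True)

    uid: str
    title: str


folder = FolderSketch(uid=42, title="Ops dashboards")
print(repr(folder.uid))  # '42' -- the integer is coerced to a string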
datahub/ingestion/source/hex/hex.py

@@ -69,7 +69,7 @@ class HexSourceConfig(
     )
     include_components: bool = Field(
         default=True,
-
+        description="Include Hex Components in the ingestion",
     )
     page_size: int = Field(
         default=HEX_API_PAGE_SIZE_DEFAULT,
datahub/ingestion/source/hex/query_fetcher.py

@@ -97,7 +97,7 @@ class HexQueryFetcher:
        if not query_urns or not entities_by_urn:
            self.report.warning(
                title="No Queries found with Hex as origin",
-                message="No lineage because of no Queries found with Hex as origin in the given time range
+                message="No lineage because of no Queries found with Hex as origin in the given time range. You may need to set use_queries_v2: true on your warehouse ingestion or you may consider extending the time range to fetch more queries.",
                context=str(
                    dict(
                        workspace_name=self.workspace_name,
datahub/ingestion/source/iceberg/iceberg.py

@@ -524,11 +524,11 @@ class IcebergSource(StatefulIngestionSourceBase):
         custom_properties["format-version"] = str(table.metadata.format_version)
         custom_properties["partition-spec"] = str(self._get_partition_aspect(table))
         last_modified: Optional[int] = table.metadata.last_updated_ms
-        if table.current_snapshot():
-            custom_properties["snapshot-id"] = str(
-            custom_properties["manifest-list"] =
+        if current_snapshot := table.current_snapshot():
+            custom_properties["snapshot-id"] = str(current_snapshot.snapshot_id)
+            custom_properties["manifest-list"] = current_snapshot.manifest_list
             if not last_modified:
-                last_modified = int(
+                last_modified = int(current_snapshot.timestamp_ms)
         if "created-at" in custom_properties:
             try:
                 dt = dateutil_parser.isoparse(custom_properties["created-at"])
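The walrus form binds the snapshot once and reuses it for the snapshot id, the manifest list, and the last-modified fallback, instead of re-querying the table. A minimal sketch with a stand-in table object (the names below are illustrative, not the pyiceberg API):

from types import SimpleNamespace
from typing import Optional


def snapshot_properties(table) -> dict:
    custom_properties: dict = {}
    last_modified: Optional[int] = None
    if current_snapshot := table.current_snapshot():
        custom_properties["snapshot-id"] = str(current_snapshot.snapshot_id)
        custom_properties["manifest-list"] = current_snapshot.manifest_list
        if not last_modified:
            last_modified = int(current_snapshot.timestamp_ms)
    custom_properties["last-modified"] = str(last_modified)
    return custom_properties


fake_table = SimpleNamespace(
    current_snapshot=lambda: SimpleNamespace(
        snapshot_id=42,
        manifest_list="s3://warehouse/db/tbl/metadata/snap-42.avro",
        timestamp_ms=1700000000000,
    )
)
print(snapshot_properties(fake_table))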
datahub/ingestion/source/looker/looker_liquid_tag.py

@@ -1,5 +1,5 @@
 from functools import lru_cache
-from typing import ClassVar, Optional, TextIO
+from typing import ClassVar, Optional, TextIO, Type

 from liquid import Environment
 from liquid.ast import Node

@@ -20,16 +20,27 @@ class CustomTagException(Exception):
 class ConditionNode(Node):
     def __init__(self, tok: Token, sql_or_lookml_reference: str, filter_name: str):
         self.tok = tok
-
         self.sql_or_lookml_reference = sql_or_lookml_reference
-
         self.filter_name = filter_name

     def render_to_output(self, context: Context, buffer: TextIO) -> Optional[bool]:
         # This implementation will make sure that sql parse work correctly if looker condition tag
         # is used in lookml sql field
         buffer.write(f"{self.sql_or_lookml_reference}='dummy_value'")
+        return True

+
+class IncrementConditionNode(Node):
+    def __init__(self, tok: Token, sql_or_lookml_reference: str):
+        self.tok = tok
+        self.sql_or_lookml_reference = sql_or_lookml_reference
+
+    def render_to_output(self, context: Context, buffer: TextIO) -> Optional[bool]:
+        # For incrementcondition, we need to generate a condition that would be used
+        # in incremental PDT updates. This typically involves date/time comparisons.
+        # We'll render it as a date comparison with a placeholder value
+        # See details in Looker documentation for incrementcondition tag -> cloud.google.com/looker/docs/reference/param-view-increment-key
+        buffer.write(f"{self.sql_or_lookml_reference} > '2023-01-01'")
         return True


@@ -44,7 +55,6 @@ class ConditionTag(Tag):
     This class render the below tag as order.region='ap-south-1' if order_region is provided in config.liquid_variables
     as order_region: 'ap-south-1'
     {% condition order_region %} order.region {% endcondition %}
-
     """

     TAG_START: ClassVar[str] = "condition"

@@ -79,7 +89,48 @@ class ConditionTag(Tag):
         )


-
+class IncrementConditionTag(Tag):
+    """
+    IncrementConditionTag is the equivalent implementation of looker's custom liquid tag "incrementcondition".
+    Refer doc: https://cloud.google.com/looker/docs/incremental-pdts#using_the_incrementcondition_tag
+
+    This tag is used for incremental PDTs to determine which records should be updated.
+    It typically works with date/time fields to filter data that has changed since the last update.
+
+    Example usage in Looker:
+        {% incrementcondition created_at %} order.created_at {% endincrementcondition %}
+
+    This would generate SQL like: order.created_at > '2023-01-01 00:00:00'
+    """
+
+    TAG_START: ClassVar[str] = "incrementcondition"
+    TAG_END: ClassVar[str] = "endincrementcondition"
+    name: str = "incrementcondition"
+
+    def __init__(self, env: Environment):
+        super().__init__(env)
+        self.parser = get_parser(self.env)
+
+    def parse(self, stream: TokenStream) -> Node:
+        expect(stream, TOKEN_TAG, value=IncrementConditionTag.TAG_START)
+
+        start_token = stream.current
+
+        stream.next_token()
+        expect(stream, TOKEN_LITERAL)
+        sql_or_lookml_reference: str = stream.current.value.strip()
+
+        stream.next_token()
+        expect(stream, TOKEN_TAG, value=IncrementConditionTag.TAG_END)
+
+        return IncrementConditionNode(
+            tok=start_token,
+            sql_or_lookml_reference=sql_or_lookml_reference,
+        )
+
+
+# Updated custom_tags list to include both tags
+custom_tags: list[Type[Tag]] = [ConditionTag, IncrementConditionTag]


 @string_filter
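For a quick feel of what the new tag does to LookML SQL before it reaches the SQL parser, here is a rough regex-based simulation of the rendering. It does not use the python-liquid runtime that looker_liquid_tag.py builds on; the placeholder date mirrors the one IncrementConditionNode writes.

import re

INCREMENT_PATTERN = re.compile(
    r"\{%\s*incrementcondition\s+\w+\s*%\}(.*?)\{%\s*endincrementcondition\s*%\}",
    re.DOTALL,
)


def simulate_increment_condition(sql: str) -> str:
    # Replace the tag block with "<referenced field> > '2023-01-01'", which is
    # what the new node emits so downstream SQL parsing keeps working.
    return INCREMENT_PATTERN.sub(lambda m: f"{m.group(1).strip()} > '2023-01-01'", sql)


print(simulate_increment_condition(
    "select * from orders where "
    "{% incrementcondition created_at %} order.created_at {% endincrementcondition %}"
))
# select * from orders where order.created_at > '2023-01-01'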
datahub/ingestion/source/mock_data/datahub_mock_data.py

@@ -13,7 +13,7 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import Source, SourceReport
+from datahub.ingestion.api.source import Source, SourceReport, StructuredLogCategory
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.subtypes import DatasetSubTypes
 from datahub.ingestion.source.mock_data.datahub_mock_data_report import (

@@ -35,6 +35,8 @@ from datahub.utilities.str_enum import StrEnum

 logger = logging.getLogger(__name__)

+PLATFORM_NAME = "fake"
+

 class SubTypePattern(StrEnum):
     ALTERNATING = "alternating"

@@ -137,6 +139,10 @@ class DataHubMockDataConfig(ConfigModel):
         default=0,
         description="Number of warnings to add in report for testing",
     )
+    num_info: int = Field(
+        default=0,
+        description="Number of info to add in report for testing",
+    )

     gen_1: LineageConfigGen1 = Field(
         default_factory=LineageConfigGen1,

@@ -144,7 +150,7 @@
     )


-@platform_name(
+@platform_name(PLATFORM_NAME)
 @config_class(DataHubMockDataConfig)
 @support_status(SupportStatus.TESTING)
 class DataHubMockDataSource(Source):

@@ -159,6 +165,9 @@ class DataHubMockDataSource(Source):
         self.report = DataHubMockDataReport()

     def get_workunits(self) -> Iterable[MetadataWorkUnit]:
+        # We don't want any implicit aspects to be produced
+        # so we are not using get_workunits_internal
+
         if self.config.throw_uncaught_exceptions:
             raise Exception("This is a test exception")

@@ -176,10 +185,17 @@ class DataHubMockDataSource(Source):
                    message="This is test warning",
                    title="Test Warning",
                    context=f"This is test warning {i}",
+                    log_category=StructuredLogCategory.LINEAGE,
+                )
+
+        if self.config.num_info > 0:
+            for i in range(self.config.num_info):
+                self.report.info(
+                    message="This is test info",
+                    title="Test Info",
+                    context=f"This is test info {i}",
                )

-        # We don't want any implicit aspects to be produced
-        # so we are not using get_workunits_internal
        if self.config.gen_1.enabled:
            for wu in self._data_gen_1():
                if self.report.first_urn_seen is None:

@@ -309,7 +325,7 @@
                table_level, table_index, subtype_pattern, subtype_types, level_subtypes
            )

-            urn = make_dataset_urn(platform=
+            urn = make_dataset_urn(platform=PLATFORM_NAME, name=table_name)
            mcp = MetadataChangeProposalWrapper(
                entityUrn=urn,
                entityType="dataset",

@@ -433,7 +449,7 @@

     def _get_status_aspect(self, table: str) -> MetadataWorkUnit:
         urn = make_dataset_urn(
-            platform=
+            platform=PLATFORM_NAME,
             name=table,
         )
         mcp = MetadataChangeProposalWrapper(

@@ -448,7 +464,7 @@
     ) -> MetadataWorkUnit:
         mcp = MetadataChangeProposalWrapper(
             entityUrn=make_dataset_urn(
-                platform=
+                platform=PLATFORM_NAME,
                 name=downstream_table,
             ),
             entityType="dataset",

@@ -456,7 +472,7 @@
             upstreams=[
                 UpstreamClass(
                     dataset=make_dataset_urn(
-                        platform=
+                        platform=PLATFORM_NAME,
                         name=upstream_table,
                     ),
                     type=DatasetLineageTypeClass.TRANSFORMED,

@@ -468,7 +484,7 @@

     def _get_profile_aspect(self, table: str) -> MetadataWorkUnit:
         urn = make_dataset_urn(
-            platform=
+            platform=PLATFORM_NAME,
             name=table,
         )
         mcp = MetadataChangeProposalWrapper(

@@ -485,7 +501,7 @@

     def _get_usage_aspect(self, table: str) -> MetadataWorkUnit:
         urn = make_dataset_urn(
-            platform=
+            platform=PLATFORM_NAME,
             name=table,
         )
         mcp = MetadataChangeProposalWrapper(
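All of the mock-data URNs above are now built from the single PLATFORM_NAME constant ("fake"). Assuming the usual mce_builder helper (the import path and table name here are my assumptions for illustration), the resulting URNs look like this:

from datahub.emitter.mce_builder import make_dataset_urn

PLATFORM_NAME = "fake"

print(make_dataset_urn(platform=PLATFORM_NAME, name="table_1"))
# urn:li:dataset:(urn:li:dataPlatform:fake,table_1,PROD)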
datahub/ingestion/source/powerbi/powerbi.py

@@ -1226,7 +1226,10 @@ class Mapper:
 @platform_name("PowerBI")
 @config_class(PowerBiDashboardSourceConfig)
 @support_status(SupportStatus.CERTIFIED)
-@capability(
+@capability(
+    SourceCapability.CONTAINERS,
+    "Enabled by default",
+)
 @capability(SourceCapability.DESCRIPTIONS, "Enabled by default")
 @capability(SourceCapability.OWNERSHIP, "Enabled by default")
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
datahub/ingestion/source/powerbi_report_server/report_server_domain.py

@@ -27,10 +27,8 @@ class CatalogItem(BaseModel):
     is_favorite: bool = Field(alias="IsFavorite")
     user_info: Any = Field(None, alias="UserInfo")
     display_name: Optional[str] = Field(None, alias="DisplayName")
-    has_data_sources: bool = Field(
-    data_sources: Optional[List["DataSource"]] = Field(
-        default_factory=list, alias="DataSources"
-    )
+    has_data_sources: bool = Field(False, alias="HasDataSources")
+    data_sources: Optional[List["DataSource"]] = Field(None, alias="DataSources")

     @validator("display_name", always=True)
     def validate_diplay_name(cls, value, values):
datahub/ingestion/source/redshift/datashares.py

@@ -26,7 +26,7 @@ from datahub.utilities.search_utils import LogicalOperator

 class OutboundSharePlatformResource(BaseModel):
     namespace: str
-    platform_instance: Optional[str]
+    platform_instance: Optional[str] = None
     env: str
     source_database: str
     share_name: str
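Several of the model tweaks in this release (here and in the Lake Formation entities) add an explicit "= None" to Optional fields. On pydantic v2 an Optional annotation no longer implies a default, so without it these fields would become required. A tiny sketch of the behaviour being preserved (the model below is illustrative):

from typing import Optional

from pydantic import BaseModel


class ShareSketch(BaseModel):
    namespace: str
    # Explicit default keeps the field optional on pydantic v2;
    # pydantic v1 treated Optional[...] as defaulting to None automatically.
    platform_instance: Optional[str] = None


print(ShareSketch(namespace="demo").platform_instance)  # None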
datahub/ingestion/source/redshift/redshift.py

@@ -132,6 +132,7 @@ logger: logging.Logger = logging.getLogger(__name__)
     "Enabled by default",
     subtype_modifier=[
         SourceCapabilityModifier.DATABASE,
+        SourceCapabilityModifier.SCHEMA,
     ],
 )
 @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")