acryl-datahub 1.2.0.9rc1__py3-none-any.whl → 1.2.0.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/METADATA +2568 -2626
- {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/RECORD +120 -113
- {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/entry_points.txt +2 -0
- datahub/_version.py +1 -1
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/dataproduct/dataproduct.py +6 -3
- datahub/api/entities/dataset/dataset.py +9 -18
- datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
- datahub/api/graphql/operation.py +10 -6
- datahub/cli/docker_check.py +2 -2
- datahub/configuration/common.py +29 -1
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/pydantic_migration_helpers.py +0 -9
- datahub/configuration/source_common.py +3 -2
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +5 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/ingestion/autogenerated/capability_summary.json +45 -1
- datahub/ingestion/run/pipeline_config.py +2 -2
- datahub/ingestion/source/azure/azure_common.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +28 -14
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +11 -0
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +4 -5
- datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
- datahub/ingestion/source/data_lake_common/path_spec.py +16 -16
- datahub/ingestion/source/datahub/config.py +8 -9
- datahub/ingestion/source/dbt/dbt_common.py +65 -5
- datahub/ingestion/source/delta_lake/config.py +1 -1
- datahub/ingestion/source/dremio/dremio_config.py +3 -4
- datahub/ingestion/source/feast.py +8 -10
- datahub/ingestion/source/fivetran/config.py +1 -1
- datahub/ingestion/source/gcs/gcs_source.py +19 -2
- datahub/ingestion/source/ge_data_profiler.py +15 -2
- datahub/ingestion/source/ge_profiling_config.py +26 -22
- datahub/ingestion/source/grafana/grafana_config.py +2 -2
- datahub/ingestion/source/grafana/models.py +12 -14
- datahub/ingestion/source/hex/hex.py +6 -1
- datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/looker/looker_common.py +76 -75
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_source.py +493 -547
- datahub/ingestion/source/looker/lookml_config.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +46 -88
- datahub/ingestion/source/metabase.py +9 -2
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +1 -1
- datahub/ingestion/source/mode.py +13 -5
- datahub/ingestion/source/nifi.py +1 -1
- datahub/ingestion/source/powerbi/config.py +14 -21
- datahub/ingestion/source/preset.py +1 -1
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +6 -3
- datahub/ingestion/source/redshift/query.py +23 -19
- datahub/ingestion/source/s3/source.py +26 -24
- datahub/ingestion/source/salesforce.py +13 -9
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +12 -15
- datahub/ingestion/source/snowflake/snowflake_connection.py +8 -3
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +15 -2
- datahub/ingestion/source/snowflake/snowflake_queries.py +4 -5
- datahub/ingestion/source/sql/athena.py +2 -1
- datahub/ingestion/source/sql/clickhouse.py +12 -7
- datahub/ingestion/source/sql/cockroachdb.py +5 -3
- datahub/ingestion/source/sql/druid.py +2 -2
- datahub/ingestion/source/sql/hive.py +4 -3
- datahub/ingestion/source/sql/hive_metastore.py +7 -9
- datahub/ingestion/source/sql/mssql/source.py +2 -2
- datahub/ingestion/source/sql/mysql.py +2 -2
- datahub/ingestion/source/sql/oracle.py +3 -3
- datahub/ingestion/source/sql/presto.py +2 -1
- datahub/ingestion/source/sql/teradata.py +4 -4
- datahub/ingestion/source/sql/trino.py +2 -1
- datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
- datahub/ingestion/source/sql/vertica.py +1 -1
- datahub/ingestion/source/sql_queries.py +6 -6
- datahub/ingestion/source/state/checkpoint.py +5 -1
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/stateful_ingestion_base.py +5 -8
- datahub/ingestion/source/superset.py +122 -15
- datahub/ingestion/source/tableau/tableau.py +68 -14
- datahub/ingestion/source/tableau/tableau_common.py +5 -0
- datahub/ingestion/source/tableau/tableau_constant.py +1 -0
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/unity/config.py +7 -3
- datahub/ingestion/source/usage/usage_common.py +3 -3
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/metadata/_internal_schema_classes.py +728 -528
- datahub/metadata/_urns/urn_defs.py +1702 -1702
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
- datahub/metadata/schema.avsc +17434 -17732
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +72 -0
- datahub/metadata/schemas/InstitutionalMemory.avsc +22 -0
- datahub/metadata/schemas/LogicalParent.avsc +2 -1
- datahub/metadata/schemas/MLModelGroupKey.avsc +2 -1
- datahub/metadata/schemas/MetadataChangeEvent.avsc +22 -0
- datahub/sdk/_shared.py +126 -0
- datahub/sdk/chart.py +87 -30
- datahub/sdk/dashboard.py +79 -34
- datahub/sdk/entity_client.py +11 -4
- datahub/sdk/lineage_client.py +3 -3
- datahub/sdk/search_filters.py +1 -7
- datahub/sql_parsing/split_statements.py +13 -0
- {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/top_level.txt +0 -0
--- a/datahub/ingestion/source/ge_data_profiler.py
+++ b/datahub/ingestion/source/ge_data_profiler.py
@@ -307,7 +307,6 @@ def _is_single_row_query_method(query: Any) -> bool:
     "get_column_max",
     "get_column_mean",
     "get_column_stdev",
-    "get_column_nonnull_count",
     "get_column_unique_count",
 }
 CONSTANT_ROW_QUERY_METHODS = {
@@ -331,6 +330,7 @@ def _is_single_row_query_method(query: Any) -> bool:
 
 FIRST_PARTY_SINGLE_ROW_QUERY_METHODS = {
     "get_column_unique_count_dh_patch",
+    "_get_column_cardinality",
 }
 
 # We'll do this the inefficient way since the arrays are pretty small.
@@ -497,7 +497,20 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
         self, column_spec: _SingleColumnSpec, column: str
     ) -> None:
         try:
-            nonnull_count = self.dataset.get_column_nonnull_count(column)
+            # Don't use Great Expectations get_column_nonnull_count because it
+            # generates this SQL:
+            #
+            # sum(CASE WHEN (mycolumn IN (NULL) OR mycolumn IS NULL) THEN 1 ELSE 0 END)
+            #
+            # which fails for complex types (such as Databricks maps) that don't
+            # support the IN operator.
+            nonnull_count = convert_to_json_serializable(
+                self.dataset.engine.execute(
+                    sa.select(sa.func.count(sa.column(column))).select_from(
+                        self.dataset._table
+                    )
+                ).scalar()
+            )
             column_spec.nonnull_count = nonnull_count
         except Exception as e:
             logger.debug(
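The replacement above works because SQL's COUNT(column) already skips NULLs, so no CASE/IN expression is needed at all. A minimal standalone sketch of that idea with SQLAlchemy Core (not DataHub code; the table and column names are made up):

```python
# Standalone sketch: COUNT(col) counts only non-NULL values, which is why a
# plain aggregate avoids the CASE WHEN ... IN (NULL) expression that Great
# Expectations emits and that complex column types reject.
import sqlalchemy as sa

engine = sa.create_engine("sqlite:///:memory:")
metadata = sa.MetaData()
events = sa.Table(
    "events",
    metadata,
    sa.Column("id", sa.Integer, primary_key=True),
    sa.Column("payload", sa.String, nullable=True),
)
metadata.create_all(engine)

with engine.begin() as conn:
    conn.execute(
        sa.insert(events),
        [{"payload": "a"}, {"payload": None}, {"payload": "c"}],
    )
    nonnull_count = conn.execute(
        sa.select(sa.func.count(events.c.payload))
    ).scalar()

print(nonnull_count)  # 2 -- the NULL row is not counted
```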
--- a/datahub/ingestion/source/ge_profiling_config.py
+++ b/datahub/ingestion/source/ge_profiling_config.py
@@ -1,12 +1,12 @@
 import datetime
 import logging
 import os
-from typing import Any, Dict, List, Optional
+from typing import Annotated, Any, Dict, List, Optional
 
 import pydantic
 from pydantic.fields import Field
 
-from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.common import AllowDenyPattern, ConfigModel, SupportedSources
 from datahub.ingestion.source_config.operation_config import OperationConfig
 
 _PROFILING_FLAGS_TO_REPORT = {
@@ -120,37 +120,40 @@ class GEProfilingConfig(GEProfilingBaseConfig):
         "number of columns to profile goes up.",
     )
 
-    profile_if_updated_since_days: Optional[pydantic.PositiveFloat] = Field(
+    profile_if_updated_since_days: Annotated[
+        Optional[pydantic.PositiveFloat], SupportedSources(["snowflake", "bigquery"])
+    ] = Field(
         default=None,
         description="Profile table only if it has been updated since these many number of days. "
         "If set to `null`, no constraint of last modified time for tables to profile. "
        "Supported only in `snowflake` and `BigQuery`.",
-        schema_extra={"supported_sources": ["snowflake", "bigquery"]},
     )
 
-    profile_table_size_limit: Optional[int] = Field(
+    profile_table_size_limit: Annotated[
+        Optional[int],
+        SupportedSources(["snowflake", "bigquery", "unity-catalog", "oracle"]),
+    ] = Field(
         default=5,
         description="Profile tables only if their size is less than specified GBs. If set to `null`, "
         "no limit on the size of tables to profile. Supported only in `Snowflake`, `BigQuery` and "
         "`Databricks`. Supported for `Oracle` based on calculated size from gathered stats.",
-        schema_extra={
-            "supported_sources": ["snowflake", "bigquery", "unity-catalog", "oracle"]
-        },
     )
 
-    profile_table_row_limit: Optional[int] = Field(
+    profile_table_row_limit: Annotated[
+        Optional[int], SupportedSources(["snowflake", "bigquery", "oracle"])
+    ] = Field(
         default=5000000,
         description="Profile tables only if their row count is less than specified count. "
         "If set to `null`, no limit on the row count of tables to profile. Supported only in "
         "`Snowflake`, `BigQuery`. Supported for `Oracle` based on gathered stats.",
-        schema_extra={"supported_sources": ["snowflake", "bigquery", "oracle"]},
    )
 
-    profile_table_row_count_estimate_only: bool = Field(
+    profile_table_row_count_estimate_only: Annotated[
+        bool, SupportedSources(["postgres", "mysql"])
+    ] = Field(
         default=False,
         description="Use an approximate query for row count. This will be much faster but slightly "
         "less accurate. Only supported for Postgres and MySQL. ",
-        schema_extra={"supported_sources": ["postgres", "mysql"]},
     )
 
     # The query combiner enables us to combine multiple queries into a single query,
@@ -163,36 +166,37 @@ class GEProfilingConfig(GEProfilingBaseConfig):
     # Hidden option - used for debugging purposes.
     catch_exceptions: bool = Field(default=True, description="")
 
-    partition_profiling_enabled: bool = Field(
+    partition_profiling_enabled: Annotated[
+        bool, SupportedSources(["athena", "bigquery"])
+    ] = Field(
         default=True,
         description="Whether to profile partitioned tables. Only BigQuery and Aws Athena supports this. "
         "If enabled, latest partition data is used for profiling.",
-        schema_extra={"supported_sources": ["athena", "bigquery"]},
     )
-    partition_datetime: Optional[datetime.datetime] = Field(
+    partition_datetime: Annotated[
+        Optional[datetime.datetime], SupportedSources(["bigquery"])
+    ] = Field(
         default=None,
         description="If specified, profile only the partition which matches this datetime. "
         "If not specified, profile the latest partition. Only Bigquery supports this.",
-        schema_extra={"supported_sources": ["bigquery"]},
     )
-    use_sampling: bool = Field(
+    use_sampling: Annotated[bool, SupportedSources(["bigquery", "snowflake"])] = Field(
         default=True,
         description="Whether to profile column level stats on sample of table. Only BigQuery and Snowflake support this. "
         "If enabled, profiling is done on rows sampled from table. Sampling is not done for smaller tables. ",
-        schema_extra={"supported_sources": ["bigquery", "snowflake"]},
     )
 
-    sample_size: int = Field(
+    sample_size: Annotated[int, SupportedSources(["bigquery", "snowflake"])] = Field(
         default=10000,
         description="Number of rows to be sampled from table for column level profiling."
         "Applicable only if `use_sampling` is set to True.",
-        schema_extra={"supported_sources": ["bigquery", "snowflake"]},
     )
 
-    profile_external_tables: bool = Field(
+    profile_external_tables: Annotated[
+        bool, SupportedSources(["redshift", "snowflake"])
+    ] = Field(
         default=False,
         description="Whether to profile external tables. Only Snowflake and Redshift supports this.",
-        schema_extra={"supported_sources": ["redshift", "snowflake"]},
     )
 
     tags_to_ignore_sampling: Optional[List[str]] = pydantic.Field(
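The change above moves the per-field "supported sources" hint out of `schema_extra` and into `typing.Annotated` metadata. A hedged sketch of how that kind of metadata can be declared and read back (the `SupportedSources` class below is a stand-in dataclass for illustration; the real one lives in `datahub.configuration.common` and may differ):

```python
# Illustrative only: re-declare a SupportedSources marker to show how Annotated
# metadata travels with a pydantic v2 field and can be read back, e.g. by
# documentation tooling.
from dataclasses import dataclass
from typing import Annotated, List, Optional

from pydantic import BaseModel, Field


@dataclass(frozen=True)
class SupportedSources:  # stand-in, not the DataHub class
    sources: List[str]


class ProfilingConfigSketch(BaseModel):
    profile_table_row_limit: Annotated[
        Optional[int], SupportedSources(["snowflake", "bigquery", "oracle"])
    ] = Field(default=5_000_000, description="Row-count cutoff for profiling.")


for name, field in ProfilingConfigSketch.model_fields.items():
    markers = [m for m in field.metadata if isinstance(m, SupportedSources)]
    if markers:
        print(name, "->", markers[0].sources)
# profile_table_row_limit -> ['snowflake', 'bigquery', 'oracle']
```

Attaching the hint to the field's type this way keeps it available to tooling without relying on extra keyword arguments to `Field()`.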
--- a/datahub/ingestion/source/grafana/grafana_config.py
+++ b/datahub/ingestion/source/grafana/grafana_config.py
@@ -2,7 +2,7 @@ from typing import Dict, Optional
 
 from pydantic import Field, SecretStr, validator
 
-from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
 from datahub.configuration.source_common import (
     DatasetLineageProviderConfigBase,
     EnvConfigMixin,
@@ -37,7 +37,7 @@ class GrafanaSourceConfig(
 ):
     """Configuration for Grafana source"""
 
-    platform: str = Field(default="grafana", hidden_from_docs=True)
+    platform: HiddenFromDocs[str] = Field(default="grafana")
     url: str = Field(
         description="Grafana URL in the format http://your-grafana-instance with no trailing slash"
     )
--- a/datahub/ingestion/source/grafana/models.py
+++ b/datahub/ingestion/source/grafana/models.py
@@ -10,9 +10,8 @@ References:
 
 from typing import Any, Dict, List, Optional
 
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, ConfigDict, Field
 
-from datahub.configuration.pydantic_migration_helpers import PYDANTIC_VERSION_2
 from datahub.emitter.mcp_builder import ContainerKey
 
 # Grafana-specific type definitions for better type safety
@@ -25,7 +24,11 @@ GrafanaFieldConfig = Dict[
 GrafanaTransformation = Dict[str, Any]  # Transformations: id, options
 
 
-class DatasourceRef(BaseModel):
+class _GrafanaBaseModel(BaseModel):
+    model_config = ConfigDict(coerce_numbers_to_str=True)
+
+
+class DatasourceRef(_GrafanaBaseModel):
     """Reference to a Grafana datasource."""
 
     type: Optional[str] = None  # Datasource type (prometheus, mysql, postgres, etc.)
@@ -33,13 +36,13 @@ class DatasourceRef(BaseModel):
     name: Optional[str] = None  # Datasource display name
 
 
-class Panel(BaseModel):
+class Panel(_GrafanaBaseModel):
     """Represents a Grafana dashboard panel."""
 
     id: str
     title: str
     description: str = ""
-    type: Optional[str]
+    type: Optional[str] = None
     # Query targets - each contains refId (A,B,C...), query/expr, datasource ref, etc.
     query_targets: List[GrafanaQueryTarget] = Field(
         default_factory=list, alias="targets"
@@ -52,16 +55,16 @@ class Panel(BaseModel):
     transformations: List[GrafanaTransformation] = Field(default_factory=list)
 
 
-class Dashboard(BaseModel):
+class Dashboard(_GrafanaBaseModel):
     """Represents a Grafana dashboard."""
 
     uid: str
     title: str
     description: str = ""
-    version: Optional[str]
+    version: Optional[str] = None
     panels: List[Panel]
     tags: List[str]
-    timezone: Optional[str]
+    timezone: Optional[str] = None
     refresh: Optional[str] = None
     schema_version: Optional[str] = Field(default=None, alias="schemaVersion")
     folder_id: Optional[str] = Field(default=None, alias="meta.folderId")
@@ -100,18 +103,13 @@ class Dashboard(BaseModel):
         return super().parse_obj(dashboard_dict)
 
 
-class Folder(BaseModel):
+class Folder(_GrafanaBaseModel):
     """Represents a Grafana folder."""
 
     id: str
     title: str
     description: Optional[str] = ""
 
-    if PYDANTIC_VERSION_2:
-        from pydantic import ConfigDict
-
-        model_config = ConfigDict(coerce_numbers_to_str=True)  # type: ignore
-
 
 class FolderKey(ContainerKey):
     """Key for identifying a Grafana folder."""
--- a/datahub/ingestion/source/hex/hex.py
+++ b/datahub/ingestion/source/hex/hex.py
@@ -1,3 +1,4 @@
+from copy import deepcopy
 from dataclasses import dataclass
 from datetime import datetime, timedelta, timezone
 from typing import Any, Dict, Iterable, List, Optional
@@ -122,7 +123,11 @@ class HexSourceConfig(
 
     @root_validator(pre=True)
     def validate_lineage_times(cls, data: Dict[str, Any]) -> Dict[str, Any]:
-        #
+        # In-place update of the input dict would cause state contamination. This was discovered through test failures
+        # in test_hex.py where the same dict is reused.
+        # So a deepcopy is performed first.
+        data = deepcopy(data)
+
         if "lineage_end_time" not in data or data["lineage_end_time"] is None:
             data["lineage_end_time"] = datetime.now(tz=timezone.utc)
         # if string is given, parse it
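The `deepcopy` matters because a `pre=True` root validator receives the caller's raw input, so mutating it in place leaks the injected defaults back into any dict that gets reused. A standalone sketch of the same pitfall, recast with pydantic v2's `model_validator` (model and field names are illustrative):

```python
# Without the deepcopy, the "before" validator would mutate the caller's dict,
# so a config dict reused across instantiations silently keeps the injected
# lineage_end_time default.
from copy import deepcopy
from datetime import datetime, timezone
from typing import Any

from pydantic import BaseModel, model_validator


class LineageWindowSketch(BaseModel):
    lineage_end_time: datetime

    @model_validator(mode="before")
    @classmethod
    def default_end_time(cls, data: Any) -> Any:
        if isinstance(data, dict):
            data = deepcopy(data)  # work on a copy, leave the input untouched
            if data.get("lineage_end_time") is None:
                data["lineage_end_time"] = datetime.now(tz=timezone.utc)
        return data


raw = {"lineage_end_time": None}
LineageWindowSketch.model_validate(raw)
print(raw)  # {'lineage_end_time': None} -- the reused dict was not modified
```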
--- a/datahub/ingestion/source/iceberg/iceberg_profiler.py
+++ b/datahub/ingestion/source/iceberg/iceberg_profiler.py
@@ -12,6 +12,7 @@ from pyiceberg.types import (
     IcebergType,
     IntegerType,
     LongType,
+    PrimitiveType,
     TimestampType,
     TimestamptzType,
     TimeType,
@@ -22,6 +23,7 @@ from pyiceberg.utils.datetime import (
     to_human_timestamp,
     to_human_timestamptz,
 )
+from typing_extensions import TypeGuard
 
 from datahub.emitter.mce_builder import get_sys_time
 from datahub.ingestion.source.iceberg.iceberg_common import (
@@ -65,7 +67,7 @@ class IcebergProfiler:
         aggregated_values: Dict[int, Any],
         manifest_values: Dict[int, bytes],
     ) -> None:
-        for field_id, value_encoded in manifest_values.items():
+        for field_id, value_encoded in manifest_values.items():
             try:
                 field = schema.find_field(field_id)
             except ValueError:
@@ -240,7 +242,7 @@ class IcebergProfiler:
         return None
 
     @staticmethod
-    def _is_numeric_type(type: IcebergType) -> bool:
+    def _is_numeric_type(type: IcebergType) -> TypeGuard[PrimitiveType]:
         return isinstance(
             type,
             (
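`TypeGuard` only changes what the type checker learns from a True result: the argument is narrowed to the guarded type inside the branch. A standalone sketch of the pattern (the types here are illustrative, not pyiceberg's):

```python
# TypeGuard[int] tells mypy/pyright that a True return means `value` is an int,
# so the if-branch can use int-only operations without casts.
from typing import Union

from typing_extensions import TypeGuard


def is_int(value: Union[int, str]) -> TypeGuard[int]:
    return isinstance(value, int)


def double(value: Union[int, str]) -> int:
    if is_int(value):
        return value * 2  # `value` is narrowed to int here
    return 0


print(double(21), double("not a number"))  # 42 0
```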
--- a/datahub/ingestion/source/kafka_connect/common.py
+++ b/datahub/ingestion/source/kafka_connect/common.py
@@ -4,7 +4,7 @@ from typing import Dict, Iterable, List, Optional
 
 from pydantic.fields import Field
 
-from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.common import AllowDenyPattern, ConfigModel, LaxStr
 from datahub.configuration.source_common import (
     DatasetLineageProviderConfigBase,
     PlatformInstanceConfigMixin,
@@ -29,7 +29,7 @@ CONNECTOR_CLASS = "connector.class"
 class ProvidedConfig(ConfigModel):
     provider: str
     path_key: str
-    value: str
+    value: LaxStr
 
 
 class GenericConnectorConfig(ConfigModel):
--- a/datahub/ingestion/source/looker/looker_common.py
+++ b/datahub/ingestion/source/looker/looker_common.py
@@ -28,7 +28,7 @@ from looker_sdk.sdk.api40.models import (
     User,
     WriteQuery,
 )
-from pydantic
+from pydantic import validator
 
 import datahub.emitter.mce_builder as builder
 from datahub.api.entities.platformresource.platform_resource import (
@@ -36,7 +36,7 @@ from datahub.api.entities.platformresource.platform_resource import (
     PlatformResourceKey,
 )
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.emitter.mcp_builder import ContainerKey
+from datahub.emitter.mcp_builder import ContainerKey
 from datahub.ingestion.api.report import Report
 from datahub.ingestion.api.source import SourceReport
 from datahub.ingestion.source.common.subtypes import DatasetSubTypes
@@ -72,7 +72,6 @@ from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
     UpstreamClass,
     UpstreamLineage,
 )
-from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
 from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     ArrayTypeClass,
@@ -90,21 +89,18 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
 )
 from datahub.metadata.schema_classes import (
     BrowsePathEntryClass,
-    BrowsePathsClass,
     BrowsePathsV2Class,
-    ContainerClass,
-    DatasetPropertiesClass,
+    EmbedClass,
     EnumTypeClass,
     FineGrainedLineageClass,
     GlobalTagsClass,
     SchemaMetadataClass,
-    StatusClass,
-    SubTypesClass,
     TagAssociationClass,
     TagPropertiesClass,
     TagSnapshotClass,
 )
 from datahub.metadata.urns import TagUrn
+from datahub.sdk.dataset import Dataset
 from datahub.sql_parsing.sqlglot_lineage import ColumnRef
 from datahub.utilities.lossy_collections import LossyList, LossySet
 from datahub.utilities.url_util import remove_port_from_url
@@ -255,6 +251,11 @@ class LookerViewId:
 
         return generated_urn
 
+    def get_view_dataset_name(self, config: LookerCommonConfig) -> str:
+        n_mapping: ViewNamingPatternMapping = self.get_mapping(config)
+        n_mapping.file_path = self.preprocess_file_path(n_mapping.file_path)
+        return config.view_naming_pattern.replace_variables(n_mapping)
+
     def get_browse_path(self, config: LookerCommonConfig) -> str:
         browse_path = config.view_browse_pattern.replace_variables(
             self.get_mapping(config)
@@ -282,6 +283,22 @@ class LookerViewId:
             ],
         )
 
+    def get_view_dataset_parent_container(
+        self, config: LookerCommonConfig
+    ) -> List[str]:
+        project_key = gen_project_key(config, self.project_name)
+        view_path = (
+            remove_suffix(self.file_path, ".view.lkml")
+            if "{file_path}" in config.view_browse_pattern.pattern
+            else os.path.dirname(self.file_path)
+        )
+        path_entries = view_path.split("/") if view_path else []
+        return [
+            "Develop",
+            project_key.as_urn(),
+            *path_entries,
+        ]
+
 
 class ViewFieldType(Enum):
     DIMENSION = "Dimension"
@@ -1286,50 +1303,28 @@ class LookerExplore:
         reporter: SourceReport,
         base_url: str,
         extract_embed_urls: bool,
-    ) ->
-
-
-
-        dataset_snapshot = DatasetSnapshot(
-            urn=self.get_explore_urn(config),
-            aspects=[],  # we append to this list later on
-        )
-
-        model_key = gen_model_key(config, self.model_name)
-        browse_paths = BrowsePathsClass(paths=[self.get_explore_browse_path(config)])
-        container = ContainerClass(container=model_key.as_urn())
-        dataset_snapshot.aspects.append(browse_paths)
-        dataset_snapshot.aspects.append(StatusClass(removed=False))
-
-        custom_properties = {
-            "project": self.project_name,
-            "model": self.model_name,
-            "looker.explore.label": self.label,
-            "looker.explore.name": self.name,
-            "looker.explore.file": self.source_file,
-        }
-        dataset_props = DatasetPropertiesClass(
-            name=str(self.label) if self.label else LookerUtil._display_name(self.name),
-            description=self.description,
-            customProperties={
-                k: str(v) for k, v in custom_properties.items() if v is not None
-            },
-        )
-        dataset_props.externalUrl = self._get_url(base_url)
+    ) -> Dataset:
+        """
+        Generate a Dataset metadata event for this Looker Explore.
 
-
+        Only generates datasets for explores that contain FROM clauses and do NOT contain joins.
+        Passthrough explores and joins are handled via lineage and do not need additional nodes.
+        """
+        upstream_lineage = None
         view_name_to_urn_map: Dict[str, str] = {}
+
         if self.upstream_views is not None:
             assert self.project_name is not None
-            upstreams = []
+            upstreams: list[UpstreamClass] = []
             observed_lineage_ts = datetime.datetime.now(tz=datetime.timezone.utc)
+
             for view_ref in sorted(self.upstream_views):
                 # set file_path to ViewFieldType.UNKNOWN if file_path is not available to keep backward compatibility
                 # if we raise error on file_path equal to None then existing test-cases will fail as mock data
                 # doesn't have required attributes.
                 file_path: str = (
                     cast(str, self.upstream_views_file_path[view_ref.include])
-                    if self.upstream_views_file_path
+                    if self.upstream_views_file_path.get(view_ref.include) is not None
                     else ViewFieldValue.NOT_AVAILABLE.value
                 )
 
@@ -1356,7 +1351,7 @@ class LookerExplore:
                 )
                 view_name_to_urn_map[view_ref.include] = view_urn
 
-        fine_grained_lineages = []
+        fine_grained_lineages: list[FineGrainedLineageClass] = []
         if config.extract_column_level_lineage:
             for field in self.fields or []:
                 # Skip creating fine-grained lineage for empty field names to prevent invalid schema field URNs
@@ -1397,9 +1392,11 @@ class LookerExplore:
                 )
 
             upstream_lineage = UpstreamLineage(
-                upstreams=upstreams,
+                upstreams=upstreams,
+                fineGrainedLineages=fine_grained_lineages or None,
             )
-
+
+        schema_metadata = None
         if self.fields is not None:
             schema_metadata = LookerUtil._get_schema(
                 platform_name=config.platform_name,
@@ -1407,42 +1404,46 @@ class LookerExplore:
                 view_fields=self.fields,
                 reporter=reporter,
             )
-        if schema_metadata is not None:
-            dataset_snapshot.aspects.append(schema_metadata)
-
-        mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
-        mcp = MetadataChangeProposalWrapper(
-            entityUrn=dataset_snapshot.urn,
-            aspect=SubTypesClass(typeNames=[DatasetSubTypes.LOOKER_EXPLORE]),
-        )
-
-        proposals: List[Union[MetadataChangeEvent, MetadataChangeProposalWrapper]] = [
-            mce,
-            mcp,
-        ]
 
-
-        explore_tag_urns: List[TagAssociationClass] = [
-            TagAssociationClass(tag=TagUrn(tag).urn()) for tag in self.tags
-        ]
-        if explore_tag_urns:
-            dataset_snapshot.aspects.append(GlobalTagsClass(explore_tag_urns))
+        extra_aspects: List[Union[GlobalTagsClass, EmbedClass]] = []
 
-
+        explore_tag_urns: List[TagUrn] = [TagUrn(tag) for tag in self.tags]
         if extract_embed_urls:
-
-                dataset_snapshot.urn, self._get_embed_url(base_url)
-            )
-            proposals.append(embed_mcp)
+            extra_aspects.append(EmbedClass(renderUrl=self._get_embed_url(base_url)))
 
-
-
-
-
-
-
+        custom_properties: Dict[str, Optional[str]] = {
+            "project": self.project_name,
+            "model": self.model_name,
+            "looker.explore.label": self.label,
+            "looker.explore.name": self.name,
+            "looker.explore.file": self.source_file,
+        }
 
-        return proposals
+        return Dataset(
+            platform=config.platform_name,
+            name=config.explore_naming_pattern.replace_variables(
+                self.get_mapping(config)
+            ),
+            display_name=str(self.label)
+            if self.label
+            else LookerUtil._display_name(self.name),
+            description=self.description,
+            subtype=DatasetSubTypes.LOOKER_EXPLORE,
+            env=config.env,
+            platform_instance=config.platform_instance,
+            custom_properties={
+                k: str(v) for k, v in custom_properties.items() if v is not None
+            },
+            external_url=self._get_url(base_url),
+            upstreams=upstream_lineage,
+            schema=schema_metadata,
+            parent_container=[
+                "Explore",
+                gen_model_key(config, self.model_name).as_urn(),
+            ],
+            tags=explore_tag_urns if explore_tag_urns else None,
+            extra_aspects=extra_aspects,
        )
 
 
 def gen_project_key(config: LookerCommonConfig, project_name: str) -> LookMLProjectKey:
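The new `get_view_dataset_parent_container` derives container entries from the LookML file path. A standalone illustration of that path logic (the inputs and URN below are made up; the real method uses `remove_suffix` and `gen_project_key` from the surrounding module):

```python
# Illustrative recreation of the path handling: strip ".view.lkml", split on
# "/", and prefix with "Develop" plus the project's container URN.
import os
from typing import List


def parent_container_entries(
    file_path: str, project_container_urn: str, browse_pattern_uses_file_path: bool
) -> List[str]:
    view_path = (
        file_path.removesuffix(".view.lkml")
        if browse_pattern_uses_file_path
        else os.path.dirname(file_path)
    )
    path_entries = view_path.split("/") if view_path else []
    return ["Develop", project_container_urn, *path_entries]


print(
    parent_container_entries(
        "views/orders/orders.view.lkml",
        "urn:li:container:lookml-project",  # made-up URN
        browse_pattern_uses_file_path=True,
    )
)
# ['Develop', 'urn:li:container:lookml-project', 'views', 'orders', 'orders']
```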
--- a/datahub/ingestion/source/looker/looker_config.py
+++ b/datahub/ingestion/source/looker/looker_config.py
@@ -5,10 +5,14 @@ from typing import Any, ClassVar, Dict, List, Optional, Tuple, Union, cast
 
 import pydantic
 from looker_sdk.sdk.api40.models import DBConnection
-from pydantic import Field, validator
+from pydantic import Field, model_validator, validator
 
 from datahub.configuration import ConfigModel
-from datahub.configuration.common import AllowDenyPattern, ConfigurationError
+from datahub.configuration.common import (
+    AllowDenyPattern,
+    ConfigurationError,
+    HiddenFromDocs,
+)
 from datahub.configuration.source_common import (
     EnvConfigMixin,
     PlatformInstanceConfigMixin,
@@ -43,6 +47,14 @@ class NamingPattern(ConfigModel):
         assert isinstance(v, str), "pattern must be a string"
         return {"pattern": v}
 
+    @model_validator(mode="before")
+    @classmethod
+    def pydantic_v2_accept_raw_pattern(cls, v):
+        # Pydantic v2 compatibility: handle string input by converting to dict
+        if isinstance(v, str):
+            return {"pattern": v}
+        return v
+
     @classmethod
     def pydantic_validate_pattern(cls, v):
         assert isinstance(v, NamingPattern)
@@ -132,11 +144,10 @@ class LookerCommonConfig(EnvConfigMixin, PlatformInstanceConfigMixin):
         description="When enabled, attaches tags to measures, dimensions and dimension groups to make them more "
         "discoverable. When disabled, adds this information to the description of the column.",
     )
-    platform_name: str = Field(
+    platform_name: HiddenFromDocs[str] = Field(
         # TODO: This shouldn't be part of the config.
         "looker",
         description="Default platform name.",
-        hidden_from_docs=True,
     )
     extract_column_level_lineage: bool = Field(
         True,