acryl-datahub 1.2.0.9rc1__py3-none-any.whl → 1.2.0.10__py3-none-any.whl

This diff compares publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in those registries.

This version of acryl-datahub has been flagged as a potentially problematic release.

Files changed (120)
  1. {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/METADATA +2568 -2626
  2. {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/RECORD +120 -113
  3. {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/entry_points.txt +2 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/assertion/assertion.py +1 -1
  6. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  7. datahub/api/entities/dataproduct/dataproduct.py +6 -3
  8. datahub/api/entities/dataset/dataset.py +9 -18
  9. datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
  10. datahub/api/graphql/operation.py +10 -6
  11. datahub/cli/docker_check.py +2 -2
  12. datahub/configuration/common.py +29 -1
  13. datahub/configuration/connection_resolver.py +5 -2
  14. datahub/configuration/import_resolver.py +7 -4
  15. datahub/configuration/pydantic_migration_helpers.py +0 -9
  16. datahub/configuration/source_common.py +3 -2
  17. datahub/configuration/validate_field_deprecation.py +5 -2
  18. datahub/configuration/validate_field_removal.py +5 -2
  19. datahub/configuration/validate_field_rename.py +6 -5
  20. datahub/configuration/validate_multiline_string.py +5 -2
  21. datahub/ingestion/autogenerated/capability_summary.json +45 -1
  22. datahub/ingestion/run/pipeline_config.py +2 -2
  23. datahub/ingestion/source/azure/azure_common.py +1 -1
  24. datahub/ingestion/source/bigquery_v2/bigquery_config.py +28 -14
  25. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  26. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +11 -0
  27. datahub/ingestion/source/bigquery_v2/queries_extractor.py +4 -5
  28. datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
  29. datahub/ingestion/source/data_lake_common/path_spec.py +16 -16
  30. datahub/ingestion/source/datahub/config.py +8 -9
  31. datahub/ingestion/source/dbt/dbt_common.py +65 -5
  32. datahub/ingestion/source/delta_lake/config.py +1 -1
  33. datahub/ingestion/source/dremio/dremio_config.py +3 -4
  34. datahub/ingestion/source/feast.py +8 -10
  35. datahub/ingestion/source/fivetran/config.py +1 -1
  36. datahub/ingestion/source/gcs/gcs_source.py +19 -2
  37. datahub/ingestion/source/ge_data_profiler.py +15 -2
  38. datahub/ingestion/source/ge_profiling_config.py +26 -22
  39. datahub/ingestion/source/grafana/grafana_config.py +2 -2
  40. datahub/ingestion/source/grafana/models.py +12 -14
  41. datahub/ingestion/source/hex/hex.py +6 -1
  42. datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
  43. datahub/ingestion/source/kafka_connect/common.py +2 -2
  44. datahub/ingestion/source/looker/looker_common.py +76 -75
  45. datahub/ingestion/source/looker/looker_config.py +15 -4
  46. datahub/ingestion/source/looker/looker_source.py +493 -547
  47. datahub/ingestion/source/looker/lookml_config.py +1 -1
  48. datahub/ingestion/source/looker/lookml_source.py +46 -88
  49. datahub/ingestion/source/metabase.py +9 -2
  50. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  51. datahub/ingestion/source/metadata/lineage.py +1 -1
  52. datahub/ingestion/source/mode.py +13 -5
  53. datahub/ingestion/source/nifi.py +1 -1
  54. datahub/ingestion/source/powerbi/config.py +14 -21
  55. datahub/ingestion/source/preset.py +1 -1
  56. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  57. datahub/ingestion/source/redash.py +1 -1
  58. datahub/ingestion/source/redshift/config.py +6 -3
  59. datahub/ingestion/source/redshift/query.py +23 -19
  60. datahub/ingestion/source/s3/source.py +26 -24
  61. datahub/ingestion/source/salesforce.py +13 -9
  62. datahub/ingestion/source/schema/json_schema.py +14 -14
  63. datahub/ingestion/source/sigma/data_classes.py +3 -0
  64. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  65. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  66. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  67. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  68. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  69. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  70. datahub/ingestion/source/snowflake/snowflake_config.py +12 -15
  71. datahub/ingestion/source/snowflake/snowflake_connection.py +8 -3
  72. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +15 -2
  73. datahub/ingestion/source/snowflake/snowflake_queries.py +4 -5
  74. datahub/ingestion/source/sql/athena.py +2 -1
  75. datahub/ingestion/source/sql/clickhouse.py +12 -7
  76. datahub/ingestion/source/sql/cockroachdb.py +5 -3
  77. datahub/ingestion/source/sql/druid.py +2 -2
  78. datahub/ingestion/source/sql/hive.py +4 -3
  79. datahub/ingestion/source/sql/hive_metastore.py +7 -9
  80. datahub/ingestion/source/sql/mssql/source.py +2 -2
  81. datahub/ingestion/source/sql/mysql.py +2 -2
  82. datahub/ingestion/source/sql/oracle.py +3 -3
  83. datahub/ingestion/source/sql/presto.py +2 -1
  84. datahub/ingestion/source/sql/teradata.py +4 -4
  85. datahub/ingestion/source/sql/trino.py +2 -1
  86. datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
  87. datahub/ingestion/source/sql/vertica.py +1 -1
  88. datahub/ingestion/source/sql_queries.py +6 -6
  89. datahub/ingestion/source/state/checkpoint.py +5 -1
  90. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  91. datahub/ingestion/source/state/stateful_ingestion_base.py +5 -8
  92. datahub/ingestion/source/superset.py +122 -15
  93. datahub/ingestion/source/tableau/tableau.py +68 -14
  94. datahub/ingestion/source/tableau/tableau_common.py +5 -0
  95. datahub/ingestion/source/tableau/tableau_constant.py +1 -0
  96. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  97. datahub/ingestion/source/unity/config.py +7 -3
  98. datahub/ingestion/source/usage/usage_common.py +3 -3
  99. datahub/ingestion/source_config/pulsar.py +3 -1
  100. datahub/ingestion/transformer/set_browse_path.py +112 -0
  101. datahub/metadata/_internal_schema_classes.py +728 -528
  102. datahub/metadata/_urns/urn_defs.py +1702 -1702
  103. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  104. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
  105. datahub/metadata/schema.avsc +17434 -17732
  106. datahub/metadata/schemas/GlobalSettingsInfo.avsc +72 -0
  107. datahub/metadata/schemas/InstitutionalMemory.avsc +22 -0
  108. datahub/metadata/schemas/LogicalParent.avsc +2 -1
  109. datahub/metadata/schemas/MLModelGroupKey.avsc +2 -1
  110. datahub/metadata/schemas/MetadataChangeEvent.avsc +22 -0
  111. datahub/sdk/_shared.py +126 -0
  112. datahub/sdk/chart.py +87 -30
  113. datahub/sdk/dashboard.py +79 -34
  114. datahub/sdk/entity_client.py +11 -4
  115. datahub/sdk/lineage_client.py +3 -3
  116. datahub/sdk/search_filters.py +1 -7
  117. datahub/sql_parsing/split_statements.py +13 -0
  118. {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/WHEEL +0 -0
  119. {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/licenses/LICENSE +0 -0
  120. {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/ge_data_profiler.py
@@ -307,7 +307,6 @@ def _is_single_row_query_method(query: Any) -> bool:
     "get_column_max",
     "get_column_mean",
     "get_column_stdev",
-    "get_column_nonnull_count",
     "get_column_unique_count",
 }
 CONSTANT_ROW_QUERY_METHODS = {
@@ -331,6 +330,7 @@ def _is_single_row_query_method(query: Any) -> bool:

 FIRST_PARTY_SINGLE_ROW_QUERY_METHODS = {
     "get_column_unique_count_dh_patch",
+    "_get_column_cardinality",
 }

 # We'll do this the inefficient way since the arrays are pretty small.
@@ -497,7 +497,20 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
         self, column_spec: _SingleColumnSpec, column: str
     ) -> None:
         try:
-            nonnull_count = self.dataset.get_column_nonnull_count(column)
+            # Don't use Great Expectations get_column_nonnull_count because it
+            # generates this SQL:
+            #
+            #   sum(CASE WHEN (mycolumn IN (NULL) OR mycolumn IS NULL) THEN 1 ELSE 0 END)
+            #
+            # which fails for complex types (such as Databricks maps) that don't
+            # support the IN operator.
+            nonnull_count = convert_to_json_serializable(
+                self.dataset.engine.execute(
+                    sa.select(sa.func.count(sa.column(column))).select_from(
+                        self.dataset._table
+                    )
+                ).scalar()
+            )
             column_spec.nonnull_count = nonnull_count
         except Exception as e:
             logger.debug(
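
For context on the profiler change above: SQL's COUNT(column) already skips NULLs, which is why a plain aggregate can replace the CASE/IN expression that Great Expectations emits. Below is a minimal, self-contained sketch of the same query pattern (not DataHub code; the table and column names are made up), run against an in-memory SQLite database.

import sqlalchemy as sa

# Build a throwaway table with one NULL so the aggregate has something to skip.
engine = sa.create_engine("sqlite:///:memory:")
with engine.begin() as conn:
    conn.execute(sa.text("CREATE TABLE t (mycolumn TEXT)"))
    conn.execute(sa.text("INSERT INTO t VALUES ('a'), (NULL), ('b')"))

    # COUNT(mycolumn) counts only non-NULL values, with no IN/CASE expression,
    # so it also works for column types that don't support the IN operator.
    nonnull_count = conn.execute(
        sa.select(sa.func.count(sa.column("mycolumn"))).select_from(sa.table("t"))
    ).scalar()

print(nonnull_count)  # 2
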
datahub/ingestion/source/ge_profiling_config.py
@@ -1,12 +1,12 @@
 import datetime
 import logging
 import os
-from typing import Any, Dict, List, Optional
+from typing import Annotated, Any, Dict, List, Optional

 import pydantic
 from pydantic.fields import Field

-from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.common import AllowDenyPattern, ConfigModel, SupportedSources
 from datahub.ingestion.source_config.operation_config import OperationConfig

 _PROFILING_FLAGS_TO_REPORT = {
@@ -120,37 +120,40 @@ class GEProfilingConfig(GEProfilingBaseConfig):
         "number of columns to profile goes up.",
     )

-    profile_if_updated_since_days: Optional[pydantic.PositiveFloat] = Field(
+    profile_if_updated_since_days: Annotated[
+        Optional[pydantic.PositiveFloat], SupportedSources(["snowflake", "bigquery"])
+    ] = Field(
         default=None,
         description="Profile table only if it has been updated since these many number of days. "
         "If set to `null`, no constraint of last modified time for tables to profile. "
         "Supported only in `snowflake` and `BigQuery`.",
-        schema_extra={"supported_sources": ["snowflake", "bigquery"]},
     )

-    profile_table_size_limit: Optional[int] = Field(
+    profile_table_size_limit: Annotated[
+        Optional[int],
+        SupportedSources(["snowflake", "bigquery", "unity-catalog", "oracle"]),
+    ] = Field(
         default=5,
         description="Profile tables only if their size is less than specified GBs. If set to `null`, "
         "no limit on the size of tables to profile. Supported only in `Snowflake`, `BigQuery` and "
         "`Databricks`. Supported for `Oracle` based on calculated size from gathered stats.",
-        schema_extra={
-            "supported_sources": ["snowflake", "bigquery", "unity-catalog", "oracle"]
-        },
     )

-    profile_table_row_limit: Optional[int] = Field(
+    profile_table_row_limit: Annotated[
+        Optional[int], SupportedSources(["snowflake", "bigquery", "oracle"])
+    ] = Field(
         default=5000000,
         description="Profile tables only if their row count is less than specified count. "
         "If set to `null`, no limit on the row count of tables to profile. Supported only in "
         "`Snowflake`, `BigQuery`. Supported for `Oracle` based on gathered stats.",
-        schema_extra={"supported_sources": ["snowflake", "bigquery", "oracle"]},
     )

-    profile_table_row_count_estimate_only: bool = Field(
+    profile_table_row_count_estimate_only: Annotated[
+        bool, SupportedSources(["postgres", "mysql"])
+    ] = Field(
         default=False,
         description="Use an approximate query for row count. This will be much faster but slightly "
         "less accurate. Only supported for Postgres and MySQL. ",
-        schema_extra={"supported_sources": ["postgres", "mysql"]},
     )

     # The query combiner enables us to combine multiple queries into a single query,
@@ -163,36 +166,37 @@ class GEProfilingConfig(GEProfilingBaseConfig):
     # Hidden option - used for debugging purposes.
     catch_exceptions: bool = Field(default=True, description="")

-    partition_profiling_enabled: bool = Field(
+    partition_profiling_enabled: Annotated[
+        bool, SupportedSources(["athena", "bigquery"])
+    ] = Field(
         default=True,
         description="Whether to profile partitioned tables. Only BigQuery and Aws Athena supports this. "
         "If enabled, latest partition data is used for profiling.",
-        schema_extra={"supported_sources": ["athena", "bigquery"]},
     )
-    partition_datetime: Optional[datetime.datetime] = Field(
+    partition_datetime: Annotated[
+        Optional[datetime.datetime], SupportedSources(["bigquery"])
+    ] = Field(
         default=None,
         description="If specified, profile only the partition which matches this datetime. "
         "If not specified, profile the latest partition. Only Bigquery supports this.",
-        schema_extra={"supported_sources": ["bigquery"]},
     )
-    use_sampling: bool = Field(
+    use_sampling: Annotated[bool, SupportedSources(["bigquery", "snowflake"])] = Field(
         default=True,
         description="Whether to profile column level stats on sample of table. Only BigQuery and Snowflake support this. "
         "If enabled, profiling is done on rows sampled from table. Sampling is not done for smaller tables. ",
-        schema_extra={"supported_sources": ["bigquery", "snowflake"]},
     )

-    sample_size: int = Field(
+    sample_size: Annotated[int, SupportedSources(["bigquery", "snowflake"])] = Field(
         default=10000,
         description="Number of rows to be sampled from table for column level profiling."
         "Applicable only if `use_sampling` is set to True.",
-        schema_extra={"supported_sources": ["bigquery", "snowflake"]},
     )

-    profile_external_tables: bool = Field(
+    profile_external_tables: Annotated[
+        bool, SupportedSources(["redshift", "snowflake"])
+    ] = Field(
         default=False,
         description="Whether to profile external tables. Only Snowflake and Redshift supports this.",
-        schema_extra={"supported_sources": ["redshift", "snowflake"]},
     )

     tags_to_ignore_sampling: Optional[List[str]] = pydantic.Field(
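
The configuration changes above move the per-field "supported sources" metadata out of schema_extra and into typing.Annotated markers. The real SupportedSources helper lives in datahub.configuration.common and is not shown in this diff, so the sketch below uses a hypothetical Supported marker and ExampleConfig model purely to illustrate how Annotated metadata can be read back from pydantic v2 field info, for example by a docs generator.

from dataclasses import dataclass
from typing import Annotated, List, Optional

from pydantic import BaseModel, Field


@dataclass(frozen=True)
class Supported:
    """Hypothetical stand-in for the SupportedSources marker."""

    sources: List[str]


class ExampleConfig(BaseModel):
    profile_table_row_limit: Annotated[
        Optional[int], Supported(["snowflake", "bigquery", "oracle"])
    ] = Field(default=5_000_000, description="Row-count cutoff for profiling.")


# Unrecognized Annotated objects end up in FieldInfo.metadata, so tooling can
# discover them without digging through schema_extra / json_schema_extra.
for name, field in ExampleConfig.model_fields.items():
    markers = [m for m in field.metadata if isinstance(m, Supported)]
    if markers:
        print(name, "->", markers[0].sources)
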
datahub/ingestion/source/grafana/grafana_config.py
@@ -2,7 +2,7 @@ from typing import Dict, Optional

 from pydantic import Field, SecretStr, validator

-from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
 from datahub.configuration.source_common import (
     DatasetLineageProviderConfigBase,
     EnvConfigMixin,
@@ -37,7 +37,7 @@ class GrafanaSourceConfig(
 ):
     """Configuration for Grafana source"""

-    platform: str = Field(default="grafana", hidden_from_docs=True)
+    platform: HiddenFromDocs[str] = Field(default="grafana")
     url: str = Field(
         description="Grafana URL in the format http://your-grafana-instance with no trailing slash"
     )
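
HiddenFromDocs[str] above replaces the ad-hoc hidden_from_docs=True keyword that used to be passed to Field(). Its actual definition in datahub.configuration.common is not part of this diff; one plausible shape, shown here only as an assumption, is an Annotated alias that carries a sentinel the docs generator can look for.

from typing import Annotated, TypeVar

from pydantic import BaseModel, Field

T = TypeVar("T")


class _HiddenFromDocsMarker:
    """Hypothetical sentinel consumed by the docs generator."""


# Subscripting the alias substitutes the TypeVar: HiddenFromDocsSketch[str]
# is just Annotated[str, _HiddenFromDocsMarker()].
HiddenFromDocsSketch = Annotated[T, _HiddenFromDocsMarker()]


class GrafanaConfigSketch(BaseModel):
    platform: HiddenFromDocsSketch[str] = Field(default="grafana")


is_hidden = any(
    isinstance(m, _HiddenFromDocsMarker)
    for m in GrafanaConfigSketch.model_fields["platform"].metadata
)
print(is_hidden)  # True
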
datahub/ingestion/source/grafana/models.py
@@ -10,9 +10,8 @@ References:

 from typing import Any, Dict, List, Optional

-from pydantic import BaseModel, Field
+from pydantic import BaseModel, ConfigDict, Field

-from datahub.configuration.pydantic_migration_helpers import PYDANTIC_VERSION_2
 from datahub.emitter.mcp_builder import ContainerKey

 # Grafana-specific type definitions for better type safety
@@ -25,7 +24,11 @@ GrafanaFieldConfig = Dict[
 GrafanaTransformation = Dict[str, Any]  # Transformations: id, options


-class DatasourceRef(BaseModel):
+class _GrafanaBaseModel(BaseModel):
+    model_config = ConfigDict(coerce_numbers_to_str=True)
+
+
+class DatasourceRef(_GrafanaBaseModel):
     """Reference to a Grafana datasource."""

     type: Optional[str] = None  # Datasource type (prometheus, mysql, postgres, etc.)
@@ -33,13 +36,13 @@ class DatasourceRef(BaseModel):
     name: Optional[str] = None  # Datasource display name


-class Panel(BaseModel):
+class Panel(_GrafanaBaseModel):
     """Represents a Grafana dashboard panel."""

     id: str
     title: str
     description: str = ""
-    type: Optional[str]
+    type: Optional[str] = None
     # Query targets - each contains refId (A,B,C...), query/expr, datasource ref, etc.
     query_targets: List[GrafanaQueryTarget] = Field(
         default_factory=list, alias="targets"
@@ -52,16 +55,16 @@ class Panel(BaseModel):
     transformations: List[GrafanaTransformation] = Field(default_factory=list)


-class Dashboard(BaseModel):
+class Dashboard(_GrafanaBaseModel):
     """Represents a Grafana dashboard."""

     uid: str
     title: str
     description: str = ""
-    version: Optional[str]
+    version: Optional[str] = None
     panels: List[Panel]
     tags: List[str]
-    timezone: Optional[str]
+    timezone: Optional[str] = None
     refresh: Optional[str] = None
     schema_version: Optional[str] = Field(default=None, alias="schemaVersion")
     folder_id: Optional[str] = Field(default=None, alias="meta.folderId")
@@ -100,18 +103,13 @@ class Dashboard(BaseModel):
         return super().parse_obj(dashboard_dict)


-class Folder(BaseModel):
+class Folder(_GrafanaBaseModel):
     """Represents a Grafana folder."""

     id: str
     title: str
     description: Optional[str] = ""

-    if PYDANTIC_VERSION_2:
-        from pydantic import ConfigDict
-
-        model_config = ConfigDict(coerce_numbers_to_str=True)  # type: ignore
-

 class FolderKey(ContainerKey):
     """Key for identifying a Grafana folder."""
datahub/ingestion/source/hex/hex.py
@@ -1,3 +1,4 @@
+from copy import deepcopy
 from dataclasses import dataclass
 from datetime import datetime, timedelta, timezone
 from typing import Any, Dict, Iterable, List, Optional
@@ -122,7 +123,11 @@ class HexSourceConfig(

     @root_validator(pre=True)
     def validate_lineage_times(cls, data: Dict[str, Any]) -> Dict[str, Any]:
-        # lineage_end_time default = now
+        # In-place update of the input dict would cause state contamination. This was discovered through test failures
+        # in test_hex.py where the same dict is reused.
+        # So a deepcopy is performed first.
+        data = deepcopy(data)
+
         if "lineage_end_time" not in data or data["lineage_end_time"] is None:
             data["lineage_end_time"] = datetime.now(tz=timezone.utc)
         # if string is given, parse it
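
The Hex change above deep-copies the validator input so that computed defaults are not written back into a dict the caller may reuse. The standalone sketch below reproduces the idea with pydantic v2's model_validator(mode="before"), the v2 counterpart of the root_validator(pre=True) shown in the diff; the model and field names are illustrative.

from copy import deepcopy
from datetime import datetime, timezone
from typing import Any, Dict, Optional

from pydantic import BaseModel, model_validator


class LineageWindowSketch(BaseModel):
    lineage_end_time: Optional[datetime] = None

    @model_validator(mode="before")
    @classmethod
    def fill_defaults(cls, data: Dict[str, Any]) -> Dict[str, Any]:
        # Without the deepcopy, the caller's dict would be mutated in place and the
        # computed default would leak into any later validation of the same dict.
        data = deepcopy(data)
        if data.get("lineage_end_time") is None:
            data["lineage_end_time"] = datetime.now(tz=timezone.utc)
        return data


raw: Dict[str, Any] = {}
LineageWindowSketch.model_validate(raw)
print(raw)  # {} -- the shared input dict was not contaminated
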
datahub/ingestion/source/iceberg/iceberg_profiler.py
@@ -12,6 +12,7 @@ from pyiceberg.types import (
     IcebergType,
     IntegerType,
     LongType,
+    PrimitiveType,
     TimestampType,
     TimestamptzType,
     TimeType,
@@ -22,6 +23,7 @@ from pyiceberg.utils.datetime import (
     to_human_timestamp,
     to_human_timestamptz,
 )
+from typing_extensions import TypeGuard

 from datahub.emitter.mce_builder import get_sys_time
 from datahub.ingestion.source.iceberg.iceberg_common import (
@@ -65,7 +67,7 @@ class IcebergProfiler:
         aggregated_values: Dict[int, Any],
         manifest_values: Dict[int, bytes],
     ) -> None:
-        for field_id, value_encoded in manifest_values.items():  # type: int, Any
+        for field_id, value_encoded in manifest_values.items():
             try:
                 field = schema.find_field(field_id)
             except ValueError:
@@ -240,7 +242,7 @@ class IcebergProfiler:
         return None

     @staticmethod
-    def _is_numeric_type(type: IcebergType) -> bool:
+    def _is_numeric_type(type: IcebergType) -> TypeGuard[PrimitiveType]:
         return isinstance(
             type,
             (
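
The new TypeGuard[PrimitiveType] return type on _is_numeric_type lets type checkers narrow the argument inside the guarded branch instead of requiring a cast. A minimal sketch with stand-in classes (not pyiceberg's real hierarchy; the attribute is invented for illustration):

from typing_extensions import TypeGuard


class FakeIcebergType:
    pass


class FakePrimitiveType(FakeIcebergType):
    def primitive_only(self) -> str:
        return "primitive"


def is_primitive(t: FakeIcebergType) -> TypeGuard[FakePrimitiveType]:
    return isinstance(t, FakePrimitiveType)


def describe(t: FakeIcebergType) -> str:
    if is_primitive(t):
        # mypy/pyright treat `t` as FakePrimitiveType here, so this call type-checks.
        return t.primitive_only()
    return "non-primitive"


print(describe(FakePrimitiveType()), describe(FakeIcebergType()))
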
datahub/ingestion/source/kafka_connect/common.py
@@ -4,7 +4,7 @@ from typing import Dict, Iterable, List, Optional

 from pydantic.fields import Field

-from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.common import AllowDenyPattern, ConfigModel, LaxStr
 from datahub.configuration.source_common import (
     DatasetLineageProviderConfigBase,
     PlatformInstanceConfigMixin,
@@ -29,7 +29,7 @@ CONNECTOR_CLASS = "connector.class"
 class ProvidedConfig(ConfigModel):
     provider: str
     path_key: str
-    value: str
+    value: LaxStr


 class GenericConnectorConfig(ConfigModel):
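
LaxStr's definition is not included in this diff. A plausible reading of `value: LaxStr` is a string field that also accepts numeric or boolean YAML values and coerces them, so the sketch below is only an assumption of that behavior, written with pydantic v2's BeforeValidator rather than DataHub's actual implementation.

from typing import Annotated

from pydantic import BaseModel, BeforeValidator

# Hypothetical stand-in: coerce scalar values to str before normal validation.
LaxStrSketch = Annotated[
    str, BeforeValidator(lambda v: str(v) if isinstance(v, (int, float, bool)) else v)
]


class ProvidedConfigSketch(BaseModel):
    provider: str
    path_key: str
    value: LaxStrSketch


cfg = ProvidedConfigSketch(provider="file", path_key="port", value=9092)
print(cfg.value, type(cfg.value))  # 9092 <class 'str'>
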
datahub/ingestion/source/looker/looker_common.py
@@ -28,7 +28,7 @@ from looker_sdk.sdk.api40.models import (
     User,
     WriteQuery,
 )
-from pydantic.class_validators import validator
+from pydantic import validator

 import datahub.emitter.mce_builder as builder
 from datahub.api.entities.platformresource.platform_resource import (
@@ -36,7 +36,7 @@ from datahub.api.entities.platformresource.platform_resource import (
     PlatformResourceKey,
 )
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.emitter.mcp_builder import ContainerKey, create_embed_mcp
+from datahub.emitter.mcp_builder import ContainerKey
 from datahub.ingestion.api.report import Report
 from datahub.ingestion.api.source import SourceReport
 from datahub.ingestion.source.common.subtypes import DatasetSubTypes
@@ -72,7 +72,7 @@ from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
     UpstreamClass,
     UpstreamLineage,
 )
-from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
 from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     ArrayTypeClass,
@@ -90,21 +89,18 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
 )
 from datahub.metadata.schema_classes import (
     BrowsePathEntryClass,
-    BrowsePathsClass,
     BrowsePathsV2Class,
-    ContainerClass,
-    DatasetPropertiesClass,
+    EmbedClass,
     EnumTypeClass,
     FineGrainedLineageClass,
     GlobalTagsClass,
     SchemaMetadataClass,
-    StatusClass,
-    SubTypesClass,
     TagAssociationClass,
     TagPropertiesClass,
     TagSnapshotClass,
 )
 from datahub.metadata.urns import TagUrn
+from datahub.sdk.dataset import Dataset
 from datahub.sql_parsing.sqlglot_lineage import ColumnRef
 from datahub.utilities.lossy_collections import LossyList, LossySet
 from datahub.utilities.url_util import remove_port_from_url
@@ -255,6 +251,11 @@ class LookerViewId:

         return generated_urn

+    def get_view_dataset_name(self, config: LookerCommonConfig) -> str:
+        n_mapping: ViewNamingPatternMapping = self.get_mapping(config)
+        n_mapping.file_path = self.preprocess_file_path(n_mapping.file_path)
+        return config.view_naming_pattern.replace_variables(n_mapping)
+
     def get_browse_path(self, config: LookerCommonConfig) -> str:
         browse_path = config.view_browse_pattern.replace_variables(
             self.get_mapping(config)
@@ -282,6 +283,22 @@ class LookerViewId:
             ],
         )

+    def get_view_dataset_parent_container(
+        self, config: LookerCommonConfig
+    ) -> List[str]:
+        project_key = gen_project_key(config, self.project_name)
+        view_path = (
+            remove_suffix(self.file_path, ".view.lkml")
+            if "{file_path}" in config.view_browse_pattern.pattern
+            else os.path.dirname(self.file_path)
+        )
+        path_entries = view_path.split("/") if view_path else []
+        return [
+            "Develop",
+            project_key.as_urn(),
+            *path_entries,
+        ]
+

 class ViewFieldType(Enum):
     DIMENSION = "Dimension"
@@ -1286,50 +1303,28 @@ class LookerExplore:
         reporter: SourceReport,
         base_url: str,
         extract_embed_urls: bool,
-    ) -> Optional[List[Union[MetadataChangeEvent, MetadataChangeProposalWrapper]]]:
-        # We only generate MCE-s for explores that contain from clauses and do NOT contain joins
-        # All other explores (passthrough explores and joins) end in correct resolution of lineage, and don't need additional nodes in the graph.
-
-        dataset_snapshot = DatasetSnapshot(
-            urn=self.get_explore_urn(config),
-            aspects=[],  # we append to this list later on
-        )
-
-        model_key = gen_model_key(config, self.model_name)
-        browse_paths = BrowsePathsClass(paths=[self.get_explore_browse_path(config)])
-        container = ContainerClass(container=model_key.as_urn())
-        dataset_snapshot.aspects.append(browse_paths)
-        dataset_snapshot.aspects.append(StatusClass(removed=False))
-
-        custom_properties = {
-            "project": self.project_name,
-            "model": self.model_name,
-            "looker.explore.label": self.label,
-            "looker.explore.name": self.name,
-            "looker.explore.file": self.source_file,
-        }
-        dataset_props = DatasetPropertiesClass(
-            name=str(self.label) if self.label else LookerUtil._display_name(self.name),
-            description=self.description,
-            customProperties={
-                k: str(v) for k, v in custom_properties.items() if v is not None
-            },
-        )
-        dataset_props.externalUrl = self._get_url(base_url)
+    ) -> Dataset:
+        """
+        Generate a Dataset metadata event for this Looker Explore.

-        dataset_snapshot.aspects.append(dataset_props)
+        Only generates datasets for explores that contain FROM clauses and do NOT contain joins.
+        Passthrough explores and joins are handled via lineage and do not need additional nodes.
+        """
+        upstream_lineage = None
         view_name_to_urn_map: Dict[str, str] = {}
+
         if self.upstream_views is not None:
             assert self.project_name is not None
-            upstreams = []
+            upstreams: list[UpstreamClass] = []
             observed_lineage_ts = datetime.datetime.now(tz=datetime.timezone.utc)
+
             for view_ref in sorted(self.upstream_views):
                 # set file_path to ViewFieldType.UNKNOWN if file_path is not available to keep backward compatibility
                 # if we raise error on file_path equal to None then existing test-cases will fail as mock data
                 # doesn't have required attributes.
                 file_path: str = (
                     cast(str, self.upstream_views_file_path[view_ref.include])
-                    if self.upstream_views_file_path[view_ref.include] is not None
+                    if self.upstream_views_file_path.get(view_ref.include) is not None
                     else ViewFieldValue.NOT_AVAILABLE.value
                 )

@@ -1356,7 +1351,7 @@ class LookerExplore:
                 )
                 view_name_to_urn_map[view_ref.include] = view_urn

-            fine_grained_lineages = []
+            fine_grained_lineages: list[FineGrainedLineageClass] = []
             if config.extract_column_level_lineage:
                 for field in self.fields or []:
                     # Skip creating fine-grained lineage for empty field names to prevent invalid schema field URNs
@@ -1397,9 +1392,11 @@ class LookerExplore:
                         )

             upstream_lineage = UpstreamLineage(
-                upstreams=upstreams, fineGrainedLineages=fine_grained_lineages or None
+                upstreams=upstreams,
+                fineGrainedLineages=fine_grained_lineages or None,
             )
-            dataset_snapshot.aspects.append(upstream_lineage)
+
+        schema_metadata = None
         if self.fields is not None:
             schema_metadata = LookerUtil._get_schema(
                 platform_name=config.platform_name,
@@ -1407,42 +1404,46 @@ class LookerExplore:
                 view_fields=self.fields,
                 reporter=reporter,
             )
-            if schema_metadata is not None:
-                dataset_snapshot.aspects.append(schema_metadata)
-
-        mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
-        mcp = MetadataChangeProposalWrapper(
-            entityUrn=dataset_snapshot.urn,
-            aspect=SubTypesClass(typeNames=[DatasetSubTypes.LOOKER_EXPLORE]),
-        )
-
-        proposals: List[Union[MetadataChangeEvent, MetadataChangeProposalWrapper]] = [
-            mce,
-            mcp,
-        ]

-        # Add tags
-        explore_tag_urns: List[TagAssociationClass] = [
-            TagAssociationClass(tag=TagUrn(tag).urn()) for tag in self.tags
-        ]
-        if explore_tag_urns:
-            dataset_snapshot.aspects.append(GlobalTagsClass(explore_tag_urns))
+        extra_aspects: List[Union[GlobalTagsClass, EmbedClass]] = []

-        # If extracting embeds is enabled, produce an MCP for embed URL.
+        explore_tag_urns: List[TagUrn] = [TagUrn(tag) for tag in self.tags]
         if extract_embed_urls:
-            embed_mcp = create_embed_mcp(
-                dataset_snapshot.urn, self._get_embed_url(base_url)
-            )
-            proposals.append(embed_mcp)
+            extra_aspects.append(EmbedClass(renderUrl=self._get_embed_url(base_url)))

-        proposals.append(
-            MetadataChangeProposalWrapper(
-                entityUrn=dataset_snapshot.urn,
-                aspect=container,
-            )
-        )
+        custom_properties: Dict[str, Optional[str]] = {
+            "project": self.project_name,
+            "model": self.model_name,
+            "looker.explore.label": self.label,
+            "looker.explore.name": self.name,
+            "looker.explore.file": self.source_file,
+        }

-        return proposals
+        return Dataset(
+            platform=config.platform_name,
+            name=config.explore_naming_pattern.replace_variables(
+                self.get_mapping(config)
+            ),
+            display_name=str(self.label)
+            if self.label
+            else LookerUtil._display_name(self.name),
+            description=self.description,
+            subtype=DatasetSubTypes.LOOKER_EXPLORE,
+            env=config.env,
+            platform_instance=config.platform_instance,
+            custom_properties={
+                k: str(v) for k, v in custom_properties.items() if v is not None
+            },
+            external_url=self._get_url(base_url),
+            upstreams=upstream_lineage,
+            schema=schema_metadata,
+            parent_container=[
+                "Explore",
+                gen_model_key(config, self.model_name).as_urn(),
+            ],
+            tags=explore_tag_urns if explore_tag_urns else None,
+            extra_aspects=extra_aspects,
+        )


 def gen_project_key(config: LookerCommonConfig, project_name: str) -> LookMLProjectKey:
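
The Looker change above replaces the hand-assembled DatasetSnapshot, MCE, and follow-up MCPs with a single entity from the new SDK. The sketch below mirrors only the keyword arguments that are visible in this hunk; the values are invented and the full Dataset constructor signature is not shown in this diff.

from datahub.ingestion.source.common.subtypes import DatasetSubTypes
from datahub.sdk.dataset import Dataset

# Hypothetical explore, built with a subset of the kwargs used in the hunk above.
explore_dataset = Dataset(
    platform="looker",
    name="sales_model.orders",
    display_name="Orders",
    description="Orders explore",
    subtype=DatasetSubTypes.LOOKER_EXPLORE,
    env="PROD",
    custom_properties={"project": "sales", "model": "sales_model"},
    external_url="https://looker.example.com/explore/sales_model/orders",
)

# The SDK entity derives its own URN from platform/name/env.
print(explore_dataset.urn)
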
datahub/ingestion/source/looker/looker_config.py
@@ -5,10 +5,14 @@ from typing import Any, ClassVar, Dict, List, Optional, Tuple, Union, cast

 import pydantic
 from looker_sdk.sdk.api40.models import DBConnection
-from pydantic import Field, validator
+from pydantic import Field, model_validator, validator

 from datahub.configuration import ConfigModel
-from datahub.configuration.common import AllowDenyPattern, ConfigurationError
+from datahub.configuration.common import (
+    AllowDenyPattern,
+    ConfigurationError,
+    HiddenFromDocs,
+)
 from datahub.configuration.source_common import (
     EnvConfigMixin,
     PlatformInstanceConfigMixin,
@@ -43,6 +47,14 @@ class NamingPattern(ConfigModel):
         assert isinstance(v, str), "pattern must be a string"
         return {"pattern": v}

+    @model_validator(mode="before")
+    @classmethod
+    def pydantic_v2_accept_raw_pattern(cls, v):
+        # Pydantic v2 compatibility: handle string input by converting to dict
+        if isinstance(v, str):
+            return {"pattern": v}
+        return v
+
     @classmethod
     def pydantic_validate_pattern(cls, v):
         assert isinstance(v, NamingPattern)
@@ -132,11 +144,10 @@ class LookerCommonConfig(EnvConfigMixin, PlatformInstanceConfigMixin):
         description="When enabled, attaches tags to measures, dimensions and dimension groups to make them more "
         "discoverable. When disabled, adds this information to the description of the column.",
     )
-    platform_name: str = Field(
+    platform_name: HiddenFromDocs[str] = Field(
         # TODO: This shouldn't be part of the config.
         "looker",
         description="Default platform name.",
-        hidden_from_docs=True,
     )
     extract_column_level_lineage: bool = Field(
         True,
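
The model_validator(mode="before") added to NamingPattern above is the pydantic v2 way to keep accepting a bare pattern string and turning it into {"pattern": ...} before field validation. A standalone sketch of the same coercion (the class name is illustrative):

from pydantic import BaseModel, model_validator


class NamingPatternSketch(BaseModel):
    pattern: str

    @model_validator(mode="before")
    @classmethod
    def accept_raw_pattern(cls, v):
        # Pydantic v2 compatibility: a bare string becomes {"pattern": <string>}.
        if isinstance(v, str):
            return {"pattern": v}
        return v


print(NamingPatternSketch.model_validate("{model}__{name}").pattern)  # {model}__{name}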