acryl-datahub 1.0.0.1rc7__py3-none-any.whl → 1.0.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.
Files changed (76)
  1. {acryl_datahub-1.0.0.1rc7.dist-info → acryl_datahub-1.0.0.2.dist-info}/METADATA +2561 -2561
  2. {acryl_datahub-1.0.0.1rc7.dist-info → acryl_datahub-1.0.0.2.dist-info}/RECORD +75 -73
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/datajob/dataflow.py +15 -0
  5. datahub/api/entities/datajob/datajob.py +17 -0
  6. datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
  7. datahub/api/entities/dataset/dataset.py +2 -2
  8. datahub/api/entities/structuredproperties/structuredproperties.py +1 -1
  9. datahub/cli/ingest_cli.py +4 -4
  10. datahub/cli/migrate.py +6 -6
  11. datahub/configuration/common.py +1 -1
  12. datahub/emitter/mcp_builder.py +4 -0
  13. datahub/ingestion/api/common.py +9 -0
  14. datahub/ingestion/api/source.py +4 -1
  15. datahub/ingestion/api/source_helpers.py +26 -1
  16. datahub/ingestion/graph/client.py +104 -0
  17. datahub/ingestion/run/pipeline.py +0 -6
  18. datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
  19. datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
  20. datahub/ingestion/source/dynamodb/dynamodb.py +1 -1
  21. datahub/ingestion/source/fivetran/fivetran.py +1 -0
  22. datahub/ingestion/source/fivetran/fivetran_log_api.py +1 -1
  23. datahub/ingestion/source/hex/constants.py +5 -0
  24. datahub/ingestion/source/hex/hex.py +150 -22
  25. datahub/ingestion/source/hex/mapper.py +28 -2
  26. datahub/ingestion/source/hex/model.py +10 -2
  27. datahub/ingestion/source/hex/query_fetcher.py +300 -0
  28. datahub/ingestion/source/iceberg/iceberg.py +106 -18
  29. datahub/ingestion/source/kafka/kafka.py +1 -4
  30. datahub/ingestion/source/kafka_connect/sink_connectors.py +1 -1
  31. datahub/ingestion/source/kafka_connect/source_connectors.py +1 -1
  32. datahub/ingestion/source/looker/looker_source.py +2 -3
  33. datahub/ingestion/source/mlflow.py +6 -7
  34. datahub/ingestion/source/mode.py +2 -2
  35. datahub/ingestion/source/nifi.py +3 -3
  36. datahub/ingestion/source/openapi.py +3 -3
  37. datahub/ingestion/source/openapi_parser.py +8 -8
  38. datahub/ingestion/source/powerbi/config.py +1 -1
  39. datahub/ingestion/source/powerbi/powerbi.py +16 -3
  40. datahub/ingestion/source/redshift/profile.py +2 -2
  41. datahub/ingestion/source/sigma/sigma.py +6 -2
  42. datahub/ingestion/source/snowflake/snowflake_utils.py +1 -1
  43. datahub/ingestion/source/sql/stored_procedures/base.py +12 -1
  44. datahub/ingestion/source/sql/trino.py +4 -3
  45. datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
  46. datahub/ingestion/source/superset.py +108 -81
  47. datahub/ingestion/source/tableau/tableau.py +4 -4
  48. datahub/ingestion/source/tableau/tableau_common.py +2 -2
  49. datahub/ingestion/source/unity/source.py +1 -1
  50. datahub/ingestion/source/vertexai/vertexai.py +7 -7
  51. datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
  52. datahub/ingestion/transformer/add_dataset_ownership.py +1 -1
  53. datahub/ingestion/transformer/dataset_domain.py +1 -1
  54. datahub/lite/lite_util.py +2 -2
  55. datahub/metadata/_schema_classes.py +47 -2
  56. datahub/metadata/_urns/urn_defs.py +56 -0
  57. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
  58. datahub/metadata/schema.avsc +121 -85
  59. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  60. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  61. datahub/metadata/schemas/FormInfo.avsc +5 -0
  62. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  63. datahub/metadata/schemas/MetadataChangeEvent.avsc +6 -0
  64. datahub/metadata/schemas/MetadataChangeLog.avsc +3 -0
  65. datahub/metadata/schemas/MetadataChangeProposal.avsc +3 -0
  66. datahub/metadata/schemas/QueryProperties.avsc +4 -2
  67. datahub/metadata/schemas/SystemMetadata.avsc +86 -0
  68. datahub/testing/mcp_diff.py +1 -1
  69. datahub/utilities/file_backed_collections.py +6 -6
  70. datahub/utilities/hive_schema_to_avro.py +2 -2
  71. datahub/utilities/ingest_utils.py +2 -2
  72. datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
  73. {acryl_datahub-1.0.0.1rc7.dist-info → acryl_datahub-1.0.0.2.dist-info}/WHEEL +0 -0
  74. {acryl_datahub-1.0.0.1rc7.dist-info → acryl_datahub-1.0.0.2.dist-info}/entry_points.txt +0 -0
  75. {acryl_datahub-1.0.0.1rc7.dist-info → acryl_datahub-1.0.0.2.dist-info}/licenses/LICENSE +0 -0
  76. {acryl_datahub-1.0.0.1rc7.dist-info → acryl_datahub-1.0.0.2.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/hex/mapper.py
@@ -1,6 +1,6 @@
 import logging
 from datetime import datetime
-from typing import Iterable, List, Optional, Tuple
+from typing import Iterable, List, Optional, Tuple, Union

 from datahub._codegen.aspect import (
     _Aspect,  # TODO: is there a better import than this one?
@@ -46,6 +46,7 @@ from datahub.metadata.schema_classes import (
     DashboardInfoClass,
     DashboardUsageStatisticsClass,
     DataPlatformInstanceClass,
+    EdgeClass,
     GlobalTagsClass,
     OwnerClass,
     OwnershipClass,
@@ -53,7 +54,14 @@ from datahub.metadata.schema_classes import (
     TagAssociationClass,
     TimeWindowSizeClass,
 )
-from datahub.metadata.urns import ContainerUrn, CorpUserUrn, DashboardUrn, Urn
+from datahub.metadata.urns import (
+    ContainerUrn,
+    CorpUserUrn,
+    DashboardUrn,
+    DatasetUrn,
+    SchemaFieldUrn,
+    Urn,
+)

 logger = logging.getLogger(__name__)

@@ -116,6 +124,8 @@ class Mapper:
             ),
             externalUrl=f"{self._base_url}/{self._workspace_name}/hex/{project.id}",
             customProperties=dict(id=project.id),
+            datasetEdges=self._dataset_edges(project.upstream_datasets),
+            # TODO: support schema field upstream, maybe InputFields?
         )

         subtypes = SubTypesClass(
@@ -343,6 +353,22 @@ class Mapper:
             else None,
         )

+    def _dataset_edges(
+        self, upstream: List[Union[DatasetUrn, SchemaFieldUrn]]
+    ) -> Optional[List[EdgeClass]]:
+        # TBC: is there support for CLL in Dashboards? for the moment, skip SchemaFieldUrns
+        return (
+            [
+                EdgeClass(
+                    destinationUrn=upstream_urn.urn(),
+                )
+                for upstream_urn in upstream
+                if isinstance(upstream_urn, DatasetUrn)
+            ]
+            if upstream
+            else None
+        )
+
     def _yield_mcps(
         self, entity_urn: Urn, aspects: List[Optional[_Aspect]]
     ) -> Iterable[MetadataWorkUnit]:
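
A note on the mapper change above: only DatasetUrn entries in upstream_datasets become dataset edges on the dashboard; SchemaFieldUrn entries are skipped for now. A minimal illustration of that filtering, using made-up URNs (everything below other than the imported classes is hypothetical):

    from datahub.metadata.schema_classes import EdgeClass
    from datahub.metadata.urns import DatasetUrn, SchemaFieldUrn

    # Hypothetical upstreams: one dataset URN and one schema field URN.
    upstream = [
        DatasetUrn.from_string(
            "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.orders,PROD)"
        ),
        SchemaFieldUrn.from_string(
            "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.orders,PROD),order_id)"
        ),
    ]
    # Mirrors _dataset_edges: schema fields are dropped, datasets become edges.
    edges = [
        EdgeClass(destinationUrn=urn.urn())
        for urn in upstream
        if isinstance(urn, DatasetUrn)
    ]
    assert len(edges) == 1  # only the dataset URN is kept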

datahub/ingestion/source/hex/model.py
@@ -1,6 +1,8 @@
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from datetime import datetime
-from typing import List, Optional
+from typing import List, Optional, Union
+
+from datahub.metadata.urns import DatasetUrn, SchemaFieldUrn


 @dataclass
@@ -51,6 +53,12 @@ class Project:
     creator: Optional[Owner] = None
     owner: Optional[Owner] = None
     analytics: Optional[Analytics] = None
+    upstream_datasets: List[Union[DatasetUrn, SchemaFieldUrn]] = field(
+        default_factory=list
+    )
+    upstream_schema_fields: List[Union[DatasetUrn, SchemaFieldUrn]] = field(
+        default_factory=list
+    )


 @dataclass

datahub/ingestion/source/hex/query_fetcher.py (new file)
@@ -0,0 +1,300 @@
+import logging
+import re
+from dataclasses import dataclass, field
+from datetime import datetime
+from typing import Dict, Iterable, List, Optional, Tuple
+
+from datahub.ingestion.api.source import SourceReport
+from datahub.ingestion.source.hex.constants import (
+    DATAHUB_API_PAGE_SIZE_DEFAULT,
+    HEX_PLATFORM_URN,
+)
+from datahub.metadata.schema_classes import QueryPropertiesClass, QuerySubjectsClass
+from datahub.metadata.urns import DatasetUrn, QueryUrn, SchemaFieldUrn
+from datahub.sdk.main_client import DataHubClient
+from datahub.sdk.search_filters import FilterDsl as F
+from datahub.utilities.time import datetime_to_ts_millis
+
+logger = logging.getLogger(__name__)
+
+# Pattern to extract both project_id and workspace_name from Hex metadata in SQL comments
+# Only match metadata with "context": "SCHEDULED_RUN" to filter out non-scheduled runs
+HEX_METADATA_PATTERN = r'-- Hex query metadata: \{.*?"context": "SCHEDULED_RUN".*?"project_id": "([^"]+)".*?"project_url": "https?://[^/]+/([^/]+)/hex/.*?\}'
+
+
+@dataclass
+class QueryResponse:
+    """This is the public response model for the HexQueryFetcher."""
+
+    urn: QueryUrn
+    hex_project_id: str
+    dataset_subjects: List[DatasetUrn] = field(default_factory=list)
+    schema_field_subjects: List[SchemaFieldUrn] = field(default_factory=list)
+
+
+@dataclass
+class HexQueryFetcherReport(SourceReport):
+    start_datetime: Optional[datetime] = None
+    end_datetime: Optional[datetime] = None
+    fetched_query_urns: int = 0
+    fetched_query_objects: int = 0
+    filtered_out_queries_missing_metadata: int = 0
+    filtered_out_queries_different_workspace: int = 0
+    filtered_out_queries_no_match: int = 0
+    filtered_out_queries_no_subjects: int = 0
+    total_queries: int = 0
+    total_dataset_subjects: int = 0
+    total_schema_field_subjects: int = 0
+    num_calls_fetch_query_entities: int = 0
+
+
+class HexQueryFetcher:
+    def __init__(
+        self,
+        datahub_client: DataHubClient,
+        workspace_name: str,
+        start_datetime: datetime,
+        end_datetime: datetime,
+        report: HexQueryFetcherReport,
+        page_size: int = DATAHUB_API_PAGE_SIZE_DEFAULT,
+    ):
+        self.datahub_client = datahub_client
+        self.workspace_name = workspace_name
+        self.start_datetime = start_datetime
+        self.end_datetime = end_datetime
+        self.report = report
+        self.page_size = page_size
+
+        self.report.start_datetime = start_datetime
+        self.report.end_datetime = end_datetime
+
+    def fetch(self) -> Iterable[QueryResponse]:
+        try:
+            query_urns = self._fetch_query_urns_filter_hex_and_last_modified()
+            assert all(isinstance(urn, QueryUrn) for urn in query_urns)
+            self.report.fetched_query_urns = len(query_urns)
+
+            entities_by_urn = self._fetch_query_entities(query_urns)
+            self.report.fetched_query_objects = len(entities_by_urn)
+        except Exception as e:
+            self.report.failure(
+                title="Error fetching Queries for lineage",
+                message="Error fetching Queries will result on missing lineage",
+                context=str(
+                    dict(
+                        workspace_name=self.workspace_name,
+                        start_datetime=self.start_datetime,
+                        end_datetime=self.end_datetime,
+                    )
+                ),
+                exc=e,
+            )
+        else:
+            if not query_urns or not entities_by_urn:
+                self.report.warning(
+                    title="No Queries found with Hex as origin",
+                    message="No lineage because of no Queries found with Hex as origin in the given time range; you may consider extending the time range to fetch more queries.",
+                    context=str(
+                        dict(
+                            workspace_name=self.workspace_name,
+                            start_datetime=self.start_datetime,
+                            end_datetime=self.end_datetime,
+                        )
+                    ),
+                )
+                return
+
+            for query_urn, (
+                query_properties,
+                query_subjects,
+            ) in entities_by_urn.items():
+                maybe_query_response = self._build_query_response(
+                    query_urn=query_urn,
+                    query_properties=query_properties,
+                    query_subjects=query_subjects,
+                )
+                if maybe_query_response:
+                    yield maybe_query_response
+
+    def _fetch_query_entities(
+        self, query_urns: List[QueryUrn]
+    ) -> Dict[
+        QueryUrn, Tuple[Optional[QueryPropertiesClass], Optional[QuerySubjectsClass]]
+    ]:
+        entities_by_urn: Dict[
+            QueryUrn,
+            Tuple[Optional[QueryPropertiesClass], Optional[QuerySubjectsClass]],
+        ] = {}
+        for i in range(0, len(query_urns), self.page_size):
+            batch = query_urns[i : i + self.page_size]
+
+            logger.debug(f"Fetching query entities for {len(batch)} queries: {batch}")
+            entities = self.datahub_client._graph.get_entities(
+                entity_name=QueryUrn.ENTITY_TYPE,
+                urns=[urn.urn() for urn in batch],
+                aspects=[
+                    QueryPropertiesClass.ASPECT_NAME,
+                    QuerySubjectsClass.ASPECT_NAME,
+                ],
+                with_system_metadata=False,
+            )
+            self.report.num_calls_fetch_query_entities += 1
+            logger.debug(f"Get entities response: {entities}")
+
+            for urn, entity in entities.items():
+                query_urn = QueryUrn.from_string(urn)
+
+                properties_tuple = entity.get(
+                    QueryPropertiesClass.ASPECT_NAME, (None, None)
+                )
+                query_properties: Optional[QueryPropertiesClass] = None
+                if properties_tuple and properties_tuple[0]:
+                    assert isinstance(properties_tuple[0], QueryPropertiesClass)
+                    query_properties = properties_tuple[0]
+
+                subjects_tuple = entity.get(
+                    QuerySubjectsClass.ASPECT_NAME, (None, None)
+                )
+                query_subjects: Optional[QuerySubjectsClass] = None
+                if subjects_tuple and subjects_tuple[0]:
+                    assert isinstance(subjects_tuple[0], QuerySubjectsClass)
+                    query_subjects = subjects_tuple[0]
+
+                entities_by_urn[query_urn] = (query_properties, query_subjects)
+
+        return entities_by_urn
+
+    def _fetch_query_urns_filter_hex_and_last_modified(self) -> List[QueryUrn]:
+        last_modified_start_at_millis = datetime_to_ts_millis(self.start_datetime)
+        last_modified_end_at_millis = datetime_to_ts_millis(self.end_datetime)
+
+        urns = self.datahub_client.search.get_urns(
+            filter=F.and_(
+                F.entity_type(QueryUrn.ENTITY_TYPE),
+                F.custom_filter("origin", "EQUAL", [HEX_PLATFORM_URN.urn()]),
+                F.custom_filter(
+                    "lastModifiedAt",
+                    "GREATER_THAN_OR_EQUAL_TO",
+                    [str(last_modified_start_at_millis)],
+                ),
+                F.custom_filter(
+                    "lastModifiedAt",
+                    "LESS_THAN_OR_EQUAL_TO",
+                    [str(last_modified_end_at_millis)],
+                ),
+            ),
+        )
+        logger.debug(f"Get URNS by filter: {urns}")
+        return [QueryUrn.from_string(urn.urn()) for urn in urns]
+
+    def _extract_hex_metadata(self, sql_statement: str) -> Optional[Tuple[str, str]]:
+        """
+        Extract project ID and workspace name from SQL statement.
+
+        Looks for Hex metadata in SQL comments in the format:
+        -- Hex query metadata: {"project_id": "...", "project_url": "https://app.hex.tech/{workspace_name}/hex/..."}
+
+        Example:
+        -- Hex query metadata: {"categories": ["Scratchpad"], "cell_type": "SQL", "connection": "Long Tail Companions", "context": "SCHEDULED_RUN", "project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf", "project_url": "https://app.hex.tech/acryl-partnership/hex/d73da67d-c87b-4dd8-9e7f-b79cb7f822cf/draft/logic?selectedCellId=67c38da0-e631-4005-9750-5bdae2a2ef3f"}
+
+        # TODO: Consider supporting multiline metadata format in the future:
+        # -- Hex query metadata: {
+        # -- "categories": ["Scratchpad"],
+        # -- "project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf",
+        # -- ...
+        # -- }
+
+        Returns:
+            A tuple of (project_id, workspace_name) if both are successfully extracted
+            None if extraction fails for any reason
+        """
+        # Extract both project_id and workspace name in a single regex operation
+        match = re.search(HEX_METADATA_PATTERN, sql_statement)
+
+        if not match:
+            self.report.filtered_out_queries_no_match += 1
+            return None
+
+        try:
+            project_id = match.group(1)
+            workspace_name = match.group(2)
+            return project_id, workspace_name
+        except (IndexError, AttributeError) as e:
+            self.report.warning(
+                title="Failed to extract information from Hex query metadata",
+                message="Failed to extract information from Hex query metadata will result on missing lineage",
+                context=sql_statement,
+                exc=e,
+            )
+
+        return None
+
+    def _build_query_response(
+        self,
+        query_urn: QueryUrn,
+        query_properties: Optional[QueryPropertiesClass],
+        query_subjects: Optional[QuerySubjectsClass],
+    ) -> Optional[QueryResponse]:
+        # Skip if missing required aspects
+        if (
+            not query_properties
+            or not query_properties.statement
+            or not query_properties.statement.value
+            or not query_subjects
+            or query_subjects.subjects is None  # empty list is allowed
+        ):
+            logger.debug(
+                f"Skipping query {query_urn} - missing required fields: {(query_properties, query_subjects)}"
+            )
+            self.report.filtered_out_queries_missing_metadata += 1
+            return None
+
+        # Extract hex metadata (project_id and workspace_name)
+        metadata_result = self._extract_hex_metadata(query_properties.statement.value)
+        if not metadata_result:
+            logger.debug(f"Skipping query {query_urn} - failed to extract Hex metadata")
+            self.report.filtered_out_queries_missing_metadata += 1
+            return None
+
+        hex_project_id, workspace_from_url = metadata_result
+
+        # Validate workspace
+        if workspace_from_url != self.workspace_name:
+            logger.debug(
+                f"Skipping query {query_urn} - workspace '{workspace_from_url}' doesn't match '{self.workspace_name}'"
+            )
+            self.report.filtered_out_queries_different_workspace += 1
+            return None
+
+        # Extract subjects
+        dataset_subjects: List[DatasetUrn] = []
+        schema_field_subjects: List[SchemaFieldUrn] = []
+        for subject in query_subjects.subjects:
+            if subject.entity and subject.entity.startswith("urn:li:dataset:"):
+                dataset_subjects.append(DatasetUrn.from_string(subject.entity))
+            elif subject.entity and subject.entity.startswith("urn:li:schemaField:"):
+                schema_field_subjects.append(SchemaFieldUrn.from_string(subject.entity))
+
+        if not dataset_subjects and not schema_field_subjects:
+            self.report.filtered_out_queries_no_subjects += 1
+            return None
+
+        # Create response
+        response = QueryResponse(
+            urn=query_urn,
+            hex_project_id=hex_project_id,
+            dataset_subjects=dataset_subjects,
+            schema_field_subjects=schema_field_subjects,
+        )
+        logger.debug(
+            f"Succesfully extracted {len(dataset_subjects)} dataset subjects and {len(schema_field_subjects)} schema field subjects for query {query_urn}: {dataset_subjects} {schema_field_subjects}"
+        )
+        self.report.total_queries += 1
+        self.report.total_dataset_subjects += len(dataset_subjects)
+        self.report.total_schema_field_subjects += len(schema_field_subjects)
+
+        logger.debug(
+            f"Processed query {query_urn} with Hex project ID {hex_project_id}"
+        )
+
+        return response
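
The HEX_METADATA_PATTERN above only matches single-line comments whose JSON contains "context": "SCHEDULED_RUN", and captures the project id and the workspace segment of project_url. A quick sanity check, reusing the sample comment from the _extract_hex_metadata docstring (the only assumption is that the module is importable as packaged in this release):

    import re

    from datahub.ingestion.source.hex.query_fetcher import HEX_METADATA_PATTERN

    sql = (
        '-- Hex query metadata: {"categories": ["Scratchpad"], "cell_type": "SQL", '
        '"connection": "Long Tail Companions", "context": "SCHEDULED_RUN", '
        '"project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf", '
        '"project_url": "https://app.hex.tech/acryl-partnership/hex/'
        'd73da67d-c87b-4dd8-9e7f-b79cb7f822cf/draft/logic?selectedCellId=67c38da0-e631-4005-9750-5bdae2a2ef3f"}'
    )
    match = re.search(HEX_METADATA_PATTERN, sql)
    assert match is not None
    project_id, workspace_name = match.group(1), match.group(2)
    # project_id == "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf"
    # workspace_name == "acryl-partnership"

A comment without the SCHEDULED_RUN context simply fails to match and is counted under filtered_out_queries_no_match.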

datahub/ingestion/source/iceberg/iceberg.py
@@ -2,6 +2,7 @@ import json
 import logging
 import threading
 import uuid
+from functools import partial
 from typing import Any, Dict, Iterable, List, Optional, Tuple

 from dateutil import parser as dateutil_parser
@@ -47,6 +48,12 @@ from datahub.emitter.mce_builder import (
 )
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import NamespaceKey
+from datahub.ingestion.api.auto_work_units.auto_dataset_properties_aspect import (
+    auto_patch_last_modified,
+)
+from datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size import (
+    EnsureAspectSizeProcessor,
+)
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SourceCapability,
@@ -57,6 +64,14 @@ from datahub.ingestion.api.decorators import (
     support_status,
 )
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
+from datahub.ingestion.api.source_helpers import (
+    AutoSystemMetadata,
+    auto_fix_duplicate_schema_field_paths,
+    auto_fix_empty_field_paths,
+    auto_lowercase_urns,
+    auto_materialize_referenced_tags_terms,
+    auto_workunit_reporter,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.extractor import schema_util
 from datahub.ingestion.source.common.subtypes import (
@@ -82,6 +97,8 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     SchemaMetadata,
 )
 from datahub.metadata.schema_classes import (
+    BrowsePathEntryClass,
+    BrowsePathsV2Class,
     ContainerClass,
     DataPlatformInstanceClass,
     DatasetPropertiesClass,
@@ -134,6 +151,7 @@ class IcebergSource(StatefulIngestionSourceBase):
         super().__init__(config, ctx)
         self.report: IcebergSourceReport = IcebergSourceReport()
         self.config: IcebergSourceConfig = config
+        self.ctx: PipelineContext = ctx

     @classmethod
     def create(cls, config_dict: Dict, ctx: PipelineContext) -> "IcebergSource":
@@ -141,8 +159,47 @@ class IcebergSource(StatefulIngestionSourceBase):
         return cls(config, ctx)

     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        # This source needs to overwrite standard `get_workunit_processor`, because it is unique in terms of usage
+        # of parallelism. Because of this, 2 processors won't work as expected:
+        # 1. browse_path_processor - it needs aspects for a single entity to be continuous - which is not guaranteed
+        #    in this source
+        # 2. automatic stamping with systemMetadata - in current implementation of the Source class this processor
+        #    would have been applied in a thread (single) shared between the source, processors and transformers.
+        #    Since the metadata scraping happens in separate threads, this could lead to difference between
+        #    time used by systemMetadata and actual time at which metadata was read
+        auto_lowercase_dataset_urns: Optional[MetadataWorkUnitProcessor] = None
+        if (
+            self.ctx.pipeline_config
+            and self.ctx.pipeline_config.source
+            and self.ctx.pipeline_config.source.config
+            and (
+                (
+                    hasattr(
+                        self.ctx.pipeline_config.source.config,
+                        "convert_urns_to_lowercase",
+                    )
+                    and self.ctx.pipeline_config.source.config.convert_urns_to_lowercase
+                )
+                or (
+                    hasattr(self.ctx.pipeline_config.source.config, "get")
+                    and self.ctx.pipeline_config.source.config.get(
+                        "convert_urns_to_lowercase"
+                    )
+                )
+            )
+        ):
+            auto_lowercase_dataset_urns = auto_lowercase_urns
+
         return [
-            *super().get_workunit_processors(),
+            auto_lowercase_dataset_urns,
+            auto_materialize_referenced_tags_terms,
+            partial(
+                auto_fix_duplicate_schema_field_paths, platform=self._infer_platform()
+            ),
+            partial(auto_fix_empty_field_paths, platform=self._infer_platform()),
+            partial(auto_workunit_reporter, self.get_report()),
+            auto_patch_last_modified,
+            EnsureAspectSizeProcessor(self.get_report()).ensure_aspect_size,
             StaleEntityRemovalHandler.create(
                 self, self.config, self.ctx
             ).workunit_processor,
@@ -208,6 +265,12 @@ class IcebergSource(StatefulIngestionSourceBase):
             )
             thread_local.local_catalog = self.config.get_catalog()

+        if not hasattr(thread_local, "stamping_processor"):
+            LOGGER.debug(
+                f"Didn't find stamping_processor in thread_local ({thread_local}), initializing new workunit processor"
+            )
+            thread_local.stamping_processor = AutoSystemMetadata(self.ctx)
+
         with PerfTimer() as timer:
             table = thread_local.local_catalog.load_table(dataset_path)
             time_taken = timer.elapsed_seconds()
@@ -224,9 +287,11 @@ class IcebergSource(StatefulIngestionSourceBase):
            for aspect in self._create_iceberg_table_aspects(
                dataset_name, table, namespace_urn
            ):
-                yield MetadataChangeProposalWrapper(
-                    entityUrn=dataset_urn, aspect=aspect
-                ).as_workunit()
+                yield thread_local.stamping_processor.stamp_wu(
+                    MetadataChangeProposalWrapper(
+                        entityUrn=dataset_urn, aspect=aspect
+                    ).as_workunit()
+                )
         except NoSuchPropertyException as e:
             self.report.warning(
                 title="Unable to process table",
@@ -308,6 +373,7 @@ class IcebergSource(StatefulIngestionSourceBase):
             return

         try:
+            stamping_processor = AutoSystemMetadata(self.ctx)
             namespace_ids = self._get_namespaces(catalog)
             namespaces: List[Tuple[Identifier, str]] = []
             for namespace in namespace_ids:
@@ -323,9 +389,11 @@ class IcebergSource(StatefulIngestionSourceBase):
                )
                namespaces.append((namespace, namespace_urn))
                for aspect in self._create_iceberg_namespace_aspects(namespace):
-                    yield MetadataChangeProposalWrapper(
-                        entityUrn=namespace_urn, aspect=aspect
-                    ).as_workunit()
+                    yield stamping_processor.stamp_wu(
+                        MetadataChangeProposalWrapper(
+                            entityUrn=namespace_urn, aspect=aspect
+                        ).as_workunit()
+                    )
             LOGGER.debug("Namespaces ingestion completed")
         except Exception as e:
             self.report.report_failure(
@@ -366,7 +434,9 @@ class IcebergSource(StatefulIngestionSourceBase):
            yield dataset_ownership

        yield self._create_schema_metadata(dataset_name, table)
-       yield self._get_dataplatform_instance_aspect()
+       dpi = self._get_dataplatform_instance_aspect()
+       yield dpi
+       yield self._create_browse_paths_aspect(dpi.instance, str(namespace_urn))
        yield ContainerClass(container=str(namespace_urn))

        self.report.report_table_processing_time(
@@ -377,6 +447,22 @@ class IcebergSource(StatefulIngestionSourceBase):
            profiler = IcebergProfiler(self.report, self.config.profiling)
            yield from profiler.profile_table(dataset_name, table)

+    def _create_browse_paths_aspect(
+        self,
+        platform_instance_urn: Optional[str] = None,
+        container_urn: Optional[str] = None,
+    ) -> BrowsePathsV2Class:
+        path = []
+        if platform_instance_urn:
+            path.append(
+                BrowsePathEntryClass(
+                    id=platform_instance_urn, urn=platform_instance_urn
+                )
+            )
+        if container_urn:
+            path.append(BrowsePathEntryClass(id=container_urn, urn=container_urn))
+        return BrowsePathsV2Class(path=path)
+
     def _get_partition_aspect(self, table: Table) -> Optional[str]:
         """Extracts partition information from the provided table and returns a JSON array representing the [partition spec](https://iceberg.apache.org/spec/?#partition-specs) of the table.
         Each element of the returned array represents a field in the [partition spec](https://iceberg.apache.org/spec/?#partition-specs) that follows [Appendix-C](https://iceberg.apache.org/spec/?#appendix-c-json-serialization) of the Iceberg specification.
@@ -425,23 +511,21 @@ class IcebergSource(StatefulIngestionSourceBase):
     def _get_dataset_properties_aspect(
         self, dataset_name: str, table: Table
     ) -> DatasetPropertiesClass:
-        additional_properties = {}
+        created: Optional[TimeStampClass] = None
         custom_properties = table.metadata.properties.copy()
         custom_properties["location"] = table.metadata.location
         custom_properties["format-version"] = str(table.metadata.format_version)
         custom_properties["partition-spec"] = str(self._get_partition_aspect(table))
+        last_modified: Optional[int] = table.metadata.last_updated_ms
         if table.current_snapshot():
             custom_properties["snapshot-id"] = str(table.current_snapshot().snapshot_id)
             custom_properties["manifest-list"] = table.current_snapshot().manifest_list
-            additional_properties["lastModified"] = TimeStampClass(
-                int(table.current_snapshot().timestamp_ms)
-            )
+            if not last_modified:
+                last_modified = int(table.current_snapshot().timestamp_ms)
         if "created-at" in custom_properties:
             try:
                 dt = dateutil_parser.isoparse(custom_properties["created-at"])
-                additional_properties["created"] = TimeStampClass(
-                    int(dt.timestamp() * 1000)
-                )
+                created = TimeStampClass(int(dt.timestamp() * 1000))
             except Exception as ex:
                 LOGGER.warning(
                     f"Exception while trying to parse creation date {custom_properties['created-at']}, ignoring: {ex}"
@@ -451,8 +535,10 @@ class IcebergSource(StatefulIngestionSourceBase):
             name=table.name()[-1],
             description=table.metadata.properties.get("comment", None),
             customProperties=custom_properties,
-            lastModified=additional_properties.get("lastModified"),
-            created=additional_properties.get("created"),
+            lastModified=TimeStampClass(last_modified)
+            if last_modified is not None
+            else None,
+            created=created,
             qualifiedName=dataset_name,
         )

@@ -530,7 +616,9 @@ class IcebergSource(StatefulIngestionSourceBase):
            name=namespace_repr, qualifiedName=namespace_repr, env=self.config.env
        )
        yield SubTypes(typeNames=[DatasetContainerSubTypes.NAMESPACE])
-       yield self._get_dataplatform_instance_aspect()
+       dpi = self._get_dataplatform_instance_aspect()
+       yield dpi
+       yield self._create_browse_paths_aspect(dpi.instance)


 class ToAvroSchemaIcebergVisitor(SchemaVisitorPerPrimitiveType[Dict[str, Any]]):
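
The get_workunit_processors comment above explains why the Iceberg source now stamps systemMetadata per worker thread instead of relying on the shared pipeline-level processor. A minimal sketch of that per-thread pattern, assuming a PipelineContext ctx is in scope (the stamp helper and its signature are my own, not part of the package):

    import threading

    from datahub.ingestion.api.source_helpers import AutoSystemMetadata
    from datahub.ingestion.api.workunit import MetadataWorkUnit

    _thread_local = threading.local()

    def stamp(ctx, wu: MetadataWorkUnit) -> MetadataWorkUnit:
        # Each worker thread lazily builds its own stamper, so the systemMetadata
        # timestamp reflects when this thread actually produced the workunit.
        if not hasattr(_thread_local, "stamping_processor"):
            _thread_local.stamping_processor = AutoSystemMetadata(ctx)
        return _thread_local.stamping_processor.stamp_wu(wu)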

datahub/ingestion/source/kafka/kafka.py
@@ -568,10 +568,7 @@ class KafkaSource(StatefulIngestionSourceBase, TestableSource):

         for config_key in KafkaTopicConfigKeys:
             try:
-                if (
-                    config_key in topic_config.keys()
-                    and topic_config[config_key] is not None
-                ):
+                if config_key in topic_config and topic_config[config_key] is not None:
                     config_value = topic_config[config_key].value
                     custom_props[config_key] = (
                         config_value

datahub/ingestion/source/kafka_connect/sink_connectors.py
@@ -197,7 +197,7 @@ class BigQuerySinkConnector(BaseConnector):
         for name in transform_names:
             transform = {"name": name}
             transforms.append(transform)
-            for key in self.connector_manifest.config.keys():
+            for key in self.connector_manifest.config:
                 if key.startswith(f"transforms.{name}."):
                     transform[key.replace(f"transforms.{name}.", "")] = (
                         self.connector_manifest.config[key]

datahub/ingestion/source/kafka_connect/source_connectors.py
@@ -121,7 +121,7 @@ class ConfluentJDBCSourceConnector(BaseConnector):
         for name in transform_names:
             transform = {"name": name}
             transforms.append(transform)
-            for key in self.connector_manifest.config.keys():
+            for key in self.connector_manifest.config:
                 if key.startswith(f"transforms.{name}."):
                     transform[key.replace(f"transforms.{name}.", "")] = (
                         self.connector_manifest.config[key]

datahub/ingestion/source/looker/looker_source.py
@@ -363,7 +363,7 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
         filters: MutableMapping[str, Any] = (
             query.filters if query.filters is not None else {}
         )
-        for field in filters.keys():
+        for field in filters:
             if field is None:
                 continue

@@ -877,8 +877,7 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
         # fine to set them to None.
         # TODO: Track project names for each explore.
         explores_to_fetch = [
-            (None, model, explore)
-            for (model, explore) in self.reachable_explores.keys()
+            (None, model, explore) for (model, explore) in self.reachable_explores
         ]
         explores_to_fetch.sort()