acryl-datahub 1.0.0.1rc7__py3-none-any.whl → 1.0.0.2rc2__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.


@@ -0,0 +1,297 @@
+ import logging
+ import re
+ from dataclasses import dataclass, field
+ from datetime import datetime
+ from typing import Dict, Iterable, List, Optional, Tuple
+
+ from datahub.ingestion.api.source import SourceReport
+ from datahub.ingestion.source.hex.constants import (
+     DATAHUB_API_PAGE_SIZE_DEFAULT,
+     HEX_PLATFORM_URN,
+ )
+ from datahub.metadata.schema_classes import QueryPropertiesClass, QuerySubjectsClass
+ from datahub.metadata.urns import DatasetUrn, QueryUrn, SchemaFieldUrn
+ from datahub.sdk.main_client import DataHubClient
+ from datahub.sdk.search_filters import FilterDsl as F
+ from datahub.utilities.time import datetime_to_ts_millis
+
+ logger = logging.getLogger(__name__)
+
+ # Pattern to extract both project_id and workspace_name from Hex metadata in SQL comments
+ HEX_METADATA_PATTERN = r'-- Hex query metadata: \{.*?"project_id": "([^"]+)".*?"project_url": "https?://[^/]+/([^/]+)/hex/.*?\}'
+
+
+ @dataclass
+ class QueryResponse:
+     """This is the public response model for the HexQueryFetcher."""
+
+     urn: QueryUrn
+     hex_project_id: str
+     dataset_subjects: List[DatasetUrn] = field(default_factory=list)
+     schema_field_subjects: List[SchemaFieldUrn] = field(default_factory=list)
+
+
+ @dataclass
+ class HexQueryFetcherReport(SourceReport):
+     start_datetime: Optional[datetime] = None
+     end_datetime: Optional[datetime] = None
+     fetched_query_urns: int = 0
+     fetched_query_objects: int = 0
+     filtered_out_queries_missing_metadata: int = 0
+     filtered_out_queries_different_workspace: int = 0
+     filtered_out_queries_no_subjects: int = 0
+     total_queries: int = 0
+     total_dataset_subjects: int = 0
+     total_schema_field_subjects: int = 0
+     num_calls_fetch_query_entities: int = 0
+
+
+ class HexQueryFetcher:
+     def __init__(
+         self,
+         datahub_client: DataHubClient,
+         workspace_name: str,
+         start_datetime: datetime,
+         end_datetime: datetime,
+         report: HexQueryFetcherReport,
+         page_size: int = DATAHUB_API_PAGE_SIZE_DEFAULT,
+     ):
+         self.datahub_client = datahub_client
+         self.workspace_name = workspace_name
+         self.start_datetime = start_datetime
+         self.end_datetime = end_datetime
+         self.report = report
+         self.page_size = page_size
+
+         self.report.start_datetime = start_datetime
+         self.report.end_datetime = end_datetime
+
+     def fetch(self) -> Iterable[QueryResponse]:
+         try:
+             query_urns = self._fetch_query_urns_filter_hex_and_last_modified()
+             assert all(isinstance(urn, QueryUrn) for urn in query_urns)
+             self.report.fetched_query_urns = len(query_urns)
+
+             entities_by_urn = self._fetch_query_entities(query_urns)
+             self.report.fetched_query_objects = len(entities_by_urn)
+         except Exception as e:
+             self.report.failure(
+                 title="Error fetching Queries for lineage",
+                 message="Error fetching Queries will result in missing lineage",
+                 context=str(
+                     dict(
+                         workspace_name=self.workspace_name,
+                         start_datetime=self.start_datetime,
+                         end_datetime=self.end_datetime,
+                     )
+                 ),
+                 exc=e,
+             )
+         else:
+             if not query_urns or not entities_by_urn:
+                 self.report.warning(
+                     title="No Queries found with Hex as origin",
+                     message="No lineage because no Queries with Hex as origin were found in the given time range; consider extending the time range to fetch more queries.",
+                     context=str(
+                         dict(
+                             workspace_name=self.workspace_name,
+                             start_datetime=self.start_datetime,
+                             end_datetime=self.end_datetime,
+                         )
+                     ),
+                 )
+                 return
+
+             for query_urn, (
+                 query_properties,
+                 query_subjects,
+             ) in entities_by_urn.items():
+                 maybe_query_response = self._build_query_response(
+                     query_urn=query_urn,
+                     query_properties=query_properties,
+                     query_subjects=query_subjects,
+                 )
+                 if maybe_query_response:
+                     yield maybe_query_response
+
+     def _fetch_query_entities(
+         self, query_urns: List[QueryUrn]
+     ) -> Dict[
+         QueryUrn, Tuple[Optional[QueryPropertiesClass], Optional[QuerySubjectsClass]]
+     ]:
+         entities_by_urn: Dict[
+             QueryUrn,
+             Tuple[Optional[QueryPropertiesClass], Optional[QuerySubjectsClass]],
+         ] = {}
+         for i in range(0, len(query_urns), self.page_size):
+             batch = query_urns[i : i + self.page_size]
+
+             logger.debug(f"Fetching query entities for {len(batch)} queries: {batch}")
+             entities = self.datahub_client._graph.get_entities(
+                 entity_name=QueryUrn.ENTITY_TYPE,
+                 urns=[urn.urn() for urn in batch],
+                 aspects=[
+                     QueryPropertiesClass.ASPECT_NAME,
+                     QuerySubjectsClass.ASPECT_NAME,
+                 ],
+                 with_system_metadata=False,
+             )
+             self.report.num_calls_fetch_query_entities += 1
+             logger.debug(f"Get entities response: {entities}")
+
+             for urn, entity in entities.items():
+                 query_urn = QueryUrn.from_string(urn)
+
+                 properties_tuple = entity.get(
+                     QueryPropertiesClass.ASPECT_NAME, (None, None)
+                 )
+                 query_properties: Optional[QueryPropertiesClass] = None
+                 if properties_tuple and properties_tuple[0]:
+                     assert isinstance(properties_tuple[0], QueryPropertiesClass)
+                     query_properties = properties_tuple[0]
+
+                 subjects_tuple = entity.get(
+                     QuerySubjectsClass.ASPECT_NAME, (None, None)
+                 )
+                 query_subjects: Optional[QuerySubjectsClass] = None
+                 if subjects_tuple and subjects_tuple[0]:
+                     assert isinstance(subjects_tuple[0], QuerySubjectsClass)
+                     query_subjects = subjects_tuple[0]
+
+                 entities_by_urn[query_urn] = (query_properties, query_subjects)
+
+         return entities_by_urn
+
+     def _fetch_query_urns_filter_hex_and_last_modified(self) -> List[QueryUrn]:
+         last_modified_start_at_millis = datetime_to_ts_millis(self.start_datetime)
+         last_modified_end_at_millis = datetime_to_ts_millis(self.end_datetime)
+
+         urns = self.datahub_client.search.get_urns(
+             filter=F.and_(
+                 F.entity_type(QueryUrn.ENTITY_TYPE),
+                 F.custom_filter("origin", "EQUAL", [HEX_PLATFORM_URN.urn()]),
+                 F.custom_filter(
+                     "lastModifiedAt",
+                     "GREATER_THAN_OR_EQUAL_TO",
+                     [str(last_modified_start_at_millis)],
+                 ),
+                 F.custom_filter(
+                     "lastModifiedAt",
+                     "LESS_THAN_OR_EQUAL_TO",
+                     [str(last_modified_end_at_millis)],
+                 ),
+             ),
+         )
+         logger.debug(f"Get URNS by filter: {urns}")
+         return [QueryUrn.from_string(urn.urn()) for urn in urns]
+
+     def _extract_hex_metadata(self, sql_statement: str) -> Optional[Tuple[str, str]]:
+         """
+         Extract project ID and workspace name from SQL statement.
+
+         Looks for Hex metadata in SQL comments in the format:
+         -- Hex query metadata: {"project_id": "...", "project_url": "https://app.hex.tech/{workspace_name}/hex/..."}
+
+         Example:
+         -- Hex query metadata: {"categories": ["Scratchpad"], "cell_type": "SQL", "connection": "Long Tail Companions", "context": "SCHEDULED_RUN", "project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf", "project_url": "https://app.hex.tech/acryl-partnership/hex/d73da67d-c87b-4dd8-9e7f-b79cb7f822cf/draft/logic?selectedCellId=67c38da0-e631-4005-9750-5bdae2a2ef3f"}
+
+         # TODO: Consider supporting multiline metadata format in the future:
+         # -- Hex query metadata: {
+         # -- "categories": ["Scratchpad"],
+         # -- "project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf",
+         # -- ...
+         # -- }
+
+         Returns:
+             A tuple of (project_id, workspace_name) if both are successfully extracted
+             None if extraction fails for any reason
+         """
+         # Extract both project_id and workspace name in a single regex operation
+         match = re.search(HEX_METADATA_PATTERN, sql_statement)
+
+         if not match:
+             return None
+
+         try:
+             project_id = match.group(1)
+             workspace_name = match.group(2)
+             return project_id, workspace_name
+         except (IndexError, AttributeError) as e:
+             self.report.warning(
+                 title="Failed to extract information from Hex query metadata",
+                 message="Failed to extract information from Hex query metadata; this will result in missing lineage",
+                 context=sql_statement,
+                 exc=e,
+             )
+
+         return None
+
+     def _build_query_response(
+         self,
+         query_urn: QueryUrn,
+         query_properties: Optional[QueryPropertiesClass],
+         query_subjects: Optional[QuerySubjectsClass],
+     ) -> Optional[QueryResponse]:
+         # Skip if missing required aspects
+         if (
+             not query_properties
+             or not query_properties.statement
+             or not query_properties.statement.value
+             or not query_subjects
+             or query_subjects.subjects is None  # empty list is allowed
+         ):
+             logger.debug(
+                 f"Skipping query {query_urn} - missing required fields: {(query_properties, query_subjects)}"
+             )
+             self.report.filtered_out_queries_missing_metadata += 1
+             return None
+
+         # Extract hex metadata (project_id and workspace_name)
+         metadata_result = self._extract_hex_metadata(query_properties.statement.value)
+         if not metadata_result:
+             logger.debug(f"Skipping query {query_urn} - failed to extract Hex metadata")
+             self.report.filtered_out_queries_missing_metadata += 1
+             return None
+
+         hex_project_id, workspace_from_url = metadata_result
+
+         # Validate workspace
+         if workspace_from_url != self.workspace_name:
+             logger.debug(
+                 f"Skipping query {query_urn} - workspace '{workspace_from_url}' doesn't match '{self.workspace_name}'"
+             )
+             self.report.filtered_out_queries_different_workspace += 1
+             return None
+
+         # Extract subjects
+         dataset_subjects: List[DatasetUrn] = []
+         schema_field_subjects: List[SchemaFieldUrn] = []
+         for subject in query_subjects.subjects:
+             if subject.entity and subject.entity.startswith("urn:li:dataset:"):
+                 dataset_subjects.append(DatasetUrn.from_string(subject.entity))
+             elif subject.entity and subject.entity.startswith("urn:li:schemaField:"):
+                 schema_field_subjects.append(SchemaFieldUrn.from_string(subject.entity))
+
+         if not dataset_subjects and not schema_field_subjects:
+             self.report.filtered_out_queries_no_subjects += 1
+             return None
+
+         # Create response
+         response = QueryResponse(
+             urn=query_urn,
+             hex_project_id=hex_project_id,
+             dataset_subjects=dataset_subjects,
+             schema_field_subjects=schema_field_subjects,
+         )
+         logger.debug(
+             f"Successfully extracted {len(dataset_subjects)} dataset subjects and {len(schema_field_subjects)} schema field subjects for query {query_urn}: {dataset_subjects} {schema_field_subjects}"
+         )
+         self.report.total_queries += 1
+         self.report.total_dataset_subjects += len(dataset_subjects)
+         self.report.total_schema_field_subjects += len(schema_field_subjects)
+
+         logger.debug(
+             f"Processed query {query_urn} with Hex project ID {hex_project_id}"
+         )
+
+         return response
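
For orientation, a minimal usage sketch of the new fetcher added above (illustrative only, not part of the diff): the `DataHubClient.from_env()` construction and the workspace name are assumptions, and `HexQueryFetcher` / `HexQueryFetcherReport` are assumed to be importable from the module shown above.

# Illustrative sketch: drive HexQueryFetcher over the last 24 hours of Hex-origin queries.
from datetime import datetime, timedelta, timezone

from datahub.sdk.main_client import DataHubClient

report = HexQueryFetcherReport()
end = datetime.now(tz=timezone.utc)
fetcher = HexQueryFetcher(
    datahub_client=DataHubClient.from_env(),  # assumption: client configured via environment
    workspace_name="my-hex-workspace",        # hypothetical workspace name
    start_datetime=end - timedelta(days=1),
    end_datetime=end,
    report=report,
)
for query_response in fetcher.fetch():
    # Each response ties a Hex project id to the datasets/columns its query touched.
    print(query_response.urn, query_response.hex_project_id, len(query_response.dataset_subjects))
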
@@ -425,23 +425,21 @@ class IcebergSource(StatefulIngestionSourceBase):
      def _get_dataset_properties_aspect(
          self, dataset_name: str, table: Table
      ) -> DatasetPropertiesClass:
-         additional_properties = {}
+         created: Optional[TimeStampClass] = None
          custom_properties = table.metadata.properties.copy()
          custom_properties["location"] = table.metadata.location
          custom_properties["format-version"] = str(table.metadata.format_version)
          custom_properties["partition-spec"] = str(self._get_partition_aspect(table))
+         last_modified: Optional[int] = table.metadata.last_updated_ms
          if table.current_snapshot():
              custom_properties["snapshot-id"] = str(table.current_snapshot().snapshot_id)
              custom_properties["manifest-list"] = table.current_snapshot().manifest_list
-             additional_properties["lastModified"] = TimeStampClass(
-                 int(table.current_snapshot().timestamp_ms)
-             )
+             if not last_modified:
+                 last_modified = int(table.current_snapshot().timestamp_ms)
          if "created-at" in custom_properties:
              try:
                  dt = dateutil_parser.isoparse(custom_properties["created-at"])
-                 additional_properties["created"] = TimeStampClass(
-                     int(dt.timestamp() * 1000)
-                 )
+                 created = TimeStampClass(int(dt.timestamp() * 1000))
              except Exception as ex:
                  LOGGER.warning(
                      f"Exception while trying to parse creation date {custom_properties['created-at']}, ignoring: {ex}"
@@ -451,8 +449,10 @@ class IcebergSource(StatefulIngestionSourceBase):
              name=table.name()[-1],
              description=table.metadata.properties.get("comment", None),
              customProperties=custom_properties,
-             lastModified=additional_properties.get("lastModified"),
-             created=additional_properties.get("created"),
+             lastModified=TimeStampClass(last_modified)
+             if last_modified is not None
+             else None,
+             created=created,
              qualifiedName=dataset_name,
          )
 
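
The net effect of the two Iceberg hunks above is a precedence rule for the dataset's lastModified timestamp: prefer table.metadata.last_updated_ms and fall back to the current snapshot's timestamp_ms. A minimal sketch of that rule in isolation (the helper name is illustrative, not from the source):

from typing import Optional

def resolve_last_modified_ms(
    last_updated_ms: Optional[int], snapshot_timestamp_ms: Optional[int]
) -> Optional[int]:
    # Prefer the table-level last_updated_ms; otherwise use the current snapshot's timestamp, if any.
    if last_updated_ms:
        return last_updated_ms
    if snapshot_timestamp_ms is not None:
        return int(snapshot_timestamp_ms)
    return None
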
@@ -16,7 +16,7 @@ from datahub.api.entities.dataprocess.dataprocess_instance import (
  )
  from datahub.configuration.source_common import EnvConfigMixin
  from datahub.emitter.mcp import MetadataChangeProposalWrapper
- from datahub.emitter.mcp_builder import ContainerKey
+ from datahub.emitter.mcp_builder import ExperimentKey
  from datahub.ingestion.api.common import PipelineContext
  from datahub.ingestion.api.decorators import (
      SupportStatus,
@@ -77,10 +77,6 @@ from datahub.sdk.dataset import Dataset
  T = TypeVar("T")
 
 
- class ContainerKeyWithId(ContainerKey):
-     id: str
-
-
  class MLflowConfig(StatefulIngestionConfigBase, EnvConfigMixin):
      tracking_uri: Optional[str] = Field(
          default=None,
@@ -252,7 +248,7 @@ class MLflowSource(StatefulIngestionSourceBase):
          self, experiment: Experiment
      ) -> Iterable[MetadataWorkUnit]:
          experiment_container = Container(
-             container_key=ContainerKeyWithId(
+             container_key=ExperimentKey(
                  platform=str(DataPlatformUrn(platform_name=self.platform)),
                  id=experiment.name,
              ),
@@ -470,7 +466,7 @@ class MLflowSource(StatefulIngestionSourceBase):
      def _get_run_workunits(
          self, experiment: Experiment, run: Run
      ) -> Iterable[MetadataWorkUnit]:
-         experiment_key = ContainerKeyWithId(
+         experiment_key = ExperimentKey(
              platform=str(DataPlatformUrn(self.platform)), id=experiment.name
          )
 
@@ -94,7 +94,7 @@ from datahub.metadata.schema_classes import (
      UpstreamLineageClass,
      ViewPropertiesClass,
  )
- from datahub.metadata.urns import ChartUrn
+ from datahub.metadata.urns import ChartUrn, DatasetUrn
  from datahub.sql_parsing.sqlglot_lineage import ColumnLineageInfo
  from datahub.utilities.dedup_list import deduplicate_list
  from datahub.utilities.urns.urn_iter import lowercase_dataset_urn
@@ -1083,6 +1083,7 @@ class Mapper:
          report: powerbi_data_classes.Report,
          chart_mcps: List[MetadataChangeProposalWrapper],
          user_mcps: List[MetadataChangeProposalWrapper],
+         dataset_edges: List[EdgeClass],
      ) -> List[MetadataChangeProposalWrapper]:
          """
          Map PowerBi report to Datahub dashboard
@@ -1104,6 +1105,7 @@ class Mapper:
              charts=chart_urn_list,
              lastModified=ChangeAuditStamps(),
              dashboardUrl=report.webUrl,
+             datasetEdges=dataset_edges,
          )
 
          info_mcp = self.new_mcp(
@@ -1197,12 +1199,23 @@ class Mapper:
          ds_mcps = self.to_datahub_dataset(report.dataset, workspace)
          chart_mcps = self.pages_to_chart(report.pages, workspace, ds_mcps)
 
+         # collect all upstream datasets; using a set to retain unique urns
+         dataset_urns = {
+             dataset.entityUrn
+             for dataset in ds_mcps
+             if dataset.entityType == DatasetUrn.ENTITY_TYPE and dataset.entityUrn
+         }
+         dataset_edges = [
+             EdgeClass(destinationUrn=dataset_urn) for dataset_urn in dataset_urns
+         ]
+
          # Let's convert report to datahub dashboard
          report_mcps = self.report_to_dashboard(
              workspace=workspace,
              report=report,
              chart_mcps=chart_mcps,
              user_mcps=user_mcps,
+             dataset_edges=dataset_edges,
          )
 
          # Now add MCPs in sequence
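
As a rough illustration of what the report-level dashboard aspect now carries (the title and URNs below are made-up example values; only datasetEdges is new relative to the previous release):

from datahub.metadata.schema_classes import (
    ChangeAuditStampsClass,
    DashboardInfoClass,
    EdgeClass,
)

# Hypothetical example values; the aspect shape mirrors the diff above.
dashboard_info = DashboardInfoClass(
    title="Example report",
    description="",
    lastModified=ChangeAuditStampsClass(),
    charts=["urn:li:chart:(powerbi,example-page)"],
    datasetEdges=[
        EdgeClass(
            destinationUrn="urn:li:dataset:(urn:li:dataPlatform:powerbi,example.table,PROD)"
        )
    ],
)
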
@@ -128,9 +128,10 @@ def get_table_comment(self, connection, table_name: str, schema: str = None, **k
      if catalog_name is None:
          raise exc.NoSuchTableError("catalog is required in connection")
      connector_name = get_catalog_connector_name(connection.engine, catalog_name)
-     if connector_name is None:
-         return {}
-     if connector_name in PROPERTIES_TABLE_SUPPORTED_CONNECTORS:
+     if (
+         connector_name is not None
+         and connector_name in PROPERTIES_TABLE_SUPPORTED_CONNECTORS
+     ):
          properties_table = self._get_full_table(f"{table_name}$properties", schema)
          query = f"SELECT * FROM {properties_table}"
          row = connection.execute(sql.text(query)).fetchone()
@@ -45,7 +45,6 @@ class StatefulStaleMetadataRemovalConfig(StatefulIngestionConfig):
          description="Prevents large amount of soft deletes & the state from committing from accidental changes to the source configuration if the relative change percent in entities compared to the previous state is above the 'fail_safe_threshold'.",
          le=100.0,
          ge=0.0,
-         hidden_from_docs=True,
      )
 