acryl-datahub 1.0.0.1rc6__py3-none-any.whl → 1.0.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (82)
  1. {acryl_datahub-1.0.0.1rc6.dist-info → acryl_datahub-1.0.0.2.dist-info}/METADATA +2557 -2557
  2. {acryl_datahub-1.0.0.1rc6.dist-info → acryl_datahub-1.0.0.2.dist-info}/RECORD +81 -79
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/datajob/dataflow.py +15 -0
  5. datahub/api/entities/datajob/datajob.py +17 -0
  6. datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
  7. datahub/api/entities/dataset/dataset.py +2 -2
  8. datahub/api/entities/structuredproperties/structuredproperties.py +1 -1
  9. datahub/cli/ingest_cli.py +4 -4
  10. datahub/cli/migrate.py +6 -6
  11. datahub/configuration/common.py +1 -1
  12. datahub/emitter/mcp_builder.py +4 -0
  13. datahub/errors.py +4 -0
  14. datahub/ingestion/api/common.py +9 -0
  15. datahub/ingestion/api/source.py +6 -2
  16. datahub/ingestion/api/source_helpers.py +35 -2
  17. datahub/ingestion/graph/client.py +122 -7
  18. datahub/ingestion/graph/filters.py +41 -16
  19. datahub/ingestion/run/pipeline.py +0 -6
  20. datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
  21. datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
  22. datahub/ingestion/source/cassandra/cassandra.py +1 -10
  23. datahub/ingestion/source/dynamodb/dynamodb.py +1 -1
  24. datahub/ingestion/source/fivetran/fivetran.py +1 -0
  25. datahub/ingestion/source/fivetran/fivetran_log_api.py +1 -1
  26. datahub/ingestion/source/hex/constants.py +5 -0
  27. datahub/ingestion/source/hex/hex.py +150 -22
  28. datahub/ingestion/source/hex/mapper.py +28 -2
  29. datahub/ingestion/source/hex/model.py +10 -2
  30. datahub/ingestion/source/hex/query_fetcher.py +300 -0
  31. datahub/ingestion/source/iceberg/iceberg.py +106 -18
  32. datahub/ingestion/source/kafka/kafka.py +1 -4
  33. datahub/ingestion/source/kafka_connect/sink_connectors.py +1 -1
  34. datahub/ingestion/source/kafka_connect/source_connectors.py +1 -1
  35. datahub/ingestion/source/looker/looker_source.py +2 -3
  36. datahub/ingestion/source/mlflow.py +6 -7
  37. datahub/ingestion/source/mode.py +2 -2
  38. datahub/ingestion/source/nifi.py +3 -3
  39. datahub/ingestion/source/openapi.py +3 -3
  40. datahub/ingestion/source/openapi_parser.py +8 -8
  41. datahub/ingestion/source/powerbi/config.py +1 -1
  42. datahub/ingestion/source/powerbi/powerbi.py +16 -3
  43. datahub/ingestion/source/redshift/profile.py +2 -2
  44. datahub/ingestion/source/sigma/sigma.py +6 -2
  45. datahub/ingestion/source/snowflake/snowflake_utils.py +1 -1
  46. datahub/ingestion/source/sql/stored_procedures/base.py +12 -1
  47. datahub/ingestion/source/sql/trino.py +4 -3
  48. datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
  49. datahub/ingestion/source/superset.py +108 -81
  50. datahub/ingestion/source/tableau/tableau.py +4 -4
  51. datahub/ingestion/source/tableau/tableau_common.py +2 -2
  52. datahub/ingestion/source/unity/source.py +1 -1
  53. datahub/ingestion/source/vertexai/vertexai.py +7 -7
  54. datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
  55. datahub/ingestion/transformer/add_dataset_ownership.py +1 -1
  56. datahub/ingestion/transformer/dataset_domain.py +1 -1
  57. datahub/lite/lite_util.py +2 -2
  58. datahub/metadata/_schema_classes.py +47 -2
  59. datahub/metadata/_urns/urn_defs.py +56 -0
  60. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
  61. datahub/metadata/schema.avsc +121 -85
  62. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  63. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  64. datahub/metadata/schemas/FormInfo.avsc +5 -0
  65. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  66. datahub/metadata/schemas/MetadataChangeEvent.avsc +6 -0
  67. datahub/metadata/schemas/MetadataChangeLog.avsc +3 -0
  68. datahub/metadata/schemas/MetadataChangeProposal.avsc +3 -0
  69. datahub/metadata/schemas/QueryProperties.avsc +4 -2
  70. datahub/metadata/schemas/SystemMetadata.avsc +86 -0
  71. datahub/sdk/search_client.py +81 -8
  72. datahub/sdk/search_filters.py +73 -11
  73. datahub/testing/mcp_diff.py +1 -1
  74. datahub/utilities/file_backed_collections.py +6 -6
  75. datahub/utilities/hive_schema_to_avro.py +2 -2
  76. datahub/utilities/ingest_utils.py +2 -2
  77. datahub/utilities/threaded_iterator_executor.py +16 -3
  78. datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
  79. {acryl_datahub-1.0.0.1rc6.dist-info → acryl_datahub-1.0.0.2.dist-info}/WHEEL +0 -0
  80. {acryl_datahub-1.0.0.1rc6.dist-info → acryl_datahub-1.0.0.2.dist-info}/entry_points.txt +0 -0
  81. {acryl_datahub-1.0.0.1rc6.dist-info → acryl_datahub-1.0.0.2.dist-info}/licenses/LICENSE +0 -0
  82. {acryl_datahub-1.0.0.1rc6.dist-info → acryl_datahub-1.0.0.2.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/cassandra/cassandra.py
@@ -123,16 +123,7 @@ class CassandraSource(StatefulIngestionSourceBase):
             ).workunit_processor,
         ]
 
-    def get_workunits_internal(
-        self,
-    ) -> Iterable[MetadataWorkUnit]:
-        for metadata in self._get_metadata():
-            if isinstance(metadata, MetadataWorkUnit):
-                yield metadata
-            else:
-                yield from metadata.as_workunits()
-
-    def _get_metadata(self) -> Iterable[Union[MetadataWorkUnit, Entity]]:
+    def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, Entity]]:
         if not self.cassandra_api.authenticate():
             return
         keyspaces: List[CassandraKeyspace] = self.cassandra_api.get_keyspaces()
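
The removed wrapper shows what the source used to do with the mixed stream: pass MetadataWorkUnit objects through and expand SDK entities via as_workunits(). After this change, get_workunits_internal yields the mixed stream directly and the conversion happens downstream. A minimal sketch of that normalization, mirroring the removed lines (the helper name is hypothetical, not the framework's actual function):

    from typing import Iterable, Union

    from datahub.ingestion.api.workunit import MetadataWorkUnit

    def flatten_to_workunits(
        stream: Iterable[Union[MetadataWorkUnit, "Entity"]],
    ) -> Iterable[MetadataWorkUnit]:
        for item in stream:
            if isinstance(item, MetadataWorkUnit):
                # already a workunit: pass it through unchanged
                yield item
            else:
                # SDK entity: expand it into workunits, as the removed code did
                yield from item.as_workunits()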

datahub/ingestion/source/dynamodb/dynamodb.py
@@ -362,7 +362,7 @@ class DynamoDBSource(StatefulIngestionSourceBase):
         if self.config.include_table_item is None:
             return
         dataset_name = f"{region}.{table_name}"
-        if dataset_name not in self.config.include_table_item.keys():
+        if dataset_name not in self.config.include_table_item:
             return
         primary_key_list = self.config.include_table_item.get(dataset_name)
         assert isinstance(primary_key_list, List)

datahub/ingestion/source/fivetran/fivetran.py
@@ -215,6 +215,7 @@ class FivetranSource(StatefulIngestionSourceBase):
         datajob = DataJob(
             id=connector.connector_id,
             flow_urn=dataflow_urn,
+            platform_instance=self.config.platform_instance,
             name=connector.connector_name,
             owners={owner_email} if owner_email else set(),
         )

datahub/ingestion/source/fivetran/fivetran_log_api.py
@@ -190,7 +190,7 @@ class FivetranLogAPI:
         jobs: List[Job] = []
         if connector_sync_log is None:
             return jobs
-        for sync_id in connector_sync_log.keys():
+        for sync_id in connector_sync_log:
             if len(connector_sync_log[sync_id]) != 2:
                 # If both sync-start and sync-end event log not present for this sync that means sync is still in progress
                 continue

datahub/ingestion/source/hex/constants.py
@@ -1,3 +1,8 @@
+from datahub.metadata.urns import DataPlatformUrn
+
 HEX_PLATFORM_NAME = "hex"
+HEX_PLATFORM_URN = DataPlatformUrn(platform_name=HEX_PLATFORM_NAME)
 HEX_API_BASE_URL_DEFAULT = "https://app.hex.tech/api/v1"
 HEX_API_PAGE_SIZE_DEFAULT = 100
+
+DATAHUB_API_PAGE_SIZE_DEFAULT = 100
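
The new HEX_PLATFORM_URN simply wraps the existing platform name in a typed URN. For reference, a small check of what it stringifies to, assuming the standard urn:li:dataPlatform:<name> encoding:

    from datahub.metadata.urns import DataPlatformUrn

    # Same construction as HEX_PLATFORM_URN above.
    assert DataPlatformUrn(platform_name="hex").urn() == "urn:li:dataPlatform:hex"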

datahub/ingestion/source/hex/hex.py
@@ -1,9 +1,12 @@
+from dataclasses import dataclass
+from datetime import datetime, timedelta, timezone
 from typing import Any, Dict, Iterable, List, Optional
 
-from pydantic import Field, SecretStr
+from pydantic import Field, SecretStr, root_validator
 from typing_extensions import assert_never
 
 from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.datetimes import parse_user_datetime
 from datahub.configuration.source_common import (
     EnvConfigMixin,
     PlatformInstanceConfigMixin,
@@ -21,12 +24,17 @@ from datahub.ingestion.api.source import MetadataWorkUnitProcessor
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.hex.api import HexApi, HexApiReport
 from datahub.ingestion.source.hex.constants import (
+    DATAHUB_API_PAGE_SIZE_DEFAULT,
     HEX_API_BASE_URL_DEFAULT,
     HEX_API_PAGE_SIZE_DEFAULT,
     HEX_PLATFORM_NAME,
 )
 from datahub.ingestion.source.hex.mapper import Mapper
 from datahub.ingestion.source.hex.model import Component, Project
+from datahub.ingestion.source.hex.query_fetcher import (
+    HexQueryFetcher,
+    HexQueryFetcherReport,
+)
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
     StaleEntityRemovalSourceReport,
@@ -34,9 +42,10 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
 )
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionConfigBase,
-    StatefulIngestionReport,
     StatefulIngestionSourceBase,
 )
+from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
+from datahub.sdk.main_client import DataHubClient
 
 
 class HexSourceConfig(
@@ -93,9 +102,73 @@ class HexSourceConfig(
         default=True,
         description="Set ownership identity from owner/creator email",
     )
+    include_lineage: bool = Field(
+        default=True,
+        description='Include Hex lineage, being fetched from DataHub. See "Limitations" section in the docs for more details about the limitations of this feature.',
+    )
+    lineage_start_time: Optional[datetime] = Field(
+        default=None,
+        description="Earliest date of lineage to consider. Default: 1 day before lineage end time. You can specify absolute time like '2023-01-01' or relative time like '-7 days' or '-7d'.",
+    )
+    lineage_end_time: Optional[datetime] = Field(
+        default=None,
+        description="Latest date of lineage to consider. Default: Current time in UTC. You can specify absolute time like '2023-01-01' or relative time like '-1 day' or '-1d'.",
+    )
+    datahub_page_size: int = Field(
+        default=DATAHUB_API_PAGE_SIZE_DEFAULT,
+        description="Number of items to fetch per DataHub API call.",
+    )
+
+    @root_validator(pre=True)
+    def validate_lineage_times(cls, data: Dict[str, Any]) -> Dict[str, Any]:
+        # lineage_end_time default = now
+        if "lineage_end_time" not in data or data["lineage_end_time"] is None:
+            data["lineage_end_time"] = datetime.now(tz=timezone.utc)
+        # if string is given, parse it
+        if isinstance(data["lineage_end_time"], str):
+            data["lineage_end_time"] = parse_user_datetime(data["lineage_end_time"])
+        # if no timezone is given, assume UTC
+        if data["lineage_end_time"].tzinfo is None:
+            data["lineage_end_time"] = data["lineage_end_time"].replace(
+                tzinfo=timezone.utc
+            )
+        # at this point, we ensure there is a non null datetime with UTC timezone for lineage_end_time
+        assert (
+            data["lineage_end_time"]
+            and isinstance(data["lineage_end_time"], datetime)
+            and data["lineage_end_time"].tzinfo is not None
+            and data["lineage_end_time"].tzinfo == timezone.utc
+        )
+
+        # lineage_start_time default = lineage_end_time - 1 day
+        if "lineage_start_time" not in data or data["lineage_start_time"] is None:
+            data["lineage_start_time"] = data["lineage_end_time"] - timedelta(days=1)
+        # if string is given, parse it
+        if isinstance(data["lineage_start_time"], str):
+            data["lineage_start_time"] = parse_user_datetime(data["lineage_start_time"])
+        # if no timezone is given, assume UTC
+        if data["lineage_start_time"].tzinfo is None:
+            data["lineage_start_time"] = data["lineage_start_time"].replace(
+                tzinfo=timezone.utc
+            )
+        # at this point, we ensure there is a non null datetime with UTC timezone for lineage_start_time
+        assert (
+            data["lineage_start_time"]
+            and isinstance(data["lineage_start_time"], datetime)
+            and data["lineage_start_time"].tzinfo is not None
+            and data["lineage_start_time"].tzinfo == timezone.utc
+        )
+
+        return data
 
 
-class HexReport(StaleEntityRemovalSourceReport, HexApiReport):
+@dataclass
+class HexReport(
+    StaleEntityRemovalSourceReport,
+    HexApiReport,
+    IngestionStageReport,
+    HexQueryFetcherReport,
+):
     pass
 
 
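
The pre-validator above resolves the lineage window before normal field parsing. A rough sketch of how the defaults come out when neither bound is configured (plain datetime arithmetic mirroring the validator, not a call into it):

    from datetime import datetime, timedelta, timezone

    lineage_end_time = datetime.now(tz=timezone.utc)           # default end: now, in UTC
    lineage_start_time = lineage_end_time - timedelta(days=1)  # default start: end - 1 day

    # A string such as "2023-01-01" or "-7 days" would instead go through
    # parse_user_datetime() and be coerced to UTC when no timezone is given.
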
@@ -110,7 +183,7 @@ class HexSource(StatefulIngestionSourceBase):
     def __init__(self, config: HexSourceConfig, ctx: PipelineContext):
         super().__init__(config, ctx)
         self.source_config = config
-        self.report = HexReport()
+        self.report: HexReport = HexReport()
         self.platform = HEX_PLATFORM_NAME
         self.hex_api = HexApi(
             report=self.report,
@@ -129,6 +202,28 @@ class HexSource(StatefulIngestionSourceBase):
             categories_as_tags=self.source_config.categories_as_tags,
             set_ownership_from_email=self.source_config.set_ownership_from_email,
         )
+        self.project_registry: Dict[str, Project] = {}
+        self.component_registry: Dict[str, Component] = {}
+
+        self.datahub_client: Optional[DataHubClient] = None
+        self.query_fetcher: Optional[HexQueryFetcher] = None
+        if self.source_config.include_lineage:
+            graph = ctx.require_graph("Lineage")
+            assert self.source_config.lineage_start_time and isinstance(
+                self.source_config.lineage_start_time, datetime
+            )
+            assert self.source_config.lineage_end_time and isinstance(
+                self.source_config.lineage_end_time, datetime
+            )
+            self.datahub_client = DataHubClient(graph=graph)
+            self.query_fetcher = HexQueryFetcher(
+                datahub_client=self.datahub_client,
+                workspace_name=self.source_config.workspace_name,
+                start_datetime=self.source_config.lineage_start_time,
+                end_datetime=self.source_config.lineage_end_time,
+                report=self.report,
+                page_size=self.source_config.datahub_page_size,
+            )
 
     @classmethod
     def create(cls, config_dict: Dict[str, Any], ctx: PipelineContext) -> "HexSource":
@@ -143,25 +238,58 @@ class HexSource(StatefulIngestionSourceBase):
             ).workunit_processor,
         ]
 
-    def get_report(self) -> StatefulIngestionReport:
+    def get_report(self) -> HexReport:
         return self.report
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
-        yield from self.mapper.map_workspace()
-
-        for project_or_component in self.hex_api.fetch_projects():
-            if isinstance(project_or_component, Project):
-                if self.source_config.project_title_pattern.allowed(
-                    project_or_component.title
-                ):
-                    yield from self.mapper.map_project(project=project_or_component)
-            elif isinstance(project_or_component, Component):
-                if (
-                    self.source_config.include_components
-                    and self.source_config.component_title_pattern.allowed(
+        with self.report.new_stage("Fetch Hex assets from Hex API"):
+            for project_or_component in self.hex_api.fetch_projects():
+                if isinstance(project_or_component, Project):
+                    if self.source_config.project_title_pattern.allowed(
                         project_or_component.title
-                    )
-                ):
-                    yield from self.mapper.map_component(component=project_or_component)
-            else:
-                assert_never(project_or_component)
+                    ):
+                        self.project_registry[project_or_component.id] = (
+                            project_or_component
+                        )
+                elif isinstance(project_or_component, Component):
+                    if (
+                        self.source_config.include_components
+                        and self.source_config.component_title_pattern.allowed(
+                            project_or_component.title
+                        )
+                    ):
+                        self.component_registry[project_or_component.id] = (
+                            project_or_component
+                        )
+                else:
+                    assert_never(project_or_component)
+
+        if self.source_config.include_lineage:
+            assert self.datahub_client and self.query_fetcher
+
+            with self.report.new_stage(
+                "Fetch Hex lineage from existing Queries in DataHub"
+            ):
+                for query_metadata in self.query_fetcher.fetch():
+                    project = self.project_registry.get(query_metadata.hex_project_id)
+                    if project:
+                        project.upstream_datasets.extend(
+                            query_metadata.dataset_subjects
+                        )
+                        project.upstream_schema_fields.extend(
+                            query_metadata.schema_field_subjects
+                        )
+                    else:
+                        self.report.report_warning(
+                            title="Missing project for lineage",
+                            message="Lineage missed because missed project, likely due to filter patterns or deleted project.",
+                            context=str(query_metadata),
+                        )
+
+        with self.report.new_stage("Emit"):
+            yield from self.mapper.map_workspace()
+
+            for project in self.project_registry.values():
+                yield from self.mapper.map_project(project=project)
+            for component in self.component_registry.values():
+                yield from self.mapper.map_component(component=component)

datahub/ingestion/source/hex/mapper.py
@@ -1,6 +1,6 @@
 import logging
 from datetime import datetime
-from typing import Iterable, List, Optional, Tuple
+from typing import Iterable, List, Optional, Tuple, Union
 
 from datahub._codegen.aspect import (
     _Aspect,  # TODO: is there a better import than this one?
@@ -46,6 +46,7 @@ from datahub.metadata.schema_classes import (
     DashboardInfoClass,
     DashboardUsageStatisticsClass,
     DataPlatformInstanceClass,
+    EdgeClass,
     GlobalTagsClass,
     OwnerClass,
     OwnershipClass,
@@ -53,7 +54,14 @@ from datahub.metadata.schema_classes import (
     TagAssociationClass,
     TimeWindowSizeClass,
 )
-from datahub.metadata.urns import ContainerUrn, CorpUserUrn, DashboardUrn, Urn
+from datahub.metadata.urns import (
+    ContainerUrn,
+    CorpUserUrn,
+    DashboardUrn,
+    DatasetUrn,
+    SchemaFieldUrn,
+    Urn,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -116,6 +124,8 @@ class Mapper:
             ),
             externalUrl=f"{self._base_url}/{self._workspace_name}/hex/{project.id}",
             customProperties=dict(id=project.id),
+            datasetEdges=self._dataset_edges(project.upstream_datasets),
+            # TODO: support schema field upstream, maybe InputFields?
         )
 
         subtypes = SubTypesClass(
@@ -343,6 +353,22 @@ class Mapper:
             else None,
         )
 
+    def _dataset_edges(
+        self, upstream: List[Union[DatasetUrn, SchemaFieldUrn]]
+    ) -> Optional[List[EdgeClass]]:
+        # TBC: is there support for CLL in Dashboards? for the moment, skip SchemaFieldUrns
+        return (
+            [
+                EdgeClass(
+                    destinationUrn=upstream_urn.urn(),
+                )
+                for upstream_urn in upstream
+                if isinstance(upstream_urn, DatasetUrn)
+            ]
+            if upstream
+            else None
+        )
+
     def _yield_mcps(
         self, entity_urn: Urn, aspects: List[Optional[_Aspect]]
     ) -> Iterable[MetadataWorkUnit]:
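
_dataset_edges keeps only dataset-level upstreams for the dashboard's datasetEdges and skips schema-field URNs for now (see the TBC note above). A hypothetical call, assuming a Mapper instance named mapper and that DatasetUrn/SchemaFieldUrn accept (platform, name, env) and (parent, field_path) respectively:

    from datahub.metadata.urns import DatasetUrn, SchemaFieldUrn

    table = DatasetUrn(platform="snowflake", name="db.schema.table", env="PROD")
    column = SchemaFieldUrn(table, "col_a")

    # Only the DatasetUrn becomes an EdgeClass; the SchemaFieldUrn is dropped.
    edges = mapper._dataset_edges([table, column])
    # -> [EdgeClass(destinationUrn="urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.table,PROD)")]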

datahub/ingestion/source/hex/model.py
@@ -1,6 +1,8 @@
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from datetime import datetime
-from typing import List, Optional
+from typing import List, Optional, Union
+
+from datahub.metadata.urns import DatasetUrn, SchemaFieldUrn
 
 
 @dataclass
@@ -51,6 +53,12 @@ class Project:
     creator: Optional[Owner] = None
     owner: Optional[Owner] = None
    analytics: Optional[Analytics] = None
+    upstream_datasets: List[Union[DatasetUrn, SchemaFieldUrn]] = field(
+        default_factory=list
+    )
+    upstream_schema_fields: List[Union[DatasetUrn, SchemaFieldUrn]] = field(
+        default_factory=list
+    )
 
 
 @dataclass

datahub/ingestion/source/hex/query_fetcher.py
@@ -0,0 +1,300 @@
+import logging
+import re
+from dataclasses import dataclass, field
+from datetime import datetime
+from typing import Dict, Iterable, List, Optional, Tuple
+
+from datahub.ingestion.api.source import SourceReport
+from datahub.ingestion.source.hex.constants import (
+    DATAHUB_API_PAGE_SIZE_DEFAULT,
+    HEX_PLATFORM_URN,
+)
+from datahub.metadata.schema_classes import QueryPropertiesClass, QuerySubjectsClass
+from datahub.metadata.urns import DatasetUrn, QueryUrn, SchemaFieldUrn
+from datahub.sdk.main_client import DataHubClient
+from datahub.sdk.search_filters import FilterDsl as F
+from datahub.utilities.time import datetime_to_ts_millis
+
+logger = logging.getLogger(__name__)
+
+# Pattern to extract both project_id and workspace_name from Hex metadata in SQL comments
+# Only match metadata with "context": "SCHEDULED_RUN" to filter out non-scheduled runs
+HEX_METADATA_PATTERN = r'-- Hex query metadata: \{.*?"context": "SCHEDULED_RUN".*?"project_id": "([^"]+)".*?"project_url": "https?://[^/]+/([^/]+)/hex/.*?\}'
+
+
+@dataclass
+class QueryResponse:
+    """This is the public response model for the HexQueryFetcher."""
+
+    urn: QueryUrn
+    hex_project_id: str
+    dataset_subjects: List[DatasetUrn] = field(default_factory=list)
+    schema_field_subjects: List[SchemaFieldUrn] = field(default_factory=list)
+
+
+@dataclass
+class HexQueryFetcherReport(SourceReport):
+    start_datetime: Optional[datetime] = None
+    end_datetime: Optional[datetime] = None
+    fetched_query_urns: int = 0
+    fetched_query_objects: int = 0
+    filtered_out_queries_missing_metadata: int = 0
+    filtered_out_queries_different_workspace: int = 0
+    filtered_out_queries_no_match: int = 0
+    filtered_out_queries_no_subjects: int = 0
+    total_queries: int = 0
+    total_dataset_subjects: int = 0
+    total_schema_field_subjects: int = 0
+    num_calls_fetch_query_entities: int = 0
+
+
+class HexQueryFetcher:
+    def __init__(
+        self,
+        datahub_client: DataHubClient,
+        workspace_name: str,
+        start_datetime: datetime,
+        end_datetime: datetime,
+        report: HexQueryFetcherReport,
+        page_size: int = DATAHUB_API_PAGE_SIZE_DEFAULT,
+    ):
+        self.datahub_client = datahub_client
+        self.workspace_name = workspace_name
+        self.start_datetime = start_datetime
+        self.end_datetime = end_datetime
+        self.report = report
+        self.page_size = page_size
+
+        self.report.start_datetime = start_datetime
+        self.report.end_datetime = end_datetime
+
+    def fetch(self) -> Iterable[QueryResponse]:
+        try:
+            query_urns = self._fetch_query_urns_filter_hex_and_last_modified()
+            assert all(isinstance(urn, QueryUrn) for urn in query_urns)
+            self.report.fetched_query_urns = len(query_urns)
+
+            entities_by_urn = self._fetch_query_entities(query_urns)
+            self.report.fetched_query_objects = len(entities_by_urn)
+        except Exception as e:
+            self.report.failure(
+                title="Error fetching Queries for lineage",
+                message="Error fetching Queries will result on missing lineage",
+                context=str(
+                    dict(
+                        workspace_name=self.workspace_name,
+                        start_datetime=self.start_datetime,
+                        end_datetime=self.end_datetime,
+                    )
+                ),
+                exc=e,
+            )
+        else:
+            if not query_urns or not entities_by_urn:
+                self.report.warning(
+                    title="No Queries found with Hex as origin",
+                    message="No lineage because of no Queries found with Hex as origin in the given time range; you may consider extending the time range to fetch more queries.",
+                    context=str(
+                        dict(
+                            workspace_name=self.workspace_name,
+                            start_datetime=self.start_datetime,
+                            end_datetime=self.end_datetime,
+                        )
+                    ),
+                )
+                return
+
+            for query_urn, (
+                query_properties,
+                query_subjects,
+            ) in entities_by_urn.items():
+                maybe_query_response = self._build_query_response(
+                    query_urn=query_urn,
+                    query_properties=query_properties,
+                    query_subjects=query_subjects,
+                )
+                if maybe_query_response:
+                    yield maybe_query_response
+
+    def _fetch_query_entities(
+        self, query_urns: List[QueryUrn]
+    ) -> Dict[
+        QueryUrn, Tuple[Optional[QueryPropertiesClass], Optional[QuerySubjectsClass]]
+    ]:
+        entities_by_urn: Dict[
+            QueryUrn,
+            Tuple[Optional[QueryPropertiesClass], Optional[QuerySubjectsClass]],
+        ] = {}
+        for i in range(0, len(query_urns), self.page_size):
+            batch = query_urns[i : i + self.page_size]
+
+            logger.debug(f"Fetching query entities for {len(batch)} queries: {batch}")
+            entities = self.datahub_client._graph.get_entities(
+                entity_name=QueryUrn.ENTITY_TYPE,
+                urns=[urn.urn() for urn in batch],
+                aspects=[
+                    QueryPropertiesClass.ASPECT_NAME,
+                    QuerySubjectsClass.ASPECT_NAME,
+                ],
+                with_system_metadata=False,
+            )
+            self.report.num_calls_fetch_query_entities += 1
+            logger.debug(f"Get entities response: {entities}")
+
+            for urn, entity in entities.items():
+                query_urn = QueryUrn.from_string(urn)
+
+                properties_tuple = entity.get(
+                    QueryPropertiesClass.ASPECT_NAME, (None, None)
+                )
+                query_properties: Optional[QueryPropertiesClass] = None
+                if properties_tuple and properties_tuple[0]:
+                    assert isinstance(properties_tuple[0], QueryPropertiesClass)
+                    query_properties = properties_tuple[0]
+
+                subjects_tuple = entity.get(
+                    QuerySubjectsClass.ASPECT_NAME, (None, None)
+                )
+                query_subjects: Optional[QuerySubjectsClass] = None
+                if subjects_tuple and subjects_tuple[0]:
+                    assert isinstance(subjects_tuple[0], QuerySubjectsClass)
+                    query_subjects = subjects_tuple[0]
+
+                entities_by_urn[query_urn] = (query_properties, query_subjects)
+
+        return entities_by_urn
+
+    def _fetch_query_urns_filter_hex_and_last_modified(self) -> List[QueryUrn]:
+        last_modified_start_at_millis = datetime_to_ts_millis(self.start_datetime)
+        last_modified_end_at_millis = datetime_to_ts_millis(self.end_datetime)
+
+        urns = self.datahub_client.search.get_urns(
+            filter=F.and_(
+                F.entity_type(QueryUrn.ENTITY_TYPE),
+                F.custom_filter("origin", "EQUAL", [HEX_PLATFORM_URN.urn()]),
+                F.custom_filter(
+                    "lastModifiedAt",
+                    "GREATER_THAN_OR_EQUAL_TO",
+                    [str(last_modified_start_at_millis)],
+                ),
+                F.custom_filter(
+                    "lastModifiedAt",
+                    "LESS_THAN_OR_EQUAL_TO",
+                    [str(last_modified_end_at_millis)],
+                ),
+            ),
+        )
+        logger.debug(f"Get URNS by filter: {urns}")
+        return [QueryUrn.from_string(urn.urn()) for urn in urns]
+
+    def _extract_hex_metadata(self, sql_statement: str) -> Optional[Tuple[str, str]]:
+        """
+        Extract project ID and workspace name from SQL statement.
+
+        Looks for Hex metadata in SQL comments in the format:
+        -- Hex query metadata: {"project_id": "...", "project_url": "https://app.hex.tech/{workspace_name}/hex/..."}
+
+        Example:
+        -- Hex query metadata: {"categories": ["Scratchpad"], "cell_type": "SQL", "connection": "Long Tail Companions", "context": "SCHEDULED_RUN", "project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf", "project_url": "https://app.hex.tech/acryl-partnership/hex/d73da67d-c87b-4dd8-9e7f-b79cb7f822cf/draft/logic?selectedCellId=67c38da0-e631-4005-9750-5bdae2a2ef3f"}
+
+        # TODO: Consider supporting multiline metadata format in the future:
+        # -- Hex query metadata: {
+        # --   "categories": ["Scratchpad"],
+        # --   "project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf",
+        # --   ...
+        # -- }
+
+        Returns:
+            A tuple of (project_id, workspace_name) if both are successfully extracted
+            None if extraction fails for any reason
+        """
+        # Extract both project_id and workspace name in a single regex operation
+        match = re.search(HEX_METADATA_PATTERN, sql_statement)
+
+        if not match:
+            self.report.filtered_out_queries_no_match += 1
+            return None
+
+        try:
+            project_id = match.group(1)
+            workspace_name = match.group(2)
+            return project_id, workspace_name
+        except (IndexError, AttributeError) as e:
+            self.report.warning(
+                title="Failed to extract information from Hex query metadata",
+                message="Failed to extract information from Hex query metadata will result on missing lineage",
+                context=sql_statement,
+                exc=e,
+            )
+
+        return None
+
+    def _build_query_response(
+        self,
+        query_urn: QueryUrn,
+        query_properties: Optional[QueryPropertiesClass],
+        query_subjects: Optional[QuerySubjectsClass],
+    ) -> Optional[QueryResponse]:
+        # Skip if missing required aspects
+        if (
+            not query_properties
+            or not query_properties.statement
+            or not query_properties.statement.value
+            or not query_subjects
+            or query_subjects.subjects is None  # empty list is allowed
+        ):
+            logger.debug(
+                f"Skipping query {query_urn} - missing required fields: {(query_properties, query_subjects)}"
+            )
+            self.report.filtered_out_queries_missing_metadata += 1
+            return None
+
+        # Extract hex metadata (project_id and workspace_name)
+        metadata_result = self._extract_hex_metadata(query_properties.statement.value)
+        if not metadata_result:
+            logger.debug(f"Skipping query {query_urn} - failed to extract Hex metadata")
+            self.report.filtered_out_queries_missing_metadata += 1
+            return None
+
+        hex_project_id, workspace_from_url = metadata_result
+
+        # Validate workspace
+        if workspace_from_url != self.workspace_name:
+            logger.debug(
+                f"Skipping query {query_urn} - workspace '{workspace_from_url}' doesn't match '{self.workspace_name}'"
+            )
+            self.report.filtered_out_queries_different_workspace += 1
+            return None
+
+        # Extract subjects
+        dataset_subjects: List[DatasetUrn] = []
+        schema_field_subjects: List[SchemaFieldUrn] = []
+        for subject in query_subjects.subjects:
+            if subject.entity and subject.entity.startswith("urn:li:dataset:"):
+                dataset_subjects.append(DatasetUrn.from_string(subject.entity))
+            elif subject.entity and subject.entity.startswith("urn:li:schemaField:"):
+                schema_field_subjects.append(SchemaFieldUrn.from_string(subject.entity))
+
+        if not dataset_subjects and not schema_field_subjects:
+            self.report.filtered_out_queries_no_subjects += 1
+            return None
+
+        # Create response
+        response = QueryResponse(
+            urn=query_urn,
+            hex_project_id=hex_project_id,
+            dataset_subjects=dataset_subjects,
+            schema_field_subjects=schema_field_subjects,
+        )
+        logger.debug(
+            f"Succesfully extracted {len(dataset_subjects)} dataset subjects and {len(schema_field_subjects)} schema field subjects for query {query_urn}: {dataset_subjects} {schema_field_subjects}"
+        )
+        self.report.total_queries += 1
+        self.report.total_dataset_subjects += len(dataset_subjects)
+        self.report.total_schema_field_subjects += len(schema_field_subjects)
+
+        logger.debug(
+            f"Processed query {query_urn} with Hex project ID {hex_project_id}"
+        )
+
+        return response
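
HEX_METADATA_PATTERN at the top of the new module does the heavy lifting for lineage: it pulls the Hex project id and workspace name out of the comment that Hex appends to scheduled-run queries. A quick check against the example comment from the _extract_hex_metadata docstring:

    import re

    from datahub.ingestion.source.hex.query_fetcher import HEX_METADATA_PATTERN

    sql = (
        '-- Hex query metadata: {"categories": ["Scratchpad"], "cell_type": "SQL", '
        '"connection": "Long Tail Companions", "context": "SCHEDULED_RUN", '
        '"project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf", '
        '"project_url": "https://app.hex.tech/acryl-partnership/hex/'
        'd73da67d-c87b-4dd8-9e7f-b79cb7f822cf/draft/logic?selectedCellId='
        '67c38da0-e631-4005-9750-5bdae2a2ef3f"}'
    )

    match = re.search(HEX_METADATA_PATTERN, sql)
    assert match is not None
    assert match.group(1) == "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf"  # project_id
    assert match.group(2) == "acryl-partnership"                     # workspace name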