acryl-datahub 1.0.0.2rc1__py3-none-any.whl → 1.0.0.2rc2__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.


acryl_datahub-1.0.0.2rc1.dist-info/RECORD → acryl_datahub-1.0.0.2rc2.dist-info/RECORD RENAMED
@@ -1,7 +1,7 @@
- acryl_datahub-1.0.0.2rc1.dist-info/licenses/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
+ acryl_datahub-1.0.0.2rc2.dist-info/licenses/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
  datahub/__init__.py,sha256=aq_i5lVREmoLfYIqcx_pEQicO855YlhD19tWc1eZZNI,59
  datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
- datahub/_version.py,sha256=vzyBMegu61oWM-Gce9R3y5zLfMrINPSGDEFO-MHhthA,323
+ datahub/_version.py,sha256=lFv-ImaIXKL_EDY2GlHJHg9iVkj13C_xihZRNnxH3M8,323
  datahub/entrypoints.py,sha256=2TYgHhs3sCxJlojIHjqfxzt3_ImPwPzq4vBtsUuMqu4,8885
  datahub/errors.py,sha256=BzKdcmYseHOt36zfjJXc17WNutFhp9Y23cU_L6cIkxc,612
  datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -217,7 +217,7 @@ datahub/ingestion/source/redash.py,sha256=YxjSad-X_wPmxYH8dJmFz_VCFhiLTCTSlK99Wd
  datahub/ingestion/source/salesforce.py,sha256=CQtDFv1OsbC1vyzNbKOc6GxhFQ5GdYj45hgAF0-oIcw,40487
  datahub/ingestion/source/source_registry.py,sha256=a2mLjJPLkSI-gYCTb_7U7Jo4D8jGknNQ_yScPIihXFk,1208
  datahub/ingestion/source/sql_queries.py,sha256=Ip7UZub7fgMh7P5jL_zJPY7lSkc9GGTy8GJ8lqZrcsE,9502
- datahub/ingestion/source/superset.py,sha256=FRZ7cCURW6NHUOKaFicdAZq2caXektvO9rJE4tO9scU,40336
+ datahub/ingestion/source/superset.py,sha256=bMfvm9HgUoS3T7BjHsDrrOodc8iBRrJRQYv2D66bABo,41194
  datahub/ingestion/source/abs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/abs/config.py,sha256=mBQe0JTaP-Rcv4HnMUUySoYbSr4r3jDEMioxaXHnxXU,6709
  datahub/ingestion/source/abs/datalake_profiler_config.py,sha256=Rkf64evufyVGPiE4VK8QAjzBiJFu85tOGMmJ0lJZ2Og,3600
@@ -328,10 +328,11 @@ datahub/ingestion/source/grafana/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRk
  datahub/ingestion/source/grafana/grafana_source.py,sha256=3pU3xodPgS5lmnjuQ_u7F0XPzD_Y8MnPlMxRJ86qz4g,4960
  datahub/ingestion/source/hex/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/hex/api.py,sha256=JfFPD8O4z16fwZE_BdX5aCQztEq-tbzxJJ7aofH4DE4,12274
- datahub/ingestion/source/hex/constants.py,sha256=NuBjxgJpIt598Cyn_9IcZ158PqBdn5vNjw8T92sTQck,115
- datahub/ingestion/source/hex/hex.py,sha256=DPpsi5e-sdUgbS0Okyvx1mvc00Adu47zA65oFnRP74A,6510
- datahub/ingestion/source/hex/mapper.py,sha256=6dsGvvhPAOAbAG1ayxLwipgJGt1q7YanWYfMX3rZeiM,12603
- datahub/ingestion/source/hex/model.py,sha256=hmMfOLEGZcKjwy2DW29OPf_9_Q_TesgnUTCen2br_fA,1471
+ datahub/ingestion/source/hex/constants.py,sha256=8hUTMWyG5keTNfXoLu_Dh413Hw_mGGJX1atiiDZyKtg,271
+ datahub/ingestion/source/hex/hex.py,sha256=PIRl8fPkKtlHV7cqR4H8RKVYdTLgEFXHFzc3QAqJLhE,12733
+ datahub/ingestion/source/hex/mapper.py,sha256=N3mTlEcrOmhv9ia1dnHGFgFJD2ddyTtU3H5IUbb-UxU,13344
+ datahub/ingestion/source/hex/model.py,sha256=S9bUhfFcjzuio2dBS6HzSyRVPiSJvRvMQ0qyVrjV5-E,1766
+ datahub/ingestion/source/hex/query_fetcher.py,sha256=5r065vL7XohcgZ_fj-1h6o8cxrPin37IeYsC99GU6LA,12287
  datahub/ingestion/source/iceberg/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/iceberg/iceberg.py,sha256=PhLLXWgBdfZ3hL7LgLvDr6aTK-QKmiZCFNz5jD-mxZM,30773
  datahub/ingestion/source/iceberg/iceberg_common.py,sha256=VGosqYPmn_j6GETSnDHZ8Ay1BVOedmx2x5LHxw16I3A,12278
@@ -1043,8 +1044,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
  datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
  datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
  datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
- acryl_datahub-1.0.0.2rc1.dist-info/METADATA,sha256=IE26ZK9HREmhmiMf2zQds-JatSIyAh9gcaVjGyOAGLE,176849
- acryl_datahub-1.0.0.2rc1.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
- acryl_datahub-1.0.0.2rc1.dist-info/entry_points.txt,sha256=o3mDeJXSKhsy7XLkuogihraiabBdLn9HaizYXPrxmk0,9710
- acryl_datahub-1.0.0.2rc1.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
- acryl_datahub-1.0.0.2rc1.dist-info/RECORD,,
+ acryl_datahub-1.0.0.2rc2.dist-info/METADATA,sha256=VuKbVh0Lt8z7Jik8lZ39CF56PZHqn_oIwn2LBmYzrVc,176849
+ acryl_datahub-1.0.0.2rc2.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+ acryl_datahub-1.0.0.2rc2.dist-info/entry_points.txt,sha256=o3mDeJXSKhsy7XLkuogihraiabBdLn9HaizYXPrxmk0,9710
+ acryl_datahub-1.0.0.2rc2.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+ acryl_datahub-1.0.0.2rc2.dist-info/RECORD,,
datahub/_version.py CHANGED
@@ -1,6 +1,6 @@
  # Published at https://pypi.org/project/acryl-datahub/.
  __package_name__ = "acryl-datahub"
- __version__ = "1.0.0.2rc1"
+ __version__ = "1.0.0.2rc2"


  def is_dev_mode() -> bool:
datahub/ingestion/source/hex/constants.py CHANGED
@@ -1,3 +1,8 @@
+ from datahub.metadata.urns import DataPlatformUrn
+
  HEX_PLATFORM_NAME = "hex"
+ HEX_PLATFORM_URN = DataPlatformUrn(platform_name=HEX_PLATFORM_NAME)
  HEX_API_BASE_URL_DEFAULT = "https://app.hex.tech/api/v1"
  HEX_API_PAGE_SIZE_DEFAULT = 100
+
+ DATAHUB_API_PAGE_SIZE_DEFAULT = 100
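The new HEX_PLATFORM_URN constant gives the connector a single shared platform URN object. A minimal sketch of what that object evaluates to, using only the construction visible in the diff above:

from datahub.metadata.urns import DataPlatformUrn

# Same construction as the new HEX_PLATFORM_URN constant.
hex_platform_urn = DataPlatformUrn(platform_name="hex")

# Urn objects stringify via .urn(); for a data platform this yields:
assert hex_platform_urn.urn() == "urn:li:dataPlatform:hex"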
datahub/ingestion/source/hex/hex.py CHANGED
@@ -1,9 +1,12 @@
+ from dataclasses import dataclass
+ from datetime import datetime, timedelta, timezone
  from typing import Any, Dict, Iterable, List, Optional

- from pydantic import Field, SecretStr
+ from pydantic import Field, SecretStr, root_validator
  from typing_extensions import assert_never

  from datahub.configuration.common import AllowDenyPattern
+ from datahub.configuration.datetimes import parse_user_datetime
  from datahub.configuration.source_common import (
      EnvConfigMixin,
      PlatformInstanceConfigMixin,
@@ -21,12 +24,17 @@ from datahub.ingestion.api.source import MetadataWorkUnitProcessor
  from datahub.ingestion.api.workunit import MetadataWorkUnit
  from datahub.ingestion.source.hex.api import HexApi, HexApiReport
  from datahub.ingestion.source.hex.constants import (
+     DATAHUB_API_PAGE_SIZE_DEFAULT,
      HEX_API_BASE_URL_DEFAULT,
      HEX_API_PAGE_SIZE_DEFAULT,
      HEX_PLATFORM_NAME,
  )
  from datahub.ingestion.source.hex.mapper import Mapper
  from datahub.ingestion.source.hex.model import Component, Project
+ from datahub.ingestion.source.hex.query_fetcher import (
+     HexQueryFetcher,
+     HexQueryFetcherReport,
+ )
  from datahub.ingestion.source.state.stale_entity_removal_handler import (
      StaleEntityRemovalHandler,
      StaleEntityRemovalSourceReport,
@@ -34,9 +42,10 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
  )
  from datahub.ingestion.source.state.stateful_ingestion_base import (
      StatefulIngestionConfigBase,
-     StatefulIngestionReport,
      StatefulIngestionSourceBase,
  )
+ from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
+ from datahub.sdk.main_client import DataHubClient


  class HexSourceConfig(
@@ -93,9 +102,73 @@ class HexSourceConfig(
          default=True,
          description="Set ownership identity from owner/creator email",
      )
+     include_lineage: bool = Field(
+         default=True,
+         description='Include Hex lineage, being fetched from DataHub. See "Limitations" section in the docs for more details about the limitations of this feature.',
+     )
+     lineage_start_time: Optional[datetime] = Field(
+         default=None,
+         description="Earliest date of lineage to consider. Default: 1 day before lineage end time. You can specify absolute time like '2023-01-01' or relative time like '-7 days' or '-7d'.",
+     )
+     lineage_end_time: Optional[datetime] = Field(
+         default=None,
+         description="Latest date of lineage to consider. Default: Current time in UTC. You can specify absolute time like '2023-01-01' or relative time like '-1 day' or '-1d'.",
+     )
+     datahub_page_size: int = Field(
+         default=DATAHUB_API_PAGE_SIZE_DEFAULT,
+         description="Number of items to fetch per DataHub API call.",
+     )
+
+     @root_validator(pre=True)
+     def validate_lineage_times(cls, data: Dict[str, Any]) -> Dict[str, Any]:
+         # lineage_end_time default = now
+         if "lineage_end_time" not in data or data["lineage_end_time"] is None:
+             data["lineage_end_time"] = datetime.now(tz=timezone.utc)
+         # if string is given, parse it
+         if isinstance(data["lineage_end_time"], str):
+             data["lineage_end_time"] = parse_user_datetime(data["lineage_end_time"])
+         # if no timezone is given, assume UTC
+         if data["lineage_end_time"].tzinfo is None:
+             data["lineage_end_time"] = data["lineage_end_time"].replace(
+                 tzinfo=timezone.utc
+             )
+         # at this point, we ensure there is a non null datetime with UTC timezone for lineage_end_time
+         assert (
+             data["lineage_end_time"]
+             and isinstance(data["lineage_end_time"], datetime)
+             and data["lineage_end_time"].tzinfo is not None
+             and data["lineage_end_time"].tzinfo == timezone.utc
+         )
+
+         # lineage_start_time default = lineage_end_time - 1 day
+         if "lineage_start_time" not in data or data["lineage_start_time"] is None:
+             data["lineage_start_time"] = data["lineage_end_time"] - timedelta(days=1)
+         # if string is given, parse it
+         if isinstance(data["lineage_start_time"], str):
+             data["lineage_start_time"] = parse_user_datetime(data["lineage_start_time"])
+         # if no timezone is given, assume UTC
+         if data["lineage_start_time"].tzinfo is None:
+             data["lineage_start_time"] = data["lineage_start_time"].replace(
+                 tzinfo=timezone.utc
+             )
+         # at this point, we ensure there is a non null datetime with UTC timezone for lineage_start_time
+         assert (
+             data["lineage_start_time"]
+             and isinstance(data["lineage_start_time"], datetime)
+             and data["lineage_start_time"].tzinfo is not None
+             and data["lineage_start_time"].tzinfo == timezone.utc
+         )
+
+         return data


- class HexReport(StaleEntityRemovalSourceReport, HexApiReport):
+ @dataclass
+ class HexReport(
+     StaleEntityRemovalSourceReport,
+     HexApiReport,
+     IngestionStageReport,
+     HexQueryFetcherReport,
+ ):
      pass

 
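The validate_lineage_times validator above reduces to a simple rule: both bounds become timezone-aware UTC datetimes, with the end defaulting to now and the start defaulting to one day before the end. A standalone sketch of that rule, stripped of the pydantic and parse_user_datetime wiring (resolve_lineage_window is an illustrative name, not part of the package):

from datetime import datetime, timedelta, timezone
from typing import Optional, Tuple

def resolve_lineage_window(
    start: Optional[datetime], end: Optional[datetime]
) -> Tuple[datetime, datetime]:
    # End defaults to "now" in UTC; naive datetimes are assumed to be UTC.
    end = end or datetime.now(tz=timezone.utc)
    if end.tzinfo is None:
        end = end.replace(tzinfo=timezone.utc)
    # Start defaults to one day before the (resolved) end.
    start = start or end - timedelta(days=1)
    if start.tzinfo is None:
        start = start.replace(tzinfo=timezone.utc)
    return start, end

# With both bounds unset, the lineage window is the last 24 hours:
start, end = resolve_lineage_window(None, None)
assert end - start == timedelta(days=1)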
@@ -110,7 +183,7 @@ class HexSource(StatefulIngestionSourceBase):
      def __init__(self, config: HexSourceConfig, ctx: PipelineContext):
          super().__init__(config, ctx)
          self.source_config = config
-         self.report = HexReport()
+         self.report: HexReport = HexReport()
          self.platform = HEX_PLATFORM_NAME
          self.hex_api = HexApi(
              report=self.report,
@@ -129,6 +202,28 @@ class HexSource(StatefulIngestionSourceBase):
              categories_as_tags=self.source_config.categories_as_tags,
              set_ownership_from_email=self.source_config.set_ownership_from_email,
          )
+         self.project_registry: Dict[str, Project] = {}
+         self.component_registry: Dict[str, Component] = {}
+
+         self.datahub_client: Optional[DataHubClient] = None
+         self.query_fetcher: Optional[HexQueryFetcher] = None
+         if self.source_config.include_lineage:
+             graph = ctx.require_graph("Lineage")
+             assert self.source_config.lineage_start_time and isinstance(
+                 self.source_config.lineage_start_time, datetime
+             )
+             assert self.source_config.lineage_end_time and isinstance(
+                 self.source_config.lineage_end_time, datetime
+             )
+             self.datahub_client = DataHubClient(graph=graph)
+             self.query_fetcher = HexQueryFetcher(
+                 datahub_client=self.datahub_client,
+                 workspace_name=self.source_config.workspace_name,
+                 start_datetime=self.source_config.lineage_start_time,
+                 end_datetime=self.source_config.lineage_end_time,
+                 report=self.report,
+                 page_size=self.source_config.datahub_page_size,
+             )

      @classmethod
      def create(cls, config_dict: Dict[str, Any], ctx: PipelineContext) -> "HexSource":
@@ -143,25 +238,58 @@ class HexSource(StatefulIngestionSourceBase):
          ).workunit_processor,
      ]

-     def get_report(self) -> StatefulIngestionReport:
+     def get_report(self) -> HexReport:
          return self.report

      def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
-         yield from self.mapper.map_workspace()
-
-         for project_or_component in self.hex_api.fetch_projects():
-             if isinstance(project_or_component, Project):
-                 if self.source_config.project_title_pattern.allowed(
-                     project_or_component.title
-                 ):
-                     yield from self.mapper.map_project(project=project_or_component)
-             elif isinstance(project_or_component, Component):
-                 if (
-                     self.source_config.include_components
-                     and self.source_config.component_title_pattern.allowed(
+         with self.report.new_stage("Fetch Hex assets from Hex API"):
+             for project_or_component in self.hex_api.fetch_projects():
+                 if isinstance(project_or_component, Project):
+                     if self.source_config.project_title_pattern.allowed(
                          project_or_component.title
-                     )
-                 ):
-                     yield from self.mapper.map_component(component=project_or_component)
-             else:
-                 assert_never(project_or_component)
+                     ):
+                         self.project_registry[project_or_component.id] = (
+                             project_or_component
+                         )
+                 elif isinstance(project_or_component, Component):
+                     if (
+                         self.source_config.include_components
+                         and self.source_config.component_title_pattern.allowed(
+                             project_or_component.title
+                         )
+                     ):
+                         self.component_registry[project_or_component.id] = (
+                             project_or_component
+                         )
+                 else:
+                     assert_never(project_or_component)
+
+         if self.source_config.include_lineage:
+             assert self.datahub_client and self.query_fetcher
+
+             with self.report.new_stage(
+                 "Fetch Hex lineage from existing Queries in DataHub"
+             ):
+                 for query_metadata in self.query_fetcher.fetch():
+                     project = self.project_registry.get(query_metadata.hex_project_id)
+                     if project:
+                         project.upstream_datasets.extend(
+                             query_metadata.dataset_subjects
+                         )
+                         project.upstream_schema_fields.extend(
+                             query_metadata.schema_field_subjects
+                         )
+                     else:
+                         self.report.report_warning(
+                             title="Missing project for lineage",
+                             message="Lineage missed because missed project, likely due to filter patterns or deleted project.",
+                             context=str(query_metadata),
+                         )
+
+         with self.report.new_stage("Emit"):
+             yield from self.mapper.map_workspace()
+
+             for project in self.project_registry.values():
+                 yield from self.mapper.map_project(project=project)
+             for component in self.component_registry.values():
+                 yield from self.mapper.map_component(component=component)
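The rewritten get_workunits_internal no longer emits projects as they stream in from the Hex API; it buffers them in registries, optionally enriches them with lineage fetched from DataHub, and only then emits. A distilled sketch of that collect-enrich-emit shape, with hypothetical stand-in types:

from dataclasses import dataclass, field
from typing import Dict, Iterable, List

@dataclass
class Item:  # stand-in for a Hex Project
    id: str
    upstreams: List[str] = field(default_factory=list)

def run(fetched: Iterable[Item], lineage: Dict[str, List[str]]) -> Iterable[Item]:
    # Stage 1: buffer everything keyed by id instead of yielding eagerly.
    registry: Dict[str, Item] = {item.id: item for item in fetched}
    # Stage 2: enrichment can mutate buffered items before anything is emitted.
    for item_id, upstreams in lineage.items():
        if item_id in registry:
            registry[item_id].upstreams.extend(upstreams)
    # Stage 3: emit once, with enrichment already applied.
    yield from registry.values()

Buffering is what makes the lineage stage possible at all: an eagerly yielded project could not have datasetEdges attached after the fact.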
datahub/ingestion/source/hex/mapper.py CHANGED
@@ -1,6 +1,6 @@
  import logging
  from datetime import datetime
- from typing import Iterable, List, Optional, Tuple
+ from typing import Iterable, List, Optional, Tuple, Union

  from datahub._codegen.aspect import (
      _Aspect,  # TODO: is there a better import than this one?
@@ -46,6 +46,7 @@ from datahub.metadata.schema_classes import (
      DashboardInfoClass,
      DashboardUsageStatisticsClass,
      DataPlatformInstanceClass,
+     EdgeClass,
      GlobalTagsClass,
      OwnerClass,
      OwnershipClass,
@@ -53,7 +54,14 @@ from datahub.metadata.schema_classes import (
      TagAssociationClass,
      TimeWindowSizeClass,
  )
- from datahub.metadata.urns import ContainerUrn, CorpUserUrn, DashboardUrn, Urn
+ from datahub.metadata.urns import (
+     ContainerUrn,
+     CorpUserUrn,
+     DashboardUrn,
+     DatasetUrn,
+     SchemaFieldUrn,
+     Urn,
+ )

  logger = logging.getLogger(__name__)

@@ -116,6 +124,8 @@ class Mapper:
              ),
              externalUrl=f"{self._base_url}/{self._workspace_name}/hex/{project.id}",
              customProperties=dict(id=project.id),
+             datasetEdges=self._dataset_edges(project.upstream_datasets),
+             # TODO: support schema field upstream, maybe InputFields?
          )

          subtypes = SubTypesClass(
@@ -343,6 +353,22 @@ class Mapper:
              else None,
          )

+     def _dataset_edges(
+         self, upstream: List[Union[DatasetUrn, SchemaFieldUrn]]
+     ) -> Optional[List[EdgeClass]]:
+         # TBC: is there support for CLL in Dashboards? for the moment, skip SchemaFieldUrns
+         return (
+             [
+                 EdgeClass(
+                     destinationUrn=upstream_urn.urn(),
+                 )
+                 for upstream_urn in upstream
+                 if isinstance(upstream_urn, DatasetUrn)
+             ]
+             if upstream
+             else None
+         )
+
      def _yield_mcps(
          self, entity_urn: Urn, aspects: List[Optional[_Aspect]]
      ) -> Iterable[MetadataWorkUnit]:
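The new _dataset_edges helper keeps only dataset-level upstreams, wrapping each in an EdgeClass and returning None when there is nothing to report so the aspect field is simply omitted. A small usage sketch, assuming an illustrative Snowflake dataset URN:

from datahub.metadata.schema_classes import EdgeClass
from datahub.metadata.urns import DatasetUrn

# Hypothetical upstream table; platform/name/env are example values.
upstream = DatasetUrn(platform="snowflake", name="analytics.public.orders", env="PROD")

# Same shape as the edges built by _dataset_edges above.
edge = EdgeClass(destinationUrn=upstream.urn())
print(edge.destinationUrn)
# urn:li:dataset:(urn:li:dataPlatform:snowflake,analytics.public.orders,PROD)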
datahub/ingestion/source/hex/model.py CHANGED
@@ -1,6 +1,8 @@
- from dataclasses import dataclass
+ from dataclasses import dataclass, field
  from datetime import datetime
- from typing import List, Optional
+ from typing import List, Optional, Union
+
+ from datahub.metadata.urns import DatasetUrn, SchemaFieldUrn


  @dataclass
@@ -51,6 +53,12 @@ class Project:
      creator: Optional[Owner] = None
      owner: Optional[Owner] = None
      analytics: Optional[Analytics] = None
+     upstream_datasets: List[Union[DatasetUrn, SchemaFieldUrn]] = field(
+         default_factory=list
+     )
+     upstream_schema_fields: List[Union[DatasetUrn, SchemaFieldUrn]] = field(
+         default_factory=list
+     )


  @dataclass
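The two new Project fields use field(default_factory=list) rather than a bare = [] default: dataclasses rejects mutable defaults outright, precisely because a single shared list would otherwise leak state across instances. A quick illustration of the design choice:

from dataclasses import dataclass, field
from typing import List

@dataclass
class Node:
    # upstreams: List[str] = []  # would raise ValueError: mutable default
    upstreams: List[str] = field(default_factory=list)  # fresh list per instance

a, b = Node(), Node()
a.upstreams.append("x")
assert b.upstreams == []  # b is unaffected; no shared state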