acryl-datahub 1.0.0.1rc1__py3-none-any.whl → 1.0.0.1rc3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.0.0.1rc1.dist-info → acryl_datahub-1.0.0.1rc3.dist-info}/METADATA +2575 -2574
- {acryl_datahub-1.0.0.1rc1.dist-info → acryl_datahub-1.0.0.1rc3.dist-info}/RECORD +77 -60
- {acryl_datahub-1.0.0.1rc1.dist-info → acryl_datahub-1.0.0.1rc3.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0.1rc1.dist-info → acryl_datahub-1.0.0.1rc3.dist-info}/entry_points.txt +2 -1
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +1 -28
- datahub/emitter/request_helper.py +19 -14
- datahub/emitter/rest_emitter.py +2 -2
- datahub/ingestion/api/source.py +6 -2
- datahub/ingestion/api/source_helpers.py +6 -2
- datahub/ingestion/extractor/schema_util.py +1 -0
- datahub/ingestion/graph/client.py +6 -11
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +6 -0
- datahub/ingestion/source/common/subtypes.py +16 -1
- datahub/ingestion/source/data_lake_common/path_spec.py +21 -1
- datahub/ingestion/source/dbt/dbt_common.py +6 -4
- datahub/ingestion/source/dbt/dbt_core.py +4 -6
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_source.py +96 -117
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +394 -0
- datahub/ingestion/source/hex/constants.py +3 -0
- datahub/ingestion/source/hex/hex.py +167 -0
- datahub/ingestion/source/hex/mapper.py +372 -0
- datahub/ingestion/source/hex/model.py +68 -0
- datahub/ingestion/source/iceberg/iceberg.py +62 -66
- datahub/ingestion/source/mlflow.py +217 -8
- datahub/ingestion/source/mode.py +11 -1
- datahub/ingestion/source/openapi.py +69 -34
- datahub/ingestion/source/powerbi/powerbi.py +29 -23
- datahub/ingestion/source/s3/source.py +11 -0
- datahub/ingestion/source/slack/slack.py +399 -82
- datahub/ingestion/source/snowflake/constants.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +14 -1
- datahub/ingestion/source/snowflake/snowflake_query.py +17 -0
- datahub/ingestion/source/snowflake/snowflake_report.py +3 -0
- datahub/ingestion/source/snowflake/snowflake_schema.py +29 -0
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +112 -42
- datahub/ingestion/source/snowflake/snowflake_utils.py +25 -1
- datahub/ingestion/source/sql/mssql/job_models.py +15 -1
- datahub/ingestion/source/sql/mssql/source.py +8 -4
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +242 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +1 -29
- datahub/ingestion/source/superset.py +15 -6
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1055 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +68 -0
- datahub/metadata/_schema_classes.py +472 -1
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/schema.avsc +309 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +14 -0
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +95 -0
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +32 -0
- datahub/metadata/schemas/QueryProperties.avsc +20 -0
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/sdk/dataset.py +122 -0
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +27 -3
- datahub/sdk/main_client.py +22 -0
- datahub/sdk/search_filters.py +4 -4
- datahub/sql_parsing/split_statements.py +5 -1
- datahub/sql_parsing/sql_parsing_aggregator.py +6 -0
- datahub/sql_parsing/tool_meta_extractor.py +27 -2
- datahub/ingestion/source/vertexai.py +0 -695
- {acryl_datahub-1.0.0.1rc1.dist-info → acryl_datahub-1.0.0.1rc3.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0.1rc1.dist-info → acryl_datahub-1.0.0.1rc3.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,394 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from datetime import datetime, timezone
|
|
4
|
+
from typing import Any, Dict, Generator, List, Optional, Union
|
|
5
|
+
|
|
6
|
+
import requests
|
|
7
|
+
from pydantic import BaseModel, Field, ValidationError, validator
|
|
8
|
+
from typing_extensions import assert_never
|
|
9
|
+
|
|
10
|
+
from datahub.ingestion.api.source import SourceReport
|
|
11
|
+
from datahub.ingestion.source.hex.constants import (
|
|
12
|
+
HEX_API_BASE_URL_DEFAULT,
|
|
13
|
+
HEX_API_PAGE_SIZE_DEFAULT,
|
|
14
|
+
)
|
|
15
|
+
from datahub.ingestion.source.hex.model import (
|
|
16
|
+
Analytics,
|
|
17
|
+
Category,
|
|
18
|
+
Collection,
|
|
19
|
+
Component,
|
|
20
|
+
Owner,
|
|
21
|
+
Project,
|
|
22
|
+
Status,
|
|
23
|
+
)
|
|
24
|
+
from datahub.utilities.str_enum import StrEnum
|
|
25
|
+
|
|
26
|
+
logger = logging.getLogger(__name__)
|
|
27
|
+
|
|
28
|
+
# The following models were Claude-generated from Hex API OpenAPI definition https://static.hex.site/openapi.json
|
|
29
|
+
# To be exclusively used internally for the deserialization of the API response
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class HexApiAppViewStats(BaseModel):
    """App view counters, populated from the camelCase keys of the API payload."""

    # Every counter is optional and falls back to None when the key is absent.
    all_time: Optional[int] = Field(None, alias="allTime")
    last_seven_days: Optional[int] = Field(None, alias="lastSevenDays")
    last_fourteen_days: Optional[int] = Field(None, alias="lastFourteenDays")
    last_thirty_days: Optional[int] = Field(None, alias="lastThirtyDays")
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class HexApiProjectAnalytics(BaseModel):
    """Analytics section of a project: app view counters and two timestamps."""

    app_views: Optional[HexApiAppViewStats] = Field(None, alias="appViews")
    last_viewed_at: Optional[datetime] = Field(None, alias="lastViewedAt")
    published_results_updated_at: Optional[datetime] = Field(
        None, alias="publishedResultsUpdatedAt"
    )

    @validator("last_viewed_at", "published_results_updated_at", pre=True)
    def parse_datetime(cls, value):
        """Turn a ``YYYY-MM-DDTHH:MM:SS.ffffffZ`` string into a UTC-aware datetime.

        ``None`` and any non-string value are returned unchanged.
        """
        if not isinstance(value, str):
            return value
        # NOTE(review): "%f" requires a fractional-seconds part, so a timestamp
        # without one raises ValueError here - confirm the API always sends it.
        parsed = datetime.strptime(value, "%Y-%m-%dT%H:%M:%S.%fZ")
        return parsed.replace(tzinfo=timezone.utc)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class HexApiProjectStatus(BaseModel):
    """Status attached to a project; only its name is deserialized."""

    name: str  # required
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class HexApiCategory(BaseModel):
    """Category entry: a mandatory name plus an optional description."""

    name: str
    description: Optional[str] = Field(default=None)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
class HexApiReviews(BaseModel):
    """Reviews configuration carrying a single mandatory ``required`` flag."""

    required: bool
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
class HexApiUser(BaseModel):
    """User reference; only the mandatory e-mail address is deserialized."""

    email: str
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
class HexApiAccessType(StrEnum):
    """String values accepted by the ``access`` field of the sharing models."""

    NONE = "NONE"
    VIEW = "VIEW"
    EDIT = "EDIT"
    FULL_ACCESS = "FULL_ACCESS"
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
class HexApiUserAccess(BaseModel):
    """Pairs a user with an optional access level."""

    user: HexApiUser
    access: Optional[HexApiAccessType] = Field(default=None)
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
class HexApiCollectionData(BaseModel):
    """Collection payload; only the mandatory name is deserialized."""

    name: str
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
class HexApiCollectionAccess(BaseModel):
    """Pairs a collection with an optional access level."""

    collection: HexApiCollectionData
    access: Optional[HexApiAccessType] = Field(default=None)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
class HexApiAccessSettings(BaseModel):
    """Wrapper holding a single optional access level."""

    access: Optional[HexApiAccessType] = Field(default=None)
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
class HexApiWeeklySchedule(BaseModel):
    """Weekly schedule details; all four fields are mandatory."""

    # Read from the camelCase key "dayOfWeek"; the ellipsis marks it as required.
    day_of_week: str = Field(..., alias="dayOfWeek")
    hour: int
    minute: int
    timezone: str
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
class HexApiSchedule(BaseModel):
    """Schedule entry: a cadence, an enabled flag and optional per-cadence details."""

    cadence: str
    enabled: bool

    # Only the weekly details have a dedicated model; the other cadences are
    # kept as whatever the API returned.
    hourly: Optional[Any] = None
    daily: Optional[Any] = None
    weekly: Optional[HexApiWeeklySchedule] = None
    monthly: Optional[Any] = None
    custom: Optional[Any] = None
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
class HexApiSharing(BaseModel):
    """Sharing section of a project: per-user, per-collection and per-group
    grants plus workspace, public-web and support access settings."""

    # List-valued grants default to an empty list.
    users: Optional[List[HexApiUserAccess]] = []
    collections: Optional[List[HexApiCollectionAccess]] = []
    groups: Optional[List[Any]] = []

    # Scalar access settings default to None.
    workspace: Optional[HexApiAccessSettings] = None
    public_web: Optional[HexApiAccessSettings] = Field(None, alias="publicWeb")
    support: Optional[HexApiAccessSettings] = None

    class Config:
        # Keys of the JSON payload that are not modelled here are dropped.
        extra = "ignore"
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
class HexApiItemType(StrEnum):
    """Discriminator for the ``type`` field of items returned by the API."""

    PROJECT = "PROJECT"
    COMPONENT = "COMPONENT"
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
class HexApiProjectApiResource(BaseModel):
    """One item of the list-projects response, either a project or a component.

    Only ``id``, ``title`` and ``type`` are mandatory; everything else is
    optional and unknown keys of the payload are ignored.
    """

    id: str
    title: str
    description: Optional[str] = None
    type: HexApiItemType
    creator: Optional[HexApiUser] = None
    owner: Optional[HexApiUser] = None
    status: Optional[HexApiProjectStatus] = None
    categories: Optional[List[HexApiCategory]] = []
    reviews: Optional[HexApiReviews] = None
    analytics: Optional[HexApiProjectAnalytics] = None
    # Timestamps are read from camelCase keys and normalized by parse_datetime.
    last_edited_at: Optional[datetime] = Field(None, alias="lastEditedAt")
    last_published_at: Optional[datetime] = Field(None, alias="lastPublishedAt")
    created_at: Optional[datetime] = Field(None, alias="createdAt")
    archived_at: Optional[datetime] = Field(None, alias="archivedAt")
    trashed_at: Optional[datetime] = Field(None, alias="trashedAt")
    schedules: Optional[List[HexApiSchedule]] = []
    sharing: Optional[HexApiSharing] = None

    class Config:
        # Keys of the JSON payload that are not modelled here are dropped.
        extra = "ignore"

    @validator(
        "created_at",
        "last_edited_at",
        "last_published_at",
        "archived_at",
        "trashed_at",
        pre=True,
    )
    def parse_datetime(cls, value):
        """Turn a ``YYYY-MM-DDTHH:MM:SS.ffffffZ`` string into a UTC-aware datetime.

        ``None`` and any non-string value are returned unchanged.
        """
        if not isinstance(value, str):
            return value
        # NOTE(review): "%f" requires a fractional-seconds part, so a timestamp
        # without one raises ValueError here - confirm the API always sends it.
        parsed = datetime.strptime(value, "%Y-%m-%dT%H:%M:%S.%fZ")
        return parsed.replace(tzinfo=timezone.utc)
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
class HexApiPageCursors(BaseModel):
    """Pair of optional pagination cursors returned with a page of results."""

    after: Optional[str] = Field(default=None)
    before: Optional[str] = Field(default=None)
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
class HexApiProjectsListResponse(BaseModel):
    """Envelope of the list-projects endpoint: the page items plus its cursors."""

    values: List[HexApiProjectApiResource]
    pagination: Optional[HexApiPageCursors] = Field(default=None)

    class Config:
        # Keys of the JSON payload that are not modelled here are dropped.
        extra = "ignore"
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
@dataclass
class HexApiReport(SourceReport):
    """Source report extended with counters for the list-projects endpoint."""

    # Incremented once for every page request issued by HexApi.
    fetch_projects_page_calls: int = 0
    # Running total of items contained in the successfully parsed pages.
    fetch_projects_page_items: int = 0
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
class HexApi:
    """Minimal client for the Hex REST API.

    https://learn.hex.tech/docs/api/api-reference
    """

    def __init__(
        self,
        token: str,
        report: HexApiReport,
        base_url: str = HEX_API_BASE_URL_DEFAULT,
        page_size: int = HEX_API_PAGE_SIZE_DEFAULT,
    ):
        """Store the connection settings.

        Args:
            token: Value sent as bearer token in the ``Authorization`` header.
            report: Report receiving call/item counters, warnings and failures.
            base_url: Prefix of every endpoint URL.
            page_size: Value sent as ``limit`` on every list request.
        """
        self.token = token
        self.base_url = base_url
        self.report = report
        self.page_size = page_size

    def _list_projects_url(self) -> str:
        """Return the URL of the list-projects endpoint."""
        return f"{self.base_url}/projects"

    def _auth_header(self) -> Dict[str, str]:
        """Return the bearer-token ``Authorization`` header."""
        return {"Authorization": f"Bearer {self.token}"}

    def fetch_projects(
        self,
        include_components: bool = True,
        include_archived: bool = False,
        include_trashed: bool = False,
    ) -> Generator[Union[Project, Component], None, None]:
        """Fetch all projects and components, following pagination cursors.

        https://learn.hex.tech/docs/api/api-reference#operation/ListProjects

        Args:
            include_components: Sent as ``includeComponents``.
            include_archived: Sent as ``includeArchived``.
            include_trashed: Sent as ``includeTrashed``.

        Yields:
            One ``Project`` or ``Component`` per successfully mapped item.
            Request and parsing errors are recorded on the report and end
            the iteration instead of being raised.
        """
        params = {
            "includeComponents": include_components,
            "includeArchived": include_archived,
            "includeTrashed": include_trashed,
            "includeSharing": True,
            "limit": self.page_size,
            "after": None,
            "before": None,
            "sortBy": "CREATED_AT",
            "sortDirection": "ASC",
        }
        yield from self._fetch_projects_page(params)

        # _fetch_projects_page stores the next cursor in params["after"] and
        # clears it on the last page or on failure, which ends this loop.
        while params["after"]:
            yield from self._fetch_projects_page(params)

    def _fetch_projects_page(
        self, params: Dict[str, Any]
    ) -> Generator[Union[Project, Component], None, None]:
        """Fetch one page and yield its mapped items.

        ``params`` is mutated: ``params["after"]`` is set to the cursor of the
        next page, or to ``None`` when there is no further page or the page
        could not be fetched or parsed.

        Items that fail to map are reported as warnings and skipped; request
        and response-parsing errors are reported as failures.
        """
        logger.debug(f"Fetching projects page with params: {params}")
        self.report.fetch_projects_page_calls += 1
        try:
            response = requests.get(
                url=self._list_projects_url(),
                headers=self._auth_header(),
                params=params,
                timeout=30,
            )
            response.raise_for_status()

            api_response = HexApiProjectsListResponse.parse_obj(response.json())
            logger.info(f"Fetched {len(api_response.values)} items")
            params["after"] = (
                api_response.pagination.after if api_response.pagination else None
            )

            self.report.fetch_projects_page_items += len(api_response.values)

            for item in api_response.values:
                try:
                    ret = self._map_data_from_model(item)
                    yield ret
                except Exception as e:
                    self.report.warning(
                        title="Incomplete metadata",
                        message="Incomplete metadata because of error mapping item",
                        context=str(item),
                        exc=e,
                    )
        except ValidationError as e:
            # The cursor was not advanced; drop it so the caller stops instead
            # of requesting this same unparsable page forever.
            params["after"] = None
            self.report.failure(
                title="Listing Projects and Components API response parsing error",
                message="Error parsing API response and halting metadata ingestion",
                context=str(response.json()),
                exc=e,
            )
        except (requests.RequestException, Exception) as e:
            # Capture the failing parameters before clearing the cursor, then
            # drop the cursor so the caller does not retry this page endlessly.
            failed_params = str(params)
            params["after"] = None
            self.report.failure(
                title="Listing Projects and Components API request error",
                message="Error fetching Projects and Components and halting metadata ingestion",
                context=failed_params,
                exc=e,
            )

    def _map_data_from_model(
        self, hex_item: HexApiProjectApiResource
    ) -> Union[Project, Component]:
        """
        Maps a HexApi pydantic model parsed from the API to our domain model

        Returns a ``Project`` or a ``Component`` depending on ``hex_item.type``.
        """

        # Map status
        status = Status(name=hex_item.status.name) if hex_item.status else None

        # Map categories (missing or empty list -> empty list)
        categories = [
            Category(name=cat.name, description=cat.description)
            for cat in hex_item.categories or []
        ]

        # Map collections, found under the sharing section
        collections = []
        if hex_item.sharing and hex_item.sharing.collections:
            collections = [
                Collection(name=col.collection.name)
                for col in hex_item.sharing.collections
            ]

        # Map creator and owner
        creator = Owner(email=hex_item.creator.email) if hex_item.creator else None
        owner = Owner(email=hex_item.owner.email) if hex_item.owner else None

        # Map analytics; without app view counters no Analytics is produced
        analytics = None
        if hex_item.analytics and hex_item.analytics.app_views:
            app_views = hex_item.analytics.app_views
            analytics = Analytics(
                appviews_all_time=app_views.all_time,
                appviews_last_7_days=app_views.last_seven_days,
                appviews_last_14_days=app_views.last_fourteen_days,
                appviews_last_30_days=app_views.last_thirty_days,
                last_viewed_at=hex_item.analytics.last_viewed_at,
            )

        # Create the appropriate domain model based on type
        if hex_item.type == HexApiItemType.PROJECT:
            return Project(
                id=hex_item.id,
                title=hex_item.title,
                description=hex_item.description,
                created_at=hex_item.created_at,
                last_edited_at=hex_item.last_edited_at,
                status=status,
                categories=categories,
                collections=collections,
                creator=creator,
                owner=owner,
                analytics=analytics,
            )
        elif hex_item.type == HexApiItemType.COMPONENT:
            return Component(
                id=hex_item.id,
                title=hex_item.title,
                description=hex_item.description,
                created_at=hex_item.created_at,
                last_edited_at=hex_item.last_edited_at,
                status=status,
                categories=categories,
                collections=collections,
                creator=creator,
                owner=owner,
                analytics=analytics,
            )
        else:
            assert_never(hex_item.type)
|
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
from typing import Any, Dict, Iterable, List, Optional
|
|
2
|
+
|
|
3
|
+
from pydantic import Field, SecretStr
|
|
4
|
+
from typing_extensions import assert_never
|
|
5
|
+
|
|
6
|
+
from datahub.configuration.common import AllowDenyPattern
|
|
7
|
+
from datahub.configuration.source_common import (
|
|
8
|
+
EnvConfigMixin,
|
|
9
|
+
PlatformInstanceConfigMixin,
|
|
10
|
+
)
|
|
11
|
+
from datahub.ingestion.api.common import PipelineContext
|
|
12
|
+
from datahub.ingestion.api.decorators import (
|
|
13
|
+
SourceCapability,
|
|
14
|
+
SupportStatus,
|
|
15
|
+
capability,
|
|
16
|
+
config_class,
|
|
17
|
+
platform_name,
|
|
18
|
+
support_status,
|
|
19
|
+
)
|
|
20
|
+
from datahub.ingestion.api.source import MetadataWorkUnitProcessor
|
|
21
|
+
from datahub.ingestion.api.workunit import MetadataWorkUnit
|
|
22
|
+
from datahub.ingestion.source.hex.api import HexApi, HexApiReport
|
|
23
|
+
from datahub.ingestion.source.hex.constants import (
|
|
24
|
+
HEX_API_BASE_URL_DEFAULT,
|
|
25
|
+
HEX_API_PAGE_SIZE_DEFAULT,
|
|
26
|
+
HEX_PLATFORM_NAME,
|
|
27
|
+
)
|
|
28
|
+
from datahub.ingestion.source.hex.mapper import Mapper
|
|
29
|
+
from datahub.ingestion.source.hex.model import Component, Project
|
|
30
|
+
from datahub.ingestion.source.state.stale_entity_removal_handler import (
|
|
31
|
+
StaleEntityRemovalHandler,
|
|
32
|
+
StaleEntityRemovalSourceReport,
|
|
33
|
+
StatefulStaleMetadataRemovalConfig,
|
|
34
|
+
)
|
|
35
|
+
from datahub.ingestion.source.state.stateful_ingestion_base import (
|
|
36
|
+
StatefulIngestionConfigBase,
|
|
37
|
+
StatefulIngestionReport,
|
|
38
|
+
StatefulIngestionSourceBase,
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class HexSourceConfig(
    StatefulIngestionConfigBase, PlatformInstanceConfigMixin, EnvConfigMixin
):
    """Configuration of the Hex ingestion source."""

    workspace_name: str = Field(
        description="Hex workspace name. You can find this name in your Hex home page URL: https://app.hex.tech/<workspace_name>",
    )
    token: SecretStr = Field(
        description="Hex API token; either PAT or Workflow token - https://learn.hex.tech/docs/api/api-overview#authentication",
    )
    base_url: str = Field(
        default=HEX_API_BASE_URL_DEFAULT,
        description="Hex API base URL. For most Hex users, this will be https://app.hex.tech/api/v1. "
        "Single-tenant app users should replace this with the URL they use to access Hex.",
    )
    stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = Field(
        default=None,
        description="Configuration for stateful ingestion and stale metadata removal.",
    )
    include_components: bool = Field(
        default=True,
        # Was misspelled "desciption", which left this option undocumented.
        description="Include Hex Components in the ingestion",
    )
    page_size: int = Field(
        default=HEX_API_PAGE_SIZE_DEFAULT,
        description="Number of items to fetch per Hex API call.",
    )
    patch_metadata: bool = Field(
        default=False,
        description="Emit metadata as patch events",
    )
    collections_as_tags: bool = Field(
        default=True,
        description="Emit Hex Collections as tags",
    )
    status_as_tag: bool = Field(
        default=True,
        description="Emit Hex Status as tags",
    )
    categories_as_tags: bool = Field(
        default=True,
        description="Emit Hex Category as tags",
    )
    project_title_pattern: AllowDenyPattern = Field(
        default=AllowDenyPattern.allow_all(),
        description="Regex pattern for project titles to filter in ingestion.",
    )
    component_title_pattern: AllowDenyPattern = Field(
        default=AllowDenyPattern.allow_all(),
        description="Regex pattern for component titles to filter in ingestion.",
    )
    set_ownership_from_email: bool = Field(
        default=True,
        description="Set ownership identity from owner/creator email",
    )
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
class HexReport(StaleEntityRemovalSourceReport, HexApiReport):
    """Report of the Hex source: combines both base reports, adds nothing."""
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
@platform_name("Hex")
@config_class(HexSourceConfig)
@support_status(SupportStatus.TESTING)
@capability(SourceCapability.DESCRIPTIONS, "Supported by default")
@capability(SourceCapability.OWNERSHIP, "Supported by default")
@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
@capability(SourceCapability.CONTAINERS, "Enabled by default")
class HexSource(StatefulIngestionSourceBase):
    """Ingestion source that turns Hex projects and components into work units."""

    def __init__(self, config: HexSourceConfig, ctx: PipelineContext):
        """Build the API client and the mapper from ``config``."""
        super().__init__(config, ctx)
        self.source_config = config
        # The report must exist before the client, which records into it.
        self.report = HexReport()
        self.platform = HEX_PLATFORM_NAME
        self.hex_api = HexApi(
            report=self.report,
            token=config.token.get_secret_value(),
            base_url=config.base_url,
            page_size=config.page_size,
        )
        self.mapper = Mapper(
            workspace_name=config.workspace_name,
            platform_instance=config.platform_instance,
            env=config.env,
            base_url=config.base_url,
            patch_metadata=config.patch_metadata,
            collections_as_tags=config.collections_as_tags,
            status_as_tag=config.status_as_tag,
            categories_as_tags=config.categories_as_tags,
            set_ownership_from_email=config.set_ownership_from_email,
        )

    @classmethod
    def create(cls, config_dict: Dict[str, Any], ctx: PipelineContext) -> "HexSource":
        """Parse ``config_dict`` into a ``HexSourceConfig`` and build the source."""
        return cls(HexSourceConfig.parse_obj(config_dict), ctx)

    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
        """Return the inherited processors followed by the stale-entity remover."""
        processors = list(super().get_workunit_processors())
        stale_handler = StaleEntityRemovalHandler.create(
            self, self.source_config, self.ctx
        )
        processors.append(stale_handler.workunit_processor)
        return processors

    def get_report(self) -> StatefulIngestionReport:
        """Return the report shared with the API client."""
        return self.report

    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
        """Yield the workspace work units, then those of every allowed item.

        Projects are filtered by ``project_title_pattern``; components are
        emitted only when ``include_components`` is set and their title passes
        ``component_title_pattern``.
        """
        cfg = self.source_config

        yield from self.mapper.map_workspace()

        for item in self.hex_api.fetch_projects():
            if isinstance(item, Project):
                if cfg.project_title_pattern.allowed(item.title):
                    yield from self.mapper.map_project(project=item)
            elif isinstance(item, Component):
                if cfg.include_components and cfg.component_title_pattern.allowed(
                    item.title
                ):
                    yield from self.mapper.map_component(component=item)
            else:
                assert_never(item)
|