acryl-datahub 1.1.0.5rc2__py3-none-any.whl → 1.1.0.5rc4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-1.1.0.5rc2.dist-info → acryl_datahub-1.1.0.5rc4.dist-info}/METADATA +2550 -2550
- {acryl_datahub-1.1.0.5rc2.dist-info → acryl_datahub-1.1.0.5rc4.dist-info}/RECORD +42 -35
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +1 -1
- datahub/ingestion/api/report.py +123 -2
- datahub/ingestion/api/source.py +45 -44
- datahub/ingestion/autogenerated/lineage_helper.py +193 -0
- datahub/ingestion/run/pipeline.py +6 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +4 -4
- datahub/ingestion/source/common/subtypes.py +2 -0
- datahub/ingestion/source/fivetran/fivetran.py +34 -26
- datahub/ingestion/source/hex/api.py +26 -1
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/mock_data/datahub_mock_data.py +11 -15
- datahub/ingestion/source/slack/slack.py +2 -1
- datahub/ingestion/source/snowflake/snowflake_queries.py +5 -1
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/vertica.py +2 -1
- datahub/ingestion/source/unity/source.py +36 -20
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/metadata/_internal_schema_classes.py +601 -0
- datahub/metadata/_urns/urn_defs.py +112 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +27 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
- datahub/metadata/schema.avsc +383 -0
- datahub/metadata/schemas/CorpUserSettings.avsc +25 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +202 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +25 -0
- datahub/sdk/datajob.py +39 -15
- datahub/specific/dataproduct.py +4 -0
- {acryl_datahub-1.1.0.5rc2.dist-info → acryl_datahub-1.1.0.5rc4.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.0.5rc2.dist-info → acryl_datahub-1.1.0.5rc4.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.1.0.5rc2.dist-info → acryl_datahub-1.1.0.5rc4.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.0.5rc2.dist-info → acryl_datahub-1.1.0.5rc4.dist-info}/top_level.txt +0 -0
datahub/ingestion/autogenerated/lineage_helper.py (new file)
@@ -0,0 +1,193 @@
+import json
+import logging
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Set
+
+from datahub.utilities.urns.urn import guess_entity_type
+
+logger = logging.getLogger(__name__)
+
+# Global cache for lineage data to avoid repeated file reads
+_lineage_data: Optional[Dict] = None
+
+
+def _load_lineage_data() -> Dict:
+    """
+    This is experimental internal API subject to breaking changes without prior notice.
+
+    Load lineage data from the autogenerated lineage.json file.
+
+    Returns:
+        Dict containing the lineage information
+
+    Raises:
+        FileNotFoundError: If lineage.json doesn't exist
+        json.JSONDecodeError: If lineage.json is malformed
+    """
+    global _lineage_data
+
+    if _lineage_data is not None:
+        return _lineage_data
+
+    # Get the path to lineage.json relative to this file
+    current_file = Path(__file__)
+    lineage_file = current_file.parent / "lineage.json"
+
+    if not lineage_file.exists():
+        raise FileNotFoundError(f"Lineage file not found: {lineage_file}")
+
+    try:
+        with open(lineage_file, "r") as f:
+            _lineage_data = json.load(f)
+            return _lineage_data
+    except json.JSONDecodeError as e:
+        raise json.JSONDecodeError(
+            f"Failed to parse lineage.json: {e}", e.doc, e.pos
+        ) from e
+
+
+def get_lineage_fields(entity_type: str, aspect_name: str) -> List[Dict]:
+    """
+    This is experimental internal API subject to breaking changes without prior notice.
+
+    Get lineage fields for a specific entity type and aspect.
+
+    Args:
+        entity_type: The entity type (e.g., 'dataset', 'dataJob')
+        aspect_name: The aspect name (e.g., 'upstreamLineage', 'dataJobInputOutput')
+
+    Returns:
+        List of lineage field dictionaries, each containing:
+        - name: field name
+        - path: dot-notation path to the field
+        - isLineage: boolean indicating if it's lineage
+        - relationship: relationship information
+
+    Raises:
+        FileNotFoundError: If lineage.json doesn't exist
+        json.JSONDecodeError: If lineage.json is malformed
+    """
+    lineage_data = _load_lineage_data()
+
+    entity_data = lineage_data.get("entities", {}).get(entity_type, {})
+    aspect_data = entity_data.get(aspect_name, {})
+
+    return aspect_data.get("fields", [])
+
+
+def is_lineage_field(urn: str, aspect_name: str, field_path: str) -> bool:
+    """
+    This is experimental internal API subject to breaking changes without prior notice.
+
+    Check if a specific field path is lineage-related.
+
+    Args:
+        urn: The entity URN (e.g., 'urn:li:dataset:(urn:li:dataPlatform:mysql,test_db.test_table,PROD)')
+        aspect_name: The aspect name (e.g., 'upstreamLineage', 'dataJobInputOutput')
+        field_path: The dot-notation path to the field (e.g., 'upstreams.dataset')
+
+    Returns:
+        True if the field is lineage-related, False otherwise
+
+    Raises:
+        FileNotFoundError: If lineage.json doesn't exist
+        json.JSONDecodeError: If lineage.json is malformed
+        AssertionError: If URN doesn't start with 'urn:li:'
+    """
+    entity_type = guess_entity_type(urn)
+    lineage_fields = get_lineage_fields(entity_type, aspect_name)
+
+    for field in lineage_fields:
+        if field.get("path") == field_path:
+            return field.get("isLineage", False)
+
+    return False
+
+
+def has_lineage(urn: str, aspect: Any) -> bool:
+    """
+    This is experimental internal API subject to breaking changes without prior notice.
+
+    Check if an aspect has any lineage fields.
+
+    Args:
+        urn: The entity URN (e.g., 'urn:li:dataset:(urn:li:dataPlatform:mysql,test_db.test_table,PROD)')
+        aspect: The aspect object
+
+    Returns:
+        True if the aspect has lineage fields, False otherwise
+
+    Raises:
+        FileNotFoundError: If lineage.json doesn't exist
+        json.JSONDecodeError: If lineage.json is malformed
+        AssertionError: If URN doesn't start with 'urn:li:'
+    """
+    entity_type = guess_entity_type(urn)
+    aspect_class = getattr(aspect, "__class__", None)
+    aspect_name = (
+        aspect_class.__name__ if aspect_class is not None else str(type(aspect))
+    )
+
+    lineage_fields = get_lineage_fields(entity_type, aspect_name)
+    return len(lineage_fields) > 0
+
+
+def has_lineage_aspect(entity_type: str, aspect_name: str) -> bool:
+    """
+    This is experimental internal API subject to breaking changes without prior notice.
+
+    Check if an aspect has any lineage fields.
+
+    Args:
+        entity_type: The entity type (e.g., 'dataset', 'dataJob')
+        aspect_name: The aspect name (e.g., 'upstreamLineage', 'dataJobInputOutput')
+
+    Returns:
+        True if the aspect has lineage fields, False otherwise
+
+    Raises:
+        FileNotFoundError: If lineage.json doesn't exist
+        json.JSONDecodeError: If lineage.json is malformed
+    """
+    lineage_fields = get_lineage_fields(entity_type, aspect_name)
+    return len(lineage_fields) > 0
+
+
+def get_all_lineage_aspects(entity_type: str) -> Set[str]:
+    """
+    This is experimental internal API subject to breaking changes without prior notice.
+
+    Get all aspects that have lineage fields for a given entity type.
+
+    Args:
+        entity_type: The entity type (e.g., 'dataset', 'dataJob')
+
+    Returns:
+        Set of aspect names that have lineage fields
+
+    Raises:
+        FileNotFoundError: If lineage.json doesn't exist
+        json.JSONDecodeError: If lineage.json is malformed
+    """
+    lineage_data = _load_lineage_data()
+
+    entity_data = lineage_data.get("entities", {}).get(entity_type, {})
+    lineage_aspects = set()
+
+    for aspect_name, aspect_data in entity_data.items():
+        if aspect_data.get("fields"):
+            lineage_aspects.add(aspect_name)
+
+    return lineage_aspects
+
+
+def clear_cache() -> None:
+    """
+    This is experimental internal API subject to breaking changes without prior notice.
+
+    Clear the internal cache of lineage data.
+
+    This is useful for testing or when the lineage.json file has been updated.
+    """
+    global _lineage_data
+    _lineage_data = None
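For orientation, a minimal usage sketch of the new helper module. The entity type, aspect name, field path, and URN are the examples quoted in the docstrings above; the real keys depend on the bundled lineage.json, and the import path assumes the autogenerated package is importable as listed in RECORD.

    from datahub.ingestion.autogenerated import lineage_helper

    # Docstring examples reused as illustrative inputs.
    if lineage_helper.has_lineage_aspect("dataset", "upstreamLineage"):
        for field in lineage_helper.get_lineage_fields("dataset", "upstreamLineage"):
            print(field.get("path"), field.get("isLineage"))

    urn = "urn:li:dataset:(urn:li:dataPlatform:mysql,test_db.test_table,PROD)"
    print(lineage_helper.is_lineage_field(urn, "upstreamLineage", "upstreams.dataset"))

    # Reset the module-level cache, e.g. between tests.
    lineage_helper.clear_cache()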
datahub/ingestion/run/pipeline.py
@@ -578,11 +578,17 @@ class Pipeline:
         sink_failures = len(self.sink.get_report().failures)
         sink_warnings = len(self.sink.get_report().warnings)
         global_warnings = len(get_global_warnings())
+        source_aspects = self.source.get_report().get_aspects_dict()
+        source_aspects_by_subtype = (
+            self.source.get_report().get_aspects_by_subtypes_dict()
+        )
 
         telemetry_instance.ping(
             "ingest_stats",
             {
                 "source_type": self.source_type,
+                "source_aspects": source_aspects,
+                "source_aspects_by_subtype": source_aspects_by_subtype,
                 "sink_type": self.sink_type,
                 "transformer_types": [
                     transformer.type for transformer in self.config.transformers or []
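The shapes of get_aspects_dict() and get_aspects_by_subtypes_dict() are defined by the report.py changes (+123 lines, not shown here). A hypothetical sketch of what the extra telemetry fields might carry, assuming simple aspect-name-to-count mappings keyed by entity type and subtype:

    # Hypothetical shapes only; the real structure comes from SourceReport in report.py.
    source_aspects = {"datasetProperties": 120, "schemaMetadata": 118, "upstreamLineage": 40}
    source_aspects_by_subtype = {
        "dataset": {
            "Table": {"datasetProperties": 100, "schemaMetadata": 100},
            "View": {"datasetProperties": 20, "schemaMetadata": 18},
        }
    }
    payload = {
        "source_type": "bigquery",
        "source_aspects": source_aspects,
        "source_aspects_by_subtype": source_aspects_by_subtype,
    }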
datahub/ingestion/source/bigquery_v2/profiler.py
@@ -189,6 +189,7 @@ WHERE
 
         if len(profile_requests) == 0:
             return
+
         yield from self.generate_profile_workunits(
             profile_requests,
             max_workers=self.config.profiling.max_workers,
@@ -226,10 +227,11 @@ WHERE
             db_name, schema_name, bq_table, self.config.profiling.partition_datetime
         )
 
-        if partition
+        # For partitioned tables, if it has a row count but not a valid partition, that means something went wrong with the partition detection.
+        if partition is None and bq_table.partition_info and bq_table.rows_count:
             self.report.report_warning(
                 title="Profile skipped for partitioned table",
-                message="profile skipped as
+                message="profile skipped as partition id or type was invalid",
                 context=profile_request.pretty_name,
             )
             return None
datahub/ingestion/source/bigquery_v2/queries.py
@@ -45,12 +45,12 @@ SELECT
   tos.OPTION_VALUE as comment,
   t.is_insertable_into,
   t.ddl,
-  ts.row_count,
-  ts.size_bytes as
+  ts.row_count as row_count,
+  ts.size_bytes as size_bytes,
   p.num_partitions,
   p.max_partition_id,
-  p.active_billable_bytes,
-  p.long_term_billable_bytes,
+  p.active_billable_bytes as active_billable_bytes,
+  -- IFNULL(p.long_term_billable_bytes, 0) as long_term_billable_bytes,
   REGEXP_EXTRACT(t.table_name, r"(?:(?:.+\\D)[_$]?)(\\d\\d\\d\\d(?:0[1-9]|1[012])(?:0[1-9]|[12][0-9]|3[01]))$") as table_suffix,
   REGEXP_REPLACE(t.table_name, r"(?:[_$]?)(\\d\\d\\d\\d(?:0[1-9]|1[012])(?:0[1-9]|[12][0-9]|3[01]))$", "") as table_base
 
datahub/ingestion/source/common/subtypes.py
@@ -26,6 +26,8 @@ class DatasetSubTypes(StrEnum):
     NEO4J_RELATIONSHIP = "Neo4j Relationship"
     SNOWFLAKE_STREAM = "Snowflake Stream"
     API_ENDPOINT = "API Endpoint"
+    SLACK_CHANNEL = "Slack Channel"
+    PROJECTIONS = "Projections"
 
     # TODO: Create separate entity...
     NOTEBOOK = "Notebook"
datahub/ingestion/source/fivetran/fivetran.py
@@ -1,8 +1,8 @@
 import logging
-from typing import Dict, Iterable, List, Optional
+from typing import Dict, Iterable, List, Optional, Union
 
 import datahub.emitter.mce_builder as builder
-from datahub.api.entities.datajob import
+from datahub.api.entities.datajob import DataJob as DataJobV1
 from datahub.api.entities.dataprocess.dataprocess_instance import (
     DataProcessInstance,
     InstanceRunResult,
@@ -42,8 +42,10 @@ from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
     FineGrainedLineageDownstreamType,
     FineGrainedLineageUpstreamType,
 )
-from datahub.
-from datahub.
+from datahub.metadata.urns import CorpUserUrn, DataFlowUrn, DatasetUrn
+from datahub.sdk.dataflow import DataFlow
+from datahub.sdk.datajob import DataJob
+from datahub.sdk.entity import Entity
 
 # Logger instance
 logger = logging.getLogger(__name__)
@@ -75,8 +77,8 @@ class FivetranSource(StatefulIngestionSourceBase):
         self.audit_log = FivetranLogAPI(self.config.fivetran_log_config)
 
     def _extend_lineage(self, connector: Connector, datajob: DataJob) -> Dict[str, str]:
-        input_dataset_urn_list: List[DatasetUrn] = []
-        output_dataset_urn_list: List[DatasetUrn] = []
+        input_dataset_urn_list: List[Union[str, DatasetUrn]] = []
+        output_dataset_urn_list: List[Union[str, DatasetUrn]] = []
         fine_grained_lineage: List[FineGrainedLineage] = []
 
         # TODO: Once Fivetran exposes the database via the API, we shouldn't ask for it via config.
@@ -178,9 +180,9 @@ class FivetranSource(StatefulIngestionSourceBase):
             )
         )
 
-        datajob.
-        datajob.
-        datajob.
+        datajob.set_inlets(input_dataset_urn_list)
+        datajob.set_outlets(output_dataset_urn_list)
+        datajob.set_fine_grained_lineages(fine_grained_lineage)
 
         return dict(
             **{
@@ -197,10 +199,10 @@ class FivetranSource(StatefulIngestionSourceBase):
 
     def _generate_dataflow_from_connector(self, connector: Connector) -> DataFlow:
         return DataFlow(
-
-
+            platform=Constant.ORCHESTRATOR,
+            name=connector.connector_id,
             env=self.config.env,
-
+            display_name=connector.connector_name,
             platform_instance=self.config.platform_instance,
         )
 
@@ -213,11 +215,11 @@ class FivetranSource(StatefulIngestionSourceBase):
         )
         owner_email = self.audit_log.get_user_email(connector.user_id)
         datajob = DataJob(
-
+            name=connector.connector_id,
             flow_urn=dataflow_urn,
             platform_instance=self.config.platform_instance,
-
-            owners=
+            display_name=connector.connector_name,
+            owners=[CorpUserUrn(owner_email)] if owner_email else None,
         )
 
         # Map connector source and destination table with dataset entity
@@ -232,16 +234,24 @@ class FivetranSource(StatefulIngestionSourceBase):
             "sync_frequency": str(connector.sync_frequency),
             "destination_id": connector.destination_id,
         }
-
-
-            **lineage_properties,
-        }
+
+        datajob.set_custom_properties({**connector_properties, **lineage_properties})
 
         return datajob
 
     def _generate_dpi_from_job(self, job: Job, datajob: DataJob) -> DataProcessInstance:
+        # hack: convert to old instance for DataProcessInstance.from_datajob compatibility
+        datajob_v1 = DataJobV1(
+            id=datajob.name,
+            flow_urn=datajob.flow_urn,
+            platform_instance=self.config.platform_instance,
+            name=datajob.name,
+            inlets=datajob.inlets,
+            outlets=datajob.outlets,
+            fine_grained_lineages=datajob.fine_grained_lineages,
+        )
         return DataProcessInstance.from_datajob(
-            datajob=
+            datajob=datajob_v1,
             id=job.job_id,
             clone_inlets=True,
             clone_outlets=True,
@@ -278,17 +288,15 @@ class FivetranSource(StatefulIngestionSourceBase):
 
     def _get_connector_workunits(
         self, connector: Connector
-    ) -> Iterable[MetadataWorkUnit]:
+    ) -> Iterable[Union[MetadataWorkUnit, Entity]]:
         self.report.report_connectors_scanned()
         # Create dataflow entity with same name as connector name
         dataflow = self._generate_dataflow_from_connector(connector)
-
-        yield mcp.as_workunit()
+        yield dataflow
 
         # Map Fivetran's connector entity with Datahub's datajob entity
         datajob = self._generate_datajob_from_connector(connector)
-
-        yield mcp.as_workunit()
+        yield datajob
 
         # Map Fivetran's job/sync history entity with Datahub's data process entity
         if len(connector.jobs) >= MAX_JOBS_PER_CONNECTOR:
@@ -310,7 +318,7 @@ class FivetranSource(StatefulIngestionSourceBase):
             ).workunit_processor,
         ]
 
-    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+    def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, Entity]]:
         """
         Datahub Ingestion framework invoke this method
         """
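Taken together, these hunks move the Fivetran source from hand-built MCP workunits to yielding SDK DataFlow/DataJob entities directly. A rough sketch of the new construction pattern, with made-up connector values; the constructor arguments and setters are the ones visible in the diff, but treat this as an illustration rather than the source's exact code:

    from datahub.metadata.urns import CorpUserUrn, DataFlowUrn, DatasetUrn
    from datahub.sdk.dataflow import DataFlow
    from datahub.sdk.datajob import DataJob

    # Illustrative values; the real ones come from the Fivetran log/API responses.
    flow = DataFlow(
        platform="fivetran",
        name="connector_123",
        env="PROD",
        display_name="My Postgres Connector",
    )
    job = DataJob(
        name="connector_123",
        flow_urn=DataFlowUrn(orchestrator="fivetran", flow_id="connector_123", cluster="PROD"),
        display_name="My Postgres Connector",
        owners=[CorpUserUrn("owner@example.com")],
    )
    job.set_inlets([DatasetUrn(platform="postgres", name="db.public.src_table", env="PROD")])
    job.set_outlets([DatasetUrn(platform="snowflake", name="db.schema.dst_table", env="PROD")])
    job.set_custom_properties({"destination_id": "dest_456"})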
datahub/ingestion/source/hex/api.py
@@ -5,7 +5,9 @@ from typing import Any, Dict, Generator, List, Optional, Union
 
 import requests
 from pydantic import BaseModel, Field, ValidationError, validator
+from requests.adapters import HTTPAdapter
 from typing_extensions import assert_never
+from urllib3.util.retry import Retry
 
 from datahub.ingestion.api.source import SourceReport
 from datahub.ingestion.source.hex.constants import (
@@ -220,6 +222,7 @@ class HexApi:
         self.base_url = base_url
         self.report = report
         self.page_size = page_size
+        self.session = self._create_retry_session()
 
     def _list_projects_url(self):
         return f"{self.base_url}/projects"
@@ -227,6 +230,28 @@ class HexApi:
     def _auth_header(self):
         return {"Authorization": f"Bearer {self.token}"}
 
+    def _create_retry_session(self) -> requests.Session:
+        """Create a requests session with retry logic for rate limiting.
+
+        Hex API rate limit: 60 requests per minute
+        https://learn.hex.tech/docs/api/api-overview#kernel-and-rate-limits
+        """
+        session = requests.Session()
+
+        # Configure retry strategy for 429 (Too Many Requests) with exponential backoff
+        retry_strategy = Retry(
+            total=5,  # Maximum number of retries
+            status_forcelist=[429],  # Only retry on 429 status code
+            backoff_factor=2,  # Exponential backoff: 2, 4, 8, 16, 32 seconds
+            raise_on_status=True,  # Raise exception after max retries
+        )
+
+        adapter = HTTPAdapter(max_retries=retry_strategy)
+        session.mount("http://", adapter)
+        session.mount("https://", adapter)
+
+        return session
+
     def fetch_projects(
         self,
         include_components: bool = True,
@@ -259,7 +284,7 @@ class HexApi:
             logger.debug(f"Fetching projects page with params: {params}")
             self.report.fetch_projects_page_calls += 1
             try:
-                response =
+                response = self.session.get(
                     url=self._list_projects_url(),
                     headers=self._auth_header(),
                     params=params,