acryl-datahub 1.3.0.1rc6__py3-none-any.whl → 1.3.0.1rc7__py3-none-any.whl
This diff compares the contents of two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.3.0.1rc6.dist-info → acryl_datahub-1.3.0.1rc7.dist-info}/METADATA +2457 -2458
- {acryl_datahub-1.3.0.1rc6.dist-info → acryl_datahub-1.3.0.1rc7.dist-info}/RECORD +34 -32
- datahub/_version.py +1 -1
- datahub/cli/docker_check.py +1 -1
- datahub/emitter/mce_builder.py +6 -0
- datahub/ingestion/autogenerated/capability_summary.json +12 -12
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +2 -0
- datahub/ingestion/source/common/subtypes.py +2 -0
- datahub/ingestion/source/dremio/dremio_source.py +15 -15
- datahub/ingestion/source/dynamodb/dynamodb.py +1 -1
- datahub/ingestion/source/fivetran/config.py +33 -0
- datahub/ingestion/source/fivetran/fivetran.py +184 -13
- datahub/ingestion/source/fivetran/fivetran_log_api.py +20 -5
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/hex/hex.py +1 -1
- datahub/ingestion/source/iceberg/iceberg.py +1 -1
- datahub/ingestion/source/metabase.py +23 -4
- datahub/ingestion/source/mlflow.py +1 -1
- datahub/ingestion/source/s3/source.py +1 -1
- datahub/ingestion/source/salesforce.py +1 -1
- datahub/ingestion/source/slack/slack.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_queries.py +3 -0
- datahub/ingestion/source/snowflake/snowflake_summary.py +1 -1
- datahub/ingestion/source/sql_queries.py +1 -1
- datahub/ingestion/source/unity/source.py +1 -1
- datahub/ingestion/source/vertexai/vertexai.py +1 -1
- datahub/metadata/schema.avsc +4 -2
- datahub/metadata/schemas/DataHubFileInfo.avsc +4 -2
- datahub/sdk/mlmodel.py +19 -0
- {acryl_datahub-1.3.0.1rc6.dist-info → acryl_datahub-1.3.0.1rc7.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.3.0.1rc6.dist-info → acryl_datahub-1.3.0.1rc7.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.3.0.1rc6.dist-info → acryl_datahub-1.3.0.1rc7.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.3.0.1rc6.dist-info → acryl_datahub-1.3.0.1rc7.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/fivetran/fivetran.py
@@ -1,5 +1,6 @@
 import logging
 from typing import Dict, Iterable, List, Optional, Union
+from urllib.parse import urlparse
 
 import datahub.emitter.mce_builder as builder
 from datahub.api.entities.datajob import DataJob as DataJobV1
@@ -22,6 +23,7 @@ from datahub.ingestion.api.source import (
     StructuredLogCategory,
 )
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.common.subtypes import DatasetSubTypes
 from datahub.ingestion.source.fivetran.config import (
     KNOWN_DATA_PLATFORM_MAPPING,
     Constant,
@@ -35,29 +37,39 @@ from datahub.ingestion.source.fivetran.fivetran_query import (
     MAX_JOBS_PER_CONNECTOR,
     MAX_TABLE_LINEAGE_PER_CONNECTOR,
 )
+from datahub.ingestion.source.fivetran.fivetran_rest_api import FivetranAPIClient
+from datahub.ingestion.source.fivetran.response_models import FivetranConnectionDetails
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
 )
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionSourceBase,
 )
+from datahub.metadata.com.linkedin.pegasus2avro.common import AuditStamp
 from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
     FineGrainedLineage,
     FineGrainedLineageDownstreamType,
     FineGrainedLineageUpstreamType,
+    UpstreamLineage,
+)
+from datahub.metadata.schema_classes import (
+    DatasetLineageTypeClass,
+    UpstreamClass,
 )
 from datahub.metadata.urns import CorpUserUrn, DataFlowUrn, DatasetUrn
 from datahub.sdk.dataflow import DataFlow
 from datahub.sdk.datajob import DataJob
+from datahub.sdk.dataset import Dataset
 from datahub.sdk.entity import Entity
 
 # Logger instance
 logger = logging.getLogger(__name__)
+CORPUSER_DATAHUB = "urn:li:corpuser:datahub"
 
 
 @platform_name("Fivetran")
 @config_class(FivetranSourceConfig)
-@support_status(SupportStatus.
+@support_status(SupportStatus.CERTIFIED)
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
 @capability(
     SourceCapability.LINEAGE_FINE,
@@ -76,8 +88,12 @@ class FivetranSource(StatefulIngestionSourceBase):
         super().__init__(config, ctx)
         self.config = config
         self.report = FivetranSourceReport()
-
         self.audit_log = FivetranLogAPI(self.config.fivetran_log_config)
+        self.api_client: Optional[FivetranAPIClient] = None
+        self._connection_details_cache: Dict[str, FivetranConnectionDetails] = {}
+
+        if self.config.api_config:
+            self.api_client = FivetranAPIClient(self.config.api_config)
 
     def _extend_lineage(self, connector: Connector, datajob: DataJob) -> Dict[str, str]:
         input_dataset_urn_list: List[Union[str, DatasetUrn]] = []
@@ -131,17 +147,43 @@ class FivetranSource(StatefulIngestionSourceBase):
                 if source_details.include_schema_in_urn
                 else lineage.source_table.split(".", 1)[1]
             )
-            input_dataset_urn =
-
-
-
-
-
-
-
-
-
+            input_dataset_urn: Optional[DatasetUrn] = None
+            # Special Handling for Google Sheets Connectors
+            if connector.connector_type == Constant.GOOGLE_SHEETS_CONNECTOR_TYPE:
+                # Get Google Sheet dataset details from Fivetran API
+                # This is cached in the api_client
+                gsheets_conn_details: Optional[FivetranConnectionDetails] = (
+                    self._get_connection_details_by_id(connector.connector_id)
+                )
+
+                if gsheets_conn_details:
+                    input_dataset_urn = DatasetUrn.create_from_ids(
+                        platform_id=Constant.GOOGLE_SHEETS_CONNECTOR_TYPE,
+                        table_name=self._get_gsheet_named_range_dataset_id(
+                            gsheets_conn_details
+                        ),
+                        env=source_details.env,
+                    )
+                else:
+                    self.report.warning(
+                        title="Failed to extract lineage for Google Sheets Connector",
+                        message="Unable to extract lineage for Google Sheets Connector, as the connector details are not available from Fivetran API.",
+                        context=f"{connector.connector_name} (connector_id: {connector.connector_id})",
+                    )
+            else:
+                input_dataset_urn = DatasetUrn.create_from_ids(
+                    platform_id=source_details.platform,
+                    table_name=(
+                        f"{source_details.database.lower()}.{source_table}"
+                        if source_details.database
+                        else source_table
+                    ),
+                    env=source_details.env,
+                    platform_instance=source_details.platform_instance,
+                )
+
+            if input_dataset_urn:
+                input_dataset_urn_list.append(input_dataset_urn)
 
             destination_table = (
                 lineage.destination_table
@@ -262,6 +304,67 @@ class FivetranSource(StatefulIngestionSourceBase):
             clone_outlets=True,
         )
 
+    def _get_connection_details_by_id(
+        self, connection_id: str
+    ) -> Optional[FivetranConnectionDetails]:
+        if self.api_client is None:
+            self.report.warning(
+                title="Fivetran API client is not initialized",
+                message="Google Sheets Connector details cannot be extracted, as Fivetran API client is not initialized.",
+                context=f"connector_id: {connection_id}",
+            )
+            return None
+
+        if connection_id in self._connection_details_cache:
+            return self._connection_details_cache[connection_id]
+
+        try:
+            self.report.report_fivetran_rest_api_call_count()
+            conn_details = self.api_client.get_connection_details_by_id(connection_id)
+            # Update Cache
+            if conn_details:
+                self._connection_details_cache[connection_id] = conn_details
+
+            return conn_details
+        except Exception as e:
+            self.report.warning(
+                title="Failed to get connection details for Google Sheets Connector",
+                message=f"Exception occurred while getting connection details from Fivetran API. {e}",
+                context=f"connector_id: {connection_id}",
+            )
+            return None
+
+    def _get_gsheet_sheet_id_from_url(
+        self, gsheets_conn_details: FivetranConnectionDetails
+    ) -> str:
+        # Extracting the sheet_id (1A82PdLAE7NXLLb5JcLPKeIpKUMytXQba5Z-Ei-mbXLo) from the sheet_id url
+        # "https://docs.google.com/spreadsheets/d/1A82PdLAE7NXLLb5JcLPKeIpKUMytXQba5Z-Ei-mbXLo/edit?gid=0#gid=0",
+        try:
+            parsed = urlparse(gsheets_conn_details.config.sheet_id)
+            # Example: https://docs.google.com/spreadsheets/d/<spreadsheetId>/edit
+            parts = parsed.path.split("/")
+            return parts[3] if len(parts) > 2 else ""
+        except Exception as e:
+            logger.warning(
+                f"Failed to extract sheet_id from the sheet_id url: {gsheets_conn_details.config.sheet_id}, {e}"
+            )
+
+        return ""
+
+    def _get_gsheet_named_range_dataset_id(
+        self, gsheets_conn_details: FivetranConnectionDetails
+    ) -> str:
+        sheet_id = self._get_gsheet_sheet_id_from_url(gsheets_conn_details)
+        named_range_id = (
+            f"{sheet_id}.{gsheets_conn_details.config.named_range}"
+            if sheet_id
+            else gsheets_conn_details.config.named_range
+        )
+        logger.debug(
+            f"Using gsheet_named_range_dataset_id: {named_range_id} for connector: {gsheets_conn_details.id}"
+        )
+        return named_range_id
+
     def _get_dpi_workunits(
         self, job: Job, dpi: DataProcessInstance
     ) -> Iterable[MetadataWorkUnit]:
@@ -295,6 +398,74 @@ class FivetranSource(StatefulIngestionSourceBase):
         self, connector: Connector
     ) -> Iterable[Union[MetadataWorkUnit, Entity]]:
         self.report.report_connectors_scanned()
+
+        """
+        -------------------------------------------------------
+        Special Handling for Google Sheets Connectors
+        -------------------------------------------------------
+        Google Sheets source is not supported by Datahub yet.
+        As a workaround, we are emitting a dataset entity for the Google Sheet
+        and adding it to the lineage. This workaround needs to be removed once
+        Datahub supports Google Sheets source natively.
+        -------------------------------------------------------
+        """
+        if connector.connector_type == Constant.GOOGLE_SHEETS_CONNECTOR_TYPE:
+            # Get Google Sheet dataset details from Fivetran API
+            gsheets_conn_details: Optional[FivetranConnectionDetails] = (
+                self._get_connection_details_by_id(connector.connector_id)
+            )
+
+            if gsheets_conn_details:
+                gsheets_dataset = Dataset(
+                    name=self._get_gsheet_sheet_id_from_url(gsheets_conn_details),
+                    platform=Constant.GOOGLE_SHEETS_CONNECTOR_TYPE,
+                    env=self.config.env,
+                    display_name=self._get_gsheet_sheet_id_from_url(
+                        gsheets_conn_details
+                    ),
+                    external_url=gsheets_conn_details.config.sheet_id,
+                    created=gsheets_conn_details.created_at,
+                    last_modified=gsheets_conn_details.source_sync_details.last_synced,
+                    subtype=DatasetSubTypes.GOOGLE_SHEETS,
+                    custom_properties={
+                        "ingested_by": "fivetran source",
+                        "connector_id": gsheets_conn_details.id,
+                    },
+                )
+                gsheets_named_range_dataset = Dataset(
+                    name=self._get_gsheet_named_range_dataset_id(gsheets_conn_details),
+                    platform=Constant.GOOGLE_SHEETS_CONNECTOR_TYPE,
+                    env=self.config.env,
+                    display_name=gsheets_conn_details.config.named_range,
+                    external_url=gsheets_conn_details.config.sheet_id,
+                    created=gsheets_conn_details.created_at,
+                    last_modified=gsheets_conn_details.source_sync_details.last_synced,
+                    subtype=DatasetSubTypes.GOOGLE_SHEETS_NAMED_RANGE,
+                    custom_properties={
+                        "ingested_by": "fivetran source",
+                        "connector_id": gsheets_conn_details.id,
+                    },
+                    upstreams=UpstreamLineage(
+                        upstreams=[
+                            UpstreamClass(
+                                dataset=str(gsheets_dataset.urn),
+                                type=DatasetLineageTypeClass.VIEW,
+                                auditStamp=AuditStamp(
+                                    time=int(
+                                        gsheets_conn_details.created_at.timestamp()
+                                        * 1000
+                                    ),
+                                    actor=CORPUSER_DATAHUB,
+                                ),
+                            )
+                        ],
+                        fineGrainedLineages=None,
+                    ),
+                )
+
+                yield gsheets_dataset
+                yield gsheets_named_range_dataset
+
         # Create dataflow entity with same name as connector name
         dataflow = self._generate_dataflow_from_connector(connector)
         yield dataflow
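For readers unfamiliar with the URL layout this relies on: config.sheet_id holds a full Google Sheets URL, and _get_gsheet_sheet_id_from_url takes the fourth path segment as the spreadsheet id. A minimal standalone sketch of that parsing (a hypothetical helper, not part of the package; note it guards with len(parts) > 3, whereas the code above uses > 2 and relies on its try/except to absorb a short path):

from urllib.parse import urlparse

def extract_sheet_id(sheet_url: str) -> str:
    # The path looks like /spreadsheets/d/<spreadsheetId>/edit, which splits into
    # ["", "spreadsheets", "d", "<spreadsheetId>", "edit"]; index 3 is the id.
    parts = urlparse(sheet_url).path.split("/")
    return parts[3] if len(parts) > 3 else ""

assert extract_sheet_id(
    "https://docs.google.com/spreadsheets/d/1A82PdLAE7NXLLb5JcLPKeIpKUMytXQba5Z-Ei-mbXLo/edit?gid=0#gid=0"
) == "1A82PdLAE7NXLLb5JcLPKeIpKUMytXQba5Z-Ei-mbXLo"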
datahub/ingestion/source/fivetran/fivetran_log_api.py
@@ -9,6 +9,7 @@ from sqlalchemy import create_engine
 
 from datahub.configuration.common import AllowDenyPattern, ConfigurationError
 from datahub.ingestion.source.fivetran.config import (
+    DISABLE_COL_LINEAGE_FOR_CONNECTOR_TYPES,
     Constant,
     FivetranLogConfig,
     FivetranSourceReport,
@@ -112,7 +113,11 @@ class FivetranLogAPI:
         """
         Returns dict of column lineage metadata with key as (<SOURCE_TABLE_ID>, <DESTINATION_TABLE_ID>)
         """
-        all_column_lineage = defaultdict(list)
+        all_column_lineage: Dict[Tuple[str, str], List] = defaultdict(list)
+
+        if not connector_ids:
+            return dict(all_column_lineage)
+
         column_lineage_result = self._query(
             self.fivetran_log_query.get_column_lineage_query(
                 connector_ids=connector_ids
@@ -130,7 +135,11 @@ class FivetranLogAPI:
         """
         Returns dict of table lineage metadata with key as 'CONNECTOR_ID'
         """
-        connectors_table_lineage_metadata = defaultdict(list)
+        connectors_table_lineage_metadata: Dict[str, List] = defaultdict(list)
+
+        if not connector_ids:
+            return dict(connectors_table_lineage_metadata)
+
         table_lineage_result = self._query(
             self.fivetran_log_query.get_table_lineage_query(connector_ids=connector_ids)
         )
@@ -246,9 +255,15 @@ class FivetranLogAPI:
         return self._get_users().get(user_id)
 
     def _fill_connectors_lineage(self, connectors: List[Connector]) -> None:
-        connector_ids
-
-
+        # Create 2 filtered connector_ids lists - one for table lineage and one for column lineage
+        tll_connector_ids: List[str] = []
+        cll_connector_ids: List[str] = []
+        for connector in connectors:
+            tll_connector_ids.append(connector.connector_id)
+            if connector.connector_type not in DISABLE_COL_LINEAGE_FOR_CONNECTOR_TYPES:
+                cll_connector_ids.append(connector.connector_id)
+        table_lineage_metadata = self._get_table_lineage_metadata(tll_connector_ids)
+        column_lineage_metadata = self._get_column_lineage_metadata(cll_connector_ids)
        for connector in connectors:
             connector.lineage = self._extract_connector_lineage(
                 table_lineage_result=table_lineage_metadata.get(connector.connector_id),
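The new empty-list guards above presumably matter because the lineage queries interpolate connector ids into a SQL IN clause, which would be malformed (or a pointless round trip) with no ids. A minimal sketch of the pattern under that assumption (the names and query text are illustrative, not the package's actual query builder):

from collections import defaultdict
from typing import Dict, List, Tuple

def get_column_lineage_metadata(connector_ids: List[str]) -> Dict[Tuple[str, str], List]:
    all_column_lineage: Dict[Tuple[str, str], List] = defaultdict(list)
    if not connector_ids:
        # Skip the database round trip entirely; "... IN ()" would not parse.
        return dict(all_column_lineage)
    id_list = ", ".join(f"'{cid}'" for cid in connector_ids)
    query = f"SELECT source_table_id, destination_table_id FROM ... WHERE connector_id IN ({id_list})"
    # ... execute `query` and append each row under its (source, destination) key
    return dict(all_column_lineage)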
datahub/ingestion/source/fivetran/fivetran_rest_api.py (new file)
@@ -0,0 +1,65 @@
+import logging
+
+import requests
+from requests.adapters import HTTPAdapter
+from urllib3.util import Retry
+
+from datahub.ingestion.source.fivetran.config import (
+    FivetranAPIConfig,
+)
+from datahub.ingestion.source.fivetran.response_models import FivetranConnectionDetails
+
+logger = logging.getLogger(__name__)
+
+# Retry configuration constants
+RETRY_MAX_TIMES = 3
+RETRY_STATUS_CODES = [429, 500, 502, 503, 504]
+RETRY_BACKOFF_FACTOR = 1
+RETRY_ALLOWED_METHODS = ["GET"]
+
+
+class FivetranAPIClient:
+    """Client for interacting with the Fivetran REST API."""
+
+    def __init__(self, config: FivetranAPIConfig) -> None:
+        self.config = config
+        self._session = self._create_session()
+
+    def _create_session(self) -> requests.Session:
+        """
+        Create a session with retry logic and basic authentication
+        """
+        requests_session = requests.Session()
+
+        # Configure retry strategy for transient failures
+        retry_strategy = Retry(
+            total=RETRY_MAX_TIMES,
+            backoff_factor=RETRY_BACKOFF_FACTOR,
+            status_forcelist=RETRY_STATUS_CODES,
+            allowed_methods=RETRY_ALLOWED_METHODS,
+            raise_on_status=True,
+        )
+
+        adapter = HTTPAdapter(max_retries=retry_strategy)
+        requests_session.mount("http://", adapter)
+        requests_session.mount("https://", adapter)
+
+        # Set up basic authentication
+        requests_session.auth = (self.config.api_key, self.config.api_secret)
+        requests_session.headers.update(
+            {
+                "Content-Type": "application/json",
+                "Accept": "application/json",
+            }
+        )
+        return requests_session
+
+    def get_connection_details_by_id(
+        self, connection_id: str
+    ) -> FivetranConnectionDetails:
+        """Get details for a specific connection."""
+        connection_details = self._session.get(
+            f"{self.config.base_url}/v1/connections/{connection_id}",
+            timeout=self.config.request_timeout_sec,
+        )
+        return FivetranConnectionDetails(**connection_details.json().get("data", {}))
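A hedged usage sketch of the new client. The FivetranAPIConfig field names (api_key, api_secret, base_url, request_timeout_sec) are inferred from their uses in the client above; the values below are placeholders, and base_url and request_timeout_sec are assumed to have defaults in config.py (otherwise pass them explicitly):

from datahub.ingestion.source.fivetran.config import FivetranAPIConfig
from datahub.ingestion.source.fivetran.fivetran_rest_api import FivetranAPIClient

config = FivetranAPIConfig(
    api_key="<FIVETRAN_API_KEY>",
    api_secret="<FIVETRAN_API_SECRET>",
)
client = FivetranAPIClient(config)

# Issues a single GET to {base_url}/v1/connections/{id}; 429/5xx responses are
# retried with backoff, then the "data" payload is validated into the pydantic model.
details = client.get_connection_details_by_id("<connection_id>")
print(details.config.named_range)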
datahub/ingestion/source/fivetran/response_models.py (new file)
@@ -0,0 +1,97 @@
+import datetime
+from typing import Dict, List
+
+from pydantic import BaseModel
+
+
+class FivetranConnectionWarnings(BaseModel):
+    code: str  # Warning Code
+    message: str  # Warning Message
+    details: Dict  # Warning Details
+
+
+class FivetranConnectionStatus(BaseModel):
+    setup_state: str  # Setup State
+    schema_status: str  # Schema Status
+    sync_state: str  # Sync State
+    update_state: str  # Update State
+    is_historical_sync: bool  # Is Historical Sync
+    warnings: List[FivetranConnectionWarnings]  # Warnings
+
+
+class FivetranConnectionConfig(BaseModel):
+    # Note: Connection Config is different for different connectors
+    auth_type: str  # Auth Type
+    sheet_id: str  # Sheet ID - URL to the Google Sheet
+    named_range: str  # Named Range
+
+
+class FivetranConnectionSourceSyncDetails(BaseModel):
+    last_synced: datetime.datetime  # Last Synced
+
+
+class FivetranConnectionDetails(BaseModel):
+    """
+    Note: This response class only captures fields that are relevant to the Google Sheets Connector
+    """
+
+    id: str  # Source ID
+    group_id: str  # Destination ID
+    service: str  # Connector Type
+    created_at: datetime.datetime
+    succeeded_at: datetime.datetime
+    paused: bool  # Paused Status
+    sync_frequency: int  # Sync Frequency (minutes)
+    status: FivetranConnectionStatus  # Status
+    config: FivetranConnectionConfig  # Connection Config
+    source_sync_details: FivetranConnectionSourceSyncDetails  # Source Sync Details
+
+"""
+# Sample Response for Google Sheets Connector
+{
+    "code": "Success",
+    "data": {
+        "id": "dialectical_remindful",
+        "group_id": "empties_classification",
+        "service": "google_sheets",
+        "service_version": 1,
+        "schema": "fivetran_google_sheets.fivetran_google_sheets",
+        "connected_by": "sewn_restrained",
+        "created_at": "2025-10-06T17:53:01.554289Z",
+        "succeeded_at": "2025-10-06T22:55:45.275000Z",
+        "failed_at": null,
+        "paused": true,
+        "pause_after_trial": false,
+        "sync_frequency": 360,
+        "data_delay_threshold": 0,
+        "data_delay_sensitivity": "NORMAL",
+        "private_link_id": null,
+        "networking_method": "Directly",
+        "proxy_agent_id": null,
+        "schedule_type": "auto",
+        "status": {
+            "setup_state": "connected",
+            "schema_status": "ready",
+            "sync_state": "paused",
+            "update_state": "on_schedule",
+            "is_historical_sync": false,
+            "tasks": [],
+            "warnings": [
+                {
+                    "code": "snowflake_discontinuing_password_auth",
+                    "message": "Snowflake is discontinuing username/password authentication",
+                    "details": {}
+                }
+            ]
+        },
+        "config": {
+            "auth_type": "ServiceAccount",
+            "sheet_id": "https://docs.google.com/spreadsheets/d/1A82PdLAE7NXLLb5JcLPKeIpKUMytXQba5Z-Ei-mbXLo/edit?gid=0#gid=0",
+            "named_range": "Fivetran_Test_Range"
+        },
+        "source_sync_details": {
+            "last_synced": "2025-10-06T22:55:27.371Z"
+        }
+    }
+}
+"""
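Since these are plain pydantic models, the sample response above maps onto them directly; a minimal sketch of the parsing the client performs (payload abbreviated to the modeled fields; by default pydantic ignores extra keys such as service_version and schedule_type):

from datahub.ingestion.source.fivetran.response_models import FivetranConnectionDetails

data = {
    "id": "dialectical_remindful",
    "group_id": "empties_classification",
    "service": "google_sheets",
    "created_at": "2025-10-06T17:53:01.554289Z",
    "succeeded_at": "2025-10-06T22:55:45.275000Z",
    "paused": True,
    "sync_frequency": 360,
    "status": {
        "setup_state": "connected",
        "schema_status": "ready",
        "sync_state": "paused",
        "update_state": "on_schedule",
        "is_historical_sync": False,
        "warnings": [],
    },
    "config": {
        "auth_type": "ServiceAccount",
        "sheet_id": "https://docs.google.com/spreadsheets/d/1A82PdLAE7NXLLb5JcLPKeIpKUMytXQba5Z-Ei-mbXLo/edit?gid=0#gid=0",
        "named_range": "Fivetran_Test_Range",
    },
    "source_sync_details": {"last_synced": "2025-10-06T22:55:27.371Z"},
}

details = FivetranConnectionDetails(**data)
assert details.config.named_range == "Fivetran_Test_Range"
assert details.source_sync_details.last_synced.year == 2025  # ISO strings become datetimes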
datahub/ingestion/source/hex/hex.py
@@ -178,7 +178,7 @@ class HexReport(
 
 @platform_name("Hex")
 @config_class(HexSourceConfig)
-@support_status(SupportStatus.
+@support_status(SupportStatus.INCUBATING)
 @capability(SourceCapability.DESCRIPTIONS, "Supported by default")
 @capability(SourceCapability.OWNERSHIP, "Supported by default")
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
datahub/ingestion/source/iceberg/iceberg.py
@@ -118,7 +118,7 @@ logging.getLogger("azure.core.pipeline.policies.http_logging_policy").setLevel(
 
 
 @platform_name("Iceberg")
-@support_status(SupportStatus.
+@support_status(SupportStatus.INCUBATING)
 @config_class(IcebergSourceConfig)
 @capability(
     SourceCapability.PLATFORM_INSTANCE,
datahub/ingestion/source/metabase.py
@@ -52,6 +52,7 @@ from datahub.metadata.schema_classes import (
     ChartQueryTypeClass,
     ChartTypeClass,
     DashboardInfoClass,
+    EdgeClass,
     OwnerClass,
     OwnershipClass,
     OwnershipTypeClass,
@@ -338,19 +339,25 @@ class MetabaseSource(StatefulIngestionSourceBase):
             lastModified=AuditStamp(time=modified_ts, actor=modified_actor),
         )
 
-
+        # Convert chart URNs to chart edges (instead of deprecated charts field)
+        chart_edges = []
         cards_data = dashboard_details.get("dashcards", {})
         for card_info in cards_data:
             card_id = card_info.get("card").get("id", "")
             if not card_id:
                 continue  # most likely a virtual card without an id (text or heading), not relevant.
             chart_urn = builder.make_chart_urn(self.platform, str(card_id))
-
+            chart_edges.append(
+                EdgeClass(
+                    destinationUrn=chart_urn,
+                    lastModified=last_modified.lastModified,
+                )
+            )
 
         dashboard_info_class = DashboardInfoClass(
             description=description,
             title=title,
-
+            chartEdges=chart_edges,
             lastModified=last_modified,
             dashboardUrl=f"{self.config.display_uri}/dashboard/{dashboard_id}",
             customProperties={},
@@ -488,13 +495,25 @@ class MetabaseSource(StatefulIngestionSourceBase):
         datasource_urn = self.get_datasource_urn(card_details)
         custom_properties = self.construct_card_custom_properties(card_details)
 
+        input_edges = (
+            [
+                EdgeClass(
+                    destinationUrn=urn,
+                    lastModified=last_modified.lastModified,
+                )
+                for urn in datasource_urn
+            ]
+            if datasource_urn
+            else None
+        )
+
         chart_info = ChartInfoClass(
             type=chart_type,
             description=description,
             title=title,
             lastModified=last_modified,
             chartUrl=f"{self.config.display_uri}/card/{card_id}",
-
+            inputEdges=input_edges,
             customProperties=custom_properties,
         )
         chart_snapshot.aspects.append(chart_info)
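For context on the Metabase change: the deprecated charts/inputs fields carried bare urns, while the edge-based chartEdges/inputEdges fields pair each urn with an audit stamp. A minimal sketch of the shape being constructed (urn and timestamp values are placeholders):

from datahub.metadata.schema_classes import AuditStampClass, EdgeClass

edge = EdgeClass(
    destinationUrn="urn:li:chart:(metabase,42)",
    lastModified=AuditStampClass(
        time=1700000000000,  # epoch millis
        actor="urn:li:corpuser:datahub",
    ),
)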
datahub/ingestion/source/mlflow.py
@@ -136,7 +136,7 @@ class MLflowRegisteredModelStageInfo:
 
 @platform_name("MLflow")
 @config_class(MLflowConfig)
-@support_status(SupportStatus.
+@support_status(SupportStatus.INCUBATING)
 @capability(
     SourceCapability.DESCRIPTIONS,
     "Extract descriptions for MLflow Registered Models and Model Versions",
datahub/ingestion/source/s3/source.py
@@ -188,7 +188,7 @@ class TableData:
 
 @platform_name("S3 / Local Files", id="s3")
 @config_class(DataLakeSourceConfig)
-@support_status(SupportStatus.
+@support_status(SupportStatus.CERTIFIED)
 @capability(
     SourceCapability.CONTAINERS,
     "Enabled by default",
datahub/ingestion/source/salesforce.py
@@ -527,7 +527,7 @@ class SalesforceApi:
 
 @platform_name("Salesforce")
 @config_class(SalesforceConfig)
-@support_status(SupportStatus.
+@support_status(SupportStatus.CERTIFIED)
 @capability(
     capability_name=SourceCapability.PLATFORM_INSTANCE,
     description="Can be equivalent to Salesforce organization",
datahub/ingestion/source/slack/slack.py
@@ -245,7 +245,7 @@ DATA_PLATFORM_SLACK_URN: str = builder.make_data_platform_urn(PLATFORM_NAME)
 
 @platform_name("Slack")
 @config_class(SlackSourceConfig)
-@support_status(SupportStatus.
+@support_status(SupportStatus.CERTIFIED)
 class SlackSource(StatefulIngestionSourceBase):
     def __init__(self, ctx: PipelineContext, config: SlackSourceConfig):
         super().__init__(config, ctx)
datahub/ingestion/source/snowflake/snowflake_queries.py
@@ -21,6 +21,7 @@ from datahub.configuration.time_window_config import (
 )
 from datahub.ingestion.api.closeable import Closeable
 from datahub.ingestion.api.common import PipelineContext
+from datahub.ingestion.api.decorators import SupportStatus, config_class, support_status
 from datahub.ingestion.api.report import Report
 from datahub.ingestion.api.source import Source, SourceReport
 from datahub.ingestion.api.source_helpers import auto_workunit
@@ -750,6 +751,8 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
         self._exit_stack.close()
 
 
+@support_status(SupportStatus.CERTIFIED)
+@config_class(SnowflakeQueriesSourceConfig)
 class SnowflakeQueriesSource(Source):
     def __init__(self, ctx: PipelineContext, config: SnowflakeQueriesSourceConfig):
         self.ctx = ctx
datahub/ingestion/source/snowflake/snowflake_summary.py
@@ -59,7 +59,7 @@ class SnowflakeSummaryReport(SourceReport, BaseTimeWindowReport):
 
 
 @config_class(SnowflakeSummaryConfig)
-@support_status(SupportStatus.
+@support_status(SupportStatus.CERTIFIED)
 class SnowflakeSummarySource(Source):
     def __init__(self, ctx: PipelineContext, config: SnowflakeSummaryConfig):
         super().__init__(ctx)
datahub/ingestion/source/sql_queries.py
@@ -93,7 +93,7 @@ class SqlQueriesSourceReport(SourceReport):
     sql_aggregator: Optional[SqlAggregatorReport] = None
 
 
-@platform_name("SQL Queries")
+@platform_name("SQL Queries", id="sql-queries")
 @config_class(SqlQueriesSourceConfig)
 @support_status(SupportStatus.INCUBATING)
 @capability(SourceCapability.LINEAGE_COARSE, "Parsed from SQL queries")
datahub/ingestion/source/unity/source.py
@@ -176,7 +176,7 @@ logger: logging.Logger = logging.getLogger(__name__)
     supported=True,
 )
 @capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
-@support_status(SupportStatus.
+@support_status(SupportStatus.CERTIFIED)
 class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
     """
     This plugin extracts the following metadata from Databricks Unity Catalog:
datahub/ingestion/source/vertexai/vertexai.py
@@ -145,7 +145,7 @@ class PipelineMetadata:
 
 @platform_name("Vertex AI", id="vertexai")
 @config_class(VertexAIConfig)
-@support_status(SupportStatus.
+@support_status(SupportStatus.INCUBATING)
 @capability(
     SourceCapability.DESCRIPTIONS,
     "Extract descriptions for Vertex AI Registered Models and Model Versions",