acryl-datahub 1.3.0.1rc5__py3-none-any.whl → 1.3.0.1rc7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.3.0.1rc5.dist-info → acryl_datahub-1.3.0.1rc7.dist-info}/METADATA +2332 -2333
- {acryl_datahub-1.3.0.1rc5.dist-info → acryl_datahub-1.3.0.1rc7.dist-info}/RECORD +47 -42
- datahub/_version.py +1 -1
- datahub/cli/docker_check.py +1 -1
- datahub/emitter/mce_builder.py +6 -0
- datahub/ingestion/autogenerated/capability_summary.json +12 -12
- datahub/ingestion/source/bigquery_v2/bigquery.py +17 -1
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +16 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +2 -0
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +41 -4
- datahub/ingestion/source/common/subtypes.py +2 -0
- datahub/ingestion/source/dremio/dremio_source.py +15 -15
- datahub/ingestion/source/dynamodb/dynamodb.py +1 -1
- datahub/ingestion/source/fivetran/config.py +33 -0
- datahub/ingestion/source/fivetran/fivetran.py +184 -13
- datahub/ingestion/source/fivetran/fivetran_log_api.py +20 -5
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/hex/hex.py +1 -1
- datahub/ingestion/source/iceberg/iceberg.py +1 -1
- datahub/ingestion/source/metabase.py +23 -4
- datahub/ingestion/source/mlflow.py +1 -1
- datahub/ingestion/source/s3/source.py +1 -1
- datahub/ingestion/source/salesforce.py +1 -1
- datahub/ingestion/source/slack/slack.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +16 -0
- datahub/ingestion/source/snowflake/snowflake_queries.py +49 -6
- datahub/ingestion/source/snowflake/snowflake_summary.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_v2.py +14 -1
- datahub/ingestion/source/sql_queries.py +1 -1
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stateful_ingestion_base.py +30 -2
- datahub/ingestion/source/unity/source.py +1 -1
- datahub/ingestion/source/vertexai/vertexai.py +1 -1
- datahub/metadata/_internal_schema_classes.py +223 -0
- datahub/metadata/_urns/urn_defs.py +56 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
- datahub/metadata/schema.avsc +208 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/sdk/mlmodel.py +19 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +18 -4
- {acryl_datahub-1.3.0.1rc5.dist-info → acryl_datahub-1.3.0.1rc7.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.3.0.1rc5.dist-info → acryl_datahub-1.3.0.1rc7.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.3.0.1rc5.dist-info → acryl_datahub-1.3.0.1rc7.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.3.0.1rc5.dist-info → acryl_datahub-1.3.0.1rc7.dist-info}/top_level.txt +0 -0
|
@@ -34,6 +34,8 @@ class DatasetSubTypes(StrEnum):
|
|
|
34
34
|
API_ENDPOINT = "API Endpoint"
|
|
35
35
|
SLACK_CHANNEL = "Slack Channel"
|
|
36
36
|
PROJECTIONS = "Projections"
|
|
37
|
+
GOOGLE_SHEETS = "Google Sheets"
|
|
38
|
+
GOOGLE_SHEETS_NAMED_RANGE = "Google Sheets Named Range"
|
|
37
39
|
|
|
38
40
|
# TODO: Create separate entity...
|
|
39
41
|
NOTEBOOK = "Notebook"
|
|
@@ -338,10 +338,10 @@ class DremioSource(StatefulIngestionSourceBase):
|
|
|
338
338
|
return
|
|
339
339
|
|
|
340
340
|
dataset_urn = make_dataset_urn_with_platform_instance(
|
|
341
|
-
platform=
|
|
342
|
-
name=
|
|
343
|
-
env=self.config.env,
|
|
341
|
+
platform=self.get_platform(),
|
|
342
|
+
name=dataset_name,
|
|
344
343
|
platform_instance=self.config.platform_instance,
|
|
344
|
+
env=self.config.env,
|
|
345
345
|
)
|
|
346
346
|
|
|
347
347
|
for dremio_mcp in self.dremio_aspects.populate_dataset_mcp(
|
|
@@ -421,10 +421,10 @@ class DremioSource(StatefulIngestionSourceBase):
|
|
|
421
421
|
schema_str = ".".join(dataset_info.path)
|
|
422
422
|
dataset_name = f"{schema_str}.{dataset_info.resource_name}".lower()
|
|
423
423
|
dataset_urn = make_dataset_urn_with_platform_instance(
|
|
424
|
-
platform=
|
|
425
|
-
name=
|
|
426
|
-
env=self.config.env,
|
|
424
|
+
platform=self.get_platform(),
|
|
425
|
+
name=dataset_name,
|
|
427
426
|
platform_instance=self.config.platform_instance,
|
|
427
|
+
env=self.config.env,
|
|
428
428
|
)
|
|
429
429
|
yield from self.profiler.get_workunits(dataset_info, dataset_urn)
|
|
430
430
|
|
|
@@ -436,10 +436,10 @@ class DremioSource(StatefulIngestionSourceBase):
|
|
|
436
436
|
"""
|
|
437
437
|
upstream_urns = [
|
|
438
438
|
make_dataset_urn_with_platform_instance(
|
|
439
|
-
platform=
|
|
440
|
-
name=
|
|
441
|
-
env=self.config.env,
|
|
439
|
+
platform=self.get_platform(),
|
|
440
|
+
name=upstream_table.lower(),
|
|
442
441
|
platform_instance=self.config.platform_instance,
|
|
442
|
+
env=self.config.env,
|
|
443
443
|
)
|
|
444
444
|
for upstream_table in parents
|
|
445
445
|
]
|
|
@@ -498,19 +498,19 @@ class DremioSource(StatefulIngestionSourceBase):
|
|
|
498
498
|
if query.query and query.affected_dataset:
|
|
499
499
|
upstream_urns = [
|
|
500
500
|
make_dataset_urn_with_platform_instance(
|
|
501
|
-
platform=
|
|
502
|
-
name=
|
|
503
|
-
env=self.config.env,
|
|
501
|
+
platform=self.get_platform(),
|
|
502
|
+
name=ds.lower(),
|
|
504
503
|
platform_instance=self.config.platform_instance,
|
|
504
|
+
env=self.config.env,
|
|
505
505
|
)
|
|
506
506
|
for ds in query.queried_datasets
|
|
507
507
|
]
|
|
508
508
|
|
|
509
509
|
downstream_urn = make_dataset_urn_with_platform_instance(
|
|
510
|
-
platform=
|
|
511
|
-
name=
|
|
512
|
-
env=self.config.env,
|
|
510
|
+
platform=self.get_platform(),
|
|
511
|
+
name=query.affected_dataset.lower(),
|
|
513
512
|
platform_instance=self.config.platform_instance,
|
|
513
|
+
env=self.config.env,
|
|
514
514
|
)
|
|
515
515
|
|
|
516
516
|
# Add query to SqlParsingAggregator
|
|
@@ -163,7 +163,7 @@ _attribute_type_to_field_type_mapping: Dict[str, Type] = {
|
|
|
163
163
|
|
|
164
164
|
@platform_name("DynamoDB", id="dynamodb")
|
|
165
165
|
@config_class(DynamoDBConfig)
|
|
166
|
-
@support_status(SupportStatus.
|
|
166
|
+
@support_status(SupportStatus.INCUBATING)
|
|
167
167
|
@capability(
|
|
168
168
|
SourceCapability.PLATFORM_INSTANCE,
|
|
169
169
|
"By default, platform_instance will use the AWS account id",
|
|
@@ -68,14 +68,22 @@ class Constant:
|
|
|
68
68
|
SUCCESSFUL = "SUCCESSFUL"
|
|
69
69
|
FAILURE_WITH_TASK = "FAILURE_WITH_TASK"
|
|
70
70
|
CANCELED = "CANCELED"
|
|
71
|
+
GOOGLE_SHEETS_CONNECTOR_TYPE = "google_sheets"
|
|
71
72
|
|
|
72
73
|
|
|
74
|
+
# Key: Connector Type, Value: Platform ID/Name
|
|
73
75
|
KNOWN_DATA_PLATFORM_MAPPING = {
|
|
74
76
|
"google_cloud_postgresql": "postgres",
|
|
75
77
|
"postgres": "postgres",
|
|
76
78
|
"snowflake": "snowflake",
|
|
79
|
+
Constant.GOOGLE_SHEETS_CONNECTOR_TYPE: Constant.GOOGLE_SHEETS_CONNECTOR_TYPE,
|
|
77
80
|
}
|
|
78
81
|
|
|
82
|
+
# Note: (As of Oct 2025) Fivetran Platform Connector retains stale column lineage metadata for Google Sheets columns that have been deleted or renamed.
|
|
83
|
+
# Ref: https://fivetran.com/docs/connectors/files/google-sheets#deletingdata
|
|
84
|
+
# TODO: Remove Google Sheets connector type from DISABLE_COL_LINEAGE_FOR_CONNECTOR_TYPES
|
|
85
|
+
DISABLE_COL_LINEAGE_FOR_CONNECTOR_TYPES = [Constant.GOOGLE_SHEETS_CONNECTOR_TYPE]
|
|
86
|
+
|
|
79
87
|
|
|
80
88
|
class SnowflakeDestinationConfig(SnowflakeConnectionConfig):
|
|
81
89
|
database: str = Field(description="The fivetran connector log database.")
|
|
@@ -97,6 +105,17 @@ class DatabricksDestinationConfig(UnityCatalogConnectionConfig):
|
|
|
97
105
|
return warehouse_id
|
|
98
106
|
|
|
99
107
|
|
|
108
|
+
class FivetranAPIConfig(ConfigModel):
|
|
109
|
+
api_key: str = Field(description="Fivetran API key")
|
|
110
|
+
api_secret: str = Field(description="Fivetran API secret")
|
|
111
|
+
base_url: str = Field(
|
|
112
|
+
default="https://api.fivetran.com", description="Fivetran API base URL"
|
|
113
|
+
)
|
|
114
|
+
request_timeout_sec: int = Field(
|
|
115
|
+
default=30, description="Request timeout in seconds"
|
|
116
|
+
)
|
|
117
|
+
|
|
118
|
+
|
|
100
119
|
class FivetranLogConfig(ConfigModel):
|
|
101
120
|
destination_platform: Literal["snowflake", "bigquery", "databricks"] = (
|
|
102
121
|
pydantic.Field(
|
|
@@ -163,6 +182,7 @@ class MetadataExtractionPerfReport(Report):
|
|
|
163
182
|
@dataclasses.dataclass
|
|
164
183
|
class FivetranSourceReport(StaleEntityRemovalSourceReport):
|
|
165
184
|
connectors_scanned: int = 0
|
|
185
|
+
fivetran_rest_api_call_count: int = 0
|
|
166
186
|
filtered_connectors: LossyList[str] = dataclasses.field(default_factory=LossyList)
|
|
167
187
|
metadata_extraction_perf: MetadataExtractionPerfReport = dataclasses.field(
|
|
168
188
|
default_factory=MetadataExtractionPerfReport
|
|
@@ -174,6 +194,9 @@ class FivetranSourceReport(StaleEntityRemovalSourceReport):
|
|
|
174
194
|
def report_connectors_dropped(self, connector: str) -> None:
|
|
175
195
|
self.filtered_connectors.append(connector)
|
|
176
196
|
|
|
197
|
+
def report_fivetran_rest_api_call_count(self) -> None:
|
|
198
|
+
self.fivetran_rest_api_call_count += 1
|
|
199
|
+
|
|
177
200
|
|
|
178
201
|
class PlatformDetail(ConfigModel):
|
|
179
202
|
platform: Optional[str] = pydantic.Field(
|
|
@@ -234,6 +257,16 @@ class FivetranSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin
|
|
|
234
257
|
description="A mapping of destination id to its platform/instance/env details.",
|
|
235
258
|
)
|
|
236
259
|
|
|
260
|
+
"""
|
|
261
|
+
Use the Fivetran REST API to get:
|
|
262
|
+
- Google Sheets Connector details and emit related entities
|
|
263
|
+
Fivetran Platform Connector syncs limited information about the Google Sheets Connector.
|
|
264
|
+
"""
|
|
265
|
+
api_config: Optional[FivetranAPIConfig] = Field(
|
|
266
|
+
default=None,
|
|
267
|
+
description="Fivetran REST API configuration, used to provide wider support for connections.",
|
|
268
|
+
)
|
|
269
|
+
|
|
237
270
|
@pydantic.root_validator(pre=True)
|
|
238
271
|
def compat_sources_to_database(cls, values: Dict) -> Dict:
|
|
239
272
|
if "sources_to_database" in values:
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
from typing import Dict, Iterable, List, Optional, Union
|
|
3
|
+
from urllib.parse import urlparse
|
|
3
4
|
|
|
4
5
|
import datahub.emitter.mce_builder as builder
|
|
5
6
|
from datahub.api.entities.datajob import DataJob as DataJobV1
|
|
@@ -22,6 +23,7 @@ from datahub.ingestion.api.source import (
|
|
|
22
23
|
StructuredLogCategory,
|
|
23
24
|
)
|
|
24
25
|
from datahub.ingestion.api.workunit import MetadataWorkUnit
|
|
26
|
+
from datahub.ingestion.source.common.subtypes import DatasetSubTypes
|
|
25
27
|
from datahub.ingestion.source.fivetran.config import (
|
|
26
28
|
KNOWN_DATA_PLATFORM_MAPPING,
|
|
27
29
|
Constant,
|
|
@@ -35,29 +37,39 @@ from datahub.ingestion.source.fivetran.fivetran_query import (
|
|
|
35
37
|
MAX_JOBS_PER_CONNECTOR,
|
|
36
38
|
MAX_TABLE_LINEAGE_PER_CONNECTOR,
|
|
37
39
|
)
|
|
40
|
+
from datahub.ingestion.source.fivetran.fivetran_rest_api import FivetranAPIClient
|
|
41
|
+
from datahub.ingestion.source.fivetran.response_models import FivetranConnectionDetails
|
|
38
42
|
from datahub.ingestion.source.state.stale_entity_removal_handler import (
|
|
39
43
|
StaleEntityRemovalHandler,
|
|
40
44
|
)
|
|
41
45
|
from datahub.ingestion.source.state.stateful_ingestion_base import (
|
|
42
46
|
StatefulIngestionSourceBase,
|
|
43
47
|
)
|
|
48
|
+
from datahub.metadata.com.linkedin.pegasus2avro.common import AuditStamp
|
|
44
49
|
from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
|
|
45
50
|
FineGrainedLineage,
|
|
46
51
|
FineGrainedLineageDownstreamType,
|
|
47
52
|
FineGrainedLineageUpstreamType,
|
|
53
|
+
UpstreamLineage,
|
|
54
|
+
)
|
|
55
|
+
from datahub.metadata.schema_classes import (
|
|
56
|
+
DatasetLineageTypeClass,
|
|
57
|
+
UpstreamClass,
|
|
48
58
|
)
|
|
49
59
|
from datahub.metadata.urns import CorpUserUrn, DataFlowUrn, DatasetUrn
|
|
50
60
|
from datahub.sdk.dataflow import DataFlow
|
|
51
61
|
from datahub.sdk.datajob import DataJob
|
|
62
|
+
from datahub.sdk.dataset import Dataset
|
|
52
63
|
from datahub.sdk.entity import Entity
|
|
53
64
|
|
|
54
65
|
# Logger instance
|
|
55
66
|
logger = logging.getLogger(__name__)
|
|
67
|
+
CORPUSER_DATAHUB = "urn:li:corpuser:datahub"
|
|
56
68
|
|
|
57
69
|
|
|
58
70
|
@platform_name("Fivetran")
|
|
59
71
|
@config_class(FivetranSourceConfig)
|
|
60
|
-
@support_status(SupportStatus.
|
|
72
|
+
@support_status(SupportStatus.CERTIFIED)
|
|
61
73
|
@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
|
|
62
74
|
@capability(
|
|
63
75
|
SourceCapability.LINEAGE_FINE,
|
|
@@ -76,8 +88,12 @@ class FivetranSource(StatefulIngestionSourceBase):
|
|
|
76
88
|
super().__init__(config, ctx)
|
|
77
89
|
self.config = config
|
|
78
90
|
self.report = FivetranSourceReport()
|
|
79
|
-
|
|
80
91
|
self.audit_log = FivetranLogAPI(self.config.fivetran_log_config)
|
|
92
|
+
self.api_client: Optional[FivetranAPIClient] = None
|
|
93
|
+
self._connection_details_cache: Dict[str, FivetranConnectionDetails] = {}
|
|
94
|
+
|
|
95
|
+
if self.config.api_config:
|
|
96
|
+
self.api_client = FivetranAPIClient(self.config.api_config)
|
|
81
97
|
|
|
82
98
|
def _extend_lineage(self, connector: Connector, datajob: DataJob) -> Dict[str, str]:
|
|
83
99
|
input_dataset_urn_list: List[Union[str, DatasetUrn]] = []
|
|
@@ -131,17 +147,43 @@ class FivetranSource(StatefulIngestionSourceBase):
|
|
|
131
147
|
if source_details.include_schema_in_urn
|
|
132
148
|
else lineage.source_table.split(".", 1)[1]
|
|
133
149
|
)
|
|
134
|
-
input_dataset_urn =
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
150
|
+
input_dataset_urn: Optional[DatasetUrn] = None
|
|
151
|
+
# Special Handling for Google Sheets Connectors
|
|
152
|
+
if connector.connector_type == Constant.GOOGLE_SHEETS_CONNECTOR_TYPE:
|
|
153
|
+
# Get Google Sheet dataset details from Fivetran API
|
|
154
|
+
# Successful lookups are cached on this source in _connection_details_cache
|
|
155
|
+
gsheets_conn_details: Optional[FivetranConnectionDetails] = (
|
|
156
|
+
self._get_connection_details_by_id(connector.connector_id)
|
|
157
|
+
)
|
|
158
|
+
|
|
159
|
+
if gsheets_conn_details:
|
|
160
|
+
input_dataset_urn = DatasetUrn.create_from_ids(
|
|
161
|
+
platform_id=Constant.GOOGLE_SHEETS_CONNECTOR_TYPE,
|
|
162
|
+
table_name=self._get_gsheet_named_range_dataset_id(
|
|
163
|
+
gsheets_conn_details
|
|
164
|
+
),
|
|
165
|
+
env=source_details.env,
|
|
166
|
+
)
|
|
167
|
+
else:
|
|
168
|
+
self.report.warning(
|
|
169
|
+
title="Failed to extract lineage for Google Sheets Connector",
|
|
170
|
+
message="Unable to extract lineage for Google Sheets Connector, as the connector details are not available from Fivetran API.",
|
|
171
|
+
context=f"{connector.connector_name} (connector_id: {connector.connector_id})",
|
|
172
|
+
)
|
|
173
|
+
else:
|
|
174
|
+
input_dataset_urn = DatasetUrn.create_from_ids(
|
|
175
|
+
platform_id=source_details.platform,
|
|
176
|
+
table_name=(
|
|
177
|
+
f"{source_details.database.lower()}.{source_table}"
|
|
178
|
+
if source_details.database
|
|
179
|
+
else source_table
|
|
180
|
+
),
|
|
181
|
+
env=source_details.env,
|
|
182
|
+
platform_instance=source_details.platform_instance,
|
|
183
|
+
)
|
|
184
|
+
|
|
185
|
+
if input_dataset_urn:
|
|
186
|
+
input_dataset_urn_list.append(input_dataset_urn)
|
|
145
187
|
|
|
146
188
|
destination_table = (
|
|
147
189
|
lineage.destination_table
|
|
@@ -262,6 +304,67 @@ class FivetranSource(StatefulIngestionSourceBase):
|
|
|
262
304
|
clone_outlets=True,
|
|
263
305
|
)
|
|
264
306
|
|
|
307
|
+
def _get_connection_details_by_id(
|
|
308
|
+
self, connection_id: str
|
|
309
|
+
) -> Optional[FivetranConnectionDetails]:
|
|
310
|
+
if self.api_client is None:
|
|
311
|
+
self.report.warning(
|
|
312
|
+
title="Fivetran API client is not initialized",
|
|
313
|
+
message="Google Sheets Connector details cannot be extracted, as Fivetran API client is not initialized.",
|
|
314
|
+
context=f"connector_id: {connection_id}",
|
|
315
|
+
)
|
|
316
|
+
return None
|
|
317
|
+
|
|
318
|
+
if connection_id in self._connection_details_cache:
|
|
319
|
+
return self._connection_details_cache[connection_id]
|
|
320
|
+
|
|
321
|
+
try:
|
|
322
|
+
self.report.report_fivetran_rest_api_call_count()
|
|
323
|
+
conn_details = self.api_client.get_connection_details_by_id(connection_id)
|
|
324
|
+
# Update Cache
|
|
325
|
+
if conn_details:
|
|
326
|
+
self._connection_details_cache[connection_id] = conn_details
|
|
327
|
+
|
|
328
|
+
return conn_details
|
|
329
|
+
except Exception as e:
|
|
330
|
+
self.report.warning(
|
|
331
|
+
title="Failed to get connection details for Google Sheets Connector",
|
|
332
|
+
message=f"Exception occurred while getting connection details from Fivetran API. {e}",
|
|
333
|
+
context=f"connector_id: {connection_id}",
|
|
334
|
+
)
|
|
335
|
+
return None
|
|
336
|
+
|
|
337
|
+
def _get_gsheet_sheet_id_from_url(
|
|
338
|
+
self, gsheets_conn_details: FivetranConnectionDetails
|
|
339
|
+
) -> str:
|
|
340
|
+
# Extracting the sheet_id (1A82PdLAE7NXLLb5JcLPKeIpKUMytXQba5Z-Ei-mbXLo) from the sheet_id url
|
|
341
|
+
# "https://docs.google.com/spreadsheets/d/1A82PdLAE7NXLLb5JcLPKeIpKUMytXQba5Z-Ei-mbXLo/edit?gid=0#gid=0",
|
|
342
|
+
try:
|
|
343
|
+
parsed = urlparse(gsheets_conn_details.config.sheet_id)
|
|
344
|
+
# Example: https://docs.google.com/spreadsheets/d/<spreadsheetId>/edit
|
|
345
|
+
parts = parsed.path.split("/")
|
|
346
|
+
return parts[3] if len(parts) > 2 else ""
|
|
347
|
+
except Exception as e:
|
|
348
|
+
logger.warning(
|
|
349
|
+
f"Failed to extract sheet_id from the sheet_id url: {gsheets_conn_details.config.sheet_id}, {e}"
|
|
350
|
+
)
|
|
351
|
+
|
|
352
|
+
return ""
|
|
353
|
+
|
|
354
|
+
def _get_gsheet_named_range_dataset_id(
|
|
355
|
+
self, gsheets_conn_details: FivetranConnectionDetails
|
|
356
|
+
) -> str:
|
|
357
|
+
sheet_id = self._get_gsheet_sheet_id_from_url(gsheets_conn_details)
|
|
358
|
+
named_range_id = (
|
|
359
|
+
f"{sheet_id}.{gsheets_conn_details.config.named_range}"
|
|
360
|
+
if sheet_id
|
|
361
|
+
else gsheets_conn_details.config.named_range
|
|
362
|
+
)
|
|
363
|
+
logger.debug(
|
|
364
|
+
f"Using gsheet_named_range_dataset_id: {named_range_id} for connector: {gsheets_conn_details.id}"
|
|
365
|
+
)
|
|
366
|
+
return named_range_id
|
|
367
|
+
|
|
265
368
|
def _get_dpi_workunits(
|
|
266
369
|
self, job: Job, dpi: DataProcessInstance
|
|
267
370
|
) -> Iterable[MetadataWorkUnit]:
|
|
@@ -295,6 +398,74 @@ class FivetranSource(StatefulIngestionSourceBase):
|
|
|
295
398
|
self, connector: Connector
|
|
296
399
|
) -> Iterable[Union[MetadataWorkUnit, Entity]]:
|
|
297
400
|
self.report.report_connectors_scanned()
|
|
401
|
+
|
|
402
|
+
"""
|
|
403
|
+
-------------------------------------------------------
|
|
404
|
+
Special Handling for Google Sheets Connectors
|
|
405
|
+
-------------------------------------------------------
|
|
406
|
+
Google Sheets source is not supported by Datahub yet.
|
|
407
|
+
As a workaround, we are emitting a dataset entity for the Google Sheet
|
|
408
|
+
and adding it to the lineage. This workaround needs to be removed once
|
|
409
|
+
Datahub supports Google Sheets source natively.
|
|
410
|
+
-------------------------------------------------------
|
|
411
|
+
"""
|
|
412
|
+
if connector.connector_type == Constant.GOOGLE_SHEETS_CONNECTOR_TYPE:
|
|
413
|
+
# Get Google Sheet dataset details from Fivetran API
|
|
414
|
+
gsheets_conn_details: Optional[FivetranConnectionDetails] = (
|
|
415
|
+
self._get_connection_details_by_id(connector.connector_id)
|
|
416
|
+
)
|
|
417
|
+
|
|
418
|
+
if gsheets_conn_details:
|
|
419
|
+
gsheets_dataset = Dataset(
|
|
420
|
+
name=self._get_gsheet_sheet_id_from_url(gsheets_conn_details),
|
|
421
|
+
platform=Constant.GOOGLE_SHEETS_CONNECTOR_TYPE,
|
|
422
|
+
env=self.config.env,
|
|
423
|
+
display_name=self._get_gsheet_sheet_id_from_url(
|
|
424
|
+
gsheets_conn_details
|
|
425
|
+
),
|
|
426
|
+
external_url=gsheets_conn_details.config.sheet_id,
|
|
427
|
+
created=gsheets_conn_details.created_at,
|
|
428
|
+
last_modified=gsheets_conn_details.source_sync_details.last_synced,
|
|
429
|
+
subtype=DatasetSubTypes.GOOGLE_SHEETS,
|
|
430
|
+
custom_properties={
|
|
431
|
+
"ingested_by": "fivetran source",
|
|
432
|
+
"connector_id": gsheets_conn_details.id,
|
|
433
|
+
},
|
|
434
|
+
)
|
|
435
|
+
gsheets_named_range_dataset = Dataset(
|
|
436
|
+
name=self._get_gsheet_named_range_dataset_id(gsheets_conn_details),
|
|
437
|
+
platform=Constant.GOOGLE_SHEETS_CONNECTOR_TYPE,
|
|
438
|
+
env=self.config.env,
|
|
439
|
+
display_name=gsheets_conn_details.config.named_range,
|
|
440
|
+
external_url=gsheets_conn_details.config.sheet_id,
|
|
441
|
+
created=gsheets_conn_details.created_at,
|
|
442
|
+
last_modified=gsheets_conn_details.source_sync_details.last_synced,
|
|
443
|
+
subtype=DatasetSubTypes.GOOGLE_SHEETS_NAMED_RANGE,
|
|
444
|
+
custom_properties={
|
|
445
|
+
"ingested_by": "fivetran source",
|
|
446
|
+
"connector_id": gsheets_conn_details.id,
|
|
447
|
+
},
|
|
448
|
+
upstreams=UpstreamLineage(
|
|
449
|
+
upstreams=[
|
|
450
|
+
UpstreamClass(
|
|
451
|
+
dataset=str(gsheets_dataset.urn),
|
|
452
|
+
type=DatasetLineageTypeClass.VIEW,
|
|
453
|
+
auditStamp=AuditStamp(
|
|
454
|
+
time=int(
|
|
455
|
+
gsheets_conn_details.created_at.timestamp()
|
|
456
|
+
* 1000
|
|
457
|
+
),
|
|
458
|
+
actor=CORPUSER_DATAHUB,
|
|
459
|
+
),
|
|
460
|
+
)
|
|
461
|
+
],
|
|
462
|
+
fineGrainedLineages=None,
|
|
463
|
+
),
|
|
464
|
+
)
|
|
465
|
+
|
|
466
|
+
yield gsheets_dataset
|
|
467
|
+
yield gsheets_named_range_dataset
|
|
468
|
+
|
|
298
469
|
# Create dataflow entity with same name as connector name
|
|
299
470
|
dataflow = self._generate_dataflow_from_connector(connector)
|
|
300
471
|
yield dataflow
|
|
@@ -9,6 +9,7 @@ from sqlalchemy import create_engine
|
|
|
9
9
|
|
|
10
10
|
from datahub.configuration.common import AllowDenyPattern, ConfigurationError
|
|
11
11
|
from datahub.ingestion.source.fivetran.config import (
|
|
12
|
+
DISABLE_COL_LINEAGE_FOR_CONNECTOR_TYPES,
|
|
12
13
|
Constant,
|
|
13
14
|
FivetranLogConfig,
|
|
14
15
|
FivetranSourceReport,
|
|
@@ -112,7 +113,11 @@ class FivetranLogAPI:
|
|
|
112
113
|
"""
|
|
113
114
|
Returns dict of column lineage metadata with key as (<SOURCE_TABLE_ID>, <DESTINATION_TABLE_ID>)
|
|
114
115
|
"""
|
|
115
|
-
all_column_lineage = defaultdict(list)
|
|
116
|
+
all_column_lineage: Dict[Tuple[str, str], List] = defaultdict(list)
|
|
117
|
+
|
|
118
|
+
if not connector_ids:
|
|
119
|
+
return dict(all_column_lineage)
|
|
120
|
+
|
|
116
121
|
column_lineage_result = self._query(
|
|
117
122
|
self.fivetran_log_query.get_column_lineage_query(
|
|
118
123
|
connector_ids=connector_ids
|
|
@@ -130,7 +135,11 @@ class FivetranLogAPI:
|
|
|
130
135
|
"""
|
|
131
136
|
Returns dict of table lineage metadata with key as 'CONNECTOR_ID'
|
|
132
137
|
"""
|
|
133
|
-
connectors_table_lineage_metadata = defaultdict(list)
|
|
138
|
+
connectors_table_lineage_metadata: Dict[str, List] = defaultdict(list)
|
|
139
|
+
|
|
140
|
+
if not connector_ids:
|
|
141
|
+
return dict(connectors_table_lineage_metadata)
|
|
142
|
+
|
|
134
143
|
table_lineage_result = self._query(
|
|
135
144
|
self.fivetran_log_query.get_table_lineage_query(connector_ids=connector_ids)
|
|
136
145
|
)
|
|
@@ -246,9 +255,15 @@ class FivetranLogAPI:
|
|
|
246
255
|
return self._get_users().get(user_id)
|
|
247
256
|
|
|
248
257
|
def _fill_connectors_lineage(self, connectors: List[Connector]) -> None:
|
|
249
|
-
connector_ids
|
|
250
|
-
|
|
251
|
-
|
|
258
|
+
# Create 2 filtered connector_ids lists - one for table lineage and one for column lineage
|
|
259
|
+
tll_connector_ids: List[str] = []
|
|
260
|
+
cll_connector_ids: List[str] = []
|
|
261
|
+
for connector in connectors:
|
|
262
|
+
tll_connector_ids.append(connector.connector_id)
|
|
263
|
+
if connector.connector_type not in DISABLE_COL_LINEAGE_FOR_CONNECTOR_TYPES:
|
|
264
|
+
cll_connector_ids.append(connector.connector_id)
|
|
265
|
+
table_lineage_metadata = self._get_table_lineage_metadata(tll_connector_ids)
|
|
266
|
+
column_lineage_metadata = self._get_column_lineage_metadata(cll_connector_ids)
|
|
252
267
|
for connector in connectors:
|
|
253
268
|
connector.lineage = self._extract_connector_lineage(
|
|
254
269
|
table_lineage_result=table_lineage_metadata.get(connector.connector_id),
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
|
|
3
|
+
import requests
|
|
4
|
+
from requests.adapters import HTTPAdapter
|
|
5
|
+
from urllib3.util import Retry
|
|
6
|
+
|
|
7
|
+
from datahub.ingestion.source.fivetran.config import (
|
|
8
|
+
FivetranAPIConfig,
|
|
9
|
+
)
|
|
10
|
+
from datahub.ingestion.source.fivetran.response_models import FivetranConnectionDetails
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
# Retry configuration constants
|
|
15
|
+
RETRY_MAX_TIMES = 3
|
|
16
|
+
RETRY_STATUS_CODES = [429, 500, 502, 503, 504]
|
|
17
|
+
RETRY_BACKOFF_FACTOR = 1
|
|
18
|
+
RETRY_ALLOWED_METHODS = ["GET"]
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class FivetranAPIClient:
|
|
22
|
+
"""Client for interacting with the Fivetran REST API."""
|
|
23
|
+
|
|
24
|
+
def __init__(self, config: FivetranAPIConfig) -> None:
|
|
25
|
+
self.config = config
|
|
26
|
+
self._session = self._create_session()
|
|
27
|
+
|
|
28
|
+
def _create_session(self) -> requests.Session:
|
|
29
|
+
"""
|
|
30
|
+
Create a session with retry logic and basic authentication
|
|
31
|
+
"""
|
|
32
|
+
requests_session = requests.Session()
|
|
33
|
+
|
|
34
|
+
# Configure retry strategy for transient failures
|
|
35
|
+
retry_strategy = Retry(
|
|
36
|
+
total=RETRY_MAX_TIMES,
|
|
37
|
+
backoff_factor=RETRY_BACKOFF_FACTOR,
|
|
38
|
+
status_forcelist=RETRY_STATUS_CODES,
|
|
39
|
+
allowed_methods=RETRY_ALLOWED_METHODS,
|
|
40
|
+
raise_on_status=True,
|
|
41
|
+
)
|
|
42
|
+
|
|
43
|
+
adapter = HTTPAdapter(max_retries=retry_strategy)
|
|
44
|
+
requests_session.mount("http://", adapter)
|
|
45
|
+
requests_session.mount("https://", adapter)
|
|
46
|
+
|
|
47
|
+
# Set up basic authentication
|
|
48
|
+
requests_session.auth = (self.config.api_key, self.config.api_secret)
|
|
49
|
+
requests_session.headers.update(
|
|
50
|
+
{
|
|
51
|
+
"Content-Type": "application/json",
|
|
52
|
+
"Accept": "application/json",
|
|
53
|
+
}
|
|
54
|
+
)
|
|
55
|
+
return requests_session
|
|
56
|
+
|
|
57
|
+
def get_connection_details_by_id(
|
|
58
|
+
self, connection_id: str
|
|
59
|
+
) -> FivetranConnectionDetails:
|
|
60
|
+
"""Get details for a specific connection."""
|
|
61
|
+
connection_details = self._session.get(
|
|
62
|
+
f"{self.config.base_url}/v1/connections/{connection_id}",
|
|
63
|
+
timeout=self.config.request_timeout_sec,
|
|
64
|
+
)
|
|
65
|
+
return FivetranConnectionDetails(**connection_details.json().get("data", {}))
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
import datetime
|
|
2
|
+
from typing import Dict, List
|
|
3
|
+
|
|
4
|
+
from pydantic import BaseModel
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class FivetranConnectionWarnings(BaseModel):
|
|
8
|
+
code: str # Warning Code
|
|
9
|
+
message: str # Warning Message
|
|
10
|
+
details: Dict # Warning Details
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class FivetranConnectionStatus(BaseModel):
|
|
14
|
+
setup_state: str # Setup State
|
|
15
|
+
schema_status: str # Schema Status
|
|
16
|
+
sync_state: str # Sync State
|
|
17
|
+
update_state: str # Update State
|
|
18
|
+
is_historical_sync: bool # Is Historical Sync
|
|
19
|
+
warnings: List[FivetranConnectionWarnings] # Warnings
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class FivetranConnectionConfig(BaseModel):
|
|
23
|
+
# Note: Connection Config is different for different connectors
|
|
24
|
+
auth_type: str # Auth Type
|
|
25
|
+
sheet_id: str # Sheet ID - URL to the Google Sheet
|
|
26
|
+
named_range: str # Named Range
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class FivetranConnectionSourceSyncDetails(BaseModel):
|
|
30
|
+
last_synced: datetime.datetime # Last Synced
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class FivetranConnectionDetails(BaseModel):
|
|
34
|
+
"""
|
|
35
|
+
Note: This response class only captures fields that are relevant to the Google Sheets Connector
|
|
36
|
+
"""
|
|
37
|
+
|
|
38
|
+
id: str # Source ID
|
|
39
|
+
group_id: str # Destination ID
|
|
40
|
+
service: str # Connector Type
|
|
41
|
+
created_at: datetime.datetime
|
|
42
|
+
succeeded_at: datetime.datetime
|
|
43
|
+
paused: bool # Paused Status
|
|
44
|
+
sync_frequency: int # Sync Frequency (minutes)
|
|
45
|
+
status: FivetranConnectionStatus # Status
|
|
46
|
+
config: FivetranConnectionConfig # Connection Config
|
|
47
|
+
source_sync_details: FivetranConnectionSourceSyncDetails # Source Sync Details
|
|
48
|
+
|
|
49
|
+
"""
|
|
50
|
+
# Sample Response for Google Sheets Connector
|
|
51
|
+
{
|
|
52
|
+
"code": "Success",
|
|
53
|
+
"data": {
|
|
54
|
+
"id": "dialectical_remindful",
|
|
55
|
+
"group_id": "empties_classification",
|
|
56
|
+
"service": "google_sheets",
|
|
57
|
+
"service_version": 1,
|
|
58
|
+
"schema": "fivetran_google_sheets.fivetran_google_sheets",
|
|
59
|
+
"connected_by": "sewn_restrained",
|
|
60
|
+
"created_at": "2025-10-06T17:53:01.554289Z",
|
|
61
|
+
"succeeded_at": "2025-10-06T22:55:45.275000Z",
|
|
62
|
+
"failed_at": null,
|
|
63
|
+
"paused": true,
|
|
64
|
+
"pause_after_trial": false,
|
|
65
|
+
"sync_frequency": 360,
|
|
66
|
+
"data_delay_threshold": 0,
|
|
67
|
+
"data_delay_sensitivity": "NORMAL",
|
|
68
|
+
"private_link_id": null,
|
|
69
|
+
"networking_method": "Directly",
|
|
70
|
+
"proxy_agent_id": null,
|
|
71
|
+
"schedule_type": "auto",
|
|
72
|
+
"status": {
|
|
73
|
+
"setup_state": "connected",
|
|
74
|
+
"schema_status": "ready",
|
|
75
|
+
"sync_state": "paused",
|
|
76
|
+
"update_state": "on_schedule",
|
|
77
|
+
"is_historical_sync": false,
|
|
78
|
+
"tasks": [],
|
|
79
|
+
"warnings": [
|
|
80
|
+
{
|
|
81
|
+
"code": "snowflake_discontinuing_password_auth",
|
|
82
|
+
"message": "Snowflake is discontinuing username/password authentication",
|
|
83
|
+
"details": {}
|
|
84
|
+
}
|
|
85
|
+
]
|
|
86
|
+
},
|
|
87
|
+
"config": {
|
|
88
|
+
"auth_type": "ServiceAccount",
|
|
89
|
+
"sheet_id": "https://docs.google.com/spreadsheets/d/1A82PdLAE7NXLLb5JcLPKeIpKUMytXQba5Z-Ei-mbXLo/edit?gid=0#gid=0",
|
|
90
|
+
"named_range": "Fivetran_Test_Range"
|
|
91
|
+
},
|
|
92
|
+
"source_sync_details": {
|
|
93
|
+
"last_synced": "2025-10-06T22:55:27.371Z"
|
|
94
|
+
}
|
|
95
|
+
}
|
|
96
|
+
}
|
|
97
|
+
"""
|
|
@@ -178,7 +178,7 @@ class HexReport(
|
|
|
178
178
|
|
|
179
179
|
@platform_name("Hex")
|
|
180
180
|
@config_class(HexSourceConfig)
|
|
181
|
-
@support_status(SupportStatus.
|
|
181
|
+
@support_status(SupportStatus.INCUBATING)
|
|
182
182
|
@capability(SourceCapability.DESCRIPTIONS, "Supported by default")
|
|
183
183
|
@capability(SourceCapability.OWNERSHIP, "Supported by default")
|
|
184
184
|
@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
|
|
@@ -118,7 +118,7 @@ logging.getLogger("azure.core.pipeline.policies.http_logging_policy").setLevel(
|
|
|
118
118
|
|
|
119
119
|
|
|
120
120
|
@platform_name("Iceberg")
|
|
121
|
-
@support_status(SupportStatus.
|
|
121
|
+
@support_status(SupportStatus.INCUBATING)
|
|
122
122
|
@config_class(IcebergSourceConfig)
|
|
123
123
|
@capability(
|
|
124
124
|
SourceCapability.PLATFORM_INSTANCE,
|