acryl-datahub 1.3.0.1rc5__py3-none-any.whl → 1.3.0.1rc7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (47)
  1. {acryl_datahub-1.3.0.1rc5.dist-info → acryl_datahub-1.3.0.1rc7.dist-info}/METADATA +2332 -2333
  2. {acryl_datahub-1.3.0.1rc5.dist-info → acryl_datahub-1.3.0.1rc7.dist-info}/RECORD +47 -42
  3. datahub/_version.py +1 -1
  4. datahub/cli/docker_check.py +1 -1
  5. datahub/emitter/mce_builder.py +6 -0
  6. datahub/ingestion/autogenerated/capability_summary.json +12 -12
  7. datahub/ingestion/source/bigquery_v2/bigquery.py +17 -1
  8. datahub/ingestion/source/bigquery_v2/bigquery_config.py +16 -0
  9. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +2 -0
  10. datahub/ingestion/source/bigquery_v2/queries_extractor.py +41 -4
  11. datahub/ingestion/source/common/subtypes.py +2 -0
  12. datahub/ingestion/source/dremio/dremio_source.py +15 -15
  13. datahub/ingestion/source/dynamodb/dynamodb.py +1 -1
  14. datahub/ingestion/source/fivetran/config.py +33 -0
  15. datahub/ingestion/source/fivetran/fivetran.py +184 -13
  16. datahub/ingestion/source/fivetran/fivetran_log_api.py +20 -5
  17. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  18. datahub/ingestion/source/fivetran/response_models.py +97 -0
  19. datahub/ingestion/source/hex/hex.py +1 -1
  20. datahub/ingestion/source/iceberg/iceberg.py +1 -1
  21. datahub/ingestion/source/metabase.py +23 -4
  22. datahub/ingestion/source/mlflow.py +1 -1
  23. datahub/ingestion/source/s3/source.py +1 -1
  24. datahub/ingestion/source/salesforce.py +1 -1
  25. datahub/ingestion/source/slack/slack.py +1 -1
  26. datahub/ingestion/source/snowflake/snowflake_config.py +16 -0
  27. datahub/ingestion/source/snowflake/snowflake_queries.py +49 -6
  28. datahub/ingestion/source/snowflake/snowflake_summary.py +1 -1
  29. datahub/ingestion/source/snowflake/snowflake_v2.py +14 -1
  30. datahub/ingestion/source/sql_queries.py +1 -1
  31. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  32. datahub/ingestion/source/state/stateful_ingestion_base.py +30 -2
  33. datahub/ingestion/source/unity/source.py +1 -1
  34. datahub/ingestion/source/vertexai/vertexai.py +1 -1
  35. datahub/metadata/_internal_schema_classes.py +223 -0
  36. datahub/metadata/_urns/urn_defs.py +56 -0
  37. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  38. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
  39. datahub/metadata/schema.avsc +208 -0
  40. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  41. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  42. datahub/sdk/mlmodel.py +19 -0
  43. datahub/sql_parsing/sql_parsing_aggregator.py +18 -4
  44. {acryl_datahub-1.3.0.1rc5.dist-info → acryl_datahub-1.3.0.1rc7.dist-info}/WHEEL +0 -0
  45. {acryl_datahub-1.3.0.1rc5.dist-info → acryl_datahub-1.3.0.1rc7.dist-info}/entry_points.txt +0 -0
  46. {acryl_datahub-1.3.0.1rc5.dist-info → acryl_datahub-1.3.0.1rc7.dist-info}/licenses/LICENSE +0 -0
  47. {acryl_datahub-1.3.0.1rc5.dist-info → acryl_datahub-1.3.0.1rc7.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/common/subtypes.py
@@ -34,6 +34,8 @@ class DatasetSubTypes(StrEnum):
     API_ENDPOINT = "API Endpoint"
     SLACK_CHANNEL = "Slack Channel"
     PROJECTIONS = "Projections"
+    GOOGLE_SHEETS = "Google Sheets"
+    GOOGLE_SHEETS_NAMED_RANGE = "Google Sheets Named Range"
 
     # TODO: Create separate entity...
     NOTEBOOK = "Notebook"
datahub/ingestion/source/dremio/dremio_source.py
@@ -338,10 +338,10 @@ class DremioSource(StatefulIngestionSourceBase):
             return
 
         dataset_urn = make_dataset_urn_with_platform_instance(
-            platform=make_data_platform_urn(self.get_platform()),
-            name=f"dremio.{dataset_name}",
-            env=self.config.env,
+            platform=self.get_platform(),
+            name=dataset_name,
             platform_instance=self.config.platform_instance,
+            env=self.config.env,
         )
 
         for dremio_mcp in self.dremio_aspects.populate_dataset_mcp(
@@ -421,10 +421,10 @@ class DremioSource(StatefulIngestionSourceBase):
         schema_str = ".".join(dataset_info.path)
         dataset_name = f"{schema_str}.{dataset_info.resource_name}".lower()
         dataset_urn = make_dataset_urn_with_platform_instance(
-            platform=make_data_platform_urn(self.get_platform()),
-            name=f"dremio.{dataset_name}",
-            env=self.config.env,
+            platform=self.get_platform(),
+            name=dataset_name,
             platform_instance=self.config.platform_instance,
+            env=self.config.env,
         )
         yield from self.profiler.get_workunits(dataset_info, dataset_urn)
 
@@ -436,10 +436,10 @@ class DremioSource(StatefulIngestionSourceBase):
         """
         upstream_urns = [
             make_dataset_urn_with_platform_instance(
-                platform=make_data_platform_urn(self.get_platform()),
-                name=f"dremio.{upstream_table.lower()}",
-                env=self.config.env,
+                platform=self.get_platform(),
+                name=upstream_table.lower(),
                 platform_instance=self.config.platform_instance,
+                env=self.config.env,
             )
             for upstream_table in parents
         ]
@@ -498,19 +498,19 @@ class DremioSource(StatefulIngestionSourceBase):
         if query.query and query.affected_dataset:
             upstream_urns = [
                 make_dataset_urn_with_platform_instance(
-                    platform=make_data_platform_urn(self.get_platform()),
-                    name=f"dremio.{ds.lower()}",
-                    env=self.config.env,
+                    platform=self.get_platform(),
+                    name=ds.lower(),
                     platform_instance=self.config.platform_instance,
+                    env=self.config.env,
                 )
                 for ds in query.queried_datasets
             ]
 
             downstream_urn = make_dataset_urn_with_platform_instance(
-                platform=make_data_platform_urn(self.get_platform()),
-                name=f"dremio.{query.affected_dataset.lower()}",
-                env=self.config.env,
+                platform=self.get_platform(),
+                name=query.affected_dataset.lower(),
                 platform_instance=self.config.platform_instance,
+                env=self.config.env,
            )
 
             # Add query to SqlParsingAggregator
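All four Dremio hunks make the same change: the platform argument is no longer pre-wrapped with make_data_platform_urn (the helper normalizes it anyway), the hard-coded "dremio." prefix is dropped from the dataset name, and env is simply reordered after platform_instance. Since the name feeds the dataset URN, the URNs emitted by the Dremio source change shape. A minimal sketch of the before/after URNs, using a made-up dataset path rather than anything from this diff:

from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance

# rc5 behavior: the name carried a hard-coded "dremio." prefix
old_urn = make_dataset_urn_with_platform_instance(
    platform="dremio",
    name="dremio.myspace.myfolder.my_view",  # hypothetical dataset path
    platform_instance=None,
    env="PROD",
)
# -> urn:li:dataset:(urn:li:dataPlatform:dremio,dremio.myspace.myfolder.my_view,PROD)

# rc7 behavior: the prefix is gone, so previously emitted URNs no longer match
new_urn = make_dataset_urn_with_platform_instance(
    platform="dremio",
    name="myspace.myfolder.my_view",
    platform_instance=None,
    env="PROD",
)
# -> urn:li:dataset:(urn:li:dataPlatform:dremio,myspace.myfolder.my_view,PROD)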
datahub/ingestion/source/dynamodb/dynamodb.py
@@ -163,7 +163,7 @@ _attribute_type_to_field_type_mapping: Dict[str, Type] = {
 
 @platform_name("DynamoDB", id="dynamodb")
 @config_class(DynamoDBConfig)
-@support_status(SupportStatus.TESTING)
+@support_status(SupportStatus.INCUBATING)
 @capability(
     SourceCapability.PLATFORM_INSTANCE,
     "By default, platform_instance will use the AWS account id",
datahub/ingestion/source/fivetran/config.py
@@ -68,14 +68,22 @@ class Constant:
     SUCCESSFUL = "SUCCESSFUL"
     FAILURE_WITH_TASK = "FAILURE_WITH_TASK"
     CANCELED = "CANCELED"
+    GOOGLE_SHEETS_CONNECTOR_TYPE = "google_sheets"
 
 
+# Key: Connector Type, Value: Platform ID/Name
 KNOWN_DATA_PLATFORM_MAPPING = {
     "google_cloud_postgresql": "postgres",
     "postgres": "postgres",
     "snowflake": "snowflake",
+    Constant.GOOGLE_SHEETS_CONNECTOR_TYPE: Constant.GOOGLE_SHEETS_CONNECTOR_TYPE,
 }
 
+# Note: As of Oct 2025, the Fivetran Platform Connector has stale lineage metadata
+# for Google Sheets column data (deleted/renamed columns).
+# Ref: https://fivetran.com/docs/connectors/files/google-sheets#deletingdata
+# TODO: Remove the Google Sheets connector type from DISABLE_COL_LINEAGE_FOR_CONNECTOR_TYPES
+DISABLE_COL_LINEAGE_FOR_CONNECTOR_TYPES = [Constant.GOOGLE_SHEETS_CONNECTOR_TYPE]
+
 
 class SnowflakeDestinationConfig(SnowflakeConnectionConfig):
     database: str = Field(description="The fivetran connector log database.")
@@ -97,6 +105,17 @@ class DatabricksDestinationConfig(UnityCatalogConnectionConfig):
         return warehouse_id
 
 
+class FivetranAPIConfig(ConfigModel):
+    api_key: str = Field(description="Fivetran API key")
+    api_secret: str = Field(description="Fivetran API secret")
+    base_url: str = Field(
+        default="https://api.fivetran.com", description="Fivetran API base URL"
+    )
+    request_timeout_sec: int = Field(
+        default=30, description="Request timeout in seconds"
+    )
+
+
 class FivetranLogConfig(ConfigModel):
     destination_platform: Literal["snowflake", "bigquery", "databricks"] = (
         pydantic.Field(
@@ -163,6 +182,7 @@ class MetadataExtractionPerfReport(Report):
 @dataclasses.dataclass
 class FivetranSourceReport(StaleEntityRemovalSourceReport):
     connectors_scanned: int = 0
+    fivetran_rest_api_call_count: int = 0
     filtered_connectors: LossyList[str] = dataclasses.field(default_factory=LossyList)
     metadata_extraction_perf: MetadataExtractionPerfReport = dataclasses.field(
         default_factory=MetadataExtractionPerfReport
@@ -174,6 +194,9 @@ class FivetranSourceReport(StaleEntityRemovalSourceReport):
     def report_connectors_dropped(self, connector: str) -> None:
         self.filtered_connectors.append(connector)
 
+    def report_fivetran_rest_api_call_count(self) -> None:
+        self.fivetran_rest_api_call_count += 1
+
 
 class PlatformDetail(ConfigModel):
     platform: Optional[str] = pydantic.Field(
@@ -234,6 +257,16 @@ class FivetranSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin
         description="A mapping of destination id to its platform/instance/env details.",
     )
 
+    """
+    Use the Fivetran REST API to:
+    - Get Google Sheets connector details and emit the related entities.
+    The Fivetran Platform Connector syncs only limited information about the Google Sheets connector.
+    """
+    api_config: Optional[FivetranAPIConfig] = Field(
+        default=None,
+        description="Fivetran REST API configuration, used to provide wider support for connections.",
+    )
+
     @pydantic.root_validator(pre=True)
     def compat_sources_to_database(cls, values: Dict) -> Dict:
         if "sources_to_database" in values:
datahub/ingestion/source/fivetran/fivetran.py
@@ -1,5 +1,6 @@
 import logging
 from typing import Dict, Iterable, List, Optional, Union
+from urllib.parse import urlparse
 
 import datahub.emitter.mce_builder as builder
 from datahub.api.entities.datajob import DataJob as DataJobV1
@@ -22,6 +23,7 @@ from datahub.ingestion.api.source import (
     StructuredLogCategory,
 )
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.common.subtypes import DatasetSubTypes
 from datahub.ingestion.source.fivetran.config import (
     KNOWN_DATA_PLATFORM_MAPPING,
     Constant,
@@ -35,29 +37,39 @@ from datahub.ingestion.source.fivetran.fivetran_query import (
     MAX_JOBS_PER_CONNECTOR,
     MAX_TABLE_LINEAGE_PER_CONNECTOR,
 )
+from datahub.ingestion.source.fivetran.fivetran_rest_api import FivetranAPIClient
+from datahub.ingestion.source.fivetran.response_models import FivetranConnectionDetails
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
 )
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionSourceBase,
 )
+from datahub.metadata.com.linkedin.pegasus2avro.common import AuditStamp
 from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
     FineGrainedLineage,
     FineGrainedLineageDownstreamType,
     FineGrainedLineageUpstreamType,
+    UpstreamLineage,
+)
+from datahub.metadata.schema_classes import (
+    DatasetLineageTypeClass,
+    UpstreamClass,
 )
 from datahub.metadata.urns import CorpUserUrn, DataFlowUrn, DatasetUrn
 from datahub.sdk.dataflow import DataFlow
 from datahub.sdk.datajob import DataJob
+from datahub.sdk.dataset import Dataset
 from datahub.sdk.entity import Entity
 
 # Logger instance
 logger = logging.getLogger(__name__)
+CORPUSER_DATAHUB = "urn:li:corpuser:datahub"
 
 
 @platform_name("Fivetran")
 @config_class(FivetranSourceConfig)
-@support_status(SupportStatus.INCUBATING)
+@support_status(SupportStatus.CERTIFIED)
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
 @capability(
     SourceCapability.LINEAGE_FINE,
@@ -76,8 +88,12 @@ class FivetranSource(StatefulIngestionSourceBase):
         super().__init__(config, ctx)
         self.config = config
         self.report = FivetranSourceReport()
-
         self.audit_log = FivetranLogAPI(self.config.fivetran_log_config)
+        self.api_client: Optional[FivetranAPIClient] = None
+        self._connection_details_cache: Dict[str, FivetranConnectionDetails] = {}
+
+        if self.config.api_config:
+            self.api_client = FivetranAPIClient(self.config.api_config)
 
     def _extend_lineage(self, connector: Connector, datajob: DataJob) -> Dict[str, str]:
         input_dataset_urn_list: List[Union[str, DatasetUrn]] = []
@@ -131,17 +147,43 @@
                 if source_details.include_schema_in_urn
                 else lineage.source_table.split(".", 1)[1]
             )
-            input_dataset_urn = DatasetUrn.create_from_ids(
-                platform_id=source_details.platform,
-                table_name=(
-                    f"{source_details.database.lower()}.{source_table}"
-                    if source_details.database
-                    else source_table
-                ),
-                env=source_details.env,
-                platform_instance=source_details.platform_instance,
-            )
-            input_dataset_urn_list.append(input_dataset_urn)
+            input_dataset_urn: Optional[DatasetUrn] = None
+            # Special handling for Google Sheets connectors
+            if connector.connector_type == Constant.GOOGLE_SHEETS_CONNECTOR_TYPE:
+                # Get the Google Sheet dataset details from the Fivetran API.
+                # The result is cached in self._connection_details_cache.
+                gsheets_conn_details: Optional[FivetranConnectionDetails] = (
+                    self._get_connection_details_by_id(connector.connector_id)
+                )
+
+                if gsheets_conn_details:
+                    input_dataset_urn = DatasetUrn.create_from_ids(
+                        platform_id=Constant.GOOGLE_SHEETS_CONNECTOR_TYPE,
+                        table_name=self._get_gsheet_named_range_dataset_id(
+                            gsheets_conn_details
+                        ),
+                        env=source_details.env,
+                    )
+                else:
+                    self.report.warning(
+                        title="Failed to extract lineage for Google Sheets Connector",
+                        message="Unable to extract lineage for the Google Sheets connector, as the connector details are not available from the Fivetran API.",
+                        context=f"{connector.connector_name} (connector_id: {connector.connector_id})",
+                    )
+            else:
+                input_dataset_urn = DatasetUrn.create_from_ids(
+                    platform_id=source_details.platform,
+                    table_name=(
+                        f"{source_details.database.lower()}.{source_table}"
+                        if source_details.database
+                        else source_table
+                    ),
+                    env=source_details.env,
+                    platform_instance=source_details.platform_instance,
+                )
+
+            if input_dataset_urn:
+                input_dataset_urn_list.append(input_dataset_urn)
 
             destination_table = (
                 lineage.destination_table
@@ -262,6 +304,67 @@
             clone_outlets=True,
         )
 
+    def _get_connection_details_by_id(
+        self, connection_id: str
+    ) -> Optional[FivetranConnectionDetails]:
+        if self.api_client is None:
+            self.report.warning(
+                title="Fivetran API client is not initialized",
+                message="Google Sheets connector details cannot be extracted, as the Fivetran API client is not initialized.",
+                context=f"connector_id: {connection_id}",
+            )
+            return None
+
+        if connection_id in self._connection_details_cache:
+            return self._connection_details_cache[connection_id]
+
+        try:
+            self.report.report_fivetran_rest_api_call_count()
+            conn_details = self.api_client.get_connection_details_by_id(connection_id)
+            # Update the cache
+            if conn_details:
+                self._connection_details_cache[connection_id] = conn_details
+
+            return conn_details
+        except Exception as e:
+            self.report.warning(
+                title="Failed to get connection details for Google Sheets Connector",
+                message=f"Exception occurred while getting connection details from the Fivetran API. {e}",
+                context=f"connector_id: {connection_id}",
+            )
+            return None
+
+    def _get_gsheet_sheet_id_from_url(
+        self, gsheets_conn_details: FivetranConnectionDetails
+    ) -> str:
+        # Extract the sheet id (e.g. 1A82PdLAE7NXLLb5JcLPKeIpKUMytXQba5Z-Ei-mbXLo) from the sheet_id URL, e.g.
+        # "https://docs.google.com/spreadsheets/d/1A82PdLAE7NXLLb5JcLPKeIpKUMytXQba5Z-Ei-mbXLo/edit?gid=0#gid=0"
+        try:
+            parsed = urlparse(gsheets_conn_details.config.sheet_id)
+            # Example path: /spreadsheets/d/<spreadsheetId>/edit
+            parts = parsed.path.split("/")
+            return parts[3] if len(parts) > 3 else ""
+        except Exception as e:
+            logger.warning(
+                f"Failed to extract sheet_id from the sheet_id url: {gsheets_conn_details.config.sheet_id}, {e}"
+            )
+
+        return ""
+
+    def _get_gsheet_named_range_dataset_id(
+        self, gsheets_conn_details: FivetranConnectionDetails
+    ) -> str:
+        sheet_id = self._get_gsheet_sheet_id_from_url(gsheets_conn_details)
+        named_range_id = (
+            f"{sheet_id}.{gsheets_conn_details.config.named_range}"
+            if sheet_id
+            else gsheets_conn_details.config.named_range
+        )
+        logger.debug(
+            f"Using gsheet_named_range_dataset_id: {named_range_id} for connector: {gsheets_conn_details.id}"
+        )
+        return named_range_id
+
     def _get_dpi_workunits(
         self, job: Job, dpi: DataProcessInstance
     ) -> Iterable[MetadataWorkUnit]:
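To make the helper logic concrete, here is the same extraction run standalone on the sample URL from the response_models.py docstring further down; urlparse yields a path of /spreadsheets/d/<spreadsheetId>/edit, so the id sits at index 3:

from urllib.parse import urlparse

url = "https://docs.google.com/spreadsheets/d/1A82PdLAE7NXLLb5JcLPKeIpKUMytXQba5Z-Ei-mbXLo/edit?gid=0#gid=0"
parts = urlparse(url).path.split("/")
# parts == ['', 'spreadsheets', 'd', '<spreadsheetId>', 'edit']
sheet_id = parts[3] if len(parts) > 3 else ""
print(sheet_id)  # 1A82PdLAE7NXLLb5JcLPKeIpKUMytXQba5Z-Ei-mbXLo

# The named-range dataset id is then "<sheet_id>.<named_range>":
named_range_id = f"{sheet_id}.Fivetran_Test_Range"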
@@ -295,6 +398,74 @@
         self, connector: Connector
     ) -> Iterable[Union[MetadataWorkUnit, Entity]]:
         self.report.report_connectors_scanned()
+
+        """
+        -------------------------------------------------------
+        Special handling for Google Sheets connectors
+        -------------------------------------------------------
+        The Google Sheets source is not supported by DataHub yet.
+        As a workaround, we emit a dataset entity for the Google Sheet
+        and add it to the lineage. This workaround should be removed once
+        DataHub supports a Google Sheets source natively.
+        -------------------------------------------------------
+        """
+        if connector.connector_type == Constant.GOOGLE_SHEETS_CONNECTOR_TYPE:
+            # Get the Google Sheet dataset details from the Fivetran API
+            gsheets_conn_details: Optional[FivetranConnectionDetails] = (
+                self._get_connection_details_by_id(connector.connector_id)
+            )
+
+            if gsheets_conn_details:
+                gsheets_dataset = Dataset(
+                    name=self._get_gsheet_sheet_id_from_url(gsheets_conn_details),
+                    platform=Constant.GOOGLE_SHEETS_CONNECTOR_TYPE,
+                    env=self.config.env,
+                    display_name=self._get_gsheet_sheet_id_from_url(
+                        gsheets_conn_details
+                    ),
+                    external_url=gsheets_conn_details.config.sheet_id,
+                    created=gsheets_conn_details.created_at,
+                    last_modified=gsheets_conn_details.source_sync_details.last_synced,
+                    subtype=DatasetSubTypes.GOOGLE_SHEETS,
+                    custom_properties={
+                        "ingested_by": "fivetran source",
+                        "connector_id": gsheets_conn_details.id,
+                    },
+                )
+                gsheets_named_range_dataset = Dataset(
+                    name=self._get_gsheet_named_range_dataset_id(gsheets_conn_details),
+                    platform=Constant.GOOGLE_SHEETS_CONNECTOR_TYPE,
+                    env=self.config.env,
+                    display_name=gsheets_conn_details.config.named_range,
+                    external_url=gsheets_conn_details.config.sheet_id,
+                    created=gsheets_conn_details.created_at,
+                    last_modified=gsheets_conn_details.source_sync_details.last_synced,
+                    subtype=DatasetSubTypes.GOOGLE_SHEETS_NAMED_RANGE,
+                    custom_properties={
+                        "ingested_by": "fivetran source",
+                        "connector_id": gsheets_conn_details.id,
+                    },
+                    upstreams=UpstreamLineage(
+                        upstreams=[
+                            UpstreamClass(
+                                dataset=str(gsheets_dataset.urn),
+                                type=DatasetLineageTypeClass.VIEW,
+                                auditStamp=AuditStamp(
+                                    time=int(
+                                        gsheets_conn_details.created_at.timestamp()
+                                        * 1000
+                                    ),
+                                    actor=CORPUSER_DATAHUB,
+                                ),
+                            )
+                        ],
+                        fineGrainedLineages=None,
+                    ),
+                )
+
+                yield gsheets_dataset
+                yield gsheets_named_range_dataset
+
         # Create dataflow entity with same name as connector name
         dataflow = self._generate_dataflow_from_connector(connector)
         yield dataflow
datahub/ingestion/source/fivetran/fivetran_log_api.py
@@ -9,6 +9,7 @@ from sqlalchemy import create_engine
 
 from datahub.configuration.common import AllowDenyPattern, ConfigurationError
 from datahub.ingestion.source.fivetran.config import (
+    DISABLE_COL_LINEAGE_FOR_CONNECTOR_TYPES,
     Constant,
     FivetranLogConfig,
     FivetranSourceReport,
@@ -112,7 +113,11 @@ class FivetranLogAPI:
         """
         Returns dict of column lineage metadata with key as (<SOURCE_TABLE_ID>, <DESTINATION_TABLE_ID>)
         """
-        all_column_lineage = defaultdict(list)
+        all_column_lineage: Dict[Tuple[str, str], List] = defaultdict(list)
+
+        if not connector_ids:
+            return dict(all_column_lineage)
+
         column_lineage_result = self._query(
             self.fivetran_log_query.get_column_lineage_query(
                 connector_ids=connector_ids
@@ -130,7 +135,11 @@
         """
         Returns dict of table lineage metadata with key as 'CONNECTOR_ID'
         """
-        connectors_table_lineage_metadata = defaultdict(list)
+        connectors_table_lineage_metadata: Dict[str, List] = defaultdict(list)
+
+        if not connector_ids:
+            return dict(connectors_table_lineage_metadata)
+
         table_lineage_result = self._query(
             self.fivetran_log_query.get_table_lineage_query(connector_ids=connector_ids)
         )
@@ -246,9 +255,15 @@
         return self._get_users().get(user_id)
 
     def _fill_connectors_lineage(self, connectors: List[Connector]) -> None:
-        connector_ids = [connector.connector_id for connector in connectors]
-        table_lineage_metadata = self._get_table_lineage_metadata(connector_ids)
-        column_lineage_metadata = self._get_column_lineage_metadata(connector_ids)
+        # Create two filtered connector_ids lists: one for table lineage and one for column lineage
+        tll_connector_ids: List[str] = []
+        cll_connector_ids: List[str] = []
+        for connector in connectors:
+            tll_connector_ids.append(connector.connector_id)
+            if connector.connector_type not in DISABLE_COL_LINEAGE_FOR_CONNECTOR_TYPES:
+                cll_connector_ids.append(connector.connector_id)
+        table_lineage_metadata = self._get_table_lineage_metadata(tll_connector_ids)
+        column_lineage_metadata = self._get_column_lineage_metadata(cll_connector_ids)
         for connector in connectors:
             connector.lineage = self._extract_connector_lineage(
                 table_lineage_result=table_lineage_metadata.get(connector.connector_id),
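The net effect: connectors whose type appears in DISABLE_COL_LINEAGE_FOR_CONNECTOR_TYPES (currently only google_sheets) still get table-level lineage but are skipped in the column-lineage query. A self-contained sketch of that filtering rule, with plain tuples standing in for the package's Connector objects:

DISABLE_COL_LINEAGE_FOR_CONNECTOR_TYPES = ["google_sheets"]

connectors = [("connector_1", "postgres"), ("connector_2", "google_sheets")]
tll_connector_ids = [cid for cid, _ in connectors]  # everyone gets table lineage
cll_connector_ids = [
    cid
    for cid, ctype in connectors
    if ctype not in DISABLE_COL_LINEAGE_FOR_CONNECTOR_TYPES
]
assert tll_connector_ids == ["connector_1", "connector_2"]
assert cll_connector_ids == ["connector_1"]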
datahub/ingestion/source/fivetran/fivetran_rest_api.py (new file)
@@ -0,0 +1,65 @@
+import logging
+
+import requests
+from requests.adapters import HTTPAdapter
+from urllib3.util import Retry
+
+from datahub.ingestion.source.fivetran.config import (
+    FivetranAPIConfig,
+)
+from datahub.ingestion.source.fivetran.response_models import FivetranConnectionDetails
+
+logger = logging.getLogger(__name__)
+
+# Retry configuration constants
+RETRY_MAX_TIMES = 3
+RETRY_STATUS_CODES = [429, 500, 502, 503, 504]
+RETRY_BACKOFF_FACTOR = 1
+RETRY_ALLOWED_METHODS = ["GET"]
+
+
+class FivetranAPIClient:
+    """Client for interacting with the Fivetran REST API."""
+
+    def __init__(self, config: FivetranAPIConfig) -> None:
+        self.config = config
+        self._session = self._create_session()
+
+    def _create_session(self) -> requests.Session:
+        """
+        Create a session with retry logic and basic authentication.
+        """
+        requests_session = requests.Session()
+
+        # Configure retry strategy for transient failures
+        retry_strategy = Retry(
+            total=RETRY_MAX_TIMES,
+            backoff_factor=RETRY_BACKOFF_FACTOR,
+            status_forcelist=RETRY_STATUS_CODES,
+            allowed_methods=RETRY_ALLOWED_METHODS,
+            raise_on_status=True,
+        )
+
+        adapter = HTTPAdapter(max_retries=retry_strategy)
+        requests_session.mount("http://", adapter)
+        requests_session.mount("https://", adapter)
+
+        # Set up basic authentication
+        requests_session.auth = (self.config.api_key, self.config.api_secret)
+        requests_session.headers.update(
+            {
+                "Content-Type": "application/json",
+                "Accept": "application/json",
+            }
+        )
+        return requests_session
+
+    def get_connection_details_by_id(
+        self, connection_id: str
+    ) -> FivetranConnectionDetails:
+        """Get details for a specific connection."""
+        connection_details = self._session.get(
+            f"{self.config.base_url}/v1/connections/{connection_id}",
+            timeout=self.config.request_timeout_sec,
+        )
+        return FivetranConnectionDetails(**connection_details.json().get("data", {}))
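A minimal usage sketch of the new client (placeholder credentials): the call issues a GET to {base_url}/v1/connections/{connection_id} with basic auth and the retry policy above, then parses the "data" payload into the pydantic model defined in response_models.py:

from datahub.ingestion.source.fivetran.config import FivetranAPIConfig
from datahub.ingestion.source.fivetran.fivetran_rest_api import FivetranAPIClient

client = FivetranAPIClient(
    FivetranAPIConfig(api_key="<key>", api_secret="<secret>")  # placeholders
)
# Connection id taken from the sample response in response_models.py
details = client.get_connection_details_by_id("dialectical_remindful")
print(details.service, details.config.named_range)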
datahub/ingestion/source/fivetran/response_models.py (new file)
@@ -0,0 +1,97 @@
+import datetime
+from typing import Dict, List
+
+from pydantic import BaseModel
+
+
+class FivetranConnectionWarnings(BaseModel):
+    code: str  # Warning code
+    message: str  # Warning message
+    details: Dict  # Warning details
+
+
+class FivetranConnectionStatus(BaseModel):
+    setup_state: str  # Setup state
+    schema_status: str  # Schema status
+    sync_state: str  # Sync state
+    update_state: str  # Update state
+    is_historical_sync: bool  # Is historical sync
+    warnings: List[FivetranConnectionWarnings]  # Warnings
+
+
+class FivetranConnectionConfig(BaseModel):
+    # Note: the connection config differs across connectors
+    auth_type: str  # Auth type
+    sheet_id: str  # Sheet ID - URL to the Google Sheet
+    named_range: str  # Named range
+
+
+class FivetranConnectionSourceSyncDetails(BaseModel):
+    last_synced: datetime.datetime  # Last synced
+
+
+class FivetranConnectionDetails(BaseModel):
+    """
+    Note: This response class only captures fields that are relevant to the Google Sheets connector.
+    """
+
+    id: str  # Source ID
+    group_id: str  # Destination ID
+    service: str  # Connector type
+    created_at: datetime.datetime
+    succeeded_at: datetime.datetime
+    paused: bool  # Paused status
+    sync_frequency: int  # Sync frequency (minutes)
+    status: FivetranConnectionStatus  # Status
+    config: FivetranConnectionConfig  # Connection config
+    source_sync_details: FivetranConnectionSourceSyncDetails  # Source sync details
+
+    """
+    # Sample response for a Google Sheets connector
+    {
+        "code": "Success",
+        "data": {
+            "id": "dialectical_remindful",
+            "group_id": "empties_classification",
+            "service": "google_sheets",
+            "service_version": 1,
+            "schema": "fivetran_google_sheets.fivetran_google_sheets",
+            "connected_by": "sewn_restrained",
+            "created_at": "2025-10-06T17:53:01.554289Z",
+            "succeeded_at": "2025-10-06T22:55:45.275000Z",
+            "failed_at": null,
+            "paused": true,
+            "pause_after_trial": false,
+            "sync_frequency": 360,
+            "data_delay_threshold": 0,
+            "data_delay_sensitivity": "NORMAL",
+            "private_link_id": null,
+            "networking_method": "Directly",
+            "proxy_agent_id": null,
+            "schedule_type": "auto",
+            "status": {
+                "setup_state": "connected",
+                "schema_status": "ready",
+                "sync_state": "paused",
+                "update_state": "on_schedule",
+                "is_historical_sync": false,
+                "tasks": [],
+                "warnings": [
+                    {
+                        "code": "snowflake_discontinuing_password_auth",
+                        "message": "Snowflake is discontinuing username/password authentication",
+                        "details": {}
+                    }
+                ]
+            },
+            "config": {
+                "auth_type": "ServiceAccount",
+                "sheet_id": "https://docs.google.com/spreadsheets/d/1A82PdLAE7NXLLb5JcLPKeIpKUMytXQba5Z-Ei-mbXLo/edit?gid=0#gid=0",
+                "named_range": "Fivetran_Test_Range"
+            },
+            "source_sync_details": {
+                "last_synced": "2025-10-06T22:55:27.371Z"
+            }
+        }
+    }
+    """
datahub/ingestion/source/hex/hex.py
@@ -178,7 +178,7 @@ class HexReport(
 
 @platform_name("Hex")
 @config_class(HexSourceConfig)
-@support_status(SupportStatus.TESTING)
+@support_status(SupportStatus.INCUBATING)
 @capability(SourceCapability.DESCRIPTIONS, "Supported by default")
 @capability(SourceCapability.OWNERSHIP, "Supported by default")
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
datahub/ingestion/source/iceberg/iceberg.py
@@ -118,7 +118,7 @@ logging.getLogger("azure.core.pipeline.policies.http_logging_policy").setLevel(
 
 
 @platform_name("Iceberg")
-@support_status(SupportStatus.TESTING)
+@support_status(SupportStatus.INCUBATING)
 @config_class(IcebergSourceConfig)
 @capability(
     SourceCapability.PLATFORM_INSTANCE,