airbyte-source-google-drive 0.0.12__tar.gz → 0.1.0__tar.gz

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Potentially problematic release.


This version of airbyte-source-google-drive might be problematic. Click here for more details.

@@ -1,23 +1,22 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.3
2
2
  Name: airbyte-source-google-drive
3
- Version: 0.0.12
3
+ Version: 0.1.0
4
4
  Summary: Source implementation for Google Drive.
5
- Home-page: https://airbyte.com
6
5
  License: ELv2
7
6
  Author: Airbyte
8
7
  Author-email: contact@airbyte.io
9
- Requires-Python: >=3.9,<3.12
8
+ Requires-Python: >=3.10,<3.12
10
9
  Classifier: License :: Other/Proprietary License
11
10
  Classifier: Programming Language :: Python :: 3
12
- Classifier: Programming Language :: Python :: 3.9
13
11
  Classifier: Programming Language :: Python :: 3.10
14
12
  Classifier: Programming Language :: Python :: 3.11
15
- Requires-Dist: airbyte-cdk[file-based] (>=1,<2)
13
+ Requires-Dist: airbyte-cdk[file-based] (>=6.18.2,<7.0.0)
16
14
  Requires-Dist: google-api-python-client (==2.104.0)
17
15
  Requires-Dist: google-api-python-client-stubs (==1.18.0)
18
16
  Requires-Dist: google-auth-httplib2 (==0.1.1)
19
17
  Requires-Dist: google-auth-oauthlib (==1.1.0)
20
18
  Project-URL: Documentation, https://docs.airbyte.com/integrations/sources/google-drive
19
+ Project-URL: Homepage, https://airbyte.com
21
20
  Project-URL: Repository, https://github.com/airbytehq/airbyte
22
21
  Description-Content-Type: text/markdown
23
22
 
@@ -5,7 +5,7 @@ requires = [
5
5
  build-backend = "poetry.core.masonry.api"
6
6
 
7
7
  [tool.poetry]
8
- version = "0.0.12"
8
+ version = "0.1.0"
9
9
  name = "airbyte-source-google-drive"
10
10
  description = "Source implementation for Google Drive."
11
11
  authors = [
@@ -21,7 +21,7 @@ packages = [
21
21
  ]
22
22
 
23
23
  [tool.poetry.dependencies]
24
- python = "^3.9,<3.12"
24
+ python = "^3.10,<3.12"
25
25
  google-api-python-client = "==2.104.0"
26
26
  google-auth-httplib2 = "==0.1.1"
27
27
  google-auth-oauthlib = "==1.1.0"
@@ -31,7 +31,7 @@ google-api-python-client-stubs = "==1.18.0"
31
31
  extras = [
32
32
  "file-based",
33
33
  ]
34
- version = "^1"
34
+ version = "^6.18.2"
35
35
 
36
36
  [tool.poetry.scripts]
37
37
  source-google-drive = "source_google_drive.run:run"
@@ -0,0 +1,11 @@
1
+ # Copyright (c) 2024 Airbyte, Inc., all rights reserved.
2
+
3
+ from airbyte_cdk.sources.file_based.exceptions import BaseFileBasedSourceError
4
+
5
+
6
+ class ErrorFetchingMetadata(BaseFileBasedSourceError):
7
+ pass
8
+
9
+
10
+ class ErrorDownloadingFile(BaseFileBasedSourceError):
11
+ pass
@@ -6,6 +6,7 @@
6
6
  from typing import Any, Mapping, Optional
7
7
 
8
8
  from airbyte_cdk import AdvancedAuth, ConfiguredAirbyteCatalog, ConnectorSpecification, OAuthConfigSpecification, TState
9
+ from airbyte_cdk.models import AuthFlowType
9
10
  from airbyte_cdk.sources.file_based.file_based_source import FileBasedSource
10
11
  from airbyte_cdk.sources.file_based.stream.cursor.default_file_based_cursor import DefaultFileBasedCursor
11
12
  from source_google_drive.spec import SourceGoogleDriveSpec
@@ -32,7 +33,7 @@ class SourceGoogleDrive(FileBasedSource):
32
33
  documentationUrl=self.spec_class.documentation_url(),
33
34
  connectionSpecification=self.spec_class.schema(),
34
35
  advanced_auth=AdvancedAuth(
35
- auth_flow_type="oauth2.0",
36
+ auth_flow_type=AuthFlowType.oauth2_0,
36
37
  predicate_key=["credentials", "auth_type"],
37
38
  predicate_value="Client",
38
39
  oauth_config_specification=OAuthConfigSpecification(
@@ -6,9 +6,10 @@
6
6
  from typing import Any, Dict, Literal, Union
7
7
 
8
8
  import dpath.util
9
+ from pydantic.v1 import BaseModel, Field
10
+
9
11
  from airbyte_cdk import OneOfOptionConfig
10
- from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import AbstractFileBasedSpec
11
- from pydantic import BaseModel, Field
12
+ from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import AbstractFileBasedSpec, DeliverRawFiles, DeliverRecords
12
13
 
13
14
 
14
15
  class OAuthCredentials(BaseModel):
@@ -59,6 +60,16 @@ class SourceGoogleDriveSpec(AbstractFileBasedSpec, BaseModel):
59
60
  pattern_descriptor="https://drive.google.com/drive/folders/MY-FOLDER-ID",
60
61
  )
61
62
 
63
+ delivery_method: DeliverRecords | DeliverRawFiles = Field(
64
+ title="Delivery Method",
65
+ discriminator="delivery_type",
66
+ type="object",
67
+ order=1,
68
+ display_type="radio",
69
+ group="advanced",
70
+ default="use_records_transfer",
71
+ )
72
+
62
73
  credentials: Union[OAuthCredentials, ServiceAccountCredentials] = Field(
63
74
  title="Authentication", description="Credentials for connecting to the Google Drive API", discriminator="auth_type", type="object"
64
75
  )
@@ -8,25 +8,51 @@ import json
8
8
  import logging
9
9
  from datetime import datetime
10
10
  from io import IOBase
11
- from typing import Iterable, List, Optional, Set
11
+ from os.path import getsize
12
+ from typing import Dict, Iterable, List, Optional, Set
12
13
 
13
- from airbyte_cdk import AirbyteTracedException, FailureType
14
- from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
15
- from airbyte_cdk.sources.file_based.remote_file import RemoteFile
16
14
  from google.oauth2 import credentials, service_account
17
15
  from googleapiclient.discovery import build
18
16
  from googleapiclient.http import MediaIoBaseDownload
17
+
18
+ from airbyte_cdk import AirbyteTracedException, FailureType
19
+ from airbyte_cdk.sources.file_based.exceptions import FileSizeLimitError
20
+ from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
21
+ from airbyte_cdk.sources.file_based.remote_file import RemoteFile
19
22
  from source_google_drive.utils import get_folder_id
20
23
 
24
+ from .exceptions import ErrorDownloadingFile, ErrorFetchingMetadata
21
25
  from .spec import SourceGoogleDriveSpec
22
26
 
27
+
23
28
  FOLDER_MIME_TYPE = "application/vnd.google-apps.folder"
24
29
  GOOGLE_DOC_MIME_TYPE = "application/vnd.google-apps.document"
25
- EXPORTABLE_DOCUMENTS_MIME_TYPES = [
26
- GOOGLE_DOC_MIME_TYPE,
27
- "application/vnd.google-apps.presentation",
28
- "application/vnd.google-apps.drawing",
29
- ]
30
+ GOOGLE_PRESENTATION_MIME_TYPE = "application/vnd.google-apps.presentation"
31
+ GOOGLE_SPREADSHEET_MIME_TYPE = "application/vnd.google-apps.spreadsheet"
32
+ GOOGLE_DRAWING_MIME_TYPE = "application/vnd.google-apps.drawing"
33
+ EXPORTABLE_DOCUMENTS_MIME_TYPES = [GOOGLE_DOC_MIME_TYPE, GOOGLE_PRESENTATION_MIME_TYPE, GOOGLE_DRAWING_MIME_TYPE]
34
+
35
+ EXPORT_MEDIA_MIME_TYPE_DOC = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
36
+ EXPORT_MEDIA_MIME_TYPE_SPREADSHEET = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
37
+ EXPORT_MEDIA_MIME_TYPE_PRESENTATION = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
38
+ EXPORT_MEDIA_MIME_TYPE_PDF = "application/pdf"
39
+
40
+ # This key is used to relate the required mimeType parameter for export_media() method
41
+ EXPORT_MEDIA_MIME_TYPE_KEY = "exportable_mime_type"
42
+ DOCUMENT_FILE_EXTENSION_KEY = "document_file_extension"
43
+
44
+ DOWNLOADABLE_DOCUMENTS_MIME_TYPES = {
45
+ GOOGLE_DOC_MIME_TYPE: {EXPORT_MEDIA_MIME_TYPE_KEY: EXPORT_MEDIA_MIME_TYPE_DOC, DOCUMENT_FILE_EXTENSION_KEY: ".docx"},
46
+ GOOGLE_SPREADSHEET_MIME_TYPE: {
47
+ EXPORT_MEDIA_MIME_TYPE_KEY: EXPORT_MEDIA_MIME_TYPE_SPREADSHEET,
48
+ DOCUMENT_FILE_EXTENSION_KEY: ".xlsx",
49
+ },
50
+ GOOGLE_PRESENTATION_MIME_TYPE: {
51
+ EXPORT_MEDIA_MIME_TYPE_KEY: EXPORT_MEDIA_MIME_TYPE_PRESENTATION,
52
+ DOCUMENT_FILE_EXTENSION_KEY: ".pptx",
53
+ },
54
+ GOOGLE_DRAWING_MIME_TYPE: {EXPORT_MEDIA_MIME_TYPE_KEY: EXPORT_MEDIA_MIME_TYPE_PDF, DOCUMENT_FILE_EXTENSION_KEY: ".pdf"},
55
+ }
30
56
 
31
57
 
32
58
  class GoogleDriveRemoteFile(RemoteFile):
@@ -37,6 +63,8 @@ class GoogleDriveRemoteFile(RemoteFile):
37
63
 
38
64
 
39
65
  class SourceGoogleDriveStreamReader(AbstractFileBasedStreamReader):
66
+ FILE_SIZE_LIMIT = 1_500_000_000
67
+
40
68
  def __init__(self):
41
69
  super().__init__()
42
70
  self._drive_service = None
@@ -146,6 +174,8 @@ class SourceGoogleDriveStreamReader(AbstractFileBasedStreamReader):
146
174
  """
147
175
  Returns true if the given file is a Google App document that can be exported.
148
176
  """
177
+ if self.use_file_transfer():
178
+ return mime_type in DOWNLOADABLE_DOCUMENTS_MIME_TYPES
149
179
  return mime_type in EXPORTABLE_DOCUMENTS_MIME_TYPES
150
180
 
151
181
  def open_file(self, file: GoogleDriveRemoteFile, mode: FileReadMode, encoding: Optional[str], logger: logging.Logger) -> IOBase:
@@ -176,10 +206,79 @@ class SourceGoogleDriveStreamReader(AbstractFileBasedStreamReader):
176
206
  def _get_export_mime_type(self, original_mime_type: str):
177
207
  """
178
208
  Returns the mime type to export Google App documents as.
179
-
180
- Google Docs are exported as Docx to preserve as much formatting as possible, everything else goes through PDF.
181
209
  """
210
+ if self.use_file_transfer():
211
+ return DOWNLOADABLE_DOCUMENTS_MIME_TYPES[original_mime_type][EXPORT_MEDIA_MIME_TYPE_KEY]
212
+
182
213
  if original_mime_type.startswith(GOOGLE_DOC_MIME_TYPE):
214
+ # Google Docs are exported as Docx to preserve as much formatting as possible, everything else goes through PDF.
183
215
  return "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
184
216
  else:
185
217
  return "application/pdf"
218
+
219
+ def file_size(self, file: GoogleDriveRemoteFile) -> int:
220
+ """
221
+ Retrieves the size of a file in Google Drive.
222
+
223
+ Args:
224
+ file (RemoteFile): The file to get the size for.
225
+
226
+ Returns:
227
+ int: The file size in bytes.
228
+ ref: https://developers.google.com/drive/api/reference/rest/v3/files/get
229
+ """
230
+ try:
231
+ file_metadata = self.google_drive_service.files().get(fileId=file.id, fields="size", supportsAllDrives=True).execute()
232
+ return int(file_metadata["size"])
233
+ except KeyError:
234
+ raise ErrorFetchingMetadata(f"Size was expected in metadata response but was missing")
235
+ except Exception as e:
236
+ raise ErrorFetchingMetadata(f"An error occurred while retrieving file size: {str(e)}")
237
+
238
+ def get_file(self, file: GoogleDriveRemoteFile, local_directory: str, logger: logging.Logger) -> Dict[str, str | int]:
239
+ """
240
+ Downloads a file from Google Drive to a specified local directory.
241
+
242
+ Args:
243
+ file (RemoteFile): The file to download, containing its Google Drive ID.
244
+ local_directory (str): The local directory to save the file.
245
+ logger (logging.Logger): Logger for debugging and information.
246
+
247
+ Returns:
248
+ Dict[str, str | int]: Contains the local file path and file size in bytes.
249
+ ref: https://developers.google.com/drive/api/guides/manage-downloads#python
250
+ """
251
+ file_size = self.file_size(file)
252
+ # I'm putting this check here so we can remove the safety wheels per connector when ready.
253
+ if file_size > self.FILE_SIZE_LIMIT:
254
+ message = "File size exceeds the 1 GB limit."
255
+ raise FileSizeLimitError(message=message, internal_message=message, failure_type=FailureType.config_error)
256
+
257
+ try:
258
+ file_relative_path, local_file_path, absolute_file_path = self._get_file_transfer_paths(file, local_directory)
259
+
260
+ if self._is_exportable_document(file.original_mime_type):
261
+ request = self.google_drive_service.files().export_media(fileId=file.id, mimeType=file.mime_type)
262
+
263
+ file_extension = DOWNLOADABLE_DOCUMENTS_MIME_TYPES[file.original_mime_type][DOCUMENT_FILE_EXTENSION_KEY]
264
+ local_file_path += file_extension
265
+ absolute_file_path += file_extension
266
+ file_relative_path += file_extension
267
+ else:
268
+ request = self.google_drive_service.files().get_media(fileId=file.id)
269
+
270
+ with open(local_file_path, "wb") as local_file:
271
+ downloader = MediaIoBaseDownload(local_file, request)
272
+ done = False
273
+ while not done:
274
+ status, done = downloader.next_chunk(num_retries=5)
275
+ progress = status.resumable_progress / status.total_size * 100 if status.total_size else 0
276
+ logger.info(f"Processing file {file.uri}, progress: {progress:.2f}%")
277
+
278
+ # native google objects seems to be reporting lower size through the api than the final download size
279
+ file_size = getsize(local_file_path)
280
+
281
+ return {"file_url": absolute_file_path, "bytes": file_size, "file_relative_path": file_relative_path}
282
+
283
+ except Exception as e:
284
+ raise ErrorDownloadingFile(f"There was an error while trying to download the file {file.uri}: {str(e)}")