airbyte-source-google-drive 0.0.12__tar.gz → 0.1.0rc1.dev202501201950__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of airbyte-source-google-drive might be problematic. Click here for more details.
- {airbyte_source_google_drive-0.0.12 → airbyte_source_google_drive-0.1.0rc1.dev202501201950}/PKG-INFO +5 -6
- {airbyte_source_google_drive-0.0.12 → airbyte_source_google_drive-0.1.0rc1.dev202501201950}/pyproject.toml +3 -3
- airbyte_source_google_drive-0.1.0rc1.dev202501201950/source_google_drive/exceptions.py +11 -0
- {airbyte_source_google_drive-0.0.12 → airbyte_source_google_drive-0.1.0rc1.dev202501201950}/source_google_drive/source.py +2 -1
- {airbyte_source_google_drive-0.0.12 → airbyte_source_google_drive-0.1.0rc1.dev202501201950}/source_google_drive/spec.py +13 -2
- {airbyte_source_google_drive-0.0.12 → airbyte_source_google_drive-0.1.0rc1.dev202501201950}/source_google_drive/stream_reader.py +110 -11
- {airbyte_source_google_drive-0.0.12 → airbyte_source_google_drive-0.1.0rc1.dev202501201950}/README.md +0 -0
- {airbyte_source_google_drive-0.0.12 → airbyte_source_google_drive-0.1.0rc1.dev202501201950}/source_google_drive/__init__.py +0 -0
- {airbyte_source_google_drive-0.0.12 → airbyte_source_google_drive-0.1.0rc1.dev202501201950}/source_google_drive/run.py +0 -0
- {airbyte_source_google_drive-0.0.12 → airbyte_source_google_drive-0.1.0rc1.dev202501201950}/source_google_drive/utils.py +0 -0
{airbyte_source_google_drive-0.0.12 → airbyte_source_google_drive-0.1.0rc1.dev202501201950}/PKG-INFO
RENAMED
|
@@ -1,23 +1,22 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
2
|
Name: airbyte-source-google-drive
|
|
3
|
-
Version: 0.0.12
|
|
3
|
+
Version: 0.1.0rc1.dev202501201950
|
|
4
4
|
Summary: Source implementation for Google Drive.
|
|
5
|
-
Home-page: https://airbyte.com
|
|
6
5
|
License: ELv2
|
|
7
6
|
Author: Airbyte
|
|
8
7
|
Author-email: contact@airbyte.io
|
|
9
|
-
Requires-Python: >=3.
|
|
8
|
+
Requires-Python: >=3.10,<3.12
|
|
10
9
|
Classifier: License :: Other/Proprietary License
|
|
11
10
|
Classifier: Programming Language :: Python :: 3
|
|
12
|
-
Classifier: Programming Language :: Python :: 3.9
|
|
13
11
|
Classifier: Programming Language :: Python :: 3.10
|
|
14
12
|
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
-
Requires-Dist: airbyte-cdk[file-based] (>=
|
|
13
|
+
Requires-Dist: airbyte-cdk[file-based] (>=6.18.2,<7.0.0)
|
|
16
14
|
Requires-Dist: google-api-python-client (==2.104.0)
|
|
17
15
|
Requires-Dist: google-api-python-client-stubs (==1.18.0)
|
|
18
16
|
Requires-Dist: google-auth-httplib2 (==0.1.1)
|
|
19
17
|
Requires-Dist: google-auth-oauthlib (==1.1.0)
|
|
20
18
|
Project-URL: Documentation, https://docs.airbyte.com/integrations/sources/google-drive
|
|
19
|
+
Project-URL: Homepage, https://airbyte.com
|
|
21
20
|
Project-URL: Repository, https://github.com/airbytehq/airbyte
|
|
22
21
|
Description-Content-Type: text/markdown
|
|
23
22
|
|
|
@@ -5,7 +5,7 @@ requires = [
|
|
|
5
5
|
build-backend = "poetry.core.masonry.api"
|
|
6
6
|
|
|
7
7
|
[tool.poetry]
|
|
8
|
-
version = "0.0.12"
|
|
8
|
+
version = "0.1.0-rc.1.dev202501201950"
|
|
9
9
|
name = "airbyte-source-google-drive"
|
|
10
10
|
description = "Source implementation for Google Drive."
|
|
11
11
|
authors = [
|
|
@@ -21,7 +21,7 @@ packages = [
|
|
|
21
21
|
]
|
|
22
22
|
|
|
23
23
|
[tool.poetry.dependencies]
|
|
24
|
-
python = "^3.
|
|
24
|
+
python = "^3.10,<3.12"
|
|
25
25
|
google-api-python-client = "==2.104.0"
|
|
26
26
|
google-auth-httplib2 = "==0.1.1"
|
|
27
27
|
google-auth-oauthlib = "==1.1.0"
|
|
@@ -31,7 +31,7 @@ google-api-python-client-stubs = "==1.18.0"
|
|
|
31
31
|
extras = [
|
|
32
32
|
"file-based",
|
|
33
33
|
]
|
|
34
|
-
version = "^
|
|
34
|
+
version = "^6.18.2"
|
|
35
35
|
|
|
36
36
|
[tool.poetry.scripts]
|
|
37
37
|
source-google-drive = "source_google_drive.run:run"
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
|
|
2
|
+
|
|
3
|
+
from airbyte_cdk.sources.file_based.exceptions import BaseFileBasedSourceError
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class ErrorFetchingMetadata(BaseFileBasedSourceError):
|
|
7
|
+
pass
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class ErrorDownloadingFile(BaseFileBasedSourceError):
|
|
11
|
+
pass
|
|
@@ -6,6 +6,7 @@
|
|
|
6
6
|
from typing import Any, Mapping, Optional
|
|
7
7
|
|
|
8
8
|
from airbyte_cdk import AdvancedAuth, ConfiguredAirbyteCatalog, ConnectorSpecification, OAuthConfigSpecification, TState
|
|
9
|
+
from airbyte_cdk.models import AuthFlowType
|
|
9
10
|
from airbyte_cdk.sources.file_based.file_based_source import FileBasedSource
|
|
10
11
|
from airbyte_cdk.sources.file_based.stream.cursor.default_file_based_cursor import DefaultFileBasedCursor
|
|
11
12
|
from source_google_drive.spec import SourceGoogleDriveSpec
|
|
@@ -32,7 +33,7 @@ class SourceGoogleDrive(FileBasedSource):
|
|
|
32
33
|
documentationUrl=self.spec_class.documentation_url(),
|
|
33
34
|
connectionSpecification=self.spec_class.schema(),
|
|
34
35
|
advanced_auth=AdvancedAuth(
|
|
35
|
-
auth_flow_type=
|
|
36
|
+
auth_flow_type=AuthFlowType.oauth2_0,
|
|
36
37
|
predicate_key=["credentials", "auth_type"],
|
|
37
38
|
predicate_value="Client",
|
|
38
39
|
oauth_config_specification=OAuthConfigSpecification(
|
|
@@ -6,9 +6,10 @@
|
|
|
6
6
|
from typing import Any, Dict, Literal, Union
|
|
7
7
|
|
|
8
8
|
import dpath.util
|
|
9
|
+
from pydantic.v1 import BaseModel, Field
|
|
10
|
+
|
|
9
11
|
from airbyte_cdk import OneOfOptionConfig
|
|
10
|
-
from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import AbstractFileBasedSpec
|
|
11
|
-
from pydantic import BaseModel, Field
|
|
12
|
+
from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import AbstractFileBasedSpec, DeliverRawFiles, DeliverRecords
|
|
12
13
|
|
|
13
14
|
|
|
14
15
|
class OAuthCredentials(BaseModel):
|
|
@@ -59,6 +60,16 @@ class SourceGoogleDriveSpec(AbstractFileBasedSpec, BaseModel):
|
|
|
59
60
|
pattern_descriptor="https://drive.google.com/drive/folders/MY-FOLDER-ID",
|
|
60
61
|
)
|
|
61
62
|
|
|
63
|
+
delivery_method: DeliverRecords | DeliverRawFiles = Field(
|
|
64
|
+
title="Delivery Method",
|
|
65
|
+
discriminator="delivery_type",
|
|
66
|
+
type="object",
|
|
67
|
+
order=1,
|
|
68
|
+
display_type="radio",
|
|
69
|
+
group="advanced",
|
|
70
|
+
default="use_records_transfer",
|
|
71
|
+
)
|
|
72
|
+
|
|
62
73
|
credentials: Union[OAuthCredentials, ServiceAccountCredentials] = Field(
|
|
63
74
|
title="Authentication", description="Credentials for connecting to the Google Drive API", discriminator="auth_type", type="object"
|
|
64
75
|
)
|
|
@@ -8,25 +8,51 @@ import json
|
|
|
8
8
|
import logging
|
|
9
9
|
from datetime import datetime
|
|
10
10
|
from io import IOBase
|
|
11
|
-
from
|
|
11
|
+
from os.path import getsize
|
|
12
|
+
from typing import Dict, Iterable, List, Optional, Set
|
|
12
13
|
|
|
13
|
-
from airbyte_cdk import AirbyteTracedException, FailureType
|
|
14
|
-
from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
|
|
15
|
-
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
|
|
16
14
|
from google.oauth2 import credentials, service_account
|
|
17
15
|
from googleapiclient.discovery import build
|
|
18
16
|
from googleapiclient.http import MediaIoBaseDownload
|
|
17
|
+
|
|
18
|
+
from airbyte_cdk import AirbyteTracedException, FailureType
|
|
19
|
+
from airbyte_cdk.sources.file_based.exceptions import FileSizeLimitError
|
|
20
|
+
from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
|
|
21
|
+
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
|
|
19
22
|
from source_google_drive.utils import get_folder_id
|
|
20
23
|
|
|
24
|
+
from .exceptions import ErrorDownloadingFile, ErrorFetchingMetadata
|
|
21
25
|
from .spec import SourceGoogleDriveSpec
|
|
22
26
|
|
|
27
|
+
|
|
23
28
|
FOLDER_MIME_TYPE = "application/vnd.google-apps.folder"
|
|
24
29
|
GOOGLE_DOC_MIME_TYPE = "application/vnd.google-apps.document"
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
+
GOOGLE_PRESENTATION_MIME_TYPE = "application/vnd.google-apps.presentation"
|
|
31
|
+
GOOGLE_SPREADSHEET_MIME_TYPE = "application/vnd.google-apps.spreadsheet"
|
|
32
|
+
GOOGLE_DRAWING_MIME_TYPE = "application/vnd.google-apps.drawing"
|
|
33
|
+
EXPORTABLE_DOCUMENTS_MIME_TYPES = [GOOGLE_DOC_MIME_TYPE, GOOGLE_PRESENTATION_MIME_TYPE, GOOGLE_DRAWING_MIME_TYPE]
|
|
34
|
+
|
|
35
|
+
EXPORT_MEDIA_MIME_TYPE_DOC = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
|
36
|
+
EXPORT_MEDIA_MIME_TYPE_SPREADSHEET = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
|
37
|
+
EXPORT_MEDIA_MIME_TYPE_PRESENTATION = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
|
|
38
|
+
EXPORT_MEDIA_MIME_TYPE_PDF = "application/pdf"
|
|
39
|
+
|
|
40
|
+
# This key is used to relate the required mimeType parameter for export_media() method
|
|
41
|
+
EXPORT_MEDIA_MIME_TYPE_KEY = "exportable_mime_type"
|
|
42
|
+
DOCUMENT_FILE_EXTENSION_KEY = "document_file_extension"
|
|
43
|
+
|
|
44
|
+
DOWNLOADABLE_DOCUMENTS_MIME_TYPES = {
|
|
45
|
+
GOOGLE_DOC_MIME_TYPE: {EXPORT_MEDIA_MIME_TYPE_KEY: EXPORT_MEDIA_MIME_TYPE_DOC, DOCUMENT_FILE_EXTENSION_KEY: ".docx"},
|
|
46
|
+
GOOGLE_SPREADSHEET_MIME_TYPE: {
|
|
47
|
+
EXPORT_MEDIA_MIME_TYPE_KEY: EXPORT_MEDIA_MIME_TYPE_SPREADSHEET,
|
|
48
|
+
DOCUMENT_FILE_EXTENSION_KEY: ".xlsx",
|
|
49
|
+
},
|
|
50
|
+
GOOGLE_PRESENTATION_MIME_TYPE: {
|
|
51
|
+
EXPORT_MEDIA_MIME_TYPE_KEY: EXPORT_MEDIA_MIME_TYPE_PRESENTATION,
|
|
52
|
+
DOCUMENT_FILE_EXTENSION_KEY: ".pptx",
|
|
53
|
+
},
|
|
54
|
+
GOOGLE_DRAWING_MIME_TYPE: {EXPORT_MEDIA_MIME_TYPE_KEY: EXPORT_MEDIA_MIME_TYPE_PDF, DOCUMENT_FILE_EXTENSION_KEY: ".pdf"},
|
|
55
|
+
}
|
|
30
56
|
|
|
31
57
|
|
|
32
58
|
class GoogleDriveRemoteFile(RemoteFile):
|
|
@@ -37,6 +63,8 @@ class GoogleDriveRemoteFile(RemoteFile):
|
|
|
37
63
|
|
|
38
64
|
|
|
39
65
|
class SourceGoogleDriveStreamReader(AbstractFileBasedStreamReader):
|
|
66
|
+
FILE_SIZE_LIMIT = 1_500_000_000
|
|
67
|
+
|
|
40
68
|
def __init__(self):
|
|
41
69
|
super().__init__()
|
|
42
70
|
self._drive_service = None
|
|
@@ -146,6 +174,8 @@ class SourceGoogleDriveStreamReader(AbstractFileBasedStreamReader):
|
|
|
146
174
|
"""
|
|
147
175
|
Returns true if the given file is a Google App document that can be exported.
|
|
148
176
|
"""
|
|
177
|
+
if self.use_file_transfer():
|
|
178
|
+
return mime_type in DOWNLOADABLE_DOCUMENTS_MIME_TYPES
|
|
149
179
|
return mime_type in EXPORTABLE_DOCUMENTS_MIME_TYPES
|
|
150
180
|
|
|
151
181
|
def open_file(self, file: GoogleDriveRemoteFile, mode: FileReadMode, encoding: Optional[str], logger: logging.Logger) -> IOBase:
|
|
@@ -176,10 +206,79 @@ class SourceGoogleDriveStreamReader(AbstractFileBasedStreamReader):
|
|
|
176
206
|
def _get_export_mime_type(self, original_mime_type: str):
|
|
177
207
|
"""
|
|
178
208
|
Returns the mime type to export Google App documents as.
|
|
179
|
-
|
|
180
|
-
Google Docs are exported as Docx to preserve as much formatting as possible, everything else goes through PDF.
|
|
181
209
|
"""
|
|
210
|
+
if self.use_file_transfer():
|
|
211
|
+
return DOWNLOADABLE_DOCUMENTS_MIME_TYPES[original_mime_type][EXPORT_MEDIA_MIME_TYPE_KEY]
|
|
212
|
+
|
|
182
213
|
if original_mime_type.startswith(GOOGLE_DOC_MIME_TYPE):
|
|
214
|
+
# Google Docs are exported as Docx to preserve as much formatting as possible, everything else goes through PDF.
|
|
183
215
|
return "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
|
184
216
|
else:
|
|
185
217
|
return "application/pdf"
|
|
218
|
+
|
|
219
|
+
def file_size(self, file: GoogleDriveRemoteFile) -> int:
|
|
220
|
+
"""
|
|
221
|
+
Retrieves the size of a file in Google Drive.
|
|
222
|
+
|
|
223
|
+
Args:
|
|
224
|
+
file (RemoteFile): The file to get the size for.
|
|
225
|
+
|
|
226
|
+
Returns:
|
|
227
|
+
int: The file size in bytes.
|
|
228
|
+
ref: https://developers.google.com/drive/api/reference/rest/v3/files/get
|
|
229
|
+
"""
|
|
230
|
+
try:
|
|
231
|
+
file_metadata = self.google_drive_service.files().get(fileId=file.id, fields="size", supportsAllDrives=True).execute()
|
|
232
|
+
return int(file_metadata["size"])
|
|
233
|
+
except KeyError:
|
|
234
|
+
raise ErrorFetchingMetadata(f"Size was expected in metadata response but was missing")
|
|
235
|
+
except Exception as e:
|
|
236
|
+
raise ErrorFetchingMetadata(f"An error occurred while retrieving file size: {str(e)}")
|
|
237
|
+
|
|
238
|
+
def get_file(self, file: GoogleDriveRemoteFile, local_directory: str, logger: logging.Logger) -> Dict[str, str | int]:
|
|
239
|
+
"""
|
|
240
|
+
Downloads a file from Google Drive to a specified local directory.
|
|
241
|
+
|
|
242
|
+
Args:
|
|
243
|
+
file (RemoteFile): The file to download, containing its Google Drive ID.
|
|
244
|
+
local_directory (str): The local directory to save the file.
|
|
245
|
+
logger (logging.Logger): Logger for debugging and information.
|
|
246
|
+
|
|
247
|
+
Returns:
|
|
248
|
+
Dict[str, str | int]: Contains the local file path and file size in bytes.
|
|
249
|
+
ref: https://developers.google.com/drive/api/guides/manage-downloads#python
|
|
250
|
+
"""
|
|
251
|
+
file_size = self.file_size(file)
|
|
252
|
+
# I'm putting this check here so we can remove the safety wheels per connector when ready.
|
|
253
|
+
if file_size > self.FILE_SIZE_LIMIT:
|
|
254
|
+
message = "File size exceeds the 1 GB limit."
|
|
255
|
+
raise FileSizeLimitError(message=message, internal_message=message, failure_type=FailureType.config_error)
|
|
256
|
+
|
|
257
|
+
try:
|
|
258
|
+
file_relative_path, local_file_path, absolute_file_path = self._get_file_transfer_paths(file, local_directory)
|
|
259
|
+
|
|
260
|
+
if self._is_exportable_document(file.original_mime_type):
|
|
261
|
+
request = self.google_drive_service.files().export_media(fileId=file.id, mimeType=file.mime_type)
|
|
262
|
+
|
|
263
|
+
file_extension = DOWNLOADABLE_DOCUMENTS_MIME_TYPES[file.original_mime_type][DOCUMENT_FILE_EXTENSION_KEY]
|
|
264
|
+
local_file_path += file_extension
|
|
265
|
+
absolute_file_path += file_extension
|
|
266
|
+
file_relative_path += file_extension
|
|
267
|
+
else:
|
|
268
|
+
request = self.google_drive_service.files().get_media(fileId=file.id)
|
|
269
|
+
|
|
270
|
+
with open(local_file_path, "wb") as local_file:
|
|
271
|
+
downloader = MediaIoBaseDownload(local_file, request)
|
|
272
|
+
done = False
|
|
273
|
+
while not done:
|
|
274
|
+
status, done = downloader.next_chunk(num_retries=5)
|
|
275
|
+
progress = status.resumable_progress / status.total_size * 100 if status.total_size else 0
|
|
276
|
+
logger.info(f"Processing file {file.uri}, progress: {progress:.2f}%")
|
|
277
|
+
|
|
278
|
+
# native google objects seems to be reporting lower size through the api than the final download size
|
|
279
|
+
file_size = getsize(local_file_path)
|
|
280
|
+
|
|
281
|
+
return {"file_url": absolute_file_path, "bytes": file_size, "file_relative_path": file_relative_path}
|
|
282
|
+
|
|
283
|
+
except Exception as e:
|
|
284
|
+
raise ErrorDownloadingFile(f"There was an error while trying to download the file: {str(e)}")
|
|
File without changes
|
|
File without changes
|
|
File without changes
|