airbyte-source-google-drive 0.2.4.dev202503121621__tar.gz → 0.3.0__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of airbyte-source-google-drive might be problematic. Click here for more details.

Files changed (13)
  1. {airbyte_source_google_drive-0.2.4.dev202503121621 → airbyte_source_google_drive-0.3.0}/PKG-INFO +2 -2
  2. {airbyte_source_google_drive-0.2.4.dev202503121621 → airbyte_source_google_drive-0.3.0}/pyproject.toml +2 -2
  3. {airbyte_source_google_drive-0.2.4.dev202503121621 → airbyte_source_google_drive-0.3.0}/source_google_drive/source.py +2 -0
  4. airbyte_source_google_drive-0.3.0/source_google_drive/stream_permissions_reader.py +221 -0
  5. {airbyte_source_google_drive-0.2.4.dev202503121621 → airbyte_source_google_drive-0.3.0}/source_google_drive/stream_reader.py +0 -159
  6. {airbyte_source_google_drive-0.2.4.dev202503121621 → airbyte_source_google_drive-0.3.0}/README.md +0 -0
  7. {airbyte_source_google_drive-0.2.4.dev202503121621 → airbyte_source_google_drive-0.3.0}/source_google_drive/__init__.py +0 -0
  8. {airbyte_source_google_drive-0.2.4.dev202503121621 → airbyte_source_google_drive-0.3.0}/source_google_drive/exceptions.py +0 -0
  9. {airbyte_source_google_drive-0.2.4.dev202503121621 → airbyte_source_google_drive-0.3.0}/source_google_drive/run.py +0 -0
  10. {airbyte_source_google_drive-0.2.4.dev202503121621 → airbyte_source_google_drive-0.3.0}/source_google_drive/schemas/file_permissions.json +0 -0
  11. {airbyte_source_google_drive-0.2.4.dev202503121621 → airbyte_source_google_drive-0.3.0}/source_google_drive/schemas/identities.json +0 -0
  12. {airbyte_source_google_drive-0.2.4.dev202503121621 → airbyte_source_google_drive-0.3.0}/source_google_drive/spec.py +0 -0
  13. {airbyte_source_google_drive-0.2.4.dev202503121621 → airbyte_source_google_drive-0.3.0}/source_google_drive/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: airbyte-source-google-drive
3
- Version: 0.2.4.dev202503121621
3
+ Version: 0.3.0
4
4
  Summary: Source implementation for Google Drive.
5
5
  License: ELv2
6
6
  Author: Airbyte
@@ -10,7 +10,7 @@ Classifier: License :: Other/Proprietary License
10
10
  Classifier: Programming Language :: Python :: 3
11
11
  Classifier: Programming Language :: Python :: 3.10
12
12
  Classifier: Programming Language :: Python :: 3.11
13
- Requires-Dist: airbyte-cdk[file-based] (>=6.33.6,<7.0.0)
13
+ Requires-Dist: airbyte-cdk[file-based] (>=6.38.5,<7.0.0)
14
14
  Requires-Dist: google-api-python-client (==2.104.0)
15
15
  Requires-Dist: google-api-python-client-stubs (==1.18.0)
16
16
  Requires-Dist: google-auth-httplib2 (==0.1.1)
@@ -5,7 +5,7 @@ requires = [
5
5
  build-backend = "poetry.core.masonry.api"
6
6
 
7
7
  [tool.poetry]
8
- version = "0.2.4.dev202503121621"
8
+ version = "0.3.0"
9
9
  name = "airbyte-source-google-drive"
10
10
  description = "Source implementation for Google Drive."
11
11
  authors = [
@@ -31,7 +31,7 @@ google-api-python-client-stubs = "==1.18.0"
31
31
  extras = [
32
32
  "file-based",
33
33
  ]
34
- version = "^6.33.6"
34
+ version = "^6.38.5"
35
35
 
36
36
  [tool.poetry.scripts]
37
37
  source-google-drive = "source_google_drive.run:run"
@@ -10,6 +10,7 @@ from airbyte_cdk.models import AuthFlowType, OauthConnectorInputSpecification
10
10
  from airbyte_cdk.sources.file_based.file_based_source import FileBasedSource
11
11
  from airbyte_cdk.sources.file_based.stream.cursor.default_file_based_cursor import DefaultFileBasedCursor
12
12
  from source_google_drive.spec import SourceGoogleDriveSpec
13
+ from source_google_drive.stream_permissions_reader import SourceGoogleDriveStreamPermissionsReader
13
14
  from source_google_drive.stream_reader import SourceGoogleDriveStreamReader
14
15
 
15
16
 
@@ -22,6 +23,7 @@ class SourceGoogleDrive(FileBasedSource):
22
23
  config=config,
23
24
  state=state,
24
25
  cursor_cls=DefaultFileBasedCursor,
26
+ stream_permissions_reader=SourceGoogleDriveStreamPermissionsReader(),
25
27
  )
26
28
 
27
29
  def spec(self, *args: Any, **kwargs: Any) -> ConnectorSpecification:
@@ -0,0 +1,221 @@
1
+ # Copyright (c) 2025 Airbyte, Inc., all rights reserved.
2
+
3
+ import json
4
+ import logging
5
+ import uuid
6
+ from datetime import datetime
7
+ from typing import Any, Dict, Iterator, List, Tuple
8
+
9
+ import pytz
10
+ from google.oauth2 import credentials, service_account
11
+ from googleapiclient.discovery import build
12
+
13
+ from airbyte_cdk import AirbyteTracedException, FailureType
14
+ from airbyte_cdk.sources.file_based.file_based_stream_permissions_reader import AbstractFileBasedStreamPermissionsReader
15
+ from airbyte_cdk.sources.streams.core import package_name_from_class
16
+ from airbyte_cdk.sources.utils.schema_helpers import ResourceSchemaLoader
17
+ from source_google_drive.exceptions import ErrorFetchingMetadata
18
+ from source_google_drive.spec import RemoteIdentity, RemoteIdentityType, RemotePermissions, SourceGoogleDriveSpec
19
+
20
+
21
+ DRIVE_SERVICE_SCOPES = [
22
+ "https://www.googleapis.com/auth/admin.directory.group.readonly",
23
+ "https://www.googleapis.com/auth/admin.directory.group.member.readonly",
24
+ "https://www.googleapis.com/auth/admin.directory.user.readonly",
25
+ ]
26
+
27
+ PUBLIC_PERMISSION_IDS = [
28
+ "anyoneWithLink",
29
+ "anyoneCanFind",
30
+ "domainCanFind",
31
+ "domainWithLink",
32
+ ]
33
+
34
+
35
+ def datetime_now() -> datetime:
36
+ return datetime.now(pytz.UTC)
37
+
38
+
39
+ class SourceGoogleDriveStreamPermissionsReader(AbstractFileBasedStreamPermissionsReader):
40
+ def __init__(self):
41
+ super().__init__()
42
+ self._drive_service = None
43
+ self._directory_service = None
44
+
45
+ @property
46
+ def config(self) -> SourceGoogleDriveSpec:
47
+ return self._config
48
+
49
+ @config.setter
50
+ def config(self, value: SourceGoogleDriveSpec):
51
+ assert isinstance(value, SourceGoogleDriveSpec)
52
+ self._config = value
53
+
54
+ def _build_google_service(self, service_name: str, version: str, scopes: List[str] = None):
55
+ if self.config is None:
56
+ # We shouldn't hit this; config should always get set before attempting to
57
+ # list or read files.
58
+ raise ValueError(f"Source config is missing; cannot create the Google {service_name} client.")
59
+ try:
60
+ if self.config.credentials.auth_type == "Client":
61
+ creds = credentials.Credentials.from_authorized_user_info(self.config.credentials.dict())
62
+ else:
63
+ creds = service_account.Credentials.from_service_account_info(
64
+ json.loads(self.config.credentials.service_account_info), scopes=scopes
65
+ )
66
+ google_service = build(service_name, version, credentials=creds)
67
+ except Exception as e:
68
+ raise AirbyteTracedException(
69
+ internal_message=str(e),
70
+ message=f"Could not authenticate with Google {service_name}. Please check your credentials.",
71
+ failure_type=FailureType.config_error,
72
+ exception=e,
73
+ )
74
+
75
+ return google_service
76
+
77
+ @property
78
+ def google_drive_service(self):
79
+ if self._drive_service is None:
80
+ self._drive_service = self._build_google_service("drive", "v3")
81
+ return self._drive_service
82
+
83
+ @property
84
+ def google_directory_service(self):
85
+ if self._directory_service is None:
86
+ self._directory_service = self._build_google_service("admin", "directory_v1", DRIVE_SERVICE_SCOPES)
87
+ return self._directory_service
88
+
89
+ def _get_looping_google_api_list_response(
90
+ self, service: Any, key: str, args: dict[str, Any], logger: logging.Logger
91
+ ) -> Iterator[dict[str, Any]]:
92
+ try:
93
+ looping = True
94
+ next_page_token: str | None = None
95
+ while looping:
96
+ rsp = service.list(pageToken=next_page_token, **args).execute()
97
+ next_page_token = rsp.get("nextPageToken")
98
+ items: list[dict[str, Any]] = rsp.get(key)
99
+
100
+ if items is None or len(items) == 0:
101
+ looping = False
102
+ break
103
+
104
+ if rsp.get("nextPageToken") is None:
105
+ looping = False
106
+ else:
107
+ next_page_token = rsp.get("nextPageToken")
108
+
109
+ for item in items:
110
+ yield item
111
+ except Exception as e:
112
+ logger.error(f"There was an error listing {key} with {args}: {str(e)}")
113
+ raise e
114
+
115
+ def _to_remote_file_identity(self, identity: dict[str, Any]) -> RemoteIdentity | None:
116
+ if identity.get("id") in PUBLIC_PERMISSION_IDS:
117
+ return None
118
+ if identity.get("deleted") is True:
119
+ return None
120
+
121
+ return RemoteIdentity(
122
+ modified_at=datetime.now(),
123
+ id=uuid.uuid4(),
124
+ remote_id=identity.get("emailAddress"),
125
+ name=identity.get("name"),
126
+ email_address=identity.get("emailAddress"),
127
+ type=identity.get("type"),
128
+ description=None,
129
+ )
130
+
131
+ def get_file_permissions(self, file_id: str, file_name: str, logger: logging.Logger) -> Tuple[List[RemoteIdentity], bool]:
132
+ """
133
+ Retrieves the permissions of a file in Google Drive and checks for public access.
134
+
135
+ Args:
136
+ file_id (str): The file to get permissions for.
137
+ file_name (str): The name of the file to get permissions for.
138
+ logger (logging.Logger): Logger for debugging and information.
139
+
140
+ Returns:
141
+ Tuple(List[RemoteFileIdentity], boolean): A list of RemoteFileIdentity objects containing permission details.
142
+ """
143
+ try:
144
+ request = self.google_drive_service.permissions().list(
145
+ fileId=file_id,
146
+ fields="permissions, permissions/role, permissions/type, permissions/id, permissions/emailAddress",
147
+ supportsAllDrives=True,
148
+ )
149
+ response = request.execute()
150
+ permissions = response.get("permissions", [])
151
+ is_public = False
152
+
153
+ remote_identities = []
154
+
155
+ for p in permissions:
156
+ identity = self._to_remote_file_identity(p)
157
+ if p.get("id") in PUBLIC_PERMISSION_IDS:
158
+ is_public = True
159
+ if identity is not None:
160
+ remote_identities.append(identity)
161
+
162
+ return remote_identities, is_public
163
+ except Exception as e:
164
+ raise ErrorFetchingMetadata(f"An error occurred while retrieving file permissions: {str(e)}")
165
+
166
+ def get_file_acl_permissions(self, file: Any, logger: logging.Logger) -> Dict[str, Any]:
167
+ remote_identities, is_public = self.get_file_permissions(file.id, file_name=file.uri, logger=logger)
168
+ return RemotePermissions(
169
+ id=file.id,
170
+ file_path=file.uri,
171
+ allowed_identity_remote_ids=[p.remote_id for p in remote_identities],
172
+ publicly_accessible=is_public,
173
+ ).dict(exclude_none=True)
174
+
175
+ def load_identity_groups(self, logger: logging.Logger) -> Iterator[Dict[str, Any]]:
176
+ domain = self.config.delivery_method.domain
177
+ if not domain:
178
+ logger.info("No domain provided. Trying to fetch identities from the user workspace.")
179
+ api_args = {"customer": "my_customer"}
180
+ else:
181
+ api_args = {"domain": domain}
182
+
183
+ users_api = self.google_directory_service.users()
184
+ groups_api = self.google_directory_service.groups()
185
+ members_api = self.google_directory_service.members()
186
+
187
+ for user in self._get_looping_google_api_list_response(users_api, "users", args=api_args, logger=logger):
188
+ rfp = RemoteIdentity(
189
+ id=uuid.uuid4(),
190
+ remote_id=user["primaryEmail"],
191
+ name=user["name"]["fullName"] if user["name"] is not None else None,
192
+ email_address=user["primaryEmail"],
193
+ member_email_addresses=[x["address"] for x in user["emails"]],
194
+ type=RemoteIdentityType.USER,
195
+ modified_at=datetime_now(),
196
+ )
197
+ yield rfp.dict()
198
+
199
+ for group in self._get_looping_google_api_list_response(groups_api, "groups", args=api_args, logger=logger):
200
+ rfp = RemoteIdentity(
201
+ id=uuid.uuid4(),
202
+ remote_id=group["email"],
203
+ name=group["name"],
204
+ email_address=group["email"],
205
+ type=RemoteIdentityType.GROUP,
206
+ modified_at=datetime_now(),
207
+ )
208
+
209
+ for member in self._get_looping_google_api_list_response(members_api, "members", {"groupKey": group["id"]}, logger):
210
+ rfp.member_email_addresses = rfp.member_email_addresses or []
211
+ rfp.member_email_addresses.append(member["email"])
212
+
213
+ yield rfp.dict()
214
+
215
+ @property
216
+ def file_permissions_schema(self) -> Dict[str, Any]:
217
+ return ResourceSchemaLoader(package_name_from_class(self.__class__)).get_schema("file_permissions")
218
+
219
+ @property
220
+ def identities_schema(self) -> Dict[str, Any]:
221
+ return ResourceSchemaLoader(package_name_from_class(self.__class__)).get_schema("identities")
@@ -58,24 +58,6 @@ DOWNLOADABLE_DOCUMENTS_MIME_TYPES = {
58
58
  GOOGLE_DRAWING_MIME_TYPE: {EXPORT_MEDIA_MIME_TYPE_KEY: EXPORT_MEDIA_MIME_TYPE_PDF, DOCUMENT_FILE_EXTENSION_KEY: ".pdf"},
59
59
  }
60
60
 
61
- PUBLIC_PERMISSION_IDS = [
62
- "anyoneWithLink",
63
- "anyoneCanFind",
64
- "domainCanFind",
65
- "domainWithLink",
66
- ]
67
-
68
-
69
- DRIVE_SERVICE_SCOPES = [
70
- "https://www.googleapis.com/auth/admin.directory.group.readonly",
71
- "https://www.googleapis.com/auth/admin.directory.group.member.readonly",
72
- "https://www.googleapis.com/auth/admin.directory.user.readonly",
73
- ]
74
-
75
-
76
- def datetime_now() -> datetime:
77
- return datetime.now(pytz.UTC)
78
-
79
61
 
80
62
  class GoogleDriveRemoteFile(RemoteFile):
81
63
  id: str
@@ -90,7 +72,6 @@ class SourceGoogleDriveStreamReader(AbstractFileBasedStreamReader):
90
72
  def __init__(self):
91
73
  super().__init__()
92
74
  self._drive_service = None
93
- self._directory_service = None
94
75
 
95
76
  @property
96
77
  def config(self) -> SourceGoogleDriveSpec:
@@ -139,12 +120,6 @@ class SourceGoogleDriveStreamReader(AbstractFileBasedStreamReader):
139
120
  self._drive_service = self._build_google_service("drive", "v3")
140
121
  return self._drive_service
141
122
 
142
- @property
143
- def google_directory_service(self):
144
- if self._directory_service is None:
145
- self._directory_service = self._build_google_service("admin", "directory_v1", DRIVE_SERVICE_SCOPES)
146
- return self._directory_service
147
-
148
123
  def get_matching_files(self, globs: List[str], prefix: Optional[str], logger: logging.Logger) -> Iterable[RemoteFile]:
149
124
  """
150
125
  Get all files matching the specified glob patterns.
@@ -317,137 +292,3 @@ class SourceGoogleDriveStreamReader(AbstractFileBasedStreamReader):
317
292
 
318
293
  except Exception as e:
319
294
  raise ErrorDownloadingFile(f"There was an error while trying to download the file {file.uri}: {str(e)}")
320
-
321
- def get_file_permissions(self, file_id: str, file_name: str, logger: logging.Logger) -> Tuple[List[RemoteIdentity], bool]:
322
- """
323
- Retrieves the permissions of a file in Google Drive and checks for public access.
324
-
325
- Args:
326
- file_id (str): The file to get permissions for.
327
- file_name (str): The name of the file to get permissions for.
328
- logger (logging.Logger): Logger for debugging and information.
329
-
330
- Returns:
331
- Tuple(List[RemoteFileIdentity], boolean): A list of RemoteFileIdentity objects containing permission details.
332
- """
333
- try:
334
- request = self.google_drive_service.permissions().list(
335
- fileId=file_id,
336
- fields="permissions, permissions/role, permissions/type, permissions/id, permissions/emailAddress",
337
- supportsAllDrives=True,
338
- )
339
- response = request.execute()
340
- permissions = response.get("permissions", [])
341
- is_public = False
342
-
343
- remote_identities = []
344
-
345
- for p in permissions:
346
- identity = self._to_remote_file_identity(p)
347
- if p.get("id") in PUBLIC_PERMISSION_IDS:
348
- is_public = True
349
- if identity is not None:
350
- remote_identities.append(identity)
351
-
352
- return remote_identities, is_public
353
- except Exception as e:
354
- raise ErrorFetchingMetadata(f"An error occurred while retrieving file permissions: {str(e)}")
355
-
356
- def _to_remote_file_identity(self, identity: dict[str, Any]) -> RemoteIdentity | None:
357
- if identity.get("id") in PUBLIC_PERMISSION_IDS:
358
- return None
359
- if identity.get("deleted") is True:
360
- return None
361
-
362
- return RemoteIdentity(
363
- modified_at=datetime.now(),
364
- id=uuid.uuid4(),
365
- remote_id=identity.get("emailAddress"),
366
- name=identity.get("name"),
367
- email_address=identity.get("emailAddress"),
368
- type=identity.get("type"),
369
- description=None,
370
- )
371
-
372
- def get_file_acl_permissions(self, file: GoogleDriveRemoteFile, logger: logging.Logger) -> Dict[str, Any]:
373
- remote_identities, is_public = self.get_file_permissions(file.id, file_name=file.uri, logger=logger)
374
- return RemotePermissions(
375
- id=file.id,
376
- file_path=file.uri,
377
- allowed_identity_remote_ids=[p.remote_id for p in remote_identities],
378
- publicly_accessible=is_public,
379
- ).dict(exclude_none=True)
380
-
381
- def _get_looping_google_api_list_response(
382
- self, service: Any, key: str, args: dict[str, Any], logger: logging.Logger
383
- ) -> Iterator[dict[str, Any]]:
384
- try:
385
- looping = True
386
- next_page_token: str | None = None
387
- while looping:
388
- rsp = service.list(pageToken=next_page_token, **args).execute()
389
- next_page_token = rsp.get("nextPageToken")
390
- items: list[dict[str, Any]] = rsp.get(key)
391
-
392
- if items is None or len(items) == 0:
393
- looping = False
394
- break
395
-
396
- if rsp.get("nextPageToken") is None:
397
- looping = False
398
- else:
399
- next_page_token = rsp.get("nextPageToken")
400
-
401
- for item in items:
402
- yield item
403
- except Exception as e:
404
- logger.error(f"There was an error listing {key} with {args}: {str(e)}")
405
- raise e
406
-
407
- def load_identity_groups(self, logger: logging.Logger) -> Dict[str, Any]:
408
- domain = self.config.delivery_method.domain
409
- if not domain:
410
- logger.info("No domain provided. Trying to fetch identities from the user workspace.")
411
- api_args = {"customer": "my_customer"}
412
- else:
413
- api_args = {"domain": domain}
414
-
415
- users_api = self.google_directory_service.users()
416
- groups_api = self.google_directory_service.groups()
417
- members_api = self.google_directory_service.members()
418
-
419
- for user in self._get_looping_google_api_list_response(users_api, "users", args=api_args, logger=logger):
420
- rfp = RemoteIdentity(
421
- id=uuid.uuid4(),
422
- remote_id=user["primaryEmail"],
423
- name=user["name"]["fullName"] if user["name"] is not None else None,
424
- email_address=user["primaryEmail"],
425
- member_email_addresses=[x["address"] for x in user["emails"]],
426
- type=RemoteIdentityType.USER,
427
- modified_at=datetime_now(),
428
- )
429
- yield rfp.dict()
430
-
431
- for group in self._get_looping_google_api_list_response(groups_api, "groups", args=api_args, logger=logger):
432
- rfp = RemoteIdentity(
433
- id=uuid.uuid4(),
434
- remote_id=group["email"],
435
- name=group["name"],
436
- email_address=group["email"],
437
- type=RemoteIdentityType.GROUP,
438
- modified_at=datetime_now(),
439
- )
440
-
441
- for member in self._get_looping_google_api_list_response(members_api, "members", {"groupKey": group["id"]}, logger):
442
- rfp.member_email_addresses = rfp.member_email_addresses or []
443
- rfp.member_email_addresses.append(member["email"])
444
-
445
- yield rfp.dict()
446
-
447
- @property
448
- def file_permissions_schema(self) -> Dict[str, Any]:
449
- return ResourceSchemaLoader(package_name_from_class(self.__class__)).get_schema("file_permissions")
450
-
451
- @property
452
- def identities_schema(self) -> Dict[str, Any]:
453
- return ResourceSchemaLoader(package_name_from_class(self.__class__)).get_schema("identities")