airbyte-source-google-drive 0.2.0.dev202501291717__py3-none-any.whl → 0.2.0.dev202502032236__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of airbyte-source-google-drive might be problematic. (The original page linked to more details here; the link target was not preserved in this text.)

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: airbyte-source-google-drive
3
- Version: 0.2.0.dev202501291717
3
+ Version: 0.2.0.dev202502032236
4
4
  Summary: Source implementation for Google Drive.
5
5
  License: ELv2
6
6
  Author: Airbyte
@@ -10,7 +10,7 @@ Classifier: License :: Other/Proprietary License
10
10
  Classifier: Programming Language :: Python :: 3
11
11
  Classifier: Programming Language :: Python :: 3.10
12
12
  Classifier: Programming Language :: Python :: 3.11
13
- Requires-Dist: airbyte-cdk[file-based] (==6.26.0.dev04106)
13
+ Requires-Dist: airbyte-cdk[file-based] (==6.26.0.dev04107)
14
14
  Requires-Dist: google-api-python-client (==2.104.0)
15
15
  Requires-Dist: google-api-python-client-stubs (==1.18.0)
16
16
  Requires-Dist: google-auth-httplib2 (==0.1.1)
@@ -0,0 +1,11 @@
1
+ source_google_drive/__init__.py,sha256=SAgPIoGpjdHTY7mvf__A99FiElBh3WvdKSHLstg-BUE,134
2
+ source_google_drive/exceptions.py,sha256=6sBxbdZhj0CARkctfBaNgGHB_GGcFUkY4cnMUWyOruM,268
3
+ source_google_drive/run.py,sha256=5AQd906FbSGaNZj0udrr-VKmiSS4nsor7F7noMjsdi0,677
4
+ source_google_drive/source.py,sha256=OyKTjPg80KR8yZiYiC3YSi6E6egothtAn3YAHAS6Q6A,2839
5
+ source_google_drive/spec.py,sha256=-WkA2zGuQtf3G7uK8uq9BnimUlQh0s3vsqROmIHOgzI,4718
6
+ source_google_drive/stream_reader.py,sha256=8F0jEKz1ElM2wqHxVR91RUJfCSDoLtMQUxbX5lFkEy0,20166
7
+ source_google_drive/utils.py,sha256=ewR-kBKLmtD-s7zqCfGECfzWYF43tpQdscAQIlUEkR8,1022
8
+ airbyte_source_google_drive-0.2.0.dev202502032236.dist-info/METADATA,sha256=Hfb2iDOlay912TAmyj3QZ9GqvyVstYZflM_J2lpf2Lk,5536
9
+ airbyte_source_google_drive-0.2.0.dev202502032236.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
10
+ airbyte_source_google_drive-0.2.0.dev202502032236.dist-info/entry_points.txt,sha256=YgpJf0nA5Mn0B7YC9VOFI847vz1jI6U4q7BeLUOXa54,67
11
+ airbyte_source_google_drive-0.2.0.dev202502032236.dist-info/RECORD,,
@@ -1,7 +1,9 @@
1
1
  #
2
2
  # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
3
  #
4
-
4
+ import uuid
5
+ from datetime import datetime
6
+ from enum import Enum
5
7
  from typing import Any, Dict, Literal, Optional, Union
6
8
 
7
9
  import dpath.util
@@ -11,22 +13,40 @@ from airbyte_cdk import OneOfOptionConfig
11
13
  from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import (
12
14
  AbstractFileBasedSpec,
13
15
  DeliverRawFiles,
16
+ DeliverRecords,
14
17
  )
15
18
  from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import (
16
- DeliverRecords as DeliverRecordsBase,
19
+ DeliverPermissions as DeliverPermissionsBase,
17
20
  )
18
21
 
19
22
 
20
- class DeliverRecords(DeliverRecordsBase):
21
- # Overriding to make visible with airbyte_hidden=False
22
- sync_acl_permissions: bool = Field(
23
- title="Include ACL Permissions",
24
- description="Joins Document allowlists to each stream.",
25
- default=False,
26
- airbyte_hidden=False,
27
- order=0,
28
- )
29
- domain: Optional[str] = Field(title="Domain", description="The domain of the identities.", airbyte_hidden=False, order=1)
23
+ class RemoteIdentityType(Enum):
24
+ USER = "user"
25
+ GROUP = "group"
26
+
27
+
28
+ class RemoteIdentity(BaseModel):
29
+ id: uuid.UUID
30
+ remote_id: str
31
+ parent_id: str | None = None
32
+ name: str | None = None
33
+ description: str | None = None
34
+ email_address: str | None = None
35
+ member_email_addresses: list[str] | None = None
36
+ type: RemoteIdentityType
37
+ modified_at: datetime
38
+
39
+
40
+ class RemotePermissions(BaseModel):
41
+ id: str
42
+ file_path: str
43
+ allowed_identity_remote_ids: list[str] | None = None
44
+ denied_identity_remote_ids: list[str] | None = None
45
+ publicly_accessible: bool = False
46
+
47
+
48
+ class DeliverPermissions(DeliverPermissionsBase):
49
+ domain: Optional[str] = Field(title="Domain", description="The Google domain of the identities.", airbyte_hidden=False, order=1)
30
50
 
31
51
 
32
52
  class OAuthCredentials(BaseModel):
@@ -77,7 +97,7 @@ class SourceGoogleDriveSpec(AbstractFileBasedSpec, BaseModel):
77
97
  pattern_descriptor="https://drive.google.com/drive/folders/MY-FOLDER-ID",
78
98
  )
79
99
 
80
- delivery_method: DeliverRecords | DeliverRawFiles = Field(
100
+ delivery_method: DeliverRecords | DeliverRawFiles | DeliverPermissions = Field(
81
101
  title="Delivery Method",
82
102
  discriminator="delivery_type",
83
103
  type="object",
@@ -18,14 +18,13 @@ from googleapiclient.discovery import build
18
18
  from googleapiclient.http import MediaIoBaseDownload
19
19
 
20
20
  from airbyte_cdk import AirbyteTracedException, FailureType
21
- from airbyte_cdk.sources.file_based.config.permissions import RemoteFileIdentity, RemoteFileIdentityType, RemoteFilePermissions
22
21
  from airbyte_cdk.sources.file_based.exceptions import FileSizeLimitError
23
22
  from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
24
23
  from airbyte_cdk.sources.file_based.remote_file import RemoteFile
25
24
  from source_google_drive.utils import get_folder_id
26
25
 
27
26
  from .exceptions import ErrorDownloadingFile, ErrorFetchingMetadata
28
- from .spec import SourceGoogleDriveSpec
27
+ from .spec import RemoteIdentity, RemoteIdentityType, RemotePermissions, SourceGoogleDriveSpec
29
28
 
30
29
 
31
30
  FOLDER_MIME_TYPE = "application/vnd.google-apps.folder"
@@ -122,7 +121,7 @@ class SourceGoogleDriveStreamReader(AbstractFileBasedStreamReader):
122
121
  if self.config.credentials.auth_type == "Client":
123
122
  creds = credentials.Credentials.from_authorized_user_info(self.config.credentials.dict())
124
123
  else:
125
- scopes = PERMISSIONS_API_SCOPES if self.sync_acl_permissions() else None
124
+ scopes = PERMISSIONS_API_SCOPES if self.include_identities_stream() else None
126
125
  creds = service_account.Credentials.from_service_account_info(
127
126
  json.loads(self.config.credentials.service_account_info), scopes=scopes
128
127
  )
@@ -137,73 +136,6 @@ class SourceGoogleDriveStreamReader(AbstractFileBasedStreamReader):
137
136
 
138
137
  return self._drive_service
139
138
 
140
- def _get_looping_google_api_list_response(
141
- self, service: Any, key: str, args: dict[str, Any], logger: logging.Logger
142
- ) -> Iterator[dict[str, Any]]:
143
- try:
144
- looping = True
145
- next_page_token: str | None = None
146
- while looping:
147
- rsp = service.list(pageToken=next_page_token, **args).execute()
148
- next_page_token = rsp.get("nextPageToken")
149
- items: list[dict[str, Any]] = rsp.get(key)
150
-
151
- if items is None or len(items) == 0:
152
- looping = False
153
- break
154
-
155
- if rsp.get("nextPageToken") is None:
156
- looping = False
157
- else:
158
- next_page_token = rsp.get("nextPageToken")
159
-
160
- for item in items:
161
- yield item
162
- except Exception as e:
163
- logger.error(f"There was an error listing {key} with {args}: {str(e)}")
164
- raise e
165
-
166
- def load_identity_groups(self, logger: logging.Logger) -> Dict[str, Any]:
167
- domain = self.config.delivery_method.domain
168
- if not domain:
169
- raise Exception("No domain was provided")
170
- if self.google_drive_service is None or self.google_drive_service._http.credentials is None:
171
- raise Exception("No auth found")
172
-
173
- directory_service = build("admin", "directory_v1", credentials=self.google_drive_service._http.credentials)
174
- users_api = directory_service.users()
175
- groups_api = directory_service.groups()
176
- members_api = directory_service.members()
177
-
178
- # here is failing
179
- for user in self._get_looping_google_api_list_response(users_api, "users", {"domain": domain}, logger):
180
- rfp = RemoteFileIdentity(
181
- id=uuid.uuid4(),
182
- remote_id=user["primaryEmail"],
183
- name=user["name"]["fullName"] if user["name"] is not None else None,
184
- email_address=user["primaryEmail"],
185
- member_email_addresses=[x["address"] for x in user["emails"]],
186
- type=RemoteFileIdentityType.USER,
187
- modified_at=datetime_now(),
188
- )
189
- yield rfp.dict()
190
-
191
- for group in self._get_looping_google_api_list_response(groups_api, "groups", {"domain": domain}, logger):
192
- rfp = RemoteFileIdentity(
193
- id=uuid.uuid4(),
194
- remote_id=group["email"],
195
- name=group["name"],
196
- email_address=group["email"],
197
- type=RemoteFileIdentityType.GROUP,
198
- modified_at=datetime_now(),
199
- )
200
-
201
- for member in self._get_looping_google_api_list_response(members_api, "members", {"groupKey": group["id"]}, logger):
202
- rfp.member_email_addresses = rfp.member_email_addresses or []
203
- rfp.member_email_addresses.append(member["email"])
204
-
205
- yield rfp.dict()
206
-
207
139
  def get_matching_files(self, globs: List[str], prefix: Optional[str], logger: logging.Logger) -> Iterable[RemoteFile]:
208
140
  """
209
141
  Get all files matching the specified glob patterns.
@@ -377,7 +309,7 @@ class SourceGoogleDriveStreamReader(AbstractFileBasedStreamReader):
377
309
  except Exception as e:
378
310
  raise ErrorDownloadingFile(f"There was an error while trying to download the file {file.uri}: {str(e)}")
379
311
 
380
- def get_file_permissions(self, file_id: str, file_name: str, logger: logging.Logger) -> Tuple[List[RemoteFileIdentity], bool]:
312
+ def get_file_permissions(self, file_id: str, file_name: str, logger: logging.Logger) -> Tuple[List[RemoteIdentity], bool]:
381
313
  """
382
314
  Retrieves the permissions of a file in Google Drive and checks for public access.
383
315
 
@@ -413,13 +345,13 @@ class SourceGoogleDriveStreamReader(AbstractFileBasedStreamReader):
413
345
  except Exception as e:
414
346
  raise ErrorFetchingMetadata(f"An error occurred while retrieving file permissions: {str(e)}")
415
347
 
416
- def _to_remote_file_identity(self, identity: dict[str, Any]) -> RemoteFileIdentity | None:
348
+ def _to_remote_file_identity(self, identity: dict[str, Any]) -> RemoteIdentity | None:
417
349
  if identity.get("id") in PUBLIC_PERMISSION_IDS:
418
350
  return None
419
351
  if identity.get("deleted") is True:
420
352
  return None
421
353
 
422
- return RemoteFileIdentity(
354
+ return RemoteIdentity(
423
355
  modified_at=datetime.now(),
424
356
  id=uuid.uuid4(),
425
357
  remote_id=identity.get("emailAddress"),
@@ -431,9 +363,75 @@ class SourceGoogleDriveStreamReader(AbstractFileBasedStreamReader):
431
363
 
432
364
  def get_file_acl_permissions(self, file: GoogleDriveRemoteFile, logger: logging.Logger) -> Dict[str, Any]:
433
365
  remote_identities, is_public = self.get_file_permissions(file.id, file_name=file.uri, logger=logger)
434
- return RemoteFilePermissions(
366
+ return RemotePermissions(
435
367
  id=file.id,
436
368
  file_path=file.uri,
437
369
  allowed_identity_remote_ids=[p.remote_id for p in remote_identities],
438
370
  publicly_accessible=is_public,
439
371
  ).dict(exclude_none=True)
372
+
373
+ def _get_looping_google_api_list_response(
374
+ self, service: Any, key: str, args: dict[str, Any], logger: logging.Logger
375
+ ) -> Iterator[dict[str, Any]]:
376
+ try:
377
+ looping = True
378
+ next_page_token: str | None = None
379
+ while looping:
380
+ rsp = service.list(pageToken=next_page_token, **args).execute()
381
+ next_page_token = rsp.get("nextPageToken")
382
+ items: list[dict[str, Any]] = rsp.get(key)
383
+
384
+ if items is None or len(items) == 0:
385
+ looping = False
386
+ break
387
+
388
+ if rsp.get("nextPageToken") is None:
389
+ looping = False
390
+ else:
391
+ next_page_token = rsp.get("nextPageToken")
392
+
393
+ for item in items:
394
+ yield item
395
+ except Exception as e:
396
+ logger.error(f"There was an error listing {key} with {args}: {str(e)}")
397
+ raise e
398
+
399
+ def load_identity_groups(self, logger: logging.Logger) -> Dict[str, Any]:
400
+ domain = self.config.delivery_method.domain
401
+ if not domain:
402
+ raise Exception("No domain was provided")
403
+ if self.google_drive_service is None or self.google_drive_service._http.credentials is None:
404
+ raise Exception("No auth found")
405
+
406
+ directory_service = build("admin", "directory_v1", credentials=self.google_drive_service._http.credentials)
407
+ users_api = directory_service.users()
408
+ groups_api = directory_service.groups()
409
+ members_api = directory_service.members()
410
+
411
+ for user in self._get_looping_google_api_list_response(users_api, "users", {"domain": domain}, logger):
412
+ rfp = RemoteIdentity(
413
+ id=uuid.uuid4(),
414
+ remote_id=user["primaryEmail"],
415
+ name=user["name"]["fullName"] if user["name"] is not None else None,
416
+ email_address=user["primaryEmail"],
417
+ member_email_addresses=[x["address"] for x in user["emails"]],
418
+ type=RemoteIdentityType.USER,
419
+ modified_at=datetime_now(),
420
+ )
421
+ yield rfp.dict()
422
+
423
+ for group in self._get_looping_google_api_list_response(groups_api, "groups", {"domain": domain}, logger):
424
+ rfp = RemoteIdentity(
425
+ id=uuid.uuid4(),
426
+ remote_id=group["email"],
427
+ name=group["name"],
428
+ email_address=group["email"],
429
+ type=RemoteIdentityType.GROUP,
430
+ modified_at=datetime_now(),
431
+ )
432
+
433
+ for member in self._get_looping_google_api_list_response(members_api, "members", {"groupKey": group["id"]}, logger):
434
+ rfp.member_email_addresses = rfp.member_email_addresses or []
435
+ rfp.member_email_addresses.append(member["email"])
436
+
437
+ yield rfp.dict()
@@ -15,6 +15,8 @@ def get_folder_id(url_string: str) -> str:
15
15
  if parsed_url.scheme != "https" or parsed_url.netloc != "drive.google.com":
16
16
  raise ValueError("Folder URL has to be of the form https://drive.google.com/drive/folders/<folder_id>")
17
17
  path_segments = list(filter(None, parsed_url.path.split("/")))
18
+ if path_segments[-1] in ["my-drive", "home"]:
19
+ return "root"
18
20
  if path_segments[-2] != "folders" or len(path_segments) < 3:
19
21
  raise ValueError("Folder URL has to be of the form https://drive.google.com/drive/folders/<folder_id>")
20
22
  return path_segments[-1]
@@ -1,11 +0,0 @@
1
- source_google_drive/__init__.py,sha256=SAgPIoGpjdHTY7mvf__A99FiElBh3WvdKSHLstg-BUE,134
2
- source_google_drive/exceptions.py,sha256=6sBxbdZhj0CARkctfBaNgGHB_GGcFUkY4cnMUWyOruM,268
3
- source_google_drive/run.py,sha256=5AQd906FbSGaNZj0udrr-VKmiSS4nsor7F7noMjsdi0,677
4
- source_google_drive/source.py,sha256=OyKTjPg80KR8yZiYiC3YSi6E6egothtAn3YAHAS6Q6A,2839
5
- source_google_drive/spec.py,sha256=YPOmOInQfmwHeB6Ay2z78LNvHsggncwU1bchQOsAZSk,4268
6
- source_google_drive/stream_reader.py,sha256=DClCknbFqAq1SZQQBJnWVpbIoJkfu7ZeLzLh0E0aE5U,20292
7
- source_google_drive/utils.py,sha256=1Fe3J4KXI1iIf4rklgLoR4p1xkCzLQI7zj5dkyi4Vt0,942
8
- airbyte_source_google_drive-0.2.0.dev202501291717.dist-info/METADATA,sha256=gXbq2ilBqoJCjwtdwqmSgFVL3RxFMlxgwUxODMZT4Yg,5536
9
- airbyte_source_google_drive-0.2.0.dev202501291717.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
10
- airbyte_source_google_drive-0.2.0.dev202501291717.dist-info/entry_points.txt,sha256=YgpJf0nA5Mn0B7YC9VOFI847vz1jI6U4q7BeLUOXa54,67
11
- airbyte_source_google_drive-0.2.0.dev202501291717.dist-info/RECORD,,