airbyte-source-google-drive 0.2.0.dev202502031727__py3-none-any.whl → 0.2.0.dev202502052009__py3-none-any.whl

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Potentially problematic release.


This version of airbyte-source-google-drive might be problematic. Click here for more details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: airbyte-source-google-drive
3
- Version: 0.2.0.dev202502031727
3
+ Version: 0.2.0.dev202502052009
4
4
  Summary: Source implementation for Google Drive.
5
5
  License: ELv2
6
6
  Author: Airbyte
@@ -10,7 +10,7 @@ Classifier: License :: Other/Proprietary License
10
10
  Classifier: Programming Language :: Python :: 3
11
11
  Classifier: Programming Language :: Python :: 3.10
12
12
  Classifier: Programming Language :: Python :: 3.11
13
- Requires-Dist: airbyte-cdk[file-based] (==6.26.0.dev04106)
13
+ Requires-Dist: airbyte-cdk[file-based] (==6.26.0.dev04108)
14
14
  Requires-Dist: google-api-python-client (==2.104.0)
15
15
  Requires-Dist: google-api-python-client-stubs (==1.18.0)
16
16
  Requires-Dist: google-auth-httplib2 (==0.1.1)
@@ -2,10 +2,10 @@ source_google_drive/__init__.py,sha256=SAgPIoGpjdHTY7mvf__A99FiElBh3WvdKSHLstg-B
2
2
  source_google_drive/exceptions.py,sha256=6sBxbdZhj0CARkctfBaNgGHB_GGcFUkY4cnMUWyOruM,268
3
3
  source_google_drive/run.py,sha256=5AQd906FbSGaNZj0udrr-VKmiSS4nsor7F7noMjsdi0,677
4
4
  source_google_drive/source.py,sha256=OyKTjPg80KR8yZiYiC3YSi6E6egothtAn3YAHAS6Q6A,2839
5
- source_google_drive/spec.py,sha256=YPOmOInQfmwHeB6Ay2z78LNvHsggncwU1bchQOsAZSk,4268
6
- source_google_drive/stream_reader.py,sha256=h4p3aafE8U9BqYgRBGXuADEhm4NCGMqYl2ExnQkj6bU,20266
5
+ source_google_drive/spec.py,sha256=-WkA2zGuQtf3G7uK8uq9BnimUlQh0s3vsqROmIHOgzI,4718
6
+ source_google_drive/stream_reader.py,sha256=jKMibQGjpP-9raJ557TqbOg6iGVuPYv9vamStwXDOCQ,20176
7
7
  source_google_drive/utils.py,sha256=ewR-kBKLmtD-s7zqCfGECfzWYF43tpQdscAQIlUEkR8,1022
8
- airbyte_source_google_drive-0.2.0.dev202502031727.dist-info/METADATA,sha256=rMydjQTVW8_21SbChXDmp3i-LbEYiZi1OwOPvEKHh_k,5536
9
- airbyte_source_google_drive-0.2.0.dev202502031727.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
10
- airbyte_source_google_drive-0.2.0.dev202502031727.dist-info/entry_points.txt,sha256=YgpJf0nA5Mn0B7YC9VOFI847vz1jI6U4q7BeLUOXa54,67
11
- airbyte_source_google_drive-0.2.0.dev202502031727.dist-info/RECORD,,
8
+ airbyte_source_google_drive-0.2.0.dev202502052009.dist-info/METADATA,sha256=fiVV_jIGTrIB43tzlnCv3BAkJEbJqr8BYtB25-vIUBU,5536
9
+ airbyte_source_google_drive-0.2.0.dev202502052009.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
10
+ airbyte_source_google_drive-0.2.0.dev202502052009.dist-info/entry_points.txt,sha256=YgpJf0nA5Mn0B7YC9VOFI847vz1jI6U4q7BeLUOXa54,67
11
+ airbyte_source_google_drive-0.2.0.dev202502052009.dist-info/RECORD,,
@@ -1,7 +1,9 @@
1
1
  #
2
2
  # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
3
  #
4
-
4
+ import uuid
5
+ from datetime import datetime
6
+ from enum import Enum
5
7
  from typing import Any, Dict, Literal, Optional, Union
6
8
 
7
9
  import dpath.util
@@ -11,22 +13,40 @@ from airbyte_cdk import OneOfOptionConfig
11
13
  from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import (
12
14
  AbstractFileBasedSpec,
13
15
  DeliverRawFiles,
16
+ DeliverRecords,
14
17
  )
15
18
  from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import (
16
- DeliverRecords as DeliverRecordsBase,
19
+ DeliverPermissions as DeliverPermissionsBase,
17
20
  )
18
21
 
19
22
 
20
- class DeliverRecords(DeliverRecordsBase):
21
- # Overriding to make visible with airbyte_hidden=False
22
- sync_acl_permissions: bool = Field(
23
- title="Include ACL Permissions",
24
- description="Joins Document allowlists to each stream.",
25
- default=False,
26
- airbyte_hidden=False,
27
- order=0,
28
- )
29
- domain: Optional[str] = Field(title="Domain", description="The domain of the identities.", airbyte_hidden=False, order=1)
23
+ class RemoteIdentityType(Enum):
24
+ USER = "user"
25
+ GROUP = "group"
26
+
27
+
28
+ class RemoteIdentity(BaseModel):
29
+ id: uuid.UUID
30
+ remote_id: str
31
+ parent_id: str | None = None
32
+ name: str | None = None
33
+ description: str | None = None
34
+ email_address: str | None = None
35
+ member_email_addresses: list[str] | None = None
36
+ type: RemoteIdentityType
37
+ modified_at: datetime
38
+
39
+
40
+ class RemotePermissions(BaseModel):
41
+ id: str
42
+ file_path: str
43
+ allowed_identity_remote_ids: list[str] | None = None
44
+ denied_identity_remote_ids: list[str] | None = None
45
+ publicly_accessible: bool = False
46
+
47
+
48
+ class DeliverPermissions(DeliverPermissionsBase):
49
+ domain: Optional[str] = Field(title="Domain", description="The Google domain of the identities.", airbyte_hidden=False, order=1)
30
50
 
31
51
 
32
52
  class OAuthCredentials(BaseModel):
@@ -77,7 +97,7 @@ class SourceGoogleDriveSpec(AbstractFileBasedSpec, BaseModel):
77
97
  pattern_descriptor="https://drive.google.com/drive/folders/MY-FOLDER-ID",
78
98
  )
79
99
 
80
- delivery_method: DeliverRecords | DeliverRawFiles = Field(
100
+ delivery_method: DeliverRecords | DeliverRawFiles | DeliverPermissions = Field(
81
101
  title="Delivery Method",
82
102
  discriminator="delivery_type",
83
103
  type="object",
@@ -18,14 +18,13 @@ from googleapiclient.discovery import build
18
18
  from googleapiclient.http import MediaIoBaseDownload
19
19
 
20
20
  from airbyte_cdk import AirbyteTracedException, FailureType
21
- from airbyte_cdk.sources.file_based.config.permissions import RemoteFileIdentity, RemoteFileIdentityType, RemoteFilePermissions
22
21
  from airbyte_cdk.sources.file_based.exceptions import FileSizeLimitError
23
22
  from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
24
23
  from airbyte_cdk.sources.file_based.remote_file import RemoteFile
25
24
  from source_google_drive.utils import get_folder_id
26
25
 
27
26
  from .exceptions import ErrorDownloadingFile, ErrorFetchingMetadata
28
- from .spec import SourceGoogleDriveSpec
27
+ from .spec import RemoteIdentity, RemoteIdentityType, RemotePermissions, SourceGoogleDriveSpec
29
28
 
30
29
 
31
30
  FOLDER_MIME_TYPE = "application/vnd.google-apps.folder"
@@ -64,11 +63,8 @@ PUBLIC_PERMISSION_IDS = [
64
63
  "domainWithLink",
65
64
  ]
66
65
 
67
- PERMISSIONS_API_SCOPES = [
68
- "https://www.googleapis.com/auth/drive",
69
- "https://www.googleapis.com/auth/drive.readonly",
70
- "https://www.googleapis.com/auth/drive.metadata.readonly",
71
- "https://www.googleapis.com/auth/drive.file",
66
+
67
+ DRIVE_SERVICE_SCOPES = [
72
68
  "https://www.googleapis.com/auth/admin.directory.group.readonly",
73
69
  "https://www.googleapis.com/auth/admin.directory.group.member.readonly",
74
70
  "https://www.googleapis.com/auth/admin.directory.user.readonly",
@@ -92,6 +88,7 @@ class SourceGoogleDriveStreamReader(AbstractFileBasedStreamReader):
92
88
  def __init__(self):
93
89
  super().__init__()
94
90
  self._drive_service = None
91
+ self._directory_service = None
95
92
 
96
93
  @property
97
94
  def config(self) -> SourceGoogleDriveSpec:
@@ -111,32 +108,41 @@ class SourceGoogleDriveStreamReader(AbstractFileBasedStreamReader):
111
108
  assert isinstance(value, SourceGoogleDriveSpec)
112
109
  self._config = value
113
110
 
114
- @property
115
- def google_drive_service(self):
111
+ def _build_google_service(self, service_name: str, version: str, scopes: List[str] = None):
116
112
  if self.config is None:
117
113
  # We shouldn't hit this; config should always get set before attempting to
118
114
  # list or read files.
119
- raise ValueError("Source config is missing; cannot create the Google Drive client.")
115
+ raise ValueError(f"Source config is missing; cannot create the Google {service_name} client.")
120
116
  try:
121
- if self._drive_service is None:
122
- if self.config.credentials.auth_type == "Client":
123
- creds = credentials.Credentials.from_authorized_user_info(self.config.credentials.dict())
124
- else:
125
- scopes = PERMISSIONS_API_SCOPES if self.sync_acl_permissions() else None
126
- creds = service_account.Credentials.from_service_account_info(
127
- json.loads(self.config.credentials.service_account_info), scopes=scopes
128
- )
129
- self._drive_service = build("drive", "v3", credentials=creds)
117
+ if self.config.credentials.auth_type == "Client":
118
+ creds = credentials.Credentials.from_authorized_user_info(self.config.credentials.dict())
119
+ else:
120
+ creds = service_account.Credentials.from_service_account_info(
121
+ json.loads(self.config.credentials.service_account_info), scopes=scopes
122
+ )
123
+ google_service = build(service_name, version, credentials=creds)
130
124
  except Exception as e:
131
125
  raise AirbyteTracedException(
132
126
  internal_message=str(e),
133
- message="Could not authenticate with Google Drive. Please check your credentials.",
127
+ message=f"Could not authenticate with Google {service_name}. Please check your credentials.",
134
128
  failure_type=FailureType.config_error,
135
129
  exception=e,
136
130
  )
137
131
 
132
+ return google_service
133
+
134
+ @property
135
+ def google_drive_service(self):
136
+ if self._drive_service is None:
137
+ self._drive_service = self._build_google_service("drive", "v3")
138
138
  return self._drive_service
139
139
 
140
+ @property
141
+ def google_directory_service(self):
142
+ if self._directory_service is None:
143
+ self._directory_service = self._build_google_service("admin", "directory_v1", DRIVE_SERVICE_SCOPES)
144
+ return self._directory_service
145
+
140
146
  def get_matching_files(self, globs: List[str], prefix: Optional[str], logger: logging.Logger) -> Iterable[RemoteFile]:
141
147
  """
142
148
  Get all files matching the specified glob patterns.
@@ -310,7 +316,7 @@ class SourceGoogleDriveStreamReader(AbstractFileBasedStreamReader):
310
316
  except Exception as e:
311
317
  raise ErrorDownloadingFile(f"There was an error while trying to download the file {file.uri}: {str(e)}")
312
318
 
313
- def get_file_permissions(self, file_id: str, file_name: str, logger: logging.Logger) -> Tuple[List[RemoteFileIdentity], bool]:
319
+ def get_file_permissions(self, file_id: str, file_name: str, logger: logging.Logger) -> Tuple[List[RemoteIdentity], bool]:
314
320
  """
315
321
  Retrieves the permissions of a file in Google Drive and checks for public access.
316
322
 
@@ -341,18 +347,17 @@ class SourceGoogleDriveStreamReader(AbstractFileBasedStreamReader):
341
347
  if identity is not None:
342
348
  remote_identities.append(identity)
343
349
 
344
- logger.info(f"File {file_name} has {len(remote_identities)} valid permissions")
345
350
  return remote_identities, is_public
346
351
  except Exception as e:
347
352
  raise ErrorFetchingMetadata(f"An error occurred while retrieving file permissions: {str(e)}")
348
353
 
349
- def _to_remote_file_identity(self, identity: dict[str, Any]) -> RemoteFileIdentity | None:
354
+ def _to_remote_file_identity(self, identity: dict[str, Any]) -> RemoteIdentity | None:
350
355
  if identity.get("id") in PUBLIC_PERMISSION_IDS:
351
356
  return None
352
357
  if identity.get("deleted") is True:
353
358
  return None
354
359
 
355
- return RemoteFileIdentity(
360
+ return RemoteIdentity(
356
361
  modified_at=datetime.now(),
357
362
  id=uuid.uuid4(),
358
363
  remote_id=identity.get("emailAddress"),
@@ -364,7 +369,7 @@ class SourceGoogleDriveStreamReader(AbstractFileBasedStreamReader):
364
369
 
365
370
  def get_file_acl_permissions(self, file: GoogleDriveRemoteFile, logger: logging.Logger) -> Dict[str, Any]:
366
371
  remote_identities, is_public = self.get_file_permissions(file.id, file_name=file.uri, logger=logger)
367
- return RemoteFilePermissions(
372
+ return RemotePermissions(
368
373
  id=file.id,
369
374
  file_path=file.uri,
370
375
  allowed_identity_remote_ids=[p.remote_id for p in remote_identities],
@@ -400,34 +405,34 @@ class SourceGoogleDriveStreamReader(AbstractFileBasedStreamReader):
400
405
  def load_identity_groups(self, logger: logging.Logger) -> Dict[str, Any]:
401
406
  domain = self.config.delivery_method.domain
402
407
  if not domain:
403
- raise Exception("No domain was provided")
404
- if self.google_drive_service is None or self.google_drive_service._http.credentials is None:
405
- raise Exception("No auth found")
408
+ logger.info("No domain provided. Trying to fetch identities from the user workspace.")
409
+ api_args = {"customer": "my_customer"}
410
+ else:
411
+ api_args = {"domain": domain}
406
412
 
407
- directory_service = build("admin", "directory_v1", credentials=self.google_drive_service._http.credentials)
408
- users_api = directory_service.users()
409
- groups_api = directory_service.groups()
410
- members_api = directory_service.members()
413
+ users_api = self.google_directory_service.users()
414
+ groups_api = self.google_directory_service.groups()
415
+ members_api = self.google_directory_service.members()
411
416
 
412
- for user in self._get_looping_google_api_list_response(users_api, "users", {"domain": domain}, logger):
413
- rfp = RemoteFileIdentity(
417
+ for user in self._get_looping_google_api_list_response(users_api, "users", args=api_args, logger=logger):
418
+ rfp = RemoteIdentity(
414
419
  id=uuid.uuid4(),
415
420
  remote_id=user["primaryEmail"],
416
421
  name=user["name"]["fullName"] if user["name"] is not None else None,
417
422
  email_address=user["primaryEmail"],
418
423
  member_email_addresses=[x["address"] for x in user["emails"]],
419
- type=RemoteFileIdentityType.USER,
424
+ type=RemoteIdentityType.USER,
420
425
  modified_at=datetime_now(),
421
426
  )
422
427
  yield rfp.dict()
423
428
 
424
- for group in self._get_looping_google_api_list_response(groups_api, "groups", {"domain": domain}, logger):
425
- rfp = RemoteFileIdentity(
429
+ for group in self._get_looping_google_api_list_response(groups_api, "groups", args=api_args, logger=logger):
430
+ rfp = RemoteIdentity(
426
431
  id=uuid.uuid4(),
427
432
  remote_id=group["email"],
428
433
  name=group["name"],
429
434
  email_address=group["email"],
430
- type=RemoteFileIdentityType.GROUP,
435
+ type=RemoteIdentityType.GROUP,
431
436
  modified_at=datetime_now(),
432
437
  )
433
438