airbyte-source-google-drive 0.1.2__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of airbyte-source-google-drive might be problematic. Click here for more details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: airbyte-source-google-drive
3
- Version: 0.1.2
3
+ Version: 0.2.0
4
4
  Summary: Source implementation for Google Drive.
5
5
  License: ELv2
6
6
  Author: Airbyte
@@ -10,7 +10,7 @@ Classifier: License :: Other/Proprietary License
10
10
  Classifier: Programming Language :: Python :: 3
11
11
  Classifier: Programming Language :: Python :: 3.10
12
12
  Classifier: Programming Language :: Python :: 3.11
13
- Requires-Dist: airbyte-cdk[file-based] (>=6.18.2,<7.0.0)
13
+ Requires-Dist: airbyte-cdk[file-based] (>=6.33.6,<7.0.0)
14
14
  Requires-Dist: google-api-python-client (==2.104.0)
15
15
  Requires-Dist: google-api-python-client-stubs (==1.18.0)
16
16
  Requires-Dist: google-auth-httplib2 (==0.1.1)
@@ -0,0 +1,13 @@
1
+ source_google_drive/__init__.py,sha256=SAgPIoGpjdHTY7mvf__A99FiElBh3WvdKSHLstg-BUE,134
2
+ source_google_drive/exceptions.py,sha256=6sBxbdZhj0CARkctfBaNgGHB_GGcFUkY4cnMUWyOruM,268
3
+ source_google_drive/run.py,sha256=5AQd906FbSGaNZj0udrr-VKmiSS4nsor7F7noMjsdi0,677
4
+ source_google_drive/schemas/file_permissions.json,sha256=IPhh3UIKwgtfNLNH9sndu4pMZt8zFXaBSUIWAfIRGQc,269
5
+ source_google_drive/schemas/identities.json,sha256=JXBR_v0wpfDKiWVzLoc8bPs33x5CGm91fEdJWz8RBxc,449
6
+ source_google_drive/source.py,sha256=tOljkgeg8RIxmhrpfa-qgGhI6WPhiL_-I8itTa6JH04,3959
7
+ source_google_drive/spec.py,sha256=-WkA2zGuQtf3G7uK8uq9BnimUlQh0s3vsqROmIHOgzI,4718
8
+ source_google_drive/stream_reader.py,sha256=W9I0tE-qUXkxEFeblwqrhtyKDV8T_EGpf5a3Q1ZolAE,20683
9
+ source_google_drive/utils.py,sha256=ewR-kBKLmtD-s7zqCfGECfzWYF43tpQdscAQIlUEkR8,1022
10
+ airbyte_source_google_drive-0.2.0.dist-info/METADATA,sha256=c0ryPatxPFjpPolIHZCnPSV9dFp97r3GZU5npXlNOEs,5518
11
+ airbyte_source_google_drive-0.2.0.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
12
+ airbyte_source_google_drive-0.2.0.dist-info/entry_points.txt,sha256=YgpJf0nA5Mn0B7YC9VOFI847vz1jI6U4q7BeLUOXa54,67
13
+ airbyte_source_google_drive-0.2.0.dist-info/RECORD,,
@@ -0,0 +1,12 @@
1
+ {
2
+ "type": "object",
3
+ "properties": {
4
+ "id": { "type": "string" },
5
+ "file_path": { "type": "string" },
6
+ "allowed_identity_remote_ids": {
7
+ "type": "array",
8
+ "items": { "type": "string" }
9
+ },
10
+ "publicly_accessible": { "type": "boolean" }
11
+ }
12
+ }
@@ -0,0 +1,14 @@
1
+ {
2
+ "type": "object",
3
+ "properties": {
4
+ "id": { "type": "string" },
5
+ "remote_id": { "type": "string" },
6
+ "parent_id": { "type": ["null", "string"] },
7
+ "name": { "type": ["null", "string"] },
8
+ "description": { "type": ["null", "string"] },
9
+ "email_address": { "type": ["null", "string"] },
10
+ "member_email_addresses": { "type": ["null", "array"] },
11
+ "type": { "type": "string" },
12
+ "modified_at": { "type": "string" }
13
+ }
14
+ }
@@ -6,7 +6,7 @@
6
6
  from typing import Any, Mapping, Optional
7
7
 
8
8
  from airbyte_cdk import AdvancedAuth, ConfiguredAirbyteCatalog, ConnectorSpecification, OAuthConfigSpecification, TState
9
- from airbyte_cdk.models import AuthFlowType
9
+ from airbyte_cdk.models import AuthFlowType, OauthConnectorInputSpecification
10
10
  from airbyte_cdk.sources.file_based.file_based_source import FileBasedSource
11
11
  from airbyte_cdk.sources.file_based.stream.cursor.default_file_based_cursor import DefaultFileBasedCursor
12
12
  from source_google_drive.spec import SourceGoogleDriveSpec
@@ -28,6 +28,11 @@ class SourceGoogleDrive(FileBasedSource):
28
28
  """
29
29
  Returns the specification describing what fields can be configured by a user when setting up a file-based source.
30
30
  """
31
+ oauth_connector_input_specification = OauthConnectorInputSpecification(
32
+ consent_url="https://accounts.google.com/o/oauth2/v2/auth?{{client_id_param}}&{{redirect_uri_param}}&response_type=code&{{scope_param}}&access_type=offline&{{state_param}}&include_granted_scopes=true&prompt=consent",
33
+ access_token_url="https://oauth2.googleapis.com/token?{{client_id_param}}&{{client_secret_param}}&{{auth_code_param}}&{{redirect_uri_param}}&grant_type=authorization_code",
34
+ scope="https://www.googleapis.com/auth/drive.readonly https://www.googleapis.com/auth/admin.directory.group.readonly https://www.googleapis.com/auth/admin.directory.group.member.readonly https://www.googleapis.com/auth/admin.directory.user.readonly",
35
+ )
31
36
 
32
37
  return ConnectorSpecification(
33
38
  documentationUrl=self.spec_class.documentation_url(),
@@ -37,10 +42,17 @@ class SourceGoogleDrive(FileBasedSource):
37
42
  predicate_key=["credentials", "auth_type"],
38
43
  predicate_value="Client",
39
44
  oauth_config_specification=OAuthConfigSpecification(
45
+ oauth_connector_input_specification=oauth_connector_input_specification,
40
46
  complete_oauth_output_specification={
41
47
  "type": "object",
42
48
  "additionalProperties": False,
43
- "properties": {"refresh_token": {"type": "string", "path_in_connector_config": ["credentials", "refresh_token"]}},
49
+ "properties": {
50
+ "refresh_token": {
51
+ "type": "string",
52
+ "path_in_connector_config": ["credentials", "refresh_token"],
53
+ "path_in_oauth_response": ["refresh_token"],
54
+ }
55
+ },
44
56
  },
45
57
  complete_oauth_server_input_specification={
46
58
  "type": "object",
@@ -1,15 +1,52 @@
1
1
  #
2
2
  # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
3
  #
4
-
5
-
6
- from typing import Any, Dict, Literal, Union
4
+ import uuid
5
+ from datetime import datetime
6
+ from enum import Enum
7
+ from typing import Any, Dict, Literal, Optional, Union
7
8
 
8
9
  import dpath.util
9
10
  from pydantic.v1 import BaseModel, Field
10
11
 
11
12
  from airbyte_cdk import OneOfOptionConfig
12
- from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import AbstractFileBasedSpec, DeliverRawFiles, DeliverRecords
13
+ from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import (
14
+ AbstractFileBasedSpec,
15
+ DeliverRawFiles,
16
+ DeliverRecords,
17
+ )
18
+ from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import (
19
+ DeliverPermissions as DeliverPermissionsBase,
20
+ )
21
+
22
+
23
# Kinds of identity a RemoteIdentity record can describe.
# (Documented with comments rather than a docstring so the class __doc__ stays untouched.)
class RemoteIdentityType(Enum):
    USER = "user"  # a single account
    GROUP = "group"  # a group; its members go in RemoteIdentity.member_email_addresses
26
+
27
+
28
# One identity (user or group) known to the remote system.
# Field order is significant: it is the key order of the dict produced by .dict().
class RemoteIdentity(BaseModel):
    # Locally generated identifier for this record.
    id: uuid.UUID
    # Identifier of the identity in the remote system.
    remote_id: str
    parent_id: Optional[str] = None
    name: Optional[str] = None
    description: Optional[str] = None
    email_address: Optional[str] = None
    # Additional addresses attached to the identity (for groups: the member addresses).
    member_email_addresses: Optional[list[str]] = None
    type: RemoteIdentityType
    modified_at: datetime
38
+
39
+
40
# Access-control summary for a single remote file.
class RemotePermissions(BaseModel):
    # Identifier of the file the permissions belong to.
    id: str
    file_path: str
    # remote_id values of the identities that may / may not access the file.
    allowed_identity_remote_ids: Optional[list[str]] = None
    denied_identity_remote_ids: Optional[list[str]] = None
    # Whether the file is reachable without a named identity.
    publicly_accessible: bool = False
46
+
47
+
48
# Google Drive flavour of the CDK "deliver permissions" delivery method; adds an optional domain.
# (No docstring on purpose: a model docstring would surface in the generated spec schema.)
class DeliverPermissions(DeliverPermissionsBase):
    domain: Optional[str] = Field(
        title="Domain",
        description="The Google domain of the identities.",
        airbyte_hidden=False,
        order=1,
    )
13
50
 
14
51
 
15
52
  class OAuthCredentials(BaseModel):
@@ -60,7 +97,7 @@ class SourceGoogleDriveSpec(AbstractFileBasedSpec, BaseModel):
60
97
  pattern_descriptor="https://drive.google.com/drive/folders/MY-FOLDER-ID",
61
98
  )
62
99
 
63
- delivery_method: DeliverRecords | DeliverRawFiles = Field(
100
+ delivery_method: DeliverRecords | DeliverRawFiles | DeliverPermissions = Field(
64
101
  title="Delivery Method",
65
102
  discriminator="delivery_type",
66
103
  type="object",
@@ -6,11 +6,13 @@
6
6
  import io
7
7
  import json
8
8
  import logging
9
+ import uuid
9
10
  from datetime import datetime
10
11
  from io import IOBase
11
12
  from os.path import getsize
12
- from typing import Dict, Iterable, List, Optional, Set
13
+ from typing import Any, Dict, Iterable, Iterator, List, Optional, Set, Tuple
13
14
 
15
+ import pytz
14
16
  from google.oauth2 import credentials, service_account
15
17
  from googleapiclient.discovery import build
16
18
  from googleapiclient.http import MediaIoBaseDownload
@@ -19,10 +21,12 @@ from airbyte_cdk import AirbyteTracedException, FailureType
19
21
  from airbyte_cdk.sources.file_based.exceptions import FileSizeLimitError
20
22
  from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
21
23
  from airbyte_cdk.sources.file_based.remote_file import RemoteFile
24
+ from airbyte_cdk.sources.streams.core import package_name_from_class
25
+ from airbyte_cdk.sources.utils.schema_helpers import InternalConfig, ResourceSchemaLoader
22
26
  from source_google_drive.utils import get_folder_id
23
27
 
24
28
  from .exceptions import ErrorDownloadingFile, ErrorFetchingMetadata
25
- from .spec import SourceGoogleDriveSpec
29
+ from .spec import RemoteIdentity, RemoteIdentityType, RemotePermissions, SourceGoogleDriveSpec
26
30
 
27
31
 
28
32
  FOLDER_MIME_TYPE = "application/vnd.google-apps.folder"
@@ -54,6 +58,24 @@ DOWNLOADABLE_DOCUMENTS_MIME_TYPES = {
54
58
  GOOGLE_DRAWING_MIME_TYPE: {EXPORT_MEDIA_MIME_TYPE_KEY: EXPORT_MEDIA_MIME_TYPE_PDF, DOCUMENT_FILE_EXTENSION_KEY: ".pdf"},
55
59
  }
56
60
 
61
# Permission ids that mark a file as publicly accessible; permissions carrying one of
# these ids are never turned into a RemoteIdentity.
PUBLIC_PERMISSION_IDS = ["anyoneWithLink", "anyoneCanFind", "domainCanFind", "domainWithLink"]


# Scopes handed to the service-account credentials when the directory ("admin") client is built.
DRIVE_SERVICE_SCOPES = [
    "https://www.googleapis.com/auth/admin.directory.group.readonly",
    "https://www.googleapis.com/auth/admin.directory.group.member.readonly",
    "https://www.googleapis.com/auth/admin.directory.user.readonly",
]
74
+
75
+
76
def datetime_now() -> datetime:
    """Return the current moment as a timezone-aware datetime in UTC."""
    utc_zone = pytz.UTC
    return datetime.now(tz=utc_zone)
78
+
57
79
 
58
80
  class GoogleDriveRemoteFile(RemoteFile):
59
81
  id: str
@@ -68,6 +90,7 @@ class SourceGoogleDriveStreamReader(AbstractFileBasedStreamReader):
68
90
  def __init__(self):
69
91
  super().__init__()
70
92
  self._drive_service = None
93
+ self._directory_service = None
71
94
 
72
95
  @property
73
96
  def config(self) -> SourceGoogleDriveSpec:
@@ -87,29 +110,41 @@ class SourceGoogleDriveStreamReader(AbstractFileBasedStreamReader):
87
110
  assert isinstance(value, SourceGoogleDriveSpec)
88
111
  self._config = value
89
112
 
90
- @property
91
- def google_drive_service(self):
113
def _build_google_service(self, service_name: str, version: str, scopes: Optional[List[str]] = None):
    """
    Build a Google API client for the given service using the configured credentials.

    Args:
        service_name (str): Name of the Google API to build a client for (e.g. "drive").
        version (str): Version of that API (e.g. "v3").
        scopes (Optional[List[str]]): Scopes for service-account credentials. They are only
            applied on the service-account branch; OAuth ("Client") credentials ignore them.

    Returns:
        The client object returned by googleapiclient's ``build``.

    Raises:
        ValueError: If the source config has not been set yet.
        AirbyteTracedException: (config error) if the credentials cannot be loaded or the
            client cannot be built.
    """
    if self.config is None:
        # We shouldn't hit this; config should always get set before attempting to
        # list or read files.
        raise ValueError(f"Source config is missing; cannot create the Google {service_name} client.")

    try:
        if self.config.credentials.auth_type == "Client":
            creds = credentials.Credentials.from_authorized_user_info(self.config.credentials.dict())
        else:
            creds = service_account.Credentials.from_service_account_info(
                json.loads(self.config.credentials.service_account_info), scopes=scopes
            )
        return build(service_name, version, credentials=creds)
    except Exception as e:
        # Chain the original error so the root cause stays visible in the traceback.
        raise AirbyteTracedException(
            internal_message=str(e),
            message=f"Could not authenticate with Google {service_name}. Please check your credentials.",
            failure_type=FailureType.config_error,
            exception=e,
        ) from e
135
+
136
@property
def google_drive_service(self):
    """Return the Drive v3 client, building it the first time it is requested."""
    service = self._drive_service
    if service is None:
        service = self._build_google_service("drive", "v3")
        self._drive_service = service
    return service
112
141
 
142
@property
def google_directory_service(self):
    """Return the Admin directory_v1 client, building it the first time it is requested."""
    service = self._directory_service
    if service is None:
        service = self._build_google_service("admin", "directory_v1", DRIVE_SERVICE_SCOPES)
        self._directory_service = service
    return service
147
+
113
148
  def get_matching_files(self, globs: List[str], prefix: Optional[str], logger: logging.Logger) -> Iterable[RemoteFile]:
114
149
  """
115
150
  Get all files matching the specified glob patterns.
@@ -282,3 +317,137 @@ class SourceGoogleDriveStreamReader(AbstractFileBasedStreamReader):
282
317
 
283
318
  except Exception as e:
284
319
  raise ErrorDownloadingFile(f"There was an error while trying to download the file {file.uri}: {str(e)}")
320
+
321
def get_file_permissions(self, file_id: str, file_name: str, logger: logging.Logger) -> Tuple[List[RemoteIdentity], bool]:
    """
    Retrieves the permissions of a file in Google Drive and checks for public access.

    All result pages are read, so files whose permission list spans several pages
    are reported completely.

    Args:
        file_id (str): The file to get permissions for.
        file_name (str): The name of the file to get permissions for (currently unused).
        logger (logging.Logger): Logger for debugging and information.

    Returns:
        Tuple[List[RemoteIdentity], bool]: The identities that hold a permission on the file,
        and whether any permission id is one of PUBLIC_PERMISSION_IDS.

    Raises:
        ErrorFetchingMetadata: If listing or converting the permissions fails.
    """
    list_args = {
        "fileId": file_id,
        # nextPageToken has to be requested explicitly: with a fields mask the API returns
        # only the listed fields, and without the token pagination could never continue.
        "fields": "nextPageToken, permissions, permissions/role, permissions/type, permissions/id, permissions/emailAddress",
        "supportsAllDrives": True,
    }
    try:
        is_public = False
        remote_identities: List[RemoteIdentity] = []

        permissions = self._get_looping_google_api_list_response(
            self.google_drive_service.permissions(), "permissions", list_args, logger
        )
        for permission in permissions:
            identity = self._to_remote_file_identity(permission)
            if permission.get("id") in PUBLIC_PERMISSION_IDS:
                is_public = True
            if identity is not None:
                remote_identities.append(identity)

        return remote_identities, is_public
    except Exception as e:
        raise ErrorFetchingMetadata(f"An error occurred while retrieving file permissions: {str(e)}") from e
355
+
356
def _to_remote_file_identity(self, identity: dict[str, Any]) -> RemoteIdentity | None:
    """
    Convert one Drive permission entry into a RemoteIdentity.

    Args:
        identity (dict[str, Any]): A single item of the permissions list response.

    Returns:
        RemoteIdentity | None: The converted identity, or None when the permission id is one
        of PUBLIC_PERMISSION_IDS or the entry is flagged as deleted.
    """
    if identity.get("id") in PUBLIC_PERMISSION_IDS:
        return None
    if identity.get("deleted") is True:
        return None

    email_address = identity.get("emailAddress")
    # NOTE(review): "name" and "deleted" are not in the fields mask requested by
    # get_file_permissions, so they are presumably always absent here - confirm.
    return RemoteIdentity(
        # Timezone-aware UTC, consistent with the identities built in load_identity_groups.
        modified_at=datetime_now(),
        id=uuid.uuid4(),
        remote_id=email_address,
        name=identity.get("name"),
        email_address=email_address,
        type=identity.get("type"),
        description=None,
    )
371
+
372
def get_file_acl_permissions(self, file: GoogleDriveRemoteFile, logger: logging.Logger) -> Dict[str, Any]:
    """
    Build the permissions record for a file.

    Args:
        file (GoogleDriveRemoteFile): The file whose permissions are looked up.
        logger (logging.Logger): Logger forwarded to the permissions lookup.

    Returns:
        Dict[str, Any]: The RemotePermissions fields as a dict, with None-valued fields left out.
    """
    identities, publicly_accessible = self.get_file_permissions(file.id, file_name=file.uri, logger=logger)

    allowed_remote_ids = []
    for identity in identities:
        allowed_remote_ids.append(identity.remote_id)

    acl = RemotePermissions(
        id=file.id,
        file_path=file.uri,
        allowed_identity_remote_ids=allowed_remote_ids,
        publicly_accessible=publicly_accessible,
    )
    return acl.dict(exclude_none=True)
380
+
381
+ def _get_looping_google_api_list_response(
382
+ self, service: Any, key: str, args: dict[str, Any], logger: logging.Logger
383
+ ) -> Iterator[dict[str, Any]]:
384
+ try:
385
+ looping = True
386
+ next_page_token: str | None = None
387
+ while looping:
388
+ rsp = service.list(pageToken=next_page_token, **args).execute()
389
+ next_page_token = rsp.get("nextPageToken")
390
+ items: list[dict[str, Any]] = rsp.get(key)
391
+
392
+ if items is None or len(items) == 0:
393
+ looping = False
394
+ break
395
+
396
+ if rsp.get("nextPageToken") is None:
397
+ looping = False
398
+ else:
399
+ next_page_token = rsp.get("nextPageToken")
400
+
401
+ for item in items:
402
+ yield item
403
+ except Exception as e:
404
+ logger.error(f"There was an error listing {key} with {args}: {str(e)}")
405
+ raise e
406
+
407
def load_identity_groups(self, logger: logging.Logger) -> Iterable[Dict[str, Any]]:
    """
    Yield every user and group of the workspace as a RemoteIdentity dict.

    Users are listed first, then groups; each group carries the email addresses of its
    members. When no domain is configured on the delivery method, the listing is done
    for the "my_customer" customer instead.

    Args:
        logger (logging.Logger): Logger for information and listing errors.

    Yields:
        Dict[str, Any]: ``RemoteIdentity(...).dict()`` for each user, then for each group.
    """
    domain = self.config.delivery_method.domain
    if not domain:
        logger.info("No domain provided. Trying to fetch identities from the user workspace.")
        api_args = {"customer": "my_customer"}
    else:
        api_args = {"domain": domain}

    directory = self.google_directory_service
    users_api = directory.users()
    groups_api = directory.groups()
    members_api = directory.members()

    for user in self._get_looping_google_api_list_response(users_api, "users", args=api_args, logger=logger):
        # "name" and "emails" are read defensively so one sparse record cannot abort the sync.
        full_name = (user.get("name") or {}).get("fullName")
        yield RemoteIdentity(
            id=uuid.uuid4(),
            remote_id=user["primaryEmail"],
            name=full_name,
            email_address=user["primaryEmail"],
            member_email_addresses=[entry["address"] for entry in user.get("emails") or []],
            type=RemoteIdentityType.USER,
            modified_at=datetime_now(),
        ).dict()

    for group in self._get_looping_google_api_list_response(groups_api, "groups", args=api_args, logger=logger):
        # NOTE(review): members without an "email" key are skipped; presumably these are
        # non-user/group members such as whole-customer entries - confirm against the API.
        member_emails = [
            member["email"]
            for member in self._get_looping_google_api_list_response(members_api, "members", {"groupKey": group["id"]}, logger)
            if member.get("email")
        ]
        yield RemoteIdentity(
            id=uuid.uuid4(),
            remote_id=group["email"],
            name=group["name"],
            email_address=group["email"],
            # Stays None (not an empty list) when the group has no members.
            member_email_addresses=member_emails or None,
            type=RemoteIdentityType.GROUP,
            modified_at=datetime_now(),
        ).dict()
446
+
447
@property
def file_permissions_schema(self) -> Dict[str, Any]:
    """Return the "file_permissions" schema loaded from this class's package resources."""
    package_name = package_name_from_class(self.__class__)
    schema_loader = ResourceSchemaLoader(package_name)
    return schema_loader.get_schema("file_permissions")
450
+
451
@property
def identities_schema(self) -> Dict[str, Any]:
    """Return the "identities" schema loaded from this class's package resources."""
    package_name = package_name_from_class(self.__class__)
    schema_loader = ResourceSchemaLoader(package_name)
    return schema_loader.get_schema("identities")
@@ -15,6 +15,8 @@ def get_folder_id(url_string: str) -> str:
15
15
  if parsed_url.scheme != "https" or parsed_url.netloc != "drive.google.com":
16
16
  raise ValueError("Folder URL has to be of the form https://drive.google.com/drive/folders/<folder_id>")
17
17
  path_segments = list(filter(None, parsed_url.path.split("/")))
18
+ if path_segments[-1] in ["my-drive", "home"]:
19
+ return "root"
18
20
  if path_segments[-2] != "folders" or len(path_segments) < 3:
19
21
  raise ValueError("Folder URL has to be of the form https://drive.google.com/drive/folders/<folder_id>")
20
22
  return path_segments[-1]
@@ -1,11 +0,0 @@
1
- source_google_drive/__init__.py,sha256=SAgPIoGpjdHTY7mvf__A99FiElBh3WvdKSHLstg-BUE,134
2
- source_google_drive/exceptions.py,sha256=6sBxbdZhj0CARkctfBaNgGHB_GGcFUkY4cnMUWyOruM,268
3
- source_google_drive/run.py,sha256=5AQd906FbSGaNZj0udrr-VKmiSS4nsor7F7noMjsdi0,677
4
- source_google_drive/source.py,sha256=OyKTjPg80KR8yZiYiC3YSi6E6egothtAn3YAHAS6Q6A,2839
5
- source_google_drive/spec.py,sha256=W5tUpyt-T9JjxsN6WTgz1SHBcGJnXg__L9M-nsDWOlk,3690
6
- source_google_drive/stream_reader.py,sha256=thZEVYsF2sCI4ofFNfWWA24cBzyCm-O-30JM5Coi6Ew,13731
7
- source_google_drive/utils.py,sha256=1Fe3J4KXI1iIf4rklgLoR4p1xkCzLQI7zj5dkyi4Vt0,942
8
- airbyte_source_google_drive-0.1.2.dist-info/METADATA,sha256=Gdnl2-Pn8ixEhA6u7ufaSKfVuRQFwUbro0dwGs4lV50,5518
9
- airbyte_source_google_drive-0.1.2.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
10
- airbyte_source_google_drive-0.1.2.dist-info/entry_points.txt,sha256=YgpJf0nA5Mn0B7YC9VOFI847vz1jI6U4q7BeLUOXa54,67
11
- airbyte_source_google_drive-0.1.2.dist-info/RECORD,,