airbyte-source-azure-blob-storage 0.3.6__tar.gz → 0.4.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of airbyte-source-azure-blob-storage might be problematic. Click here for more details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: airbyte-source-azure-blob-storage
3
- Version: 0.3.6
3
+ Version: 0.4.1
4
4
  Summary: Source implementation for Azure Blob Storage.
5
5
  Home-page: https://airbyte.com
6
6
  License: MIT
@@ -32,6 +32,26 @@ For information about how to use this connector within Airbyte, see [the documen
32
32
  * Poetry (~=1.7) - installation instructions [here](https://python-poetry.org/docs/#installation)
33
33
 
34
34
 
35
+ ### Generate a new OAuth token
36
+
37
+ The tenant ID must be provided by the user; for the reason, see:
38
+ https://learn.microsoft.com/en-us/answers/questions/1531138/which-tenant-id-do-i-have-to-use-to-get-tokens-and
39
+
40
+ 1. GET https://login.microsoftonline.com/<tenant_id>/oauth2/v2.0/authorize
41
+ ?response_type=code
42
+ &client_id=<client_id>
43
+ &scope=offline_access https://storage.azure.com/.default
44
+ &redirect_uri=http://localhost:8000/auth_flow
45
+ &response_mode=query
46
+ &state=1234
47
+
48
+ 2. POST https://login.microsoftonline.com/<tenant_id>/oauth2/v2.0/token
49
+ client_id:<client_id>
50
+ code:<code obtained from previous request>
51
+ redirect_uri:http://localhost:8000/auth_flow
52
+ grant_type:authorization_code
53
+ client_secret:<client_secret>
54
+
35
55
  ### Installing the connector
36
56
  From this connector directory, run:
37
57
  ```bash
@@ -11,6 +11,26 @@ For information about how to use this connector within Airbyte, see [the documen
11
11
  * Poetry (~=1.7) - installation instructions [here](https://python-poetry.org/docs/#installation)
12
12
 
13
13
 
14
+ ### Generate a new OAuth token
15
+
16
+ The tenant ID must be provided by the user; for the reason, see:
17
+ https://learn.microsoft.com/en-us/answers/questions/1531138/which-tenant-id-do-i-have-to-use-to-get-tokens-and
18
+
19
+ 1. GET https://login.microsoftonline.com/<tenant_id>/oauth2/v2.0/authorize
20
+ ?response_type=code
21
+ &client_id=<client_id>
22
+ &scope=offline_access https://storage.azure.com/.default
23
+ &redirect_uri=http://localhost:8000/auth_flow
24
+ &response_mode=query
25
+ &state=1234
26
+
27
+ 2. POST https://login.microsoftonline.com/<tenant_id>/oauth2/v2.0/token
28
+ client_id:<client_id>
29
+ code:<code obtained from previous request>
30
+ redirect_uri:http://localhost:8000/auth_flow
31
+ grant_type:authorization_code
32
+ client_secret:<client_secret>
33
+
14
34
  ### Installing the connector
15
35
  From this connector directory, run:
16
36
  ```bash
@@ -5,7 +5,7 @@ requires = [
5
5
  build-backend = "poetry.core.masonry.api"
6
6
 
7
7
  [tool.poetry]
8
- version = "0.3.6"
8
+ version = "0.4.1"
9
9
  name = "airbyte-source-azure-blob-storage"
10
10
  description = "Source implementation for Azure Blob Storage."
11
11
  authors = [
@@ -41,6 +41,7 @@ source-azure-blob-storage = "source_azure_blob_storage.run:run"
41
41
 
42
42
  [tool.poetry.group.dev.dependencies]
43
43
  docker = "^7.0.0"
44
+ freezegun = "^1.4.0"
44
45
  pytest-mock = "^3.6.1"
45
46
  requests-mock = "^1.9.3"
46
47
  pandas = "2.2.1"
@@ -3,8 +3,8 @@
3
3
  #
4
4
 
5
5
 
6
- from .config import Config
7
6
  from .source import SourceAzureBlobStorage
7
+ from .spec import SourceAzureBlobStorageSpec
8
8
  from .stream_reader import SourceAzureBlobStorageStreamReader
9
9
 
10
- __all__ = ["SourceAzureBlobStorage", "SourceAzureBlobStorageStreamReader", "Config"]
10
+ __all__ = ["SourceAzureBlobStorage", "SourceAzureBlobStorageStreamReader", "SourceAzureBlobStorageSpec"]
@@ -0,0 +1,119 @@
1
+ #
2
+ # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
+ #
4
+
5
+
6
+ import logging
7
+ from abc import ABC, abstractmethod
8
+ from typing import Any, List, Mapping
9
+
10
+ from airbyte_cdk.config_observation import create_connector_config_control_message
11
+ from airbyte_cdk.entrypoint import AirbyteEntrypoint
12
+ from airbyte_cdk.sources import Source
13
+
14
+ logger = logging.getLogger("airbyte_logger")
15
+
16
+
17
+ class MigrateConfig(ABC):
18
+ @classmethod
19
+ @abstractmethod
20
+ def should_migrate(cls, config: Mapping[str, Any]) -> bool:
21
+ ...
22
+
23
+ @classmethod
24
+ @abstractmethod
25
+ def migrate_config(cls, config: Mapping[str, Any]) -> Mapping[str, Any]:
26
+ ...
27
+
28
+ @classmethod
29
+ def modify_and_save(cls, config_path: str, source: Source, config: Mapping[str, Any]) -> Mapping[str, Any]:
30
+ """
31
+ Modifies the configuration and then saves it back to the source.
32
+
33
+ Args:
34
+ - config_path (str): The path where the configuration is stored.
35
+ - source (Source): The data source.
36
+ - config (Mapping[str, Any]): The current configuration.
37
+
38
+ Returns:
39
+ - Mapping[str, Any]: The updated configuration.
40
+ """
41
+ migrated_config = cls.migrate_config(config)
42
+ source.write_config(migrated_config, config_path)
43
+ return migrated_config
44
+
45
+ @classmethod
46
+ def emit_control_message(cls, migrated_config: Mapping[str, Any]) -> None:
47
+ """
48
+ Emits the control messages related to configuration migration.
49
+
50
+ Args:
51
+ - migrated_config (Mapping[str, Any]): The migrated configuration.
52
+ """
53
+ print(create_connector_config_control_message(migrated_config).json(exclude_unset=True))
54
+
55
+ @classmethod
56
+ def migrate(cls, args: List[str], source: Source) -> None:
57
+ """
58
+ Orchestrates the configuration migration process.
59
+
60
+ It first checks if the `--config` argument is provided, and if so,
61
+ determines whether migration is needed, and then performs the migration
62
+ if required.
63
+
64
+ Args:
65
+ - args (List[str]): List of command-line arguments.
66
+ - source (Source): The data source.
67
+ """
68
+ config_path = AirbyteEntrypoint(source).extract_config(args)
69
+ if config_path:
70
+ config = source.read_config(config_path)
71
+ if cls.should_migrate(config):
72
+ cls.emit_control_message(cls.modify_and_save(config_path, source, config))
73
+
74
+
75
+ class MigrateLegacyConfig(MigrateConfig):
76
+ """
77
+ Class that takes in Azure Blob Storage source configs in the legacy format and transforms them into
78
+ configs that can be used by the new Azure Blob Storage source built with the file-based CDK.
79
+ """
80
+
81
+ @classmethod
82
+ def should_migrate(cls, config: Mapping[str, Any]) -> bool:
83
+ return "streams" not in config
84
+
85
+ @classmethod
86
+ def migrate_config(cls, legacy_config: Mapping[str, Any]) -> Mapping[str, Any]:
87
+ azure_blob_storage_blobs_prefix = legacy_config.get("azure_blob_storage_blobs_prefix", "")
88
+ return {
89
+ "azure_blob_storage_endpoint": legacy_config.get("azure_blob_storage_endpoint", None),
90
+ "azure_blob_storage_account_name": legacy_config["azure_blob_storage_account_name"],
91
+ "azure_blob_storage_account_key": legacy_config["azure_blob_storage_account_key"],
92
+ "azure_blob_storage_container_name": legacy_config["azure_blob_storage_container_name"],
93
+ "streams": [
94
+ {
95
+ "name": legacy_config["azure_blob_storage_container_name"],
96
+ "legacy_prefix": azure_blob_storage_blobs_prefix,
97
+ "validation_policy": "Emit Record",
98
+ "format": {"filetype": "jsonl"},
99
+ }
100
+ ],
101
+ }
102
+
103
+
104
+ class MigrateCredentials(MigrateConfig):
105
+ """
106
+ Migrates the top-level `azure_blob_storage_account_key` config value into the `credentials` object.
107
+ """
108
+
109
+ @classmethod
110
+ def should_migrate(cls, config: Mapping[str, Any]) -> bool:
111
+ return "credentials" not in config
112
+
113
+ @classmethod
114
+ def migrate_config(cls, config: Mapping[str, Any]) -> Mapping[str, Any]:
115
+ config["credentials"] = {
116
+ "auth_type": "storage_account_key",
117
+ "azure_blob_storage_account_key": config.pop("azure_blob_storage_account_key"),
118
+ }
119
+ return config
@@ -9,7 +9,8 @@ from datetime import datetime
9
9
  from airbyte_cdk.entrypoint import AirbyteEntrypoint, launch
10
10
  from airbyte_cdk.models import AirbyteErrorTraceMessage, AirbyteMessage, AirbyteTraceMessage, TraceType, Type
11
11
  from airbyte_cdk.sources.file_based.stream.cursor import DefaultFileBasedCursor
12
- from source_azure_blob_storage import Config, SourceAzureBlobStorage, SourceAzureBlobStorageStreamReader
12
+ from source_azure_blob_storage import SourceAzureBlobStorage, SourceAzureBlobStorageSpec, SourceAzureBlobStorageStreamReader
13
+ from source_azure_blob_storage.config_migrations import MigrateCredentials, MigrateLegacyConfig
13
14
 
14
15
 
15
16
  def run():
@@ -20,12 +21,14 @@ def run():
20
21
  try:
21
22
  source = SourceAzureBlobStorage(
22
23
  SourceAzureBlobStorageStreamReader(),
23
- Config,
24
+ SourceAzureBlobStorageSpec,
24
25
  SourceAzureBlobStorage.read_catalog(catalog_path) if catalog_path else None,
25
26
  SourceAzureBlobStorage.read_config(config_path) if catalog_path else None,
26
27
  SourceAzureBlobStorage.read_state(state_path) if catalog_path else None,
27
28
  cursor_cls=DefaultFileBasedCursor,
28
29
  )
30
+ MigrateLegacyConfig.migrate(sys.argv[1:], source)
31
+ MigrateCredentials.migrate(sys.argv[1:], source)
29
32
  except Exception:
30
33
  print(
31
34
  AirbyteMessage(
@@ -0,0 +1,52 @@
1
+ #
2
+ # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
+ #
4
+
5
+ from typing import Any, Mapping
6
+
7
+ from airbyte_cdk.config_observation import emit_configuration_as_airbyte_control_message
8
+ from airbyte_cdk.sources.declarative.models import OAuthConfigSpecification
9
+ from airbyte_cdk.sources.file_based.file_based_source import FileBasedSource
10
+ from airbyte_protocol.models import AdvancedAuth, ConnectorSpecification
11
+
12
+
13
+ class SourceAzureBlobStorage(FileBasedSource):
14
+ def spec(self, *args: Any, **kwargs: Any) -> ConnectorSpecification:
15
+ """
16
+ Returns the specification describing what fields can be configured by a user when setting up a file-based source.
17
+ """
18
+
19
+ return ConnectorSpecification(
20
+ documentationUrl=self.spec_class.documentation_url(),
21
+ connectionSpecification=self.spec_class.schema(),
22
+ advanced_auth=AdvancedAuth(
23
+ auth_flow_type="oauth2.0",
24
+ predicate_key=["credentials", "auth_type"],
25
+ predicate_value="oauth2",
26
+ oauth_config_specification=OAuthConfigSpecification(
27
+ complete_oauth_output_specification={
28
+ "type": "object",
29
+ "additionalProperties": False,
30
+ "properties": {"refresh_token": {"type": "string", "path_in_connector_config": ["credentials", "refresh_token"]}},
31
+ },
32
+ complete_oauth_server_input_specification={
33
+ "type": "object",
34
+ "additionalProperties": False,
35
+ "properties": {"client_id": {"type": "string"}, "client_secret": {"type": "string"}},
36
+ },
37
+ complete_oauth_server_output_specification={
38
+ "type": "object",
39
+ "additionalProperties": False,
40
+ "properties": {
41
+ "client_id": {"type": "string", "path_in_connector_config": ["credentials", "client_id"]},
42
+ "client_secret": {"type": "string", "path_in_connector_config": ["credentials", "client_secret"]},
43
+ },
44
+ },
45
+ oauth_user_input_from_connector_config_specification={
46
+ "type": "object",
47
+ "additionalProperties": False,
48
+ "properties": {"tenant_id": {"type": "string", "path_in_connector_config": ["credentials", "tenant_id"]}},
49
+ },
50
+ ),
51
+ ),
52
+ )
@@ -2,14 +2,54 @@
2
2
  # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
3
  #
4
4
 
5
- from typing import Any, Dict, Optional
5
+ from typing import Any, Dict, Literal, Optional, Union
6
6
 
7
7
  import dpath.util
8
8
  from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import AbstractFileBasedSpec
9
- from pydantic import AnyUrl, Field
9
+ from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
10
+ from pydantic import AnyUrl, BaseModel, Field
10
11
 
11
12
 
12
- class Config(AbstractFileBasedSpec):
13
+ class Oauth2(BaseModel):
14
+ class Config(OneOfOptionConfig):
15
+ title = "Authenticate via Oauth2"
16
+ discriminator = "auth_type"
17
+
18
+ auth_type: Literal["oauth2"] = Field("oauth2", const=True)
19
+ tenant_id: str = Field(title="Tenant ID", description="Tenant ID of the Microsoft Azure Application user", airbyte_secret=True)
20
+ client_id: str = Field(
21
+ title="Client ID",
22
+ description="Client ID of your Microsoft developer application",
23
+ airbyte_secret=True,
24
+ )
25
+ client_secret: str = Field(
26
+ title="Client Secret",
27
+ description="Client Secret of your Microsoft developer application",
28
+ airbyte_secret=True,
29
+ )
30
+ refresh_token: str = Field(
31
+ title="Refresh Token",
32
+ description="Refresh Token of your Microsoft developer application",
33
+ airbyte_secret=True,
34
+ )
35
+
36
+
37
+ class StorageAccountKey(BaseModel):
38
+ class Config(OneOfOptionConfig):
39
+ title = "Authenticate via Storage Account Key"
40
+ discriminator = "auth_type"
41
+
42
+ auth_type: Literal["storage_account_key"] = Field("storage_account_key", const=True)
43
+ azure_blob_storage_account_key: str = Field(
44
+ title="Azure Blob Storage account key",
45
+ description="The Azure blob storage account key.",
46
+ airbyte_secret=True,
47
+ examples=["Z8ZkZpteggFx394vm+PJHnGTvdRncaYS+JhLKdj789YNmD+iyGTnG+PV+POiuYNhBg/ACS+LKjd%4FG3FHGN12Nd=="],
48
+ order=3,
49
+ )
50
+
51
+
52
+ class SourceAzureBlobStorageSpec(AbstractFileBasedSpec):
13
53
  """
14
54
  NOTE: When this Spec is changed, legacy_config_transformer.py must also be modified to uptake the changes
15
55
  because it is responsible for converting legacy Azure Blob Storage v0 configs into v1 configs using the File-Based CDK.
@@ -25,11 +65,11 @@ class Config(AbstractFileBasedSpec):
25
65
  examples=["airbyte5storage"],
26
66
  order=2,
27
67
  )
28
- azure_blob_storage_account_key: str = Field(
29
- title="Azure Blob Storage account key",
30
- description="The Azure blob storage account key.",
31
- airbyte_secret=True,
32
- examples=["Z8ZkZpteggFx394vm+PJHnGTvdRncaYS+JhLKdj789YNmD+iyGTnG+PV+POiuYNhBg/ACS+LKjd%4FG3FHGN12Nd=="],
68
+ credentials: Union[Oauth2, StorageAccountKey] = Field(
69
+ title="Authentication",
70
+ description="Credentials for connecting to the Azure Blob Storage",
71
+ discriminator="auth_type",
72
+ type="object",
33
73
  order=3,
34
74
  )
35
75
  azure_blob_storage_container_name: str = Field(
@@ -2,28 +2,44 @@
2
2
 
3
3
  import logging
4
4
  from io import IOBase
5
- from typing import Iterable, List, Optional
5
+ from typing import Iterable, List, Optional, Union
6
6
 
7
7
  import pytz
8
8
  from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
9
9
  from airbyte_cdk.sources.file_based.remote_file import RemoteFile
10
+ from airbyte_cdk.sources.streams.http.requests_native_auth import Oauth2Authenticator
11
+ from azure.core.credentials import AccessToken
10
12
  from azure.storage.blob import BlobServiceClient, ContainerClient
11
13
  from smart_open import open
12
14
 
13
- from .config import Config
15
+ from .spec import SourceAzureBlobStorageSpec
16
+
17
+
18
+ class AzureOauth2Authenticator(Oauth2Authenticator):
19
+ """
20
+ Authenticator for Azure Blob Storage SDK to align with azure.core.credentials.TokenCredential protocol
21
+ """
22
+
23
+ def get_token(self, *args, **kwargs) -> AccessToken:
24
+ """Parent class handles Oauth Refresh token logic.
25
+ `expires_on` is ignored and set to year 2222 to align with protocol.
26
+ """
27
+ return AccessToken(token=self.get_access_token(), expires_on=7952342400)
14
28
 
15
29
 
16
30
  class SourceAzureBlobStorageStreamReader(AbstractFileBasedStreamReader):
31
+ _credentials = None
32
+
17
33
  def __init__(self, *args, **kwargs):
18
34
  super().__init__(*args, **kwargs)
19
35
  self._config = None
20
36
 
21
37
  @property
22
- def config(self) -> Config:
38
+ def config(self) -> SourceAzureBlobStorageSpec:
23
39
  return self._config
24
40
 
25
41
  @config.setter
26
- def config(self, value: Config) -> None:
42
+ def config(self, value: SourceAzureBlobStorageSpec) -> None:
27
43
  self._config = value
28
44
 
29
45
  @property
@@ -35,14 +51,26 @@ class SourceAzureBlobStorageStreamReader(AbstractFileBasedStreamReader):
35
51
  @property
36
52
  def azure_container_client(self):
37
53
  return ContainerClient(
38
- self.account_url,
39
- container_name=self.config.azure_blob_storage_container_name,
40
- credential=self.config.azure_blob_storage_account_key,
54
+ self.account_url, container_name=self.config.azure_blob_storage_container_name, credential=self.azure_credentials
41
55
  )
42
56
 
43
57
  @property
44
58
  def azure_blob_service_client(self):
45
- return BlobServiceClient(self.account_url, credential=self.config.azure_blob_storage_account_key)
59
+ return BlobServiceClient(self.account_url, credential=self._credentials)
60
+
61
+ @property
62
+ def azure_credentials(self) -> Union[str, AzureOauth2Authenticator]:
63
+ if not self._credentials:
64
+ if self.config.credentials.auth_type == "storage_account_key":
65
+ self._credentials = self.config.credentials.azure_blob_storage_account_key
66
+ else:
67
+ self._credentials = AzureOauth2Authenticator(
68
+ token_refresh_endpoint=f"https://login.microsoftonline.com/{self.config.credentials.tenant_id}/oauth2/v2.0/token",
69
+ client_id=self.config.credentials.client_id,
70
+ client_secret=self.config.credentials.client_secret,
71
+ refresh_token=self.config.credentials.refresh_token,
72
+ )
73
+ return self._credentials
46
74
 
47
75
  def get_matching_files(
48
76
  self,
@@ -55,8 +83,7 @@ class SourceAzureBlobStorageStreamReader(AbstractFileBasedStreamReader):
55
83
  for prefix in prefixes:
56
84
  for blob in self.azure_container_client.list_blobs(name_starts_with=prefix):
57
85
  remote_file = RemoteFile(uri=blob.name, last_modified=blob.last_modified.astimezone(pytz.utc).replace(tzinfo=None))
58
- if not globs or self.file_matches_globs(remote_file, globs):
59
- yield remote_file
86
+ yield from self.filter_files_by_globs_and_start_date([remote_file], globs)
60
87
 
61
88
  def open_file(self, file: RemoteFile, mode: FileReadMode, encoding: Optional[str], logger: logging.Logger) -> IOBase:
62
89
  try:
@@ -1,31 +0,0 @@
1
- #
2
- # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
- #
4
-
5
- from typing import Any, Mapping, MutableMapping
6
-
7
-
8
- class LegacyConfigTransformer:
9
- """
10
- Class that takes in Azure Blob Storage source configs in the legacy format and transforms them into
11
- configs that can be used by the new Azure Blob Storage source built with the file-based CDK.
12
- """
13
-
14
- @classmethod
15
- def convert(cls, legacy_config: Mapping) -> MutableMapping[str, Any]:
16
- azure_blob_storage_blobs_prefix = legacy_config.get("azure_blob_storage_blobs_prefix", "")
17
-
18
- return {
19
- "azure_blob_storage_endpoint": legacy_config.get("azure_blob_storage_endpoint", None),
20
- "azure_blob_storage_account_name": legacy_config["azure_blob_storage_account_name"],
21
- "azure_blob_storage_account_key": legacy_config["azure_blob_storage_account_key"],
22
- "azure_blob_storage_container_name": legacy_config["azure_blob_storage_container_name"],
23
- "streams": [
24
- {
25
- "name": legacy_config["azure_blob_storage_container_name"],
26
- "legacy_prefix": azure_blob_storage_blobs_prefix,
27
- "validation_policy": "Emit Record",
28
- "format": {"filetype": "jsonl"},
29
- }
30
- ],
31
- }
@@ -1,30 +0,0 @@
1
- #
2
- # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
- #
4
-
5
- from typing import Any, Mapping
6
-
7
- from airbyte_cdk.config_observation import emit_configuration_as_airbyte_control_message
8
- from airbyte_cdk.sources.file_based.file_based_source import FileBasedSource
9
-
10
- from .legacy_config_transformer import LegacyConfigTransformer
11
-
12
-
13
- class SourceAzureBlobStorage(FileBasedSource):
14
- @classmethod
15
- def read_config(cls, config_path: str) -> Mapping[str, Any]:
16
- """
17
- Used to override the default read_config so that when the new file-based Azure Blob Storage connector processes a config
18
- in the legacy format, it can be transformed into the new config. This happens in entrypoint before we
19
- validate the config against the new spec.
20
- """
21
- config = FileBasedSource.read_config(config_path)
22
- if not cls._is_v1_config(config):
23
- converted_config = LegacyConfigTransformer.convert(config)
24
- emit_configuration_as_airbyte_control_message(converted_config)
25
- return converted_config
26
- return config
27
-
28
- @staticmethod
29
- def _is_v1_config(config: Mapping[str, Any]) -> bool:
30
- return "streams" in config