airbyte-source-azure-blob-storage 0.8.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of airbyte-source-azure-blob-storage might be problematic. Click here for more details.
- airbyte_source_azure_blob_storage-0.8.6.dist-info/METADATA +146 -0
- airbyte_source_azure_blob_storage-0.8.6.dist-info/RECORD +10 -0
- airbyte_source_azure_blob_storage-0.8.6.dist-info/WHEEL +4 -0
- airbyte_source_azure_blob_storage-0.8.6.dist-info/entry_points.txt +3 -0
- source_azure_blob_storage/__init__.py +10 -0
- source_azure_blob_storage/config_migrations.py +119 -0
- source_azure_blob_storage/run.py +54 -0
- source_azure_blob_storage/source.py +53 -0
- source_azure_blob_storage/spec.py +132 -0
- source_azure_blob_storage/stream_reader.py +183 -0
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: airbyte-source-azure-blob-storage
|
|
3
|
+
Version: 0.8.6
|
|
4
|
+
Summary: Source implementation for Azure Blob Storage.
|
|
5
|
+
Home-page: https://airbyte.com
|
|
6
|
+
License: ELv2
|
|
7
|
+
Author: Airbyte
|
|
8
|
+
Author-email: contact@airbyte.io
|
|
9
|
+
Requires-Python: >=3.11,<3.14
|
|
10
|
+
Classifier: License :: Other/Proprietary License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
15
|
+
Requires-Dist: airbyte-cdk[file-based] (>=7.0.0,<8.0.0)
|
|
16
|
+
Requires-Dist: pytz (>=2024.1,<2025.0)
|
|
17
|
+
Requires-Dist: smart-open[azure] (==0.8.6)
|
|
18
|
+
Project-URL: Documentation, https://docs.airbyte.com/integrations/sources/azure-blob-storage
|
|
19
|
+
Project-URL: Repository, https://github.com/airbytehq/airbyte
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
|
|
22
|
+
# Azure-Blob-Storage source connector
|
|
23
|
+
|
|
24
|
+
This is the repository for the Azure-Blob-Storage source connector, written in Python.
|
|
25
|
+
For information about how to use this connector within Airbyte, see [the documentation](https://docs.airbyte.com/integrations/sources/azure-blob-storage).
|
|
26
|
+
|
|
27
|
+
## Local development
|
|
28
|
+
|
|
29
|
+
### Prerequisites
|
|
30
|
+
|
|
31
|
+
- Python (~=3.11)
|
|
32
|
+
- Poetry (~=1.7) - installation instructions [here](https://python-poetry.org/docs/#installation)
|
|
33
|
+
|
|
34
|
+
### Generate new oauth token
|
|
35
|
+
|
|
36
|
+
Tenant id should be provided by user, reason:
|
|
37
|
+
https://learn.microsoft.com/en-us/answers/questions/1531138/which-tenant-id-do-i-have-to-use-to-get-tokens-and
|
|
38
|
+
|
|
39
|
+
1. GET https://login.microsoftonline.com/<tenant_id>/oauth2/v2.0/authorize
|
|
40
|
+
?response_type=code
|
|
41
|
+
&client_id=<client_id>
|
|
42
|
+
&scope=offline_access https://storage.azure.com/.default
|
|
43
|
+
&redirect_uri=http://localhost:8000/auth_flow
|
|
44
|
+
&response_mode=query
|
|
45
|
+
&state=1234
|
|
46
|
+
|
|
47
|
+
2. POST https://login.microsoftonline.com/<tenant_id>/oauth2/v2.0/token
|
|
48
|
+
client_id:<client_id>
|
|
49
|
+
code:<code obtained from previous request>
|
|
50
|
+
redirect_uri:http://localhost:8000/auth_flow
|
|
51
|
+
grant_type:authorization_code
|
|
52
|
+
client_secret:<client_secret>
|
|
53
|
+
|
|
54
|
+
### Installing the connector
|
|
55
|
+
|
|
56
|
+
From this connector directory, run:
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
poetry install --with dev
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
### Create credentials
|
|
63
|
+
|
|
64
|
+
**If you are a community contributor**, follow the instructions in the [documentation](https://docs.airbyte.com/integrations/sources/azure-blob-storage)
|
|
65
|
+
to generate the necessary credentials. Then create a file `secrets/config.json` conforming to the `source_azure_blob_storage/spec.yaml` file.
|
|
66
|
+
Note that any directory named `secrets` is gitignored across the entire Airbyte repo, so there is no danger of accidentally checking in sensitive information.
|
|
67
|
+
See `sample_files/sample_config.json` for a sample config file.
|
|
68
|
+
|
|
69
|
+
### Locally running the connector
|
|
70
|
+
|
|
71
|
+
```
|
|
72
|
+
poetry run source-azure-blob-storage spec
|
|
73
|
+
poetry run source-azure-blob-storage check --config secrets/config.json
|
|
74
|
+
poetry run source-azure-blob-storage discover --config secrets/config.json
|
|
75
|
+
poetry run source-azure-blob-storage read --config secrets/config.json --catalog sample_files/configured_catalog.json
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
### Running unit tests
|
|
79
|
+
|
|
80
|
+
To run unit tests locally, from the connector directory run:
|
|
81
|
+
|
|
82
|
+
```
|
|
83
|
+
poetry run pytest unit_tests
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
### Building the docker image
|
|
87
|
+
|
|
88
|
+
1. Install [`airbyte-ci`](https://github.com/airbytehq/airbyte/blob/master/airbyte-ci/connectors/pipelines/README.md)
|
|
89
|
+
2. Run the following command to build the docker image:
|
|
90
|
+
|
|
91
|
+
```bash
|
|
92
|
+
airbyte-ci connectors --name=source-azure-blob-storage build
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
An image will be available on your host with the tag `airbyte/source-azure-blob-storage:dev`.
|
|
96
|
+
|
|
97
|
+
### Running as a docker container
|
|
98
|
+
|
|
99
|
+
Then run any of the connector commands as follows:
|
|
100
|
+
|
|
101
|
+
```
|
|
102
|
+
docker run --rm airbyte/source-azure-blob-storage:dev spec
|
|
103
|
+
docker run --rm -v $(pwd)/secrets:/secrets airbyte/source-azure-blob-storage:dev check --config /secrets/config.json
|
|
104
|
+
docker run --rm -v $(pwd)/secrets:/secrets airbyte/source-azure-blob-storage:dev discover --config /secrets/config.json
|
|
105
|
+
docker run --rm -v $(pwd)/secrets:/secrets -v $(pwd)/integration_tests:/integration_tests airbyte/source-azure-blob-storage:dev read --config /secrets/config.json --catalog /integration_tests/configured_catalog.json
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
### Running our CI test suite
|
|
109
|
+
|
|
110
|
+
You can run our full test suite locally using [`airbyte-ci`](https://github.com/airbytehq/airbyte/blob/master/airbyte-ci/connectors/pipelines/README.md):
|
|
111
|
+
|
|
112
|
+
```bash
|
|
113
|
+
airbyte-ci connectors --name=source-azure-blob-storage test
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
### Customizing acceptance Tests
|
|
117
|
+
|
|
118
|
+
Customize `acceptance-test-config.yml` file to configure acceptance tests. See [Connector Acceptance Tests](https://docs.airbyte.com/connector-development/testing-connectors/connector-acceptance-tests-reference) for more information.
|
|
119
|
+
If your connector requires to create or destroy resources for use during acceptance tests create fixtures for it and place them inside integration_tests/acceptance.py.
|
|
120
|
+
|
|
121
|
+
### Dependency Management
|
|
122
|
+
|
|
123
|
+
All of your dependencies should be managed via Poetry.
|
|
124
|
+
To add a new dependency, run:
|
|
125
|
+
|
|
126
|
+
```bash
|
|
127
|
+
poetry add <package-name>
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
Please commit the changes to `pyproject.toml` and `poetry.lock` files.
|
|
131
|
+
|
|
132
|
+
## Publishing a new version of the connector
|
|
133
|
+
|
|
134
|
+
You've checked out the repo, implemented a million dollar feature, and you're ready to share your changes with the world. Now what?
|
|
135
|
+
|
|
136
|
+
1. Make sure your changes are passing our test suite: `airbyte-ci connectors --name=source-azure-blob-storage test`
|
|
137
|
+
2. Bump the connector version (please follow [semantic versioning for connectors](https://docs.airbyte.com/contributing-to-airbyte/resources/pull-requests-handbook/#semantic-versioning-for-connectors)):
|
|
138
|
+
- bump the `dockerImageTag` value in in `metadata.yaml`
|
|
139
|
+
- bump the `version` value in `pyproject.toml`
|
|
140
|
+
3. Make sure the `metadata.yaml` content is up to date.
|
|
141
|
+
4. Make sure the connector documentation and its changelog is up to date (`docs/integrations/sources/azure-blob-storage.md`).
|
|
142
|
+
5. Create a Pull Request: use [our PR naming conventions](https://docs.airbyte.com/contributing-to-airbyte/resources/pull-requests-handbook/#pull-request-title-convention).
|
|
143
|
+
6. Pat yourself on the back for being an awesome contributor.
|
|
144
|
+
7. Someone from Airbyte will take a look at your PR and iterate with you to merge it into master.
|
|
145
|
+
8. Once your PR is merged, the new version of the connector will be automatically published to Docker Hub and our connector registry.
|
|
146
|
+
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
source_azure_blob_storage/__init__.py,sha256=Wx4PzvHg900-c2CpOOP1Wk0zcJpNVqJrkMnPtDcuaQM,319
|
|
2
|
+
source_azure_blob_storage/config_migrations.py,sha256=b0_UyUj1F4q_wPCxBpu4W8MxfMiHE2OvesXxks1bL2M,4331
|
|
3
|
+
source_azure_blob_storage/run.py,sha256=4FImO8txeARJAE-c-glw5xxnDuiRT-XSLYKFVBn70tI,2206
|
|
4
|
+
source_azure_blob_storage/source.py,sha256=ZubuoAmy_QxRtJIJVSugmBoMxw3dMjLTSAIXbZf3jjE,2493
|
|
5
|
+
source_azure_blob_storage/spec.py,sha256=tRiF9KacJFK97HaleJucQadR4e2hjnmStJnbLEFV4vc,5043
|
|
6
|
+
source_azure_blob_storage/stream_reader.py,sha256=YWzmk1mPqSy8H8gWRBYZc7F7-vBDqlNH3WrhVFCmAmk,7978
|
|
7
|
+
airbyte_source_azure_blob_storage-0.8.6.dist-info/METADATA,sha256=75I_9BSoZ-Pl7N_gSWL9pYCImT5rz8wJIFku9Eijd7s,6282
|
|
8
|
+
airbyte_source_azure_blob_storage-0.8.6.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
|
9
|
+
airbyte_source_azure_blob_storage-0.8.6.dist-info/entry_points.txt,sha256=75v_DA_Xu0qr0eqtEXyh8sPCqcL9eXKWY8UwdST3ANE,79
|
|
10
|
+
airbyte_source_azure_blob_storage-0.8.6.dist-info/RECORD,,
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
|
3
|
+
#
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
from .source import SourceAzureBlobStorage
from .spec import SourceAzureBlobStorageSpec
from .stream_reader import SourceAzureBlobStorageStreamReader

# Public API of the connector package: the source, its spec model, and the stream reader.
__all__ = ["SourceAzureBlobStorage", "SourceAzureBlobStorageStreamReader", "SourceAzureBlobStorageSpec"]
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
|
3
|
+
#
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
import logging
|
|
7
|
+
from abc import ABC, abstractmethod
|
|
8
|
+
from typing import Any, List, Mapping
|
|
9
|
+
|
|
10
|
+
import orjson
|
|
11
|
+
|
|
12
|
+
from airbyte_cdk import AirbyteEntrypoint, Source, create_connector_config_control_message
|
|
13
|
+
from airbyte_cdk.models import AirbyteMessageSerializer
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
logger = logging.getLogger("airbyte_logger")
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class MigrateConfig(ABC):
    """Template for one-shot connector config migrations.

    Subclasses implement `should_migrate` (detect the legacy shape) and
    `migrate_config` (transform it). `migrate` orchestrates: read the config
    from the `--config` CLI argument, transform it if needed, persist it back,
    and emit a CONNECTOR_CONFIG control message so the platform updates its copy.
    """

    @classmethod
    @abstractmethod
    def should_migrate(cls, config: Mapping[str, Any]) -> bool: ...

    @classmethod
    @abstractmethod
    def migrate_config(cls, config: Mapping[str, Any]) -> Mapping[str, Any]: ...

    @classmethod
    def modify_and_save(cls, config_path: str, source: Source, config: Mapping[str, Any]) -> Mapping[str, Any]:
        """Transform the configuration, persist it to ``config_path``, and return it.

        Args:
            config_path: The path where the configuration is stored.
            source: The data source (used for its `write_config` hook).
            config: The current configuration.

        Returns:
            The migrated configuration.
        """
        updated = cls.migrate_config(config)
        source.write_config(updated, config_path)
        return updated

    @classmethod
    def emit_control_message(cls, migrated_config: Mapping[str, Any]) -> None:
        """Print a serialized CONNECTOR_CONFIG control message for ``migrated_config``.

        Writing to stdout is how Airbyte connectors communicate with the platform.
        """
        control_message = create_connector_config_control_message(migrated_config)
        serialized = orjson.dumps(AirbyteMessageSerializer.dump(control_message))
        print(serialized.decode())

    @classmethod
    def migrate(cls, args: List[str], source: Source) -> None:
        """Orchestrate the configuration migration process.

        No-op unless a `--config` argument is present and the stored config
        matches the legacy shape detected by `should_migrate`.

        Args:
            args: List of command-line arguments.
            source: The data source.
        """
        config_path = AirbyteEntrypoint(source).extract_config(args)
        if not config_path:
            return
        existing_config = source.read_config(config_path)
        if not cls.should_migrate(existing_config):
            return
        cls.emit_control_message(cls.modify_and_save(config_path, source, existing_config))
|
|
74
|
+
|
|
75
|
+
class MigrateLegacyConfig(MigrateConfig):
    """
    Class that takes in Azure Blob Storage source configs in the legacy format and transforms them into
    configs that can be used by the new Azure Blob Storage source built with the file-based CDK.
    """

    @classmethod
    def should_migrate(cls, config: Mapping[str, Any]) -> bool:
        # Legacy configs predate the file-based CDK and therefore have no "streams" section.
        return "streams" not in config

    @classmethod
    def migrate_config(cls, legacy_config: Mapping[str, Any]) -> Mapping[str, Any]:
        container = legacy_config["azure_blob_storage_container_name"]
        # The legacy source exposed exactly one implicit JSONL stream per container;
        # reproduce it as a single explicit file-based stream.
        stream = {
            "name": container,
            "legacy_prefix": legacy_config.get("azure_blob_storage_blobs_prefix", ""),
            "validation_policy": "Emit Record",
            "format": {"filetype": "jsonl"},
        }
        return {
            "azure_blob_storage_endpoint": legacy_config.get("azure_blob_storage_endpoint"),
            "azure_blob_storage_account_name": legacy_config["azure_blob_storage_account_name"],
            "azure_blob_storage_account_key": legacy_config["azure_blob_storage_account_key"],
            "azure_blob_storage_container_name": container,
            "streams": [stream],
        }
+
|
|
104
|
+
class MigrateCredentials(MigrateConfig):
    """
    This class stands for migrating the config azure_blob_storage_account_key inside object `credentials`
    """

    @classmethod
    def should_migrate(cls, config: Mapping[str, Any]) -> bool:
        # Configs written before the credentials oneOf existed keep the key at top level.
        return "credentials" not in config

    @classmethod
    def migrate_config(cls, config: Mapping[str, Any]) -> Mapping[str, Any]:
        # Move the top-level account key under the discriminated `credentials` object.
        # NOTE: mutates `config` in place (pop + assignment), matching the caller's expectations.
        account_key = config.pop("azure_blob_storage_account_key")
        config["credentials"] = {
            "auth_type": "storage_account_key",
            "azure_blob_storage_account_key": account_key,
        }
        return config
@@ -0,0 +1,54 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
|
3
|
+
#
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
import sys
|
|
7
|
+
import time
|
|
8
|
+
import traceback
|
|
9
|
+
|
|
10
|
+
import orjson
|
|
11
|
+
|
|
12
|
+
from airbyte_cdk import AirbyteEntrypoint, AirbyteMessage, Type, launch
|
|
13
|
+
from airbyte_cdk.models import AirbyteErrorTraceMessage, AirbyteMessageSerializer, AirbyteTraceMessage, TraceType
|
|
14
|
+
from airbyte_cdk.sources.file_based.stream.cursor import DefaultFileBasedCursor
|
|
15
|
+
from source_azure_blob_storage import SourceAzureBlobStorage, SourceAzureBlobStorageSpec, SourceAzureBlobStorageStreamReader
|
|
16
|
+
from source_azure_blob_storage.config_migrations import MigrateCredentials, MigrateLegacyConfig
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def run():
    """Connector entry point.

    Builds the source from the CLI-provided catalog/config/state paths, runs the
    one-shot config migrations, and hands control to the CDK `launch`. If source
    construction fails (bad config/catalog), an ERROR trace message is printed to
    stdout instead of raising, so the platform can surface a readable failure.
    """
    args = sys.argv[1:]
    catalog_path = AirbyteEntrypoint.extract_catalog(args)
    config_path = AirbyteEntrypoint.extract_config(args)
    state_path = AirbyteEntrypoint.extract_state(args)
    try:
        source = SourceAzureBlobStorage(
            SourceAzureBlobStorageStreamReader(),
            SourceAzureBlobStorageSpec,
            SourceAzureBlobStorage.read_catalog(catalog_path) if catalog_path else None,
            # Bug fix: config and state were previously gated on `catalog_path`, so
            # commands without a catalog (check/discover) silently got config=None
            # and state=None even when --config/--state were provided.
            SourceAzureBlobStorage.read_config(config_path) if config_path else None,
            SourceAzureBlobStorage.read_state(state_path) if state_path else None,
            cursor_cls=DefaultFileBasedCursor,
        )
        MigrateLegacyConfig.migrate(args, source)
        MigrateCredentials.migrate(args, source)
    except Exception:
        print(
            orjson.dumps(
                AirbyteMessageSerializer.dump(
                    AirbyteMessage(
                        type=Type.TRACE,
                        trace=AirbyteTraceMessage(
                            type=TraceType.ERROR,
                            emitted_at=time.time_ns() // 1_000_000,
                            error=AirbyteErrorTraceMessage(
                                message="Error starting the sync. This could be due to an invalid configuration or catalog. Please contact Support for assistance.",
                                stack_trace=traceback.format_exc(),
                            ),
                        ),
                    )
                )
            ).decode()
        )
    else:
        launch(source, args)
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
|
3
|
+
#
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from airbyte_protocol_dataclasses.models import AuthFlowType
|
|
9
|
+
|
|
10
|
+
from airbyte_cdk import AdvancedAuth, ConnectorSpecification, OAuthConfigSpecification
|
|
11
|
+
from airbyte_cdk.sources.file_based.file_based_source import FileBasedSource
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class SourceAzureBlobStorage(FileBasedSource):
    def spec(self, *args: Any, **kwargs: Any) -> ConnectorSpecification:
        """
        Returns the specification describing what fields can be configured by a user when setting up a file-based source.

        Extends the base file-based spec with `advanced_auth` so the platform can
        drive the OAuth2 consent flow for the `credentials.auth_type == "oauth2"` option.
        """
        # What the OAuth flow writes back into the connector config.
        oauth_output_spec = {
            "type": "object",
            "additionalProperties": False,
            "properties": {"refresh_token": {"type": "string", "path_in_connector_config": ["credentials", "refresh_token"]}},
        }
        # Server-side inputs supplied by the platform operator.
        server_input_spec = {
            "type": "object",
            "additionalProperties": False,
            "properties": {"client_id": {"type": "string"}, "client_secret": {"type": "string"}},
        }
        # Server-held values injected into the connector config after the flow.
        server_output_spec = {
            "type": "object",
            "additionalProperties": False,
            "properties": {
                "client_id": {"type": "string", "path_in_connector_config": ["credentials", "client_id"]},
                "client_secret": {"type": "string", "path_in_connector_config": ["credentials", "client_secret"]},
            },
        }
        # User-provided values the consent URL depends on (Azure requires the tenant).
        user_input_spec = {
            "type": "object",
            "additionalProperties": False,
            "properties": {"tenant_id": {"type": "string", "path_in_connector_config": ["credentials", "tenant_id"]}},
        }
        return ConnectorSpecification(
            documentationUrl=self.spec_class.documentation_url(),
            connectionSpecification=self.spec_class.schema(),
            advanced_auth=AdvancedAuth(
                auth_flow_type=AuthFlowType.oauth2_0,
                predicate_key=["credentials", "auth_type"],
                predicate_value="oauth2",
                oauth_config_specification=OAuthConfigSpecification(
                    complete_oauth_output_specification=oauth_output_spec,
                    complete_oauth_server_input_specification=server_input_spec,
                    complete_oauth_server_output_specification=server_output_spec,
                    oauth_user_input_from_connector_config_specification=user_input_spec,
                ),
            ),
        )
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
|
3
|
+
#
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
from typing import Any, Dict, Literal, Optional, Union
|
|
7
|
+
|
|
8
|
+
import dpath.util
|
|
9
|
+
from pydantic.v1 import AnyUrl, BaseModel, Field
|
|
10
|
+
|
|
11
|
+
from airbyte_cdk import OneOfOptionConfig
|
|
12
|
+
from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import AbstractFileBasedSpec, DeliverRawFiles, DeliverRecords
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class Oauth2(BaseModel):
    """Credentials option: user-delegated OAuth2 (refresh-token) authentication."""

    class Config(OneOfOptionConfig):
        title = "Authenticate via Oauth2"
        discriminator = "auth_type"

    # Discriminator value selecting this option inside the `credentials` oneOf.
    auth_type: Literal["oauth2"] = Field("oauth2", const=True)
    tenant_id: str = Field(title="Tenant ID", description="Tenant ID of the Microsoft Azure Application user", airbyte_secret=True)
    client_id: str = Field(
        title="Client ID",
        description="Client ID of your Microsoft developer application",
        airbyte_secret=True,
    )
    client_secret: str = Field(
        title="Client Secret",
        description="Client Secret of your Microsoft developer application",
        airbyte_secret=True,
    )
    refresh_token: str = Field(
        title="Refresh Token",
        description="Refresh Token of your Microsoft developer application",
        airbyte_secret=True,
    )
|
+
|
|
38
|
+
|
|
39
|
+
class ClientCredentials(BaseModel):
    """Credentials option: app-only OAuth2 client-credentials authentication (no user context)."""

    class Config(OneOfOptionConfig):
        title = "Authenticate via Client Credentials"
        discriminator = "auth_type"

    # Discriminator value selecting this option inside the `credentials` oneOf.
    auth_type: Literal["client_credentials"] = Field("client_credentials", const=True)
    # `app_` prefixes keep these field names distinct from the Oauth2 option's fields.
    app_tenant_id: str = Field(title="Tenant ID", description="Tenant ID of the Microsoft Azure Application", airbyte_secret=True)
    app_client_id: str = Field(
        title="Client ID",
        description="Client ID of your Microsoft developer application",
        airbyte_secret=True,
    )
    app_client_secret: str = Field(
        title="Client Secret",
        description="Client Secret of your Microsoft developer application",
        airbyte_secret=True,
    )
56
|
+
|
|
57
|
+
|
|
58
|
+
class StorageAccountKey(BaseModel):
    """Credentials option: shared storage-account-key authentication."""

    class Config(OneOfOptionConfig):
        title = "Authenticate via Storage Account Key"
        discriminator = "auth_type"

    # Discriminator value selecting this option inside the `credentials` oneOf.
    auth_type: Literal["storage_account_key"] = Field("storage_account_key", const=True)
    azure_blob_storage_account_key: str = Field(
        title="Azure Blob Storage account key",
        description="The Azure blob storage account key.",
        airbyte_secret=True,
        examples=["Z8ZkZpteggFx394vm+PJHnGTvdRncaYS+JhLKdj789YNmD+iyGTnG+PV+POiuYNhBg/ACS+LKjd%4FG3FHGN12Nd=="],
        order=3,
    )
|
|
73
|
+
class SourceAzureBlobStorageSpec(AbstractFileBasedSpec):
    """
    Connector configuration model for the Azure Blob Storage source.

    NOTE: When this Spec is changed, legacy_config_transformer.py must also be modified to uptake the changes
    because it is responsible for converting legacy Azure Blob Storage v0 configs into v1 configs using the File-Based CDK.
    """

    @classmethod
    def documentation_url(cls) -> AnyUrl:
        # pydantic v1 AnyUrl takes the scheme explicitly when constructed directly.
        return AnyUrl("https://docs.airbyte.com/integrations/sources/azure-blob-storage", scheme="https")

    # Discriminated union over the three auth options; `auth_type` picks the variant.
    credentials: Union[Oauth2, ClientCredentials, StorageAccountKey] = Field(
        title="Authentication",
        description="Credentials for connecting to the Azure Blob Storage",
        discriminator="auth_type",
        type="object",
        order=2,
    )
    azure_blob_storage_account_name: str = Field(
        title="Azure Blob Storage account name",
        description="The account's name of the Azure Blob Storage.",
        examples=["airbyte5storage"],
        order=3,
    )
    azure_blob_storage_container_name: str = Field(
        title="Azure blob storage container (Bucket) Name",
        description="The name of the Azure blob storage container.",
        examples=["airbytetescontainername"],
        order=4,
    )
    # Optional custom endpoint; when empty the reader falls back to the
    # standard https://<account>.blob.core.windows.net endpoint.
    azure_blob_storage_endpoint: Optional[str] = Field(
        title="Endpoint Domain Name",
        description="This is Azure Blob Storage endpoint domain name. Leave default value (or leave it empty if run container from "
        "command line) to use Microsoft native from example.",
        examples=["blob.core.windows.net"],
        order=11,
    )

    # Hidden toggle between record parsing and raw-file transfer, inherited from the file-based CDK.
    delivery_method: Union[DeliverRecords, DeliverRawFiles] = Field(
        title="Delivery Method",
        discriminator="delivery_type",
        type="object",
        order=7,
        display_type="radio",
        group="advanced",
        default="use_records_transfer",
        airbyte_hidden=True,
    )

    @classmethod
    def schema(cls, *args: Any, **kwargs: Any) -> Dict[str, Any]:
        """
        Generates the mapping comprised of the config fields
        """
        schema = super().schema(*args, **kwargs)

        # Hide API processing option until https://github.com/airbytehq/airbyte-platform-internal/issues/10354 is fixed
        # NOTE(review): the hard-coded `oneOf/4` index assumes the 5th file-format option in the CDK's
        # generated schema carries the `processing` oneOf — verify after any airbyte-cdk upgrade.
        processing_options = dpath.util.get(schema, "properties/streams/items/properties/format/oneOf/4/properties/processing/oneOf")
        dpath.util.set(schema, "properties/streams/items/properties/format/oneOf/4/properties/processing/oneOf", processing_options[:1])

        return schema
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
import logging
|
|
5
|
+
from io import IOBase
|
|
6
|
+
from typing import Any, Iterable, List, Mapping, MutableMapping, Optional, Union
|
|
7
|
+
|
|
8
|
+
import pytz
|
|
9
|
+
from azure.core.credentials import AccessToken, TokenCredential
|
|
10
|
+
from azure.core.exceptions import ResourceNotFoundError
|
|
11
|
+
from azure.storage.blob import BlobServiceClient, ContainerClient
|
|
12
|
+
from smart_open import open as so_open
|
|
13
|
+
|
|
14
|
+
from airbyte_cdk import AirbyteTracedException, FailureType
|
|
15
|
+
from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
|
|
16
|
+
from airbyte_cdk.sources.file_based.remote_file import UploadableRemoteFile
|
|
17
|
+
from airbyte_cdk.sources.streams.http.requests_native_auth import Oauth2Authenticator
|
|
18
|
+
|
|
19
|
+
from .spec import SourceAzureBlobStorageSpec
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class AzureClientCredentialsAuthenticator(Oauth2Authenticator, TokenCredential):
    """OAuth2 client-credentials authenticator usable both by the CDK and by the Azure SDK
    (implements the azure.core.credentials.TokenCredential protocol via `get_token`)."""

    def __init__(self, tenant_id: str, client_id: str, client_secret: str, **kwargs):
        super().__init__(
            token_refresh_endpoint=f"https://login.microsoftonline.com/{tenant_id}/oauth2/v2.0/token",
            client_id=client_id,
            client_secret=client_secret,
            grant_type="client_credentials",
            scopes=["https://storage.azure.com/.default"],
            refresh_token=None,
        )

    def build_refresh_request_body(self) -> Mapping[str, Any]:
        """
        Returns the request body to set on the refresh request

        Overridden because the client-credentials grant has no refresh token:
        only grant type, client id/secret, scopes, and any extra configured fields are sent.
        """
        payload: MutableMapping[str, Any] = {
            "grant_type": self.get_grant_type(),
            "client_id": self.get_client_id(),
            "client_secret": self.get_client_secret(),
        }

        scopes = self.get_scopes()
        if scopes:
            payload["scope"] = " ".join(scopes)

        extra_body = self.get_refresh_request_body()
        if extra_body:
            # We defer to existing oauth constructs over custom configured fields.
            for field_name, field_value in extra_body.items():
                payload.setdefault(field_name, field_value)

        return payload

    def get_token(self, *args, **kwargs) -> AccessToken:
        """Parent class handles Oauth Refresh token logic."""
        return AccessToken(token=self.get_access_token(), expires_on=int(self.get_token_expiry_date().timestamp()))
60
|
+
|
|
61
|
+
class AzureOauth2Authenticator(Oauth2Authenticator, TokenCredential):
    """
    Authenticator for Azure Blob Storage SDK to align with azure.core.credentials.TokenCredential protocol
    """

    def get_token(self, *args, **kwargs) -> AccessToken:
        """Parent class handles Oauth Refresh token logic.
        `expires_on` is ignored and set to year 2222 to align with protocol.
        """
        # The far-future expiry makes the Azure SDK never consider the token stale;
        # actual refresh is delegated to get_access_token() on every call.
        return AccessToken(token=self.get_access_token(), expires_on=7952342400)
|
+
|
|
73
|
+
class AzureBlobStorageUploadableRemoteFile(UploadableRemoteFile):
    """Remote-file record enriched with the Azure clients needed to download the blob locally."""

    # NOTE(review): annotated as class attributes because UploadableRemoteFile appears to be a
    # pydantic model — confirm that post-init attribute assignment is allowed by the base Config.
    blob_client: Any      # expected to be a BlobServiceClient (see download_to_local_directory)
    blob_properties: Any  # BlobProperties from ContainerClient.list_blobs (size, container, ...)

    def __init__(self, blob_client: Any, blob_properties: Any, **kwargs):
        super().__init__(**kwargs)
        self.blob_client = blob_client
        self.blob_properties = blob_properties

    @property
    def size(self) -> int:
        # Blob size in bytes as reported by the listing, no extra API call.
        return self.blob_properties.size

    def download_to_local_directory(self, local_file_path: str) -> None:
        """Stream the blob's full content into ``local_file_path`` (binary mode)."""
        blob_client = self.blob_client.get_blob_client(container=self.blob_properties.container, blob=self.uri)
        with open(file=local_file_path, mode="wb") as f:
            download_stream = blob_client.download_blob()
            f.write(download_stream.readall())
|
+
|
|
93
|
+
class SourceAzureBlobStorageStreamReader(AbstractFileBasedStreamReader):
    """File-based CDK stream reader backed by Azure Blob Storage.

    Lists blobs via `ContainerClient` and opens their content via smart_open's
    `azure://` scheme. Credentials are resolved lazily from the connector config.
    """

    # Lazily-resolved credential: a raw account-key string or a TokenCredential authenticator.
    _credentials = None

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._config = None

    @property
    def config(self) -> SourceAzureBlobStorageSpec:
        return self._config

    @config.setter
    def config(self, value: SourceAzureBlobStorageSpec) -> None:
        # The CDK injects the parsed spec here before any file access happens.
        self._config = value

    @property
    def account_url(self) -> str:
        """Storage account URL; defaults to the public Microsoft endpoint when none is configured."""
        if not self.config.azure_blob_storage_endpoint:
            return f"https://{self.config.azure_blob_storage_account_name}.blob.core.windows.net"
        return self.config.azure_blob_storage_endpoint

    @property
    def azure_container_client(self):
        """Fresh ContainerClient for the configured container."""
        return ContainerClient(
            self.account_url, container_name=self.config.azure_blob_storage_container_name, credential=self.azure_credentials
        )

    @property
    def azure_blob_service_client(self):
        """Fresh BlobServiceClient for the configured account.

        Bug fix: previously passed ``self._credentials`` directly, which is None until
        `azure_credentials` has been accessed at least once (e.g. when this client is
        used before any listing). Going through the property guarantees resolution.
        """
        return BlobServiceClient(self.account_url, credential=self.azure_credentials)

    @property
    def azure_credentials(self) -> Union[str, AzureOauth2Authenticator, AzureClientCredentialsAuthenticator]:
        """Resolve and cache the credential matching the configured auth_type."""
        if not self._credentials:
            if self.config.credentials.auth_type == "storage_account_key":
                # Azure SDK clients accept the raw account key string directly.
                self._credentials = self.config.credentials.azure_blob_storage_account_key
            elif self.config.credentials.auth_type == "oauth2":
                self._credentials = AzureOauth2Authenticator(
                    token_refresh_endpoint=f"https://login.microsoftonline.com/{self.config.credentials.tenant_id}/oauth2/v2.0/token",
                    client_id=self.config.credentials.client_id,
                    client_secret=self.config.credentials.client_secret,
                    refresh_token=self.config.credentials.refresh_token,
                )
            elif self.config.credentials.auth_type == "client_credentials":
                self._credentials = AzureClientCredentialsAuthenticator(
                    tenant_id=self.config.credentials.app_tenant_id,
                    client_id=self.config.credentials.app_client_id,
                    client_secret=self.config.credentials.app_client_secret,
                )

        return self._credentials

    def get_matching_files(
        self,
        globs: List[str],
        prefix: Optional[str],
        logger: logging.Logger,
    ) -> Iterable[AzureBlobStorageUploadableRemoteFile]:
        """Yield remote files under the given prefix(es) that match ``globs``.

        Raises:
            AirbyteTracedException: as a config error when the container does not exist.
        """
        prefixes = [prefix] if prefix else self.get_prefixes_from_globs(globs)
        # An empty prefix set still needs one pass over the whole container.
        prefixes = prefixes or [None]
        try:
            for prefix in prefixes:
                for blob in self.azure_container_client.list_blobs(name_starts_with=prefix):
                    remote_file = AzureBlobStorageUploadableRemoteFile(
                        uri=blob.name,
                        # Timestamps are normalized to naive UTC, matching the CDK cursor's expectations.
                        last_modified=blob.last_modified.astimezone(pytz.utc).replace(tzinfo=None),
                        blob_client=self.azure_blob_service_client,
                        blob_properties=blob,
                        created_at=blob.creation_time.astimezone(pytz.utc).replace(tzinfo=None).strftime("%Y-%m-%dT%H:%M:%S.%fZ"),
                        updated_at=blob.last_modified.astimezone(pytz.utc).replace(tzinfo=None).strftime("%Y-%m-%dT%H:%M:%S.%fZ"),
                    )
                    yield from self.filter_files_by_globs_and_start_date([remote_file], globs)
        except ResourceNotFoundError as e:
            raise AirbyteTracedException(failure_type=FailureType.config_error, internal_message=e.message, message=e.reason or e.message)

    def open_file(
        self, file: AzureBlobStorageUploadableRemoteFile, mode: FileReadMode, encoding: Optional[str], logger: logging.Logger
    ) -> IOBase:
        """Open ``file`` for reading via smart_open's azure transport.

        Raises:
            OSError: when the blob is unreachable (deleted mid-sync or missing permissions).
        """
        try:
            return so_open(
                f"azure://{self.config.azure_blob_storage_container_name}/{file.uri}",
                transport_params={"client": self.azure_blob_service_client},
                mode=mode.value,
                encoding=encoding,
            )
        except OSError:
            logger.warning(
                f"We don't have access to {file.uri}. The file appears to have become unreachable during sync."
                f"Check whether key {file.uri} exists in `{self.config.azure_blob_storage_container_name}` container and/or has proper ACL permissions"
            )
            # Bug fix: the original fell through to `return result` with `result` unbound,
            # raising a NameError that masked the real failure. Re-raise the OSError instead.
            raise