airbyte-cdk 6.45.4.post48.dev14477787653__py3-none-any.whl → 6.45.4.post72.dev14497997772__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/models/__init__.py +0 -1
- airbyte_cdk/models/airbyte_protocol.py +3 -1
- airbyte_cdk/models/file_transfer_record_message.py +13 -0
- airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +1 -1
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +0 -8
- airbyte_cdk/sources/declarative/declarative_component_schema.yaml +0 -36
- airbyte_cdk/sources/declarative/extractors/record_selector.py +1 -6
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +0 -31
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +1 -39
- airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +4 -9
- airbyte_cdk/sources/file_based/file_based_stream_reader.py +16 -38
- airbyte_cdk/sources/file_based/file_types/file_transfer.py +15 -8
- airbyte_cdk/sources/file_based/schema_helpers.py +1 -10
- airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +12 -3
- airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +38 -15
- airbyte_cdk/sources/file_based/stream/permissions_file_based_stream.py +3 -1
- airbyte_cdk/sources/streams/concurrent/default_stream.py +0 -3
- airbyte_cdk/sources/types.py +2 -11
- airbyte_cdk/sources/utils/record_helper.py +8 -8
- airbyte_cdk/test/declarative/__init__.py +6 -0
- airbyte_cdk/test/declarative/models/__init__.py +7 -0
- airbyte_cdk/test/declarative/models/scenario.py +74 -0
- airbyte_cdk/test/declarative/utils/__init__.py +0 -0
- airbyte_cdk/test/declarative/utils/job_runner.py +159 -0
- airbyte_cdk/test/entrypoint_wrapper.py +4 -0
- airbyte_cdk/test/mock_http/response_builder.py +0 -8
- airbyte_cdk/test/standard_tests/__init__.py +46 -0
- airbyte_cdk/test/standard_tests/connector_base.py +148 -0
- airbyte_cdk/test/standard_tests/declarative_sources.py +92 -0
- airbyte_cdk/test/standard_tests/destination_base.py +16 -0
- airbyte_cdk/test/standard_tests/pytest_hooks.py +61 -0
- airbyte_cdk/test/standard_tests/source_base.py +140 -0
- {airbyte_cdk-6.45.4.post48.dev14477787653.dist-info → airbyte_cdk-6.45.4.post72.dev14497997772.dist-info}/METADATA +3 -2
- {airbyte_cdk-6.45.4.post48.dev14477787653.dist-info → airbyte_cdk-6.45.4.post72.dev14497997772.dist-info}/RECORD +38 -29
- airbyte_cdk/sources/declarative/retrievers/file_uploader.py +0 -89
- airbyte_cdk/sources/file_based/file_record_data.py +0 -22
- airbyte_cdk/sources/utils/files_directory.py +0 -15
- {airbyte_cdk-6.45.4.post48.dev14477787653.dist-info → airbyte_cdk-6.45.4.post72.dev14497997772.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-6.45.4.post48.dev14477787653.dist-info → airbyte_cdk-6.45.4.post72.dev14497997772.dist-info}/LICENSE_SHORT +0 -0
- {airbyte_cdk-6.45.4.post48.dev14477787653.dist-info → airbyte_cdk-6.45.4.post72.dev14497997772.dist-info}/WHEEL +0 -0
- {airbyte_cdk-6.45.4.post48.dev14477787653.dist-info → airbyte_cdk-6.45.4.post72.dev14497997772.dist-info}/entry_points.txt +0 -0
airbyte_cdk/models/__init__.py
CHANGED
@@ -8,6 +8,8 @@ from typing import Annotated, Any, Dict, List, Mapping, Optional, Union
|
|
8
8
|
from airbyte_protocol_dataclasses.models import * # noqa: F403 # Allow '*'
|
9
9
|
from serpyco_rs.metadata import Alias
|
10
10
|
|
11
|
+
from airbyte_cdk.models.file_transfer_record_message import AirbyteFileTransferRecordMessage
|
12
|
+
|
11
13
|
# ruff: noqa: F405 # ignore fuzzy import issues with 'import *'
|
12
14
|
|
13
15
|
|
@@ -82,7 +84,7 @@ class AirbyteMessage:
|
|
82
84
|
spec: Optional[ConnectorSpecification] = None # type: ignore [name-defined]
|
83
85
|
connectionStatus: Optional[AirbyteConnectionStatus] = None # type: ignore [name-defined]
|
84
86
|
catalog: Optional[AirbyteCatalog] = None # type: ignore [name-defined]
|
85
|
-
record: Optional[AirbyteRecordMessage] = None # type: ignore [name-defined]
|
87
|
+
record: Optional[Union[AirbyteFileTransferRecordMessage, AirbyteRecordMessage]] = None # type: ignore [name-defined]
|
86
88
|
state: Optional[AirbyteStateMessage] = None
|
87
89
|
trace: Optional[AirbyteTraceMessage] = None # type: ignore [name-defined]
|
88
90
|
control: Optional[AirbyteControlMessage] = None # type: ignore [name-defined]
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
|
2
|
+
|
3
|
+
from dataclasses import dataclass
|
4
|
+
from typing import Any, Dict, Optional
|
5
|
+
|
6
|
+
|
7
|
+
@dataclass
|
8
|
+
class AirbyteFileTransferRecordMessage:
|
9
|
+
stream: str
|
10
|
+
file: Dict[str, Any]
|
11
|
+
emitted_at: int
|
12
|
+
namespace: Optional[str] = None
|
13
|
+
data: Optional[Dict[str, Any]] = None
|
@@ -149,7 +149,7 @@ class ConcurrentReadProcessor:
|
|
149
149
|
message = stream_data_to_airbyte_message(
|
150
150
|
stream_name=record.stream_name,
|
151
151
|
data_or_message=record.data,
|
152
|
-
|
152
|
+
is_file_transfer_message=record.is_file_transfer_message,
|
153
153
|
)
|
154
154
|
stream = self._stream_name_to_instance[record.stream_name]
|
155
155
|
|
@@ -25,7 +25,6 @@ from airbyte_cdk.sources.declarative.incremental.per_partition_with_global impor
|
|
25
25
|
PerPartitionWithGlobalCursor,
|
26
26
|
)
|
27
27
|
from airbyte_cdk.sources.declarative.manifest_declarative_source import ManifestDeclarativeSource
|
28
|
-
from airbyte_cdk.sources.declarative.models import FileUploader
|
29
28
|
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
|
30
29
|
ConcurrencyLevel as ConcurrencyLevelModel,
|
31
30
|
)
|
@@ -207,10 +206,6 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
|
|
207
206
|
# these legacy Python streams the way we do low-code streams to determine if they are concurrent compatible,
|
208
207
|
# so we need to treat them as synchronous
|
209
208
|
|
210
|
-
supports_file_transfer = (
|
211
|
-
"file_uploader" in name_to_stream_mapping[declarative_stream.name]
|
212
|
-
)
|
213
|
-
|
214
209
|
if (
|
215
210
|
isinstance(declarative_stream, DeclarativeStream)
|
216
211
|
and name_to_stream_mapping[declarative_stream.name]["type"]
|
@@ -327,7 +322,6 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
|
|
327
322
|
else None,
|
328
323
|
logger=self.logger,
|
329
324
|
cursor=cursor,
|
330
|
-
supports_file_transfer=supports_file_transfer,
|
331
325
|
)
|
332
326
|
)
|
333
327
|
elif (
|
@@ -359,7 +353,6 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
|
|
359
353
|
cursor_field=None,
|
360
354
|
logger=self.logger,
|
361
355
|
cursor=final_state_cursor,
|
362
|
-
supports_file_transfer=supports_file_transfer,
|
363
356
|
)
|
364
357
|
)
|
365
358
|
elif (
|
@@ -413,7 +406,6 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
|
|
413
406
|
cursor_field=perpartition_cursor.cursor_field.cursor_field_key,
|
414
407
|
logger=self.logger,
|
415
408
|
cursor=perpartition_cursor,
|
416
|
-
supports_file_transfer=supports_file_transfer,
|
417
409
|
)
|
418
410
|
)
|
419
411
|
else:
|
@@ -1448,42 +1448,6 @@ definitions:
|
|
1448
1448
|
- "$ref": "#/definitions/LegacyToPerPartitionStateMigration"
|
1449
1449
|
- "$ref": "#/definitions/CustomStateMigration"
|
1450
1450
|
default: []
|
1451
|
-
file_uploader:
|
1452
|
-
title: File Uploader
|
1453
|
-
description: (experimental) Describes how to fetch a file
|
1454
|
-
type: object
|
1455
|
-
required:
|
1456
|
-
- type
|
1457
|
-
- requester
|
1458
|
-
- download_target_extractor
|
1459
|
-
properties:
|
1460
|
-
type:
|
1461
|
-
type: string
|
1462
|
-
enum: [ FileUploader ]
|
1463
|
-
requester:
|
1464
|
-
description: Requester component that describes how to prepare HTTP requests to send to the source API.
|
1465
|
-
anyOf:
|
1466
|
-
- "$ref": "#/definitions/CustomRequester"
|
1467
|
-
- "$ref": "#/definitions/HttpRequester"
|
1468
|
-
download_target_extractor:
|
1469
|
-
description: Responsible for fetching the url where the file is located. This is applied on each records and not on the HTTP response
|
1470
|
-
anyOf:
|
1471
|
-
- "$ref": "#/definitions/CustomRecordExtractor"
|
1472
|
-
- "$ref": "#/definitions/DpathExtractor"
|
1473
|
-
file_extractor:
|
1474
|
-
description: Responsible for fetching the content of the file. If not defined, the assumption is that the whole response body is the file content
|
1475
|
-
anyOf:
|
1476
|
-
- "$ref": "#/definitions/CustomRecordExtractor"
|
1477
|
-
- "$ref": "#/definitions/DpathExtractor"
|
1478
|
-
filename_extractor:
|
1479
|
-
description: Defines the name to store the file. Stream name is automatically added to the file path. File unique ID can be used to avoid overwriting files. Random UUID will be used if the extractor is not provided.
|
1480
|
-
type: string
|
1481
|
-
interpolation_context:
|
1482
|
-
- config
|
1483
|
-
- record
|
1484
|
-
examples:
|
1485
|
-
- "{{ record.id }}/{{ record.file_name }}/"
|
1486
|
-
- "{{ record.id }}_{{ record.file_name }}/"
|
1487
1451
|
$parameters:
|
1488
1452
|
type: object
|
1489
1453
|
additional_properties: true
|
@@ -15,7 +15,6 @@ from airbyte_cdk.sources.declarative.extractors.type_transformer import (
|
|
15
15
|
)
|
16
16
|
from airbyte_cdk.sources.declarative.interpolation import InterpolatedString
|
17
17
|
from airbyte_cdk.sources.declarative.models import SchemaNormalization
|
18
|
-
from airbyte_cdk.sources.declarative.retrievers.file_uploader import FileUploader
|
19
18
|
from airbyte_cdk.sources.declarative.transformations import RecordTransformation
|
20
19
|
from airbyte_cdk.sources.types import Config, Record, StreamSlice, StreamState
|
21
20
|
from airbyte_cdk.sources.utils.transform import TypeTransformer
|
@@ -43,7 +42,6 @@ class RecordSelector(HttpSelector):
|
|
43
42
|
record_filter: Optional[RecordFilter] = None
|
44
43
|
transformations: List[RecordTransformation] = field(default_factory=lambda: [])
|
45
44
|
transform_before_filtering: bool = False
|
46
|
-
file_uploader: Optional[FileUploader] = None
|
47
45
|
|
48
46
|
def __post_init__(self, parameters: Mapping[str, Any]) -> None:
|
49
47
|
self._parameters = parameters
|
@@ -119,10 +117,7 @@ class RecordSelector(HttpSelector):
|
|
119
117
|
transformed_filtered_data, schema=records_schema
|
120
118
|
)
|
121
119
|
for data in normalized_data:
|
122
|
-
|
123
|
-
if self.file_uploader:
|
124
|
-
self.file_uploader.upload(record)
|
125
|
-
yield record
|
120
|
+
yield Record(data=data, stream_name=self.name, associated_slice=stream_slice)
|
126
121
|
|
127
122
|
def _normalize_by_schema(
|
128
123
|
self, records: Iterable[Mapping[str, Any]], schema: Optional[Mapping[str, Any]]
|
@@ -2066,31 +2066,6 @@ class SelectiveAuthenticator(BaseModel):
|
|
2066
2066
|
parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")
|
2067
2067
|
|
2068
2068
|
|
2069
|
-
class FileUploader(BaseModel):
|
2070
|
-
type: Literal["FileUploader"]
|
2071
|
-
requester: Union[CustomRequester, HttpRequester] = Field(
|
2072
|
-
...,
|
2073
|
-
description="Requester component that describes how to prepare HTTP requests to send to the source API.",
|
2074
|
-
)
|
2075
|
-
download_target_extractor: Union[CustomRecordExtractor, DpathExtractor] = Field(
|
2076
|
-
...,
|
2077
|
-
description="Responsible for fetching the url where the file is located. This is applied on each records and not on the HTTP response",
|
2078
|
-
)
|
2079
|
-
file_extractor: Optional[Union[CustomRecordExtractor, DpathExtractor]] = Field(
|
2080
|
-
None,
|
2081
|
-
description="Responsible for fetching the content of the file. If not defined, the assumption is that the whole response body is the file content",
|
2082
|
-
)
|
2083
|
-
filename_extractor: Optional[str] = Field(
|
2084
|
-
None,
|
2085
|
-
description="Defines the name to store the file. Stream name is automatically added to the file path. File unique ID can be used to avoid overwriting files. Random UUID will be used if the extractor is not provided.",
|
2086
|
-
examples=[
|
2087
|
-
"{{ record.id }}/{{ record.file_name }}/",
|
2088
|
-
"{{ record.id }}_{{ record.file_name }}/",
|
2089
|
-
],
|
2090
|
-
)
|
2091
|
-
parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")
|
2092
|
-
|
2093
|
-
|
2094
2069
|
class DeclarativeStream(BaseModel):
|
2095
2070
|
class Config:
|
2096
2071
|
extra = Extra.allow
|
@@ -2149,11 +2124,6 @@ class DeclarativeStream(BaseModel):
|
|
2149
2124
|
description="Array of state migrations to be applied on the input state",
|
2150
2125
|
title="State Migrations",
|
2151
2126
|
)
|
2152
|
-
file_uploader: Optional[FileUploader] = Field(
|
2153
|
-
None,
|
2154
|
-
description="(experimental) Describes how to fetch a file",
|
2155
|
-
title="File Uploader",
|
2156
|
-
)
|
2157
2127
|
parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")
|
2158
2128
|
|
2159
2129
|
|
@@ -2647,7 +2617,6 @@ CompositeErrorHandler.update_forward_refs()
|
|
2647
2617
|
DeclarativeSource1.update_forward_refs()
|
2648
2618
|
DeclarativeSource2.update_forward_refs()
|
2649
2619
|
SelectiveAuthenticator.update_forward_refs()
|
2650
|
-
FileUploader.update_forward_refs()
|
2651
2620
|
DeclarativeStream.update_forward_refs()
|
2652
2621
|
SessionTokenAuthenticator.update_forward_refs()
|
2653
2622
|
DynamicSchemaLoader.update_forward_refs()
|
@@ -106,6 +106,7 @@ from airbyte_cdk.sources.declarative.migrations.legacy_to_per_partition_state_mi
|
|
106
106
|
)
|
107
107
|
from airbyte_cdk.sources.declarative.models import (
|
108
108
|
CustomStateMigration,
|
109
|
+
GzipDecoder,
|
109
110
|
)
|
110
111
|
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
|
111
112
|
AddedFieldDefinition as AddedFieldDefinitionModel,
|
@@ -227,9 +228,6 @@ from airbyte_cdk.sources.declarative.models.declarative_component_schema import
|
|
227
228
|
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
|
228
229
|
ExponentialBackoffStrategy as ExponentialBackoffStrategyModel,
|
229
230
|
)
|
230
|
-
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
|
231
|
-
FileUploader as FileUploaderModel,
|
232
|
-
)
|
233
231
|
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
|
234
232
|
FixedWindowCallRatePolicy as FixedWindowCallRatePolicyModel,
|
235
233
|
)
|
@@ -481,7 +479,6 @@ from airbyte_cdk.sources.declarative.retrievers import (
|
|
481
479
|
SimpleRetriever,
|
482
480
|
SimpleRetrieverTestReadDecorator,
|
483
481
|
)
|
484
|
-
from airbyte_cdk.sources.declarative.retrievers.file_uploader import FileUploader
|
485
482
|
from airbyte_cdk.sources.declarative.schema import (
|
486
483
|
ComplexFieldType,
|
487
484
|
DefaultSchemaLoader,
|
@@ -679,7 +676,6 @@ class ModelToComponentFactory:
|
|
679
676
|
ComponentMappingDefinitionModel: self.create_components_mapping_definition,
|
680
677
|
ZipfileDecoderModel: self.create_zipfile_decoder,
|
681
678
|
HTTPAPIBudgetModel: self.create_http_api_budget,
|
682
|
-
FileUploaderModel: self.create_file_uploader,
|
683
679
|
FixedWindowCallRatePolicyModel: self.create_fixed_window_call_rate_policy,
|
684
680
|
MovingWindowCallRatePolicyModel: self.create_moving_window_call_rate_policy,
|
685
681
|
UnlimitedCallRatePolicyModel: self.create_unlimited_call_rate_policy,
|
@@ -1842,11 +1838,6 @@ class ModelToComponentFactory:
|
|
1842
1838
|
transformations.append(
|
1843
1839
|
self._create_component_from_model(model=transformation_model, config=config)
|
1844
1840
|
)
|
1845
|
-
file_uploader = None
|
1846
|
-
if model.file_uploader:
|
1847
|
-
file_uploader = self._create_component_from_model(
|
1848
|
-
model=model.file_uploader, config=config
|
1849
|
-
)
|
1850
1841
|
|
1851
1842
|
retriever = self._create_component_from_model(
|
1852
1843
|
model=model.retriever,
|
@@ -1858,7 +1849,6 @@ class ModelToComponentFactory:
|
|
1858
1849
|
stop_condition_on_cursor=stop_condition_on_cursor,
|
1859
1850
|
client_side_incremental_sync=client_side_incremental_sync,
|
1860
1851
|
transformations=transformations,
|
1861
|
-
file_uploader=file_uploader,
|
1862
1852
|
incremental_sync=model.incremental_sync,
|
1863
1853
|
)
|
1864
1854
|
cursor_field = model.incremental_sync.cursor_field if model.incremental_sync else None
|
@@ -2804,7 +2794,6 @@ class ModelToComponentFactory:
|
|
2804
2794
|
transformations: List[RecordTransformation] | None = None,
|
2805
2795
|
decoder: Decoder | None = None,
|
2806
2796
|
client_side_incremental_sync: Dict[str, Any] | None = None,
|
2807
|
-
file_uploader: Optional[FileUploader] = None,
|
2808
2797
|
**kwargs: Any,
|
2809
2798
|
) -> RecordSelector:
|
2810
2799
|
extractor = self._create_component_from_model(
|
@@ -2842,7 +2831,6 @@ class ModelToComponentFactory:
|
|
2842
2831
|
config=config,
|
2843
2832
|
record_filter=record_filter,
|
2844
2833
|
transformations=transformations or [],
|
2845
|
-
file_uploader=file_uploader,
|
2846
2834
|
schema_normalization=schema_normalization,
|
2847
2835
|
parameters=model.parameters or {},
|
2848
2836
|
transform_before_filtering=transform_before_filtering,
|
@@ -2900,7 +2888,6 @@ class ModelToComponentFactory:
|
|
2900
2888
|
stop_condition_on_cursor: bool = False,
|
2901
2889
|
client_side_incremental_sync: Optional[Dict[str, Any]] = None,
|
2902
2890
|
transformations: List[RecordTransformation],
|
2903
|
-
file_uploader: Optional[FileUploader] = None,
|
2904
2891
|
incremental_sync: Optional[
|
2905
2892
|
Union[
|
2906
2893
|
IncrementingCountCursorModel, DatetimeBasedCursorModel, CustomIncrementalSyncModel
|
@@ -2921,7 +2908,6 @@ class ModelToComponentFactory:
|
|
2921
2908
|
decoder=decoder,
|
2922
2909
|
transformations=transformations,
|
2923
2910
|
client_side_incremental_sync=client_side_incremental_sync,
|
2924
|
-
file_uploader=file_uploader,
|
2925
2911
|
)
|
2926
2912
|
|
2927
2913
|
query_properties: Optional[QueryProperties] = None
|
@@ -3588,30 +3574,6 @@ class ModelToComponentFactory:
|
|
3588
3574
|
matchers=matchers,
|
3589
3575
|
)
|
3590
3576
|
|
3591
|
-
def create_file_uploader(
|
3592
|
-
self, model: FileUploaderModel, config: Config, **kwargs: Any
|
3593
|
-
) -> FileUploader:
|
3594
|
-
name = "File Uploader"
|
3595
|
-
requester = self._create_component_from_model(
|
3596
|
-
model=model.requester,
|
3597
|
-
config=config,
|
3598
|
-
name=name,
|
3599
|
-
**kwargs,
|
3600
|
-
)
|
3601
|
-
download_target_extractor = self._create_component_from_model(
|
3602
|
-
model=model.download_target_extractor,
|
3603
|
-
config=config,
|
3604
|
-
name=name,
|
3605
|
-
**kwargs,
|
3606
|
-
)
|
3607
|
-
return FileUploader(
|
3608
|
-
requester=requester,
|
3609
|
-
download_target_extractor=download_target_extractor,
|
3610
|
-
config=config,
|
3611
|
-
parameters=model.parameters or {},
|
3612
|
-
filename_extractor=model.filename_extractor if model.filename_extractor else None,
|
3613
|
-
)
|
3614
|
-
|
3615
3577
|
def create_moving_window_call_rate_policy(
|
3616
3578
|
self, model: MovingWindowCallRatePolicyModel, config: Config, **kwargs: Any
|
3617
3579
|
) -> MovingWindowCallRatePolicy:
|
@@ -58,16 +58,11 @@ class DeclarativePartition(Partition):
|
|
58
58
|
def read(self) -> Iterable[Record]:
|
59
59
|
for stream_data in self._retriever.read_records(self._json_schema, self._stream_slice):
|
60
60
|
if isinstance(stream_data, Mapping):
|
61
|
-
|
62
|
-
stream_data
|
63
|
-
|
64
|
-
|
65
|
-
data=stream_data,
|
66
|
-
stream_name=self.stream_name(),
|
67
|
-
associated_slice=self._stream_slice,
|
68
|
-
)
|
61
|
+
yield Record(
|
62
|
+
data=stream_data,
|
63
|
+
stream_name=self.stream_name(),
|
64
|
+
associated_slice=self._stream_slice,
|
69
65
|
)
|
70
|
-
yield record
|
71
66
|
else:
|
72
67
|
self._message_repository.emit_message(stream_data)
|
73
68
|
|
@@ -8,18 +8,16 @@ from datetime import datetime
|
|
8
8
|
from enum import Enum
|
9
9
|
from io import IOBase
|
10
10
|
from os import makedirs, path
|
11
|
-
from typing import Any,
|
11
|
+
from typing import Any, Dict, Iterable, List, Optional, Set
|
12
12
|
|
13
13
|
from wcmatch.glob import GLOBSTAR, globmatch
|
14
14
|
|
15
|
-
from airbyte_cdk.models import AirbyteRecordMessageFileReference
|
16
15
|
from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import AbstractFileBasedSpec
|
17
16
|
from airbyte_cdk.sources.file_based.config.validate_config_transfer_modes import (
|
18
17
|
include_identities_stream,
|
19
18
|
preserve_directory_structure,
|
20
19
|
use_file_transfer,
|
21
20
|
)
|
22
|
-
from airbyte_cdk.sources.file_based.file_record_data import FileRecordData
|
23
21
|
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
|
24
22
|
|
25
23
|
|
@@ -30,10 +28,6 @@ class FileReadMode(Enum):
|
|
30
28
|
|
31
29
|
class AbstractFileBasedStreamReader(ABC):
|
32
30
|
DATE_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"
|
33
|
-
FILE_RELATIVE_PATH = "file_relative_path"
|
34
|
-
FILE_NAME = "file_name"
|
35
|
-
LOCAL_FILE_PATH = "local_file_path"
|
36
|
-
FILE_FOLDER = "file_folder"
|
37
31
|
|
38
32
|
def __init__(self) -> None:
|
39
33
|
self._config = None
|
@@ -154,9 +148,9 @@ class AbstractFileBasedStreamReader(ABC):
|
|
154
148
|
return False
|
155
149
|
|
156
150
|
@abstractmethod
|
157
|
-
def
|
151
|
+
def get_file(
|
158
152
|
self, file: RemoteFile, local_directory: str, logger: logging.Logger
|
159
|
-
) ->
|
153
|
+
) -> Dict[str, Any]:
|
160
154
|
"""
|
161
155
|
This is required for connectors that will support writing to
|
162
156
|
files. It will handle the logic to download,get,read,acquire or
|
@@ -168,41 +162,25 @@ class AbstractFileBasedStreamReader(ABC):
|
|
168
162
|
logger (logging.Logger): Logger for logging information and errors.
|
169
163
|
|
170
164
|
Returns:
|
171
|
-
|
172
|
-
-
|
173
|
-
-
|
174
|
-
-
|
165
|
+
dict: A dictionary containing the following:
|
166
|
+
- "file_url" (str): The absolute path of the downloaded file.
|
167
|
+
- "bytes" (int): The file size in bytes.
|
168
|
+
- "file_relative_path" (str): The relative path of the file for local storage. Is relative to local_directory as
|
169
|
+
this is a mounted volume in the pod container.
|
170
|
+
|
175
171
|
"""
|
176
172
|
...
|
177
173
|
|
178
|
-
def _get_file_transfer_paths(
|
179
|
-
self, source_file_relative_path: str, staging_directory: str
|
180
|
-
) -> MutableMapping[str, Any]:
|
181
|
-
"""
|
182
|
-
This method is used to get the file transfer paths for a given source file relative path and local directory.
|
183
|
-
It returns a dictionary with the following keys:
|
184
|
-
- FILE_RELATIVE_PATH: The relative path to file in reference to the staging directory.
|
185
|
-
- LOCAL_FILE_PATH: The absolute path to the file.
|
186
|
-
- FILE_NAME: The name of the referenced file.
|
187
|
-
- FILE_FOLDER: The folder of the referenced file.
|
188
|
-
"""
|
174
|
+
def _get_file_transfer_paths(self, file: RemoteFile, local_directory: str) -> List[str]:
|
189
175
|
preserve_directory_structure = self.preserve_directory_structure()
|
190
|
-
|
191
|
-
file_name = path.basename(source_file_relative_path)
|
192
|
-
file_folder = path.dirname(source_file_relative_path)
|
193
176
|
if preserve_directory_structure:
|
194
177
|
# Remove left slashes from source path format to make relative path for writing locally
|
195
|
-
file_relative_path =
|
178
|
+
file_relative_path = file.uri.lstrip("/")
|
196
179
|
else:
|
197
|
-
file_relative_path =
|
198
|
-
local_file_path = path.join(
|
180
|
+
file_relative_path = path.basename(file.uri)
|
181
|
+
local_file_path = path.join(local_directory, file_relative_path)
|
182
|
+
|
199
183
|
# Ensure the local directory exists
|
200
184
|
makedirs(path.dirname(local_file_path), exist_ok=True)
|
201
|
-
|
202
|
-
|
203
|
-
self.FILE_RELATIVE_PATH: file_relative_path,
|
204
|
-
self.LOCAL_FILE_PATH: local_file_path,
|
205
|
-
self.FILE_NAME: file_name,
|
206
|
-
self.FILE_FOLDER: file_folder,
|
207
|
-
}
|
208
|
-
return file_paths
|
185
|
+
absolute_file_path = path.abspath(local_file_path)
|
186
|
+
return [file_relative_path, local_file_path, absolute_file_path]
|
@@ -2,27 +2,34 @@
|
|
2
2
|
# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
|
3
3
|
#
|
4
4
|
import logging
|
5
|
-
|
5
|
+
import os
|
6
|
+
from typing import Any, Dict, Iterable
|
6
7
|
|
7
|
-
from airbyte_cdk.
|
8
|
+
from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
|
8
9
|
from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
|
9
|
-
from airbyte_cdk.sources.file_based.file_record_data import FileRecordData
|
10
10
|
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
|
11
|
-
|
11
|
+
|
12
|
+
AIRBYTE_STAGING_DIRECTORY = os.getenv("AIRBYTE_STAGING_DIRECTORY", "/staging/files")
|
13
|
+
DEFAULT_LOCAL_DIRECTORY = "/tmp/airbyte-file-transfer"
|
12
14
|
|
13
15
|
|
14
16
|
class FileTransfer:
|
15
17
|
def __init__(self) -> None:
|
16
|
-
self._local_directory =
|
18
|
+
self._local_directory = (
|
19
|
+
AIRBYTE_STAGING_DIRECTORY
|
20
|
+
if os.path.exists(AIRBYTE_STAGING_DIRECTORY)
|
21
|
+
else DEFAULT_LOCAL_DIRECTORY
|
22
|
+
)
|
17
23
|
|
18
|
-
def
|
24
|
+
def get_file(
|
19
25
|
self,
|
26
|
+
config: FileBasedStreamConfig,
|
20
27
|
file: RemoteFile,
|
21
28
|
stream_reader: AbstractFileBasedStreamReader,
|
22
29
|
logger: logging.Logger,
|
23
|
-
) -> Iterable[
|
30
|
+
) -> Iterable[Dict[str, Any]]:
|
24
31
|
try:
|
25
|
-
yield stream_reader.
|
32
|
+
yield stream_reader.get_file(
|
26
33
|
file=file, local_directory=self._local_directory, logger=logger
|
27
34
|
)
|
28
35
|
except Exception as ex:
|
@@ -18,18 +18,9 @@ JsonSchemaSupportedType = Union[List[str], Literal["string"], str]
|
|
18
18
|
SchemaType = Mapping[str, Mapping[str, JsonSchemaSupportedType]]
|
19
19
|
|
20
20
|
schemaless_schema = {"type": "object", "properties": {"data": {"type": "object"}}}
|
21
|
-
|
22
21
|
file_transfer_schema = {
|
23
22
|
"type": "object",
|
24
|
-
"properties": {
|
25
|
-
"folder": {"type": "string"},
|
26
|
-
"file_name": {"type": "string"},
|
27
|
-
"source_uri": {"type": "string"},
|
28
|
-
"bytes": {"type": "integer"},
|
29
|
-
"id": {"type": ["null", "string"]},
|
30
|
-
"updated_at": {"type": ["null", "string"]},
|
31
|
-
"mime_type": {"type": ["null", "string"]},
|
32
|
-
},
|
23
|
+
"properties": {"data": {"type": "object"}, "file": {"type": "object"}},
|
33
24
|
}
|
34
25
|
|
35
26
|
|
@@ -4,7 +4,7 @@
|
|
4
4
|
|
5
5
|
import copy
|
6
6
|
import logging
|
7
|
-
from functools import lru_cache
|
7
|
+
from functools import cache, lru_cache
|
8
8
|
from typing import TYPE_CHECKING, Any, Iterable, List, Mapping, MutableMapping, Optional, Union
|
9
9
|
|
10
10
|
from typing_extensions import deprecated
|
@@ -258,14 +258,19 @@ class FileBasedStreamPartition(Partition):
|
|
258
258
|
and record_data.record is not None
|
259
259
|
):
|
260
260
|
# `AirbyteMessage`s of type `Record` should also be yielded so they are enqueued
|
261
|
-
|
261
|
+
# If the stream is flagged for file_transfer, the record data should be read from the `file` key
|
262
|
+
record_message_data = (
|
263
|
+
record_data.record.file
|
264
|
+
if self._use_file_transfer()
|
265
|
+
else record_data.record.data
|
266
|
+
)
|
262
267
|
if not record_message_data:
|
263
268
|
raise ExceptionWithDisplayMessage("A record without data was found")
|
264
269
|
else:
|
265
270
|
yield Record(
|
266
271
|
data=record_message_data,
|
267
272
|
stream_name=self.stream_name(),
|
268
|
-
|
273
|
+
is_file_transfer_message=self._use_file_transfer(),
|
269
274
|
)
|
270
275
|
else:
|
271
276
|
self._message_repository.emit_message(record_data)
|
@@ -301,6 +306,10 @@ class FileBasedStreamPartition(Partition):
|
|
301
306
|
def stream_name(self) -> str:
|
302
307
|
return self._stream.name
|
303
308
|
|
309
|
+
@cache
|
310
|
+
def _use_file_transfer(self) -> bool:
|
311
|
+
return hasattr(self._stream, "use_file_transfer") and self._stream.use_file_transfer
|
312
|
+
|
304
313
|
def __repr__(self) -> str:
|
305
314
|
return f"FileBasedStreamPartition({self._stream.name}, {self._slice})"
|
306
315
|
|