airbyte-cdk 6.44.0__py3-none-any.whl → 6.45.0.dev4100__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/connector_builder/connector_builder_handler.py +6 -45
- airbyte_cdk/connector_builder/main.py +2 -5
- airbyte_cdk/models/__init__.py +1 -0
- airbyte_cdk/models/airbyte_protocol.py +1 -3
- airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +1 -1
- airbyte_cdk/sources/declarative/async_job/job.py +0 -6
- airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +18 -18
- airbyte_cdk/sources/declarative/async_job/job_tracker.py +6 -22
- airbyte_cdk/sources/declarative/checks/__init__.py +2 -5
- airbyte_cdk/sources/declarative/checks/check_stream.py +11 -113
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +8 -0
- airbyte_cdk/sources/declarative/declarative_component_schema.yaml +49 -93
- airbyte_cdk/sources/declarative/extractors/record_selector.py +6 -1
- airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +1 -2
- airbyte_cdk/sources/declarative/interpolation/macros.py +4 -8
- airbyte_cdk/sources/declarative/manifest_declarative_source.py +2 -23
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +42 -68
- airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +4 -16
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +42 -83
- airbyte_cdk/sources/declarative/partition_routers/__init__.py +0 -4
- airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +1 -5
- airbyte_cdk/sources/declarative/retrievers/file_uploader.py +89 -0
- airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +9 -4
- airbyte_cdk/sources/declarative/transformations/add_fields.py +1 -3
- airbyte_cdk/sources/file_based/file_based_stream_reader.py +9 -9
- airbyte_cdk/sources/file_based/file_record_data.py +24 -0
- airbyte_cdk/sources/file_based/file_types/file_transfer.py +8 -15
- airbyte_cdk/sources/file_based/schema_helpers.py +11 -1
- airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +0 -1
- airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +16 -31
- airbyte_cdk/sources/file_based/stream/permissions_file_based_stream.py +1 -3
- airbyte_cdk/sources/streams/concurrent/default_stream.py +3 -0
- airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +0 -4
- airbyte_cdk/sources/types.py +11 -2
- airbyte_cdk/sources/utils/files_directory.py +15 -0
- airbyte_cdk/sources/utils/record_helper.py +8 -8
- {airbyte_cdk-6.44.0.dist-info → airbyte_cdk-6.45.0.dev4100.dist-info}/METADATA +2 -2
- {airbyte_cdk-6.44.0.dist-info → airbyte_cdk-6.45.0.dev4100.dist-info}/RECORD +42 -41
- airbyte_cdk/models/file_transfer_record_message.py +0 -13
- airbyte_cdk/sources/declarative/partition_routers/grouping_partition_router.py +0 -150
- {airbyte_cdk-6.44.0.dist-info → airbyte_cdk-6.45.0.dev4100.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-6.44.0.dist-info → airbyte_cdk-6.45.0.dev4100.dist-info}/LICENSE_SHORT +0 -0
- {airbyte_cdk-6.44.0.dist-info → airbyte_cdk-6.45.0.dev4100.dist-info}/WHEEL +0 -0
- {airbyte_cdk-6.44.0.dist-info → airbyte_cdk-6.45.0.dev4100.dist-info}/entry_points.txt +0 -0
airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py

```diff
@@ -54,11 +54,7 @@ from airbyte_cdk.sources.declarative.auth.token_provider import (
     SessionTokenProvider,
     TokenProvider,
 )
-from airbyte_cdk.sources.declarative.checks import (
-    CheckDynamicStream,
-    CheckStream,
-    DynamicStreamCheckConfig,
-)
+from airbyte_cdk.sources.declarative.checks import CheckDynamicStream, CheckStream
 from airbyte_cdk.sources.declarative.concurrency_level import ConcurrencyLevel
 from airbyte_cdk.sources.declarative.datetime.min_max_datetime import MinMaxDatetime
 from airbyte_cdk.sources.declarative.declarative_stream import DeclarativeStream
@@ -106,7 +102,6 @@ from airbyte_cdk.sources.declarative.migrations.legacy_to_per_partition_state_mi
 )
 from airbyte_cdk.sources.declarative.models import (
     CustomStateMigration,
-    GzipDecoder,
 )
 from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
     AddedFieldDefinition as AddedFieldDefinitionModel,
```
```diff
@@ -223,10 +218,10 @@ from airbyte_cdk.sources.declarative.models.declarative_component_schema import
     DynamicSchemaLoader as DynamicSchemaLoaderModel,
 )
 from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
-    DynamicStreamCheckConfig as DynamicStreamCheckConfigModel,
+    ExponentialBackoffStrategy as ExponentialBackoffStrategyModel,
 )
 from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
-    ExponentialBackoffStrategy as ExponentialBackoffStrategyModel,
+    FileUploader as FileUploaderModel,
 )
 from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
     FixedWindowCallRatePolicy as FixedWindowCallRatePolicyModel,
```
```diff
@@ -234,9 +229,6 @@ from airbyte_cdk.sources.declarative.models.declarative_component_schema import
 from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
     FlattenFields as FlattenFieldsModel,
 )
-from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
-    GroupingPartitionRouter as GroupingPartitionRouterModel,
-)
 from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
     GzipDecoder as GzipDecoderModel,
 )
@@ -395,7 +387,6 @@ from airbyte_cdk.sources.declarative.parsers.custom_code_compiler import (
 )
 from airbyte_cdk.sources.declarative.partition_routers import (
     CartesianProductStreamSlicer,
-    GroupingPartitionRouter,
     ListPartitionRouter,
     PartitionRouter,
     SinglePartitionRouter,
@@ -453,6 +444,7 @@ from airbyte_cdk.sources.declarative.retrievers import (
     SimpleRetriever,
     SimpleRetrieverTestReadDecorator,
 )
+from airbyte_cdk.sources.declarative.retrievers.file_uploader import FileUploader
 from airbyte_cdk.sources.declarative.schema import (
     ComplexFieldType,
     DefaultSchemaLoader,
@@ -566,7 +558,6 @@ class ModelToComponentFactory:
             BasicHttpAuthenticatorModel: self.create_basic_http_authenticator,
             BearerAuthenticatorModel: self.create_bearer_authenticator,
             CheckStreamModel: self.create_check_stream,
-            DynamicStreamCheckConfigModel: self.create_dynamic_stream_check_config,
             CheckDynamicStreamModel: self.create_check_dynamic_stream,
             CompositeErrorHandlerModel: self.create_composite_error_handler,
             ConcurrencyLevelModel: self.create_concurrency_level,
@@ -645,12 +636,12 @@ class ModelToComponentFactory:
             ComponentMappingDefinitionModel: self.create_components_mapping_definition,
             ZipfileDecoderModel: self.create_zipfile_decoder,
             HTTPAPIBudgetModel: self.create_http_api_budget,
+            FileUploaderModel: self.create_file_uploader,
             FixedWindowCallRatePolicyModel: self.create_fixed_window_call_rate_policy,
             MovingWindowCallRatePolicyModel: self.create_moving_window_call_rate_policy,
             UnlimitedCallRatePolicyModel: self.create_unlimited_call_rate_policy,
             RateModel: self.create_rate,
             HttpRequestRegexMatcherModel: self.create_http_request_matcher,
-            GroupingPartitionRouterModel: self.create_grouping_partition_router,
         }

         # Needed for the case where we need to perform a second parse on the fields of a custom component
```
```diff
@@ -944,36 +935,8 @@ class ModelToComponentFactory:
         )

     @staticmethod
-    def create_dynamic_stream_check_config(
-        model: DynamicStreamCheckConfigModel, config: Config, **kwargs: Any
-    ) -> DynamicStreamCheckConfig:
-        return DynamicStreamCheckConfig(
-            dynamic_stream_name=model.dynamic_stream_name,
-            stream_count=model.stream_count or 0,
-        )
-
-    def create_check_stream(
-        self, model: CheckStreamModel, config: Config, **kwargs: Any
-    ) -> CheckStream:
-        if model.dynamic_streams_check_configs is None and model.stream_names is None:
-            raise ValueError(
-                "Expected either stream_names or dynamic_streams_check_configs to be set for CheckStream"
-            )
-
-        dynamic_streams_check_configs = (
-            [
-                self._create_component_from_model(model=dynamic_stream_check_config, config=config)
-                for dynamic_stream_check_config in model.dynamic_streams_check_configs
-            ]
-            if model.dynamic_streams_check_configs
-            else []
-        )
-
-        return CheckStream(
-            stream_names=model.stream_names or [],
-            dynamic_streams_check_configs=dynamic_streams_check_configs,
-            parameters={},
-        )
+    def create_check_stream(model: CheckStreamModel, config: Config, **kwargs: Any) -> CheckStream:
+        return CheckStream(stream_names=model.stream_names, parameters={})

     @staticmethod
     def create_check_dynamic_stream(
```
```diff
@@ -1396,9 +1359,6 @@ class ModelToComponentFactory:
         )
         stream_state = self.apply_stream_state_migrations(stream_state_migrations, stream_state)

-        # Per-partition state doesn't make sense for GroupingPartitionRouter, so force the global state
-        use_global_cursor = isinstance(partition_router, GroupingPartitionRouter)
-
         # Return the concurrent cursor and state converter
         return ConcurrentPerPartitionCursor(
             cursor_factory=cursor_factory,
@@ -1410,7 +1370,6 @@ class ModelToComponentFactory:
             connector_state_manager=state_manager,
             connector_state_converter=connector_state_converter,
             cursor_field=cursor_field,
-            use_global_cursor=use_global_cursor,
         )

     @staticmethod
@@ -1796,6 +1755,11 @@ class ModelToComponentFactory:
             transformations.append(
                 self._create_component_from_model(model=transformation_model, config=config)
             )
+        file_uploader = None
+        if model.file_uploader:
+            file_uploader = self._create_component_from_model(
+                model=model.file_uploader, config=config
+            )

         retriever = self._create_component_from_model(
             model=model.retriever,
@@ -1807,6 +1771,7 @@ class ModelToComponentFactory:
             stop_condition_on_cursor=stop_condition_on_cursor,
             client_side_incremental_sync=client_side_incremental_sync,
             transformations=transformations,
+            file_uploader=file_uploader,
             incremental_sync=model.incremental_sync,
         )
         cursor_field = model.incremental_sync.cursor_field if model.incremental_sync else None
@@ -2648,6 +2613,7 @@ class ModelToComponentFactory:
         transformations: List[RecordTransformation] | None = None,
         decoder: Decoder | None = None,
         client_side_incremental_sync: Dict[str, Any] | None = None,
+        file_uploader: Optional[FileUploader] = None,
         **kwargs: Any,
     ) -> RecordSelector:
         extractor = self._create_component_from_model(
@@ -2685,6 +2651,7 @@ class ModelToComponentFactory:
             config=config,
             record_filter=record_filter,
             transformations=transformations or [],
+            file_uploader=file_uploader,
             schema_normalization=schema_normalization,
             parameters=model.parameters or {},
             transform_before_filtering=transform_before_filtering,
@@ -2742,6 +2709,7 @@ class ModelToComponentFactory:
         stop_condition_on_cursor: bool = False,
         client_side_incremental_sync: Optional[Dict[str, Any]] = None,
         transformations: List[RecordTransformation],
+        file_uploader: Optional[FileUploader] = None,
         incremental_sync: Optional[
             Union[
                 IncrementingCountCursorModel, DatetimeBasedCursorModel, CustomIncrementalSyncModel
@@ -2764,6 +2732,7 @@ class ModelToComponentFactory:
             decoder=decoder,
             transformations=transformations,
             client_side_incremental_sync=client_side_incremental_sync,
+            file_uploader=file_uploader,
         )
         url_base = (
             model.requester.url_base
@@ -3118,11 +3087,8 @@ class ModelToComponentFactory:
                 stream_slices,
                 self._job_tracker,
                 self._message_repository,
-                # FIXME work would need to be done here in order to detect if a stream as a parent stream that is bulk
                 has_bulk_parent=False,
-                #
-                # `None` == default retry is set to 3 attempts, under the hood.
-                job_max_retry=1 if self._emit_connector_builder_messages else None,
+                # FIXME work would need to be done here in order to detect if a stream as a parent stream that is bulk
             ),
             stream_slicer=stream_slicer,
             config=config,
```
```diff
@@ -3366,6 +3332,30 @@ class ModelToComponentFactory:
             matchers=matchers,
         )

+    def create_file_uploader(
+        self, model: FileUploaderModel, config: Config, **kwargs: Any
+    ) -> FileUploader:
+        name = "File Uploader"
+        requester = self._create_component_from_model(
+            model=model.requester,
+            config=config,
+            name=name,
+            **kwargs,
+        )
+        download_target_extractor = self._create_component_from_model(
+            model=model.download_target_extractor,
+            config=config,
+            name=name,
+            **kwargs,
+        )
+        return FileUploader(
+            requester=requester,
+            download_target_extractor=download_target_extractor,
+            config=config,
+            parameters=model.parameters or {},
+            filename_extractor=model.filename_extractor if model.filename_extractor else None,
+        )
+
     def create_moving_window_call_rate_policy(
         self, model: MovingWindowCallRatePolicyModel, config: Config, **kwargs: Any
     ) -> MovingWindowCallRatePolicy:
@@ -3415,34 +3405,3 @@ class ModelToComponentFactory:
         self._api_budget = self.create_component(
             model_type=HTTPAPIBudgetModel, component_definition=component_definition, config=config
         )
-
-    def create_grouping_partition_router(
-        self, model: GroupingPartitionRouterModel, config: Config, **kwargs: Any
-    ) -> GroupingPartitionRouter:
-        underlying_router = self._create_component_from_model(
-            model=model.underlying_partition_router, config=config
-        )
-        if model.group_size < 1:
-            raise ValueError(f"Group size must be greater than 0, got {model.group_size}")
-
-        # Request options in underlying partition routers are not supported for GroupingPartitionRouter
-        # because they are specific to individual partitions and cannot be aggregated or handled
-        # when grouping, potentially leading to incorrect API calls. Any request customization
-        # should be managed at the stream level through the requester's configuration.
-        if isinstance(underlying_router, SubstreamPartitionRouter):
-            if any(
-                parent_config.request_option
-                for parent_config in underlying_router.parent_stream_configs
-            ):
-                raise ValueError("Request options are not supported for GroupingPartitionRouter.")
-
-        if isinstance(underlying_router, ListPartitionRouter):
-            if underlying_router.request_option:
-                raise ValueError("Request options are not supported for GroupingPartitionRouter.")
-
-        return GroupingPartitionRouter(
-            group_size=model.group_size,
-            underlying_partition_router=underlying_router,
-            deduplicate=model.deduplicate if model.deduplicate is not None else True,
-            config=config,
-        )
```
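For orientation, here is a hypothetical manifest fragment showing how the new `file_uploader` field on a declarative stream could be wired up. The key names mirror the `FileUploaderModel` attributes consumed by `create_file_uploader` above (`requester`, `download_target_extractor`, `filename_extractor`); the nested component types and all values are illustrative assumptions, and the authoritative shape is defined in `declarative_component_schema.yaml`:

```python
# Hypothetical manifest fragment (Python dict form); only the field names are
# taken from the factory code above, everything else is an assumption.
file_uploader_fragment = {
    "file_uploader": {
        "type": "FileUploader",
        "requester": {
            "type": "HttpRequester",  # assumed requester type
            "url_base": "https://api.example.com",
            # FileUploader.upload() exposes the extracted target via
            # stream_slice.extra_fields["download_target"] (see file_uploader.py below)
            "path": "{{ stream_slice.extra_fields['download_target'] }}",
        },
        "download_target_extractor": {
            "type": "DpathExtractor",  # assumed extractor type
            "field_path": ["download_url"],
        },
        "filename_extractor": "{{ record.id }}_{{ record.file_name }}",
    },
}
```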
airbyte_cdk/sources/declarative/partition_routers/__init__.py

```diff
@@ -8,9 +8,6 @@ from airbyte_cdk.sources.declarative.partition_routers.async_job_partition_route
 from airbyte_cdk.sources.declarative.partition_routers.cartesian_product_stream_slicer import (
     CartesianProductStreamSlicer,
 )
-from airbyte_cdk.sources.declarative.partition_routers.grouping_partition_router import (
-    GroupingPartitionRouter,
-)
 from airbyte_cdk.sources.declarative.partition_routers.list_partition_router import (
     ListPartitionRouter,
 )
@@ -25,7 +22,6 @@ from airbyte_cdk.sources.declarative.partition_routers.substream_partition_route
 __all__ = [
     "AsyncJobPartitionRouter",
     "CartesianProductStreamSlicer",
-    "GroupingPartitionRouter",
     "ListPartitionRouter",
     "SinglePartitionRouter",
     "SubstreamPartitionRouter",
```
airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py

```diff
@@ -374,11 +374,7 @@ class SubstreamPartitionRouter(PartitionRouter):
         # Ignore per-partition states or invalid formats.
         if isinstance(substream_state, (list, dict)) or len(substream_state_values) != 1:
             # If a global state is present under the key "state", use its first value.
-            if (
-                "state" in stream_state
-                and isinstance(stream_state["state"], dict)
-                and stream_state["state"] != {}
-            ):
+            if "state" in stream_state and isinstance(stream_state["state"], dict):
                 substream_state = list(stream_state["state"].values())[0]
             else:
                 return {}
```
airbyte_cdk/sources/declarative/retrievers/file_uploader.py (new file)

```diff
@@ -0,0 +1,89 @@
+#
+# Copyright (c) 2025 Airbyte, Inc., all rights reserved.
+#
+
+import json
+import logging
+import uuid
+from dataclasses import InitVar, dataclass, field
+from pathlib import Path
+from typing import Any, Mapping, Optional, Union
+
+from airbyte_cdk.models import AirbyteRecordMessageFileReference
+from airbyte_cdk.sources.declarative.extractors.record_extractor import RecordExtractor
+from airbyte_cdk.sources.declarative.interpolation.interpolated_string import (
+    InterpolatedString,
+)
+from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import (
+    SafeResponse,
+)
+from airbyte_cdk.sources.declarative.requesters import Requester
+from airbyte_cdk.sources.declarative.types import Record, StreamSlice
+from airbyte_cdk.sources.types import Config
+from airbyte_cdk.sources.utils.files_directory import get_files_directory
+
+logger = logging.getLogger("airbyte")
+
+
+@dataclass
+class FileUploader:
+    requester: Requester
+    download_target_extractor: RecordExtractor
+    config: Config
+    parameters: InitVar[Mapping[str, Any]]
+
+    filename_extractor: Optional[Union[InterpolatedString, str]] = None
+    content_extractor: Optional[RecordExtractor] = None
+
+    def __post_init__(self, parameters: Mapping[str, Any]) -> None:
+        if self.filename_extractor:
+            self.filename_extractor = InterpolatedString.create(
+                self.filename_extractor,
+                parameters=parameters,
+            )
+
+    def upload(self, record: Record) -> None:
+        mocked_response = SafeResponse()
+        mocked_response.content = json.dumps(record.data).encode()
+        download_target = list(self.download_target_extractor.extract_records(mocked_response))[0]
+        if not isinstance(download_target, str):
+            raise ValueError(
+                f"download_target is expected to be a str but was {type(download_target)}: {download_target}"
+            )
+
+        response = self.requester.send_request(
+            stream_slice=StreamSlice(
+                partition={}, cursor_slice={}, extra_fields={"download_target": download_target}
+            ),
+        )
+
+        if self.content_extractor:
+            raise NotImplementedError("TODO")
+        else:
+            files_directory = Path(get_files_directory())
+
+            file_name = (
+                self.filename_extractor.eval(self.config, record=record)
+                if self.filename_extractor
+                else str(uuid.uuid4())
+            )
+            file_name = file_name.lstrip("/")
+            file_relative_path = Path(record.stream_name) / Path(file_name)
+
+            full_path = files_directory / file_relative_path
+            full_path.parent.mkdir(parents=True, exist_ok=True)
+
+            with open(str(full_path), "wb") as f:
+                f.write(response.content)
+            file_size_bytes = full_path.stat().st_size
+
+            logger.info("File uploaded successfully")
+            logger.info(f"File url: {str(full_path)}")
+            logger.info(f"File size: {file_size_bytes / 1024} KB")
+            logger.info(f"File relative path: {str(file_relative_path)}")
+
+            record.file_reference = AirbyteRecordMessageFileReference(
+                file_url=str(full_path),
+                file_relative_path=str(file_relative_path),
+                file_size_bytes=file_size_bytes,
+            )
```
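To make the control flow of `upload()` easier to follow outside the CDK, here is a minimal, dependency-free sketch of the same steps — extract a download target from the record, fetch the bytes, write them under the files directory, and build the file reference. The `fetch_bytes` and `extract_target` callables are stand-ins for the `Requester` and `RecordExtractor` components; none of this is the actual CDK API:

```python
import uuid
from pathlib import Path
from typing import Callable, Optional

def upload_sketch(
    record_data: dict,
    stream_name: str,
    files_directory: str,
    fetch_bytes: Callable[[str], bytes],    # stand-in for Requester.send_request
    extract_target: Callable[[dict], str],  # stand-in for download_target_extractor
    filename: Optional[str] = None,
) -> dict:
    download_target = extract_target(record_data)
    if not isinstance(download_target, str):
        raise ValueError(f"download_target is expected to be a str but was {type(download_target)}")

    content = fetch_bytes(download_target)

    # Fall back to a random name, strip a leading slash, and nest under the
    # stream name, mirroring the logic in FileUploader.upload().
    file_name = (filename or str(uuid.uuid4())).lstrip("/")
    file_relative_path = Path(stream_name) / file_name
    full_path = Path(files_directory) / file_relative_path
    full_path.parent.mkdir(parents=True, exist_ok=True)
    full_path.write_bytes(content)

    # Mirrors the AirbyteRecordMessageFileReference fields set on the record.
    return {
        "file_url": str(full_path),
        "file_relative_path": str(file_relative_path),
        "file_size_bytes": full_path.stat().st_size,
    }

# Usage with in-memory stand-ins:
reference = upload_sketch(
    record_data={"download_url": "https://example.com/a.pdf"},
    stream_name="documents",
    files_directory="/tmp/airbyte-files-example",
    fetch_bytes=lambda target: b"%PDF-1.4 example bytes",
    extract_target=lambda data: data["download_url"],
)
print(reference)
```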
airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py

```diff
@@ -58,11 +58,16 @@ class DeclarativePartition(Partition):
     def read(self) -> Iterable[Record]:
         for stream_data in self._retriever.read_records(self._json_schema, self._stream_slice):
             if isinstance(stream_data, Mapping):
-                yield Record(
-                    data=stream_data,
-                    stream_name=self.stream_name(),
-                    associated_slice=self._stream_slice,
+                record = (
+                    stream_data
+                    if isinstance(stream_data, Record)
+                    else Record(
+                        data=stream_data,
+                        stream_name=self.stream_name(),
+                        associated_slice=self._stream_slice,
+                    )
                 )
+                yield record
             else:
                 self._message_repository.emit_message(stream_data)

```
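The rewrite above normalizes retriever output to a single type: `Record` instances pass through untouched, while bare mappings get wrapped with the stream name and slice. A self-contained illustration of the pattern, using a stand-in `Record` dataclass rather than `airbyte_cdk.sources.types.Record`:

```python
from dataclasses import dataclass
from typing import Any, Mapping, Optional, Union

@dataclass
class Record:  # stand-in for airbyte_cdk.sources.types.Record
    data: Mapping[str, Any]
    stream_name: str
    associated_slice: Optional[Mapping[str, Any]] = None

def normalize(stream_data: Union[Record, Mapping[str, Any]],
              stream_name: str,
              stream_slice: Optional[Mapping[str, Any]]) -> Record:
    # Records already carry stream/slice context; wrap bare mappings so
    # downstream consumers always see one type.
    return (
        stream_data
        if isinstance(stream_data, Record)
        else Record(data=stream_data, stream_name=stream_name, associated_slice=stream_slice)
    )

print(normalize({"id": 1}, "documents", {"partition": "a"}))
print(normalize(Record({"id": 2}, "documents"), "documents", None))
```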
|
airbyte_cdk/sources/declarative/transformations/add_fields.py

```diff
@@ -139,9 +139,7 @@ class AddFields(RecordTransformation):
             valid_types = (parsed_field.value_type,) if parsed_field.value_type else None
             value = parsed_field.value.eval(config, valid_types=valid_types, **kwargs)
             is_empty_condition = not self.condition
-            if is_empty_condition or self._filter_interpolator.eval(
-                config, value=value, path=parsed_field.path, **kwargs
-            ):
+            if is_empty_condition or self._filter_interpolator.eval(config, value=value, **kwargs):
                 dpath.new(record, parsed_field.path, value)

     def __eq__(self, other: Any) -> bool:
```
airbyte_cdk/sources/file_based/file_based_stream_reader.py

```diff
@@ -8,16 +8,18 @@ from datetime import datetime
 from enum import Enum
 from io import IOBase
 from os import makedirs, path
-from typing import
+from typing import Iterable, List, Optional, Set, Tuple

 from wcmatch.glob import GLOBSTAR, globmatch

+from airbyte_cdk.models import AirbyteRecordMessageFileReference
 from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import AbstractFileBasedSpec
 from airbyte_cdk.sources.file_based.config.validate_config_transfer_modes import (
     include_identities_stream,
     preserve_directory_structure,
     use_file_transfer,
 )
+from airbyte_cdk.sources.file_based.file_record_data import FileRecordData
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile


@@ -148,9 +150,9 @@ class AbstractFileBasedStreamReader(ABC):
         return False

     @abstractmethod
-    def get_file(
+    def upload(
         self, file: RemoteFile, local_directory: str, logger: logging.Logger
-    ) ->
+    ) -> Tuple[FileRecordData, AirbyteRecordMessageFileReference]:
         """
         This is required for connectors that will support writing to
         files. It will handle the logic to download,get,read,acquire or
@@ -162,12 +164,10 @@
             logger (logging.Logger): Logger for logging information and errors.

         Returns:
-
-            -
-            -
-            -
-            this a mounted volume in the pod container.
-
+            AirbyteRecordMessageFileReference: A file reference object containing:
+            - staging_file_url (str): The absolute path to the referenced file in the staging area.
+            - file_size_bytes (int): The size of the referenced file in bytes.
+            - source_file_relative_path (str): The relative path to the referenced file in source.
         """
         ...

```
airbyte_cdk/sources/file_based/file_record_data.py (new file)

```diff
@@ -0,0 +1,24 @@
+#
+# Copyright (c) 2025 Airbyte, Inc., all rights reserved.
+#
+
+from datetime import datetime
+from typing import Optional
+
+from pydantic.v1 import BaseModel
+
+
+class FileRecordData(BaseModel):
+    """
+    A record in a file-based stream.
+    """
+
+    folder: str
+    filename: str
+    bytes: int
+
+    id: Optional[str] = None
+    created_at: Optional[int] = None
+    updated_at: Optional[int] = None
+    mime_type: Optional[str] = None
+    description: Optional[str] = None
```
airbyte_cdk/sources/file_based/file_types/file_transfer.py

```diff
@@ -2,34 +2,27 @@
 # Copyright (c) 2024 Airbyte, Inc., all rights reserved.
 #
 import logging
-import os
-from typing import Any, Dict, Iterable
+from typing import Iterable, Tuple

-from airbyte_cdk.
+from airbyte_cdk.models import AirbyteRecordMessageFileReference
 from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
+from airbyte_cdk.sources.file_based.file_record_data import FileRecordData
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
-
-AIRBYTE_STAGING_DIRECTORY = os.getenv("AIRBYTE_STAGING_DIRECTORY", "/staging/files")
-DEFAULT_LOCAL_DIRECTORY = "/tmp/airbyte-file-transfer"
+from airbyte_cdk.sources.utils.files_directory import get_files_directory


 class FileTransfer:
     def __init__(self) -> None:
-        self._local_directory = (
-            AIRBYTE_STAGING_DIRECTORY
-            if os.path.exists(AIRBYTE_STAGING_DIRECTORY)
-            else DEFAULT_LOCAL_DIRECTORY
-        )
+        self._local_directory = get_files_directory()

-    def get_file(
+    def upload(
         self,
-        config: FileBasedStreamConfig,
         file: RemoteFile,
         stream_reader: AbstractFileBasedStreamReader,
         logger: logging.Logger,
-    ) -> Iterable[
+    ) -> Iterable[Tuple[FileRecordData, AirbyteRecordMessageFileReference]]:
         try:
-            yield stream_reader.get_file(
+            yield stream_reader.upload(
                 file=file, local_directory=self._local_directory, logger=logger
             )
         except Exception as ex:
```
airbyte_cdk/sources/file_based/schema_helpers.py

```diff
@@ -18,9 +18,19 @@ JsonSchemaSupportedType = Union[List[str], Literal["string"], str]
 SchemaType = Mapping[str, Mapping[str, JsonSchemaSupportedType]]

 schemaless_schema = {"type": "object", "properties": {"data": {"type": "object"}}}
+
 file_transfer_schema = {
     "type": "object",
-    "properties": {
+    "properties": {
+        "folder": {"type": "string"},
+        "file_name": {"type": "string"},
+        "bytes": {"type": "integer"},
+        "id": {"type": ["null", "string"]},
+        "created_at": {"type": ["null", "integer"]},
+        "updated_at": {"type": ["null", "integer"]},
+        "mime_type": {"type": ["null", "string"]},
+        "description": {"type": ["null", "string"]},
+    },
 }

```
airbyte_cdk/sources/file_based/stream/default_file_based_stream.py

```diff
@@ -11,7 +11,7 @@ from functools import cache
 from os import path
 from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Optional, Set, Tuple, Union

-from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, FailureType, Level
+from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, AirbyteStream, FailureType, Level
 from airbyte_cdk.models import Type as MessageType
 from airbyte_cdk.sources.file_based.config.file_based_stream_config import PrimaryKeyType
 from airbyte_cdk.sources.file_based.exceptions import (
@@ -97,14 +97,7 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
         self, configured_catalog_json_schema: Dict[str, Any]
     ) -> Dict[str, Any]:
         if self.use_file_transfer:
-            return {
-                "type": "object",
-                "properties": {
-                    "file_path": {"type": "string"},
-                    "file_size": {"type": "string"},
-                    self.ab_file_name_col: {"type": "string"},
-                },
-            }
+            return file_transfer_schema
         else:
             return super()._filter_schema_invalid_properties(configured_catalog_json_schema)

@@ -145,14 +138,6 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
             record[self.ab_file_name_col] = file.uri
         return record

-    def transform_record_for_file_transfer(
-        self, record: dict[str, Any], file: RemoteFile
-    ) -> dict[str, Any]:
-        # timstamp() returns a float representing the number of seconds since the unix epoch
-        record[self.modified] = int(file.last_modified.timestamp()) * 1000
-        record[self.source_file_url] = file.uri
-        return record
-
     def read_records_from_slice(self, stream_slice: StreamSlice) -> Iterable[AirbyteMessage]:
         """
         Yield all records from all remote files in `list_files_for_this_sync`.
@@ -166,6 +151,7 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
             raise MissingSchemaError(FileBasedSourceError.MISSING_SCHEMA, stream=self.name)
         # The stream only supports a single file type, so we can use the same parser for all files
         parser = self.get_parser()
+        file_transfer = FileTransfer()
         for file in stream_slice["files"]:
             # only serialize the datetime once
             file_datetime_string = file.last_modified.strftime(self.DATE_TIME_FORMAT)
@@ -173,19 +159,13 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):

             try:
                 if self.use_file_transfer:
-
-
-                    file_transfer = FileTransfer()
-                    for record in file_transfer.get_file(
-                        self.config, file, self.stream_reader, self.logger
+                    for file_record_data, file_reference in file_transfer.upload(
+                        file=file, stream_reader=self.stream_reader, logger=self.logger
                     ):
-                        line_no += 1
-                        if not self.record_passes_validation_policy(record):
-                            n_skipped += 1
-                            continue
-                        record = self.transform_record_for_file_transfer(record, file)
                         yield stream_data_to_airbyte_message(
-                            self.name,
+                            self.name,
+                            file_record_data.dict(exclude_none=True),
+                            file_reference=file_reference,
                         )
                 else:
                     for record in parser.parse_records(
@@ -259,6 +239,8 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):

     @cache
     def get_json_schema(self) -> JsonSchema:
+        if self.use_file_transfer:
+            return file_transfer_schema
         extra_fields = {
             self.ab_last_mod_col: {"type": "string"},
             self.ab_file_name_col: {"type": "string"},
@@ -282,9 +264,7 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
         return {"type": "object", "properties": {**extra_fields, **schema["properties"]}}

     def _get_raw_json_schema(self) -> JsonSchema:
-        if self.use_file_transfer:
-            return file_transfer_schema
-        elif self.config.input_schema:
+        if self.config.input_schema:
             return self.config.get_input_schema()  # type: ignore
         elif self.config.schemaless:
             return schemaless_schema
@@ -341,6 +321,11 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
             self.config.globs or [], self.config.legacy_prefix, self.logger
         )

+    def as_airbyte_stream(self) -> AirbyteStream:
+        file_stream = super().as_airbyte_stream()
+        file_stream.is_file_based = self.use_file_transfer
+        return file_stream
+
     def infer_schema(self, files: List[RemoteFile]) -> Mapping[str, Any]:
         loop = asyncio.get_event_loop()
         schema = loop.run_until_complete(self._infer_schema(files))
```