airbyte-cdk 6.44.0__py3-none-any.whl → 6.45.0.dev4100__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. airbyte_cdk/connector_builder/connector_builder_handler.py +6 -45
  2. airbyte_cdk/connector_builder/main.py +2 -5
  3. airbyte_cdk/models/__init__.py +1 -0
  4. airbyte_cdk/models/airbyte_protocol.py +1 -3
  5. airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +1 -1
  6. airbyte_cdk/sources/declarative/async_job/job.py +0 -6
  7. airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +18 -18
  8. airbyte_cdk/sources/declarative/async_job/job_tracker.py +6 -22
  9. airbyte_cdk/sources/declarative/checks/__init__.py +2 -5
  10. airbyte_cdk/sources/declarative/checks/check_stream.py +11 -113
  11. airbyte_cdk/sources/declarative/concurrent_declarative_source.py +8 -0
  12. airbyte_cdk/sources/declarative/declarative_component_schema.yaml +49 -93
  13. airbyte_cdk/sources/declarative/extractors/record_selector.py +6 -1
  14. airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +1 -2
  15. airbyte_cdk/sources/declarative/interpolation/macros.py +4 -8
  16. airbyte_cdk/sources/declarative/manifest_declarative_source.py +2 -23
  17. airbyte_cdk/sources/declarative/models/declarative_component_schema.py +42 -68
  18. airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +4 -16
  19. airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +42 -83
  20. airbyte_cdk/sources/declarative/partition_routers/__init__.py +0 -4
  21. airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +1 -5
  22. airbyte_cdk/sources/declarative/retrievers/file_uploader.py +89 -0
  23. airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +9 -4
  24. airbyte_cdk/sources/declarative/transformations/add_fields.py +1 -3
  25. airbyte_cdk/sources/file_based/file_based_stream_reader.py +9 -9
  26. airbyte_cdk/sources/file_based/file_record_data.py +24 -0
  27. airbyte_cdk/sources/file_based/file_types/file_transfer.py +8 -15
  28. airbyte_cdk/sources/file_based/schema_helpers.py +11 -1
  29. airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +0 -1
  30. airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +16 -31
  31. airbyte_cdk/sources/file_based/stream/permissions_file_based_stream.py +1 -3
  32. airbyte_cdk/sources/streams/concurrent/default_stream.py +3 -0
  33. airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +0 -4
  34. airbyte_cdk/sources/types.py +11 -2
  35. airbyte_cdk/sources/utils/files_directory.py +15 -0
  36. airbyte_cdk/sources/utils/record_helper.py +8 -8
  37. {airbyte_cdk-6.44.0.dist-info → airbyte_cdk-6.45.0.dev4100.dist-info}/METADATA +2 -2
  38. {airbyte_cdk-6.44.0.dist-info → airbyte_cdk-6.45.0.dev4100.dist-info}/RECORD +42 -41
  39. airbyte_cdk/models/file_transfer_record_message.py +0 -13
  40. airbyte_cdk/sources/declarative/partition_routers/grouping_partition_router.py +0 -150
  41. {airbyte_cdk-6.44.0.dist-info → airbyte_cdk-6.45.0.dev4100.dist-info}/LICENSE.txt +0 -0
  42. {airbyte_cdk-6.44.0.dist-info → airbyte_cdk-6.45.0.dev4100.dist-info}/LICENSE_SHORT +0 -0
  43. {airbyte_cdk-6.44.0.dist-info → airbyte_cdk-6.45.0.dev4100.dist-info}/WHEEL +0 -0
  44. {airbyte_cdk-6.44.0.dist-info → airbyte_cdk-6.45.0.dev4100.dist-info}/entry_points.txt +0 -0
@@ -54,11 +54,7 @@ from airbyte_cdk.sources.declarative.auth.token_provider import (
     SessionTokenProvider,
     TokenProvider,
 )
-from airbyte_cdk.sources.declarative.checks import (
-    CheckDynamicStream,
-    CheckStream,
-    DynamicStreamCheckConfig,
-)
+from airbyte_cdk.sources.declarative.checks import CheckDynamicStream, CheckStream
 from airbyte_cdk.sources.declarative.concurrency_level import ConcurrencyLevel
 from airbyte_cdk.sources.declarative.datetime.min_max_datetime import MinMaxDatetime
 from airbyte_cdk.sources.declarative.declarative_stream import DeclarativeStream
@@ -106,7 +102,6 @@ from airbyte_cdk.sources.declarative.migrations.legacy_to_per_partition_state_mi
 )
 from airbyte_cdk.sources.declarative.models import (
     CustomStateMigration,
-    GzipDecoder,
 )
 from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
     AddedFieldDefinition as AddedFieldDefinitionModel,
@@ -223,10 +218,10 @@ from airbyte_cdk.sources.declarative.models.declarative_component_schema import
     DynamicSchemaLoader as DynamicSchemaLoaderModel,
 )
 from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
-    DynamicStreamCheckConfig as DynamicStreamCheckConfigModel,
+    ExponentialBackoffStrategy as ExponentialBackoffStrategyModel,
 )
 from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
-    ExponentialBackoffStrategy as ExponentialBackoffStrategyModel,
+    FileUploader as FileUploaderModel,
 )
 from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
     FixedWindowCallRatePolicy as FixedWindowCallRatePolicyModel,
@@ -234,9 +229,6 @@ from airbyte_cdk.sources.declarative.models.declarative_component_schema import
 from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
     FlattenFields as FlattenFieldsModel,
 )
-from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
-    GroupingPartitionRouter as GroupingPartitionRouterModel,
-)
 from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
     GzipDecoder as GzipDecoderModel,
 )
@@ -395,7 +387,6 @@ from airbyte_cdk.sources.declarative.parsers.custom_code_compiler import (
 )
 from airbyte_cdk.sources.declarative.partition_routers import (
     CartesianProductStreamSlicer,
-    GroupingPartitionRouter,
     ListPartitionRouter,
     PartitionRouter,
     SinglePartitionRouter,
@@ -453,6 +444,7 @@ from airbyte_cdk.sources.declarative.retrievers import (
     SimpleRetriever,
     SimpleRetrieverTestReadDecorator,
 )
+from airbyte_cdk.sources.declarative.retrievers.file_uploader import FileUploader
 from airbyte_cdk.sources.declarative.schema import (
     ComplexFieldType,
     DefaultSchemaLoader,
@@ -566,7 +558,6 @@ class ModelToComponentFactory:
             BasicHttpAuthenticatorModel: self.create_basic_http_authenticator,
             BearerAuthenticatorModel: self.create_bearer_authenticator,
             CheckStreamModel: self.create_check_stream,
-            DynamicStreamCheckConfigModel: self.create_dynamic_stream_check_config,
             CheckDynamicStreamModel: self.create_check_dynamic_stream,
             CompositeErrorHandlerModel: self.create_composite_error_handler,
             ConcurrencyLevelModel: self.create_concurrency_level,
@@ -645,12 +636,12 @@ class ModelToComponentFactory:
             ComponentMappingDefinitionModel: self.create_components_mapping_definition,
             ZipfileDecoderModel: self.create_zipfile_decoder,
             HTTPAPIBudgetModel: self.create_http_api_budget,
+            FileUploaderModel: self.create_file_uploader,
             FixedWindowCallRatePolicyModel: self.create_fixed_window_call_rate_policy,
             MovingWindowCallRatePolicyModel: self.create_moving_window_call_rate_policy,
             UnlimitedCallRatePolicyModel: self.create_unlimited_call_rate_policy,
             RateModel: self.create_rate,
             HttpRequestRegexMatcherModel: self.create_http_request_matcher,
-            GroupingPartitionRouterModel: self.create_grouping_partition_router,
         }

         # Needed for the case where we need to perform a second parse on the fields of a custom component
@@ -944,36 +935,8 @@ class ModelToComponentFactory:
         )

     @staticmethod
-    def create_dynamic_stream_check_config(
-        model: DynamicStreamCheckConfigModel, config: Config, **kwargs: Any
-    ) -> DynamicStreamCheckConfig:
-        return DynamicStreamCheckConfig(
-            dynamic_stream_name=model.dynamic_stream_name,
-            stream_count=model.stream_count or 0,
-        )
-
-    def create_check_stream(
-        self, model: CheckStreamModel, config: Config, **kwargs: Any
-    ) -> CheckStream:
-        if model.dynamic_streams_check_configs is None and model.stream_names is None:
-            raise ValueError(
-                "Expected either stream_names or dynamic_streams_check_configs to be set for CheckStream"
-            )
-
-        dynamic_streams_check_configs = (
-            [
-                self._create_component_from_model(model=dynamic_stream_check_config, config=config)
-                for dynamic_stream_check_config in model.dynamic_streams_check_configs
-            ]
-            if model.dynamic_streams_check_configs
-            else []
-        )
-
-        return CheckStream(
-            stream_names=model.stream_names or [],
-            dynamic_streams_check_configs=dynamic_streams_check_configs,
-            parameters={},
-        )
+    def create_check_stream(model: CheckStreamModel, config: Config, **kwargs: Any) -> CheckStream:
+        return CheckStream(stream_names=model.stream_names, parameters={})

     @staticmethod
     def create_check_dynamic_stream(
@@ -1396,9 +1359,6 @@
         )
         stream_state = self.apply_stream_state_migrations(stream_state_migrations, stream_state)

-        # Per-partition state doesn't make sense for GroupingPartitionRouter, so force the global state
-        use_global_cursor = isinstance(partition_router, GroupingPartitionRouter)
-
         # Return the concurrent cursor and state converter
         return ConcurrentPerPartitionCursor(
             cursor_factory=cursor_factory,
@@ -1410,7 +1370,6 @@
             connector_state_manager=state_manager,
             connector_state_converter=connector_state_converter,
             cursor_field=cursor_field,
-            use_global_cursor=use_global_cursor,
         )

     @staticmethod
@@ -1796,6 +1755,11 @@
                 transformations.append(
                     self._create_component_from_model(model=transformation_model, config=config)
                 )
+        file_uploader = None
+        if model.file_uploader:
+            file_uploader = self._create_component_from_model(
+                model=model.file_uploader, config=config
+            )

         retriever = self._create_component_from_model(
             model=model.retriever,
@@ -1807,6 +1771,7 @@
             stop_condition_on_cursor=stop_condition_on_cursor,
             client_side_incremental_sync=client_side_incremental_sync,
             transformations=transformations,
+            file_uploader=file_uploader,
             incremental_sync=model.incremental_sync,
         )
         cursor_field = model.incremental_sync.cursor_field if model.incremental_sync else None
@@ -2648,6 +2613,7 @@
         transformations: List[RecordTransformation] | None = None,
         decoder: Decoder | None = None,
         client_side_incremental_sync: Dict[str, Any] | None = None,
+        file_uploader: Optional[FileUploader] = None,
         **kwargs: Any,
     ) -> RecordSelector:
         extractor = self._create_component_from_model(
@@ -2685,6 +2651,7 @@
             config=config,
             record_filter=record_filter,
             transformations=transformations or [],
+            file_uploader=file_uploader,
             schema_normalization=schema_normalization,
             parameters=model.parameters or {},
             transform_before_filtering=transform_before_filtering,
@@ -2742,6 +2709,7 @@
         stop_condition_on_cursor: bool = False,
         client_side_incremental_sync: Optional[Dict[str, Any]] = None,
         transformations: List[RecordTransformation],
+        file_uploader: Optional[FileUploader] = None,
         incremental_sync: Optional[
             Union[
                 IncrementingCountCursorModel, DatetimeBasedCursorModel, CustomIncrementalSyncModel
@@ -2764,6 +2732,7 @@
             decoder=decoder,
             transformations=transformations,
             client_side_incremental_sync=client_side_incremental_sync,
+            file_uploader=file_uploader,
         )
         url_base = (
             model.requester.url_base
@@ -3118,11 +3087,8 @@
                 stream_slices,
                 self._job_tracker,
                 self._message_repository,
-                # FIXME work would need to be done here in order to detect if a stream as a parent stream that is bulk
                 has_bulk_parent=False,
-                # set the `job_max_retry` to 1 for the `Connector Builder`` use-case.
-                # `None` == default retry is set to 3 attempts, under the hood.
-                job_max_retry=1 if self._emit_connector_builder_messages else None,
+                # FIXME work would need to be done here in order to detect if a stream as a parent stream that is bulk
             ),
             stream_slicer=stream_slicer,
             config=config,
@@ -3366,6 +3332,30 @@
             matchers=matchers,
         )

+    def create_file_uploader(
+        self, model: FileUploaderModel, config: Config, **kwargs: Any
+    ) -> FileUploader:
+        name = "File Uploader"
+        requester = self._create_component_from_model(
+            model=model.requester,
+            config=config,
+            name=name,
+            **kwargs,
+        )
+        download_target_extractor = self._create_component_from_model(
+            model=model.download_target_extractor,
+            config=config,
+            name=name,
+            **kwargs,
+        )
+        return FileUploader(
+            requester=requester,
+            download_target_extractor=download_target_extractor,
+            config=config,
+            parameters=model.parameters or {},
+            filename_extractor=model.filename_extractor if model.filename_extractor else None,
+        )
+
     def create_moving_window_call_rate_policy(
         self, model: MovingWindowCallRatePolicyModel, config: Config, **kwargs: Any
     ) -> MovingWindowCallRatePolicy:
@@ -3415,34 +3405,3 @@
         self._api_budget = self.create_component(
             model_type=HTTPAPIBudgetModel, component_definition=component_definition, config=config
         )
-
-    def create_grouping_partition_router(
-        self, model: GroupingPartitionRouterModel, config: Config, **kwargs: Any
-    ) -> GroupingPartitionRouter:
-        underlying_router = self._create_component_from_model(
-            model=model.underlying_partition_router, config=config
-        )
-        if model.group_size < 1:
-            raise ValueError(f"Group size must be greater than 0, got {model.group_size}")
-
-        # Request options in underlying partition routers are not supported for GroupingPartitionRouter
-        # because they are specific to individual partitions and cannot be aggregated or handled
-        # when grouping, potentially leading to incorrect API calls. Any request customization
-        # should be managed at the stream level through the requester's configuration.
-        if isinstance(underlying_router, SubstreamPartitionRouter):
-            if any(
-                parent_config.request_option
-                for parent_config in underlying_router.parent_stream_configs
-            ):
-                raise ValueError("Request options are not supported for GroupingPartitionRouter.")
-
-        if isinstance(underlying_router, ListPartitionRouter):
-            if underlying_router.request_option:
-                raise ValueError("Request options are not supported for GroupingPartitionRouter.")
-
-        return GroupingPartitionRouter(
-            group_size=model.group_size,
-            underlying_partition_router=underlying_router,
-            deduplicate=model.deduplicate if model.deduplicate is not None else True,
-            config=config,
-        )
@@ -8,9 +8,6 @@ from airbyte_cdk.sources.declarative.partition_routers.async_job_partition_route
 from airbyte_cdk.sources.declarative.partition_routers.cartesian_product_stream_slicer import (
     CartesianProductStreamSlicer,
 )
-from airbyte_cdk.sources.declarative.partition_routers.grouping_partition_router import (
-    GroupingPartitionRouter,
-)
 from airbyte_cdk.sources.declarative.partition_routers.list_partition_router import (
     ListPartitionRouter,
 )
@@ -25,7 +22,6 @@ from airbyte_cdk.sources.declarative.partition_routers.substream_partition_route
 __all__ = [
     "AsyncJobPartitionRouter",
     "CartesianProductStreamSlicer",
-    "GroupingPartitionRouter",
     "ListPartitionRouter",
     "SinglePartitionRouter",
     "SubstreamPartitionRouter",
@@ -374,11 +374,7 @@ class SubstreamPartitionRouter(PartitionRouter):
         # Ignore per-partition states or invalid formats.
         if isinstance(substream_state, (list, dict)) or len(substream_state_values) != 1:
             # If a global state is present under the key "state", use its first value.
-            if (
-                "state" in stream_state
-                and isinstance(stream_state["state"], dict)
-                and stream_state["state"] != {}
-            ):
+            if "state" in stream_state and isinstance(stream_state["state"], dict):
                 substream_state = list(stream_state["state"].values())[0]
             else:
                 return {}
@@ -0,0 +1,89 @@
+#
+# Copyright (c) 2025 Airbyte, Inc., all rights reserved.
+#
+
+import json
+import logging
+import uuid
+from dataclasses import InitVar, dataclass, field
+from pathlib import Path
+from typing import Any, Mapping, Optional, Union
+
+from airbyte_cdk.models import AirbyteRecordMessageFileReference
+from airbyte_cdk.sources.declarative.extractors.record_extractor import RecordExtractor
+from airbyte_cdk.sources.declarative.interpolation.interpolated_string import (
+    InterpolatedString,
+)
+from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import (
+    SafeResponse,
+)
+from airbyte_cdk.sources.declarative.requesters import Requester
+from airbyte_cdk.sources.declarative.types import Record, StreamSlice
+from airbyte_cdk.sources.types import Config
+from airbyte_cdk.sources.utils.files_directory import get_files_directory
+
+logger = logging.getLogger("airbyte")
+
+
+@dataclass
+class FileUploader:
+    requester: Requester
+    download_target_extractor: RecordExtractor
+    config: Config
+    parameters: InitVar[Mapping[str, Any]]
+
+    filename_extractor: Optional[Union[InterpolatedString, str]] = None
+    content_extractor: Optional[RecordExtractor] = None
+
+    def __post_init__(self, parameters: Mapping[str, Any]) -> None:
+        if self.filename_extractor:
+            self.filename_extractor = InterpolatedString.create(
+                self.filename_extractor,
+                parameters=parameters,
+            )
+
+    def upload(self, record: Record) -> None:
+        mocked_response = SafeResponse()
+        mocked_response.content = json.dumps(record.data).encode()
+        download_target = list(self.download_target_extractor.extract_records(mocked_response))[0]
+        if not isinstance(download_target, str):
+            raise ValueError(
+                f"download_target is expected to be a str but was {type(download_target)}: {download_target}"
+            )
+
+        response = self.requester.send_request(
+            stream_slice=StreamSlice(
+                partition={}, cursor_slice={}, extra_fields={"download_target": download_target}
+            ),
+        )
+
+        if self.content_extractor:
+            raise NotImplementedError("TODO")
+        else:
+            files_directory = Path(get_files_directory())
+
+            file_name = (
+                self.filename_extractor.eval(self.config, record=record)
+                if self.filename_extractor
+                else str(uuid.uuid4())
+            )
+            file_name = file_name.lstrip("/")
+            file_relative_path = Path(record.stream_name) / Path(file_name)
+
+            full_path = files_directory / file_relative_path
+            full_path.parent.mkdir(parents=True, exist_ok=True)
+
+            with open(str(full_path), "wb") as f:
+                f.write(response.content)
+                file_size_bytes = full_path.stat().st_size
+
+            logger.info("File uploaded successfully")
+            logger.info(f"File url: {str(full_path)}")
+            logger.info(f"File size: {file_size_bytes / 1024} KB")
+            logger.info(f"File relative path: {str(file_relative_path)}")
+
+            record.file_reference = AirbyteRecordMessageFileReference(
+                file_url=str(full_path),
+                file_relative_path=str(file_relative_path),
+                file_size_bytes=file_size_bytes,
+            )
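For orientation, here is a rough usage sketch of the new FileUploader with its declarative requester and download-target extractor replaced by mocks. It is not part of the diff itself; the exact Record constructor arguments and the file_reference attribute the record gains are assumptions based on the hunks above and on the airbyte_cdk/sources/types.py change listed as file 34.

    # Illustrative only: exercises FileUploader.upload() with mocked collaborators.
    from unittest.mock import MagicMock

    from airbyte_cdk.sources.declarative.retrievers.file_uploader import FileUploader
    from airbyte_cdk.sources.declarative.types import Record

    requester = MagicMock()
    requester.send_request.return_value.content = b"file body"  # bytes written to disk
    download_target_extractor = MagicMock()
    download_target_extractor.extract_records.return_value = [
        "https://api.example.com/files/123/content"  # hypothetical download URL
    ]

    uploader = FileUploader(
        requester=requester,
        download_target_extractor=download_target_extractor,
        config={},
        parameters={},
    )

    # Assumed Record signature; upload() writes the file under
    # get_files_directory()/<stream_name>/ and attaches an
    # AirbyteRecordMessageFileReference to the record.
    record = Record(data={"id": "123"}, stream_name="attachments")
    uploader.upload(record)
    print(record.file_reference)

Since no filename_extractor is configured in this sketch, the file name falls back to a random UUID, as in the module above.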
@@ -58,11 +58,16 @@ class DeclarativePartition(Partition):
     def read(self) -> Iterable[Record]:
         for stream_data in self._retriever.read_records(self._json_schema, self._stream_slice):
             if isinstance(stream_data, Mapping):
-                yield Record(
-                    data=stream_data,
-                    stream_name=self.stream_name(),
-                    associated_slice=self._stream_slice,
+                record = (
+                    stream_data
+                    if isinstance(stream_data, Record)
+                    else Record(
+                        data=stream_data,
+                        stream_name=self.stream_name(),
+                        associated_slice=self._stream_slice,
+                    )
                 )
+                yield record
             else:
                 self._message_repository.emit_message(stream_data)

@@ -139,9 +139,7 @@ class AddFields(RecordTransformation):
             valid_types = (parsed_field.value_type,) if parsed_field.value_type else None
             value = parsed_field.value.eval(config, valid_types=valid_types, **kwargs)
             is_empty_condition = not self.condition
-            if is_empty_condition or self._filter_interpolator.eval(
-                config, value=value, path=parsed_field.path, **kwargs
-            ):
+            if is_empty_condition or self._filter_interpolator.eval(config, value=value, **kwargs):
                 dpath.new(record, parsed_field.path, value)

     def __eq__(self, other: Any) -> bool:
@@ -8,16 +8,18 @@ from datetime import datetime
 from enum import Enum
 from io import IOBase
 from os import makedirs, path
-from typing import Any, Dict, Iterable, List, Optional, Set
+from typing import Iterable, List, Optional, Set, Tuple

 from wcmatch.glob import GLOBSTAR, globmatch

+from airbyte_cdk.models import AirbyteRecordMessageFileReference
 from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import AbstractFileBasedSpec
 from airbyte_cdk.sources.file_based.config.validate_config_transfer_modes import (
     include_identities_stream,
     preserve_directory_structure,
     use_file_transfer,
 )
+from airbyte_cdk.sources.file_based.file_record_data import FileRecordData
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile


@@ -148,9 +150,9 @@ class AbstractFileBasedStreamReader(ABC):
         return False

     @abstractmethod
-    def get_file(
+    def upload(
         self, file: RemoteFile, local_directory: str, logger: logging.Logger
-    ) -> Dict[str, Any]:
+    ) -> Tuple[FileRecordData, AirbyteRecordMessageFileReference]:
         """
         This is required for connectors that will support writing to
         files. It will handle the logic to download,get,read,acquire or
@@ -162,12 +164,10 @@ class AbstractFileBasedStreamReader(ABC):
             logger (logging.Logger): Logger for logging information and errors.

         Returns:
-            dict: A dictionary containing the following:
-                - "file_url" (str): The absolute path of the downloaded file.
-                - "bytes" (int): The file size in bytes.
-                - "file_relative_path" (str): The relative path of the file for local storage. Is relative to local_directory as
-                  this a mounted volume in the pod container.
-
+            AirbyteRecordMessageFileReference: A file reference object containing:
+                - staging_file_url (str): The absolute path to the referenced file in the staging area.
+                - file_size_bytes (int): The size of the referenced file in bytes.
+                - source_file_relative_path (str): The relative path to the referenced file in source.
         """
         ...
@@ -0,0 +1,24 @@
+#
+# Copyright (c) 2025 Airbyte, Inc., all rights reserved.
+#
+
+from datetime import datetime
+from typing import Optional
+
+from pydantic.v1 import BaseModel
+
+
+class FileRecordData(BaseModel):
+    """
+    A record in a file-based stream.
+    """
+
+    folder: str
+    filename: str
+    bytes: int
+
+    id: Optional[str] = None
+    created_at: Optional[int] = None
+    updated_at: Optional[int] = None
+    mime_type: Optional[str] = None
+    description: Optional[str] = None
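A quick illustration (not part of the diff) of how the new FileRecordData model is consumed: default_file_based_stream.py later emits it via .dict(exclude_none=True), so optional fields that were never set are dropped from the record payload. The values below are made up.

    from airbyte_cdk.sources.file_based.file_record_data import FileRecordData

    file_record = FileRecordData(
        folder="invoices/2025",
        filename="invoice_0001.pdf",
        bytes=48213,
        mime_type="application/pdf",
    )

    # Only populated fields survive; id, created_at, updated_at and description are omitted.
    print(file_record.dict(exclude_none=True))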
@@ -2,34 +2,27 @@
 # Copyright (c) 2024 Airbyte, Inc., all rights reserved.
 #
 import logging
-import os
-from typing import Any, Dict, Iterable
+from typing import Iterable, Tuple

-from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
+from airbyte_cdk.models import AirbyteRecordMessageFileReference
 from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
+from airbyte_cdk.sources.file_based.file_record_data import FileRecordData
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
-
-AIRBYTE_STAGING_DIRECTORY = os.getenv("AIRBYTE_STAGING_DIRECTORY", "/staging/files")
-DEFAULT_LOCAL_DIRECTORY = "/tmp/airbyte-file-transfer"
+from airbyte_cdk.sources.utils.files_directory import get_files_directory


 class FileTransfer:
     def __init__(self) -> None:
-        self._local_directory = (
-            AIRBYTE_STAGING_DIRECTORY
-            if os.path.exists(AIRBYTE_STAGING_DIRECTORY)
-            else DEFAULT_LOCAL_DIRECTORY
-        )
+        self._local_directory = get_files_directory()

-    def get_file(
+    def upload(
         self,
-        config: FileBasedStreamConfig,
         file: RemoteFile,
         stream_reader: AbstractFileBasedStreamReader,
         logger: logging.Logger,
-    ) -> Iterable[Dict[str, Any]]:
+    ) -> Iterable[Tuple[FileRecordData, AirbyteRecordMessageFileReference]]:
         try:
-            yield stream_reader.get_file(
+            yield stream_reader.upload(
                 file=file, local_directory=self._local_directory, logger=logger
             )
@@ -18,9 +18,19 @@ JsonSchemaSupportedType = Union[List[str], Literal["string"], str]
 SchemaType = Mapping[str, Mapping[str, JsonSchemaSupportedType]]

 schemaless_schema = {"type": "object", "properties": {"data": {"type": "object"}}}
+
 file_transfer_schema = {
     "type": "object",
-    "properties": {"data": {"type": "object"}, "file": {"type": "object"}},
+    "properties": {
+        "folder": {"type": "string"},
+        "file_name": {"type": "string"},
+        "bytes": {"type": "integer"},
+        "id": {"type": ["null", "string"]},
+        "created_at": {"type": ["null", "integer"]},
+        "updated_at": {"type": ["null", "integer"]},
+        "mime_type": {"type": ["null", "string"]},
+        "description": {"type": ["null", "string"]},
+    },
 }


@@ -270,7 +270,6 @@ class FileBasedStreamPartition(Partition):
                     yield Record(
                         data=record_message_data,
                         stream_name=self.stream_name(),
-                        is_file_transfer_message=self._use_file_transfer(),
                     )
                 else:
                     self._message_repository.emit_message(record_data)
@@ -11,7 +11,7 @@ from functools import cache
 from os import path
 from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Optional, Set, Tuple, Union

-from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, FailureType, Level
+from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, AirbyteStream, FailureType, Level
 from airbyte_cdk.models import Type as MessageType
 from airbyte_cdk.sources.file_based.config.file_based_stream_config import PrimaryKeyType
 from airbyte_cdk.sources.file_based.exceptions import (
@@ -97,14 +97,7 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
         self, configured_catalog_json_schema: Dict[str, Any]
     ) -> Dict[str, Any]:
         if self.use_file_transfer:
-            return {
-                "type": "object",
-                "properties": {
-                    "file_path": {"type": "string"},
-                    "file_size": {"type": "string"},
-                    self.ab_file_name_col: {"type": "string"},
-                },
-            }
+            return file_transfer_schema
         else:
             return super()._filter_schema_invalid_properties(configured_catalog_json_schema)

@@ -145,14 +138,6 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
         record[self.ab_file_name_col] = file.uri
         return record

-    def transform_record_for_file_transfer(
-        self, record: dict[str, Any], file: RemoteFile
-    ) -> dict[str, Any]:
-        # timstamp() returns a float representing the number of seconds since the unix epoch
-        record[self.modified] = int(file.last_modified.timestamp()) * 1000
-        record[self.source_file_url] = file.uri
-        return record
-
     def read_records_from_slice(self, stream_slice: StreamSlice) -> Iterable[AirbyteMessage]:
         """
         Yield all records from all remote files in `list_files_for_this_sync`.
@@ -166,6 +151,7 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
             raise MissingSchemaError(FileBasedSourceError.MISSING_SCHEMA, stream=self.name)
         # The stream only supports a single file type, so we can use the same parser for all files
         parser = self.get_parser()
+        file_transfer = FileTransfer()
         for file in stream_slice["files"]:
             # only serialize the datetime once
             file_datetime_string = file.last_modified.strftime(self.DATE_TIME_FORMAT)
@@ -173,19 +159,13 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):

             try:
                 if self.use_file_transfer:
-                    self.logger.info(f"{self.name}: {file} file-based syncing")
-                    # todo: complete here the code to not rely on local parser
-                    file_transfer = FileTransfer()
-                    for record in file_transfer.get_file(
-                        self.config, file, self.stream_reader, self.logger
+                    for file_record_data, file_reference in file_transfer.upload(
+                        file=file, stream_reader=self.stream_reader, logger=self.logger
                     ):
-                        line_no += 1
-                        if not self.record_passes_validation_policy(record):
-                            n_skipped += 1
-                            continue
-                        record = self.transform_record_for_file_transfer(record, file)
                         yield stream_data_to_airbyte_message(
-                            self.name, record, is_file_transfer_message=True
+                            self.name,
+                            file_record_data.dict(exclude_none=True),
+                            file_reference=file_reference,
                         )
                 else:
                     for record in parser.parse_records(
@@ -259,6 +239,8 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):

     @cache
     def get_json_schema(self) -> JsonSchema:
+        if self.use_file_transfer:
+            return file_transfer_schema
         extra_fields = {
             self.ab_last_mod_col: {"type": "string"},
             self.ab_file_name_col: {"type": "string"},
@@ -282,9 +264,7 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
         return {"type": "object", "properties": {**extra_fields, **schema["properties"]}}

     def _get_raw_json_schema(self) -> JsonSchema:
-        if self.use_file_transfer:
-            return file_transfer_schema
-        elif self.config.input_schema:
+        if self.config.input_schema:
             return self.config.get_input_schema()  # type: ignore
         elif self.config.schemaless:
             return schemaless_schema
@@ -341,6 +321,11 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
             self.config.globs or [], self.config.legacy_prefix, self.logger
         )

+    def as_airbyte_stream(self) -> AirbyteStream:
+        file_stream = super().as_airbyte_stream()
+        file_stream.is_file_based = self.use_file_transfer
+        return file_stream
+
     def infer_schema(self, files: List[RemoteFile]) -> Mapping[str, Any]:
         loop = asyncio.get_event_loop()
         schema = loop.run_until_complete(self._infer_schema(files))