airbyte-cdk 6.45.0__py3-none-any.whl → 6.45.0.dev4101__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
Files changed (53)
  1. airbyte_cdk/connector_builder/connector_builder_handler.py +6 -45
  2. airbyte_cdk/connector_builder/main.py +2 -5
  3. airbyte_cdk/models/__init__.py +1 -0
  4. airbyte_cdk/models/airbyte_protocol.py +1 -3
  5. airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +1 -1
  6. airbyte_cdk/sources/declarative/async_job/job.py +0 -6
  7. airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +18 -18
  8. airbyte_cdk/sources/declarative/async_job/job_tracker.py +6 -22
  9. airbyte_cdk/sources/declarative/checks/__init__.py +2 -5
  10. airbyte_cdk/sources/declarative/checks/check_stream.py +11 -113
  11. airbyte_cdk/sources/declarative/concurrent_declarative_source.py +8 -0
  12. airbyte_cdk/sources/declarative/declarative_component_schema.yaml +50 -210
  13. airbyte_cdk/sources/declarative/extractors/record_selector.py +6 -1
  14. airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +1 -2
  15. airbyte_cdk/sources/declarative/interpolation/macros.py +4 -8
  16. airbyte_cdk/sources/declarative/manifest_declarative_source.py +2 -23
  17. airbyte_cdk/sources/declarative/models/declarative_component_schema.py +43 -142
  18. airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +4 -16
  19. airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +50 -263
  20. airbyte_cdk/sources/declarative/partition_routers/__init__.py +0 -4
  21. airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +1 -5
  22. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +2 -25
  23. airbyte_cdk/sources/declarative/retrievers/file_uploader.py +89 -0
  24. airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +30 -101
  25. airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +9 -4
  26. airbyte_cdk/sources/declarative/transformations/add_fields.py +1 -3
  27. airbyte_cdk/sources/file_based/file_based_stream_reader.py +32 -14
  28. airbyte_cdk/sources/file_based/file_record_data.py +24 -0
  29. airbyte_cdk/sources/file_based/file_types/file_transfer.py +8 -15
  30. airbyte_cdk/sources/file_based/schema_helpers.py +11 -1
  31. airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +0 -1
  32. airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +16 -31
  33. airbyte_cdk/sources/file_based/stream/permissions_file_based_stream.py +1 -3
  34. airbyte_cdk/sources/streams/concurrent/default_stream.py +3 -0
  35. airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +0 -4
  36. airbyte_cdk/sources/types.py +11 -2
  37. airbyte_cdk/sources/utils/files_directory.py +15 -0
  38. airbyte_cdk/sources/utils/record_helper.py +8 -8
  39. {airbyte_cdk-6.45.0.dist-info → airbyte_cdk-6.45.0.dev4101.dist-info}/METADATA +2 -2
  40. {airbyte_cdk-6.45.0.dist-info → airbyte_cdk-6.45.0.dev4101.dist-info}/RECORD +44 -50
  41. airbyte_cdk/models/file_transfer_record_message.py +0 -13
  42. airbyte_cdk/sources/declarative/partition_routers/grouping_partition_router.py +0 -150
  43. airbyte_cdk/sources/declarative/requesters/query_properties/__init__.py +0 -13
  44. airbyte_cdk/sources/declarative/requesters/query_properties/properties_from_endpoint.py +0 -40
  45. airbyte_cdk/sources/declarative/requesters/query_properties/property_chunking.py +0 -69
  46. airbyte_cdk/sources/declarative/requesters/query_properties/query_properties.py +0 -58
  47. airbyte_cdk/sources/declarative/requesters/query_properties/strategies/__init__.py +0 -10
  48. airbyte_cdk/sources/declarative/requesters/query_properties/strategies/group_by_key.py +0 -33
  49. airbyte_cdk/sources/declarative/requesters/query_properties/strategies/merge_strategy.py +0 -19
  50. {airbyte_cdk-6.45.0.dist-info → airbyte_cdk-6.45.0.dev4101.dist-info}/LICENSE.txt +0 -0
  51. {airbyte_cdk-6.45.0.dist-info → airbyte_cdk-6.45.0.dev4101.dist-info}/LICENSE_SHORT +0 -0
  52. {airbyte_cdk-6.45.0.dist-info → airbyte_cdk-6.45.0.dev4101.dist-info}/WHEEL +0 -0
  53. {airbyte_cdk-6.45.0.dist-info → airbyte_cdk-6.45.0.dev4101.dist-info}/entry_points.txt +0 -0
airbyte_cdk/sources/declarative/retrievers/file_uploader.py (new file)
@@ -0,0 +1,89 @@
+ #
+ # Copyright (c) 2025 Airbyte, Inc., all rights reserved.
+ #
+
+ import json
+ import logging
+ import uuid
+ from dataclasses import InitVar, dataclass, field
+ from pathlib import Path
+ from typing import Any, Mapping, Optional, Union
+
+ from airbyte_cdk.models import AirbyteRecordMessageFileReference
+ from airbyte_cdk.sources.declarative.extractors.record_extractor import RecordExtractor
+ from airbyte_cdk.sources.declarative.interpolation.interpolated_string import (
+     InterpolatedString,
+ )
+ from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import (
+     SafeResponse,
+ )
+ from airbyte_cdk.sources.declarative.requesters import Requester
+ from airbyte_cdk.sources.declarative.types import Record, StreamSlice
+ from airbyte_cdk.sources.types import Config
+ from airbyte_cdk.sources.utils.files_directory import get_files_directory
+
+ logger = logging.getLogger("airbyte")
+
+
+ @dataclass
+ class FileUploader:
+     requester: Requester
+     download_target_extractor: RecordExtractor
+     config: Config
+     parameters: InitVar[Mapping[str, Any]]
+
+     filename_extractor: Optional[Union[InterpolatedString, str]] = None
+     content_extractor: Optional[RecordExtractor] = None
+
+     def __post_init__(self, parameters: Mapping[str, Any]) -> None:
+         if self.filename_extractor:
+             self.filename_extractor = InterpolatedString.create(
+                 self.filename_extractor,
+                 parameters=parameters,
+             )
+
+     def upload(self, record: Record) -> None:
+         mocked_response = SafeResponse()
+         mocked_response.content = json.dumps(record.data).encode()
+         download_target = list(self.download_target_extractor.extract_records(mocked_response))[0]
+         if not isinstance(download_target, str):
+             raise ValueError(
+                 f"download_target is expected to be a str but was {type(download_target)}: {download_target}"
+             )
+
+         response = self.requester.send_request(
+             stream_slice=StreamSlice(
+                 partition={}, cursor_slice={}, extra_fields={"download_target": download_target}
+             ),
+         )
+
+         if self.content_extractor:
+             raise NotImplementedError("TODO")
+         else:
+             files_directory = Path(get_files_directory())
+
+             file_name = (
+                 self.filename_extractor.eval(self.config, record=record)
+                 if self.filename_extractor
+                 else str(uuid.uuid4())
+             )
+             file_name = file_name.lstrip("/")
+             file_relative_path = Path(record.stream_name) / Path(file_name)
+
+             full_path = files_directory / file_relative_path
+             full_path.parent.mkdir(parents=True, exist_ok=True)
+
+             with open(str(full_path), "wb") as f:
+                 f.write(response.content)
+             file_size_bytes = full_path.stat().st_size
+
+             logger.info("File uploaded successfully")
+             logger.info(f"File url: {str(full_path)}")
+             logger.info(f"File size: {file_size_bytes / 1024} KB")
+             logger.info(f"File relative path: {str(file_relative_path)}")
+
+             record.file_reference = AirbyteRecordMessageFileReference(
+                 file_url=str(full_path),
+                 file_relative_path=str(file_relative_path),
+                 file_size_bytes=file_size_bytes,
+             )
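The upload flow above boils down to: extract a download target URL from the record, fetch it with the requester, write the bytes under a per-stream directory, and attach an AirbyteRecordMessageFileReference to the record. A minimal, self-contained sketch of the write-and-reference step only; save_file_for_record and the /tmp path are hypothetical stand-ins, not CDK APIs:

import uuid
from pathlib import Path
from typing import Optional

def save_file_for_record(stream_name: str, content: bytes, file_name: Optional[str] = None) -> dict:
    files_directory = Path("/tmp/airbyte-files")  # stand-in for get_files_directory()
    # fall back to a random name when no filename_extractor is configured, as the diff does
    relative_path = Path(stream_name) / (file_name or str(uuid.uuid4()))
    full_path = files_directory / relative_path
    full_path.parent.mkdir(parents=True, exist_ok=True)
    full_path.write_bytes(content)
    # these three values feed AirbyteRecordMessageFileReference in the real code
    return {
        "file_url": str(full_path),
        "file_relative_path": str(relative_path),
        "file_size_bytes": full_path.stat().st_size,
    }

print(save_file_for_record("attachments", b"hello world"))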
@@ -1,9 +1,8 @@
1
1
  #
2
- # Copyright (c) 2025 Airbyte, Inc., all rights reserved.
2
+ # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
3
  #
4
4
 
5
5
  import json
6
- from collections import defaultdict
7
6
  from dataclasses import InitVar, dataclass, field
8
7
  from functools import partial
9
8
  from itertools import islice
@@ -13,7 +12,6 @@ from typing import (
      Iterable,
      List,
      Mapping,
-     MutableMapping,
      Optional,
      Set,
      Tuple,
@@ -33,7 +31,6 @@ from airbyte_cdk.sources.declarative.partition_routers.single_partition_router import (
  )
  from airbyte_cdk.sources.declarative.requesters.paginators.no_pagination import NoPagination
  from airbyte_cdk.sources.declarative.requesters.paginators.paginator import Paginator
- from airbyte_cdk.sources.declarative.requesters.query_properties import QueryProperties
  from airbyte_cdk.sources.declarative.requesters.request_options import (
      DefaultRequestOptionsProvider,
      RequestOptionsProvider,
@@ -91,7 +88,6 @@ class SimpleRetriever(Retriever):
      )
      cursor: Optional[DeclarativeCursor] = None
      ignore_stream_slicer_parameters_on_paginated_requests: bool = False
-     additional_query_properties: Optional[QueryProperties] = None

      def __post_init__(self, parameters: Mapping[str, Any]) -> None:
          self._paginator = self.paginator or NoPagination(parameters=parameters)
@@ -449,110 +445,43 @@
          :param stream_slice: The stream slice to read data for
          :return: The records read from the API source
          """
+         _slice = stream_slice or StreamSlice(partition={}, cursor_slice={})  # None-check

-         property_chunks = (
-             list(
-                 self.additional_query_properties.get_request_property_chunks(
-                     stream_slice=stream_slice
-                 )
-             )
-             if self.additional_query_properties
-             else []
+         most_recent_record_from_slice = None
+         record_generator = partial(
+             self._parse_records,
+             stream_slice=stream_slice,
+             stream_state=self.state or {},
+             records_schema=records_schema,
          )
-         records_without_merge_key = []
-         merged_records: MutableMapping[str, Any] = defaultdict(dict)

-         _slice = stream_slice or StreamSlice(partition={}, cursor_slice={})  # None-check
-         most_recent_record_from_slice = None
+         if self.cursor and isinstance(self.cursor, ResumableFullRefreshCursor):
+             stream_state = self.state

-         if self.additional_query_properties:
-             for properties in property_chunks:
-                 _slice = StreamSlice(
-                     partition=_slice.partition or {},
-                     cursor_slice=_slice.cursor_slice or {},
-                     extra_fields={"query_properties": properties},
-                 )  # None-check
-
-                 record_generator = partial(
-                     self._parse_records,
-                     stream_slice=_slice,
-                     stream_state=self.state or {},
-                     records_schema=records_schema,
+             # Before syncing the RFR stream, we check if the job's prior attempt was successful and don't need to
+             # fetch more records. The platform deletes stream state for full refresh streams before starting a
+             # new job, so we don't need to worry about this value existing for the initial attempt
+             if stream_state.get(FULL_REFRESH_SYNC_COMPLETE_KEY):
+                 return
+
+             yield from self._read_single_page(record_generator, stream_state, _slice)
+         else:
+             for stream_data in self._read_pages(record_generator, self.state, _slice):
+                 current_record = self._extract_record(stream_data, _slice)
+                 if self.cursor and current_record:
+                     self.cursor.observe(_slice, current_record)
+
+                 # Latest record read, not necessarily within slice boundaries.
+                 # TODO Remove once all custom components implement `observe` method.
+                 # https://github.com/airbytehq/airbyte-internal-issues/issues/6955
+                 most_recent_record_from_slice = self._get_most_recent_record(
+                     most_recent_record_from_slice, current_record, _slice
                  )
+                 yield stream_data

-             for stream_data in self._read_pages(record_generator, self.state, _slice):
-                 current_record = self._extract_record(stream_data, _slice)
-                 if self.cursor and current_record:
-                     self.cursor.observe(_slice, current_record)
-
-                 # Latest record read, not necessarily within slice boundaries.
-                 # TODO Remove once all custom components implement `observe` method.
-                 # https://github.com/airbytehq/airbyte-internal-issues/issues/6955
-                 most_recent_record_from_slice = self._get_most_recent_record(
-                     most_recent_record_from_slice, current_record, _slice
-                 )
-
-                 if current_record and self.additional_query_properties.property_chunking:
-                     merge_key = (
-                         self.additional_query_properties.property_chunking.get_merge_key(
-                             current_record
-                         )
-                     )
-                     if merge_key:
-                         merged_records[merge_key].update(current_record)
-                     else:
-                         # We should still emit records even if the record did not have a merge key
-                         records_without_merge_key.append(current_record)
-                 else:
-                     yield stream_data
              if self.cursor:
                  self.cursor.close_slice(_slice, most_recent_record_from_slice)
-
-             if len(merged_records) > 0:
-                 yield from [
-                     Record(data=merged_record, stream_name=self.name, associated_slice=stream_slice)
-                     for merged_record in merged_records.values()
-                 ]
-             if len(records_without_merge_key) > 0:
-                 yield from records_without_merge_key
-         else:
-             _slice = stream_slice or StreamSlice(partition={}, cursor_slice={})  # None-check
-
-             most_recent_record_from_slice = None
-             record_generator = partial(
-                 self._parse_records,
-                 stream_slice=stream_slice,
-                 stream_state=self.state or {},
-                 records_schema=records_schema,
-             )
-
-             if self.cursor and isinstance(self.cursor, ResumableFullRefreshCursor):
-                 stream_state = self.state
-
-                 # Before syncing the RFR stream, we check if the job's prior attempt was successful and don't need to
-                 # fetch more records. The platform deletes stream state for full refresh streams before starting a
-                 # new job, so we don't need to worry about this value existing for the initial attempt
-                 if stream_state.get(FULL_REFRESH_SYNC_COMPLETE_KEY):
-                     return
-
-                 yield from self._read_single_page(record_generator, stream_state, _slice)
-             else:
-                 for stream_data in self._read_pages(record_generator, self.state, _slice):
-                     current_record = self._extract_record(stream_data, _slice)
-                     if self.cursor and current_record:
-                         self.cursor.observe(_slice, current_record)
-
-                     # Latest record read, not necessarily within slice boundaries.
-                     # TODO Remove once all custom components implement `observe` method.
-                     # https://github.com/airbytehq/airbyte-internal-issues/issues/6955
-                     most_recent_record_from_slice = self._get_most_recent_record(
-                         most_recent_record_from_slice, current_record, _slice
-                     )
-                     yield stream_data
-
-                 if self.cursor:
-                     self.cursor.close_slice(_slice, most_recent_record_from_slice)
-             return
+         return

      def _get_most_recent_record(
          self,
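With the query-properties branch removed, read_records keeps a single path: short-circuit a resumable full refresh whose prior attempt already completed (FULL_REFRESH_SYNC_COMPLETE_KEY), otherwise page through, let the cursor observe each record, and close the slice at the end. A toy model of that retained loop, using stand-in classes rather than CDK types:

class ToyCursor:
    def __init__(self):
        self.observed, self.closed_with = [], None

    def observe(self, _slice, record):
        self.observed.append(record)

    def close_slice(self, _slice, most_recent):
        self.closed_with = most_recent

def read(pages, cursor, _slice=None):
    most_recent = None
    for page in pages:
        for record in page:
            cursor.observe(_slice, record)
            most_recent = record  # stand-in for _get_most_recent_record
            yield record
    cursor.close_slice(_slice, most_recent)

cursor = ToyCursor()
print(list(read([[1, 2], [3]], cursor)), cursor.closed_with)  # [1, 2, 3] 3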
airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py
@@ -58,11 +58,16 @@ class DeclarativePartition(Partition):
      def read(self) -> Iterable[Record]:
          for stream_data in self._retriever.read_records(self._json_schema, self._stream_slice):
              if isinstance(stream_data, Mapping):
-                 yield Record(
-                     data=stream_data,
-                     stream_name=self.stream_name(),
-                     associated_slice=self._stream_slice,
+                 record = (
+                     stream_data
+                     if isinstance(stream_data, Record)
+                     else Record(
+                         data=stream_data,
+                         stream_name=self.stream_name(),
+                         associated_slice=self._stream_slice,
+                     )
                  )
+                 yield record
              else:
                  self._message_repository.emit_message(stream_data)

airbyte_cdk/sources/declarative/transformations/add_fields.py
@@ -139,9 +139,7 @@ class AddFields(RecordTransformation):
              valid_types = (parsed_field.value_type,) if parsed_field.value_type else None
              value = parsed_field.value.eval(config, valid_types=valid_types, **kwargs)
              is_empty_condition = not self.condition
-             if is_empty_condition or self._filter_interpolator.eval(
-                 config, value=value, path=parsed_field.path, **kwargs
-             ):
+             if is_empty_condition or self._filter_interpolator.eval(config, value=value, **kwargs):
                  dpath.new(record, parsed_field.path, value)

      def __eq__(self, other: Any) -> bool:
airbyte_cdk/sources/file_based/file_based_stream_reader.py
@@ -8,16 +8,18 @@ from datetime import datetime
  from enum import Enum
  from io import IOBase
  from os import makedirs, path
- from typing import Any, Dict, Iterable, List, Optional, Set
+ from typing import Any, Iterable, List, MutableMapping, Optional, Set, Tuple

  from wcmatch.glob import GLOBSTAR, globmatch

+ from airbyte_cdk.models import AirbyteRecordMessageFileReference
  from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import AbstractFileBasedSpec
  from airbyte_cdk.sources.file_based.config.validate_config_transfer_modes import (
      include_identities_stream,
      preserve_directory_structure,
      use_file_transfer,
  )
+ from airbyte_cdk.sources.file_based.file_record_data import FileRecordData
  from airbyte_cdk.sources.file_based.remote_file import RemoteFile


@@ -28,6 +30,12 @@ class FileReadMode(Enum):

  class AbstractFileBasedStreamReader(ABC):
      DATE_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"
+     FILE_RELATIVE_PATH = "file_relative_path"
+     FILE_NAME = "file_name"
+     LOCAL_FILE_PATH = "local_file_path"
+     ABSOLUTE_FILE_PATH = "absolute_file_path"
+     SOURCE_FILE_URI = "source_file_relative_path"
+     FILE_FOLDER = "file_folder"

      def __init__(self) -> None:
          self._config = None
@@ -148,9 +156,9 @@ class AbstractFileBasedStreamReader(ABC):
          return False

      @abstractmethod
-     def get_file(
+     def upload(
          self, file: RemoteFile, local_directory: str, logger: logging.Logger
-     ) -> Dict[str, Any]:
+     ) -> Tuple[FileRecordData, AirbyteRecordMessageFileReference]:
          """
          This is required for connectors that will support writing to
          files. It will handle the logic to download, get, read, acquire or
@@ -162,25 +170,35 @@ class AbstractFileBasedStreamReader(ABC):
          logger (logging.Logger): Logger for logging information and errors.

          Returns:
-             dict: A dictionary containing the following:
-                 - "file_url" (str): The absolute path of the downloaded file.
-                 - "bytes" (int): The file size in bytes.
-                 - "file_relative_path" (str): The relative path of the file for local storage. Is relative to
-                     local_directory, as this is a mounted volume in the pod container.
-
+             AirbyteRecordMessageFileReference: A file reference object containing:
+                 - staging_file_url (str): The absolute path to the referenced file in the staging area.
+                 - file_size_bytes (int): The size of the referenced file in bytes.
+                 - source_file_relative_path (str): The relative path to the referenced file in source.
          """
          ...

-     def _get_file_transfer_paths(self, file: RemoteFile, local_directory: str) -> List[str]:
+     def _get_file_transfer_paths(self, file: RemoteFile, local_directory: str) -> MutableMapping[str, Any]:
          preserve_directory_structure = self.preserve_directory_structure()
+         file_uri = file.uri
+         file_name = path.basename(file_uri)
+         file_folder = path.dirname(file_uri)
          if preserve_directory_structure:
              # Remove left slashes from source path format to make relative path for writing locally
-             file_relative_path = file.uri.lstrip("/")
+             file_relative_path = file_uri.lstrip("/")
          else:
-             file_relative_path = path.basename(file.uri)
+             file_relative_path = file_name
          local_file_path = path.join(local_directory, file_relative_path)
-
          # Ensure the local directory exists
          makedirs(path.dirname(local_file_path), exist_ok=True)
          absolute_file_path = path.abspath(local_file_path)
+
+         file_paths = {
+             self.FILE_RELATIVE_PATH: file_relative_path,
+             self.LOCAL_FILE_PATH: local_file_path,
+             self.ABSOLUTE_FILE_PATH: absolute_file_path,
+             self.FILE_NAME: file_name,
+             self.FILE_FOLDER: file_folder,
+             self.SOURCE_FILE_URI: file_uri,
+         }
+         return file_paths
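_get_file_transfer_paths now returns a mapping keyed by the new class constants instead of a positional list. A sketch of what that mapping looks like for a sample URI, computed with the same os.path calls as the diff (directory creation omitted); the URI and local directory are made-up values:

from os import path

file_uri = "folder/subfolder/report.csv"
local_directory = "/tmp/airbyte-file-transfer"

file_relative_path = file_uri.lstrip("/")  # preserve_directory_structure=True branch
local_file_path = path.join(local_directory, file_relative_path)

paths = {
    "file_relative_path": file_relative_path,
    "local_file_path": local_file_path,
    "absolute_file_path": path.abspath(local_file_path),
    "file_name": path.basename(file_uri),   # "report.csv"
    "file_folder": path.dirname(file_uri),  # "folder/subfolder"
    "source_file_relative_path": file_uri,  # stored under the SOURCE_FILE_URI key
}
print(paths)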
@@ -0,0 +1,24 @@
1
+ #
2
+ # Copyright (c) 2025 Airbyte, Inc., all rights reserved.
3
+ #
4
+
5
+ from datetime import datetime
6
+ from typing import Optional
7
+
8
+ from pydantic.v1 import BaseModel
9
+
10
+
11
+ class FileRecordData(BaseModel):
12
+ """
13
+ A record in a file-based stream.
14
+ """
15
+
16
+ folder: str
17
+ filename: str
18
+ bytes: int
19
+
20
+ id: Optional[str] = None
21
+ created_at: Optional[int] = None
22
+ updated_at: Optional[int] = None
23
+ mime_type: Optional[str] = None
24
+ description: Optional[str] = None
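For reference, instantiating the new model requires only folder, filename, and bytes; every other field defaults to None. The values below are made up:

from airbyte_cdk.sources.file_based.file_record_data import FileRecordData

record = FileRecordData(folder="invoices/2025", filename="inv-001.pdf", bytes=52431)
print(record.dict(exclude_none=True))
# {'folder': 'invoices/2025', 'filename': 'inv-001.pdf', 'bytes': 52431}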
airbyte_cdk/sources/file_based/file_types/file_transfer.py
@@ -2,34 +2,27 @@
  # Copyright (c) 2024 Airbyte, Inc., all rights reserved.
  #
  import logging
- import os
- from typing import Any, Dict, Iterable
+ from typing import Iterable, Tuple

- from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
+ from airbyte_cdk.models import AirbyteRecordMessageFileReference
  from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
+ from airbyte_cdk.sources.file_based.file_record_data import FileRecordData
  from airbyte_cdk.sources.file_based.remote_file import RemoteFile
-
- AIRBYTE_STAGING_DIRECTORY = os.getenv("AIRBYTE_STAGING_DIRECTORY", "/staging/files")
- DEFAULT_LOCAL_DIRECTORY = "/tmp/airbyte-file-transfer"
+ from airbyte_cdk.sources.utils.files_directory import get_files_directory


  class FileTransfer:
      def __init__(self) -> None:
-         self._local_directory = (
-             AIRBYTE_STAGING_DIRECTORY
-             if os.path.exists(AIRBYTE_STAGING_DIRECTORY)
-             else DEFAULT_LOCAL_DIRECTORY
-         )
+         self._local_directory = get_files_directory()

-     def get_file(
+     def upload(
          self,
-         config: FileBasedStreamConfig,
          file: RemoteFile,
          stream_reader: AbstractFileBasedStreamReader,
          logger: logging.Logger,
-     ) -> Iterable[Dict[str, Any]]:
+     ) -> Iterable[Tuple[FileRecordData, AirbyteRecordMessageFileReference]]:
          try:
-             yield stream_reader.get_file(
+             yield stream_reader.upload(
                  file=file, local_directory=self._local_directory, logger=logger
              )
          except Exception as ex:
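FileTransfer is now a thin wrapper: it resolves the staging directory once and delegates each call to the reader's upload(), yielding (FileRecordData, AirbyteRecordMessageFileReference) pairs. A stub sketch of that contract; the classes here are stand-ins, not the CDK types:

import logging

class StubReader:
    """Stand-in for AbstractFileBasedStreamReader; returns a fake record/reference pair."""

    def upload(self, file, local_directory, logger):
        return ({"filename": file}, {"file_size_bytes": 3})

class StubFileTransfer:
    def __init__(self):
        self._local_directory = "/tmp/airbyte-file-transfer"  # stand-in for get_files_directory()

    def upload(self, file, stream_reader, logger):
        try:
            # one yield per call, exactly as in the real method
            yield stream_reader.upload(file=file, local_directory=self._local_directory, logger=logger)
        except Exception as ex:
            logger.error("File transfer failed: %s", ex)
            raise

logger = logging.getLogger("airbyte")
for record_data, reference in StubFileTransfer().upload("remote.txt", StubReader(), logger):
    print(record_data, reference)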
airbyte_cdk/sources/file_based/schema_helpers.py
@@ -18,9 +18,19 @@ JsonSchemaSupportedType = Union[List[str], Literal["string"], str]
  SchemaType = Mapping[str, Mapping[str, JsonSchemaSupportedType]]

  schemaless_schema = {"type": "object", "properties": {"data": {"type": "object"}}}
+
  file_transfer_schema = {
      "type": "object",
-     "properties": {"data": {"type": "object"}, "file": {"type": "object"}},
+     "properties": {
+         "folder": {"type": "string"},
+         "file_name": {"type": "string"},
+         "bytes": {"type": "integer"},
+         "id": {"type": ["null", "string"]},
+         "created_at": {"type": ["null", "integer"]},
+         "updated_at": {"type": ["null", "integer"]},
+         "mime_type": {"type": ["null", "string"]},
+         "description": {"type": ["null", "string"]},
+     },
  }

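A quick way to sanity-check a record against the widened schema, using the third-party jsonschema package; the sample values are made up. Note that the schema spells the key file_name while the FileRecordData model above uses filename.

from jsonschema import validate

file_transfer_schema = {
    "type": "object",
    "properties": {
        "folder": {"type": "string"},
        "file_name": {"type": "string"},
        "bytes": {"type": "integer"},
        "id": {"type": ["null", "string"]},
        "created_at": {"type": ["null", "integer"]},
        "updated_at": {"type": ["null", "integer"]},
        "mime_type": {"type": ["null", "string"]},
        "description": {"type": ["null", "string"]},
    },
}

validate(
    instance={"folder": "invoices/2025", "file_name": "inv-001.pdf", "bytes": 52431},
    schema=file_transfer_schema,
)  # raises jsonschema.ValidationError on mismatch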
airbyte_cdk/sources/file_based/stream/concurrent/adapters.py
@@ -270,7 +270,6 @@ class FileBasedStreamPartition(Partition):
                      yield Record(
                          data=record_message_data,
                          stream_name=self.stream_name(),
-                         is_file_transfer_message=self._use_file_transfer(),
                      )
                  else:
                      self._message_repository.emit_message(record_data)
airbyte_cdk/sources/file_based/stream/default_file_based_stream.py
@@ -11,7 +11,7 @@ from functools import cache
  from os import path
  from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Optional, Set, Tuple, Union

- from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, FailureType, Level
+ from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, AirbyteStream, FailureType, Level
  from airbyte_cdk.models import Type as MessageType
  from airbyte_cdk.sources.file_based.config.file_based_stream_config import PrimaryKeyType
  from airbyte_cdk.sources.file_based.exceptions import (
@@ -97,14 +97,7 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
          self, configured_catalog_json_schema: Dict[str, Any]
      ) -> Dict[str, Any]:
          if self.use_file_transfer:
-             return {
-                 "type": "object",
-                 "properties": {
-                     "file_path": {"type": "string"},
-                     "file_size": {"type": "string"},
-                     self.ab_file_name_col: {"type": "string"},
-                 },
-             }
+             return file_transfer_schema
          else:
              return super()._filter_schema_invalid_properties(configured_catalog_json_schema)

@@ -145,14 +138,6 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
              record[self.ab_file_name_col] = file.uri
          return record

-     def transform_record_for_file_transfer(
-         self, record: dict[str, Any], file: RemoteFile
-     ) -> dict[str, Any]:
-         # timestamp() returns a float representing the number of seconds since the unix epoch
-         record[self.modified] = int(file.last_modified.timestamp()) * 1000
-         record[self.source_file_url] = file.uri
-         return record
-
      def read_records_from_slice(self, stream_slice: StreamSlice) -> Iterable[AirbyteMessage]:
          """
          Yield all records from all remote files in `list_files_for_this_sync`.
@@ -166,6 +151,7 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
              raise MissingSchemaError(FileBasedSourceError.MISSING_SCHEMA, stream=self.name)
          # The stream only supports a single file type, so we can use the same parser for all files
          parser = self.get_parser()
+         file_transfer = FileTransfer()
          for file in stream_slice["files"]:
              # only serialize the datetime once
              file_datetime_string = file.last_modified.strftime(self.DATE_TIME_FORMAT)
@@ -173,19 +159,13 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):

              try:
                  if self.use_file_transfer:
-                     self.logger.info(f"{self.name}: {file} file-based syncing")
-                     # todo: complete here the code to not rely on local parser
-                     file_transfer = FileTransfer()
-                     for record in file_transfer.get_file(
-                         self.config, file, self.stream_reader, self.logger
+                     for file_record_data, file_reference in file_transfer.upload(
+                         file=file, stream_reader=self.stream_reader, logger=self.logger
                      ):
-                         line_no += 1
-                         if not self.record_passes_validation_policy(record):
-                             n_skipped += 1
-                             continue
-                         record = self.transform_record_for_file_transfer(record, file)
                          yield stream_data_to_airbyte_message(
-                             self.name, record, is_file_transfer_message=True
+                             self.name,
+                             file_record_data.dict(exclude_none=True),
+                             file_reference=file_reference,
                          )
                  else:
                      for record in parser.parse_records(
@@ -259,6 +239,8 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):

      @cache
      def get_json_schema(self) -> JsonSchema:
+         if self.use_file_transfer:
+             return file_transfer_schema
          extra_fields = {
              self.ab_last_mod_col: {"type": "string"},
              self.ab_file_name_col: {"type": "string"},
@@ -282,9 +264,7 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
          return {"type": "object", "properties": {**extra_fields, **schema["properties"]}}

      def _get_raw_json_schema(self) -> JsonSchema:
-         if self.use_file_transfer:
-             return file_transfer_schema
-         elif self.config.input_schema:
+         if self.config.input_schema:
              return self.config.get_input_schema()  # type: ignore
          elif self.config.schemaless:
              return schemaless_schema
@@ -341,6 +321,11 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
              self.config.globs or [], self.config.legacy_prefix, self.logger
          )

+     def as_airbyte_stream(self) -> AirbyteStream:
+         file_stream = super().as_airbyte_stream()
+         file_stream.is_file_based = self.use_file_transfer
+         return file_stream
+
      def infer_schema(self, files: List[RemoteFile]) -> Mapping[str, Any]:
          loop = asyncio.get_event_loop()
          schema = loop.run_until_complete(self._infer_schema(files))
airbyte_cdk/sources/file_based/stream/permissions_file_based_stream.py
@@ -61,9 +61,7 @@ class PermissionsFileBasedStream(DefaultFileBasedStream):
                  permissions_record = self.transform_record(
                      permissions_record, file, file_datetime_string
                  )
-                 yield stream_data_to_airbyte_message(
-                     self.name, permissions_record, is_file_transfer_message=False
-                 )
+                 yield stream_data_to_airbyte_message(self.name, permissions_record)
              except Exception as e:
                  self.logger.error(f"Failed to retrieve permissions for file {file.uri}: {str(e)}")
                  yield AirbyteMessage(
airbyte_cdk/sources/streams/concurrent/default_stream.py
@@ -29,6 +29,7 @@ class DefaultStream(AbstractStream):
          logger: Logger,
          cursor: Cursor,
          namespace: Optional[str] = None,
+         supports_file_transfer: bool = False,
      ) -> None:
          self._stream_partition_generator = partition_generator
          self._name = name
@@ -39,6 +40,7 @@
          self._logger = logger
          self._cursor = cursor
          self._namespace = namespace
+         self._supports_file_transfer = supports_file_transfer

      def generate_partitions(self) -> Iterable[Partition]:
          yield from self._stream_partition_generator.generate()
@@ -68,6 +70,7 @@
              json_schema=dict(self._json_schema),
              supported_sync_modes=[SyncMode.full_refresh],
              is_resumable=False,
+             is_file_based=self._supports_file_transfer,
          )

          if self._namespace:
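The net effect is that streams can now advertise file support on their catalog entry. A sketch of the resulting AirbyteStream, assuming the protocol model in this dev build exposes the new is_file_based field (as the diff indicates); the name and schema are illustrative:

from airbyte_cdk.models import AirbyteStream, SyncMode

stream = AirbyteStream(
    name="attachments",
    json_schema={"type": "object"},
    supported_sync_modes=[SyncMode.full_refresh],
    is_resumable=False,
    is_file_based=True,  # set from supports_file_transfer / use_file_transfer
)
print(stream.is_file_based)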
airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py
@@ -71,10 +71,6 @@ class AbstractStreamStateConverter(ABC):
          for stream_slice in state.get("slices", []):
              stream_slice[self.START_KEY] = self._from_state_message(stream_slice[self.START_KEY])
              stream_slice[self.END_KEY] = self._from_state_message(stream_slice[self.END_KEY])
-             if self.MOST_RECENT_RECORD_KEY in stream_slice:
-                 stream_slice[self.MOST_RECENT_RECORD_KEY] = self._from_state_message(
-                     stream_slice[self.MOST_RECENT_RECORD_KEY]
-                 )
          return state

      def serialize(