airbyte-cdk 6.18.1__py3-none-any.whl → 6.19.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -678,7 +678,7 @@ definitions:
678
678
  properties:
679
679
  type:
680
680
  type: string
681
- enum: [ CustomSchemaNormalization ]
681
+ enum: [CustomSchemaNormalization]
682
682
  class_name:
683
683
  title: Class Name
684
684
  description: Fully-qualified name of the class that will be implementing the custom normalization. The format is `source_<name>.<package>.<class_name>`.
@@ -2886,6 +2886,7 @@ definitions:
2886
2886
  parser:
2887
2887
  anyOf:
2888
2888
  - "$ref": "#/definitions/GzipParser"
2889
+ - "$ref": "#/definitions/JsonParser"
2889
2890
  - "$ref": "#/definitions/JsonLineParser"
2890
2891
  - "$ref": "#/definitions/CsvParser"
2891
2892
  # PARSERS
@@ -2902,6 +2903,20 @@ definitions:
2902
2903
  anyOf:
2903
2904
  - "$ref": "#/definitions/JsonLineParser"
2904
2905
  - "$ref": "#/definitions/CsvParser"
2906
+ - "$ref": "#/definitions/JsonParser"
2907
+ JsonParser:
2908
+ title: JsonParser
2909
+ description: Parser used for parsing str, bytes, or bytearray data and returning data in a dictionary format.
2910
+ type: object
2911
+ required:
2912
+ - type
2913
+ properties:
2914
+ type:
2915
+ type: string
2916
+ enum: [JsonParser]
2917
+ encoding:
2918
+ type: string
2919
+ default: utf-8
2905
2920
  JsonLineParser:
2906
2921
  type: object
2907
2922
  required:
@@ -7,9 +7,12 @@ from dataclasses import dataclass
7
7
  from io import BufferedIOBase, TextIOWrapper
8
8
  from typing import Any, Generator, MutableMapping, Optional
9
9
 
10
+ import orjson
10
11
  import requests
11
12
 
13
+ from airbyte_cdk.models import FailureType
12
14
  from airbyte_cdk.sources.declarative.decoders.decoder import Decoder
15
+ from airbyte_cdk.utils import AirbyteTracedException
13
16
 
14
17
  logger = logging.getLogger("airbyte")
15
18
 
@@ -42,6 +45,46 @@ class GzipParser(Parser):
42
45
  yield from self.inner_parser.parse(gzipobj)
43
46
 
44
47
 
48
+ @dataclass
49
+ class JsonParser(Parser):
50
+ encoding: str = "utf-8"
51
+
52
+ def parse(self, data: BufferedIOBase) -> Generator[MutableMapping[str, Any], None, None]:
53
+ """
54
+ Attempts to deserialize data using orjson library. As an extra layer of safety we fallback on the json library to deserialize the data.
55
+ """
56
+ raw_data = data.read()
57
+ body_json = self._parse_orjson(raw_data) or self._parse_json(raw_data)
58
+
59
+ if body_json is None:
60
+ raise AirbyteTracedException(
61
+ message="Response JSON data failed to be parsed. See logs for more information.",
62
+ internal_message=f"Response JSON data failed to be parsed.",
63
+ failure_type=FailureType.system_error,
64
+ )
65
+
66
+ if isinstance(body_json, list):
67
+ yield from body_json
68
+ else:
69
+ yield from [body_json]
70
+
71
+ def _parse_orjson(self, raw_data: bytes) -> Optional[Any]:
72
+ try:
73
+ return orjson.loads(raw_data.decode(self.encoding))
74
+ except Exception as exc:
75
+ logger.debug(
76
+ f"Failed to parse JSON data using orjson library. Falling back to json library. {exc}"
77
+ )
78
+ return None
79
+
80
+ def _parse_json(self, raw_data: bytes) -> Optional[Any]:
81
+ try:
82
+ return json.loads(raw_data.decode(self.encoding))
83
+ except Exception as exc:
84
+ logger.error(f"Failed to parse JSON data using json library. {exc}")
85
+ return None
86
+
87
+
45
88
  @dataclass
46
89
  class JsonLineParser(Parser):
47
90
  encoding: Optional[str] = "utf-8"
@@ -1201,6 +1201,14 @@ class LegacySessionTokenAuthenticator(BaseModel):
1201
1201
  parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")
1202
1202
 
1203
1203
 
1204
+ class JsonParser(BaseModel):
1205
+ class Config:
1206
+ extra = Extra.allow
1207
+
1208
+ type: Literal["JsonParser"]
1209
+ encoding: Optional[str] = "utf-8"
1210
+
1211
+
1204
1212
  class JsonLineParser(BaseModel):
1205
1213
  type: Literal["JsonLineParser"]
1206
1214
  encoding: Optional[str] = "utf-8"
@@ -1599,7 +1607,7 @@ class RecordSelector(BaseModel):
1599
1607
 
1600
1608
  class GzipParser(BaseModel):
1601
1609
  type: Literal["GzipParser"]
1602
- inner_parser: Union[JsonLineParser, CsvParser]
1610
+ inner_parser: Union[JsonLineParser, CsvParser, JsonParser]
1603
1611
 
1604
1612
 
1605
1613
  class Spec(BaseModel):
@@ -1634,7 +1642,7 @@ class CompositeErrorHandler(BaseModel):
1634
1642
 
1635
1643
  class CompositeRawDecoder(BaseModel):
1636
1644
  type: Literal["CompositeRawDecoder"]
1637
- parser: Union[GzipParser, JsonLineParser, CsvParser]
1645
+ parser: Union[GzipParser, JsonParser, JsonLineParser, CsvParser]
1638
1646
 
1639
1647
 
1640
1648
  class DeclarativeSource1(BaseModel):
@@ -72,6 +72,8 @@ from airbyte_cdk.sources.declarative.decoders.composite_raw_decoder import (
72
72
  CsvParser,
73
73
  GzipParser,
74
74
  JsonLineParser,
75
+ JsonParser,
76
+ Parser,
75
77
  )
76
78
  from airbyte_cdk.sources.declarative.extractors import (
77
79
  DpathExtractor,
@@ -247,6 +249,9 @@ from airbyte_cdk.sources.declarative.models.declarative_component_schema import
247
249
  from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
248
250
  JsonLineParser as JsonLineParserModel,
249
251
  )
252
+ from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
253
+ JsonParser as JsonParserModel,
254
+ )
250
255
  from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
251
256
  JwtAuthenticator as JwtAuthenticatorModel,
252
257
  )
@@ -522,6 +527,7 @@ class ModelToComponentFactory:
522
527
  JsonDecoderModel: self.create_json_decoder,
523
528
  JsonlDecoderModel: self.create_jsonl_decoder,
524
529
  JsonLineParserModel: self.create_json_line_parser,
530
+ JsonParserModel: self.create_json_parser,
525
531
  GzipJsonDecoderModel: self.create_gzipjson_decoder,
526
532
  GzipParserModel: self.create_gzip_parser,
527
533
  KeysToLowerModel: self.create_keys_to_lower_transformation,
@@ -1032,17 +1038,17 @@ class ModelToComponentFactory:
1032
1038
  self, model: CursorPaginationModel, config: Config, decoder: Decoder, **kwargs: Any
1033
1039
  ) -> CursorPaginationStrategy:
1034
1040
  if isinstance(decoder, PaginationDecoderDecorator):
1035
- if not isinstance(decoder.decoder, (JsonDecoder, XmlDecoder)):
1036
- raise ValueError(
1037
- f"Provided decoder of {type(decoder.decoder)=} is not supported. Please set JsonDecoder or XmlDecoder instead."
1038
- )
1041
+ inner_decoder = decoder.decoder
1042
+ else:
1043
+ inner_decoder = decoder
1044
+ decoder = PaginationDecoderDecorator(decoder=decoder)
1045
+
1046
+ if self._is_supported_decoder_for_pagination(inner_decoder):
1039
1047
  decoder_to_use = decoder
1040
1048
  else:
1041
- if not isinstance(decoder, (JsonDecoder, XmlDecoder)):
1042
- raise ValueError(
1043
- f"Provided decoder of {type(decoder)=} is not supported. Please set JsonDecoder or XmlDecoder instead."
1044
- )
1045
- decoder_to_use = PaginationDecoderDecorator(decoder=decoder)
1049
+ raise ValueError(
1050
+ self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder))
1051
+ )
1046
1052
 
1047
1053
  return CursorPaginationStrategy(
1048
1054
  cursor_value=model.cursor_value,
@@ -1515,11 +1521,10 @@ class ModelToComponentFactory:
1515
1521
  cursor_used_for_stop_condition: Optional[DeclarativeCursor] = None,
1516
1522
  ) -> Union[DefaultPaginator, PaginatorTestReadDecorator]:
1517
1523
  if decoder:
1518
- if not isinstance(decoder, (JsonDecoder, XmlDecoder)):
1519
- raise ValueError(
1520
- f"Provided decoder of {type(decoder)=} is not supported. Please set JsonDecoder or XmlDecoder instead."
1521
- )
1522
- decoder_to_use = PaginationDecoderDecorator(decoder=decoder)
1524
+ if self._is_supported_decoder_for_pagination(decoder):
1525
+ decoder_to_use = PaginationDecoderDecorator(decoder=decoder)
1526
+ else:
1527
+ raise ValueError(self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(decoder)))
1523
1528
  else:
1524
1529
  decoder_to_use = PaginationDecoderDecorator(decoder=JsonDecoder(parameters={}))
1525
1530
  page_size_option = (
@@ -1748,6 +1753,11 @@ class ModelToComponentFactory:
1748
1753
  def create_json_decoder(model: JsonDecoderModel, config: Config, **kwargs: Any) -> JsonDecoder:
1749
1754
  return JsonDecoder(parameters={})
1750
1755
 
1756
+ @staticmethod
1757
+ def create_json_parser(model: JsonParserModel, config: Config, **kwargs: Any) -> JsonParser:
1758
+ encoding = model.encoding if model.encoding else "utf-8"
1759
+ return JsonParser(encoding=encoding)
1760
+
1751
1761
  @staticmethod
1752
1762
  def create_jsonl_decoder(
1753
1763
  model: JsonlDecoderModel, config: Config, **kwargs: Any
@@ -1940,22 +1950,22 @@ class ModelToComponentFactory:
1940
1950
  message_repository=self._message_repository,
1941
1951
  )
1942
1952
 
1943
- @staticmethod
1944
1953
  def create_offset_increment(
1945
- model: OffsetIncrementModel, config: Config, decoder: Decoder, **kwargs: Any
1954
+ self, model: OffsetIncrementModel, config: Config, decoder: Decoder, **kwargs: Any
1946
1955
  ) -> OffsetIncrement:
1947
1956
  if isinstance(decoder, PaginationDecoderDecorator):
1948
- if not isinstance(decoder.decoder, (JsonDecoder, XmlDecoder)):
1949
- raise ValueError(
1950
- f"Provided decoder of {type(decoder.decoder)=} is not supported. Please set JsonDecoder or XmlDecoder instead."
1951
- )
1957
+ inner_decoder = decoder.decoder
1958
+ else:
1959
+ inner_decoder = decoder
1960
+ decoder = PaginationDecoderDecorator(decoder=decoder)
1961
+
1962
+ if self._is_supported_decoder_for_pagination(inner_decoder):
1952
1963
  decoder_to_use = decoder
1953
1964
  else:
1954
- if not isinstance(decoder, (JsonDecoder, XmlDecoder)):
1955
- raise ValueError(
1956
- f"Provided decoder of {type(decoder)=} is not supported. Please set JsonDecoder or XmlDecoder instead."
1957
- )
1958
- decoder_to_use = PaginationDecoderDecorator(decoder=decoder)
1965
+ raise ValueError(
1966
+ self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder))
1967
+ )
1968
+
1959
1969
  return OffsetIncrement(
1960
1970
  page_size=model.page_size,
1961
1971
  config=config,
@@ -2555,3 +2565,25 @@ class ModelToComponentFactory:
2555
2565
  components_mapping=components_mapping,
2556
2566
  parameters=model.parameters or {},
2557
2567
  )
2568
+
2569
+ _UNSUPPORTED_DECODER_ERROR = (
2570
+ "Specified decoder of {decoder_type} is not supported for pagination."
2571
+ "Please set as `JsonDecoder`, `XmlDecoder`, or a `CompositeRawDecoder` with an inner_parser of `JsonParser` or `GzipParser` instead."
2572
+ "If using `GzipParser`, please ensure that the lowest level inner_parser is a `JsonParser`."
2573
+ )
2574
+
2575
+ def _is_supported_decoder_for_pagination(self, decoder: Decoder) -> bool:
2576
+ if isinstance(decoder, (JsonDecoder, XmlDecoder)):
2577
+ return True
2578
+ elif isinstance(decoder, CompositeRawDecoder):
2579
+ return self._is_supported_parser_for_pagination(decoder.parser)
2580
+ else:
2581
+ return False
2582
+
2583
+ def _is_supported_parser_for_pagination(self, parser: Parser) -> bool:
2584
+ if isinstance(parser, JsonParser):
2585
+ return True
2586
+ elif isinstance(parser, GzipParser):
2587
+ return isinstance(parser.inner_parser, JsonParser)
2588
+ else:
2589
+ return False
@@ -31,6 +31,17 @@ class DeliverRawFiles(BaseModel):
31
31
 
32
32
  delivery_type: Literal["use_file_transfer"] = Field("use_file_transfer", const=True)
33
33
 
34
+ preserve_directory_structure: bool = Field(
35
+ title="Preserve Sub-Directories in File Paths",
36
+ description=(
37
+ "If enabled, sends subdirectory folder structure "
38
+ "along with source file names to the destination. "
39
+ "Otherwise, files will be synced by their names only. "
40
+ "This option is ignored when file-based replication is not enabled."
41
+ ),
42
+ default=True,
43
+ )
44
+
34
45
 
35
46
  class AbstractFileBasedSpec(BaseModel):
36
47
  """
@@ -111,6 +111,40 @@ class ErrorListingFiles(BaseFileBasedSourceError):
111
111
  pass
112
112
 
113
113
 
114
+ class DuplicatedFilesError(BaseFileBasedSourceError):
115
+ def __init__(self, duplicated_files_names: List[dict[str, List[str]]], **kwargs: Any):
116
+ self._duplicated_files_names = duplicated_files_names
117
+ self._stream_name: str = kwargs["stream"]
118
+ super().__init__(self._format_duplicate_files_error_message(), **kwargs)
119
+
120
+ def _format_duplicate_files_error_message(self) -> str:
121
+ duplicated_files_messages = []
122
+ for duplicated_file in self._duplicated_files_names:
123
+ for duplicated_file_name, file_paths in duplicated_file.items():
124
+ file_duplicated_message = (
125
+ f"{len(file_paths)} duplicates found for file name {duplicated_file_name}:\n\n"
126
+ + "".join(f"\n - {file_paths}")
127
+ )
128
+ duplicated_files_messages.append(file_duplicated_message)
129
+
130
+ error_message = (
131
+ f"ERROR: Duplicate filenames found for stream {self._stream_name}. "
132
+ "Duplicate file names are not allowed if the Preserve Sub-Directories in File Paths option is disabled. "
133
+ "Please remove or rename the duplicate files before attempting to re-run the sync.\n\n"
134
+ + "\n".join(duplicated_files_messages)
135
+ )
136
+
137
+ return error_message
138
+
139
+ def __repr__(self) -> str:
140
+ """Return a string representation of the exception."""
141
+ class_name = self.__class__.__name__
142
+ properties_str = ", ".join(
143
+ f"{k}={v!r}" for k, v in self.__dict__.items() if not k.startswith("_")
144
+ )
145
+ return f"{class_name}({properties_str})"
146
+
147
+
114
148
  class CustomFileBasedException(AirbyteTracedException):
115
149
  """
116
150
  A specialized exception for file-based connectors.
@@ -242,7 +242,7 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
242
242
  stream=self._make_default_stream(
243
243
  stream_config=stream_config,
244
244
  cursor=cursor,
245
- use_file_transfer=self._use_file_transfer(parsed_config),
245
+ parsed_config=parsed_config,
246
246
  ),
247
247
  source=self,
248
248
  logger=self.logger,
@@ -273,7 +273,7 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
273
273
  stream=self._make_default_stream(
274
274
  stream_config=stream_config,
275
275
  cursor=cursor,
276
- use_file_transfer=self._use_file_transfer(parsed_config),
276
+ parsed_config=parsed_config,
277
277
  ),
278
278
  source=self,
279
279
  logger=self.logger,
@@ -285,7 +285,7 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
285
285
  stream = self._make_default_stream(
286
286
  stream_config=stream_config,
287
287
  cursor=cursor,
288
- use_file_transfer=self._use_file_transfer(parsed_config),
288
+ parsed_config=parsed_config,
289
289
  )
290
290
 
291
291
  streams.append(stream)
@@ -298,7 +298,7 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
298
298
  self,
299
299
  stream_config: FileBasedStreamConfig,
300
300
  cursor: Optional[AbstractFileBasedCursor],
301
- use_file_transfer: bool = False,
301
+ parsed_config: AbstractFileBasedSpec,
302
302
  ) -> AbstractFileBasedStream:
303
303
  return DefaultFileBasedStream(
304
304
  config=stream_config,
@@ -310,7 +310,8 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
310
310
  validation_policy=self._validate_and_get_validation_policy(stream_config),
311
311
  errors_collector=self.errors_collector,
312
312
  cursor=cursor,
313
- use_file_transfer=use_file_transfer,
313
+ use_file_transfer=self._use_file_transfer(parsed_config),
314
+ preserve_directory_structure=self._preserve_directory_structure(parsed_config),
314
315
  )
315
316
 
316
317
  def _get_stream_from_catalog(
@@ -385,3 +386,25 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
385
386
  and parsed_config.delivery_method.delivery_type == "use_file_transfer"
386
387
  )
387
388
  return use_file_transfer
389
+
390
+ @staticmethod
391
+ def _preserve_directory_structure(parsed_config: AbstractFileBasedSpec) -> bool:
392
+ """
393
+ Determines whether to preserve directory structure during file transfer.
394
+
395
+ When enabled, files maintain their subdirectory paths in the destination.
396
+ When disabled, files are flattened to the root of the destination.
397
+
398
+ Args:
399
+ parsed_config: The parsed configuration containing delivery method settings
400
+
401
+ Returns:
402
+ True if directory structure should be preserved (default), False otherwise
403
+ """
404
+ if (
405
+ FileBasedSource._use_file_transfer(parsed_config)
406
+ and hasattr(parsed_config.delivery_method, "preserve_directory_structure")
407
+ and parsed_config.delivery_method.preserve_directory_structure is not None
408
+ ):
409
+ return parsed_config.delivery_method.preserve_directory_structure
410
+ return True
@@ -135,6 +135,17 @@ class AbstractFileBasedStreamReader(ABC):
135
135
  return use_file_transfer
136
136
  return False
137
137
 
138
+ def preserve_directory_structure(self) -> bool:
139
+ # fall back to preserve subdirectories if config is not present or incomplete
140
+ if (
141
+ self.use_file_transfer()
142
+ and self.config
143
+ and hasattr(self.config.delivery_method, "preserve_directory_structure")
144
+ and self.config.delivery_method.preserve_directory_structure is not None
145
+ ):
146
+ return self.config.delivery_method.preserve_directory_structure
147
+ return True
148
+
138
149
  @abstractmethod
139
150
  def get_file(
140
151
  self, file: RemoteFile, local_directory: str, logger: logging.Logger
@@ -159,10 +170,13 @@ class AbstractFileBasedStreamReader(ABC):
159
170
  """
160
171
  ...
161
172
 
162
- @staticmethod
163
- def _get_file_transfer_paths(file: RemoteFile, local_directory: str) -> List[str]:
164
- # Remove left slashes from source path format to make relative path for writing locally
165
- file_relative_path = file.uri.lstrip("/")
173
+ def _get_file_transfer_paths(self, file: RemoteFile, local_directory: str) -> List[str]:
174
+ preserve_directory_structure = self.preserve_directory_structure()
175
+ if preserve_directory_structure:
176
+ # Remove left slashes from source path format to make relative path for writing locally
177
+ file_relative_path = file.uri.lstrip("/")
178
+ else:
179
+ file_relative_path = path.basename(file.uri)
166
180
  local_file_path = path.join(local_directory, file_relative_path)
167
181
 
168
182
  # Ensure the local directory exists
@@ -2,6 +2,7 @@
2
2
  # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
3
  #
4
4
  import logging
5
+ import os
5
6
  import traceback
6
7
  from datetime import datetime
7
8
  from io import BytesIO, IOBase
@@ -42,12 +43,34 @@ unstructured_partition_pdf = None
42
43
  unstructured_partition_docx = None
43
44
  unstructured_partition_pptx = None
44
45
 
46
+ AIRBYTE_NLTK_DATA_DIR = "/airbyte/nltk_data"
47
+ TMP_NLTK_DATA_DIR = "/tmp/nltk_data"
48
+
49
+
50
+ def get_nltk_temp_folder() -> str:
51
+ """
52
+ For non-root connectors /tmp is not currently writable, but we should allow it in the future.
53
+ It's safe to use /airbyte for now. Fallback to /tmp for local development.
54
+ """
55
+ try:
56
+ nltk_data_dir = AIRBYTE_NLTK_DATA_DIR
57
+ os.makedirs(nltk_data_dir, exist_ok=True)
58
+ except OSError:
59
+ nltk_data_dir = TMP_NLTK_DATA_DIR
60
+ os.makedirs(nltk_data_dir, exist_ok=True)
61
+ return nltk_data_dir
62
+
63
+
45
64
  try:
65
+ nltk_data_dir = get_nltk_temp_folder()
66
+ nltk.data.path.append(nltk_data_dir)
46
67
  nltk.data.find("tokenizers/punkt.zip")
47
68
  nltk.data.find("tokenizers/punkt_tab.zip")
69
+ nltk.data.find("tokenizers/averaged_perceptron_tagger_eng.zip")
48
70
  except LookupError:
49
- nltk.download("punkt")
50
- nltk.download("punkt_tab")
71
+ nltk.download("punkt", download_dir=nltk_data_dir, quiet=True)
72
+ nltk.download("punkt_tab", download_dir=nltk_data_dir, quiet=True)
73
+ nltk.download("averaged_perceptron_tagger_eng", download_dir=nltk_data_dir, quiet=True)
51
74
 
52
75
 
53
76
  def optional_decode(contents: Union[str, bytes]) -> str:
@@ -5,14 +5,17 @@
5
5
  import asyncio
6
6
  import itertools
7
7
  import traceback
8
+ from collections import defaultdict
8
9
  from copy import deepcopy
9
10
  from functools import cache
10
- from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Optional, Set, Union
11
+ from os import path
12
+ from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Optional, Set, Tuple, Union
11
13
 
12
14
  from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, FailureType, Level
13
15
  from airbyte_cdk.models import Type as MessageType
14
16
  from airbyte_cdk.sources.file_based.config.file_based_stream_config import PrimaryKeyType
15
17
  from airbyte_cdk.sources.file_based.exceptions import (
18
+ DuplicatedFilesError,
16
19
  FileBasedSourceError,
17
20
  InvalidSchemaError,
18
21
  MissingSchemaError,
@@ -43,6 +46,8 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
43
46
  """
44
47
 
45
48
  FILE_TRANSFER_KW = "use_file_transfer"
49
+ PRESERVE_DIRECTORY_STRUCTURE_KW = "preserve_directory_structure"
50
+ FILES_KEY = "files"
46
51
  DATE_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"
47
52
  ab_last_mod_col = "_ab_source_file_last_modified"
48
53
  ab_file_name_col = "_ab_source_file_url"
@@ -50,10 +55,15 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
50
55
  source_file_url = "source_file_url"
51
56
  airbyte_columns = [ab_last_mod_col, ab_file_name_col]
52
57
  use_file_transfer = False
58
+ preserve_directory_structure = True
53
59
 
54
60
  def __init__(self, **kwargs: Any):
55
61
  if self.FILE_TRANSFER_KW in kwargs:
56
62
  self.use_file_transfer = kwargs.pop(self.FILE_TRANSFER_KW, False)
63
+ if self.PRESERVE_DIRECTORY_STRUCTURE_KW in kwargs:
64
+ self.preserve_directory_structure = kwargs.pop(
65
+ self.PRESERVE_DIRECTORY_STRUCTURE_KW, True
66
+ )
57
67
  super().__init__(**kwargs)
58
68
 
59
69
  @property
@@ -98,15 +108,33 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
98
108
  else:
99
109
  return super()._filter_schema_invalid_properties(configured_catalog_json_schema)
100
110
 
111
+ def _duplicated_files_names(
112
+ self, slices: List[dict[str, List[RemoteFile]]]
113
+ ) -> List[dict[str, List[str]]]:
114
+ seen_file_names: Dict[str, List[str]] = defaultdict(list)
115
+ for file_slice in slices:
116
+ for file_found in file_slice[self.FILES_KEY]:
117
+ file_name = path.basename(file_found.uri)
118
+ seen_file_names[file_name].append(file_found.uri)
119
+ return [
120
+ {file_name: paths} for file_name, paths in seen_file_names.items() if len(paths) > 1
121
+ ]
122
+
101
123
  def compute_slices(self) -> Iterable[Optional[Mapping[str, Any]]]:
102
124
  # Sort files by last_modified, uri and return them grouped by last_modified
103
125
  all_files = self.list_files()
104
126
  files_to_read = self._cursor.get_files_to_sync(all_files, self.logger)
105
127
  sorted_files_to_read = sorted(files_to_read, key=lambda f: (f.last_modified, f.uri))
106
128
  slices = [
107
- {"files": list(group[1])}
129
+ {self.FILES_KEY: list(group[1])}
108
130
  for group in itertools.groupby(sorted_files_to_read, lambda f: f.last_modified)
109
131
  ]
132
+ if slices and not self.preserve_directory_structure:
133
+ duplicated_files_names = self._duplicated_files_names(slices)
134
+ if duplicated_files_names:
135
+ raise DuplicatedFilesError(
136
+ stream=self.name, duplicated_files_names=duplicated_files_names
137
+ )
110
138
  return slices
111
139
 
112
140
  def transform_record(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: airbyte-cdk
3
- Version: 6.18.1
3
+ Version: 6.19.0
4
4
  Summary: A framework for writing Airbyte Connectors.
5
5
  License: MIT
6
6
  Keywords: airbyte,connector-development-kit,cdk
@@ -66,11 +66,11 @@ airbyte_cdk/sources/declarative/concurrent_declarative_source.py,sha256=tSTCSmyM
66
66
  airbyte_cdk/sources/declarative/datetime/__init__.py,sha256=l9LG7Qm6e5r_qgqfVKnx3mXYtg1I9MmMjomVIPfU4XA,177
67
67
  airbyte_cdk/sources/declarative/datetime/datetime_parser.py,sha256=SX9JjdesN1edN2WVUVMzU_ptqp2QB1OnsnjZ4mwcX7w,2579
68
68
  airbyte_cdk/sources/declarative/datetime/min_max_datetime.py,sha256=0BHBtDNQZfvwM45-tY5pNlTcKAFSGGNxemoi0Jic-0E,5785
69
- airbyte_cdk/sources/declarative/declarative_component_schema.yaml,sha256=00X3palFmBp9WqQugXgtzFVn7s17KYWKTrn83ObmBzc,134673
69
+ airbyte_cdk/sources/declarative/declarative_component_schema.yaml,sha256=IVkRHPGvhEvB2kokL8CAfrvCHpxE_b9Ox5um42br41U,135095
70
70
  airbyte_cdk/sources/declarative/declarative_source.py,sha256=nF7wBqFd3AQmEKAm4CnIo29CJoQL562cJGSCeL8U8bA,1531
71
71
  airbyte_cdk/sources/declarative/declarative_stream.py,sha256=JRyNeOIpsFu4ztVZsN6sncqUEIqIE-bUkD2TPgbMgk0,10375
72
72
  airbyte_cdk/sources/declarative/decoders/__init__.py,sha256=edGj4fGxznBk4xzRQyCA1rGfbpqe7z-RE0K3kQQWbgA,858
73
- airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py,sha256=-aO3ujXX9YTP2ZDvI2BP-x0VOKdAq2TlHo4zG8DCTlY,2748
73
+ airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py,sha256=kQfUVMVhChKe5OngwIQrs0F9KGnRUN-CKVFakCU23DQ,4354
74
74
  airbyte_cdk/sources/declarative/decoders/decoder.py,sha256=sl-Gt8lXi7yD2Q-sD8je5QS2PbgrgsYjxRLWsay7DMc,826
75
75
  airbyte_cdk/sources/declarative/decoders/json_decoder.py,sha256=qdbjeR6RffKaah_iWvMsOcDolYuxJY5DaI3b9AMTZXg,3327
76
76
  airbyte_cdk/sources/declarative/decoders/noop_decoder.py,sha256=iZh0yKY_JzgBnJWiubEusf5c0o6Khd-8EWFWT-8EgFo,542
@@ -106,12 +106,12 @@ airbyte_cdk/sources/declarative/migrations/__init__.py,sha256=47DEQpj8HBSa-_TImW
106
106
  airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py,sha256=iemy3fKLczcU0-Aor7tx5jcT6DRedKMqyK7kCOp01hg,3924
107
107
  airbyte_cdk/sources/declarative/migrations/state_migration.py,sha256=KWPjealMLKSMtajXgkdGgKg7EmTLR-CqqD7UIh0-eDU,794
108
108
  airbyte_cdk/sources/declarative/models/__init__.py,sha256=nUFxNCiKeYRVXuZEKA7GD-lTHxsiKcQ8FitZjKhPIvE,100
109
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py,sha256=1wrAW9XeEq2xdUAAkmHcelka-LOwyYb-izRcACkNPKM,94915
109
+ airbyte_cdk/sources/declarative/models/declarative_component_schema.py,sha256=dy9CSSvW1gAoMCAXkoOxLJTRVTrcHpYFENYgLqaUOwM,95087
110
110
  airbyte_cdk/sources/declarative/parsers/__init__.py,sha256=ZnqYNxHsKCgO38IwB34RQyRMXTs4GTvlRi3ImKnIioo,61
111
111
  airbyte_cdk/sources/declarative/parsers/custom_exceptions.py,sha256=Rir9_z3Kcd5Es0-LChrzk-0qubAsiK_RSEnLmK2OXm8,553
112
112
  airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py,sha256=CXwTfD3wSQq3okcqwigpprbHhSURUokh4GK2OmOyKC8,9132
113
113
  airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py,sha256=IWUOdF03o-aQn0Occo1BJCxU0Pz-QILk5L67nzw2thw,6803
114
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py,sha256=NElLb7eLDVmxDgtTX9fQ-ZPrpfH3d7RpMDaQiLtvuuQ,110550
114
+ airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py,sha256=v0Rd3V2b6AWJpQTOVlJbP58jRfNUwuhH22Q2fiA0itc,111475
115
115
  airbyte_cdk/sources/declarative/partition_routers/__init__.py,sha256=HJ-Syp3p7RpyR_OK0X_a2kSyISfu3W-PKrRI16iY0a8,957
116
116
  airbyte_cdk/sources/declarative/partition_routers/async_job_partition_router.py,sha256=n82J15S8bjeMZ5uROu--P3hnbQoxkY5v7RPHYx7g7ro,2929
117
117
  airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py,sha256=c5cuVFM6NFkuQqG8Z5IwkBuwDrvXZN1CunUOM_L0ezg,6892
@@ -196,7 +196,7 @@ airbyte_cdk/sources/file_based/availability_strategy/__init__.py,sha256=ddKQfUmk
196
196
  airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py,sha256=01Nd4b7ERAbp-OZo_8rrAzFXWPTMwr02SnWiN17nx8Q,2363
197
197
  airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py,sha256=j9T5TimfWFUz7nqsaj-83G3xWmDpsmeSbDnaUNmz0UM,5849
198
198
  airbyte_cdk/sources/file_based/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
199
- airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py,sha256=tj-M1L5BTa5yIQ3jHo09CtCTSq_eR-68zgyOPqwsurw,6455
199
+ airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py,sha256=gXlZwnEKLWknnK_n7j14lANgR6vkqhlLJ-G3rRu-ox4,6897
200
200
  airbyte_cdk/sources/file_based/config/avro_format.py,sha256=NxTF96ewzn6HuhgodsY7Rpb-ybr1ZEWW5d4Vid64g5A,716
201
201
  airbyte_cdk/sources/file_based/config/csv_format.py,sha256=NWekkyT8dTwiVK0mwa_krQD4FJPHSDfILo8kPAg3-Vs,8006
202
202
  airbyte_cdk/sources/file_based/config/excel_format.py,sha256=9qAmTsT6SoVzNfNv0oBVkVCmiyqQuVAbfRKajjoa7Js,378
@@ -207,9 +207,9 @@ airbyte_cdk/sources/file_based/config/unstructured_format.py,sha256=tIbB9Pn1HqU6
207
207
  airbyte_cdk/sources/file_based/discovery_policy/__init__.py,sha256=gl3ey6mZbyfraB9P3pFhf9UJp2JeTZ1SUFAopy2iBvY,301
208
208
  airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py,sha256=dCfXX529Rd5rtopg4VeEgTPJjFtqjtjzPq6LCw18Wt0,605
209
209
  airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py,sha256=-xujTidtrq6HC00WKbjQh1CZdT5LMuzkp5BLjqDmfTY,1007
210
- airbyte_cdk/sources/file_based/exceptions.py,sha256=AEELNIRzKPX6eopKd_2jhE7WiNeR0Aw7nQWVOL8fvkc,5760
211
- airbyte_cdk/sources/file_based/file_based_source.py,sha256=RfpctRNLJ_EHKKEc2E1EZGYRfhG0Z9o6TgsKS4XrSNY,16652
212
- airbyte_cdk/sources/file_based/file_based_stream_reader.py,sha256=ohxKlqPuV7TGwjyRy_gaWUol8QN5lBSoCYoaqBtRh1c,6179
210
+ airbyte_cdk/sources/file_based/exceptions.py,sha256=WP0qkG6fpWoBpOyyicgp5YNE393VWyegq5qSy0v4QtM,7362
211
+ airbyte_cdk/sources/file_based/file_based_source.py,sha256=Biv2QufYQtHZQCBZs4iCUpqTd82rk7xo8SDYkEeau3k,17616
212
+ airbyte_cdk/sources/file_based/file_based_stream_reader.py,sha256=e1KhgTh7mzvkBOz9DjLwzOsDwevrTmbxSYIcvhgWgGM,6856
213
213
  airbyte_cdk/sources/file_based/file_types/__init__.py,sha256=blCLn0-2LC-ZdgcNyDEhqM2RiUvEjEBh-G4-t32ZtuM,1268
214
214
  airbyte_cdk/sources/file_based/file_types/avro_parser.py,sha256=XNx-JC-sgzH9u3nOJ2M59FxBXvtig8LN6BIkeDOavZA,10858
215
215
  airbyte_cdk/sources/file_based/file_types/csv_parser.py,sha256=QlCXB-ry3np67Q_VerQEPoWDOTcPTB6Go4ydZxY9ae4,20445
@@ -218,7 +218,7 @@ airbyte_cdk/sources/file_based/file_types/file_transfer.py,sha256=HyGRihJxcb_lEs
218
218
  airbyte_cdk/sources/file_based/file_types/file_type_parser.py,sha256=JgpH21PrbRqwK92BJklZWvh2TndA6xZ-eP1LPMo44oQ,2832
219
219
  airbyte_cdk/sources/file_based/file_types/jsonl_parser.py,sha256=GwyNyxmST4RX-XpXy7xVH0D-znYWWBmGv_pVAu95oHQ,5886
220
220
  airbyte_cdk/sources/file_based/file_types/parquet_parser.py,sha256=XenFg5sJ-UBnIkSmsiNJRou11NO0zZXx-RXgPHMT2NA,10487
221
- airbyte_cdk/sources/file_based/file_types/unstructured_parser.py,sha256=r5FNcJadiI5PTyl1-doIodPCwW7xZWOTHl4Epd-w0-8,18602
221
+ airbyte_cdk/sources/file_based/file_types/unstructured_parser.py,sha256=2TYOQl62FQPCa8otLbkDIk_j01EP3oWaKSfXGhCjCHg,19492
222
222
  airbyte_cdk/sources/file_based/remote_file.py,sha256=yqRz93vPe8PBXLIMJ5W5u2JRlZRhg6sBrAjn3pPjJ8A,315
223
223
  airbyte_cdk/sources/file_based/schema_helpers.py,sha256=Cf8FH1bDFP0qCDDfEYir_WjP4exXUnikz8hZ40y1Ek0,9601
224
224
  airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py,sha256=FkByIyEy56x2_awYnxGPqGaOp7zAzpAoRkPZHKySI9M,536
@@ -235,7 +235,7 @@ airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_c
235
235
  airbyte_cdk/sources/file_based/stream/cursor/__init__.py,sha256=MhFB5hOo8sjwvCh8gangaymdg3EJWYt_72brFOZt068,191
236
236
  airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py,sha256=om-x3gZFPgWDpi15S9RxZmR36VHnk8sytgN6LlBQhAw,1934
237
237
  airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py,sha256=VGV7xLyBribuBMVrXtO1xqkWJD86bl7yhXtjnwLMohM,7051
238
- airbyte_cdk/sources/file_based/stream/default_file_based_stream.py,sha256=rpwU6AOyhFLuXtcFKkcOHFWbRQ4kLCOKzAjcID_M87k,16770
238
+ airbyte_cdk/sources/file_based/stream/default_file_based_stream.py,sha256=XLU5cNqQ-5mj243gNzMyXtm_oCtg1ORyoqbCsUo9Dn4,18044
239
239
  airbyte_cdk/sources/file_based/types.py,sha256=INxG7OPnkdUP69oYNKMAbwhvV1AGvLRHs1J6pIia2FI,218
240
240
  airbyte_cdk/sources/http_config.py,sha256=OBZeuyFilm6NlDlBhFQvHhTWabEvZww6OHDIlZujIS0,730
241
241
  airbyte_cdk/sources/http_logger.py,sha256=TyBmtRA6D9g0XDkKGvdM415b36RXDjgfkwRewDsH8-0,1576
@@ -343,8 +343,8 @@ airbyte_cdk/utils/slice_hasher.py,sha256=-pHexlNYoWYPnXNH-M7HEbjmeJe9Zk7SJijdQ7d
343
343
  airbyte_cdk/utils/spec_schema_transformations.py,sha256=-5HTuNsnDBAhj-oLeQXwpTGA0HdcjFOf2zTEMUTTg_Y,816
344
344
  airbyte_cdk/utils/stream_status_utils.py,sha256=ZmBoiy5HVbUEHAMrUONxZvxnvfV9CesmQJLDTAIWnWw,1171
345
345
  airbyte_cdk/utils/traced_exception.py,sha256=C8uIBuCL_E4WnBAOPSxBicD06JAldoN9fGsQDp463OY,6292
346
- airbyte_cdk-6.18.1.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
347
- airbyte_cdk-6.18.1.dist-info/METADATA,sha256=OMpca59Gc1MJOlwEgvDJX0uwp7skSel83qkbtcan6hE,6000
348
- airbyte_cdk-6.18.1.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
349
- airbyte_cdk-6.18.1.dist-info/entry_points.txt,sha256=fj-e3PAQvsxsQzyyq8UkG1k8spunWnD4BAH2AwlR6NM,95
350
- airbyte_cdk-6.18.1.dist-info/RECORD,,
346
+ airbyte_cdk-6.19.0.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
347
+ airbyte_cdk-6.19.0.dist-info/METADATA,sha256=QX8m8I4zsR63ujqpxXOBwVISjVjAB8YN-K5e0b_bJAQ,6000
348
+ airbyte_cdk-6.19.0.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
349
+ airbyte_cdk-6.19.0.dist-info/entry_points.txt,sha256=fj-e3PAQvsxsQzyyq8UkG1k8spunWnD4BAH2AwlR6NM,95
350
+ airbyte_cdk-6.19.0.dist-info/RECORD,,