airbyte-cdk 6.18.0.dev3__py3-none-any.whl → 6.18.2__py3-none-any.whl

This diff shows the contents of two publicly released package versions as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (21)
  1. airbyte_cdk/sources/declarative/auth/oauth.py +26 -0
  2. airbyte_cdk/sources/declarative/declarative_component_schema.yaml +52 -36
  3. airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py +0 -43
  4. airbyte_cdk/sources/declarative/models/declarative_component_schema.py +44 -20
  5. airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +50 -58
  6. airbyte_cdk/sources/declarative/requesters/README.md +57 -0
  7. airbyte_cdk/sources/declarative/requesters/http_job_repository.py +33 -4
  8. airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +11 -0
  9. airbyte_cdk/sources/file_based/exceptions.py +34 -0
  10. airbyte_cdk/sources/file_based/file_based_source.py +28 -5
  11. airbyte_cdk/sources/file_based/file_based_stream_reader.py +18 -4
  12. airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +25 -2
  13. airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +30 -2
  14. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +20 -4
  15. airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +34 -4
  16. airbyte_cdk/sources/types.py +3 -0
  17. {airbyte_cdk-6.18.0.dev3.dist-info → airbyte_cdk-6.18.2.dist-info}/METADATA +1 -1
  18. {airbyte_cdk-6.18.0.dev3.dist-info → airbyte_cdk-6.18.2.dist-info}/RECORD +21 -20
  19. {airbyte_cdk-6.18.0.dev3.dist-info → airbyte_cdk-6.18.2.dist-info}/LICENSE.txt +0 -0
  20. {airbyte_cdk-6.18.0.dev3.dist-info → airbyte_cdk-6.18.2.dist-info}/WHEEL +0 -0
  21. {airbyte_cdk-6.18.0.dev3.dist-info → airbyte_cdk-6.18.2.dist-info}/entry_points.txt +0 -0
airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py

@@ -72,8 +72,6 @@ from airbyte_cdk.sources.declarative.decoders.composite_raw_decoder import (
     CsvParser,
     GzipParser,
     JsonLineParser,
-    JsonParser,
-    Parser,
 )
 from airbyte_cdk.sources.declarative.extractors import (
     DpathExtractor,
@@ -249,9 +247,6 @@ from airbyte_cdk.sources.declarative.models.declarative_component_schema import
 from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
     JsonLineParser as JsonLineParserModel,
 )
-from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
-    JsonParser as JsonParserModel,
-)
 from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
     JwtAuthenticator as JwtAuthenticatorModel,
 )
@@ -527,7 +522,6 @@ class ModelToComponentFactory:
             JsonDecoderModel: self.create_json_decoder,
             JsonlDecoderModel: self.create_jsonl_decoder,
             JsonLineParserModel: self.create_json_line_parser,
-            JsonParserModel: self.create_json_parser,
             GzipJsonDecoderModel: self.create_gzipjson_decoder,
             GzipParserModel: self.create_gzip_parser,
             KeysToLowerModel: self.create_keys_to_lower_transformation,
@@ -1038,17 +1032,17 @@ class ModelToComponentFactory:
         self, model: CursorPaginationModel, config: Config, decoder: Decoder, **kwargs: Any
     ) -> CursorPaginationStrategy:
         if isinstance(decoder, PaginationDecoderDecorator):
-            inner_decoder = decoder.decoder
-        else:
-            inner_decoder = decoder
-            decoder = PaginationDecoderDecorator(decoder=decoder)
-
-        if self._is_supported_decoder_for_pagination(inner_decoder):
+            if not isinstance(decoder.decoder, (JsonDecoder, XmlDecoder)):
+                raise ValueError(
+                    f"Provided decoder of {type(decoder.decoder)=} is not supported. Please set JsonDecoder or XmlDecoder instead."
+                )
             decoder_to_use = decoder
         else:
-            raise ValueError(
-                self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder))
-            )
+            if not isinstance(decoder, (JsonDecoder, XmlDecoder)):
+                raise ValueError(
+                    f"Provided decoder of {type(decoder)=} is not supported. Please set JsonDecoder or XmlDecoder instead."
+                )
+            decoder_to_use = PaginationDecoderDecorator(decoder=decoder)
 
         return CursorPaginationStrategy(
             cursor_value=model.cursor_value,
@@ -1521,10 +1515,11 @@ class ModelToComponentFactory:
         cursor_used_for_stop_condition: Optional[DeclarativeCursor] = None,
     ) -> Union[DefaultPaginator, PaginatorTestReadDecorator]:
         if decoder:
-            if self._is_supported_decoder_for_pagination(decoder):
-                decoder_to_use = PaginationDecoderDecorator(decoder=decoder)
-            else:
-                raise ValueError(self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(decoder)))
+            if not isinstance(decoder, (JsonDecoder, XmlDecoder)):
+                raise ValueError(
+                    f"Provided decoder of {type(decoder)=} is not supported. Please set JsonDecoder or XmlDecoder instead."
+                )
+            decoder_to_use = PaginationDecoderDecorator(decoder=decoder)
         else:
             decoder_to_use = PaginationDecoderDecorator(decoder=JsonDecoder(parameters={}))
         page_size_option = (
@@ -1753,11 +1748,6 @@ class ModelToComponentFactory:
     def create_json_decoder(model: JsonDecoderModel, config: Config, **kwargs: Any) -> JsonDecoder:
         return JsonDecoder(parameters={})
 
-    @staticmethod
-    def create_json_parser(model: JsonParserModel, config: Config, **kwargs: Any) -> JsonParser:
-        encoding = model.encoding if model.encoding else "utf-8"
-        return JsonParser(encoding=encoding)
-
     @staticmethod
     def create_jsonl_decoder(
         model: JsonlDecoderModel, config: Config, **kwargs: Any
@@ -1895,15 +1885,24 @@ class ModelToComponentFactory:
                 expires_in_name=InterpolatedString.create(
                     model.expires_in_name or "expires_in", parameters=model.parameters or {}
                 ).eval(config),
+                client_id_name=InterpolatedString.create(
+                    model.client_id_name or "client_id", parameters=model.parameters or {}
+                ).eval(config),
                 client_id=InterpolatedString.create(
                     model.client_id, parameters=model.parameters or {}
                 ).eval(config),
+                client_secret_name=InterpolatedString.create(
+                    model.client_secret_name or "client_secret", parameters=model.parameters or {}
+                ).eval(config),
                 client_secret=InterpolatedString.create(
                     model.client_secret, parameters=model.parameters or {}
                 ).eval(config),
                 access_token_config_path=model.refresh_token_updater.access_token_config_path,
                 refresh_token_config_path=model.refresh_token_updater.refresh_token_config_path,
                 token_expiry_date_config_path=model.refresh_token_updater.token_expiry_date_config_path,
+                grant_type_name=InterpolatedString.create(
+                    model.grant_type_name or "grant_type", parameters=model.parameters or {}
+                ).eval(config),
                 grant_type=InterpolatedString.create(
                     model.grant_type or "refresh_token", parameters=model.parameters or {}
                 ).eval(config),
@@ -1921,11 +1920,15 @@ class ModelToComponentFactory:
         return DeclarativeOauth2Authenticator(  # type: ignore
             access_token_name=model.access_token_name or "access_token",
             access_token_value=model.access_token_value,
+            client_id_name=model.client_id_name or "client_id",
             client_id=model.client_id,
+            client_secret_name=model.client_secret_name or "client_secret",
             client_secret=model.client_secret,
             expires_in_name=model.expires_in_name or "expires_in",
+            grant_type_name=model.grant_type_name or "grant_type",
             grant_type=model.grant_type or "refresh_token",
             refresh_request_body=model.refresh_request_body,
+            refresh_token_name=model.refresh_token_name or "refresh_token",
             refresh_token=model.refresh_token,
             scopes=model.scopes,
             token_expiry_date=model.token_expiry_date,
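The two OAuth hunks above thread the new `client_id_name`, `client_secret_name`, `grant_type_name`, and `refresh_token_name` options through the component factory. A minimal, hypothetical sketch of what these options control, assuming (as the field names and their defaults suggest) that they rename the keys of the token refresh payload; `build_refresh_request_body` is an illustrative helper, not a CDK API:

```python
# Illustrative only: configurable key names for the token refresh payload.
# The defaults ("client_id", "grant_type", ...) match the factory defaults above;
# everything else here is a hypothetical stand-in, not CDK code.
from typing import Any, Dict


def build_refresh_request_body(
    client_id: str,
    client_secret: str,
    refresh_token: str,
    client_id_name: str = "client_id",
    client_secret_name: str = "client_secret",
    refresh_token_name: str = "refresh_token",
    grant_type_name: str = "grant_type",
    grant_type: str = "refresh_token",
) -> Dict[str, Any]:
    """Assemble a token refresh payload using configurable key names."""
    return {
        grant_type_name: grant_type,
        client_id_name: client_id,
        client_secret_name: client_secret,
        refresh_token_name: refresh_token,
    }


# A provider that expects "app_id"/"app_secret" instead of the standard keys:
payload = build_refresh_request_body(
    client_id="my-id",
    client_secret="my-secret",
    refresh_token="my-refresh-token",
    client_id_name="app_id",
    client_secret_name="app_secret",
)
# {'grant_type': 'refresh_token', 'app_id': 'my-id', 'app_secret': 'my-secret',
#  'refresh_token': 'my-refresh-token'}
```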
@@ -1937,22 +1940,22 @@ class ModelToComponentFactory:
             message_repository=self._message_repository,
         )
 
+    @staticmethod
     def create_offset_increment(
-        self, model: OffsetIncrementModel, config: Config, decoder: Decoder, **kwargs: Any
+        model: OffsetIncrementModel, config: Config, decoder: Decoder, **kwargs: Any
     ) -> OffsetIncrement:
         if isinstance(decoder, PaginationDecoderDecorator):
-            inner_decoder = decoder.decoder
-        else:
-            inner_decoder = decoder
-            decoder = PaginationDecoderDecorator(decoder=decoder)
-
-        if self._is_supported_decoder_for_pagination(inner_decoder):
+            if not isinstance(decoder.decoder, (JsonDecoder, XmlDecoder)):
+                raise ValueError(
+                    f"Provided decoder of {type(decoder.decoder)=} is not supported. Please set JsonDecoder or XmlDecoder instead."
+                )
             decoder_to_use = decoder
         else:
-            raise ValueError(
-                self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder))
-            )
-
+            if not isinstance(decoder, (JsonDecoder, XmlDecoder)):
+                raise ValueError(
+                    f"Provided decoder of {type(decoder)=} is not supported. Please set JsonDecoder or XmlDecoder instead."
+                )
+            decoder_to_use = PaginationDecoderDecorator(decoder=decoder)
         return OffsetIncrement(
             page_size=model.page_size,
             config=config,
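The three pagination-related factory methods above (`create_cursor_pagination`, `create_default_paginator`, `create_offset_increment`) now repeat the same inline check in place of the removed `_is_supported_decoder_for_pagination` helper. A minimal sketch of that check, assuming `Decoder`, `JsonDecoder`, and `XmlDecoder` are importable from `airbyte_cdk.sources.declarative.decoders`; the helper name is hypothetical:

```python
# Illustrative only: the decoder validation the factory methods above now inline.
from airbyte_cdk.sources.declarative.decoders import Decoder, JsonDecoder, XmlDecoder


def require_supported_pagination_decoder(decoder: Decoder) -> None:
    """Raise unless the decoder is one of the two types accepted for pagination."""
    if not isinstance(decoder, (JsonDecoder, XmlDecoder)):
        raise ValueError(
            f"Provided decoder of {type(decoder)=} is not supported. "
            "Please set JsonDecoder or XmlDecoder instead."
        )


require_supported_pagination_decoder(JsonDecoder(parameters={}))  # passes silently
```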
@@ -2297,7 +2300,7 @@ class ModelToComponentFactory:
             extractor=download_extractor,
             name=name,
             record_filter=None,
-            transformations=[],
+            transformations=transformations,
             schema_normalization=TypeTransformer(TransformConfig.NoTransform),
             config=config,
             parameters={},
@@ -2334,6 +2337,16 @@ class ModelToComponentFactory:
             if model.delete_requester
             else None
         )
+        url_requester = (
+            self._create_component_from_model(
+                model=model.url_requester,
+                decoder=decoder,
+                config=config,
+                name=f"job extract_url - {name}",
+            )
+            if model.url_requester
+            else None
+        )
         status_extractor = self._create_component_from_model(
             model=model.status_extractor, decoder=decoder, config=config, name=name
         )
@@ -2344,6 +2357,7 @@ class ModelToComponentFactory:
             creation_requester=creation_requester,
             polling_requester=polling_requester,
             download_retriever=download_retriever,
+            url_requester=url_requester,
             abort_requester=abort_requester,
             delete_requester=delete_requester,
             status_extractor=status_extractor,
@@ -2541,25 +2555,3 @@ class ModelToComponentFactory:
             components_mapping=components_mapping,
             parameters=model.parameters or {},
         )
-
-    _UNSUPPORTED_DECODER_ERROR = (
-        "Specified decoder of {decoder_type} is not supported for pagination."
-        "Please set as `JsonDecoder`, `XmlDecoder`, or a `CompositeRawDecoder` with an inner_parser of `JsonParser` or `GzipParser` instead."
-        "If using `GzipParser`, please ensure that the lowest level inner_parser is a `JsonParser`."
-    )
-
-    def _is_supported_decoder_for_pagination(self, decoder: Decoder) -> bool:
-        if isinstance(decoder, (JsonDecoder, XmlDecoder)):
-            return True
-        elif isinstance(decoder, CompositeRawDecoder):
-            return self._is_supported_parser_for_pagination(decoder.parser)
-        else:
-            return False
-
-    def _is_supported_parser_for_pagination(self, parser: Parser) -> bool:
-        if isinstance(parser, JsonParser):
-            return True
-        elif isinstance(parser, GzipParser):
-            return isinstance(parser.inner_parser, JsonParser)
-        else:
-            return False
airbyte_cdk/sources/declarative/requesters/README.md (new file)

@@ -0,0 +1,57 @@
+# AsyncHttpJobRepository sequence diagram
+
+- Components marked as optional are not required and can be ignored.
+- if `url_requester` is not provided, `urls_extractor` will get urls from the `polling_job_response`
+- interpolation_context, e.g. `create_job_response` or `polling_job_response` can be obtained from stream_slice
+
+
+```mermaid
+---
+title: AsyncHttpJobRepository Sequence Diagram
+---
+sequenceDiagram
+    participant AsyncHttpJobRepository as AsyncOrchestrator
+    participant CreationRequester as creation_requester
+    participant PollingRequester as polling_requester
+    participant UrlRequester as url_requester (Optional)
+    participant DownloadRetriever as download_retriever
+    participant AbortRequester as abort_requester (Optional)
+    participant DeleteRequester as delete_requester (Optional)
+    participant Reporting Server as Async Reporting Server
+
+    AsyncHttpJobRepository ->> CreationRequester: Initiate job creation
+    CreationRequester ->> Reporting Server: Create job request
+    Reporting Server -->> CreationRequester: Job ID response
+    CreationRequester -->> AsyncHttpJobRepository: Job ID
+
+    loop Poll for job status
+        AsyncHttpJobRepository ->> PollingRequester: Check job status
+        PollingRequester ->> Reporting Server: Status request (interpolation_context: `create_job_response`)
+        Reporting Server -->> PollingRequester: Status response
+        PollingRequester -->> AsyncHttpJobRepository: Job status
+    end
+
+    alt Status: Ready
+        AsyncHttpJobRepository ->> UrlRequester: Request download URLs (if applicable)
+        UrlRequester ->> Reporting Server: URL request (interpolation_context: `polling_job_response`)
+        Reporting Server -->> UrlRequester: Download URLs
+        UrlRequester -->> AsyncHttpJobRepository: Download URLs
+
+        AsyncHttpJobRepository ->> DownloadRetriever: Download reports
+        DownloadRetriever ->> Reporting Server: Retrieve report data (interpolation_context: `url`)
+        Reporting Server -->> DownloadRetriever: Report data
+        DownloadRetriever -->> AsyncHttpJobRepository: Report data
+    else Status: Failed
+        AsyncHttpJobRepository ->> AbortRequester: Send abort request
+        AbortRequester ->> Reporting Server: Abort job
+        Reporting Server -->> AbortRequester: Abort confirmation
+        AbortRequester -->> AsyncHttpJobRepository: Confirmation
+    end
+
+    AsyncHttpJobRepository ->> DeleteRequester: Send delete job request
+    DeleteRequester ->> Reporting Server: Delete job
+    Reporting Server -->> DeleteRequester: Deletion confirmation
+    DeleteRequester -->> AsyncHttpJobRepository: Confirmation
+
+
+```
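As a rough, non-CDK illustration of the flow in the diagram above: create a job, poll until it is no longer running, then resolve download URLs either through the optional `url_requester` or directly from the polling response, and finally download. Every callable here is a hypothetical stand-in for the requester components, not a CDK API:

```python
# Illustrative only: the orchestration order from the sequence diagram above.
from typing import Callable, Iterable, Optional


def run_async_job(
    create_job: Callable[[], str],
    poll_status: Callable[[str], str],
    extract_urls_from_polling_response: Callable[[str], Iterable[str]],
    download: Callable[[str], Iterable[dict]],
    request_download_urls: Optional[Callable[[str], Iterable[str]]] = None,
) -> Iterable[dict]:
    job_id = create_job()
    status = poll_status(job_id)
    while status == "running":
        status = poll_status(job_id)  # real code would sleep/backoff between polls
    if status != "completed":
        raise RuntimeError(f"job {job_id} finished with status {status!r}")
    # Optional url_requester step; otherwise URLs come from the polling response.
    urls = (
        request_download_urls(job_id)
        if request_download_urls
        else extract_urls_from_polling_response(job_id)
    )
    for url in urls:
        yield from download(url)
```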
airbyte_cdk/sources/declarative/requesters/http_job_repository.py

@@ -31,6 +31,10 @@ LOGGER = logging.getLogger("airbyte")
 
 @dataclass
 class AsyncHttpJobRepository(AsyncJobRepository):
+    """
+    See Readme file for more details about flow.
+    """
+
     creation_requester: Requester
     polling_requester: Requester
     download_retriever: SimpleRetriever
@@ -44,6 +48,9 @@ class AsyncHttpJobRepository(AsyncJobRepository):
     record_extractor: RecordExtractor = field(
         init=False, repr=False, default_factory=lambda: ResponseToFileExtractor({})
     )
+    url_requester: Optional[Requester] = (
+        None  # use it in case polling_requester provides some <id> and extra request is needed to obtain list of urls to download from
+    )
 
     def __post_init__(self) -> None:
         self._create_job_response_by_id: Dict[str, Response] = {}
@@ -186,10 +193,13 @@ class AsyncHttpJobRepository(AsyncJobRepository):
 
         """
 
-        for url in self.urls_extractor.extract_records(
-            self._polling_job_response_by_id[job.api_job_id()]
-        ):
-            stream_slice: StreamSlice = StreamSlice(partition={"url": url}, cursor_slice={})
+        for url in self._get_download_url(job):
+            job_slice = job.job_parameters()
+            stream_slice = StreamSlice(
+                partition=job_slice.partition,
+                cursor_slice=job_slice.cursor_slice,
+                extra_fields={**job_slice.extra_fields, "url": url},
+            )
             for message in self.download_retriever.read_records({}, stream_slice):
                 if isinstance(message, Record):
                     yield message.data
@@ -226,3 +236,22 @@ class AsyncHttpJobRepository(AsyncJobRepository):
             cursor_slice={},
         )
         return stream_slice
+
+    def _get_download_url(self, job: AsyncJob) -> Iterable[str]:
+        if not self.url_requester:
+            url_response = self._polling_job_response_by_id[job.api_job_id()]
+        else:
+            stream_slice: StreamSlice = StreamSlice(
+                partition={
+                    "polling_job_response": self._polling_job_response_by_id[job.api_job_id()]
+                },
+                cursor_slice={},
+            )
+            url_response = self.url_requester.send_request(stream_slice=stream_slice)  # type: ignore # we expect url_requester to always be presented, otherwise raise an exception as we cannot proceed with the report
+            if not url_response:
+                raise AirbyteTracedException(
+                    internal_message="Always expect a response or an exception from url_requester",
+                    failure_type=FailureType.system_error,
+                )
+
+        yield from self.urls_extractor.extract_records(url_response)  # type: ignore # we expect urls_extractor to always return list of strings
airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py

@@ -31,6 +31,17 @@ class DeliverRawFiles(BaseModel):
 
     delivery_type: Literal["use_file_transfer"] = Field("use_file_transfer", const=True)
 
+    preserve_directory_structure: bool = Field(
+        title="Preserve Sub-Directories in File Paths",
+        description=(
+            "If enabled, sends subdirectory folder structure "
+            "along with source file names to the destination. "
+            "Otherwise, files will be synced by their names only. "
+            "This option is ignored when file-based replication is not enabled."
+        ),
+        default=True,
+    )
+
 
 class AbstractFileBasedSpec(BaseModel):
     """
airbyte_cdk/sources/file_based/exceptions.py

@@ -111,6 +111,40 @@ class ErrorListingFiles(BaseFileBasedSourceError):
     pass
 
 
+class DuplicatedFilesError(BaseFileBasedSourceError):
+    def __init__(self, duplicated_files_names: List[dict[str, List[str]]], **kwargs: Any):
+        self._duplicated_files_names = duplicated_files_names
+        self._stream_name: str = kwargs["stream"]
+        super().__init__(self._format_duplicate_files_error_message(), **kwargs)
+
+    def _format_duplicate_files_error_message(self) -> str:
+        duplicated_files_messages = []
+        for duplicated_file in self._duplicated_files_names:
+            for duplicated_file_name, file_paths in duplicated_file.items():
+                file_duplicated_message = (
+                    f"{len(file_paths)} duplicates found for file name {duplicated_file_name}:\n\n"
+                    + "".join(f"\n - {file_paths}")
+                )
+                duplicated_files_messages.append(file_duplicated_message)
+
+        error_message = (
+            f"ERROR: Duplicate filenames found for stream {self._stream_name}. "
+            "Duplicate file names are not allowed if the Preserve Sub-Directories in File Paths option is disabled. "
+            "Please remove or rename the duplicate files before attempting to re-run the sync.\n\n"
+            + "\n".join(duplicated_files_messages)
+        )
+
+        return error_message
+
+    def __repr__(self) -> str:
+        """Return a string representation of the exception."""
+        class_name = self.__class__.__name__
+        properties_str = ", ".join(
+            f"{k}={v!r}" for k, v in self.__dict__.items() if not k.startswith("_")
+        )
+        return f"{class_name}({properties_str})"
+
+
 class CustomFileBasedException(AirbyteTracedException):
     """
     A specialized exception for file-based connectors.
airbyte_cdk/sources/file_based/file_based_source.py

@@ -242,7 +242,7 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
                     stream=self._make_default_stream(
                         stream_config=stream_config,
                         cursor=cursor,
-                        use_file_transfer=self._use_file_transfer(parsed_config),
+                        parsed_config=parsed_config,
                     ),
                     source=self,
                     logger=self.logger,
@@ -273,7 +273,7 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
                     stream=self._make_default_stream(
                         stream_config=stream_config,
                         cursor=cursor,
-                        use_file_transfer=self._use_file_transfer(parsed_config),
+                        parsed_config=parsed_config,
                     ),
                     source=self,
                     logger=self.logger,
@@ -285,7 +285,7 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
                 stream = self._make_default_stream(
                     stream_config=stream_config,
                     cursor=cursor,
-                    use_file_transfer=self._use_file_transfer(parsed_config),
+                    parsed_config=parsed_config,
                 )
 
                 streams.append(stream)
@@ -298,7 +298,7 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
         self,
         stream_config: FileBasedStreamConfig,
         cursor: Optional[AbstractFileBasedCursor],
-        use_file_transfer: bool = False,
+        parsed_config: AbstractFileBasedSpec,
     ) -> AbstractFileBasedStream:
         return DefaultFileBasedStream(
             config=stream_config,
@@ -310,7 +310,8 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
             validation_policy=self._validate_and_get_validation_policy(stream_config),
             errors_collector=self.errors_collector,
             cursor=cursor,
-            use_file_transfer=use_file_transfer,
+            use_file_transfer=self._use_file_transfer(parsed_config),
+            preserve_directory_structure=self._preserve_directory_structure(parsed_config),
         )
 
     def _get_stream_from_catalog(
@@ -385,3 +386,25 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
             and parsed_config.delivery_method.delivery_type == "use_file_transfer"
         )
         return use_file_transfer
+
+    @staticmethod
+    def _preserve_directory_structure(parsed_config: AbstractFileBasedSpec) -> bool:
+        """
+        Determines whether to preserve directory structure during file transfer.
+
+        When enabled, files maintain their subdirectory paths in the destination.
+        When disabled, files are flattened to the root of the destination.
+
+        Args:
+            parsed_config: The parsed configuration containing delivery method settings
+
+        Returns:
+            True if directory structure should be preserved (default), False otherwise
+        """
+        if (
+            FileBasedSource._use_file_transfer(parsed_config)
+            and hasattr(parsed_config.delivery_method, "preserve_directory_structure")
+            and parsed_config.delivery_method.preserve_directory_structure is not None
+        ):
+            return parsed_config.delivery_method.preserve_directory_structure
+        return True
airbyte_cdk/sources/file_based/file_based_stream_reader.py

@@ -135,6 +135,17 @@ class AbstractFileBasedStreamReader(ABC):
             return use_file_transfer
         return False
 
+    def preserve_directory_structure(self) -> bool:
+        # fall back to preserve subdirectories if config is not present or incomplete
+        if (
+            self.use_file_transfer()
+            and self.config
+            and hasattr(self.config.delivery_method, "preserve_directory_structure")
+            and self.config.delivery_method.preserve_directory_structure is not None
+        ):
+            return self.config.delivery_method.preserve_directory_structure
+        return True
+
     @abstractmethod
     def get_file(
         self, file: RemoteFile, local_directory: str, logger: logging.Logger
@@ -159,10 +170,13 @@ class AbstractFileBasedStreamReader(ABC):
         """
         ...
 
-    @staticmethod
-    def _get_file_transfer_paths(file: RemoteFile, local_directory: str) -> List[str]:
-        # Remove left slashes from source path format to make relative path for writing locally
-        file_relative_path = file.uri.lstrip("/")
+    def _get_file_transfer_paths(self, file: RemoteFile, local_directory: str) -> List[str]:
+        preserve_directory_structure = self.preserve_directory_structure()
+        if preserve_directory_structure:
+            # Remove left slashes from source path format to make relative path for writing locally
+            file_relative_path = file.uri.lstrip("/")
+        else:
+            file_relative_path = path.basename(file.uri)
         local_file_path = path.join(local_directory, file_relative_path)
 
         # Ensure the local directory exists
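The `_get_file_transfer_paths` change above boils down to how the relative path is computed. A small standalone sketch of the two behaviors (the function name here is illustrative, not the CDK method):

```python
# Illustrative only: relative-path handling with and without
# preserve_directory_structure, mirroring the hunk above.
from os import path


def file_relative_path(uri: str, preserve_directory_structure: bool) -> str:
    if preserve_directory_structure:
        # Keep sub-directories; just strip the leading slash.
        return uri.lstrip("/")
    # Flatten: keep only the base file name.
    return path.basename(uri)


print(file_relative_path("/reports/2024/jan.csv", True))   # reports/2024/jan.csv
print(file_relative_path("/reports/2024/jan.csv", False))  # jan.csv
```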
airbyte_cdk/sources/file_based/file_types/unstructured_parser.py

@@ -2,6 +2,7 @@
 # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
 #
 import logging
+import os
 import traceback
 from datetime import datetime
 from io import BytesIO, IOBase
@@ -42,12 +43,34 @@ unstructured_partition_pdf = None
 unstructured_partition_docx = None
 unstructured_partition_pptx = None
 
+AIRBYTE_NLTK_DATA_DIR = "/airbyte/nltk_data"
+TMP_NLTK_DATA_DIR = "/tmp/nltk_data"
+
+
+def get_nltk_temp_folder() -> str:
+    """
+    For non-root connectors /tmp is not currently writable, but we should allow it in the future.
+    It's safe to use /airbyte for now. Fallback to /tmp for local development.
+    """
+    try:
+        nltk_data_dir = AIRBYTE_NLTK_DATA_DIR
+        os.makedirs(nltk_data_dir, exist_ok=True)
+    except OSError:
+        nltk_data_dir = TMP_NLTK_DATA_DIR
+        os.makedirs(nltk_data_dir, exist_ok=True)
+    return nltk_data_dir
+
+
 try:
+    nltk_data_dir = get_nltk_temp_folder()
+    nltk.data.path.append(nltk_data_dir)
     nltk.data.find("tokenizers/punkt.zip")
     nltk.data.find("tokenizers/punkt_tab.zip")
+    nltk.data.find("tokenizers/averaged_perceptron_tagger_eng.zip")
 except LookupError:
-    nltk.download("punkt")
-    nltk.download("punkt_tab")
+    nltk.download("punkt", download_dir=nltk_data_dir, quiet=True)
+    nltk.download("punkt_tab", download_dir=nltk_data_dir, quiet=True)
+    nltk.download("averaged_perceptron_tagger_eng", download_dir=nltk_data_dir, quiet=True)
 
 
 def optional_decode(contents: Union[str, bytes]) -> str:
airbyte_cdk/sources/file_based/stream/default_file_based_stream.py

@@ -5,14 +5,17 @@
 import asyncio
 import itertools
 import traceback
+from collections import defaultdict
 from copy import deepcopy
 from functools import cache
-from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Optional, Set, Union
+from os import path
+from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Optional, Set, Tuple, Union
 
 from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, FailureType, Level
 from airbyte_cdk.models import Type as MessageType
 from airbyte_cdk.sources.file_based.config.file_based_stream_config import PrimaryKeyType
 from airbyte_cdk.sources.file_based.exceptions import (
+    DuplicatedFilesError,
     FileBasedSourceError,
     InvalidSchemaError,
     MissingSchemaError,
@@ -43,6 +46,8 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
     """
 
     FILE_TRANSFER_KW = "use_file_transfer"
+    PRESERVE_DIRECTORY_STRUCTURE_KW = "preserve_directory_structure"
+    FILES_KEY = "files"
     DATE_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"
     ab_last_mod_col = "_ab_source_file_last_modified"
     ab_file_name_col = "_ab_source_file_url"
@@ -50,10 +55,15 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
     source_file_url = "source_file_url"
     airbyte_columns = [ab_last_mod_col, ab_file_name_col]
     use_file_transfer = False
+    preserve_directory_structure = True
 
     def __init__(self, **kwargs: Any):
         if self.FILE_TRANSFER_KW in kwargs:
             self.use_file_transfer = kwargs.pop(self.FILE_TRANSFER_KW, False)
+        if self.PRESERVE_DIRECTORY_STRUCTURE_KW in kwargs:
+            self.preserve_directory_structure = kwargs.pop(
+                self.PRESERVE_DIRECTORY_STRUCTURE_KW, True
+            )
         super().__init__(**kwargs)
 
     @property
@@ -98,15 +108,33 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
         else:
             return super()._filter_schema_invalid_properties(configured_catalog_json_schema)
 
+    def _duplicated_files_names(
+        self, slices: List[dict[str, List[RemoteFile]]]
+    ) -> List[dict[str, List[str]]]:
+        seen_file_names: Dict[str, List[str]] = defaultdict(list)
+        for file_slice in slices:
+            for file_found in file_slice[self.FILES_KEY]:
+                file_name = path.basename(file_found.uri)
+                seen_file_names[file_name].append(file_found.uri)
+        return [
+            {file_name: paths} for file_name, paths in seen_file_names.items() if len(paths) > 1
+        ]
+
     def compute_slices(self) -> Iterable[Optional[Mapping[str, Any]]]:
         # Sort files by last_modified, uri and return them grouped by last_modified
         all_files = self.list_files()
         files_to_read = self._cursor.get_files_to_sync(all_files, self.logger)
         sorted_files_to_read = sorted(files_to_read, key=lambda f: (f.last_modified, f.uri))
         slices = [
-            {"files": list(group[1])}
+            {self.FILES_KEY: list(group[1])}
             for group in itertools.groupby(sorted_files_to_read, lambda f: f.last_modified)
         ]
+        if slices and not self.preserve_directory_structure:
+            duplicated_files_names = self._duplicated_files_names(slices)
+            if duplicated_files_names:
+                raise DuplicatedFilesError(
+                    stream=self.name, duplicated_files_names=duplicated_files_names
+                )
         return slices
 
     def transform_record(
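To make the duplicate check concrete: a standalone sketch of what `_duplicated_files_names` computes over the slices built by `compute_slices`, with plain URI strings standing in for `RemoteFile` objects:

```python
# Illustrative only: duplicate base-name detection over {"files": [...]} slices,
# with strings standing in for RemoteFile objects (the real code uses file.uri).
from collections import defaultdict
from os import path
from typing import Dict, List


def duplicated_files_names(slices: List[dict]) -> List[Dict[str, List[str]]]:
    seen: Dict[str, List[str]] = defaultdict(list)
    for file_slice in slices:
        for uri in file_slice["files"]:
            seen[path.basename(uri)].append(uri)
    return [{name: uris} for name, uris in seen.items() if len(uris) > 1]


slices = [{"files": ["/a/data.csv", "/b/data.csv", "/c/other.csv"]}]
print(duplicated_files_names(slices))
# [{'data.csv': ['/a/data.csv', '/b/data.csv']}]
```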