airbyte-cdk 6.18.1__py3-none-any.whl → 6.18.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -31,6 +31,17 @@ class DeliverRawFiles(BaseModel):
31
31
 
32
32
  delivery_type: Literal["use_file_transfer"] = Field("use_file_transfer", const=True)
33
33
 
34
+ preserve_directory_structure: bool = Field(
35
+ title="Preserve Sub-Directories in File Paths",
36
+ description=(
37
+ "If enabled, sends subdirectory folder structure "
38
+ "along with source file names to the destination. "
39
+ "Otherwise, files will be synced by their names only. "
40
+ "This option is ignored when file-based replication is not enabled."
41
+ ),
42
+ default=True,
43
+ )
44
+
34
45
 
35
46
  class AbstractFileBasedSpec(BaseModel):
36
47
  """
@@ -111,6 +111,40 @@ class ErrorListingFiles(BaseFileBasedSourceError):
111
111
  pass
112
112
 
113
113
 
114
+ class DuplicatedFilesError(BaseFileBasedSourceError):
115
+ def __init__(self, duplicated_files_names: List[dict[str, List[str]]], **kwargs: Any):
116
+ self._duplicated_files_names = duplicated_files_names
117
+ self._stream_name: str = kwargs["stream"]
118
+ super().__init__(self._format_duplicate_files_error_message(), **kwargs)
119
+
120
+ def _format_duplicate_files_error_message(self) -> str:
121
+ duplicated_files_messages = []
122
+ for duplicated_file in self._duplicated_files_names:
123
+ for duplicated_file_name, file_paths in duplicated_file.items():
124
+ file_duplicated_message = (
125
+ f"{len(file_paths)} duplicates found for file name {duplicated_file_name}:\n\n"
126
+ + "".join(f"\n - {file_paths}")
127
+ )
128
+ duplicated_files_messages.append(file_duplicated_message)
129
+
130
+ error_message = (
131
+ f"ERROR: Duplicate filenames found for stream {self._stream_name}. "
132
+ "Duplicate file names are not allowed if the Preserve Sub-Directories in File Paths option is disabled. "
133
+ "Please remove or rename the duplicate files before attempting to re-run the sync.\n\n"
134
+ + "\n".join(duplicated_files_messages)
135
+ )
136
+
137
+ return error_message
138
+
139
+ def __repr__(self) -> str:
140
+ """Return a string representation of the exception."""
141
+ class_name = self.__class__.__name__
142
+ properties_str = ", ".join(
143
+ f"{k}={v!r}" for k, v in self.__dict__.items() if not k.startswith("_")
144
+ )
145
+ return f"{class_name}({properties_str})"
146
+
147
+
114
148
  class CustomFileBasedException(AirbyteTracedException):
115
149
  """
116
150
  A specialized exception for file-based connectors.
@@ -242,7 +242,7 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
242
242
  stream=self._make_default_stream(
243
243
  stream_config=stream_config,
244
244
  cursor=cursor,
245
- use_file_transfer=self._use_file_transfer(parsed_config),
245
+ parsed_config=parsed_config,
246
246
  ),
247
247
  source=self,
248
248
  logger=self.logger,
@@ -273,7 +273,7 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
273
273
  stream=self._make_default_stream(
274
274
  stream_config=stream_config,
275
275
  cursor=cursor,
276
- use_file_transfer=self._use_file_transfer(parsed_config),
276
+ parsed_config=parsed_config,
277
277
  ),
278
278
  source=self,
279
279
  logger=self.logger,
@@ -285,7 +285,7 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
285
285
  stream = self._make_default_stream(
286
286
  stream_config=stream_config,
287
287
  cursor=cursor,
288
- use_file_transfer=self._use_file_transfer(parsed_config),
288
+ parsed_config=parsed_config,
289
289
  )
290
290
 
291
291
  streams.append(stream)
@@ -298,7 +298,7 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
298
298
  self,
299
299
  stream_config: FileBasedStreamConfig,
300
300
  cursor: Optional[AbstractFileBasedCursor],
301
- use_file_transfer: bool = False,
301
+ parsed_config: AbstractFileBasedSpec,
302
302
  ) -> AbstractFileBasedStream:
303
303
  return DefaultFileBasedStream(
304
304
  config=stream_config,
@@ -310,7 +310,8 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
310
310
  validation_policy=self._validate_and_get_validation_policy(stream_config),
311
311
  errors_collector=self.errors_collector,
312
312
  cursor=cursor,
313
- use_file_transfer=use_file_transfer,
313
+ use_file_transfer=self._use_file_transfer(parsed_config),
314
+ preserve_directory_structure=self._preserve_directory_structure(parsed_config),
314
315
  )
315
316
 
316
317
  def _get_stream_from_catalog(
@@ -385,3 +386,25 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
385
386
  and parsed_config.delivery_method.delivery_type == "use_file_transfer"
386
387
  )
387
388
  return use_file_transfer
389
+
390
+ @staticmethod
391
+ def _preserve_directory_structure(parsed_config: AbstractFileBasedSpec) -> bool:
392
+ """
393
+ Determines whether to preserve directory structure during file transfer.
394
+
395
+ When enabled, files maintain their subdirectory paths in the destination.
396
+ When disabled, files are flattened to the root of the destination.
397
+
398
+ Args:
399
+ parsed_config: The parsed configuration containing delivery method settings
400
+
401
+ Returns:
402
+ True if directory structure should be preserved (default), False otherwise
403
+ """
404
+ if (
405
+ FileBasedSource._use_file_transfer(parsed_config)
406
+ and hasattr(parsed_config.delivery_method, "preserve_directory_structure")
407
+ and parsed_config.delivery_method.preserve_directory_structure is not None
408
+ ):
409
+ return parsed_config.delivery_method.preserve_directory_structure
410
+ return True
@@ -135,6 +135,17 @@ class AbstractFileBasedStreamReader(ABC):
135
135
  return use_file_transfer
136
136
  return False
137
137
 
138
+ def preserve_directory_structure(self) -> bool:
139
+ # fall back to preserve subdirectories if config is not present or incomplete
140
+ if (
141
+ self.use_file_transfer()
142
+ and self.config
143
+ and hasattr(self.config.delivery_method, "preserve_directory_structure")
144
+ and self.config.delivery_method.preserve_directory_structure is not None
145
+ ):
146
+ return self.config.delivery_method.preserve_directory_structure
147
+ return True
148
+
138
149
  @abstractmethod
139
150
  def get_file(
140
151
  self, file: RemoteFile, local_directory: str, logger: logging.Logger
@@ -159,10 +170,13 @@ class AbstractFileBasedStreamReader(ABC):
159
170
  """
160
171
  ...
161
172
 
162
- @staticmethod
163
- def _get_file_transfer_paths(file: RemoteFile, local_directory: str) -> List[str]:
164
- # Remove left slashes from source path format to make relative path for writing locally
165
- file_relative_path = file.uri.lstrip("/")
173
+ def _get_file_transfer_paths(self, file: RemoteFile, local_directory: str) -> List[str]:
174
+ preserve_directory_structure = self.preserve_directory_structure()
175
+ if preserve_directory_structure:
176
+ # Remove left slashes from source path format to make relative path for writing locally
177
+ file_relative_path = file.uri.lstrip("/")
178
+ else:
179
+ file_relative_path = path.basename(file.uri)
166
180
  local_file_path = path.join(local_directory, file_relative_path)
167
181
 
168
182
  # Ensure the local directory exists
@@ -2,6 +2,7 @@
2
2
  # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
3
  #
4
4
  import logging
5
+ import os
5
6
  import traceback
6
7
  from datetime import datetime
7
8
  from io import BytesIO, IOBase
@@ -42,12 +43,34 @@ unstructured_partition_pdf = None
42
43
  unstructured_partition_docx = None
43
44
  unstructured_partition_pptx = None
44
45
 
46
+ AIRBYTE_NLTK_DATA_DIR = "/airbyte/nltk_data"
47
+ TMP_NLTK_DATA_DIR = "/tmp/nltk_data"
48
+
49
+
50
+ def get_nltk_temp_folder() -> str:
51
+ """
52
+ For non-root connectors /tmp is not currently writable, but we should allow it in the future.
53
+ It's safe to use /airbyte for now. Fallback to /tmp for local development.
54
+ """
55
+ try:
56
+ nltk_data_dir = AIRBYTE_NLTK_DATA_DIR
57
+ os.makedirs(nltk_data_dir, exist_ok=True)
58
+ except OSError:
59
+ nltk_data_dir = TMP_NLTK_DATA_DIR
60
+ os.makedirs(nltk_data_dir, exist_ok=True)
61
+ return nltk_data_dir
62
+
63
+
45
64
  try:
65
+ nltk_data_dir = get_nltk_temp_folder()
66
+ nltk.data.path.append(nltk_data_dir)
46
67
  nltk.data.find("tokenizers/punkt.zip")
47
68
  nltk.data.find("tokenizers/punkt_tab.zip")
69
+ nltk.data.find("tokenizers/averaged_perceptron_tagger_eng.zip")
48
70
  except LookupError:
49
- nltk.download("punkt")
50
- nltk.download("punkt_tab")
71
+ nltk.download("punkt", download_dir=nltk_data_dir, quiet=True)
72
+ nltk.download("punkt_tab", download_dir=nltk_data_dir, quiet=True)
73
+ nltk.download("averaged_perceptron_tagger_eng", download_dir=nltk_data_dir, quiet=True)
51
74
 
52
75
 
53
76
  def optional_decode(contents: Union[str, bytes]) -> str:
@@ -5,14 +5,17 @@
5
5
  import asyncio
6
6
  import itertools
7
7
  import traceback
8
+ from collections import defaultdict
8
9
  from copy import deepcopy
9
10
  from functools import cache
10
- from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Optional, Set, Union
11
+ from os import path
12
+ from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Optional, Set, Tuple, Union
11
13
 
12
14
  from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, FailureType, Level
13
15
  from airbyte_cdk.models import Type as MessageType
14
16
  from airbyte_cdk.sources.file_based.config.file_based_stream_config import PrimaryKeyType
15
17
  from airbyte_cdk.sources.file_based.exceptions import (
18
+ DuplicatedFilesError,
16
19
  FileBasedSourceError,
17
20
  InvalidSchemaError,
18
21
  MissingSchemaError,
@@ -43,6 +46,8 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
43
46
  """
44
47
 
45
48
  FILE_TRANSFER_KW = "use_file_transfer"
49
+ PRESERVE_DIRECTORY_STRUCTURE_KW = "preserve_directory_structure"
50
+ FILES_KEY = "files"
46
51
  DATE_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"
47
52
  ab_last_mod_col = "_ab_source_file_last_modified"
48
53
  ab_file_name_col = "_ab_source_file_url"
@@ -50,10 +55,15 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
50
55
  source_file_url = "source_file_url"
51
56
  airbyte_columns = [ab_last_mod_col, ab_file_name_col]
52
57
  use_file_transfer = False
58
+ preserve_directory_structure = True
53
59
 
54
60
  def __init__(self, **kwargs: Any):
55
61
  if self.FILE_TRANSFER_KW in kwargs:
56
62
  self.use_file_transfer = kwargs.pop(self.FILE_TRANSFER_KW, False)
63
+ if self.PRESERVE_DIRECTORY_STRUCTURE_KW in kwargs:
64
+ self.preserve_directory_structure = kwargs.pop(
65
+ self.PRESERVE_DIRECTORY_STRUCTURE_KW, True
66
+ )
57
67
  super().__init__(**kwargs)
58
68
 
59
69
  @property
@@ -98,15 +108,33 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
98
108
  else:
99
109
  return super()._filter_schema_invalid_properties(configured_catalog_json_schema)
100
110
 
111
+ def _duplicated_files_names(
112
+ self, slices: List[dict[str, List[RemoteFile]]]
113
+ ) -> List[dict[str, List[str]]]:
114
+ seen_file_names: Dict[str, List[str]] = defaultdict(list)
115
+ for file_slice in slices:
116
+ for file_found in file_slice[self.FILES_KEY]:
117
+ file_name = path.basename(file_found.uri)
118
+ seen_file_names[file_name].append(file_found.uri)
119
+ return [
120
+ {file_name: paths} for file_name, paths in seen_file_names.items() if len(paths) > 1
121
+ ]
122
+
101
123
  def compute_slices(self) -> Iterable[Optional[Mapping[str, Any]]]:
102
124
  # Sort files by last_modified, uri and return them grouped by last_modified
103
125
  all_files = self.list_files()
104
126
  files_to_read = self._cursor.get_files_to_sync(all_files, self.logger)
105
127
  sorted_files_to_read = sorted(files_to_read, key=lambda f: (f.last_modified, f.uri))
106
128
  slices = [
107
- {"files": list(group[1])}
129
+ {self.FILES_KEY: list(group[1])}
108
130
  for group in itertools.groupby(sorted_files_to_read, lambda f: f.last_modified)
109
131
  ]
132
+ if slices and not self.preserve_directory_structure:
133
+ duplicated_files_names = self._duplicated_files_names(slices)
134
+ if duplicated_files_names:
135
+ raise DuplicatedFilesError(
136
+ stream=self.name, duplicated_files_names=duplicated_files_names
137
+ )
110
138
  return slices
111
139
 
112
140
  def transform_record(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: airbyte-cdk
3
- Version: 6.18.1
3
+ Version: 6.18.2
4
4
  Summary: A framework for writing Airbyte Connectors.
5
5
  License: MIT
6
6
  Keywords: airbyte,connector-development-kit,cdk
@@ -196,7 +196,7 @@ airbyte_cdk/sources/file_based/availability_strategy/__init__.py,sha256=ddKQfUmk
196
196
  airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py,sha256=01Nd4b7ERAbp-OZo_8rrAzFXWPTMwr02SnWiN17nx8Q,2363
197
197
  airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py,sha256=j9T5TimfWFUz7nqsaj-83G3xWmDpsmeSbDnaUNmz0UM,5849
198
198
  airbyte_cdk/sources/file_based/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
199
- airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py,sha256=tj-M1L5BTa5yIQ3jHo09CtCTSq_eR-68zgyOPqwsurw,6455
199
+ airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py,sha256=gXlZwnEKLWknnK_n7j14lANgR6vkqhlLJ-G3rRu-ox4,6897
200
200
  airbyte_cdk/sources/file_based/config/avro_format.py,sha256=NxTF96ewzn6HuhgodsY7Rpb-ybr1ZEWW5d4Vid64g5A,716
201
201
  airbyte_cdk/sources/file_based/config/csv_format.py,sha256=NWekkyT8dTwiVK0mwa_krQD4FJPHSDfILo8kPAg3-Vs,8006
202
202
  airbyte_cdk/sources/file_based/config/excel_format.py,sha256=9qAmTsT6SoVzNfNv0oBVkVCmiyqQuVAbfRKajjoa7Js,378
@@ -207,9 +207,9 @@ airbyte_cdk/sources/file_based/config/unstructured_format.py,sha256=tIbB9Pn1HqU6
207
207
  airbyte_cdk/sources/file_based/discovery_policy/__init__.py,sha256=gl3ey6mZbyfraB9P3pFhf9UJp2JeTZ1SUFAopy2iBvY,301
208
208
  airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py,sha256=dCfXX529Rd5rtopg4VeEgTPJjFtqjtjzPq6LCw18Wt0,605
209
209
  airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py,sha256=-xujTidtrq6HC00WKbjQh1CZdT5LMuzkp5BLjqDmfTY,1007
210
- airbyte_cdk/sources/file_based/exceptions.py,sha256=AEELNIRzKPX6eopKd_2jhE7WiNeR0Aw7nQWVOL8fvkc,5760
211
- airbyte_cdk/sources/file_based/file_based_source.py,sha256=RfpctRNLJ_EHKKEc2E1EZGYRfhG0Z9o6TgsKS4XrSNY,16652
212
- airbyte_cdk/sources/file_based/file_based_stream_reader.py,sha256=ohxKlqPuV7TGwjyRy_gaWUol8QN5lBSoCYoaqBtRh1c,6179
210
+ airbyte_cdk/sources/file_based/exceptions.py,sha256=WP0qkG6fpWoBpOyyicgp5YNE393VWyegq5qSy0v4QtM,7362
211
+ airbyte_cdk/sources/file_based/file_based_source.py,sha256=Biv2QufYQtHZQCBZs4iCUpqTd82rk7xo8SDYkEeau3k,17616
212
+ airbyte_cdk/sources/file_based/file_based_stream_reader.py,sha256=e1KhgTh7mzvkBOz9DjLwzOsDwevrTmbxSYIcvhgWgGM,6856
213
213
  airbyte_cdk/sources/file_based/file_types/__init__.py,sha256=blCLn0-2LC-ZdgcNyDEhqM2RiUvEjEBh-G4-t32ZtuM,1268
214
214
  airbyte_cdk/sources/file_based/file_types/avro_parser.py,sha256=XNx-JC-sgzH9u3nOJ2M59FxBXvtig8LN6BIkeDOavZA,10858
215
215
  airbyte_cdk/sources/file_based/file_types/csv_parser.py,sha256=QlCXB-ry3np67Q_VerQEPoWDOTcPTB6Go4ydZxY9ae4,20445
@@ -218,7 +218,7 @@ airbyte_cdk/sources/file_based/file_types/file_transfer.py,sha256=HyGRihJxcb_lEs
218
218
  airbyte_cdk/sources/file_based/file_types/file_type_parser.py,sha256=JgpH21PrbRqwK92BJklZWvh2TndA6xZ-eP1LPMo44oQ,2832
219
219
  airbyte_cdk/sources/file_based/file_types/jsonl_parser.py,sha256=GwyNyxmST4RX-XpXy7xVH0D-znYWWBmGv_pVAu95oHQ,5886
220
220
  airbyte_cdk/sources/file_based/file_types/parquet_parser.py,sha256=XenFg5sJ-UBnIkSmsiNJRou11NO0zZXx-RXgPHMT2NA,10487
221
- airbyte_cdk/sources/file_based/file_types/unstructured_parser.py,sha256=r5FNcJadiI5PTyl1-doIodPCwW7xZWOTHl4Epd-w0-8,18602
221
+ airbyte_cdk/sources/file_based/file_types/unstructured_parser.py,sha256=2TYOQl62FQPCa8otLbkDIk_j01EP3oWaKSfXGhCjCHg,19492
222
222
  airbyte_cdk/sources/file_based/remote_file.py,sha256=yqRz93vPe8PBXLIMJ5W5u2JRlZRhg6sBrAjn3pPjJ8A,315
223
223
  airbyte_cdk/sources/file_based/schema_helpers.py,sha256=Cf8FH1bDFP0qCDDfEYir_WjP4exXUnikz8hZ40y1Ek0,9601
224
224
  airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py,sha256=FkByIyEy56x2_awYnxGPqGaOp7zAzpAoRkPZHKySI9M,536
@@ -235,7 +235,7 @@ airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_c
235
235
  airbyte_cdk/sources/file_based/stream/cursor/__init__.py,sha256=MhFB5hOo8sjwvCh8gangaymdg3EJWYt_72brFOZt068,191
236
236
  airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py,sha256=om-x3gZFPgWDpi15S9RxZmR36VHnk8sytgN6LlBQhAw,1934
237
237
  airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py,sha256=VGV7xLyBribuBMVrXtO1xqkWJD86bl7yhXtjnwLMohM,7051
238
- airbyte_cdk/sources/file_based/stream/default_file_based_stream.py,sha256=rpwU6AOyhFLuXtcFKkcOHFWbRQ4kLCOKzAjcID_M87k,16770
238
+ airbyte_cdk/sources/file_based/stream/default_file_based_stream.py,sha256=XLU5cNqQ-5mj243gNzMyXtm_oCtg1ORyoqbCsUo9Dn4,18044
239
239
  airbyte_cdk/sources/file_based/types.py,sha256=INxG7OPnkdUP69oYNKMAbwhvV1AGvLRHs1J6pIia2FI,218
240
240
  airbyte_cdk/sources/http_config.py,sha256=OBZeuyFilm6NlDlBhFQvHhTWabEvZww6OHDIlZujIS0,730
241
241
  airbyte_cdk/sources/http_logger.py,sha256=TyBmtRA6D9g0XDkKGvdM415b36RXDjgfkwRewDsH8-0,1576
@@ -343,8 +343,8 @@ airbyte_cdk/utils/slice_hasher.py,sha256=-pHexlNYoWYPnXNH-M7HEbjmeJe9Zk7SJijdQ7d
343
343
  airbyte_cdk/utils/spec_schema_transformations.py,sha256=-5HTuNsnDBAhj-oLeQXwpTGA0HdcjFOf2zTEMUTTg_Y,816
344
344
  airbyte_cdk/utils/stream_status_utils.py,sha256=ZmBoiy5HVbUEHAMrUONxZvxnvfV9CesmQJLDTAIWnWw,1171
345
345
  airbyte_cdk/utils/traced_exception.py,sha256=C8uIBuCL_E4WnBAOPSxBicD06JAldoN9fGsQDp463OY,6292
346
- airbyte_cdk-6.18.1.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
347
- airbyte_cdk-6.18.1.dist-info/METADATA,sha256=OMpca59Gc1MJOlwEgvDJX0uwp7skSel83qkbtcan6hE,6000
348
- airbyte_cdk-6.18.1.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
349
- airbyte_cdk-6.18.1.dist-info/entry_points.txt,sha256=fj-e3PAQvsxsQzyyq8UkG1k8spunWnD4BAH2AwlR6NM,95
350
- airbyte_cdk-6.18.1.dist-info/RECORD,,
346
+ airbyte_cdk-6.18.2.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
347
+ airbyte_cdk-6.18.2.dist-info/METADATA,sha256=Uwnd37XQTCsP3n7GzWDM5mQJsphngbGzpyvqnRyMt7I,6000
348
+ airbyte_cdk-6.18.2.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
349
+ airbyte_cdk-6.18.2.dist-info/entry_points.txt,sha256=fj-e3PAQvsxsQzyyq8UkG1k8spunWnD4BAH2AwlR6NM,95
350
+ airbyte_cdk-6.18.2.dist-info/RECORD,,