airbyte-cdk 6.13.1.dev4109__py3-none-any.whl → 6.13.1.dev41012__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. airbyte_cdk/entrypoint.py +1 -13
  2. airbyte_cdk/sources/declarative/auth/oauth.py +0 -26
  3. airbyte_cdk/sources/declarative/concurrent_declarative_source.py +51 -24
  4. airbyte_cdk/sources/declarative/declarative_component_schema.yaml +20 -128
  5. airbyte_cdk/sources/declarative/extractors/__init__.py +0 -2
  6. airbyte_cdk/sources/declarative/extractors/record_selector.py +7 -5
  7. airbyte_cdk/sources/declarative/interpolation/macros.py +0 -21
  8. airbyte_cdk/sources/declarative/models/declarative_component_schema.py +11 -97
  9. airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +14 -71
  10. airbyte_cdk/sources/declarative/requesters/http_job_repository.py +4 -33
  11. airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +35 -52
  12. airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +7 -10
  13. airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +4 -9
  14. airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +6 -11
  15. airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +11 -13
  16. airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +13 -14
  17. airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +7 -6
  18. airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +7 -10
  19. airbyte_cdk/sources/declarative/retrievers/async_retriever.py +4 -1
  20. airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +64 -71
  21. airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +4 -4
  22. airbyte_cdk/sources/declarative/transformations/flatten_fields.py +1 -3
  23. airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +3 -8
  24. airbyte_cdk/sources/file_based/exceptions.py +23 -31
  25. airbyte_cdk/sources/file_based/file_based_source.py +8 -17
  26. airbyte_cdk/sources/file_based/file_based_stream_reader.py +6 -7
  27. airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +2 -25
  28. airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +20 -10
  29. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +4 -20
  30. airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +4 -34
  31. airbyte_cdk/sources/types.py +0 -3
  32. {airbyte_cdk-6.13.1.dev4109.dist-info → airbyte_cdk-6.13.1.dev41012.dist-info}/METADATA +2 -2
  33. {airbyte_cdk-6.13.1.dev4109.dist-info → airbyte_cdk-6.13.1.dev41012.dist-info}/RECORD +36 -39
  34. {airbyte_cdk-6.13.1.dev4109.dist-info → airbyte_cdk-6.13.1.dev41012.dist-info}/WHEEL +1 -1
  35. airbyte_cdk/sources/declarative/extractors/type_transformer.py +0 -55
  36. airbyte_cdk/sources/declarative/requesters/README.md +0 -57
  37. airbyte_cdk/sources/declarative/transformations/keys_replace_transformation.py +0 -61
  38. {airbyte_cdk-6.13.1.dev4109.dist-info → airbyte_cdk-6.13.1.dev41012.dist-info}/LICENSE.txt +0 -0
  39. {airbyte_cdk-6.13.1.dev4109.dist-info → airbyte_cdk-6.13.1.dev41012.dist-info}/entry_points.txt +0 -0

airbyte_cdk/sources/declarative/retrievers/simple_retriever.py

@@ -6,7 +6,18 @@ import json
 from dataclasses import InitVar, dataclass, field
 from functools import partial
 from itertools import islice
-from typing import Any, Callable, Iterable, List, Mapping, Optional, Set, Tuple, Union
+from typing import (
+    Any,
+    Callable,
+    Iterable,
+    List,
+    Mapping,
+    MutableMapping,
+    Optional,
+    Set,
+    Tuple,
+    Union,
+)

 import requests

@@ -79,6 +90,9 @@ class SimpleRetriever(Retriever):

     def __post_init__(self, parameters: Mapping[str, Any]) -> None:
         self._paginator = self.paginator or NoPagination(parameters=parameters)
+        self._last_response: Optional[requests.Response] = None
+        self._last_page_size: int = 0
+        self._last_record: Optional[Record] = None
         self._parameters = parameters
         self._name = (
             InterpolatedString(self._name, parameters=parameters)
@@ -86,6 +100,10 @@ class SimpleRetriever(Retriever):
             else self._name
         )

+        # This mapping is used during a resumable full refresh syncs to indicate whether a partition has started syncing
+        # records. Partitions serve as the key and map to True if they already began processing records
+        self._partition_started: MutableMapping[Any, bool] = dict()
+
     @property  # type: ignore
     def name(self) -> str:
         """
@@ -233,13 +251,17 @@
             raise ValueError("Request body json cannot be a string")
         return body_json

-    def _paginator_path(self, next_page_token: Optional[Mapping[str, Any]] = None) -> Optional[str]:
+    def _paginator_path(
+        self,
+    ) -> Optional[str]:
         """
         If the paginator points to a path, follow it, else return nothing so the requester is used.
+        :param stream_state:
+        :param stream_slice:
         :param next_page_token:
         :return:
         """
-        return self._paginator.path(next_page_token=next_page_token)
+        return self._paginator.path()

     def _parse_response(
         self,
@@ -250,15 +272,22 @@
         next_page_token: Optional[Mapping[str, Any]] = None,
     ) -> Iterable[Record]:
         if not response:
+            self._last_response = None
             yield from []
         else:
-            yield from self.record_selector.select_records(
+            self._last_response = response
+            record_generator = self.record_selector.select_records(
                 response=response,
                 stream_state=stream_state,
                 records_schema=records_schema,
                 stream_slice=stream_slice,
                 next_page_token=next_page_token,
             )
+            self._last_page_size = 0
+            for record in record_generator:
+                self._last_page_size += 1
+                self._last_record = record
+                yield record

     @property  # type: ignore
     def primary_key(self) -> Optional[Union[str, List[str], List[List[str]]]]:
@@ -270,13 +299,7 @@
         if not isinstance(value, property):
             self._primary_key = value

-    def _next_page_token(
-        self,
-        response: requests.Response,
-        last_page_size: int,
-        last_record: Optional[Record],
-        last_page_token_value: Optional[Any],
-    ) -> Optional[Mapping[str, Any]]:
+    def _next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]:
         """
         Specifies a pagination strategy.

@@ -284,12 +307,7 @@

         :return: The token for the next page from the input response object. Returning None means there are no more pages to read in this response.
         """
-        return self._paginator.next_page_token(
-            response=response,
-            last_page_size=last_page_size,
-            last_record=last_record,
-            last_page_token_value=last_page_token_value,
-        )
+        return self._paginator.next_page_token(response, self._last_page_size, self._last_record)

     def _fetch_next_page(
         self,
@@ -298,7 +316,7 @@
         next_page_token: Optional[Mapping[str, Any]] = None,
     ) -> Optional[requests.Response]:
         return self.requester.send_request(
-            path=self._paginator_path(next_page_token=next_page_token),
+            path=self._paginator_path(),
             stream_state=stream_state,
             stream_slice=stream_slice,
             next_page_token=next_page_token,
@@ -327,37 +345,20 @@
     # This logic is similar to _read_pages in the HttpStream class. When making changes here, consider making changes there as well.
     def _read_pages(
         self,
-        records_generator_fn: Callable[[Optional[requests.Response]], Iterable[Record]],
+        records_generator_fn: Callable[[Optional[requests.Response]], Iterable[StreamData]],
         stream_state: Mapping[str, Any],
         stream_slice: StreamSlice,
-    ) -> Iterable[Record]:
+    ) -> Iterable[StreamData]:
         pagination_complete = False
-        initial_token = self._paginator.get_initial_token()
-        next_page_token: Optional[Mapping[str, Any]] = (
-            {"next_page_token": initial_token} if initial_token else None
-        )
+        next_page_token = None
         while not pagination_complete:
             response = self._fetch_next_page(stream_state, stream_slice, next_page_token)
-
-            last_page_size = 0
-            last_record: Optional[Record] = None
-            for record in records_generator_fn(response):
-                last_page_size += 1
-                last_record = record
-                yield record
+            yield from records_generator_fn(response)

             if not response:
                 pagination_complete = True
             else:
-                last_page_token_value = (
-                    next_page_token.get("next_page_token") if next_page_token else None
-                )
-                next_page_token = self._next_page_token(
-                    response=response,
-                    last_page_size=last_page_size,
-                    last_record=last_record,
-                    last_page_token_value=last_page_token_value,
-                )
+                next_page_token = self._next_page_token(response)
                 if not next_page_token:
                     pagination_complete = True

@@ -366,38 +367,19 @@

     def _read_single_page(
         self,
-        records_generator_fn: Callable[[Optional[requests.Response]], Iterable[Record]],
+        records_generator_fn: Callable[[Optional[requests.Response]], Iterable[StreamData]],
         stream_state: Mapping[str, Any],
         stream_slice: StreamSlice,
     ) -> Iterable[StreamData]:
-        initial_token = stream_state.get("next_page_token")
-        if initial_token is None:
-            initial_token = self._paginator.get_initial_token()
-        next_page_token: Optional[Mapping[str, Any]] = (
-            {"next_page_token": initial_token} if initial_token else None
-        )
-
-        response = self._fetch_next_page(stream_state, stream_slice, next_page_token)
-
-        last_page_size = 0
-        last_record: Optional[Record] = None
-        for record in records_generator_fn(response):
-            last_page_size += 1
-            last_record = record
-            yield record
+        response = self._fetch_next_page(stream_state, stream_slice)
+        yield from records_generator_fn(response)

         if not response:
-            next_page_token = {FULL_REFRESH_SYNC_COMPLETE_KEY: True}
+            next_page_token: Mapping[str, Any] = {FULL_REFRESH_SYNC_COMPLETE_KEY: True}
         else:
-            last_page_token_value = (
-                next_page_token.get("next_page_token") if next_page_token else None
-            )
-            next_page_token = self._next_page_token(
-                response=response,
-                last_page_size=last_page_size,
-                last_record=last_record,
-                last_page_token_value=last_page_token_value,
-            ) or {FULL_REFRESH_SYNC_COMPLETE_KEY: True}
+            next_page_token = self._next_page_token(response) or {
+                FULL_REFRESH_SYNC_COMPLETE_KEY: True
+            }

         if self.cursor:
             self.cursor.close_slice(
@@ -432,14 +414,25 @@
         if self.cursor and isinstance(self.cursor, ResumableFullRefreshCursor):
             stream_state = self.state

-            # Before syncing the RFR stream, we check if the job's prior attempt was successful and don't need to
-            # fetch more records. The platform deletes stream state for full refresh streams before starting a
-            # new job, so we don't need to worry about this value existing for the initial attempt
+            # Before syncing the RFR stream, we check if the job's prior attempt was successful and don't need to fetch more records
+            # The platform deletes stream state for full refresh streams before starting a new job, so we don't need to worry about
+            # this value existing for the initial attempt
             if stream_state.get(FULL_REFRESH_SYNC_COMPLETE_KEY):
                 return
+            cursor_value = stream_state.get("next_page_token")
+
+            # The first attempt to read a page for the current partition should reset the paginator to the current
+            # cursor state which is initially assigned to the incoming state from the platform
+            partition_key = self._to_partition_key(_slice.partition)
+            if partition_key not in self._partition_started:
+                self._partition_started[partition_key] = True
+                self._paginator.reset(reset_value=cursor_value)

             yield from self._read_single_page(record_generator, stream_state, _slice)
         else:
+            # Fixing paginator types has a long tail of dependencies
+            self._paginator.reset()
+
             for stream_data in self._read_pages(record_generator, self.state, _slice):
                 current_record = self._extract_record(stream_data, _slice)
                 if self.cursor and current_record:
@@ -525,7 +518,7 @@
         stream_state: Mapping[str, Any],
         records_schema: Mapping[str, Any],
         stream_slice: Optional[StreamSlice],
-    ) -> Iterable[Record]:
+    ) -> Iterable[StreamData]:
         yield from self._parse_response(
             response,
             stream_slice=stream_slice,
@@ -569,7 +562,7 @@ class SimpleRetrieverTestReadDecorator(SimpleRetriever):
         next_page_token: Optional[Mapping[str, Any]] = None,
     ) -> Optional[requests.Response]:
         return self.requester.send_request(
-            path=self._paginator_path(next_page_token=next_page_token),
+            path=self._paginator_path(),
            stream_state=stream_state,
             stream_slice=stream_slice,
             next_page_token=next_page_token,
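
Taken together, these hunks move SimpleRetriever back to a stateful pagination flow: the retriever itself tracks _last_response, _last_page_size, and _last_record, and it calls paginator.path() with no arguments, paginator.next_page_token(response, last_page_size, last_record), and paginator.reset(). The sketch below is a hypothetical stub shaped to match those calls, not the CDK's DefaultPaginator; it assumes a simple page-count scheme.

from typing import Any, Mapping, Optional

import requests


class StubPageCountPaginator:
    """Hypothetical stand-in matching the calls SimpleRetriever makes in this build."""

    def __init__(self) -> None:
        self._page = 0

    def reset(self, reset_value: Optional[Any] = None) -> None:
        # Re-seed pagination state, optionally from a stored cursor value
        # (mirrors the reset(reset_value=cursor_value) call in the RFR branch above).
        self._page = int(reset_value) if reset_value is not None else 0

    def path(self) -> Optional[str]:
        # Returning None lets the requester fall back to its configured path.
        return None

    def next_page_token(
        self,
        response: requests.Response,
        last_page_size: int,
        last_record: Optional[Any],
    ) -> Optional[Mapping[str, Any]]:
        # Stop when the last page came back empty; otherwise advance the page counter.
        if last_page_size == 0:
            return None
        self._page += 1
        return {"next_page_token": self._page}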

airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py

@@ -1,6 +1,6 @@
 # Copyright (c) 2024 Airbyte, Inc., all rights reserved.

-from typing import Any, Iterable, Mapping, Optional
+from typing import Any, Callable, Iterable, Mapping, Optional

 from airbyte_cdk.sources.declarative.retrievers import Retriever
 from airbyte_cdk.sources.message import MessageRepository
@@ -16,7 +16,7 @@ class DeclarativePartitionFactory:
         self,
         stream_name: str,
         json_schema: Mapping[str, Any],
-        retriever: Retriever,
+        retriever_factory: Callable[[], Retriever],
         message_repository: MessageRepository,
     ) -> None:
         """
@@ -26,14 +26,14 @@ class DeclarativePartitionFactory:
         """
         self._stream_name = stream_name
         self._json_schema = json_schema
-        self._retriever = retriever
+        self._retriever_factory = retriever_factory
         self._message_repository = message_repository

     def create(self, stream_slice: StreamSlice) -> Partition:
         return DeclarativePartition(
             self._stream_name,
             self._json_schema,
-            self._retriever,
+            self._retriever_factory(),
             self._message_repository,
             stream_slice,
         )
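
The switch from a shared retriever instance to a retriever_factory means DeclarativePartitionFactory.create() builds a fresh Retriever for every partition, which matters once retrievers (and their paginators) hold per-read state as in the simple_retriever.py hunks above. The toy sketch below illustrates the difference with a hypothetical stand-in class; it is not the real Retriever wiring.

from typing import Callable, List


class StatefulRetrieverStub:
    """Hypothetical stand-in for a Retriever that keeps per-read pagination state."""

    def __init__(self) -> None:
        self.pages_read = 0


def partitions_from_shared(retriever: StatefulRetrieverStub, n: int) -> List[StatefulRetrieverStub]:
    # Old shape: every partition holds the same object, so state leaks between partitions.
    return [retriever for _ in range(n)]


def partitions_from_factory(
    retriever_factory: Callable[[], StatefulRetrieverStub], n: int
) -> List[StatefulRetrieverStub]:
    # New shape: the factory is invoked once per partition, mirroring
    # DeclarativePartitionFactory.create() calling self._retriever_factory().
    return [retriever_factory() for _ in range(n)]


shared = partitions_from_shared(StatefulRetrieverStub(), 3)
independent = partitions_from_factory(StatefulRetrieverStub, 3)
assert len({id(r) for r in shared}) == 1       # one shared instance
assert len({id(r) for r in independent}) == 3  # one instance per partition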

airbyte_cdk/sources/declarative/transformations/flatten_fields.py

@@ -11,8 +11,6 @@ from airbyte_cdk.sources.types import Config, StreamSlice, StreamState

 @dataclass
 class FlattenFields(RecordTransformation):
-    flatten_lists: bool = True
-
     def transform(
         self,
         record: Dict[str, Any],
@@ -41,7 +39,7 @@
                     )
                     stack.append((value, new_key))

-            elif isinstance(current_record, list) and self.flatten_lists:
+            elif isinstance(current_record, list):
                 for i, item in enumerate(current_record):
                     force_with_parent_name = True
                     stack.append((item, f"{parent_key}.{i}"))

airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py

@@ -31,14 +31,9 @@ class DeliverRawFiles(BaseModel):

     delivery_type: Literal["use_file_transfer"] = Field("use_file_transfer", const=True)

-    preserve_directory_structure: bool = Field(
-        title="Preserve Sub-Directories in File Paths",
-        description=(
-            "If enabled, sends subdirectory folder structure "
-            "along with source file names to the destination. "
-            "Otherwise, files will be synced by their names only. "
-            "This option is ignored when file-based replication is not enabled."
-        ),
+    preserve_subdirectories_directories: bool = Field(
+        title="Preserve Subdirectories in File Paths",
+        description="If enabled replicate source folder structure",
         default=True,
     )


airbyte_cdk/sources/file_based/exceptions.py

@@ -112,37 +112,7 @@ class ErrorListingFiles(BaseFileBasedSourceError):


 class DuplicatedFilesError(BaseFileBasedSourceError):
-    def __init__(self, duplicated_files_names: List[dict[str, List[str]]], **kwargs: Any):
-        self._duplicated_files_names = duplicated_files_names
-        self._stream_name: str = kwargs["stream"]
-        super().__init__(self._format_duplicate_files_error_message(), **kwargs)
-
-    def _format_duplicate_files_error_message(self) -> str:
-        duplicated_files_messages = []
-        for duplicated_file in self._duplicated_files_names:
-            for duplicated_file_name, file_paths in duplicated_file.items():
-                file_duplicated_message = (
-                    f"{len(file_paths)} duplicates found for file name {duplicated_file_name}:\n\n"
-                    + "".join(f"\n - {file_paths}")
-                )
-                duplicated_files_messages.append(file_duplicated_message)
-
-        error_message = (
-            f"ERROR: Duplicate filenames found for stream {self._stream_name}. "
-            "Duplicate file names are not allowed if the Preserve Sub-Directories in File Paths option is disabled. "
-            "Please remove or rename the duplicate files before attempting to re-run the sync.\n\n"
-            + "\n".join(duplicated_files_messages)
-        )
-
-        return error_message
-
-    def __repr__(self) -> str:
-        """Return a string representation of the exception."""
-        class_name = self.__class__.__name__
-        properties_str = ", ".join(
-            f"{k}={v!r}" for k, v in self.__dict__.items() if not k.startswith("_")
-        )
-        return f"{class_name}({properties_str})"
+    pass


 class CustomFileBasedException(AirbyteTracedException):
@@ -157,3 +127,25 @@ class CustomFileBasedException(AirbyteTracedException):

 class FileSizeLimitError(CustomFileBasedException):
     pass
+
+
+def format_duplicate_files_error_message(
+    stream_name: str, duplicated_files_names: List[dict[str, List[str]]]
+) -> str:
+    duplicated_files_messages = []
+    for duplicated_file in duplicated_files_names:
+        for duplicated_file_name, file_paths in duplicated_file.items():
+            file_duplicated_message = (
+                f"{len(file_paths)} duplicates found for file name {duplicated_file_name}:\n\n"
+                + "".join(f"\n - {file_paths}")
+            )
+            duplicated_files_messages.append(file_duplicated_message)
+
+    error_message = (
+        f"ERROR: Duplicate filenames found for stream {stream_name}. "
+        "Duplicate file names are not allowed if the Preserve Subdirectories in File Paths option is disabled. "
+        "Please remove or rename the duplicate files before attempting to re-run the sync.\n\n"
+        + "\n".join(duplicated_files_messages)
+    )
+
+    return error_message
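
With the formatting logic moved out of the exception class, callers are expected to build the message with format_duplicate_files_error_message and hand it to the now bare DuplicatedFilesError, as the default_file_based_stream.py hunk further below does. A hedged usage sketch with made-up stream and file names:

from airbyte_cdk.sources.file_based.exceptions import (
    DuplicatedFilesError,
    format_duplicate_files_error_message,
)

duplicated = [{"report.csv": ["2024/01/report.csv", "2024/02/report.csv"]}]
message = format_duplicate_files_error_message(
    stream_name="invoices", duplicated_files_names=duplicated
)
# Per the removed __init__ above, BaseFileBasedSourceError takes the message positionally plus the stream name.
raise DuplicatedFilesError(message, stream="invoices")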

airbyte_cdk/sources/file_based/file_based_source.py

@@ -311,7 +311,9 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
             errors_collector=self.errors_collector,
             cursor=cursor,
             use_file_transfer=self._use_file_transfer(parsed_config),
-            preserve_directory_structure=self._preserve_directory_structure(parsed_config),
+            preserve_subdirectories_directories=self._preserve_subdirectories_directories(
+                parsed_config
+            ),
         )

     def _get_stream_from_catalog(
@@ -388,23 +390,12 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
         return use_file_transfer

     @staticmethod
-    def _preserve_directory_structure(parsed_config: AbstractFileBasedSpec) -> bool:
-        """
-        Determines whether to preserve directory structure during file transfer.
-
-        When enabled, files maintain their subdirectory paths in the destination.
-        When disabled, files are flattened to the root of the destination.
-
-        Args:
-            parsed_config: The parsed configuration containing delivery method settings
-
-        Returns:
-            True if directory structure should be preserved (default), False otherwise
-        """
+    def _preserve_subdirectories_directories(parsed_config: AbstractFileBasedSpec) -> bool:
+        # fall back to preserve subdirectories if config is not present or incomplete
         if (
             FileBasedSource._use_file_transfer(parsed_config)
-            and hasattr(parsed_config.delivery_method, "preserve_directory_structure")
-            and parsed_config.delivery_method.preserve_directory_structure is not None
+            and hasattr(parsed_config.delivery_method, "preserve_subdirectories_directories")
+            and parsed_config.delivery_method.preserve_subdirectories_directories is not None
         ):
-            return parsed_config.delivery_method.preserve_directory_structure
+            return parsed_config.delivery_method.preserve_subdirectories_directories
         return True

airbyte_cdk/sources/file_based/file_based_stream_reader.py

@@ -135,15 +135,14 @@ class AbstractFileBasedStreamReader(ABC):
             return use_file_transfer
         return False

-    def preserve_directory_structure(self) -> bool:
+    def preserve_subdirectories_directories(self) -> bool:
         # fall back to preserve subdirectories if config is not present or incomplete
         if (
             self.use_file_transfer()
-            and self.config
-            and hasattr(self.config.delivery_method, "preserve_directory_structure")
-            and self.config.delivery_method.preserve_directory_structure is not None
+            and hasattr(self.config.delivery_method, "preserve_subdirectories_directories")
+            and self.config.delivery_method.preserve_subdirectories_directories is not None
         ):
-            return self.config.delivery_method.preserve_directory_structure
+            return self.config.delivery_method.preserve_subdirectories_directories
         return True

     @abstractmethod
@@ -171,8 +170,8 @@
         ...

     def _get_file_transfer_paths(self, file: RemoteFile, local_directory: str) -> List[str]:
-        preserve_directory_structure = self.preserve_directory_structure()
-        if preserve_directory_structure:
+        preserve_subdirectories_directories = self.preserve_subdirectories_directories()
+        if preserve_subdirectories_directories:
             # Remove left slashes from source path format to make relative path for writing locally
             file_relative_path = file.uri.lstrip("/")
         else:

airbyte_cdk/sources/file_based/file_types/unstructured_parser.py

@@ -2,7 +2,6 @@
 # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
 #
 import logging
-import os
 import traceback
 from datetime import datetime
 from io import BytesIO, IOBase
@@ -43,34 +42,12 @@ unstructured_partition_pdf = None
 unstructured_partition_docx = None
 unstructured_partition_pptx = None

-AIRBYTE_NLTK_DATA_DIR = "/airbyte/nltk_data"
-TMP_NLTK_DATA_DIR = "/tmp/nltk_data"
-
-
-def get_nltk_temp_folder() -> str:
-    """
-    For non-root connectors /tmp is not currently writable, but we should allow it in the future.
-    It's safe to use /airbyte for now. Fallback to /tmp for local development.
-    """
-    try:
-        nltk_data_dir = AIRBYTE_NLTK_DATA_DIR
-        os.makedirs(nltk_data_dir, exist_ok=True)
-    except OSError:
-        nltk_data_dir = TMP_NLTK_DATA_DIR
-        os.makedirs(nltk_data_dir, exist_ok=True)
-    return nltk_data_dir
-
-
 try:
-    nltk_data_dir = get_nltk_temp_folder()
-    nltk.data.path.append(nltk_data_dir)
     nltk.data.find("tokenizers/punkt.zip")
     nltk.data.find("tokenizers/punkt_tab.zip")
-    nltk.data.find("tokenizers/averaged_perceptron_tagger_eng.zip")
 except LookupError:
-    nltk.download("punkt", download_dir=nltk_data_dir, quiet=True)
-    nltk.download("punkt_tab", download_dir=nltk_data_dir, quiet=True)
-    nltk.download("averaged_perceptron_tagger_eng", download_dir=nltk_data_dir, quiet=True)
+    nltk.download("punkt")
+    nltk.download("punkt_tab")


 def optional_decode(contents: Union[str, bytes]) -> str:

airbyte_cdk/sources/file_based/stream/default_file_based_stream.py

@@ -22,6 +22,7 @@ from airbyte_cdk.sources.file_based.exceptions import (
     RecordParseError,
     SchemaInferenceError,
     StopSyncPerValidationPolicy,
+    format_duplicate_files_error_message,
 )
 from airbyte_cdk.sources.file_based.file_types import FileTransfer
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
@@ -46,7 +47,7 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
     """

     FILE_TRANSFER_KW = "use_file_transfer"
-    PRESERVE_DIRECTORY_STRUCTURE_KW = "preserve_directory_structure"
+    PRESERVE_SUBDIRECTORIES_KW = "preserve_subdirectories_directories"
     FILES_KEY = "files"
     DATE_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"
     ab_last_mod_col = "_ab_source_file_last_modified"
@@ -55,14 +56,13 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
     source_file_url = "source_file_url"
     airbyte_columns = [ab_last_mod_col, ab_file_name_col]
     use_file_transfer = False
-    preserve_directory_structure = True
+    preserve_subdirectories_directories = True

     def __init__(self, **kwargs: Any):
         if self.FILE_TRANSFER_KW in kwargs:
             self.use_file_transfer = kwargs.pop(self.FILE_TRANSFER_KW, False)
-        if self.PRESERVE_DIRECTORY_STRUCTURE_KW in kwargs:
-            self.preserve_directory_structure = kwargs.pop(
-                self.PRESERVE_DIRECTORY_STRUCTURE_KW, True
+            self.preserve_subdirectories_directories = kwargs.pop(
+                self.PRESERVE_SUBDIRECTORIES_KW, True
             )
         super().__init__(**kwargs)

@@ -111,13 +111,20 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
     def _duplicated_files_names(
         self, slices: List[dict[str, List[RemoteFile]]]
     ) -> List[dict[str, List[str]]]:
-        seen_file_names: Dict[str, List[str]] = defaultdict(list)
+        seen_file_names = set()
+        duplicates_file_names = set()
+        file_paths = defaultdict(list)
         for file_slice in slices:
             for file_found in file_slice[self.FILES_KEY]:
                 file_name = path.basename(file_found.uri)
-                seen_file_names[file_name].append(file_found.uri)
+                if file_name not in seen_file_names:
+                    seen_file_names.add(file_name)
+                else:
+                    duplicates_file_names.add(file_name)
+                file_paths[file_name].append(file_found.uri)
         return [
-            {file_name: paths} for file_name, paths in seen_file_names.items() if len(paths) > 1
+            {duplicated_file: file_paths[duplicated_file]}
+            for duplicated_file in duplicates_file_names
         ]

     def compute_slices(self) -> Iterable[Optional[Mapping[str, Any]]]:
@@ -129,11 +136,14 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
             {self.FILES_KEY: list(group[1])}
             for group in itertools.groupby(sorted_files_to_read, lambda f: f.last_modified)
         ]
-        if slices and not self.preserve_directory_structure:
+        if slices and not self.preserve_subdirectories_directories:
             duplicated_files_names = self._duplicated_files_names(slices)
             if duplicated_files_names:
                 raise DuplicatedFilesError(
-                    stream=self.name, duplicated_files_names=duplicated_files_names
+                    format_duplicate_files_error_message(
+                        stream_name=self.name, duplicated_files_names=duplicated_files_names
+                    ),
+                    stream=self.name,
                 )
         return slices

 
@@ -81,10 +81,10 @@ class AbstractOauth2Authenticator(AuthBase):
81
81
  Override to define additional parameters
82
82
  """
83
83
  payload: MutableMapping[str, Any] = {
84
- self.get_grant_type_name(): self.get_grant_type(),
85
- self.get_client_id_name(): self.get_client_id(),
86
- self.get_client_secret_name(): self.get_client_secret(),
87
- self.get_refresh_token_name(): self.get_refresh_token(),
84
+ "grant_type": self.get_grant_type(),
85
+ "client_id": self.get_client_id(),
86
+ "client_secret": self.get_client_secret(),
87
+ "refresh_token": self.get_refresh_token(),
88
88
  }
89
89
 
90
90
  if self.get_scopes():
@@ -206,26 +206,14 @@ class AbstractOauth2Authenticator(AuthBase):
206
206
  def get_token_refresh_endpoint(self) -> Optional[str]:
207
207
  """Returns the endpoint to refresh the access token"""
208
208
 
209
- @abstractmethod
210
- def get_client_id_name(self) -> str:
211
- """The client id name to authenticate"""
212
-
213
209
  @abstractmethod
214
210
  def get_client_id(self) -> str:
215
211
  """The client id to authenticate"""
216
212
 
217
- @abstractmethod
218
- def get_client_secret_name(self) -> str:
219
- """The client secret name to authenticate"""
220
-
221
213
  @abstractmethod
222
214
  def get_client_secret(self) -> str:
223
215
  """The client secret to authenticate"""
224
216
 
225
- @abstractmethod
226
- def get_refresh_token_name(self) -> str:
227
- """The refresh token name to authenticate"""
228
-
229
217
  @abstractmethod
230
218
  def get_refresh_token(self) -> Optional[str]:
231
219
  """The token used to refresh the access token when it expires"""
@@ -258,10 +246,6 @@ class AbstractOauth2Authenticator(AuthBase):
258
246
  def get_grant_type(self) -> str:
259
247
  """Returns grant_type specified for requesting access_token"""
260
248
 
261
- @abstractmethod
262
- def get_grant_type_name(self) -> str:
263
- """Returns grant_type specified name for requesting access_token"""
264
-
265
249
  @property
266
250
  @abstractmethod
267
251
  def access_token(self) -> str: