airbyte-cdk 6.13.1.dev4109__py3-none-any.whl → 6.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. airbyte_cdk/entrypoint.py +1 -13
  2. airbyte_cdk/sources/declarative/auth/oauth.py +0 -26
  3. airbyte_cdk/sources/declarative/concurrent_declarative_source.py +51 -24
  4. airbyte_cdk/sources/declarative/declarative_component_schema.yaml +20 -128
  5. airbyte_cdk/sources/declarative/extractors/__init__.py +0 -2
  6. airbyte_cdk/sources/declarative/extractors/record_selector.py +7 -5
  7. airbyte_cdk/sources/declarative/models/declarative_component_schema.py +11 -97
  8. airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +14 -71
  9. airbyte_cdk/sources/declarative/requesters/http_job_repository.py +4 -33
  10. airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +35 -52
  11. airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +7 -10
  12. airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +4 -9
  13. airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +6 -11
  14. airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +13 -13
  15. airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +13 -14
  16. airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +7 -6
  17. airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +10 -10
  18. airbyte_cdk/sources/declarative/retrievers/async_retriever.py +4 -1
  19. airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +64 -71
  20. airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +4 -4
  21. airbyte_cdk/sources/declarative/transformations/flatten_fields.py +1 -3
  22. airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +0 -11
  23. airbyte_cdk/sources/file_based/exceptions.py +0 -34
  24. airbyte_cdk/sources/file_based/file_based_source.py +5 -28
  25. airbyte_cdk/sources/file_based/file_based_stream_reader.py +4 -18
  26. airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +2 -25
  27. airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +2 -30
  28. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +4 -20
  29. airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +4 -34
  30. airbyte_cdk/sources/types.py +0 -3
  31. {airbyte_cdk-6.13.1.dev4109.dist-info → airbyte_cdk-6.14.0.dist-info}/METADATA +2 -2
  32. {airbyte_cdk-6.13.1.dev4109.dist-info → airbyte_cdk-6.14.0.dist-info}/RECORD +35 -38
  33. {airbyte_cdk-6.13.1.dev4109.dist-info → airbyte_cdk-6.14.0.dist-info}/WHEEL +1 -1
  34. airbyte_cdk/sources/declarative/extractors/type_transformer.py +0 -55
  35. airbyte_cdk/sources/declarative/requesters/README.md +0 -57
  36. airbyte_cdk/sources/declarative/transformations/keys_replace_transformation.py +0 -61
  37. {airbyte_cdk-6.13.1.dev4109.dist-info → airbyte_cdk-6.14.0.dist-info}/LICENSE.txt +0 -0
  38. {airbyte_cdk-6.13.1.dev4109.dist-info → airbyte_cdk-6.14.0.dist-info}/entry_points.txt +0 -0

airbyte_cdk/sources/declarative/retrievers/simple_retriever.py

@@ -6,7 +6,18 @@ import json
  from dataclasses import InitVar, dataclass, field
  from functools import partial
  from itertools import islice
- from typing import Any, Callable, Iterable, List, Mapping, Optional, Set, Tuple, Union
+ from typing import (
+ Any,
+ Callable,
+ Iterable,
+ List,
+ Mapping,
+ MutableMapping,
+ Optional,
+ Set,
+ Tuple,
+ Union,
+ )

  import requests

@@ -79,6 +90,9 @@ class SimpleRetriever(Retriever):

  def __post_init__(self, parameters: Mapping[str, Any]) -> None:
  self._paginator = self.paginator or NoPagination(parameters=parameters)
+ self._last_response: Optional[requests.Response] = None
+ self._last_page_size: int = 0
+ self._last_record: Optional[Record] = None
  self._parameters = parameters
  self._name = (
  InterpolatedString(self._name, parameters=parameters)
@@ -86,6 +100,10 @@ class SimpleRetriever(Retriever):
  else self._name
  )

+ # This mapping is used during a resumable full refresh syncs to indicate whether a partition has started syncing
+ # records. Partitions serve as the key and map to True if they already began processing records
+ self._partition_started: MutableMapping[Any, bool] = dict()
+
  @property # type: ignore
  def name(self) -> str:
  """
@@ -233,13 +251,17 @@ class SimpleRetriever(Retriever):
  raise ValueError("Request body json cannot be a string")
  return body_json

- def _paginator_path(self, next_page_token: Optional[Mapping[str, Any]] = None) -> Optional[str]:
+ def _paginator_path(
+ self,
+ ) -> Optional[str]:
  """
  If the paginator points to a path, follow it, else return nothing so the requester is used.
+ :param stream_state:
+ :param stream_slice:
  :param next_page_token:
  :return:
  """
- return self._paginator.path(next_page_token=next_page_token)
+ return self._paginator.path()

  def _parse_response(
  self,
@@ -250,15 +272,22 @@
  next_page_token: Optional[Mapping[str, Any]] = None,
  ) -> Iterable[Record]:
  if not response:
+ self._last_response = None
  yield from []
  else:
- yield from self.record_selector.select_records(
+ self._last_response = response
+ record_generator = self.record_selector.select_records(
  response=response,
  stream_state=stream_state,
  records_schema=records_schema,
  stream_slice=stream_slice,
  next_page_token=next_page_token,
  )
+ self._last_page_size = 0
+ for record in record_generator:
+ self._last_page_size += 1
+ self._last_record = record
+ yield record

  @property # type: ignore
  def primary_key(self) -> Optional[Union[str, List[str], List[List[str]]]]:
@@ -270,13 +299,7 @@ class SimpleRetriever(Retriever):
  if not isinstance(value, property):
  self._primary_key = value

- def _next_page_token(
- self,
- response: requests.Response,
- last_page_size: int,
- last_record: Optional[Record],
- last_page_token_value: Optional[Any],
- ) -> Optional[Mapping[str, Any]]:
+ def _next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]:
  """
  Specifies a pagination strategy.

@@ -284,12 +307,7 @@ class SimpleRetriever(Retriever):

  :return: The token for the next page from the input response object. Returning None means there are no more pages to read in this response.
  """
- return self._paginator.next_page_token(
- response=response,
- last_page_size=last_page_size,
- last_record=last_record,
- last_page_token_value=last_page_token_value,
- )
+ return self._paginator.next_page_token(response, self._last_page_size, self._last_record)

  def _fetch_next_page(
  self,
@@ -298,7 +316,7 @@
  next_page_token: Optional[Mapping[str, Any]] = None,
  ) -> Optional[requests.Response]:
  return self.requester.send_request(
- path=self._paginator_path(next_page_token=next_page_token),
+ path=self._paginator_path(),
  stream_state=stream_state,
  stream_slice=stream_slice,
  next_page_token=next_page_token,
@@ -327,37 +345,20 @@ class SimpleRetriever(Retriever):
  # This logic is similar to _read_pages in the HttpStream class. When making changes here, consider making changes there as well.
  def _read_pages(
  self,
- records_generator_fn: Callable[[Optional[requests.Response]], Iterable[Record]],
+ records_generator_fn: Callable[[Optional[requests.Response]], Iterable[StreamData]],
  stream_state: Mapping[str, Any],
  stream_slice: StreamSlice,
- ) -> Iterable[Record]:
+ ) -> Iterable[StreamData]:
  pagination_complete = False
- initial_token = self._paginator.get_initial_token()
- next_page_token: Optional[Mapping[str, Any]] = (
- {"next_page_token": initial_token} if initial_token else None
- )
+ next_page_token = None
  while not pagination_complete:
  response = self._fetch_next_page(stream_state, stream_slice, next_page_token)
-
- last_page_size = 0
- last_record: Optional[Record] = None
- for record in records_generator_fn(response):
- last_page_size += 1
- last_record = record
- yield record
+ yield from records_generator_fn(response)

  if not response:
  pagination_complete = True
  else:
- last_page_token_value = (
- next_page_token.get("next_page_token") if next_page_token else None
- )
- next_page_token = self._next_page_token(
- response=response,
- last_page_size=last_page_size,
- last_record=last_record,
- last_page_token_value=last_page_token_value,
- )
+ next_page_token = self._next_page_token(response)
  if not next_page_token:
  pagination_complete = True

@@ -366,38 +367,19 @@ class SimpleRetriever(Retriever):

  def _read_single_page(
  self,
- records_generator_fn: Callable[[Optional[requests.Response]], Iterable[Record]],
+ records_generator_fn: Callable[[Optional[requests.Response]], Iterable[StreamData]],
  stream_state: Mapping[str, Any],
  stream_slice: StreamSlice,
  ) -> Iterable[StreamData]:
- initial_token = stream_state.get("next_page_token")
- if initial_token is None:
- initial_token = self._paginator.get_initial_token()
- next_page_token: Optional[Mapping[str, Any]] = (
- {"next_page_token": initial_token} if initial_token else None
- )
-
- response = self._fetch_next_page(stream_state, stream_slice, next_page_token)
-
- last_page_size = 0
- last_record: Optional[Record] = None
- for record in records_generator_fn(response):
- last_page_size += 1
- last_record = record
- yield record
+ response = self._fetch_next_page(stream_state, stream_slice)
+ yield from records_generator_fn(response)

  if not response:
- next_page_token = {FULL_REFRESH_SYNC_COMPLETE_KEY: True}
+ next_page_token: Mapping[str, Any] = {FULL_REFRESH_SYNC_COMPLETE_KEY: True}
  else:
- last_page_token_value = (
- next_page_token.get("next_page_token") if next_page_token else None
- )
- next_page_token = self._next_page_token(
- response=response,
- last_page_size=last_page_size,
- last_record=last_record,
- last_page_token_value=last_page_token_value,
- ) or {FULL_REFRESH_SYNC_COMPLETE_KEY: True}
+ next_page_token = self._next_page_token(response) or {
+ FULL_REFRESH_SYNC_COMPLETE_KEY: True
+ }

  if self.cursor:
  self.cursor.close_slice(
@@ -432,14 +414,25 @@ class SimpleRetriever(Retriever):
  if self.cursor and isinstance(self.cursor, ResumableFullRefreshCursor):
  stream_state = self.state

- # Before syncing the RFR stream, we check if the job's prior attempt was successful and don't need to
- # fetch more records. The platform deletes stream state for full refresh streams before starting a
- # new job, so we don't need to worry about this value existing for the initial attempt
+ # Before syncing the RFR stream, we check if the job's prior attempt was successful and don't need to fetch more records
+ # The platform deletes stream state for full refresh streams before starting a new job, so we don't need to worry about
+ # this value existing for the initial attempt
  if stream_state.get(FULL_REFRESH_SYNC_COMPLETE_KEY):
  return
+ cursor_value = stream_state.get("next_page_token")
+
+ # The first attempt to read a page for the current partition should reset the paginator to the current
+ # cursor state which is initially assigned to the incoming state from the platform
+ partition_key = self._to_partition_key(_slice.partition)
+ if partition_key not in self._partition_started:
+ self._partition_started[partition_key] = True
+ self._paginator.reset(reset_value=cursor_value)

  yield from self._read_single_page(record_generator, stream_state, _slice)
  else:
+ # Fixing paginator types has a long tail of dependencies
+ self._paginator.reset()
+
  for stream_data in self._read_pages(record_generator, self.state, _slice):
  current_record = self._extract_record(stream_data, _slice)
  if self.cursor and current_record:
@@ -525,7 +518,7 @@ class SimpleRetriever(Retriever):
  stream_state: Mapping[str, Any],
  records_schema: Mapping[str, Any],
  stream_slice: Optional[StreamSlice],
- ) -> Iterable[Record]:
+ ) -> Iterable[StreamData]:
  yield from self._parse_response(
  response,
  stream_slice=stream_slice,
@@ -569,7 +562,7 @@ class SimpleRetrieverTestReadDecorator(SimpleRetriever):
  next_page_token: Optional[Mapping[str, Any]] = None,
  ) -> Optional[requests.Response]:
  return self.requester.send_request(
- path=self._paginator_path(next_page_token=next_page_token),
+ path=self._paginator_path(),
  stream_state=stream_state,
  stream_slice=stream_slice,
  next_page_token=next_page_token,
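
For readers skimming the retriever hunks above, the following standalone sketch compresses the page-reading loop that _read_pages and _next_page_token implement. It is illustrative only: fetch, parse, and the paginator argument are placeholders, not airbyte-cdk APIs.

    def read_pages(fetch, parse, paginator):
        # Minimal page loop in the style of SimpleRetriever._read_pages: fetch a page,
        # yield its records while counting them, then ask the paginator for the next token.
        next_page_token = None
        while True:
            response = fetch(next_page_token)      # stands in for requester.send_request(...)
            last_page_size, last_record = 0, None
            for record in parse(response):         # stands in for record_selector.select_records(...)
                last_page_size += 1
                last_record = record
                yield record
            if not response:
                break
            next_page_token = paginator.next_page_token(response, last_page_size, last_record)
            if not next_page_token:
                break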

airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py

@@ -1,6 +1,6 @@
  # Copyright (c) 2024 Airbyte, Inc., all rights reserved.

- from typing import Any, Iterable, Mapping, Optional
+ from typing import Any, Callable, Iterable, Mapping, Optional

  from airbyte_cdk.sources.declarative.retrievers import Retriever
  from airbyte_cdk.sources.message import MessageRepository
@@ -16,7 +16,7 @@ class DeclarativePartitionFactory:
  self,
  stream_name: str,
  json_schema: Mapping[str, Any],
- retriever: Retriever,
+ retriever_factory: Callable[[], Retriever],
  message_repository: MessageRepository,
  ) -> None:
  """
@@ -26,14 +26,14 @@ class DeclarativePartitionFactory:
  """
  self._stream_name = stream_name
  self._json_schema = json_schema
- self._retriever = retriever
+ self._retriever_factory = retriever_factory
  self._message_repository = message_repository

  def create(self, stream_slice: StreamSlice) -> Partition:
  return DeclarativePartition(
  self._stream_name,
  self._json_schema,
- self._retriever,
+ self._retriever_factory(),
  self._message_repository,
  stream_slice,
  )
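
The DeclarativePartitionFactory hunks above take a retriever_factory callable rather than a single Retriever instance. A minimal sketch of why a factory matters when retrievers hold per-read state; the StatefulRetriever class here is hypothetical, not part of the CDK.

    from dataclasses import dataclass
    from typing import Callable, List


    @dataclass
    class StatefulRetriever:
        # Hypothetical retriever that mutates internal state while reading (e.g. a pagination position).
        pages_read: int = 0

        def read(self) -> List[str]:
            self.pages_read += 1
            return [f"record-from-page-{self.pages_read}"]


    # Handing a factory (not an instance) to a partition generator gives each partition
    # its own retriever, so concurrently processed partitions do not share mutable state.
    retriever_factory: Callable[[], StatefulRetriever] = StatefulRetriever
    partition_a, partition_b = retriever_factory(), retriever_factory()
    assert partition_a is not partition_b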

airbyte_cdk/sources/declarative/transformations/flatten_fields.py

@@ -11,8 +11,6 @@ from airbyte_cdk.sources.types import Config, StreamSlice, StreamState

  @dataclass
  class FlattenFields(RecordTransformation):
- flatten_lists: bool = True
-
  def transform(
  self,
  record: Dict[str, Any],
@@ -41,7 +39,7 @@ class FlattenFields(RecordTransformation):
  )
  stack.append((value, new_key))

- elif isinstance(current_record, list) and self.flatten_lists:
+ elif isinstance(current_record, list):
  for i, item in enumerate(current_record):
  force_with_parent_name = True
  stack.append((item, f"{parent_key}.{i}"))

airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py

@@ -31,17 +31,6 @@ class DeliverRawFiles(BaseModel):

  delivery_type: Literal["use_file_transfer"] = Field("use_file_transfer", const=True)

- preserve_directory_structure: bool = Field(
- title="Preserve Sub-Directories in File Paths",
- description=(
- "If enabled, sends subdirectory folder structure "
- "along with source file names to the destination. "
- "Otherwise, files will be synced by their names only. "
- "This option is ignored when file-based replication is not enabled."
- ),
- default=True,
- )
-

  class AbstractFileBasedSpec(BaseModel):
  """

airbyte_cdk/sources/file_based/exceptions.py

@@ -111,40 +111,6 @@ class ErrorListingFiles(BaseFileBasedSourceError):
  pass


- class DuplicatedFilesError(BaseFileBasedSourceError):
- def __init__(self, duplicated_files_names: List[dict[str, List[str]]], **kwargs: Any):
- self._duplicated_files_names = duplicated_files_names
- self._stream_name: str = kwargs["stream"]
- super().__init__(self._format_duplicate_files_error_message(), **kwargs)
-
- def _format_duplicate_files_error_message(self) -> str:
- duplicated_files_messages = []
- for duplicated_file in self._duplicated_files_names:
- for duplicated_file_name, file_paths in duplicated_file.items():
- file_duplicated_message = (
- f"{len(file_paths)} duplicates found for file name {duplicated_file_name}:\n\n"
- + "".join(f"\n - {file_paths}")
- )
- duplicated_files_messages.append(file_duplicated_message)
-
- error_message = (
- f"ERROR: Duplicate filenames found for stream {self._stream_name}. "
- "Duplicate file names are not allowed if the Preserve Sub-Directories in File Paths option is disabled. "
- "Please remove or rename the duplicate files before attempting to re-run the sync.\n\n"
- + "\n".join(duplicated_files_messages)
- )
-
- return error_message
-
- def __repr__(self) -> str:
- """Return a string representation of the exception."""
- class_name = self.__class__.__name__
- properties_str = ", ".join(
- f"{k}={v!r}" for k, v in self.__dict__.items() if not k.startswith("_")
- )
- return f"{class_name}({properties_str})"
-
-
  class CustomFileBasedException(AirbyteTracedException):
  """
  A specialized exception for file-based connectors.

airbyte_cdk/sources/file_based/file_based_source.py

@@ -242,7 +242,7 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
  stream=self._make_default_stream(
  stream_config=stream_config,
  cursor=cursor,
- parsed_config=parsed_config,
+ use_file_transfer=self._use_file_transfer(parsed_config),
  ),
  source=self,
  logger=self.logger,
@@ -273,7 +273,7 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
  stream=self._make_default_stream(
  stream_config=stream_config,
  cursor=cursor,
- parsed_config=parsed_config,
+ use_file_transfer=self._use_file_transfer(parsed_config),
  ),
  source=self,
  logger=self.logger,
@@ -285,7 +285,7 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
  stream = self._make_default_stream(
  stream_config=stream_config,
  cursor=cursor,
- parsed_config=parsed_config,
+ use_file_transfer=self._use_file_transfer(parsed_config),
  )

  streams.append(stream)
@@ -298,7 +298,7 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
  self,
  stream_config: FileBasedStreamConfig,
  cursor: Optional[AbstractFileBasedCursor],
- parsed_config: AbstractFileBasedSpec,
+ use_file_transfer: bool = False,
  ) -> AbstractFileBasedStream:
  return DefaultFileBasedStream(
  config=stream_config,
@@ -310,8 +310,7 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
  validation_policy=self._validate_and_get_validation_policy(stream_config),
  errors_collector=self.errors_collector,
  cursor=cursor,
- use_file_transfer=self._use_file_transfer(parsed_config),
- preserve_directory_structure=self._preserve_directory_structure(parsed_config),
+ use_file_transfer=use_file_transfer,
  )

  def _get_stream_from_catalog(
@@ -386,25 +385,3 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
  and parsed_config.delivery_method.delivery_type == "use_file_transfer"
  )
  return use_file_transfer
-
- @staticmethod
- def _preserve_directory_structure(parsed_config: AbstractFileBasedSpec) -> bool:
- """
- Determines whether to preserve directory structure during file transfer.
-
- When enabled, files maintain their subdirectory paths in the destination.
- When disabled, files are flattened to the root of the destination.
-
- Args:
- parsed_config: The parsed configuration containing delivery method settings
-
- Returns:
- True if directory structure should be preserved (default), False otherwise
- """
- if (
- FileBasedSource._use_file_transfer(parsed_config)
- and hasattr(parsed_config.delivery_method, "preserve_directory_structure")
- and parsed_config.delivery_method.preserve_directory_structure is not None
- ):
- return parsed_config.delivery_method.preserve_directory_structure
- return True

airbyte_cdk/sources/file_based/file_based_stream_reader.py

@@ -135,17 +135,6 @@ class AbstractFileBasedStreamReader(ABC):
  return use_file_transfer
  return False

- def preserve_directory_structure(self) -> bool:
- # fall back to preserve subdirectories if config is not present or incomplete
- if (
- self.use_file_transfer()
- and self.config
- and hasattr(self.config.delivery_method, "preserve_directory_structure")
- and self.config.delivery_method.preserve_directory_structure is not None
- ):
- return self.config.delivery_method.preserve_directory_structure
- return True
-
  @abstractmethod
  def get_file(
  self, file: RemoteFile, local_directory: str, logger: logging.Logger
@@ -170,13 +159,10 @@ class AbstractFileBasedStreamReader(ABC):
  """
  ...

- def _get_file_transfer_paths(self, file: RemoteFile, local_directory: str) -> List[str]:
- preserve_directory_structure = self.preserve_directory_structure()
- if preserve_directory_structure:
- # Remove left slashes from source path format to make relative path for writing locally
- file_relative_path = file.uri.lstrip("/")
- else:
- file_relative_path = path.basename(file.uri)
+ @staticmethod
+ def _get_file_transfer_paths(file: RemoteFile, local_directory: str) -> List[str]:
+ # Remove left slashes from source path format to make relative path for writing locally
+ file_relative_path = file.uri.lstrip("/")
  local_file_path = path.join(local_directory, file_relative_path)

  # Ensure the local directory exists
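
The _get_file_transfer_paths hunk above strips leading slashes so the remote URI becomes a path relative to the local staging directory. A small standalone sketch of that path handling follows; the function name and the exact return shape are illustrative, not the CDK implementation.

    from os import makedirs, path


    def file_transfer_paths(file_uri: str, local_directory: str) -> list:
        # Drop leading slashes so the remote URI is treated as a relative path under local_directory.
        file_relative_path = file_uri.lstrip("/")
        local_file_path = path.join(local_directory, file_relative_path)
        # Ensure the local directory exists before any download is attempted.
        makedirs(path.dirname(local_file_path), exist_ok=True)
        return [local_file_path, file_relative_path]


    # file_transfer_paths("/inbox/2024/report.csv", "/tmp/files")
    # -> ["/tmp/files/inbox/2024/report.csv", "inbox/2024/report.csv"]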

airbyte_cdk/sources/file_based/file_types/unstructured_parser.py

@@ -2,7 +2,6 @@
  # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
  #
  import logging
- import os
  import traceback
  from datetime import datetime
  from io import BytesIO, IOBase
@@ -43,34 +42,12 @@ unstructured_partition_pdf = None
  unstructured_partition_docx = None
  unstructured_partition_pptx = None

- AIRBYTE_NLTK_DATA_DIR = "/airbyte/nltk_data"
- TMP_NLTK_DATA_DIR = "/tmp/nltk_data"
-
-
- def get_nltk_temp_folder() -> str:
- """
- For non-root connectors /tmp is not currently writable, but we should allow it in the future.
- It's safe to use /airbyte for now. Fallback to /tmp for local development.
- """
- try:
- nltk_data_dir = AIRBYTE_NLTK_DATA_DIR
- os.makedirs(nltk_data_dir, exist_ok=True)
- except OSError:
- nltk_data_dir = TMP_NLTK_DATA_DIR
- os.makedirs(nltk_data_dir, exist_ok=True)
- return nltk_data_dir
-
-
  try:
- nltk_data_dir = get_nltk_temp_folder()
- nltk.data.path.append(nltk_data_dir)
  nltk.data.find("tokenizers/punkt.zip")
  nltk.data.find("tokenizers/punkt_tab.zip")
- nltk.data.find("tokenizers/averaged_perceptron_tagger_eng.zip")
  except LookupError:
- nltk.download("punkt", download_dir=nltk_data_dir, quiet=True)
- nltk.download("punkt_tab", download_dir=nltk_data_dir, quiet=True)
- nltk.download("averaged_perceptron_tagger_eng", download_dir=nltk_data_dir, quiet=True)
+ nltk.download("punkt")
+ nltk.download("punkt_tab")


  def optional_decode(contents: Union[str, bytes]) -> str:

airbyte_cdk/sources/file_based/stream/default_file_based_stream.py

@@ -5,17 +5,14 @@
  import asyncio
  import itertools
  import traceback
- from collections import defaultdict
  from copy import deepcopy
  from functools import cache
- from os import path
- from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Optional, Set, Tuple, Union
+ from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Optional, Set, Union

  from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, FailureType, Level
  from airbyte_cdk.models import Type as MessageType
  from airbyte_cdk.sources.file_based.config.file_based_stream_config import PrimaryKeyType
  from airbyte_cdk.sources.file_based.exceptions import (
- DuplicatedFilesError,
  FileBasedSourceError,
  InvalidSchemaError,
  MissingSchemaError,
@@ -46,8 +43,6 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
  """

  FILE_TRANSFER_KW = "use_file_transfer"
- PRESERVE_DIRECTORY_STRUCTURE_KW = "preserve_directory_structure"
- FILES_KEY = "files"
  DATE_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"
  ab_last_mod_col = "_ab_source_file_last_modified"
  ab_file_name_col = "_ab_source_file_url"
@@ -55,15 +50,10 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
  source_file_url = "source_file_url"
  airbyte_columns = [ab_last_mod_col, ab_file_name_col]
  use_file_transfer = False
- preserve_directory_structure = True

  def __init__(self, **kwargs: Any):
  if self.FILE_TRANSFER_KW in kwargs:
  self.use_file_transfer = kwargs.pop(self.FILE_TRANSFER_KW, False)
- if self.PRESERVE_DIRECTORY_STRUCTURE_KW in kwargs:
- self.preserve_directory_structure = kwargs.pop(
- self.PRESERVE_DIRECTORY_STRUCTURE_KW, True
- )
  super().__init__(**kwargs)

  @property
@@ -108,33 +98,15 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
  else:
  return super()._filter_schema_invalid_properties(configured_catalog_json_schema)

- def _duplicated_files_names(
- self, slices: List[dict[str, List[RemoteFile]]]
- ) -> List[dict[str, List[str]]]:
- seen_file_names: Dict[str, List[str]] = defaultdict(list)
- for file_slice in slices:
- for file_found in file_slice[self.FILES_KEY]:
- file_name = path.basename(file_found.uri)
- seen_file_names[file_name].append(file_found.uri)
- return [
- {file_name: paths} for file_name, paths in seen_file_names.items() if len(paths) > 1
- ]
-
  def compute_slices(self) -> Iterable[Optional[Mapping[str, Any]]]:
  # Sort files by last_modified, uri and return them grouped by last_modified
  all_files = self.list_files()
  files_to_read = self._cursor.get_files_to_sync(all_files, self.logger)
  sorted_files_to_read = sorted(files_to_read, key=lambda f: (f.last_modified, f.uri))
  slices = [
- {self.FILES_KEY: list(group[1])}
+ {"files": list(group[1])}
  for group in itertools.groupby(sorted_files_to_read, lambda f: f.last_modified)
  ]
- if slices and not self.preserve_directory_structure:
- duplicated_files_names = self._duplicated_files_names(slices)
- if duplicated_files_names:
- raise DuplicatedFilesError(
- stream=self.name, duplicated_files_names=duplicated_files_names
- )
  return slices

  def transform_record(
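
The compute_slices hunk above sorts files by (last_modified, uri) and groups them by last_modified, one slice per group. A self-contained illustration of that itertools.groupby pattern; FileInfo is a stand-in for RemoteFile.

    import itertools
    from collections import namedtuple

    # Stand-in for RemoteFile with only the fields the slicing logic touches.
    FileInfo = namedtuple("FileInfo", ["uri", "last_modified"])

    files = [
        FileInfo("b.csv", "2024-01-02"),
        FileInfo("a.csv", "2024-01-01"),
        FileInfo("c.csv", "2024-01-02"),
    ]

    # Sort by (last_modified, uri), then emit one slice per distinct last_modified value.
    sorted_files = sorted(files, key=lambda f: (f.last_modified, f.uri))
    slices = [
        {"files": list(group)}
        for _, group in itertools.groupby(sorted_files, key=lambda f: f.last_modified)
    ]
    assert [len(s["files"]) for s in slices] == [1, 2]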

airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py

@@ -81,10 +81,10 @@ class AbstractOauth2Authenticator(AuthBase):
  Override to define additional parameters
  """
  payload: MutableMapping[str, Any] = {
- self.get_grant_type_name(): self.get_grant_type(),
- self.get_client_id_name(): self.get_client_id(),
- self.get_client_secret_name(): self.get_client_secret(),
- self.get_refresh_token_name(): self.get_refresh_token(),
+ "grant_type": self.get_grant_type(),
+ "client_id": self.get_client_id(),
+ "client_secret": self.get_client_secret(),
+ "refresh_token": self.get_refresh_token(),
  }

  if self.get_scopes():
@@ -206,26 +206,14 @@
  def get_token_refresh_endpoint(self) -> Optional[str]:
  """Returns the endpoint to refresh the access token"""

- @abstractmethod
- def get_client_id_name(self) -> str:
- """The client id name to authenticate"""
-
  @abstractmethod
  def get_client_id(self) -> str:
  """The client id to authenticate"""

- @abstractmethod
- def get_client_secret_name(self) -> str:
- """The client secret name to authenticate"""
-
  @abstractmethod
  def get_client_secret(self) -> str:
  """The client secret to authenticate"""

- @abstractmethod
- def get_refresh_token_name(self) -> str:
- """The refresh token name to authenticate"""
-
  @abstractmethod
  def get_refresh_token(self) -> Optional[str]:
  """The token used to refresh the access token when it expires"""
@@ -258,10 +246,6 @@
  def get_grant_type(self) -> str:
  """Returns grant_type specified for requesting access_token"""

- @abstractmethod
- def get_grant_type_name(self) -> str:
- """Returns grant_type specified name for requesting access_token"""
-
  @property
  @abstractmethod
  def access_token(self) -> str:
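
The build_refresh_request_body hunk above assembles a standard OAuth2 refresh-token grant payload. A minimal sketch of issuing such a request with requests follows; the token_url endpoint, the credentials, and the helper function itself are placeholders, not CDK APIs.

    import requests


    def refresh_access_token(token_url: str, client_id: str, client_secret: str, refresh_token: str) -> str:
        # Standard refresh_token grant using the same payload keys shown above.
        payload = {
            "grant_type": "refresh_token",
            "client_id": client_id,
            "client_secret": client_secret,
            "refresh_token": refresh_token,
        }
        response = requests.post(token_url, data=payload, timeout=30)
        response.raise_for_status()
        return str(response.json()["access_token"])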