airbyte-cdk 6.41.9__py3-none-any.whl → 6.41.9.dev4101__py3-none-any.whl

This diff compares the contents of two publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
Files changed (27)
  1. airbyte_cdk/models/__init__.py +1 -0
  2. airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +1 -0
  3. airbyte_cdk/sources/declarative/async_job/job.py +0 -6
  4. airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +18 -18
  5. airbyte_cdk/sources/declarative/async_job/job_tracker.py +6 -22
  6. airbyte_cdk/sources/declarative/concurrent_declarative_source.py +22 -0
  7. airbyte_cdk/sources/declarative/declarative_component_schema.yaml +39 -64
  8. airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +1 -2
  9. airbyte_cdk/sources/declarative/models/declarative_component_schema.py +25 -45
  10. airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +24 -45
  11. airbyte_cdk/sources/declarative/partition_routers/__init__.py +0 -4
  12. airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +1 -5
  13. airbyte_cdk/sources/declarative/retrievers/file_uploader.py +61 -0
  14. airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +17 -4
  15. airbyte_cdk/sources/file_based/file_types/file_transfer.py +2 -8
  16. airbyte_cdk/sources/streams/concurrent/default_stream.py +3 -0
  17. airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +0 -4
  18. airbyte_cdk/sources/types.py +11 -0
  19. airbyte_cdk/sources/utils/files_directory.py +15 -0
  20. airbyte_cdk/sources/utils/record_helper.py +8 -1
  21. {airbyte_cdk-6.41.9.dist-info → airbyte_cdk-6.41.9.dev4101.dist-info}/METADATA +2 -2
  22. {airbyte_cdk-6.41.9.dist-info → airbyte_cdk-6.41.9.dev4101.dist-info}/RECORD +26 -25
  23. airbyte_cdk/sources/declarative/partition_routers/grouping_partition_router.py +0 -150
  24. {airbyte_cdk-6.41.9.dist-info → airbyte_cdk-6.41.9.dev4101.dist-info}/LICENSE.txt +0 -0
  25. {airbyte_cdk-6.41.9.dist-info → airbyte_cdk-6.41.9.dev4101.dist-info}/LICENSE_SHORT +0 -0
  26. {airbyte_cdk-6.41.9.dist-info → airbyte_cdk-6.41.9.dev4101.dist-info}/WHEEL +0 -0
  27. {airbyte_cdk-6.41.9.dist-info → airbyte_cdk-6.41.9.dev4101.dist-info}/entry_points.txt +0 -0
airbyte_cdk/models/__init__.py
@@ -19,6 +19,7 @@ from .airbyte_protocol import (
  AirbyteMessage,
  AirbyteProtocol,
  AirbyteRecordMessage,
+ AirbyteRecordMessageFileReference,
  AirbyteStateBlob,
  AirbyteStateMessage,
  AirbyteStateStats,
airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py
@@ -150,6 +150,7 @@ class ConcurrentReadProcessor:
  stream_name=record.stream_name,
  data_or_message=record.data,
  is_file_transfer_message=record.is_file_transfer_message,
+ file_reference=record.file_reference,
  )
  stream = self._stream_name_to_instance[record.stream_name]
airbyte_cdk/sources/declarative/async_job/job.py
@@ -34,12 +34,6 @@ class AsyncJob:

  def status(self) -> AsyncJobStatus:
  if self._timer.has_timed_out():
- # TODO: we should account the fact that,
- # certain APIs could send the `Timeout` status,
- # thus we should not return `Timeout` in that case,
- # but act based on the scenario.
-
- # the default behavior is to return `Timeout` status and retry.
  return AsyncJobStatus.TIMED_OUT
  return self._status
airbyte_cdk/sources/declarative/async_job/job_orchestrator.py
@@ -44,21 +44,16 @@ class AsyncPartition:
  This bucket of api_jobs is a bit useless for this iteration but should become interesting when we will be able to split jobs
  """

- _DEFAULT_MAX_JOB_RETRY = 3
+ _MAX_NUMBER_OF_ATTEMPTS = 3

- def __init__(
- self, jobs: List[AsyncJob], stream_slice: StreamSlice, job_max_retry: Optional[int] = None
- ) -> None:
+ def __init__(self, jobs: List[AsyncJob], stream_slice: StreamSlice) -> None:
  self._attempts_per_job = {job: 1 for job in jobs}
  self._stream_slice = stream_slice
- self._job_max_retry = (
- job_max_retry if job_max_retry is not None else self._DEFAULT_MAX_JOB_RETRY
- )

  def has_reached_max_attempt(self) -> bool:
  return any(
  map(
- lambda attempt_count: attempt_count >= self._job_max_retry,
+ lambda attempt_count: attempt_count >= self._MAX_NUMBER_OF_ATTEMPTS,
  self._attempts_per_job.values(),
  )
  )
@@ -67,7 +62,7 @@ class AsyncPartition:
  current_attempt_count = self._attempts_per_job.pop(job_to_replace, None)
  if current_attempt_count is None:
  raise ValueError("Could not find job to replace")
- elif current_attempt_count >= self._job_max_retry:
+ elif current_attempt_count >= self._MAX_NUMBER_OF_ATTEMPTS:
  raise ValueError(f"Max attempt reached for job in partition {self._stream_slice}")

  new_attempt_count = current_attempt_count + 1
@@ -160,7 +155,6 @@ class AsyncJobOrchestrator:
  message_repository: MessageRepository,
  exceptions_to_break_on: Iterable[Type[Exception]] = tuple(),
  has_bulk_parent: bool = False,
- job_max_retry: Optional[int] = None,
  ) -> None:
  """
  If the stream slices provided as a parameters relies on a async job streams that relies on the same JobTracker, `has_bulk_parent`
@@ -181,12 +175,11 @@ class AsyncJobOrchestrator:
  self._message_repository = message_repository
  self._exceptions_to_break_on: Tuple[Type[Exception], ...] = tuple(exceptions_to_break_on)
  self._has_bulk_parent = has_bulk_parent
- self._job_max_retry = job_max_retry

  self._non_breaking_exceptions: List[Exception] = []

  def _replace_failed_jobs(self, partition: AsyncPartition) -> None:
- failed_status_jobs = (AsyncJobStatus.FAILED, AsyncJobStatus.TIMED_OUT)
+ failed_status_jobs = (AsyncJobStatus.FAILED,)
  jobs_to_replace = [job for job in partition.jobs if job.status() in failed_status_jobs]
  for job in jobs_to_replace:
  new_job = self._start_job(job.job_parameters(), job.api_job_id())
@@ -221,7 +214,7 @@ class AsyncJobOrchestrator:
  for _slice in self._slice_iterator:
  at_least_one_slice_consumed_from_slice_iterator_during_current_iteration = True
  job = self._start_job(_slice)
- self._running_partitions.append(AsyncPartition([job], _slice, self._job_max_retry))
+ self._running_partitions.append(AsyncPartition([job], _slice))
  if self._has_bulk_parent and self._slice_iterator.has_next():
  break
  except ConcurrentJobLimitReached:
@@ -370,7 +363,7 @@ class AsyncJobOrchestrator:
  self._reallocate_partition(current_running_partitions, partition)

  # We only remove completed / timeout jobs jobs as we want failed jobs to be re-allocated in priority
- self._remove_completed_jobs(partition)
+ self._remove_completed_or_timed_out_jobs(partition)

  # update the referenced list with running partitions
  self._running_partitions = current_running_partitions
@@ -385,7 +378,11 @@ class AsyncJobOrchestrator:
  def _stop_timed_out_jobs(self, partition: AsyncPartition) -> None:
  for job in partition.jobs:
  if job.status() == AsyncJobStatus.TIMED_OUT:
- self._abort_job(job, free_job_allocation=False)
+ self._abort_job(job, free_job_allocation=True)
+ raise AirbyteTracedException(
+ internal_message=f"Job {job.api_job_id()} has timed out. Try increasing the `polling job timeout`.",
+ failure_type=FailureType.config_error,
+ )

  def _abort_job(self, job: AsyncJob, free_job_allocation: bool = True) -> None:
  try:
@@ -395,7 +392,7 @@ class AsyncJobOrchestrator:
  except Exception as exception:
  LOGGER.warning(f"Could not free budget for job {job.api_job_id()}: {exception}")

- def _remove_completed_jobs(self, partition: AsyncPartition) -> None:
+ def _remove_completed_or_timed_out_jobs(self, partition: AsyncPartition) -> None:
  """
  Remove completed or timed out jobs from the partition.

@@ -403,7 +400,7 @@ class AsyncJobOrchestrator:
  partition (AsyncPartition): The partition to process.
  """
  for job in partition.jobs:
- if job.status() == AsyncJobStatus.COMPLETED:
+ if job.status() in [AsyncJobStatus.COMPLETED, AsyncJobStatus.TIMED_OUT]:
  self._job_tracker.remove_job(job.api_job_id())

  def _reallocate_partition(
@@ -418,7 +415,10 @@ class AsyncJobOrchestrator:
  current_running_partitions (list): The list of currently running partitions.
  partition (AsyncPartition): The partition to reallocate.
  """
- current_running_partitions.insert(0, partition)
+ for job in partition.jobs:
+ if job.status() != AsyncJobStatus.TIMED_OUT:
+ # allow the FAILED jobs to be re-allocated for partition
+ current_running_partitions.insert(0, partition)

  def _process_partitions_with_errors(self, partition: AsyncPartition) -> None:
  """
airbyte_cdk/sources/declarative/async_job/job_tracker.py
@@ -3,11 +3,9 @@
  import logging
  import threading
  import uuid
- from dataclasses import dataclass, field
- from typing import Any, Mapping, Set, Union
+ from typing import Set

  from airbyte_cdk.logger import lazy_log
- from airbyte_cdk.sources.declarative.interpolation import InterpolatedString

  LOGGER = logging.getLogger("airbyte")

@@ -16,29 +14,15 @@ class ConcurrentJobLimitReached(Exception):
  pass


- @dataclass
  class JobTracker:
- limit: Union[int, str]
- config: Mapping[str, Any] = field(default_factory=dict)
-
- def __post_init__(self) -> None:
+ def __init__(self, limit: int):
  self._jobs: Set[str] = set()
- self._lock = threading.Lock()
- if isinstance(self.limit, str):
- try:
- self.limit = int(
- InterpolatedString(self.limit, parameters={}).eval(config=self.config)
- )
- except Exception as e:
- LOGGER.warning(
- f"Error interpolating max job count: {self.limit}. Setting to 1. {e}"
- )
- self.limit = 1
- if self.limit < 1:
+ if limit < 1:
  LOGGER.warning(
- f"The `max_concurrent_async_job_count` property is less than 1: {self.limit}. Setting to 1. Please update the source manifest to set a valid value."
+ f"The `max_concurrent_async_job_count` property is less than 1: {limit}. Setting to 1. Please update the source manifest to set a valid value."
  )
- self._limit = self.limit if self.limit >= 1 else 1
+ self._limit = 1 if limit < 1 else limit
+ self._lock = threading.Lock()

  def try_to_get_intent(self) -> str:
  lazy_log(
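A minimal usage sketch (not part of the diff) of the simplified JobTracker constructor above: it now takes a plain integer instead of an interpolable string, and values below 1 are clamped to 1 with a warning.

from airbyte_cdk.sources.declarative.async_job.job_tracker import JobTracker

tracker = JobTracker(limit=3)   # allows at most 3 concurrent async jobs
fallback = JobTracker(limit=0)  # logs a warning and falls back to a limit of 1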
airbyte_cdk/sources/declarative/concurrent_declarative_source.py
@@ -25,6 +25,7 @@ from airbyte_cdk.sources.declarative.incremental.per_partition_with_global impor
  PerPartitionWithGlobalCursor,
  )
  from airbyte_cdk.sources.declarative.manifest_declarative_source import ManifestDeclarativeSource
+ from airbyte_cdk.sources.declarative.models import FileUploader
  from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
  ConcurrencyLevel as ConcurrencyLevelModel,
  )
@@ -206,6 +207,20 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
  # these legacy Python streams the way we do low-code streams to determine if they are concurrent compatible,
  # so we need to treat them as synchronous

+ file_uploader = None
+ if isinstance(declarative_stream, DeclarativeStream):
+ file_uploader = (
+ self._constructor.create_component(
+ model_type=FileUploader,
+ component_definition=name_to_stream_mapping[declarative_stream.name][
+ "file_uploader"
+ ],
+ config=config,
+ )
+ if "file_uploader" in name_to_stream_mapping[declarative_stream.name]
+ else None
+ )
+
  if (
  isinstance(declarative_stream, DeclarativeStream)
  and name_to_stream_mapping[declarative_stream.name]["type"]
@@ -273,6 +288,7 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
  declarative_stream.get_json_schema(),
  retriever,
  self.message_repository,
+ file_uploader,
  ),
  stream_slicer=declarative_stream.retriever.stream_slicer,
  )
@@ -303,6 +319,7 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
  declarative_stream.get_json_schema(),
  retriever,
  self.message_repository,
+ file_uploader,
  ),
  stream_slicer=cursor,
  )
@@ -322,6 +339,7 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
  else None,
  logger=self.logger,
  cursor=cursor,
+ supports_file_transfer=bool(file_uploader),
  )
  )
  elif (
@@ -333,6 +351,7 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
  declarative_stream.get_json_schema(),
  declarative_stream.retriever,
  self.message_repository,
+ file_uploader,
  ),
  declarative_stream.retriever.stream_slicer,
  )
@@ -353,6 +372,7 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
  cursor_field=None,
  logger=self.logger,
  cursor=final_state_cursor,
+ supports_file_transfer=bool(file_uploader),
  )
  )
  elif (
@@ -392,6 +412,7 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
  declarative_stream.get_json_schema(),
  retriever,
  self.message_repository,
+ file_uploader,
  ),
  perpartition_cursor,
  )
@@ -406,6 +427,7 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
  cursor_field=perpartition_cursor.cursor_field.cursor_field_key,
  logger=self.logger,
  cursor=perpartition_cursor,
+ supports_file_transfer=bool(file_uploader),
  )
  )
  else:
airbyte_cdk/sources/declarative/declarative_component_schema.yaml
@@ -47,12 +47,7 @@ properties:
  max_concurrent_async_job_count:
  title: Maximum Concurrent Asynchronous Jobs
  description: Maximum number of concurrent asynchronous jobs to run. This property is only relevant for sources/streams that support asynchronous job execution through the AsyncRetriever (e.g. a report-based stream that initiates a job, polls the job status, and then fetches the job results). This is often set by the API's maximum number of concurrent jobs on the account level. Refer to the API's documentation for this information.
- type:
- - integer
- - string
- examples:
- - 3
- - "{{ config['max_concurrent_async_job_count'] }}"
+ type: integer
  metadata:
  type: object
  description: For internal Airbyte use only - DO NOT modify manually. Used by consumers of declarative manifests for storing related metadata.
@@ -1427,6 +1422,33 @@ definitions:
  - "$ref": "#/definitions/LegacyToPerPartitionStateMigration"
  - "$ref": "#/definitions/CustomStateMigration"
  default: []
+ file_uploader:
+ title: File Uploader
+ description: (experimental) Describes how to fetch a file
+ type: object
+ required:
+ - type
+ - requester
+ - download_target_extractor
+ properties:
+ type:
+ type: string
+ enum: [ FileUploader ]
+ requester:
+ description: Requester component that describes how to prepare HTTP requests to send to the source API.
+ anyOf:
+ - "$ref": "#/definitions/CustomRequester"
+ - "$ref": "#/definitions/HttpRequester"
+ download_target_extractor:
+ description: Responsible for fetching the url where the file is located. This is applied on each records and not on the HTTP response
+ anyOf:
+ - "$ref": "#/definitions/CustomRecordExtractor"
+ - "$ref": "#/definitions/DpathExtractor"
+ file_extractor:
+ description: Responsible for fetching the content of the file. If not defined, the assumption is that the whole response body is the file content
+ anyOf:
+ - "$ref": "#/definitions/CustomRecordExtractor"
+ - "$ref": "#/definitions/DpathExtractor"
  $parameters:
  type: object
  additional_properties: true
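To make the new `file_uploader` definition concrete, here is a hypothetical component definition expressed as a Python dict (all values are illustrative, not taken from this diff): per the schema above, `requester` and `download_target_extractor` are required, while `file_extractor` is optional.

file_uploader_definition = {
    "type": "FileUploader",
    "requester": {
        "type": "HttpRequester",
        "url_base": "https://api.example.com",  # illustrative base URL
        "path": "/attachments/{{ stream_slice['id'] }}",  # illustrative path
    },
    "download_target_extractor": {
        "type": "DpathExtractor",
        # illustrative record field that holds the file's download URL
        "field_path": ["download_url"],
    },
}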
@@ -2197,8 +2219,7 @@ definitions:
  type: object
  additionalProperties: true
  JsonDecoder:
- title: JSON
- description: Select 'JSON' if the response is formatted as a JSON object.
+ title: Json Decoder
  type: object
  required:
  - type
@@ -2207,8 +2228,8 @@ definitions:
  type: string
  enum: [JsonDecoder]
  JsonlDecoder:
- title: JSON Lines
- description: Select 'JSON Lines' if the response consists of JSON objects separated by new lines ('\n') in JSONL format.
+ title: JSONL Decoder
+ description: Use this if the response consists of JSON objects separated by new lines (`\n`) in JSONL format.
  type: object
  required:
  - type
@@ -2333,8 +2354,8 @@ definitions:
  type: object
  additionalProperties: true
  IterableDecoder:
- title: Iterable
- description: Select 'Iterable' if the response consists of strings separated by new lines (`\n`). The string will then be wrapped into a JSON object with the `record` key.
+ title: Iterable Decoder
+ description: Use this if the response consists of strings separated by new lines (`\n`). The Decoder will wrap each row into a JSON object with the `record` key.
  type: object
  required:
  - type
@@ -2343,8 +2364,8 @@ definitions:
  type: string
  enum: [IterableDecoder]
  XmlDecoder:
- title: XML
- description: Select 'XML' if the response consists of XML-formatted data.
+ title: XML Decoder
+ description: Use this if the response is XML.
  type: object
  required:
  - type
@@ -2375,8 +2396,8 @@ definitions:
  type: object
  additionalProperties: true
  ZipfileDecoder:
- title: ZIP File
- description: Select 'ZIP file' for response data that is returned as a zipfile. Requires specifying an inner data type/decoder to parse the unzipped data.
+ title: Zipfile Decoder
+ description: Decoder for response data that is returned as zipfile(s).
  type: object
  additionalProperties: true
  required:
@@ -2900,7 +2921,7 @@ definitions:
  title: Lazy Read Pointer
  description: If set, this will enable lazy reading, using the initial read of parent records to extract child records.
  type: array
- default: []
+ default: [ ]
  items:
  - type: string
  interpolation_context:
@@ -3205,7 +3226,7 @@ definitions:
  properties:
  type:
  type: string
- enum: [StateDelegatingStream]
+ enum: [ StateDelegatingStream ]
  name:
  title: Name
  description: The stream name.
@@ -3260,14 +3281,12 @@ definitions:
  - "$ref": "#/definitions/CustomPartitionRouter"
  - "$ref": "#/definitions/ListPartitionRouter"
  - "$ref": "#/definitions/SubstreamPartitionRouter"
- - "$ref": "#/definitions/GroupingPartitionRouter"
  - type: array
  items:
  anyOf:
  - "$ref": "#/definitions/CustomPartitionRouter"
  - "$ref": "#/definitions/ListPartitionRouter"
  - "$ref": "#/definitions/SubstreamPartitionRouter"
- - "$ref": "#/definitions/GroupingPartitionRouter"
  decoder:
  title: Decoder
  description: Component decoding the response so records can be extracted.
@@ -3284,8 +3303,6 @@ definitions:
  type: object
  additionalProperties: true
  GzipDecoder:
- title: gzip
- description: Select 'gzip' for response data that is compressed with gzip. Requires specifying an inner data type/decoder to parse the decompressed data.
  type: object
  required:
  - type
@@ -3301,8 +3318,6 @@ definitions:
  - "$ref": "#/definitions/JsonDecoder"
  - "$ref": "#/definitions/JsonlDecoder"
  CsvDecoder:
- title: CSV
- description: "Select 'CSV' for response data that is formatted as CSV (comma-separated values). Can specify an encoding (default: 'utf-8') and a delimiter (default: ',')."
  type: object
  required:
  - type
@@ -3433,14 +3448,12 @@ definitions:
  - "$ref": "#/definitions/CustomPartitionRouter"
  - "$ref": "#/definitions/ListPartitionRouter"
  - "$ref": "#/definitions/SubstreamPartitionRouter"
- - "$ref": "#/definitions/GroupingPartitionRouter"
  - type: array
  items:
  anyOf:
  - "$ref": "#/definitions/CustomPartitionRouter"
  - "$ref": "#/definitions/ListPartitionRouter"
  - "$ref": "#/definitions/SubstreamPartitionRouter"
- - "$ref": "#/definitions/GroupingPartitionRouter"
  decoder:
  title: Decoder
  description: Component decoding the response so records can be extracted.
@@ -3557,44 +3570,6 @@ definitions:
  $parameters:
  type: object
  additionalProperties: true
- GroupingPartitionRouter:
- title: Grouping Partition Router
- description: >
- A decorator on top of a partition router that groups partitions into batches of a specified size.
- This is useful for APIs that support filtering by multiple partition keys in a single request.
- Note that per-partition incremental syncs may not work as expected because the grouping
- of partitions might change between syncs, potentially leading to inconsistent state tracking.
- type: object
- required:
- - type
- - group_size
- - underlying_partition_router
- properties:
- type:
- type: string
- enum: [GroupingPartitionRouter]
- group_size:
- title: Group Size
- description: The number of partitions to include in each group. This determines how many partition values are batched together in a single slice.
- type: integer
- examples:
- - 10
- - 50
- underlying_partition_router:
- title: Underlying Partition Router
- description: The partition router whose output will be grouped. This can be any valid partition router component.
- anyOf:
- - "$ref": "#/definitions/CustomPartitionRouter"
- - "$ref": "#/definitions/ListPartitionRouter"
- - "$ref": "#/definitions/SubstreamPartitionRouter"
- deduplicate:
- title: Deduplicate Partitions
- description: If true, ensures that partitions are unique within each group by removing duplicates based on the partition key.
- type: boolean
- default: true
- $parameters:
- type: object
- additionalProperties: true
  WaitUntilTimeFromHeader:
  title: Wait Until Time Defined In Response Header
  description: Extract time at which we can retry the request from response header and wait for the difference between now and that time.
airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py
@@ -79,7 +79,6 @@ class ConcurrentPerPartitionCursor(Cursor):
  connector_state_manager: ConnectorStateManager,
  connector_state_converter: AbstractStreamStateConverter,
  cursor_field: CursorField,
- use_global_cursor: bool = False,
  ) -> None:
  self._global_cursor: Optional[StreamState] = {}
  self._stream_name = stream_name
@@ -107,7 +106,7 @@ class ConcurrentPerPartitionCursor(Cursor):
  self._lookback_window: int = 0
  self._parent_state: Optional[StreamState] = None
  self._number_of_partitions: int = 0
- self._use_global_cursor: bool = use_global_cursor
+ self._use_global_cursor: bool = False
  self._partition_serializer = PerPartitionKeySerializer()
  # Track the last time a state message was emitted
  self._last_emission_time: float = 0.0
airbyte_cdk/sources/declarative/models/declarative_component_schema.py
@@ -1890,10 +1890,9 @@ class DeclarativeSource1(BaseModel):
  spec: Optional[Spec] = None
  concurrency_level: Optional[ConcurrencyLevel] = None
  api_budget: Optional[HTTPAPIBudget] = None
- max_concurrent_async_job_count: Optional[Union[int, str]] = Field(
+ max_concurrent_async_job_count: Optional[int] = Field(
  None,
  description="Maximum number of concurrent asynchronous jobs to run. This property is only relevant for sources/streams that support asynchronous job execution through the AsyncRetriever (e.g. a report-based stream that initiates a job, polls the job status, and then fetches the job results). This is often set by the API's maximum number of concurrent jobs on the account level. Refer to the API's documentation for this information.",
- examples=[3, "{{ config['max_concurrent_async_job_count'] }}"],
  title="Maximum Concurrent Asynchronous Jobs",
  )
  metadata: Optional[Dict[str, Any]] = Field(
@@ -1923,10 +1922,9 @@ class DeclarativeSource2(BaseModel):
  spec: Optional[Spec] = None
  concurrency_level: Optional[ConcurrencyLevel] = None
  api_budget: Optional[HTTPAPIBudget] = None
- max_concurrent_async_job_count: Optional[Union[int, str]] = Field(
+ max_concurrent_async_job_count: Optional[int] = Field(
  None,
  description="Maximum number of concurrent asynchronous jobs to run. This property is only relevant for sources/streams that support asynchronous job execution through the AsyncRetriever (e.g. a report-based stream that initiates a job, polls the job status, and then fetches the job results). This is often set by the API's maximum number of concurrent jobs on the account level. Refer to the API's documentation for this information.",
- examples=[3, "{{ config['max_concurrent_async_job_count'] }}"],
  title="Maximum Concurrent Asynchronous Jobs",
  )
  metadata: Optional[Dict[str, Any]] = Field(
@@ -2280,6 +2278,22 @@ class StateDelegatingStream(BaseModel):
  parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")


+ class FileUploader(BaseModel):
+ type: Literal["FileUploader"]
+ requester: Union[CustomRequester, HttpRequester] = Field(
+ ...,
+ description="Requester component that describes how to prepare HTTP requests to send to the source API.",
+ )
+ download_target_extractor: Union[CustomRecordExtractor, DpathExtractor] = Field(
+ ...,
+ description="Responsible for fetching the url where the file is located. This is applied on each records and not on the HTTP response",
+ )
+ file_extractor: Optional[Union[CustomRecordExtractor, DpathExtractor]] = Field(
+ None,
+ description="Responsible for fetching the content of the file. If not defined, the assumption is that the whole response body is the file content",
+ )
+
+
  class SimpleRetriever(BaseModel):
  type: Literal["SimpleRetriever"]
  record_selector: RecordSelector = Field(
@@ -2303,21 +2317,18 @@ class SimpleRetriever(BaseModel):
  CustomPartitionRouter,
  ListPartitionRouter,
  SubstreamPartitionRouter,
- GroupingPartitionRouter,
- List[
- Union[
- CustomPartitionRouter,
- ListPartitionRouter,
- SubstreamPartitionRouter,
- GroupingPartitionRouter,
- ]
- ],
+ List[Union[CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter]],
  ]
  ] = Field(
  [],
  description="PartitionRouter component that describes how to partition the stream, enabling incremental syncs and checkpointing.",
  title="Partition Router",
  )
+ file_uploader: Optional[FileUploader] = Field(
+ None,
+ description="(experimental) Describes how to fetch a file",
+ title="File Uploader",
+ )
  decoder: Optional[
  Union[
  CustomDecoder,
@@ -2393,15 +2404,7 @@ class AsyncRetriever(BaseModel):
  CustomPartitionRouter,
  ListPartitionRouter,
  SubstreamPartitionRouter,
- GroupingPartitionRouter,
- List[
- Union[
- CustomPartitionRouter,
- ListPartitionRouter,
- SubstreamPartitionRouter,
- GroupingPartitionRouter,
- ]
- ],
+ List[Union[CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter]],
  ]
  ] = Field(
  [],
@@ -2453,29 +2456,6 @@ class SubstreamPartitionRouter(BaseModel):
  parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")


- class GroupingPartitionRouter(BaseModel):
- type: Literal["GroupingPartitionRouter"]
- group_size: int = Field(
- ...,
- description="The number of partitions to include in each group. This determines how many partition values are batched together in a single slice.",
- examples=[10, 50],
- title="Group Size",
- )
- underlying_partition_router: Union[
- CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter
- ] = Field(
- ...,
- description="The partition router whose output will be grouped. This can be any valid partition router component.",
- title="Underlying Partition Router",
- )
- deduplicate: Optional[bool] = Field(
- True,
- description="If true, ensures that partitions are unique within each group by removing duplicates based on the partition key.",
- title="Deduplicate Partitions",
- )
- parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")
-
-
  class HttpComponentsResolver(BaseModel):
  type: Literal["HttpComponentsResolver"]
  retriever: Union[AsyncRetriever, CustomRetriever, SimpleRetriever] = Field(