airbyte-cdk 6.61.3.post2.dev17299502224__py3-none-any.whl → 6.62.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24)
  1. airbyte_cdk/legacy/sources/declarative/incremental/per_partition_cursor.py +4 -2
  2. airbyte_cdk/manifest_server/README.md +17 -3
  3. airbyte_cdk/manifest_server/openapi.yaml +27 -27
  4. airbyte_cdk/sources/declarative/concurrent_declarative_source.py +2 -2
  5. airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +57 -7
  6. airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +4 -2
  7. airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +196 -269
  8. airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +4 -7
  9. airbyte_cdk/sources/declarative/partition_routers/grouping_partition_router.py +0 -5
  10. airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +0 -6
  11. airbyte_cdk/sources/declarative/partition_routers/partition_router.py +1 -23
  12. airbyte_cdk/sources/declarative/partition_routers/single_partition_router.py +0 -6
  13. airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +88 -107
  14. airbyte_cdk/sources/declarative/requesters/request_options/per_partition_request_option_provider.py +95 -0
  15. airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py +4 -1
  16. airbyte_cdk/sources/declarative/retrievers/retriever.py +5 -0
  17. airbyte_cdk/sources/file_based/file_types/excel_parser.py +3 -3
  18. airbyte_cdk/sources/message/repository.py +20 -0
  19. {airbyte_cdk-6.61.3.post2.dev17299502224.dist-info → airbyte_cdk-6.62.0.dev0.dist-info}/METADATA +6 -5
  20. {airbyte_cdk-6.61.3.post2.dev17299502224.dist-info → airbyte_cdk-6.62.0.dev0.dist-info}/RECORD +24 -23
  21. {airbyte_cdk-6.61.3.post2.dev17299502224.dist-info → airbyte_cdk-6.62.0.dev0.dist-info}/LICENSE.txt +0 -0
  22. {airbyte_cdk-6.61.3.post2.dev17299502224.dist-info → airbyte_cdk-6.62.0.dev0.dist-info}/LICENSE_SHORT +0 -0
  23. {airbyte_cdk-6.61.3.post2.dev17299502224.dist-info → airbyte_cdk-6.62.0.dev0.dist-info}/WHEEL +0 -0
  24. {airbyte_cdk-6.61.3.post2.dev17299502224.dist-info → airbyte_cdk-6.62.0.dev0.dist-info}/entry_points.txt +0 -0
@@ -146,8 +146,10 @@ class PerPartitionCursor(DeclarativeCursor):
146
146
  if "state" in stream_state:
147
147
  self._state_to_migrate_from = stream_state["state"]
148
148
 
149
- # Set parent state for partition routers based on parent streams
150
- self._partition_router.set_initial_state(stream_state)
149
+ # We used to set the parent state through this method but since moving the SubstreamPartitionRouter to the
150
+ # Concurrent CDK/AbstractStream, the state is passed at the __init__ stage and this does not need to be called.
151
+ # We are still keeping this line as a comment to be explicit about the past behavior.
152
+ # self._partition_router.set_initial_state(stream_state)
151
153
 
152
154
  def observe(self, stream_slice: StreamSlice, record: Record) -> None:
153
155
  self._cursor_per_partition[self._to_partition_key(stream_slice.partition)].observe(
@@ -37,26 +37,31 @@ The server will start on `http://localhost:8000` by default.
37
37
  ## API Endpoints
38
38
 
39
39
  ### `/v1/manifest/test_read`
40
+
40
41
  Test reading from a specific stream in the manifest.
41
42
 
42
43
  **POST** - Test stream reading with configurable limits for records, pages, and slices.
43
44
 
44
45
  ### `/v1/manifest/check`
46
+
45
47
  Check configuration against a manifest.
46
48
 
47
49
  **POST** - Validates connector configuration and returns success/failure status with message.
48
50
 
49
51
  ### `/v1/manifest/discover`
52
+
50
53
  Discover streams from a manifest.
51
54
 
52
55
  **POST** - Returns the catalog of available streams from the manifest.
53
56
 
54
- ### `/v1/manifest/resolve`
57
+ ### `/v1/manifest/resolve`
58
+
55
59
  Resolve a manifest to its final configuration.
56
60
 
57
61
  **POST** - Returns the resolved manifest without dynamic stream generation.
58
62
 
59
63
  ### `/v1/manifest/full_resolve`
64
+
60
65
  Fully resolve a manifest including dynamic streams.
61
66
 
62
67
  **POST** - Generates dynamic streams up to specified limits and includes them in the resolved manifest.
@@ -68,6 +73,7 @@ The manifest server supports custom Python components, but this feature is **dis
68
73
  ### Enabling Custom Components
69
74
 
70
75
  To allow custom Python components in your manifest files, set the environment variable:
76
+
71
77
  ```bash
72
78
  export AIRBYTE_ENABLE_UNSAFE_CODE=true
73
79
  ```
@@ -77,20 +83,25 @@ export AIRBYTE_ENABLE_UNSAFE_CODE=true
77
83
  The manifest server supports optional JWT bearer token authentication:
78
84
 
79
85
  ### Configuration
86
+
80
87
  Set the environment variable to enable authentication:
88
+
81
89
  ```bash
82
90
  export AB_JWT_SIGNATURE_SECRET="your-jwt-secret-key"
83
91
  ```
84
92
 
85
93
  ### Usage
94
+
86
95
  When authentication is enabled, include a valid JWT token in the Authorization header:
96
+
87
97
  ```bash
88
98
  curl -H "Authorization: Bearer <your-jwt-token>" \
89
99
  http://localhost:8000/v1/manifest/test_read
90
100
  ```
91
101
 
92
102
  ### Behavior
93
- - **Without `AB_JWT_SIGNATURE_SECRET`**: All requests pass through
103
+
104
+ - **Without `AB_JWT_SIGNATURE_SECRET`**: All requests pass through
94
105
  - **With `AB_JWT_SIGNATURE_SECRET`**: Requires valid JWT bearer token using HS256 algorithm
95
106
 
96
107
  ## OpenAPI Specification
@@ -98,6 +109,7 @@ curl -H "Authorization: Bearer <your-jwt-token>" \
98
109
  The manifest server provides an OpenAPI specification for API client generation:
99
110
 
100
111
  ### Generating the OpenAPI Spec
112
+
101
113
  ```bash
102
114
  # Generate OpenAPI YAML (default location)
103
115
  manifest-server generate-openapi
@@ -107,6 +119,7 @@ manifest-server generate-openapi --output /path/to/openapi.yaml
107
119
  ```
108
120
 
109
121
  The generated OpenAPI specification is consumed by other applications and tools to:
122
+
110
123
  - Generate API clients in various programming languages
111
124
  - Create SDK bindings for the manifest server
112
125
  - Provide API documentation and validation
@@ -115,6 +128,7 @@ The generated OpenAPI specification is consumed by other applications and tools
115
128
  ### Interactive API Documentation
116
129
 
117
130
  When running, interactive API documentation is available at:
131
+
118
132
  - Swagger UI: `http://localhost:8000/docs`
119
133
  - ReDoc: `http://localhost:8000/redoc`
120
134
 
@@ -139,4 +153,4 @@ docker build -f airbyte_cdk/manifest_server/Dockerfile -t manifest-server .
139
153
  docker run -p 8080:8080 manifest-server
140
154
  ```
141
155
 
142
- Note: The container runs on port 8080 by default.
156
+ Note: The container runs on port 8080 by default.
@@ -61,7 +61,7 @@ paths:
61
61
  content:
62
62
  application/json:
63
63
  schema:
64
- $ref: '#/components/schemas/StreamRead'
64
+ $ref: '#/components/schemas/StreamReadResponse'
65
65
  '422':
66
66
  description: Validation Error
67
67
  content:
@@ -159,12 +159,13 @@ paths:
159
159
  tags:
160
160
  - manifest
161
161
  summary: Full Resolve
162
- description: 'Fully resolve a manifest including dynamic streams.
162
+ description: 'Fully resolve a manifest, including dynamic streams.
163
163
 
164
164
 
165
- Generates dynamic streams up to the specified limit and includes
166
-
167
- them in the resolved manifest.'
165
+ This is a similar operation to resolve, but has an extra step which generates
166
+ streams from dynamic stream templates if the manifest contains any. This is
167
+ used when a user clicks the generate streams button on a stream template in
168
+ the Builder UI'
168
169
  operationId: fullResolve
169
170
  requestBody:
170
171
  content:
@@ -465,7 +466,26 @@ components:
465
466
  - manifest
466
467
  title: ResolveRequest
467
468
  description: Request to resolve a manifest.
468
- StreamRead:
469
+ StreamReadPages:
470
+ properties:
471
+ records:
472
+ items: {}
473
+ type: array
474
+ title: Records
475
+ request:
476
+ anyOf:
477
+ - $ref: '#/components/schemas/HttpRequest'
478
+ - type: 'null'
479
+ response:
480
+ anyOf:
481
+ - $ref: '#/components/schemas/HttpResponse'
482
+ - type: 'null'
483
+ type: object
484
+ required:
485
+ - records
486
+ title: StreamReadPages
487
+ description: Pages of data read from a stream slice.
488
+ StreamReadResponse:
469
489
  properties:
470
490
  logs:
471
491
  items:
@@ -511,27 +531,8 @@ components:
511
531
  - inferred_schema
512
532
  - inferred_datetime_formats
513
533
  - latest_config_update
514
- title: StreamRead
534
+ title: StreamReadResponse
515
535
  description: Complete stream read response with properly typed fields.
516
- StreamReadPages:
517
- properties:
518
- records:
519
- items: {}
520
- type: array
521
- title: Records
522
- request:
523
- anyOf:
524
- - $ref: '#/components/schemas/HttpRequest'
525
- - type: 'null'
526
- response:
527
- anyOf:
528
- - $ref: '#/components/schemas/HttpResponse'
529
- - type: 'null'
530
- type: object
531
- required:
532
- - records
533
- title: StreamReadPages
534
- description: Pages of data read from a stream slice.
535
536
  StreamReadSlices:
536
537
  properties:
537
538
  pages:
@@ -577,7 +578,6 @@ components:
577
578
  items: {}
578
579
  type: array
579
580
  title: State
580
- default: []
581
581
  custom_components_code:
582
582
  anyOf:
583
583
  - type: string
@@ -703,7 +703,7 @@ class ConcurrentDeclarativeSource(AbstractSource, Generic[TState]):
703
703
  stream_slicer=declarative_stream.retriever.stream_slicer,
704
704
  slice_limit=self._limits.max_slices
705
705
  if self._limits
706
- else None, # technically not needed because create_declarative_stream() -> create_simple_retriever() will apply the decorator. But for consistency and depending how we build create_default_stream, this may be needed later
706
+ else None, # technically not needed because create_default_stream() -> create_simple_retriever() will apply the decorator. But for consistency and depending how we build create_default_stream, this may be needed later
707
707
  )
708
708
  else:
709
709
  if (
@@ -772,7 +772,7 @@ class ConcurrentDeclarativeSource(AbstractSource, Generic[TState]):
772
772
  declarative_stream.retriever.stream_slicer,
773
773
  slice_limit=self._limits.max_slices
774
774
  if self._limits
775
- else None, # technically not needed because create_declarative_stream() -> create_simple_retriever() will apply the decorator. But for consistency and depending how we build create_default_stream, this may be needed later
775
+ else None, # technically not needed because create_default_stream() -> create_simple_retriever() will apply the decorator. But for consistency and depending how we build create_default_stream, this may be needed later
776
776
  )
777
777
 
778
778
  final_state_cursor = FinalStateCursor(
@@ -11,6 +11,13 @@ from copy import deepcopy
11
11
  from datetime import timedelta
12
12
  from typing import Any, Callable, Iterable, List, Mapping, MutableMapping, Optional
13
13
 
14
+ from airbyte_cdk.models import (
15
+ AirbyteStateBlob,
16
+ AirbyteStateMessage,
17
+ AirbyteStateType,
18
+ AirbyteStreamState,
19
+ StreamDescriptor,
20
+ )
14
21
  from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
15
22
  from airbyte_cdk.sources.declarative.incremental.global_substream_cursor import (
16
23
  Timer,
@@ -48,7 +55,7 @@ class ConcurrentPerPartitionCursor(Cursor):
48
55
  Manages state per partition when a stream has many partitions, preventing data loss or duplication.
49
56
 
50
57
  Attributes:
51
- DEFAULT_MAX_PARTITIONS_NUMBER (int): Maximum number of partitions to retain in memory (default is 10,000).
58
+ DEFAULT_MAX_PARTITIONS_NUMBER (int): Maximum number of partitions to retain in memory (default is 10,000). This limit needs to be higher than the number of threads we might enqueue (which is represented by ThreadPoolManager.DEFAULT_MAX_QUEUE_SIZE). Otherwise, a partition that has been generated and submitted to the ThreadPool could be deleted from the ConcurrentPerPartitionCursor before it is closed, and closing it would then raise a KeyError.
52
59
 
53
60
  - **Partition Limitation Logic**
54
61
  Ensures the number of tracked partitions does not exceed the specified limit to prevent memory overuse. Oldest partitions are removed when the limit is reached.
@@ -128,6 +135,7 @@ class ConcurrentPerPartitionCursor(Cursor):
128
135
 
129
136
  # FIXME this is a temporary field kept for the duration of the migration from declarative cursors to concurrent ones
130
137
  self._attempt_to_create_cursor_if_not_provided = attempt_to_create_cursor_if_not_provided
138
+ self._synced_some_data = False
131
139
 
132
140
  @property
133
141
  def cursor_field(self) -> CursorField:
@@ -168,8 +176,8 @@ class ConcurrentPerPartitionCursor(Cursor):
168
176
  with self._lock:
169
177
  self._semaphore_per_partition[partition_key].acquire()
170
178
  if not self._use_global_cursor:
171
- self._cursor_per_partition[partition_key].close_partition(partition=partition)
172
179
  cursor = self._cursor_per_partition[partition_key]
180
+ cursor.close_partition(partition=partition)
173
181
  if (
174
182
  partition_key in self._partitions_done_generating_stream_slices
175
183
  and self._semaphore_per_partition[partition_key]._value == 0
@@ -213,8 +221,10 @@ class ConcurrentPerPartitionCursor(Cursor):
213
221
  if not any(
214
222
  semaphore_item[1]._value for semaphore_item in self._semaphore_per_partition.items()
215
223
  ):
216
- self._global_cursor = self._new_global_cursor
217
- self._lookback_window = self._timer.finish()
224
+ if self._synced_some_data:
225
+ # we only update those if we actually synced some data
226
+ self._global_cursor = self._new_global_cursor
227
+ self._lookback_window = self._timer.finish()
218
228
  self._parent_state = self._partition_router.get_stream_state()
219
229
  self._emit_state_message(throttle=False)
220
230
 
@@ -422,9 +432,6 @@ class ConcurrentPerPartitionCursor(Cursor):
422
432
  if stream_state.get("parent_state"):
423
433
  self._parent_state = stream_state["parent_state"]
424
434
 
425
- # Set parent state for partition routers based on parent streams
426
- self._partition_router.set_initial_state(stream_state)
427
-
428
435
  def _set_global_state(self, stream_state: Mapping[str, Any]) -> None:
429
436
  """
430
437
  Initializes the global cursor state from the provided stream state.
@@ -458,6 +465,7 @@ class ConcurrentPerPartitionCursor(Cursor):
458
465
  except ValueError:
459
466
  return
460
467
 
468
+ self._synced_some_data = True
461
469
  record_cursor = self._connector_state_converter.output_format(
462
470
  self._connector_state_converter.parse_value(record_cursor_value)
463
471
  )
@@ -541,3 +549,45 @@ class ConcurrentPerPartitionCursor(Cursor):
541
549
 
542
550
  def limit_reached(self) -> bool:
543
551
  return self._number_of_partitions > self.SWITCH_TO_GLOBAL_LIMIT
552
+
553
+ @staticmethod
554
+ def get_parent_state(
555
+ stream_state: Optional[StreamState], parent_stream_name: str
556
+ ) -> Optional[AirbyteStateMessage]:
557
+ if not stream_state:
558
+ return None
559
+
560
+ if "parent_state" not in stream_state:
561
+ logger.warning(
562
+ f"Trying to get_parent_state for stream `{parent_stream_name}` when there are not parent state in the state"
563
+ )
564
+ return None
565
+ elif parent_stream_name not in stream_state["parent_state"]:
566
+ logger.info(
567
+ f"Could not find parent state for stream `{parent_stream_name}`. On parents available are {list(stream_state['parent_state'].keys())}"
568
+ )
569
+ return None
570
+
571
+ return AirbyteStateMessage(
572
+ type=AirbyteStateType.STREAM,
573
+ stream=AirbyteStreamState(
574
+ stream_descriptor=StreamDescriptor(parent_stream_name, None),
575
+ stream_state=AirbyteStateBlob(stream_state["parent_state"][parent_stream_name]),
576
+ ),
577
+ )
578
+
579
+ @staticmethod
580
+ def get_global_state(
581
+ stream_state: Optional[StreamState], parent_stream_name: str
582
+ ) -> Optional[AirbyteStateMessage]:
583
+ return (
584
+ AirbyteStateMessage(
585
+ type=AirbyteStateType.STREAM,
586
+ stream=AirbyteStreamState(
587
+ stream_descriptor=StreamDescriptor(parent_stream_name, None),
588
+ stream_state=AirbyteStateBlob(stream_state["state"]),
589
+ ),
590
+ )
591
+ if stream_state and "state" in stream_state
592
+ else None
593
+ )
@@ -192,8 +192,10 @@ class GlobalSubstreamCursor(DeclarativeCursor):
192
192
  # Example: {"global_state_format_key": "global_state_format_value"}
193
193
  self._stream_cursor.set_initial_state(stream_state)
194
194
 
195
- # Set parent state for partition routers based on parent streams
196
- self._partition_router.set_initial_state(stream_state)
195
+ # We used to set the parent state through this method but since moving the SubstreamPartitionRouter to the
196
+ # Concurrent CDK/AbstractStream, the state is passed at the __init__ stage and this does not need to be called.
197
+ # We are still keeping this line as a comment to be explicit about the past behavior.
198
+ # self._partition_router.set_initial_state(stream_state)
197
199
 
198
200
  def _inject_lookback_into_stream_cursor(self, lookback_window: int) -> None:
199
201
  """