airbyte-cdk 6.61.3.post2.dev17299502224__py3-none-any.whl → 6.62.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/legacy/sources/declarative/incremental/per_partition_cursor.py +4 -2
- airbyte_cdk/manifest_server/README.md +17 -3
- airbyte_cdk/manifest_server/openapi.yaml +27 -27
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +2 -2
- airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +57 -7
- airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +4 -2
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +196 -269
- airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +4 -7
- airbyte_cdk/sources/declarative/partition_routers/grouping_partition_router.py +0 -5
- airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +0 -6
- airbyte_cdk/sources/declarative/partition_routers/partition_router.py +1 -23
- airbyte_cdk/sources/declarative/partition_routers/single_partition_router.py +0 -6
- airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +88 -107
- airbyte_cdk/sources/declarative/requesters/request_options/per_partition_request_option_provider.py +95 -0
- airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py +4 -1
- airbyte_cdk/sources/declarative/retrievers/retriever.py +5 -0
- airbyte_cdk/sources/file_based/file_types/excel_parser.py +3 -3
- airbyte_cdk/sources/message/repository.py +20 -0
- {airbyte_cdk-6.61.3.post2.dev17299502224.dist-info → airbyte_cdk-6.62.0.dev0.dist-info}/METADATA +6 -5
- {airbyte_cdk-6.61.3.post2.dev17299502224.dist-info → airbyte_cdk-6.62.0.dev0.dist-info}/RECORD +24 -23
- {airbyte_cdk-6.61.3.post2.dev17299502224.dist-info → airbyte_cdk-6.62.0.dev0.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-6.61.3.post2.dev17299502224.dist-info → airbyte_cdk-6.62.0.dev0.dist-info}/LICENSE_SHORT +0 -0
- {airbyte_cdk-6.61.3.post2.dev17299502224.dist-info → airbyte_cdk-6.62.0.dev0.dist-info}/WHEEL +0 -0
- {airbyte_cdk-6.61.3.post2.dev17299502224.dist-info → airbyte_cdk-6.62.0.dev0.dist-info}/entry_points.txt +0 -0
@@ -146,8 +146,10 @@ class PerPartitionCursor(DeclarativeCursor):
|
|
146
146
|
if "state" in stream_state:
|
147
147
|
self._state_to_migrate_from = stream_state["state"]
|
148
148
|
|
149
|
-
#
|
150
|
-
|
149
|
+
# We used to set the parent state through this method but since moving the SubstreamPartitionRouter to the
|
150
|
+
# Concurrent CDK/AbstractStream, the state is passed at the __init__ stage and this does not need to be called.
|
151
|
+
# We are still keeping this line as a comment to be explicit about the past behavior.
|
152
|
+
# self._partition_router.set_initial_state(stream_state)
|
151
153
|
|
152
154
|
def observe(self, stream_slice: StreamSlice, record: Record) -> None:
|
153
155
|
self._cursor_per_partition[self._to_partition_key(stream_slice.partition)].observe(
|
@@ -37,26 +37,31 @@ The server will start on `http://localhost:8000` by default.
|
|
37
37
|
## API Endpoints
|
38
38
|
|
39
39
|
### `/v1/manifest/test_read`
|
40
|
+
|
40
41
|
Test reading from a specific stream in the manifest.
|
41
42
|
|
42
43
|
**POST** - Test stream reading with configurable limits for records, pages, and slices.
|
43
44
|
|
44
45
|
### `/v1/manifest/check`
|
46
|
+
|
45
47
|
Check configuration against a manifest.
|
46
48
|
|
47
49
|
**POST** - Validates connector configuration and returns success/failure status with message.
|
48
50
|
|
49
51
|
### `/v1/manifest/discover`
|
52
|
+
|
50
53
|
Discover streams from a manifest.
|
51
54
|
|
52
55
|
**POST** - Returns the catalog of available streams from the manifest.
|
53
56
|
|
54
|
-
### `/v1/manifest/resolve`
|
57
|
+
### `/v1/manifest/resolve`
|
58
|
+
|
55
59
|
Resolve a manifest to its final configuration.
|
56
60
|
|
57
61
|
**POST** - Returns the resolved manifest without dynamic stream generation.
|
58
62
|
|
59
63
|
### `/v1/manifest/full_resolve`
|
64
|
+
|
60
65
|
Fully resolve a manifest including dynamic streams.
|
61
66
|
|
62
67
|
**POST** - Generates dynamic streams up to specified limits and includes them in the resolved manifest.
|
@@ -68,6 +73,7 @@ The manifest server supports custom Python components, but this feature is **dis
|
|
68
73
|
### Enabling Custom Components
|
69
74
|
|
70
75
|
To allow custom Python components in your manifest files, set the environment variable:
|
76
|
+
|
71
77
|
```bash
|
72
78
|
export AIRBYTE_ENABLE_UNSAFE_CODE=true
|
73
79
|
```
|
@@ -77,20 +83,25 @@ export AIRBYTE_ENABLE_UNSAFE_CODE=true
|
|
77
83
|
The manifest server supports optional JWT bearer token authentication:
|
78
84
|
|
79
85
|
### Configuration
|
86
|
+
|
80
87
|
Set the environment variable to enable authentication:
|
88
|
+
|
81
89
|
```bash
|
82
90
|
export AB_JWT_SIGNATURE_SECRET="your-jwt-secret-key"
|
83
91
|
```
|
84
92
|
|
85
93
|
### Usage
|
94
|
+
|
86
95
|
When authentication is enabled, include a valid JWT token in the Authorization header:
|
96
|
+
|
87
97
|
```bash
|
88
98
|
curl -H "Authorization: Bearer <your-jwt-token>" \
|
89
99
|
http://localhost:8000/v1/manifest/test_read
|
90
100
|
```
|
91
101
|
|
92
102
|
### Behavior
|
93
|
-
|
103
|
+
|
104
|
+
- **Without `AB_JWT_SIGNATURE_SECRET`**: All requests pass through
|
94
105
|
- **With `AB_JWT_SIGNATURE_SECRET`**: Requires valid JWT bearer token using HS256 algorithm
|
95
106
|
|
96
107
|
## OpenAPI Specification
|
@@ -98,6 +109,7 @@ curl -H "Authorization: Bearer <your-jwt-token>" \
|
|
98
109
|
The manifest server provides an OpenAPI specification for API client generation:
|
99
110
|
|
100
111
|
### Generating the OpenAPI Spec
|
112
|
+
|
101
113
|
```bash
|
102
114
|
# Generate OpenAPI YAML (default location)
|
103
115
|
manifest-server generate-openapi
|
@@ -107,6 +119,7 @@ manifest-server generate-openapi --output /path/to/openapi.yaml
|
|
107
119
|
```
|
108
120
|
|
109
121
|
The generated OpenAPI specification is consumed by other applications and tools to:
|
122
|
+
|
110
123
|
- Generate API clients in various programming languages
|
111
124
|
- Create SDK bindings for the manifest server
|
112
125
|
- Provide API documentation and validation
|
@@ -115,6 +128,7 @@ The generated OpenAPI specification is consumed by other applications and tools
|
|
115
128
|
### Interactive API Documentation
|
116
129
|
|
117
130
|
When running, interactive API documentation is available at:
|
131
|
+
|
118
132
|
- Swagger UI: `http://localhost:8000/docs`
|
119
133
|
- ReDoc: `http://localhost:8000/redoc`
|
120
134
|
|
@@ -139,4 +153,4 @@ docker build -f airbyte_cdk/manifest_server/Dockerfile -t manifest-server .
|
|
139
153
|
docker run -p 8080:8080 manifest-server
|
140
154
|
```
|
141
155
|
|
142
|
-
Note: The container runs on port 8080 by default.
|
156
|
+
Note: The container runs on port 8080 by default.
|
@@ -61,7 +61,7 @@ paths:
|
|
61
61
|
content:
|
62
62
|
application/json:
|
63
63
|
schema:
|
64
|
-
$ref: '#/components/schemas/
|
64
|
+
$ref: '#/components/schemas/StreamReadResponse'
|
65
65
|
'422':
|
66
66
|
description: Validation Error
|
67
67
|
content:
|
@@ -159,12 +159,13 @@ paths:
|
|
159
159
|
tags:
|
160
160
|
- manifest
|
161
161
|
summary: Full Resolve
|
162
|
-
description: 'Fully resolve a manifest including dynamic streams.
|
162
|
+
description: 'Fully resolve a manifest, including dynamic streams.
|
163
163
|
|
164
164
|
|
165
|
-
|
166
|
-
|
167
|
-
|
165
|
+
This is a similar operation to resolve, but has an extra step which generates
|
166
|
+
streams from dynamic stream templates if the manifest contains any. This is
|
167
|
+
used when a user clicks the generate streams button on a stream template in
|
168
|
+
the Builder UI'
|
168
169
|
operationId: fullResolve
|
169
170
|
requestBody:
|
170
171
|
content:
|
@@ -465,7 +466,26 @@ components:
|
|
465
466
|
- manifest
|
466
467
|
title: ResolveRequest
|
467
468
|
description: Request to resolve a manifest.
|
468
|
-
|
469
|
+
StreamReadPages:
|
470
|
+
properties:
|
471
|
+
records:
|
472
|
+
items: {}
|
473
|
+
type: array
|
474
|
+
title: Records
|
475
|
+
request:
|
476
|
+
anyOf:
|
477
|
+
- $ref: '#/components/schemas/HttpRequest'
|
478
|
+
- type: 'null'
|
479
|
+
response:
|
480
|
+
anyOf:
|
481
|
+
- $ref: '#/components/schemas/HttpResponse'
|
482
|
+
- type: 'null'
|
483
|
+
type: object
|
484
|
+
required:
|
485
|
+
- records
|
486
|
+
title: StreamReadPages
|
487
|
+
description: Pages of data read from a stream slice.
|
488
|
+
StreamReadResponse:
|
469
489
|
properties:
|
470
490
|
logs:
|
471
491
|
items:
|
@@ -511,27 +531,8 @@ components:
|
|
511
531
|
- inferred_schema
|
512
532
|
- inferred_datetime_formats
|
513
533
|
- latest_config_update
|
514
|
-
title:
|
534
|
+
title: StreamReadResponse
|
515
535
|
description: Complete stream read response with properly typed fields.
|
516
|
-
StreamReadPages:
|
517
|
-
properties:
|
518
|
-
records:
|
519
|
-
items: {}
|
520
|
-
type: array
|
521
|
-
title: Records
|
522
|
-
request:
|
523
|
-
anyOf:
|
524
|
-
- $ref: '#/components/schemas/HttpRequest'
|
525
|
-
- type: 'null'
|
526
|
-
response:
|
527
|
-
anyOf:
|
528
|
-
- $ref: '#/components/schemas/HttpResponse'
|
529
|
-
- type: 'null'
|
530
|
-
type: object
|
531
|
-
required:
|
532
|
-
- records
|
533
|
-
title: StreamReadPages
|
534
|
-
description: Pages of data read from a stream slice.
|
535
536
|
StreamReadSlices:
|
536
537
|
properties:
|
537
538
|
pages:
|
@@ -577,7 +578,6 @@ components:
|
|
577
578
|
items: {}
|
578
579
|
type: array
|
579
580
|
title: State
|
580
|
-
default: []
|
581
581
|
custom_components_code:
|
582
582
|
anyOf:
|
583
583
|
- type: string
|
@@ -703,7 +703,7 @@ class ConcurrentDeclarativeSource(AbstractSource, Generic[TState]):
|
|
703
703
|
stream_slicer=declarative_stream.retriever.stream_slicer,
|
704
704
|
slice_limit=self._limits.max_slices
|
705
705
|
if self._limits
|
706
|
-
else None, # technically not needed because
|
706
|
+
else None, # technically not needed because create_default_stream() -> create_simple_retriever() will apply the decorator. But for consistency, and depending on how we build create_default_stream, this may be needed later
|
707
707
|
)
|
708
708
|
else:
|
709
709
|
if (
|
@@ -772,7 +772,7 @@ class ConcurrentDeclarativeSource(AbstractSource, Generic[TState]):
|
|
772
772
|
declarative_stream.retriever.stream_slicer,
|
773
773
|
slice_limit=self._limits.max_slices
|
774
774
|
if self._limits
|
775
|
-
else None, # technically not needed because
|
775
|
+
else None, # technically not needed because create_default_stream() -> create_simple_retriever() will apply the decorator. But for consistency, and depending on how we build create_default_stream, this may be needed later
|
776
776
|
)
|
777
777
|
|
778
778
|
final_state_cursor = FinalStateCursor(
|
@@ -11,6 +11,13 @@ from copy import deepcopy
|
|
11
11
|
from datetime import timedelta
|
12
12
|
from typing import Any, Callable, Iterable, List, Mapping, MutableMapping, Optional
|
13
13
|
|
14
|
+
from airbyte_cdk.models import (
|
15
|
+
AirbyteStateBlob,
|
16
|
+
AirbyteStateMessage,
|
17
|
+
AirbyteStateType,
|
18
|
+
AirbyteStreamState,
|
19
|
+
StreamDescriptor,
|
20
|
+
)
|
14
21
|
from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
|
15
22
|
from airbyte_cdk.sources.declarative.incremental.global_substream_cursor import (
|
16
23
|
Timer,
|
@@ -48,7 +55,7 @@ class ConcurrentPerPartitionCursor(Cursor):
|
|
48
55
|
Manages state per partition when a stream has many partitions, preventing data loss or duplication.
|
49
56
|
|
50
57
|
Attributes:
|
51
|
-
DEFAULT_MAX_PARTITIONS_NUMBER (int): Maximum number of partitions to retain in memory (default is 10,000).
|
58
|
+
DEFAULT_MAX_PARTITIONS_NUMBER (int): Maximum number of partitions to retain in memory (default is 10,000). This limit needs to be higher than the number of threads we might enqueue (which is represented by ThreadPoolManager.DEFAULT_MAX_QUEUE_SIZE). Otherwise, partitions that have been generated and submitted to the ThreadPool could be deleted from the ConcurrentPerPartitionCursor before they are closed, and closing them would then raise a KeyError.
|
52
59
|
|
53
60
|
- **Partition Limitation Logic**
|
54
61
|
Ensures the number of tracked partitions does not exceed the specified limit to prevent memory overuse. Oldest partitions are removed when the limit is reached.
|
@@ -128,6 +135,7 @@ class ConcurrentPerPartitionCursor(Cursor):
|
|
128
135
|
|
129
136
|
# FIXME this is a temporary field for the duration of the migration from declarative cursors to concurrent ones
|
130
137
|
self._attempt_to_create_cursor_if_not_provided = attempt_to_create_cursor_if_not_provided
|
138
|
+
self._synced_some_data = False
|
131
139
|
|
132
140
|
@property
|
133
141
|
def cursor_field(self) -> CursorField:
|
@@ -168,8 +176,8 @@ class ConcurrentPerPartitionCursor(Cursor):
|
|
168
176
|
with self._lock:
|
169
177
|
self._semaphore_per_partition[partition_key].acquire()
|
170
178
|
if not self._use_global_cursor:
|
171
|
-
self._cursor_per_partition[partition_key].close_partition(partition=partition)
|
172
179
|
cursor = self._cursor_per_partition[partition_key]
|
180
|
+
cursor.close_partition(partition=partition)
|
173
181
|
if (
|
174
182
|
partition_key in self._partitions_done_generating_stream_slices
|
175
183
|
and self._semaphore_per_partition[partition_key]._value == 0
|
@@ -213,8 +221,10 @@ class ConcurrentPerPartitionCursor(Cursor):
|
|
213
221
|
if not any(
|
214
222
|
semaphore_item[1]._value for semaphore_item in self._semaphore_per_partition.items()
|
215
223
|
):
|
216
|
-
|
217
|
-
|
224
|
+
if self._synced_some_data:
|
225
|
+
# we only update the global cursor and the lookback window if we actually synced some data
|
226
|
+
self._global_cursor = self._new_global_cursor
|
227
|
+
self._lookback_window = self._timer.finish()
|
218
228
|
self._parent_state = self._partition_router.get_stream_state()
|
219
229
|
self._emit_state_message(throttle=False)
|
220
230
|
|
@@ -422,9 +432,6 @@ class ConcurrentPerPartitionCursor(Cursor):
|
|
422
432
|
if stream_state.get("parent_state"):
|
423
433
|
self._parent_state = stream_state["parent_state"]
|
424
434
|
|
425
|
-
# Set parent state for partition routers based on parent streams
|
426
|
-
self._partition_router.set_initial_state(stream_state)
|
427
|
-
|
428
435
|
def _set_global_state(self, stream_state: Mapping[str, Any]) -> None:
|
429
436
|
"""
|
430
437
|
Initializes the global cursor state from the provided stream state.
|
@@ -458,6 +465,7 @@ class ConcurrentPerPartitionCursor(Cursor):
|
|
458
465
|
except ValueError:
|
459
466
|
return
|
460
467
|
|
468
|
+
self._synced_some_data = True
|
461
469
|
record_cursor = self._connector_state_converter.output_format(
|
462
470
|
self._connector_state_converter.parse_value(record_cursor_value)
|
463
471
|
)
|
@@ -541,3 +549,45 @@ class ConcurrentPerPartitionCursor(Cursor):
|
|
541
549
|
|
542
550
|
def limit_reached(self) -> bool:
|
543
551
|
return self._number_of_partitions > self.SWITCH_TO_GLOBAL_LIMIT
|
552
|
+
|
553
|
+
@staticmethod
|
554
|
+
def get_parent_state(
|
555
|
+
stream_state: Optional[StreamState], parent_stream_name: str
|
556
|
+
) -> Optional[AirbyteStateMessage]:
|
557
|
+
if not stream_state:
|
558
|
+
return None
|
559
|
+
|
560
|
+
if "parent_state" not in stream_state:
|
561
|
+
logger.warning(
|
562
|
+
f"Trying to get_parent_state for stream `{parent_stream_name}` when there are not parent state in the state"
|
563
|
+
)
|
564
|
+
return None
|
565
|
+
elif parent_stream_name not in stream_state["parent_state"]:
|
566
|
+
logger.info(
|
567
|
+
f"Could not find parent state for stream `{parent_stream_name}`. On parents available are {list(stream_state['parent_state'].keys())}"
|
568
|
+
)
|
569
|
+
return None
|
570
|
+
|
571
|
+
return AirbyteStateMessage(
|
572
|
+
type=AirbyteStateType.STREAM,
|
573
|
+
stream=AirbyteStreamState(
|
574
|
+
stream_descriptor=StreamDescriptor(parent_stream_name, None),
|
575
|
+
stream_state=AirbyteStateBlob(stream_state["parent_state"][parent_stream_name]),
|
576
|
+
),
|
577
|
+
)
|
578
|
+
|
579
|
+
@staticmethod
|
580
|
+
def get_global_state(
|
581
|
+
stream_state: Optional[StreamState], parent_stream_name: str
|
582
|
+
) -> Optional[AirbyteStateMessage]:
|
583
|
+
return (
|
584
|
+
AirbyteStateMessage(
|
585
|
+
type=AirbyteStateType.STREAM,
|
586
|
+
stream=AirbyteStreamState(
|
587
|
+
stream_descriptor=StreamDescriptor(parent_stream_name, None),
|
588
|
+
stream_state=AirbyteStateBlob(stream_state["state"]),
|
589
|
+
),
|
590
|
+
)
|
591
|
+
if stream_state and "state" in stream_state
|
592
|
+
else None
|
593
|
+
)
|
@@ -192,8 +192,10 @@ class GlobalSubstreamCursor(DeclarativeCursor):
|
|
192
192
|
# Example: {"global_state_format_key": "global_state_format_value"}
|
193
193
|
self._stream_cursor.set_initial_state(stream_state)
|
194
194
|
|
195
|
-
#
|
196
|
-
|
195
|
+
# We used to set the parent state through this method but since moving the SubstreamPartitionRouter to the
|
196
|
+
# Concurrent CDK/AbstractStream, the state is passed at the __init__ stage and this does not need to be called.
|
197
|
+
# We are still keeping this line as a comment to be explicit about the past behavior.
|
198
|
+
# self._partition_router.set_initial_state(stream_state)
|
197
199
|
|
198
200
|
def _inject_lookback_into_stream_cursor(self, lookback_window: int) -> None:
|
199
201
|
"""
|