airbyte-cdk 6.8.1rc9__py3-none-any.whl → 6.8.1rc10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/cli/source_declarative_manifest/_run.py +11 -5
- airbyte_cdk/config_observation.py +1 -1
- airbyte_cdk/connector_builder/main.py +1 -1
- airbyte_cdk/connector_builder/message_grouper.py +10 -10
- airbyte_cdk/destinations/destination.py +1 -1
- airbyte_cdk/destinations/vector_db_based/embedder.py +2 -2
- airbyte_cdk/destinations/vector_db_based/writer.py +12 -4
- airbyte_cdk/entrypoint.py +7 -6
- airbyte_cdk/logger.py +2 -2
- airbyte_cdk/sources/abstract_source.py +1 -1
- airbyte_cdk/sources/config.py +1 -1
- airbyte_cdk/sources/connector_state_manager.py +9 -4
- airbyte_cdk/sources/declarative/auth/oauth.py +1 -1
- airbyte_cdk/sources/declarative/auth/selective_authenticator.py +6 -1
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +28 -42
- airbyte_cdk/sources/declarative/datetime/min_max_datetime.py +10 -4
- airbyte_cdk/sources/declarative/declarative_component_schema.yaml +116 -19
- airbyte_cdk/sources/declarative/decoders/noop_decoder.py +4 -1
- airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +8 -6
- airbyte_cdk/sources/declarative/interpolation/jinja.py +35 -36
- airbyte_cdk/sources/declarative/interpolation/macros.py +1 -1
- airbyte_cdk/sources/declarative/manifest_declarative_source.py +53 -2
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +95 -2
- airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +6 -0
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +100 -27
- airbyte_cdk/sources/declarative/partition_routers/__init__.py +2 -1
- airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +13 -7
- airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py +1 -1
- airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +8 -6
- airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +1 -1
- airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +2 -2
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +1 -1
- airbyte_cdk/sources/declarative/resolvers/__init__.py +13 -0
- airbyte_cdk/sources/declarative/resolvers/components_resolver.py +55 -0
- airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py +106 -0
- airbyte_cdk/sources/declarative/retrievers/async_retriever.py +5 -2
- airbyte_cdk/sources/declarative/spec/spec.py +1 -1
- airbyte_cdk/sources/embedded/base_integration.py +3 -2
- airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py +12 -4
- airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +18 -7
- airbyte_cdk/sources/file_based/file_types/avro_parser.py +14 -11
- airbyte_cdk/sources/file_based/file_types/csv_parser.py +3 -3
- airbyte_cdk/sources/file_based/file_types/excel_parser.py +11 -5
- airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +1 -1
- airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +2 -2
- airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +6 -3
- airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +1 -1
- airbyte_cdk/sources/http_logger.py +3 -3
- airbyte_cdk/sources/streams/concurrent/abstract_stream.py +5 -2
- airbyte_cdk/sources/streams/concurrent/adapters.py +6 -3
- airbyte_cdk/sources/streams/concurrent/availability_strategy.py +9 -3
- airbyte_cdk/sources/streams/concurrent/cursor.py +1 -1
- airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +2 -2
- airbyte_cdk/sources/streams/core.py +17 -14
- airbyte_cdk/sources/streams/http/http.py +19 -19
- airbyte_cdk/sources/streams/http/http_client.py +4 -48
- airbyte_cdk/sources/streams/http/requests_native_auth/abstract_token.py +2 -1
- airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +62 -33
- airbyte_cdk/sources/utils/record_helper.py +1 -1
- airbyte_cdk/sources/utils/schema_helpers.py +1 -1
- airbyte_cdk/sources/utils/transform.py +34 -15
- airbyte_cdk/test/entrypoint_wrapper.py +11 -6
- airbyte_cdk/test/mock_http/response_builder.py +1 -1
- airbyte_cdk/utils/airbyte_secrets_utils.py +1 -1
- airbyte_cdk/utils/event_timing.py +10 -10
- airbyte_cdk/utils/message_utils.py +4 -3
- airbyte_cdk/utils/spec_schema_transformations.py +3 -2
- airbyte_cdk/utils/traced_exception.py +14 -12
- airbyte_cdk-6.8.1rc10.dist-info/METADATA +111 -0
- {airbyte_cdk-6.8.1rc9.dist-info → airbyte_cdk-6.8.1rc10.dist-info}/RECORD +73 -70
- airbyte_cdk-6.8.1rc9.dist-info/METADATA +0 -307
- {airbyte_cdk-6.8.1rc9.dist-info → airbyte_cdk-6.8.1rc10.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-6.8.1rc9.dist-info → airbyte_cdk-6.8.1rc10.dist-info}/WHEEL +0 -0
- {airbyte_cdk-6.8.1rc9.dist-info → airbyte_cdk-6.8.1rc10.dist-info}/entry_points.txt +0 -0
airbyte_cdk/cli/source_declarative_manifest/_run.py CHANGED
@@ -25,7 +25,7 @@ from datetime import datetime
 from pathlib import Path
 from typing import Any, cast
 
-from orjson import orjson
+import orjson
 
 from airbyte_cdk.entrypoint import AirbyteEntrypoint, launch
 from airbyte_cdk.models import (
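The removed import line is not preserved by the diff viewer; prior CDK releases used the `from orjson import orjson` pattern (shown above as an inference), and this release imports the package directly. A minimal sketch of why call sites are unaffected:

```python
# Both import styles expose the same top-level functions, because the
# orjson package re-exports its C extension; call sites such as
# `orjson.dumps(...)` are unchanged. (The removed import line is inferred.)
import orjson

payload = {"stream": "users", "emitted_at": 1700000000}
serialized: bytes = orjson.dumps(payload)  # orjson returns bytes, not str
print(serialized.decode())                 # '{"stream":"users","emitted_at":1700000000}'
assert orjson.loads(serialized) == payload
```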
@@ -72,7 +72,7 @@ class SourceLocalYaml(YamlDeclarativeSource):
         super().__init__(
             catalog=catalog,
             config=config,
-            state=state,
+            state=state,  # type: ignore [arg-type]
             path_to_yaml="manifest.yaml",
         )
 
@@ -152,7 +152,9 @@ def handle_remote_manifest_command(args: list[str]) -> None:
     )
 
 
-def create_declarative_source(args: list[str]) -> ConcurrentDeclarativeSource:
+def create_declarative_source(
+    args: list[str],
+) -> ConcurrentDeclarativeSource:  # type: ignore [type-arg]
     """Creates the source with the injected config.
 
     This essentially does what other low-code sources do at build time, but at runtime,
@@ -160,10 +162,14 @@ def create_declarative_source(args: list[str]) -> ConcurrentDeclarativeSource:
     connector builder.
     """
     try:
+        config: Mapping[str, Any] | None
+        catalog: ConfiguredAirbyteCatalog | None
+        state: list[AirbyteStateMessage]
         config, catalog, state = _parse_inputs_into_config_catalog_state(args)
-        if "__injected_declarative_manifest" not in config:
+        if config is None or "__injected_declarative_manifest" not in config:
             raise ValueError(
-
+                "Invalid config: `__injected_declarative_manifest` should be provided at the root "
+                f"of the config but config only has keys: {list(config.keys() if config else [])}"
             )
         return ConcurrentDeclarativeSource(
             config=config,
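The guard above now rejects a missing config as well as a missing manifest key. A hedged sketch of the config shape it expects (the manifest body is illustrative, not a complete manifest):

```python
# Hedged sketch of the config shape `create_declarative_source` validates:
# the declarative manifest must sit under `__injected_declarative_manifest`
# at the root of the config, next to ordinary connector settings.
config = {
    "__injected_declarative_manifest": {
        "version": "6.8.1",
        "type": "DeclarativeSource",
        "streams": [],  # stream definitions would go here
    },
    "api_key": "...",  # regular connector settings live alongside it
}
# A config missing the key (or a missing config entirely) now raises the
# ValueError shown in the diff instead of failing later with a KeyError.
```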
airbyte_cdk/connector_builder/message_grouper.py CHANGED
@@ -71,7 +71,7 @@ class MessageGrouper:
 
         is_nested_key = isinstance(field[0], str)
         if is_nested_key:
-            return [field]
+            return [field]
 
         raise ValueError(f"Unknown type for cursor field `{field}")
 
@@ -232,9 +232,9 @@ class MessageGrouper:
             current_slice_descriptor = self._parse_slice_description(message.log.message)  # type: ignore[union-attr] # AirbyteMessage with MessageType.LOG has log.message
             current_slice_pages = []
             at_least_one_page_in_group = False
-        elif message.type == MessageType.LOG and message.log.message.startswith(
+        elif message.type == MessageType.LOG and message.log.message.startswith(  # type: ignore[union-attr] # None doesn't have 'message'
             SliceLogger.SLICE_LOG_PREFIX
-        ):
+        ):
             # parsing the first slice
             current_slice_descriptor = self._parse_slice_description(message.log.message)  # type: ignore[union-attr] # AirbyteMessage with MessageType.LOG has log.message
         elif message.type == MessageType.LOG:
@@ -274,14 +274,14 @@ class MessageGrouper:
                 if message.trace.type == TraceType.ERROR:  # type: ignore[union-attr] # AirbyteMessage with MessageType.TRACE has trace.type
                     yield message.trace
             elif message.type == MessageType.RECORD:
-                current_page_records.append(message.record.data)  # type: ignore[union-attr] # AirbyteMessage with MessageType.RECORD has record.data
+                current_page_records.append(message.record.data)  # type: ignore[arg-type, union-attr] # AirbyteMessage with MessageType.RECORD has record.data
                 records_count += 1
                 schema_inferrer.accumulate(message.record)
                 datetime_format_inferrer.accumulate(message.record)
             elif (
                 message.type == MessageType.CONTROL
-                and message.control.type == OrchestratorType.CONNECTOR_CONFIG
-            ):
+                and message.control.type == OrchestratorType.CONNECTOR_CONFIG  # type: ignore[union-attr] # None doesn't have 'type'
+            ):
                 yield message.control
             elif message.type == MessageType.STATE:
                 latest_state_message = message.state  # type: ignore[assignment]
@@ -310,8 +310,8 @@ class MessageGrouper:
             and message.type == MessageType.LOG
             and (
                 MessageGrouper._is_page_http_request(json_message)
-                or message.log.message.startswith("slice:")
-            )
+                or message.log.message.startswith("slice:")  # type: ignore[union-attr] # AirbyteMessage with MessageType.LOG has log.message
+            )
         )
 
     @staticmethod
@@ -355,8 +355,8 @@ class MessageGrouper:
                     StreamReadPages(
                         request=current_page_request,
                         response=current_page_response,
-                        records=deepcopy(current_page_records),
-                    )
+                        records=deepcopy(current_page_records),  # type: ignore [arg-type]
+                    )
                 )
                 current_page_records.clear()
 
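The grouper recognizes slice boundaries by log messages that begin with `SliceLogger.SLICE_LOG_PREFIX` (rendered as the literal `slice:` check above). A rough sketch of that check, assuming the descriptor is serialized as JSON after the prefix, as `_parse_slice_description` implies:

```python
import json

# Assumed value of SliceLogger.SLICE_LOG_PREFIX, matching the "slice:" literal above.
SLICE_LOG_PREFIX = "slice:"

def parse_slice_description(log_message: str) -> dict:
    # Assumption: the slice descriptor is JSON appended after the prefix.
    return json.loads(log_message.removeprefix(SLICE_LOG_PREFIX))

message = 'slice:{"start": "2024-01-01", "end": "2024-01-31"}'
if message.startswith(SLICE_LOG_PREFIX):
    print(parse_slice_description(message))  # {'start': '2024-01-01', 'end': '2024-01-31'}
```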
airbyte_cdk/destinations/destination.py CHANGED
@@ -9,7 +9,7 @@ import sys
 from abc import ABC, abstractmethod
 from typing import Any, Iterable, List, Mapping
 
-from orjson import orjson
+import orjson
 
 from airbyte_cdk.connector import Connector
 from airbyte_cdk.exception_handler import init_uncaught_exception_handler
airbyte_cdk/destinations/vector_db_based/embedder.py CHANGED
@@ -107,7 +107,7 @@ class BaseOpenAIEmbedder(Embedder):
 class OpenAIEmbedder(BaseOpenAIEmbedder):
     def __init__(self, config: OpenAIEmbeddingConfigModel, chunk_size: int):
         super().__init__(
-            OpenAIEmbeddings(
+            OpenAIEmbeddings(  # type: ignore [call-arg]
                 openai_api_key=config.openai_key, max_retries=15, disallowed_special=()
             ),
             chunk_size,
@@ -118,7 +118,7 @@ class AzureOpenAIEmbedder(BaseOpenAIEmbedder):
     def __init__(self, config: AzureOpenAIEmbeddingConfigModel, chunk_size: int):
         # Azure OpenAI API has — as of 20230927 — a limit of 16 documents per request
         super().__init__(
-            OpenAIEmbeddings(
+            OpenAIEmbeddings(  # type: ignore [call-arg]
                 openai_api_key=config.openai_key,
                 chunk_size=16,
                 max_retries=15,
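The comment above explains the Azure-specific `chunk_size=16`: requests are capped at 16 documents each. An illustrative helper showing the batching arithmetic (the real batching happens inside `OpenAIEmbeddings` via its `chunk_size` parameter):

```python
from typing import Iterator

def batches(documents: list[str], batch_size: int = 16) -> Iterator[list[str]]:
    # Yield successive groups of at most `batch_size` documents.
    for start in range(0, len(documents), batch_size):
        yield documents[start : start + batch_size]

docs = [f"doc-{i}" for i in range(40)]
print([len(batch) for batch in batches(docs)])  # [16, 16, 8]
```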
airbyte_cdk/destinations/vector_db_based/writer.py CHANGED
@@ -83,11 +83,19 @@ class Writer:
                 yield message
             elif message.type == Type.RECORD:
                 record_chunks, record_id_to_delete = self.processor.process(message.record)
-                self.chunks[
-
-
-
+                self.chunks[
+                    (  # type: ignore [index] # expected "tuple[str, str]", got "tuple[str | Any | None, str | Any]"
+                        message.record.namespace,  # type: ignore [union-attr] # record not None
+                        message.record.stream,  # type: ignore [union-attr] # record not None
                     )
+                ].extend(record_chunks)
+                if record_id_to_delete is not None:
+                    self.ids_to_delete[
+                        (  # type: ignore [index] # expected "tuple[str, str]", got "tuple[str | Any | None, str | Any]"
+                            message.record.namespace,  # type: ignore [union-attr] # record not None
+                            message.record.stream,  # type: ignore [union-attr] # record not None
+                        )
+                    ].append(record_id_to_delete)
                 self.number_of_chunks += len(record_chunks)
                 if self.number_of_chunks >= self.batch_size:
                     self._process_batch()
airbyte_cdk/entrypoint.py CHANGED
@@ -22,7 +22,7 @@ from requests import PreparedRequest, Response, Session
 from airbyte_cdk.connector import TConfig
 from airbyte_cdk.exception_handler import init_uncaught_exception_handler
 from airbyte_cdk.logger import init_logger
-from airbyte_cdk.models import (
+from airbyte_cdk.models import (
     AirbyteConnectionStatus,
     AirbyteMessage,
     AirbyteMessageSerializer,
@@ -255,9 +255,10 @@ class AirbyteEntrypoint(object):
 
                     stream_message_count[
                         HashableStreamDescriptor(
-                            name=message.record.stream,
+                            name=message.record.stream,  # type: ignore[union-attr] # record has `stream`
+                            namespace=message.record.namespace,  # type: ignore[union-attr] # record has `namespace`
                         )
-                    ] += 1.0
+                    ] += 1.0
                 case Type.STATE:
                     if message.state is None:
                         raise ValueError("State message must have a state attribute")
@@ -266,9 +267,9 @@ class AirbyteEntrypoint(object):
 
                     # Set record count from the counter onto the state message
                     message.state.sourceStats = message.state.sourceStats or AirbyteStateStats()  # type: ignore[union-attr] # state has `sourceStats`
-                    message.state.sourceStats.recordCount = stream_message_count.get(
+                    message.state.sourceStats.recordCount = stream_message_count.get(  # type: ignore[union-attr] # state has `sourceStats`
                         stream_descriptor, 0.0
-                    )
+                    )
 
                     # Reset the counter
                     stream_message_count[stream_descriptor] = 0.0
@@ -290,7 +291,7 @@ class AirbyteEntrypoint(object):
 
     @staticmethod
     def airbyte_message_to_string(airbyte_message: AirbyteMessage) -> str:
-        return orjson.dumps(AirbyteMessageSerializer.dump(airbyte_message)).decode()
+        return orjson.dumps(AirbyteMessageSerializer.dump(airbyte_message)).decode()
 
     @classmethod
     def extract_state(cls, args: List[str]) -> Optional[Any]:
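The entrypoint changes tighten the record-count bookkeeping: records are tallied per stream descriptor (now including namespace) and the tally is copied onto each STATE message's `sourceStats.recordCount`, then reset. A condensed sketch using a plain tuple in place of `HashableStreamDescriptor`:

```python
from collections import defaultdict

# (name, namespace) stands in for HashableStreamDescriptor in this sketch.
stream_message_count: dict[tuple[str, str | None], float] = defaultdict(float)

def on_record(name: str, namespace: str | None) -> None:
    stream_message_count[(name, namespace)] += 1.0

def on_state(name: str, namespace: str | None) -> float:
    count = stream_message_count.get((name, namespace), 0.0)
    stream_message_count[(name, namespace)] = 0.0  # reset after attaching
    return count  # would be assigned to state.sourceStats.recordCount

on_record("users", None)
on_record("users", None)
print(on_state("users", None))  # 2.0
print(on_state("users", None))  # 0.0 (counter was reset)
```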
airbyte_cdk/logger.py CHANGED
@@ -7,7 +7,7 @@ import logging
 import logging.config
 from typing import Any, Callable, Mapping, Optional, Tuple
 
-from orjson import orjson
+import orjson
 
 from airbyte_cdk.models import (
     AirbyteLogMessage,
@@ -78,7 +78,7 @@ class AirbyteLogFormatter(logging.Formatter):
         log_message = AirbyteMessage(
             type=Type.LOG, log=AirbyteLogMessage(level=airbyte_level, message=message)
         )
-        return orjson.dumps(AirbyteMessageSerializer.dump(log_message)).decode()
+        return orjson.dumps(AirbyteMessageSerializer.dump(log_message)).decode()
 
     @staticmethod
     def extract_extra_args_from_record(record: logging.LogRecord) -> Mapping[str, Any]:
airbyte_cdk/sources/abstract_source.py CHANGED
@@ -200,7 +200,7 @@ class AbstractSource(Source, ABC):
         if len(stream_name_to_exception) > 0:
             error_message = generate_failed_streams_error_message(
                 {key: [value] for key, value in stream_name_to_exception.items()}
-            )
+            )
             logger.info(error_message)
             # We still raise at least one exception when a stream raises an exception because the platform currently relies
             # on a non-zero exit code to determine if a sync attempt has failed. We also raise the exception as a config_error
airbyte_cdk/sources/config.py CHANGED
@@ -24,4 +24,4 @@ class BaseConfig(BaseModel):
         rename_key(schema, old_key="anyOf", new_key="oneOf")  # UI supports only oneOf
         expand_refs(schema)
         schema.pop("description", None)  # description added from the docstring
-        return schema
+        return schema
airbyte_cdk/sources/connector_state_manager.py CHANGED
@@ -4,7 +4,7 @@
 
 import copy
 from dataclasses import dataclass
-from typing import Any, List, Mapping, MutableMapping, Optional, Tuple, Union
+from typing import Any, List, Mapping, MutableMapping, Optional, Tuple, Union, cast
 
 from airbyte_cdk.models import (
     AirbyteMessage,
@@ -15,6 +15,7 @@ from airbyte_cdk.models import (
     StreamDescriptor,
 )
 from airbyte_cdk.models import Type as MessageType
+from airbyte_cdk.models.airbyte_protocol import AirbyteGlobalState, AirbyteStateBlob
 
 
 @dataclass(frozen=True)
@@ -118,8 +119,12 @@ class ConnectorStateManager:
         is_global = cls._is_global_state(state)
 
         if is_global:
-
-
+            # We already validate that this is a global state message, not None:
+            global_state = cast(AirbyteGlobalState, state[0].global_)
+            # global_state has shared_state, also not None:
+            shared_state: AirbyteStateBlob = cast(
+                AirbyteStateBlob, copy.deepcopy(global_state.shared_state, {})
+            )
             streams = {
                 HashableStreamDescriptor(
                     name=per_stream_state.stream_descriptor.name,
@@ -131,7 +136,7 @@ class ConnectorStateManager:
         else:
             streams = {
                 HashableStreamDescriptor(
-                    name=per_stream_state.stream.stream_descriptor.name,
+                    name=per_stream_state.stream.stream_descriptor.name,  # type: ignore[union-attr] # stream has stream_descriptor
                     namespace=per_stream_state.stream.stream_descriptor.namespace,  # type: ignore[union-attr] # stream has stream_descriptor
                 ): per_stream_state.stream.stream_state  # type: ignore[union-attr] # stream has stream_state
                 for per_stream_state in state
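The new cast-based handling distinguishes the two state message shapes the manager accepts. A hedged sketch of both, with field names following the Airbyte protocol and values illustrative:

```python
# GLOBAL state: one shared_state (deep-copied once, per the diff) plus a list
# of per-stream entries. The Python model names the "global" field `global_`
# because `global` is a reserved word.
global_state_message = {
    "type": "GLOBAL",
    "global": {
        "shared_state": {"cursor": "2024-01-01"},
        "stream_states": [
            {"stream_descriptor": {"name": "users"}, "stream_state": {}},
        ],
    },
}

# STREAM state: one descriptor/state pair per message.
per_stream_state_message = {
    "type": "STREAM",
    "stream": {
        "stream_descriptor": {"name": "users", "namespace": None},
        "stream_state": {"cursor": "2024-01-01"},
    },
}
```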
airbyte_cdk/sources/declarative/auth/oauth.py CHANGED
@@ -135,7 +135,7 @@ class DeclarativeOauth2Authenticator(AbstractOauth2Authenticator, DeclarativeAuthenticator):
         return self.grant_type.eval(self.config)  # type: ignore # eval returns a string in this context
 
     def get_refresh_request_body(self) -> Mapping[str, Any]:
-        return self._refresh_request_body.eval(self.config)
+        return self._refresh_request_body.eval(self.config)
 
     def get_token_expiry_date(self) -> pendulum.DateTime:
         return self._token_expiry_date  # type: ignore # _token_expiry_date is a pendulum.DateTime. It is never None despite what mypy thinks
airbyte_cdk/sources/declarative/auth/selective_authenticator.py CHANGED
@@ -28,7 +28,12 @@ class SelectiveAuthenticator(DeclarativeAuthenticator):
         **kwargs: Any,
     ) -> DeclarativeAuthenticator:
         try:
-            selected_key = str(dpath.get(config, authenticator_selection_path))
+            selected_key = str(
+                dpath.get(
+                    config,  # type: ignore [arg-type] # Dpath wants mutable mapping but doesn't need it.
+                    authenticator_selection_path,
+                )
+            )
         except KeyError as err:
             raise ValueError(
                 "The path from `authenticator_selection_path` is not found in the config."
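The reflowed call is the whole change here; `dpath.get` resolves a path inside the user config to pick which authenticator to use. An illustrative example (path and config invented):

```python
import dpath

config = {"credentials": {"auth_type": "oauth2"}}
authenticator_selection_path = ["credentials", "auth_type"]

# dpath accepts a list of path segments, as the CDK passes it above.
selected_key = str(dpath.get(config, authenticator_selection_path))
print(selected_key)  # "oauth2" -> selects the matching configured authenticator
```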
airbyte_cdk/sources/declarative/concurrent_declarative_source.py CHANGED
@@ -56,9 +56,8 @@ from airbyte_cdk.sources.types import Config, StreamState
 
 
 class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
-    # By default, we defer to a value of
-
-    SINGLE_THREADED_CONCURRENCY_LEVEL = 2
+    # By default, we defer to a value of 1 which represents running a connector using the Concurrent CDK engine on only one thread.
+    SINGLE_THREADED_CONCURRENCY_LEVEL = 1
 
     def __init__(
         self,
@@ -79,9 +78,6 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
             emit_connector_builder_messages=emit_connector_builder_messages,
             disable_resumable_full_refresh=True,
         )
-        self._config = config
-        self._concurrent_streams: Optional[List[AbstractStream]] = None
-        self._synchronous_streams: Optional[List[Stream]] = None
 
         super().__init__(
             source_config=source_config,
@@ -90,6 +86,8 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
             component_factory=component_factory,
         )
 
+        # todo: We could remove state from initialization. Now that streams are grouped during the read(), a source
+        # no longer needs to store the original incoming state. But maybe there's an edge case?
         self._state = state
 
         concurrency_level_from_manifest = self._source_config.get("concurrency_level")
@@ -110,48 +108,35 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
             )  # Partition_generation iterates using range based on this value. If this is floored to zero we end up in a dead lock during start up
         else:
             concurrency_level = self.SINGLE_THREADED_CONCURRENCY_LEVEL
-            initial_number_of_partitions_to_generate = self.SINGLE_THREADED_CONCURRENCY_LEVEL
+            initial_number_of_partitions_to_generate = self.SINGLE_THREADED_CONCURRENCY_LEVEL
 
         self._concurrent_source = ConcurrentSource.create(
             num_workers=concurrency_level,
             initial_number_of_partitions_to_generate=initial_number_of_partitions_to_generate,
             logger=self.logger,
             slice_logger=self._slice_logger,
-            message_repository=self.message_repository,
+            message_repository=self.message_repository,
         )
 
-    def _actually_group(self) -> None:
-        # If the connector command was SPEC, there is no incoming config, and we cannot instantiate streams because
-        # they might depend on it. Ideally we want to have a static method on this class to get the spec without
-        # any other arguments, but the existing entrypoint.py isn't designed to support this. Just noting this
-        # for our future improvements to the CDK.
-        if self._config:
-            self._concurrent_streams, self._synchronous_streams = self._group_streams(
-                config=self._config or {}
-            )
-        else:
-            self._concurrent_streams = None
-            self._synchronous_streams = None
-
     def read(
         self,
         logger: logging.Logger,
         config: Mapping[str, Any],
         catalog: ConfiguredAirbyteCatalog,
-        state: Optional[
+        state: Optional[List[AirbyteStateMessage]] = None,
     ) -> Iterator[AirbyteMessage]:
-
-        # streams must be saved so that they can be removed from the catalog before starting synchronous streams
-        if self._concurrent_streams is None:
-            self._actually_group()
+        concurrent_streams, _ = self._group_streams(config=config)
 
-
+        # ConcurrentReadProcessor pops streams that are finished being read so before syncing, the names of
+        # the concurrent streams must be saved so that they can be removed from the catalog before starting
+        # synchronous streams
+        if len(concurrent_streams) > 0:
             concurrent_stream_names = set(
-                [concurrent_stream.name for concurrent_stream in self._concurrent_streams]
+                [concurrent_stream.name for concurrent_stream in concurrent_streams]
             )
 
             selected_concurrent_streams = self._select_streams(
-                streams=self._concurrent_streams, configured_catalog=catalog
+                streams=concurrent_streams, configured_catalog=catalog
             )
             # It would appear that passing in an empty set of streams causes an infinite loop in ConcurrentReadProcessor.
             # This is also evident in concurrent_source_adapter.py so I'll leave this out of scope to fix for now
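The refactor above replaces the cached `_actually_group()` machinery with grouping from the live config on every `read()` and `discover()` call, so commands without a config (like SPEC, per the removed comment) never trigger grouping. A condensed sketch of the resulting control flow, with `group_streams` standing in for `_group_streams`:

```python
from typing import Any, Mapping

def group_streams(config: Mapping[str, Any]) -> tuple[list[str], list[str]]:
    # Stand-in for ConcurrentDeclarativeSource._group_streams: returns
    # (concurrent_streams, synchronous_streams), here just by name.
    return (["users", "orders"], ["audit_log"])

def read(config: Mapping[str, Any]) -> None:
    # Streams are re-derived from the config passed to this call, never cached.
    concurrent_streams, _ = group_streams(config)
    # ConcurrentReadProcessor pops streams as they finish, so the names are
    # captured first; they are later removed from the catalog before the
    # synchronous streams run.
    concurrent_stream_names = set(concurrent_streams)
    print(concurrent_stream_names)

read({"start_date": "2024-01-01"})  # {'users', 'orders'}
```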
@@ -170,11 +155,7 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
             yield from super().read(logger, config, filtered_catalog, state)
 
     def discover(self, logger: logging.Logger, config: Mapping[str, Any]) -> AirbyteCatalog:
-
-        self._actually_group()
-
-        concurrent_streams = self._concurrent_streams or []
-        synchronous_streams = self._synchronous_streams or []
+        concurrent_streams, synchronous_streams = self._group_streams(config=config)
         return AirbyteCatalog(
             streams=[
                 stream.as_airbyte_stream() for stream in concurrent_streams + synchronous_streams
@@ -200,9 +181,13 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
 
         state_manager = ConnectorStateManager(state=self._state)  # type: ignore # state is always in the form of List[AirbyteStateMessage]. The ConnectorStateManager should use generics, but this can be done later
 
-
-
-
+        # Combine streams and dynamic_streams. Note: both cannot be empty at the same time,
+        # and this is validated during the initialization of the source.
+        streams = self._stream_configs(self._source_config) + self._dynamic_stream_configs(
+            self._source_config, config
+        )
+
+        name_to_stream_mapping = {stream["name"]: stream for stream in streams}
 
         for declarative_stream in self.streams(config=config):
             # Some low-code sources use a combination of DeclarativeStream and regular Python streams. We can't inspect
@@ -210,7 +195,7 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
             # so we need to treat them as synchronous
             if (
                 isinstance(declarative_stream, DeclarativeStream)
-                and name_to_stream_mapping[declarative_stream.name]
+                and name_to_stream_mapping[declarative_stream.name]["retriever"]["type"]
                 == "SimpleRetriever"
             ):
                 incremental_sync_component_definition = name_to_stream_mapping[
@@ -219,7 +204,7 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
 
                 partition_router_component_definition = (
                     name_to_stream_mapping[declarative_stream.name]
-                    .get("retriever")
+                    .get("retriever", {})
                     .get("partition_router")
                 )
                 is_without_partition_router_or_cursor = not bool(
@@ -241,7 +226,7 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
                     cursor = self._constructor.create_concurrent_cursor_from_datetime_based_cursor(
                         state_manager=state_manager,
                         model_type=DatetimeBasedCursorModel,
-                        component_definition=incremental_sync_component_definition,
+                        component_definition=incremental_sync_component_definition,  # type: ignore # Not None because of the if condition above
                         stream_name=declarative_stream.name,
                         stream_namespace=declarative_stream.namespace,
                         config=config or {},
@@ -324,10 +309,11 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
     def _is_datetime_incremental_without_partition_routing(
         self,
         declarative_stream: DeclarativeStream,
-        incremental_sync_component_definition: Mapping[str, Any],
+        incremental_sync_component_definition: Mapping[str, Any] | None,
     ) -> bool:
         return (
-            bool(incremental_sync_component_definition)
+            incremental_sync_component_definition is not None
+            and bool(incremental_sync_component_definition)
             and incremental_sync_component_definition.get("type", "")
             == DatetimeBasedCursorModel.__name__
             and self._stream_supports_concurrent_partition_processing(
airbyte_cdk/sources/declarative/datetime/min_max_datetime.py CHANGED
@@ -41,12 +41,12 @@ class MinMaxDatetime:
         self.datetime = InterpolatedString.create(self.datetime, parameters=parameters or {})
         self._parser = DatetimeParser()
         self.min_datetime = (
-            InterpolatedString.create(self.min_datetime, parameters=parameters)
+            InterpolatedString.create(self.min_datetime, parameters=parameters)  # type: ignore [assignment] # expression has type "InterpolatedString | None", variable has type "InterpolatedString | str"
             if self.min_datetime
             else None
         )  # type: ignore
         self.max_datetime = (
-            InterpolatedString.create(self.max_datetime, parameters=parameters)
+            InterpolatedString.create(self.max_datetime, parameters=parameters)  # type: ignore [assignment] # expression has type "InterpolatedString | None", variable has type "InterpolatedString | str"
             if self.max_datetime
             else None
         )  # type: ignore
@@ -66,7 +66,13 @@ class MinMaxDatetime:
             datetime_format = "%Y-%m-%dT%H:%M:%S.%f%z"
 
         time = self._parser.parse(
-            str(self.datetime.eval(config, **additional_parameters)), datetime_format
+            str(
+                self.datetime.eval(  # type: ignore[union-attr] # str has no attribute "eval"
+                    config,
+                    **additional_parameters,
+                )
+            ),
+            datetime_format,
         )  # type: ignore # datetime is always cast to an interpolated string
 
         if self.min_datetime:
@@ -105,7 +111,7 @@ class MinMaxDatetime:
         if isinstance(interpolated_string_or_min_max_datetime, InterpolatedString) or isinstance(
             interpolated_string_or_min_max_datetime, str
         ):
-            return MinMaxDatetime(
+            return MinMaxDatetime(  # type: ignore [call-arg]
                 datetime=interpolated_string_or_min_max_datetime, parameters=parameters
            )
         else: