airbyte-cdk 6.8.1rc8__py3-none-any.whl → 6.8.1rc10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. airbyte_cdk/cli/source_declarative_manifest/_run.py +11 -5
  2. airbyte_cdk/config_observation.py +1 -1
  3. airbyte_cdk/connector_builder/main.py +1 -1
  4. airbyte_cdk/connector_builder/message_grouper.py +10 -10
  5. airbyte_cdk/destinations/destination.py +1 -1
  6. airbyte_cdk/destinations/vector_db_based/embedder.py +2 -2
  7. airbyte_cdk/destinations/vector_db_based/writer.py +12 -4
  8. airbyte_cdk/entrypoint.py +7 -6
  9. airbyte_cdk/logger.py +2 -2
  10. airbyte_cdk/sources/abstract_source.py +1 -1
  11. airbyte_cdk/sources/config.py +1 -1
  12. airbyte_cdk/sources/connector_state_manager.py +9 -4
  13. airbyte_cdk/sources/declarative/auth/oauth.py +1 -1
  14. airbyte_cdk/sources/declarative/auth/selective_authenticator.py +6 -1
  15. airbyte_cdk/sources/declarative/concurrent_declarative_source.py +28 -42
  16. airbyte_cdk/sources/declarative/datetime/min_max_datetime.py +10 -4
  17. airbyte_cdk/sources/declarative/declarative_component_schema.yaml +116 -19
  18. airbyte_cdk/sources/declarative/decoders/noop_decoder.py +4 -1
  19. airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +8 -6
  20. airbyte_cdk/sources/declarative/interpolation/jinja.py +35 -36
  21. airbyte_cdk/sources/declarative/interpolation/macros.py +1 -1
  22. airbyte_cdk/sources/declarative/manifest_declarative_source.py +53 -2
  23. airbyte_cdk/sources/declarative/models/declarative_component_schema.py +95 -2
  24. airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +6 -0
  25. airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +100 -27
  26. airbyte_cdk/sources/declarative/partition_routers/__init__.py +2 -1
  27. airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +13 -7
  28. airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py +1 -1
  29. airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +8 -6
  30. airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +1 -1
  31. airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +2 -2
  32. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +1 -1
  33. airbyte_cdk/sources/declarative/resolvers/__init__.py +13 -0
  34. airbyte_cdk/sources/declarative/resolvers/components_resolver.py +55 -0
  35. airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py +106 -0
  36. airbyte_cdk/sources/declarative/retrievers/async_retriever.py +5 -2
  37. airbyte_cdk/sources/declarative/spec/spec.py +1 -1
  38. airbyte_cdk/sources/embedded/base_integration.py +3 -2
  39. airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py +12 -4
  40. airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +18 -7
  41. airbyte_cdk/sources/file_based/file_types/avro_parser.py +14 -11
  42. airbyte_cdk/sources/file_based/file_types/csv_parser.py +3 -3
  43. airbyte_cdk/sources/file_based/file_types/excel_parser.py +11 -5
  44. airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +1 -1
  45. airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +2 -2
  46. airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +6 -3
  47. airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +1 -1
  48. airbyte_cdk/sources/http_logger.py +3 -3
  49. airbyte_cdk/sources/streams/concurrent/abstract_stream.py +5 -2
  50. airbyte_cdk/sources/streams/concurrent/adapters.py +6 -3
  51. airbyte_cdk/sources/streams/concurrent/availability_strategy.py +9 -3
  52. airbyte_cdk/sources/streams/concurrent/cursor.py +1 -1
  53. airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +2 -2
  54. airbyte_cdk/sources/streams/core.py +17 -14
  55. airbyte_cdk/sources/streams/http/http.py +19 -19
  56. airbyte_cdk/sources/streams/http/http_client.py +4 -48
  57. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_token.py +2 -1
  58. airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +62 -33
  59. airbyte_cdk/sources/utils/record_helper.py +1 -1
  60. airbyte_cdk/sources/utils/schema_helpers.py +1 -1
  61. airbyte_cdk/sources/utils/transform.py +34 -15
  62. airbyte_cdk/test/entrypoint_wrapper.py +11 -6
  63. airbyte_cdk/test/mock_http/response_builder.py +1 -1
  64. airbyte_cdk/utils/airbyte_secrets_utils.py +1 -1
  65. airbyte_cdk/utils/event_timing.py +10 -10
  66. airbyte_cdk/utils/message_utils.py +4 -3
  67. airbyte_cdk/utils/spec_schema_transformations.py +3 -2
  68. airbyte_cdk/utils/traced_exception.py +14 -12
  69. airbyte_cdk-6.8.1rc10.dist-info/METADATA +111 -0
  70. {airbyte_cdk-6.8.1rc8.dist-info → airbyte_cdk-6.8.1rc10.dist-info}/RECORD +73 -70
  71. airbyte_cdk-6.8.1rc8.dist-info/METADATA +0 -307
  72. {airbyte_cdk-6.8.1rc8.dist-info → airbyte_cdk-6.8.1rc10.dist-info}/LICENSE.txt +0 -0
  73. {airbyte_cdk-6.8.1rc8.dist-info → airbyte_cdk-6.8.1rc10.dist-info}/WHEEL +0 -0
  74. {airbyte_cdk-6.8.1rc8.dist-info → airbyte_cdk-6.8.1rc10.dist-info}/entry_points.txt +0 -0
airbyte_cdk/cli/source_declarative_manifest/_run.py CHANGED
@@ -25,7 +25,7 @@ from datetime import datetime
 from pathlib import Path
 from typing import Any, cast

-from orjson import orjson
+import orjson

 from airbyte_cdk.entrypoint import AirbyteEntrypoint, launch
 from airbyte_cdk.models import (
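Note: rc10 switches every module from `from orjson import orjson` to the canonical `import orjson`; the old form effectively imported orjson's internal compiled submodule rather than the documented top-level API. A minimal sketch of the canonical usage, independent of the CDK:

# orjson exposes dumps()/loads() at the top level; dumps() returns
# bytes, which is why CDK call sites pair it with .decode() for a str.
import orjson

message = {"type": "LOG", "log": {"level": "INFO", "message": "hello"}}
serialized: bytes = orjson.dumps(message)
print(serialized.decode())            # JSON text as str
assert orjson.loads(serialized) == message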
@@ -72,7 +72,7 @@ class SourceLocalYaml(YamlDeclarativeSource):
         super().__init__(
             catalog=catalog,
             config=config,
-            state=state,
+            state=state,  # type: ignore [arg-type]
             path_to_yaml="manifest.yaml",
         )

@@ -152,7 +152,9 @@ def handle_remote_manifest_command(args: list[str]) -> None:
     )


-def create_declarative_source(args: list[str]) -> ConcurrentDeclarativeSource:
+def create_declarative_source(
+    args: list[str],
+) -> ConcurrentDeclarativeSource:  # type: ignore [type-arg]
     """Creates the source with the injected config.

     This essentially does what other low-code sources do at build time, but at runtime,
@@ -160,10 +162,14 @@ def create_declarative_source(args: list[str]) -> ConcurrentDeclarativeSource:
     connector builder.
     """
     try:
+        config: Mapping[str, Any] | None
+        catalog: ConfiguredAirbyteCatalog | None
+        state: list[AirbyteStateMessage]
         config, catalog, state = _parse_inputs_into_config_catalog_state(args)
-        if "__injected_declarative_manifest" not in config:
+        if config is None or "__injected_declarative_manifest" not in config:
             raise ValueError(
-                f"Invalid config: `__injected_declarative_manifest` should be provided at the root of the config but config only has keys {list(config.keys())}"
+                "Invalid config: `__injected_declarative_manifest` should be provided at the root "
+                f"of the config but config only has keys: {list(config.keys() if config else [])}"
             )
         return ConcurrentDeclarativeSource(
             config=config,
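Note: for context, a hypothetical config that would pass the validation above; only the `__injected_declarative_manifest` key is taken from the diff, the remaining fields are illustrative:

# Hypothetical config shape accepted by create_declarative_source():
# the declarative manifest must sit at the root of the config.
config = {
    "__injected_declarative_manifest": {
        "version": "0.1.0",   # illustrative manifest content
        "streams": [],
    },
    "api_key": "...",         # ordinary connector settings
}

assert "__injected_declarative_manifest" in config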
airbyte_cdk/config_observation.py CHANGED
@@ -10,7 +10,7 @@ import time
 from copy import copy
 from typing import Any, List, MutableMapping

-from orjson import orjson
+import orjson

 from airbyte_cdk.models import (
     AirbyteControlConnectorConfigMessage,
airbyte_cdk/connector_builder/main.py CHANGED
@@ -6,7 +6,7 @@
 import sys
 from typing import Any, List, Mapping, Optional, Tuple

-from orjson import orjson
+import orjson

 from airbyte_cdk.connector import BaseConnector
 from airbyte_cdk.connector_builder.connector_builder_handler import (
airbyte_cdk/connector_builder/message_grouper.py CHANGED
@@ -71,7 +71,7 @@ class MessageGrouper:

         is_nested_key = isinstance(field[0], str)
         if is_nested_key:
-            return [field]  # type: ignore # the type of field is expected to be List[str] here
+            return [field]

         raise ValueError(f"Unknown type for cursor field `{field}")
@@ -232,9 +232,9 @@ class MessageGrouper:
             current_slice_descriptor = self._parse_slice_description(message.log.message)  # type: ignore[union-attr] # AirbyteMessage with MessageType.LOG has log.message
             current_slice_pages = []
             at_least_one_page_in_group = False
-        elif message.type == MessageType.LOG and message.log.message.startswith(
+        elif message.type == MessageType.LOG and message.log.message.startswith(  # type: ignore[union-attr] # None doesn't have 'message'
             SliceLogger.SLICE_LOG_PREFIX
-        ):  # type: ignore[union-attr] # AirbyteMessage with MessageType.LOG has log.message
+        ):
             # parsing the first slice
             current_slice_descriptor = self._parse_slice_description(message.log.message)  # type: ignore[union-attr] # AirbyteMessage with MessageType.LOG has log.message
         elif message.type == MessageType.LOG:
@@ -274,14 +274,14 @@ class MessageGrouper:
             if message.trace.type == TraceType.ERROR:  # type: ignore[union-attr] # AirbyteMessage with MessageType.TRACE has trace.type
                 yield message.trace
         elif message.type == MessageType.RECORD:
-            current_page_records.append(message.record.data)  # type: ignore[union-attr] # AirbyteMessage with MessageType.RECORD has record.data
+            current_page_records.append(message.record.data)  # type: ignore[arg-type, union-attr] # AirbyteMessage with MessageType.RECORD has record.data
             records_count += 1
             schema_inferrer.accumulate(message.record)
             datetime_format_inferrer.accumulate(message.record)
         elif (
             message.type == MessageType.CONTROL
-            and message.control.type == OrchestratorType.CONNECTOR_CONFIG
-        ):  # type: ignore[union-attr] # AirbyteMessage with MessageType.CONTROL has control.type
+            and message.control.type == OrchestratorType.CONNECTOR_CONFIG  # type: ignore[union-attr] # None doesn't have 'type'
+        ):
             yield message.control
         elif message.type == MessageType.STATE:
             latest_state_message = message.state  # type: ignore[assignment]
@@ -310,8 +310,8 @@ class MessageGrouper:
             and message.type == MessageType.LOG
             and (
                 MessageGrouper._is_page_http_request(json_message)
-                or message.log.message.startswith("slice:")
-            )  # type: ignore[union-attr] # AirbyteMessage with MessageType.LOG has log.message
+                or message.log.message.startswith("slice:")  # type: ignore[union-attr] # AirbyteMessage with MessageType.LOG has log.message
+            )
         )

     @staticmethod
@@ -355,8 +355,8 @@ class MessageGrouper:
                 StreamReadPages(
                     request=current_page_request,
                     response=current_page_response,
-                    records=deepcopy(current_page_records),
-                )  # type: ignore
+                    records=deepcopy(current_page_records),  # type: ignore [arg-type]
+                )
             )
             current_page_records.clear()
airbyte_cdk/destinations/destination.py CHANGED
@@ -9,7 +9,7 @@ import sys
 from abc import ABC, abstractmethod
 from typing import Any, Iterable, List, Mapping

-from orjson import orjson
+import orjson

 from airbyte_cdk.connector import Connector
 from airbyte_cdk.exception_handler import init_uncaught_exception_handler
airbyte_cdk/destinations/vector_db_based/embedder.py CHANGED
@@ -107,7 +107,7 @@ class BaseOpenAIEmbedder(Embedder):
 class OpenAIEmbedder(BaseOpenAIEmbedder):
     def __init__(self, config: OpenAIEmbeddingConfigModel, chunk_size: int):
         super().__init__(
-            OpenAIEmbeddings(
+            OpenAIEmbeddings(  # type: ignore [call-arg]
                 openai_api_key=config.openai_key, max_retries=15, disallowed_special=()
             ),
             chunk_size,
@@ -118,7 +118,7 @@ class AzureOpenAIEmbedder(BaseOpenAIEmbedder):
     def __init__(self, config: AzureOpenAIEmbeddingConfigModel, chunk_size: int):
         # Azure OpenAI API has — as of 20230927 — a limit of 16 documents per request
         super().__init__(
-            OpenAIEmbeddings(
+            OpenAIEmbeddings(  # type: ignore [call-arg]
                 openai_api_key=config.openai_key,
                 chunk_size=16,
                 max_retries=15,
airbyte_cdk/destinations/vector_db_based/writer.py CHANGED
@@ -83,11 +83,19 @@ class Writer:
                 yield message
             elif message.type == Type.RECORD:
                 record_chunks, record_id_to_delete = self.processor.process(message.record)
-                self.chunks[(message.record.namespace, message.record.stream)].extend(record_chunks)
-                if record_id_to_delete is not None:
-                    self.ids_to_delete[(message.record.namespace, message.record.stream)].append(
-                        record_id_to_delete
+                self.chunks[
+                    (  # type: ignore [index] # expected "tuple[str, str]", got "tuple[str | Any | None, str | Any]"
+                        message.record.namespace,  # type: ignore [union-attr] # record not None
+                        message.record.stream,  # type: ignore [union-attr] # record not None
                     )
+                ].extend(record_chunks)
+                if record_id_to_delete is not None:
+                    self.ids_to_delete[
+                        (  # type: ignore [index] # expected "tuple[str, str]", got "tuple[str | Any | None, str | Any]"
+                            message.record.namespace,  # type: ignore [union-attr] # record not None
+                            message.record.stream,  # type: ignore [union-attr] # record not None
+                        )
+                    ].append(record_id_to_delete)
                 self.number_of_chunks += len(record_chunks)
                 if self.number_of_chunks >= self.batch_size:
                     self._process_batch()
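Note: the reshaped indexing above keeps the same data structure: chunks are buffered per (namespace, stream) pair and flushed once batch_size is reached. A standalone sketch of that pattern, with names and the batch size assumed for illustration:

from collections import defaultdict
from typing import DefaultDict, List, Tuple

# Buffer record chunks per (namespace, stream) pair, as Writer does.
chunks: DefaultDict[Tuple[str, str], List[str]] = defaultdict(list)
BATCH_SIZE = 128  # assumed value for illustration

def buffer(namespace: str, stream: str, record_chunks: List[str]) -> None:
    chunks[(namespace, stream)].extend(record_chunks)
    total = sum(len(batch) for batch in chunks.values())
    if total >= BATCH_SIZE:
        chunks.clear()  # stand-in for Writer._process_batch()

buffer("public", "users", ["chunk-1", "chunk-2"])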
airbyte_cdk/entrypoint.py CHANGED
@@ -22,7 +22,7 @@ from requests import PreparedRequest, Response, Session
 from airbyte_cdk.connector import TConfig
 from airbyte_cdk.exception_handler import init_uncaught_exception_handler
 from airbyte_cdk.logger import init_logger
-from airbyte_cdk.models import (  # type: ignore [attr-defined]
+from airbyte_cdk.models import (
     AirbyteConnectionStatus,
     AirbyteMessage,
     AirbyteMessageSerializer,
@@ -255,9 +255,10 @@ class AirbyteEntrypoint(object):

                     stream_message_count[
                         HashableStreamDescriptor(
-                            name=message.record.stream, namespace=message.record.namespace
+                            name=message.record.stream,  # type: ignore[union-attr] # record has `stream`
+                            namespace=message.record.namespace,  # type: ignore[union-attr] # record has `namespace`
                         )
-                    ] += 1.0  # type: ignore[union-attr] # record has `stream` and `namespace`
+                    ] += 1.0
                 case Type.STATE:
                     if message.state is None:
                         raise ValueError("State message must have a state attribute")
@@ -266,9 +267,9 @@ class AirbyteEntrypoint(object):

                     # Set record count from the counter onto the state message
                     message.state.sourceStats = message.state.sourceStats or AirbyteStateStats()  # type: ignore[union-attr] # state has `sourceStats`
-                    message.state.sourceStats.recordCount = stream_message_count.get(
+                    message.state.sourceStats.recordCount = stream_message_count.get(  # type: ignore[union-attr] # state has `sourceStats`
                         stream_descriptor, 0.0
-                    )  # type: ignore[union-attr] # state has `sourceStats`
+                    )

                     # Reset the counter
                     stream_message_count[stream_descriptor] = 0.0
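Note: taken together, the two hunks above implement per-stream record accounting: record handling increments a counter keyed by stream descriptor, and each STATE message for that stream emits the tally in sourceStats.recordCount before the counter resets. A condensed sketch with plain tuples standing in for HashableStreamDescriptor:

from collections import defaultdict
from typing import Dict, Optional, Tuple

Descriptor = Tuple[str, Optional[str]]  # (name, namespace) stand-in
stream_message_count: Dict[Descriptor, float] = defaultdict(float)

def on_record(name: str, namespace: Optional[str]) -> None:
    stream_message_count[(name, namespace)] += 1.0

def on_state(name: str, namespace: Optional[str]) -> float:
    descriptor = (name, namespace)
    record_count = stream_message_count.get(descriptor, 0.0)
    stream_message_count[descriptor] = 0.0  # reset after emitting
    return record_count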
@@ -290,7 +291,7 @@ class AirbyteEntrypoint(object):

     @staticmethod
     def airbyte_message_to_string(airbyte_message: AirbyteMessage) -> str:
-        return orjson.dumps(AirbyteMessageSerializer.dump(airbyte_message)).decode()  # type: ignore[no-any-return] # orjson.dumps(message).decode() always returns string
+        return orjson.dumps(AirbyteMessageSerializer.dump(airbyte_message)).decode()

     @classmethod
     def extract_state(cls, args: List[str]) -> Optional[Any]:
airbyte_cdk/logger.py CHANGED
@@ -7,7 +7,7 @@ import logging
 import logging.config
 from typing import Any, Callable, Mapping, Optional, Tuple

-from orjson import orjson
+import orjson

 from airbyte_cdk.models import (
     AirbyteLogMessage,
@@ -78,7 +78,7 @@ class AirbyteLogFormatter(logging.Formatter):
         log_message = AirbyteMessage(
             type=Type.LOG, log=AirbyteLogMessage(level=airbyte_level, message=message)
         )
-        return orjson.dumps(AirbyteMessageSerializer.dump(log_message)).decode()  # type: ignore[no-any-return] # orjson.dumps(message).decode() always returns string
+        return orjson.dumps(AirbyteMessageSerializer.dump(log_message)).decode()

     @staticmethod
     def extract_extra_args_from_record(record: logging.LogRecord) -> Mapping[str, Any]:
airbyte_cdk/sources/abstract_source.py CHANGED
@@ -200,7 +200,7 @@ class AbstractSource(Source, ABC):
         if len(stream_name_to_exception) > 0:
             error_message = generate_failed_streams_error_message(
                 {key: [value] for key, value in stream_name_to_exception.items()}
-            )  # type: ignore # for some reason, mypy can't figure out the types for key and value
+            )
             logger.info(error_message)
             # We still raise at least one exception when a stream raises an exception because the platform currently relies
             # on a non-zero exit code to determine if a sync attempt has failed. We also raise the exception as a config_error
airbyte_cdk/sources/config.py CHANGED
@@ -24,4 +24,4 @@ class BaseConfig(BaseModel):
         rename_key(schema, old_key="anyOf", new_key="oneOf")  # UI supports only oneOf
         expand_refs(schema)
         schema.pop("description", None)  # description added from the docstring
-        return schema  # type: ignore[no-any-return]
+        return schema
airbyte_cdk/sources/connector_state_manager.py CHANGED
@@ -4,7 +4,7 @@

 import copy
 from dataclasses import dataclass
-from typing import Any, List, Mapping, MutableMapping, Optional, Tuple, Union
+from typing import Any, List, Mapping, MutableMapping, Optional, Tuple, Union, cast

 from airbyte_cdk.models import (
     AirbyteMessage,
@@ -15,6 +15,7 @@ from airbyte_cdk.models import (
     StreamDescriptor,
 )
 from airbyte_cdk.models import Type as MessageType
+from airbyte_cdk.models.airbyte_protocol import AirbyteGlobalState, AirbyteStateBlob


 @dataclass(frozen=True)
@@ -118,8 +119,12 @@ class ConnectorStateManager:
         is_global = cls._is_global_state(state)

         if is_global:
-            global_state = state[0].global_  # type: ignore # We verified state is a list in _is_global_state
-            shared_state = copy.deepcopy(global_state.shared_state, {})  # type: ignore[union-attr] # global_state has shared_state
+            # We already validate that this is a global state message, not None:
+            global_state = cast(AirbyteGlobalState, state[0].global_)
+            # global_state has shared_state, also not None:
+            shared_state: AirbyteStateBlob = cast(
+                AirbyteStateBlob, copy.deepcopy(global_state.shared_state, {})
+            )
             streams = {
                 HashableStreamDescriptor(
                     name=per_stream_state.stream_descriptor.name,
@@ -131,7 +136,7 @@ class ConnectorStateManager:
         else:
             streams = {
                 HashableStreamDescriptor(
-                    name=per_stream_state.stream.stream_descriptor.name,
+                    name=per_stream_state.stream.stream_descriptor.name,  # type: ignore[union-attr] # stream has stream_descriptor
                     namespace=per_stream_state.stream.stream_descriptor.namespace,  # type: ignore[union-attr] # stream has stream_descriptor
                 ): per_stream_state.stream.stream_state  # type: ignore[union-attr] # stream has stream_state
                 for per_stream_state in state
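Note: the change above swaps blanket `# type: ignore` comments for `typing.cast`, which documents the runtime invariant instead of suppressing the checker entirely. A generic sketch of the trade-off:

from typing import Optional, cast

def narrowed(value: Optional[dict]) -> dict:
    # cast() asserts to the type checker that the None case was already
    # ruled out at runtime (here, by _is_global_state in the real code);
    # unlike `# type: ignore`, it still type-checks everything else.
    return cast(dict, value)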
airbyte_cdk/sources/declarative/auth/oauth.py CHANGED
@@ -135,7 +135,7 @@ class DeclarativeOauth2Authenticator(AbstractOauth2Authenticator, DeclarativeAut
         return self.grant_type.eval(self.config)  # type: ignore # eval returns a string in this context

     def get_refresh_request_body(self) -> Mapping[str, Any]:
-        return self._refresh_request_body.eval(self.config)  # type: ignore # eval should return a Mapping in this context
+        return self._refresh_request_body.eval(self.config)

     def get_token_expiry_date(self) -> pendulum.DateTime:
         return self._token_expiry_date  # type: ignore # _token_expiry_date is a pendulum.DateTime. It is never None despite what mypy thinks
airbyte_cdk/sources/declarative/auth/selective_authenticator.py CHANGED
@@ -28,7 +28,12 @@ class SelectiveAuthenticator(DeclarativeAuthenticator):
         **kwargs: Any,
     ) -> DeclarativeAuthenticator:
         try:
-            selected_key = str(dpath.get(config, authenticator_selection_path))
+            selected_key = str(
+                dpath.get(
+                    config,  # type: ignore [arg-type] # Dpath wants mutable mapping but doesn't need it.
+                    authenticator_selection_path,
+                )
+            )
         except KeyError as err:
             raise ValueError(
                 "The path from `authenticator_selection_path` is not found in the config."
airbyte_cdk/sources/declarative/concurrent_declarative_source.py CHANGED
@@ -56,9 +56,8 @@ from airbyte_cdk.sources.types import Config, StreamState


 class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
-    # By default, we defer to a value of 2. A value lower than than could cause a PartitionEnqueuer to be stuck in a state of deadlock
-    # because it has hit the limit of futures but not partition reader is consuming them.
-    SINGLE_THREADED_CONCURRENCY_LEVEL = 2
+    # By default, we defer to a value of 1 which represents running a connector using the Concurrent CDK engine on only one thread.
+    SINGLE_THREADED_CONCURRENCY_LEVEL = 1

     def __init__(
         self,
@@ -79,9 +78,6 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
             emit_connector_builder_messages=emit_connector_builder_messages,
             disable_resumable_full_refresh=True,
         )
-        self._config = config
-        self._concurrent_streams: Optional[List[AbstractStream]] = None
-        self._synchronous_streams: Optional[List[Stream]] = None

         super().__init__(
             source_config=source_config,
@@ -90,6 +86,8 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
             component_factory=component_factory,
         )

+        # todo: We could remove state from initialization. Now that streams are grouped during the read(), a source
+        # no longer needs to store the original incoming state. But maybe there's an edge case?
         self._state = state

         concurrency_level_from_manifest = self._source_config.get("concurrency_level")
@@ -110,48 +108,35 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
             )  # Partition_generation iterates using range based on this value. If this is floored to zero we end up in a dead lock during start up
         else:
             concurrency_level = self.SINGLE_THREADED_CONCURRENCY_LEVEL
-            initial_number_of_partitions_to_generate = self.SINGLE_THREADED_CONCURRENCY_LEVEL // 2
+            initial_number_of_partitions_to_generate = self.SINGLE_THREADED_CONCURRENCY_LEVEL

         self._concurrent_source = ConcurrentSource.create(
             num_workers=concurrency_level,
             initial_number_of_partitions_to_generate=initial_number_of_partitions_to_generate,
             logger=self.logger,
             slice_logger=self._slice_logger,
-            message_repository=self.message_repository,  # type: ignore # message_repository is always instantiated with a value by factory
+            message_repository=self.message_repository,
         )

-    def _actually_group(self) -> None:
-        # If the connector command was SPEC, there is no incoming config, and we cannot instantiate streams because
-        # they might depend on it. Ideally we want to have a static method on this class to get the spec without
-        # any other arguments, but the existing entrypoint.py isn't designed to support this. Just noting this
-        # for our future improvements to the CDK.
-        if self._config:
-            self._concurrent_streams, self._synchronous_streams = self._group_streams(
-                config=self._config or {}
-            )
-        else:
-            self._concurrent_streams = None
-            self._synchronous_streams = None
-
     def read(
         self,
         logger: logging.Logger,
         config: Mapping[str, Any],
         catalog: ConfiguredAirbyteCatalog,
-        state: Optional[Union[List[AirbyteStateMessage]]] = None,
+        state: Optional[List[AirbyteStateMessage]] = None,
     ) -> Iterator[AirbyteMessage]:
-        # ConcurrentReadProcessor pops streams that are finished being read so before syncing, the names of the concurrent
-        # streams must be saved so that they can be removed from the catalog before starting synchronous streams
-        if self._concurrent_streams is None:
-            self._actually_group()
+        concurrent_streams, _ = self._group_streams(config=config)

-        if self._concurrent_streams:
+        # ConcurrentReadProcessor pops streams that are finished being read so before syncing, the names of
+        # the concurrent streams must be saved so that they can be removed from the catalog before starting
+        # synchronous streams
+        if len(concurrent_streams) > 0:
             concurrent_stream_names = set(
-                [concurrent_stream.name for concurrent_stream in self._concurrent_streams]
+                [concurrent_stream.name for concurrent_stream in concurrent_streams]
             )

             selected_concurrent_streams = self._select_streams(
-                streams=self._concurrent_streams, configured_catalog=catalog
+                streams=concurrent_streams, configured_catalog=catalog
             )
             # It would appear that passing in an empty set of streams causes an infinite loop in ConcurrentReadProcessor.
             # This is also evident in concurrent_source_adapter.py so I'll leave this out of scope to fix for now
@@ -170,11 +155,7 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
         yield from super().read(logger, config, filtered_catalog, state)

     def discover(self, logger: logging.Logger, config: Mapping[str, Any]) -> AirbyteCatalog:
-        if self._concurrent_streams is None:
-            self._actually_group()
-
-        concurrent_streams = self._concurrent_streams or []
-        synchronous_streams = self._synchronous_streams or []
+        concurrent_streams, synchronous_streams = self._group_streams(config=config)
         return AirbyteCatalog(
             streams=[
                 stream.as_airbyte_stream() for stream in concurrent_streams + synchronous_streams
@@ -200,9 +181,13 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):

         state_manager = ConnectorStateManager(state=self._state)  # type: ignore # state is always in the form of List[AirbyteStateMessage]. The ConnectorStateManager should use generics, but this can be done later

-        name_to_stream_mapping = {
-            stream["name"]: stream for stream in self.resolved_manifest["streams"]
-        }
+        # Combine streams and dynamic_streams. Note: both cannot be empty at the same time,
+        # and this is validated during the initialization of the source.
+        streams = self._stream_configs(self._source_config) + self._dynamic_stream_configs(
+            self._source_config, config
+        )
+
+        name_to_stream_mapping = {stream["name"]: stream for stream in streams}

         for declarative_stream in self.streams(config=config):
             # Some low-code sources use a combination of DeclarativeStream and regular Python streams. We can't inspect
@@ -210,7 +195,7 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
             # so we need to treat them as synchronous
             if (
                 isinstance(declarative_stream, DeclarativeStream)
-                and name_to_stream_mapping[declarative_stream.name].get("retriever")["type"]
+                and name_to_stream_mapping[declarative_stream.name]["retriever"]["type"]
                 == "SimpleRetriever"
             ):
                 incremental_sync_component_definition = name_to_stream_mapping[
@@ -219,7 +204,7 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):

                 partition_router_component_definition = (
                     name_to_stream_mapping[declarative_stream.name]
-                    .get("retriever")
+                    .get("retriever", {})
                     .get("partition_router")
                 )
                 is_without_partition_router_or_cursor = not bool(
@@ -241,7 +226,7 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
                 cursor = self._constructor.create_concurrent_cursor_from_datetime_based_cursor(
                     state_manager=state_manager,
                     model_type=DatetimeBasedCursorModel,
-                    component_definition=incremental_sync_component_definition,
+                    component_definition=incremental_sync_component_definition,  # type: ignore # Not None because of the if condition above
                     stream_name=declarative_stream.name,
                     stream_namespace=declarative_stream.namespace,
                     config=config or {},
@@ -324,10 +309,11 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
     def _is_datetime_incremental_without_partition_routing(
         self,
         declarative_stream: DeclarativeStream,
-        incremental_sync_component_definition: Mapping[str, Any],
+        incremental_sync_component_definition: Mapping[str, Any] | None,
     ) -> bool:
         return (
-            bool(incremental_sync_component_definition)
+            incremental_sync_component_definition is not None
+            and bool(incremental_sync_component_definition)
             and incremental_sync_component_definition.get("type", "")
             == DatetimeBasedCursorModel.__name__
             and self._stream_supports_concurrent_partition_processing(
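Note: the net effect of this file's changes is that the default concurrency drops from 2 to 1 and streams are grouped per call instead of being cached on the instance. A paraphrased sketch of the level selection; the max(..., 1) guard is an assumption inferred from the deadlock comment in the hunk above, not quoted code:

SINGLE_THREADED_CONCURRENCY_LEVEL = 1  # rc10 default, was 2 in rc8

def resolve_concurrency(concurrency_level_from_manifest: int | None) -> tuple[int, int]:
    if concurrency_level_from_manifest is not None:
        concurrency_level = concurrency_level_from_manifest
        # keep partition generation above zero to avoid the start-up
        # deadlock noted in the hunk above (exact guard assumed)
        initial_partitions = max(concurrency_level // 2, 1)
    else:
        concurrency_level = SINGLE_THREADED_CONCURRENCY_LEVEL
        initial_partitions = SINGLE_THREADED_CONCURRENCY_LEVEL
    return concurrency_level, initial_partitions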
airbyte_cdk/sources/declarative/datetime/min_max_datetime.py CHANGED
@@ -41,12 +41,12 @@ class MinMaxDatetime:
         self.datetime = InterpolatedString.create(self.datetime, parameters=parameters or {})
         self._parser = DatetimeParser()
         self.min_datetime = (
-            InterpolatedString.create(self.min_datetime, parameters=parameters)
+            InterpolatedString.create(self.min_datetime, parameters=parameters)  # type: ignore [assignment] # expression has type "InterpolatedString | None", variable has type "InterpolatedString | str"
             if self.min_datetime
             else None
         )  # type: ignore
         self.max_datetime = (
-            InterpolatedString.create(self.max_datetime, parameters=parameters)
+            InterpolatedString.create(self.max_datetime, parameters=parameters)  # type: ignore [assignment] # expression has type "InterpolatedString | None", variable has type "InterpolatedString | str"
             if self.max_datetime
             else None
         )  # type: ignore
@@ -66,7 +66,13 @@ class MinMaxDatetime:
             datetime_format = "%Y-%m-%dT%H:%M:%S.%f%z"

         time = self._parser.parse(
-            str(self.datetime.eval(config, **additional_parameters)), datetime_format
+            str(
+                self.datetime.eval(  # type: ignore[union-attr] # str has no attribute "eval"
+                    config,
+                    **additional_parameters,
+                )
+            ),
+            datetime_format,
         )  # type: ignore # datetime is always cast to an interpolated string

         if self.min_datetime:
@@ -105,7 +111,7 @@ class MinMaxDatetime:
         if isinstance(interpolated_string_or_min_max_datetime, InterpolatedString) or isinstance(
             interpolated_string_or_min_max_datetime, str
         ):
-            return MinMaxDatetime(
+            return MinMaxDatetime(  # type: ignore [call-arg]
                 datetime=interpolated_string_or_min_max_datetime, parameters=parameters
             )
         else:
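Note: for reference, the fallback datetime_format in the parse() hunk above, "%Y-%m-%dT%H:%M:%S.%f%z", accepts ISO-like timestamps with microseconds and a numeric UTC offset. A standard-library illustration:

from datetime import datetime

fmt = "%Y-%m-%dT%H:%M:%S.%f%z"  # MinMaxDatetime's fallback format
parsed = datetime.strptime("2024-01-31T12:00:00.000000+0000", fmt)
print(parsed.isoformat())  # 2024-01-31T12:00:00+00:00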