airbyte-cdk 6.5.3rc2__py3-none-any.whl → 6.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (200) hide show
  1. airbyte_cdk/__init__.py +17 -2
  2. airbyte_cdk/config_observation.py +10 -3
  3. airbyte_cdk/connector.py +19 -9
  4. airbyte_cdk/connector_builder/connector_builder_handler.py +28 -8
  5. airbyte_cdk/connector_builder/main.py +26 -6
  6. airbyte_cdk/connector_builder/message_grouper.py +95 -25
  7. airbyte_cdk/destinations/destination.py +47 -14
  8. airbyte_cdk/destinations/vector_db_based/config.py +36 -14
  9. airbyte_cdk/destinations/vector_db_based/document_processor.py +49 -11
  10. airbyte_cdk/destinations/vector_db_based/embedder.py +52 -11
  11. airbyte_cdk/destinations/vector_db_based/test_utils.py +14 -4
  12. airbyte_cdk/destinations/vector_db_based/utils.py +8 -2
  13. airbyte_cdk/destinations/vector_db_based/writer.py +15 -4
  14. airbyte_cdk/entrypoint.py +82 -26
  15. airbyte_cdk/exception_handler.py +13 -3
  16. airbyte_cdk/logger.py +10 -2
  17. airbyte_cdk/models/airbyte_protocol.py +11 -5
  18. airbyte_cdk/models/airbyte_protocol_serializers.py +9 -3
  19. airbyte_cdk/models/well_known_types.py +1 -1
  20. airbyte_cdk/sources/abstract_source.py +63 -17
  21. airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +47 -14
  22. airbyte_cdk/sources/concurrent_source/concurrent_source.py +25 -7
  23. airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py +27 -6
  24. airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +9 -3
  25. airbyte_cdk/sources/connector_state_manager.py +32 -10
  26. airbyte_cdk/sources/declarative/async_job/job.py +3 -1
  27. airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +68 -14
  28. airbyte_cdk/sources/declarative/async_job/job_tracker.py +24 -6
  29. airbyte_cdk/sources/declarative/async_job/repository.py +3 -1
  30. airbyte_cdk/sources/declarative/auth/declarative_authenticator.py +3 -1
  31. airbyte_cdk/sources/declarative/auth/jwt.py +27 -7
  32. airbyte_cdk/sources/declarative/auth/oauth.py +35 -11
  33. airbyte_cdk/sources/declarative/auth/selective_authenticator.py +3 -1
  34. airbyte_cdk/sources/declarative/auth/token.py +25 -8
  35. airbyte_cdk/sources/declarative/checks/check_stream.py +12 -4
  36. airbyte_cdk/sources/declarative/checks/connection_checker.py +3 -1
  37. airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py +11 -3
  38. airbyte_cdk/sources/declarative/concurrent_declarative_source.py +106 -50
  39. airbyte_cdk/sources/declarative/datetime/min_max_datetime.py +20 -6
  40. airbyte_cdk/sources/declarative/declarative_component_schema.yaml +43 -0
  41. airbyte_cdk/sources/declarative/declarative_source.py +3 -1
  42. airbyte_cdk/sources/declarative/declarative_stream.py +27 -6
  43. airbyte_cdk/sources/declarative/decoders/__init__.py +2 -2
  44. airbyte_cdk/sources/declarative/decoders/decoder.py +3 -1
  45. airbyte_cdk/sources/declarative/decoders/json_decoder.py +48 -13
  46. airbyte_cdk/sources/declarative/decoders/pagination_decoder_decorator.py +3 -1
  47. airbyte_cdk/sources/declarative/decoders/xml_decoder.py +6 -2
  48. airbyte_cdk/sources/declarative/extractors/dpath_extractor.py +6 -2
  49. airbyte_cdk/sources/declarative/extractors/record_filter.py +24 -7
  50. airbyte_cdk/sources/declarative/extractors/record_selector.py +10 -3
  51. airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py +15 -5
  52. airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +96 -31
  53. airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +22 -8
  54. airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +46 -15
  55. airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py +19 -5
  56. airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py +3 -1
  57. airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py +20 -2
  58. airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py +5 -1
  59. airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py +10 -3
  60. airbyte_cdk/sources/declarative/interpolation/interpolated_string.py +6 -2
  61. airbyte_cdk/sources/declarative/interpolation/interpolation.py +7 -1
  62. airbyte_cdk/sources/declarative/interpolation/jinja.py +6 -2
  63. airbyte_cdk/sources/declarative/interpolation/macros.py +19 -4
  64. airbyte_cdk/sources/declarative/manifest_declarative_source.py +106 -24
  65. airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py +14 -5
  66. airbyte_cdk/sources/declarative/models/declarative_component_schema.py +697 -678
  67. airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +13 -4
  68. airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +9 -2
  69. airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +802 -232
  70. airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +29 -7
  71. airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +25 -7
  72. airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +54 -15
  73. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py +6 -2
  74. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/header_helper.py +3 -1
  75. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py +17 -5
  76. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py +15 -5
  77. airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py +3 -1
  78. airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py +18 -8
  79. airbyte_cdk/sources/declarative/requesters/error_handlers/default_http_response_filter.py +16 -7
  80. airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +51 -14
  81. airbyte_cdk/sources/declarative/requesters/http_job_repository.py +29 -8
  82. airbyte_cdk/sources/declarative/requesters/http_requester.py +58 -16
  83. airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +49 -14
  84. airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +3 -1
  85. airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +3 -1
  86. airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +17 -5
  87. airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +24 -7
  88. airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +9 -3
  89. airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +3 -1
  90. airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +6 -2
  91. airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +19 -6
  92. airbyte_cdk/sources/declarative/requesters/request_options/default_request_options_provider.py +3 -1
  93. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_nested_request_input_provider.py +21 -7
  94. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py +18 -6
  95. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +27 -8
  96. airbyte_cdk/sources/declarative/requesters/requester.py +3 -1
  97. airbyte_cdk/sources/declarative/retrievers/async_retriever.py +12 -5
  98. airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +105 -24
  99. airbyte_cdk/sources/declarative/schema/default_schema_loader.py +3 -1
  100. airbyte_cdk/sources/declarative/spec/spec.py +8 -2
  101. airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py +3 -1
  102. airbyte_cdk/sources/declarative/transformations/add_fields.py +12 -3
  103. airbyte_cdk/sources/declarative/transformations/remove_fields.py +6 -2
  104. airbyte_cdk/sources/declarative/types.py +8 -1
  105. airbyte_cdk/sources/declarative/yaml_declarative_source.py +3 -1
  106. airbyte_cdk/sources/embedded/base_integration.py +14 -4
  107. airbyte_cdk/sources/embedded/catalog.py +16 -4
  108. airbyte_cdk/sources/embedded/runner.py +19 -3
  109. airbyte_cdk/sources/embedded/tools.py +3 -1
  110. airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py +12 -4
  111. airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +27 -7
  112. airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +12 -6
  113. airbyte_cdk/sources/file_based/config/csv_format.py +21 -9
  114. airbyte_cdk/sources/file_based/config/file_based_stream_config.py +6 -2
  115. airbyte_cdk/sources/file_based/config/unstructured_format.py +10 -3
  116. airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py +2 -4
  117. airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py +7 -2
  118. airbyte_cdk/sources/file_based/exceptions.py +13 -15
  119. airbyte_cdk/sources/file_based/file_based_source.py +82 -24
  120. airbyte_cdk/sources/file_based/file_based_stream_reader.py +16 -5
  121. airbyte_cdk/sources/file_based/file_types/avro_parser.py +58 -17
  122. airbyte_cdk/sources/file_based/file_types/csv_parser.py +89 -26
  123. airbyte_cdk/sources/file_based/file_types/excel_parser.py +25 -7
  124. airbyte_cdk/sources/file_based/file_types/file_transfer.py +8 -2
  125. airbyte_cdk/sources/file_based/file_types/file_type_parser.py +4 -1
  126. airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +20 -6
  127. airbyte_cdk/sources/file_based/file_types/parquet_parser.py +57 -16
  128. airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +64 -15
  129. airbyte_cdk/sources/file_based/schema_helpers.py +33 -10
  130. airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py +3 -1
  131. airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py +16 -5
  132. airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +33 -10
  133. airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +47 -11
  134. airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py +13 -22
  135. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +53 -17
  136. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_cursor.py +17 -5
  137. airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py +3 -1
  138. airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +26 -9
  139. airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +67 -21
  140. airbyte_cdk/sources/http_logger.py +5 -1
  141. airbyte_cdk/sources/message/repository.py +18 -4
  142. airbyte_cdk/sources/source.py +17 -7
  143. airbyte_cdk/sources/streams/availability_strategy.py +9 -3
  144. airbyte_cdk/sources/streams/call_rate.py +63 -19
  145. airbyte_cdk/sources/streams/checkpoint/checkpoint_reader.py +31 -7
  146. airbyte_cdk/sources/streams/checkpoint/substream_resumable_full_refresh_cursor.py +6 -2
  147. airbyte_cdk/sources/streams/concurrent/adapters.py +77 -22
  148. airbyte_cdk/sources/streams/concurrent/cursor.py +56 -20
  149. airbyte_cdk/sources/streams/concurrent/default_stream.py +9 -2
  150. airbyte_cdk/sources/streams/concurrent/helpers.py +6 -2
  151. airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +9 -2
  152. airbyte_cdk/sources/streams/concurrent/partition_reader.py +4 -1
  153. airbyte_cdk/sources/streams/concurrent/partitions/record.py +10 -2
  154. airbyte_cdk/sources/streams/concurrent/partitions/types.py +6 -2
  155. airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +25 -10
  156. airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +32 -16
  157. airbyte_cdk/sources/streams/core.py +77 -22
  158. airbyte_cdk/sources/streams/http/availability_strategy.py +3 -1
  159. airbyte_cdk/sources/streams/http/error_handlers/default_error_mapping.py +4 -1
  160. airbyte_cdk/sources/streams/http/error_handlers/error_handler.py +3 -1
  161. airbyte_cdk/sources/streams/http/error_handlers/http_status_error_handler.py +16 -5
  162. airbyte_cdk/sources/streams/http/error_handlers/response_models.py +9 -3
  163. airbyte_cdk/sources/streams/http/exceptions.py +2 -2
  164. airbyte_cdk/sources/streams/http/http.py +133 -33
  165. airbyte_cdk/sources/streams/http/http_client.py +91 -29
  166. airbyte_cdk/sources/streams/http/rate_limiting.py +23 -7
  167. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +19 -6
  168. airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +38 -11
  169. airbyte_cdk/sources/streams/http/requests_native_auth/token.py +13 -3
  170. airbyte_cdk/sources/types.py +5 -1
  171. airbyte_cdk/sources/utils/record_helper.py +12 -3
  172. airbyte_cdk/sources/utils/schema_helpers.py +9 -3
  173. airbyte_cdk/sources/utils/slice_logger.py +4 -1
  174. airbyte_cdk/sources/utils/transform.py +24 -9
  175. airbyte_cdk/sql/exceptions.py +19 -6
  176. airbyte_cdk/sql/secrets.py +3 -1
  177. airbyte_cdk/sql/shared/catalog_providers.py +13 -4
  178. airbyte_cdk/sql/shared/sql_processor.py +44 -14
  179. airbyte_cdk/test/catalog_builder.py +19 -8
  180. airbyte_cdk/test/entrypoint_wrapper.py +27 -8
  181. airbyte_cdk/test/mock_http/mocker.py +41 -11
  182. airbyte_cdk/test/mock_http/request.py +9 -3
  183. airbyte_cdk/test/mock_http/response.py +3 -1
  184. airbyte_cdk/test/mock_http/response_builder.py +29 -7
  185. airbyte_cdk/test/state_builder.py +10 -2
  186. airbyte_cdk/test/utils/data.py +6 -2
  187. airbyte_cdk/test/utils/http_mocking.py +3 -1
  188. airbyte_cdk/utils/airbyte_secrets_utils.py +3 -1
  189. airbyte_cdk/utils/analytics_message.py +10 -2
  190. airbyte_cdk/utils/datetime_format_inferrer.py +4 -1
  191. airbyte_cdk/utils/mapping_helpers.py +3 -1
  192. airbyte_cdk/utils/message_utils.py +11 -4
  193. airbyte_cdk/utils/print_buffer.py +6 -1
  194. airbyte_cdk/utils/schema_inferrer.py +30 -9
  195. airbyte_cdk/utils/spec_schema_transformations.py +3 -1
  196. airbyte_cdk/utils/traced_exception.py +35 -9
  197. {airbyte_cdk-6.5.3rc2.dist-info → airbyte_cdk-6.6.0.dist-info}/METADATA +8 -7
  198. {airbyte_cdk-6.5.3rc2.dist-info → airbyte_cdk-6.6.0.dist-info}/RECORD +200 -200
  199. {airbyte_cdk-6.5.3rc2.dist-info → airbyte_cdk-6.6.0.dist-info}/LICENSE.txt +0 -0
  200. {airbyte_cdk-6.5.3rc2.dist-info → airbyte_cdk-6.6.0.dist-info}/WHEEL +0 -0
@@ -17,7 +17,11 @@ class SeparatorSplitterConfigModel(BaseModel):
17
17
  title="Separators",
18
18
  description='List of separator strings to split text fields by. The separator itself needs to be wrapped in double quotes, e.g. to split by the dot character, use ".". To split by a newline, use "\\n".',
19
19
  )
20
- keep_separator: bool = Field(default=False, title="Keep separator", description="Whether to keep the separator in the resulting chunks")
20
+ keep_separator: bool = Field(
21
+ default=False,
22
+ title="Keep separator",
23
+ description="Whether to keep the separator in the resulting chunks",
24
+ )
21
25
 
22
26
  class Config(OneOfOptionConfig):
23
27
  title = "By Separator"
@@ -68,18 +72,20 @@ class CodeSplitterConfigModel(BaseModel):
68
72
 
69
73
  class Config(OneOfOptionConfig):
70
74
  title = "By Programming Language"
71
- description = (
72
- "Split the text by suitable delimiters based on the programming language. This is useful for splitting code into chunks."
73
- )
75
+ description = "Split the text by suitable delimiters based on the programming language. This is useful for splitting code into chunks."
74
76
  discriminator = "mode"
75
77
 
76
78
 
77
- TextSplitterConfigModel = Union[SeparatorSplitterConfigModel, MarkdownHeaderSplitterConfigModel, CodeSplitterConfigModel]
79
+ TextSplitterConfigModel = Union[
80
+ SeparatorSplitterConfigModel, MarkdownHeaderSplitterConfigModel, CodeSplitterConfigModel
81
+ ]
78
82
 
79
83
 
80
84
  class FieldNameMappingConfigModel(BaseModel):
81
85
  from_field: str = Field(title="From field name", description="The field name in the source")
82
- to_field: str = Field(title="To field name", description="The field name to use in the destination")
86
+ to_field: str = Field(
87
+ title="To field name", description="The field name to use in the destination"
88
+ )
83
89
 
84
90
 
85
91
  class ProcessingConfigModel(BaseModel):
@@ -132,9 +138,7 @@ class OpenAIEmbeddingConfigModel(BaseModel):
132
138
 
133
139
  class Config(OneOfOptionConfig):
134
140
  title = "OpenAI"
135
- description = (
136
- "Use the OpenAI API to embed text. This option is using the text-embedding-ada-002 model with 1536 embedding dimensions."
137
- )
141
+ description = "Use the OpenAI API to embed text. This option is using the text-embedding-ada-002 model with 1536 embedding dimensions."
138
142
  discriminator = "mode"
139
143
 
140
144
 
@@ -142,7 +146,10 @@ class OpenAICompatibleEmbeddingConfigModel(BaseModel):
142
146
  mode: Literal["openai_compatible"] = Field("openai_compatible", const=True)
143
147
  api_key: str = Field(title="API key", default="", airbyte_secret=True)
144
148
  base_url: str = Field(
145
- ..., title="Base URL", description="The base URL for your OpenAI-compatible service", examples=["https://your-service-name.com"]
149
+ ...,
150
+ title="Base URL",
151
+ description="The base URL for your OpenAI-compatible service",
152
+ examples=["https://your-service-name.com"],
146
153
  )
147
154
  model_name: str = Field(
148
155
  title="Model name",
@@ -151,7 +158,9 @@ class OpenAICompatibleEmbeddingConfigModel(BaseModel):
151
158
  examples=["text-embedding-ada-002"],
152
159
  )
153
160
  dimensions: int = Field(
154
- title="Embedding dimensions", description="The number of dimensions the embedding model is generating", examples=[1536, 384]
161
+ title="Embedding dimensions",
162
+ description="The number of dimensions the embedding model is generating",
163
+ examples=[1536, 384],
155
164
  )
156
165
 
157
166
  class Config(OneOfOptionConfig):
@@ -199,10 +208,16 @@ class FakeEmbeddingConfigModel(BaseModel):
199
208
  class FromFieldEmbeddingConfigModel(BaseModel):
200
209
  mode: Literal["from_field"] = Field("from_field", const=True)
201
210
  field_name: str = Field(
202
- ..., title="Field name", description="Name of the field in the record that contains the embedding", examples=["embedding", "vector"]
211
+ ...,
212
+ title="Field name",
213
+ description="Name of the field in the record that contains the embedding",
214
+ examples=["embedding", "vector"],
203
215
  )
204
216
  dimensions: int = Field(
205
- ..., title="Embedding dimensions", description="The number of dimensions the embedding model is generating", examples=[1536, 384]
217
+ ...,
218
+ title="Embedding dimensions",
219
+ description="The number of dimensions the embedding model is generating",
220
+ examples=[1536, 384],
206
221
  )
207
222
 
208
223
  class Config(OneOfOptionConfig):
@@ -241,7 +256,14 @@ class VectorDBConfigModel(BaseModel):
241
256
  FakeEmbeddingConfigModel,
242
257
  AzureOpenAIEmbeddingConfigModel,
243
258
  OpenAICompatibleEmbeddingConfigModel,
244
- ] = Field(..., title="Embedding", description="Embedding configuration", discriminator="mode", group="embedding", type="object")
259
+ ] = Field(
260
+ ...,
261
+ title="Embedding",
262
+ description="Embedding configuration",
263
+ discriminator="mode",
264
+ group="embedding",
265
+ type="object",
266
+ )
245
267
  processing: ProcessingConfigModel
246
268
  omit_raw_text: bool = Field(
247
269
  default=False,
@@ -8,9 +8,18 @@ from dataclasses import dataclass
8
8
  from typing import Any, Dict, List, Mapping, Optional, Tuple
9
9
 
10
10
  import dpath
11
- from airbyte_cdk.destinations.vector_db_based.config import ProcessingConfigModel, SeparatorSplitterConfigModel, TextSplitterConfigModel
11
+ from airbyte_cdk.destinations.vector_db_based.config import (
12
+ ProcessingConfigModel,
13
+ SeparatorSplitterConfigModel,
14
+ TextSplitterConfigModel,
15
+ )
12
16
  from airbyte_cdk.destinations.vector_db_based.utils import create_stream_identifier
13
- from airbyte_cdk.models import AirbyteRecordMessage, ConfiguredAirbyteCatalog, ConfiguredAirbyteStream, DestinationSyncMode
17
+ from airbyte_cdk.models import (
18
+ AirbyteRecordMessage,
19
+ ConfiguredAirbyteCatalog,
20
+ ConfiguredAirbyteStream,
21
+ DestinationSyncMode,
22
+ )
14
23
  from airbyte_cdk.utils.traced_exception import AirbyteTracedException, FailureType
15
24
  from langchain.text_splitter import Language, RecursiveCharacterTextSplitter
16
25
  from langchain.utils import stringify_dict
@@ -30,7 +39,14 @@ class Chunk:
30
39
  embedding: Optional[List[float]] = None
31
40
 
32
41
 
33
- headers_to_split_on = ["(?:^|\n)# ", "(?:^|\n)## ", "(?:^|\n)### ", "(?:^|\n)#### ", "(?:^|\n)##### ", "(?:^|\n)###### "]
42
+ headers_to_split_on = [
43
+ "(?:^|\n)# ",
44
+ "(?:^|\n)## ",
45
+ "(?:^|\n)### ",
46
+ "(?:^|\n)#### ",
47
+ "(?:^|\n)##### ",
48
+ "(?:^|\n)###### ",
49
+ ]
34
50
 
35
51
 
36
52
  class DocumentProcessor:
@@ -64,7 +80,10 @@ class DocumentProcessor:
64
80
  return None
65
81
 
66
82
  def _get_text_splitter(
67
- self, chunk_size: int, chunk_overlap: int, splitter_config: Optional[TextSplitterConfigModel]
83
+ self,
84
+ chunk_size: int,
85
+ chunk_overlap: int,
86
+ splitter_config: Optional[TextSplitterConfigModel],
68
87
  ) -> RecursiveCharacterTextSplitter:
69
88
  if splitter_config is None:
70
89
  splitter_config = SeparatorSplitterConfigModel(mode="separator")
@@ -89,14 +108,20 @@ class DocumentProcessor:
89
108
  return RecursiveCharacterTextSplitter.from_tiktoken_encoder(
90
109
  chunk_size=chunk_size,
91
110
  chunk_overlap=chunk_overlap,
92
- separators=RecursiveCharacterTextSplitter.get_separators_for_language(Language(splitter_config.language)),
111
+ separators=RecursiveCharacterTextSplitter.get_separators_for_language(
112
+ Language(splitter_config.language)
113
+ ),
93
114
  disallowed_special=(),
94
115
  )
95
116
 
96
117
  def __init__(self, config: ProcessingConfigModel, catalog: ConfiguredAirbyteCatalog):
97
- self.streams = {create_stream_identifier(stream.stream): stream for stream in catalog.streams}
118
+ self.streams = {
119
+ create_stream_identifier(stream.stream): stream for stream in catalog.streams
120
+ }
98
121
 
99
- self.splitter = self._get_text_splitter(config.chunk_size, config.chunk_overlap, config.text_splitter)
122
+ self.splitter = self._get_text_splitter(
123
+ config.chunk_size, config.chunk_overlap, config.text_splitter
124
+ )
100
125
  self.text_fields = config.text_fields
101
126
  self.metadata_fields = config.metadata_fields
102
127
  self.field_name_mappings = config.field_name_mappings
@@ -119,10 +144,18 @@ class DocumentProcessor:
119
144
  failure_type=FailureType.config_error,
120
145
  )
121
146
  chunks = [
122
- Chunk(page_content=chunk_document.page_content, metadata=chunk_document.metadata, record=record)
147
+ Chunk(
148
+ page_content=chunk_document.page_content,
149
+ metadata=chunk_document.metadata,
150
+ record=record,
151
+ )
123
152
  for chunk_document in self._split_document(doc)
124
153
  ]
125
- id_to_delete = doc.metadata[METADATA_RECORD_ID_FIELD] if METADATA_RECORD_ID_FIELD in doc.metadata else None
154
+ id_to_delete = (
155
+ doc.metadata[METADATA_RECORD_ID_FIELD]
156
+ if METADATA_RECORD_ID_FIELD in doc.metadata
157
+ else None
158
+ )
126
159
  return chunks, id_to_delete
127
160
 
128
161
  def _generate_document(self, record: AirbyteRecordMessage) -> Optional[Document]:
@@ -133,7 +166,9 @@ class DocumentProcessor:
133
166
  metadata = self._extract_metadata(record)
134
167
  return Document(page_content=text, metadata=metadata)
135
168
 
136
- def _extract_relevant_fields(self, record: AirbyteRecordMessage, fields: Optional[List[str]]) -> Dict[str, Any]:
169
+ def _extract_relevant_fields(
170
+ self, record: AirbyteRecordMessage, fields: Optional[List[str]]
171
+ ) -> Dict[str, Any]:
137
172
  relevant_fields = {}
138
173
  if fields and len(fields) > 0:
139
174
  for field in fields:
@@ -156,7 +191,10 @@ class DocumentProcessor:
156
191
  stream_identifier = create_stream_identifier(record)
157
192
  current_stream: ConfiguredAirbyteStream = self.streams[stream_identifier]
158
193
  # if the sync mode is deduping, use the primary key to upsert existing records instead of appending new ones
159
- if not current_stream.primary_key or current_stream.destination_sync_mode != DestinationSyncMode.append_dedup:
194
+ if (
195
+ not current_stream.primary_key
196
+ or current_stream.destination_sync_mode != DestinationSyncMode.append_dedup
197
+ ):
160
198
  return None
161
199
 
162
200
  primary_key = []
@@ -92,7 +92,9 @@ class BaseOpenAIEmbedder(Embedder):
92
92
  batches = create_chunks(documents, batch_size=embedding_batch_size)
93
93
  embeddings: List[Optional[List[float]]] = []
94
94
  for batch in batches:
95
- embeddings.extend(self.embeddings.embed_documents([chunk.page_content for chunk in batch]))
95
+ embeddings.extend(
96
+ self.embeddings.embed_documents([chunk.page_content for chunk in batch])
97
+ )
96
98
  return embeddings
97
99
 
98
100
  @property
@@ -103,13 +105,30 @@ class BaseOpenAIEmbedder(Embedder):
103
105
 
104
106
  class OpenAIEmbedder(BaseOpenAIEmbedder):
105
107
  def __init__(self, config: OpenAIEmbeddingConfigModel, chunk_size: int):
106
- super().__init__(OpenAIEmbeddings(openai_api_key=config.openai_key, max_retries=15, disallowed_special=()), chunk_size) # type: ignore
108
+ super().__init__(
109
+ OpenAIEmbeddings(
110
+ openai_api_key=config.openai_key, max_retries=15, disallowed_special=()
111
+ ),
112
+ chunk_size,
113
+ ) # type: ignore
107
114
 
108
115
 
109
116
  class AzureOpenAIEmbedder(BaseOpenAIEmbedder):
110
117
  def __init__(self, config: AzureOpenAIEmbeddingConfigModel, chunk_size: int):
111
118
  # Azure OpenAI API has — as of 20230927 — a limit of 16 documents per request
112
- super().__init__(OpenAIEmbeddings(openai_api_key=config.openai_key, chunk_size=16, max_retries=15, openai_api_type="azure", openai_api_version="2023-05-15", openai_api_base=config.api_base, deployment=config.deployment, disallowed_special=()), chunk_size) # type: ignore
119
+ super().__init__(
120
+ OpenAIEmbeddings(
121
+ openai_api_key=config.openai_key,
122
+ chunk_size=16,
123
+ max_retries=15,
124
+ openai_api_type="azure",
125
+ openai_api_version="2023-05-15",
126
+ openai_api_base=config.api_base,
127
+ deployment=config.deployment,
128
+ disallowed_special=(),
129
+ ),
130
+ chunk_size,
131
+ ) # type: ignore
113
132
 
114
133
 
115
134
  COHERE_VECTOR_SIZE = 1024
@@ -119,7 +138,9 @@ class CohereEmbedder(Embedder):
119
138
  def __init__(self, config: CohereEmbeddingConfigModel):
120
139
  super().__init__()
121
140
  # Client is set internally
122
- self.embeddings = CohereEmbeddings(cohere_api_key=config.cohere_key, model="embed-english-light-v2.0") # type: ignore
141
+ self.embeddings = CohereEmbeddings(
142
+ cohere_api_key=config.cohere_key, model="embed-english-light-v2.0"
143
+ ) # type: ignore
123
144
 
124
145
  def check(self) -> Optional[str]:
125
146
  try:
@@ -129,7 +150,10 @@ class CohereEmbedder(Embedder):
129
150
  return None
130
151
 
131
152
  def embed_documents(self, documents: List[Document]) -> List[Optional[List[float]]]:
132
- return cast(List[Optional[List[float]]], self.embeddings.embed_documents([document.page_content for document in documents]))
153
+ return cast(
154
+ List[Optional[List[float]]],
155
+ self.embeddings.embed_documents([document.page_content for document in documents]),
156
+ )
133
157
 
134
158
  @property
135
159
  def embedding_dimensions(self) -> int:
@@ -150,7 +174,10 @@ class FakeEmbedder(Embedder):
150
174
  return None
151
175
 
152
176
  def embed_documents(self, documents: List[Document]) -> List[Optional[List[float]]]:
153
- return cast(List[Optional[List[float]]], self.embeddings.embed_documents([document.page_content for document in documents]))
177
+ return cast(
178
+ List[Optional[List[float]]],
179
+ self.embeddings.embed_documents([document.page_content for document in documents]),
180
+ )
154
181
 
155
182
  @property
156
183
  def embedding_dimensions(self) -> int:
@@ -167,11 +194,20 @@ class OpenAICompatibleEmbedder(Embedder):
167
194
  self.config = config
168
195
  # Client is set internally
169
196
  # Always set an API key even if there is none defined in the config because the validator will fail otherwise. Embedding APIs that don't require an API key don't fail if one is provided, so this is not breaking usage.
170
- self.embeddings = LocalAIEmbeddings(model=config.model_name, openai_api_key=config.api_key or "dummy-api-key", openai_api_base=config.base_url, max_retries=15, disallowed_special=()) # type: ignore
197
+ self.embeddings = LocalAIEmbeddings(
198
+ model=config.model_name,
199
+ openai_api_key=config.api_key or "dummy-api-key",
200
+ openai_api_base=config.base_url,
201
+ max_retries=15,
202
+ disallowed_special=(),
203
+ ) # type: ignore
171
204
 
172
205
  def check(self) -> Optional[str]:
173
206
  deployment_mode = os.environ.get("DEPLOYMENT_MODE", "")
174
- if deployment_mode.casefold() == CLOUD_DEPLOYMENT_MODE and not self.config.base_url.startswith("https://"):
207
+ if (
208
+ deployment_mode.casefold() == CLOUD_DEPLOYMENT_MODE
209
+ and not self.config.base_url.startswith("https://")
210
+ ):
175
211
  return "Base URL must start with https://"
176
212
 
177
213
  try:
@@ -181,7 +217,10 @@ class OpenAICompatibleEmbedder(Embedder):
181
217
  return None
182
218
 
183
219
  def embed_documents(self, documents: List[Document]) -> List[Optional[List[float]]]:
184
- return cast(List[Optional[List[float]]], self.embeddings.embed_documents([document.page_content for document in documents]))
220
+ return cast(
221
+ List[Optional[List[float]]],
222
+ self.embeddings.embed_documents([document.page_content for document in documents]),
223
+ )
185
224
 
186
225
  @property
187
226
  def embedding_dimensions(self) -> int:
@@ -254,8 +293,10 @@ def create_from_config(
254
293
  ],
255
294
  processing_config: ProcessingConfigModel,
256
295
  ) -> Embedder:
257
-
258
296
  if embedding_config.mode == "azure_openai" or embedding_config.mode == "openai":
259
- return cast(Embedder, embedder_map[embedding_config.mode](embedding_config, processing_config.chunk_size))
297
+ return cast(
298
+ Embedder,
299
+ embedder_map[embedding_config.mode](embedding_config, processing_config.chunk_size),
300
+ )
260
301
  else:
261
302
  return cast(Embedder, embedder_map[embedding_config.mode](embedding_config))
@@ -26,12 +26,19 @@ class BaseIntegrationTest(unittest.TestCase):
26
26
  It provides helper methods to create Airbyte catalogs, records and state messages.
27
27
  """
28
28
 
29
- def _get_configured_catalog(self, destination_mode: DestinationSyncMode) -> ConfiguredAirbyteCatalog:
30
- stream_schema = {"type": "object", "properties": {"str_col": {"type": "str"}, "int_col": {"type": "integer"}}}
29
+ def _get_configured_catalog(
30
+ self, destination_mode: DestinationSyncMode
31
+ ) -> ConfiguredAirbyteCatalog:
32
+ stream_schema = {
33
+ "type": "object",
34
+ "properties": {"str_col": {"type": "str"}, "int_col": {"type": "integer"}},
35
+ }
31
36
 
32
37
  overwrite_stream = ConfiguredAirbyteStream(
33
38
  stream=AirbyteStream(
34
- name="mystream", json_schema=stream_schema, supported_sync_modes=[SyncMode.incremental, SyncMode.full_refresh]
39
+ name="mystream",
40
+ json_schema=stream_schema,
41
+ supported_sync_modes=[SyncMode.incremental, SyncMode.full_refresh],
35
42
  ),
36
43
  primary_key=[["int_col"]],
37
44
  sync_mode=SyncMode.incremental,
@@ -45,7 +52,10 @@ class BaseIntegrationTest(unittest.TestCase):
45
52
 
46
53
  def _record(self, stream: str, str_value: str, int_value: int) -> AirbyteMessage:
47
54
  return AirbyteMessage(
48
- type=Type.RECORD, record=AirbyteRecordMessage(stream=stream, data={"str_col": str_value, "int_col": int_value}, emitted_at=0)
55
+ type=Type.RECORD,
56
+ record=AirbyteRecordMessage(
57
+ stream=stream, data={"str_col": str_value, "int_col": int_value}, emitted_at=0
58
+ ),
49
59
  )
50
60
 
51
61
  def setUp(self) -> None:
@@ -10,7 +10,11 @@ from airbyte_cdk.models import AirbyteRecordMessage, AirbyteStream
10
10
 
11
11
 
12
12
  def format_exception(exception: Exception) -> str:
13
- return str(exception) + "\n" + "".join(traceback.TracebackException.from_exception(exception).format())
13
+ return (
14
+ str(exception)
15
+ + "\n"
16
+ + "".join(traceback.TracebackException.from_exception(exception).format())
17
+ )
14
18
 
15
19
 
16
20
  def create_chunks(iterable: Iterable[Any], batch_size: int) -> Iterator[Tuple[Any, ...]]:
@@ -26,4 +30,6 @@ def create_stream_identifier(stream: Union[AirbyteStream, AirbyteRecordMessage])
26
30
  if isinstance(stream, AirbyteStream):
27
31
  return str(stream.name if stream.namespace is None else f"{stream.namespace}_{stream.name}")
28
32
  else:
29
- return str(stream.stream if stream.namespace is None else f"{stream.namespace}_{stream.stream}")
33
+ return str(
34
+ stream.stream if stream.namespace is None else f"{stream.namespace}_{stream.stream}"
35
+ )
@@ -27,7 +27,12 @@ class Writer:
27
27
  """
28
28
 
29
29
  def __init__(
30
- self, processing_config: ProcessingConfigModel, indexer: Indexer, embedder: Embedder, batch_size: int, omit_raw_text: bool
30
+ self,
31
+ processing_config: ProcessingConfigModel,
32
+ indexer: Indexer,
33
+ embedder: Embedder,
34
+ batch_size: int,
35
+ omit_raw_text: bool,
31
36
  ) -> None:
32
37
  self.processing_config = processing_config
33
38
  self.indexer = indexer
@@ -54,7 +59,9 @@ class Writer:
54
59
  self.indexer.delete(ids, namespace, stream)
55
60
 
56
61
  for (namespace, stream), chunks in self.chunks.items():
57
- embeddings = self.embedder.embed_documents([self._convert_to_document(chunk) for chunk in chunks])
62
+ embeddings = self.embedder.embed_documents(
63
+ [self._convert_to_document(chunk) for chunk in chunks]
64
+ )
58
65
  for i, document in enumerate(chunks):
59
66
  document.embedding = embeddings[i]
60
67
  if self.omit_raw_text:
@@ -63,7 +70,9 @@ class Writer:
63
70
 
64
71
  self._init_batch()
65
72
 
66
- def write(self, configured_catalog: ConfiguredAirbyteCatalog, input_messages: Iterable[AirbyteMessage]) -> Iterable[AirbyteMessage]:
73
+ def write(
74
+ self, configured_catalog: ConfiguredAirbyteCatalog, input_messages: Iterable[AirbyteMessage]
75
+ ) -> Iterable[AirbyteMessage]:
67
76
  self.processor = DocumentProcessor(self.processing_config, configured_catalog)
68
77
  self.indexer.pre_sync(configured_catalog)
69
78
  for message in input_messages:
@@ -76,7 +85,9 @@ class Writer:
76
85
  record_chunks, record_id_to_delete = self.processor.process(message.record)
77
86
  self.chunks[(message.record.namespace, message.record.stream)].extend(record_chunks)
78
87
  if record_id_to_delete is not None:
79
- self.ids_to_delete[(message.record.namespace, message.record.stream)].append(record_id_to_delete)
88
+ self.ids_to_delete[(message.record.namespace, message.record.stream)].append(
89
+ record_id_to_delete
90
+ )
80
91
  self.number_of_chunks += len(record_chunks)
81
92
  if self.number_of_chunks >= self.batch_size:
82
93
  self._process_batch()
airbyte_cdk/entrypoint.py CHANGED
@@ -62,33 +62,54 @@ class AirbyteEntrypoint(object):
62
62
  def parse_args(args: List[str]) -> argparse.Namespace:
63
63
  # set up parent parsers
64
64
  parent_parser = argparse.ArgumentParser(add_help=False)
65
- parent_parser.add_argument("--debug", action="store_true", help="enables detailed debug logs related to the sync")
65
+ parent_parser.add_argument(
66
+ "--debug", action="store_true", help="enables detailed debug logs related to the sync"
67
+ )
66
68
  main_parser = argparse.ArgumentParser()
67
69
  subparsers = main_parser.add_subparsers(title="commands", dest="command")
68
70
 
69
71
  # spec
70
- subparsers.add_parser("spec", help="outputs the json configuration specification", parents=[parent_parser])
72
+ subparsers.add_parser(
73
+ "spec", help="outputs the json configuration specification", parents=[parent_parser]
74
+ )
71
75
 
72
76
  # check
73
- check_parser = subparsers.add_parser("check", help="checks the config can be used to connect", parents=[parent_parser])
77
+ check_parser = subparsers.add_parser(
78
+ "check", help="checks the config can be used to connect", parents=[parent_parser]
79
+ )
74
80
  required_check_parser = check_parser.add_argument_group("required named arguments")
75
- required_check_parser.add_argument("--config", type=str, required=True, help="path to the json configuration file")
81
+ required_check_parser.add_argument(
82
+ "--config", type=str, required=True, help="path to the json configuration file"
83
+ )
76
84
 
77
85
  # discover
78
86
  discover_parser = subparsers.add_parser(
79
- "discover", help="outputs a catalog describing the source's schema", parents=[parent_parser]
87
+ "discover",
88
+ help="outputs a catalog describing the source's schema",
89
+ parents=[parent_parser],
80
90
  )
81
91
  required_discover_parser = discover_parser.add_argument_group("required named arguments")
82
- required_discover_parser.add_argument("--config", type=str, required=True, help="path to the json configuration file")
92
+ required_discover_parser.add_argument(
93
+ "--config", type=str, required=True, help="path to the json configuration file"
94
+ )
83
95
 
84
96
  # read
85
- read_parser = subparsers.add_parser("read", help="reads the source and outputs messages to STDOUT", parents=[parent_parser])
97
+ read_parser = subparsers.add_parser(
98
+ "read", help="reads the source and outputs messages to STDOUT", parents=[parent_parser]
99
+ )
86
100
 
87
- read_parser.add_argument("--state", type=str, required=False, help="path to the json-encoded state file")
101
+ read_parser.add_argument(
102
+ "--state", type=str, required=False, help="path to the json-encoded state file"
103
+ )
88
104
  required_read_parser = read_parser.add_argument_group("required named arguments")
89
- required_read_parser.add_argument("--config", type=str, required=True, help="path to the json configuration file")
90
105
  required_read_parser.add_argument(
91
- "--catalog", type=str, required=True, help="path to the catalog used to determine which data to read"
106
+ "--config", type=str, required=True, help="path to the json configuration file"
107
+ )
108
+ required_read_parser.add_argument(
109
+ "--catalog",
110
+ type=str,
111
+ required=True,
112
+ help="path to the catalog used to determine which data to read",
92
113
  )
93
114
 
94
115
  return main_parser.parse_args(args)
@@ -108,11 +129,14 @@ class AirbyteEntrypoint(object):
108
129
  source_spec: ConnectorSpecification = self.source.spec(self.logger)
109
130
  try:
110
131
  with tempfile.TemporaryDirectory() as temp_dir:
111
- os.environ[ENV_REQUEST_CACHE_PATH] = temp_dir # set this as default directory for request_cache to store *.sqlite files
132
+ os.environ[ENV_REQUEST_CACHE_PATH] = (
133
+ temp_dir # set this as default directory for request_cache to store *.sqlite files
134
+ )
112
135
  if cmd == "spec":
113
136
  message = AirbyteMessage(type=Type.SPEC, spec=source_spec)
114
137
  yield from [
115
- self.airbyte_message_to_string(queued_message) for queued_message in self._emit_queued_messages(self.source)
138
+ self.airbyte_message_to_string(queued_message)
139
+ for queued_message in self._emit_queued_messages(self.source)
116
140
  ]
117
141
  yield self.airbyte_message_to_string(message)
118
142
  else:
@@ -120,23 +144,38 @@ class AirbyteEntrypoint(object):
120
144
  config = self.source.configure(raw_config, temp_dir)
121
145
 
122
146
  yield from [
123
- self.airbyte_message_to_string(queued_message) for queued_message in self._emit_queued_messages(self.source)
147
+ self.airbyte_message_to_string(queued_message)
148
+ for queued_message in self._emit_queued_messages(self.source)
124
149
  ]
125
150
  if cmd == "check":
126
- yield from map(AirbyteEntrypoint.airbyte_message_to_string, self.check(source_spec, config))
151
+ yield from map(
152
+ AirbyteEntrypoint.airbyte_message_to_string,
153
+ self.check(source_spec, config),
154
+ )
127
155
  elif cmd == "discover":
128
- yield from map(AirbyteEntrypoint.airbyte_message_to_string, self.discover(source_spec, config))
156
+ yield from map(
157
+ AirbyteEntrypoint.airbyte_message_to_string,
158
+ self.discover(source_spec, config),
159
+ )
129
160
  elif cmd == "read":
130
161
  config_catalog = self.source.read_catalog(parsed_args.catalog)
131
162
  state = self.source.read_state(parsed_args.state)
132
163
 
133
- yield from map(AirbyteEntrypoint.airbyte_message_to_string, self.read(source_spec, config, config_catalog, state))
164
+ yield from map(
165
+ AirbyteEntrypoint.airbyte_message_to_string,
166
+ self.read(source_spec, config, config_catalog, state),
167
+ )
134
168
  else:
135
169
  raise Exception("Unexpected command " + cmd)
136
170
  finally:
137
- yield from [self.airbyte_message_to_string(queued_message) for queued_message in self._emit_queued_messages(self.source)]
138
-
139
- def check(self, source_spec: ConnectorSpecification, config: TConfig) -> Iterable[AirbyteMessage]:
171
+ yield from [
172
+ self.airbyte_message_to_string(queued_message)
173
+ for queued_message in self._emit_queued_messages(self.source)
174
+ ]
175
+
176
+ def check(
177
+ self, source_spec: ConnectorSpecification, config: TConfig
178
+ ) -> Iterable[AirbyteMessage]:
140
179
  self.set_up_secret_filter(config, source_spec.connectionSpecification)
141
180
  try:
142
181
  self.validate_connection(source_spec, config)
@@ -161,7 +200,10 @@ class AirbyteEntrypoint(object):
161
200
  raise traced_exc
162
201
  else:
163
202
  yield AirbyteMessage(
164
- type=Type.CONNECTION_STATUS, connectionStatus=AirbyteConnectionStatus(status=Status.FAILED, message=traced_exc.message)
203
+ type=Type.CONNECTION_STATUS,
204
+ connectionStatus=AirbyteConnectionStatus(
205
+ status=Status.FAILED, message=traced_exc.message
206
+ ),
165
207
  )
166
208
  return
167
209
  if check_result.status == Status.SUCCEEDED:
@@ -172,7 +214,9 @@ class AirbyteEntrypoint(object):
172
214
  yield from self._emit_queued_messages(self.source)
173
215
  yield AirbyteMessage(type=Type.CONNECTION_STATUS, connectionStatus=check_result)
174
216
 
175
- def discover(self, source_spec: ConnectorSpecification, config: TConfig) -> Iterable[AirbyteMessage]:
217
+ def discover(
218
+ self, source_spec: ConnectorSpecification, config: TConfig
219
+ ) -> Iterable[AirbyteMessage]:
176
220
  self.set_up_secret_filter(config, source_spec.connectionSpecification)
177
221
  if self.source.check_config_against_spec:
178
222
  self.validate_connection(source_spec, config)
@@ -181,7 +225,9 @@ class AirbyteEntrypoint(object):
181
225
  yield from self._emit_queued_messages(self.source)
182
226
  yield AirbyteMessage(type=Type.CATALOG, catalog=catalog)
183
227
 
184
- def read(self, source_spec: ConnectorSpecification, config: TConfig, catalog: Any, state: list[Any]) -> Iterable[AirbyteMessage]:
228
+ def read(
229
+ self, source_spec: ConnectorSpecification, config: TConfig, catalog: Any, state: list[Any]
230
+ ) -> Iterable[AirbyteMessage]:
185
231
  self.set_up_secret_filter(config, source_spec.connectionSpecification)
186
232
  if self.source.check_config_against_spec:
187
233
  self.validate_connection(source_spec, config)
@@ -194,16 +240,24 @@ class AirbyteEntrypoint(object):
194
240
  yield self.handle_record_counts(message, stream_message_counter)
195
241
 
196
242
  @staticmethod
197
- def handle_record_counts(message: AirbyteMessage, stream_message_count: DefaultDict[HashableStreamDescriptor, float]) -> AirbyteMessage:
243
+ def handle_record_counts(
244
+ message: AirbyteMessage, stream_message_count: DefaultDict[HashableStreamDescriptor, float]
245
+ ) -> AirbyteMessage:
198
246
  match message.type:
199
247
  case Type.RECORD:
200
- stream_message_count[HashableStreamDescriptor(name=message.record.stream, namespace=message.record.namespace)] += 1.0 # type: ignore[union-attr] # record has `stream` and `namespace`
248
+ stream_message_count[
249
+ HashableStreamDescriptor(
250
+ name=message.record.stream, namespace=message.record.namespace
251
+ )
252
+ ] += 1.0 # type: ignore[union-attr] # record has `stream` and `namespace`
201
253
  case Type.STATE:
202
254
  stream_descriptor = message_utils.get_stream_descriptor(message)
203
255
 
204
256
  # Set record count from the counter onto the state message
205
257
  message.state.sourceStats = message.state.sourceStats or AirbyteStateStats() # type: ignore[union-attr] # state has `sourceStats`
206
- message.state.sourceStats.recordCount = stream_message_count.get(stream_descriptor, 0.0) # type: ignore[union-attr] # state has `sourceStats`
258
+ message.state.sourceStats.recordCount = stream_message_count.get(
259
+ stream_descriptor, 0.0
260
+ ) # type: ignore[union-attr] # state has `sourceStats`
207
261
 
208
262
  # Reset the counter
209
263
  stream_message_count[stream_descriptor] = 0.0
@@ -283,7 +337,9 @@ def _init_internal_request_filter() -> None:
283
337
  )
284
338
 
285
339
  if not parsed_url.hostname:
286
- raise requests.exceptions.InvalidURL("Invalid URL specified: The endpoint that data is being requested from is not a valid URL")
340
+ raise requests.exceptions.InvalidURL(
341
+ "Invalid URL specified: The endpoint that data is being requested from is not a valid URL"
342
+ )
287
343
 
288
344
  try:
289
345
  is_private = _is_private_url(parsed_url.hostname, parsed_url.port) # type: ignore [arg-type]