airbyte-cdk 6.5.3rc2__py3-none-any.whl → 6.5.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/__init__.py +17 -2
- airbyte_cdk/config_observation.py +10 -3
- airbyte_cdk/connector.py +19 -9
- airbyte_cdk/connector_builder/connector_builder_handler.py +28 -8
- airbyte_cdk/connector_builder/main.py +26 -6
- airbyte_cdk/connector_builder/message_grouper.py +95 -25
- airbyte_cdk/destinations/destination.py +47 -14
- airbyte_cdk/destinations/vector_db_based/config.py +36 -14
- airbyte_cdk/destinations/vector_db_based/document_processor.py +49 -11
- airbyte_cdk/destinations/vector_db_based/embedder.py +52 -11
- airbyte_cdk/destinations/vector_db_based/test_utils.py +14 -4
- airbyte_cdk/destinations/vector_db_based/utils.py +8 -2
- airbyte_cdk/destinations/vector_db_based/writer.py +15 -4
- airbyte_cdk/entrypoint.py +82 -26
- airbyte_cdk/exception_handler.py +13 -3
- airbyte_cdk/logger.py +10 -2
- airbyte_cdk/models/airbyte_protocol.py +11 -5
- airbyte_cdk/models/airbyte_protocol_serializers.py +9 -3
- airbyte_cdk/models/well_known_types.py +1 -1
- airbyte_cdk/sources/abstract_source.py +63 -17
- airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +47 -14
- airbyte_cdk/sources/concurrent_source/concurrent_source.py +25 -7
- airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py +27 -6
- airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +9 -3
- airbyte_cdk/sources/connector_state_manager.py +32 -10
- airbyte_cdk/sources/declarative/async_job/job.py +3 -1
- airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +68 -14
- airbyte_cdk/sources/declarative/async_job/job_tracker.py +24 -6
- airbyte_cdk/sources/declarative/async_job/repository.py +3 -1
- airbyte_cdk/sources/declarative/auth/declarative_authenticator.py +3 -1
- airbyte_cdk/sources/declarative/auth/jwt.py +27 -7
- airbyte_cdk/sources/declarative/auth/oauth.py +35 -11
- airbyte_cdk/sources/declarative/auth/selective_authenticator.py +3 -1
- airbyte_cdk/sources/declarative/auth/token.py +25 -8
- airbyte_cdk/sources/declarative/checks/check_stream.py +12 -4
- airbyte_cdk/sources/declarative/checks/connection_checker.py +3 -1
- airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py +11 -3
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +106 -50
- airbyte_cdk/sources/declarative/datetime/min_max_datetime.py +20 -6
- airbyte_cdk/sources/declarative/declarative_source.py +3 -1
- airbyte_cdk/sources/declarative/declarative_stream.py +27 -6
- airbyte_cdk/sources/declarative/decoders/decoder.py +3 -1
- airbyte_cdk/sources/declarative/decoders/json_decoder.py +3 -1
- airbyte_cdk/sources/declarative/decoders/pagination_decoder_decorator.py +3 -1
- airbyte_cdk/sources/declarative/decoders/xml_decoder.py +6 -2
- airbyte_cdk/sources/declarative/extractors/dpath_extractor.py +6 -2
- airbyte_cdk/sources/declarative/extractors/record_filter.py +24 -7
- airbyte_cdk/sources/declarative/extractors/record_selector.py +10 -3
- airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py +15 -5
- airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +96 -31
- airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +22 -8
- airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +46 -15
- airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py +19 -5
- airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py +3 -1
- airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py +20 -2
- airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py +5 -1
- airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py +10 -3
- airbyte_cdk/sources/declarative/interpolation/interpolated_string.py +6 -2
- airbyte_cdk/sources/declarative/interpolation/interpolation.py +7 -1
- airbyte_cdk/sources/declarative/interpolation/jinja.py +6 -2
- airbyte_cdk/sources/declarative/interpolation/macros.py +19 -4
- airbyte_cdk/sources/declarative/manifest_declarative_source.py +106 -24
- airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py +7 -2
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +656 -678
- airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +13 -4
- airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +9 -2
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +782 -232
- airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +29 -7
- airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +25 -7
- airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +54 -15
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py +6 -2
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/header_helper.py +3 -1
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py +17 -5
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py +15 -5
- airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py +3 -1
- airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py +18 -8
- airbyte_cdk/sources/declarative/requesters/error_handlers/default_http_response_filter.py +16 -7
- airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +51 -14
- airbyte_cdk/sources/declarative/requesters/http_job_repository.py +29 -8
- airbyte_cdk/sources/declarative/requesters/http_requester.py +58 -16
- airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +49 -14
- airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +3 -1
- airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +3 -1
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +17 -5
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +24 -7
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +9 -3
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +3 -1
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +6 -2
- airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +19 -6
- airbyte_cdk/sources/declarative/requesters/request_options/default_request_options_provider.py +3 -1
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_nested_request_input_provider.py +21 -7
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py +18 -6
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +27 -8
- airbyte_cdk/sources/declarative/requesters/requester.py +3 -1
- airbyte_cdk/sources/declarative/retrievers/async_retriever.py +12 -5
- airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +105 -24
- airbyte_cdk/sources/declarative/schema/default_schema_loader.py +3 -1
- airbyte_cdk/sources/declarative/spec/spec.py +8 -2
- airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py +3 -1
- airbyte_cdk/sources/declarative/transformations/add_fields.py +12 -3
- airbyte_cdk/sources/declarative/transformations/remove_fields.py +6 -2
- airbyte_cdk/sources/declarative/types.py +8 -1
- airbyte_cdk/sources/declarative/yaml_declarative_source.py +3 -1
- airbyte_cdk/sources/embedded/base_integration.py +14 -4
- airbyte_cdk/sources/embedded/catalog.py +16 -4
- airbyte_cdk/sources/embedded/runner.py +19 -3
- airbyte_cdk/sources/embedded/tools.py +3 -1
- airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py +12 -4
- airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +27 -7
- airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +12 -6
- airbyte_cdk/sources/file_based/config/csv_format.py +21 -9
- airbyte_cdk/sources/file_based/config/file_based_stream_config.py +6 -2
- airbyte_cdk/sources/file_based/config/unstructured_format.py +10 -3
- airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py +2 -4
- airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py +7 -2
- airbyte_cdk/sources/file_based/exceptions.py +13 -15
- airbyte_cdk/sources/file_based/file_based_source.py +82 -24
- airbyte_cdk/sources/file_based/file_based_stream_reader.py +16 -5
- airbyte_cdk/sources/file_based/file_types/avro_parser.py +58 -17
- airbyte_cdk/sources/file_based/file_types/csv_parser.py +89 -26
- airbyte_cdk/sources/file_based/file_types/excel_parser.py +25 -7
- airbyte_cdk/sources/file_based/file_types/file_transfer.py +8 -2
- airbyte_cdk/sources/file_based/file_types/file_type_parser.py +4 -1
- airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +20 -6
- airbyte_cdk/sources/file_based/file_types/parquet_parser.py +57 -16
- airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +64 -15
- airbyte_cdk/sources/file_based/schema_helpers.py +33 -10
- airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py +3 -1
- airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py +16 -5
- airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +33 -10
- airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +47 -11
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py +13 -22
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +53 -17
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_cursor.py +17 -5
- airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py +3 -1
- airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +26 -9
- airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +67 -21
- airbyte_cdk/sources/http_logger.py +5 -1
- airbyte_cdk/sources/message/repository.py +18 -4
- airbyte_cdk/sources/source.py +17 -7
- airbyte_cdk/sources/streams/availability_strategy.py +9 -3
- airbyte_cdk/sources/streams/call_rate.py +63 -19
- airbyte_cdk/sources/streams/checkpoint/checkpoint_reader.py +31 -7
- airbyte_cdk/sources/streams/checkpoint/substream_resumable_full_refresh_cursor.py +6 -2
- airbyte_cdk/sources/streams/concurrent/adapters.py +77 -22
- airbyte_cdk/sources/streams/concurrent/cursor.py +56 -20
- airbyte_cdk/sources/streams/concurrent/default_stream.py +9 -2
- airbyte_cdk/sources/streams/concurrent/helpers.py +6 -2
- airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +9 -2
- airbyte_cdk/sources/streams/concurrent/partition_reader.py +4 -1
- airbyte_cdk/sources/streams/concurrent/partitions/record.py +10 -2
- airbyte_cdk/sources/streams/concurrent/partitions/types.py +6 -2
- airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +25 -10
- airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +32 -16
- airbyte_cdk/sources/streams/core.py +77 -22
- airbyte_cdk/sources/streams/http/availability_strategy.py +3 -1
- airbyte_cdk/sources/streams/http/error_handlers/default_error_mapping.py +4 -1
- airbyte_cdk/sources/streams/http/error_handlers/error_handler.py +3 -1
- airbyte_cdk/sources/streams/http/error_handlers/http_status_error_handler.py +16 -5
- airbyte_cdk/sources/streams/http/error_handlers/response_models.py +9 -3
- airbyte_cdk/sources/streams/http/exceptions.py +2 -2
- airbyte_cdk/sources/streams/http/http.py +133 -33
- airbyte_cdk/sources/streams/http/http_client.py +91 -29
- airbyte_cdk/sources/streams/http/rate_limiting.py +23 -7
- airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +19 -6
- airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +38 -11
- airbyte_cdk/sources/streams/http/requests_native_auth/token.py +13 -3
- airbyte_cdk/sources/types.py +5 -1
- airbyte_cdk/sources/utils/record_helper.py +12 -3
- airbyte_cdk/sources/utils/schema_helpers.py +9 -3
- airbyte_cdk/sources/utils/slice_logger.py +4 -1
- airbyte_cdk/sources/utils/transform.py +24 -9
- airbyte_cdk/sql/exceptions.py +19 -6
- airbyte_cdk/sql/secrets.py +3 -1
- airbyte_cdk/sql/shared/catalog_providers.py +13 -4
- airbyte_cdk/sql/shared/sql_processor.py +44 -14
- airbyte_cdk/test/catalog_builder.py +19 -8
- airbyte_cdk/test/entrypoint_wrapper.py +27 -8
- airbyte_cdk/test/mock_http/mocker.py +41 -11
- airbyte_cdk/test/mock_http/request.py +9 -3
- airbyte_cdk/test/mock_http/response.py +3 -1
- airbyte_cdk/test/mock_http/response_builder.py +29 -7
- airbyte_cdk/test/state_builder.py +10 -2
- airbyte_cdk/test/utils/data.py +6 -2
- airbyte_cdk/test/utils/http_mocking.py +3 -1
- airbyte_cdk/utils/airbyte_secrets_utils.py +3 -1
- airbyte_cdk/utils/analytics_message.py +10 -2
- airbyte_cdk/utils/datetime_format_inferrer.py +4 -1
- airbyte_cdk/utils/mapping_helpers.py +3 -1
- airbyte_cdk/utils/message_utils.py +11 -4
- airbyte_cdk/utils/print_buffer.py +6 -1
- airbyte_cdk/utils/schema_inferrer.py +30 -9
- airbyte_cdk/utils/spec_schema_transformations.py +3 -1
- airbyte_cdk/utils/traced_exception.py +35 -9
- {airbyte_cdk-6.5.3rc2.dist-info → airbyte_cdk-6.5.5.dist-info}/METADATA +7 -6
- {airbyte_cdk-6.5.3rc2.dist-info → airbyte_cdk-6.5.5.dist-info}/RECORD +198 -198
- {airbyte_cdk-6.5.3rc2.dist-info → airbyte_cdk-6.5.5.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-6.5.3rc2.dist-info → airbyte_cdk-6.5.5.dist-info}/WHEEL +0 -0
@@ -17,7 +17,11 @@ class SeparatorSplitterConfigModel(BaseModel):
|
|
17
17
|
title="Separators",
|
18
18
|
description='List of separator strings to split text fields by. The separator itself needs to be wrapped in double quotes, e.g. to split by the dot character, use ".". To split by a newline, use "\\n".',
|
19
19
|
)
|
20
|
-
keep_separator: bool = Field(
|
20
|
+
keep_separator: bool = Field(
|
21
|
+
default=False,
|
22
|
+
title="Keep separator",
|
23
|
+
description="Whether to keep the separator in the resulting chunks",
|
24
|
+
)
|
21
25
|
|
22
26
|
class Config(OneOfOptionConfig):
|
23
27
|
title = "By Separator"
|
@@ -68,18 +72,20 @@ class CodeSplitterConfigModel(BaseModel):
|
|
68
72
|
|
69
73
|
class Config(OneOfOptionConfig):
|
70
74
|
title = "By Programming Language"
|
71
|
-
description =
|
72
|
-
"Split the text by suitable delimiters based on the programming language. This is useful for splitting code into chunks."
|
73
|
-
)
|
75
|
+
description = "Split the text by suitable delimiters based on the programming language. This is useful for splitting code into chunks."
|
74
76
|
discriminator = "mode"
|
75
77
|
|
76
78
|
|
77
|
-
TextSplitterConfigModel = Union[
|
79
|
+
TextSplitterConfigModel = Union[
|
80
|
+
SeparatorSplitterConfigModel, MarkdownHeaderSplitterConfigModel, CodeSplitterConfigModel
|
81
|
+
]
|
78
82
|
|
79
83
|
|
80
84
|
class FieldNameMappingConfigModel(BaseModel):
|
81
85
|
from_field: str = Field(title="From field name", description="The field name in the source")
|
82
|
-
to_field: str = Field(
|
86
|
+
to_field: str = Field(
|
87
|
+
title="To field name", description="The field name to use in the destination"
|
88
|
+
)
|
83
89
|
|
84
90
|
|
85
91
|
class ProcessingConfigModel(BaseModel):
|
@@ -132,9 +138,7 @@ class OpenAIEmbeddingConfigModel(BaseModel):
|
|
132
138
|
|
133
139
|
class Config(OneOfOptionConfig):
|
134
140
|
title = "OpenAI"
|
135
|
-
description =
|
136
|
-
"Use the OpenAI API to embed text. This option is using the text-embedding-ada-002 model with 1536 embedding dimensions."
|
137
|
-
)
|
141
|
+
description = "Use the OpenAI API to embed text. This option is using the text-embedding-ada-002 model with 1536 embedding dimensions."
|
138
142
|
discriminator = "mode"
|
139
143
|
|
140
144
|
|
@@ -142,7 +146,10 @@ class OpenAICompatibleEmbeddingConfigModel(BaseModel):
|
|
142
146
|
mode: Literal["openai_compatible"] = Field("openai_compatible", const=True)
|
143
147
|
api_key: str = Field(title="API key", default="", airbyte_secret=True)
|
144
148
|
base_url: str = Field(
|
145
|
-
...,
|
149
|
+
...,
|
150
|
+
title="Base URL",
|
151
|
+
description="The base URL for your OpenAI-compatible service",
|
152
|
+
examples=["https://your-service-name.com"],
|
146
153
|
)
|
147
154
|
model_name: str = Field(
|
148
155
|
title="Model name",
|
@@ -151,7 +158,9 @@ class OpenAICompatibleEmbeddingConfigModel(BaseModel):
|
|
151
158
|
examples=["text-embedding-ada-002"],
|
152
159
|
)
|
153
160
|
dimensions: int = Field(
|
154
|
-
title="Embedding dimensions",
|
161
|
+
title="Embedding dimensions",
|
162
|
+
description="The number of dimensions the embedding model is generating",
|
163
|
+
examples=[1536, 384],
|
155
164
|
)
|
156
165
|
|
157
166
|
class Config(OneOfOptionConfig):
|
@@ -199,10 +208,16 @@ class FakeEmbeddingConfigModel(BaseModel):
|
|
199
208
|
class FromFieldEmbeddingConfigModel(BaseModel):
|
200
209
|
mode: Literal["from_field"] = Field("from_field", const=True)
|
201
210
|
field_name: str = Field(
|
202
|
-
...,
|
211
|
+
...,
|
212
|
+
title="Field name",
|
213
|
+
description="Name of the field in the record that contains the embedding",
|
214
|
+
examples=["embedding", "vector"],
|
203
215
|
)
|
204
216
|
dimensions: int = Field(
|
205
|
-
...,
|
217
|
+
...,
|
218
|
+
title="Embedding dimensions",
|
219
|
+
description="The number of dimensions the embedding model is generating",
|
220
|
+
examples=[1536, 384],
|
206
221
|
)
|
207
222
|
|
208
223
|
class Config(OneOfOptionConfig):
|
@@ -241,7 +256,14 @@ class VectorDBConfigModel(BaseModel):
|
|
241
256
|
FakeEmbeddingConfigModel,
|
242
257
|
AzureOpenAIEmbeddingConfigModel,
|
243
258
|
OpenAICompatibleEmbeddingConfigModel,
|
244
|
-
] = Field(
|
259
|
+
] = Field(
|
260
|
+
...,
|
261
|
+
title="Embedding",
|
262
|
+
description="Embedding configuration",
|
263
|
+
discriminator="mode",
|
264
|
+
group="embedding",
|
265
|
+
type="object",
|
266
|
+
)
|
245
267
|
processing: ProcessingConfigModel
|
246
268
|
omit_raw_text: bool = Field(
|
247
269
|
default=False,
|
@@ -8,9 +8,18 @@ from dataclasses import dataclass
|
|
8
8
|
from typing import Any, Dict, List, Mapping, Optional, Tuple
|
9
9
|
|
10
10
|
import dpath
|
11
|
-
from airbyte_cdk.destinations.vector_db_based.config import
|
11
|
+
from airbyte_cdk.destinations.vector_db_based.config import (
|
12
|
+
ProcessingConfigModel,
|
13
|
+
SeparatorSplitterConfigModel,
|
14
|
+
TextSplitterConfigModel,
|
15
|
+
)
|
12
16
|
from airbyte_cdk.destinations.vector_db_based.utils import create_stream_identifier
|
13
|
-
from airbyte_cdk.models import
|
17
|
+
from airbyte_cdk.models import (
|
18
|
+
AirbyteRecordMessage,
|
19
|
+
ConfiguredAirbyteCatalog,
|
20
|
+
ConfiguredAirbyteStream,
|
21
|
+
DestinationSyncMode,
|
22
|
+
)
|
14
23
|
from airbyte_cdk.utils.traced_exception import AirbyteTracedException, FailureType
|
15
24
|
from langchain.text_splitter import Language, RecursiveCharacterTextSplitter
|
16
25
|
from langchain.utils import stringify_dict
|
@@ -30,7 +39,14 @@ class Chunk:
|
|
30
39
|
embedding: Optional[List[float]] = None
|
31
40
|
|
32
41
|
|
33
|
-
headers_to_split_on = [
|
42
|
+
headers_to_split_on = [
|
43
|
+
"(?:^|\n)# ",
|
44
|
+
"(?:^|\n)## ",
|
45
|
+
"(?:^|\n)### ",
|
46
|
+
"(?:^|\n)#### ",
|
47
|
+
"(?:^|\n)##### ",
|
48
|
+
"(?:^|\n)###### ",
|
49
|
+
]
|
34
50
|
|
35
51
|
|
36
52
|
class DocumentProcessor:
|
@@ -64,7 +80,10 @@ class DocumentProcessor:
|
|
64
80
|
return None
|
65
81
|
|
66
82
|
def _get_text_splitter(
|
67
|
-
self,
|
83
|
+
self,
|
84
|
+
chunk_size: int,
|
85
|
+
chunk_overlap: int,
|
86
|
+
splitter_config: Optional[TextSplitterConfigModel],
|
68
87
|
) -> RecursiveCharacterTextSplitter:
|
69
88
|
if splitter_config is None:
|
70
89
|
splitter_config = SeparatorSplitterConfigModel(mode="separator")
|
@@ -89,14 +108,20 @@ class DocumentProcessor:
|
|
89
108
|
return RecursiveCharacterTextSplitter.from_tiktoken_encoder(
|
90
109
|
chunk_size=chunk_size,
|
91
110
|
chunk_overlap=chunk_overlap,
|
92
|
-
separators=RecursiveCharacterTextSplitter.get_separators_for_language(
|
111
|
+
separators=RecursiveCharacterTextSplitter.get_separators_for_language(
|
112
|
+
Language(splitter_config.language)
|
113
|
+
),
|
93
114
|
disallowed_special=(),
|
94
115
|
)
|
95
116
|
|
96
117
|
def __init__(self, config: ProcessingConfigModel, catalog: ConfiguredAirbyteCatalog):
|
97
|
-
self.streams = {
|
118
|
+
self.streams = {
|
119
|
+
create_stream_identifier(stream.stream): stream for stream in catalog.streams
|
120
|
+
}
|
98
121
|
|
99
|
-
self.splitter = self._get_text_splitter(
|
122
|
+
self.splitter = self._get_text_splitter(
|
123
|
+
config.chunk_size, config.chunk_overlap, config.text_splitter
|
124
|
+
)
|
100
125
|
self.text_fields = config.text_fields
|
101
126
|
self.metadata_fields = config.metadata_fields
|
102
127
|
self.field_name_mappings = config.field_name_mappings
|
@@ -119,10 +144,18 @@ class DocumentProcessor:
|
|
119
144
|
failure_type=FailureType.config_error,
|
120
145
|
)
|
121
146
|
chunks = [
|
122
|
-
Chunk(
|
147
|
+
Chunk(
|
148
|
+
page_content=chunk_document.page_content,
|
149
|
+
metadata=chunk_document.metadata,
|
150
|
+
record=record,
|
151
|
+
)
|
123
152
|
for chunk_document in self._split_document(doc)
|
124
153
|
]
|
125
|
-
id_to_delete =
|
154
|
+
id_to_delete = (
|
155
|
+
doc.metadata[METADATA_RECORD_ID_FIELD]
|
156
|
+
if METADATA_RECORD_ID_FIELD in doc.metadata
|
157
|
+
else None
|
158
|
+
)
|
126
159
|
return chunks, id_to_delete
|
127
160
|
|
128
161
|
def _generate_document(self, record: AirbyteRecordMessage) -> Optional[Document]:
|
@@ -133,7 +166,9 @@ class DocumentProcessor:
|
|
133
166
|
metadata = self._extract_metadata(record)
|
134
167
|
return Document(page_content=text, metadata=metadata)
|
135
168
|
|
136
|
-
def _extract_relevant_fields(
|
169
|
+
def _extract_relevant_fields(
|
170
|
+
self, record: AirbyteRecordMessage, fields: Optional[List[str]]
|
171
|
+
) -> Dict[str, Any]:
|
137
172
|
relevant_fields = {}
|
138
173
|
if fields and len(fields) > 0:
|
139
174
|
for field in fields:
|
@@ -156,7 +191,10 @@ class DocumentProcessor:
|
|
156
191
|
stream_identifier = create_stream_identifier(record)
|
157
192
|
current_stream: ConfiguredAirbyteStream = self.streams[stream_identifier]
|
158
193
|
# if the sync mode is deduping, use the primary key to upsert existing records instead of appending new ones
|
159
|
-
if
|
194
|
+
if (
|
195
|
+
not current_stream.primary_key
|
196
|
+
or current_stream.destination_sync_mode != DestinationSyncMode.append_dedup
|
197
|
+
):
|
160
198
|
return None
|
161
199
|
|
162
200
|
primary_key = []
|
@@ -92,7 +92,9 @@ class BaseOpenAIEmbedder(Embedder):
|
|
92
92
|
batches = create_chunks(documents, batch_size=embedding_batch_size)
|
93
93
|
embeddings: List[Optional[List[float]]] = []
|
94
94
|
for batch in batches:
|
95
|
-
embeddings.extend(
|
95
|
+
embeddings.extend(
|
96
|
+
self.embeddings.embed_documents([chunk.page_content for chunk in batch])
|
97
|
+
)
|
96
98
|
return embeddings
|
97
99
|
|
98
100
|
@property
|
@@ -103,13 +105,30 @@ class BaseOpenAIEmbedder(Embedder):
|
|
103
105
|
|
104
106
|
class OpenAIEmbedder(BaseOpenAIEmbedder):
|
105
107
|
def __init__(self, config: OpenAIEmbeddingConfigModel, chunk_size: int):
|
106
|
-
super().__init__(
|
108
|
+
super().__init__(
|
109
|
+
OpenAIEmbeddings(
|
110
|
+
openai_api_key=config.openai_key, max_retries=15, disallowed_special=()
|
111
|
+
),
|
112
|
+
chunk_size,
|
113
|
+
) # type: ignore
|
107
114
|
|
108
115
|
|
109
116
|
class AzureOpenAIEmbedder(BaseOpenAIEmbedder):
|
110
117
|
def __init__(self, config: AzureOpenAIEmbeddingConfigModel, chunk_size: int):
|
111
118
|
# Azure OpenAI API has — as of 20230927 — a limit of 16 documents per request
|
112
|
-
super().__init__(
|
119
|
+
super().__init__(
|
120
|
+
OpenAIEmbeddings(
|
121
|
+
openai_api_key=config.openai_key,
|
122
|
+
chunk_size=16,
|
123
|
+
max_retries=15,
|
124
|
+
openai_api_type="azure",
|
125
|
+
openai_api_version="2023-05-15",
|
126
|
+
openai_api_base=config.api_base,
|
127
|
+
deployment=config.deployment,
|
128
|
+
disallowed_special=(),
|
129
|
+
),
|
130
|
+
chunk_size,
|
131
|
+
) # type: ignore
|
113
132
|
|
114
133
|
|
115
134
|
COHERE_VECTOR_SIZE = 1024
|
@@ -119,7 +138,9 @@ class CohereEmbedder(Embedder):
|
|
119
138
|
def __init__(self, config: CohereEmbeddingConfigModel):
|
120
139
|
super().__init__()
|
121
140
|
# Client is set internally
|
122
|
-
self.embeddings = CohereEmbeddings(
|
141
|
+
self.embeddings = CohereEmbeddings(
|
142
|
+
cohere_api_key=config.cohere_key, model="embed-english-light-v2.0"
|
143
|
+
) # type: ignore
|
123
144
|
|
124
145
|
def check(self) -> Optional[str]:
|
125
146
|
try:
|
@@ -129,7 +150,10 @@ class CohereEmbedder(Embedder):
|
|
129
150
|
return None
|
130
151
|
|
131
152
|
def embed_documents(self, documents: List[Document]) -> List[Optional[List[float]]]:
|
132
|
-
return cast(
|
153
|
+
return cast(
|
154
|
+
List[Optional[List[float]]],
|
155
|
+
self.embeddings.embed_documents([document.page_content for document in documents]),
|
156
|
+
)
|
133
157
|
|
134
158
|
@property
|
135
159
|
def embedding_dimensions(self) -> int:
|
@@ -150,7 +174,10 @@ class FakeEmbedder(Embedder):
|
|
150
174
|
return None
|
151
175
|
|
152
176
|
def embed_documents(self, documents: List[Document]) -> List[Optional[List[float]]]:
|
153
|
-
return cast(
|
177
|
+
return cast(
|
178
|
+
List[Optional[List[float]]],
|
179
|
+
self.embeddings.embed_documents([document.page_content for document in documents]),
|
180
|
+
)
|
154
181
|
|
155
182
|
@property
|
156
183
|
def embedding_dimensions(self) -> int:
|
@@ -167,11 +194,20 @@ class OpenAICompatibleEmbedder(Embedder):
|
|
167
194
|
self.config = config
|
168
195
|
# Client is set internally
|
169
196
|
# Always set an API key even if there is none defined in the config because the validator will fail otherwise. Embedding APIs that don't require an API key don't fail if one is provided, so this is not breaking usage.
|
170
|
-
self.embeddings = LocalAIEmbeddings(
|
197
|
+
self.embeddings = LocalAIEmbeddings(
|
198
|
+
model=config.model_name,
|
199
|
+
openai_api_key=config.api_key or "dummy-api-key",
|
200
|
+
openai_api_base=config.base_url,
|
201
|
+
max_retries=15,
|
202
|
+
disallowed_special=(),
|
203
|
+
) # type: ignore
|
171
204
|
|
172
205
|
def check(self) -> Optional[str]:
|
173
206
|
deployment_mode = os.environ.get("DEPLOYMENT_MODE", "")
|
174
|
-
if
|
207
|
+
if (
|
208
|
+
deployment_mode.casefold() == CLOUD_DEPLOYMENT_MODE
|
209
|
+
and not self.config.base_url.startswith("https://")
|
210
|
+
):
|
175
211
|
return "Base URL must start with https://"
|
176
212
|
|
177
213
|
try:
|
@@ -181,7 +217,10 @@ class OpenAICompatibleEmbedder(Embedder):
|
|
181
217
|
return None
|
182
218
|
|
183
219
|
def embed_documents(self, documents: List[Document]) -> List[Optional[List[float]]]:
|
184
|
-
return cast(
|
220
|
+
return cast(
|
221
|
+
List[Optional[List[float]]],
|
222
|
+
self.embeddings.embed_documents([document.page_content for document in documents]),
|
223
|
+
)
|
185
224
|
|
186
225
|
@property
|
187
226
|
def embedding_dimensions(self) -> int:
|
@@ -254,8 +293,10 @@ def create_from_config(
|
|
254
293
|
],
|
255
294
|
processing_config: ProcessingConfigModel,
|
256
295
|
) -> Embedder:
|
257
|
-
|
258
296
|
if embedding_config.mode == "azure_openai" or embedding_config.mode == "openai":
|
259
|
-
return cast(
|
297
|
+
return cast(
|
298
|
+
Embedder,
|
299
|
+
embedder_map[embedding_config.mode](embedding_config, processing_config.chunk_size),
|
300
|
+
)
|
260
301
|
else:
|
261
302
|
return cast(Embedder, embedder_map[embedding_config.mode](embedding_config))
|
@@ -26,12 +26,19 @@ class BaseIntegrationTest(unittest.TestCase):
|
|
26
26
|
It provides helper methods to create Airbyte catalogs, records and state messages.
|
27
27
|
"""
|
28
28
|
|
29
|
-
def _get_configured_catalog(
|
30
|
-
|
29
|
+
def _get_configured_catalog(
|
30
|
+
self, destination_mode: DestinationSyncMode
|
31
|
+
) -> ConfiguredAirbyteCatalog:
|
32
|
+
stream_schema = {
|
33
|
+
"type": "object",
|
34
|
+
"properties": {"str_col": {"type": "str"}, "int_col": {"type": "integer"}},
|
35
|
+
}
|
31
36
|
|
32
37
|
overwrite_stream = ConfiguredAirbyteStream(
|
33
38
|
stream=AirbyteStream(
|
34
|
-
name="mystream",
|
39
|
+
name="mystream",
|
40
|
+
json_schema=stream_schema,
|
41
|
+
supported_sync_modes=[SyncMode.incremental, SyncMode.full_refresh],
|
35
42
|
),
|
36
43
|
primary_key=[["int_col"]],
|
37
44
|
sync_mode=SyncMode.incremental,
|
@@ -45,7 +52,10 @@ class BaseIntegrationTest(unittest.TestCase):
|
|
45
52
|
|
46
53
|
def _record(self, stream: str, str_value: str, int_value: int) -> AirbyteMessage:
|
47
54
|
return AirbyteMessage(
|
48
|
-
type=Type.RECORD,
|
55
|
+
type=Type.RECORD,
|
56
|
+
record=AirbyteRecordMessage(
|
57
|
+
stream=stream, data={"str_col": str_value, "int_col": int_value}, emitted_at=0
|
58
|
+
),
|
49
59
|
)
|
50
60
|
|
51
61
|
def setUp(self) -> None:
|
@@ -10,7 +10,11 @@ from airbyte_cdk.models import AirbyteRecordMessage, AirbyteStream
|
|
10
10
|
|
11
11
|
|
12
12
|
def format_exception(exception: Exception) -> str:
|
13
|
-
return
|
13
|
+
return (
|
14
|
+
str(exception)
|
15
|
+
+ "\n"
|
16
|
+
+ "".join(traceback.TracebackException.from_exception(exception).format())
|
17
|
+
)
|
14
18
|
|
15
19
|
|
16
20
|
def create_chunks(iterable: Iterable[Any], batch_size: int) -> Iterator[Tuple[Any, ...]]:
|
@@ -26,4 +30,6 @@ def create_stream_identifier(stream: Union[AirbyteStream, AirbyteRecordMessage])
|
|
26
30
|
if isinstance(stream, AirbyteStream):
|
27
31
|
return str(stream.name if stream.namespace is None else f"{stream.namespace}_{stream.name}")
|
28
32
|
else:
|
29
|
-
return str(
|
33
|
+
return str(
|
34
|
+
stream.stream if stream.namespace is None else f"{stream.namespace}_{stream.stream}"
|
35
|
+
)
|
@@ -27,7 +27,12 @@ class Writer:
|
|
27
27
|
"""
|
28
28
|
|
29
29
|
def __init__(
|
30
|
-
self,
|
30
|
+
self,
|
31
|
+
processing_config: ProcessingConfigModel,
|
32
|
+
indexer: Indexer,
|
33
|
+
embedder: Embedder,
|
34
|
+
batch_size: int,
|
35
|
+
omit_raw_text: bool,
|
31
36
|
) -> None:
|
32
37
|
self.processing_config = processing_config
|
33
38
|
self.indexer = indexer
|
@@ -54,7 +59,9 @@ class Writer:
|
|
54
59
|
self.indexer.delete(ids, namespace, stream)
|
55
60
|
|
56
61
|
for (namespace, stream), chunks in self.chunks.items():
|
57
|
-
embeddings = self.embedder.embed_documents(
|
62
|
+
embeddings = self.embedder.embed_documents(
|
63
|
+
[self._convert_to_document(chunk) for chunk in chunks]
|
64
|
+
)
|
58
65
|
for i, document in enumerate(chunks):
|
59
66
|
document.embedding = embeddings[i]
|
60
67
|
if self.omit_raw_text:
|
@@ -63,7 +70,9 @@ class Writer:
|
|
63
70
|
|
64
71
|
self._init_batch()
|
65
72
|
|
66
|
-
def write(
|
73
|
+
def write(
|
74
|
+
self, configured_catalog: ConfiguredAirbyteCatalog, input_messages: Iterable[AirbyteMessage]
|
75
|
+
) -> Iterable[AirbyteMessage]:
|
67
76
|
self.processor = DocumentProcessor(self.processing_config, configured_catalog)
|
68
77
|
self.indexer.pre_sync(configured_catalog)
|
69
78
|
for message in input_messages:
|
@@ -76,7 +85,9 @@ class Writer:
|
|
76
85
|
record_chunks, record_id_to_delete = self.processor.process(message.record)
|
77
86
|
self.chunks[(message.record.namespace, message.record.stream)].extend(record_chunks)
|
78
87
|
if record_id_to_delete is not None:
|
79
|
-
self.ids_to_delete[(message.record.namespace, message.record.stream)].append(
|
88
|
+
self.ids_to_delete[(message.record.namespace, message.record.stream)].append(
|
89
|
+
record_id_to_delete
|
90
|
+
)
|
80
91
|
self.number_of_chunks += len(record_chunks)
|
81
92
|
if self.number_of_chunks >= self.batch_size:
|
82
93
|
self._process_batch()
|
airbyte_cdk/entrypoint.py
CHANGED
@@ -62,33 +62,54 @@ class AirbyteEntrypoint(object):
|
|
62
62
|
def parse_args(args: List[str]) -> argparse.Namespace:
|
63
63
|
# set up parent parsers
|
64
64
|
parent_parser = argparse.ArgumentParser(add_help=False)
|
65
|
-
parent_parser.add_argument(
|
65
|
+
parent_parser.add_argument(
|
66
|
+
"--debug", action="store_true", help="enables detailed debug logs related to the sync"
|
67
|
+
)
|
66
68
|
main_parser = argparse.ArgumentParser()
|
67
69
|
subparsers = main_parser.add_subparsers(title="commands", dest="command")
|
68
70
|
|
69
71
|
# spec
|
70
|
-
subparsers.add_parser(
|
72
|
+
subparsers.add_parser(
|
73
|
+
"spec", help="outputs the json configuration specification", parents=[parent_parser]
|
74
|
+
)
|
71
75
|
|
72
76
|
# check
|
73
|
-
check_parser = subparsers.add_parser(
|
77
|
+
check_parser = subparsers.add_parser(
|
78
|
+
"check", help="checks the config can be used to connect", parents=[parent_parser]
|
79
|
+
)
|
74
80
|
required_check_parser = check_parser.add_argument_group("required named arguments")
|
75
|
-
required_check_parser.add_argument(
|
81
|
+
required_check_parser.add_argument(
|
82
|
+
"--config", type=str, required=True, help="path to the json configuration file"
|
83
|
+
)
|
76
84
|
|
77
85
|
# discover
|
78
86
|
discover_parser = subparsers.add_parser(
|
79
|
-
"discover",
|
87
|
+
"discover",
|
88
|
+
help="outputs a catalog describing the source's schema",
|
89
|
+
parents=[parent_parser],
|
80
90
|
)
|
81
91
|
required_discover_parser = discover_parser.add_argument_group("required named arguments")
|
82
|
-
required_discover_parser.add_argument(
|
92
|
+
required_discover_parser.add_argument(
|
93
|
+
"--config", type=str, required=True, help="path to the json configuration file"
|
94
|
+
)
|
83
95
|
|
84
96
|
# read
|
85
|
-
read_parser = subparsers.add_parser(
|
97
|
+
read_parser = subparsers.add_parser(
|
98
|
+
"read", help="reads the source and outputs messages to STDOUT", parents=[parent_parser]
|
99
|
+
)
|
86
100
|
|
87
|
-
read_parser.add_argument(
|
101
|
+
read_parser.add_argument(
|
102
|
+
"--state", type=str, required=False, help="path to the json-encoded state file"
|
103
|
+
)
|
88
104
|
required_read_parser = read_parser.add_argument_group("required named arguments")
|
89
|
-
required_read_parser.add_argument("--config", type=str, required=True, help="path to the json configuration file")
|
90
105
|
required_read_parser.add_argument(
|
91
|
-
"--
|
106
|
+
"--config", type=str, required=True, help="path to the json configuration file"
|
107
|
+
)
|
108
|
+
required_read_parser.add_argument(
|
109
|
+
"--catalog",
|
110
|
+
type=str,
|
111
|
+
required=True,
|
112
|
+
help="path to the catalog used to determine which data to read",
|
92
113
|
)
|
93
114
|
|
94
115
|
return main_parser.parse_args(args)
|
@@ -108,11 +129,14 @@ class AirbyteEntrypoint(object):
|
|
108
129
|
source_spec: ConnectorSpecification = self.source.spec(self.logger)
|
109
130
|
try:
|
110
131
|
with tempfile.TemporaryDirectory() as temp_dir:
|
111
|
-
os.environ[ENV_REQUEST_CACHE_PATH] =
|
132
|
+
os.environ[ENV_REQUEST_CACHE_PATH] = (
|
133
|
+
temp_dir # set this as default directory for request_cache to store *.sqlite files
|
134
|
+
)
|
112
135
|
if cmd == "spec":
|
113
136
|
message = AirbyteMessage(type=Type.SPEC, spec=source_spec)
|
114
137
|
yield from [
|
115
|
-
self.airbyte_message_to_string(queued_message)
|
138
|
+
self.airbyte_message_to_string(queued_message)
|
139
|
+
for queued_message in self._emit_queued_messages(self.source)
|
116
140
|
]
|
117
141
|
yield self.airbyte_message_to_string(message)
|
118
142
|
else:
|
@@ -120,23 +144,38 @@ class AirbyteEntrypoint(object):
|
|
120
144
|
config = self.source.configure(raw_config, temp_dir)
|
121
145
|
|
122
146
|
yield from [
|
123
|
-
self.airbyte_message_to_string(queued_message)
|
147
|
+
self.airbyte_message_to_string(queued_message)
|
148
|
+
for queued_message in self._emit_queued_messages(self.source)
|
124
149
|
]
|
125
150
|
if cmd == "check":
|
126
|
-
yield from map(
|
151
|
+
yield from map(
|
152
|
+
AirbyteEntrypoint.airbyte_message_to_string,
|
153
|
+
self.check(source_spec, config),
|
154
|
+
)
|
127
155
|
elif cmd == "discover":
|
128
|
-
yield from map(
|
156
|
+
yield from map(
|
157
|
+
AirbyteEntrypoint.airbyte_message_to_string,
|
158
|
+
self.discover(source_spec, config),
|
159
|
+
)
|
129
160
|
elif cmd == "read":
|
130
161
|
config_catalog = self.source.read_catalog(parsed_args.catalog)
|
131
162
|
state = self.source.read_state(parsed_args.state)
|
132
163
|
|
133
|
-
yield from map(
|
164
|
+
yield from map(
|
165
|
+
AirbyteEntrypoint.airbyte_message_to_string,
|
166
|
+
self.read(source_spec, config, config_catalog, state),
|
167
|
+
)
|
134
168
|
else:
|
135
169
|
raise Exception("Unexpected command " + cmd)
|
136
170
|
finally:
|
137
|
-
yield from [
|
138
|
-
|
139
|
-
|
171
|
+
yield from [
|
172
|
+
self.airbyte_message_to_string(queued_message)
|
173
|
+
for queued_message in self._emit_queued_messages(self.source)
|
174
|
+
]
|
175
|
+
|
176
|
+
def check(
|
177
|
+
self, source_spec: ConnectorSpecification, config: TConfig
|
178
|
+
) -> Iterable[AirbyteMessage]:
|
140
179
|
self.set_up_secret_filter(config, source_spec.connectionSpecification)
|
141
180
|
try:
|
142
181
|
self.validate_connection(source_spec, config)
|
@@ -161,7 +200,10 @@ class AirbyteEntrypoint(object):
|
|
161
200
|
raise traced_exc
|
162
201
|
else:
|
163
202
|
yield AirbyteMessage(
|
164
|
-
type=Type.CONNECTION_STATUS,
|
203
|
+
type=Type.CONNECTION_STATUS,
|
204
|
+
connectionStatus=AirbyteConnectionStatus(
|
205
|
+
status=Status.FAILED, message=traced_exc.message
|
206
|
+
),
|
165
207
|
)
|
166
208
|
return
|
167
209
|
if check_result.status == Status.SUCCEEDED:
|
@@ -172,7 +214,9 @@ class AirbyteEntrypoint(object):
|
|
172
214
|
yield from self._emit_queued_messages(self.source)
|
173
215
|
yield AirbyteMessage(type=Type.CONNECTION_STATUS, connectionStatus=check_result)
|
174
216
|
|
175
|
-
def discover(
|
217
|
+
def discover(
|
218
|
+
self, source_spec: ConnectorSpecification, config: TConfig
|
219
|
+
) -> Iterable[AirbyteMessage]:
|
176
220
|
self.set_up_secret_filter(config, source_spec.connectionSpecification)
|
177
221
|
if self.source.check_config_against_spec:
|
178
222
|
self.validate_connection(source_spec, config)
|
@@ -181,7 +225,9 @@ class AirbyteEntrypoint(object):
|
|
181
225
|
yield from self._emit_queued_messages(self.source)
|
182
226
|
yield AirbyteMessage(type=Type.CATALOG, catalog=catalog)
|
183
227
|
|
184
|
-
def read(
|
228
|
+
def read(
|
229
|
+
self, source_spec: ConnectorSpecification, config: TConfig, catalog: Any, state: list[Any]
|
230
|
+
) -> Iterable[AirbyteMessage]:
|
185
231
|
self.set_up_secret_filter(config, source_spec.connectionSpecification)
|
186
232
|
if self.source.check_config_against_spec:
|
187
233
|
self.validate_connection(source_spec, config)
|
@@ -194,16 +240,24 @@ class AirbyteEntrypoint(object):
|
|
194
240
|
yield self.handle_record_counts(message, stream_message_counter)
|
195
241
|
|
196
242
|
@staticmethod
|
197
|
-
def handle_record_counts(
|
243
|
+
def handle_record_counts(
|
244
|
+
message: AirbyteMessage, stream_message_count: DefaultDict[HashableStreamDescriptor, float]
|
245
|
+
) -> AirbyteMessage:
|
198
246
|
match message.type:
|
199
247
|
case Type.RECORD:
|
200
|
-
stream_message_count[
|
248
|
+
stream_message_count[
|
249
|
+
HashableStreamDescriptor(
|
250
|
+
name=message.record.stream, namespace=message.record.namespace
|
251
|
+
)
|
252
|
+
] += 1.0 # type: ignore[union-attr] # record has `stream` and `namespace`
|
201
253
|
case Type.STATE:
|
202
254
|
stream_descriptor = message_utils.get_stream_descriptor(message)
|
203
255
|
|
204
256
|
# Set record count from the counter onto the state message
|
205
257
|
message.state.sourceStats = message.state.sourceStats or AirbyteStateStats() # type: ignore[union-attr] # state has `sourceStats`
|
206
|
-
message.state.sourceStats.recordCount = stream_message_count.get(
|
258
|
+
message.state.sourceStats.recordCount = stream_message_count.get(
|
259
|
+
stream_descriptor, 0.0
|
260
|
+
) # type: ignore[union-attr] # state has `sourceStats`
|
207
261
|
|
208
262
|
# Reset the counter
|
209
263
|
stream_message_count[stream_descriptor] = 0.0
|
@@ -283,7 +337,9 @@ def _init_internal_request_filter() -> None:
|
|
283
337
|
)
|
284
338
|
|
285
339
|
if not parsed_url.hostname:
|
286
|
-
raise requests.exceptions.InvalidURL(
|
340
|
+
raise requests.exceptions.InvalidURL(
|
341
|
+
"Invalid URL specified: The endpoint that data is being requested from is not a valid URL"
|
342
|
+
)
|
287
343
|
|
288
344
|
try:
|
289
345
|
is_private = _is_private_url(parsed_url.hostname, parsed_url.port) # type: ignore [arg-type]
|