airbyte-cdk 6.34.1.dev0__py3-none-any.whl → 6.35.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/connector_builder/connector_builder_handler.py +16 -12
- airbyte_cdk/connector_builder/test_reader/__init__.py +7 -0
- airbyte_cdk/connector_builder/test_reader/helpers.py +591 -0
- airbyte_cdk/connector_builder/test_reader/message_grouper.py +160 -0
- airbyte_cdk/connector_builder/test_reader/reader.py +441 -0
- airbyte_cdk/connector_builder/test_reader/types.py +75 -0
- airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +7 -7
- airbyte_cdk/sources/declarative/auth/jwt.py +17 -11
- airbyte_cdk/sources/declarative/auth/oauth.py +6 -1
- airbyte_cdk/sources/declarative/auth/token.py +3 -8
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +30 -79
- airbyte_cdk/sources/declarative/declarative_component_schema.yaml +213 -100
- airbyte_cdk/sources/declarative/declarative_stream.py +3 -1
- airbyte_cdk/sources/declarative/decoders/__init__.py +0 -4
- airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py +18 -3
- airbyte_cdk/sources/declarative/decoders/json_decoder.py +12 -58
- airbyte_cdk/sources/declarative/extractors/record_selector.py +12 -3
- airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +56 -25
- airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +12 -6
- airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +6 -2
- airbyte_cdk/sources/declarative/interpolation/__init__.py +1 -1
- airbyte_cdk/sources/declarative/interpolation/filters.py +2 -1
- airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py +1 -1
- airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py +1 -1
- airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py +1 -1
- airbyte_cdk/sources/declarative/interpolation/interpolated_string.py +1 -1
- airbyte_cdk/sources/declarative/interpolation/interpolation.py +2 -1
- airbyte_cdk/sources/declarative/interpolation/jinja.py +14 -1
- airbyte_cdk/sources/declarative/interpolation/macros.py +19 -4
- airbyte_cdk/sources/declarative/manifest_declarative_source.py +9 -0
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +150 -41
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +234 -84
- airbyte_cdk/sources/declarative/partition_routers/async_job_partition_router.py +5 -5
- airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +4 -2
- airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +26 -18
- airbyte_cdk/sources/declarative/requesters/http_requester.py +8 -2
- airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +16 -5
- airbyte_cdk/sources/declarative/requesters/request_option.py +83 -4
- airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +7 -6
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_nested_request_input_provider.py +1 -4
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py +0 -3
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +2 -47
- airbyte_cdk/sources/declarative/retrievers/async_retriever.py +6 -12
- airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +4 -3
- airbyte_cdk/sources/declarative/transformations/add_fields.py +4 -4
- airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +2 -1
- airbyte_cdk/sources/file_based/config/validate_config_transfer_modes.py +81 -0
- airbyte_cdk/sources/file_based/file_based_source.py +70 -37
- airbyte_cdk/sources/file_based/file_based_stream_reader.py +107 -12
- airbyte_cdk/sources/file_based/stream/__init__.py +10 -1
- airbyte_cdk/sources/file_based/stream/identities_stream.py +47 -0
- airbyte_cdk/sources/file_based/stream/permissions_file_based_stream.py +85 -0
- airbyte_cdk/sources/specs/transfer_modes.py +26 -0
- airbyte_cdk/sources/streams/call_rate.py +185 -47
- airbyte_cdk/sources/streams/http/http.py +1 -2
- airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +217 -56
- airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +144 -73
- airbyte_cdk/sources/streams/permissions/identities_stream.py +75 -0
- airbyte_cdk/test/mock_http/mocker.py +9 -1
- airbyte_cdk/test/mock_http/response.py +6 -3
- airbyte_cdk/utils/datetime_helpers.py +48 -66
- airbyte_cdk/utils/mapping_helpers.py +126 -26
- {airbyte_cdk-6.34.1.dev0.dist-info → airbyte_cdk-6.35.0.dist-info}/METADATA +1 -1
- {airbyte_cdk-6.34.1.dev0.dist-info → airbyte_cdk-6.35.0.dist-info}/RECORD +68 -59
- airbyte_cdk/connector_builder/message_grouper.py +0 -448
- {airbyte_cdk-6.34.1.dev0.dist-info → airbyte_cdk-6.35.0.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-6.34.1.dev0.dist-info → airbyte_cdk-6.35.0.dist-info}/LICENSE_SHORT +0 -0
- {airbyte_cdk-6.34.1.dev0.dist-info → airbyte_cdk-6.35.0.dist-info}/WHEEL +0 -0
- {airbyte_cdk-6.34.1.dev0.dist-info → airbyte_cdk-6.35.0.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,160 @@
|
|
1
|
+
#
|
2
|
+
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
3
|
+
#
|
4
|
+
|
5
|
+
|
6
|
+
from typing import Any, Dict, Iterator, List, Mapping, Optional
|
7
|
+
|
8
|
+
from airbyte_cdk.connector_builder.models import (
|
9
|
+
HttpRequest,
|
10
|
+
HttpResponse,
|
11
|
+
StreamReadPages,
|
12
|
+
)
|
13
|
+
from airbyte_cdk.models import (
|
14
|
+
AirbyteMessage,
|
15
|
+
)
|
16
|
+
from airbyte_cdk.utils.datetime_format_inferrer import DatetimeFormatInferrer
|
17
|
+
from airbyte_cdk.utils.schema_inferrer import (
|
18
|
+
SchemaInferrer,
|
19
|
+
)
|
20
|
+
|
21
|
+
from .helpers import (
|
22
|
+
airbyte_message_to_json,
|
23
|
+
handle_current_page,
|
24
|
+
handle_current_slice,
|
25
|
+
handle_log_message,
|
26
|
+
handle_record_message,
|
27
|
+
is_config_update_message,
|
28
|
+
is_log_message,
|
29
|
+
is_record_message,
|
30
|
+
is_state_message,
|
31
|
+
is_trace_with_error,
|
32
|
+
parse_slice_description,
|
33
|
+
should_close_page,
|
34
|
+
should_close_page_for_slice,
|
35
|
+
should_process_slice_descriptor,
|
36
|
+
)
|
37
|
+
from .types import MESSAGE_GROUPS
|
38
|
+
|
39
|
+
|
40
|
+
def get_message_groups(
    messages: Iterator[AirbyteMessage],
    schema_inferrer: SchemaInferrer,
    datetime_format_inferrer: DatetimeFormatInferrer,
    limit: int,
) -> MESSAGE_GROUPS:
    """
    Group a raw stream of AirbyteMessages into connector-builder message groups.

    Consumes ``messages`` until either ``limit`` record messages have been
    processed or the iterator is exhausted, accumulating records into pages and
    pages into slices. Along the way it delegates to the ``.helpers`` routines
    to classify each message and to open/close page and slice boundaries.

    Parameters:
        messages: Iterator of AirbyteMessage instances produced by a read.
        schema_inferrer: Updated with every record to infer the stream schema.
        datetime_format_inferrer: Updated with every record to infer datetime formats.
        limit: Maximum number of record messages to process before stopping.

    Yields:
        Depending on the message encountered:
        - StreamReadSlices: a completed slice (its pages, descriptor, and state).
        - AuxiliaryRequest / log output produced by handle_log_message.
        - AirbyteTraceMessage payloads for error traces.
        - AirbyteControlMessage payloads for config updates.

    Notes:
        The final ``else`` clause belongs to the ``while`` loop (Python
        while/else): since the loop body contains no ``break``, it always runs
        after the loop ends and flushes the in-progress page (if any) and the
        final slice.
    """

    # Mutable grouping state shared across loop iterations.
    records_count = 0
    at_least_one_page_in_group = False
    current_page_records: List[Mapping[str, Any]] = []
    current_slice_descriptor: Optional[Dict[str, Any]] = None
    current_slice_pages: List[StreamReadPages] = []
    current_page_request: Optional[HttpRequest] = None
    current_page_response: Optional[HttpResponse] = None
    latest_state_message: Optional[Dict[str, Any]] = None

    # Stops when the record limit is reached or the iterator is exhausted
    # (the walrus assignment yields None, which is falsy).
    while records_count < limit and (message := next(messages, None)):
        json_message = airbyte_message_to_json(message)

        # A new page boundary: fold the accumulated request/response/records
        # into current_slice_pages and reset the per-page accumulators.
        if should_close_page(at_least_one_page_in_group, message, json_message):
            current_page_request, current_page_response = handle_current_page(
                current_page_request,
                current_page_response,
                current_slice_pages,
                current_page_records,
            )

        # A new slice boundary: emit the completed slice, then start a fresh
        # one described by the slice descriptor carried in the log message.
        if should_close_page_for_slice(at_least_one_page_in_group, message):
            yield handle_current_slice(
                current_slice_pages,
                current_slice_descriptor,
                latest_state_message,
            )
            current_slice_descriptor = parse_slice_description(message.log.message)  # type: ignore
            current_slice_pages = []
            at_least_one_page_in_group = False
        elif should_process_slice_descriptor(message):
            # parsing the first slice
            current_slice_descriptor = parse_slice_description(message.log.message)  # type: ignore
        elif is_log_message(message):
            # May flip the page-open flag, update the in-flight request or
            # response, and/or produce an auxiliary request to surface.
            (
                at_least_one_page_in_group,
                current_page_request,
                current_page_response,
                log_or_auxiliary_request,
            ) = handle_log_message(
                message,
                json_message,
                at_least_one_page_in_group,
                current_page_request,
                current_page_response,
            )
            if log_or_auxiliary_request:
                yield log_or_auxiliary_request
        elif is_trace_with_error(message):
            if message.trace is not None:
                yield message.trace
        elif is_record_message(message):
            # Feeds both inferrers and appends to the current page's records;
            # returns the incremented record counter.
            records_count = handle_record_message(
                message,
                schema_inferrer,
                datetime_format_inferrer,
                records_count,
                current_page_records,
            )
        elif is_config_update_message(message):
            if message.control is not None:
                yield message.control
        elif is_state_message(message):
            latest_state_message = message.state  # type: ignore

    else:
        # while/else: always reached (no break above). Flush any partially
        # accumulated page, then emit the final slice.
        if current_page_request or current_page_response or current_page_records:
            handle_current_page(
                current_page_request,
                current_page_response,
                current_slice_pages,
                current_page_records,
            )
        yield handle_current_slice(
            current_slice_pages,
            current_slice_descriptor,
            latest_state_message,
        )
|
@@ -0,0 +1,441 @@
|
|
1
|
+
#
|
2
|
+
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
3
|
+
#
|
4
|
+
|
5
|
+
|
6
|
+
import logging
|
7
|
+
from typing import Any, Dict, Iterator, List, Mapping, Optional, Union
|
8
|
+
|
9
|
+
from airbyte_cdk.connector_builder.models import (
|
10
|
+
AuxiliaryRequest,
|
11
|
+
LogMessage,
|
12
|
+
StreamRead,
|
13
|
+
StreamReadSlices,
|
14
|
+
)
|
15
|
+
from airbyte_cdk.entrypoint import AirbyteEntrypoint
|
16
|
+
from airbyte_cdk.models import (
|
17
|
+
AirbyteControlMessage,
|
18
|
+
AirbyteLogMessage,
|
19
|
+
AirbyteMessage,
|
20
|
+
AirbyteStateMessage,
|
21
|
+
AirbyteTraceMessage,
|
22
|
+
ConfiguredAirbyteCatalog,
|
23
|
+
TraceType,
|
24
|
+
)
|
25
|
+
from airbyte_cdk.sources.declarative.declarative_source import DeclarativeSource
|
26
|
+
from airbyte_cdk.utils import AirbyteTracedException
|
27
|
+
from airbyte_cdk.utils.datetime_format_inferrer import DatetimeFormatInferrer
|
28
|
+
from airbyte_cdk.utils.schema_inferrer import (
|
29
|
+
SchemaInferrer,
|
30
|
+
SchemaValidationException,
|
31
|
+
)
|
32
|
+
|
33
|
+
from .helpers import clean_config
|
34
|
+
from .message_grouper import get_message_groups
|
35
|
+
from .types import GROUPED_MESSAGES, INFERRED_SCHEMA_OUTPUT_TYPE, MESSAGE_GROUPS
|
36
|
+
|
37
|
+
|
38
|
+
class TestReader:
|
39
|
+
"""
|
40
|
+
A utility class for performing test reads from a declarative data source, primarily used to validate
|
41
|
+
connector configurations by performing partial stream reads.
|
42
|
+
|
43
|
+
Initialization:
|
44
|
+
|
45
|
+
TestReader(max_pages_per_slice: int, max_slices: int, max_record_limit: int = 1000)
|
46
|
+
Initializes a new instance of the TestReader class with limits on pages per slice, slices, and records
|
47
|
+
per read operation.
|
48
|
+
|
49
|
+
Public Methods:
|
50
|
+
run_test_read(source, config, configured_catalog, state, record_limit=None) -> StreamRead:
|
51
|
+
|
52
|
+
Executes a test read operation from the given declarative source. It configures and infers the schema,
|
53
|
+
processes the read messages (including logging and error handling), and returns a StreamRead object
|
54
|
+
that contains slices of data, log messages, auxiliary requests, and any inferred schema or datetime formats.
|
55
|
+
|
56
|
+
Parameters:
|
57
|
+
source (DeclarativeSource): The data source to read from.
|
58
|
+
config (Mapping[str, Any]): Configuration parameters for the source.
|
59
|
+
configured_catalog (ConfiguredAirbyteCatalog): Catalog containing stream configuration.
|
60
|
+
state (List[AirbyteStateMessage]): Current state information for the read.
|
61
|
+
record_limit (Optional[int]): Optional override for the maximum number of records to read.
|
62
|
+
|
63
|
+
Returns:
|
64
|
+
StreamRead: An object encapsulating logs, data slices, auxiliary requests, and inferred metadata,
|
65
|
+
along with indicators if any configured limit was reached.
|
66
|
+
|
67
|
+
"""
|
68
|
+
|
69
|
+
logger = logging.getLogger("airbyte.connector-builder")
|
70
|
+
|
71
|
+
def __init__(
|
72
|
+
self,
|
73
|
+
max_pages_per_slice: int,
|
74
|
+
max_slices: int,
|
75
|
+
max_record_limit: int = 1000,
|
76
|
+
) -> None:
|
77
|
+
self._max_pages_per_slice = max_pages_per_slice
|
78
|
+
self._max_slices = max_slices
|
79
|
+
self._max_record_limit = max_record_limit
|
80
|
+
|
81
|
+
def run_test_read(
|
82
|
+
self,
|
83
|
+
source: DeclarativeSource,
|
84
|
+
config: Mapping[str, Any],
|
85
|
+
configured_catalog: ConfiguredAirbyteCatalog,
|
86
|
+
state: List[AirbyteStateMessage],
|
87
|
+
record_limit: Optional[int] = None,
|
88
|
+
) -> StreamRead:
|
89
|
+
"""
|
90
|
+
Run a test read for the connector by reading from a single stream and inferring schema and datetime formats.
|
91
|
+
|
92
|
+
Parameters:
|
93
|
+
source (DeclarativeSource): The source instance providing the streams.
|
94
|
+
config (Mapping[str, Any]): The configuration settings to use for reading.
|
95
|
+
configured_catalog (ConfiguredAirbyteCatalog): The catalog specifying the stream configuration.
|
96
|
+
state (List[AirbyteStateMessage]): A list of state messages to resume the read.
|
97
|
+
record_limit (Optional[int], optional): Maximum number of records to read. Defaults to None.
|
98
|
+
|
99
|
+
Returns:
|
100
|
+
StreamRead: An object containing the following attributes:
|
101
|
+
- logs (List[str]): Log messages generated during the process.
|
102
|
+
- slices (List[Any]): The data slices read from the stream.
|
103
|
+
- test_read_limit_reached (bool): Indicates whether the record limit was reached.
|
104
|
+
- auxiliary_requests (Any): Any auxiliary requests generated during reading.
|
105
|
+
- inferred_schema (Any): The schema inferred from the stream data.
|
106
|
+
- latest_config_update (Any): The latest configuration update, if applicable.
|
107
|
+
- inferred_datetime_formats (Dict[str, str]): Mapping of fields to their inferred datetime formats.
|
108
|
+
"""
|
109
|
+
|
110
|
+
record_limit = self._check_record_limit(record_limit)
|
111
|
+
# The connector builder currently only supports reading from a single stream at a time
|
112
|
+
stream = source.streams(config)[0]
|
113
|
+
schema_inferrer = SchemaInferrer(
|
114
|
+
self._pk_to_nested_and_composite_field(stream.primary_key),
|
115
|
+
self._cursor_field_to_nested_and_composite_field(stream.cursor_field),
|
116
|
+
)
|
117
|
+
datetime_format_inferrer = DatetimeFormatInferrer()
|
118
|
+
message_group = get_message_groups(
|
119
|
+
self._read_stream(source, config, configured_catalog, state),
|
120
|
+
schema_inferrer,
|
121
|
+
datetime_format_inferrer,
|
122
|
+
record_limit,
|
123
|
+
)
|
124
|
+
|
125
|
+
slices, log_messages, auxiliary_requests, latest_config_update = self._categorise_groups(
|
126
|
+
message_group
|
127
|
+
)
|
128
|
+
schema, log_messages = self._get_infered_schema(
|
129
|
+
configured_catalog, schema_inferrer, log_messages
|
130
|
+
)
|
131
|
+
|
132
|
+
return StreamRead(
|
133
|
+
logs=log_messages,
|
134
|
+
slices=slices,
|
135
|
+
test_read_limit_reached=self._has_reached_limit(slices),
|
136
|
+
auxiliary_requests=auxiliary_requests,
|
137
|
+
inferred_schema=schema,
|
138
|
+
latest_config_update=self._get_latest_config_update(latest_config_update),
|
139
|
+
inferred_datetime_formats=datetime_format_inferrer.get_inferred_datetime_formats(),
|
140
|
+
)
|
141
|
+
|
142
|
+
def _pk_to_nested_and_composite_field(
|
143
|
+
self, field: Optional[Union[str, List[str], List[List[str]]]]
|
144
|
+
) -> List[List[str]]:
|
145
|
+
"""
|
146
|
+
Converts a primary key definition into a nested list representation.
|
147
|
+
|
148
|
+
The function accepts a primary key that can be a single string, a list of strings, or a list of lists of strings.
|
149
|
+
It ensures that the return value is always a list of lists of strings.
|
150
|
+
|
151
|
+
Parameters:
|
152
|
+
field (Optional[Union[str, List[str], List[List[str]]]]):
|
153
|
+
The primary key definition. This can be:
|
154
|
+
- None or an empty value: returns a list containing an empty list.
|
155
|
+
- A single string: returns a list containing one list with the string.
|
156
|
+
- A list of strings (composite key): returns a list where each key is encapsulated in its own list.
|
157
|
+
- A list of lists of strings (nested field structure): returns as is.
|
158
|
+
|
159
|
+
Returns:
|
160
|
+
List[List[str]]:
|
161
|
+
A nested list representation of the primary key.
|
162
|
+
"""
|
163
|
+
if not field:
|
164
|
+
return [[]]
|
165
|
+
|
166
|
+
if isinstance(field, str):
|
167
|
+
return [[field]]
|
168
|
+
|
169
|
+
is_composite_key = isinstance(field[0], str)
|
170
|
+
if is_composite_key:
|
171
|
+
return [[i] for i in field] # type: ignore # the type of field is expected to be List[str] here
|
172
|
+
|
173
|
+
return field # type: ignore # the type of field is expected to be List[List[str]] here
|
174
|
+
|
175
|
+
def _cursor_field_to_nested_and_composite_field(
|
176
|
+
self, field: Union[str, List[str]]
|
177
|
+
) -> List[List[str]]:
|
178
|
+
"""
|
179
|
+
Transforms the cursor field input into a nested list format suitable for further processing.
|
180
|
+
|
181
|
+
This function accepts a cursor field specification, which can be either:
|
182
|
+
- A falsy value (e.g., None or an empty string), in which case it returns a list containing an empty list.
|
183
|
+
- A string representing a simple cursor field. The string is wrapped in a nested list.
|
184
|
+
- A list of strings representing a composite or nested cursor field. The list is returned wrapped in an outer list.
|
185
|
+
|
186
|
+
Parameters:
|
187
|
+
field (Union[str, List[str]]): The cursor field specification. It can be:
|
188
|
+
- An empty or falsy value: returns [[]].
|
189
|
+
- A string: returns [[field]].
|
190
|
+
- A list of strings: returns [field] if the first element is a string.
|
191
|
+
|
192
|
+
Returns:
|
193
|
+
List[List[str]]: A nested list representation of the cursor field.
|
194
|
+
|
195
|
+
Raises:
|
196
|
+
ValueError: If the input is a list but its first element is not a string,
|
197
|
+
indicating an unsupported type for a cursor field.
|
198
|
+
"""
|
199
|
+
if not field:
|
200
|
+
return [[]]
|
201
|
+
|
202
|
+
if isinstance(field, str):
|
203
|
+
return [[field]]
|
204
|
+
|
205
|
+
is_nested_key = isinstance(field[0], str)
|
206
|
+
if is_nested_key:
|
207
|
+
return [field]
|
208
|
+
|
209
|
+
raise ValueError(f"Unknown type for cursor field `{field}")
|
210
|
+
|
211
|
+
def _check_record_limit(self, record_limit: Optional[int] = None) -> int:
|
212
|
+
"""
|
213
|
+
Checks and adjusts the provided record limit to ensure it falls within the valid range.
|
214
|
+
|
215
|
+
If record_limit is provided, it must be between 1 and self._max_record_limit inclusive.
|
216
|
+
If record_limit is None, it defaults to self._max_record_limit.
|
217
|
+
|
218
|
+
Args:
|
219
|
+
record_limit (Optional[int]): The requested record limit to validate.
|
220
|
+
|
221
|
+
Returns:
|
222
|
+
int: The validated record limit. If record_limit exceeds self._max_record_limit, the maximum allowed value is used.
|
223
|
+
|
224
|
+
Raises:
|
225
|
+
ValueError: If record_limit is provided and is not between 1 and self._max_record_limit.
|
226
|
+
"""
|
227
|
+
if record_limit is not None and not (1 <= record_limit <= self._max_record_limit):
|
228
|
+
raise ValueError(
|
229
|
+
f"Record limit must be between 1 and {self._max_record_limit}. Got {record_limit}"
|
230
|
+
)
|
231
|
+
|
232
|
+
if record_limit is None:
|
233
|
+
record_limit = self._max_record_limit
|
234
|
+
else:
|
235
|
+
record_limit = min(record_limit, self._max_record_limit)
|
236
|
+
|
237
|
+
return record_limit
|
238
|
+
|
239
|
+
def _categorise_groups(self, message_groups: MESSAGE_GROUPS) -> GROUPED_MESSAGES:
|
240
|
+
"""
|
241
|
+
Categorizes a sequence of message groups into slices, log messages, auxiliary requests, and the latest configuration update.
|
242
|
+
|
243
|
+
This function iterates over each message group in the provided collection and processes it based on its type:
|
244
|
+
- AirbyteLogMessage: Converts the log message into a LogMessage object and appends it to the log_messages list.
|
245
|
+
- AirbyteTraceMessage with type ERROR: Extracts error details, creates a LogMessage at the "ERROR" level, and appends it.
|
246
|
+
- AirbyteControlMessage: Updates the latest_config_update if the current message is more recent.
|
247
|
+
- AuxiliaryRequest: Appends the message to the auxiliary_requests list.
|
248
|
+
- StreamReadSlices: Appends the message to the slices list.
|
249
|
+
- Any other type: Raises a ValueError indicating an unknown message group type.
|
250
|
+
|
251
|
+
Parameters:
|
252
|
+
message_groups (MESSAGE_GROUPS): A collection of message groups to be processed.
|
253
|
+
|
254
|
+
Returns:
|
255
|
+
GROUPED_MESSAGES: A tuple containing four elements:
|
256
|
+
- slices: A list of StreamReadSlices objects.
|
257
|
+
- log_messages: A list of LogMessage objects.
|
258
|
+
- auxiliary_requests: A list of AuxiliaryRequest objects.
|
259
|
+
- latest_config_update: The most recent AirbyteControlMessage, or None if none was processed.
|
260
|
+
|
261
|
+
Raises:
|
262
|
+
ValueError: If a message group of an unknown type is encountered.
|
263
|
+
"""
|
264
|
+
|
265
|
+
slices = []
|
266
|
+
log_messages = []
|
267
|
+
auxiliary_requests = []
|
268
|
+
latest_config_update: Optional[AirbyteControlMessage] = None
|
269
|
+
|
270
|
+
for message_group in message_groups:
|
271
|
+
match message_group:
|
272
|
+
case AirbyteLogMessage():
|
273
|
+
log_messages.append(
|
274
|
+
LogMessage(message=message_group.message, level=message_group.level.value)
|
275
|
+
)
|
276
|
+
case AirbyteTraceMessage():
|
277
|
+
if message_group.type == TraceType.ERROR:
|
278
|
+
log_messages.append(
|
279
|
+
LogMessage(
|
280
|
+
message=message_group.error.message, # type: ignore
|
281
|
+
level="ERROR",
|
282
|
+
internal_message=message_group.error.internal_message, # type: ignore
|
283
|
+
stacktrace=message_group.error.stack_trace, # type: ignore
|
284
|
+
)
|
285
|
+
)
|
286
|
+
case AirbyteControlMessage():
|
287
|
+
if (
|
288
|
+
not latest_config_update
|
289
|
+
or latest_config_update.emitted_at <= message_group.emitted_at
|
290
|
+
):
|
291
|
+
latest_config_update = message_group
|
292
|
+
case AuxiliaryRequest():
|
293
|
+
auxiliary_requests.append(message_group)
|
294
|
+
case StreamReadSlices():
|
295
|
+
slices.append(message_group)
|
296
|
+
case _:
|
297
|
+
raise ValueError(f"Unknown message group type: {type(message_group)}")
|
298
|
+
|
299
|
+
return slices, log_messages, auxiliary_requests, latest_config_update
|
300
|
+
|
301
|
+
def _get_infered_schema(
|
302
|
+
self,
|
303
|
+
configured_catalog: ConfiguredAirbyteCatalog,
|
304
|
+
schema_inferrer: SchemaInferrer,
|
305
|
+
log_messages: List[LogMessage],
|
306
|
+
) -> INFERRED_SCHEMA_OUTPUT_TYPE:
|
307
|
+
"""
|
308
|
+
Retrieves the inferred schema from the given configured catalog using the provided schema inferrer.
|
309
|
+
|
310
|
+
This function processes a single stream from the configured catalog. It attempts to obtain the stream's
|
311
|
+
schema via the schema inferrer. If a SchemaValidationException occurs, each validation error is logged in the
|
312
|
+
provided log_messages list and the partially inferred schema (from the exception) is returned.
|
313
|
+
|
314
|
+
Parameters:
|
315
|
+
configured_catalog (ConfiguredAirbyteCatalog): The configured catalog that contains the stream metadata.
|
316
|
+
It is assumed that only one stream is defined.
|
317
|
+
schema_inferrer (SchemaInferrer): An instance responsible for inferring the schema for a given stream.
|
318
|
+
log_messages (List[LogMessage]): A list that will be appended with log messages, especially error messages
|
319
|
+
if schema validation issues arise.
|
320
|
+
|
321
|
+
Returns:
|
322
|
+
INFERRED_SCHEMA_OUTPUT_TYPE: A tuple consisting of the inferred schema and the updated list of log messages.
|
323
|
+
"""
|
324
|
+
|
325
|
+
try:
|
326
|
+
# The connector builder currently only supports reading from a single stream at a time
|
327
|
+
configured_stream = configured_catalog.streams[0]
|
328
|
+
schema = schema_inferrer.get_stream_schema(configured_stream.stream.name)
|
329
|
+
except SchemaValidationException as exception:
|
330
|
+
# we update the log_messages with possible errors
|
331
|
+
for validation_error in exception.validation_errors:
|
332
|
+
log_messages.append(LogMessage(validation_error, "ERROR"))
|
333
|
+
schema = exception.schema
|
334
|
+
|
335
|
+
return schema, log_messages
|
336
|
+
|
337
|
+
def _get_latest_config_update(
|
338
|
+
self,
|
339
|
+
latest_config_update: AirbyteControlMessage | None,
|
340
|
+
) -> Dict[str, Any] | None:
|
341
|
+
"""
|
342
|
+
Retrieves a cleaned configuration from the latest Airbyte control message.
|
343
|
+
|
344
|
+
This helper function extracts the configuration from the given Airbyte control message, cleans it using the internal `Parsers.clean_config` function,
|
345
|
+
and returns the resulting dictionary. If no control message is provided (i.e., latest_config_update is None), the function returns None.
|
346
|
+
|
347
|
+
Parameters:
|
348
|
+
latest_config_update (AirbyteControlMessage | None): The control message containing the connector configuration. May be None.
|
349
|
+
|
350
|
+
Returns:
|
351
|
+
Dict[str, Any] | None: The cleaned configuration dictionary if available; otherwise, None.
|
352
|
+
"""
|
353
|
+
|
354
|
+
return (
|
355
|
+
clean_config(latest_config_update.connectorConfig.config) # type: ignore
|
356
|
+
if latest_config_update
|
357
|
+
else None
|
358
|
+
)
|
359
|
+
|
360
|
+
def _read_stream(
|
361
|
+
self,
|
362
|
+
source: DeclarativeSource,
|
363
|
+
config: Mapping[str, Any],
|
364
|
+
configured_catalog: ConfiguredAirbyteCatalog,
|
365
|
+
state: List[AirbyteStateMessage],
|
366
|
+
) -> Iterator[AirbyteMessage]:
|
367
|
+
"""
|
368
|
+
Reads messages from the given DeclarativeSource using an AirbyteEntrypoint.
|
369
|
+
|
370
|
+
This method attempts to yield messages from the source's read generator. If the generator
|
371
|
+
raises an AirbyteTracedException, it checks whether the exception message indicates a non-actionable
|
372
|
+
error (e.g., a final exception from AbstractSource that should not be logged). In that case, it stops
|
373
|
+
processing without yielding the exception as a message. For other exceptions, the exception is caught,
|
374
|
+
wrapped into an AirbyteTracedException, and yielded as an AirbyteMessage.
|
375
|
+
|
376
|
+
Parameters:
|
377
|
+
source (DeclarativeSource): The source object that provides data reading logic.
|
378
|
+
config (Mapping[str, Any]): The configuration dictionary for the source.
|
379
|
+
configured_catalog (ConfiguredAirbyteCatalog): The catalog defining the streams and their configurations.
|
380
|
+
state (List[AirbyteStateMessage]): A list representing the current state for incremental sync.
|
381
|
+
|
382
|
+
Yields:
|
383
|
+
AirbyteMessage: Messages yielded from the source's generator. In case of exceptions,
|
384
|
+
an AirbyteMessage encapsulating the error is yielded instead.
|
385
|
+
"""
|
386
|
+
# the generator can raise an exception
|
387
|
+
# iterate over the generated messages. if next raise an exception, catch it and yield it as an AirbyteLogMessage
|
388
|
+
try:
|
389
|
+
yield from AirbyteEntrypoint(source).read(
|
390
|
+
source.spec(self.logger), config, configured_catalog, state
|
391
|
+
)
|
392
|
+
except AirbyteTracedException as traced_exception:
|
393
|
+
# Look for this message which indicates that it is the "final exception" raised by AbstractSource.
|
394
|
+
# If it matches, don't yield this as we don't need to show this in the Builder.
|
395
|
+
# This is somewhat brittle as it relies on the message string, but if they drift then the worst case
|
396
|
+
# is that this message will be shown in the Builder.
|
397
|
+
if (
|
398
|
+
traced_exception.message is not None
|
399
|
+
and "During the sync, the following streams did not sync successfully"
|
400
|
+
in traced_exception.message
|
401
|
+
):
|
402
|
+
return
|
403
|
+
yield traced_exception.as_airbyte_message()
|
404
|
+
except Exception as e:
|
405
|
+
error_message = f"{e.args[0] if len(e.args) > 0 else str(e)}"
|
406
|
+
yield AirbyteTracedException.from_exception(
|
407
|
+
e, message=error_message
|
408
|
+
).as_airbyte_message()
|
409
|
+
|
410
|
+
def _has_reached_limit(self, slices: List[StreamReadSlices]) -> bool:
    """
    Check whether the accumulated slices have hit any configured read limit.

    Three limits are enforced:
      * total number of slices (``self._max_slices``),
      * number of pages within any single slice (``self._max_pages_per_slice``),
      * cumulative record count across every page of every slice
        (``self._max_record_limit``).

    Parameters:
        slices (List[StreamReadSlices]): Slices read so far; each slice holds
            pages, and each page holds a collection of records.

    Returns:
        bool: True as soon as any of the three limits is met or exceeded,
        False otherwise.
    """
    # Slice-count limit is the cheapest check, so it goes first.
    if len(slices) >= self._max_slices:
        return True

    total_records = 0
    for current_slice in slices:
        # Per-slice page limit.
        if len(current_slice.pages) >= self._max_pages_per_slice:
            return True
        # Cumulative record limit across all slices inspected so far.
        total_records += sum(len(page.records) for page in current_slice.pages)
        if total_records >= self._max_record_limit:
            return True
    return False
|
@@ -0,0 +1,75 @@
|
|
1
|
+
#
|
2
|
+
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
3
|
+
#
|
4
|
+
|
5
|
+
"""
|
6
|
+
This module defines type aliases utilized in the Airbyte Connector Builder's test reader.
|
7
|
+
These aliases streamline type-checking for heterogeneous message groups and schema outputs,
|
8
|
+
ensuring consistency throughout the processing of stream data and associated messages.
|
9
|
+
|
10
|
+
Type Aliases:
|
11
|
+
MESSAGE_GROUPS:
|
12
|
+
An iterable union of message-like objects which may include:
|
13
|
+
- StreamReadSlices: Represents slices used to read data from a stream.
|
14
|
+
- AirbyteControlMessage: Represents control commands used in the Airbyte protocol.
|
15
|
+
- AirbyteLogMessage: Represents log messages generated by the system.
|
16
|
+
- AirbyteTraceMessage: Represents trace messages typically used for debugging.
|
17
|
+
- AuxiliaryRequest: Represents any supplementary request issued during processing.
|
18
|
+
|
19
|
+
INFERRED_SCHEMA_OUTPUT_TYPE:
|
20
|
+
A tuple where:
|
21
|
+
- The first element is either an InferredSchema instance or None, denoting the inferred JSON schema.
|
22
|
+
- The second element is a list of LogMessage instances capturing logs produced during inference.
|
23
|
+
|
24
|
+
GROUPED_MESSAGES:
|
25
|
+
A tuple representing grouped messages divided as follows:
|
26
|
+
- A list of StreamReadSlices.
|
27
|
+
- A list of LogMessage instances.
|
28
|
+
- A list of AuxiliaryRequest instances.
|
29
|
+
        - An optional AirbyteControlMessage that, if present, governs control flow in message processing.

    LOG_MESSAGES_OUTPUT_TYPE:
        A tuple describing the outcome of parsing a single log message:
        - A bool flag.
        - An optional HttpRequest and an optional HttpResponse.
        - An optional AuxiliaryRequest or AirbyteLogMessage.
"""
|
31
|
+
|
32
|
+
from typing import Any, Iterable, List
|
33
|
+
|
34
|
+
from airbyte_cdk.connector_builder.models import (
|
35
|
+
AuxiliaryRequest,
|
36
|
+
HttpRequest,
|
37
|
+
HttpResponse,
|
38
|
+
LogMessage,
|
39
|
+
StreamReadSlices,
|
40
|
+
)
|
41
|
+
from airbyte_cdk.models import (
|
42
|
+
AirbyteControlMessage,
|
43
|
+
AirbyteLogMessage,
|
44
|
+
AirbyteTraceMessage,
|
45
|
+
)
|
46
|
+
from airbyte_cdk.utils.schema_inferrer import (
|
47
|
+
InferredSchema,
|
48
|
+
)
|
49
|
+
|
50
|
+
# An iterable of heterogeneous message-like objects produced while reading a
# stream: data slices, protocol control/log/trace messages, and auxiliary
# requests issued during processing.
MESSAGE_GROUPS = Iterable[
    StreamReadSlices | AirbyteControlMessage | AirbyteLogMessage | AirbyteTraceMessage | AuxiliaryRequest
]

# Result of schema inference: the inferred JSON schema (None when nothing was
# inferred) plus the log messages captured during inference.
INFERRED_SCHEMA_OUTPUT_TYPE = tuple[InferredSchema | None, List[LogMessage]]

# Messages grouped by kind: stream-read slices, log messages, auxiliary
# requests, and an optional control message governing message processing.
GROUPED_MESSAGES = tuple[
    List[StreamReadSlices],
    List[LogMessage],
    List[AuxiliaryRequest],
    AirbyteControlMessage | None,
]

# Outcome of handling a single log message: a bool flag, an optional HTTP
# request/response pair, and an optional leftover auxiliary-request or log
# message. NOTE(review): exact flag semantics depend on the consumer in
# helpers/message_grouper — confirm there.
LOG_MESSAGES_OUTPUT_TYPE = tuple[
    bool,
    HttpRequest | None,
    HttpResponse | None,
    AuxiliaryRequest | AirbyteLogMessage | None,
]
|