airbyte-cdk 6.34.0.dev1__py3-none-any.whl → 6.34.1.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/connector_builder/connector_builder_handler.py +12 -16
- airbyte_cdk/connector_builder/message_grouper.py +448 -0
- airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +7 -7
- airbyte_cdk/sources/declarative/auth/jwt.py +11 -17
- airbyte_cdk/sources/declarative/auth/oauth.py +1 -6
- airbyte_cdk/sources/declarative/auth/token.py +8 -3
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +19 -30
- airbyte_cdk/sources/declarative/declarative_component_schema.yaml +85 -203
- airbyte_cdk/sources/declarative/declarative_stream.py +1 -3
- airbyte_cdk/sources/declarative/decoders/__init__.py +4 -0
- airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py +2 -7
- airbyte_cdk/sources/declarative/decoders/json_decoder.py +58 -12
- airbyte_cdk/sources/declarative/extractors/record_selector.py +3 -12
- airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +25 -56
- airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +6 -12
- airbyte_cdk/sources/declarative/manifest_declarative_source.py +0 -9
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +41 -150
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +84 -234
- airbyte_cdk/sources/declarative/partition_routers/async_job_partition_router.py +5 -5
- airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +2 -4
- airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +18 -26
- airbyte_cdk/sources/declarative/requesters/http_requester.py +1 -8
- airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +5 -16
- airbyte_cdk/sources/declarative/requesters/request_option.py +4 -83
- airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +6 -7
- airbyte_cdk/sources/declarative/retrievers/async_retriever.py +12 -6
- airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +1 -4
- airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +1 -2
- airbyte_cdk/sources/file_based/file_based_source.py +37 -70
- airbyte_cdk/sources/file_based/file_based_stream_reader.py +12 -107
- airbyte_cdk/sources/file_based/stream/__init__.py +1 -10
- airbyte_cdk/sources/streams/call_rate.py +47 -185
- airbyte_cdk/sources/streams/http/http.py +2 -1
- airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +56 -217
- airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +73 -144
- airbyte_cdk/utils/datetime_helpers.py +66 -48
- airbyte_cdk/utils/mapping_helpers.py +26 -126
- {airbyte_cdk-6.34.0.dev1.dist-info → airbyte_cdk-6.34.1.dev0.dist-info}/METADATA +1 -1
- {airbyte_cdk-6.34.0.dev1.dist-info → airbyte_cdk-6.34.1.dev0.dist-info}/RECORD +43 -52
- airbyte_cdk/connector_builder/test_reader/__init__.py +0 -7
- airbyte_cdk/connector_builder/test_reader/helpers.py +0 -591
- airbyte_cdk/connector_builder/test_reader/message_grouper.py +0 -160
- airbyte_cdk/connector_builder/test_reader/reader.py +0 -441
- airbyte_cdk/connector_builder/test_reader/types.py +0 -75
- airbyte_cdk/sources/file_based/config/validate_config_transfer_modes.py +0 -81
- airbyte_cdk/sources/file_based/stream/identities_stream.py +0 -47
- airbyte_cdk/sources/file_based/stream/permissions_file_based_stream.py +0 -85
- airbyte_cdk/sources/specs/transfer_modes.py +0 -26
- airbyte_cdk/sources/streams/permissions/identities_stream.py +0 -75
- {airbyte_cdk-6.34.0.dev1.dist-info → airbyte_cdk-6.34.1.dev0.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-6.34.0.dev1.dist-info → airbyte_cdk-6.34.1.dev0.dist-info}/LICENSE_SHORT +0 -0
- {airbyte_cdk-6.34.0.dev1.dist-info → airbyte_cdk-6.34.1.dev0.dist-info}/WHEEL +0 -0
- {airbyte_cdk-6.34.0.dev1.dist-info → airbyte_cdk-6.34.1.dev0.dist-info}/entry_points.txt +0 -0
airbyte_cdk/connector_builder/connector_builder_handler.py:

```diff
@@ -2,11 +2,10 @@
 # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
 #
 
-
-from dataclasses import asdict, dataclass, field
+import dataclasses
 from typing import Any, List, Mapping
 
-from airbyte_cdk.connector_builder.
+from airbyte_cdk.connector_builder.message_grouper import MessageGrouper
 from airbyte_cdk.models import (
     AirbyteMessage,
     AirbyteRecordMessage,
@@ -33,11 +32,11 @@ MAX_SLICES_KEY = "max_slices"
 MAX_RECORDS_KEY = "max_records"
 
 
-@dataclass
+@dataclasses.dataclass
 class TestReadLimits:
-    max_records: int = field(default=DEFAULT_MAXIMUM_RECORDS)
-    max_pages_per_slice: int = field(default=DEFAULT_MAXIMUM_NUMBER_OF_PAGES_PER_SLICE)
-    max_slices: int = field(default=DEFAULT_MAXIMUM_NUMBER_OF_SLICES)
+    max_records: int = dataclasses.field(default=DEFAULT_MAXIMUM_RECORDS)
+    max_pages_per_slice: int = dataclasses.field(default=DEFAULT_MAXIMUM_NUMBER_OF_PAGES_PER_SLICE)
+    max_slices: int = dataclasses.field(default=DEFAULT_MAXIMUM_NUMBER_OF_SLICES)
 
 
 def get_limits(config: Mapping[str, Any]) -> TestReadLimits:
@@ -74,20 +73,17 @@ def read_stream(
     limits: TestReadLimits,
 ) -> AirbyteMessage:
     try:
-
-
-
-        # The connector builder only supports a single stream
-
-
-        stream_read = test_read_handler.run_test_read(
+        handler = MessageGrouper(limits.max_pages_per_slice, limits.max_slices, limits.max_records)
+        stream_name = configured_catalog.streams[
+            0
+        ].stream.name  # The connector builder only supports a single stream
+        stream_read = handler.get_message_groups(
             source, config, configured_catalog, state, limits.max_records
         )
-
         return AirbyteMessage(
             type=MessageType.RECORD,
             record=AirbyteRecordMessage(
-                data=asdict(stream_read), stream=stream_name, emitted_at=_emitted_at()
+                data=dataclasses.asdict(stream_read), stream=stream_name, emitted_at=_emitted_at()
            ),
        )
    except Exception as exc:
```
airbyte_cdk/connector_builder/message_grouper.py (new file):

```diff
@@ -0,0 +1,448 @@
+#
+# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
+#
+
+import json
+import logging
+from copy import deepcopy
+from json import JSONDecodeError
+from typing import Any, Dict, Iterable, Iterator, List, Mapping, Optional, Union
+
+from airbyte_cdk.connector_builder.models import (
+    AuxiliaryRequest,
+    HttpRequest,
+    HttpResponse,
+    LogMessage,
+    StreamRead,
+    StreamReadPages,
+    StreamReadSlices,
+)
+from airbyte_cdk.entrypoint import AirbyteEntrypoint
+from airbyte_cdk.models import (
+    AirbyteControlMessage,
+    AirbyteLogMessage,
+    AirbyteMessage,
+    AirbyteStateMessage,
+    AirbyteTraceMessage,
+    ConfiguredAirbyteCatalog,
+    OrchestratorType,
+    TraceType,
+)
+from airbyte_cdk.models import Type as MessageType
+from airbyte_cdk.sources.declarative.declarative_source import DeclarativeSource
+from airbyte_cdk.sources.utils.slice_logger import SliceLogger
+from airbyte_cdk.sources.utils.types import JsonType
+from airbyte_cdk.utils import AirbyteTracedException
+from airbyte_cdk.utils.datetime_format_inferrer import DatetimeFormatInferrer
+from airbyte_cdk.utils.schema_inferrer import SchemaInferrer, SchemaValidationException
+
+
+class MessageGrouper:
+    logger = logging.getLogger("airbyte.connector-builder")
+
+    def __init__(self, max_pages_per_slice: int, max_slices: int, max_record_limit: int = 1000):
+        self._max_pages_per_slice = max_pages_per_slice
+        self._max_slices = max_slices
+        self._max_record_limit = max_record_limit
+
+    def _pk_to_nested_and_composite_field(
+        self, field: Optional[Union[str, List[str], List[List[str]]]]
+    ) -> List[List[str]]:
+        if not field:
+            return [[]]
+
+        if isinstance(field, str):
+            return [[field]]
+
+        is_composite_key = isinstance(field[0], str)
+        if is_composite_key:
+            return [[i] for i in field]  # type: ignore # the type of field is expected to be List[str] here
+
+        return field  # type: ignore # the type of field is expected to be List[List[str]] here
+
+    def _cursor_field_to_nested_and_composite_field(
+        self, field: Union[str, List[str]]
+    ) -> List[List[str]]:
+        if not field:
+            return [[]]
+
+        if isinstance(field, str):
+            return [[field]]
+
+        is_nested_key = isinstance(field[0], str)
+        if is_nested_key:
+            return [field]
+
+        raise ValueError(f"Unknown type for cursor field `{field}")
+
+    def get_message_groups(
+        self,
+        source: DeclarativeSource,
+        config: Mapping[str, Any],
+        configured_catalog: ConfiguredAirbyteCatalog,
+        state: List[AirbyteStateMessage],
+        record_limit: Optional[int] = None,
+    ) -> StreamRead:
+        if record_limit is not None and not (1 <= record_limit <= self._max_record_limit):
+            raise ValueError(
+                f"Record limit must be between 1 and {self._max_record_limit}. Got {record_limit}"
+            )
+        stream = source.streams(config)[
+            0
+        ]  # The connector builder currently only supports reading from a single stream at a time
+        schema_inferrer = SchemaInferrer(
+            self._pk_to_nested_and_composite_field(stream.primary_key),
+            self._cursor_field_to_nested_and_composite_field(stream.cursor_field),
+        )
+        datetime_format_inferrer = DatetimeFormatInferrer()
+
+        if record_limit is None:
+            record_limit = self._max_record_limit
+        else:
+            record_limit = min(record_limit, self._max_record_limit)
+
+        slices = []
+        log_messages = []
+        latest_config_update: AirbyteControlMessage = None
+        auxiliary_requests = []
+        for message_group in self._get_message_groups(
+            self._read_stream(source, config, configured_catalog, state),
+            schema_inferrer,
+            datetime_format_inferrer,
+            record_limit,
+        ):
+            if isinstance(message_group, AirbyteLogMessage):
+                log_messages.append(
+                    LogMessage(
+                        **{"message": message_group.message, "level": message_group.level.value}
+                    )
+                )
+            elif isinstance(message_group, AirbyteTraceMessage):
+                if message_group.type == TraceType.ERROR:
+                    log_messages.append(
+                        LogMessage(
+                            **{
+                                "message": message_group.error.message,
+                                "level": "ERROR",
+                                "internal_message": message_group.error.internal_message,
+                                "stacktrace": message_group.error.stack_trace,
+                            }
+                        )
+                    )
+            elif isinstance(message_group, AirbyteControlMessage):
+                if (
+                    not latest_config_update
+                    or latest_config_update.emitted_at <= message_group.emitted_at
+                ):
+                    latest_config_update = message_group
+            elif isinstance(message_group, AuxiliaryRequest):
+                auxiliary_requests.append(message_group)
+            elif isinstance(message_group, StreamReadSlices):
+                slices.append(message_group)
+            else:
+                raise ValueError(f"Unknown message group type: {type(message_group)}")
+
+        try:
+            # The connector builder currently only supports reading from a single stream at a time
+            configured_stream = configured_catalog.streams[0]
+            schema = schema_inferrer.get_stream_schema(configured_stream.stream.name)
+        except SchemaValidationException as exception:
+            for validation_error in exception.validation_errors:
+                log_messages.append(LogMessage(validation_error, "ERROR"))
+            schema = exception.schema
+
+        return StreamRead(
+            logs=log_messages,
+            slices=slices,
+            test_read_limit_reached=self._has_reached_limit(slices),
+            auxiliary_requests=auxiliary_requests,
+            inferred_schema=schema,
+            latest_config_update=self._clean_config(latest_config_update.connectorConfig.config)
+            if latest_config_update
+            else None,
+            inferred_datetime_formats=datetime_format_inferrer.get_inferred_datetime_formats(),
+        )
+
+    def _get_message_groups(
+        self,
+        messages: Iterator[AirbyteMessage],
+        schema_inferrer: SchemaInferrer,
+        datetime_format_inferrer: DatetimeFormatInferrer,
+        limit: int,
+    ) -> Iterable[
+        Union[
+            StreamReadPages,
+            AirbyteControlMessage,
+            AirbyteLogMessage,
+            AirbyteTraceMessage,
+            AuxiliaryRequest,
+        ]
+    ]:
+        """
+        Message groups are partitioned according to when request log messages are received. Subsequent response log messages
+        and record messages belong to the prior request log message and when we encounter another request, append the latest
+        message group, until <limit> records have been read.
+
+        Messages received from the CDK read operation will always arrive in the following order:
+        {type: LOG, log: {message: "request: ..."}}
+        {type: LOG, log: {message: "response: ..."}}
+        ... 0 or more record messages
+        {type: RECORD, record: {data: ...}}
+        {type: RECORD, record: {data: ...}}
+        Repeats for each request/response made
+
+        Note: The exception is that normal log messages can be received at any time which are not incorporated into grouping
+        """
+        records_count = 0
+        at_least_one_page_in_group = False
+        current_page_records: List[Mapping[str, Any]] = []
+        current_slice_descriptor: Optional[Dict[str, Any]] = None
+        current_slice_pages: List[StreamReadPages] = []
+        current_page_request: Optional[HttpRequest] = None
+        current_page_response: Optional[HttpResponse] = None
+        latest_state_message: Optional[Dict[str, Any]] = None
+
+        while records_count < limit and (message := next(messages, None)):
+            json_object = self._parse_json(message.log) if message.type == MessageType.LOG else None
+            if json_object is not None and not isinstance(json_object, dict):
+                raise ValueError(
+                    f"Expected log message to be a dict, got {json_object} of type {type(json_object)}"
+                )
+            json_message: Optional[Dict[str, JsonType]] = json_object
+            if self._need_to_close_page(at_least_one_page_in_group, message, json_message):
+                self._close_page(
+                    current_page_request,
+                    current_page_response,
+                    current_slice_pages,
+                    current_page_records,
+                )
+                current_page_request = None
+                current_page_response = None
+
+            if (
+                at_least_one_page_in_group
+                and message.type == MessageType.LOG
+                and message.log.message.startswith(SliceLogger.SLICE_LOG_PREFIX)  # type: ignore[union-attr] # AirbyteMessage with MessageType.LOG has log.message
+            ):
+                yield StreamReadSlices(
+                    pages=current_slice_pages,
+                    slice_descriptor=current_slice_descriptor,
+                    state=[latest_state_message] if latest_state_message else [],
+                )
+                current_slice_descriptor = self._parse_slice_description(message.log.message)  # type: ignore[union-attr] # AirbyteMessage with MessageType.LOG has log.message
+                current_slice_pages = []
+                at_least_one_page_in_group = False
+            elif message.type == MessageType.LOG and message.log.message.startswith(  # type: ignore[union-attr] # None doesn't have 'message'
+                SliceLogger.SLICE_LOG_PREFIX
+            ):
+                # parsing the first slice
+                current_slice_descriptor = self._parse_slice_description(message.log.message)  # type: ignore[union-attr] # AirbyteMessage with MessageType.LOG has log.message
+            elif message.type == MessageType.LOG:
+                if json_message is not None and self._is_http_log(json_message):
+                    if self._is_auxiliary_http_request(json_message):
+                        airbyte_cdk = json_message.get("airbyte_cdk", {})
+                        if not isinstance(airbyte_cdk, dict):
+                            raise ValueError(
+                                f"Expected airbyte_cdk to be a dict, got {airbyte_cdk} of type {type(airbyte_cdk)}"
+                            )
+                        stream = airbyte_cdk.get("stream", {})
+                        if not isinstance(stream, dict):
+                            raise ValueError(
+                                f"Expected stream to be a dict, got {stream} of type {type(stream)}"
+                            )
+                        title_prefix = (
+                            "Parent stream: " if stream.get("is_substream", False) else ""
+                        )
+                        http = json_message.get("http", {})
+                        if not isinstance(http, dict):
+                            raise ValueError(
+                                f"Expected http to be a dict, got {http} of type {type(http)}"
+                            )
+                        yield AuxiliaryRequest(
+                            title=title_prefix + str(http.get("title", None)),
+                            description=str(http.get("description", None)),
+                            request=self._create_request_from_log_message(json_message),
+                            response=self._create_response_from_log_message(json_message),
+                        )
+                    else:
+                        at_least_one_page_in_group = True
+                        current_page_request = self._create_request_from_log_message(json_message)
+                        current_page_response = self._create_response_from_log_message(json_message)
+                else:
+                    yield message.log
+            elif message.type == MessageType.TRACE:
+                if message.trace.type == TraceType.ERROR:  # type: ignore[union-attr] # AirbyteMessage with MessageType.TRACE has trace.type
+                    yield message.trace
+            elif message.type == MessageType.RECORD:
+                current_page_records.append(message.record.data)  # type: ignore[arg-type, union-attr] # AirbyteMessage with MessageType.RECORD has record.data
+                records_count += 1
+                schema_inferrer.accumulate(message.record)
+                datetime_format_inferrer.accumulate(message.record)
+            elif (
+                message.type == MessageType.CONTROL
+                and message.control.type == OrchestratorType.CONNECTOR_CONFIG  # type: ignore[union-attr] # None doesn't have 'type'
+            ):
+                yield message.control
+            elif message.type == MessageType.STATE:
+                latest_state_message = message.state  # type: ignore[assignment]
+        else:
+            if current_page_request or current_page_response or current_page_records:
+                self._close_page(
+                    current_page_request,
+                    current_page_response,
+                    current_slice_pages,
+                    current_page_records,
+                )
+                yield StreamReadSlices(
+                    pages=current_slice_pages,
+                    slice_descriptor=current_slice_descriptor,
+                    state=[latest_state_message] if latest_state_message else [],
+                )
+
+    @staticmethod
+    def _need_to_close_page(
+        at_least_one_page_in_group: bool,
+        message: AirbyteMessage,
+        json_message: Optional[Dict[str, Any]],
+    ) -> bool:
+        return (
+            at_least_one_page_in_group
+            and message.type == MessageType.LOG
+            and (
+                MessageGrouper._is_page_http_request(json_message)
+                or message.log.message.startswith("slice:")  # type: ignore[union-attr] # AirbyteMessage with MessageType.LOG has log.message
+            )
+        )
+
+    @staticmethod
+    def _is_page_http_request(json_message: Optional[Dict[str, Any]]) -> bool:
+        if not json_message:
+            return False
+        else:
+            return MessageGrouper._is_http_log(
+                json_message
+            ) and not MessageGrouper._is_auxiliary_http_request(json_message)
+
+    @staticmethod
+    def _is_http_log(message: Dict[str, JsonType]) -> bool:
+        return bool(message.get("http", False))
+
+    @staticmethod
+    def _is_auxiliary_http_request(message: Optional[Dict[str, Any]]) -> bool:
+        """
+        A auxiliary request is a request that is performed and will not directly lead to record for the specific stream it is being queried.
+        A couple of examples are:
+        * OAuth authentication
+        * Substream slice generation
+        """
+        if not message:
+            return False
+
+        is_http = MessageGrouper._is_http_log(message)
+        return is_http and message.get("http", {}).get("is_auxiliary", False)
+
+    @staticmethod
+    def _close_page(
+        current_page_request: Optional[HttpRequest],
+        current_page_response: Optional[HttpResponse],
+        current_slice_pages: List[StreamReadPages],
+        current_page_records: List[Mapping[str, Any]],
+    ) -> None:
+        """
+        Close a page when parsing message groups
+        """
+        current_slice_pages.append(
+            StreamReadPages(
+                request=current_page_request,
+                response=current_page_response,
+                records=deepcopy(current_page_records),  # type: ignore [arg-type]
+            )
+        )
+        current_page_records.clear()
+
+    def _read_stream(
+        self,
+        source: DeclarativeSource,
+        config: Mapping[str, Any],
+        configured_catalog: ConfiguredAirbyteCatalog,
+        state: List[AirbyteStateMessage],
+    ) -> Iterator[AirbyteMessage]:
+        # the generator can raise an exception
+        # iterate over the generated messages. if next raise an exception, catch it and yield it as an AirbyteLogMessage
+        try:
+            yield from AirbyteEntrypoint(source).read(
+                source.spec(self.logger), config, configured_catalog, state
+            )
+        except AirbyteTracedException as traced_exception:
+            # Look for this message which indicates that it is the "final exception" raised by AbstractSource.
+            # If it matches, don't yield this as we don't need to show this in the Builder.
+            # This is somewhat brittle as it relies on the message string, but if they drift then the worst case
+            # is that this message will be shown in the Builder.
+            if (
+                traced_exception.message is not None
+                and "During the sync, the following streams did not sync successfully"
+                in traced_exception.message
+            ):
+                return
+            yield traced_exception.as_airbyte_message()
+        except Exception as e:
+            error_message = f"{e.args[0] if len(e.args) > 0 else str(e)}"
+            yield AirbyteTracedException.from_exception(
+                e, message=error_message
+            ).as_airbyte_message()
+
+    @staticmethod
+    def _parse_json(log_message: AirbyteLogMessage) -> JsonType:
+        # TODO: As a temporary stopgap, the CDK emits request/response data as a log message string. Ideally this should come in the
+        # form of a custom message object defined in the Airbyte protocol, but this unblocks us in the immediate while the
+        # protocol change is worked on.
+        try:
+            json_object: JsonType = json.loads(log_message.message)
+            return json_object
+        except JSONDecodeError:
+            return None
+
+    @staticmethod
+    def _create_request_from_log_message(json_http_message: Dict[str, Any]) -> HttpRequest:
+        url = json_http_message.get("url", {}).get("full", "")
+        request = json_http_message.get("http", {}).get("request", {})
+        return HttpRequest(
+            url=url,
+            http_method=request.get("method", ""),
+            headers=request.get("headers"),
+            body=request.get("body", {}).get("content", ""),
+        )
+
+    @staticmethod
+    def _create_response_from_log_message(json_http_message: Dict[str, Any]) -> HttpResponse:
+        response = json_http_message.get("http", {}).get("response", {})
+        body = response.get("body", {}).get("content", "")
+        return HttpResponse(
+            status=response.get("status_code"), body=body, headers=response.get("headers")
+        )
+
+    def _has_reached_limit(self, slices: List[StreamReadSlices]) -> bool:
+        if len(slices) >= self._max_slices:
+            return True
+
+        record_count = 0
+
+        for _slice in slices:
+            if len(_slice.pages) >= self._max_pages_per_slice:
+                return True
+            for page in _slice.pages:
+                record_count += len(page.records)
+                if record_count >= self._max_record_limit:
+                    return True
+        return False
+
+    def _parse_slice_description(self, log_message: str) -> Dict[str, Any]:
+        return json.loads(log_message.replace(SliceLogger.SLICE_LOG_PREFIX, "", 1))  # type: ignore
+
+    @staticmethod
+    def _clean_config(config: Dict[str, Any]) -> Dict[str, Any]:
+        cleaned_config = deepcopy(config)
+        for key in config.keys():
+            if key.startswith("__"):
+                del cleaned_config[key]
+        return cleaned_config
```
airbyte_cdk/sources/declarative/async_job/job_orchestrator.py:

```diff
@@ -437,10 +437,10 @@ class AsyncJobOrchestrator:
             yield from self._process_running_partitions_and_yield_completed_ones()
             self._wait_on_status_update()
         except Exception as exception:
-            LOGGER.warning(
-                f"Caught exception that stops the processing of the jobs: {exception}. Traceback: {traceback.format_exc()}"
-            )
             if self._is_breaking_exception(exception):
+                LOGGER.warning(
+                    f"Caught exception that stops the processing of the jobs: {exception}"
+                )
                 self._abort_all_running_jobs()
                 raise exception
 
@@ -482,16 +482,16 @@ class AsyncJobOrchestrator:
             and exception.failure_type == FailureType.config_error
         )
 
-    def fetch_records(self,
+    def fetch_records(self, partition: AsyncPartition) -> Iterable[Mapping[str, Any]]:
         """
-        Fetches records from the given jobs.
+        Fetches records from the given partition's jobs.
 
         Args:
-
+            partition (AsyncPartition): The partition containing the jobs.
 
         Yields:
             Iterable[Mapping[str, Any]]: The fetched records from the jobs.
         """
-        for job in
+        for job in partition.jobs:
             yield from self._job_repository.fetch_records(job)
             self._job_repository.delete(job)
```
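The `fetch_records` fix restores an explicit `partition` argument and the drain-then-delete loop over its jobs. A self-contained sketch of that pattern under stand-in types (`FakeJob` and `FakeRepository` are illustrative, not CDK classes):

```python
from typing import Any, Iterable, List, Mapping


class FakeJob:
    """Stand-in for an AsyncJob carrying fetched records."""

    def __init__(self, records: List[Mapping[str, Any]]):
        self.records = records


class FakeRepository:
    """Stand-in for the job repository used by the orchestrator above."""

    def fetch_records(self, job: FakeJob) -> Iterable[Mapping[str, Any]]:
        yield from job.records

    def delete(self, job: FakeJob) -> None:
        job.records = []  # once drained, the job's results are discarded


def fetch_records(jobs: List[FakeJob], repo: FakeRepository) -> Iterable[Mapping[str, Any]]:
    # Same shape as AsyncJobOrchestrator.fetch_records(partition): drain each
    # job's records, then delete the job so its results are not fetched twice.
    for job in jobs:
        yield from repo.fetch_records(job)
        repo.delete(job)


jobs = [FakeJob([{"id": 1}]), FakeJob([{"id": 2}])]
assert list(fetch_records(jobs, FakeRepository())) == [{"id": 1}, {"id": 2}]
```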
airbyte_cdk/sources/declarative/auth/jwt.py:

```diff
@@ -3,7 +3,6 @@
 #
 
 import base64
-import json
 from dataclasses import InitVar, dataclass
 from datetime import datetime
 from typing import Any, Mapping, Optional, Union
@@ -105,21 +104,21 @@ class JwtAuthenticator(DeclarativeAuthenticator):
         )
 
     def _get_jwt_headers(self) -> dict[str, Any]:
-        """
+        """ "
         Builds and returns the headers used when signing the JWT.
         """
-        headers = self._additional_jwt_headers.eval(self.config, json_loads=json.loads)
+        headers = self._additional_jwt_headers.eval(self.config)
         if any(prop in headers for prop in ["kid", "alg", "typ", "cty"]):
             raise ValueError(
                 "'kid', 'alg', 'typ', 'cty' are reserved headers and should not be set as part of 'additional_jwt_headers'"
             )
 
         if self._kid:
-            headers["kid"] = self._kid.eval(self.config, json_loads=json.loads)
+            headers["kid"] = self._kid.eval(self.config)
         if self._typ:
-            headers["typ"] = self._typ.eval(self.config, json_loads=json.loads)
+            headers["typ"] = self._typ.eval(self.config)
         if self._cty:
-            headers["cty"] = self._cty.eval(self.config, json_loads=json.loads)
+            headers["cty"] = self._cty.eval(self.config)
         headers["alg"] = self._algorithm
         return headers
 
@@ -131,19 +130,18 @@ class JwtAuthenticator(DeclarativeAuthenticator):
         exp = now + self._token_duration if isinstance(self._token_duration, int) else now
         nbf = now
 
-        payload = self._additional_jwt_payload.eval(self.config, json_loads=json.loads)
+        payload = self._additional_jwt_payload.eval(self.config)
         if any(prop in payload for prop in ["iss", "sub", "aud", "iat", "exp", "nbf"]):
             raise ValueError(
                 "'iss', 'sub', 'aud', 'iat', 'exp', 'nbf' are reserved properties and should not be set as part of 'additional_jwt_payload'"
             )
 
         if self._iss:
-            payload["iss"] = self._iss.eval(self.config, json_loads=json.loads)
+            payload["iss"] = self._iss.eval(self.config)
         if self._sub:
-            payload["sub"] = self._sub.eval(self.config, json_loads=json.loads)
+            payload["sub"] = self._sub.eval(self.config)
         if self._aud:
-            payload["aud"] = self._aud.eval(self.config, json_loads=json.loads)
-
+            payload["aud"] = self._aud.eval(self.config)
         payload["iat"] = now
         payload["exp"] = exp
         payload["nbf"] = nbf
@@ -153,7 +151,7 @@ class JwtAuthenticator(DeclarativeAuthenticator):
         """
         Returns the secret key used to sign the JWT.
         """
-        secret_key: str = self._secret_key.eval(self.config, json_loads=json.loads)
+        secret_key: str = self._secret_key.eval(self.config)
         return (
             base64.b64encode(secret_key.encode()).decode()
             if self._base64_encode_secret_key
@@ -178,11 +176,7 @@ class JwtAuthenticator(DeclarativeAuthenticator):
         """
         Returns the header prefix to be used when attaching the token to the request.
         """
-        return (
-            self._header_prefix.eval(self.config, json_loads=json.loads)
-            if self._header_prefix
-            else None
-        )
+        return self._header_prefix.eval(self.config) if self._header_prefix else None
 
     @property
     def auth_header(self) -> str:
```
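The JWT hunks drop the `json_loads=json.loads` argument from every `InterpolatedString.eval` call (and the now-unused `import json`), while keeping the reserved-key guards intact. A self-contained sketch of that guard, with a plain dict standing in for the evaluated `additional_jwt_headers`:

```python
RESERVED_HEADERS = ["kid", "alg", "typ", "cty"]


def validate_additional_headers(headers: dict) -> dict:
    # Mirrors the check in _get_jwt_headers above: reserved JWT header fields
    # may only be set by the authenticator, never via additional_jwt_headers.
    if any(prop in headers for prop in RESERVED_HEADERS):
        raise ValueError(
            "'kid', 'alg', 'typ', 'cty' are reserved headers and should not be "
            "set as part of 'additional_jwt_headers'"
        )
    return headers


validate_additional_headers({"x-team": "data"})  # accepted
try:
    validate_additional_headers({"alg": "none"})  # rejected
except ValueError as err:
    print(err)
```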
airbyte_cdk/sources/declarative/auth/oauth.py:

```diff
@@ -3,7 +3,7 @@
 #
 
 from dataclasses import InitVar, dataclass, field
-from datetime import datetime, timedelta
+from datetime import timedelta
 from typing import Any, List, Mapping, MutableMapping, Optional, Union
 
 from airbyte_cdk.sources.declarative.auth.declarative_authenticator import DeclarativeAuthenticator
@@ -232,13 +232,8 @@ class DeclarativeOauth2Authenticator(AbstractOauth2Authenticator, DeclarativeAut
         return self._refresh_request_headers.eval(self.config)
 
     def get_token_expiry_date(self) -> AirbyteDateTime:
-        if not self._has_access_token_been_initialized():
-            return AirbyteDateTime.from_datetime(datetime.min)
         return self._token_expiry_date  # type: ignore # _token_expiry_date is an AirbyteDateTime. It is never None despite what mypy thinks
 
-    def _has_access_token_been_initialized(self) -> bool:
-        return self._access_token is not None
-
     def set_token_expiry_date(self, value: Union[str, int]) -> None:
         self._token_expiry_date = self._parse_token_expiration_date(value)
 
```
airbyte_cdk/sources/declarative/auth/token.py:

```diff
@@ -5,7 +5,7 @@
 import base64
 import logging
 from dataclasses import InitVar, dataclass
-from typing import Any, Mapping,
+from typing import Any, Mapping, Union
 
 import requests
 from cachetools import TTLCache, cached
@@ -45,6 +45,11 @@ class ApiKeyAuthenticator(DeclarativeAuthenticator):
     config: Config
     parameters: InitVar[Mapping[str, Any]]
 
+    def __post_init__(self, parameters: Mapping[str, Any]) -> None:
+        self._field_name = InterpolatedString.create(
+            self.request_option.field_name, parameters=parameters
+        )
+
     @property
     def auth_header(self) -> str:
         options = self._get_request_options(RequestOptionType.header)
@@ -55,9 +60,9 @@ class ApiKeyAuthenticator(DeclarativeAuthenticator):
         return self.token_provider.get_token()
 
     def _get_request_options(self, option_type: RequestOptionType) -> Mapping[str, Any]:
-        options
+        options = {}
         if self.request_option.inject_into == option_type:
-            self.
+            options[self._field_name.eval(self.config)] = self.token
         return options
 
     def get_request_params(self) -> Mapping[str, Any]:
```
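For `ApiKeyAuthenticator`, the restored `__post_init__` pre-builds the interpolated field name, and `_get_request_options` maps it to the token only when the configured injection point matches. A self-contained sketch of that dispatch (the enum and function mirror names in the hunk; `InterpolatedString` evaluation is replaced by a plain string for brevity):

```python
from enum import Enum
from typing import Any, Dict


class RequestOptionType(Enum):
    header = "header"
    request_parameter = "request_parameter"


def get_request_options(
    inject_into: RequestOptionType,
    field_name: str,  # in the CDK this comes from InterpolatedString.eval(config)
    token: str,
    option_type: RequestOptionType,
) -> Dict[str, Any]:
    options: Dict[str, Any] = {}
    # Inject the token only under the option type the authenticator was asked for
    if inject_into == option_type:
        options[field_name] = token
    return options


# Injected for the matching option type, empty otherwise:
assert get_request_options(
    RequestOptionType.header, "X-Api-Key", "secret", RequestOptionType.header
) == {"X-Api-Key": "secret"}
assert get_request_options(
    RequestOptionType.header, "X-Api-Key", "secret", RequestOptionType.request_parameter
) == {}
```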