airbyte-cdk 6.34.0.dev0__py3-none-any.whl → 6.34.0.dev2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/connector_builder/connector_builder_handler.py +16 -12
- airbyte_cdk/connector_builder/test_reader/__init__.py +7 -0
- airbyte_cdk/connector_builder/test_reader/helpers.py +591 -0
- airbyte_cdk/connector_builder/test_reader/message_grouper.py +160 -0
- airbyte_cdk/connector_builder/test_reader/reader.py +441 -0
- airbyte_cdk/connector_builder/test_reader/types.py +75 -0
- airbyte_cdk/entrypoint.py +6 -6
- airbyte_cdk/logger.py +1 -4
- airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +122 -38
- airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +5 -0
- airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +10 -0
- airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +2 -1
- airbyte_cdk/sources/file_based/config/validate_config_transfer_modes.py +81 -0
- airbyte_cdk/sources/file_based/file_based_source.py +70 -37
- airbyte_cdk/sources/file_based/file_based_stream_reader.py +107 -12
- airbyte_cdk/sources/file_based/stream/__init__.py +10 -1
- airbyte_cdk/sources/file_based/stream/identities_stream.py +47 -0
- airbyte_cdk/sources/file_based/stream/permissions_file_based_stream.py +85 -0
- airbyte_cdk/sources/specs/transfer_modes.py +26 -0
- airbyte_cdk/sources/streams/permissions/identities_stream.py +75 -0
- airbyte_cdk/test/mock_http/mocker.py +9 -1
- airbyte_cdk/test/mock_http/response.py +6 -3
- airbyte_cdk/utils/mapping_helpers.py +43 -2
- airbyte_cdk/utils/print_buffer.py +0 -4
- {airbyte_cdk-6.34.0.dev0.dist-info → airbyte_cdk-6.34.0.dev2.dist-info}/METADATA +1 -1
- {airbyte_cdk-6.34.0.dev0.dist-info → airbyte_cdk-6.34.0.dev2.dist-info}/RECORD +30 -21
- airbyte_cdk/connector_builder/message_grouper.py +0 -448
- {airbyte_cdk-6.34.0.dev0.dist-info → airbyte_cdk-6.34.0.dev2.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-6.34.0.dev0.dist-info → airbyte_cdk-6.34.0.dev2.dist-info}/LICENSE_SHORT +0 -0
- {airbyte_cdk-6.34.0.dev0.dist-info → airbyte_cdk-6.34.0.dev2.dist-info}/WHEEL +0 -0
- {airbyte_cdk-6.34.0.dev0.dist-info → airbyte_cdk-6.34.0.dev2.dist-info}/entry_points.txt +0 -0
@@ -2,10 +2,11 @@
|
|
2
2
|
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
3
3
|
#
|
4
4
|
|
5
|
-
|
5
|
+
|
6
|
+
from dataclasses import asdict, dataclass, field
|
6
7
|
from typing import Any, List, Mapping
|
7
8
|
|
8
|
-
from airbyte_cdk.connector_builder.
|
9
|
+
from airbyte_cdk.connector_builder.test_reader import TestReader
|
9
10
|
from airbyte_cdk.models import (
|
10
11
|
AirbyteMessage,
|
11
12
|
AirbyteRecordMessage,
|
@@ -32,11 +33,11 @@ MAX_SLICES_KEY = "max_slices"
|
|
32
33
|
MAX_RECORDS_KEY = "max_records"
|
33
34
|
|
34
35
|
|
35
|
-
@
|
36
|
+
@dataclass
|
36
37
|
class TestReadLimits:
|
37
|
-
max_records: int =
|
38
|
-
max_pages_per_slice: int =
|
39
|
-
max_slices: int =
|
38
|
+
max_records: int = field(default=DEFAULT_MAXIMUM_RECORDS)
|
39
|
+
max_pages_per_slice: int = field(default=DEFAULT_MAXIMUM_NUMBER_OF_PAGES_PER_SLICE)
|
40
|
+
max_slices: int = field(default=DEFAULT_MAXIMUM_NUMBER_OF_SLICES)
|
40
41
|
|
41
42
|
|
42
43
|
def get_limits(config: Mapping[str, Any]) -> TestReadLimits:
|
@@ -73,17 +74,20 @@ def read_stream(
|
|
73
74
|
limits: TestReadLimits,
|
74
75
|
) -> AirbyteMessage:
|
75
76
|
try:
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
77
|
+
test_read_handler = TestReader(
|
78
|
+
limits.max_pages_per_slice, limits.max_slices, limits.max_records
|
79
|
+
)
|
80
|
+
# The connector builder only supports a single stream
|
81
|
+
stream_name = configured_catalog.streams[0].stream.name
|
82
|
+
|
83
|
+
stream_read = test_read_handler.run_test_read(
|
81
84
|
source, config, configured_catalog, state, limits.max_records
|
82
85
|
)
|
86
|
+
|
83
87
|
return AirbyteMessage(
|
84
88
|
type=MessageType.RECORD,
|
85
89
|
record=AirbyteRecordMessage(
|
86
|
-
data=
|
90
|
+
data=asdict(stream_read), stream=stream_name, emitted_at=_emitted_at()
|
87
91
|
),
|
88
92
|
)
|
89
93
|
except Exception as exc:
|
@@ -0,0 +1,591 @@
|
|
1
|
+
#
|
2
|
+
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
3
|
+
#
|
4
|
+
|
5
|
+
import json
|
6
|
+
from copy import deepcopy
|
7
|
+
from json import JSONDecodeError
|
8
|
+
from typing import Any, Dict, List, Mapping, Optional
|
9
|
+
|
10
|
+
from airbyte_cdk.connector_builder.models import (
|
11
|
+
AuxiliaryRequest,
|
12
|
+
HttpRequest,
|
13
|
+
HttpResponse,
|
14
|
+
StreamReadPages,
|
15
|
+
StreamReadSlices,
|
16
|
+
)
|
17
|
+
from airbyte_cdk.models import (
|
18
|
+
AirbyteLogMessage,
|
19
|
+
AirbyteMessage,
|
20
|
+
OrchestratorType,
|
21
|
+
TraceType,
|
22
|
+
)
|
23
|
+
from airbyte_cdk.models import Type as MessageType
|
24
|
+
from airbyte_cdk.sources.utils.slice_logger import SliceLogger
|
25
|
+
from airbyte_cdk.sources.utils.types import JsonType
|
26
|
+
from airbyte_cdk.utils.datetime_format_inferrer import DatetimeFormatInferrer
|
27
|
+
from airbyte_cdk.utils.schema_inferrer import (
|
28
|
+
SchemaInferrer,
|
29
|
+
)
|
30
|
+
|
31
|
+
from .types import LOG_MESSAGES_OUTPUT_TYPE
|
32
|
+
|
33
|
+
# -------
|
34
|
+
# Parsers
|
35
|
+
# -------
|
36
|
+
|
37
|
+
|
38
|
+
def airbyte_message_to_json(message: AirbyteMessage) -> Optional[Dict[str, JsonType]]:
    """
    Extract the JSON payload carried by a LOG message, if there is one.

    Messages that are not of type LOG yield None. For LOG messages the log entry is
    handed to parse_json, and its result is passed through as long as it is either
    None or a dictionary.

    Args:
        message (AirbyteMessage): The message to inspect.

    Returns:
        Optional[Dict[str, JsonType]]: The parsed dictionary, or None when the message is
            not a LOG message or parse_json returned None.

    Raises:
        ValueError: If the parsed payload is neither None nor a dictionary.
    """
    if not is_log_message(message):
        return None

    parsed = parse_json(message.log)  # type: ignore

    # None (nothing parsed) and dict payloads are both acceptable results.
    if parsed is None or isinstance(parsed, dict):
        return parsed

    raise ValueError(f"Expected log message to be a dict, got {parsed} of type {type(parsed)}")
|
65
|
+
|
66
|
+
|
67
|
+
def clean_config(config: Dict[str, Any]) -> Dict[str, Any]:
    """
    Return a deep copy of ``config`` without its top-level keys that start with '__'.

    Only top-level keys are filtered; nested dictionaries are copied untouched.
    The input dictionary is never modified.

    Args:
        config (Dict[str, Any]): The configuration to sanitize.

    Returns:
        Dict[str, Any]: An independent copy of ``config`` minus every top-level key
            that begins with a double underscore.
    """
    sanitized = deepcopy(config)
    # Collect the names first so the copy is not mutated while being inspected.
    private_names = [name for name in config if name.startswith("__")]
    for name in private_names:
        del sanitized[name]
    return sanitized
|
86
|
+
|
87
|
+
|
88
|
+
def create_request_from_log_message(json_http_message: Dict[str, Any]) -> HttpRequest:
    """
    Build an HttpRequest from a JSON-formatted HTTP log message.

    The log message is expected to look like:
        {
            "url": {"full": "<full_url>"},
            "http": {
                "request": {
                    "method": "<HTTP_method>",
                    "headers": <headers>,
                    "body": {"content": "<body_content>"}
                }
            }
        }

    Any of the nested sections may be missing or explicitly null; in both cases the
    documented defaults are used instead of failing on the chained lookup.

    Args:
        json_http_message (Dict[str, Any]): The parsed log message describing the HTTP call.

    Returns:
        HttpRequest: An instance populated with:
            - url: json_http_message["url"]["full"], or "" when unavailable.
            - http_method: json_http_message["http"]["request"]["method"], or "" when unavailable.
            - headers: json_http_message["http"]["request"]["headers"], or None when unavailable.
            - body: json_http_message["http"]["request"]["body"]["content"], or "" when unavailable.
    """
    # `dict.get(key, default)` only falls back when the key is absent. A section that is
    # present but null would otherwise raise AttributeError on the next `.get`.
    url_section = json_http_message.get("url") or {}
    http_section = json_http_message.get("http") or {}
    request = http_section.get("request") or {}
    body_section = request.get("body") or {}

    return HttpRequest(
        url=url_section.get("full", ""),
        http_method=request.get("method", ""),
        headers=request.get("headers"),
        body=body_section.get("content", ""),
    )
|
130
|
+
|
131
|
+
|
132
|
+
def create_response_from_log_message(json_http_message: Dict[str, Any]) -> HttpResponse:
    """
    Build an HttpResponse from a JSON-formatted HTTP log message.

    The response details are read from json_http_message["http"]["response"], which may hold:
        - "status_code": The HTTP status code.
        - "body": A dictionary with a "content" key for the response body.
        - "headers": The HTTP response headers.

    Sections that are missing or explicitly null are treated as empty, so the body
    defaults to "" and status/headers default to None.

    Args:
        json_http_message (Dict[str, Any]): The parsed log message describing the HTTP call.

    Returns:
        HttpResponse: An instance built from the extracted status code, body content and headers.
    """
    # `dict.get(key, default)` only falls back when the key is absent. A section that is
    # present but null would otherwise raise AttributeError on the next `.get`.
    http_section = json_http_message.get("http") or {}
    response = http_section.get("response") or {}
    body_section = response.get("body") or {}

    return HttpResponse(
        status=response.get("status_code"),
        body=body_section.get("content", ""),
        headers=response.get("headers"),
    )
|
151
|
+
|
152
|
+
|
153
|
+
def parse_json(log_message: AirbyteLogMessage) -> JsonType:
    """
    Decode the JSON document stored in a log message's ``message`` attribute.

    Args:
        log_message (AirbyteLogMessage): The log entry whose ``message`` text should be decoded.

    Returns:
        JsonType: The decoded value, or None when the text is not valid JSON.
    """
    # TODO: As a temporary stopgap, the CDK emits request/response data as a log message string. Ideally this should come in the
    # form of a custom message object defined in the Airbyte protocol, but this unblocks us in the immediate while the
    # protocol change is worked on.
    try:
        decoded: JsonType = json.loads(log_message.message)
    except JSONDecodeError:
        # Plain-text log lines are expected here; they simply carry no JSON payload.
        return None
    return decoded
|
175
|
+
|
176
|
+
|
177
|
+
def parse_slice_description(log_message: str) -> Dict[str, Any]:
    """
    Parse the JSON payload of a slice log message.

    The leading SliceLogger.SLICE_LOG_PREFIX is stripped and the remainder is decoded as JSON.
    Only a prefix at the very start of the message is removed; occurrences of the
    same text inside the payload are left intact.

    Args:
        log_message (str): The log text, expected to start with SliceLogger.SLICE_LOG_PREFIX
            followed by a JSON document.

    Returns:
        Dict[str, Any]: The decoded payload.

    Raises:
        json.JSONDecodeError: If the text left after removing the prefix is not valid JSON.
    """
    # removeprefix (rather than replace) guarantees that nothing inside the JSON payload is
    # altered when the message does not begin with the prefix.
    payload = log_message.removeprefix(SliceLogger.SLICE_LOG_PREFIX)
    return json.loads(payload)  # type: ignore
|
196
|
+
|
197
|
+
|
198
|
+
# -------
|
199
|
+
# Conditions
|
200
|
+
# -------
|
201
|
+
|
202
|
+
|
203
|
+
def should_close_page(
    at_least_one_page_in_group: bool,
    message: AirbyteMessage,
    json_message: Optional[Dict[str, Any]],
) -> bool:
    """
    Decide whether the page currently being assembled should be closed.

    Args:
        at_least_one_page_in_group (bool): Whether at least one page already exists in the group.
        message (AirbyteMessage): The message being processed.
        json_message (Optional[Dict[str, Any]]): The parsed JSON form of the log message, if any.

    Returns:
        bool: True only when all of the following hold:
            - There is at least one page in the group.
            - The message is a LOG message.
            - Either ``json_message`` is a page HTTP request (see is_page_http_request), or the
              log text starts with SliceLogger.SLICE_LOG_PREFIX.
          Otherwise False.
    """
    if not at_least_one_page_in_group or not is_log_message(message):
        return False

    if is_page_http_request(json_message):
        return True

    # Use the shared prefix constant (the one should_process_slice_descriptor checks) instead
    # of repeating the literal, so both predicates agree on what a slice log looks like.
    return message.log.message.startswith(SliceLogger.SLICE_LOG_PREFIX)  # type: ignore[union-attr] # AirbyteMessage with MessageType.LOG has log.message
|
232
|
+
|
233
|
+
|
234
|
+
def should_process_slice_descriptor(message: AirbyteMessage) -> bool:
    """
    Tell whether a message is a slice descriptor log entry.

    Args:
        message (AirbyteMessage): The message to evaluate.

    Returns:
        bool: True when the message is a LOG message whose text starts with
            SliceLogger.SLICE_LOG_PREFIX; otherwise False.
    """
    if not is_log_message(message):
        return False

    log_text = message.log.message  # type: ignore
    return log_text.startswith(SliceLogger.SLICE_LOG_PREFIX)  # type: ignore
|
252
|
+
|
253
|
+
|
254
|
+
def should_close_page_for_slice(at_least_one_page_in_group: bool, message: AirbyteMessage) -> bool:
    """
    Decide whether the open page must be closed because a new slice descriptor arrived.

    Args:
        at_least_one_page_in_group (bool): Whether at least one page already exists in the group.
        message (AirbyteMessage): The message being processed.

    Returns:
        bool: True when a page exists and the message is a slice descriptor
            (see should_process_slice_descriptor); otherwise False.
    """
    if not at_least_one_page_in_group:
        return False
    return should_process_slice_descriptor(message)
|
269
|
+
|
270
|
+
|
271
|
+
def is_page_http_request(json_message: Optional[Dict[str, Any]]) -> bool:
    """
    Tell whether a parsed log message describes a page HTTP request.

    A page request is an HTTP log (see is_http_log) that is not flagged as an
    auxiliary request (see is_auxiliary_http_request).

    Args:
        json_message (Optional[Dict[str, Any]]): The parsed log message. None or an empty
            dictionary is never a page HTTP request.

    Returns:
        bool: True for a non-auxiliary HTTP log; otherwise False.
    """
    if json_message and is_http_log(json_message):
        return not is_auxiliary_http_request(json_message)
    return False
|
291
|
+
|
292
|
+
|
293
|
+
def is_http_log(message: Dict[str, JsonType]) -> bool:
    """
    Tell whether a parsed log message is an HTTP log.

    Args:
        message (Dict[str, JsonType]): The parsed log message.

    Returns:
        bool: True when the "http" key is present with a truthy value; otherwise False.
    """
    http_section = message.get("http")
    return bool(http_section)
|
308
|
+
|
309
|
+
|
310
|
+
def is_auxiliary_http_request(message: Optional[Dict[str, Any]]) -> bool:
    """
    Determine whether the provided message represents an auxiliary HTTP request.

    An auxiliary request is a request that is performed but will not directly lead to records
    for the specific stream being queried.

    A couple of examples are:
    * OAuth authentication
    * Substream slice generation

    Args:
        message (Optional[Dict[str, Any]]): The parsed log message for an HTTP request, or None.

    Returns:
        bool: True when the message has a truthy "http" section whose "is_auxiliary" entry is
            truthy; otherwise False. The result is always a real bool, never the raw flag value.
    """
    if not message:
        return False

    http_section = message.get("http")
    if not http_section:
        # Not an HTTP log at all.
        return False

    # Coerce explicitly: the flag comes from parsed JSON and may be None or any other value.
    return bool(http_section.get("is_auxiliary", False))
|
331
|
+
|
332
|
+
|
333
|
+
def is_log_message(message: AirbyteMessage) -> bool:
    """
    Tell whether a message is a LOG message.

    Args:
        message (AirbyteMessage): The message to evaluate.

    Returns:
        bool: True when ``message.type`` equals MessageType.LOG; otherwise False.
    """
    message_type = message.type
    return message_type == MessageType.LOG  # type: ignore
|
344
|
+
|
345
|
+
|
346
|
+
def is_trace_with_error(message: AirbyteMessage) -> bool:
    """
    Tell whether a message is a TRACE message carrying an error.

    Args:
        message (AirbyteMessage): The message to evaluate.

    Returns:
        bool: True when ``message.type`` equals MessageType.TRACE and ``message.trace.type``
            equals TraceType.ERROR; otherwise False.
    """
    is_trace = message.type == MessageType.TRACE
    # `message.trace` is only inspected once the message is known to be a TRACE message.
    return is_trace and message.trace.type == TraceType.ERROR  # type: ignore
|
359
|
+
|
360
|
+
|
361
|
+
def is_record_message(message: AirbyteMessage) -> bool:
    """
    Tell whether a message is a RECORD message.

    Args:
        message (AirbyteMessage): The message to evaluate.

    Returns:
        bool: True when ``message.type`` equals MessageType.RECORD; otherwise False.
    """
    message_type = message.type
    return message_type == MessageType.RECORD  # type: ignore
|
372
|
+
|
373
|
+
|
374
|
+
def is_config_update_message(message: AirbyteMessage) -> bool:
    """
    Tell whether a message is a connector configuration update.

    Args:
        message (AirbyteMessage): The message to evaluate.

    Returns:
        bool: True when ``message.type`` equals MessageType.CONTROL and ``message.control.type``
            equals OrchestratorType.CONNECTOR_CONFIG; otherwise False.
    """
    is_control = message.type == MessageType.CONTROL
    # `message.control` is only inspected once the message is known to be a CONTROL message.
    return is_control and message.control.type == OrchestratorType.CONNECTOR_CONFIG  # type: ignore
|
392
|
+
|
393
|
+
|
394
|
+
def is_state_message(message: AirbyteMessage) -> bool:
    """
    Tell whether a message is a STATE message.

    Args:
        message (AirbyteMessage): The message to evaluate.

    Returns:
        bool: True when ``message.type`` equals MessageType.STATE; otherwise False.
    """
    message_type = message.type
    return message_type == MessageType.STATE  # type: ignore
|
405
|
+
|
406
|
+
|
407
|
+
# -------
|
408
|
+
# Handlers
|
409
|
+
# -------
|
410
|
+
|
411
|
+
|
412
|
+
def handle_current_slice(
    current_slice_pages: List[StreamReadPages],
    current_slice_descriptor: Optional[Dict[str, Any]] = None,
    latest_state_message: Optional[Dict[str, Any]] = None,
) -> StreamReadSlices:
    """
    Package the pages, descriptor and state of the current slice into a StreamReadSlices.

    Args:
        current_slice_pages (List[StreamReadPages]): The pages that belong to the slice.
        current_slice_descriptor (Optional[Dict[str, Any]]): The descriptor of the slice, if any.
        latest_state_message (Optional[Dict[str, Any]]): The latest state message, if any.

    Returns:
        StreamReadSlices: The slice, whose ``state`` is a one-element list holding
            ``latest_state_message`` when it is truthy and an empty list otherwise.
    """
    slice_state = []
    if latest_state_message:
        slice_state.append(latest_state_message)

    return StreamReadSlices(
        pages=current_slice_pages,
        slice_descriptor=current_slice_descriptor,
        state=slice_state,
    )
|
433
|
+
|
434
|
+
|
435
|
+
def handle_current_page(
    current_page_request: Optional[HttpRequest],
    current_page_response: Optional[HttpResponse],
    current_slice_pages: List[StreamReadPages],
    current_page_records: List[Mapping[str, Any]],
) -> tuple[None, None]:
    """
    Close the current page: store it in the slice's page list and reset the record buffer.

    A StreamReadPages built from the request, the response and a deep copy of the
    records is appended to ``current_slice_pages``; ``current_page_records`` is then
    emptied in place.

    Args:
        current_page_request (Optional[HttpRequest]): The HTTP request of the page being closed.
        current_page_response (Optional[HttpResponse]): The HTTP response of the page being closed.
        current_slice_pages (List[StreamReadPages]): The list that receives the closed page (mutated).
        current_page_records (List[Mapping[str, Any]]): The records of the page (cleared afterwards).

    Returns:
        tuple[None, None]: Always ``(None, None)``.
    """
    # Snapshot the records first: the buffer is cleared below and must not alias the stored page.
    records_snapshot = deepcopy(current_page_records)
    closed_page = StreamReadPages(
        request=current_page_request,
        response=current_page_response,
        records=records_snapshot,  # type: ignore [arg-type]
    )
    current_slice_pages.append(closed_page)
    current_page_records.clear()

    return None, None
|
465
|
+
|
466
|
+
|
467
|
+
def handle_auxiliary_request(json_message: Dict[str, JsonType]) -> AuxiliaryRequest:
    """
    Build an AuxiliaryRequest from a parsed auxiliary HTTP log message.

    The "airbyte_cdk" section, its nested "stream" section and the "http" section are
    read from the message (each defaulting to an empty dictionary when absent) and
    must be dictionaries. The title is the "title" entry of the "http" section,
    prefixed with "Parent stream: " when the stream section has a truthy "is_substream"
    entry. Title and description are passed through ``str``, so a missing entry
    becomes the text "None".

    Args:
        json_message (Dict[str, JsonType]): The parsed JSON log message describing the
            auxiliary request.

    Returns:
        AuxiliaryRequest: The title, description, request and response extracted from the message.

    Raises:
        ValueError: If the "airbyte_cdk", "stream" or "http" section is not a dictionary.
    """
    cdk_section = json_message.get("airbyte_cdk", {})
    if not isinstance(cdk_section, dict):
        raise ValueError(
            f"Expected airbyte_cdk to be a dict, got {cdk_section} of type {type(cdk_section)}"
        )

    stream_section = cdk_section.get("stream", {})
    if not isinstance(stream_section, dict):
        raise ValueError(
            f"Expected stream to be a dict, got {stream_section} of type {type(stream_section)}"
        )

    http_section = json_message.get("http", {})
    if not isinstance(http_section, dict):
        raise ValueError(
            f"Expected http to be a dict, got {http_section} of type {type(http_section)}"
        )

    title = str(http_section.get("title", None))
    if stream_section.get("is_substream", False):
        title = "Parent stream: " + title

    return AuxiliaryRequest(
        title=title,
        description=str(http_section.get("description", None)),
        request=create_request_from_log_message(json_message),
        response=create_response_from_log_message(json_message),
    )
|
513
|
+
|
514
|
+
|
515
|
+
def handle_log_message(
    message: AirbyteMessage,
    json_message: Dict[str, JsonType] | None,
    at_least_one_page_in_group: bool,
    current_page_request: Optional[HttpRequest],
    current_page_response: Optional[HttpResponse],
) -> LOG_MESSAGES_OUTPUT_TYPE:
    """
    Process a log message, distinguishing plain logs, auxiliary HTTP logs and page HTTP logs.

    Args:
        message (AirbyteMessage): The original log message.
        json_message (Dict[str, JsonType] | None): The parsed JSON form of the log message, if available.
        at_least_one_page_in_group (bool): Whether at least one page of the group has been seen so far.
        current_page_request (Optional[HttpRequest]): The request of the current page, if any.
        current_page_response (Optional[HttpResponse]): The response of the current page, if any.

    Returns:
        LOG_MESSAGES_OUTPUT_TYPE: A tuple containing:
            - The (possibly updated) flag telling whether the group has at least one page.
            - The (possibly updated) HttpRequest of the current page.
            - The (possibly updated) HttpResponse of the current page.
            - The auxiliary request built from the message, the raw ``message.log`` for
              non-HTTP logs, or None for a page HTTP log.

    Note:
        An HTTP log flagged as auxiliary is converted with handle_auxiliary_request and leaves
        the page state untouched. Any other HTTP log marks the group as having a page and
        replaces the current request and response with ones built from the log message.
    """
    extra_output = None

    if json_message is None or not is_http_log(json_message):
        # Not an HTTP log: hand the raw log entry back to the caller.
        extra_output = message.log
    elif is_auxiliary_http_request(json_message):
        extra_output = handle_auxiliary_request(json_message)
    else:
        at_least_one_page_in_group = True
        current_page_request = create_request_from_log_message(json_message)
        current_page_response = create_response_from_log_message(json_message)

    return (
        at_least_one_page_in_group,
        current_page_request,
        current_page_response,
        extra_output,
    )
|
563
|
+
|
564
|
+
|
565
|
+
def handle_record_message(
    message: AirbyteMessage,
    schema_inferrer: SchemaInferrer,
    datetime_format_inferrer: DatetimeFormatInferrer,
    records_count: int,
    current_page_records: List[Mapping[str, Any]],
) -> int:
    """
    Account for a record message: buffer its data and feed both inferrers.

    Messages whose type is not MessageType.RECORD are ignored and leave every
    argument untouched.

    Args:
        message (AirbyteMessage): The message to process.
        schema_inferrer (SchemaInferrer): Receives the record through ``accumulate``.
        datetime_format_inferrer (DatetimeFormatInferrer): Receives the record through ``accumulate``.
        records_count (int): The number of records processed so far.
        current_page_records (List[Mapping[str, Any]]): The buffer that receives the record's data (mutated).

    Returns:
        int: ``records_count`` plus one for a record message, otherwise ``records_count`` unchanged.
    """
    is_record = message.type == MessageType.RECORD
    if not is_record:
        return records_count

    record = message.record
    current_page_records.append(record.data)  # type: ignore
    schema_inferrer.accumulate(record)  # type: ignore
    datetime_format_inferrer.accumulate(record)  # type: ignore

    return records_count + 1
|