airbyte-cdk 0.52.6__py3-none-any.whl → 0.52.8__py3-none-any.whl
- airbyte_cdk/destinations/vector_db_based/config.py +1 -0
- airbyte_cdk/sources/abstract_source.py +12 -61
- airbyte_cdk/sources/file_based/config/unstructured_format.py +1 -1
- airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +1 -2
- airbyte_cdk/sources/message/repository.py +0 -6
- airbyte_cdk/sources/source.py +14 -13
- airbyte_cdk/sources/streams/concurrent/adapters.py +94 -21
- airbyte_cdk/sources/streams/concurrent/cursor.py +148 -0
- airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +2 -3
- airbyte_cdk/sources/streams/concurrent/partitions/partition.py +3 -0
- airbyte_cdk/sources/streams/concurrent/partitions/partition_generator.py +1 -3
- airbyte_cdk/sources/streams/concurrent/thread_based_concurrent_stream.py +7 -3
- airbyte_cdk/sources/streams/core.py +71 -1
- {airbyte_cdk-0.52.6.dist-info → airbyte_cdk-0.52.8.dist-info}/METADATA +3 -3
- {airbyte_cdk-0.52.6.dist-info → airbyte_cdk-0.52.8.dist-info}/RECORD +32 -30
- {airbyte_cdk-0.52.6.dist-info → airbyte_cdk-0.52.8.dist-info}/WHEEL +1 -1
- unit_tests/sources/file_based/file_types/test_unstructured_parser.py +5 -0
- unit_tests/sources/file_based/scenarios/csv_scenarios.py +1 -1
- unit_tests/sources/file_based/scenarios/unstructured_scenarios.py +16 -0
- unit_tests/sources/message/test_repository.py +7 -20
- unit_tests/sources/streams/concurrent/scenarios/stream_facade_builder.py +46 -5
- unit_tests/sources/streams/concurrent/scenarios/stream_facade_scenarios.py +154 -37
- unit_tests/sources/streams/concurrent/scenarios/test_concurrent_scenarios.py +6 -0
- unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_source_builder.py +19 -3
- unit_tests/sources/streams/concurrent/test_adapters.py +48 -22
- unit_tests/sources/streams/concurrent/test_concurrent_partition_generator.py +5 -4
- unit_tests/sources/streams/concurrent/test_cursor.py +130 -0
- unit_tests/sources/streams/concurrent/test_thread_based_concurrent_stream.py +14 -10
- unit_tests/sources/streams/test_stream_read.py +3 -1
- unit_tests/sources/test_abstract_source.py +12 -9
- {airbyte_cdk-0.52.6.dist-info → airbyte_cdk-0.52.8.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-0.52.6.dist-info → airbyte_cdk-0.52.8.dist-info}/top_level.txt +0 -0
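The headline change is the new concurrent cursor module (airbyte_cdk/sources/streams/concurrent/cursor.py, +148 lines) and the test file that exercises it, shown in full below. As a rough, illustrative sketch of the API surface those tests rely on (this snippet is not part of the diff; it mirrors the new test code, assumes airbyte-cdk 0.52.8 is installed, mocks every collaborator, and uses placeholder stream and field names):

from unittest.mock import Mock

from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
from airbyte_cdk.sources.message import MessageRepository
from airbyte_cdk.sources.streams.concurrent.cursor import ConcurrentCursor, CursorField
from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
from airbyte_cdk.sources.streams.concurrent.partitions.record import Record

message_repository = Mock(spec=MessageRepository)   # would normally come from the source
state_manager = Mock(spec=ConnectorStateManager)

cursor = ConcurrentCursor(
    "users",                                # placeholder stream name
    None,                                   # stream namespace
    None,                                   # incoming stream state
    message_repository,
    state_manager,
    CursorField("updated_at"),              # placeholder cursor field
    ("lower_boundary", "upper_boundary"),   # slice boundary fields, as in the tests below
)

partition = Mock(spec=Partition)
partition.to_slice.return_value = {"lower_boundary": 12, "upper_boundary": 30}

cursor.observe(Record(data={"updated_at": 20}))  # called for each record read from the slice
cursor.close_partition(partition)                # updates state and emits a STATE message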
unit_tests/sources/streams/concurrent/test_cursor.py (new file)
@@ -0,0 +1,130 @@
+from typing import Any, Mapping, Optional
+from unittest import TestCase
+from unittest.mock import Mock
+
+import pytest
+from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
+from airbyte_cdk.sources.message import MessageRepository
+from airbyte_cdk.sources.streams.concurrent.cursor import Comparable, ConcurrentCursor, CursorField
+from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
+from airbyte_cdk.sources.streams.concurrent.partitions.record import Record
+
+_A_STREAM_NAME = "a stream name"
+_A_STREAM_NAMESPACE = "a stream namespace"
+_ANY_STATE = None
+_A_CURSOR_FIELD_KEY = "a_cursor_field_key"
+_NO_PARTITION_IDENTIFIER = None
+_NO_SLICE = None
+_NO_SLICE_BOUNDARIES = None
+_LOWER_SLICE_BOUNDARY_FIELD = "lower_boundary"
+_UPPER_SLICE_BOUNDARY_FIELD = "upper_boundary"
+_SLICE_BOUNDARY_FIELDS = (_LOWER_SLICE_BOUNDARY_FIELD, _UPPER_SLICE_BOUNDARY_FIELD)
+_A_VERY_HIGH_CURSOR_VALUE = 1000000000
+
+
+def _partition(_slice: Optional[Mapping[str, Any]]) -> Partition:
+    partition = Mock(spec=Partition)
+    partition.to_slice.return_value = _slice
+    return partition
+
+
+def _record(cursor_value: Comparable) -> Record:
+    return Record(data={_A_CURSOR_FIELD_KEY: cursor_value})
+
+
+class ConcurrentCursorTest(TestCase):
+    def setUp(self) -> None:
+        self._message_repository = Mock(spec=MessageRepository)
+        self._state_manager = Mock(spec=ConnectorStateManager)
+
+    def _cursor_with_slice_boundary_fields(self) -> ConcurrentCursor:
+        return ConcurrentCursor(
+            _A_STREAM_NAME,
+            _A_STREAM_NAMESPACE,
+            _ANY_STATE,
+            self._message_repository,
+            self._state_manager,
+            CursorField(_A_CURSOR_FIELD_KEY),
+            _SLICE_BOUNDARY_FIELDS,
+        )
+
+    def _cursor_without_slice_boundary_fields(self) -> ConcurrentCursor:
+        return ConcurrentCursor(
+            _A_STREAM_NAME,
+            _A_STREAM_NAMESPACE,
+            _ANY_STATE,
+            self._message_repository,
+            self._state_manager,
+            CursorField(_A_CURSOR_FIELD_KEY),
+            None,
+        )
+
+    def test_given_boundary_fields_when_close_partition_then_emit_state(self) -> None:
+        self._cursor_with_slice_boundary_fields().close_partition(
+            _partition(
+                {_LOWER_SLICE_BOUNDARY_FIELD: 12, _UPPER_SLICE_BOUNDARY_FIELD: 30},
+            )
+        )
+
+        self._message_repository.emit_message.assert_called_once_with(self._state_manager.create_state_message.return_value)
+        self._state_manager.update_state_for_stream.assert_called_once_with(
+            _A_STREAM_NAME,
+            _A_STREAM_NAMESPACE,
+            {
+                "slices": [
+                    {
+                        "start": 12,
+                        "end": 30,
+                    },
+                ]
+            },
+        )
+
+    def test_given_boundary_fields_and_record_observed_when_close_partition_then_ignore_records(self) -> None:
+        cursor = self._cursor_with_slice_boundary_fields()
+        cursor.observe(_record(_A_VERY_HIGH_CURSOR_VALUE))
+
+        cursor.close_partition(_partition({_LOWER_SLICE_BOUNDARY_FIELD: 12, _UPPER_SLICE_BOUNDARY_FIELD: 30}))
+
+        assert self._state_manager.update_state_for_stream.call_args_list[0].args[2]["slices"][0]["end"] != _A_VERY_HIGH_CURSOR_VALUE
+
+    def test_given_no_boundary_fields_when_close_partition_then_emit_state(self) -> None:
+        cursor = self._cursor_without_slice_boundary_fields()
+        cursor.observe(_record(10))
+        cursor.close_partition(_partition(_NO_SLICE))
+
+        self._state_manager.update_state_for_stream.assert_called_once_with(
+            _A_STREAM_NAME,
+            _A_STREAM_NAMESPACE,
+            {
+                "slices": [
+                    {
+                        "start": 0,
+                        "end": 10,
+                    },
+                ]
+            },
+        )
+
+    def test_given_no_boundary_fields_when_close_multiple_partitions_then_raise_exception(self) -> None:
+        cursor = self._cursor_without_slice_boundary_fields()
+        cursor.observe(_record(10))
+        cursor.close_partition(_partition(_NO_SLICE))
+
+        with pytest.raises(ValueError):
+            cursor.close_partition(_partition(_NO_SLICE))
+
+    def test_given_no_records_observed_when_close_partition_then_do_not_emit_state(self) -> None:
+        cursor = self._cursor_without_slice_boundary_fields()
+        cursor.close_partition(_partition(_NO_SLICE))
+        assert self._message_repository.emit_message.call_count == 0
+
+    def test_given_slice_boundaries_and_no_slice_when_close_partition_then_raise_error(self) -> None:
+        cursor = self._cursor_with_slice_boundary_fields()
+        with pytest.raises(KeyError):
+            cursor.close_partition(_partition(_NO_SLICE))
+
+    def test_given_slice_boundaries_not_matching_slice_when_close_partition_then_raise_error(self) -> None:
+        cursor = self._cursor_with_slice_boundary_fields()
+        with pytest.raises(KeyError):
+            cursor.close_partition(_partition({"not_matching_key": "value"}))
unit_tests/sources/streams/concurrent/test_thread_based_concurrent_stream.py
@@ -5,9 +5,9 @@
 import unittest
 from unittest.mock import Mock, call
 
-import pytest
 from airbyte_cdk.models import AirbyteStream, SyncMode
 from airbyte_cdk.sources.streams.concurrent.availability_strategy import STREAM_AVAILABLE
+from airbyte_cdk.sources.streams.concurrent.cursor import Cursor
 from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
 from airbyte_cdk.sources.streams.concurrent.partitions.record import Record
 from airbyte_cdk.sources.streams.concurrent.thread_based_concurrent_stream import ThreadBasedConcurrentStream
@@ -25,6 +25,7 @@ class ThreadBasedConcurrentStreamTest(unittest.TestCase):
         self._slice_logger = Mock()
         self._logger = Mock()
         self._message_repository = Mock()
+        self._cursor = Mock(spec=Cursor)
         self._stream = ThreadBasedConcurrentStream(
             self._partition_generator,
             self._max_workers,
@@ -39,6 +40,7 @@ class ThreadBasedConcurrentStreamTest(unittest.TestCase):
             1,
             2,
             0,
+            cursor=self._cursor,
         )
 
     def test_get_json_schema(self):
@@ -76,17 +78,20 @@ class ThreadBasedConcurrentStreamTest(unittest.TestCase):
         with self.assertRaises(Exception):
             self._stream._check_for_errors(futures)
 
-    def
+    def test_read_observe_records_and_close_partition(self):
         partition = Mock(spec=Partition)
-
+        expected_records = [Record({"id": 1}), Record({"id": "2"})]
+        partition.read.return_value = expected_records
+        partition.to_slice.return_value = {"slice": "slice"}
+        self._slice_logger.should_log_slice_message.return_value = False
+
         self._partition_generator.generate.return_value = [partition]
-
-
+        actual_records = list(self._stream.read())
+
+        assert expected_records == actual_records
 
-
-        self.
-        with pytest.raises(RuntimeError):
-            list(self._stream.read())
+        self._cursor.observe.has_calls([call(record) for record in expected_records])
+        self._cursor.close_partition.assert_called_once_with(partition)
 
     def test_read_no_slice_message(self):
         partition = Mock(spec=Partition)
@@ -218,7 +223,6 @@ class ThreadBasedConcurrentStreamTest(unittest.TestCase):
         assert expected_airbyte_stream == airbyte_stream
 
     def test_as_airbyte_stream_with_a_cursor(self):
-
        json_schema = {
            "type": "object",
            "properties": {
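The hunks above wire a Cursor into ThreadBasedConcurrentStream and replace the removed test with test_read_observe_records_and_close_partition, which asserts that read() passes every record to the cursor and then closes the partition. A hypothetical sketch of that per-partition contract (not the library's actual read loop, only the behavior the new test asserts):

from typing import Iterable

from airbyte_cdk.sources.streams.concurrent.cursor import Cursor
from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
from airbyte_cdk.sources.streams.concurrent.partitions.record import Record


def read_partition_with_cursor(partition: Partition, cursor: Cursor) -> Iterable[Record]:
    """Hypothetical helper: yield a partition's records while the cursor tracks progress."""
    for record in partition.read():
        cursor.observe(record)          # the cursor sees every record as it is produced
        yield record
    cursor.close_partition(partition)   # checkpoint once the partition is fully read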
unit_tests/sources/streams/test_stream_read.py
@@ -12,6 +12,7 @@ from airbyte_cdk.models import Type as MessageType
 from airbyte_cdk.sources.message import InMemoryMessageRepository
 from airbyte_cdk.sources.streams import Stream
 from airbyte_cdk.sources.streams.concurrent.adapters import StreamFacade
+from airbyte_cdk.sources.streams.concurrent.cursor import NoopCursor
 from airbyte_cdk.sources.streams.core import StreamData
 from airbyte_cdk.sources.utils.schema_helpers import InternalConfig
 from airbyte_cdk.sources.utils.slice_logger import DebugSliceLogger
@@ -19,6 +20,7 @@ from airbyte_cdk.sources.utils.slice_logger import DebugSliceLogger
 _A_CURSOR_FIELD = ["NESTED", "CURSOR"]
 _DEFAULT_INTERNAL_CONFIG = InternalConfig()
 _STREAM_NAME = "STREAM"
+_NO_STATE = None
 
 
 class _MockStream(Stream):
@@ -57,7 +59,7 @@ def _concurrent_stream(slice_to_partition_mapping, slice_logger, logger, message
     source = Mock()
     source._slice_logger = slice_logger
     source.message_repository = message_repository
-    stream = StreamFacade.create_from_stream(stream, source, logger, 1)
+    stream = StreamFacade.create_from_stream(stream, source, logger, 1, _NO_STATE, NoopCursor())
     stream.logger.setLevel(logger.level)
     return stream
 
unit_tests/sources/test_abstract_source.py
@@ -996,10 +996,11 @@ class TestIncrementalRead:
                 _as_state({"s1": state}, "s1", state) if per_stream_enabled else _as_state({"s1": state}),
                 # stream 1 slice 2
                 _as_record("s1", stream_output[0]),
-                _as_record("s1", stream_output[1]),
                 _as_state({"s1": state}, "s1", state) if per_stream_enabled else _as_state({"s1": state}),
+                _as_record("s1", stream_output[1]),
                 _as_record("s1", stream_output[2]),
                 _as_state({"s1": state}, "s1", state) if per_stream_enabled else _as_state({"s1": state}),
+                _as_state({"s1": state}, "s1", state) if per_stream_enabled else _as_state({"s1": state}),
                 _as_stream_status("s1", AirbyteStreamStatus.COMPLETE),
                 # stream 2 slice 1
                 _as_stream_status("s2", AirbyteStreamStatus.STARTED),
@@ -1011,17 +1012,18 @@ class TestIncrementalRead:
                 _as_state({"s1": state, "s2": state}, "s2", state) if per_stream_enabled else _as_state({"s1": state, "s2": state}),
                 # stream 2 slice 2
                 _as_record("s2", stream_output[0]),
-                _as_record("s2", stream_output[1]),
                 _as_state({"s1": state, "s2": state}, "s2", state) if per_stream_enabled else _as_state({"s1": state, "s2": state}),
+                _as_record("s2", stream_output[1]),
                 _as_record("s2", stream_output[2]),
                 _as_state({"s1": state, "s2": state}, "s2", state) if per_stream_enabled else _as_state({"s1": state, "s2": state}),
+                _as_state({"s1": state, "s2": state}, "s2", state) if per_stream_enabled else _as_state({"s1": state, "s2": state}),
                 _as_stream_status("s2", AirbyteStreamStatus.COMPLETE),
             ]
         )
 
         messages = _fix_emitted_at(list(src.read(logger, {}, catalog, state=input_state)))
 
-        assert
+        assert messages == expected
 
     @pytest.mark.parametrize(
         "per_stream_enabled",
@@ -1108,11 +1110,12 @@ class TestIncrementalRead:
                 _as_state({"s1": state}, "s1", state) if per_stream_enabled else _as_state({"s1": state}),
                 # stream 1 slice 2
                 stream_data_to_airbyte_message("s1", stream_output[0]),
+                _as_state({"s1": state}, "s1", state) if per_stream_enabled else _as_state({"s1": state}),
                 stream_data_to_airbyte_message("s1", stream_output[1]),
                 stream_data_to_airbyte_message("s1", stream_output[2]),
-                _as_state({"s1": state}, "s1", state) if per_stream_enabled else _as_state({"s1": state}),
                 stream_data_to_airbyte_message("s1", stream_output[3]),
                 _as_state({"s1": state}, "s1", state) if per_stream_enabled else _as_state({"s1": state}),
+                _as_state({"s1": state}, "s1", state) if per_stream_enabled else _as_state({"s1": state}),
                 _as_stream_status("s1", AirbyteStreamStatus.COMPLETE),
                 # stream 2 slice 1
                 _as_stream_status("s2", AirbyteStreamStatus.STARTED),
@@ -1125,33 +1128,33 @@ class TestIncrementalRead:
                 _as_state({"s1": state, "s2": state}, "s2", state) if per_stream_enabled else _as_state({"s1": state, "s2": state}),
                 # stream 2 slice 2
                 stream_data_to_airbyte_message("s2", stream_output[0]),
+                _as_state({"s1": state, "s2": state}, "s2", state) if per_stream_enabled else _as_state({"s1": state, "s2": state}),
                 stream_data_to_airbyte_message("s2", stream_output[1]),
                 stream_data_to_airbyte_message("s2", stream_output[2]),
-                _as_state({"s1": state, "s2": state}, "s2", state) if per_stream_enabled else _as_state({"s1": state, "s2": state}),
                 stream_data_to_airbyte_message("s2", stream_output[3]),
                 _as_state({"s1": state, "s2": state}, "s2", state) if per_stream_enabled else _as_state({"s1": state, "s2": state}),
+                _as_state({"s1": state, "s2": state}, "s2", state) if per_stream_enabled else _as_state({"s1": state, "s2": state}),
                 _as_stream_status("s2", AirbyteStreamStatus.COMPLETE),
             ]
         )
 
         messages = _fix_emitted_at(list(src.read(logger, {}, catalog, state=input_state)))
 
-        assert
+        assert messages == expected
 
 
 def test_checkpoint_state_from_stream_instance():
     teams_stream = MockStreamOverridesStateMethod()
     managers_stream = StreamNoStateMethod()
-    src = MockSource(streams=[teams_stream, managers_stream])
     state_manager = ConnectorStateManager({"teams": teams_stream, "managers": managers_stream}, [])
 
     # The stream_state passed to checkpoint_state() should be ignored since stream implements state function
     teams_stream.state = {"updated_at": "2022-09-11"}
-    actual_message =
+    actual_message = teams_stream._checkpoint_state({"ignored": "state"}, state_manager, True)
     assert actual_message == _as_state({"teams": {"updated_at": "2022-09-11"}}, "teams", {"updated_at": "2022-09-11"})
 
     # The stream_state passed to checkpoint_state() should be used since the stream does not implement state function
-    actual_message =
+    actual_message = managers_stream._checkpoint_state({"updated": "expected_here"}, state_manager, True)
     assert actual_message == _as_state(
         {"teams": {"updated_at": "2022-09-11"}, "managers": {"updated": "expected_here"}}, "managers", {"updated": "expected_here"}
     )
File without changes
|
File without changes
|