airbyte-cdk 0.52.6__py3-none-any.whl → 0.52.8__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
- airbyte_cdk/destinations/vector_db_based/config.py +1 -0
- airbyte_cdk/sources/abstract_source.py +12 -61
- airbyte_cdk/sources/file_based/config/unstructured_format.py +1 -1
- airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +1 -2
- airbyte_cdk/sources/message/repository.py +0 -6
- airbyte_cdk/sources/source.py +14 -13
- airbyte_cdk/sources/streams/concurrent/adapters.py +94 -21
- airbyte_cdk/sources/streams/concurrent/cursor.py +148 -0
- airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +2 -3
- airbyte_cdk/sources/streams/concurrent/partitions/partition.py +3 -0
- airbyte_cdk/sources/streams/concurrent/partitions/partition_generator.py +1 -3
- airbyte_cdk/sources/streams/concurrent/thread_based_concurrent_stream.py +7 -3
- airbyte_cdk/sources/streams/core.py +71 -1
- {airbyte_cdk-0.52.6.dist-info → airbyte_cdk-0.52.8.dist-info}/METADATA +3 -3
- {airbyte_cdk-0.52.6.dist-info → airbyte_cdk-0.52.8.dist-info}/RECORD +32 -30
- {airbyte_cdk-0.52.6.dist-info → airbyte_cdk-0.52.8.dist-info}/WHEEL +1 -1
- unit_tests/sources/file_based/file_types/test_unstructured_parser.py +5 -0
- unit_tests/sources/file_based/scenarios/csv_scenarios.py +1 -1
- unit_tests/sources/file_based/scenarios/unstructured_scenarios.py +16 -0
- unit_tests/sources/message/test_repository.py +7 -20
- unit_tests/sources/streams/concurrent/scenarios/stream_facade_builder.py +46 -5
- unit_tests/sources/streams/concurrent/scenarios/stream_facade_scenarios.py +154 -37
- unit_tests/sources/streams/concurrent/scenarios/test_concurrent_scenarios.py +6 -0
- unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_source_builder.py +19 -3
- unit_tests/sources/streams/concurrent/test_adapters.py +48 -22
- unit_tests/sources/streams/concurrent/test_concurrent_partition_generator.py +5 -4
- unit_tests/sources/streams/concurrent/test_cursor.py +130 -0
- unit_tests/sources/streams/concurrent/test_thread_based_concurrent_stream.py +14 -10
- unit_tests/sources/streams/test_stream_read.py +3 -1
- unit_tests/sources/test_abstract_source.py +12 -9
- {airbyte_cdk-0.52.6.dist-info → airbyte_cdk-0.52.8.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-0.52.6.dist-info → airbyte_cdk-0.52.8.dist-info}/top_level.txt +0 -0
unit_tests/sources/streams/concurrent/test_cursor.py
@@ -0,0 +1,130 @@
+from typing import Any, Mapping, Optional
+from unittest import TestCase
+from unittest.mock import Mock
+
+import pytest
+from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
+from airbyte_cdk.sources.message import MessageRepository
+from airbyte_cdk.sources.streams.concurrent.cursor import Comparable, ConcurrentCursor, CursorField
+from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
+from airbyte_cdk.sources.streams.concurrent.partitions.record import Record
+
+_A_STREAM_NAME = "a stream name"
+_A_STREAM_NAMESPACE = "a stream namespace"
+_ANY_STATE = None
+_A_CURSOR_FIELD_KEY = "a_cursor_field_key"
+_NO_PARTITION_IDENTIFIER = None
+_NO_SLICE = None
+_NO_SLICE_BOUNDARIES = None
+_LOWER_SLICE_BOUNDARY_FIELD = "lower_boundary"
+_UPPER_SLICE_BOUNDARY_FIELD = "upper_boundary"
+_SLICE_BOUNDARY_FIELDS = (_LOWER_SLICE_BOUNDARY_FIELD, _UPPER_SLICE_BOUNDARY_FIELD)
+_A_VERY_HIGH_CURSOR_VALUE = 1000000000
+
+
+def _partition(_slice: Optional[Mapping[str, Any]]) -> Partition:
+    partition = Mock(spec=Partition)
+    partition.to_slice.return_value = _slice
+    return partition
+
+
+def _record(cursor_value: Comparable) -> Record:
+    return Record(data={_A_CURSOR_FIELD_KEY: cursor_value})
+
+
+class ConcurrentCursorTest(TestCase):
+    def setUp(self) -> None:
+        self._message_repository = Mock(spec=MessageRepository)
+        self._state_manager = Mock(spec=ConnectorStateManager)
+
+    def _cursor_with_slice_boundary_fields(self) -> ConcurrentCursor:
+        return ConcurrentCursor(
+            _A_STREAM_NAME,
+            _A_STREAM_NAMESPACE,
+            _ANY_STATE,
+            self._message_repository,
+            self._state_manager,
+            CursorField(_A_CURSOR_FIELD_KEY),
+            _SLICE_BOUNDARY_FIELDS,
+        )
+
+    def _cursor_without_slice_boundary_fields(self) -> ConcurrentCursor:
+        return ConcurrentCursor(
+            _A_STREAM_NAME,
+            _A_STREAM_NAMESPACE,
+            _ANY_STATE,
+            self._message_repository,
+            self._state_manager,
+            CursorField(_A_CURSOR_FIELD_KEY),
+            None,
+        )
+
+    def test_given_boundary_fields_when_close_partition_then_emit_state(self) -> None:
+        self._cursor_with_slice_boundary_fields().close_partition(
+            _partition(
+                {_LOWER_SLICE_BOUNDARY_FIELD: 12, _UPPER_SLICE_BOUNDARY_FIELD: 30},
+            )
+        )
+
+        self._message_repository.emit_message.assert_called_once_with(self._state_manager.create_state_message.return_value)
+        self._state_manager.update_state_for_stream.assert_called_once_with(
+            _A_STREAM_NAME,
+            _A_STREAM_NAMESPACE,
+            {
+                "slices": [
+                    {
+                        "start": 12,
+                        "end": 30,
+                    },
+                ]
+            },
+        )
+
+    def test_given_boundary_fields_and_record_observed_when_close_partition_then_ignore_records(self) -> None:
+        cursor = self._cursor_with_slice_boundary_fields()
+        cursor.observe(_record(_A_VERY_HIGH_CURSOR_VALUE))
+
+        cursor.close_partition(_partition({_LOWER_SLICE_BOUNDARY_FIELD: 12, _UPPER_SLICE_BOUNDARY_FIELD: 30}))
+
+        assert self._state_manager.update_state_for_stream.call_args_list[0].args[2]["slices"][0]["end"] != _A_VERY_HIGH_CURSOR_VALUE
+
+    def test_given_no_boundary_fields_when_close_partition_then_emit_state(self) -> None:
+        cursor = self._cursor_without_slice_boundary_fields()
+        cursor.observe(_record(10))
+        cursor.close_partition(_partition(_NO_SLICE))
+
+        self._state_manager.update_state_for_stream.assert_called_once_with(
+            _A_STREAM_NAME,
+            _A_STREAM_NAMESPACE,
+            {
+                "slices": [
+                    {
+                        "start": 0,
+                        "end": 10,
+                    },
+                ]
+            },
+        )
+
+    def test_given_no_boundary_fields_when_close_multiple_partitions_then_raise_exception(self) -> None:
+        cursor = self._cursor_without_slice_boundary_fields()
+        cursor.observe(_record(10))
+        cursor.close_partition(_partition(_NO_SLICE))
+
+        with pytest.raises(ValueError):
+            cursor.close_partition(_partition(_NO_SLICE))
+
+    def test_given_no_records_observed_when_close_partition_then_do_not_emit_state(self) -> None:
+        cursor = self._cursor_without_slice_boundary_fields()
+        cursor.close_partition(_partition(_NO_SLICE))
+        assert self._message_repository.emit_message.call_count == 0
+
+    def test_given_slice_boundaries_and_no_slice_when_close_partition_then_raise_error(self) -> None:
+        cursor = self._cursor_with_slice_boundary_fields()
+        with pytest.raises(KeyError):
+            cursor.close_partition(_partition(_NO_SLICE))
+
+    def test_given_slice_boundaries_not_matching_slice_when_close_partition_then_raise_error(self) -> None:
+        cursor = self._cursor_with_slice_boundary_fields()
+        with pytest.raises(KeyError):
+            cursor.close_partition(_partition({"not_matching_key": "value"}))
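Taken together, these new tests pin down the contract of the ConcurrentCursor added in airbyte_cdk/sources/streams/concurrent/cursor.py: records are fed to observe(), and close_partition() persists either the slice boundaries or the highest observed value through the state manager and emits a state message. A minimal sketch of that flow, mirroring the fixtures above with mocks; the stream and field names are illustrative only, not production code:

from unittest.mock import Mock

from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
from airbyte_cdk.sources.message import MessageRepository
from airbyte_cdk.sources.streams.concurrent.cursor import ConcurrentCursor, CursorField
from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
from airbyte_cdk.sources.streams.concurrent.partitions.record import Record

message_repository = Mock(spec=MessageRepository)
state_manager = Mock(spec=ConnectorStateManager)
cursor = ConcurrentCursor(
    "users",                               # stream name (illustrative)
    None,                                  # stream namespace
    None,                                  # no prior state
    message_repository,
    state_manager,
    CursorField("updated_at"),             # cursor field key (illustrative)
    ("lower_boundary", "upper_boundary"),  # slice boundary fields
)

partition = Mock(spec=Partition)
partition.to_slice.return_value = {"lower_boundary": 0, "upper_boundary": 100}

cursor.observe(Record(data={"updated_at": 42}))  # track cursor values as records are read
cursor.close_partition(partition)                # persist state for the closed slice and emit it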
unit_tests/sources/streams/concurrent/test_thread_based_concurrent_stream.py
@@ -5,9 +5,9 @@
 import unittest
 from unittest.mock import Mock, call
 
-import pytest
 from airbyte_cdk.models import AirbyteStream, SyncMode
 from airbyte_cdk.sources.streams.concurrent.availability_strategy import STREAM_AVAILABLE
+from airbyte_cdk.sources.streams.concurrent.cursor import Cursor
 from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
 from airbyte_cdk.sources.streams.concurrent.partitions.record import Record
 from airbyte_cdk.sources.streams.concurrent.thread_based_concurrent_stream import ThreadBasedConcurrentStream
@@ -25,6 +25,7 @@ class ThreadBasedConcurrentStreamTest(unittest.TestCase):
         self._slice_logger = Mock()
         self._logger = Mock()
         self._message_repository = Mock()
+        self._cursor = Mock(spec=Cursor)
         self._stream = ThreadBasedConcurrentStream(
             self._partition_generator,
             self._max_workers,
@@ -39,6 +40,7 @@ class ThreadBasedConcurrentStreamTest(unittest.TestCase):
             1,
             2,
             0,
+            cursor=self._cursor,
         )
 
     def test_get_json_schema(self):
@@ -76,17 +78,20 @@ class ThreadBasedConcurrentStreamTest(unittest.TestCase):
         with self.assertRaises(Exception):
             self._stream._check_for_errors(futures)
 
-    def
+    def test_read_observe_records_and_close_partition(self):
         partition = Mock(spec=Partition)
-
+        expected_records = [Record({"id": 1}), Record({"id": "2"})]
+        partition.read.return_value = expected_records
+        partition.to_slice.return_value = {"slice": "slice"}
+        self._slice_logger.should_log_slice_message.return_value = False
+
         self._partition_generator.generate.return_value = [partition]
-
-
+        actual_records = list(self._stream.read())
+
+        assert expected_records == actual_records
 
-
-        self.
-        with pytest.raises(RuntimeError):
-            list(self._stream.read())
+        self._cursor.observe.has_calls([call(record) for record in expected_records])
+        self._cursor.close_partition.assert_called_once_with(partition)
 
     def test_read_no_slice_message(self):
         partition = Mock(spec=Partition)
@@ -218,7 +223,6 @@ class ThreadBasedConcurrentStreamTest(unittest.TestCase):
         assert expected_airbyte_stream == airbyte_stream
 
     def test_as_airbyte_stream_with_a_cursor(self):
-
        json_schema = {
            "type": "object",
            "properties": {
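The updated test asserts how ThreadBasedConcurrentStream hands records to its new cursor: every record read from a partition is observed, and the partition is closed once it is drained. The sketch below is a simplified, sequential illustration of that hand-off (read_partitions is a hypothetical helper; the real stream dispatches partitions to worker threads and futures):

from typing import Iterable

from airbyte_cdk.sources.streams.concurrent.cursor import Cursor
from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
from airbyte_cdk.sources.streams.concurrent.partitions.record import Record


def read_partitions(partitions: Iterable[Partition], cursor: Cursor) -> Iterable[Record]:
    # Sequential sketch of the cursor interactions asserted above.
    for partition in partitions:
        for record in partition.read():
            cursor.observe(record)   # let the cursor track the latest cursor value
            yield record
        cursor.close_partition(partition)  # checkpoint once the partition is done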
unit_tests/sources/streams/test_stream_read.py
@@ -12,6 +12,7 @@ from airbyte_cdk.models import Type as MessageType
 from airbyte_cdk.sources.message import InMemoryMessageRepository
 from airbyte_cdk.sources.streams import Stream
 from airbyte_cdk.sources.streams.concurrent.adapters import StreamFacade
+from airbyte_cdk.sources.streams.concurrent.cursor import NoopCursor
 from airbyte_cdk.sources.streams.core import StreamData
 from airbyte_cdk.sources.utils.schema_helpers import InternalConfig
 from airbyte_cdk.sources.utils.slice_logger import DebugSliceLogger
@@ -19,6 +20,7 @@ from airbyte_cdk.sources.utils.slice_logger import DebugSliceLogger
 _A_CURSOR_FIELD = ["NESTED", "CURSOR"]
 _DEFAULT_INTERNAL_CONFIG = InternalConfig()
 _STREAM_NAME = "STREAM"
+_NO_STATE = None
 
 
 class _MockStream(Stream):
@@ -57,7 +59,7 @@ def _concurrent_stream(slice_to_partition_mapping, slice_logger, logger, message
     source = Mock()
     source._slice_logger = slice_logger
     source.message_repository = message_repository
-    stream = StreamFacade.create_from_stream(stream, source, logger, 1)
+    stream = StreamFacade.create_from_stream(stream, source, logger, 1, _NO_STATE, NoopCursor())
     stream.logger.setLevel(logger.level)
     return stream
 
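For code that adapts synchronous streams through the facade, this is the visible signature change in the release: StreamFacade.create_from_stream now also takes the stream's state and a Cursor, with NoopCursor used in these tests when no concurrent checkpointing is wanted. A small, hedged sketch of the cursor side (NoopCursor is assumed to simply ignore both calls, which is what these tests rely on):

from unittest.mock import Mock

from airbyte_cdk.sources.streams.concurrent.cursor import NoopCursor
from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
from airbyte_cdk.sources.streams.concurrent.partitions.record import Record

# New call shape used by the test helper above (stream/source/logger elided):
#     StreamFacade.create_from_stream(stream, source, logger, 1, _NO_STATE, NoopCursor())
cursor = NoopCursor()
cursor.observe(Record(data={"id": 1}))        # assumed no-op: nothing is tracked
cursor.close_partition(Mock(spec=Partition))  # assumed no-op: no state is emitted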
unit_tests/sources/test_abstract_source.py
@@ -996,10 +996,11 @@ class TestIncrementalRead:
                 _as_state({"s1": state}, "s1", state) if per_stream_enabled else _as_state({"s1": state}),
                 # stream 1 slice 2
                 _as_record("s1", stream_output[0]),
-                _as_record("s1", stream_output[1]),
                 _as_state({"s1": state}, "s1", state) if per_stream_enabled else _as_state({"s1": state}),
+                _as_record("s1", stream_output[1]),
                 _as_record("s1", stream_output[2]),
                 _as_state({"s1": state}, "s1", state) if per_stream_enabled else _as_state({"s1": state}),
+                _as_state({"s1": state}, "s1", state) if per_stream_enabled else _as_state({"s1": state}),
                 _as_stream_status("s1", AirbyteStreamStatus.COMPLETE),
                 # stream 2 slice 1
                 _as_stream_status("s2", AirbyteStreamStatus.STARTED),
@@ -1011,17 +1012,18 @@ class TestIncrementalRead:
                 _as_state({"s1": state, "s2": state}, "s2", state) if per_stream_enabled else _as_state({"s1": state, "s2": state}),
                 # stream 2 slice 2
                 _as_record("s2", stream_output[0]),
-                _as_record("s2", stream_output[1]),
                 _as_state({"s1": state, "s2": state}, "s2", state) if per_stream_enabled else _as_state({"s1": state, "s2": state}),
+                _as_record("s2", stream_output[1]),
                 _as_record("s2", stream_output[2]),
                 _as_state({"s1": state, "s2": state}, "s2", state) if per_stream_enabled else _as_state({"s1": state, "s2": state}),
+                _as_state({"s1": state, "s2": state}, "s2", state) if per_stream_enabled else _as_state({"s1": state, "s2": state}),
                 _as_stream_status("s2", AirbyteStreamStatus.COMPLETE),
             ]
         )
 
         messages = _fix_emitted_at(list(src.read(logger, {}, catalog, state=input_state)))
 
-        assert
+        assert messages == expected
 
    @pytest.mark.parametrize(
        "per_stream_enabled",
@@ -1108,11 +1110,12 @@ class TestIncrementalRead:
                 _as_state({"s1": state}, "s1", state) if per_stream_enabled else _as_state({"s1": state}),
                 # stream 1 slice 2
                 stream_data_to_airbyte_message("s1", stream_output[0]),
+                _as_state({"s1": state}, "s1", state) if per_stream_enabled else _as_state({"s1": state}),
                 stream_data_to_airbyte_message("s1", stream_output[1]),
                 stream_data_to_airbyte_message("s1", stream_output[2]),
-                _as_state({"s1": state}, "s1", state) if per_stream_enabled else _as_state({"s1": state}),
                 stream_data_to_airbyte_message("s1", stream_output[3]),
                 _as_state({"s1": state}, "s1", state) if per_stream_enabled else _as_state({"s1": state}),
+                _as_state({"s1": state}, "s1", state) if per_stream_enabled else _as_state({"s1": state}),
                 _as_stream_status("s1", AirbyteStreamStatus.COMPLETE),
                 # stream 2 slice 1
                 _as_stream_status("s2", AirbyteStreamStatus.STARTED),
@@ -1125,33 +1128,33 @@ class TestIncrementalRead:
                 _as_state({"s1": state, "s2": state}, "s2", state) if per_stream_enabled else _as_state({"s1": state, "s2": state}),
                 # stream 2 slice 2
                 stream_data_to_airbyte_message("s2", stream_output[0]),
+                _as_state({"s1": state, "s2": state}, "s2", state) if per_stream_enabled else _as_state({"s1": state, "s2": state}),
                 stream_data_to_airbyte_message("s2", stream_output[1]),
                 stream_data_to_airbyte_message("s2", stream_output[2]),
-                _as_state({"s1": state, "s2": state}, "s2", state) if per_stream_enabled else _as_state({"s1": state, "s2": state}),
                 stream_data_to_airbyte_message("s2", stream_output[3]),
                 _as_state({"s1": state, "s2": state}, "s2", state) if per_stream_enabled else _as_state({"s1": state, "s2": state}),
+                _as_state({"s1": state, "s2": state}, "s2", state) if per_stream_enabled else _as_state({"s1": state, "s2": state}),
                 _as_stream_status("s2", AirbyteStreamStatus.COMPLETE),
             ]
         )
 
         messages = _fix_emitted_at(list(src.read(logger, {}, catalog, state=input_state)))
 
-        assert
+        assert messages == expected
 
 
 def test_checkpoint_state_from_stream_instance():
     teams_stream = MockStreamOverridesStateMethod()
     managers_stream = StreamNoStateMethod()
-    src = MockSource(streams=[teams_stream, managers_stream])
     state_manager = ConnectorStateManager({"teams": teams_stream, "managers": managers_stream}, [])
 
     # The stream_state passed to checkpoint_state() should be ignored since stream implements state function
     teams_stream.state = {"updated_at": "2022-09-11"}
-    actual_message =
+    actual_message = teams_stream._checkpoint_state({"ignored": "state"}, state_manager, True)
     assert actual_message == _as_state({"teams": {"updated_at": "2022-09-11"}}, "teams", {"updated_at": "2022-09-11"})
 
     # The stream_state passed to checkpoint_state() should be used since the stream does not implement state function
-    actual_message =
+    actual_message = managers_stream._checkpoint_state({"updated": "expected_here"}, state_manager, True)
     assert actual_message == _as_state(
         {"teams": {"updated_at": "2022-09-11"}, "managers": {"updated": "expected_here"}}, "managers", {"updated": "expected_here"}
     )
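These expectation changes line up with the abstract_source.py (-61) and core.py (+71) entries at the top of the diff: checkpointing now lives on the stream itself via Stream._checkpoint_state, which shifts where state messages appear relative to records. A hedged illustration of the call the tests exercise; the wrapper below is purely illustrative, and the third positional argument is forwarded as True exactly as the tests do, since this diff does not show its parameter name:

from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
from airbyte_cdk.sources.streams import Stream


def checkpoint(stream: Stream, stream_state: dict, state_manager: ConnectorStateManager):
    # Returns an AirbyteMessage of type STATE, as asserted by
    # test_checkpoint_state_from_stream_instance above.
    return stream._checkpoint_state(stream_state, state_manager, True)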
File without changes
|
File without changes
|