airbyte-cdk 0.52.6__py3-none-any.whl → 0.52.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. airbyte_cdk/destinations/vector_db_based/config.py +1 -0
  2. airbyte_cdk/sources/abstract_source.py +12 -61
  3. airbyte_cdk/sources/file_based/config/unstructured_format.py +1 -1
  4. airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +1 -2
  5. airbyte_cdk/sources/message/repository.py +0 -6
  6. airbyte_cdk/sources/source.py +14 -13
  7. airbyte_cdk/sources/streams/concurrent/adapters.py +94 -21
  8. airbyte_cdk/sources/streams/concurrent/cursor.py +148 -0
  9. airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +2 -3
  10. airbyte_cdk/sources/streams/concurrent/partitions/partition.py +3 -0
  11. airbyte_cdk/sources/streams/concurrent/partitions/partition_generator.py +1 -3
  12. airbyte_cdk/sources/streams/concurrent/thread_based_concurrent_stream.py +7 -3
  13. airbyte_cdk/sources/streams/core.py +71 -1
  14. {airbyte_cdk-0.52.6.dist-info → airbyte_cdk-0.52.8.dist-info}/METADATA +3 -3
  15. {airbyte_cdk-0.52.6.dist-info → airbyte_cdk-0.52.8.dist-info}/RECORD +32 -30
  16. {airbyte_cdk-0.52.6.dist-info → airbyte_cdk-0.52.8.dist-info}/WHEEL +1 -1
  17. unit_tests/sources/file_based/file_types/test_unstructured_parser.py +5 -0
  18. unit_tests/sources/file_based/scenarios/csv_scenarios.py +1 -1
  19. unit_tests/sources/file_based/scenarios/unstructured_scenarios.py +16 -0
  20. unit_tests/sources/message/test_repository.py +7 -20
  21. unit_tests/sources/streams/concurrent/scenarios/stream_facade_builder.py +46 -5
  22. unit_tests/sources/streams/concurrent/scenarios/stream_facade_scenarios.py +154 -37
  23. unit_tests/sources/streams/concurrent/scenarios/test_concurrent_scenarios.py +6 -0
  24. unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_source_builder.py +19 -3
  25. unit_tests/sources/streams/concurrent/test_adapters.py +48 -22
  26. unit_tests/sources/streams/concurrent/test_concurrent_partition_generator.py +5 -4
  27. unit_tests/sources/streams/concurrent/test_cursor.py +130 -0
  28. unit_tests/sources/streams/concurrent/test_thread_based_concurrent_stream.py +14 -10
  29. unit_tests/sources/streams/test_stream_read.py +3 -1
  30. unit_tests/sources/test_abstract_source.py +12 -9
  31. {airbyte_cdk-0.52.6.dist-info → airbyte_cdk-0.52.8.dist-info}/LICENSE.txt +0 -0
  32. {airbyte_cdk-0.52.6.dist-info → airbyte_cdk-0.52.8.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,130 @@
1
+ from typing import Any, Mapping, Optional
2
+ from unittest import TestCase
3
+ from unittest.mock import Mock
4
+
5
+ import pytest
6
+ from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
7
+ from airbyte_cdk.sources.message import MessageRepository
8
+ from airbyte_cdk.sources.streams.concurrent.cursor import Comparable, ConcurrentCursor, CursorField
9
+ from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
10
+ from airbyte_cdk.sources.streams.concurrent.partitions.record import Record
11
+
12
+ _A_STREAM_NAME = "a stream name"
13
+ _A_STREAM_NAMESPACE = "a stream namespace"
14
+ _ANY_STATE = None
15
+ _A_CURSOR_FIELD_KEY = "a_cursor_field_key"
16
+ _NO_PARTITION_IDENTIFIER = None
17
+ _NO_SLICE = None
18
+ _NO_SLICE_BOUNDARIES = None
19
+ _LOWER_SLICE_BOUNDARY_FIELD = "lower_boundary"
20
+ _UPPER_SLICE_BOUNDARY_FIELD = "upper_boundary"
21
+ _SLICE_BOUNDARY_FIELDS = (_LOWER_SLICE_BOUNDARY_FIELD, _UPPER_SLICE_BOUNDARY_FIELD)
22
+ _A_VERY_HIGH_CURSOR_VALUE = 1000000000
23
+
24
+
25
def _partition(_slice: Optional[Mapping[str, Any]]) -> Partition:
    """Return a mocked ``Partition`` whose ``to_slice()`` yields ``_slice``."""
    mocked_partition = Mock(spec=Partition)
    mocked_partition.to_slice.return_value = _slice
    return mocked_partition
29
+
30
+
31
def _record(cursor_value: Comparable) -> Record:
    """Return a ``Record`` whose only field is the cursor field set to ``cursor_value``."""
    payload = {_A_CURSOR_FIELD_KEY: cursor_value}
    return Record(data=payload)
33
+
34
+
35
class ConcurrentCursorTest(TestCase):
    """Tests for the state ``ConcurrentCursor`` reports when a partition is closed."""

    def setUp(self) -> None:
        # Fresh mocks for every test so recorded calls never leak between tests.
        self._message_repository = Mock(spec=MessageRepository)
        self._state_manager = Mock(spec=ConnectorStateManager)

    def _cursor_with_slice_boundary_fields(self) -> ConcurrentCursor:
        """Build a cursor configured with the lower/upper slice boundary field names."""
        cursor_field = CursorField(_A_CURSOR_FIELD_KEY)
        return ConcurrentCursor(
            _A_STREAM_NAME,
            _A_STREAM_NAMESPACE,
            _ANY_STATE,
            self._message_repository,
            self._state_manager,
            cursor_field,
            _SLICE_BOUNDARY_FIELDS,
        )

    def _cursor_without_slice_boundary_fields(self) -> ConcurrentCursor:
        """Build a cursor that receives ``None`` instead of slice boundary field names."""
        cursor_field = CursorField(_A_CURSOR_FIELD_KEY)
        return ConcurrentCursor(
            _A_STREAM_NAME,
            _A_STREAM_NAMESPACE,
            _ANY_STATE,
            self._message_repository,
            self._state_manager,
            cursor_field,
            None,
        )

    def test_given_boundary_fields_when_close_partition_then_emit_state(self) -> None:
        """Closing a partition with boundaries emits a state built from those boundaries."""
        closed_slice = {_LOWER_SLICE_BOUNDARY_FIELD: 12, _UPPER_SLICE_BOUNDARY_FIELD: 30}
        cursor = self._cursor_with_slice_boundary_fields()

        cursor.close_partition(_partition(closed_slice))

        expected_state = {"slices": [{"start": 12, "end": 30}]}
        self._message_repository.emit_message.assert_called_once_with(self._state_manager.create_state_message.return_value)
        self._state_manager.update_state_for_stream.assert_called_once_with(
            _A_STREAM_NAME,
            _A_STREAM_NAMESPACE,
            expected_state,
        )

    def test_given_boundary_fields_and_record_observed_when_close_partition_then_ignore_records(self) -> None:
        """With boundary fields, an observed record value must not become the slice end."""
        closed_slice = {_LOWER_SLICE_BOUNDARY_FIELD: 12, _UPPER_SLICE_BOUNDARY_FIELD: 30}
        cursor = self._cursor_with_slice_boundary_fields()
        cursor.observe(_record(_A_VERY_HIGH_CURSOR_VALUE))

        cursor.close_partition(_partition(closed_slice))

        # The third positional argument of the first update call is the stream state.
        first_update_call = self._state_manager.update_state_for_stream.call_args_list[0]
        reported_state = first_update_call.args[2]
        assert reported_state["slices"][0]["end"] != _A_VERY_HIGH_CURSOR_VALUE

    def test_given_no_boundary_fields_when_close_partition_then_emit_state(self) -> None:
        """Without boundary fields, the observed cursor value is reported as the slice end."""
        cursor = self._cursor_without_slice_boundary_fields()
        cursor.observe(_record(10))
        cursor.close_partition(_partition(_NO_SLICE))

        expected_state = {"slices": [{"start": 0, "end": 10}]}
        self._state_manager.update_state_for_stream.assert_called_once_with(
            _A_STREAM_NAME,
            _A_STREAM_NAMESPACE,
            expected_state,
        )

    def test_given_no_boundary_fields_when_close_multiple_partitions_then_raise_exception(self) -> None:
        """Without boundary fields, closing a second partition raises ``ValueError``."""
        cursor = self._cursor_without_slice_boundary_fields()
        cursor.observe(_record(10))
        cursor.close_partition(_partition(_NO_SLICE))

        second_partition = _partition(_NO_SLICE)
        with pytest.raises(ValueError):
            cursor.close_partition(second_partition)

    def test_given_no_records_observed_when_close_partition_then_do_not_emit_state(self) -> None:
        """Closing a partition with nothing observed must not emit any message."""
        cursor = self._cursor_without_slice_boundary_fields()
        cursor.close_partition(_partition(_NO_SLICE))
        self._message_repository.emit_message.assert_not_called()

    def test_given_slice_boundaries_and_no_slice_when_close_partition_then_raise_error(self) -> None:
        """Boundary fields configured but the partition has no slice: expect ``KeyError``."""
        cursor = self._cursor_with_slice_boundary_fields()
        partition_without_slice = _partition(_NO_SLICE)
        with pytest.raises(KeyError):
            cursor.close_partition(partition_without_slice)

    def test_given_slice_boundaries_not_matching_slice_when_close_partition_then_raise_error(self) -> None:
        """Boundary fields configured but absent from the slice: expect ``KeyError``."""
        cursor = self._cursor_with_slice_boundary_fields()
        mismatching_partition = _partition({"not_matching_key": "value"})
        with pytest.raises(KeyError):
            cursor.close_partition(mismatching_partition)
@@ -5,9 +5,9 @@
5
5
  import unittest
6
6
  from unittest.mock import Mock, call
7
7
 
8
- import pytest
9
8
  from airbyte_cdk.models import AirbyteStream, SyncMode
10
9
  from airbyte_cdk.sources.streams.concurrent.availability_strategy import STREAM_AVAILABLE
10
+ from airbyte_cdk.sources.streams.concurrent.cursor import Cursor
11
11
  from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
12
12
  from airbyte_cdk.sources.streams.concurrent.partitions.record import Record
13
13
  from airbyte_cdk.sources.streams.concurrent.thread_based_concurrent_stream import ThreadBasedConcurrentStream
@@ -25,6 +25,7 @@ class ThreadBasedConcurrentStreamTest(unittest.TestCase):
25
25
  self._slice_logger = Mock()
26
26
  self._logger = Mock()
27
27
  self._message_repository = Mock()
28
+ self._cursor = Mock(spec=Cursor)
28
29
  self._stream = ThreadBasedConcurrentStream(
29
30
  self._partition_generator,
30
31
  self._max_workers,
@@ -39,6 +40,7 @@ class ThreadBasedConcurrentStreamTest(unittest.TestCase):
39
40
  1,
40
41
  2,
41
42
  0,
43
+ cursor=self._cursor,
42
44
  )
43
45
 
44
46
  def test_get_json_schema(self):
@@ -76,17 +78,20 @@ class ThreadBasedConcurrentStreamTest(unittest.TestCase):
76
78
  with self.assertRaises(Exception):
77
79
  self._stream._check_for_errors(futures)
78
80
 
79
- def test_read_raises_an_exception_if_a_partition_raises_an_exception(self):
81
def test_read_observe_records_and_close_partition(self):
    """Reading yields the partition's records, passes each to the cursor, and closes the partition."""
    partition = Mock(spec=Partition)
    expected_records = [Record({"id": 1}), Record({"id": "2"})]
    partition.read.return_value = expected_records
    partition.to_slice.return_value = {"slice": "slice"}
    self._slice_logger.should_log_slice_message.return_value = False

    self._partition_generator.generate.return_value = [partition]
    actual_records = list(self._stream.read())

    assert expected_records == actual_records

    # `has_calls` is not a Mock assertion: it is auto-created as a child mock and
    # calling it checks nothing. `assert_has_calls` actually verifies the calls.
    self._cursor.observe.assert_has_calls([call(record) for record in expected_records])
    self._cursor.close_partition.assert_called_once_with(partition)
90
95
 
91
96
  def test_read_no_slice_message(self):
92
97
  partition = Mock(spec=Partition)
@@ -218,7 +223,6 @@ class ThreadBasedConcurrentStreamTest(unittest.TestCase):
218
223
  assert expected_airbyte_stream == airbyte_stream
219
224
 
220
225
  def test_as_airbyte_stream_with_a_cursor(self):
221
-
222
226
  json_schema = {
223
227
  "type": "object",
224
228
  "properties": {
@@ -12,6 +12,7 @@ from airbyte_cdk.models import Type as MessageType
12
12
  from airbyte_cdk.sources.message import InMemoryMessageRepository
13
13
  from airbyte_cdk.sources.streams import Stream
14
14
  from airbyte_cdk.sources.streams.concurrent.adapters import StreamFacade
15
+ from airbyte_cdk.sources.streams.concurrent.cursor import NoopCursor
15
16
  from airbyte_cdk.sources.streams.core import StreamData
16
17
  from airbyte_cdk.sources.utils.schema_helpers import InternalConfig
17
18
  from airbyte_cdk.sources.utils.slice_logger import DebugSliceLogger
@@ -19,6 +20,7 @@ from airbyte_cdk.sources.utils.slice_logger import DebugSliceLogger
19
20
  _A_CURSOR_FIELD = ["NESTED", "CURSOR"]
20
21
  _DEFAULT_INTERNAL_CONFIG = InternalConfig()
21
22
  _STREAM_NAME = "STREAM"
23
+ _NO_STATE = None
22
24
 
23
25
 
24
26
  class _MockStream(Stream):
@@ -57,7 +59,7 @@ def _concurrent_stream(slice_to_partition_mapping, slice_logger, logger, message
57
59
  source = Mock()
58
60
  source._slice_logger = slice_logger
59
61
  source.message_repository = message_repository
60
- stream = StreamFacade.create_from_stream(stream, source, logger, 1)
62
+ stream = StreamFacade.create_from_stream(stream, source, logger, 1, _NO_STATE, NoopCursor())
61
63
  stream.logger.setLevel(logger.level)
62
64
  return stream
63
65
 
@@ -996,10 +996,11 @@ class TestIncrementalRead:
996
996
  _as_state({"s1": state}, "s1", state) if per_stream_enabled else _as_state({"s1": state}),
997
997
  # stream 1 slice 2
998
998
  _as_record("s1", stream_output[0]),
999
- _as_record("s1", stream_output[1]),
1000
999
  _as_state({"s1": state}, "s1", state) if per_stream_enabled else _as_state({"s1": state}),
1000
+ _as_record("s1", stream_output[1]),
1001
1001
  _as_record("s1", stream_output[2]),
1002
1002
  _as_state({"s1": state}, "s1", state) if per_stream_enabled else _as_state({"s1": state}),
1003
+ _as_state({"s1": state}, "s1", state) if per_stream_enabled else _as_state({"s1": state}),
1003
1004
  _as_stream_status("s1", AirbyteStreamStatus.COMPLETE),
1004
1005
  # stream 2 slice 1
1005
1006
  _as_stream_status("s2", AirbyteStreamStatus.STARTED),
@@ -1011,17 +1012,18 @@ class TestIncrementalRead:
1011
1012
  _as_state({"s1": state, "s2": state}, "s2", state) if per_stream_enabled else _as_state({"s1": state, "s2": state}),
1012
1013
  # stream 2 slice 2
1013
1014
  _as_record("s2", stream_output[0]),
1014
- _as_record("s2", stream_output[1]),
1015
1015
  _as_state({"s1": state, "s2": state}, "s2", state) if per_stream_enabled else _as_state({"s1": state, "s2": state}),
1016
+ _as_record("s2", stream_output[1]),
1016
1017
  _as_record("s2", stream_output[2]),
1017
1018
  _as_state({"s1": state, "s2": state}, "s2", state) if per_stream_enabled else _as_state({"s1": state, "s2": state}),
1019
+ _as_state({"s1": state, "s2": state}, "s2", state) if per_stream_enabled else _as_state({"s1": state, "s2": state}),
1018
1020
  _as_stream_status("s2", AirbyteStreamStatus.COMPLETE),
1019
1021
  ]
1020
1022
  )
1021
1023
 
1022
1024
  messages = _fix_emitted_at(list(src.read(logger, {}, catalog, state=input_state)))
1023
1025
 
1024
- assert expected == messages
1026
+ assert messages == expected
1025
1027
 
1026
1028
  @pytest.mark.parametrize(
1027
1029
  "per_stream_enabled",
@@ -1108,11 +1110,12 @@ class TestIncrementalRead:
1108
1110
  _as_state({"s1": state}, "s1", state) if per_stream_enabled else _as_state({"s1": state}),
1109
1111
  # stream 1 slice 2
1110
1112
  stream_data_to_airbyte_message("s1", stream_output[0]),
1113
+ _as_state({"s1": state}, "s1", state) if per_stream_enabled else _as_state({"s1": state}),
1111
1114
  stream_data_to_airbyte_message("s1", stream_output[1]),
1112
1115
  stream_data_to_airbyte_message("s1", stream_output[2]),
1113
- _as_state({"s1": state}, "s1", state) if per_stream_enabled else _as_state({"s1": state}),
1114
1116
  stream_data_to_airbyte_message("s1", stream_output[3]),
1115
1117
  _as_state({"s1": state}, "s1", state) if per_stream_enabled else _as_state({"s1": state}),
1118
+ _as_state({"s1": state}, "s1", state) if per_stream_enabled else _as_state({"s1": state}),
1116
1119
  _as_stream_status("s1", AirbyteStreamStatus.COMPLETE),
1117
1120
  # stream 2 slice 1
1118
1121
  _as_stream_status("s2", AirbyteStreamStatus.STARTED),
@@ -1125,33 +1128,33 @@ class TestIncrementalRead:
1125
1128
  _as_state({"s1": state, "s2": state}, "s2", state) if per_stream_enabled else _as_state({"s1": state, "s2": state}),
1126
1129
  # stream 2 slice 2
1127
1130
  stream_data_to_airbyte_message("s2", stream_output[0]),
1131
+ _as_state({"s1": state, "s2": state}, "s2", state) if per_stream_enabled else _as_state({"s1": state, "s2": state}),
1128
1132
  stream_data_to_airbyte_message("s2", stream_output[1]),
1129
1133
  stream_data_to_airbyte_message("s2", stream_output[2]),
1130
- _as_state({"s1": state, "s2": state}, "s2", state) if per_stream_enabled else _as_state({"s1": state, "s2": state}),
1131
1134
  stream_data_to_airbyte_message("s2", stream_output[3]),
1132
1135
  _as_state({"s1": state, "s2": state}, "s2", state) if per_stream_enabled else _as_state({"s1": state, "s2": state}),
1136
+ _as_state({"s1": state, "s2": state}, "s2", state) if per_stream_enabled else _as_state({"s1": state, "s2": state}),
1133
1137
  _as_stream_status("s2", AirbyteStreamStatus.COMPLETE),
1134
1138
  ]
1135
1139
  )
1136
1140
 
1137
1141
  messages = _fix_emitted_at(list(src.read(logger, {}, catalog, state=input_state)))
1138
1142
 
1139
- assert expected == messages
1143
+ assert messages == expected
1140
1144
 
1141
1145
 
1142
1146
  def test_checkpoint_state_from_stream_instance():
1143
1147
  teams_stream = MockStreamOverridesStateMethod()
1144
1148
  managers_stream = StreamNoStateMethod()
1145
- src = MockSource(streams=[teams_stream, managers_stream])
1146
1149
  state_manager = ConnectorStateManager({"teams": teams_stream, "managers": managers_stream}, [])
1147
1150
 
1148
1151
  # The stream_state passed to checkpoint_state() should be ignored since stream implements state function
1149
1152
  teams_stream.state = {"updated_at": "2022-09-11"}
1150
- actual_message = src._checkpoint_state(teams_stream, {"ignored": "state"}, state_manager)
1153
+ actual_message = teams_stream._checkpoint_state({"ignored": "state"}, state_manager, True)
1151
1154
  assert actual_message == _as_state({"teams": {"updated_at": "2022-09-11"}}, "teams", {"updated_at": "2022-09-11"})
1152
1155
 
1153
1156
  # The stream_state passed to checkpoint_state() should be used since the stream does not implement state function
1154
- actual_message = src._checkpoint_state(managers_stream, {"updated": "expected_here"}, state_manager)
1157
+ actual_message = managers_stream._checkpoint_state({"updated": "expected_here"}, state_manager, True)
1155
1158
  assert actual_message == _as_state(
1156
1159
  {"teams": {"updated_at": "2022-09-11"}, "managers": {"updated": "expected_here"}}, "managers", {"updated": "expected_here"}
1157
1160
  )