airbyte-cdk 0.52.6__py3-none-any.whl → 0.52.8__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (32)
  1. airbyte_cdk/destinations/vector_db_based/config.py +1 -0
  2. airbyte_cdk/sources/abstract_source.py +12 -61
  3. airbyte_cdk/sources/file_based/config/unstructured_format.py +1 -1
  4. airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +1 -2
  5. airbyte_cdk/sources/message/repository.py +0 -6
  6. airbyte_cdk/sources/source.py +14 -13
  7. airbyte_cdk/sources/streams/concurrent/adapters.py +94 -21
  8. airbyte_cdk/sources/streams/concurrent/cursor.py +148 -0
  9. airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +2 -3
  10. airbyte_cdk/sources/streams/concurrent/partitions/partition.py +3 -0
  11. airbyte_cdk/sources/streams/concurrent/partitions/partition_generator.py +1 -3
  12. airbyte_cdk/sources/streams/concurrent/thread_based_concurrent_stream.py +7 -3
  13. airbyte_cdk/sources/streams/core.py +71 -1
  14. {airbyte_cdk-0.52.6.dist-info → airbyte_cdk-0.52.8.dist-info}/METADATA +3 -3
  15. {airbyte_cdk-0.52.6.dist-info → airbyte_cdk-0.52.8.dist-info}/RECORD +32 -30
  16. {airbyte_cdk-0.52.6.dist-info → airbyte_cdk-0.52.8.dist-info}/WHEEL +1 -1
  17. unit_tests/sources/file_based/file_types/test_unstructured_parser.py +5 -0
  18. unit_tests/sources/file_based/scenarios/csv_scenarios.py +1 -1
  19. unit_tests/sources/file_based/scenarios/unstructured_scenarios.py +16 -0
  20. unit_tests/sources/message/test_repository.py +7 -20
  21. unit_tests/sources/streams/concurrent/scenarios/stream_facade_builder.py +46 -5
  22. unit_tests/sources/streams/concurrent/scenarios/stream_facade_scenarios.py +154 -37
  23. unit_tests/sources/streams/concurrent/scenarios/test_concurrent_scenarios.py +6 -0
  24. unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_source_builder.py +19 -3
  25. unit_tests/sources/streams/concurrent/test_adapters.py +48 -22
  26. unit_tests/sources/streams/concurrent/test_concurrent_partition_generator.py +5 -4
  27. unit_tests/sources/streams/concurrent/test_cursor.py +130 -0
  28. unit_tests/sources/streams/concurrent/test_thread_based_concurrent_stream.py +14 -10
  29. unit_tests/sources/streams/test_stream_read.py +3 -1
  30. unit_tests/sources/test_abstract_source.py +12 -9
  31. {airbyte_cdk-0.52.6.dist-info → airbyte_cdk-0.52.8.dist-info}/LICENSE.txt +0 -0
  32. {airbyte_cdk-0.52.6.dist-info → airbyte_cdk-0.52.8.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,130 @@
1
+ from typing import Any, Mapping, Optional
2
+ from unittest import TestCase
3
+ from unittest.mock import Mock
4
+
5
+ import pytest
6
+ from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
7
+ from airbyte_cdk.sources.message import MessageRepository
8
+ from airbyte_cdk.sources.streams.concurrent.cursor import Comparable, ConcurrentCursor, CursorField
9
+ from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
10
+ from airbyte_cdk.sources.streams.concurrent.partitions.record import Record
11
+
12
+ _A_STREAM_NAME = "a stream name"
13
+ _A_STREAM_NAMESPACE = "a stream namespace"
14
+ _ANY_STATE = None
15
+ _A_CURSOR_FIELD_KEY = "a_cursor_field_key"
16
+ _NO_PARTITION_IDENTIFIER = None
17
+ _NO_SLICE = None
18
+ _NO_SLICE_BOUNDARIES = None
19
+ _LOWER_SLICE_BOUNDARY_FIELD = "lower_boundary"
20
+ _UPPER_SLICE_BOUNDARY_FIELD = "upper_boundary"
21
+ _SLICE_BOUNDARY_FIELDS = (_LOWER_SLICE_BOUNDARY_FIELD, _UPPER_SLICE_BOUNDARY_FIELD)
22
+ _A_VERY_HIGH_CURSOR_VALUE = 1000000000
23
+
24
+
25
+ def _partition(_slice: Optional[Mapping[str, Any]]) -> Partition:
26
+ partition = Mock(spec=Partition)
27
+ partition.to_slice.return_value = _slice
28
+ return partition
29
+
30
+
31
+ def _record(cursor_value: Comparable) -> Record:
32
+ return Record(data={_A_CURSOR_FIELD_KEY: cursor_value})
33
+
34
+
35
+ class ConcurrentCursorTest(TestCase):
36
+ def setUp(self) -> None:
37
+ self._message_repository = Mock(spec=MessageRepository)
38
+ self._state_manager = Mock(spec=ConnectorStateManager)
39
+
40
+ def _cursor_with_slice_boundary_fields(self) -> ConcurrentCursor:
41
+ return ConcurrentCursor(
42
+ _A_STREAM_NAME,
43
+ _A_STREAM_NAMESPACE,
44
+ _ANY_STATE,
45
+ self._message_repository,
46
+ self._state_manager,
47
+ CursorField(_A_CURSOR_FIELD_KEY),
48
+ _SLICE_BOUNDARY_FIELDS,
49
+ )
50
+
51
+ def _cursor_without_slice_boundary_fields(self) -> ConcurrentCursor:
52
+ return ConcurrentCursor(
53
+ _A_STREAM_NAME,
54
+ _A_STREAM_NAMESPACE,
55
+ _ANY_STATE,
56
+ self._message_repository,
57
+ self._state_manager,
58
+ CursorField(_A_CURSOR_FIELD_KEY),
59
+ None,
60
+ )
61
+
62
+ def test_given_boundary_fields_when_close_partition_then_emit_state(self) -> None:
63
+ self._cursor_with_slice_boundary_fields().close_partition(
64
+ _partition(
65
+ {_LOWER_SLICE_BOUNDARY_FIELD: 12, _UPPER_SLICE_BOUNDARY_FIELD: 30},
66
+ )
67
+ )
68
+
69
+ self._message_repository.emit_message.assert_called_once_with(self._state_manager.create_state_message.return_value)
70
+ self._state_manager.update_state_for_stream.assert_called_once_with(
71
+ _A_STREAM_NAME,
72
+ _A_STREAM_NAMESPACE,
73
+ {
74
+ "slices": [
75
+ {
76
+ "start": 12,
77
+ "end": 30,
78
+ },
79
+ ]
80
+ },
81
+ )
82
+
83
+ def test_given_boundary_fields_and_record_observed_when_close_partition_then_ignore_records(self) -> None:
84
+ cursor = self._cursor_with_slice_boundary_fields()
85
+ cursor.observe(_record(_A_VERY_HIGH_CURSOR_VALUE))
86
+
87
+ cursor.close_partition(_partition({_LOWER_SLICE_BOUNDARY_FIELD: 12, _UPPER_SLICE_BOUNDARY_FIELD: 30}))
88
+
89
+ assert self._state_manager.update_state_for_stream.call_args_list[0].args[2]["slices"][0]["end"] != _A_VERY_HIGH_CURSOR_VALUE
90
+
91
+ def test_given_no_boundary_fields_when_close_partition_then_emit_state(self) -> None:
92
+ cursor = self._cursor_without_slice_boundary_fields()
93
+ cursor.observe(_record(10))
94
+ cursor.close_partition(_partition(_NO_SLICE))
95
+
96
+ self._state_manager.update_state_for_stream.assert_called_once_with(
97
+ _A_STREAM_NAME,
98
+ _A_STREAM_NAMESPACE,
99
+ {
100
+ "slices": [
101
+ {
102
+ "start": 0,
103
+ "end": 10,
104
+ },
105
+ ]
106
+ },
107
+ )
108
+
109
+ def test_given_no_boundary_fields_when_close_multiple_partitions_then_raise_exception(self) -> None:
110
+ cursor = self._cursor_without_slice_boundary_fields()
111
+ cursor.observe(_record(10))
112
+ cursor.close_partition(_partition(_NO_SLICE))
113
+
114
+ with pytest.raises(ValueError):
115
+ cursor.close_partition(_partition(_NO_SLICE))
116
+
117
+ def test_given_no_records_observed_when_close_partition_then_do_not_emit_state(self) -> None:
118
+ cursor = self._cursor_without_slice_boundary_fields()
119
+ cursor.close_partition(_partition(_NO_SLICE))
120
+ assert self._message_repository.emit_message.call_count == 0
121
+
122
+ def test_given_slice_boundaries_and_no_slice_when_close_partition_then_raise_error(self) -> None:
123
+ cursor = self._cursor_with_slice_boundary_fields()
124
+ with pytest.raises(KeyError):
125
+ cursor.close_partition(_partition(_NO_SLICE))
126
+
127
+ def test_given_slice_boundaries_not_matching_slice_when_close_partition_then_raise_error(self) -> None:
128
+ cursor = self._cursor_with_slice_boundary_fields()
129
+ with pytest.raises(KeyError):
130
+ cursor.close_partition(_partition({"not_matching_key": "value"}))
@@ -5,9 +5,9 @@
5
5
  import unittest
6
6
  from unittest.mock import Mock, call
7
7
 
8
- import pytest
9
8
  from airbyte_cdk.models import AirbyteStream, SyncMode
10
9
  from airbyte_cdk.sources.streams.concurrent.availability_strategy import STREAM_AVAILABLE
10
+ from airbyte_cdk.sources.streams.concurrent.cursor import Cursor
11
11
  from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
12
12
  from airbyte_cdk.sources.streams.concurrent.partitions.record import Record
13
13
  from airbyte_cdk.sources.streams.concurrent.thread_based_concurrent_stream import ThreadBasedConcurrentStream
@@ -25,6 +25,7 @@ class ThreadBasedConcurrentStreamTest(unittest.TestCase):
25
25
  self._slice_logger = Mock()
26
26
  self._logger = Mock()
27
27
  self._message_repository = Mock()
28
+ self._cursor = Mock(spec=Cursor)
28
29
  self._stream = ThreadBasedConcurrentStream(
29
30
  self._partition_generator,
30
31
  self._max_workers,
@@ -39,6 +40,7 @@ class ThreadBasedConcurrentStreamTest(unittest.TestCase):
39
40
  1,
40
41
  2,
41
42
  0,
43
+ cursor=self._cursor,
42
44
  )
43
45
 
44
46
  def test_get_json_schema(self):
@@ -76,17 +78,20 @@ class ThreadBasedConcurrentStreamTest(unittest.TestCase):
76
78
  with self.assertRaises(Exception):
77
79
  self._stream._check_for_errors(futures)
78
80
 
79
- def test_read_raises_an_exception_if_a_partition_raises_an_exception(self):
81
+ def test_read_observe_records_and_close_partition(self):
80
82
  partition = Mock(spec=Partition)
81
- partition.read.side_effect = RuntimeError("error")
83
+ expected_records = [Record({"id": 1}), Record({"id": "2"})]
84
+ partition.read.return_value = expected_records
85
+ partition.to_slice.return_value = {"slice": "slice"}
86
+ self._slice_logger.should_log_slice_message.return_value = False
87
+
82
88
  self._partition_generator.generate.return_value = [partition]
83
- with pytest.raises(RuntimeError):
84
- list(self._stream.read())
89
+ actual_records = list(self._stream.read())
90
+
91
+ assert expected_records == actual_records
85
92
 
86
- def test_read_raises_an_exception_if_partition_generator_raises_an_exception(self):
87
- self._partition_generator.generate.side_effect = RuntimeError("error")
88
- with pytest.raises(RuntimeError):
89
- list(self._stream.read())
93
+ self._cursor.observe.assert_has_calls([call(record) for record in expected_records])
94
+ self._cursor.close_partition.assert_called_once_with(partition)
90
95
 
91
96
  def test_read_no_slice_message(self):
92
97
  partition = Mock(spec=Partition)
@@ -218,7 +223,6 @@ class ThreadBasedConcurrentStreamTest(unittest.TestCase):
218
223
  assert expected_airbyte_stream == airbyte_stream
219
224
 
220
225
  def test_as_airbyte_stream_with_a_cursor(self):
221
-
222
226
  json_schema = {
223
227
  "type": "object",
224
228
  "properties": {
@@ -12,6 +12,7 @@ from airbyte_cdk.models import Type as MessageType
12
12
  from airbyte_cdk.sources.message import InMemoryMessageRepository
13
13
  from airbyte_cdk.sources.streams import Stream
14
14
  from airbyte_cdk.sources.streams.concurrent.adapters import StreamFacade
15
+ from airbyte_cdk.sources.streams.concurrent.cursor import NoopCursor
15
16
  from airbyte_cdk.sources.streams.core import StreamData
16
17
  from airbyte_cdk.sources.utils.schema_helpers import InternalConfig
17
18
  from airbyte_cdk.sources.utils.slice_logger import DebugSliceLogger
@@ -19,6 +20,7 @@ from airbyte_cdk.sources.utils.slice_logger import DebugSliceLogger
19
20
  _A_CURSOR_FIELD = ["NESTED", "CURSOR"]
20
21
  _DEFAULT_INTERNAL_CONFIG = InternalConfig()
21
22
  _STREAM_NAME = "STREAM"
23
+ _NO_STATE = None
22
24
 
23
25
 
24
26
  class _MockStream(Stream):
@@ -57,7 +59,7 @@ def _concurrent_stream(slice_to_partition_mapping, slice_logger, logger, message
57
59
  source = Mock()
58
60
  source._slice_logger = slice_logger
59
61
  source.message_repository = message_repository
60
- stream = StreamFacade.create_from_stream(stream, source, logger, 1)
62
+ stream = StreamFacade.create_from_stream(stream, source, logger, 1, _NO_STATE, NoopCursor())
61
63
  stream.logger.setLevel(logger.level)
62
64
  return stream
63
65
 
@@ -996,10 +996,11 @@ class TestIncrementalRead:
996
996
  _as_state({"s1": state}, "s1", state) if per_stream_enabled else _as_state({"s1": state}),
997
997
  # stream 1 slice 2
998
998
  _as_record("s1", stream_output[0]),
999
- _as_record("s1", stream_output[1]),
1000
999
  _as_state({"s1": state}, "s1", state) if per_stream_enabled else _as_state({"s1": state}),
1000
+ _as_record("s1", stream_output[1]),
1001
1001
  _as_record("s1", stream_output[2]),
1002
1002
  _as_state({"s1": state}, "s1", state) if per_stream_enabled else _as_state({"s1": state}),
1003
+ _as_state({"s1": state}, "s1", state) if per_stream_enabled else _as_state({"s1": state}),
1003
1004
  _as_stream_status("s1", AirbyteStreamStatus.COMPLETE),
1004
1005
  # stream 2 slice 1
1005
1006
  _as_stream_status("s2", AirbyteStreamStatus.STARTED),
@@ -1011,17 +1012,18 @@ class TestIncrementalRead:
1011
1012
  _as_state({"s1": state, "s2": state}, "s2", state) if per_stream_enabled else _as_state({"s1": state, "s2": state}),
1012
1013
  # stream 2 slice 2
1013
1014
  _as_record("s2", stream_output[0]),
1014
- _as_record("s2", stream_output[1]),
1015
1015
  _as_state({"s1": state, "s2": state}, "s2", state) if per_stream_enabled else _as_state({"s1": state, "s2": state}),
1016
+ _as_record("s2", stream_output[1]),
1016
1017
  _as_record("s2", stream_output[2]),
1017
1018
  _as_state({"s1": state, "s2": state}, "s2", state) if per_stream_enabled else _as_state({"s1": state, "s2": state}),
1019
+ _as_state({"s1": state, "s2": state}, "s2", state) if per_stream_enabled else _as_state({"s1": state, "s2": state}),
1018
1020
  _as_stream_status("s2", AirbyteStreamStatus.COMPLETE),
1019
1021
  ]
1020
1022
  )
1021
1023
 
1022
1024
  messages = _fix_emitted_at(list(src.read(logger, {}, catalog, state=input_state)))
1023
1025
 
1024
- assert expected == messages
1026
+ assert messages == expected
1025
1027
 
1026
1028
  @pytest.mark.parametrize(
1027
1029
  "per_stream_enabled",
@@ -1108,11 +1110,12 @@ class TestIncrementalRead:
1108
1110
  _as_state({"s1": state}, "s1", state) if per_stream_enabled else _as_state({"s1": state}),
1109
1111
  # stream 1 slice 2
1110
1112
  stream_data_to_airbyte_message("s1", stream_output[0]),
1113
+ _as_state({"s1": state}, "s1", state) if per_stream_enabled else _as_state({"s1": state}),
1111
1114
  stream_data_to_airbyte_message("s1", stream_output[1]),
1112
1115
  stream_data_to_airbyte_message("s1", stream_output[2]),
1113
- _as_state({"s1": state}, "s1", state) if per_stream_enabled else _as_state({"s1": state}),
1114
1116
  stream_data_to_airbyte_message("s1", stream_output[3]),
1115
1117
  _as_state({"s1": state}, "s1", state) if per_stream_enabled else _as_state({"s1": state}),
1118
+ _as_state({"s1": state}, "s1", state) if per_stream_enabled else _as_state({"s1": state}),
1116
1119
  _as_stream_status("s1", AirbyteStreamStatus.COMPLETE),
1117
1120
  # stream 2 slice 1
1118
1121
  _as_stream_status("s2", AirbyteStreamStatus.STARTED),
@@ -1125,33 +1128,33 @@ class TestIncrementalRead:
1125
1128
  _as_state({"s1": state, "s2": state}, "s2", state) if per_stream_enabled else _as_state({"s1": state, "s2": state}),
1126
1129
  # stream 2 slice 2
1127
1130
  stream_data_to_airbyte_message("s2", stream_output[0]),
1131
+ _as_state({"s1": state, "s2": state}, "s2", state) if per_stream_enabled else _as_state({"s1": state, "s2": state}),
1128
1132
  stream_data_to_airbyte_message("s2", stream_output[1]),
1129
1133
  stream_data_to_airbyte_message("s2", stream_output[2]),
1130
- _as_state({"s1": state, "s2": state}, "s2", state) if per_stream_enabled else _as_state({"s1": state, "s2": state}),
1131
1134
  stream_data_to_airbyte_message("s2", stream_output[3]),
1132
1135
  _as_state({"s1": state, "s2": state}, "s2", state) if per_stream_enabled else _as_state({"s1": state, "s2": state}),
1136
+ _as_state({"s1": state, "s2": state}, "s2", state) if per_stream_enabled else _as_state({"s1": state, "s2": state}),
1133
1137
  _as_stream_status("s2", AirbyteStreamStatus.COMPLETE),
1134
1138
  ]
1135
1139
  )
1136
1140
 
1137
1141
  messages = _fix_emitted_at(list(src.read(logger, {}, catalog, state=input_state)))
1138
1142
 
1139
- assert expected == messages
1143
+ assert messages == expected
1140
1144
 
1141
1145
 
1142
1146
  def test_checkpoint_state_from_stream_instance():
1143
1147
  teams_stream = MockStreamOverridesStateMethod()
1144
1148
  managers_stream = StreamNoStateMethod()
1145
- src = MockSource(streams=[teams_stream, managers_stream])
1146
1149
  state_manager = ConnectorStateManager({"teams": teams_stream, "managers": managers_stream}, [])
1147
1150
 
1148
1151
  # The stream_state passed to checkpoint_state() should be ignored since stream implements state function
1149
1152
  teams_stream.state = {"updated_at": "2022-09-11"}
1150
- actual_message = src._checkpoint_state(teams_stream, {"ignored": "state"}, state_manager)
1153
+ actual_message = teams_stream._checkpoint_state({"ignored": "state"}, state_manager, True)
1151
1154
  assert actual_message == _as_state({"teams": {"updated_at": "2022-09-11"}}, "teams", {"updated_at": "2022-09-11"})
1152
1155
 
1153
1156
  # The stream_state passed to checkpoint_state() should be used since the stream does not implement state function
1154
- actual_message = src._checkpoint_state(managers_stream, {"updated": "expected_here"}, state_manager)
1157
+ actual_message = managers_stream._checkpoint_state({"updated": "expected_here"}, state_manager, True)
1155
1158
  assert actual_message == _as_state(
1156
1159
  {"teams": {"updated_at": "2022-09-11"}, "managers": {"updated": "expected_here"}}, "managers", {"updated": "expected_here"}
1157
1160
  )