airbyte-cdk 0.54.0__py3-none-any.whl → 0.55.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. airbyte_cdk/sources/concurrent_source/__init__.py +3 -0
  2. airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +190 -0
  3. airbyte_cdk/sources/concurrent_source/concurrent_source.py +161 -0
  4. airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py +63 -0
  5. airbyte_cdk/sources/concurrent_source/partition_generation_completed_sentinel.py +17 -0
  6. airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +97 -0
  7. airbyte_cdk/sources/streams/concurrent/abstract_stream.py +4 -4
  8. airbyte_cdk/sources/streams/concurrent/adapters.py +34 -12
  9. airbyte_cdk/sources/streams/concurrent/default_stream.py +79 -0
  10. airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +7 -7
  11. airbyte_cdk/sources/streams/concurrent/partitions/partition.py +23 -0
  12. airbyte_cdk/sources/streams/concurrent/partitions/record.py +4 -3
  13. airbyte_cdk/sources/streams/concurrent/partitions/types.py +2 -3
  14. airbyte_cdk/sources/utils/slice_logger.py +5 -0
  15. {airbyte_cdk-0.54.0.dist-info → airbyte_cdk-0.55.0.dist-info}/METADATA +1 -1
  16. {airbyte_cdk-0.54.0.dist-info → airbyte_cdk-0.55.0.dist-info}/RECORD +35 -23
  17. unit_tests/sources/concurrent_source/__init__.py +3 -0
  18. unit_tests/sources/concurrent_source/test_concurrent_source_adapter.py +105 -0
  19. unit_tests/sources/streams/concurrent/scenarios/stream_facade_builder.py +14 -7
  20. unit_tests/sources/streams/concurrent/scenarios/stream_facade_scenarios.py +2 -3
  21. unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_scenarios.py +44 -55
  22. unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_source_builder.py +24 -15
  23. unit_tests/sources/streams/concurrent/test_adapters.py +52 -32
  24. unit_tests/sources/streams/concurrent/test_concurrent_partition_generator.py +6 -5
  25. unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py +604 -0
  26. unit_tests/sources/streams/concurrent/test_cursor.py +1 -1
  27. unit_tests/sources/streams/concurrent/{test_thread_based_concurrent_stream.py → test_default_stream.py} +7 -144
  28. unit_tests/sources/streams/concurrent/test_partition_reader.py +2 -2
  29. unit_tests/sources/streams/concurrent/test_thread_pool_manager.py +98 -0
  30. unit_tests/sources/streams/test_stream_read.py +1 -2
  31. unit_tests/sources/test_concurrent_source.py +105 -0
  32. unit_tests/sources/test_source_read.py +461 -0
  33. airbyte_cdk/sources/streams/concurrent/thread_based_concurrent_stream.py +0 -221
  34. {airbyte_cdk-0.54.0.dist-info → airbyte_cdk-0.55.0.dist-info}/LICENSE.txt +0 -0
  35. {airbyte_cdk-0.54.0.dist-info → airbyte_cdk-0.55.0.dist-info}/WHEEL +0 -0
  36. {airbyte_cdk-0.54.0.dist-info → airbyte_cdk-0.55.0.dist-info}/top_level.txt +0 -0
@@ -1,49 +1,33 @@
1
1
  #
2
2
  # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
3
  #
4
-
5
4
  import unittest
6
- from unittest.mock import Mock, call
5
+ from unittest.mock import Mock
7
6
 
8
- import pytest
9
7
  from airbyte_cdk.models import AirbyteStream, SyncMode
10
8
  from airbyte_cdk.sources.streams.concurrent.availability_strategy import STREAM_AVAILABLE
11
9
  from airbyte_cdk.sources.streams.concurrent.cursor import Cursor
12
- from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
13
- from airbyte_cdk.sources.streams.concurrent.partitions.record import Record
14
- from airbyte_cdk.sources.streams.concurrent.thread_based_concurrent_stream import ThreadBasedConcurrentStream
15
-
16
- _MAX_CONCURRENT_TASKS = 2
10
+ from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream
17
11
 
18
12
 
19
13
  class ThreadBasedConcurrentStreamTest(unittest.TestCase):
20
14
  def setUp(self):
21
15
  self._partition_generator = Mock()
22
- self._max_workers = 1
23
16
  self._name = "name"
24
17
  self._json_schema = {}
25
18
  self._availability_strategy = Mock()
26
19
  self._primary_key = []
27
20
  self._cursor_field = None
28
- self._slice_logger = Mock()
29
21
  self._logger = Mock()
30
- self._message_repository = Mock()
31
22
  self._cursor = Mock(spec=Cursor)
32
- self._stream = ThreadBasedConcurrentStream(
23
+ self._stream = DefaultStream(
33
24
  self._partition_generator,
34
- self._max_workers,
35
25
  self._name,
36
26
  self._json_schema,
37
27
  self._availability_strategy,
38
28
  self._primary_key,
39
29
  self._cursor_field,
40
- self._slice_logger,
41
30
  self._logger,
42
- self._message_repository,
43
- 1,
44
- _MAX_CONCURRENT_TASKS,
45
- 0,
46
- cursor=self._cursor,
47
31
  )
48
32
 
49
33
  def test_get_json_schema(self):
@@ -65,13 +49,6 @@ class ThreadBasedConcurrentStreamTest(unittest.TestCase):
65
49
  with self.assertRaises(Exception):
66
50
  self._stream._check_for_errors(futures)
67
51
 
68
- def test_check_for_error_raises_no_exception_if_all_futures_succeeded(self):
69
- futures = [Mock() for _ in range(3)]
70
- for f in futures:
71
- f.exception.return_value = None
72
-
73
- self._stream._check_for_errors(futures)
74
-
75
52
  def test_check_for_error_raises_an_exception_if_any_of_the_futures_raised_an_exception(self):
76
53
  futures = [Mock() for _ in range(3)]
77
54
  for f in futures:
@@ -81,96 +58,6 @@ class ThreadBasedConcurrentStreamTest(unittest.TestCase):
81
58
  with self.assertRaises(Exception):
82
59
  self._stream._check_for_errors(futures)
83
60
 
84
- def test_read_observe_records_and_close_partition(self):
85
- partition = Mock(spec=Partition)
86
- expected_records = [Record({"id": 1}), Record({"id": "2"})]
87
- partition.read.return_value = expected_records
88
- partition.to_slice.return_value = {"slice": "slice"}
89
- self._slice_logger.should_log_slice_message.return_value = False
90
-
91
- self._partition_generator.generate.return_value = [partition]
92
- actual_records = list(self._stream.read())
93
-
94
- assert expected_records == actual_records
95
-
96
- self._cursor.observe.has_calls([call(record) for record in expected_records])
97
- self._cursor.close_partition.assert_called_once_with(partition)
98
-
99
- def test_read_no_slice_message(self):
100
- partition = Mock(spec=Partition)
101
- expected_records = [Record({"id": 1}), Record({"id": "2"})]
102
- partition.read.return_value = expected_records
103
- partition.to_slice.return_value = {"slice": "slice"}
104
- self._slice_logger.should_log_slice_message.return_value = False
105
-
106
- self._partition_generator.generate.return_value = [partition]
107
- actual_records = list(self._stream.read())
108
-
109
- assert expected_records == actual_records
110
-
111
- self._message_repository.emit_message.assert_not_called()
112
-
113
- def test_read_log_slice_message(self):
114
- partition = Mock(spec=Partition)
115
- expected_records = [Record({"id": 1}), Record({"id": "2"})]
116
- partition.read.return_value = expected_records
117
- partition.to_slice.return_value = {"slice": "slice"}
118
- self._slice_logger.should_log_slice_message.return_value = True
119
- slice_log_message = Mock()
120
- self._slice_logger.create_slice_log_message.return_value = slice_log_message
121
-
122
- self._partition_generator.generate.return_value = [partition]
123
- list(self._stream.read())
124
-
125
- self._message_repository.emit_message.assert_called_once_with(slice_log_message)
126
-
127
- def test_wait_while_task_queue_is_full(self):
128
- f1 = Mock()
129
- f2 = Mock()
130
-
131
- # Verify that the done() method will be called until only one future is still running
132
- f1.done.side_effect = [False, False]
133
- f1.exception.return_value = None
134
- f2.done.side_effect = [False, True]
135
- f2.exception.return_value = None
136
- futures = [f1, f2]
137
- self._stream._wait_while_too_many_pending_futures(futures)
138
-
139
- f1.done.assert_has_calls([call(), call()])
140
- f2.done.assert_has_calls([call(), call()])
141
-
142
- def test_given_exception_then_fail_immediately(self):
143
- f1 = Mock()
144
- f2 = Mock()
145
-
146
- # Verify that the done() method will be called until only one future is still running
147
- f1.done.return_value = True
148
- f1.exception.return_value = None
149
- f2.done.return_value = True
150
- f2.exception.return_value = ValueError("ERROR")
151
- futures = [f1, f2]
152
-
153
- with pytest.raises(RuntimeError):
154
- self._stream._wait_while_too_many_pending_futures(futures)
155
-
156
- def test_given_removing_multiple_elements_when_pruning_then_fail_immediately(self):
157
- # Verify that the done() method will be called until only one future is still running
158
- futures = []
159
- for _ in range(_MAX_CONCURRENT_TASKS + 1):
160
- future = Mock()
161
- future.done.return_value = True
162
- future.exception.return_value = None
163
- futures.append(future)
164
-
165
- pending_future = Mock()
166
- pending_future.done.return_value = False
167
- pending_future.exception.return_value = None
168
- futures.append(pending_future)
169
-
170
- self._stream._wait_while_too_many_pending_futures(futures)
171
-
172
- assert futures == [pending_future]
173
-
174
61
  def test_as_airbyte_stream(self):
175
62
  expected_airbyte_stream = AirbyteStream(
176
63
  name=self._name,
@@ -193,20 +80,14 @@ class ThreadBasedConcurrentStreamTest(unittest.TestCase):
193
80
  "id_b": {"type": ["null", "string"]},
194
81
  },
195
82
  }
196
- stream = ThreadBasedConcurrentStream(
83
+ stream = DefaultStream(
197
84
  self._partition_generator,
198
- self._max_workers,
199
85
  self._name,
200
86
  json_schema,
201
87
  self._availability_strategy,
202
88
  ["id"],
203
89
  self._cursor_field,
204
- self._slice_logger,
205
90
  self._logger,
206
- self._message_repository,
207
- 1,
208
- 2,
209
- 0,
210
91
  )
211
92
 
212
93
  expected_airbyte_stream = AirbyteStream(
@@ -230,20 +111,14 @@ class ThreadBasedConcurrentStreamTest(unittest.TestCase):
230
111
  "id_b": {"type": ["null", "string"]},
231
112
  },
232
113
  }
233
- stream = ThreadBasedConcurrentStream(
114
+ stream = DefaultStream(
234
115
  self._partition_generator,
235
- self._max_workers,
236
116
  self._name,
237
117
  json_schema,
238
118
  self._availability_strategy,
239
119
  ["id_a", "id_b"],
240
120
  self._cursor_field,
241
- self._slice_logger,
242
121
  self._logger,
243
- self._message_repository,
244
- 1,
245
- 2,
246
- 0,
247
122
  )
248
123
 
249
124
  expected_airbyte_stream = AirbyteStream(
@@ -267,20 +142,14 @@ class ThreadBasedConcurrentStreamTest(unittest.TestCase):
267
142
  "date": {"type": ["null", "string"]},
268
143
  },
269
144
  }
270
- stream = ThreadBasedConcurrentStream(
145
+ stream = DefaultStream(
271
146
  self._partition_generator,
272
- self._max_workers,
273
147
  self._name,
274
148
  json_schema,
275
149
  self._availability_strategy,
276
150
  self._primary_key,
277
151
  "date",
278
- self._slice_logger,
279
152
  self._logger,
280
- self._message_repository,
281
- 1,
282
- 2,
283
- 0,
284
153
  )
285
154
 
286
155
  expected_airbyte_stream = AirbyteStream(
@@ -297,20 +166,14 @@ class ThreadBasedConcurrentStreamTest(unittest.TestCase):
297
166
  assert expected_airbyte_stream == airbyte_stream
298
167
 
299
168
  def test_as_airbyte_stream_with_namespace(self):
300
- stream = ThreadBasedConcurrentStream(
169
+ stream = DefaultStream(
301
170
  self._partition_generator,
302
- self._max_workers,
303
171
  self._name,
304
172
  self._json_schema,
305
173
  self._availability_strategy,
306
174
  self._primary_key,
307
175
  self._cursor_field,
308
- self._slice_logger,
309
176
  self._logger,
310
- self._message_repository,
311
- 1,
312
- 2,
313
- 0,
314
177
  namespace="test",
315
178
  )
316
179
  expected_airbyte_stream = AirbyteStream(
@@ -16,8 +16,8 @@ def test_partition_reader():
16
16
 
17
17
  stream_partition = Mock()
18
18
  records = [
19
- Record({"id": 1, "name": "Jack"}),
20
- Record({"id": 2, "name": "John"}),
19
+ Record({"id": 1, "name": "Jack"}, "stream"),
20
+ Record({"id": 2, "name": "John"}, "stream"),
21
21
  ]
22
22
  stream_partition.read.return_value = iter(records)
23
23
 
@@ -0,0 +1,98 @@
1
+ #
2
+ # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
+ #
4
+ from concurrent.futures import Future, ThreadPoolExecutor
5
+ from unittest import TestCase
6
+ from unittest.mock import Mock, patch
7
+
8
+ from airbyte_cdk.sources.concurrent_source.thread_pool_manager import ThreadPoolManager
9
+
10
+ _SLEEP_TIME = 2
11
+
12
+
13
+ class ThreadPoolManagerTest(TestCase):
14
+ def setUp(self):
15
+ self._threadpool = Mock(spec=ThreadPoolExecutor)
16
+ self._thread_pool_manager = ThreadPoolManager(self._threadpool, Mock(), max_concurrent_tasks=1, sleep_time=_SLEEP_TIME)
17
+ self._fn = lambda x: x
18
+ self._arg = "arg"
19
+
20
+ def test_submit_calls_underlying_thread_pool(self):
21
+ self._thread_pool_manager.submit(self._fn, self._arg)
22
+ self._threadpool.submit.assert_called_with(self._fn, self._arg)
23
+
24
+ assert len(self._thread_pool_manager._futures) == 1
25
+
26
+ def test_submit_too_many_concurrent_tasks(self):
27
+ future = Mock(spec=Future)
28
+ future.exception.return_value = None
29
+ future.done.side_effect = [False, True]
30
+
31
+ with patch("time.sleep") as sleep_mock:
32
+ self._thread_pool_manager._futures = [future]
33
+ self._thread_pool_manager.submit(self._fn, self._arg)
34
+ self._threadpool.submit.assert_called_with(self._fn, self._arg)
35
+ sleep_mock.assert_called_with(_SLEEP_TIME)
36
+
37
+ assert len(self._thread_pool_manager._futures) == 1
38
+
39
+ def test_submit_task_previous_task_failed(self):
40
+ future = Mock(spec=Future)
41
+ future.exception.return_value = RuntimeError
42
+ future.done.side_effect = [False, True]
43
+
44
+ self._thread_pool_manager._futures = [future]
45
+
46
+ with self.assertRaises(RuntimeError):
47
+ self._thread_pool_manager.submit(self._fn, self._arg)
48
+
49
+ def test_shutdown(self):
50
+ self._thread_pool_manager.shutdown()
51
+ self._threadpool.shutdown.assert_called_with(wait=False, cancel_futures=True)
52
+
53
+ def test_is_done_is_false_if_not_all_futures_are_done(self):
54
+ future = Mock(spec=Future)
55
+ future.done.return_value = False
56
+
57
+ self._thread_pool_manager._futures = [future]
58
+
59
+ assert not self._thread_pool_manager.is_done()
60
+
61
+ def test_is_done_is_true_if_all_futures_are_done(self):
62
+ future = Mock(spec=Future)
63
+ future.done.return_value = True
64
+
65
+ self._thread_pool_manager._futures = [future]
66
+
67
+ assert self._thread_pool_manager.is_done()
68
+
69
+ def test_threadpool_shutdown_if_errors(self):
70
+ future = Mock(spec=Future)
71
+ future.exception.return_value = RuntimeError
72
+
73
+ self._thread_pool_manager._futures = [future]
74
+
75
+ with self.assertRaises(RuntimeError):
76
+ self._thread_pool_manager.check_for_errors_and_shutdown()
77
+ self._threadpool.shutdown.assert_called_with(wait=False, cancel_futures=True)
78
+
79
+ def test_check_for_errors_and_shutdown_raises_error_if_futures_are_not_done(self):
80
+ future = Mock(spec=Future)
81
+ future.exception.return_value = None
82
+ future.done.return_value = False
83
+
84
+ self._thread_pool_manager._futures = [future]
85
+
86
+ with self.assertRaises(RuntimeError):
87
+ self._thread_pool_manager.check_for_errors_and_shutdown()
88
+ self._threadpool.shutdown.assert_called_with(wait=False, cancel_futures=True)
89
+
90
+ def test_check_for_errors_and_shutdown_does_not_raise_error_if_futures_are_done(self):
91
+ future = Mock(spec=Future)
92
+ future.exception.return_value = None
93
+ future.done.return_value = True
94
+
95
+ self._thread_pool_manager._futures = [future]
96
+
97
+ self._thread_pool_manager.check_for_errors_and_shutdown()
98
+ self._threadpool.shutdown.assert_called_with(wait=False, cancel_futures=True)
@@ -1,7 +1,6 @@
1
1
  #
2
2
  # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
3
  #
4
-
5
4
  import logging
6
5
  from typing import Any, Iterable, List, Mapping, Optional, Union
7
6
  from unittest.mock import Mock
@@ -59,7 +58,7 @@ def _concurrent_stream(slice_to_partition_mapping, slice_logger, logger, message
59
58
  source = Mock()
60
59
  source._slice_logger = slice_logger
61
60
  source.message_repository = message_repository
62
- stream = StreamFacade.create_from_stream(stream, source, logger, 1, _NO_STATE, NoopCursor())
61
+ stream = StreamFacade.create_from_stream(stream, source, logger, _NO_STATE, NoopCursor())
63
62
  stream.logger.setLevel(logger.level)
64
63
  return stream
65
64
 
@@ -0,0 +1,105 @@
1
+ #
2
+ # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
+ #
4
+ import concurrent
5
+ import logging
6
+ from typing import Any, Callable, Dict, Iterable, Mapping, Optional, Tuple
7
+ from unittest.mock import Mock
8
+
9
+ from airbyte_cdk.models import SyncMode
10
+ from airbyte_cdk.sources.concurrent_source.concurrent_source import ConcurrentSource
11
+ from airbyte_cdk.sources.concurrent_source.thread_pool_manager import ThreadPoolManager
12
+ from airbyte_cdk.sources.message import InMemoryMessageRepository, MessageRepository
13
+ from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream
14
+ from airbyte_cdk.sources.streams.concurrent.availability_strategy import StreamAvailability, StreamAvailable, StreamUnavailable
15
+ from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
16
+ from airbyte_cdk.sources.streams.concurrent.partitions.record import Record
17
+ from airbyte_protocol.models import AirbyteStream
18
+
19
+ logger = logging.getLogger("airbyte")
20
+
21
+
22
+ class _MockSource(ConcurrentSource):
23
+ def __init__(
24
+ self,
25
+ check_lambda: Callable[[], Tuple[bool, Optional[Any]]] = None,
26
+ per_stream: bool = True,
27
+ message_repository: MessageRepository = InMemoryMessageRepository(),
28
+ threadpool: ThreadPoolManager = ThreadPoolManager(
29
+ concurrent.futures.ThreadPoolExecutor(max_workers=1, thread_name_prefix="workerpool"), logger
30
+ ),
31
+ exception_on_missing_stream: bool = True,
32
+ ):
33
+ super().__init__(threadpool, Mock(), Mock(), message_repository)
34
+ self.check_lambda = check_lambda
35
+ self.per_stream = per_stream
36
+ self.exception_on_missing_stream = exception_on_missing_stream
37
+ self._message_repository = message_repository
38
+
39
+
40
+ MESSAGE_FROM_REPOSITORY = Mock()
41
+
42
+
43
+ class _MockStream(AbstractStream):
44
+ def __init__(self, name: str, available: bool = True, json_schema: Dict[str, Any] = {}):
45
+ self._name = name
46
+ self._available = available
47
+ self._json_schema = json_schema
48
+
49
+ def generate_partitions(self) -> Iterable[Partition]:
50
+ yield _MockPartition(self._name)
51
+
52
+ @property
53
+ def name(self) -> str:
54
+ return self._name
55
+
56
+ @property
57
+ def cursor_field(self) -> Optional[str]:
58
+ raise NotImplementedError
59
+
60
+ def check_availability(self) -> StreamAvailability:
61
+ if self._available:
62
+ return StreamAvailable()
63
+ else:
64
+ return StreamUnavailable("stream is unavailable")
65
+
66
+ def get_json_schema(self) -> Mapping[str, Any]:
67
+ return self._json_schema
68
+
69
+ def as_airbyte_stream(self) -> AirbyteStream:
70
+ return AirbyteStream(name=self.name, json_schema=self.get_json_schema(), supported_sync_modes=[SyncMode.full_refresh])
71
+
72
+ def log_stream_sync_configuration(self) -> None:
73
+ raise NotImplementedError
74
+
75
+
76
+ class _MockPartition(Partition):
77
+ def __init__(self, name: str):
78
+ self._name = name
79
+ self._closed = False
80
+
81
+ def read(self) -> Iterable[Record]:
82
+ yield from [Record({"key": "value"}, self._name)]
83
+
84
+ def to_slice(self) -> Optional[Mapping[str, Any]]:
85
+ return {}
86
+
87
+ def stream_name(self) -> str:
88
+ return self._name
89
+
90
+ def close(self) -> None:
91
+ self._closed = True
92
+
93
+ def is_closed(self) -> bool:
94
+ return self._closed
95
+
96
+ def __hash__(self) -> int:
97
+ return hash(self._name)
98
+
99
+
100
+ def test_concurrent_source_reading_from_no_streams():
101
+ stream = _MockStream("my_stream", False, {})
102
+ source = _MockSource()
103
+ messages = []
104
+ for m in source.read([stream]):
105
+ messages.append(m)