airbyte-cdk 0.54.0__py3-none-any.whl → 0.55.0__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (36) hide show
  1. airbyte_cdk/sources/concurrent_source/__init__.py +3 -0
  2. airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +190 -0
  3. airbyte_cdk/sources/concurrent_source/concurrent_source.py +161 -0
  4. airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py +63 -0
  5. airbyte_cdk/sources/concurrent_source/partition_generation_completed_sentinel.py +17 -0
  6. airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +97 -0
  7. airbyte_cdk/sources/streams/concurrent/abstract_stream.py +4 -4
  8. airbyte_cdk/sources/streams/concurrent/adapters.py +34 -12
  9. airbyte_cdk/sources/streams/concurrent/default_stream.py +79 -0
  10. airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +7 -7
  11. airbyte_cdk/sources/streams/concurrent/partitions/partition.py +23 -0
  12. airbyte_cdk/sources/streams/concurrent/partitions/record.py +4 -3
  13. airbyte_cdk/sources/streams/concurrent/partitions/types.py +2 -3
  14. airbyte_cdk/sources/utils/slice_logger.py +5 -0
  15. {airbyte_cdk-0.54.0.dist-info → airbyte_cdk-0.55.0.dist-info}/METADATA +1 -1
  16. {airbyte_cdk-0.54.0.dist-info → airbyte_cdk-0.55.0.dist-info}/RECORD +35 -23
  17. unit_tests/sources/concurrent_source/__init__.py +3 -0
  18. unit_tests/sources/concurrent_source/test_concurrent_source_adapter.py +105 -0
  19. unit_tests/sources/streams/concurrent/scenarios/stream_facade_builder.py +14 -7
  20. unit_tests/sources/streams/concurrent/scenarios/stream_facade_scenarios.py +2 -3
  21. unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_scenarios.py +44 -55
  22. unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_source_builder.py +24 -15
  23. unit_tests/sources/streams/concurrent/test_adapters.py +52 -32
  24. unit_tests/sources/streams/concurrent/test_concurrent_partition_generator.py +6 -5
  25. unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py +604 -0
  26. unit_tests/sources/streams/concurrent/test_cursor.py +1 -1
  27. unit_tests/sources/streams/concurrent/{test_thread_based_concurrent_stream.py → test_default_stream.py} +7 -144
  28. unit_tests/sources/streams/concurrent/test_partition_reader.py +2 -2
  29. unit_tests/sources/streams/concurrent/test_thread_pool_manager.py +98 -0
  30. unit_tests/sources/streams/test_stream_read.py +1 -2
  31. unit_tests/sources/test_concurrent_source.py +105 -0
  32. unit_tests/sources/test_source_read.py +461 -0
  33. airbyte_cdk/sources/streams/concurrent/thread_based_concurrent_stream.py +0 -221
  34. {airbyte_cdk-0.54.0.dist-info → airbyte_cdk-0.55.0.dist-info}/LICENSE.txt +0 -0
  35. {airbyte_cdk-0.54.0.dist-info → airbyte_cdk-0.55.0.dist-info}/WHEEL +0 -0
  36. {airbyte_cdk-0.54.0.dist-info → airbyte_cdk-0.55.0.dist-info}/top_level.txt +0 -0
@@ -1,49 +1,33 @@
1
1
  #
2
2
  # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
3
  #
4
-
5
4
  import unittest
6
- from unittest.mock import Mock, call
5
+ from unittest.mock import Mock
7
6
 
8
- import pytest
9
7
  from airbyte_cdk.models import AirbyteStream, SyncMode
10
8
  from airbyte_cdk.sources.streams.concurrent.availability_strategy import STREAM_AVAILABLE
11
9
  from airbyte_cdk.sources.streams.concurrent.cursor import Cursor
12
- from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
13
- from airbyte_cdk.sources.streams.concurrent.partitions.record import Record
14
- from airbyte_cdk.sources.streams.concurrent.thread_based_concurrent_stream import ThreadBasedConcurrentStream
15
-
16
- _MAX_CONCURRENT_TASKS = 2
10
+ from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream
17
11
 
18
12
 
19
13
  class ThreadBasedConcurrentStreamTest(unittest.TestCase):
20
14
  def setUp(self):
21
15
  self._partition_generator = Mock()
22
- self._max_workers = 1
23
16
  self._name = "name"
24
17
  self._json_schema = {}
25
18
  self._availability_strategy = Mock()
26
19
  self._primary_key = []
27
20
  self._cursor_field = None
28
- self._slice_logger = Mock()
29
21
  self._logger = Mock()
30
- self._message_repository = Mock()
31
22
  self._cursor = Mock(spec=Cursor)
32
- self._stream = ThreadBasedConcurrentStream(
23
+ self._stream = DefaultStream(
33
24
  self._partition_generator,
34
- self._max_workers,
35
25
  self._name,
36
26
  self._json_schema,
37
27
  self._availability_strategy,
38
28
  self._primary_key,
39
29
  self._cursor_field,
40
- self._slice_logger,
41
30
  self._logger,
42
- self._message_repository,
43
- 1,
44
- _MAX_CONCURRENT_TASKS,
45
- 0,
46
- cursor=self._cursor,
47
31
  )
48
32
 
49
33
  def test_get_json_schema(self):
@@ -65,13 +49,6 @@ class ThreadBasedConcurrentStreamTest(unittest.TestCase):
65
49
  with self.assertRaises(Exception):
66
50
  self._stream._check_for_errors(futures)
67
51
 
68
- def test_check_for_error_raises_no_exception_if_all_futures_succeeded(self):
69
- futures = [Mock() for _ in range(3)]
70
- for f in futures:
71
- f.exception.return_value = None
72
-
73
- self._stream._check_for_errors(futures)
74
-
75
52
  def test_check_for_error_raises_an_exception_if_any_of_the_futures_raised_an_exception(self):
76
53
  futures = [Mock() for _ in range(3)]
77
54
  for f in futures:
@@ -81,96 +58,6 @@ class ThreadBasedConcurrentStreamTest(unittest.TestCase):
81
58
  with self.assertRaises(Exception):
82
59
  self._stream._check_for_errors(futures)
83
60
 
84
- def test_read_observe_records_and_close_partition(self):
85
- partition = Mock(spec=Partition)
86
- expected_records = [Record({"id": 1}), Record({"id": "2"})]
87
- partition.read.return_value = expected_records
88
- partition.to_slice.return_value = {"slice": "slice"}
89
- self._slice_logger.should_log_slice_message.return_value = False
90
-
91
- self._partition_generator.generate.return_value = [partition]
92
- actual_records = list(self._stream.read())
93
-
94
- assert expected_records == actual_records
95
-
96
- self._cursor.observe.has_calls([call(record) for record in expected_records])
97
- self._cursor.close_partition.assert_called_once_with(partition)
98
-
99
- def test_read_no_slice_message(self):
100
- partition = Mock(spec=Partition)
101
- expected_records = [Record({"id": 1}), Record({"id": "2"})]
102
- partition.read.return_value = expected_records
103
- partition.to_slice.return_value = {"slice": "slice"}
104
- self._slice_logger.should_log_slice_message.return_value = False
105
-
106
- self._partition_generator.generate.return_value = [partition]
107
- actual_records = list(self._stream.read())
108
-
109
- assert expected_records == actual_records
110
-
111
- self._message_repository.emit_message.assert_not_called()
112
-
113
- def test_read_log_slice_message(self):
114
- partition = Mock(spec=Partition)
115
- expected_records = [Record({"id": 1}), Record({"id": "2"})]
116
- partition.read.return_value = expected_records
117
- partition.to_slice.return_value = {"slice": "slice"}
118
- self._slice_logger.should_log_slice_message.return_value = True
119
- slice_log_message = Mock()
120
- self._slice_logger.create_slice_log_message.return_value = slice_log_message
121
-
122
- self._partition_generator.generate.return_value = [partition]
123
- list(self._stream.read())
124
-
125
- self._message_repository.emit_message.assert_called_once_with(slice_log_message)
126
-
127
- def test_wait_while_task_queue_is_full(self):
128
- f1 = Mock()
129
- f2 = Mock()
130
-
131
- # Verify that the done() method will be called until only one future is still running
132
- f1.done.side_effect = [False, False]
133
- f1.exception.return_value = None
134
- f2.done.side_effect = [False, True]
135
- f2.exception.return_value = None
136
- futures = [f1, f2]
137
- self._stream._wait_while_too_many_pending_futures(futures)
138
-
139
- f1.done.assert_has_calls([call(), call()])
140
- f2.done.assert_has_calls([call(), call()])
141
-
142
- def test_given_exception_then_fail_immediately(self):
143
- f1 = Mock()
144
- f2 = Mock()
145
-
146
- # Verify that the done() method will be called until only one future is still running
147
- f1.done.return_value = True
148
- f1.exception.return_value = None
149
- f2.done.return_value = True
150
- f2.exception.return_value = ValueError("ERROR")
151
- futures = [f1, f2]
152
-
153
- with pytest.raises(RuntimeError):
154
- self._stream._wait_while_too_many_pending_futures(futures)
155
-
156
- def test_given_removing_multiple_elements_when_pruning_then_fail_immediately(self):
157
- # Verify that the done() method will be called until only one future is still running
158
- futures = []
159
- for _ in range(_MAX_CONCURRENT_TASKS + 1):
160
- future = Mock()
161
- future.done.return_value = True
162
- future.exception.return_value = None
163
- futures.append(future)
164
-
165
- pending_future = Mock()
166
- pending_future.done.return_value = False
167
- pending_future.exception.return_value = None
168
- futures.append(pending_future)
169
-
170
- self._stream._wait_while_too_many_pending_futures(futures)
171
-
172
- assert futures == [pending_future]
173
-
174
61
  def test_as_airbyte_stream(self):
175
62
  expected_airbyte_stream = AirbyteStream(
176
63
  name=self._name,
@@ -193,20 +80,14 @@ class ThreadBasedConcurrentStreamTest(unittest.TestCase):
193
80
  "id_b": {"type": ["null", "string"]},
194
81
  },
195
82
  }
196
- stream = ThreadBasedConcurrentStream(
83
+ stream = DefaultStream(
197
84
  self._partition_generator,
198
- self._max_workers,
199
85
  self._name,
200
86
  json_schema,
201
87
  self._availability_strategy,
202
88
  ["id"],
203
89
  self._cursor_field,
204
- self._slice_logger,
205
90
  self._logger,
206
- self._message_repository,
207
- 1,
208
- 2,
209
- 0,
210
91
  )
211
92
 
212
93
  expected_airbyte_stream = AirbyteStream(
@@ -230,20 +111,14 @@ class ThreadBasedConcurrentStreamTest(unittest.TestCase):
230
111
  "id_b": {"type": ["null", "string"]},
231
112
  },
232
113
  }
233
- stream = ThreadBasedConcurrentStream(
114
+ stream = DefaultStream(
234
115
  self._partition_generator,
235
- self._max_workers,
236
116
  self._name,
237
117
  json_schema,
238
118
  self._availability_strategy,
239
119
  ["id_a", "id_b"],
240
120
  self._cursor_field,
241
- self._slice_logger,
242
121
  self._logger,
243
- self._message_repository,
244
- 1,
245
- 2,
246
- 0,
247
122
  )
248
123
 
249
124
  expected_airbyte_stream = AirbyteStream(
@@ -267,20 +142,14 @@ class ThreadBasedConcurrentStreamTest(unittest.TestCase):
267
142
  "date": {"type": ["null", "string"]},
268
143
  },
269
144
  }
270
- stream = ThreadBasedConcurrentStream(
145
+ stream = DefaultStream(
271
146
  self._partition_generator,
272
- self._max_workers,
273
147
  self._name,
274
148
  json_schema,
275
149
  self._availability_strategy,
276
150
  self._primary_key,
277
151
  "date",
278
- self._slice_logger,
279
152
  self._logger,
280
- self._message_repository,
281
- 1,
282
- 2,
283
- 0,
284
153
  )
285
154
 
286
155
  expected_airbyte_stream = AirbyteStream(
@@ -297,20 +166,14 @@ class ThreadBasedConcurrentStreamTest(unittest.TestCase):
297
166
  assert expected_airbyte_stream == airbyte_stream
298
167
 
299
168
  def test_as_airbyte_stream_with_namespace(self):
300
- stream = ThreadBasedConcurrentStream(
169
+ stream = DefaultStream(
301
170
  self._partition_generator,
302
- self._max_workers,
303
171
  self._name,
304
172
  self._json_schema,
305
173
  self._availability_strategy,
306
174
  self._primary_key,
307
175
  self._cursor_field,
308
- self._slice_logger,
309
176
  self._logger,
310
- self._message_repository,
311
- 1,
312
- 2,
313
- 0,
314
177
  namespace="test",
315
178
  )
316
179
  expected_airbyte_stream = AirbyteStream(
@@ -16,8 +16,8 @@ def test_partition_reader():
16
16
 
17
17
  stream_partition = Mock()
18
18
  records = [
19
- Record({"id": 1, "name": "Jack"}),
20
- Record({"id": 2, "name": "John"}),
19
+ Record({"id": 1, "name": "Jack"}, "stream"),
20
+ Record({"id": 2, "name": "John"}, "stream"),
21
21
  ]
22
22
  stream_partition.read.return_value = iter(records)
23
23
 
@@ -0,0 +1,98 @@
1
+ #
2
+ # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
+ #
4
+ from concurrent.futures import Future, ThreadPoolExecutor
5
+ from unittest import TestCase
6
+ from unittest.mock import Mock, patch
7
+
8
+ from airbyte_cdk.sources.concurrent_source.thread_pool_manager import ThreadPoolManager
9
+
10
+ _SLEEP_TIME = 2
11
+
12
+
13
+ class ThreadPoolManagerTest(TestCase):
14
+ def setUp(self):
15
+ self._threadpool = Mock(spec=ThreadPoolExecutor)
16
+ self._thread_pool_manager = ThreadPoolManager(self._threadpool, Mock(), max_concurrent_tasks=1, sleep_time=_SLEEP_TIME)
17
+ self._fn = lambda x: x
18
+ self._arg = "arg"
19
+
20
+ def test_submit_calls_underlying_thread_pool(self):
21
+ self._thread_pool_manager.submit(self._fn, self._arg)
22
+ self._threadpool.submit.assert_called_with(self._fn, self._arg)
23
+
24
+ assert len(self._thread_pool_manager._futures) == 1
25
+
26
+ def test_submit_too_many_concurrent_tasks(self):
27
+ future = Mock(spec=Future)
28
+ future.exception.return_value = None
29
+ future.done.side_effect = [False, True]
30
+
31
+ with patch("time.sleep") as sleep_mock:
32
+ self._thread_pool_manager._futures = [future]
33
+ self._thread_pool_manager.submit(self._fn, self._arg)
34
+ self._threadpool.submit.assert_called_with(self._fn, self._arg)
35
+ sleep_mock.assert_called_with(_SLEEP_TIME)
36
+
37
+ assert len(self._thread_pool_manager._futures) == 1
38
+
39
+ def test_submit_task_previous_task_failed(self):
40
+ future = Mock(spec=Future)
41
+ future.exception.return_value = RuntimeError
42
+ future.done.side_effect = [False, True]
43
+
44
+ self._thread_pool_manager._futures = [future]
45
+
46
+ with self.assertRaises(RuntimeError):
47
+ self._thread_pool_manager.submit(self._fn, self._arg)
48
+
49
+ def test_shutdown(self):
50
+ self._thread_pool_manager.shutdown()
51
+ self._threadpool.shutdown.assert_called_with(wait=False, cancel_futures=True)
52
+
53
+ def test_is_done_is_false_if_not_all_futures_are_done(self):
54
+ future = Mock(spec=Future)
55
+ future.done.return_value = False
56
+
57
+ self._thread_pool_manager._futures = [future]
58
+
59
+ assert not self._thread_pool_manager.is_done()
60
+
61
+ def test_is_done_is_true_if_all_futures_are_done(self):
62
+ future = Mock(spec=Future)
63
+ future.done.return_value = True
64
+
65
+ self._thread_pool_manager._futures = [future]
66
+
67
+ assert self._thread_pool_manager.is_done()
68
+
69
+ def test_threadpool_shutdown_if_errors(self):
70
+ future = Mock(spec=Future)
71
+ future.exception.return_value = RuntimeError
72
+
73
+ self._thread_pool_manager._futures = [future]
74
+
75
+ with self.assertRaises(RuntimeError):
76
+ self._thread_pool_manager.check_for_errors_and_shutdown()
77
+ self._threadpool.shutdown.assert_called_with(wait=False, cancel_futures=True)
78
+
79
+ def test_check_for_errors_and_shutdown_raises_error_if_futures_are_not_done(self):
80
+ future = Mock(spec=Future)
81
+ future.exception.return_value = None
82
+ future.done.return_value = False
83
+
84
+ self._thread_pool_manager._futures = [future]
85
+
86
+ with self.assertRaises(RuntimeError):
87
+ self._thread_pool_manager.check_for_errors_and_shutdown()
88
+ self._threadpool.shutdown.assert_called_with(wait=False, cancel_futures=True)
89
+
90
+ def test_check_for_errors_and_shutdown_does_not_raise_error_if_futures_are_done(self):
91
+ future = Mock(spec=Future)
92
+ future.exception.return_value = None
93
+ future.done.return_value = True
94
+
95
+ self._thread_pool_manager._futures = [future]
96
+
97
+ self._thread_pool_manager.check_for_errors_and_shutdown()
98
+ self._threadpool.shutdown.assert_called_with(wait=False, cancel_futures=True)
@@ -1,7 +1,6 @@
1
1
  #
2
2
  # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
3
  #
4
-
5
4
  import logging
6
5
  from typing import Any, Iterable, List, Mapping, Optional, Union
7
6
  from unittest.mock import Mock
@@ -59,7 +58,7 @@ def _concurrent_stream(slice_to_partition_mapping, slice_logger, logger, message
59
58
  source = Mock()
60
59
  source._slice_logger = slice_logger
61
60
  source.message_repository = message_repository
62
- stream = StreamFacade.create_from_stream(stream, source, logger, 1, _NO_STATE, NoopCursor())
61
+ stream = StreamFacade.create_from_stream(stream, source, logger, _NO_STATE, NoopCursor())
63
62
  stream.logger.setLevel(logger.level)
64
63
  return stream
65
64
 
@@ -0,0 +1,105 @@
1
+ #
2
+ # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
+ #
4
+ import concurrent
5
+ import logging
6
+ from typing import Any, Callable, Dict, Iterable, Mapping, Optional, Tuple
7
+ from unittest.mock import Mock
8
+
9
+ from airbyte_cdk.models import SyncMode
10
+ from airbyte_cdk.sources.concurrent_source.concurrent_source import ConcurrentSource
11
+ from airbyte_cdk.sources.concurrent_source.thread_pool_manager import ThreadPoolManager
12
+ from airbyte_cdk.sources.message import InMemoryMessageRepository, MessageRepository
13
+ from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream
14
+ from airbyte_cdk.sources.streams.concurrent.availability_strategy import StreamAvailability, StreamAvailable, StreamUnavailable
15
+ from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
16
+ from airbyte_cdk.sources.streams.concurrent.partitions.record import Record
17
+ from airbyte_protocol.models import AirbyteStream
18
+
19
+ logger = logging.getLogger("airbyte")
20
+
21
+
22
+ class _MockSource(ConcurrentSource):
23
+ def __init__(
24
+ self,
25
+ check_lambda: Callable[[], Tuple[bool, Optional[Any]]] = None,
26
+ per_stream: bool = True,
27
+ message_repository: MessageRepository = InMemoryMessageRepository(),
28
+ threadpool: ThreadPoolManager = ThreadPoolManager(
29
+ concurrent.futures.ThreadPoolExecutor(max_workers=1, thread_name_prefix="workerpool"), logger
30
+ ),
31
+ exception_on_missing_stream: bool = True,
32
+ ):
33
+ super().__init__(threadpool, Mock(), Mock(), message_repository)
34
+ self.check_lambda = check_lambda
35
+ self.per_stream = per_stream
36
+ self.exception_on_missing_stream = exception_on_missing_stream
37
+ self._message_repository = message_repository
38
+
39
+
40
+ MESSAGE_FROM_REPOSITORY = Mock()
41
+
42
+
43
+ class _MockStream(AbstractStream):
44
+ def __init__(self, name: str, available: bool = True, json_schema: Dict[str, Any] = {}):
45
+ self._name = name
46
+ self._available = available
47
+ self._json_schema = json_schema
48
+
49
+ def generate_partitions(self) -> Iterable[Partition]:
50
+ yield _MockPartition(self._name)
51
+
52
+ @property
53
+ def name(self) -> str:
54
+ return self._name
55
+
56
+ @property
57
+ def cursor_field(self) -> Optional[str]:
58
+ raise NotImplementedError
59
+
60
+ def check_availability(self) -> StreamAvailability:
61
+ if self._available:
62
+ return StreamAvailable()
63
+ else:
64
+ return StreamUnavailable("stream is unavailable")
65
+
66
+ def get_json_schema(self) -> Mapping[str, Any]:
67
+ return self._json_schema
68
+
69
+ def as_airbyte_stream(self) -> AirbyteStream:
70
+ return AirbyteStream(name=self.name, json_schema=self.get_json_schema(), supported_sync_modes=[SyncMode.full_refresh])
71
+
72
+ def log_stream_sync_configuration(self) -> None:
73
+ raise NotImplementedError
74
+
75
+
76
+ class _MockPartition(Partition):
77
+ def __init__(self, name: str):
78
+ self._name = name
79
+ self._closed = False
80
+
81
+ def read(self) -> Iterable[Record]:
82
+ yield from [Record({"key": "value"}, self._name)]
83
+
84
+ def to_slice(self) -> Optional[Mapping[str, Any]]:
85
+ return {}
86
+
87
+ def stream_name(self) -> str:
88
+ return self._name
89
+
90
+ def close(self) -> None:
91
+ self._closed = True
92
+
93
+ def is_closed(self) -> bool:
94
+ return self._closed
95
+
96
+ def __hash__(self) -> int:
97
+ return hash(self._name)
98
+
99
+
100
+ def test_concurrent_source_reading_from_no_streams():
101
+ stream = _MockStream("my_stream", False, {})
102
+ source = _MockSource()
103
+ messages = []
104
+ for m in source.read([stream]):
105
+ messages.append(m)