airbyte-cdk 0.61.2__py3-none-any.whl → 0.62.1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (33)
  1. airbyte_cdk/sources/abstract_source.py +14 -33
  2. airbyte_cdk/sources/connector_state_manager.py +16 -4
  3. airbyte_cdk/sources/file_based/file_based_source.py +87 -35
  4. airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +3 -0
  5. airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +15 -13
  6. airbyte_cdk/sources/file_based/stream/concurrent/cursor/__init__.py +5 -0
  7. airbyte_cdk/sources/file_based/stream/concurrent/{cursor.py → cursor/abstract_concurrent_file_based_cursor.py} +22 -44
  8. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +279 -0
  9. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_noop_cursor.py +56 -0
  10. airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +11 -2
  11. airbyte_cdk/test/mock_http/mocker.py +3 -1
  12. airbyte_cdk/test/mock_http/response.py +9 -1
  13. airbyte_cdk/utils/traced_exception.py +1 -16
  14. {airbyte_cdk-0.61.2.dist-info → airbyte_cdk-0.62.1.dist-info}/METADATA +1 -1
  15. {airbyte_cdk-0.61.2.dist-info → airbyte_cdk-0.62.1.dist-info}/RECORD +33 -26
  16. unit_tests/sources/file_based/helpers.py +5 -0
  17. unit_tests/sources/file_based/scenarios/concurrent_incremental_scenarios.py +2860 -0
  18. unit_tests/sources/file_based/scenarios/incremental_scenarios.py +11 -0
  19. unit_tests/sources/file_based/scenarios/scenario_builder.py +6 -2
  20. unit_tests/sources/file_based/stream/concurrent/__init__.py +0 -0
  21. unit_tests/sources/file_based/stream/concurrent/test_adapters.py +365 -0
  22. unit_tests/sources/file_based/stream/concurrent/test_file_based_concurrent_cursor.py +462 -0
  23. unit_tests/sources/file_based/test_file_based_scenarios.py +45 -0
  24. unit_tests/sources/file_based/test_scenarios.py +16 -8
  25. unit_tests/sources/streams/concurrent/scenarios/stream_facade_builder.py +13 -2
  26. unit_tests/sources/test_abstract_source.py +36 -170
  27. unit_tests/sources/test_connector_state_manager.py +20 -13
  28. unit_tests/sources/test_integration_source.py +8 -25
  29. unit_tests/sources/test_source_read.py +1 -1
  30. unit_tests/test/mock_http/test_mocker.py +3 -1
  31. {airbyte_cdk-0.61.2.dist-info → airbyte_cdk-0.62.1.dist-info}/LICENSE.txt +0 -0
  32. {airbyte_cdk-0.61.2.dist-info → airbyte_cdk-0.62.1.dist-info}/WHEEL +0 -0
  33. {airbyte_cdk-0.61.2.dist-info → airbyte_cdk-0.62.1.dist-info}/top_level.txt +0 -0
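
The headline change in this release is concurrent incremental syncs for file-based sources: the single cursor.py is split into a cursor package with AbstractConcurrentFileBasedCursor, FileBasedConcurrentCursor, and FileBasedNoopCursor, exercised by the new tests shown below. As a minimal sketch of the per-stream state those tests work with (the parse_cursor helper is illustrative, not a CDK API):

    from datetime import datetime

    DATE_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"

    # Shape of the file-based incremental state exercised by the tests below:
    state = {
        # last-modified timestamp of each file already synced
        "history": {"a.csv": "2021-01-01T00:00:00.000000Z"},
        # overall cursor, formatted "<last_modified>_<uri>"
        "_ab_source_file_last_modified": "2021-01-01T00:00:00.000000Z_a.csv",
    }

    def parse_cursor(value: str):
        # The timestamp contains no "_", so splitting on the first one is safe.
        ts, _, uri = value.partition("_")
        return datetime.strptime(ts, DATE_TIME_FORMAT), uri

    assert parse_cursor(state["_ab_source_file_last_modified"]) == (datetime(2021, 1, 1), "a.csv")
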
unit_tests/sources/file_based/stream/concurrent/test_file_based_concurrent_cursor.py
@@ -0,0 +1,462 @@
+ # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
+
+
+ from datetime import datetime
+ from typing import Any, Dict, List, MutableMapping, Optional, Tuple
+ from unittest.mock import MagicMock
+
+ import pytest
+ from airbyte_cdk.models import AirbyteStateMessage, SyncMode
+ from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
+ from airbyte_cdk.sources.file_based.remote_file import RemoteFile
+ from airbyte_cdk.sources.file_based.stream.concurrent.adapters import FileBasedStreamPartition
+ from airbyte_cdk.sources.file_based.stream.concurrent.cursor import FileBasedConcurrentCursor
+ from airbyte_cdk.sources.streams.concurrent.cursor import CursorField
+ from freezegun import freeze_time
+
+ DATE_TIME_FORMAT = FileBasedConcurrentCursor.DATE_TIME_FORMAT
+ MOCK_DAYS_TO_SYNC_IF_HISTORY_IS_FULL = 3
+
+
+ def _make_cursor(input_state: Optional[MutableMapping[str, Any]]) -> FileBasedConcurrentCursor:
+     stream = MagicMock()
+     stream.name = "test"
+     stream.namespace = None
+     stream_config = MagicMock()
+     stream_config.days_to_sync_if_history_is_full = MOCK_DAYS_TO_SYNC_IF_HISTORY_IS_FULL
+     cursor = FileBasedConcurrentCursor(
+         stream_config,
+         stream.name,
+         None,
+         input_state,
+         MagicMock(),
+         ConnectorStateManager(
+             stream_instance_map={stream.name: stream},
+             state=[AirbyteStateMessage.parse_obj(input_state)] if input_state is not None else None,
+         ),
+         CursorField(FileBasedConcurrentCursor.CURSOR_FIELD),
+     )
+     return cursor
+
+
+ @pytest.mark.parametrize(
+     "input_state, expected_cursor_value",
+     [
+         pytest.param({}, (datetime.min, ""), id="no-state-gives-min-cursor"),
+         pytest.param({"history": {}}, (datetime.min, ""), id="missing-cursor-field-gives-min-cursor"),
+         pytest.param(
+             {
+                 "history": {"a.csv": "2021-01-01T00:00:00.000000Z"},
+                 "_ab_source_file_last_modified": "2021-01-01T00:00:00.000000Z_a.csv"
+             },
+             (datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT), "a.csv"),
+             id="cursor-value-matches-earliest-file",
+         ),
+         pytest.param(
+             {
+                 "history": {"a.csv": "2021-01-01T00:00:00.000000Z"},
+                 "_ab_source_file_last_modified": "2020-01-01T00:00:00.000000Z_a.csv"
+             },
+             (datetime.strptime("2020-01-01T00:00:00.000000Z", DATE_TIME_FORMAT), "a.csv"),
+             id="cursor-value-is-earlier",
+         ),
+         pytest.param(
+             {
+                 "history": {"a.csv": "2022-01-01T00:00:00.000000Z"},
+                 "_ab_source_file_last_modified": "2021-01-01T00:00:00.000000Z_a.csv"
+             },
+             (datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT), "a.csv"),
+             id="cursor-value-is-later",
+         ),
+         pytest.param(
+             {
+                 "history": {
+                     "a.csv": "2021-01-01T00:00:00.000000Z",
+                     "b.csv": "2021-01-02T00:00:00.000000Z",
+                     "c.csv": "2021-01-03T00:00:00.000000Z"
+                 },
+                 "_ab_source_file_last_modified": "2021-01-04T00:00:00.000000Z_d.csv"
+             },
+             (datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT), "a.csv"),
+             id="cursor-not-earliest",
+         ),
+         pytest.param(
+             {
+                 "history": {"b.csv": "2020-12-31T00:00:00.000000Z"},
+                 "_ab_source_file_last_modified": "2021-01-01T00:00:00.000000Z_a.csv"
+             },
+             (datetime.strptime("2020-12-31T00:00:00.000000Z", DATE_TIME_FORMAT), "b.csv"),
+             id="state-with-cursor-and-earlier-history"
+         ),
+         pytest.param(
+             {
+                 "history": {"b.csv": "2021-01-02T00:00:00.000000Z"},
+                 "_ab_source_file_last_modified": "2021-01-01T00:00:00.000000Z_a.csv"
+             },
+             (datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT), "a.csv"),
+             id="state-with-cursor-and-later-history"
+         ),
+     ]
+ )
+ def test_compute_prev_sync_cursor(input_state: MutableMapping[str, Any], expected_cursor_value: Tuple[datetime, str]):
+     cursor = _make_cursor(input_state)
+     assert cursor._compute_prev_sync_cursor(input_state) == expected_cursor_value
+
+
+ @pytest.mark.parametrize(
+     "initial_state, pending_files, file_to_add, expected_history, expected_pending_files, expected_cursor_value",
+     [
+         pytest.param(
+             {"history": {}},
+             [("newfile.csv", "2021-01-05T00:00:00.000000Z")],
+             ("newfile.csv", "2021-01-05T00:00:00.000000Z"),
+             {"newfile.csv": "2021-01-05T00:00:00.000000Z"},
+             [],
+             "2021-01-05T00:00:00.000000Z_newfile.csv",
+             id="add-to-empty-history-single-pending-file",
+         ),
+         pytest.param(
+             {"history": {}},
+             [("newfile.csv", "2021-01-05T00:00:00.000000Z"), ("pending.csv", "2020-01-05T00:00:00.000000Z")],
+             ("newfile.csv", "2021-01-05T00:00:00.000000Z"),
+             {"newfile.csv": "2021-01-05T00:00:00.000000Z"},
+             [("pending.csv", "2020-01-05T00:00:00.000000Z")],
+             "2020-01-05T00:00:00.000000Z_pending.csv",
+             id="add-to-empty-history-pending-file-is-older",
+         ),
+         pytest.param(
+             {"history": {}},
+             [("newfile.csv", "2021-01-05T00:00:00.000000Z"), ("pending.csv", "2022-01-05T00:00:00.000000Z")],
+             ("newfile.csv", "2021-01-05T00:00:00.000000Z"),
+             {"newfile.csv": "2021-01-05T00:00:00.000000Z"},
+             [("pending.csv", "2022-01-05T00:00:00.000000Z")],
+             "2022-01-05T00:00:00.000000Z_pending.csv",
+             id="add-to-empty-history-pending-file-is-newer",
+         ),
+         pytest.param(
+             {"history": {"existing.csv": "2021-01-04T00:00:00.000000Z"}},
+             [("newfile.csv", "2021-01-05T00:00:00.000000Z")],
+             ("newfile.csv", "2021-01-05T00:00:00.000000Z"),
+             {"existing.csv": "2021-01-04T00:00:00.000000Z", "newfile.csv": "2021-01-05T00:00:00.000000Z"},
+             [],
+             "2021-01-05T00:00:00.000000Z_newfile.csv",
+             id="add-to-nonempty-history-single-pending-file",
+         ),
+         pytest.param(
+             {"history": {"existing.csv": "2021-01-04T00:00:00.000000Z"}},
+             [("newfile.csv", "2021-01-05T00:00:00.000000Z"), ("pending.csv", "2020-01-05T00:00:00.000000Z")],
+             ("newfile.csv", "2021-01-05T00:00:00.000000Z"),
+             {"existing.csv": "2021-01-04T00:00:00.000000Z", "newfile.csv": "2021-01-05T00:00:00.000000Z"},
+             [("pending.csv", "2020-01-05T00:00:00.000000Z")],
+             "2020-01-05T00:00:00.000000Z_pending.csv",
+             id="add-to-nonempty-history-pending-file-is-older",
+         ),
+         pytest.param(
+             {"history": {"existing.csv": "2021-01-04T00:00:00.000000Z"}},
+             [("newfile.csv", "2021-01-05T00:00:00.000000Z"), ("pending.csv", "2022-01-05T00:00:00.000000Z")],
+             ("newfile.csv", "2021-01-05T00:00:00.000000Z"),
+             {"existing.csv": "2021-01-04T00:00:00.000000Z", "newfile.csv": "2021-01-05T00:00:00.000000Z"},
+             [("pending.csv", "2022-01-05T00:00:00.000000Z")],
+             "2022-01-05T00:00:00.000000Z_pending.csv",
+             id="add-to-nonempty-history-pending-file-is-newer",
+         ),
+     ]
+ )
+ def test_add_file(
+     initial_state: MutableMapping[str, Any],
+     pending_files: List[Tuple[str, str]],
+     file_to_add: Tuple[str, str],
+     expected_history: Dict[str, Any],
+     expected_pending_files: List[Tuple[str, str]],
+     expected_cursor_value: str,
+ ):
+     cursor = _make_cursor(initial_state)
+     mock_message_repository = MagicMock()
+     cursor._message_repository = mock_message_repository
+     stream = MagicMock()
+
+     cursor.set_pending_partitions([
+         FileBasedStreamPartition(
+             stream,
+             {"files": [RemoteFile(uri=uri, last_modified=datetime.strptime(timestamp, DATE_TIME_FORMAT))]},
+             mock_message_repository,
+             SyncMode.full_refresh,
+             FileBasedConcurrentCursor.CURSOR_FIELD,
+             initial_state,
+             cursor
+         ) for uri, timestamp in pending_files
+     ])
+
+     uri, timestamp = file_to_add
+     cursor.add_file(RemoteFile(uri=uri, last_modified=datetime.strptime(timestamp, DATE_TIME_FORMAT)))
+     assert cursor._file_to_datetime_history == expected_history
+     assert cursor._pending_files == {uri: RemoteFile(uri=uri, last_modified=datetime.strptime(timestamp, DATE_TIME_FORMAT)) for uri, timestamp in expected_pending_files}
+     assert mock_message_repository.emit_message.call_args_list[0].args[0].state.data["test"]["_ab_source_file_last_modified"] == expected_cursor_value
+
+
+ @pytest.mark.parametrize(
+     "initial_state, pending_files, file_to_add, expected_history, expected_pending_files, expected_cursor_value",
+     [
+         pytest.param(
+             {"history": {}},
+             [],
+             ("newfile.csv", "2021-01-05T00:00:00.000000Z"),
+             {"newfile.csv": "2021-01-05T00:00:00.000000Z"},
+             [],
+             "2021-01-05T00:00:00.000000Z_newfile.csv",
+             id="add-to-empty-history-no-pending-files",
+         ),
+         pytest.param(
+             {"history": {}},
+             [("pending.csv", "2021-01-05T00:00:00.000000Z")],
+             ("newfile.csv", "2021-01-05T00:00:00.000000Z"),
+             {"newfile.csv": "2021-01-05T00:00:00.000000Z"},
+             [("pending.csv", "2021-01-05T00:00:00.000000Z")],
+             "2021-01-05T00:00:00.000000Z_pending.csv",
+             id="add-to-empty-history-file-not-in-pending-files",
+         ),
+     ]
+ )
+ def test_add_file_invalid(
+     initial_state: MutableMapping[str, Any],
+     pending_files: List[Tuple[str, str]],
+     file_to_add: Tuple[str, str],
+     expected_history: Dict[str, Any],
+     expected_pending_files: List[Tuple[str, str]],
+     expected_cursor_value: str,
+ ):
+     cursor = _make_cursor(initial_state)
+     cursor._pending_files = {uri: RemoteFile(uri=uri, last_modified=datetime.strptime(timestamp, DATE_TIME_FORMAT)) for uri, timestamp in pending_files}
+     mock_message_repository = MagicMock()
+     cursor._message_repository = mock_message_repository
+
+     uri, timestamp = file_to_add
+     cursor.add_file(RemoteFile(uri=uri, last_modified=datetime.strptime(timestamp, DATE_TIME_FORMAT)))
+     assert cursor._file_to_datetime_history == expected_history
+     assert cursor._pending_files == {uri: RemoteFile(uri=uri, last_modified=datetime.strptime(timestamp, DATE_TIME_FORMAT)) for uri, timestamp in expected_pending_files}
+     assert mock_message_repository.emit_message.call_args_list[0].args[0].log.level.value == "WARN"
+     assert mock_message_repository.emit_message.call_args_list[1].args[0].state.data["test"]["_ab_source_file_last_modified"] == expected_cursor_value
+
+
+ @pytest.mark.parametrize(
+     "input_state, pending_files, expected_cursor_value",
+     [
+         pytest.param({}, [], f"{datetime.min.strftime('%Y-%m-%dT%H:%M:%S.%fZ')}_", id="no-state-no-pending"),
+         pytest.param(
+             {"history": {"a.csv": "2021-01-01T00:00:00.000000Z"}},
+             [],
+             "2021-01-01T00:00:00.000000Z_a.csv",
+             id="no-pending-with-history"
+         ),
+         pytest.param(
+             {"history": {}},
+             [("b.csv", "2021-01-02T00:00:00.000000Z")],
+             "2021-01-02T00:00:00.000000Z_b.csv",
+             id="pending-no-history"
+         ),
+         pytest.param(
+             {"history": {"a.csv": "2022-01-01T00:00:00.000000Z"}},
+             [("b.csv", "2021-01-02T00:00:00.000000Z")],
+             "2021-01-01T00:00:00.000000Z_a.csv",
+             id="with-pending-before-history"
+         ),
+         pytest.param(
+             {"history": {"a.csv": "2021-01-01T00:00:00.000000Z"}},
+             [("b.csv", "2022-01-02T00:00:00.000000Z")],
+             "2022-01-01T00:00:00.000000Z_a.csv",
+             id="with-pending-after-history"
+         ),
+     ]
+ )
+ def test_get_new_cursor_value(input_state: MutableMapping[str, Any], pending_files: List[Tuple[str, str]], expected_cursor_value: str):
+     cursor = _make_cursor(input_state)
+     pending_partitions = []
+     for url, timestamp in pending_files:
+         partition = MagicMock()
+         partition.to_slice = lambda *args, **kwargs: {"files": [RemoteFile(uri=url, last_modified=datetime.strptime(timestamp, DATE_TIME_FORMAT))]}
+         pending_partitions.append(partition)
+
+     cursor.set_pending_partitions(pending_partitions)
+
+
+ @pytest.mark.parametrize(
+     "all_files, history, is_history_full, prev_cursor_value, expected_files_to_sync",
+     [
+         pytest.param(
+             [RemoteFile(uri="new.csv", last_modified=datetime.strptime("2021-01-03T00:00:00.000000Z", "%Y-%m-%dT%H:%M:%S.%fZ"))],
+             {},
+             False,
+             (datetime.min, ""),
+             ["new.csv"],
+             id="empty-history-one-new-file"
+         ),
+         pytest.param(
+             [RemoteFile(uri="a.csv", last_modified=datetime.strptime("2021-01-02T00:00:00.000000Z", "%Y-%m-%dT%H:%M:%S.%fZ"))],
+             {"a.csv": "2021-01-01T00:00:00.000000Z"},
+             False,
+             (datetime.min, ""),
+             ["a.csv"],
+             id="non-empty-history-file-in-history-modified"
+         ),
+         pytest.param(
+             [RemoteFile(uri="a.csv", last_modified=datetime.strptime("2021-01-01T00:00:00.000000Z", "%Y-%m-%dT%H:%M:%S.%fZ"))],
+             {"a.csv": "2021-01-01T00:00:00.000000Z"},
+             False,
+             (datetime.min, ""),
+             [],
+             id="non-empty-history-file-in-history-not-modified"
+         ),
+     ]
+ )
+ def test_get_files_to_sync(all_files, history, is_history_full, prev_cursor_value, expected_files_to_sync):
+     cursor = _make_cursor({})
+     cursor._file_to_datetime_history = history
+     cursor._prev_cursor_value = prev_cursor_value
+     cursor._is_history_full = MagicMock(return_value=is_history_full)
+     files_to_sync = list(cursor.get_files_to_sync(all_files, MagicMock()))
+     assert [f.uri for f in files_to_sync] == expected_files_to_sync
+
+
+ @freeze_time("2023-06-16T00:00:00Z")
+ @pytest.mark.parametrize(
+     "file_to_check, history, is_history_full, prev_cursor_value, sync_start, expected_should_sync",
+     [
+         pytest.param(
+             RemoteFile(uri="new.csv", last_modified=datetime.strptime("2021-01-03T00:00:00.000000Z", DATE_TIME_FORMAT)),
+             {},
+             False,
+             (datetime.min, ""),
+             datetime.min,
+             True,
+             id="file-not-in-history-not-full-old-cursor"
+         ),
+         pytest.param(
+             RemoteFile(uri="new.csv", last_modified=datetime.strptime("2021-01-03T00:00:00.000000Z", DATE_TIME_FORMAT)),
+             {},
+             False,
+             (datetime.strptime("2024-01-02T00:00:00.000000Z", DATE_TIME_FORMAT), ""),
+             datetime.min,
+             True,
+             id="file-not-in-history-not-full-new-cursor"
+         ),
+         pytest.param(
+             RemoteFile(uri="a.csv", last_modified=datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT)),
+             {"a.csv": "2021-01-01T00:00:00.000000Z"},
+             False,
+             (datetime.min, ""),
+             datetime.min,
+             False,
+             id="file-in-history-not-modified"
+         ),
+         pytest.param(
+             RemoteFile(uri="a.csv", last_modified=datetime.strptime("2020-01-01T00:00:00.000000Z", DATE_TIME_FORMAT)),
+             {"a.csv": "2021-01-01T00:00:00.000000Z"},
+             False,
+             (datetime.min, ""),
+             datetime.min,
+             False,
+             id="file-in-history-modified-before"
+         ),
+         pytest.param(
+             RemoteFile(uri="a.csv", last_modified=datetime.strptime("2022-01-01T00:00:00.000000Z", DATE_TIME_FORMAT)),
+             {"a.csv": "2021-01-01T00:00:00.000000Z"},
+             False,
+             (datetime.min, ""),
+             datetime.min,
+             True,
+             id="file-in-history-modified-after"
+         ),
+         pytest.param(
+             RemoteFile(uri="new.csv", last_modified=datetime.strptime("2022-01-01T00:00:00.000000Z", DATE_TIME_FORMAT)),
+             {},
+             True,
+             (datetime.strptime("2021-01-02T00:00:00.000000Z", DATE_TIME_FORMAT), "a.csv"),
+             datetime.min,
+             True,
+             id="history-full-file-modified-after-cursor"
+         ),
+         pytest.param(
+             RemoteFile(uri="new1.csv", last_modified=datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT)),
+             {},
+             True,
+             (datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT), "new0.csv"),
+             datetime.min,
+             True,
+             id="history-full-modified-eq-cursor-uri-gt"
+         ),
+         pytest.param(
+             RemoteFile(uri="new0.csv", last_modified=datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT)),
+             {},
+             True,
+             (datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT), "new1.csv"),
+             datetime.min,
+             False,
+             id="history-full-modified-eq-cursor-uri-lt"
+         ),
+         pytest.param(
+             RemoteFile(uri="new.csv", last_modified=datetime.strptime("2020-01-01T00:00:00.000000Z", DATE_TIME_FORMAT)),
+             {},
+             True,
+             (datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT), "a.csv"),
+             datetime.min,
+             True,
+             id="history-full-modified-before-cursor-and-after-sync-start"
+         ),
+         pytest.param(
+             RemoteFile(uri="new.csv", last_modified=datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT)),
+             {},
+             True,
+             (datetime.strptime("2022-01-01T00:00:00.000000Z", DATE_TIME_FORMAT), "a.csv"),
+             datetime.strptime("2024-01-01T00:00:00.000000Z", DATE_TIME_FORMAT),
+             False,
+             id="history-full-modified-before-cursor-and-before-sync-start"
+         ),
+     ]
+ )
+ def test_should_sync_file(
+     file_to_check: RemoteFile,
+     history: Dict[str, Any],
+     is_history_full: bool,
+     prev_cursor_value: Tuple[datetime, str],
+     sync_start: datetime,
+     expected_should_sync: bool,
+ ):
+     cursor = _make_cursor({})
+     cursor._file_to_datetime_history = history
+     cursor._prev_cursor_value = prev_cursor_value
+     cursor._sync_start = sync_start
+     cursor._is_history_full = MagicMock(return_value=is_history_full)
+     should_sync = cursor._should_sync_file(file_to_check, MagicMock())
+     assert should_sync == expected_should_sync
+
+
+ @freeze_time("2023-06-16T00:00:00Z")
+ @pytest.mark.parametrize(
+     "input_history, is_history_full, expected_start_time",
+     [
+         pytest.param({}, False, datetime.min, id="empty-history"),
+         pytest.param(
+             {"a.csv": "2021-01-01T00:00:00.000000Z"},
+             False,
+             datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT),
+             id="non-full-history"
+         ),
+         pytest.param(
+             {f"file{i}.csv": f"2021-01-0{i}T00:00:00.000000Z" for i in range(1, 4)},  # all before the time window
+             True,
+             datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT),  # Earliest file time (before the window start)
+             id="full-history-earliest-before-window"
+         ),
+         pytest.param(
+             {f"file{i}.csv": f"2024-01-0{i}T00:00:00.000000Z" for i in range(1, 4)},  # all after the time window
+             True,
+             datetime.strptime("2023-06-13T00:00:00.000000Z", DATE_TIME_FORMAT),  # Time window start time (freeze time minus the 3-day window)
+             id="full-history-earliest-after-window"
+         ),
+     ]
+ )
+ def test_compute_start_time(input_history, is_history_full, expected_start_time, monkeypatch):
+     cursor = _make_cursor({"history": input_history})
+     cursor._file_to_datetime_history = input_history
+     cursor._is_history_full = MagicMock(return_value=is_history_full)
+     assert cursor._compute_start_time() == expected_start_time
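
The history-full-modified-eq-cursor cases above depend on the previous cursor being an ordered (datetime, uri) pair: Python compares such tuples lexicographically, so when two files share a last-modified time, the URI breaks the tie. A standalone illustration, independent of the CDK internals:

    from datetime import datetime

    prev_cursor = (datetime(2021, 1, 1), "new0.csv")

    # Same timestamp, URI sorts after the cursor's: not yet covered, so sync it.
    assert (datetime(2021, 1, 1), "new1.csv") > prev_cursor
    # Same timestamp and URI: already covered by the previous sync.
    assert (datetime(2021, 1, 1), "new0.csv") == prev_cursor
    # A later timestamp wins regardless of URI.
    assert (datetime(2022, 1, 1), "a.csv") > prev_cursor
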
unit_tests/sources/file_based/test_file_based_scenarios.py
@@ -26,6 +26,29 @@ from unit_tests.sources.file_based.scenarios.check_scenarios import (
      success_multi_stream_scenario,
      success_user_provided_schema_scenario,
  )
+ from unit_tests.sources.file_based.scenarios.concurrent_incremental_scenarios import (
+     multi_csv_different_timestamps_scenario_concurrent,
+     multi_csv_include_missing_files_within_history_range_concurrent_cursor_is_newer,
+     multi_csv_include_missing_files_within_history_range_concurrent_cursor_is_older,
+     multi_csv_per_timestamp_scenario_concurrent,
+     multi_csv_remove_old_files_if_history_is_full_scenario_concurrent_cursor_is_newer,
+     multi_csv_remove_old_files_if_history_is_full_scenario_concurrent_cursor_is_older,
+     multi_csv_same_timestamp_more_files_than_history_size_scenario_concurrent_cursor_is_newer,
+     multi_csv_same_timestamp_more_files_than_history_size_scenario_concurrent_cursor_is_older,
+     multi_csv_same_timestamp_scenario_concurrent,
+     multi_csv_skip_file_if_already_in_history_concurrent,
+     multi_csv_sync_files_within_history_time_window_if_history_is_incomplete_different_timestamps_scenario_concurrent_cursor_is_newer,
+     multi_csv_sync_files_within_history_time_window_if_history_is_incomplete_different_timestamps_scenario_concurrent_cursor_is_older,
+     multi_csv_sync_files_within_time_window_if_history_is_incomplete__different_timestamps_scenario_concurrent_cursor_is_newer,
+     multi_csv_sync_files_within_time_window_if_history_is_incomplete__different_timestamps_scenario_concurrent_cursor_is_older,
+     multi_csv_sync_recent_files_if_history_is_incomplete_scenario_concurrent_cursor_is_newer,
+     multi_csv_sync_recent_files_if_history_is_incomplete_scenario_concurrent_cursor_is_older,
+     single_csv_file_is_skipped_if_same_modified_at_as_in_history_concurrent,
+     single_csv_file_is_synced_if_modified_at_is_more_recent_than_in_history_concurrent,
+     single_csv_input_state_is_earlier_scenario_concurrent,
+     single_csv_input_state_is_later_scenario_concurrent,
+     single_csv_no_input_state_scenario_concurrent,
+ )
  from unit_tests.sources.file_based.scenarios.csv_scenarios import (
      csv_autogenerate_column_names_scenario,
      csv_custom_bool_values_scenario,
@@ -214,6 +237,28 @@ discover_scenarios = [
      unstructured_invalid_file_type_discover_scenario_no_skip,
      unstructured_invalid_file_type_discover_scenario_skip,
      unstructured_invalid_file_type_read_scenario,
+     multi_csv_different_timestamps_scenario_concurrent,
+     multi_csv_include_missing_files_within_history_range_concurrent_cursor_is_newer,
+     multi_csv_include_missing_files_within_history_range_concurrent_cursor_is_older,
+     multi_csv_per_timestamp_scenario_concurrent,
+     multi_csv_remove_old_files_if_history_is_full_scenario_concurrent_cursor_is_newer,
+     multi_csv_remove_old_files_if_history_is_full_scenario_concurrent_cursor_is_older,
+     multi_csv_same_timestamp_more_files_than_history_size_scenario_concurrent_cursor_is_newer,
+     multi_csv_same_timestamp_more_files_than_history_size_scenario_concurrent_cursor_is_older,
+     multi_csv_same_timestamp_scenario_concurrent,
+     multi_csv_skip_file_if_already_in_history_concurrent,
+     multi_csv_sync_files_within_history_time_window_if_history_is_incomplete_different_timestamps_scenario_concurrent_cursor_is_newer,
+     multi_csv_sync_files_within_history_time_window_if_history_is_incomplete_different_timestamps_scenario_concurrent_cursor_is_older,
+     multi_csv_sync_files_within_time_window_if_history_is_incomplete__different_timestamps_scenario_concurrent_cursor_is_newer,
+     multi_csv_sync_files_within_time_window_if_history_is_incomplete__different_timestamps_scenario_concurrent_cursor_is_older,
+     multi_csv_sync_recent_files_if_history_is_incomplete_scenario_concurrent_cursor_is_newer,
+     multi_csv_sync_recent_files_if_history_is_incomplete_scenario_concurrent_cursor_is_older,
+     single_csv_file_is_skipped_if_same_modified_at_as_in_history_concurrent,
+     single_csv_file_is_synced_if_modified_at_is_more_recent_than_in_history_concurrent,
+     single_csv_input_state_is_earlier_scenario_concurrent,
+     single_csv_input_state_is_later_scenario_concurrent,
+     single_csv_no_input_state_scenario_concurrent,
+
  ]

  read_scenarios = discover_scenarios + [
unit_tests/sources/file_based/test_scenarios.py
@@ -13,6 +13,7 @@ from _pytest.reports import ExceptionInfo
  from airbyte_cdk.entrypoint import launch
  from airbyte_cdk.models import AirbyteAnalyticsTraceMessage, SyncMode
  from airbyte_cdk.sources import AbstractSource
+ from airbyte_cdk.sources.file_based.stream.concurrent.cursor import AbstractConcurrentFileBasedCursor
  from airbyte_cdk.test.entrypoint_wrapper import EntrypointOutput
  from airbyte_cdk.test.entrypoint_wrapper import read as entrypoint_read
  from airbyte_cdk.utils.traced_exception import AirbyteTracedException
@@ -72,12 +73,10 @@ def assert_exception(expected_exception: type[BaseException], output: Entrypoint
  def _verify_read_output(output: EntrypointOutput, scenario: TestScenario[AbstractSource]) -> None:
      records, log_messages = output.records_and_state_messages, output.logs
      logs = [message.log for message in log_messages if message.log.level.value in scenario.log_levels]
-     expected_records = scenario.expected_records
-
-     if expected_records is None:
+     if scenario.expected_records is None:
          return

-     assert len(records) == len(expected_records)
+     expected_records = [r for r in scenario.expected_records] if scenario.expected_records else []

      sorted_expected_records = sorted(
          filter(lambda e: "data" in e, expected_records),
@@ -87,6 +86,9 @@ def _verify_read_output(output: EntrypointOutput, scenario: TestScenario[Abstrac
          filter(lambda r: r.record, records),
          key=lambda record: ",".join(f"{k}={v}" for k, v in sorted(record.record.data.items(), key=lambda items: (items[0], items[1])) if k != "emitted_at"),
      )
+
+     assert len(sorted_records) == len(sorted_expected_records)
+
      for actual, expected in zip(sorted_records, sorted_expected_records):
          if actual.record:
              assert len(actual.record.data) == len(expected["data"])
@@ -97,10 +99,16 @@ def _verify_read_output(output: EntrypointOutput, scenario: TestScenario[Abstrac
                  assert value == expected["data"][key]
              assert actual.record.stream == expected["stream"]

-     expected_states = filter(lambda e: "data" not in e, expected_records)
-     states = filter(lambda r: r.state, records)
-     for actual, expected in zip(states, expected_states):  # states should be emitted in sorted order
-         assert actual.state.data == expected
+     expected_states = list(filter(lambda e: "data" not in e, expected_records))
+     states = list(filter(lambda r: r.state, records))
+
+     if hasattr(scenario.source, "cursor_cls") and issubclass(scenario.source.cursor_cls, AbstractConcurrentFileBasedCursor):
+         # Only check the last state emitted because we don't know the order the others will be in.
+         # This may be needed for non-file-based concurrent scenarios too.
+         assert states[-1].state.data == expected_states[-1]
+     else:
+         for actual, expected in zip(states, expected_states):  # states should be emitted in sorted order
+             assert actual.state.data == expected

      if scenario.expected_logs:
          read_logs = scenario.expected_logs.get("read")
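
The branch added above loosens the state assertions for concurrent cursors: partitions complete in nondeterministic order, so intermediate state messages cannot be compared pairwise, and only the final state, emitted after all partitions finish, is stable. A toy sketch of the same assertion pattern (not the CDK helpers themselves):

    def check_states(states, expected_states, concurrent):
        if concurrent:
            # Intermediate states may arrive in any order; only the final one
            # is deterministic under concurrency.
            assert states[-1] == expected_states[-1]
        else:
            # Sequential reads emit states in order, so compare pairwise.
            for actual, expected in zip(states, expected_states):
                assert actual == expected
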
unit_tests/sources/streams/concurrent/scenarios/stream_facade_builder.py
@@ -5,7 +5,14 @@ import concurrent
  import logging
  from typing import Any, List, Mapping, Optional, Tuple, Union

- from airbyte_cdk.models import AirbyteStateMessage, ConfiguredAirbyteCatalog, ConnectorSpecification, DestinationSyncMode, SyncMode
+ from airbyte_cdk.models import (
+     AirbyteStateMessage,
+     AirbyteStream,
+     ConfiguredAirbyteCatalog,
+     ConnectorSpecification,
+     DestinationSyncMode,
+     SyncMode,
+ )
  from airbyte_cdk.sources.concurrent_source.concurrent_source import ConcurrentSource
  from airbyte_cdk.sources.concurrent_source.concurrent_source_adapter import ConcurrentSourceAdapter
  from airbyte_cdk.sources.concurrent_source.thread_pool_manager import ThreadPoolManager
@@ -51,7 +58,11 @@ class StreamFacadeSource(ConcurrentSourceAdapter):
          return True, None

      def streams(self, config: Mapping[str, Any]) -> List[Stream]:
-         state_manager = ConnectorStateManager(stream_instance_map={s.name: s for s in self._streams}, state=self._state)
+         state_manager = ConnectorStateManager(
+             stream_instance_map={s.name: AirbyteStream(name=s.name, namespace=None, json_schema={}, supported_sync_modes=[SyncMode.full_refresh, SyncMode.incremental]) for s in self._streams},
+             state=self._state,
+         )  # The input values into the AirbyteStream are dummy values; the connector state manager only uses `name` and `namespace`
+
          state_converter = StreamFacadeConcurrentConnectorStateConverter()
          stream_states = [state_manager.get_stream_state(stream.name, stream.namespace) for stream in self._streams]
          return [
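
The streams() change above tracks the updated ConnectorStateManager in this release (connector_state_manager.py, +16 -4): the call sites in this diff now pass AirbyteStream values into stream_instance_map rather than Stream instances, and per the inline comment only name and namespace are read from them. A sketch of the construction pattern, assuming the 0.62.x imports shown in this diff:

    from airbyte_cdk.models import AirbyteStream, SyncMode
    from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager

    state_manager = ConnectorStateManager(
        stream_instance_map={
            name: AirbyteStream(
                name=name,
                namespace=None,
                json_schema={},  # placeholder; not read by the state manager
                supported_sync_modes=[SyncMode.full_refresh, SyncMode.incremental],
            )
            for name in ["stream1", "stream2"]
        },
        state=None,  # or a list of AirbyteStateMessage for an incremental sync
    )
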