airbyte-cdk 0.61.2__py3-none-any.whl → 0.62.1__py3-none-any.whl

Files changed (33)
  1. airbyte_cdk/sources/abstract_source.py +14 -33
  2. airbyte_cdk/sources/connector_state_manager.py +16 -4
  3. airbyte_cdk/sources/file_based/file_based_source.py +87 -35
  4. airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +3 -0
  5. airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +15 -13
  6. airbyte_cdk/sources/file_based/stream/concurrent/cursor/__init__.py +5 -0
  7. airbyte_cdk/sources/file_based/stream/concurrent/{cursor.py → cursor/abstract_concurrent_file_based_cursor.py} +22 -44
  8. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +279 -0
  9. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_noop_cursor.py +56 -0
  10. airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +11 -2
  11. airbyte_cdk/test/mock_http/mocker.py +3 -1
  12. airbyte_cdk/test/mock_http/response.py +9 -1
  13. airbyte_cdk/utils/traced_exception.py +1 -16
  14. {airbyte_cdk-0.61.2.dist-info → airbyte_cdk-0.62.1.dist-info}/METADATA +1 -1
  15. {airbyte_cdk-0.61.2.dist-info → airbyte_cdk-0.62.1.dist-info}/RECORD +33 -26
  16. unit_tests/sources/file_based/helpers.py +5 -0
  17. unit_tests/sources/file_based/scenarios/concurrent_incremental_scenarios.py +2860 -0
  18. unit_tests/sources/file_based/scenarios/incremental_scenarios.py +11 -0
  19. unit_tests/sources/file_based/scenarios/scenario_builder.py +6 -2
  20. unit_tests/sources/file_based/stream/concurrent/__init__.py +0 -0
  21. unit_tests/sources/file_based/stream/concurrent/test_adapters.py +365 -0
  22. unit_tests/sources/file_based/stream/concurrent/test_file_based_concurrent_cursor.py +462 -0
  23. unit_tests/sources/file_based/test_file_based_scenarios.py +45 -0
  24. unit_tests/sources/file_based/test_scenarios.py +16 -8
  25. unit_tests/sources/streams/concurrent/scenarios/stream_facade_builder.py +13 -2
  26. unit_tests/sources/test_abstract_source.py +36 -170
  27. unit_tests/sources/test_connector_state_manager.py +20 -13
  28. unit_tests/sources/test_integration_source.py +8 -25
  29. unit_tests/sources/test_source_read.py +1 -1
  30. unit_tests/test/mock_http/test_mocker.py +3 -1
  31. {airbyte_cdk-0.61.2.dist-info → airbyte_cdk-0.62.1.dist-info}/LICENSE.txt +0 -0
  32. {airbyte_cdk-0.61.2.dist-info → airbyte_cdk-0.62.1.dist-info}/WHEEL +0 -0
  33. {airbyte_cdk-0.61.2.dist-info → airbyte_cdk-0.62.1.dist-info}/top_level.txt +0 -0
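The centerpiece of this release is the new `FileBasedConcurrentCursor` (plus its no-op counterpart) for concurrent file-based syncs. Its stream state, exercised at length in the new test file below, keeps a `history` map of file URIs to last-modified timestamps plus a composite cursor value under `_ab_source_file_last_modified` of the form `{timestamp}_{uri}`. A minimal sketch of that serialization, inferred from the test data (the helper names are ours, not CDK API):

    from datetime import datetime
    from typing import Tuple

    # Matches FileBasedConcurrentCursor.DATE_TIME_FORMAT as used in the tests below.
    DATE_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"

    def serialize_cursor(last_modified: datetime, uri: str) -> str:
        # e.g. "2021-01-01T00:00:00.000000Z_a.csv"
        return f"{last_modified.strftime(DATE_TIME_FORMAT)}_{uri}"

    def parse_cursor(value: str) -> Tuple[datetime, str]:
        # The timestamp contains no underscore, so split on the first one;
        # URIs that themselves contain underscores survive intact.
        timestamp, _, uri = value.partition("_")
        return datetime.strptime(timestamp, DATE_TIME_FORMAT), uri

Comparing cursor values as (timestamp, uri) tuples rather than raw strings is what produces the deterministic URI tie-break seen in the `test_should_sync_file` cases below.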
unit_tests/sources/file_based/stream/concurrent/test_file_based_concurrent_cursor.py
@@ -0,0 +1,462 @@
+# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
+
+
+from datetime import datetime
+from typing import Any, Dict, List, MutableMapping, Optional, Tuple
+from unittest.mock import MagicMock
+
+import pytest
+from airbyte_cdk.models import AirbyteStateMessage, SyncMode
+from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
+from airbyte_cdk.sources.file_based.remote_file import RemoteFile
+from airbyte_cdk.sources.file_based.stream.concurrent.adapters import FileBasedStreamPartition
+from airbyte_cdk.sources.file_based.stream.concurrent.cursor import FileBasedConcurrentCursor
+from airbyte_cdk.sources.streams.concurrent.cursor import CursorField
+from freezegun import freeze_time
+
+DATE_TIME_FORMAT = FileBasedConcurrentCursor.DATE_TIME_FORMAT
+MOCK_DAYS_TO_SYNC_IF_HISTORY_IS_FULL = 3
+
+
+def _make_cursor(input_state: Optional[MutableMapping[str, Any]]) -> FileBasedConcurrentCursor:
+    stream = MagicMock()
+    stream.name = "test"
+    stream.namespace = None
+    stream_config = MagicMock()
+    stream_config.days_to_sync_if_history_is_full = MOCK_DAYS_TO_SYNC_IF_HISTORY_IS_FULL
+    cursor = FileBasedConcurrentCursor(
+        stream_config,
+        stream.name,
+        None,
+        input_state,
+        MagicMock(),
+        ConnectorStateManager(
+            stream_instance_map={stream.name: stream},
+            state=[AirbyteStateMessage.parse_obj(input_state)] if input_state is not None else None,
+        ),
+        CursorField(FileBasedConcurrentCursor.CURSOR_FIELD),
+    )
+    return cursor
+
+
+@pytest.mark.parametrize(
+    "input_state, expected_cursor_value",
+    [
+        pytest.param({}, (datetime.min, ""), id="no-state-gives-min-cursor"),
+        pytest.param({"history": {}}, (datetime.min, ""), id="missing-cursor-field-gives-min-cursor"),
+        pytest.param(
+            {
+                "history": {"a.csv": "2021-01-01T00:00:00.000000Z"},
+                "_ab_source_file_last_modified": "2021-01-01T00:00:00.000000Z_a.csv"
+            },
+            (datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT), "a.csv"),
+            id="cursor-value-matches-earliest-file",
+        ),
+        pytest.param(
+            {
+                "history": {"a.csv": "2021-01-01T00:00:00.000000Z"},
+                "_ab_source_file_last_modified": "2020-01-01T00:00:00.000000Z_a.csv"
+            },
+            (datetime.strptime("2020-01-01T00:00:00.000000Z", DATE_TIME_FORMAT), "a.csv"),
+            id="cursor-value-is-earlier",
+        ),
+        pytest.param(
+            {
+                "history": {"a.csv": "2022-01-01T00:00:00.000000Z"},
+                "_ab_source_file_last_modified": "2021-01-01T00:00:00.000000Z_a.csv"
+            },
+            (datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT), "a.csv"),
+            id="cursor-value-is-later",
+        ),
+        pytest.param(
+            {
+                "history": {
+                    "a.csv": "2021-01-01T00:00:00.000000Z",
+                    "b.csv": "2021-01-02T00:00:00.000000Z",
+                    "c.csv": "2021-01-03T00:00:00.000000Z"
+                },
+                "_ab_source_file_last_modified": "2021-01-04T00:00:00.000000Z_d.csv"
+            },
+            (datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT), "a.csv"),
+            id="cursor-not-earliest",
+        ),
+        pytest.param(
+            {
+                "history": {"b.csv": "2020-12-31T00:00:00.000000Z"},
+                "_ab_source_file_last_modified": "2021-01-01T00:00:00.000000Z_a.csv"
+            },
+            (datetime.strptime("2020-12-31T00:00:00.000000Z", DATE_TIME_FORMAT), "b.csv"),
+            id="state-with-cursor-and-earlier-history"
+        ),
+        pytest.param(
+            {
+                "history": {"b.csv": "2021-01-02T00:00:00.000000Z"},
+                "_ab_source_file_last_modified": "2021-01-01T00:00:00.000000Z_a.csv"
+            },
+            (datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT), "a.csv"),
+            id="state-with-cursor-and-later-history"
+        ),
+    ]
+)
+def test_compute_prev_sync_cursor(input_state: MutableMapping[str, Any], expected_cursor_value: Tuple[datetime, str]):
+    cursor = _make_cursor(input_state)
+    assert cursor._compute_prev_sync_cursor(input_state) == expected_cursor_value
+
+
+@pytest.mark.parametrize(
+    "initial_state, pending_files, file_to_add, expected_history, expected_pending_files, expected_cursor_value",
+    [
+        pytest.param(
+            {"history": {}},
+            [("newfile.csv", "2021-01-05T00:00:00.000000Z")],
+            ("newfile.csv", "2021-01-05T00:00:00.000000Z"),
+            {"newfile.csv": "2021-01-05T00:00:00.000000Z"},
+            [],
+            "2021-01-05T00:00:00.000000Z_newfile.csv",
+            id="add-to-empty-history-single-pending-file",
+        ),
+        pytest.param(
+            {"history": {}},
+            [("newfile.csv", "2021-01-05T00:00:00.000000Z"), ("pending.csv", "2020-01-05T00:00:00.000000Z")],
+            ("newfile.csv", "2021-01-05T00:00:00.000000Z"),
+            {"newfile.csv": "2021-01-05T00:00:00.000000Z"},
+            [("pending.csv", "2020-01-05T00:00:00.000000Z")],
+            "2020-01-05T00:00:00.000000Z_pending.csv",
+            id="add-to-empty-history-pending-file-is-older",
+        ),
+        pytest.param(
+            {"history": {}},
+            [("newfile.csv", "2021-01-05T00:00:00.000000Z"), ("pending.csv", "2022-01-05T00:00:00.000000Z")],
+            ("newfile.csv", "2021-01-05T00:00:00.000000Z"),
+            {"newfile.csv": "2021-01-05T00:00:00.000000Z"},
+            [("pending.csv", "2022-01-05T00:00:00.000000Z")],
+            "2022-01-05T00:00:00.000000Z_pending.csv",
+            id="add-to-empty-history-pending-file-is-newer",
+        ),
+        pytest.param(
+            {"history": {"existing.csv": "2021-01-04T00:00:00.000000Z"}},
+            [("newfile.csv", "2021-01-05T00:00:00.000000Z")],
+            ("newfile.csv", "2021-01-05T00:00:00.000000Z"),
+            {"existing.csv": "2021-01-04T00:00:00.000000Z", "newfile.csv": "2021-01-05T00:00:00.000000Z"},
+            [],
+            "2021-01-05T00:00:00.000000Z_newfile.csv",
+            id="add-to-nonempty-history-single-pending-file",
+        ),
+        pytest.param(
+            {"history": {"existing.csv": "2021-01-04T00:00:00.000000Z"}},
+            [("newfile.csv", "2021-01-05T00:00:00.000000Z"), ("pending.csv", "2020-01-05T00:00:00.000000Z")],
+            ("newfile.csv", "2021-01-05T00:00:00.000000Z"),
+            {"existing.csv": "2021-01-04T00:00:00.000000Z", "newfile.csv": "2021-01-05T00:00:00.000000Z"},
+            [("pending.csv", "2020-01-05T00:00:00.000000Z")],
+            "2020-01-05T00:00:00.000000Z_pending.csv",
+            id="add-to-nonempty-history-pending-file-is-older",
+        ),
+        pytest.param(
+            {"history": {"existing.csv": "2021-01-04T00:00:00.000000Z"}},
+            [("newfile.csv", "2021-01-05T00:00:00.000000Z"), ("pending.csv", "2022-01-05T00:00:00.000000Z")],
+            ("newfile.csv", "2021-01-05T00:00:00.000000Z"),
+            {"existing.csv": "2021-01-04T00:00:00.000000Z", "newfile.csv": "2021-01-05T00:00:00.000000Z"},
+            [("pending.csv", "2022-01-05T00:00:00.000000Z")],
+            "2022-01-05T00:00:00.000000Z_pending.csv",
+            id="add-to-nonempty-history-pending-file-is-newer",
+        ),
+    ]
+)
+def test_add_file(
+    initial_state: MutableMapping[str, Any],
+    pending_files: List[Tuple[str, str]],
+    file_to_add: Tuple[str, str],
+    expected_history: Dict[str, Any],
+    expected_pending_files: List[Tuple[str, str]],
+    expected_cursor_value: str,
+):
+    cursor = _make_cursor(initial_state)
+    mock_message_repository = MagicMock()
+    cursor._message_repository = mock_message_repository
+    stream = MagicMock()
+
+    cursor.set_pending_partitions([
+        FileBasedStreamPartition(
+            stream,
+            {"files": [RemoteFile(uri=uri, last_modified=datetime.strptime(timestamp, DATE_TIME_FORMAT))]},
+            mock_message_repository,
+            SyncMode.full_refresh,
+            FileBasedConcurrentCursor.CURSOR_FIELD,
+            initial_state,
+            cursor
+        ) for uri, timestamp in pending_files
+    ])
+
+    uri, timestamp = file_to_add
+    cursor.add_file(RemoteFile(uri=uri, last_modified=datetime.strptime(timestamp, DATE_TIME_FORMAT)))
+    assert cursor._file_to_datetime_history == expected_history
+    assert cursor._pending_files == {uri: RemoteFile(uri=uri, last_modified=datetime.strptime(timestamp, DATE_TIME_FORMAT)) for uri, timestamp in expected_pending_files}
+    assert mock_message_repository.emit_message.call_args_list[0].args[0].state.data["test"]["_ab_source_file_last_modified"] == expected_cursor_value
+
+
+@pytest.mark.parametrize(
+    "initial_state, pending_files, file_to_add, expected_history, expected_pending_files, expected_cursor_value",
+    [
+        pytest.param(
+            {"history": {}},
+            [],
+            ("newfile.csv", "2021-01-05T00:00:00.000000Z"),
+            {"newfile.csv": "2021-01-05T00:00:00.000000Z"},
+            [],
+            "2021-01-05T00:00:00.000000Z_newfile.csv",
+            id="add-to-empty-history-no-pending-files",
+        ),
+        pytest.param(
+            {"history": {}},
+            [("pending.csv", "2021-01-05T00:00:00.000000Z")],
+            ("newfile.csv", "2021-01-05T00:00:00.000000Z"),
+            {"newfile.csv": "2021-01-05T00:00:00.000000Z"},
+            [("pending.csv", "2021-01-05T00:00:00.000000Z")],
+            "2021-01-05T00:00:00.000000Z_pending.csv",
+            id="add-to-empty-history-file-not-in-pending-files",
+        ),
+    ]
+)
+def test_add_file_invalid(
+    initial_state: MutableMapping[str, Any],
+    pending_files: List[Tuple[str, str]],
+    file_to_add: Tuple[str, str],
+    expected_history: Dict[str, Any],
+    expected_pending_files: List[Tuple[str, str]],
+    expected_cursor_value: str,
+):
+    cursor = _make_cursor(initial_state)
+    cursor._pending_files = {uri: RemoteFile(uri=uri, last_modified=datetime.strptime(timestamp, DATE_TIME_FORMAT)) for uri, timestamp in pending_files}
+    mock_message_repository = MagicMock()
+    cursor._message_repository = mock_message_repository
+
+    uri, timestamp = file_to_add
+    cursor.add_file(RemoteFile(uri=uri, last_modified=datetime.strptime(timestamp, DATE_TIME_FORMAT)))
+    assert cursor._file_to_datetime_history == expected_history
+    assert cursor._pending_files == {uri: RemoteFile(uri=uri, last_modified=datetime.strptime(timestamp, DATE_TIME_FORMAT)) for uri, timestamp in expected_pending_files}
+    assert mock_message_repository.emit_message.call_args_list[0].args[0].log.level.value == "WARN"
+    assert mock_message_repository.emit_message.call_args_list[1].args[0].state.data["test"]["_ab_source_file_last_modified"] == expected_cursor_value
+
+
+@pytest.mark.parametrize(
+    "input_state, pending_files, expected_cursor_value",
+    [
+        pytest.param({}, [], f"{datetime.min.strftime('%Y-%m-%dT%H:%M:%S.%fZ')}_", id="no-state-no-pending"),
+        pytest.param(
+            {"history": {"a.csv": "2021-01-01T00:00:00.000000Z"}},
+            [],
+            "2021-01-01T00:00:00.000000Z_a.csv",
+            id="no-pending-with-history"
+        ),
+        pytest.param(
+            {"history": {}},
+            [("b.csv", "2021-01-02T00:00:00.000000Z")],
+            "2021-01-02T00:00:00.000000Z_b.csv",
+            id="pending-no-history"
+        ),
+        pytest.param(
+            {"history": {"a.csv": "2022-01-01T00:00:00.000000Z"}},
+            [("b.csv", "2021-01-02T00:00:00.000000Z")],
+            "2021-01-01T00:00:00.000000Z_a.csv",
+            id="with-pending-before-history"
+        ),
+        pytest.param(
+            {"history": {"a.csv": "2021-01-01T00:00:00.000000Z"}},
+            [("b.csv", "2022-01-02T00:00:00.000000Z")],
+            "2022-01-01T00:00:00.000000Z_a.csv",
+            id="with-pending-after-history"
+        ),
+    ]
+)
+def test_get_new_cursor_value(input_state: MutableMapping[str, Any], pending_files: List[Tuple[str, str]], expected_cursor_value: str):
+    cursor = _make_cursor(input_state)
+    pending_partitions = []
+    for url, timestamp in pending_files:
+        partition = MagicMock()
+        partition.to_slice = lambda *args, **kwargs: {"files": [RemoteFile(uri=url, last_modified=datetime.strptime(timestamp, DATE_TIME_FORMAT))]}
+        pending_partitions.append(partition)
+
+    cursor.set_pending_partitions(pending_partitions)
+
+
+@pytest.mark.parametrize(
+    "all_files, history, is_history_full, prev_cursor_value, expected_files_to_sync",
+    [
+        pytest.param(
+            [RemoteFile(uri="new.csv", last_modified=datetime.strptime("2021-01-03T00:00:00.000000Z", "%Y-%m-%dT%H:%M:%S.%fZ"))],
+            {},
+            False,
+            (datetime.min, ""),
+            ["new.csv"],
+            id="empty-history-one-new-file"
+        ),
+        pytest.param(
+            [RemoteFile(uri="a.csv", last_modified=datetime.strptime("2021-01-02T00:00:00.000000Z", "%Y-%m-%dT%H:%M:%S.%fZ"))],
+            {"a.csv": "2021-01-01T00:00:00.000000Z"},
+            False,
+            (datetime.min, ""),
+            ["a.csv"],
+            id="non-empty-history-file-in-history-modified"
+        ),
+        pytest.param(
+            [RemoteFile(uri="a.csv", last_modified=datetime.strptime("2021-01-01T00:00:00.000000Z", "%Y-%m-%dT%H:%M:%S.%fZ"))],
+            {"a.csv": "2021-01-01T00:00:00.000000Z"},
+            False,
+            (datetime.min, ""),
+            [],
+            id="non-empty-history-file-in-history-not-modified"
+        ),
+    ]
+)
+def test_get_files_to_sync(all_files, history, is_history_full, prev_cursor_value, expected_files_to_sync):
+    cursor = _make_cursor({})
+    cursor._file_to_datetime_history = history
+    cursor._prev_cursor_value = prev_cursor_value
+    cursor._is_history_full = MagicMock(return_value=is_history_full)
+    files_to_sync = list(cursor.get_files_to_sync(all_files, MagicMock()))
+    assert [f.uri for f in files_to_sync] == expected_files_to_sync
+
+
+@freeze_time("2023-06-16T00:00:00Z")
+@pytest.mark.parametrize(
+    "file_to_check, history, is_history_full, prev_cursor_value, sync_start, expected_should_sync",
+    [
+        pytest.param(
+            RemoteFile(uri="new.csv", last_modified=datetime.strptime("2021-01-03T00:00:00.000000Z", DATE_TIME_FORMAT)),
+            {},
+            False,
+            (datetime.min, ""),
+            datetime.min,
+            True,
+            id="file-not-in-history-not-full-old-cursor"
+        ),
+        pytest.param(
+            RemoteFile(uri="new.csv", last_modified=datetime.strptime("2021-01-03T00:00:00.000000Z", DATE_TIME_FORMAT)),
+            {},
+            False,
+            (datetime.strptime("2024-01-02T00:00:00.000000Z", DATE_TIME_FORMAT), ""),
+            datetime.min,
+            True,
+            id="file-not-in-history-not-full-new-cursor"
+        ),
+        pytest.param(
+            RemoteFile(uri="a.csv", last_modified=datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT)),
+            {"a.csv": "2021-01-01T00:00:00.000000Z"},
+            False,
+            (datetime.min, ""),
+            datetime.min,
+            False,
+            id="file-in-history-not-modified"
+        ),
+        pytest.param(
+            RemoteFile(uri="a.csv", last_modified=datetime.strptime("2020-01-01T00:00:00.000000Z", DATE_TIME_FORMAT)),
+            {"a.csv": "2021-01-01T00:00:00.000000Z"},
+            False,
+            (datetime.min, ""),
+            datetime.min,
+            False,
+            id="file-in-history-modified-before"
+        ),
+        pytest.param(
+            RemoteFile(uri="a.csv", last_modified=datetime.strptime("2022-01-01T00:00:00.000000Z", DATE_TIME_FORMAT)),
+            {"a.csv": "2021-01-01T00:00:00.000000Z"},
+            False,
+            (datetime.min, ""),
+            datetime.min,
+            True,
+            id="file-in-history-modified-after"
+        ),
+        pytest.param(
+            RemoteFile(uri="new.csv", last_modified=datetime.strptime("2022-01-01T00:00:00.000000Z", DATE_TIME_FORMAT)),
+            {},
+            True,
+            (datetime.strptime("2021-01-02T00:00:00.000000Z", DATE_TIME_FORMAT), "a.csv"),
+            datetime.min,
+            True,
+            id="history-full-file-modified-after-cursor"
+        ),
+        pytest.param(
+            RemoteFile(uri="new1.csv", last_modified=datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT)),
+            {},
+            True,
+            (datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT), "new0.csv"),
+            datetime.min,
+            True,
+            id="history-full-modified-eq-cursor-uri-gt"
+        ),
+        pytest.param(
+            RemoteFile(uri="new0.csv", last_modified=datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT)),
+            {},
+            True,
+            (datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT), "new1.csv"),
+            datetime.min,
+            False,
+            id="history-full-modified-eq-cursor-uri-lt"
+        ),
+        pytest.param(
+            RemoteFile(uri="new.csv", last_modified=datetime.strptime("2020-01-01T00:00:00.000000Z", DATE_TIME_FORMAT)),
+            {},
+            True,
+            (datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT), "a.csv"),
+            datetime.min,
+            True,
+            id="history-full-modified-before-cursor-and-after-sync-start"
+        ),
+        pytest.param(
+            RemoteFile(uri="new.csv", last_modified=datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT)),
+            {},
+            True,
+            (datetime.strptime("2022-01-01T00:00:00.000000Z", DATE_TIME_FORMAT), "a.csv"),
+            datetime.strptime("2024-01-01T00:00:00.000000Z", DATE_TIME_FORMAT),
+            False,
+            id="history-full-modified-before-cursor-and-before-sync-start"
+        ),
+    ]
+)
+def test_should_sync_file(
+    file_to_check: RemoteFile,
+    history: Dict[str, Any],
+    is_history_full: bool,
+    prev_cursor_value: Tuple[datetime, str],
+    sync_start: datetime,
+    expected_should_sync: bool,
+):
+    cursor = _make_cursor({})
+    cursor._file_to_datetime_history = history
+    cursor._prev_cursor_value = prev_cursor_value
+    cursor._sync_start = sync_start
+    cursor._is_history_full = MagicMock(return_value=is_history_full)
+    should_sync = cursor._should_sync_file(file_to_check, MagicMock())
+    assert should_sync == expected_should_sync
+
+
+@freeze_time("2023-06-16T00:00:00Z")
+@pytest.mark.parametrize(
+    "input_history, is_history_full, expected_start_time",
+    [
+        pytest.param({}, False, datetime.min, id="empty-history"),
+        pytest.param(
+            {"a.csv": "2021-01-01T00:00:00.000000Z"},
+            False,
+            datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT),
+            id="non-full-history"
+        ),
+        pytest.param(
+            {f"file{i}.csv": f"2021-01-0{i}T00:00:00.000000Z" for i in range(1, 4)},  # all before the time window
+            True,
+            datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT),  # Earliest file time
+            id="full-history-earliest-before-window"
+        ),
+        pytest.param(
+            {f"file{i}.csv": f"2024-01-0{i}T00:00:00.000000Z" for i in range(1, 4)},  # all after the time window
+            True,
+            datetime.strptime("2023-06-13T00:00:00.000000Z", DATE_TIME_FORMAT),  # Time window start time
+            id="full-history-earliest-after-window"
+        ),
+    ]
+)
+def test_compute_start_time(input_history, is_history_full, expected_start_time, monkeypatch):
+    cursor = _make_cursor({"history": input_history})
+    cursor._file_to_datetime_history = input_history
+    cursor._is_history_full = MagicMock(return_value=is_history_full)
+    assert cursor._compute_start_time() == expected_start_time
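Taken together, the `test_compute_prev_sync_cursor` cases pin down the rule: the previous sync's cursor is the minimum, ordered by (timestamp, URI), over the value stored in `_ab_source_file_last_modified` and every entry in `history`, defaulting to `(datetime.min, "")` when the state is empty. An illustrative re-implementation of that rule (not the CDK's code) that reproduces all eight cases:

    from datetime import datetime
    from typing import Any, Mapping, Optional, Tuple

    DATE_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"

    def compute_prev_sync_cursor(state: Optional[Mapping[str, Any]]) -> Tuple[datetime, str]:
        candidates = []
        if state:
            serialized = state.get("_ab_source_file_last_modified")
            if serialized:
                ts, _, uri = serialized.partition("_")
                candidates.append((datetime.strptime(ts, DATE_TIME_FORMAT), uri))
            for uri, ts in state.get("history", {}).items():
                candidates.append((datetime.strptime(ts, DATE_TIME_FORMAT), uri))
        # The earliest (timestamp, uri) pair wins; no candidates yields the zero value.
        return min(candidates, default=(datetime.min, ""))

For example, history {"b.csv": "2020-12-31..."} alongside cursor "2021-01-01..._a.csv" resolves to the earlier ("2020-12-31...", "b.csv"), matching the "state-with-cursor-and-earlier-history" case above.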
unit_tests/sources/file_based/test_file_based_scenarios.py
@@ -26,6 +26,29 @@ from unit_tests.sources.file_based.scenarios.check_scenarios import (
     success_multi_stream_scenario,
     success_user_provided_schema_scenario,
 )
+from unit_tests.sources.file_based.scenarios.concurrent_incremental_scenarios import (
+    multi_csv_different_timestamps_scenario_concurrent,
+    multi_csv_include_missing_files_within_history_range_concurrent_cursor_is_newer,
+    multi_csv_include_missing_files_within_history_range_concurrent_cursor_is_older,
+    multi_csv_per_timestamp_scenario_concurrent,
+    multi_csv_remove_old_files_if_history_is_full_scenario_concurrent_cursor_is_newer,
+    multi_csv_remove_old_files_if_history_is_full_scenario_concurrent_cursor_is_older,
+    multi_csv_same_timestamp_more_files_than_history_size_scenario_concurrent_cursor_is_newer,
+    multi_csv_same_timestamp_more_files_than_history_size_scenario_concurrent_cursor_is_older,
+    multi_csv_same_timestamp_scenario_concurrent,
+    multi_csv_skip_file_if_already_in_history_concurrent,
+    multi_csv_sync_files_within_history_time_window_if_history_is_incomplete_different_timestamps_scenario_concurrent_cursor_is_newer,
+    multi_csv_sync_files_within_history_time_window_if_history_is_incomplete_different_timestamps_scenario_concurrent_cursor_is_older,
+    multi_csv_sync_files_within_time_window_if_history_is_incomplete__different_timestamps_scenario_concurrent_cursor_is_newer,
+    multi_csv_sync_files_within_time_window_if_history_is_incomplete__different_timestamps_scenario_concurrent_cursor_is_older,
+    multi_csv_sync_recent_files_if_history_is_incomplete_scenario_concurrent_cursor_is_newer,
+    multi_csv_sync_recent_files_if_history_is_incomplete_scenario_concurrent_cursor_is_older,
+    single_csv_file_is_skipped_if_same_modified_at_as_in_history_concurrent,
+    single_csv_file_is_synced_if_modified_at_is_more_recent_than_in_history_concurrent,
+    single_csv_input_state_is_earlier_scenario_concurrent,
+    single_csv_input_state_is_later_scenario_concurrent,
+    single_csv_no_input_state_scenario_concurrent,
+)
 from unit_tests.sources.file_based.scenarios.csv_scenarios import (
     csv_autogenerate_column_names_scenario,
     csv_custom_bool_values_scenario,
@@ -214,6 +237,28 @@ discover_scenarios = [
     unstructured_invalid_file_type_discover_scenario_no_skip,
     unstructured_invalid_file_type_discover_scenario_skip,
     unstructured_invalid_file_type_read_scenario,
+    multi_csv_different_timestamps_scenario_concurrent,
+    multi_csv_include_missing_files_within_history_range_concurrent_cursor_is_newer,
+    multi_csv_include_missing_files_within_history_range_concurrent_cursor_is_older,
+    multi_csv_per_timestamp_scenario_concurrent,
+    multi_csv_remove_old_files_if_history_is_full_scenario_concurrent_cursor_is_newer,
+    multi_csv_remove_old_files_if_history_is_full_scenario_concurrent_cursor_is_older,
+    multi_csv_same_timestamp_more_files_than_history_size_scenario_concurrent_cursor_is_newer,
+    multi_csv_same_timestamp_more_files_than_history_size_scenario_concurrent_cursor_is_older,
+    multi_csv_same_timestamp_scenario_concurrent,
+    multi_csv_skip_file_if_already_in_history_concurrent,
+    multi_csv_sync_files_within_history_time_window_if_history_is_incomplete_different_timestamps_scenario_concurrent_cursor_is_newer,
+    multi_csv_sync_files_within_history_time_window_if_history_is_incomplete_different_timestamps_scenario_concurrent_cursor_is_older,
+    multi_csv_sync_files_within_time_window_if_history_is_incomplete__different_timestamps_scenario_concurrent_cursor_is_newer,
+    multi_csv_sync_files_within_time_window_if_history_is_incomplete__different_timestamps_scenario_concurrent_cursor_is_older,
+    multi_csv_sync_recent_files_if_history_is_incomplete_scenario_concurrent_cursor_is_newer,
+    multi_csv_sync_recent_files_if_history_is_incomplete_scenario_concurrent_cursor_is_older,
+    single_csv_file_is_skipped_if_same_modified_at_as_in_history_concurrent,
+    single_csv_file_is_synced_if_modified_at_is_more_recent_than_in_history_concurrent,
+    single_csv_input_state_is_earlier_scenario_concurrent,
+    single_csv_input_state_is_later_scenario_concurrent,
+    single_csv_no_input_state_scenario_concurrent,
+
 ]
 
 read_scenarios = discover_scenarios + [
unit_tests/sources/file_based/test_scenarios.py
@@ -13,6 +13,7 @@ from _pytest.reports import ExceptionInfo
 from airbyte_cdk.entrypoint import launch
 from airbyte_cdk.models import AirbyteAnalyticsTraceMessage, SyncMode
 from airbyte_cdk.sources import AbstractSource
+from airbyte_cdk.sources.file_based.stream.concurrent.cursor import AbstractConcurrentFileBasedCursor
 from airbyte_cdk.test.entrypoint_wrapper import EntrypointOutput
 from airbyte_cdk.test.entrypoint_wrapper import read as entrypoint_read
 from airbyte_cdk.utils.traced_exception import AirbyteTracedException
@@ -72,12 +73,10 @@ def assert_exception(expected_exception: type[BaseException], output: EntrypointOutput) -> None:
 def _verify_read_output(output: EntrypointOutput, scenario: TestScenario[AbstractSource]) -> None:
     records, log_messages = output.records_and_state_messages, output.logs
     logs = [message.log for message in log_messages if message.log.level.value in scenario.log_levels]
-    expected_records = scenario.expected_records
-
-    if expected_records is None:
+    if scenario.expected_records is None:
         return
 
-    assert len(records) == len(expected_records)
+    expected_records = [r for r in scenario.expected_records] if scenario.expected_records else []
 
     sorted_expected_records = sorted(
         filter(lambda e: "data" in e, expected_records),
@@ -87,6 +86,9 @@ def _verify_read_output(output: EntrypointOutput, scenario: TestScenario[AbstractSource]) -> None:
         filter(lambda r: r.record, records),
         key=lambda record: ",".join(f"{k}={v}" for k, v in sorted(record.record.data.items(), key=lambda items: (items[0], items[1])) if k != "emitted_at"),
     )
+
+    assert len(sorted_records) == len(sorted_expected_records)
+
     for actual, expected in zip(sorted_records, sorted_expected_records):
         if actual.record:
             assert len(actual.record.data) == len(expected["data"])
@@ -97,10 +99,16 @@ def _verify_read_output(output: EntrypointOutput, scenario: TestScenario[AbstractSource]) -> None:
                 assert value == expected["data"][key]
             assert actual.record.stream == expected["stream"]
 
-    expected_states = filter(lambda e: "data" not in e, expected_records)
-    states = filter(lambda r: r.state, records)
-    for actual, expected in zip(states, expected_states):  # states should be emitted in sorted order
-        assert actual.state.data == expected
+    expected_states = list(filter(lambda e: "data" not in e, expected_records))
+    states = list(filter(lambda r: r.state, records))
+
+    if hasattr(scenario.source, "cursor_cls") and issubclass(scenario.source.cursor_cls, AbstractConcurrentFileBasedCursor):
+        # Only check the last state emitted because we don't know the order the others will be in.
+        # This may be needed for non-file-based concurrent scenarios too.
+        assert states[-1].state.data == expected_states[-1]
+    else:
+        for actual, expected in zip(states, expected_states):  # states should be emitted in sorted order
+            assert actual.state.data == expected
 
     if scenario.expected_logs:
         read_logs = scenario.expected_logs.get("read")
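The relaxed assertion for concurrent file-based sources reflects that partitions complete in nondeterministic order, so intermediate state messages can legitimately differ between runs while the final state is stable. A toy illustration (file names and timestamps invented for the example):

    # With two files synced concurrently, the per-completion state snapshots
    # depend on which partition finishes first, but the final snapshot does not.
    ts_a, ts_b = "2021-01-01T00:00:00.000000Z", "2021-01-02T00:00:00.000000Z"

    a_then_b = [
        {"history": {"a.csv": ts_a}},
        {"history": {"a.csv": ts_a, "b.csv": ts_b}},
    ]
    b_then_a = [
        {"history": {"b.csv": ts_b}},
        {"history": {"a.csv": ts_a, "b.csv": ts_b}},
    ]

    assert a_then_b[0] != b_then_a[0]    # intermediate states are order-dependent
    assert a_then_b[-1] == b_then_a[-1]  # the last emitted state is deterministic

Hence `_verify_read_output` compares only `states[-1]` against `expected_states[-1]` when the source's `cursor_cls` is an `AbstractConcurrentFileBasedCursor`.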
unit_tests/sources/streams/concurrent/scenarios/stream_facade_builder.py
@@ -5,7 +5,14 @@ import concurrent
 import logging
 from typing import Any, List, Mapping, Optional, Tuple, Union
 
-from airbyte_cdk.models import AirbyteStateMessage, ConfiguredAirbyteCatalog, ConnectorSpecification, DestinationSyncMode, SyncMode
+from airbyte_cdk.models import (
+    AirbyteStateMessage,
+    AirbyteStream,
+    ConfiguredAirbyteCatalog,
+    ConnectorSpecification,
+    DestinationSyncMode,
+    SyncMode,
+)
 from airbyte_cdk.sources.concurrent_source.concurrent_source import ConcurrentSource
 from airbyte_cdk.sources.concurrent_source.concurrent_source_adapter import ConcurrentSourceAdapter
 from airbyte_cdk.sources.concurrent_source.thread_pool_manager import ThreadPoolManager
@@ -51,7 +58,11 @@ class StreamFacadeSource(ConcurrentSourceAdapter):
         return True, None
 
     def streams(self, config: Mapping[str, Any]) -> List[Stream]:
-        state_manager = ConnectorStateManager(stream_instance_map={s.name: s for s in self._streams}, state=self._state)
+        state_manager = ConnectorStateManager(
+            stream_instance_map={s.name: AirbyteStream(name=s.name, namespace=None, json_schema={}, supported_sync_modes=[SyncMode.full_refresh, SyncMode.incremental]) for s in self._streams},
+            state=self._state,
+        )  # The input values into the AirbyteStream are dummy values; the connector state manager only uses `name` and `namespace`
+
         state_converter = StreamFacadeConcurrentConnectorStateConverter()
         stream_states = [state_manager.get_stream_state(stream.name, stream.namespace) for stream in self._streams]
         return [
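This hunk tracks the `connector_state_manager.py` changes in this release, which let `stream_instance_map` carry lightweight `AirbyteStream` descriptors; per the comment in the diff, only `name` and `namespace` are read. A hedged sketch of the calling pattern (the helper function is ours; the constructor arguments mirror the diff above):

    from airbyte_cdk.models import AirbyteStream, SyncMode
    from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager

    def make_state_manager(stream_names, state=None):
        # json_schema and supported_sync_modes are dummy values; the state
        # manager only consults each stream's name and namespace.
        return ConnectorStateManager(
            stream_instance_map={
                name: AirbyteStream(
                    name=name,
                    namespace=None,
                    json_schema={},
                    supported_sync_modes=[SyncMode.full_refresh, SyncMode.incremental],
                )
                for name in stream_names
            },
            state=state,
        )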