airbyte-cdk 0.61.2__py3-none-any.whl → 0.62.1__py3-none-any.whl
- airbyte_cdk/sources/abstract_source.py +14 -33
- airbyte_cdk/sources/connector_state_manager.py +16 -4
- airbyte_cdk/sources/file_based/file_based_source.py +87 -35
- airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +3 -0
- airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +15 -13
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/__init__.py +5 -0
- airbyte_cdk/sources/file_based/stream/concurrent/{cursor.py → cursor/abstract_concurrent_file_based_cursor.py} +22 -44
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +279 -0
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_noop_cursor.py +56 -0
- airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +11 -2
- airbyte_cdk/test/mock_http/mocker.py +3 -1
- airbyte_cdk/test/mock_http/response.py +9 -1
- airbyte_cdk/utils/traced_exception.py +1 -16
- {airbyte_cdk-0.61.2.dist-info → airbyte_cdk-0.62.1.dist-info}/METADATA +1 -1
- {airbyte_cdk-0.61.2.dist-info → airbyte_cdk-0.62.1.dist-info}/RECORD +33 -26
- unit_tests/sources/file_based/helpers.py +5 -0
- unit_tests/sources/file_based/scenarios/concurrent_incremental_scenarios.py +2860 -0
- unit_tests/sources/file_based/scenarios/incremental_scenarios.py +11 -0
- unit_tests/sources/file_based/scenarios/scenario_builder.py +6 -2
- unit_tests/sources/file_based/stream/concurrent/__init__.py +0 -0
- unit_tests/sources/file_based/stream/concurrent/test_adapters.py +365 -0
- unit_tests/sources/file_based/stream/concurrent/test_file_based_concurrent_cursor.py +462 -0
- unit_tests/sources/file_based/test_file_based_scenarios.py +45 -0
- unit_tests/sources/file_based/test_scenarios.py +16 -8
- unit_tests/sources/streams/concurrent/scenarios/stream_facade_builder.py +13 -2
- unit_tests/sources/test_abstract_source.py +36 -170
- unit_tests/sources/test_connector_state_manager.py +20 -13
- unit_tests/sources/test_integration_source.py +8 -25
- unit_tests/sources/test_source_read.py +1 -1
- unit_tests/test/mock_http/test_mocker.py +3 -1
- {airbyte_cdk-0.61.2.dist-info → airbyte_cdk-0.62.1.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-0.61.2.dist-info → airbyte_cdk-0.62.1.dist-info}/WHEEL +0 -0
- {airbyte_cdk-0.61.2.dist-info → airbyte_cdk-0.62.1.dist-info}/top_level.txt +0 -0
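The headline change in this release is a concurrent cursor for file-based sources: `stream/concurrent/cursor.py` is split into a `cursor/` package containing an abstract base class, the real cursor, and a no-op cursor. A hedged sketch of the import surface the listing implies (`FileBasedNoopCursor` is inferred from its module name only; the other two names are confirmed by imports in the test diffs below):

from airbyte_cdk.sources.file_based.stream.concurrent.cursor import (
    AbstractConcurrentFileBasedCursor,  # confirmed below in test_scenarios.py
    FileBasedConcurrentCursor,          # confirmed below in the new unit tests
    FileBasedNoopCursor,                # inferred from file_based_noop_cursor.py
)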

unit_tests/sources/file_based/stream/concurrent/test_file_based_concurrent_cursor.py (new file)
@@ -0,0 +1,462 @@
+# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
+
+
+from datetime import datetime
+from typing import Any, Dict, List, MutableMapping, Optional, Tuple
+from unittest.mock import MagicMock
+
+import pytest
+from airbyte_cdk.models import AirbyteStateMessage, SyncMode
+from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
+from airbyte_cdk.sources.file_based.remote_file import RemoteFile
+from airbyte_cdk.sources.file_based.stream.concurrent.adapters import FileBasedStreamPartition
+from airbyte_cdk.sources.file_based.stream.concurrent.cursor import FileBasedConcurrentCursor
+from airbyte_cdk.sources.streams.concurrent.cursor import CursorField
+from freezegun import freeze_time
+
+DATE_TIME_FORMAT = FileBasedConcurrentCursor.DATE_TIME_FORMAT
+MOCK_DAYS_TO_SYNC_IF_HISTORY_IS_FULL = 3
+
+
+def _make_cursor(input_state: Optional[MutableMapping[str, Any]]) -> FileBasedConcurrentCursor:
+    stream = MagicMock()
+    stream.name = "test"
+    stream.namespace = None
+    stream_config = MagicMock()
+    stream_config.days_to_sync_if_history_is_full = MOCK_DAYS_TO_SYNC_IF_HISTORY_IS_FULL
+    cursor = FileBasedConcurrentCursor(
+        stream_config,
+        stream.name,
+        None,
+        input_state,
+        MagicMock(),
+        ConnectorStateManager(
+            stream_instance_map={stream.name: stream},
+            state=[AirbyteStateMessage.parse_obj(input_state)] if input_state is not None else None,
+        ),
+        CursorField(FileBasedConcurrentCursor.CURSOR_FIELD),
+    )
+    return cursor
+
+
+@pytest.mark.parametrize(
+    "input_state, expected_cursor_value",
+    [
+        pytest.param({}, (datetime.min, ""), id="no-state-gives-min-cursor"),
+        pytest.param({"history": {}}, (datetime.min, ""), id="missing-cursor-field-gives-min-cursor"),
+        pytest.param(
+            {
+                "history": {"a.csv": "2021-01-01T00:00:00.000000Z"},
+                "_ab_source_file_last_modified": "2021-01-01T00:00:00.000000Z_a.csv"
+            },
+            (datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT), "a.csv"),
+            id="cursor-value-matches-earliest-file",
+        ),
+        pytest.param(
+            {
+                "history": {"a.csv": "2021-01-01T00:00:00.000000Z"},
+                "_ab_source_file_last_modified": "2020-01-01T00:00:00.000000Z_a.csv"
+            },
+            (datetime.strptime("2020-01-01T00:00:00.000000Z", DATE_TIME_FORMAT), "a.csv"),
+            id="cursor-value-is-earlier",
+        ),
+        pytest.param(
+            {
+                "history": {"a.csv": "2022-01-01T00:00:00.000000Z"},
+                "_ab_source_file_last_modified": "2021-01-01T00:00:00.000000Z_a.csv"
+            },
+            (datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT), "a.csv"),
+            id="cursor-value-is-later",
+        ),
+        pytest.param(
+            {
+                "history": {
+                    "a.csv": "2021-01-01T00:00:00.000000Z",
+                    "b.csv": "2021-01-02T00:00:00.000000Z",
+                    "c.csv": "2021-01-03T00:00:00.000000Z"
+                },
+                "_ab_source_file_last_modified": "2021-01-04T00:00:00.000000Z_d.csv"
+            },
+            (datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT), "a.csv"),
+            id="cursor-not-earliest",
+        ),
+        pytest.param(
+            {
+                "history": {"b.csv": "2020-12-31T00:00:00.000000Z"},
+                "_ab_source_file_last_modified": "2021-01-01T00:00:00.000000Z_a.csv"
+            },
+            (datetime.strptime("2020-12-31T00:00:00.000000Z", DATE_TIME_FORMAT), "b.csv"),
+            id="state-with-cursor-and-earlier-history"
+        ),
+        pytest.param(
+            {
+                "history": {"b.csv": "2021-01-02T00:00:00.000000Z"},
+                "_ab_source_file_last_modified": "2021-01-01T00:00:00.000000Z_a.csv"
+            },
+            (datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT), "a.csv"),
+            id="state-with-cursor-and-later-history"
+        ),
+    ]
+)
+def test_compute_prev_sync_cursor(input_state: MutableMapping[str, Any], expected_cursor_value: Tuple[datetime, str]):
+    cursor = _make_cursor(input_state)
+    assert cursor._compute_prev_sync_cursor(input_state) == expected_cursor_value
+
+
+@pytest.mark.parametrize(
+    "initial_state, pending_files, file_to_add, expected_history, expected_pending_files, expected_cursor_value",
+    [
+        pytest.param(
+            {"history": {}},
+            [("newfile.csv", "2021-01-05T00:00:00.000000Z")],
+            ("newfile.csv", "2021-01-05T00:00:00.000000Z"),
+            {"newfile.csv": "2021-01-05T00:00:00.000000Z"},
+            [],
+            "2021-01-05T00:00:00.000000Z_newfile.csv",
+            id="add-to-empty-history-single-pending-file",
+        ),
+        pytest.param(
+            {"history": {}},
+            [("newfile.csv", "2021-01-05T00:00:00.000000Z"), ("pending.csv", "2020-01-05T00:00:00.000000Z")],
+            ("newfile.csv", "2021-01-05T00:00:00.000000Z"),
+            {"newfile.csv": "2021-01-05T00:00:00.000000Z"},
+            [("pending.csv", "2020-01-05T00:00:00.000000Z")],
+            "2020-01-05T00:00:00.000000Z_pending.csv",
+            id="add-to-empty-history-pending-file-is-older",
+        ),
+        pytest.param(
+            {"history": {}},
+            [("newfile.csv", "2021-01-05T00:00:00.000000Z"), ("pending.csv", "2022-01-05T00:00:00.000000Z")],
+            ("newfile.csv", "2021-01-05T00:00:00.000000Z"),
+            {"newfile.csv": "2021-01-05T00:00:00.000000Z"},
+            [("pending.csv", "2022-01-05T00:00:00.000000Z")],
+            "2022-01-05T00:00:00.000000Z_pending.csv",
+            id="add-to-empty-history-pending-file-is-newer",
+        ),
+        pytest.param(
+            {"history": {"existing.csv": "2021-01-04T00:00:00.000000Z"}},
+            [("newfile.csv", "2021-01-05T00:00:00.000000Z")],
+            ("newfile.csv", "2021-01-05T00:00:00.000000Z"),
+            {"existing.csv": "2021-01-04T00:00:00.000000Z", "newfile.csv": "2021-01-05T00:00:00.000000Z"},
+            [],
+            "2021-01-05T00:00:00.000000Z_newfile.csv",
+            id="add-to-nonempty-history-single-pending-file",
+        ),
+        pytest.param(
+            {"history": {"existing.csv": "2021-01-04T00:00:00.000000Z"}},
+            [("newfile.csv", "2021-01-05T00:00:00.000000Z"), ("pending.csv", "2020-01-05T00:00:00.000000Z")],
+            ("newfile.csv", "2021-01-05T00:00:00.000000Z"),
+            {"existing.csv": "2021-01-04T00:00:00.000000Z", "newfile.csv": "2021-01-05T00:00:00.000000Z"},
+            [("pending.csv", "2020-01-05T00:00:00.000000Z")],
+            "2020-01-05T00:00:00.000000Z_pending.csv",
+            id="add-to-nonempty-history-pending-file-is-older",
+        ),
+        pytest.param(
+            {"history": {"existing.csv": "2021-01-04T00:00:00.000000Z"}},
+            [("newfile.csv", "2021-01-05T00:00:00.000000Z"), ("pending.csv", "2022-01-05T00:00:00.000000Z")],
+            ("newfile.csv", "2021-01-05T00:00:00.000000Z"),
+            {"existing.csv": "2021-01-04T00:00:00.000000Z", "newfile.csv": "2021-01-05T00:00:00.000000Z"},
+            [("pending.csv", "2022-01-05T00:00:00.000000Z")],
+            "2022-01-05T00:00:00.000000Z_pending.csv",
+            id="add-to-nonempty-history-pending-file-is-newer",
+        ),
+    ]
+)
+def test_add_file(
+    initial_state: MutableMapping[str, Any],
+    pending_files: List[Tuple[str, str]],
+    file_to_add: Tuple[str, str],
+    expected_history: Dict[str, Any],
+    expected_pending_files: List[Tuple[str, str]],
+    expected_cursor_value: str,
+):
+    cursor = _make_cursor(initial_state)
+    mock_message_repository = MagicMock()
+    cursor._message_repository = mock_message_repository
+    stream = MagicMock()
+
+    cursor.set_pending_partitions([
+        FileBasedStreamPartition(
+            stream,
+            {"files": [RemoteFile(uri=uri, last_modified=datetime.strptime(timestamp, DATE_TIME_FORMAT))]},
+            mock_message_repository,
+            SyncMode.full_refresh,
+            FileBasedConcurrentCursor.CURSOR_FIELD,
+            initial_state,
+            cursor
+        ) for uri, timestamp in pending_files
+    ])
+
+    uri, timestamp = file_to_add
+    cursor.add_file(RemoteFile(uri=uri, last_modified=datetime.strptime(timestamp, DATE_TIME_FORMAT)))
+    assert cursor._file_to_datetime_history == expected_history
+    assert cursor._pending_files == {uri: RemoteFile(uri=uri, last_modified=datetime.strptime(timestamp, DATE_TIME_FORMAT)) for uri, timestamp in expected_pending_files}
+    assert mock_message_repository.emit_message.call_args_list[0].args[0].state.data["test"]["_ab_source_file_last_modified"] == expected_cursor_value
+
+
+@pytest.mark.parametrize(
+    "initial_state, pending_files, file_to_add, expected_history, expected_pending_files, expected_cursor_value",
+    [
+        pytest.param(
+            {"history": {}},
+            [],
+            ("newfile.csv", "2021-01-05T00:00:00.000000Z"),
+            {"newfile.csv": "2021-01-05T00:00:00.000000Z"},
+            [],
+            "2021-01-05T00:00:00.000000Z_newfile.csv",
+            id="add-to-empty-history-no-pending-files",
+        ),
+        pytest.param(
+            {"history": {}},
+            [("pending.csv", "2021-01-05T00:00:00.000000Z")],
+            ("newfile.csv", "2021-01-05T00:00:00.000000Z"),
+            {"newfile.csv": "2021-01-05T00:00:00.000000Z"},
+            [("pending.csv", "2021-01-05T00:00:00.000000Z")],
+            "2021-01-05T00:00:00.000000Z_pending.csv",
+            id="add-to-empty-history-file-not-in-pending-files",
+        ),
+    ]
+)
+def test_add_file_invalid(
+    initial_state: MutableMapping[str, Any],
+    pending_files: List[Tuple[str, str]],
+    file_to_add: Tuple[str, str],
+    expected_history: Dict[str, Any],
+    expected_pending_files: List[Tuple[str, str]],
+    expected_cursor_value: str,
+):
+    cursor = _make_cursor(initial_state)
+    cursor._pending_files = {uri: RemoteFile(uri=uri, last_modified=datetime.strptime(timestamp, DATE_TIME_FORMAT)) for uri, timestamp in pending_files}
+    mock_message_repository = MagicMock()
+    cursor._message_repository = mock_message_repository
+
+    uri, timestamp = file_to_add
+    cursor.add_file(RemoteFile(uri=uri, last_modified=datetime.strptime(timestamp, DATE_TIME_FORMAT)))
+    assert cursor._file_to_datetime_history == expected_history
+    assert cursor._pending_files == {uri: RemoteFile(uri=uri, last_modified=datetime.strptime(timestamp, DATE_TIME_FORMAT)) for uri, timestamp in expected_pending_files}
+    assert mock_message_repository.emit_message.call_args_list[0].args[0].log.level.value == "WARN"
+    assert mock_message_repository.emit_message.call_args_list[1].args[0].state.data["test"]["_ab_source_file_last_modified"] == expected_cursor_value
+
+
+@pytest.mark.parametrize(
+    "input_state, pending_files, expected_cursor_value",
+    [
+        pytest.param({}, [], f"{datetime.min.strftime('%Y-%m-%dT%H:%M:%S.%fZ')}_", id="no-state-no-pending"),
+        pytest.param(
+            {"history": {"a.csv": "2021-01-01T00:00:00.000000Z"}},
+            [],
+            "2021-01-01T00:00:00.000000Z_a.csv",
+            id="no-pending-with-history"
+        ),
+        pytest.param(
+            {"history": {}},
+            [("b.csv", "2021-01-02T00:00:00.000000Z")],
+            "2021-01-02T00:00:00.000000Z_b.csv",
+            id="pending-no-history"
+        ),
+        pytest.param(
+            {"history": {"a.csv": "2022-01-01T00:00:00.000000Z"}},
+            [("b.csv", "2021-01-02T00:00:00.000000Z")],
+            "2021-01-01T00:00:00.000000Z_a.csv",
+            id="with-pending-before-history"
+        ),
+        pytest.param(
+            {"history": {"a.csv": "2021-01-01T00:00:00.000000Z"}},
+            [("b.csv", "2022-01-02T00:00:00.000000Z")],
+            "2022-01-01T00:00:00.000000Z_a.csv",
+            id="with-pending-after-history"
+        ),
+    ]
+)
+def test_get_new_cursor_value(input_state: MutableMapping[str, Any], pending_files: List[Tuple[str, str]], expected_cursor_value: str):
+    cursor = _make_cursor(input_state)
+    pending_partitions = []
+    for url, timestamp in pending_files:
+        partition = MagicMock()
+        partition.to_slice = lambda *args, **kwargs: {"files": [RemoteFile(uri=url, last_modified=datetime.strptime(timestamp, DATE_TIME_FORMAT))]}
+        pending_partitions.append(partition)
+
+    cursor.set_pending_partitions(pending_partitions)
+
+
+@pytest.mark.parametrize(
+    "all_files, history, is_history_full, prev_cursor_value, expected_files_to_sync",
+    [
+        pytest.param(
+            [RemoteFile(uri="new.csv", last_modified=datetime.strptime("2021-01-03T00:00:00.000000Z", "%Y-%m-%dT%H:%M:%S.%fZ"))],
+            {},
+            False,
+            (datetime.min, ""),
+            ["new.csv"],
+            id="empty-history-one-new-file"
+        ),
+        pytest.param(
+            [RemoteFile(uri="a.csv", last_modified=datetime.strptime("2021-01-02T00:00:00.000000Z", "%Y-%m-%dT%H:%M:%S.%fZ"))],
+            {"a.csv": "2021-01-01T00:00:00.000000Z"},
+            False,
+            (datetime.min, ""),
+            ["a.csv"],
+            id="non-empty-history-file-in-history-modified"
+        ),
+        pytest.param(
+            [RemoteFile(uri="a.csv", last_modified=datetime.strptime("2021-01-01T00:00:00.000000Z", "%Y-%m-%dT%H:%M:%S.%fZ"))],
+            {"a.csv": "2021-01-01T00:00:00.000000Z"},
+            False,
+            (datetime.min, ""),
+            [],
+            id="non-empty-history-file-in-history-not-modified"
+        ),
+    ]
+)
+def test_get_files_to_sync(all_files, history, is_history_full, prev_cursor_value, expected_files_to_sync):
+    cursor = _make_cursor({})
+    cursor._file_to_datetime_history = history
+    cursor._prev_cursor_value = prev_cursor_value
+    cursor._is_history_full = MagicMock(return_value=is_history_full)
+    files_to_sync = list(cursor.get_files_to_sync(all_files, MagicMock()))
+    assert [f.uri for f in files_to_sync] == expected_files_to_sync
+
+
+@freeze_time("2023-06-16T00:00:00Z")
+@pytest.mark.parametrize(
+    "file_to_check, history, is_history_full, prev_cursor_value, sync_start, expected_should_sync",
+    [
+        pytest.param(
+            RemoteFile(uri="new.csv", last_modified=datetime.strptime("2021-01-03T00:00:00.000000Z", DATE_TIME_FORMAT)),
+            {},
+            False,
+            (datetime.min, ""),
+            datetime.min,
+            True,
+            id="file-not-in-history-not-full-old-cursor"
+        ),
+        pytest.param(
+            RemoteFile(uri="new.csv", last_modified=datetime.strptime("2021-01-03T00:00:00.000000Z", DATE_TIME_FORMAT)),
+            {},
+            False,
+            (datetime.strptime("2024-01-02T00:00:00.000000Z", DATE_TIME_FORMAT), ""),
+            datetime.min,
+            True,
+            id="file-not-in-history-not-full-new-cursor"
+        ),
+        pytest.param(
+            RemoteFile(uri="a.csv", last_modified=datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT)),
+            {"a.csv": "2021-01-01T00:00:00.000000Z"},
+            False,
+            (datetime.min, ""),
+            datetime.min,
+            False,
+            id="file-in-history-not-modified"
+        ),
+        pytest.param(
+            RemoteFile(uri="a.csv", last_modified=datetime.strptime("2020-01-01T00:00:00.000000Z", DATE_TIME_FORMAT)),
+            {"a.csv": "2021-01-01T00:00:00.000000Z"},
+            False,
+            (datetime.min, ""),
+            datetime.min,
+            False,
+            id="file-in-history-modified-before"
+        ),
+        pytest.param(
+            RemoteFile(uri="a.csv", last_modified=datetime.strptime("2022-01-01T00:00:00.000000Z", DATE_TIME_FORMAT)),
+            {"a.csv": "2021-01-01T00:00:00.000000Z"},
+            False,
+            (datetime.min, ""),
+            datetime.min,
+            True,
+            id="file-in-history-modified-after"
+        ),
+        pytest.param(
+            RemoteFile(uri="new.csv", last_modified=datetime.strptime("2022-01-01T00:00:00.000000Z", DATE_TIME_FORMAT)),
+            {},
+            True,
+            (datetime.strptime("2021-01-02T00:00:00.000000Z", DATE_TIME_FORMAT), "a.csv"),
+            datetime.min,
+            True,
+            id="history-full-file-modified-after-cursor"
+        ),
+        pytest.param(
+            RemoteFile(uri="new1.csv", last_modified=datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT)),
+            {},
+            True,
+            (datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT), "new0.csv"),
+            datetime.min,
+            True,
+            id="history-full-modified-eq-cursor-uri-gt"
+        ),
+        pytest.param(
+            RemoteFile(uri="new0.csv", last_modified=datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT)),
+            {},
+            True,
+            (datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT), "new1.csv"),
+            datetime.min,
+            False,
+            id="history-full-modified-eq-cursor-uri-lt"
+        ),
+        pytest.param(
+            RemoteFile(uri="new.csv", last_modified=datetime.strptime("2020-01-01T00:00:00.000000Z", DATE_TIME_FORMAT)),
+            {},
+            True,
+            (datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT), "a.csv"),
+            datetime.min,
+            True,
+            id="history-full-modified-before-cursor-and-after-sync-start"
+        ),
+        pytest.param(
+            RemoteFile(uri="new.csv", last_modified=datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT)),
+            {},
+            True,
+            (datetime.strptime("2022-01-01T00:00:00.000000Z", DATE_TIME_FORMAT), "a.csv"),
+            datetime.strptime("2024-01-01T00:00:00.000000Z", DATE_TIME_FORMAT),
+            False,
+            id="history-full-modified-before-cursor-and-before-sync-start"
+        ),
+    ]
+)
+def test_should_sync_file(
+    file_to_check: RemoteFile,
+    history: Dict[str, Any],
+    is_history_full: bool,
+    prev_cursor_value: Tuple[datetime, str],
+    sync_start: datetime,
+    expected_should_sync: bool,
+):
+    cursor = _make_cursor({})
+    cursor._file_to_datetime_history = history
+    cursor._prev_cursor_value = prev_cursor_value
+    cursor._sync_start = sync_start
+    cursor._is_history_full = MagicMock(return_value=is_history_full)
+    should_sync = cursor._should_sync_file(file_to_check, MagicMock())
+    assert should_sync == expected_should_sync
+
+
+@freeze_time("2023-06-16T00:00:00Z")
+@pytest.mark.parametrize(
+    "input_history, is_history_full, expected_start_time",
+    [
+        pytest.param({}, False, datetime.min, id="empty-history"),
+        pytest.param(
+            {"a.csv": "2021-01-01T00:00:00.000000Z"},
+            False,
+            datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT),
+            id="non-full-history"
+        ),
+        pytest.param(
+            {f"file{i}.csv": f"2021-01-0{i}T00:00:00.000000Z" for i in range(1, 4)},  # all before the time window
+            True,
+            datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT),  # Earliest file time
+            id="full-history-earliest-before-window"
+        ),
+        pytest.param(
+            {f"file{i}.csv": f"2024-01-0{i}T00:00:00.000000Z" for i in range(1, 4)},  # all after the time window
+            True,
+            datetime.strptime("2023-06-13T00:00:00.000000Z", DATE_TIME_FORMAT),  # Time window start time (3 days before the frozen clock)
+            id="full-history-earliest-after-window"
+        ),
+    ]
+)
+def test_compute_start_time(input_history, is_history_full, expected_start_time, monkeypatch):
+    cursor = _make_cursor({"history": input_history})
+    cursor._file_to_datetime_history = input_history
+    cursor._is_history_full = MagicMock(return_value=is_history_full)
+    assert cursor._compute_start_time() == expected_start_time

unit_tests/sources/file_based/test_file_based_scenarios.py
@@ -26,6 +26,29 @@ from unit_tests.sources.file_based.scenarios.check_scenarios import (
     success_multi_stream_scenario,
     success_user_provided_schema_scenario,
 )
+from unit_tests.sources.file_based.scenarios.concurrent_incremental_scenarios import (
+    multi_csv_different_timestamps_scenario_concurrent,
+    multi_csv_include_missing_files_within_history_range_concurrent_cursor_is_newer,
+    multi_csv_include_missing_files_within_history_range_concurrent_cursor_is_older,
+    multi_csv_per_timestamp_scenario_concurrent,
+    multi_csv_remove_old_files_if_history_is_full_scenario_concurrent_cursor_is_newer,
+    multi_csv_remove_old_files_if_history_is_full_scenario_concurrent_cursor_is_older,
+    multi_csv_same_timestamp_more_files_than_history_size_scenario_concurrent_cursor_is_newer,
+    multi_csv_same_timestamp_more_files_than_history_size_scenario_concurrent_cursor_is_older,
+    multi_csv_same_timestamp_scenario_concurrent,
+    multi_csv_skip_file_if_already_in_history_concurrent,
+    multi_csv_sync_files_within_history_time_window_if_history_is_incomplete_different_timestamps_scenario_concurrent_cursor_is_newer,
+    multi_csv_sync_files_within_history_time_window_if_history_is_incomplete_different_timestamps_scenario_concurrent_cursor_is_older,
+    multi_csv_sync_files_within_time_window_if_history_is_incomplete__different_timestamps_scenario_concurrent_cursor_is_newer,
+    multi_csv_sync_files_within_time_window_if_history_is_incomplete__different_timestamps_scenario_concurrent_cursor_is_older,
+    multi_csv_sync_recent_files_if_history_is_incomplete_scenario_concurrent_cursor_is_newer,
+    multi_csv_sync_recent_files_if_history_is_incomplete_scenario_concurrent_cursor_is_older,
+    single_csv_file_is_skipped_if_same_modified_at_as_in_history_concurrent,
+    single_csv_file_is_synced_if_modified_at_is_more_recent_than_in_history_concurrent,
+    single_csv_input_state_is_earlier_scenario_concurrent,
+    single_csv_input_state_is_later_scenario_concurrent,
+    single_csv_no_input_state_scenario_concurrent,
+)
 from unit_tests.sources.file_based.scenarios.csv_scenarios import (
     csv_autogenerate_column_names_scenario,
     csv_custom_bool_values_scenario,
@@ -214,6 +237,28 @@ discover_scenarios = [
     unstructured_invalid_file_type_discover_scenario_no_skip,
     unstructured_invalid_file_type_discover_scenario_skip,
     unstructured_invalid_file_type_read_scenario,
+    multi_csv_different_timestamps_scenario_concurrent,
+    multi_csv_include_missing_files_within_history_range_concurrent_cursor_is_newer,
+    multi_csv_include_missing_files_within_history_range_concurrent_cursor_is_older,
+    multi_csv_per_timestamp_scenario_concurrent,
+    multi_csv_remove_old_files_if_history_is_full_scenario_concurrent_cursor_is_newer,
+    multi_csv_remove_old_files_if_history_is_full_scenario_concurrent_cursor_is_older,
+    multi_csv_same_timestamp_more_files_than_history_size_scenario_concurrent_cursor_is_newer,
+    multi_csv_same_timestamp_more_files_than_history_size_scenario_concurrent_cursor_is_older,
+    multi_csv_same_timestamp_scenario_concurrent,
+    multi_csv_skip_file_if_already_in_history_concurrent,
+    multi_csv_sync_files_within_history_time_window_if_history_is_incomplete_different_timestamps_scenario_concurrent_cursor_is_newer,
+    multi_csv_sync_files_within_history_time_window_if_history_is_incomplete_different_timestamps_scenario_concurrent_cursor_is_older,
+    multi_csv_sync_files_within_time_window_if_history_is_incomplete__different_timestamps_scenario_concurrent_cursor_is_newer,
+    multi_csv_sync_files_within_time_window_if_history_is_incomplete__different_timestamps_scenario_concurrent_cursor_is_older,
+    multi_csv_sync_recent_files_if_history_is_incomplete_scenario_concurrent_cursor_is_newer,
+    multi_csv_sync_recent_files_if_history_is_incomplete_scenario_concurrent_cursor_is_older,
+    single_csv_file_is_skipped_if_same_modified_at_as_in_history_concurrent,
+    single_csv_file_is_synced_if_modified_at_is_more_recent_than_in_history_concurrent,
+    single_csv_input_state_is_earlier_scenario_concurrent,
+    single_csv_input_state_is_later_scenario_concurrent,
+    single_csv_no_input_state_scenario_concurrent,
+
 ]
 
 read_scenarios = discover_scenarios + [

unit_tests/sources/file_based/test_scenarios.py
@@ -13,6 +13,7 @@ from _pytest.reports import ExceptionInfo
 from airbyte_cdk.entrypoint import launch
 from airbyte_cdk.models import AirbyteAnalyticsTraceMessage, SyncMode
 from airbyte_cdk.sources import AbstractSource
+from airbyte_cdk.sources.file_based.stream.concurrent.cursor import AbstractConcurrentFileBasedCursor
 from airbyte_cdk.test.entrypoint_wrapper import EntrypointOutput
 from airbyte_cdk.test.entrypoint_wrapper import read as entrypoint_read
 from airbyte_cdk.utils.traced_exception import AirbyteTracedException
@@ -72,12 +73,10 @@ def assert_exception(expected_exception: type[BaseException], output: Entrypoint
 def _verify_read_output(output: EntrypointOutput, scenario: TestScenario[AbstractSource]) -> None:
     records, log_messages = output.records_and_state_messages, output.logs
     logs = [message.log for message in log_messages if message.log.level.value in scenario.log_levels]
-
-
-    if expected_records is None:
+    if scenario.expected_records is None:
         return
 
-
+    expected_records = [r for r in scenario.expected_records] if scenario.expected_records else []
 
     sorted_expected_records = sorted(
         filter(lambda e: "data" in e, expected_records),
@@ -87,6 +86,9 @@ def _verify_read_output(output: EntrypointOutput, scenario: TestScenario[Abstrac
         filter(lambda r: r.record, records),
         key=lambda record: ",".join(f"{k}={v}" for k, v in sorted(record.record.data.items(), key=lambda items: (items[0], items[1])) if k != "emitted_at"),
     )
+
+    assert len(sorted_records) == len(sorted_expected_records)
+
     for actual, expected in zip(sorted_records, sorted_expected_records):
         if actual.record:
             assert len(actual.record.data) == len(expected["data"])
@@ -97,10 +99,16 @@ def _verify_read_output(output: EntrypointOutput, scenario: TestScenario[Abstrac
                 assert value == expected["data"][key]
             assert actual.record.stream == expected["stream"]
 
-    expected_states = filter(lambda e: "data" not in e, expected_records)
-    states = filter(lambda r: r.state, records)
-
-
+    expected_states = list(filter(lambda e: "data" not in e, expected_records))
+    states = list(filter(lambda r: r.state, records))
+
+    if hasattr(scenario.source, "cursor_cls") and issubclass(scenario.source.cursor_cls, AbstractConcurrentFileBasedCursor):
+        # Only check the last state emitted because we don't know the order the others will be in.
+        # This may be needed for non-file-based concurrent scenarios too.
+        assert states[-1].state.data == expected_states[-1]
+    else:
+        for actual, expected in zip(states, expected_states):  # states should be emitted in sorted order
+            assert actual.state.data == expected
 
     if scenario.expected_logs:
         read_logs = scenario.expected_logs.get("read")
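
The branch added above encodes the main consequence of concurrency for state verification: partitions complete in arbitrary order, so intermediate state messages are nondeterministic and only the final one can be compared. A small illustration with made-up cursor values:

# Two interleavings the same concurrent sync could produce; element-wise
# comparison would be flaky, but the final emitted state agrees.
run_a = ["2021-01-01T00:00:00.000000Z_a.csv", "2021-01-02T00:00:00.000000Z_b.csv"]
run_b = ["2021-01-02T00:00:00.000000Z_b.csv", "2021-01-02T00:00:00.000000Z_b.csv"]
assert run_a[-1] == run_b[-1]  # only the last message is checked, as in the diff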

unit_tests/sources/streams/concurrent/scenarios/stream_facade_builder.py
@@ -5,7 +5,14 @@ import concurrent
 import logging
 from typing import Any, List, Mapping, Optional, Tuple, Union
 
-from airbyte_cdk.models import
+from airbyte_cdk.models import (
+    AirbyteStateMessage,
+    AirbyteStream,
+    ConfiguredAirbyteCatalog,
+    ConnectorSpecification,
+    DestinationSyncMode,
+    SyncMode,
+)
 from airbyte_cdk.sources.concurrent_source.concurrent_source import ConcurrentSource
 from airbyte_cdk.sources.concurrent_source.concurrent_source_adapter import ConcurrentSourceAdapter
 from airbyte_cdk.sources.concurrent_source.thread_pool_manager import ThreadPoolManager
@@ -51,7 +58,11 @@ class StreamFacadeSource(ConcurrentSourceAdapter):
         return True, None
 
     def streams(self, config: Mapping[str, Any]) -> List[Stream]:
-        state_manager = ConnectorStateManager(
+        state_manager = ConnectorStateManager(
+            stream_instance_map={s.name: AirbyteStream(name=s.name, namespace=None, json_schema={}, supported_sync_modes=[SyncMode.full_refresh, SyncMode.incremental]) for s in self._streams},
+            state=self._state,
+        )  # The input values into the AirbyteStream are dummy values; the connector state manager only uses `name` and `namespace`
+
         state_converter = StreamFacadeConcurrentConnectorStateConverter()
         stream_states = [state_manager.get_stream_state(stream.name, stream.namespace) for stream in self._streams]
         return [
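
The trailing comment in this hunk explains the design choice: `ConnectorStateManager` only reads `name` and `namespace` from each `AirbyteStream`, so the remaining fields can be dummies. A minimal usage sketch, assuming the 0.62.x API exactly as shown above:

from airbyte_cdk.models import AirbyteStream, SyncMode
from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager

dummy = AirbyteStream(name="s1", json_schema={}, supported_sync_modes=[SyncMode.full_refresh])
manager = ConnectorStateManager(stream_instance_map={"s1": dummy}, state=None)
assert manager.get_stream_state("s1", None) == {}  # no prior state was supplied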
|