airbyte-cdk 0.61.2__py3-none-any.whl → 0.62.1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registries.
- airbyte_cdk/sources/abstract_source.py +14 -33
- airbyte_cdk/sources/connector_state_manager.py +16 -4
- airbyte_cdk/sources/file_based/file_based_source.py +87 -35
- airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +3 -0
- airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +15 -13
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/__init__.py +5 -0
- airbyte_cdk/sources/file_based/stream/concurrent/{cursor.py → cursor/abstract_concurrent_file_based_cursor.py} +22 -44
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +279 -0
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_noop_cursor.py +56 -0
- airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +11 -2
- airbyte_cdk/test/mock_http/mocker.py +3 -1
- airbyte_cdk/test/mock_http/response.py +9 -1
- airbyte_cdk/utils/traced_exception.py +1 -16
- {airbyte_cdk-0.61.2.dist-info → airbyte_cdk-0.62.1.dist-info}/METADATA +1 -1
- {airbyte_cdk-0.61.2.dist-info → airbyte_cdk-0.62.1.dist-info}/RECORD +33 -26
- unit_tests/sources/file_based/helpers.py +5 -0
- unit_tests/sources/file_based/scenarios/concurrent_incremental_scenarios.py +2860 -0
- unit_tests/sources/file_based/scenarios/incremental_scenarios.py +11 -0
- unit_tests/sources/file_based/scenarios/scenario_builder.py +6 -2
- unit_tests/sources/file_based/stream/concurrent/__init__.py +0 -0
- unit_tests/sources/file_based/stream/concurrent/test_adapters.py +365 -0
- unit_tests/sources/file_based/stream/concurrent/test_file_based_concurrent_cursor.py +462 -0
- unit_tests/sources/file_based/test_file_based_scenarios.py +45 -0
- unit_tests/sources/file_based/test_scenarios.py +16 -8
- unit_tests/sources/streams/concurrent/scenarios/stream_facade_builder.py +13 -2
- unit_tests/sources/test_abstract_source.py +36 -170
- unit_tests/sources/test_connector_state_manager.py +20 -13
- unit_tests/sources/test_integration_source.py +8 -25
- unit_tests/sources/test_source_read.py +1 -1
- unit_tests/test/mock_http/test_mocker.py +3 -1
- {airbyte_cdk-0.61.2.dist-info → airbyte_cdk-0.62.1.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-0.61.2.dist-info → airbyte_cdk-0.62.1.dist-info}/WHEEL +0 -0
- {airbyte_cdk-0.61.2.dist-info → airbyte_cdk-0.62.1.dist-info}/top_level.txt +0 -0
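
Most of this release introduces a concurrent incremental cursor for file-based sources (file_based_concurrent_cursor.py, file_based_noop_cursor.py, and their tests). As a reading aid for the hunks below, this is the per-stream state shape the new cursor reads and writes; the file name and timestamps are illustrative values taken from the test cases, not a schema definition:

input_state = {
    # uri -> last-modified timestamp of each file that has already been synced
    "history": {"a.csv": "2021-01-01T00:00:00.000000Z"},
    # composite cursor value, formatted as "<last_modified>_<uri>"
    "_ab_source_file_last_modified": "2021-01-01T00:00:00.000000Z_a.csv",
}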
unit_tests/sources/file_based/stream/concurrent/test_file_based_concurrent_cursor.py
@@ -0,0 +1,462 @@
+# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
+
+
+from datetime import datetime
+from typing import Any, Dict, List, MutableMapping, Optional, Tuple
+from unittest.mock import MagicMock
+
+import pytest
+from airbyte_cdk.models import AirbyteStateMessage, SyncMode
+from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
+from airbyte_cdk.sources.file_based.remote_file import RemoteFile
+from airbyte_cdk.sources.file_based.stream.concurrent.adapters import FileBasedStreamPartition
+from airbyte_cdk.sources.file_based.stream.concurrent.cursor import FileBasedConcurrentCursor
+from airbyte_cdk.sources.streams.concurrent.cursor import CursorField
+from freezegun import freeze_time
+
+DATE_TIME_FORMAT = FileBasedConcurrentCursor.DATE_TIME_FORMAT
+MOCK_DAYS_TO_SYNC_IF_HISTORY_IS_FULL = 3
+
+
+def _make_cursor(input_state: Optional[MutableMapping[str, Any]]) -> FileBasedConcurrentCursor:
+    stream = MagicMock()
+    stream.name = "test"
+    stream.namespace = None
+    stream_config = MagicMock()
+    stream_config.days_to_sync_if_history_is_full = MOCK_DAYS_TO_SYNC_IF_HISTORY_IS_FULL
+    cursor = FileBasedConcurrentCursor(
+        stream_config,
+        stream.name,
+        None,
+        input_state,
+        MagicMock(),
+        ConnectorStateManager(
+            stream_instance_map={stream.name: stream},
+            state=[AirbyteStateMessage.parse_obj(input_state)] if input_state is not None else None,
+        ),
+        CursorField(FileBasedConcurrentCursor.CURSOR_FIELD),
+    )
+    return cursor
+
+
+@pytest.mark.parametrize(
+    "input_state, expected_cursor_value",
+    [
+        pytest.param({}, (datetime.min, ""), id="no-state-gives-min-cursor"),
+        pytest.param({"history": {}}, (datetime.min, ""), id="missing-cursor-field-gives-min-cursor"),
+        pytest.param(
+            {
+                "history": {"a.csv": "2021-01-01T00:00:00.000000Z"},
+                "_ab_source_file_last_modified": "2021-01-01T00:00:00.000000Z_a.csv"
+            },
+            (datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT), "a.csv"),
+            id="cursor-value-matches-earliest-file",
+        ),
+        pytest.param(
+            {
+                "history": {"a.csv": "2021-01-01T00:00:00.000000Z"},
+                "_ab_source_file_last_modified": "2020-01-01T00:00:00.000000Z_a.csv"
+            },
+            (datetime.strptime("2020-01-01T00:00:00.000000Z", DATE_TIME_FORMAT), "a.csv"),
+            id="cursor-value-is-earlier",
+        ),
+        pytest.param(
+            {
+                "history": {"a.csv": "2022-01-01T00:00:00.000000Z"},
+                "_ab_source_file_last_modified": "2021-01-01T00:00:00.000000Z_a.csv"
+            },
+            (datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT), "a.csv"),
+            id="cursor-value-is-later",
+        ),
+        pytest.param(
+            {
+                "history": {
+                    "a.csv": "2021-01-01T00:00:00.000000Z",
+                    "b.csv": "2021-01-02T00:00:00.000000Z",
+                    "c.csv": "2021-01-03T00:00:00.000000Z"
+                },
+                "_ab_source_file_last_modified": "2021-01-04T00:00:00.000000Z_d.csv"
+            },
+            (datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT), "a.csv"),
+            id="cursor-not-earliest",
+        ),
+        pytest.param(
+            {
+                "history": {"b.csv": "2020-12-31T00:00:00.000000Z"},
+                "_ab_source_file_last_modified": "2021-01-01T00:00:00.000000Z_a.csv"
+            },
+            (datetime.strptime("2020-12-31T00:00:00.000000Z", DATE_TIME_FORMAT), "b.csv"),
+            id="state-with-cursor-and-earlier-history"
+        ),
+        pytest.param(
+            {
+                "history": {"b.csv": "2021-01-02T00:00:00.000000Z"},
+                "_ab_source_file_last_modified": "2021-01-01T00:00:00.000000Z_a.csv"
+            },
+            (datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT), "a.csv"),
+            id="state-with-cursor-and-later-history"
+        ),
+    ]
+)
+def test_compute_prev_sync_cursor(input_state: MutableMapping[str, Any], expected_cursor_value: Tuple[datetime, str]):
+    cursor = _make_cursor(input_state)
+    assert cursor._compute_prev_sync_cursor(input_state) == expected_cursor_value
+
+
+@pytest.mark.parametrize(
+    "initial_state, pending_files, file_to_add, expected_history, expected_pending_files, expected_cursor_value",
+    [
+        pytest.param(
+            {"history": {}},
+            [("newfile.csv", "2021-01-05T00:00:00.000000Z")],
+            ("newfile.csv", "2021-01-05T00:00:00.000000Z"),
+            {"newfile.csv": "2021-01-05T00:00:00.000000Z"},
+            [],
+            "2021-01-05T00:00:00.000000Z_newfile.csv",
+            id="add-to-empty-history-single-pending-file",
+        ),
+        pytest.param(
+            {"history": {}},
+            [("newfile.csv", "2021-01-05T00:00:00.000000Z"), ("pending.csv", "2020-01-05T00:00:00.000000Z")],
+            ("newfile.csv", "2021-01-05T00:00:00.000000Z"),
+            {"newfile.csv": "2021-01-05T00:00:00.000000Z"},
+            [("pending.csv", "2020-01-05T00:00:00.000000Z")],
+            "2020-01-05T00:00:00.000000Z_pending.csv",
+            id="add-to-empty-history-pending-file-is-older",
+        ),
+        pytest.param(
+            {"history": {}},
+            [("newfile.csv", "2021-01-05T00:00:00.000000Z"), ("pending.csv", "2022-01-05T00:00:00.000000Z")],
+            ("newfile.csv", "2021-01-05T00:00:00.000000Z"),
+            {"newfile.csv": "2021-01-05T00:00:00.000000Z"},
+            [("pending.csv", "2022-01-05T00:00:00.000000Z")],
+            "2022-01-05T00:00:00.000000Z_pending.csv",
+            id="add-to-empty-history-pending-file-is-newer",
+        ),
+        pytest.param(
+            {"history": {"existing.csv": "2021-01-04T00:00:00.000000Z"}},
+            [("newfile.csv", "2021-01-05T00:00:00.000000Z")],
+            ("newfile.csv", "2021-01-05T00:00:00.000000Z"),
+            {"existing.csv": "2021-01-04T00:00:00.000000Z", "newfile.csv": "2021-01-05T00:00:00.000000Z"},
+            [],
+            "2021-01-05T00:00:00.000000Z_newfile.csv",
+            id="add-to-nonempty-history-single-pending-file",
+        ),
+        pytest.param(
+            {"history": {"existing.csv": "2021-01-04T00:00:00.000000Z"}},
+            [("newfile.csv", "2021-01-05T00:00:00.000000Z"), ("pending.csv", "2020-01-05T00:00:00.000000Z")],
+            ("newfile.csv", "2021-01-05T00:00:00.000000Z"),
+            {"existing.csv": "2021-01-04T00:00:00.000000Z", "newfile.csv": "2021-01-05T00:00:00.000000Z"},
+            [("pending.csv", "2020-01-05T00:00:00.000000Z")],
+            "2020-01-05T00:00:00.000000Z_pending.csv",
+            id="add-to-nonempty-history-pending-file-is-older",
+        ),
+        pytest.param(
+            {"history": {"existing.csv": "2021-01-04T00:00:00.000000Z"}},
+            [("newfile.csv", "2021-01-05T00:00:00.000000Z"), ("pending.csv", "2022-01-05T00:00:00.000000Z")],
+            ("newfile.csv", "2021-01-05T00:00:00.000000Z"),
+            {"existing.csv": "2021-01-04T00:00:00.000000Z", "newfile.csv": "2021-01-05T00:00:00.000000Z"},
+            [("pending.csv", "2022-01-05T00:00:00.000000Z")],
+            "2022-01-05T00:00:00.000000Z_pending.csv",
+            id="add-to-nonempty-history-pending-file-is-newer",
+        ),
+    ]
+)
+def test_add_file(
+    initial_state: MutableMapping[str, Any],
+    pending_files: List[Tuple[str, str]],
+    file_to_add: Tuple[str, str],
+    expected_history: Dict[str, Any],
+    expected_pending_files: List[Tuple[str, str]],
+    expected_cursor_value: str,
+):
+    cursor = _make_cursor(initial_state)
+    mock_message_repository = MagicMock()
+    cursor._message_repository = mock_message_repository
+    stream = MagicMock()
+
+    cursor.set_pending_partitions([
+        FileBasedStreamPartition(
+            stream,
+            {"files": [RemoteFile(uri=uri, last_modified=datetime.strptime(timestamp, DATE_TIME_FORMAT))]},
+            mock_message_repository,
+            SyncMode.full_refresh,
+            FileBasedConcurrentCursor.CURSOR_FIELD,
+            initial_state,
+            cursor
+        ) for uri, timestamp in pending_files
+    ])
+
+    uri, timestamp = file_to_add
+    cursor.add_file(RemoteFile(uri=uri, last_modified=datetime.strptime(timestamp, DATE_TIME_FORMAT)))
+    assert cursor._file_to_datetime_history == expected_history
+    assert cursor._pending_files == {uri: RemoteFile(uri=uri, last_modified=datetime.strptime(timestamp, DATE_TIME_FORMAT)) for uri, timestamp in expected_pending_files}
+    assert mock_message_repository.emit_message.call_args_list[0].args[0].state.data["test"]["_ab_source_file_last_modified"] == expected_cursor_value
+
+
+@pytest.mark.parametrize(
+    "initial_state, pending_files, file_to_add, expected_history, expected_pending_files, expected_cursor_value",
+    [
+        pytest.param(
+            {"history": {}},
+            [],
+            ("newfile.csv", "2021-01-05T00:00:00.000000Z"),
+            {"newfile.csv": "2021-01-05T00:00:00.000000Z"},
+            [],
+            "2021-01-05T00:00:00.000000Z_newfile.csv",
+            id="add-to-empty-history-no-pending-files",
+        ),
+        pytest.param(
+            {"history": {}},
+            [("pending.csv", "2021-01-05T00:00:00.000000Z")],
+            ("newfile.csv", "2021-01-05T00:00:00.000000Z"),
+            {"newfile.csv": "2021-01-05T00:00:00.000000Z"},
+            [("pending.csv", "2021-01-05T00:00:00.000000Z")],
+            "2021-01-05T00:00:00.000000Z_pending.csv",
+            id="add-to-empty-history-file-not-in-pending-files",
+        ),
+    ]
+)
+def test_add_file_invalid(
+    initial_state: MutableMapping[str, Any],
+    pending_files: List[Tuple[str, str]],
+    file_to_add: Tuple[str, str],
+    expected_history: Dict[str, Any],
+    expected_pending_files: List[Tuple[str, str]],
+    expected_cursor_value: str,
+):
+    cursor = _make_cursor(initial_state)
+    cursor._pending_files = {uri: RemoteFile(uri=uri, last_modified=datetime.strptime(timestamp, DATE_TIME_FORMAT)) for uri, timestamp in pending_files}
+    mock_message_repository = MagicMock()
+    cursor._message_repository = mock_message_repository
+
+    uri, timestamp = file_to_add
+    cursor.add_file(RemoteFile(uri=uri, last_modified=datetime.strptime(timestamp, DATE_TIME_FORMAT)))
+    assert cursor._file_to_datetime_history == expected_history
+    assert cursor._pending_files == {uri: RemoteFile(uri=uri, last_modified=datetime.strptime(timestamp, DATE_TIME_FORMAT)) for uri, timestamp in expected_pending_files}
+    assert mock_message_repository.emit_message.call_args_list[0].args[0].log.level.value == "WARN"
+    assert mock_message_repository.emit_message.call_args_list[1].args[0].state.data["test"]["_ab_source_file_last_modified"] == expected_cursor_value
+
+
+@pytest.mark.parametrize(
+    "input_state, pending_files, expected_cursor_value",
+    [
+        pytest.param({}, [], f"{datetime.min.strftime('%Y-%m-%dT%H:%M:%S.%fZ')}_", id="no-state-no-pending"),
+        pytest.param(
+            {"history": {"a.csv": "2021-01-01T00:00:00.000000Z"}},
+            [],
+            "2021-01-01T00:00:00.000000Z_a.csv",
+            id="no-pending-with-history"
+        ),
+        pytest.param(
+            {"history": {}},
+            [("b.csv", "2021-01-02T00:00:00.000000Z")],
+            "2021-01-02T00:00:00.000000Z_b.csv",
+            id="pending-no-history"
+        ),
+        pytest.param(
+            {"history": {"a.csv": "2022-01-01T00:00:00.000000Z"}},
+            [("b.csv", "2021-01-02T00:00:00.000000Z")],
+            "2021-01-01T00:00:00.000000Z_a.csv",
+            id="with-pending-before-history"
+        ),
+        pytest.param(
+            {"history": {"a.csv": "2021-01-01T00:00:00.000000Z"}},
+            [("b.csv", "2022-01-02T00:00:00.000000Z")],
+            "2022-01-01T00:00:00.000000Z_a.csv",
+            id="with-pending-after-history"
+        ),
+    ]
+)
+def test_get_new_cursor_value(input_state: MutableMapping[str, Any], pending_files: List[Tuple[str, str]], expected_cursor_value: str):
+    cursor = _make_cursor(input_state)
+    pending_partitions = []
+    for url, timestamp in pending_files:
+        partition = MagicMock()
+        partition.to_slice = lambda *args, **kwargs: {"files": [RemoteFile(uri=url, last_modified=datetime.strptime(timestamp, DATE_TIME_FORMAT))]}
+        pending_partitions.append(partition)
+
+    cursor.set_pending_partitions(pending_partitions)
+
+
+@pytest.mark.parametrize(
+    "all_files, history, is_history_full, prev_cursor_value, expected_files_to_sync",
+    [
+        pytest.param(
+            [RemoteFile(uri="new.csv", last_modified=datetime.strptime("2021-01-03T00:00:00.000000Z", "%Y-%m-%dT%H:%M:%S.%fZ"))],
+            {},
+            False,
+            (datetime.min, ""),
+            ["new.csv"],
+            id="empty-history-one-new-file"
+        ),
+        pytest.param(
+            [RemoteFile(uri="a.csv", last_modified=datetime.strptime("2021-01-02T00:00:00.000000Z", "%Y-%m-%dT%H:%M:%S.%fZ"))],
+            {"a.csv": "2021-01-01T00:00:00.000000Z"},
+            False,
+            (datetime.min, ""),
+            ["a.csv"],
+            id="non-empty-history-file-in-history-modified"
+        ),
+        pytest.param(
+            [RemoteFile(uri="a.csv", last_modified=datetime.strptime("2021-01-01T00:00:00.000000Z", "%Y-%m-%dT%H:%M:%S.%fZ"))],
+            {"a.csv": "2021-01-01T00:00:00.000000Z"},
+            False,
+            (datetime.min, ""),
+            [],
+            id="non-empty-history-file-in-history-not-modified"
+        ),
+    ]
+)
+def test_get_files_to_sync(all_files, history, is_history_full, prev_cursor_value, expected_files_to_sync):
+    cursor = _make_cursor({})
+    cursor._file_to_datetime_history = history
+    cursor._prev_cursor_value = prev_cursor_value
+    cursor._is_history_full = MagicMock(return_value=is_history_full)
+    files_to_sync = list(cursor.get_files_to_sync(all_files, MagicMock()))
+    assert [f.uri for f in files_to_sync] == expected_files_to_sync
+
+
+@freeze_time("2023-06-16T00:00:00Z")
+@pytest.mark.parametrize(
+    "file_to_check, history, is_history_full, prev_cursor_value, sync_start, expected_should_sync",
+    [
+        pytest.param(
+            RemoteFile(uri="new.csv", last_modified=datetime.strptime("2021-01-03T00:00:00.000000Z", DATE_TIME_FORMAT)),
+            {},
+            False,
+            (datetime.min, ""),
+            datetime.min,
+            True,
+            id="file-not-in-history-not-full-old-cursor"
+        ),
+        pytest.param(
+            RemoteFile(uri="new.csv", last_modified=datetime.strptime("2021-01-03T00:00:00.000000Z", DATE_TIME_FORMAT)),
+            {},
+            False,
+            (datetime.strptime("2024-01-02T00:00:00.000000Z", DATE_TIME_FORMAT), ""),
+            datetime.min,
+            True,
+            id="file-not-in-history-not-full-new-cursor"
+        ),
+        pytest.param(
+            RemoteFile(uri="a.csv", last_modified=datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT)),
+            {"a.csv": "2021-01-01T00:00:00.000000Z"},
+            False,
+            (datetime.min, ""),
+            datetime.min,
+            False,
+            id="file-in-history-not-modified"
+        ),
+        pytest.param(
+            RemoteFile(uri="a.csv", last_modified=datetime.strptime("2020-01-01T00:00:00.000000Z", DATE_TIME_FORMAT)),
+            {"a.csv": "2021-01-01T00:00:00.000000Z"},
+            False,
+            (datetime.min, ""),
+            datetime.min,
+            False,
+            id="file-in-history-modified-before"
+        ),
+        pytest.param(
+            RemoteFile(uri="a.csv", last_modified=datetime.strptime("2022-01-01T00:00:00.000000Z", DATE_TIME_FORMAT)),
+            {"a.csv": "2021-01-01T00:00:00.000000Z"},
+            False,
+            (datetime.min, ""),
+            datetime.min,
+            True,
+            id="file-in-history-modified-after"
+        ),
+        pytest.param(
+            RemoteFile(uri="new.csv", last_modified=datetime.strptime("2022-01-01T00:00:00.000000Z", DATE_TIME_FORMAT)),
+            {},
+            True,
+            (datetime.strptime("2021-01-02T00:00:00.000000Z", DATE_TIME_FORMAT), "a.csv"),
+            datetime.min,
+            True,
+            id="history-full-file-modified-after-cursor"
+        ),
+        pytest.param(
+            RemoteFile(uri="new1.csv", last_modified=datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT)),
+            {},
+            True,
+            (datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT), "new0.csv"),
+            datetime.min,
+            True,
+            id="history-full-modified-eq-cursor-uri-gt"
+        ),
+        pytest.param(
+            RemoteFile(uri="new0.csv", last_modified=datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT)),
+            {},
+            True,
+            (datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT), "new1.csv"),
+            datetime.min,
+            False,
+            id="history-full-modified-eq-cursor-uri-lt"
+        ),
+        pytest.param(
+            RemoteFile(uri="new.csv", last_modified=datetime.strptime("2020-01-01T00:00:00.000000Z", DATE_TIME_FORMAT)),
+            {},
+            True,
+            (datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT), "a.csv"),
+            datetime.min,
+            True,
+            id="history-full-modified-before-cursor-and-after-sync-start"
+        ),
+        pytest.param(
+            RemoteFile(uri="new.csv", last_modified=datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT)),
+            {},
+            True,
+            (datetime.strptime("2022-01-01T00:00:00.000000Z", DATE_TIME_FORMAT), "a.csv"),
+            datetime.strptime("2024-01-01T00:00:00.000000Z", DATE_TIME_FORMAT),
+            False,
+            id="history-full-modified-before-cursor-and-before-sync-start"
+        ),
+    ]
+)
+def test_should_sync_file(
+    file_to_check: RemoteFile,
+    history: Dict[str, Any],
+    is_history_full: bool,
+    prev_cursor_value: Tuple[datetime, str],
+    sync_start: datetime,
+    expected_should_sync: bool,
+):
+    cursor = _make_cursor({})
+    cursor._file_to_datetime_history = history
+    cursor._prev_cursor_value = prev_cursor_value
+    cursor._sync_start = sync_start
+    cursor._is_history_full = MagicMock(return_value=is_history_full)
+    should_sync = cursor._should_sync_file(file_to_check, MagicMock())
+    assert should_sync == expected_should_sync
+
+
+@freeze_time("2023-06-16T00:00:00Z")
+@pytest.mark.parametrize(
+    "input_history, is_history_full, expected_start_time",
+    [
+        pytest.param({}, False, datetime.min, id="empty-history"),
+        pytest.param(
+            {"a.csv": "2021-01-01T00:00:00.000000Z"},
+            False,
+            datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT),
+            id="non-full-history"
+        ),
+        pytest.param(
+            {f"file{i}.csv": f"2021-01-0{i}T00:00:00.000000Z" for i in range(1, 4)},  # all before the time window
+            True,
+            datetime.strptime("2021-01-01T00:00:00.000000Z", DATE_TIME_FORMAT),  # Earliest file time
+            id="full-history-earliest-before-window"
+        ),
+        pytest.param(
+            {f"file{i}.csv": f"2024-01-0{i}T00:00:00.000000Z" for i in range(1, 4)},  # all after the time window
+            True,
+            datetime.strptime("2023-06-13T00:00:00.000000Z", DATE_TIME_FORMAT),  # Time window start time
+            id="full-history-earliest-after-window"
+        ),
+    ]
+)
+def test_compute_start_time(input_history, is_history_full, expected_start_time, monkeypatch):
+    cursor = _make_cursor({"history": input_history})
+    cursor._file_to_datetime_history = input_history
+    cursor._is_history_full = MagicMock(return_value=is_history_full)
+    assert cursor._compute_start_time() == expected_start_time
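
For orientation, a minimal sketch of constructing the cursor outside the test helper above. The positional arguments mirror _make_cursor; their meanings (stream name, namespace, input state, message repository) are inferred from how the tests use them, not from documented parameter names:

from datetime import datetime
from unittest.mock import MagicMock

from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
from airbyte_cdk.sources.file_based.stream.concurrent.cursor import FileBasedConcurrentCursor
from airbyte_cdk.sources.streams.concurrent.cursor import CursorField

stream_config = MagicMock()  # stands in for a FileBasedStreamConfig
stream_config.days_to_sync_if_history_is_full = 3

cursor = FileBasedConcurrentCursor(
    stream_config,
    "test",       # stream name
    None,         # stream namespace
    {},           # incoming stream state (empty here)
    MagicMock(),  # message repository
    ConnectorStateManager(stream_instance_map={}, state=None),
    CursorField(FileBasedConcurrentCursor.CURSOR_FIELD),
)

# With no prior state the previous sync cursor falls back to the minimum value,
# matching the "no-state-gives-min-cursor" case above.
assert cursor._compute_prev_sync_cursor({}) == (datetime.min, "")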
unit_tests/sources/file_based/test_file_based_scenarios.py
@@ -26,6 +26,29 @@ from unit_tests.sources.file_based.scenarios.check_scenarios import (
     success_multi_stream_scenario,
     success_user_provided_schema_scenario,
 )
+from unit_tests.sources.file_based.scenarios.concurrent_incremental_scenarios import (
+    multi_csv_different_timestamps_scenario_concurrent,
+    multi_csv_include_missing_files_within_history_range_concurrent_cursor_is_newer,
+    multi_csv_include_missing_files_within_history_range_concurrent_cursor_is_older,
+    multi_csv_per_timestamp_scenario_concurrent,
+    multi_csv_remove_old_files_if_history_is_full_scenario_concurrent_cursor_is_newer,
+    multi_csv_remove_old_files_if_history_is_full_scenario_concurrent_cursor_is_older,
+    multi_csv_same_timestamp_more_files_than_history_size_scenario_concurrent_cursor_is_newer,
+    multi_csv_same_timestamp_more_files_than_history_size_scenario_concurrent_cursor_is_older,
+    multi_csv_same_timestamp_scenario_concurrent,
+    multi_csv_skip_file_if_already_in_history_concurrent,
+    multi_csv_sync_files_within_history_time_window_if_history_is_incomplete_different_timestamps_scenario_concurrent_cursor_is_newer,
+    multi_csv_sync_files_within_history_time_window_if_history_is_incomplete_different_timestamps_scenario_concurrent_cursor_is_older,
+    multi_csv_sync_files_within_time_window_if_history_is_incomplete__different_timestamps_scenario_concurrent_cursor_is_newer,
+    multi_csv_sync_files_within_time_window_if_history_is_incomplete__different_timestamps_scenario_concurrent_cursor_is_older,
+    multi_csv_sync_recent_files_if_history_is_incomplete_scenario_concurrent_cursor_is_newer,
+    multi_csv_sync_recent_files_if_history_is_incomplete_scenario_concurrent_cursor_is_older,
+    single_csv_file_is_skipped_if_same_modified_at_as_in_history_concurrent,
+    single_csv_file_is_synced_if_modified_at_is_more_recent_than_in_history_concurrent,
+    single_csv_input_state_is_earlier_scenario_concurrent,
+    single_csv_input_state_is_later_scenario_concurrent,
+    single_csv_no_input_state_scenario_concurrent,
+)
 from unit_tests.sources.file_based.scenarios.csv_scenarios import (
     csv_autogenerate_column_names_scenario,
     csv_custom_bool_values_scenario,
@@ -214,6 +237,28 @@ discover_scenarios = [
     unstructured_invalid_file_type_discover_scenario_no_skip,
     unstructured_invalid_file_type_discover_scenario_skip,
     unstructured_invalid_file_type_read_scenario,
+    multi_csv_different_timestamps_scenario_concurrent,
+    multi_csv_include_missing_files_within_history_range_concurrent_cursor_is_newer,
+    multi_csv_include_missing_files_within_history_range_concurrent_cursor_is_older,
+    multi_csv_per_timestamp_scenario_concurrent,
+    multi_csv_remove_old_files_if_history_is_full_scenario_concurrent_cursor_is_newer,
+    multi_csv_remove_old_files_if_history_is_full_scenario_concurrent_cursor_is_older,
+    multi_csv_same_timestamp_more_files_than_history_size_scenario_concurrent_cursor_is_newer,
+    multi_csv_same_timestamp_more_files_than_history_size_scenario_concurrent_cursor_is_older,
+    multi_csv_same_timestamp_scenario_concurrent,
+    multi_csv_skip_file_if_already_in_history_concurrent,
+    multi_csv_sync_files_within_history_time_window_if_history_is_incomplete_different_timestamps_scenario_concurrent_cursor_is_newer,
+    multi_csv_sync_files_within_history_time_window_if_history_is_incomplete_different_timestamps_scenario_concurrent_cursor_is_older,
+    multi_csv_sync_files_within_time_window_if_history_is_incomplete__different_timestamps_scenario_concurrent_cursor_is_newer,
+    multi_csv_sync_files_within_time_window_if_history_is_incomplete__different_timestamps_scenario_concurrent_cursor_is_older,
+    multi_csv_sync_recent_files_if_history_is_incomplete_scenario_concurrent_cursor_is_newer,
+    multi_csv_sync_recent_files_if_history_is_incomplete_scenario_concurrent_cursor_is_older,
+    single_csv_file_is_skipped_if_same_modified_at_as_in_history_concurrent,
+    single_csv_file_is_synced_if_modified_at_is_more_recent_than_in_history_concurrent,
+    single_csv_input_state_is_earlier_scenario_concurrent,
+    single_csv_input_state_is_later_scenario_concurrent,
+    single_csv_no_input_state_scenario_concurrent,
+
 ]

 read_scenarios = discover_scenarios + [
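
The two hunks above only register the new concurrent scenarios: each one is imported and appended to discover_scenarios, and read_scenarios = discover_scenarios + [...] picks them up for the read tests as well. A hedged sketch of the consuming side, assuming the usual pytest parametrization over these scenario lists (the test function name is illustrative, not copied from the file):

import pytest

@pytest.mark.parametrize("scenario", discover_scenarios, ids=[s.name for s in discover_scenarios])
def test_file_based_discover(scenario) -> None:
    ...  # each registered scenario runs through the shared verification helpers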
unit_tests/sources/file_based/test_scenarios.py
@@ -13,6 +13,7 @@ from _pytest.reports import ExceptionInfo
 from airbyte_cdk.entrypoint import launch
 from airbyte_cdk.models import AirbyteAnalyticsTraceMessage, SyncMode
 from airbyte_cdk.sources import AbstractSource
+from airbyte_cdk.sources.file_based.stream.concurrent.cursor import AbstractConcurrentFileBasedCursor
 from airbyte_cdk.test.entrypoint_wrapper import EntrypointOutput
 from airbyte_cdk.test.entrypoint_wrapper import read as entrypoint_read
 from airbyte_cdk.utils.traced_exception import AirbyteTracedException
@@ -72,12 +73,10 @@ def assert_exception(expected_exception: type[BaseException], output: Entrypoint
 def _verify_read_output(output: EntrypointOutput, scenario: TestScenario[AbstractSource]) -> None:
     records, log_messages = output.records_and_state_messages, output.logs
     logs = [message.log for message in log_messages if message.log.level.value in scenario.log_levels]
-
-
-    if expected_records is None:
+    if scenario.expected_records is None:
         return

-
+    expected_records = [r for r in scenario.expected_records] if scenario.expected_records else []

     sorted_expected_records = sorted(
         filter(lambda e: "data" in e, expected_records),
@@ -87,6 +86,9 @@ def _verify_read_output(output: EntrypointOutput, scenario: TestScenario[Abstrac
         filter(lambda r: r.record, records),
         key=lambda record: ",".join(f"{k}={v}" for k, v in sorted(record.record.data.items(), key=lambda items: (items[0], items[1])) if k != "emitted_at"),
     )
+
+    assert len(sorted_records) == len(sorted_expected_records)
+
     for actual, expected in zip(sorted_records, sorted_expected_records):
         if actual.record:
             assert len(actual.record.data) == len(expected["data"])
@@ -97,10 +99,16 @@ def _verify_read_output(output: EntrypointOutput, scenario: TestScenario[Abstrac
                     assert value == expected["data"][key]
             assert actual.record.stream == expected["stream"]

-    expected_states = filter(lambda e: "data" not in e, expected_records)
-    states = filter(lambda r: r.state, records)
-
-
+    expected_states = list(filter(lambda e: "data" not in e, expected_records))
+    states = list(filter(lambda r: r.state, records))
+
+    if hasattr(scenario.source, "cursor_cls") and issubclass(scenario.source.cursor_cls, AbstractConcurrentFileBasedCursor):
+        # Only check the last state emitted because we don't know the order the others will be in.
+        # This may be needed for non-file-based concurrent scenarios too.
+        assert states[-1].state.data == expected_states[-1]
+    else:
+        for actual, expected in zip(states, expected_states):  # states should be emitted in sorted order
+            assert actual.state.data == expected

     if scenario.expected_logs:
         read_logs = scenario.expected_logs.get("read")
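
The new branch above relaxes state verification for concurrent file-based cursors: partitions complete in nondeterministic order, so intermediate state messages can interleave differently from run to run, and only the final, merged state is stable. A small self-contained illustration of that reasoning (file names and timestamps are hypothetical):

# Two possible emission orders for the same sync over a.csv and b.csv.
run_1 = [{"history": {"a.csv": "t1"}}, {"history": {"a.csv": "t1", "b.csv": "t2"}}]
run_2 = [{"history": {"b.csv": "t2"}}, {"history": {"a.csv": "t1", "b.csv": "t2"}}]

assert run_1[-1] == run_2[-1]    # the last state is order-independent
assert run_1[:-1] != run_2[:-1]  # the intermediate states are not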
unit_tests/sources/streams/concurrent/scenarios/stream_facade_builder.py
@@ -5,7 +5,14 @@ import concurrent
 import logging
 from typing import Any, List, Mapping, Optional, Tuple, Union

-from airbyte_cdk.models import
+from airbyte_cdk.models import (
+    AirbyteStateMessage,
+    AirbyteStream,
+    ConfiguredAirbyteCatalog,
+    ConnectorSpecification,
+    DestinationSyncMode,
+    SyncMode,
+)
 from airbyte_cdk.sources.concurrent_source.concurrent_source import ConcurrentSource
 from airbyte_cdk.sources.concurrent_source.concurrent_source_adapter import ConcurrentSourceAdapter
 from airbyte_cdk.sources.concurrent_source.thread_pool_manager import ThreadPoolManager
@@ -51,7 +58,11 @@ class StreamFacadeSource(ConcurrentSourceAdapter):
         return True, None

     def streams(self, config: Mapping[str, Any]) -> List[Stream]:
-        state_manager = ConnectorStateManager(
+        state_manager = ConnectorStateManager(
+            stream_instance_map={s.name: AirbyteStream(name=s.name, namespace=None, json_schema={}, supported_sync_modes=[SyncMode.full_refresh, SyncMode.incremental]) for s in self._streams},
+            state=self._state,
+        )  # The input values into the AirbyteStream are dummy values; the connector state manager only uses `name` and `namespace`
+
         state_converter = StreamFacadeConcurrentConnectorStateConverter()
         stream_states = [state_manager.get_stream_state(stream.name, stream.namespace) for stream in self._streams]
         return [