airbyte-cdk 0.50.0__py3-none-any.whl → 0.50.2__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- airbyte_cdk/entrypoint.py +7 -0
- airbyte_cdk/sources/declarative/declarative_component_schema.yaml +3 -3
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +3 -3
- airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +9 -9
- airbyte_cdk/sources/file_based/config/csv_format.py +42 -6
- airbyte_cdk/sources/file_based/file_based_source.py +4 -5
- airbyte_cdk/sources/file_based/file_types/csv_parser.py +114 -59
- airbyte_cdk/sources/file_based/stream/cursor/__init__.py +2 -2
- airbyte_cdk/sources/file_based/stream/cursor/{file_based_cursor.py → abstract_file_based_cursor.py} +9 -1
- airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +10 -10
- airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +15 -2
- {airbyte_cdk-0.50.0.dist-info → airbyte_cdk-0.50.2.dist-info}/METADATA +1 -1
- {airbyte_cdk-0.50.0.dist-info → airbyte_cdk-0.50.2.dist-info}/RECORD +25 -24
- unit_tests/sources/file_based/config/test_csv_format.py +23 -0
- unit_tests/sources/file_based/file_types/test_csv_parser.py +50 -18
- unit_tests/sources/file_based/helpers.py +5 -0
- unit_tests/sources/file_based/in_memory_files_source.py +11 -3
- unit_tests/sources/file_based/scenarios/csv_scenarios.py +1254 -47
- unit_tests/sources/file_based/scenarios/incremental_scenarios.py +6 -5
- unit_tests/sources/file_based/scenarios/scenario_builder.py +8 -7
- unit_tests/sources/file_based/stream/test_default_file_based_cursor.py +13 -12
- unit_tests/sources/file_based/test_scenarios.py +30 -0
- {airbyte_cdk-0.50.0.dist-info → airbyte_cdk-0.50.2.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-0.50.0.dist-info → airbyte_cdk-0.50.2.dist-info}/WHEEL +0 -0
- {airbyte_cdk-0.50.0.dist-info → airbyte_cdk-0.50.2.dist-info}/top_level.txt +0 -0
@@ -2,6 +2,7 @@
|
|
2
2
|
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
3
3
|
#
|
4
4
|
|
5
|
+
from unit_tests.sources.file_based.helpers import LowHistoryLimitCursor
|
5
6
|
from unit_tests.sources.file_based.scenarios.scenario_builder import IncrementalScenarioConfig, TestScenarioBuilder
|
6
7
|
|
7
8
|
single_csv_input_state_is_earlier_scenario = (
|
@@ -1004,7 +1005,7 @@ multi_csv_remove_old_files_if_history_is_full_scenario = (
|
|
1004
1005
|
}
|
1005
1006
|
)
|
1006
1007
|
.set_file_type("csv")
|
1007
|
-
.
|
1008
|
+
.set_cursor_cls(LowHistoryLimitCursor)
|
1008
1009
|
.set_expected_catalog(
|
1009
1010
|
{
|
1010
1011
|
"streams": [
|
@@ -1151,7 +1152,7 @@ multi_csv_same_timestamp_more_files_than_history_size_scenario = (
|
|
1151
1152
|
}
|
1152
1153
|
)
|
1153
1154
|
.set_file_type("csv")
|
1154
|
-
.
|
1155
|
+
.set_cursor_cls(LowHistoryLimitCursor)
|
1155
1156
|
.set_expected_catalog(
|
1156
1157
|
{
|
1157
1158
|
"streams": [
|
@@ -1268,7 +1269,7 @@ multi_csv_sync_recent_files_if_history_is_incomplete_scenario = (
|
|
1268
1269
|
},
|
1269
1270
|
}
|
1270
1271
|
)
|
1271
|
-
.
|
1272
|
+
.set_cursor_cls(LowHistoryLimitCursor)
|
1272
1273
|
.set_file_type("csv")
|
1273
1274
|
.set_expected_catalog(
|
1274
1275
|
{
|
@@ -1386,7 +1387,7 @@ multi_csv_sync_files_within_time_window_if_history_is_incomplete__different_time
|
|
1386
1387
|
}
|
1387
1388
|
)
|
1388
1389
|
.set_file_type("csv")
|
1389
|
-
.
|
1390
|
+
.set_cursor_cls(LowHistoryLimitCursor)
|
1390
1391
|
.set_expected_catalog(
|
1391
1392
|
{
|
1392
1393
|
"streams": [
|
@@ -1509,7 +1510,7 @@ multi_csv_sync_files_within_history_time_window_if_history_is_incomplete_differe
|
|
1509
1510
|
}
|
1510
1511
|
)
|
1511
1512
|
.set_file_type("csv")
|
1512
|
-
.
|
1513
|
+
.set_cursor_cls(LowHistoryLimitCursor)
|
1513
1514
|
.set_expected_catalog(
|
1514
1515
|
{
|
1515
1516
|
"streams": [
|
@@ -11,10 +11,11 @@ from airbyte_cdk.sources.file_based.availability_strategy.abstract_file_based_av
|
|
11
11
|
AbstractFileBasedAvailabilityStrategy,
|
12
12
|
)
|
13
13
|
from airbyte_cdk.sources.file_based.discovery_policy import AbstractDiscoveryPolicy, DefaultDiscoveryPolicy
|
14
|
-
from airbyte_cdk.sources.file_based.file_based_source import
|
14
|
+
from airbyte_cdk.sources.file_based.file_based_source import default_parsers
|
15
15
|
from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
|
16
16
|
from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
|
17
17
|
from airbyte_cdk.sources.file_based.schema_validation_policies import AbstractSchemaValidationPolicy
|
18
|
+
from airbyte_cdk.sources.file_based.stream.cursor import AbstractFileBasedCursor
|
18
19
|
from unit_tests.sources.file_based.in_memory_files_source import InMemoryFilesSource
|
19
20
|
|
20
21
|
|
@@ -46,7 +47,7 @@ class TestScenario:
|
|
46
47
|
expected_read_error: Tuple[Optional[Type[Exception]], Optional[str]],
|
47
48
|
incremental_scenario_config: Optional[IncrementalScenarioConfig],
|
48
49
|
file_write_options: Mapping[str, Any],
|
49
|
-
|
50
|
+
cursor_cls: Optional[Type[AbstractFileBasedCursor]],
|
50
51
|
):
|
51
52
|
self.name = name
|
52
53
|
self.config = config
|
@@ -68,7 +69,7 @@ class TestScenario:
|
|
68
69
|
stream_reader,
|
69
70
|
self.configured_catalog(SyncMode.incremental if incremental_scenario_config else SyncMode.full_refresh),
|
70
71
|
file_write_options,
|
71
|
-
|
72
|
+
cursor_cls,
|
72
73
|
)
|
73
74
|
self.incremental_scenario_config = incremental_scenario_config
|
74
75
|
self.validate()
|
@@ -124,7 +125,7 @@ class TestScenarioBuilder:
|
|
124
125
|
self._expected_read_error: Tuple[Optional[Type[Exception]], Optional[str]] = None, None
|
125
126
|
self._incremental_scenario_config: Optional[IncrementalScenarioConfig] = None
|
126
127
|
self._file_write_options: Mapping[str, Any] = {}
|
127
|
-
self.
|
128
|
+
self._cursor_cls: Optional[Type[AbstractFileBasedCursor]] = None
|
128
129
|
|
129
130
|
def set_name(self, name: str) -> "TestScenarioBuilder":
|
130
131
|
self._name = name
|
@@ -182,8 +183,8 @@ class TestScenarioBuilder:
|
|
182
183
|
self._stream_reader = stream_reader
|
183
184
|
return self
|
184
185
|
|
185
|
-
def
|
186
|
-
self.
|
186
|
+
def set_cursor_cls(self, cursor_cls: AbstractFileBasedCursor) -> "TestScenarioBuilder":
|
187
|
+
self._cursor_cls = cursor_cls
|
187
188
|
return self
|
188
189
|
|
189
190
|
def set_incremental_scenario_config(self, incremental_scenario_config: IncrementalScenarioConfig) -> "TestScenarioBuilder":
|
@@ -232,5 +233,5 @@ class TestScenarioBuilder:
|
|
232
233
|
self._expected_read_error,
|
233
234
|
self._incremental_scenario_config,
|
234
235
|
self._file_write_options,
|
235
|
-
self.
|
236
|
+
self._cursor_cls,
|
236
237
|
)
|
@@ -7,6 +7,7 @@ from typing import Any, List, Mapping
|
|
7
7
|
from unittest.mock import MagicMock
|
8
8
|
|
9
9
|
import pytest
|
10
|
+
from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
|
10
11
|
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
|
11
12
|
from airbyte_cdk.sources.file_based.stream.cursor.default_file_based_cursor import DefaultFileBasedCursor
|
12
13
|
from freezegun import freeze_time
|
@@ -103,7 +104,7 @@ from freezegun import freeze_time
|
|
103
104
|
],
|
104
105
|
)
|
105
106
|
def test_add_file(files_to_add: List[RemoteFile], expected_start_time: List[datetime], expected_state_dict: Mapping[str, Any]) -> None:
|
106
|
-
cursor =
|
107
|
+
cursor = get_cursor(max_history_size=3, days_to_sync_if_history_is_full=3)
|
107
108
|
assert cursor._compute_start_time() == datetime.min
|
108
109
|
|
109
110
|
for index, f in enumerate(files_to_add):
|
@@ -160,7 +161,7 @@ def test_add_file(files_to_add: List[RemoteFile], expected_start_time: List[date
|
|
160
161
|
])
|
161
162
|
def test_get_files_to_sync(files: List[RemoteFile], expected_files_to_sync: List[RemoteFile], max_history_size: int, history_is_partial: bool) -> None:
|
162
163
|
logger = MagicMock()
|
163
|
-
cursor =
|
164
|
+
cursor = get_cursor(max_history_size, 3)
|
164
165
|
|
165
166
|
files_to_sync = list(cursor.get_files_to_sync(files, logger))
|
166
167
|
for f in files_to_sync:
|
@@ -173,7 +174,7 @@ def test_get_files_to_sync(files: List[RemoteFile], expected_files_to_sync: List
|
|
173
174
|
@freeze_time("2023-06-16T00:00:00Z")
|
174
175
|
def test_only_recent_files_are_synced_if_history_is_full() -> None:
|
175
176
|
logger = MagicMock()
|
176
|
-
cursor =
|
177
|
+
cursor = get_cursor(2, 3)
|
177
178
|
|
178
179
|
files_in_history = [
|
179
180
|
RemoteFile(uri="b1.csv", last_modified=datetime(2021, 1, 2), file_type="csv"),
|
@@ -210,7 +211,7 @@ def test_only_recent_files_are_synced_if_history_is_full() -> None:
|
|
210
211
|
])
|
211
212
|
def test_sync_file_already_present_in_history(modified_at_delta: timedelta, should_sync_file: bool) -> None:
|
212
213
|
logger = MagicMock()
|
213
|
-
cursor =
|
214
|
+
cursor = get_cursor(2, 3)
|
214
215
|
original_modified_at = datetime(2021, 1, 2)
|
215
216
|
filename = "a.csv"
|
216
217
|
files_in_history = [
|
@@ -245,7 +246,7 @@ def test_sync_file_already_present_in_history(modified_at_delta: timedelta, shou
|
|
245
246
|
)
|
246
247
|
def test_should_sync_file(file_name: str, last_modified: datetime, earliest_dt_in_history: datetime, should_sync_file: bool) -> None:
|
247
248
|
logger = MagicMock()
|
248
|
-
cursor =
|
249
|
+
cursor = get_cursor(1, 3)
|
249
250
|
|
250
251
|
cursor.add_file(RemoteFile(uri="b.csv", last_modified=earliest_dt_in_history, file_type="csv"))
|
251
252
|
cursor._start_time = cursor._compute_start_time()
|
@@ -255,13 +256,13 @@ def test_should_sync_file(file_name: str, last_modified: datetime, earliest_dt_i
|
|
255
256
|
|
256
257
|
|
257
258
|
def test_set_initial_state_no_history() -> None:
|
258
|
-
cursor =
|
259
|
+
cursor = get_cursor(1, 3)
|
259
260
|
cursor.set_initial_state({})
|
260
261
|
|
261
262
|
|
262
|
-
def
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
263
|
+
def get_cursor(max_history_size: int, days_to_sync_if_history_is_full: int) -> DefaultFileBasedCursor:
|
264
|
+
cursor_cls = DefaultFileBasedCursor
|
265
|
+
cursor_cls.DEFAULT_MAX_HISTORY_SIZE = max_history_size
|
266
|
+
config = FileBasedStreamConfig(
|
267
|
+
file_type="csv", name="test", validation_policy="emit_records", days_to_sync_if_history_is_full=days_to_sync_if_history_is_full)
|
268
|
+
return cursor_cls(config)
|
@@ -34,10 +34,25 @@ from unit_tests.sources.file_based.scenarios.check_scenarios import (
|
|
34
34
|
success_user_provided_schema_scenario,
|
35
35
|
)
|
36
36
|
from unit_tests.sources.file_based.scenarios.csv_scenarios import (
|
37
|
+
csv_autogenerate_column_names_scenario,
|
38
|
+
csv_custom_bool_values_scenario,
|
39
|
+
csv_custom_delimiter_in_double_quotes_scenario,
|
40
|
+
csv_custom_delimiter_with_escape_char_scenario,
|
37
41
|
csv_custom_format_scenario,
|
42
|
+
csv_custom_null_values_scenario,
|
43
|
+
csv_double_quote_is_set_scenario,
|
44
|
+
csv_escape_char_is_set_scenario,
|
38
45
|
csv_legacy_format_scenario,
|
39
46
|
csv_multi_stream_scenario,
|
47
|
+
csv_newline_in_values_not_quoted_scenario,
|
48
|
+
csv_newline_in_values_quoted_value_scenario,
|
40
49
|
csv_single_stream_scenario,
|
50
|
+
csv_skip_after_header_scenario,
|
51
|
+
csv_skip_before_and_after_header_scenario,
|
52
|
+
csv_skip_before_header_scenario,
|
53
|
+
csv_string_can_be_null_with_input_schemas_scenario,
|
54
|
+
csv_string_not_null_if_no_null_values_scenario,
|
55
|
+
csv_strings_can_be_null_not_quoted_scenario,
|
41
56
|
empty_schema_inference_scenario,
|
42
57
|
invalid_csv_scenario,
|
43
58
|
multi_csv_scenario,
|
@@ -162,11 +177,26 @@ discover_scenarios = [
|
|
162
177
|
jsonl_user_input_schema_scenario,
|
163
178
|
schemaless_jsonl_scenario,
|
164
179
|
schemaless_jsonl_multi_stream_scenario,
|
180
|
+
csv_string_can_be_null_with_input_schemas_scenario,
|
181
|
+
csv_string_not_null_if_no_null_values_scenario,
|
182
|
+
csv_strings_can_be_null_not_quoted_scenario,
|
183
|
+
csv_newline_in_values_quoted_value_scenario,
|
184
|
+
csv_escape_char_is_set_scenario,
|
185
|
+
csv_double_quote_is_set_scenario,
|
186
|
+
csv_custom_delimiter_with_escape_char_scenario,
|
187
|
+
csv_custom_delimiter_in_double_quotes_scenario,
|
188
|
+
csv_skip_before_header_scenario,
|
189
|
+
csv_skip_after_header_scenario,
|
190
|
+
csv_skip_before_and_after_header_scenario,
|
191
|
+
csv_custom_bool_values_scenario,
|
192
|
+
csv_custom_null_values_scenario,
|
165
193
|
single_avro_scenario,
|
166
194
|
avro_all_types_scenario,
|
167
195
|
multiple_avro_combine_schema_scenario,
|
168
196
|
multiple_streams_avro_scenario,
|
169
197
|
avro_file_with_decimal_as_float_scenario,
|
198
|
+
csv_newline_in_values_not_quoted_scenario,
|
199
|
+
csv_autogenerate_column_names_scenario,
|
170
200
|
]
|
171
201
|
|
172
202
|
|
File without changes
|
File without changes
|
File without changes
|