airbyte-cdk 0.50.0__py3-none-any.whl → 0.50.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/entrypoint.py +7 -0
- airbyte_cdk/sources/declarative/declarative_component_schema.yaml +3 -3
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +3 -3
- airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +9 -9
- airbyte_cdk/sources/file_based/config/csv_format.py +42 -6
- airbyte_cdk/sources/file_based/file_based_source.py +4 -5
- airbyte_cdk/sources/file_based/file_types/csv_parser.py +114 -59
- airbyte_cdk/sources/file_based/stream/cursor/__init__.py +2 -2
- airbyte_cdk/sources/file_based/stream/cursor/{file_based_cursor.py → abstract_file_based_cursor.py} +9 -1
- airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +10 -10
- airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +15 -2
- {airbyte_cdk-0.50.0.dist-info → airbyte_cdk-0.50.2.dist-info}/METADATA +1 -1
- {airbyte_cdk-0.50.0.dist-info → airbyte_cdk-0.50.2.dist-info}/RECORD +25 -24
- unit_tests/sources/file_based/config/test_csv_format.py +23 -0
- unit_tests/sources/file_based/file_types/test_csv_parser.py +50 -18
- unit_tests/sources/file_based/helpers.py +5 -0
- unit_tests/sources/file_based/in_memory_files_source.py +11 -3
- unit_tests/sources/file_based/scenarios/csv_scenarios.py +1254 -47
- unit_tests/sources/file_based/scenarios/incremental_scenarios.py +6 -5
- unit_tests/sources/file_based/scenarios/scenario_builder.py +8 -7
- unit_tests/sources/file_based/stream/test_default_file_based_cursor.py +13 -12
- unit_tests/sources/file_based/test_scenarios.py +30 -0
- {airbyte_cdk-0.50.0.dist-info → airbyte_cdk-0.50.2.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-0.50.0.dist-info → airbyte_cdk-0.50.2.dist-info}/WHEEL +0 -0
- {airbyte_cdk-0.50.0.dist-info → airbyte_cdk-0.50.2.dist-info}/top_level.txt +0 -0
@@ -2,6 +2,7 @@
|
|
2
2
|
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
3
3
|
#
|
4
4
|
|
5
|
+
from unit_tests.sources.file_based.helpers import LowHistoryLimitCursor
|
5
6
|
from unit_tests.sources.file_based.scenarios.scenario_builder import IncrementalScenarioConfig, TestScenarioBuilder
|
6
7
|
|
7
8
|
single_csv_input_state_is_earlier_scenario = (
|
@@ -1004,7 +1005,7 @@ multi_csv_remove_old_files_if_history_is_full_scenario = (
|
|
1004
1005
|
}
|
1005
1006
|
)
|
1006
1007
|
.set_file_type("csv")
|
1007
|
-
.
|
1008
|
+
.set_cursor_cls(LowHistoryLimitCursor)
|
1008
1009
|
.set_expected_catalog(
|
1009
1010
|
{
|
1010
1011
|
"streams": [
|
@@ -1151,7 +1152,7 @@ multi_csv_same_timestamp_more_files_than_history_size_scenario = (
|
|
1151
1152
|
}
|
1152
1153
|
)
|
1153
1154
|
.set_file_type("csv")
|
1154
|
-
.
|
1155
|
+
.set_cursor_cls(LowHistoryLimitCursor)
|
1155
1156
|
.set_expected_catalog(
|
1156
1157
|
{
|
1157
1158
|
"streams": [
|
@@ -1268,7 +1269,7 @@ multi_csv_sync_recent_files_if_history_is_incomplete_scenario = (
|
|
1268
1269
|
},
|
1269
1270
|
}
|
1270
1271
|
)
|
1271
|
-
.
|
1272
|
+
.set_cursor_cls(LowHistoryLimitCursor)
|
1272
1273
|
.set_file_type("csv")
|
1273
1274
|
.set_expected_catalog(
|
1274
1275
|
{
|
@@ -1386,7 +1387,7 @@ multi_csv_sync_files_within_time_window_if_history_is_incomplete__different_time
|
|
1386
1387
|
}
|
1387
1388
|
)
|
1388
1389
|
.set_file_type("csv")
|
1389
|
-
.
|
1390
|
+
.set_cursor_cls(LowHistoryLimitCursor)
|
1390
1391
|
.set_expected_catalog(
|
1391
1392
|
{
|
1392
1393
|
"streams": [
|
@@ -1509,7 +1510,7 @@ multi_csv_sync_files_within_history_time_window_if_history_is_incomplete_differe
|
|
1509
1510
|
}
|
1510
1511
|
)
|
1511
1512
|
.set_file_type("csv")
|
1512
|
-
.
|
1513
|
+
.set_cursor_cls(LowHistoryLimitCursor)
|
1513
1514
|
.set_expected_catalog(
|
1514
1515
|
{
|
1515
1516
|
"streams": [
|
@@ -11,10 +11,11 @@ from airbyte_cdk.sources.file_based.availability_strategy.abstract_file_based_av
|
|
11
11
|
AbstractFileBasedAvailabilityStrategy,
|
12
12
|
)
|
13
13
|
from airbyte_cdk.sources.file_based.discovery_policy import AbstractDiscoveryPolicy, DefaultDiscoveryPolicy
|
14
|
-
from airbyte_cdk.sources.file_based.file_based_source import
|
14
|
+
from airbyte_cdk.sources.file_based.file_based_source import default_parsers
|
15
15
|
from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
|
16
16
|
from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
|
17
17
|
from airbyte_cdk.sources.file_based.schema_validation_policies import AbstractSchemaValidationPolicy
|
18
|
+
from airbyte_cdk.sources.file_based.stream.cursor import AbstractFileBasedCursor
|
18
19
|
from unit_tests.sources.file_based.in_memory_files_source import InMemoryFilesSource
|
19
20
|
|
20
21
|
|
@@ -46,7 +47,7 @@ class TestScenario:
|
|
46
47
|
expected_read_error: Tuple[Optional[Type[Exception]], Optional[str]],
|
47
48
|
incremental_scenario_config: Optional[IncrementalScenarioConfig],
|
48
49
|
file_write_options: Mapping[str, Any],
|
49
|
-
|
50
|
+
cursor_cls: Optional[Type[AbstractFileBasedCursor]],
|
50
51
|
):
|
51
52
|
self.name = name
|
52
53
|
self.config = config
|
@@ -68,7 +69,7 @@ class TestScenario:
|
|
68
69
|
stream_reader,
|
69
70
|
self.configured_catalog(SyncMode.incremental if incremental_scenario_config else SyncMode.full_refresh),
|
70
71
|
file_write_options,
|
71
|
-
|
72
|
+
cursor_cls,
|
72
73
|
)
|
73
74
|
self.incremental_scenario_config = incremental_scenario_config
|
74
75
|
self.validate()
|
@@ -124,7 +125,7 @@ class TestScenarioBuilder:
|
|
124
125
|
self._expected_read_error: Tuple[Optional[Type[Exception]], Optional[str]] = None, None
|
125
126
|
self._incremental_scenario_config: Optional[IncrementalScenarioConfig] = None
|
126
127
|
self._file_write_options: Mapping[str, Any] = {}
|
127
|
-
self.
|
128
|
+
self._cursor_cls: Optional[Type[AbstractFileBasedCursor]] = None
|
128
129
|
|
129
130
|
def set_name(self, name: str) -> "TestScenarioBuilder":
|
130
131
|
self._name = name
|
@@ -182,8 +183,8 @@ class TestScenarioBuilder:
|
|
182
183
|
self._stream_reader = stream_reader
|
183
184
|
return self
|
184
185
|
|
185
|
-
def
|
186
|
-
self.
|
186
|
+
def set_cursor_cls(self, cursor_cls: AbstractFileBasedCursor) -> "TestScenarioBuilder":
|
187
|
+
self._cursor_cls = cursor_cls
|
187
188
|
return self
|
188
189
|
|
189
190
|
def set_incremental_scenario_config(self, incremental_scenario_config: IncrementalScenarioConfig) -> "TestScenarioBuilder":
|
@@ -232,5 +233,5 @@ class TestScenarioBuilder:
|
|
232
233
|
self._expected_read_error,
|
233
234
|
self._incremental_scenario_config,
|
234
235
|
self._file_write_options,
|
235
|
-
self.
|
236
|
+
self._cursor_cls,
|
236
237
|
)
|
@@ -7,6 +7,7 @@ from typing import Any, List, Mapping
|
|
7
7
|
from unittest.mock import MagicMock
|
8
8
|
|
9
9
|
import pytest
|
10
|
+
from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
|
10
11
|
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
|
11
12
|
from airbyte_cdk.sources.file_based.stream.cursor.default_file_based_cursor import DefaultFileBasedCursor
|
12
13
|
from freezegun import freeze_time
|
@@ -103,7 +104,7 @@ from freezegun import freeze_time
|
|
103
104
|
],
|
104
105
|
)
|
105
106
|
def test_add_file(files_to_add: List[RemoteFile], expected_start_time: List[datetime], expected_state_dict: Mapping[str, Any]) -> None:
|
106
|
-
cursor =
|
107
|
+
cursor = get_cursor(max_history_size=3, days_to_sync_if_history_is_full=3)
|
107
108
|
assert cursor._compute_start_time() == datetime.min
|
108
109
|
|
109
110
|
for index, f in enumerate(files_to_add):
|
@@ -160,7 +161,7 @@ def test_add_file(files_to_add: List[RemoteFile], expected_start_time: List[date
|
|
160
161
|
])
|
161
162
|
def test_get_files_to_sync(files: List[RemoteFile], expected_files_to_sync: List[RemoteFile], max_history_size: int, history_is_partial: bool) -> None:
|
162
163
|
logger = MagicMock()
|
163
|
-
cursor =
|
164
|
+
cursor = get_cursor(max_history_size, 3)
|
164
165
|
|
165
166
|
files_to_sync = list(cursor.get_files_to_sync(files, logger))
|
166
167
|
for f in files_to_sync:
|
@@ -173,7 +174,7 @@ def test_get_files_to_sync(files: List[RemoteFile], expected_files_to_sync: List
|
|
173
174
|
@freeze_time("2023-06-16T00:00:00Z")
|
174
175
|
def test_only_recent_files_are_synced_if_history_is_full() -> None:
|
175
176
|
logger = MagicMock()
|
176
|
-
cursor =
|
177
|
+
cursor = get_cursor(2, 3)
|
177
178
|
|
178
179
|
files_in_history = [
|
179
180
|
RemoteFile(uri="b1.csv", last_modified=datetime(2021, 1, 2), file_type="csv"),
|
@@ -210,7 +211,7 @@ def test_only_recent_files_are_synced_if_history_is_full() -> None:
|
|
210
211
|
])
|
211
212
|
def test_sync_file_already_present_in_history(modified_at_delta: timedelta, should_sync_file: bool) -> None:
|
212
213
|
logger = MagicMock()
|
213
|
-
cursor =
|
214
|
+
cursor = get_cursor(2, 3)
|
214
215
|
original_modified_at = datetime(2021, 1, 2)
|
215
216
|
filename = "a.csv"
|
216
217
|
files_in_history = [
|
@@ -245,7 +246,7 @@ def test_sync_file_already_present_in_history(modified_at_delta: timedelta, shou
|
|
245
246
|
)
|
246
247
|
def test_should_sync_file(file_name: str, last_modified: datetime, earliest_dt_in_history: datetime, should_sync_file: bool) -> None:
|
247
248
|
logger = MagicMock()
|
248
|
-
cursor =
|
249
|
+
cursor = get_cursor(1, 3)
|
249
250
|
|
250
251
|
cursor.add_file(RemoteFile(uri="b.csv", last_modified=earliest_dt_in_history, file_type="csv"))
|
251
252
|
cursor._start_time = cursor._compute_start_time()
|
@@ -255,13 +256,13 @@ def test_should_sync_file(file_name: str, last_modified: datetime, earliest_dt_i
|
|
255
256
|
|
256
257
|
|
257
258
|
def test_set_initial_state_no_history() -> None:
|
258
|
-
cursor =
|
259
|
+
cursor = get_cursor(1, 3)
|
259
260
|
cursor.set_initial_state({})
|
260
261
|
|
261
262
|
|
262
|
-
def
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
263
|
+
def get_cursor(max_history_size: int, days_to_sync_if_history_is_full: int) -> DefaultFileBasedCursor:
|
264
|
+
cursor_cls = DefaultFileBasedCursor
|
265
|
+
cursor_cls.DEFAULT_MAX_HISTORY_SIZE = max_history_size
|
266
|
+
config = FileBasedStreamConfig(
|
267
|
+
file_type="csv", name="test", validation_policy="emit_records", days_to_sync_if_history_is_full=days_to_sync_if_history_is_full)
|
268
|
+
return cursor_cls(config)
|
@@ -34,10 +34,25 @@ from unit_tests.sources.file_based.scenarios.check_scenarios import (
|
|
34
34
|
success_user_provided_schema_scenario,
|
35
35
|
)
|
36
36
|
from unit_tests.sources.file_based.scenarios.csv_scenarios import (
|
37
|
+
csv_autogenerate_column_names_scenario,
|
38
|
+
csv_custom_bool_values_scenario,
|
39
|
+
csv_custom_delimiter_in_double_quotes_scenario,
|
40
|
+
csv_custom_delimiter_with_escape_char_scenario,
|
37
41
|
csv_custom_format_scenario,
|
42
|
+
csv_custom_null_values_scenario,
|
43
|
+
csv_double_quote_is_set_scenario,
|
44
|
+
csv_escape_char_is_set_scenario,
|
38
45
|
csv_legacy_format_scenario,
|
39
46
|
csv_multi_stream_scenario,
|
47
|
+
csv_newline_in_values_not_quoted_scenario,
|
48
|
+
csv_newline_in_values_quoted_value_scenario,
|
40
49
|
csv_single_stream_scenario,
|
50
|
+
csv_skip_after_header_scenario,
|
51
|
+
csv_skip_before_and_after_header_scenario,
|
52
|
+
csv_skip_before_header_scenario,
|
53
|
+
csv_string_can_be_null_with_input_schemas_scenario,
|
54
|
+
csv_string_not_null_if_no_null_values_scenario,
|
55
|
+
csv_strings_can_be_null_not_quoted_scenario,
|
41
56
|
empty_schema_inference_scenario,
|
42
57
|
invalid_csv_scenario,
|
43
58
|
multi_csv_scenario,
|
@@ -162,11 +177,26 @@ discover_scenarios = [
|
|
162
177
|
jsonl_user_input_schema_scenario,
|
163
178
|
schemaless_jsonl_scenario,
|
164
179
|
schemaless_jsonl_multi_stream_scenario,
|
180
|
+
csv_string_can_be_null_with_input_schemas_scenario,
|
181
|
+
csv_string_not_null_if_no_null_values_scenario,
|
182
|
+
csv_strings_can_be_null_not_quoted_scenario,
|
183
|
+
csv_newline_in_values_quoted_value_scenario,
|
184
|
+
csv_escape_char_is_set_scenario,
|
185
|
+
csv_double_quote_is_set_scenario,
|
186
|
+
csv_custom_delimiter_with_escape_char_scenario,
|
187
|
+
csv_custom_delimiter_in_double_quotes_scenario,
|
188
|
+
csv_skip_before_header_scenario,
|
189
|
+
csv_skip_after_header_scenario,
|
190
|
+
csv_skip_before_and_after_header_scenario,
|
191
|
+
csv_custom_bool_values_scenario,
|
192
|
+
csv_custom_null_values_scenario,
|
165
193
|
single_avro_scenario,
|
166
194
|
avro_all_types_scenario,
|
167
195
|
multiple_avro_combine_schema_scenario,
|
168
196
|
multiple_streams_avro_scenario,
|
169
197
|
avro_file_with_decimal_as_float_scenario,
|
198
|
+
csv_newline_in_values_not_quoted_scenario,
|
199
|
+
csv_autogenerate_column_names_scenario,
|
170
200
|
]
|
171
201
|
|
172
202
|
|
File without changes
|
File without changes
|
File without changes
|