airbyte-cdk 0.51.2__py3-none-any.whl → 0.51.4__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- airbyte_cdk/sources/file_based/file_types/csv_parser.py +4 -5
- airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +2 -3
- {airbyte_cdk-0.51.2.dist-info → airbyte_cdk-0.51.4.dist-info}/METADATA +1 -1
- {airbyte_cdk-0.51.2.dist-info → airbyte_cdk-0.51.4.dist-info}/RECORD +10 -10
- {airbyte_cdk-0.51.2.dist-info → airbyte_cdk-0.51.4.dist-info}/WHEEL +1 -1
- unit_tests/sources/file_based/file_types/test_csv_parser.py +1 -1
- unit_tests/sources/file_based/scenarios/csv_scenarios.py +0 -2
- unit_tests/sources/file_based/stream/test_default_file_based_stream.py +83 -1
- {airbyte_cdk-0.51.2.dist-info → airbyte_cdk-0.51.4.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-0.51.2.dist-info → airbyte_cdk-0.51.4.dist-info}/top_level.txt +0 -0
@@ -158,7 +158,7 @@ class CsvParser(FileTypeParser):
|
|
158
158
|
deduped_property_types = CsvParser._pre_propcess_property_types(property_types)
|
159
159
|
else:
|
160
160
|
deduped_property_types = {}
|
161
|
-
cast_fn = CsvParser._get_cast_function(deduped_property_types, config_format, logger)
|
161
|
+
cast_fn = CsvParser._get_cast_function(deduped_property_types, config_format, logger, config.schemaless)
|
162
162
|
data_generator = self._csv_reader.read_data(config, file, stream_reader, logger, self.file_read_mode)
|
163
163
|
for row in data_generator:
|
164
164
|
yield CsvParser._to_nullable(cast_fn(row), deduped_property_types, config_format.null_values, config_format.strings_can_be_null)
|
@@ -170,10 +170,10 @@ class CsvParser(FileTypeParser):
|
|
170
170
|
|
171
171
|
@staticmethod
|
172
172
|
def _get_cast_function(
|
173
|
-
deduped_property_types: Mapping[str, str], config_format: CsvFormat, logger: logging.Logger
|
173
|
+
deduped_property_types: Mapping[str, str], config_format: CsvFormat, logger: logging.Logger, schemaless: bool
|
174
174
|
) -> Callable[[Mapping[str, str]], Mapping[str, str]]:
|
175
175
|
# Only cast values if the schema is provided
|
176
|
-
if deduped_property_types:
|
176
|
+
if deduped_property_types and not schemaless:
|
177
177
|
return partial(CsvParser._cast_types, deduped_property_types=deduped_property_types, config_format=config_format, logger=logger)
|
178
178
|
else:
|
179
179
|
# If no schema is provided, yield the rows as they are
|
@@ -275,11 +275,10 @@ class CsvParser(FileTypeParser):
|
|
275
275
|
except ValueError:
|
276
276
|
warnings.append(_format_warning(key, value, prop_type))
|
277
277
|
|
278
|
+
result[key] = cast_value
|
278
279
|
else:
|
279
280
|
warnings.append(_format_warning(key, value, prop_type))
|
280
281
|
|
281
|
-
result[key] = cast_value
|
282
|
-
|
283
282
|
if warnings:
|
284
283
|
logger.warning(
|
285
284
|
f"{FileBasedSourceError.ERROR_CASTING_VALUE.value}: {','.join([w for w in warnings])}",
|
@@ -65,7 +65,7 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
|
|
65
65
|
slices = [{"files": list(group[1])} for group in itertools.groupby(sorted_files_to_read, lambda f: f.last_modified)]
|
66
66
|
return slices
|
67
67
|
|
68
|
-
def read_records_from_slice(self, stream_slice: StreamSlice) -> Iterable[
|
68
|
+
def read_records_from_slice(self, stream_slice: StreamSlice) -> Iterable[AirbyteMessage]:
|
69
69
|
"""
|
70
70
|
Yield all records from all remote files in `list_files_for_this_sync`.
|
71
71
|
|
@@ -127,9 +127,8 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
|
|
127
127
|
stack_trace=traceback.format_exc(),
|
128
128
|
),
|
129
129
|
)
|
130
|
-
break
|
131
130
|
|
132
|
-
|
131
|
+
finally:
|
133
132
|
if n_skipped:
|
134
133
|
yield AirbyteMessage(
|
135
134
|
type=MessageType.LOG,
|
@@ -157,7 +157,7 @@ airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py,sha
|
|
157
157
|
airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py,sha256=Mx3zT9Dem4uNfaUT0oOtrESsuB1LrGAi5N-uw2swZZA,701
|
158
158
|
airbyte_cdk/sources/file_based/file_types/__init__.py,sha256=N3a8cjXwRUN2__46IJTwrWlsyFiSA1xtSgPcPH28sn0,476
|
159
159
|
airbyte_cdk/sources/file_based/file_types/avro_parser.py,sha256=7PVaW17wn80HYW1mu074X2dy0UgFoqFqGIOKN2ZMKD0,8686
|
160
|
-
airbyte_cdk/sources/file_based/file_types/csv_parser.py,sha256=
|
160
|
+
airbyte_cdk/sources/file_based/file_types/csv_parser.py,sha256=SsWy_8KunUz1MYKroix6fWv37mbPEH0h5SoW4g3Qjf4,16676
|
161
161
|
airbyte_cdk/sources/file_based/file_types/file_type_parser.py,sha256=cThTLc1YKSAapOn70lB09SzruRIPSShGIMz1f92QYV8,1555
|
162
162
|
airbyte_cdk/sources/file_based/file_types/jsonl_parser.py,sha256=Kz6HLF0CrFHQ1Y6rJKGr7KmBWSLeDYFQmkg0WIi7Frg,5395
|
163
163
|
airbyte_cdk/sources/file_based/file_types/parquet_parser.py,sha256=QulQ_soGb1LpQ_KTxqWZjmfACGkTUDUOeuSmNFtcSLk,8717
|
@@ -166,7 +166,7 @@ airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_valida
|
|
166
166
|
airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py,sha256=ZeAa0z50ywMU2chNjQ7JpL4yePU1NajhBa8FS7rXLVo,1643
|
167
167
|
airbyte_cdk/sources/file_based/stream/__init__.py,sha256=QPDqdgjsabOQD93dSFqHGaFS_3pIwm-chEabZHiPJi0,265
|
168
168
|
airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py,sha256=tvVew6din9y8a3hItzU0PjTQrMxbVI7bK-3pRTvOswg,5810
|
169
|
-
airbyte_cdk/sources/file_based/stream/default_file_based_stream.py,sha256=
|
169
|
+
airbyte_cdk/sources/file_based/stream/default_file_based_stream.py,sha256=eFYqN657J5A0sf9og_w7qea8lu2xtUobjYYDldfmbmA,11839
|
170
170
|
airbyte_cdk/sources/file_based/stream/cursor/__init__.py,sha256=MhFB5hOo8sjwvCh8gangaymdg3EJWYt_72brFOZt068,191
|
171
171
|
airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py,sha256=i-FPeK8lwCzX34GCcmvL5Yvdh8-uu7FeCVYDoFbD7IY,1920
|
172
172
|
airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py,sha256=kuJRKgDYOGXRk0V0I8BpFxg0hGv7SfV_nBpmmn45F88,6815
|
@@ -315,13 +315,13 @@ unit_tests/sources/file_based/config/test_abstract_file_based_spec.py,sha256=wmZ
|
|
315
315
|
unit_tests/sources/file_based/config/test_file_based_stream_config.py,sha256=1eMsHlMQIFwyw20HjnhgKuiw6399sMcLTQ4LP09kTT4,3060
|
316
316
|
unit_tests/sources/file_based/file_types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
317
317
|
unit_tests/sources/file_based/file_types/test_avro_parser.py,sha256=INqwKXcgNb3h_tktNXYU6WNUD-iNwRYHCd3IrnQa5R4,11051
|
318
|
-
unit_tests/sources/file_based/file_types/test_csv_parser.py,sha256=
|
318
|
+
unit_tests/sources/file_based/file_types/test_csv_parser.py,sha256=KB4WDy3aMAZ0CmJiqFaTUOZlK4urpvG9bwcwQ-h2-VY,20303
|
319
319
|
unit_tests/sources/file_based/file_types/test_jsonl_parser.py,sha256=foTf9U9LyAS8OR0BonwNgFWPqTrmzFV2lpPUfRMrioE,6134
|
320
320
|
unit_tests/sources/file_based/file_types/test_parquet_parser.py,sha256=D7sKTty8aEqMDWWGKWUqDbWjTxhGkygU7ns4-_JceRY,13543
|
321
321
|
unit_tests/sources/file_based/scenarios/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
322
322
|
unit_tests/sources/file_based/scenarios/avro_scenarios.py,sha256=xUfw0crAvSTgQ2-chJx2ZiigQyo5IfrCuOFC1TWXXsQ,29795
|
323
323
|
unit_tests/sources/file_based/scenarios/check_scenarios.py,sha256=7DR49LCjns72Dv5-R-cg6_SUR1zpHtE9_uFEWoYwx1s,5834
|
324
|
-
unit_tests/sources/file_based/scenarios/csv_scenarios.py,sha256=
|
324
|
+
unit_tests/sources/file_based/scenarios/csv_scenarios.py,sha256=UVdDblKj3R5qQkh-dj4xqZ2822GyJuymaAerWbX9HeE,95707
|
325
325
|
unit_tests/sources/file_based/scenarios/incremental_scenarios.py,sha256=0maHng11cFmvzFLOniyBxOEYoKj4DYR3NO9-pSYoFLs,60710
|
326
326
|
unit_tests/sources/file_based/scenarios/jsonl_scenarios.py,sha256=N83fga4gMKkbm6hYnen1Z5p5eEgjnMB_M_sXx6B96cU,27503
|
327
327
|
unit_tests/sources/file_based/scenarios/parquet_scenarios.py,sha256=-cBO1ZwberBxNMqDOtKz8yGwm3zB7elz_st2NKHeczM,26955
|
@@ -330,7 +330,7 @@ unit_tests/sources/file_based/scenarios/user_input_schema_scenarios.py,sha256=sx
|
|
330
330
|
unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py,sha256=GFACt5-0BMFVtrYueA1_6va2euHfhKcBLcYfDHHnmFI,26715
|
331
331
|
unit_tests/sources/file_based/stream/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
332
332
|
unit_tests/sources/file_based/stream/test_default_file_based_cursor.py,sha256=fzggaGwtXgNk-sAjQ8D71CPTCNBVxBS6HW63FKdkKME,12491
|
333
|
-
unit_tests/sources/file_based/stream/test_default_file_based_stream.py,sha256=
|
333
|
+
unit_tests/sources/file_based/stream/test_default_file_based_stream.py,sha256=gZlNs6s9dRgcFLrLZtkJDRSje_8gDylHj3xktjsUMVo,5782
|
334
334
|
unit_tests/sources/fixtures/__init__.py,sha256=ZnqYNxHsKCgO38IwB34RQyRMXTs4GTvlRi3ImKnIioo,61
|
335
335
|
unit_tests/sources/fixtures/source_test_fixture.py,sha256=r-UtR241EGQMZTw1RoKaatrpCGeQn7OIuRPWfG9f7nI,5380
|
336
336
|
unit_tests/sources/message/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -352,8 +352,8 @@ unit_tests/utils/test_schema_inferrer.py,sha256=Z2jHBZ540wnYkylIdV_2xr75Vtwlxuyg
|
|
352
352
|
unit_tests/utils/test_secret_utils.py,sha256=XKe0f1RHYii8iwE6ATmBr5JGDI1pzzrnZUGdUSMJQP4,4886
|
353
353
|
unit_tests/utils/test_stream_status_utils.py,sha256=NpV155JMXA6CG-2Zvofa14lItobyh3Onttc59X4m5DI,3382
|
354
354
|
unit_tests/utils/test_traced_exception.py,sha256=bDFP5zMBizFenz6V2WvEZTRCKGB5ijh3DBezjbfoYIs,4198
|
355
|
-
airbyte_cdk-0.51.
|
356
|
-
airbyte_cdk-0.51.
|
357
|
-
airbyte_cdk-0.51.
|
358
|
-
airbyte_cdk-0.51.
|
359
|
-
airbyte_cdk-0.51.
|
355
|
+
airbyte_cdk-0.51.4.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
|
356
|
+
airbyte_cdk-0.51.4.dist-info/METADATA,sha256=imzQYjx2YvmpJDtuZK6lUo96gKmLJBYNOZf06KeAg-s,9399
|
357
|
+
airbyte_cdk-0.51.4.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
|
358
|
+
airbyte_cdk-0.51.4.dist-info/top_level.txt,sha256=edvsDKTnE6sD2wfCUaeTfKf5gQIL6CPVMwVL2sWZzqo,51
|
359
|
+
airbyte_cdk-0.51.4.dist-info/RECORD,,
|
@@ -100,7 +100,7 @@ logger = logging.getLogger()
|
|
100
100
|
pytest.param(
|
101
101
|
{"col9": "['a', 'b']"}, DEFAULT_TRUE_VALUES, DEFAULT_FALSE_VALUES, {"col9": "['a', 'b']"}, id="cannot-cast-to-list-of-objects"
|
102
102
|
),
|
103
|
-
pytest.param({"col11": "x"}, DEFAULT_TRUE_VALUES, DEFAULT_FALSE_VALUES, {
|
103
|
+
pytest.param({"col11": "x"}, DEFAULT_TRUE_VALUES, DEFAULT_FALSE_VALUES, {}, id="item-not-in-props-doesn't-error"),
|
104
104
|
],
|
105
105
|
)
|
106
106
|
def test_cast_to_python_type(row: Dict[str, str], true_values: Set[str], false_values: Set[str], expected_output: Dict[str, Any]) -> None:
|
@@ -481,7 +481,6 @@ multi_csv_stream_n_file_exceeds_limit_for_inference = (
|
|
481
481
|
"data": {
|
482
482
|
"col1": "val11b",
|
483
483
|
"col2": "val12b",
|
484
|
-
"col3": "val13b",
|
485
484
|
"_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
|
486
485
|
"_ab_source_file_url": "b.csv",
|
487
486
|
},
|
@@ -491,7 +490,6 @@ multi_csv_stream_n_file_exceeds_limit_for_inference = (
|
|
491
490
|
"data": {
|
492
491
|
"col1": "val21b",
|
493
492
|
"col2": "val22b",
|
494
|
-
"col3": "val23b",
|
495
493
|
"_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
|
496
494
|
"_ab_source_file_url": "b.csv",
|
497
495
|
},
|
@@ -2,9 +2,20 @@
|
|
2
2
|
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
3
3
|
#
|
4
4
|
|
5
|
-
|
5
|
+
import unittest
|
6
|
+
from datetime import datetime, timezone
|
7
|
+
from typing import Any, Iterable, Iterator, Mapping
|
8
|
+
from unittest.mock import Mock
|
6
9
|
|
7
10
|
import pytest
|
11
|
+
from airbyte_cdk.models import Level
|
12
|
+
from airbyte_cdk.sources.file_based.availability_strategy import AbstractFileBasedAvailabilityStrategy
|
13
|
+
from airbyte_cdk.sources.file_based.discovery_policy import AbstractDiscoveryPolicy
|
14
|
+
from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
|
15
|
+
from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
|
16
|
+
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
|
17
|
+
from airbyte_cdk.sources.file_based.schema_validation_policies import AbstractSchemaValidationPolicy
|
18
|
+
from airbyte_cdk.sources.file_based.stream.cursor import AbstractFileBasedCursor
|
8
19
|
from airbyte_cdk.sources.file_based.stream.default_file_based_stream import DefaultFileBasedStream
|
9
20
|
|
10
21
|
|
@@ -46,3 +57,74 @@ from airbyte_cdk.sources.file_based.stream.default_file_based_stream import Defa
|
|
46
57
|
)
|
47
58
|
def test_fill_nulls(input_schema: Mapping[str, Any], expected_output: Mapping[str, Any]) -> None:
|
48
59
|
assert DefaultFileBasedStream._fill_nulls(input_schema) == expected_output
|
60
|
+
|
61
|
+
|
62
|
+
class DefaultFileBasedStreamTest(unittest.TestCase):
|
63
|
+
_FILE_TYPE = "file_type"
|
64
|
+
_NOW = datetime(2022, 10, 22, tzinfo=timezone.utc)
|
65
|
+
_A_RECORD = {"a_record": 1}
|
66
|
+
|
67
|
+
def setUp(self) -> None:
|
68
|
+
self._stream_config = Mock()
|
69
|
+
self._stream_config.file_type = self._FILE_TYPE
|
70
|
+
self._stream_config.name = "a stream name"
|
71
|
+
self._catalog_schema = Mock()
|
72
|
+
self._stream_reader = Mock(spec=AbstractFileBasedStreamReader)
|
73
|
+
self._availability_strategy = Mock(spec=AbstractFileBasedAvailabilityStrategy)
|
74
|
+
self._discovery_policy = Mock(spec=AbstractDiscoveryPolicy)
|
75
|
+
self._parser = Mock(spec=FileTypeParser)
|
76
|
+
self._validation_policy = Mock(spec=AbstractSchemaValidationPolicy)
|
77
|
+
self._validation_policy.name = "validation policy name"
|
78
|
+
self._cursor = Mock(spec=AbstractFileBasedCursor)
|
79
|
+
|
80
|
+
self._stream = DefaultFileBasedStream(
|
81
|
+
config=self._stream_config,
|
82
|
+
catalog_schema=self._catalog_schema,
|
83
|
+
stream_reader=self._stream_reader,
|
84
|
+
availability_strategy=self._availability_strategy,
|
85
|
+
discovery_policy=self._discovery_policy,
|
86
|
+
parsers={self._FILE_TYPE: self._parser},
|
87
|
+
validation_policy=self._validation_policy,
|
88
|
+
cursor=self._cursor,
|
89
|
+
)
|
90
|
+
|
91
|
+
def test_when_read_records_from_slice_then_return_records(self) -> None:
|
92
|
+
self._parser.parse_records.return_value = [self._A_RECORD]
|
93
|
+
messages = list(self._stream.read_records_from_slice({"files": [RemoteFile(uri="uri", last_modified=self._NOW)]}))
|
94
|
+
assert list(map(lambda message: message.record.data["data"], messages)) == [self._A_RECORD]
|
95
|
+
|
96
|
+
def test_given_exception_when_read_records_from_slice_then_do_process_other_files(self) -> None:
|
97
|
+
"""
|
98
|
+
The current behavior for source-s3 v3 does not fail sync on some errors and hence, we will keep this behaviour for now. One example
|
99
|
+
we can easily reproduce this is by having a file with gzip extension that is not actually a gzip file. The reader will fail to open
|
100
|
+
the file but the sync won't fail.
|
101
|
+
Ticket: https://github.com/airbytehq/airbyte/issues/29680
|
102
|
+
"""
|
103
|
+
self._parser.parse_records.side_effect = [ValueError("An error"), [self._A_RECORD]]
|
104
|
+
|
105
|
+
messages = list(self._stream.read_records_from_slice({"files": [
|
106
|
+
RemoteFile(uri="invalid_file", last_modified=self._NOW),
|
107
|
+
RemoteFile(uri="valid_file", last_modified=self._NOW),
|
108
|
+
]}))
|
109
|
+
|
110
|
+
assert messages[0].log.level == Level.ERROR
|
111
|
+
assert messages[1].record.data["data"] == self._A_RECORD
|
112
|
+
|
113
|
+
def test_given_exception_after_skipping_records_when_read_records_from_slice_then_send_warning(self) -> None:
|
114
|
+
self._stream_config.schemaless = False
|
115
|
+
self._validation_policy.record_passes_validation_policy.return_value = False
|
116
|
+
self._parser.parse_records.side_effect = [self._iter([self._A_RECORD, ValueError("An error")])]
|
117
|
+
|
118
|
+
messages = list(self._stream.read_records_from_slice({"files": [
|
119
|
+
RemoteFile(uri="invalid_file", last_modified=self._NOW),
|
120
|
+
RemoteFile(uri="valid_file", last_modified=self._NOW),
|
121
|
+
]}))
|
122
|
+
|
123
|
+
assert messages[0].log.level == Level.ERROR
|
124
|
+
assert messages[1].log.level == Level.WARN
|
125
|
+
|
126
|
+
def _iter(self, x: Iterable[Any]) -> Iterator[Any]:
|
127
|
+
for item in x:
|
128
|
+
if isinstance(item, Exception):
|
129
|
+
raise item
|
130
|
+
yield item
|
File without changes
|
File without changes
|