airbyte-cdk 0.51.2__py3-none-any.whl → 0.51.4__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -158,7 +158,7 @@ class CsvParser(FileTypeParser):
158
158
  deduped_property_types = CsvParser._pre_propcess_property_types(property_types)
159
159
  else:
160
160
  deduped_property_types = {}
161
- cast_fn = CsvParser._get_cast_function(deduped_property_types, config_format, logger)
161
+ cast_fn = CsvParser._get_cast_function(deduped_property_types, config_format, logger, config.schemaless)
162
162
  data_generator = self._csv_reader.read_data(config, file, stream_reader, logger, self.file_read_mode)
163
163
  for row in data_generator:
164
164
  yield CsvParser._to_nullable(cast_fn(row), deduped_property_types, config_format.null_values, config_format.strings_can_be_null)
@@ -170,10 +170,10 @@ class CsvParser(FileTypeParser):
170
170
 
171
171
  @staticmethod
172
172
  def _get_cast_function(
173
- deduped_property_types: Mapping[str, str], config_format: CsvFormat, logger: logging.Logger
173
+ deduped_property_types: Mapping[str, str], config_format: CsvFormat, logger: logging.Logger, schemaless: bool
174
174
  ) -> Callable[[Mapping[str, str]], Mapping[str, str]]:
175
175
  # Only cast values if the schema is provided
176
- if deduped_property_types:
176
+ if deduped_property_types and not schemaless:
177
177
  return partial(CsvParser._cast_types, deduped_property_types=deduped_property_types, config_format=config_format, logger=logger)
178
178
  else:
179
179
  # If no schema is provided, yield the rows as they are
@@ -275,11 +275,10 @@ class CsvParser(FileTypeParser):
275
275
  except ValueError:
276
276
  warnings.append(_format_warning(key, value, prop_type))
277
277
 
278
+ result[key] = cast_value
278
279
  else:
279
280
  warnings.append(_format_warning(key, value, prop_type))
280
281
 
281
- result[key] = cast_value
282
-
283
282
  if warnings:
284
283
  logger.warning(
285
284
  f"{FileBasedSourceError.ERROR_CASTING_VALUE.value}: {','.join([w for w in warnings])}",
@@ -65,7 +65,7 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
65
65
  slices = [{"files": list(group[1])} for group in itertools.groupby(sorted_files_to_read, lambda f: f.last_modified)]
66
66
  return slices
67
67
 
68
- def read_records_from_slice(self, stream_slice: StreamSlice) -> Iterable[Mapping[str, Any]]:
68
+ def read_records_from_slice(self, stream_slice: StreamSlice) -> Iterable[AirbyteMessage]:
69
69
  """
70
70
  Yield all records from all remote files in `list_files_for_this_sync`.
71
71
 
@@ -127,9 +127,8 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
127
127
  stack_trace=traceback.format_exc(),
128
128
  ),
129
129
  )
130
- break
131
130
 
132
- else:
131
+ finally:
133
132
  if n_skipped:
134
133
  yield AirbyteMessage(
135
134
  type=MessageType.LOG,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: airbyte-cdk
3
- Version: 0.51.2
3
+ Version: 0.51.4
4
4
  Summary: A framework for writing Airbyte Connectors.
5
5
  Home-page: https://github.com/airbytehq/airbyte
6
6
  Author: Airbyte
@@ -157,7 +157,7 @@ airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py,sha
157
157
  airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py,sha256=Mx3zT9Dem4uNfaUT0oOtrESsuB1LrGAi5N-uw2swZZA,701
158
158
  airbyte_cdk/sources/file_based/file_types/__init__.py,sha256=N3a8cjXwRUN2__46IJTwrWlsyFiSA1xtSgPcPH28sn0,476
159
159
  airbyte_cdk/sources/file_based/file_types/avro_parser.py,sha256=7PVaW17wn80HYW1mu074X2dy0UgFoqFqGIOKN2ZMKD0,8686
160
- airbyte_cdk/sources/file_based/file_types/csv_parser.py,sha256=pOUEuAgodXwDFlSw52P2cAu4elyST6-fOspeJkb7SY8,16617
160
+ airbyte_cdk/sources/file_based/file_types/csv_parser.py,sha256=SsWy_8KunUz1MYKroix6fWv37mbPEH0h5SoW4g3Qjf4,16676
161
161
  airbyte_cdk/sources/file_based/file_types/file_type_parser.py,sha256=cThTLc1YKSAapOn70lB09SzruRIPSShGIMz1f92QYV8,1555
162
162
  airbyte_cdk/sources/file_based/file_types/jsonl_parser.py,sha256=Kz6HLF0CrFHQ1Y6rJKGr7KmBWSLeDYFQmkg0WIi7Frg,5395
163
163
  airbyte_cdk/sources/file_based/file_types/parquet_parser.py,sha256=QulQ_soGb1LpQ_KTxqWZjmfACGkTUDUOeuSmNFtcSLk,8717
@@ -166,7 +166,7 @@ airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_valida
166
166
  airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py,sha256=ZeAa0z50ywMU2chNjQ7JpL4yePU1NajhBa8FS7rXLVo,1643
167
167
  airbyte_cdk/sources/file_based/stream/__init__.py,sha256=QPDqdgjsabOQD93dSFqHGaFS_3pIwm-chEabZHiPJi0,265
168
168
  airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py,sha256=tvVew6din9y8a3hItzU0PjTQrMxbVI7bK-3pRTvOswg,5810
169
- airbyte_cdk/sources/file_based/stream/default_file_based_stream.py,sha256=N4JUzXeM5T_KAU4OBlGtw9dzjOJRS_Dvn2nb7RstUyA,11861
169
+ airbyte_cdk/sources/file_based/stream/default_file_based_stream.py,sha256=eFYqN657J5A0sf9og_w7qea8lu2xtUobjYYDldfmbmA,11839
170
170
  airbyte_cdk/sources/file_based/stream/cursor/__init__.py,sha256=MhFB5hOo8sjwvCh8gangaymdg3EJWYt_72brFOZt068,191
171
171
  airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py,sha256=i-FPeK8lwCzX34GCcmvL5Yvdh8-uu7FeCVYDoFbD7IY,1920
172
172
  airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py,sha256=kuJRKgDYOGXRk0V0I8BpFxg0hGv7SfV_nBpmmn45F88,6815
@@ -315,13 +315,13 @@ unit_tests/sources/file_based/config/test_abstract_file_based_spec.py,sha256=wmZ
315
315
  unit_tests/sources/file_based/config/test_file_based_stream_config.py,sha256=1eMsHlMQIFwyw20HjnhgKuiw6399sMcLTQ4LP09kTT4,3060
316
316
  unit_tests/sources/file_based/file_types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
317
317
  unit_tests/sources/file_based/file_types/test_avro_parser.py,sha256=INqwKXcgNb3h_tktNXYU6WNUD-iNwRYHCd3IrnQa5R4,11051
318
- unit_tests/sources/file_based/file_types/test_csv_parser.py,sha256=PIdee-wnaRNVERZV69u9LtJIxI9y_Myj9zax9UP9_0E,20315
318
+ unit_tests/sources/file_based/file_types/test_csv_parser.py,sha256=KB4WDy3aMAZ0CmJiqFaTUOZlK4urpvG9bwcwQ-h2-VY,20303
319
319
  unit_tests/sources/file_based/file_types/test_jsonl_parser.py,sha256=foTf9U9LyAS8OR0BonwNgFWPqTrmzFV2lpPUfRMrioE,6134
320
320
  unit_tests/sources/file_based/file_types/test_parquet_parser.py,sha256=D7sKTty8aEqMDWWGKWUqDbWjTxhGkygU7ns4-_JceRY,13543
321
321
  unit_tests/sources/file_based/scenarios/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
322
322
  unit_tests/sources/file_based/scenarios/avro_scenarios.py,sha256=xUfw0crAvSTgQ2-chJx2ZiigQyo5IfrCuOFC1TWXXsQ,29795
323
323
  unit_tests/sources/file_based/scenarios/check_scenarios.py,sha256=7DR49LCjns72Dv5-R-cg6_SUR1zpHtE9_uFEWoYwx1s,5834
324
- unit_tests/sources/file_based/scenarios/csv_scenarios.py,sha256=M-qxXZzYx96RYzknyInZGu7L54IKC6iuv2inpPn6LJM,95783
324
+ unit_tests/sources/file_based/scenarios/csv_scenarios.py,sha256=UVdDblKj3R5qQkh-dj4xqZ2822GyJuymaAerWbX9HeE,95707
325
325
  unit_tests/sources/file_based/scenarios/incremental_scenarios.py,sha256=0maHng11cFmvzFLOniyBxOEYoKj4DYR3NO9-pSYoFLs,60710
326
326
  unit_tests/sources/file_based/scenarios/jsonl_scenarios.py,sha256=N83fga4gMKkbm6hYnen1Z5p5eEgjnMB_M_sXx6B96cU,27503
327
327
  unit_tests/sources/file_based/scenarios/parquet_scenarios.py,sha256=-cBO1ZwberBxNMqDOtKz8yGwm3zB7elz_st2NKHeczM,26955
@@ -330,7 +330,7 @@ unit_tests/sources/file_based/scenarios/user_input_schema_scenarios.py,sha256=sx
330
330
  unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py,sha256=GFACt5-0BMFVtrYueA1_6va2euHfhKcBLcYfDHHnmFI,26715
331
331
  unit_tests/sources/file_based/stream/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
332
332
  unit_tests/sources/file_based/stream/test_default_file_based_cursor.py,sha256=fzggaGwtXgNk-sAjQ8D71CPTCNBVxBS6HW63FKdkKME,12491
333
- unit_tests/sources/file_based/stream/test_default_file_based_stream.py,sha256=22Rgp1HvZss2WdOcfenbnvx18tfFJ_trPuWp299RW5E,1545
333
+ unit_tests/sources/file_based/stream/test_default_file_based_stream.py,sha256=gZlNs6s9dRgcFLrLZtkJDRSje_8gDylHj3xktjsUMVo,5782
334
334
  unit_tests/sources/fixtures/__init__.py,sha256=ZnqYNxHsKCgO38IwB34RQyRMXTs4GTvlRi3ImKnIioo,61
335
335
  unit_tests/sources/fixtures/source_test_fixture.py,sha256=r-UtR241EGQMZTw1RoKaatrpCGeQn7OIuRPWfG9f7nI,5380
336
336
  unit_tests/sources/message/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -352,8 +352,8 @@ unit_tests/utils/test_schema_inferrer.py,sha256=Z2jHBZ540wnYkylIdV_2xr75Vtwlxuyg
352
352
  unit_tests/utils/test_secret_utils.py,sha256=XKe0f1RHYii8iwE6ATmBr5JGDI1pzzrnZUGdUSMJQP4,4886
353
353
  unit_tests/utils/test_stream_status_utils.py,sha256=NpV155JMXA6CG-2Zvofa14lItobyh3Onttc59X4m5DI,3382
354
354
  unit_tests/utils/test_traced_exception.py,sha256=bDFP5zMBizFenz6V2WvEZTRCKGB5ijh3DBezjbfoYIs,4198
355
- airbyte_cdk-0.51.2.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
356
- airbyte_cdk-0.51.2.dist-info/METADATA,sha256=l-yiYXAQKz7KgIO8yxCtNI2fC-qS3XyrrZ1wuUptaP0,9399
357
- airbyte_cdk-0.51.2.dist-info/WHEEL,sha256=5sUXSg9e4bi7lTLOHcm6QEYwO5TIF1TNbTSVFVjcJcc,92
358
- airbyte_cdk-0.51.2.dist-info/top_level.txt,sha256=edvsDKTnE6sD2wfCUaeTfKf5gQIL6CPVMwVL2sWZzqo,51
359
- airbyte_cdk-0.51.2.dist-info/RECORD,,
355
+ airbyte_cdk-0.51.4.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
356
+ airbyte_cdk-0.51.4.dist-info/METADATA,sha256=imzQYjx2YvmpJDtuZK6lUo96gKmLJBYNOZf06KeAg-s,9399
357
+ airbyte_cdk-0.51.4.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
358
+ airbyte_cdk-0.51.4.dist-info/top_level.txt,sha256=edvsDKTnE6sD2wfCUaeTfKf5gQIL6CPVMwVL2sWZzqo,51
359
+ airbyte_cdk-0.51.4.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.41.1)
2
+ Generator: bdist_wheel (0.41.2)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -100,7 +100,7 @@ logger = logging.getLogger()
100
100
  pytest.param(
101
101
  {"col9": "['a', 'b']"}, DEFAULT_TRUE_VALUES, DEFAULT_FALSE_VALUES, {"col9": "['a', 'b']"}, id="cannot-cast-to-list-of-objects"
102
102
  ),
103
- pytest.param({"col11": "x"}, DEFAULT_TRUE_VALUES, DEFAULT_FALSE_VALUES, {"col11": "x"}, id="item-not-in-props-doesn't-error"),
103
+ pytest.param({"col11": "x"}, DEFAULT_TRUE_VALUES, DEFAULT_FALSE_VALUES, {}, id="item-not-in-props-doesn't-error"),
104
104
  ],
105
105
  )
106
106
  def test_cast_to_python_type(row: Dict[str, str], true_values: Set[str], false_values: Set[str], expected_output: Dict[str, Any]) -> None:
@@ -481,7 +481,6 @@ multi_csv_stream_n_file_exceeds_limit_for_inference = (
481
481
  "data": {
482
482
  "col1": "val11b",
483
483
  "col2": "val12b",
484
- "col3": "val13b",
485
484
  "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
486
485
  "_ab_source_file_url": "b.csv",
487
486
  },
@@ -491,7 +490,6 @@ multi_csv_stream_n_file_exceeds_limit_for_inference = (
491
490
  "data": {
492
491
  "col1": "val21b",
493
492
  "col2": "val22b",
494
- "col3": "val23b",
495
493
  "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z",
496
494
  "_ab_source_file_url": "b.csv",
497
495
  },
@@ -2,9 +2,20 @@
2
2
  # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
3
  #
4
4
 
5
- from typing import Any, Mapping
5
+ import unittest
6
+ from datetime import datetime, timezone
7
+ from typing import Any, Iterable, Iterator, Mapping
8
+ from unittest.mock import Mock
6
9
 
7
10
  import pytest
11
+ from airbyte_cdk.models import Level
12
+ from airbyte_cdk.sources.file_based.availability_strategy import AbstractFileBasedAvailabilityStrategy
13
+ from airbyte_cdk.sources.file_based.discovery_policy import AbstractDiscoveryPolicy
14
+ from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
15
+ from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
16
+ from airbyte_cdk.sources.file_based.remote_file import RemoteFile
17
+ from airbyte_cdk.sources.file_based.schema_validation_policies import AbstractSchemaValidationPolicy
18
+ from airbyte_cdk.sources.file_based.stream.cursor import AbstractFileBasedCursor
8
19
  from airbyte_cdk.sources.file_based.stream.default_file_based_stream import DefaultFileBasedStream
9
20
 
10
21
 
@@ -46,3 +57,74 @@ from airbyte_cdk.sources.file_based.stream.default_file_based_stream import Defa
46
57
  )
47
58
  def test_fill_nulls(input_schema: Mapping[str, Any], expected_output: Mapping[str, Any]) -> None:
48
59
  assert DefaultFileBasedStream._fill_nulls(input_schema) == expected_output
60
+
61
+
62
+ class DefaultFileBasedStreamTest(unittest.TestCase):
63
+ _FILE_TYPE = "file_type"
64
+ _NOW = datetime(2022, 10, 22, tzinfo=timezone.utc)
65
+ _A_RECORD = {"a_record": 1}
66
+
67
+ def setUp(self) -> None:
68
+ self._stream_config = Mock()
69
+ self._stream_config.file_type = self._FILE_TYPE
70
+ self._stream_config.name = "a stream name"
71
+ self._catalog_schema = Mock()
72
+ self._stream_reader = Mock(spec=AbstractFileBasedStreamReader)
73
+ self._availability_strategy = Mock(spec=AbstractFileBasedAvailabilityStrategy)
74
+ self._discovery_policy = Mock(spec=AbstractDiscoveryPolicy)
75
+ self._parser = Mock(spec=FileTypeParser)
76
+ self._validation_policy = Mock(spec=AbstractSchemaValidationPolicy)
77
+ self._validation_policy.name = "validation policy name"
78
+ self._cursor = Mock(spec=AbstractFileBasedCursor)
79
+
80
+ self._stream = DefaultFileBasedStream(
81
+ config=self._stream_config,
82
+ catalog_schema=self._catalog_schema,
83
+ stream_reader=self._stream_reader,
84
+ availability_strategy=self._availability_strategy,
85
+ discovery_policy=self._discovery_policy,
86
+ parsers={self._FILE_TYPE: self._parser},
87
+ validation_policy=self._validation_policy,
88
+ cursor=self._cursor,
89
+ )
90
+
91
+ def test_when_read_records_from_slice_then_return_records(self) -> None:
92
+ self._parser.parse_records.return_value = [self._A_RECORD]
93
+ messages = list(self._stream.read_records_from_slice({"files": [RemoteFile(uri="uri", last_modified=self._NOW)]}))
94
+ assert list(map(lambda message: message.record.data["data"], messages)) == [self._A_RECORD]
95
+
96
+ def test_given_exception_when_read_records_from_slice_then_do_process_other_files(self) -> None:
97
+ """
98
+ The current behavior for source-s3 v3 does not fail sync on some errors and hence, we will keep this behaviour for now. One example
99
+ we can easily reproduce this is by having a file with gzip extension that is not actually a gzip file. The reader will fail to open
100
+ the file but the sync won't fail.
101
+ Ticket: https://github.com/airbytehq/airbyte/issues/29680
102
+ """
103
+ self._parser.parse_records.side_effect = [ValueError("An error"), [self._A_RECORD]]
104
+
105
+ messages = list(self._stream.read_records_from_slice({"files": [
106
+ RemoteFile(uri="invalid_file", last_modified=self._NOW),
107
+ RemoteFile(uri="valid_file", last_modified=self._NOW),
108
+ ]}))
109
+
110
+ assert messages[0].log.level == Level.ERROR
111
+ assert messages[1].record.data["data"] == self._A_RECORD
112
+
113
+ def test_given_exception_after_skipping_records_when_read_records_from_slice_then_send_warning(self) -> None:
114
+ self._stream_config.schemaless = False
115
+ self._validation_policy.record_passes_validation_policy.return_value = False
116
+ self._parser.parse_records.side_effect = [self._iter([self._A_RECORD, ValueError("An error")])]
117
+
118
+ messages = list(self._stream.read_records_from_slice({"files": [
119
+ RemoteFile(uri="invalid_file", last_modified=self._NOW),
120
+ RemoteFile(uri="valid_file", last_modified=self._NOW),
121
+ ]}))
122
+
123
+ assert messages[0].log.level == Level.ERROR
124
+ assert messages[1].log.level == Level.WARN
125
+
126
+ def _iter(self, x: Iterable[Any]) -> Iterator[Any]:
127
+ for item in x:
128
+ if isinstance(item, Exception):
129
+ raise item
130
+ yield item