airbyte-cdk 0.58.5__py3-none-any.whl → 0.58.7__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- airbyte_cdk/sources/file_based/config/avro_format.py +1 -0
- airbyte_cdk/sources/file_based/config/parquet_format.py +1 -0
- airbyte_cdk/sources/file_based/exceptions.py +26 -1
- airbyte_cdk/sources/file_based/file_based_source.py +5 -1
- airbyte_cdk/sources/file_based/file_types/avro_parser.py +15 -9
- airbyte_cdk/sources/file_based/file_types/csv_parser.py +19 -11
- airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +1 -1
- airbyte_cdk/sources/file_based/file_types/parquet_parser.py +22 -14
- airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +3 -1
- airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +8 -6
- {airbyte_cdk-0.58.5.dist-info → airbyte_cdk-0.58.7.dist-info}/METADATA +1 -1
- {airbyte_cdk-0.58.5.dist-info → airbyte_cdk-0.58.7.dist-info}/RECORD +20 -20
- unit_tests/sources/file_based/scenarios/csv_scenarios.py +110 -9
- unit_tests/sources/file_based/scenarios/unstructured_scenarios.py +4 -0
- unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py +16 -31
- unit_tests/sources/file_based/stream/test_default_file_based_stream.py +87 -10
- unit_tests/sources/file_based/test_file_based_scenarios.py +2 -0
- {airbyte_cdk-0.58.5.dist-info → airbyte_cdk-0.58.7.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-0.58.5.dist-info → airbyte_cdk-0.58.7.dist-info}/WHEEL +0 -0
- {airbyte_cdk-0.58.5.dist-info → airbyte_cdk-0.58.7.dist-info}/top_level.txt +0 -0
@@ -3,8 +3,9 @@
|
|
3
3
|
#
|
4
4
|
|
5
5
|
from enum import Enum
|
6
|
-
from typing import Union
|
6
|
+
from typing import Any, List, Union
|
7
7
|
|
8
|
+
from airbyte_cdk.models import AirbyteMessage, FailureType
|
8
9
|
from airbyte_cdk.utils import AirbyteTracedException
|
9
10
|
|
10
11
|
|
@@ -40,6 +41,30 @@ class FileBasedSourceError(Enum):
|
|
40
41
|
UNDEFINED_VALIDATION_POLICY = "The validation policy defined in the config does not exist for the source."
|
41
42
|
|
42
43
|
|
44
|
+
class FileBasedErrorsCollector:
|
45
|
+
"""
|
46
|
+
The placeholder for all errors collected.
|
47
|
+
"""
|
48
|
+
|
49
|
+
errors: List[AirbyteMessage] = []
|
50
|
+
|
51
|
+
def yield_and_raise_collected(self) -> Any:
|
52
|
+
if self.errors:
|
53
|
+
# emit collected logged messages
|
54
|
+
yield from self.errors
|
55
|
+
# clean the collector
|
56
|
+
self.errors.clear()
|
57
|
+
# raising the single exception
|
58
|
+
raise AirbyteTracedException(
|
59
|
+
internal_message="Please check the logged errors for more information.",
|
60
|
+
message="Some errors occured while reading from the source.",
|
61
|
+
failure_type=FailureType.config_error,
|
62
|
+
)
|
63
|
+
|
64
|
+
def collect(self, logged_error: AirbyteMessage) -> None:
|
65
|
+
self.errors.append(logged_error)
|
66
|
+
|
67
|
+
|
43
68
|
class BaseFileBasedSourceError(Exception):
|
44
69
|
def __init__(self, error: Union[FileBasedSourceError, str], **kwargs): # type: ignore # noqa
|
45
70
|
if isinstance(error, FileBasedSourceError):
|
@@ -14,7 +14,7 @@ from airbyte_cdk.sources.file_based.availability_strategy import AbstractFileBas
|
|
14
14
|
from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import AbstractFileBasedSpec
|
15
15
|
from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig, ValidationPolicy
|
16
16
|
from airbyte_cdk.sources.file_based.discovery_policy import AbstractDiscoveryPolicy, DefaultDiscoveryPolicy
|
17
|
-
from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError, FileBasedSourceError
|
17
|
+
from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError, FileBasedErrorsCollector, FileBasedSourceError
|
18
18
|
from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
|
19
19
|
from airbyte_cdk.sources.file_based.file_types import default_parsers
|
20
20
|
from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
|
@@ -49,6 +49,7 @@ class FileBasedSource(AbstractSource, ABC):
|
|
49
49
|
self.stream_schemas = {s.stream.name: s.stream.json_schema for s in catalog.streams} if catalog else {}
|
50
50
|
self.cursor_cls = cursor_cls
|
51
51
|
self.logger = logging.getLogger(f"airbyte.{self.name}")
|
52
|
+
self.errors_collector: FileBasedErrorsCollector = FileBasedErrorsCollector()
|
52
53
|
|
53
54
|
def check_connection(self, logger: logging.Logger, config: Mapping[str, Any]) -> Tuple[bool, Optional[Any]]:
|
54
55
|
"""
|
@@ -106,6 +107,7 @@ class FileBasedSource(AbstractSource, ABC):
|
|
106
107
|
parsers=self.parsers,
|
107
108
|
validation_policy=self._validate_and_get_validation_policy(stream_config),
|
108
109
|
cursor=self.cursor_cls(stream_config),
|
110
|
+
errors_collector=self.errors_collector,
|
109
111
|
)
|
110
112
|
)
|
111
113
|
return streams
|
@@ -121,6 +123,8 @@ class FileBasedSource(AbstractSource, ABC):
|
|
121
123
|
state: Optional[Union[List[AirbyteStateMessage], MutableMapping[str, Any]]] = None,
|
122
124
|
) -> Iterator[AirbyteMessage]:
|
123
125
|
yield from super().read(logger, config, catalog, state)
|
126
|
+
# emit all the errors collected
|
127
|
+
yield from self.errors_collector.yield_and_raise_collected()
|
124
128
|
# count streams using a certain parser
|
125
129
|
parsed_config = self._get_parsed_config(config)
|
126
130
|
for parser, count in Counter(stream.format.filetype for stream in parsed_config.streams).items():
|
@@ -8,6 +8,7 @@ from typing import Any, Dict, Iterable, Mapping, Optional, Tuple
|
|
8
8
|
import fastavro
|
9
9
|
from airbyte_cdk.sources.file_based.config.avro_format import AvroFormat
|
10
10
|
from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
|
11
|
+
from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError
|
11
12
|
from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
|
12
13
|
from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
|
13
14
|
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
|
@@ -144,15 +145,20 @@ class AvroParser(FileTypeParser):
|
|
144
145
|
if not isinstance(avro_format, AvroFormat):
|
145
146
|
raise ValueError(f"Expected ParquetFormat, got {avro_format}")
|
146
147
|
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
148
|
+
line_no = 0
|
149
|
+
try:
|
150
|
+
with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp:
|
151
|
+
avro_reader = fastavro.reader(fp)
|
152
|
+
schema = avro_reader.writer_schema
|
153
|
+
schema_field_name_to_type = {field["name"]: field["type"] for field in schema["fields"]}
|
154
|
+
for record in avro_reader:
|
155
|
+
line_no += 1
|
156
|
+
yield {
|
157
|
+
record_field: self._to_output_value(avro_format, schema_field_name_to_type[record_field], record[record_field])
|
158
|
+
for record_field, record_value in schema_field_name_to_type.items()
|
159
|
+
}
|
160
|
+
except Exception as exc:
|
161
|
+
raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri, lineno=line_no) from exc
|
156
162
|
|
157
163
|
@property
|
158
164
|
def file_read_mode(self) -> FileReadMode:
|
@@ -178,17 +178,25 @@ class CsvParser(FileTypeParser):
|
|
178
178
|
logger: logging.Logger,
|
179
179
|
discovered_schema: Optional[Mapping[str, SchemaType]],
|
180
180
|
) -> Iterable[Dict[str, Any]]:
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
181
|
+
line_no = 0
|
182
|
+
try:
|
183
|
+
config_format = _extract_format(config)
|
184
|
+
if discovered_schema:
|
185
|
+
property_types = {col: prop["type"] for col, prop in discovered_schema["properties"].items()} # type: ignore # discovered_schema["properties"] is known to be a mapping
|
186
|
+
deduped_property_types = CsvParser._pre_propcess_property_types(property_types)
|
187
|
+
else:
|
188
|
+
deduped_property_types = {}
|
189
|
+
cast_fn = CsvParser._get_cast_function(deduped_property_types, config_format, logger, config.schemaless)
|
190
|
+
data_generator = self._csv_reader.read_data(config, file, stream_reader, logger, self.file_read_mode)
|
191
|
+
for row in data_generator:
|
192
|
+
line_no += 1
|
193
|
+
yield CsvParser._to_nullable(
|
194
|
+
cast_fn(row), deduped_property_types, config_format.null_values, config_format.strings_can_be_null
|
195
|
+
)
|
196
|
+
except RecordParseError as parse_err:
|
197
|
+
raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri, lineno=line_no) from parse_err
|
198
|
+
finally:
|
199
|
+
data_generator.close()
|
192
200
|
|
193
201
|
@property
|
194
202
|
def file_read_mode(self) -> FileReadMode:
|
@@ -119,7 +119,7 @@ class JsonlParser(FileTypeParser):
|
|
119
119
|
break
|
120
120
|
|
121
121
|
if had_json_parsing_error and not yielded_at_least_once:
|
122
|
-
raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD)
|
122
|
+
raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri, lineno=line)
|
123
123
|
|
124
124
|
@staticmethod
|
125
125
|
def _instantiate_accumulator(line: Union[bytes, str]) -> Union[bytes, str]:
|
@@ -11,7 +11,7 @@ from urllib.parse import unquote
|
|
11
11
|
import pyarrow as pa
|
12
12
|
import pyarrow.parquet as pq
|
13
13
|
from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig, ParquetFormat
|
14
|
-
from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError, FileBasedSourceError
|
14
|
+
from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError, FileBasedSourceError, RecordParseError
|
15
15
|
from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
|
16
16
|
from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
|
17
17
|
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
|
@@ -64,19 +64,27 @@ class ParquetParser(FileTypeParser):
|
|
64
64
|
if not isinstance(parquet_format, ParquetFormat):
|
65
65
|
logger.info(f"Expected ParquetFormat, got {parquet_format}")
|
66
66
|
raise ConfigValidationError(FileBasedSourceError.CONFIG_VALIDATION_ERROR)
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
for
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
67
|
+
|
68
|
+
line_no = 0
|
69
|
+
try:
|
70
|
+
with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp:
|
71
|
+
reader = pq.ParquetFile(fp)
|
72
|
+
partition_columns = {x.split("=")[0]: x.split("=")[1] for x in self._extract_partitions(file.uri)}
|
73
|
+
for row_group in range(reader.num_row_groups):
|
74
|
+
batch = reader.read_row_group(row_group)
|
75
|
+
for row in range(batch.num_rows):
|
76
|
+
line_no += 1
|
77
|
+
yield {
|
78
|
+
**{
|
79
|
+
column: ParquetParser._to_output_value(batch.column(column)[row], parquet_format)
|
80
|
+
for column in batch.column_names
|
81
|
+
},
|
82
|
+
**partition_columns,
|
83
|
+
}
|
84
|
+
except Exception as exc:
|
85
|
+
raise RecordParseError(
|
86
|
+
FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri, lineno=f"{row_group=}, {line_no=}"
|
87
|
+
) from exc
|
80
88
|
|
81
89
|
@staticmethod
|
82
90
|
def _extract_partitions(filepath: str) -> List[str]:
|
@@ -10,7 +10,7 @@ from airbyte_cdk.models import SyncMode
|
|
10
10
|
from airbyte_cdk.sources.file_based.availability_strategy import AbstractFileBasedAvailabilityStrategy
|
11
11
|
from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig, PrimaryKeyType
|
12
12
|
from airbyte_cdk.sources.file_based.discovery_policy import AbstractDiscoveryPolicy
|
13
|
-
from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError, UndefinedParserError
|
13
|
+
from airbyte_cdk.sources.file_based.exceptions import FileBasedErrorsCollector, FileBasedSourceError, RecordParseError, UndefinedParserError
|
14
14
|
from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
|
15
15
|
from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
|
16
16
|
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
|
@@ -44,6 +44,7 @@ class AbstractFileBasedStream(Stream):
|
|
44
44
|
discovery_policy: AbstractDiscoveryPolicy,
|
45
45
|
parsers: Dict[Type[Any], FileTypeParser],
|
46
46
|
validation_policy: AbstractSchemaValidationPolicy,
|
47
|
+
errors_collector: FileBasedErrorsCollector,
|
47
48
|
):
|
48
49
|
super().__init__()
|
49
50
|
self.config = config
|
@@ -53,6 +54,7 @@ class AbstractFileBasedStream(Stream):
|
|
53
54
|
self._discovery_policy = discovery_policy
|
54
55
|
self._availability_strategy = availability_strategy
|
55
56
|
self._parsers = parsers
|
57
|
+
self.errors_collector = errors_collector
|
56
58
|
|
57
59
|
@property
|
58
60
|
@abstractmethod
|
@@ -112,12 +112,14 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
|
|
112
112
|
except RecordParseError:
|
113
113
|
# Increment line_no because the exception was raised before we could increment it
|
114
114
|
line_no += 1
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
115
|
+
self.errors_collector.collect(
|
116
|
+
AirbyteMessage(
|
117
|
+
type=MessageType.LOG,
|
118
|
+
log=AirbyteLogMessage(
|
119
|
+
level=Level.ERROR,
|
120
|
+
message=f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream={self.name} file={file.uri} line_no={line_no} n_skipped={n_skipped}",
|
121
|
+
stack_trace=traceback.format_exc(),
|
122
|
+
),
|
121
123
|
),
|
122
124
|
)
|
123
125
|
|
@@ -152,8 +152,8 @@ airbyte_cdk/sources/embedded/catalog.py,sha256=mIM7rO5CZAUIHKbrKwn1-Zn9_e3sLiHrT
|
|
152
152
|
airbyte_cdk/sources/embedded/runner.py,sha256=kZ0CcUANuMjdZ4fmvp_w9P2IcsS9WSHxNqYHqMwcfXI,1390
|
153
153
|
airbyte_cdk/sources/embedded/tools.py,sha256=-Z4tZ4AP1OTi_zrqFM3YV8Rt7c60wvsrv0Dc-rTZ2uw,744
|
154
154
|
airbyte_cdk/sources/file_based/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
155
|
-
airbyte_cdk/sources/file_based/exceptions.py,sha256
|
156
|
-
airbyte_cdk/sources/file_based/file_based_source.py,sha256=
|
155
|
+
airbyte_cdk/sources/file_based/exceptions.py,sha256=-SjdDk-mbkp5qQVUESkn788W8NmGtC2LROkZRKS_Dxc,5613
|
156
|
+
airbyte_cdk/sources/file_based/file_based_source.py,sha256=XddFHSiL_a-VOfQF33yXVapUG6wvHu2hd9xxYEoBcuc,8180
|
157
157
|
airbyte_cdk/sources/file_based/file_based_stream_reader.py,sha256=K9fFHcSL4E8v-X2l38wRAcZCjpyifr35orvby8vQt84,3749
|
158
158
|
airbyte_cdk/sources/file_based/remote_file.py,sha256=dtRX7X06Fug-XDz93a5lkwPQy5nQgxH0-ZcXW2HuMGI,312
|
159
159
|
airbyte_cdk/sources/file_based/schema_helpers.py,sha256=XBkOutIw_n6SNYU34qbyTbl0Ppt0i4k3sVFMSaX3wJo,9103
|
@@ -163,28 +163,28 @@ airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availab
|
|
163
163
|
airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py,sha256=TXo8kdwOQ3XiQbS3ccPtj9FghHFpiVL2JRWjen3NRXw,5289
|
164
164
|
airbyte_cdk/sources/file_based/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
165
165
|
airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py,sha256=dgOoQuoi7-7wdTMSP7wz4ENXIDT49Ew4FoAxnnplGGc,4956
|
166
|
-
airbyte_cdk/sources/file_based/config/avro_format.py,sha256=
|
166
|
+
airbyte_cdk/sources/file_based/config/avro_format.py,sha256=q1I2G9bGNy3ADds35PfWT7Mss6fjYzUtYDkUYvh5v7s,712
|
167
167
|
airbyte_cdk/sources/file_based/config/csv_format.py,sha256=L3JEgb91yrCob1oYrGl0088QEWblkOsRfDmMfWRQ0bg,7482
|
168
168
|
airbyte_cdk/sources/file_based/config/file_based_stream_config.py,sha256=l9DFyttYbxY9exwy67WzRXySEk_yKV2G_THRA_Sq1I4,4229
|
169
169
|
airbyte_cdk/sources/file_based/config/jsonl_format.py,sha256=fAPzZnoghGgHjaDvx6Qo68C8j54mBxo1NTdpwSI0VZo,374
|
170
|
-
airbyte_cdk/sources/file_based/config/parquet_format.py,sha256=
|
170
|
+
airbyte_cdk/sources/file_based/config/parquet_format.py,sha256=yKHgXYu3zJWrGfBlJ3JQZ3gVFPumF-K4rjVPNoYTUZ0,737
|
171
171
|
airbyte_cdk/sources/file_based/config/unstructured_format.py,sha256=axuIc4xaac7vTJKi8I9l7Lgn8gGu6bCuZNouQAEAvYs,3513
|
172
172
|
airbyte_cdk/sources/file_based/discovery_policy/__init__.py,sha256=x_7JsQGiS7Ytmr0ZDS0SNYGcNUzC4wCm3_1-Mf3ZFnw,283
|
173
173
|
airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py,sha256=0o_qmEO0-IojO4Ckgp4V3ackTM9Ui1sUHW5HwANueLM,621
|
174
174
|
airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py,sha256=QeZghVmf2Cq4wy_6NYcHmR6SLgdWfsGgctYg2ZsjFE4,939
|
175
175
|
airbyte_cdk/sources/file_based/file_types/__init__.py,sha256=hWSDNKCIqvi7gOyfZJezuKt6-JtVroerUVTvW3ZY-R4,1017
|
176
|
-
airbyte_cdk/sources/file_based/file_types/avro_parser.py,sha256=
|
177
|
-
airbyte_cdk/sources/file_based/file_types/csv_parser.py,sha256=
|
176
|
+
airbyte_cdk/sources/file_based/file_types/avro_parser.py,sha256=FC3L6D32SzhAv4jyS5dnsIgYmvGHgbomJpI2xRWrbZ0,9167
|
177
|
+
airbyte_cdk/sources/file_based/file_types/csv_parser.py,sha256=HqglbQ7vopMbEWC5F-PuB-4ycXgDHLHxq1sN6IGPUpE,18215
|
178
178
|
airbyte_cdk/sources/file_based/file_types/file_type_parser.py,sha256=Gbn-8v1-jLhKpJXTNOOc5PZT1Jzah6G-INCZt4snLdQ,2819
|
179
|
-
airbyte_cdk/sources/file_based/file_types/jsonl_parser.py,sha256=
|
180
|
-
airbyte_cdk/sources/file_based/file_types/parquet_parser.py,sha256=
|
179
|
+
airbyte_cdk/sources/file_based/file_types/jsonl_parser.py,sha256=MkjK_J2OqzqRPyGeQFQFADxgwqsRaNtoawB7dwKxWb0,5666
|
180
|
+
airbyte_cdk/sources/file_based/file_types/parquet_parser.py,sha256=8ZuuYnS2AzlJ-IaeBP6Pnjzu4Z2zzfBWw_x9Rt9a5Qs,9363
|
181
181
|
airbyte_cdk/sources/file_based/file_types/unstructured_parser.py,sha256=omYdo6daIHI-YWF9WsKFdFHRXTFWgJjJ3OqegiN345k,16736
|
182
182
|
airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py,sha256=sEVnRhZ8x9f7PNjo6lewxid9z0PI8eSj7gSoFC3MH1Y,527
|
183
183
|
airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py,sha256=uwk6Ugf23xKG4PRPVVRVwpcNjTwPgxejl03vLSEzK0s,604
|
184
184
|
airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py,sha256=ZeAa0z50ywMU2chNjQ7JpL4yePU1NajhBa8FS7rXLVo,1643
|
185
185
|
airbyte_cdk/sources/file_based/stream/__init__.py,sha256=QPDqdgjsabOQD93dSFqHGaFS_3pIwm-chEabZHiPJi0,265
|
186
|
-
airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py,sha256=
|
187
|
-
airbyte_cdk/sources/file_based/stream/default_file_based_stream.py,sha256=
|
186
|
+
airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py,sha256=GJu02B2_fcfaOnSBvhpRyXIEEtu4v8ubFR_vQpe-YAU,6405
|
187
|
+
airbyte_cdk/sources/file_based/stream/default_file_based_stream.py,sha256=Ou8vKnaR7yhEG9NoOLfwOHqxYBro055nB9CCBPC2I2s,12555
|
188
188
|
airbyte_cdk/sources/file_based/stream/cursor/__init__.py,sha256=MhFB5hOo8sjwvCh8gangaymdg3EJWYt_72brFOZt068,191
|
189
189
|
airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py,sha256=i-FPeK8lwCzX34GCcmvL5Yvdh8-uu7FeCVYDoFbD7IY,1920
|
190
190
|
airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py,sha256=kuJRKgDYOGXRk0V0I8BpFxg0hGv7SfV_nBpmmn45F88,6815
|
@@ -363,7 +363,7 @@ unit_tests/sources/declarative/stream_slicers/test_cartesian_product_stream_slic
|
|
363
363
|
unit_tests/sources/file_based/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
364
364
|
unit_tests/sources/file_based/helpers.py,sha256=MZTwaWtX0a6TPbFcUMP-EgqBunK2wpoElgApCEE1bN4,2659
|
365
365
|
unit_tests/sources/file_based/in_memory_files_source.py,sha256=r2yD6-_ABXG7_PIyTq4ACN21sHyg3g-Hd9dIgxfDQUk,8235
|
366
|
-
unit_tests/sources/file_based/test_file_based_scenarios.py,sha256=
|
366
|
+
unit_tests/sources/file_based/test_file_based_scenarios.py,sha256=rQaORUdsRdWxTMMshJxAxnp3x6Bsnuirit4yjrT0Oao,11680
|
367
367
|
unit_tests/sources/file_based/test_file_based_stream_reader.py,sha256=P6yTp7tbPfREzi5SXg4SSSql5nxiRV571YdOmwb_SzY,9219
|
368
368
|
unit_tests/sources/file_based/test_scenarios.py,sha256=ONBUwnX_dWOaejKiuJQgMRWgr_0NCWJpTwf4nvw_ePg,8008
|
369
369
|
unit_tests/sources/file_based/test_schema_helpers.py,sha256=IYIDdLRK41RkSG_ZW2cagAt9krV4QLbkzu6r7vPx9Js,12047
|
@@ -384,18 +384,18 @@ unit_tests/sources/file_based/file_types/test_unstructured_parser.py,sha256=kmVl
|
|
384
384
|
unit_tests/sources/file_based/scenarios/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
385
385
|
unit_tests/sources/file_based/scenarios/avro_scenarios.py,sha256=oeQUmCV7d2aTShreYc-PvVb4cWqLSsVwHfg-lcKjzPs,30554
|
386
386
|
unit_tests/sources/file_based/scenarios/check_scenarios.py,sha256=0xkt21ASTnTAMP0RYJEsF3yMGsNN7wWOoG_tmzL9PYw,6750
|
387
|
-
unit_tests/sources/file_based/scenarios/csv_scenarios.py,sha256=
|
387
|
+
unit_tests/sources/file_based/scenarios/csv_scenarios.py,sha256=4RYb8_C7sJM_6pI-cftP5fCk0i6dr4BW1lpY5iQDdN8,123795
|
388
388
|
unit_tests/sources/file_based/scenarios/file_based_source_builder.py,sha256=wgb7l5VohcEvZT82ZpJcjINSrjuJtzJZS4zuZjdKpJ4,3874
|
389
389
|
unit_tests/sources/file_based/scenarios/incremental_scenarios.py,sha256=B7YE2IbvgTH_v7DYQEuv7yn2IG15aKUvJ_7dA4d3Cg4,69413
|
390
390
|
unit_tests/sources/file_based/scenarios/jsonl_scenarios.py,sha256=LsOf-tpjWNuwskPcgAMhMpQQ3iaHaD3PjPmt2M2zSzo,31839
|
391
391
|
unit_tests/sources/file_based/scenarios/parquet_scenarios.py,sha256=MGgLCqkTJb8uNEwYZY3zbVVDZRSBKSmf2s8VMuYse_I,26549
|
392
392
|
unit_tests/sources/file_based/scenarios/scenario_builder.py,sha256=zSZtaYUflkosflxQGrTDxiJ24mhFsTJYosKyxAHgWbM,9475
|
393
|
-
unit_tests/sources/file_based/scenarios/unstructured_scenarios.py,sha256=
|
393
|
+
unit_tests/sources/file_based/scenarios/unstructured_scenarios.py,sha256=2_p15Phk2xiBgZ0OdGYrCU9eAlTT8h_SU5nk1ehUcLk,67894
|
394
394
|
unit_tests/sources/file_based/scenarios/user_input_schema_scenarios.py,sha256=FVYbRfdj2RCLFVwUNqQKiBFMm78y6FvmTO447i3SXqY,28697
|
395
|
-
unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py,sha256=
|
395
|
+
unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py,sha256=fFcNR-lzfLQ5SS8Uetbx6iFijgs_OXlqYz3Pr1OVTAI,32221
|
396
396
|
unit_tests/sources/file_based/stream/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
397
397
|
unit_tests/sources/file_based/stream/test_default_file_based_cursor.py,sha256=XhtCGvgSBFyeQwgqGciPsIB1HIlWqTcXROwnxrjutHc,13109
|
398
|
-
unit_tests/sources/file_based/stream/test_default_file_based_stream.py,sha256=
|
398
|
+
unit_tests/sources/file_based/stream/test_default_file_based_stream.py,sha256=1GZPMIL00KGMIaYcPPBhQ0gpkYAJ48xtxXOgEwxkg84,10263
|
399
399
|
unit_tests/sources/fixtures/__init__.py,sha256=ZnqYNxHsKCgO38IwB34RQyRMXTs4GTvlRi3ImKnIioo,61
|
400
400
|
unit_tests/sources/fixtures/source_test_fixture.py,sha256=dvpISgio2sOp-U3bXudH_49vY4c68sO_PMs1JZTMaj0,5502
|
401
401
|
unit_tests/sources/message/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -444,8 +444,8 @@ unit_tests/utils/test_schema_inferrer.py,sha256=Z2jHBZ540wnYkylIdV_2xr75Vtwlxuyg
|
|
444
444
|
unit_tests/utils/test_secret_utils.py,sha256=XKe0f1RHYii8iwE6ATmBr5JGDI1pzzrnZUGdUSMJQP4,4886
|
445
445
|
unit_tests/utils/test_stream_status_utils.py,sha256=Xr8MZ2HWgTVIyMbywDvuYkRaUF4RZLQOT8-JjvcfR24,2970
|
446
446
|
unit_tests/utils/test_traced_exception.py,sha256=bDFP5zMBizFenz6V2WvEZTRCKGB5ijh3DBezjbfoYIs,4198
|
447
|
-
airbyte_cdk-0.58.
|
448
|
-
airbyte_cdk-0.58.
|
449
|
-
airbyte_cdk-0.58.
|
450
|
-
airbyte_cdk-0.58.
|
451
|
-
airbyte_cdk-0.58.
|
447
|
+
airbyte_cdk-0.58.7.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
|
448
|
+
airbyte_cdk-0.58.7.dist-info/METADATA,sha256=PAw5bOce761Nqfm14qOw6Rk60S5JqPvK9d8oUnzv8vc,11073
|
449
|
+
airbyte_cdk-0.58.7.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
|
450
|
+
airbyte_cdk-0.58.7.dist-info/top_level.txt,sha256=edvsDKTnE6sD2wfCUaeTfKf5gQIL6CPVMwVL2sWZzqo,51
|
451
|
+
airbyte_cdk-0.58.7.dist-info/RECORD,,
|
@@ -852,6 +852,109 @@ invalid_csv_scenario: TestScenario[InMemoryFilesSource] = (
|
|
852
852
|
]
|
853
853
|
}
|
854
854
|
)
|
855
|
+
.set_expected_read_error(
|
856
|
+
AirbyteTracedException,
|
857
|
+
"Please check the logged errors for more information.",
|
858
|
+
)
|
859
|
+
).build()
|
860
|
+
|
861
|
+
invalid_csv_multi_scenario: TestScenario[InMemoryFilesSource] = (
|
862
|
+
TestScenarioBuilder[InMemoryFilesSource]()
|
863
|
+
.set_name("invalid_csv_multi_scenario") # too many values for the number of headers
|
864
|
+
.set_config(
|
865
|
+
{
|
866
|
+
"streams": [
|
867
|
+
{
|
868
|
+
"name": "stream1",
|
869
|
+
"format": {"filetype": "csv"},
|
870
|
+
"globs": ["*"],
|
871
|
+
"validation_policy": "Emit Record",
|
872
|
+
},
|
873
|
+
{
|
874
|
+
"name": "stream2",
|
875
|
+
"format": {"filetype": "csv"},
|
876
|
+
"globs": ["b.csv"],
|
877
|
+
"validation_policy": "Emit Record",
|
878
|
+
},
|
879
|
+
]
|
880
|
+
}
|
881
|
+
)
|
882
|
+
.set_source_builder(
|
883
|
+
FileBasedSourceBuilder()
|
884
|
+
.set_files(
|
885
|
+
{
|
886
|
+
"a.csv": {
|
887
|
+
"contents": [
|
888
|
+
("col1",),
|
889
|
+
("val11", "val12"),
|
890
|
+
("val21", "val22"),
|
891
|
+
],
|
892
|
+
"last_modified": "2023-06-05T03:54:07.000Z",
|
893
|
+
},
|
894
|
+
"b.csv": {
|
895
|
+
"contents": [
|
896
|
+
("col3",),
|
897
|
+
("val13b", "val14b"),
|
898
|
+
("val23b", "val24b"),
|
899
|
+
],
|
900
|
+
"last_modified": "2023-06-05T03:54:07.000Z",
|
901
|
+
},
|
902
|
+
}
|
903
|
+
)
|
904
|
+
.set_file_type("csv")
|
905
|
+
)
|
906
|
+
.set_expected_catalog(
|
907
|
+
{
|
908
|
+
"streams": [
|
909
|
+
{
|
910
|
+
"default_cursor_field": ["_ab_source_file_last_modified"],
|
911
|
+
"json_schema": {
|
912
|
+
"type": "object",
|
913
|
+
"properties": {
|
914
|
+
"col1": {"type": ["null", "string"]},
|
915
|
+
"col2": {"type": ["null", "string"]},
|
916
|
+
"_ab_source_file_last_modified": {"type": "string"},
|
917
|
+
"_ab_source_file_url": {"type": "string"},
|
918
|
+
},
|
919
|
+
},
|
920
|
+
"name": "stream1",
|
921
|
+
"source_defined_cursor": True,
|
922
|
+
"supported_sync_modes": ["full_refresh", "incremental"],
|
923
|
+
},
|
924
|
+
{
|
925
|
+
"json_schema": {
|
926
|
+
"type": "object",
|
927
|
+
"properties": {
|
928
|
+
"col3": {"type": ["null", "string"]},
|
929
|
+
"_ab_source_file_last_modified": {"type": "string"},
|
930
|
+
"_ab_source_file_url": {"type": "string"},
|
931
|
+
},
|
932
|
+
},
|
933
|
+
"name": "stream2",
|
934
|
+
"source_defined_cursor": True,
|
935
|
+
"default_cursor_field": ["_ab_source_file_last_modified"],
|
936
|
+
"supported_sync_modes": ["full_refresh", "incremental"],
|
937
|
+
},
|
938
|
+
]
|
939
|
+
}
|
940
|
+
)
|
941
|
+
.set_expected_records([])
|
942
|
+
.set_expected_discover_error(AirbyteTracedException, FileBasedSourceError.SCHEMA_INFERENCE_ERROR.value)
|
943
|
+
.set_expected_logs(
|
944
|
+
{
|
945
|
+
"read": [
|
946
|
+
{
|
947
|
+
"level": "ERROR",
|
948
|
+
"message": f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream=stream1 file=a.csv line_no=1 n_skipped=0",
|
949
|
+
},
|
950
|
+
{
|
951
|
+
"level": "ERROR",
|
952
|
+
"message": f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream=stream2 file=b.csv line_no=1 n_skipped=0",
|
953
|
+
},
|
954
|
+
]
|
955
|
+
}
|
956
|
+
)
|
957
|
+
.set_expected_read_error(AirbyteTracedException, "Please check the logged errors for more information.")
|
855
958
|
).build()
|
856
959
|
|
857
960
|
csv_single_stream_scenario: TestScenario[InMemoryFilesSource] = (
|
@@ -2172,17 +2275,15 @@ csv_newline_in_values_not_quoted_scenario: TestScenario[InMemoryFilesSource] = (
|
|
2172
2275
|
},
|
2173
2276
|
]
|
2174
2277
|
)
|
2175
|
-
.
|
2176
|
-
|
2177
|
-
|
2178
|
-
{
|
2179
|
-
"level": "ERROR",
|
2180
|
-
"message": "Error parsing record. This could be due to a mismatch between the config's file type and the actual file type, or because the file or record is not parseable. stream=stream1 file=a.csv line_no=2 n_skipped=0",
|
2181
|
-
}
|
2182
|
-
]
|
2183
|
-
}
|
2278
|
+
.set_expected_read_error(
|
2279
|
+
AirbyteTracedException,
|
2280
|
+
f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream=stream1 file=a.csv line_no=2 n_skipped=0",
|
2184
2281
|
)
|
2185
2282
|
.set_expected_discover_error(AirbyteTracedException, FileBasedSourceError.SCHEMA_INFERENCE_ERROR.value)
|
2283
|
+
.set_expected_read_error(
|
2284
|
+
AirbyteTracedException,
|
2285
|
+
"Please check the logged errors for more information.",
|
2286
|
+
)
|
2186
2287
|
).build()
|
2187
2288
|
|
2188
2289
|
csv_escape_char_is_set_scenario: TestScenario[InMemoryFilesSource] = (
|
@@ -231,6 +231,10 @@ unstructured_invalid_file_type_discover_scenario_no_skip = (
|
|
231
231
|
)
|
232
232
|
.set_expected_records([])
|
233
233
|
.set_expected_discover_error(AirbyteTracedException, "Error inferring schema from files")
|
234
|
+
.set_expected_read_error(
|
235
|
+
AirbyteTracedException,
|
236
|
+
"Please check the logged errors for more information.",
|
237
|
+
)
|
234
238
|
).build()
|
235
239
|
|
236
240
|
# If skip unprocessable file types is set to true, then discover will succeed even if there are non-matching file types
|
@@ -2,7 +2,8 @@
|
|
2
2
|
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
3
3
|
#
|
4
4
|
|
5
|
-
|
5
|
+
|
6
|
+
from airbyte_cdk.utils.traced_exception import AirbyteTracedException
|
6
7
|
from unit_tests.sources.file_based.scenarios.file_based_source_builder import FileBasedSourceBuilder
|
7
8
|
from unit_tests.sources.file_based.scenarios.scenario_builder import TestScenarioBuilder
|
8
9
|
|
@@ -272,6 +273,10 @@ skip_record_scenario_single_stream = (
|
|
272
273
|
]
|
273
274
|
}
|
274
275
|
)
|
276
|
+
.set_expected_read_error(
|
277
|
+
AirbyteTracedException,
|
278
|
+
"Please check the logged errors for more information.",
|
279
|
+
)
|
275
280
|
).build()
|
276
281
|
|
277
282
|
|
@@ -416,6 +421,10 @@ skip_record_scenario_multi_stream = (
|
|
416
421
|
]
|
417
422
|
}
|
418
423
|
)
|
424
|
+
.set_expected_read_error(
|
425
|
+
AirbyteTracedException,
|
426
|
+
"Please check the logged errors for more information.",
|
427
|
+
)
|
419
428
|
).build()
|
420
429
|
|
421
430
|
|
@@ -492,19 +501,9 @@ emit_record_scenario_single_stream = (
|
|
492
501
|
},
|
493
502
|
]
|
494
503
|
)
|
495
|
-
.
|
496
|
-
|
497
|
-
|
498
|
-
{
|
499
|
-
"level": "ERROR",
|
500
|
-
"message": f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream=stream1 file=c.csv line_no=2 n_skipped=0",
|
501
|
-
},
|
502
|
-
{
|
503
|
-
"level": "WARN",
|
504
|
-
"message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer",
|
505
|
-
},
|
506
|
-
]
|
507
|
-
}
|
504
|
+
.set_expected_read_error(
|
505
|
+
AirbyteTracedException,
|
506
|
+
"Please check the logged errors for more information.",
|
508
507
|
)
|
509
508
|
).build()
|
510
509
|
|
@@ -640,23 +639,9 @@ emit_record_scenario_multi_stream = (
|
|
640
639
|
},
|
641
640
|
]
|
642
641
|
)
|
643
|
-
.
|
644
|
-
|
645
|
-
|
646
|
-
{
|
647
|
-
"level": "ERROR",
|
648
|
-
"message": f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream=stream1 file=a/a3.csv line_no=2 n_skipped=0",
|
649
|
-
},
|
650
|
-
{
|
651
|
-
"level": "WARN",
|
652
|
-
"message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer",
|
653
|
-
},
|
654
|
-
{
|
655
|
-
"level": "WARN",
|
656
|
-
"message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer",
|
657
|
-
},
|
658
|
-
]
|
659
|
-
}
|
642
|
+
.set_expected_read_error(
|
643
|
+
AirbyteTracedException,
|
644
|
+
"Please check the logged errors for more information.",
|
660
645
|
)
|
661
646
|
).build()
|
662
647
|
|
@@ -2,15 +2,18 @@
|
|
2
2
|
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
3
3
|
#
|
4
4
|
|
5
|
+
import traceback
|
5
6
|
import unittest
|
6
7
|
from datetime import datetime, timezone
|
7
8
|
from typing import Any, Iterable, Iterator, Mapping
|
8
9
|
from unittest.mock import Mock
|
9
10
|
|
10
11
|
import pytest
|
11
|
-
from airbyte_cdk.models import Level
|
12
|
+
from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, Level
|
13
|
+
from airbyte_cdk.models import Type as MessageType
|
12
14
|
from airbyte_cdk.sources.file_based.availability_strategy import AbstractFileBasedAvailabilityStrategy
|
13
15
|
from airbyte_cdk.sources.file_based.discovery_policy import AbstractDiscoveryPolicy
|
16
|
+
from airbyte_cdk.sources.file_based.exceptions import FileBasedErrorsCollector, FileBasedSourceError
|
14
17
|
from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
|
15
18
|
from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
|
16
19
|
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
|
@@ -55,12 +58,17 @@ class MockFormat:
|
|
55
58
|
),
|
56
59
|
pytest.param(
|
57
60
|
{"type": "object", "properties": {"prop": {"type": "string"}}},
|
58
|
-
{
|
61
|
+
{
|
62
|
+
"type": ["null", "object"],
|
63
|
+
"properties": {"prop": {"type": ["null", "string"]}},
|
64
|
+
},
|
59
65
|
id="deeply-nested-schema",
|
60
66
|
),
|
61
67
|
],
|
62
68
|
)
|
63
|
-
def test_fill_nulls(
|
69
|
+
def test_fill_nulls(
|
70
|
+
input_schema: Mapping[str, Any], expected_output: Mapping[str, Any]
|
71
|
+
) -> None:
|
64
72
|
assert DefaultFileBasedStream._fill_nulls(input_schema) == expected_output
|
65
73
|
|
66
74
|
|
@@ -90,21 +98,33 @@ class DefaultFileBasedStreamTest(unittest.TestCase):
|
|
90
98
|
parsers={MockFormat: self._parser},
|
91
99
|
validation_policy=self._validation_policy,
|
92
100
|
cursor=self._cursor,
|
101
|
+
errors_collector=FileBasedErrorsCollector(),
|
93
102
|
)
|
94
103
|
|
95
104
|
def test_when_read_records_from_slice_then_return_records(self) -> None:
|
96
105
|
self._parser.parse_records.return_value = [self._A_RECORD]
|
97
|
-
messages = list(
|
98
|
-
|
106
|
+
messages = list(
|
107
|
+
self._stream.read_records_from_slice(
|
108
|
+
{"files": [RemoteFile(uri="uri", last_modified=self._NOW)]}
|
109
|
+
)
|
110
|
+
)
|
111
|
+
assert list(map(lambda message: message.record.data["data"], messages)) == [
|
112
|
+
self._A_RECORD
|
113
|
+
]
|
99
114
|
|
100
|
-
def test_given_exception_when_read_records_from_slice_then_do_process_other_files(
|
115
|
+
def test_given_exception_when_read_records_from_slice_then_do_process_other_files(
|
116
|
+
self,
|
117
|
+
) -> None:
|
101
118
|
"""
|
102
119
|
The current behavior for source-s3 v3 does not fail sync on some errors and hence, we will keep this behaviour for now. One example
|
103
120
|
we can easily reproduce this is by having a file with gzip extension that is not actually a gzip file. The reader will fail to open
|
104
121
|
the file but the sync won't fail.
|
105
122
|
Ticket: https://github.com/airbytehq/airbyte/issues/29680
|
106
123
|
"""
|
107
|
-
self._parser.parse_records.side_effect = [
|
124
|
+
self._parser.parse_records.side_effect = [
|
125
|
+
ValueError("An error"),
|
126
|
+
[self._A_RECORD],
|
127
|
+
]
|
108
128
|
|
109
129
|
messages = list(
|
110
130
|
self._stream.read_records_from_slice(
|
@@ -120,7 +140,9 @@ class DefaultFileBasedStreamTest(unittest.TestCase):
|
|
120
140
|
assert messages[0].log.level == Level.ERROR
|
121
141
|
assert messages[1].record.data["data"] == self._A_RECORD
|
122
142
|
|
123
|
-
def test_given_traced_exception_when_read_records_from_slice_then_fail(
|
143
|
+
def test_given_traced_exception_when_read_records_from_slice_then_fail(
|
144
|
+
self,
|
145
|
+
) -> None:
|
124
146
|
"""
|
125
147
|
When a traced exception is raised, the stream shouldn't try to handle but pass it on to the caller.
|
126
148
|
"""
|
@@ -138,10 +160,14 @@ class DefaultFileBasedStreamTest(unittest.TestCase):
|
|
138
160
|
)
|
139
161
|
)
|
140
162
|
|
141
|
-
def test_given_exception_after_skipping_records_when_read_records_from_slice_then_send_warning(
|
163
|
+
def test_given_exception_after_skipping_records_when_read_records_from_slice_then_send_warning(
|
164
|
+
self,
|
165
|
+
) -> None:
|
142
166
|
self._stream_config.schemaless = False
|
143
167
|
self._validation_policy.record_passes_validation_policy.return_value = False
|
144
|
-
self._parser.parse_records.side_effect = [
|
168
|
+
self._parser.parse_records.side_effect = [
|
169
|
+
self._iter([self._A_RECORD, ValueError("An error")])
|
170
|
+
]
|
145
171
|
|
146
172
|
messages = list(
|
147
173
|
self._stream.read_records_from_slice(
|
@@ -183,3 +209,54 @@ class DefaultFileBasedStreamTest(unittest.TestCase):
|
|
183
209
|
if isinstance(item, Exception):
|
184
210
|
raise item
|
185
211
|
yield item
|
212
|
+
|
213
|
+
|
214
|
+
class TestFileBasedErrorCollector:
|
215
|
+
test_error_collector: FileBasedErrorsCollector = FileBasedErrorsCollector()
|
216
|
+
|
217
|
+
@pytest.mark.parametrize(
|
218
|
+
"stream, file, line_no, n_skipped, collector_expected_len",
|
219
|
+
(
|
220
|
+
("stream_1", "test.csv", 1, 1, 1),
|
221
|
+
("stream_2", "test2.csv", 2, 2, 2),
|
222
|
+
),
|
223
|
+
ids=[
|
224
|
+
"Single error",
|
225
|
+
"Multiple errors",
|
226
|
+
],
|
227
|
+
)
|
228
|
+
def test_collect_parsing_error(
|
229
|
+
self, stream, file, line_no, n_skipped, collector_expected_len
|
230
|
+
) -> None:
|
231
|
+
test_error_pattern = "Error parsing record."
|
232
|
+
# format the error body
|
233
|
+
test_error = (
|
234
|
+
AirbyteMessage(
|
235
|
+
type=MessageType.LOG,
|
236
|
+
log=AirbyteLogMessage(
|
237
|
+
level=Level.ERROR,
|
238
|
+
message=f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream={stream} file={file} line_no={line_no} n_skipped={n_skipped}",
|
239
|
+
stack_trace=traceback.format_exc(),
|
240
|
+
),
|
241
|
+
),
|
242
|
+
)
|
243
|
+
# collecting the error
|
244
|
+
self.test_error_collector.collect(test_error)
|
245
|
+
# check the error has been collected
|
246
|
+
assert len(self.test_error_collector.errors) == collector_expected_len
|
247
|
+
# check for the pattern presence for the collected errors
|
248
|
+
for error in self.test_error_collector.errors:
|
249
|
+
assert test_error_pattern in error[0].log.message
|
250
|
+
|
251
|
+
def test_yield_and_raise_collected(self) -> None:
|
252
|
+
# we expect the following method will raise the AirbyteTracedException
|
253
|
+
with pytest.raises(AirbyteTracedException) as parse_error:
|
254
|
+
list(self.test_error_collector.yield_and_raise_collected())
|
255
|
+
assert (
|
256
|
+
parse_error.value.message
|
257
|
+
== "Some errors occured while reading from the source."
|
258
|
+
)
|
259
|
+
assert (
|
260
|
+
parse_error.value.internal_message
|
261
|
+
== "Please check the logged errors for more information."
|
262
|
+
)
|
@@ -48,6 +48,7 @@ from unit_tests.sources.file_based.scenarios.csv_scenarios import (
|
|
48
48
|
csv_strings_can_be_null_not_quoted_scenario,
|
49
49
|
earlier_csv_scenario,
|
50
50
|
empty_schema_inference_scenario,
|
51
|
+
invalid_csv_multi_scenario,
|
51
52
|
invalid_csv_scenario,
|
52
53
|
multi_csv_scenario,
|
53
54
|
multi_csv_stream_n_file_exceeds_limit_for_inference,
|
@@ -132,6 +133,7 @@ discover_scenarios = [
|
|
132
133
|
csv_multi_stream_scenario,
|
133
134
|
csv_single_stream_scenario,
|
134
135
|
invalid_csv_scenario,
|
136
|
+
invalid_csv_multi_scenario,
|
135
137
|
single_csv_scenario,
|
136
138
|
multi_csv_scenario,
|
137
139
|
multi_csv_stream_n_file_exceeds_limit_for_inference,
|
File without changes
|
File without changes
|
File without changes
|