airbyte-cdk 0.58.5__py3-none-any.whl → 0.58.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/sources/file_based/config/avro_format.py +1 -0
- airbyte_cdk/sources/file_based/config/parquet_format.py +1 -0
- airbyte_cdk/sources/file_based/exceptions.py +26 -1
- airbyte_cdk/sources/file_based/file_based_source.py +5 -1
- airbyte_cdk/sources/file_based/file_types/avro_parser.py +15 -9
- airbyte_cdk/sources/file_based/file_types/csv_parser.py +19 -11
- airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +1 -1
- airbyte_cdk/sources/file_based/file_types/parquet_parser.py +22 -14
- airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +3 -1
- airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +8 -6
- {airbyte_cdk-0.58.5.dist-info → airbyte_cdk-0.58.7.dist-info}/METADATA +1 -1
- {airbyte_cdk-0.58.5.dist-info → airbyte_cdk-0.58.7.dist-info}/RECORD +20 -20
- unit_tests/sources/file_based/scenarios/csv_scenarios.py +110 -9
- unit_tests/sources/file_based/scenarios/unstructured_scenarios.py +4 -0
- unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py +16 -31
- unit_tests/sources/file_based/stream/test_default_file_based_stream.py +87 -10
- unit_tests/sources/file_based/test_file_based_scenarios.py +2 -0
- {airbyte_cdk-0.58.5.dist-info → airbyte_cdk-0.58.7.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-0.58.5.dist-info → airbyte_cdk-0.58.7.dist-info}/WHEEL +0 -0
- {airbyte_cdk-0.58.5.dist-info → airbyte_cdk-0.58.7.dist-info}/top_level.txt +0 -0
@@ -3,8 +3,9 @@
|
|
3
3
|
#
|
4
4
|
|
5
5
|
from enum import Enum
|
6
|
-
from typing import Union
|
6
|
+
from typing import Any, List, Union
|
7
7
|
|
8
|
+
from airbyte_cdk.models import AirbyteMessage, FailureType
|
8
9
|
from airbyte_cdk.utils import AirbyteTracedException
|
9
10
|
|
10
11
|
|
@@ -40,6 +41,30 @@ class FileBasedSourceError(Enum):
|
|
40
41
|
UNDEFINED_VALIDATION_POLICY = "The validation policy defined in the config does not exist for the source."
|
41
42
|
|
42
43
|
|
44
|
+
class FileBasedErrorsCollector:
|
45
|
+
"""
|
46
|
+
The placeholder for all errors collected.
|
47
|
+
"""
|
48
|
+
|
49
|
+
errors: List[AirbyteMessage] = []
|
50
|
+
|
51
|
+
def yield_and_raise_collected(self) -> Any:
|
52
|
+
if self.errors:
|
53
|
+
# emit collected logged messages
|
54
|
+
yield from self.errors
|
55
|
+
# clean the collector
|
56
|
+
self.errors.clear()
|
57
|
+
# raising the single exception
|
58
|
+
raise AirbyteTracedException(
|
59
|
+
internal_message="Please check the logged errors for more information.",
|
60
|
+
message="Some errors occured while reading from the source.",
|
61
|
+
failure_type=FailureType.config_error,
|
62
|
+
)
|
63
|
+
|
64
|
+
def collect(self, logged_error: AirbyteMessage) -> None:
|
65
|
+
self.errors.append(logged_error)
|
66
|
+
|
67
|
+
|
43
68
|
class BaseFileBasedSourceError(Exception):
|
44
69
|
def __init__(self, error: Union[FileBasedSourceError, str], **kwargs): # type: ignore # noqa
|
45
70
|
if isinstance(error, FileBasedSourceError):
|
@@ -14,7 +14,7 @@ from airbyte_cdk.sources.file_based.availability_strategy import AbstractFileBas
|
|
14
14
|
from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import AbstractFileBasedSpec
|
15
15
|
from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig, ValidationPolicy
|
16
16
|
from airbyte_cdk.sources.file_based.discovery_policy import AbstractDiscoveryPolicy, DefaultDiscoveryPolicy
|
17
|
-
from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError, FileBasedSourceError
|
17
|
+
from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError, FileBasedErrorsCollector, FileBasedSourceError
|
18
18
|
from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
|
19
19
|
from airbyte_cdk.sources.file_based.file_types import default_parsers
|
20
20
|
from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
|
@@ -49,6 +49,7 @@ class FileBasedSource(AbstractSource, ABC):
|
|
49
49
|
self.stream_schemas = {s.stream.name: s.stream.json_schema for s in catalog.streams} if catalog else {}
|
50
50
|
self.cursor_cls = cursor_cls
|
51
51
|
self.logger = logging.getLogger(f"airbyte.{self.name}")
|
52
|
+
self.errors_collector: FileBasedErrorsCollector = FileBasedErrorsCollector()
|
52
53
|
|
53
54
|
def check_connection(self, logger: logging.Logger, config: Mapping[str, Any]) -> Tuple[bool, Optional[Any]]:
|
54
55
|
"""
|
@@ -106,6 +107,7 @@ class FileBasedSource(AbstractSource, ABC):
|
|
106
107
|
parsers=self.parsers,
|
107
108
|
validation_policy=self._validate_and_get_validation_policy(stream_config),
|
108
109
|
cursor=self.cursor_cls(stream_config),
|
110
|
+
errors_collector=self.errors_collector,
|
109
111
|
)
|
110
112
|
)
|
111
113
|
return streams
|
@@ -121,6 +123,8 @@ class FileBasedSource(AbstractSource, ABC):
|
|
121
123
|
state: Optional[Union[List[AirbyteStateMessage], MutableMapping[str, Any]]] = None,
|
122
124
|
) -> Iterator[AirbyteMessage]:
|
123
125
|
yield from super().read(logger, config, catalog, state)
|
126
|
+
# emit all the errors collected
|
127
|
+
yield from self.errors_collector.yield_and_raise_collected()
|
124
128
|
# count streams using a certain parser
|
125
129
|
parsed_config = self._get_parsed_config(config)
|
126
130
|
for parser, count in Counter(stream.format.filetype for stream in parsed_config.streams).items():
|
@@ -8,6 +8,7 @@ from typing import Any, Dict, Iterable, Mapping, Optional, Tuple
|
|
8
8
|
import fastavro
|
9
9
|
from airbyte_cdk.sources.file_based.config.avro_format import AvroFormat
|
10
10
|
from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
|
11
|
+
from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError
|
11
12
|
from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
|
12
13
|
from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
|
13
14
|
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
|
@@ -144,15 +145,20 @@ class AvroParser(FileTypeParser):
|
|
144
145
|
if not isinstance(avro_format, AvroFormat):
|
145
146
|
raise ValueError(f"Expected ParquetFormat, got {avro_format}")
|
146
147
|
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
148
|
+
line_no = 0
|
149
|
+
try:
|
150
|
+
with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp:
|
151
|
+
avro_reader = fastavro.reader(fp)
|
152
|
+
schema = avro_reader.writer_schema
|
153
|
+
schema_field_name_to_type = {field["name"]: field["type"] for field in schema["fields"]}
|
154
|
+
for record in avro_reader:
|
155
|
+
line_no += 1
|
156
|
+
yield {
|
157
|
+
record_field: self._to_output_value(avro_format, schema_field_name_to_type[record_field], record[record_field])
|
158
|
+
for record_field, record_value in schema_field_name_to_type.items()
|
159
|
+
}
|
160
|
+
except Exception as exc:
|
161
|
+
raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri, lineno=line_no) from exc
|
156
162
|
|
157
163
|
@property
|
158
164
|
def file_read_mode(self) -> FileReadMode:
|
@@ -178,17 +178,25 @@ class CsvParser(FileTypeParser):
|
|
178
178
|
logger: logging.Logger,
|
179
179
|
discovered_schema: Optional[Mapping[str, SchemaType]],
|
180
180
|
) -> Iterable[Dict[str, Any]]:
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
181
|
+
line_no = 0
|
182
|
+
try:
|
183
|
+
config_format = _extract_format(config)
|
184
|
+
if discovered_schema:
|
185
|
+
property_types = {col: prop["type"] for col, prop in discovered_schema["properties"].items()} # type: ignore # discovered_schema["properties"] is known to be a mapping
|
186
|
+
deduped_property_types = CsvParser._pre_propcess_property_types(property_types)
|
187
|
+
else:
|
188
|
+
deduped_property_types = {}
|
189
|
+
cast_fn = CsvParser._get_cast_function(deduped_property_types, config_format, logger, config.schemaless)
|
190
|
+
data_generator = self._csv_reader.read_data(config, file, stream_reader, logger, self.file_read_mode)
|
191
|
+
for row in data_generator:
|
192
|
+
line_no += 1
|
193
|
+
yield CsvParser._to_nullable(
|
194
|
+
cast_fn(row), deduped_property_types, config_format.null_values, config_format.strings_can_be_null
|
195
|
+
)
|
196
|
+
except RecordParseError as parse_err:
|
197
|
+
raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri, lineno=line_no) from parse_err
|
198
|
+
finally:
|
199
|
+
data_generator.close()
|
192
200
|
|
193
201
|
@property
|
194
202
|
def file_read_mode(self) -> FileReadMode:
|
@@ -119,7 +119,7 @@ class JsonlParser(FileTypeParser):
|
|
119
119
|
break
|
120
120
|
|
121
121
|
if had_json_parsing_error and not yielded_at_least_once:
|
122
|
-
raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD)
|
122
|
+
raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri, lineno=line)
|
123
123
|
|
124
124
|
@staticmethod
|
125
125
|
def _instantiate_accumulator(line: Union[bytes, str]) -> Union[bytes, str]:
|
@@ -11,7 +11,7 @@ from urllib.parse import unquote
|
|
11
11
|
import pyarrow as pa
|
12
12
|
import pyarrow.parquet as pq
|
13
13
|
from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig, ParquetFormat
|
14
|
-
from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError, FileBasedSourceError
|
14
|
+
from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError, FileBasedSourceError, RecordParseError
|
15
15
|
from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
|
16
16
|
from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
|
17
17
|
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
|
@@ -64,19 +64,27 @@ class ParquetParser(FileTypeParser):
|
|
64
64
|
if not isinstance(parquet_format, ParquetFormat):
|
65
65
|
logger.info(f"Expected ParquetFormat, got {parquet_format}")
|
66
66
|
raise ConfigValidationError(FileBasedSourceError.CONFIG_VALIDATION_ERROR)
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
for
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
67
|
+
|
68
|
+
line_no = 0
|
69
|
+
try:
|
70
|
+
with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp:
|
71
|
+
reader = pq.ParquetFile(fp)
|
72
|
+
partition_columns = {x.split("=")[0]: x.split("=")[1] for x in self._extract_partitions(file.uri)}
|
73
|
+
for row_group in range(reader.num_row_groups):
|
74
|
+
batch = reader.read_row_group(row_group)
|
75
|
+
for row in range(batch.num_rows):
|
76
|
+
line_no += 1
|
77
|
+
yield {
|
78
|
+
**{
|
79
|
+
column: ParquetParser._to_output_value(batch.column(column)[row], parquet_format)
|
80
|
+
for column in batch.column_names
|
81
|
+
},
|
82
|
+
**partition_columns,
|
83
|
+
}
|
84
|
+
except Exception as exc:
|
85
|
+
raise RecordParseError(
|
86
|
+
FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri, lineno=f"{row_group=}, {line_no=}"
|
87
|
+
) from exc
|
80
88
|
|
81
89
|
@staticmethod
|
82
90
|
def _extract_partitions(filepath: str) -> List[str]:
|
@@ -10,7 +10,7 @@ from airbyte_cdk.models import SyncMode
|
|
10
10
|
from airbyte_cdk.sources.file_based.availability_strategy import AbstractFileBasedAvailabilityStrategy
|
11
11
|
from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig, PrimaryKeyType
|
12
12
|
from airbyte_cdk.sources.file_based.discovery_policy import AbstractDiscoveryPolicy
|
13
|
-
from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError, UndefinedParserError
|
13
|
+
from airbyte_cdk.sources.file_based.exceptions import FileBasedErrorsCollector, FileBasedSourceError, RecordParseError, UndefinedParserError
|
14
14
|
from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
|
15
15
|
from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
|
16
16
|
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
|
@@ -44,6 +44,7 @@ class AbstractFileBasedStream(Stream):
|
|
44
44
|
discovery_policy: AbstractDiscoveryPolicy,
|
45
45
|
parsers: Dict[Type[Any], FileTypeParser],
|
46
46
|
validation_policy: AbstractSchemaValidationPolicy,
|
47
|
+
errors_collector: FileBasedErrorsCollector,
|
47
48
|
):
|
48
49
|
super().__init__()
|
49
50
|
self.config = config
|
@@ -53,6 +54,7 @@ class AbstractFileBasedStream(Stream):
|
|
53
54
|
self._discovery_policy = discovery_policy
|
54
55
|
self._availability_strategy = availability_strategy
|
55
56
|
self._parsers = parsers
|
57
|
+
self.errors_collector = errors_collector
|
56
58
|
|
57
59
|
@property
|
58
60
|
@abstractmethod
|
@@ -112,12 +112,14 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
|
|
112
112
|
except RecordParseError:
|
113
113
|
# Increment line_no because the exception was raised before we could increment it
|
114
114
|
line_no += 1
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
115
|
+
self.errors_collector.collect(
|
116
|
+
AirbyteMessage(
|
117
|
+
type=MessageType.LOG,
|
118
|
+
log=AirbyteLogMessage(
|
119
|
+
level=Level.ERROR,
|
120
|
+
message=f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream={self.name} file={file.uri} line_no={line_no} n_skipped={n_skipped}",
|
121
|
+
stack_trace=traceback.format_exc(),
|
122
|
+
),
|
121
123
|
),
|
122
124
|
)
|
123
125
|
|
@@ -152,8 +152,8 @@ airbyte_cdk/sources/embedded/catalog.py,sha256=mIM7rO5CZAUIHKbrKwn1-Zn9_e3sLiHrT
|
|
152
152
|
airbyte_cdk/sources/embedded/runner.py,sha256=kZ0CcUANuMjdZ4fmvp_w9P2IcsS9WSHxNqYHqMwcfXI,1390
|
153
153
|
airbyte_cdk/sources/embedded/tools.py,sha256=-Z4tZ4AP1OTi_zrqFM3YV8Rt7c60wvsrv0Dc-rTZ2uw,744
|
154
154
|
airbyte_cdk/sources/file_based/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
155
|
-
airbyte_cdk/sources/file_based/exceptions.py,sha256
|
156
|
-
airbyte_cdk/sources/file_based/file_based_source.py,sha256=
|
155
|
+
airbyte_cdk/sources/file_based/exceptions.py,sha256=-SjdDk-mbkp5qQVUESkn788W8NmGtC2LROkZRKS_Dxc,5613
|
156
|
+
airbyte_cdk/sources/file_based/file_based_source.py,sha256=XddFHSiL_a-VOfQF33yXVapUG6wvHu2hd9xxYEoBcuc,8180
|
157
157
|
airbyte_cdk/sources/file_based/file_based_stream_reader.py,sha256=K9fFHcSL4E8v-X2l38wRAcZCjpyifr35orvby8vQt84,3749
|
158
158
|
airbyte_cdk/sources/file_based/remote_file.py,sha256=dtRX7X06Fug-XDz93a5lkwPQy5nQgxH0-ZcXW2HuMGI,312
|
159
159
|
airbyte_cdk/sources/file_based/schema_helpers.py,sha256=XBkOutIw_n6SNYU34qbyTbl0Ppt0i4k3sVFMSaX3wJo,9103
|
@@ -163,28 +163,28 @@ airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availab
|
|
163
163
|
airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py,sha256=TXo8kdwOQ3XiQbS3ccPtj9FghHFpiVL2JRWjen3NRXw,5289
|
164
164
|
airbyte_cdk/sources/file_based/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
165
165
|
airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py,sha256=dgOoQuoi7-7wdTMSP7wz4ENXIDT49Ew4FoAxnnplGGc,4956
|
166
|
-
airbyte_cdk/sources/file_based/config/avro_format.py,sha256=
|
166
|
+
airbyte_cdk/sources/file_based/config/avro_format.py,sha256=q1I2G9bGNy3ADds35PfWT7Mss6fjYzUtYDkUYvh5v7s,712
|
167
167
|
airbyte_cdk/sources/file_based/config/csv_format.py,sha256=L3JEgb91yrCob1oYrGl0088QEWblkOsRfDmMfWRQ0bg,7482
|
168
168
|
airbyte_cdk/sources/file_based/config/file_based_stream_config.py,sha256=l9DFyttYbxY9exwy67WzRXySEk_yKV2G_THRA_Sq1I4,4229
|
169
169
|
airbyte_cdk/sources/file_based/config/jsonl_format.py,sha256=fAPzZnoghGgHjaDvx6Qo68C8j54mBxo1NTdpwSI0VZo,374
|
170
|
-
airbyte_cdk/sources/file_based/config/parquet_format.py,sha256=
|
170
|
+
airbyte_cdk/sources/file_based/config/parquet_format.py,sha256=yKHgXYu3zJWrGfBlJ3JQZ3gVFPumF-K4rjVPNoYTUZ0,737
|
171
171
|
airbyte_cdk/sources/file_based/config/unstructured_format.py,sha256=axuIc4xaac7vTJKi8I9l7Lgn8gGu6bCuZNouQAEAvYs,3513
|
172
172
|
airbyte_cdk/sources/file_based/discovery_policy/__init__.py,sha256=x_7JsQGiS7Ytmr0ZDS0SNYGcNUzC4wCm3_1-Mf3ZFnw,283
|
173
173
|
airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py,sha256=0o_qmEO0-IojO4Ckgp4V3ackTM9Ui1sUHW5HwANueLM,621
|
174
174
|
airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py,sha256=QeZghVmf2Cq4wy_6NYcHmR6SLgdWfsGgctYg2ZsjFE4,939
|
175
175
|
airbyte_cdk/sources/file_based/file_types/__init__.py,sha256=hWSDNKCIqvi7gOyfZJezuKt6-JtVroerUVTvW3ZY-R4,1017
|
176
|
-
airbyte_cdk/sources/file_based/file_types/avro_parser.py,sha256=
|
177
|
-
airbyte_cdk/sources/file_based/file_types/csv_parser.py,sha256=
|
176
|
+
airbyte_cdk/sources/file_based/file_types/avro_parser.py,sha256=FC3L6D32SzhAv4jyS5dnsIgYmvGHgbomJpI2xRWrbZ0,9167
|
177
|
+
airbyte_cdk/sources/file_based/file_types/csv_parser.py,sha256=HqglbQ7vopMbEWC5F-PuB-4ycXgDHLHxq1sN6IGPUpE,18215
|
178
178
|
airbyte_cdk/sources/file_based/file_types/file_type_parser.py,sha256=Gbn-8v1-jLhKpJXTNOOc5PZT1Jzah6G-INCZt4snLdQ,2819
|
179
|
-
airbyte_cdk/sources/file_based/file_types/jsonl_parser.py,sha256=
|
180
|
-
airbyte_cdk/sources/file_based/file_types/parquet_parser.py,sha256=
|
179
|
+
airbyte_cdk/sources/file_based/file_types/jsonl_parser.py,sha256=MkjK_J2OqzqRPyGeQFQFADxgwqsRaNtoawB7dwKxWb0,5666
|
180
|
+
airbyte_cdk/sources/file_based/file_types/parquet_parser.py,sha256=8ZuuYnS2AzlJ-IaeBP6Pnjzu4Z2zzfBWw_x9Rt9a5Qs,9363
|
181
181
|
airbyte_cdk/sources/file_based/file_types/unstructured_parser.py,sha256=omYdo6daIHI-YWF9WsKFdFHRXTFWgJjJ3OqegiN345k,16736
|
182
182
|
airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py,sha256=sEVnRhZ8x9f7PNjo6lewxid9z0PI8eSj7gSoFC3MH1Y,527
|
183
183
|
airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py,sha256=uwk6Ugf23xKG4PRPVVRVwpcNjTwPgxejl03vLSEzK0s,604
|
184
184
|
airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py,sha256=ZeAa0z50ywMU2chNjQ7JpL4yePU1NajhBa8FS7rXLVo,1643
|
185
185
|
airbyte_cdk/sources/file_based/stream/__init__.py,sha256=QPDqdgjsabOQD93dSFqHGaFS_3pIwm-chEabZHiPJi0,265
|
186
|
-
airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py,sha256=
|
187
|
-
airbyte_cdk/sources/file_based/stream/default_file_based_stream.py,sha256=
|
186
|
+
airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py,sha256=GJu02B2_fcfaOnSBvhpRyXIEEtu4v8ubFR_vQpe-YAU,6405
|
187
|
+
airbyte_cdk/sources/file_based/stream/default_file_based_stream.py,sha256=Ou8vKnaR7yhEG9NoOLfwOHqxYBro055nB9CCBPC2I2s,12555
|
188
188
|
airbyte_cdk/sources/file_based/stream/cursor/__init__.py,sha256=MhFB5hOo8sjwvCh8gangaymdg3EJWYt_72brFOZt068,191
|
189
189
|
airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py,sha256=i-FPeK8lwCzX34GCcmvL5Yvdh8-uu7FeCVYDoFbD7IY,1920
|
190
190
|
airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py,sha256=kuJRKgDYOGXRk0V0I8BpFxg0hGv7SfV_nBpmmn45F88,6815
|
@@ -363,7 +363,7 @@ unit_tests/sources/declarative/stream_slicers/test_cartesian_product_stream_slic
|
|
363
363
|
unit_tests/sources/file_based/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
364
364
|
unit_tests/sources/file_based/helpers.py,sha256=MZTwaWtX0a6TPbFcUMP-EgqBunK2wpoElgApCEE1bN4,2659
|
365
365
|
unit_tests/sources/file_based/in_memory_files_source.py,sha256=r2yD6-_ABXG7_PIyTq4ACN21sHyg3g-Hd9dIgxfDQUk,8235
|
366
|
-
unit_tests/sources/file_based/test_file_based_scenarios.py,sha256=
|
366
|
+
unit_tests/sources/file_based/test_file_based_scenarios.py,sha256=rQaORUdsRdWxTMMshJxAxnp3x6Bsnuirit4yjrT0Oao,11680
|
367
367
|
unit_tests/sources/file_based/test_file_based_stream_reader.py,sha256=P6yTp7tbPfREzi5SXg4SSSql5nxiRV571YdOmwb_SzY,9219
|
368
368
|
unit_tests/sources/file_based/test_scenarios.py,sha256=ONBUwnX_dWOaejKiuJQgMRWgr_0NCWJpTwf4nvw_ePg,8008
|
369
369
|
unit_tests/sources/file_based/test_schema_helpers.py,sha256=IYIDdLRK41RkSG_ZW2cagAt9krV4QLbkzu6r7vPx9Js,12047
|
@@ -384,18 +384,18 @@ unit_tests/sources/file_based/file_types/test_unstructured_parser.py,sha256=kmVl
|
|
384
384
|
unit_tests/sources/file_based/scenarios/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
385
385
|
unit_tests/sources/file_based/scenarios/avro_scenarios.py,sha256=oeQUmCV7d2aTShreYc-PvVb4cWqLSsVwHfg-lcKjzPs,30554
|
386
386
|
unit_tests/sources/file_based/scenarios/check_scenarios.py,sha256=0xkt21ASTnTAMP0RYJEsF3yMGsNN7wWOoG_tmzL9PYw,6750
|
387
|
-
unit_tests/sources/file_based/scenarios/csv_scenarios.py,sha256=
|
387
|
+
unit_tests/sources/file_based/scenarios/csv_scenarios.py,sha256=4RYb8_C7sJM_6pI-cftP5fCk0i6dr4BW1lpY5iQDdN8,123795
|
388
388
|
unit_tests/sources/file_based/scenarios/file_based_source_builder.py,sha256=wgb7l5VohcEvZT82ZpJcjINSrjuJtzJZS4zuZjdKpJ4,3874
|
389
389
|
unit_tests/sources/file_based/scenarios/incremental_scenarios.py,sha256=B7YE2IbvgTH_v7DYQEuv7yn2IG15aKUvJ_7dA4d3Cg4,69413
|
390
390
|
unit_tests/sources/file_based/scenarios/jsonl_scenarios.py,sha256=LsOf-tpjWNuwskPcgAMhMpQQ3iaHaD3PjPmt2M2zSzo,31839
|
391
391
|
unit_tests/sources/file_based/scenarios/parquet_scenarios.py,sha256=MGgLCqkTJb8uNEwYZY3zbVVDZRSBKSmf2s8VMuYse_I,26549
|
392
392
|
unit_tests/sources/file_based/scenarios/scenario_builder.py,sha256=zSZtaYUflkosflxQGrTDxiJ24mhFsTJYosKyxAHgWbM,9475
|
393
|
-
unit_tests/sources/file_based/scenarios/unstructured_scenarios.py,sha256=
|
393
|
+
unit_tests/sources/file_based/scenarios/unstructured_scenarios.py,sha256=2_p15Phk2xiBgZ0OdGYrCU9eAlTT8h_SU5nk1ehUcLk,67894
|
394
394
|
unit_tests/sources/file_based/scenarios/user_input_schema_scenarios.py,sha256=FVYbRfdj2RCLFVwUNqQKiBFMm78y6FvmTO447i3SXqY,28697
|
395
|
-
unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py,sha256=
|
395
|
+
unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py,sha256=fFcNR-lzfLQ5SS8Uetbx6iFijgs_OXlqYz3Pr1OVTAI,32221
|
396
396
|
unit_tests/sources/file_based/stream/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
397
397
|
unit_tests/sources/file_based/stream/test_default_file_based_cursor.py,sha256=XhtCGvgSBFyeQwgqGciPsIB1HIlWqTcXROwnxrjutHc,13109
|
398
|
-
unit_tests/sources/file_based/stream/test_default_file_based_stream.py,sha256=
|
398
|
+
unit_tests/sources/file_based/stream/test_default_file_based_stream.py,sha256=1GZPMIL00KGMIaYcPPBhQ0gpkYAJ48xtxXOgEwxkg84,10263
|
399
399
|
unit_tests/sources/fixtures/__init__.py,sha256=ZnqYNxHsKCgO38IwB34RQyRMXTs4GTvlRi3ImKnIioo,61
|
400
400
|
unit_tests/sources/fixtures/source_test_fixture.py,sha256=dvpISgio2sOp-U3bXudH_49vY4c68sO_PMs1JZTMaj0,5502
|
401
401
|
unit_tests/sources/message/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -444,8 +444,8 @@ unit_tests/utils/test_schema_inferrer.py,sha256=Z2jHBZ540wnYkylIdV_2xr75Vtwlxuyg
|
|
444
444
|
unit_tests/utils/test_secret_utils.py,sha256=XKe0f1RHYii8iwE6ATmBr5JGDI1pzzrnZUGdUSMJQP4,4886
|
445
445
|
unit_tests/utils/test_stream_status_utils.py,sha256=Xr8MZ2HWgTVIyMbywDvuYkRaUF4RZLQOT8-JjvcfR24,2970
|
446
446
|
unit_tests/utils/test_traced_exception.py,sha256=bDFP5zMBizFenz6V2WvEZTRCKGB5ijh3DBezjbfoYIs,4198
|
447
|
-
airbyte_cdk-0.58.
|
448
|
-
airbyte_cdk-0.58.
|
449
|
-
airbyte_cdk-0.58.
|
450
|
-
airbyte_cdk-0.58.
|
451
|
-
airbyte_cdk-0.58.
|
447
|
+
airbyte_cdk-0.58.7.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
|
448
|
+
airbyte_cdk-0.58.7.dist-info/METADATA,sha256=PAw5bOce761Nqfm14qOw6Rk60S5JqPvK9d8oUnzv8vc,11073
|
449
|
+
airbyte_cdk-0.58.7.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
|
450
|
+
airbyte_cdk-0.58.7.dist-info/top_level.txt,sha256=edvsDKTnE6sD2wfCUaeTfKf5gQIL6CPVMwVL2sWZzqo,51
|
451
|
+
airbyte_cdk-0.58.7.dist-info/RECORD,,
|
@@ -852,6 +852,109 @@ invalid_csv_scenario: TestScenario[InMemoryFilesSource] = (
|
|
852
852
|
]
|
853
853
|
}
|
854
854
|
)
|
855
|
+
.set_expected_read_error(
|
856
|
+
AirbyteTracedException,
|
857
|
+
"Please check the logged errors for more information.",
|
858
|
+
)
|
859
|
+
).build()
|
860
|
+
|
861
|
+
invalid_csv_multi_scenario: TestScenario[InMemoryFilesSource] = (
|
862
|
+
TestScenarioBuilder[InMemoryFilesSource]()
|
863
|
+
.set_name("invalid_csv_multi_scenario") # too many values for the number of headers
|
864
|
+
.set_config(
|
865
|
+
{
|
866
|
+
"streams": [
|
867
|
+
{
|
868
|
+
"name": "stream1",
|
869
|
+
"format": {"filetype": "csv"},
|
870
|
+
"globs": ["*"],
|
871
|
+
"validation_policy": "Emit Record",
|
872
|
+
},
|
873
|
+
{
|
874
|
+
"name": "stream2",
|
875
|
+
"format": {"filetype": "csv"},
|
876
|
+
"globs": ["b.csv"],
|
877
|
+
"validation_policy": "Emit Record",
|
878
|
+
},
|
879
|
+
]
|
880
|
+
}
|
881
|
+
)
|
882
|
+
.set_source_builder(
|
883
|
+
FileBasedSourceBuilder()
|
884
|
+
.set_files(
|
885
|
+
{
|
886
|
+
"a.csv": {
|
887
|
+
"contents": [
|
888
|
+
("col1",),
|
889
|
+
("val11", "val12"),
|
890
|
+
("val21", "val22"),
|
891
|
+
],
|
892
|
+
"last_modified": "2023-06-05T03:54:07.000Z",
|
893
|
+
},
|
894
|
+
"b.csv": {
|
895
|
+
"contents": [
|
896
|
+
("col3",),
|
897
|
+
("val13b", "val14b"),
|
898
|
+
("val23b", "val24b"),
|
899
|
+
],
|
900
|
+
"last_modified": "2023-06-05T03:54:07.000Z",
|
901
|
+
},
|
902
|
+
}
|
903
|
+
)
|
904
|
+
.set_file_type("csv")
|
905
|
+
)
|
906
|
+
.set_expected_catalog(
|
907
|
+
{
|
908
|
+
"streams": [
|
909
|
+
{
|
910
|
+
"default_cursor_field": ["_ab_source_file_last_modified"],
|
911
|
+
"json_schema": {
|
912
|
+
"type": "object",
|
913
|
+
"properties": {
|
914
|
+
"col1": {"type": ["null", "string"]},
|
915
|
+
"col2": {"type": ["null", "string"]},
|
916
|
+
"_ab_source_file_last_modified": {"type": "string"},
|
917
|
+
"_ab_source_file_url": {"type": "string"},
|
918
|
+
},
|
919
|
+
},
|
920
|
+
"name": "stream1",
|
921
|
+
"source_defined_cursor": True,
|
922
|
+
"supported_sync_modes": ["full_refresh", "incremental"],
|
923
|
+
},
|
924
|
+
{
|
925
|
+
"json_schema": {
|
926
|
+
"type": "object",
|
927
|
+
"properties": {
|
928
|
+
"col3": {"type": ["null", "string"]},
|
929
|
+
"_ab_source_file_last_modified": {"type": "string"},
|
930
|
+
"_ab_source_file_url": {"type": "string"},
|
931
|
+
},
|
932
|
+
},
|
933
|
+
"name": "stream2",
|
934
|
+
"source_defined_cursor": True,
|
935
|
+
"default_cursor_field": ["_ab_source_file_last_modified"],
|
936
|
+
"supported_sync_modes": ["full_refresh", "incremental"],
|
937
|
+
},
|
938
|
+
]
|
939
|
+
}
|
940
|
+
)
|
941
|
+
.set_expected_records([])
|
942
|
+
.set_expected_discover_error(AirbyteTracedException, FileBasedSourceError.SCHEMA_INFERENCE_ERROR.value)
|
943
|
+
.set_expected_logs(
|
944
|
+
{
|
945
|
+
"read": [
|
946
|
+
{
|
947
|
+
"level": "ERROR",
|
948
|
+
"message": f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream=stream1 file=a.csv line_no=1 n_skipped=0",
|
949
|
+
},
|
950
|
+
{
|
951
|
+
"level": "ERROR",
|
952
|
+
"message": f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream=stream2 file=b.csv line_no=1 n_skipped=0",
|
953
|
+
},
|
954
|
+
]
|
955
|
+
}
|
956
|
+
)
|
957
|
+
.set_expected_read_error(AirbyteTracedException, "Please check the logged errors for more information.")
|
855
958
|
).build()
|
856
959
|
|
857
960
|
csv_single_stream_scenario: TestScenario[InMemoryFilesSource] = (
|
@@ -2172,17 +2275,15 @@ csv_newline_in_values_not_quoted_scenario: TestScenario[InMemoryFilesSource] = (
|
|
2172
2275
|
},
|
2173
2276
|
]
|
2174
2277
|
)
|
2175
|
-
.
|
2176
|
-
|
2177
|
-
|
2178
|
-
{
|
2179
|
-
"level": "ERROR",
|
2180
|
-
"message": "Error parsing record. This could be due to a mismatch between the config's file type and the actual file type, or because the file or record is not parseable. stream=stream1 file=a.csv line_no=2 n_skipped=0",
|
2181
|
-
}
|
2182
|
-
]
|
2183
|
-
}
|
2278
|
+
.set_expected_read_error(
|
2279
|
+
AirbyteTracedException,
|
2280
|
+
f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream=stream1 file=a.csv line_no=2 n_skipped=0",
|
2184
2281
|
)
|
2185
2282
|
.set_expected_discover_error(AirbyteTracedException, FileBasedSourceError.SCHEMA_INFERENCE_ERROR.value)
|
2283
|
+
.set_expected_read_error(
|
2284
|
+
AirbyteTracedException,
|
2285
|
+
"Please check the logged errors for more information.",
|
2286
|
+
)
|
2186
2287
|
).build()
|
2187
2288
|
|
2188
2289
|
csv_escape_char_is_set_scenario: TestScenario[InMemoryFilesSource] = (
|
@@ -231,6 +231,10 @@ unstructured_invalid_file_type_discover_scenario_no_skip = (
|
|
231
231
|
)
|
232
232
|
.set_expected_records([])
|
233
233
|
.set_expected_discover_error(AirbyteTracedException, "Error inferring schema from files")
|
234
|
+
.set_expected_read_error(
|
235
|
+
AirbyteTracedException,
|
236
|
+
"Please check the logged errors for more information.",
|
237
|
+
)
|
234
238
|
).build()
|
235
239
|
|
236
240
|
# If skip unprocessable file types is set to true, then discover will succeed even if there are non-matching file types
|
@@ -2,7 +2,8 @@
|
|
2
2
|
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
3
3
|
#
|
4
4
|
|
5
|
-
|
5
|
+
|
6
|
+
from airbyte_cdk.utils.traced_exception import AirbyteTracedException
|
6
7
|
from unit_tests.sources.file_based.scenarios.file_based_source_builder import FileBasedSourceBuilder
|
7
8
|
from unit_tests.sources.file_based.scenarios.scenario_builder import TestScenarioBuilder
|
8
9
|
|
@@ -272,6 +273,10 @@ skip_record_scenario_single_stream = (
|
|
272
273
|
]
|
273
274
|
}
|
274
275
|
)
|
276
|
+
.set_expected_read_error(
|
277
|
+
AirbyteTracedException,
|
278
|
+
"Please check the logged errors for more information.",
|
279
|
+
)
|
275
280
|
).build()
|
276
281
|
|
277
282
|
|
@@ -416,6 +421,10 @@ skip_record_scenario_multi_stream = (
|
|
416
421
|
]
|
417
422
|
}
|
418
423
|
)
|
424
|
+
.set_expected_read_error(
|
425
|
+
AirbyteTracedException,
|
426
|
+
"Please check the logged errors for more information.",
|
427
|
+
)
|
419
428
|
).build()
|
420
429
|
|
421
430
|
|
@@ -492,19 +501,9 @@ emit_record_scenario_single_stream = (
|
|
492
501
|
},
|
493
502
|
]
|
494
503
|
)
|
495
|
-
.
|
496
|
-
|
497
|
-
|
498
|
-
{
|
499
|
-
"level": "ERROR",
|
500
|
-
"message": f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream=stream1 file=c.csv line_no=2 n_skipped=0",
|
501
|
-
},
|
502
|
-
{
|
503
|
-
"level": "WARN",
|
504
|
-
"message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer",
|
505
|
-
},
|
506
|
-
]
|
507
|
-
}
|
504
|
+
.set_expected_read_error(
|
505
|
+
AirbyteTracedException,
|
506
|
+
"Please check the logged errors for more information.",
|
508
507
|
)
|
509
508
|
).build()
|
510
509
|
|
@@ -640,23 +639,9 @@ emit_record_scenario_multi_stream = (
|
|
640
639
|
},
|
641
640
|
]
|
642
641
|
)
|
643
|
-
.
|
644
|
-
|
645
|
-
|
646
|
-
{
|
647
|
-
"level": "ERROR",
|
648
|
-
"message": f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream=stream1 file=a/a3.csv line_no=2 n_skipped=0",
|
649
|
-
},
|
650
|
-
{
|
651
|
-
"level": "WARN",
|
652
|
-
"message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer",
|
653
|
-
},
|
654
|
-
{
|
655
|
-
"level": "WARN",
|
656
|
-
"message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer",
|
657
|
-
},
|
658
|
-
]
|
659
|
-
}
|
642
|
+
.set_expected_read_error(
|
643
|
+
AirbyteTracedException,
|
644
|
+
"Please check the logged errors for more information.",
|
660
645
|
)
|
661
646
|
).build()
|
662
647
|
|
@@ -2,15 +2,18 @@
|
|
2
2
|
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
3
3
|
#
|
4
4
|
|
5
|
+
import traceback
|
5
6
|
import unittest
|
6
7
|
from datetime import datetime, timezone
|
7
8
|
from typing import Any, Iterable, Iterator, Mapping
|
8
9
|
from unittest.mock import Mock
|
9
10
|
|
10
11
|
import pytest
|
11
|
-
from airbyte_cdk.models import Level
|
12
|
+
from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, Level
|
13
|
+
from airbyte_cdk.models import Type as MessageType
|
12
14
|
from airbyte_cdk.sources.file_based.availability_strategy import AbstractFileBasedAvailabilityStrategy
|
13
15
|
from airbyte_cdk.sources.file_based.discovery_policy import AbstractDiscoveryPolicy
|
16
|
+
from airbyte_cdk.sources.file_based.exceptions import FileBasedErrorsCollector, FileBasedSourceError
|
14
17
|
from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
|
15
18
|
from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
|
16
19
|
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
|
@@ -55,12 +58,17 @@ class MockFormat:
|
|
55
58
|
),
|
56
59
|
pytest.param(
|
57
60
|
{"type": "object", "properties": {"prop": {"type": "string"}}},
|
58
|
-
{
|
61
|
+
{
|
62
|
+
"type": ["null", "object"],
|
63
|
+
"properties": {"prop": {"type": ["null", "string"]}},
|
64
|
+
},
|
59
65
|
id="deeply-nested-schema",
|
60
66
|
),
|
61
67
|
],
|
62
68
|
)
|
63
|
-
def test_fill_nulls(
|
69
|
+
def test_fill_nulls(
|
70
|
+
input_schema: Mapping[str, Any], expected_output: Mapping[str, Any]
|
71
|
+
) -> None:
|
64
72
|
assert DefaultFileBasedStream._fill_nulls(input_schema) == expected_output
|
65
73
|
|
66
74
|
|
@@ -90,21 +98,33 @@ class DefaultFileBasedStreamTest(unittest.TestCase):
|
|
90
98
|
parsers={MockFormat: self._parser},
|
91
99
|
validation_policy=self._validation_policy,
|
92
100
|
cursor=self._cursor,
|
101
|
+
errors_collector=FileBasedErrorsCollector(),
|
93
102
|
)
|
94
103
|
|
95
104
|
def test_when_read_records_from_slice_then_return_records(self) -> None:
|
96
105
|
self._parser.parse_records.return_value = [self._A_RECORD]
|
97
|
-
messages = list(
|
98
|
-
|
106
|
+
messages = list(
|
107
|
+
self._stream.read_records_from_slice(
|
108
|
+
{"files": [RemoteFile(uri="uri", last_modified=self._NOW)]}
|
109
|
+
)
|
110
|
+
)
|
111
|
+
assert list(map(lambda message: message.record.data["data"], messages)) == [
|
112
|
+
self._A_RECORD
|
113
|
+
]
|
99
114
|
|
100
|
-
def test_given_exception_when_read_records_from_slice_then_do_process_other_files(
|
115
|
+
def test_given_exception_when_read_records_from_slice_then_do_process_other_files(
|
116
|
+
self,
|
117
|
+
) -> None:
|
101
118
|
"""
|
102
119
|
The current behavior for source-s3 v3 does not fail sync on some errors and hence, we will keep this behaviour for now. One example
|
103
120
|
we can easily reproduce this is by having a file with gzip extension that is not actually a gzip file. The reader will fail to open
|
104
121
|
the file but the sync won't fail.
|
105
122
|
Ticket: https://github.com/airbytehq/airbyte/issues/29680
|
106
123
|
"""
|
107
|
-
self._parser.parse_records.side_effect = [
|
124
|
+
self._parser.parse_records.side_effect = [
|
125
|
+
ValueError("An error"),
|
126
|
+
[self._A_RECORD],
|
127
|
+
]
|
108
128
|
|
109
129
|
messages = list(
|
110
130
|
self._stream.read_records_from_slice(
|
@@ -120,7 +140,9 @@ class DefaultFileBasedStreamTest(unittest.TestCase):
|
|
120
140
|
assert messages[0].log.level == Level.ERROR
|
121
141
|
assert messages[1].record.data["data"] == self._A_RECORD
|
122
142
|
|
123
|
-
def test_given_traced_exception_when_read_records_from_slice_then_fail(
|
143
|
+
def test_given_traced_exception_when_read_records_from_slice_then_fail(
|
144
|
+
self,
|
145
|
+
) -> None:
|
124
146
|
"""
|
125
147
|
When a traced exception is raised, the stream shouldn't try to handle but pass it on to the caller.
|
126
148
|
"""
|
@@ -138,10 +160,14 @@ class DefaultFileBasedStreamTest(unittest.TestCase):
|
|
138
160
|
)
|
139
161
|
)
|
140
162
|
|
141
|
-
def test_given_exception_after_skipping_records_when_read_records_from_slice_then_send_warning(
|
163
|
+
def test_given_exception_after_skipping_records_when_read_records_from_slice_then_send_warning(
|
164
|
+
self,
|
165
|
+
) -> None:
|
142
166
|
self._stream_config.schemaless = False
|
143
167
|
self._validation_policy.record_passes_validation_policy.return_value = False
|
144
|
-
self._parser.parse_records.side_effect = [
|
168
|
+
self._parser.parse_records.side_effect = [
|
169
|
+
self._iter([self._A_RECORD, ValueError("An error")])
|
170
|
+
]
|
145
171
|
|
146
172
|
messages = list(
|
147
173
|
self._stream.read_records_from_slice(
|
@@ -183,3 +209,54 @@ class DefaultFileBasedStreamTest(unittest.TestCase):
|
|
183
209
|
if isinstance(item, Exception):
|
184
210
|
raise item
|
185
211
|
yield item
|
212
|
+
|
213
|
+
|
214
|
+
class TestFileBasedErrorCollector:
|
215
|
+
test_error_collector: FileBasedErrorsCollector = FileBasedErrorsCollector()
|
216
|
+
|
217
|
+
@pytest.mark.parametrize(
|
218
|
+
"stream, file, line_no, n_skipped, collector_expected_len",
|
219
|
+
(
|
220
|
+
("stream_1", "test.csv", 1, 1, 1),
|
221
|
+
("stream_2", "test2.csv", 2, 2, 2),
|
222
|
+
),
|
223
|
+
ids=[
|
224
|
+
"Single error",
|
225
|
+
"Multiple errors",
|
226
|
+
],
|
227
|
+
)
|
228
|
+
def test_collect_parsing_error(
|
229
|
+
self, stream, file, line_no, n_skipped, collector_expected_len
|
230
|
+
) -> None:
|
231
|
+
test_error_pattern = "Error parsing record."
|
232
|
+
# format the error body
|
233
|
+
test_error = (
|
234
|
+
AirbyteMessage(
|
235
|
+
type=MessageType.LOG,
|
236
|
+
log=AirbyteLogMessage(
|
237
|
+
level=Level.ERROR,
|
238
|
+
message=f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream={stream} file={file} line_no={line_no} n_skipped={n_skipped}",
|
239
|
+
stack_trace=traceback.format_exc(),
|
240
|
+
),
|
241
|
+
),
|
242
|
+
)
|
243
|
+
# collecting the error
|
244
|
+
self.test_error_collector.collect(test_error)
|
245
|
+
# check the error has been collected
|
246
|
+
assert len(self.test_error_collector.errors) == collector_expected_len
|
247
|
+
# check that the pattern is present in each of the collected errors
|
248
|
+
for error in self.test_error_collector.errors:
|
249
|
+
assert test_error_pattern in error[0].log.message
|
250
|
+
|
251
|
+
def test_yield_and_raise_collected(self) -> None:
|
252
|
+
# we expect the following method will raise the AirbyteTracedException
|
253
|
+
with pytest.raises(AirbyteTracedException) as parse_error:
|
254
|
+
list(self.test_error_collector.yield_and_raise_collected())
|
255
|
+
assert (
|
256
|
+
parse_error.value.message
|
257
|
+
== "Some errors occured while reading from the source."
|
258
|
+
)
|
259
|
+
assert (
|
260
|
+
parse_error.value.internal_message
|
261
|
+
== "Please check the logged errors for more information."
|
262
|
+
)
|
@@ -48,6 +48,7 @@ from unit_tests.sources.file_based.scenarios.csv_scenarios import (
|
|
48
48
|
csv_strings_can_be_null_not_quoted_scenario,
|
49
49
|
earlier_csv_scenario,
|
50
50
|
empty_schema_inference_scenario,
|
51
|
+
invalid_csv_multi_scenario,
|
51
52
|
invalid_csv_scenario,
|
52
53
|
multi_csv_scenario,
|
53
54
|
multi_csv_stream_n_file_exceeds_limit_for_inference,
|
@@ -132,6 +133,7 @@ discover_scenarios = [
|
|
132
133
|
csv_multi_stream_scenario,
|
133
134
|
csv_single_stream_scenario,
|
134
135
|
invalid_csv_scenario,
|
136
|
+
invalid_csv_multi_scenario,
|
135
137
|
single_csv_scenario,
|
136
138
|
multi_csv_scenario,
|
137
139
|
multi_csv_stream_n_file_exceeds_limit_for_inference,
|
File without changes
|
File without changes
|
File without changes
|