airbyte-cdk 0.58.5__py3-none-any.whl → 0.58.7__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
@@ -2,6 +2,7 @@
 # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
 #
 
+
 from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
 from pydantic import BaseModel, Field
 
@@ -2,6 +2,7 @@
 # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
 #
 
+
 from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
 from pydantic import BaseModel, Field
 
@@ -3,8 +3,9 @@
 #
 
 from enum import Enum
-from typing import Union
+from typing import Any, List, Union
 
+from airbyte_cdk.models import AirbyteMessage, FailureType
 from airbyte_cdk.utils import AirbyteTracedException
 
 
@@ -40,6 +41,30 @@ class FileBasedSourceError(Enum):
     UNDEFINED_VALIDATION_POLICY = "The validation policy defined in the config does not exist for the source."
 
 
+class FileBasedErrorsCollector:
+    """
+    The placeholder for all errors collected.
+    """
+
+    errors: List[AirbyteMessage] = []
+
+    def yield_and_raise_collected(self) -> Any:
+        if self.errors:
+            # emit collected logged messages
+            yield from self.errors
+            # clean the collector
+            self.errors.clear()
+            # raising the single exception
+            raise AirbyteTracedException(
+                internal_message="Please check the logged errors for more information.",
+                message="Some errors occured while reading from the source.",
+                failure_type=FailureType.config_error,
+            )
+
+    def collect(self, logged_error: AirbyteMessage) -> None:
+        self.errors.append(logged_error)
+
+
 class BaseFileBasedSourceError(Exception):
     def __init__(self, error: Union[FileBasedSourceError, str], **kwargs):  # type: ignore # noqa
         if isinstance(error, FileBasedSourceError):
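The collector added above is the centerpiece of this release: instead of failing a sync on the first unparseable record, the file-based components buffer ERROR log messages and surface them together once reading is finished. A minimal sketch of the contract, driven the way the hunks below wire it up (the message text is illustrative, not a real parser failure):

from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, Level
from airbyte_cdk.models import Type as MessageType
from airbyte_cdk.sources.file_based.exceptions import FileBasedErrorsCollector
from airbyte_cdk.utils.traced_exception import AirbyteTracedException

collector = FileBasedErrorsCollector()

# During a sync, parse failures are handed to the collector as LOG messages
# instead of being yielded inline (see the DefaultFileBasedStream hunk below).
collector.collect(
    AirbyteMessage(
        type=MessageType.LOG,
        log=AirbyteLogMessage(level=Level.ERROR, message="Error parsing record. file=a.csv"),
    )
)

# At the end of read(), the buffered messages are emitted first; the generator
# then clears its buffer and raises a single config_error traced exception.
try:
    for message in collector.yield_and_raise_collected():
        print(message.log.message)
except AirbyteTracedException as exc:
    print(exc.internal_message)  # "Please check the logged errors for more information."

Note that `errors` is declared as a class attribute, so its list is shared by every collector instance; the new TestFileBasedErrorCollector tests at the bottom of this diff rely on that persistence across parametrized runs.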
@@ -14,7 +14,7 @@ from airbyte_cdk.sources.file_based.availability_strategy import AbstractFileBasedAvailabilityStrategy
 from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import AbstractFileBasedSpec
 from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig, ValidationPolicy
 from airbyte_cdk.sources.file_based.discovery_policy import AbstractDiscoveryPolicy, DefaultDiscoveryPolicy
-from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError, FileBasedSourceError
+from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError, FileBasedErrorsCollector, FileBasedSourceError
 from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
 from airbyte_cdk.sources.file_based.file_types import default_parsers
 from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
@@ -49,6 +49,7 @@ class FileBasedSource(AbstractSource, ABC):
         self.stream_schemas = {s.stream.name: s.stream.json_schema for s in catalog.streams} if catalog else {}
         self.cursor_cls = cursor_cls
         self.logger = logging.getLogger(f"airbyte.{self.name}")
+        self.errors_collector: FileBasedErrorsCollector = FileBasedErrorsCollector()
 
     def check_connection(self, logger: logging.Logger, config: Mapping[str, Any]) -> Tuple[bool, Optional[Any]]:
         """
@@ -106,6 +107,7 @@
                         parsers=self.parsers,
                         validation_policy=self._validate_and_get_validation_policy(stream_config),
                         cursor=self.cursor_cls(stream_config),
+                        errors_collector=self.errors_collector,
                     )
                 )
             return streams
@@ -121,6 +123,8 @@
         state: Optional[Union[List[AirbyteStateMessage], MutableMapping[str, Any]]] = None,
     ) -> Iterator[AirbyteMessage]:
         yield from super().read(logger, config, catalog, state)
+        # emit all the errors collected
+        yield from self.errors_collector.yield_and_raise_collected()
         # count streams using a certain parser
         parsed_config = self._get_parsed_config(config)
         for parser, count in Counter(stream.format.filetype for stream in parsed_config.streams).items():
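With this override, a sync that hit parse errors now emits every buffered ERROR log after the regular records and only then fails. The mechanism is a generator that drains its buffer before raising; the pattern in isolation (a hypothetical standalone sketch, not CDK code):

from typing import Iterator, List

def drain_then_fail(buffered: List[str]) -> Iterator[str]:
    # Illustrates yield_and_raise_collected(): emit everything, reset the
    # buffer, then raise once so the caller sees a single failure at the end.
    if buffered:
        yield from buffered
        buffered.clear()
        raise RuntimeError("Some errors occurred while reading from the source.")

logs: List[str] = ["error in a.csv", "error in b.csv"]
emitted = []
try:
    for line in drain_then_fail(logs):
        emitted.append(line)
except RuntimeError:
    pass
assert emitted == ["error in a.csv", "error in b.csv"] and logs == []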
@@ -8,6 +8,7 @@ from typing import Any, Dict, Iterable, Mapping, Optional, Tuple
 import fastavro
 from airbyte_cdk.sources.file_based.config.avro_format import AvroFormat
 from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
+from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError
 from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
 from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
@@ -144,15 +145,20 @@ class AvroParser(FileTypeParser):
         if not isinstance(avro_format, AvroFormat):
             raise ValueError(f"Expected ParquetFormat, got {avro_format}")
 
-        with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp:
-            avro_reader = fastavro.reader(fp)
-            schema = avro_reader.writer_schema
-            schema_field_name_to_type = {field["name"]: field["type"] for field in schema["fields"]}
-            for record in avro_reader:
-                yield {
-                    record_field: self._to_output_value(avro_format, schema_field_name_to_type[record_field], record[record_field])
-                    for record_field, record_value in schema_field_name_to_type.items()
-                }
+        line_no = 0
+        try:
+            with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp:
+                avro_reader = fastavro.reader(fp)
+                schema = avro_reader.writer_schema
+                schema_field_name_to_type = {field["name"]: field["type"] for field in schema["fields"]}
+                for record in avro_reader:
+                    line_no += 1
+                    yield {
+                        record_field: self._to_output_value(avro_format, schema_field_name_to_type[record_field], record[record_field])
+                        for record_field, record_value in schema_field_name_to_type.items()
+                    }
+        except Exception as exc:
+            raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri, lineno=line_no) from exc
 
     @property
     def file_read_mode(self) -> FileReadMode:
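This Avro change establishes the pattern repeated for CSV and Parquet below: count records as they are yielded so a failure can be reported with a position, then re-raise as a RecordParseError carrying the file URI, chained with `from exc` to keep the original traceback. One asymmetry worth noting: the Avro and Parquet hunks catch any Exception, while the CSV hunk only re-wraps a RecordParseError and adds a try/finally so the underlying generator is always closed. The shared shape, reduced to a hypothetical sketch (parser internals are placeholders):

from typing import Any, Iterable, Iterator

def parse_with_position(rows: Iterable[Any], uri: str) -> Iterator[Any]:
    line_no = 0
    try:
        for row in rows:
            line_no += 1  # counted before the yield, as in the hunks above
            yield row
    except Exception as exc:
        # on failure, line_no is the last record the parser reached
        raise ValueError(f"Error parsing record. filename={uri} lineno={line_no}") from exc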
@@ -178,17 +178,25 @@ class CsvParser(FileTypeParser):
         logger: logging.Logger,
         discovered_schema: Optional[Mapping[str, SchemaType]],
     ) -> Iterable[Dict[str, Any]]:
-        config_format = _extract_format(config)
-        if discovered_schema:
-            property_types = {col: prop["type"] for col, prop in discovered_schema["properties"].items()}  # type: ignore # discovered_schema["properties"] is known to be a mapping
-            deduped_property_types = CsvParser._pre_propcess_property_types(property_types)
-        else:
-            deduped_property_types = {}
-        cast_fn = CsvParser._get_cast_function(deduped_property_types, config_format, logger, config.schemaless)
-        data_generator = self._csv_reader.read_data(config, file, stream_reader, logger, self.file_read_mode)
-        for row in data_generator:
-            yield CsvParser._to_nullable(cast_fn(row), deduped_property_types, config_format.null_values, config_format.strings_can_be_null)
-        data_generator.close()
+        line_no = 0
+        try:
+            config_format = _extract_format(config)
+            if discovered_schema:
+                property_types = {col: prop["type"] for col, prop in discovered_schema["properties"].items()}  # type: ignore # discovered_schema["properties"] is known to be a mapping
+                deduped_property_types = CsvParser._pre_propcess_property_types(property_types)
+            else:
+                deduped_property_types = {}
+            cast_fn = CsvParser._get_cast_function(deduped_property_types, config_format, logger, config.schemaless)
+            data_generator = self._csv_reader.read_data(config, file, stream_reader, logger, self.file_read_mode)
+            for row in data_generator:
+                line_no += 1
+                yield CsvParser._to_nullable(
+                    cast_fn(row), deduped_property_types, config_format.null_values, config_format.strings_can_be_null
+                )
+        except RecordParseError as parse_err:
+            raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri, lineno=line_no) from parse_err
+        finally:
+            data_generator.close()
 
     @property
     def file_read_mode(self) -> FileReadMode:
@@ -119,7 +119,7 @@ class JsonlParser(FileTypeParser):
                 break
 
         if had_json_parsing_error and not yielded_at_least_once:
-            raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD)
+            raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri, lineno=line)
 
     @staticmethod
     def _instantiate_accumulator(line: Union[bytes, str]) -> Union[bytes, str]:
@@ -11,7 +11,7 @@ from urllib.parse import unquote
 import pyarrow as pa
 import pyarrow.parquet as pq
 from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig, ParquetFormat
-from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError, FileBasedSourceError
+from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError, FileBasedSourceError, RecordParseError
 from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
 from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
@@ -64,19 +64,27 @@ class ParquetParser(FileTypeParser):
         if not isinstance(parquet_format, ParquetFormat):
             logger.info(f"Expected ParquetFormat, got {parquet_format}")
             raise ConfigValidationError(FileBasedSourceError.CONFIG_VALIDATION_ERROR)
-        with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp:
-            reader = pq.ParquetFile(fp)
-            partition_columns = {x.split("=")[0]: x.split("=")[1] for x in self._extract_partitions(file.uri)}
-            for row_group in range(reader.num_row_groups):
-                batch = reader.read_row_group(row_group)
-                for row in range(batch.num_rows):
-                    yield {
-                        **{
-                            column: ParquetParser._to_output_value(batch.column(column)[row], parquet_format)
-                            for column in batch.column_names
-                        },
-                        **partition_columns,
-                    }
+
+        line_no = 0
+        try:
+            with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp:
+                reader = pq.ParquetFile(fp)
+                partition_columns = {x.split("=")[0]: x.split("=")[1] for x in self._extract_partitions(file.uri)}
+                for row_group in range(reader.num_row_groups):
+                    batch = reader.read_row_group(row_group)
+                    for row in range(batch.num_rows):
+                        line_no += 1
+                        yield {
+                            **{
+                                column: ParquetParser._to_output_value(batch.column(column)[row], parquet_format)
+                                for column in batch.column_names
+                            },
+                            **partition_columns,
+                        }
+        except Exception as exc:
+            raise RecordParseError(
+                FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri, lineno=f"{row_group=}, {line_no=}"
+            ) from exc
 
     @staticmethod
     def _extract_partitions(filepath: str) -> List[str]:
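For Parquet the position is reported as the row group plus the running record counter, rendered with the f-string `=` debug specifier (Python 3.8+), so `lineno` here ends up a formatted string rather than an int:

row_group, line_no = 2, 17
print(f"{row_group=}, {line_no=}")  # prints: row_group=2, line_no=17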
@@ -10,7 +10,7 @@ from airbyte_cdk.models import SyncMode
 from airbyte_cdk.sources.file_based.availability_strategy import AbstractFileBasedAvailabilityStrategy
 from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig, PrimaryKeyType
 from airbyte_cdk.sources.file_based.discovery_policy import AbstractDiscoveryPolicy
-from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError, UndefinedParserError
+from airbyte_cdk.sources.file_based.exceptions import FileBasedErrorsCollector, FileBasedSourceError, RecordParseError, UndefinedParserError
 from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
 from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
@@ -44,6 +44,7 @@ class AbstractFileBasedStream(Stream):
         discovery_policy: AbstractDiscoveryPolicy,
         parsers: Dict[Type[Any], FileTypeParser],
         validation_policy: AbstractSchemaValidationPolicy,
+        errors_collector: FileBasedErrorsCollector,
     ):
         super().__init__()
         self.config = config
@@ -53,6 +54,7 @@ class AbstractFileBasedStream(Stream):
         self._discovery_policy = discovery_policy
         self._availability_strategy = availability_strategy
         self._parsers = parsers
+        self.errors_collector = errors_collector
 
     @property
     @abstractmethod
@@ -112,12 +112,14 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
             except RecordParseError:
                 # Increment line_no because the exception was raised before we could increment it
                 line_no += 1
-                yield AirbyteMessage(
-                    type=MessageType.LOG,
-                    log=AirbyteLogMessage(
-                        level=Level.ERROR,
-                        message=f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream={self.name} file={file.uri} line_no={line_no} n_skipped={n_skipped}",
-                        stack_trace=traceback.format_exc(),
+                self.errors_collector.collect(
+                    AirbyteMessage(
+                        type=MessageType.LOG,
+                        log=AirbyteLogMessage(
+                            level=Level.ERROR,
+                            message=f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream={self.name} file={file.uri} line_no={line_no} n_skipped={n_skipped}",
+                            stack_trace=traceback.format_exc(),
+                        ),
                     ),
                 )
 
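Together with the read() override earlier, this is the behavioral heart of the release: DefaultFileBasedStream no longer yields the ERROR log inline with the record stream but hands it to the shared collector, so the log surfaces, and the sync fails, only after all streams have been read. The scenario updates below mirror that: per-record set_expected_logs expectations are replaced with a single expected AirbyteTracedException at the end of the read.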
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: airbyte-cdk
-Version: 0.58.5
+Version: 0.58.7
 Summary: A framework for writing Airbyte Connectors.
 Home-page: https://github.com/airbytehq/airbyte
 Author: Airbyte
@@ -152,8 +152,8 @@ airbyte_cdk/sources/embedded/catalog.py,sha256=mIM7rO5CZAUIHKbrKwn1-Zn9_e3sLiHrT
 airbyte_cdk/sources/embedded/runner.py,sha256=kZ0CcUANuMjdZ4fmvp_w9P2IcsS9WSHxNqYHqMwcfXI,1390
 airbyte_cdk/sources/embedded/tools.py,sha256=-Z4tZ4AP1OTi_zrqFM3YV8Rt7c60wvsrv0Dc-rTZ2uw,744
 airbyte_cdk/sources/file_based/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-airbyte_cdk/sources/file_based/exceptions.py,sha256=z9JBEEhGyM1ev9P6MjjjigtyuBm3OaOl3lIhkOQf8lQ,4765
-airbyte_cdk/sources/file_based/file_based_source.py,sha256=2kguVKlTFg9vSE-eNZeVj4-VXElz3OuhJZrWrIdp2HE,7896
+airbyte_cdk/sources/file_based/exceptions.py,sha256=-SjdDk-mbkp5qQVUESkn788W8NmGtC2LROkZRKS_Dxc,5613
+airbyte_cdk/sources/file_based/file_based_source.py,sha256=XddFHSiL_a-VOfQF33yXVapUG6wvHu2hd9xxYEoBcuc,8180
 airbyte_cdk/sources/file_based/file_based_stream_reader.py,sha256=K9fFHcSL4E8v-X2l38wRAcZCjpyifr35orvby8vQt84,3749
 airbyte_cdk/sources/file_based/remote_file.py,sha256=dtRX7X06Fug-XDz93a5lkwPQy5nQgxH0-ZcXW2HuMGI,312
 airbyte_cdk/sources/file_based/schema_helpers.py,sha256=XBkOutIw_n6SNYU34qbyTbl0Ppt0i4k3sVFMSaX3wJo,9103
@@ -163,28 +163,28 @@ airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availab
 airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py,sha256=TXo8kdwOQ3XiQbS3ccPtj9FghHFpiVL2JRWjen3NRXw,5289
 airbyte_cdk/sources/file_based/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py,sha256=dgOoQuoi7-7wdTMSP7wz4ENXIDT49Ew4FoAxnnplGGc,4956
-airbyte_cdk/sources/file_based/config/avro_format.py,sha256=lQSEq5JZY0M5y9mW93R4EjrIb8brYXUgrXCY-6EMHww,711
+airbyte_cdk/sources/file_based/config/avro_format.py,sha256=q1I2G9bGNy3ADds35PfWT7Mss6fjYzUtYDkUYvh5v7s,712
 airbyte_cdk/sources/file_based/config/csv_format.py,sha256=L3JEgb91yrCob1oYrGl0088QEWblkOsRfDmMfWRQ0bg,7482
 airbyte_cdk/sources/file_based/config/file_based_stream_config.py,sha256=l9DFyttYbxY9exwy67WzRXySEk_yKV2G_THRA_Sq1I4,4229
 airbyte_cdk/sources/file_based/config/jsonl_format.py,sha256=fAPzZnoghGgHjaDvx6Qo68C8j54mBxo1NTdpwSI0VZo,374
-airbyte_cdk/sources/file_based/config/parquet_format.py,sha256=8GTDTQyvS7pWLVG0LWirHVE1snHd0Au5R4Ym33-ezEg,736
+airbyte_cdk/sources/file_based/config/parquet_format.py,sha256=yKHgXYu3zJWrGfBlJ3JQZ3gVFPumF-K4rjVPNoYTUZ0,737
 airbyte_cdk/sources/file_based/config/unstructured_format.py,sha256=axuIc4xaac7vTJKi8I9l7Lgn8gGu6bCuZNouQAEAvYs,3513
 airbyte_cdk/sources/file_based/discovery_policy/__init__.py,sha256=x_7JsQGiS7Ytmr0ZDS0SNYGcNUzC4wCm3_1-Mf3ZFnw,283
 airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py,sha256=0o_qmEO0-IojO4Ckgp4V3ackTM9Ui1sUHW5HwANueLM,621
 airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py,sha256=QeZghVmf2Cq4wy_6NYcHmR6SLgdWfsGgctYg2ZsjFE4,939
 airbyte_cdk/sources/file_based/file_types/__init__.py,sha256=hWSDNKCIqvi7gOyfZJezuKt6-JtVroerUVTvW3ZY-R4,1017
-airbyte_cdk/sources/file_based/file_types/avro_parser.py,sha256=gCyPJc4khkar4sdfBd-RU3CuV_k7nnsNM080tjwDOiw,8817
-airbyte_cdk/sources/file_based/file_types/csv_parser.py,sha256=VS2Ld9rfm4tLkwNZ3fKsZbFQSbo83EGNSuqZpqjTg_c,17880
+airbyte_cdk/sources/file_based/file_types/avro_parser.py,sha256=FC3L6D32SzhAv4jyS5dnsIgYmvGHgbomJpI2xRWrbZ0,9167
+airbyte_cdk/sources/file_based/file_types/csv_parser.py,sha256=HqglbQ7vopMbEWC5F-PuB-4ycXgDHLHxq1sN6IGPUpE,18215
 airbyte_cdk/sources/file_based/file_types/file_type_parser.py,sha256=Gbn-8v1-jLhKpJXTNOOc5PZT1Jzah6G-INCZt4snLdQ,2819
-airbyte_cdk/sources/file_based/file_types/jsonl_parser.py,sha256=S7OtfRRvQ8P6YbZVdJ8h7mw1hnWQUVSHR9Jy12U1Yy0,5634
-airbyte_cdk/sources/file_based/file_types/parquet_parser.py,sha256=Jq_-WSbyueVwyLYrrGafXhvcA1LDOeps0A_uBhStOHI,9017
+airbyte_cdk/sources/file_based/file_types/jsonl_parser.py,sha256=MkjK_J2OqzqRPyGeQFQFADxgwqsRaNtoawB7dwKxWb0,5666
+airbyte_cdk/sources/file_based/file_types/parquet_parser.py,sha256=8ZuuYnS2AzlJ-IaeBP6Pnjzu4Z2zzfBWw_x9Rt9a5Qs,9363
 airbyte_cdk/sources/file_based/file_types/unstructured_parser.py,sha256=omYdo6daIHI-YWF9WsKFdFHRXTFWgJjJ3OqegiN345k,16736
 airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py,sha256=sEVnRhZ8x9f7PNjo6lewxid9z0PI8eSj7gSoFC3MH1Y,527
 airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py,sha256=uwk6Ugf23xKG4PRPVVRVwpcNjTwPgxejl03vLSEzK0s,604
 airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py,sha256=ZeAa0z50ywMU2chNjQ7JpL4yePU1NajhBa8FS7rXLVo,1643
 airbyte_cdk/sources/file_based/stream/__init__.py,sha256=QPDqdgjsabOQD93dSFqHGaFS_3pIwm-chEabZHiPJi0,265
-airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py,sha256=0Z343kciFr5Y66SiwzIcxT6eKG2rMQtLHgLX-vpUVa4,6278
-airbyte_cdk/sources/file_based/stream/default_file_based_stream.py,sha256=VBVZeHt4MP_PWHE_Z6rQatiOUWu-HIRoqo2EcmvV_6E,12463
+airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py,sha256=GJu02B2_fcfaOnSBvhpRyXIEEtu4v8ubFR_vQpe-YAU,6405
+airbyte_cdk/sources/file_based/stream/default_file_based_stream.py,sha256=Ou8vKnaR7yhEG9NoOLfwOHqxYBro055nB9CCBPC2I2s,12555
 airbyte_cdk/sources/file_based/stream/cursor/__init__.py,sha256=MhFB5hOo8sjwvCh8gangaymdg3EJWYt_72brFOZt068,191
 airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py,sha256=i-FPeK8lwCzX34GCcmvL5Yvdh8-uu7FeCVYDoFbD7IY,1920
 airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py,sha256=kuJRKgDYOGXRk0V0I8BpFxg0hGv7SfV_nBpmmn45F88,6815
@@ -363,7 +363,7 @@ unit_tests/sources/declarative/stream_slicers/test_cartesian_product_stream_slic
 unit_tests/sources/file_based/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unit_tests/sources/file_based/helpers.py,sha256=MZTwaWtX0a6TPbFcUMP-EgqBunK2wpoElgApCEE1bN4,2659
 unit_tests/sources/file_based/in_memory_files_source.py,sha256=r2yD6-_ABXG7_PIyTq4ACN21sHyg3g-Hd9dIgxfDQUk,8235
-unit_tests/sources/file_based/test_file_based_scenarios.py,sha256=iwXCqnFGxfHh3l48wXtlD-x74rbZYcA94XXnBrcrrKQ,11616
+unit_tests/sources/file_based/test_file_based_scenarios.py,sha256=rQaORUdsRdWxTMMshJxAxnp3x6Bsnuirit4yjrT0Oao,11680
 unit_tests/sources/file_based/test_file_based_stream_reader.py,sha256=P6yTp7tbPfREzi5SXg4SSSql5nxiRV571YdOmwb_SzY,9219
 unit_tests/sources/file_based/test_scenarios.py,sha256=ONBUwnX_dWOaejKiuJQgMRWgr_0NCWJpTwf4nvw_ePg,8008
 unit_tests/sources/file_based/test_schema_helpers.py,sha256=IYIDdLRK41RkSG_ZW2cagAt9krV4QLbkzu6r7vPx9Js,12047
@@ -384,18 +384,18 @@ unit_tests/sources/file_based/file_types/test_unstructured_parser.py,sha256=kmVl
 unit_tests/sources/file_based/scenarios/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unit_tests/sources/file_based/scenarios/avro_scenarios.py,sha256=oeQUmCV7d2aTShreYc-PvVb4cWqLSsVwHfg-lcKjzPs,30554
 unit_tests/sources/file_based/scenarios/check_scenarios.py,sha256=0xkt21ASTnTAMP0RYJEsF3yMGsNN7wWOoG_tmzL9PYw,6750
-unit_tests/sources/file_based/scenarios/csv_scenarios.py,sha256=4mh9nllsdsi4NCOr8q0ZRZatFUz3Zf5etVcwVE_mjbM,120032
+unit_tests/sources/file_based/scenarios/csv_scenarios.py,sha256=4RYb8_C7sJM_6pI-cftP5fCk0i6dr4BW1lpY5iQDdN8,123795
 unit_tests/sources/file_based/scenarios/file_based_source_builder.py,sha256=wgb7l5VohcEvZT82ZpJcjINSrjuJtzJZS4zuZjdKpJ4,3874
 unit_tests/sources/file_based/scenarios/incremental_scenarios.py,sha256=B7YE2IbvgTH_v7DYQEuv7yn2IG15aKUvJ_7dA4d3Cg4,69413
 unit_tests/sources/file_based/scenarios/jsonl_scenarios.py,sha256=LsOf-tpjWNuwskPcgAMhMpQQ3iaHaD3PjPmt2M2zSzo,31839
 unit_tests/sources/file_based/scenarios/parquet_scenarios.py,sha256=MGgLCqkTJb8uNEwYZY3zbVVDZRSBKSmf2s8VMuYse_I,26549
 unit_tests/sources/file_based/scenarios/scenario_builder.py,sha256=zSZtaYUflkosflxQGrTDxiJ24mhFsTJYosKyxAHgWbM,9475
-unit_tests/sources/file_based/scenarios/unstructured_scenarios.py,sha256=04exiS9j6kPyHLUUMgQLGfJHmlD1T63bixANhnUDdzk,67762
+unit_tests/sources/file_based/scenarios/unstructured_scenarios.py,sha256=2_p15Phk2xiBgZ0OdGYrCU9eAlTT8h_SU5nk1ehUcLk,67894
 unit_tests/sources/file_based/scenarios/user_input_schema_scenarios.py,sha256=FVYbRfdj2RCLFVwUNqQKiBFMm78y6FvmTO447i3SXqY,28697
-unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py,sha256=Try0knJN5wfoGNO38QGoLGIcqSceSAQsUWO42CusNYI,33005
+unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py,sha256=fFcNR-lzfLQ5SS8Uetbx6iFijgs_OXlqYz3Pr1OVTAI,32221
 unit_tests/sources/file_based/stream/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unit_tests/sources/file_based/stream/test_default_file_based_cursor.py,sha256=XhtCGvgSBFyeQwgqGciPsIB1HIlWqTcXROwnxrjutHc,13109
-unit_tests/sources/file_based/stream/test_default_file_based_stream.py,sha256=3zupeqwYyAb2EP4jn_8zbdu6_gTa1HlOAu6Rh0lxStM,7786
+unit_tests/sources/file_based/stream/test_default_file_based_stream.py,sha256=1GZPMIL00KGMIaYcPPBhQ0gpkYAJ48xtxXOgEwxkg84,10263
 unit_tests/sources/fixtures/__init__.py,sha256=ZnqYNxHsKCgO38IwB34RQyRMXTs4GTvlRi3ImKnIioo,61
 unit_tests/sources/fixtures/source_test_fixture.py,sha256=dvpISgio2sOp-U3bXudH_49vY4c68sO_PMs1JZTMaj0,5502
 unit_tests/sources/message/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -444,8 +444,8 @@ unit_tests/utils/test_schema_inferrer.py,sha256=Z2jHBZ540wnYkylIdV_2xr75Vtwlxuyg
 unit_tests/utils/test_secret_utils.py,sha256=XKe0f1RHYii8iwE6ATmBr5JGDI1pzzrnZUGdUSMJQP4,4886
 unit_tests/utils/test_stream_status_utils.py,sha256=Xr8MZ2HWgTVIyMbywDvuYkRaUF4RZLQOT8-JjvcfR24,2970
 unit_tests/utils/test_traced_exception.py,sha256=bDFP5zMBizFenz6V2WvEZTRCKGB5ijh3DBezjbfoYIs,4198
-airbyte_cdk-0.58.5.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
-airbyte_cdk-0.58.5.dist-info/METADATA,sha256=oXzgSRQhxSaFJU0NpyjMFavVxfFi14BKpQLIjS4rPNU,11073
-airbyte_cdk-0.58.5.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
-airbyte_cdk-0.58.5.dist-info/top_level.txt,sha256=edvsDKTnE6sD2wfCUaeTfKf5gQIL6CPVMwVL2sWZzqo,51
-airbyte_cdk-0.58.5.dist-info/RECORD,,
+airbyte_cdk-0.58.7.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
+airbyte_cdk-0.58.7.dist-info/METADATA,sha256=PAw5bOce761Nqfm14qOw6Rk60S5JqPvK9d8oUnzv8vc,11073
+airbyte_cdk-0.58.7.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+airbyte_cdk-0.58.7.dist-info/top_level.txt,sha256=edvsDKTnE6sD2wfCUaeTfKf5gQIL6CPVMwVL2sWZzqo,51
+airbyte_cdk-0.58.7.dist-info/RECORD,,
@@ -852,6 +852,109 @@ invalid_csv_scenario: TestScenario[InMemoryFilesSource] = (
             ]
         }
     )
+    .set_expected_read_error(
+        AirbyteTracedException,
+        "Please check the logged errors for more information.",
+    )
+).build()
+
+invalid_csv_multi_scenario: TestScenario[InMemoryFilesSource] = (
+    TestScenarioBuilder[InMemoryFilesSource]()
+    .set_name("invalid_csv_multi_scenario")  # too many values for the number of headers
+    .set_config(
+        {
+            "streams": [
+                {
+                    "name": "stream1",
+                    "format": {"filetype": "csv"},
+                    "globs": ["*"],
+                    "validation_policy": "Emit Record",
+                },
+                {
+                    "name": "stream2",
+                    "format": {"filetype": "csv"},
+                    "globs": ["b.csv"],
+                    "validation_policy": "Emit Record",
+                },
+            ]
+        }
+    )
+    .set_source_builder(
+        FileBasedSourceBuilder()
+        .set_files(
+            {
+                "a.csv": {
+                    "contents": [
+                        ("col1",),
+                        ("val11", "val12"),
+                        ("val21", "val22"),
+                    ],
+                    "last_modified": "2023-06-05T03:54:07.000Z",
+                },
+                "b.csv": {
+                    "contents": [
+                        ("col3",),
+                        ("val13b", "val14b"),
+                        ("val23b", "val24b"),
+                    ],
+                    "last_modified": "2023-06-05T03:54:07.000Z",
+                },
+            }
+        )
+        .set_file_type("csv")
+    )
+    .set_expected_catalog(
+        {
+            "streams": [
+                {
+                    "default_cursor_field": ["_ab_source_file_last_modified"],
+                    "json_schema": {
+                        "type": "object",
+                        "properties": {
+                            "col1": {"type": ["null", "string"]},
+                            "col2": {"type": ["null", "string"]},
+                            "_ab_source_file_last_modified": {"type": "string"},
+                            "_ab_source_file_url": {"type": "string"},
+                        },
+                    },
+                    "name": "stream1",
+                    "source_defined_cursor": True,
+                    "supported_sync_modes": ["full_refresh", "incremental"],
+                },
+                {
+                    "json_schema": {
+                        "type": "object",
+                        "properties": {
+                            "col3": {"type": ["null", "string"]},
+                            "_ab_source_file_last_modified": {"type": "string"},
+                            "_ab_source_file_url": {"type": "string"},
+                        },
+                    },
+                    "name": "stream2",
+                    "source_defined_cursor": True,
+                    "default_cursor_field": ["_ab_source_file_last_modified"],
+                    "supported_sync_modes": ["full_refresh", "incremental"],
+                },
+            ]
+        }
+    )
+    .set_expected_records([])
+    .set_expected_discover_error(AirbyteTracedException, FileBasedSourceError.SCHEMA_INFERENCE_ERROR.value)
+    .set_expected_logs(
+        {
+            "read": [
+                {
+                    "level": "ERROR",
+                    "message": f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream=stream1 file=a.csv line_no=1 n_skipped=0",
+                },
+                {
+                    "level": "ERROR",
+                    "message": f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream=stream2 file=b.csv line_no=1 n_skipped=0",
+                },
+            ]
+        }
+    )
+    .set_expected_read_error(AirbyteTracedException, "Please check the logged errors for more information.")
 ).build()
 
 csv_single_stream_scenario: TestScenario[InMemoryFilesSource] = (
@@ -2172,17 +2275,15 @@ csv_newline_in_values_not_quoted_scenario: TestScenario[InMemoryFilesSource] = (
             },
         ]
     )
-    .set_expected_logs(
-        {
-            "read": [
-                {
-                    "level": "ERROR",
-                    "message": "Error parsing record. This could be due to a mismatch between the config's file type and the actual file type, or because the file or record is not parseable. stream=stream1 file=a.csv line_no=2 n_skipped=0",
-                }
-            ]
-        }
+    .set_expected_read_error(
+        AirbyteTracedException,
+        f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream=stream1 file=a.csv line_no=2 n_skipped=0",
     )
     .set_expected_discover_error(AirbyteTracedException, FileBasedSourceError.SCHEMA_INFERENCE_ERROR.value)
+    .set_expected_read_error(
+        AirbyteTracedException,
+        "Please check the logged errors for more information.",
+    )
 ).build()
 
 csv_escape_char_is_set_scenario: TestScenario[InMemoryFilesSource] = (
@@ -231,6 +231,10 @@ unstructured_invalid_file_type_discover_scenario_no_skip = (
     )
     .set_expected_records([])
     .set_expected_discover_error(AirbyteTracedException, "Error inferring schema from files")
+    .set_expected_read_error(
+        AirbyteTracedException,
+        "Please check the logged errors for more information.",
+    )
 ).build()
 
 # If skip unprocessable file types is set to true, then discover will succeed even if there are non-matching file types
@@ -2,7 +2,8 @@
 # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
 #
 
-from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError
+
+from airbyte_cdk.utils.traced_exception import AirbyteTracedException
 from unit_tests.sources.file_based.scenarios.file_based_source_builder import FileBasedSourceBuilder
 from unit_tests.sources.file_based.scenarios.scenario_builder import TestScenarioBuilder
 
@@ -272,6 +273,10 @@ skip_record_scenario_single_stream = (
             ]
         }
     )
+    .set_expected_read_error(
+        AirbyteTracedException,
+        "Please check the logged errors for more information.",
+    )
 ).build()
 
 
@@ -416,6 +421,10 @@ skip_record_scenario_multi_stream = (
             ]
         }
     )
+    .set_expected_read_error(
+        AirbyteTracedException,
+        "Please check the logged errors for more information.",
+    )
 ).build()
 
 
@@ -492,19 +501,9 @@ emit_record_scenario_single_stream = (
             },
         ]
     )
-    .set_expected_logs(
-        {
-            "read": [
-                {
-                    "level": "ERROR",
-                    "message": f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream=stream1 file=c.csv line_no=2 n_skipped=0",
-                },
-                {
-                    "level": "WARN",
-                    "message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer",
-                },
-            ]
-        }
+    .set_expected_read_error(
+        AirbyteTracedException,
+        "Please check the logged errors for more information.",
     )
 ).build()
 
@@ -640,23 +639,9 @@ emit_record_scenario_multi_stream = (
             },
         ]
     )
-    .set_expected_logs(
-        {
-            "read": [
-                {
-                    "level": "ERROR",
-                    "message": f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream=stream1 file=a/a3.csv line_no=2 n_skipped=0",
-                },
-                {
-                    "level": "WARN",
-                    "message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer",
-                },
-                {
-                    "level": "WARN",
-                    "message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer",
-                },
-            ]
-        }
+    .set_expected_read_error(
+        AirbyteTracedException,
+        "Please check the logged errors for more information.",
     )
 ).build()
 
@@ -2,15 +2,18 @@
 # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
 #
 
+import traceback
 import unittest
 from datetime import datetime, timezone
 from typing import Any, Iterable, Iterator, Mapping
 from unittest.mock import Mock
 
 import pytest
-from airbyte_cdk.models import Level
+from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, Level
+from airbyte_cdk.models import Type as MessageType
 from airbyte_cdk.sources.file_based.availability_strategy import AbstractFileBasedAvailabilityStrategy
 from airbyte_cdk.sources.file_based.discovery_policy import AbstractDiscoveryPolicy
+from airbyte_cdk.sources.file_based.exceptions import FileBasedErrorsCollector, FileBasedSourceError
 from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
 from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
@@ -55,12 +58,17 @@ class MockFormat:
         ),
         pytest.param(
             {"type": "object", "properties": {"prop": {"type": "string"}}},
-            {"type": ["null", "object"], "properties": {"prop": {"type": ["null", "string"]}}},
+            {
+                "type": ["null", "object"],
+                "properties": {"prop": {"type": ["null", "string"]}},
+            },
             id="deeply-nested-schema",
         ),
     ],
 )
-def test_fill_nulls(input_schema: Mapping[str, Any], expected_output: Mapping[str, Any]) -> None:
+def test_fill_nulls(
+    input_schema: Mapping[str, Any], expected_output: Mapping[str, Any]
+) -> None:
     assert DefaultFileBasedStream._fill_nulls(input_schema) == expected_output
 
 
@@ -90,21 +98,33 @@ class DefaultFileBasedStreamTest(unittest.TestCase):
             parsers={MockFormat: self._parser},
             validation_policy=self._validation_policy,
             cursor=self._cursor,
+            errors_collector=FileBasedErrorsCollector(),
         )
 
     def test_when_read_records_from_slice_then_return_records(self) -> None:
         self._parser.parse_records.return_value = [self._A_RECORD]
-        messages = list(self._stream.read_records_from_slice({"files": [RemoteFile(uri="uri", last_modified=self._NOW)]}))
-        assert list(map(lambda message: message.record.data["data"], messages)) == [self._A_RECORD]
+        messages = list(
+            self._stream.read_records_from_slice(
+                {"files": [RemoteFile(uri="uri", last_modified=self._NOW)]}
+            )
+        )
+        assert list(map(lambda message: message.record.data["data"], messages)) == [
+            self._A_RECORD
+        ]
 
-    def test_given_exception_when_read_records_from_slice_then_do_process_other_files(self) -> None:
+    def test_given_exception_when_read_records_from_slice_then_do_process_other_files(
+        self,
+    ) -> None:
         """
         The current behavior for source-s3 v3 does not fail sync on some errors and hence, we will keep this behaviour for now. One example
         we can easily reproduce this is by having a file with gzip extension that is not actually a gzip file. The reader will fail to open
         the file but the sync won't fail.
         Ticket: https://github.com/airbytehq/airbyte/issues/29680
         """
-        self._parser.parse_records.side_effect = [ValueError("An error"), [self._A_RECORD]]
+        self._parser.parse_records.side_effect = [
+            ValueError("An error"),
+            [self._A_RECORD],
+        ]
 
         messages = list(
             self._stream.read_records_from_slice(
@@ -120,7 +140,9 @@ class DefaultFileBasedStreamTest(unittest.TestCase):
         assert messages[0].log.level == Level.ERROR
         assert messages[1].record.data["data"] == self._A_RECORD
 
-    def test_given_traced_exception_when_read_records_from_slice_then_fail(self) -> None:
+    def test_given_traced_exception_when_read_records_from_slice_then_fail(
+        self,
+    ) -> None:
         """
         When a traced exception is raised, the stream shouldn't try to handle but pass it on to the caller.
         """
@@ -138,10 +160,14 @@
             )
         )
 
-    def test_given_exception_after_skipping_records_when_read_records_from_slice_then_send_warning(self) -> None:
+    def test_given_exception_after_skipping_records_when_read_records_from_slice_then_send_warning(
+        self,
+    ) -> None:
         self._stream_config.schemaless = False
         self._validation_policy.record_passes_validation_policy.return_value = False
-        self._parser.parse_records.side_effect = [self._iter([self._A_RECORD, ValueError("An error")])]
+        self._parser.parse_records.side_effect = [
+            self._iter([self._A_RECORD, ValueError("An error")])
+        ]
 
         messages = list(
             self._stream.read_records_from_slice(
@@ -183,3 +209,54 @@
         if isinstance(item, Exception):
             raise item
         yield item
+
+
+class TestFileBasedErrorCollector:
+    test_error_collector: FileBasedErrorsCollector = FileBasedErrorsCollector()
+
+    @pytest.mark.parametrize(
+        "stream, file, line_no, n_skipped, collector_expected_len",
+        (
+            ("stream_1", "test.csv", 1, 1, 1),
+            ("stream_2", "test2.csv", 2, 2, 2),
+        ),
+        ids=[
+            "Single error",
+            "Multiple errors",
+        ],
+    )
+    def test_collect_parsing_error(
+        self, stream, file, line_no, n_skipped, collector_expected_len
+    ) -> None:
+        test_error_pattern = "Error parsing record."
+        # format the error body
+        test_error = (
+            AirbyteMessage(
+                type=MessageType.LOG,
+                log=AirbyteLogMessage(
+                    level=Level.ERROR,
+                    message=f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream={stream} file={file} line_no={line_no} n_skipped={n_skipped}",
+                    stack_trace=traceback.format_exc(),
+                ),
+            ),
+        )
+        # collecting the error
+        self.test_error_collector.collect(test_error)
+        # check the error has been collected
+        assert len(self.test_error_collector.errors) == collector_expected_len
+        # check for the patern presence for the collected errors
+        for error in self.test_error_collector.errors:
+            assert test_error_pattern in error[0].log.message
+
+    def test_yield_and_raise_collected(self) -> None:
+        # we expect the following method will raise the AirbyteTracedException
+        with pytest.raises(AirbyteTracedException) as parse_error:
+            list(self.test_error_collector.yield_and_raise_collected())
+        assert (
+            parse_error.value.message
+            == "Some errors occured while reading from the source."
+        )
+        assert (
+            parse_error.value.internal_message
+            == "Please check the logged errors for more information."
+        )
@@ -48,6 +48,7 @@ from unit_tests.sources.file_based.scenarios.csv_scenarios import (
     csv_strings_can_be_null_not_quoted_scenario,
     earlier_csv_scenario,
     empty_schema_inference_scenario,
+    invalid_csv_multi_scenario,
    invalid_csv_scenario,
     multi_csv_scenario,
     multi_csv_stream_n_file_exceeds_limit_for_inference,
@@ -132,6 +133,7 @@ discover_scenarios = [
     csv_multi_stream_scenario,
     csv_single_stream_scenario,
     invalid_csv_scenario,
+    invalid_csv_multi_scenario,
     single_csv_scenario,
     multi_csv_scenario,
     multi_csv_stream_n_file_exceeds_limit_for_inference,