airbyte-cdk 0.58.5__py3-none-any.whl → 0.58.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/sources/file_based/config/avro_format.py +1 -0
- airbyte_cdk/sources/file_based/config/parquet_format.py +1 -0
- airbyte_cdk/sources/file_based/exceptions.py +26 -1
- airbyte_cdk/sources/file_based/file_based_source.py +5 -1
- airbyte_cdk/sources/file_based/file_types/avro_parser.py +15 -9
- airbyte_cdk/sources/file_based/file_types/csv_parser.py +19 -11
- airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +1 -1
- airbyte_cdk/sources/file_based/file_types/parquet_parser.py +22 -14
- airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +3 -1
- airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +8 -6
- {airbyte_cdk-0.58.5.dist-info → airbyte_cdk-0.58.7.dist-info}/METADATA +1 -1
- {airbyte_cdk-0.58.5.dist-info → airbyte_cdk-0.58.7.dist-info}/RECORD +20 -20
- unit_tests/sources/file_based/scenarios/csv_scenarios.py +110 -9
- unit_tests/sources/file_based/scenarios/unstructured_scenarios.py +4 -0
- unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py +16 -31
- unit_tests/sources/file_based/stream/test_default_file_based_stream.py +87 -10
- unit_tests/sources/file_based/test_file_based_scenarios.py +2 -0
- {airbyte_cdk-0.58.5.dist-info → airbyte_cdk-0.58.7.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-0.58.5.dist-info → airbyte_cdk-0.58.7.dist-info}/WHEEL +0 -0
- {airbyte_cdk-0.58.5.dist-info → airbyte_cdk-0.58.7.dist-info}/top_level.txt +0 -0
| @@ -3,8 +3,9 @@ | |
| 3 3 | 
             
            #
         | 
| 4 4 |  | 
| 5 5 | 
             
            from enum import Enum
         | 
| 6 | 
            -
            from typing import Union
         | 
| 6 | 
            +
            from typing import Any, List, Union
         | 
| 7 7 |  | 
| 8 | 
            +
            from airbyte_cdk.models import AirbyteMessage, FailureType
         | 
| 8 9 | 
             
            from airbyte_cdk.utils import AirbyteTracedException
         | 
| 9 10 |  | 
| 10 11 |  | 
| @@ -40,6 +41,30 @@ class FileBasedSourceError(Enum): | |
| 40 41 | 
             
                UNDEFINED_VALIDATION_POLICY = "The validation policy defined in the config does not exist for the source."
         | 
| 41 42 |  | 
| 42 43 |  | 
| 44 | 
            +
            class FileBasedErrorsCollector:
         | 
| 45 | 
            +
                """
         | 
| 46 | 
            +
                The placeholder for all errors collected.
         | 
| 47 | 
            +
                """
         | 
| 48 | 
            +
             | 
| 49 | 
            +
                errors: List[AirbyteMessage] = []
         | 
| 50 | 
            +
             | 
| 51 | 
            +
                def yield_and_raise_collected(self) -> Any:
         | 
| 52 | 
            +
                    if self.errors:
         | 
| 53 | 
            +
                        # emit collected logged messages
         | 
| 54 | 
            +
                        yield from self.errors
         | 
| 55 | 
            +
                        # clean the collector
         | 
| 56 | 
            +
                        self.errors.clear()
         | 
| 57 | 
            +
                        # raising the single exception
         | 
| 58 | 
            +
                        raise AirbyteTracedException(
         | 
| 59 | 
            +
                            internal_message="Please check the logged errors for more information.",
         | 
| 60 | 
            +
                            message="Some errors occured while reading from the source.",
         | 
| 61 | 
            +
                            failure_type=FailureType.config_error,
         | 
| 62 | 
            +
                        )
         | 
| 63 | 
            +
             | 
| 64 | 
            +
                def collect(self, logged_error: AirbyteMessage) -> None:
         | 
| 65 | 
            +
                    self.errors.append(logged_error)
         | 
| 66 | 
            +
             | 
| 67 | 
            +
             | 
| 43 68 | 
             
            class BaseFileBasedSourceError(Exception):
         | 
| 44 69 | 
             
                def __init__(self, error: Union[FileBasedSourceError, str], **kwargs):  # type: ignore # noqa
         | 
| 45 70 | 
             
                    if isinstance(error, FileBasedSourceError):
         | 
| @@ -14,7 +14,7 @@ from airbyte_cdk.sources.file_based.availability_strategy import AbstractFileBas | |
| 14 14 | 
             
            from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import AbstractFileBasedSpec
         | 
| 15 15 | 
             
            from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig, ValidationPolicy
         | 
| 16 16 | 
             
            from airbyte_cdk.sources.file_based.discovery_policy import AbstractDiscoveryPolicy, DefaultDiscoveryPolicy
         | 
| 17 | 
            -
            from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError, FileBasedSourceError
         | 
| 17 | 
            +
            from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError, FileBasedErrorsCollector, FileBasedSourceError
         | 
| 18 18 | 
             
            from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
         | 
| 19 19 | 
             
            from airbyte_cdk.sources.file_based.file_types import default_parsers
         | 
| 20 20 | 
             
            from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
         | 
| @@ -49,6 +49,7 @@ class FileBasedSource(AbstractSource, ABC): | |
| 49 49 | 
             
                    self.stream_schemas = {s.stream.name: s.stream.json_schema for s in catalog.streams} if catalog else {}
         | 
| 50 50 | 
             
                    self.cursor_cls = cursor_cls
         | 
| 51 51 | 
             
                    self.logger = logging.getLogger(f"airbyte.{self.name}")
         | 
| 52 | 
            +
                    self.errors_collector: FileBasedErrorsCollector = FileBasedErrorsCollector()
         | 
| 52 53 |  | 
| 53 54 | 
             
                def check_connection(self, logger: logging.Logger, config: Mapping[str, Any]) -> Tuple[bool, Optional[Any]]:
         | 
| 54 55 | 
             
                    """
         | 
| @@ -106,6 +107,7 @@ class FileBasedSource(AbstractSource, ABC): | |
| 106 107 | 
             
                                    parsers=self.parsers,
         | 
| 107 108 | 
             
                                    validation_policy=self._validate_and_get_validation_policy(stream_config),
         | 
| 108 109 | 
             
                                    cursor=self.cursor_cls(stream_config),
         | 
| 110 | 
            +
                                    errors_collector=self.errors_collector,
         | 
| 109 111 | 
             
                                )
         | 
| 110 112 | 
             
                            )
         | 
| 111 113 | 
             
                        return streams
         | 
| @@ -121,6 +123,8 @@ class FileBasedSource(AbstractSource, ABC): | |
| 121 123 | 
             
                    state: Optional[Union[List[AirbyteStateMessage], MutableMapping[str, Any]]] = None,
         | 
| 122 124 | 
             
                ) -> Iterator[AirbyteMessage]:
         | 
| 123 125 | 
             
                    yield from super().read(logger, config, catalog, state)
         | 
| 126 | 
            +
                    # emit all the errors collected
         | 
| 127 | 
            +
                    yield from self.errors_collector.yield_and_raise_collected()
         | 
| 124 128 | 
             
                    # count streams using a certain parser
         | 
| 125 129 | 
             
                    parsed_config = self._get_parsed_config(config)
         | 
| 126 130 | 
             
                    for parser, count in Counter(stream.format.filetype for stream in parsed_config.streams).items():
         | 
| @@ -8,6 +8,7 @@ from typing import Any, Dict, Iterable, Mapping, Optional, Tuple | |
| 8 8 | 
             
            import fastavro
         | 
| 9 9 | 
             
            from airbyte_cdk.sources.file_based.config.avro_format import AvroFormat
         | 
| 10 10 | 
             
            from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
         | 
| 11 | 
            +
            from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError
         | 
| 11 12 | 
             
            from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
         | 
| 12 13 | 
             
            from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
         | 
| 13 14 | 
             
            from airbyte_cdk.sources.file_based.remote_file import RemoteFile
         | 
| @@ -144,15 +145,20 @@ class AvroParser(FileTypeParser): | |
| 144 145 | 
             
                    if not isinstance(avro_format, AvroFormat):
         | 
| 145 146 | 
             
                        raise ValueError(f"Expected ParquetFormat, got {avro_format}")
         | 
| 146 147 |  | 
| 147 | 
            -
                     | 
| 148 | 
            -
             | 
| 149 | 
            -
                         | 
| 150 | 
            -
             | 
| 151 | 
            -
             | 
| 152 | 
            -
                             | 
| 153 | 
            -
             | 
| 154 | 
            -
                                 | 
| 155 | 
            -
             | 
| 148 | 
            +
                    line_no = 0
         | 
| 149 | 
            +
                    try:
         | 
| 150 | 
            +
                        with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp:
         | 
| 151 | 
            +
                            avro_reader = fastavro.reader(fp)
         | 
| 152 | 
            +
                            schema = avro_reader.writer_schema
         | 
| 153 | 
            +
                            schema_field_name_to_type = {field["name"]: field["type"] for field in schema["fields"]}
         | 
| 154 | 
            +
                            for record in avro_reader:
         | 
| 155 | 
            +
                                line_no += 1
         | 
| 156 | 
            +
                                yield {
         | 
| 157 | 
            +
                                    record_field: self._to_output_value(avro_format, schema_field_name_to_type[record_field], record[record_field])
         | 
| 158 | 
            +
                                    for record_field, record_value in schema_field_name_to_type.items()
         | 
| 159 | 
            +
                                }
         | 
| 160 | 
            +
                    except Exception as exc:
         | 
| 161 | 
            +
                        raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri, lineno=line_no) from exc
         | 
| 156 162 |  | 
| 157 163 | 
             
                @property
         | 
| 158 164 | 
             
                def file_read_mode(self) -> FileReadMode:
         | 
| @@ -178,17 +178,25 @@ class CsvParser(FileTypeParser): | |
| 178 178 | 
             
                    logger: logging.Logger,
         | 
| 179 179 | 
             
                    discovered_schema: Optional[Mapping[str, SchemaType]],
         | 
| 180 180 | 
             
                ) -> Iterable[Dict[str, Any]]:
         | 
| 181 | 
            -
                     | 
| 182 | 
            -
                     | 
| 183 | 
            -
                         | 
| 184 | 
            -
                         | 
| 185 | 
            -
             | 
| 186 | 
            -
             | 
| 187 | 
            -
             | 
| 188 | 
            -
             | 
| 189 | 
            -
             | 
| 190 | 
            -
                         | 
| 191 | 
            -
             | 
| 181 | 
            +
                    line_no = 0
         | 
| 182 | 
            +
                    try:
         | 
| 183 | 
            +
                        config_format = _extract_format(config)
         | 
| 184 | 
            +
                        if discovered_schema:
         | 
| 185 | 
            +
                            property_types = {col: prop["type"] for col, prop in discovered_schema["properties"].items()}  # type: ignore # discovered_schema["properties"] is known to be a mapping
         | 
| 186 | 
            +
                            deduped_property_types = CsvParser._pre_propcess_property_types(property_types)
         | 
| 187 | 
            +
                        else:
         | 
| 188 | 
            +
                            deduped_property_types = {}
         | 
| 189 | 
            +
                        cast_fn = CsvParser._get_cast_function(deduped_property_types, config_format, logger, config.schemaless)
         | 
| 190 | 
            +
                        data_generator = self._csv_reader.read_data(config, file, stream_reader, logger, self.file_read_mode)
         | 
| 191 | 
            +
                        for row in data_generator:
         | 
| 192 | 
            +
                            line_no += 1
         | 
| 193 | 
            +
                            yield CsvParser._to_nullable(
         | 
| 194 | 
            +
                                cast_fn(row), deduped_property_types, config_format.null_values, config_format.strings_can_be_null
         | 
| 195 | 
            +
                            )
         | 
| 196 | 
            +
                    except RecordParseError as parse_err:
         | 
| 197 | 
            +
                        raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri, lineno=line_no) from parse_err
         | 
| 198 | 
            +
                    finally:
         | 
| 199 | 
            +
                        data_generator.close()
         | 
| 192 200 |  | 
| 193 201 | 
             
                @property
         | 
| 194 202 | 
             
                def file_read_mode(self) -> FileReadMode:
         | 
| @@ -119,7 +119,7 @@ class JsonlParser(FileTypeParser): | |
| 119 119 | 
             
                                break
         | 
| 120 120 |  | 
| 121 121 | 
             
                        if had_json_parsing_error and not yielded_at_least_once:
         | 
| 122 | 
            -
                            raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD)
         | 
| 122 | 
            +
                            raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri, lineno=line)
         | 
| 123 123 |  | 
| 124 124 | 
             
                @staticmethod
         | 
| 125 125 | 
             
                def _instantiate_accumulator(line: Union[bytes, str]) -> Union[bytes, str]:
         | 
| @@ -11,7 +11,7 @@ from urllib.parse import unquote | |
| 11 11 | 
             
            import pyarrow as pa
         | 
| 12 12 | 
             
            import pyarrow.parquet as pq
         | 
| 13 13 | 
             
            from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig, ParquetFormat
         | 
| 14 | 
            -
            from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError, FileBasedSourceError
         | 
| 14 | 
            +
            from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError, FileBasedSourceError, RecordParseError
         | 
| 15 15 | 
             
            from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
         | 
| 16 16 | 
             
            from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
         | 
| 17 17 | 
             
            from airbyte_cdk.sources.file_based.remote_file import RemoteFile
         | 
| @@ -64,19 +64,27 @@ class ParquetParser(FileTypeParser): | |
| 64 64 | 
             
                    if not isinstance(parquet_format, ParquetFormat):
         | 
| 65 65 | 
             
                        logger.info(f"Expected ParquetFormat, got {parquet_format}")
         | 
| 66 66 | 
             
                        raise ConfigValidationError(FileBasedSourceError.CONFIG_VALIDATION_ERROR)
         | 
| 67 | 
            -
             | 
| 68 | 
            -
             | 
| 69 | 
            -
             | 
| 70 | 
            -
                         | 
| 71 | 
            -
                             | 
| 72 | 
            -
                            for  | 
| 73 | 
            -
             | 
| 74 | 
            -
             | 
| 75 | 
            -
             | 
| 76 | 
            -
             | 
| 77 | 
            -
                                     | 
| 78 | 
            -
             | 
| 79 | 
            -
             | 
| 67 | 
            +
             | 
| 68 | 
            +
                    line_no = 0
         | 
| 69 | 
            +
                    try:
         | 
| 70 | 
            +
                        with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp:
         | 
| 71 | 
            +
                            reader = pq.ParquetFile(fp)
         | 
| 72 | 
            +
                            partition_columns = {x.split("=")[0]: x.split("=")[1] for x in self._extract_partitions(file.uri)}
         | 
| 73 | 
            +
                            for row_group in range(reader.num_row_groups):
         | 
| 74 | 
            +
                                batch = reader.read_row_group(row_group)
         | 
| 75 | 
            +
                                for row in range(batch.num_rows):
         | 
| 76 | 
            +
                                    line_no += 1
         | 
| 77 | 
            +
                                    yield {
         | 
| 78 | 
            +
                                        **{
         | 
| 79 | 
            +
                                            column: ParquetParser._to_output_value(batch.column(column)[row], parquet_format)
         | 
| 80 | 
            +
                                            for column in batch.column_names
         | 
| 81 | 
            +
                                        },
         | 
| 82 | 
            +
                                        **partition_columns,
         | 
| 83 | 
            +
                                    }
         | 
| 84 | 
            +
                    except Exception as exc:
         | 
| 85 | 
            +
                        raise RecordParseError(
         | 
| 86 | 
            +
                            FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri, lineno=f"{row_group=}, {line_no=}"
         | 
| 87 | 
            +
                        ) from exc
         | 
| 80 88 |  | 
| 81 89 | 
             
                @staticmethod
         | 
| 82 90 | 
             
                def _extract_partitions(filepath: str) -> List[str]:
         | 
| @@ -10,7 +10,7 @@ from airbyte_cdk.models import SyncMode | |
| 10 10 | 
             
            from airbyte_cdk.sources.file_based.availability_strategy import AbstractFileBasedAvailabilityStrategy
         | 
| 11 11 | 
             
            from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig, PrimaryKeyType
         | 
| 12 12 | 
             
            from airbyte_cdk.sources.file_based.discovery_policy import AbstractDiscoveryPolicy
         | 
| 13 | 
            -
            from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError, UndefinedParserError
         | 
| 13 | 
            +
            from airbyte_cdk.sources.file_based.exceptions import FileBasedErrorsCollector, FileBasedSourceError, RecordParseError, UndefinedParserError
         | 
| 14 14 | 
             
            from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
         | 
| 15 15 | 
             
            from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
         | 
| 16 16 | 
             
            from airbyte_cdk.sources.file_based.remote_file import RemoteFile
         | 
| @@ -44,6 +44,7 @@ class AbstractFileBasedStream(Stream): | |
| 44 44 | 
             
                    discovery_policy: AbstractDiscoveryPolicy,
         | 
| 45 45 | 
             
                    parsers: Dict[Type[Any], FileTypeParser],
         | 
| 46 46 | 
             
                    validation_policy: AbstractSchemaValidationPolicy,
         | 
| 47 | 
            +
                    errors_collector: FileBasedErrorsCollector,
         | 
| 47 48 | 
             
                ):
         | 
| 48 49 | 
             
                    super().__init__()
         | 
| 49 50 | 
             
                    self.config = config
         | 
| @@ -53,6 +54,7 @@ class AbstractFileBasedStream(Stream): | |
| 53 54 | 
             
                    self._discovery_policy = discovery_policy
         | 
| 54 55 | 
             
                    self._availability_strategy = availability_strategy
         | 
| 55 56 | 
             
                    self._parsers = parsers
         | 
| 57 | 
            +
                    self.errors_collector = errors_collector
         | 
| 56 58 |  | 
| 57 59 | 
             
                @property
         | 
| 58 60 | 
             
                @abstractmethod
         | 
| @@ -112,12 +112,14 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin): | |
| 112 112 | 
             
                        except RecordParseError:
         | 
| 113 113 | 
             
                            # Increment line_no because the exception was raised before we could increment it
         | 
| 114 114 | 
             
                            line_no += 1
         | 
| 115 | 
            -
                             | 
| 116 | 
            -
                                 | 
| 117 | 
            -
             | 
| 118 | 
            -
                                     | 
| 119 | 
            -
             | 
| 120 | 
            -
             | 
| 115 | 
            +
                            self.errors_collector.collect(
         | 
| 116 | 
            +
                                AirbyteMessage(
         | 
| 117 | 
            +
                                    type=MessageType.LOG,
         | 
| 118 | 
            +
                                    log=AirbyteLogMessage(
         | 
| 119 | 
            +
                                        level=Level.ERROR,
         | 
| 120 | 
            +
                                        message=f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream={self.name} file={file.uri} line_no={line_no} n_skipped={n_skipped}",
         | 
| 121 | 
            +
                                        stack_trace=traceback.format_exc(),
         | 
| 122 | 
            +
                                    ),
         | 
| 121 123 | 
             
                                ),
         | 
| 122 124 | 
             
                            )
         | 
| 123 125 |  | 
| @@ -152,8 +152,8 @@ airbyte_cdk/sources/embedded/catalog.py,sha256=mIM7rO5CZAUIHKbrKwn1-Zn9_e3sLiHrT | |
| 152 152 | 
             
            airbyte_cdk/sources/embedded/runner.py,sha256=kZ0CcUANuMjdZ4fmvp_w9P2IcsS9WSHxNqYHqMwcfXI,1390
         | 
| 153 153 | 
             
            airbyte_cdk/sources/embedded/tools.py,sha256=-Z4tZ4AP1OTi_zrqFM3YV8Rt7c60wvsrv0Dc-rTZ2uw,744
         | 
| 154 154 | 
             
            airbyte_cdk/sources/file_based/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
         | 
| 155 | 
            -
            airbyte_cdk/sources/file_based/exceptions.py,sha256 | 
| 156 | 
            -
            airbyte_cdk/sources/file_based/file_based_source.py,sha256= | 
| 155 | 
            +
            airbyte_cdk/sources/file_based/exceptions.py,sha256=-SjdDk-mbkp5qQVUESkn788W8NmGtC2LROkZRKS_Dxc,5613
         | 
| 156 | 
            +
            airbyte_cdk/sources/file_based/file_based_source.py,sha256=XddFHSiL_a-VOfQF33yXVapUG6wvHu2hd9xxYEoBcuc,8180
         | 
| 157 157 | 
             
            airbyte_cdk/sources/file_based/file_based_stream_reader.py,sha256=K9fFHcSL4E8v-X2l38wRAcZCjpyifr35orvby8vQt84,3749
         | 
| 158 158 | 
             
            airbyte_cdk/sources/file_based/remote_file.py,sha256=dtRX7X06Fug-XDz93a5lkwPQy5nQgxH0-ZcXW2HuMGI,312
         | 
| 159 159 | 
             
            airbyte_cdk/sources/file_based/schema_helpers.py,sha256=XBkOutIw_n6SNYU34qbyTbl0Ppt0i4k3sVFMSaX3wJo,9103
         | 
| @@ -163,28 +163,28 @@ airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availab | |
| 163 163 | 
             
            airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py,sha256=TXo8kdwOQ3XiQbS3ccPtj9FghHFpiVL2JRWjen3NRXw,5289
         | 
| 164 164 | 
             
            airbyte_cdk/sources/file_based/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
         | 
| 165 165 | 
             
            airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py,sha256=dgOoQuoi7-7wdTMSP7wz4ENXIDT49Ew4FoAxnnplGGc,4956
         | 
| 166 | 
            -
            airbyte_cdk/sources/file_based/config/avro_format.py,sha256= | 
| 166 | 
            +
            airbyte_cdk/sources/file_based/config/avro_format.py,sha256=q1I2G9bGNy3ADds35PfWT7Mss6fjYzUtYDkUYvh5v7s,712
         | 
| 167 167 | 
             
            airbyte_cdk/sources/file_based/config/csv_format.py,sha256=L3JEgb91yrCob1oYrGl0088QEWblkOsRfDmMfWRQ0bg,7482
         | 
| 168 168 | 
             
            airbyte_cdk/sources/file_based/config/file_based_stream_config.py,sha256=l9DFyttYbxY9exwy67WzRXySEk_yKV2G_THRA_Sq1I4,4229
         | 
| 169 169 | 
             
            airbyte_cdk/sources/file_based/config/jsonl_format.py,sha256=fAPzZnoghGgHjaDvx6Qo68C8j54mBxo1NTdpwSI0VZo,374
         | 
| 170 | 
            -
            airbyte_cdk/sources/file_based/config/parquet_format.py,sha256= | 
| 170 | 
            +
            airbyte_cdk/sources/file_based/config/parquet_format.py,sha256=yKHgXYu3zJWrGfBlJ3JQZ3gVFPumF-K4rjVPNoYTUZ0,737
         | 
| 171 171 | 
             
            airbyte_cdk/sources/file_based/config/unstructured_format.py,sha256=axuIc4xaac7vTJKi8I9l7Lgn8gGu6bCuZNouQAEAvYs,3513
         | 
| 172 172 | 
             
            airbyte_cdk/sources/file_based/discovery_policy/__init__.py,sha256=x_7JsQGiS7Ytmr0ZDS0SNYGcNUzC4wCm3_1-Mf3ZFnw,283
         | 
| 173 173 | 
             
            airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py,sha256=0o_qmEO0-IojO4Ckgp4V3ackTM9Ui1sUHW5HwANueLM,621
         | 
| 174 174 | 
             
            airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py,sha256=QeZghVmf2Cq4wy_6NYcHmR6SLgdWfsGgctYg2ZsjFE4,939
         | 
| 175 175 | 
             
            airbyte_cdk/sources/file_based/file_types/__init__.py,sha256=hWSDNKCIqvi7gOyfZJezuKt6-JtVroerUVTvW3ZY-R4,1017
         | 
| 176 | 
            -
            airbyte_cdk/sources/file_based/file_types/avro_parser.py,sha256= | 
| 177 | 
            -
            airbyte_cdk/sources/file_based/file_types/csv_parser.py,sha256= | 
| 176 | 
            +
            airbyte_cdk/sources/file_based/file_types/avro_parser.py,sha256=FC3L6D32SzhAv4jyS5dnsIgYmvGHgbomJpI2xRWrbZ0,9167
         | 
| 177 | 
            +
            airbyte_cdk/sources/file_based/file_types/csv_parser.py,sha256=HqglbQ7vopMbEWC5F-PuB-4ycXgDHLHxq1sN6IGPUpE,18215
         | 
| 178 178 | 
             
            airbyte_cdk/sources/file_based/file_types/file_type_parser.py,sha256=Gbn-8v1-jLhKpJXTNOOc5PZT1Jzah6G-INCZt4snLdQ,2819
         | 
| 179 | 
            -
            airbyte_cdk/sources/file_based/file_types/jsonl_parser.py,sha256= | 
| 180 | 
            -
            airbyte_cdk/sources/file_based/file_types/parquet_parser.py,sha256= | 
| 179 | 
            +
            airbyte_cdk/sources/file_based/file_types/jsonl_parser.py,sha256=MkjK_J2OqzqRPyGeQFQFADxgwqsRaNtoawB7dwKxWb0,5666
         | 
| 180 | 
            +
            airbyte_cdk/sources/file_based/file_types/parquet_parser.py,sha256=8ZuuYnS2AzlJ-IaeBP6Pnjzu4Z2zzfBWw_x9Rt9a5Qs,9363
         | 
| 181 181 | 
             
            airbyte_cdk/sources/file_based/file_types/unstructured_parser.py,sha256=omYdo6daIHI-YWF9WsKFdFHRXTFWgJjJ3OqegiN345k,16736
         | 
| 182 182 | 
             
            airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py,sha256=sEVnRhZ8x9f7PNjo6lewxid9z0PI8eSj7gSoFC3MH1Y,527
         | 
| 183 183 | 
             
            airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py,sha256=uwk6Ugf23xKG4PRPVVRVwpcNjTwPgxejl03vLSEzK0s,604
         | 
| 184 184 | 
             
            airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py,sha256=ZeAa0z50ywMU2chNjQ7JpL4yePU1NajhBa8FS7rXLVo,1643
         | 
| 185 185 | 
             
            airbyte_cdk/sources/file_based/stream/__init__.py,sha256=QPDqdgjsabOQD93dSFqHGaFS_3pIwm-chEabZHiPJi0,265
         | 
| 186 | 
            -
            airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py,sha256= | 
| 187 | 
            -
            airbyte_cdk/sources/file_based/stream/default_file_based_stream.py,sha256= | 
| 186 | 
            +
            airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py,sha256=GJu02B2_fcfaOnSBvhpRyXIEEtu4v8ubFR_vQpe-YAU,6405
         | 
| 187 | 
            +
            airbyte_cdk/sources/file_based/stream/default_file_based_stream.py,sha256=Ou8vKnaR7yhEG9NoOLfwOHqxYBro055nB9CCBPC2I2s,12555
         | 
| 188 188 | 
             
            airbyte_cdk/sources/file_based/stream/cursor/__init__.py,sha256=MhFB5hOo8sjwvCh8gangaymdg3EJWYt_72brFOZt068,191
         | 
| 189 189 | 
             
            airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py,sha256=i-FPeK8lwCzX34GCcmvL5Yvdh8-uu7FeCVYDoFbD7IY,1920
         | 
| 190 190 | 
             
            airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py,sha256=kuJRKgDYOGXRk0V0I8BpFxg0hGv7SfV_nBpmmn45F88,6815
         | 
| @@ -363,7 +363,7 @@ unit_tests/sources/declarative/stream_slicers/test_cartesian_product_stream_slic | |
| 363 363 | 
             
            unit_tests/sources/file_based/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
         | 
| 364 364 | 
             
            unit_tests/sources/file_based/helpers.py,sha256=MZTwaWtX0a6TPbFcUMP-EgqBunK2wpoElgApCEE1bN4,2659
         | 
| 365 365 | 
             
            unit_tests/sources/file_based/in_memory_files_source.py,sha256=r2yD6-_ABXG7_PIyTq4ACN21sHyg3g-Hd9dIgxfDQUk,8235
         | 
| 366 | 
            -
            unit_tests/sources/file_based/test_file_based_scenarios.py,sha256= | 
| 366 | 
            +
            unit_tests/sources/file_based/test_file_based_scenarios.py,sha256=rQaORUdsRdWxTMMshJxAxnp3x6Bsnuirit4yjrT0Oao,11680
         | 
| 367 367 | 
             
            unit_tests/sources/file_based/test_file_based_stream_reader.py,sha256=P6yTp7tbPfREzi5SXg4SSSql5nxiRV571YdOmwb_SzY,9219
         | 
| 368 368 | 
             
            unit_tests/sources/file_based/test_scenarios.py,sha256=ONBUwnX_dWOaejKiuJQgMRWgr_0NCWJpTwf4nvw_ePg,8008
         | 
| 369 369 | 
             
            unit_tests/sources/file_based/test_schema_helpers.py,sha256=IYIDdLRK41RkSG_ZW2cagAt9krV4QLbkzu6r7vPx9Js,12047
         | 
| @@ -384,18 +384,18 @@ unit_tests/sources/file_based/file_types/test_unstructured_parser.py,sha256=kmVl | |
| 384 384 | 
             
            unit_tests/sources/file_based/scenarios/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
         | 
| 385 385 | 
             
            unit_tests/sources/file_based/scenarios/avro_scenarios.py,sha256=oeQUmCV7d2aTShreYc-PvVb4cWqLSsVwHfg-lcKjzPs,30554
         | 
| 386 386 | 
             
            unit_tests/sources/file_based/scenarios/check_scenarios.py,sha256=0xkt21ASTnTAMP0RYJEsF3yMGsNN7wWOoG_tmzL9PYw,6750
         | 
| 387 | 
            -
            unit_tests/sources/file_based/scenarios/csv_scenarios.py,sha256= | 
| 387 | 
            +
            unit_tests/sources/file_based/scenarios/csv_scenarios.py,sha256=4RYb8_C7sJM_6pI-cftP5fCk0i6dr4BW1lpY5iQDdN8,123795
         | 
| 388 388 | 
             
            unit_tests/sources/file_based/scenarios/file_based_source_builder.py,sha256=wgb7l5VohcEvZT82ZpJcjINSrjuJtzJZS4zuZjdKpJ4,3874
         | 
| 389 389 | 
             
            unit_tests/sources/file_based/scenarios/incremental_scenarios.py,sha256=B7YE2IbvgTH_v7DYQEuv7yn2IG15aKUvJ_7dA4d3Cg4,69413
         | 
| 390 390 | 
             
            unit_tests/sources/file_based/scenarios/jsonl_scenarios.py,sha256=LsOf-tpjWNuwskPcgAMhMpQQ3iaHaD3PjPmt2M2zSzo,31839
         | 
| 391 391 | 
             
            unit_tests/sources/file_based/scenarios/parquet_scenarios.py,sha256=MGgLCqkTJb8uNEwYZY3zbVVDZRSBKSmf2s8VMuYse_I,26549
         | 
| 392 392 | 
             
            unit_tests/sources/file_based/scenarios/scenario_builder.py,sha256=zSZtaYUflkosflxQGrTDxiJ24mhFsTJYosKyxAHgWbM,9475
         | 
| 393 | 
            -
            unit_tests/sources/file_based/scenarios/unstructured_scenarios.py,sha256= | 
| 393 | 
            +
            unit_tests/sources/file_based/scenarios/unstructured_scenarios.py,sha256=2_p15Phk2xiBgZ0OdGYrCU9eAlTT8h_SU5nk1ehUcLk,67894
         | 
| 394 394 | 
             
            unit_tests/sources/file_based/scenarios/user_input_schema_scenarios.py,sha256=FVYbRfdj2RCLFVwUNqQKiBFMm78y6FvmTO447i3SXqY,28697
         | 
| 395 | 
            -
            unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py,sha256= | 
| 395 | 
            +
            unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py,sha256=fFcNR-lzfLQ5SS8Uetbx6iFijgs_OXlqYz3Pr1OVTAI,32221
         | 
| 396 396 | 
             
            unit_tests/sources/file_based/stream/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
         | 
| 397 397 | 
             
            unit_tests/sources/file_based/stream/test_default_file_based_cursor.py,sha256=XhtCGvgSBFyeQwgqGciPsIB1HIlWqTcXROwnxrjutHc,13109
         | 
| 398 | 
            -
            unit_tests/sources/file_based/stream/test_default_file_based_stream.py,sha256= | 
| 398 | 
            +
            unit_tests/sources/file_based/stream/test_default_file_based_stream.py,sha256=1GZPMIL00KGMIaYcPPBhQ0gpkYAJ48xtxXOgEwxkg84,10263
         | 
| 399 399 | 
             
            unit_tests/sources/fixtures/__init__.py,sha256=ZnqYNxHsKCgO38IwB34RQyRMXTs4GTvlRi3ImKnIioo,61
         | 
| 400 400 | 
             
            unit_tests/sources/fixtures/source_test_fixture.py,sha256=dvpISgio2sOp-U3bXudH_49vY4c68sO_PMs1JZTMaj0,5502
         | 
| 401 401 | 
             
            unit_tests/sources/message/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
         | 
| @@ -444,8 +444,8 @@ unit_tests/utils/test_schema_inferrer.py,sha256=Z2jHBZ540wnYkylIdV_2xr75Vtwlxuyg | |
| 444 444 | 
             
            unit_tests/utils/test_secret_utils.py,sha256=XKe0f1RHYii8iwE6ATmBr5JGDI1pzzrnZUGdUSMJQP4,4886
         | 
| 445 445 | 
             
            unit_tests/utils/test_stream_status_utils.py,sha256=Xr8MZ2HWgTVIyMbywDvuYkRaUF4RZLQOT8-JjvcfR24,2970
         | 
| 446 446 | 
             
            unit_tests/utils/test_traced_exception.py,sha256=bDFP5zMBizFenz6V2WvEZTRCKGB5ijh3DBezjbfoYIs,4198
         | 
| 447 | 
            -
            airbyte_cdk-0.58. | 
| 448 | 
            -
            airbyte_cdk-0.58. | 
| 449 | 
            -
            airbyte_cdk-0.58. | 
| 450 | 
            -
            airbyte_cdk-0.58. | 
| 451 | 
            -
            airbyte_cdk-0.58. | 
| 447 | 
            +
            airbyte_cdk-0.58.7.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
         | 
| 448 | 
            +
            airbyte_cdk-0.58.7.dist-info/METADATA,sha256=PAw5bOce761Nqfm14qOw6Rk60S5JqPvK9d8oUnzv8vc,11073
         | 
| 449 | 
            +
            airbyte_cdk-0.58.7.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
         | 
| 450 | 
            +
            airbyte_cdk-0.58.7.dist-info/top_level.txt,sha256=edvsDKTnE6sD2wfCUaeTfKf5gQIL6CPVMwVL2sWZzqo,51
         | 
| 451 | 
            +
            airbyte_cdk-0.58.7.dist-info/RECORD,,
         | 
| @@ -852,6 +852,109 @@ invalid_csv_scenario: TestScenario[InMemoryFilesSource] = ( | |
| 852 852 | 
             
                        ]
         | 
| 853 853 | 
             
                    }
         | 
| 854 854 | 
             
                )
         | 
| 855 | 
            +
                .set_expected_read_error(
         | 
| 856 | 
            +
                    AirbyteTracedException,
         | 
| 857 | 
            +
                    "Please check the logged errors for more information.",
         | 
| 858 | 
            +
                )
         | 
| 859 | 
            +
            ).build()
         | 
| 860 | 
            +
             | 
| 861 | 
            +
            invalid_csv_multi_scenario: TestScenario[InMemoryFilesSource] = (
         | 
| 862 | 
            +
                TestScenarioBuilder[InMemoryFilesSource]()
         | 
| 863 | 
            +
                .set_name("invalid_csv_multi_scenario")  # too many values for the number of headers
         | 
| 864 | 
            +
                .set_config(
         | 
| 865 | 
            +
                    {
         | 
| 866 | 
            +
                        "streams": [
         | 
| 867 | 
            +
                            {
         | 
| 868 | 
            +
                                "name": "stream1",
         | 
| 869 | 
            +
                                "format": {"filetype": "csv"},
         | 
| 870 | 
            +
                                "globs": ["*"],
         | 
| 871 | 
            +
                                "validation_policy": "Emit Record",
         | 
| 872 | 
            +
                            },
         | 
| 873 | 
            +
                            {
         | 
| 874 | 
            +
                                "name": "stream2",
         | 
| 875 | 
            +
                                "format": {"filetype": "csv"},
         | 
| 876 | 
            +
                                "globs": ["b.csv"],
         | 
| 877 | 
            +
                                "validation_policy": "Emit Record",
         | 
| 878 | 
            +
                            },
         | 
| 879 | 
            +
                        ]
         | 
| 880 | 
            +
                    }
         | 
| 881 | 
            +
                )
         | 
| 882 | 
            +
                .set_source_builder(
         | 
| 883 | 
            +
                    FileBasedSourceBuilder()
         | 
| 884 | 
            +
                    .set_files(
         | 
| 885 | 
            +
                        {
         | 
| 886 | 
            +
                            "a.csv": {
         | 
| 887 | 
            +
                                "contents": [
         | 
| 888 | 
            +
                                    ("col1",),
         | 
| 889 | 
            +
                                    ("val11", "val12"),
         | 
| 890 | 
            +
                                    ("val21", "val22"),
         | 
| 891 | 
            +
                                ],
         | 
| 892 | 
            +
                                "last_modified": "2023-06-05T03:54:07.000Z",
         | 
| 893 | 
            +
                            },
         | 
| 894 | 
            +
                            "b.csv": {
         | 
| 895 | 
            +
                                "contents": [
         | 
| 896 | 
            +
                                    ("col3",),
         | 
| 897 | 
            +
                                    ("val13b", "val14b"),
         | 
| 898 | 
            +
                                    ("val23b", "val24b"),
         | 
| 899 | 
            +
                                ],
         | 
| 900 | 
            +
                                "last_modified": "2023-06-05T03:54:07.000Z",
         | 
| 901 | 
            +
                            },
         | 
| 902 | 
            +
                        }
         | 
| 903 | 
            +
                    )
         | 
| 904 | 
            +
                    .set_file_type("csv")
         | 
| 905 | 
            +
                )
         | 
| 906 | 
            +
                .set_expected_catalog(
         | 
| 907 | 
            +
                    {
         | 
| 908 | 
            +
                        "streams": [
         | 
| 909 | 
            +
                            {
         | 
| 910 | 
            +
                                "default_cursor_field": ["_ab_source_file_last_modified"],
         | 
| 911 | 
            +
                                "json_schema": {
         | 
| 912 | 
            +
                                    "type": "object",
         | 
| 913 | 
            +
                                    "properties": {
         | 
| 914 | 
            +
                                        "col1": {"type": ["null", "string"]},
         | 
| 915 | 
            +
                                        "col2": {"type": ["null", "string"]},
         | 
| 916 | 
            +
                                        "_ab_source_file_last_modified": {"type": "string"},
         | 
| 917 | 
            +
                                        "_ab_source_file_url": {"type": "string"},
         | 
| 918 | 
            +
                                    },
         | 
| 919 | 
            +
                                },
         | 
| 920 | 
            +
                                "name": "stream1",
         | 
| 921 | 
            +
                                "source_defined_cursor": True,
         | 
| 922 | 
            +
                                "supported_sync_modes": ["full_refresh", "incremental"],
         | 
| 923 | 
            +
                            },
         | 
| 924 | 
            +
                            {
         | 
| 925 | 
            +
                                "json_schema": {
         | 
| 926 | 
            +
                                    "type": "object",
         | 
| 927 | 
            +
                                    "properties": {
         | 
| 928 | 
            +
                                        "col3": {"type": ["null", "string"]},
         | 
| 929 | 
            +
                                        "_ab_source_file_last_modified": {"type": "string"},
         | 
| 930 | 
            +
                                        "_ab_source_file_url": {"type": "string"},
         | 
| 931 | 
            +
                                    },
         | 
| 932 | 
            +
                                },
         | 
| 933 | 
            +
                                "name": "stream2",
         | 
| 934 | 
            +
                                "source_defined_cursor": True,
         | 
| 935 | 
            +
                                "default_cursor_field": ["_ab_source_file_last_modified"],
         | 
| 936 | 
            +
                                "supported_sync_modes": ["full_refresh", "incremental"],
         | 
| 937 | 
            +
                            },
         | 
| 938 | 
            +
                        ]
         | 
| 939 | 
            +
                    }
         | 
| 940 | 
            +
                )
         | 
| 941 | 
            +
                .set_expected_records([])
         | 
| 942 | 
            +
                .set_expected_discover_error(AirbyteTracedException, FileBasedSourceError.SCHEMA_INFERENCE_ERROR.value)
         | 
| 943 | 
            +
                .set_expected_logs(
         | 
| 944 | 
            +
                    {
         | 
| 945 | 
            +
                        "read": [
         | 
| 946 | 
            +
                            {
         | 
| 947 | 
            +
                                "level": "ERROR",
         | 
| 948 | 
            +
                                "message": f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream=stream1 file=a.csv line_no=1 n_skipped=0",
         | 
| 949 | 
            +
                            },
         | 
| 950 | 
            +
                            {
         | 
| 951 | 
            +
                                "level": "ERROR",
         | 
| 952 | 
            +
                                "message": f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream=stream2 file=b.csv line_no=1 n_skipped=0",
         | 
| 953 | 
            +
                            },
         | 
| 954 | 
            +
                        ]
         | 
| 955 | 
            +
                    }
         | 
| 956 | 
            +
                )
         | 
| 957 | 
            +
                .set_expected_read_error(AirbyteTracedException, "Please check the logged errors for more information.")
         | 
| 855 958 | 
             
            ).build()
         | 
| 856 959 |  | 
| 857 960 | 
             
            csv_single_stream_scenario: TestScenario[InMemoryFilesSource] = (
         | 
| @@ -2172,17 +2275,15 @@ csv_newline_in_values_not_quoted_scenario: TestScenario[InMemoryFilesSource] = ( | |
| 2172 2275 | 
             
                        },
         | 
| 2173 2276 | 
             
                    ]
         | 
| 2174 2277 | 
             
                )
         | 
| 2175 | 
            -
                . | 
| 2176 | 
            -
                     | 
| 2177 | 
            -
             | 
| 2178 | 
            -
                            {
         | 
| 2179 | 
            -
                                "level": "ERROR",
         | 
| 2180 | 
            -
                                "message": "Error parsing record. This could be due to a mismatch between the config's file type and the actual file type, or because the file or record is not parseable. stream=stream1 file=a.csv line_no=2 n_skipped=0",
         | 
| 2181 | 
            -
                            }
         | 
| 2182 | 
            -
                        ]
         | 
| 2183 | 
            -
                    }
         | 
| 2278 | 
            +
                .set_expected_read_error(
         | 
| 2279 | 
            +
                    AirbyteTracedException,
         | 
| 2280 | 
            +
                    f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream=stream1 file=a.csv line_no=2 n_skipped=0",
         | 
| 2184 2281 | 
             
                )
         | 
| 2185 2282 | 
             
                .set_expected_discover_error(AirbyteTracedException, FileBasedSourceError.SCHEMA_INFERENCE_ERROR.value)
         | 
| 2283 | 
            +
                .set_expected_read_error(
         | 
| 2284 | 
            +
                    AirbyteTracedException,
         | 
| 2285 | 
            +
                    "Please check the logged errors for more information.",
         | 
| 2286 | 
            +
                )
         | 
| 2186 2287 | 
             
            ).build()
         | 
| 2187 2288 |  | 
| 2188 2289 | 
             
            csv_escape_char_is_set_scenario: TestScenario[InMemoryFilesSource] = (
         | 
| @@ -231,6 +231,10 @@ unstructured_invalid_file_type_discover_scenario_no_skip = ( | |
| 231 231 | 
             
                )
         | 
| 232 232 | 
             
                .set_expected_records([])
         | 
| 233 233 | 
             
                .set_expected_discover_error(AirbyteTracedException, "Error inferring schema from files")
         | 
| 234 | 
            +
                .set_expected_read_error(
         | 
| 235 | 
            +
                    AirbyteTracedException,
         | 
| 236 | 
            +
                    "Please check the logged errors for more information.",
         | 
| 237 | 
            +
                )
         | 
| 234 238 | 
             
            ).build()
         | 
| 235 239 |  | 
| 236 240 | 
             
            # If skip unprocessable file types is set to true, then discover will succeed even if there are non-matching file types
         | 
| @@ -2,7 +2,8 @@ | |
| 2 2 | 
             
            # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
         | 
| 3 3 | 
             
            #
         | 
| 4 4 |  | 
| 5 | 
            -
             | 
| 5 | 
            +
             | 
| 6 | 
            +
            from airbyte_cdk.utils.traced_exception import AirbyteTracedException
         | 
| 6 7 | 
             
            from unit_tests.sources.file_based.scenarios.file_based_source_builder import FileBasedSourceBuilder
         | 
| 7 8 | 
             
            from unit_tests.sources.file_based.scenarios.scenario_builder import TestScenarioBuilder
         | 
| 8 9 |  | 
| @@ -272,6 +273,10 @@ skip_record_scenario_single_stream = ( | |
| 272 273 | 
             
                        ]
         | 
| 273 274 | 
             
                    }
         | 
| 274 275 | 
             
                )
         | 
| 276 | 
            +
                .set_expected_read_error(
         | 
| 277 | 
            +
                    AirbyteTracedException,
         | 
| 278 | 
            +
                    "Please check the logged errors for more information.",
         | 
| 279 | 
            +
                )
         | 
| 275 280 | 
             
            ).build()
         | 
| 276 281 |  | 
| 277 282 |  | 
| @@ -416,6 +421,10 @@ skip_record_scenario_multi_stream = ( | |
| 416 421 | 
             
                        ]
         | 
| 417 422 | 
             
                    }
         | 
| 418 423 | 
             
                )
         | 
| 424 | 
            +
                .set_expected_read_error(
         | 
| 425 | 
            +
                    AirbyteTracedException,
         | 
| 426 | 
            +
                    "Please check the logged errors for more information.",
         | 
| 427 | 
            +
                )
         | 
| 419 428 | 
             
            ).build()
         | 
| 420 429 |  | 
| 421 430 |  | 
| @@ -492,19 +501,9 @@ emit_record_scenario_single_stream = ( | |
| 492 501 | 
             
                        },
         | 
| 493 502 | 
             
                    ]
         | 
| 494 503 | 
             
                )
         | 
| 495 | 
            -
                . | 
| 496 | 
            -
                     | 
| 497 | 
            -
             | 
| 498 | 
            -
                            {
         | 
| 499 | 
            -
                                "level": "ERROR",
         | 
| 500 | 
            -
                                "message": f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream=stream1 file=c.csv line_no=2 n_skipped=0",
         | 
| 501 | 
            -
                            },
         | 
| 502 | 
            -
                            {
         | 
| 503 | 
            -
                                "level": "WARN",
         | 
| 504 | 
            -
                                "message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer",
         | 
| 505 | 
            -
                            },
         | 
| 506 | 
            -
                        ]
         | 
| 507 | 
            -
                    }
         | 
| 504 | 
            +
                .set_expected_read_error(
         | 
| 505 | 
            +
                    AirbyteTracedException,
         | 
| 506 | 
            +
                    "Please check the logged errors for more information.",
         | 
| 508 507 | 
             
                )
         | 
| 509 508 | 
             
            ).build()
         | 
| 510 509 |  | 
| @@ -640,23 +639,9 @@ emit_record_scenario_multi_stream = ( | |
| 640 639 | 
             
                        },
         | 
| 641 640 | 
             
                    ]
         | 
| 642 641 | 
             
                )
         | 
| 643 | 
            -
                . | 
| 644 | 
            -
                     | 
| 645 | 
            -
             | 
| 646 | 
            -
                            {
         | 
| 647 | 
            -
                                "level": "ERROR",
         | 
| 648 | 
            -
                                "message": f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream=stream1 file=a/a3.csv line_no=2 n_skipped=0",
         | 
| 649 | 
            -
                            },
         | 
| 650 | 
            -
                            {
         | 
| 651 | 
            -
                                "level": "WARN",
         | 
| 652 | 
            -
                                "message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer",
         | 
| 653 | 
            -
                            },
         | 
| 654 | 
            -
                            {
         | 
| 655 | 
            -
                                "level": "WARN",
         | 
| 656 | 
            -
                                "message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer",
         | 
| 657 | 
            -
                            },
         | 
| 658 | 
            -
                        ]
         | 
| 659 | 
            -
                    }
         | 
| 642 | 
            +
                .set_expected_read_error(
         | 
| 643 | 
            +
                    AirbyteTracedException,
         | 
| 644 | 
            +
                    "Please check the logged errors for more information.",
         | 
| 660 645 | 
             
                )
         | 
| 661 646 | 
             
            ).build()
         | 
| 662 647 |  | 
| @@ -2,15 +2,18 @@ | |
| 2 2 | 
             
            # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
         | 
| 3 3 | 
             
            #
         | 
| 4 4 |  | 
| 5 | 
            +
            import traceback
         | 
| 5 6 | 
             
            import unittest
         | 
| 6 7 | 
             
            from datetime import datetime, timezone
         | 
| 7 8 | 
             
            from typing import Any, Iterable, Iterator, Mapping
         | 
| 8 9 | 
             
            from unittest.mock import Mock
         | 
| 9 10 |  | 
| 10 11 | 
             
            import pytest
         | 
| 11 | 
            -
            from airbyte_cdk.models import Level
         | 
| 12 | 
            +
            from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, Level
         | 
| 13 | 
            +
            from airbyte_cdk.models import Type as MessageType
         | 
| 12 14 | 
             
            from airbyte_cdk.sources.file_based.availability_strategy import AbstractFileBasedAvailabilityStrategy
         | 
| 13 15 | 
             
            from airbyte_cdk.sources.file_based.discovery_policy import AbstractDiscoveryPolicy
         | 
| 16 | 
            +
            from airbyte_cdk.sources.file_based.exceptions import FileBasedErrorsCollector, FileBasedSourceError
         | 
| 14 17 | 
             
            from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
         | 
| 15 18 | 
             
            from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
         | 
| 16 19 | 
             
            from airbyte_cdk.sources.file_based.remote_file import RemoteFile
         | 
| @@ -55,12 +58,17 @@ class MockFormat: | |
| 55 58 | 
             
                    ),
         | 
| 56 59 | 
             
                    pytest.param(
         | 
| 57 60 | 
             
                        {"type": "object", "properties": {"prop": {"type": "string"}}},
         | 
| 58 | 
            -
                        { | 
| 61 | 
            +
                        {
         | 
| 62 | 
            +
                            "type": ["null", "object"],
         | 
| 63 | 
            +
                            "properties": {"prop": {"type": ["null", "string"]}},
         | 
| 64 | 
            +
                        },
         | 
| 59 65 | 
             
                        id="deeply-nested-schema",
         | 
| 60 66 | 
             
                    ),
         | 
| 61 67 | 
             
                ],
         | 
| 62 68 | 
             
            )
         | 
| 63 | 
            -
            def test_fill_nulls( | 
| 69 | 
            +
            def test_fill_nulls(
         | 
| 70 | 
            +
                input_schema: Mapping[str, Any], expected_output: Mapping[str, Any]
         | 
| 71 | 
            +
            ) -> None:
         | 
| 64 72 | 
             
                assert DefaultFileBasedStream._fill_nulls(input_schema) == expected_output
         | 
| 65 73 |  | 
| 66 74 |  | 
| @@ -90,21 +98,33 @@ class DefaultFileBasedStreamTest(unittest.TestCase): | |
| 90 98 | 
             
                        parsers={MockFormat: self._parser},
         | 
| 91 99 | 
             
                        validation_policy=self._validation_policy,
         | 
| 92 100 | 
             
                        cursor=self._cursor,
         | 
| 101 | 
            +
                        errors_collector=FileBasedErrorsCollector(),
         | 
| 93 102 | 
             
                    )
         | 
| 94 103 |  | 
| 95 104 | 
             
                def test_when_read_records_from_slice_then_return_records(self) -> None:
         | 
| 96 105 | 
             
                    self._parser.parse_records.return_value = [self._A_RECORD]
         | 
| 97 | 
            -
                    messages = list( | 
| 98 | 
            -
             | 
| 106 | 
            +
                    messages = list(
         | 
| 107 | 
            +
                        self._stream.read_records_from_slice(
         | 
| 108 | 
            +
                            {"files": [RemoteFile(uri="uri", last_modified=self._NOW)]}
         | 
| 109 | 
            +
                        )
         | 
| 110 | 
            +
                    )
         | 
| 111 | 
            +
                    assert list(map(lambda message: message.record.data["data"], messages)) == [
         | 
| 112 | 
            +
                        self._A_RECORD
         | 
| 113 | 
            +
                    ]
         | 
| 99 114 |  | 
| 100 | 
            -
                def test_given_exception_when_read_records_from_slice_then_do_process_other_files( | 
| 115 | 
            +
                def test_given_exception_when_read_records_from_slice_then_do_process_other_files(
         | 
| 116 | 
            +
                    self,
         | 
| 117 | 
            +
                ) -> None:
         | 
| 101 118 | 
             
                    """
         | 
| 102 119 | 
             
                    The current behavior for source-s3 v3 does not fail sync on some errors and hence, we will keep this behaviour for now. One example
         | 
| 103 120 | 
             
                    we can easily reproduce this is by having a file with gzip extension that is not actually a gzip file. The reader will fail to open
         | 
| 104 121 | 
             
                    the file but the sync won't fail.
         | 
| 105 122 | 
             
                    Ticket: https://github.com/airbytehq/airbyte/issues/29680
         | 
| 106 123 | 
             
                    """
         | 
| 107 | 
            -
                    self._parser.parse_records.side_effect = [ | 
| 124 | 
            +
                    self._parser.parse_records.side_effect = [
         | 
| 125 | 
            +
                        ValueError("An error"),
         | 
| 126 | 
            +
                        [self._A_RECORD],
         | 
| 127 | 
            +
                    ]
         | 
| 108 128 |  | 
| 109 129 | 
             
                    messages = list(
         | 
| 110 130 | 
             
                        self._stream.read_records_from_slice(
         | 
| @@ -120,7 +140,9 @@ class DefaultFileBasedStreamTest(unittest.TestCase): | |
| 120 140 | 
             
                    assert messages[0].log.level == Level.ERROR
         | 
| 121 141 | 
             
                    assert messages[1].record.data["data"] == self._A_RECORD
         | 
| 122 142 |  | 
| 123 | 
            -
                def test_given_traced_exception_when_read_records_from_slice_then_fail( | 
| 143 | 
            +
                def test_given_traced_exception_when_read_records_from_slice_then_fail(
         | 
| 144 | 
            +
                    self,
         | 
| 145 | 
            +
                ) -> None:
         | 
| 124 146 | 
             
                    """
         | 
| 125 147 | 
             
                    When a traced exception is raised, the stream shouldn't try to handle but pass it on to the caller.
         | 
| 126 148 | 
             
                    """
         | 
| @@ -138,10 +160,14 @@ class DefaultFileBasedStreamTest(unittest.TestCase): | |
| 138 160 | 
             
                            )
         | 
| 139 161 | 
             
                        )
         | 
| 140 162 |  | 
| 141 | 
            -
                def test_given_exception_after_skipping_records_when_read_records_from_slice_then_send_warning( | 
| 163 | 
            +
                def test_given_exception_after_skipping_records_when_read_records_from_slice_then_send_warning(
         | 
| 164 | 
            +
                    self,
         | 
| 165 | 
            +
                ) -> None:
         | 
| 142 166 | 
             
                    self._stream_config.schemaless = False
         | 
| 143 167 | 
             
                    self._validation_policy.record_passes_validation_policy.return_value = False
         | 
| 144 | 
            -
                    self._parser.parse_records.side_effect = [ | 
| 168 | 
            +
                    self._parser.parse_records.side_effect = [
         | 
| 169 | 
            +
                        self._iter([self._A_RECORD, ValueError("An error")])
         | 
| 170 | 
            +
                    ]
         | 
| 145 171 |  | 
| 146 172 | 
             
                    messages = list(
         | 
| 147 173 | 
             
                        self._stream.read_records_from_slice(
         | 
| @@ -183,3 +209,54 @@ class DefaultFileBasedStreamTest(unittest.TestCase): | |
| 183 209 | 
             
                        if isinstance(item, Exception):
         | 
| 184 210 | 
             
                            raise item
         | 
| 185 211 | 
             
                        yield item
         | 
| 212 | 
            +
             | 
| 213 | 
            +
             | 
| 214 | 
            +
            class TestFileBasedErrorCollector:
         | 
| 215 | 
            +
                test_error_collector: FileBasedErrorsCollector = FileBasedErrorsCollector()
         | 
| 216 | 
            +
             | 
| 217 | 
            +
                @pytest.mark.parametrize(
         | 
| 218 | 
            +
                    "stream, file, line_no, n_skipped, collector_expected_len",
         | 
| 219 | 
            +
                    (
         | 
| 220 | 
            +
                        ("stream_1", "test.csv", 1, 1, 1),
         | 
| 221 | 
            +
                        ("stream_2", "test2.csv", 2, 2, 2),
         | 
| 222 | 
            +
                    ),
         | 
| 223 | 
            +
                    ids=[
         | 
| 224 | 
            +
                        "Single error",
         | 
| 225 | 
            +
                        "Multiple errors",
         | 
| 226 | 
            +
                    ],
         | 
| 227 | 
            +
                )
         | 
| 228 | 
            +
                def test_collect_parsing_error(
         | 
| 229 | 
            +
                    self, stream, file, line_no, n_skipped, collector_expected_len
         | 
| 230 | 
            +
                ) -> None:
         | 
| 231 | 
            +
                    test_error_pattern = "Error parsing record."
         | 
| 232 | 
            +
                    # format the error body
         | 
| 233 | 
            +
                    test_error = (
         | 
| 234 | 
            +
                        AirbyteMessage(
         | 
| 235 | 
            +
                            type=MessageType.LOG,
         | 
| 236 | 
            +
                            log=AirbyteLogMessage(
         | 
| 237 | 
            +
                                level=Level.ERROR,
         | 
| 238 | 
            +
                                message=f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream={stream} file={file} line_no={line_no} n_skipped={n_skipped}",
         | 
| 239 | 
            +
                                stack_trace=traceback.format_exc(),
         | 
| 240 | 
            +
                            ),
         | 
| 241 | 
            +
                        ),
         | 
| 242 | 
            +
                    )
         | 
| 243 | 
            +
                    # collecting the error
         | 
| 244 | 
            +
                    self.test_error_collector.collect(test_error)
         | 
| 245 | 
            +
                    # check the error has been collected
         | 
| 246 | 
            +
                    assert len(self.test_error_collector.errors) == collector_expected_len
         | 
| 247 | 
            +
                    # check for the pattern's presence in the collected errors
         | 
| 248 | 
            +
                    for error in self.test_error_collector.errors:
         | 
| 249 | 
            +
                        assert test_error_pattern in error[0].log.message
         | 
| 250 | 
            +
             | 
| 251 | 
            +
                def test_yield_and_raise_collected(self) -> None:
         | 
| 252 | 
            +
                    # we expect the following method will raise the AirbyteTracedException
         | 
| 253 | 
            +
                    with pytest.raises(AirbyteTracedException) as parse_error:
         | 
| 254 | 
            +
                        list(self.test_error_collector.yield_and_raise_collected())
         | 
| 255 | 
            +
                    assert (
         | 
| 256 | 
            +
                        parse_error.value.message
         | 
| 257 | 
            +
                        == "Some errors occured while reading from the source."
         | 
| 258 | 
            +
                    )
         | 
| 259 | 
            +
                    assert (
         | 
| 260 | 
            +
                        parse_error.value.internal_message
         | 
| 261 | 
            +
                        == "Please check the logged errors for more information."
         | 
| 262 | 
            +
                    )
         | 
| @@ -48,6 +48,7 @@ from unit_tests.sources.file_based.scenarios.csv_scenarios import ( | |
| 48 48 | 
             
                csv_strings_can_be_null_not_quoted_scenario,
         | 
| 49 49 | 
             
                earlier_csv_scenario,
         | 
| 50 50 | 
             
                empty_schema_inference_scenario,
         | 
| 51 | 
            +
                invalid_csv_multi_scenario,
         | 
| 51 52 | 
             
                invalid_csv_scenario,
         | 
| 52 53 | 
             
                multi_csv_scenario,
         | 
| 53 54 | 
             
                multi_csv_stream_n_file_exceeds_limit_for_inference,
         | 
| @@ -132,6 +133,7 @@ discover_scenarios = [ | |
| 132 133 | 
             
                csv_multi_stream_scenario,
         | 
| 133 134 | 
             
                csv_single_stream_scenario,
         | 
| 134 135 | 
             
                invalid_csv_scenario,
         | 
| 136 | 
            +
                invalid_csv_multi_scenario,
         | 
| 135 137 | 
             
                single_csv_scenario,
         | 
| 136 138 | 
             
                multi_csv_scenario,
         | 
| 137 139 | 
             
                multi_csv_stream_n_file_exceeds_limit_for_inference,
         | 
| 
            File without changes
         | 
| 
            File without changes
         | 
| 
            File without changes
         |