PyPI - airbyte-cdk - Versions diffs - 0.58.5__py3-none-any.whl → 0.58.7__py3-none-any.whl - Mend

airbyte-cdk 0.58.5__py3-none-any.whl → 0.58.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

airbyte_cdk/sources/file_based/config/avro_format.py CHANGED Viewed

@@ -2,6 +2,7 @@
 # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
 #
 from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
 from pydantic import BaseModel, Field

airbyte_cdk/sources/file_based/config/parquet_format.py CHANGED Viewed

@@ -2,6 +2,7 @@
 # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
 #
 from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
 from pydantic import BaseModel, Field

airbyte_cdk/sources/file_based/exceptions.py CHANGED Viewed

@@ -3,8 +3,9 @@
 #
 from enum import Enum
-from typing import Union
+from typing import Any, List, Union
+from airbyte_cdk.models import AirbyteMessage, FailureType
 from airbyte_cdk.utils import AirbyteTracedException
@@ -40,6 +41,30 @@ class FileBasedSourceError(Enum):
     UNDEFINED_VALIDATION_POLICY = "The validation policy defined in the config does not exist for the source."
+class FileBasedErrorsCollector:
+    """
+    The placeholder for all errors collected.
+    """
+    errors: List[AirbyteMessage] = []
+    def yield_and_raise_collected(self) -> Any:
+        if self.errors:
+            # emit collected logged messages
+            yield from self.errors
+            # clean the collector
+            self.errors.clear()
+            # raising the single exception
+            raise AirbyteTracedException(
+                internal_message="Please check the logged errors for more information.",
+                message="Some errors occured while reading from the source.",
+                failure_type=FailureType.config_error,
+            )
+    def collect(self, logged_error: AirbyteMessage) -> None:
+        self.errors.append(logged_error)
 class BaseFileBasedSourceError(Exception):
     def __init__(self, error: Union[FileBasedSourceError, str], **kwargs):  # type: ignore # noqa
         if isinstance(error, FileBasedSourceError):

airbyte_cdk/sources/file_based/file_based_source.py CHANGED Viewed

@@ -14,7 +14,7 @@ from airbyte_cdk.sources.file_based.availability_strategy import AbstractFileBas
 from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import AbstractFileBasedSpec
 from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig, ValidationPolicy
 from airbyte_cdk.sources.file_based.discovery_policy import AbstractDiscoveryPolicy, DefaultDiscoveryPolicy
-from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError, FileBasedSourceError
+from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError, FileBasedErrorsCollector, FileBasedSourceError
 from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
 from airbyte_cdk.sources.file_based.file_types import default_parsers
 from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
@@ -49,6 +49,7 @@ class FileBasedSource(AbstractSource, ABC):
         self.stream_schemas = {s.stream.name: s.stream.json_schema for s in catalog.streams} if catalog else {}
         self.cursor_cls = cursor_cls
         self.logger = logging.getLogger(f"airbyte.{self.name}")
+        self.errors_collector: FileBasedErrorsCollector = FileBasedErrorsCollector()
     def check_connection(self, logger: logging.Logger, config: Mapping[str, Any]) -> Tuple[bool, Optional[Any]]:
         """
@@ -106,6 +107,7 @@ class FileBasedSource(AbstractSource, ABC):
                         parsers=self.parsers,
                         validation_policy=self._validate_and_get_validation_policy(stream_config),
                         cursor=self.cursor_cls(stream_config),
+                        errors_collector=self.errors_collector,
                     )
                 )
             return streams
@@ -121,6 +123,8 @@ class FileBasedSource(AbstractSource, ABC):
         state: Optional[Union[List[AirbyteStateMessage], MutableMapping[str, Any]]] = None,
     ) -> Iterator[AirbyteMessage]:
         yield from super().read(logger, config, catalog, state)
+        # emit all the errors collected
+        yield from self.errors_collector.yield_and_raise_collected()
         # count streams using a certain parser
         parsed_config = self._get_parsed_config(config)
         for parser, count in Counter(stream.format.filetype for stream in parsed_config.streams).items():

airbyte_cdk/sources/file_based/file_types/avro_parser.py CHANGED Viewed

@@ -8,6 +8,7 @@ from typing import Any, Dict, Iterable, Mapping, Optional, Tuple
 import fastavro
 from airbyte_cdk.sources.file_based.config.avro_format import AvroFormat
 from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
+from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError
 from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
 from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
@@ -144,15 +145,20 @@ class AvroParser(FileTypeParser):
         if not isinstance(avro_format, AvroFormat):
             raise ValueError(f"Expected ParquetFormat, got {avro_format}")
-        with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp:
-            avro_reader = fastavro.reader(fp)
-            schema = avro_reader.writer_schema
-            schema_field_name_to_type = {field["name"]: field["type"] for field in schema["fields"]}
-            for record in avro_reader:
-                yield {
-                    record_field: self._to_output_value(avro_format, schema_field_name_to_type[record_field], record[record_field])
-                    for record_field, record_value in schema_field_name_to_type.items()
-                }
+        line_no = 0
+        try:
+            with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp:
+                avro_reader = fastavro.reader(fp)
+                schema = avro_reader.writer_schema
+                schema_field_name_to_type = {field["name"]: field["type"] for field in schema["fields"]}
+                for record in avro_reader:
+                    line_no += 1
+                    yield {
+                        record_field: self._to_output_value(avro_format, schema_field_name_to_type[record_field], record[record_field])
+                        for record_field, record_value in schema_field_name_to_type.items()
+                    }
+        except Exception as exc:
+            raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri, lineno=line_no) from exc
     @property
     def file_read_mode(self) -> FileReadMode:

airbyte_cdk/sources/file_based/file_types/csv_parser.py CHANGED Viewed

@@ -178,17 +178,25 @@ class CsvParser(FileTypeParser):
         logger: logging.Logger,
         discovered_schema: Optional[Mapping[str, SchemaType]],
     ) -> Iterable[Dict[str, Any]]:
-        config_format = _extract_format(config)
-        if discovered_schema:
-            property_types = {col: prop["type"] for col, prop in discovered_schema["properties"].items()}  # type: ignore # discovered_schema["properties"] is known to be a mapping
-            deduped_property_types = CsvParser._pre_propcess_property_types(property_types)
-        else:
-            deduped_property_types = {}
-        cast_fn = CsvParser._get_cast_function(deduped_property_types, config_format, logger, config.schemaless)
-        data_generator = self._csv_reader.read_data(config, file, stream_reader, logger, self.file_read_mode)
-        for row in data_generator:
-            yield CsvParser._to_nullable(cast_fn(row), deduped_property_types, config_format.null_values, config_format.strings_can_be_null)
-        data_generator.close()
+        line_no = 0
+        try:
+            config_format = _extract_format(config)
+            if discovered_schema:
+                property_types = {col: prop["type"] for col, prop in discovered_schema["properties"].items()}  # type: ignore # discovered_schema["properties"] is known to be a mapping
+                deduped_property_types = CsvParser._pre_propcess_property_types(property_types)
+            else:
+                deduped_property_types = {}
+            cast_fn = CsvParser._get_cast_function(deduped_property_types, config_format, logger, config.schemaless)
+            data_generator = self._csv_reader.read_data(config, file, stream_reader, logger, self.file_read_mode)
+            for row in data_generator:
+                line_no += 1
+                yield CsvParser._to_nullable(
+                    cast_fn(row), deduped_property_types, config_format.null_values, config_format.strings_can_be_null
+                )
+        except RecordParseError as parse_err:
+            raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri, lineno=line_no) from parse_err
+        finally:
+            data_generator.close()
     @property
     def file_read_mode(self) -> FileReadMode:

airbyte_cdk/sources/file_based/file_types/jsonl_parser.py CHANGED Viewed

@@ -119,7 +119,7 @@ class JsonlParser(FileTypeParser):
                     break
             if had_json_parsing_error and not yielded_at_least_once:
-                raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD)
+                raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri, lineno=line)
     @staticmethod
     def _instantiate_accumulator(line: Union[bytes, str]) -> Union[bytes, str]:

airbyte_cdk/sources/file_based/file_types/parquet_parser.py CHANGED Viewed

@@ -11,7 +11,7 @@ from urllib.parse import unquote
 import pyarrow as pa
 import pyarrow.parquet as pq
 from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig, ParquetFormat
-from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError, FileBasedSourceError
+from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError, FileBasedSourceError, RecordParseError
 from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
 from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
@@ -64,19 +64,27 @@ class ParquetParser(FileTypeParser):
         if not isinstance(parquet_format, ParquetFormat):
             logger.info(f"Expected ParquetFormat, got {parquet_format}")
             raise ConfigValidationError(FileBasedSourceError.CONFIG_VALIDATION_ERROR)
-        with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp:
-            reader = pq.ParquetFile(fp)
-            partition_columns = {x.split("=")[0]: x.split("=")[1] for x in self._extract_partitions(file.uri)}
-            for row_group in range(reader.num_row_groups):
-                batch = reader.read_row_group(row_group)
-                for row in range(batch.num_rows):
-                    yield {
-                        **{
-                            column: ParquetParser._to_output_value(batch.column(column)[row], parquet_format)
-                            for column in batch.column_names
-                        },
-                        **partition_columns,
-                    }
+        line_no = 0
+        try:
+            with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp:
+                reader = pq.ParquetFile(fp)
+                partition_columns = {x.split("=")[0]: x.split("=")[1] for x in self._extract_partitions(file.uri)}
+                for row_group in range(reader.num_row_groups):
+                    batch = reader.read_row_group(row_group)
+                    for row in range(batch.num_rows):
+                        line_no += 1
+                        yield {
+                            **{
+                                column: ParquetParser._to_output_value(batch.column(column)[row], parquet_format)
+                                for column in batch.column_names
+                            },
+                            **partition_columns,
+                        }
+        except Exception as exc:
+            raise RecordParseError(
+                FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri, lineno=f"{row_group=}, {line_no=}"
+            ) from exc
     @staticmethod
     def _extract_partitions(filepath: str) -> List[str]:

airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py CHANGED Viewed

@@ -10,7 +10,7 @@ from airbyte_cdk.models import SyncMode
 from airbyte_cdk.sources.file_based.availability_strategy import AbstractFileBasedAvailabilityStrategy
 from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig, PrimaryKeyType
 from airbyte_cdk.sources.file_based.discovery_policy import AbstractDiscoveryPolicy
-from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError, UndefinedParserError
+from airbyte_cdk.sources.file_based.exceptions import FileBasedErrorsCollector, FileBasedSourceError, RecordParseError, UndefinedParserError
 from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
 from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
@@ -44,6 +44,7 @@ class AbstractFileBasedStream(Stream):
         discovery_policy: AbstractDiscoveryPolicy,
         parsers: Dict[Type[Any], FileTypeParser],
         validation_policy: AbstractSchemaValidationPolicy,
+        errors_collector: FileBasedErrorsCollector,
     ):
         super().__init__()
         self.config = config
@@ -53,6 +54,7 @@ class AbstractFileBasedStream(Stream):
         self._discovery_policy = discovery_policy
         self._availability_strategy = availability_strategy
         self._parsers = parsers
+        self.errors_collector = errors_collector
     @property
     @abstractmethod

airbyte_cdk/sources/file_based/stream/default_file_based_stream.py CHANGED Viewed

@@ -112,12 +112,14 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
             except RecordParseError:
                 # Increment line_no because the exception was raised before we could increment it
                 line_no += 1
-                yield AirbyteMessage(
-                    type=MessageType.LOG,
-                    log=AirbyteLogMessage(
-                        level=Level.ERROR,
-                        message=f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream={self.name} file={file.uri} line_no={line_no} n_skipped={n_skipped}",
-                        stack_trace=traceback.format_exc(),
+                self.errors_collector.collect(
+                    AirbyteMessage(
+                        type=MessageType.LOG,
+                        log=AirbyteLogMessage(
+                            level=Level.ERROR,
+                            message=f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream={self.name} file={file.uri} line_no={line_no} n_skipped={n_skipped}",
+                            stack_trace=traceback.format_exc(),
+                        ),
                     ),
                 )

{airbyte_cdk-0.58.5.dist-info → airbyte_cdk-0.58.7.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: airbyte-cdk
-Version: 0.58.5
+Version: 0.58.7
 Summary: A framework for writing Airbyte Connectors.
 Home-page: https://github.com/airbytehq/airbyte
 Author: Airbyte

{airbyte_cdk-0.58.5.dist-info → airbyte_cdk-0.58.7.dist-info}/RECORD RENAMED Viewed

@@ -152,8 +152,8 @@ airbyte_cdk/sources/embedded/catalog.py,sha256=mIM7rO5CZAUIHKbrKwn1-Zn9_e3sLiHrT
 airbyte_cdk/sources/embedded/runner.py,sha256=kZ0CcUANuMjdZ4fmvp_w9P2IcsS9WSHxNqYHqMwcfXI,1390
 airbyte_cdk/sources/embedded/tools.py,sha256=-Z4tZ4AP1OTi_zrqFM3YV8Rt7c60wvsrv0Dc-rTZ2uw,744
 airbyte_cdk/sources/file_based/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-airbyte_cdk/sources/file_based/exceptions.py,sha256=z9JBEEhGyM1ev9P6MjjjigtyuBm3OaOl3lIhkOQf8lQ,4765
-airbyte_cdk/sources/file_based/file_based_source.py,sha256=2kguVKlTFg9vSE-eNZeVj4-VXElz3OuhJZrWrIdp2HE,7896
+airbyte_cdk/sources/file_based/exceptions.py,sha256=-SjdDk-mbkp5qQVUESkn788W8NmGtC2LROkZRKS_Dxc,5613
+airbyte_cdk/sources/file_based/file_based_source.py,sha256=XddFHSiL_a-VOfQF33yXVapUG6wvHu2hd9xxYEoBcuc,8180
 airbyte_cdk/sources/file_based/file_based_stream_reader.py,sha256=K9fFHcSL4E8v-X2l38wRAcZCjpyifr35orvby8vQt84,3749
 airbyte_cdk/sources/file_based/remote_file.py,sha256=dtRX7X06Fug-XDz93a5lkwPQy5nQgxH0-ZcXW2HuMGI,312
 airbyte_cdk/sources/file_based/schema_helpers.py,sha256=XBkOutIw_n6SNYU34qbyTbl0Ppt0i4k3sVFMSaX3wJo,9103
@@ -163,28 +163,28 @@ airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availab
 airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py,sha256=TXo8kdwOQ3XiQbS3ccPtj9FghHFpiVL2JRWjen3NRXw,5289
 airbyte_cdk/sources/file_based/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py,sha256=dgOoQuoi7-7wdTMSP7wz4ENXIDT49Ew4FoAxnnplGGc,4956
-airbyte_cdk/sources/file_based/config/avro_format.py,sha256=lQSEq5JZY0M5y9mW93R4EjrIb8brYXUgrXCY-6EMHww,711
+airbyte_cdk/sources/file_based/config/avro_format.py,sha256=q1I2G9bGNy3ADds35PfWT7Mss6fjYzUtYDkUYvh5v7s,712
 airbyte_cdk/sources/file_based/config/csv_format.py,sha256=L3JEgb91yrCob1oYrGl0088QEWblkOsRfDmMfWRQ0bg,7482
 airbyte_cdk/sources/file_based/config/file_based_stream_config.py,sha256=l9DFyttYbxY9exwy67WzRXySEk_yKV2G_THRA_Sq1I4,4229
 airbyte_cdk/sources/file_based/config/jsonl_format.py,sha256=fAPzZnoghGgHjaDvx6Qo68C8j54mBxo1NTdpwSI0VZo,374
-airbyte_cdk/sources/file_based/config/parquet_format.py,sha256=8GTDTQyvS7pWLVG0LWirHVE1snHd0Au5R4Ym33-ezEg,736
+airbyte_cdk/sources/file_based/config/parquet_format.py,sha256=yKHgXYu3zJWrGfBlJ3JQZ3gVFPumF-K4rjVPNoYTUZ0,737
 airbyte_cdk/sources/file_based/config/unstructured_format.py,sha256=axuIc4xaac7vTJKi8I9l7Lgn8gGu6bCuZNouQAEAvYs,3513
 airbyte_cdk/sources/file_based/discovery_policy/__init__.py,sha256=x_7JsQGiS7Ytmr0ZDS0SNYGcNUzC4wCm3_1-Mf3ZFnw,283
 airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py,sha256=0o_qmEO0-IojO4Ckgp4V3ackTM9Ui1sUHW5HwANueLM,621
 airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py,sha256=QeZghVmf2Cq4wy_6NYcHmR6SLgdWfsGgctYg2ZsjFE4,939
 airbyte_cdk/sources/file_based/file_types/__init__.py,sha256=hWSDNKCIqvi7gOyfZJezuKt6-JtVroerUVTvW3ZY-R4,1017
-airbyte_cdk/sources/file_based/file_types/avro_parser.py,sha256=gCyPJc4khkar4sdfBd-RU3CuV_k7nnsNM080tjwDOiw,8817
-airbyte_cdk/sources/file_based/file_types/csv_parser.py,sha256=VS2Ld9rfm4tLkwNZ3fKsZbFQSbo83EGNSuqZpqjTg_c,17880
+airbyte_cdk/sources/file_based/file_types/avro_parser.py,sha256=FC3L6D32SzhAv4jyS5dnsIgYmvGHgbomJpI2xRWrbZ0,9167
+airbyte_cdk/sources/file_based/file_types/csv_parser.py,sha256=HqglbQ7vopMbEWC5F-PuB-4ycXgDHLHxq1sN6IGPUpE,18215
 airbyte_cdk/sources/file_based/file_types/file_type_parser.py,sha256=Gbn-8v1-jLhKpJXTNOOc5PZT1Jzah6G-INCZt4snLdQ,2819
-airbyte_cdk/sources/file_based/file_types/jsonl_parser.py,sha256=S7OtfRRvQ8P6YbZVdJ8h7mw1hnWQUVSHR9Jy12U1Yy0,5634
-airbyte_cdk/sources/file_based/file_types/parquet_parser.py,sha256=Jq_-WSbyueVwyLYrrGafXhvcA1LDOeps0A_uBhStOHI,9017
+airbyte_cdk/sources/file_based/file_types/jsonl_parser.py,sha256=MkjK_J2OqzqRPyGeQFQFADxgwqsRaNtoawB7dwKxWb0,5666
+airbyte_cdk/sources/file_based/file_types/parquet_parser.py,sha256=8ZuuYnS2AzlJ-IaeBP6Pnjzu4Z2zzfBWw_x9Rt9a5Qs,9363
 airbyte_cdk/sources/file_based/file_types/unstructured_parser.py,sha256=omYdo6daIHI-YWF9WsKFdFHRXTFWgJjJ3OqegiN345k,16736
 airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py,sha256=sEVnRhZ8x9f7PNjo6lewxid9z0PI8eSj7gSoFC3MH1Y,527
 airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py,sha256=uwk6Ugf23xKG4PRPVVRVwpcNjTwPgxejl03vLSEzK0s,604
 airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py,sha256=ZeAa0z50ywMU2chNjQ7JpL4yePU1NajhBa8FS7rXLVo,1643
 airbyte_cdk/sources/file_based/stream/__init__.py,sha256=QPDqdgjsabOQD93dSFqHGaFS_3pIwm-chEabZHiPJi0,265
-airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py,sha256=0Z343kciFr5Y66SiwzIcxT6eKG2rMQtLHgLX-vpUVa4,6278
-airbyte_cdk/sources/file_based/stream/default_file_based_stream.py,sha256=VBVZeHt4MP_PWHE_Z6rQatiOUWu-HIRoqo2EcmvV_6E,12463
+airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py,sha256=GJu02B2_fcfaOnSBvhpRyXIEEtu4v8ubFR_vQpe-YAU,6405
+airbyte_cdk/sources/file_based/stream/default_file_based_stream.py,sha256=Ou8vKnaR7yhEG9NoOLfwOHqxYBro055nB9CCBPC2I2s,12555
 airbyte_cdk/sources/file_based/stream/cursor/__init__.py,sha256=MhFB5hOo8sjwvCh8gangaymdg3EJWYt_72brFOZt068,191
 airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py,sha256=i-FPeK8lwCzX34GCcmvL5Yvdh8-uu7FeCVYDoFbD7IY,1920
 airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py,sha256=kuJRKgDYOGXRk0V0I8BpFxg0hGv7SfV_nBpmmn45F88,6815
@@ -363,7 +363,7 @@ unit_tests/sources/declarative/stream_slicers/test_cartesian_product_stream_slic
 unit_tests/sources/file_based/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unit_tests/sources/file_based/helpers.py,sha256=MZTwaWtX0a6TPbFcUMP-EgqBunK2wpoElgApCEE1bN4,2659
 unit_tests/sources/file_based/in_memory_files_source.py,sha256=r2yD6-_ABXG7_PIyTq4ACN21sHyg3g-Hd9dIgxfDQUk,8235
-unit_tests/sources/file_based/test_file_based_scenarios.py,sha256=iwXCqnFGxfHh3l48wXtlD-x74rbZYcA94XXnBrcrrKQ,11616
+unit_tests/sources/file_based/test_file_based_scenarios.py,sha256=rQaORUdsRdWxTMMshJxAxnp3x6Bsnuirit4yjrT0Oao,11680
 unit_tests/sources/file_based/test_file_based_stream_reader.py,sha256=P6yTp7tbPfREzi5SXg4SSSql5nxiRV571YdOmwb_SzY,9219
 unit_tests/sources/file_based/test_scenarios.py,sha256=ONBUwnX_dWOaejKiuJQgMRWgr_0NCWJpTwf4nvw_ePg,8008
 unit_tests/sources/file_based/test_schema_helpers.py,sha256=IYIDdLRK41RkSG_ZW2cagAt9krV4QLbkzu6r7vPx9Js,12047
@@ -384,18 +384,18 @@ unit_tests/sources/file_based/file_types/test_unstructured_parser.py,sha256=kmVl
 unit_tests/sources/file_based/scenarios/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unit_tests/sources/file_based/scenarios/avro_scenarios.py,sha256=oeQUmCV7d2aTShreYc-PvVb4cWqLSsVwHfg-lcKjzPs,30554
 unit_tests/sources/file_based/scenarios/check_scenarios.py,sha256=0xkt21ASTnTAMP0RYJEsF3yMGsNN7wWOoG_tmzL9PYw,6750
-unit_tests/sources/file_based/scenarios/csv_scenarios.py,sha256=4mh9nllsdsi4NCOr8q0ZRZatFUz3Zf5etVcwVE_mjbM,120032
+unit_tests/sources/file_based/scenarios/csv_scenarios.py,sha256=4RYb8_C7sJM_6pI-cftP5fCk0i6dr4BW1lpY5iQDdN8,123795
 unit_tests/sources/file_based/scenarios/file_based_source_builder.py,sha256=wgb7l5VohcEvZT82ZpJcjINSrjuJtzJZS4zuZjdKpJ4,3874
 unit_tests/sources/file_based/scenarios/incremental_scenarios.py,sha256=B7YE2IbvgTH_v7DYQEuv7yn2IG15aKUvJ_7dA4d3Cg4,69413
 unit_tests/sources/file_based/scenarios/jsonl_scenarios.py,sha256=LsOf-tpjWNuwskPcgAMhMpQQ3iaHaD3PjPmt2M2zSzo,31839
 unit_tests/sources/file_based/scenarios/parquet_scenarios.py,sha256=MGgLCqkTJb8uNEwYZY3zbVVDZRSBKSmf2s8VMuYse_I,26549
 unit_tests/sources/file_based/scenarios/scenario_builder.py,sha256=zSZtaYUflkosflxQGrTDxiJ24mhFsTJYosKyxAHgWbM,9475
-unit_tests/sources/file_based/scenarios/unstructured_scenarios.py,sha256=04exiS9j6kPyHLUUMgQLGfJHmlD1T63bixANhnUDdzk,67762
+unit_tests/sources/file_based/scenarios/unstructured_scenarios.py,sha256=2_p15Phk2xiBgZ0OdGYrCU9eAlTT8h_SU5nk1ehUcLk,67894
 unit_tests/sources/file_based/scenarios/user_input_schema_scenarios.py,sha256=FVYbRfdj2RCLFVwUNqQKiBFMm78y6FvmTO447i3SXqY,28697
-unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py,sha256=Try0knJN5wfoGNO38QGoLGIcqSceSAQsUWO42CusNYI,33005
+unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py,sha256=fFcNR-lzfLQ5SS8Uetbx6iFijgs_OXlqYz3Pr1OVTAI,32221
 unit_tests/sources/file_based/stream/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unit_tests/sources/file_based/stream/test_default_file_based_cursor.py,sha256=XhtCGvgSBFyeQwgqGciPsIB1HIlWqTcXROwnxrjutHc,13109
-unit_tests/sources/file_based/stream/test_default_file_based_stream.py,sha256=3zupeqwYyAb2EP4jn_8zbdu6_gTa1HlOAu6Rh0lxStM,7786
+unit_tests/sources/file_based/stream/test_default_file_based_stream.py,sha256=1GZPMIL00KGMIaYcPPBhQ0gpkYAJ48xtxXOgEwxkg84,10263
 unit_tests/sources/fixtures/__init__.py,sha256=ZnqYNxHsKCgO38IwB34RQyRMXTs4GTvlRi3ImKnIioo,61
 unit_tests/sources/fixtures/source_test_fixture.py,sha256=dvpISgio2sOp-U3bXudH_49vY4c68sO_PMs1JZTMaj0,5502
 unit_tests/sources/message/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -444,8 +444,8 @@ unit_tests/utils/test_schema_inferrer.py,sha256=Z2jHBZ540wnYkylIdV_2xr75Vtwlxuyg
 unit_tests/utils/test_secret_utils.py,sha256=XKe0f1RHYii8iwE6ATmBr5JGDI1pzzrnZUGdUSMJQP4,4886
 unit_tests/utils/test_stream_status_utils.py,sha256=Xr8MZ2HWgTVIyMbywDvuYkRaUF4RZLQOT8-JjvcfR24,2970
 unit_tests/utils/test_traced_exception.py,sha256=bDFP5zMBizFenz6V2WvEZTRCKGB5ijh3DBezjbfoYIs,4198
-airbyte_cdk-0.58.5.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
-airbyte_cdk-0.58.5.dist-info/METADATA,sha256=oXzgSRQhxSaFJU0NpyjMFavVxfFi14BKpQLIjS4rPNU,11073
-airbyte_cdk-0.58.5.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
-airbyte_cdk-0.58.5.dist-info/top_level.txt,sha256=edvsDKTnE6sD2wfCUaeTfKf5gQIL6CPVMwVL2sWZzqo,51
-airbyte_cdk-0.58.5.dist-info/RECORD,,
+airbyte_cdk-0.58.7.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
+airbyte_cdk-0.58.7.dist-info/METADATA,sha256=PAw5bOce761Nqfm14qOw6Rk60S5JqPvK9d8oUnzv8vc,11073
+airbyte_cdk-0.58.7.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+airbyte_cdk-0.58.7.dist-info/top_level.txt,sha256=edvsDKTnE6sD2wfCUaeTfKf5gQIL6CPVMwVL2sWZzqo,51
+airbyte_cdk-0.58.7.dist-info/RECORD,,

unit_tests/sources/file_based/scenarios/csv_scenarios.py CHANGED Viewed

@@ -852,6 +852,109 @@ invalid_csv_scenario: TestScenario[InMemoryFilesSource] = (
             ]
         }
     )
+    .set_expected_read_error(
+        AirbyteTracedException,
+        "Please check the logged errors for more information.",
+    )
+).build()
+invalid_csv_multi_scenario: TestScenario[InMemoryFilesSource] = (
+    TestScenarioBuilder[InMemoryFilesSource]()
+    .set_name("invalid_csv_multi_scenario")  # too many values for the number of headers
+    .set_config(
+        {
+            "streams": [
+                {
+                    "name": "stream1",
+                    "format": {"filetype": "csv"},
+                    "globs": ["*"],
+                    "validation_policy": "Emit Record",
+                },
+                {
+                    "name": "stream2",
+                    "format": {"filetype": "csv"},
+                    "globs": ["b.csv"],
+                    "validation_policy": "Emit Record",
+                },
+            ]
+        }
+    )
+    .set_source_builder(
+        FileBasedSourceBuilder()
+        .set_files(
+            {
+                "a.csv": {
+                    "contents": [
+                        ("col1",),
+                        ("val11", "val12"),
+                        ("val21", "val22"),
+                    ],
+                    "last_modified": "2023-06-05T03:54:07.000Z",
+                },
+                "b.csv": {
+                    "contents": [
+                        ("col3",),
+                        ("val13b", "val14b"),
+                        ("val23b", "val24b"),
+                    ],
+                    "last_modified": "2023-06-05T03:54:07.000Z",
+                },
+            }
+        )
+        .set_file_type("csv")
+    )
+    .set_expected_catalog(
+        {
+            "streams": [
+                {
+                    "default_cursor_field": ["_ab_source_file_last_modified"],
+                    "json_schema": {
+                        "type": "object",
+                        "properties": {
+                            "col1": {"type": ["null", "string"]},
+                            "col2": {"type": ["null", "string"]},
+                            "_ab_source_file_last_modified": {"type": "string"},
+                            "_ab_source_file_url": {"type": "string"},
+                        },
+                    },
+                    "name": "stream1",
+                    "source_defined_cursor": True,
+                    "supported_sync_modes": ["full_refresh", "incremental"],
+                },
+                {
+                    "json_schema": {
+                        "type": "object",
+                        "properties": {
+                            "col3": {"type": ["null", "string"]},
+                            "_ab_source_file_last_modified": {"type": "string"},
+                            "_ab_source_file_url": {"type": "string"},
+                        },
+                    },
+                    "name": "stream2",
+                    "source_defined_cursor": True,
+                    "default_cursor_field": ["_ab_source_file_last_modified"],
+                    "supported_sync_modes": ["full_refresh", "incremental"],
+                },
+            ]
+        }
+    )
+    .set_expected_records([])
+    .set_expected_discover_error(AirbyteTracedException, FileBasedSourceError.SCHEMA_INFERENCE_ERROR.value)
+    .set_expected_logs(
+        {
+            "read": [
+                {
+                    "level": "ERROR",
+                    "message": f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream=stream1 file=a.csv line_no=1 n_skipped=0",
+                },
+                {
+                    "level": "ERROR",
+                    "message": f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream=stream2 file=b.csv line_no=1 n_skipped=0",
+                },
+            ]
+        }
+    )
+    .set_expected_read_error(AirbyteTracedException, "Please check the logged errors for more information.")
 ).build()
 csv_single_stream_scenario: TestScenario[InMemoryFilesSource] = (
@@ -2172,17 +2275,15 @@ csv_newline_in_values_not_quoted_scenario: TestScenario[InMemoryFilesSource] = (
             },
         ]
     )
-    .set_expected_logs(
-        {
-            "read": [
-                {
-                    "level": "ERROR",
-                    "message": "Error parsing record. This could be due to a mismatch between the config's file type and the actual file type, or because the file or record is not parseable. stream=stream1 file=a.csv line_no=2 n_skipped=0",
-                }
-            ]
-        }
+    .set_expected_read_error(
+        AirbyteTracedException,
+        f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream=stream1 file=a.csv line_no=2 n_skipped=0",
     )
     .set_expected_discover_error(AirbyteTracedException, FileBasedSourceError.SCHEMA_INFERENCE_ERROR.value)
+    .set_expected_read_error(
+        AirbyteTracedException,
+        "Please check the logged errors for more information.",
+    )
 ).build()
 csv_escape_char_is_set_scenario: TestScenario[InMemoryFilesSource] = (

unit_tests/sources/file_based/scenarios/unstructured_scenarios.py CHANGED Viewed

@@ -231,6 +231,10 @@ unstructured_invalid_file_type_discover_scenario_no_skip = (
     )
     .set_expected_records([])
     .set_expected_discover_error(AirbyteTracedException, "Error inferring schema from files")
+    .set_expected_read_error(
+        AirbyteTracedException,
+        "Please check the logged errors for more information.",
+    )
 ).build()
 # If skip unprocessable file types is set to true, then discover will succeed even if there are non-matching file types

unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py CHANGED Viewed

@@ -2,7 +2,8 @@
 # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
 #
-from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError
+from airbyte_cdk.utils.traced_exception import AirbyteTracedException
 from unit_tests.sources.file_based.scenarios.file_based_source_builder import FileBasedSourceBuilder
 from unit_tests.sources.file_based.scenarios.scenario_builder import TestScenarioBuilder
@@ -272,6 +273,10 @@ skip_record_scenario_single_stream = (
             ]
         }
     )
+    .set_expected_read_error(
+        AirbyteTracedException,
+        "Please check the logged errors for more information.",
+    )
 ).build()
@@ -416,6 +421,10 @@ skip_record_scenario_multi_stream = (
             ]
         }
     )
+    .set_expected_read_error(
+        AirbyteTracedException,
+        "Please check the logged errors for more information.",
+    )
 ).build()
@@ -492,19 +501,9 @@ emit_record_scenario_single_stream = (
             },
         ]
     )
-    .set_expected_logs(
-        {
-            "read": [
-                {
-                    "level": "ERROR",
-                    "message": f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream=stream1 file=c.csv line_no=2 n_skipped=0",
-                },
-                {
-                    "level": "WARN",
-                    "message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer",
-                },
-            ]
-        }
+    .set_expected_read_error(
+        AirbyteTracedException,
+        "Please check the logged errors for more information.",
     )
 ).build()
@@ -640,23 +639,9 @@ emit_record_scenario_multi_stream = (
             },
         ]
     )
-    .set_expected_logs(
-        {
-            "read": [
-                {
-                    "level": "ERROR",
-                    "message": f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream=stream1 file=a/a3.csv line_no=2 n_skipped=0",
-                },
-                {
-                    "level": "WARN",
-                    "message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer",
-                },
-                {
-                    "level": "WARN",
-                    "message": "Could not cast the value to the expected type.: col2: value=this is text that will trigger validation policy,expected_type=integer",
-                },
-            ]
-        }
+    .set_expected_read_error(
+        AirbyteTracedException,
+        "Please check the logged errors for more information.",
     )
 ).build()

unit_tests/sources/file_based/stream/test_default_file_based_stream.py CHANGED Viewed

@@ -2,15 +2,18 @@
 # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
 #
+import traceback
 import unittest
 from datetime import datetime, timezone
 from typing import Any, Iterable, Iterator, Mapping
 from unittest.mock import Mock
 import pytest
-from airbyte_cdk.models import Level
+from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, Level
+from airbyte_cdk.models import Type as MessageType
 from airbyte_cdk.sources.file_based.availability_strategy import AbstractFileBasedAvailabilityStrategy
 from airbyte_cdk.sources.file_based.discovery_policy import AbstractDiscoveryPolicy
+from airbyte_cdk.sources.file_based.exceptions import FileBasedErrorsCollector, FileBasedSourceError
 from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
 from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
@@ -55,12 +58,17 @@ class MockFormat:
         ),
         pytest.param(
             {"type": "object", "properties": {"prop": {"type": "string"}}},
-            {"type": ["null", "object"], "properties": {"prop": {"type": ["null", "string"]}}},
+            {
+                "type": ["null", "object"],
+                "properties": {"prop": {"type": ["null", "string"]}},
+            },
             id="deeply-nested-schema",
         ),
     ],
 )
-def test_fill_nulls(input_schema: Mapping[str, Any], expected_output: Mapping[str, Any]) -> None:
+def test_fill_nulls(
+    input_schema: Mapping[str, Any], expected_output: Mapping[str, Any]
+) -> None:
     assert DefaultFileBasedStream._fill_nulls(input_schema) == expected_output
@@ -90,21 +98,33 @@ class DefaultFileBasedStreamTest(unittest.TestCase):
             parsers={MockFormat: self._parser},
             validation_policy=self._validation_policy,
             cursor=self._cursor,
+            errors_collector=FileBasedErrorsCollector(),
         )
     def test_when_read_records_from_slice_then_return_records(self) -> None:
         self._parser.parse_records.return_value = [self._A_RECORD]
-        messages = list(self._stream.read_records_from_slice({"files": [RemoteFile(uri="uri", last_modified=self._NOW)]}))
-        assert list(map(lambda message: message.record.data["data"], messages)) == [self._A_RECORD]
+        messages = list(
+            self._stream.read_records_from_slice(
+                {"files": [RemoteFile(uri="uri", last_modified=self._NOW)]}
+            )
+        )
+        assert list(map(lambda message: message.record.data["data"], messages)) == [
+            self._A_RECORD
+        ]
-    def test_given_exception_when_read_records_from_slice_then_do_process_other_files(self) -> None:
+    def test_given_exception_when_read_records_from_slice_then_do_process_other_files(
+        self,
+    ) -> None:
         """
         The current behavior for source-s3 v3 does not fail the sync on some errors, so we keep this behavior for now. One easy way to
         reproduce this is a file with a gzip extension that is not actually a gzip file: the reader will fail to open
         the file, but the sync won't fail.
         Ticket: https://github.com/airbytehq/airbyte/issues/29680
         """
-        self._parser.parse_records.side_effect = [ValueError("An error"), [self._A_RECORD]]
+        self._parser.parse_records.side_effect = [
+            ValueError("An error"),
+            [self._A_RECORD],
+        ]
         messages = list(
             self._stream.read_records_from_slice(
@@ -120,7 +140,9 @@ class DefaultFileBasedStreamTest(unittest.TestCase):
         assert messages[0].log.level == Level.ERROR
         assert messages[1].record.data["data"] == self._A_RECORD
-    def test_given_traced_exception_when_read_records_from_slice_then_fail(self) -> None:
+    def test_given_traced_exception_when_read_records_from_slice_then_fail(
+        self,
+    ) -> None:
         """
         When a traced exception is raised, the stream shouldn't try to handle but pass it on to the caller.
         """
@@ -138,10 +160,14 @@ class DefaultFileBasedStreamTest(unittest.TestCase):
                 )
             )
-    def test_given_exception_after_skipping_records_when_read_records_from_slice_then_send_warning(self) -> None:
+    def test_given_exception_after_skipping_records_when_read_records_from_slice_then_send_warning(
+        self,
+    ) -> None:
         self._stream_config.schemaless = False
         self._validation_policy.record_passes_validation_policy.return_value = False
-        self._parser.parse_records.side_effect = [self._iter([self._A_RECORD, ValueError("An error")])]
+        self._parser.parse_records.side_effect = [
+            self._iter([self._A_RECORD, ValueError("An error")])
+        ]
         messages = list(
             self._stream.read_records_from_slice(
@@ -183,3 +209,54 @@ class DefaultFileBasedStreamTest(unittest.TestCase):
             if isinstance(item, Exception):
                 raise item
             yield item
+class TestFileBasedErrorCollector:
+    test_error_collector: FileBasedErrorsCollector = FileBasedErrorsCollector()
+    @pytest.mark.parametrize(
+        "stream, file, line_no, n_skipped, collector_expected_len",
+        (
+            ("stream_1", "test.csv", 1, 1, 1),
+            ("stream_2", "test2.csv", 2, 2, 2),
+        ),
+        ids=[
+            "Single error",
+            "Multiple errors",
+        ],
+    )
+    def test_collect_parsing_error(
+        self, stream, file, line_no, n_skipped, collector_expected_len
+    ) -> None:
+        test_error_pattern = "Error parsing record."
+        # format the error body
+        test_error = (
+            AirbyteMessage(
+                type=MessageType.LOG,
+                log=AirbyteLogMessage(
+                    level=Level.ERROR,
+                    message=f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream={stream} file={file} line_no={line_no} n_skipped={n_skipped}",
+                    stack_trace=traceback.format_exc(),
+                ),
+            ),
+        )
+        # collecting the error
+        self.test_error_collector.collect(test_error)
+        # check the error has been collected
+        assert len(self.test_error_collector.errors) == collector_expected_len
+        # check that the pattern is present in each of the collected errors
+        for error in self.test_error_collector.errors:
+            assert test_error_pattern in error[0].log.message
+    def test_yield_and_raise_collected(self) -> None:
+        # we expect the following method will raise the AirbyteTracedException
+        with pytest.raises(AirbyteTracedException) as parse_error:
+            list(self.test_error_collector.yield_and_raise_collected())
+        assert (
+            parse_error.value.message
+            == "Some errors occured while reading from the source."
+        )
+        assert (
+            parse_error.value.internal_message
+            == "Please check the logged errors for more information."
+        )

unit_tests/sources/file_based/test_file_based_scenarios.py CHANGED Viewed

@@ -48,6 +48,7 @@ from unit_tests.sources.file_based.scenarios.csv_scenarios import (
     csv_strings_can_be_null_not_quoted_scenario,
     earlier_csv_scenario,
     empty_schema_inference_scenario,
+    invalid_csv_multi_scenario,
     invalid_csv_scenario,
     multi_csv_scenario,
     multi_csv_stream_n_file_exceeds_limit_for_inference,
@@ -132,6 +133,7 @@ discover_scenarios = [
     csv_multi_stream_scenario,
     csv_single_stream_scenario,
     invalid_csv_scenario,
+    invalid_csv_multi_scenario,
     single_csv_scenario,
     multi_csv_scenario,
     multi_csv_stream_n_file_exceeds_limit_for_inference,

{airbyte_cdk-0.58.5.dist-info → airbyte_cdk-0.58.7.dist-info}/LICENSE.txt RENAMED Viewed

File without changes

{airbyte_cdk-0.58.5.dist-info → airbyte_cdk-0.58.7.dist-info}/WHEEL RENAMED Viewed

File without changes

{airbyte_cdk-0.58.5.dist-info → airbyte_cdk-0.58.7.dist-info}/top_level.txt RENAMED Viewed

File without changes

airbyte-cdk 0.58.5__py3-none-any.whl → 0.58.7__py3-none-any.whl

airbyte-cdk 0.58.5py3-none-any.whl → 0.58.7py3-none-any.whl