airbyte-cdk 6.5.3rc2__py3-none-any.whl → 6.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (200)
  1. airbyte_cdk/__init__.py +17 -2
  2. airbyte_cdk/config_observation.py +10 -3
  3. airbyte_cdk/connector.py +19 -9
  4. airbyte_cdk/connector_builder/connector_builder_handler.py +28 -8
  5. airbyte_cdk/connector_builder/main.py +26 -6
  6. airbyte_cdk/connector_builder/message_grouper.py +95 -25
  7. airbyte_cdk/destinations/destination.py +47 -14
  8. airbyte_cdk/destinations/vector_db_based/config.py +36 -14
  9. airbyte_cdk/destinations/vector_db_based/document_processor.py +49 -11
  10. airbyte_cdk/destinations/vector_db_based/embedder.py +52 -11
  11. airbyte_cdk/destinations/vector_db_based/test_utils.py +14 -4
  12. airbyte_cdk/destinations/vector_db_based/utils.py +8 -2
  13. airbyte_cdk/destinations/vector_db_based/writer.py +15 -4
  14. airbyte_cdk/entrypoint.py +82 -26
  15. airbyte_cdk/exception_handler.py +13 -3
  16. airbyte_cdk/logger.py +10 -2
  17. airbyte_cdk/models/airbyte_protocol.py +11 -5
  18. airbyte_cdk/models/airbyte_protocol_serializers.py +9 -3
  19. airbyte_cdk/models/well_known_types.py +1 -1
  20. airbyte_cdk/sources/abstract_source.py +63 -17
  21. airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +47 -14
  22. airbyte_cdk/sources/concurrent_source/concurrent_source.py +25 -7
  23. airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py +27 -6
  24. airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +9 -3
  25. airbyte_cdk/sources/connector_state_manager.py +32 -10
  26. airbyte_cdk/sources/declarative/async_job/job.py +3 -1
  27. airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +68 -14
  28. airbyte_cdk/sources/declarative/async_job/job_tracker.py +24 -6
  29. airbyte_cdk/sources/declarative/async_job/repository.py +3 -1
  30. airbyte_cdk/sources/declarative/auth/declarative_authenticator.py +3 -1
  31. airbyte_cdk/sources/declarative/auth/jwt.py +27 -7
  32. airbyte_cdk/sources/declarative/auth/oauth.py +35 -11
  33. airbyte_cdk/sources/declarative/auth/selective_authenticator.py +3 -1
  34. airbyte_cdk/sources/declarative/auth/token.py +25 -8
  35. airbyte_cdk/sources/declarative/checks/check_stream.py +12 -4
  36. airbyte_cdk/sources/declarative/checks/connection_checker.py +3 -1
  37. airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py +11 -3
  38. airbyte_cdk/sources/declarative/concurrent_declarative_source.py +106 -50
  39. airbyte_cdk/sources/declarative/datetime/min_max_datetime.py +20 -6
  40. airbyte_cdk/sources/declarative/declarative_component_schema.yaml +43 -0
  41. airbyte_cdk/sources/declarative/declarative_source.py +3 -1
  42. airbyte_cdk/sources/declarative/declarative_stream.py +27 -6
  43. airbyte_cdk/sources/declarative/decoders/__init__.py +2 -2
  44. airbyte_cdk/sources/declarative/decoders/decoder.py +3 -1
  45. airbyte_cdk/sources/declarative/decoders/json_decoder.py +48 -13
  46. airbyte_cdk/sources/declarative/decoders/pagination_decoder_decorator.py +3 -1
  47. airbyte_cdk/sources/declarative/decoders/xml_decoder.py +6 -2
  48. airbyte_cdk/sources/declarative/extractors/dpath_extractor.py +6 -2
  49. airbyte_cdk/sources/declarative/extractors/record_filter.py +24 -7
  50. airbyte_cdk/sources/declarative/extractors/record_selector.py +10 -3
  51. airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py +15 -5
  52. airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +96 -31
  53. airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +22 -8
  54. airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +46 -15
  55. airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py +19 -5
  56. airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py +3 -1
  57. airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py +20 -2
  58. airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py +5 -1
  59. airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py +10 -3
  60. airbyte_cdk/sources/declarative/interpolation/interpolated_string.py +6 -2
  61. airbyte_cdk/sources/declarative/interpolation/interpolation.py +7 -1
  62. airbyte_cdk/sources/declarative/interpolation/jinja.py +6 -2
  63. airbyte_cdk/sources/declarative/interpolation/macros.py +19 -4
  64. airbyte_cdk/sources/declarative/manifest_declarative_source.py +106 -24
  65. airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py +14 -5
  66. airbyte_cdk/sources/declarative/models/declarative_component_schema.py +697 -678
  67. airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +13 -4
  68. airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +9 -2
  69. airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +802 -232
  70. airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +29 -7
  71. airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +25 -7
  72. airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +54 -15
  73. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py +6 -2
  74. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/header_helper.py +3 -1
  75. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py +17 -5
  76. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py +15 -5
  77. airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py +3 -1
  78. airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py +18 -8
  79. airbyte_cdk/sources/declarative/requesters/error_handlers/default_http_response_filter.py +16 -7
  80. airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +51 -14
  81. airbyte_cdk/sources/declarative/requesters/http_job_repository.py +29 -8
  82. airbyte_cdk/sources/declarative/requesters/http_requester.py +58 -16
  83. airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +49 -14
  84. airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +3 -1
  85. airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +3 -1
  86. airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +17 -5
  87. airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +24 -7
  88. airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +9 -3
  89. airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +3 -1
  90. airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +6 -2
  91. airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +19 -6
  92. airbyte_cdk/sources/declarative/requesters/request_options/default_request_options_provider.py +3 -1
  93. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_nested_request_input_provider.py +21 -7
  94. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py +18 -6
  95. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +27 -8
  96. airbyte_cdk/sources/declarative/requesters/requester.py +3 -1
  97. airbyte_cdk/sources/declarative/retrievers/async_retriever.py +12 -5
  98. airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +105 -24
  99. airbyte_cdk/sources/declarative/schema/default_schema_loader.py +3 -1
  100. airbyte_cdk/sources/declarative/spec/spec.py +8 -2
  101. airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py +3 -1
  102. airbyte_cdk/sources/declarative/transformations/add_fields.py +12 -3
  103. airbyte_cdk/sources/declarative/transformations/remove_fields.py +6 -2
  104. airbyte_cdk/sources/declarative/types.py +8 -1
  105. airbyte_cdk/sources/declarative/yaml_declarative_source.py +3 -1
  106. airbyte_cdk/sources/embedded/base_integration.py +14 -4
  107. airbyte_cdk/sources/embedded/catalog.py +16 -4
  108. airbyte_cdk/sources/embedded/runner.py +19 -3
  109. airbyte_cdk/sources/embedded/tools.py +3 -1
  110. airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py +12 -4
  111. airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +27 -7
  112. airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +12 -6
  113. airbyte_cdk/sources/file_based/config/csv_format.py +21 -9
  114. airbyte_cdk/sources/file_based/config/file_based_stream_config.py +6 -2
  115. airbyte_cdk/sources/file_based/config/unstructured_format.py +10 -3
  116. airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py +2 -4
  117. airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py +7 -2
  118. airbyte_cdk/sources/file_based/exceptions.py +13 -15
  119. airbyte_cdk/sources/file_based/file_based_source.py +82 -24
  120. airbyte_cdk/sources/file_based/file_based_stream_reader.py +16 -5
  121. airbyte_cdk/sources/file_based/file_types/avro_parser.py +58 -17
  122. airbyte_cdk/sources/file_based/file_types/csv_parser.py +89 -26
  123. airbyte_cdk/sources/file_based/file_types/excel_parser.py +25 -7
  124. airbyte_cdk/sources/file_based/file_types/file_transfer.py +8 -2
  125. airbyte_cdk/sources/file_based/file_types/file_type_parser.py +4 -1
  126. airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +20 -6
  127. airbyte_cdk/sources/file_based/file_types/parquet_parser.py +57 -16
  128. airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +64 -15
  129. airbyte_cdk/sources/file_based/schema_helpers.py +33 -10
  130. airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py +3 -1
  131. airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py +16 -5
  132. airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +33 -10
  133. airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +47 -11
  134. airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py +13 -22
  135. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +53 -17
  136. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_cursor.py +17 -5
  137. airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py +3 -1
  138. airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +26 -9
  139. airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +67 -21
  140. airbyte_cdk/sources/http_logger.py +5 -1
  141. airbyte_cdk/sources/message/repository.py +18 -4
  142. airbyte_cdk/sources/source.py +17 -7
  143. airbyte_cdk/sources/streams/availability_strategy.py +9 -3
  144. airbyte_cdk/sources/streams/call_rate.py +63 -19
  145. airbyte_cdk/sources/streams/checkpoint/checkpoint_reader.py +31 -7
  146. airbyte_cdk/sources/streams/checkpoint/substream_resumable_full_refresh_cursor.py +6 -2
  147. airbyte_cdk/sources/streams/concurrent/adapters.py +77 -22
  148. airbyte_cdk/sources/streams/concurrent/cursor.py +56 -20
  149. airbyte_cdk/sources/streams/concurrent/default_stream.py +9 -2
  150. airbyte_cdk/sources/streams/concurrent/helpers.py +6 -2
  151. airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +9 -2
  152. airbyte_cdk/sources/streams/concurrent/partition_reader.py +4 -1
  153. airbyte_cdk/sources/streams/concurrent/partitions/record.py +10 -2
  154. airbyte_cdk/sources/streams/concurrent/partitions/types.py +6 -2
  155. airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +25 -10
  156. airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +32 -16
  157. airbyte_cdk/sources/streams/core.py +77 -22
  158. airbyte_cdk/sources/streams/http/availability_strategy.py +3 -1
  159. airbyte_cdk/sources/streams/http/error_handlers/default_error_mapping.py +4 -1
  160. airbyte_cdk/sources/streams/http/error_handlers/error_handler.py +3 -1
  161. airbyte_cdk/sources/streams/http/error_handlers/http_status_error_handler.py +16 -5
  162. airbyte_cdk/sources/streams/http/error_handlers/response_models.py +9 -3
  163. airbyte_cdk/sources/streams/http/exceptions.py +2 -2
  164. airbyte_cdk/sources/streams/http/http.py +133 -33
  165. airbyte_cdk/sources/streams/http/http_client.py +91 -29
  166. airbyte_cdk/sources/streams/http/rate_limiting.py +23 -7
  167. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +19 -6
  168. airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +38 -11
  169. airbyte_cdk/sources/streams/http/requests_native_auth/token.py +13 -3
  170. airbyte_cdk/sources/types.py +5 -1
  171. airbyte_cdk/sources/utils/record_helper.py +12 -3
  172. airbyte_cdk/sources/utils/schema_helpers.py +9 -3
  173. airbyte_cdk/sources/utils/slice_logger.py +4 -1
  174. airbyte_cdk/sources/utils/transform.py +24 -9
  175. airbyte_cdk/sql/exceptions.py +19 -6
  176. airbyte_cdk/sql/secrets.py +3 -1
  177. airbyte_cdk/sql/shared/catalog_providers.py +13 -4
  178. airbyte_cdk/sql/shared/sql_processor.py +44 -14
  179. airbyte_cdk/test/catalog_builder.py +19 -8
  180. airbyte_cdk/test/entrypoint_wrapper.py +27 -8
  181. airbyte_cdk/test/mock_http/mocker.py +41 -11
  182. airbyte_cdk/test/mock_http/request.py +9 -3
  183. airbyte_cdk/test/mock_http/response.py +3 -1
  184. airbyte_cdk/test/mock_http/response_builder.py +29 -7
  185. airbyte_cdk/test/state_builder.py +10 -2
  186. airbyte_cdk/test/utils/data.py +6 -2
  187. airbyte_cdk/test/utils/http_mocking.py +3 -1
  188. airbyte_cdk/utils/airbyte_secrets_utils.py +3 -1
  189. airbyte_cdk/utils/analytics_message.py +10 -2
  190. airbyte_cdk/utils/datetime_format_inferrer.py +4 -1
  191. airbyte_cdk/utils/mapping_helpers.py +3 -1
  192. airbyte_cdk/utils/message_utils.py +11 -4
  193. airbyte_cdk/utils/print_buffer.py +6 -1
  194. airbyte_cdk/utils/schema_inferrer.py +30 -9
  195. airbyte_cdk/utils/spec_schema_transformations.py +3 -1
  196. airbyte_cdk/utils/traced_exception.py +35 -9
  197. {airbyte_cdk-6.5.3rc2.dist-info → airbyte_cdk-6.6.0.dist-info}/METADATA +8 -7
  198. {airbyte_cdk-6.5.3rc2.dist-info → airbyte_cdk-6.6.0.dist-info}/RECORD +200 -200
  199. {airbyte_cdk-6.5.3rc2.dist-info → airbyte_cdk-6.6.0.dist-info}/LICENSE.txt +0 -0
  200. {airbyte_cdk-6.5.3rc2.dist-info → airbyte_cdk-6.6.0.dist-info}/WHEEL +0 -0
airbyte_cdk/sources/file_based/file_based_stream_reader.py

@@ -45,7 +45,9 @@ class AbstractFileBasedStreamReader(ABC):
         ...
 
     @abstractmethod
-    def open_file(self, file: RemoteFile, mode: FileReadMode, encoding: Optional[str], logger: logging.Logger) -> IOBase:
+    def open_file(
+        self, file: RemoteFile, mode: FileReadMode, encoding: Optional[str], logger: logging.Logger
+    ) -> IOBase:
         """
         Return a file handle for reading.
 

@@ -80,11 +82,17 @@ class AbstractFileBasedStreamReader(ABC):
         """
         ...
 
-    def filter_files_by_globs_and_start_date(self, files: List[RemoteFile], globs: List[str]) -> Iterable[RemoteFile]:
+    def filter_files_by_globs_and_start_date(
+        self, files: List[RemoteFile], globs: List[str]
+    ) -> Iterable[RemoteFile]:
         """
         Utility method for filtering files based on globs.
         """
-        start_date = datetime.strptime(self.config.start_date, self.DATE_TIME_FORMAT) if self.config and self.config.start_date else None
+        start_date = (
+            datetime.strptime(self.config.start_date, self.DATE_TIME_FORMAT)
+            if self.config and self.config.start_date
+            else None
+        )
         seen = set()
 
         for file in files:

@@ -120,13 +128,16 @@ class AbstractFileBasedStreamReader(ABC):
     def use_file_transfer(self) -> bool:
         if self.config:
             use_file_transfer = (
-                hasattr(self.config.delivery_method, "delivery_type") and self.config.delivery_method.delivery_type == "use_file_transfer"
+                hasattr(self.config.delivery_method, "delivery_type")
+                and self.config.delivery_method.delivery_type == "use_file_transfer"
             )
             return use_file_transfer
         return False
 
     @abstractmethod
-    def get_file(self, file: RemoteFile, local_directory: str, logger: logging.Logger) -> Dict[str, Any]:
+    def get_file(
+        self, file: RemoteFile, local_directory: str, logger: logging.Logger
+    ) -> Dict[str, Any]:
         """
         This is required for connectors that will support writing to
         files. It will handle the logic to download,get,read,acquire or
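None of the hunks above change behavior; they only re-wrap expressions that exceeded the new line-length limit. As a sanity check that the reshaped start_date ternary still reads as one guarded parse, here is a minimal standalone sketch (the format string and config shape are illustrative assumptions, not taken from the CDK):

    from datetime import datetime
    from typing import Optional

    DATE_TIME_FORMAT = "%Y-%m-%dT%H:%M:%SZ"  # assumed for illustration only

    class _Config:
        start_date: Optional[str] = "2024-01-01T00:00:00Z"

    config = _Config()

    # Same guarded-parse shape as the reformatted hunk: parse only when a
    # config with a start_date is present, otherwise fall back to None.
    start_date = (
        datetime.strptime(config.start_date, DATE_TIME_FORMAT)
        if config and config.start_date
        else None
    )
    print(start_date)  # 2024-01-01 00:00:00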
airbyte_cdk/sources/file_based/file_types/avro_parser.py

@@ -9,7 +9,10 @@ import fastavro
 from airbyte_cdk.sources.file_based.config.avro_format import AvroFormat
 from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
 from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError
-from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
+from airbyte_cdk.sources.file_based.file_based_stream_reader import (
+    AbstractFileBasedStreamReader,
+    FileReadMode,
+)
 from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
 from airbyte_cdk.sources.file_based.schema_helpers import SchemaType

@@ -64,15 +67,21 @@ class AvroParser(FileTypeParser):
             avro_schema = avro_reader.writer_schema
         if not avro_schema["type"] == "record":
             unsupported_type = avro_schema["type"]
-            raise ValueError(f"Only record based avro files are supported. Found {unsupported_type}")
+            raise ValueError(
+                f"Only record based avro files are supported. Found {unsupported_type}"
+            )
         json_schema = {
-            field["name"]: AvroParser._convert_avro_type_to_json(avro_format, field["name"], field["type"])
+            field["name"]: AvroParser._convert_avro_type_to_json(
+                avro_format, field["name"], field["type"]
+            )
             for field in avro_schema["fields"]
         }
         return json_schema
 
     @classmethod
-    def _convert_avro_type_to_json(cls, avro_format: AvroFormat, field_name: str, avro_field: str) -> Mapping[str, Any]:
+    def _convert_avro_type_to_json(
+        cls, avro_format: AvroFormat, field_name: str, avro_field: str
+    ) -> Mapping[str, Any]:
         if isinstance(avro_field, str) and avro_field in AVRO_TYPE_TO_JSON_TYPE:
             # Legacy behavior to retain backwards compatibility. Long term we should always represent doubles as strings
             if avro_field == "double" and not avro_format.double_as_string:

@@ -83,17 +92,28 @@ class AvroParser(FileTypeParser):
             return {
                 "type": "object",
                 "properties": {
-                    object_field["name"]: AvroParser._convert_avro_type_to_json(avro_format, object_field["name"], object_field["type"])
+                    object_field["name"]: AvroParser._convert_avro_type_to_json(
+                        avro_format, object_field["name"], object_field["type"]
+                    )
                     for object_field in avro_field["fields"]
                 },
             }
         elif avro_field["type"] == "array":
             if "items" not in avro_field:
-                raise ValueError(f"{field_name} array type does not have a required field items")
-            return {"type": "array", "items": AvroParser._convert_avro_type_to_json(avro_format, "", avro_field["items"])}
+                raise ValueError(
+                    f"{field_name} array type does not have a required field items"
+                )
+            return {
+                "type": "array",
+                "items": AvroParser._convert_avro_type_to_json(
+                    avro_format, "", avro_field["items"]
+                ),
+            }
         elif avro_field["type"] == "enum":
             if "symbols" not in avro_field:
-                raise ValueError(f"{field_name} enum type does not have a required field symbols")
+                raise ValueError(
+                    f"{field_name} enum type does not have a required field symbols"
+                )
             if "name" not in avro_field:
                 raise ValueError(f"{field_name} enum type does not have a required field name")
             return {"type": "string", "enum": avro_field["symbols"]}

@@ -102,7 +122,9 @@ class AvroParser(FileTypeParser):
                 raise ValueError(f"{field_name} map type does not have a required field values")
             return {
                 "type": "object",
-                "additionalProperties": AvroParser._convert_avro_type_to_json(avro_format, "", avro_field["values"]),
+                "additionalProperties": AvroParser._convert_avro_type_to_json(
+                    avro_format, "", avro_field["values"]
+                ),
             }
         elif avro_field["type"] == "fixed" and avro_field.get("logicalType") != "duration":
             if "size" not in avro_field:

@@ -115,18 +137,27 @@ class AvroParser(FileTypeParser):
             }
         elif avro_field.get("logicalType") == "decimal":
             if "precision" not in avro_field:
-                raise ValueError(f"{field_name} decimal type does not have a required field precision")
+                raise ValueError(
+                    f"{field_name} decimal type does not have a required field precision"
+                )
             if "scale" not in avro_field:
-                raise ValueError(f"{field_name} decimal type does not have a required field scale")
+                raise ValueError(
+                    f"{field_name} decimal type does not have a required field scale"
+                )
             max_whole_number_range = avro_field["precision"] - avro_field["scale"]
             decimal_range = avro_field["scale"]
 
             # This regex looks like a mess, but it is validation for at least one whole number and optional fractional numbers
             # For example: ^-?\d{1,5}(?:\.\d{1,3})?$ would accept 12345.123 and 123456.12345 would be rejected
-            return {"type": "string", "pattern": f"^-?\\d{{{1,max_whole_number_range}}}(?:\\.\\d{1,decimal_range})?$"}
+            return {
+                "type": "string",
+                "pattern": f"^-?\\d{{{1,max_whole_number_range}}}(?:\\.\\d{1,decimal_range})?$",
+            }
         elif "logicalType" in avro_field:
             if avro_field["logicalType"] not in AVRO_LOGICAL_TYPE_TO_JSON:
-                raise ValueError(f"{avro_field['logicalType']} is not a valid Avro logical type")
+                raise ValueError(
+                    f"{avro_field['logicalType']} is not a valid Avro logical type"
+                )
             return AVRO_LOGICAL_TYPE_TO_JSON[avro_field["logicalType"]]
         else:
             raise ValueError(f"Unsupported avro type: {avro_field}")
@@ -150,22 +181,32 @@ class AvroParser(FileTypeParser):
             with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp:
                 avro_reader = fastavro.reader(fp)
                 schema = avro_reader.writer_schema
-                schema_field_name_to_type = {field["name"]: field["type"] for field in schema["fields"]}
+                schema_field_name_to_type = {
+                    field["name"]: field["type"] for field in schema["fields"]
+                }
                 for record in avro_reader:
                     line_no += 1
                     yield {
-                        record_field: self._to_output_value(avro_format, schema_field_name_to_type[record_field], record[record_field])
+                        record_field: self._to_output_value(
+                            avro_format,
+                            schema_field_name_to_type[record_field],
+                            record[record_field],
+                        )
                         for record_field, record_value in schema_field_name_to_type.items()
                     }
         except Exception as exc:
-            raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri, lineno=line_no) from exc
+            raise RecordParseError(
+                FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri, lineno=line_no
+            ) from exc
 
     @property
     def file_read_mode(self) -> FileReadMode:
         return FileReadMode.READ_BINARY
 
     @staticmethod
-    def _to_output_value(avro_format: AvroFormat, record_type: Mapping[str, Any], record_value: Any) -> Any:
+    def _to_output_value(
+        avro_format: AvroFormat, record_type: Mapping[str, Any], record_value: Any
+    ) -> Any:
         if isinstance(record_value, bytes):
             return record_value.decode()
         elif not isinstance(record_type, Mapping):
airbyte_cdk/sources/file_based/file_types/csv_parser.py

@@ -13,10 +13,18 @@ from typing import Any, Callable, Dict, Generator, Iterable, List, Mapping, Opti
 from uuid import uuid4
 
 from airbyte_cdk.models import FailureType
-from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat, CsvHeaderAutogenerated, CsvHeaderUserProvided, InferenceType
+from airbyte_cdk.sources.file_based.config.csv_format import (
+    CsvFormat,
+    CsvHeaderAutogenerated,
+    CsvHeaderUserProvided,
+    InferenceType,
+)
 from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
 from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError
-from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
+from airbyte_cdk.sources.file_based.file_based_stream_reader import (
+    AbstractFileBasedStreamReader,
+    FileReadMode,
+)
 from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
 from airbyte_cdk.sources.file_based.schema_helpers import TYPE_PYTHON_MAPPING, SchemaType

@@ -77,7 +85,9 @@ class _CsvReader:
                     # than headers or more headers dans columns
                     if None in row:
                         if config_format.ignore_errors_on_fields_mismatch:
-                            logger.error(f"Skipping record in line {lineno} of file {file.uri}; invalid CSV row with missing column.")
+                            logger.error(
+                                f"Skipping record in line {lineno} of file {file.uri}; invalid CSV row with missing column."
+                            )
                         else:
                             raise RecordParseError(
                                 FileBasedSourceError.ERROR_PARSING_RECORD_MISMATCHED_COLUMNS,

@@ -86,10 +96,14 @@ class _CsvReader:
                             )
                     if None in row.values():
                         if config_format.ignore_errors_on_fields_mismatch:
-                            logger.error(f"Skipping record in line {lineno} of file {file.uri}; invalid CSV row with extra column.")
+                            logger.error(
+                                f"Skipping record in line {lineno} of file {file.uri}; invalid CSV row with extra column."
+                            )
                         else:
                             raise RecordParseError(
-                                FileBasedSourceError.ERROR_PARSING_RECORD_MISMATCHED_ROWS, filename=file.uri, lineno=lineno
+                                FileBasedSourceError.ERROR_PARSING_RECORD_MISMATCHED_ROWS,
+                                filename=file.uri,
+                                lineno=lineno,
                             )
                     yield row
         finally:

@@ -105,7 +119,9 @@ class _CsvReader:
            return config_format.header_definition.column_names  # type: ignore # should be CsvHeaderUserProvided given the type
 
        if isinstance(config_format.header_definition, CsvHeaderAutogenerated):
-            self._skip_rows(fp, config_format.skip_rows_before_header + config_format.skip_rows_after_header)
+            self._skip_rows(
+                fp, config_format.skip_rows_before_header + config_format.skip_rows_after_header
+            )
            headers = self._auto_generate_headers(fp, dialect_name)
        else:
            # Then read the header

@@ -165,11 +181,15 @@ class CsvParser(FileTypeParser):
        # sources will likely require one. Rather than modify the interface now we can wait until the real use case
        config_format = _extract_format(config)
        type_inferrer_by_field: Dict[str, _TypeInferrer] = defaultdict(
-            lambda: _JsonTypeInferrer(config_format.true_values, config_format.false_values, config_format.null_values)
+            lambda: _JsonTypeInferrer(
+                config_format.true_values, config_format.false_values, config_format.null_values
+            )
            if config_format.inference_type != InferenceType.NONE
            else _DisabledTypeInferrer()
        )
-        data_generator = self._csv_reader.read_data(config, file, stream_reader, logger, self.file_read_mode)
+        data_generator = self._csv_reader.read_data(
+            config, file, stream_reader, logger, self.file_read_mode
+        )
        read_bytes = 0
        for row in data_generator:
            for header, value in row.items():

@@ -187,7 +207,10 @@ class CsvParser(FileTypeParser):
                f"Else, please contact Airbyte.",
                failure_type=FailureType.config_error,
            )
-        schema = {header.strip(): {"type": type_inferred.infer()} for header, type_inferred in type_inferrer_by_field.items()}
+        schema = {
+            header.strip(): {"type": type_inferred.infer()}
+            for header, type_inferred in type_inferrer_by_field.items()
+        }
        data_generator.close()
        return schema
 

@@ -203,19 +226,30 @@ class CsvParser(FileTypeParser):
        try:
            config_format = _extract_format(config)
            if discovered_schema:
-                property_types = {col: prop["type"] for col, prop in discovered_schema["properties"].items()}  # type: ignore # discovered_schema["properties"] is known to be a mapping
+                property_types = {
+                    col: prop["type"] for col, prop in discovered_schema["properties"].items()
+                }  # type: ignore # discovered_schema["properties"] is known to be a mapping
                deduped_property_types = CsvParser._pre_propcess_property_types(property_types)
            else:
                deduped_property_types = {}
-            cast_fn = CsvParser._get_cast_function(deduped_property_types, config_format, logger, config.schemaless)
-            data_generator = self._csv_reader.read_data(config, file, stream_reader, logger, self.file_read_mode)
+            cast_fn = CsvParser._get_cast_function(
+                deduped_property_types, config_format, logger, config.schemaless
+            )
+            data_generator = self._csv_reader.read_data(
+                config, file, stream_reader, logger, self.file_read_mode
+            )
            for row in data_generator:
                line_no += 1
                yield CsvParser._to_nullable(
-                    cast_fn(row), deduped_property_types, config_format.null_values, config_format.strings_can_be_null
+                    cast_fn(row),
+                    deduped_property_types,
+                    config_format.null_values,
+                    config_format.strings_can_be_null,
                )
        except RecordParseError as parse_err:
-            raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri, lineno=line_no) from parse_err
+            raise RecordParseError(
+                FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri, lineno=line_no
+            ) from parse_err
        finally:
            data_generator.close()
 

@@ -225,27 +259,47 @@ class CsvParser(FileTypeParser):
 
    @staticmethod
    def _get_cast_function(
-        deduped_property_types: Mapping[str, str], config_format: CsvFormat, logger: logging.Logger, schemaless: bool
+        deduped_property_types: Mapping[str, str],
+        config_format: CsvFormat,
+        logger: logging.Logger,
+        schemaless: bool,
    ) -> Callable[[Mapping[str, str]], Mapping[str, str]]:
        # Only cast values if the schema is provided
        if deduped_property_types and not schemaless:
-            return partial(CsvParser._cast_types, deduped_property_types=deduped_property_types, config_format=config_format, logger=logger)
+            return partial(
+                CsvParser._cast_types,
+                deduped_property_types=deduped_property_types,
+                config_format=config_format,
+                logger=logger,
+            )
        else:
            # If no schema is provided, yield the rows as they are
            return _no_cast
 
    @staticmethod
    def _to_nullable(
-        row: Mapping[str, str], deduped_property_types: Mapping[str, str], null_values: Set[str], strings_can_be_null: bool
+        row: Mapping[str, str],
+        deduped_property_types: Mapping[str, str],
+        null_values: Set[str],
+        strings_can_be_null: bool,
    ) -> Dict[str, Optional[str]]:
        nullable = {
-            k: None if CsvParser._value_is_none(v, deduped_property_types.get(k), null_values, strings_can_be_null) else v
+            k: None
+            if CsvParser._value_is_none(
+                v, deduped_property_types.get(k), null_values, strings_can_be_null
+            )
+            else v
            for k, v in row.items()
        }
        return nullable
 
    @staticmethod
-    def _value_is_none(value: Any, deduped_property_type: Optional[str], null_values: Set[str], strings_can_be_null: bool) -> bool:
+    def _value_is_none(
+        value: Any,
+        deduped_property_type: Optional[str],
+        null_values: Set[str],
+        strings_can_be_null: bool,
+    ) -> bool:
        return value in null_values and (strings_can_be_null or deduped_property_type != "string")
 
    @staticmethod
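The `_get_cast_function` hunk above only reflows the `partial(...)` call onto one keyword per line; the binding itself is unchanged. For readers less familiar with the pattern, a self-contained sketch of how `functools.partial` pre-binds keyword arguments the way `_cast_types` is bound here (the names and the toy casting rule below are illustrative, not the CDK's):

    from functools import partial
    from typing import Any, Dict, Mapping

    def cast_types(row: Dict[str, str], *, property_types: Mapping[str, str]) -> Dict[str, Any]:
        # Illustrative stand-in for CsvParser._cast_types: cast "integer" columns to int.
        return {
            k: int(v) if property_types.get(k) == "integer" else v
            for k, v in row.items()
        }

    # Pre-bind the schema once; the returned callable takes only the row,
    # matching the Callable[[Mapping[str, str]], Mapping[str, str]] contract.
    cast_fn = partial(cast_types, property_types={"id": "integer"})
    print(cast_fn({"id": "42", "name": "airbyte"}))  # {'id': 42, 'name': 'airbyte'}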
@@ -280,7 +334,10 @@ class CsvParser(FileTypeParser):
 
    @staticmethod
    def _cast_types(
-        row: Dict[str, str], deduped_property_types: Mapping[str, str], config_format: CsvFormat, logger: logging.Logger
+        row: Dict[str, str],
+        deduped_property_types: Mapping[str, str],
+        config_format: CsvFormat,
+        logger: logging.Logger,
    ) -> Dict[str, Any]:
        """
        Casts the values in the input 'row' dictionary according to the types defined in the JSON schema.

@@ -305,20 +362,22 @@ class CsvParser(FileTypeParser):
                    else:
                        warnings.append(_format_warning(key, value, prop_type))
 
-                elif python_type == bool:
+                elif python_type is bool:
                    try:
-                        cast_value = _value_to_bool(value, config_format.true_values, config_format.false_values)
+                        cast_value = _value_to_bool(
+                            value, config_format.true_values, config_format.false_values
+                        )
                    except ValueError:
                        warnings.append(_format_warning(key, value, prop_type))
 
-                elif python_type == dict:
+                elif python_type is dict:
                    try:
                        # we don't re-use _value_to_object here because we type the column as object as long as there is only one object
                        cast_value = orjson.loads(value)
                    except orjson.JSONDecodeError:
                        warnings.append(_format_warning(key, value, prop_type))
 
-                elif python_type == list:
+                elif python_type is list:
                    try:
                        cast_value = _value_to_list(value)
                    except (ValueError, json.JSONDecodeError):
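The only non-whitespace edits in this hunk swap `==` for `is` when comparing against type objects. Built-in types are singletons, so identity comparison gives the same result here, and it is the form linters prefer (pycodestyle's E721 flags `==` on types). A quick check:

    python_type = bool

    # Equivalent results for built-in type objects, but `is` states the
    # intent (identity of a singleton) and avoids E721 warnings.
    print(python_type == bool)  # True
    print(python_type is bool)  # True

    # `==` can be overridden by metaclasses; `is` cannot, which is why the
    # identity form is the safer idiom for exact-type dispatch like this.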
@@ -364,7 +423,9 @@ class _JsonTypeInferrer(_TypeInferrer):
     _NUMBER_TYPE = "number"
     _STRING_TYPE = "string"
 
-    def __init__(self, boolean_trues: Set[str], boolean_falses: Set[str], null_values: Set[str]) -> None:
+    def __init__(
+        self, boolean_trues: Set[str], boolean_falses: Set[str], null_values: Set[str]
+    ) -> None:
         self._boolean_trues = boolean_trues
         self._boolean_falses = boolean_falses
         self._null_values = null_values

@@ -375,7 +436,9 @@ class _JsonTypeInferrer(_TypeInferrer):
 
     def infer(self) -> str:
         types_by_value = {value: self._infer_type(value) for value in self._values}
-        types_excluding_null_values = [types for types in types_by_value.values() if self._NULL_TYPE not in types]
+        types_excluding_null_values = [
+            types for types in types_by_value.values() if self._NULL_TYPE not in types
+        ]
         if not types_excluding_null_values:
             # this is highly unusual but we will consider the column as a string
             return self._STRING_TYPE
airbyte_cdk/sources/file_based/file_types/excel_parser.py

@@ -8,9 +8,19 @@ from pathlib import Path
 from typing import Any, Dict, Iterable, Mapping, Optional, Tuple, Union
 
 import pandas as pd
-from airbyte_cdk.sources.file_based.config.file_based_stream_config import ExcelFormat, FileBasedStreamConfig
-from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError, FileBasedSourceError, RecordParseError
-from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
+from airbyte_cdk.sources.file_based.config.file_based_stream_config import (
+    ExcelFormat,
+    FileBasedStreamConfig,
+)
+from airbyte_cdk.sources.file_based.exceptions import (
+    ConfigValidationError,
+    FileBasedSourceError,
+    RecordParseError,
+)
+from airbyte_cdk.sources.file_based.file_based_stream_reader import (
+    AbstractFileBasedStreamReader,
+    FileReadMode,
+)
 from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
 from airbyte_cdk.sources.file_based.schema_helpers import SchemaType

@@ -63,7 +73,11 @@ class ExcelParser(FileTypeParser):
             fields[column] = self.dtype_to_json_type(prev_frame_column_type, df_type)
 
         schema = {
-            field: ({"type": "string", "format": "date-time"} if fields[field] == "date-time" else {"type": fields[field]})
+            field: (
+                {"type": "string", "format": "date-time"}
+                if fields[field] == "date-time"
+                else {"type": fields[field]}
+            )
             for field in fields
         }
         return schema

@@ -101,11 +115,15 @@ class ExcelParser(FileTypeParser):
                # DataFrame.to_dict() method returns datetime values in pandas.Timestamp values, which are not serializable by orjson
                # DataFrame.to_json() returns string with datetime values serialized to iso8601 with microseconds to align with pydantic behavior
                # see PR description: https://github.com/airbytehq/airbyte/pull/44444/
-                yield from orjson.loads(df.to_json(orient="records", date_format="iso", date_unit="us"))
+                yield from orjson.loads(
+                    df.to_json(orient="records", date_format="iso", date_unit="us")
+                )
 
        except Exception as exc:
            # Raise a RecordParseError if any exception occurs during parsing
-            raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri) from exc
+            raise RecordParseError(
+                FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri
+            ) from exc
 
    @property
    def file_read_mode(self) -> FileReadMode:

@@ -133,7 +151,7 @@ class ExcelParser(FileTypeParser):
        if current_type == "string":
            # Previous column values were of the string type, no need to look further.
            return current_type
-        if dtype == object:
+        if dtype is object:
            return "string"
        if dtype in number_types and (not current_type or current_type == "number"):
            return "number"
airbyte_cdk/sources/file_based/file_types/file_transfer.py

@@ -15,7 +15,11 @@ DEFAULT_LOCAL_DIRECTORY = "/tmp/airbyte-file-transfer"
 
 class FileTransfer:
     def __init__(self) -> None:
-        self._local_directory = AIRBYTE_STAGING_DIRECTORY if os.path.exists(AIRBYTE_STAGING_DIRECTORY) else DEFAULT_LOCAL_DIRECTORY
+        self._local_directory = (
+            AIRBYTE_STAGING_DIRECTORY
+            if os.path.exists(AIRBYTE_STAGING_DIRECTORY)
+            else DEFAULT_LOCAL_DIRECTORY
+        )
 
     def get_file(
         self,

@@ -25,7 +29,9 @@ class FileTransfer:
         logger: logging.Logger,
     ) -> Iterable[Dict[str, Any]]:
         try:
-            yield stream_reader.get_file(file=file, local_directory=self._local_directory, logger=logger)
+            yield stream_reader.get_file(
+                file=file, local_directory=self._local_directory, logger=logger
+            )
         except Exception as ex:
             logger.error("An error has occurred while getting file: %s", str(ex))
             raise ex
airbyte_cdk/sources/file_based/file_types/file_type_parser.py

@@ -7,7 +7,10 @@ from abc import ABC, abstractmethod
 from typing import Any, Dict, Iterable, Mapping, Optional, Tuple
 
 from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
-from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
+from airbyte_cdk.sources.file_based.file_based_stream_reader import (
+    AbstractFileBasedStreamReader,
+    FileReadMode,
+)
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
 from airbyte_cdk.sources.file_based.schema_helpers import SchemaType
 
airbyte_cdk/sources/file_based/file_types/jsonl_parser.py

@@ -8,15 +8,21 @@ from typing import Any, Dict, Iterable, Mapping, Optional, Tuple, Union
 
 from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
 from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError
-from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
+from airbyte_cdk.sources.file_based.file_based_stream_reader import (
+    AbstractFileBasedStreamReader,
+    FileReadMode,
+)
 from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
-from airbyte_cdk.sources.file_based.schema_helpers import PYTHON_TYPE_MAPPING, SchemaType, merge_schemas
+from airbyte_cdk.sources.file_based.schema_helpers import (
+    PYTHON_TYPE_MAPPING,
+    SchemaType,
+    merge_schemas,
+)
 from orjson import orjson
 
 
 class JsonlParser(FileTypeParser):
-
     MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE = 1_000_000
     ENCODING = "utf8"
 

@@ -103,7 +109,9 @@ class JsonlParser(FileTypeParser):
             try:
                 record = orjson.loads(accumulator)
                 if had_json_parsing_error and not has_warned_for_multiline_json_object:
-                    logger.warning(f"File at {file.uri} is using multiline JSON. Performance could be greatly reduced")
+                    logger.warning(
+                        f"File at {file.uri} is using multiline JSON. Performance could be greatly reduced"
+                    )
                     has_warned_for_multiline_json_object = True
 
                 yield record

@@ -112,7 +120,11 @@ class JsonlParser(FileTypeParser):
             except orjson.JSONDecodeError:
                 had_json_parsing_error = True
 
-            if read_limit and yielded_at_least_once and read_bytes >= self.MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE:
+            if (
+                read_limit
+                and yielded_at_least_once
+                and read_bytes >= self.MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE
+            ):
                 logger.warning(
                     f"Exceeded the maximum number of bytes per file for schema inference ({self.MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE}). "
                     f"Inferring schema from an incomplete set of records."

@@ -120,7 +132,9 @@ class JsonlParser(FileTypeParser):
                 break
 
         if had_json_parsing_error and not yielded_at_least_once:
-            raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri, lineno=line)
+            raise RecordParseError(
+                FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri, lineno=line
+            )
 
     @staticmethod
     def _instantiate_accumulator(line: Union[bytes, str]) -> Union[bytes, str]:
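The jsonl_parser hunks reflow the multiline-JSON warning, the schema-inference byte cap, and the final parse-error guard without changing control flow. A condensed sketch of the accumulate-then-parse loop those hunks sit inside (simplified and using stdlib json in place of orjson; the helper name and buffering details are illustrative, not the CDK's exact implementation):

    import json
    from typing import Any, Dict, Iterable, List

    def parse_jsonl(lines: Iterable[str]) -> Iterable[Dict[str, Any]]:
        # Accumulate lines until they form valid JSON, so pretty-printed
        # (multiline) objects still parse -- at the cost of per-line retries.
        accumulator: List[str] = []
        had_json_parsing_error = False
        for line in lines:
            accumulator.append(line)
            try:
                record = json.loads("".join(accumulator))
            except json.JSONDecodeError:
                had_json_parsing_error = True
                continue
            if had_json_parsing_error:
                print("multiline JSON detected; performance may be reduced")
                had_json_parsing_error = False
            yield record
            accumulator = []

    rows = list(parse_jsonl(['{"a": 1}', '{', '  "b": 2', '}']))
    print(rows)  # [{'a': 1}, {'b': 2}]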