PyPI - airbyte-cdk - Versions diffs - 0.51.6__py3-none-any.whl → 0.51.7__py3-none-any.whl - Mend

airbyte-cdk 0.51.6__py3-none-any.whl → 0.51.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

airbyte_cdk/sources/declarative/models/declarative_component_schema.py CHANGED Viewed

@@ -1,7 +1,3 @@
-#
-# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
-#
 # generated by datamodel-codegen:
 #   filename:  declarative_component_schema.yaml

airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py CHANGED Viewed

@@ -66,9 +66,7 @@ class AbstractFileBasedSpec(BaseModel):
         for format in objects_to_check["oneOf"]:
             for key in format["properties"]:
                 object_property = format["properties"][key]
-                if "allOf" in object_property and "enum" in object_property["allOf"][0]:
-                    object_property["enum"] = object_property["allOf"][0]["enum"]
-                    object_property.pop("allOf")
+                AbstractFileBasedSpec.move_enum_to_root(object_property)
         properties_to_change = ["validation_policy"]
         for property_to_change in properties_to_change:
@@ -76,7 +74,24 @@ class AbstractFileBasedSpec(BaseModel):
             if "anyOf" in property_object:
                 schema["properties"]["streams"]["items"]["properties"][property_to_change]["type"] = "object"
                 schema["properties"]["streams"]["items"]["properties"][property_to_change]["oneOf"] = property_object.pop("anyOf")
-            if "allOf" in property_object and "enum" in property_object["allOf"][0]:
-                property_object["enum"] = property_object["allOf"][0]["enum"]
-                property_object.pop("allOf")
+            AbstractFileBasedSpec.move_enum_to_root(property_object)
+        csv_format_schemas = list(
+            filter(
+                lambda format: format["properties"]["filetype"]["default"] == "csv",
+                schema["properties"]["streams"]["items"]["properties"]["format"]["oneOf"],
+            )
+        )
+        if len(csv_format_schemas) != 1:
+            raise ValueError(f"Expecting only one CSV format but got {csv_format_schemas}")
+        csv_format_schemas[0]["properties"]["header_definition"]["oneOf"] = csv_format_schemas[0]["properties"]["header_definition"].pop(
+            "anyOf", []
+        )
+        csv_format_schemas[0]["properties"]["header_definition"]["type"] = "object"
         return schema
+    @staticmethod
+    def move_enum_to_root(object_property: Dict[str, Any]) -> None:
+        if "allOf" in object_property and "enum" in object_property["allOf"][0]:
+            object_property["enum"] = object_property["allOf"][0]["enum"]
+            object_property.pop("allOf")

airbyte_cdk/sources/file_based/config/csv_format.py CHANGED Viewed

@@ -4,9 +4,9 @@
 import codecs
 from enum import Enum
-from typing import Optional, Set
+from typing import Any, Dict, List, Optional, Set, Union
-from pydantic import BaseModel, Field, validator
+from pydantic import BaseModel, Field, ValidationError, root_validator, validator
 from typing_extensions import Literal
@@ -15,6 +15,52 @@ class InferenceType(Enum):
     PRIMITIVE_TYPES_ONLY = "Primitive Types Only"
+class CsvHeaderDefinitionType(Enum):
+    FROM_CSV = "From CSV"
+    AUTOGENERATED = "Autogenerated"
+    USER_PROVIDED = "User Provided"
+class CsvHeaderFromCsv(BaseModel):
+    class Config:
+        title = "From CSV"
+    header_definition_type: Literal[CsvHeaderDefinitionType.FROM_CSV.value] = CsvHeaderDefinitionType.FROM_CSV.value  # type: ignore
+    def has_header_row(self) -> bool:
+        return True
+class CsvHeaderAutogenerated(BaseModel):
+    class Config:
+        title = "Autogenerated"
+    header_definition_type: Literal[CsvHeaderDefinitionType.AUTOGENERATED.value] = CsvHeaderDefinitionType.AUTOGENERATED.value  # type: ignore
+    def has_header_row(self) -> bool:
+        return False
+class CsvHeaderUserProvided(BaseModel):
+    class Config:
+        title = "User Provided"
+    header_definition_type: Literal[CsvHeaderDefinitionType.USER_PROVIDED.value] = CsvHeaderDefinitionType.USER_PROVIDED.value  # type: ignore
+    column_names: List[str] = Field(
+        title="Column Names",
+        description="The column names that will be used while emitting the CSV records",
+    )
+    def has_header_row(self) -> bool:
+        return False
+    @validator("column_names")
+    def validate_column_names(cls, v: List[str]) -> List[str]:
+        if not v:
+            raise ValueError("At least one column name needs to be provided when using user provided headers")
+        return v
 DEFAULT_TRUE_VALUES = ["y", "yes", "t", "true", "on", "1"]
 DEFAULT_FALSE_VALUES = ["n", "no", "f", "false", "off", "0"]
@@ -64,10 +110,10 @@ class CsvFormat(BaseModel):
     skip_rows_after_header: int = Field(
         title="Skip Rows After Header", default=0, description="The number of rows to skip after the header row."
     )
-    autogenerate_column_names: bool = Field(
-        title="Autogenerate Column Names",
-        default=False,
-        description="Whether to autogenerate column names if column_names is empty. If true, column names will be of the form “f0”, “f1”… If false, column names will be read from the first CSV row after skip_rows_before_header.",
+    header_definition: Union[CsvHeaderFromCsv, CsvHeaderAutogenerated, CsvHeaderUserProvided] = Field(
+        title="CSV Header Definition",
+        default=CsvHeaderFromCsv(),
+        description="How headers will be defined. `User Provided` assumes the CSV does not have a header row and uses the headers provided and `Autogenerated` assumes the CSV does not have a header row and the CDK will generate headers using for `f{i}` where `i` is the index starting from 0. Else, the default behavior is to use the header from the CSV file. If a user wants to autogenerate or provide column names for a CSV having headers, they can skip rows.",
     )
     true_values: Set[str] = Field(
         title="True Values",
@@ -113,3 +159,15 @@ class CsvFormat(BaseModel):
         except LookupError:
             raise ValueError(f"invalid encoding format: {v}")
         return v
+    @root_validator
+    def validate_optional_args(cls, values: Dict[str, Any]) -> Dict[str, Any]:
+        definition_type = values.get("header_definition_type")
+        column_names = values.get("user_provided_column_names")
+        if definition_type == CsvHeaderDefinitionType.USER_PROVIDED and not column_names:
+            raise ValidationError("`user_provided_column_names` should be defined if the definition 'User Provided'.", model=CsvFormat)
+        if definition_type != CsvHeaderDefinitionType.USER_PROVIDED and column_names:
+            raise ValidationError(
+                "`user_provided_column_names` should not be defined if the definition is not 'User Provided'.", model=CsvFormat
+            )
+        return values

airbyte_cdk/sources/file_based/file_types/csv_parser.py CHANGED Viewed

@@ -11,7 +11,7 @@ from functools import partial
 from io import IOBase
 from typing import Any, Callable, Dict, Generator, Iterable, List, Mapping, Optional, Set
-from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat, InferenceType
+from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat, CsvHeaderAutogenerated, CsvHeaderUserProvided, InferenceType
 from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
 from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError
 from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
@@ -48,11 +48,9 @@ class _CsvReader:
         with stream_reader.open_file(file, file_read_mode, config_format.encoding, logger) as fp:
             headers = self._get_headers(fp, config_format, dialect_name)
-            # we assume that if we autogenerate columns, it is because we don't have headers
-            # if a user wants to autogenerate_column_names with a CSV having headers, he can skip rows
             rows_to_skip = (
                 config_format.skip_rows_before_header
-                + (0 if config_format.autogenerate_column_names else 1)
+                + (1 if config_format.header_definition.has_header_row() else 0)
                 + config_format.skip_rows_after_header
             )
             self._skip_rows(fp, rows_to_skip)
@@ -74,8 +72,11 @@ class _CsvReader:
         Assumes the fp is pointing to the beginning of the files and will reset it as such
         """
         # Note that this method assumes the dialect has already been registered if we're parsing the headers
+        if isinstance(config_format.header_definition, CsvHeaderUserProvided):
+            return config_format.header_definition.column_names  # type: ignore  # should be CsvHeaderUserProvided given the type
         self._skip_rows(fp, config_format.skip_rows_before_header)
-        if config_format.autogenerate_column_names:
+        if isinstance(config_format.header_definition, CsvHeaderAutogenerated):
             headers = self._auto_generate_headers(fp, dialect_name)
         else:
             # Then read the header

{airbyte_cdk-0.51.6.dist-info → airbyte_cdk-0.51.7.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: airbyte-cdk
-Version: 0.51.6
+Version: 0.51.7
 Summary: A framework for writing Airbyte Connectors.
 Home-page: https://github.com/airbytehq/airbyte
 Author: Airbyte

{airbyte_cdk-0.51.6.dist-info → airbyte_cdk-0.51.7.dist-info}/RECORD RENAMED Viewed

@@ -64,7 +64,7 @@ airbyte_cdk/sources/declarative/interpolation/interpolation.py,sha256=dyIM-bzh54
 airbyte_cdk/sources/declarative/interpolation/jinja.py,sha256=Dc0F87nElWsz_Ikj938eQ9uqZvyqgFhZ8Dqf_-hvndc,4800
 airbyte_cdk/sources/declarative/interpolation/macros.py,sha256=V6WGKJ9cXX1rjuM4bK3Cs9xEryMlkY2U3FMsSBhrgC8,3098
 airbyte_cdk/sources/declarative/models/__init__.py,sha256=EiYnzwCHZV7EYqMJqcy6xKSeHvTKZBsQndjbEwmiTW4,93
-airbyte_cdk/sources/declarative/models/declarative_component_schema.py,sha256=-Y8Nw3-6ZtvsATixMSAWteHCvYQU965dn4NpVq6aWYs,57232
+airbyte_cdk/sources/declarative/models/declarative_component_schema.py,sha256=ebor38wlQVqYD2QXk5X8v9xDZl0cEpIc2mFaKvpuiPE,57170
 airbyte_cdk/sources/declarative/parsers/__init__.py,sha256=ZnqYNxHsKCgO38IwB34RQyRMXTs4GTvlRi3ImKnIioo,61
 airbyte_cdk/sources/declarative/parsers/class_types_registry.py,sha256=bK4a74opm6WHyV7HqOVws6GE5Z7cLNc5MaTha69abIQ,6086
 airbyte_cdk/sources/declarative/parsers/custom_exceptions.py,sha256=y7_G5mM07zxT5YG975kdC2PAja-Uc83pYp8WrV3GNdo,522
@@ -146,9 +146,9 @@ airbyte_cdk/sources/file_based/availability_strategy/__init__.py,sha256=WiPPuQGf
 airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py,sha256=NeHCiG4FFohzYpQQFfmTL4-5oI0nElHWgXX1xrm8-SU,1269
 airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py,sha256=InGBlGbInuNUQ6oaK5A9oICVc7ZNHMSYo8g5Vy2smOo,4266
 airbyte_cdk/sources/file_based/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py,sha256=P-CupGlif5XbKm6tc3FVC4WRMU4ogUbB3klcuZmZJ1k,3940
+airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py,sha256=WrV4sKtJoZ1dK31HK7NdBKlnYHkmu6NqjmEpkVqJ6tQ,4582
 airbyte_cdk/sources/file_based/config/avro_format.py,sha256=qGBB0RTjWDGZW-ilIwIq9OZl2BC-jBaq2WGrI3WVBsQ,597
-airbyte_cdk/sources/file_based/config/csv_format.py,sha256=lLyjOqp2gNrXcGtSWozheMMfUQcy0NBUAMWwmDr_B7A,4672
+airbyte_cdk/sources/file_based/config/csv_format.py,sha256=-r-uGQlo-nXfhPuOR05XtYx_1vht74r8_am2_p8mcP8,7166
 airbyte_cdk/sources/file_based/config/file_based_stream_config.py,sha256=5R0UlPJUGGx5OnpezZ0Fd8dyO4y2vMZtiPZR_3rfvSk,5916
 airbyte_cdk/sources/file_based/config/jsonl_format.py,sha256=B-s1uy9RiKpKMwmMlR7UT3WeQPlTI-xclD0fVM4IU1Q,254
 airbyte_cdk/sources/file_based/config/parquet_format.py,sha256=zvcHATNKoBIgU2UXuGnoldqLoRXG_X8ZzAkpqGPJtq4,625
@@ -157,7 +157,7 @@ airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py,sha
 airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py,sha256=Mx3zT9Dem4uNfaUT0oOtrESsuB1LrGAi5N-uw2swZZA,701
 airbyte_cdk/sources/file_based/file_types/__init__.py,sha256=N3a8cjXwRUN2__46IJTwrWlsyFiSA1xtSgPcPH28sn0,476
 airbyte_cdk/sources/file_based/file_types/avro_parser.py,sha256=7PVaW17wn80HYW1mu074X2dy0UgFoqFqGIOKN2ZMKD0,8686
-airbyte_cdk/sources/file_based/file_types/csv_parser.py,sha256=4YdsyH7ntoadhRqMXPl8rertWsQbtE0aJihp9V5zIlg,16586
+airbyte_cdk/sources/file_based/file_types/csv_parser.py,sha256=KgdpdkHAFducvXM2jQr356M0WVol-vX0cm42n9Kf_Yc,16684
 airbyte_cdk/sources/file_based/file_types/file_type_parser.py,sha256=cThTLc1YKSAapOn70lB09SzruRIPSShGIMz1f92QYV8,1555
 airbyte_cdk/sources/file_based/file_types/jsonl_parser.py,sha256=Kz6HLF0CrFHQ1Y6rJKGr7KmBWSLeDYFQmkg0WIi7Frg,5395
 airbyte_cdk/sources/file_based/file_types/parquet_parser.py,sha256=QulQ_soGb1LpQ_KTxqWZjmfACGkTUDUOeuSmNFtcSLk,8717
@@ -314,16 +314,17 @@ unit_tests/sources/file_based/availability_strategy/__init__.py,sha256=47DEQpj8H
 unit_tests/sources/file_based/availability_strategy/test_default_file_based_availability_strategy.py,sha256=HzxFJVJFv3YpjVmJm45ZyS2HpbnhtEX2hm4r8VjkRFE,2463
 unit_tests/sources/file_based/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unit_tests/sources/file_based/config/test_abstract_file_based_spec.py,sha256=wmZAC-nBiUedMZi0n4zaC9oiZD9UTuYP5zJC1xxRnME,1216
+unit_tests/sources/file_based/config/test_csv_format.py,sha256=VYL-9Ec8hW_yO2Pj9F8pDfITcgEAFtSublYda7ut7QE,1132
 unit_tests/sources/file_based/config/test_file_based_stream_config.py,sha256=1eMsHlMQIFwyw20HjnhgKuiw6399sMcLTQ4LP09kTT4,3060
 unit_tests/sources/file_based/file_types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unit_tests/sources/file_based/file_types/test_avro_parser.py,sha256=INqwKXcgNb3h_tktNXYU6WNUD-iNwRYHCd3IrnQa5R4,11051
-unit_tests/sources/file_based/file_types/test_csv_parser.py,sha256=KB4WDy3aMAZ0CmJiqFaTUOZlK4urpvG9bwcwQ-h2-VY,20303
+unit_tests/sources/file_based/file_types/test_csv_parser.py,sha256=4onvErJCMNeSquZr7c1dX4TzqJlvQ3wulYCjAU_IblU,21266
 unit_tests/sources/file_based/file_types/test_jsonl_parser.py,sha256=foTf9U9LyAS8OR0BonwNgFWPqTrmzFV2lpPUfRMrioE,6134
 unit_tests/sources/file_based/file_types/test_parquet_parser.py,sha256=D7sKTty8aEqMDWWGKWUqDbWjTxhGkygU7ns4-_JceRY,13543
 unit_tests/sources/file_based/scenarios/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unit_tests/sources/file_based/scenarios/avro_scenarios.py,sha256=xUfw0crAvSTgQ2-chJx2ZiigQyo5IfrCuOFC1TWXXsQ,29795
 unit_tests/sources/file_based/scenarios/check_scenarios.py,sha256=xZf28rlH93ap8JCkAjwocng-uAW-mvMx6BDOLbvVCig,5588
-unit_tests/sources/file_based/scenarios/csv_scenarios.py,sha256=UVdDblKj3R5qQkh-dj4xqZ2822GyJuymaAerWbX9HeE,95707
+unit_tests/sources/file_based/scenarios/csv_scenarios.py,sha256=nG4O2Ah0Uwgjg6SVTuioO_gPOigKxm-PlM2Tw21svYw,98724
 unit_tests/sources/file_based/scenarios/incremental_scenarios.py,sha256=0maHng11cFmvzFLOniyBxOEYoKj4DYR3NO9-pSYoFLs,60710
 unit_tests/sources/file_based/scenarios/jsonl_scenarios.py,sha256=N83fga4gMKkbm6hYnen1Z5p5eEgjnMB_M_sXx6B96cU,27503
 unit_tests/sources/file_based/scenarios/parquet_scenarios.py,sha256=-cBO1ZwberBxNMqDOtKz8yGwm3zB7elz_st2NKHeczM,26955
@@ -354,8 +355,8 @@ unit_tests/utils/test_schema_inferrer.py,sha256=Z2jHBZ540wnYkylIdV_2xr75Vtwlxuyg
 unit_tests/utils/test_secret_utils.py,sha256=XKe0f1RHYii8iwE6ATmBr5JGDI1pzzrnZUGdUSMJQP4,4886
 unit_tests/utils/test_stream_status_utils.py,sha256=NpV155JMXA6CG-2Zvofa14lItobyh3Onttc59X4m5DI,3382
 unit_tests/utils/test_traced_exception.py,sha256=bDFP5zMBizFenz6V2WvEZTRCKGB5ijh3DBezjbfoYIs,4198
-airbyte_cdk-0.51.6.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
-airbyte_cdk-0.51.6.dist-info/METADATA,sha256=6H-ZFSOZ3NomomXDhz87qdpIVMRJ1Abd2x61R9cHV0Y,9399
-airbyte_cdk-0.51.6.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
-airbyte_cdk-0.51.6.dist-info/top_level.txt,sha256=edvsDKTnE6sD2wfCUaeTfKf5gQIL6CPVMwVL2sWZzqo,51
-airbyte_cdk-0.51.6.dist-info/RECORD,,
+airbyte_cdk-0.51.7.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
+airbyte_cdk-0.51.7.dist-info/METADATA,sha256=YOrAlHsZod4Nq3VugY7nbE7MDd8r8ZU7gcvX4YzTuk0,9399
+airbyte_cdk-0.51.7.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
+airbyte_cdk-0.51.7.dist-info/top_level.txt,sha256=edvsDKTnE6sD2wfCUaeTfKf5gQIL6CPVMwVL2sWZzqo,51
+airbyte_cdk-0.51.7.dist-info/RECORD,,

unit_tests/sources/file_based/config/test_csv_format.py ADDED Viewed

@@ -0,0 +1,28 @@
+#
+# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
+#
+import unittest
+import pytest
+from airbyte_cdk.sources.file_based.config.csv_format import CsvHeaderAutogenerated, CsvHeaderFromCsv, CsvHeaderUserProvided
+from pydantic import ValidationError
+class CsvHeaderDefinitionTest(unittest.TestCase):
+    def test_given_user_provided_and_not_column_names_provided_then_raise_exception(self) -> None:
+        with pytest.raises(ValidationError):
+            CsvHeaderUserProvided(column_names=[])
+    def test_given_user_provided_and_column_names_then_config_is_valid(self) -> None:
+        # no error means that this test succeeds
+        CsvHeaderUserProvided(column_names=["1", "2", "3"])
+    def test_given_user_provided_then_csv_does_not_have_header_row(self) -> None:
+        assert not CsvHeaderUserProvided(column_names=["1", "2", "3"]).has_header_row()
+    def test_given_autogenerated_then_csv_does_not_have_header_row(self) -> None:
+        assert not CsvHeaderAutogenerated().has_header_row()
+    def test_given_from_csv_then_csv_has_header_row(self) -> None:
+        assert CsvHeaderFromCsv().has_header_row()

unit_tests/sources/file_based/file_types/test_csv_parser.py CHANGED Viewed

@@ -13,7 +13,14 @@ from unittest import TestCase, mock
 from unittest.mock import Mock
 import pytest
-from airbyte_cdk.sources.file_based.config.csv_format import DEFAULT_FALSE_VALUES, DEFAULT_TRUE_VALUES, CsvFormat, InferenceType
+from airbyte_cdk.sources.file_based.config.csv_format import (
+    DEFAULT_FALSE_VALUES,
+    DEFAULT_TRUE_VALUES,
+    CsvFormat,
+    CsvHeaderAutogenerated,
+    CsvHeaderUserProvided,
+    InferenceType,
+)
 from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
 from airbyte_cdk.sources.file_based.exceptions import RecordParseError
 from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
@@ -278,13 +285,28 @@ class CsvReaderTest(unittest.TestCase):
         assert list(data_generator) == [{"header": "a value"}, {"header": "another value"}]
     def test_given_autogenerated_headers_when_read_data_then_generate_headers_with_format_fX(self) -> None:
-        self._config_format.autogenerate_column_names = True
+        self._config_format.header_definition = CsvHeaderAutogenerated()
         self._stream_reader.open_file.return_value = CsvFileBuilder().with_data(["0,1,2,3,4,5,6"]).build()
         data_generator = self._read_data()
         assert list(data_generator) == [{"f0": "0", "f1": "1", "f2": "2", "f3": "3", "f4": "4", "f5": "5", "f6": "6"}]
+    def test_given_user_provided_headers_when_read_data_then_use_user_provided_headers(self) -> None:
+        self._config_format.header_definition = CsvHeaderUserProvided(column_names=["first", "second", "third", "fourth"])
+        self._stream_reader.open_file.return_value = CsvFileBuilder().with_data(["0,1,2,3"]).build()
+        data_generator = self._read_data()
+        assert list(data_generator) == [{"first": "0", "second": "1", "third": "2", "fourth": "3"}]
+    def test_given_len_mistmatch_on_user_provided_headers_when_read_data_then_raise_error(self) -> None:
+        self._config_format.header_definition = CsvHeaderUserProvided(column_names=["missing", "one", "column"])
+        self._stream_reader.open_file.return_value = CsvFileBuilder().with_data(["0,1,2,3"]).build()
+        with pytest.raises(RecordParseError):
+            list(self._read_data())
     def test_given_skip_rows_after_header_when_read_data_then_do_not_parse_skipped_rows(self) -> None:
         self._config_format.skip_rows_after_header = 1
         self._stream_reader.open_file.return_value = (

unit_tests/sources/file_based/scenarios/csv_scenarios.py CHANGED Viewed

@@ -180,11 +180,43 @@ single_csv_scenario = (
                                                     "default": 0,
                                                     "type": "integer",
                                                 },
-                                                "autogenerate_column_names": {
-                                                    "title": "Autogenerate Column Names",
-                                                    "description": "Whether to autogenerate column names if column_names is empty. If true, column names will be of the form \u201cf0\u201d, \u201cf1\u201d\u2026 If false, column names will be read from the first CSV row after skip_rows_before_header.",
-                                                    "default": False,
-                                                    "type": "boolean",
+                                                "header_definition": {
+                                                    "title": "CSV Header Definition",
+                                                    "type": "object",
+                                                    "description": "How headers will be defined. `User Provided` assumes the CSV does not have a header row and uses the headers provided and `Autogenerated` assumes the CSV does not have a header row and the CDK will generate headers using for `f{i}` where `i` is the index starting from 0. Else, the default behavior is to use the header from the CSV file. If a user wants to autogenerate or provide column names for a CSV having headers, they can skip rows.",
+                                                    "default": {"header_definition_type": "From CSV"},
+                                                    "oneOf": [
+                                                        {
+                                                            "title": "From CSV",
+                                                            "type": "object",
+                                                            "properties": {
+                                                                "header_definition_type": {"title": "Header Definition Type", "default": "From CSV", "enum": ["From CSV"], "type": "string"},
+                                                            },
+                                                        },
+                                                        {
+                                                            "title": "Autogenerated",
+                                                            "type": "object",
+                                                            "properties": {
+                                                                "header_definition_type": {"title": "Header Definition Type", "default": "Autogenerated", "enum": ["Autogenerated"], "type": "string"},
+                                                            },
+                                                        },
+                                                        {
+                                                            "title": "User Provided",
+                                                            "type": "object",
+                                                            "properties": {
+                                                                "header_definition_type": {"title": "Header Definition Type", "default": "User Provided", "enum": ["User Provided"], "type": "string"},
+                                                                "column_names": {
+                                                                    "title": "Column Names",
+                                                                    "description": "The column names that will be used while emitting the CSV records",
+                                                                    "type": "array",
+                                                                    "items": {
+                                                                        "type": "string"
+                                                                    },
+                                                                }
+                                                            },
+                                                            "required": ["column_names"]
+                                                        },
+                                                    ]
                                                 },
                                                 "true_values": {
                                                     "title": "True Values",
@@ -761,7 +793,6 @@ csv_multi_stream_scenario = (
     )
 ).build()
 csv_custom_format_scenario = (
     TestScenarioBuilder()
     .set_name("csv_custom_format")
@@ -868,7 +899,6 @@ csv_custom_format_scenario = (
     )
 ).build()
 multi_stream_custom_format = (
     TestScenarioBuilder()
     .set_name("multi_stream_custom_format_scenario")
@@ -1016,7 +1046,6 @@ multi_stream_custom_format = (
     )
 ).build()
 empty_schema_inference_scenario = (
     TestScenarioBuilder()
     .set_name("empty_schema_inference_scenario")
@@ -1092,7 +1121,6 @@ empty_schema_inference_scenario = (
     )
 ).build()
 schemaless_csv_scenario = (
     TestScenarioBuilder()
     .set_name("schemaless_csv_scenario")
@@ -1188,7 +1216,6 @@ schemaless_csv_scenario = (
     )
 ).build()
 schemaless_csv_multi_stream_scenario = (
     TestScenarioBuilder()
     .set_name("schemaless_csv_multi_stream_scenario")
@@ -1296,7 +1323,6 @@ schemaless_csv_multi_stream_scenario = (
     )
 ).build()
 schemaless_with_user_input_schema_fails_connection_check_scenario = (
     TestScenarioBuilder()
     .set_name("schemaless_with_user_input_schema_fails_connection_check_scenario")
@@ -1361,7 +1387,6 @@ schemaless_with_user_input_schema_fails_connection_check_scenario = (
     .set_expected_read_error(ConfigValidationError, FileBasedSourceError.CONFIG_VALIDATION_ERROR.value)
 ).build()
 schemaless_with_user_input_schema_fails_connection_check_multi_stream_scenario = (
     TestScenarioBuilder()
     .set_name("schemaless_with_user_input_schema_fails_connection_check_multi_stream_scenario")
@@ -1446,7 +1471,6 @@ schemaless_with_user_input_schema_fails_connection_check_multi_stream_scenario =
     .set_expected_read_error(ConfigValidationError, FileBasedSourceError.CONFIG_VALIDATION_ERROR.value)
 ).build()
 csv_string_can_be_null_with_input_schemas_scenario = (
     TestScenarioBuilder()
     .set_name("csv_string_can_be_null_with_input_schema")
@@ -2143,7 +2167,6 @@ csv_custom_delimiter_in_double_quotes_scenario = (
     )
 ).build()
 csv_skip_before_header_scenario = (
     TestScenarioBuilder()
     .set_name("csv_skip_before_header")
@@ -2278,7 +2301,6 @@ csv_skip_after_header_scenario = (
     )
 ).build()
 csv_skip_before_and_after_header_scenario = (
     TestScenarioBuilder()
     .set_name("csv_skip_before_after_header")
@@ -2363,7 +2385,7 @@ csv_autogenerate_column_names_scenario = (
                     "validation_policy": "Emit Record",
                     "format": {
                         "filetype": "csv",
-                        "autogenerate_column_names": True,
+                        "header_definition": {"header_definition_type": "Autogenerated"},
                     },
                 }
             ],
@@ -2556,7 +2578,6 @@ csv_custom_null_values_scenario = (
     )
 ).build()
 earlier_csv_scenario = (
     TestScenarioBuilder()
     .set_name("earlier_csv_stream")

{airbyte_cdk-0.51.6.dist-info → airbyte_cdk-0.51.7.dist-info}/LICENSE.txt RENAMED Viewed

File without changes

{airbyte_cdk-0.51.6.dist-info → airbyte_cdk-0.51.7.dist-info}/WHEEL RENAMED Viewed

File without changes

{airbyte_cdk-0.51.6.dist-info → airbyte_cdk-0.51.7.dist-info}/top_level.txt RENAMED Viewed

File without changes

airbyte-cdk 0.51.6__py3-none-any.whl → 0.51.7__py3-none-any.whl

airbyte-cdk 0.51.6__py3-none-any.whl → 0.51.7__py3-none-any.whl