airbyte-cdk 0.51.5__py3-none-any.whl → 0.51.7__py3-none-any.whl

--- a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py
+++ b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py
@@ -1,7 +1,3 @@
-#
-# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
-#
-
 # generated by datamodel-codegen:
 # filename: declarative_component_schema.yaml
 
--- a/airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py
+++ b/airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py
@@ -55,7 +55,6 @@ class DefaultFileBasedAvailabilityStrategy(AbstractFileBasedAvailabilityStrategy
         """
         try:
             files = self._check_list_files(stream)
-            self._check_extensions(stream, files)
             self._check_parse_record(stream, files[0], logger)
         except CheckAvailabilityError:
             return False, "".join(traceback.format_exc())
@@ -73,11 +72,6 @@ class DefaultFileBasedAvailabilityStrategy(AbstractFileBasedAvailabilityStrategy
 
         return files
 
-    def _check_extensions(self, stream: "AbstractFileBasedStream", files: List[RemoteFile]) -> None:
-        if not all(f.extension_agrees_with_file_type(stream.config.file_type) for f in files):
-            raise CheckAvailabilityError(FileBasedSourceError.EXTENSION_MISMATCH, stream=stream.name)
-        return None
-
     def _check_parse_record(self, stream: "AbstractFileBasedStream", file: RemoteFile, logger: logging.Logger) -> None:
         parser = stream.get_parser(stream.config.file_type)
 
--- a/airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py
+++ b/airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py
@@ -66,9 +66,7 @@ class AbstractFileBasedSpec(BaseModel):
         for format in objects_to_check["oneOf"]:
             for key in format["properties"]:
                 object_property = format["properties"][key]
-                if "allOf" in object_property and "enum" in object_property["allOf"][0]:
-                    object_property["enum"] = object_property["allOf"][0]["enum"]
-                    object_property.pop("allOf")
+                AbstractFileBasedSpec.move_enum_to_root(object_property)
 
         properties_to_change = ["validation_policy"]
         for property_to_change in properties_to_change:
@@ -76,7 +74,24 @@ class AbstractFileBasedSpec(BaseModel):
             if "anyOf" in property_object:
                 schema["properties"]["streams"]["items"]["properties"][property_to_change]["type"] = "object"
                 schema["properties"]["streams"]["items"]["properties"][property_to_change]["oneOf"] = property_object.pop("anyOf")
-            if "allOf" in property_object and "enum" in property_object["allOf"][0]:
-                property_object["enum"] = property_object["allOf"][0]["enum"]
-                property_object.pop("allOf")
+            AbstractFileBasedSpec.move_enum_to_root(property_object)
+
+        csv_format_schemas = list(
+            filter(
+                lambda format: format["properties"]["filetype"]["default"] == "csv",
+                schema["properties"]["streams"]["items"]["properties"]["format"]["oneOf"],
+            )
+        )
+        if len(csv_format_schemas) != 1:
+            raise ValueError(f"Expecting only one CSV format but got {csv_format_schemas}")
+        csv_format_schemas[0]["properties"]["header_definition"]["oneOf"] = csv_format_schemas[0]["properties"]["header_definition"].pop(
+            "anyOf", []
+        )
+        csv_format_schemas[0]["properties"]["header_definition"]["type"] = "object"
         return schema
+
+    @staticmethod
+    def move_enum_to_root(object_property: Dict[str, Any]) -> None:
+        if "allOf" in object_property and "enum" in object_property["allOf"][0]:
+            object_property["enum"] = object_property["allOf"][0]["enum"]
+            object_property.pop("allOf")
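As a quick illustration of the hoisting the new `move_enum_to_root` helper performs, here is a sketch reusing the method's own logic; the sample property dict is made up:

    from typing import Any, Dict

    def move_enum_to_root(object_property: Dict[str, Any]) -> None:
        # pydantic renders an Enum-typed field as {"allOf": [{"enum": [...]}]};
        # hoisting the enum to the root gives form generators a flat property.
        if "allOf" in object_property and "enum" in object_property["allOf"][0]:
            object_property["enum"] = object_property["allOf"][0]["enum"]
            object_property.pop("allOf")

    prop = {"title": "Inference Type", "allOf": [{"enum": ["None", "Primitive Types Only"]}]}
    move_enum_to_root(prop)
    assert prop == {"title": "Inference Type", "enum": ["None", "Primitive Types Only"]}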
--- a/airbyte_cdk/sources/file_based/config/csv_format.py
+++ b/airbyte_cdk/sources/file_based/config/csv_format.py
@@ -4,9 +4,9 @@
 
 import codecs
 from enum import Enum
-from typing import Optional, Set
+from typing import Any, Dict, List, Optional, Set, Union
 
-from pydantic import BaseModel, Field, validator
+from pydantic import BaseModel, Field, ValidationError, root_validator, validator
 from typing_extensions import Literal
 
 
@@ -15,6 +15,52 @@ class InferenceType(Enum):
     PRIMITIVE_TYPES_ONLY = "Primitive Types Only"
 
 
+class CsvHeaderDefinitionType(Enum):
+    FROM_CSV = "From CSV"
+    AUTOGENERATED = "Autogenerated"
+    USER_PROVIDED = "User Provided"
+
+
+class CsvHeaderFromCsv(BaseModel):
+    class Config:
+        title = "From CSV"
+
+    header_definition_type: Literal[CsvHeaderDefinitionType.FROM_CSV.value] = CsvHeaderDefinitionType.FROM_CSV.value  # type: ignore
+
+    def has_header_row(self) -> bool:
+        return True
+
+
+class CsvHeaderAutogenerated(BaseModel):
+    class Config:
+        title = "Autogenerated"
+
+    header_definition_type: Literal[CsvHeaderDefinitionType.AUTOGENERATED.value] = CsvHeaderDefinitionType.AUTOGENERATED.value  # type: ignore
+
+    def has_header_row(self) -> bool:
+        return False
+
+
+class CsvHeaderUserProvided(BaseModel):
+    class Config:
+        title = "User Provided"
+
+    header_definition_type: Literal[CsvHeaderDefinitionType.USER_PROVIDED.value] = CsvHeaderDefinitionType.USER_PROVIDED.value  # type: ignore
+    column_names: List[str] = Field(
+        title="Column Names",
+        description="The column names that will be used while emitting the CSV records",
+    )
+
+    def has_header_row(self) -> bool:
+        return False
+
+    @validator("column_names")
+    def validate_column_names(cls, v: List[str]) -> List[str]:
+        if not v:
+            raise ValueError("At least one column name needs to be provided when using user provided headers")
+        return v
+
+
 DEFAULT_TRUE_VALUES = ["y", "yes", "t", "true", "on", "1"]
 DEFAULT_FALSE_VALUES = ["n", "no", "f", "false", "off", "0"]
 
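The behavioral difference between the three header-definition models is easiest to see side by side; this sketch mirrors the assertions in the new test_csv_format.py added later in this diff:

    from airbyte_cdk.sources.file_based.config.csv_format import (
        CsvHeaderAutogenerated,
        CsvHeaderFromCsv,
        CsvHeaderUserProvided,
    )

    # Only "From CSV" expects a header row inside the file itself.
    assert CsvHeaderFromCsv().has_header_row()
    assert not CsvHeaderAutogenerated().has_header_row()
    assert not CsvHeaderUserProvided(column_names=["id", "name"]).has_header_row()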
@@ -64,10 +110,10 @@ class CsvFormat(BaseModel):
     skip_rows_after_header: int = Field(
         title="Skip Rows After Header", default=0, description="The number of rows to skip after the header row."
     )
-    autogenerate_column_names: bool = Field(
-        title="Autogenerate Column Names",
-        default=False,
-        description="Whether to autogenerate column names if column_names is empty. If true, column names will be of the form “f0”, “f1”… If false, column names will be read from the first CSV row after skip_rows_before_header.",
+    header_definition: Union[CsvHeaderFromCsv, CsvHeaderAutogenerated, CsvHeaderUserProvided] = Field(
+        title="CSV Header Definition",
+        default=CsvHeaderFromCsv(),
+        description="How headers will be defined. `User Provided` assumes the CSV does not have a header row and uses the headers provided and `Autogenerated` assumes the CSV does not have a header row and the CDK will generate headers using for `f{i}` where `i` is the index starting from 0. Else, the default behavior is to use the header from the CSV file. If a user wants to autogenerate or provide column names for a CSV having headers, they can skip rows.",
     )
     true_values: Set[str] = Field(
         title="True Values",
@@ -113,3 +159,15 @@ class CsvFormat(BaseModel):
         except LookupError:
             raise ValueError(f"invalid encoding format: {v}")
         return v
+
+    @root_validator
+    def validate_optional_args(cls, values: Dict[str, Any]) -> Dict[str, Any]:
+        definition_type = values.get("header_definition_type")
+        column_names = values.get("user_provided_column_names")
+        if definition_type == CsvHeaderDefinitionType.USER_PROVIDED and not column_names:
+            raise ValidationError("`user_provided_column_names` should be defined if the definition 'User Provided'.", model=CsvFormat)
+        if definition_type != CsvHeaderDefinitionType.USER_PROVIDED and column_names:
+            raise ValidationError(
+                "`user_provided_column_names` should not be defined if the definition is not 'User Provided'.", model=CsvFormat
+            )
+        return values
--- a/airbyte_cdk/sources/file_based/exceptions.py
+++ b/airbyte_cdk/sources/file_based/exceptions.py
@@ -7,7 +7,6 @@ from enum import Enum
 
 class FileBasedSourceError(Enum):
     EMPTY_STREAM = "No files were identified in the stream. This may be because there are no files in the specified container, or because your glob patterns did not match any files. Please verify that your source contains files last modified after the start_date and that your glob patterns are not overly strict."
-    EXTENSION_MISMATCH = "The file type that you specified for this stream does not agree with the extension of one or more files in the stream. You may need to modify your glob patterns."
     GLOB_PARSE_ERROR = (
         "Error parsing glob pattern. Please refer to the glob pattern rules at https://facelessuser.github.io/wcmatch/glob/#split."
     )
--- a/airbyte_cdk/sources/file_based/file_types/csv_parser.py
+++ b/airbyte_cdk/sources/file_based/file_types/csv_parser.py
@@ -11,7 +11,7 @@ from functools import partial
 from io import IOBase
 from typing import Any, Callable, Dict, Generator, Iterable, List, Mapping, Optional, Set
 
-from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat, InferenceType
+from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat, CsvHeaderAutogenerated, CsvHeaderUserProvided, InferenceType
 from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
 from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError
 from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
@@ -48,11 +48,9 @@ class _CsvReader:
         with stream_reader.open_file(file, file_read_mode, config_format.encoding, logger) as fp:
             headers = self._get_headers(fp, config_format, dialect_name)
 
-            # we assume that if we autogenerate columns, it is because we don't have headers
-            # if a user wants to autogenerate_column_names with a CSV having headers, he can skip rows
             rows_to_skip = (
                 config_format.skip_rows_before_header
-                + (0 if config_format.autogenerate_column_names else 1)
+                + (1 if config_format.header_definition.has_header_row() else 0)
                 + config_format.skip_rows_after_header
             )
             self._skip_rows(fp, rows_to_skip)
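The row-skipping arithmetic is now driven by the header definition rather than a boolean flag. A minimal sketch of the same expression (the helper name and sample values are illustrative):

    def rows_to_skip(skip_before: int, has_header_row: bool, skip_after: int) -> int:
        # Mirrors the expression above: the header line itself is only consumed
        # when the header definition says the file actually contains one.
        return skip_before + (1 if has_header_row else 0) + skip_after

    assert rows_to_skip(2, True, 1) == 4   # "From CSV": the header row is skipped over
    assert rows_to_skip(2, False, 1) == 3  # "Autogenerated"/"User Provided": no header row in the file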
@@ -74,8 +72,11 @@ class _CsvReader:
         Assumes the fp is pointing to the beginning of the files and will reset it as such
         """
         # Note that this method assumes the dialect has already been registered if we're parsing the headers
+        if isinstance(config_format.header_definition, CsvHeaderUserProvided):
+            return config_format.header_definition.column_names  # type: ignore  # should be CsvHeaderUserProvided given the type
+
         self._skip_rows(fp, config_format.skip_rows_before_header)
-        if config_format.autogenerate_column_names:
+        if isinstance(config_format.header_definition, CsvHeaderAutogenerated):
             headers = self._auto_generate_headers(fp, dialect_name)
         else:
             # Then read the header
--- a/airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py
+++ b/airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py
@@ -6,7 +6,7 @@ from abc import abstractmethod
 from functools import cached_property, lru_cache
 from typing import Any, Dict, Iterable, List, Mapping, Optional
 
-from airbyte_cdk.models import ConfiguredAirbyteCatalog, SyncMode
+from airbyte_cdk.models import SyncMode
 from airbyte_cdk.sources.file_based.availability_strategy import AbstractFileBasedAvailabilityStrategy
 from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig, PrimaryKeyType
 from airbyte_cdk.sources.file_based.discovery_policy import AbstractDiscoveryPolicy
@@ -38,7 +38,7 @@ class AbstractFileBasedStream(Stream):
     def __init__(
         self,
         config: FileBasedStreamConfig,
-        catalog_schema: Optional[ConfiguredAirbyteCatalog],
+        catalog_schema: Optional[Mapping[str, Any]],
         stream_reader: AbstractFileBasedStreamReader,
         availability_strategy: AbstractFileBasedAvailabilityStrategy,
         discovery_policy: AbstractDiscoveryPolicy,
--- a/airbyte_cdk-0.51.5.dist-info/METADATA
+++ b/airbyte_cdk-0.51.7.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: airbyte-cdk
-Version: 0.51.5
+Version: 0.51.7
 Summary: A framework for writing Airbyte Connectors.
 Home-page: https://github.com/airbytehq/airbyte
 Author: Airbyte
--- a/airbyte_cdk-0.51.5.dist-info/RECORD
+++ b/airbyte_cdk-0.51.7.dist-info/RECORD
@@ -64,7 +64,7 @@ airbyte_cdk/sources/declarative/interpolation/interpolation.py,sha256=dyIM-bzh54
 airbyte_cdk/sources/declarative/interpolation/jinja.py,sha256=Dc0F87nElWsz_Ikj938eQ9uqZvyqgFhZ8Dqf_-hvndc,4800
 airbyte_cdk/sources/declarative/interpolation/macros.py,sha256=V6WGKJ9cXX1rjuM4bK3Cs9xEryMlkY2U3FMsSBhrgC8,3098
 airbyte_cdk/sources/declarative/models/__init__.py,sha256=EiYnzwCHZV7EYqMJqcy6xKSeHvTKZBsQndjbEwmiTW4,93
-airbyte_cdk/sources/declarative/models/declarative_component_schema.py,sha256=-Y8Nw3-6ZtvsATixMSAWteHCvYQU965dn4NpVq6aWYs,57232
+airbyte_cdk/sources/declarative/models/declarative_component_schema.py,sha256=ebor38wlQVqYD2QXk5X8v9xDZl0cEpIc2mFaKvpuiPE,57170
 airbyte_cdk/sources/declarative/parsers/__init__.py,sha256=ZnqYNxHsKCgO38IwB34RQyRMXTs4GTvlRi3ImKnIioo,61
 airbyte_cdk/sources/declarative/parsers/class_types_registry.py,sha256=bK4a74opm6WHyV7HqOVws6GE5Z7cLNc5MaTha69abIQ,6086
 airbyte_cdk/sources/declarative/parsers/custom_exceptions.py,sha256=y7_G5mM07zxT5YG975kdC2PAja-Uc83pYp8WrV3GNdo,522
@@ -136,7 +136,7 @@ airbyte_cdk/sources/embedded/catalog.py,sha256=mIM7rO5CZAUIHKbrKwn1-Zn9_e3sLiHrT
 airbyte_cdk/sources/embedded/runner.py,sha256=kZ0CcUANuMjdZ4fmvp_w9P2IcsS9WSHxNqYHqMwcfXI,1390
 airbyte_cdk/sources/embedded/tools.py,sha256=-Z4tZ4AP1OTi_zrqFM3YV8Rt7c60wvsrv0Dc-rTZ2uw,744
 airbyte_cdk/sources/file_based/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-airbyte_cdk/sources/file_based/exceptions.py,sha256=9vko7ker-nJFJ2-vSzeFAqWNgrv2unSxPZ2tTKuBbnU,3845
+airbyte_cdk/sources/file_based/exceptions.py,sha256=4jwHysXT6r2o37Z7ch00nbo45wPVsmCorRYbYTmWd2Q,3656
 airbyte_cdk/sources/file_based/file_based_source.py,sha256=NCbXAGPWBQSPAf5x2U2eCdOLUd26RhO5s6K87_AF8Es,6931
 airbyte_cdk/sources/file_based/file_based_stream_reader.py,sha256=K9fFHcSL4E8v-X2l38wRAcZCjpyifr35orvby8vQt84,3749
 airbyte_cdk/sources/file_based/remote_file.py,sha256=s3Qz2N786yqSMXqcWmsTOvYhgs-ry0xFcn5fGyyz7bY,581
@@ -144,11 +144,11 @@ airbyte_cdk/sources/file_based/schema_helpers.py,sha256=XBkOutIw_n6SNYU34qbyTbl0
 airbyte_cdk/sources/file_based/types.py,sha256=INxG7OPnkdUP69oYNKMAbwhvV1AGvLRHs1J6pIia2FI,218
 airbyte_cdk/sources/file_based/availability_strategy/__init__.py,sha256=WiPPuQGfmQlFUMFR5h3ECc-VzBj4vair6_4WAL87AEI,277
 airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py,sha256=NeHCiG4FFohzYpQQFfmTL4-5oI0nElHWgXX1xrm8-SU,1269
-airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py,sha256=7k9YD8ZVkUpRHN4x3F84Do8ZA91Ph576r3cNdvLBizk,4635
+airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py,sha256=InGBlGbInuNUQ6oaK5A9oICVc7ZNHMSYo8g5Vy2smOo,4266
 airbyte_cdk/sources/file_based/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py,sha256=P-CupGlif5XbKm6tc3FVC4WRMU4ogUbB3klcuZmZJ1k,3940
+airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py,sha256=WrV4sKtJoZ1dK31HK7NdBKlnYHkmu6NqjmEpkVqJ6tQ,4582
 airbyte_cdk/sources/file_based/config/avro_format.py,sha256=qGBB0RTjWDGZW-ilIwIq9OZl2BC-jBaq2WGrI3WVBsQ,597
-airbyte_cdk/sources/file_based/config/csv_format.py,sha256=lLyjOqp2gNrXcGtSWozheMMfUQcy0NBUAMWwmDr_B7A,4672
+airbyte_cdk/sources/file_based/config/csv_format.py,sha256=-r-uGQlo-nXfhPuOR05XtYx_1vht74r8_am2_p8mcP8,7166
 airbyte_cdk/sources/file_based/config/file_based_stream_config.py,sha256=5R0UlPJUGGx5OnpezZ0Fd8dyO4y2vMZtiPZR_3rfvSk,5916
 airbyte_cdk/sources/file_based/config/jsonl_format.py,sha256=B-s1uy9RiKpKMwmMlR7UT3WeQPlTI-xclD0fVM4IU1Q,254
 airbyte_cdk/sources/file_based/config/parquet_format.py,sha256=zvcHATNKoBIgU2UXuGnoldqLoRXG_X8ZzAkpqGPJtq4,625
@@ -157,7 +157,7 @@ airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py,sha
 airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py,sha256=Mx3zT9Dem4uNfaUT0oOtrESsuB1LrGAi5N-uw2swZZA,701
 airbyte_cdk/sources/file_based/file_types/__init__.py,sha256=N3a8cjXwRUN2__46IJTwrWlsyFiSA1xtSgPcPH28sn0,476
 airbyte_cdk/sources/file_based/file_types/avro_parser.py,sha256=7PVaW17wn80HYW1mu074X2dy0UgFoqFqGIOKN2ZMKD0,8686
-airbyte_cdk/sources/file_based/file_types/csv_parser.py,sha256=4YdsyH7ntoadhRqMXPl8rertWsQbtE0aJihp9V5zIlg,16586
+airbyte_cdk/sources/file_based/file_types/csv_parser.py,sha256=KgdpdkHAFducvXM2jQr356M0WVol-vX0cm42n9Kf_Yc,16684
 airbyte_cdk/sources/file_based/file_types/file_type_parser.py,sha256=cThTLc1YKSAapOn70lB09SzruRIPSShGIMz1f92QYV8,1555
 airbyte_cdk/sources/file_based/file_types/jsonl_parser.py,sha256=Kz6HLF0CrFHQ1Y6rJKGr7KmBWSLeDYFQmkg0WIi7Frg,5395
 airbyte_cdk/sources/file_based/file_types/parquet_parser.py,sha256=QulQ_soGb1LpQ_KTxqWZjmfACGkTUDUOeuSmNFtcSLk,8717
@@ -165,7 +165,7 @@ airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py,sha256=sEV
 airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py,sha256=uwk6Ugf23xKG4PRPVVRVwpcNjTwPgxejl03vLSEzK0s,604
 airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py,sha256=ZeAa0z50ywMU2chNjQ7JpL4yePU1NajhBa8FS7rXLVo,1643
 airbyte_cdk/sources/file_based/stream/__init__.py,sha256=QPDqdgjsabOQD93dSFqHGaFS_3pIwm-chEabZHiPJi0,265
-airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py,sha256=tvVew6din9y8a3hItzU0PjTQrMxbVI7bK-3pRTvOswg,5810
+airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py,sha256=BXO0b4iHNJUsF7GVIWTnY5Zlj-IjHS_JmqQlKsSDgz8,5777
 airbyte_cdk/sources/file_based/stream/default_file_based_stream.py,sha256=eFYqN657J5A0sf9og_w7qea8lu2xtUobjYYDldfmbmA,11839
 airbyte_cdk/sources/file_based/stream/cursor/__init__.py,sha256=MhFB5hOo8sjwvCh8gangaymdg3EJWYt_72brFOZt068,191
 airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py,sha256=i-FPeK8lwCzX34GCcmvL5Yvdh8-uu7FeCVYDoFbD7IY,1920
@@ -308,20 +308,23 @@ unit_tests/sources/file_based/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5N
 unit_tests/sources/file_based/helpers.py,sha256=JNCRl13oLRRun2XyYLSKLzfrzzOAMT57yUY0vZasxL4,2567
 unit_tests/sources/file_based/in_memory_files_source.py,sha256=HSZEtN7wb_NhBx4LVAEeAaeTByIBYZLr6xXJLI0FFLU,7777
 unit_tests/sources/file_based/test_file_based_stream_reader.py,sha256=gVJcjj8Q83LTxcU3LL9gv-9SAY21umtOMDTy1Z7A9OU,7552
-unit_tests/sources/file_based/test_scenarios.py,sha256=3rvlUzR1e5UnVXcr8dKx01oYqteSOfYibmblPyiiA08,18245
+unit_tests/sources/file_based/test_scenarios.py,sha256=1s3hN6xkmqHKGa348rK3sDLf-PPiEx0w-qfRi70gQnc,18167
 unit_tests/sources/file_based/test_schema_helpers.py,sha256=XJ27ecw0sjlSnKgQqV1DgnnjKB1TR2btq22OITh1Qdk,12333
+unit_tests/sources/file_based/availability_strategy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+unit_tests/sources/file_based/availability_strategy/test_default_file_based_availability_strategy.py,sha256=HzxFJVJFv3YpjVmJm45ZyS2HpbnhtEX2hm4r8VjkRFE,2463
 unit_tests/sources/file_based/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unit_tests/sources/file_based/config/test_abstract_file_based_spec.py,sha256=wmZAC-nBiUedMZi0n4zaC9oiZD9UTuYP5zJC1xxRnME,1216
+unit_tests/sources/file_based/config/test_csv_format.py,sha256=VYL-9Ec8hW_yO2Pj9F8pDfITcgEAFtSublYda7ut7QE,1132
 unit_tests/sources/file_based/config/test_file_based_stream_config.py,sha256=1eMsHlMQIFwyw20HjnhgKuiw6399sMcLTQ4LP09kTT4,3060
 unit_tests/sources/file_based/file_types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unit_tests/sources/file_based/file_types/test_avro_parser.py,sha256=INqwKXcgNb3h_tktNXYU6WNUD-iNwRYHCd3IrnQa5R4,11051
-unit_tests/sources/file_based/file_types/test_csv_parser.py,sha256=KB4WDy3aMAZ0CmJiqFaTUOZlK4urpvG9bwcwQ-h2-VY,20303
+unit_tests/sources/file_based/file_types/test_csv_parser.py,sha256=4onvErJCMNeSquZr7c1dX4TzqJlvQ3wulYCjAU_IblU,21266
 unit_tests/sources/file_based/file_types/test_jsonl_parser.py,sha256=foTf9U9LyAS8OR0BonwNgFWPqTrmzFV2lpPUfRMrioE,6134
 unit_tests/sources/file_based/file_types/test_parquet_parser.py,sha256=D7sKTty8aEqMDWWGKWUqDbWjTxhGkygU7ns4-_JceRY,13543
 unit_tests/sources/file_based/scenarios/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unit_tests/sources/file_based/scenarios/avro_scenarios.py,sha256=xUfw0crAvSTgQ2-chJx2ZiigQyo5IfrCuOFC1TWXXsQ,29795
-unit_tests/sources/file_based/scenarios/check_scenarios.py,sha256=7DR49LCjns72Dv5-R-cg6_SUR1zpHtE9_uFEWoYwx1s,5834
-unit_tests/sources/file_based/scenarios/csv_scenarios.py,sha256=UVdDblKj3R5qQkh-dj4xqZ2822GyJuymaAerWbX9HeE,95707
+unit_tests/sources/file_based/scenarios/check_scenarios.py,sha256=xZf28rlH93ap8JCkAjwocng-uAW-mvMx6BDOLbvVCig,5588
+unit_tests/sources/file_based/scenarios/csv_scenarios.py,sha256=nG4O2Ah0Uwgjg6SVTuioO_gPOigKxm-PlM2Tw21svYw,98724
 unit_tests/sources/file_based/scenarios/incremental_scenarios.py,sha256=0maHng11cFmvzFLOniyBxOEYoKj4DYR3NO9-pSYoFLs,60710
 unit_tests/sources/file_based/scenarios/jsonl_scenarios.py,sha256=N83fga4gMKkbm6hYnen1Z5p5eEgjnMB_M_sXx6B96cU,27503
 unit_tests/sources/file_based/scenarios/parquet_scenarios.py,sha256=-cBO1ZwberBxNMqDOtKz8yGwm3zB7elz_st2NKHeczM,26955
@@ -352,8 +355,8 @@ unit_tests/utils/test_schema_inferrer.py,sha256=Z2jHBZ540wnYkylIdV_2xr75Vtwlxuyg
 unit_tests/utils/test_secret_utils.py,sha256=XKe0f1RHYii8iwE6ATmBr5JGDI1pzzrnZUGdUSMJQP4,4886
 unit_tests/utils/test_stream_status_utils.py,sha256=NpV155JMXA6CG-2Zvofa14lItobyh3Onttc59X4m5DI,3382
 unit_tests/utils/test_traced_exception.py,sha256=bDFP5zMBizFenz6V2WvEZTRCKGB5ijh3DBezjbfoYIs,4198
-airbyte_cdk-0.51.5.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
-airbyte_cdk-0.51.5.dist-info/METADATA,sha256=6S2hoA3Ua7D7zVi9BQ7Zm6xJzPj8uBbKdifYwD-L2pw,9399
-airbyte_cdk-0.51.5.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
-airbyte_cdk-0.51.5.dist-info/top_level.txt,sha256=edvsDKTnE6sD2wfCUaeTfKf5gQIL6CPVMwVL2sWZzqo,51
-airbyte_cdk-0.51.5.dist-info/RECORD,,
+airbyte_cdk-0.51.7.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
+airbyte_cdk-0.51.7.dist-info/METADATA,sha256=YOrAlHsZod4Nq3VugY7nbE7MDd8r8ZU7gcvX4YzTuk0,9399
+airbyte_cdk-0.51.7.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
+airbyte_cdk-0.51.7.dist-info/top_level.txt,sha256=edvsDKTnE6sD2wfCUaeTfKf5gQIL6CPVMwVL2sWZzqo,51
+airbyte_cdk-0.51.7.dist-info/RECORD,,
--- /dev/null
+++ b/unit_tests/sources/file_based/availability_strategy/test_default_file_based_availability_strategy.py
@@ -0,0 +1,52 @@
+#
+# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
+#
+
+import unittest
+from datetime import datetime
+from unittest.mock import Mock, PropertyMock
+
+from airbyte_cdk.sources.file_based.availability_strategy.default_file_based_availability_strategy import (
+    DefaultFileBasedAvailabilityStrategy,
+)
+from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
+from airbyte_cdk.sources.file_based.config.jsonl_format import JsonlFormat
+from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
+from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
+from airbyte_cdk.sources.file_based.remote_file import RemoteFile
+from airbyte_cdk.sources.file_based.stream import AbstractFileBasedStream
+
+_FILE_WITH_UNKNOWN_EXTENSION = RemoteFile(uri="a.unknown_extension", last_modified=datetime.now(), file_type="csv")
+_ANY_CONFIG = FileBasedStreamConfig(
+    name="config.name",
+    file_type="parquet",
+    format=JsonlFormat(),
+)
+_ANY_SCHEMA = {"key": "value"}
+
+
+class DefaultFileBasedAvailabilityStrategyTest(unittest.TestCase):
+
+    def setUp(self) -> None:
+        self._stream_reader = Mock(spec=AbstractFileBasedStreamReader)
+        self._strategy = DefaultFileBasedAvailabilityStrategy(self._stream_reader)
+
+        self._parser = Mock(spec=FileTypeParser)
+        self._stream = Mock(spec=AbstractFileBasedStream)
+        self._stream.get_parser.return_value = self._parser
+        self._stream.catalog_schema = _ANY_SCHEMA
+        self._stream.config = _ANY_CONFIG
+        self._stream.validation_policy = PropertyMock(validate_schema_before_sync=False)
+
+    def test_given_file_extension_does_not_match_when_check_availability_and_parsability_then_stream_is_still_available(self) -> None:
+        """
+        Before, we had a validation on the file extension, but it turns out that in production users sometimes have a mismatch there. The
+        example we've seen was for the JSONL parser where the file extension was just `.json`. Note that more than one record was extracted
+        from this stream, so it's not just that the file is one JSON object.
+        """
+        self._stream.list_files.return_value = [_FILE_WITH_UNKNOWN_EXTENSION]
+        self._parser.parse_records.return_value = [{"a record": 1}]
+
+        is_available, reason = self._strategy.check_availability_and_parsability(self._stream, Mock(), Mock())
+
+        assert is_available
--- /dev/null
+++ b/unit_tests/sources/file_based/config/test_csv_format.py
@@ -0,0 +1,28 @@
+#
+# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
+#
+
+import unittest
+
+import pytest
+from airbyte_cdk.sources.file_based.config.csv_format import CsvHeaderAutogenerated, CsvHeaderFromCsv, CsvHeaderUserProvided
+from pydantic import ValidationError
+
+
+class CsvHeaderDefinitionTest(unittest.TestCase):
+    def test_given_user_provided_and_not_column_names_provided_then_raise_exception(self) -> None:
+        with pytest.raises(ValidationError):
+            CsvHeaderUserProvided(column_names=[])
+
+    def test_given_user_provided_and_column_names_then_config_is_valid(self) -> None:
+        # no error means that this test succeeds
+        CsvHeaderUserProvided(column_names=["1", "2", "3"])
+
+    def test_given_user_provided_then_csv_does_not_have_header_row(self) -> None:
+        assert not CsvHeaderUserProvided(column_names=["1", "2", "3"]).has_header_row()
+
+    def test_given_autogenerated_then_csv_does_not_have_header_row(self) -> None:
+        assert not CsvHeaderAutogenerated().has_header_row()
+
+    def test_given_from_csv_then_csv_has_header_row(self) -> None:
+        assert CsvHeaderFromCsv().has_header_row()
--- a/unit_tests/sources/file_based/file_types/test_csv_parser.py
+++ b/unit_tests/sources/file_based/file_types/test_csv_parser.py
@@ -13,7 +13,14 @@ from unittest import TestCase, mock
 from unittest.mock import Mock
 
 import pytest
-from airbyte_cdk.sources.file_based.config.csv_format import DEFAULT_FALSE_VALUES, DEFAULT_TRUE_VALUES, CsvFormat, InferenceType
+from airbyte_cdk.sources.file_based.config.csv_format import (
+    DEFAULT_FALSE_VALUES,
+    DEFAULT_TRUE_VALUES,
+    CsvFormat,
+    CsvHeaderAutogenerated,
+    CsvHeaderUserProvided,
+    InferenceType,
+)
 from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
 from airbyte_cdk.sources.file_based.exceptions import RecordParseError
 from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
@@ -278,13 +285,28 @@ class CsvReaderTest(unittest.TestCase):
         assert list(data_generator) == [{"header": "a value"}, {"header": "another value"}]
 
     def test_given_autogenerated_headers_when_read_data_then_generate_headers_with_format_fX(self) -> None:
-        self._config_format.autogenerate_column_names = True
+        self._config_format.header_definition = CsvHeaderAutogenerated()
         self._stream_reader.open_file.return_value = CsvFileBuilder().with_data(["0,1,2,3,4,5,6"]).build()
 
         data_generator = self._read_data()
 
         assert list(data_generator) == [{"f0": "0", "f1": "1", "f2": "2", "f3": "3", "f4": "4", "f5": "5", "f6": "6"}]
 
+    def test_given_user_provided_headers_when_read_data_then_use_user_provided_headers(self) -> None:
+        self._config_format.header_definition = CsvHeaderUserProvided(column_names=["first", "second", "third", "fourth"])
+        self._stream_reader.open_file.return_value = CsvFileBuilder().with_data(["0,1,2,3"]).build()
+
+        data_generator = self._read_data()
+
+        assert list(data_generator) == [{"first": "0", "second": "1", "third": "2", "fourth": "3"}]
+
+    def test_given_len_mistmatch_on_user_provided_headers_when_read_data_then_raise_error(self) -> None:
+        self._config_format.header_definition = CsvHeaderUserProvided(column_names=["missing", "one", "column"])
+        self._stream_reader.open_file.return_value = CsvFileBuilder().with_data(["0,1,2,3"]).build()
+
+        with pytest.raises(RecordParseError):
+            list(self._read_data())
+
     def test_given_skip_rows_after_header_when_read_data_then_do_not_parse_skipped_rows(self) -> None:
         self._config_format.skip_rows_after_header = 1
         self._stream_reader.open_file.return_value = (
--- a/unit_tests/sources/file_based/scenarios/check_scenarios.py
+++ b/unit_tests/sources/file_based/scenarios/check_scenarios.py
@@ -134,14 +134,6 @@ error_empty_stream_scenario = (
 ).build()
 
 
-error_extension_mismatch_scenario = (
-    _base_failure_scenario.copy()
-    .set_name("error_extension_mismatch_scenario")
-    .set_file_type("jsonl")
-    .set_expected_check_error(None, FileBasedSourceError.EXTENSION_MISMATCH.value)
-).build()
-
-
 error_listing_files_scenario = (
     _base_failure_scenario.copy()
     .set_name("error_listing_files_scenario")
--- a/unit_tests/sources/file_based/scenarios/csv_scenarios.py
+++ b/unit_tests/sources/file_based/scenarios/csv_scenarios.py
@@ -180,11 +180,43 @@ single_csv_scenario = (
                 "default": 0,
                 "type": "integer",
             },
-            "autogenerate_column_names": {
-                "title": "Autogenerate Column Names",
-                "description": "Whether to autogenerate column names if column_names is empty. If true, column names will be of the form \u201cf0\u201d, \u201cf1\u201d\u2026 If false, column names will be read from the first CSV row after skip_rows_before_header.",
-                "default": False,
-                "type": "boolean",
+            "header_definition": {
+                "title": "CSV Header Definition",
+                "type": "object",
+                "description": "How headers will be defined. `User Provided` assumes the CSV does not have a header row and uses the headers provided and `Autogenerated` assumes the CSV does not have a header row and the CDK will generate headers using for `f{i}` where `i` is the index starting from 0. Else, the default behavior is to use the header from the CSV file. If a user wants to autogenerate or provide column names for a CSV having headers, they can skip rows.",
+                "default": {"header_definition_type": "From CSV"},
+                "oneOf": [
+                    {
+                        "title": "From CSV",
+                        "type": "object",
+                        "properties": {
+                            "header_definition_type": {"title": "Header Definition Type", "default": "From CSV", "enum": ["From CSV"], "type": "string"},
+                        },
+                    },
+                    {
+                        "title": "Autogenerated",
+                        "type": "object",
+                        "properties": {
+                            "header_definition_type": {"title": "Header Definition Type", "default": "Autogenerated", "enum": ["Autogenerated"], "type": "string"},
+                        },
+                    },
+                    {
+                        "title": "User Provided",
+                        "type": "object",
+                        "properties": {
+                            "header_definition_type": {"title": "Header Definition Type", "default": "User Provided", "enum": ["User Provided"], "type": "string"},
+                            "column_names": {
+                                "title": "Column Names",
+                                "description": "The column names that will be used while emitting the CSV records",
+                                "type": "array",
+                                "items": {"type": "string"},
+                            },
+                        },
+                        "required": ["column_names"],
+                    },
+                ],
             },
             "true_values": {
                 "title": "True Values",
@@ -761,7 +793,6 @@ csv_multi_stream_scenario = (
     )
 ).build()
 
-
 csv_custom_format_scenario = (
     TestScenarioBuilder()
     .set_name("csv_custom_format")
@@ -868,7 +899,6 @@ csv_custom_format_scenario = (
     )
 ).build()
 
-
 multi_stream_custom_format = (
     TestScenarioBuilder()
     .set_name("multi_stream_custom_format_scenario")
@@ -1016,7 +1046,6 @@ multi_stream_custom_format = (
     )
 ).build()
 
-
 empty_schema_inference_scenario = (
     TestScenarioBuilder()
     .set_name("empty_schema_inference_scenario")
@@ -1092,7 +1121,6 @@ empty_schema_inference_scenario = (
     )
 ).build()
 
-
 schemaless_csv_scenario = (
     TestScenarioBuilder()
     .set_name("schemaless_csv_scenario")
@@ -1188,7 +1216,6 @@ schemaless_csv_scenario = (
     )
 ).build()
 
-
 schemaless_csv_multi_stream_scenario = (
     TestScenarioBuilder()
     .set_name("schemaless_csv_multi_stream_scenario")
@@ -1296,7 +1323,6 @@ schemaless_csv_multi_stream_scenario = (
     )
 ).build()
 
-
 schemaless_with_user_input_schema_fails_connection_check_scenario = (
     TestScenarioBuilder()
     .set_name("schemaless_with_user_input_schema_fails_connection_check_scenario")
@@ -1361,7 +1387,6 @@ schemaless_with_user_input_schema_fails_connection_check_scenario = (
     .set_expected_read_error(ConfigValidationError, FileBasedSourceError.CONFIG_VALIDATION_ERROR.value)
 ).build()
 
-
 schemaless_with_user_input_schema_fails_connection_check_multi_stream_scenario = (
     TestScenarioBuilder()
     .set_name("schemaless_with_user_input_schema_fails_connection_check_multi_stream_scenario")
@@ -1446,7 +1471,6 @@ schemaless_with_user_input_schema_fails_connection_check_multi_stream_scenario =
     .set_expected_read_error(ConfigValidationError, FileBasedSourceError.CONFIG_VALIDATION_ERROR.value)
 ).build()
 
-
 csv_string_can_be_null_with_input_schemas_scenario = (
     TestScenarioBuilder()
     .set_name("csv_string_can_be_null_with_input_schema")
@@ -2143,7 +2167,6 @@ csv_custom_delimiter_in_double_quotes_scenario = (
     )
 ).build()
 
-
 csv_skip_before_header_scenario = (
     TestScenarioBuilder()
     .set_name("csv_skip_before_header")
@@ -2278,7 +2301,6 @@ csv_skip_after_header_scenario = (
     )
 ).build()
 
-
 csv_skip_before_and_after_header_scenario = (
     TestScenarioBuilder()
     .set_name("csv_skip_before_after_header")
@@ -2363,7 +2385,7 @@ csv_autogenerate_column_names_scenario = (
                     "validation_policy": "Emit Record",
                     "format": {
                         "filetype": "csv",
-                        "autogenerate_column_names": True,
+                        "header_definition": {"header_definition_type": "Autogenerated"},
                     },
                 }
             ],
@@ -2556,7 +2578,6 @@ csv_custom_null_values_scenario = (
     )
 ).build()
 
-
 earlier_csv_scenario = (
     TestScenarioBuilder()
     .set_name("earlier_csv_stream")
--- a/unit_tests/sources/file_based/test_scenarios.py
+++ b/unit_tests/sources/file_based/test_scenarios.py
@@ -24,7 +24,6 @@ from unit_tests.sources.file_based.scenarios.avro_scenarios import (
 )
 from unit_tests.sources.file_based.scenarios.check_scenarios import (
     error_empty_stream_scenario,
-    error_extension_mismatch_scenario,
     error_listing_files_scenario,
     error_multi_stream_scenario,
     error_reading_file_scenario,
@@ -309,7 +308,6 @@ def test_spec(capsys: CaptureFixture[str], scenario: TestScenario) -> None:
 
 check_scenarios = [
     error_empty_stream_scenario,
-    error_extension_mismatch_scenario,
     error_listing_files_scenario,
     error_reading_file_scenario,
     error_record_validation_user_provided_schema_scenario,