airbyte-cdk 0.51.6__py3-none-any.whl → 0.51.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +0 -4
- airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +21 -6
- airbyte_cdk/sources/file_based/config/csv_format.py +64 -6
- airbyte_cdk/sources/file_based/file_types/csv_parser.py +6 -5
- {airbyte_cdk-0.51.6.dist-info → airbyte_cdk-0.51.7.dist-info}/METADATA +1 -1
- {airbyte_cdk-0.51.6.dist-info → airbyte_cdk-0.51.7.dist-info}/RECORD +12 -11
- unit_tests/sources/file_based/config/test_csv_format.py +28 -0
- unit_tests/sources/file_based/file_types/test_csv_parser.py +24 -2
- unit_tests/sources/file_based/scenarios/csv_scenarios.py +38 -17
- {airbyte_cdk-0.51.6.dist-info → airbyte_cdk-0.51.7.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-0.51.6.dist-info → airbyte_cdk-0.51.7.dist-info}/WHEEL +0 -0
- {airbyte_cdk-0.51.6.dist-info → airbyte_cdk-0.51.7.dist-info}/top_level.txt +0 -0
@@ -66,9 +66,7 @@ class AbstractFileBasedSpec(BaseModel):
|
|
66
66
|
for format in objects_to_check["oneOf"]:
|
67
67
|
for key in format["properties"]:
|
68
68
|
object_property = format["properties"][key]
|
69
|
-
|
70
|
-
object_property["enum"] = object_property["allOf"][0]["enum"]
|
71
|
-
object_property.pop("allOf")
|
69
|
+
AbstractFileBasedSpec.move_enum_to_root(object_property)
|
72
70
|
|
73
71
|
properties_to_change = ["validation_policy"]
|
74
72
|
for property_to_change in properties_to_change:
|
@@ -76,7 +74,24 @@ class AbstractFileBasedSpec(BaseModel):
|
|
76
74
|
if "anyOf" in property_object:
|
77
75
|
schema["properties"]["streams"]["items"]["properties"][property_to_change]["type"] = "object"
|
78
76
|
schema["properties"]["streams"]["items"]["properties"][property_to_change]["oneOf"] = property_object.pop("anyOf")
|
79
|
-
|
80
|
-
|
81
|
-
|
77
|
+
AbstractFileBasedSpec.move_enum_to_root(property_object)
|
78
|
+
|
79
|
+
csv_format_schemas = list(
|
80
|
+
filter(
|
81
|
+
lambda format: format["properties"]["filetype"]["default"] == "csv",
|
82
|
+
schema["properties"]["streams"]["items"]["properties"]["format"]["oneOf"],
|
83
|
+
)
|
84
|
+
)
|
85
|
+
if len(csv_format_schemas) != 1:
|
86
|
+
raise ValueError(f"Expecting only one CSV format but got {csv_format_schemas}")
|
87
|
+
csv_format_schemas[0]["properties"]["header_definition"]["oneOf"] = csv_format_schemas[0]["properties"]["header_definition"].pop(
|
88
|
+
"anyOf", []
|
89
|
+
)
|
90
|
+
csv_format_schemas[0]["properties"]["header_definition"]["type"] = "object"
|
82
91
|
return schema
|
92
|
+
|
93
|
+
@staticmethod
|
94
|
+
def move_enum_to_root(object_property: Dict[str, Any]) -> None:
|
95
|
+
if "allOf" in object_property and "enum" in object_property["allOf"][0]:
|
96
|
+
object_property["enum"] = object_property["allOf"][0]["enum"]
|
97
|
+
object_property.pop("allOf")
|
@@ -4,9 +4,9 @@
|
|
4
4
|
|
5
5
|
import codecs
|
6
6
|
from enum import Enum
|
7
|
-
from typing import Optional, Set
|
7
|
+
from typing import Any, Dict, List, Optional, Set, Union
|
8
8
|
|
9
|
-
from pydantic import BaseModel, Field, validator
|
9
|
+
from pydantic import BaseModel, Field, ValidationError, root_validator, validator
|
10
10
|
from typing_extensions import Literal
|
11
11
|
|
12
12
|
|
@@ -15,6 +15,52 @@ class InferenceType(Enum):
|
|
15
15
|
PRIMITIVE_TYPES_ONLY = "Primitive Types Only"
|
16
16
|
|
17
17
|
|
18
|
+
class CsvHeaderDefinitionType(Enum):
|
19
|
+
FROM_CSV = "From CSV"
|
20
|
+
AUTOGENERATED = "Autogenerated"
|
21
|
+
USER_PROVIDED = "User Provided"
|
22
|
+
|
23
|
+
|
24
|
+
class CsvHeaderFromCsv(BaseModel):
|
25
|
+
class Config:
|
26
|
+
title = "From CSV"
|
27
|
+
|
28
|
+
header_definition_type: Literal[CsvHeaderDefinitionType.FROM_CSV.value] = CsvHeaderDefinitionType.FROM_CSV.value # type: ignore
|
29
|
+
|
30
|
+
def has_header_row(self) -> bool:
|
31
|
+
return True
|
32
|
+
|
33
|
+
|
34
|
+
class CsvHeaderAutogenerated(BaseModel):
|
35
|
+
class Config:
|
36
|
+
title = "Autogenerated"
|
37
|
+
|
38
|
+
header_definition_type: Literal[CsvHeaderDefinitionType.AUTOGENERATED.value] = CsvHeaderDefinitionType.AUTOGENERATED.value # type: ignore
|
39
|
+
|
40
|
+
def has_header_row(self) -> bool:
|
41
|
+
return False
|
42
|
+
|
43
|
+
|
44
|
+
class CsvHeaderUserProvided(BaseModel):
|
45
|
+
class Config:
|
46
|
+
title = "User Provided"
|
47
|
+
|
48
|
+
header_definition_type: Literal[CsvHeaderDefinitionType.USER_PROVIDED.value] = CsvHeaderDefinitionType.USER_PROVIDED.value # type: ignore
|
49
|
+
column_names: List[str] = Field(
|
50
|
+
title="Column Names",
|
51
|
+
description="The column names that will be used while emitting the CSV records",
|
52
|
+
)
|
53
|
+
|
54
|
+
def has_header_row(self) -> bool:
|
55
|
+
return False
|
56
|
+
|
57
|
+
@validator("column_names")
|
58
|
+
def validate_column_names(cls, v: List[str]) -> List[str]:
|
59
|
+
if not v:
|
60
|
+
raise ValueError("At least one column name needs to be provided when using user provided headers")
|
61
|
+
return v
|
62
|
+
|
63
|
+
|
18
64
|
DEFAULT_TRUE_VALUES = ["y", "yes", "t", "true", "on", "1"]
|
19
65
|
DEFAULT_FALSE_VALUES = ["n", "no", "f", "false", "off", "0"]
|
20
66
|
|
@@ -64,10 +110,10 @@ class CsvFormat(BaseModel):
|
|
64
110
|
skip_rows_after_header: int = Field(
|
65
111
|
title="Skip Rows After Header", default=0, description="The number of rows to skip after the header row."
|
66
112
|
)
|
67
|
-
|
68
|
-
title="
|
69
|
-
default=
|
70
|
-
description="
|
113
|
+
header_definition: Union[CsvHeaderFromCsv, CsvHeaderAutogenerated, CsvHeaderUserProvided] = Field(
|
114
|
+
title="CSV Header Definition",
|
115
|
+
default=CsvHeaderFromCsv(),
|
116
|
+
description="How headers will be defined. `User Provided` assumes the CSV does not have a header row and uses the headers provided and `Autogenerated` assumes the CSV does not have a header row and the CDK will generate headers using for `f{i}` where `i` is the index starting from 0. Else, the default behavior is to use the header from the CSV file. If a user wants to autogenerate or provide column names for a CSV having headers, they can skip rows.",
|
71
117
|
)
|
72
118
|
true_values: Set[str] = Field(
|
73
119
|
title="True Values",
|
@@ -113,3 +159,15 @@ class CsvFormat(BaseModel):
|
|
113
159
|
except LookupError:
|
114
160
|
raise ValueError(f"invalid encoding format: {v}")
|
115
161
|
return v
|
162
|
+
|
163
|
+
@root_validator
|
164
|
+
def validate_optional_args(cls, values: Dict[str, Any]) -> Dict[str, Any]:
|
165
|
+
definition_type = values.get("header_definition_type")
|
166
|
+
column_names = values.get("user_provided_column_names")
|
167
|
+
if definition_type == CsvHeaderDefinitionType.USER_PROVIDED and not column_names:
|
168
|
+
raise ValidationError("`user_provided_column_names` should be defined if the definition 'User Provided'.", model=CsvFormat)
|
169
|
+
if definition_type != CsvHeaderDefinitionType.USER_PROVIDED and column_names:
|
170
|
+
raise ValidationError(
|
171
|
+
"`user_provided_column_names` should not be defined if the definition is not 'User Provided'.", model=CsvFormat
|
172
|
+
)
|
173
|
+
return values
|
@@ -11,7 +11,7 @@ from functools import partial
|
|
11
11
|
from io import IOBase
|
12
12
|
from typing import Any, Callable, Dict, Generator, Iterable, List, Mapping, Optional, Set
|
13
13
|
|
14
|
-
from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat, InferenceType
|
14
|
+
from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat, CsvHeaderAutogenerated, CsvHeaderUserProvided, InferenceType
|
15
15
|
from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
|
16
16
|
from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError
|
17
17
|
from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
|
@@ -48,11 +48,9 @@ class _CsvReader:
|
|
48
48
|
with stream_reader.open_file(file, file_read_mode, config_format.encoding, logger) as fp:
|
49
49
|
headers = self._get_headers(fp, config_format, dialect_name)
|
50
50
|
|
51
|
-
# we assume that if we autogenerate columns, it is because we don't have headers
|
52
|
-
# if a user wants to autogenerate_column_names with a CSV having headers, he can skip rows
|
53
51
|
rows_to_skip = (
|
54
52
|
config_format.skip_rows_before_header
|
55
|
-
+ (
|
53
|
+
+ (1 if config_format.header_definition.has_header_row() else 0)
|
56
54
|
+ config_format.skip_rows_after_header
|
57
55
|
)
|
58
56
|
self._skip_rows(fp, rows_to_skip)
|
@@ -74,8 +72,11 @@ class _CsvReader:
|
|
74
72
|
Assumes the fp is pointing to the beginning of the files and will reset it as such
|
75
73
|
"""
|
76
74
|
# Note that this method assumes the dialect has already been registered if we're parsing the headers
|
75
|
+
if isinstance(config_format.header_definition, CsvHeaderUserProvided):
|
76
|
+
return config_format.header_definition.column_names # type: ignore # should be CsvHeaderUserProvided given the type
|
77
|
+
|
77
78
|
self._skip_rows(fp, config_format.skip_rows_before_header)
|
78
|
-
if config_format.
|
79
|
+
if isinstance(config_format.header_definition, CsvHeaderAutogenerated):
|
79
80
|
headers = self._auto_generate_headers(fp, dialect_name)
|
80
81
|
else:
|
81
82
|
# Then read the header
|
@@ -64,7 +64,7 @@ airbyte_cdk/sources/declarative/interpolation/interpolation.py,sha256=dyIM-bzh54
|
|
64
64
|
airbyte_cdk/sources/declarative/interpolation/jinja.py,sha256=Dc0F87nElWsz_Ikj938eQ9uqZvyqgFhZ8Dqf_-hvndc,4800
|
65
65
|
airbyte_cdk/sources/declarative/interpolation/macros.py,sha256=V6WGKJ9cXX1rjuM4bK3Cs9xEryMlkY2U3FMsSBhrgC8,3098
|
66
66
|
airbyte_cdk/sources/declarative/models/__init__.py,sha256=EiYnzwCHZV7EYqMJqcy6xKSeHvTKZBsQndjbEwmiTW4,93
|
67
|
-
airbyte_cdk/sources/declarative/models/declarative_component_schema.py,sha256
|
67
|
+
airbyte_cdk/sources/declarative/models/declarative_component_schema.py,sha256=ebor38wlQVqYD2QXk5X8v9xDZl0cEpIc2mFaKvpuiPE,57170
|
68
68
|
airbyte_cdk/sources/declarative/parsers/__init__.py,sha256=ZnqYNxHsKCgO38IwB34RQyRMXTs4GTvlRi3ImKnIioo,61
|
69
69
|
airbyte_cdk/sources/declarative/parsers/class_types_registry.py,sha256=bK4a74opm6WHyV7HqOVws6GE5Z7cLNc5MaTha69abIQ,6086
|
70
70
|
airbyte_cdk/sources/declarative/parsers/custom_exceptions.py,sha256=y7_G5mM07zxT5YG975kdC2PAja-Uc83pYp8WrV3GNdo,522
|
@@ -146,9 +146,9 @@ airbyte_cdk/sources/file_based/availability_strategy/__init__.py,sha256=WiPPuQGf
|
|
146
146
|
airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py,sha256=NeHCiG4FFohzYpQQFfmTL4-5oI0nElHWgXX1xrm8-SU,1269
|
147
147
|
airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py,sha256=InGBlGbInuNUQ6oaK5A9oICVc7ZNHMSYo8g5Vy2smOo,4266
|
148
148
|
airbyte_cdk/sources/file_based/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
149
|
-
airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py,sha256=
|
149
|
+
airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py,sha256=WrV4sKtJoZ1dK31HK7NdBKlnYHkmu6NqjmEpkVqJ6tQ,4582
|
150
150
|
airbyte_cdk/sources/file_based/config/avro_format.py,sha256=qGBB0RTjWDGZW-ilIwIq9OZl2BC-jBaq2WGrI3WVBsQ,597
|
151
|
-
airbyte_cdk/sources/file_based/config/csv_format.py,sha256
|
151
|
+
airbyte_cdk/sources/file_based/config/csv_format.py,sha256=-r-uGQlo-nXfhPuOR05XtYx_1vht74r8_am2_p8mcP8,7166
|
152
152
|
airbyte_cdk/sources/file_based/config/file_based_stream_config.py,sha256=5R0UlPJUGGx5OnpezZ0Fd8dyO4y2vMZtiPZR_3rfvSk,5916
|
153
153
|
airbyte_cdk/sources/file_based/config/jsonl_format.py,sha256=B-s1uy9RiKpKMwmMlR7UT3WeQPlTI-xclD0fVM4IU1Q,254
|
154
154
|
airbyte_cdk/sources/file_based/config/parquet_format.py,sha256=zvcHATNKoBIgU2UXuGnoldqLoRXG_X8ZzAkpqGPJtq4,625
|
@@ -157,7 +157,7 @@ airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py,sha
|
|
157
157
|
airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py,sha256=Mx3zT9Dem4uNfaUT0oOtrESsuB1LrGAi5N-uw2swZZA,701
|
158
158
|
airbyte_cdk/sources/file_based/file_types/__init__.py,sha256=N3a8cjXwRUN2__46IJTwrWlsyFiSA1xtSgPcPH28sn0,476
|
159
159
|
airbyte_cdk/sources/file_based/file_types/avro_parser.py,sha256=7PVaW17wn80HYW1mu074X2dy0UgFoqFqGIOKN2ZMKD0,8686
|
160
|
-
airbyte_cdk/sources/file_based/file_types/csv_parser.py,sha256=
|
160
|
+
airbyte_cdk/sources/file_based/file_types/csv_parser.py,sha256=KgdpdkHAFducvXM2jQr356M0WVol-vX0cm42n9Kf_Yc,16684
|
161
161
|
airbyte_cdk/sources/file_based/file_types/file_type_parser.py,sha256=cThTLc1YKSAapOn70lB09SzruRIPSShGIMz1f92QYV8,1555
|
162
162
|
airbyte_cdk/sources/file_based/file_types/jsonl_parser.py,sha256=Kz6HLF0CrFHQ1Y6rJKGr7KmBWSLeDYFQmkg0WIi7Frg,5395
|
163
163
|
airbyte_cdk/sources/file_based/file_types/parquet_parser.py,sha256=QulQ_soGb1LpQ_KTxqWZjmfACGkTUDUOeuSmNFtcSLk,8717
|
@@ -314,16 +314,17 @@ unit_tests/sources/file_based/availability_strategy/__init__.py,sha256=47DEQpj8H
|
|
314
314
|
unit_tests/sources/file_based/availability_strategy/test_default_file_based_availability_strategy.py,sha256=HzxFJVJFv3YpjVmJm45ZyS2HpbnhtEX2hm4r8VjkRFE,2463
|
315
315
|
unit_tests/sources/file_based/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
316
316
|
unit_tests/sources/file_based/config/test_abstract_file_based_spec.py,sha256=wmZAC-nBiUedMZi0n4zaC9oiZD9UTuYP5zJC1xxRnME,1216
|
317
|
+
unit_tests/sources/file_based/config/test_csv_format.py,sha256=VYL-9Ec8hW_yO2Pj9F8pDfITcgEAFtSublYda7ut7QE,1132
|
317
318
|
unit_tests/sources/file_based/config/test_file_based_stream_config.py,sha256=1eMsHlMQIFwyw20HjnhgKuiw6399sMcLTQ4LP09kTT4,3060
|
318
319
|
unit_tests/sources/file_based/file_types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
319
320
|
unit_tests/sources/file_based/file_types/test_avro_parser.py,sha256=INqwKXcgNb3h_tktNXYU6WNUD-iNwRYHCd3IrnQa5R4,11051
|
320
|
-
unit_tests/sources/file_based/file_types/test_csv_parser.py,sha256=
|
321
|
+
unit_tests/sources/file_based/file_types/test_csv_parser.py,sha256=4onvErJCMNeSquZr7c1dX4TzqJlvQ3wulYCjAU_IblU,21266
|
321
322
|
unit_tests/sources/file_based/file_types/test_jsonl_parser.py,sha256=foTf9U9LyAS8OR0BonwNgFWPqTrmzFV2lpPUfRMrioE,6134
|
322
323
|
unit_tests/sources/file_based/file_types/test_parquet_parser.py,sha256=D7sKTty8aEqMDWWGKWUqDbWjTxhGkygU7ns4-_JceRY,13543
|
323
324
|
unit_tests/sources/file_based/scenarios/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
324
325
|
unit_tests/sources/file_based/scenarios/avro_scenarios.py,sha256=xUfw0crAvSTgQ2-chJx2ZiigQyo5IfrCuOFC1TWXXsQ,29795
|
325
326
|
unit_tests/sources/file_based/scenarios/check_scenarios.py,sha256=xZf28rlH93ap8JCkAjwocng-uAW-mvMx6BDOLbvVCig,5588
|
326
|
-
unit_tests/sources/file_based/scenarios/csv_scenarios.py,sha256=
|
327
|
+
unit_tests/sources/file_based/scenarios/csv_scenarios.py,sha256=nG4O2Ah0Uwgjg6SVTuioO_gPOigKxm-PlM2Tw21svYw,98724
|
327
328
|
unit_tests/sources/file_based/scenarios/incremental_scenarios.py,sha256=0maHng11cFmvzFLOniyBxOEYoKj4DYR3NO9-pSYoFLs,60710
|
328
329
|
unit_tests/sources/file_based/scenarios/jsonl_scenarios.py,sha256=N83fga4gMKkbm6hYnen1Z5p5eEgjnMB_M_sXx6B96cU,27503
|
329
330
|
unit_tests/sources/file_based/scenarios/parquet_scenarios.py,sha256=-cBO1ZwberBxNMqDOtKz8yGwm3zB7elz_st2NKHeczM,26955
|
@@ -354,8 +355,8 @@ unit_tests/utils/test_schema_inferrer.py,sha256=Z2jHBZ540wnYkylIdV_2xr75Vtwlxuyg
|
|
354
355
|
unit_tests/utils/test_secret_utils.py,sha256=XKe0f1RHYii8iwE6ATmBr5JGDI1pzzrnZUGdUSMJQP4,4886
|
355
356
|
unit_tests/utils/test_stream_status_utils.py,sha256=NpV155JMXA6CG-2Zvofa14lItobyh3Onttc59X4m5DI,3382
|
356
357
|
unit_tests/utils/test_traced_exception.py,sha256=bDFP5zMBizFenz6V2WvEZTRCKGB5ijh3DBezjbfoYIs,4198
|
357
|
-
airbyte_cdk-0.51.
|
358
|
-
airbyte_cdk-0.51.
|
359
|
-
airbyte_cdk-0.51.
|
360
|
-
airbyte_cdk-0.51.
|
361
|
-
airbyte_cdk-0.51.
|
358
|
+
airbyte_cdk-0.51.7.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
|
359
|
+
airbyte_cdk-0.51.7.dist-info/METADATA,sha256=YOrAlHsZod4Nq3VugY7nbE7MDd8r8ZU7gcvX4YzTuk0,9399
|
360
|
+
airbyte_cdk-0.51.7.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
|
361
|
+
airbyte_cdk-0.51.7.dist-info/top_level.txt,sha256=edvsDKTnE6sD2wfCUaeTfKf5gQIL6CPVMwVL2sWZzqo,51
|
362
|
+
airbyte_cdk-0.51.7.dist-info/RECORD,,
|
@@ -0,0 +1,28 @@
|
|
1
|
+
#
|
2
|
+
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
3
|
+
#
|
4
|
+
|
5
|
+
import unittest
|
6
|
+
|
7
|
+
import pytest
|
8
|
+
from airbyte_cdk.sources.file_based.config.csv_format import CsvHeaderAutogenerated, CsvHeaderFromCsv, CsvHeaderUserProvided
|
9
|
+
from pydantic import ValidationError
|
10
|
+
|
11
|
+
|
12
|
+
class CsvHeaderDefinitionTest(unittest.TestCase):
|
13
|
+
def test_given_user_provided_and_not_column_names_provided_then_raise_exception(self) -> None:
|
14
|
+
with pytest.raises(ValidationError):
|
15
|
+
CsvHeaderUserProvided(column_names=[])
|
16
|
+
|
17
|
+
def test_given_user_provided_and_column_names_then_config_is_valid(self) -> None:
|
18
|
+
# no error means that this test succeeds
|
19
|
+
CsvHeaderUserProvided(column_names=["1", "2", "3"])
|
20
|
+
|
21
|
+
def test_given_user_provided_then_csv_does_not_have_header_row(self) -> None:
|
22
|
+
assert not CsvHeaderUserProvided(column_names=["1", "2", "3"]).has_header_row()
|
23
|
+
|
24
|
+
def test_given_autogenerated_then_csv_does_not_have_header_row(self) -> None:
|
25
|
+
assert not CsvHeaderAutogenerated().has_header_row()
|
26
|
+
|
27
|
+
def test_given_from_csv_then_csv_has_header_row(self) -> None:
|
28
|
+
assert CsvHeaderFromCsv().has_header_row()
|
@@ -13,7 +13,14 @@ from unittest import TestCase, mock
|
|
13
13
|
from unittest.mock import Mock
|
14
14
|
|
15
15
|
import pytest
|
16
|
-
from airbyte_cdk.sources.file_based.config.csv_format import
|
16
|
+
from airbyte_cdk.sources.file_based.config.csv_format import (
|
17
|
+
DEFAULT_FALSE_VALUES,
|
18
|
+
DEFAULT_TRUE_VALUES,
|
19
|
+
CsvFormat,
|
20
|
+
CsvHeaderAutogenerated,
|
21
|
+
CsvHeaderUserProvided,
|
22
|
+
InferenceType,
|
23
|
+
)
|
17
24
|
from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
|
18
25
|
from airbyte_cdk.sources.file_based.exceptions import RecordParseError
|
19
26
|
from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
|
@@ -278,13 +285,28 @@ class CsvReaderTest(unittest.TestCase):
|
|
278
285
|
assert list(data_generator) == [{"header": "a value"}, {"header": "another value"}]
|
279
286
|
|
280
287
|
def test_given_autogenerated_headers_when_read_data_then_generate_headers_with_format_fX(self) -> None:
|
281
|
-
self._config_format.
|
288
|
+
self._config_format.header_definition = CsvHeaderAutogenerated()
|
282
289
|
self._stream_reader.open_file.return_value = CsvFileBuilder().with_data(["0,1,2,3,4,5,6"]).build()
|
283
290
|
|
284
291
|
data_generator = self._read_data()
|
285
292
|
|
286
293
|
assert list(data_generator) == [{"f0": "0", "f1": "1", "f2": "2", "f3": "3", "f4": "4", "f5": "5", "f6": "6"}]
|
287
294
|
|
295
|
+
def test_given_user_provided_headers_when_read_data_then_use_user_provided_headers(self) -> None:
|
296
|
+
self._config_format.header_definition = CsvHeaderUserProvided(column_names=["first", "second", "third", "fourth"])
|
297
|
+
self._stream_reader.open_file.return_value = CsvFileBuilder().with_data(["0,1,2,3"]).build()
|
298
|
+
|
299
|
+
data_generator = self._read_data()
|
300
|
+
|
301
|
+
assert list(data_generator) == [{"first": "0", "second": "1", "third": "2", "fourth": "3"}]
|
302
|
+
|
303
|
+
def test_given_len_mistmatch_on_user_provided_headers_when_read_data_then_raise_error(self) -> None:
|
304
|
+
self._config_format.header_definition = CsvHeaderUserProvided(column_names=["missing", "one", "column"])
|
305
|
+
self._stream_reader.open_file.return_value = CsvFileBuilder().with_data(["0,1,2,3"]).build()
|
306
|
+
|
307
|
+
with pytest.raises(RecordParseError):
|
308
|
+
list(self._read_data())
|
309
|
+
|
288
310
|
def test_given_skip_rows_after_header_when_read_data_then_do_not_parse_skipped_rows(self) -> None:
|
289
311
|
self._config_format.skip_rows_after_header = 1
|
290
312
|
self._stream_reader.open_file.return_value = (
|
@@ -180,11 +180,43 @@ single_csv_scenario = (
|
|
180
180
|
"default": 0,
|
181
181
|
"type": "integer",
|
182
182
|
},
|
183
|
-
"
|
184
|
-
"title": "
|
185
|
-
"
|
186
|
-
"
|
187
|
-
"
|
183
|
+
"header_definition": {
|
184
|
+
"title": "CSV Header Definition",
|
185
|
+
"type": "object",
|
186
|
+
"description": "How headers will be defined. `User Provided` assumes the CSV does not have a header row and uses the headers provided and `Autogenerated` assumes the CSV does not have a header row and the CDK will generate headers using for `f{i}` where `i` is the index starting from 0. Else, the default behavior is to use the header from the CSV file. If a user wants to autogenerate or provide column names for a CSV having headers, they can skip rows.",
|
187
|
+
"default": {"header_definition_type": "From CSV"},
|
188
|
+
"oneOf": [
|
189
|
+
{
|
190
|
+
"title": "From CSV",
|
191
|
+
"type": "object",
|
192
|
+
"properties": {
|
193
|
+
"header_definition_type": {"title": "Header Definition Type", "default": "From CSV", "enum": ["From CSV"], "type": "string"},
|
194
|
+
},
|
195
|
+
},
|
196
|
+
{
|
197
|
+
"title": "Autogenerated",
|
198
|
+
"type": "object",
|
199
|
+
"properties": {
|
200
|
+
"header_definition_type": {"title": "Header Definition Type", "default": "Autogenerated", "enum": ["Autogenerated"], "type": "string"},
|
201
|
+
},
|
202
|
+
},
|
203
|
+
{
|
204
|
+
"title": "User Provided",
|
205
|
+
"type": "object",
|
206
|
+
"properties": {
|
207
|
+
"header_definition_type": {"title": "Header Definition Type", "default": "User Provided", "enum": ["User Provided"], "type": "string"},
|
208
|
+
"column_names": {
|
209
|
+
"title": "Column Names",
|
210
|
+
"description": "The column names that will be used while emitting the CSV records",
|
211
|
+
"type": "array",
|
212
|
+
"items": {
|
213
|
+
"type": "string"
|
214
|
+
},
|
215
|
+
}
|
216
|
+
},
|
217
|
+
"required": ["column_names"]
|
218
|
+
},
|
219
|
+
]
|
188
220
|
},
|
189
221
|
"true_values": {
|
190
222
|
"title": "True Values",
|
@@ -761,7 +793,6 @@ csv_multi_stream_scenario = (
|
|
761
793
|
)
|
762
794
|
).build()
|
763
795
|
|
764
|
-
|
765
796
|
csv_custom_format_scenario = (
|
766
797
|
TestScenarioBuilder()
|
767
798
|
.set_name("csv_custom_format")
|
@@ -868,7 +899,6 @@ csv_custom_format_scenario = (
|
|
868
899
|
)
|
869
900
|
).build()
|
870
901
|
|
871
|
-
|
872
902
|
multi_stream_custom_format = (
|
873
903
|
TestScenarioBuilder()
|
874
904
|
.set_name("multi_stream_custom_format_scenario")
|
@@ -1016,7 +1046,6 @@ multi_stream_custom_format = (
|
|
1016
1046
|
)
|
1017
1047
|
).build()
|
1018
1048
|
|
1019
|
-
|
1020
1049
|
empty_schema_inference_scenario = (
|
1021
1050
|
TestScenarioBuilder()
|
1022
1051
|
.set_name("empty_schema_inference_scenario")
|
@@ -1092,7 +1121,6 @@ empty_schema_inference_scenario = (
|
|
1092
1121
|
)
|
1093
1122
|
).build()
|
1094
1123
|
|
1095
|
-
|
1096
1124
|
schemaless_csv_scenario = (
|
1097
1125
|
TestScenarioBuilder()
|
1098
1126
|
.set_name("schemaless_csv_scenario")
|
@@ -1188,7 +1216,6 @@ schemaless_csv_scenario = (
|
|
1188
1216
|
)
|
1189
1217
|
).build()
|
1190
1218
|
|
1191
|
-
|
1192
1219
|
schemaless_csv_multi_stream_scenario = (
|
1193
1220
|
TestScenarioBuilder()
|
1194
1221
|
.set_name("schemaless_csv_multi_stream_scenario")
|
@@ -1296,7 +1323,6 @@ schemaless_csv_multi_stream_scenario = (
|
|
1296
1323
|
)
|
1297
1324
|
).build()
|
1298
1325
|
|
1299
|
-
|
1300
1326
|
schemaless_with_user_input_schema_fails_connection_check_scenario = (
|
1301
1327
|
TestScenarioBuilder()
|
1302
1328
|
.set_name("schemaless_with_user_input_schema_fails_connection_check_scenario")
|
@@ -1361,7 +1387,6 @@ schemaless_with_user_input_schema_fails_connection_check_scenario = (
|
|
1361
1387
|
.set_expected_read_error(ConfigValidationError, FileBasedSourceError.CONFIG_VALIDATION_ERROR.value)
|
1362
1388
|
).build()
|
1363
1389
|
|
1364
|
-
|
1365
1390
|
schemaless_with_user_input_schema_fails_connection_check_multi_stream_scenario = (
|
1366
1391
|
TestScenarioBuilder()
|
1367
1392
|
.set_name("schemaless_with_user_input_schema_fails_connection_check_multi_stream_scenario")
|
@@ -1446,7 +1471,6 @@ schemaless_with_user_input_schema_fails_connection_check_multi_stream_scenario =
|
|
1446
1471
|
.set_expected_read_error(ConfigValidationError, FileBasedSourceError.CONFIG_VALIDATION_ERROR.value)
|
1447
1472
|
).build()
|
1448
1473
|
|
1449
|
-
|
1450
1474
|
csv_string_can_be_null_with_input_schemas_scenario = (
|
1451
1475
|
TestScenarioBuilder()
|
1452
1476
|
.set_name("csv_string_can_be_null_with_input_schema")
|
@@ -2143,7 +2167,6 @@ csv_custom_delimiter_in_double_quotes_scenario = (
|
|
2143
2167
|
)
|
2144
2168
|
).build()
|
2145
2169
|
|
2146
|
-
|
2147
2170
|
csv_skip_before_header_scenario = (
|
2148
2171
|
TestScenarioBuilder()
|
2149
2172
|
.set_name("csv_skip_before_header")
|
@@ -2278,7 +2301,6 @@ csv_skip_after_header_scenario = (
|
|
2278
2301
|
)
|
2279
2302
|
).build()
|
2280
2303
|
|
2281
|
-
|
2282
2304
|
csv_skip_before_and_after_header_scenario = (
|
2283
2305
|
TestScenarioBuilder()
|
2284
2306
|
.set_name("csv_skip_before_after_header")
|
@@ -2363,7 +2385,7 @@ csv_autogenerate_column_names_scenario = (
|
|
2363
2385
|
"validation_policy": "Emit Record",
|
2364
2386
|
"format": {
|
2365
2387
|
"filetype": "csv",
|
2366
|
-
"
|
2388
|
+
"header_definition": {"header_definition_type": "Autogenerated"},
|
2367
2389
|
},
|
2368
2390
|
}
|
2369
2391
|
],
|
@@ -2556,7 +2578,6 @@ csv_custom_null_values_scenario = (
|
|
2556
2578
|
)
|
2557
2579
|
).build()
|
2558
2580
|
|
2559
|
-
|
2560
2581
|
earlier_csv_scenario = (
|
2561
2582
|
TestScenarioBuilder()
|
2562
2583
|
.set_name("earlier_csv_stream")
|
File without changes
|
File without changes
|
File without changes
|