airbyte-cdk 0.51.6__py3-none-any.whl → 0.51.7__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +0 -4
- airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +21 -6
- airbyte_cdk/sources/file_based/config/csv_format.py +64 -6
- airbyte_cdk/sources/file_based/file_types/csv_parser.py +6 -5
- {airbyte_cdk-0.51.6.dist-info → airbyte_cdk-0.51.7.dist-info}/METADATA +1 -1
- {airbyte_cdk-0.51.6.dist-info → airbyte_cdk-0.51.7.dist-info}/RECORD +12 -11
- unit_tests/sources/file_based/config/test_csv_format.py +28 -0
- unit_tests/sources/file_based/file_types/test_csv_parser.py +24 -2
- unit_tests/sources/file_based/scenarios/csv_scenarios.py +38 -17
- {airbyte_cdk-0.51.6.dist-info → airbyte_cdk-0.51.7.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-0.51.6.dist-info → airbyte_cdk-0.51.7.dist-info}/WHEEL +0 -0
- {airbyte_cdk-0.51.6.dist-info → airbyte_cdk-0.51.7.dist-info}/top_level.txt +0 -0
@@ -66,9 +66,7 @@ class AbstractFileBasedSpec(BaseModel):
|
|
66
66
|
for format in objects_to_check["oneOf"]:
|
67
67
|
for key in format["properties"]:
|
68
68
|
object_property = format["properties"][key]
|
69
|
-
|
70
|
-
object_property["enum"] = object_property["allOf"][0]["enum"]
|
71
|
-
object_property.pop("allOf")
|
69
|
+
AbstractFileBasedSpec.move_enum_to_root(object_property)
|
72
70
|
|
73
71
|
properties_to_change = ["validation_policy"]
|
74
72
|
for property_to_change in properties_to_change:
|
@@ -76,7 +74,24 @@ class AbstractFileBasedSpec(BaseModel):
|
|
76
74
|
if "anyOf" in property_object:
|
77
75
|
schema["properties"]["streams"]["items"]["properties"][property_to_change]["type"] = "object"
|
78
76
|
schema["properties"]["streams"]["items"]["properties"][property_to_change]["oneOf"] = property_object.pop("anyOf")
|
79
|
-
|
80
|
-
|
81
|
-
|
77
|
+
AbstractFileBasedSpec.move_enum_to_root(property_object)
|
78
|
+
|
79
|
+
csv_format_schemas = list(
|
80
|
+
filter(
|
81
|
+
lambda format: format["properties"]["filetype"]["default"] == "csv",
|
82
|
+
schema["properties"]["streams"]["items"]["properties"]["format"]["oneOf"],
|
83
|
+
)
|
84
|
+
)
|
85
|
+
if len(csv_format_schemas) != 1:
|
86
|
+
raise ValueError(f"Expecting only one CSV format but got {csv_format_schemas}")
|
87
|
+
csv_format_schemas[0]["properties"]["header_definition"]["oneOf"] = csv_format_schemas[0]["properties"]["header_definition"].pop(
|
88
|
+
"anyOf", []
|
89
|
+
)
|
90
|
+
csv_format_schemas[0]["properties"]["header_definition"]["type"] = "object"
|
82
91
|
return schema
|
92
|
+
|
93
|
+
@staticmethod
|
94
|
+
def move_enum_to_root(object_property: Dict[str, Any]) -> None:
|
95
|
+
if "allOf" in object_property and "enum" in object_property["allOf"][0]:
|
96
|
+
object_property["enum"] = object_property["allOf"][0]["enum"]
|
97
|
+
object_property.pop("allOf")
|
@@ -4,9 +4,9 @@
|
|
4
4
|
|
5
5
|
import codecs
|
6
6
|
from enum import Enum
|
7
|
-
from typing import Optional, Set
|
7
|
+
from typing import Any, Dict, List, Optional, Set, Union
|
8
8
|
|
9
|
-
from pydantic import BaseModel, Field, validator
|
9
|
+
from pydantic import BaseModel, Field, ValidationError, root_validator, validator
|
10
10
|
from typing_extensions import Literal
|
11
11
|
|
12
12
|
|
@@ -15,6 +15,52 @@ class InferenceType(Enum):
|
|
15
15
|
PRIMITIVE_TYPES_ONLY = "Primitive Types Only"
|
16
16
|
|
17
17
|
|
18
|
+
class CsvHeaderDefinitionType(Enum):
|
19
|
+
FROM_CSV = "From CSV"
|
20
|
+
AUTOGENERATED = "Autogenerated"
|
21
|
+
USER_PROVIDED = "User Provided"
|
22
|
+
|
23
|
+
|
24
|
+
class CsvHeaderFromCsv(BaseModel):
|
25
|
+
class Config:
|
26
|
+
title = "From CSV"
|
27
|
+
|
28
|
+
header_definition_type: Literal[CsvHeaderDefinitionType.FROM_CSV.value] = CsvHeaderDefinitionType.FROM_CSV.value # type: ignore
|
29
|
+
|
30
|
+
def has_header_row(self) -> bool:
|
31
|
+
return True
|
32
|
+
|
33
|
+
|
34
|
+
class CsvHeaderAutogenerated(BaseModel):
|
35
|
+
class Config:
|
36
|
+
title = "Autogenerated"
|
37
|
+
|
38
|
+
header_definition_type: Literal[CsvHeaderDefinitionType.AUTOGENERATED.value] = CsvHeaderDefinitionType.AUTOGENERATED.value # type: ignore
|
39
|
+
|
40
|
+
def has_header_row(self) -> bool:
|
41
|
+
return False
|
42
|
+
|
43
|
+
|
44
|
+
class CsvHeaderUserProvided(BaseModel):
|
45
|
+
class Config:
|
46
|
+
title = "User Provided"
|
47
|
+
|
48
|
+
header_definition_type: Literal[CsvHeaderDefinitionType.USER_PROVIDED.value] = CsvHeaderDefinitionType.USER_PROVIDED.value # type: ignore
|
49
|
+
column_names: List[str] = Field(
|
50
|
+
title="Column Names",
|
51
|
+
description="The column names that will be used while emitting the CSV records",
|
52
|
+
)
|
53
|
+
|
54
|
+
def has_header_row(self) -> bool:
|
55
|
+
return False
|
56
|
+
|
57
|
+
@validator("column_names")
|
58
|
+
def validate_column_names(cls, v: List[str]) -> List[str]:
|
59
|
+
if not v:
|
60
|
+
raise ValueError("At least one column name needs to be provided when using user provided headers")
|
61
|
+
return v
|
62
|
+
|
63
|
+
|
18
64
|
DEFAULT_TRUE_VALUES = ["y", "yes", "t", "true", "on", "1"]
|
19
65
|
DEFAULT_FALSE_VALUES = ["n", "no", "f", "false", "off", "0"]
|
20
66
|
|
@@ -64,10 +110,10 @@ class CsvFormat(BaseModel):
|
|
64
110
|
skip_rows_after_header: int = Field(
|
65
111
|
title="Skip Rows After Header", default=0, description="The number of rows to skip after the header row."
|
66
112
|
)
|
67
|
-
|
68
|
-
title="
|
69
|
-
default=
|
70
|
-
description="
|
113
|
+
header_definition: Union[CsvHeaderFromCsv, CsvHeaderAutogenerated, CsvHeaderUserProvided] = Field(
|
114
|
+
title="CSV Header Definition",
|
115
|
+
default=CsvHeaderFromCsv(),
|
116
|
+
description="How headers will be defined. `User Provided` assumes the CSV does not have a header row and uses the headers provided and `Autogenerated` assumes the CSV does not have a header row and the CDK will generate headers using for `f{i}` where `i` is the index starting from 0. Else, the default behavior is to use the header from the CSV file. If a user wants to autogenerate or provide column names for a CSV having headers, they can skip rows.",
|
71
117
|
)
|
72
118
|
true_values: Set[str] = Field(
|
73
119
|
title="True Values",
|
@@ -113,3 +159,15 @@ class CsvFormat(BaseModel):
|
|
113
159
|
except LookupError:
|
114
160
|
raise ValueError(f"invalid encoding format: {v}")
|
115
161
|
return v
|
162
|
+
|
163
|
+
@root_validator
|
164
|
+
def validate_optional_args(cls, values: Dict[str, Any]) -> Dict[str, Any]:
|
165
|
+
definition_type = values.get("header_definition_type")
|
166
|
+
column_names = values.get("user_provided_column_names")
|
167
|
+
if definition_type == CsvHeaderDefinitionType.USER_PROVIDED and not column_names:
|
168
|
+
raise ValidationError("`user_provided_column_names` should be defined if the definition 'User Provided'.", model=CsvFormat)
|
169
|
+
if definition_type != CsvHeaderDefinitionType.USER_PROVIDED and column_names:
|
170
|
+
raise ValidationError(
|
171
|
+
"`user_provided_column_names` should not be defined if the definition is not 'User Provided'.", model=CsvFormat
|
172
|
+
)
|
173
|
+
return values
|
@@ -11,7 +11,7 @@ from functools import partial
|
|
11
11
|
from io import IOBase
|
12
12
|
from typing import Any, Callable, Dict, Generator, Iterable, List, Mapping, Optional, Set
|
13
13
|
|
14
|
-
from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat, InferenceType
|
14
|
+
from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat, CsvHeaderAutogenerated, CsvHeaderUserProvided, InferenceType
|
15
15
|
from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
|
16
16
|
from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError
|
17
17
|
from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
|
@@ -48,11 +48,9 @@ class _CsvReader:
|
|
48
48
|
with stream_reader.open_file(file, file_read_mode, config_format.encoding, logger) as fp:
|
49
49
|
headers = self._get_headers(fp, config_format, dialect_name)
|
50
50
|
|
51
|
-
# we assume that if we autogenerate columns, it is because we don't have headers
|
52
|
-
# if a user wants to autogenerate_column_names with a CSV having headers, he can skip rows
|
53
51
|
rows_to_skip = (
|
54
52
|
config_format.skip_rows_before_header
|
55
|
-
+ (
|
53
|
+
+ (1 if config_format.header_definition.has_header_row() else 0)
|
56
54
|
+ config_format.skip_rows_after_header
|
57
55
|
)
|
58
56
|
self._skip_rows(fp, rows_to_skip)
|
@@ -74,8 +72,11 @@ class _CsvReader:
|
|
74
72
|
Assumes the fp is pointing to the beginning of the files and will reset it as such
|
75
73
|
"""
|
76
74
|
# Note that this method assumes the dialect has already been registered if we're parsing the headers
|
75
|
+
if isinstance(config_format.header_definition, CsvHeaderUserProvided):
|
76
|
+
return config_format.header_definition.column_names # type: ignore # should be CsvHeaderUserProvided given the type
|
77
|
+
|
77
78
|
self._skip_rows(fp, config_format.skip_rows_before_header)
|
78
|
-
if config_format.
|
79
|
+
if isinstance(config_format.header_definition, CsvHeaderAutogenerated):
|
79
80
|
headers = self._auto_generate_headers(fp, dialect_name)
|
80
81
|
else:
|
81
82
|
# Then read the header
|
@@ -64,7 +64,7 @@ airbyte_cdk/sources/declarative/interpolation/interpolation.py,sha256=dyIM-bzh54
|
|
64
64
|
airbyte_cdk/sources/declarative/interpolation/jinja.py,sha256=Dc0F87nElWsz_Ikj938eQ9uqZvyqgFhZ8Dqf_-hvndc,4800
|
65
65
|
airbyte_cdk/sources/declarative/interpolation/macros.py,sha256=V6WGKJ9cXX1rjuM4bK3Cs9xEryMlkY2U3FMsSBhrgC8,3098
|
66
66
|
airbyte_cdk/sources/declarative/models/__init__.py,sha256=EiYnzwCHZV7EYqMJqcy6xKSeHvTKZBsQndjbEwmiTW4,93
|
67
|
-
airbyte_cdk/sources/declarative/models/declarative_component_schema.py,sha256
|
67
|
+
airbyte_cdk/sources/declarative/models/declarative_component_schema.py,sha256=ebor38wlQVqYD2QXk5X8v9xDZl0cEpIc2mFaKvpuiPE,57170
|
68
68
|
airbyte_cdk/sources/declarative/parsers/__init__.py,sha256=ZnqYNxHsKCgO38IwB34RQyRMXTs4GTvlRi3ImKnIioo,61
|
69
69
|
airbyte_cdk/sources/declarative/parsers/class_types_registry.py,sha256=bK4a74opm6WHyV7HqOVws6GE5Z7cLNc5MaTha69abIQ,6086
|
70
70
|
airbyte_cdk/sources/declarative/parsers/custom_exceptions.py,sha256=y7_G5mM07zxT5YG975kdC2PAja-Uc83pYp8WrV3GNdo,522
|
@@ -146,9 +146,9 @@ airbyte_cdk/sources/file_based/availability_strategy/__init__.py,sha256=WiPPuQGf
|
|
146
146
|
airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py,sha256=NeHCiG4FFohzYpQQFfmTL4-5oI0nElHWgXX1xrm8-SU,1269
|
147
147
|
airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py,sha256=InGBlGbInuNUQ6oaK5A9oICVc7ZNHMSYo8g5Vy2smOo,4266
|
148
148
|
airbyte_cdk/sources/file_based/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
149
|
-
airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py,sha256=
|
149
|
+
airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py,sha256=WrV4sKtJoZ1dK31HK7NdBKlnYHkmu6NqjmEpkVqJ6tQ,4582
|
150
150
|
airbyte_cdk/sources/file_based/config/avro_format.py,sha256=qGBB0RTjWDGZW-ilIwIq9OZl2BC-jBaq2WGrI3WVBsQ,597
|
151
|
-
airbyte_cdk/sources/file_based/config/csv_format.py,sha256
|
151
|
+
airbyte_cdk/sources/file_based/config/csv_format.py,sha256=-r-uGQlo-nXfhPuOR05XtYx_1vht74r8_am2_p8mcP8,7166
|
152
152
|
airbyte_cdk/sources/file_based/config/file_based_stream_config.py,sha256=5R0UlPJUGGx5OnpezZ0Fd8dyO4y2vMZtiPZR_3rfvSk,5916
|
153
153
|
airbyte_cdk/sources/file_based/config/jsonl_format.py,sha256=B-s1uy9RiKpKMwmMlR7UT3WeQPlTI-xclD0fVM4IU1Q,254
|
154
154
|
airbyte_cdk/sources/file_based/config/parquet_format.py,sha256=zvcHATNKoBIgU2UXuGnoldqLoRXG_X8ZzAkpqGPJtq4,625
|
@@ -157,7 +157,7 @@ airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py,sha
|
|
157
157
|
airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py,sha256=Mx3zT9Dem4uNfaUT0oOtrESsuB1LrGAi5N-uw2swZZA,701
|
158
158
|
airbyte_cdk/sources/file_based/file_types/__init__.py,sha256=N3a8cjXwRUN2__46IJTwrWlsyFiSA1xtSgPcPH28sn0,476
|
159
159
|
airbyte_cdk/sources/file_based/file_types/avro_parser.py,sha256=7PVaW17wn80HYW1mu074X2dy0UgFoqFqGIOKN2ZMKD0,8686
|
160
|
-
airbyte_cdk/sources/file_based/file_types/csv_parser.py,sha256=
|
160
|
+
airbyte_cdk/sources/file_based/file_types/csv_parser.py,sha256=KgdpdkHAFducvXM2jQr356M0WVol-vX0cm42n9Kf_Yc,16684
|
161
161
|
airbyte_cdk/sources/file_based/file_types/file_type_parser.py,sha256=cThTLc1YKSAapOn70lB09SzruRIPSShGIMz1f92QYV8,1555
|
162
162
|
airbyte_cdk/sources/file_based/file_types/jsonl_parser.py,sha256=Kz6HLF0CrFHQ1Y6rJKGr7KmBWSLeDYFQmkg0WIi7Frg,5395
|
163
163
|
airbyte_cdk/sources/file_based/file_types/parquet_parser.py,sha256=QulQ_soGb1LpQ_KTxqWZjmfACGkTUDUOeuSmNFtcSLk,8717
|
@@ -314,16 +314,17 @@ unit_tests/sources/file_based/availability_strategy/__init__.py,sha256=47DEQpj8H
|
|
314
314
|
unit_tests/sources/file_based/availability_strategy/test_default_file_based_availability_strategy.py,sha256=HzxFJVJFv3YpjVmJm45ZyS2HpbnhtEX2hm4r8VjkRFE,2463
|
315
315
|
unit_tests/sources/file_based/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
316
316
|
unit_tests/sources/file_based/config/test_abstract_file_based_spec.py,sha256=wmZAC-nBiUedMZi0n4zaC9oiZD9UTuYP5zJC1xxRnME,1216
|
317
|
+
unit_tests/sources/file_based/config/test_csv_format.py,sha256=VYL-9Ec8hW_yO2Pj9F8pDfITcgEAFtSublYda7ut7QE,1132
|
317
318
|
unit_tests/sources/file_based/config/test_file_based_stream_config.py,sha256=1eMsHlMQIFwyw20HjnhgKuiw6399sMcLTQ4LP09kTT4,3060
|
318
319
|
unit_tests/sources/file_based/file_types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
319
320
|
unit_tests/sources/file_based/file_types/test_avro_parser.py,sha256=INqwKXcgNb3h_tktNXYU6WNUD-iNwRYHCd3IrnQa5R4,11051
|
320
|
-
unit_tests/sources/file_based/file_types/test_csv_parser.py,sha256=
|
321
|
+
unit_tests/sources/file_based/file_types/test_csv_parser.py,sha256=4onvErJCMNeSquZr7c1dX4TzqJlvQ3wulYCjAU_IblU,21266
|
321
322
|
unit_tests/sources/file_based/file_types/test_jsonl_parser.py,sha256=foTf9U9LyAS8OR0BonwNgFWPqTrmzFV2lpPUfRMrioE,6134
|
322
323
|
unit_tests/sources/file_based/file_types/test_parquet_parser.py,sha256=D7sKTty8aEqMDWWGKWUqDbWjTxhGkygU7ns4-_JceRY,13543
|
323
324
|
unit_tests/sources/file_based/scenarios/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
324
325
|
unit_tests/sources/file_based/scenarios/avro_scenarios.py,sha256=xUfw0crAvSTgQ2-chJx2ZiigQyo5IfrCuOFC1TWXXsQ,29795
|
325
326
|
unit_tests/sources/file_based/scenarios/check_scenarios.py,sha256=xZf28rlH93ap8JCkAjwocng-uAW-mvMx6BDOLbvVCig,5588
|
326
|
-
unit_tests/sources/file_based/scenarios/csv_scenarios.py,sha256=
|
327
|
+
unit_tests/sources/file_based/scenarios/csv_scenarios.py,sha256=nG4O2Ah0Uwgjg6SVTuioO_gPOigKxm-PlM2Tw21svYw,98724
|
327
328
|
unit_tests/sources/file_based/scenarios/incremental_scenarios.py,sha256=0maHng11cFmvzFLOniyBxOEYoKj4DYR3NO9-pSYoFLs,60710
|
328
329
|
unit_tests/sources/file_based/scenarios/jsonl_scenarios.py,sha256=N83fga4gMKkbm6hYnen1Z5p5eEgjnMB_M_sXx6B96cU,27503
|
329
330
|
unit_tests/sources/file_based/scenarios/parquet_scenarios.py,sha256=-cBO1ZwberBxNMqDOtKz8yGwm3zB7elz_st2NKHeczM,26955
|
@@ -354,8 +355,8 @@ unit_tests/utils/test_schema_inferrer.py,sha256=Z2jHBZ540wnYkylIdV_2xr75Vtwlxuyg
|
|
354
355
|
unit_tests/utils/test_secret_utils.py,sha256=XKe0f1RHYii8iwE6ATmBr5JGDI1pzzrnZUGdUSMJQP4,4886
|
355
356
|
unit_tests/utils/test_stream_status_utils.py,sha256=NpV155JMXA6CG-2Zvofa14lItobyh3Onttc59X4m5DI,3382
|
356
357
|
unit_tests/utils/test_traced_exception.py,sha256=bDFP5zMBizFenz6V2WvEZTRCKGB5ijh3DBezjbfoYIs,4198
|
357
|
-
airbyte_cdk-0.51.
|
358
|
-
airbyte_cdk-0.51.
|
359
|
-
airbyte_cdk-0.51.
|
360
|
-
airbyte_cdk-0.51.
|
361
|
-
airbyte_cdk-0.51.
|
358
|
+
airbyte_cdk-0.51.7.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
|
359
|
+
airbyte_cdk-0.51.7.dist-info/METADATA,sha256=YOrAlHsZod4Nq3VugY7nbE7MDd8r8ZU7gcvX4YzTuk0,9399
|
360
|
+
airbyte_cdk-0.51.7.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
|
361
|
+
airbyte_cdk-0.51.7.dist-info/top_level.txt,sha256=edvsDKTnE6sD2wfCUaeTfKf5gQIL6CPVMwVL2sWZzqo,51
|
362
|
+
airbyte_cdk-0.51.7.dist-info/RECORD,,
|
@@ -0,0 +1,28 @@
|
|
1
|
+
#
|
2
|
+
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
3
|
+
#
|
4
|
+
|
5
|
+
import unittest
|
6
|
+
|
7
|
+
import pytest
|
8
|
+
from airbyte_cdk.sources.file_based.config.csv_format import CsvHeaderAutogenerated, CsvHeaderFromCsv, CsvHeaderUserProvided
|
9
|
+
from pydantic import ValidationError
|
10
|
+
|
11
|
+
|
12
|
+
class CsvHeaderDefinitionTest(unittest.TestCase):
|
13
|
+
def test_given_user_provided_and_not_column_names_provided_then_raise_exception(self) -> None:
|
14
|
+
with pytest.raises(ValidationError):
|
15
|
+
CsvHeaderUserProvided(column_names=[])
|
16
|
+
|
17
|
+
def test_given_user_provided_and_column_names_then_config_is_valid(self) -> None:
|
18
|
+
# no error means that this test succeeds
|
19
|
+
CsvHeaderUserProvided(column_names=["1", "2", "3"])
|
20
|
+
|
21
|
+
def test_given_user_provided_then_csv_does_not_have_header_row(self) -> None:
|
22
|
+
assert not CsvHeaderUserProvided(column_names=["1", "2", "3"]).has_header_row()
|
23
|
+
|
24
|
+
def test_given_autogenerated_then_csv_does_not_have_header_row(self) -> None:
|
25
|
+
assert not CsvHeaderAutogenerated().has_header_row()
|
26
|
+
|
27
|
+
def test_given_from_csv_then_csv_has_header_row(self) -> None:
|
28
|
+
assert CsvHeaderFromCsv().has_header_row()
|
@@ -13,7 +13,14 @@ from unittest import TestCase, mock
|
|
13
13
|
from unittest.mock import Mock
|
14
14
|
|
15
15
|
import pytest
|
16
|
-
from airbyte_cdk.sources.file_based.config.csv_format import
|
16
|
+
from airbyte_cdk.sources.file_based.config.csv_format import (
|
17
|
+
DEFAULT_FALSE_VALUES,
|
18
|
+
DEFAULT_TRUE_VALUES,
|
19
|
+
CsvFormat,
|
20
|
+
CsvHeaderAutogenerated,
|
21
|
+
CsvHeaderUserProvided,
|
22
|
+
InferenceType,
|
23
|
+
)
|
17
24
|
from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
|
18
25
|
from airbyte_cdk.sources.file_based.exceptions import RecordParseError
|
19
26
|
from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
|
@@ -278,13 +285,28 @@ class CsvReaderTest(unittest.TestCase):
|
|
278
285
|
assert list(data_generator) == [{"header": "a value"}, {"header": "another value"}]
|
279
286
|
|
280
287
|
def test_given_autogenerated_headers_when_read_data_then_generate_headers_with_format_fX(self) -> None:
|
281
|
-
self._config_format.
|
288
|
+
self._config_format.header_definition = CsvHeaderAutogenerated()
|
282
289
|
self._stream_reader.open_file.return_value = CsvFileBuilder().with_data(["0,1,2,3,4,5,6"]).build()
|
283
290
|
|
284
291
|
data_generator = self._read_data()
|
285
292
|
|
286
293
|
assert list(data_generator) == [{"f0": "0", "f1": "1", "f2": "2", "f3": "3", "f4": "4", "f5": "5", "f6": "6"}]
|
287
294
|
|
295
|
+
def test_given_user_provided_headers_when_read_data_then_use_user_provided_headers(self) -> None:
|
296
|
+
self._config_format.header_definition = CsvHeaderUserProvided(column_names=["first", "second", "third", "fourth"])
|
297
|
+
self._stream_reader.open_file.return_value = CsvFileBuilder().with_data(["0,1,2,3"]).build()
|
298
|
+
|
299
|
+
data_generator = self._read_data()
|
300
|
+
|
301
|
+
assert list(data_generator) == [{"first": "0", "second": "1", "third": "2", "fourth": "3"}]
|
302
|
+
|
303
|
+
def test_given_len_mistmatch_on_user_provided_headers_when_read_data_then_raise_error(self) -> None:
|
304
|
+
self._config_format.header_definition = CsvHeaderUserProvided(column_names=["missing", "one", "column"])
|
305
|
+
self._stream_reader.open_file.return_value = CsvFileBuilder().with_data(["0,1,2,3"]).build()
|
306
|
+
|
307
|
+
with pytest.raises(RecordParseError):
|
308
|
+
list(self._read_data())
|
309
|
+
|
288
310
|
def test_given_skip_rows_after_header_when_read_data_then_do_not_parse_skipped_rows(self) -> None:
|
289
311
|
self._config_format.skip_rows_after_header = 1
|
290
312
|
self._stream_reader.open_file.return_value = (
|
@@ -180,11 +180,43 @@ single_csv_scenario = (
|
|
180
180
|
"default": 0,
|
181
181
|
"type": "integer",
|
182
182
|
},
|
183
|
-
"
|
184
|
-
"title": "
|
185
|
-
"
|
186
|
-
"
|
187
|
-
"
|
183
|
+
"header_definition": {
|
184
|
+
"title": "CSV Header Definition",
|
185
|
+
"type": "object",
|
186
|
+
"description": "How headers will be defined. `User Provided` assumes the CSV does not have a header row and uses the headers provided and `Autogenerated` assumes the CSV does not have a header row and the CDK will generate headers using for `f{i}` where `i` is the index starting from 0. Else, the default behavior is to use the header from the CSV file. If a user wants to autogenerate or provide column names for a CSV having headers, they can skip rows.",
|
187
|
+
"default": {"header_definition_type": "From CSV"},
|
188
|
+
"oneOf": [
|
189
|
+
{
|
190
|
+
"title": "From CSV",
|
191
|
+
"type": "object",
|
192
|
+
"properties": {
|
193
|
+
"header_definition_type": {"title": "Header Definition Type", "default": "From CSV", "enum": ["From CSV"], "type": "string"},
|
194
|
+
},
|
195
|
+
},
|
196
|
+
{
|
197
|
+
"title": "Autogenerated",
|
198
|
+
"type": "object",
|
199
|
+
"properties": {
|
200
|
+
"header_definition_type": {"title": "Header Definition Type", "default": "Autogenerated", "enum": ["Autogenerated"], "type": "string"},
|
201
|
+
},
|
202
|
+
},
|
203
|
+
{
|
204
|
+
"title": "User Provided",
|
205
|
+
"type": "object",
|
206
|
+
"properties": {
|
207
|
+
"header_definition_type": {"title": "Header Definition Type", "default": "User Provided", "enum": ["User Provided"], "type": "string"},
|
208
|
+
"column_names": {
|
209
|
+
"title": "Column Names",
|
210
|
+
"description": "The column names that will be used while emitting the CSV records",
|
211
|
+
"type": "array",
|
212
|
+
"items": {
|
213
|
+
"type": "string"
|
214
|
+
},
|
215
|
+
}
|
216
|
+
},
|
217
|
+
"required": ["column_names"]
|
218
|
+
},
|
219
|
+
]
|
188
220
|
},
|
189
221
|
"true_values": {
|
190
222
|
"title": "True Values",
|
@@ -761,7 +793,6 @@ csv_multi_stream_scenario = (
|
|
761
793
|
)
|
762
794
|
).build()
|
763
795
|
|
764
|
-
|
765
796
|
csv_custom_format_scenario = (
|
766
797
|
TestScenarioBuilder()
|
767
798
|
.set_name("csv_custom_format")
|
@@ -868,7 +899,6 @@ csv_custom_format_scenario = (
|
|
868
899
|
)
|
869
900
|
).build()
|
870
901
|
|
871
|
-
|
872
902
|
multi_stream_custom_format = (
|
873
903
|
TestScenarioBuilder()
|
874
904
|
.set_name("multi_stream_custom_format_scenario")
|
@@ -1016,7 +1046,6 @@ multi_stream_custom_format = (
|
|
1016
1046
|
)
|
1017
1047
|
).build()
|
1018
1048
|
|
1019
|
-
|
1020
1049
|
empty_schema_inference_scenario = (
|
1021
1050
|
TestScenarioBuilder()
|
1022
1051
|
.set_name("empty_schema_inference_scenario")
|
@@ -1092,7 +1121,6 @@ empty_schema_inference_scenario = (
|
|
1092
1121
|
)
|
1093
1122
|
).build()
|
1094
1123
|
|
1095
|
-
|
1096
1124
|
schemaless_csv_scenario = (
|
1097
1125
|
TestScenarioBuilder()
|
1098
1126
|
.set_name("schemaless_csv_scenario")
|
@@ -1188,7 +1216,6 @@ schemaless_csv_scenario = (
|
|
1188
1216
|
)
|
1189
1217
|
).build()
|
1190
1218
|
|
1191
|
-
|
1192
1219
|
schemaless_csv_multi_stream_scenario = (
|
1193
1220
|
TestScenarioBuilder()
|
1194
1221
|
.set_name("schemaless_csv_multi_stream_scenario")
|
@@ -1296,7 +1323,6 @@ schemaless_csv_multi_stream_scenario = (
|
|
1296
1323
|
)
|
1297
1324
|
).build()
|
1298
1325
|
|
1299
|
-
|
1300
1326
|
schemaless_with_user_input_schema_fails_connection_check_scenario = (
|
1301
1327
|
TestScenarioBuilder()
|
1302
1328
|
.set_name("schemaless_with_user_input_schema_fails_connection_check_scenario")
|
@@ -1361,7 +1387,6 @@ schemaless_with_user_input_schema_fails_connection_check_scenario = (
|
|
1361
1387
|
.set_expected_read_error(ConfigValidationError, FileBasedSourceError.CONFIG_VALIDATION_ERROR.value)
|
1362
1388
|
).build()
|
1363
1389
|
|
1364
|
-
|
1365
1390
|
schemaless_with_user_input_schema_fails_connection_check_multi_stream_scenario = (
|
1366
1391
|
TestScenarioBuilder()
|
1367
1392
|
.set_name("schemaless_with_user_input_schema_fails_connection_check_multi_stream_scenario")
|
@@ -1446,7 +1471,6 @@ schemaless_with_user_input_schema_fails_connection_check_multi_stream_scenario =
|
|
1446
1471
|
.set_expected_read_error(ConfigValidationError, FileBasedSourceError.CONFIG_VALIDATION_ERROR.value)
|
1447
1472
|
).build()
|
1448
1473
|
|
1449
|
-
|
1450
1474
|
csv_string_can_be_null_with_input_schemas_scenario = (
|
1451
1475
|
TestScenarioBuilder()
|
1452
1476
|
.set_name("csv_string_can_be_null_with_input_schema")
|
@@ -2143,7 +2167,6 @@ csv_custom_delimiter_in_double_quotes_scenario = (
|
|
2143
2167
|
)
|
2144
2168
|
).build()
|
2145
2169
|
|
2146
|
-
|
2147
2170
|
csv_skip_before_header_scenario = (
|
2148
2171
|
TestScenarioBuilder()
|
2149
2172
|
.set_name("csv_skip_before_header")
|
@@ -2278,7 +2301,6 @@ csv_skip_after_header_scenario = (
|
|
2278
2301
|
)
|
2279
2302
|
).build()
|
2280
2303
|
|
2281
|
-
|
2282
2304
|
csv_skip_before_and_after_header_scenario = (
|
2283
2305
|
TestScenarioBuilder()
|
2284
2306
|
.set_name("csv_skip_before_after_header")
|
@@ -2363,7 +2385,7 @@ csv_autogenerate_column_names_scenario = (
|
|
2363
2385
|
"validation_policy": "Emit Record",
|
2364
2386
|
"format": {
|
2365
2387
|
"filetype": "csv",
|
2366
|
-
"
|
2388
|
+
"header_definition": {"header_definition_type": "Autogenerated"},
|
2367
2389
|
},
|
2368
2390
|
}
|
2369
2391
|
],
|
@@ -2556,7 +2578,6 @@ csv_custom_null_values_scenario = (
|
|
2556
2578
|
)
|
2557
2579
|
).build()
|
2558
2580
|
|
2559
|
-
|
2560
2581
|
earlier_csv_scenario = (
|
2561
2582
|
TestScenarioBuilder()
|
2562
2583
|
.set_name("earlier_csv_stream")
|
File without changes
|
File without changes
|
File without changes
|