airbyte-cdk 0.51.6__py3-none-any.whl → 0.51.7__py3-none-any.whl

--- a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py
+++ b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py
@@ -1,7 +1,3 @@
-#
-# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
-#
-
 # generated by datamodel-codegen:
 # filename: declarative_component_schema.yaml
 
--- a/airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py
+++ b/airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py
@@ -66,9 +66,7 @@ class AbstractFileBasedSpec(BaseModel):
         for format in objects_to_check["oneOf"]:
             for key in format["properties"]:
                 object_property = format["properties"][key]
-                if "allOf" in object_property and "enum" in object_property["allOf"][0]:
-                    object_property["enum"] = object_property["allOf"][0]["enum"]
-                    object_property.pop("allOf")
+                AbstractFileBasedSpec.move_enum_to_root(object_property)
 
         properties_to_change = ["validation_policy"]
         for property_to_change in properties_to_change:
@@ -76,7 +74,24 @@ class AbstractFileBasedSpec(BaseModel):
             if "anyOf" in property_object:
                 schema["properties"]["streams"]["items"]["properties"][property_to_change]["type"] = "object"
                 schema["properties"]["streams"]["items"]["properties"][property_to_change]["oneOf"] = property_object.pop("anyOf")
-            if "allOf" in property_object and "enum" in property_object["allOf"][0]:
-                property_object["enum"] = property_object["allOf"][0]["enum"]
-                property_object.pop("allOf")
+            AbstractFileBasedSpec.move_enum_to_root(property_object)
+
+        csv_format_schemas = list(
+            filter(
+                lambda format: format["properties"]["filetype"]["default"] == "csv",
+                schema["properties"]["streams"]["items"]["properties"]["format"]["oneOf"],
+            )
+        )
+        if len(csv_format_schemas) != 1:
+            raise ValueError(f"Expecting only one CSV format but got {csv_format_schemas}")
+        csv_format_schemas[0]["properties"]["header_definition"]["oneOf"] = csv_format_schemas[0]["properties"]["header_definition"].pop(
+            "anyOf", []
+        )
+        csv_format_schemas[0]["properties"]["header_definition"]["type"] = "object"
 
         return schema
+
+    @staticmethod
+    def move_enum_to_root(object_property: Dict[str, Any]) -> None:
+        if "allOf" in object_property and "enum" in object_property["allOf"][0]:
+            object_property["enum"] = object_property["allOf"][0]["enum"]
+            object_property.pop("allOf")
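
Note on the change above: the duplicated allOf-to-enum hoisting in `AbstractFileBasedSpec.schema()` is factored into the new `move_enum_to_root` static method, and the CSV `header_definition` subschema is converted from pydantic's `anyOf` output to the `oneOf` form expected in connector specs. A minimal sketch of the hoisting on a sample property dict (the input dict here is illustrative, not taken from the package):

    from typing import Any, Dict

    def move_enum_to_root(object_property: Dict[str, Any]) -> None:
        # pydantic emits enum-typed fields as {"allOf": [{"enum": [...]}]};
        # hoist the enum so the spec exposes a flat {"enum": [...]} property.
        if "allOf" in object_property and "enum" in object_property["allOf"][0]:
            object_property["enum"] = object_property["allOf"][0]["enum"]
            object_property.pop("allOf")

    prop = {"allOf": [{"enum": ["csv", "jsonl"]}]}  # illustrative input
    move_enum_to_root(prop)
    assert prop == {"enum": ["csv", "jsonl"]}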
--- a/airbyte_cdk/sources/file_based/config/csv_format.py
+++ b/airbyte_cdk/sources/file_based/config/csv_format.py
@@ -4,9 +4,9 @@
 
 import codecs
 from enum import Enum
-from typing import Optional, Set
+from typing import Any, Dict, List, Optional, Set, Union
 
-from pydantic import BaseModel, Field, validator
+from pydantic import BaseModel, Field, ValidationError, root_validator, validator
 from typing_extensions import Literal
 
 
@@ -15,6 +15,52 @@ class InferenceType(Enum):
     PRIMITIVE_TYPES_ONLY = "Primitive Types Only"
 
 
+class CsvHeaderDefinitionType(Enum):
+    FROM_CSV = "From CSV"
+    AUTOGENERATED = "Autogenerated"
+    USER_PROVIDED = "User Provided"
+
+
+class CsvHeaderFromCsv(BaseModel):
+    class Config:
+        title = "From CSV"
+
+    header_definition_type: Literal[CsvHeaderDefinitionType.FROM_CSV.value] = CsvHeaderDefinitionType.FROM_CSV.value  # type: ignore
+
+    def has_header_row(self) -> bool:
+        return True
+
+
+class CsvHeaderAutogenerated(BaseModel):
+    class Config:
+        title = "Autogenerated"
+
+    header_definition_type: Literal[CsvHeaderDefinitionType.AUTOGENERATED.value] = CsvHeaderDefinitionType.AUTOGENERATED.value  # type: ignore
+
+    def has_header_row(self) -> bool:
+        return False
+
+
+class CsvHeaderUserProvided(BaseModel):
+    class Config:
+        title = "User Provided"
+
+    header_definition_type: Literal[CsvHeaderDefinitionType.USER_PROVIDED.value] = CsvHeaderDefinitionType.USER_PROVIDED.value  # type: ignore
+    column_names: List[str] = Field(
+        title="Column Names",
+        description="The column names that will be used while emitting the CSV records",
+    )
+
+    def has_header_row(self) -> bool:
+        return False
+
+    @validator("column_names")
+    def validate_column_names(cls, v: List[str]) -> List[str]:
+        if not v:
+            raise ValueError("At least one column name needs to be provided when using user provided headers")
+        return v
+
+
 DEFAULT_TRUE_VALUES = ["y", "yes", "t", "true", "on", "1"]
 DEFAULT_FALSE_VALUES = ["n", "no", "f", "false", "off", "0"]
 
@@ -64,10 +110,10 @@ class CsvFormat(BaseModel):
     skip_rows_after_header: int = Field(
         title="Skip Rows After Header", default=0, description="The number of rows to skip after the header row."
    )
-    autogenerate_column_names: bool = Field(
-        title="Autogenerate Column Names",
-        default=False,
-        description="Whether to autogenerate column names if column_names is empty. If true, column names will be of the form “f0”, “f1”… If false, column names will be read from the first CSV row after skip_rows_before_header.",
+    header_definition: Union[CsvHeaderFromCsv, CsvHeaderAutogenerated, CsvHeaderUserProvided] = Field(
+        title="CSV Header Definition",
+        default=CsvHeaderFromCsv(),
+        description="How headers will be defined. `User Provided` assumes the CSV does not have a header row and uses the headers provided and `Autogenerated` assumes the CSV does not have a header row and the CDK will generate headers using for `f{i}` where `i` is the index starting from 0. Else, the default behavior is to use the header from the CSV file. If a user wants to autogenerate or provide column names for a CSV having headers, they can skip rows.",
     )
     true_values: Set[str] = Field(
         title="True Values",
@@ -113,3 +159,15 @@ class CsvFormat(BaseModel):
         except LookupError:
             raise ValueError(f"invalid encoding format: {v}")
         return v
+
+    @root_validator
+    def validate_optional_args(cls, values: Dict[str, Any]) -> Dict[str, Any]:
+        definition_type = values.get("header_definition_type")
+        column_names = values.get("user_provided_column_names")
+        if definition_type == CsvHeaderDefinitionType.USER_PROVIDED and not column_names:
+            raise ValidationError("`user_provided_column_names` should be defined if the definition 'User Provided'.", model=CsvFormat)
+        if definition_type != CsvHeaderDefinitionType.USER_PROVIDED and column_names:
+            raise ValidationError(
+                "`user_provided_column_names` should not be defined if the definition is not 'User Provided'.", model=CsvFormat
+            )
+        return values
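
Note on the change above: the boolean `autogenerate_column_names` is replaced by a `header_definition` union of three models, each reporting via `has_header_row()` whether the file contains a physical header row. A minimal usage sketch, assuming `CsvFormat`'s other fields keep their defaults (the validation behavior shown matches the validators in the diff):

    from pydantic import ValidationError

    from airbyte_cdk.sources.file_based.config.csv_format import (
        CsvFormat,
        CsvHeaderAutogenerated,
        CsvHeaderUserProvided,
    )

    assert CsvFormat().header_definition.has_header_row()  # default is From CSV
    assert not CsvHeaderAutogenerated().has_header_row()   # CDK generates f0, f1, ...

    CsvHeaderUserProvided(column_names=["id", "name"])     # valid
    try:
        CsvHeaderUserProvided(column_names=[])             # rejected by validate_column_names
    except ValidationError as exc:
        print(exc)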
--- a/airbyte_cdk/sources/file_based/file_types/csv_parser.py
+++ b/airbyte_cdk/sources/file_based/file_types/csv_parser.py
@@ -11,7 +11,7 @@ from functools import partial
 from io import IOBase
 from typing import Any, Callable, Dict, Generator, Iterable, List, Mapping, Optional, Set
 
-from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat, InferenceType
+from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat, CsvHeaderAutogenerated, CsvHeaderUserProvided, InferenceType
 from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
 from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError
 from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
@@ -48,11 +48,9 @@ class _CsvReader:
         with stream_reader.open_file(file, file_read_mode, config_format.encoding, logger) as fp:
             headers = self._get_headers(fp, config_format, dialect_name)
 
-            # we assume that if we autogenerate columns, it is because we don't have headers
-            # if a user wants to autogenerate_column_names with a CSV having headers, he can skip rows
             rows_to_skip = (
                 config_format.skip_rows_before_header
-                + (0 if config_format.autogenerate_column_names else 1)
+                + (1 if config_format.header_definition.has_header_row() else 0)
                 + config_format.skip_rows_after_header
             )
             self._skip_rows(fp, rows_to_skip)
@@ -74,8 +72,11 @@ class _CsvReader:
         Assumes the fp is pointing to the beginning of the files and will reset it as such
         """
         # Note that this method assumes the dialect has already been registered if we're parsing the headers
+        if isinstance(config_format.header_definition, CsvHeaderUserProvided):
+            return config_format.header_definition.column_names  # type: ignore  # should be CsvHeaderUserProvided given the type
+
         self._skip_rows(fp, config_format.skip_rows_before_header)
-        if config_format.autogenerate_column_names:
+        if isinstance(config_format.header_definition, CsvHeaderAutogenerated):
             headers = self._auto_generate_headers(fp, dialect_name)
         else:
             # Then read the header
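
Note on the change above: `_CsvReader` now dispatches on the header definition type — user-provided names are returned without reading the file, autogenerated names skip header parsing, and only the `From CSV` case counts a header row when computing how many rows to skip. A standalone sketch of the skip computation (the helper name is illustrative; `skip_rows_before_header` is assumed to default to 0 like `skip_rows_after_header`):

    from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat, CsvHeaderAutogenerated

    def rows_to_skip(config_format: CsvFormat) -> int:
        # Only a real header row ("From CSV") adds one row to skip.
        return (
            config_format.skip_rows_before_header
            + (1 if config_format.header_definition.has_header_row() else 0)
            + config_format.skip_rows_after_header
        )

    assert rows_to_skip(CsvFormat()) == 1  # default: one header row, no extra skips
    assert rows_to_skip(CsvFormat(header_definition=CsvHeaderAutogenerated())) == 0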
--- a/airbyte_cdk-0.51.6.dist-info/METADATA
+++ b/airbyte_cdk-0.51.7.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: airbyte-cdk
-Version: 0.51.6
+Version: 0.51.7
 Summary: A framework for writing Airbyte Connectors.
 Home-page: https://github.com/airbytehq/airbyte
 Author: Airbyte
--- a/airbyte_cdk-0.51.6.dist-info/RECORD
+++ b/airbyte_cdk-0.51.7.dist-info/RECORD
@@ -64,7 +64,7 @@ airbyte_cdk/sources/declarative/interpolation/interpolation.py,sha256=dyIM-bzh54
 airbyte_cdk/sources/declarative/interpolation/jinja.py,sha256=Dc0F87nElWsz_Ikj938eQ9uqZvyqgFhZ8Dqf_-hvndc,4800
 airbyte_cdk/sources/declarative/interpolation/macros.py,sha256=V6WGKJ9cXX1rjuM4bK3Cs9xEryMlkY2U3FMsSBhrgC8,3098
 airbyte_cdk/sources/declarative/models/__init__.py,sha256=EiYnzwCHZV7EYqMJqcy6xKSeHvTKZBsQndjbEwmiTW4,93
-airbyte_cdk/sources/declarative/models/declarative_component_schema.py,sha256=-Y8Nw3-6ZtvsATixMSAWteHCvYQU965dn4NpVq6aWYs,57232
+airbyte_cdk/sources/declarative/models/declarative_component_schema.py,sha256=ebor38wlQVqYD2QXk5X8v9xDZl0cEpIc2mFaKvpuiPE,57170
 airbyte_cdk/sources/declarative/parsers/__init__.py,sha256=ZnqYNxHsKCgO38IwB34RQyRMXTs4GTvlRi3ImKnIioo,61
 airbyte_cdk/sources/declarative/parsers/class_types_registry.py,sha256=bK4a74opm6WHyV7HqOVws6GE5Z7cLNc5MaTha69abIQ,6086
 airbyte_cdk/sources/declarative/parsers/custom_exceptions.py,sha256=y7_G5mM07zxT5YG975kdC2PAja-Uc83pYp8WrV3GNdo,522
@@ -146,9 +146,9 @@ airbyte_cdk/sources/file_based/availability_strategy/__init__.py,sha256=WiPPuQGf
 airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py,sha256=NeHCiG4FFohzYpQQFfmTL4-5oI0nElHWgXX1xrm8-SU,1269
 airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py,sha256=InGBlGbInuNUQ6oaK5A9oICVc7ZNHMSYo8g5Vy2smOo,4266
 airbyte_cdk/sources/file_based/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py,sha256=P-CupGlif5XbKm6tc3FVC4WRMU4ogUbB3klcuZmZJ1k,3940
+airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py,sha256=WrV4sKtJoZ1dK31HK7NdBKlnYHkmu6NqjmEpkVqJ6tQ,4582
 airbyte_cdk/sources/file_based/config/avro_format.py,sha256=qGBB0RTjWDGZW-ilIwIq9OZl2BC-jBaq2WGrI3WVBsQ,597
-airbyte_cdk/sources/file_based/config/csv_format.py,sha256=lLyjOqp2gNrXcGtSWozheMMfUQcy0NBUAMWwmDr_B7A,4672
+airbyte_cdk/sources/file_based/config/csv_format.py,sha256=-r-uGQlo-nXfhPuOR05XtYx_1vht74r8_am2_p8mcP8,7166
 airbyte_cdk/sources/file_based/config/file_based_stream_config.py,sha256=5R0UlPJUGGx5OnpezZ0Fd8dyO4y2vMZtiPZR_3rfvSk,5916
 airbyte_cdk/sources/file_based/config/jsonl_format.py,sha256=B-s1uy9RiKpKMwmMlR7UT3WeQPlTI-xclD0fVM4IU1Q,254
 airbyte_cdk/sources/file_based/config/parquet_format.py,sha256=zvcHATNKoBIgU2UXuGnoldqLoRXG_X8ZzAkpqGPJtq4,625
@@ -157,7 +157,7 @@ airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py,sha
 airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py,sha256=Mx3zT9Dem4uNfaUT0oOtrESsuB1LrGAi5N-uw2swZZA,701
 airbyte_cdk/sources/file_based/file_types/__init__.py,sha256=N3a8cjXwRUN2__46IJTwrWlsyFiSA1xtSgPcPH28sn0,476
 airbyte_cdk/sources/file_based/file_types/avro_parser.py,sha256=7PVaW17wn80HYW1mu074X2dy0UgFoqFqGIOKN2ZMKD0,8686
-airbyte_cdk/sources/file_based/file_types/csv_parser.py,sha256=4YdsyH7ntoadhRqMXPl8rertWsQbtE0aJihp9V5zIlg,16586
+airbyte_cdk/sources/file_based/file_types/csv_parser.py,sha256=KgdpdkHAFducvXM2jQr356M0WVol-vX0cm42n9Kf_Yc,16684
 airbyte_cdk/sources/file_based/file_types/file_type_parser.py,sha256=cThTLc1YKSAapOn70lB09SzruRIPSShGIMz1f92QYV8,1555
 airbyte_cdk/sources/file_based/file_types/jsonl_parser.py,sha256=Kz6HLF0CrFHQ1Y6rJKGr7KmBWSLeDYFQmkg0WIi7Frg,5395
 airbyte_cdk/sources/file_based/file_types/parquet_parser.py,sha256=QulQ_soGb1LpQ_KTxqWZjmfACGkTUDUOeuSmNFtcSLk,8717
@@ -314,16 +314,17 @@ unit_tests/sources/file_based/availability_strategy/__init__.py,sha256=47DEQpj8H
 unit_tests/sources/file_based/availability_strategy/test_default_file_based_availability_strategy.py,sha256=HzxFJVJFv3YpjVmJm45ZyS2HpbnhtEX2hm4r8VjkRFE,2463
 unit_tests/sources/file_based/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unit_tests/sources/file_based/config/test_abstract_file_based_spec.py,sha256=wmZAC-nBiUedMZi0n4zaC9oiZD9UTuYP5zJC1xxRnME,1216
+unit_tests/sources/file_based/config/test_csv_format.py,sha256=VYL-9Ec8hW_yO2Pj9F8pDfITcgEAFtSublYda7ut7QE,1132
 unit_tests/sources/file_based/config/test_file_based_stream_config.py,sha256=1eMsHlMQIFwyw20HjnhgKuiw6399sMcLTQ4LP09kTT4,3060
 unit_tests/sources/file_based/file_types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unit_tests/sources/file_based/file_types/test_avro_parser.py,sha256=INqwKXcgNb3h_tktNXYU6WNUD-iNwRYHCd3IrnQa5R4,11051
-unit_tests/sources/file_based/file_types/test_csv_parser.py,sha256=KB4WDy3aMAZ0CmJiqFaTUOZlK4urpvG9bwcwQ-h2-VY,20303
+unit_tests/sources/file_based/file_types/test_csv_parser.py,sha256=4onvErJCMNeSquZr7c1dX4TzqJlvQ3wulYCjAU_IblU,21266
 unit_tests/sources/file_based/file_types/test_jsonl_parser.py,sha256=foTf9U9LyAS8OR0BonwNgFWPqTrmzFV2lpPUfRMrioE,6134
 unit_tests/sources/file_based/file_types/test_parquet_parser.py,sha256=D7sKTty8aEqMDWWGKWUqDbWjTxhGkygU7ns4-_JceRY,13543
 unit_tests/sources/file_based/scenarios/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unit_tests/sources/file_based/scenarios/avro_scenarios.py,sha256=xUfw0crAvSTgQ2-chJx2ZiigQyo5IfrCuOFC1TWXXsQ,29795
 unit_tests/sources/file_based/scenarios/check_scenarios.py,sha256=xZf28rlH93ap8JCkAjwocng-uAW-mvMx6BDOLbvVCig,5588
-unit_tests/sources/file_based/scenarios/csv_scenarios.py,sha256=UVdDblKj3R5qQkh-dj4xqZ2822GyJuymaAerWbX9HeE,95707
+unit_tests/sources/file_based/scenarios/csv_scenarios.py,sha256=nG4O2Ah0Uwgjg6SVTuioO_gPOigKxm-PlM2Tw21svYw,98724
 unit_tests/sources/file_based/scenarios/incremental_scenarios.py,sha256=0maHng11cFmvzFLOniyBxOEYoKj4DYR3NO9-pSYoFLs,60710
 unit_tests/sources/file_based/scenarios/jsonl_scenarios.py,sha256=N83fga4gMKkbm6hYnen1Z5p5eEgjnMB_M_sXx6B96cU,27503
 unit_tests/sources/file_based/scenarios/parquet_scenarios.py,sha256=-cBO1ZwberBxNMqDOtKz8yGwm3zB7elz_st2NKHeczM,26955
@@ -354,8 +355,8 @@ unit_tests/utils/test_schema_inferrer.py,sha256=Z2jHBZ540wnYkylIdV_2xr75Vtwlxuyg
 unit_tests/utils/test_secret_utils.py,sha256=XKe0f1RHYii8iwE6ATmBr5JGDI1pzzrnZUGdUSMJQP4,4886
 unit_tests/utils/test_stream_status_utils.py,sha256=NpV155JMXA6CG-2Zvofa14lItobyh3Onttc59X4m5DI,3382
 unit_tests/utils/test_traced_exception.py,sha256=bDFP5zMBizFenz6V2WvEZTRCKGB5ijh3DBezjbfoYIs,4198
-airbyte_cdk-0.51.6.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
-airbyte_cdk-0.51.6.dist-info/METADATA,sha256=6H-ZFSOZ3NomomXDhz87qdpIVMRJ1Abd2x61R9cHV0Y,9399
-airbyte_cdk-0.51.6.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
-airbyte_cdk-0.51.6.dist-info/top_level.txt,sha256=edvsDKTnE6sD2wfCUaeTfKf5gQIL6CPVMwVL2sWZzqo,51
-airbyte_cdk-0.51.6.dist-info/RECORD,,
+airbyte_cdk-0.51.7.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
+airbyte_cdk-0.51.7.dist-info/METADATA,sha256=YOrAlHsZod4Nq3VugY7nbE7MDd8r8ZU7gcvX4YzTuk0,9399
+airbyte_cdk-0.51.7.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
+airbyte_cdk-0.51.7.dist-info/top_level.txt,sha256=edvsDKTnE6sD2wfCUaeTfKf5gQIL6CPVMwVL2sWZzqo,51
+airbyte_cdk-0.51.7.dist-info/RECORD,,
--- /dev/null
+++ b/unit_tests/sources/file_based/config/test_csv_format.py
@@ -0,0 +1,28 @@
+#
+# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
+#
+
+import unittest
+
+import pytest
+from airbyte_cdk.sources.file_based.config.csv_format import CsvHeaderAutogenerated, CsvHeaderFromCsv, CsvHeaderUserProvided
+from pydantic import ValidationError
+
+
+class CsvHeaderDefinitionTest(unittest.TestCase):
+    def test_given_user_provided_and_not_column_names_provided_then_raise_exception(self) -> None:
+        with pytest.raises(ValidationError):
+            CsvHeaderUserProvided(column_names=[])
+
+    def test_given_user_provided_and_column_names_then_config_is_valid(self) -> None:
+        # no error means that this test succeeds
+        CsvHeaderUserProvided(column_names=["1", "2", "3"])
+
+    def test_given_user_provided_then_csv_does_not_have_header_row(self) -> None:
+        assert not CsvHeaderUserProvided(column_names=["1", "2", "3"]).has_header_row()
+
+    def test_given_autogenerated_then_csv_does_not_have_header_row(self) -> None:
+        assert not CsvHeaderAutogenerated().has_header_row()
+
+    def test_given_from_csv_then_csv_has_header_row(self) -> None:
+        assert CsvHeaderFromCsv().has_header_row()
--- a/unit_tests/sources/file_based/file_types/test_csv_parser.py
+++ b/unit_tests/sources/file_based/file_types/test_csv_parser.py
@@ -13,7 +13,14 @@ from unittest import TestCase, mock
 from unittest.mock import Mock
 
 import pytest
-from airbyte_cdk.sources.file_based.config.csv_format import DEFAULT_FALSE_VALUES, DEFAULT_TRUE_VALUES, CsvFormat, InferenceType
+from airbyte_cdk.sources.file_based.config.csv_format import (
+    DEFAULT_FALSE_VALUES,
+    DEFAULT_TRUE_VALUES,
+    CsvFormat,
+    CsvHeaderAutogenerated,
+    CsvHeaderUserProvided,
+    InferenceType,
+)
 from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
 from airbyte_cdk.sources.file_based.exceptions import RecordParseError
 from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
@@ -278,13 +285,28 @@ class CsvReaderTest(unittest.TestCase):
         assert list(data_generator) == [{"header": "a value"}, {"header": "another value"}]
 
     def test_given_autogenerated_headers_when_read_data_then_generate_headers_with_format_fX(self) -> None:
-        self._config_format.autogenerate_column_names = True
+        self._config_format.header_definition = CsvHeaderAutogenerated()
         self._stream_reader.open_file.return_value = CsvFileBuilder().with_data(["0,1,2,3,4,5,6"]).build()
 
         data_generator = self._read_data()
 
         assert list(data_generator) == [{"f0": "0", "f1": "1", "f2": "2", "f3": "3", "f4": "4", "f5": "5", "f6": "6"}]
 
+    def test_given_user_provided_headers_when_read_data_then_use_user_provided_headers(self) -> None:
+        self._config_format.header_definition = CsvHeaderUserProvided(column_names=["first", "second", "third", "fourth"])
+        self._stream_reader.open_file.return_value = CsvFileBuilder().with_data(["0,1,2,3"]).build()
+
+        data_generator = self._read_data()
+
+        assert list(data_generator) == [{"first": "0", "second": "1", "third": "2", "fourth": "3"}]
+
+    def test_given_len_mistmatch_on_user_provided_headers_when_read_data_then_raise_error(self) -> None:
+        self._config_format.header_definition = CsvHeaderUserProvided(column_names=["missing", "one", "column"])
+        self._stream_reader.open_file.return_value = CsvFileBuilder().with_data(["0,1,2,3"]).build()
+
+        with pytest.raises(RecordParseError):
+            list(self._read_data())
+
     def test_given_skip_rows_after_header_when_read_data_then_do_not_parse_skipped_rows(self) -> None:
         self._config_format.skip_rows_after_header = 1
         self._stream_reader.open_file.return_value = (
--- a/unit_tests/sources/file_based/scenarios/csv_scenarios.py
+++ b/unit_tests/sources/file_based/scenarios/csv_scenarios.py
@@ -180,11 +180,43 @@ single_csv_scenario = (
             "default": 0,
             "type": "integer",
         },
-        "autogenerate_column_names": {
-            "title": "Autogenerate Column Names",
-            "description": "Whether to autogenerate column names if column_names is empty. If true, column names will be of the form \u201cf0\u201d, \u201cf1\u201d\u2026 If false, column names will be read from the first CSV row after skip_rows_before_header.",
-            "default": False,
-            "type": "boolean",
+        "header_definition": {
+            "title": "CSV Header Definition",
+            "type": "object",
+            "description": "How headers will be defined. `User Provided` assumes the CSV does not have a header row and uses the headers provided and `Autogenerated` assumes the CSV does not have a header row and the CDK will generate headers using for `f{i}` where `i` is the index starting from 0. Else, the default behavior is to use the header from the CSV file. If a user wants to autogenerate or provide column names for a CSV having headers, they can skip rows.",
+            "default": {"header_definition_type": "From CSV"},
+            "oneOf": [
+                {
+                    "title": "From CSV",
+                    "type": "object",
+                    "properties": {
+                        "header_definition_type": {"title": "Header Definition Type", "default": "From CSV", "enum": ["From CSV"], "type": "string"},
+                    },
+                },
+                {
+                    "title": "Autogenerated",
+                    "type": "object",
+                    "properties": {
+                        "header_definition_type": {"title": "Header Definition Type", "default": "Autogenerated", "enum": ["Autogenerated"], "type": "string"},
+                    },
+                },
+                {
+                    "title": "User Provided",
+                    "type": "object",
+                    "properties": {
+                        "header_definition_type": {"title": "Header Definition Type", "default": "User Provided", "enum": ["User Provided"], "type": "string"},
+                        "column_names": {
+                            "title": "Column Names",
+                            "description": "The column names that will be used while emitting the CSV records",
+                            "type": "array",
+                            "items": {
+                                "type": "string"
+                            },
+                        }
+                    },
+                    "required": ["column_names"]
+                },
+            ]
         },
         "true_values": {
             "title": "True Values",
@@ -761,7 +793,6 @@ csv_multi_stream_scenario = (
     )
 ).build()
 
-
 csv_custom_format_scenario = (
     TestScenarioBuilder()
     .set_name("csv_custom_format")
@@ -868,7 +899,6 @@ csv_custom_format_scenario = (
     )
 ).build()
 
-
 multi_stream_custom_format = (
     TestScenarioBuilder()
     .set_name("multi_stream_custom_format_scenario")
@@ -1016,7 +1046,6 @@ multi_stream_custom_format = (
     )
 ).build()
 
-
 empty_schema_inference_scenario = (
     TestScenarioBuilder()
     .set_name("empty_schema_inference_scenario")
@@ -1092,7 +1121,6 @@ empty_schema_inference_scenario = (
     )
 ).build()
 
-
 schemaless_csv_scenario = (
     TestScenarioBuilder()
     .set_name("schemaless_csv_scenario")
@@ -1188,7 +1216,6 @@ schemaless_csv_scenario = (
     )
 ).build()
 
-
 schemaless_csv_multi_stream_scenario = (
     TestScenarioBuilder()
     .set_name("schemaless_csv_multi_stream_scenario")
@@ -1296,7 +1323,6 @@ schemaless_csv_multi_stream_scenario = (
     )
 ).build()
 
-
 schemaless_with_user_input_schema_fails_connection_check_scenario = (
     TestScenarioBuilder()
     .set_name("schemaless_with_user_input_schema_fails_connection_check_scenario")
@@ -1361,7 +1387,6 @@ schemaless_with_user_input_schema_fails_connection_check_scenario = (
     .set_expected_read_error(ConfigValidationError, FileBasedSourceError.CONFIG_VALIDATION_ERROR.value)
 ).build()
 
-
 schemaless_with_user_input_schema_fails_connection_check_multi_stream_scenario = (
     TestScenarioBuilder()
     .set_name("schemaless_with_user_input_schema_fails_connection_check_multi_stream_scenario")
@@ -1446,7 +1471,6 @@ schemaless_with_user_input_schema_fails_connection_check_multi_stream_scenario =
     .set_expected_read_error(ConfigValidationError, FileBasedSourceError.CONFIG_VALIDATION_ERROR.value)
 ).build()
 
-
 csv_string_can_be_null_with_input_schemas_scenario = (
     TestScenarioBuilder()
     .set_name("csv_string_can_be_null_with_input_schema")
@@ -2143,7 +2167,6 @@ csv_custom_delimiter_in_double_quotes_scenario = (
     )
 ).build()
 
-
 csv_skip_before_header_scenario = (
     TestScenarioBuilder()
     .set_name("csv_skip_before_header")
@@ -2278,7 +2301,6 @@ csv_skip_after_header_scenario = (
     )
 ).build()
 
-
 csv_skip_before_and_after_header_scenario = (
     TestScenarioBuilder()
     .set_name("csv_skip_before_after_header")
@@ -2363,7 +2385,7 @@ csv_autogenerate_column_names_scenario = (
                     "validation_policy": "Emit Record",
                     "format": {
                         "filetype": "csv",
-                        "autogenerate_column_names": True,
+                        "header_definition": {"header_definition_type": "Autogenerated"},
                     },
                 }
             ],
@@ -2556,7 +2578,6 @@ csv_custom_null_values_scenario = (
     )
 ).build()
 
-
 earlier_csv_scenario = (
     TestScenarioBuilder()
     .set_name("earlier_csv_stream")