airbyte-cdk 0.51.6__py3-none-any.whl → 0.51.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
--- a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py
+++ b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py
@@ -1,7 +1,3 @@
-#
-# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
-#
-
 # generated by datamodel-codegen:
 # filename: declarative_component_schema.yaml
 
--- a/airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py
+++ b/airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py
@@ -66,9 +66,7 @@ class AbstractFileBasedSpec(BaseModel):
         for format in objects_to_check["oneOf"]:
             for key in format["properties"]:
                 object_property = format["properties"][key]
-                if "allOf" in object_property and "enum" in object_property["allOf"][0]:
-                    object_property["enum"] = object_property["allOf"][0]["enum"]
-                    object_property.pop("allOf")
+                AbstractFileBasedSpec.move_enum_to_root(object_property)
 
         properties_to_change = ["validation_policy"]
         for property_to_change in properties_to_change:
@@ -76,7 +74,24 @@ class AbstractFileBasedSpec(BaseModel):
             if "anyOf" in property_object:
                 schema["properties"]["streams"]["items"]["properties"][property_to_change]["type"] = "object"
                 schema["properties"]["streams"]["items"]["properties"][property_to_change]["oneOf"] = property_object.pop("anyOf")
-            if "allOf" in property_object and "enum" in property_object["allOf"][0]:
-                property_object["enum"] = property_object["allOf"][0]["enum"]
-                property_object.pop("allOf")
+            AbstractFileBasedSpec.move_enum_to_root(property_object)
+
+        csv_format_schemas = list(
+            filter(
+                lambda format: format["properties"]["filetype"]["default"] == "csv",
+                schema["properties"]["streams"]["items"]["properties"]["format"]["oneOf"],
+            )
+        )
+        if len(csv_format_schemas) != 1:
+            raise ValueError(f"Expecting only one CSV format but got {csv_format_schemas}")
+        csv_format_schemas[0]["properties"]["header_definition"]["oneOf"] = csv_format_schemas[0]["properties"]["header_definition"].pop(
+            "anyOf", []
+        )
+        csv_format_schemas[0]["properties"]["header_definition"]["type"] = "object"
         return schema
+
+    @staticmethod
+    def move_enum_to_root(object_property: Dict[str, Any]) -> None:
+        if "allOf" in object_property and "enum" in object_property["allOf"][0]:
+            object_property["enum"] = object_property["allOf"][0]["enum"]
+            object_property.pop("allOf")
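
For reference, a minimal sketch of what the extracted move_enum_to_root helper does to a single property schema. The input dict below is illustrative, not taken from the package:

    # Illustrative input; real schemas come from pydantic's generated spec.
    object_property = {
        "title": "Validation Policy",
        "allOf": [{"enum": ["Emit Record", "Skip Record"]}],
    }

    # Same transformation as AbstractFileBasedSpec.move_enum_to_root above:
    if "allOf" in object_property and "enum" in object_property["allOf"][0]:
        object_property["enum"] = object_property["allOf"][0]["enum"]
        object_property.pop("allOf")

    assert object_property == {"title": "Validation Policy", "enum": ["Emit Record", "Skip Record"]}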
--- a/airbyte_cdk/sources/file_based/config/csv_format.py
+++ b/airbyte_cdk/sources/file_based/config/csv_format.py
@@ -4,9 +4,9 @@
 
 import codecs
 from enum import Enum
-from typing import Optional, Set
+from typing import Any, Dict, List, Optional, Set, Union
 
-from pydantic import BaseModel, Field, validator
+from pydantic import BaseModel, Field, ValidationError, root_validator, validator
 from typing_extensions import Literal
 
 
@@ -15,6 +15,52 @@ class InferenceType(Enum):
     PRIMITIVE_TYPES_ONLY = "Primitive Types Only"
 
 
+class CsvHeaderDefinitionType(Enum):
+    FROM_CSV = "From CSV"
+    AUTOGENERATED = "Autogenerated"
+    USER_PROVIDED = "User Provided"
+
+
+class CsvHeaderFromCsv(BaseModel):
+    class Config:
+        title = "From CSV"
+
+    header_definition_type: Literal[CsvHeaderDefinitionType.FROM_CSV.value] = CsvHeaderDefinitionType.FROM_CSV.value  # type: ignore
+
+    def has_header_row(self) -> bool:
+        return True
+
+
+class CsvHeaderAutogenerated(BaseModel):
+    class Config:
+        title = "Autogenerated"
+
+    header_definition_type: Literal[CsvHeaderDefinitionType.AUTOGENERATED.value] = CsvHeaderDefinitionType.AUTOGENERATED.value  # type: ignore
+
+    def has_header_row(self) -> bool:
+        return False
+
+
+class CsvHeaderUserProvided(BaseModel):
+    class Config:
+        title = "User Provided"
+
+    header_definition_type: Literal[CsvHeaderDefinitionType.USER_PROVIDED.value] = CsvHeaderDefinitionType.USER_PROVIDED.value  # type: ignore
+    column_names: List[str] = Field(
+        title="Column Names",
+        description="The column names that will be used while emitting the CSV records",
+    )
+
+    def has_header_row(self) -> bool:
+        return False
+
+    @validator("column_names")
+    def validate_column_names(cls, v: List[str]) -> List[str]:
+        if not v:
+            raise ValueError("At least one column name needs to be provided when using user provided headers")
+        return v
+
+
 DEFAULT_TRUE_VALUES = ["y", "yes", "t", "true", "on", "1"]
 DEFAULT_FALSE_VALUES = ["n", "no", "f", "false", "off", "0"]
 
@@ -64,10 +110,10 @@ class CsvFormat(BaseModel):
     skip_rows_after_header: int = Field(
         title="Skip Rows After Header", default=0, description="The number of rows to skip after the header row."
     )
-    autogenerate_column_names: bool = Field(
-        title="Autogenerate Column Names",
-        default=False,
-        description="Whether to autogenerate column names if column_names is empty. If true, column names will be of the form “f0”, “f1”… If false, column names will be read from the first CSV row after skip_rows_before_header.",
+    header_definition: Union[CsvHeaderFromCsv, CsvHeaderAutogenerated, CsvHeaderUserProvided] = Field(
+        title="CSV Header Definition",
+        default=CsvHeaderFromCsv(),
+        description="How headers will be defined. `User Provided` assumes the CSV does not have a header row and uses the headers provided, and `Autogenerated` assumes the CSV does not have a header row and the CDK will generate headers using `f{i}`, where `i` is the index starting from 0. Otherwise, the default behavior is to use the header from the CSV file. If a user wants to autogenerate or provide column names for a CSV having headers, they can skip rows.",
     )
     true_values: Set[str] = Field(
         title="True Values",
@@ -113,3 +159,15 @@ class CsvFormat(BaseModel):
         except LookupError:
             raise ValueError(f"invalid encoding format: {v}")
         return v
+
+    @root_validator
+    def validate_optional_args(cls, values: Dict[str, Any]) -> Dict[str, Any]:
+        definition_type = values.get("header_definition_type")
+        column_names = values.get("user_provided_column_names")
+        if definition_type == CsvHeaderDefinitionType.USER_PROVIDED and not column_names:
+            raise ValidationError("`user_provided_column_names` should be defined if the definition is 'User Provided'.", model=CsvFormat)
+        if definition_type != CsvHeaderDefinitionType.USER_PROVIDED and column_names:
+            raise ValidationError(
+                "`user_provided_column_names` should not be defined if the definition is not 'User Provided'.", model=CsvFormat
+            )
+        return values
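
A short usage sketch of the new field, assuming only the classes defined above and that the remaining CsvFormat fields keep their defaults; pydantic selects the matching Union member from header_definition_type:

    from airbyte_cdk.sources.file_based.config.csv_format import (
        CsvFormat,
        CsvHeaderAutogenerated,
        CsvHeaderFromCsv,
        CsvHeaderUserProvided,
    )

    # Default: headers are read from the first row of the CSV file.
    assert isinstance(CsvFormat().header_definition, CsvHeaderFromCsv)

    # Autogenerated: the CSV has no header row; the CDK synthesizes f0, f1, ...
    assert not CsvFormat(header_definition=CsvHeaderAutogenerated()).header_definition.has_header_row()

    # User provided: the CSV has no header row; names come from the config.
    fmt = CsvFormat(header_definition=CsvHeaderUserProvided(column_names=["id", "name"]))
    assert fmt.header_definition.column_names == ["id", "name"]

    # An empty list fails CsvHeaderUserProvided's column_names validator:
    # CsvHeaderUserProvided(column_names=[])  # raises pydantic.ValidationError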
--- a/airbyte_cdk/sources/file_based/file_types/csv_parser.py
+++ b/airbyte_cdk/sources/file_based/file_types/csv_parser.py
@@ -11,7 +11,7 @@ from functools import partial
 from io import IOBase
 from typing import Any, Callable, Dict, Generator, Iterable, List, Mapping, Optional, Set
 
-from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat, InferenceType
+from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat, CsvHeaderAutogenerated, CsvHeaderUserProvided, InferenceType
 from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
 from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError
 from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
@@ -48,11 +48,9 @@ class _CsvReader:
         with stream_reader.open_file(file, file_read_mode, config_format.encoding, logger) as fp:
             headers = self._get_headers(fp, config_format, dialect_name)
 
-            # we assume that if we autogenerate columns, it is because we don't have headers
-            # if a user wants to autogenerate_column_names with a CSV having headers, he can skip rows
             rows_to_skip = (
                 config_format.skip_rows_before_header
-                + (0 if config_format.autogenerate_column_names else 1)
+                + (1 if config_format.header_definition.has_header_row() else 0)
                 + config_format.skip_rows_after_header
             )
             self._skip_rows(fp, rows_to_skip)
@@ -74,8 +72,11 @@ class _CsvReader:
         Assumes the fp is pointing to the beginning of the files and will reset it as such
         """
         # Note that this method assumes the dialect has already been registered if we're parsing the headers
+        if isinstance(config_format.header_definition, CsvHeaderUserProvided):
+            return config_format.header_definition.column_names  # type: ignore  # should be CsvHeaderUserProvided given the type
+
         self._skip_rows(fp, config_format.skip_rows_before_header)
-        if config_format.autogenerate_column_names:
+        if isinstance(config_format.header_definition, CsvHeaderAutogenerated):
             headers = self._auto_generate_headers(fp, dialect_name)
         else:
             # Then read the header
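
The net effect on row skipping, as a standalone sketch; this mirrors the arithmetic in the hunk above rather than reproducing _CsvReader:

    from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat, CsvHeaderAutogenerated

    def rows_to_skip(config_format: CsvFormat) -> int:
        # Mirrors the hunk above: the header row is only consumed from the
        # file when the header definition says one actually exists there.
        return (
            config_format.skip_rows_before_header
            + (1 if config_format.header_definition.has_header_row() else 0)
            + config_format.skip_rows_after_header
        )

    with_header = CsvFormat(skip_rows_before_header=2, skip_rows_after_header=1)
    assert rows_to_skip(with_header) == 4  # 2 + the header row itself + 1

    no_header = CsvFormat(skip_rows_before_header=2, skip_rows_after_header=1, header_definition=CsvHeaderAutogenerated())
    assert rows_to_skip(no_header) == 3  # 2 + 1, no header row to consume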
--- a/airbyte_cdk-0.51.6.dist-info/METADATA
+++ b/airbyte_cdk-0.51.7.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: airbyte-cdk
-Version: 0.51.6
+Version: 0.51.7
 Summary: A framework for writing Airbyte Connectors.
 Home-page: https://github.com/airbytehq/airbyte
 Author: Airbyte
--- a/airbyte_cdk-0.51.6.dist-info/RECORD
+++ b/airbyte_cdk-0.51.7.dist-info/RECORD
@@ -64,7 +64,7 @@ airbyte_cdk/sources/declarative/interpolation/interpolation.py,sha256=dyIM-bzh54
 airbyte_cdk/sources/declarative/interpolation/jinja.py,sha256=Dc0F87nElWsz_Ikj938eQ9uqZvyqgFhZ8Dqf_-hvndc,4800
 airbyte_cdk/sources/declarative/interpolation/macros.py,sha256=V6WGKJ9cXX1rjuM4bK3Cs9xEryMlkY2U3FMsSBhrgC8,3098
 airbyte_cdk/sources/declarative/models/__init__.py,sha256=EiYnzwCHZV7EYqMJqcy6xKSeHvTKZBsQndjbEwmiTW4,93
-airbyte_cdk/sources/declarative/models/declarative_component_schema.py,sha256=-Y8Nw3-6ZtvsATixMSAWteHCvYQU965dn4NpVq6aWYs,57232
+airbyte_cdk/sources/declarative/models/declarative_component_schema.py,sha256=ebor38wlQVqYD2QXk5X8v9xDZl0cEpIc2mFaKvpuiPE,57170
 airbyte_cdk/sources/declarative/parsers/__init__.py,sha256=ZnqYNxHsKCgO38IwB34RQyRMXTs4GTvlRi3ImKnIioo,61
 airbyte_cdk/sources/declarative/parsers/class_types_registry.py,sha256=bK4a74opm6WHyV7HqOVws6GE5Z7cLNc5MaTha69abIQ,6086
 airbyte_cdk/sources/declarative/parsers/custom_exceptions.py,sha256=y7_G5mM07zxT5YG975kdC2PAja-Uc83pYp8WrV3GNdo,522
@@ -146,9 +146,9 @@ airbyte_cdk/sources/file_based/availability_strategy/__init__.py,sha256=WiPPuQGf
 airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py,sha256=NeHCiG4FFohzYpQQFfmTL4-5oI0nElHWgXX1xrm8-SU,1269
 airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py,sha256=InGBlGbInuNUQ6oaK5A9oICVc7ZNHMSYo8g5Vy2smOo,4266
 airbyte_cdk/sources/file_based/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py,sha256=P-CupGlif5XbKm6tc3FVC4WRMU4ogUbB3klcuZmZJ1k,3940
+airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py,sha256=WrV4sKtJoZ1dK31HK7NdBKlnYHkmu6NqjmEpkVqJ6tQ,4582
 airbyte_cdk/sources/file_based/config/avro_format.py,sha256=qGBB0RTjWDGZW-ilIwIq9OZl2BC-jBaq2WGrI3WVBsQ,597
-airbyte_cdk/sources/file_based/config/csv_format.py,sha256=lLyjOqp2gNrXcGtSWozheMMfUQcy0NBUAMWwmDr_B7A,4672
+airbyte_cdk/sources/file_based/config/csv_format.py,sha256=-r-uGQlo-nXfhPuOR05XtYx_1vht74r8_am2_p8mcP8,7166
 airbyte_cdk/sources/file_based/config/file_based_stream_config.py,sha256=5R0UlPJUGGx5OnpezZ0Fd8dyO4y2vMZtiPZR_3rfvSk,5916
 airbyte_cdk/sources/file_based/config/jsonl_format.py,sha256=B-s1uy9RiKpKMwmMlR7UT3WeQPlTI-xclD0fVM4IU1Q,254
 airbyte_cdk/sources/file_based/config/parquet_format.py,sha256=zvcHATNKoBIgU2UXuGnoldqLoRXG_X8ZzAkpqGPJtq4,625
@@ -157,7 +157,7 @@ airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py,sha
 airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py,sha256=Mx3zT9Dem4uNfaUT0oOtrESsuB1LrGAi5N-uw2swZZA,701
 airbyte_cdk/sources/file_based/file_types/__init__.py,sha256=N3a8cjXwRUN2__46IJTwrWlsyFiSA1xtSgPcPH28sn0,476
 airbyte_cdk/sources/file_based/file_types/avro_parser.py,sha256=7PVaW17wn80HYW1mu074X2dy0UgFoqFqGIOKN2ZMKD0,8686
-airbyte_cdk/sources/file_based/file_types/csv_parser.py,sha256=4YdsyH7ntoadhRqMXPl8rertWsQbtE0aJihp9V5zIlg,16586
+airbyte_cdk/sources/file_based/file_types/csv_parser.py,sha256=KgdpdkHAFducvXM2jQr356M0WVol-vX0cm42n9Kf_Yc,16684
 airbyte_cdk/sources/file_based/file_types/file_type_parser.py,sha256=cThTLc1YKSAapOn70lB09SzruRIPSShGIMz1f92QYV8,1555
 airbyte_cdk/sources/file_based/file_types/jsonl_parser.py,sha256=Kz6HLF0CrFHQ1Y6rJKGr7KmBWSLeDYFQmkg0WIi7Frg,5395
 airbyte_cdk/sources/file_based/file_types/parquet_parser.py,sha256=QulQ_soGb1LpQ_KTxqWZjmfACGkTUDUOeuSmNFtcSLk,8717
@@ -314,16 +314,17 @@ unit_tests/sources/file_based/availability_strategy/__init__.py,sha256=47DEQpj8H
 unit_tests/sources/file_based/availability_strategy/test_default_file_based_availability_strategy.py,sha256=HzxFJVJFv3YpjVmJm45ZyS2HpbnhtEX2hm4r8VjkRFE,2463
 unit_tests/sources/file_based/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unit_tests/sources/file_based/config/test_abstract_file_based_spec.py,sha256=wmZAC-nBiUedMZi0n4zaC9oiZD9UTuYP5zJC1xxRnME,1216
+unit_tests/sources/file_based/config/test_csv_format.py,sha256=VYL-9Ec8hW_yO2Pj9F8pDfITcgEAFtSublYda7ut7QE,1132
 unit_tests/sources/file_based/config/test_file_based_stream_config.py,sha256=1eMsHlMQIFwyw20HjnhgKuiw6399sMcLTQ4LP09kTT4,3060
 unit_tests/sources/file_based/file_types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unit_tests/sources/file_based/file_types/test_avro_parser.py,sha256=INqwKXcgNb3h_tktNXYU6WNUD-iNwRYHCd3IrnQa5R4,11051
-unit_tests/sources/file_based/file_types/test_csv_parser.py,sha256=KB4WDy3aMAZ0CmJiqFaTUOZlK4urpvG9bwcwQ-h2-VY,20303
+unit_tests/sources/file_based/file_types/test_csv_parser.py,sha256=4onvErJCMNeSquZr7c1dX4TzqJlvQ3wulYCjAU_IblU,21266
 unit_tests/sources/file_based/file_types/test_jsonl_parser.py,sha256=foTf9U9LyAS8OR0BonwNgFWPqTrmzFV2lpPUfRMrioE,6134
 unit_tests/sources/file_based/file_types/test_parquet_parser.py,sha256=D7sKTty8aEqMDWWGKWUqDbWjTxhGkygU7ns4-_JceRY,13543
 unit_tests/sources/file_based/scenarios/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unit_tests/sources/file_based/scenarios/avro_scenarios.py,sha256=xUfw0crAvSTgQ2-chJx2ZiigQyo5IfrCuOFC1TWXXsQ,29795
 unit_tests/sources/file_based/scenarios/check_scenarios.py,sha256=xZf28rlH93ap8JCkAjwocng-uAW-mvMx6BDOLbvVCig,5588
-unit_tests/sources/file_based/scenarios/csv_scenarios.py,sha256=UVdDblKj3R5qQkh-dj4xqZ2822GyJuymaAerWbX9HeE,95707
+unit_tests/sources/file_based/scenarios/csv_scenarios.py,sha256=nG4O2Ah0Uwgjg6SVTuioO_gPOigKxm-PlM2Tw21svYw,98724
 unit_tests/sources/file_based/scenarios/incremental_scenarios.py,sha256=0maHng11cFmvzFLOniyBxOEYoKj4DYR3NO9-pSYoFLs,60710
 unit_tests/sources/file_based/scenarios/jsonl_scenarios.py,sha256=N83fga4gMKkbm6hYnen1Z5p5eEgjnMB_M_sXx6B96cU,27503
 unit_tests/sources/file_based/scenarios/parquet_scenarios.py,sha256=-cBO1ZwberBxNMqDOtKz8yGwm3zB7elz_st2NKHeczM,26955
@@ -354,8 +355,8 @@ unit_tests/utils/test_schema_inferrer.py,sha256=Z2jHBZ540wnYkylIdV_2xr75Vtwlxuyg
 unit_tests/utils/test_secret_utils.py,sha256=XKe0f1RHYii8iwE6ATmBr5JGDI1pzzrnZUGdUSMJQP4,4886
 unit_tests/utils/test_stream_status_utils.py,sha256=NpV155JMXA6CG-2Zvofa14lItobyh3Onttc59X4m5DI,3382
 unit_tests/utils/test_traced_exception.py,sha256=bDFP5zMBizFenz6V2WvEZTRCKGB5ijh3DBezjbfoYIs,4198
-airbyte_cdk-0.51.6.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
-airbyte_cdk-0.51.6.dist-info/METADATA,sha256=6H-ZFSOZ3NomomXDhz87qdpIVMRJ1Abd2x61R9cHV0Y,9399
-airbyte_cdk-0.51.6.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
-airbyte_cdk-0.51.6.dist-info/top_level.txt,sha256=edvsDKTnE6sD2wfCUaeTfKf5gQIL6CPVMwVL2sWZzqo,51
-airbyte_cdk-0.51.6.dist-info/RECORD,,
+airbyte_cdk-0.51.7.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
+airbyte_cdk-0.51.7.dist-info/METADATA,sha256=YOrAlHsZod4Nq3VugY7nbE7MDd8r8ZU7gcvX4YzTuk0,9399
+airbyte_cdk-0.51.7.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
+airbyte_cdk-0.51.7.dist-info/top_level.txt,sha256=edvsDKTnE6sD2wfCUaeTfKf5gQIL6CPVMwVL2sWZzqo,51
+airbyte_cdk-0.51.7.dist-info/RECORD,,
--- /dev/null
+++ b/unit_tests/sources/file_based/config/test_csv_format.py
@@ -0,0 +1,28 @@
+#
+# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
+#
+
+import unittest
+
+import pytest
+from airbyte_cdk.sources.file_based.config.csv_format import CsvHeaderAutogenerated, CsvHeaderFromCsv, CsvHeaderUserProvided
+from pydantic import ValidationError
+
+
+class CsvHeaderDefinitionTest(unittest.TestCase):
+    def test_given_user_provided_and_not_column_names_provided_then_raise_exception(self) -> None:
+        with pytest.raises(ValidationError):
+            CsvHeaderUserProvided(column_names=[])
+
+    def test_given_user_provided_and_column_names_then_config_is_valid(self) -> None:
+        # no error means that this test succeeds
+        CsvHeaderUserProvided(column_names=["1", "2", "3"])
+
+    def test_given_user_provided_then_csv_does_not_have_header_row(self) -> None:
+        assert not CsvHeaderUserProvided(column_names=["1", "2", "3"]).has_header_row()
+
+    def test_given_autogenerated_then_csv_does_not_have_header_row(self) -> None:
+        assert not CsvHeaderAutogenerated().has_header_row()
+
+    def test_given_from_csv_then_csv_has_header_row(self) -> None:
+        assert CsvHeaderFromCsv().has_header_row()
--- a/unit_tests/sources/file_based/file_types/test_csv_parser.py
+++ b/unit_tests/sources/file_based/file_types/test_csv_parser.py
@@ -13,7 +13,14 @@ from unittest import TestCase, mock
 from unittest.mock import Mock
 
 import pytest
-from airbyte_cdk.sources.file_based.config.csv_format import DEFAULT_FALSE_VALUES, DEFAULT_TRUE_VALUES, CsvFormat, InferenceType
+from airbyte_cdk.sources.file_based.config.csv_format import (
+    DEFAULT_FALSE_VALUES,
+    DEFAULT_TRUE_VALUES,
+    CsvFormat,
+    CsvHeaderAutogenerated,
+    CsvHeaderUserProvided,
+    InferenceType,
+)
 from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
 from airbyte_cdk.sources.file_based.exceptions import RecordParseError
 from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
@@ -278,13 +285,28 @@ class CsvReaderTest(unittest.TestCase):
         assert list(data_generator) == [{"header": "a value"}, {"header": "another value"}]
 
     def test_given_autogenerated_headers_when_read_data_then_generate_headers_with_format_fX(self) -> None:
-        self._config_format.autogenerate_column_names = True
+        self._config_format.header_definition = CsvHeaderAutogenerated()
         self._stream_reader.open_file.return_value = CsvFileBuilder().with_data(["0,1,2,3,4,5,6"]).build()
 
         data_generator = self._read_data()
 
         assert list(data_generator) == [{"f0": "0", "f1": "1", "f2": "2", "f3": "3", "f4": "4", "f5": "5", "f6": "6"}]
 
+    def test_given_user_provided_headers_when_read_data_then_use_user_provided_headers(self) -> None:
+        self._config_format.header_definition = CsvHeaderUserProvided(column_names=["first", "second", "third", "fourth"])
+        self._stream_reader.open_file.return_value = CsvFileBuilder().with_data(["0,1,2,3"]).build()
+
+        data_generator = self._read_data()
+
+        assert list(data_generator) == [{"first": "0", "second": "1", "third": "2", "fourth": "3"}]
+
+    def test_given_len_mistmatch_on_user_provided_headers_when_read_data_then_raise_error(self) -> None:
+        self._config_format.header_definition = CsvHeaderUserProvided(column_names=["missing", "one", "column"])
+        self._stream_reader.open_file.return_value = CsvFileBuilder().with_data(["0,1,2,3"]).build()
+
+        with pytest.raises(RecordParseError):
+            list(self._read_data())
+
     def test_given_skip_rows_after_header_when_read_data_then_do_not_parse_skipped_rows(self) -> None:
         self._config_format.skip_rows_after_header = 1
         self._stream_reader.open_file.return_value = (
--- a/unit_tests/sources/file_based/scenarios/csv_scenarios.py
+++ b/unit_tests/sources/file_based/scenarios/csv_scenarios.py
@@ -180,11 +180,43 @@ single_csv_scenario = (
         "default": 0,
         "type": "integer",
     },
-    "autogenerate_column_names": {
-        "title": "Autogenerate Column Names",
-        "description": "Whether to autogenerate column names if column_names is empty. If true, column names will be of the form \u201cf0\u201d, \u201cf1\u201d\u2026 If false, column names will be read from the first CSV row after skip_rows_before_header.",
-        "default": False,
-        "type": "boolean",
+    "header_definition": {
+        "title": "CSV Header Definition",
+        "type": "object",
+        "description": "How headers will be defined. `User Provided` assumes the CSV does not have a header row and uses the headers provided, and `Autogenerated` assumes the CSV does not have a header row and the CDK will generate headers using `f{i}`, where `i` is the index starting from 0. Otherwise, the default behavior is to use the header from the CSV file. If a user wants to autogenerate or provide column names for a CSV having headers, they can skip rows.",
+        "default": {"header_definition_type": "From CSV"},
+        "oneOf": [
+            {
+                "title": "From CSV",
+                "type": "object",
+                "properties": {
+                    "header_definition_type": {"title": "Header Definition Type", "default": "From CSV", "enum": ["From CSV"], "type": "string"},
+                },
+            },
+            {
+                "title": "Autogenerated",
+                "type": "object",
+                "properties": {
+                    "header_definition_type": {"title": "Header Definition Type", "default": "Autogenerated", "enum": ["Autogenerated"], "type": "string"},
+                },
+            },
+            {
+                "title": "User Provided",
+                "type": "object",
+                "properties": {
+                    "header_definition_type": {"title": "Header Definition Type", "default": "User Provided", "enum": ["User Provided"], "type": "string"},
+                    "column_names": {
+                        "title": "Column Names",
+                        "description": "The column names that will be used while emitting the CSV records",
+                        "type": "array",
+                        "items": {
+                            "type": "string"
+                        },
+                    }
+                },
+                "required": ["column_names"]
+            },
+        ]
     },
     "true_values": {
         "title": "True Values",
@@ -761,7 +793,6 @@ csv_multi_stream_scenario = (
     )
 ).build()
 
-
 csv_custom_format_scenario = (
     TestScenarioBuilder()
     .set_name("csv_custom_format")
@@ -868,7 +899,6 @@ csv_custom_format_scenario = (
     )
 ).build()
 
-
 multi_stream_custom_format = (
     TestScenarioBuilder()
     .set_name("multi_stream_custom_format_scenario")
@@ -1016,7 +1046,6 @@ multi_stream_custom_format = (
     )
 ).build()
 
-
 empty_schema_inference_scenario = (
     TestScenarioBuilder()
     .set_name("empty_schema_inference_scenario")
@@ -1092,7 +1121,6 @@ empty_schema_inference_scenario = (
     )
 ).build()
 
-
 schemaless_csv_scenario = (
     TestScenarioBuilder()
     .set_name("schemaless_csv_scenario")
@@ -1188,7 +1216,6 @@ schemaless_csv_scenario = (
     )
 ).build()
 
-
 schemaless_csv_multi_stream_scenario = (
     TestScenarioBuilder()
     .set_name("schemaless_csv_multi_stream_scenario")
@@ -1296,7 +1323,6 @@ schemaless_csv_multi_stream_scenario = (
     )
 ).build()
 
-
 schemaless_with_user_input_schema_fails_connection_check_scenario = (
     TestScenarioBuilder()
     .set_name("schemaless_with_user_input_schema_fails_connection_check_scenario")
@@ -1361,7 +1387,6 @@ schemaless_with_user_input_schema_fails_connection_check_scenario = (
     .set_expected_read_error(ConfigValidationError, FileBasedSourceError.CONFIG_VALIDATION_ERROR.value)
 ).build()
 
-
 schemaless_with_user_input_schema_fails_connection_check_multi_stream_scenario = (
     TestScenarioBuilder()
     .set_name("schemaless_with_user_input_schema_fails_connection_check_multi_stream_scenario")
@@ -1446,7 +1471,6 @@ schemaless_with_user_input_schema_fails_connection_check_multi_stream_scenario =
     .set_expected_read_error(ConfigValidationError, FileBasedSourceError.CONFIG_VALIDATION_ERROR.value)
 ).build()
 
-
 csv_string_can_be_null_with_input_schemas_scenario = (
     TestScenarioBuilder()
     .set_name("csv_string_can_be_null_with_input_schema")
@@ -2143,7 +2167,6 @@ csv_custom_delimiter_in_double_quotes_scenario = (
     )
 ).build()
 
-
 csv_skip_before_header_scenario = (
     TestScenarioBuilder()
     .set_name("csv_skip_before_header")
@@ -2278,7 +2301,6 @@ csv_skip_after_header_scenario = (
     )
 ).build()
 
-
 csv_skip_before_and_after_header_scenario = (
     TestScenarioBuilder()
     .set_name("csv_skip_before_after_header")
@@ -2363,7 +2385,7 @@ csv_autogenerate_column_names_scenario = (
                     "validation_policy": "Emit Record",
                     "format": {
                         "filetype": "csv",
-                        "autogenerate_column_names": True,
+                        "header_definition": {"header_definition_type": "Autogenerated"},
                     },
                 }
             ],
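
For connector configs, this scenario shows the migration in miniature: the boolean flag becomes a discriminated object. A before/after sketch of the `format` block (the other two variants follow the oneOf schema shown earlier):

    # 0.51.6 style: boolean flag on the CSV format options.
    legacy_format = {
        "filetype": "csv",
        "autogenerate_column_names": True,
    }

    # 0.51.7 style: discriminated header_definition object. The remaining
    # variants are {"header_definition_type": "From CSV"} (the default) and
    # {"header_definition_type": "User Provided", "column_names": [...]}.
    new_format = {
        "filetype": "csv",
        "header_definition": {"header_definition_type": "Autogenerated"},
    }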
@@ -2556,7 +2578,6 @@ csv_custom_null_values_scenario = (
     )
 ).build()
 
-
earlier_csv_scenario = (
    TestScenarioBuilder()
    .set_name("earlier_csv_stream")