airbyte-cdk 0.51.5__py3-none-any.whl → 0.51.7__py3-none-any.whl

--- a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py
+++ b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py
@@ -1,7 +1,3 @@
-#
-# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
-#
-
 # generated by datamodel-codegen:
 # filename: declarative_component_schema.yaml
 
--- a/airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py
+++ b/airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py
@@ -55,7 +55,6 @@ class DefaultFileBasedAvailabilityStrategy(AbstractFileBasedAvailabilityStrategy
         """
         try:
             files = self._check_list_files(stream)
-            self._check_extensions(stream, files)
             self._check_parse_record(stream, files[0], logger)
         except CheckAvailabilityError:
             return False, "".join(traceback.format_exc())
@@ -73,11 +72,6 @@ class DefaultFileBasedAvailabilityStrategy(AbstractFileBasedAvailabilityStrategy
 
         return files
 
-    def _check_extensions(self, stream: "AbstractFileBasedStream", files: List[RemoteFile]) -> None:
-        if not all(f.extension_agrees_with_file_type(stream.config.file_type) for f in files):
-            raise CheckAvailabilityError(FileBasedSourceError.EXTENSION_MISMATCH, stream=stream.name)
-        return None
-
     def _check_parse_record(self, stream: "AbstractFileBasedStream", file: RemoteFile, logger: logging.Logger) -> None:
         parser = stream.get_parser(stream.config.file_type)
 
--- a/airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py
+++ b/airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py
@@ -66,9 +66,7 @@ class AbstractFileBasedSpec(BaseModel):
         for format in objects_to_check["oneOf"]:
             for key in format["properties"]:
                 object_property = format["properties"][key]
-                if "allOf" in object_property and "enum" in object_property["allOf"][0]:
-                    object_property["enum"] = object_property["allOf"][0]["enum"]
-                    object_property.pop("allOf")
+                AbstractFileBasedSpec.move_enum_to_root(object_property)
 
         properties_to_change = ["validation_policy"]
         for property_to_change in properties_to_change:
@@ -76,7 +74,24 @@ class AbstractFileBasedSpec(BaseModel):
             if "anyOf" in property_object:
                 schema["properties"]["streams"]["items"]["properties"][property_to_change]["type"] = "object"
                 schema["properties"]["streams"]["items"]["properties"][property_to_change]["oneOf"] = property_object.pop("anyOf")
-            if "allOf" in property_object and "enum" in property_object["allOf"][0]:
-                property_object["enum"] = property_object["allOf"][0]["enum"]
-                property_object.pop("allOf")
+            AbstractFileBasedSpec.move_enum_to_root(property_object)
+
+        csv_format_schemas = list(
+            filter(
+                lambda format: format["properties"]["filetype"]["default"] == "csv",
+                schema["properties"]["streams"]["items"]["properties"]["format"]["oneOf"],
+            )
+        )
+        if len(csv_format_schemas) != 1:
+            raise ValueError(f"Expecting only one CSV format but got {csv_format_schemas}")
+        csv_format_schemas[0]["properties"]["header_definition"]["oneOf"] = csv_format_schemas[0]["properties"]["header_definition"].pop(
+            "anyOf", []
+        )
+        csv_format_schemas[0]["properties"]["header_definition"]["type"] = "object"
         return schema
+
+    @staticmethod
+    def move_enum_to_root(object_property: Dict[str, Any]) -> None:
+        if "allOf" in object_property and "enum" in object_property["allOf"][0]:
+            object_property["enum"] = object_property["allOf"][0]["enum"]
+            object_property.pop("allOf")
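As a quick illustration of the hoisting the new `move_enum_to_root` helper performs, here is a sketch reusing the method's own logic; the sample property dict is made up:

    from typing import Any, Dict

    def move_enum_to_root(object_property: Dict[str, Any]) -> None:
        # pydantic renders an Enum-typed field as {"allOf": [{"enum": [...]}]};
        # hoisting the enum to the root gives form generators a flat property.
        if "allOf" in object_property and "enum" in object_property["allOf"][0]:
            object_property["enum"] = object_property["allOf"][0]["enum"]
            object_property.pop("allOf")

    prop = {"title": "Inference Type", "allOf": [{"enum": ["None", "Primitive Types Only"]}]}
    move_enum_to_root(prop)
    assert prop == {"title": "Inference Type", "enum": ["None", "Primitive Types Only"]}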
--- a/airbyte_cdk/sources/file_based/config/csv_format.py
+++ b/airbyte_cdk/sources/file_based/config/csv_format.py
@@ -4,9 +4,9 @@
 
 import codecs
 from enum import Enum
-from typing import Optional, Set
+from typing import Any, Dict, List, Optional, Set, Union
 
-from pydantic import BaseModel, Field, validator
+from pydantic import BaseModel, Field, ValidationError, root_validator, validator
 from typing_extensions import Literal
 
 
@@ -15,6 +15,52 @@ class InferenceType(Enum):
     PRIMITIVE_TYPES_ONLY = "Primitive Types Only"
 
 
+class CsvHeaderDefinitionType(Enum):
+    FROM_CSV = "From CSV"
+    AUTOGENERATED = "Autogenerated"
+    USER_PROVIDED = "User Provided"
+
+
+class CsvHeaderFromCsv(BaseModel):
+    class Config:
+        title = "From CSV"
+
+    header_definition_type: Literal[CsvHeaderDefinitionType.FROM_CSV.value] = CsvHeaderDefinitionType.FROM_CSV.value  # type: ignore
+
+    def has_header_row(self) -> bool:
+        return True
+
+
+class CsvHeaderAutogenerated(BaseModel):
+    class Config:
+        title = "Autogenerated"
+
+    header_definition_type: Literal[CsvHeaderDefinitionType.AUTOGENERATED.value] = CsvHeaderDefinitionType.AUTOGENERATED.value  # type: ignore
+
+    def has_header_row(self) -> bool:
+        return False
+
+
+class CsvHeaderUserProvided(BaseModel):
+    class Config:
+        title = "User Provided"
+
+    header_definition_type: Literal[CsvHeaderDefinitionType.USER_PROVIDED.value] = CsvHeaderDefinitionType.USER_PROVIDED.value  # type: ignore
+    column_names: List[str] = Field(
+        title="Column Names",
+        description="The column names that will be used while emitting the CSV records",
+    )
+
+    def has_header_row(self) -> bool:
+        return False
+
+    @validator("column_names")
+    def validate_column_names(cls, v: List[str]) -> List[str]:
+        if not v:
+            raise ValueError("At least one column name needs to be provided when using user provided headers")
+        return v
+
+
 DEFAULT_TRUE_VALUES = ["y", "yes", "t", "true", "on", "1"]
 DEFAULT_FALSE_VALUES = ["n", "no", "f", "false", "off", "0"]
 
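The behavioral difference between the three header-definition models is easiest to see side by side; this sketch mirrors the assertions in the new test_csv_format.py added later in this diff:

    from airbyte_cdk.sources.file_based.config.csv_format import (
        CsvHeaderAutogenerated,
        CsvHeaderFromCsv,
        CsvHeaderUserProvided,
    )

    # Only "From CSV" expects a header row inside the file itself.
    assert CsvHeaderFromCsv().has_header_row()
    assert not CsvHeaderAutogenerated().has_header_row()
    assert not CsvHeaderUserProvided(column_names=["id", "name"]).has_header_row()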
@@ -64,10 +110,10 @@ class CsvFormat(BaseModel):
     skip_rows_after_header: int = Field(
         title="Skip Rows After Header", default=0, description="The number of rows to skip after the header row."
     )
-    autogenerate_column_names: bool = Field(
-        title="Autogenerate Column Names",
-        default=False,
-        description="Whether to autogenerate column names if column_names is empty. If true, column names will be of the form “f0”, “f1”… If false, column names will be read from the first CSV row after skip_rows_before_header.",
+    header_definition: Union[CsvHeaderFromCsv, CsvHeaderAutogenerated, CsvHeaderUserProvided] = Field(
+        title="CSV Header Definition",
+        default=CsvHeaderFromCsv(),
+        description="How headers will be defined. `User Provided` assumes the CSV does not have a header row and uses the headers provided and `Autogenerated` assumes the CSV does not have a header row and the CDK will generate headers using for `f{i}` where `i` is the index starting from 0. Else, the default behavior is to use the header from the CSV file. If a user wants to autogenerate or provide column names for a CSV having headers, they can skip rows.",
     )
     true_values: Set[str] = Field(
         title="True Values",
@@ -113,3 +159,15 @@ class CsvFormat(BaseModel):
         except LookupError:
             raise ValueError(f"invalid encoding format: {v}")
         return v
+
+    @root_validator
+    def validate_optional_args(cls, values: Dict[str, Any]) -> Dict[str, Any]:
+        definition_type = values.get("header_definition_type")
+        column_names = values.get("user_provided_column_names")
+        if definition_type == CsvHeaderDefinitionType.USER_PROVIDED and not column_names:
+            raise ValidationError("`user_provided_column_names` should be defined if the definition 'User Provided'.", model=CsvFormat)
+        if definition_type != CsvHeaderDefinitionType.USER_PROVIDED and column_names:
+            raise ValidationError(
+                "`user_provided_column_names` should not be defined if the definition is not 'User Provided'.", model=CsvFormat
+            )
+        return values
--- a/airbyte_cdk/sources/file_based/exceptions.py
+++ b/airbyte_cdk/sources/file_based/exceptions.py
@@ -7,7 +7,6 @@ from enum import Enum
 
 class FileBasedSourceError(Enum):
     EMPTY_STREAM = "No files were identified in the stream. This may be because there are no files in the specified container, or because your glob patterns did not match any files. Please verify that your source contains files last modified after the start_date and that your glob patterns are not overly strict."
-    EXTENSION_MISMATCH = "The file type that you specified for this stream does not agree with the extension of one or more files in the stream. You may need to modify your glob patterns."
     GLOB_PARSE_ERROR = (
         "Error parsing glob pattern. Please refer to the glob pattern rules at https://facelessuser.github.io/wcmatch/glob/#split."
     )
--- a/airbyte_cdk/sources/file_based/file_types/csv_parser.py
+++ b/airbyte_cdk/sources/file_based/file_types/csv_parser.py
@@ -11,7 +11,7 @@ from functools import partial
 from io import IOBase
 from typing import Any, Callable, Dict, Generator, Iterable, List, Mapping, Optional, Set
 
-from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat, InferenceType
+from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat, CsvHeaderAutogenerated, CsvHeaderUserProvided, InferenceType
 from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
 from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError
 from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
@@ -48,11 +48,9 @@ class _CsvReader:
         with stream_reader.open_file(file, file_read_mode, config_format.encoding, logger) as fp:
             headers = self._get_headers(fp, config_format, dialect_name)
 
-            # we assume that if we autogenerate columns, it is because we don't have headers
-            # if a user wants to autogenerate_column_names with a CSV having headers, he can skip rows
             rows_to_skip = (
                 config_format.skip_rows_before_header
-                + (0 if config_format.autogenerate_column_names else 1)
+                + (1 if config_format.header_definition.has_header_row() else 0)
                 + config_format.skip_rows_after_header
             )
             self._skip_rows(fp, rows_to_skip)
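The row-skipping arithmetic is now driven by the header definition rather than a boolean flag. A minimal sketch of the same expression (the helper name and sample values are illustrative):

    def rows_to_skip(skip_before: int, has_header_row: bool, skip_after: int) -> int:
        # Mirrors the expression above: the header line itself is only consumed
        # when the header definition says the file actually contains one.
        return skip_before + (1 if has_header_row else 0) + skip_after

    assert rows_to_skip(2, True, 1) == 4   # "From CSV": the header row is skipped over
    assert rows_to_skip(2, False, 1) == 3  # "Autogenerated"/"User Provided": no header row in the file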
@@ -74,8 +72,11 @@ class _CsvReader:
         Assumes the fp is pointing to the beginning of the files and will reset it as such
         """
         # Note that this method assumes the dialect has already been registered if we're parsing the headers
+        if isinstance(config_format.header_definition, CsvHeaderUserProvided):
+            return config_format.header_definition.column_names  # type: ignore  # should be CsvHeaderUserProvided given the type
+
         self._skip_rows(fp, config_format.skip_rows_before_header)
-        if config_format.autogenerate_column_names:
+        if isinstance(config_format.header_definition, CsvHeaderAutogenerated):
             headers = self._auto_generate_headers(fp, dialect_name)
         else:
             # Then read the header
--- a/airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py
+++ b/airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py
@@ -6,7 +6,7 @@ from abc import abstractmethod
 from functools import cached_property, lru_cache
 from typing import Any, Dict, Iterable, List, Mapping, Optional
 
-from airbyte_cdk.models import ConfiguredAirbyteCatalog, SyncMode
+from airbyte_cdk.models import SyncMode
 from airbyte_cdk.sources.file_based.availability_strategy import AbstractFileBasedAvailabilityStrategy
 from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig, PrimaryKeyType
 from airbyte_cdk.sources.file_based.discovery_policy import AbstractDiscoveryPolicy
@@ -38,7 +38,7 @@ class AbstractFileBasedStream(Stream):
     def __init__(
         self,
         config: FileBasedStreamConfig,
-        catalog_schema: Optional[ConfiguredAirbyteCatalog],
+        catalog_schema: Optional[Mapping[str, Any]],
         stream_reader: AbstractFileBasedStreamReader,
         availability_strategy: AbstractFileBasedAvailabilityStrategy,
         discovery_policy: AbstractDiscoveryPolicy,
--- a/airbyte_cdk-0.51.5.dist-info/METADATA
+++ b/airbyte_cdk-0.51.7.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: airbyte-cdk
-Version: 0.51.5
+Version: 0.51.7
 Summary: A framework for writing Airbyte Connectors.
 Home-page: https://github.com/airbytehq/airbyte
 Author: Airbyte
--- a/airbyte_cdk-0.51.5.dist-info/RECORD
+++ b/airbyte_cdk-0.51.7.dist-info/RECORD
@@ -64,7 +64,7 @@ airbyte_cdk/sources/declarative/interpolation/interpolation.py,sha256=dyIM-bzh54
 airbyte_cdk/sources/declarative/interpolation/jinja.py,sha256=Dc0F87nElWsz_Ikj938eQ9uqZvyqgFhZ8Dqf_-hvndc,4800
 airbyte_cdk/sources/declarative/interpolation/macros.py,sha256=V6WGKJ9cXX1rjuM4bK3Cs9xEryMlkY2U3FMsSBhrgC8,3098
 airbyte_cdk/sources/declarative/models/__init__.py,sha256=EiYnzwCHZV7EYqMJqcy6xKSeHvTKZBsQndjbEwmiTW4,93
-airbyte_cdk/sources/declarative/models/declarative_component_schema.py,sha256=-Y8Nw3-6ZtvsATixMSAWteHCvYQU965dn4NpVq6aWYs,57232
+airbyte_cdk/sources/declarative/models/declarative_component_schema.py,sha256=ebor38wlQVqYD2QXk5X8v9xDZl0cEpIc2mFaKvpuiPE,57170
 airbyte_cdk/sources/declarative/parsers/__init__.py,sha256=ZnqYNxHsKCgO38IwB34RQyRMXTs4GTvlRi3ImKnIioo,61
 airbyte_cdk/sources/declarative/parsers/class_types_registry.py,sha256=bK4a74opm6WHyV7HqOVws6GE5Z7cLNc5MaTha69abIQ,6086
 airbyte_cdk/sources/declarative/parsers/custom_exceptions.py,sha256=y7_G5mM07zxT5YG975kdC2PAja-Uc83pYp8WrV3GNdo,522
@@ -136,7 +136,7 @@ airbyte_cdk/sources/embedded/catalog.py,sha256=mIM7rO5CZAUIHKbrKwn1-Zn9_e3sLiHrT
 airbyte_cdk/sources/embedded/runner.py,sha256=kZ0CcUANuMjdZ4fmvp_w9P2IcsS9WSHxNqYHqMwcfXI,1390
 airbyte_cdk/sources/embedded/tools.py,sha256=-Z4tZ4AP1OTi_zrqFM3YV8Rt7c60wvsrv0Dc-rTZ2uw,744
 airbyte_cdk/sources/file_based/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-airbyte_cdk/sources/file_based/exceptions.py,sha256=9vko7ker-nJFJ2-vSzeFAqWNgrv2unSxPZ2tTKuBbnU,3845
+airbyte_cdk/sources/file_based/exceptions.py,sha256=4jwHysXT6r2o37Z7ch00nbo45wPVsmCorRYbYTmWd2Q,3656
 airbyte_cdk/sources/file_based/file_based_source.py,sha256=NCbXAGPWBQSPAf5x2U2eCdOLUd26RhO5s6K87_AF8Es,6931
 airbyte_cdk/sources/file_based/file_based_stream_reader.py,sha256=K9fFHcSL4E8v-X2l38wRAcZCjpyifr35orvby8vQt84,3749
 airbyte_cdk/sources/file_based/remote_file.py,sha256=s3Qz2N786yqSMXqcWmsTOvYhgs-ry0xFcn5fGyyz7bY,581
@@ -144,11 +144,11 @@ airbyte_cdk/sources/file_based/schema_helpers.py,sha256=XBkOutIw_n6SNYU34qbyTbl0
 airbyte_cdk/sources/file_based/types.py,sha256=INxG7OPnkdUP69oYNKMAbwhvV1AGvLRHs1J6pIia2FI,218
 airbyte_cdk/sources/file_based/availability_strategy/__init__.py,sha256=WiPPuQGfmQlFUMFR5h3ECc-VzBj4vair6_4WAL87AEI,277
 airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py,sha256=NeHCiG4FFohzYpQQFfmTL4-5oI0nElHWgXX1xrm8-SU,1269
-airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py,sha256=7k9YD8ZVkUpRHN4x3F84Do8ZA91Ph576r3cNdvLBizk,4635
+airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py,sha256=InGBlGbInuNUQ6oaK5A9oICVc7ZNHMSYo8g5Vy2smOo,4266
 airbyte_cdk/sources/file_based/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py,sha256=P-CupGlif5XbKm6tc3FVC4WRMU4ogUbB3klcuZmZJ1k,3940
+airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py,sha256=WrV4sKtJoZ1dK31HK7NdBKlnYHkmu6NqjmEpkVqJ6tQ,4582
 airbyte_cdk/sources/file_based/config/avro_format.py,sha256=qGBB0RTjWDGZW-ilIwIq9OZl2BC-jBaq2WGrI3WVBsQ,597
-airbyte_cdk/sources/file_based/config/csv_format.py,sha256=lLyjOqp2gNrXcGtSWozheMMfUQcy0NBUAMWwmDr_B7A,4672
+airbyte_cdk/sources/file_based/config/csv_format.py,sha256=-r-uGQlo-nXfhPuOR05XtYx_1vht74r8_am2_p8mcP8,7166
 airbyte_cdk/sources/file_based/config/file_based_stream_config.py,sha256=5R0UlPJUGGx5OnpezZ0Fd8dyO4y2vMZtiPZR_3rfvSk,5916
 airbyte_cdk/sources/file_based/config/jsonl_format.py,sha256=B-s1uy9RiKpKMwmMlR7UT3WeQPlTI-xclD0fVM4IU1Q,254
 airbyte_cdk/sources/file_based/config/parquet_format.py,sha256=zvcHATNKoBIgU2UXuGnoldqLoRXG_X8ZzAkpqGPJtq4,625
@@ -157,7 +157,7 @@ airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py,sha
 airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py,sha256=Mx3zT9Dem4uNfaUT0oOtrESsuB1LrGAi5N-uw2swZZA,701
 airbyte_cdk/sources/file_based/file_types/__init__.py,sha256=N3a8cjXwRUN2__46IJTwrWlsyFiSA1xtSgPcPH28sn0,476
 airbyte_cdk/sources/file_based/file_types/avro_parser.py,sha256=7PVaW17wn80HYW1mu074X2dy0UgFoqFqGIOKN2ZMKD0,8686
-airbyte_cdk/sources/file_based/file_types/csv_parser.py,sha256=4YdsyH7ntoadhRqMXPl8rertWsQbtE0aJihp9V5zIlg,16586
+airbyte_cdk/sources/file_based/file_types/csv_parser.py,sha256=KgdpdkHAFducvXM2jQr356M0WVol-vX0cm42n9Kf_Yc,16684
 airbyte_cdk/sources/file_based/file_types/file_type_parser.py,sha256=cThTLc1YKSAapOn70lB09SzruRIPSShGIMz1f92QYV8,1555
 airbyte_cdk/sources/file_based/file_types/jsonl_parser.py,sha256=Kz6HLF0CrFHQ1Y6rJKGr7KmBWSLeDYFQmkg0WIi7Frg,5395
 airbyte_cdk/sources/file_based/file_types/parquet_parser.py,sha256=QulQ_soGb1LpQ_KTxqWZjmfACGkTUDUOeuSmNFtcSLk,8717
@@ -165,7 +165,7 @@ airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py,sha256=sEV
 airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py,sha256=uwk6Ugf23xKG4PRPVVRVwpcNjTwPgxejl03vLSEzK0s,604
 airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py,sha256=ZeAa0z50ywMU2chNjQ7JpL4yePU1NajhBa8FS7rXLVo,1643
 airbyte_cdk/sources/file_based/stream/__init__.py,sha256=QPDqdgjsabOQD93dSFqHGaFS_3pIwm-chEabZHiPJi0,265
-airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py,sha256=tvVew6din9y8a3hItzU0PjTQrMxbVI7bK-3pRTvOswg,5810
+airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py,sha256=BXO0b4iHNJUsF7GVIWTnY5Zlj-IjHS_JmqQlKsSDgz8,5777
 airbyte_cdk/sources/file_based/stream/default_file_based_stream.py,sha256=eFYqN657J5A0sf9og_w7qea8lu2xtUobjYYDldfmbmA,11839
 airbyte_cdk/sources/file_based/stream/cursor/__init__.py,sha256=MhFB5hOo8sjwvCh8gangaymdg3EJWYt_72brFOZt068,191
 airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py,sha256=i-FPeK8lwCzX34GCcmvL5Yvdh8-uu7FeCVYDoFbD7IY,1920
@@ -308,20 +308,23 @@ unit_tests/sources/file_based/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5N
 unit_tests/sources/file_based/helpers.py,sha256=JNCRl13oLRRun2XyYLSKLzfrzzOAMT57yUY0vZasxL4,2567
 unit_tests/sources/file_based/in_memory_files_source.py,sha256=HSZEtN7wb_NhBx4LVAEeAaeTByIBYZLr6xXJLI0FFLU,7777
 unit_tests/sources/file_based/test_file_based_stream_reader.py,sha256=gVJcjj8Q83LTxcU3LL9gv-9SAY21umtOMDTy1Z7A9OU,7552
-unit_tests/sources/file_based/test_scenarios.py,sha256=3rvlUzR1e5UnVXcr8dKx01oYqteSOfYibmblPyiiA08,18245
+unit_tests/sources/file_based/test_scenarios.py,sha256=1s3hN6xkmqHKGa348rK3sDLf-PPiEx0w-qfRi70gQnc,18167
 unit_tests/sources/file_based/test_schema_helpers.py,sha256=XJ27ecw0sjlSnKgQqV1DgnnjKB1TR2btq22OITh1Qdk,12333
+unit_tests/sources/file_based/availability_strategy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+unit_tests/sources/file_based/availability_strategy/test_default_file_based_availability_strategy.py,sha256=HzxFJVJFv3YpjVmJm45ZyS2HpbnhtEX2hm4r8VjkRFE,2463
 unit_tests/sources/file_based/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unit_tests/sources/file_based/config/test_abstract_file_based_spec.py,sha256=wmZAC-nBiUedMZi0n4zaC9oiZD9UTuYP5zJC1xxRnME,1216
+unit_tests/sources/file_based/config/test_csv_format.py,sha256=VYL-9Ec8hW_yO2Pj9F8pDfITcgEAFtSublYda7ut7QE,1132
 unit_tests/sources/file_based/config/test_file_based_stream_config.py,sha256=1eMsHlMQIFwyw20HjnhgKuiw6399sMcLTQ4LP09kTT4,3060
 unit_tests/sources/file_based/file_types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unit_tests/sources/file_based/file_types/test_avro_parser.py,sha256=INqwKXcgNb3h_tktNXYU6WNUD-iNwRYHCd3IrnQa5R4,11051
-unit_tests/sources/file_based/file_types/test_csv_parser.py,sha256=KB4WDy3aMAZ0CmJiqFaTUOZlK4urpvG9bwcwQ-h2-VY,20303
+unit_tests/sources/file_based/file_types/test_csv_parser.py,sha256=4onvErJCMNeSquZr7c1dX4TzqJlvQ3wulYCjAU_IblU,21266
 unit_tests/sources/file_based/file_types/test_jsonl_parser.py,sha256=foTf9U9LyAS8OR0BonwNgFWPqTrmzFV2lpPUfRMrioE,6134
 unit_tests/sources/file_based/file_types/test_parquet_parser.py,sha256=D7sKTty8aEqMDWWGKWUqDbWjTxhGkygU7ns4-_JceRY,13543
 unit_tests/sources/file_based/scenarios/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unit_tests/sources/file_based/scenarios/avro_scenarios.py,sha256=xUfw0crAvSTgQ2-chJx2ZiigQyo5IfrCuOFC1TWXXsQ,29795
-unit_tests/sources/file_based/scenarios/check_scenarios.py,sha256=7DR49LCjns72Dv5-R-cg6_SUR1zpHtE9_uFEWoYwx1s,5834
-unit_tests/sources/file_based/scenarios/csv_scenarios.py,sha256=UVdDblKj3R5qQkh-dj4xqZ2822GyJuymaAerWbX9HeE,95707
+unit_tests/sources/file_based/scenarios/check_scenarios.py,sha256=xZf28rlH93ap8JCkAjwocng-uAW-mvMx6BDOLbvVCig,5588
+unit_tests/sources/file_based/scenarios/csv_scenarios.py,sha256=nG4O2Ah0Uwgjg6SVTuioO_gPOigKxm-PlM2Tw21svYw,98724
 unit_tests/sources/file_based/scenarios/incremental_scenarios.py,sha256=0maHng11cFmvzFLOniyBxOEYoKj4DYR3NO9-pSYoFLs,60710
 unit_tests/sources/file_based/scenarios/jsonl_scenarios.py,sha256=N83fga4gMKkbm6hYnen1Z5p5eEgjnMB_M_sXx6B96cU,27503
 unit_tests/sources/file_based/scenarios/parquet_scenarios.py,sha256=-cBO1ZwberBxNMqDOtKz8yGwm3zB7elz_st2NKHeczM,26955
@@ -352,8 +355,8 @@ unit_tests/utils/test_schema_inferrer.py,sha256=Z2jHBZ540wnYkylIdV_2xr75Vtwlxuyg
 unit_tests/utils/test_secret_utils.py,sha256=XKe0f1RHYii8iwE6ATmBr5JGDI1pzzrnZUGdUSMJQP4,4886
 unit_tests/utils/test_stream_status_utils.py,sha256=NpV155JMXA6CG-2Zvofa14lItobyh3Onttc59X4m5DI,3382
 unit_tests/utils/test_traced_exception.py,sha256=bDFP5zMBizFenz6V2WvEZTRCKGB5ijh3DBezjbfoYIs,4198
-airbyte_cdk-0.51.5.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
-airbyte_cdk-0.51.5.dist-info/METADATA,sha256=6S2hoA3Ua7D7zVi9BQ7Zm6xJzPj8uBbKdifYwD-L2pw,9399
-airbyte_cdk-0.51.5.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
-airbyte_cdk-0.51.5.dist-info/top_level.txt,sha256=edvsDKTnE6sD2wfCUaeTfKf5gQIL6CPVMwVL2sWZzqo,51
-airbyte_cdk-0.51.5.dist-info/RECORD,,
+airbyte_cdk-0.51.7.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
+airbyte_cdk-0.51.7.dist-info/METADATA,sha256=YOrAlHsZod4Nq3VugY7nbE7MDd8r8ZU7gcvX4YzTuk0,9399
+airbyte_cdk-0.51.7.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
+airbyte_cdk-0.51.7.dist-info/top_level.txt,sha256=edvsDKTnE6sD2wfCUaeTfKf5gQIL6CPVMwVL2sWZzqo,51
+airbyte_cdk-0.51.7.dist-info/RECORD,,
--- /dev/null
+++ b/unit_tests/sources/file_based/availability_strategy/test_default_file_based_availability_strategy.py
@@ -0,0 +1,52 @@
+#
+# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
+#
+
+import unittest
+from datetime import datetime
+from unittest.mock import Mock, PropertyMock
+
+from airbyte_cdk.sources.file_based.availability_strategy.default_file_based_availability_strategy import (
+    DefaultFileBasedAvailabilityStrategy,
+)
+from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
+from airbyte_cdk.sources.file_based.config.jsonl_format import JsonlFormat
+from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
+from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
+from airbyte_cdk.sources.file_based.remote_file import RemoteFile
+from airbyte_cdk.sources.file_based.stream import AbstractFileBasedStream
+
+_FILE_WITH_UNKNOWN_EXTENSION = RemoteFile(uri="a.unknown_extension", last_modified=datetime.now(), file_type="csv")
+_ANY_CONFIG = FileBasedStreamConfig(
+    name="config.name",
+    file_type="parquet",
+    format=JsonlFormat(),
+)
+_ANY_SCHEMA = {"key": "value"}
+
+
+class DefaultFileBasedAvailabilityStrategyTest(unittest.TestCase):
+
+    def setUp(self) -> None:
+        self._stream_reader = Mock(spec=AbstractFileBasedStreamReader)
+        self._strategy = DefaultFileBasedAvailabilityStrategy(self._stream_reader)
+
+        self._parser = Mock(spec=FileTypeParser)
+        self._stream = Mock(spec=AbstractFileBasedStream)
+        self._stream.get_parser.return_value = self._parser
+        self._stream.catalog_schema = _ANY_SCHEMA
+        self._stream.config = _ANY_CONFIG
+        self._stream.validation_policy = PropertyMock(validate_schema_before_sync=False)
+
+    def test_given_file_extension_does_not_match_when_check_availability_and_parsability_then_stream_is_still_available(self) -> None:
+        """
+        Before, we had a validation on the file extension, but it turns out that in production users sometimes have a mismatch there. The
+        example we've seen was for the JSONL parser where the file extension was just `.json`. Note that more than one record was extracted
+        from this stream, so it's not just that the file is one JSON object.
+        """
+        self._stream.list_files.return_value = [_FILE_WITH_UNKNOWN_EXTENSION]
+        self._parser.parse_records.return_value = [{"a record": 1}]
+
+        is_available, reason = self._strategy.check_availability_and_parsability(self._stream, Mock(), Mock())
+
+        assert is_available
--- /dev/null
+++ b/unit_tests/sources/file_based/config/test_csv_format.py
@@ -0,0 +1,28 @@
+#
+# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
+#
+
+import unittest
+
+import pytest
+from airbyte_cdk.sources.file_based.config.csv_format import CsvHeaderAutogenerated, CsvHeaderFromCsv, CsvHeaderUserProvided
+from pydantic import ValidationError
+
+
+class CsvHeaderDefinitionTest(unittest.TestCase):
+    def test_given_user_provided_and_not_column_names_provided_then_raise_exception(self) -> None:
+        with pytest.raises(ValidationError):
+            CsvHeaderUserProvided(column_names=[])
+
+    def test_given_user_provided_and_column_names_then_config_is_valid(self) -> None:
+        # no error means that this test succeeds
+        CsvHeaderUserProvided(column_names=["1", "2", "3"])
+
+    def test_given_user_provided_then_csv_does_not_have_header_row(self) -> None:
+        assert not CsvHeaderUserProvided(column_names=["1", "2", "3"]).has_header_row()
+
+    def test_given_autogenerated_then_csv_does_not_have_header_row(self) -> None:
+        assert not CsvHeaderAutogenerated().has_header_row()
+
+    def test_given_from_csv_then_csv_has_header_row(self) -> None:
+        assert CsvHeaderFromCsv().has_header_row()
--- a/unit_tests/sources/file_based/file_types/test_csv_parser.py
+++ b/unit_tests/sources/file_based/file_types/test_csv_parser.py
@@ -13,7 +13,14 @@ from unittest import TestCase, mock
 from unittest.mock import Mock
 
 import pytest
-from airbyte_cdk.sources.file_based.config.csv_format import DEFAULT_FALSE_VALUES, DEFAULT_TRUE_VALUES, CsvFormat, InferenceType
+from airbyte_cdk.sources.file_based.config.csv_format import (
+    DEFAULT_FALSE_VALUES,
+    DEFAULT_TRUE_VALUES,
+    CsvFormat,
+    CsvHeaderAutogenerated,
+    CsvHeaderUserProvided,
+    InferenceType,
+)
 from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
 from airbyte_cdk.sources.file_based.exceptions import RecordParseError
 from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
@@ -278,13 +285,28 @@ class CsvReaderTest(unittest.TestCase):
         assert list(data_generator) == [{"header": "a value"}, {"header": "another value"}]
 
     def test_given_autogenerated_headers_when_read_data_then_generate_headers_with_format_fX(self) -> None:
-        self._config_format.autogenerate_column_names = True
+        self._config_format.header_definition = CsvHeaderAutogenerated()
         self._stream_reader.open_file.return_value = CsvFileBuilder().with_data(["0,1,2,3,4,5,6"]).build()
 
         data_generator = self._read_data()
 
         assert list(data_generator) == [{"f0": "0", "f1": "1", "f2": "2", "f3": "3", "f4": "4", "f5": "5", "f6": "6"}]
 
+    def test_given_user_provided_headers_when_read_data_then_use_user_provided_headers(self) -> None:
+        self._config_format.header_definition = CsvHeaderUserProvided(column_names=["first", "second", "third", "fourth"])
+        self._stream_reader.open_file.return_value = CsvFileBuilder().with_data(["0,1,2,3"]).build()
+
+        data_generator = self._read_data()
+
+        assert list(data_generator) == [{"first": "0", "second": "1", "third": "2", "fourth": "3"}]
+
+    def test_given_len_mistmatch_on_user_provided_headers_when_read_data_then_raise_error(self) -> None:
+        self._config_format.header_definition = CsvHeaderUserProvided(column_names=["missing", "one", "column"])
+        self._stream_reader.open_file.return_value = CsvFileBuilder().with_data(["0,1,2,3"]).build()
+
+        with pytest.raises(RecordParseError):
+            list(self._read_data())
+
     def test_given_skip_rows_after_header_when_read_data_then_do_not_parse_skipped_rows(self) -> None:
         self._config_format.skip_rows_after_header = 1
         self._stream_reader.open_file.return_value = (
--- a/unit_tests/sources/file_based/scenarios/check_scenarios.py
+++ b/unit_tests/sources/file_based/scenarios/check_scenarios.py
@@ -134,14 +134,6 @@ error_empty_stream_scenario = (
 ).build()
 
 
-error_extension_mismatch_scenario = (
-    _base_failure_scenario.copy()
-    .set_name("error_extension_mismatch_scenario")
-    .set_file_type("jsonl")
-    .set_expected_check_error(None, FileBasedSourceError.EXTENSION_MISMATCH.value)
-).build()
-
-
 error_listing_files_scenario = (
     _base_failure_scenario.copy()
     .set_name("error_listing_files_scenario")
--- a/unit_tests/sources/file_based/scenarios/csv_scenarios.py
+++ b/unit_tests/sources/file_based/scenarios/csv_scenarios.py
@@ -180,11 +180,43 @@ single_csv_scenario = (
                 "default": 0,
                 "type": "integer",
             },
-            "autogenerate_column_names": {
-                "title": "Autogenerate Column Names",
-                "description": "Whether to autogenerate column names if column_names is empty. If true, column names will be of the form \u201cf0\u201d, \u201cf1\u201d\u2026 If false, column names will be read from the first CSV row after skip_rows_before_header.",
-                "default": False,
-                "type": "boolean",
+            "header_definition": {
+                "title": "CSV Header Definition",
+                "type": "object",
+                "description": "How headers will be defined. `User Provided` assumes the CSV does not have a header row and uses the headers provided and `Autogenerated` assumes the CSV does not have a header row and the CDK will generate headers using for `f{i}` where `i` is the index starting from 0. Else, the default behavior is to use the header from the CSV file. If a user wants to autogenerate or provide column names for a CSV having headers, they can skip rows.",
+                "default": {"header_definition_type": "From CSV"},
+                "oneOf": [
+                    {
+                        "title": "From CSV",
+                        "type": "object",
+                        "properties": {
+                            "header_definition_type": {"title": "Header Definition Type", "default": "From CSV", "enum": ["From CSV"], "type": "string"},
+                        },
+                    },
+                    {
+                        "title": "Autogenerated",
+                        "type": "object",
+                        "properties": {
+                            "header_definition_type": {"title": "Header Definition Type", "default": "Autogenerated", "enum": ["Autogenerated"], "type": "string"},
+                        },
+                    },
+                    {
+                        "title": "User Provided",
+                        "type": "object",
+                        "properties": {
+                            "header_definition_type": {"title": "Header Definition Type", "default": "User Provided", "enum": ["User Provided"], "type": "string"},
+                            "column_names": {
+                                "title": "Column Names",
+                                "description": "The column names that will be used while emitting the CSV records",
+                                "type": "array",
+                                "items": {"type": "string"},
+                            },
+                        },
+                        "required": ["column_names"],
+                    },
+                ],
             },
             "true_values": {
                 "title": "True Values",
@@ -761,7 +793,6 @@ csv_multi_stream_scenario = (
     )
 ).build()
 
-
 csv_custom_format_scenario = (
     TestScenarioBuilder()
     .set_name("csv_custom_format")
@@ -868,7 +899,6 @@ csv_custom_format_scenario = (
     )
 ).build()
 
-
 multi_stream_custom_format = (
     TestScenarioBuilder()
     .set_name("multi_stream_custom_format_scenario")
@@ -1016,7 +1046,6 @@ multi_stream_custom_format = (
     )
 ).build()
 
-
 empty_schema_inference_scenario = (
     TestScenarioBuilder()
     .set_name("empty_schema_inference_scenario")
@@ -1092,7 +1121,6 @@ empty_schema_inference_scenario = (
     )
 ).build()
 
-
 schemaless_csv_scenario = (
     TestScenarioBuilder()
     .set_name("schemaless_csv_scenario")
@@ -1188,7 +1216,6 @@ schemaless_csv_scenario = (
     )
 ).build()
 
-
 schemaless_csv_multi_stream_scenario = (
     TestScenarioBuilder()
     .set_name("schemaless_csv_multi_stream_scenario")
@@ -1296,7 +1323,6 @@ schemaless_csv_multi_stream_scenario = (
     )
 ).build()
 
-
 schemaless_with_user_input_schema_fails_connection_check_scenario = (
     TestScenarioBuilder()
     .set_name("schemaless_with_user_input_schema_fails_connection_check_scenario")
@@ -1361,7 +1387,6 @@ schemaless_with_user_input_schema_fails_connection_check_scenario = (
     .set_expected_read_error(ConfigValidationError, FileBasedSourceError.CONFIG_VALIDATION_ERROR.value)
 ).build()
 
-
 schemaless_with_user_input_schema_fails_connection_check_multi_stream_scenario = (
     TestScenarioBuilder()
     .set_name("schemaless_with_user_input_schema_fails_connection_check_multi_stream_scenario")
@@ -1446,7 +1471,6 @@ schemaless_with_user_input_schema_fails_connection_check_multi_stream_scenario =
     .set_expected_read_error(ConfigValidationError, FileBasedSourceError.CONFIG_VALIDATION_ERROR.value)
 ).build()
 
-
 csv_string_can_be_null_with_input_schemas_scenario = (
     TestScenarioBuilder()
     .set_name("csv_string_can_be_null_with_input_schema")
@@ -2143,7 +2167,6 @@ csv_custom_delimiter_in_double_quotes_scenario = (
     )
 ).build()
 
-
 csv_skip_before_header_scenario = (
     TestScenarioBuilder()
     .set_name("csv_skip_before_header")
@@ -2278,7 +2301,6 @@ csv_skip_after_header_scenario = (
     )
 ).build()
 
-
 csv_skip_before_and_after_header_scenario = (
     TestScenarioBuilder()
     .set_name("csv_skip_before_after_header")
@@ -2363,7 +2385,7 @@ csv_autogenerate_column_names_scenario = (
                     "validation_policy": "Emit Record",
                     "format": {
                         "filetype": "csv",
-                        "autogenerate_column_names": True,
+                        "header_definition": {"header_definition_type": "Autogenerated"},
                     },
                 }
             ],
@@ -2556,7 +2578,6 @@ csv_custom_null_values_scenario = (
     )
 ).build()
 
-
 earlier_csv_scenario = (
     TestScenarioBuilder()
     .set_name("earlier_csv_stream")
--- a/unit_tests/sources/file_based/test_scenarios.py
+++ b/unit_tests/sources/file_based/test_scenarios.py
@@ -24,7 +24,6 @@ from unit_tests.sources.file_based.scenarios.avro_scenarios import (
 )
 from unit_tests.sources.file_based.scenarios.check_scenarios import (
     error_empty_stream_scenario,
-    error_extension_mismatch_scenario,
     error_listing_files_scenario,
     error_multi_stream_scenario,
     error_reading_file_scenario,
@@ -309,7 +308,6 @@ def test_spec(capsys: CaptureFixture[str], scenario: TestScenario) -> None:
 
 check_scenarios = [
     error_empty_stream_scenario,
-    error_extension_mismatch_scenario,
     error_listing_files_scenario,
     error_reading_file_scenario,
     error_record_validation_user_provided_schema_scenario,