airbyte-cdk 0.53.2__py3-none-any.whl → 0.53.4__py3-none-any.whl

--- a/airbyte_cdk/destinations/vector_db_based/config.py
+++ b/airbyte_cdk/destinations/vector_db_based/config.py
@@ -4,6 +4,7 @@
 
 from typing import List, Literal, Optional, Union
 
+from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
 from pydantic import BaseModel, Field
 
 
@@ -16,11 +17,10 @@ class SeparatorSplitterConfigModel(BaseModel):
     )
     keep_separator: bool = Field(default=False, title="Keep separator", description="Whether to keep the separator in the resulting chunks")
 
-    class Config:
+    class Config(OneOfOptionConfig):
         title = "By Separator"
-        schema_extra = {
-            "description": "Split the text by the list of separators until the chunk size is reached, using the earlier mentioned separators where possible. This is useful for splitting text fields by paragraphs, sentences, words, etc."
-        }
+        description = "Split the text by the list of separators until the chunk size is reached, using the earlier mentioned separators where possible. This is useful for splitting text fields by paragraphs, sentences, words, etc."
+        discriminator = "mode"
 
 
 class MarkdownHeaderSplitterConfigModel(BaseModel):
@@ -33,11 +33,10 @@ class MarkdownHeaderSplitterConfigModel(BaseModel):
         ge=1,
     )
 
-    class Config:
+    class Config(OneOfOptionConfig):
         title = "By Markdown header"
-        schema_extra = {
-            "description": "Split the text by Markdown headers down to the specified header level. If the chunk size fits multiple sections, they will be combined into a single chunk."
-        }
+        description = "Split the text by Markdown headers down to the specified header level. If the chunk size fits multiple sections, they will be combined into a single chunk."
+        discriminator = "mode"
 
 
 class CodeSplitterConfigModel(BaseModel):
@@ -65,11 +64,12 @@ class CodeSplitterConfigModel(BaseModel):
         ],
     )
 
-    class Config:
+    class Config(OneOfOptionConfig):
         title = "By Programming Language"
-        schema_extra = {
-            "description": "Split the text by suitable delimiters based on the programming language. This is useful for splitting code into chunks."
-        }
+        description = (
+            "Split the text by suitable delimiters based on the programming language. This is useful for splitting code into chunks."
+        )
+        discriminator = "mode"
 
 
 TextSplitterConfigModel = Union[SeparatorSplitterConfigModel, MarkdownHeaderSplitterConfigModel, CodeSplitterConfigModel]
@@ -128,11 +128,12 @@ class OpenAIEmbeddingConfigModel(BaseModel):
     mode: Literal["openai"] = Field("openai", const=True)
     openai_key: str = Field(..., title="OpenAI API key", airbyte_secret=True)
 
-    class Config:
+    class Config(OneOfOptionConfig):
         title = "OpenAI"
-        schema_extra = {
-            "description": "Use the OpenAI API to embed text. This option is using the text-embedding-ada-002 model with 1536 embedding dimensions."
-        }
+        description = (
+            "Use the OpenAI API to embed text. This option is using the text-embedding-ada-002 model with 1536 embedding dimensions."
+        )
+        discriminator = "mode"
 
 
 class OpenAICompatibleEmbeddingConfigModel(BaseModel):
@@ -151,9 +152,10 @@ class OpenAICompatibleEmbeddingConfigModel(BaseModel):
         title="Embedding dimensions", description="The number of dimensions the embedding model is generating", examples=[1536, 384]
     )
 
-    class Config:
+    class Config(OneOfOptionConfig):
         title = "OpenAI-compatible"
-        schema_extra = {"description": "Use a service that's compatible with the OpenAI API to embed text."}
+        description = "Use a service that's compatible with the OpenAI API to embed text."
+        discriminator = "mode"
 
 
 class AzureOpenAIEmbeddingConfigModel(BaseModel):
@@ -177,21 +179,19 @@ class AzureOpenAIEmbeddingConfigModel(BaseModel):
         examples=["your-resource-name"],
     )
 
-    class Config:
+    class Config(OneOfOptionConfig):
         title = "Azure OpenAI"
-        schema_extra = {
-            "description": "Use the Azure-hosted OpenAI API to embed text. This option is using the text-embedding-ada-002 model with 1536 embedding dimensions."
-        }
+        description = "Use the Azure-hosted OpenAI API to embed text. This option is using the text-embedding-ada-002 model with 1536 embedding dimensions."
+        discriminator = "mode"
 
 
 class FakeEmbeddingConfigModel(BaseModel):
     mode: Literal["fake"] = Field("fake", const=True)
 
-    class Config:
+    class Config(OneOfOptionConfig):
         title = "Fake"
-        schema_extra = {
-            "description": "Use a fake embedding made out of random vectors with 1536 embedding dimensions. This is useful for testing the data pipeline without incurring any costs."
-        }
+        description = "Use a fake embedding made out of random vectors with 1536 embedding dimensions. This is useful for testing the data pipeline without incurring any costs."
+        discriminator = "mode"
 
 
 class FromFieldEmbeddingConfigModel(BaseModel):
@@ -203,17 +203,17 @@ class FromFieldEmbeddingConfigModel(BaseModel):
         ..., title="Embedding dimensions", description="The number of dimensions the embedding model is generating", examples=[1536, 384]
     )
 
-    class Config:
+    class Config(OneOfOptionConfig):
         title = "From Field"
-        schema_extra = {
-            "description": "Use a field in the record as the embedding. This is useful if you already have an embedding for your data and want to store it in the vector store."
-        }
+        description = "Use a field in the record as the embedding. This is useful if you already have an embedding for your data and want to store it in the vector store."
+        discriminator = "mode"
 
 
 class CohereEmbeddingConfigModel(BaseModel):
     mode: Literal["cohere"] = Field("cohere", const=True)
     cohere_key: str = Field(..., title="Cohere API key", airbyte_secret=True)
 
-    class Config:
+    class Config(OneOfOptionConfig):
         title = "Cohere"
-        schema_extra = {"description": "Use the Cohere API to embed text."}
+        description = "Use the Cohere API to embed text."
+        discriminator = "mode"

--- a/airbyte_cdk/sources/file_based/config/avro_format.py
+++ b/airbyte_cdk/sources/file_based/config/avro_format.py
@@ -2,12 +2,14 @@
 # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
 #
 
+from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
 from pydantic import BaseModel, Field
 
 
 class AvroFormat(BaseModel):
-    class Config:
+    class Config(OneOfOptionConfig):
         title = "Avro Format"
+        discriminator = "filetype"
 
     filetype: str = Field(
         "avro",

--- a/airbyte_cdk/sources/file_based/config/csv_format.py
+++ b/airbyte_cdk/sources/file_based/config/csv_format.py
@@ -6,6 +6,7 @@ import codecs
 from enum import Enum
 from typing import Any, Dict, List, Optional, Set, Union
 
+from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
 from pydantic import BaseModel, Field, ValidationError, root_validator, validator
 
 
@@ -21,8 +22,9 @@ class CsvHeaderDefinitionType(Enum):
 
 
 class CsvHeaderFromCsv(BaseModel):
-    class Config:
+    class Config(OneOfOptionConfig):
         title = "From CSV"
+        discriminator = "header_definition_type"
 
     header_definition_type: str = Field(
         CsvHeaderDefinitionType.FROM_CSV.value,
@@ -34,8 +36,9 @@ class CsvHeaderFromCsv(BaseModel):
 
 
 class CsvHeaderAutogenerated(BaseModel):
-    class Config:
+    class Config(OneOfOptionConfig):
         title = "Autogenerated"
+        discriminator = "header_definition_type"
 
     header_definition_type: str = Field(
         CsvHeaderDefinitionType.AUTOGENERATED.value,
@@ -47,8 +50,9 @@ class CsvHeaderAutogenerated(BaseModel):
 
 
 class CsvHeaderUserProvided(BaseModel):
-    class Config:
+    class Config(OneOfOptionConfig):
         title = "User Provided"
+        discriminator = "header_definition_type"
 
     header_definition_type: str = Field(
         CsvHeaderDefinitionType.USER_PROVIDED.value,
@@ -74,8 +78,9 @@ DEFAULT_FALSE_VALUES = ["n", "no", "f", "false", "off", "0"]
 
 
 class CsvFormat(BaseModel):
-    class Config:
+    class Config(OneOfOptionConfig):
         title = "CSV Format"
+        discriminator = "filetype"
 
     filetype: str = Field(
         "csv",
@@ -123,7 +128,7 @@ class CsvFormat(BaseModel):
     )
     header_definition: Union[CsvHeaderFromCsv, CsvHeaderAutogenerated, CsvHeaderUserProvided] = Field(
         title="CSV Header Definition",
-        default=CsvHeaderFromCsv(),
+        default=CsvHeaderFromCsv(header_definition_type=CsvHeaderDefinitionType.FROM_CSV.value),
         description="How headers will be defined. `User Provided` assumes the CSV does not have a header row and uses the headers provided and `Autogenerated` assumes the CSV does not have a header row and the CDK will generate headers using for `f{i}` where `i` is the index starting from 0. Else, the default behavior is to use the header from the CSV file. If a user wants to autogenerate or provide column names for a CSV having headers, they can skip rows.",
     )
     true_values: Set[str] = Field(

--- a/airbyte_cdk/sources/file_based/config/jsonl_format.py
+++ b/airbyte_cdk/sources/file_based/config/jsonl_format.py
@@ -2,12 +2,14 @@
 # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
 #
 
+from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
 from pydantic import BaseModel, Field
 
 
 class JsonlFormat(BaseModel):
-    class Config:
+    class Config(OneOfOptionConfig):
         title = "Jsonl Format"
+        discriminator = "filetype"
 
     filetype: str = Field(
         "jsonl",

--- a/airbyte_cdk/sources/file_based/config/parquet_format.py
+++ b/airbyte_cdk/sources/file_based/config/parquet_format.py
@@ -2,12 +2,14 @@
 # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
 #
 
+from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
 from pydantic import BaseModel, Field
 
 
 class ParquetFormat(BaseModel):
-    class Config:
+    class Config(OneOfOptionConfig):
         title = "Parquet Format"
+        discriminator = "filetype"
 
     filetype: str = Field(
         "parquet",

--- a/airbyte_cdk/sources/file_based/config/unstructured_format.py
+++ b/airbyte_cdk/sources/file_based/config/unstructured_format.py
@@ -2,15 +2,26 @@
 # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
 #
 
+from typing import Optional
+
+from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
 from pydantic import BaseModel, Field
 
 
 class UnstructuredFormat(BaseModel):
-    class Config:
+    class Config(OneOfOptionConfig):
         title = "Document File Type Format (Experimental)"
-        schema_extra = {"description": "Extract text from document formats (.pdf, .docx, .md, .pptx) and emit as one record per file."}
+        description = "Extract text from document formats (.pdf, .docx, .md, .pptx) and emit as one record per file."
+        discriminator = "filetype"
 
     filetype: str = Field(
         "unstructured",
         const=True,
     )
+
+    skip_unprocessable_file_types: Optional[bool] = Field(
+        default=True,
+        title="Skip Unprocessable File Types",
+        description="If true, skip files that cannot be parsed because of their file type and log a warning. If false, fail the sync. Corrupted files with valid file types will still result in a failed sync.",
+        always_show=True,
+    )

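The new `skip_unprocessable_file_types` field is surfaced through the stream's `format` block. A minimal sketch of a source config that enables it; the shape mirrors the test scenarios further down in this diff, and the stream name and glob are placeholders:

```python
# Minimal sketch of a file-based source config exercising the new option.
# The shape mirrors the unstructured_scenarios test configs below.
config = {
    "streams": [
        {
            "name": "stream1",
            "format": {"filetype": "unstructured", "skip_unprocessable_file_types": True},
            "globs": ["*"],
            "validation_policy": "Emit Record",
        }
    ]
}
```
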
--- a/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py
+++ b/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py
@@ -6,6 +6,7 @@ from io import BytesIO, IOBase
 from typing import Any, Dict, Iterable, List, Mapping, Optional
 
 from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
+from airbyte_cdk.sources.file_based.config.unstructured_format import UnstructuredFormat
 from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError
 from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
 from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
@@ -60,11 +61,12 @@ class UnstructuredParser(FileTypeParser):
         stream_reader: AbstractFileBasedStreamReader,
         logger: logging.Logger,
     ) -> SchemaType:
+        format = _extract_format(config)
         with stream_reader.open_file(file, self.file_read_mode, None, logger) as file_handle:
             filetype = self._get_filetype(file_handle, file)
 
             if filetype not in self._supported_file_types():
-                raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri)
+                self._handle_unprocessable_file(file, format, logger)
 
             return {
                 "content": {"type": "string"},
@@ -79,14 +81,16 @@ class UnstructuredParser(FileTypeParser):
         logger: logging.Logger,
         discovered_schema: Optional[Mapping[str, SchemaType]],
     ) -> Iterable[Dict[str, Any]]:
+        format = _extract_format(config)
         with stream_reader.open_file(file, self.file_read_mode, None, logger) as file_handle:
-            markdown = self._read_file(file_handle, file)
-            yield {
-                "content": markdown,
-                "document_key": file.uri,
-            }
-
-    def _read_file(self, file_handle: IOBase, remote_file: RemoteFile) -> str:
+            markdown = self._read_file(file_handle, file, format, logger)
+            if markdown is not None:
+                yield {
+                    "content": markdown,
+                    "document_key": file.uri,
+                }
+
+    def _read_file(self, file_handle: IOBase, remote_file: RemoteFile, format: UnstructuredFormat, logger: logging.Logger) -> Optional[str]:
         _import_unstructured()
         if (
             (not unstructured_partition_pdf)
@@ -104,7 +108,8 @@ class UnstructuredParser(FileTypeParser):
             decoded_content: str = unstructured_optional_decode(file_content)
             return decoded_content
         if filetype not in self._supported_file_types():
-            raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD, filename=remote_file.uri)
+            self._handle_unprocessable_file(remote_file, format, logger)
+            return None
 
         file: Any = file_handle
         if filetype == FileType.PDF:
@@ -120,6 +125,12 @@ class UnstructuredParser(FileTypeParser):
 
         return self._render_markdown(elements)
 
+    def _handle_unprocessable_file(self, remote_file: RemoteFile, format: UnstructuredFormat, logger: logging.Logger) -> None:
+        if format.skip_unprocessable_file_types:
+            logger.warn(f"File {remote_file.uri} cannot be parsed. Skipping it.")
+        else:
+            raise RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD, filename=remote_file.uri)
+
     def _get_filetype(self, file: IOBase, remote_file: RemoteFile) -> Optional[FileType]:
         """
         Detect the file type based on the file name and the file content.
@@ -172,3 +183,10 @@ class UnstructuredParser(FileTypeParser):
     @property
     def file_read_mode(self) -> FileReadMode:
         return FileReadMode.READ_BINARY
+
+
+def _extract_format(config: FileBasedStreamConfig) -> UnstructuredFormat:
+    config_format = config.format
+    if not isinstance(config_format, UnstructuredFormat):
+        raise ValueError(f"Invalid format config: {config_format}")
+    return config_format

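Both `infer_schema` and `parse_records` now funnel unsupported file types through `_handle_unprocessable_file`, so the skip-or-fail decision lives in one place. A hedged sketch of the two behaviors, calling the (private) helper directly with mocked collaborators:

```python
# Hedged sketch of the new behavior; the file and logger are stand-ins.
from datetime import datetime
from unittest.mock import MagicMock

from airbyte_cdk.sources.file_based.config.unstructured_format import UnstructuredFormat
from airbyte_cdk.sources.file_based.exceptions import RecordParseError
from airbyte_cdk.sources.file_based.file_types import UnstructuredParser
from airbyte_cdk.sources.file_based.remote_file import RemoteFile

parser = UnstructuredParser()
file = RemoteFile(uri="path/to/file.xyz", last_modified=datetime.now())
logger = MagicMock()

# Skipping enabled: the helper only logs a warning and returns, so
# parse_records ends up yielding nothing for the unsupported file.
parser._handle_unprocessable_file(file, UnstructuredFormat(skip_unprocessable_file_types=True), logger)

# Skipping disabled: the helper raises, failing the sync as before.
try:
    parser._handle_unprocessable_file(file, UnstructuredFormat(skip_unprocessable_file_types=False), logger)
except RecordParseError:
    pass
```
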
--- a/airbyte_cdk/sources/streams/concurrent/thread_based_concurrent_stream.py
+++ b/airbyte_cdk/sources/streams/concurrent/thread_based_concurrent_stream.py
@@ -154,7 +154,7 @@ class ThreadBasedConcurrentStream(AbstractStream):
         if len(futures) < self._max_concurrent_tasks:
             return
 
-        for index in range(len(futures)):
+        for index in reversed(range(len(futures))):
             future = futures[index]
             optional_exception = future.exception()
             if optional_exception:

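Context for the one-line change above: `_wait_while_too_many_pending_futures` prunes completed futures from the list it is iterating over (the new unit test below asserts the list is mutated), and deleting by index during a forward scan shifts later elements down, so some are skipped or the stale `range` runs past the shortened list. Iterating the indices in reverse keeps every not-yet-visited index stable. A self-contained sketch of the pattern; the pruning body here is a stand-in, not the stream's actual code:

```python
# Sketch of why the loop runs in reverse; "done" stands in for future.done()
# and the deletion for the method's actual pruning of completed futures.
from typing import List

def prune_done(futures: List[str]) -> None:
    for index in reversed(range(len(futures))):
        if futures[index] == "done":
            del futures[index]  # safe: only indices below `index` shift

futures = ["done", "done", "pending"]
prune_done(futures)
assert futures == ["pending"]
# A forward `for index in range(len(futures))` would skip the element that
# slides into a freed slot and can index past the end of the shortened list.
```
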
--- /dev/null
+++ b/airbyte_cdk/utils/oneof_option_config.py
@@ -0,0 +1,33 @@
+#
+# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
+#
+
+from typing import Any, Dict
+
+
+class OneOfOptionConfig:
+    """
+    Base class to configure a Pydantic model that's used as a oneOf option in a parent model in a way that's compatible with all Airbyte consumers.
+
+    Inherit from this class in the nested Config class in a model and set title and description (these show up in the UI) and discriminator (this is making sure it's marked as required in the schema).
+
+    Usage:
+
+    ```python
+    class OptionModel(BaseModel):
+        mode: Literal["option_a"] = Field("option_a", const=True)
+        option_a_field: str = Field(...)
+
+        class Config(OneOfOptionConfig):
+            title = "Option A"
+            description = "Option A description"
+            discriminator = "mode"
+    ```
+    """
+
+    @staticmethod
+    def schema_extra(schema: Dict[str, Any], model: Any) -> None:
+        if hasattr(model.Config, "description"):
+            schema["description"] = model.Config.description
+        if hasattr(model.Config, "discriminator"):
+            schema.setdefault("required", []).append(model.Config.discriminator)

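In pydantic v1, a callable `Config.schema_extra` is invoked as a post-processing hook with the generated schema and the model, which is what makes this base class work: inheriting it injects the description and appends the discriminator to `required` even though the discriminator field has a default. A quick check of the docstring's own example, assuming pydantic v1 semantics:

```python
# Assumes pydantic v1, where a callable Config.schema_extra receives
# (schema, model) and may mutate the schema in place.
from typing import Literal

from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
from pydantic import BaseModel, Field

class OptionModel(BaseModel):
    mode: Literal["option_a"] = Field("option_a", const=True)
    option_a_field: str = Field(...)

    class Config(OneOfOptionConfig):
        title = "Option A"
        description = "Option A description"
        discriminator = "mode"

schema = OptionModel.schema()
assert schema["title"] == "Option A"
assert schema["description"] == "Option A description"
# "mode" has a default, so pydantic alone would not list it as required;
# the schema_extra hook appends it.
assert "mode" in schema["required"]
```
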
--- a/airbyte_cdk-0.53.2.dist-info/METADATA
+++ b/airbyte_cdk-0.53.4.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: airbyte-cdk
-Version: 0.53.2
+Version: 0.53.4
 Summary: A framework for writing Airbyte Connectors.
 Home-page: https://github.com/airbytehq/airbyte
 Author: Airbyte

--- a/airbyte_cdk-0.53.2.dist-info/RECORD
+++ b/airbyte_cdk-0.53.4.dist-info/RECORD
@@ -13,7 +13,7 @@ airbyte_cdk/connector_builder/models.py,sha256=U2LrL1syxZ0gQ3LgnwVj9ozL6uGH5f9bi
 airbyte_cdk/destinations/__init__.py,sha256=0Uxmz3iBAyZJdk_bqUVt2pb0UwRTpFjTnFE6fQFbWKY,126
 airbyte_cdk/destinations/destination.py,sha256=_tIMnKcRQbtIsjVvNOVjfbIxgCNLuBXQwQj8MyVm3BI,5420
 airbyte_cdk/destinations/vector_db_based/__init__.py,sha256=eAkzwTjBbXBhJ5GfPO5I53Zgpv5xQFLRQS8n4nuyPt0,1006
-airbyte_cdk/destinations/vector_db_based/config.py,sha256=tMp8blgdrI4t7a9Ri9Vydk0TOcRqLTHHUjVlXtc0Wa4,9562
+airbyte_cdk/destinations/vector_db_based/config.py,sha256=FrbW0RVzTrdMotgj7uPjUsEiD8Ij9_Z6FA3OXdqZv3Y,9812
 airbyte_cdk/destinations/vector_db_based/document_processor.py,sha256=ldrlmCT4gFHc_A5B_um4OteXg1OR0LGyDmswO1316tA,8649
 airbyte_cdk/destinations/vector_db_based/embedder.py,sha256=davAE4UtrpWDjbV74tck5zvKksxizvSdF9X51WFMbW4,10913
 airbyte_cdk/destinations/vector_db_based/indexer.py,sha256=58Uf34yIe0QHbnpbkS7rH2sqL7eLzwWUjx7X4yciyeA,3165
@@ -156,12 +156,12 @@ airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availab
 airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py,sha256=DFUJZzUfl6lBbIEVk-BXFh-yGxXle0anM7eM2NsnCeQ,5019
 airbyte_cdk/sources/file_based/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py,sha256=WrV4sKtJoZ1dK31HK7NdBKlnYHkmu6NqjmEpkVqJ6tQ,4582
-airbyte_cdk/sources/file_based/config/avro_format.py,sha256=oLJIuNInu-MgjkVFqwHvmQ4CPZa4NZingq_I0_trQ3g,589
-airbyte_cdk/sources/file_based/config/csv_format.py,sha256=xlBZ5WyAshagjjjbUV_je1JyZ1oY1GbIzJRUZ9UfSvo,7095
+airbyte_cdk/sources/file_based/config/avro_format.py,sha256=lQSEq5JZY0M5y9mW93R4EjrIb8brYXUgrXCY-6EMHww,711
+airbyte_cdk/sources/file_based/config/csv_format.py,sha256=L3JEgb91yrCob1oYrGl0088QEWblkOsRfDmMfWRQ0bg,7482
 airbyte_cdk/sources/file_based/config/file_based_stream_config.py,sha256=5x2BQVV_ZZcV5727gIypnfoIiI21X_dnkkjCAkQy3ZI,3967
-airbyte_cdk/sources/file_based/config/jsonl_format.py,sha256=usmTeTw8xw8OKwrz8MsiS5E1LQiVEbedGHMHNAfOOlk,252
-airbyte_cdk/sources/file_based/config/parquet_format.py,sha256=O_Eq0yVzjPiKDz8H1-f9yMowtCcJwT9F2prNYpXZkp0,614
-airbyte_cdk/sources/file_based/config/unstructured_format.py,sha256=iiEGIPspyDcGY36cagqNV3CazEJdZoTrSZwpJZb_laE,430
+airbyte_cdk/sources/file_based/config/jsonl_format.py,sha256=fAPzZnoghGgHjaDvx6Qo68C8j54mBxo1NTdpwSI0VZo,374
+airbyte_cdk/sources/file_based/config/parquet_format.py,sha256=8GTDTQyvS7pWLVG0LWirHVE1snHd0Au5R4Ym33-ezEg,736
+airbyte_cdk/sources/file_based/config/unstructured_format.py,sha256=8yc0TMhlf1bcJc34IXzYkYHQ5HpGN4rt1f3zKSiCeYk,934
 airbyte_cdk/sources/file_based/discovery_policy/__init__.py,sha256=x_7JsQGiS7Ytmr0ZDS0SNYGcNUzC4wCm3_1-Mf3ZFnw,283
 airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py,sha256=0o_qmEO0-IojO4Ckgp4V3ackTM9Ui1sUHW5HwANueLM,621
 airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py,sha256=QeZghVmf2Cq4wy_6NYcHmR6SLgdWfsGgctYg2ZsjFE4,939
@@ -171,7 +171,7 @@ airbyte_cdk/sources/file_based/file_types/csv_parser.py,sha256=VVV829XszmDRqmgv6
 airbyte_cdk/sources/file_based/file_types/file_type_parser.py,sha256=b987gENSP649ijRd_33ZVJVNIlFMr-F-FkG333NkNFc,2235
 airbyte_cdk/sources/file_based/file_types/jsonl_parser.py,sha256=Kz6HLF0CrFHQ1Y6rJKGr7KmBWSLeDYFQmkg0WIi7Frg,5395
 airbyte_cdk/sources/file_based/file_types/parquet_parser.py,sha256=Hz_3GqCPKmTuHJgMHY_afD3Ul6YsF28lEPeJSnpvNc4,8776
-airbyte_cdk/sources/file_based/file_types/unstructured_parser.py,sha256=OMx1Xpp_xgtcTehtpsz9GCvr86-fgdEmS4ev0VOgNZE,7213
+airbyte_cdk/sources/file_based/file_types/unstructured_parser.py,sha256=DcRoivT3mwJY8aJjTIzDADKwMR0s6mp2RuCsj8l8Bko,8115
 airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py,sha256=sEVnRhZ8x9f7PNjo6lewxid9z0PI8eSj7gSoFC3MH1Y,527
 airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py,sha256=uwk6Ugf23xKG4PRPVVRVwpcNjTwPgxejl03vLSEzK0s,604
 airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py,sha256=ZeAa0z50ywMU2chNjQ7JpL4yePU1NajhBa8FS7rXLVo,1643
@@ -199,7 +199,7 @@ airbyte_cdk/sources/streams/concurrent/exceptions.py,sha256=-WETGIY5_QFmVeDFiqm4
 airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py,sha256=uAUhCkxFOaptDJfIEDmFnnF4xn4coG5kvE4B_5tVx14,1557
 airbyte_cdk/sources/streams/concurrent/partition_reader.py,sha256=H8sGVVGx6uKMSUehRaqmVbE19DE3cx3NivQ4sFj8wbk,1303
 airbyte_cdk/sources/streams/concurrent/state_converter.py,sha256=PwqcRVPR6LQxWL0yvPTp_u2Uh0hBJU-BDSjPKiyJVEk,4689
-airbyte_cdk/sources/streams/concurrent/thread_based_concurrent_stream.py,sha256=ZfjrqY-I43H-qQHmwJnHnP_4snvPBFwD9dIVwV1gOqU,10833
+airbyte_cdk/sources/streams/concurrent/thread_based_concurrent_stream.py,sha256=M7CpPPBswHTYjG4opiTOf5eWHOJ6i4TyP0v991pFxOo,10843
 airbyte_cdk/sources/streams/concurrent/partitions/__init__.py,sha256=4Hw-PX1-VgESLF16cDdvuYCzGJtHntThLF4qIiULWeo,61
 airbyte_cdk/sources/streams/concurrent/partitions/partition.py,sha256=tjXF8lZMvyfZaCYCHr5aTPwbVstmRjYZDwYAvLDY-ds,1312
 airbyte_cdk/sources/streams/concurrent/partitions/partition_generator.py,sha256=_ymkkBr71_qt1fW0_MUqw96OfNBkeJngXQ09yolEDHw,441
@@ -238,6 +238,7 @@ airbyte_cdk/utils/datetime_format_inferrer.py,sha256=gGKDQ3OdY18R5CVFhq4c7zB_E4C
 airbyte_cdk/utils/event_timing.py,sha256=Hn5kCc9xGKLcV5EYpJCZwNiz9neKKu2WG8FJF_hy278,2377
 airbyte_cdk/utils/is_cloud_environment.py,sha256=KAR_Ak_aD2X6a2zEtRAg3kaEHiJtOifWE1uwqW2-yHE,566
 airbyte_cdk/utils/mapping_helpers.py,sha256=tVkbgnxy12Ah2Jxh_3tKW7CTKTAVIcPexsBhsiyTbp4,1729
+airbyte_cdk/utils/oneof_option_config.py,sha256=N8EmWdYdwt0FM7fuShh6H8nj_r4KEL9tb2DJJtwsPow,1180
 airbyte_cdk/utils/schema_inferrer.py,sha256=D8vFVgeK6VLcAug4YVAHfa3D29On0A_nMlwq9SPlfPI,3799
 airbyte_cdk/utils/spec_schema_transformations.py,sha256=LGjSSk8lmBiC0GiHqxDwu_iMN6bCe05UMpz9e7nCw5E,741
 airbyte_cdk/utils/stream_status_utils.py,sha256=X1Vy7BhglycjdIWpfKDfwJussNCxYffelKt6Utjx-qY,1005
@@ -338,7 +339,7 @@ unit_tests/sources/declarative/stream_slicers/test_cartesian_product_stream_slic
 unit_tests/sources/file_based/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unit_tests/sources/file_based/helpers.py,sha256=MZTwaWtX0a6TPbFcUMP-EgqBunK2wpoElgApCEE1bN4,2659
 unit_tests/sources/file_based/in_memory_files_source.py,sha256=r2yD6-_ABXG7_PIyTq4ACN21sHyg3g-Hd9dIgxfDQUk,8235
-unit_tests/sources/file_based/test_file_based_scenarios.py,sha256=dhWlq2qKuPBxCeVvFCSoySGXEbJCszunblWOjAnFpuw,11430
+unit_tests/sources/file_based/test_file_based_scenarios.py,sha256=9xVFaFFHjnzZziVmoVmLTULdxANt_zSrwVgANAVytl4,11564
 unit_tests/sources/file_based/test_file_based_stream_reader.py,sha256=P6yTp7tbPfREzi5SXg4SSSql5nxiRV571YdOmwb_SzY,9219
 unit_tests/sources/file_based/test_scenarios.py,sha256=2-9pqnfva3RDRyODy0xcK6mxrP_mHH5vLrmBhqgZO8o,9703
 unit_tests/sources/file_based/test_schema_helpers.py,sha256=IYIDdLRK41RkSG_ZW2cagAt9krV4QLbkzu6r7vPx9Js,12047
@@ -355,17 +356,17 @@ unit_tests/sources/file_based/file_types/test_avro_parser.py,sha256=LCoGa0fvOber
 unit_tests/sources/file_based/file_types/test_csv_parser.py,sha256=zgHjLfPASRwFxkubdRK0UkskGTOAdASpWHKucm0AmqM,22423
 unit_tests/sources/file_based/file_types/test_jsonl_parser.py,sha256=foTf9U9LyAS8OR0BonwNgFWPqTrmzFV2lpPUfRMrioE,6134
 unit_tests/sources/file_based/file_types/test_parquet_parser.py,sha256=J66wfbAaflSe5y3ixCZ4tLPEQdU62eYj-pNXycCtK0U,14159
-unit_tests/sources/file_based/file_types/test_unstructured_parser.py,sha256=I9yY11rrtdybKl5C-yh3qwKL1_aCnNwcg1xcENujDak,5670
+unit_tests/sources/file_based/file_types/test_unstructured_parser.py,sha256=W6jbuX2BBwHECuTS8NUlPgjFptrXGDQW4tJZUKwcfR0,7028
 unit_tests/sources/file_based/scenarios/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unit_tests/sources/file_based/scenarios/avro_scenarios.py,sha256=oeQUmCV7d2aTShreYc-PvVb4cWqLSsVwHfg-lcKjzPs,30554
 unit_tests/sources/file_based/scenarios/check_scenarios.py,sha256=0xkt21ASTnTAMP0RYJEsF3yMGsNN7wWOoG_tmzL9PYw,6750
-unit_tests/sources/file_based/scenarios/csv_scenarios.py,sha256=M4Ol5y1WrUlNhSW2uyD4aUfoxeg2FrPKGHT5tfxXeBM,108612
+unit_tests/sources/file_based/scenarios/csv_scenarios.py,sha256=ochAex6o44Ov7-KpTWnaYOZja_kzprBB1aM9eVQIHeg,109887
 unit_tests/sources/file_based/scenarios/file_based_source_builder.py,sha256=wgb7l5VohcEvZT82ZpJcjINSrjuJtzJZS4zuZjdKpJ4,3874
 unit_tests/sources/file_based/scenarios/incremental_scenarios.py,sha256=B7YE2IbvgTH_v7DYQEuv7yn2IG15aKUvJ_7dA4d3Cg4,69413
 unit_tests/sources/file_based/scenarios/jsonl_scenarios.py,sha256=LsOf-tpjWNuwskPcgAMhMpQQ3iaHaD3PjPmt2M2zSzo,31839
 unit_tests/sources/file_based/scenarios/parquet_scenarios.py,sha256=MGgLCqkTJb8uNEwYZY3zbVVDZRSBKSmf2s8VMuYse_I,26549
 unit_tests/sources/file_based/scenarios/scenario_builder.py,sha256=feSSViayuoxTquoRhMUg4Lcui7dtwWHQ1Fe5y9igWSo,8728
-unit_tests/sources/file_based/scenarios/unstructured_scenarios.py,sha256=BuAPJMP1Aur35lN24S3mEwj9zl6OYWs7aI4sdC_dGwo,62420
+unit_tests/sources/file_based/scenarios/unstructured_scenarios.py,sha256=3jeDxyLh6LgwK0wMhU884fqSXG47H3AWvIQDD15jO6c,64973
 unit_tests/sources/file_based/scenarios/user_input_schema_scenarios.py,sha256=7CxIaqZxAGSPs4AtcKZ9FLVVYQPsS__uXi9wnQMKn3U,28322
 unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py,sha256=Try0knJN5wfoGNO38QGoLGIcqSceSAQsUWO42CusNYI,33005
 unit_tests/sources/file_based/stream/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -386,7 +387,7 @@ unit_tests/sources/streams/concurrent/test_concurrent_partition_generator.py,sha
 unit_tests/sources/streams/concurrent/test_cursor.py,sha256=sqH8xE3GamETSSVqsdKafziAAm-y_j_MegES_C5ExMM,5790
 unit_tests/sources/streams/concurrent/test_partition_reader.py,sha256=eM5dzfmLKm9Lj-BfQUjAZRhCZzfvhk7AkKpcHGcoPfg,931
 unit_tests/sources/streams/concurrent/test_state_converter.py,sha256=rvg8becWR1iPdm5TAanZssKj5_iw8dInE_uqmjqghZE,8349
-unit_tests/sources/streams/concurrent/test_thread_based_concurrent_stream.py,sha256=HJYZCwgSyGmKdQSInZK1rUDbtW5RepdOa05hC10RPe4,10894
+unit_tests/sources/streams/concurrent/test_thread_based_concurrent_stream.py,sha256=_jBMJIZ6Hu9mWX4v9SRUdtxvgntA-rQpNbbygBi6HXA,11629
 unit_tests/sources/streams/concurrent/scenarios/__init__.py,sha256=4Hw-PX1-VgESLF16cDdvuYCzGJtHntThLF4qIiULWeo,61
 unit_tests/sources/streams/concurrent/scenarios/incremental_scenarios.py,sha256=x77AQf8_O4dQ2aF1o800CzI0hOEyU8ayxoNdSOvxkhM,10495
 unit_tests/sources/streams/concurrent/scenarios/stream_facade_builder.py,sha256=FdgEO-bWA_IDFIJb0W83qE4QCCZ8eexbn_Mq8LJq0iE,5040
@@ -410,8 +411,8 @@ unit_tests/utils/test_schema_inferrer.py,sha256=Z2jHBZ540wnYkylIdV_2xr75Vtwlxuyg
 unit_tests/utils/test_secret_utils.py,sha256=XKe0f1RHYii8iwE6ATmBr5JGDI1pzzrnZUGdUSMJQP4,4886
 unit_tests/utils/test_stream_status_utils.py,sha256=N2TxwKge45RHUKFlPcP2o5jXYjJPKMKiu6Fm2_leZYY,3388
 unit_tests/utils/test_traced_exception.py,sha256=bDFP5zMBizFenz6V2WvEZTRCKGB5ijh3DBezjbfoYIs,4198
-airbyte_cdk-0.53.2.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
-airbyte_cdk-0.53.2.dist-info/METADATA,sha256=DC0qcLA2D2QlnQKG4S8Ojcm01bLCdN5msrVZy0T6DhI,11983
-airbyte_cdk-0.53.2.dist-info/WHEEL,sha256=Xo9-1PvkuimrydujYJAjF7pCkriuXBpUPEjma1nZyJ0,92
-airbyte_cdk-0.53.2.dist-info/top_level.txt,sha256=edvsDKTnE6sD2wfCUaeTfKf5gQIL6CPVMwVL2sWZzqo,51
-airbyte_cdk-0.53.2.dist-info/RECORD,,
+airbyte_cdk-0.53.4.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
+airbyte_cdk-0.53.4.dist-info/METADATA,sha256=n_0ZAbpdj_k3YO8y-B1g9javdvN876EaSuA6-Cu5xQY,11983
+airbyte_cdk-0.53.4.dist-info/WHEEL,sha256=Xo9-1PvkuimrydujYJAjF7pCkriuXBpUPEjma1nZyJ0,92
+airbyte_cdk-0.53.4.dist-info/top_level.txt,sha256=edvsDKTnE6sD2wfCUaeTfKf5gQIL6CPVMwVL2sWZzqo,51
+airbyte_cdk-0.53.4.dist-info/RECORD,,

--- a/unit_tests/sources/file_based/file_types/test_unstructured_parser.py
+++ b/unit_tests/sources/file_based/file_types/test_unstructured_parser.py
@@ -7,6 +7,7 @@ from datetime import datetime
 from unittest.mock import MagicMock, mock_open, patch
 
 import pytest
+from airbyte_cdk.sources.file_based.config.unstructured_format import UnstructuredFormat
 from airbyte_cdk.sources.file_based.exceptions import RecordParseError
 from airbyte_cdk.sources.file_based.file_types import UnstructuredParser
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
@@ -17,37 +18,48 @@ FILE_URI = "path/to/file.xyz"
 
 
 @pytest.mark.parametrize(
-    "filetype, raises",
+    "filetype, format_config, raises",
     [
         pytest.param(
             FileType.MD,
+            UnstructuredFormat(skip_unprocessable_file_types=False),
             False,
             id="markdown file",
         ),
         pytest.param(
             FileType.CSV,
+            UnstructuredFormat(skip_unprocessable_file_types=False),
             True,
             id="wrong file format",
         ),
+        pytest.param(
+            FileType.CSV,
+            UnstructuredFormat(skip_unprocessable_file_types=True),
+            False,
+            id="wrong file format skipping",
+        ),
         pytest.param(
             FileType.PDF,
+            UnstructuredFormat(skip_unprocessable_file_types=False),
             False,
             id="pdf file",
         ),
         pytest.param(
             FileType.DOCX,
+            UnstructuredFormat(skip_unprocessable_file_types=False),
             False,
             id="docx file",
        ),
         pytest.param(
             FileType.PPTX,
+            UnstructuredFormat(skip_unprocessable_file_types=False),
             False,
             id="pptx file",
         ),
     ],
 )
 @patch("airbyte_cdk.sources.file_based.file_types.unstructured_parser.detect_filetype")
-def test_infer_schema(mock_detect_filetype, filetype, raises):
+def test_infer_schema(mock_detect_filetype, filetype, format_config, raises):
     # use a fresh event loop to avoid leaking into other tests
     main_loop = asyncio.get_event_loop()
     loop = asyncio.new_event_loop()
@@ -59,11 +71,13 @@ def test_infer_schema(mock_detect_filetype, filetype, raises):
     fake_file.uri = FILE_URI
     logger = MagicMock()
     mock_detect_filetype.return_value = filetype
+    config = MagicMock()
+    config.format = format_config
     if raises:
         with pytest.raises(RecordParseError):
-            loop.run_until_complete(UnstructuredParser().infer_schema(MagicMock(), fake_file, stream_reader, logger))
+            loop.run_until_complete(UnstructuredParser().infer_schema(config, fake_file, stream_reader, logger))
     else:
-        schema = loop.run_until_complete(UnstructuredParser().infer_schema(MagicMock(), MagicMock(), MagicMock(), MagicMock()))
+        schema = loop.run_until_complete(UnstructuredParser().infer_schema(config, MagicMock(), MagicMock(), MagicMock()))
         assert schema == {
             "content": {"type": "string"},
             "document_key": {"type": "string"},
@@ -73,10 +87,11 @@ def test_infer_schema(mock_detect_filetype, filetype, raises):
 
 
 @pytest.mark.parametrize(
-    "filetype, parse_result, raises, expected_records",
+    "filetype, format_config, parse_result, raises, expected_records",
     [
         pytest.param(
             FileType.MD,
+            UnstructuredFormat(skip_unprocessable_file_types=False),
             "test",
             False,
             [
@@ -89,13 +104,23 @@ def test_infer_schema(mock_detect_filetype, filetype, raises):
         ),
         pytest.param(
             FileType.CSV,
-            "test",
+            UnstructuredFormat(skip_unprocessable_file_types=False),
+            None,
             True,
             None,
             id="wrong file format",
         ),
+        pytest.param(
+            FileType.CSV,
+            UnstructuredFormat(skip_unprocessable_file_types=True),
+            None,
+            False,
+            [],
+            id="skip_unprocessable_file_types",
+        ),
         pytest.param(
             FileType.PDF,
+            UnstructuredFormat(skip_unprocessable_file_types=False),
             [
                 Title("heading"),
                 Text("This is the text"),
@@ -113,6 +138,7 @@ def test_infer_schema(mock_detect_filetype, filetype, raises):
         ),
         pytest.param(
             FileType.PDF,
+            UnstructuredFormat(skip_unprocessable_file_types=False),
             [
                 Title("first level heading", metadata=ElementMetadata(category_depth=1)),
                 Title("second level heading", metadata=ElementMetadata(category_depth=2)),
@@ -128,6 +154,7 @@ def test_infer_schema(mock_detect_filetype, filetype, raises):
         ),
         pytest.param(
             FileType.DOCX,
+            UnstructuredFormat(skip_unprocessable_file_types=False),
             [
                 Title("heading"),
                 Text("This is the text"),
@@ -157,6 +184,7 @@ def test_parse_records(
     mock_partition_pptx,
     mock_partition_pdf,
     filetype,
+    format_config,
     parse_result,
     raises,
     expected_records,
@@ -166,6 +194,8 @@ def test_parse_records(
     fake_file = RemoteFile(uri=FILE_URI, last_modified=datetime.now())
     fake_file.uri = FILE_URI
     logger = MagicMock()
+    config = MagicMock()
+    config.format = format_config
     mock_detect_filetype.return_value = filetype
     mock_partition_docx.return_value = parse_result
     mock_partition_pptx.return_value = parse_result
@@ -173,6 +203,6 @@ def test_parse_records(
     mock_optional_decode.side_effect = lambda x: x.decode("utf-8")
     if raises:
         with pytest.raises(RecordParseError):
-            list(UnstructuredParser().parse_records(MagicMock(), fake_file, stream_reader, logger, MagicMock()))
+            list(UnstructuredParser().parse_records(config, fake_file, stream_reader, logger, MagicMock()))
     else:
-        assert list(UnstructuredParser().parse_records(MagicMock(), fake_file, stream_reader, logger, MagicMock())) == expected_records
+        assert list(UnstructuredParser().parse_records(config, fake_file, stream_reader, logger, MagicMock())) == expected_records

--- a/unit_tests/sources/file_based/scenarios/csv_scenarios.py
+++ b/unit_tests/sources/file_based/scenarios/csv_scenarios.py
@@ -122,6 +122,7 @@ single_csv_scenario: TestScenario[InMemoryFilesSource] = (
                 "type": "boolean",
             },
         },
+        "required": ["filetype"],
     },
     {
         "title": "CSV Format",
@@ -200,6 +201,7 @@ single_csv_scenario: TestScenario[InMemoryFilesSource] = (
                 "type": "string",
             },
         },
+        "required": ["header_definition_type"],
     },
     {
         "title": "Autogenerated",
@@ -212,6 +214,7 @@ single_csv_scenario: TestScenario[InMemoryFilesSource] = (
                 "type": "string",
             },
         },
+        "required": ["header_definition_type"],
     },
     {
         "title": "User Provided",
@@ -230,7 +233,7 @@ single_csv_scenario: TestScenario[InMemoryFilesSource] = (
                 "items": {"type": "string"},
             },
         },
-        "required": ["column_names"],
+        "required": ["column_names", "header_definition_type"],
     },
 ],
 },
@@ -258,6 +261,7 @@ single_csv_scenario: TestScenario[InMemoryFilesSource] = (
                 "enum": ["None", "Primitive Types Only"],
             },
         },
+        "required": ["filetype"],
     },
     {
         "title": "Jsonl Format",
@@ -265,6 +269,7 @@ single_csv_scenario: TestScenario[InMemoryFilesSource] = (
         "properties": {
             "filetype": {"title": "Filetype", "default": "jsonl", "const": "jsonl", "type": "string"}
         },
+        "required": ["filetype"],
     },
     {
         "title": "Parquet Format",
@@ -283,6 +288,7 @@ single_csv_scenario: TestScenario[InMemoryFilesSource] = (
                 "type": "boolean",
             },
         },
+        "required": ["filetype"],
     },
     {
         "title": "Document File Type Format (Experimental)",
@@ -293,9 +299,17 @@ single_csv_scenario: TestScenario[InMemoryFilesSource] = (
                 "default": "unstructured",
                 "const": "unstructured",
                 "type": "string",
-            }
+            },
+            "skip_unprocessable_file_types": {
+                "type": "boolean",
+                "default": True,
+                "title": "Skip Unprocessable File Types",
+                "description": "If true, skip files that cannot be parsed because of their file type and log a warning. If false, fail the sync. Corrupted files with valid file types will still result in a failed sync.",
+                "always_show": True,
+            },
         },
         "description": "Extract text from document formats (.pdf, .docx, .md, .pptx) and emit as one record per file.",
+        "required": ["filetype"],
     },
 ],
 },

--- a/unit_tests/sources/file_based/scenarios/unstructured_scenarios.py
+++ b/unit_tests/sources/file_based/scenarios/unstructured_scenarios.py
@@ -112,15 +112,16 @@ simple_markdown_scenario = (
     )
 ).build()
 
-unstructured_invalid_file_type_discover_scenario = (
+# If skip unprocessable file types is set to false, then discover will fail if it encounters a non-matching file type
+unstructured_invalid_file_type_discover_scenario_no_skip = (
     TestScenarioBuilder()
-    .set_name("unstructured_invalid_file_type_discover_scenario")
+    .set_name("unstructured_invalid_file_type_discover_scenario_no_skip")
     .set_config(
         {
             "streams": [
                 {
                     "name": "stream1",
-                    "format": {"filetype": "unstructured"},
+                    "format": {"filetype": "unstructured", "skip_unprocessable_file_types": False},
                     "globs": ["*"],
                     "validation_policy": "Emit Record",
                 }
@@ -172,6 +173,69 @@ unstructured_invalid_file_type_discover_scenario = (
     .set_expected_discover_error(AirbyteTracedException, "Error inferring schema from files")
 ).build()
 
+# If skip unprocessable file types is set to true, then discover will succeed even if there are non-matching file types
+unstructured_invalid_file_type_discover_scenario_skip = (
+    TestScenarioBuilder()
+    .set_name("unstructured_invalid_file_type_discover_scenario_skip")
+    .set_config(
+        {
+            "streams": [
+                {
+                    "name": "stream1",
+                    "format": {"filetype": "unstructured", "skip_unprocessable_file_types": True},
+                    "globs": ["*"],
+                    "validation_policy": "Emit Record",
+                }
+            ]
+        }
+    )
+    .set_source_builder(
+        FileBasedSourceBuilder()
+        .set_files(
+            {
+                "a.txt": {
+                    "contents": bytes("Just a humble text file", "UTF-8"),
+                    "last_modified": "2023-06-05T03:54:07.000Z",
+                },
+            }
+        )
+        .set_file_type("unstructured")
+    )
+    .set_expected_catalog(
+        {
+            "streams": [
+                {
+                    "default_cursor_field": ["_ab_source_file_last_modified"],
+                    "json_schema": {
+                        "type": "object",
+                        "properties": {
+                            "document_key": {
+                                "type": ["null", "string"],
+                            },
+                            "content": {
+                                "type": ["null", "string"],
+                            },
+                            "_ab_source_file_last_modified": {
+                                "type": "string",
+                            },
+                            "_ab_source_file_url": {
+                                "type": "string",
+                            },
+                        },
+                    },
+                    "name": "stream1",
+                    "source_defined_cursor": True,
+                    "supported_sync_modes": ["full_refresh", "incremental"],
+                }
+            ]
+        }
+    )
+    .set_expected_records([])
+).build()
+
+# TODO When working on https://github.com/airbytehq/airbyte/issues/31605, this test should be split into two tests:
+# 1. Test that the file is skipped if skip_unprocessable_file_types is set to true
+# 2. Test that the sync fails if skip_unprocessable_file_types is set to false
 unstructured_invalid_file_type_read_scenario = (
     TestScenarioBuilder()
     .set_name("unstructured_invalid_file_type_read_scenario")

--- a/unit_tests/sources/file_based/test_file_based_scenarios.py
+++ b/unit_tests/sources/file_based/test_file_based_scenarios.py
@@ -102,7 +102,8 @@ from unit_tests.sources.file_based.scenarios.scenario_builder import TestScenari
 from unit_tests.sources.file_based.scenarios.unstructured_scenarios import (
     simple_markdown_scenario,
     simple_unstructured_scenario,
-    unstructured_invalid_file_type_discover_scenario,
+    unstructured_invalid_file_type_discover_scenario_no_skip,
+    unstructured_invalid_file_type_discover_scenario_skip,
     unstructured_invalid_file_type_read_scenario,
 )
 from unit_tests.sources.file_based.scenarios.user_input_schema_scenarios import (
@@ -203,7 +204,8 @@ discover_scenarios = [
     single_partitioned_parquet_scenario,
     simple_markdown_scenario,
     simple_unstructured_scenario,
-    unstructured_invalid_file_type_discover_scenario,
+    unstructured_invalid_file_type_discover_scenario_no_skip,
+    unstructured_invalid_file_type_discover_scenario_skip,
     unstructured_invalid_file_type_read_scenario,
 ]
 

--- a/unit_tests/sources/streams/concurrent/test_thread_based_concurrent_stream.py
+++ b/unit_tests/sources/streams/concurrent/test_thread_based_concurrent_stream.py
@@ -13,6 +13,8 @@ from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partitio
 from airbyte_cdk.sources.streams.concurrent.partitions.record import Record
 from airbyte_cdk.sources.streams.concurrent.thread_based_concurrent_stream import ThreadBasedConcurrentStream
 
+_MAX_CONCURRENT_TASKS = 2
+
 
 class ThreadBasedConcurrentStreamTest(unittest.TestCase):
     def setUp(self):
@@ -39,7 +41,7 @@ class ThreadBasedConcurrentStreamTest(unittest.TestCase):
             self._logger,
             self._message_repository,
             1,
-            2,
+            _MAX_CONCURRENT_TASKS,
             0,
             cursor=self._cursor,
         )
@@ -142,15 +144,33 @@ class ThreadBasedConcurrentStreamTest(unittest.TestCase):
         f2 = Mock()
 
         # Verify that the done() method will be called until only one future is still running
-        f1.done.return_value = False
+        f1.done.return_value = True
         f1.exception.return_value = None
-        f2.done.return_value = False
-        f2.exception.return_value = ValueError("An exception")
+        f2.done.return_value = True
+        f2.exception.return_value = ValueError("ERROR")
         futures = [f1, f2]
 
         with pytest.raises(RuntimeError):
             self._stream._wait_while_too_many_pending_futures(futures)
 
+    def test_given_removing_multiple_elements_when_pruning_then_fail_immediately(self):
+        # Verify that the done() method will be called until only one future is still running
+        futures = []
+        for _ in range(_MAX_CONCURRENT_TASKS + 1):
+            future = Mock()
+            future.done.return_value = True
+            future.exception.return_value = None
+            futures.append(future)
+
+        pending_future = Mock()
+        pending_future.done.return_value = False
+        pending_future.exception.return_value = None
+        futures.append(pending_future)
+
+        self._stream._wait_while_too_many_pending_futures(futures)
+
+        assert futures == [pending_future]
+
     def test_as_airbyte_stream(self):
         expected_airbyte_stream = AirbyteStream(
             name=self._name,