airbyte-cdk 0.57.1__py3-none-any.whl → 0.57.3__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -4,6 +4,7 @@
4
4
 
5
5
  import os
6
6
  from abc import ABC, abstractmethod
7
+ from dataclasses import dataclass
7
8
  from typing import List, Optional, Union, cast
8
9
 
9
10
  from airbyte_cdk.destinations.vector_db_based.config import (
@@ -15,8 +16,8 @@ from airbyte_cdk.destinations.vector_db_based.config import (
15
16
  OpenAIEmbeddingConfigModel,
16
17
  ProcessingConfigModel,
17
18
  )
18
- from airbyte_cdk.destinations.vector_db_based.document_processor import Chunk
19
19
  from airbyte_cdk.destinations.vector_db_based.utils import create_chunks, format_exception
20
+ from airbyte_cdk.models import AirbyteRecordMessage
20
21
  from airbyte_cdk.utils.traced_exception import AirbyteTracedException, FailureType
21
22
  from langchain.embeddings.cohere import CohereEmbeddings
22
23
  from langchain.embeddings.fake import FakeEmbeddings
@@ -24,6 +25,12 @@ from langchain.embeddings.localai import LocalAIEmbeddings
24
25
  from langchain.embeddings.openai import OpenAIEmbeddings
25
26
 
26
27
 
28
+ @dataclass
29
+ class Document:
30
+ page_content: str
31
+ record: AirbyteRecordMessage
32
+
33
+
27
34
  class Embedder(ABC):
28
35
  """
29
36
  Embedder is an abstract class that defines the interface for embedding text.
@@ -41,7 +48,7 @@ class Embedder(ABC):
41
48
  pass
42
49
 
43
50
  @abstractmethod
44
- def embed_chunks(self, chunks: List[Chunk]) -> List[Optional[List[float]]]:
51
+ def embed_documents(self, documents: List[Document]) -> List[Optional[List[float]]]:
45
52
  """
46
53
  Embed the text of each chunk and return the resulting embedding vectors.
47
54
  If a chunk cannot be embedded or is configured to not be embedded, return None for that chunk.
@@ -72,7 +79,7 @@ class BaseOpenAIEmbedder(Embedder):
72
79
  return format_exception(e)
73
80
  return None
74
81
 
75
- def embed_chunks(self, chunks: List[Chunk]) -> List[Optional[List[float]]]:
82
+ def embed_documents(self, documents: List[Document]) -> List[Optional[List[float]]]:
76
83
  """
77
84
  Embed the text of each chunk and return the resulting embedding vectors.
78
85
 
@@ -80,9 +87,9 @@ class BaseOpenAIEmbedder(Embedder):
80
87
  It's still possible to run into the rate limit between each embed call because the available token budget hasn't recovered between the calls,
81
88
  but the built-in retry mechanism of the OpenAI client handles that.
82
89
  """
83
- # Each chunk can hold at most self.chunk_size tokens, so tokens-per-minute by maximum tokens per chunk is the number of chunks that can be embedded at once without exhausting the limit in a single request
90
+ # Each chunk can hold at most self.chunk_size tokens, so the tokens-per-minute limit divided by the maximum tokens per chunk is the number of documents that can be embedded at once without exhausting the limit in a single request
84
91
  embedding_batch_size = OPEN_AI_TOKEN_LIMIT // self.chunk_size
85
- batches = create_chunks(chunks, batch_size=embedding_batch_size)
92
+ batches = create_chunks(documents, batch_size=embedding_batch_size)
86
93
  embeddings: List[Optional[List[float]]] = []
87
94
  for batch in batches:
88
95
  embeddings.extend(self.embeddings.embed_documents([chunk.page_content for chunk in batch]))
@@ -121,8 +128,8 @@ class CohereEmbedder(Embedder):
121
128
  return format_exception(e)
122
129
  return None
123
130
 
124
- def embed_chunks(self, chunks: List[Chunk]) -> List[Optional[List[float]]]:
125
- return cast(List[Optional[List[float]]], self.embeddings.embed_documents([chunk.page_content or "" for chunk in chunks]))
131
+ def embed_documents(self, documents: List[Document]) -> List[Optional[List[float]]]:
132
+ return cast(List[Optional[List[float]]], self.embeddings.embed_documents([document.page_content for document in documents]))
126
133
 
127
134
  @property
128
135
  def embedding_dimensions(self) -> int:
@@ -142,8 +149,8 @@ class FakeEmbedder(Embedder):
142
149
  return format_exception(e)
143
150
  return None
144
151
 
145
- def embed_chunks(self, chunks: List[Chunk]) -> List[Optional[List[float]]]:
146
- return cast(List[Optional[List[float]]], self.embeddings.embed_documents([chunk.page_content or "" for chunk in chunks]))
152
+ def embed_documents(self, documents: List[Document]) -> List[Optional[List[float]]]:
153
+ return cast(List[Optional[List[float]]], self.embeddings.embed_documents([document.page_content for document in documents]))
147
154
 
148
155
  @property
149
156
  def embedding_dimensions(self) -> int:
@@ -173,8 +180,8 @@ class OpenAICompatibleEmbedder(Embedder):
173
180
  return format_exception(e)
174
181
  return None
175
182
 
176
- def embed_chunks(self, chunks: List[Chunk]) -> List[Optional[List[float]]]:
177
- return cast(List[Optional[List[float]]], self.embeddings.embed_documents([chunk.page_content or "" for chunk in chunks]))
183
+ def embed_documents(self, documents: List[Document]) -> List[Optional[List[float]]]:
184
+ return cast(List[Optional[List[float]]], self.embeddings.embed_documents([document.page_content for document in documents]))
178
185
 
179
186
  @property
180
187
  def embedding_dimensions(self) -> int:
@@ -190,32 +197,32 @@ class FromFieldEmbedder(Embedder):
190
197
  def check(self) -> Optional[str]:
191
198
  return None
192
199
 
193
- def embed_chunks(self, chunks: List[Chunk]) -> List[Optional[List[float]]]:
200
+ def embed_documents(self, documents: List[Document]) -> List[Optional[List[float]]]:
194
201
  """
195
202
  From each chunk, pull the embedding from the field specified in the config.
196
203
  Check that the field exists, is a list of numbers and is the correct size. If not, raise an AirbyteTracedException explaining the problem.
197
204
  """
198
205
  embeddings: List[Optional[List[float]]] = []
199
- for chunk in chunks:
200
- data = chunk.record.data
206
+ for document in documents:
207
+ data = document.record.data
201
208
  if self.config.field_name not in data:
202
209
  raise AirbyteTracedException(
203
210
  internal_message="Embedding vector field not found",
204
211
  failure_type=FailureType.config_error,
205
- message=f"Record {str(data)[:250]}... in stream {chunk.record.stream} does not contain embedding vector field {self.config.field_name}. Please check your embedding configuration, the embedding vector field has to be set correctly on every record.",
212
+ message=f"Record {str(data)[:250]}... in stream {document.record.stream} does not contain embedding vector field {self.config.field_name}. Please check your embedding configuration, the embedding vector field has to be set correctly on every record.",
206
213
  )
207
214
  field = data[self.config.field_name]
208
215
  if not isinstance(field, list) or not all(isinstance(x, (int, float)) for x in field):
209
216
  raise AirbyteTracedException(
210
217
  internal_message="Embedding vector field not a list of numbers",
211
218
  failure_type=FailureType.config_error,
212
- message=f"Record {str(data)[:250]}... in stream {chunk.record.stream} does contain embedding vector field {self.config.field_name}, but it is not a list of numbers. Please check your embedding configuration, the embedding vector field has to be a list of numbers of length {self.config.dimensions} on every record.",
219
+ message=f"Record {str(data)[:250]}... in stream {document.record.stream} does contain embedding vector field {self.config.field_name}, but it is not a list of numbers. Please check your embedding configuration, the embedding vector field has to be a list of numbers of length {self.config.dimensions} on every record.",
213
220
  )
214
221
  if len(field) != self.config.dimensions:
215
222
  raise AirbyteTracedException(
216
223
  internal_message="Embedding vector field has wrong length",
217
224
  failure_type=FailureType.config_error,
218
- message=f"Record {str(data)[:250]}... in stream {chunk.record.stream} does contain embedding vector field {self.config.field_name}, but it has length {len(field)} instead of the configured {self.config.dimensions}. Please check your embedding configuration, the embedding vector field has to be a list of numbers of length {self.config.dimensions} on every record.",
225
+ message=f"Record {str(data)[:250]}... in stream {document.record.stream} does contain embedding vector field {self.config.field_name}, but it has length {len(field)} instead of the configured {self.config.dimensions}. Please check your embedding configuration, the embedding vector field has to be a list of numbers of length {self.config.dimensions} on every record.",
219
226
  )
220
227
  embeddings.append(field)
221
228
 
@@ -8,7 +8,7 @@ from typing import Dict, Iterable, List, Tuple
8
8
 
9
9
  from airbyte_cdk.destinations.vector_db_based.config import ProcessingConfigModel
10
10
  from airbyte_cdk.destinations.vector_db_based.document_processor import Chunk, DocumentProcessor
11
- from airbyte_cdk.destinations.vector_db_based.embedder import Embedder
11
+ from airbyte_cdk.destinations.vector_db_based.embedder import Document, Embedder
12
12
  from airbyte_cdk.destinations.vector_db_based.indexer import Indexer
13
13
  from airbyte_cdk.models import AirbyteMessage, ConfiguredAirbyteCatalog, Type
14
14
 
@@ -16,14 +16,14 @@ from airbyte_cdk.models import AirbyteMessage, ConfiguredAirbyteCatalog, Type
16
16
  class Writer:
17
17
  """
18
18
  The Writer class is orchestrating the document processor, the embedder and the indexer:
19
- * Incoming records are passed through the document processor to generate documents
20
- * One the configured batch size is reached, the documents are passed to the embedder to generate embeddings
21
- * The embedder embeds the documents
22
- * The indexer deletes old documents by the associated record id before indexing the new ones
19
+ * Incoming records are passed through the document processor to generate chunks
20
+ * Once the configured batch size is reached, the chunks are passed to the embedder to generate embeddings
21
+ * The embedder embeds the chunks
22
+ * The indexer deletes old chunks by the associated record id before indexing the new ones
23
23
 
24
24
  The destination connector is responsible to create a writer instance and pass the input messages iterable to the write method.
25
25
  The batch size can be configured by the destination connector to give the freedom of either letting the user configure it or hardcoding it to a sensible value depending on the destination.
26
- The omit_raw_text parameter can be used to omit the raw text from the documents. This can be useful if the raw text is very large and not needed for the destination.
26
+ The omit_raw_text parameter can be used to omit the raw text from the chunks. This can be useful if the raw text is very large and not needed for the destination.
27
27
  """
28
28
 
29
29
  def __init__(
@@ -37,21 +37,29 @@ class Writer:
37
37
  self._init_batch()
38
38
 
39
39
  def _init_batch(self) -> None:
40
- self.documents: Dict[Tuple[str, str], List[Chunk]] = defaultdict(list)
40
+ self.chunks: Dict[Tuple[str, str], List[Chunk]] = defaultdict(list)
41
41
  self.ids_to_delete: Dict[Tuple[str, str], List[str]] = defaultdict(list)
42
- self.number_of_documents = 0
42
+ self.number_of_chunks = 0
43
+
44
+ def _convert_to_document(self, chunk: Chunk) -> Document:
45
+ """
46
+ Convert a chunk to a document for the embedder.
47
+ """
48
+ if chunk.page_content is None:
49
+ raise ValueError("Cannot embed a chunk without page content")
50
+ return Document(page_content=chunk.page_content, record=chunk.record)
43
51
 
44
52
  def _process_batch(self) -> None:
45
53
  for (namespace, stream), ids in self.ids_to_delete.items():
46
54
  self.indexer.delete(ids, namespace, stream)
47
55
 
48
- for (namespace, stream), documents in self.documents.items():
49
- embeddings = self.embedder.embed_chunks(documents)
50
- for i, document in enumerate(documents):
56
+ for (namespace, stream), chunks in self.chunks.items():
57
+ embeddings = self.embedder.embed_documents([self._convert_to_document(chunk) for chunk in chunks])
58
+ for i, document in enumerate(chunks):
51
59
  document.embedding = embeddings[i]
52
60
  if self.omit_raw_text:
53
61
  document.page_content = None
54
- self.indexer.index(documents, namespace, stream)
62
+ self.indexer.index(chunks, namespace, stream)
55
63
 
56
64
  self._init_batch()
57
65
 
@@ -65,12 +73,12 @@ class Writer:
65
73
  self._process_batch()
66
74
  yield message
67
75
  elif message.type == Type.RECORD:
68
- record_documents, record_id_to_delete = self.processor.process(message.record)
69
- self.documents[(message.record.namespace, message.record.stream)].extend(record_documents)
76
+ record_chunks, record_id_to_delete = self.processor.process(message.record)
77
+ self.chunks[(message.record.namespace, message.record.stream)].extend(record_chunks)
70
78
  if record_id_to_delete is not None:
71
79
  self.ids_to_delete[(message.record.namespace, message.record.stream)].append(record_id_to_delete)
72
- self.number_of_documents += len(record_documents)
73
- if self.number_of_documents >= self.batch_size:
80
+ self.number_of_chunks += len(record_chunks)
81
+ if self.number_of_chunks >= self.batch_size:
74
82
  self._process_batch()
75
83
 
76
84
  self._process_batch()
@@ -3,6 +3,7 @@
3
3
  #
4
4
  import logging
5
5
  import traceback
6
+ from datetime import datetime
6
7
  from io import BytesIO, IOBase
7
8
  from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple, Union
8
9
 
@@ -56,6 +57,8 @@ def user_error(e: Exception) -> bool:
56
57
  """
57
58
  Return True if this exception is caused by user error, False otherwise.
58
59
  """
60
+ if isinstance(e, RecordParseError):
61
+ return True
59
62
  if not isinstance(e, requests.exceptions.RequestException):
60
63
  return False
61
64
  return bool(e.response and 400 <= e.response.status_code < 500)
@@ -164,10 +167,14 @@ class UnstructuredParser(FileTypeParser):
164
167
  return self._read_file_locally(file_handle, filetype, format.strategy, remote_file)
165
168
  elif format.processing.mode == "api":
166
169
  try:
167
- result: str = self._read_file_remotely_with_retries(file_handle, format.processing, filetype, format.strategy)
170
+ result: str = self._read_file_remotely_with_retries(file_handle, format.processing, filetype, format.strategy, remote_file)
168
171
  except Exception as e:
169
- # Re-throw as config error so the sync is stopped as problems with the external API need to be resolved by the user and are not considered part of the SLA.
172
+ # If a parse error happens while processing the file remotely, the file is corrupted. This case is handled by the parse_records method, so just re-raise.
173
+ #
174
+ # For other exceptions, re-throw as config error so the sync is stopped as problems with the external API need to be resolved by the user and are not considered part of the SLA.
170
175
  # Once this parser leaves experimental stage, we should consider making this a system error instead for issues that might be transient.
176
+ if isinstance(e, RecordParseError):
177
+ raise e
171
178
  raise AirbyteTracedException.from_exception(e, failure_type=FailureType.config_error)
172
179
 
173
180
  return result
@@ -210,7 +217,13 @@ class UnstructuredParser(FileTypeParser):
210
217
  return False, "Base URL must start with https://"
211
218
 
212
219
  try:
213
- self._read_file_remotely(BytesIO(b"# Airbyte source connection test"), format_config.processing, FileType.MD, "auto")
220
+ self._read_file_remotely(
221
+ BytesIO(b"# Airbyte source connection test"),
222
+ format_config.processing,
223
+ FileType.MD,
224
+ "auto",
225
+ RemoteFile(uri="test", last_modified=datetime.now()),
226
+ )
214
227
  except Exception:
215
228
  return False, "".join(traceback.format_exc())
216
229
 
@@ -218,14 +231,16 @@ class UnstructuredParser(FileTypeParser):
218
231
 
219
232
  @backoff.on_exception(backoff.expo, requests.exceptions.RequestException, max_tries=5, giveup=user_error)
220
233
  def _read_file_remotely_with_retries(
221
- self, file_handle: IOBase, format: APIProcessingConfigModel, filetype: FileType, strategy: str
234
+ self, file_handle: IOBase, format: APIProcessingConfigModel, filetype: FileType, strategy: str, remote_file: RemoteFile
222
235
  ) -> str:
223
236
  """
224
237
  Read a file remotely, retrying up to 5 times if the error is not caused by user error. This is useful for transient network errors or the API server being overloaded temporarily.
225
238
  """
226
- return self._read_file_remotely(file_handle, format, filetype, strategy)
239
+ return self._read_file_remotely(file_handle, format, filetype, strategy, remote_file)
227
240
 
228
- def _read_file_remotely(self, file_handle: IOBase, format: APIProcessingConfigModel, filetype: FileType, strategy: str) -> str:
241
+ def _read_file_remotely(
242
+ self, file_handle: IOBase, format: APIProcessingConfigModel, filetype: FileType, strategy: str, remote_file: RemoteFile
243
+ ) -> str:
229
244
  headers = {"accept": "application/json", "unstructured-api-key": format.api_key}
230
245
 
231
246
  data = self._params_to_dict(format.parameters, strategy)
@@ -233,7 +248,13 @@ class UnstructuredParser(FileTypeParser):
233
248
  file_data = {"files": ("filename", file_handle, FILETYPE_TO_MIMETYPE[filetype])}
234
249
 
235
250
  response = requests.post(f"{format.api_url}/general/v0/general", headers=headers, data=data, files=file_data)
236
- response.raise_for_status()
251
+
252
+ if response.status_code == 422:
253
+ # 422 means the file couldn't be processed, but the API is working. Treat this as a parsing error (passing an error record to the destination).
254
+ raise self._create_parse_error(remote_file, response.json())
255
+ else:
256
+ # Other error statuses are raised as requests exceptions (retry everything except user errors)
257
+ response.raise_for_status()
237
258
 
238
259
  json_response = response.json()
239
260
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: airbyte-cdk
3
- Version: 0.57.1
3
+ Version: 0.57.3
4
4
  Summary: A framework for writing Airbyte Connectors.
5
5
  Home-page: https://github.com/airbytehq/airbyte
6
6
  Author: Airbyte
@@ -19,7 +19,7 @@ Classifier: Programming Language :: Python :: 3.8
19
19
  Requires-Python: >=3.8
20
20
  Description-Content-Type: text/markdown
21
21
  License-File: LICENSE.txt
22
- Requires-Dist: airbyte-protocol-models ==0.4.2
22
+ Requires-Dist: airbyte-protocol-models ==0.5.1
23
23
  Requires-Dist: backoff
24
24
  Requires-Dist: dpath ~=2.0.1
25
25
  Requires-Dist: isodate ~=0.6.1
@@ -15,11 +15,11 @@ airbyte_cdk/destinations/destination.py,sha256=_tIMnKcRQbtIsjVvNOVjfbIxgCNLuBXQw
15
15
  airbyte_cdk/destinations/vector_db_based/__init__.py,sha256=eAkzwTjBbXBhJ5GfPO5I53Zgpv5xQFLRQS8n4nuyPt0,1006
16
16
  airbyte_cdk/destinations/vector_db_based/config.py,sha256=ibGA5rQepeiscNTZC6GlvYaL_m3EhNGJ0FkegYo1CiU,12324
17
17
  airbyte_cdk/destinations/vector_db_based/document_processor.py,sha256=DjyegDH7jYh7N_1JiYSDaqc3OMEb4V5R_LtGxaGOhW4,9083
18
- airbyte_cdk/destinations/vector_db_based/embedder.py,sha256=SNNEePbKD_OlDCmT3ZvbbYGYc9K0sH-4eT1sR8cRZ90,11264
18
+ airbyte_cdk/destinations/vector_db_based/embedder.py,sha256=HxQCPwRpALmo5MvEhTuXdjinoBzlbNVvVunRw3EVgaE,11443
19
19
  airbyte_cdk/destinations/vector_db_based/indexer.py,sha256=beiSi2Uu67EoTr7yQSaCJFAh9RajHFGKA4PoTbpTOqM,3243
20
20
  airbyte_cdk/destinations/vector_db_based/test_utils.py,sha256=8d1Smk4jQRKtDfloXfEq12T-BU8ByyzzSBwAlchsU4A,1807
21
21
  airbyte_cdk/destinations/vector_db_based/utils.py,sha256=dKpjY0QQVr5wMe6XHE_XdeL-nNqAew5InCfxkbyyf5A,1073
22
- airbyte_cdk/destinations/vector_db_based/writer.py,sha256=xMZVoOYshPp1bHm_9lJ752sUPGPasMKy2H_9TGMghj0,4053
22
+ airbyte_cdk/destinations/vector_db_based/writer.py,sha256=2EOkNcOe9pKGz7DgC6iSHjWoxbYF0IZ7PcpsQYIOgUk,4394
23
23
  airbyte_cdk/models/__init__.py,sha256=Kg8YHBqUsNWHlAw-u3ZGdG4dxLh7qBlHhqMRfamNCRU,1708
24
24
  airbyte_cdk/models/airbyte_protocol.py,sha256=DoJvnmGM3xMAZFTwA6_RGMiKSFqfE3ib_Ru0KJ65Ag4,100
25
25
  airbyte_cdk/models/well_known_types.py,sha256=KKfNbow2gdLoC1Z4hcXy_JR8m_acsB2ol7gQuEgjobw,117
@@ -177,7 +177,7 @@ airbyte_cdk/sources/file_based/file_types/csv_parser.py,sha256=VS2Ld9rfm4tLkwNZ3
177
177
  airbyte_cdk/sources/file_based/file_types/file_type_parser.py,sha256=Gbn-8v1-jLhKpJXTNOOc5PZT1Jzah6G-INCZt4snLdQ,2819
178
178
  airbyte_cdk/sources/file_based/file_types/jsonl_parser.py,sha256=S7OtfRRvQ8P6YbZVdJ8h7mw1hnWQUVSHR9Jy12U1Yy0,5634
179
179
  airbyte_cdk/sources/file_based/file_types/parquet_parser.py,sha256=Jq_-WSbyueVwyLYrrGafXhvcA1LDOeps0A_uBhStOHI,9017
180
- airbyte_cdk/sources/file_based/file_types/unstructured_parser.py,sha256=rY_4XuZ2nyI4487Bp7qKGM-hHGiDNxEy7w3kVrdx5vQ,15663
180
+ airbyte_cdk/sources/file_based/file_types/unstructured_parser.py,sha256=vw9As28N7QPWkSPq0v-mHvnRtoiM51q8swpX4iG-1vI,16694
181
181
  airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py,sha256=sEVnRhZ8x9f7PNjo6lewxid9z0PI8eSj7gSoFC3MH1Y,527
182
182
  airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py,sha256=uwk6Ugf23xKG4PRPVVRVwpcNjTwPgxejl03vLSEzK0s,604
183
183
  airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py,sha256=ZeAa0z50ywMU2chNjQ7JpL4yePU1NajhBa8FS7rXLVo,1643
@@ -375,7 +375,7 @@ unit_tests/sources/file_based/file_types/test_avro_parser.py,sha256=LCoGa0fvOber
375
375
  unit_tests/sources/file_based/file_types/test_csv_parser.py,sha256=zgHjLfPASRwFxkubdRK0UkskGTOAdASpWHKucm0AmqM,22423
376
376
  unit_tests/sources/file_based/file_types/test_jsonl_parser.py,sha256=foTf9U9LyAS8OR0BonwNgFWPqTrmzFV2lpPUfRMrioE,6134
377
377
  unit_tests/sources/file_based/file_types/test_parquet_parser.py,sha256=J66wfbAaflSe5y3ixCZ4tLPEQdU62eYj-pNXycCtK0U,14159
378
- unit_tests/sources/file_based/file_types/test_unstructured_parser.py,sha256=ailTQJ3zWciQwNUsem6XdS4WETQW_OJrpXt6S78zb5Y,21153
378
+ unit_tests/sources/file_based/file_types/test_unstructured_parser.py,sha256=Ts-5Zzcq8ETwgb2aCXlk5EIZtQTM9_OOx1HL8TNk0IU,22454
379
379
  unit_tests/sources/file_based/scenarios/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
380
380
  unit_tests/sources/file_based/scenarios/avro_scenarios.py,sha256=oeQUmCV7d2aTShreYc-PvVb4cWqLSsVwHfg-lcKjzPs,30554
381
381
  unit_tests/sources/file_based/scenarios/check_scenarios.py,sha256=0xkt21ASTnTAMP0RYJEsF3yMGsNN7wWOoG_tmzL9PYw,6750
@@ -439,8 +439,8 @@ unit_tests/utils/test_schema_inferrer.py,sha256=Z2jHBZ540wnYkylIdV_2xr75Vtwlxuyg
439
439
  unit_tests/utils/test_secret_utils.py,sha256=XKe0f1RHYii8iwE6ATmBr5JGDI1pzzrnZUGdUSMJQP4,4886
440
440
  unit_tests/utils/test_stream_status_utils.py,sha256=Xr8MZ2HWgTVIyMbywDvuYkRaUF4RZLQOT8-JjvcfR24,2970
441
441
  unit_tests/utils/test_traced_exception.py,sha256=bDFP5zMBizFenz6V2WvEZTRCKGB5ijh3DBezjbfoYIs,4198
442
- airbyte_cdk-0.57.1.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
443
- airbyte_cdk-0.57.1.dist-info/METADATA,sha256=ZAdnO4gzEpVsZ46_Wc1I2bkqr8SFvWk30G9G4oOPOik,11983
444
- airbyte_cdk-0.57.1.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
445
- airbyte_cdk-0.57.1.dist-info/top_level.txt,sha256=edvsDKTnE6sD2wfCUaeTfKf5gQIL6CPVMwVL2sWZzqo,51
446
- airbyte_cdk-0.57.1.dist-info/RECORD,,
442
+ airbyte_cdk-0.57.3.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
443
+ airbyte_cdk-0.57.3.dist-info/METADATA,sha256=rPyT57ZcShy6cYBeEwA7cf35b8NVHwZL3eX-3X-K_B8,11983
444
+ airbyte_cdk-0.57.3.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
445
+ airbyte_cdk-0.57.3.dist-info/top_level.txt,sha256=edvsDKTnE6sD2wfCUaeTfKf5gQIL6CPVMwVL2sWZzqo,51
446
+ airbyte_cdk-0.57.3.dist-info/RECORD,,
@@ -315,7 +315,7 @@ def test_check_config(requests_mock, format_config, raises_for_status, json_resp
315
315
 
316
316
 
317
317
  @pytest.mark.parametrize(
318
- "filetype, format_config, raises_for_status, file_content, json_response, expected_requests, raises, expected_records",
318
+ "filetype, format_config, raises_for_status, file_content, json_response, expected_requests, raises, expected_records, http_status_code",
319
319
  [
320
320
  pytest.param(
321
321
  FileType.PDF,
@@ -332,6 +332,7 @@ def test_check_config(requests_mock, format_config, raises_for_status, json_resp
332
332
  "_ab_source_file_parse_error": None
333
333
  }
334
334
  ],
335
+ 200,
335
336
  id="basic_request",
336
337
  ),
337
338
  pytest.param(
@@ -349,6 +350,7 @@ def test_check_config(requests_mock, format_config, raises_for_status, json_resp
349
350
  "_ab_source_file_parse_error": None
350
351
  }
351
352
  ],
353
+ 200,
352
354
  id="request_with_params",
353
355
  ),
354
356
  pytest.param(
@@ -366,6 +368,7 @@ def test_check_config(requests_mock, format_config, raises_for_status, json_resp
366
368
  "_ab_source_file_parse_error": None
367
369
  }
368
370
  ],
371
+ 200,
369
372
  id="handle_markdown_locally",
370
373
  ),
371
374
  pytest.param(
@@ -394,6 +397,7 @@ def test_check_config(requests_mock, format_config, raises_for_status, json_resp
394
397
  ],
395
398
  True,
396
399
  None,
400
+ 200,
397
401
  id="retry_and_raise_on_api_error",
398
402
  ),
399
403
  pytest.param(
@@ -422,6 +426,7 @@ def test_check_config(requests_mock, format_config, raises_for_status, json_resp
422
426
  "_ab_source_file_parse_error": None
423
427
  }
424
428
  ],
429
+ 200,
425
430
  id="retry_and_recover",
426
431
  ),
427
432
  pytest.param(
@@ -438,6 +443,7 @@ def test_check_config(requests_mock, format_config, raises_for_status, json_resp
438
443
  ],
439
444
  True,
440
445
  None,
446
+ 200,
441
447
  id="no_retry_on_unexpected_error",
442
448
  ),
443
449
  pytest.param(
@@ -454,8 +460,29 @@ def test_check_config(requests_mock, format_config, raises_for_status, json_resp
454
460
  ],
455
461
  True,
456
462
  None,
463
+ 400,
457
464
  id="no_retry_on_400_error",
458
465
  ),
466
+ pytest.param(
467
+ FileType.PDF,
468
+ UnstructuredFormat(skip_unprocessable_file_types=False, processing=APIProcessingConfigModel(mode="api", api_key="test")),
469
+ None,
470
+ "test",
471
+ [{"detail": "Something went wrong"}],
472
+ [
473
+ call("https://api.unstructured.io/general/v0/general", headers={"accept": "application/json", "unstructured-api-key": "test"}, data={"strategy": "auto"}, files={"files": ("filename", mock.ANY, "application/pdf")}),
474
+ ],
475
+ False,
476
+ [
477
+ {
478
+ "content": None,
479
+ "document_key": FILE_URI,
480
+ "_ab_source_file_parse_error": "Error parsing record. This could be due to a mismatch between the config's file type and the actual file type, or because the file or record is not parseable. Contact Support if you need assistance.\nfilename=path/to/file.xyz message=[{'detail': 'Something went wrong'}]",
481
+ }
482
+ ],
483
+ 422,
484
+ id="error_record_on_422_error",
485
+ ),
459
486
  ],
460
487
  )
461
488
  @patch("airbyte_cdk.sources.file_based.file_types.unstructured_parser.requests")
@@ -473,6 +500,7 @@ def test_parse_records_remotely(
473
500
  expected_requests,
474
501
  raises,
475
502
  expected_records,
503
+ http_status_code
476
504
  ):
477
505
  stream_reader = MagicMock()
478
506
  mock_open(stream_reader.open_file, read_data=bytes(str(file_content), "utf-8"))
@@ -484,6 +512,7 @@ def test_parse_records_remotely(
484
512
  mock_detect_filetype.return_value = filetype
485
513
  mock_response = MagicMock()
486
514
  mock_response.json.return_value = json_response
515
+ mock_response.status_code = http_status_code
487
516
  if raises_for_status:
488
517
  mock_response.raise_for_status.side_effect = raises_for_status
489
518
  requests_mock.post.return_value = mock_response