airbyte-cdk 0.57.1__py3-none-any.whl → 0.57.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/destinations/vector_db_based/embedder.py +24 -17
- airbyte_cdk/destinations/vector_db_based/writer.py +24 -16
- airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +28 -7
- {airbyte_cdk-0.57.1.dist-info → airbyte_cdk-0.57.3.dist-info}/METADATA +2 -2
- {airbyte_cdk-0.57.1.dist-info → airbyte_cdk-0.57.3.dist-info}/RECORD +9 -9
- unit_tests/sources/file_based/file_types/test_unstructured_parser.py +30 -1
- {airbyte_cdk-0.57.1.dist-info → airbyte_cdk-0.57.3.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-0.57.1.dist-info → airbyte_cdk-0.57.3.dist-info}/WHEEL +0 -0
- {airbyte_cdk-0.57.1.dist-info → airbyte_cdk-0.57.3.dist-info}/top_level.txt +0 -0
@@ -4,6 +4,7 @@
|
|
4
4
|
|
5
5
|
import os
|
6
6
|
from abc import ABC, abstractmethod
|
7
|
+
from dataclasses import dataclass
|
7
8
|
from typing import List, Optional, Union, cast
|
8
9
|
|
9
10
|
from airbyte_cdk.destinations.vector_db_based.config import (
|
@@ -15,8 +16,8 @@ from airbyte_cdk.destinations.vector_db_based.config import (
|
|
15
16
|
OpenAIEmbeddingConfigModel,
|
16
17
|
ProcessingConfigModel,
|
17
18
|
)
|
18
|
-
from airbyte_cdk.destinations.vector_db_based.document_processor import Chunk
|
19
19
|
from airbyte_cdk.destinations.vector_db_based.utils import create_chunks, format_exception
|
20
|
+
from airbyte_cdk.models import AirbyteRecordMessage
|
20
21
|
from airbyte_cdk.utils.traced_exception import AirbyteTracedException, FailureType
|
21
22
|
from langchain.embeddings.cohere import CohereEmbeddings
|
22
23
|
from langchain.embeddings.fake import FakeEmbeddings
|
@@ -24,6 +25,12 @@ from langchain.embeddings.localai import LocalAIEmbeddings
|
|
24
25
|
from langchain.embeddings.openai import OpenAIEmbeddings
|
25
26
|
|
26
27
|
|
28
|
+
@dataclass
|
29
|
+
class Document:
|
30
|
+
page_content: str
|
31
|
+
record: AirbyteRecordMessage
|
32
|
+
|
33
|
+
|
27
34
|
class Embedder(ABC):
|
28
35
|
"""
|
29
36
|
Embedder is an abstract class that defines the interface for embedding text.
|
@@ -41,7 +48,7 @@ class Embedder(ABC):
|
|
41
48
|
pass
|
42
49
|
|
43
50
|
@abstractmethod
|
44
|
-
def
|
51
|
+
def embed_documents(self, documents: List[Document]) -> List[Optional[List[float]]]:
|
45
52
|
"""
|
46
53
|
Embed the text of each chunk and return the resulting embedding vectors.
|
47
54
|
If a chunk cannot be embedded or is configured to not be embedded, return None for that chunk.
|
@@ -72,7 +79,7 @@ class BaseOpenAIEmbedder(Embedder):
|
|
72
79
|
return format_exception(e)
|
73
80
|
return None
|
74
81
|
|
75
|
-
def
|
82
|
+
def embed_documents(self, documents: List[Document]) -> List[Optional[List[float]]]:
|
76
83
|
"""
|
77
84
|
Embed the text of each chunk and return the resulting embedding vectors.
|
78
85
|
|
@@ -80,9 +87,9 @@ class BaseOpenAIEmbedder(Embedder):
|
|
80
87
|
It's still possible to run into the rate limit between each embed call because the available token budget hasn't recovered between the calls,
|
81
88
|
but the built-in retry mechanism of the OpenAI client handles that.
|
82
89
|
"""
|
83
|
-
# Each chunk can hold at most self.chunk_size tokens, so tokens-per-minute by maximum tokens per chunk is the number of
|
90
|
+
# Each chunk can hold at most self.chunk_size tokens, so tokens-per-minute by maximum tokens per chunk is the number of documents that can be embedded at once without exhausting the limit in a single request
|
84
91
|
embedding_batch_size = OPEN_AI_TOKEN_LIMIT // self.chunk_size
|
85
|
-
batches = create_chunks(
|
92
|
+
batches = create_chunks(documents, batch_size=embedding_batch_size)
|
86
93
|
embeddings: List[Optional[List[float]]] = []
|
87
94
|
for batch in batches:
|
88
95
|
embeddings.extend(self.embeddings.embed_documents([chunk.page_content for chunk in batch]))
|
@@ -121,8 +128,8 @@ class CohereEmbedder(Embedder):
|
|
121
128
|
return format_exception(e)
|
122
129
|
return None
|
123
130
|
|
124
|
-
def
|
125
|
-
return cast(List[Optional[List[float]]], self.embeddings.embed_documents([
|
131
|
+
def embed_documents(self, documents: List[Document]) -> List[Optional[List[float]]]:
|
132
|
+
return cast(List[Optional[List[float]]], self.embeddings.embed_documents([document.page_content for document in documents]))
|
126
133
|
|
127
134
|
@property
|
128
135
|
def embedding_dimensions(self) -> int:
|
@@ -142,8 +149,8 @@ class FakeEmbedder(Embedder):
|
|
142
149
|
return format_exception(e)
|
143
150
|
return None
|
144
151
|
|
145
|
-
def
|
146
|
-
return cast(List[Optional[List[float]]], self.embeddings.embed_documents([
|
152
|
+
def embed_documents(self, documents: List[Document]) -> List[Optional[List[float]]]:
|
153
|
+
return cast(List[Optional[List[float]]], self.embeddings.embed_documents([document.page_content for document in documents]))
|
147
154
|
|
148
155
|
@property
|
149
156
|
def embedding_dimensions(self) -> int:
|
@@ -173,8 +180,8 @@ class OpenAICompatibleEmbedder(Embedder):
|
|
173
180
|
return format_exception(e)
|
174
181
|
return None
|
175
182
|
|
176
|
-
def
|
177
|
-
return cast(List[Optional[List[float]]], self.embeddings.embed_documents([
|
183
|
+
def embed_documents(self, documents: List[Document]) -> List[Optional[List[float]]]:
|
184
|
+
return cast(List[Optional[List[float]]], self.embeddings.embed_documents([document.page_content for document in documents]))
|
178
185
|
|
179
186
|
@property
|
180
187
|
def embedding_dimensions(self) -> int:
|
@@ -190,32 +197,32 @@ class FromFieldEmbedder(Embedder):
|
|
190
197
|
def check(self) -> Optional[str]:
|
191
198
|
return None
|
192
199
|
|
193
|
-
def
|
200
|
+
def embed_documents(self, documents: List[Document]) -> List[Optional[List[float]]]:
|
194
201
|
"""
|
195
202
|
From each chunk, pull the embedding from the field specified in the config.
|
196
203
|
Check that the field exists, is a list of numbers and is the correct size. If not, raise an AirbyteTracedException explaining the problem.
|
197
204
|
"""
|
198
205
|
embeddings: List[Optional[List[float]]] = []
|
199
|
-
for
|
200
|
-
data =
|
206
|
+
for document in documents:
|
207
|
+
data = document.record.data
|
201
208
|
if self.config.field_name not in data:
|
202
209
|
raise AirbyteTracedException(
|
203
210
|
internal_message="Embedding vector field not found",
|
204
211
|
failure_type=FailureType.config_error,
|
205
|
-
message=f"Record {str(data)[:250]}... in stream {
|
212
|
+
message=f"Record {str(data)[:250]}... in stream {document.record.stream} does not contain embedding vector field {self.config.field_name}. Please check your embedding configuration, the embedding vector field has to be set correctly on every record.",
|
206
213
|
)
|
207
214
|
field = data[self.config.field_name]
|
208
215
|
if not isinstance(field, list) or not all(isinstance(x, (int, float)) for x in field):
|
209
216
|
raise AirbyteTracedException(
|
210
217
|
internal_message="Embedding vector field not a list of numbers",
|
211
218
|
failure_type=FailureType.config_error,
|
212
|
-
message=f"Record {str(data)[:250]}... in stream {
|
219
|
+
message=f"Record {str(data)[:250]}... in stream {document.record.stream} does contain embedding vector field {self.config.field_name}, but it is not a list of numbers. Please check your embedding configuration, the embedding vector field has to be a list of numbers of length {self.config.dimensions} on every record.",
|
213
220
|
)
|
214
221
|
if len(field) != self.config.dimensions:
|
215
222
|
raise AirbyteTracedException(
|
216
223
|
internal_message="Embedding vector field has wrong length",
|
217
224
|
failure_type=FailureType.config_error,
|
218
|
-
message=f"Record {str(data)[:250]}... in stream {
|
225
|
+
message=f"Record {str(data)[:250]}... in stream {document.record.stream} does contain embedding vector field {self.config.field_name}, but it has length {len(field)} instead of the configured {self.config.dimensions}. Please check your embedding configuration, the embedding vector field has to be a list of numbers of length {self.config.dimensions} on every record.",
|
219
226
|
)
|
220
227
|
embeddings.append(field)
|
221
228
|
|
@@ -8,7 +8,7 @@ from typing import Dict, Iterable, List, Tuple
|
|
8
8
|
|
9
9
|
from airbyte_cdk.destinations.vector_db_based.config import ProcessingConfigModel
|
10
10
|
from airbyte_cdk.destinations.vector_db_based.document_processor import Chunk, DocumentProcessor
|
11
|
-
from airbyte_cdk.destinations.vector_db_based.embedder import Embedder
|
11
|
+
from airbyte_cdk.destinations.vector_db_based.embedder import Document, Embedder
|
12
12
|
from airbyte_cdk.destinations.vector_db_based.indexer import Indexer
|
13
13
|
from airbyte_cdk.models import AirbyteMessage, ConfiguredAirbyteCatalog, Type
|
14
14
|
|
@@ -16,14 +16,14 @@ from airbyte_cdk.models import AirbyteMessage, ConfiguredAirbyteCatalog, Type
|
|
16
16
|
class Writer:
|
17
17
|
"""
|
18
18
|
The Writer class is orchestrating the document processor, the embedder and the indexer:
|
19
|
-
* Incoming records are passed through the document processor to generate
|
20
|
-
* One the configured batch size is reached, the
|
21
|
-
* The embedder embeds the
|
22
|
-
* The indexer deletes old
|
19
|
+
* Incoming records are passed through the document processor to generate chunks
|
20
|
+
* Once the configured batch size is reached, the chunks are passed to the embedder to generate embeddings
|
21
|
+
* The embedder embeds the chunks
|
22
|
+
* The indexer deletes old chunks by the associated record id before indexing the new ones
|
23
23
|
|
24
24
|
The destination connector is responsible to create a writer instance and pass the input messages iterable to the write method.
|
25
25
|
The batch size can be configured by the destination connector to give the freedom of either letting the user configure it or hardcoding it to a sensible value depending on the destination.
|
26
|
-
The omit_raw_text parameter can be used to omit the raw text from the
|
26
|
+
The omit_raw_text parameter can be used to omit the raw text from the chunks. This can be useful if the raw text is very large and not needed for the destination.
|
27
27
|
"""
|
28
28
|
|
29
29
|
def __init__(
|
@@ -37,21 +37,29 @@ class Writer:
|
|
37
37
|
self._init_batch()
|
38
38
|
|
39
39
|
def _init_batch(self) -> None:
|
40
|
-
self.
|
40
|
+
self.chunks: Dict[Tuple[str, str], List[Chunk]] = defaultdict(list)
|
41
41
|
self.ids_to_delete: Dict[Tuple[str, str], List[str]] = defaultdict(list)
|
42
|
-
self.
|
42
|
+
self.number_of_chunks = 0
|
43
|
+
|
44
|
+
def _convert_to_document(self, chunk: Chunk) -> Document:
|
45
|
+
"""
|
46
|
+
Convert a chunk to a document for the embedder.
|
47
|
+
"""
|
48
|
+
if chunk.page_content is None:
|
49
|
+
raise ValueError("Cannot embed a chunk without page content")
|
50
|
+
return Document(page_content=chunk.page_content, record=chunk.record)
|
43
51
|
|
44
52
|
def _process_batch(self) -> None:
|
45
53
|
for (namespace, stream), ids in self.ids_to_delete.items():
|
46
54
|
self.indexer.delete(ids, namespace, stream)
|
47
55
|
|
48
|
-
for (namespace, stream),
|
49
|
-
embeddings = self.embedder.
|
50
|
-
for i, document in enumerate(
|
56
|
+
for (namespace, stream), chunks in self.chunks.items():
|
57
|
+
embeddings = self.embedder.embed_documents([self._convert_to_document(chunk) for chunk in chunks])
|
58
|
+
for i, document in enumerate(chunks):
|
51
59
|
document.embedding = embeddings[i]
|
52
60
|
if self.omit_raw_text:
|
53
61
|
document.page_content = None
|
54
|
-
self.indexer.index(
|
62
|
+
self.indexer.index(chunks, namespace, stream)
|
55
63
|
|
56
64
|
self._init_batch()
|
57
65
|
|
@@ -65,12 +73,12 @@ class Writer:
|
|
65
73
|
self._process_batch()
|
66
74
|
yield message
|
67
75
|
elif message.type == Type.RECORD:
|
68
|
-
|
69
|
-
self.
|
76
|
+
record_chunks, record_id_to_delete = self.processor.process(message.record)
|
77
|
+
self.chunks[(message.record.namespace, message.record.stream)].extend(record_chunks)
|
70
78
|
if record_id_to_delete is not None:
|
71
79
|
self.ids_to_delete[(message.record.namespace, message.record.stream)].append(record_id_to_delete)
|
72
|
-
self.
|
73
|
-
if self.
|
80
|
+
self.number_of_chunks += len(record_chunks)
|
81
|
+
if self.number_of_chunks >= self.batch_size:
|
74
82
|
self._process_batch()
|
75
83
|
|
76
84
|
self._process_batch()
|
@@ -3,6 +3,7 @@
|
|
3
3
|
#
|
4
4
|
import logging
|
5
5
|
import traceback
|
6
|
+
from datetime import datetime
|
6
7
|
from io import BytesIO, IOBase
|
7
8
|
from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple, Union
|
8
9
|
|
@@ -56,6 +57,8 @@ def user_error(e: Exception) -> bool:
|
|
56
57
|
"""
|
57
58
|
Return True if this exception is caused by user error, False otherwise.
|
58
59
|
"""
|
60
|
+
if not isinstance(e, RecordParseError):
|
61
|
+
return False
|
59
62
|
if not isinstance(e, requests.exceptions.RequestException):
|
60
63
|
return False
|
61
64
|
return bool(e.response and 400 <= e.response.status_code < 500)
|
@@ -164,10 +167,14 @@ class UnstructuredParser(FileTypeParser):
|
|
164
167
|
return self._read_file_locally(file_handle, filetype, format.strategy, remote_file)
|
165
168
|
elif format.processing.mode == "api":
|
166
169
|
try:
|
167
|
-
result: str = self._read_file_remotely_with_retries(file_handle, format.processing, filetype, format.strategy)
|
170
|
+
result: str = self._read_file_remotely_with_retries(file_handle, format.processing, filetype, format.strategy, remote_file)
|
168
171
|
except Exception as e:
|
169
|
-
#
|
172
|
+
# If a parser error happens while processing the file remotely, this means the file is corrupted. This case is handled by the parse_records method, so just rethrow.
|
173
|
+
#
|
174
|
+
# For other exceptions, re-throw as config error so the sync is stopped as problems with the external API need to be resolved by the user and are not considered part of the SLA.
|
170
175
|
# Once this parser leaves experimental stage, we should consider making this a system error instead for issues that might be transient.
|
176
|
+
if isinstance(e, RecordParseError):
|
177
|
+
raise e
|
171
178
|
raise AirbyteTracedException.from_exception(e, failure_type=FailureType.config_error)
|
172
179
|
|
173
180
|
return result
|
@@ -210,7 +217,13 @@ class UnstructuredParser(FileTypeParser):
|
|
210
217
|
return False, "Base URL must start with https://"
|
211
218
|
|
212
219
|
try:
|
213
|
-
self._read_file_remotely(
|
220
|
+
self._read_file_remotely(
|
221
|
+
BytesIO(b"# Airbyte source connection test"),
|
222
|
+
format_config.processing,
|
223
|
+
FileType.MD,
|
224
|
+
"auto",
|
225
|
+
RemoteFile(uri="test", last_modified=datetime.now()),
|
226
|
+
)
|
214
227
|
except Exception:
|
215
228
|
return False, "".join(traceback.format_exc())
|
216
229
|
|
@@ -218,14 +231,16 @@ class UnstructuredParser(FileTypeParser):
|
|
218
231
|
|
219
232
|
@backoff.on_exception(backoff.expo, requests.exceptions.RequestException, max_tries=5, giveup=user_error)
|
220
233
|
def _read_file_remotely_with_retries(
|
221
|
-
self, file_handle: IOBase, format: APIProcessingConfigModel, filetype: FileType, strategy: str
|
234
|
+
self, file_handle: IOBase, format: APIProcessingConfigModel, filetype: FileType, strategy: str, remote_file: RemoteFile
|
222
235
|
) -> str:
|
223
236
|
"""
|
224
237
|
Read a file remotely, retrying up to 5 times if the error is not caused by user error. This is useful for transient network errors or the API server being overloaded temporarily.
|
225
238
|
"""
|
226
|
-
return self._read_file_remotely(file_handle, format, filetype, strategy)
|
239
|
+
return self._read_file_remotely(file_handle, format, filetype, strategy, remote_file)
|
227
240
|
|
228
|
-
def _read_file_remotely(
|
241
|
+
def _read_file_remotely(
|
242
|
+
self, file_handle: IOBase, format: APIProcessingConfigModel, filetype: FileType, strategy: str, remote_file: RemoteFile
|
243
|
+
) -> str:
|
229
244
|
headers = {"accept": "application/json", "unstructured-api-key": format.api_key}
|
230
245
|
|
231
246
|
data = self._params_to_dict(format.parameters, strategy)
|
@@ -233,7 +248,13 @@ class UnstructuredParser(FileTypeParser):
|
|
233
248
|
file_data = {"files": ("filename", file_handle, FILETYPE_TO_MIMETYPE[filetype])}
|
234
249
|
|
235
250
|
response = requests.post(f"{format.api_url}/general/v0/general", headers=headers, data=data, files=file_data)
|
236
|
-
|
251
|
+
|
252
|
+
if response.status_code == 422:
|
253
|
+
# 422 means the file couldn't be processed, but the API is working. Treat this as a parsing error (passing an error record to the destination).
|
254
|
+
raise self._create_parse_error(remote_file, response.json())
|
255
|
+
else:
|
256
|
+
# Other error statuses are raised as requests exceptions (retry everything except user errors)
|
257
|
+
response.raise_for_status()
|
237
258
|
|
238
259
|
json_response = response.json()
|
239
260
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: airbyte-cdk
|
3
|
-
Version: 0.57.1
|
3
|
+
Version: 0.57.3
|
4
4
|
Summary: A framework for writing Airbyte Connectors.
|
5
5
|
Home-page: https://github.com/airbytehq/airbyte
|
6
6
|
Author: Airbyte
|
@@ -19,7 +19,7 @@ Classifier: Programming Language :: Python :: 3.8
|
|
19
19
|
Requires-Python: >=3.8
|
20
20
|
Description-Content-Type: text/markdown
|
21
21
|
License-File: LICENSE.txt
|
22
|
-
Requires-Dist: airbyte-protocol-models ==0.
|
22
|
+
Requires-Dist: airbyte-protocol-models ==0.5.1
|
23
23
|
Requires-Dist: backoff
|
24
24
|
Requires-Dist: dpath ~=2.0.1
|
25
25
|
Requires-Dist: isodate ~=0.6.1
|
@@ -15,11 +15,11 @@ airbyte_cdk/destinations/destination.py,sha256=_tIMnKcRQbtIsjVvNOVjfbIxgCNLuBXQw
|
|
15
15
|
airbyte_cdk/destinations/vector_db_based/__init__.py,sha256=eAkzwTjBbXBhJ5GfPO5I53Zgpv5xQFLRQS8n4nuyPt0,1006
|
16
16
|
airbyte_cdk/destinations/vector_db_based/config.py,sha256=ibGA5rQepeiscNTZC6GlvYaL_m3EhNGJ0FkegYo1CiU,12324
|
17
17
|
airbyte_cdk/destinations/vector_db_based/document_processor.py,sha256=DjyegDH7jYh7N_1JiYSDaqc3OMEb4V5R_LtGxaGOhW4,9083
|
18
|
-
airbyte_cdk/destinations/vector_db_based/embedder.py,sha256=
|
18
|
+
airbyte_cdk/destinations/vector_db_based/embedder.py,sha256=HxQCPwRpALmo5MvEhTuXdjinoBzlbNVvVunRw3EVgaE,11443
|
19
19
|
airbyte_cdk/destinations/vector_db_based/indexer.py,sha256=beiSi2Uu67EoTr7yQSaCJFAh9RajHFGKA4PoTbpTOqM,3243
|
20
20
|
airbyte_cdk/destinations/vector_db_based/test_utils.py,sha256=8d1Smk4jQRKtDfloXfEq12T-BU8ByyzzSBwAlchsU4A,1807
|
21
21
|
airbyte_cdk/destinations/vector_db_based/utils.py,sha256=dKpjY0QQVr5wMe6XHE_XdeL-nNqAew5InCfxkbyyf5A,1073
|
22
|
-
airbyte_cdk/destinations/vector_db_based/writer.py,sha256=
|
22
|
+
airbyte_cdk/destinations/vector_db_based/writer.py,sha256=2EOkNcOe9pKGz7DgC6iSHjWoxbYF0IZ7PcpsQYIOgUk,4394
|
23
23
|
airbyte_cdk/models/__init__.py,sha256=Kg8YHBqUsNWHlAw-u3ZGdG4dxLh7qBlHhqMRfamNCRU,1708
|
24
24
|
airbyte_cdk/models/airbyte_protocol.py,sha256=DoJvnmGM3xMAZFTwA6_RGMiKSFqfE3ib_Ru0KJ65Ag4,100
|
25
25
|
airbyte_cdk/models/well_known_types.py,sha256=KKfNbow2gdLoC1Z4hcXy_JR8m_acsB2ol7gQuEgjobw,117
|
@@ -177,7 +177,7 @@ airbyte_cdk/sources/file_based/file_types/csv_parser.py,sha256=VS2Ld9rfm4tLkwNZ3
|
|
177
177
|
airbyte_cdk/sources/file_based/file_types/file_type_parser.py,sha256=Gbn-8v1-jLhKpJXTNOOc5PZT1Jzah6G-INCZt4snLdQ,2819
|
178
178
|
airbyte_cdk/sources/file_based/file_types/jsonl_parser.py,sha256=S7OtfRRvQ8P6YbZVdJ8h7mw1hnWQUVSHR9Jy12U1Yy0,5634
|
179
179
|
airbyte_cdk/sources/file_based/file_types/parquet_parser.py,sha256=Jq_-WSbyueVwyLYrrGafXhvcA1LDOeps0A_uBhStOHI,9017
|
180
|
-
airbyte_cdk/sources/file_based/file_types/unstructured_parser.py,sha256=
|
180
|
+
airbyte_cdk/sources/file_based/file_types/unstructured_parser.py,sha256=vw9As28N7QPWkSPq0v-mHvnRtoiM51q8swpX4iG-1vI,16694
|
181
181
|
airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py,sha256=sEVnRhZ8x9f7PNjo6lewxid9z0PI8eSj7gSoFC3MH1Y,527
|
182
182
|
airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py,sha256=uwk6Ugf23xKG4PRPVVRVwpcNjTwPgxejl03vLSEzK0s,604
|
183
183
|
airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py,sha256=ZeAa0z50ywMU2chNjQ7JpL4yePU1NajhBa8FS7rXLVo,1643
|
@@ -375,7 +375,7 @@ unit_tests/sources/file_based/file_types/test_avro_parser.py,sha256=LCoGa0fvOber
|
|
375
375
|
unit_tests/sources/file_based/file_types/test_csv_parser.py,sha256=zgHjLfPASRwFxkubdRK0UkskGTOAdASpWHKucm0AmqM,22423
|
376
376
|
unit_tests/sources/file_based/file_types/test_jsonl_parser.py,sha256=foTf9U9LyAS8OR0BonwNgFWPqTrmzFV2lpPUfRMrioE,6134
|
377
377
|
unit_tests/sources/file_based/file_types/test_parquet_parser.py,sha256=J66wfbAaflSe5y3ixCZ4tLPEQdU62eYj-pNXycCtK0U,14159
|
378
|
-
unit_tests/sources/file_based/file_types/test_unstructured_parser.py,sha256=
|
378
|
+
unit_tests/sources/file_based/file_types/test_unstructured_parser.py,sha256=Ts-5Zzcq8ETwgb2aCXlk5EIZtQTM9_OOx1HL8TNk0IU,22454
|
379
379
|
unit_tests/sources/file_based/scenarios/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
380
380
|
unit_tests/sources/file_based/scenarios/avro_scenarios.py,sha256=oeQUmCV7d2aTShreYc-PvVb4cWqLSsVwHfg-lcKjzPs,30554
|
381
381
|
unit_tests/sources/file_based/scenarios/check_scenarios.py,sha256=0xkt21ASTnTAMP0RYJEsF3yMGsNN7wWOoG_tmzL9PYw,6750
|
@@ -439,8 +439,8 @@ unit_tests/utils/test_schema_inferrer.py,sha256=Z2jHBZ540wnYkylIdV_2xr75Vtwlxuyg
|
|
439
439
|
unit_tests/utils/test_secret_utils.py,sha256=XKe0f1RHYii8iwE6ATmBr5JGDI1pzzrnZUGdUSMJQP4,4886
|
440
440
|
unit_tests/utils/test_stream_status_utils.py,sha256=Xr8MZ2HWgTVIyMbywDvuYkRaUF4RZLQOT8-JjvcfR24,2970
|
441
441
|
unit_tests/utils/test_traced_exception.py,sha256=bDFP5zMBizFenz6V2WvEZTRCKGB5ijh3DBezjbfoYIs,4198
|
442
|
-
airbyte_cdk-0.57.
|
443
|
-
airbyte_cdk-0.57.
|
444
|
-
airbyte_cdk-0.57.
|
445
|
-
airbyte_cdk-0.57.
|
446
|
-
airbyte_cdk-0.57.
|
442
|
+
airbyte_cdk-0.57.3.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
|
443
|
+
airbyte_cdk-0.57.3.dist-info/METADATA,sha256=rPyT57ZcShy6cYBeEwA7cf35b8NVHwZL3eX-3X-K_B8,11983
|
444
|
+
airbyte_cdk-0.57.3.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
|
445
|
+
airbyte_cdk-0.57.3.dist-info/top_level.txt,sha256=edvsDKTnE6sD2wfCUaeTfKf5gQIL6CPVMwVL2sWZzqo,51
|
446
|
+
airbyte_cdk-0.57.3.dist-info/RECORD,,
|
@@ -315,7 +315,7 @@ def test_check_config(requests_mock, format_config, raises_for_status, json_resp
|
|
315
315
|
|
316
316
|
|
317
317
|
@pytest.mark.parametrize(
|
318
|
-
"filetype, format_config, raises_for_status, file_content, json_response, expected_requests, raises, expected_records",
|
318
|
+
"filetype, format_config, raises_for_status, file_content, json_response, expected_requests, raises, expected_records, http_status_code",
|
319
319
|
[
|
320
320
|
pytest.param(
|
321
321
|
FileType.PDF,
|
@@ -332,6 +332,7 @@ def test_check_config(requests_mock, format_config, raises_for_status, json_resp
|
|
332
332
|
"_ab_source_file_parse_error": None
|
333
333
|
}
|
334
334
|
],
|
335
|
+
200,
|
335
336
|
id="basic_request",
|
336
337
|
),
|
337
338
|
pytest.param(
|
@@ -349,6 +350,7 @@ def test_check_config(requests_mock, format_config, raises_for_status, json_resp
|
|
349
350
|
"_ab_source_file_parse_error": None
|
350
351
|
}
|
351
352
|
],
|
353
|
+
200,
|
352
354
|
id="request_with_params",
|
353
355
|
),
|
354
356
|
pytest.param(
|
@@ -366,6 +368,7 @@ def test_check_config(requests_mock, format_config, raises_for_status, json_resp
|
|
366
368
|
"_ab_source_file_parse_error": None
|
367
369
|
}
|
368
370
|
],
|
371
|
+
200,
|
369
372
|
id="handle_markdown_locally",
|
370
373
|
),
|
371
374
|
pytest.param(
|
@@ -394,6 +397,7 @@ def test_check_config(requests_mock, format_config, raises_for_status, json_resp
|
|
394
397
|
],
|
395
398
|
True,
|
396
399
|
None,
|
400
|
+
200,
|
397
401
|
id="retry_and_raise_on_api_error",
|
398
402
|
),
|
399
403
|
pytest.param(
|
@@ -422,6 +426,7 @@ def test_check_config(requests_mock, format_config, raises_for_status, json_resp
|
|
422
426
|
"_ab_source_file_parse_error": None
|
423
427
|
}
|
424
428
|
],
|
429
|
+
200,
|
425
430
|
id="retry_and_recover",
|
426
431
|
),
|
427
432
|
pytest.param(
|
@@ -438,6 +443,7 @@ def test_check_config(requests_mock, format_config, raises_for_status, json_resp
|
|
438
443
|
],
|
439
444
|
True,
|
440
445
|
None,
|
446
|
+
200,
|
441
447
|
id="no_retry_on_unexpected_error",
|
442
448
|
),
|
443
449
|
pytest.param(
|
@@ -454,8 +460,29 @@ def test_check_config(requests_mock, format_config, raises_for_status, json_resp
|
|
454
460
|
],
|
455
461
|
True,
|
456
462
|
None,
|
463
|
+
400,
|
457
464
|
id="no_retry_on_400_error",
|
458
465
|
),
|
466
|
+
pytest.param(
|
467
|
+
FileType.PDF,
|
468
|
+
UnstructuredFormat(skip_unprocessable_file_types=False, processing=APIProcessingConfigModel(mode="api", api_key="test")),
|
469
|
+
None,
|
470
|
+
"test",
|
471
|
+
[{"detail": "Something went wrong"}],
|
472
|
+
[
|
473
|
+
call("https://api.unstructured.io/general/v0/general", headers={"accept": "application/json", "unstructured-api-key": "test"}, data={"strategy": "auto"}, files={"files": ("filename", mock.ANY, "application/pdf")}),
|
474
|
+
],
|
475
|
+
False,
|
476
|
+
[
|
477
|
+
{
|
478
|
+
"content": None,
|
479
|
+
"document_key": FILE_URI,
|
480
|
+
"_ab_source_file_parse_error": "Error parsing record. This could be due to a mismatch between the config's file type and the actual file type, or because the file or record is not parseable. Contact Support if you need assistance.\nfilename=path/to/file.xyz message=[{'detail': 'Something went wrong'}]",
|
481
|
+
}
|
482
|
+
],
|
483
|
+
422,
|
484
|
+
id="error_record_on_422_error",
|
485
|
+
),
|
459
486
|
],
|
460
487
|
)
|
461
488
|
@patch("airbyte_cdk.sources.file_based.file_types.unstructured_parser.requests")
|
@@ -473,6 +500,7 @@ def test_parse_records_remotely(
|
|
473
500
|
expected_requests,
|
474
501
|
raises,
|
475
502
|
expected_records,
|
503
|
+
http_status_code
|
476
504
|
):
|
477
505
|
stream_reader = MagicMock()
|
478
506
|
mock_open(stream_reader.open_file, read_data=bytes(str(file_content), "utf-8"))
|
@@ -484,6 +512,7 @@ def test_parse_records_remotely(
|
|
484
512
|
mock_detect_filetype.return_value = filetype
|
485
513
|
mock_response = MagicMock()
|
486
514
|
mock_response.json.return_value = json_response
|
515
|
+
mock_response.status_code = http_status_code
|
487
516
|
if raises_for_status:
|
488
517
|
mock_response.raise_for_status.side_effect = raises_for_status
|
489
518
|
requests_mock.post.return_value = mock_response
|
File without changes
|
File without changes
|
File without changes
|