airbyte-cdk 0.57.2__py3-none-any.whl → 0.57.3__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- airbyte_cdk/destinations/vector_db_based/embedder.py +24 -17
- airbyte_cdk/destinations/vector_db_based/writer.py +24 -16
- airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +28 -7
- {airbyte_cdk-0.57.2.dist-info → airbyte_cdk-0.57.3.dist-info}/METADATA +1 -1
- {airbyte_cdk-0.57.2.dist-info → airbyte_cdk-0.57.3.dist-info}/RECORD +9 -9
- unit_tests/sources/file_based/file_types/test_unstructured_parser.py +30 -1
- {airbyte_cdk-0.57.2.dist-info → airbyte_cdk-0.57.3.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-0.57.2.dist-info → airbyte_cdk-0.57.3.dist-info}/WHEEL +0 -0
- {airbyte_cdk-0.57.2.dist-info → airbyte_cdk-0.57.3.dist-info}/top_level.txt +0 -0
@@ -4,6 +4,7 @@
|
|
4
4
|
|
5
5
|
import os
|
6
6
|
from abc import ABC, abstractmethod
|
7
|
+
from dataclasses import dataclass
|
7
8
|
from typing import List, Optional, Union, cast
|
8
9
|
|
9
10
|
from airbyte_cdk.destinations.vector_db_based.config import (
|
@@ -15,8 +16,8 @@ from airbyte_cdk.destinations.vector_db_based.config import (
|
|
15
16
|
OpenAIEmbeddingConfigModel,
|
16
17
|
ProcessingConfigModel,
|
17
18
|
)
|
18
|
-
from airbyte_cdk.destinations.vector_db_based.document_processor import Chunk
|
19
19
|
from airbyte_cdk.destinations.vector_db_based.utils import create_chunks, format_exception
|
20
|
+
from airbyte_cdk.models import AirbyteRecordMessage
|
20
21
|
from airbyte_cdk.utils.traced_exception import AirbyteTracedException, FailureType
|
21
22
|
from langchain.embeddings.cohere import CohereEmbeddings
|
22
23
|
from langchain.embeddings.fake import FakeEmbeddings
|
@@ -24,6 +25,12 @@ from langchain.embeddings.localai import LocalAIEmbeddings
|
|
24
25
|
from langchain.embeddings.openai import OpenAIEmbeddings
|
25
26
|
|
26
27
|
|
28
|
+
@dataclass
|
29
|
+
class Document:
|
30
|
+
page_content: str
|
31
|
+
record: AirbyteRecordMessage
|
32
|
+
|
33
|
+
|
27
34
|
class Embedder(ABC):
|
28
35
|
"""
|
29
36
|
Embedder is an abstract class that defines the interface for embedding text.
|
@@ -41,7 +48,7 @@ class Embedder(ABC):
|
|
41
48
|
pass
|
42
49
|
|
43
50
|
@abstractmethod
|
44
|
-
def
|
51
|
+
def embed_documents(self, documents: List[Document]) -> List[Optional[List[float]]]:
|
45
52
|
"""
|
46
53
|
Embed the text of each chunk and return the resulting embedding vectors.
|
47
54
|
If a chunk cannot be embedded or is configured to not be embedded, return None for that chunk.
|
@@ -72,7 +79,7 @@ class BaseOpenAIEmbedder(Embedder):
|
|
72
79
|
return format_exception(e)
|
73
80
|
return None
|
74
81
|
|
75
|
-
def
|
82
|
+
def embed_documents(self, documents: List[Document]) -> List[Optional[List[float]]]:
|
76
83
|
"""
|
77
84
|
Embed the text of each chunk and return the resulting embedding vectors.
|
78
85
|
|
@@ -80,9 +87,9 @@ class BaseOpenAIEmbedder(Embedder):
|
|
80
87
|
It's still possible to run into the rate limit between each embed call because the available token budget hasn't recovered between the calls,
|
81
88
|
but the built-in retry mechanism of the OpenAI client handles that.
|
82
89
|
"""
|
83
|
-
# Each chunk can hold at most self.chunk_size tokens, so tokens-per-minute by maximum tokens per chunk is the number of
|
90
|
+
# Each chunk can hold at most self.chunk_size tokens, so tokens-per-minute divided by the maximum tokens per chunk is the number of documents that can be embedded at once without exhausting the limit in a single request
|
84
91
|
embedding_batch_size = OPEN_AI_TOKEN_LIMIT // self.chunk_size
|
85
|
-
batches = create_chunks(
|
92
|
+
batches = create_chunks(documents, batch_size=embedding_batch_size)
|
86
93
|
embeddings: List[Optional[List[float]]] = []
|
87
94
|
for batch in batches:
|
88
95
|
embeddings.extend(self.embeddings.embed_documents([chunk.page_content for chunk in batch]))
|
@@ -121,8 +128,8 @@ class CohereEmbedder(Embedder):
|
|
121
128
|
return format_exception(e)
|
122
129
|
return None
|
123
130
|
|
124
|
-
def
|
125
|
-
return cast(List[Optional[List[float]]], self.embeddings.embed_documents([
|
131
|
+
def embed_documents(self, documents: List[Document]) -> List[Optional[List[float]]]:
|
132
|
+
return cast(List[Optional[List[float]]], self.embeddings.embed_documents([document.page_content for document in documents]))
|
126
133
|
|
127
134
|
@property
|
128
135
|
def embedding_dimensions(self) -> int:
|
@@ -142,8 +149,8 @@ class FakeEmbedder(Embedder):
|
|
142
149
|
return format_exception(e)
|
143
150
|
return None
|
144
151
|
|
145
|
-
def
|
146
|
-
return cast(List[Optional[List[float]]], self.embeddings.embed_documents([
|
152
|
+
def embed_documents(self, documents: List[Document]) -> List[Optional[List[float]]]:
|
153
|
+
return cast(List[Optional[List[float]]], self.embeddings.embed_documents([document.page_content for document in documents]))
|
147
154
|
|
148
155
|
@property
|
149
156
|
def embedding_dimensions(self) -> int:
|
@@ -173,8 +180,8 @@ class OpenAICompatibleEmbedder(Embedder):
|
|
173
180
|
return format_exception(e)
|
174
181
|
return None
|
175
182
|
|
176
|
-
def
|
177
|
-
return cast(List[Optional[List[float]]], self.embeddings.embed_documents([
|
183
|
+
def embed_documents(self, documents: List[Document]) -> List[Optional[List[float]]]:
|
184
|
+
return cast(List[Optional[List[float]]], self.embeddings.embed_documents([document.page_content for document in documents]))
|
178
185
|
|
179
186
|
@property
|
180
187
|
def embedding_dimensions(self) -> int:
|
@@ -190,32 +197,32 @@ class FromFieldEmbedder(Embedder):
|
|
190
197
|
def check(self) -> Optional[str]:
|
191
198
|
return None
|
192
199
|
|
193
|
-
def
|
200
|
+
def embed_documents(self, documents: List[Document]) -> List[Optional[List[float]]]:
|
194
201
|
"""
|
195
202
|
From each chunk, pull the embedding from the field specified in the config.
|
196
203
|
Check that the field exists, is a list of numbers and is the correct size. If not, raise an AirbyteTracedException explaining the problem.
|
197
204
|
"""
|
198
205
|
embeddings: List[Optional[List[float]]] = []
|
199
|
-
for
|
200
|
-
data =
|
206
|
+
for document in documents:
|
207
|
+
data = document.record.data
|
201
208
|
if self.config.field_name not in data:
|
202
209
|
raise AirbyteTracedException(
|
203
210
|
internal_message="Embedding vector field not found",
|
204
211
|
failure_type=FailureType.config_error,
|
205
|
-
message=f"Record {str(data)[:250]}... in stream {
|
212
|
+
message=f"Record {str(data)[:250]}... in stream {document.record.stream} does not contain embedding vector field {self.config.field_name}. Please check your embedding configuration, the embedding vector field has to be set correctly on every record.",
|
206
213
|
)
|
207
214
|
field = data[self.config.field_name]
|
208
215
|
if not isinstance(field, list) or not all(isinstance(x, (int, float)) for x in field):
|
209
216
|
raise AirbyteTracedException(
|
210
217
|
internal_message="Embedding vector field not a list of numbers",
|
211
218
|
failure_type=FailureType.config_error,
|
212
|
-
message=f"Record {str(data)[:250]}... in stream {
|
219
|
+
message=f"Record {str(data)[:250]}... in stream {document.record.stream} does contain embedding vector field {self.config.field_name}, but it is not a list of numbers. Please check your embedding configuration, the embedding vector field has to be a list of numbers of length {self.config.dimensions} on every record.",
|
213
220
|
)
|
214
221
|
if len(field) != self.config.dimensions:
|
215
222
|
raise AirbyteTracedException(
|
216
223
|
internal_message="Embedding vector field has wrong length",
|
217
224
|
failure_type=FailureType.config_error,
|
218
|
-
message=f"Record {str(data)[:250]}... in stream {
|
225
|
+
message=f"Record {str(data)[:250]}... in stream {document.record.stream} does contain embedding vector field {self.config.field_name}, but it has length {len(field)} instead of the configured {self.config.dimensions}. Please check your embedding configuration, the embedding vector field has to be a list of numbers of length {self.config.dimensions} on every record.",
|
219
226
|
)
|
220
227
|
embeddings.append(field)
|
221
228
|
|
@@ -8,7 +8,7 @@ from typing import Dict, Iterable, List, Tuple
|
|
8
8
|
|
9
9
|
from airbyte_cdk.destinations.vector_db_based.config import ProcessingConfigModel
|
10
10
|
from airbyte_cdk.destinations.vector_db_based.document_processor import Chunk, DocumentProcessor
|
11
|
-
from airbyte_cdk.destinations.vector_db_based.embedder import Embedder
|
11
|
+
from airbyte_cdk.destinations.vector_db_based.embedder import Document, Embedder
|
12
12
|
from airbyte_cdk.destinations.vector_db_based.indexer import Indexer
|
13
13
|
from airbyte_cdk.models import AirbyteMessage, ConfiguredAirbyteCatalog, Type
|
14
14
|
|
@@ -16,14 +16,14 @@ from airbyte_cdk.models import AirbyteMessage, ConfiguredAirbyteCatalog, Type
|
|
16
16
|
class Writer:
|
17
17
|
"""
|
18
18
|
The Writer class is orchestrating the document processor, the embedder and the indexer:
|
19
|
-
* Incoming records are passed through the document processor to generate
|
20
|
-
* One the configured batch size is reached, the
|
21
|
-
* The embedder embeds the
|
22
|
-
* The indexer deletes old
|
19
|
+
* Incoming records are passed through the document processor to generate chunks
|
20
|
+
* Once the configured batch size is reached, the chunks are passed to the embedder to generate embeddings
|
21
|
+
* The embedder embeds the chunks
|
22
|
+
* The indexer deletes old chunks by the associated record id before indexing the new ones
|
23
23
|
|
24
24
|
The destination connector is responsible for creating a writer instance and passing the input messages iterable to the write method.
|
25
25
|
The batch size can be configured by the destination connector to give the freedom of either letting the user configure it or hardcoding it to a sensible value depending on the destination.
|
26
|
-
The omit_raw_text parameter can be used to omit the raw text from the
|
26
|
+
The omit_raw_text parameter can be used to omit the raw text from the chunks. This can be useful if the raw text is very large and not needed for the destination.
|
27
27
|
"""
|
28
28
|
|
29
29
|
def __init__(
|
@@ -37,21 +37,29 @@ class Writer:
|
|
37
37
|
self._init_batch()
|
38
38
|
|
39
39
|
def _init_batch(self) -> None:
|
40
|
-
self.
|
40
|
+
self.chunks: Dict[Tuple[str, str], List[Chunk]] = defaultdict(list)
|
41
41
|
self.ids_to_delete: Dict[Tuple[str, str], List[str]] = defaultdict(list)
|
42
|
-
self.
|
42
|
+
self.number_of_chunks = 0
|
43
|
+
|
44
|
+
def _convert_to_document(self, chunk: Chunk) -> Document:
|
45
|
+
"""
|
46
|
+
Convert a chunk to a document for the embedder.
|
47
|
+
"""
|
48
|
+
if chunk.page_content is None:
|
49
|
+
raise ValueError("Cannot embed a chunk without page content")
|
50
|
+
return Document(page_content=chunk.page_content, record=chunk.record)
|
43
51
|
|
44
52
|
def _process_batch(self) -> None:
|
45
53
|
for (namespace, stream), ids in self.ids_to_delete.items():
|
46
54
|
self.indexer.delete(ids, namespace, stream)
|
47
55
|
|
48
|
-
for (namespace, stream),
|
49
|
-
embeddings = self.embedder.
|
50
|
-
for i, document in enumerate(
|
56
|
+
for (namespace, stream), chunks in self.chunks.items():
|
57
|
+
embeddings = self.embedder.embed_documents([self._convert_to_document(chunk) for chunk in chunks])
|
58
|
+
for i, document in enumerate(chunks):
|
51
59
|
document.embedding = embeddings[i]
|
52
60
|
if self.omit_raw_text:
|
53
61
|
document.page_content = None
|
54
|
-
self.indexer.index(
|
62
|
+
self.indexer.index(chunks, namespace, stream)
|
55
63
|
|
56
64
|
self._init_batch()
|
57
65
|
|
@@ -65,12 +73,12 @@ class Writer:
|
|
65
73
|
self._process_batch()
|
66
74
|
yield message
|
67
75
|
elif message.type == Type.RECORD:
|
68
|
-
|
69
|
-
self.
|
76
|
+
record_chunks, record_id_to_delete = self.processor.process(message.record)
|
77
|
+
self.chunks[(message.record.namespace, message.record.stream)].extend(record_chunks)
|
70
78
|
if record_id_to_delete is not None:
|
71
79
|
self.ids_to_delete[(message.record.namespace, message.record.stream)].append(record_id_to_delete)
|
72
|
-
self.
|
73
|
-
if self.
|
80
|
+
self.number_of_chunks += len(record_chunks)
|
81
|
+
if self.number_of_chunks >= self.batch_size:
|
74
82
|
self._process_batch()
|
75
83
|
|
76
84
|
self._process_batch()
|
@@ -3,6 +3,7 @@
|
|
3
3
|
#
|
4
4
|
import logging
|
5
5
|
import traceback
|
6
|
+
from datetime import datetime
|
6
7
|
from io import BytesIO, IOBase
|
7
8
|
from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple, Union
|
8
9
|
|
@@ -56,6 +57,8 @@ def user_error(e: Exception) -> bool:
|
|
56
57
|
"""
|
57
58
|
Return True if this exception is caused by user error, False otherwise.
|
58
59
|
"""
|
60
|
+
if not isinstance(e, RecordParseError):
|
61
|
+
return False
|
59
62
|
if not isinstance(e, requests.exceptions.RequestException):
|
60
63
|
return False
|
61
64
|
return bool(e.response and 400 <= e.response.status_code < 500)
|
@@ -164,10 +167,14 @@ class UnstructuredParser(FileTypeParser):
|
|
164
167
|
return self._read_file_locally(file_handle, filetype, format.strategy, remote_file)
|
165
168
|
elif format.processing.mode == "api":
|
166
169
|
try:
|
167
|
-
result: str = self._read_file_remotely_with_retries(file_handle, format.processing, filetype, format.strategy)
|
170
|
+
result: str = self._read_file_remotely_with_retries(file_handle, format.processing, filetype, format.strategy, remote_file)
|
168
171
|
except Exception as e:
|
169
|
-
#
|
172
|
+
# If a parse error occurs while the file is being processed remotely, the file is corrupted. This case is handled by the parse_records method, so just re-raise.
|
173
|
+
#
|
174
|
+
# For other exceptions, re-throw as config error so the sync is stopped as problems with the external API need to be resolved by the user and are not considered part of the SLA.
|
170
175
|
# Once this parser leaves experimental stage, we should consider making this a system error instead for issues that might be transient.
|
176
|
+
if isinstance(e, RecordParseError):
|
177
|
+
raise e
|
171
178
|
raise AirbyteTracedException.from_exception(e, failure_type=FailureType.config_error)
|
172
179
|
|
173
180
|
return result
|
@@ -210,7 +217,13 @@ class UnstructuredParser(FileTypeParser):
|
|
210
217
|
return False, "Base URL must start with https://"
|
211
218
|
|
212
219
|
try:
|
213
|
-
self._read_file_remotely(
|
220
|
+
self._read_file_remotely(
|
221
|
+
BytesIO(b"# Airbyte source connection test"),
|
222
|
+
format_config.processing,
|
223
|
+
FileType.MD,
|
224
|
+
"auto",
|
225
|
+
RemoteFile(uri="test", last_modified=datetime.now()),
|
226
|
+
)
|
214
227
|
except Exception:
|
215
228
|
return False, "".join(traceback.format_exc())
|
216
229
|
|
@@ -218,14 +231,16 @@ class UnstructuredParser(FileTypeParser):
|
|
218
231
|
|
219
232
|
@backoff.on_exception(backoff.expo, requests.exceptions.RequestException, max_tries=5, giveup=user_error)
|
220
233
|
def _read_file_remotely_with_retries(
|
221
|
-
self, file_handle: IOBase, format: APIProcessingConfigModel, filetype: FileType, strategy: str
|
234
|
+
self, file_handle: IOBase, format: APIProcessingConfigModel, filetype: FileType, strategy: str, remote_file: RemoteFile
|
222
235
|
) -> str:
|
223
236
|
"""
|
224
237
|
Read a file remotely, retrying up to 5 times if the error is not caused by user error. This is useful for transient network errors or the API server being overloaded temporarily.
|
225
238
|
"""
|
226
|
-
return self._read_file_remotely(file_handle, format, filetype, strategy)
|
239
|
+
return self._read_file_remotely(file_handle, format, filetype, strategy, remote_file)
|
227
240
|
|
228
|
-
def _read_file_remotely(
|
241
|
+
def _read_file_remotely(
|
242
|
+
self, file_handle: IOBase, format: APIProcessingConfigModel, filetype: FileType, strategy: str, remote_file: RemoteFile
|
243
|
+
) -> str:
|
229
244
|
headers = {"accept": "application/json", "unstructured-api-key": format.api_key}
|
230
245
|
|
231
246
|
data = self._params_to_dict(format.parameters, strategy)
|
@@ -233,7 +248,13 @@ class UnstructuredParser(FileTypeParser):
|
|
233
248
|
file_data = {"files": ("filename", file_handle, FILETYPE_TO_MIMETYPE[filetype])}
|
234
249
|
|
235
250
|
response = requests.post(f"{format.api_url}/general/v0/general", headers=headers, data=data, files=file_data)
|
236
|
-
|
251
|
+
|
252
|
+
if response.status_code == 422:
|
253
|
+
# 422 means the file couldn't be processed, but the API is working. Treat this as a parsing error (passing an error record to the destination).
|
254
|
+
raise self._create_parse_error(remote_file, response.json())
|
255
|
+
else:
|
256
|
+
# Other error statuses are raised as requests exceptions (retry everything except user errors)
|
257
|
+
response.raise_for_status()
|
237
258
|
|
238
259
|
json_response = response.json()
|
239
260
|
|
@@ -15,11 +15,11 @@ airbyte_cdk/destinations/destination.py,sha256=_tIMnKcRQbtIsjVvNOVjfbIxgCNLuBXQw
|
|
15
15
|
airbyte_cdk/destinations/vector_db_based/__init__.py,sha256=eAkzwTjBbXBhJ5GfPO5I53Zgpv5xQFLRQS8n4nuyPt0,1006
|
16
16
|
airbyte_cdk/destinations/vector_db_based/config.py,sha256=ibGA5rQepeiscNTZC6GlvYaL_m3EhNGJ0FkegYo1CiU,12324
|
17
17
|
airbyte_cdk/destinations/vector_db_based/document_processor.py,sha256=DjyegDH7jYh7N_1JiYSDaqc3OMEb4V5R_LtGxaGOhW4,9083
|
18
|
-
airbyte_cdk/destinations/vector_db_based/embedder.py,sha256=
|
18
|
+
airbyte_cdk/destinations/vector_db_based/embedder.py,sha256=HxQCPwRpALmo5MvEhTuXdjinoBzlbNVvVunRw3EVgaE,11443
|
19
19
|
airbyte_cdk/destinations/vector_db_based/indexer.py,sha256=beiSi2Uu67EoTr7yQSaCJFAh9RajHFGKA4PoTbpTOqM,3243
|
20
20
|
airbyte_cdk/destinations/vector_db_based/test_utils.py,sha256=8d1Smk4jQRKtDfloXfEq12T-BU8ByyzzSBwAlchsU4A,1807
|
21
21
|
airbyte_cdk/destinations/vector_db_based/utils.py,sha256=dKpjY0QQVr5wMe6XHE_XdeL-nNqAew5InCfxkbyyf5A,1073
|
22
|
-
airbyte_cdk/destinations/vector_db_based/writer.py,sha256=
|
22
|
+
airbyte_cdk/destinations/vector_db_based/writer.py,sha256=2EOkNcOe9pKGz7DgC6iSHjWoxbYF0IZ7PcpsQYIOgUk,4394
|
23
23
|
airbyte_cdk/models/__init__.py,sha256=Kg8YHBqUsNWHlAw-u3ZGdG4dxLh7qBlHhqMRfamNCRU,1708
|
24
24
|
airbyte_cdk/models/airbyte_protocol.py,sha256=DoJvnmGM3xMAZFTwA6_RGMiKSFqfE3ib_Ru0KJ65Ag4,100
|
25
25
|
airbyte_cdk/models/well_known_types.py,sha256=KKfNbow2gdLoC1Z4hcXy_JR8m_acsB2ol7gQuEgjobw,117
|
@@ -177,7 +177,7 @@ airbyte_cdk/sources/file_based/file_types/csv_parser.py,sha256=VS2Ld9rfm4tLkwNZ3
|
|
177
177
|
airbyte_cdk/sources/file_based/file_types/file_type_parser.py,sha256=Gbn-8v1-jLhKpJXTNOOc5PZT1Jzah6G-INCZt4snLdQ,2819
|
178
178
|
airbyte_cdk/sources/file_based/file_types/jsonl_parser.py,sha256=S7OtfRRvQ8P6YbZVdJ8h7mw1hnWQUVSHR9Jy12U1Yy0,5634
|
179
179
|
airbyte_cdk/sources/file_based/file_types/parquet_parser.py,sha256=Jq_-WSbyueVwyLYrrGafXhvcA1LDOeps0A_uBhStOHI,9017
|
180
|
-
airbyte_cdk/sources/file_based/file_types/unstructured_parser.py,sha256=
|
180
|
+
airbyte_cdk/sources/file_based/file_types/unstructured_parser.py,sha256=vw9As28N7QPWkSPq0v-mHvnRtoiM51q8swpX4iG-1vI,16694
|
181
181
|
airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py,sha256=sEVnRhZ8x9f7PNjo6lewxid9z0PI8eSj7gSoFC3MH1Y,527
|
182
182
|
airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py,sha256=uwk6Ugf23xKG4PRPVVRVwpcNjTwPgxejl03vLSEzK0s,604
|
183
183
|
airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py,sha256=ZeAa0z50ywMU2chNjQ7JpL4yePU1NajhBa8FS7rXLVo,1643
|
@@ -375,7 +375,7 @@ unit_tests/sources/file_based/file_types/test_avro_parser.py,sha256=LCoGa0fvOber
|
|
375
375
|
unit_tests/sources/file_based/file_types/test_csv_parser.py,sha256=zgHjLfPASRwFxkubdRK0UkskGTOAdASpWHKucm0AmqM,22423
|
376
376
|
unit_tests/sources/file_based/file_types/test_jsonl_parser.py,sha256=foTf9U9LyAS8OR0BonwNgFWPqTrmzFV2lpPUfRMrioE,6134
|
377
377
|
unit_tests/sources/file_based/file_types/test_parquet_parser.py,sha256=J66wfbAaflSe5y3ixCZ4tLPEQdU62eYj-pNXycCtK0U,14159
|
378
|
-
unit_tests/sources/file_based/file_types/test_unstructured_parser.py,sha256=
|
378
|
+
unit_tests/sources/file_based/file_types/test_unstructured_parser.py,sha256=Ts-5Zzcq8ETwgb2aCXlk5EIZtQTM9_OOx1HL8TNk0IU,22454
|
379
379
|
unit_tests/sources/file_based/scenarios/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
380
380
|
unit_tests/sources/file_based/scenarios/avro_scenarios.py,sha256=oeQUmCV7d2aTShreYc-PvVb4cWqLSsVwHfg-lcKjzPs,30554
|
381
381
|
unit_tests/sources/file_based/scenarios/check_scenarios.py,sha256=0xkt21ASTnTAMP0RYJEsF3yMGsNN7wWOoG_tmzL9PYw,6750
|
@@ -439,8 +439,8 @@ unit_tests/utils/test_schema_inferrer.py,sha256=Z2jHBZ540wnYkylIdV_2xr75Vtwlxuyg
|
|
439
439
|
unit_tests/utils/test_secret_utils.py,sha256=XKe0f1RHYii8iwE6ATmBr5JGDI1pzzrnZUGdUSMJQP4,4886
|
440
440
|
unit_tests/utils/test_stream_status_utils.py,sha256=Xr8MZ2HWgTVIyMbywDvuYkRaUF4RZLQOT8-JjvcfR24,2970
|
441
441
|
unit_tests/utils/test_traced_exception.py,sha256=bDFP5zMBizFenz6V2WvEZTRCKGB5ijh3DBezjbfoYIs,4198
|
442
|
-
airbyte_cdk-0.57.
|
443
|
-
airbyte_cdk-0.57.
|
444
|
-
airbyte_cdk-0.57.
|
445
|
-
airbyte_cdk-0.57.
|
446
|
-
airbyte_cdk-0.57.
|
442
|
+
airbyte_cdk-0.57.3.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
|
443
|
+
airbyte_cdk-0.57.3.dist-info/METADATA,sha256=rPyT57ZcShy6cYBeEwA7cf35b8NVHwZL3eX-3X-K_B8,11983
|
444
|
+
airbyte_cdk-0.57.3.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
|
445
|
+
airbyte_cdk-0.57.3.dist-info/top_level.txt,sha256=edvsDKTnE6sD2wfCUaeTfKf5gQIL6CPVMwVL2sWZzqo,51
|
446
|
+
airbyte_cdk-0.57.3.dist-info/RECORD,,
|
@@ -315,7 +315,7 @@ def test_check_config(requests_mock, format_config, raises_for_status, json_resp
|
|
315
315
|
|
316
316
|
|
317
317
|
@pytest.mark.parametrize(
|
318
|
-
"filetype, format_config, raises_for_status, file_content, json_response, expected_requests, raises, expected_records",
|
318
|
+
"filetype, format_config, raises_for_status, file_content, json_response, expected_requests, raises, expected_records, http_status_code",
|
319
319
|
[
|
320
320
|
pytest.param(
|
321
321
|
FileType.PDF,
|
@@ -332,6 +332,7 @@ def test_check_config(requests_mock, format_config, raises_for_status, json_resp
|
|
332
332
|
"_ab_source_file_parse_error": None
|
333
333
|
}
|
334
334
|
],
|
335
|
+
200,
|
335
336
|
id="basic_request",
|
336
337
|
),
|
337
338
|
pytest.param(
|
@@ -349,6 +350,7 @@ def test_check_config(requests_mock, format_config, raises_for_status, json_resp
|
|
349
350
|
"_ab_source_file_parse_error": None
|
350
351
|
}
|
351
352
|
],
|
353
|
+
200,
|
352
354
|
id="request_with_params",
|
353
355
|
),
|
354
356
|
pytest.param(
|
@@ -366,6 +368,7 @@ def test_check_config(requests_mock, format_config, raises_for_status, json_resp
|
|
366
368
|
"_ab_source_file_parse_error": None
|
367
369
|
}
|
368
370
|
],
|
371
|
+
200,
|
369
372
|
id="handle_markdown_locally",
|
370
373
|
),
|
371
374
|
pytest.param(
|
@@ -394,6 +397,7 @@ def test_check_config(requests_mock, format_config, raises_for_status, json_resp
|
|
394
397
|
],
|
395
398
|
True,
|
396
399
|
None,
|
400
|
+
200,
|
397
401
|
id="retry_and_raise_on_api_error",
|
398
402
|
),
|
399
403
|
pytest.param(
|
@@ -422,6 +426,7 @@ def test_check_config(requests_mock, format_config, raises_for_status, json_resp
|
|
422
426
|
"_ab_source_file_parse_error": None
|
423
427
|
}
|
424
428
|
],
|
429
|
+
200,
|
425
430
|
id="retry_and_recover",
|
426
431
|
),
|
427
432
|
pytest.param(
|
@@ -438,6 +443,7 @@ def test_check_config(requests_mock, format_config, raises_for_status, json_resp
|
|
438
443
|
],
|
439
444
|
True,
|
440
445
|
None,
|
446
|
+
200,
|
441
447
|
id="no_retry_on_unexpected_error",
|
442
448
|
),
|
443
449
|
pytest.param(
|
@@ -454,8 +460,29 @@ def test_check_config(requests_mock, format_config, raises_for_status, json_resp
|
|
454
460
|
],
|
455
461
|
True,
|
456
462
|
None,
|
463
|
+
400,
|
457
464
|
id="no_retry_on_400_error",
|
458
465
|
),
|
466
|
+
pytest.param(
|
467
|
+
FileType.PDF,
|
468
|
+
UnstructuredFormat(skip_unprocessable_file_types=False, processing=APIProcessingConfigModel(mode="api", api_key="test")),
|
469
|
+
None,
|
470
|
+
"test",
|
471
|
+
[{"detail": "Something went wrong"}],
|
472
|
+
[
|
473
|
+
call("https://api.unstructured.io/general/v0/general", headers={"accept": "application/json", "unstructured-api-key": "test"}, data={"strategy": "auto"}, files={"files": ("filename", mock.ANY, "application/pdf")}),
|
474
|
+
],
|
475
|
+
False,
|
476
|
+
[
|
477
|
+
{
|
478
|
+
"content": None,
|
479
|
+
"document_key": FILE_URI,
|
480
|
+
"_ab_source_file_parse_error": "Error parsing record. This could be due to a mismatch between the config's file type and the actual file type, or because the file or record is not parseable. Contact Support if you need assistance.\nfilename=path/to/file.xyz message=[{'detail': 'Something went wrong'}]",
|
481
|
+
}
|
482
|
+
],
|
483
|
+
422,
|
484
|
+
id="error_record_on_422_error",
|
485
|
+
),
|
459
486
|
],
|
460
487
|
)
|
461
488
|
@patch("airbyte_cdk.sources.file_based.file_types.unstructured_parser.requests")
|
@@ -473,6 +500,7 @@ def test_parse_records_remotely(
|
|
473
500
|
expected_requests,
|
474
501
|
raises,
|
475
502
|
expected_records,
|
503
|
+
http_status_code
|
476
504
|
):
|
477
505
|
stream_reader = MagicMock()
|
478
506
|
mock_open(stream_reader.open_file, read_data=bytes(str(file_content), "utf-8"))
|
@@ -484,6 +512,7 @@ def test_parse_records_remotely(
|
|
484
512
|
mock_detect_filetype.return_value = filetype
|
485
513
|
mock_response = MagicMock()
|
486
514
|
mock_response.json.return_value = json_response
|
515
|
+
mock_response.status_code = http_status_code
|
487
516
|
if raises_for_status:
|
488
517
|
mock_response.raise_for_status.side_effect = raises_for_status
|
489
518
|
requests_mock.post.return_value = mock_response
|
File without changes
|
File without changes
|
File without changes
|