unstructured-ingest 0.3.3__py3-none-any.whl → 0.3.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of unstructured-ingest has been flagged as possibly problematic.
- test/integration/connectors/test_lancedb.py +46 -9
- test/integration/connectors/test_pinecone.py +60 -9
- test/integration/embedders/test_azure_openai.py +59 -0
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/embed/azure_openai.py +31 -0
- unstructured_ingest/v2/processes/connectors/couchbase.py +4 -1
- unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +1 -5
- unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +12 -2
- unstructured_ingest/v2/processes/connectors/pinecone.py +31 -21
- unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +0 -3
- unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +0 -10
- unstructured_ingest/v2/processes/embedder.py +30 -0
- {unstructured_ingest-0.3.3.dist-info → unstructured_ingest-0.3.5.dist-info}/METADATA +16 -16
- {unstructured_ingest-0.3.3.dist-info → unstructured_ingest-0.3.5.dist-info}/RECORD +18 -16
- {unstructured_ingest-0.3.3.dist-info → unstructured_ingest-0.3.5.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.3.3.dist-info → unstructured_ingest-0.3.5.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.3.3.dist-info → unstructured_ingest-0.3.5.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.3.3.dist-info → unstructured_ingest-0.3.5.dist-info}/top_level.txt +0 -0
test/integration/connectors/test_lancedb.py

```diff
@@ -12,6 +12,7 @@ from lancedb import AsyncConnection
 from upath import UPath
 
 from test.integration.connectors.utils.constants import DESTINATION_TAG
+from unstructured_ingest.v2.constants import RECORD_ID_LABEL
 from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
 from unstructured_ingest.v2.processes.connectors.lancedb.aws import (
     LanceDBAwsAccessConfig,
@@ -43,7 +44,6 @@ DATABASE_NAME = "database"
 TABLE_NAME = "elements"
 DIMENSION = 384
 NUMBER_EXPECTED_ROWS = 22
-NUMBER_EXPECTED_COLUMNS = 10
 S3_BUCKET = "s3://utic-ingest-test-fixtures/"
 GS_BUCKET = "gs://utic-test-ingest-fixtures-output/"
 AZURE_BUCKET = "az://utic-ingest-test-fixtures-output/"
@@ -54,9 +54,9 @@ REQUIRED_ENV_VARS = {
     "local": (),
 }
 
-
 SCHEMA = pa.schema(
     [
+        pa.field(RECORD_ID_LABEL, pa.string()),
         pa.field("vector", pa.list_(pa.float16(), DIMENSION)),
         pa.field("text", pa.string(), nullable=True),
         pa.field("type", pa.string(), nullable=True),
@@ -69,6 +69,7 @@ SCHEMA = pa.schema(
         pa.field("metadata-page_number", pa.int32(), nullable=True),
     ]
 )
+NUMBER_EXPECTED_COLUMNS = len(SCHEMA.names)
 
 
 @pytest_asyncio.fixture
@@ -116,7 +117,7 @@ async def test_lancedb_destination(
     file_data = FileData(
         source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
         connector_type=CONNECTOR_TYPE,
-        identifier="mock
+        identifier="mock-file-data",
     )
     stager = LanceDBUploadStager()
     uploader = _get_uploader(uri)
@@ -129,17 +130,52 @@ async def test_lancedb_destination(
 
     await uploader.run_async(path=staged_file_path, file_data=file_data)
 
-
-
+    # Test upload to empty table
+    with await connection.open_table(TABLE_NAME) as table:
+        table_df: pd.DataFrame = await table.to_pandas()
 
     assert len(table_df) == NUMBER_EXPECTED_ROWS
     assert len(table_df.columns) == NUMBER_EXPECTED_COLUMNS
 
+    assert table_df[RECORD_ID_LABEL][0] == file_data.identifier
     assert table_df["element_id"][0] == "2470d8dc42215b3d68413b55bf00fed2"
     assert table_df["type"][0] == "CompositeElement"
     assert table_df["metadata-filename"][0] == "DA-1p-with-duplicate-pages.pdf.json"
     assert table_df["metadata-text_as_html"][0] is None
 
+    # Test upload of the second file, rows should be appended
+    file_data.identifier = "mock-file-data-2"
+    staged_second_file_path = stager.run(
+        elements_filepath=upload_file,
+        file_data=file_data,
+        output_dir=tmp_path,
+        output_filename=f"{upload_file.stem}-2{upload_file.suffix}",
+    )
+    await uploader.run_async(path=staged_second_file_path, file_data=file_data)
+    with await connection.open_table(TABLE_NAME) as table:
+        appended_table_df: pd.DataFrame = await table.to_pandas()
+    assert len(appended_table_df) == 2 * NUMBER_EXPECTED_ROWS
+
+    # Test re-upload of the first file, rows should be overwritten, not appended
+    await uploader.run_async(path=staged_file_path, file_data=file_data)
+    with await connection.open_table(TABLE_NAME) as table:
+        overwritten_table_df: pd.DataFrame = await table.to_pandas()
+    assert len(overwritten_table_df) == 2 * NUMBER_EXPECTED_ROWS
+
+
+class TestPrecheck:
+    @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+    @pytest.mark.parametrize("connection_with_uri", ["local", "s3", "gcs", "az"], indirect=True)
+    def test_succeeds(
+        self,
+        upload_file: Path,
+        connection_with_uri: tuple[AsyncConnection, str],
+        tmp_path: Path,
+    ) -> None:
+        _, uri = connection_with_uri
+        uploader = _get_uploader(uri)
+        uploader.precheck()
+
 
 def _get_uri(target: Literal["local", "s3", "gcs", "az"], local_base_path: Path) -> str:
     if target == "local":
@@ -158,11 +194,12 @@ def _get_uploader(
     uri: str,
 ) -> Union[LanceDBAzureUploader, LanceDBAzureUploader, LanceDBAwsUploader, LanceDBGSPUploader]:
     target = uri.split("://", maxsplit=1)[0] if uri.startswith(("s3", "az", "gs")) else "local"
+    upload_config = LanceDBUploaderConfig(table_name=TABLE_NAME)
     if target == "az":
        azure_connection_string = os.getenv("AZURE_DEST_CONNECTION_STR")
        access_config_kwargs = _parse_azure_connection_string(azure_connection_string)
        return LanceDBAzureUploader(
-            upload_config=
+            upload_config=upload_config,
            connection_config=LanceDBAzureConnectionConfig(
                access_config=LanceDBAzureAccessConfig(**access_config_kwargs),
                uri=uri,
@@ -171,7 +208,7 @@ def _get_uploader(
 
    elif target == "s3":
        return LanceDBAwsUploader(
-            upload_config=
+            upload_config=upload_config,
            connection_config=LanceDBAwsConnectionConfig(
                access_config=LanceDBAwsAccessConfig(
                    aws_access_key_id=os.getenv("S3_INGEST_TEST_ACCESS_KEY"),
@@ -182,7 +219,7 @@ def _get_uploader(
        )
    elif target == "gs":
        return LanceDBGSPUploader(
-            upload_config=
+            upload_config=upload_config,
            connection_config=LanceDBGCSConnectionConfig(
                access_config=LanceDBGCSAccessConfig(
                    google_service_account_key=os.getenv("GCP_INGEST_SERVICE_KEY")
@@ -192,7 +229,7 @@ def _get_uploader(
        )
    else:
        return LanceDBLocalUploader(
-            upload_config=
+            upload_config=upload_config,
            connection_config=LanceDBLocalConnectionConfig(
                access_config=LanceDBLocalAccessConfig(),
                uri=uri,
```
test/integration/connectors/test_pinecone.py

```diff
@@ -1,4 +1,5 @@
 import json
+import math
 import os
 import re
 import time
@@ -19,6 +20,7 @@ from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connectors.pinecone import (
     CONNECTOR_TYPE,
+    MAX_QUERY_RESULTS,
     PineconeAccessConfig,
     PineconeConnectionConfig,
     PineconeUploader,
@@ -118,7 +120,10 @@ def validate_pinecone_index(
             f"retry attempt {i}: expected {expected_num_of_vectors} != vector count {vector_count}"
         )
         time.sleep(interval)
-    assert vector_count == expected_num_of_vectors
+    assert vector_count == expected_num_of_vectors, (
+        f"vector count from index ({vector_count}) doesn't "
+        f"match expected number: {expected_num_of_vectors}"
+    )
 
 
 @requires_env(API_KEY)
@@ -147,10 +152,7 @@ async def test_pinecone_destination(pinecone_index: str, upload_file: Path, temp_dir: Path):
     uploader = PineconeUploader(connection_config=connection_config, upload_config=upload_config)
     uploader.precheck()
 
-
-        await uploader.run_async(path=new_upload_file, file_data=file_data)
-    else:
-        uploader.run(path=new_upload_file, file_data=file_data)
+    uploader.run(path=new_upload_file, file_data=file_data)
     with new_upload_file.open() as f:
         staged_content = json.load(f)
     expected_num_of_vectors = len(staged_content)
@@ -160,10 +162,59 @@ async def test_pinecone_destination(pinecone_index: str, upload_file: Path, temp_dir: Path):
     )
 
     # Rerun uploader and make sure no duplicates exist
-
-
-
-
+    uploader.run(path=new_upload_file, file_data=file_data)
+    logger.info("validating second upload")
+    validate_pinecone_index(
+        index_name=pinecone_index, expected_num_of_vectors=expected_num_of_vectors
+    )
+
+
+@requires_env(API_KEY)
+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+@pytest.mark.skip(reason="TODO: get this to work")
+async def test_pinecone_destination_large_index(
+    pinecone_index: str, upload_file: Path, temp_dir: Path
+):
+    new_file = temp_dir / "large_file.json"
+    with upload_file.open() as f:
+        upload_content = json.load(f)
+
+    min_entries = math.ceil((MAX_QUERY_RESULTS * 2) / len(upload_content))
+    new_content = (upload_content * min_entries)[: (2 * MAX_QUERY_RESULTS)]
+    print(f"Creating large index content with {len(new_content)} records")
+    with new_file.open("w") as f:
+        json.dump(new_content, f)
+
+    expected_num_of_vectors = len(new_content)
+    file_data = FileData(
+        source_identifiers=SourceIdentifiers(fullpath=new_file.name, filename=new_file.name),
+        connector_type=CONNECTOR_TYPE,
+        identifier="pinecone_mock_id",
+    )
+    connection_config = PineconeConnectionConfig(
+        index_name=pinecone_index,
+        access_config=PineconeAccessConfig(api_key=get_api_key()),
+    )
+    stager_config = PineconeUploadStagerConfig()
+    stager = PineconeUploadStager(upload_stager_config=stager_config)
+    new_upload_file = stager.run(
+        elements_filepath=new_file,
+        output_dir=temp_dir,
+        output_filename=new_file.name,
+        file_data=file_data,
+    )
+
+    upload_config = PineconeUploaderConfig()
+    uploader = PineconeUploader(connection_config=connection_config, upload_config=upload_config)
+    uploader.precheck()
+
+    uploader.run(path=new_upload_file, file_data=file_data)
+    validate_pinecone_index(
+        index_name=pinecone_index, expected_num_of_vectors=expected_num_of_vectors
+    )
+    # Rerun uploader and make sure no duplicates exist
+    uploader.run(path=new_upload_file, file_data=file_data)
     logger.info("validating second upload")
     validate_pinecone_index(
         index_name=pinecone_index, expected_num_of_vectors=expected_num_of_vectors
```
test/integration/embedders/test_azure_openai.py (new file)

```diff
@@ -0,0 +1,59 @@
+import json
+import os
+from dataclasses import dataclass
+from pathlib import Path
+
+from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
+from test.integration.utils import requires_env
+from unstructured_ingest.embed.azure_openai import (
+    AzureOpenAIEmbeddingConfig,
+    AzureOpenAIEmbeddingEncoder,
+)
+from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
+
+API_KEY = "AZURE_OPENAI_API_KEY"
+ENDPOINT = "AZURE_OPENAI_ENDPOINT"
+
+
+@dataclass(frozen=True)
+class AzureData:
+    api_key: str
+    endpoint: str
+
+
+def get_azure_data() -> AzureData:
+    api_key = os.getenv(API_KEY, None)
+    assert api_key
+    endpoint = os.getenv(ENDPOINT, None)
+    assert endpoint
+    return AzureData(api_key, endpoint)
+
+
+@requires_env(API_KEY, ENDPOINT)
+def test_azure_openai_embedder(embedder_file: Path):
+    azure_data = get_azure_data()
+    embedder_config = EmbedderConfig(
+        embedding_provider="azure-openai",
+        embedding_api_key=azure_data.api_key,
+        embedding_azure_endpoint=azure_data.endpoint,
+    )
+    embedder = Embedder(config=embedder_config)
+    results = embedder.run(elements_filepath=embedder_file)
+    assert results
+    with embedder_file.open("r") as f:
+        original_elements = json.load(f)
+    validate_embedding_output(original_elements=original_elements, output_elements=results)
+
+
+@requires_env(API_KEY, ENDPOINT)
+def test_raw_azure_openai_embedder(embedder_file: Path):
+    azure_data = get_azure_data()
+    embedder = AzureOpenAIEmbeddingEncoder(
+        config=AzureOpenAIEmbeddingConfig(
+            api_key=azure_data.api_key,
+            azure_endpoint=azure_data.endpoint,
+        )
+    )
+    validate_raw_embedder(
+        embedder=embedder, embedder_file=embedder_file, expected_dimensions=(1536,)
+    )
```
unstructured_ingest/__version__.py

```diff
@@ -1 +1 @@
-__version__ = "0.3.3"  # pragma: no cover
+__version__ = "0.3.5"  # pragma: no cover
```
unstructured_ingest/embed/azure_openai.py (new file)

```diff
@@ -0,0 +1,31 @@
+from dataclasses import dataclass
+from typing import TYPE_CHECKING
+
+from pydantic import Field
+
+from unstructured_ingest.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder
+from unstructured_ingest.utils.dep_check import requires_dependencies
+
+if TYPE_CHECKING:
+    from openai import AzureOpenAI
+
+
+class AzureOpenAIEmbeddingConfig(OpenAIEmbeddingConfig):
+    api_version: str = Field(description="Azure API version", default="2024-06-01")
+    azure_endpoint: str
+    embedder_model_name: str = Field(default="text-embedding-ada-002", alias="model_name")
+
+    @requires_dependencies(["openai"], extras="openai")
+    def get_client(self) -> "AzureOpenAI":
+        from openai import AzureOpenAI
+
+        return AzureOpenAI(
+            api_key=self.api_key.get_secret_value(),
+            api_version=self.api_version,
+            azure_endpoint=self.azure_endpoint,
+        )
+
+
+@dataclass
+class AzureOpenAIEmbeddingEncoder(OpenAIEmbeddingEncoder):
+    config: AzureOpenAIEmbeddingConfig
```
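The new module gets Azure support almost for free by subclassing the existing OpenAI encoder and swapping in an `AzureOpenAI` client. A minimal usage sketch, mirroring the integration test above (the environment variable names match the test; everything else is a placeholder):

```python
import os

from unstructured_ingest.embed.azure_openai import (
    AzureOpenAIEmbeddingConfig,
    AzureOpenAIEmbeddingEncoder,
)

# api_version defaults to "2024-06-01" and the model name to
# "text-embedding-ada-002", so only the key and endpoint are required.
encoder = AzureOpenAIEmbeddingEncoder(
    config=AzureOpenAIEmbeddingConfig(
        api_key=os.environ["AZURE_OPENAI_API_KEY"],
        azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    )
)
```

Since `AzureOpenAIEmbeddingConfig` extends `OpenAIEmbeddingConfig`, batching and element handling are inherited unchanged; only `get_client()` differs.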
unstructured_ingest/v2/processes/connectors/couchbase.py

```diff
@@ -219,6 +219,9 @@ class CouchbaseIndexer(Indexer):
 
 
 class CouchbaseDownloaderConfig(DownloaderConfig):
+    collection_id: str = Field(
+        default="id", description="The unique key of the id field in the collection"
+    )
     fields: list[str] = field(default_factory=list)
 
 
@@ -250,7 +253,7 @@ class CouchbaseDownloader(Downloader):
     def generate_download_response(
         self, result: dict, bucket: str, file_data: FileData
     ) -> DownloadResponse:
-        record_id = result[
+        record_id = result[self.download_config.collection_id]
         filename_id = self.get_identifier(bucket=bucket, record_id=record_id)
         filename = f"{filename_id}.txt"
         download_path = self.download_dir / Path(filename)
```
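The downloader previously assumed the record id lived under a fixed key in each query result; it now reads the key name from configuration, defaulting to `id`. A short sketch, where the field name `doc_key` is a hypothetical example:

```python
from unstructured_ingest.v2.processes.connectors.couchbase import CouchbaseDownloaderConfig

# Default: ids are read from result["id"], matching the old behavior.
download_config = CouchbaseDownloaderConfig()

# Hypothetical collection whose unique id field is named "doc_key".
download_config = CouchbaseDownloaderConfig(collection_id="doc_key")
```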
unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py

```diff
@@ -142,8 +142,6 @@ class ElasticsearchIndexer(Indexer):
     def precheck(self) -> None:
         try:
             with self.connection_config.get_client() as client:
-                if not client.ping():
-                    raise SourceConnectionError("cluster not detected")
                 indices = client.indices.get_alias(index="*")
                 if self.index_config.index_name not in indices:
                     raise SourceConnectionError(
@@ -393,11 +391,9 @@ class ElasticsearchUploader(Uploader):
     def precheck(self) -> None:
         try:
             with self.connection_config.get_client() as client:
-                if not client.ping():
-                    raise DestinationConnectionError("cluster not detected")
                 indices = client.indices.get_alias(index="*")
                 if self.upload_config.index_name not in indices:
-                    raise
+                    raise DestinationConnectionError(
                         "index {} not found: {}".format(
                             self.upload_config.index_name, ", ".join(indices.keys())
                         )
```
unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py

```diff
@@ -15,6 +15,7 @@ from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.logger import logger
 from unstructured_ingest.utils.data_prep import flatten_dict
 from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.constants import RECORD_ID_LABEL
 from unstructured_ingest.v2.interfaces.connector import ConnectionConfig
 from unstructured_ingest.v2.interfaces.file_data import FileData
 from unstructured_ingest.v2.interfaces.upload_stager import UploadStager, UploadStagerConfig
@@ -84,7 +85,7 @@ class LanceDBUploadStager(UploadStager):
 
         df = pd.DataFrame(
             [
-                self._conform_element_contents(element_contents)
+                self._conform_element_contents(element_contents, file_data)
                 for element_contents in elements_contents
             ]
         )
@@ -94,9 +95,10 @@ class LanceDBUploadStager(UploadStager):
 
         return output_path
 
-    def _conform_element_contents(self, element: dict) -> dict:
+    def _conform_element_contents(self, element: dict, file_data: FileData) -> dict:
         return {
             "vector": element.pop("embeddings", None),
+            RECORD_ID_LABEL: file_data.identifier,
             **flatten_dict(element, separator="-"),
         }
 
@@ -134,6 +136,14 @@ class LanceDBUploader(Uploader):
         async with self.get_table() as table:
             schema = await table.schema()
             df = self._fit_to_schema(df, schema)
+            if RECORD_ID_LABEL not in schema.names:
+                logger.warning(
+                    f"Designated table doesn't contain {RECORD_ID_LABEL} column of type"
+                    " string which is required to support overwriting updates on subsequent"
+                    " uploads of the same record. New rows will be appended instead."
+                )
+            else:
+                await table.delete(f'{RECORD_ID_LABEL} = "{file_data.identifier}"')
             await table.add(data=df)
 
     def _fit_to_schema(self, df: pd.DataFrame, schema) -> pd.DataFrame:
```
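Combined, the stager change stamps every row with the uploading record's id, and the uploader deletes rows carrying that id before appending, so re-uploading the same file replaces its rows instead of duplicating them (which is what the updated test asserts). A condensed sketch of the delete-then-add pattern, assuming an already-open LanceDB async table:

```python
from unstructured_ingest.v2.constants import RECORD_ID_LABEL


async def upsert_rows(table, df, record_id: str) -> None:
    """Sketch: replace any previously uploaded rows for this record, then append."""
    schema = await table.schema()
    if RECORD_ID_LABEL in schema.names:
        # delete() takes a SQL-style predicate, as in the diff above.
        await table.delete(f'{RECORD_ID_LABEL} = "{record_id}"')
    # Without the record-id column, rows can only be appended.
    await table.add(data=df)
```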
unstructured_ingest/v2/processes/connectors/pinecone.py

```diff
@@ -31,6 +31,7 @@ CONNECTOR_TYPE = "pinecone"
 MAX_PAYLOAD_SIZE = 2 * 1024 * 1024  # 2MB
 MAX_POOL_THREADS = 100
 MAX_METADATA_BYTES = 40960  # 40KB https://docs.pinecone.io/reference/quotas-and-limits#hard-limits
+MAX_QUERY_RESULTS = 10000
 
 
 class PineconeAccessConfig(AccessConfig):
@@ -84,7 +85,7 @@ ALLOWED_FIELDS = (
 
 class PineconeUploadStagerConfig(UploadStagerConfig):
     metadata_fields: list[str] = Field(
-        default=
+        default=list(ALLOWED_FIELDS),
         description=(
             "which metadata from the source element to map to the payload metadata being sent to "
             "Pinecone."
@@ -137,7 +138,6 @@ class PineconeUploadStager(UploadStager):
             flatten_lists=True,
             remove_none=True,
         )
-        metadata[RECORD_ID_LABEL] = file_data.identifier
         metadata_size_bytes = len(json.dumps(metadata).encode())
         if metadata_size_bytes > MAX_METADATA_BYTES:
             logger.info(
@@ -146,6 +146,8 @@ class PineconeUploadStager(UploadStager):
             )
             metadata = {}
 
+        metadata[RECORD_ID_LABEL] = file_data.identifier
+
         return {
             "id": str(uuid.uuid4()),
             "values": embeddings,
@@ -213,6 +215,18 @@ class PineconeUploader(Uploader):
                 f"from pinecone index: {resp}"
             )
 
+    def delete_by_query(self, index: "PineconeIndex", query_params: dict) -> None:
+        while True:
+            query_results = index.query(**query_params)
+            matches = query_results.get("matches", [])
+            if not matches:
+                break
+            ids = [match["id"] for match in matches]
+            delete_params = {"ids": ids}
+            if namespace := self.upload_config.namespace:
+                delete_params["namespace"] = namespace
+            index.delete(**delete_params)
+
     def serverless_delete_by_record_id(self, file_data: FileData) -> None:
         logger.debug(
             f"deleting any content with metadata "
@@ -221,29 +235,25 @@ class PineconeUploader(Uploader):
         )
         index = self.connection_config.get_index(pool_threads=MAX_POOL_THREADS)
         index_stats = index.describe_index_stats()
+        dimension = index_stats["dimension"]
         total_vectors = index_stats["total_vector_count"]
         if total_vectors == 0:
             return
-
-
-
-
-
-
-
-            query_params["namespace"] = namespace
-        while True:
-            query_results = index.query(**query_params)
-            matches = query_results.get("matches", [])
-            if not matches:
-                break
-            ids = [match["id"] for match in matches]
-            delete_params = {"ids": ids}
+        while total_vectors > 0:
+            top_k = min(total_vectors, MAX_QUERY_RESULTS)
+            query_params = {
+                "filter": {self.upload_config.record_id_key: {"$eq": file_data.identifier}},
+                "vector": [0] * dimension,
+                "top_k": top_k,
+            }
             if namespace := self.upload_config.namespace:
-
-
-
-
+                query_params["namespace"] = namespace
+            self.delete_by_query(index=index, query_params=query_params)
+            index_stats = index.describe_index_stats()
+            total_vectors = index_stats["total_vector_count"]
+
+        logger.info(
+            f"deleted {total_vectors} records with metadata "
             f"{self.upload_config.record_id_key}={file_data.identifier} "
             f"from pinecone index"
         )
```
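Because a single Pinecone query returns at most a bounded number of matches, deleting an arbitrarily large record now happens in rounds: query with a dummy zero vector plus a metadata filter on the record id, delete the returned ids, and repeat until nothing matches. A standalone sketch of that loop (the index handle and argument values are assumed):

```python
MAX_QUERY_RESULTS = 10000  # mirrors the new module-level constant


def delete_record_vectors(index, record_id_key: str, record_id: str, dimension: int) -> None:
    """Sketch of the paginated delete performed by serverless_delete_by_record_id."""
    while True:
        results = index.query(
            vector=[0] * dimension,  # dummy query vector; only the filter matters
            filter={record_id_key: {"$eq": record_id}},
            top_k=MAX_QUERY_RESULTS,
        )
        matches = results.get("matches", [])
        if not matches:
            break
        index.delete(ids=[match["id"] for match in matches])
```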
unstructured_ingest/v2/processes/connectors/weaviate/__init__.py

```diff
@@ -10,8 +10,6 @@ from .embedded import CONNECTOR_TYPE as EMBEDDED_WEAVIATE_CONNECTOR_TYPE
 from .embedded import weaviate_embedded_destination_entry
 from .local import CONNECTOR_TYPE as LOCAL_WEAVIATE_CONNECTOR_TYPE
 from .local import weaviate_local_destination_entry
-from .weaviate import CONNECTOR_TYPE as WEAVIATE_CONNECTOR_TYPE
-from .weaviate import weaviate_destination_entry
 
 add_destination_entry(
     destination_type=LOCAL_WEAVIATE_CONNECTOR_TYPE, entry=weaviate_local_destination_entry
@@ -22,4 +20,3 @@ add_destination_entry(
 add_destination_entry(
     destination_type=EMBEDDED_WEAVIATE_CONNECTOR_TYPE, entry=weaviate_embedded_destination_entry
 )
-add_destination_entry(destination_type=WEAVIATE_CONNECTOR_TYPE, entry=weaviate_destination_entry)
```
unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py

```diff
@@ -22,7 +22,6 @@ from unstructured_ingest.v2.interfaces import (
     UploadStagerConfig,
 )
 from unstructured_ingest.v2.logger import logger
-from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
 
 if TYPE_CHECKING:
     from weaviate.classes.init import Timeout
@@ -288,12 +287,3 @@ class WeaviateUploader(Uploader, ABC):
                 vector=vector,
             )
         self.check_for_errors(client=weaviate_client)
-
-
-weaviate_destination_entry = DestinationRegistryEntry(
-    connection_config=WeaviateConnectionConfig,
-    uploader=WeaviateUploader,
-    uploader_config=WeaviateUploaderConfig,
-    upload_stager=WeaviateUploadStager,
-    upload_stager_config=WeaviateUploadStagerConfig,
-)
```
unstructured_ingest/v2/processes/embedder.py

```diff
@@ -16,6 +16,7 @@ class EmbedderConfig(BaseModel):
     embedding_provider: Optional[
         Literal[
             "openai",
+            "azure-openai",
             "huggingface",
             "aws-bedrock",
             "vertexai",
@@ -43,6 +44,14 @@ class EmbedderConfig(BaseModel):
     embedding_aws_region: Optional[str] = Field(
         default="us-west-2", description="AWS region used for AWS-based embedders, such as bedrock"
     )
+    embedding_azure_endpoint: Optional[str] = Field(
+        default=None,
+        description="Your Azure endpoint, including the resource, "
+        "e.g. `https://example-resource.azure.openai.com/`",
+    )
+    embedding_azure_api_version: Optional[str] = Field(
+        description="Azure API version", default=None
+    )
 
     def get_huggingface_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
         from unstructured_ingest.embed.huggingface import (
@@ -59,6 +68,25 @@ class EmbedderConfig(BaseModel):
 
         return OpenAIEmbeddingEncoder(config=OpenAIEmbeddingConfig.model_validate(embedding_kwargs))
 
+    def get_azure_openai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
+        from unstructured_ingest.embed.azure_openai import (
+            AzureOpenAIEmbeddingConfig,
+            AzureOpenAIEmbeddingEncoder,
+        )
+
+        config_kwargs = {
+            "api_key": self.embedding_api_key,
+            "azure_endpoint": self.embedding_azure_endpoint,
+        }
+        if api_version := self.embedding_azure_api_version:
+            config_kwargs["api_version"] = api_version
+        if model_name := self.embedding_model_name:
+            config_kwargs["model_name"] = model_name
+
+        return AzureOpenAIEmbeddingEncoder(
+            config=AzureOpenAIEmbeddingConfig.model_validate(config_kwargs)
+        )
+
     def get_octoai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
         from unstructured_ingest.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder
 
@@ -146,6 +174,8 @@ class EmbedderConfig(BaseModel):
             return self.get_mixedbread_embedder(embedding_kwargs=kwargs)
         if self.embedding_provider == "togetherai":
             return self.get_togetherai_embedder(embedding_kwargs=kwargs)
+        if self.embedding_provider == "azure-openai":
+            return self.get_azure_openai_embedder(embedding_kwargs=kwargs)
 
         raise ValueError(f"{self.embedding_provider} not a recognized encoder")
 
```
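With the provider wired into `EmbedderConfig`, callers select Azure the same way as any other encoder, which is exactly what the new integration test does. A minimal sketch (the environment variables match the test; the file path is a placeholder):

```python
import os
from pathlib import Path

from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig

config = EmbedderConfig(
    embedding_provider="azure-openai",
    embedding_api_key=os.environ["AZURE_OPENAI_API_KEY"],
    embedding_azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    # embedding_azure_api_version and embedding_model_name are optional overrides
)
embedder = Embedder(config=config)
elements = embedder.run(elements_filepath=Path("elements.json"))  # placeholder path
```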
{unstructured_ingest-0.3.3.dist-info → unstructured_ingest-0.3.5.dist-info}/METADATA

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: unstructured-ingest
-Version: 0.3.3
+Version: 0.3.5
 Summary: A library that prepares raw documents for downstream ML tasks.
 Home-page: https://github.com/Unstructured-IO/unstructured-ingest
 Author: Unstructured Technologies
@@ -22,13 +22,13 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.9.0,<3.13
 Description-Content-Type: text/markdown
 License-File: LICENSE.md
-Requires-Dist: dataclasses-json
-Requires-Dist: pydantic>=2.7
 Requires-Dist: pandas
+Requires-Dist: dataclasses-json
 Requires-Dist: tqdm
-Requires-Dist: python-dateutil
-Requires-Dist: click
 Requires-Dist: opentelemetry-sdk
+Requires-Dist: pydantic>=2.7
+Requires-Dist: click
+Requires-Dist: python-dateutil
 Provides-Extra: airtable
 Requires-Dist: pyairtable; extra == "airtable"
 Provides-Extra: astradb
@@ -51,8 +51,8 @@ Requires-Dist: chromadb; extra == "chroma"
 Provides-Extra: clarifai
 Requires-Dist: clarifai; extra == "clarifai"
 Provides-Extra: confluence
-Requires-Dist: atlassian-python-api; extra == "confluence"
 Requires-Dist: requests; extra == "confluence"
+Requires-Dist: atlassian-python-api; extra == "confluence"
 Provides-Extra: couchbase
 Requires-Dist: couchbase; extra == "couchbase"
 Provides-Extra: csv
@@ -60,8 +60,8 @@ Requires-Dist: unstructured[tsv]; extra == "csv"
 Provides-Extra: databricks-volumes
 Requires-Dist: databricks-sdk; extra == "databricks-volumes"
 Provides-Extra: delta-table
-Requires-Dist: boto3; extra == "delta-table"
 Requires-Dist: deltalake; extra == "delta-table"
+Requires-Dist: boto3; extra == "delta-table"
 Provides-Extra: discord
 Requires-Dist: discord-py; extra == "discord"
 Provides-Extra: doc
@@ -78,8 +78,8 @@ Requires-Dist: sentence-transformers; extra == "embed-huggingface"
 Provides-Extra: embed-mixedbreadai
 Requires-Dist: mixedbread-ai; extra == "embed-mixedbreadai"
 Provides-Extra: embed-octoai
-Requires-Dist: openai; extra == "embed-octoai"
 Requires-Dist: tiktoken; extra == "embed-octoai"
+Requires-Dist: openai; extra == "embed-octoai"
 Provides-Extra: embed-vertexai
 Requires-Dist: vertexai; extra == "embed-vertexai"
 Provides-Extra: embed-voyageai
@@ -88,8 +88,8 @@ Provides-Extra: epub
 Requires-Dist: unstructured[epub]; extra == "epub"
 Provides-Extra: gcs
 Requires-Dist: bs4; extra == "gcs"
-Requires-Dist: gcsfs; extra == "gcs"
 Requires-Dist: fsspec; extra == "gcs"
+Requires-Dist: gcsfs; extra == "gcs"
 Provides-Extra: github
 Requires-Dist: pygithub>1.58.0; extra == "github"
 Requires-Dist: requests; extra == "github"
@@ -117,26 +117,26 @@ Requires-Dist: pymongo; extra == "mongodb"
 Provides-Extra: msg
 Requires-Dist: unstructured[msg]; extra == "msg"
 Provides-Extra: notion
+Requires-Dist: notion-client; extra == "notion"
 Requires-Dist: backoff; extra == "notion"
 Requires-Dist: htmlBuilder; extra == "notion"
-Requires-Dist: notion-client; extra == "notion"
 Requires-Dist: httpx; extra == "notion"
 Provides-Extra: odt
 Requires-Dist: unstructured[odt]; extra == "odt"
 Provides-Extra: onedrive
-Requires-Dist: msal; extra == "onedrive"
-Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
 Requires-Dist: bs4; extra == "onedrive"
+Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
+Requires-Dist: msal; extra == "onedrive"
 Provides-Extra: openai
-Requires-Dist: openai; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
+Requires-Dist: openai; extra == "openai"
 Provides-Extra: opensearch
 Requires-Dist: opensearch-py; extra == "opensearch"
 Provides-Extra: org
 Requires-Dist: unstructured[org]; extra == "org"
 Provides-Extra: outlook
-Requires-Dist: msal; extra == "outlook"
 Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
+Requires-Dist: msal; extra == "outlook"
 Provides-Extra: pdf
 Requires-Dist: unstructured[pdf]; extra == "pdf"
 Provides-Extra: pinecone
@@ -158,16 +158,16 @@ Requires-Dist: unstructured[rst]; extra == "rst"
 Provides-Extra: rtf
 Requires-Dist: unstructured[rtf]; extra == "rtf"
 Provides-Extra: s3
-Requires-Dist: fsspec; extra == "s3"
 Requires-Dist: s3fs; extra == "s3"
+Requires-Dist: fsspec; extra == "s3"
 Provides-Extra: salesforce
 Requires-Dist: simple-salesforce; extra == "salesforce"
 Provides-Extra: sftp
 Requires-Dist: paramiko; extra == "sftp"
 Requires-Dist: fsspec; extra == "sftp"
 Provides-Extra: sharepoint
-Requires-Dist: msal; extra == "sharepoint"
 Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
+Requires-Dist: msal; extra == "sharepoint"
 Provides-Extra: singlestore
 Requires-Dist: singlestoredb; extra == "singlestore"
 Provides-Extra: slack
```
{unstructured_ingest-0.3.3.dist-info → unstructured_ingest-0.3.5.dist-info}/RECORD

```diff
@@ -10,11 +10,11 @@ test/integration/connectors/test_azure_ai_search.py,sha256=dae4GifRiKue5YpsxworD
 test/integration/connectors/test_confluence.py,sha256=xcPmZ_vi_pkCt-tUPn10P49FH9i_9YUbrAPO6fYk5rU,3521
 test/integration/connectors/test_delta_table.py,sha256=GSzWIkbEUzOrRPt2F1uO0dabcp7kTFDj75BhhI2y-WU,6856
 test/integration/connectors/test_kafka.py,sha256=j7jsNWZumNBv9v-5Bpx8geUUXpxxad5EuA4CMRsl4R8,7104
-test/integration/connectors/test_lancedb.py,sha256=
+test/integration/connectors/test_lancedb.py,sha256=U2HfIrf6iJ7lYMn-vz0j-LesVyDY-jc9QrQhlJVhG9Q,9183
 test/integration/connectors/test_milvus.py,sha256=p4UujDr_tsRaQDmhDmDZp38t8oSFm7hrTqiq6NNuhGo,5933
 test/integration/connectors/test_mongodb.py,sha256=YeS_DUnVYN02F76j87W8RhXGHnJMzQYb3n-L1-oWGXI,12254
 test/integration/connectors/test_onedrive.py,sha256=KIkBwKh1hnv203VCL2UABnDkS_bP4NxOFm1AL8EPGLA,3554
-test/integration/connectors/test_pinecone.py,sha256=
+test/integration/connectors/test_pinecone.py,sha256=i-v5WkAI9M6SUZI7ch9qdILlRHopAdptpkSY12-BaTk,9483
 test/integration/connectors/test_qdrant.py,sha256=ASvO-BNyhv8m8or28KljrJy27Da0uaTNeoR5w_QsvFg,5121
 test/integration/connectors/test_s3.py,sha256=YHEYMqWTKTfR7wlL4VoxtgMs1YiYKyhLIBdG-anaQGo,6896
 test/integration/connectors/databricks_tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -39,6 +39,7 @@ test/integration/connectors/weaviate/test_cloud.py,sha256=07VxNRxWWcgTstFfpoZ1Fl
 test/integration/connectors/weaviate/test_local.py,sha256=SK6iEwQUKiCd0X99BEk8GlQoLaCcJcFPt09NN526Ct0,4508
 test/integration/embedders/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/integration/embedders/conftest.py,sha256=B2W771RbijR7G_GybsCzRyIvOzXqzbKZdRIlNDd5AGY,334
+test/integration/embedders/test_azure_openai.py,sha256=6tFpKFBFRXD49imhhRzsvy3MPtuZ4L1PtnKyMVBRAqc,1808
 test/integration/embedders/test_bedrock.py,sha256=0oBRNS_DtFDGQ22Z1T3t6VOJ31PrItgvnJpqcLe9Fg4,1903
 test/integration/embedders/test_huggingface.py,sha256=0mMTOO-Nh7KB70AGs_7LLQIxMYrnSPqyihriUeqACbM,1007
 test/integration/embedders/test_mixedbread.py,sha256=RrLv8SByMNXsgrlh94RbaT-VyxZ4-DILO-OPpmOwvSI,1441
@@ -82,7 +83,7 @@ test/unit/v2/partitioners/test_partitioner.py,sha256=iIYg7IpftV3LusoO4H8tr1IHY1U
 test/unit/v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/unit/v2/utils/data_generator.py,sha256=UoYVNjG4S4wlaA9gceQ82HIpF9_6I1UTHD1_GrQBHp0,973
 unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
-unstructured_ingest/__version__.py,sha256=
+unstructured_ingest/__version__.py,sha256=70Yw9e-njzEFR9kr-pzp5J1EslWrJuu4TCVbxa-fdmM,42
 unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
 unstructured_ingest/interfaces.py,sha256=OYVUP0bzBJpT-Lz92BDyz_hLBvyfxkuSwWHhUdnUayA,31493
 unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
@@ -250,6 +251,7 @@ unstructured_ingest/connector/notion/types/database_properties/unique_id.py,sha2
 unstructured_ingest/connector/notion/types/database_properties/url.py,sha256=iXQ2tVUm9UlKVtDA0NQiFIRJ5PHYW9wOaWt2vFfSVCg,862
 unstructured_ingest/connector/notion/types/database_properties/verification.py,sha256=J_DLjY-v2T6xDGMQ7FkI0YMKMA6SG6Y3yYW7qUD1hKA,2334
 unstructured_ingest/embed/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+unstructured_ingest/embed/azure_openai.py,sha256=4YBOIxv66wVZ5EqNNC4uCDPNJ3VrsLPe5wwagT6zqe0,1001
 unstructured_ingest/embed/bedrock.py,sha256=-PRdZsF44vwi6G4G75gdO31AJKfZWClOXkJQAk7rEO8,3096
 unstructured_ingest/embed/huggingface.py,sha256=2cBiQhOhfWHX3hS-eKjocysOkUaRlyRfUj9Kxjrp6cE,1934
 unstructured_ingest/embed/interfaces.py,sha256=au4Xp8ciDvo4bidlUbazFW2aC7NZW5-UDLKXBFVzAX4,2025
@@ -389,7 +391,7 @@ unstructured_ingest/v2/pipeline/steps/upload.py,sha256=zlgXgwReX9TBOdfTpS9hETah4
 unstructured_ingest/v2/processes/__init__.py,sha256=FaHWSCGyc7GWVnAsNEUUj7L8hT8gCVY3_hUE2VzWtUg,462
 unstructured_ingest/v2/processes/chunker.py,sha256=31-7ojsM2coIt2rMR0KOb82IxLVJfNHbqYUOsDkhxN8,5491
 unstructured_ingest/v2/processes/connector_registry.py,sha256=vkEe6jpgdYtZCxMj59s5atWGgmPuxAEXRUoTt-MJ7wc,2198
-unstructured_ingest/v2/processes/embedder.py,sha256=
+unstructured_ingest/v2/processes/embedder.py,sha256=xCBpaL07WnVUOUW8SHktaf1vwBGZxl3Nf8-99509ClQ,7721
 unstructured_ingest/v2/processes/filter.py,sha256=kjUmMw2SDq2bme0JCAOxs6cJriIG6Ty09KOznS-xz08,2145
 unstructured_ingest/v2/processes/partitioner.py,sha256=agpHwB9FR8OZVQqE7zFEb0IcDPCOPA_BZjLzLF71nOY,8194
 unstructured_ingest/v2/processes/uncompress.py,sha256=Z_XfsITGdyaRwhtNUc7bMj5Y2jLuBge8KoK4nxhqKag,2425
@@ -399,7 +401,7 @@ unstructured_ingest/v2/processes/connectors/astradb.py,sha256=QTUQ-cv_iZi9eaXRRH
 unstructured_ingest/v2/processes/connectors/azure_ai_search.py,sha256=-6IijSWGqj-85vD0c4l5wdMHp-LF371jO8j53PPRB4I,12002
 unstructured_ingest/v2/processes/connectors/chroma.py,sha256=skrxRPHZ8y3JxNa0dt5SVitHiDQ5WVxLvY_kh2-QUrQ,8029
 unstructured_ingest/v2/processes/connectors/confluence.py,sha256=qQApDcmPBGg4tHXwSOj4JPkAbrO9GQ4NRlaETjhp25U,7003
-unstructured_ingest/v2/processes/connectors/couchbase.py,sha256=
+unstructured_ingest/v2/processes/connectors/couchbase.py,sha256=LbUJLt6fqaNYSmy9vUiovG-UOALMcvh8OD-gZAaf-f4,12333
 unstructured_ingest/v2/processes/connectors/delta_table.py,sha256=1yS7ivEyiucwd_kv6LL5HQdGabT43yeG6XCdwiz89hc,8019
 unstructured_ingest/v2/processes/connectors/gitlab.py,sha256=yBgCeLy9iCVI8bBDcHHuHB0H3BO05e9E1OccbHwvKAo,9724
 unstructured_ingest/v2/processes/connectors/google_drive.py,sha256=EEwXK1Anlu-eXl5qxmdDIqPYW7eMSez6WGlTPG2vSn8,13121
@@ -409,7 +411,7 @@ unstructured_ingest/v2/processes/connectors/milvus.py,sha256=3sV0Yv2vYMLyxszKCqA
 unstructured_ingest/v2/processes/connectors/mongodb.py,sha256=XLuprTCY0D9tAh_qn81MjJrDN9YaNqMlKe7BJl3eTZc,14998
 unstructured_ingest/v2/processes/connectors/onedrive.py,sha256=heZMtOIrCySi552ldIk8iH0pSRXZ0W2LeD-CcNOwCFQ,15979
 unstructured_ingest/v2/processes/connectors/outlook.py,sha256=KgNGM8hImRhy6_SpswRP2VwRD4VOrqqJoySgxf2oduI,9290
-unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=
+unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=-J6QPJv_jmjln8cTUsfEEAyd_hi_fmD-uwB6C84rA4w,11930
 unstructured_ingest/v2/processes/connectors/salesforce.py,sha256=2CiO2ZZiZ1Y1-nB7wcDlDVcpW2B7ut9wCj66rkkqho0,11616
 unstructured_ingest/v2/processes/connectors/sharepoint.py,sha256=Ndn2Wm7RupfjAtlLxxQwJueeE0V8aGMbNVPuFq9nqdQ,19730
 unstructured_ingest/v2/processes/connectors/slack.py,sha256=Z73VmQ3oUY09KoLEi5OBdQeDt4ONEY_02SglWQc6HXE,9252
@@ -421,7 +423,7 @@ unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py,sha256=P
 unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py,sha256=UUotY_-HpgSEJkvdQfZTlbxY7CRLZ4ctL8TlryeFvxk,2790
 unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py,sha256=Wk7s2_u5G0BOV5slvGc8IlUf7ivznY9PrgPqe6nlJKM,2897
 unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py,sha256=Zzc0JNPP-eFqpwWw1Gp-XC8H-s__IgkYKzoagECycZY,829
-unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py,sha256=
+unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py,sha256=sI58uypWr1mpSl4bxr46nIfypGZ4aqryCT83qqCVnSM,18921
 unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py,sha256=qRz8Fyr2RSZIPZGkhPeme6AZxM0aX-c_xOa1ZtSr2Kg,6781
 unstructured_ingest/v2/processes/connectors/fsspec/__init__.py,sha256=TtdeImM7Ypl_n6sl7I1JqX6bGSG0t_FqvCqE3Cy24og,1846
 unstructured_ingest/v2/processes/connectors/fsspec/azure.py,sha256=Y01BuVRql0Kvzc_cdaZE9dDGYjJzrwJu-etfUrEGcUU,7061
@@ -441,7 +443,7 @@ unstructured_ingest/v2/processes/connectors/lancedb/aws.py,sha256=eeXWsh8UeVm1Ur
 unstructured_ingest/v2/processes/connectors/lancedb/azure.py,sha256=Ms5vQVRIpTF1Q2qBl_bET9wbgaf4diPaH-iR8kJlr4E,1461
 unstructured_ingest/v2/processes/connectors/lancedb/cloud.py,sha256=BFy0gW2OZ_qaZJM97m-tNsFaJPi9zOKrrd2y4thcNP0,1341
 unstructured_ingest/v2/processes/connectors/lancedb/gcp.py,sha256=p5BPaFtS3y3Yh8PIr3tUqsAXrUYu4QYYAWQNh5W2ucE,1361
-unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py,sha256=
+unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py,sha256=7FODnesYu8cFx1PeQJZxXij-8Dei4Kk3Bs0oxoUGBtI,5745
 unstructured_ingest/v2/processes/connectors/lancedb/local.py,sha256=_7-6iO6B60gAWwJUUrmlsRzYMFIBeZgu_QT3mhw5L0I,1272
 unstructured_ingest/v2/processes/connectors/qdrant/__init__.py,sha256=xM19uYzAuGizVoZIM_hnVZ5AcBN69aOBGpqZcpWPtuE,760
 unstructured_ingest/v2/processes/connectors/qdrant/cloud.py,sha256=accJ4sNWBVWV-KiVBDBDBYYx5A9CUoikP5NCErRmfik,1624
@@ -454,14 +456,14 @@ unstructured_ingest/v2/processes/connectors/sql/singlestore.py,sha256=YrmhAL1RQ1
 unstructured_ingest/v2/processes/connectors/sql/snowflake.py,sha256=jl524VudwmFK63emCT7DmZan_EWJAMiGir5_zoO9FuY,5697
 unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=LFzGeAUagLknK07DsXg2oSG7ZAgR6VqT9wfI_tYlHUg,14782
 unstructured_ingest/v2/processes/connectors/sql/sqlite.py,sha256=9605K36nQ5-gBxzt1daYKYotON1SE85RETusqCJrbdk,5230
-unstructured_ingest/v2/processes/connectors/weaviate/__init__.py,sha256=
+unstructured_ingest/v2/processes/connectors/weaviate/__init__.py,sha256=NMiwnVWan69KnzVELvaqX34tMhCytIa-C8EDsXVKsEo,856
 unstructured_ingest/v2/processes/connectors/weaviate/cloud.py,sha256=bXtfEYLquR-BszZ5S_lQ4JbETNs9Vozgpfm8x9egAmE,6251
 unstructured_ingest/v2/processes/connectors/weaviate/embedded.py,sha256=S8Zg8StuZT-k7tCg1D5YShO1-vJYYk9-M1bE1fIqx64,3014
 unstructured_ingest/v2/processes/connectors/weaviate/local.py,sha256=LuTBKPseVewsz8VqxRPRLfGEm3BeI9nBZxpy7ZU5tOA,2201
-unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py,sha256=
-unstructured_ingest-0.3.
-unstructured_ingest-0.3.
-unstructured_ingest-0.3.
-unstructured_ingest-0.3.
-unstructured_ingest-0.3.
-unstructured_ingest-0.3.
+unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py,sha256=dBDC_M8GVKupl7i9UMRCZyRIUv6gTkq8bJE_SILydAc,11291
+unstructured_ingest-0.3.5.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
+unstructured_ingest-0.3.5.dist-info/METADATA,sha256=6lMRXK_RZho8cMblH299fqDfZix6a9843VGiPvhnDV8,7393
+unstructured_ingest-0.3.5.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
+unstructured_ingest-0.3.5.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
+unstructured_ingest-0.3.5.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
+unstructured_ingest-0.3.5.dist-info/RECORD,,
```
{unstructured_ingest-0.3.3.dist-info → unstructured_ingest-0.3.5.dist-info}/LICENSE.md
RENAMED (file without changes)

{unstructured_ingest-0.3.3.dist-info → unstructured_ingest-0.3.5.dist-info}/WHEEL
RENAMED (file without changes)

{unstructured_ingest-0.3.3.dist-info → unstructured_ingest-0.3.5.dist-info}/entry_points.txt
RENAMED (file without changes)

{unstructured_ingest-0.3.3.dist-info → unstructured_ingest-0.3.5.dist-info}/top_level.txt
RENAMED (file without changes)