unstructured-ingest 0.3.2__py3-none-any.whl → 0.3.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- test/integration/connectors/test_lancedb.py +7 -7
- test/integration/connectors/test_milvus.py +34 -6
- test/integration/connectors/weaviate/test_cloud.py +34 -0
- test/integration/embedders/test_azure_openai.py +59 -0
- test/unit/test_utils.py +21 -1
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/embed/azure_openai.py +31 -0
- unstructured_ingest/utils/string_and_date_utils.py +10 -0
- unstructured_ingest/v2/processes/connectors/astradb.py +16 -0
- unstructured_ingest/v2/processes/connectors/couchbase.py +4 -1
- unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +17 -4
- unstructured_ingest/v2/processes/connectors/lancedb/aws.py +7 -7
- unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
- unstructured_ingest/v2/processes/connectors/milvus.py +9 -3
- unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +4 -3
- unstructured_ingest/v2/processes/embedder.py +30 -0
- {unstructured_ingest-0.3.2.dist-info → unstructured_ingest-0.3.4.dist-info}/METADATA +17 -15
- {unstructured_ingest-0.3.2.dist-info → unstructured_ingest-0.3.4.dist-info}/RECORD +22 -18
- {unstructured_ingest-0.3.2.dist-info → unstructured_ingest-0.3.4.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.3.2.dist-info → unstructured_ingest-0.3.4.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.3.2.dist-info → unstructured_ingest-0.3.4.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.3.2.dist-info → unstructured_ingest-0.3.4.dist-info}/top_level.txt +0 -0
|
@@ -14,9 +14,9 @@ from upath import UPath
|
|
|
14
14
|
from test.integration.connectors.utils.constants import DESTINATION_TAG
|
|
15
15
|
from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
|
|
16
16
|
from unstructured_ingest.v2.processes.connectors.lancedb.aws import (
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
17
|
+
LanceDBAwsAccessConfig,
|
|
18
|
+
LanceDBAwsConnectionConfig,
|
|
19
|
+
LanceDBAwsUploader,
|
|
20
20
|
)
|
|
21
21
|
from unstructured_ingest.v2.processes.connectors.lancedb.azure import (
|
|
22
22
|
LanceDBAzureAccessConfig,
|
|
@@ -156,7 +156,7 @@ def _get_uri(target: Literal["local", "s3", "gcs", "az"], local_base_path: Path)
|
|
|
156
156
|
|
|
157
157
|
def _get_uploader(
|
|
158
158
|
uri: str,
|
|
159
|
-
) -> Union[LanceDBAzureUploader, LanceDBAzureUploader,
|
|
159
|
+
) -> Union[LanceDBAzureUploader, LanceDBAzureUploader, LanceDBAwsUploader, LanceDBGSPUploader]:
|
|
160
160
|
target = uri.split("://", maxsplit=1)[0] if uri.startswith(("s3", "az", "gs")) else "local"
|
|
161
161
|
if target == "az":
|
|
162
162
|
azure_connection_string = os.getenv("AZURE_DEST_CONNECTION_STR")
|
|
@@ -170,10 +170,10 @@ def _get_uploader(
|
|
|
170
170
|
)
|
|
171
171
|
|
|
172
172
|
elif target == "s3":
|
|
173
|
-
return
|
|
173
|
+
return LanceDBAwsUploader(
|
|
174
174
|
upload_config=LanceDBUploaderConfig(table_name=TABLE_NAME),
|
|
175
|
-
connection_config=
|
|
176
|
-
access_config=
|
|
175
|
+
connection_config=LanceDBAwsConnectionConfig(
|
|
176
|
+
access_config=LanceDBAwsAccessConfig(
|
|
177
177
|
aws_access_key_id=os.getenv("S3_INGEST_TEST_ACCESS_KEY"),
|
|
178
178
|
aws_secret_access_key=os.getenv("S3_INGEST_TEST_SECRET_KEY"),
|
|
179
179
|
),
|
|
@@ -15,6 +15,7 @@ from pymilvus.milvus_client import IndexParams
|
|
|
15
15
|
from test.integration.connectors.utils.constants import DESTINATION_TAG, env_setup_path
|
|
16
16
|
from test.integration.connectors.utils.docker import healthcheck_wait
|
|
17
17
|
from test.integration.connectors.utils.docker_compose import docker_compose_context
|
|
18
|
+
from unstructured_ingest.error import DestinationConnectionError
|
|
18
19
|
from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
|
|
19
20
|
from unstructured_ingest.v2.processes.connectors.milvus import (
|
|
20
21
|
CONNECTOR_TYPE,
|
|
@@ -24,9 +25,10 @@ from unstructured_ingest.v2.processes.connectors.milvus import (
|
|
|
24
25
|
MilvusUploadStager,
|
|
25
26
|
)
|
|
26
27
|
|
|
27
|
-
DB_URI = "http://localhost:19530"
|
|
28
28
|
DB_NAME = "test_database"
|
|
29
|
-
|
|
29
|
+
EXISTENT_COLLECTION_NAME = "test_collection"
|
|
30
|
+
NONEXISTENT_COLLECTION_NAME = "nonexistent_collection"
|
|
31
|
+
DB_URI = "http://localhost:19530"
|
|
30
32
|
|
|
31
33
|
|
|
32
34
|
def get_schema() -> CollectionSchema:
|
|
@@ -55,7 +57,9 @@ def get_index_params() -> IndexParams:
|
|
|
55
57
|
return index_params
|
|
56
58
|
|
|
57
59
|
|
|
58
|
-
|
|
60
|
+
# NOTE: Precheck tests are read-only so they don't interfere with destination test,
|
|
61
|
+
# using scope="module" we can limit number of times the docker-compose has to be run
|
|
62
|
+
@pytest.fixture(scope="module")
|
|
59
63
|
def collection():
|
|
60
64
|
docker_client = docker.from_env()
|
|
61
65
|
with docker_compose_context(docker_compose_path=env_setup_path / "milvus"):
|
|
@@ -73,10 +77,10 @@ def collection():
|
|
|
73
77
|
schema = get_schema()
|
|
74
78
|
index_params = get_index_params()
|
|
75
79
|
collection_resp = milvus_client.create_collection(
|
|
76
|
-
collection_name=
|
|
80
|
+
collection_name=EXISTENT_COLLECTION_NAME, schema=schema, index_params=index_params
|
|
77
81
|
)
|
|
78
|
-
print(f"Created collection {
|
|
79
|
-
yield
|
|
82
|
+
print(f"Created collection {EXISTENT_COLLECTION_NAME}: {collection_resp}")
|
|
83
|
+
yield EXISTENT_COLLECTION_NAME
|
|
80
84
|
finally:
|
|
81
85
|
milvus_client.close()
|
|
82
86
|
|
|
@@ -139,3 +143,27 @@ async def test_milvus_destination(
|
|
|
139
143
|
uploader.run(path=staged_filepath, file_data=file_data)
|
|
140
144
|
with uploader.get_client() as client:
|
|
141
145
|
validate_count(client=client, expected_count=expected_count)
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
|
|
149
|
+
def test_precheck_succeeds(collection: str):
|
|
150
|
+
uploader = MilvusUploader(
|
|
151
|
+
connection_config=MilvusConnectionConfig(uri=DB_URI),
|
|
152
|
+
upload_config=MilvusUploaderConfig(db_name=DB_NAME, collection_name=collection),
|
|
153
|
+
)
|
|
154
|
+
uploader.precheck()
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
|
|
158
|
+
def test_precheck_fails_on_nonexistent_collection(collection: str):
|
|
159
|
+
uploader = MilvusUploader(
|
|
160
|
+
connection_config=MilvusConnectionConfig(uri=DB_URI),
|
|
161
|
+
upload_config=MilvusUploaderConfig(
|
|
162
|
+
db_name=DB_NAME, collection_name=NONEXISTENT_COLLECTION_NAME
|
|
163
|
+
),
|
|
164
|
+
)
|
|
165
|
+
with pytest.raises(
|
|
166
|
+
DestinationConnectionError,
|
|
167
|
+
match=f"Collection '{NONEXISTENT_COLLECTION_NAME}' does not exist",
|
|
168
|
+
):
|
|
169
|
+
uploader.precheck()
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
from pydantic import ValidationError
|
|
3
|
+
|
|
4
|
+
from unstructured_ingest.v2.processes.connectors.weaviate.cloud import (
|
|
5
|
+
CloudWeaviateAccessConfig,
|
|
6
|
+
CloudWeaviateConnectionConfig,
|
|
7
|
+
)
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def test_weaviate_failing_connection_config():
|
|
11
|
+
with pytest.raises(ValidationError):
|
|
12
|
+
CloudWeaviateConnectionConfig(
|
|
13
|
+
access_config=CloudWeaviateAccessConfig(api_key="my key", password="password"),
|
|
14
|
+
username="username",
|
|
15
|
+
cluster_url="clusterurl",
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def test_weaviate_connection_config_happy_path():
|
|
20
|
+
CloudWeaviateConnectionConfig(
|
|
21
|
+
access_config=CloudWeaviateAccessConfig(
|
|
22
|
+
api_key="my key",
|
|
23
|
+
),
|
|
24
|
+
cluster_url="clusterurl",
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def test_weaviate_connection_config_anonymous():
|
|
29
|
+
CloudWeaviateConnectionConfig(
|
|
30
|
+
access_config=CloudWeaviateAccessConfig(api_key="my key", password="password"),
|
|
31
|
+
username="username",
|
|
32
|
+
anonymous=True,
|
|
33
|
+
cluster_url="clusterurl",
|
|
34
|
+
)
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import os
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
|
|
7
|
+
from test.integration.utils import requires_env
|
|
8
|
+
from unstructured_ingest.embed.azure_openai import (
|
|
9
|
+
AzureOpenAIEmbeddingConfig,
|
|
10
|
+
AzureOpenAIEmbeddingEncoder,
|
|
11
|
+
)
|
|
12
|
+
from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
|
|
13
|
+
|
|
14
|
+
API_KEY = "AZURE_OPENAI_API_KEY"
|
|
15
|
+
ENDPOINT = "AZURE_OPENAI_ENDPOINT"
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass(frozen=True)
|
|
19
|
+
class AzureData:
|
|
20
|
+
api_key: str
|
|
21
|
+
endpoint: str
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def get_azure_data() -> AzureData:
|
|
25
|
+
api_key = os.getenv(API_KEY, None)
|
|
26
|
+
assert api_key
|
|
27
|
+
endpoint = os.getenv(ENDPOINT, None)
|
|
28
|
+
assert endpoint
|
|
29
|
+
return AzureData(api_key, endpoint)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@requires_env(API_KEY, ENDPOINT)
|
|
33
|
+
def test_azure_openai_embedder(embedder_file: Path):
|
|
34
|
+
azure_data = get_azure_data()
|
|
35
|
+
embedder_config = EmbedderConfig(
|
|
36
|
+
embedding_provider="azure-openai",
|
|
37
|
+
embedding_api_key=azure_data.api_key,
|
|
38
|
+
embedding_azure_endpoint=azure_data.endpoint,
|
|
39
|
+
)
|
|
40
|
+
embedder = Embedder(config=embedder_config)
|
|
41
|
+
results = embedder.run(elements_filepath=embedder_file)
|
|
42
|
+
assert results
|
|
43
|
+
with embedder_file.open("r") as f:
|
|
44
|
+
original_elements = json.load(f)
|
|
45
|
+
validate_embedding_output(original_elements=original_elements, output_elements=results)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
@requires_env(API_KEY, ENDPOINT)
|
|
49
|
+
def test_raw_azure_openai_embedder(embedder_file: Path):
|
|
50
|
+
azure_data = get_azure_data()
|
|
51
|
+
embedder = AzureOpenAIEmbeddingEncoder(
|
|
52
|
+
config=AzureOpenAIEmbeddingConfig(
|
|
53
|
+
api_key=azure_data.api_key,
|
|
54
|
+
azure_endpoint=azure_data.endpoint,
|
|
55
|
+
)
|
|
56
|
+
)
|
|
57
|
+
validate_raw_embedder(
|
|
58
|
+
embedder=embedder, embedder_file=embedder_file, expected_dimensions=(1536,)
|
|
59
|
+
)
|
test/unit/test_utils.py
CHANGED
|
@@ -8,7 +8,11 @@ import pytz
|
|
|
8
8
|
|
|
9
9
|
from unstructured_ingest.cli.utils import extract_config
|
|
10
10
|
from unstructured_ingest.interfaces import BaseConfig
|
|
11
|
-
from unstructured_ingest.utils.string_and_date_utils import ensure_isoformat_datetime, json_to_dict
|
|
11
|
+
from unstructured_ingest.utils.string_and_date_utils import (
|
|
12
|
+
ensure_isoformat_datetime,
|
|
13
|
+
json_to_dict,
|
|
14
|
+
truncate_string_bytes,
|
|
15
|
+
)
|
|
12
16
|
|
|
13
17
|
|
|
14
18
|
@dataclass
|
|
@@ -162,3 +166,19 @@ def test_ensure_isoformat_datetime_fails_on_string():
|
|
|
162
166
|
def test_ensure_isoformat_datetime_fails_on_int():
|
|
163
167
|
with pytest.raises(TypeError):
|
|
164
168
|
ensure_isoformat_datetime(1111)
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def test_truncate_string_bytes_return_truncated_string():
|
|
172
|
+
test_string = "abcdef안녕하세요ghijklmn방갑습니opqrstu 더 길어지면 안되는 문자열vwxyz"
|
|
173
|
+
max_bytes = 11
|
|
174
|
+
result = truncate_string_bytes(test_string, max_bytes)
|
|
175
|
+
assert result == "abcdef안"
|
|
176
|
+
assert len(result.encode("utf-8")) <= max_bytes
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def test_truncate_string_bytes_return_untouched_string():
|
|
180
|
+
test_string = "abcdef"
|
|
181
|
+
max_bytes = 11
|
|
182
|
+
result = truncate_string_bytes(test_string, max_bytes)
|
|
183
|
+
assert result == "abcdef"
|
|
184
|
+
assert len(result.encode("utf-8")) <= max_bytes
|
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.3.2" # pragma: no cover
|
|
1
|
+
__version__ = "0.3.4" # pragma: no cover
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from typing import TYPE_CHECKING
|
|
3
|
+
|
|
4
|
+
from pydantic import Field
|
|
5
|
+
|
|
6
|
+
from unstructured_ingest.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder
|
|
7
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from openai import AzureOpenAI
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class AzureOpenAIEmbeddingConfig(OpenAIEmbeddingConfig):
|
|
14
|
+
api_version: str = Field(description="Azure API version", default="2024-06-01")
|
|
15
|
+
azure_endpoint: str
|
|
16
|
+
embedder_model_name: str = Field(default="text-embedding-ada-002", alias="model_name")
|
|
17
|
+
|
|
18
|
+
@requires_dependencies(["openai"], extras="openai")
|
|
19
|
+
def get_client(self) -> "AzureOpenAI":
|
|
20
|
+
from openai import AzureOpenAI
|
|
21
|
+
|
|
22
|
+
return AzureOpenAI(
|
|
23
|
+
api_key=self.api_key.get_secret_value(),
|
|
24
|
+
api_version=self.api_version,
|
|
25
|
+
azure_endpoint=self.azure_endpoint,
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass
|
|
30
|
+
class AzureOpenAIEmbeddingEncoder(OpenAIEmbeddingEncoder):
|
|
31
|
+
config: AzureOpenAIEmbeddingConfig
|
|
@@ -37,3 +37,13 @@ def ensure_isoformat_datetime(timestamp: t.Union[datetime, str]) -> str:
|
|
|
37
37
|
raise ValueError(f"String '{timestamp}' could not be parsed as a datetime.") from e
|
|
38
38
|
else:
|
|
39
39
|
raise TypeError(f"Expected input type datetime or str, but got {type(timestamp)}.")
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def truncate_string_bytes(string: str, max_bytes: int, encoding: str = "utf-8") -> str:
|
|
43
|
+
"""
|
|
44
|
+
Truncates a string to a specified maximum number of bytes.
|
|
45
|
+
"""
|
|
46
|
+
encoded_string = str(string).encode(encoding)
|
|
47
|
+
if len(encoded_string) <= max_bytes:
|
|
48
|
+
return string
|
|
49
|
+
return encoded_string[:max_bytes].decode(encoding, errors="ignore")
|
|
@@ -19,6 +19,7 @@ from unstructured_ingest.error import (
|
|
|
19
19
|
)
|
|
20
20
|
from unstructured_ingest.utils.data_prep import batch_generator
|
|
21
21
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
22
|
+
from unstructured_ingest.utils.string_and_date_utils import truncate_string_bytes
|
|
22
23
|
from unstructured_ingest.v2.constants import RECORD_ID_LABEL
|
|
23
24
|
from unstructured_ingest.v2.interfaces import (
|
|
24
25
|
AccessConfig,
|
|
@@ -50,6 +51,8 @@ if TYPE_CHECKING:
|
|
|
50
51
|
|
|
51
52
|
CONNECTOR_TYPE = "astradb"
|
|
52
53
|
|
|
54
|
+
MAX_CONTENT_PARAM_BYTE_SIZE = 8000
|
|
55
|
+
|
|
53
56
|
|
|
54
57
|
class AstraDBAccessConfig(AccessConfig):
|
|
55
58
|
token: str = Field(description="Astra DB Token with access to the database.")
|
|
@@ -301,7 +304,20 @@ class AstraDBUploadStager(UploadStager):
|
|
|
301
304
|
default_factory=lambda: AstraDBUploadStagerConfig()
|
|
302
305
|
)
|
|
303
306
|
|
|
307
|
+
def truncate_dict_elements(self, element_dict: dict) -> None:
|
|
308
|
+
text = element_dict.pop("text", None)
|
|
309
|
+
if text is not None:
|
|
310
|
+
element_dict["text"] = truncate_string_bytes(text, MAX_CONTENT_PARAM_BYTE_SIZE)
|
|
311
|
+
metadata = element_dict.get("metadata")
|
|
312
|
+
if metadata is not None and isinstance(metadata, dict):
|
|
313
|
+
text_as_html = element_dict["metadata"].pop("text_as_html", None)
|
|
314
|
+
if text_as_html is not None:
|
|
315
|
+
element_dict["metadata"]["text_as_html"] = truncate_string_bytes(
|
|
316
|
+
text_as_html, MAX_CONTENT_PARAM_BYTE_SIZE
|
|
317
|
+
)
|
|
318
|
+
|
|
304
319
|
def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
|
|
320
|
+
self.truncate_dict_elements(element_dict)
|
|
305
321
|
return {
|
|
306
322
|
"$vector": element_dict.pop("embeddings", None),
|
|
307
323
|
"content": element_dict.pop("text", None),
|
|
@@ -219,6 +219,9 @@ class CouchbaseIndexer(Indexer):
|
|
|
219
219
|
|
|
220
220
|
|
|
221
221
|
class CouchbaseDownloaderConfig(DownloaderConfig):
|
|
222
|
+
collection_id: str = Field(
|
|
223
|
+
default="id", description="The unique key of the id field in the collection"
|
|
224
|
+
)
|
|
222
225
|
fields: list[str] = field(default_factory=list)
|
|
223
226
|
|
|
224
227
|
|
|
@@ -250,7 +253,7 @@ class CouchbaseDownloader(Downloader):
|
|
|
250
253
|
def generate_download_response(
|
|
251
254
|
self, result: dict, bucket: str, file_data: FileData
|
|
252
255
|
) -> DownloadResponse:
|
|
253
|
-
record_id = result["id"]
|
|
256
|
+
record_id = result[self.download_config.collection_id]
|
|
254
257
|
filename_id = self.get_identifier(bucket=bucket, record_id=record_id)
|
|
255
258
|
filename = f"{filename_id}.txt"
|
|
256
259
|
download_path = self.download_dir / Path(filename)
|
|
@@ -6,12 +6,25 @@ from .aws import CONNECTOR_TYPE as LANCEDB_S3_CONNECTOR_TYPE
|
|
|
6
6
|
from .aws import lancedb_aws_destination_entry
|
|
7
7
|
from .azure import CONNECTOR_TYPE as LANCEDB_AZURE_CONNECTOR_TYPE
|
|
8
8
|
from .azure import lancedb_azure_destination_entry
|
|
9
|
+
from .cloud import CONNECTOR_TYPE as LANCEDB_CLOUD_CONNECTOR_TYPE
|
|
10
|
+
from .cloud import lancedb_cloud_destination_entry
|
|
9
11
|
from .gcp import CONNECTOR_TYPE as LANCEDB_GCS_CONNECTOR_TYPE
|
|
10
12
|
from .gcp import lancedb_gcp_destination_entry
|
|
11
13
|
from .local import CONNECTOR_TYPE as LANCEDB_LOCAL_CONNECTOR_TYPE
|
|
12
14
|
from .local import lancedb_local_destination_entry
|
|
13
15
|
|
|
14
|
-
add_destination_entry(
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
add_destination_entry(
|
|
16
|
+
add_destination_entry(
|
|
17
|
+
destination_type=LANCEDB_S3_CONNECTOR_TYPE, entry=lancedb_aws_destination_entry
|
|
18
|
+
)
|
|
19
|
+
add_destination_entry(
|
|
20
|
+
destination_type=LANCEDB_AZURE_CONNECTOR_TYPE, entry=lancedb_azure_destination_entry
|
|
21
|
+
)
|
|
22
|
+
add_destination_entry(
|
|
23
|
+
destination_type=LANCEDB_GCS_CONNECTOR_TYPE, entry=lancedb_gcp_destination_entry
|
|
24
|
+
)
|
|
25
|
+
add_destination_entry(
|
|
26
|
+
destination_type=LANCEDB_LOCAL_CONNECTOR_TYPE, entry=lancedb_local_destination_entry
|
|
27
|
+
)
|
|
28
|
+
add_destination_entry(
|
|
29
|
+
destination_type=LANCEDB_CLOUD_CONNECTOR_TYPE, entry=lancedb_cloud_destination_entry
|
|
30
|
+
)
|
|
@@ -15,28 +15,28 @@ from unstructured_ingest.v2.processes.connectors.lancedb.lancedb import (
|
|
|
15
15
|
CONNECTOR_TYPE = "lancedb_aws"
|
|
16
16
|
|
|
17
17
|
|
|
18
|
-
class
|
|
18
|
+
class LanceDBAwsAccessConfig(AccessConfig):
|
|
19
19
|
aws_access_key_id: str = Field(description="The AWS access key ID to use.")
|
|
20
20
|
aws_secret_access_key: str = Field(description="The AWS secret access key to use.")
|
|
21
21
|
|
|
22
22
|
|
|
23
|
-
class
|
|
24
|
-
access_config: Secret[
|
|
23
|
+
class LanceDBAwsConnectionConfig(LanceDBRemoteConnectionConfig):
|
|
24
|
+
access_config: Secret[LanceDBAwsAccessConfig]
|
|
25
25
|
|
|
26
26
|
def get_storage_options(self) -> dict:
|
|
27
27
|
return {**self.access_config.get_secret_value().model_dump(), "timeout": self.timeout}
|
|
28
28
|
|
|
29
29
|
|
|
30
30
|
@dataclass
|
|
31
|
-
class
|
|
31
|
+
class LanceDBAwsUploader(LanceDBUploader):
|
|
32
32
|
upload_config: LanceDBUploaderConfig
|
|
33
|
-
connection_config:
|
|
33
|
+
connection_config: LanceDBAwsConnectionConfig
|
|
34
34
|
connector_type: str = CONNECTOR_TYPE
|
|
35
35
|
|
|
36
36
|
|
|
37
37
|
lancedb_aws_destination_entry = DestinationRegistryEntry(
|
|
38
|
-
connection_config=
|
|
39
|
-
uploader=
|
|
38
|
+
connection_config=LanceDBAwsConnectionConfig,
|
|
39
|
+
uploader=LanceDBAwsUploader,
|
|
40
40
|
uploader_config=LanceDBUploaderConfig,
|
|
41
41
|
upload_stager_config=LanceDBUploadStagerConfig,
|
|
42
42
|
upload_stager=LanceDBUploadStager,
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
|
|
3
|
+
from pydantic import Field, Secret
|
|
4
|
+
|
|
5
|
+
from unstructured_ingest.v2.interfaces.connector import AccessConfig
|
|
6
|
+
from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
|
|
7
|
+
from unstructured_ingest.v2.processes.connectors.lancedb.lancedb import (
|
|
8
|
+
LanceDBRemoteConnectionConfig,
|
|
9
|
+
LanceDBUploader,
|
|
10
|
+
LanceDBUploaderConfig,
|
|
11
|
+
LanceDBUploadStager,
|
|
12
|
+
LanceDBUploadStagerConfig,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
CONNECTOR_TYPE = "lancedb_cloud"
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class LanceDBCloudAccessConfig(AccessConfig):
|
|
19
|
+
api_key: str = Field(description="Api key associated with LanceDb cloud")
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class LanceDBCloudConnectionConfig(LanceDBRemoteConnectionConfig):
|
|
23
|
+
access_config: Secret[LanceDBCloudAccessConfig]
|
|
24
|
+
|
|
25
|
+
def get_storage_options(self) -> dict:
|
|
26
|
+
return {**self.access_config.get_secret_value().model_dump(), "timeout": self.timeout}
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass
|
|
30
|
+
class LanceDBCloudUploader(LanceDBUploader):
|
|
31
|
+
upload_config: LanceDBUploaderConfig
|
|
32
|
+
connection_config: LanceDBCloudConnectionConfig
|
|
33
|
+
connector_type: str = CONNECTOR_TYPE
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
lancedb_cloud_destination_entry = DestinationRegistryEntry(
|
|
37
|
+
connection_config=LanceDBCloudConnectionConfig,
|
|
38
|
+
uploader=LanceDBCloudUploader,
|
|
39
|
+
uploader_config=LanceDBUploaderConfig,
|
|
40
|
+
upload_stager_config=LanceDBUploadStagerConfig,
|
|
41
|
+
upload_stager=LanceDBUploadStager,
|
|
42
|
+
)
|
|
@@ -8,7 +8,7 @@ import pandas as pd
|
|
|
8
8
|
from dateutil import parser
|
|
9
9
|
from pydantic import Field, Secret
|
|
10
10
|
|
|
11
|
-
from unstructured_ingest.error import WriteError
|
|
11
|
+
from unstructured_ingest.error import DestinationConnectionError, WriteError
|
|
12
12
|
from unstructured_ingest.utils.data_prep import flatten_dict
|
|
13
13
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
14
14
|
from unstructured_ingest.v2.constants import RECORD_ID_LABEL
|
|
@@ -66,7 +66,6 @@ class MilvusConnectionConfig(ConnectionConfig):
|
|
|
66
66
|
|
|
67
67
|
|
|
68
68
|
class MilvusUploadStagerConfig(UploadStagerConfig):
|
|
69
|
-
|
|
70
69
|
fields_to_include: Optional[list[str]] = None
|
|
71
70
|
"""If set - list of fields to include in the output.
|
|
72
71
|
Unspecified fields are removed from the elements.
|
|
@@ -174,6 +173,14 @@ class MilvusUploader(Uploader):
|
|
|
174
173
|
upload_config: MilvusUploaderConfig
|
|
175
174
|
connector_type: str = CONNECTOR_TYPE
|
|
176
175
|
|
|
176
|
+
@DestinationConnectionError.wrap
|
|
177
|
+
def precheck(self):
|
|
178
|
+
with self.get_client() as client:
|
|
179
|
+
if not client.has_collection(self.upload_config.collection_name):
|
|
180
|
+
raise DestinationConnectionError(
|
|
181
|
+
f"Collection '{self.upload_config.collection_name}' does not exist"
|
|
182
|
+
)
|
|
183
|
+
|
|
177
184
|
@contextmanager
|
|
178
185
|
def get_client(self) -> Generator["MilvusClient", None, None]:
|
|
179
186
|
client = self.connection_config.get_client()
|
|
@@ -218,7 +225,6 @@ class MilvusUploader(Uploader):
|
|
|
218
225
|
f"db in collection {self.upload_config.collection_name}"
|
|
219
226
|
)
|
|
220
227
|
with self.get_client() as client:
|
|
221
|
-
|
|
222
228
|
try:
|
|
223
229
|
res = client.insert(collection_name=self.upload_config.collection_name, data=data)
|
|
224
230
|
except MilvusException as milvus_exception:
|
|
@@ -55,10 +55,11 @@ class CloudWeaviateConnectionConfig(WeaviateConnectionConfig):
|
|
|
55
55
|
"client_secret": access_config.client_secret is not None,
|
|
56
56
|
"client_password": access_config.password is not None and self.username is not None,
|
|
57
57
|
}
|
|
58
|
-
|
|
58
|
+
existing_auths = [auth_method for auth_method, flag in auths.items() if flag]
|
|
59
|
+
|
|
60
|
+
if len(existing_auths) == 0:
|
|
59
61
|
raise ValueError("No auth values provided and anonymous is False")
|
|
60
|
-
if len(
|
|
61
|
-
existing_auths = [auth_method for auth_method, flag in auths.items() if flag]
|
|
62
|
+
if len(existing_auths) > 1:
|
|
62
63
|
raise ValueError(
|
|
63
64
|
"Multiple auth values provided, only one approach can be used: {}".format(
|
|
64
65
|
", ".join(existing_auths)
|
|
@@ -16,6 +16,7 @@ class EmbedderConfig(BaseModel):
|
|
|
16
16
|
embedding_provider: Optional[
|
|
17
17
|
Literal[
|
|
18
18
|
"openai",
|
|
19
|
+
"azure-openai",
|
|
19
20
|
"huggingface",
|
|
20
21
|
"aws-bedrock",
|
|
21
22
|
"vertexai",
|
|
@@ -43,6 +44,14 @@ class EmbedderConfig(BaseModel):
|
|
|
43
44
|
embedding_aws_region: Optional[str] = Field(
|
|
44
45
|
default="us-west-2", description="AWS region used for AWS-based embedders, such as bedrock"
|
|
45
46
|
)
|
|
47
|
+
embedding_azure_endpoint: Optional[str] = Field(
|
|
48
|
+
default=None,
|
|
49
|
+
description="Your Azure endpoint, including the resource, "
|
|
50
|
+
"e.g. `https://example-resource.azure.openai.com/`",
|
|
51
|
+
)
|
|
52
|
+
embedding_azure_api_version: Optional[str] = Field(
|
|
53
|
+
description="Azure API version", default=None
|
|
54
|
+
)
|
|
46
55
|
|
|
47
56
|
def get_huggingface_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
|
|
48
57
|
from unstructured_ingest.embed.huggingface import (
|
|
@@ -59,6 +68,25 @@ class EmbedderConfig(BaseModel):
|
|
|
59
68
|
|
|
60
69
|
return OpenAIEmbeddingEncoder(config=OpenAIEmbeddingConfig.model_validate(embedding_kwargs))
|
|
61
70
|
|
|
71
|
+
def get_azure_openai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
|
|
72
|
+
from unstructured_ingest.embed.azure_openai import (
|
|
73
|
+
AzureOpenAIEmbeddingConfig,
|
|
74
|
+
AzureOpenAIEmbeddingEncoder,
|
|
75
|
+
)
|
|
76
|
+
|
|
77
|
+
config_kwargs = {
|
|
78
|
+
"api_key": self.embedding_api_key,
|
|
79
|
+
"azure_endpoint": self.embedding_azure_endpoint,
|
|
80
|
+
}
|
|
81
|
+
if api_version := self.embedding_azure_api_version:
|
|
82
|
+
config_kwargs["api_version"] = api_version
|
|
83
|
+
if model_name := self.embedding_model_name:
|
|
84
|
+
config_kwargs["model_name"] = model_name
|
|
85
|
+
|
|
86
|
+
return AzureOpenAIEmbeddingEncoder(
|
|
87
|
+
config=AzureOpenAIEmbeddingConfig.model_validate(config_kwargs)
|
|
88
|
+
)
|
|
89
|
+
|
|
62
90
|
def get_octoai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
|
|
63
91
|
from unstructured_ingest.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder
|
|
64
92
|
|
|
@@ -146,6 +174,8 @@ class EmbedderConfig(BaseModel):
|
|
|
146
174
|
return self.get_mixedbread_embedder(embedding_kwargs=kwargs)
|
|
147
175
|
if self.embedding_provider == "togetherai":
|
|
148
176
|
return self.get_togetherai_embedder(embedding_kwargs=kwargs)
|
|
177
|
+
if self.embedding_provider == "azure-openai":
|
|
178
|
+
return self.get_azure_openai_embedder(embedding_kwargs=kwargs)
|
|
149
179
|
|
|
150
180
|
raise ValueError(f"{self.embedding_provider} not a recognized encoder")
|
|
151
181
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: unstructured-ingest
|
|
3
|
-
Version: 0.3.2
|
|
3
|
+
Version: 0.3.4
|
|
4
4
|
Summary: A library that prepares raw documents for downstream ML tasks.
|
|
5
5
|
Home-page: https://github.com/Unstructured-IO/unstructured-ingest
|
|
6
6
|
Author: Unstructured Technologies
|
|
@@ -22,20 +22,20 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
|
22
22
|
Requires-Python: >=3.9.0,<3.13
|
|
23
23
|
Description-Content-Type: text/markdown
|
|
24
24
|
License-File: LICENSE.md
|
|
25
|
-
Requires-Dist: python-dateutil
|
|
26
|
-
Requires-Dist: pydantic>=2.7
|
|
27
25
|
Requires-Dist: opentelemetry-sdk
|
|
28
|
-
Requires-Dist: click
|
|
29
|
-
Requires-Dist: tqdm
|
|
30
26
|
Requires-Dist: pandas
|
|
27
|
+
Requires-Dist: python-dateutil
|
|
28
|
+
Requires-Dist: pydantic>=2.7
|
|
31
29
|
Requires-Dist: dataclasses-json
|
|
30
|
+
Requires-Dist: tqdm
|
|
31
|
+
Requires-Dist: click
|
|
32
32
|
Provides-Extra: airtable
|
|
33
33
|
Requires-Dist: pyairtable; extra == "airtable"
|
|
34
34
|
Provides-Extra: astradb
|
|
35
35
|
Requires-Dist: astrapy; extra == "astradb"
|
|
36
36
|
Provides-Extra: azure
|
|
37
|
-
Requires-Dist: fsspec; extra == "azure"
|
|
38
37
|
Requires-Dist: adlfs; extra == "azure"
|
|
38
|
+
Requires-Dist: fsspec; extra == "azure"
|
|
39
39
|
Provides-Extra: azure-ai-search
|
|
40
40
|
Requires-Dist: azure-search-documents; extra == "azure-ai-search"
|
|
41
41
|
Provides-Extra: bedrock
|
|
@@ -44,15 +44,15 @@ Provides-Extra: biomed
|
|
|
44
44
|
Requires-Dist: bs4; extra == "biomed"
|
|
45
45
|
Requires-Dist: requests; extra == "biomed"
|
|
46
46
|
Provides-Extra: box
|
|
47
|
-
Requires-Dist: boxfs; extra == "box"
|
|
48
47
|
Requires-Dist: fsspec; extra == "box"
|
|
48
|
+
Requires-Dist: boxfs; extra == "box"
|
|
49
49
|
Provides-Extra: chroma
|
|
50
50
|
Requires-Dist: chromadb; extra == "chroma"
|
|
51
51
|
Provides-Extra: clarifai
|
|
52
52
|
Requires-Dist: clarifai; extra == "clarifai"
|
|
53
53
|
Provides-Extra: confluence
|
|
54
|
-
Requires-Dist: atlassian-python-api; extra == "confluence"
|
|
55
54
|
Requires-Dist: requests; extra == "confluence"
|
|
55
|
+
Requires-Dist: atlassian-python-api; extra == "confluence"
|
|
56
56
|
Provides-Extra: couchbase
|
|
57
57
|
Requires-Dist: couchbase; extra == "couchbase"
|
|
58
58
|
Provides-Extra: csv
|
|
@@ -60,8 +60,8 @@ Requires-Dist: unstructured[tsv]; extra == "csv"
|
|
|
60
60
|
Provides-Extra: databricks-volumes
|
|
61
61
|
Requires-Dist: databricks-sdk; extra == "databricks-volumes"
|
|
62
62
|
Provides-Extra: delta-table
|
|
63
|
-
Requires-Dist: boto3; extra == "delta-table"
|
|
64
63
|
Requires-Dist: deltalake; extra == "delta-table"
|
|
64
|
+
Requires-Dist: boto3; extra == "delta-table"
|
|
65
65
|
Provides-Extra: discord
|
|
66
66
|
Requires-Dist: discord-py; extra == "discord"
|
|
67
67
|
Provides-Extra: doc
|
|
@@ -69,8 +69,8 @@ Requires-Dist: unstructured[docx]; extra == "doc"
|
|
|
69
69
|
Provides-Extra: docx
|
|
70
70
|
Requires-Dist: unstructured[docx]; extra == "docx"
|
|
71
71
|
Provides-Extra: dropbox
|
|
72
|
-
Requires-Dist: dropboxdrivefs; extra == "dropbox"
|
|
73
72
|
Requires-Dist: fsspec; extra == "dropbox"
|
|
73
|
+
Requires-Dist: dropboxdrivefs; extra == "dropbox"
|
|
74
74
|
Provides-Extra: elasticsearch
|
|
75
75
|
Requires-Dist: elasticsearch[async]; extra == "elasticsearch"
|
|
76
76
|
Provides-Extra: embed-huggingface
|
|
@@ -88,8 +88,8 @@ Provides-Extra: epub
|
|
|
88
88
|
Requires-Dist: unstructured[epub]; extra == "epub"
|
|
89
89
|
Provides-Extra: gcs
|
|
90
90
|
Requires-Dist: bs4; extra == "gcs"
|
|
91
|
-
Requires-Dist: gcsfs; extra == "gcs"
|
|
92
91
|
Requires-Dist: fsspec; extra == "gcs"
|
|
92
|
+
Requires-Dist: gcsfs; extra == "gcs"
|
|
93
93
|
Provides-Extra: github
|
|
94
94
|
Requires-Dist: requests; extra == "github"
|
|
95
95
|
Requires-Dist: pygithub>1.58.0; extra == "github"
|
|
@@ -98,14 +98,16 @@ Requires-Dist: python-gitlab; extra == "gitlab"
|
|
|
98
98
|
Provides-Extra: google-drive
|
|
99
99
|
Requires-Dist: google-api-python-client; extra == "google-drive"
|
|
100
100
|
Provides-Extra: hubspot
|
|
101
|
-
Requires-Dist: urllib3; extra == "hubspot"
|
|
102
101
|
Requires-Dist: hubspot-api-client; extra == "hubspot"
|
|
102
|
+
Requires-Dist: urllib3; extra == "hubspot"
|
|
103
103
|
Provides-Extra: jira
|
|
104
104
|
Requires-Dist: atlassian-python-api; extra == "jira"
|
|
105
105
|
Provides-Extra: kafka
|
|
106
106
|
Requires-Dist: confluent-kafka; extra == "kafka"
|
|
107
107
|
Provides-Extra: kdbai
|
|
108
108
|
Requires-Dist: kdbai-client>=1.4.0; extra == "kdbai"
|
|
109
|
+
Provides-Extra: lancedb
|
|
110
|
+
Requires-Dist: lancedb; extra == "lancedb"
|
|
109
111
|
Provides-Extra: md
|
|
110
112
|
Requires-Dist: unstructured[md]; extra == "md"
|
|
111
113
|
Provides-Extra: milvus
|
|
@@ -115,10 +117,10 @@ Requires-Dist: pymongo; extra == "mongodb"
|
|
|
115
117
|
Provides-Extra: msg
|
|
116
118
|
Requires-Dist: unstructured[msg]; extra == "msg"
|
|
117
119
|
Provides-Extra: notion
|
|
118
|
-
Requires-Dist: backoff; extra == "notion"
|
|
119
120
|
Requires-Dist: htmlBuilder; extra == "notion"
|
|
120
|
-
Requires-Dist:
|
|
121
|
+
Requires-Dist: backoff; extra == "notion"
|
|
121
122
|
Requires-Dist: httpx; extra == "notion"
|
|
123
|
+
Requires-Dist: notion-client; extra == "notion"
|
|
122
124
|
Provides-Extra: odt
|
|
123
125
|
Requires-Dist: unstructured[odt]; extra == "odt"
|
|
124
126
|
Provides-Extra: onedrive
|
|
@@ -161,8 +163,8 @@ Requires-Dist: s3fs; extra == "s3"
|
|
|
161
163
|
Provides-Extra: salesforce
|
|
162
164
|
Requires-Dist: simple-salesforce; extra == "salesforce"
|
|
163
165
|
Provides-Extra: sftp
|
|
164
|
-
Requires-Dist: paramiko; extra == "sftp"
|
|
165
166
|
Requires-Dist: fsspec; extra == "sftp"
|
|
167
|
+
Requires-Dist: paramiko; extra == "sftp"
|
|
166
168
|
Provides-Extra: sharepoint
|
|
167
169
|
Requires-Dist: msal; extra == "sharepoint"
|
|
168
170
|
Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
|
|
@@ -10,8 +10,8 @@ test/integration/connectors/test_azure_ai_search.py,sha256=dae4GifRiKue5YpsxworD
|
|
|
10
10
|
test/integration/connectors/test_confluence.py,sha256=xcPmZ_vi_pkCt-tUPn10P49FH9i_9YUbrAPO6fYk5rU,3521
|
|
11
11
|
test/integration/connectors/test_delta_table.py,sha256=GSzWIkbEUzOrRPt2F1uO0dabcp7kTFDj75BhhI2y-WU,6856
|
|
12
12
|
test/integration/connectors/test_kafka.py,sha256=j7jsNWZumNBv9v-5Bpx8geUUXpxxad5EuA4CMRsl4R8,7104
|
|
13
|
-
test/integration/connectors/test_lancedb.py,sha256=
|
|
14
|
-
test/integration/connectors/test_milvus.py,sha256=
|
|
13
|
+
test/integration/connectors/test_lancedb.py,sha256=8hRlqw3zYOcFCu6PPlejquSvvEM_3OEBzKTQbNm_Zmg,7635
|
|
14
|
+
test/integration/connectors/test_milvus.py,sha256=p4UujDr_tsRaQDmhDmDZp38t8oSFm7hrTqiq6NNuhGo,5933
|
|
15
15
|
test/integration/connectors/test_mongodb.py,sha256=YeS_DUnVYN02F76j87W8RhXGHnJMzQYb3n-L1-oWGXI,12254
|
|
16
16
|
test/integration/connectors/test_onedrive.py,sha256=KIkBwKh1hnv203VCL2UABnDkS_bP4NxOFm1AL8EPGLA,3554
|
|
17
17
|
test/integration/connectors/test_pinecone.py,sha256=X10OWZ6IrO6YyhuR3ydMAZOQq3u2f5u_lCjKNYUUcnI,7558
|
|
@@ -35,9 +35,11 @@ test/integration/connectors/utils/docker_compose.py,sha256=GVTB6Cel05c0VQ2n4AwkQ
|
|
|
35
35
|
test/integration/connectors/utils/validation.py,sha256=SwvPVuHjJxTo8xEUwnuL9FZNpu3sZZ8iouOz5xh_kB8,14272
|
|
36
36
|
test/integration/connectors/weaviate/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
37
37
|
test/integration/connectors/weaviate/conftest.py,sha256=6Q6QdrLJmGHowRFSmoVSzup2EX6qASfS2Z5tqlpTm9M,387
|
|
38
|
+
test/integration/connectors/weaviate/test_cloud.py,sha256=07VxNRxWWcgTstFfpoZ1FlVnEhcBnQlo5nosWKjKz_4,979
|
|
38
39
|
test/integration/connectors/weaviate/test_local.py,sha256=SK6iEwQUKiCd0X99BEk8GlQoLaCcJcFPt09NN526Ct0,4508
|
|
39
40
|
test/integration/embedders/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
40
41
|
test/integration/embedders/conftest.py,sha256=B2W771RbijR7G_GybsCzRyIvOzXqzbKZdRIlNDd5AGY,334
|
|
42
|
+
test/integration/embedders/test_azure_openai.py,sha256=6tFpKFBFRXD49imhhRzsvy3MPtuZ4L1PtnKyMVBRAqc,1808
|
|
41
43
|
test/integration/embedders/test_bedrock.py,sha256=0oBRNS_DtFDGQ22Z1T3t6VOJ31PrItgvnJpqcLe9Fg4,1903
|
|
42
44
|
test/integration/embedders/test_huggingface.py,sha256=0mMTOO-Nh7KB70AGs_7LLQIxMYrnSPqyihriUeqACbM,1007
|
|
43
45
|
test/integration/embedders/test_mixedbread.py,sha256=RrLv8SByMNXsgrlh94RbaT-VyxZ4-DILO-OPpmOwvSI,1441
|
|
@@ -54,7 +56,7 @@ test/unit/test_chunking_utils.py,sha256=0iPwfnMPpyTm-yOE0BXMnEQQP4iguS6NhOqgMQU5
|
|
|
54
56
|
test/unit/test_error.py,sha256=RflmngCdFNKOLXVfLnUdNfY3Mfg3k7DTEzfIl0B-syU,840
|
|
55
57
|
test/unit/test_interfaces.py,sha256=XNj8qasc1ltaeUv-2y31rv7R9xquo0rgRrMvBZoNZLw,9623
|
|
56
58
|
test/unit/test_logger.py,sha256=0SKndXE_VRd8XmUHkrj7zuBQHZscXx3ZQllMEOvtF9Y,2380
|
|
57
|
-
test/unit/test_utils.py,sha256=
|
|
59
|
+
test/unit/test_utils.py,sha256=Q6mp9YZPah8z3-2lreyRbmAc7m2Y_w26_N9vocSInoA,5421
|
|
58
60
|
test/unit/embed/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
59
61
|
test/unit/embed/test_mixedbreadai.py,sha256=XFNJDP5pIgF3eQYwBiuEWmH3zZWx72Wpwyv-Q4m0DJg,1332
|
|
60
62
|
test/unit/embed/test_octoai.py,sha256=Ha9EgAW64Q45hFj51tToe8RyKXWXwqAkdDqSFDMu37Q,831
|
|
@@ -81,7 +83,7 @@ test/unit/v2/partitioners/test_partitioner.py,sha256=iIYg7IpftV3LusoO4H8tr1IHY1U
|
|
|
81
83
|
test/unit/v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
82
84
|
test/unit/v2/utils/data_generator.py,sha256=UoYVNjG4S4wlaA9gceQ82HIpF9_6I1UTHD1_GrQBHp0,973
|
|
83
85
|
unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
|
|
84
|
-
unstructured_ingest/__version__.py,sha256=
|
|
86
|
+
unstructured_ingest/__version__.py,sha256=0rNziXrR8RxleBY3pKm77TbOCJ0CwApHiLqXBAViUAo,42
|
|
85
87
|
unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
|
|
86
88
|
unstructured_ingest/interfaces.py,sha256=OYVUP0bzBJpT-Lz92BDyz_hLBvyfxkuSwWHhUdnUayA,31493
|
|
87
89
|
unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
|
|
@@ -249,6 +251,7 @@ unstructured_ingest/connector/notion/types/database_properties/unique_id.py,sha2
|
|
|
249
251
|
unstructured_ingest/connector/notion/types/database_properties/url.py,sha256=iXQ2tVUm9UlKVtDA0NQiFIRJ5PHYW9wOaWt2vFfSVCg,862
|
|
250
252
|
unstructured_ingest/connector/notion/types/database_properties/verification.py,sha256=J_DLjY-v2T6xDGMQ7FkI0YMKMA6SG6Y3yYW7qUD1hKA,2334
|
|
251
253
|
unstructured_ingest/embed/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
254
|
+
unstructured_ingest/embed/azure_openai.py,sha256=4YBOIxv66wVZ5EqNNC4uCDPNJ3VrsLPe5wwagT6zqe0,1001
|
|
252
255
|
unstructured_ingest/embed/bedrock.py,sha256=-PRdZsF44vwi6G4G75gdO31AJKfZWClOXkJQAk7rEO8,3096
|
|
253
256
|
unstructured_ingest/embed/huggingface.py,sha256=2cBiQhOhfWHX3hS-eKjocysOkUaRlyRfUj9Kxjrp6cE,1934
|
|
254
257
|
unstructured_ingest/embed/interfaces.py,sha256=au4Xp8ciDvo4bidlUbazFW2aC7NZW5-UDLKXBFVzAX4,2025
|
|
@@ -342,7 +345,7 @@ unstructured_ingest/utils/compression.py,sha256=NNiY-2S2Gf3at7zC1PYxMijaEza9vVSz
|
|
|
342
345
|
unstructured_ingest/utils/data_prep.py,sha256=IDAedOSBdgZpD9IY4tLJT-rmKGV7GHtU6KRj6VM-_tE,4666
|
|
343
346
|
unstructured_ingest/utils/dep_check.py,sha256=SXXcUna2H0RtxA6j1S2NGkvQa9JP2DujWhmyBa7776Y,2400
|
|
344
347
|
unstructured_ingest/utils/google_filetype.py,sha256=YVspEkiiBrRUSGVeVbsavvLvTmizdy2e6TsjigXTSRU,468
|
|
345
|
-
unstructured_ingest/utils/string_and_date_utils.py,sha256=
|
|
348
|
+
unstructured_ingest/utils/string_and_date_utils.py,sha256=kijtPlGAbH376vVjFSo5H_ZhW-FEcMC2sCNsSNwDOjo,1729
|
|
346
349
|
unstructured_ingest/utils/table.py,sha256=aWjcowDVSClNpEAdR6PY3H7khKu4T6T3QqQE6GjmQ_M,3469
|
|
347
350
|
unstructured_ingest/v2/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
|
|
348
351
|
unstructured_ingest/v2/constants.py,sha256=pDspTYz-nEojHBqrZNfssGEiujmVa02pIWL63PQP9sU,103
|
|
@@ -388,23 +391,23 @@ unstructured_ingest/v2/pipeline/steps/upload.py,sha256=zlgXgwReX9TBOdfTpS9hETah4
|
|
|
388
391
|
unstructured_ingest/v2/processes/__init__.py,sha256=FaHWSCGyc7GWVnAsNEUUj7L8hT8gCVY3_hUE2VzWtUg,462
|
|
389
392
|
unstructured_ingest/v2/processes/chunker.py,sha256=31-7ojsM2coIt2rMR0KOb82IxLVJfNHbqYUOsDkhxN8,5491
|
|
390
393
|
unstructured_ingest/v2/processes/connector_registry.py,sha256=vkEe6jpgdYtZCxMj59s5atWGgmPuxAEXRUoTt-MJ7wc,2198
|
|
391
|
-
unstructured_ingest/v2/processes/embedder.py,sha256=
|
|
394
|
+
unstructured_ingest/v2/processes/embedder.py,sha256=xCBpaL07WnVUOUW8SHktaf1vwBGZxl3Nf8-99509ClQ,7721
|
|
392
395
|
unstructured_ingest/v2/processes/filter.py,sha256=kjUmMw2SDq2bme0JCAOxs6cJriIG6Ty09KOznS-xz08,2145
|
|
393
396
|
unstructured_ingest/v2/processes/partitioner.py,sha256=agpHwB9FR8OZVQqE7zFEb0IcDPCOPA_BZjLzLF71nOY,8194
|
|
394
397
|
unstructured_ingest/v2/processes/uncompress.py,sha256=Z_XfsITGdyaRwhtNUc7bMj5Y2jLuBge8KoK4nxhqKag,2425
|
|
395
398
|
unstructured_ingest/v2/processes/connectors/__init__.py,sha256=8M3aYYNbOkS2SYG2B_HLHMgX4V69-Oz1VqpQcRQMiVg,5167
|
|
396
399
|
unstructured_ingest/v2/processes/connectors/airtable.py,sha256=eeZJe-bBNxt5Sa-XEFCdcGeJCguJU5WN2Mv9kLp5dVQ,8917
|
|
397
|
-
unstructured_ingest/v2/processes/connectors/astradb.py,sha256=
|
|
400
|
+
unstructured_ingest/v2/processes/connectors/astradb.py,sha256=QTUQ-cv_iZi9eaXRRHQNKhtgFn-Pi20AXdSVaDFg9DM,15498
|
|
398
401
|
unstructured_ingest/v2/processes/connectors/azure_ai_search.py,sha256=-6IijSWGqj-85vD0c4l5wdMHp-LF371jO8j53PPRB4I,12002
|
|
399
402
|
unstructured_ingest/v2/processes/connectors/chroma.py,sha256=skrxRPHZ8y3JxNa0dt5SVitHiDQ5WVxLvY_kh2-QUrQ,8029
|
|
400
403
|
unstructured_ingest/v2/processes/connectors/confluence.py,sha256=qQApDcmPBGg4tHXwSOj4JPkAbrO9GQ4NRlaETjhp25U,7003
|
|
401
|
-
unstructured_ingest/v2/processes/connectors/couchbase.py,sha256=
|
|
404
|
+
unstructured_ingest/v2/processes/connectors/couchbase.py,sha256=LbUJLt6fqaNYSmy9vUiovG-UOALMcvh8OD-gZAaf-f4,12333
|
|
402
405
|
unstructured_ingest/v2/processes/connectors/delta_table.py,sha256=1yS7ivEyiucwd_kv6LL5HQdGabT43yeG6XCdwiz89hc,8019
|
|
403
406
|
unstructured_ingest/v2/processes/connectors/gitlab.py,sha256=yBgCeLy9iCVI8bBDcHHuHB0H3BO05e9E1OccbHwvKAo,9724
|
|
404
407
|
unstructured_ingest/v2/processes/connectors/google_drive.py,sha256=EEwXK1Anlu-eXl5qxmdDIqPYW7eMSez6WGlTPG2vSn8,13121
|
|
405
408
|
unstructured_ingest/v2/processes/connectors/kdbai.py,sha256=8bGHbZctJ_Tl1AUSMnI7CCZ7CgEtTRVcRuvlB1HPlqQ,5907
|
|
406
409
|
unstructured_ingest/v2/processes/connectors/local.py,sha256=a3stgnIkhBbXPIQD0O-RaRM-Eb-szHj9Yy4Fz881-9c,6723
|
|
407
|
-
unstructured_ingest/v2/processes/connectors/milvus.py,sha256=
|
|
410
|
+
unstructured_ingest/v2/processes/connectors/milvus.py,sha256=3sV0Yv2vYMLyxszKCqAqlMcHHJSBR-GGbaZf1nvobLE,10089
|
|
408
411
|
unstructured_ingest/v2/processes/connectors/mongodb.py,sha256=XLuprTCY0D9tAh_qn81MjJrDN9YaNqMlKe7BJl3eTZc,14998
|
|
409
412
|
unstructured_ingest/v2/processes/connectors/onedrive.py,sha256=heZMtOIrCySi552ldIk8iH0pSRXZ0W2LeD-CcNOwCFQ,15979
|
|
410
413
|
unstructured_ingest/v2/processes/connectors/outlook.py,sha256=KgNGM8hImRhy6_SpswRP2VwRD4VOrqqJoySgxf2oduI,9290
|
|
@@ -435,9 +438,10 @@ unstructured_ingest/v2/processes/connectors/kafka/__init__.py,sha256=mQJ9Ex-QCfh
|
|
|
435
438
|
unstructured_ingest/v2/processes/connectors/kafka/cloud.py,sha256=qprsfI8VH0mVTa1MOCpa2D4coyopinQ5ag2KXcAecXE,3296
|
|
436
439
|
unstructured_ingest/v2/processes/connectors/kafka/kafka.py,sha256=qEv_yaG94KekFtfS06KgpTTbqeJkje0hn5uOjsMMngw,9414
|
|
437
440
|
unstructured_ingest/v2/processes/connectors/kafka/local.py,sha256=vwLZjvc_C17zOqcrzic0aIoPwS98sqYiwiMknw2IcK4,2586
|
|
438
|
-
unstructured_ingest/v2/processes/connectors/lancedb/__init__.py,sha256=
|
|
439
|
-
unstructured_ingest/v2/processes/connectors/lancedb/aws.py,sha256=
|
|
441
|
+
unstructured_ingest/v2/processes/connectors/lancedb/__init__.py,sha256=LW37xZrn48JeHluRNulLTreUPdaF-ZU81F7MCUHcCv8,1253
|
|
442
|
+
unstructured_ingest/v2/processes/connectors/lancedb/aws.py,sha256=eeXWsh8UeVm1Ur53C4MEnpLplfO8U91KYgk--0kk5pE,1413
|
|
440
443
|
unstructured_ingest/v2/processes/connectors/lancedb/azure.py,sha256=Ms5vQVRIpTF1Q2qBl_bET9wbgaf4diPaH-iR8kJlr4E,1461
|
|
444
|
+
unstructured_ingest/v2/processes/connectors/lancedb/cloud.py,sha256=BFy0gW2OZ_qaZJM97m-tNsFaJPi9zOKrrd2y4thcNP0,1341
|
|
441
445
|
unstructured_ingest/v2/processes/connectors/lancedb/gcp.py,sha256=p5BPaFtS3y3Yh8PIr3tUqsAXrUYu4QYYAWQNh5W2ucE,1361
|
|
442
446
|
unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py,sha256=7WIShs2V3dpN6wUhDTt1j2rvdiPp6yopbh7XYkb9T3s,5129
|
|
443
447
|
unstructured_ingest/v2/processes/connectors/lancedb/local.py,sha256=_7-6iO6B60gAWwJUUrmlsRzYMFIBeZgu_QT3mhw5L0I,1272
|
|
@@ -453,13 +457,13 @@ unstructured_ingest/v2/processes/connectors/sql/snowflake.py,sha256=jl524VudwmFK
|
|
|
453
457
|
unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=LFzGeAUagLknK07DsXg2oSG7ZAgR6VqT9wfI_tYlHUg,14782
|
|
454
458
|
unstructured_ingest/v2/processes/connectors/sql/sqlite.py,sha256=9605K36nQ5-gBxzt1daYKYotON1SE85RETusqCJrbdk,5230
|
|
455
459
|
unstructured_ingest/v2/processes/connectors/weaviate/__init__.py,sha256=eXamSnQdzzMvt62z80B8nmlkwDKO-Pogln_K_zLz53A,1067
|
|
456
|
-
unstructured_ingest/v2/processes/connectors/weaviate/cloud.py,sha256=
|
|
460
|
+
unstructured_ingest/v2/processes/connectors/weaviate/cloud.py,sha256=bXtfEYLquR-BszZ5S_lQ4JbETNs9Vozgpfm8x9egAmE,6251
|
|
457
461
|
unstructured_ingest/v2/processes/connectors/weaviate/embedded.py,sha256=S8Zg8StuZT-k7tCg1D5YShO1-vJYYk9-M1bE1fIqx64,3014
|
|
458
462
|
unstructured_ingest/v2/processes/connectors/weaviate/local.py,sha256=LuTBKPseVewsz8VqxRPRLfGEm3BeI9nBZxpy7ZU5tOA,2201
|
|
459
463
|
unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py,sha256=ln1p9ahFTaT-qsL7p4bgw_IqnU60As_l6vVAqUWyQVE,11655
|
|
460
|
-
unstructured_ingest-0.3.
|
|
461
|
-
unstructured_ingest-0.3.
|
|
462
|
-
unstructured_ingest-0.3.
|
|
463
|
-
unstructured_ingest-0.3.
|
|
464
|
-
unstructured_ingest-0.3.
|
|
465
|
-
unstructured_ingest-0.3.
|
|
464
|
+
unstructured_ingest-0.3.4.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
|
|
465
|
+
unstructured_ingest-0.3.4.dist-info/METADATA,sha256=6Nj2KHvch7j5QLfahz5NcFHmmNq9vNixTfZSDUEQPjo,7393
|
|
466
|
+
unstructured_ingest-0.3.4.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
|
|
467
|
+
unstructured_ingest-0.3.4.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
|
|
468
|
+
unstructured_ingest-0.3.4.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
|
|
469
|
+
unstructured_ingest-0.3.4.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
{unstructured_ingest-0.3.2.dist-info → unstructured_ingest-0.3.4.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
|
File without changes
|