unstructured-ingest 0.0.13__py3-none-any.whl → 0.0.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of unstructured-ingest might be problematic.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/embed/__init__.py +17 -0
- unstructured_ingest/embed/bedrock.py +70 -0
- unstructured_ingest/embed/huggingface.py +73 -0
- unstructured_ingest/embed/interfaces.py +36 -0
- unstructured_ingest/embed/mixedbreadai.py +177 -0
- unstructured_ingest/embed/octoai.py +63 -0
- unstructured_ingest/embed/openai.py +61 -0
- unstructured_ingest/embed/vertexai.py +88 -0
- unstructured_ingest/embed/voyageai.py +69 -0
- unstructured_ingest/interfaces.py +17 -7
- unstructured_ingest/pipeline/reformat/embedding.py +3 -5
- unstructured_ingest/utils/data_prep.py +20 -12
- unstructured_ingest/v2/cli/base/src.py +2 -1
- unstructured_ingest/v2/pipeline/interfaces.py +3 -1
- unstructured_ingest/v2/pipeline/pipeline.py +25 -23
- unstructured_ingest/v2/processes/connectors/chroma.py +6 -1
- unstructured_ingest/v2/processes/connectors/google_drive.py +1 -2
- unstructured_ingest/v2/processes/connectors/onedrive.py +6 -4
- unstructured_ingest/v2/processes/connectors/pinecone.py +37 -15
- unstructured_ingest/v2/processes/connectors/sharepoint.py +1 -1
- unstructured_ingest/v2/processes/embedder.py +41 -24
- {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.14.dist-info}/METADATA +214 -211
- {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.14.dist-info}/RECORD +28 -19
- {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.14.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.14.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.14.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.14.dist-info}/top_level.txt +0 -0
unstructured_ingest/interfaces.py

@@ -24,7 +24,8 @@ from unstructured_ingest.utils.data_prep import flatten_dict

 if TYPE_CHECKING:
     from unstructured.documents.elements import Element
-
+
+    from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder

 A = TypeVar("A", bound="DataClassJsonMixin")

@@ -204,22 +205,31 @@ class EmbeddingConfig(BaseConfig):
         kwargs["model_name"] = self.model_name
         # TODO make this more dynamic to map to encoder configs
         if self.provider == "langchain-openai":
-            from
+            from unstructured_ingest.embed.openai import (
+                OpenAIEmbeddingConfig,
+                OpenAIEmbeddingEncoder,
+            )

             return OpenAIEmbeddingEncoder(config=OpenAIEmbeddingConfig(**kwargs))
         elif self.provider == "langchain-huggingface":
-            from
+            from unstructured_ingest.embed.huggingface import (
                 HuggingFaceEmbeddingConfig,
                 HuggingFaceEmbeddingEncoder,
             )

             return HuggingFaceEmbeddingEncoder(config=HuggingFaceEmbeddingConfig(**kwargs))
         elif self.provider == "octoai":
-            from
+            from unstructured_ingest.embed.octoai import (
+                OctoAiEmbeddingConfig,
+                OctoAIEmbeddingEncoder,
+            )

             return OctoAIEmbeddingEncoder(config=OctoAiEmbeddingConfig(**kwargs))
         elif self.provider == "langchain-aws-bedrock":
-            from
+            from unstructured_ingest.embed.bedrock import (
+                BedrockEmbeddingConfig,
+                BedrockEmbeddingEncoder,
+            )

             return BedrockEmbeddingEncoder(
                 config=BedrockEmbeddingConfig(
@@ -229,14 +239,14 @@ class EmbeddingConfig(BaseConfig):
                 )
             )
         elif self.provider == "langchain-vertexai":
-            from
+            from unstructured_ingest.embed.vertexai import (
                 VertexAIEmbeddingConfig,
                 VertexAIEmbeddingEncoder,
             )

             return VertexAIEmbeddingEncoder(config=VertexAIEmbeddingConfig(**kwargs))
         elif self.provider == "langchain-voyageai":
-            from
+            from unstructured_ingest.embed.voyageai import (
                 VoyageAIEmbeddingConfig,
                 VoyageAIEmbeddingEncoder,
             )
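These hunks only repoint the lazy per-provider imports at the bundled `unstructured_ingest.embed` modules (the removed import targets are truncated in this view); the provider dispatch itself is unchanged. A hedged usage sketch follows: `provider` and `model_name` are fields visible in the diff, `get_embedder()` is inferred from the call in `pipeline/reformat/embedding.py` below, and the model string is purely illustrative.

```python
# Sketch only: `provider` and `model_name` appear in the diff; `get_embedder()` is
# inferred from pipeline/reformat/embedding.py; the model string is a placeholder.
from unstructured_ingest.interfaces import EmbeddingConfig

config = EmbeddingConfig(
    provider="langchain-huggingface",
    model_name="sentence-transformers/all-MiniLM-L6-v2",  # placeholder model
)
encoder = config.get_embedder()  # returns a HuggingFaceEmbeddingEncoder for this provider
```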
unstructured_ingest/pipeline/reformat/embedding.py

@@ -27,8 +27,6 @@ class Embedder(ReformatNode):
         return hashlib.sha256(json.dumps(hash_dict, sort_keys=True).encode()).hexdigest()[:32]

     def run(self, elements_json: str) -> Optional[str]:
-        from unstructured.staging.base import elements_from_json
-
         try:
             elements_json_filename = os.path.basename(elements_json)
             filename_ext = os.path.basename(elements_json_filename)
@@ -48,10 +46,10 @@
             ):
                 logger.debug(f"File exists: {json_path}, skipping embedding")
                 return str(json_path)
-
+            with open(elements_json) as f:
+                elements = json.load(f)
             embedder = self.embedder_config.get_embedder()
-
-            element_dicts = [e.to_dict() for e in embedded_elements]
+            element_dicts = embedder.embed_documents(elements=elements)
             with open(json_path, "w", encoding="utf8") as output_f:
                 logger.info(f"writing embeddings content to {json_path}")
                 json.dump(element_dicts, output_f, ensure_ascii=False, indent=2)
unstructured_ingest/utils/data_prep.py

@@ -1,12 +1,15 @@
 import itertools
 import json
 from datetime import datetime
-from typing import Any, Optional, Sequence, cast
+from typing import Any, Iterable, Optional, Sequence, TypeVar, cast

 DATE_FORMATS = ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d+%H:%M:%S", "%Y-%m-%dT%H:%M:%S%z")

+T = TypeVar("T")
+IterableT = Iterable[T]

-
+
+def batch_generator(iterable: IterableT, batch_size: int = 100) -> IterableT:
     """A helper function to break an iterable into batches of size batch_size."""
     it = iter(iterable)
     chunk = tuple(itertools.islice(it, batch_size))
@@ -16,23 +19,28 @@ def batch_generator(iterable, batch_size=100):


 def generator_batching_wbytes(
-    iterable
-
+    iterable: IterableT,
+    batch_size_limit_bytes: Optional[int] = None,
+    max_batch_size: Optional[int] = None,
+) -> IterableT:
+    if not batch_size_limit_bytes and not max_batch_size:
+        return iterable
     """A helper function to break an iterable into chunks of specified bytes."""
     current_batch, current_batch_size = [], 0

     for item in iterable:
         item_size_bytes = len(json.dumps(item).encode("utf-8"))
-
-
-        current_batch_size
-
-
-            current_batch.append(item)
-            current_batch_size += item_size_bytes
-        else:
+        if batch_size_limit_bytes and current_batch_size + item_size_bytes > batch_size_limit_bytes:
+            yield current_batch
+            current_batch, current_batch_size = [item], item_size_bytes
+            continue
+        if max_batch_size and len(current_batch) + 1 > max_batch_size:
             yield current_batch
             current_batch, current_batch_size = [item], item_size_bytes
+            continue
+
+        current_batch.append(item)
+        current_batch_size += item_size_bytes

     if current_batch:
         yield current_batch
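`generator_batching_wbytes` now takes explicit keyword limits for both serialized size and record count. A minimal usage sketch under those assumptions follows; the signature is taken from the hunk above and the sample records are made up. One caveat visible in the diff: because the function body still contains `yield`, the early `return iterable` when both limits are unset appears to produce an empty generator rather than handing back the original iterable.

```python
import json

from unstructured_ingest.utils.data_prep import generator_batching_wbytes

records = [{"id": i, "text": "x" * 300} for i in range(10)]  # made-up payloads

# Each yielded batch stays under ~1 KB of serialized JSON and holds at most
# 3 records, whichever limit is hit first.
for batch in generator_batching_wbytes(
    iterable=records, batch_size_limit_bytes=1024, max_batch_size=3
):
    size = sum(len(json.dumps(r).encode("utf-8")) for r in batch)
    print(f"{len(batch)} records, ~{size} bytes")
```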
unstructured_ingest/v2/cli/base/src.py

@@ -1,5 +1,6 @@
 import logging
 from dataclasses import dataclass, field
+from typing import Any

 import click
 from pydantic import BaseModel
@@ -47,7 +48,7 @@ class SrcCmd(BaseCmd):
         options = self.consolidate_options(options=options)
         return options

-    def cmd(self, ctx: click.Context, **options) -> None:
+    def cmd(self, ctx: click.Context, **options: dict[str, Any]) -> None:
         if ctx.invoked_subcommand:
             return

unstructured_ingest/v2/pipeline/interfaces.py

@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import asyncio
 import logging
 import multiprocessing as mp
@@ -132,7 +134,7 @@ class PipelineStep(ABC):
     async def _run_async(self, fn: Callable, **kwargs: Any) -> Optional[Any]:
         raise NotImplementedError

-    def run(self, _fn:
+    def run(self, _fn: Callable[..., Any] | None = None, **kwargs: Any) -> Optional[Any]:
         kwargs = kwargs.copy()
         otel_handler = OtelHandler(otel_endpoint=self.context.otel_endpoint, log_out=logger.debug)
         tracer = otel_handler.get_tracer()
unstructured_ingest/v2/pipeline/pipeline.py

@@ -1,7 +1,9 @@
+from __future__ import annotations
+
 import logging
 import multiprocessing as mp
 from dataclasses import InitVar, dataclass, field
-from typing import Any
+from typing import Any

 from unstructured_ingest.v2.interfaces import ProcessorConfig, Uploader
 from unstructured_ingest.v2.logger import logger, make_default_logger
@@ -48,33 +50,33 @@ class Pipeline:
     partitioner: InitVar[Partitioner]
     partitioner_step: PartitionStep = field(init=False)

-    chunker: InitVar[
-    chunker_step: ChunkStep = field(init=False, default=None)
+    chunker: InitVar[Chunker | None] = None
+    chunker_step: ChunkStep | None = field(init=False, default=None)

-    embedder: InitVar[
-    embedder_step: EmbedStep = field(init=False, default=None)
+    embedder: InitVar[Embedder | None] = None
+    embedder_step: EmbedStep | None = field(init=False, default=None)

-    stager: InitVar[
-    stager_step: UploadStageStep = field(init=False, default=None)
+    stager: InitVar[UploadStager | None] = None
+    stager_step: UploadStageStep | None = field(init=False, default=None)

     uploader: InitVar[Uploader] = field(default=LocalUploader())
-    uploader_step: UploadStep = field(init=False, default=None)
+    uploader_step: UploadStep | None = field(init=False, default=None)

-    uncompress_step: UncompressStep = field(init=False, default=None)
+    uncompress_step: UncompressStep | None = field(init=False, default=None)

-    filterer: InitVar[
-    filter_step: FilterStep = field(init=False, default=None)
+    filterer: InitVar[Filterer | None] = None
+    filter_step: FilterStep | None = field(init=False, default=None)

     def __post_init__(
         self,
         indexer: IndexerT,
         downloader: DownloaderT,
         partitioner: Partitioner,
-        chunker: Chunker = None,
-        embedder: Embedder = None,
-        stager: UploadStager = None,
-        uploader: Uploader = None,
-        filterer: Filterer = None,
+        chunker: Chunker | None = None,
+        embedder: Embedder | None = None,
+        stager: UploadStager | None = None,
+        uploader: Uploader | None = None,
+        filterer: Filterer | None = None,
     ):
         make_default_logger(level=logging.DEBUG if self.context.verbose else logging.INFO)
         otel_handler = OtelHandler(otel_endpoint=self.context.otel_endpoint)
@@ -136,7 +138,7 @@ class Pipeline:
         if self.context.status:
             raise PipelineError("Pipeline did not run successfully")

-    def clean_results(self, results:
+    def clean_results(self, results: list[Any | list[Any]] | None) -> list[Any] | None:
         if not results:
             return None
         results = [r for r in results if r]
@@ -274,12 +276,12 @@ class Pipeline:
         downloader_config: DownloaderConfigT,
         source_connection_config: ConnectionConfig,
         partitioner_config: PartitionerConfig,
-        filterer_config: FiltererConfig = None,
-        chunker_config:
-        embedder_config:
-        destination_connection_config:
-        stager_config:
-        uploader_config:
+        filterer_config: FiltererConfig | None = None,
+        chunker_config: ChunkerConfig | None = None,
+        embedder_config: EmbedderConfig | None = None,
+        destination_connection_config: ConnectionConfig | None = None,
+        stager_config: UploadStagerConfigT | None = None,
+        uploader_config: UploaderConfigT | None = None,
     ) -> "Pipeline":
         # Get registry key based on indexer config
         source_entry = {
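The `Pipeline` changes are typing-only: optional steps and `__post_init__` arguments gain explicit `X | None` annotations, which is why `from __future__ import annotations` is added at the top of both pipeline modules. An illustrative snippet of that pattern (not from the package) is shown below.

```python
# Not from the package: shows why PEP 563 lazy annotations let `X | None` appear
# in dataclass field annotations even on Python versions older than 3.10.
from __future__ import annotations

from dataclasses import dataclass, field


@dataclass
class Example:
    name: str
    # The annotation is stored as a string and never evaluated at runtime,
    # so the `|` union syntax is accepted on 3.8/3.9 as well.
    note: str | None = field(default=None)


print(Example("demo"))  # Example(name='demo', note=None)
```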
unstructured_ingest/v2/processes/connectors/chroma.py

@@ -41,9 +41,14 @@ class ChromaAccessConfig(AccessConfig):
     )


+SecretChromaAccessConfig = Secret[ChromaAccessConfig]
+
+
 class ChromaConnectionConfig(ConnectionConfig):
     collection_name: str = Field(description="The name of the Chroma collection to write into.")
-    access_config:
+    access_config: SecretChromaAccessConfig = Field(
+        default=SecretChromaAccessConfig(secret_value=ChromaAccessConfig())
+    )
     path: Optional[str] = Field(
         default=None, description="Location where Chroma is persisted, if not connecting via http."
     )
unstructured_ingest/v2/processes/connectors/google_drive.py

@@ -28,8 +28,7 @@ from unstructured_ingest.v2.interfaces import (
 )
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
-
-from .utils import conform_string_to_dict
+from unstructured_ingest.v2.processes.connectors.utils import conform_string_to_dict

 CONNECTOR_TYPE = "google_drive"

unstructured_ingest/v2/processes/connectors/onedrive.py

@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import json
 from dataclasses import dataclass
 from pathlib import Path
@@ -103,7 +105,7 @@ class OnedriveIndexer(Indexer):
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise SourceConnectionError(f"failed to validate connection: {e}")

-    def list_objects(self, folder, recursive) -> list["DriveItem"]:
+    def list_objects(self, folder: DriveItem, recursive: bool) -> list["DriveItem"]:
         drive_items = folder.children.get().execute_query()
         files = [d for d in drive_items if d.is_file]
         if not recursive:
@@ -139,12 +141,12 @@ class OnedriveIndexer(Indexer):
         server_path = file_path + "/" + filename
         rel_path = server_path.replace(self.index_config.path, "").lstrip("/")
         date_modified_dt = (
-            parser.parse(drive_item.last_modified_datetime)
+            parser.parse(str(drive_item.last_modified_datetime))
             if drive_item.last_modified_datetime
             else None
         )
         date_created_at = (
-            parser.parse(drive_item.created_datetime) if drive_item.created_datetime else None
+            parser.parse(str(drive_item.created_datetime)) if drive_item.created_datetime else None
         )
         return FileData(
             identifier=drive_item.id,
@@ -156,7 +158,7 @@ class OnedriveIndexer(Indexer):
             url=drive_item.parent_reference.path + "/" + drive_item.name,
             version=drive_item.etag,
             date_modified=str(date_modified_dt.timestamp()) if date_modified_dt else None,
-            date_created=str(date_created_at.timestamp()) if
+            date_created=str(date_created_at.timestamp()) if date_created_at else None,
             date_processed=str(time()),
             record_locator={
                 "user_pname": self.connection_config.user_pname,
unstructured_ingest/v2/processes/connectors/pinecone.py

@@ -27,6 +27,7 @@ if TYPE_CHECKING:

 CONNECTOR_TYPE = "pinecone"
 MAX_PAYLOAD_SIZE = 2 * 1024 * 1024  # 2MB
+MAX_POOL_THREADS = 100


 class PineconeAccessConfig(AccessConfig):
@@ -45,7 +46,7 @@ class PineconeConnectionConfig(ConnectionConfig):
     )

     @requires_dependencies(["pinecone"], extras="pinecone")
-    def get_index(self) -> "PineconeIndex":
+    def get_index(self, **index_kwargs) -> "PineconeIndex":
         from pinecone import Pinecone

         from unstructured_ingest import __version__ as unstructured_version
@@ -55,7 +56,7 @@ class PineconeConnectionConfig(ConnectionConfig):
             source_tag=f"unstructured_ingest=={unstructured_version}",
         )

-        index = pc.Index(self.index_name)
+        index = pc.Index(name=self.index_name, **index_kwargs)
         logger.debug(f"Connected to index: {pc.describe_index(self.index_name)}")
         return index

@@ -65,7 +66,13 @@ class PineconeUploadStagerConfig(UploadStagerConfig):


 class PineconeUploaderConfig(UploaderConfig):
-    batch_size: int = Field(
+    batch_size: Optional[int] = Field(
+        default=None,
+        description="Optional number of records per batch. Will otherwise limit by size.",
+    )
+    pool_threads: Optional[int] = Field(
+        default=1, description="Optional limit on number of threads to use for upload"
+    )


 ALLOWED_FIELDS = (
@@ -149,29 +156,44 @@ class PineconeUploader(Uploader):
             raise DestinationConnectionError(f"failed to validate connection: {e}")

     @requires_dependencies(["pinecone"], extras="pinecone")
-    def
+    def upsert_batches_async(self, elements_dict: list[dict]):
         from pinecone.exceptions import PineconeApiException

-
-
-
-
-
-
+        chunks = list(
+            generator_batching_wbytes(
+                iterable=elements_dict,
+                batch_size_limit_bytes=MAX_PAYLOAD_SIZE - 100,
+                max_batch_size=self.upload_config.batch_size,
+            )
+        )
+        logger.info(f"Split doc with {len(elements_dict)} elements into {len(chunks)} batches")
+
+        max_pool_threads = min(len(chunks), MAX_POOL_THREADS)
+        if self.upload_config.pool_threads:
+            pool_threads = min(self.upload_config.pool_threads, max_pool_threads)
+        else:
+            pool_threads = max_pool_threads
+        index = self.connection_config.get_index(pool_threads=pool_threads)
+        with index:
+            async_results = [index.upsert(vectors=chunk, async_req=True) for chunk in chunks]
+            # Wait for and retrieve responses (this raises in case of error)
+            try:
+                results = [async_result.get() for async_result in async_results]
+            except PineconeApiException as api_error:
+                raise DestinationConnectionError(f"http error: {api_error}") from api_error
+            logger.debug(f"results: {results}")

     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         with path.open("r") as file:
             elements_dict = json.load(file)
         logger.info(
-            f"writing
+            f"writing a total of {len(elements_dict)} elements via"
+            f" document batches to destination"
             f" index named {self.connection_config.index_name}"
             f" with batch size {self.upload_config.batch_size}"
         )

-
-            elements_dict, MAX_PAYLOAD_SIZE - 100, self.upload_config.batch_size
-        ):
-            self.upsert_batch(batch=batch)
+        self.upsert_batches_async(elements_dict=elements_dict)


 pinecone_destination_entry = DestinationRegistryEntry(
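The new upload path batches elements by payload size and record count, then fans the upserts out over a Pinecone client thread pool. Below is a standalone sketch of the pool-sizing arithmetic from the hunk above; the helper name and the example numbers are illustrative, while the logic mirrors the diff.

```python
from typing import Optional

MAX_POOL_THREADS = 100  # module constant added in the diff


def resolve_pool_threads(num_chunks: int, configured_pool_threads: Optional[int]) -> int:
    # Never spin up more threads than there are batches, and never more than the cap.
    max_pool_threads = min(num_chunks, MAX_POOL_THREADS)
    if configured_pool_threads:
        return min(configured_pool_threads, max_pool_threads)
    return max_pool_threads


assert resolve_pool_threads(250, 1) == 1       # default pool_threads=1 stays serial
assert resolve_pool_threads(250, None) == 100  # unset -> capped by MAX_POOL_THREADS
assert resolve_pool_threads(8, 16) == 8        # never more threads than batches
```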
unstructured_ingest/v2/processes/connectors/sharepoint.py

@@ -139,7 +139,7 @@ class SharepointConnectionConfig(ConnectionConfig):

 class SharepointIndexerConfig(IndexerConfig):
     path: Optional[str] = Field(
-
+        default=None,
         description="Path from which to start parsing files. If the connector is to \
             process all sites within the tenant this filter will be applied to \
             all sites document libraries.",
unstructured_ingest/v2/processes/embedder.py

@@ -1,3 +1,4 @@
+import json
 from abc import ABC
 from dataclasses import dataclass
 from pathlib import Path
@@ -5,11 +6,10 @@ from typing import TYPE_CHECKING, Any, Literal, Optional

 from pydantic import BaseModel, Field, SecretStr

-from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces.process import BaseProcess

 if TYPE_CHECKING:
-    from
+    from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder


 class EmbedderConfig(BaseModel):
@@ -21,6 +21,7 @@ class EmbedderConfig(BaseModel):
             "langchain-vertexai",
             "langchain-voyageai",
             "octoai",
+            "mixedbread-ai",
         ]
     ] = Field(default=None, description="Type of the embedding class to be used.")
     embedding_api_key: Optional[SecretStr] = Field(
@@ -42,30 +43,31 @@ class EmbedderConfig(BaseModel):
         default="us-west-2", description="AWS region used for AWS-based embedders, such as bedrock"
     )

-    @requires_dependencies(dependencies=["unstructured"], extras="embed-huggingface")
     def get_huggingface_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
-        from
+        from unstructured_ingest.embed.huggingface import (
             HuggingFaceEmbeddingConfig,
             HuggingFaceEmbeddingEncoder,
         )

-        return HuggingFaceEmbeddingEncoder(
+        return HuggingFaceEmbeddingEncoder(
+            config=HuggingFaceEmbeddingConfig.model_validate(embedding_kwargs)
+        )

-    @requires_dependencies(dependencies=["unstructured"], extras="openai")
     def get_openai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
-        from
+        from unstructured_ingest.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder

-        return OpenAIEmbeddingEncoder(config=OpenAIEmbeddingConfig(
+        return OpenAIEmbeddingEncoder(config=OpenAIEmbeddingConfig.model_validate(embedding_kwargs))

-    @requires_dependencies(dependencies=["unstructured"], extras="embed-octoai")
     def get_octoai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
-        from
+        from unstructured_ingest.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder

-        return OctoAIEmbeddingEncoder(config=OctoAiEmbeddingConfig(
+        return OctoAIEmbeddingEncoder(config=OctoAiEmbeddingConfig.model_validate(embedding_kwargs))

-    @requires_dependencies(dependencies=["unstructured"], extras="bedrock")
     def get_bedrock_embedder(self) -> "BaseEmbeddingEncoder":
-        from
+        from unstructured_ingest.embed.bedrock import (
+            BedrockEmbeddingConfig,
+            BedrockEmbeddingEncoder,
+        )

         return BedrockEmbeddingEncoder(
             config=BedrockEmbeddingConfig(
@@ -75,20 +77,35 @@ class EmbedderConfig(BaseModel):
             )
         )

-    @requires_dependencies(dependencies=["unstructured"], extras="embed-vertexai")
     def get_vertexai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
-        from
+        from unstructured_ingest.embed.vertexai import (
             VertexAIEmbeddingConfig,
             VertexAIEmbeddingEncoder,
         )

-        return VertexAIEmbeddingEncoder(
+        return VertexAIEmbeddingEncoder(
+            config=VertexAIEmbeddingConfig.model_validate(embedding_kwargs)
+        )

-    @requires_dependencies(dependencies=["unstructured"], extras="embed-voyageai")
     def get_voyageai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
-        from
+        from unstructured_ingest.embed.voyageai import (
+            VoyageAIEmbeddingConfig,
+            VoyageAIEmbeddingEncoder,
+        )
+
+        return VoyageAIEmbeddingEncoder(
+            config=VoyageAIEmbeddingConfig.model_validate(embedding_kwargs)
+        )

-
+    def get_mixedbread_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
+        from unstructured_ingest.embed.mixedbreadai import (
+            MixedbreadAIEmbeddingConfig,
+            MixedbreadAIEmbeddingEncoder,
+        )
+
+        return MixedbreadAIEmbeddingEncoder(
+            config=MixedbreadAIEmbeddingConfig.model_validate(embedding_kwargs)
+        )

     def get_embedder(self) -> "BaseEmbeddingEncoder":
         kwargs: dict[str, Any] = {}
@@ -114,6 +131,8 @@ class EmbedderConfig(BaseModel):

         if self.embedding_provider == "langchain-voyageai":
             return self.get_voyageai_embedder(embedding_kwargs=kwargs)
+        if self.embedding_provider == "mixedbread-ai":
+            return self.get_mixedbread_embedder(embedding_kwargs=kwargs)

         raise ValueError(f"{self.embedding_provider} not a recognized encoder")

@@ -122,14 +141,12 @@ class EmbedderConfig(BaseModel):
 class Embedder(BaseProcess, ABC):
     config: EmbedderConfig

-    @requires_dependencies(dependencies=["unstructured"])
     def run(self, elements_filepath: Path, **kwargs: Any) -> list[dict]:
-        from unstructured.staging.base import elements_from_json
-
         # TODO update base embedder classes to support async
         embedder = self.config.get_embedder()
-
+        with elements_filepath.open("r") as elements_file:
+            elements = json.load(elements_file)
         if not elements:
             return [e.to_dict() for e in elements]
         embedded_elements = embedder.embed_documents(elements=elements)
-        return
+        return embedded_elements
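The v2 `EmbedderConfig` drops the `requires_dependencies` guards, builds each provider config with `model_validate`, and recognizes the new `mixedbread-ai` provider. A hypothetical usage sketch follows: `embedding_provider` and `embedding_api_key` are fields shown in the diff, while the key value and the element dicts are placeholders.

```python
from unstructured_ingest.v2.processes.embedder import EmbedderConfig

config = EmbedderConfig(
    embedding_provider="mixedbread-ai",
    embedding_api_key="mxb-...",  # placeholder secret
)
encoder = config.get_embedder()  # dispatches to get_mixedbread_embedder()
elements = [{"text": "hello world", "type": "NarrativeText"}]  # illustrative element dicts
embedded = encoder.embed_documents(elements=elements)  # same call the Embedder.run() step makes
```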