unstructured-ingest: 0.0.13 (py3-none-any.whl) → 0.0.15 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/interfaces.py +1 -1
- unstructured_ingest/cli/utils.py +1 -1
- unstructured_ingest/connector/astradb.py +1 -1
- unstructured_ingest/connector/biomed.py +4 -4
- unstructured_ingest/connector/chroma.py +1 -1
- unstructured_ingest/connector/databricks_volumes.py +2 -2
- unstructured_ingest/connector/fsspec/box.py +1 -1
- unstructured_ingest/connector/fsspec/fsspec.py +5 -5
- unstructured_ingest/connector/git.py +1 -1
- unstructured_ingest/connector/google_drive.py +4 -4
- unstructured_ingest/connector/hubspot.py +1 -1
- unstructured_ingest/connector/kafka.py +8 -8
- unstructured_ingest/connector/local.py +1 -1
- unstructured_ingest/connector/notion/helpers.py +4 -4
- unstructured_ingest/connector/onedrive.py +3 -3
- unstructured_ingest/connector/outlook.py +2 -2
- unstructured_ingest/connector/pinecone.py +1 -1
- unstructured_ingest/connector/sharepoint.py +8 -8
- unstructured_ingest/connector/vectara.py +6 -6
- unstructured_ingest/embed/__init__.py +17 -0
- unstructured_ingest/embed/bedrock.py +70 -0
- unstructured_ingest/embed/huggingface.py +73 -0
- unstructured_ingest/embed/interfaces.py +36 -0
- unstructured_ingest/embed/mixedbreadai.py +177 -0
- unstructured_ingest/embed/octoai.py +63 -0
- unstructured_ingest/embed/openai.py +61 -0
- unstructured_ingest/embed/vertexai.py +88 -0
- unstructured_ingest/embed/voyageai.py +69 -0
- unstructured_ingest/interfaces.py +21 -11
- unstructured_ingest/logger.py +1 -1
- unstructured_ingest/pipeline/copy.py +1 -1
- unstructured_ingest/pipeline/interfaces.py +2 -2
- unstructured_ingest/pipeline/partition.py +1 -1
- unstructured_ingest/pipeline/pipeline.py +1 -1
- unstructured_ingest/pipeline/reformat/chunking.py +2 -2
- unstructured_ingest/pipeline/reformat/embedding.py +4 -6
- unstructured_ingest/pipeline/source.py +2 -2
- unstructured_ingest/utils/compression.py +3 -3
- unstructured_ingest/utils/data_prep.py +20 -12
- unstructured_ingest/utils/string_and_date_utils.py +2 -2
- unstructured_ingest/v2/cli/base/cmd.py +3 -3
- unstructured_ingest/v2/cli/base/dest.py +1 -1
- unstructured_ingest/v2/cli/base/src.py +3 -2
- unstructured_ingest/v2/cli/utils/click.py +1 -1
- unstructured_ingest/v2/interfaces/processor.py +48 -13
- unstructured_ingest/v2/logger.py +1 -1
- unstructured_ingest/v2/otel.py +1 -1
- unstructured_ingest/v2/pipeline/interfaces.py +12 -3
- unstructured_ingest/v2/pipeline/pipeline.py +42 -29
- unstructured_ingest/v2/pipeline/steps/chunk.py +3 -3
- unstructured_ingest/v2/pipeline/steps/download.py +17 -2
- unstructured_ingest/v2/pipeline/steps/embed.py +3 -3
- unstructured_ingest/v2/pipeline/steps/filter.py +1 -1
- unstructured_ingest/v2/pipeline/steps/index.py +2 -2
- unstructured_ingest/v2/pipeline/steps/partition.py +3 -3
- unstructured_ingest/v2/pipeline/steps/stage.py +1 -1
- unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -1
- unstructured_ingest/v2/processes/connectors/__init__.py +3 -0
- unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +6 -1
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +1 -1
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +1 -1
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +4 -4
- unstructured_ingest/v2/processes/connectors/google_drive.py +2 -3
- unstructured_ingest/v2/processes/connectors/local.py +6 -5
- unstructured_ingest/v2/processes/connectors/milvus.py +1 -1
- unstructured_ingest/v2/processes/connectors/onedrive.py +8 -6
- unstructured_ingest/v2/processes/connectors/opensearch.py +1 -1
- unstructured_ingest/v2/processes/connectors/pinecone.py +38 -16
- unstructured_ingest/v2/processes/connectors/sharepoint.py +10 -6
- unstructured_ingest/v2/processes/embedder.py +41 -24
- unstructured_ingest/v2/processes/filter.py +1 -1
- unstructured_ingest/v2/processes/partitioner.py +3 -3
- unstructured_ingest/v2/utils.py +7 -0
- {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/METADATA +212 -211
- {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/RECORD +81 -72
- unstructured_ingest/evaluate.py +0 -338
- {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/top_level.txt +0 -0
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import json
|
|
1
2
|
from abc import ABC
|
|
2
3
|
from dataclasses import dataclass
|
|
3
4
|
from pathlib import Path
|
|
@@ -5,11 +6,10 @@ from typing import TYPE_CHECKING, Any, Literal, Optional
|
|
|
5
6
|
|
|
6
7
|
from pydantic import BaseModel, Field, SecretStr
|
|
7
8
|
|
|
8
|
-
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
9
9
|
from unstructured_ingest.v2.interfaces.process import BaseProcess
|
|
10
10
|
|
|
11
11
|
if TYPE_CHECKING:
|
|
12
|
-
from
|
|
12
|
+
from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder
|
|
13
13
|
|
|
14
14
|
|
|
15
15
|
class EmbedderConfig(BaseModel):
|
|
@@ -21,6 +21,7 @@ class EmbedderConfig(BaseModel):
|
|
|
21
21
|
"langchain-vertexai",
|
|
22
22
|
"langchain-voyageai",
|
|
23
23
|
"octoai",
|
|
24
|
+
"mixedbread-ai",
|
|
24
25
|
]
|
|
25
26
|
] = Field(default=None, description="Type of the embedding class to be used.")
|
|
26
27
|
embedding_api_key: Optional[SecretStr] = Field(
|
|
@@ -42,30 +43,31 @@ class EmbedderConfig(BaseModel):
|
|
|
42
43
|
default="us-west-2", description="AWS region used for AWS-based embedders, such as bedrock"
|
|
43
44
|
)
|
|
44
45
|
|
|
45
|
-
@requires_dependencies(dependencies=["unstructured"], extras="embed-huggingface")
|
|
46
46
|
def get_huggingface_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
|
|
47
|
-
from
|
|
47
|
+
from unstructured_ingest.embed.huggingface import (
|
|
48
48
|
HuggingFaceEmbeddingConfig,
|
|
49
49
|
HuggingFaceEmbeddingEncoder,
|
|
50
50
|
)
|
|
51
51
|
|
|
52
|
-
return HuggingFaceEmbeddingEncoder(
|
|
52
|
+
return HuggingFaceEmbeddingEncoder(
|
|
53
|
+
config=HuggingFaceEmbeddingConfig.model_validate(embedding_kwargs)
|
|
54
|
+
)
|
|
53
55
|
|
|
54
|
-
@requires_dependencies(dependencies=["unstructured"], extras="openai")
|
|
55
56
|
def get_openai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
|
|
56
|
-
from
|
|
57
|
+
from unstructured_ingest.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder
|
|
57
58
|
|
|
58
|
-
return OpenAIEmbeddingEncoder(config=OpenAIEmbeddingConfig(
|
|
59
|
+
return OpenAIEmbeddingEncoder(config=OpenAIEmbeddingConfig.model_validate(embedding_kwargs))
|
|
59
60
|
|
|
60
|
-
@requires_dependencies(dependencies=["unstructured"], extras="embed-octoai")
|
|
61
61
|
def get_octoai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
|
|
62
|
-
from
|
|
62
|
+
from unstructured_ingest.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder
|
|
63
63
|
|
|
64
|
-
return OctoAIEmbeddingEncoder(config=OctoAiEmbeddingConfig(
|
|
64
|
+
return OctoAIEmbeddingEncoder(config=OctoAiEmbeddingConfig.model_validate(embedding_kwargs))
|
|
65
65
|
|
|
66
|
-
@requires_dependencies(dependencies=["unstructured"], extras="bedrock")
|
|
67
66
|
def get_bedrock_embedder(self) -> "BaseEmbeddingEncoder":
|
|
68
|
-
from
|
|
67
|
+
from unstructured_ingest.embed.bedrock import (
|
|
68
|
+
BedrockEmbeddingConfig,
|
|
69
|
+
BedrockEmbeddingEncoder,
|
|
70
|
+
)
|
|
69
71
|
|
|
70
72
|
return BedrockEmbeddingEncoder(
|
|
71
73
|
config=BedrockEmbeddingConfig(
|
|
@@ -75,20 +77,35 @@ class EmbedderConfig(BaseModel):
|
|
|
75
77
|
)
|
|
76
78
|
)
|
|
77
79
|
|
|
78
|
-
@requires_dependencies(dependencies=["unstructured"], extras="embed-vertexai")
|
|
79
80
|
def get_vertexai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
|
|
80
|
-
from
|
|
81
|
+
from unstructured_ingest.embed.vertexai import (
|
|
81
82
|
VertexAIEmbeddingConfig,
|
|
82
83
|
VertexAIEmbeddingEncoder,
|
|
83
84
|
)
|
|
84
85
|
|
|
85
|
-
return VertexAIEmbeddingEncoder(
|
|
86
|
+
return VertexAIEmbeddingEncoder(
|
|
87
|
+
config=VertexAIEmbeddingConfig.model_validate(embedding_kwargs)
|
|
88
|
+
)
|
|
86
89
|
|
|
87
|
-
@requires_dependencies(dependencies=["unstructured"], extras="embed-voyageai")
|
|
88
90
|
def get_voyageai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
|
|
89
|
-
from
|
|
91
|
+
from unstructured_ingest.embed.voyageai import (
|
|
92
|
+
VoyageAIEmbeddingConfig,
|
|
93
|
+
VoyageAIEmbeddingEncoder,
|
|
94
|
+
)
|
|
95
|
+
|
|
96
|
+
return VoyageAIEmbeddingEncoder(
|
|
97
|
+
config=VoyageAIEmbeddingConfig.model_validate(embedding_kwargs)
|
|
98
|
+
)
|
|
90
99
|
|
|
91
|
-
|
|
100
|
+
def get_mixedbread_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
|
|
101
|
+
from unstructured_ingest.embed.mixedbreadai import (
|
|
102
|
+
MixedbreadAIEmbeddingConfig,
|
|
103
|
+
MixedbreadAIEmbeddingEncoder,
|
|
104
|
+
)
|
|
105
|
+
|
|
106
|
+
return MixedbreadAIEmbeddingEncoder(
|
|
107
|
+
config=MixedbreadAIEmbeddingConfig.model_validate(embedding_kwargs)
|
|
108
|
+
)
|
|
92
109
|
|
|
93
110
|
def get_embedder(self) -> "BaseEmbeddingEncoder":
|
|
94
111
|
kwargs: dict[str, Any] = {}
|
|
@@ -114,6 +131,8 @@ class EmbedderConfig(BaseModel):
|
|
|
114
131
|
|
|
115
132
|
if self.embedding_provider == "langchain-voyageai":
|
|
116
133
|
return self.get_voyageai_embedder(embedding_kwargs=kwargs)
|
|
134
|
+
if self.embedding_provider == "mixedbread-ai":
|
|
135
|
+
return self.get_mixedbread_embedder(embedding_kwargs=kwargs)
|
|
117
136
|
|
|
118
137
|
raise ValueError(f"{self.embedding_provider} not a recognized encoder")
|
|
119
138
|
|
|
@@ -122,14 +141,12 @@ class EmbedderConfig(BaseModel):
|
|
|
122
141
|
class Embedder(BaseProcess, ABC):
|
|
123
142
|
config: EmbedderConfig
|
|
124
143
|
|
|
125
|
-
@requires_dependencies(dependencies=["unstructured"])
|
|
126
144
|
def run(self, elements_filepath: Path, **kwargs: Any) -> list[dict]:
|
|
127
|
-
from unstructured.staging.base import elements_from_json
|
|
128
|
-
|
|
129
145
|
# TODO update base embedder classes to support async
|
|
130
146
|
embedder = self.config.get_embedder()
|
|
131
|
-
|
|
147
|
+
with elements_filepath.open("r") as elements_file:
|
|
148
|
+
elements = json.load(elements_file)
|
|
132
149
|
if not elements:
|
|
133
150
|
return [e.to_dict() for e in elements]
|
|
134
151
|
embedded_elements = embedder.embed_documents(elements=elements)
|
|
135
|
-
return
|
|
152
|
+
return embedded_elements
|
|
@@ -47,7 +47,7 @@ class Filterer(BaseProcess, ABC):
|
|
|
47
47
|
for pattern in patterns:
|
|
48
48
|
if fnmatch.filter([path], pattern):
|
|
49
49
|
return True
|
|
50
|
-
logger.debug(f"
|
|
50
|
+
logger.debug(f"the file {path!r} is discarded as it does not match any given glob.")
|
|
51
51
|
return False
|
|
52
52
|
|
|
53
53
|
def run(self, file_data: FileData, **kwargs: Any) -> Optional[FileData]:
|
|
@@ -145,7 +145,7 @@ class Partitioner(BaseProcess, ABC):
|
|
|
145
145
|
class FileDataSourceMetadata(DataSourceMetadata):
|
|
146
146
|
filesize_bytes: Optional[int] = None
|
|
147
147
|
|
|
148
|
-
logger.debug(f"
|
|
148
|
+
logger.debug(f"using local partition with kwargs: {self.config.to_partition_kwargs()}")
|
|
149
149
|
logger.debug(f"partitioning file {filename} with metadata {metadata}")
|
|
150
150
|
elements = partition(
|
|
151
151
|
filename=str(filename.resolve()),
|
|
@@ -165,7 +165,7 @@ class Partitioner(BaseProcess, ABC):
|
|
|
165
165
|
|
|
166
166
|
partition_request = self.config.to_partition_kwargs()
|
|
167
167
|
|
|
168
|
-
#
|
|
168
|
+
# NOTE(austin): PartitionParameters is a Pydantic model in v0.26.0
|
|
169
169
|
# Prior to this it was a dataclass which doesn't have .__fields
|
|
170
170
|
try:
|
|
171
171
|
possible_fields = PartitionParameters.__fields__
|
|
@@ -182,7 +182,7 @@ class Partitioner(BaseProcess, ABC):
|
|
|
182
182
|
", ".join([v for v in partition_request if v not in filtered_partition_request])
|
|
183
183
|
)
|
|
184
184
|
)
|
|
185
|
-
logger.debug(f"
|
|
185
|
+
logger.debug(f"using hosted partitioner with kwargs: {partition_request}")
|
|
186
186
|
with open(filename, "rb") as f:
|
|
187
187
|
files = Files(
|
|
188
188
|
content=f.read(),
|
unstructured_ingest/v2/utils.py
CHANGED
|
@@ -20,6 +20,11 @@ def is_secret(value: Any) -> bool:
|
|
|
20
20
|
def serialize_base_model(model: BaseModel) -> dict:
|
|
21
21
|
# To get the full serialized dict regardless of if values are marked as Secret
|
|
22
22
|
model_dict = model.dict()
|
|
23
|
+
return serialize_base_dict(model_dict=model_dict)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def serialize_base_dict(model_dict: dict) -> dict:
|
|
27
|
+
model_dict = model_dict.copy()
|
|
23
28
|
for k, v in model_dict.items():
|
|
24
29
|
if isinstance(v, _SecretBase):
|
|
25
30
|
secret_value = v.get_secret_value()
|
|
@@ -27,6 +32,8 @@ def serialize_base_model(model: BaseModel) -> dict:
|
|
|
27
32
|
model_dict[k] = serialize_base_model(model=secret_value)
|
|
28
33
|
else:
|
|
29
34
|
model_dict[k] = secret_value
|
|
35
|
+
if isinstance(v, dict):
|
|
36
|
+
model_dict[k] = serialize_base_dict(model_dict=v)
|
|
30
37
|
|
|
31
38
|
return model_dict
|
|
32
39
|
|