unstructured-ingest 0.5.6__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

@@ -50,7 +50,7 @@ def test_raw_mixedbread_embedder(embedder_file: Path):
50
50
  embedder=embedder,
51
51
  embedder_file=embedder_file,
52
52
  expected_dimension=1024,
53
- expected_is_unit_vector=False,
53
+ expected_is_unit_vector=True,
54
54
  )
55
55
 
56
56
 
@@ -67,5 +67,5 @@ async def test_raw_async_mixedbread_embedder(embedder_file: Path):
67
67
  embedder=embedder,
68
68
  embedder_file=embedder_file,
69
69
  expected_dimension=1024,
70
- expected_is_unit_vector=False,
70
+ expected_is_unit_vector=True,
71
71
  )
@@ -61,3 +61,19 @@ async def test_raw_async_voyageai_embedder(embedder_file: Path):
61
61
  await validate_raw_embedder_async(
62
62
  embedder=embedder, embedder_file=embedder_file, expected_dimension=1024
63
63
  )
64
+
65
+
66
+ @requires_env(API_KEY)
67
+ def test_voyageai_multimodal_embedder(embedder_file: Path):
68
+ api_key = get_api_key()
69
+ embedder_config = EmbedderConfig(
70
+ embedding_provider="voyageai",
71
+ embedding_api_key=api_key,
72
+ embedding_model_name="voyage-multimodal-3",
73
+ )
74
+ embedder = Embedder(config=embedder_config)
75
+ results = embedder.run(elements_filepath=embedder_file)
76
+ assert results
77
+ with embedder_file.open("r") as f:
78
+ original_elements = json.load(f)
79
+ validate_embedding_output(original_elements=original_elements, output_elements=results)
@@ -1 +1 @@
1
- __version__ = "0.5.6" # pragma: no cover
1
+ __version__ = "0.5.7" # pragma: no cover
@@ -49,7 +49,7 @@ class BaseEmbeddingEncoder(BaseEncoder, ABC):
49
49
  def is_unit_vector(self) -> bool:
50
50
  """Denotes if the embedding vector is a unit vector."""
51
51
  exemplary_embedding = self.get_exemplary_embedding()
52
- return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
52
+ return np.isclose(np.linalg.norm(exemplary_embedding), 1.0, rtol=1e-03)
53
53
 
54
54
  def get_client(self):
55
55
  raise NotImplementedError
@@ -103,7 +103,7 @@ class AsyncBaseEmbeddingEncoder(BaseEncoder, ABC):
103
103
  async def is_unit_vector(self) -> bool:
104
104
  """Denotes if the embedding vector is a unit vector."""
105
105
  exemplary_embedding = await self.get_exemplary_embedding()
106
- return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
106
+ return np.isclose(np.linalg.norm(exemplary_embedding), 1.0, rtol=1e-03)
107
107
 
108
108
  def get_client(self):
109
109
  raise NotImplementedError
@@ -96,7 +96,11 @@ class VoyageAIEmbeddingEncoder(BaseEmbeddingEncoder):
96
96
  return self.config.get_client()
97
97
 
98
98
  def embed_batch(self, client: "VoyageAIClient", batch: list[str]) -> list[list[float]]:
99
- response = client.embed(texts=batch, model=self.config.embedder_model_name)
99
+ if self.config.embedder_model_name == "voyage-multimodal-3":
100
+ batch = [[text] for text in batch]
101
+ response = client.multimodal_embed(inputs=batch, model=self.config.embedder_model_name)
102
+ else:
103
+ response = client.embed(texts=batch, model=self.config.embedder_model_name)
100
104
  return response.embeddings
101
105
 
102
106
 
@@ -113,5 +117,11 @@ class AsyncVoyageAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
113
117
  async def embed_batch(
114
118
  self, client: "AsyncVoyageAIClient", batch: list[str]
115
119
  ) -> list[list[float]]:
116
- response = await client.embed(texts=batch, model=self.config.embedder_model_name)
120
+ if self.config.embedder_model_name == "voyage-multimodal-3":
121
+ batch = [[text] for text in batch]
122
+ response = await client.multimodal_embed(
123
+ inputs=batch, model=self.config.embedder_model_name
124
+ )
125
+ else:
126
+ response = await client.embed(texts=batch, model=self.config.embedder_model_name)
117
127
  return response.embeddings
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: unstructured-ingest
3
- Version: 0.5.6
3
+ Version: 0.5.7
4
4
  Summary: A library that prepares raw documents for downstream ML tasks.
5
5
  Home-page: https://github.com/Unstructured-IO/unstructured-ingest
6
6
  Author: Unstructured Technologies
@@ -22,12 +22,12 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
22
  Requires-Python: >=3.9.0,<3.14
23
23
  Description-Content-Type: text/markdown
24
24
  License-File: LICENSE.md
25
+ Requires-Dist: opentelemetry-sdk
26
+ Requires-Dist: pydantic>=2.7
25
27
  Requires-Dist: dataclasses_json
26
- Requires-Dist: click
27
28
  Requires-Dist: python-dateutil
28
29
  Requires-Dist: tqdm
29
- Requires-Dist: pydantic>=2.7
30
- Requires-Dist: opentelemetry-sdk
30
+ Requires-Dist: click
31
31
  Requires-Dist: pandas
32
32
  Provides-Extra: remote
33
33
  Requires-Dist: unstructured-client>=0.26.1; extra == "remote"
@@ -74,8 +74,8 @@ Provides-Extra: biomed
74
74
  Requires-Dist: bs4; extra == "biomed"
75
75
  Requires-Dist: requests; extra == "biomed"
76
76
  Provides-Extra: box
77
- Requires-Dist: fsspec; extra == "box"
78
77
  Requires-Dist: boxfs; extra == "box"
78
+ Requires-Dist: fsspec; extra == "box"
79
79
  Provides-Extra: chroma
80
80
  Requires-Dist: chromadb; extra == "chroma"
81
81
  Provides-Extra: clarifai
@@ -102,8 +102,8 @@ Requires-Dist: bs4; extra == "gcs"
102
102
  Requires-Dist: fsspec; extra == "gcs"
103
103
  Requires-Dist: gcsfs; extra == "gcs"
104
104
  Provides-Extra: github
105
- Requires-Dist: pygithub>1.58.0; extra == "github"
106
105
  Requires-Dist: requests; extra == "github"
106
+ Requires-Dist: pygithub>1.58.0; extra == "github"
107
107
  Provides-Extra: gitlab
108
108
  Requires-Dist: python-gitlab; extra == "gitlab"
109
109
  Provides-Extra: google-drive
@@ -124,23 +124,23 @@ Requires-Dist: pymilvus; extra == "milvus"
124
124
  Provides-Extra: mongodb
125
125
  Requires-Dist: pymongo; extra == "mongodb"
126
126
  Provides-Extra: neo4j
127
- Requires-Dist: cymple; extra == "neo4j"
128
127
  Requires-Dist: neo4j-rust-ext; extra == "neo4j"
128
+ Requires-Dist: cymple; extra == "neo4j"
129
129
  Requires-Dist: networkx; extra == "neo4j"
130
130
  Provides-Extra: notion
131
- Requires-Dist: backoff; extra == "notion"
132
- Requires-Dist: httpx; extra == "notion"
133
131
  Requires-Dist: notion-client; extra == "notion"
132
+ Requires-Dist: backoff; extra == "notion"
134
133
  Requires-Dist: htmlBuilder; extra == "notion"
134
+ Requires-Dist: httpx; extra == "notion"
135
135
  Provides-Extra: onedrive
136
- Requires-Dist: bs4; extra == "onedrive"
137
- Requires-Dist: msal; extra == "onedrive"
138
136
  Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
137
+ Requires-Dist: msal; extra == "onedrive"
138
+ Requires-Dist: bs4; extra == "onedrive"
139
139
  Provides-Extra: opensearch
140
140
  Requires-Dist: opensearch-py; extra == "opensearch"
141
141
  Provides-Extra: outlook
142
- Requires-Dist: msal; extra == "outlook"
143
142
  Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
143
+ Requires-Dist: msal; extra == "outlook"
144
144
  Provides-Extra: pinecone
145
145
  Requires-Dist: pinecone-client>=3.7.1; extra == "pinecone"
146
146
  Provides-Extra: postgres
@@ -152,11 +152,11 @@ Requires-Dist: praw; extra == "reddit"
152
152
  Provides-Extra: redis
153
153
  Requires-Dist: redis; extra == "redis"
154
154
  Provides-Extra: s3
155
- Requires-Dist: fsspec; extra == "s3"
156
155
  Requires-Dist: s3fs; extra == "s3"
156
+ Requires-Dist: fsspec; extra == "s3"
157
157
  Provides-Extra: sharepoint
158
- Requires-Dist: msal; extra == "sharepoint"
159
158
  Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
159
+ Requires-Dist: msal; extra == "sharepoint"
160
160
  Provides-Extra: salesforce
161
161
  Requires-Dist: simple-salesforce; extra == "salesforce"
162
162
  Provides-Extra: sftp
@@ -165,8 +165,8 @@ Requires-Dist: fsspec; extra == "sftp"
165
165
  Provides-Extra: slack
166
166
  Requires-Dist: slack_sdk[optional]; extra == "slack"
167
167
  Provides-Extra: snowflake
168
- Requires-Dist: snowflake-connector-python; extra == "snowflake"
169
168
  Requires-Dist: psycopg2-binary; extra == "snowflake"
169
+ Requires-Dist: snowflake-connector-python; extra == "snowflake"
170
170
  Provides-Extra: wikipedia
171
171
  Requires-Dist: wikipedia; extra == "wikipedia"
172
172
  Provides-Extra: weaviate
@@ -182,14 +182,14 @@ Requires-Dist: httpx; extra == "vectara"
182
182
  Requires-Dist: requests; extra == "vectara"
183
183
  Requires-Dist: aiofiles; extra == "vectara"
184
184
  Provides-Extra: vastdb
185
- Requires-Dist: vastdb; extra == "vastdb"
186
185
  Requires-Dist: pyarrow; extra == "vastdb"
187
186
  Requires-Dist: ibis; extra == "vastdb"
187
+ Requires-Dist: vastdb; extra == "vastdb"
188
188
  Provides-Extra: embed-huggingface
189
189
  Requires-Dist: sentence-transformers; extra == "embed-huggingface"
190
190
  Provides-Extra: embed-octoai
191
- Requires-Dist: openai; extra == "embed-octoai"
192
191
  Requires-Dist: tiktoken; extra == "embed-octoai"
192
+ Requires-Dist: openai; extra == "embed-octoai"
193
193
  Provides-Extra: embed-vertexai
194
194
  Requires-Dist: vertexai; extra == "embed-vertexai"
195
195
  Provides-Extra: embed-voyageai
@@ -197,8 +197,8 @@ Requires-Dist: voyageai; extra == "embed-voyageai"
197
197
  Provides-Extra: embed-mixedbreadai
198
198
  Requires-Dist: mixedbread-ai; extra == "embed-mixedbreadai"
199
199
  Provides-Extra: openai
200
- Requires-Dist: openai; extra == "openai"
201
200
  Requires-Dist: tiktoken; extra == "openai"
201
+ Requires-Dist: openai; extra == "openai"
202
202
  Provides-Extra: bedrock
203
203
  Requires-Dist: boto3; extra == "bedrock"
204
204
  Requires-Dist: aioboto3; extra == "bedrock"
@@ -60,12 +60,12 @@ test/integration/embedders/conftest.py,sha256=B2W771RbijR7G_GybsCzRyIvOzXqzbKZdR
60
60
  test/integration/embedders/test_azure_openai.py,sha256=YQ3uq2-NuxtTyGsSgMNa10pcITLKMJ4E1scTGFgwujw,1790
61
61
  test/integration/embedders/test_bedrock.py,sha256=vmjoi1uUk-LX4Yz0ZPn6Ry1JdVEsyIhLhPbSPmkeT9o,3553
62
62
  test/integration/embedders/test_huggingface.py,sha256=qFblyXounVNRaNkk3gbKoBqU5E2dNecgKU2Bz2LyOa8,989
63
- test/integration/embedders/test_mixedbread.py,sha256=lLz_cooyC38VSo-FMHbhKpHvYs3QzA20NOIvM5oooaw,1998
63
+ test/integration/embedders/test_mixedbread.py,sha256=oesaTY8H7es72vhctmNVU0oWkHNJQckHd_KD-K6kWxI,1996
64
64
  test/integration/embedders/test_octoai.py,sha256=qs-bqZ7iGWO_BzUZvKJmOHBT3cmFSkEYbleWhj3snJc,2197
65
65
  test/integration/embedders/test_openai.py,sha256=9XioXuvdnbh_3vRmRwpMsi1D5heCcY7KA4nHb5vOU_M,2127
66
66
  test/integration/embedders/test_togetherai.py,sha256=hsg3c3SGJGd93unz4-VLYmFXxLA1vmrD5xK5Gj-g0R4,2205
67
67
  test/integration/embedders/test_vertexai.py,sha256=4-E4plJXFf1b02RhOqOCBHR2GA4gTnc8K4AnHm6EgPU,1830
68
- test/integration/embedders/test_voyageai.py,sha256=Gm3sVjhsym1ASIDfr-sZoCbpsNMaAk_l4E3-dtjRCQ4,1832
68
+ test/integration/embedders/test_voyageai.py,sha256=hf8JP8eSL1MMFsmQ9rErM8oxCcwO6kC1WfzzBn7bnME,2414
69
69
  test/integration/embedders/utils.py,sha256=Sqqg-X31ZV1hojqPQBaZgM2lb2u8cG6s6OnH9JRsFjs,2717
70
70
  test/integration/partitioners/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
71
71
  test/integration/partitioners/test_partitioner.py,sha256=6sdZhhtqEICBPqEgpKrCQIfJ-7hKcwuTFqjWs1mbQf8,2787
@@ -107,7 +107,7 @@ test/unit/v2/partitioners/test_partitioner.py,sha256=iIYg7IpftV3LusoO4H8tr1IHY1U
107
107
  test/unit/v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
108
108
  test/unit/v2/utils/data_generator.py,sha256=UoYVNjG4S4wlaA9gceQ82HIpF9_6I1UTHD1_GrQBHp0,973
109
109
  unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
110
- unstructured_ingest/__version__.py,sha256=8heXQJ79JSGfqiDjjQtqcfkCTWOYFwgErKEt_wwF3c4,42
110
+ unstructured_ingest/__version__.py,sha256=SJI27PQ23gz4_g984Mn5VF7Lgitn3vm0GQDyvqnYbdc,42
111
111
  unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
112
112
  unstructured_ingest/interfaces.py,sha256=7DOnDpGvUNlCoFR7UPRGmOarqH5sFtuUOO5vf8X3oTM,31489
113
113
  unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
@@ -278,13 +278,13 @@ unstructured_ingest/embed/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJW
278
278
  unstructured_ingest/embed/azure_openai.py,sha256=_-I-nwd-wdCiKkSdYBL4UKrTZ2UPWsM_0T69fcObs_I,1707
279
279
  unstructured_ingest/embed/bedrock.py,sha256=tZumLLXafSr1zIFVjckapRoiiY-7u65GPuWmwsdhY0I,7726
280
280
  unstructured_ingest/embed/huggingface.py,sha256=EWU1kd5Cm6ajgCw6hP5w_4pniGSgxnR0wM9vjuPQ6Yk,2334
281
- unstructured_ingest/embed/interfaces.py,sha256=C_R_RZJafm3gq8Q0T-HJs-OlbnXpUrCKl4D7Ryv9-Ss,4894
281
+ unstructured_ingest/embed/interfaces.py,sha256=_-CqasY6R5nnNUY-X6PS5lz8dsmGaUw5zIGRdPfx16o,4918
282
282
  unstructured_ingest/embed/mixedbreadai.py,sha256=-Y0J27G9CL1t3ZTIeNjTjRviErSMAzJRf2zgDgMHUmg,4499
283
283
  unstructured_ingest/embed/octoai.py,sha256=hNLEskDEP-2qWExUgVz2Eyw3KTIFwdUE9elbJ5qp4Ao,3855
284
284
  unstructured_ingest/embed/openai.py,sha256=Fe_17y-YpkiGcfrOxZFmgjV-Y-u8svhDVYyAjV-GeBM,3279
285
285
  unstructured_ingest/embed/togetherai.py,sha256=i1qeX2fwWtUf1vdGOGnpA_bJB__VzU1NQsR8k-KhxIw,2983
286
286
  unstructured_ingest/embed/vertexai.py,sha256=EcXhhm1IbCZVq4KA0sbJjyABu8jpF2ZL3JCqmuxPsjo,3688
287
- unstructured_ingest/embed/voyageai.py,sha256=tuUZ28B_t2sVDVC6EiHdQEQycMEY2nybE3DFcpxSqh0,3992
287
+ unstructured_ingest/embed/voyageai.py,sha256=lsdiTHVE3CMUX4gXdn2AaRJcKPcKptzgYdF2McvQcvA,4496
288
288
  unstructured_ingest/enhanced_dataclass/__init__.py,sha256=gDZOUsv5eo-8jm4Yu7DdDwi101aGbfG7JctTdOYnTOM,151
289
289
  unstructured_ingest/enhanced_dataclass/core.py,sha256=d6aUkDynuKX87cHx9_N5UDUWrvISR4jYRFRTvd_avlI,3038
290
290
  unstructured_ingest/enhanced_dataclass/dataclasses.py,sha256=aZMsoCzAGRb8Rmh3BTSBFtNr6FmFTY93KYGLk3gYJKQ,1949
@@ -567,9 +567,9 @@ unstructured_ingest/v2/processes/connectors/weaviate/cloud.py,sha256=bXtfEYLquR-
567
567
  unstructured_ingest/v2/processes/connectors/weaviate/embedded.py,sha256=S8Zg8StuZT-k7tCg1D5YShO1-vJYYk9-M1bE1fIqx64,3014
568
568
  unstructured_ingest/v2/processes/connectors/weaviate/local.py,sha256=LuTBKPseVewsz8VqxRPRLfGEm3BeI9nBZxpy7ZU5tOA,2201
569
569
  unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py,sha256=yfABJKJGCvPuZ2XCNtDOuCtiscdEAmBCSPPNZnbTKDk,12821
570
- unstructured_ingest-0.5.6.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
571
- unstructured_ingest-0.5.6.dist-info/METADATA,sha256=ts8jHfqXkNXKcF9TL5UqQNHkynZuzjiobUomXaqiYgM,8316
572
- unstructured_ingest-0.5.6.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
573
- unstructured_ingest-0.5.6.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
574
- unstructured_ingest-0.5.6.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
575
- unstructured_ingest-0.5.6.dist-info/RECORD,,
570
+ unstructured_ingest-0.5.7.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
571
+ unstructured_ingest-0.5.7.dist-info/METADATA,sha256=Zr_UTJd0V_0vUjwukPd2BgrEh47hqfLSiwivBPAxJos,8316
572
+ unstructured_ingest-0.5.7.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
573
+ unstructured_ingest-0.5.7.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
574
+ unstructured_ingest-0.5.7.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
575
+ unstructured_ingest-0.5.7.dist-info/RECORD,,