unstructured-ingest 0.4.2__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (38)
  1. test/integration/connectors/weaviate/test_local.py +27 -6
  2. test/integration/embedders/test_azure_openai.py +1 -3
  3. test/integration/embedders/test_bedrock.py +2 -2
  4. test/integration/embedders/test_huggingface.py +1 -3
  5. test/integration/embedders/test_mixedbread.py +2 -2
  6. test/integration/embedders/test_octoai.py +2 -4
  7. test/integration/embedders/test_openai.py +2 -4
  8. test/integration/embedders/test_togetherai.py +2 -2
  9. test/integration/embedders/test_vertexai.py +2 -4
  10. test/integration/embedders/test_voyageai.py +2 -4
  11. test/integration/embedders/utils.py +12 -14
  12. test/unit/embed/test_openai.py +12 -4
  13. test/unit/test_html.py +112 -0
  14. test/unit/v2/embedders/test_voyageai.py +1 -1
  15. unstructured_ingest/__version__.py +1 -1
  16. unstructured_ingest/embed/huggingface.py +6 -1
  17. unstructured_ingest/embed/interfaces.py +9 -6
  18. unstructured_ingest/embed/mixedbreadai.py +3 -10
  19. unstructured_ingest/embed/octoai.py +14 -7
  20. unstructured_ingest/embed/openai.py +18 -5
  21. unstructured_ingest/embed/togetherai.py +19 -8
  22. unstructured_ingest/embed/vertexai.py +13 -6
  23. unstructured_ingest/embed/voyageai.py +19 -6
  24. unstructured_ingest/utils/html.py +143 -93
  25. unstructured_ingest/v2/interfaces/__init__.py +2 -1
  26. unstructured_ingest/v2/interfaces/process.py +3 -0
  27. unstructured_ingest/v2/interfaces/uploader.py +14 -1
  28. unstructured_ingest/v2/pipeline/pipeline.py +20 -6
  29. unstructured_ingest/v2/processes/connectors/assets/__init__.py +0 -0
  30. unstructured_ingest/v2/processes/connectors/confluence.py +15 -22
  31. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +41 -3
  32. unstructured_ingest/v2/processes/embedder.py +3 -0
  33. {unstructured_ingest-0.4.2.dist-info → unstructured_ingest-0.4.3.dist-info}/METADATA +22 -22
  34. {unstructured_ingest-0.4.2.dist-info → unstructured_ingest-0.4.3.dist-info}/RECORD +38 -36
  35. {unstructured_ingest-0.4.2.dist-info → unstructured_ingest-0.4.3.dist-info}/LICENSE.md +0 -0
  36. {unstructured_ingest-0.4.2.dist-info → unstructured_ingest-0.4.3.dist-info}/WHEEL +0 -0
  37. {unstructured_ingest-0.4.2.dist-info → unstructured_ingest-0.4.3.dist-info}/entry_points.txt +0 -0
  38. {unstructured_ingest-0.4.2.dist-info → unstructured_ingest-0.4.3.dist-info}/top_level.txt +0 -0
@@ -3,6 +3,7 @@ from abc import ABC, abstractmethod
3
3
  from contextlib import contextmanager
4
4
  from dataclasses import dataclass, field
5
5
  from datetime import date, datetime
6
+ from pathlib import Path
6
7
  from typing import TYPE_CHECKING, Any, Generator, Optional
7
8
 
8
9
  from dateutil import parser
@@ -15,10 +16,10 @@ from unstructured_ingest.v2.interfaces import (
15
16
  AccessConfig,
16
17
  ConnectionConfig,
17
18
  FileData,
18
- Uploader,
19
19
  UploaderConfig,
20
20
  UploadStager,
21
21
  UploadStagerConfig,
22
+ VectorDBUploader,
22
23
  )
23
24
  from unstructured_ingest.v2.logger import logger
24
25
 
@@ -160,7 +161,9 @@ class WeaviateUploadStager(UploadStager):
160
161
 
161
162
 
162
163
  class WeaviateUploaderConfig(UploaderConfig):
163
- collection: str = Field(description="The name of the collection this object belongs to")
164
+ collection: Optional[str] = Field(
165
+ description="The name of the collection this object belongs to", default=None
166
+ )
164
167
  batch_size: Optional[int] = Field(default=None, description="Number of records per batch")
165
168
  requests_per_minute: Optional[int] = Field(default=None, description="Rate limit for upload")
166
169
  dynamic_batch: bool = Field(default=True, description="Whether to use dynamic batch")
@@ -205,17 +208,50 @@ class WeaviateUploaderConfig(UploaderConfig):
205
208
 
206
209
 
207
210
  @dataclass
208
- class WeaviateUploader(Uploader, ABC):
211
+ class WeaviateUploader(VectorDBUploader, ABC):
209
212
  upload_config: WeaviateUploaderConfig
210
213
  connection_config: WeaviateConnectionConfig
211
214
 
215
+ def _collection_exists(self, collection_name: Optional[str] = None):
216
+ collection_name = collection_name or self.upload_config.collection
217
+ with self.connection_config.get_client() as weaviate_client:
218
+ return weaviate_client.collections.exists(name=collection_name)
219
+
212
220
  def precheck(self) -> None:
213
221
  try:
214
222
  self.connection_config.get_client()
223
+ # only if collection name populated should we check that it exists
224
+ if self.upload_config.collection and not self._collection_exists():
225
+ raise DestinationConnectionError(
226
+ f"collection '{self.upload_config.collection}' does not exist"
227
+ )
215
228
  except Exception as e:
216
229
  logger.error(f"Failed to validate connection {e}", exc_info=True)
217
230
  raise DestinationConnectionError(f"failed to validate connection: {e}")
218
231
 
232
+ def init(self, *kwargs: Any) -> None:
233
+ self.create_destination()
234
+
235
+ def create_destination(
236
+ self, destination_name: str = "elements", vector_length: Optional[int] = None, **kwargs: Any
237
+ ) -> bool:
238
+ collection_name = self.upload_config.collection or destination_name
239
+ self.upload_config.collection = collection_name
240
+ connectors_dir = Path(__file__).parents[1]
241
+ collection_config_file = connectors_dir / "assets" / "weaviate_collection_config.json"
242
+ with collection_config_file.open() as f:
243
+ collection_config = json.load(f)
244
+ collection_config["class"] = collection_name
245
+ if not self._collection_exists():
246
+ logger.info(
247
+ f"creating default weaviate collection '{collection_name}' with default configs"
248
+ )
249
+ with self.connection_config.get_client() as weaviate_client:
250
+ weaviate_client.collections.create_from_dict(config=collection_config)
251
+ return True
252
+ logger.debug(f"collection with name '{collection_name}' already exists, skipping creation")
253
+ return False
254
+
219
255
  def check_for_errors(self, client: "WeaviateClient") -> None:
220
256
  failed_uploads = client.batch.failed_objects
221
257
  if failed_uploads:
@@ -253,6 +289,8 @@ class WeaviateUploader(Uploader, ABC):
253
289
  f"writing {len(data)} objects to destination "
254
290
  f"class {self.connection_config.access_config} "
255
291
  )
292
+ if not self.upload_config.collection:
293
+ raise ValueError("No collection specified")
256
294
 
257
295
  with self.connection_config.get_client() as weaviate_client:
258
296
  self.delete_by_record_id(client=weaviate_client, file_data=file_data)
@@ -184,6 +184,9 @@ class EmbedderConfig(BaseModel):
184
184
  class Embedder(BaseProcess, ABC):
185
185
  config: EmbedderConfig
186
186
 
187
+ def init(self, *kwargs: Any) -> None:
188
+ self.config.get_embedder().initialize()
189
+
187
190
  def run(self, elements_filepath: Path, **kwargs: Any) -> list[dict]:
188
191
  # TODO update base embedder classes to support async
189
192
  embedder = self.config.get_embedder()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: unstructured-ingest
3
- Version: 0.4.2
3
+ Version: 0.4.3
4
4
  Summary: A library that prepares raw documents for downstream ML tasks.
5
5
  Home-page: https://github.com/Unstructured-IO/unstructured-ingest
6
6
  Author: Unstructured Technologies
@@ -22,38 +22,38 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
22
  Requires-Python: >=3.9.0,<3.14
23
23
  Description-Content-Type: text/markdown
24
24
  License-File: LICENSE.md
25
- Requires-Dist: pydantic>=2.7
26
- Requires-Dist: click
27
- Requires-Dist: tqdm
28
25
  Requires-Dist: dataclasses-json
29
26
  Requires-Dist: pandas
30
- Requires-Dist: opentelemetry-sdk
31
27
  Requires-Dist: python-dateutil
28
+ Requires-Dist: opentelemetry-sdk
29
+ Requires-Dist: click
30
+ Requires-Dist: pydantic>=2.7
31
+ Requires-Dist: tqdm
32
32
  Provides-Extra: airtable
33
33
  Requires-Dist: pyairtable; extra == "airtable"
34
34
  Provides-Extra: astradb
35
35
  Requires-Dist: astrapy; extra == "astradb"
36
36
  Provides-Extra: azure
37
- Requires-Dist: adlfs; extra == "azure"
38
37
  Requires-Dist: fsspec; extra == "azure"
38
+ Requires-Dist: adlfs; extra == "azure"
39
39
  Provides-Extra: azure-ai-search
40
40
  Requires-Dist: azure-search-documents; extra == "azure-ai-search"
41
41
  Provides-Extra: bedrock
42
- Requires-Dist: aioboto3; extra == "bedrock"
43
42
  Requires-Dist: boto3; extra == "bedrock"
43
+ Requires-Dist: aioboto3; extra == "bedrock"
44
44
  Provides-Extra: biomed
45
45
  Requires-Dist: bs4; extra == "biomed"
46
46
  Requires-Dist: requests; extra == "biomed"
47
47
  Provides-Extra: box
48
- Requires-Dist: boxfs; extra == "box"
49
48
  Requires-Dist: fsspec; extra == "box"
49
+ Requires-Dist: boxfs; extra == "box"
50
50
  Provides-Extra: chroma
51
51
  Requires-Dist: chromadb; extra == "chroma"
52
52
  Provides-Extra: clarifai
53
53
  Requires-Dist: clarifai; extra == "clarifai"
54
54
  Provides-Extra: confluence
55
- Requires-Dist: atlassian-python-api; extra == "confluence"
56
55
  Requires-Dist: requests; extra == "confluence"
56
+ Requires-Dist: atlassian-python-api; extra == "confluence"
57
57
  Provides-Extra: couchbase
58
58
  Requires-Dist: couchbase; extra == "couchbase"
59
59
  Provides-Extra: csv
@@ -63,8 +63,8 @@ Requires-Dist: databricks-sql-connector; extra == "databricks-delta-tables"
63
63
  Provides-Extra: databricks-volumes
64
64
  Requires-Dist: databricks-sdk; extra == "databricks-volumes"
65
65
  Provides-Extra: delta-table
66
- Requires-Dist: deltalake; extra == "delta-table"
67
66
  Requires-Dist: boto3; extra == "delta-table"
67
+ Requires-Dist: deltalake; extra == "delta-table"
68
68
  Provides-Extra: discord
69
69
  Requires-Dist: discord.py; extra == "discord"
70
70
  Provides-Extra: doc
@@ -72,8 +72,8 @@ Requires-Dist: unstructured[docx]; extra == "doc"
72
72
  Provides-Extra: docx
73
73
  Requires-Dist: unstructured[docx]; extra == "docx"
74
74
  Provides-Extra: dropbox
75
- Requires-Dist: dropboxdrivefs; extra == "dropbox"
76
75
  Requires-Dist: fsspec; extra == "dropbox"
76
+ Requires-Dist: dropboxdrivefs; extra == "dropbox"
77
77
  Provides-Extra: duckdb
78
78
  Requires-Dist: duckdb; extra == "duckdb"
79
79
  Provides-Extra: elasticsearch
@@ -92,12 +92,12 @@ Requires-Dist: voyageai; extra == "embed-voyageai"
92
92
  Provides-Extra: epub
93
93
  Requires-Dist: unstructured[epub]; extra == "epub"
94
94
  Provides-Extra: gcs
95
- Requires-Dist: bs4; extra == "gcs"
96
- Requires-Dist: fsspec; extra == "gcs"
97
95
  Requires-Dist: gcsfs; extra == "gcs"
96
+ Requires-Dist: fsspec; extra == "gcs"
97
+ Requires-Dist: bs4; extra == "gcs"
98
98
  Provides-Extra: github
99
- Requires-Dist: pygithub>1.58.0; extra == "github"
100
99
  Requires-Dist: requests; extra == "github"
100
+ Requires-Dist: pygithub>1.58.0; extra == "github"
101
101
  Provides-Extra: gitlab
102
102
  Requires-Dist: python-gitlab; extra == "gitlab"
103
103
  Provides-Extra: google-drive
@@ -122,9 +122,9 @@ Requires-Dist: pymongo; extra == "mongodb"
122
122
  Provides-Extra: msg
123
123
  Requires-Dist: unstructured[msg]; extra == "msg"
124
124
  Provides-Extra: neo4j
125
+ Requires-Dist: neo4j; extra == "neo4j"
125
126
  Requires-Dist: networkx; extra == "neo4j"
126
127
  Requires-Dist: cymple; extra == "neo4j"
127
- Requires-Dist: neo4j; extra == "neo4j"
128
128
  Provides-Extra: notion
129
129
  Requires-Dist: htmlBuilder; extra == "notion"
130
130
  Requires-Dist: backoff; extra == "notion"
@@ -133,9 +133,9 @@ Requires-Dist: httpx; extra == "notion"
133
133
  Provides-Extra: odt
134
134
  Requires-Dist: unstructured[odt]; extra == "odt"
135
135
  Provides-Extra: onedrive
136
+ Requires-Dist: msal; extra == "onedrive"
136
137
  Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
137
138
  Requires-Dist: bs4; extra == "onedrive"
138
- Requires-Dist: msal; extra == "onedrive"
139
139
  Provides-Extra: openai
140
140
  Requires-Dist: openai; extra == "openai"
141
141
  Requires-Dist: tiktoken; extra == "openai"
@@ -144,8 +144,8 @@ Requires-Dist: opensearch-py; extra == "opensearch"
144
144
  Provides-Extra: org
145
145
  Requires-Dist: unstructured[org]; extra == "org"
146
146
  Provides-Extra: outlook
147
- Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
148
147
  Requires-Dist: msal; extra == "outlook"
148
+ Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
149
149
  Provides-Extra: pdf
150
150
  Requires-Dist: unstructured[pdf]; extra == "pdf"
151
151
  Provides-Extra: pinecone
@@ -174,11 +174,11 @@ Requires-Dist: s3fs; extra == "s3"
174
174
  Provides-Extra: salesforce
175
175
  Requires-Dist: simple-salesforce; extra == "salesforce"
176
176
  Provides-Extra: sftp
177
- Requires-Dist: paramiko; extra == "sftp"
178
177
  Requires-Dist: fsspec; extra == "sftp"
178
+ Requires-Dist: paramiko; extra == "sftp"
179
179
  Provides-Extra: sharepoint
180
- Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
181
180
  Requires-Dist: msal; extra == "sharepoint"
181
+ Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
182
182
  Provides-Extra: singlestore
183
183
  Requires-Dist: singlestoredb; extra == "singlestore"
184
184
  Provides-Extra: slack
@@ -191,13 +191,13 @@ Requires-Dist: together; extra == "togetherai"
191
191
  Provides-Extra: tsv
192
192
  Requires-Dist: unstructured[tsv]; extra == "tsv"
193
193
  Provides-Extra: vastdb
194
+ Requires-Dist: vastdb; extra == "vastdb"
194
195
  Requires-Dist: ibis; extra == "vastdb"
195
196
  Requires-Dist: pyarrow; extra == "vastdb"
196
- Requires-Dist: vastdb; extra == "vastdb"
197
197
  Provides-Extra: vectara
198
- Requires-Dist: aiofiles; extra == "vectara"
199
- Requires-Dist: requests; extra == "vectara"
200
198
  Requires-Dist: httpx; extra == "vectara"
199
+ Requires-Dist: requests; extra == "vectara"
200
+ Requires-Dist: aiofiles; extra == "vectara"
201
201
  Provides-Extra: weaviate
202
202
  Requires-Dist: weaviate-client; extra == "weaviate"
203
203
  Provides-Extra: wikipedia
@@ -51,29 +51,30 @@ test/integration/connectors/utils/validation/utils.py,sha256=xYYvAbqP6_lZyH09_Jj
51
51
  test/integration/connectors/weaviate/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
52
52
  test/integration/connectors/weaviate/conftest.py,sha256=6Q6QdrLJmGHowRFSmoVSzup2EX6qASfS2Z5tqlpTm9M,387
53
53
  test/integration/connectors/weaviate/test_cloud.py,sha256=U1ZS6a7wTPX7h3XGvaJHaT-Uwg4IeGgzxx1YBywgVhM,1284
54
- test/integration/connectors/weaviate/test_local.py,sha256=bSJwS6rWxPf3BoOXKzZi2AOuT51py9V3tao6IBy1Rgk,4538
54
+ test/integration/connectors/weaviate/test_local.py,sha256=gXMpnzVcrNQdptDjx0haPWBU-dm1MQTkalgxocI3-L8,5287
55
55
  test/integration/embedders/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
56
56
  test/integration/embedders/conftest.py,sha256=B2W771RbijR7G_GybsCzRyIvOzXqzbKZdRIlNDd5AGY,334
57
- test/integration/embedders/test_azure_openai.py,sha256=6tFpKFBFRXD49imhhRzsvy3MPtuZ4L1PtnKyMVBRAqc,1808
58
- test/integration/embedders/test_bedrock.py,sha256=WuI2limd0dcT9JTkc44pjjHsdz755bQlZ1by-Xyuy_Y,3565
59
- test/integration/embedders/test_huggingface.py,sha256=0mMTOO-Nh7KB70AGs_7LLQIxMYrnSPqyihriUeqACbM,1007
60
- test/integration/embedders/test_mixedbread.py,sha256=udjfl5fb71R62hC9pYBj7yBrbCzRYNJuIn2ApXY0i18,2006
61
- test/integration/embedders/test_octoai.py,sha256=LT-JpfNDb2d16HWrX0nWHcPOf0_CmY6tsfUMoC8IWpY,2219
62
- test/integration/embedders/test_openai.py,sha256=9GZI0ZzSDHtdTDMY3yB_JOV3eDbrkPhoN1WD1JILBNE,2149
63
- test/integration/embedders/test_togetherai.py,sha256=bsD5Ea8spLzQlTjQKXXTLaFcEnRisQQve8_tDThg6Qo,2213
64
- test/integration/embedders/test_vertexai.py,sha256=AH3spYF9dtIZUf6qN1_r9-rMff8FdNQaQC5bHcx-G3w,1852
65
- test/integration/embedders/test_voyageai.py,sha256=U6RILPJJyRc2X7E9DhkrNghYba-e_lOhcMkL2LejN7Y,1854
66
- test/integration/embedders/utils.py,sha256=2B_JnitpCchZZO-UCi-5jX4JhRAwBgLal8F03SBc0bQ,2932
57
+ test/integration/embedders/test_azure_openai.py,sha256=YQ3uq2-NuxtTyGsSgMNa10pcITLKMJ4E1scTGFgwujw,1790
58
+ test/integration/embedders/test_bedrock.py,sha256=ZehreheLgY9Bqdjk-3MQOaou9IP-H3Pcz7WWiOWAxTU,3557
59
+ test/integration/embedders/test_huggingface.py,sha256=qFblyXounVNRaNkk3gbKoBqU5E2dNecgKU2Bz2LyOa8,989
60
+ test/integration/embedders/test_mixedbread.py,sha256=lLz_cooyC38VSo-FMHbhKpHvYs3QzA20NOIvM5oooaw,1998
61
+ test/integration/embedders/test_octoai.py,sha256=qs-bqZ7iGWO_BzUZvKJmOHBT3cmFSkEYbleWhj3snJc,2197
62
+ test/integration/embedders/test_openai.py,sha256=9XioXuvdnbh_3vRmRwpMsi1D5heCcY7KA4nHb5vOU_M,2127
63
+ test/integration/embedders/test_togetherai.py,sha256=hsg3c3SGJGd93unz4-VLYmFXxLA1vmrD5xK5Gj-g0R4,2205
64
+ test/integration/embedders/test_vertexai.py,sha256=4-E4plJXFf1b02RhOqOCBHR2GA4gTnc8K4AnHm6EgPU,1830
65
+ test/integration/embedders/test_voyageai.py,sha256=Gm3sVjhsym1ASIDfr-sZoCbpsNMaAk_l4E3-dtjRCQ4,1832
66
+ test/integration/embedders/utils.py,sha256=Sqqg-X31ZV1hojqPQBaZgM2lb2u8cG6s6OnH9JRsFjs,2717
67
67
  test/integration/partitioners/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
68
68
  test/integration/partitioners/test_partitioner.py,sha256=MEQJbRoc01uPLT6O8CkXeQF_DXK21nz3KVJkzkBtsgM,2835
69
69
  test/unit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
70
70
  test/unit/test_error.py,sha256=RflmngCdFNKOLXVfLnUdNfY3Mfg3k7DTEzfIl0B-syU,840
71
+ test/unit/test_html.py,sha256=LKGi_QaH4U4gktrbd2NcURL-d-0Rm1UnG5Y6r9EvTG0,4489
71
72
  test/unit/test_logger.py,sha256=0SKndXE_VRd8XmUHkrj7zuBQHZscXx3ZQllMEOvtF9Y,2380
72
73
  test/unit/test_utils.py,sha256=Q6mp9YZPah8z3-2lreyRbmAc7m2Y_w26_N9vocSInoA,5421
73
74
  test/unit/embed/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
74
75
  test/unit/embed/test_mixedbreadai.py,sha256=Z9A9jg5eJRF4OgYTgbIzQUI27J16uv2qj2kp_Rv0r9k,1428
75
76
  test/unit/embed/test_octoai.py,sha256=CWVrieqJh-N40J9n3nzqQPLOH9T1_mldkpZYRiHKxrg,1055
76
- test/unit/embed/test_openai.py,sha256=QGpMQ6mSNOuEcCn8PcEhKEjq1tygTm6K68UDfHHiIu4,833
77
+ test/unit/embed/test_openai.py,sha256=RQ-4QIcRvq0JSBFNit_NRcy61EsOv7xh_TcKJKHwHGM,1186
77
78
  test/unit/embed/test_vertexai.py,sha256=k_dK-yR_yx1RAOpmAgfcPo-osRDJP9aRCMCsJmQPxYI,1050
78
79
  test/unit/embed/test_voyageai.py,sha256=QWoDZEX8cAIkTgn4NtIyGKzOAu-GmudD4VMujnfi1Gg,983
79
80
  test/unit/v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -95,13 +96,13 @@ test/unit/v2/embedders/test_octoai.py,sha256=JMfrFz25QfEh0ieB4bJneZd4XtNcdPOnNsN
95
96
  test/unit/v2/embedders/test_openai.py,sha256=HoEW95289Ijgo3PJ-pEaDOknfdkSjPXTgkXmE6jJomY,1012
96
97
  test/unit/v2/embedders/test_togetherai.py,sha256=s24V_geDNZzblU74sSdC_m4Lqlzjp00RMpy56ptfdx0,1009
97
98
  test/unit/v2/embedders/test_vertexai.py,sha256=_4a0tw_GbyvgYJSrP1yw1KjEQJYGzqR5yNXBCSdK8yQ,1145
98
- test/unit/v2/embedders/test_voyageai.py,sha256=De_25F0EhxTNLmAE_c-EK2pFO5p54ad1TVVF055y6p0,1186
99
+ test/unit/v2/embedders/test_voyageai.py,sha256=VaWthF64pmxc-fOBbAQsEzMw7tV4t4Nz_H_Cc5tuAYQ,1193
99
100
  test/unit/v2/partitioners/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
100
101
  test/unit/v2/partitioners/test_partitioner.py,sha256=iIYg7IpftV3LusoO4H8tr1IHY1U-dS0ga6h04h7WSfg,2281
101
102
  test/unit/v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
102
103
  test/unit/v2/utils/data_generator.py,sha256=UoYVNjG4S4wlaA9gceQ82HIpF9_6I1UTHD1_GrQBHp0,973
103
104
  unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
104
- unstructured_ingest/__version__.py,sha256=Y85nIpRVpjjjl2MW3ZwhLs55JjhABkZJeXfKDAbsRxM,42
105
+ unstructured_ingest/__version__.py,sha256=C0tWanpqRzvQsOclLMfAsEjPaa-5I3hXoMIvdtnb1w4,42
105
106
  unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
106
107
  unstructured_ingest/interfaces.py,sha256=OYVUP0bzBJpT-Lz92BDyz_hLBvyfxkuSwWHhUdnUayA,31493
107
108
  unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
@@ -271,14 +272,14 @@ unstructured_ingest/connector/notion/types/database_properties/verification.py,s
271
272
  unstructured_ingest/embed/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
272
273
  unstructured_ingest/embed/azure_openai.py,sha256=4YBOIxv66wVZ5EqNNC4uCDPNJ3VrsLPe5wwagT6zqe0,1001
273
274
  unstructured_ingest/embed/bedrock.py,sha256=50G8PBEdW3ILwyWXAWl4w-gUA9I0AR7LuFq6NLz-sWI,7284
274
- unstructured_ingest/embed/huggingface.py,sha256=2cBiQhOhfWHX3hS-eKjocysOkUaRlyRfUj9Kxjrp6cE,1934
275
- unstructured_ingest/embed/interfaces.py,sha256=eHS7vyWPhIsUV_BvQ2NtfknoQvd2UzusdRHbVFhJEqw,3654
276
- unstructured_ingest/embed/mixedbreadai.py,sha256=qTPMa9FUyk1u7qCGoz1OUcdxaeA8Ck4Kc7JAusqVw5s,6922
277
- unstructured_ingest/embed/octoai.py,sha256=yjy9nl2F_3Qosa69vCmaFym7l1oSo-zuJ06HqmSbJIE,4676
278
- unstructured_ingest/embed/openai.py,sha256=ExPR9Gv5YM_9fF4YCN5NyrsaiVj68-037rKLbXco3NE,4076
279
- unstructured_ingest/embed/togetherai.py,sha256=nLeDdvzMvsL1EjdsQR37xW-3xorj5uPF7DPKzO-eHR0,3782
280
- unstructured_ingest/embed/vertexai.py,sha256=sV4DUr4YrqAXvuPw5lbQuAY8YWYM5FjKjYPMd_D_x0g,4496
281
- unstructured_ingest/embed/voyageai.py,sha256=70p3rBSd5gq9QukfF7dZHg5Fy0fJkh2Opw-NNjnvmI8,4594
275
+ unstructured_ingest/embed/huggingface.py,sha256=Avcc16st9Cp2xGScG6TeNEEd3T8YjjnESNN4OdIlnh0,2119
276
+ unstructured_ingest/embed/interfaces.py,sha256=7jsQ3rLOXy1hq__muf-EPcLnv17XzNQaD05AyGbZeNo,3739
277
+ unstructured_ingest/embed/mixedbreadai.py,sha256=OhF5cMxWMq8-0mt8_-Xe3ZkjGjf2u6QYzfzgHnOEYtU,6838
278
+ unstructured_ingest/embed/octoai.py,sha256=oLNlM02W1CNUYRG_j6qWyI7yE24vYGKYradNzeeP6mE,5062
279
+ unstructured_ingest/embed/openai.py,sha256=H1sURGuRvXBUSXJcAVzrLObV5wSCVM29tkaXJ-9ZR30,4727
280
+ unstructured_ingest/embed/togetherai.py,sha256=SUd16JEUPlR8aCrd4q_T3CHwMTRUi-1yenq_r1AWlak,4266
281
+ unstructured_ingest/embed/vertexai.py,sha256=CPptS7U5W1CgvxIN8CgVz5J1Ia4FctV6BsmpN9c92A0,4890
282
+ unstructured_ingest/embed/voyageai.py,sha256=lydMASUDcTuyfWBPS3uIqDJPQbjf95bEI5Kr4tytONs,5111
282
283
  unstructured_ingest/enhanced_dataclass/__init__.py,sha256=gDZOUsv5eo-8jm4Yu7DdDwi101aGbfG7JctTdOYnTOM,151
283
284
  unstructured_ingest/enhanced_dataclass/core.py,sha256=d6aUkDynuKX87cHx9_N5UDUWrvISR4jYRFRTvd_avlI,3038
284
285
  unstructured_ingest/enhanced_dataclass/dataclasses.py,sha256=aZMsoCzAGRb8Rmh3BTSBFtNr6FmFTY93KYGLk3gYJKQ,1949
@@ -363,7 +364,7 @@ unstructured_ingest/utils/compression.py,sha256=NNiY-2S2Gf3at7zC1PYxMijaEza9vVSz
363
364
  unstructured_ingest/utils/data_prep.py,sha256=X3d8Kos1zqX-HQAicF_8TB0BrstRtHrbMzu_1s7mj7M,7191
364
365
  unstructured_ingest/utils/dep_check.py,sha256=SXXcUna2H0RtxA6j1S2NGkvQa9JP2DujWhmyBa7776Y,2400
365
366
  unstructured_ingest/utils/google_filetype.py,sha256=YVspEkiiBrRUSGVeVbsavvLvTmizdy2e6TsjigXTSRU,468
366
- unstructured_ingest/utils/html.py,sha256=gORKKCkva71JBbOilYtAn_MLLCqV8VKmSjSbpwEOlno,4257
367
+ unstructured_ingest/utils/html.py,sha256=DGRDMqGbwH8RiF94Qh6NiqVkbbjZfe1h26dIehC-X7M,6340
367
368
  unstructured_ingest/utils/ndjson.py,sha256=nz8VUOPEgAFdhaDOpuveknvCU4x82fVwqE01qAbElH0,1201
368
369
  unstructured_ingest/utils/string_and_date_utils.py,sha256=kijtPlGAbH376vVjFSo5H_ZhW-FEcMC2sCNsSNwDOjo,1729
369
370
  unstructured_ingest/utils/table.py,sha256=aWjcowDVSClNpEAdR6PY3H7khKu4T6T3QqQE6GjmQ_M,3469
@@ -386,19 +387,19 @@ unstructured_ingest/v2/cli/base/src.py,sha256=cpQ43qQju4e5s_YSaPxUtA70BaisRkTBdj
386
387
  unstructured_ingest/v2/cli/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
387
388
  unstructured_ingest/v2/cli/utils/click.py,sha256=1_eJgrwS2DFBl1jZPLsj1vgVgR7agFBIEBe4A_n7mH4,7827
388
389
  unstructured_ingest/v2/cli/utils/model_conversion.py,sha256=7eEIkk1KU51-ZNiIfI1KRxlwITNW1xl1YxMAG8BcTk0,7604
389
- unstructured_ingest/v2/interfaces/__init__.py,sha256=9VO09XuTvyOcFF8ZDKN169fNb_uA5TAYzPsiPHOyxhQ,963
390
+ unstructured_ingest/v2/interfaces/__init__.py,sha256=Xp7-345QpM6MG7V7G4ZrVERjADAUBiPAY88PKaMRyqY,1005
390
391
  unstructured_ingest/v2/interfaces/connector.py,sha256=qUFFJ3qgDMenTCZMtVRjq1DIwsVak6pxNjQOH2eVkMw,1623
391
392
  unstructured_ingest/v2/interfaces/downloader.py,sha256=Lj3nTY1hPA71GfNeedFVCdHdZsHLle8qrx5RtXAy9GY,2940
392
393
  unstructured_ingest/v2/interfaces/file_data.py,sha256=7MyRlj5dijQsCR6W18wQ8fEgJigGKwoOYc10g9A6PSo,3834
393
394
  unstructured_ingest/v2/interfaces/indexer.py,sha256=gsa1MLhFa82BzD2h4Yb7ons0VxRwKINZOrzvHAahwVU,846
394
- unstructured_ingest/v2/interfaces/process.py,sha256=BgglTu5K93FnDDopZKKr_rkK2LTZOguR6kcQjKHjF40,392
395
+ unstructured_ingest/v2/interfaces/process.py,sha256=6Ll0O9ATcdm36dx2_TOg9PfCEJrADgyd8OQK3TTNzZM,448
395
396
  unstructured_ingest/v2/interfaces/processor.py,sha256=VX7JqXlbG1plxMK8THWhWINPbTICaaUEk4XUXhnOixY,3303
396
397
  unstructured_ingest/v2/interfaces/upload_stager.py,sha256=9EV9863ODDv0Y5liDT3xh2yiVuFiaVVyCcnwCy6nfkM,3172
397
- unstructured_ingest/v2/interfaces/uploader.py,sha256=T2oHbN-d4Px1w1oATKKYZA10aUssqytEpiaqBM92r0Q,1600
398
+ unstructured_ingest/v2/interfaces/uploader.py,sha256=rrZLTjmTcrDL-amQIKzIP6j2fW-iZPCVsUaL0rljcME,2090
398
399
  unstructured_ingest/v2/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
399
400
  unstructured_ingest/v2/pipeline/interfaces.py,sha256=-Y6gPnl-SbNxIx5-dQCmiYSPKUMjivrRlBLIKIUWVeM,8658
400
401
  unstructured_ingest/v2/pipeline/otel.py,sha256=K3pQvWVgWzyOWMKCBUofsH7wTZPJ0Ysw5sLjMBLW41I,1088
401
- unstructured_ingest/v2/pipeline/pipeline.py,sha256=7Yg8_xwlSX6lA-oPGlTcn6KXZ9kc51zsoJxME5TiUlw,15956
402
+ unstructured_ingest/v2/pipeline/pipeline.py,sha256=y6AkUBUL2r3t4OO0jWKomtN3v8U7EDtMPrJ8VYRo7VM,16344
402
403
  unstructured_ingest/v2/pipeline/steps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
403
404
  unstructured_ingest/v2/pipeline/steps/chunk.py,sha256=LK2ldM24TE4ukX_Z6Z81LpF53orMaRkddM3uhLtT5EQ,3221
404
405
  unstructured_ingest/v2/pipeline/steps/download.py,sha256=nZ4B0d9p-6TgWqrBoKUQPlr8m6dz1RGNr_3OjUhRpWg,8259
@@ -412,7 +413,7 @@ unstructured_ingest/v2/pipeline/steps/upload.py,sha256=We4OAtStuZwWKKBCOPhfeAz_v
412
413
  unstructured_ingest/v2/processes/__init__.py,sha256=FaHWSCGyc7GWVnAsNEUUj7L8hT8gCVY3_hUE2VzWtUg,462
413
414
  unstructured_ingest/v2/processes/chunker.py,sha256=31-7ojsM2coIt2rMR0KOb82IxLVJfNHbqYUOsDkhxN8,5491
414
415
  unstructured_ingest/v2/processes/connector_registry.py,sha256=vkEe6jpgdYtZCxMj59s5atWGgmPuxAEXRUoTt-MJ7wc,2198
415
- unstructured_ingest/v2/processes/embedder.py,sha256=xCBpaL07WnVUOUW8SHktaf1vwBGZxl3Nf8-99509ClQ,7721
416
+ unstructured_ingest/v2/processes/embedder.py,sha256=uiuCOSwwasHp4eqtewMvgnM86WVch7HDFiWqpGLahvo,7812
416
417
  unstructured_ingest/v2/processes/filter.py,sha256=kjUmMw2SDq2bme0JCAOxs6cJriIG6Ty09KOznS-xz08,2145
417
418
  unstructured_ingest/v2/processes/partitioner.py,sha256=agpHwB9FR8OZVQqE7zFEb0IcDPCOPA_BZjLzLF71nOY,8194
418
419
  unstructured_ingest/v2/processes/uncompress.py,sha256=Z_XfsITGdyaRwhtNUc7bMj5Y2jLuBge8KoK4nxhqKag,2425
@@ -421,7 +422,7 @@ unstructured_ingest/v2/processes/connectors/airtable.py,sha256=eeZJe-bBNxt5Sa-XE
421
422
  unstructured_ingest/v2/processes/connectors/astradb.py,sha256=xhUMoUdnrfAY1isZGqsV4lZUsnZNpbvgLyQWQbR4hVo,14814
422
423
  unstructured_ingest/v2/processes/connectors/azure_ai_search.py,sha256=ngPDpU0oZ6m5sxIlB6u5ebQpqCS_SJ-_amCC1KQ03EQ,11529
423
424
  unstructured_ingest/v2/processes/connectors/chroma.py,sha256=VHCnM56qNXuHzovJihrNfJnZbWLJShOe8j12PJFrbL0,7219
424
- unstructured_ingest/v2/processes/connectors/confluence.py,sha256=OdoMK5ZD2HOncquj9c_Xct7bFa6kSGW3qZwfiN1LqtQ,11399
425
+ unstructured_ingest/v2/processes/connectors/confluence.py,sha256=_zkiST0FTggEKNORalCcZZIRGZKnCM0LLcavgQZfDVE,11112
425
426
  unstructured_ingest/v2/processes/connectors/couchbase.py,sha256=i7vuNKsUkN93JRVmg4--MO0ZgbjvhIqt46oYqk9zFSQ,12250
426
427
  unstructured_ingest/v2/processes/connectors/delta_table.py,sha256=SotSXZQ85_6TO906YvFi3yTml8jE9A_zV6nBJ4oTx8A,7075
427
428
  unstructured_ingest/v2/processes/connectors/discord.py,sha256=-e4-cBK4TnHkknK1qIb86AIVMy81lBgC288_iLpTzM8,5246
@@ -441,6 +442,7 @@ unstructured_ingest/v2/processes/connectors/sharepoint.py,sha256=Ndn2Wm7RupfjAtl
441
442
  unstructured_ingest/v2/processes/connectors/slack.py,sha256=Z73VmQ3oUY09KoLEi5OBdQeDt4ONEY_02SglWQc6HXE,9252
442
443
  unstructured_ingest/v2/processes/connectors/utils.py,sha256=8kd0g7lo9NqnpaIkjeO-Ut6erhwUNH_gS9koevpe3WE,878
443
444
  unstructured_ingest/v2/processes/connectors/vectara.py,sha256=BlI_4nkpNR99aYxDd9eusm5LQsVB9EI0r-5Kc1D7pgQ,12255
445
+ unstructured_ingest/v2/processes/connectors/assets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
444
446
  unstructured_ingest/v2/processes/connectors/databricks/__init__.py,sha256=Oh8SwTWi66gO8BsNF6vRMoQVuegyBPPCpVozkOHEf3A,2136
445
447
  unstructured_ingest/v2/processes/connectors/databricks/volumes.py,sha256=Yj4fIxgGo9Qh1x_6-INdbrPGHCuZElu-QKNfSVtW7F4,7694
446
448
  unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py,sha256=TA2e_1SIr4VaEI62873eyReCNfgmQ51_2Pko2I04pPM,2747
@@ -558,10 +560,10 @@ unstructured_ingest/v2/processes/connectors/weaviate/__init__.py,sha256=NMiwnVWa
558
560
  unstructured_ingest/v2/processes/connectors/weaviate/cloud.py,sha256=bXtfEYLquR-BszZ5S_lQ4JbETNs9Vozgpfm8x9egAmE,6251
559
561
  unstructured_ingest/v2/processes/connectors/weaviate/embedded.py,sha256=S8Zg8StuZT-k7tCg1D5YShO1-vJYYk9-M1bE1fIqx64,3014
560
562
  unstructured_ingest/v2/processes/connectors/weaviate/local.py,sha256=LuTBKPseVewsz8VqxRPRLfGEm3BeI9nBZxpy7ZU5tOA,2201
561
- unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py,sha256=X1yv1H_orDQ-J965EMXhR2XaURqe8vovSi9n1fk85B4,10499
562
- unstructured_ingest-0.4.2.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
563
- unstructured_ingest-0.4.2.dist-info/METADATA,sha256=-3ILUK1wZ1fDgJcT22FO9ZhM_NKKHNBCLvgWBgzvVOY,8051
564
- unstructured_ingest-0.4.2.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
565
- unstructured_ingest-0.4.2.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
566
- unstructured_ingest-0.4.2.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
567
- unstructured_ingest-0.4.2.dist-info/RECORD,,
563
+ unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py,sha256=yJza_jBSEFnzZRq5L6vJ0Mm3uS1uxkOiKIimPpUyQds,12418
564
+ unstructured_ingest-0.4.3.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
565
+ unstructured_ingest-0.4.3.dist-info/METADATA,sha256=UXXbx1Vr9zdcvAfOdgabURlB8nR2I8Lo_aDTN1PNjwU,8051
566
+ unstructured_ingest-0.4.3.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
567
+ unstructured_ingest-0.4.3.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
568
+ unstructured_ingest-0.4.3.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
569
+ unstructured_ingest-0.4.3.dist-info/RECORD,,