unstructured-ingest 0.4.3__py3-none-any.whl → 0.4.5__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

@@ -1 +1 @@
1
- __version__ = "0.4.3" # pragma: no cover
1
+ __version__ = "0.4.5" # pragma: no cover
@@ -1,4 +1,4 @@
1
- from abc import ABC, abstractmethod
1
+ from abc import ABC
2
2
  from typing import Any, AsyncGenerator, Generator, Optional, TypeVar
3
3
 
4
4
  from pydantic import BaseModel
@@ -22,9 +22,8 @@ class Indexer(BaseProcess, BaseConnector, ABC):
22
22
  def is_async(self) -> bool:
23
23
  return False
24
24
 
25
- @abstractmethod
26
25
  def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
27
- pass
26
+ raise NotImplementedError()
28
27
 
29
28
  async def run_async(self, **kwargs: Any) -> AsyncGenerator[FileData, None]:
30
29
  raise NotImplementedError()
@@ -203,7 +203,14 @@ class Pipeline:
203
203
 
204
204
  def get_indices(self) -> list[dict]:
205
205
  if self.indexer_step.process.is_async():
206
- indices = asyncio.run(self.indexer_step.run_async())
206
+
207
+ async def run_async():
208
+ output = []
209
+ async for i in self.indexer_step.run_async():
210
+ output.append(i)
211
+ return output
212
+
213
+ indices = asyncio.run(run_async())
207
214
  else:
208
215
  indices = self.indexer_step.run()
209
216
  indices_inputs = [{"file_data_path": i} for i in indices]
@@ -5,7 +5,7 @@ import json
5
5
  from dataclasses import dataclass
6
6
  from pathlib import Path
7
7
  from time import time
8
- from typing import TYPE_CHECKING, Any, AsyncIterator, Generator, Iterator, Optional, TypeVar
8
+ from typing import TYPE_CHECKING, Any, AsyncIterator, Optional
9
9
 
10
10
  from dateutil import parser
11
11
  from pydantic import Field, Secret
@@ -101,27 +101,6 @@ class OnedriveIndexerConfig(IndexerConfig):
101
101
  recursive: bool = False
102
102
 
103
103
 
104
- T = TypeVar("T")
105
-
106
-
107
- def async_iterable_to_sync_iterable(iterator: AsyncIterator[T]) -> Iterator[T]:
108
- # This version works on Python 3.9 by manually handling the async iteration.
109
- loop = asyncio.new_event_loop()
110
- asyncio.set_event_loop(loop)
111
- try:
112
- while True:
113
- try:
114
- # Instead of anext(iterator), we directly call __anext__().
115
- # __anext__ returns a coroutine that we must run until complete.
116
- future = iterator.__anext__()
117
- result = loop.run_until_complete(future)
118
- yield result
119
- except StopAsyncIteration:
120
- break
121
- finally:
122
- loop.close()
123
-
124
-
125
104
  @dataclass
126
105
  class OnedriveIndexer(Indexer):
127
106
  connection_config: OnedriveConnectionConfig
@@ -215,7 +194,10 @@ class OnedriveIndexer(Indexer):
215
194
  # Offload the file data creation if it's not guaranteed async
216
195
  return await asyncio.to_thread(self.drive_item_to_file_data_sync, drive_item)
217
196
 
218
- async def _run_async(self, **kwargs: Any) -> AsyncIterator[FileData]:
197
+ def is_async(self) -> bool:
198
+ return True
199
+
200
+ async def run_async(self, **kwargs: Any) -> AsyncIterator[FileData]:
219
201
  token_resp = await asyncio.to_thread(self.connection_config.get_token)
220
202
  if "error" in token_resp:
221
203
  raise SourceConnectionError(
@@ -230,12 +212,6 @@ class OnedriveIndexer(Indexer):
230
212
  file_data = await self.drive_item_to_file_data(drive_item=drive_item)
231
213
  yield file_data
232
214
 
233
- def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
234
- # Convert the async generator to a sync generator without loading all data into memory
235
- async_gen = self._run_async(**kwargs)
236
- for item in async_iterable_to_sync_iterable(async_gen):
237
- yield item
238
-
239
215
 
240
216
  class OnedriveDownloaderConfig(DownloaderConfig):
241
217
  pass
@@ -247,7 +223,7 @@ class OnedriveDownloader(Downloader):
247
223
  download_config: OnedriveDownloaderConfig
248
224
 
249
225
  @SourceConnectionNetworkError.wrap
250
- def _fetch_file(self, file_data: FileData):
226
+ def _fetch_file(self, file_data: FileData) -> DriveItem:
251
227
  if file_data.source_identifiers is None or not file_data.source_identifiers.fullpath:
252
228
  raise ValueError(
253
229
  f"file data doesn't have enough information to get "
@@ -281,7 +257,7 @@ class OnedriveDownloader(Downloader):
281
257
  file.download_session(f, chunk_size=1024 * 1024 * 100).execute_query()
282
258
  else:
283
259
  with download_path.open(mode="wb") as f:
284
- file.download(f).execute_query()
260
+ file.download_session(f).execute_query()
285
261
  return self.generate_download_response(file_data=file_data, download_path=download_path)
286
262
  except Exception as e:
287
263
  logger.error(f"[{CONNECTOR_TYPE}] Exception during downloading: {e}", exc_info=True)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: unstructured-ingest
3
- Version: 0.4.3
3
+ Version: 0.4.5
4
4
  Summary: A library that prepares raw documents for downstream ML tasks.
5
5
  Home-page: https://github.com/Unstructured-IO/unstructured-ingest
6
6
  Author: Unstructured Technologies
@@ -22,25 +22,25 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
22
  Requires-Python: >=3.9.0,<3.14
23
23
  Description-Content-Type: text/markdown
24
24
  License-File: LICENSE.md
25
- Requires-Dist: dataclasses-json
25
+ Requires-Dist: pydantic>=2.7
26
26
  Requires-Dist: pandas
27
+ Requires-Dist: tqdm
27
28
  Requires-Dist: python-dateutil
28
- Requires-Dist: opentelemetry-sdk
29
29
  Requires-Dist: click
30
- Requires-Dist: pydantic>=2.7
31
- Requires-Dist: tqdm
30
+ Requires-Dist: dataclasses-json
31
+ Requires-Dist: opentelemetry-sdk
32
32
  Provides-Extra: airtable
33
33
  Requires-Dist: pyairtable; extra == "airtable"
34
34
  Provides-Extra: astradb
35
35
  Requires-Dist: astrapy; extra == "astradb"
36
36
  Provides-Extra: azure
37
- Requires-Dist: fsspec; extra == "azure"
38
37
  Requires-Dist: adlfs; extra == "azure"
38
+ Requires-Dist: fsspec; extra == "azure"
39
39
  Provides-Extra: azure-ai-search
40
40
  Requires-Dist: azure-search-documents; extra == "azure-ai-search"
41
41
  Provides-Extra: bedrock
42
- Requires-Dist: boto3; extra == "bedrock"
43
42
  Requires-Dist: aioboto3; extra == "bedrock"
43
+ Requires-Dist: boto3; extra == "bedrock"
44
44
  Provides-Extra: biomed
45
45
  Requires-Dist: bs4; extra == "biomed"
46
46
  Requires-Dist: requests; extra == "biomed"
@@ -52,8 +52,8 @@ Requires-Dist: chromadb; extra == "chroma"
52
52
  Provides-Extra: clarifai
53
53
  Requires-Dist: clarifai; extra == "clarifai"
54
54
  Provides-Extra: confluence
55
- Requires-Dist: requests; extra == "confluence"
56
55
  Requires-Dist: atlassian-python-api; extra == "confluence"
56
+ Requires-Dist: requests; extra == "confluence"
57
57
  Provides-Extra: couchbase
58
58
  Requires-Dist: couchbase; extra == "couchbase"
59
59
  Provides-Extra: csv
@@ -83,8 +83,8 @@ Requires-Dist: sentence-transformers; extra == "embed-huggingface"
83
83
  Provides-Extra: embed-mixedbreadai
84
84
  Requires-Dist: mixedbread-ai; extra == "embed-mixedbreadai"
85
85
  Provides-Extra: embed-octoai
86
- Requires-Dist: openai; extra == "embed-octoai"
87
86
  Requires-Dist: tiktoken; extra == "embed-octoai"
87
+ Requires-Dist: openai; extra == "embed-octoai"
88
88
  Provides-Extra: embed-vertexai
89
89
  Requires-Dist: vertexai; extra == "embed-vertexai"
90
90
  Provides-Extra: embed-voyageai
@@ -92,19 +92,19 @@ Requires-Dist: voyageai; extra == "embed-voyageai"
92
92
  Provides-Extra: epub
93
93
  Requires-Dist: unstructured[epub]; extra == "epub"
94
94
  Provides-Extra: gcs
95
- Requires-Dist: gcsfs; extra == "gcs"
96
- Requires-Dist: fsspec; extra == "gcs"
97
95
  Requires-Dist: bs4; extra == "gcs"
96
+ Requires-Dist: fsspec; extra == "gcs"
97
+ Requires-Dist: gcsfs; extra == "gcs"
98
98
  Provides-Extra: github
99
- Requires-Dist: requests; extra == "github"
100
99
  Requires-Dist: pygithub>1.58.0; extra == "github"
100
+ Requires-Dist: requests; extra == "github"
101
101
  Provides-Extra: gitlab
102
102
  Requires-Dist: python-gitlab; extra == "gitlab"
103
103
  Provides-Extra: google-drive
104
104
  Requires-Dist: google-api-python-client; extra == "google-drive"
105
105
  Provides-Extra: hubspot
106
- Requires-Dist: urllib3; extra == "hubspot"
107
106
  Requires-Dist: hubspot-api-client; extra == "hubspot"
107
+ Requires-Dist: urllib3; extra == "hubspot"
108
108
  Provides-Extra: jira
109
109
  Requires-Dist: atlassian-python-api; extra == "jira"
110
110
  Provides-Extra: kafka
@@ -122,30 +122,30 @@ Requires-Dist: pymongo; extra == "mongodb"
122
122
  Provides-Extra: msg
123
123
  Requires-Dist: unstructured[msg]; extra == "msg"
124
124
  Provides-Extra: neo4j
125
+ Requires-Dist: cymple; extra == "neo4j"
125
126
  Requires-Dist: neo4j; extra == "neo4j"
126
127
  Requires-Dist: networkx; extra == "neo4j"
127
- Requires-Dist: cymple; extra == "neo4j"
128
128
  Provides-Extra: notion
129
- Requires-Dist: htmlBuilder; extra == "notion"
130
129
  Requires-Dist: backoff; extra == "notion"
131
130
  Requires-Dist: notion-client; extra == "notion"
132
131
  Requires-Dist: httpx; extra == "notion"
132
+ Requires-Dist: htmlBuilder; extra == "notion"
133
133
  Provides-Extra: odt
134
134
  Requires-Dist: unstructured[odt]; extra == "odt"
135
135
  Provides-Extra: onedrive
136
- Requires-Dist: msal; extra == "onedrive"
137
- Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
138
136
  Requires-Dist: bs4; extra == "onedrive"
137
+ Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
138
+ Requires-Dist: msal; extra == "onedrive"
139
139
  Provides-Extra: openai
140
- Requires-Dist: openai; extra == "openai"
141
140
  Requires-Dist: tiktoken; extra == "openai"
141
+ Requires-Dist: openai; extra == "openai"
142
142
  Provides-Extra: opensearch
143
143
  Requires-Dist: opensearch-py; extra == "opensearch"
144
144
  Provides-Extra: org
145
145
  Requires-Dist: unstructured[org]; extra == "org"
146
146
  Provides-Extra: outlook
147
- Requires-Dist: msal; extra == "outlook"
148
147
  Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
148
+ Requires-Dist: msal; extra == "outlook"
149
149
  Provides-Extra: pdf
150
150
  Requires-Dist: unstructured[pdf]; extra == "pdf"
151
151
  Provides-Extra: pinecone
@@ -169,16 +169,16 @@ Requires-Dist: unstructured[rst]; extra == "rst"
169
169
  Provides-Extra: rtf
170
170
  Requires-Dist: unstructured[rtf]; extra == "rtf"
171
171
  Provides-Extra: s3
172
- Requires-Dist: fsspec; extra == "s3"
173
172
  Requires-Dist: s3fs; extra == "s3"
173
+ Requires-Dist: fsspec; extra == "s3"
174
174
  Provides-Extra: salesforce
175
175
  Requires-Dist: simple-salesforce; extra == "salesforce"
176
176
  Provides-Extra: sftp
177
177
  Requires-Dist: fsspec; extra == "sftp"
178
178
  Requires-Dist: paramiko; extra == "sftp"
179
179
  Provides-Extra: sharepoint
180
- Requires-Dist: msal; extra == "sharepoint"
181
180
  Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
181
+ Requires-Dist: msal; extra == "sharepoint"
182
182
  Provides-Extra: singlestore
183
183
  Requires-Dist: singlestoredb; extra == "singlestore"
184
184
  Provides-Extra: slack
@@ -191,13 +191,13 @@ Requires-Dist: together; extra == "togetherai"
191
191
  Provides-Extra: tsv
192
192
  Requires-Dist: unstructured[tsv]; extra == "tsv"
193
193
  Provides-Extra: vastdb
194
- Requires-Dist: vastdb; extra == "vastdb"
195
194
  Requires-Dist: ibis; extra == "vastdb"
195
+ Requires-Dist: vastdb; extra == "vastdb"
196
196
  Requires-Dist: pyarrow; extra == "vastdb"
197
197
  Provides-Extra: vectara
198
+ Requires-Dist: aiofiles; extra == "vectara"
198
199
  Requires-Dist: httpx; extra == "vectara"
199
200
  Requires-Dist: requests; extra == "vectara"
200
- Requires-Dist: aiofiles; extra == "vectara"
201
201
  Provides-Extra: weaviate
202
202
  Requires-Dist: weaviate-client; extra == "weaviate"
203
203
  Provides-Extra: wikipedia
@@ -102,7 +102,7 @@ test/unit/v2/partitioners/test_partitioner.py,sha256=iIYg7IpftV3LusoO4H8tr1IHY1U
102
102
  test/unit/v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
103
103
  test/unit/v2/utils/data_generator.py,sha256=UoYVNjG4S4wlaA9gceQ82HIpF9_6I1UTHD1_GrQBHp0,973
104
104
  unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
105
- unstructured_ingest/__version__.py,sha256=C0tWanpqRzvQsOclLMfAsEjPaa-5I3hXoMIvdtnb1w4,42
105
+ unstructured_ingest/__version__.py,sha256=LZI8wLYHcTzImgX-mBT2GEDfuLfBbZawJ40Z_jZShYc,42
106
106
  unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
107
107
  unstructured_ingest/interfaces.py,sha256=OYVUP0bzBJpT-Lz92BDyz_hLBvyfxkuSwWHhUdnUayA,31493
108
108
  unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
@@ -391,7 +391,7 @@ unstructured_ingest/v2/interfaces/__init__.py,sha256=Xp7-345QpM6MG7V7G4ZrVERjADA
391
391
  unstructured_ingest/v2/interfaces/connector.py,sha256=qUFFJ3qgDMenTCZMtVRjq1DIwsVak6pxNjQOH2eVkMw,1623
392
392
  unstructured_ingest/v2/interfaces/downloader.py,sha256=Lj3nTY1hPA71GfNeedFVCdHdZsHLle8qrx5RtXAy9GY,2940
393
393
  unstructured_ingest/v2/interfaces/file_data.py,sha256=7MyRlj5dijQsCR6W18wQ8fEgJigGKwoOYc10g9A6PSo,3834
394
- unstructured_ingest/v2/interfaces/indexer.py,sha256=gsa1MLhFa82BzD2h4Yb7ons0VxRwKINZOrzvHAahwVU,846
394
+ unstructured_ingest/v2/interfaces/indexer.py,sha256=i0oftyifXefxfKa4a3sCfSwkzWGSPE6EvC9sg6fwZgk,833
395
395
  unstructured_ingest/v2/interfaces/process.py,sha256=6Ll0O9ATcdm36dx2_TOg9PfCEJrADgyd8OQK3TTNzZM,448
396
396
  unstructured_ingest/v2/interfaces/processor.py,sha256=VX7JqXlbG1plxMK8THWhWINPbTICaaUEk4XUXhnOixY,3303
397
397
  unstructured_ingest/v2/interfaces/upload_stager.py,sha256=9EV9863ODDv0Y5liDT3xh2yiVuFiaVVyCcnwCy6nfkM,3172
@@ -399,7 +399,7 @@ unstructured_ingest/v2/interfaces/uploader.py,sha256=rrZLTjmTcrDL-amQIKzIP6j2fW-
399
399
  unstructured_ingest/v2/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
400
400
  unstructured_ingest/v2/pipeline/interfaces.py,sha256=-Y6gPnl-SbNxIx5-dQCmiYSPKUMjivrRlBLIKIUWVeM,8658
401
401
  unstructured_ingest/v2/pipeline/otel.py,sha256=K3pQvWVgWzyOWMKCBUofsH7wTZPJ0Ysw5sLjMBLW41I,1088
402
- unstructured_ingest/v2/pipeline/pipeline.py,sha256=y6AkUBUL2r3t4OO0jWKomtN3v8U7EDtMPrJ8VYRo7VM,16344
402
+ unstructured_ingest/v2/pipeline/pipeline.py,sha256=-1TlqG33x_GGjGMk4Y8Psx1z6Prbuj11MMAR2WAuhBc,16520
403
403
  unstructured_ingest/v2/pipeline/steps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
404
404
  unstructured_ingest/v2/pipeline/steps/chunk.py,sha256=LK2ldM24TE4ukX_Z6Z81LpF53orMaRkddM3uhLtT5EQ,3221
405
405
  unstructured_ingest/v2/pipeline/steps/download.py,sha256=nZ4B0d9p-6TgWqrBoKUQPlr8m6dz1RGNr_3OjUhRpWg,8259
@@ -433,7 +433,7 @@ unstructured_ingest/v2/processes/connectors/local.py,sha256=ZvWTj6ZYkwnvQMNFsZWo
433
433
  unstructured_ingest/v2/processes/connectors/milvus.py,sha256=wmcu9NVy3gYlQGT25inN5w_QrhFoL8-hRq0pJFSNw8g,8866
434
434
  unstructured_ingest/v2/processes/connectors/mongodb.py,sha256=cL0QUQZF_s2brh3nNNeAywXVpaIiND4b5JTAFlYjLjw,14273
435
435
  unstructured_ingest/v2/processes/connectors/neo4j.py,sha256=HU1IwchTM7Q1kkeIFVe-Lg6gInMItBpgkDkVwuTvkGY,14259
436
- unstructured_ingest/v2/processes/connectors/onedrive.py,sha256=d6gC40YmfqBNXxizAt4MO4OOu5BoCZ7SAe1AbNwTP0E,18322
436
+ unstructured_ingest/v2/processes/connectors/onedrive.py,sha256=9UK5nILtrAXSwpp_aeANgqvQf_UbH3J3czN7y-DL9d0,17386
437
437
  unstructured_ingest/v2/processes/connectors/outlook.py,sha256=KgNGM8hImRhy6_SpswRP2VwRD4VOrqqJoySgxf2oduI,9290
438
438
  unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=bQDCch7OGiQgpWO3n3ncLuQ4XCWqDc7ZWEB-Qrqkss8,10730
439
439
  unstructured_ingest/v2/processes/connectors/redisdb.py,sha256=p0AY4ukBNpwAemV4bWzpScvVbLTVlI3DzsCNUKiBI5M,6757
@@ -561,9 +561,9 @@ unstructured_ingest/v2/processes/connectors/weaviate/cloud.py,sha256=bXtfEYLquR-
561
561
  unstructured_ingest/v2/processes/connectors/weaviate/embedded.py,sha256=S8Zg8StuZT-k7tCg1D5YShO1-vJYYk9-M1bE1fIqx64,3014
562
562
  unstructured_ingest/v2/processes/connectors/weaviate/local.py,sha256=LuTBKPseVewsz8VqxRPRLfGEm3BeI9nBZxpy7ZU5tOA,2201
563
563
  unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py,sha256=yJza_jBSEFnzZRq5L6vJ0Mm3uS1uxkOiKIimPpUyQds,12418
564
- unstructured_ingest-0.4.3.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
565
- unstructured_ingest-0.4.3.dist-info/METADATA,sha256=UXXbx1Vr9zdcvAfOdgabURlB8nR2I8Lo_aDTN1PNjwU,8051
566
- unstructured_ingest-0.4.3.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
567
- unstructured_ingest-0.4.3.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
568
- unstructured_ingest-0.4.3.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
569
- unstructured_ingest-0.4.3.dist-info/RECORD,,
564
+ unstructured_ingest-0.4.5.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
565
+ unstructured_ingest-0.4.5.dist-info/METADATA,sha256=cetNdLOmsQvHFt7j2m2utZKATnaMx9BIAV2i386aoTc,8051
566
+ unstructured_ingest-0.4.5.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
567
+ unstructured_ingest-0.4.5.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
568
+ unstructured_ingest-0.4.5.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
569
+ unstructured_ingest-0.4.5.dist-info/RECORD,,