unstructured-ingest 0.4.3__py3-none-any.whl → 0.4.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/v2/interfaces/indexer.py +2 -3
- unstructured_ingest/v2/processes/connectors/onedrive.py +5 -29
- {unstructured_ingest-0.4.3.dist-info → unstructured_ingest-0.4.4.dist-info}/METADATA +23 -23
- {unstructured_ingest-0.4.3.dist-info → unstructured_ingest-0.4.4.dist-info}/RECORD +9 -9
- {unstructured_ingest-0.4.3.dist-info → unstructured_ingest-0.4.4.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.4.3.dist-info → unstructured_ingest-0.4.4.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.4.3.dist-info → unstructured_ingest-0.4.4.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.4.3.dist-info → unstructured_ingest-0.4.4.dist-info}/top_level.txt +0 -0
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.4.3" # pragma: no cover
|
|
1
|
+
__version__ = "0.4.4" # pragma: no cover
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from abc import ABC, abstractmethod
|
|
1
|
+
from abc import ABC
|
|
2
2
|
from typing import Any, AsyncGenerator, Generator, Optional, TypeVar
|
|
3
3
|
|
|
4
4
|
from pydantic import BaseModel
|
|
@@ -22,9 +22,8 @@ class Indexer(BaseProcess, BaseConnector, ABC):
|
|
|
22
22
|
def is_async(self) -> bool:
|
|
23
23
|
return False
|
|
24
24
|
|
|
25
|
-
@abstractmethod
|
|
26
25
|
def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
|
|
27
|
-
|
|
26
|
+
raise NotImplementedError()
|
|
28
27
|
|
|
29
28
|
async def run_async(self, **kwargs: Any) -> AsyncGenerator[FileData, None]:
|
|
30
29
|
raise NotImplementedError()
|
|
@@ -5,7 +5,7 @@ import json
|
|
|
5
5
|
from dataclasses import dataclass
|
|
6
6
|
from pathlib import Path
|
|
7
7
|
from time import time
|
|
8
|
-
from typing import TYPE_CHECKING, Any, AsyncIterator, Generator, Iterator, Optional, TypeVar
|
|
8
|
+
from typing import TYPE_CHECKING, Any, AsyncIterator, Optional
|
|
9
9
|
|
|
10
10
|
from dateutil import parser
|
|
11
11
|
from pydantic import Field, Secret
|
|
@@ -101,27 +101,6 @@ class OnedriveIndexerConfig(IndexerConfig):
|
|
|
101
101
|
recursive: bool = False
|
|
102
102
|
|
|
103
103
|
|
|
104
|
-
T = TypeVar("T")
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
def async_iterable_to_sync_iterable(iterator: AsyncIterator[T]) -> Iterator[T]:
|
|
108
|
-
# This version works on Python 3.9 by manually handling the async iteration.
|
|
109
|
-
loop = asyncio.new_event_loop()
|
|
110
|
-
asyncio.set_event_loop(loop)
|
|
111
|
-
try:
|
|
112
|
-
while True:
|
|
113
|
-
try:
|
|
114
|
-
# Instead of anext(iterator), we directly call __anext__().
|
|
115
|
-
# __anext__ returns a coroutine that we must run until complete.
|
|
116
|
-
future = iterator.__anext__()
|
|
117
|
-
result = loop.run_until_complete(future)
|
|
118
|
-
yield result
|
|
119
|
-
except StopAsyncIteration:
|
|
120
|
-
break
|
|
121
|
-
finally:
|
|
122
|
-
loop.close()
|
|
123
|
-
|
|
124
|
-
|
|
125
104
|
@dataclass
|
|
126
105
|
class OnedriveIndexer(Indexer):
|
|
127
106
|
connection_config: OnedriveConnectionConfig
|
|
@@ -215,7 +194,10 @@ class OnedriveIndexer(Indexer):
|
|
|
215
194
|
# Offload the file data creation if it's not guaranteed async
|
|
216
195
|
return await asyncio.to_thread(self.drive_item_to_file_data_sync, drive_item)
|
|
217
196
|
|
|
218
|
-
|
|
197
|
+
def is_async(self) -> bool:
|
|
198
|
+
return True
|
|
199
|
+
|
|
200
|
+
async def run_async(self, **kwargs: Any) -> AsyncIterator[FileData]:
|
|
219
201
|
token_resp = await asyncio.to_thread(self.connection_config.get_token)
|
|
220
202
|
if "error" in token_resp:
|
|
221
203
|
raise SourceConnectionError(
|
|
@@ -230,12 +212,6 @@ class OnedriveIndexer(Indexer):
|
|
|
230
212
|
file_data = await self.drive_item_to_file_data(drive_item=drive_item)
|
|
231
213
|
yield file_data
|
|
232
214
|
|
|
233
|
-
def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
|
|
234
|
-
# Convert the async generator to a sync generator without loading all data into memory
|
|
235
|
-
async_gen = self._run_async(**kwargs)
|
|
236
|
-
for item in async_iterable_to_sync_iterable(async_gen):
|
|
237
|
-
yield item
|
|
238
|
-
|
|
239
215
|
|
|
240
216
|
class OnedriveDownloaderConfig(DownloaderConfig):
|
|
241
217
|
pass
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: unstructured-ingest
|
|
3
|
-
Version: 0.4.3
|
|
3
|
+
Version: 0.4.4
|
|
4
4
|
Summary: A library that prepares raw documents for downstream ML tasks.
|
|
5
5
|
Home-page: https://github.com/Unstructured-IO/unstructured-ingest
|
|
6
6
|
Author: Unstructured Technologies
|
|
@@ -22,38 +22,38 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
|
22
22
|
Requires-Python: >=3.9.0,<3.14
|
|
23
23
|
Description-Content-Type: text/markdown
|
|
24
24
|
License-File: LICENSE.md
|
|
25
|
-
Requires-Dist: dataclasses-json
|
|
26
|
-
Requires-Dist: pandas
|
|
27
|
-
Requires-Dist: python-dateutil
|
|
28
|
-
Requires-Dist: opentelemetry-sdk
|
|
29
|
-
Requires-Dist: click
|
|
30
25
|
Requires-Dist: pydantic>=2.7
|
|
26
|
+
Requires-Dist: click
|
|
27
|
+
Requires-Dist: pandas
|
|
28
|
+
Requires-Dist: dataclasses-json
|
|
31
29
|
Requires-Dist: tqdm
|
|
30
|
+
Requires-Dist: opentelemetry-sdk
|
|
31
|
+
Requires-Dist: python-dateutil
|
|
32
32
|
Provides-Extra: airtable
|
|
33
33
|
Requires-Dist: pyairtable; extra == "airtable"
|
|
34
34
|
Provides-Extra: astradb
|
|
35
35
|
Requires-Dist: astrapy; extra == "astradb"
|
|
36
36
|
Provides-Extra: azure
|
|
37
|
-
Requires-Dist: fsspec; extra == "azure"
|
|
38
37
|
Requires-Dist: adlfs; extra == "azure"
|
|
38
|
+
Requires-Dist: fsspec; extra == "azure"
|
|
39
39
|
Provides-Extra: azure-ai-search
|
|
40
40
|
Requires-Dist: azure-search-documents; extra == "azure-ai-search"
|
|
41
41
|
Provides-Extra: bedrock
|
|
42
|
-
Requires-Dist: boto3; extra == "bedrock"
|
|
43
42
|
Requires-Dist: aioboto3; extra == "bedrock"
|
|
43
|
+
Requires-Dist: boto3; extra == "bedrock"
|
|
44
44
|
Provides-Extra: biomed
|
|
45
45
|
Requires-Dist: bs4; extra == "biomed"
|
|
46
46
|
Requires-Dist: requests; extra == "biomed"
|
|
47
47
|
Provides-Extra: box
|
|
48
|
-
Requires-Dist: fsspec; extra == "box"
|
|
49
48
|
Requires-Dist: boxfs; extra == "box"
|
|
49
|
+
Requires-Dist: fsspec; extra == "box"
|
|
50
50
|
Provides-Extra: chroma
|
|
51
51
|
Requires-Dist: chromadb; extra == "chroma"
|
|
52
52
|
Provides-Extra: clarifai
|
|
53
53
|
Requires-Dist: clarifai; extra == "clarifai"
|
|
54
54
|
Provides-Extra: confluence
|
|
55
|
-
Requires-Dist: requests; extra == "confluence"
|
|
56
55
|
Requires-Dist: atlassian-python-api; extra == "confluence"
|
|
56
|
+
Requires-Dist: requests; extra == "confluence"
|
|
57
57
|
Provides-Extra: couchbase
|
|
58
58
|
Requires-Dist: couchbase; extra == "couchbase"
|
|
59
59
|
Provides-Extra: csv
|
|
@@ -63,8 +63,8 @@ Requires-Dist: databricks-sql-connector; extra == "databricks-delta-tables"
|
|
|
63
63
|
Provides-Extra: databricks-volumes
|
|
64
64
|
Requires-Dist: databricks-sdk; extra == "databricks-volumes"
|
|
65
65
|
Provides-Extra: delta-table
|
|
66
|
-
Requires-Dist: boto3; extra == "delta-table"
|
|
67
66
|
Requires-Dist: deltalake; extra == "delta-table"
|
|
67
|
+
Requires-Dist: boto3; extra == "delta-table"
|
|
68
68
|
Provides-Extra: discord
|
|
69
69
|
Requires-Dist: discord.py; extra == "discord"
|
|
70
70
|
Provides-Extra: doc
|
|
@@ -72,8 +72,8 @@ Requires-Dist: unstructured[docx]; extra == "doc"
|
|
|
72
72
|
Provides-Extra: docx
|
|
73
73
|
Requires-Dist: unstructured[docx]; extra == "docx"
|
|
74
74
|
Provides-Extra: dropbox
|
|
75
|
-
Requires-Dist: fsspec; extra == "dropbox"
|
|
76
75
|
Requires-Dist: dropboxdrivefs; extra == "dropbox"
|
|
76
|
+
Requires-Dist: fsspec; extra == "dropbox"
|
|
77
77
|
Provides-Extra: duckdb
|
|
78
78
|
Requires-Dist: duckdb; extra == "duckdb"
|
|
79
79
|
Provides-Extra: elasticsearch
|
|
@@ -92,12 +92,12 @@ Requires-Dist: voyageai; extra == "embed-voyageai"
|
|
|
92
92
|
Provides-Extra: epub
|
|
93
93
|
Requires-Dist: unstructured[epub]; extra == "epub"
|
|
94
94
|
Provides-Extra: gcs
|
|
95
|
-
Requires-Dist: gcsfs; extra == "gcs"
|
|
96
|
-
Requires-Dist: fsspec; extra == "gcs"
|
|
97
95
|
Requires-Dist: bs4; extra == "gcs"
|
|
96
|
+
Requires-Dist: fsspec; extra == "gcs"
|
|
97
|
+
Requires-Dist: gcsfs; extra == "gcs"
|
|
98
98
|
Provides-Extra: github
|
|
99
|
-
Requires-Dist: requests; extra == "github"
|
|
100
99
|
Requires-Dist: pygithub>1.58.0; extra == "github"
|
|
100
|
+
Requires-Dist: requests; extra == "github"
|
|
101
101
|
Provides-Extra: gitlab
|
|
102
102
|
Requires-Dist: python-gitlab; extra == "gitlab"
|
|
103
103
|
Provides-Extra: google-drive
|
|
@@ -122,20 +122,20 @@ Requires-Dist: pymongo; extra == "mongodb"
|
|
|
122
122
|
Provides-Extra: msg
|
|
123
123
|
Requires-Dist: unstructured[msg]; extra == "msg"
|
|
124
124
|
Provides-Extra: neo4j
|
|
125
|
-
Requires-Dist: neo4j; extra == "neo4j"
|
|
126
125
|
Requires-Dist: networkx; extra == "neo4j"
|
|
127
126
|
Requires-Dist: cymple; extra == "neo4j"
|
|
127
|
+
Requires-Dist: neo4j; extra == "neo4j"
|
|
128
128
|
Provides-Extra: notion
|
|
129
|
-
Requires-Dist: htmlBuilder; extra == "notion"
|
|
130
129
|
Requires-Dist: backoff; extra == "notion"
|
|
131
|
-
Requires-Dist: notion-client; extra == "notion"
|
|
132
130
|
Requires-Dist: httpx; extra == "notion"
|
|
131
|
+
Requires-Dist: notion-client; extra == "notion"
|
|
132
|
+
Requires-Dist: htmlBuilder; extra == "notion"
|
|
133
133
|
Provides-Extra: odt
|
|
134
134
|
Requires-Dist: unstructured[odt]; extra == "odt"
|
|
135
135
|
Provides-Extra: onedrive
|
|
136
|
-
Requires-Dist: msal; extra == "onedrive"
|
|
137
136
|
Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
|
|
138
137
|
Requires-Dist: bs4; extra == "onedrive"
|
|
138
|
+
Requires-Dist: msal; extra == "onedrive"
|
|
139
139
|
Provides-Extra: openai
|
|
140
140
|
Requires-Dist: openai; extra == "openai"
|
|
141
141
|
Requires-Dist: tiktoken; extra == "openai"
|
|
@@ -144,8 +144,8 @@ Requires-Dist: opensearch-py; extra == "opensearch"
|
|
|
144
144
|
Provides-Extra: org
|
|
145
145
|
Requires-Dist: unstructured[org]; extra == "org"
|
|
146
146
|
Provides-Extra: outlook
|
|
147
|
-
Requires-Dist: msal; extra == "outlook"
|
|
148
147
|
Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
|
|
148
|
+
Requires-Dist: msal; extra == "outlook"
|
|
149
149
|
Provides-Extra: pdf
|
|
150
150
|
Requires-Dist: unstructured[pdf]; extra == "pdf"
|
|
151
151
|
Provides-Extra: pinecone
|
|
@@ -177,23 +177,23 @@ Provides-Extra: sftp
|
|
|
177
177
|
Requires-Dist: fsspec; extra == "sftp"
|
|
178
178
|
Requires-Dist: paramiko; extra == "sftp"
|
|
179
179
|
Provides-Extra: sharepoint
|
|
180
|
-
Requires-Dist: msal; extra == "sharepoint"
|
|
181
180
|
Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
|
|
181
|
+
Requires-Dist: msal; extra == "sharepoint"
|
|
182
182
|
Provides-Extra: singlestore
|
|
183
183
|
Requires-Dist: singlestoredb; extra == "singlestore"
|
|
184
184
|
Provides-Extra: slack
|
|
185
185
|
Requires-Dist: slack-sdk[optional]; extra == "slack"
|
|
186
186
|
Provides-Extra: snowflake
|
|
187
|
-
Requires-Dist: snowflake-connector-python; extra == "snowflake"
|
|
188
187
|
Requires-Dist: psycopg2-binary; extra == "snowflake"
|
|
188
|
+
Requires-Dist: snowflake-connector-python; extra == "snowflake"
|
|
189
189
|
Provides-Extra: togetherai
|
|
190
190
|
Requires-Dist: together; extra == "togetherai"
|
|
191
191
|
Provides-Extra: tsv
|
|
192
192
|
Requires-Dist: unstructured[tsv]; extra == "tsv"
|
|
193
193
|
Provides-Extra: vastdb
|
|
194
|
-
Requires-Dist: vastdb; extra == "vastdb"
|
|
195
194
|
Requires-Dist: ibis; extra == "vastdb"
|
|
196
195
|
Requires-Dist: pyarrow; extra == "vastdb"
|
|
196
|
+
Requires-Dist: vastdb; extra == "vastdb"
|
|
197
197
|
Provides-Extra: vectara
|
|
198
198
|
Requires-Dist: httpx; extra == "vectara"
|
|
199
199
|
Requires-Dist: requests; extra == "vectara"
|
|
@@ -102,7 +102,7 @@ test/unit/v2/partitioners/test_partitioner.py,sha256=iIYg7IpftV3LusoO4H8tr1IHY1U
|
|
|
102
102
|
test/unit/v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
103
103
|
test/unit/v2/utils/data_generator.py,sha256=UoYVNjG4S4wlaA9gceQ82HIpF9_6I1UTHD1_GrQBHp0,973
|
|
104
104
|
unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
|
|
105
|
-
unstructured_ingest/__version__.py,sha256=
|
|
105
|
+
unstructured_ingest/__version__.py,sha256=k5K6WAWnRkNeRW39AQyaFiSCUwHRsxlNOpkoF4MqU3c,42
|
|
106
106
|
unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
|
|
107
107
|
unstructured_ingest/interfaces.py,sha256=OYVUP0bzBJpT-Lz92BDyz_hLBvyfxkuSwWHhUdnUayA,31493
|
|
108
108
|
unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
|
|
@@ -391,7 +391,7 @@ unstructured_ingest/v2/interfaces/__init__.py,sha256=Xp7-345QpM6MG7V7G4ZrVERjADA
|
|
|
391
391
|
unstructured_ingest/v2/interfaces/connector.py,sha256=qUFFJ3qgDMenTCZMtVRjq1DIwsVak6pxNjQOH2eVkMw,1623
|
|
392
392
|
unstructured_ingest/v2/interfaces/downloader.py,sha256=Lj3nTY1hPA71GfNeedFVCdHdZsHLle8qrx5RtXAy9GY,2940
|
|
393
393
|
unstructured_ingest/v2/interfaces/file_data.py,sha256=7MyRlj5dijQsCR6W18wQ8fEgJigGKwoOYc10g9A6PSo,3834
|
|
394
|
-
unstructured_ingest/v2/interfaces/indexer.py,sha256=
|
|
394
|
+
unstructured_ingest/v2/interfaces/indexer.py,sha256=i0oftyifXefxfKa4a3sCfSwkzWGSPE6EvC9sg6fwZgk,833
|
|
395
395
|
unstructured_ingest/v2/interfaces/process.py,sha256=6Ll0O9ATcdm36dx2_TOg9PfCEJrADgyd8OQK3TTNzZM,448
|
|
396
396
|
unstructured_ingest/v2/interfaces/processor.py,sha256=VX7JqXlbG1plxMK8THWhWINPbTICaaUEk4XUXhnOixY,3303
|
|
397
397
|
unstructured_ingest/v2/interfaces/upload_stager.py,sha256=9EV9863ODDv0Y5liDT3xh2yiVuFiaVVyCcnwCy6nfkM,3172
|
|
@@ -433,7 +433,7 @@ unstructured_ingest/v2/processes/connectors/local.py,sha256=ZvWTj6ZYkwnvQMNFsZWo
|
|
|
433
433
|
unstructured_ingest/v2/processes/connectors/milvus.py,sha256=wmcu9NVy3gYlQGT25inN5w_QrhFoL8-hRq0pJFSNw8g,8866
|
|
434
434
|
unstructured_ingest/v2/processes/connectors/mongodb.py,sha256=cL0QUQZF_s2brh3nNNeAywXVpaIiND4b5JTAFlYjLjw,14273
|
|
435
435
|
unstructured_ingest/v2/processes/connectors/neo4j.py,sha256=HU1IwchTM7Q1kkeIFVe-Lg6gInMItBpgkDkVwuTvkGY,14259
|
|
436
|
-
unstructured_ingest/v2/processes/connectors/onedrive.py,sha256=
|
|
436
|
+
unstructured_ingest/v2/processes/connectors/onedrive.py,sha256=sVRk1LodwVS9do3kmetO8kvSdEzfR-oATXa6covC64Y,17365
|
|
437
437
|
unstructured_ingest/v2/processes/connectors/outlook.py,sha256=KgNGM8hImRhy6_SpswRP2VwRD4VOrqqJoySgxf2oduI,9290
|
|
438
438
|
unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=bQDCch7OGiQgpWO3n3ncLuQ4XCWqDc7ZWEB-Qrqkss8,10730
|
|
439
439
|
unstructured_ingest/v2/processes/connectors/redisdb.py,sha256=p0AY4ukBNpwAemV4bWzpScvVbLTVlI3DzsCNUKiBI5M,6757
|
|
@@ -561,9 +561,9 @@ unstructured_ingest/v2/processes/connectors/weaviate/cloud.py,sha256=bXtfEYLquR-
|
|
|
561
561
|
unstructured_ingest/v2/processes/connectors/weaviate/embedded.py,sha256=S8Zg8StuZT-k7tCg1D5YShO1-vJYYk9-M1bE1fIqx64,3014
|
|
562
562
|
unstructured_ingest/v2/processes/connectors/weaviate/local.py,sha256=LuTBKPseVewsz8VqxRPRLfGEm3BeI9nBZxpy7ZU5tOA,2201
|
|
563
563
|
unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py,sha256=yJza_jBSEFnzZRq5L6vJ0Mm3uS1uxkOiKIimPpUyQds,12418
|
|
564
|
-
unstructured_ingest-0.4.3.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
|
|
565
|
-
unstructured_ingest-0.4.
|
|
566
|
-
unstructured_ingest-0.4.3.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
|
|
567
|
-
unstructured_ingest-0.4.3.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
|
|
568
|
-
unstructured_ingest-0.4.3.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
|
|
569
|
-
unstructured_ingest-0.4.3.dist-info/RECORD,,
|
|
564
|
+
unstructured_ingest-0.4.4.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
|
|
565
|
+
unstructured_ingest-0.4.4.dist-info/METADATA,sha256=h_Yeg9jJuyJmsipS3juMfEozK8U6sNyA-PotmiuuBsE,8051
|
|
566
|
+
unstructured_ingest-0.4.4.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
|
|
567
|
+
unstructured_ingest-0.4.4.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
|
|
568
|
+
unstructured_ingest-0.4.4.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
|
|
569
|
+
unstructured_ingest-0.4.4.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
{unstructured_ingest-0.4.3.dist-info → unstructured_ingest-0.4.4.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
|
File without changes
|