unstructured-ingest 0.0.1__py3-none-any.whl → 0.0.2.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +10 -4
- {unstructured_ingest-0.0.1.dist-info → unstructured_ingest-0.0.2.dev0.dist-info}/METADATA +13 -13
- {unstructured_ingest-0.0.1.dist-info → unstructured_ingest-0.0.2.dev0.dist-info}/RECORD +7 -7
- {unstructured_ingest-0.0.1.dist-info → unstructured_ingest-0.0.2.dev0.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.0.1.dist-info → unstructured_ingest-0.0.2.dev0.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.1.dist-info → unstructured_ingest-0.0.2.dev0.dist-info}/top_level.txt +0 -0
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.0.1" # pragma: no cover
|
|
1
|
+
__version__ = "0.0.2-dev0" # pragma: no cover
|
|
@@ -7,6 +7,7 @@ from datetime import datetime
|
|
|
7
7
|
from pathlib import Path
|
|
8
8
|
from time import time
|
|
9
9
|
from typing import TYPE_CHECKING, Any, Generator, Optional, TypeVar
|
|
10
|
+
from uuid import NAMESPACE_DNS, uuid5
|
|
10
11
|
|
|
11
12
|
from unstructured.documents.elements import DataSourceMetadata
|
|
12
13
|
|
|
@@ -210,8 +211,11 @@ class FsspecIndexer(Indexer):
|
|
|
210
211
|
# Note: we remove any remaining leading slashes (Box introduces these)
|
|
211
212
|
# to get a valid relative path
|
|
212
213
|
rel_path = file.replace(self.index_config.path_without_protocol, "").lstrip("/")
|
|
214
|
+
|
|
215
|
+
additional_metadata = self.sterilize_info(path=file)
|
|
216
|
+
additional_metadata["original_file_path"] = file
|
|
213
217
|
yield FileData(
|
|
214
|
-
identifier=file,
|
|
218
|
+
identifier=str(uuid5(NAMESPACE_DNS, file)),
|
|
215
219
|
connector_type=self.connector_type,
|
|
216
220
|
source_identifiers=SourceIdentifiers(
|
|
217
221
|
filename=Path(file).name,
|
|
@@ -219,7 +223,7 @@ class FsspecIndexer(Indexer):
|
|
|
219
223
|
fullpath=file,
|
|
220
224
|
),
|
|
221
225
|
metadata=self.get_metadata(path=file),
|
|
222
|
-
additional_metadata=self.sterilize_info(path=file),
|
|
226
|
+
additional_metadata=additional_metadata,
|
|
223
227
|
)
|
|
224
228
|
|
|
225
229
|
|
|
@@ -262,7 +266,8 @@ class FsspecDownloader(Downloader):
|
|
|
262
266
|
download_path = self.get_download_path(file_data=file_data)
|
|
263
267
|
download_path.parent.mkdir(parents=True, exist_ok=True)
|
|
264
268
|
try:
|
|
265
|
-
|
|
269
|
+
rpath = file_data.additional_metadata["original_file_path"]
|
|
270
|
+
self.fs.get(rpath=rpath, lpath=download_path.as_posix())
|
|
266
271
|
except Exception as e:
|
|
267
272
|
logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
|
|
268
273
|
raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
|
|
@@ -272,7 +277,8 @@ class FsspecDownloader(Downloader):
|
|
|
272
277
|
download_path = self.get_download_path(file_data=file_data)
|
|
273
278
|
download_path.parent.mkdir(parents=True, exist_ok=True)
|
|
274
279
|
try:
|
|
275
|
-
|
|
280
|
+
rpath = file_data.additional_metadata["original_file_path"]
|
|
281
|
+
await self.fs.get(rpath=rpath, lpath=download_path.as_posix())
|
|
276
282
|
except Exception as e:
|
|
277
283
|
logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
|
|
278
284
|
raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: unstructured-ingest
|
|
3
|
-
Version: 0.0.1
|
|
3
|
+
Version: 0.0.2.dev0
|
|
4
4
|
Summary: A library that prepares raw documents for downstream ML tasks.
|
|
5
5
|
Home-page: https://github.com/Unstructured-IO/unstructured-ingest
|
|
6
6
|
Author: Unstructured Technologies
|
|
@@ -21,9 +21,9 @@ Classifier: Programming Language :: Python :: 3.12
|
|
|
21
21
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
22
22
|
Requires-Python: >=3.9.0,<3.13
|
|
23
23
|
Description-Content-Type: text/markdown
|
|
24
|
-
Requires-Dist: pandas
|
|
25
24
|
Requires-Dist: unstructured
|
|
26
25
|
Requires-Dist: python-dateutil
|
|
26
|
+
Requires-Dist: pandas
|
|
27
27
|
Provides-Extra: airtable
|
|
28
28
|
Requires-Dist: pyairtable ; extra == 'airtable'
|
|
29
29
|
Provides-Extra: astra
|
|
@@ -39,8 +39,8 @@ Requires-Dist: boto3 ; extra == 'bedrock'
|
|
|
39
39
|
Provides-Extra: biomed
|
|
40
40
|
Requires-Dist: bs4 ; extra == 'biomed'
|
|
41
41
|
Provides-Extra: box
|
|
42
|
-
Requires-Dist: fsspec ; extra == 'box'
|
|
43
42
|
Requires-Dist: boxfs ; extra == 'box'
|
|
43
|
+
Requires-Dist: fsspec ; extra == 'box'
|
|
44
44
|
Provides-Extra: chroma
|
|
45
45
|
Requires-Dist: typer <=0.9.0 ; extra == 'chroma'
|
|
46
46
|
Requires-Dist: chromadb ; extra == 'chroma'
|
|
@@ -63,30 +63,30 @@ Requires-Dist: unstructured[docx] ; extra == 'doc'
|
|
|
63
63
|
Provides-Extra: docx
|
|
64
64
|
Requires-Dist: unstructured[docx] ; extra == 'docx'
|
|
65
65
|
Provides-Extra: dropbox
|
|
66
|
-
Requires-Dist: dropboxdrivefs ; extra == 'dropbox'
|
|
67
66
|
Requires-Dist: fsspec ; extra == 'dropbox'
|
|
67
|
+
Requires-Dist: dropboxdrivefs ; extra == 'dropbox'
|
|
68
68
|
Provides-Extra: elasticsearch
|
|
69
69
|
Requires-Dist: elasticsearch[async] ; extra == 'elasticsearch'
|
|
70
70
|
Provides-Extra: embed-huggingface
|
|
71
|
-
Requires-Dist: huggingface ; extra == 'embed-huggingface'
|
|
72
71
|
Requires-Dist: langchain-community ; extra == 'embed-huggingface'
|
|
72
|
+
Requires-Dist: huggingface ; extra == 'embed-huggingface'
|
|
73
73
|
Requires-Dist: sentence-transformers ; extra == 'embed-huggingface'
|
|
74
74
|
Provides-Extra: embed-octoai
|
|
75
75
|
Requires-Dist: tiktoken ; extra == 'embed-octoai'
|
|
76
76
|
Requires-Dist: openai ; extra == 'embed-octoai'
|
|
77
77
|
Provides-Extra: embed-vertexai
|
|
78
|
-
Requires-Dist: langchain-community ; extra == 'embed-vertexai'
|
|
79
78
|
Requires-Dist: langchain ; extra == 'embed-vertexai'
|
|
80
79
|
Requires-Dist: langchain-google-vertexai ; extra == 'embed-vertexai'
|
|
80
|
+
Requires-Dist: langchain-community ; extra == 'embed-vertexai'
|
|
81
81
|
Provides-Extra: embed-voyageai
|
|
82
|
-
Requires-Dist: langchain-voyageai ; extra == 'embed-voyageai'
|
|
83
82
|
Requires-Dist: langchain ; extra == 'embed-voyageai'
|
|
83
|
+
Requires-Dist: langchain-voyageai ; extra == 'embed-voyageai'
|
|
84
84
|
Provides-Extra: epub
|
|
85
85
|
Requires-Dist: unstructured[epub] ; extra == 'epub'
|
|
86
86
|
Provides-Extra: gcs
|
|
87
87
|
Requires-Dist: bs4 ; extra == 'gcs'
|
|
88
|
-
Requires-Dist: gcsfs ; extra == 'gcs'
|
|
89
88
|
Requires-Dist: fsspec ; extra == 'gcs'
|
|
89
|
+
Requires-Dist: gcsfs ; extra == 'gcs'
|
|
90
90
|
Provides-Extra: github
|
|
91
91
|
Requires-Dist: pygithub >1.58.0 ; extra == 'github'
|
|
92
92
|
Provides-Extra: gitlab
|
|
@@ -109,18 +109,18 @@ Requires-Dist: pymongo ; extra == 'mongodb'
|
|
|
109
109
|
Provides-Extra: msg
|
|
110
110
|
Requires-Dist: unstructured[msg] ; extra == 'msg'
|
|
111
111
|
Provides-Extra: notion
|
|
112
|
-
Requires-Dist: htmlBuilder ; extra == 'notion'
|
|
113
112
|
Requires-Dist: notion-client ; extra == 'notion'
|
|
113
|
+
Requires-Dist: htmlBuilder ; extra == 'notion'
|
|
114
114
|
Provides-Extra: odt
|
|
115
115
|
Requires-Dist: unstructured[odt] ; extra == 'odt'
|
|
116
116
|
Provides-Extra: onedrive
|
|
117
|
-
Requires-Dist: bs4 ; extra == 'onedrive'
|
|
118
117
|
Requires-Dist: Office365-REST-Python-Client ; extra == 'onedrive'
|
|
119
118
|
Requires-Dist: msal ; extra == 'onedrive'
|
|
119
|
+
Requires-Dist: bs4 ; extra == 'onedrive'
|
|
120
120
|
Provides-Extra: openai
|
|
121
|
-
Requires-Dist: tiktoken ; extra == 'openai'
|
|
122
121
|
Requires-Dist: langchain-community ; extra == 'openai'
|
|
123
122
|
Requires-Dist: openai ; extra == 'openai'
|
|
123
|
+
Requires-Dist: tiktoken ; extra == 'openai'
|
|
124
124
|
Provides-Extra: opensearch
|
|
125
125
|
Requires-Dist: opensearch-py ; extra == 'opensearch'
|
|
126
126
|
Provides-Extra: org
|
|
@@ -147,13 +147,13 @@ Requires-Dist: unstructured[rst] ; extra == 'rst'
|
|
|
147
147
|
Provides-Extra: rtf
|
|
148
148
|
Requires-Dist: unstructured[rtf] ; extra == 'rtf'
|
|
149
149
|
Provides-Extra: s3
|
|
150
|
-
Requires-Dist: fsspec ; extra == 's3'
|
|
151
150
|
Requires-Dist: s3fs ; extra == 's3'
|
|
151
|
+
Requires-Dist: fsspec ; extra == 's3'
|
|
152
152
|
Provides-Extra: salesforce
|
|
153
153
|
Requires-Dist: simple-salesforce ; extra == 'salesforce'
|
|
154
154
|
Provides-Extra: sftp
|
|
155
|
-
Requires-Dist: paramiko ; extra == 'sftp'
|
|
156
155
|
Requires-Dist: fsspec ; extra == 'sftp'
|
|
156
|
+
Requires-Dist: paramiko ; extra == 'sftp'
|
|
157
157
|
Provides-Extra: sharepoint
|
|
158
158
|
Requires-Dist: Office365-REST-Python-Client ; extra == 'sharepoint'
|
|
159
159
|
Requires-Dist: msal ; extra == 'sharepoint'
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
|
|
2
|
-
unstructured_ingest/__version__.py,sha256=
|
|
2
|
+
unstructured_ingest/__version__.py,sha256=neOBPct_gjgXqs6YN8HnzdaRiPiugEbJpwI6SDZ7qac,47
|
|
3
3
|
unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
|
|
4
4
|
unstructured_ingest/evaluate.py,sha256=R-mKLFXbVX1xQ1tjGsLHjdP-TbSSV-925IHzggW_bIg,9793
|
|
5
5
|
unstructured_ingest/interfaces.py,sha256=uS8L5mS0mXD8I4XTfVlKZxAwqnpJ4yrRqn4vxWVRhQI,31107
|
|
@@ -346,13 +346,13 @@ unstructured_ingest/v2/processes/connectors/fsspec/__init__.py,sha256=TtdeImM7Yp
|
|
|
346
346
|
unstructured_ingest/v2/processes/connectors/fsspec/azure.py,sha256=RN7zoifocIWVgoP9aMDMz4TP-Z9KhE-HbCCBq33fY90,4674
|
|
347
347
|
unstructured_ingest/v2/processes/connectors/fsspec/box.py,sha256=UnD-F9g7yOOBStrAqeKq6GuQjEyHdwOA3jYLj8YZIRM,4088
|
|
348
348
|
unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py,sha256=I6mPG9EIso9TcIczCw5Y14Yqd-EhTQ2CLw1MJx1V3dY,4420
|
|
349
|
-
unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py,sha256=
|
|
349
|
+
unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py,sha256=MgOUhDGtTAUuzmAsLvBwV_3ggyL5DDpMm-sb4KNck88,12689
|
|
350
350
|
unstructured_ingest/v2/processes/connectors/fsspec/gcs.py,sha256=RYZq_8hKF7bRxuB5Gozv5AzB3_nTuuooE4UfRjXwEFU,4443
|
|
351
351
|
unstructured_ingest/v2/processes/connectors/fsspec/s3.py,sha256=7lOm5hjb0LBkbe-OWXnV3wDC-3mM_GWwwmdKW0xzh8c,5333
|
|
352
352
|
unstructured_ingest/v2/processes/connectors/fsspec/sftp.py,sha256=J7Ej-j7dtXAluHunwynUfHlNsYwymb-LsrGUFcljcsA,5700
|
|
353
353
|
unstructured_ingest/v2/processes/connectors/fsspec/utils.py,sha256=jec_Qfe2hbfahBuY-u8FnvHuv933AI5HwPFjOL3kEEY,456
|
|
354
|
-
unstructured_ingest-0.0.
|
|
355
|
-
unstructured_ingest-0.0.
|
|
356
|
-
unstructured_ingest-0.0.
|
|
357
|
-
unstructured_ingest-0.0.
|
|
358
|
-
unstructured_ingest-0.0.
|
|
354
|
+
unstructured_ingest-0.0.2.dev0.dist-info/METADATA,sha256=dnWewzRLiYQlU6Fglws3oQnUkgzAVCnKM7BPMtls9YU,21573
|
|
355
|
+
unstructured_ingest-0.0.2.dev0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
|
356
|
+
unstructured_ingest-0.0.2.dev0.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
|
|
357
|
+
unstructured_ingest-0.0.2.dev0.dist-info/top_level.txt,sha256=QaTxTcjfM5Hr9sZJ6weOJvSe5ESQc0F8AWkhHInTCf8,20
|
|
358
|
+
unstructured_ingest-0.0.2.dev0.dist-info/RECORD,,
|
|
File without changes
|
{unstructured_ingest-0.0.1.dist-info → unstructured_ingest-0.0.2.dev0.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
{unstructured_ingest-0.0.1.dist-info → unstructured_ingest-0.0.2.dev0.dist-info}/top_level.txt
RENAMED
|
File without changes
|