unstructured-ingest: 0.0.1 (py3-none-any.whl) → 0.0.2.dev0 (py3-none-any.whl)

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

@@ -1 +1 @@
1
- __version__ = "0.0.1" # pragma: no cover
1
+ __version__ = "0.0.2-dev0" # pragma: no cover
@@ -7,6 +7,7 @@ from datetime import datetime
7
7
  from pathlib import Path
8
8
  from time import time
9
9
  from typing import TYPE_CHECKING, Any, Generator, Optional, TypeVar
10
+ from uuid import NAMESPACE_DNS, uuid5
10
11
 
11
12
  from unstructured.documents.elements import DataSourceMetadata
12
13
 
@@ -210,8 +211,11 @@ class FsspecIndexer(Indexer):
210
211
  # Note: we remove any remaining leading slashes (Box introduces these)
211
212
  # to get a valid relative path
212
213
  rel_path = file.replace(self.index_config.path_without_protocol, "").lstrip("/")
214
+
215
+ additional_metadata = self.sterilize_info(path=file)
216
+ additional_metadata["original_file_path"] = file
213
217
  yield FileData(
214
- identifier=file,
218
+ identifier=str(uuid5(NAMESPACE_DNS, file)),
215
219
  connector_type=self.connector_type,
216
220
  source_identifiers=SourceIdentifiers(
217
221
  filename=Path(file).name,
@@ -219,7 +223,7 @@ class FsspecIndexer(Indexer):
219
223
  fullpath=file,
220
224
  ),
221
225
  metadata=self.get_metadata(path=file),
222
- additional_metadata=self.sterilize_info(path=file),
226
+ additional_metadata=additional_metadata,
223
227
  )
224
228
 
225
229
 
@@ -262,7 +266,8 @@ class FsspecDownloader(Downloader):
262
266
  download_path = self.get_download_path(file_data=file_data)
263
267
  download_path.parent.mkdir(parents=True, exist_ok=True)
264
268
  try:
265
- self.fs.get(rpath=file_data.identifier, lpath=download_path.as_posix())
269
+ rpath = file_data.additional_metadata["original_file_path"]
270
+ self.fs.get(rpath=rpath, lpath=download_path.as_posix())
266
271
  except Exception as e:
267
272
  logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
268
273
  raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
@@ -272,7 +277,8 @@ class FsspecDownloader(Downloader):
272
277
  download_path = self.get_download_path(file_data=file_data)
273
278
  download_path.parent.mkdir(parents=True, exist_ok=True)
274
279
  try:
275
- await self.fs.get(rpath=file_data.identifier, lpath=download_path.as_posix())
280
+ rpath = file_data.additional_metadata["original_file_path"]
281
+ await self.fs.get(rpath=rpath, lpath=download_path.as_posix())
276
282
  except Exception as e:
277
283
  logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
278
284
  raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: unstructured-ingest
3
- Version: 0.0.1
3
+ Version: 0.0.2.dev0
4
4
  Summary: A library that prepares raw documents for downstream ML tasks.
5
5
  Home-page: https://github.com/Unstructured-IO/unstructured-ingest
6
6
  Author: Unstructured Technologies
@@ -21,9 +21,9 @@ Classifier: Programming Language :: Python :: 3.12
21
21
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
22
  Requires-Python: >=3.9.0,<3.13
23
23
  Description-Content-Type: text/markdown
24
- Requires-Dist: pandas
25
24
  Requires-Dist: unstructured
26
25
  Requires-Dist: python-dateutil
26
+ Requires-Dist: pandas
27
27
  Provides-Extra: airtable
28
28
  Requires-Dist: pyairtable ; extra == 'airtable'
29
29
  Provides-Extra: astra
@@ -39,8 +39,8 @@ Requires-Dist: boto3 ; extra == 'bedrock'
39
39
  Provides-Extra: biomed
40
40
  Requires-Dist: bs4 ; extra == 'biomed'
41
41
  Provides-Extra: box
42
- Requires-Dist: fsspec ; extra == 'box'
43
42
  Requires-Dist: boxfs ; extra == 'box'
43
+ Requires-Dist: fsspec ; extra == 'box'
44
44
  Provides-Extra: chroma
45
45
  Requires-Dist: typer <=0.9.0 ; extra == 'chroma'
46
46
  Requires-Dist: chromadb ; extra == 'chroma'
@@ -63,30 +63,30 @@ Requires-Dist: unstructured[docx] ; extra == 'doc'
63
63
  Provides-Extra: docx
64
64
  Requires-Dist: unstructured[docx] ; extra == 'docx'
65
65
  Provides-Extra: dropbox
66
- Requires-Dist: dropboxdrivefs ; extra == 'dropbox'
67
66
  Requires-Dist: fsspec ; extra == 'dropbox'
67
+ Requires-Dist: dropboxdrivefs ; extra == 'dropbox'
68
68
  Provides-Extra: elasticsearch
69
69
  Requires-Dist: elasticsearch[async] ; extra == 'elasticsearch'
70
70
  Provides-Extra: embed-huggingface
71
- Requires-Dist: huggingface ; extra == 'embed-huggingface'
72
71
  Requires-Dist: langchain-community ; extra == 'embed-huggingface'
72
+ Requires-Dist: huggingface ; extra == 'embed-huggingface'
73
73
  Requires-Dist: sentence-transformers ; extra == 'embed-huggingface'
74
74
  Provides-Extra: embed-octoai
75
75
  Requires-Dist: tiktoken ; extra == 'embed-octoai'
76
76
  Requires-Dist: openai ; extra == 'embed-octoai'
77
77
  Provides-Extra: embed-vertexai
78
- Requires-Dist: langchain-community ; extra == 'embed-vertexai'
79
78
  Requires-Dist: langchain ; extra == 'embed-vertexai'
80
79
  Requires-Dist: langchain-google-vertexai ; extra == 'embed-vertexai'
80
+ Requires-Dist: langchain-community ; extra == 'embed-vertexai'
81
81
  Provides-Extra: embed-voyageai
82
- Requires-Dist: langchain-voyageai ; extra == 'embed-voyageai'
83
82
  Requires-Dist: langchain ; extra == 'embed-voyageai'
83
+ Requires-Dist: langchain-voyageai ; extra == 'embed-voyageai'
84
84
  Provides-Extra: epub
85
85
  Requires-Dist: unstructured[epub] ; extra == 'epub'
86
86
  Provides-Extra: gcs
87
87
  Requires-Dist: bs4 ; extra == 'gcs'
88
- Requires-Dist: gcsfs ; extra == 'gcs'
89
88
  Requires-Dist: fsspec ; extra == 'gcs'
89
+ Requires-Dist: gcsfs ; extra == 'gcs'
90
90
  Provides-Extra: github
91
91
  Requires-Dist: pygithub >1.58.0 ; extra == 'github'
92
92
  Provides-Extra: gitlab
@@ -109,18 +109,18 @@ Requires-Dist: pymongo ; extra == 'mongodb'
109
109
  Provides-Extra: msg
110
110
  Requires-Dist: unstructured[msg] ; extra == 'msg'
111
111
  Provides-Extra: notion
112
- Requires-Dist: htmlBuilder ; extra == 'notion'
113
112
  Requires-Dist: notion-client ; extra == 'notion'
113
+ Requires-Dist: htmlBuilder ; extra == 'notion'
114
114
  Provides-Extra: odt
115
115
  Requires-Dist: unstructured[odt] ; extra == 'odt'
116
116
  Provides-Extra: onedrive
117
- Requires-Dist: bs4 ; extra == 'onedrive'
118
117
  Requires-Dist: Office365-REST-Python-Client ; extra == 'onedrive'
119
118
  Requires-Dist: msal ; extra == 'onedrive'
119
+ Requires-Dist: bs4 ; extra == 'onedrive'
120
120
  Provides-Extra: openai
121
- Requires-Dist: tiktoken ; extra == 'openai'
122
121
  Requires-Dist: langchain-community ; extra == 'openai'
123
122
  Requires-Dist: openai ; extra == 'openai'
123
+ Requires-Dist: tiktoken ; extra == 'openai'
124
124
  Provides-Extra: opensearch
125
125
  Requires-Dist: opensearch-py ; extra == 'opensearch'
126
126
  Provides-Extra: org
@@ -147,13 +147,13 @@ Requires-Dist: unstructured[rst] ; extra == 'rst'
147
147
  Provides-Extra: rtf
148
148
  Requires-Dist: unstructured[rtf] ; extra == 'rtf'
149
149
  Provides-Extra: s3
150
- Requires-Dist: fsspec ; extra == 's3'
151
150
  Requires-Dist: s3fs ; extra == 's3'
151
+ Requires-Dist: fsspec ; extra == 's3'
152
152
  Provides-Extra: salesforce
153
153
  Requires-Dist: simple-salesforce ; extra == 'salesforce'
154
154
  Provides-Extra: sftp
155
- Requires-Dist: paramiko ; extra == 'sftp'
156
155
  Requires-Dist: fsspec ; extra == 'sftp'
156
+ Requires-Dist: paramiko ; extra == 'sftp'
157
157
  Provides-Extra: sharepoint
158
158
  Requires-Dist: Office365-REST-Python-Client ; extra == 'sharepoint'
159
159
  Requires-Dist: msal ; extra == 'sharepoint'
@@ -1,5 +1,5 @@
1
1
  unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
2
- unstructured_ingest/__version__.py,sha256=SI019rW6paHw93e6fOWFzF9TruLom8o9HrgZsjGZvaE,42
2
+ unstructured_ingest/__version__.py,sha256=neOBPct_gjgXqs6YN8HnzdaRiPiugEbJpwI6SDZ7qac,47
3
3
  unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
4
4
  unstructured_ingest/evaluate.py,sha256=R-mKLFXbVX1xQ1tjGsLHjdP-TbSSV-925IHzggW_bIg,9793
5
5
  unstructured_ingest/interfaces.py,sha256=uS8L5mS0mXD8I4XTfVlKZxAwqnpJ4yrRqn4vxWVRhQI,31107
@@ -346,13 +346,13 @@ unstructured_ingest/v2/processes/connectors/fsspec/__init__.py,sha256=TtdeImM7Yp
346
346
  unstructured_ingest/v2/processes/connectors/fsspec/azure.py,sha256=RN7zoifocIWVgoP9aMDMz4TP-Z9KhE-HbCCBq33fY90,4674
347
347
  unstructured_ingest/v2/processes/connectors/fsspec/box.py,sha256=UnD-F9g7yOOBStrAqeKq6GuQjEyHdwOA3jYLj8YZIRM,4088
348
348
  unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py,sha256=I6mPG9EIso9TcIczCw5Y14Yqd-EhTQ2CLw1MJx1V3dY,4420
349
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py,sha256=gNgrRqKqk9YpBRGqGPvBUuEcBv1jN59fmBBj6NrB4sA,12394
349
+ unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py,sha256=MgOUhDGtTAUuzmAsLvBwV_3ggyL5DDpMm-sb4KNck88,12689
350
350
  unstructured_ingest/v2/processes/connectors/fsspec/gcs.py,sha256=RYZq_8hKF7bRxuB5Gozv5AzB3_nTuuooE4UfRjXwEFU,4443
351
351
  unstructured_ingest/v2/processes/connectors/fsspec/s3.py,sha256=7lOm5hjb0LBkbe-OWXnV3wDC-3mM_GWwwmdKW0xzh8c,5333
352
352
  unstructured_ingest/v2/processes/connectors/fsspec/sftp.py,sha256=J7Ej-j7dtXAluHunwynUfHlNsYwymb-LsrGUFcljcsA,5700
353
353
  unstructured_ingest/v2/processes/connectors/fsspec/utils.py,sha256=jec_Qfe2hbfahBuY-u8FnvHuv933AI5HwPFjOL3kEEY,456
354
- unstructured_ingest-0.0.1.dist-info/METADATA,sha256=Qru27Cxrf0C-vFe7MqfaKOfavazrWYTTRif6loKf71o,21568
355
- unstructured_ingest-0.0.1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
356
- unstructured_ingest-0.0.1.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
357
- unstructured_ingest-0.0.1.dist-info/top_level.txt,sha256=QaTxTcjfM5Hr9sZJ6weOJvSe5ESQc0F8AWkhHInTCf8,20
358
- unstructured_ingest-0.0.1.dist-info/RECORD,,
354
+ unstructured_ingest-0.0.2.dev0.dist-info/METADATA,sha256=dnWewzRLiYQlU6Fglws3oQnUkgzAVCnKM7BPMtls9YU,21573
355
+ unstructured_ingest-0.0.2.dev0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
356
+ unstructured_ingest-0.0.2.dev0.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
357
+ unstructured_ingest-0.0.2.dev0.dist-info/top_level.txt,sha256=QaTxTcjfM5Hr9sZJ6weOJvSe5ESQc0F8AWkhHInTCf8,20
358
+ unstructured_ingest-0.0.2.dev0.dist-info/RECORD,,