unstructured-ingest 1.0.51__py3-none-any.whl → 1.0.53__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/processes/connectors/fsspec/fsspec.py +3 -2
- unstructured_ingest/processes/connectors/sharepoint.py +5 -2
- unstructured_ingest/utils/__init__.py +5 -0
- unstructured_ingest/utils/compression.py +2 -1
- unstructured_ingest/utils/filesystem.py +27 -0
- {unstructured_ingest-1.0.51.dist-info → unstructured_ingest-1.0.53.dist-info}/METADATA +1 -1
- {unstructured_ingest-1.0.51.dist-info → unstructured_ingest-1.0.53.dist-info}/RECORD +11 -10
- {unstructured_ingest-1.0.51.dist-info → unstructured_ingest-1.0.53.dist-info}/WHEEL +0 -0
- {unstructured_ingest-1.0.51.dist-info → unstructured_ingest-1.0.53.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-1.0.51.dist-info → unstructured_ingest-1.0.53.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.0.51" # pragma: no cover
|
|
1
|
+
__version__ = "1.0.53" # pragma: no cover
|
|
@@ -30,6 +30,7 @@ from unstructured_ingest.interfaces import (
|
|
|
30
30
|
)
|
|
31
31
|
from unstructured_ingest.logger import logger
|
|
32
32
|
from unstructured_ingest.processes.connectors.fsspec.utils import sterilize_dict
|
|
33
|
+
from unstructured_ingest.utils.filesystem import mkdir_concurrent_safe
|
|
33
34
|
|
|
34
35
|
if TYPE_CHECKING:
|
|
35
36
|
from fsspec import AbstractFileSystem
|
|
@@ -270,7 +271,7 @@ class FsspecDownloader(Downloader):
|
|
|
270
271
|
|
|
271
272
|
def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
|
|
272
273
|
download_path = self.get_download_path(file_data=file_data)
|
|
273
|
-
download_path.parent.mkdir(parents=True, exist_ok=True)
|
|
274
|
+
mkdir_concurrent_safe(download_path.parent)
|
|
274
275
|
try:
|
|
275
276
|
rpath = file_data.additional_metadata["original_file_path"]
|
|
276
277
|
with self.connection_config.get_client(protocol=self.protocol) as client:
|
|
@@ -282,7 +283,7 @@ class FsspecDownloader(Downloader):
|
|
|
282
283
|
|
|
283
284
|
async def async_run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
|
|
284
285
|
download_path = self.get_download_path(file_data=file_data)
|
|
285
|
-
download_path.parent.mkdir(parents=True, exist_ok=True)
|
|
286
|
+
mkdir_concurrent_safe(download_path.parent)
|
|
286
287
|
try:
|
|
287
288
|
rpath = file_data.additional_metadata["original_file_path"]
|
|
288
289
|
with self.connection_config.get_client(protocol=self.protocol) as client:
|
|
@@ -106,8 +106,11 @@ class SharepointIndexer(OnedriveIndexer):
|
|
|
106
106
|
try:
|
|
107
107
|
client_site = client.sites.get_by_url(self.connection_config.site).get().execute_query()
|
|
108
108
|
site_drive_item = self.connection_config._get_drive_item(client_site)
|
|
109
|
-
except ClientRequestException:
|
|
110
|
-
logger.
|
|
109
|
+
except ClientRequestException as e:
|
|
110
|
+
logger.error(f"Failed to access SharePoint site: {self.connection_config.site}")
|
|
111
|
+
raise SourceConnectionError(
|
|
112
|
+
f"Unable to access SharePoint site at {self.connection_config.site}: {str(e)}"
|
|
113
|
+
)
|
|
111
114
|
|
|
112
115
|
path = self.index_config.path
|
|
113
116
|
# Deprecated sharepoint sdk needed a default path. Microsoft Graph SDK does not.
|
|
@@ -6,6 +6,7 @@ from pathlib import Path
|
|
|
6
6
|
from typing import Optional
|
|
7
7
|
|
|
8
8
|
from unstructured_ingest.logger import logger
|
|
9
|
+
from unstructured_ingest.utils.filesystem import mkdir_concurrent_safe
|
|
9
10
|
|
|
10
11
|
ZIP_FILE_EXT = [".zip"]
|
|
11
12
|
TAR_FILE_EXT = [".tar", ".tar.gz", ".tgz"]
|
|
@@ -17,7 +18,7 @@ def uncompress_file(filename: str, path: Optional[str] = None) -> str:
|
|
|
17
18
|
"""
|
|
18
19
|
# Create path if it doesn't already exist
|
|
19
20
|
if path:
|
|
20
|
-
Path(path).mkdir(parents=True, exist_ok=True)
|
|
21
|
+
mkdir_concurrent_safe(Path(path))
|
|
21
22
|
|
|
22
23
|
if any(filename.endswith(ext) for ext in ZIP_FILE_EXT):
|
|
23
24
|
return uncompress_zip_file(zip_filename=filename, path=path)
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Filesystem utilities for concurrent operations.
|
|
3
|
+
|
|
4
|
+
This module provides race-condition-safe filesystem operations that are needed
|
|
5
|
+
when multiple processes operate on the same directory structures simultaneously.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def mkdir_concurrent_safe(path: Path) -> None:
|
|
12
|
+
"""
|
|
13
|
+
Create directory safely in concurrent environments, handling race conditions.
|
|
14
|
+
|
|
15
|
+
This addresses the issue where Path.mkdir(parents=True, exist_ok=True) can still
|
|
16
|
+
raise FileExistsError when multiple processes attempt to create overlapping
|
|
17
|
+
directory structures simultaneously. In this codebase, this occurs when multiple
|
|
18
|
+
files are being downloaded in parallel and archive extraction is happening in parallel.
|
|
19
|
+
|
|
20
|
+
Related: https://github.com/python/cpython/pull/112966/files
|
|
21
|
+
Python core team used the same approach to fix zipfile race conditions.
|
|
22
|
+
"""
|
|
23
|
+
try:
|
|
24
|
+
path.mkdir(parents=True, exist_ok=True)
|
|
25
|
+
except FileExistsError:
|
|
26
|
+
if not (path.exists() and path.is_dir()):
|
|
27
|
+
raise
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
|
|
2
|
-
unstructured_ingest/__version__.py,sha256=
|
|
2
|
+
unstructured_ingest/__version__.py,sha256=R6_J5XcJv3o4se9ZaxG3ld-O1m1WwU2CyxpqODyEhmg,43
|
|
3
3
|
unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
|
|
4
4
|
unstructured_ingest/errors_v2.py,sha256=9RuRCi7lbDxCguDz07y5RiHoQiFIOWwOD7xqzJ2B3Yw,436
|
|
5
5
|
unstructured_ingest/logger.py,sha256=7e_7UeK6hVOd5BQ6i9NzRUAPCS_DF839Y8TjUDywraY,1428
|
|
@@ -84,7 +84,7 @@ unstructured_ingest/processes/connectors/outlook.py,sha256=6HHubZI_zttEfYp0XNd4Y
|
|
|
84
84
|
unstructured_ingest/processes/connectors/pinecone.py,sha256=jCabAqKQyBFzaGjphxLMr57y7P0Z15Jd9Jj-JM40YnU,15090
|
|
85
85
|
unstructured_ingest/processes/connectors/redisdb.py,sha256=rTihbfv0Mlk1eo5Izn-JXRu5Ad5C-KD58nSqeKsaZJ8,8024
|
|
86
86
|
unstructured_ingest/processes/connectors/salesforce.py,sha256=N_UoebrhzXZNWw-X7lg8_qAziXx5L_d8XHnHWKNNYR8,11767
|
|
87
|
-
unstructured_ingest/processes/connectors/sharepoint.py,sha256=
|
|
87
|
+
unstructured_ingest/processes/connectors/sharepoint.py,sha256=Wgv9Pih9S9FmQJud1bg7kj_qqi55d7QZ48LqlTU_mk0,6509
|
|
88
88
|
unstructured_ingest/processes/connectors/slack.py,sha256=oboIfX7ayBMK0te5Nv50iyL3FQJFXJbRxZSQaCMp3kM,9318
|
|
89
89
|
unstructured_ingest/processes/connectors/utils.py,sha256=TAd0hb1f291N-q7-TUe6JKSCGkhqDyo7Ij8zmliBZUc,2071
|
|
90
90
|
unstructured_ingest/processes/connectors/vectara.py,sha256=xrC6jkgW8BII4UjdzUelDu122xT484cpfMTK2wl-sko,12292
|
|
@@ -109,7 +109,7 @@ unstructured_ingest/processes/connectors/fsspec/__init__.py,sha256=3HTdw4L4mdN4W
|
|
|
109
109
|
unstructured_ingest/processes/connectors/fsspec/azure.py,sha256=31VNiG5YnXfhrFX7QJ2O1ubeWHxbe1sYVIztefbscAQ,7148
|
|
110
110
|
unstructured_ingest/processes/connectors/fsspec/box.py,sha256=1gLS7xR2vbjgKBrQ4ZpI1fKTsJuIDfXuAzx_a4FzxG4,5873
|
|
111
111
|
unstructured_ingest/processes/connectors/fsspec/dropbox.py,sha256=HwwKjQmjM7yFk9Esh_F20xDisRPXGUkFduzaasByRDE,8355
|
|
112
|
-
unstructured_ingest/processes/connectors/fsspec/fsspec.py,sha256=
|
|
112
|
+
unstructured_ingest/processes/connectors/fsspec/fsspec.py,sha256=S1siX888TfHAByEXuvOqkTbcNAzx-m5UNqhKjiEKR5s,14524
|
|
113
113
|
unstructured_ingest/processes/connectors/fsspec/gcs.py,sha256=ouxISCKpZTAj3T6pWGYbASu93wytJjl5WSICvQcrgfE,7172
|
|
114
114
|
unstructured_ingest/processes/connectors/fsspec/s3.py,sha256=2ZV6b2E2pIsf_ab1Lty74FwpMnJZhpQUdamPgpwcKsQ,7141
|
|
115
115
|
unstructured_ingest/processes/connectors/fsspec/sftp.py,sha256=pR_a2SgLjt8ffNkariHrPB1E0HVSTj5h3pt7KxTU3TI,6371
|
|
@@ -220,19 +220,20 @@ unstructured_ingest/processes/connectors/zendesk/client.py,sha256=GvPIpx4aYdD58-
|
|
|
220
220
|
unstructured_ingest/processes/connectors/zendesk/zendesk.py,sha256=doS6d7ZhXBgJN8aPVf7vnQr8BciQbzX8-4yl4_hDZ7w,9253
|
|
221
221
|
unstructured_ingest/processes/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
222
222
|
unstructured_ingest/processes/utils/blob_storage.py,sha256=apMUmm9loxdbTRkkLH4VhG9kUVyiw9PFUJheSDxSxPk,1023
|
|
223
|
-
unstructured_ingest/utils/__init__.py,sha256=
|
|
223
|
+
unstructured_ingest/utils/__init__.py,sha256=URnsQu-y3Vmc7vn6GVL2sYuXxlSJ3naR3c9o6oKSm3w,157
|
|
224
224
|
unstructured_ingest/utils/chunking.py,sha256=9b3sXMA6L8RW5xAkKQbwdtVudGLAcj_sgT6Grh5tyYM,1870
|
|
225
|
-
unstructured_ingest/utils/compression.py,sha256=
|
|
225
|
+
unstructured_ingest/utils/compression.py,sha256=PPC-ys3qEAtELf6-irhp8v8M634pFFCJEvA6o7PXaLI,2712
|
|
226
226
|
unstructured_ingest/utils/constants.py,sha256=pDspTYz-nEojHBqrZNfssGEiujmVa02pIWL63PQP9sU,103
|
|
227
227
|
unstructured_ingest/utils/data_prep.py,sha256=yqrv7x_nlj0y3uaN0m0Bnsekb7VIQnwABWPa24KU5QI,7426
|
|
228
228
|
unstructured_ingest/utils/dep_check.py,sha256=SXXcUna2H0RtxA6j1S2NGkvQa9JP2DujWhmyBa7776Y,2400
|
|
229
|
+
unstructured_ingest/utils/filesystem.py,sha256=nWxpQd8ogTgmXb7ZouupX6sE5v_qFXNzPl4DtZSStwE,1036
|
|
229
230
|
unstructured_ingest/utils/html.py,sha256=78ou1vVZ0SJ3c6-Nmxg2iR5MoqubJTvwiuTNMtSFDh4,6816
|
|
230
231
|
unstructured_ingest/utils/ndjson.py,sha256=nz8VUOPEgAFdhaDOpuveknvCU4x82fVwqE01qAbElH0,1201
|
|
231
232
|
unstructured_ingest/utils/pydantic_models.py,sha256=BT_j15e4rX40wQbt8LUXbqfPhA3rJn1PHTI_G_A_EHY,1720
|
|
232
233
|
unstructured_ingest/utils/string_and_date_utils.py,sha256=oXOI6rxXq-8ncbk7EoJK0WCcTXWj75EzKl8pfQMID3U,2522
|
|
233
234
|
unstructured_ingest/utils/table.py,sha256=WZechczgVFvlodUWFcsnCGvBNh1xRm6hr0VbJTPxKAc,3669
|
|
234
|
-
unstructured_ingest-1.0.
|
|
235
|
-
unstructured_ingest-1.0.
|
|
236
|
-
unstructured_ingest-1.0.
|
|
237
|
-
unstructured_ingest-1.0.
|
|
238
|
-
unstructured_ingest-1.0.
|
|
235
|
+
unstructured_ingest-1.0.53.dist-info/METADATA,sha256=FTZfBOPcfl-pbxoBAIK-d0MwQKfN5nbJXQh92H1QwMs,8842
|
|
236
|
+
unstructured_ingest-1.0.53.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
237
|
+
unstructured_ingest-1.0.53.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
|
|
238
|
+
unstructured_ingest-1.0.53.dist-info/licenses/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
|
|
239
|
+
unstructured_ingest-1.0.53.dist-info/RECORD,,
|
|
File without changes
|
{unstructured_ingest-1.0.51.dist-info → unstructured_ingest-1.0.53.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.51.dist-info → unstructured_ingest-1.0.53.dist-info}/licenses/LICENSE.md
RENAMED
|
File without changes
|