unstructured-ingest 1.0.51__py3-none-any.whl → 1.0.53__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

@@ -1 +1 @@
1
- __version__ = "1.0.51" # pragma: no cover
1
+ __version__ = "1.0.53" # pragma: no cover
@@ -30,6 +30,7 @@ from unstructured_ingest.interfaces import (
30
30
  )
31
31
  from unstructured_ingest.logger import logger
32
32
  from unstructured_ingest.processes.connectors.fsspec.utils import sterilize_dict
33
+ from unstructured_ingest.utils.filesystem import mkdir_concurrent_safe
33
34
 
34
35
  if TYPE_CHECKING:
35
36
  from fsspec import AbstractFileSystem
@@ -270,7 +271,7 @@ class FsspecDownloader(Downloader):
270
271
 
271
272
  def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
272
273
  download_path = self.get_download_path(file_data=file_data)
273
- download_path.parent.mkdir(parents=True, exist_ok=True)
274
+ mkdir_concurrent_safe(download_path.parent)
274
275
  try:
275
276
  rpath = file_data.additional_metadata["original_file_path"]
276
277
  with self.connection_config.get_client(protocol=self.protocol) as client:
@@ -282,7 +283,7 @@ class FsspecDownloader(Downloader):
282
283
 
283
284
  async def async_run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
284
285
  download_path = self.get_download_path(file_data=file_data)
285
- download_path.parent.mkdir(parents=True, exist_ok=True)
286
+ mkdir_concurrent_safe(download_path.parent)
286
287
  try:
287
288
  rpath = file_data.additional_metadata["original_file_path"]
288
289
  with self.connection_config.get_client(protocol=self.protocol) as client:
@@ -106,8 +106,11 @@ class SharepointIndexer(OnedriveIndexer):
106
106
  try:
107
107
  client_site = client.sites.get_by_url(self.connection_config.site).get().execute_query()
108
108
  site_drive_item = self.connection_config._get_drive_item(client_site)
109
- except ClientRequestException:
110
- logger.info("Site not found")
109
+ except ClientRequestException as e:
110
+ logger.error(f"Failed to access SharePoint site: {self.connection_config.site}")
111
+ raise SourceConnectionError(
112
+ f"Unable to access SharePoint site at {self.connection_config.site}: {str(e)}"
113
+ )
111
114
 
112
115
  path = self.index_config.path
113
116
  # Deprecated sharepoint sdk needed a default path. Microsoft Graph SDK does not.
@@ -0,0 +1,5 @@
1
+ """Utility functions for unstructured-ingest."""
2
+
3
+ from unstructured_ingest.utils.filesystem import mkdir_concurrent_safe
4
+
5
+ __all__ = ["mkdir_concurrent_safe"]
@@ -6,6 +6,7 @@ from pathlib import Path
6
6
  from typing import Optional
7
7
 
8
8
  from unstructured_ingest.logger import logger
9
+ from unstructured_ingest.utils.filesystem import mkdir_concurrent_safe
9
10
 
10
11
  ZIP_FILE_EXT = [".zip"]
11
12
  TAR_FILE_EXT = [".tar", ".tar.gz", ".tgz"]
@@ -17,7 +18,7 @@ def uncompress_file(filename: str, path: Optional[str] = None) -> str:
17
18
  """
18
19
  # Create path if it doesn't already exist
19
20
  if path:
20
- Path(path).mkdir(parents=True, exist_ok=True)
21
+ mkdir_concurrent_safe(Path(path))
21
22
 
22
23
  if any(filename.endswith(ext) for ext in ZIP_FILE_EXT):
23
24
  return uncompress_zip_file(zip_filename=filename, path=path)
@@ -0,0 +1,27 @@
1
+ """
2
+ Filesystem utilities for concurrent operations.
3
+
4
+ This module provides race-condition-safe filesystem operations that are needed
5
+ when multiple processes operate on the same directory structures simultaneously.
6
+ """
7
+
8
+ from pathlib import Path
9
+
10
+
11
+ def mkdir_concurrent_safe(path: Path) -> None:
12
+ """
13
+ Create directory safely in concurrent environments, handling race conditions.
14
+
15
+ This addresses the issue where Path.mkdir(parents=True, exist_ok=True) can still
16
+ raise FileExistsError when multiple processes attempt to create overlapping
17
+ directory structures simultaneously. In this codebase, this occurs when multiple
18
+ files are being downloaded in parallel and archive extraction is happening in parallel.
19
+
20
+ Related: https://github.com/python/cpython/pull/112966/files
21
+ Python core team used the same approach to fix zipfile race conditions.
22
+ """
23
+ try:
24
+ path.mkdir(parents=True, exist_ok=True)
25
+ except FileExistsError:
26
+ if not (path.exists() and path.is_dir()):
27
+ raise
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: unstructured_ingest
3
- Version: 1.0.51
3
+ Version: 1.0.53
4
4
  Summary: Local ETL data pipeline to get data RAG ready
5
5
  Author-email: Unstructured Technologies <devops@unstructuredai.io>
6
6
  License-Expression: Apache-2.0
@@ -1,5 +1,5 @@
1
1
  unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
2
- unstructured_ingest/__version__.py,sha256=jkcvVBVHKL5jaZGZh7CF4yFZfuGwbHAHHUGV-bTIVBs,43
2
+ unstructured_ingest/__version__.py,sha256=R6_J5XcJv3o4se9ZaxG3ld-O1m1WwU2CyxpqODyEhmg,43
3
3
  unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
4
4
  unstructured_ingest/errors_v2.py,sha256=9RuRCi7lbDxCguDz07y5RiHoQiFIOWwOD7xqzJ2B3Yw,436
5
5
  unstructured_ingest/logger.py,sha256=7e_7UeK6hVOd5BQ6i9NzRUAPCS_DF839Y8TjUDywraY,1428
@@ -84,7 +84,7 @@ unstructured_ingest/processes/connectors/outlook.py,sha256=6HHubZI_zttEfYp0XNd4Y
84
84
  unstructured_ingest/processes/connectors/pinecone.py,sha256=jCabAqKQyBFzaGjphxLMr57y7P0Z15Jd9Jj-JM40YnU,15090
85
85
  unstructured_ingest/processes/connectors/redisdb.py,sha256=rTihbfv0Mlk1eo5Izn-JXRu5Ad5C-KD58nSqeKsaZJ8,8024
86
86
  unstructured_ingest/processes/connectors/salesforce.py,sha256=N_UoebrhzXZNWw-X7lg8_qAziXx5L_d8XHnHWKNNYR8,11767
87
- unstructured_ingest/processes/connectors/sharepoint.py,sha256=oGBZ8czM3XwcfhYwqnkb9A9YVAGWTmCvD5vE3Q_vfUs,6303
87
+ unstructured_ingest/processes/connectors/sharepoint.py,sha256=Wgv9Pih9S9FmQJud1bg7kj_qqi55d7QZ48LqlTU_mk0,6509
88
88
  unstructured_ingest/processes/connectors/slack.py,sha256=oboIfX7ayBMK0te5Nv50iyL3FQJFXJbRxZSQaCMp3kM,9318
89
89
  unstructured_ingest/processes/connectors/utils.py,sha256=TAd0hb1f291N-q7-TUe6JKSCGkhqDyo7Ij8zmliBZUc,2071
90
90
  unstructured_ingest/processes/connectors/vectara.py,sha256=xrC6jkgW8BII4UjdzUelDu122xT484cpfMTK2wl-sko,12292
@@ -109,7 +109,7 @@ unstructured_ingest/processes/connectors/fsspec/__init__.py,sha256=3HTdw4L4mdN4W
109
109
  unstructured_ingest/processes/connectors/fsspec/azure.py,sha256=31VNiG5YnXfhrFX7QJ2O1ubeWHxbe1sYVIztefbscAQ,7148
110
110
  unstructured_ingest/processes/connectors/fsspec/box.py,sha256=1gLS7xR2vbjgKBrQ4ZpI1fKTsJuIDfXuAzx_a4FzxG4,5873
111
111
  unstructured_ingest/processes/connectors/fsspec/dropbox.py,sha256=HwwKjQmjM7yFk9Esh_F20xDisRPXGUkFduzaasByRDE,8355
112
- unstructured_ingest/processes/connectors/fsspec/fsspec.py,sha256=viPp5NABSycN1RjAOyAYcHlYsd__Xc9owtvshLXFN4U,14477
112
+ unstructured_ingest/processes/connectors/fsspec/fsspec.py,sha256=S1siX888TfHAByEXuvOqkTbcNAzx-m5UNqhKjiEKR5s,14524
113
113
  unstructured_ingest/processes/connectors/fsspec/gcs.py,sha256=ouxISCKpZTAj3T6pWGYbASu93wytJjl5WSICvQcrgfE,7172
114
114
  unstructured_ingest/processes/connectors/fsspec/s3.py,sha256=2ZV6b2E2pIsf_ab1Lty74FwpMnJZhpQUdamPgpwcKsQ,7141
115
115
  unstructured_ingest/processes/connectors/fsspec/sftp.py,sha256=pR_a2SgLjt8ffNkariHrPB1E0HVSTj5h3pt7KxTU3TI,6371
@@ -220,19 +220,20 @@ unstructured_ingest/processes/connectors/zendesk/client.py,sha256=GvPIpx4aYdD58-
220
220
  unstructured_ingest/processes/connectors/zendesk/zendesk.py,sha256=doS6d7ZhXBgJN8aPVf7vnQr8BciQbzX8-4yl4_hDZ7w,9253
221
221
  unstructured_ingest/processes/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
222
222
  unstructured_ingest/processes/utils/blob_storage.py,sha256=apMUmm9loxdbTRkkLH4VhG9kUVyiw9PFUJheSDxSxPk,1023
223
- unstructured_ingest/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
223
+ unstructured_ingest/utils/__init__.py,sha256=URnsQu-y3Vmc7vn6GVL2sYuXxlSJ3naR3c9o6oKSm3w,157
224
224
  unstructured_ingest/utils/chunking.py,sha256=9b3sXMA6L8RW5xAkKQbwdtVudGLAcj_sgT6Grh5tyYM,1870
225
- unstructured_ingest/utils/compression.py,sha256=_BkFREoa0fkJ6z-1lY76HCmy8mLymbPCg55iMUQTd5c,2653
225
+ unstructured_ingest/utils/compression.py,sha256=PPC-ys3qEAtELf6-irhp8v8M634pFFCJEvA6o7PXaLI,2712
226
226
  unstructured_ingest/utils/constants.py,sha256=pDspTYz-nEojHBqrZNfssGEiujmVa02pIWL63PQP9sU,103
227
227
  unstructured_ingest/utils/data_prep.py,sha256=yqrv7x_nlj0y3uaN0m0Bnsekb7VIQnwABWPa24KU5QI,7426
228
228
  unstructured_ingest/utils/dep_check.py,sha256=SXXcUna2H0RtxA6j1S2NGkvQa9JP2DujWhmyBa7776Y,2400
229
+ unstructured_ingest/utils/filesystem.py,sha256=nWxpQd8ogTgmXb7ZouupX6sE5v_qFXNzPl4DtZSStwE,1036
229
230
  unstructured_ingest/utils/html.py,sha256=78ou1vVZ0SJ3c6-Nmxg2iR5MoqubJTvwiuTNMtSFDh4,6816
230
231
  unstructured_ingest/utils/ndjson.py,sha256=nz8VUOPEgAFdhaDOpuveknvCU4x82fVwqE01qAbElH0,1201
231
232
  unstructured_ingest/utils/pydantic_models.py,sha256=BT_j15e4rX40wQbt8LUXbqfPhA3rJn1PHTI_G_A_EHY,1720
232
233
  unstructured_ingest/utils/string_and_date_utils.py,sha256=oXOI6rxXq-8ncbk7EoJK0WCcTXWj75EzKl8pfQMID3U,2522
233
234
  unstructured_ingest/utils/table.py,sha256=WZechczgVFvlodUWFcsnCGvBNh1xRm6hr0VbJTPxKAc,3669
234
- unstructured_ingest-1.0.51.dist-info/METADATA,sha256=-YIqizM0jt0c3B2u2Kh9kzTlQ0_FK3JDr0dkuLpbELU,8842
235
- unstructured_ingest-1.0.51.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
236
- unstructured_ingest-1.0.51.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
237
- unstructured_ingest-1.0.51.dist-info/licenses/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
238
- unstructured_ingest-1.0.51.dist-info/RECORD,,
235
+ unstructured_ingest-1.0.53.dist-info/METADATA,sha256=FTZfBOPcfl-pbxoBAIK-d0MwQKfN5nbJXQh92H1QwMs,8842
236
+ unstructured_ingest-1.0.53.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
237
+ unstructured_ingest-1.0.53.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
238
+ unstructured_ingest-1.0.53.dist-info/licenses/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
239
+ unstructured_ingest-1.0.53.dist-info/RECORD,,