unstructured-ingest 1.0.44__py3-none-any.whl → 1.0.46__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

@@ -1 +1 @@
1
- __version__ = "1.0.44" # pragma: no cover
1
+ __version__ = "1.0.46" # pragma: no cover
@@ -1,7 +1,7 @@
1
1
  import logging
2
2
  import traceback
3
3
  from dataclasses import dataclass, field
4
- from multiprocessing import Process, Queue, current_process
4
+ from multiprocessing import Queue, current_process
5
5
  from pathlib import Path
6
6
  from typing import TYPE_CHECKING, Any, Optional
7
7
  from urllib.parse import urlparse
@@ -186,10 +186,25 @@ class DeltaTableUploader(Uploader):
186
186
  )
187
187
 
188
188
  def _is_commit_conflict(exc: BaseException) -> bool: # noqa: ANN401
189
- """Return True if exception looks like a Delta Lake commit conflict."""
190
-
191
- return isinstance(exc, RuntimeError) and (
192
- "CommitFailed" in str(exc) or "Metadata changed" in str(exc)
189
+ """Return True if exception looks like a Delta Lake commit conflict.
190
+
191
+ Besides the canonical *CommitFailed* / *Metadata changed* errors that
192
+ deltalake surfaces when two writers clash, we occasionally hit
193
+ messages such as *Delta transaction failed, version 0 already
194
+ exists* while multiple processes race to create the very first log
195
+ entry. These situations are equally safe to retry, so detect them
196
+ too.
197
+ """
198
+
199
+ return isinstance(exc, RuntimeError) and any(
200
+ marker in str(exc)
201
+ for marker in (
202
+ "CommitFailed",
203
+ "Metadata changed",
204
+ "version 0 already exists",
205
+ "version already exists",
206
+ "Delta transaction failed",
207
+ )
193
208
  )
194
209
 
195
210
  @retry(
@@ -206,25 +221,39 @@ class DeltaTableUploader(Uploader):
206
221
  # cause ingest to fail, even though all tasks are completed normally. Putting the writer
207
222
  # into a process mitigates this issue by ensuring python interpreter waits properly for
208
223
  # deltalake's rust backend to finish
209
- queue: Queue[str] = Queue()
224
+ # Use a multiprocessing context that relies on 'spawn' to avoid inheriting the
225
+ # parent process' Tokio runtime, which leads to `pyo3_runtime.PanicException`.
226
+ from multiprocessing import get_context
227
+
228
+ ctx = get_context("spawn")
229
+ queue: "Queue[str]" = ctx.Queue()
210
230
 
211
231
  if current_process().daemon:
212
232
  # write_deltalake_with_error_handling will push any traceback to our queue
213
233
  write_deltalake_with_error_handling(queue=queue, **writer_kwargs)
214
234
  else:
215
- # On non-daemon processes we still guard against SIGABRT by running in a subprocess.
216
- writer = Process(
235
+ # On non-daemon processes we still guard against SIGABRT by running in a
236
+ # dedicated subprocess created via the 'spawn' method.
237
+ writer = ctx.Process(
217
238
  target=write_deltalake_with_error_handling,
218
239
  kwargs={"queue": queue, **writer_kwargs},
219
240
  )
220
241
  writer.start()
221
242
  writer.join()
222
243
 
223
- # Check if the queue has any exception message
224
- if not queue.empty():
225
- error_message = queue.get()
226
- logger.error("Exception occurred in write_deltalake: %s", error_message)
227
- raise RuntimeError(f"Error in write_deltalake: {error_message}")
244
+ # First surface any traceback captured inside the subprocess so users see the real
245
+ # root-cause instead of a generic non-zero exit code.
246
+ if not queue.empty():
247
+ error_message = queue.get()
248
+ logger.error("Exception occurred in write_deltalake: %s", error_message)
249
+ raise RuntimeError(f"Error in write_deltalake: {error_message}")
250
+
251
+ # If the subprocess terminated abnormally but produced no traceback (e.g., SIGABRT),
252
+ # still raise a helpful error for callers.
253
+ if not current_process().daemon and writer.exitcode != 0:
254
+ raise RuntimeError(
255
+ f"write_deltalake subprocess exited with code {writer.exitcode}"
256
+ )
228
257
 
229
258
  _single_attempt()
230
259
 
@@ -274,7 +274,7 @@ class FsspecDownloader(Downloader):
274
274
  try:
275
275
  rpath = file_data.additional_metadata["original_file_path"]
276
276
  with self.connection_config.get_client(protocol=self.protocol) as client:
277
- client.get(rpath=rpath, lpath=download_path.as_posix())
277
+ client.get_file(rpath=rpath, lpath=download_path.as_posix())
278
278
  self.handle_directory_download(lpath=download_path)
279
279
  except Exception as e:
280
280
  raise self.wrap_error(e=e)
@@ -286,7 +286,7 @@ class FsspecDownloader(Downloader):
286
286
  try:
287
287
  rpath = file_data.additional_metadata["original_file_path"]
288
288
  with self.connection_config.get_client(protocol=self.protocol) as client:
289
- await client.get(rpath=rpath, lpath=download_path.as_posix())
289
+ await client.get_file(rpath=rpath, lpath=download_path.as_posix())
290
290
  self.handle_directory_download(lpath=download_path)
291
291
  except Exception as e:
292
292
  raise self.wrap_error(e=e)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: unstructured_ingest
3
- Version: 1.0.44
3
+ Version: 1.0.46
4
4
  Summary: Local ETL data pipeline to get data RAG ready
5
5
  Author-email: Unstructured Technologies <devops@unstructuredai.io>
6
6
  License-Expression: Apache-2.0
@@ -60,6 +60,7 @@ Provides-Extra: delta-table
60
60
  Requires-Dist: boto3; extra == 'delta-table'
61
61
  Requires-Dist: deltalake; extra == 'delta-table'
62
62
  Requires-Dist: pandas; extra == 'delta-table'
63
+ Requires-Dist: pyarrow; extra == 'delta-table'
63
64
  Requires-Dist: tenacity; extra == 'delta-table'
64
65
  Provides-Extra: discord
65
66
  Requires-Dist: discord-py; extra == 'discord'
@@ -1,5 +1,5 @@
1
1
  unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
2
- unstructured_ingest/__version__.py,sha256=12SSwrWI8zU57pbaRSeJH9dGmuvWZXi056-PfBAhJTw,43
2
+ unstructured_ingest/__version__.py,sha256=gy8wDmwG1gXjyS0ueUlncvuS37NRLiR1Derrpt71Y7g,43
3
3
  unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
4
4
  unstructured_ingest/errors_v2.py,sha256=9RuRCi7lbDxCguDz07y5RiHoQiFIOWwOD7xqzJ2B3Yw,436
5
5
  unstructured_ingest/logger.py,sha256=7e_7UeK6hVOd5BQ6i9NzRUAPCS_DF839Y8TjUDywraY,1428
@@ -68,7 +68,7 @@ unstructured_ingest/processes/connectors/azure_ai_search.py,sha256=szhSRXzUHk0DE
68
68
  unstructured_ingest/processes/connectors/chroma.py,sha256=q5_Fu4xb6_W_NyrPxVa3-jVwZLqVdlBNlR4dFvbd7l0,7235
69
69
  unstructured_ingest/processes/connectors/confluence.py,sha256=aA2B_FPdAjlVAJtmMldYu6lld2sR-6JL5tWh7yItiwg,22828
70
70
  unstructured_ingest/processes/connectors/couchbase.py,sha256=KCHoYDNya9B05NIB5D78zXoizFyfpJRepcYBe1nLSOs,12298
71
- unstructured_ingest/processes/connectors/delta_table.py,sha256=Y3yJPfwTyDdv7dqn54ZLZ4DBjg9OF2rXuUaNfbPCkvc,9993
71
+ unstructured_ingest/processes/connectors/delta_table.py,sha256=JrpiX9V-YD1VhExKi6KFwlYatCheSs3t-xB3Td1BVFk,11487
72
72
  unstructured_ingest/processes/connectors/discord.py,sha256=CD-SBECMdr3pnmqbPvBMyPU2cBroXUhyW6F7L3laP6A,5348
73
73
  unstructured_ingest/processes/connectors/github.py,sha256=smHCz6jOH1p_hW2S25bYunBBj_pYjz8HTw6wkzaJz_A,7765
74
74
  unstructured_ingest/processes/connectors/gitlab.py,sha256=Fdq6_lk-By1JDmLGVjoKJkaHESiKTZsbvoHhMsljlE0,10114
@@ -109,7 +109,7 @@ unstructured_ingest/processes/connectors/fsspec/__init__.py,sha256=3HTdw4L4mdN4W
109
109
  unstructured_ingest/processes/connectors/fsspec/azure.py,sha256=31VNiG5YnXfhrFX7QJ2O1ubeWHxbe1sYVIztefbscAQ,7148
110
110
  unstructured_ingest/processes/connectors/fsspec/box.py,sha256=1gLS7xR2vbjgKBrQ4ZpI1fKTsJuIDfXuAzx_a4FzxG4,5873
111
111
  unstructured_ingest/processes/connectors/fsspec/dropbox.py,sha256=HwwKjQmjM7yFk9Esh_F20xDisRPXGUkFduzaasByRDE,8355
112
- unstructured_ingest/processes/connectors/fsspec/fsspec.py,sha256=NbId5WMq6M5kF3fYAwSUuaL2e_gutgmTATrE_X8okGY,14467
112
+ unstructured_ingest/processes/connectors/fsspec/fsspec.py,sha256=viPp5NABSycN1RjAOyAYcHlYsd__Xc9owtvshLXFN4U,14477
113
113
  unstructured_ingest/processes/connectors/fsspec/gcs.py,sha256=ouxISCKpZTAj3T6pWGYbASu93wytJjl5WSICvQcrgfE,7172
114
114
  unstructured_ingest/processes/connectors/fsspec/s3.py,sha256=2ZV6b2E2pIsf_ab1Lty74FwpMnJZhpQUdamPgpwcKsQ,7141
115
115
  unstructured_ingest/processes/connectors/fsspec/sftp.py,sha256=pR_a2SgLjt8ffNkariHrPB1E0HVSTj5h3pt7KxTU3TI,6371
@@ -231,8 +231,8 @@ unstructured_ingest/utils/ndjson.py,sha256=nz8VUOPEgAFdhaDOpuveknvCU4x82fVwqE01q
231
231
  unstructured_ingest/utils/pydantic_models.py,sha256=BT_j15e4rX40wQbt8LUXbqfPhA3rJn1PHTI_G_A_EHY,1720
232
232
  unstructured_ingest/utils/string_and_date_utils.py,sha256=oXOI6rxXq-8ncbk7EoJK0WCcTXWj75EzKl8pfQMID3U,2522
233
233
  unstructured_ingest/utils/table.py,sha256=WZechczgVFvlodUWFcsnCGvBNh1xRm6hr0VbJTPxKAc,3669
234
- unstructured_ingest-1.0.44.dist-info/METADATA,sha256=PR_LHUUQP-2oayEmsoTGblqWKPmJt46QtijI7y-zni0,8795
235
- unstructured_ingest-1.0.44.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
236
- unstructured_ingest-1.0.44.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
237
- unstructured_ingest-1.0.44.dist-info/licenses/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
238
- unstructured_ingest-1.0.44.dist-info/RECORD,,
234
+ unstructured_ingest-1.0.46.dist-info/METADATA,sha256=EHmSqyvPZRzxeO6_UvkHyVqPilKlOmbqHeAVJejCiLE,8842
235
+ unstructured_ingest-1.0.46.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
236
+ unstructured_ingest-1.0.46.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
237
+ unstructured_ingest-1.0.46.dist-info/licenses/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
238
+ unstructured_ingest-1.0.46.dist-info/RECORD,,