unstructured-ingest 1.1.2__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

@@ -1 +1 @@
1
- __version__ = "1.1.2" # pragma: no cover
1
+ __version__ = "1.2.0" # pragma: no cover
@@ -32,21 +32,23 @@ class LogSpanExporter(ConsoleSpanExporter):
32
32
  self.log_out(self.formatter(span))
33
33
  return SpanExportResult.SUCCESS
34
34
 
35
+
35
36
  def get_log_out() -> Callable:
36
37
  level_names_mapping = {
37
- 'CRITICAL': logging.CRITICAL,
38
- 'FATAL': logging.FATAL,
39
- 'ERROR': logging.ERROR,
40
- 'WARN': logging.WARNING,
41
- 'WARNING': logging.WARNING,
42
- 'INFO': logging.INFO,
43
- 'DEBUG': logging.DEBUG,
44
- 'NOTSET': logging.NOTSET,
38
+ "CRITICAL": logging.CRITICAL,
39
+ "FATAL": logging.FATAL,
40
+ "ERROR": logging.ERROR,
41
+ "WARN": logging.WARNING,
42
+ "WARNING": logging.WARNING,
43
+ "INFO": logging.INFO,
44
+ "DEBUG": logging.DEBUG,
45
+ "NOTSET": logging.NOTSET,
45
46
  }
46
47
  log_level = os.getenv("OTEL_LOG_LEVEL", "DEBUG").upper()
47
48
  log_level_int = level_names_mapping.get(log_level, logging.DEBUG)
48
49
  return lambda message: logger.log(log_level_int, message)
49
50
 
51
+
50
52
  @dataclass
51
53
  class OtelHandler:
52
54
  otel_endpoint: Optional[str] = None
@@ -19,7 +19,6 @@ from unstructured_ingest.error import (
19
19
  DestinationConnectionError,
20
20
  SourceConnectionError,
21
21
  SourceConnectionNetworkError,
22
- WriteError,
23
22
  )
24
23
  from unstructured_ingest.interfaces import (
25
24
  AccessConfig,
@@ -336,6 +335,8 @@ class ElasticsearchUploadStager(UploadStager):
336
335
 
337
336
  def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
338
337
  data = element_dict.copy()
338
+ # when _op_type is not specified, it defaults to "index":
339
+ # Overwrites if exists, creates if not.
339
340
  resp = {
340
341
  "_index": self.upload_stager_config.index_name,
341
342
  "_id": get_enhanced_element_id(element_dict=data, file_data=file_data),
@@ -397,23 +398,6 @@ class ElasticsearchUploader(Uploader):
397
398
 
398
399
  return parallel_bulk
399
400
 
400
- def delete_by_record_id(self, client, file_data: FileData) -> None:
401
- logger.debug(
402
- f"deleting any content with metadata {RECORD_ID_LABEL}={file_data.identifier} "
403
- f"from {self.upload_config.index_name} index"
404
- )
405
- delete_resp = client.delete_by_query(
406
- index=self.upload_config.index_name,
407
- body={"query": {"match": {self.upload_config.record_id_key: file_data.identifier}}},
408
- )
409
- logger.info(
410
- "deleted {} records from index {}".format(
411
- delete_resp["deleted"], self.upload_config.index_name
412
- )
413
- )
414
- if failures := delete_resp.get("failures"):
415
- raise WriteError(f"failed to delete records: {failures}")
416
-
417
401
  @requires_dependencies(["elasticsearch"], extras="elasticsearch")
418
402
  def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None: # noqa: E501
419
403
  from elasticsearch.helpers.errors import BulkIndexError
@@ -429,7 +413,6 @@ class ElasticsearchUploader(Uploader):
429
413
  )
430
414
 
431
415
  with self.connection_config.get_client() as client:
432
- self.delete_by_record_id(client=client, file_data=file_data)
433
416
  if not client.indices.exists(index=self.upload_config.index_name):
434
417
  logger.warning(
435
418
  f"{(self.__class__.__name__).replace('Uploader', '')} index does not exist: "
@@ -446,6 +429,10 @@ class ElasticsearchUploader(Uploader):
446
429
  thread_count=self.upload_config.num_threads,
447
430
  )
448
431
  collections.deque(iterator, maxlen=0)
432
+ logger.info(
433
+ f"uploaded batch of {len(batch)} elements to index "
434
+ f"{self.upload_config.index_name}"
435
+ )
449
436
  except BulkIndexError as e:
450
437
  sanitized_errors = [
451
438
  self._sanitize_bulk_index_error(error) for error in e.errors
@@ -385,7 +385,7 @@ class FsspecUploader(Uploader):
385
385
 
386
386
  def __post_init__(self):
387
387
  super().__post_init__()
388
- # TODO once python3.9 no longer supported and kw_only is allowed in dataclasses, remove:
388
+ # TODO: Consider using `kw_only` instead
389
389
  if not self.upload_config:
390
390
  raise TypeError(
391
391
  f"{self.__class__.__name__}.__init__() "
@@ -11,12 +11,12 @@ from pathlib import Path
11
11
  def mkdir_concurrent_safe(path: Path) -> None:
12
12
  """
13
13
  Create directory safely in concurrent environments, handling race conditions.
14
-
14
+
15
15
  This addresses the issue where Path.mkdir(parents=True, exist_ok=True) can still
16
- raise FileExistsError when multiple processes attempt to create overlapping
16
+ raise FileExistsError when multiple processes attempt to create overlapping
17
17
  directory structures simultaneously. In this codebase, this occurs when multiple
18
18
  files are being downloaded in parallel and archive extraction is happening in parallel.
19
-
19
+
20
20
  Related: https://github.com/python/cpython/pull/112966/files
21
21
  Python core team used the same approach to fix zipfile race conditions.
22
22
  """
@@ -24,4 +24,4 @@ def mkdir_concurrent_safe(path: Path) -> None:
24
24
  path.mkdir(parents=True, exist_ok=True)
25
25
  except FileExistsError:
26
26
  if not (path.exists() and path.is_dir()):
27
- raise
27
+ raise
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: unstructured_ingest
3
- Version: 1.1.2
3
+ Version: 1.2.0
4
4
  Summary: Local ETL data pipeline to get data RAG ready
5
5
  Author-email: Unstructured Technologies <devops@unstructuredai.io>
6
6
  License-Expression: Apache-2.0
@@ -12,12 +12,11 @@ Classifier: Intended Audience :: Science/Research
12
12
  Classifier: License :: OSI Approved :: Apache Software License
13
13
  Classifier: Operating System :: OS Independent
14
14
  Classifier: Programming Language :: Python :: 3
15
- Classifier: Programming Language :: Python :: 3.9
16
15
  Classifier: Programming Language :: Python :: 3.10
17
16
  Classifier: Programming Language :: Python :: 3.11
18
17
  Classifier: Programming Language :: Python :: 3.12
19
18
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
20
- Requires-Python: <3.13,>=3.9
19
+ Requires-Python: <3.13,>=3.10
21
20
  Requires-Dist: certifi>=2025.7.14
22
21
  Requires-Dist: click
23
22
  Requires-Dist: opentelemetry-sdk
@@ -1,10 +1,10 @@
1
1
  unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
2
- unstructured_ingest/__version__.py,sha256=vjCfCC4E5xxRkzt-CNT-2N30VSGilRtJ4TLDNsaN0S8,42
2
+ unstructured_ingest/__version__.py,sha256=hYDudzvEd17dAmuuGVXnxBsg1JfgQBVtuuoFz06SeYo,42
3
3
  unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
4
4
  unstructured_ingest/errors_v2.py,sha256=9RuRCi7lbDxCguDz07y5RiHoQiFIOWwOD7xqzJ2B3Yw,436
5
5
  unstructured_ingest/logger.py,sha256=7e_7UeK6hVOd5BQ6i9NzRUAPCS_DF839Y8TjUDywraY,1428
6
6
  unstructured_ingest/main.py,sha256=82G_7eG4PNhc_xIqj4Y_sFbDV9VI-nwSfsfJQMzovMk,169
7
- unstructured_ingest/otel.py,sha256=wxnkdZqFtlypmOn4QX6uLxjGa7jSoFabP3PEG5FjH1g,4669
7
+ unstructured_ingest/otel.py,sha256=YalGjdPPuZKRoIfGU-O62-EXkTvn5BWVyu-ztRflax4,4671
8
8
  unstructured_ingest/unstructured_api.py,sha256=4e2ZNWIihk0eje4R3ZQ0NOYNbmMZDv_O-rnJo94kaGE,5127
9
9
  unstructured_ingest/cli/README.md,sha256=lfsXY2jOO__OuDYcIs8N0yLhZWzrSQ_dyXbSFtEMlQ8,1504
10
10
  unstructured_ingest/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -103,13 +103,13 @@ unstructured_ingest/processes/connectors/duckdb/base.py,sha256=bTLhilg6mgERNCpee
103
103
  unstructured_ingest/processes/connectors/duckdb/duckdb.py,sha256=jsmibTd_yvYzkCT05HhCJvplyobtjfNILC3zyTuCcVY,4464
104
104
  unstructured_ingest/processes/connectors/duckdb/motherduck.py,sha256=Atr2MjJQGFGWh5aeiQsLpUbFw-aCZH-ABI1LprDh5VI,4727
105
105
  unstructured_ingest/processes/connectors/elasticsearch/__init__.py,sha256=M8mmBWoP6J5R3hxg6BQUMexYlTUxUxdBoIcjUop8yt8,826
106
- unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py,sha256=iCC4AP5s8YYa8sMldTFcHp9sfUK1LdQTD0oqXnvklwM,19305
106
+ unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py,sha256=nh1VptiryyudZcRDiml38wikDcGIHVoXQfPKR9_7ugI,18772
107
107
  unstructured_ingest/processes/connectors/elasticsearch/opensearch.py,sha256=wggHvw8h-X0-3WPNxj9rt2xkrE7Pv7CV0B0KzTMzBB4,6944
108
108
  unstructured_ingest/processes/connectors/fsspec/__init__.py,sha256=3HTdw4L4mdN4W8UX0jQbMxBg0ZfITPbEXU7Bwdo1BfI,1843
109
109
  unstructured_ingest/processes/connectors/fsspec/azure.py,sha256=31VNiG5YnXfhrFX7QJ2O1ubeWHxbe1sYVIztefbscAQ,7148
110
110
  unstructured_ingest/processes/connectors/fsspec/box.py,sha256=1gLS7xR2vbjgKBrQ4ZpI1fKTsJuIDfXuAzx_a4FzxG4,5873
111
111
  unstructured_ingest/processes/connectors/fsspec/dropbox.py,sha256=HwwKjQmjM7yFk9Esh_F20xDisRPXGUkFduzaasByRDE,8355
112
- unstructured_ingest/processes/connectors/fsspec/fsspec.py,sha256=p0u6JL6ouEPe4R_i_rAhzlvSDyMO3-NDHiw_CtPaCTc,17875
112
+ unstructured_ingest/processes/connectors/fsspec/fsspec.py,sha256=fA9jtXnr1P4wr8VBpZ1Lx9TsZzH-FDqHoBvPUH0DnWk,17827
113
113
  unstructured_ingest/processes/connectors/fsspec/gcs.py,sha256=ouxISCKpZTAj3T6pWGYbASu93wytJjl5WSICvQcrgfE,7172
114
114
  unstructured_ingest/processes/connectors/fsspec/s3.py,sha256=P5nd3hamhLFO3l5nV3lMuIxHtb_rZYFP4F6q_py3xpc,7492
115
115
  unstructured_ingest/processes/connectors/fsspec/sftp.py,sha256=pR_a2SgLjt8ffNkariHrPB1E0HVSTj5h3pt7KxTU3TI,6371
@@ -228,15 +228,15 @@ unstructured_ingest/utils/compression.py,sha256=PPC-ys3qEAtELf6-irhp8v8M634pFFCJ
228
228
  unstructured_ingest/utils/constants.py,sha256=pDspTYz-nEojHBqrZNfssGEiujmVa02pIWL63PQP9sU,103
229
229
  unstructured_ingest/utils/data_prep.py,sha256=yqrv7x_nlj0y3uaN0m0Bnsekb7VIQnwABWPa24KU5QI,7426
230
230
  unstructured_ingest/utils/dep_check.py,sha256=SXXcUna2H0RtxA6j1S2NGkvQa9JP2DujWhmyBa7776Y,2400
231
- unstructured_ingest/utils/filesystem.py,sha256=nWxpQd8ogTgmXb7ZouupX6sE5v_qFXNzPl4DtZSStwE,1036
231
+ unstructured_ingest/utils/filesystem.py,sha256=QY-On6RNVj9IK524eYQLRkP6oTZ0PuQ8WKqnWOV_tXU,1028
232
232
  unstructured_ingest/utils/html.py,sha256=lm5lVYhVl7ztntquxzMLVQ8EmK7wkvYgNvlIuHnenoM,6865
233
233
  unstructured_ingest/utils/ndjson.py,sha256=nz8VUOPEgAFdhaDOpuveknvCU4x82fVwqE01qAbElH0,1201
234
234
  unstructured_ingest/utils/pydantic_models.py,sha256=BT_j15e4rX40wQbt8LUXbqfPhA3rJn1PHTI_G_A_EHY,1720
235
235
  unstructured_ingest/utils/string_and_date_utils.py,sha256=oXOI6rxXq-8ncbk7EoJK0WCcTXWj75EzKl8pfQMID3U,2522
236
236
  unstructured_ingest/utils/table.py,sha256=WZechczgVFvlodUWFcsnCGvBNh1xRm6hr0VbJTPxKAc,3669
237
237
  unstructured_ingest/utils/tls.py,sha256=Ra8Mii1F4VqErRreg76PBI0eAqPBC009l0sSHa8FdnA,448
238
- unstructured_ingest-1.1.2.dist-info/METADATA,sha256=BHBW_LxSYdFLrT8hyh6hI8tkQXjhjdr2R1LmxCn0rxU,8875
239
- unstructured_ingest-1.1.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
240
- unstructured_ingest-1.1.2.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
241
- unstructured_ingest-1.1.2.dist-info/licenses/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
242
- unstructured_ingest-1.1.2.dist-info/RECORD,,
238
+ unstructured_ingest-1.2.0.dist-info/METADATA,sha256=3D67Gk9trwGIVvMh0oSUDx_aJDsYCHD0qwL1VB9ZoYw,8826
239
+ unstructured_ingest-1.2.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
240
+ unstructured_ingest-1.2.0.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
241
+ unstructured_ingest-1.2.0.dist-info/licenses/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
242
+ unstructured_ingest-1.2.0.dist-info/RECORD,,