unstructured-ingest 1.1.3__py3-none-any.whl → 1.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

@@ -1 +1 @@
1
- __version__ = "1.1.3" # pragma: no cover
1
+ __version__ = "1.2.1" # pragma: no cover
@@ -67,14 +67,14 @@ class BaseEmbeddingEncoder(BaseEncoder, ABC):
67
67
  elements = elements.copy()
68
68
  elements_with_text = [e for e in elements if e.get("text")]
69
69
  texts = [e["text"] for e in elements_with_text]
70
- embeddings = []
70
+ all_embeddings = []
71
71
  try:
72
72
  for batch in batch_generator(texts, batch_size=self.config.batch_size or len(texts)):
73
- embeddings = self.embed_batch(client=client, batch=batch)
74
- embeddings.extend(embeddings)
73
+ embeddings_batch = self.embed_batch(client=client, batch=batch)
74
+ all_embeddings.extend(embeddings_batch)
75
75
  except Exception as e:
76
76
  raise self.wrap_error(e=e)
77
- for element, embedding in zip(elements_with_text, embeddings):
77
+ for element, embedding in zip(elements_with_text, all_embeddings, strict=True):
78
78
  element[EMBEDDINGS_KEY] = embedding
79
79
  return elements
80
80
 
@@ -123,14 +123,14 @@ class AsyncBaseEmbeddingEncoder(BaseEncoder, ABC):
123
123
  elements = elements.copy()
124
124
  elements_with_text = [e for e in elements if e.get("text")]
125
125
  texts = [e["text"] for e in elements_with_text]
126
- embeddings = []
126
+ all_embeddings = []
127
127
  try:
128
128
  for batch in batch_generator(texts, batch_size=self.config.batch_size or len(texts)):
129
- embeddings = await self.embed_batch(client=client, batch=batch)
130
- embeddings.extend(embeddings)
129
+ embeddings_batch = await self.embed_batch(client=client, batch=batch)
130
+ all_embeddings.extend(embeddings_batch)
131
131
  except Exception as e:
132
132
  raise self.wrap_error(e=e)
133
- for element, embedding in zip(elements_with_text, embeddings):
133
+ for element, embedding in zip(elements_with_text, all_embeddings, strict=True):
134
134
  element[EMBEDDINGS_KEY] = embedding
135
135
  return elements
136
136
 
@@ -32,21 +32,23 @@ class LogSpanExporter(ConsoleSpanExporter):
32
32
  self.log_out(self.formatter(span))
33
33
  return SpanExportResult.SUCCESS
34
34
 
35
+
35
36
  def get_log_out() -> Callable:
36
37
  level_names_mapping = {
37
- 'CRITICAL': logging.CRITICAL,
38
- 'FATAL': logging.FATAL,
39
- 'ERROR': logging.ERROR,
40
- 'WARN': logging.WARNING,
41
- 'WARNING': logging.WARNING,
42
- 'INFO': logging.INFO,
43
- 'DEBUG': logging.DEBUG,
44
- 'NOTSET': logging.NOTSET,
38
+ "CRITICAL": logging.CRITICAL,
39
+ "FATAL": logging.FATAL,
40
+ "ERROR": logging.ERROR,
41
+ "WARN": logging.WARNING,
42
+ "WARNING": logging.WARNING,
43
+ "INFO": logging.INFO,
44
+ "DEBUG": logging.DEBUG,
45
+ "NOTSET": logging.NOTSET,
45
46
  }
46
47
  log_level = os.getenv("OTEL_LOG_LEVEL", "DEBUG").upper()
47
48
  log_level_int = level_names_mapping.get(log_level, logging.DEBUG)
48
49
  return lambda message: logger.log(log_level_int, message)
49
50
 
51
+
50
52
  @dataclass
51
53
  class OtelHandler:
52
54
  otel_endpoint: Optional[str] = None
@@ -335,7 +335,7 @@ class ElasticsearchUploadStager(UploadStager):
335
335
 
336
336
  def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
337
337
  data = element_dict.copy()
338
- # when _op_type is not specified, it defaults to "index":
338
+ # when _op_type is not specified, it defaults to "index":
339
339
  # Overwrites if exists, creates if not.
340
340
  resp = {
341
341
  "_index": self.upload_stager_config.index_name,
@@ -385,7 +385,7 @@ class FsspecUploader(Uploader):
385
385
 
386
386
  def __post_init__(self):
387
387
  super().__post_init__()
388
- # TODO once python3.9 no longer supported and kw_only is allowed in dataclasses, remove:
388
+ # TODO: Consider using `kw_only` instead
389
389
  if not self.upload_config:
390
390
  raise TypeError(
391
391
  f"{self.__class__.__name__}.__init__() "
@@ -11,12 +11,12 @@ from pathlib import Path
11
11
  def mkdir_concurrent_safe(path: Path) -> None:
12
12
  """
13
13
  Create directory safely in concurrent environments, handling race conditions.
14
-
14
+
15
15
  This addresses the issue where Path.mkdir(parents=True, exist_ok=True) can still
16
- raise FileExistsError when multiple processes attempt to create overlapping
16
+ raise FileExistsError when multiple processes attempt to create overlapping
17
17
  directory structures simultaneously. In this codebase, this occurs when multiple
18
18
  files are being downloaded in parallel and archive extraction is happening in parallel.
19
-
19
+
20
20
  Related: https://github.com/python/cpython/pull/112966/files
21
21
  Python core team used the same approach to fix zipfile race conditions.
22
22
  """
@@ -24,4 +24,4 @@ def mkdir_concurrent_safe(path: Path) -> None:
24
24
  path.mkdir(parents=True, exist_ok=True)
25
25
  except FileExistsError:
26
26
  if not (path.exists() and path.is_dir()):
27
- raise
27
+ raise
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: unstructured_ingest
3
- Version: 1.1.3
3
+ Version: 1.2.1
4
4
  Summary: Local ETL data pipeline to get data RAG ready
5
5
  Author-email: Unstructured Technologies <devops@unstructuredai.io>
6
6
  License-Expression: Apache-2.0
@@ -12,12 +12,11 @@ Classifier: Intended Audience :: Science/Research
12
12
  Classifier: License :: OSI Approved :: Apache Software License
13
13
  Classifier: Operating System :: OS Independent
14
14
  Classifier: Programming Language :: Python :: 3
15
- Classifier: Programming Language :: Python :: 3.9
16
15
  Classifier: Programming Language :: Python :: 3.10
17
16
  Classifier: Programming Language :: Python :: 3.11
18
17
  Classifier: Programming Language :: Python :: 3.12
19
18
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
20
- Requires-Python: <3.13,>=3.9
19
+ Requires-Python: <3.13,>=3.10
21
20
  Requires-Dist: certifi>=2025.7.14
22
21
  Requires-Dist: click
23
22
  Requires-Dist: opentelemetry-sdk
@@ -1,10 +1,10 @@
1
1
  unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
2
- unstructured_ingest/__version__.py,sha256=t-wmxuTrwUNkiwss9snVysVtVPGHyGwXuWW2QDRpdec,42
2
+ unstructured_ingest/__version__.py,sha256=r2tk3QlR-3IlKjbYsMIts7ynZhCAyO4v-dzzULrlUCM,42
3
3
  unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
4
4
  unstructured_ingest/errors_v2.py,sha256=9RuRCi7lbDxCguDz07y5RiHoQiFIOWwOD7xqzJ2B3Yw,436
5
5
  unstructured_ingest/logger.py,sha256=7e_7UeK6hVOd5BQ6i9NzRUAPCS_DF839Y8TjUDywraY,1428
6
6
  unstructured_ingest/main.py,sha256=82G_7eG4PNhc_xIqj4Y_sFbDV9VI-nwSfsfJQMzovMk,169
7
- unstructured_ingest/otel.py,sha256=wxnkdZqFtlypmOn4QX6uLxjGa7jSoFabP3PEG5FjH1g,4669
7
+ unstructured_ingest/otel.py,sha256=YalGjdPPuZKRoIfGU-O62-EXkTvn5BWVyu-ztRflax4,4671
8
8
  unstructured_ingest/unstructured_api.py,sha256=4e2ZNWIihk0eje4R3ZQ0NOYNbmMZDv_O-rnJo94kaGE,5127
9
9
  unstructured_ingest/cli/README.md,sha256=lfsXY2jOO__OuDYcIs8N0yLhZWzrSQ_dyXbSFtEMlQ8,1504
10
10
  unstructured_ingest/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -25,7 +25,7 @@ unstructured_ingest/embed/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJW
25
25
  unstructured_ingest/embed/azure_openai.py,sha256=Q_buBkAcx9FBuTsAqKbRU8vd9vDh8JoDOEth4fFxHbg,2160
26
26
  unstructured_ingest/embed/bedrock.py,sha256=dzfCsatB0i8hUp1YnXmoImoxgvUdZ4srKI6eSvn-lYM,9132
27
27
  unstructured_ingest/embed/huggingface.py,sha256=6Gx9L3xa3cv9fX4AMuLsePJQF4T_jwkKjovfqF5X1NM,2435
28
- unstructured_ingest/embed/interfaces.py,sha256=Y3PLhgWnMDmtpugE37hlAiBIbC8izrFFXXkrPVby-HY,5137
28
+ unstructured_ingest/embed/interfaces.py,sha256=VCrCSJiEfIxKB4NL4AHgKb-0vB_SEekb47zMUW6gWf0,5211
29
29
  unstructured_ingest/embed/mixedbreadai.py,sha256=uKTqzoi4M_WeYZu-qc_TSxwJONOESzxVbBLUbD1Wbns,3922
30
30
  unstructured_ingest/embed/octoai.py,sha256=yZuD7R4mEKS4Jjyae_IrNWogMPOFFS8gW5oUllj3ROU,4540
31
31
  unstructured_ingest/embed/openai.py,sha256=09I5BIrb-iGsv92LOV46-F7oZ7j1JnJIOQFARNKVq3k,5029
@@ -103,13 +103,13 @@ unstructured_ingest/processes/connectors/duckdb/base.py,sha256=bTLhilg6mgERNCpee
103
103
  unstructured_ingest/processes/connectors/duckdb/duckdb.py,sha256=jsmibTd_yvYzkCT05HhCJvplyobtjfNILC3zyTuCcVY,4464
104
104
  unstructured_ingest/processes/connectors/duckdb/motherduck.py,sha256=Atr2MjJQGFGWh5aeiQsLpUbFw-aCZH-ABI1LprDh5VI,4727
105
105
  unstructured_ingest/processes/connectors/elasticsearch/__init__.py,sha256=M8mmBWoP6J5R3hxg6BQUMexYlTUxUxdBoIcjUop8yt8,826
106
- unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py,sha256=2DZCluXR5IamwmrYmlaXTFI6g-q3y6uatuK2BDIlDj0,18773
106
+ unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py,sha256=nh1VptiryyudZcRDiml38wikDcGIHVoXQfPKR9_7ugI,18772
107
107
  unstructured_ingest/processes/connectors/elasticsearch/opensearch.py,sha256=wggHvw8h-X0-3WPNxj9rt2xkrE7Pv7CV0B0KzTMzBB4,6944
108
108
  unstructured_ingest/processes/connectors/fsspec/__init__.py,sha256=3HTdw4L4mdN4W8UX0jQbMxBg0ZfITPbEXU7Bwdo1BfI,1843
109
109
  unstructured_ingest/processes/connectors/fsspec/azure.py,sha256=31VNiG5YnXfhrFX7QJ2O1ubeWHxbe1sYVIztefbscAQ,7148
110
110
  unstructured_ingest/processes/connectors/fsspec/box.py,sha256=1gLS7xR2vbjgKBrQ4ZpI1fKTsJuIDfXuAzx_a4FzxG4,5873
111
111
  unstructured_ingest/processes/connectors/fsspec/dropbox.py,sha256=HwwKjQmjM7yFk9Esh_F20xDisRPXGUkFduzaasByRDE,8355
112
- unstructured_ingest/processes/connectors/fsspec/fsspec.py,sha256=p0u6JL6ouEPe4R_i_rAhzlvSDyMO3-NDHiw_CtPaCTc,17875
112
+ unstructured_ingest/processes/connectors/fsspec/fsspec.py,sha256=fA9jtXnr1P4wr8VBpZ1Lx9TsZzH-FDqHoBvPUH0DnWk,17827
113
113
  unstructured_ingest/processes/connectors/fsspec/gcs.py,sha256=ouxISCKpZTAj3T6pWGYbASu93wytJjl5WSICvQcrgfE,7172
114
114
  unstructured_ingest/processes/connectors/fsspec/s3.py,sha256=P5nd3hamhLFO3l5nV3lMuIxHtb_rZYFP4F6q_py3xpc,7492
115
115
  unstructured_ingest/processes/connectors/fsspec/sftp.py,sha256=pR_a2SgLjt8ffNkariHrPB1E0HVSTj5h3pt7KxTU3TI,6371
@@ -228,15 +228,15 @@ unstructured_ingest/utils/compression.py,sha256=PPC-ys3qEAtELf6-irhp8v8M634pFFCJ
228
228
  unstructured_ingest/utils/constants.py,sha256=pDspTYz-nEojHBqrZNfssGEiujmVa02pIWL63PQP9sU,103
229
229
  unstructured_ingest/utils/data_prep.py,sha256=yqrv7x_nlj0y3uaN0m0Bnsekb7VIQnwABWPa24KU5QI,7426
230
230
  unstructured_ingest/utils/dep_check.py,sha256=SXXcUna2H0RtxA6j1S2NGkvQa9JP2DujWhmyBa7776Y,2400
231
- unstructured_ingest/utils/filesystem.py,sha256=nWxpQd8ogTgmXb7ZouupX6sE5v_qFXNzPl4DtZSStwE,1036
231
+ unstructured_ingest/utils/filesystem.py,sha256=QY-On6RNVj9IK524eYQLRkP6oTZ0PuQ8WKqnWOV_tXU,1028
232
232
  unstructured_ingest/utils/html.py,sha256=lm5lVYhVl7ztntquxzMLVQ8EmK7wkvYgNvlIuHnenoM,6865
233
233
  unstructured_ingest/utils/ndjson.py,sha256=nz8VUOPEgAFdhaDOpuveknvCU4x82fVwqE01qAbElH0,1201
234
234
  unstructured_ingest/utils/pydantic_models.py,sha256=BT_j15e4rX40wQbt8LUXbqfPhA3rJn1PHTI_G_A_EHY,1720
235
235
  unstructured_ingest/utils/string_and_date_utils.py,sha256=oXOI6rxXq-8ncbk7EoJK0WCcTXWj75EzKl8pfQMID3U,2522
236
236
  unstructured_ingest/utils/table.py,sha256=WZechczgVFvlodUWFcsnCGvBNh1xRm6hr0VbJTPxKAc,3669
237
237
  unstructured_ingest/utils/tls.py,sha256=Ra8Mii1F4VqErRreg76PBI0eAqPBC009l0sSHa8FdnA,448
238
- unstructured_ingest-1.1.3.dist-info/METADATA,sha256=Ztyvq_GpPKsmIMN5cCvXL0ppQdO4HIXtugh8rXuNqGY,8875
239
- unstructured_ingest-1.1.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
240
- unstructured_ingest-1.1.3.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
241
- unstructured_ingest-1.1.3.dist-info/licenses/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
242
- unstructured_ingest-1.1.3.dist-info/RECORD,,
238
+ unstructured_ingest-1.2.1.dist-info/METADATA,sha256=dABn7DHmV7FLVs7oG3G_ltcDE5OSERhhZdGfJAoDtN4,8826
239
+ unstructured_ingest-1.2.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
240
+ unstructured_ingest-1.2.1.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
241
+ unstructured_ingest-1.2.1.dist-info/licenses/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
242
+ unstructured_ingest-1.2.1.dist-info/RECORD,,