unstructured-ingest 1.1.2__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/otel.py +10 -8
- unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +6 -19
- unstructured_ingest/processes/connectors/fsspec/fsspec.py +1 -1
- unstructured_ingest/utils/filesystem.py +4 -4
- {unstructured_ingest-1.1.2.dist-info → unstructured_ingest-1.2.0.dist-info}/METADATA +2 -3
- {unstructured_ingest-1.1.2.dist-info → unstructured_ingest-1.2.0.dist-info}/RECORD +10 -10
- {unstructured_ingest-1.1.2.dist-info → unstructured_ingest-1.2.0.dist-info}/WHEEL +0 -0
- {unstructured_ingest-1.1.2.dist-info → unstructured_ingest-1.2.0.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-1.1.2.dist-info → unstructured_ingest-1.2.0.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.1.2" # pragma: no cover
|
|
1
|
+
__version__ = "1.2.0" # pragma: no cover
|
unstructured_ingest/otel.py
CHANGED
|
@@ -32,21 +32,23 @@ class LogSpanExporter(ConsoleSpanExporter):
|
|
|
32
32
|
self.log_out(self.formatter(span))
|
|
33
33
|
return SpanExportResult.SUCCESS
|
|
34
34
|
|
|
35
|
+
|
|
35
36
|
def get_log_out() -> Callable:
|
|
36
37
|
level_names_mapping = {
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
38
|
+
"CRITICAL": logging.CRITICAL,
|
|
39
|
+
"FATAL": logging.FATAL,
|
|
40
|
+
"ERROR": logging.ERROR,
|
|
41
|
+
"WARN": logging.WARNING,
|
|
42
|
+
"WARNING": logging.WARNING,
|
|
43
|
+
"INFO": logging.INFO,
|
|
44
|
+
"DEBUG": logging.DEBUG,
|
|
45
|
+
"NOTSET": logging.NOTSET,
|
|
45
46
|
}
|
|
46
47
|
log_level = os.getenv("OTEL_LOG_LEVEL", "DEBUG").upper()
|
|
47
48
|
log_level_int = level_names_mapping.get(log_level, logging.DEBUG)
|
|
48
49
|
return lambda message: logger.log(log_level_int, message)
|
|
49
50
|
|
|
51
|
+
|
|
50
52
|
@dataclass
|
|
51
53
|
class OtelHandler:
|
|
52
54
|
otel_endpoint: Optional[str] = None
|
|
@@ -19,7 +19,6 @@ from unstructured_ingest.error import (
|
|
|
19
19
|
DestinationConnectionError,
|
|
20
20
|
SourceConnectionError,
|
|
21
21
|
SourceConnectionNetworkError,
|
|
22
|
-
WriteError,
|
|
23
22
|
)
|
|
24
23
|
from unstructured_ingest.interfaces import (
|
|
25
24
|
AccessConfig,
|
|
@@ -336,6 +335,8 @@ class ElasticsearchUploadStager(UploadStager):
|
|
|
336
335
|
|
|
337
336
|
def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
|
|
338
337
|
data = element_dict.copy()
|
|
338
|
+
# when _op_type is not specified, it defaults to "index":
|
|
339
|
+
# Overwrites if exists, creates if not.
|
|
339
340
|
resp = {
|
|
340
341
|
"_index": self.upload_stager_config.index_name,
|
|
341
342
|
"_id": get_enhanced_element_id(element_dict=data, file_data=file_data),
|
|
@@ -397,23 +398,6 @@ class ElasticsearchUploader(Uploader):
|
|
|
397
398
|
|
|
398
399
|
return parallel_bulk
|
|
399
400
|
|
|
400
|
-
def delete_by_record_id(self, client, file_data: FileData) -> None:
|
|
401
|
-
logger.debug(
|
|
402
|
-
f"deleting any content with metadata {RECORD_ID_LABEL}={file_data.identifier} "
|
|
403
|
-
f"from {self.upload_config.index_name} index"
|
|
404
|
-
)
|
|
405
|
-
delete_resp = client.delete_by_query(
|
|
406
|
-
index=self.upload_config.index_name,
|
|
407
|
-
body={"query": {"match": {self.upload_config.record_id_key: file_data.identifier}}},
|
|
408
|
-
)
|
|
409
|
-
logger.info(
|
|
410
|
-
"deleted {} records from index {}".format(
|
|
411
|
-
delete_resp["deleted"], self.upload_config.index_name
|
|
412
|
-
)
|
|
413
|
-
)
|
|
414
|
-
if failures := delete_resp.get("failures"):
|
|
415
|
-
raise WriteError(f"failed to delete records: {failures}")
|
|
416
|
-
|
|
417
401
|
@requires_dependencies(["elasticsearch"], extras="elasticsearch")
|
|
418
402
|
def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None: # noqa: E501
|
|
419
403
|
from elasticsearch.helpers.errors import BulkIndexError
|
|
@@ -429,7 +413,6 @@ class ElasticsearchUploader(Uploader):
|
|
|
429
413
|
)
|
|
430
414
|
|
|
431
415
|
with self.connection_config.get_client() as client:
|
|
432
|
-
self.delete_by_record_id(client=client, file_data=file_data)
|
|
433
416
|
if not client.indices.exists(index=self.upload_config.index_name):
|
|
434
417
|
logger.warning(
|
|
435
418
|
f"{(self.__class__.__name__).replace('Uploader', '')} index does not exist: "
|
|
@@ -446,6 +429,10 @@ class ElasticsearchUploader(Uploader):
|
|
|
446
429
|
thread_count=self.upload_config.num_threads,
|
|
447
430
|
)
|
|
448
431
|
collections.deque(iterator, maxlen=0)
|
|
432
|
+
logger.info(
|
|
433
|
+
f"uploaded batch of {len(batch)} elements to index "
|
|
434
|
+
f"{self.upload_config.index_name}"
|
|
435
|
+
)
|
|
449
436
|
except BulkIndexError as e:
|
|
450
437
|
sanitized_errors = [
|
|
451
438
|
self._sanitize_bulk_index_error(error) for error in e.errors
|
|
@@ -385,7 +385,7 @@ class FsspecUploader(Uploader):
|
|
|
385
385
|
|
|
386
386
|
def __post_init__(self):
|
|
387
387
|
super().__post_init__()
|
|
388
|
-
# TODO
|
|
388
|
+
# TODO: Consider using `kw_only` instead
|
|
389
389
|
if not self.upload_config:
|
|
390
390
|
raise TypeError(
|
|
391
391
|
f"{self.__class__.__name__}.__init__() "
|
|
@@ -11,12 +11,12 @@ from pathlib import Path
|
|
|
11
11
|
def mkdir_concurrent_safe(path: Path) -> None:
|
|
12
12
|
"""
|
|
13
13
|
Create directory safely in concurrent environments, handling race conditions.
|
|
14
|
-
|
|
14
|
+
|
|
15
15
|
This addresses the issue where Path.mkdir(parents=True, exist_ok=True) can still
|
|
16
|
-
raise FileExistsError when multiple processes attempt to create overlapping
|
|
16
|
+
raise FileExistsError when multiple processes attempt to create overlapping
|
|
17
17
|
directory structures simultaneously. In this codebase, this occurs when multiple
|
|
18
18
|
files are being downloaded in parallel and archive extraction is happening in parallel.
|
|
19
|
-
|
|
19
|
+
|
|
20
20
|
Related: https://github.com/python/cpython/pull/112966/files
|
|
21
21
|
Python core team used the same approach to fix zipfile race conditions.
|
|
22
22
|
"""
|
|
@@ -24,4 +24,4 @@ def mkdir_concurrent_safe(path: Path) -> None:
|
|
|
24
24
|
path.mkdir(parents=True, exist_ok=True)
|
|
25
25
|
except FileExistsError:
|
|
26
26
|
if not (path.exists() and path.is_dir()):
|
|
27
|
-
raise
|
|
27
|
+
raise
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: unstructured_ingest
|
|
3
|
-
Version: 1.1.2
|
|
3
|
+
Version: 1.2.0
|
|
4
4
|
Summary: Local ETL data pipeline to get data RAG ready
|
|
5
5
|
Author-email: Unstructured Technologies <devops@unstructuredai.io>
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -12,12 +12,11 @@ Classifier: Intended Audience :: Science/Research
|
|
|
12
12
|
Classifier: License :: OSI Approved :: Apache Software License
|
|
13
13
|
Classifier: Operating System :: OS Independent
|
|
14
14
|
Classifier: Programming Language :: Python :: 3
|
|
15
|
-
Classifier: Programming Language :: Python :: 3.9
|
|
16
15
|
Classifier: Programming Language :: Python :: 3.10
|
|
17
16
|
Classifier: Programming Language :: Python :: 3.11
|
|
18
17
|
Classifier: Programming Language :: Python :: 3.12
|
|
19
18
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
20
|
-
Requires-Python: <3.13,>=3.9
|
|
19
|
+
Requires-Python: <3.13,>=3.10
|
|
21
20
|
Requires-Dist: certifi>=2025.7.14
|
|
22
21
|
Requires-Dist: click
|
|
23
22
|
Requires-Dist: opentelemetry-sdk
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
|
|
2
|
-
unstructured_ingest/__version__.py,sha256=
|
|
2
|
+
unstructured_ingest/__version__.py,sha256=hYDudzvEd17dAmuuGVXnxBsg1JfgQBVtuuoFz06SeYo,42
|
|
3
3
|
unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
|
|
4
4
|
unstructured_ingest/errors_v2.py,sha256=9RuRCi7lbDxCguDz07y5RiHoQiFIOWwOD7xqzJ2B3Yw,436
|
|
5
5
|
unstructured_ingest/logger.py,sha256=7e_7UeK6hVOd5BQ6i9NzRUAPCS_DF839Y8TjUDywraY,1428
|
|
6
6
|
unstructured_ingest/main.py,sha256=82G_7eG4PNhc_xIqj4Y_sFbDV9VI-nwSfsfJQMzovMk,169
|
|
7
|
-
unstructured_ingest/otel.py,sha256=
|
|
7
|
+
unstructured_ingest/otel.py,sha256=YalGjdPPuZKRoIfGU-O62-EXkTvn5BWVyu-ztRflax4,4671
|
|
8
8
|
unstructured_ingest/unstructured_api.py,sha256=4e2ZNWIihk0eje4R3ZQ0NOYNbmMZDv_O-rnJo94kaGE,5127
|
|
9
9
|
unstructured_ingest/cli/README.md,sha256=lfsXY2jOO__OuDYcIs8N0yLhZWzrSQ_dyXbSFtEMlQ8,1504
|
|
10
10
|
unstructured_ingest/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -103,13 +103,13 @@ unstructured_ingest/processes/connectors/duckdb/base.py,sha256=bTLhilg6mgERNCpee
|
|
|
103
103
|
unstructured_ingest/processes/connectors/duckdb/duckdb.py,sha256=jsmibTd_yvYzkCT05HhCJvplyobtjfNILC3zyTuCcVY,4464
|
|
104
104
|
unstructured_ingest/processes/connectors/duckdb/motherduck.py,sha256=Atr2MjJQGFGWh5aeiQsLpUbFw-aCZH-ABI1LprDh5VI,4727
|
|
105
105
|
unstructured_ingest/processes/connectors/elasticsearch/__init__.py,sha256=M8mmBWoP6J5R3hxg6BQUMexYlTUxUxdBoIcjUop8yt8,826
|
|
106
|
-
unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py,sha256=
|
|
106
|
+
unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py,sha256=nh1VptiryyudZcRDiml38wikDcGIHVoXQfPKR9_7ugI,18772
|
|
107
107
|
unstructured_ingest/processes/connectors/elasticsearch/opensearch.py,sha256=wggHvw8h-X0-3WPNxj9rt2xkrE7Pv7CV0B0KzTMzBB4,6944
|
|
108
108
|
unstructured_ingest/processes/connectors/fsspec/__init__.py,sha256=3HTdw4L4mdN4W8UX0jQbMxBg0ZfITPbEXU7Bwdo1BfI,1843
|
|
109
109
|
unstructured_ingest/processes/connectors/fsspec/azure.py,sha256=31VNiG5YnXfhrFX7QJ2O1ubeWHxbe1sYVIztefbscAQ,7148
|
|
110
110
|
unstructured_ingest/processes/connectors/fsspec/box.py,sha256=1gLS7xR2vbjgKBrQ4ZpI1fKTsJuIDfXuAzx_a4FzxG4,5873
|
|
111
111
|
unstructured_ingest/processes/connectors/fsspec/dropbox.py,sha256=HwwKjQmjM7yFk9Esh_F20xDisRPXGUkFduzaasByRDE,8355
|
|
112
|
-
unstructured_ingest/processes/connectors/fsspec/fsspec.py,sha256=
|
|
112
|
+
unstructured_ingest/processes/connectors/fsspec/fsspec.py,sha256=fA9jtXnr1P4wr8VBpZ1Lx9TsZzH-FDqHoBvPUH0DnWk,17827
|
|
113
113
|
unstructured_ingest/processes/connectors/fsspec/gcs.py,sha256=ouxISCKpZTAj3T6pWGYbASu93wytJjl5WSICvQcrgfE,7172
|
|
114
114
|
unstructured_ingest/processes/connectors/fsspec/s3.py,sha256=P5nd3hamhLFO3l5nV3lMuIxHtb_rZYFP4F6q_py3xpc,7492
|
|
115
115
|
unstructured_ingest/processes/connectors/fsspec/sftp.py,sha256=pR_a2SgLjt8ffNkariHrPB1E0HVSTj5h3pt7KxTU3TI,6371
|
|
@@ -228,15 +228,15 @@ unstructured_ingest/utils/compression.py,sha256=PPC-ys3qEAtELf6-irhp8v8M634pFFCJ
|
|
|
228
228
|
unstructured_ingest/utils/constants.py,sha256=pDspTYz-nEojHBqrZNfssGEiujmVa02pIWL63PQP9sU,103
|
|
229
229
|
unstructured_ingest/utils/data_prep.py,sha256=yqrv7x_nlj0y3uaN0m0Bnsekb7VIQnwABWPa24KU5QI,7426
|
|
230
230
|
unstructured_ingest/utils/dep_check.py,sha256=SXXcUna2H0RtxA6j1S2NGkvQa9JP2DujWhmyBa7776Y,2400
|
|
231
|
-
unstructured_ingest/utils/filesystem.py,sha256=
|
|
231
|
+
unstructured_ingest/utils/filesystem.py,sha256=QY-On6RNVj9IK524eYQLRkP6oTZ0PuQ8WKqnWOV_tXU,1028
|
|
232
232
|
unstructured_ingest/utils/html.py,sha256=lm5lVYhVl7ztntquxzMLVQ8EmK7wkvYgNvlIuHnenoM,6865
|
|
233
233
|
unstructured_ingest/utils/ndjson.py,sha256=nz8VUOPEgAFdhaDOpuveknvCU4x82fVwqE01qAbElH0,1201
|
|
234
234
|
unstructured_ingest/utils/pydantic_models.py,sha256=BT_j15e4rX40wQbt8LUXbqfPhA3rJn1PHTI_G_A_EHY,1720
|
|
235
235
|
unstructured_ingest/utils/string_and_date_utils.py,sha256=oXOI6rxXq-8ncbk7EoJK0WCcTXWj75EzKl8pfQMID3U,2522
|
|
236
236
|
unstructured_ingest/utils/table.py,sha256=WZechczgVFvlodUWFcsnCGvBNh1xRm6hr0VbJTPxKAc,3669
|
|
237
237
|
unstructured_ingest/utils/tls.py,sha256=Ra8Mii1F4VqErRreg76PBI0eAqPBC009l0sSHa8FdnA,448
|
|
238
|
-
unstructured_ingest-1.
|
|
239
|
-
unstructured_ingest-1.
|
|
240
|
-
unstructured_ingest-1.
|
|
241
|
-
unstructured_ingest-1.
|
|
242
|
-
unstructured_ingest-1.
|
|
238
|
+
unstructured_ingest-1.2.0.dist-info/METADATA,sha256=3D67Gk9trwGIVvMh0oSUDx_aJDsYCHD0qwL1VB9ZoYw,8826
|
|
239
|
+
unstructured_ingest-1.2.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
240
|
+
unstructured_ingest-1.2.0.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
|
|
241
|
+
unstructured_ingest-1.2.0.dist-info/licenses/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
|
|
242
|
+
unstructured_ingest-1.2.0.dist-info/RECORD,,
|
|
File without changes
|
{unstructured_ingest-1.1.2.dist-info → unstructured_ingest-1.2.0.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
{unstructured_ingest-1.1.2.dist-info → unstructured_ingest-1.2.0.dist-info}/licenses/LICENSE.md
RENAMED
|
File without changes
|