unstructured-ingest 0.0.18__py3-none-any.whl → 0.0.21__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/cmds/astradb.py +2 -2
- unstructured_ingest/connector/astradb.py +54 -24
- unstructured_ingest/v2/processes/chunker.py +8 -29
- unstructured_ingest/v2/processes/connectors/astradb.py +26 -19
- unstructured_ingest/v2/processes/connectors/chroma.py +2 -5
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +12 -14
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +1 -6
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +1 -6
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +2 -5
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +1 -6
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +1 -6
- unstructured_ingest/v2/processes/connectors/kdbai.py +2 -5
- unstructured_ingest/v2/processes/connectors/local.py +2 -5
- unstructured_ingest/v2/processes/connectors/milvus.py +2 -5
- unstructured_ingest/v2/processes/connectors/mongodb.py +2 -5
- unstructured_ingest/v2/processes/connectors/pinecone.py +2 -5
- unstructured_ingest/v2/processes/connectors/sql.py +1 -6
- unstructured_ingest/v2/processes/connectors/weaviate.py +2 -5
- unstructured_ingest/v2/processes/partitioner.py +9 -55
- unstructured_ingest/v2/unstructured_api.py +87 -0
- {unstructured_ingest-0.0.18.dist-info → unstructured_ingest-0.0.21.dist-info}/METADATA +369 -369
- {unstructured_ingest-0.0.18.dist-info → unstructured_ingest-0.0.21.dist-info}/RECORD +27 -26
- {unstructured_ingest-0.0.18.dist-info → unstructured_ingest-0.0.21.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.0.18.dist-info → unstructured_ingest-0.0.21.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.0.18.dist-info → unstructured_ingest-0.0.21.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.18.dist-info → unstructured_ingest-0.0.21.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
from dataclasses import fields
|
|
3
|
+
from functools import partial
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import TYPE_CHECKING, Optional
|
|
6
|
+
|
|
7
|
+
from unstructured_ingest.v2.logger import logger
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from unstructured_client.models.operations import PartitionRequest
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def create_partition_request(filename: Path, parameters_dict: dict) -> "PartitionRequest":
    """Build a PartitionRequest for unstructured-client from a file and API params.

    Any entry of ``parameters_dict`` whose key is not a field of the installed
    SDK's ``PartitionParameters`` is dropped, and the dropped keys are reported
    at debug level. The file is read fully into memory and attached as ``files``.

    Args:
        filename: Path to the file being partitioned
        parameters_dict: A mapping of all API params we want to send

    Returns: A PartitionRequest containing the file and all valid params
    """
    from unstructured_client.models.operations import PartitionRequest
    from unstructured_client.models.shared import Files, PartitionParameters

    # NOTE(austin): PartitionParameters is a Pydantic model in v0.26.0
    # Prior to this it was a dataclass which doesn't have .__fields__
    # Probe `model_fields` first: on Pydantic v2 `__fields__` is a deprecated
    # alias that emits a deprecation warning each time it is accessed.
    possible_fields = getattr(PartitionParameters, "model_fields", None)
    if possible_fields is None:
        try:
            possible_fields = PartitionParameters.__fields__
        except AttributeError:
            possible_fields = [f.name for f in fields(PartitionParameters)]
    # Iterating a field mapping yields its keys, so this works for both shapes.
    supported_names = set(possible_fields)

    filtered_partition_request = {
        k: v for k, v in parameters_dict.items() if k in supported_names
    }
    if len(filtered_partition_request) != len(parameters_dict):
        omitted = [k for k in parameters_dict if k not in filtered_partition_request]
        logger.debug(
            "Following fields were omitted due to not being "
            "supported by the currently used unstructured client: {}".format(
                ", ".join(omitted)
            )
        )

    logger.debug(f"using hosted partitioner with kwargs: {parameters_dict}")

    with open(filename, "rb") as f:
        files = Files(
            content=f.read(),
            file_name=str(filename.resolve()),
        )
    # Set last so the uploaded file always wins over a caller-supplied "files" key.
    filtered_partition_request["files"] = files

    partition_params = PartitionParameters(**filtered_partition_request)

    return PartitionRequest(partition_parameters=partition_params)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
async def call_api(
    server_url: Optional[str], api_key: Optional[str], filename: Path, api_parameters: dict
) -> list[dict]:
    """Call the Unstructured API using unstructured-client.

    The synchronous client call is run through the running event loop's default
    executor so the coroutine can be awaited.

    Args:
        server_url: The base URL where the API is hosted
        api_key: The user's API key (can be empty if this is a self hosted API)
        filename: Path to the file being partitioned
        api_parameters: A dict containing the requested API parameters

    Returns: A list of the file's elements, or an empty list if the response
        has no elements. Nothing is caught here, so any exception raised while
        building the request or calling the client propagates to the caller.
    """
    from unstructured_client import UnstructuredClient

    client = UnstructuredClient(
        server_url=server_url,
        api_key_auth=api_key,
    )
    partition_request = create_partition_request(filename=filename, parameters_dict=api_parameters)

    # TODO when client supports async, run without using run_in_executor
    # isolate the IO heavy call
    # get_running_loop() is the recommended accessor inside a coroutine.
    loop = asyncio.get_running_loop()

    # Note(austin) - The partition calls needs request to be a keyword arg
    # We have to use partial to do this, we can't pass request=request into run_in_executor
    partition_call = partial(client.general.partition, request=partition_request)

    res = await loop.run_in_executor(None, partition_call)

    return res.elements or []
|