unstructured-ingest 0.0.18__py3-none-any.whl → 0.0.21__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (27)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/cmds/astradb.py +2 -2
  3. unstructured_ingest/connector/astradb.py +54 -24
  4. unstructured_ingest/v2/processes/chunker.py +8 -29
  5. unstructured_ingest/v2/processes/connectors/astradb.py +26 -19
  6. unstructured_ingest/v2/processes/connectors/chroma.py +2 -5
  7. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +12 -14
  8. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +1 -6
  9. unstructured_ingest/v2/processes/connectors/fsspec/box.py +1 -6
  10. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +2 -5
  11. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +1 -6
  12. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +1 -6
  13. unstructured_ingest/v2/processes/connectors/kdbai.py +2 -5
  14. unstructured_ingest/v2/processes/connectors/local.py +2 -5
  15. unstructured_ingest/v2/processes/connectors/milvus.py +2 -5
  16. unstructured_ingest/v2/processes/connectors/mongodb.py +2 -5
  17. unstructured_ingest/v2/processes/connectors/pinecone.py +2 -5
  18. unstructured_ingest/v2/processes/connectors/sql.py +1 -6
  19. unstructured_ingest/v2/processes/connectors/weaviate.py +2 -5
  20. unstructured_ingest/v2/processes/partitioner.py +9 -55
  21. unstructured_ingest/v2/unstructured_api.py +87 -0
  22. {unstructured_ingest-0.0.18.dist-info → unstructured_ingest-0.0.21.dist-info}/METADATA +369 -369
  23. {unstructured_ingest-0.0.18.dist-info → unstructured_ingest-0.0.21.dist-info}/RECORD +27 -26
  24. {unstructured_ingest-0.0.18.dist-info → unstructured_ingest-0.0.21.dist-info}/LICENSE.md +0 -0
  25. {unstructured_ingest-0.0.18.dist-info → unstructured_ingest-0.0.21.dist-info}/WHEEL +0 -0
  26. {unstructured_ingest-0.0.18.dist-info → unstructured_ingest-0.0.21.dist-info}/entry_points.txt +0 -0
  27. {unstructured_ingest-0.0.18.dist-info → unstructured_ingest-0.0.21.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,87 @@
1
+ import asyncio
2
+ from dataclasses import fields
3
+ from functools import partial
4
+ from pathlib import Path
5
+ from typing import TYPE_CHECKING, Optional
6
+
7
+ from unstructured_ingest.v2.logger import logger
8
+
9
+ if TYPE_CHECKING:
10
+ from unstructured_client.models.operations import PartitionRequest
11
+
12
+
13
def create_partition_request(filename: Path, parameters_dict: dict) -> "PartitionRequest":
    """Build a PartitionRequest for unstructured-client from a file and API params.

    Params that the installed SDK's ``PartitionParameters`` does not declare are
    dropped (and reported at debug level) instead of being passed through.
    The input ``parameters_dict`` is not modified.

    Args:
        filename: Path to the file being partitioned
        parameters_dict: A mapping of all API params we want to send

    Returns: A PartitionRequest containing the file and all valid params
    """
    from unstructured_client.models.operations import PartitionRequest
    from unstructured_client.models.shared import Files, PartitionParameters

    # NOTE(austin): PartitionParameters is a Pydantic model in v0.26.0
    # Prior to this it was a dataclass which doesn't have .__fields__
    # Prefer `model_fields` (the Pydantic v2 name; `__fields__` is deprecated
    # there), then the older `__fields__`, then fall back to dataclass fields.
    possible_fields = getattr(PartitionParameters, "model_fields", None)
    if possible_fields is None:
        possible_fields = getattr(PartitionParameters, "__fields__", None)
    if possible_fields is None:
        possible_fields = [f.name for f in fields(PartitionParameters)]
    supported_names = set(possible_fields)

    filtered_partition_request = {
        k: v for k, v in parameters_dict.items() if k in supported_names
    }
    omitted = [k for k in parameters_dict if k not in filtered_partition_request]
    if omitted:
        logger.debug(
            "Following fields were omitted due to not being "
            "supported by the currently used unstructured client: {}".format(", ".join(omitted))
        )

    logger.debug(f"using hosted partitioner with kwargs: {parameters_dict}")

    # Accept str paths as well as Path objects; .resolve() below needs a Path.
    file_path = Path(filename)
    with open(file_path, "rb") as f:
        files = Files(
            content=f.read(),
            file_name=str(file_path.resolve()),
        )
    # Always set last so a caller-supplied "files" entry cannot override the upload.
    filtered_partition_request["files"] = files

    partition_params = PartitionParameters(**filtered_partition_request)

    return PartitionRequest(partition_parameters=partition_params)
54
+
55
+
56
async def call_api(
    server_url: Optional[str], api_key: Optional[str], filename: Path, api_parameters: dict
) -> list[dict]:
    """Call the Unstructured API using unstructured-client.

    The synchronous SDK call is run in the event loop's default executor so it
    does not block the loop while the request is in flight.

    Args:
        server_url: The base URL where the API is hosted
        api_key: The user's API key (can be empty if this is a self hosted API)
        filename: Path to the file being partitioned
        api_parameters: A dict containing the requested API parameters

    Returns: A list of the file's elements, or an empty list if the response
        carries no elements. Exceptions raised while building the request or
        by the client are not caught here and propagate to the caller.
    """
    from unstructured_client import UnstructuredClient

    client = UnstructuredClient(
        server_url=server_url,
        api_key_auth=api_key,
    )
    partition_request = create_partition_request(filename=filename, parameters_dict=api_parameters)

    # TODO when client supports async, run without using run_in_executor
    # isolate the IO heavy call
    # We are inside a coroutine, so a loop is guaranteed to be running;
    # get_running_loop() is the documented way to obtain it here.
    loop = asyncio.get_running_loop()

    # Note(austin) - The partition calls needs request to be a keyword arg
    # We have to use partial to do this, we can't pass request=request into run_in_executor
    partition_call = partial(client.general.partition, request=partition_request)

    res = await loop.run_in_executor(None, partition_call)

    return res.elements or []