unstructured-ingest 0.0.8__py3-none-any.whl → 0.0.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/utils/compression.py +2 -1
- unstructured_ingest/v2/cli/utils/model_conversion.py +17 -1
- unstructured_ingest/v2/interfaces/file_data.py +1 -0
- unstructured_ingest/v2/pipeline/steps/download.py +4 -7
- unstructured_ingest/v2/pipeline/steps/uncompress.py +3 -16
- unstructured_ingest/v2/processes/connectors/chroma.py +7 -6
- unstructured_ingest/v2/processes/connectors/google_drive.py +6 -5
- unstructured_ingest/v2/processes/connectors/opensearch.py +29 -28
- unstructured_ingest/v2/processes/connectors/utils.py +11 -1
- unstructured_ingest/v2/processes/uncompress.py +25 -7
- {unstructured_ingest-0.0.8.dist-info → unstructured_ingest-0.0.10.dist-info}/METADATA +311 -311
- {unstructured_ingest-0.0.8.dist-info → unstructured_ingest-0.0.10.dist-info}/RECORD +17 -17
- {unstructured_ingest-0.0.8.dist-info → unstructured_ingest-0.0.10.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.0.8.dist-info → unstructured_ingest-0.0.10.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.0.8.dist-info → unstructured_ingest-0.0.10.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.8.dist-info → unstructured_ingest-0.0.10.dist-info}/top_level.txt +0 -0
unstructured_ingest/__version__.py

@@ -1 +1 @@
-__version__ = "0.0.8"  # pragma: no cover
+__version__ = "0.0.10"  # pragma: no cover
unstructured_ingest/utils/compression.py

@@ -63,7 +63,8 @@ def uncompress_tar_file(tar_filename: str, path: Optional[str] = None) -> str:
 
     path = path if path else os.path.join(head, f"{tail}-tar-uncompressed")
     logger.info(f"extracting tar {tar_filename} -> {path}")
-
+    # NOTE: "r:*" mode opens both compressed (e.g ".tar.gz") and uncompressed ".tar" archives
+    with tarfile.open(tar_filename, "r:*") as tfile:
         # NOTE(robinson: Mitigate against malicious content being extracted from the tar file.
         # This was added in Python 3.12
         # Ref: https://docs.python.org/3/library/tarfile.html#extraction-filters
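The "r:*" mode makes tarfile auto-detect the compression, so the same call handles plain .tar as well as .tar.gz and other compressed variants. A minimal sketch of the pattern outside the package (the archive name and destination below are made up; the filter="data" guard referenced in the notes requires Python 3.12 or a patch release that backports it):

import tarfile


def extract_archive(tar_path: str, dest: str) -> None:
    # "r:*" transparently opens plain .tar as well as gzip/bz2/xz-compressed archives.
    with tarfile.open(tar_path, "r:*") as tfile:
        # The "data" filter rejects absolute paths, traversal outside dest,
        # and special files, mitigating malicious archive members.
        tfile.extractall(path=dest, filter="data")


extract_archive("reports.tar.gz", "reports-tar-uncompressed")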
unstructured_ingest/v2/cli/utils/model_conversion.py

@@ -3,7 +3,18 @@ import datetime
 from collections import Counter
 from enum import EnumMeta
 from pathlib import Path
-from typing import Any, Callable, Literal, Optional, Type, TypedDict, Union, get_args, get_origin
+from typing import (
+    Annotated,
+    Any,
+    Callable,
+    Literal,
+    Optional,
+    Type,
+    TypedDict,
+    Union,
+    get_args,
+    get_origin,
+)
 from uuid import UUID
 
 import click
@@ -102,6 +113,11 @@ def get_type_from_annotation(field_type: Any) -> click.ParamType:
     if field_origin is Union and len(field_args) == 2 and NoneType in field_args:
         field_type = next(field_arg for field_arg in field_args if field_arg is not None)
         return get_type_from_annotation(field_type=field_type)
+    if field_origin is Annotated:
+        field_origin = field_args[0]
+        field_metadata = field_args[1]
+        if isinstance(field_metadata, click.ParamType):
+            return field_metadata
     if field_origin is Secret and len(field_args) == 1:
         field_type = next(field_arg for field_arg in field_args if field_arg is not None)
         return get_type_from_annotation(field_type=field_type)
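For context on the new branch: typing.get_origin returns Annotated for annotated types and typing.get_args yields the wrapped type followed by its metadata, which is how a click.ParamType can be attached to a field annotation and recovered here. A small illustration (the field below is hypothetical, not from the package):

from typing import Annotated, get_args, get_origin

import click

PortField = Annotated[int, click.IntRange(1, 65535)]

assert get_origin(PortField) is Annotated
inner_type, metadata = get_args(PortField)  # -> (int, IntRange(1, 65535))
if isinstance(metadata, click.ParamType):
    # The CLI layer can hand this ParamType straight to click instead of
    # deriving one from the inner type.
    print(f"click option type: {metadata!r}")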
unstructured_ingest/v2/interfaces/file_data.py

@@ -42,6 +42,7 @@ class FileData(DataClassJsonMixin):
     metadata: FileDataSourceMetadata = field(default_factory=lambda: FileDataSourceMetadata())
     additional_metadata: dict[str, Any] = field(default_factory=dict)
     reprocess: bool = False
+    local_download_path: Optional[str] = None
 
     @classmethod
     def from_file(cls, path: str) -> "FileData":
unstructured_ingest/v2/pipeline/steps/download.py

@@ -68,10 +68,9 @@ class DownloadStep(PipelineStep):
     def update_file_data(
         self, file_data: FileData, file_data_path: Path, download_path: Path
     ) -> None:
+        file_data.local_download_path = str(download_path.resolve())
         file_size_bytes = download_path.stat().st_size
-        changed = False
         if not file_data.metadata.filesize_bytes and file_size_bytes:
-            changed = True
             file_data.metadata.filesize_bytes = file_size_bytes
         if (
             file_data.metadata.filesize_bytes
@@ -82,12 +81,10 @@ class DownloadStep(PipelineStep):
                 f"({file_data.metadata.filesize_bytes}) doesn't "
                 f"match size of local file: {file_size_bytes}, updating"
             )
-            changed = True
             file_data.metadata.filesize_bytes = file_size_bytes
-
-
-
-            json.dump(file_data.to_dict(), file, indent=2)
+        logger.debug(f"Updating file data with new content: {file_data.to_dict()}")
+        with file_data_path.open("w") as file:
+            json.dump(file_data.to_dict(), file, indent=2)
 
     async def _run_async(self, fn: Callable, file_data_path: str) -> list[DownloadStepResponse]:
         file_data = FileData.from_file(path=file_data_path)
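With the changed flag gone, update_file_data now always rewrites the file-data JSON, so the freshly set local_download_path is persisted for the steps that follow. A toy round trip with a plain dataclass standing in for FileData (names and paths are illustrative only):

import json
from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Optional


@dataclass
class ToyFileData:
    identifier: str
    metadata: dict = field(default_factory=dict)
    local_download_path: Optional[str] = None


def persist(file_data: ToyFileData, file_data_path: Path) -> None:
    # Mirror of the step's unconditional write: serialize and overwrite the sidecar JSON.
    with file_data_path.open("w") as file:
        json.dump(asdict(file_data), file, indent=2)


record = ToyFileData(identifier="abc123", local_download_path="/tmp/downloads/report.pdf")
persist(record, Path("/tmp/abc123.json"))
print(json.loads(Path("/tmp/abc123.json").read_text())["local_download_path"])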
unstructured_ingest/v2/pipeline/steps/uncompress.py

@@ -1,4 +1,5 @@
 import asyncio
+from dataclasses import dataclass
 from pathlib import Path
 from typing import Callable, TypedDict
 
@@ -15,6 +16,7 @@ class UncompressStepResponse(TypedDict):
     path: str
 
 
+@dataclass
 class UncompressStep(PipelineStep):
     process: Uncompressor
     identifier: str = STEP_ID
@@ -23,21 +25,6 @@ class UncompressStep(PipelineStep):
         config = self.process.config.json() if self.process.config else None
         logger.info(f"Created {self.identifier} with configs: {config}")
 
-    def _run(self, path: str, file_data_path: str) -> list[UncompressStepResponse]:
-        file_data = FileData.from_file(path=file_data_path)
-        new_file_data = self.process.run(file_data=file_data)
-        responses = []
-        for new_file in new_file_data:
-            new_file_data_path = Path(file_data_path).parent / f"{new_file.identifier}.json"
-            new_file.to_file(path=str(new_file_data_path.resolve()))
-            responses.append(
-                UncompressStepResponse(
-                    path=new_file.source_identifiers.fullpath,
-                    file_data_path=str(new_file_data_path),
-                )
-            )
-        return responses
-
     async def _run_async(
         self, fn: Callable, path: str, file_data_path: str
     ) -> list[UncompressStepResponse]:
@@ -56,7 +43,7 @@ class UncompressStep(PipelineStep):
             new_file.to_file(path=str(new_file_data_path.resolve()))
             responses.append(
                 UncompressStepResponse(
-                    path=new_file.source_identifiers.fullpath,
+                    path=new_file.local_download_path,
                     file_data_path=str(new_file_data_path),
                 )
             )
unstructured_ingest/v2/processes/connectors/chroma.py

@@ -3,10 +3,11 @@ import uuid
 from dataclasses import dataclass, field
 from datetime import date, datetime
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Annotated, Any, Optional
 
 from dateutil import parser
 from pydantic import Field, Secret
+from pydantic.functional_validators import BeforeValidator
 
 from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.utils.data_prep import batch_generator, flatten_dict
@@ -21,9 +22,9 @@ from unstructured_ingest.v2.interfaces import (
     UploadStagerConfig,
 )
 from unstructured_ingest.v2.logger import logger
-from unstructured_ingest.v2.processes.connector_registry import (
-    DestinationRegistryEntry,
-)
+from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
+
+from .utils import conform_string_to_dict
 
 if TYPE_CHECKING:
     from chromadb import Client
@@ -32,10 +33,10 @@ CONNECTOR_TYPE = "chroma"
 
 
 class ChromaAccessConfig(AccessConfig):
-    settings: Optional[dict] = Field(
+    settings: Optional[Annotated[dict, BeforeValidator(conform_string_to_dict)]] = Field(
         default=None, description="A dictionary of settings to communicate with the chroma server."
     )
-    headers: Optional[dict] = Field(
+    headers: Optional[Annotated[dict, BeforeValidator(conform_string_to_dict)]] = Field(
         default=None, description="A dictionary of headers to send to the Chroma server."
     )
 
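Wrapping the field type as Annotated[dict, BeforeValidator(conform_string_to_dict)] lets the same pydantic field accept either an actual dict or a JSON string (as typically arrives from CLI options or environment variables), converting the string before normal validation runs. A self-contained sketch of the idea with a local validator in place of the package helper:

import json
from typing import Annotated, Optional

from pydantic import BaseModel, Field
from pydantic.functional_validators import BeforeValidator


def _string_to_dict(value):
    # Pass dicts through untouched; parse JSON strings; anything else fails validation later.
    if isinstance(value, str):
        return json.loads(value)
    return value


class ExampleAccessConfig(BaseModel):
    headers: Optional[Annotated[dict, BeforeValidator(_string_to_dict)]] = Field(default=None)


config = ExampleAccessConfig(headers='{"Authorization": "Bearer abc"}')
print(config.headers)  # {'Authorization': 'Bearer abc'}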
unstructured_ingest/v2/processes/connectors/google_drive.py

@@ -2,10 +2,11 @@ import io
 import json
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Generator, Optional
+from typing import TYPE_CHECKING, Annotated, Any, Generator, Optional
 
 from dateutil import parser
 from pydantic import Field, Secret
+from pydantic.functional_validators import BeforeValidator
 
 from unstructured_ingest.error import (
     SourceConnectionError,
@@ -26,9 +27,9 @@ from unstructured_ingest.v2.interfaces import (
     download_responses,
 )
 from unstructured_ingest.v2.logger import logger
-from unstructured_ingest.v2.processes.connector_registry import (
-    SourceRegistryEntry,
-)
+from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
+
+from .utils import conform_string_to_dict
 
 CONNECTOR_TYPE = "google_drive"
 
@@ -38,7 +39,7 @@ if TYPE_CHECKING:
 
 
 class GoogleDriveAccessConfig(AccessConfig):
-    service_account_key: Optional[dict] = Field(
+    service_account_key: Optional[Annotated[dict, BeforeValidator(conform_string_to_dict)]] = Field(
         default=None, description="Credentials values to use for authentication"
     )
     service_account_key_path: Optional[Path] = Field(
unstructured_ingest/v2/processes/connectors/opensearch.py

@@ -39,22 +39,6 @@ heavily on the Elasticsearch connector code, inheriting the functionality as much
 
 class OpenSearchAccessConfig(AccessConfig):
     password: Optional[str] = Field(default=None, description="password when using basic auth")
-    use_ssl: bool = Field(default=False, description="use ssl for the connection")
-    verify_certs: bool = Field(default=False, description="whether to verify SSL certificates")
-    ssl_show_warn: bool = Field(
-        default=False, description="show warning when verify certs is disabled"
-    )
-    ca_certs: Optional[Path] = Field(default=None, description="path to CA bundle")
-    client_cert: Optional[Path] = Field(
-        default=None,
-        description="path to the file containing the private key and the certificate,"
-        " or cert only if using client_key",
-    )
-    client_key: Optional[Path] = Field(
-        default=None,
-        description="path to the file containing the private key"
-        " if using separate cert and key files",
-    )
 
 
 class OpenSearchClientInput(BaseModel):
@@ -75,6 +59,23 @@ class OpenSearchConnectionConfig(ConnectionConfig):
         examples=["http://localhost:9200"],
     )
     username: Optional[str] = Field(default=None, description="username when using basic auth")
+    use_ssl: bool = Field(default=False, description="use ssl for the connection")
+    verify_certs: bool = Field(default=False, description="whether to verify SSL certificates")
+    ssl_show_warn: bool = Field(
+        default=False, description="show warning when verify certs is disabled"
+    )
+    ca_certs: Optional[Path] = Field(default=None, description="path to CA bundle")
+    client_cert: Optional[Path] = Field(
+        default=None,
+        description="path to the file containing the private key and the certificate,"
+        " or cert only if using client_key",
+    )
+    client_key: Optional[Path] = Field(
+        default=None,
+        description="path to the file containing the private key"
+        " if using separate cert and key files",
+    )
+
     access_config: Secret[OpenSearchAccessConfig]
 
     def get_client_kwargs(self) -> dict:
@@ -85,18 +86,18 @@ class OpenSearchConnectionConfig(ConnectionConfig):
         client_input_kwargs = {}
         if self.hosts:
             client_input_kwargs["hosts"] = self.hosts
-        if access_config.use_ssl:
-            client_input_kwargs["use_ssl"] = access_config.use_ssl
-        if access_config.verify_certs:
-            client_input_kwargs["verify_certs"] = access_config.verify_certs
-        if access_config.ssl_show_warn:
-            client_input_kwargs["ssl_show_warn"] = access_config.ssl_show_warn
-        if access_config.ca_certs:
-            client_input_kwargs["ca_certs"] = str(access_config.ca_certs)
-        if access_config.client_cert:
-            client_input_kwargs["client_cert"] = str(access_config.client_cert)
-        if access_config.client_key:
-            client_input_kwargs["client_key"] = str(access_config.client_key)
+        if self.use_ssl:
+            client_input_kwargs["use_ssl"] = self.use_ssl
+        if self.verify_certs:
+            client_input_kwargs["verify_certs"] = self.verify_certs
+        if self.ssl_show_warn:
+            client_input_kwargs["ssl_show_warn"] = self.ssl_show_warn
+        if self.ca_certs:
+            client_input_kwargs["ca_certs"] = str(self.ca_certs)
+        if self.client_cert:
+            client_input_kwargs["client_cert"] = str(self.client_cert)
+        if self.client_key:
+            client_input_kwargs["client_key"] = str(self.client_key)
         if self.username and access_config.password:
             client_input_kwargs["http_auth"] = (self.username, access_config.password)
         client_input = OpenSearchClientInput(**client_input_kwargs)
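Relocating the TLS options onto OpenSearchConnectionConfig keeps only the password behind the Secret wrapper, while the rest feed the client kwargs directly. Roughly, kwargs assembled this way map onto the opensearch-py client as sketched below (hosts, credentials, and the CA path are placeholders, not values from the diff):

from opensearchpy import OpenSearch

client_kwargs: dict = {"hosts": ["https://localhost:9200"]}

# Only truthy options are forwarded, mirroring get_client_kwargs above.
use_ssl, verify_certs, ca_certs = True, True, "/etc/ssl/certs/ca-bundle.crt"
if use_ssl:
    client_kwargs["use_ssl"] = use_ssl
if verify_certs:
    client_kwargs["verify_certs"] = verify_certs
if ca_certs:
    client_kwargs["ca_certs"] = str(ca_certs)
client_kwargs["http_auth"] = ("admin", "example-password")

client = OpenSearch(**client_kwargs)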
unstructured_ingest/v2/processes/connectors/utils.py

@@ -1,7 +1,9 @@
+import json
 from datetime import datetime
-from typing import Union
+from typing import Any, Union
 
 from dateutil import parser
+from pydantic import ValidationError
 
 
 def parse_datetime(date_value: Union[int, str, float, datetime]) -> datetime:
@@ -17,3 +19,11 @@ def parse_datetime(date_value: Union[int, str, float, datetime]) -> datetime:
         return datetime.fromtimestamp(timestamp)
     except ValueError:
         return parser.parse(date_value)
+
+
+def conform_string_to_dict(value: Any) -> dict:
+    if isinstance(value, dict):
+        return value
+    if isinstance(value, str):
+        return json.loads(value)
+    raise ValidationError(f"Input could not be mapped to a valid dict: {value}")
unstructured_ingest/v2/processes/uncompress.py

@@ -3,12 +3,14 @@ from copy import copy
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Any
+from uuid import NAMESPACE_DNS, uuid5
 
 from pydantic import BaseModel
 
 from unstructured_ingest.utils.compression import TAR_FILE_EXT, ZIP_FILE_EXT, uncompress_file
-from unstructured_ingest.v2.interfaces import FileData
+from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
 from unstructured_ingest.v2.interfaces.process import BaseProcess
+from unstructured_ingest.v2.logger import logger
 
 
 class UncompressConfig(BaseModel):
@@ -23,19 +25,35 @@ class Uncompressor(BaseProcess, ABC):
         return True
 
     def run(self, file_data: FileData, **kwargs: Any) -> list[FileData]:
-        local_filepath = Path(file_data.
+        local_filepath = Path(file_data.local_download_path)
         if local_filepath.suffix not in TAR_FILE_EXT + ZIP_FILE_EXT:
             return [file_data]
         new_path = uncompress_file(filename=str(local_filepath))
         new_files = [i for i in Path(new_path).rglob("*") if i.is_file()]
         responses = []
+        logger.debug(
+            "uncompressed {} files from original file {}: {}".format(
+                len(new_files), local_filepath, ", ".join([str(f) for f in new_files])
+            )
+        )
         for f in new_files:
             new_file_data = copy(file_data)
-            new_file_data.
-
-
-
-
+            new_file_data.identifier = str(uuid5(NAMESPACE_DNS, str(f)))
+            new_file_data.local_download_path = str(f.resolve())
+            new_rel_download_path = str(f).replace(str(Path(local_filepath.parent)), "")[1:]
+            new_file_data.source_identifiers = SourceIdentifiers(
+                filename=f.name,
+                fullpath=file_data.source_identifiers.fullpath.replace(
+                    file_data.source_identifiers.filename, new_rel_download_path
+                ),
+                rel_path=(
+                    file_data.source_identifiers.rel_path.replace(
+                        file_data.source_identifiers.filename, new_rel_download_path
+                    )
+                    if file_data.source_identifiers.rel_path
+                    else None
+                ),
+            )
             responses.append(new_file_data)
         return responses
 
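Deriving each extracted file's identifier with uuid5(NAMESPACE_DNS, str(f)) is deterministic: re-running the pipeline over the same archive produces the same identifiers (and therefore the same file-data JSON names), which a random uuid4 would not. A quick illustration with a made-up path:

from uuid import NAMESPACE_DNS, uuid4, uuid5

extracted_path = "/downloads/archive-uncompressed/reports/q1.pdf"

# uuid5 is a pure function of namespace + name, so repeated runs agree.
assert uuid5(NAMESPACE_DNS, extracted_path) == uuid5(NAMESPACE_DNS, extracted_path)
# uuid4 is random and would change the identifier on every run.
assert uuid4() != uuid4()

print(uuid5(NAMESPACE_DNS, extracted_path))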