unstructured-ingest 0.0.8__py3-none-any.whl → 0.0.10__py3-none-any.whl

This diff shows the content changes between two publicly released versions of the package, as published to their respective public registries. It is provided for informational purposes only.
@@ -1 +1 @@
- __version__ = "0.0.8" # pragma: no cover
+ __version__ = "0.0.10" # pragma: no cover
@@ -63,7 +63,8 @@ def uncompress_tar_file(tar_filename: str, path: Optional[str] = None) -> str:
 
      path = path if path else os.path.join(head, f"{tail}-tar-uncompressed")
      logger.info(f"extracting tar {tar_filename} -> {path}")
-     with tarfile.open(tar_filename, "r:gz") as tfile:
+     # NOTE: "r:*" mode opens both compressed (e.g ".tar.gz") and uncompressed ".tar" archives
+     with tarfile.open(tar_filename, "r:*") as tfile:
          # NOTE(robinson: Mitigate against malicious content being extracted from the tar file.
          # This was added in Python 3.12
          # Ref: https://docs.python.org/3/library/tarfile.html#extraction-filters
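
Why the mode change matters: `"r:gz"` raises `tarfile.ReadError` on an uncompressed `.tar` archive, while `"r:*"` lets `tarfile` auto-detect the compression. A minimal sketch of the behavior (the function name and paths are illustrative, not from the package):

```python
import tarfile


def extract_archive(tar_filename: str, path: str) -> str:
    # "r:*" transparently handles .tar, .tar.gz, .tar.bz2, and .tar.xz;
    # "r:gz" would fail with tarfile.ReadError on a plain .tar file.
    with tarfile.open(tar_filename, "r:*") as tfile:
        # The "data" extraction filter (Python 3.12, backported to recent
        # patch releases of older versions) rejects absolute paths, "..",
        # and special files -- the mitigation the NOTE above refers to.
        tfile.extractall(path=path, filter="data")
    return path
```
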
@@ -3,7 +3,18 @@ import datetime
  from collections import Counter
  from enum import EnumMeta
  from pathlib import Path
- from typing import Any, Callable, Literal, Optional, Type, TypedDict, Union, get_args, get_origin
+ from typing import (
+     Annotated,
+     Any,
+     Callable,
+     Literal,
+     Optional,
+     Type,
+     TypedDict,
+     Union,
+     get_args,
+     get_origin,
+ )
  from uuid import UUID
 
  import click
@@ -102,6 +113,11 @@ def get_type_from_annotation(field_type: Any) -> click.ParamType:
      if field_origin is Union and len(field_args) == 2 and NoneType in field_args:
          field_type = next(field_arg for field_arg in field_args if field_arg is not None)
          return get_type_from_annotation(field_type=field_type)
+     if field_origin is Annotated:
+         field_origin = field_args[0]
+         field_metadata = field_args[1]
+         if isinstance(field_metadata, click.ParamType):
+             return field_metadata
      if field_origin is Secret and len(field_args) == 1:
          field_type = next(field_arg for field_arg in field_args if field_arg is not None)
          return get_type_from_annotation(field_type=field_type)
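
The new `Annotated` branch lets a model author pin an exact `click.ParamType` inside the type annotation itself. A small sketch of the mechanics (the `FileCount` alias is hypothetical, not from the package):

```python
from typing import Annotated, get_args, get_origin

import click

# Hypothetical annotation: the metadata slot carries a click.ParamType.
FileCount = Annotated[int, click.IntRange(min=0)]

# get_origin/get_args expose both halves of the Annotated type.
assert get_origin(FileCount) is Annotated
base_type, metadata = get_args(FileCount)  # (int, IntRange)
if isinstance(metadata, click.ParamType):
    param_type = metadata  # returned directly for use as the CLI option type
```
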
@@ -42,6 +42,7 @@ class FileData(DataClassJsonMixin):
      metadata: FileDataSourceMetadata = field(default_factory=lambda: FileDataSourceMetadata())
      additional_metadata: dict[str, Any] = field(default_factory=dict)
      reprocess: bool = False
+     local_download_path: Optional[str] = None
 
      @classmethod
      def from_file(cls, path: str) -> "FileData":
@@ -68,10 +68,9 @@ class DownloadStep(PipelineStep):
      def update_file_data(
          self, file_data: FileData, file_data_path: Path, download_path: Path
      ) -> None:
+         file_data.local_download_path = str(download_path.resolve())
          file_size_bytes = download_path.stat().st_size
-         changed = False
          if not file_data.metadata.filesize_bytes and file_size_bytes:
-             changed = True
              file_data.metadata.filesize_bytes = file_size_bytes
          if (
              file_data.metadata.filesize_bytes
@@ -82,12 +81,10 @@ class DownloadStep(PipelineStep):
                  f"({file_data.metadata.filesize_bytes}) doesn't "
                  f"match size of local file: {file_size_bytes}, updating"
              )
-             changed = True
              file_data.metadata.filesize_bytes = file_size_bytes
-         if changed:
-             logger.debug(f"Updating file data with new content: {file_data.to_dict()}")
-             with file_data_path.open("w") as file:
-                 json.dump(file_data.to_dict(), file, indent=2)
+         logger.debug(f"Updating file data with new content: {file_data.to_dict()}")
+         with file_data_path.open("w") as file:
+             json.dump(file_data.to_dict(), file, indent=2)
 
      async def _run_async(self, fn: Callable, file_data_path: str) -> list[DownloadStepResponse]:
          file_data = FileData.from_file(path=file_data_path)
@@ -1,4 +1,5 @@
  import asyncio
+ from dataclasses import dataclass
  from pathlib import Path
  from typing import Callable, TypedDict
 
@@ -15,6 +16,7 @@ class UncompressStepResponse(TypedDict):
      path: str
 
 
+ @dataclass
  class UncompressStep(PipelineStep):
      process: Uncompressor
      identifier: str = STEP_ID
@@ -23,21 +25,6 @@ class UncompressStep(PipelineStep):
          config = self.process.config.json() if self.process.config else None
          logger.info(f"Created {self.identifier} with configs: {config}")
 
-     def _run(self, path: str, file_data_path: str) -> list[UncompressStepResponse]:
-         file_data = FileData.from_file(path=file_data_path)
-         new_file_data = self.process.run(file_data=file_data)
-         responses = []
-         for new_file in new_file_data:
-             new_file_data_path = Path(file_data_path).parent / f"{new_file.identifier}.json"
-             new_file.to_file(path=str(new_file_data_path.resolve()))
-             responses.append(
-                 UncompressStepResponse(
-                     path=new_file.source_identifiers.fullpath,
-                     file_data_path=str(new_file_data_path),
-                 )
-             )
-         return responses
-
      async def _run_async(
          self, fn: Callable, path: str, file_data_path: str
      ) -> list[UncompressStepResponse]:
@@ -56,7 +43,7 @@ class UncompressStep(PipelineStep):
              new_file.to_file(path=str(new_file_data_path.resolve()))
              responses.append(
                  UncompressStepResponse(
-                     path=new_file.source_identifiers.fullpath,
+                     path=new_file.local_download_path,
                      file_data_path=str(new_file_data_path),
                  )
              )
@@ -3,10 +3,11 @@ import uuid
  from dataclasses import dataclass, field
  from datetime import date, datetime
  from pathlib import Path
- from typing import TYPE_CHECKING, Any, Optional
+ from typing import TYPE_CHECKING, Annotated, Any, Optional
 
  from dateutil import parser
  from pydantic import Field, Secret
+ from pydantic.functional_validators import BeforeValidator
 
  from unstructured_ingest.error import DestinationConnectionError
  from unstructured_ingest.utils.data_prep import batch_generator, flatten_dict
@@ -21,9 +22,9 @@ from unstructured_ingest.v2.interfaces import (
      UploadStagerConfig,
  )
  from unstructured_ingest.v2.logger import logger
- from unstructured_ingest.v2.processes.connector_registry import (
-     DestinationRegistryEntry,
- )
+ from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
+
+ from .utils import conform_string_to_dict
 
  if TYPE_CHECKING:
      from chromadb import Client
@@ -32,10 +33,10 @@ CONNECTOR_TYPE = "chroma"
 
 
  class ChromaAccessConfig(AccessConfig):
-     settings: Optional[dict[str, str]] = Field(
+     settings: Optional[Annotated[dict, BeforeValidator(conform_string_to_dict)]] = Field(
          default=None, description="A dictionary of settings to communicate with the chroma server."
      )
-     headers: Optional[dict[str, str]] = Field(
+     headers: Optional[Annotated[dict, BeforeValidator(conform_string_to_dict)]] = Field(
          default=None, description="A dictionary of headers to send to the Chroma server."
      )
 
@@ -2,10 +2,11 @@ import io
  import json
  from dataclasses import dataclass, field
  from pathlib import Path
- from typing import TYPE_CHECKING, Any, Generator, Optional
+ from typing import TYPE_CHECKING, Annotated, Any, Generator, Optional
 
  from dateutil import parser
  from pydantic import Field, Secret
+ from pydantic.functional_validators import BeforeValidator
 
  from unstructured_ingest.error import (
      SourceConnectionError,
@@ -26,9 +27,9 @@ from unstructured_ingest.v2.interfaces import (
      download_responses,
  )
  from unstructured_ingest.v2.logger import logger
- from unstructured_ingest.v2.processes.connector_registry import (
-     SourceRegistryEntry,
- )
+ from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
+
+ from .utils import conform_string_to_dict
 
  CONNECTOR_TYPE = "google_drive"
 
@@ -38,7 +39,7 @@ if TYPE_CHECKING:
 
 
  class GoogleDriveAccessConfig(AccessConfig):
-     service_account_key: Optional[dict] = Field(
+     service_account_key: Optional[Annotated[dict, BeforeValidator(conform_string_to_dict)]] = Field(
          default=None, description="Credentials values to use for authentication"
      )
      service_account_key_path: Optional[Path] = Field(
@@ -39,22 +39,6 @@ heavily on the Elasticsearch connector code, inheriting the functionality as muc
 
  class OpenSearchAccessConfig(AccessConfig):
      password: Optional[str] = Field(default=None, description="password when using basic auth")
-     use_ssl: bool = Field(default=False, description="use ssl for the connection")
-     verify_certs: bool = Field(default=False, description="whether to verify SSL certificates")
-     ssl_show_warn: bool = Field(
-         default=False, description="show warning when verify certs is disabled"
-     )
-     ca_certs: Optional[Path] = Field(default=None, description="path to CA bundle")
-     client_cert: Optional[Path] = Field(
-         default=None,
-         description="path to the file containing the private key and the certificate,"
-         " or cert only if using client_key",
-     )
-     client_key: Optional[Path] = Field(
-         default=None,
-         description="path to the file containing the private key"
-         " if using separate cert and key files",
-     )
 
 
  class OpenSearchClientInput(BaseModel):
@@ -75,6 +59,23 @@ class OpenSearchConnectionConfig(ConnectionConfig):
          examples=["http://localhost:9200"],
      )
      username: Optional[str] = Field(default=None, description="username when using basic auth")
+     use_ssl: bool = Field(default=False, description="use ssl for the connection")
+     verify_certs: bool = Field(default=False, description="whether to verify SSL certificates")
+     ssl_show_warn: bool = Field(
+         default=False, description="show warning when verify certs is disabled"
+     )
+     ca_certs: Optional[Path] = Field(default=None, description="path to CA bundle")
+     client_cert: Optional[Path] = Field(
+         default=None,
+         description="path to the file containing the private key and the certificate,"
+         " or cert only if using client_key",
+     )
+     client_key: Optional[Path] = Field(
+         default=None,
+         description="path to the file containing the private key"
+         " if using separate cert and key files",
+     )
+
      access_config: Secret[OpenSearchAccessConfig]
 
      def get_client_kwargs(self) -> dict:
@@ -85,18 +86,18 @@ class OpenSearchConnectionConfig(ConnectionConfig):
          client_input_kwargs = {}
          if self.hosts:
              client_input_kwargs["hosts"] = self.hosts
-         if access_config.use_ssl:
-             client_input_kwargs["use_ssl"] = access_config.use_ssl
-         if access_config.verify_certs:
-             client_input_kwargs["verify_certs"] = access_config.verify_certs
-         if access_config.ssl_show_warn:
-             client_input_kwargs["ssl_show_warn"] = access_config.ssl_show_warn
-         if access_config.ca_certs:
-             client_input_kwargs["ca_certs"] = str(access_config.ca_certs)
-         if access_config.client_cert:
-             client_input_kwargs["client_cert"] = str(access_config.client_cert)
-         if access_config.client_key:
-             client_input_kwargs["client_key"] = str(access_config.client_key)
+         if self.use_ssl:
+             client_input_kwargs["use_ssl"] = self.use_ssl
+         if self.verify_certs:
+             client_input_kwargs["verify_certs"] = self.verify_certs
+         if self.ssl_show_warn:
+             client_input_kwargs["ssl_show_warn"] = self.ssl_show_warn
+         if self.ca_certs:
+             client_input_kwargs["ca_certs"] = str(self.ca_certs)
+         if self.client_cert:
+             client_input_kwargs["client_cert"] = str(self.client_cert)
+         if self.client_key:
+             client_input_kwargs["client_key"] = str(self.client_key)
          if self.username and access_config.password:
              client_input_kwargs["http_auth"] = (self.username, access_config.password)
          client_input = OpenSearchClientInput(**client_input_kwargs)
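
Moving the TLS options out of `OpenSearchAccessConfig` means they no longer live inside the `Secret`-wrapped access config: only the password stays treated as sensitive, and `get_client_kwargs` now reads the TLS settings straight off `self`. A hedged construction sketch (field names come from the diff above, but the instantiation details, e.g. whether a bare `OpenSearchAccessConfig` is coerced into the `Secret` field, are assumptions):

```python
# Sketch, not from the package: building the new-style connection config.
conn = OpenSearchConnectionConfig(
    hosts=["https://localhost:9200"],
    username="admin",
    use_ssl=True,  # plain connection settings now, not secrets
    verify_certs=True,
    ca_certs="/etc/ssl/opensearch-ca.pem",
    # only the password remains inside the Secret-wrapped access config
    access_config=OpenSearchAccessConfig(password="admin-pass"),
)
# includes use_ssl, verify_certs, ca_certs, and http_auth
client_kwargs = conn.get_client_kwargs()
```
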
@@ -1,7 +1,9 @@
+ import json
  from datetime import datetime
- from typing import Union
+ from typing import Any, Union
 
  from dateutil import parser
+ from pydantic import ValidationError
 
 
  def parse_datetime(date_value: Union[int, str, float, datetime]) -> datetime:
@@ -17,3 +19,11 @@ def parse_datetime(date_value: Union[int, str, float, datetime]) -> datetime:
          return datetime.fromtimestamp(timestamp)
      except ValueError:
          return parser.parse(date_value)
+
+
+ def conform_string_to_dict(value: Any) -> dict:
+     if isinstance(value, dict):
+         return value
+     if isinstance(value, str):
+         return json.loads(value)
+     raise ValidationError(f"Input could not be mapped to a valid dict: {value}")
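
`conform_string_to_dict` is what the `BeforeValidator` annotations in the Chroma and Google Drive configs above hook into: it lets a CLI flag supply a JSON string where the model wants a `dict`. One nuance: in Pydantic v2, `ValidationError` is not designed to be raised directly from user code; validators conventionally raise `ValueError`, which Pydantic wraps into a `ValidationError` for the caller. A self-contained sketch of the pattern (the `ExampleConfig` model is hypothetical, not from the package):

```python
import json
from typing import Annotated, Any, Optional

from pydantic import BaseModel, Field
from pydantic.functional_validators import BeforeValidator


def conform_string_to_dict(value: Any) -> dict:
    # Same shape as the helper above, but raising ValueError so that
    # Pydantic converts it into a ValidationError automatically.
    if isinstance(value, dict):
        return value
    if isinstance(value, str):
        return json.loads(value)
    raise ValueError(f"Input could not be mapped to a valid dict: {value}")


class ExampleConfig(BaseModel):  # hypothetical model, not from the package
    headers: Optional[Annotated[dict, BeforeValidator(conform_string_to_dict)]] = Field(
        default=None
    )


# A JSON string passed on the command line validates into a real dict.
cfg = ExampleConfig(headers='{"Authorization": "Bearer TOKEN"}')
assert cfg.headers == {"Authorization": "Bearer TOKEN"}
```
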
@@ -3,12 +3,14 @@ from copy import copy
  from dataclasses import dataclass, field
  from pathlib import Path
  from typing import Any
+ from uuid import NAMESPACE_DNS, uuid5
 
  from pydantic import BaseModel
 
  from unstructured_ingest.utils.compression import TAR_FILE_EXT, ZIP_FILE_EXT, uncompress_file
- from unstructured_ingest.v2.interfaces import FileData
+ from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
  from unstructured_ingest.v2.interfaces.process import BaseProcess
+ from unstructured_ingest.v2.logger import logger
 
 
  class UncompressConfig(BaseModel):
@@ -23,19 +25,35 @@ class Uncompressor(BaseProcess, ABC):
          return True
 
      def run(self, file_data: FileData, **kwargs: Any) -> list[FileData]:
-         local_filepath = Path(file_data.source_identifiers.fullpath)
+         local_filepath = Path(file_data.local_download_path)
          if local_filepath.suffix not in TAR_FILE_EXT + ZIP_FILE_EXT:
              return [file_data]
          new_path = uncompress_file(filename=str(local_filepath))
          new_files = [i for i in Path(new_path).rglob("*") if i.is_file()]
          responses = []
+         logger.debug(
+             "uncompressed {} files from original file {}: {}".format(
+                 len(new_files), local_filepath, ", ".join([str(f) for f in new_files])
+             )
+         )
          for f in new_files:
              new_file_data = copy(file_data)
-             new_file_data.source_identifiers.fullpath = str(f)
-             if new_file_data.source_identifiers.rel_path:
-                 new_file_data.source_identifiers.rel_path = str(f).replace(
-                     str(local_filepath.parent), ""
-                 )[1:]
+             new_file_data.identifier = str(uuid5(NAMESPACE_DNS, str(f)))
+             new_file_data.local_download_path = str(f.resolve())
+             new_rel_download_path = str(f).replace(str(Path(local_filepath.parent)), "")[1:]
+             new_file_data.source_identifiers = SourceIdentifiers(
+                 filename=f.name,
+                 fullpath=file_data.source_identifiers.fullpath.replace(
+                     file_data.source_identifiers.filename, new_rel_download_path
+                 ),
+                 rel_path=(
+                     file_data.source_identifiers.rel_path.replace(
+                         file_data.source_identifiers.filename, new_rel_download_path
+                     )
+                     if file_data.source_identifiers.rel_path
+                     else None
+                 ),
+             )
              responses.append(new_file_data)
          return responses
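
Two design points are worth noting. `copy(file_data)` is shallow, so the old code's in-place mutation of `source_identifiers` altered the object shared by every copy; assigning a fresh `SourceIdentifiers` per extracted file avoids that. And each extracted file now gets its own identifier derived deterministically from its path via `uuid5`, which presumably keeps identifiers stable across pipeline runs rather than changing on every execution. A minimal illustration (paths are made up):

```python
from uuid import NAMESPACE_DNS, uuid4, uuid5

path = "/tmp/archive-tar-uncompressed/report.pdf"

# uuid5 hashes the namespace plus the name, so the same extracted path
# always yields the same identifier on every run.
assert uuid5(NAMESPACE_DNS, path) == uuid5(NAMESPACE_DNS, path)

# uuid4 is random: two runs would assign two unrelated identifiers.
assert uuid4() != uuid4()
```
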