unstructured-ingest 0.0.3__py3-none-any.whl → 0.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/cli.py +6 -1
- unstructured_ingest/cli/cmds/__init__.py +4 -4
- unstructured_ingest/cli/cmds/{astra.py → astradb.py} +9 -9
- unstructured_ingest/cli/interfaces.py +13 -6
- unstructured_ingest/connector/{astra.py → astradb.py} +29 -29
- unstructured_ingest/connector/biomed.py +12 -5
- unstructured_ingest/connector/confluence.py +3 -3
- unstructured_ingest/connector/github.py +3 -2
- unstructured_ingest/connector/google_drive.py +1 -2
- unstructured_ingest/connector/mongodb.py +1 -2
- unstructured_ingest/connector/notion/client.py +31 -16
- unstructured_ingest/connector/notion/connector.py +3 -2
- unstructured_ingest/connector/registry.py +2 -2
- unstructured_ingest/connector/vectara.py +7 -2
- unstructured_ingest/interfaces.py +13 -9
- unstructured_ingest/pipeline/interfaces.py +8 -3
- unstructured_ingest/pipeline/reformat/chunking.py +13 -9
- unstructured_ingest/pipeline/reformat/embedding.py +3 -3
- unstructured_ingest/runner/__init__.py +2 -2
- unstructured_ingest/runner/{astra.py → astradb.py} +7 -7
- unstructured_ingest/runner/writers/__init__.py +2 -2
- unstructured_ingest/runner/writers/{astra.py → astradb.py} +7 -7
- unstructured_ingest/utils/chunking.py +45 -0
- unstructured_ingest/utils/dep_check.py +1 -1
- unstructured_ingest/utils/google_filetype.py +9 -0
- unstructured_ingest/v2/cli/base/cmd.py +57 -13
- unstructured_ingest/v2/cli/base/dest.py +21 -12
- unstructured_ingest/v2/cli/base/src.py +35 -23
- unstructured_ingest/v2/cli/cmds.py +14 -0
- unstructured_ingest/v2/cli/{utils.py → utils/click.py} +36 -89
- unstructured_ingest/v2/cli/utils/model_conversion.py +199 -0
- unstructured_ingest/v2/interfaces/connector.py +5 -7
- unstructured_ingest/v2/interfaces/downloader.py +8 -5
- unstructured_ingest/v2/interfaces/file_data.py +8 -2
- unstructured_ingest/v2/interfaces/indexer.py +3 -4
- unstructured_ingest/v2/interfaces/processor.py +10 -10
- unstructured_ingest/v2/interfaces/upload_stager.py +3 -3
- unstructured_ingest/v2/interfaces/uploader.py +3 -3
- unstructured_ingest/v2/pipeline/pipeline.py +1 -5
- unstructured_ingest/v2/pipeline/steps/chunk.py +5 -11
- unstructured_ingest/v2/pipeline/steps/download.py +13 -11
- unstructured_ingest/v2/pipeline/steps/embed.py +5 -11
- unstructured_ingest/v2/pipeline/steps/filter.py +1 -6
- unstructured_ingest/v2/pipeline/steps/index.py +14 -10
- unstructured_ingest/v2/pipeline/steps/partition.py +5 -5
- unstructured_ingest/v2/pipeline/steps/stage.py +4 -7
- unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -6
- unstructured_ingest/v2/pipeline/steps/upload.py +2 -9
- unstructured_ingest/v2/processes/__init__.py +18 -0
- unstructured_ingest/v2/processes/chunker.py +74 -28
- unstructured_ingest/v2/processes/connector_registry.py +8 -2
- unstructured_ingest/v2/processes/connectors/__init__.py +13 -3
- unstructured_ingest/v2/processes/connectors/{astra.py → astradb.py} +45 -35
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +30 -27
- unstructured_ingest/v2/processes/connectors/chroma.py +30 -21
- unstructured_ingest/v2/processes/connectors/couchbase.py +151 -0
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +87 -32
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +70 -45
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +39 -16
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +15 -13
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +10 -11
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +20 -34
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +38 -13
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +31 -17
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -28
- unstructured_ingest/v2/processes/connectors/google_drive.py +40 -34
- unstructured_ingest/v2/processes/connectors/local.py +22 -14
- unstructured_ingest/v2/processes/connectors/milvus.py +22 -18
- unstructured_ingest/v2/processes/connectors/mongodb.py +22 -18
- unstructured_ingest/v2/processes/connectors/onedrive.py +17 -14
- unstructured_ingest/v2/processes/connectors/opensearch.py +66 -56
- unstructured_ingest/v2/processes/connectors/pinecone.py +23 -20
- unstructured_ingest/v2/processes/connectors/salesforce.py +26 -18
- unstructured_ingest/v2/processes/connectors/sharepoint.py +51 -26
- unstructured_ingest/v2/processes/connectors/singlestore.py +11 -15
- unstructured_ingest/v2/processes/connectors/sql.py +29 -31
- unstructured_ingest/v2/processes/connectors/weaviate.py +22 -13
- unstructured_ingest/v2/processes/embedder.py +106 -47
- unstructured_ingest/v2/processes/filter.py +11 -5
- unstructured_ingest/v2/processes/partitioner.py +79 -33
- unstructured_ingest/v2/processes/uncompress.py +3 -3
- unstructured_ingest/v2/utils.py +45 -0
- unstructured_ingest-0.0.4.dist-info/METADATA +571 -0
- {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.4.dist-info}/RECORD +89 -116
- {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.4.dist-info}/WHEEL +1 -1
- unstructured_ingest/v2/cli/cmds/__init__.py +0 -89
- unstructured_ingest/v2/cli/cmds/astra.py +0 -85
- unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +0 -72
- unstructured_ingest/v2/cli/cmds/chroma.py +0 -108
- unstructured_ingest/v2/cli/cmds/databricks_volumes.py +0 -161
- unstructured_ingest/v2/cli/cmds/elasticsearch.py +0 -159
- unstructured_ingest/v2/cli/cmds/fsspec/azure.py +0 -84
- unstructured_ingest/v2/cli/cmds/fsspec/box.py +0 -58
- unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +0 -58
- unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +0 -69
- unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +0 -81
- unstructured_ingest/v2/cli/cmds/fsspec/s3.py +0 -84
- unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +0 -80
- unstructured_ingest/v2/cli/cmds/google_drive.py +0 -74
- unstructured_ingest/v2/cli/cmds/local.py +0 -52
- unstructured_ingest/v2/cli/cmds/milvus.py +0 -72
- unstructured_ingest/v2/cli/cmds/mongodb.py +0 -62
- unstructured_ingest/v2/cli/cmds/onedrive.py +0 -91
- unstructured_ingest/v2/cli/cmds/opensearch.py +0 -93
- unstructured_ingest/v2/cli/cmds/pinecone.py +0 -62
- unstructured_ingest/v2/cli/cmds/salesforce.py +0 -79
- unstructured_ingest/v2/cli/cmds/sharepoint.py +0 -112
- unstructured_ingest/v2/cli/cmds/singlestore.py +0 -96
- unstructured_ingest/v2/cli/cmds/sql.py +0 -84
- unstructured_ingest/v2/cli/cmds/weaviate.py +0 -100
- unstructured_ingest/v2/cli/configs/__init__.py +0 -13
- unstructured_ingest/v2/cli/configs/chunk.py +0 -89
- unstructured_ingest/v2/cli/configs/embed.py +0 -74
- unstructured_ingest/v2/cli/configs/filter.py +0 -28
- unstructured_ingest/v2/cli/configs/partition.py +0 -99
- unstructured_ingest/v2/cli/configs/processor.py +0 -88
- unstructured_ingest/v2/cli/interfaces.py +0 -27
- unstructured_ingest/v2/pipeline/utils.py +0 -15
- unstructured_ingest-0.0.3.dist-info/METADATA +0 -175
- /unstructured_ingest/v2/cli/{cmds/fsspec → utils}/__init__.py +0 -0
- {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.4.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.4.dist-info}/top_level.txt +0 -0

unstructured_ingest/v2/processes/connectors/fsspec/s3.py

@@ -5,9 +5,9 @@ from pathlib import Path
from time import time
from typing import Any, Generator, Optional

-from
+from pydantic import Field, Secret

-from unstructured_ingest.
+from unstructured_ingest.utils.dep_check import requires_dependencies
from unstructured_ingest.v2.interfaces import (
    DownloadResponse,
    FileData,
@@ -32,27 +32,41 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
CONNECTOR_TYPE = "s3"


-@dataclass
class S3IndexerConfig(FsspecIndexerConfig):
    pass


-@dataclass
class S3AccessConfig(FsspecAccessConfig):
-    key: Optional[str] =
-
-
+    key: Optional[str] = Field(
+        default=None,
+        description="If not anonymous, use this access key ID, if specified. Takes precedence "
+        "over `aws_access_key_id` in client_kwargs.",
+    )
+    secret: Optional[str] = Field(
+        default=None, description="If not anonymous, use this secret access key, if specified."
+    )
+    token: Optional[str] = Field(
+        default=None, description="If not anonymous, use this security token, if specified."
+    )
+
+
+SecretS3AccessConfig = Secret[S3AccessConfig]


-@dataclass
class S3ConnectionConfig(FsspecConnectionConfig):
-    supported_protocols: list[str] = field(default_factory=lambda: ["s3", "s3a"])
-    access_config:
-
+    supported_protocols: list[str] = field(default_factory=lambda: ["s3", "s3a"], init=False)
+    access_config: SecretS3AccessConfig = Field(
+        default_factory=lambda: SecretS3AccessConfig(secret_value=S3AccessConfig())
    )
-    endpoint_url: Optional[str] =
-
-
+    endpoint_url: Optional[str] = Field(
+        default=None,
+        description="Use this endpoint_url, if specified. Needed for "
+        "connecting to non-AWS S3 buckets.",
+    )
+    anonymous: bool = Field(
+        default=False, description="Connect to s3 without local AWS credentials."
+    )
+    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)

    def get_access_config(self) -> dict[str, Any]:
        access_configs: dict[str, Any] = {"anon": self.anonymous}
@@ -60,7 +74,9 @@ class S3ConnectionConfig(FsspecConnectionConfig):
            access_configs["endpoint_url"] = self.endpoint_url

        # Avoid injecting None by filtering out k,v pairs where the value is None
-        access_configs.update(
+        access_configs.update(
+            {k: v for k, v in self.access_config.get_secret_value().dict().items() if v}
+        )
        return access_configs


@@ -116,7 +132,6 @@ class S3Indexer(FsspecIndexer):
        super().precheck()


-@dataclass
class S3DownloaderConfig(FsspecDownloaderConfig):
    pass

@@ -137,7 +152,6 @@ class S3Downloader(FsspecDownloader):
        return await super().run_async(file_data=file_data, **kwargs)


-@dataclass
class S3UploaderConfig(FsspecUploaderConfig):
    pass

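The s3.py hunks above show the pattern this release applies across the v2 connectors: `@dataclass`-based configs become pydantic models, `Field(...)` carries defaults and descriptions, and access credentials are wrapped in pydantic's generic `Secret[...]` so they stay masked until explicitly unwrapped. A minimal, self-contained sketch of that pattern under a recent pydantic v2 (the `Demo*` names are illustrative only, not part of the package):

    from typing import Optional

    from pydantic import BaseModel, Field, Secret


    class DemoAccessConfig(BaseModel):
        key: Optional[str] = Field(default=None, description="access key ID")
        secret: Optional[str] = Field(default=None, description="secret access key")


    SecretDemoAccessConfig = Secret[DemoAccessConfig]


    class DemoConnectionConfig(BaseModel):
        anonymous: bool = Field(default=False, description="connect without credentials")
        access_config: SecretDemoAccessConfig = Field(
            default_factory=lambda: SecretDemoAccessConfig(secret_value=DemoAccessConfig())
        )

        def get_access_config(self) -> dict:
            # Credentials are only unwrapped here; reprs and dumps stay masked.
            configs: dict = {"anon": self.anonymous}
            creds = self.access_config.get_secret_value().model_dump()
            configs.update({k: v for k, v in creds.items() if v})
            return configs


    config = DemoConnectionConfig(
        access_config=DemoAccessConfig(key="demo-key", secret="demo-secret")
    )
    print(config)                      # access_config renders masked
    print(config.get_access_config())  # {'anon': False, 'key': 'demo-key', 'secret': 'demo-secret'}
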

unstructured_ingest/v2/processes/connectors/fsspec/sftp.py

@@ -6,7 +6,8 @@ from pathlib import Path
from typing import Any, Generator, Optional
from urllib.parse import urlparse

-from
+from pydantic import Field, Secret
+
from unstructured_ingest.utils.dep_check import requires_dependencies
from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, UploadContent
from unstructured_ingest.v2.processes.connector_registry import (
@@ -27,10 +28,10 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
CONNECTOR_TYPE = "sftp"


-@dataclass
class SftpIndexerConfig(FsspecIndexerConfig):
-
-
+
+    def model_post_init(self, __context: Any) -> None:
+        super().model_post_init(__context)
        _, ext = os.path.splitext(self.remote_url)
        parsed_url = urlparse(self.remote_url)
        if ext:
@@ -39,21 +40,21 @@ class SftpIndexerConfig(FsspecIndexerConfig):
            self.path_without_protocol = parsed_url.path.lstrip("/")


-@dataclass
class SftpAccessConfig(FsspecAccessConfig):
-    password: str
+    password: str = Field(description="Password for sftp connection")


-@dataclass
class SftpConnectionConfig(FsspecConnectionConfig):
-    supported_protocols: list[str] =
-    access_config: SftpAccessConfig
-    connector_type: str = CONNECTOR_TYPE
-    username:
-    host: Optional[str] = None
-    port: int = 22
-    look_for_keys: bool =
-
+    supported_protocols: list[str] = Field(default_factory=lambda: ["sftp"], init=False)
+    access_config: Secret[SftpAccessConfig]
+    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
+    username: str = Field(description="Username for sftp connection")
+    host: Optional[str] = Field(default=None, description="Hostname for sftp connection")
+    port: int = Field(default=22, description="Port for sftp connection")
+    look_for_keys: bool = Field(
+        default=False, description="Whether to search for private key files in ~/.ssh/"
+    )
+    allow_agent: bool = Field(default=False, description="Whether to connect to the SSH agent.")

    def get_access_config(self) -> dict[str, Any]:
        access_config = {
@@ -62,7 +63,7 @@ class SftpConnectionConfig(FsspecConnectionConfig):
            "port": self.port,
            "look_for_keys": self.look_for_keys,
            "allow_agent": self.allow_agent,
-            "password": self.access_config.password,
+            "password": self.access_config.get_secret_value().password,
        }
        return access_config

@@ -96,24 +97,15 @@ class SftpIndexer(FsspecIndexer):
        super().precheck()


-@dataclass
class SftpDownloaderConfig(FsspecDownloaderConfig):
-    remote_url:
-
-    def __post_init__(self):
-        # TODO once python3.9 no longer supported and kw_only is allowed in dataclasses, remove:
-        if not self.remote_url:
-            raise TypeError(
-                f"{self.__class__.__name__}.__init__() "
-                f"missing 1 required positional argument: 'remote_url'"
-            )
+    remote_url: str = Field(description="Remote fsspec URL formatted as `protocol://dir/path`")


@dataclass
class SftpDownloader(FsspecDownloader):
    protocol: str = "sftp"
    connection_config: SftpConnectionConfig
-    connector_type: str = CONNECTOR_TYPE
+    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
    download_config: Optional[SftpDownloaderConfig] = field(default_factory=SftpDownloaderConfig)

    @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
@@ -131,7 +123,6 @@ class SftpDownloader(FsspecDownloader):
        return await super().run_async(file_data=file_data, **kwargs)


-@dataclass
class SftpUploaderConfig(FsspecUploaderConfig):
    pass

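The sftp hunks also illustrate the second recurring change: dataclass `__post_init__` bookkeeping moves to pydantic's `model_post_init` hook, which runs after field validation. A hedged, standalone sketch of that hook (again with illustrative names, not the package's own classes):

    from typing import Any
    from urllib.parse import urlparse

    from pydantic import BaseModel, Field


    class DemoRemoteConfig(BaseModel):
        remote_url: str = Field(description="fsspec-style URL, e.g. sftp://host/dir/path")
        path_without_protocol: str = ""

        def model_post_init(self, __context: Any) -> None:
            # Called once validation has populated the declared fields; a good place
            # for derived fields and cross-field checks, much like __post_init__ was.
            parsed = urlparse(self.remote_url)
            if parsed.scheme != "sftp":
                raise ValueError(f"unsupported protocol: {parsed.scheme!r}")
            self.path_without_protocol = parsed.path.lstrip("/")


    cfg = DemoRemoteConfig(remote_url="sftp://example.com/inbox/report.pdf")
    print(cfg.path_without_protocol)  # inbox/report.pdf

The connector's real `SftpIndexerConfig` additionally calls `super().model_post_init(__context)` so the fsspec base class can run its own setup first, as the hunk above shows.
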

unstructured_ingest/v2/processes/connectors/google_drive.py

@@ -1,18 +1,18 @@
import io
-import
+import json
from dataclasses import dataclass, field
-from
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Generator, Optional

from dateutil import parser
-from
+from pydantic import Field, Secret

-from unstructured_ingest.enhanced_dataclass import enhanced_field
from unstructured_ingest.error import (
    SourceConnectionError,
    SourceConnectionNetworkError,
)
from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.utils.
+from unstructured_ingest.utils.google_filetype import GOOGLE_DRIVE_EXPORT_TYPES
from unstructured_ingest.v2.interfaces import (
    AccessConfig,
    ConnectionConfig,
@@ -37,46 +37,54 @@ if TYPE_CHECKING:
    from googleapiclient.http import MediaIoBaseDownload


-@dataclass
class GoogleDriveAccessConfig(AccessConfig):
-    service_account_key:
+    service_account_key: Optional[dict] = Field(
+        default=None, description="Credentials values to use for authentication"
+    )
+    service_account_key_path: Optional[Path] = Field(
+        default=None, description="File path to credentials values to use for authentication"
+    )
+
+    def model_post_init(self, __context: Any) -> None:
+        if self.service_account_key is None and self.service_account_key_path is None:
+            raise ValueError(
+                "either service_account_key or service_account_key_path must be provided"
+            )
+
+    def get_service_account_key(self) -> dict:
+        key_data = None
+        if self.service_account_key_path:
+            with self.service_account_key_path.open() as f:
+                key_data = json.load(f)
+        if key_data and self.service_account_key:
+            if key_data == self.service_account_key:
+                return key_data
+            else:
+                raise ValueError(
+                    "service_account_key and service_account_key_path "
+                    "both provided and have different values"
+                )
+        if key_data:
+            return key_data
+        return self.service_account_key


-@dataclass
class GoogleDriveConnectionConfig(ConnectionConfig):
-    drive_id: str
-    access_config: GoogleDriveAccessConfig
+    drive_id: str = Field(description="Google Drive File or Folder ID.")
+    access_config: Secret[GoogleDriveAccessConfig]

    @requires_dependencies(["googleapiclient"], extras="google-drive")
    def get_files_service(self) -> "GoogleAPIResource":
-        from google.auth import
+        from google.auth import exceptions
        from google.oauth2 import service_account
        from googleapiclient.discovery import build
        from googleapiclient.errors import HttpError

-
-
-        if isinstance(self.access_config.service_account_key, str):
-            key_path = json_to_dict(self.access_config.service_account_key)
-        elif isinstance(self.access_config.service_account_key, dict):
-            key_path = self.access_config.service_account_key
-        else:
-            raise TypeError(
-                f"access_config.service_account_key must be "
-                f"str or dict, got: {type(self.access_config.service_account_key)}"
-            )
+        access_config = self.access_config.get_secret_value()
+        key_data = access_config.get_service_account_key()

        try:
-
-                creds = service_account.Credentials.from_service_account_info(key_path)
-            elif isinstance(key_path, str):
-                os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = key_path
-                creds, _ = default()
-            else:
-                raise ValueError(
-                    f"key path not recognized as a dictionary or a file path: "
-                    f"[{type(key_path)}] {key_path}",
-                )
+            creds = service_account.Credentials.from_service_account_info(key_data)
            service = build("drive", "v3", credentials=creds)
            return service.files()

@@ -86,7 +94,6 @@ class GoogleDriveConnectionConfig(ConnectionConfig):
            raise ValueError("The provided API key is invalid.")


-@dataclass
class GoogleDriveIndexerConfig(IndexerConfig):
    extensions: Optional[list[str]] = None
    recursive: bool = False
@@ -268,7 +275,6 @@ class GoogleDriveIndexer(Indexer):
            yield f


-@dataclass
class GoogleDriveDownloaderConfig(DownloaderConfig):
    pass


unstructured_ingest/v2/processes/connectors/local.py

@@ -5,6 +5,8 @@ from pathlib import Path
from time import time
from typing import Any, Generator

+from pydantic import Field, Secret
+
from unstructured_ingest.v2.interfaces import (
    AccessConfig,
    ConnectionConfig,
@@ -29,20 +31,28 @@ from unstructured_ingest.v2.processes.connector_registry import (
CONNECTOR_TYPE = "local"


-@dataclass
class LocalAccessConfig(AccessConfig):
    pass


-
+SecretLocalAccessConfig = Secret[LocalAccessConfig]
+
+
class LocalConnectionConfig(ConnectionConfig):
-    access_config:
+    access_config: SecretLocalAccessConfig = Field(
+        default_factory=lambda: SecretLocalAccessConfig(secret_value=LocalAccessConfig())
+    )


-@dataclass
class LocalIndexerConfig(IndexerConfig):
-    input_path:
-
+    input_path: Path = Field(
+        description="Path to the location in the local file system that will be processed."
+    )
+    recursive: bool = Field(
+        default=False,
+        description="Recursively download files in their respective folders "
+        "otherwise stop at the files in provided folder level.",
+    )

    @property
    def path(self) -> Path:
@@ -122,7 +132,6 @@ class LocalIndexer(Indexer):
            yield file_data


-@dataclass
class LocalDownloaderConfig(DownloaderConfig):
    pass

@@ -130,10 +139,8 @@ class LocalDownloaderConfig(DownloaderConfig):
@dataclass
class LocalDownloader(Downloader):
    connector_type: str = CONNECTOR_TYPE
-    connection_config: LocalConnectionConfig = field(
-
-    )
-    download_config: LocalDownloaderConfig = field(default_factory=lambda: LocalDownloaderConfig())
+    connection_config: LocalConnectionConfig = field(default_factory=LocalConnectionConfig)
+    download_config: LocalDownloaderConfig = field(default_factory=LocalDownloaderConfig)

    def get_download_path(self, file_data: FileData) -> Path:
        return Path(file_data.source_identifiers.fullpath)
@@ -144,9 +151,10 @@ class LocalDownloader(Downloader):
        )


-@dataclass
class LocalUploaderConfig(UploaderConfig):
-    output_dir: str =
+    output_dir: str = Field(
+        default="structured-output", description="Local path to write partitioned output to"
+    )

    @property
    def output_path(self) -> Path:
@@ -160,7 +168,7 @@ class LocalUploaderConfig(UploaderConfig):
@dataclass
class LocalUploader(Uploader):
    connector_type: str = CONNECTOR_TYPE
-    upload_config: LocalUploaderConfig = field(default_factory=
+    upload_config: LocalUploaderConfig = field(default_factory=LocalUploaderConfig)
    connection_config: LocalConnectionConfig = field(
        default_factory=lambda: LocalConnectionConfig()
    )

unstructured_ingest/v2/processes/connectors/milvus.py

@@ -6,8 +6,8 @@ from typing import TYPE_CHECKING, Any, Optional, Union

import pandas as pd
from dateutil import parser
+from pydantic import Field, Secret

-from unstructured_ingest.enhanced_dataclass import enhanced_field
from unstructured_ingest.error import WriteError
from unstructured_ingest.utils.data_prep import flatten_dict
from unstructured_ingest.utils.dep_check import requires_dependencies
@@ -32,24 +32,28 @@ if TYPE_CHECKING:
CONNECTOR_TYPE = "milvus"


-@dataclass
class MilvusAccessConfig(AccessConfig):
-    password: Optional[str] = None
-    token: Optional[str] = None
+    password: Optional[str] = Field(default=None, description="Milvus password")
+    token: Optional[str] = Field(default=None, description="Milvus access token")
+
+
+SecretMilvusAccessConfig = Secret[MilvusAccessConfig]


-@dataclass
class MilvusConnectionConfig(ConnectionConfig):
-    access_config:
-
+    access_config: SecretMilvusAccessConfig = Field(
+        default_factory=lambda: SecretMilvusAccessConfig(secret_value=MilvusAccessConfig())
    )
-    uri: Optional[str] =
-
-
+    uri: Optional[str] = Field(
+        default=None, description="Milvus uri", examples=["http://localhost:19530"]
+    )
+    user: Optional[str] = Field(default=None, description="Milvus user")
+    db_name: Optional[str] = Field(default=None, description="Milvus database name")

    def get_connection_kwargs(self) -> dict[str, Any]:
-
-
+        access_config = self.access_config.get_secret_value()
+        access_config_dict = access_config.dict()
+        connection_config_dict = self.dict()
        connection_config_dict.pop("access_config", None)
        connection_config_dict.update(access_config_dict)
        # Drop any that were not set explicitly
@@ -63,7 +67,6 @@ class MilvusConnectionConfig(ConnectionConfig):
        return MilvusClient(**self.get_connection_kwargs())


-@dataclass
class MilvusUploadStagerConfig(UploadStagerConfig):
    pass

@@ -130,10 +133,11 @@ class MilvusUploadStager(UploadStager):
        return output_path


-@dataclass
class MilvusUploaderConfig(UploaderConfig):
-    collection_name: str
-
+    collection_name: str = Field(description="Milvus collections to write to")
+    num_processes: int = Field(
+        default=4, description="number of processes to use when writing to support parallel writes"
+    )


@dataclass
@@ -180,13 +184,13 @@ class MilvusUploader(Uploader):
        self.insert_results(data=data)

    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
-        if self.upload_config.
+        if self.upload_config.num_processes == 1:
            for content in contents:
                self.upload(content=content)

        else:
            with mp.Pool(
-                processes=self.upload_config.
+                processes=self.upload_config.num_processes,
            ) as pool:
                pool.map(self.upload, contents)


unstructured_ingest/v2/processes/connectors/mongodb.py

@@ -3,9 +3,9 @@ from dataclasses import dataclass, field
from pathlib import Path
from typing import TYPE_CHECKING, Any, Optional

-from
+from pydantic import Field, Secret

-from unstructured_ingest.
+from unstructured_ingest.__version__ import __version__ as unstructured_version
from unstructured_ingest.error import DestinationConnectionError
from unstructured_ingest.utils.data_prep import batch_generator
from unstructured_ingest.utils.dep_check import requires_dependencies
@@ -31,25 +31,28 @@ CONNECTOR_TYPE = "mongodb"
SERVER_API_VERSION = "1"


-@dataclass
class MongoDBAccessConfig(AccessConfig):
-    uri: Optional[str] = None
+    uri: Optional[str] = Field(default=None, description="URI to user when connecting")
+
+
+SecretMongoDBAccessConfig = Secret[MongoDBAccessConfig]


-@dataclass
class MongoDBConnectionConfig(ConnectionConfig):
-    access_config:
-
+    access_config: SecretMongoDBAccessConfig = Field(
+        default_factory=lambda: SecretMongoDBAccessConfig(secret_value=MongoDBAccessConfig())
    )
-    host: Optional[str] =
-
-
-
-
-
+    host: Optional[str] = Field(
+        default=None,
+        description="hostname or IP address or Unix domain socket path of a single mongod or "
+        "mongos instance to connect to, or a list of hostnames",
+    )
+    database: Optional[str] = Field(default=None, description="database name to connect to")
+    collection: Optional[str] = Field(default=None, description="collection name to connect to")
+    port: int = Field(default=27017)
+    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)


-@dataclass
class MongoDBUploadStagerConfig(UploadStagerConfig):
    pass

@@ -77,9 +80,8 @@ class MongoDBUploadStager(UploadStager):
        return output_path


-@dataclass
class MongoDBUploaderConfig(UploaderConfig):
-    batch_size: int = 100
+    batch_size: int = Field(default=100, description="Number of records per batch")


@dataclass
@@ -102,9 +104,11 @@ class MongoDBUploader(Uploader):
        from pymongo.driver_info import DriverInfo
        from pymongo.server_api import ServerApi

-
+        access_config = self.connection_config.access_config.get_secret_value()
+
+        if access_config.uri:
            return MongoClient(
-
+                access_config.uri,
                server_api=ServerApi(version=SERVER_API_VERSION),
                driver=DriverInfo(name="unstructured", version=unstructured_version),
            )

unstructured_ingest/v2/processes/connectors/onedrive.py

@@ -1,12 +1,12 @@
import json
-from dataclasses import dataclass
+from dataclasses import dataclass
from pathlib import Path
from time import time
from typing import TYPE_CHECKING, Any, Generator, Optional

from dateutil import parser
+from pydantic import Field, Secret

-from unstructured_ingest.enhanced_dataclass import enhanced_field
from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
from unstructured_ingest.utils.dep_check import requires_dependencies
from unstructured_ingest.v2.interfaces import (
@@ -35,18 +35,23 @@ CONNECTOR_TYPE = "onedrive"
MAX_MB_SIZE = 512_000_000


-@dataclass
class OnedriveAccessConfig(AccessConfig):
-    client_cred: str
+    client_cred: str = Field(description="Microsoft App client secret")


-@dataclass
class OnedriveConnectionConfig(ConnectionConfig):
-    client_id: str
-    user_pname: str
-    tenant: str =
-
-
+    client_id: str = Field(description="Microsoft app client ID")
+    user_pname: str = Field(description="User principal name, usually is your Azure AD email.")
+    tenant: str = Field(
+        repr=False, description="ID or domain name associated with your Azure AD instance"
+    )
+    authority_url: Optional[str] = Field(
+        repr=False,
+        default="https://login.microsoftonline.com",
+        examples=["https://login.microsoftonline.com"],
+        description="Authentication token provider for Microsoft apps",
+    )
+    access_config: Secret[OnedriveAccessConfig]

    @requires_dependencies(["msal"], extras="onedrive")
    def get_token(self):
@@ -56,7 +61,7 @@ class OnedriveConnectionConfig(ConnectionConfig):
            app = ConfidentialClientApplication(
                authority=f"{self.authority_url}/{self.tenant}",
                client_id=self.client_id,
-                client_credential=self.access_config.client_cred,
+                client_credential=self.access_config.get_secret_value().client_cred,
            )
            token = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])
        except ValueError as exc:
@@ -76,9 +81,8 @@ class OnedriveConnectionConfig(ConnectionConfig):
        return client


-@dataclass
class OnedriveIndexerConfig(IndexerConfig):
-    path: Optional[str] =
+    path: Optional[str] = Field(default="")
    recursive: bool = False

@@ -171,7 +175,6 @@ class OnedriveIndexer(Indexer):
            yield file_data


-@dataclass
class OnedriveDownloaderConfig(DownloaderConfig):
    pass
