unstructured-ingest 0.0.2.dev0__py3-none-any.whl → 0.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of unstructured-ingest might be problematic.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/cli.py +6 -1
- unstructured_ingest/cli/cmds/__init__.py +4 -4
- unstructured_ingest/cli/cmds/{astra.py → astradb.py} +9 -9
- unstructured_ingest/cli/interfaces.py +13 -6
- unstructured_ingest/connector/{astra.py → astradb.py} +29 -29
- unstructured_ingest/connector/biomed.py +12 -5
- unstructured_ingest/connector/confluence.py +3 -3
- unstructured_ingest/connector/github.py +3 -2
- unstructured_ingest/connector/google_drive.py +1 -2
- unstructured_ingest/connector/mongodb.py +1 -2
- unstructured_ingest/connector/notion/client.py +31 -16
- unstructured_ingest/connector/notion/connector.py +3 -2
- unstructured_ingest/connector/registry.py +2 -2
- unstructured_ingest/connector/vectara.py +7 -2
- unstructured_ingest/interfaces.py +13 -9
- unstructured_ingest/pipeline/interfaces.py +8 -3
- unstructured_ingest/pipeline/reformat/chunking.py +13 -9
- unstructured_ingest/pipeline/reformat/embedding.py +3 -3
- unstructured_ingest/runner/__init__.py +2 -2
- unstructured_ingest/runner/{astra.py → astradb.py} +7 -7
- unstructured_ingest/runner/writers/__init__.py +2 -2
- unstructured_ingest/runner/writers/{astra.py → astradb.py} +7 -7
- unstructured_ingest/utils/chunking.py +45 -0
- unstructured_ingest/utils/dep_check.py +1 -1
- unstructured_ingest/utils/google_filetype.py +9 -0
- unstructured_ingest/v2/cli/base/cmd.py +66 -12
- unstructured_ingest/v2/cli/base/dest.py +21 -12
- unstructured_ingest/v2/cli/base/src.py +35 -21
- unstructured_ingest/v2/cli/cmds.py +14 -0
- unstructured_ingest/v2/cli/{utils.py → utils/click.py} +36 -89
- unstructured_ingest/v2/cli/utils/model_conversion.py +199 -0
- unstructured_ingest/v2/interfaces/__init__.py +2 -1
- unstructured_ingest/v2/interfaces/connector.py +5 -7
- unstructured_ingest/v2/interfaces/downloader.py +17 -8
- unstructured_ingest/v2/interfaces/file_data.py +13 -2
- unstructured_ingest/v2/interfaces/indexer.py +3 -4
- unstructured_ingest/v2/interfaces/process.py +3 -4
- unstructured_ingest/v2/interfaces/processor.py +10 -10
- unstructured_ingest/v2/interfaces/upload_stager.py +3 -3
- unstructured_ingest/v2/interfaces/uploader.py +3 -3
- unstructured_ingest/v2/pipeline/interfaces.py +3 -5
- unstructured_ingest/v2/pipeline/pipeline.py +73 -7
- unstructured_ingest/v2/pipeline/steps/chunk.py +5 -11
- unstructured_ingest/v2/pipeline/steps/download.py +90 -24
- unstructured_ingest/v2/pipeline/steps/embed.py +5 -11
- unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
- unstructured_ingest/v2/pipeline/steps/index.py +14 -10
- unstructured_ingest/v2/pipeline/steps/partition.py +5 -5
- unstructured_ingest/v2/pipeline/steps/stage.py +4 -7
- unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -6
- unstructured_ingest/v2/pipeline/steps/upload.py +2 -9
- unstructured_ingest/v2/processes/__init__.py +18 -0
- unstructured_ingest/v2/processes/chunker.py +74 -28
- unstructured_ingest/v2/processes/connector_registry.py +8 -2
- unstructured_ingest/v2/processes/connectors/__init__.py +13 -3
- unstructured_ingest/v2/processes/connectors/{astra.py → astradb.py} +53 -35
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +38 -27
- unstructured_ingest/v2/processes/connectors/chroma.py +38 -27
- unstructured_ingest/v2/processes/connectors/couchbase.py +151 -0
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +95 -31
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +92 -53
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +47 -16
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +23 -13
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +18 -11
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +49 -61
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +46 -13
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +50 -20
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +27 -28
- unstructured_ingest/v2/processes/connectors/google_drive.py +52 -42
- unstructured_ingest/v2/processes/connectors/local.py +36 -28
- unstructured_ingest/v2/processes/connectors/milvus.py +22 -18
- unstructured_ingest/v2/processes/connectors/mongodb.py +32 -22
- unstructured_ingest/v2/processes/connectors/onedrive.py +31 -16
- unstructured_ingest/v2/processes/connectors/opensearch.py +81 -43
- unstructured_ingest/v2/processes/connectors/pinecone.py +29 -23
- unstructured_ingest/v2/processes/connectors/salesforce.py +36 -26
- unstructured_ingest/v2/processes/connectors/sharepoint.py +64 -33
- unstructured_ingest/v2/processes/connectors/singlestore.py +11 -15
- unstructured_ingest/v2/processes/connectors/sql.py +52 -39
- unstructured_ingest/v2/processes/connectors/weaviate.py +35 -18
- unstructured_ingest/v2/processes/embedder.py +106 -47
- unstructured_ingest/v2/processes/filter.py +60 -0
- unstructured_ingest/v2/processes/partitioner.py +79 -33
- unstructured_ingest/v2/processes/uncompress.py +3 -3
- unstructured_ingest/v2/utils.py +45 -0
- unstructured_ingest-0.0.4.dist-info/METADATA +571 -0
- {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/RECORD +92 -116
- {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/WHEEL +1 -1
- unstructured_ingest/v2/cli/cmds/__init__.py +0 -89
- unstructured_ingest/v2/cli/cmds/astra.py +0 -85
- unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +0 -72
- unstructured_ingest/v2/cli/cmds/chroma.py +0 -108
- unstructured_ingest/v2/cli/cmds/databricks_volumes.py +0 -161
- unstructured_ingest/v2/cli/cmds/elasticsearch.py +0 -159
- unstructured_ingest/v2/cli/cmds/fsspec/azure.py +0 -84
- unstructured_ingest/v2/cli/cmds/fsspec/box.py +0 -58
- unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +0 -58
- unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +0 -77
- unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +0 -81
- unstructured_ingest/v2/cli/cmds/fsspec/s3.py +0 -84
- unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +0 -80
- unstructured_ingest/v2/cli/cmds/google_drive.py +0 -74
- unstructured_ingest/v2/cli/cmds/local.py +0 -60
- unstructured_ingest/v2/cli/cmds/milvus.py +0 -72
- unstructured_ingest/v2/cli/cmds/mongodb.py +0 -62
- unstructured_ingest/v2/cli/cmds/onedrive.py +0 -91
- unstructured_ingest/v2/cli/cmds/opensearch.py +0 -93
- unstructured_ingest/v2/cli/cmds/pinecone.py +0 -62
- unstructured_ingest/v2/cli/cmds/salesforce.py +0 -79
- unstructured_ingest/v2/cli/cmds/sharepoint.py +0 -112
- unstructured_ingest/v2/cli/cmds/singlestore.py +0 -96
- unstructured_ingest/v2/cli/cmds/sql.py +0 -84
- unstructured_ingest/v2/cli/cmds/weaviate.py +0 -100
- unstructured_ingest/v2/cli/configs/__init__.py +0 -6
- unstructured_ingest/v2/cli/configs/chunk.py +0 -89
- unstructured_ingest/v2/cli/configs/embed.py +0 -74
- unstructured_ingest/v2/cli/configs/partition.py +0 -99
- unstructured_ingest/v2/cli/configs/processor.py +0 -88
- unstructured_ingest/v2/cli/interfaces.py +0 -27
- unstructured_ingest/v2/pipeline/utils.py +0 -15
- unstructured_ingest-0.0.2.dev0.dist-info/METADATA +0 -321
- /unstructured_ingest/v2/cli/{cmds/fsspec → utils}/__init__.py +0 -0
- {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/fsspec/s3.py

@@ -5,11 +5,15 @@ from pathlib import Path
 from time import time
 from typing import Any, Generator, Optional

-from
+from pydantic import Field, Secret

-from unstructured_ingest.enhanced_dataclass import enhanced_field
 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces import
+from unstructured_ingest.v2.interfaces import (
+    DownloadResponse,
+    FileData,
+    FileDataSourceMetadata,
+    UploadContent,
+)
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -28,27 +32,41 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
 CONNECTOR_TYPE = "s3"


-@dataclass
 class S3IndexerConfig(FsspecIndexerConfig):
     pass


-@dataclass
 class S3AccessConfig(FsspecAccessConfig):
-    key: Optional[str] =
-
-
+    key: Optional[str] = Field(
+        default=None,
+        description="If not anonymous, use this access key ID, if specified. Takes precedence "
+        "over `aws_access_key_id` in client_kwargs.",
+    )
+    secret: Optional[str] = Field(
+        default=None, description="If not anonymous, use this secret access key, if specified."
+    )
+    token: Optional[str] = Field(
+        default=None, description="If not anonymous, use this security token, if specified."
+    )
+
+
+SecretS3AccessConfig = Secret[S3AccessConfig]


-@dataclass
 class S3ConnectionConfig(FsspecConnectionConfig):
-    supported_protocols: list[str] = field(default_factory=lambda: ["s3", "s3a"])
-    access_config:
-
+    supported_protocols: list[str] = field(default_factory=lambda: ["s3", "s3a"], init=False)
+    access_config: SecretS3AccessConfig = Field(
+        default_factory=lambda: SecretS3AccessConfig(secret_value=S3AccessConfig())
     )
-    endpoint_url: Optional[str] =
-
-
+    endpoint_url: Optional[str] = Field(
+        default=None,
+        description="Use this endpoint_url, if specified. Needed for "
+        "connecting to non-AWS S3 buckets.",
+    )
+    anonymous: bool = Field(
+        default=False, description="Connect to s3 without local AWS credentials."
+    )
+    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)

     def get_access_config(self) -> dict[str, Any]:
         access_configs: dict[str, Any] = {"anon": self.anonymous}
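
The hunk above is the heart of this release's config migration: dataclass configs become pydantic models, and credential-bearing access configs are wrapped in pydantic's Secret so they stay masked unless explicitly unwrapped. A minimal sketch of that pattern, using illustrative BaseModel stand-ins rather than the package's Fsspec base classes (the generic Secret type assumes a recent pydantic v2):

from typing import Optional

from pydantic import BaseModel, Field, Secret


class AccessConfig(BaseModel):  # illustrative stand-in for S3AccessConfig
    key: Optional[str] = Field(default=None, description="access key ID")
    secret: Optional[str] = Field(default=None, description="secret access key")


class ConnectionConfig(BaseModel):  # illustrative stand-in for S3ConnectionConfig
    access_config: Secret[AccessConfig] = Field(
        default_factory=lambda: Secret(AccessConfig())
    )
    anonymous: bool = False


config = ConnectionConfig(access_config=AccessConfig(key="my-key", secret="my-secret"))
print(config)  # access_config renders masked: Secret('**********')
print(config.access_config.get_secret_value().key)  # explicit unwrap: my-key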
@@ -56,7 +74,9 @@ class S3ConnectionConfig(FsspecConnectionConfig):
             access_configs["endpoint_url"] = self.endpoint_url

         # Avoid injecting None by filtering out k,v pairs where the value is None
-        access_configs.update(
+        access_configs.update(
+            {k: v for k, v in self.access_config.get_secret_value().dict().items() if v}
+        )
         return access_configs


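
One subtlety in the rewritten update above: `if v` filters anything falsy, not just None, so empty strings (and a False value) would also be dropped. A quick illustration:

access_config = {"key": "abc", "secret": None, "token": ""}
print({k: v for k, v in access_config.items() if v})              # {'key': 'abc'}
print({k: v for k, v in access_config.items() if v is not None})  # would keep token=""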
@@ -66,9 +86,10 @@ class S3Indexer(FsspecIndexer):
     index_config: S3IndexerConfig
     connector_type: str = CONNECTOR_TYPE

-    def get_metadata(self, path: str) ->
+    def get_metadata(self, path: str) -> FileDataSourceMetadata:
         date_created = None
         date_modified = None
+        file_size = None
         try:
             modified: Optional[datetime] = self.fs.modified(path)
             if modified:

@@ -76,6 +97,8 @@ class S3Indexer(FsspecIndexer):
                 date_modified = str(modified.timestamp())
         except NotImplementedError:
             pass
+        with contextlib.suppress(AttributeError):
+            file_size = self.fs.size(path)

         version = None
         info: dict[str, Any] = self.fs.info(path)

@@ -90,21 +113,25 @@ class S3Indexer(FsspecIndexer):
         }
         if metadata:
             record_locator["metadata"] = metadata
-        return
+        return FileDataSourceMetadata(
             date_created=date_created,
             date_modified=date_modified,
             date_processed=str(time()),
             version=version,
             url=f"{self.index_config.protocol}://{path}",
             record_locator=record_locator,
+            filesize_bytes=file_size,
         )

     @requires_dependencies(["s3fs", "fsspec"], extras="s3")
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
         return super().run(**kwargs)

+    @requires_dependencies(["s3fs", "fsspec"], extras="s3")
+    def precheck(self) -> None:
+        super().precheck()
+

-@dataclass
 class S3DownloaderConfig(FsspecDownloaderConfig):
     pass

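
The indexer now records filesize_bytes, guarding the lookup with contextlib.suppress for filesystem implementations that lack size(). A runnable sketch of the same idiom against fsspec's in-memory filesystem (s3fs exposes the same call through the fsspec API; the path is illustrative):

import contextlib

import fsspec

fs = fsspec.filesystem("memory")
with fs.open("/bucket/doc.txt", "wb") as f:
    f.write(b"hello")

file_size = None
with contextlib.suppress(AttributeError):  # same guard as get_metadata above
    file_size = fs.size("/bucket/doc.txt")
print(file_size)  # 5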
@@ -125,7 +152,6 @@ class S3Downloader(FsspecDownloader):
         return await super().run_async(file_data=file_data, **kwargs)


-@dataclass
 class S3UploaderConfig(FsspecUploaderConfig):
     pass


@@ -136,6 +162,10 @@ class S3Uploader(FsspecUploader):
     connection_config: S3ConnectionConfig
     upload_config: S3UploaderConfig = field(default=None)

+    @requires_dependencies(["s3fs", "fsspec"], extras="s3")
+    def precheck(self) -> None:
+        super().precheck()
+
     @requires_dependencies(["s3fs", "fsspec"], extras="s3")
     def __post_init__(self):
         super().__post_init__()
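
Both ends of the S3 connector gain a precheck() hook in this release, so connection problems surface before any documents move. A hypothetical sketch of how a caller might use it (validate_connectors and its arguments are illustrative, not the package API):

def validate_connectors(*steps) -> None:
    """Fail fast: run every step's precheck() before starting a pipeline."""
    for step in steps:
        precheck = getattr(step, "precheck", None)
        if callable(precheck):
            precheck()  # expected to raise a connection error on bad credentials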
unstructured_ingest/v2/processes/connectors/fsspec/sftp.py

@@ -6,7 +6,8 @@ from pathlib import Path
 from typing import Any, Generator, Optional
 from urllib.parse import urlparse

-from
+from pydantic import Field, Secret
+
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, UploadContent
 from unstructured_ingest.v2.processes.connector_registry import (

@@ -27,10 +28,10 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
 CONNECTOR_TYPE = "sftp"


-@dataclass
 class SftpIndexerConfig(FsspecIndexerConfig):
-
-
+
+    def model_post_init(self, __context: Any) -> None:
+        super().model_post_init(__context)
         _, ext = os.path.splitext(self.remote_url)
         parsed_url = urlparse(self.remote_url)
         if ext:

@@ -39,21 +40,21 @@ class SftpIndexerConfig(FsspecIndexerConfig):
             self.path_without_protocol = parsed_url.path.lstrip("/")


-@dataclass
 class SftpAccessConfig(FsspecAccessConfig):
-    password: str
+    password: str = Field(description="Password for sftp connection")


-@dataclass
 class SftpConnectionConfig(FsspecConnectionConfig):
-    supported_protocols: list[str] =
-    access_config: SftpAccessConfig
-    connector_type: str = CONNECTOR_TYPE
-    username:
-    host: Optional[str] = None
-    port: int = 22
-    look_for_keys: bool =
-
+    supported_protocols: list[str] = Field(default_factory=lambda: ["sftp"], init=False)
+    access_config: Secret[SftpAccessConfig]
+    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
+    username: str = Field(description="Username for sftp connection")
+    host: Optional[str] = Field(default=None, description="Hostname for sftp connection")
+    port: int = Field(default=22, description="Port for sftp connection")
+    look_for_keys: bool = Field(
+        default=False, description="Whether to search for private key files in ~/.ssh/"
+    )
+    allow_agent: bool = Field(default=False, description="Whether to connect to the SSH agent.")

     def get_access_config(self) -> dict[str, Any]:
         access_config = {

@@ -62,7 +63,7 @@ class SftpConnectionConfig(FsspecConnectionConfig):
             "port": self.port,
             "look_for_keys": self.look_for_keys,
             "allow_agent": self.allow_agent,
-            "password": self.access_config.password,
+            "password": self.access_config.get_secret_value().password,
         }
         return access_config

@@ -91,25 +92,20 @@ class SftpIndexer(FsspecIndexer):
             file.identifier = new_identifier
             yield file

+    @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
+    def precheck(self) -> None:
+        super().precheck()
+

-@dataclass
 class SftpDownloaderConfig(FsspecDownloaderConfig):
-    remote_url:
-
-    def __post_init__(self):
-        # TODO once python3.9 no longer supported and kw_only is allowed in dataclasses, remove:
-        if not self.remote_url:
-            raise TypeError(
-                f"{self.__class__.__name__}.__init__() "
-                f"missing 1 required positional argument: 'remote_url'"
-            )
+    remote_url: str = Field(description="Remote fsspec URL formatted as `protocol://dir/path`")


 @dataclass
 class SftpDownloader(FsspecDownloader):
     protocol: str = "sftp"
     connection_config: SftpConnectionConfig
-    connector_type: str = CONNECTOR_TYPE
+    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
     download_config: Optional[SftpDownloaderConfig] = field(default_factory=SftpDownloaderConfig)

     @requires_dependencies(["paramiko", "fsspec"], extras="sftp")

@@ -127,7 +123,6 @@ class SftpDownloader(FsspecDownloader):
         return await super().run_async(file_data=file_data, **kwargs)


-@dataclass
 class SftpUploaderConfig(FsspecUploaderConfig):
     pass


@@ -142,6 +137,10 @@ class SftpUploader(FsspecUploader):
     def __post_init__(self):
         super().__post_init__()

+    @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
+    def precheck(self) -> None:
+        super().precheck()
+
     @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
     def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
         return super().run(contents=contents, **kwargs)
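
SftpIndexerConfig above trades the dataclass __post_init__ hook for pydantic's model_post_init, which runs after field validation. A self-contained sketch of that lifecycle (illustrative class, not the package's config):

from typing import Any
from urllib.parse import urlparse

from pydantic import BaseModel


class RemoteConfig(BaseModel):  # illustrative stand-in for SftpIndexerConfig
    remote_url: str
    path_without_protocol: str = ""

    def model_post_init(self, __context: Any) -> None:
        # runs after validation, like __post_init__ did for the dataclass
        self.path_without_protocol = urlparse(self.remote_url).path.lstrip("/")


print(RemoteConfig(remote_url="sftp://host/dir/file.txt").path_without_protocol)
# dir/file.txt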
unstructured_ingest/v2/processes/connectors/google_drive.py

@@ -1,23 +1,25 @@
 import io
-import
+import json
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Generator, Optional
+from typing import TYPE_CHECKING, Any, Generator, Optional

 from dateutil import parser
-from
-from unstructured.file_utils.google_filetype import GOOGLE_DRIVE_EXPORT_TYPES
+from pydantic import Field, Secret

-from unstructured_ingest.
-
+from unstructured_ingest.error import (
+    SourceConnectionError,
+    SourceConnectionNetworkError,
+)
 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.utils.
+from unstructured_ingest.utils.google_filetype import GOOGLE_DRIVE_EXPORT_TYPES
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
     Downloader,
     DownloaderConfig,
     FileData,
+    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,

@@ -35,46 +37,54 @@ if TYPE_CHECKING:
     from googleapiclient.http import MediaIoBaseDownload


-@dataclass
 class GoogleDriveAccessConfig(AccessConfig):
-    service_account_key:
+    service_account_key: Optional[dict] = Field(
+        default=None, description="Credentials values to use for authentication"
+    )
+    service_account_key_path: Optional[Path] = Field(
+        default=None, description="File path to credentials values to use for authentication"
+    )
+
+    def model_post_init(self, __context: Any) -> None:
+        if self.service_account_key is None and self.service_account_key_path is None:
+            raise ValueError(
+                "either service_account_key or service_account_key_path must be provided"
+            )
+
+    def get_service_account_key(self) -> dict:
+        key_data = None
+        if self.service_account_key_path:
+            with self.service_account_key_path.open() as f:
+                key_data = json.load(f)
+        if key_data and self.service_account_key:
+            if key_data == self.service_account_key:
+                return key_data
+            else:
+                raise ValueError(
+                    "service_account_key and service_account_key_path "
+                    "both provided and have different values"
+                )
+        if key_data:
+            return key_data
+        return self.service_account_key


-@dataclass
 class GoogleDriveConnectionConfig(ConnectionConfig):
-    drive_id: str
-    access_config: GoogleDriveAccessConfig
+    drive_id: str = Field(description="Google Drive File or Folder ID.")
+    access_config: Secret[GoogleDriveAccessConfig]

     @requires_dependencies(["googleapiclient"], extras="google-drive")
     def get_files_service(self) -> "GoogleAPIResource":
-        from google.auth import
+        from google.auth import exceptions
         from google.oauth2 import service_account
         from googleapiclient.discovery import build
         from googleapiclient.errors import HttpError

-
-
-        if isinstance(self.access_config.service_account_key, str):
-            key_path = json_to_dict(self.access_config.service_account_key)
-        elif isinstance(self.access_config.service_account_key, dict):
-            key_path = self.access_config.service_account_key
-        else:
-            raise TypeError(
-                f"access_config.service_account_key must be "
-                f"str or dict, got: {type(self.access_config.service_account_key)}"
-            )
+        access_config = self.access_config.get_secret_value()
+        key_data = access_config.get_service_account_key()

         try:
-
-            creds = service_account.Credentials.from_service_account_info(key_path)
-        elif isinstance(key_path, str):
-            os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = key_path
-            creds, _ = default()
-        else:
-            raise ValueError(
-                f"key path not recognized as a dictionary or a file path: "
-                f"[{type(key_path)}] {key_path}",
-            )
+            creds = service_account.Credentials.from_service_account_info(key_data)
             service = build("drive", "v3", credentials=creds)
             return service.files()

@@ -84,7 +94,6 @@ class GoogleDriveConnectionConfig(ConnectionConfig):
             raise ValueError("The provided API key is invalid.")


-@dataclass
 class GoogleDriveIndexerConfig(IndexerConfig):
     extensions: Optional[list[str]] = None
     recursive: bool = False

@@ -121,6 +130,13 @@ class GoogleDriveIndexer(Indexer):
         ]
     )

+    def precheck(self) -> None:
+        try:
+            self.connection_config.get_files_service()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")
+
     @staticmethod
     def is_dir(record: dict) -> bool:
         return record.get("mimeType") == "application/vnd.google-apps.folder"

@@ -155,7 +171,7 @@ class GoogleDriveIndexer(Indexer):
             connector_type=CONNECTOR_TYPE,
             identifier=file_id,
             source_identifiers=source_identifiers,
-            metadata=
+            metadata=FileDataSourceMetadata(
                 url=url,
                 version=version,
                 date_created=str(date_created_dt.timestamp()),

@@ -259,7 +275,6 @@ class GoogleDriveIndexer(Indexer):
             yield f


-@dataclass
 class GoogleDriveDownloaderConfig(DownloaderConfig):
     pass

@@ -272,11 +287,6 @@ class GoogleDriveDownloader(Downloader):
     )
     connector_type: str = CONNECTOR_TYPE

-    def get_download_path(self, file_data: FileData) -> Path:
-        rel_path = file_data.source_identifiers.relative_path
-        rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path
-        return self.download_dir / Path(rel_path)
-
     @SourceConnectionNetworkError.wrap
     def _get_content(self, downloader: "MediaIoBaseDownload") -> bool:
         downloaded = False
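
GoogleDriveAccessConfig now accepts the service-account key inline, as a file path, or both, and get_service_account_key() above reconciles them. A condensed sketch of the same resolution rules (the function name is illustrative):

import json
from pathlib import Path
from typing import Optional


def resolve_service_account_key(key: Optional[dict], key_path: Optional[Path]) -> dict:
    # load from file if a path was given
    key_data = json.loads(key_path.read_text()) if key_path else None
    # both provided: they must agree
    if key_data and key and key_data != key:
        raise ValueError("service_account_key and service_account_key_path differ")
    if key_data is None and key is None:
        raise ValueError("either service_account_key or service_account_key_path must be provided")
    return key_data or key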
unstructured_ingest/v2/processes/connectors/local.py

@@ -1,12 +1,11 @@
 import glob
-import itertools
 import shutil
 from dataclasses import dataclass, field
 from pathlib import Path
 from time import time
-from typing import Any, Generator
+from typing import Any, Generator

-from
+from pydantic import Field, Secret

 from unstructured_ingest.v2.interfaces import (
     AccessConfig,

@@ -15,6 +14,7 @@ from unstructured_ingest.v2.interfaces import (
     DownloaderConfig,
     DownloadResponse,
     FileData,
+    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,

@@ -31,21 +31,28 @@ from unstructured_ingest.v2.processes.connector_registry import (
 CONNECTOR_TYPE = "local"


-@dataclass
 class LocalAccessConfig(AccessConfig):
     pass


-
+SecretLocalAccessConfig = Secret[LocalAccessConfig]
+
+
 class LocalConnectionConfig(ConnectionConfig):
-    access_config:
+    access_config: SecretLocalAccessConfig = Field(
+        default_factory=lambda: SecretLocalAccessConfig(secret_value=LocalAccessConfig())
+    )


-@dataclass
 class LocalIndexerConfig(IndexerConfig):
-    input_path:
-
-
+    input_path: Path = Field(
+        description="Path to the location in the local file system that will be processed."
+    )
+    recursive: bool = Field(
+        default=False,
+        description="Recursively download files in their respective folders "
+        "otherwise stop at the files in provided folder level.",
+    )

     @property
     def path(self) -> Path:

@@ -64,16 +71,11 @@ class LocalIndexer(Indexer):
         input_path = self.index_config.path
         if input_path.is_file():
             return [Path(s) for s in glob.glob(f"{self.index_config.path}")]
-
-
-
-        return list(
-            itertools.chain.from_iterable(
-                glob_fn(pattern) for pattern in self.index_config.file_glob
-            )
-        )
+        if self.index_config.recursive:
+            return list(input_path.rglob("*"))
+        return list(input_path.glob("*"))

-    def get_file_metadata(self, path: Path) ->
+    def get_file_metadata(self, path: Path) -> FileDataSourceMetadata:
         stats = path.stat()
         try:
             date_modified = str(stats.st_mtime)

@@ -93,12 +95,20 @@ class LocalIndexer(Indexer):
         except Exception as e:
             logger.warning(f"Couldn't detect file mode: {e}")
             permissions_data = None
-
+
+        try:
+            filesize_bytes = stats.st_size
+        except Exception as e:
+            logger.warning(f"Couldn't detect file size: {e}")
+            filesize_bytes = None
+
+        return FileDataSourceMetadata(
             date_modified=date_modified,
             date_created=date_created,
             date_processed=str(time()),
             permissions_data=permissions_data,
             record_locator={"path": str(path.resolve())},
+            filesize_bytes=filesize_bytes,
         )

     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:

@@ -122,7 +132,6 @@ class LocalIndexer(Indexer):
             yield file_data


-@dataclass
 class LocalDownloaderConfig(DownloaderConfig):
     pass

@@ -130,10 +139,8 @@ class LocalDownloaderConfig(DownloaderConfig):
 @dataclass
 class LocalDownloader(Downloader):
     connector_type: str = CONNECTOR_TYPE
-    connection_config: LocalConnectionConfig = field(
-
-    )
-    download_config: LocalDownloaderConfig = field(default_factory=lambda: LocalDownloaderConfig())
+    connection_config: LocalConnectionConfig = field(default_factory=LocalConnectionConfig)
+    download_config: LocalDownloaderConfig = field(default_factory=LocalDownloaderConfig)

     def get_download_path(self, file_data: FileData) -> Path:
         return Path(file_data.source_identifiers.fullpath)

@@ -144,9 +151,10 @@ class LocalDownloader(Downloader):
         )


-@dataclass
 class LocalUploaderConfig(UploaderConfig):
-    output_dir: str =
+    output_dir: str = Field(
+        default="structured-output", description="Local path to write partitioned output to"
+    )

     @property
     def output_path(self) -> Path:

@@ -160,7 +168,7 @@ class LocalUploaderConfig(UploaderConfig):
 @dataclass
 class LocalUploader(Uploader):
     connector_type: str = CONNECTOR_TYPE
-    upload_config: LocalUploaderConfig = field(default_factory=
+    upload_config: LocalUploaderConfig = field(default_factory=LocalUploaderConfig)
     connection_config: LocalConnectionConfig = field(
         default_factory=lambda: LocalConnectionConfig()
     )
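
The local indexer drops the itertools/file_glob machinery for a plain pathlib walk, with recursive toggling rglob versus glob. A simplified sketch of that listing behavior (this sketch additionally filters out directories, which the diff's version does not do at this point):

from pathlib import Path


def list_paths(input_path: Path, recursive: bool = False) -> list[Path]:
    if input_path.is_file():
        return [input_path]
    paths = input_path.rglob("*") if recursive else input_path.glob("*")
    return [p for p in paths if p.is_file()]  # sketch-only: drop directories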
unstructured_ingest/v2/processes/connectors/milvus.py

@@ -6,8 +6,8 @@ from typing import TYPE_CHECKING, Any, Optional, Union

 import pandas as pd
 from dateutil import parser
+from pydantic import Field, Secret

-from unstructured_ingest.enhanced_dataclass import enhanced_field
 from unstructured_ingest.error import WriteError
 from unstructured_ingest.utils.data_prep import flatten_dict
 from unstructured_ingest.utils.dep_check import requires_dependencies

@@ -32,24 +32,28 @@ if TYPE_CHECKING:
 CONNECTOR_TYPE = "milvus"


-@dataclass
 class MilvusAccessConfig(AccessConfig):
-    password: Optional[str] = None
-    token: Optional[str] = None
+    password: Optional[str] = Field(default=None, description="Milvus password")
+    token: Optional[str] = Field(default=None, description="Milvus access token")
+
+
+SecretMilvusAccessConfig = Secret[MilvusAccessConfig]


-@dataclass
 class MilvusConnectionConfig(ConnectionConfig):
-    access_config:
-
+    access_config: SecretMilvusAccessConfig = Field(
+        default_factory=lambda: SecretMilvusAccessConfig(secret_value=MilvusAccessConfig())
     )
-    uri: Optional[str] =
-
-
+    uri: Optional[str] = Field(
+        default=None, description="Milvus uri", examples=["http://localhost:19530"]
+    )
+    user: Optional[str] = Field(default=None, description="Milvus user")
+    db_name: Optional[str] = Field(default=None, description="Milvus database name")

     def get_connection_kwargs(self) -> dict[str, Any]:
-
-
+        access_config = self.access_config.get_secret_value()
+        access_config_dict = access_config.dict()
+        connection_config_dict = self.dict()
         connection_config_dict.pop("access_config", None)
         connection_config_dict.update(access_config_dict)
         # Drop any that were not set explicitly

@@ -63,7 +67,6 @@ class MilvusConnectionConfig(ConnectionConfig):
         return MilvusClient(**self.get_connection_kwargs())


-@dataclass
 class MilvusUploadStagerConfig(UploadStagerConfig):
     pass

@@ -130,10 +133,11 @@ class MilvusUploadStager(UploadStager):
         return output_path


-@dataclass
 class MilvusUploaderConfig(UploaderConfig):
-    collection_name: str
-
+    collection_name: str = Field(description="Milvus collections to write to")
+    num_processes: int = Field(
+        default=4, description="number of processes to use when writing to support parallel writes"
+    )


 @dataclass

@@ -180,13 +184,13 @@ class MilvusUploader(Uploader):
         self.insert_results(data=data)

     def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
-        if self.upload_config.
+        if self.upload_config.num_processes == 1:
             for content in contents:
                 self.upload(content=content)

         else:
             with mp.Pool(
-                processes=self.upload_config.
+                processes=self.upload_config.num_processes,
             ) as pool:
                 pool.map(self.upload, contents)

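
MilvusUploaderConfig's new num_processes drives the fan-out in run() above: sequential when 1, a multiprocessing pool otherwise. A runnable sketch of that dispatch pattern (the upload body is illustrative):

import multiprocessing as mp


def upload(content: str) -> None:
    print(f"writing {content}")  # stand-in for MilvusUploader.upload


def run(contents: list[str], num_processes: int = 4) -> None:
    if num_processes == 1:
        for content in contents:
            upload(content)
    else:
        with mp.Pool(processes=num_processes) as pool:
            pool.map(upload, contents)


if __name__ == "__main__":
    run(["a.json", "b.json"], num_processes=2)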