unstructured-ingest 0.0.0__py3-none-any.whl → 0.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/connector/notion/helpers.py +1 -1
- unstructured_ingest/logger.py +2 -2
- unstructured_ingest/v2/cli/base/cmd.py +10 -0
- unstructured_ingest/v2/cli/base/src.py +2 -0
- unstructured_ingest/v2/cli/cmds/__init__.py +2 -0
- unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +1 -9
- unstructured_ingest/v2/cli/cmds/local.py +0 -8
- unstructured_ingest/v2/cli/cmds/milvus.py +72 -0
- unstructured_ingest/v2/cli/configs/__init__.py +8 -1
- unstructured_ingest/v2/cli/configs/filter.py +28 -0
- unstructured_ingest/v2/interfaces/__init__.py +2 -1
- unstructured_ingest/v2/interfaces/downloader.py +9 -3
- unstructured_ingest/v2/interfaces/file_data.py +6 -1
- unstructured_ingest/v2/interfaces/process.py +3 -0
- unstructured_ingest/v2/logger.py +1 -1
- unstructured_ingest/v2/pipeline/interfaces.py +3 -1
- unstructured_ingest/v2/pipeline/pipeline.py +72 -2
- unstructured_ingest/v2/pipeline/steps/download.py +77 -13
- unstructured_ingest/v2/pipeline/steps/filter.py +40 -0
- unstructured_ingest/v2/processes/connectors/__init__.py +4 -2
- unstructured_ingest/v2/processes/connectors/astra.py +8 -0
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +8 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +8 -6
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +9 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +23 -9
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +22 -31
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +13 -5
- unstructured_ingest/v2/processes/connectors/google_drive.py +13 -9
- unstructured_ingest/v2/processes/connectors/local.py +15 -15
- unstructured_ingest/v2/processes/connectors/milvus.py +200 -0
- unstructured_ingest/v2/processes/connectors/mongodb.py +10 -4
- unstructured_ingest/v2/processes/connectors/onedrive.py +14 -2
- unstructured_ingest/v2/processes/connectors/pinecone.py +10 -7
- unstructured_ingest/v2/processes/connectors/salesforce.py +10 -8
- unstructured_ingest/v2/processes/connectors/sharepoint.py +14 -8
- unstructured_ingest/v2/processes/connectors/sql.py +24 -9
- unstructured_ingest/v2/processes/connectors/weaviate.py +13 -5
- unstructured_ingest/v2/processes/filter.py +54 -0
- {unstructured_ingest-0.0.0.dist-info → unstructured_ingest-0.0.2.dist-info}/METADATA +16 -14
- {unstructured_ingest-0.0.0.dist-info → unstructured_ingest-0.0.2.dist-info}/RECORD +44 -39
- {unstructured_ingest-0.0.0.dist-info → unstructured_ingest-0.0.2.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.0.0.dist-info → unstructured_ingest-0.0.2.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.0.dist-info → unstructured_ingest-0.0.2.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/pipeline/steps/filter.py
@@ -0,0 +1,40 @@
+import asyncio
+from dataclasses import dataclass
+from typing import Callable, Optional
+
+from unstructured_ingest.v2.interfaces.file_data import FileData
+from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
+from unstructured_ingest.v2.pipeline.utils import sterilize_dict
+from unstructured_ingest.v2.processes.filter import Filterer
+
+STEP_ID = "filter"
+
+
+@dataclass
+class FilterStep(PipelineStep):
+    process: Filterer
+    identifier: str = STEP_ID
+
+    def __post_init__(self):
+        config = (
+            sterilize_dict(self.process.config.to_dict(redact_sensitive=True))
+            if self.process.config
+            else None
+        )
+        logger.info(f"Created {self.identifier} with configs: {config}")
+
+    async def _run_async(self, fn: Callable, file_data_path: str, **kwargs) -> Optional[dict]:
+        file_data = FileData.from_file(path=file_data_path)
+        fn_kwargs = {"file_data": file_data}
+        if not asyncio.iscoroutinefunction(fn):
+            resp = fn(**fn_kwargs)
+        elif semaphore := self.context.semaphore:
+            async with semaphore:
+                resp = await fn(**fn_kwargs)
+        else:
+            resp = await fn(**fn_kwargs)
+
+        if resp:
+            return {"file_data_path": file_data_path}
+        return None

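The new `FilterStep._run_async` dispatches on whether the wrapped filter function is a coroutine, optionally gating async calls behind a semaphore. A minimal standalone sketch of that dispatch pattern (the names below are illustrative, not part of the package):

```python
import asyncio
from typing import Callable, Optional

async def run_step(fn: Callable, semaphore: Optional[asyncio.Semaphore] = None, **fn_kwargs):
    # Plain functions are called synchronously; coroutine functions are
    # awaited, under the semaphore when one is configured.
    if not asyncio.iscoroutinefunction(fn):
        return fn(**fn_kwargs)
    if semaphore is not None:
        async with semaphore:
            return await fn(**fn_kwargs)
    return await fn(**fn_kwargs)

def keep_all(file_data: dict) -> bool:
    # Trivial synchronous "filter": keep every file.
    return True

async def keep_json(file_data: dict) -> bool:
    # Async filter: keep only .json paths.
    return file_data["path"].endswith(".json")

async def main() -> None:
    sem = asyncio.Semaphore(2)
    print(await run_step(keep_all, file_data={"path": "a.txt"}))                   # True
    print(await run_step(keep_json, semaphore=sem, file_data={"path": "a.txt"}))   # False

asyncio.run(main())
```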
unstructured_ingest/v2/processes/connectors/__init__.py
@@ -1,7 +1,6 @@
 from __future__ import annotations
 
-import
-
+import unstructured_ingest.v2.processes.connectors.fsspec  # noqa: F401
 from unstructured_ingest.v2.processes.connector_registry import (
     add_destination_entry,
     add_source_entry,
@@ -19,6 +18,8 @@ from .google_drive import CONNECTOR_TYPE as GOOGLE_DRIVE_CONNECTOR_TYPE
 from .google_drive import google_drive_source_entry
 from .local import CONNECTOR_TYPE as LOCAL_CONNECTOR_TYPE
 from .local import local_destination_entry, local_source_entry
+from .milvus import CONNECTOR_TYPE as MILVUS_CONNECTOR_TYPE
+from .milvus import milvus_destination_entry
 from .mongodb import CONNECTOR_TYPE as MONGODB_CONNECTOR_TYPE
 from .mongodb import mongodb_destination_entry
 from .onedrive import CONNECTOR_TYPE as ONEDRIVE_CONNECTOR_TYPE
@@ -75,3 +76,4 @@ add_source_entry(source_type=SHAREPOINT_CONNECTOR_TYPE, entry=sharepoint_source_entry)
 add_destination_entry(
     destination_type=SINGLESTORE_CONNECTOR_TYPE, entry=singlestore_destination_entry
 )
+add_destination_entry(destination_type=MILVUS_CONNECTOR_TYPE, entry=milvus_destination_entry)

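The new milvus connector is wired in through the same module-level registry calls as the other destinations. A toy sketch of that registration pattern (the dict-based registry below is a simplification for illustration, not the package's actual `connector_registry` implementation):

```python
from dataclasses import dataclass
from typing import Callable

# Simplified registry; the real package stores richer entry objects.
_DESTINATION_REGISTRY: dict[str, "DestinationEntry"] = {}

@dataclass
class DestinationEntry:
    uploader_factory: Callable[[], object]

def add_destination_entry(destination_type: str, entry: DestinationEntry) -> None:
    # Importing the connectors package registers every connector as a side effect.
    if destination_type in _DESTINATION_REGISTRY:
        raise ValueError(f"destination {destination_type!r} already registered")
    _DESTINATION_REGISTRY[destination_type] = entry

add_destination_entry(destination_type="milvus", entry=DestinationEntry(uploader_factory=object))
print(sorted(_DESTINATION_REGISTRY))  # ['milvus']
```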
unstructured_ingest/v2/processes/connectors/astra.py
@@ -7,6 +7,7 @@ from unstructured import __name__ as integration_name
 from unstructured.__version__ import __version__ as integration_version
 
 from unstructured_ingest.enhanced_dataclass import enhanced_field
+from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
@@ -94,6 +95,13 @@ class AstraUploader(Uploader):
     upload_config: AstraUploaderConfig
     connector_type: str = CONNECTOR_TYPE
 
+    def precheck(self) -> None:
+        try:
+            self.get_collection()
+        except Exception as e:
+            logger.error(f"Failed to validate connection {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
     @requires_dependencies(["astrapy"], extras="astra")
     def get_collection(self) -> "AstraDBCollection":
         from astrapy.db import AstraDB

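Several connectors in this release gain the same fail-fast `precheck` hook: make one cheap authenticated call, log the failure, and re-raise as a connection error so a pipeline can abort before indexing or uploading anything. The shape of the pattern, sketched with a hypothetical client call:

```python
import logging

logger = logging.getLogger(__name__)

class DestinationConnectionError(Exception):
    """Stand-in for unstructured_ingest.error.DestinationConnectionError."""

class Uploader:
    def get_client(self):
        # Hypothetical: the real connectors call e.g. get_collection(),
        # generate_client(), or connection_config.get_client() here.
        raise NotImplementedError

    def precheck(self) -> None:
        try:
            self.get_client()  # any cheap call that requires valid credentials
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}")
```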
unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py
@@ -175,6 +175,14 @@ class AzureCognitiveSearchUploader(Uploader):
         ),
     )
 
+    def precheck(self) -> None:
+        try:
+            client = self.connection_config.generate_client()
+            client.get_document_count()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
     def write_dict_wrapper(self, elements_dict):
         return self.write_dict(elements_dict=elements_dict)
 

unstructured_ingest/v2/processes/connectors/chroma.py
@@ -111,10 +111,13 @@ class ChromaUploader(Uploader):
     connector_type: str = CONNECTOR_TYPE
     upload_config: ChromaUploaderConfig
     connection_config: ChromaConnectionConfig
-    client: Optional["Client"] = field(init=False)
 
-    def __post_init__(self):
-        self.client = self.create_client()
+    def precheck(self) -> None:
+        try:
+            self.create_client()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
 
     @requires_dependencies(["chromadb"], extras="chroma")
     def create_client(self) -> "Client":
@@ -187,10 +190,9 @@ class ChromaUploader(Uploader):
             f"collection {self.connection_config.collection_name} "
             f"at {self.connection_config.host}",
         )
+        client = self.create_client()
 
-        collection = self.client.get_or_create_collection(
-            name=self.connection_config.collection_name
-        )
+        collection = client.get_or_create_collection(name=self.connection_config.collection_name)
         for chunk in batch_generator(elements_dict, self.upload_config.batch_size):
             self.upsert_batch(collection, self.prepare_chroma_list(chunk))
 

unstructured_ingest/v2/processes/connectors/databricks_volumes.py
@@ -3,6 +3,7 @@ from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Any, Optional
 
 from unstructured_ingest.enhanced_dataclass import enhanced_field
+from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
@@ -11,6 +12,7 @@ from unstructured_ingest.v2.interfaces import (
     Uploader,
     UploaderConfig,
 )
+from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
 
 if TYPE_CHECKING:
@@ -78,6 +80,13 @@ class DatabricksVolumesUploader(Uploader):
             host=self.connection_config.host, **self.connection_config.access_config.to_dict()
         )
 
+    def precheck(self) -> None:
+        try:
+            assert self.client.current_user.me().active
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
     def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
         for content in contents:
             with open(content.path, "rb") as elements_file:

unstructured_ingest/v2/processes/connectors/elasticsearch.py
@@ -7,10 +7,12 @@ from pathlib import Path
 from time import time
 from typing import TYPE_CHECKING, Any, Generator, Optional
 
-from unstructured.documents.elements import DataSourceMetadata
-
 from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin, enhanced_field
-from unstructured_ingest.error import
+from unstructured_ingest.error import (
+    DestinationConnectionError,
+    SourceConnectionError,
+    SourceConnectionNetworkError,
+)
 from unstructured_ingest.utils.data_prep import flatten_dict, generator_batching_wbytes
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
@@ -20,6 +22,7 @@ from unstructured_ingest.v2.interfaces import (
     DownloaderConfig,
     DownloadResponse,
     FileData,
+    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     UploadContent,
@@ -121,11 +124,14 @@ class ElasticsearchIndexerConfig(IndexerConfig):
 class ElasticsearchIndexer(Indexer):
     connection_config: ElasticsearchConnectionConfig
     index_config: ElasticsearchIndexerConfig
-    client: "ElasticsearchClient" = field(init=False)
     connector_type: str = CONNECTOR_TYPE
 
-    def __post_init__(self):
-        self.client = self.connection_config.get_client()
+    def precheck(self) -> None:
+        try:
+            self.connection_config.get_client()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")
 
     @requires_dependencies(["elasticsearch"], extras="elasticsearch")
     def load_scan(self):
@@ -138,8 +144,9 @@ class ElasticsearchIndexer(Indexer):
         scan = self.load_scan()
 
         scan_query: dict = {"stored_fields": [], "query": {"match_all": {}}}
+        client = self.connection_config.get_client()
         hits = scan(
-            self.client,
+            client,
             query=scan_query,
             scroll="1m",
             index=self.index_config.index_name,
@@ -168,7 +175,7 @@ class ElasticsearchIndexer(Indexer):
         yield FileData(
             identifier=identified,
             connector_type=CONNECTOR_TYPE,
-            metadata=DataSourceMetadata(
+            metadata=FileDataSourceMetadata(
                 url=f"{self.connection_config.hosts[0]}/{self.index_config.index_name}",
                 date_processed=str(time()),
             ),
@@ -234,7 +241,7 @@ class ElasticsearchDownloader(Downloader):
             file_data=FileData(
                 identifier=filename_id,
                 connector_type=CONNECTOR_TYPE,
-                metadata=DataSourceMetadata(
+                metadata=FileDataSourceMetadata(
                     version=str(result["_version"]) if "_version" in result else None,
                     date_processed=str(time()),
                     record_locator={
@@ -339,6 +346,13 @@ class ElasticsearchUploader(Uploader):
     upload_config: ElasticsearchUploaderConfig
     connection_config: ElasticsearchConnectionConfig
 
+    def precheck(self) -> None:
+        try:
+            self.connection_config.get_client()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
     @requires_dependencies(["elasticsearch"], extras="elasticsearch")
     def load_parallel_bulk(self):
         from elasticsearch.helpers import parallel_bulk

unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py
@@ -1,14 +1,12 @@
 from __future__ import annotations
 
 import contextlib
-import fnmatch
 from dataclasses import dataclass, field
 from datetime import datetime
 from pathlib import Path
 from time import time
 from typing import TYPE_CHECKING, Any, Generator, Optional, TypeVar
-
-from unstructured.documents.elements import DataSourceMetadata
+from uuid import NAMESPACE_DNS, uuid5
 
 from unstructured_ingest.enhanced_dataclass import enhanced_field
 from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
@@ -19,6 +17,7 @@ from unstructured_ingest.v2.interfaces import (
     DownloaderConfig,
     DownloadResponse,
     FileData,
+    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
@@ -73,7 +72,6 @@ class FileConfig(Base):
 @dataclass
 class FsspecIndexerConfig(FileConfig, IndexerConfig):
     recursive: bool = False
-    file_glob: Optional[list[str]] = None
 
 
 @dataclass
@@ -108,17 +106,7 @@ class FsspecIndexer(Indexer):
         **self.connection_config.get_access_config(),
     )
 
-    def does_path_match_glob(self, path: str) -> bool:
-        if self.index_config.file_glob is None:
-            return True
-        patterns = self.index_config.file_glob
-        for pattern in patterns:
-            if fnmatch.filter([path], pattern):
-                return True
-        logger.debug(f"The file {path!r} is discarded as it does not match any given glob.")
-        return False
-
-    def check_connection(self):
+    def precheck(self) -> None:
         from fsspec import get_filesystem_class
 
         try:
@@ -156,10 +144,10 @@ class FsspecIndexer(Indexer):
         else:
             raise TypeError(f"unhandled response type from find: {type(found)}")
 
-    def get_metadata(self, path: str) -> DataSourceMetadata:
+    def get_metadata(self, path: str) -> FileDataSourceMetadata:
         date_created = None
         date_modified = None
-
+        file_size = None
         try:
             created: Optional[Any] = self.fs.created(path)
             if created:
@@ -179,6 +167,8 @@ class FsspecIndexer(Indexer):
             date_modified = str(modified)
         except NotImplementedError:
             pass
+        with contextlib.suppress(AttributeError):
+            file_size = self.fs.size(path)
 
         version = self.fs.checksum(path)
         metadata: dict[str, str] = {}
@@ -188,15 +178,19 @@ class FsspecIndexer(Indexer):
             "protocol": self.index_config.protocol,
             "remote_file_path": self.index_config.remote_url,
         }
+        file_stat = self.fs.stat(path=path)
+        if file_id := file_stat.get("id"):
+            record_locator["file_id"] = file_id
         if metadata:
             record_locator["metadata"] = metadata
-        return DataSourceMetadata(
+        return FileDataSourceMetadata(
             date_created=date_created,
             date_modified=date_modified,
             date_processed=str(time()),
             version=str(version),
             url=f"{self.index_config.protocol}://{path}",
             record_locator=record_locator,
+            filesize_bytes=file_size,
         )
 
     def sterilize_info(self, path) -> dict:
@@ -204,14 +198,16 @@ class FsspecIndexer(Indexer):
         return sterilize_dict(data=info)
 
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
-        raw_files = self.list_files()
-        files = [f for f in raw_files if self.does_path_match_glob(f)]
+        files = self.list_files()
         for file in files:
             # Note: we remove any remaining leading slashes (Box introduces these)
             # to get a valid relative path
             rel_path = file.replace(self.index_config.path_without_protocol, "").lstrip("/")
+
+            additional_metadata = self.sterilize_info(path=file)
+            additional_metadata["original_file_path"] = file
             yield FileData(
-                identifier=file,
+                identifier=str(uuid5(NAMESPACE_DNS, file)),
                 connector_type=self.connector_type,
                 source_identifiers=SourceIdentifiers(
                     filename=Path(file).name,
@@ -219,7 +215,7 @@ class FsspecIndexer(Indexer):
                     fullpath=file,
                 ),
                 metadata=self.get_metadata(path=file),
-                additional_metadata=self.sterilize_info(path=file),
+                additional_metadata=additional_metadata,
             )
 
 
@@ -251,18 +247,12 @@ class FsspecDownloader(Downloader):
         **self.connection_config.get_access_config(),
     )
 
-    def get_download_path(self, file_data: FileData) -> Path:
-        return (
-            self.download_dir / Path(file_data.source_identifiers.relative_path)
-            if self.download_config
-            else Path(file_data.source_identifiers.rel_path)
-        )
-
     def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
         download_path = self.get_download_path(file_data=file_data)
         download_path.parent.mkdir(parents=True, exist_ok=True)
        try:
-            self.fs.get(rpath=file_data.identifier, lpath=download_path.as_posix())
+            rpath = file_data.additional_metadata["original_file_path"]
+            self.fs.get(rpath=rpath, lpath=download_path.as_posix())
         except Exception as e:
             logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
             raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
@@ -272,7 +262,8 @@ class FsspecDownloader(Downloader):
         download_path = self.get_download_path(file_data=file_data)
         download_path.parent.mkdir(parents=True, exist_ok=True)
         try:
-            await self.fs.get(rpath=file_data.identifier, lpath=download_path.as_posix())
+            rpath = file_data.additional_metadata["original_file_path"]
+            await self.fs.get(rpath=rpath, lpath=download_path.as_posix())
         except Exception as e:
             logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
             raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")

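The fsspec indexer stops using the raw remote path as `FileData.identifier` and instead derives a deterministic UUID from it, stashing the original path in `additional_metadata["original_file_path"]` for the downloader. Since `uuid5` is a pure function of its namespace and name, the same path always maps to the same identifier:

```python
from uuid import NAMESPACE_DNS, uuid5

path = "my-bucket/docs/report.pdf"  # illustrative remote path
a = uuid5(NAMESPACE_DNS, path)
b = uuid5(NAMESPACE_DNS, path)
assert a == b  # deterministic: stable across runs and processes
print(a)       # always the same UUID for this path
```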
unstructured_ingest/v2/processes/connectors/fsspec/s3.py
@@ -5,11 +5,15 @@ from pathlib import Path
 from time import time
 from typing import Any, Generator, Optional
 
-from unstructured.documents.elements import DataSourceMetadata
+from unstructured.utils import requires_dependencies
 
 from unstructured_ingest.enhanced_dataclass import enhanced_field
-from unstructured_ingest.
-
+from unstructured_ingest.v2.interfaces import (
+    DownloadResponse,
+    FileData,
+    FileDataSourceMetadata,
+    UploadContent,
+)
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -66,9 +70,10 @@ class S3Indexer(FsspecIndexer):
     index_config: S3IndexerConfig
     connector_type: str = CONNECTOR_TYPE
 
-    def get_metadata(self, path: str) -> DataSourceMetadata:
+    def get_metadata(self, path: str) -> FileDataSourceMetadata:
         date_created = None
         date_modified = None
+        file_size = None
         try:
             modified: Optional[datetime] = self.fs.modified(path)
             if modified:
@@ -76,6 +81,8 @@ class S3Indexer(FsspecIndexer):
                 date_modified = str(modified.timestamp())
         except NotImplementedError:
             pass
+        with contextlib.suppress(AttributeError):
+            file_size = self.fs.size(path)
 
         version = None
         info: dict[str, Any] = self.fs.info(path)
@@ -90,13 +97,14 @@ class S3Indexer(FsspecIndexer):
         }
         if metadata:
             record_locator["metadata"] = metadata
-        return DataSourceMetadata(
+        return FileDataSourceMetadata(
             date_created=date_created,
             date_modified=date_modified,
             date_processed=str(time()),
             version=version,
             url=f"{self.index_config.protocol}://{path}",
             record_locator=record_locator,
+            filesize_bytes=file_size,
         )
 
     @requires_dependencies(["s3fs", "fsspec"], extras="s3")

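Both the generic fsspec indexer and the S3 indexer now record `filesize_bytes`, guarding the lookup with `contextlib.suppress(AttributeError)` because not every fsspec filesystem implementation exposes a `size()` method. A self-contained sketch of that guard (the `MinimalFS` class is a made-up stand-in):

```python
import contextlib

class MinimalFS:
    """Stand-in for an fsspec filesystem that lacks a size() method."""

file_size = None
fs = MinimalFS()
with contextlib.suppress(AttributeError):
    file_size = fs.size("some/path")  # AttributeError is silently swallowed
print(file_size)  # None: the metadata field simply stays unset
```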
unstructured_ingest/v2/processes/connectors/google_drive.py
@@ -1,15 +1,16 @@
 import io
 import os
 from dataclasses import dataclass, field
-from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional, Union
 
 from dateutil import parser
-from unstructured.documents.elements import DataSourceMetadata
 from unstructured.file_utils.google_filetype import GOOGLE_DRIVE_EXPORT_TYPES
 
 from unstructured_ingest.enhanced_dataclass import enhanced_field
-from unstructured_ingest.error import SourceConnectionNetworkError
+from unstructured_ingest.error import (
+    SourceConnectionError,
+    SourceConnectionNetworkError,
+)
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.utils.string_and_date_utils import json_to_dict
 from unstructured_ingest.v2.interfaces import (
@@ -18,6 +19,7 @@ from unstructured_ingest.v2.interfaces import (
     Downloader,
     DownloaderConfig,
     FileData,
+    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
@@ -121,6 +123,13 @@ class GoogleDriveIndexer(Indexer):
         ]
     )
 
+    def precheck(self) -> None:
+        try:
+            self.connection_config.get_files_service()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")
+
     @staticmethod
     def is_dir(record: dict) -> bool:
         return record.get("mimeType") == "application/vnd.google-apps.folder"
@@ -155,7 +164,7 @@ class GoogleDriveIndexer(Indexer):
             connector_type=CONNECTOR_TYPE,
             identifier=file_id,
             source_identifiers=source_identifiers,
-            metadata=DataSourceMetadata(
+            metadata=FileDataSourceMetadata(
                 url=url,
                 version=version,
                 date_created=str(date_created_dt.timestamp()),
@@ -272,11 +281,6 @@ class GoogleDriveDownloader(Downloader):
     )
     connector_type: str = CONNECTOR_TYPE
 
-    def get_download_path(self, file_data: FileData) -> Path:
-        rel_path = file_data.source_identifiers.relative_path
-        rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path
-        return self.download_dir / Path(rel_path)
-
     @SourceConnectionNetworkError.wrap
     def _get_content(self, downloader: "MediaIoBaseDownload") -> bool:
         downloaded = False

unstructured_ingest/v2/processes/connectors/local.py
@@ -1,12 +1,9 @@
 import glob
-import itertools
 import shutil
 from dataclasses import dataclass, field
 from pathlib import Path
 from time import time
-from typing import Any, Generator, Optional
-
-from unstructured.documents.elements import DataSourceMetadata
+from typing import Any, Generator
 
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
@@ -15,6 +12,7 @@ from unstructured_ingest.v2.interfaces import (
     DownloaderConfig,
     DownloadResponse,
     FileData,
+    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
@@ -45,7 +43,6 @@ class LocalConnectionConfig(ConnectionConfig):
 class LocalIndexerConfig(IndexerConfig):
     input_path: str
     recursive: bool = False
-    file_glob: Optional[list[str]] = None
 
     @property
     def path(self) -> Path:
@@ -64,16 +61,11 @@ class LocalIndexer(Indexer):
         input_path = self.index_config.path
         if input_path.is_file():
             return [Path(s) for s in glob.glob(f"{self.index_config.path}")]
-
-
-
-        return list(
-            itertools.chain.from_iterable(
-                glob_fn(pattern) for pattern in self.index_config.file_glob
-            )
-        )
+        if self.index_config.recursive:
+            return list(input_path.rglob("*"))
+        return list(input_path.glob("*"))
 
-    def get_file_metadata(self, path: Path) -> DataSourceMetadata:
+    def get_file_metadata(self, path: Path) -> FileDataSourceMetadata:
         stats = path.stat()
         try:
             date_modified = str(stats.st_mtime)
@@ -93,12 +85,20 @@ class LocalIndexer(Indexer):
         except Exception as e:
             logger.warning(f"Couldn't detect file mode: {e}")
             permissions_data = None
-        return DataSourceMetadata(
+
+        try:
+            filesize_bytes = stats.st_size
+        except Exception as e:
+            logger.warning(f"Couldn't detect file size: {e}")
+            filesize_bytes = None
+
+        return FileDataSourceMetadata(
             date_modified=date_modified,
             date_created=date_created,
             date_processed=str(time()),
             permissions_data=permissions_data,
             record_locator={"path": str(path.resolve())},
+            filesize_bytes=filesize_bytes,
         )
 
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:

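With `file_glob` gone, the local indexer's directory listing reduces to a choice between `Path.glob` and `Path.rglob`. The difference in one self-contained run (directory and file names below are arbitrary):

```python
from pathlib import Path
from tempfile import TemporaryDirectory

with TemporaryDirectory() as tmp:
    root = Path(tmp)
    (root / "sub").mkdir()
    (root / "top.txt").touch()
    (root / "sub" / "nested.txt").touch()

    flat = [p for p in root.glob("*") if p.is_file()]    # top level only
    deep = [p for p in root.rglob("*") if p.is_file()]   # recursive descent
    print([p.name for p in flat])          # ['top.txt']
    print(sorted(p.name for p in deep))    # ['nested.txt', 'top.txt']
```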