unstructured-ingest 0.0.21__py3-none-any.whl → 0.0.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of unstructured-ingest might be problematic.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/embed/bedrock.py +56 -19
- unstructured_ingest/embed/huggingface.py +22 -22
- unstructured_ingest/embed/interfaces.py +11 -4
- unstructured_ingest/embed/mixedbreadai.py +17 -17
- unstructured_ingest/embed/octoai.py +7 -7
- unstructured_ingest/embed/openai.py +15 -20
- unstructured_ingest/embed/vertexai.py +25 -17
- unstructured_ingest/embed/voyageai.py +22 -17
- unstructured_ingest/v2/cli/base/cmd.py +1 -1
- unstructured_ingest/v2/interfaces/connector.py +1 -1
- unstructured_ingest/v2/pipeline/pipeline.py +3 -1
- unstructured_ingest/v2/pipeline/steps/chunk.py +1 -1
- unstructured_ingest/v2/pipeline/steps/download.py +6 -2
- unstructured_ingest/v2/pipeline/steps/embed.py +1 -1
- unstructured_ingest/v2/pipeline/steps/filter.py +1 -1
- unstructured_ingest/v2/pipeline/steps/index.py +4 -2
- unstructured_ingest/v2/pipeline/steps/partition.py +1 -1
- unstructured_ingest/v2/pipeline/steps/stage.py +3 -1
- unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -1
- unstructured_ingest/v2/pipeline/steps/upload.py +6 -2
- unstructured_ingest/v2/processes/connectors/airtable.py +1 -1
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +1 -1
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +2 -2
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +31 -5
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +31 -2
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +36 -8
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +25 -77
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +30 -1
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +15 -18
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +22 -1
- unstructured_ingest/v2/processes/connectors/milvus.py +2 -2
- unstructured_ingest/v2/processes/connectors/opensearch.py +2 -2
- unstructured_ingest/v2/utils.py +1 -1
- unstructured_ingest-0.0.22.dist-info/METADATA +186 -0
- {unstructured_ingest-0.0.21.dist-info → unstructured_ingest-0.0.22.dist-info}/RECORD +40 -40
- {unstructured_ingest-0.0.21.dist-info → unstructured_ingest-0.0.22.dist-info}/WHEEL +1 -1
- unstructured_ingest-0.0.21.dist-info/METADATA +0 -639
- {unstructured_ingest-0.0.21.dist-info → unstructured_ingest-0.0.22.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.0.21.dist-info → unstructured_ingest-0.0.22.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.21.dist-info → unstructured_ingest-0.0.22.dist-info}/top_level.txt +0 -0
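The dominant change in this release is a migration from pydantic v1-style serialization to the v2 API: the added lines throughout the hunks below consistently use `model_dump()` and `model_dump_json()` (the removed side of several hunks is truncated in the registry's diff rendering and is reproduced as-is). A minimal sketch of the v2 API, using a hypothetical model that is not part of the package:

```python
# Minimal sketch of pydantic v2 serialization; ExampleConfig is hypothetical.
from pydantic import BaseModel


class ExampleConfig(BaseModel):
    chunking_strategy: str = "by_title"


config = ExampleConfig()
print(config.model_dump())       # {'chunking_strategy': 'by_title'}
print(config.model_dump_json())  # {"chunking_strategy":"by_title"}
# The pydantic v1 spellings, config.dict() and config.json(), are deprecated in v2.
```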
unstructured_ingest/v2/pipeline/pipeline.py

```diff
@@ -187,7 +187,9 @@ class Pipeline:
         return filtered_records

     def _run(self):
-        logger.info(
+        logger.info(
+            f"running local pipeline: {self} with configs: " f"{self.context.model_dump_json()}"
+        )
         if self.context.mp_supported:
             manager = mp.Manager()
             self.context.status = manager.dict()
```
unstructured_ingest/v2/pipeline/steps/chunk.py

```diff
@@ -28,7 +28,7 @@ class ChunkStep(PipelineStep):
         return f"{self.identifier} ({self.process.config.chunking_strategy})"

     def __post_init__(self):
-        config = self.process.config.
+        config = self.process.config.model_dump_json() if self.process.config else None
         logger.info(f"created {self.identifier} with configs: {config}")

     def should_chunk(self, filepath: Path, file_data: FileData) -> bool:
```
unstructured_ingest/v2/pipeline/steps/download.py

```diff
@@ -31,9 +31,13 @@ class DownloadStep(PipelineStep):
         return f"{self.identifier} ({self.process.__class__.__name__})"

     def __post_init__(self):
-        config =
+        config = (
+            self.process.download_config.model_dump_json() if self.process.download_config else None
+        )
         connection_config = (
-            self.process.connection_config.
+            self.process.connection_config.model_dump_json()
+            if self.process.connection_config
+            else None
         )
         logger.info(
             f"Created {self.identifier} with configs: {config}, "
```
unstructured_ingest/v2/pipeline/steps/embed.py

```diff
@@ -28,7 +28,7 @@ class EmbedStep(PipelineStep):
         return f"{self.identifier} ({self.process.config.embedding_provider})"

     def __post_init__(self):
-        config = self.process.config.
+        config = self.process.config.model_dump_json() if self.process.config else None
         logger.info(f"created {self.identifier} with configs: {config}")

     def should_embed(self, filepath: Path, file_data: FileData) -> bool:
```
unstructured_ingest/v2/pipeline/steps/filter.py

```diff
@@ -16,7 +16,7 @@ class FilterStep(PipelineStep):
     identifier: str = STEP_ID

     def __post_init__(self):
-        config = self.process.config.
+        config = self.process.config.model_dump_json() if self.process.config else None
         logger.info(f"created {self.identifier} with configs: {config}")

     async def _run_async(self, fn: Callable, file_data_path: str, **kwargs) -> Optional[dict]:
```
unstructured_ingest/v2/pipeline/steps/index.py

```diff
@@ -23,9 +23,11 @@ class IndexStep(PipelineStep):
         return f"{self.identifier} ({self.process.__class__.__name__})"

     def __post_init__(self):
-        config = self.process.index_config.
+        config = self.process.index_config.model_dump_json() if self.process.index_config else None
         connection_config = (
-            self.process.connection_config.
+            self.process.connection_config.model_dump_json()
+            if self.process.connection_config
+            else None
         )
         logger.info(
             f"created {self.identifier} with configs: {config}, "
```
unstructured_ingest/v2/pipeline/steps/partition.py

```diff
@@ -28,7 +28,7 @@ class PartitionStep(PipelineStep):
         return f"{self.identifier} ({self.process.config.strategy})"

     def __post_init__(self):
-        config = self.process.config.
+        config = self.process.config.model_dump_json()
         logger.info(f"created {self.identifier} with configs: {config}")

     def should_partition(self, filepath: Path, file_data: FileData) -> bool:
```
unstructured_ingest/v2/pipeline/steps/stage.py

```diff
@@ -28,7 +28,9 @@ class UploadStageStep(PipelineStep):

     def __post_init__(self):
         config = (
-            self.process.upload_stager_config.
+            self.process.upload_stager_config.model_dump_json()
+            if self.process.upload_stager_config
+            else None
         )
         self.cache_dir.mkdir(parents=True, exist_ok=True)
         logger.info(f"created {self.identifier} with configs: {config}")
```
unstructured_ingest/v2/pipeline/steps/uncompress.py

```diff
@@ -22,7 +22,7 @@ class UncompressStep(PipelineStep):
     identifier: str = STEP_ID

     def __post_init__(self):
-        config = self.process.config.
+        config = self.process.config.model_dump_json() if self.process.config else None
         logger.info(f"created {self.identifier} with configs: {config}")

     async def _run_async(
```
unstructured_ingest/v2/pipeline/steps/upload.py

```diff
@@ -25,9 +25,13 @@ class UploadStep(BatchPipelineStep):
         return f"{self.identifier} ({self.process.__class__.__name__})"

     def __post_init__(self):
-        config =
+        config = (
+            self.process.upload_config.model_dump_json() if self.process.upload_config else None
+        )
         connection_config = (
-            self.process.connection_config.
+            self.process.connection_config.model_dump_json()
+            if self.process.connection_config
+            else None
         )
         logger.info(
             f"Created {self.identifier} with configs: {config}, "
```
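All of the step hunks above apply the same guard: serialize the optional config with `model_dump_json()` only when it is set, otherwise log `None` (PartitionStep is the one step whose config is treated as always present). A condensed, self-contained sketch of the pattern, with `Config` standing in for a step's real config type:

```python
# Condensed sketch of the recurring __post_init__ guard from the step hunks.
import logging
from typing import Optional

from pydantic import BaseModel

logger = logging.getLogger(__name__)


class Config(BaseModel):
    strategy: str = "fast"


def log_config(identifier: str, config: Optional[Config]) -> None:
    dumped = config.model_dump_json() if config else None
    logger.info(f"created {identifier} with configs: {dumped}")


log_config("partition", Config())
log_config("filter", None)  # logs: created filter with configs: None
```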
unstructured_ingest/v2/processes/connectors/airtable.py

```diff
@@ -181,7 +181,7 @@ class AirtableIndexer(Indexer):
             yield FileData(
                 identifier=table_meta.get_id(),
                 connector_type=CONNECTOR_TYPE,
-                additional_metadata=table_meta.
+                additional_metadata=table_meta.model_dump(),
                 source_identifiers=SourceIdentifiers(
                     filename=str(Path(fullpath).name),
                     fullpath=fullpath,
```
unstructured_ingest/v2/processes/connectors/databricks_volumes.py

```diff
@@ -130,7 +130,7 @@ class DatabricksVolumesUploader(Uploader):

         return WorkspaceClient(
             host=self.connection_config.host,
-            **self.connection_config.access_config.get_secret_value().
+            **self.connection_config.access_config.get_secret_value().model_dump(),
         )

     def precheck(self) -> None:
```
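The Databricks change unpacks the dumped access config directly into `WorkspaceClient`'s keyword arguments, which works as long as the model's field names match the client's parameters. A hypothetical illustration of the unpacking pattern (`AccessConfig` and `make_client` are stand-ins, not the real Databricks types):

```python
# Hypothetical illustration: splatting a dumped pydantic model as keyword args.
from pydantic import BaseModel


class AccessConfig(BaseModel):
    client_id: str
    client_secret: str


def make_client(host: str, client_id: str, client_secret: str) -> dict:
    return {"host": host, "client_id": client_id, "client_secret": client_secret}


access = AccessConfig(client_id="abc", client_secret="xyz")
client = make_client(host="https://example.databricks.com", **access.model_dump())
```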
unstructured_ingest/v2/processes/connectors/elasticsearch.py

```diff
@@ -104,8 +104,8 @@ class ElasticsearchConnectionConfig(ConnectionConfig):
         elif access_config.es_api_key:
             client_input_kwargs["api_key"] = access_config.es_api_key
         client_input = ElasticsearchClientInput(**client_input_kwargs)
-        logger.debug(f"elasticsearch client inputs mapped to: {client_input.
-        client_kwargs = client_input.
+        logger.debug(f"elasticsearch client inputs mapped to: {client_input.model_dump()}")
+        client_kwargs = client_input.model_dump()
         client_kwargs["basic_auth"] = (
             client_input.basic_auth.get_secret_value() if client_input.basic_auth else None
         )
```
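Note the two-step handling above: the whole input model is dumped with `model_dump()`, then the secret-wrapped `basic_auth` field is unwrapped separately via `get_secret_value()`, since dumping leaves `Secret` values wrapped. A sketch with a hypothetical model, assuming pydantic's generic `Secret` type (which the fsspec connectors below also import):

```python
# Sketch of the dump-then-unwrap pattern; ClientInput is hypothetical.
from typing import Optional

from pydantic import BaseModel, Secret


class ClientInput(BaseModel):
    hosts: Optional[list[str]] = None
    basic_auth: Optional[Secret[tuple]] = None


client_input = ClientInput(hosts=["http://localhost:9200"], basic_auth=("user", "pass"))
client_kwargs = client_input.model_dump()  # basic_auth is still a Secret here
client_kwargs["basic_auth"] = (
    client_input.basic_auth.get_secret_value() if client_input.basic_auth else None
)
```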
unstructured_ingest/v2/processes/connectors/fsspec/azure.py

```diff
@@ -2,12 +2,13 @@ from __future__ import annotations

 from dataclasses import dataclass, field
 from pathlib import Path
+from time import time
 from typing import Any, Generator, Optional

 from pydantic import Field, Secret

 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces import DownloadResponse, FileData
+from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, FileDataSourceMetadata
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -84,7 +85,7 @@ class AzureConnectionConfig(FsspecConnectionConfig):
     def get_access_config(self) -> dict[str, Any]:
         # Avoid injecting None by filtering out k,v pairs where the value is None
         access_configs: dict[str, Any] = {
-            k: v for k, v in self.access_config.get_secret_value().
+            k: v for k, v in self.access_config.get_secret_value().model_dump().items() if v
         }
         return access_configs

@@ -99,14 +100,39 @@ class AzureIndexer(FsspecIndexer):
     def precheck(self) -> None:
         super().precheck()

-    def sterilize_info(self,
-
-        return sterilize_dict(data=info, default=azure_json_serial)
+    def sterilize_info(self, file_data: dict) -> dict:
+        return sterilize_dict(data=file_data, default=azure_json_serial)

     @requires_dependencies(["adlfs", "fsspec"], extras="azure")
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
         return super().run(**kwargs)

+    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
+        path = file_data["name"]
+        date_created = (
+            file_data.get("creation_time").timestamp() if "creation_time" in file_data else None
+        )
+        date_modified = (
+            file_data.get("last_modified").timestamp() if "last_modified" in file_data else None
+        )
+
+        file_size = file_data.get("size") if "size" in file_data else None
+
+        version = file_data.get("etag")
+        record_locator = {
+            "protocol": self.index_config.protocol,
+            "remote_file_path": self.index_config.remote_url,
+        }
+        return FileDataSourceMetadata(
+            date_created=date_created,
+            date_modified=date_modified,
+            date_processed=str(time()),
+            version=version,
+            url=f"{self.index_config.protocol}://{path}",
+            record_locator=record_locator,
+            filesize_bytes=file_size,
+        )
+

 class AzureDownloaderConfig(FsspecDownloaderConfig):
     pass
```
unstructured_ingest/v2/processes/connectors/fsspec/box.py

```diff
@@ -2,12 +2,14 @@ from __future__ import annotations

 from dataclasses import dataclass, field
 from pathlib import Path
+from time import time
 from typing import Any, Generator, Optional

+from dateutil import parser
 from pydantic import Field, Secret

 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces import DownloadResponse, FileData
+from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, FileDataSourceMetadata
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -52,7 +54,7 @@ class BoxConnectionConfig(FsspecConnectionConfig):
                 ac.box_app_config,
             ),
         }
-        access_config: dict[str, Any] = ac.
+        access_config: dict[str, Any] = ac.model_dump()
         access_config.pop("box_app_config", None)
         access_kwargs_with_oauth.update(access_config)

@@ -73,6 +75,33 @@ class BoxIndexer(FsspecIndexer):
     def precheck(self) -> None:
         super().precheck()

+    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
+        path = file_data["name"]
+        date_created = None
+        date_modified = None
+        if modified_at_str := file_data.get("modified_at"):
+            date_modified = parser.parse(modified_at_str).timestamp()
+        if created_at_str := file_data.get("created_at"):
+            date_created = parser.parse(created_at_str).timestamp()
+
+        file_size = file_data.get("size") if "size" in file_data else None
+
+        version = file_data.get("id")
+        record_locator = {
+            "protocol": self.index_config.protocol,
+            "remote_file_path": self.index_config.remote_url,
+            "file_id": file_data.get("id"),
+        }
+        return FileDataSourceMetadata(
+            date_created=date_created,
+            date_modified=date_modified,
+            date_processed=str(time()),
+            version=version,
+            url=f"{self.index_config.protocol}://{path}",
+            record_locator=record_locator,
+            filesize_bytes=file_size,
+        )
+

 class BoxDownloaderConfig(FsspecDownloaderConfig):
     pass
```
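Box reports timestamps as ISO-8601 strings (as does GCS below), so its `get_metadata` parses them with `dateutil` before converting to epoch seconds, e.g.:

```python
# Converting an ISO-8601 timestamp string from an info dict to epoch seconds.
from dateutil import parser

modified_at = parser.parse("2024-07-30T12:34:56+00:00")
print(modified_at.timestamp())  # 1722342896.0
```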
unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py

```diff
@@ -2,12 +2,13 @@ from __future__ import annotations

 from dataclasses import dataclass, field
 from pathlib import Path
+from time import time
 from typing import Any, Generator, Optional

 from pydantic import Field, Secret

 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces import DownloadResponse, FileData
+from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, FileDataSourceMetadata
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -22,7 +23,6 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
     FsspecUploader,
     FsspecUploaderConfig,
 )
-from unstructured_ingest.v2.processes.connectors.fsspec.utils import sterilize_dict

 CONNECTOR_TYPE = "dropbox"

@@ -49,6 +49,40 @@ class DropboxIndexer(FsspecIndexer):
     index_config: DropboxIndexerConfig
     connector_type: str = CONNECTOR_TYPE

+    def get_path(self, file_data: dict) -> str:
+        return file_data["name"]
+
+    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
+        path = file_data["name"].lstrip("/")
+        date_created = None
+        date_modified = None
+        server_modified = file_data.get("server_modified")
+        client_modified = file_data.get("client_modified")
+        if server_modified and client_modified and server_modified > client_modified:
+            date_created = str(client_modified.timestamp())
+            date_modified = str(server_modified.timestamp())
+        elif server_modified and client_modified and server_modified < client_modified:
+            date_created = str(server_modified.timestamp())
+            date_modified = str(client_modified.timestamp())
+
+        file_size = file_data.get("size") if "size" in file_data else None
+
+        version = file_data.get("content_hash")
+        record_locator = {
+            "protocol": self.index_config.protocol,
+            "remote_file_path": self.index_config.remote_url,
+            "file_id": file_data.get("id"),
+        }
+        return FileDataSourceMetadata(
+            date_created=date_created,
+            date_modified=date_modified,
+            date_processed=str(time()),
+            version=version,
+            url=f"{self.index_config.protocol}://{path}",
+            record_locator=record_locator,
+            filesize_bytes=file_size,
+        )
+
     @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
     def __post_init__(self):
         # dropbox expects the path to start with a /
@@ -63,12 +97,6 @@ class DropboxIndexer(FsspecIndexer):
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
         return super().run(**kwargs)

-    def sterilize_info(self, path) -> dict:
-        # the fs.info method defined in the dropboxdrivefs library expects a "url"
-        # kwarg rather than "path"; though both refer to the same thing
-        info = self.fs.info(url=path)
-        return sterilize_dict(data=info)
-

 class DropboxDownloaderConfig(FsspecDownloaderConfig):
     pass
```
unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py

```diff
@@ -1,10 +1,7 @@
 from __future__ import annotations

-import contextlib
 from dataclasses import dataclass, field
-from datetime import datetime
 from pathlib import Path
-from time import time
 from typing import TYPE_CHECKING, Any, Generator, Optional, TypeVar
 from uuid import NAMESPACE_DNS, uuid5

@@ -113,18 +110,13 @@ class FsspecIndexer(Indexer):
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise SourceConnectionError(f"failed to validate connection: {e}")

-    def
+    def get_file_data(self) -> list[dict[str, Any]]:
         if not self.index_config.recursive:
             # fs.ls does not walk directories
             # directories that are listed in cloud storage can cause problems
             # because they are seen as 0 byte files
-
-
-            return [
-                x.get("name") for x in found if x.get("size") > 0 and x.get("type") == "file"
-            ]
-        else:
-            raise TypeError(f"unhandled response type from ls: {type(found)}")
+            files = self.fs.ls(self.index_config.path_without_protocol, detail=True)
+
         else:
             # fs.find will recursively walk directories
             # "size" is a common key for all the cloud protocols with fs
@@ -132,84 +124,40 @@ class FsspecIndexer(Indexer):
                 self.index_config.path_without_protocol,
                 detail=True,
             )
-
-
-
-
-
-            raise TypeError(f"unhandled response type from find: {type(found)}")
-
-    def get_metadata(self, path: str) -> FileDataSourceMetadata:
-        date_created = None
-        date_modified = None
-        file_size = None
-        try:
-            created: Optional[Any] = self.fs.created(path)
-            if created:
-                if isinstance(created, datetime):
-                    date_created = str(created.timestamp())
-                else:
-                    date_created = str(created)
-        except NotImplementedError:
-            pass
+            files = found.values()
+        filtered_files = [
+            file for file in files if file.get("size") > 0 and file.get("type") == "file"
+        ]
+        return filtered_files

-
-
-
-
-
-            else:
-                date_modified = str(modified)
-        except NotImplementedError:
-            pass
-        with contextlib.suppress(AttributeError):
-            file_size = self.fs.size(path)
-
-        version = self.fs.checksum(path)
-        metadata: dict[str, str] = {}
-        with contextlib.suppress(AttributeError):
-            metadata = self.fs.metadata(path)
-        record_locator = {
-            "protocol": self.index_config.protocol,
-            "remote_file_path": self.index_config.remote_url,
-        }
-        file_stat = self.fs.stat(path=path)
-        if file_id := file_stat.get("id"):
-            record_locator["file_id"] = file_id
-        if metadata:
-            record_locator["metadata"] = metadata
-        return FileDataSourceMetadata(
-            date_created=date_created,
-            date_modified=date_modified,
-            date_processed=str(time()),
-            version=str(version),
-            url=f"{self.index_config.protocol}://{path}",
-            record_locator=record_locator,
-            filesize_bytes=file_size,
-        )
+    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
+        raise NotImplementedError()
+
+    def get_path(self, file_data: dict) -> str:
+        return file_data["name"]

-    def sterilize_info(self,
-
-        return sterilize_dict(data=info)
+    def sterilize_info(self, file_data: dict) -> dict:
+        return sterilize_dict(data=file_data)

     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
-        files = self.
-        for
+        files = self.get_file_data()
+        for file_data in files:
+            file_path = self.get_path(file_data=file_data)
             # Note: we remove any remaining leading slashes (Box introduces these)
             # to get a valid relative path
-            rel_path =
+            rel_path = file_path.replace(self.index_config.path_without_protocol, "").lstrip("/")

-            additional_metadata = self.sterilize_info(
-            additional_metadata["original_file_path"] =
+            additional_metadata = self.sterilize_info(file_data=file_data)
+            additional_metadata["original_file_path"] = file_path
             yield FileData(
-                identifier=str(uuid5(NAMESPACE_DNS,
+                identifier=str(uuid5(NAMESPACE_DNS, file_path)),
                 connector_type=self.connector_type,
                 source_identifiers=SourceIdentifiers(
-                    filename=Path(
+                    filename=Path(file_path).name,
                     rel_path=rel_path or None,
-                    fullpath=
+                    fullpath=file_path,
                 ),
-                metadata=self.get_metadata(
+                metadata=self.get_metadata(file_data=file_data),
                 additional_metadata=additional_metadata,
             )
```
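This is the core refactor of the release: instead of listing bare paths and issuing per-file `info()`, `created()`, `modified()`, and `checksum()` calls, the indexer now consumes the detail dicts that listing already returns, and metadata extraction moves into per-connector `get_metadata` overrides. The shapes the new `get_file_data` relies on: `fs.ls(path, detail=True)` returns a list of info dicts, while `fs.find(path, detail=True)` returns a mapping of path to info dict, hence the `found.values()` call. A self-contained sketch against the local filesystem:

```python
# fsspec listing shapes used by the new get_file_data (demonstrated with the
# local filesystem; cloud protocols return the same structures with
# protocol-specific keys).
import fsspec

fs = fsspec.filesystem("file")

listed = fs.ls("/tmp", detail=True)   # list[dict]: one info dict per entry
found = fs.find("/tmp", detail=True)  # dict[str, dict]: path -> info dict

files = found.values()
filtered = [f for f in files if f.get("size", 0) > 0 and f.get("type") == "file"]
for info in filtered:
    print(info["name"], info["size"])
```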
unstructured_ingest/v2/processes/connectors/fsspec/gcs.py

```diff
@@ -2,13 +2,15 @@ from __future__ import annotations

 from dataclasses import dataclass, field
 from pathlib import Path
+from time import time
 from typing import Any, Generator, Optional, Union

+from dateutil import parser
 from pydantic import Field, Secret

 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.utils.string_and_date_utils import json_to_dict
-from unstructured_ingest.v2.interfaces import DownloadResponse, FileData
+from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, FileDataSourceMetadata
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -106,6 +108,33 @@ class GcsIndexer(FsspecIndexer):
     def precheck(self) -> None:
         super().precheck()

+    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
+        path = file_data["name"]
+        date_created = None
+        date_modified = None
+        if modified_at_str := file_data.get("updated"):
+            date_modified = parser.parse(modified_at_str).timestamp()
+        if created_at_str := file_data.get("timeCreated"):
+            date_created = parser.parse(created_at_str).timestamp()
+
+        file_size = file_data.get("size") if "size" in file_data else None
+
+        version = file_data.get("etag")
+        record_locator = {
+            "protocol": self.index_config.protocol,
+            "remote_file_path": self.index_config.remote_url,
+            "file_id": file_data.get("id"),
+        }
+        return FileDataSourceMetadata(
+            date_created=date_created,
+            date_modified=date_modified,
+            date_processed=str(time()),
+            version=version,
+            url=f"{self.index_config.protocol}://{path}",
+            record_locator=record_locator,
+            filesize_bytes=file_size,
+        )
+

 class GcsDownloaderConfig(FsspecDownloaderConfig):
     pass
```
unstructured_ingest/v2/processes/connectors/fsspec/s3.py

```diff
@@ -1,6 +1,5 @@
 import contextlib
 from dataclasses import dataclass, field
-from datetime import datetime
 from pathlib import Path
 from time import time
 from typing import Any, Generator, Optional
@@ -69,7 +68,7 @@ class S3ConnectionConfig(FsspecConnectionConfig):

         # Avoid injecting None by filtering out k,v pairs where the value is None
         access_configs.update(
-            {k: v for k, v in self.access_config.get_secret_value().
+            {k: v for k, v in self.access_config.get_secret_value().model_dump().items() if v}
         )
         return access_configs

@@ -80,27 +79,25 @@ class S3Indexer(FsspecIndexer):
     index_config: S3IndexerConfig
     connector_type: str = CONNECTOR_TYPE

-    def
+    def get_path(self, file_data: dict) -> str:
+        return file_data["Key"]
+
+    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
+        path = file_data["Key"]
         date_created = None
         date_modified = None
-
-
-
-
-
-
-
-            pass
-        with contextlib.suppress(AttributeError):
-            file_size = self.fs.size(path)
+        modified = file_data.get("LastModified")
+        if modified:
+            date_created = str(modified.timestamp())
+            date_modified = str(modified.timestamp())
+
+        file_size = file_data.get("size") if "size" in file_data else None
+        file_size = file_size or file_data.get("Size")

-        version = None
-        info: dict[str, Any] = self.fs.info(path)
-        if etag := info.get("ETag"):
-            version = str(etag).rstrip('"').lstrip('"')
+        version = file_data.get("ETag").rstrip('"').lstrip('"') if "ETag" in file_data else None
         metadata: dict[str, str] = {}
         with contextlib.suppress(AttributeError):
-            metadata = self.fs.metadata(path)
+            metadata = self.fs.metadata(path=path)
         record_locator = {
             "protocol": self.index_config.protocol,
             "remote_file_path": self.index_config.remote_url,
```
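One S3-specific detail: the `ETag` values that S3 returns are wrapped in literal double quotes, so the new code strips them before using the tag as a version (`strip('"')` would be equivalent to the chained `rstrip`/`lstrip`):

```python
# S3 ETags arrive quoted; stripping the quotes yields a usable version string.
etag = '"9b2cf535f27731c974343645a3985328"'
version = etag.rstrip('"').lstrip('"')
print(version)  # 9b2cf535f27731c974343645a3985328
```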
unstructured_ingest/v2/processes/connectors/fsspec/sftp.py

```diff
@@ -3,13 +3,14 @@ from __future__ import annotations
 import os
 from dataclasses import dataclass, field
 from pathlib import Path
+from time import time
 from typing import Any, Generator, Optional
 from urllib.parse import urlparse

 from pydantic import Field, Secret

 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces import DownloadResponse, FileData
+from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, FileDataSourceMetadata
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -96,6 +97,26 @@ class SftpIndexer(FsspecIndexer):
     def precheck(self) -> None:
         super().precheck()

+    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
+        path = file_data["name"]
+        date_created = file_data.get("time").timestamp() if "time" in file_data else None
+        date_modified = file_data.get("mtime").timestamp() if "mtime" in file_data else None
+
+        file_size = file_data.get("size") if "size" in file_data else None
+
+        record_locator = {
+            "protocol": self.index_config.protocol,
+            "remote_file_path": self.index_config.remote_url,
+        }
+        return FileDataSourceMetadata(
+            date_created=date_created,
+            date_modified=date_modified,
+            date_processed=str(time()),
+            url=f"{self.index_config.protocol}://{path}",
+            record_locator=record_locator,
+            filesize_bytes=file_size,
+        )
+

 class SftpDownloaderConfig(FsspecDownloaderConfig):
     remote_url: str = Field(description="Remote fsspec URL formatted as `protocol://dir/path`")
```