unstructured-ingest 0.0.5__py3-none-any.whl → 0.0.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/v2/interfaces/processor.py +6 -1
- unstructured_ingest/v2/interfaces/uploader.py +9 -4
- unstructured_ingest/v2/otel.py +111 -0
- unstructured_ingest/v2/pipeline/interfaces.py +61 -28
- unstructured_ingest/v2/pipeline/otel.py +32 -0
- unstructured_ingest/v2/pipeline/pipeline.py +11 -7
- unstructured_ingest/v2/pipeline/steps/index.py +2 -0
- unstructured_ingest/v2/pipeline/steps/upload.py +7 -19
- unstructured_ingest/v2/processes/chunker.py +3 -1
- unstructured_ingest/v2/processes/connectors/astradb.py +3 -8
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +4 -9
- unstructured_ingest/v2/processes/connectors/chroma.py +3 -8
- unstructured_ingest/v2/processes/connectors/couchbase.py +5 -9
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +9 -10
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +4 -7
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +3 -3
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +3 -3
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +3 -3
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +4 -6
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +3 -3
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +2 -3
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +3 -3
- unstructured_ingest/v2/processes/connectors/kdbai.py +7 -8
- unstructured_ingest/v2/processes/connectors/local.py +15 -22
- unstructured_ingest/v2/processes/connectors/milvus.py +2 -14
- unstructured_ingest/v2/processes/connectors/mongodb.py +3 -8
- unstructured_ingest/v2/processes/connectors/pinecone.py +6 -24
- unstructured_ingest/v2/processes/connectors/singlestore.py +6 -6
- unstructured_ingest/v2/processes/connectors/sql.py +5 -7
- unstructured_ingest/v2/processes/connectors/weaviate.py +4 -11
- unstructured_ingest/v2/processes/partitioner.py +13 -3
- {unstructured_ingest-0.0.5.dist-info → unstructured_ingest-0.0.7.dist-info}/METADATA +275 -211
- {unstructured_ingest-0.0.5.dist-info → unstructured_ingest-0.0.7.dist-info}/RECORD +38 -37
- unstructured_ingest/v2/example.py +0 -37
- {unstructured_ingest-0.0.5.dist-info → unstructured_ingest-0.0.7.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.0.5.dist-info → unstructured_ingest-0.0.7.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.0.5.dist-info → unstructured_ingest-0.0.7.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.5.dist-info → unstructured_ingest-0.0.7.dist-info}/top_level.txt +0 -0

unstructured_ingest/v2/processes/connectors/fsspec/azure.py:

@@ -7,7 +7,7 @@ from typing import Any, Generator, Optional
 from pydantic import Field, Secret

 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces import DownloadResponse, FileData
+from unstructured_ingest.v2.interfaces import DownloadResponse, FileData
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -152,8 +152,8 @@ class AzureUploader(FsspecUploader):
         super().precheck()

     @requires_dependencies(["adlfs", "fsspec"], extras="azure")
-    def run(self,
-        return super().run(
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        return super().run(path=path, file_data=file_data, **kwargs)

     @requires_dependencies(["adlfs", "fsspec"], extras="azure")
     async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:

unstructured_ingest/v2/processes/connectors/fsspec/box.py:

@@ -7,7 +7,7 @@ from typing import Any, Generator, Optional
 from pydantic import Field, Secret

 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces import DownloadResponse, FileData
+from unstructured_ingest.v2.interfaces import DownloadResponse, FileData
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -118,8 +118,8 @@ class BoxUploader(FsspecUploader):
         super().precheck()

     @requires_dependencies(["boxfs"], extras="box")
-    def run(self,
-        return super().run(
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        return super().run(path=path, file_data=file_data, **kwargs)

     @requires_dependencies(["boxfs"], extras="box")
     async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:

unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py:

@@ -7,7 +7,7 @@ from typing import Any, Generator, Optional
 from pydantic import Field, Secret

 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces import DownloadResponse, FileData
+from unstructured_ingest.v2.interfaces import DownloadResponse, FileData
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -114,8 +114,8 @@ class DropboxUploader(FsspecUploader):
         super().precheck()

     @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
-    def run(self,
-        return super().run(
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        return super().run(path=path, file_data=file_data, **kwargs)

     @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
     async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:

unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py:

@@ -26,7 +26,6 @@ from unstructured_ingest.v2.interfaces import (
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
-    UploadContent,
     Uploader,
     UploaderConfig,
 )
@@ -273,6 +272,9 @@ class FsspecUploader(Uploader):
     connector_type: str = CONNECTOR_TYPE
     upload_config: FsspecUploaderConfigT = field(default=None)

+    def is_async(self) -> bool:
+        return self.fs.async_impl
+
     @property
     def fs(self) -> "AbstractFileSystem":
         from fsspec import get_filesystem_class
@@ -311,11 +313,7 @@ class FsspecUploader(Uploader):
         updated_upload_path = upload_path.parent / f"{upload_path.name}.json"
         return updated_upload_path

-    def run(self,
-        for content in contents:
-            self._run(path=content.path, file_data=content.file_data)
-
-    def _run(self, path: Path, file_data: FileData) -> None:
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         path_str = str(path.resolve())
         upload_path = self.get_upload_path(file_data=file_data)
         if self.fs.exists(path=str(upload_path)) and not self.upload_config.overwrite:

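Across the fsspec-based connectors, `UploadContent` drops out of the uploader interface: the batched `run(contents)` / `_run(path, file_data)` pair collapses into a single `run(path, file_data, **kwargs)` that handles one staged file per call, and `is_async()` now defers to the filesystem's `async_impl`. Below is a minimal sketch of a destination written against that per-file contract; the `LoggingUploader` class and the `FileDataLike` protocol are hypothetical stand-ins, only the method shape comes from the diff:

```python
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Protocol


class FileDataLike(Protocol):
    """Stand-in for unstructured_ingest.v2.interfaces.FileData (only the field used here)."""

    identifier: str


@dataclass
class LoggingUploader:
    """Hypothetical destination illustrating the new per-file uploader contract."""

    def is_async(self) -> bool:
        # The real fsspec uploaders return self.fs.async_impl here.
        return False

    def run(self, path: Path, file_data: FileDataLike, **kwargs: Any) -> None:
        # One staged file per call; no more list[UploadContent] batching.
        print(f"would upload {path} for record {file_data.identifier}")
```

With each connector handling a single file per call, batching across staged files appears to move up into the pipeline, which invokes the uploader once per file.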
unstructured_ingest/v2/processes/connectors/fsspec/gcs.py:

@@ -8,7 +8,7 @@ from pydantic import Field, Secret

 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.utils.string_and_date_utils import json_to_dict
-from unstructured_ingest.v2.interfaces import DownloadResponse, FileData
+from unstructured_ingest.v2.interfaces import DownloadResponse, FileData
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -151,8 +151,8 @@ class GcsUploader(FsspecUploader):
         super().precheck()

     @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
-    def run(self,
-        return super().run(
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        return super().run(path=path, file_data=file_data, **kwargs)

     @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
     async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:

unstructured_ingest/v2/processes/connectors/fsspec/s3.py:

@@ -12,7 +12,6 @@ from unstructured_ingest.v2.interfaces import (
     DownloadResponse,
     FileData,
     FileDataSourceMetadata,
-    UploadContent,
 )
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
@@ -171,8 +170,8 @@ class S3Uploader(FsspecUploader):
         super().__post_init__()

     @requires_dependencies(["s3fs", "fsspec"], extras="s3")
-    def run(self,
-        return super().run(
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        return super().run(path=path, file_data=file_data, **kwargs)

     @requires_dependencies(["s3fs", "fsspec"], extras="s3")
     async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:

unstructured_ingest/v2/processes/connectors/fsspec/sftp.py:

@@ -9,7 +9,7 @@ from urllib.parse import urlparse
 from pydantic import Field, Secret

 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces import DownloadResponse, FileData
+from unstructured_ingest.v2.interfaces import DownloadResponse, FileData
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -142,8 +142,8 @@ class SftpUploader(FsspecUploader):
         super().precheck()

     @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
-    def run(self,
-        return super().run(
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        return super().run(path=path, file_data=file_data, **kwargs)

     @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
     async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:

unstructured_ingest/v2/processes/connectors/kdbai.py:

@@ -15,7 +15,6 @@ from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
     FileData,
-    UploadContent,
     Uploader,
     UploaderConfig,
     UploadStager,
@@ -152,13 +151,13 @@ class KdbaiUploader(Uploader):
         df = pd.DataFrame(data=all_records)
         self.process_dataframe(df=df)

-    def run(self,
-
-
-
-
-
-
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        if path.suffix == ".csv":
+            self.process_csv(csv_paths=[path])
+        elif path.suffix == ".json":
+            self.process_json(json_paths=[path])
+        else:
+            raise ValueError(f"Unsupported file type, must be json or csv file: {path}")


 kdbai_destination_entry = DestinationRegistryEntry(

unstructured_ingest/v2/processes/connectors/local.py:

@@ -18,7 +18,6 @@ from unstructured_ingest.v2.interfaces import (
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
-    UploadContent,
     Uploader,
     UploaderConfig,
 )
@@ -179,27 +178,21 @@ class LocalUploader(Uploader):
     def is_async(self) -> bool:
         return False

-    def run(self,
-
-
-
-        identifiers
-
-
-
-
-
-
-
-
-
-        final_path = self.upload_config.output_path / Path(
-            f"{content.file_data.identifier}.json"
-        )
-        Path(final_path).parent.mkdir(parents=True, exist_ok=True)
-        logger.debug(f"copying file from {content.path} to {final_path}")
-        shutil.copy(src=str(content.path), dst=str(final_path))
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        if source_identifiers := file_data.source_identifiers:
+            identifiers = source_identifiers
+            rel_path = (
+                identifiers.relative_path[1:]
+                if identifiers.relative_path.startswith("/")
+                else identifiers.relative_path
+            )
+            new_path = self.upload_config.output_path / Path(rel_path)
+            final_path = str(new_path).replace(identifiers.filename, f"{identifiers.filename}.json")
+        else:
+            final_path = self.upload_config.output_path / Path(f"{file_data.identifier}.json")
+        Path(final_path).parent.mkdir(parents=True, exist_ok=True)
+        logger.debug(f"copying file from {path} to {final_path}")
+        shutil.copy(src=str(path), dst=str(final_path))


 local_source_entry = SourceRegistryEntry(

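The local destination now derives its output location from `file_data.source_identifiers` when present (mirroring the source directory layout and appending `.json` to the original filename), and only falls back to `<identifier>.json` otherwise. A standalone sketch of that path mapping; the helper name and the example values are made up, the logic mirrors the added lines above:

```python
from pathlib import Path
from typing import Optional


def resolve_output_path(output_dir: Path, relative_path: Optional[str],
                        filename: Optional[str], identifier: str) -> str:
    """Hypothetical helper mirroring the LocalUploader.run path logic shown above."""
    if relative_path and filename:
        rel = relative_path[1:] if relative_path.startswith("/") else relative_path
        new_path = output_dir / Path(rel)
        return str(new_path).replace(filename, f"{filename}.json")
    return str(output_dir / f"{identifier}.json")


# A file indexed as /docs/report.pdf lands at structured-output/docs/report.pdf.json;
# without source identifiers it falls back to structured-output/abc123.json.
print(resolve_output_path(Path("structured-output"), "/docs/report.pdf", "report.pdf", "abc123"))
print(resolve_output_path(Path("structured-output"), None, None, "abc123"))
```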
unstructured_ingest/v2/processes/connectors/milvus.py:

@@ -1,5 +1,4 @@
 import json
-import multiprocessing as mp
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Optional, Union
@@ -135,9 +134,6 @@ class MilvusUploadStager(UploadStager):

 class MilvusUploaderConfig(UploaderConfig):
     collection_name: str = Field(description="Milvus collections to write to")
-    num_processes: int = Field(
-        default=4, description="number of processes to use when writing to support parallel writes"
-    )


 @dataclass
@@ -183,16 +179,8 @@ class MilvusUploader(Uploader):
             data: list[dict] = json.load(file)
         self.insert_results(data=data)

-    def run(self,
-
-        for content in contents:
-            self.upload(content=content)
-
-        else:
-            with mp.Pool(
-                processes=self.upload_config.num_processes,
-            ) as pool:
-                pool.map(self.upload, contents)
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        self.upload(content=UploadContent(path=path, file_data=file_data))


 milvus_destination_entry = DestinationRegistryEntry(

unstructured_ingest/v2/processes/connectors/mongodb.py:

@@ -13,7 +13,6 @@ from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
     FileData,
-    UploadContent,
     Uploader,
     UploaderConfig,
     UploadStager,
@@ -119,13 +118,9 @@ class MongoDBUploader(Uploader):
             server_api=ServerApi(version=SERVER_API_VERSION),
         )

-    def run(self,
-
-
-        with open(content.path) as elements_file:
-            elements = json.load(elements_file)
-            elements_dict.extend(elements)
-
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        with path.open("r") as file:
+            elements_dict = json.load(file)
         logger.info(
             f"writing {len(elements_dict)} objects to destination "
             f"db, {self.connection_config.database}, "

unstructured_ingest/v2/processes/connectors/pinecone.py:

@@ -1,5 +1,4 @@
 import json
-import multiprocessing as mp
 import uuid
 from dataclasses import dataclass, field
 from pathlib import Path
@@ -13,7 +12,7 @@ from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
-
+    FileData,
     Uploader,
     UploaderConfig,
     UploadStager,
@@ -68,7 +67,6 @@ class PineconeUploadStagerConfig(UploadStagerConfig):

 class PineconeUploaderConfig(UploaderConfig):
     batch_size: int = Field(default=100, description="Number of records per batch")
-    num_processes: int = Field(default=4, description="Number of processes to use for uploading")


 @dataclass
@@ -143,34 +141,18 @@ class PineconeUploader(Uploader):
             raise DestinationConnectionError(f"http error: {api_error}") from api_error
         logger.debug(f"results: {response}")

-    def run(self,
-
-
-        for content in contents:
-            with open(content.path) as elements_file:
-                elements = json.load(elements_file)
-                elements_dict.extend(elements)
-
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        with path.open("r") as file:
+            elements_dict = json.load(file)
         logger.info(
             f"writing document batches to destination"
             f" index named {self.connection_config.index_name}"
             f" with batch size {self.upload_config.batch_size}"
-            f" with {self.upload_config.num_processes} (number of) processes"
         )

         pinecone_batch_size = self.upload_config.batch_size
-
-
-        for batch in batch_generator(elements_dict, pinecone_batch_size):
-            self.upsert_batch(batch)  # noqa: E203
-
-        else:
-            with mp.Pool(
-                processes=self.upload_config.num_processes,
-            ) as pool:
-                pool.map(
-                    self.upsert_batch, list(batch_generator(elements_dict, pinecone_batch_size))
-                )
+        for pinecone_batch in batch_generator(elements_dict, pinecone_batch_size):
+            self.upsert_batch(batch=pinecone_batch)


 pinecone_destination_entry = DestinationRegistryEntry(

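In both the Milvus and Pinecone destinations the `multiprocessing` pool and the `num_processes` config option are gone; each `run` call now reads one staged JSON file and upserts its batches sequentially via `batch_generator`. A rough sketch of that batching idea; `chunked` below is a simple stand-in, not the library's `batch_generator`:

```python
from typing import Any, Iterator


def chunked(records: list[dict[str, Any]], batch_size: int) -> Iterator[list[dict[str, Any]]]:
    """Yield successive fixed-size batches (stand-in for batch_generator)."""
    for start in range(0, len(records), batch_size):
        yield records[start : start + batch_size]


def upsert_all(records: list[dict[str, Any]], batch_size: int = 100) -> None:
    # Mirrors the sequential loop in PineconeUploader.run: one batch at a time, no process pool.
    for batch in chunked(records, batch_size):
        print(f"would upsert a batch of {len(batch)} records")


upsert_all([{"id": str(i)} for i in range(250)])
```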
unstructured_ingest/v2/processes/connectors/singlestore.py:

@@ -16,7 +16,6 @@ from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
     FileData,
-    UploadContent,
     Uploader,
     UploaderConfig,
     UploadStager,
@@ -120,8 +119,8 @@ class SingleStoreUploader(Uploader):
     upload_config: SingleStoreUploaderConfig
     connector_type: str = CONNECTOR_TYPE

-    def upload_csv(self,
-        df = pd.read_csv(
+    def upload_csv(self, csv_path: Path) -> None:
+        df = pd.read_csv(csv_path)
         logger.debug(
             f"uploading {len(df)} entries to {self.connection_config.database} "
             f"db in table {self.upload_config.table_name}"
@@ -142,9 +141,10 @@ class SingleStoreUploader(Uploader):
                 cur.executemany(stmt, chunk)
                 conn.commit()

-    def run(self,
-
-
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        if path.suffix != ".csv":
+            raise ValueError(f"Only .csv files are supported: {path}")
+        self.upload_csv(csv_path=path)


 singlestore_destination_entry = DestinationRegistryEntry(

unstructured_ingest/v2/processes/connectors/sql.py:

@@ -16,7 +16,6 @@ from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
     FileData,
-    UploadContent,
     Uploader,
     UploaderConfig,
     UploadStager,
@@ -246,8 +245,8 @@ class SQLUploader(Uploader):
             output.append(tuple(parsed))
         return output

-    def upload_contents(self,
-        df = pd.read_json(
+    def upload_contents(self, path: Path) -> None:
+        df = pd.read_json(path, orient="records", lines=True)
         logger.debug(f"uploading {len(df)} entries to {self.connection_config.database} ")
         df.replace({np.nan: None}, inplace=True)

@@ -256,7 +255,7 @@ class SQLUploader(Uploader):
             VALUES({','.join(['?' if self.connection_config.db_type==SQLITE_DB else '%s' for x in columns])})"  # noqa E501

         for rows in pd.read_json(
-
+            path, orient="records", lines=True, chunksize=self.upload_config.batch_size
         ):
             with self.connection() as conn:
                 values = self.prepare_data(columns, tuple(rows.itertuples(index=False, name=None)))
@@ -268,9 +267,8 @@ class SQLUploader(Uploader):

             conn.commit()

-    def run(self,
-
-        self.upload_contents(content=content)
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        self.upload_contents(path=path)


 sql_destination_entry = DestinationRegistryEntry(

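The SQL uploader now takes the staged file path directly and streams it with pandas: `pd.read_json(..., orient="records", lines=True, chunksize=batch_size)` returns an iterator of DataFrame chunks, so rows can be inserted batch by batch instead of holding the whole file in memory. A small self-contained illustration of that chunked-read pattern (the file name and batch size here are made up):

```python
import pandas as pd

# Create a tiny JSON Lines file so the example runs on its own.
with open("elements.ndjson", "w") as f:
    f.write('{"text": "hello", "page_number": 1}\n')
    f.write('{"text": "world", "page_number": 2}\n')
    f.write('{"text": "again", "page_number": 3}\n')

# With chunksize set, read_json yields DataFrames of at most `chunksize` rows each.
for rows in pd.read_json("elements.ndjson", orient="records", lines=True, chunksize=2):
    batch = tuple(rows.itertuples(index=False, name=None))
    print(f"would executemany() over {len(batch)} row(s)")
```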
unstructured_ingest/v2/processes/connectors/weaviate.py:

@@ -13,7 +13,6 @@ from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
     FileData,
-    UploadContent,
     Uploader,
     UploaderConfig,
     UploadStager,
@@ -184,7 +183,7 @@ class WeaviateUploader(Uploader):

     @requires_dependencies(["weaviate"], extras="weaviate")
     def _resolve_auth_method(self):
-        access_configs = self.connection_config.access_config
+        access_configs = self.connection_config.access_config.get_secret_value()
         connection_config = self.connection_config
         if connection_config.anonymous:
             return None
@@ -216,15 +215,9 @@ class WeaviateUploader(Uploader):
         )
         return None

-    def run(self,
-
-
-        elements_dict = []
-        for content in contents:
-            with open(content.path) as elements_file:
-                elements = json.load(elements_file)
-                elements_dict.extend(elements)
-
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        with path.open("r") as file:
+            elements_dict = json.load(file)
         logger.info(
             f"writing {len(elements_dict)} objects to destination "
             f"class {self.connection_config.class_name} "

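The one-line weaviate fix reflects access configs now being wrapped in pydantic's `Secret`, so the nested values have to be unwrapped with `get_secret_value()` before use. A tiny illustration of that wrapper with made-up config classes (not the connector's real models):

```python
from pydantic import BaseModel, Secret


class ExampleAccessConfig(BaseModel):
    api_key: str


class ExampleConnectionConfig(BaseModel):
    # Secret[...] keeps the nested values masked in repr/serialization until unwrapped.
    access_config: Secret[ExampleAccessConfig]


conn = ExampleConnectionConfig(access_config=ExampleAccessConfig(api_key="not-a-real-key"))
print(conn.access_config)                             # masked
print(conn.access_config.get_secret_value().api_key)  # "not-a-real-key"
```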
unstructured_ingest/v2/processes/partitioner.py:

@@ -13,6 +13,7 @@ from unstructured_ingest.v2.logger import logger

 if TYPE_CHECKING:
     from unstructured_client import UnstructuredClient
+    from unstructured_client.models.operations import PartitionRequest
     from unstructured_client.models.shared import PartitionParameters


@@ -153,7 +154,7 @@ class Partitioner(BaseProcess, ABC):
         )
         return self.postprocess(elements=elements_to_dicts(elements))

-    async def call_api(self, client: "UnstructuredClient", request: "
+    async def call_api(self, client: "UnstructuredClient", request: "PartitionRequest"):
         # TODO when client supports async, run without using run_in_executor
         # isolate the IO heavy call
         loop = asyncio.get_event_loop()
@@ -163,7 +164,14 @@ class Partitioner(BaseProcess, ABC):
         from unstructured_client.models.shared import Files, PartitionParameters

         partition_request = self.config.to_partition_kwargs()
-
+
+        # Note(austin): PartitionParameters is a Pydantic model in v0.26.0
+        # Prior to this it was a dataclass which doesn't have .__fields
+        try:
+            possible_fields = PartitionParameters.__fields__
+        except AttributeError:
+            possible_fields = [f.name for f in fields(PartitionParameters)]
+
         filtered_partition_request = {
             k: v for k, v in partition_request.items() if k in possible_fields
         }
@@ -189,6 +197,7 @@ class Partitioner(BaseProcess, ABC):
         self, filename: Path, metadata: Optional[dict] = None, **kwargs
     ) -> list[dict]:
         from unstructured_client import UnstructuredClient
+        from unstructured_client.models.operations import PartitionRequest

         logger.debug(f"partitioning file {filename} with metadata: {metadata}")
         client = UnstructuredClient(
@@ -196,7 +205,8 @@ class Partitioner(BaseProcess, ABC):
             api_key_auth=self.config.api_key.get_secret_value(),
         )
         partition_params = self.create_partition_parameters(filename=filename)
-
+        partition_request = PartitionRequest(partition_params)
+        resp = await self.call_api(client=client, request=partition_request)
         elements = resp.elements or []
         # Append the data source metadata the auto partition does for you
         for element in elements:

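The partitioner now builds a `PartitionRequest` around the partition parameters before calling the client, and guards against two `unstructured-client` generations: from v0.26.0 `PartitionParameters` is a Pydantic model (so it exposes `__fields__`), while earlier releases shipped it as a dataclass (so field names come from `dataclasses.fields`). A standalone sketch of that feature-detection pattern, using a stand-in class rather than the real client model:

```python
from dataclasses import dataclass, fields


@dataclass
class FakePartitionParameters:
    """Stand-in for unstructured_client's PartitionParameters (the dataclass generation)."""

    files: object = None
    strategy: str = "auto"


def allowed_fields(params_cls) -> set:
    # Pydantic models expose __fields__; plain dataclasses need dataclasses.fields().
    try:
        return set(params_cls.__fields__)
    except AttributeError:
        return {f.name for f in fields(params_cls)}


requested = {"strategy": "hi_res", "not_a_real_kwarg": True}
filtered = {k: v for k, v in requested.items() if k in allowed_fields(FakePartitionParameters)}
print(filtered)  # {'strategy': 'hi_res'}: unknown keys are dropped before building the request
```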