unstructured-ingest 0.0.5__py3-none-any.whl → 0.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (39)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/v2/interfaces/processor.py +6 -1
  3. unstructured_ingest/v2/interfaces/uploader.py +9 -4
  4. unstructured_ingest/v2/otel.py +111 -0
  5. unstructured_ingest/v2/pipeline/interfaces.py +61 -28
  6. unstructured_ingest/v2/pipeline/otel.py +32 -0
  7. unstructured_ingest/v2/pipeline/pipeline.py +11 -7
  8. unstructured_ingest/v2/pipeline/steps/index.py +2 -0
  9. unstructured_ingest/v2/pipeline/steps/upload.py +7 -19
  10. unstructured_ingest/v2/processes/chunker.py +3 -1
  11. unstructured_ingest/v2/processes/connectors/astradb.py +3 -8
  12. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +4 -9
  13. unstructured_ingest/v2/processes/connectors/chroma.py +3 -8
  14. unstructured_ingest/v2/processes/connectors/couchbase.py +5 -9
  15. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +9 -10
  16. unstructured_ingest/v2/processes/connectors/elasticsearch.py +4 -7
  17. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +3 -3
  18. unstructured_ingest/v2/processes/connectors/fsspec/box.py +3 -3
  19. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +3 -3
  20. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +4 -6
  21. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +3 -3
  22. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +2 -3
  23. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +3 -3
  24. unstructured_ingest/v2/processes/connectors/kdbai.py +7 -8
  25. unstructured_ingest/v2/processes/connectors/local.py +15 -22
  26. unstructured_ingest/v2/processes/connectors/milvus.py +2 -14
  27. unstructured_ingest/v2/processes/connectors/mongodb.py +3 -8
  28. unstructured_ingest/v2/processes/connectors/pinecone.py +6 -24
  29. unstructured_ingest/v2/processes/connectors/singlestore.py +6 -6
  30. unstructured_ingest/v2/processes/connectors/sql.py +5 -7
  31. unstructured_ingest/v2/processes/connectors/weaviate.py +4 -11
  32. unstructured_ingest/v2/processes/partitioner.py +13 -3
  33. {unstructured_ingest-0.0.5.dist-info → unstructured_ingest-0.0.7.dist-info}/METADATA +275 -211
  34. {unstructured_ingest-0.0.5.dist-info → unstructured_ingest-0.0.7.dist-info}/RECORD +38 -37
  35. unstructured_ingest/v2/example.py +0 -37
  36. {unstructured_ingest-0.0.5.dist-info → unstructured_ingest-0.0.7.dist-info}/LICENSE.md +0 -0
  37. {unstructured_ingest-0.0.5.dist-info → unstructured_ingest-0.0.7.dist-info}/WHEEL +0 -0
  38. {unstructured_ingest-0.0.5.dist-info → unstructured_ingest-0.0.7.dist-info}/entry_points.txt +0 -0
  39. {unstructured_ingest-0.0.5.dist-info → unstructured_ingest-0.0.7.dist-info}/top_level.txt +0 -0
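
The change common to most connectors below is the Uploader interface: run() used to accept a batch, run(contents: list[UploadContent], **kwargs), and now receives one staged file at a time, run(path: Path, file_data: FileData, **kwargs). A minimal sketch of a destination connector written against the new signature; the PrintUploader class is hypothetical and omits pieces a real connector needs (its own config class, connector_type, registry entry):

    import json
    from dataclasses import dataclass
    from pathlib import Path
    from typing import Any

    from unstructured_ingest.v2.interfaces import FileData, Uploader, UploaderConfig


    @dataclass
    class PrintUploader(Uploader):
        # hypothetical sketch: real connectors also define their own
        # UploaderConfig subclass and set connector_type
        upload_config: UploaderConfig
        connector_type: str = "print"

        def is_async(self) -> bool:
            return False

        def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
            # one staged JSON file of elements per call, plus its FileData record
            with path.open("r") as f:
                elements = json.load(f)
            print(f"{file_data.identifier}: {len(elements)} elements")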

unstructured_ingest/v2/processes/connectors/fsspec/azure.py
@@ -7,7 +7,7 @@ from typing import Any, Generator, Optional
 from pydantic import Field, Secret
 
 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, UploadContent
+from unstructured_ingest.v2.interfaces import DownloadResponse, FileData
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -152,8 +152,8 @@ class AzureUploader(FsspecUploader):
         super().precheck()
 
     @requires_dependencies(["adlfs", "fsspec"], extras="azure")
-    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
-        return super().run(contents=contents, **kwargs)
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        return super().run(path=path, file_data=file_data, **kwargs)
 
     @requires_dependencies(["adlfs", "fsspec"], extras="azure")
     async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:

unstructured_ingest/v2/processes/connectors/fsspec/box.py
@@ -7,7 +7,7 @@ from typing import Any, Generator, Optional
 from pydantic import Field, Secret
 
 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, UploadContent
+from unstructured_ingest.v2.interfaces import DownloadResponse, FileData
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -118,8 +118,8 @@ class BoxUploader(FsspecUploader):
         super().precheck()
 
     @requires_dependencies(["boxfs"], extras="box")
-    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
-        return super().run(contents=contents, **kwargs)
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        return super().run(path=path, file_data=file_data, **kwargs)
 
     @requires_dependencies(["boxfs"], extras="box")
     async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:

unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py
@@ -7,7 +7,7 @@ from typing import Any, Generator, Optional
 from pydantic import Field, Secret
 
 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, UploadContent
+from unstructured_ingest.v2.interfaces import DownloadResponse, FileData
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -114,8 +114,8 @@ class DropboxUploader(FsspecUploader):
         super().precheck()
 
     @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
-    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
-        return super().run(contents=contents, **kwargs)
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        return super().run(path=path, file_data=file_data, **kwargs)
 
     @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
     async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:

unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py
@@ -26,7 +26,6 @@ from unstructured_ingest.v2.interfaces import (
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
-    UploadContent,
     Uploader,
     UploaderConfig,
 )
@@ -273,6 +272,9 @@ class FsspecUploader(Uploader):
     connector_type: str = CONNECTOR_TYPE
     upload_config: FsspecUploaderConfigT = field(default=None)
 
+    def is_async(self) -> bool:
+        return self.fs.async_impl
+
     @property
     def fs(self) -> "AbstractFileSystem":
         from fsspec import get_filesystem_class
@@ -311,11 +313,7 @@ class FsspecUploader(Uploader):
         updated_upload_path = upload_path.parent / f"{upload_path.name}.json"
         return updated_upload_path
 
-    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
-        for content in contents:
-            self._run(path=content.path, file_data=content.file_data)
-
-    def _run(self, path: Path, file_data: FileData) -> None:
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         path_str = str(path.resolve())
         upload_path = self.get_upload_path(file_data=file_data)
         if self.fs.exists(path=str(upload_path)) and not self.upload_config.overwrite:
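
FsspecUploader now derives is_async from the underlying filesystem class via fs.async_impl. A quick illustration of what that attribute reports for a couple of fsspec backends, assuming fsspec is installed (plus s3fs for the second protocol):

    from fsspec import get_filesystem_class

    # async_impl is a class attribute on fsspec filesystems: False for the
    # synchronous LocalFileSystem ("file"), True for async backends such as
    # s3fs ("s3"), which is what lets the pipeline route uploads through run_async.
    for protocol in ("file", "s3"):
        fs_cls = get_filesystem_class(protocol)
        print(protocol, fs_cls.async_impl)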

unstructured_ingest/v2/processes/connectors/fsspec/gcs.py
@@ -8,7 +8,7 @@ from pydantic import Field, Secret
 
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.utils.string_and_date_utils import json_to_dict
-from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, UploadContent
+from unstructured_ingest.v2.interfaces import DownloadResponse, FileData
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -151,8 +151,8 @@ class GcsUploader(FsspecUploader):
         super().precheck()
 
     @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
-    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
-        return super().run(contents=contents, **kwargs)
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        return super().run(path=path, file_data=file_data, **kwargs)
 
     @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
     async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:

unstructured_ingest/v2/processes/connectors/fsspec/s3.py
@@ -12,7 +12,6 @@ from unstructured_ingest.v2.interfaces import (
     DownloadResponse,
     FileData,
     FileDataSourceMetadata,
-    UploadContent,
 )
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
@@ -171,8 +170,8 @@ class S3Uploader(FsspecUploader):
         super().__post_init__()
 
     @requires_dependencies(["s3fs", "fsspec"], extras="s3")
-    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
-        return super().run(contents=contents, **kwargs)
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        return super().run(path=path, file_data=file_data, **kwargs)
 
     @requires_dependencies(["s3fs", "fsspec"], extras="s3")
     async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:

unstructured_ingest/v2/processes/connectors/fsspec/sftp.py
@@ -9,7 +9,7 @@ from urllib.parse import urlparse
 from pydantic import Field, Secret
 
 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, UploadContent
+from unstructured_ingest.v2.interfaces import DownloadResponse, FileData
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -142,8 +142,8 @@ class SftpUploader(FsspecUploader):
         super().precheck()
 
     @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
-    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
-        return super().run(contents=contents, **kwargs)
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        return super().run(path=path, file_data=file_data, **kwargs)
 
     @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
     async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:

unstructured_ingest/v2/processes/connectors/kdbai.py
@@ -15,7 +15,6 @@ from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
     FileData,
-    UploadContent,
     Uploader,
     UploaderConfig,
     UploadStager,
@@ -152,13 +151,13 @@ class KdbaiUploader(Uploader):
         df = pd.DataFrame(data=all_records)
         self.process_dataframe(df=df)
 
-    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
-        csv_paths = [c.path for c in contents if c.path.suffix == ".csv"]
-        if csv_paths:
-            self.process_csv(csv_paths=csv_paths)
-        json_paths = [c.path for c in contents if c.path.suffix == ".json"]
-        if json_paths:
-            self.process_json(json_paths=json_paths)
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        if path.suffix == ".csv":
+            self.process_csv(csv_paths=[path])
+        elif path.suffix == ".json":
+            self.process_json(json_paths=[path])
+        else:
+            raise ValueError(f"Unsupported file type, must be json or csv file: {path}")
 
 
 kdbai_destination_entry = DestinationRegistryEntry(

unstructured_ingest/v2/processes/connectors/local.py
@@ -18,7 +18,6 @@ from unstructured_ingest.v2.interfaces import (
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
-    UploadContent,
     Uploader,
     UploaderConfig,
 )
@@ -179,27 +178,21 @@ class LocalUploader(Uploader):
     def is_async(self) -> bool:
         return False
 
-    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
-        self.upload_config.output_path.mkdir(parents=True, exist_ok=True)
-        for content in contents:
-            if source_identifiers := content.file_data.source_identifiers:
-                identifiers = source_identifiers
-                rel_path = (
-                    identifiers.relative_path[1:]
-                    if identifiers.relative_path.startswith("/")
-                    else identifiers.relative_path
-                )
-                new_path = self.upload_config.output_path / Path(rel_path)
-                final_path = str(new_path).replace(
-                    identifiers.filename, f"{identifiers.filename}.json"
-                )
-            else:
-                final_path = self.upload_config.output_path / Path(
-                    f"{content.file_data.identifier}.json"
-                )
-            Path(final_path).parent.mkdir(parents=True, exist_ok=True)
-            logger.debug(f"copying file from {content.path} to {final_path}")
-            shutil.copy(src=str(content.path), dst=str(final_path))
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        if source_identifiers := file_data.source_identifiers:
+            identifiers = source_identifiers
+            rel_path = (
+                identifiers.relative_path[1:]
+                if identifiers.relative_path.startswith("/")
+                else identifiers.relative_path
+            )
+            new_path = self.upload_config.output_path / Path(rel_path)
+            final_path = str(new_path).replace(identifiers.filename, f"{identifiers.filename}.json")
+        else:
+            final_path = self.upload_config.output_path / Path(f"{file_data.identifier}.json")
+        Path(final_path).parent.mkdir(parents=True, exist_ok=True)
+        logger.debug(f"copying file from {path} to {final_path}")
+        shutil.copy(src=str(path), dst=str(final_path))
 
 
 local_source_entry = SourceRegistryEntry(
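
For reference, the per-file path logic above resolves like this; the concrete values are hypothetical and only illustrate the relative_path/filename handling from the diff:

    from pathlib import Path

    output_path = Path("/tmp/ingest-output")    # hypothetical upload_config.output_path
    relative_path = "/docs/report.pdf"          # hypothetical source_identifiers.relative_path
    filename = "report.pdf"                     # hypothetical source_identifiers.filename

    # strip a leading "/" so the path stays relative to output_path
    rel_path = relative_path[1:] if relative_path.startswith("/") else relative_path
    new_path = output_path / Path(rel_path)
    # the staged output is JSON, so ".json" is appended to the original filename
    final_path = str(new_path).replace(filename, f"{filename}.json")
    print(final_path)  # /tmp/ingest-output/docs/report.pdf.json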

unstructured_ingest/v2/processes/connectors/milvus.py
@@ -1,5 +1,4 @@
 import json
-import multiprocessing as mp
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Optional, Union
@@ -135,9 +134,6 @@ class MilvusUploadStager(UploadStager):
 
 class MilvusUploaderConfig(UploaderConfig):
     collection_name: str = Field(description="Milvus collections to write to")
-    num_processes: int = Field(
-        default=4, description="number of processes to use when writing to support parallel writes"
-    )
 
 
 @dataclass
@@ -183,16 +179,8 @@ class MilvusUploader(Uploader):
             data: list[dict] = json.load(file)
         self.insert_results(data=data)
 
-    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
-        if self.upload_config.num_processes == 1:
-            for content in contents:
-                self.upload(content=content)
-
-        else:
-            with mp.Pool(
-                processes=self.upload_config.num_processes,
-            ) as pool:
-                pool.map(self.upload, contents)
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        self.upload(content=UploadContent(path=path, file_data=file_data))
 
 
 milvus_destination_entry = DestinationRegistryEntry(

unstructured_ingest/v2/processes/connectors/mongodb.py
@@ -13,7 +13,6 @@ from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
     FileData,
-    UploadContent,
     Uploader,
     UploaderConfig,
     UploadStager,
@@ -119,13 +118,9 @@ class MongoDBUploader(Uploader):
             server_api=ServerApi(version=SERVER_API_VERSION),
         )
 
-    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
-        elements_dict = []
-        for content in contents:
-            with open(content.path) as elements_file:
-                elements = json.load(elements_file)
-            elements_dict.extend(elements)
-
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        with path.open("r") as file:
+            elements_dict = json.load(file)
         logger.info(
             f"writing {len(elements_dict)} objects to destination "
             f"db, {self.connection_config.database}, "

unstructured_ingest/v2/processes/connectors/pinecone.py
@@ -1,5 +1,4 @@
 import json
-import multiprocessing as mp
 import uuid
 from dataclasses import dataclass, field
 from pathlib import Path
@@ -13,7 +12,7 @@ from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
-    UploadContent,
+    FileData,
     Uploader,
     UploaderConfig,
     UploadStager,
@@ -68,7 +67,6 @@ class PineconeUploadStagerConfig(UploadStagerConfig):
 
 class PineconeUploaderConfig(UploaderConfig):
     batch_size: int = Field(default=100, description="Number of records per batch")
-    num_processes: int = Field(default=4, description="Number of processes to use for uploading")
 
 
 @dataclass
@@ -143,34 +141,18 @@ class PineconeUploader(Uploader):
            raise DestinationConnectionError(f"http error: {api_error}") from api_error
        logger.debug(f"results: {response}")
 
-    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
-
-        elements_dict = []
-        for content in contents:
-            with open(content.path) as elements_file:
-                elements = json.load(elements_file)
-            elements_dict.extend(elements)
-
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        with path.open("r") as file:
+            elements_dict = json.load(file)
         logger.info(
             f"writing document batches to destination"
             f" index named {self.connection_config.index_name}"
             f" with batch size {self.upload_config.batch_size}"
-            f" with {self.upload_config.num_processes} (number of) processes"
         )
 
         pinecone_batch_size = self.upload_config.batch_size
-
-        if self.upload_config.num_processes == 1:
-            for batch in batch_generator(elements_dict, pinecone_batch_size):
-                self.upsert_batch(batch)  # noqa: E203
-
-        else:
-            with mp.Pool(
-                processes=self.upload_config.num_processes,
-            ) as pool:
-                pool.map(
-                    self.upsert_batch, list(batch_generator(elements_dict, pinecone_batch_size))
-                )
+        for pinecone_batch in batch_generator(elements_dict, pinecone_batch_size):
+            self.upsert_batch(batch=pinecone_batch)
 
 
 pinecone_destination_entry = DestinationRegistryEntry(
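
With the multiprocessing pool gone, the Pinecone uploader just iterates fixed-size batches of element dicts and upserts each one. A stand-in sketch of what a batch generator of that shape does (the connector imports its own batch_generator helper; the function below is illustrative only):

    from typing import Any, Generator

    def batch_generator(items: list[Any], batch_size: int) -> Generator[list[Any], None, None]:
        # yield successive slices of at most batch_size items
        for start in range(0, len(items), batch_size):
            yield items[start : start + batch_size]

    # e.g. 250 staged element dicts with batch_size=100 -> batches of 100, 100 and 50
    print([len(batch) for batch in batch_generator(list(range(250)), 100)])  # [100, 100, 50]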

unstructured_ingest/v2/processes/connectors/singlestore.py
@@ -16,7 +16,6 @@ from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
     FileData,
-    UploadContent,
     Uploader,
     UploaderConfig,
     UploadStager,
@@ -120,8 +119,8 @@ class SingleStoreUploader(Uploader):
     upload_config: SingleStoreUploaderConfig
     connector_type: str = CONNECTOR_TYPE
 
-    def upload_csv(self, content: UploadContent) -> None:
-        df = pd.read_csv(content.path)
+    def upload_csv(self, csv_path: Path) -> None:
+        df = pd.read_csv(csv_path)
         logger.debug(
             f"uploading {len(df)} entries to {self.connection_config.database} "
             f"db in table {self.upload_config.table_name}"
@@ -142,9 +141,10 @@ class SingleStoreUploader(Uploader):
                     cur.executemany(stmt, chunk)
                conn.commit()
 
-    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
-        for content in contents:
-            self.upload_csv(content=content)
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        if path.suffix != ".csv":
+            raise ValueError(f"Only .csv files are supported: {path}")
+        self.upload_csv(csv_path=path)
 
 
 singlestore_destination_entry = DestinationRegistryEntry(

unstructured_ingest/v2/processes/connectors/sql.py
@@ -16,7 +16,6 @@ from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
     FileData,
-    UploadContent,
     Uploader,
     UploaderConfig,
     UploadStager,
@@ -246,8 +245,8 @@ class SQLUploader(Uploader):
            output.append(tuple(parsed))
        return output
 
-    def upload_contents(self, content: UploadContent) -> None:
-        df = pd.read_json(content.path, orient="records", lines=True)
+    def upload_contents(self, path: Path) -> None:
+        df = pd.read_json(path, orient="records", lines=True)
        logger.debug(f"uploading {len(df)} entries to {self.connection_config.database} ")
        df.replace({np.nan: None}, inplace=True)
 
@@ -256,7 +255,7 @@
            VALUES({','.join(['?' if self.connection_config.db_type==SQLITE_DB else '%s' for x in columns])})"  # noqa E501
 
        for rows in pd.read_json(
-            content.path, orient="records", lines=True, chunksize=self.upload_config.batch_size
+            path, orient="records", lines=True, chunksize=self.upload_config.batch_size
        ):
            with self.connection() as conn:
                values = self.prepare_data(columns, tuple(rows.itertuples(index=False, name=None)))
@@ -268,9 +267,8 @@
 
            conn.commit()
 
-    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
-        for content in contents:
-            self.upload_contents(content=content)
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        self.upload_contents(path=path)
 
 
 sql_destination_entry = DestinationRegistryEntry(

unstructured_ingest/v2/processes/connectors/weaviate.py
@@ -13,7 +13,6 @@ from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
     FileData,
-    UploadContent,
     Uploader,
     UploaderConfig,
     UploadStager,
@@ -184,7 +183,7 @@ class WeaviateUploader(Uploader):
 
     @requires_dependencies(["weaviate"], extras="weaviate")
     def _resolve_auth_method(self):
-        access_configs = self.connection_config.access_config
+        access_configs = self.connection_config.access_config.get_secret_value()
         connection_config = self.connection_config
         if connection_config.anonymous:
             return None
@@ -216,15 +215,9 @@ class WeaviateUploader(Uploader):
         )
         return None
 
-    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
-        # TODO update to use async support in weaviate client
-        # once the version can be bumped to include it
-        elements_dict = []
-        for content in contents:
-            with open(content.path) as elements_file:
-                elements = json.load(elements_file)
-            elements_dict.extend(elements)
-
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        with path.open("r") as file:
+            elements_dict = json.load(file)
         logger.info(
             f"writing {len(elements_dict)} objects to destination "
             f"class {self.connection_config.class_name} "

unstructured_ingest/v2/processes/partitioner.py
@@ -13,6 +13,7 @@ from unstructured_ingest.v2.logger import logger
 
 if TYPE_CHECKING:
     from unstructured_client import UnstructuredClient
+    from unstructured_client.models.operations import PartitionRequest
     from unstructured_client.models.shared import PartitionParameters
 
 
@@ -153,7 +154,7 @@ class Partitioner(BaseProcess, ABC):
         )
         return self.postprocess(elements=elements_to_dicts(elements))
 
-    async def call_api(self, client: "UnstructuredClient", request: "PartitionParameters"):
+    async def call_api(self, client: "UnstructuredClient", request: "PartitionRequest"):
         # TODO when client supports async, run without using run_in_executor
         # isolate the IO heavy call
         loop = asyncio.get_event_loop()
@@ -163,7 +164,14 @@ class Partitioner(BaseProcess, ABC):
         from unstructured_client.models.shared import Files, PartitionParameters
 
         partition_request = self.config.to_partition_kwargs()
-        possible_fields = [f.name for f in fields(PartitionParameters)]
+
+        # Note(austin): PartitionParameters is a Pydantic model in v0.26.0
+        # Prior to this it was a dataclass which doesn't have .__fields
+        try:
+            possible_fields = PartitionParameters.__fields__
+        except AttributeError:
+            possible_fields = [f.name for f in fields(PartitionParameters)]
+
         filtered_partition_request = {
             k: v for k, v in partition_request.items() if k in possible_fields
         }
@@ -189,6 +197,7 @@ class Partitioner(BaseProcess, ABC):
         self, filename: Path, metadata: Optional[dict] = None, **kwargs
     ) -> list[dict]:
         from unstructured_client import UnstructuredClient
+        from unstructured_client.models.operations import PartitionRequest
 
         logger.debug(f"partitioning file {filename} with metadata: {metadata}")
         client = UnstructuredClient(
@@ -196,7 +205,8 @@ class Partitioner(BaseProcess, ABC):
             api_key_auth=self.config.api_key.get_secret_value(),
         )
         partition_params = self.create_partition_parameters(filename=filename)
-        resp = await self.call_api(client=client, request=partition_params)
+        partition_request = PartitionRequest(partition_params)
+        resp = await self.call_api(client=client, request=partition_request)
         elements = resp.elements or []
         # Append the data source metadata the auto partition does for you
         for element in elements:
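
The field-filtering change above has to cope with PartitionParameters being a dataclass in older unstructured-client releases and a Pydantic model from v0.26.0 on. A self-contained sketch of the same try/except pattern against a hypothetical dataclass stand-in:

    from dataclasses import dataclass, fields

    @dataclass
    class FakePartitionParameters:  # hypothetical stand-in for the client model
        files: object = None
        strategy: str = "auto"

    raw_kwargs = {"strategy": "hi_res", "not_a_real_field": True}

    try:
        # Pydantic models expose __fields__, a dict keyed by field name
        possible_fields = FakePartitionParameters.__fields__
    except AttributeError:
        # dataclasses do not, so fall back to dataclasses.fields()
        possible_fields = [f.name for f in fields(FakePartitionParameters)]

    filtered = {k: v for k, v in raw_kwargs.items() if k in possible_fields}
    print(filtered)  # {'strategy': 'hi_res'}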