unstructured-ingest 0.3.8__py3-none-any.whl → 0.3.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (87)
  1. test/integration/chunkers/test_chunkers.py +0 -11
  2. test/integration/connectors/conftest.py +11 -1
  3. test/integration/connectors/databricks_tests/test_volumes_native.py +4 -3
  4. test/integration/connectors/duckdb/conftest.py +14 -0
  5. test/integration/connectors/duckdb/test_duckdb.py +51 -44
  6. test/integration/connectors/duckdb/test_motherduck.py +37 -48
  7. test/integration/connectors/elasticsearch/test_elasticsearch.py +26 -4
  8. test/integration/connectors/elasticsearch/test_opensearch.py +26 -3
  9. test/integration/connectors/sql/test_postgres.py +103 -92
  10. test/integration/connectors/sql/test_singlestore.py +112 -100
  11. test/integration/connectors/sql/test_snowflake.py +142 -117
  12. test/integration/connectors/sql/test_sqlite.py +87 -76
  13. test/integration/connectors/test_astradb.py +62 -1
  14. test/integration/connectors/test_azure_ai_search.py +25 -3
  15. test/integration/connectors/test_chroma.py +120 -0
  16. test/integration/connectors/test_confluence.py +4 -4
  17. test/integration/connectors/test_delta_table.py +1 -0
  18. test/integration/connectors/test_kafka.py +6 -6
  19. test/integration/connectors/test_milvus.py +21 -0
  20. test/integration/connectors/test_mongodb.py +7 -4
  21. test/integration/connectors/test_neo4j.py +236 -0
  22. test/integration/connectors/test_pinecone.py +25 -1
  23. test/integration/connectors/test_qdrant.py +25 -2
  24. test/integration/connectors/test_s3.py +9 -6
  25. test/integration/connectors/utils/docker.py +6 -0
  26. test/integration/connectors/utils/validation/__init__.py +0 -0
  27. test/integration/connectors/utils/validation/destination.py +88 -0
  28. test/integration/connectors/utils/validation/equality.py +75 -0
  29. test/integration/connectors/utils/{validation.py → validation/source.py} +42 -98
  30. test/integration/connectors/utils/validation/utils.py +36 -0
  31. unstructured_ingest/__version__.py +1 -1
  32. unstructured_ingest/utils/chunking.py +11 -0
  33. unstructured_ingest/utils/data_prep.py +36 -0
  34. unstructured_ingest/v2/interfaces/__init__.py +3 -1
  35. unstructured_ingest/v2/interfaces/file_data.py +58 -14
  36. unstructured_ingest/v2/interfaces/upload_stager.py +70 -6
  37. unstructured_ingest/v2/interfaces/uploader.py +11 -2
  38. unstructured_ingest/v2/pipeline/steps/chunk.py +2 -1
  39. unstructured_ingest/v2/pipeline/steps/download.py +5 -4
  40. unstructured_ingest/v2/pipeline/steps/embed.py +2 -1
  41. unstructured_ingest/v2/pipeline/steps/filter.py +2 -2
  42. unstructured_ingest/v2/pipeline/steps/index.py +4 -4
  43. unstructured_ingest/v2/pipeline/steps/partition.py +3 -2
  44. unstructured_ingest/v2/pipeline/steps/stage.py +5 -3
  45. unstructured_ingest/v2/pipeline/steps/uncompress.py +2 -2
  46. unstructured_ingest/v2/pipeline/steps/upload.py +3 -3
  47. unstructured_ingest/v2/processes/connectors/__init__.py +3 -0
  48. unstructured_ingest/v2/processes/connectors/astradb.py +43 -63
  49. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +16 -40
  50. unstructured_ingest/v2/processes/connectors/chroma.py +36 -59
  51. unstructured_ingest/v2/processes/connectors/couchbase.py +92 -93
  52. unstructured_ingest/v2/processes/connectors/delta_table.py +11 -33
  53. unstructured_ingest/v2/processes/connectors/duckdb/base.py +26 -26
  54. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +29 -20
  55. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +37 -44
  56. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +46 -75
  57. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +12 -35
  58. unstructured_ingest/v2/processes/connectors/fsspec/box.py +12 -35
  59. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +15 -42
  60. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +33 -29
  61. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +12 -34
  62. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +13 -37
  63. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -33
  64. unstructured_ingest/v2/processes/connectors/gitlab.py +32 -31
  65. unstructured_ingest/v2/processes/connectors/google_drive.py +32 -29
  66. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +2 -4
  67. unstructured_ingest/v2/processes/connectors/kdbai.py +44 -70
  68. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +8 -10
  69. unstructured_ingest/v2/processes/connectors/local.py +13 -2
  70. unstructured_ingest/v2/processes/connectors/milvus.py +16 -57
  71. unstructured_ingest/v2/processes/connectors/mongodb.py +99 -108
  72. unstructured_ingest/v2/processes/connectors/neo4j.py +383 -0
  73. unstructured_ingest/v2/processes/connectors/onedrive.py +1 -1
  74. unstructured_ingest/v2/processes/connectors/pinecone.py +3 -33
  75. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +32 -41
  76. unstructured_ingest/v2/processes/connectors/sql/postgres.py +5 -5
  77. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +5 -5
  78. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +5 -5
  79. unstructured_ingest/v2/processes/connectors/sql/sql.py +72 -66
  80. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +5 -5
  81. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +9 -31
  82. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/METADATA +20 -15
  83. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/RECORD +87 -79
  84. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/LICENSE.md +0 -0
  85. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/WHEEL +0 -0
  86. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/entry_points.txt +0 -0
  87. {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/top_level.txt +0 -0
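
Two patterns recur throughout the connector diffs shown below: connection-config client factories (GitLab, Google Drive, kdbai, Milvus, LanceDB) are converted from plain methods into contextlib.contextmanager generators so callers acquire and release clients with a with-block, and several uploaders gain a run_data(data, file_data, ...) entry point that receives already-parsed element dictionaries instead of re-reading staged JSON/CSV files. A minimal sketch of the context-manager pattern, using a hypothetical FakeClient rather than any real connector SDK:

from contextlib import contextmanager
from typing import Generator


class FakeClient:
    """Stand-in for an SDK client such as Gitlab, MilvusClient, or a kdbai Session."""

    def close(self) -> None:
        print("client closed")


@contextmanager
def get_client() -> Generator[FakeClient, None, None]:
    # Create the client lazily, hand it to the caller, and always release it,
    # even if the caller raises inside the with-block.
    client = FakeClient()
    try:
        yield client
    finally:
        client.close()


with get_client() as client:
    pass  # use the client; it is closed automatically afterwards
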
unstructured_ingest/v2/processes/connectors/gitlab.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+from contextlib import contextmanager
 from dataclasses import dataclass
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional
@@ -82,16 +83,18 @@ class GitLabConnectionConfig(ConnectionConfig):
 
     @SourceConnectionError.wrap
     @requires_dependencies(["gitlab"], extras="gitlab")
-    def get_client(self) -> "Gitlab":
+    @contextmanager
+    def get_client(self) -> Generator["Gitlab", None, None]:
         from gitlab import Gitlab
 
         logger.info(f"Connection to GitLab: {self.base_url!r}")
-        gitlab = Gitlab(
+        with Gitlab(
             self.base_url, private_token=self.access_config.get_secret_value().access_token
-        )
-        return gitlab
+        ) as client:
+            yield client
 
-    def get_project(self) -> "Project":
+    @contextmanager
+    def get_project(self) -> Generator["Project", None, None]:
         """Retrieves the specified GitLab project using the configured base URL and access token.
 
         Returns:
@@ -101,13 +104,12 @@ class GitLabConnectionConfig(ConnectionConfig):
             SourceConnectionError: If the GitLab API connection fails.
            gitlab.exceptions.GitlabGetError: If the project is not found.
        """
-        gitlab = self.get_client()
+        with self.get_client() as client:
+            logger.info(f"Accessing Project: '{self.repo_path}'")
+            project = client.projects.get(self.repo_path)
 
-        logger.info(f"Accessing Project: '{self.repo_path}'")
-        project = gitlab.projects.get(self.repo_path)
-
-        logger.info(f"Successfully accessed project '{self.repo_path}'")
-        return project
+            logger.info(f"Successfully accessed project '{self.repo_path}'")
+            yield project
 
 
 class GitLabIndexerConfig(IndexerConfig):
@@ -144,11 +146,11 @@ class GitLabIndexer(Indexer):
         """
 
         try:
-            gitlab = self.connection_config.get_client()
-            if self.connection_config.access_config.get_secret_value().access_token is not None:
-                gitlab.auth()
-            else:
-                gitlab.projects.get(self.connection_config.repo_path)
+            with self.connection_config.get_client() as client:
+                if self.connection_config.access_config.get_secret_value().access_token is not None:
+                    client.auth()
+                else:
+                    client.projects.get(self.connection_config.repo_path)
 
         except Exception as e:
             logger.error(f"Failed to validate connection: {e}", exc_info=True)
@@ -168,17 +170,16 @@ class GitLabIndexer(Indexer):
             FileData: A generator that yields `FileData` objects representing each file (blob)
             in the repository.
         """
-        project = self.connection_config.get_project()
-
-        ref = self.index_config.git_branch or project.default_branch
-
-        files = project.repository_tree(
-            path=str(self.index_config.path),
-            ref=ref,
-            recursive=self.index_config.recursive,
-            iterator=True,
-            all=True,
-        )
+        with self.connection_config.get_project() as project:
+            ref = self.index_config.git_branch or project.default_branch
+
+            files = project.repository_tree(
+                path=str(self.index_config.path),
+                ref=ref,
+                recursive=self.index_config.recursive,
+                iterator=True,
+                all=True,
+            )
 
         for file in files:
             relative_path = str(Path(file["path"]).relative_to(self.index_config.path))
@@ -250,12 +251,12 @@ class GitLabDownloader(Downloader):
 
         ref = file_data.metadata.record_locator["ref"]
         path = file_data.metadata.record_locator["file_path"]
-
-        project_file = self.connection_config.get_project().files.get(file_path=path, ref=ref)
         download_path.parent.mkdir(exist_ok=True, parents=True)
 
-        with open(download_path, "wb") as file:
-            file.write(project_file.decode())
+        with self.connection_config.get_project() as project:
+            project_file = project.files.get(file_path=path, ref=ref)
+            with open(download_path, "wb") as file:
+                file.write(project_file.decode())
 
 
 gitlab_source_entry = SourceRegistryEntry(
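
With the change above, callers consume the GitLab helpers as context managers. A rough usage sketch: config is assumed to be an already-constructed GitLabConnectionConfig (see the diff above); the repository_tree parameters are the same ones the indexer passes.

# Illustrative only: `config` is assumed to be a configured GitLabConnectionConfig.
with config.get_project() as project:
    files = project.repository_tree(
        ref=project.default_branch,
        recursive=True,
        iterator=True,
        all=True,
    )
    for entry in files:
        print(entry["path"])
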
unstructured_ingest/v2/processes/connectors/google_drive.py
@@ -1,5 +1,6 @@
 import io
 import json
+from contextlib import contextmanager
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Annotated, Any, Generator, Optional
@@ -74,7 +75,8 @@ class GoogleDriveConnectionConfig(ConnectionConfig):
     access_config: Secret[GoogleDriveAccessConfig]
 
     @requires_dependencies(["googleapiclient"], extras="google-drive")
-    def get_files_service(self) -> "GoogleAPIResource":
+    @contextmanager
+    def get_client(self) -> Generator["GoogleAPIResource", None, None]:
         from google.auth import exceptions
         from google.oauth2 import service_account
         from googleapiclient.discovery import build
@@ -86,8 +88,8 @@ class GoogleDriveConnectionConfig(ConnectionConfig):
         try:
             creds = service_account.Credentials.from_service_account_info(key_data)
             service = build("drive", "v3", credentials=creds)
-            return service.files()
-
+            with service.files() as client:
+                yield client
         except HttpError as exc:
             raise ValueError(f"{exc.reason}")
         except exceptions.DefaultCredentialsError:
@@ -132,7 +134,7 @@ class GoogleDriveIndexer(Indexer):
 
     def precheck(self) -> None:
         try:
-            self.connection_config.get_files_service()
+            self.connection_config.get_client()
         except Exception as e:
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise SourceConnectionError(f"failed to validate connection: {e}")
@@ -266,13 +268,14 @@ class GoogleDriveIndexer(Indexer):
         return data
 
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
-        for f in self.get_files(
-            files_client=self.connection_config.get_files_service(),
-            object_id=self.connection_config.drive_id,
-            recursive=self.index_config.recursive,
-            extensions=self.index_config.extensions,
-        ):
-            yield f
+        with self.connection_config.get_client() as client:
+            for f in self.get_files(
+                files_client=client,
+                object_id=self.connection_config.drive_id,
+                recursive=self.index_config.recursive,
+                extensions=self.index_config.extensions,
+            ):
+                yield f
 
 
 class GoogleDriveDownloaderConfig(DownloaderConfig):
@@ -309,30 +312,30 @@ class GoogleDriveDownloader(Downloader):
         logger.debug(f"fetching file: {file_data.source_identifiers.fullpath}")
         mime_type = file_data.additional_metadata["mimeType"]
         record_id = file_data.identifier
-        files_client = self.connection_config.get_files_service()
-        if mime_type.startswith("application/vnd.google-apps"):
-            export_mime = GOOGLE_DRIVE_EXPORT_TYPES.get(
-                self.meta.get("mimeType"),  # type: ignore
-            )
-            if not export_mime:
-                raise TypeError(
-                    f"File not supported. Name: {file_data.source_identifiers.filename} "
-                    f"ID: {record_id} "
-                    f"MimeType: {mime_type}"
+        with self.connection_config.get_client() as client:
+            if mime_type.startswith("application/vnd.google-apps"):
+                export_mime = GOOGLE_DRIVE_EXPORT_TYPES.get(
+                    self.meta.get("mimeType"),  # type: ignore
                 )
-
-            request = files_client.export_media(
-                fileId=record_id,
-                mimeType=export_mime,
-            )
-        else:
-            request = files_client.get_media(fileId=record_id)
+                if not export_mime:
+                    raise TypeError(
+                        f"File not supported. Name: {file_data.source_identifiers.filename} "
+                        f"ID: {record_id} "
+                        f"MimeType: {mime_type}"
+                    )
+
+                request = client.export_media(
+                    fileId=record_id,
+                    mimeType=export_mime,
+                )
+            else:
+                request = client.get_media(fileId=record_id)
 
         file_contents = io.BytesIO()
         downloader = MediaIoBaseDownload(file_contents, request)
         downloaded = self._get_content(downloader=downloader)
         if not downloaded or not file_contents:
-            return []
+            raise SourceConnectionError("nothing found to download")
         return self._write_file(file_data=file_data, file_contents=file_contents)
 
 
unstructured_ingest/v2/processes/connectors/kafka/kafka.py
@@ -257,8 +257,6 @@ class KafkaUploader(Uploader, ABC):
         if failed_producer:
             raise KafkaException("failed to produce all messages in batch")
 
-    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        with path.open("r") as elements_file:
-            elements = json.load(elements_file)
-        for element_batch in batch_generator(elements, batch_size=self.upload_config.batch_size):
+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        for element_batch in batch_generator(data, batch_size=self.upload_config.batch_size):
             self.produce_batch(elements=element_batch)
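
The Kafka uploader is one of several destinations that drop the path-based run() in favor of run_data(), which receives the partitioned elements as a list of dicts and batches them with batch_generator before producing. The real batch_generator lives in the project's data-prep utilities and is not shown in this diff; a hedged sketch of equivalent batching behavior (names suffixed _sketch to make clear they are illustrative):

from typing import Any, Generator


def batch_generator_sketch(
    items: list[dict[str, Any]], batch_size: int
) -> Generator[list[dict[str, Any]], None, None]:
    # Yield successive slices of at most batch_size items, mirroring how the
    # uploader above groups elements before producing one Kafka message each.
    for start in range(0, len(items), batch_size):
        yield items[start : start + batch_size]


assert list(batch_generator_sketch([{"n": i} for i in range(5)], batch_size=2)) == [
    [{"n": 0}, {"n": 1}],
    [{"n": 2}, {"n": 3}],
    [{"n": 4}],
]
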
unstructured_ingest/v2/processes/connectors/kdbai.py
@@ -1,14 +1,13 @@
-import json
+from contextlib import contextmanager
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Any, Generator, Optional
 
-import numpy as np
 import pandas as pd
 from pydantic import Field, Secret
 
 from unstructured_ingest.error import DestinationConnectionError
-from unstructured_ingest.utils.data_prep import flatten_dict
+from unstructured_ingest.utils.data_prep import flatten_dict, get_data_df, split_dataframe
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
@@ -48,12 +47,19 @@ class KdbaiConnectionConfig(ConnectionConfig):
     )
 
     @requires_dependencies(["kdbai_client"], extras="kdbai")
-    def get_session(self) -> "Session":
+    @contextmanager
+    def get_client(self) -> Generator["Session", None, None]:
         from kdbai_client import Session
 
-        return Session(
-            api_key=self.access_config.get_secret_value().api_key, endpoint=self.endpoint
-        )
+        session = None
+        try:
+            session = Session(
+                api_key=self.access_config.get_secret_value().api_key, endpoint=self.endpoint
+            )
+            yield session
+        finally:
+            if session:
+                session.close()
 
 
 class KdbaiUploadStagerConfig(UploadStagerConfig):
@@ -64,38 +70,19 @@ class KdbaiUploadStagerConfig(UploadStagerConfig):
 
 class KdbaiUploadStager(UploadStager):
     upload_stager_config: KdbaiUploadStagerConfig = field(default_factory=KdbaiUploadStagerConfig)
-    def run(
-        self,
-        elements_filepath: Path,
-        file_data: FileData,
-        output_dir: Path,
-        output_filename: str,
-        **kwargs: Any,
-    ) -> Path:
-        with open(elements_filepath) as elements_file:
-            elements_contents = json.load(elements_file)
-        output_path = Path(output_dir) / Path(f"{output_filename}.json")
-        output_path.parent.mkdir(parents=True, exist_ok=True)
-
-        data = []
-        for element in elements_contents:
-            data.append(
-                {
-                    "id": get_enhanced_element_id(element_dict=element, file_data=file_data),
-                    "element_id": element.get("element_id"),
-                    "document": element.pop("text", None),
-                    "embeddings": element.get("embeddings"),
-                    "metadata": flatten_dict(
-                        dictionary=element.get("metadata"),
-                        flatten_lists=True,
-                        remove_none=True,
-                    ),
-                }
-            )
-        logger.debug(f"writing {len(data)} elements to {output_path}")
-        with output_path.open("w") as output_file:
-            json.dump(data, output_file, indent=2)
-        return output_path
+    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
+        data = element_dict.copy()
+        return {
+            "id": get_enhanced_element_id(element_dict=data, file_data=file_data),
+            "element_id": data.get("element_id"),
+            "document": data.pop("text", None),
+            "embeddings": data.get("embeddings"),
+            "metadata": flatten_dict(
+                dictionary=data.get("metadata"),
+                flatten_lists=True,
+                remove_none=True,
+            ),
+        }
 
 
 class KdbaiUploaderConfig(UploaderConfig):
@@ -119,50 +106,37 @@ class KdbaiUploader(Uploader):
            logger.error(f"Failed to validate connection {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}")
 
-    def get_database(self) -> "Database":
-        session: Session = self.connection_config.get_session()
-        db = session.database(self.upload_config.database_name)
-        return db
+    @contextmanager
+    def get_database(self) -> Generator["Database", None, None]:
+        with self.connection_config.get_client() as client:
+            db = client.database(self.upload_config.database_name)
+            yield db
 
-    def get_table(self) -> "Table":
-        db = self.get_database()
-        table = db.table(self.upload_config.table_name)
-        return table
+    @contextmanager
+    def get_table(self) -> Generator["Table", None, None]:
+        with self.get_database() as db:
+            table = db.table(self.upload_config.table_name)
+            yield table
 
     def upsert_batch(self, batch: pd.DataFrame):
-        table = self.get_table()
-        table.insert(batch)
+        with self.get_table() as table:
+            table.insert(batch)
 
     def process_dataframe(self, df: pd.DataFrame):
         logger.debug(
             f"uploading {len(df)} entries to {self.connection_config.endpoint} "
             f"db {self.upload_config.database_name} in table {self.upload_config.table_name}"
         )
-        for _, batch_df in df.groupby(np.arange(len(df)) // self.upload_config.batch_size):
+        for batch_df in split_dataframe(df=df, chunk_size=self.upload_config.batch_size):
            self.upsert_batch(batch=batch_df)
 
-    def process_csv(self, csv_paths: list[Path]):
-        logger.debug(f"uploading content from {len(csv_paths)} csv files")
-        df = pd.concat((pd.read_csv(path) for path in csv_paths), ignore_index=True)
-        self.process_dataframe(df=df)
-
-    def process_json(self, json_paths: list[Path]):
-        logger.debug(f"uploading content from {len(json_paths)} json files")
-        all_records = []
-        for p in json_paths:
-            with open(p) as json_file:
-                all_records.extend(json.load(json_file))
-
-        df = pd.DataFrame(data=all_records)
+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        df = pd.DataFrame(data=data)
         self.process_dataframe(df=df)
 
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        if path.suffix == ".csv":
-            self.process_csv(csv_paths=[path])
-        elif path.suffix == ".json":
-            self.process_json(json_paths=[path])
-        else:
-            raise ValueError(f"Unsupported file type, must be json or csv file: {path}")
+        data = get_data_df(path=path)
+        self.process_dataframe(df=data)
 
 
 kdbai_destination_entry = DestinationRegistryEntry(
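
The kdbai uploader now delegates batching to split_dataframe and staged-file loading to get_data_df, both imported from unstructured_ingest.utils.data_prep (the +36-line change listed above); their implementations are not shown in this diff. A hedged sketch of batching behavior equivalent to the removed np.arange groupby idiom, under the assumption that split_dataframe simply yields consecutive row slices:

from typing import Generator

import pandas as pd


def split_dataframe_sketch(
    df: pd.DataFrame, chunk_size: int = 100
) -> Generator[pd.DataFrame, None, None]:
    # Equivalent to the removed `df.groupby(np.arange(len(df)) // batch_size)` idiom:
    # yield consecutive row slices of at most chunk_size rows each.
    for start in range(0, len(df), chunk_size):
        yield df.iloc[start : start + chunk_size]


frame = pd.DataFrame({"id": range(5)})
sizes = [len(chunk) for chunk in split_dataframe_sketch(frame, chunk_size=2)]
assert sizes == [2, 2, 1]
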
unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py
@@ -41,14 +41,11 @@ class LanceDBConnectionConfig(ConnectionConfig, ABC):
     async def get_async_connection(self) -> AsyncGenerator["AsyncConnection", None]:
         import lancedb
 
-        connection = await lancedb.connect_async(
+        with await lancedb.connect_async(
             self.uri,
             storage_options=self.get_storage_options(),
-        )
-        try:
+        ) as connection:
             yield connection
-        finally:
-            connection.close()
 
 
 class LanceDBRemoteConnectionConfig(LanceDBConnectionConfig):
@@ -85,8 +82,8 @@ class LanceDBUploadStager(UploadStager):
 
         df = pd.DataFrame(
             [
-                self._conform_element_contents(element_contents, file_data)
-                for element_contents in elements_contents
+                self.conform_dict(element_dict=element_dict, file_data=file_data)
+                for element_dict in elements_contents
             ]
         )
 
@@ -95,11 +92,12 @@
 
         return output_path
 
-    def _conform_element_contents(self, element: dict, file_data: FileData) -> dict:
+    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
+        data = element_dict.copy()
         return {
-            "vector": element.pop("embeddings", None),
+            "vector": data.pop("embeddings", None),
             RECORD_ID_LABEL: file_data.identifier,
-            **flatten_dict(element, separator="-"),
+            **flatten_dict(data, separator="-"),
         }
 
 
unstructured_ingest/v2/processes/connectors/local.py
@@ -1,4 +1,5 @@
 import glob
+import json
 import shutil
 from dataclasses import dataclass, field
 from pathlib import Path
@@ -175,7 +176,7 @@ class LocalUploader(Uploader):
     def is_async(self) -> bool:
         return False
 
-    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+    def get_destination_path(self, file_data: FileData) -> Path:
         if source_identifiers := file_data.source_identifiers:
             rel_path = (
                 source_identifiers.relative_path[1:]
@@ -188,7 +189,17 @@ class LocalUploader(Uploader):
             )
         else:
             final_path = self.upload_config.output_path / Path(f"{file_data.identifier}.json")
-        Path(final_path).parent.mkdir(parents=True, exist_ok=True)
+        final_path = Path(final_path)
+        final_path.parent.mkdir(parents=True, exist_ok=True)
+        return final_path
+
+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        final_path = self.get_destination_path(file_data=file_data)
+        with final_path.open("w") as f:
+            json.dump(data, f)
+
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        final_path = self.get_destination_path(file_data=file_data)
         logger.debug(f"copying file from {path} to {final_path}")
         shutil.copy(src=str(path), dst=str(final_path))
 
unstructured_ingest/v2/processes/connectors/milvus.py
@@ -1,10 +1,8 @@
 import json
 from contextlib import contextmanager
 from dataclasses import dataclass, field
-from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional, Union
 
-import pandas as pd
 from dateutil import parser
 from pydantic import Field, Secret
 
@@ -16,7 +14,6 @@ from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
     FileData,
-    UploadContent,
     Uploader,
     UploaderConfig,
     UploadStager,
@@ -59,10 +56,17 @@ class MilvusConnectionConfig(ConnectionConfig):
         return connection_config_dict
 
     @requires_dependencies(["pymilvus"], extras="milvus")
-    def get_client(self) -> "MilvusClient":
+    @contextmanager
+    def get_client(self) -> Generator["MilvusClient", None, None]:
         from pymilvus import MilvusClient
 
-        return MilvusClient(**self.get_connection_kwargs())
+        client = None
+        try:
+            client = MilvusClient(**self.get_connection_kwargs())
+            yield client
+        finally:
+            if client:
+                client.close()
 
 
 class MilvusUploadStagerConfig(UploadStagerConfig):
@@ -91,8 +95,8 @@ class MilvusUploadStager(UploadStager):
                 pass
         return parser.parse(date_string).timestamp()
 
-    def conform_dict(self, data: dict, file_data: FileData) -> dict:
-        working_data = data.copy()
+    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
+        working_data = element_dict.copy()
         if self.upload_stager_config.flatten_metadata and (
             metadata := working_data.pop("metadata", None)
         ):
@@ -134,29 +138,6 @@ class MilvusUploadStager(UploadStager):
         working_data[RECORD_ID_LABEL] = file_data.identifier
         return working_data
 
-    def run(
-        self,
-        elements_filepath: Path,
-        file_data: FileData,
-        output_dir: Path,
-        output_filename: str,
-        **kwargs: Any,
-    ) -> Path:
-        with open(elements_filepath) as elements_file:
-            elements_contents: list[dict[str, Any]] = json.load(elements_file)
-        new_content = [
-            self.conform_dict(data=element, file_data=file_data) for element in elements_contents
-        ]
-        output_filename_path = Path(output_filename)
-        if output_filename_path.suffix == ".json":
-            output_path = Path(output_dir) / output_filename_path
-        else:
-            output_path = Path(output_dir) / output_filename_path.with_suffix(".json")
-        output_path.parent.mkdir(parents=True, exist_ok=True)
-        with output_path.open("w") as output_file:
-            json.dump(new_content, output_file, indent=2)
-        return output_path
-
 
 class MilvusUploaderConfig(UploaderConfig):
     db_name: Optional[str] = Field(default=None, description="Milvus database name")
@@ -183,22 +164,10 @@ class MilvusUploader(Uploader):
 
     @contextmanager
     def get_client(self) -> Generator["MilvusClient", None, None]:
-        client = self.connection_config.get_client()
-        if db_name := self.upload_config.db_name:
-            client.using_database(db_name=db_name)
-        try:
+        with self.connection_config.get_client() as client:
+            if db_name := self.upload_config.db_name:
+                client.using_database(db_name=db_name)
             yield client
-        finally:
-            client.close()
-
-    def upload(self, content: UploadContent) -> None:
-        file_extension = content.path.suffix
-        if file_extension == ".json":
-            self.upload_json(content=content)
-        elif file_extension == ".csv":
-            self.upload_csv(content=content)
-        else:
-            raise ValueError(f"Unsupported file extension: {file_extension}")
 
     def delete_by_record_id(self, file_data: FileData) -> None:
         logger.info(
@@ -233,19 +202,9 @@ class MilvusUploader(Uploader):
             err_count = res["err_count"]
             raise WriteError(f"failed to upload {err_count} docs")
 
-    def upload_csv(self, content: UploadContent) -> None:
-        df = pd.read_csv(content.path)
-        data = df.to_dict(orient="records")
-        self.insert_results(data=data)
-
-    def upload_json(self, content: UploadContent) -> None:
-        with content.path.open("r") as file:
-            data: list[dict] = json.load(file)
-        self.insert_results(data=data)
-
-    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
         self.delete_by_record_id(file_data=file_data)
-        self.upload(content=UploadContent(path=path, file_data=file_data))
+        self.insert_results(data=data)
 
 
 milvus_destination_entry = DestinationRegistryEntry(
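
As with Kafka and kdbai, the Milvus uploader drops its CSV/JSON file handling in favor of run_data, and it first deletes any rows previously written for the same record before inserting, so re-running a pipeline does not duplicate data. A rough usage sketch; uploader, file_data, and the element dict are assumptions for illustration only:

# Illustrative only: `uploader` is assumed to be a configured MilvusUploader and
# `file_data` the FileData object for the document being (re-)uploaded. The element
# dicts would normally come from MilvusUploadStager.conform_dict; the content here
# is made up. run_data deletes rows matching file_data.identifier, then inserts.
elements = [{"element_id": "e1", "text": "hello world"}]
uploader.run_data(data=elements, file_data=file_data)
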