unstructured-ingest 0.0.24__py3-none-any.whl → 0.0.25__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

--- a/unstructured_ingest/__version__.py
+++ b/unstructured_ingest/__version__.py
@@ -1 +1 @@
- __version__ = "0.0.24" # pragma: no cover
+ __version__ = "0.0.25" # pragma: no cover
--- a/unstructured_ingest/utils/dep_check.py
+++ b/unstructured_ingest/utils/dep_check.py
@@ -20,6 +20,18 @@ def requires_dependencies(
      dependencies: str | list[str],
      extras: Optional[str] = None,
  ) -> Callable[[Callable[_P, _T]], Callable[_P, _T]]:
+     """Decorator ensuring required modules are installed.
+
+     Use on functions with local imports to ensure required modules are available and log
+     an installation instruction if they're not.
+
+     Args:
+         dependencies: Name(s) of module(s) required by the decorated function.
+         extras: unstructured-ingest extra which installs required `dependencies`. Defaults to None.
+
+     Raises:
+         ImportError: When at least one of the `dependencies` is not available.
+     """
      if isinstance(dependencies, str):
          dependencies = [dependencies]

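Note: the docstring above documents the decorator's contract. A rough illustration of typical use (the `load_table` function and the "csv" extra below are hypothetical, not part of this package):

    from unstructured_ingest.utils.dep_check import requires_dependencies

    @requires_dependencies(dependencies=["pandas"], extras="csv")
    def load_table(path: str):
        # Local import guarded by the decorator: if pandas is missing, the
        # decorator logs an instruction to install the "csv" extra and
        # raises ImportError before this import would fail.
        import pandas as pd

        return pd.read_csv(path)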
--- a/unstructured_ingest/v2/processes/connectors/__init__.py
+++ b/unstructured_ingest/v2/processes/connectors/__init__.py
@@ -17,7 +17,10 @@ from .chroma import chroma_destination_entry
  from .couchbase import CONNECTOR_TYPE as COUCHBASE_CONNECTOR_TYPE
  from .couchbase import couchbase_destination_entry, couchbase_source_entry
  from .databricks_volumes import CONNECTOR_TYPE as DATABRICKS_VOLUMES_CONNECTOR_TYPE
- from .databricks_volumes import databricks_volumes_destination_entry
+ from .databricks_volumes import (
+     databricks_volumes_destination_entry,
+     databricks_volumes_source_entry,
+ )
  from .elasticsearch import CONNECTOR_TYPE as ELASTICSEARCH_CONNECTOR_TYPE
  from .elasticsearch import elasticsearch_destination_entry, elasticsearch_source_entry
  from .google_drive import CONNECTOR_TYPE as GOOGLE_DRIVE_CONNECTOR_TYPE
@@ -34,6 +37,8 @@ from .onedrive import CONNECTOR_TYPE as ONEDRIVE_CONNECTOR_TYPE
  from .onedrive import onedrive_source_entry
  from .opensearch import CONNECTOR_TYPE as OPENSEARCH_CONNECTOR_TYPE
  from .opensearch import opensearch_destination_entry, opensearch_source_entry
+ from .outlook import CONNECTOR_TYPE as OUTLOOK_CONNECTOR_TYPE
+ from .outlook import outlook_source_entry
  from .pinecone import CONNECTOR_TYPE as PINECONE_CONNECTOR_TYPE
  from .pinecone import pinecone_destination_entry
  from .salesforce import CONNECTOR_TYPE as SALESFORCE_CONNECTOR_TYPE
@@ -78,6 +83,10 @@ add_destination_entry(destination_type=WEAVIATE_CONNECTOR_TYPE, entry=weaviate_d
  add_destination_entry(
      destination_type=DATABRICKS_VOLUMES_CONNECTOR_TYPE, entry=databricks_volumes_destination_entry
  )
+ add_source_entry(
+     source_type=DATABRICKS_VOLUMES_CONNECTOR_TYPE, entry=databricks_volumes_source_entry
+ )
+

  add_destination_entry(destination_type=SQL_CONNECTOR_TYPE, entry=sql_destination_entry)

@@ -95,3 +104,5 @@ add_destination_entry(

  add_destination_entry(destination_type=KDBAI_CONNECTOR_TYPE, entry=kdbai_destination_entry)
  add_source_entry(source_type=AIRTABLE_CONNECTOR_TYPE, entry=airtable_source_entry)
+
+ add_source_entry(source_type=OUTLOOK_CONNECTOR_TYPE, entry=outlook_source_entry)
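Note: with these registrations, Databricks Volumes can now act as a source as well as a destination, and Outlook becomes available as a source. A minimal sketch of the registration pattern these hunks follow (the `My*` classes and "my-connector" type are hypothetical; `add_source_entry` is assumed to be importable from the connector registry module used above):

    from unstructured_ingest.v2.processes.connector_registry import (
        SourceRegistryEntry,
        add_source_entry,
    )

    # Hypothetical connector pieces; a real entry bundles an indexer, a
    # downloader, their configs, and a connection config.
    my_source_entry = SourceRegistryEntry(
        connection_config=MyConnectionConfig,
        indexer=MyIndexer,
        indexer_config=MyIndexerConfig,
        downloader=MyDownloader,
        downloader_config=MyDownloaderConfig,
    )
    add_source_entry(source_type="my-connector", entry=my_source_entry)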
--- a/unstructured_ingest/v2/processes/connectors/databricks_volumes.py
+++ b/unstructured_ingest/v2/processes/connectors/databricks_volumes.py
@@ -1,21 +1,35 @@
  import os
  from dataclasses import dataclass
  from pathlib import Path
- from typing import TYPE_CHECKING, Any, Optional
+ from typing import TYPE_CHECKING, Any, Generator, Optional

  from pydantic import Field, Secret

- from unstructured_ingest.error import DestinationConnectionError
+ from unstructured_ingest.error import (
+     DestinationConnectionError,
+     SourceConnectionError,
+     SourceConnectionNetworkError,
+ )
  from unstructured_ingest.utils.dep_check import requires_dependencies
  from unstructured_ingest.v2.interfaces import (
      AccessConfig,
      ConnectionConfig,
+     Downloader,
+     DownloaderConfig,
+     DownloadResponse,
      FileData,
+     FileDataSourceMetadata,
+     Indexer,
+     IndexerConfig,
+     SourceIdentifiers,
      Uploader,
      UploaderConfig,
  )
  from unstructured_ingest.v2.logger import logger
- from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
+ from unstructured_ingest.v2.processes.connector_registry import (
+     DestinationRegistryEntry,
+     SourceRegistryEntry,
+ )

  if TYPE_CHECKING:
      from databricks.sdk import WorkspaceClient
@@ -32,16 +46,6 @@ class DatabricksVolumesAccessConfig(AccessConfig):
          "https://accounts.azuredatabricks.net/ (Azure), "
          "or https://accounts.gcp.databricks.com/ (GCP).",
      )
-     username: Optional[str] = Field(
-         default=None,
-         description="The Databricks username part of basic authentication. "
-         "Only possible when Host is *.cloud.databricks.com (AWS).",
-     )
-     password: Optional[str] = Field(
-         default=None,
-         description="The Databricks password part of basic authentication. "
-         "Only possible when Host is *.cloud.databricks.com (AWS).",
-     )
      client_id: Optional[str] = Field(default=None, description="Client ID of the OAuth app.")
      client_secret: Optional[str] = Field(
          default=None, description="Client Secret of the OAuth app."
@@ -78,7 +82,6 @@ class DatabricksVolumesAccessConfig(AccessConfig):
          "argument. This argument also holds the currently "
          "selected auth.",
      )
-     cluster_id: Optional[str] = None
      google_credentials: Optional[str] = None
      google_service_account: Optional[str] = None

@@ -93,17 +96,11 @@ class DatabricksVolumesConnectionConfig(ConnectionConfig):
          "Databricks workspace endpoint or the "
          "Databricks accounts endpoint.",
      )
-
-
- class DatabricksVolumesUploaderConfig(UploaderConfig):
      volume: str = Field(description="Name of volume in the Unity Catalog")
      catalog: str = Field(description="Name of the catalog in the Databricks Unity Catalog service")
      volume_path: Optional[str] = Field(
          default=None, description="Optional path within the volume to write to"
      )
-     overwrite: bool = Field(
-         default=False, description="If true, an existing file will be overwritten."
-     )
      databricks_schema: str = Field(
          default="default",
          alias="schema",
@@ -117,33 +114,121 @@ class DatabricksVolumesUploaderConfig(UploaderConfig):
              path = f"{path}/{self.volume_path}"
          return path

-
- @dataclass
- class DatabricksVolumesUploader(Uploader):
-     connector_type: str = CONNECTOR_TYPE
-     upload_config: DatabricksVolumesUploaderConfig
-     connection_config: DatabricksVolumesConnectionConfig
-
      @requires_dependencies(dependencies=["databricks.sdk"], extras="databricks-volumes")
      def get_client(self) -> "WorkspaceClient":
          from databricks.sdk import WorkspaceClient

          return WorkspaceClient(
-             host=self.connection_config.host,
-             **self.connection_config.access_config.get_secret_value().model_dump(),
+             host=self.host,
+             **self.access_config.get_secret_value().model_dump(),
          )

+
+ @dataclass
+ class DatabricksVolumesIndexerConfig(IndexerConfig):
+     recursive: bool = False
+
+
+ @dataclass
+ class DatabricksVolumesIndexer(Indexer):
+     index_config: DatabricksVolumesIndexerConfig
+     connection_config: DatabricksVolumesConnectionConfig
+     connector_type: str = CONNECTOR_TYPE
+
      def precheck(self) -> None:
          try:
-             assert self.get_client().current_user.me().active
+             self.connection_config.get_client()
+         except Exception as e:
+             logger.error(f"failed to validate connection: {e}", exc_info=True)
+             raise SourceConnectionError(f"failed to validate connection: {e}")
+
+     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+         for file_info in self.connection_config.get_client().dbfs.list(
+             path=self.connection_config.path, recursive=self.index_config.recursive
+         ):
+             if file_info.is_dir:
+                 continue
+             rel_path = file_info.path.replace(self.connection_config.path, "")
+             if rel_path.startswith("/"):
+                 rel_path = rel_path[1:]
+             filename = Path(file_info.path).name
+             yield FileData(
+                 identifier=file_info.path,
+                 connector_type=CONNECTOR_TYPE,
+                 source_identifiers=SourceIdentifiers(
+                     filename=filename,
+                     rel_path=rel_path,
+                     fullpath=file_info.path,
+                 ),
+                 additional_metadata={
+                     "catalog": self.connection_config.catalog,
+                 },
+                 metadata=FileDataSourceMetadata(
+                     url=file_info.path, date_modified=str(file_info.modification_time)
+                 ),
+             )
+
+
+ @dataclass
+ class DatabricksVolumesDownloaderConfig(DownloaderConfig):
+     pass
+
+
+ @dataclass
+ class DatabricksVolumesDownloader(Downloader):
+     download_config: DatabricksVolumesDownloaderConfig
+     connection_config: DatabricksVolumesConnectionConfig
+     connector_type: str = CONNECTOR_TYPE
+
+     def precheck(self) -> None:
+         try:
+             self.connection_config.get_client()
+         except Exception as e:
+             logger.error(f"failed to validate connection: {e}", exc_info=True)
+             raise SourceConnectionError(f"failed to validate connection: {e}")
+
+     def get_download_path(self, file_data: FileData) -> Path:
+         return self.download_config.download_dir / Path(file_data.source_identifiers.relative_path)
+
+     def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
+         download_path = self.get_download_path(file_data=file_data)
+         download_path.parent.mkdir(parents=True, exist_ok=True)
+         logger.info(f"Writing {file_data.identifier} to {download_path}")
+         try:
+             with self.connection_config.get_client().dbfs.download(path=file_data.identifier) as c:
+                 read_content = c._read_handle.read()
+             with open(download_path, "wb") as f:
+                 f.write(read_content)
+         except Exception as e:
+             logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
+             raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
+
+         return self.generate_download_response(file_data=file_data, download_path=download_path)
+
+
+ class DatabricksVolumesUploaderConfig(UploaderConfig):
+     overwrite: bool = Field(
+         default=False, description="If true, an existing file will be overwritten."
+     )
+
+
+ @dataclass
+ class DatabricksVolumesUploader(Uploader):
+     upload_config: DatabricksVolumesUploaderConfig
+     connection_config: DatabricksVolumesConnectionConfig
+     connector_type: str = CONNECTOR_TYPE
+
+     def precheck(self) -> None:
+         try:
+             assert self.connection_config.get_client().current_user.me().active
          except Exception as e:
              logger.error(f"failed to validate connection: {e}", exc_info=True)
              raise DestinationConnectionError(f"failed to validate connection: {e}")

      def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-         output_path = os.path.join(self.upload_config.path, path.name)
+         output_path = os.path.join(self.connection_config.path, path.name)
          with open(path, "rb") as elements_file:
-             self.get_client().files.upload(
+             self.connection_config.get_client().files.upload(
                  file_path=output_path,
                  contents=elements_file,
                  overwrite=self.upload_config.overwrite,
@@ -155,3 +240,11 @@ databricks_volumes_destination_entry = DestinationRegistryEntry(
      uploader=DatabricksVolumesUploader,
      uploader_config=DatabricksVolumesUploaderConfig,
  )
+
+ databricks_volumes_source_entry = SourceRegistryEntry(
+     connection_config=DatabricksVolumesConnectionConfig,
+     indexer=DatabricksVolumesIndexer,
+     indexer_config=DatabricksVolumesIndexerConfig,
+     downloader=DatabricksVolumesDownloader,
+     downloader_config=DatabricksVolumesDownloaderConfig,
+ )
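Note: with the volume/catalog fields consolidated onto the connection config, indexing, downloading, and uploading all share one client. A rough usage sketch, assuming direct instantiation outside the ingest pipeline (host, catalog, volume, and credential values are placeholders):

    from unstructured_ingest.v2.processes.connectors.databricks_volumes import (
        DatabricksVolumesAccessConfig,
        DatabricksVolumesConnectionConfig,
        DatabricksVolumesIndexer,
        DatabricksVolumesIndexerConfig,
    )

    connection_config = DatabricksVolumesConnectionConfig(
        host="https://example.cloud.databricks.com",  # placeholder workspace URL
        catalog="main",    # placeholder Unity Catalog name
        volume="landing",  # placeholder volume name
        access_config=DatabricksVolumesAccessConfig(
            client_id="...",  # placeholder OAuth app credentials
            client_secret="...",
        ),
    )
    indexer = DatabricksVolumesIndexer(
        connection_config=connection_config,
        index_config=DatabricksVolumesIndexerConfig(recursive=True),
    )
    for file_data in indexer.run():
        print(file_data.identifier)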
--- /dev/null
+++ b/unstructured_ingest/v2/processes/connectors/outlook.py
@@ -0,0 +1,239 @@
+ import hashlib
+ import time
+ from dataclasses import dataclass, field
+ from datetime import timezone
+ from pathlib import Path
+ from typing import TYPE_CHECKING, Any, Coroutine, Generator
+
+ from pydantic import Field, Secret
+
+ from unstructured_ingest.error import SourceConnectionError
+ from unstructured_ingest.logger import logger
+ from unstructured_ingest.utils.dep_check import requires_dependencies
+ from unstructured_ingest.v2.interfaces import (
+     AccessConfig,
+     ConnectionConfig,
+     Downloader,
+     DownloaderConfig,
+     FileData,
+     Indexer,
+     IndexerConfig,
+     download_responses,
+ )
+ from unstructured_ingest.v2.interfaces.file_data import FileDataSourceMetadata, SourceIdentifiers
+ from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
+
+ MAX_EMAILS_PER_FOLDER = 1_000_000  # Maximum number of emails per folder
+
+ if TYPE_CHECKING:
+     from office365.graph_client import GraphClient
+     from office365.outlook.mail.folders.folder import MailFolder
+     from office365.outlook.mail.messages.message import Message
+
+
+ CONNECTOR_TYPE = "outlook"
+
+
+ class OutlookAccessConfig(AccessConfig):
+     client_credential: str = Field(description="Azure AD App client secret", alias="client_cred")
+
+
+ class OutlookConnectionConfig(ConnectionConfig):
+     access_config: Secret[OutlookAccessConfig]
+     client_id: str = Field(description="Azure AD App client ID")
+     tenant: str = Field(
+         default="common", description="ID or domain name associated with your Azure AD instance"
+     )
+     authority_url: str = Field(
+         default="https://login.microsoftonline.com",
+         description="Authentication token provider for Microsoft apps",
+     )
+
+     @requires_dependencies(["msal"], extras="outlook")
+     def _acquire_token(self):
+         """Acquire token via MSAL"""
+         from msal import ConfidentialClientApplication
+
+         # NOTE: It'd be nice to use `msal.authority.AuthorityBuilder` here paired with AZURE_PUBLIC
+         # constant as default in the future but they do not fit well with `authority_url` right now
+         authority_url = f"{self.authority_url.rstrip('/')}/{self.tenant}"
+         app = ConfidentialClientApplication(
+             authority=authority_url,
+             client_id=self.client_id,
+             client_credential=self.access_config.get_secret_value().client_credential,
+         )
+         token = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])
+         return token
+
+     @requires_dependencies(["office365"], extras="outlook")
+     @SourceConnectionError.wrap
+     def get_client(self) -> "GraphClient":
+         from office365.graph_client import GraphClient
+
+         return GraphClient(self._acquire_token)
+
+
+ class OutlookIndexerConfig(IndexerConfig):
+     outlook_folders: list[str] = Field(
+         description="Folders to download email messages from. Do not specify subfolders. "
+         "Use quotes if there are spaces in folder names."
+     )
+     recursive: bool = Field(
+         default=False,
+         description="Recursively download files in their respective folders otherwise stop at the"
+         " files in provided folder level.",
+     )
+     user_email: str = Field(description="Outlook email to download messages from.")
+
+
+ @dataclass
+ class OutlookIndexer(Indexer):
+     index_config: OutlookIndexerConfig
+     connection_config: OutlookConnectionConfig
+     connector_type: str = CONNECTOR_TYPE
+
+     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+         messages = self._list_messages(recursive=self.index_config.recursive)
+
+         for message in messages:
+             yield self._message_to_file_data(message)
+
+     def run_async(self, **kwargs: Any) -> Coroutine[Any, Any, Any]:
+         raise NotImplementedError
+
+     @SourceConnectionError.wrap
+     def precheck(self) -> None:
+         client = self.connection_config.get_client()
+         client.users[self.index_config.user_email].get().execute_query()
+
+     def is_async(self) -> bool:
+         return False
+
+     def _list_messages(self, recursive: bool) -> list["Message"]:
+         mail_folders = self._get_selected_root_folders()
+         messages = []
+
+         while mail_folders:
+             mail_folder = mail_folders.pop()
+             messages += list(mail_folder.messages.get().top(MAX_EMAILS_PER_FOLDER).execute_query())
+
+             if recursive:
+                 mail_folders += list(mail_folder.child_folders.get().execute_query())
+
+         return messages
+
+     def _get_selected_root_folders(self) -> list["MailFolder"]:
+         client_user = self.connection_config.get_client().users[self.index_config.user_email]
+         root_mail_folders = client_user.mail_folders.get().execute_query()
+
+         selected_names_normalized = [
+             folder_name.lower() for folder_name in self.index_config.outlook_folders
+         ]
+         selected_root_mail_folders = [
+             folder
+             for folder in root_mail_folders
+             if folder.display_name.lower() in selected_names_normalized
+         ]
+
+         if not selected_root_mail_folders:
+             logger.error(
+                 f"Root folders selected in configuration: {self.index_config.outlook_folders}"
+                 f"not found for user email {self.index_config.user_email}. Aborting."
+             )
+             raise ValueError("Root folders selected in configuration not found.")
+
+         return selected_root_mail_folders
+
+     def _message_to_file_data(self, message: "Message") -> FileData:
+         fullpath = self._generate_fullpath(message)
+
+         return FileData(
+             identifier=message.id,
+             connector_type=CONNECTOR_TYPE,
+             source_identifiers=SourceIdentifiers(filename=fullpath.name, fullpath=str(fullpath)),
+             metadata=FileDataSourceMetadata(
+                 url=message.resource_url,
+                 version=message.change_key,
+                 date_modified=str(
+                     message.last_modified_datetime.replace(tzinfo=timezone.utc).timestamp()
+                 ),
+                 date_created=str(message.created_datetime.replace(tzinfo=timezone.utc).timestamp()),
+                 date_processed=str(time.time()),
+                 record_locator={
+                     "message_id": message.id,
+                     "user_email": self.index_config.user_email,
+                 },
+             ),
+             additional_metadata={
+                 "sent_from": str(message.sent_from),
+                 "to_recipients": [str(recipient) for recipient in message.to_recipients],
+                 "bcc_recipients": [str(recipient) for recipient in message.to_recipients],
+                 "subject": message.subject,
+                 "conversation_id": message.conversation_id,
+                 "is_draft": message.is_draft,
+                 "is_read": message.is_read,
+                 "has_attachments": message.has_attachments,
+                 "importance": message.importance,
+             },
+         )
+
+     def _generate_fullpath(self, message: "Message") -> Path:
+         return Path(hashlib.sha256(message.id.encode("utf-8")).hexdigest()[:16] + ".eml")
+
+
+ class OutlookDownloaderConfig(DownloaderConfig):
+     pass
+
+
+ @dataclass
+ class OutlookDownloader(Downloader):
+     connector_type: str = CONNECTOR_TYPE
+     connection_config: OutlookConnectionConfig
+     download_config: OutlookDownloaderConfig = field(default_factory=OutlookDownloaderConfig)
+
+     def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
+         # NOTE: Indexer should provide source identifiers required to generate the download path
+         download_path = self.get_download_path(file_data)
+         if download_path is None:
+             logger.error(
+                 "Generated download path is None, source_identifiers might be missing"
+                 "from FileData."
+             )
+             raise ValueError("Generated invalid download path.")
+
+         self._download_message(file_data, download_path)
+         return self.generate_download_response(file_data, download_path)
+
+     def is_async(self) -> bool:
+         return False
+
+     def _download_message(self, file_data: FileData, download_path: Path) -> None:
+         # NOTE: Indexer should supply the record locator in metadata
+         if (
+             file_data.metadata.record_locator is None
+             or "user_email" not in file_data.metadata.record_locator
+             or "message_id" not in file_data.metadata.record_locator
+         ):
+             logger.error(
+                 f"Invalid record locator in metadata: {file_data.metadata.record_locator}."
+                 "Keys 'user_email' and 'message_id' must be present."
+             )
+             raise ValueError("Invalid record locator.")
+
+         user_email = file_data.metadata.record_locator["user_email"]
+         message_id = file_data.metadata.record_locator["message_id"]
+
+         message = self.connection_config.get_client().users[user_email].messages[message_id]
+         download_path.parent.mkdir(exist_ok=True, parents=True)
+
+         with open(download_path, "wb") as file:
+             message.download(file).execute_query()
+
+
+ outlook_source_entry = SourceRegistryEntry(
+     indexer=OutlookIndexer,
+     indexer_config=OutlookIndexerConfig,
+     downloader=OutlookDownloader,
+     downloader_config=OutlookDownloaderConfig,
+     connection_config=OutlookConnectionConfig,
+ )
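Note: the new connector acquires a client-credential token via MSAL and traverses mail folders through the Graph client, persisting each message as an .eml file named by a hash of its ID. A rough usage sketch (tenant, client, and mailbox values are placeholders):

    from unstructured_ingest.v2.processes.connectors.outlook import (
        OutlookAccessConfig,
        OutlookConnectionConfig,
        OutlookIndexer,
        OutlookIndexerConfig,
    )

    connection_config = OutlookConnectionConfig(
        client_id="00000000-0000-0000-0000-000000000000",  # placeholder app ID
        tenant="contoso.onmicrosoft.com",                  # placeholder tenant
        access_config=OutlookAccessConfig(client_cred="..."),  # placeholder secret
    )
    indexer = OutlookIndexer(
        connection_config=connection_config,
        index_config=OutlookIndexerConfig(
            user_email="user@example.com",  # placeholder mailbox
            outlook_folders=["Inbox"],
            recursive=True,
        ),
    )
    for file_data in indexer.run():
        print(file_data.source_identifiers.fullpath)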
--- a/unstructured_ingest/v2/processes/connectors/pinecone.py
+++ b/unstructured_ingest/v2/processes/connectors/pinecone.py
@@ -94,6 +94,10 @@ class PineconeUploaderConfig(UploaderConfig):
      pool_threads: Optional[int] = Field(
          default=1, description="Optional limit on number of threads to use for upload"
      )
+     namespace: Optional[str] = Field(
+         default=None,
+         description="The namespace to write to. If not specified, the default namespace is used",
+     )


  @dataclass
@@ -183,7 +187,11 @@ class PineconeUploader(Uploader):
              pool_threads = max_pool_threads
          index = self.connection_config.get_index(pool_threads=pool_threads)
          with index:
-             async_results = [index.upsert(vectors=chunk, async_req=True) for chunk in chunks]
+             upsert_kwargs = [{"vectors": chunk, "async_req": True} for chunk in chunks]
+             if namespace := self.upload_config.namespace:
+                 for kwargs in upsert_kwargs:
+                     kwargs["namespace"] = namespace
+             async_results = [index.upsert(**kwarg) for kwarg in upsert_kwargs]
          # Wait for and retrieve responses (this raises in case of error)
          try:
              results = [async_result.get() for async_result in async_results]
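Note: when `namespace` is set it is forwarded as an extra keyword to every `index.upsert` call; left unset, upserts go to Pinecone's default namespace as before. A minimal config sketch (the "prod" value is a placeholder; other uploader fields keep their defaults):

    from unstructured_ingest.v2.processes.connectors.pinecone import PineconeUploaderConfig

    # Route all upserts for this run into the "prod" namespace.
    upload_config = PineconeUploaderConfig(namespace="prod")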
--- a/unstructured_ingest-0.0.24.dist-info/METADATA
+++ b/unstructured_ingest-0.0.25.dist-info/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: unstructured-ingest
- Version: 0.0.24
+ Version: 0.0.25
  Summary: A library that prepares raw documents for downstream ML tasks.
  Home-page: https://github.com/Unstructured-IO/unstructured-ingest
  Author: Unstructured Technologies
@@ -23,12 +23,12 @@ Requires-Python: >=3.9.0,<3.13
  Description-Content-Type: text/markdown
  License-File: LICENSE.md
  Requires-Dist: pydantic>=2.7
- Requires-Dist: tqdm
- Requires-Dist: click
- Requires-Dist: python-dateutil
+ Requires-Dist: dataclasses-json
  Requires-Dist: opentelemetry-sdk
+ Requires-Dist: python-dateutil
  Requires-Dist: pandas
- Requires-Dist: dataclasses-json
+ Requires-Dist: tqdm
+ Requires-Dist: click
  Provides-Extra: airtable
  Requires-Dist: pyairtable; extra == "airtable"
  Provides-Extra: astradb
@@ -41,11 +41,11 @@ Requires-Dist: azure-search-documents; extra == "azure-cognitive-search"
  Provides-Extra: bedrock
  Requires-Dist: boto3; extra == "bedrock"
  Provides-Extra: biomed
- Requires-Dist: bs4; extra == "biomed"
  Requires-Dist: requests; extra == "biomed"
+ Requires-Dist: bs4; extra == "biomed"
  Provides-Extra: box
- Requires-Dist: boxfs; extra == "box"
  Requires-Dist: fsspec; extra == "box"
+ Requires-Dist: boxfs; extra == "box"
  Provides-Extra: chroma
  Requires-Dist: chromadb; extra == "chroma"
  Provides-Extra: clarifai
@@ -91,8 +91,8 @@ Requires-Dist: gcsfs; extra == "gcs"
  Requires-Dist: fsspec; extra == "gcs"
  Requires-Dist: bs4; extra == "gcs"
  Provides-Extra: github
- Requires-Dist: pygithub>1.58.0; extra == "github"
  Requires-Dist: requests; extra == "github"
+ Requires-Dist: pygithub>1.58.0; extra == "github"
  Provides-Extra: gitlab
  Requires-Dist: python-gitlab; extra == "gitlab"
  Provides-Extra: google-drive
@@ -116,14 +116,14 @@ Provides-Extra: msg
  Requires-Dist: unstructured[msg]; extra == "msg"
  Provides-Extra: notion
  Requires-Dist: notion-client; extra == "notion"
- Requires-Dist: backoff; extra == "notion"
- Requires-Dist: htmlBuilder; extra == "notion"
  Requires-Dist: httpx; extra == "notion"
+ Requires-Dist: htmlBuilder; extra == "notion"
+ Requires-Dist: backoff; extra == "notion"
  Provides-Extra: odt
  Requires-Dist: unstructured[odt]; extra == "odt"
  Provides-Extra: onedrive
- Requires-Dist: msal; extra == "onedrive"
  Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
+ Requires-Dist: msal; extra == "onedrive"
  Requires-Dist: bs4; extra == "onedrive"
  Provides-Extra: openai
  Requires-Dist: openai; extra == "openai"
@@ -133,8 +133,8 @@ Requires-Dist: opensearch-py; extra == "opensearch"
  Provides-Extra: org
  Requires-Dist: unstructured[org]; extra == "org"
  Provides-Extra: outlook
- Requires-Dist: msal; extra == "outlook"
  Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
+ Requires-Dist: msal; extra == "outlook"
  Provides-Extra: pdf
  Requires-Dist: unstructured[pdf]; extra == "pdf"
  Provides-Extra: pinecone
@@ -164,8 +164,8 @@ Provides-Extra: sftp
  Requires-Dist: paramiko; extra == "sftp"
  Requires-Dist: fsspec; extra == "sftp"
  Provides-Extra: sharepoint
- Requires-Dist: msal; extra == "sharepoint"
  Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
+ Requires-Dist: msal; extra == "sharepoint"
  Provides-Extra: singlestore
  Requires-Dist: singlestoredb; extra == "singlestore"
  Provides-Extra: slack
--- a/unstructured_ingest-0.0.24.dist-info/RECORD
+++ b/unstructured_ingest-0.0.25.dist-info/RECORD
@@ -1,5 +1,5 @@
  unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
- unstructured_ingest/__version__.py,sha256=i77-gjXpw3EQpetJm6qwuhTR53KoBsdCYSBjHDaGJUQ,43
+ unstructured_ingest/__version__.py,sha256=WG3ykkrrofptunFgyMVyh_5Uyla9d5aYDfBtMqyZ_lE,43
  unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
  unstructured_ingest/interfaces.py,sha256=0r0gQoHJQ4DVSQEVbUPBA3N6WyvGMkR1u6U2SwUvoAQ,31361
  unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
@@ -257,7 +257,7 @@ unstructured_ingest/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJW
  unstructured_ingest/utils/chunking.py,sha256=efWEfMcCukG5zASZrXhkNgAX8AzHa6t3rClMzm2TwFE,1521
  unstructured_ingest/utils/compression.py,sha256=NNiY-2S2Gf3at7zC1PYxMijaEza9vVSzRn5mdFf6mHo,4434
  unstructured_ingest/utils/data_prep.py,sha256=9UKewDHB8-cMlQ8POvokhjVsy-ksiSqAAW2ibqPYAfk,4400
- unstructured_ingest/utils/dep_check.py,sha256=cVEqZtMwji8BIt7pjtUOMtEmN7KaNXRXwelEKFpOdW8,1914
+ unstructured_ingest/utils/dep_check.py,sha256=SXXcUna2H0RtxA6j1S2NGkvQa9JP2DujWhmyBa7776Y,2400
  unstructured_ingest/utils/google_filetype.py,sha256=YVspEkiiBrRUSGVeVbsavvLvTmizdy2e6TsjigXTSRU,468
  unstructured_ingest/utils/string_and_date_utils.py,sha256=LwcbLmWpwt1zEabLlyUd5kIf9oOWcZxsRzxDglLCMeU,1375
  unstructured_ingest/utils/table.py,sha256=aWjcowDVSClNpEAdR6PY3H7khKu4T6T3QqQE6GjmQ_M,3469
@@ -308,13 +308,13 @@ unstructured_ingest/v2/processes/embedder.py,sha256=nFYiOmIJwWLodBt_cC-E5h7zmYB9
  unstructured_ingest/v2/processes/filter.py,sha256=kjUmMw2SDq2bme0JCAOxs6cJriIG6Ty09KOznS-xz08,2145
  unstructured_ingest/v2/processes/partitioner.py,sha256=bpqmZDsKKi6qtxNWdIWBfQmr1ccQUhU0axecpGAUf_4,7739
  unstructured_ingest/v2/processes/uncompress.py,sha256=Z_XfsITGdyaRwhtNUc7bMj5Y2jLuBge8KoK4nxhqKag,2425
- unstructured_ingest/v2/processes/connectors/__init__.py,sha256=6iBdoH6BW8oMK1ZvEi0IgEchuk0cNUPoNIaikpzeML8,4992
+ unstructured_ingest/v2/processes/connectors/__init__.py,sha256=XZWdbUKXioO4vfCYjgNNV4ZDNPQ_VrAUcHMjHGIys3E,5334
  unstructured_ingest/v2/processes/connectors/airtable.py,sha256=Yi7PEv_FejZ9_y3BPY3gu5YGVfeLh-9YX-qLyQHjJsY,8921
  unstructured_ingest/v2/processes/connectors/astradb.py,sha256=ZctZRfXcOAMBGPkKgHvhTmV_-2F0YN5vqwfY9UCHIlU,5791
  unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py,sha256=S55v7TXu30rEdgythMBB_2VcuomyMPmcPtLYykbhw_E,8466
  unstructured_ingest/v2/processes/connectors/chroma.py,sha256=skrxRPHZ8y3JxNa0dt5SVitHiDQ5WVxLvY_kh2-QUrQ,8029
  unstructured_ingest/v2/processes/connectors/couchbase.py,sha256=SONLywyEfoAlLc-HPabXeGzoiwKnekMHIbRMXd4CGXs,12146
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py,sha256=E_4DzeemC4mhZsVuLmSXtfy4MR1MoU6CNyvpRqsKnJU,6030
+ unstructured_ingest/v2/processes/connectors/databricks_volumes.py,sha256=BQHHpCDwE51inD3pZF4tL4zLr7lv6iBcwnA1NazrHqY,9423
  unstructured_ingest/v2/processes/connectors/elasticsearch.py,sha256=ojxMUHkLa6ZG50aTGn2YWhDHZ1n38uFRn5p8_ghAIvM,16762
  unstructured_ingest/v2/processes/connectors/google_drive.py,sha256=7xOQthcqBd9auJxB0nxZlhh1vdjXpMX_CtQZa6YfZz0,13088
  unstructured_ingest/v2/processes/connectors/kdbai.py,sha256=D71gt8fsPOXi2-Rir8mATw6dRM3BdzYGnn62qG1iaBw,5586
@@ -323,7 +323,8 @@ unstructured_ingest/v2/processes/connectors/milvus.py,sha256=ZUlyAQyTt0U1JoapFYH
  unstructured_ingest/v2/processes/connectors/mongodb.py,sha256=2_R_hrEAaTU4vJTCK9oKblWTgv6BKjyUhFtC7uq3q2w,4859
  unstructured_ingest/v2/processes/connectors/onedrive.py,sha256=ZiUo-dFo1LMOvFwphSLRZiR1PcrN8GWLTHhsh4TU6n0,9207
  unstructured_ingest/v2/processes/connectors/opensearch.py,sha256=dfDSNrWIEk19wuHdlMJpp_SLMOteNPlkDBPlAwu1LVY,6767
- unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=8St-JaVrDdQEVZpRS_TfjFusfjg0bAg3IYyykGFyWdw,7169
+ unstructured_ingest/v2/processes/connectors/outlook.py,sha256=NK67Pd8Nk5oUIXTK-sK18K7rZ_Cl0UuCbeF2ExBEZho,9294
+ unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=k_GH55S_OQ6-wCLC6gkhRrNpXIFECYZ_2Gjz_XRtY6Y,7561
  unstructured_ingest/v2/processes/connectors/salesforce.py,sha256=2CiO2ZZiZ1Y1-nB7wcDlDVcpW2B7ut9wCj66rkkqho0,11616
  unstructured_ingest/v2/processes/connectors/sharepoint.py,sha256=hOaV5gBcHFc6N5Rbu3MgM-5Aol1ht-QkNIN4PqjvfxE,19665
  unstructured_ingest/v2/processes/connectors/singlestore.py,sha256=4rVvWKK2iQr03Ff6cB5zjfE1MpN0JyIGpCxxFCDI6hc,5563
@@ -339,9 +340,9 @@ unstructured_ingest/v2/processes/connectors/fsspec/gcs.py,sha256=-_pYHbsBG9FyRyN
  unstructured_ingest/v2/processes/connectors/fsspec/s3.py,sha256=je1BDqFWlyMfPa4oAMMNFQLLQtCY9quuqx3xjTwF8OQ,6251
  unstructured_ingest/v2/processes/connectors/fsspec/sftp.py,sha256=dwpyqDq0qceCBWX3zM1hiUlgXB4hzX6ObOr-sh-5CJs,6926
  unstructured_ingest/v2/processes/connectors/fsspec/utils.py,sha256=jec_Qfe2hbfahBuY-u8FnvHuv933AI5HwPFjOL3kEEY,456
- unstructured_ingest-0.0.24.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
- unstructured_ingest-0.0.24.dist-info/METADATA,sha256=rHTF8fy1vNg5NmCBNVdobYWeGgpn_PBKao2z54UbgnE,7108
- unstructured_ingest-0.0.24.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
- unstructured_ingest-0.0.24.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
- unstructured_ingest-0.0.24.dist-info/top_level.txt,sha256=QaTxTcjfM5Hr9sZJ6weOJvSe5ESQc0F8AWkhHInTCf8,20
- unstructured_ingest-0.0.24.dist-info/RECORD,,
+ unstructured_ingest-0.0.25.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
+ unstructured_ingest-0.0.25.dist-info/METADATA,sha256=NdNIJw4d0nu0NKP_FD5c8RZ2Tt3hWMMm0pJNdKGZdQU,7108
+ unstructured_ingest-0.0.25.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
+ unstructured_ingest-0.0.25.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
+ unstructured_ingest-0.0.25.dist-info/top_level.txt,sha256=QaTxTcjfM5Hr9sZJ6weOJvSe5ESQc0F8AWkhHInTCf8,20
+ unstructured_ingest-0.0.25.dist-info/RECORD,,