unstructured-ingest 0.5.23-py3-none-any.whl → 0.5.25-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest has been flagged by the registry as potentially problematic.
--- a/test/integration/connectors/test_vectara.py
+++ b/test/integration/connectors/test_vectara.py
@@ -1,6 +1,7 @@
 import json
 import os
 import time
+from functools import lru_cache
 from pathlib import Path
 from typing import Generator
 from uuid import uuid4
@@ -25,24 +26,29 @@ from unstructured_ingest.v2.processes.connectors.vectara import (
 )
 
 
-def validate_upload(response: dict, expected_data: dict):
+def validate_upload(document: dict, expected_data: dict):
+    logger.info(f"validating document: {document}")
     element_id = expected_data["element_id"]
     expected_text = expected_data["text"]
     filename = expected_data["metadata"]["filename"]
     filetype = expected_data["metadata"]["filetype"]
     page_number = expected_data["metadata"]["page_number"]
 
-    response = response["search_results"][0]
-
-    assert response is not None
-    assert response["text"] == expected_text
-    assert response["part_metadata"]["element_id"] == element_id
-    assert response["part_metadata"]["filename"] == filename
-    assert response["part_metadata"]["filetype"] == filetype
-    assert response["part_metadata"]["page_number"] == page_number
+    assert document is not None
+    speech_parts = document["parts"]
+    assert speech_parts
+    first_part = speech_parts[0]
+    assert first_part["text"] == expected_text
+    part_metadata = first_part["metadata"]
+    assert part_metadata
+    assert part_metadata["element_id"] == element_id
+    assert part_metadata["filename"] == filename
+    assert part_metadata["filetype"] == filetype
+    assert part_metadata["page_number"] == page_number
 
 
 @requires_env("VECTARA_OAUTH_CLIENT_ID", "VECTARA_OAUTH_SECRET", "VECTARA_CUSTOMER_ID")
+@lru_cache()
 def _get_jwt_token():
     """Connect to the server and get a JWT token."""
     customer_id = os.environ["VECTARA_CUSTOMER_ID"]
@@ -65,23 +71,12 @@ def _get_jwt_token():
     return response_json.get("access_token")
 
 
-def query_data(corpus_key: str, element_id: str) -> dict:
+def list_documents(corpus_key: str) -> list[str]:
 
-    url = f"https://api.vectara.io/v2/corpora/{corpus_key}/query"
+    url = f"https://api.vectara.io/v2/corpora/{corpus_key}/documents"
 
     # the query below requires the corpus to have filter attributes for element_id
 
-    data = json.dumps(
-        {
-            "query": "string",
-            "search": {
-                "metadata_filter": f"part.element_id = '{element_id}'",
-                "lexical_interpolation": 1,
-                "limit": 10,
-            },
-        }
-    )
-
     jwt_token = _get_jwt_token()
     headers = {
         "Content-Type": "application/json",
@@ -90,11 +85,26 @@ def query_data(corpus_key: str, element_id: str) -> dict:
         "X-source": "unstructured",
     }
 
-    response = requests.post(url, headers=headers, data=data)
+    response = requests.get(url, headers=headers)
     response.raise_for_status()
     response_json = response.json()
+    documents = response_json.get("documents", [])
+    return documents
+
 
-    return response_json
+def fetch_document(corpus_key: str, documents_id: str) -> dict:
+    url = f"https://api.vectara.io/v2/corpora/{corpus_key}/documents/{documents_id}"
+    jwt_token = _get_jwt_token()
+    headers = {
+        "Content-Type": "application/json",
+        "Accept": "application/json",
+        "Authorization": f"Bearer {jwt_token}",
+        "X-source": "unstructured",
+    }
+
+    response = requests.get(url, headers=headers)
+    response.raise_for_status()
+    return response.json()
 
 
 def create_corpora(corpus_key: str, corpus_name: str) -> None:
@@ -148,8 +158,8 @@ def delete_corpora(corpus_key: str) -> None:
     response.raise_for_status()
 
 
-def list_corpora() -> list:
-    url = "https://api.vectara.io/v2/corpora?limit=100"
+def get_metadata(corpus_key: str):
+    url = f"https://api.vectara.io/v2/corpora/{corpus_key}"
     jwt_token = _get_jwt_token()
     headers = {
         "Content-Type": "application/json",
@@ -159,35 +169,28 @@ def list_corpora() -> list:
     }
     response = requests.get(url, headers=headers)
     response.raise_for_status()
-    response_json = response.json()
-    if response_json.get("corpora"):
-        return [item["key"] for item in response_json.get("corpora")]
-    else:
-        return []
+    return response.json()
 
 
 def wait_for_ready(corpus_key: str, timeout=60, interval=2) -> None:
-    def is_ready_status():
-        corpora_list = list_corpora()
-        return corpus_key in corpora_list
-
     start = time.time()
-    is_ready = is_ready_status()
-    while not is_ready and time.time() - start < timeout:
-        time.sleep(interval)
-        is_ready = is_ready_status()
-    if not is_ready:
-        raise TimeoutError("time out waiting for corpus to be ready")
+    while time.time() - start < timeout:
+        try:
+            get_metadata(corpus_key)
+            return
+        except requests.HTTPError:
+            time.sleep(interval)
+    raise TimeoutError("time out waiting for corpus to be ready")
 
 
 def wait_for_delete(corpus_key: str, timeout=60, interval=2) -> None:
     start = time.time()
     while time.time() - start < timeout:
-        corpora_list = list_corpora()
-        if corpus_key not in corpora_list:
+        try:
+            get_metadata(corpus_key)
+            time.sleep(interval)
+        except requests.HTTPError:
             return
-        time.sleep(interval)
-
     raise TimeoutError("time out waiting for corpus to delete")
 
 
@@ -210,11 +213,23 @@ def corpora_util() -> Generator[str, None, None]:
     wait_for_delete(corpus_key=corpus_key)
 
 
+def wait_for_doc_meta(corpus_key: str, timeout=60, interval=1) -> list[str]:
+    start = time.time()
+    while time.time() - start < timeout:
+        all_document_meta = list_documents(corpus_key)
+        if not all_document_meta:
+            time.sleep(interval)
+            continue
+        else:
+            return all_document_meta
+    raise TimeoutError("time out waiting for document to be ready")
+
+
 @pytest.mark.asyncio
 @pytest.mark.tags(VECTARA_CONNECTOR_TYPE, DESTINATION_TAG, "vectara", NOSQL_TAG)
 @requires_env("VECTARA_OAUTH_CLIENT_ID", "VECTARA_OAUTH_SECRET", "VECTARA_CUSTOMER_ID")
 async def test_vectara_destination(
-    upload_file: Path, tmp_path: Path, corpora_util: str, retries=30, interval=10
+    upload_file: Path, tmp_path: Path, corpora_util: str, retries=30, interval=1
 ):
     corpus_key = corpora_util
     connection_kwargs = {
@@ -231,7 +246,7 @@ async def test_vectara_destination(
         identifier="mock-file-data",
     )
 
-    stager_config = VectaraUploadStagerConfig(batch_size=10)
+    stager_config = VectaraUploadStagerConfig()
    stager = VectaraUploadStager(upload_stager_config=stager_config)
     new_upload_file = stager.run(
         elements_filepath=upload_file,
@@ -260,11 +275,8 @@ async def test_vectara_destination(
         elements = json.load(upload_fp)
     first_element = elements[0]
 
-    for i in range(retries):
-        response = query_data(corpus_key, first_element["element_id"])
-        if not response["search_results"]:
-            time.sleep(interval)
-        else:
-            break
-
-    validate_upload(response=response, expected_data=first_element)
+    all_document_meta = wait_for_doc_meta(corpus_key)
+    assert len(all_document_meta) == 1
+    document_meta = all_document_meta[0]
+    document = fetch_document(corpus_key=corpus_key, documents_id=document_meta["id"])
+    validate_upload(document=document, expected_data=first_element)
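The rewritten test no longer issues a filtered query against the corpus; it polls the corpus document listing and then fetches the single indexed document by id before validating its first part, while the @lru_cache() on _get_jwt_token lets one bearer token be reused across these helpers. Below is a minimal sketch of that list-then-fetch flow against the Vectara v2 REST API, assuming a valid corpus key and JWT token; poll_first_document and BASE are illustrative names, not part of the package or its test suite.

# A minimal sketch of the list-then-fetch verification flow, assuming a valid
# corpus_key and JWT bearer token. poll_first_document and BASE are illustrative.
import time

import requests

BASE = "https://api.vectara.io/v2"


def poll_first_document(corpus_key: str, jwt_token: str, timeout: int = 60, interval: int = 1) -> dict:
    headers = {"Accept": "application/json", "Authorization": f"Bearer {jwt_token}"}
    start = time.time()
    while time.time() - start < timeout:
        # List document metadata for the corpus; the list stays empty until indexing finishes.
        listing = requests.get(f"{BASE}/corpora/{corpus_key}/documents", headers=headers)
        listing.raise_for_status()
        documents = listing.json().get("documents", [])
        if documents:
            # Fetch the full document, including its parts, by id.
            doc_id = documents[0]["id"]
            response = requests.get(f"{BASE}/corpora/{corpus_key}/documents/{doc_id}", headers=headers)
            response.raise_for_status()
            return response.json()
        time.sleep(interval)
    raise TimeoutError("document never became visible in the corpus")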
--- a/unstructured_ingest/__version__.py
+++ b/unstructured_ingest/__version__.py
@@ -1 +1 @@
-__version__ = "0.5.23"  # pragma: no cover
+__version__ = "0.5.25"  # pragma: no cover
--- a/unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py
+++ b/unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py
@@ -1,13 +1,13 @@
-import json
 import os
+import tempfile
 from contextlib import contextmanager
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any, Generator
+from typing import TYPE_CHECKING, Any, Generator, Optional
 
 from pydantic import Field
 
-from unstructured_ingest.utils.data_prep import write_data
+from unstructured_ingest.utils.data_prep import get_data_df, write_data
 from unstructured_ingest.v2.interfaces import FileData, Uploader, UploaderConfig
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
@@ -22,6 +22,9 @@ from unstructured_ingest.v2.processes.connectors.sql.databricks_delta_tables imp
 
 CONNECTOR_TYPE = "databricks_volume_delta_tables"
 
+if TYPE_CHECKING:
+    from pandas import DataFrame
+
 
 class DatabricksVolumeDeltaTableUploaderConfig(UploaderConfig, DatabricksPathMixin):
     database: str = Field(description="Database name", default="default")
@@ -30,10 +33,12 @@ class DatabricksVolumeDeltaTableUploaderConfig(UploaderConfig, DatabricksPathMix
 
 @dataclass
 class DatabricksVolumeDeltaTableStager(DatabricksDeltaTablesUploadStager):
-    def write_output(self, output_path: Path, data: list[dict]) -> None:
+    def write_output(self, output_path: Path, data: list[dict]) -> Path:
         # To avoid new line issues when migrating from volumes into delta tables, omit indenting
         # and always write it as a json file
-        write_data(path=output_path.with_suffix(".json"), data=data, indent=None)
+        final_output_path = output_path.with_suffix(".json")
+        write_data(path=final_output_path, data=data, indent=None)
+        return final_output_path
 
 
 @dataclass
@@ -41,6 +46,7 @@ class DatabricksVolumeDeltaTableUploader(Uploader):
     connection_config: DatabricksDeltaTablesConnectionConfig
     upload_config: DatabricksVolumeDeltaTableUploaderConfig
     connector_type: str = CONNECTOR_TYPE
+    _columns: Optional[dict[str, str]] = None
 
     def precheck(self) -> None:
         with self.connection_config.get_cursor() as cursor:
@@ -84,20 +90,58 @@ class DatabricksVolumeDeltaTableUploader(Uploader):
             cursor.execute(f"USE DATABASE {self.upload_config.database}")
             yield cursor
 
-    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        with self.get_cursor(staging_allowed_local_path=str(path.parent)) as cursor:
-            catalog_path = self.get_output_path(file_data=file_data)
-            logger.debug(f"uploading {path.as_posix()} to {catalog_path}")
-            cursor.execute(f"PUT '{path.as_posix()}' INTO '{catalog_path}' OVERWRITE")
-            logger.debug(
-                f"migrating content from {catalog_path} to table {self.upload_config.table_name}"
+    def get_table_columns(self) -> dict[str, str]:
+        if self._columns is None:
+            with self.get_cursor() as cursor:
+                cursor.execute(f"SELECT * from {self.upload_config.table_name} LIMIT 1")
+                self._columns = {desc[0]: desc[1] for desc in cursor.description}
+        return self._columns
+
+    def _fit_to_schema(self, df: "DataFrame", add_missing_columns: bool = True) -> "DataFrame":
+        import pandas as pd
+
+        table_columns = self.get_table_columns()
+        columns = set(df.columns)
+        schema_fields = set(table_columns.keys())
+        columns_to_drop = columns - schema_fields
+        missing_columns = schema_fields - columns
+
+        if columns_to_drop:
+            logger.info(
+                "Following columns will be dropped to match the table's schema: "
+                f"{', '.join(columns_to_drop)}"
+            )
+        if missing_columns and add_missing_columns:
+            logger.info(
+                "Following null filled columns will be added to match the table's schema:"
+                f" {', '.join(missing_columns)} "
             )
-            with path.open() as f:
-                data = json.load(f)
-            columns = data[0].keys()
-            column_str = ", ".join(columns)
-            sql_statment = f"INSERT INTO `{self.upload_config.table_name}` ({column_str}) SELECT {column_str} FROM json.`{catalog_path}`"  # noqa: E501
-            cursor.execute(sql_statment)
+
+        df = df.drop(columns=columns_to_drop)
+
+        if add_missing_columns:
+            for column in missing_columns:
+                df[column] = pd.Series()
+        return df
+
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        with tempfile.TemporaryDirectory() as temp_dir:
+            df = get_data_df()
+            df = self._fit_to_schema(df=df)
+            temp_path = Path(temp_dir) / path.name
+            df.to_json(temp_path, orient="records", lines=False)
+            with self.get_cursor(staging_allowed_local_path=temp_dir) as cursor:
+                catalog_path = self.get_output_path(file_data=file_data)
+                logger.debug(f"uploading {path.as_posix()} to {catalog_path}")
+                cursor.execute(f"PUT '{temp_path.as_posix()}' INTO '{catalog_path}' OVERWRITE")
+                logger.debug(
+                    f"migrating content from {catalog_path} to "
+                    f"table {self.upload_config.table_name}"
+                )
+                columns = list(df.columns)
+                column_str = ", ".join(columns)
+                sql_statment = f"INSERT INTO `{self.upload_config.table_name}` ({column_str}) SELECT {column_str} FROM json.`{catalog_path}`"  # noqa: E501
+                cursor.execute(sql_statment)
 
 
 databricks_volumes_delta_tables_destination_entry = DestinationRegistryEntry(
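The uploader's run method now loads the staged output into a DataFrame, reconciles it against the destination table's columns, and writes a temporary JSON file for the PUT. Below is a standalone sketch of that column reconciliation using plain pandas; the fit_to_schema function and the example column set are illustrative, whereas the connector derives the real schema from the LIMIT 1 query shown above.

# A standalone sketch of the schema-fitting step, with an illustrative target column set.
import pandas as pd


def fit_to_schema(df: pd.DataFrame, table_columns: set[str]) -> pd.DataFrame:
    present = set(df.columns)
    to_drop = present - table_columns   # staged fields the table does not have
    to_add = table_columns - present    # table fields missing from the staged data

    df = df.drop(columns=list(to_drop))
    for column in to_add:
        df[column] = pd.Series(dtype="object")  # null-filled placeholder column
    return df


# Example: a staged frame with one extra field and one missing field.
staged = pd.DataFrame([{"text": "hello", "element_id": "abc", "extra": 1}])
fitted = fit_to_schema(staged, table_columns={"text", "element_id", "page_number"})
print(list(fitted.columns))  # ['text', 'element_id', 'page_number']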
--- a/unstructured_ingest/v2/processes/connectors/fsspec/s3.py
+++ b/unstructured_ingest/v2/processes/connectors/fsspec/s3.py
@@ -33,6 +33,9 @@ from unstructured_ingest.v2.processes.utils.blob_storage import (
 
 CONNECTOR_TYPE = "s3"
 
+# https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-keys.html#object-key-guidelines-avoid-characters
+CHARACTERS_TO_AVOID = ["\\", "{", "^", "}", "%", "`", "]", '"', ">", "[", "~", "<", "#", "|"]
+
 if TYPE_CHECKING:
     from s3fs import S3FileSystem
 
@@ -91,7 +94,7 @@ class S3ConnectionConfig(FsspecConnectionConfig):
         if isinstance(e, PermissionError):
             return UserAuthError(e)
         if isinstance(e, FileNotFoundError):
-            return UserError(e)
+            return UserError(f"File not found: {e}")
         if cause := getattr(e, "__cause__", None):
            error_response = cause.response
            error_meta = error_response["ResponseMetadata"]
@@ -140,6 +143,12 @@ class S3Indexer(FsspecIndexer):
         }
         if metadata:
             record_locator["metadata"] = metadata
+        issue_characters = [char for char in CHARACTERS_TO_AVOID if char in path]
+        if issue_characters:
+            logger.warning(
+                f"File path {path} contains characters "
+                f"that can cause issues with S3: {issue_characters}"
+            )
         return FileDataSourceMetadata(
             date_created=date_created,
             date_modified=date_modified,
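The S3 indexer now warns when an object key contains characters that AWS recommends avoiding. A minimal sketch of the same check follows; problematic_characters is an illustrative helper name, and the character list mirrors the AWS guidance linked in the diff.

# A minimal sketch of the key check; the list mirrors the AWS "characters to avoid" guidance.
CHARACTERS_TO_AVOID = ["\\", "{", "^", "}", "%", "`", "]", '"', ">", "[", "~", "<", "#", "|"]


def problematic_characters(key: str) -> list[str]:
    return [char for char in CHARACTERS_TO_AVOID if char in key]


print(problematic_characters("bucket/dir/report#draft<v2>.pdf"))  # ['>', '<', '#']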
--- a/unstructured_ingest/v2/processes/connectors/sql/sql.py
+++ b/unstructured_ingest/v2/processes/connectors/sql/sql.py
@@ -251,8 +251,9 @@ class SQLUploadStager(UploadStager):
             df[column] = df[column].apply(str)
         return df
 
-    def write_output(self, output_path: Path, data: list[dict]) -> None:
+    def write_output(self, output_path: Path, data: list[dict]) -> Path:
         write_data(path=output_path, data=data)
+        return output_path
 
     def run(
         self,
@@ -278,8 +279,10 @@ class SQLUploadStager(UploadStager):
         output_filename = f"{Path(output_filename).stem}{output_filename_suffix}"
         output_path = self.get_output_path(output_filename=output_filename, output_dir=output_dir)
 
-        self.write_output(output_path=output_path, data=df.to_dict(orient="records"))
-        return output_path
+        final_output_path = self.write_output(
+            output_path=output_path, data=df.to_dict(orient="records")
+        )
+        return final_output_path
 
 
 class SQLUploaderConfig(UploaderConfig):
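write_output now returns the path it actually wrote, and run passes that value through, which matters when a subclass changes the file suffix as the Databricks delta-tables stager above does. Below is a minimal sketch of the pattern with hypothetical stager classes, not the library's real base classes.

# A minimal sketch of the "write_output returns the real path" pattern, with hypothetical classes.
import json
import tempfile
from pathlib import Path


class BaseStager:
    def write_output(self, output_path: Path, data: list[dict]) -> Path:
        output_path.write_text(json.dumps(data))
        return output_path

    def run(self, output_path: Path, data: list[dict]) -> Path:
        # Return whatever path the (possibly overridden) writer actually produced.
        return self.write_output(output_path=output_path, data=data)


class JsonSuffixStager(BaseStager):
    def write_output(self, output_path: Path, data: list[dict]) -> Path:
        # Force a .json suffix and report the real path, mirroring the Databricks stager.
        final_path = output_path.with_suffix(".json")
        final_path.write_text(json.dumps(data))
        return final_path


with tempfile.TemporaryDirectory() as tmp:
    requested = Path(tmp) / "elements.ndjson"
    written = JsonSuffixStager().run(requested, [{"text": "hi"}])
    print(written.name)  # elements.json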
--- a/unstructured_ingest-0.5.23.dist-info/METADATA
+++ b/unstructured_ingest-0.5.25.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: unstructured-ingest
-Version: 0.5.23
+Version: 0.5.25
 Summary: A library that prepares raw documents for downstream ML tasks.
 Home-page: https://github.com/Unstructured-IO/unstructured-ingest
 Author: Unstructured Technologies
@@ -22,11 +22,11 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.9.0,<3.14
 Description-Content-Type: text/markdown
 License-File: LICENSE.md
-Requires-Dist: dataclasses_json
 Requires-Dist: opentelemetry-sdk
+Requires-Dist: python-dateutil
 Requires-Dist: click
+Requires-Dist: dataclasses_json
 Requires-Dist: tqdm
-Requires-Dist: python-dateutil
 Requires-Dist: pydantic>=2.7
 Requires-Dist: numpy
 Requires-Dist: pandas
@@ -112,8 +112,8 @@ Requires-Dist: azure-search-documents; extra == "azure-ai-search"
 Requires-Dist: numpy; extra == "azure-ai-search"
 Requires-Dist: pandas; extra == "azure-ai-search"
 Provides-Extra: biomed
-Requires-Dist: bs4; extra == "biomed"
 Requires-Dist: requests; extra == "biomed"
+Requires-Dist: bs4; extra == "biomed"
 Requires-Dist: numpy; extra == "biomed"
 Requires-Dist: pandas; extra == "biomed"
 Provides-Extra: box
@@ -161,14 +161,14 @@ Requires-Dist: elasticsearch[async]; extra == "elasticsearch"
 Requires-Dist: numpy; extra == "elasticsearch"
 Requires-Dist: pandas; extra == "elasticsearch"
 Provides-Extra: gcs
+Requires-Dist: bs4; extra == "gcs"
 Requires-Dist: gcsfs; extra == "gcs"
 Requires-Dist: fsspec; extra == "gcs"
-Requires-Dist: bs4; extra == "gcs"
 Requires-Dist: numpy; extra == "gcs"
 Requires-Dist: pandas; extra == "gcs"
 Provides-Extra: github
-Requires-Dist: pygithub>1.58.0; extra == "github"
 Requires-Dist: requests; extra == "github"
+Requires-Dist: pygithub>1.58.0; extra == "github"
 Requires-Dist: numpy; extra == "github"
 Requires-Dist: pandas; extra == "github"
 Provides-Extra: gitlab
@@ -180,14 +180,14 @@ Requires-Dist: google-api-python-client; extra == "google-drive"
 Requires-Dist: numpy; extra == "google-drive"
 Requires-Dist: pandas; extra == "google-drive"
 Provides-Extra: hubspot
-Requires-Dist: urllib3; extra == "hubspot"
 Requires-Dist: hubspot-api-client; extra == "hubspot"
+Requires-Dist: urllib3; extra == "hubspot"
 Requires-Dist: numpy; extra == "hubspot"
 Requires-Dist: pandas; extra == "hubspot"
 Provides-Extra: ibm-watsonx-s3
-Requires-Dist: pyarrow; extra == "ibm-watsonx-s3"
 Requires-Dist: tenacity; extra == "ibm-watsonx-s3"
 Requires-Dist: pyiceberg; extra == "ibm-watsonx-s3"
+Requires-Dist: pyarrow; extra == "ibm-watsonx-s3"
 Requires-Dist: httpx; extra == "ibm-watsonx-s3"
 Requires-Dist: numpy; extra == "ibm-watsonx-s3"
 Requires-Dist: pandas; extra == "ibm-watsonx-s3"
@@ -216,21 +216,21 @@ Requires-Dist: pymongo; extra == "mongodb"
 Requires-Dist: numpy; extra == "mongodb"
 Requires-Dist: pandas; extra == "mongodb"
 Provides-Extra: neo4j
-Requires-Dist: neo4j-rust-ext; extra == "neo4j"
 Requires-Dist: networkx; extra == "neo4j"
 Requires-Dist: cymple; extra == "neo4j"
+Requires-Dist: neo4j-rust-ext; extra == "neo4j"
 Requires-Dist: numpy; extra == "neo4j"
 Requires-Dist: pandas; extra == "neo4j"
 Provides-Extra: notion
+Requires-Dist: notion-client; extra == "notion"
 Requires-Dist: backoff; extra == "notion"
 Requires-Dist: htmlBuilder; extra == "notion"
-Requires-Dist: notion-client; extra == "notion"
 Requires-Dist: httpx; extra == "notion"
 Requires-Dist: numpy; extra == "notion"
 Requires-Dist: pandas; extra == "notion"
 Provides-Extra: onedrive
-Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
 Requires-Dist: msal; extra == "onedrive"
+Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
 Requires-Dist: bs4; extra == "onedrive"
 Requires-Dist: numpy; extra == "onedrive"
 Requires-Dist: pandas; extra == "onedrive"
@@ -239,8 +239,8 @@ Requires-Dist: opensearch-py; extra == "opensearch"
 Requires-Dist: numpy; extra == "opensearch"
 Requires-Dist: pandas; extra == "opensearch"
 Provides-Extra: outlook
-Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
 Requires-Dist: msal; extra == "outlook"
+Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
 Requires-Dist: numpy; extra == "outlook"
 Requires-Dist: pandas; extra == "outlook"
 Provides-Extra: pinecone
@@ -264,13 +264,13 @@ Requires-Dist: redis; extra == "redis"
 Requires-Dist: numpy; extra == "redis"
 Requires-Dist: pandas; extra == "redis"
 Provides-Extra: s3
-Requires-Dist: s3fs; extra == "s3"
 Requires-Dist: fsspec; extra == "s3"
+Requires-Dist: s3fs; extra == "s3"
 Requires-Dist: numpy; extra == "s3"
 Requires-Dist: pandas; extra == "s3"
 Provides-Extra: sharepoint
-Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
 Requires-Dist: msal; extra == "sharepoint"
+Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
 Requires-Dist: numpy; extra == "sharepoint"
 Requires-Dist: pandas; extra == "sharepoint"
 Provides-Extra: salesforce
@@ -278,8 +278,8 @@ Requires-Dist: simple-salesforce; extra == "salesforce"
 Requires-Dist: numpy; extra == "salesforce"
 Requires-Dist: pandas; extra == "salesforce"
 Provides-Extra: sftp
-Requires-Dist: fsspec; extra == "sftp"
 Requires-Dist: paramiko; extra == "sftp"
+Requires-Dist: fsspec; extra == "sftp"
 Requires-Dist: numpy; extra == "sftp"
 Requires-Dist: pandas; extra == "sftp"
 Provides-Extra: slack
@@ -312,21 +312,21 @@ Requires-Dist: singlestoredb; extra == "singlestore"
 Requires-Dist: numpy; extra == "singlestore"
 Requires-Dist: pandas; extra == "singlestore"
 Provides-Extra: vectara
-Requires-Dist: aiofiles; extra == "vectara"
 Requires-Dist: requests; extra == "vectara"
+Requires-Dist: aiofiles; extra == "vectara"
 Requires-Dist: httpx; extra == "vectara"
 Requires-Dist: numpy; extra == "vectara"
 Requires-Dist: pandas; extra == "vectara"
 Provides-Extra: vastdb
-Requires-Dist: vastdb; extra == "vastdb"
 Requires-Dist: pyarrow; extra == "vastdb"
 Requires-Dist: ibis; extra == "vastdb"
+Requires-Dist: vastdb; extra == "vastdb"
 Requires-Dist: numpy; extra == "vastdb"
 Requires-Dist: pandas; extra == "vastdb"
 Provides-Extra: zendesk
-Requires-Dist: bs4; extra == "zendesk"
 Requires-Dist: aiofiles; extra == "zendesk"
 Requires-Dist: httpx; extra == "zendesk"
+Requires-Dist: bs4; extra == "zendesk"
 Requires-Dist: numpy; extra == "zendesk"
 Requires-Dist: pandas; extra == "zendesk"
 Provides-Extra: embed-huggingface
--- a/unstructured_ingest-0.5.23.dist-info/RECORD
+++ b/unstructured_ingest-0.5.25.dist-info/RECORD
@@ -24,7 +24,7 @@ test/integration/connectors/test_qdrant.py,sha256=Yme3ZZ5zIbaZ-yYLUqN2oy0hsrcAfv
 test/integration/connectors/test_redis.py,sha256=YXWWw4m40ZmLrf3eJ85hhT7WSJnri_GY1ieixIicYlI,5102
 test/integration/connectors/test_s3.py,sha256=E1dypeag_E3OIfpQWIz3jb7ctRHRD63UtyTrzyvJzpc,7473
 test/integration/connectors/test_sharepoint.py,sha256=weGby5YD6se7R7KLEq96hxUZYPzwoqZqXXTPhtQWZsQ,7646
-test/integration/connectors/test_vectara.py,sha256=4kKOOTGUjeZw2jKRcgVDI7ifbRPRZfjjVO4d_7H5C6I,8710
+test/integration/connectors/test_vectara.py,sha256=thM9vIWn7vcH1xjQK3owuEJMr65Z7L4j7NICsMpsMv8,9290
 test/integration/connectors/test_zendesk.py,sha256=nMBVNlEQr1uvmI1fzUC1bmoa2doXnYp5n4bMJS2FN-o,3727
 test/integration/connectors/databricks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/integration/connectors/databricks/test_volumes_native.py,sha256=KqiapQAV0s_Zv0CO8BwYoiCk30dwrSZzuigUWNRIem0,9559
@@ -113,7 +113,7 @@ test/unit/v2/partitioners/test_partitioner.py,sha256=iIYg7IpftV3LusoO4H8tr1IHY1U
 test/unit/v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/unit/v2/utils/data_generator.py,sha256=UoYVNjG4S4wlaA9gceQ82HIpF9_6I1UTHD1_GrQBHp0,973
 unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
-unstructured_ingest/__version__.py,sha256=zwHqD3LgpFA-cY-rrS_2n5Kv-NY8b8mUJsGrAiSe2kA,43
+unstructured_ingest/__version__.py,sha256=A9I2h_N6BTgmKRhQ1HbPOAJuwdOFgMb_aDmK1czvHyc,43
 unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
 unstructured_ingest/interfaces.py,sha256=7DOnDpGvUNlCoFR7UPRGmOarqH5sFtuUOO5vf8X3oTM,31489
 unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
@@ -462,7 +462,7 @@ unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py,sha256=h6q
 unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py,sha256=gjICJJwhDHBLt_L-LrMlvJ3DL1DYtwFpyMLb_zYvOIg,3755
 unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py,sha256=Uss3XPPaq1AsqJOEy4RJgBJw2-bTjrXH2PgtVNYd2w0,3006
 unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py,sha256=g1qYnIrML4TjN7rmC0MGrD5JzAprb6SymBHlEdOumz0,3113
-unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py,sha256=FZhjrMYBr_je6mWYp7MUUvyKR9YwGD2HiNljeT7U5ws,5044
+unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py,sha256=0kEtIVQSD6RhLAqpc-0BNFQazS7lnsnWalaN3Mdn97g,6805
 unstructured_ingest/v2/processes/connectors/duckdb/__init__.py,sha256=5sVvJCWhU-YkjHIwk4W6BZCanFYK5W4xTpWtQ8xzeB4,561
 unstructured_ingest/v2/processes/connectors/duckdb/base.py,sha256=o3J81DnSwt3lmAh19jXVPAYRZLJ3VyGhaEVO2SIjksQ,2926
 unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py,sha256=NIo2CCiPiuTFotNC891Mbelzg01knItryYGUtOM96xg,4393
@@ -476,7 +476,7 @@ unstructured_ingest/v2/processes/connectors/fsspec/box.py,sha256=aJCtCHRBAauLwdW
 unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py,sha256=epf2okPKqF4R-u_zxEYDJK4g0qhFqf1ejuz8JSJaNyU,8360
 unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py,sha256=0Z--cPh17W_j4jQkSe2BeeD_j0Tt147Z01gqqF58Z9A,14421
 unstructured_ingest/v2/processes/connectors/fsspec/gcs.py,sha256=nlDSKHs8mbXCY5Bok1hGH8UZJCdtnyhZWiRwn180ohk,7177
-unstructured_ingest/v2/processes/connectors/fsspec/s3.py,sha256=qO4WDZPoxmYMbUkaSvrxXaLn3UxzyMVhpj5wVyXqmi4,6623
+unstructured_ingest/v2/processes/connectors/fsspec/s3.py,sha256=MtD41jZQXB-fqNzW3Whqq6ydQYDUK6Jub7sSPvgLErw,7130
 unstructured_ingest/v2/processes/connectors/fsspec/sftp.py,sha256=ZimcBJL-Har7GOESb9blzDb8pzPZcmh16YvvHYxYkJM,6373
 unstructured_ingest/v2/processes/connectors/fsspec/utils.py,sha256=jec_Qfe2hbfahBuY-u8FnvHuv933AI5HwPFjOL3kEEY,456
 unstructured_ingest/v2/processes/connectors/ibm_watsonx/__init__.py,sha256=EMG7lyThrYO8W7y3DIxGgNNXtbpdeAdvLd0m4tpO-Io,377
@@ -568,7 +568,7 @@ unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py,sha25
 unstructured_ingest/v2/processes/connectors/sql/postgres.py,sha256=BATfX1PQGT2kl8jAbdNKXTojYKJxh3pJV9-h3OBnHGo,5124
 unstructured_ingest/v2/processes/connectors/sql/singlestore.py,sha256=am2d87kDkpTTB0VbPSX3ce9o6oM9KUQu5y9T_p1kgJw,5711
 unstructured_ingest/v2/processes/connectors/sql/snowflake.py,sha256=r2qgoEF3bUugzgSr3hMJyIm8DKmxsO53ZHXJSNxOsvE,9379
-unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=G28VUR0zaMVmQtbdZG6TRpkWFHvXJqFrr7SBuyM-fME,15608
+unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=CbysCnBBHtmYkqXiaoZSazI1ombNltrsqFrY-gQzm4U,15683
 unstructured_ingest/v2/processes/connectors/sql/sqlite.py,sha256=6RoBUxMbeuhduvTFlBKMgEH1NKJg7doQjXF_R5cUuX0,5319
 unstructured_ingest/v2/processes/connectors/sql/vastdb.py,sha256=wklJ8p3eMb81FTjS6ukPoILuWN0_KQBfuYGXfE0XrqY,9644
 unstructured_ingest/v2/processes/connectors/weaviate/__init__.py,sha256=NMiwnVWan69KnzVELvaqX34tMhCytIa-C8EDsXVKsEo,856
@@ -583,9 +583,9 @@ unstructured_ingest/v2/processes/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JC
 unstructured_ingest/v2/processes/utils/blob_storage.py,sha256=EWvK4HRYubr9i1UyMhv5cU9u0UzVkCDC_BIm4Uxab7Y,964
 unstructured_ingest/v2/types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unstructured_ingest/v2/types/file_data.py,sha256=kowOhvYy0q_-khX3IuR111AfjkdQezEfxjzK6QDH7oA,3836
-unstructured_ingest-0.5.23.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
-unstructured_ingest-0.5.23.dist-info/METADATA,sha256=yEHUhxSR1EF-2IoXViunb9iiNlEy9p0LgMTngzwtjLM,14999
-unstructured_ingest-0.5.23.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-unstructured_ingest-0.5.23.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
-unstructured_ingest-0.5.23.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
-unstructured_ingest-0.5.23.dist-info/RECORD,,
+unstructured_ingest-0.5.25.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
+unstructured_ingest-0.5.25.dist-info/METADATA,sha256=Z_PvUmam-C56UwoY92VhbvUd-fubXBHevjSMHKVgPx4,14999
+unstructured_ingest-0.5.25.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+unstructured_ingest-0.5.25.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
+unstructured_ingest-0.5.25.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
+unstructured_ingest-0.5.25.dist-info/RECORD,,