unstructured-ingest 0.5.23__py3-none-any.whl → 0.5.25__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- test/integration/connectors/test_vectara.py +67 -55
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py +62 -18
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +10 -1
- unstructured_ingest/v2/processes/connectors/sql/sql.py +6 -3
- {unstructured_ingest-0.5.23.dist-info → unstructured_ingest-0.5.25.dist-info}/METADATA +18 -18
- {unstructured_ingest-0.5.23.dist-info → unstructured_ingest-0.5.25.dist-info}/RECORD +11 -11
- {unstructured_ingest-0.5.23.dist-info → unstructured_ingest-0.5.25.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.5.23.dist-info → unstructured_ingest-0.5.25.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.5.23.dist-info → unstructured_ingest-0.5.25.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.5.23.dist-info → unstructured_ingest-0.5.25.dist-info}/top_level.txt +0 -0
test/integration/connectors/test_vectara.py CHANGED
@@ -1,6 +1,7 @@
 import json
 import os
 import time
+from functools import lru_cache
 from pathlib import Path
 from typing import Generator
 from uuid import uuid4
@@ -25,24 +26,29 @@ from unstructured_ingest.v2.processes.connectors.vectara import (
 )
 
 
-def validate_upload(
+def validate_upload(document: dict, expected_data: dict):
+    logger.info(f"validating document: {document}")
     element_id = expected_data["element_id"]
     expected_text = expected_data["text"]
     filename = expected_data["metadata"]["filename"]
     filetype = expected_data["metadata"]["filetype"]
     page_number = expected_data["metadata"]["page_number"]
 
-
-
-    assert
-
-    assert
-
-    assert
-    assert
+    assert document is not None
+    speech_parts = document["parts"]
+    assert speech_parts
+    first_part = speech_parts[0]
+    assert first_part["text"] == expected_text
+    part_metadata = first_part["metadata"]
+    assert part_metadata
+    assert part_metadata["element_id"] == element_id
+    assert part_metadata["filename"] == filename
+    assert part_metadata["filetype"] == filetype
+    assert part_metadata["page_number"] == page_number
 
 
 @requires_env("VECTARA_OAUTH_CLIENT_ID", "VECTARA_OAUTH_SECRET", "VECTARA_CUSTOMER_ID")
+@lru_cache()
 def _get_jwt_token():
     """Connect to the server and get a JWT token."""
     customer_id = os.environ["VECTARA_CUSTOMER_ID"]
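The `@lru_cache()` decorator added above memoizes the zero-argument `_get_jwt_token()`, so every helper in the test reuses the first JWT instead of re-authenticating on each API call. A minimal sketch of the pattern, with an illustrative `get_token` stand-in rather than the package's real function:

from functools import lru_cache

calls = {"count": 0}

@lru_cache()
def get_token() -> str:
    # The expensive fetch runs once; later calls return the cached value.
    calls["count"] += 1
    return f"token-{calls['count']}"

assert get_token() == get_token()
assert calls["count"] == 1  # only one real fetch happened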
@@ -65,23 +71,12 @@ def _get_jwt_token():
     return response_json.get("access_token")
 
 
-def query_data(corpus_key: str, element_id: str) -> dict:
+def list_documents(corpus_key: str) -> list[str]:
 
-    url = f"https://api.vectara.io/v2/corpora/{corpus_key}/
+    url = f"https://api.vectara.io/v2/corpora/{corpus_key}/documents"
 
     # the query below requires the corpus to have filter attributes for element_id
 
-    data = json.dumps(
-        {
-            "query": "string",
-            "search": {
-                "metadata_filter": f"part.element_id = '{element_id}'",
-                "lexical_interpolation": 1,
-                "limit": 10,
-            },
-        }
-    )
-
     jwt_token = _get_jwt_token()
     headers = {
         "Content-Type": "application/json",
@@ -90,11 +85,26 @@ def query_data(corpus_key: str, element_id: str) -> dict:
         "X-source": "unstructured",
     }
 
-    response = requests.
+    response = requests.get(url, headers=headers)
     response.raise_for_status()
     response_json = response.json()
+    documents = response_json.get("documents", [])
+    return documents
+
 
-
+def fetch_document(corpus_key: str, documents_id: str) -> dict:
+    url = f"https://api.vectara.io/v2/corpora/{corpus_key}/documents/{documents_id}"
+    jwt_token = _get_jwt_token()
+    headers = {
+        "Content-Type": "application/json",
+        "Accept": "application/json",
+        "Authorization": f"Bearer {jwt_token}",
+        "X-source": "unstructured",
+    }
+
+    response = requests.get(url, headers=headers)
+    response.raise_for_status()
+    return response.json()
 
 
 def create_corpora(corpus_key: str, corpus_name: str) -> None:
@@ -148,8 +158,8 @@ def delete_corpora(corpus_key: str) -> None:
     response.raise_for_status()
 
 
-def list_corpora() -> list:
-    url = "https://api.vectara.io/v2/corpora"
+def get_metadata(corpus_key: str):
+    url = f"https://api.vectara.io/v2/corpora/{corpus_key}"
     jwt_token = _get_jwt_token()
     headers = {
         "Content-Type": "application/json",
@@ -159,35 +169,28 @@ def list_corpora() -> list:
     }
     response = requests.get(url, headers=headers)
     response.raise_for_status()
-
-    if response_json.get("corpora"):
-        return [item["key"] for item in response_json.get("corpora")]
-    else:
-        return []
+    return response.json()
 
 
 def wait_for_ready(corpus_key: str, timeout=60, interval=2) -> None:
-    def is_ready_status():
-        corpora_list = list_corpora()
-        return corpus_key in corpora_list
-
     start = time.time()
-
-
-
-
-
-
+    while time.time() - start < timeout:
+        try:
+            get_metadata(corpus_key)
+            return
+        except requests.HTTPError:
+            time.sleep(interval)
+    raise TimeoutError("time out waiting for corpus to be ready")
 
 
 def wait_for_delete(corpus_key: str, timeout=60, interval=2) -> None:
     start = time.time()
     while time.time() - start < timeout:
-
-
+        try:
+            get_metadata(corpus_key)
+            time.sleep(interval)
+        except requests.HTTPError:
             return
-        time.sleep(interval)
-
     raise TimeoutError("time out waiting for corpus to delete")
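The rewritten wait helpers share one idea: `get_metadata()` raises `requests.HTTPError` while the corpus does not exist, so readiness is "poll until the call succeeds" and deletion is "poll until the call fails". A generic sketch of that poll-until-condition pattern (the `poll_until` helper and `fake_ready` stub are illustrative, not part of the package):

import time
from typing import Callable

def poll_until(condition: Callable[[], bool], timeout: float = 60, interval: float = 2) -> None:
    # Re-check the condition until it holds or the timeout elapses.
    start = time.time()
    while time.time() - start < timeout:
        if condition():
            return
        time.sleep(interval)
    raise TimeoutError("condition not met within timeout")

state = {"checks": 0}

def fake_ready() -> bool:
    state["checks"] += 1
    return state["checks"] >= 3

poll_until(fake_ready, timeout=5, interval=0.01)
print(f"ready after {state['checks']} checks")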
@@ -210,11 +213,23 @@ def corpora_util() -> Generator[str, None, None]:
     wait_for_delete(corpus_key=corpus_key)
 
 
+def wait_for_doc_meta(corpus_key: str, timeout=60, interval=1) -> list[str]:
+    start = time.time()
+    while time.time() - start < timeout:
+        all_document_meta = list_documents(corpus_key)
+        if not all_document_meta:
+            time.sleep(interval)
+            continue
+        else:
+            return all_document_meta
+    raise TimeoutError("time out waiting for document to be ready")
+
+
 @pytest.mark.asyncio
 @pytest.mark.tags(VECTARA_CONNECTOR_TYPE, DESTINATION_TAG, "vectara", NOSQL_TAG)
 @requires_env("VECTARA_OAUTH_CLIENT_ID", "VECTARA_OAUTH_SECRET", "VECTARA_CUSTOMER_ID")
 async def test_vectara_destination(
-    upload_file: Path, tmp_path: Path, corpora_util: str, retries=30, interval=
+    upload_file: Path, tmp_path: Path, corpora_util: str, retries=30, interval=1
 ):
     corpus_key = corpora_util
     connection_kwargs = {
@@ -231,7 +246,7 @@ async def test_vectara_destination(
         identifier="mock-file-data",
     )
 
-    stager_config = VectaraUploadStagerConfig(
+    stager_config = VectaraUploadStagerConfig()
     stager = VectaraUploadStager(upload_stager_config=stager_config)
     new_upload_file = stager.run(
         elements_filepath=upload_file,
@@ -260,11 +275,8 @@ async def test_vectara_destination(
     elements = json.load(upload_fp)
     first_element = elements[0]
 
-
-
-
-
-
-            break
-
-    validate_upload(response=response, expected_data=first_element)
+    all_document_meta = wait_for_doc_meta(corpus_key)
+    assert len(all_document_meta) == 1
+    document_meta = all_document_meta[0]
+    document = fetch_document(corpus_key=corpus_key, documents_id=document_meta["id"])
+    validate_upload(document=document, expected_data=first_element)
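The test's validation now runs against a fetched document rather than a query response: it polls `list_documents` until metadata appears, fetches the single document, and checks the first part's text and metadata against the first staged element. A sketch of the document shape being asserted, with invented sample values:

# Invented payloads mirroring the Vectara v2 document shape used above.
expected = {
    "element_id": "abc123",
    "text": "Hello world",
    "metadata": {"filename": "fake-memo.pdf", "filetype": "application/pdf", "page_number": 1},
}
document = {
    "id": "doc-1",
    "parts": [
        {
            "text": "Hello world",
            "metadata": {
                "element_id": "abc123",
                "filename": "fake-memo.pdf",
                "filetype": "application/pdf",
                "page_number": 1,
            },
        }
    ],
}
first_part = document["parts"][0]
assert first_part["text"] == expected["text"]
assert first_part["metadata"]["element_id"] == expected["element_id"]
assert first_part["metadata"]["page_number"] == expected["metadata"]["page_number"]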
unstructured_ingest/__version__.py CHANGED
@@ -1 +1 @@
-__version__ = "0.5.23"  # pragma: no cover
+__version__ = "0.5.25"  # pragma: no cover
unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py CHANGED
@@ -1,13 +1,13 @@
-import json
 import os
+import tempfile
 from contextlib import contextmanager
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any, Generator
+from typing import TYPE_CHECKING, Any, Generator, Optional
 
 from pydantic import Field
 
-from unstructured_ingest.utils.data_prep import write_data
+from unstructured_ingest.utils.data_prep import get_data_df, write_data
 from unstructured_ingest.v2.interfaces import FileData, Uploader, UploaderConfig
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
@@ -22,6 +22,9 @@ from unstructured_ingest.v2.processes.connectors.sql.databricks_delta_tables imp
 
 CONNECTOR_TYPE = "databricks_volume_delta_tables"
 
+if TYPE_CHECKING:
+    from pandas import DataFrame
+
 
 class DatabricksVolumeDeltaTableUploaderConfig(UploaderConfig, DatabricksPathMixin):
     database: str = Field(description="Database name", default="default")
@@ -30,10 +33,12 @@ class DatabricksVolumeDeltaTableUploaderConfig(UploaderConfig, DatabricksPathMix
 
 @dataclass
 class DatabricksVolumeDeltaTableStager(DatabricksDeltaTablesUploadStager):
-    def write_output(self, output_path: Path, data: list[dict]) ->
+    def write_output(self, output_path: Path, data: list[dict]) -> Path:
         # To avoid new line issues when migrating from volumes into delta tables, omit indenting
         # and always write it as a json file
-
+        final_output_path = output_path.with_suffix(".json")
+        write_data(path=final_output_path, data=data, indent=None)
+        return final_output_path
 
 
 @dataclass
@@ -41,6 +46,7 @@ class DatabricksVolumeDeltaTableUploader(Uploader):
     connection_config: DatabricksDeltaTablesConnectionConfig
     upload_config: DatabricksVolumeDeltaTableUploaderConfig
     connector_type: str = CONNECTOR_TYPE
+    _columns: Optional[dict[str, str]] = None
 
     def precheck(self) -> None:
         with self.connection_config.get_cursor() as cursor:
@@ -84,20 +90,58 @@ class DatabricksVolumeDeltaTableUploader(Uploader):
             cursor.execute(f"USE DATABASE {self.upload_config.database}")
             yield cursor
 
-    def
-
-
-
-
-
-
+    def get_table_columns(self) -> dict[str, str]:
+        if self._columns is None:
+            with self.get_cursor() as cursor:
+                cursor.execute(f"SELECT * from {self.upload_config.table_name} LIMIT 1")
+                self._columns = {desc[0]: desc[1] for desc in cursor.description}
+        return self._columns
+
+    def _fit_to_schema(self, df: "DataFrame", add_missing_columns: bool = True) -> "DataFrame":
+        import pandas as pd
+
+        table_columns = self.get_table_columns()
+        columns = set(df.columns)
+        schema_fields = set(table_columns.keys())
+        columns_to_drop = columns - schema_fields
+        missing_columns = schema_fields - columns
+
+        if columns_to_drop:
+            logger.info(
+                "Following columns will be dropped to match the table's schema: "
+                f"{', '.join(columns_to_drop)}"
+            )
+        if missing_columns and add_missing_columns:
+            logger.info(
+                "Following null filled columns will be added to match the table's schema:"
+                f" {', '.join(missing_columns)} "
             )
-
-
-
-
-
-
+
+        df = df.drop(columns=columns_to_drop)
+
+        if add_missing_columns:
+            for column in missing_columns:
+                df[column] = pd.Series()
+        return df
+
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        with tempfile.TemporaryDirectory() as temp_dir:
+            df = get_data_df()
+            df = self._fit_to_schema(df=df)
+            temp_path = Path(temp_dir) / path.name
+            df.to_json(temp_path, orient="records", lines=False)
+            with self.get_cursor(staging_allowed_local_path=temp_dir) as cursor:
+                catalog_path = self.get_output_path(file_data=file_data)
+                logger.debug(f"uploading {path.as_posix()} to {catalog_path}")
+                cursor.execute(f"PUT '{temp_path.as_posix()}' INTO '{catalog_path}' OVERWRITE")
+                logger.debug(
+                    f"migrating content from {catalog_path} to "
+                    f"table {self.upload_config.table_name}"
+                )
+                columns = list(df.columns)
+                column_str = ", ".join(columns)
+                sql_statment = f"INSERT INTO `{self.upload_config.table_name}` ({column_str}) SELECT {column_str} FROM json.`{catalog_path}`"  # noqa: E501
+                cursor.execute(sql_statment)
 
 
 databricks_volumes_delta_tables_destination_entry = DestinationRegistryEntry(
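The new `_fit_to_schema` step reshapes the staged DataFrame to the destination table before the volume upload: columns the table does not know are dropped, and columns the table expects but the data lacks are added as null-filled series, so the later `INSERT INTO ... SELECT` has an exact column match. A self-contained sketch of that behavior, with a made-up schema and row:

import pandas as pd

# Hypothetical table schema (stand-in for get_table_columns()) and staged row.
table_columns = {"id": "string", "text": "string", "page_number": "int"}
df = pd.DataFrame([{"id": "e1", "text": "hello", "languages": "eng"}])

schema_fields = set(table_columns)
columns_to_drop = set(df.columns) - schema_fields  # {"languages"}
missing_columns = schema_fields - set(df.columns)  # {"page_number"}

df = df.drop(columns=columns_to_drop)
for column in missing_columns:
    df[column] = pd.Series()  # null-filled so the insert matches the table

print(sorted(df.columns))  # ['id', 'page_number', 'text']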
unstructured_ingest/v2/processes/connectors/fsspec/s3.py CHANGED
@@ -33,6 +33,9 @@ from unstructured_ingest.v2.processes.utils.blob_storage import (
 
 CONNECTOR_TYPE = "s3"
 
+# https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-keys.html#object-key-guidelines-avoid-characters
+CHARACTERS_TO_AVOID = ["\\", "{", "^", "}", "%", "`", "]", '"', ">", "[", "~", "<", "#", "|"]
+
 if TYPE_CHECKING:
     from s3fs import S3FileSystem
 
@@ -91,7 +94,7 @@ class S3ConnectionConfig(FsspecConnectionConfig):
         if isinstance(e, PermissionError):
             return UserAuthError(e)
         if isinstance(e, FileNotFoundError):
-            return UserError(e)
+            return UserError(f"File not found: {e}")
         if cause := getattr(e, "__cause__", None):
             error_response = cause.response
             error_meta = error_response["ResponseMetadata"]
@@ -140,6 +143,12 @@ class S3Indexer(FsspecIndexer):
         }
         if metadata:
             record_locator["metadata"] = metadata
+        issue_characters = [char for char in CHARACTERS_TO_AVOID if char in path]
+        if issue_characters:
+            logger.warning(
+                f"File path {path} contains characters "
+                f"that can cause issues with S3: {issue_characters}"
+            )
         return FileDataSourceMetadata(
             date_created=date_created,
             date_modified=date_modified,
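The indexer change only warns; it does not rename keys. The check itself is a simple membership scan over the AWS "characters to avoid" list. A runnable sketch (the `problem_characters` helper name is illustrative):

# Characters AWS recommends avoiding in object keys, per the docs URL above.
CHARACTERS_TO_AVOID = ["\\", "{", "^", "}", "%", "`", "]", '"', ">", "[", "~", "<", "#", "|"]

def problem_characters(path: str) -> list[str]:
    return [char for char in CHARACTERS_TO_AVOID if char in path]

print(problem_characters("bucket/reports/q1#final|v2.pdf"))  # ['#', '|']
print(problem_characters("bucket/reports/q1-final-v2.pdf"))  # []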
unstructured_ingest/v2/processes/connectors/sql/sql.py CHANGED
@@ -251,8 +251,9 @@ class SQLUploadStager(UploadStager):
             df[column] = df[column].apply(str)
         return df
 
-    def write_output(self, output_path: Path, data: list[dict]) ->
+    def write_output(self, output_path: Path, data: list[dict]) -> Path:
         write_data(path=output_path, data=data)
+        return output_path
 
     def run(
         self,
@@ -278,8 +279,10 @@ class SQLUploadStager(UploadStager):
         output_filename = f"{Path(output_filename).stem}{output_filename_suffix}"
         output_path = self.get_output_path(output_filename=output_filename, output_dir=output_dir)
 
-        self.write_output(
-
+        final_output_path = self.write_output(
+            output_path=output_path, data=df.to_dict(orient="records")
+        )
+        return final_output_path
 
 
 class SQLUploaderConfig(UploaderConfig):
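Both stager changes follow the same contract tweak: `write_output` now returns the path it actually wrote, and `run` propagates it, which matters when a stager normalizes the suffix (as the Databricks stager above does with `.json`). A sketch of the contract under that assumption, using an illustrative stand-alone function:

import json
import tempfile
from pathlib import Path

def write_output(output_path: Path, data: list[dict]) -> Path:
    # The written location may differ from the requested one (suffix swap).
    final_output_path = output_path.with_suffix(".json")
    final_output_path.write_text(json.dumps(data))
    return final_output_path

with tempfile.TemporaryDirectory() as tmp:
    requested = Path(tmp) / "batch.ndjson"
    actual = write_output(requested, [{"id": 1}])
    assert actual.suffix == ".json"  # caller now works with the real path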
{unstructured_ingest-0.5.23.dist-info → unstructured_ingest-0.5.25.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: unstructured-ingest
-Version: 0.5.23
+Version: 0.5.25
 Summary: A library that prepares raw documents for downstream ML tasks.
 Home-page: https://github.com/Unstructured-IO/unstructured-ingest
 Author: Unstructured Technologies
@@ -22,11 +22,11 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.9.0,<3.14
 Description-Content-Type: text/markdown
 License-File: LICENSE.md
-Requires-Dist: dataclasses_json
 Requires-Dist: opentelemetry-sdk
+Requires-Dist: python-dateutil
 Requires-Dist: click
+Requires-Dist: dataclasses_json
 Requires-Dist: tqdm
-Requires-Dist: python-dateutil
 Requires-Dist: pydantic>=2.7
 Requires-Dist: numpy
 Requires-Dist: pandas
@@ -112,8 +112,8 @@ Requires-Dist: azure-search-documents; extra == "azure-ai-search"
 Requires-Dist: numpy; extra == "azure-ai-search"
 Requires-Dist: pandas; extra == "azure-ai-search"
 Provides-Extra: biomed
-Requires-Dist: bs4; extra == "biomed"
 Requires-Dist: requests; extra == "biomed"
+Requires-Dist: bs4; extra == "biomed"
 Requires-Dist: numpy; extra == "biomed"
 Requires-Dist: pandas; extra == "biomed"
 Provides-Extra: box
@@ -161,14 +161,14 @@ Requires-Dist: elasticsearch[async]; extra == "elasticsearch"
 Requires-Dist: numpy; extra == "elasticsearch"
 Requires-Dist: pandas; extra == "elasticsearch"
 Provides-Extra: gcs
+Requires-Dist: bs4; extra == "gcs"
 Requires-Dist: gcsfs; extra == "gcs"
 Requires-Dist: fsspec; extra == "gcs"
-Requires-Dist: bs4; extra == "gcs"
 Requires-Dist: numpy; extra == "gcs"
 Requires-Dist: pandas; extra == "gcs"
 Provides-Extra: github
-Requires-Dist: pygithub>1.58.0; extra == "github"
 Requires-Dist: requests; extra == "github"
+Requires-Dist: pygithub>1.58.0; extra == "github"
 Requires-Dist: numpy; extra == "github"
 Requires-Dist: pandas; extra == "github"
 Provides-Extra: gitlab
@@ -180,14 +180,14 @@ Requires-Dist: google-api-python-client; extra == "google-drive"
 Requires-Dist: numpy; extra == "google-drive"
 Requires-Dist: pandas; extra == "google-drive"
 Provides-Extra: hubspot
-Requires-Dist: urllib3; extra == "hubspot"
 Requires-Dist: hubspot-api-client; extra == "hubspot"
+Requires-Dist: urllib3; extra == "hubspot"
 Requires-Dist: numpy; extra == "hubspot"
 Requires-Dist: pandas; extra == "hubspot"
 Provides-Extra: ibm-watsonx-s3
-Requires-Dist: pyarrow; extra == "ibm-watsonx-s3"
 Requires-Dist: tenacity; extra == "ibm-watsonx-s3"
 Requires-Dist: pyiceberg; extra == "ibm-watsonx-s3"
+Requires-Dist: pyarrow; extra == "ibm-watsonx-s3"
 Requires-Dist: httpx; extra == "ibm-watsonx-s3"
 Requires-Dist: numpy; extra == "ibm-watsonx-s3"
 Requires-Dist: pandas; extra == "ibm-watsonx-s3"
@@ -216,21 +216,21 @@ Requires-Dist: pymongo; extra == "mongodb"
 Requires-Dist: numpy; extra == "mongodb"
 Requires-Dist: pandas; extra == "mongodb"
 Provides-Extra: neo4j
-Requires-Dist: neo4j-rust-ext; extra == "neo4j"
 Requires-Dist: networkx; extra == "neo4j"
 Requires-Dist: cymple; extra == "neo4j"
+Requires-Dist: neo4j-rust-ext; extra == "neo4j"
 Requires-Dist: numpy; extra == "neo4j"
 Requires-Dist: pandas; extra == "neo4j"
 Provides-Extra: notion
+Requires-Dist: notion-client; extra == "notion"
 Requires-Dist: backoff; extra == "notion"
 Requires-Dist: htmlBuilder; extra == "notion"
-Requires-Dist: notion-client; extra == "notion"
 Requires-Dist: httpx; extra == "notion"
 Requires-Dist: numpy; extra == "notion"
 Requires-Dist: pandas; extra == "notion"
 Provides-Extra: onedrive
-Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
 Requires-Dist: msal; extra == "onedrive"
+Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
 Requires-Dist: bs4; extra == "onedrive"
 Requires-Dist: numpy; extra == "onedrive"
 Requires-Dist: pandas; extra == "onedrive"
@@ -239,8 +239,8 @@ Requires-Dist: opensearch-py; extra == "opensearch"
 Requires-Dist: numpy; extra == "opensearch"
 Requires-Dist: pandas; extra == "opensearch"
 Provides-Extra: outlook
-Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
 Requires-Dist: msal; extra == "outlook"
+Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
 Requires-Dist: numpy; extra == "outlook"
 Requires-Dist: pandas; extra == "outlook"
 Provides-Extra: pinecone
@@ -264,13 +264,13 @@ Requires-Dist: redis; extra == "redis"
 Requires-Dist: numpy; extra == "redis"
 Requires-Dist: pandas; extra == "redis"
 Provides-Extra: s3
-Requires-Dist: s3fs; extra == "s3"
 Requires-Dist: fsspec; extra == "s3"
+Requires-Dist: s3fs; extra == "s3"
 Requires-Dist: numpy; extra == "s3"
 Requires-Dist: pandas; extra == "s3"
 Provides-Extra: sharepoint
-Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
 Requires-Dist: msal; extra == "sharepoint"
+Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
 Requires-Dist: numpy; extra == "sharepoint"
 Requires-Dist: pandas; extra == "sharepoint"
 Provides-Extra: salesforce
@@ -278,8 +278,8 @@ Requires-Dist: simple-salesforce; extra == "salesforce"
 Requires-Dist: numpy; extra == "salesforce"
 Requires-Dist: pandas; extra == "salesforce"
 Provides-Extra: sftp
-Requires-Dist: fsspec; extra == "sftp"
 Requires-Dist: paramiko; extra == "sftp"
+Requires-Dist: fsspec; extra == "sftp"
 Requires-Dist: numpy; extra == "sftp"
 Requires-Dist: pandas; extra == "sftp"
 Provides-Extra: slack
@@ -312,21 +312,21 @@ Requires-Dist: singlestoredb; extra == "singlestore"
 Requires-Dist: numpy; extra == "singlestore"
 Requires-Dist: pandas; extra == "singlestore"
 Provides-Extra: vectara
-Requires-Dist: aiofiles; extra == "vectara"
 Requires-Dist: requests; extra == "vectara"
+Requires-Dist: aiofiles; extra == "vectara"
 Requires-Dist: httpx; extra == "vectara"
 Requires-Dist: numpy; extra == "vectara"
 Requires-Dist: pandas; extra == "vectara"
 Provides-Extra: vastdb
-Requires-Dist: vastdb; extra == "vastdb"
 Requires-Dist: pyarrow; extra == "vastdb"
 Requires-Dist: ibis; extra == "vastdb"
+Requires-Dist: vastdb; extra == "vastdb"
 Requires-Dist: numpy; extra == "vastdb"
 Requires-Dist: pandas; extra == "vastdb"
 Provides-Extra: zendesk
-Requires-Dist: bs4; extra == "zendesk"
 Requires-Dist: aiofiles; extra == "zendesk"
 Requires-Dist: httpx; extra == "zendesk"
+Requires-Dist: bs4; extra == "zendesk"
 Requires-Dist: numpy; extra == "zendesk"
 Requires-Dist: pandas; extra == "zendesk"
 Provides-Extra: embed-huggingface
{unstructured_ingest-0.5.23.dist-info → unstructured_ingest-0.5.25.dist-info}/RECORD RENAMED
@@ -24,7 +24,7 @@ test/integration/connectors/test_qdrant.py,sha256=Yme3ZZ5zIbaZ-yYLUqN2oy0hsrcAfv
 test/integration/connectors/test_redis.py,sha256=YXWWw4m40ZmLrf3eJ85hhT7WSJnri_GY1ieixIicYlI,5102
 test/integration/connectors/test_s3.py,sha256=E1dypeag_E3OIfpQWIz3jb7ctRHRD63UtyTrzyvJzpc,7473
 test/integration/connectors/test_sharepoint.py,sha256=weGby5YD6se7R7KLEq96hxUZYPzwoqZqXXTPhtQWZsQ,7646
-test/integration/connectors/test_vectara.py,sha256=
+test/integration/connectors/test_vectara.py,sha256=thM9vIWn7vcH1xjQK3owuEJMr65Z7L4j7NICsMpsMv8,9290
 test/integration/connectors/test_zendesk.py,sha256=nMBVNlEQr1uvmI1fzUC1bmoa2doXnYp5n4bMJS2FN-o,3727
 test/integration/connectors/databricks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/integration/connectors/databricks/test_volumes_native.py,sha256=KqiapQAV0s_Zv0CO8BwYoiCk30dwrSZzuigUWNRIem0,9559
@@ -113,7 +113,7 @@ test/unit/v2/partitioners/test_partitioner.py,sha256=iIYg7IpftV3LusoO4H8tr1IHY1U
 test/unit/v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/unit/v2/utils/data_generator.py,sha256=UoYVNjG4S4wlaA9gceQ82HIpF9_6I1UTHD1_GrQBHp0,973
 unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
-unstructured_ingest/__version__.py,sha256=
+unstructured_ingest/__version__.py,sha256=A9I2h_N6BTgmKRhQ1HbPOAJuwdOFgMb_aDmK1czvHyc,43
 unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
 unstructured_ingest/interfaces.py,sha256=7DOnDpGvUNlCoFR7UPRGmOarqH5sFtuUOO5vf8X3oTM,31489
 unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
@@ -462,7 +462,7 @@ unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py,sha256=h6q
 unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py,sha256=gjICJJwhDHBLt_L-LrMlvJ3DL1DYtwFpyMLb_zYvOIg,3755
 unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py,sha256=Uss3XPPaq1AsqJOEy4RJgBJw2-bTjrXH2PgtVNYd2w0,3006
 unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py,sha256=g1qYnIrML4TjN7rmC0MGrD5JzAprb6SymBHlEdOumz0,3113
-unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py,sha256=
+unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py,sha256=0kEtIVQSD6RhLAqpc-0BNFQazS7lnsnWalaN3Mdn97g,6805
 unstructured_ingest/v2/processes/connectors/duckdb/__init__.py,sha256=5sVvJCWhU-YkjHIwk4W6BZCanFYK5W4xTpWtQ8xzeB4,561
 unstructured_ingest/v2/processes/connectors/duckdb/base.py,sha256=o3J81DnSwt3lmAh19jXVPAYRZLJ3VyGhaEVO2SIjksQ,2926
 unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py,sha256=NIo2CCiPiuTFotNC891Mbelzg01knItryYGUtOM96xg,4393
@@ -476,7 +476,7 @@ unstructured_ingest/v2/processes/connectors/fsspec/box.py,sha256=aJCtCHRBAauLwdW
 unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py,sha256=epf2okPKqF4R-u_zxEYDJK4g0qhFqf1ejuz8JSJaNyU,8360
 unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py,sha256=0Z--cPh17W_j4jQkSe2BeeD_j0Tt147Z01gqqF58Z9A,14421
 unstructured_ingest/v2/processes/connectors/fsspec/gcs.py,sha256=nlDSKHs8mbXCY5Bok1hGH8UZJCdtnyhZWiRwn180ohk,7177
-unstructured_ingest/v2/processes/connectors/fsspec/s3.py,sha256=
+unstructured_ingest/v2/processes/connectors/fsspec/s3.py,sha256=MtD41jZQXB-fqNzW3Whqq6ydQYDUK6Jub7sSPvgLErw,7130
 unstructured_ingest/v2/processes/connectors/fsspec/sftp.py,sha256=ZimcBJL-Har7GOESb9blzDb8pzPZcmh16YvvHYxYkJM,6373
 unstructured_ingest/v2/processes/connectors/fsspec/utils.py,sha256=jec_Qfe2hbfahBuY-u8FnvHuv933AI5HwPFjOL3kEEY,456
 unstructured_ingest/v2/processes/connectors/ibm_watsonx/__init__.py,sha256=EMG7lyThrYO8W7y3DIxGgNNXtbpdeAdvLd0m4tpO-Io,377
@@ -568,7 +568,7 @@ unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py,sha25
 unstructured_ingest/v2/processes/connectors/sql/postgres.py,sha256=BATfX1PQGT2kl8jAbdNKXTojYKJxh3pJV9-h3OBnHGo,5124
 unstructured_ingest/v2/processes/connectors/sql/singlestore.py,sha256=am2d87kDkpTTB0VbPSX3ce9o6oM9KUQu5y9T_p1kgJw,5711
 unstructured_ingest/v2/processes/connectors/sql/snowflake.py,sha256=r2qgoEF3bUugzgSr3hMJyIm8DKmxsO53ZHXJSNxOsvE,9379
-unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=
+unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=CbysCnBBHtmYkqXiaoZSazI1ombNltrsqFrY-gQzm4U,15683
 unstructured_ingest/v2/processes/connectors/sql/sqlite.py,sha256=6RoBUxMbeuhduvTFlBKMgEH1NKJg7doQjXF_R5cUuX0,5319
 unstructured_ingest/v2/processes/connectors/sql/vastdb.py,sha256=wklJ8p3eMb81FTjS6ukPoILuWN0_KQBfuYGXfE0XrqY,9644
 unstructured_ingest/v2/processes/connectors/weaviate/__init__.py,sha256=NMiwnVWan69KnzVELvaqX34tMhCytIa-C8EDsXVKsEo,856
@@ -583,9 +583,9 @@ unstructured_ingest/v2/processes/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JC
 unstructured_ingest/v2/processes/utils/blob_storage.py,sha256=EWvK4HRYubr9i1UyMhv5cU9u0UzVkCDC_BIm4Uxab7Y,964
 unstructured_ingest/v2/types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unstructured_ingest/v2/types/file_data.py,sha256=kowOhvYy0q_-khX3IuR111AfjkdQezEfxjzK6QDH7oA,3836
-unstructured_ingest-0.5.23.dist-info/LICENSE.md,sha256=
-unstructured_ingest-0.5.23.dist-info/METADATA,sha256=
-unstructured_ingest-0.5.23.dist-info/WHEEL,sha256=
-unstructured_ingest-0.5.23.dist-info/entry_points.txt,sha256=
-unstructured_ingest-0.5.23.dist-info/top_level.txt,sha256=
-unstructured_ingest-0.5.23.dist-info/RECORD,,
+unstructured_ingest-0.5.25.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
+unstructured_ingest-0.5.25.dist-info/METADATA,sha256=Z_PvUmam-C56UwoY92VhbvUd-fubXBHevjSMHKVgPx4,14999
+unstructured_ingest-0.5.25.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+unstructured_ingest-0.5.25.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
+unstructured_ingest-0.5.25.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
+unstructured_ingest-0.5.25.dist-info/RECORD,,
{unstructured_ingest-0.5.23.dist-info → unstructured_ingest-0.5.25.dist-info}/LICENSE.md RENAMED
File without changes
{unstructured_ingest-0.5.23.dist-info → unstructured_ingest-0.5.25.dist-info}/WHEEL RENAMED
File without changes
{unstructured_ingest-0.5.23.dist-info → unstructured_ingest-0.5.25.dist-info}/entry_points.txt RENAMED
File without changes
{unstructured_ingest-0.5.23.dist-info → unstructured_ingest-0.5.25.dist-info}/top_level.txt RENAMED
File without changes
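Each RECORD row above is `path,sha256=<digest>,<size>`, where the digest is an unpadded urlsafe-base64 SHA-256 of the file bytes; the 43-byte size on the new `__version__.py` entry matches the one-line file in this diff. A sketch of recomputing such an entry to verify an extracted wheel (the file path is just an example):

import base64
import hashlib
from pathlib import Path

def record_entry(path: Path) -> str:
    # Unpadded urlsafe-base64 SHA-256 plus byte size, as used in RECORD.
    data = path.read_bytes()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=")
    return f"{path},sha256={digest.decode()},{len(data)}"

print(record_entry(Path("unstructured_ingest/__version__.py")))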