unstructured-ingest 0.5.21__py3-none-any.whl → 0.5.25__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- test/integration/connectors/test_vectara.py +67 -55
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/v2/interfaces/__init__.py +8 -1
- unstructured_ingest/v2/interfaces/file_data.py +13 -116
- unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py +62 -18
- unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +7 -1
- unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +7 -1
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +10 -1
- unstructured_ingest/v2/processes/connectors/sql/sql.py +6 -3
- unstructured_ingest/v2/types/__init__.py +0 -0
- unstructured_ingest/v2/types/file_data.py +116 -0
- {unstructured_ingest-0.5.21.dist-info → unstructured_ingest-0.5.25.dist-info}/METADATA +26 -26
- {unstructured_ingest-0.5.21.dist-info → unstructured_ingest-0.5.25.dist-info}/RECORD +17 -15
- {unstructured_ingest-0.5.21.dist-info → unstructured_ingest-0.5.25.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.5.21.dist-info → unstructured_ingest-0.5.25.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.5.21.dist-info → unstructured_ingest-0.5.25.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.5.21.dist-info → unstructured_ingest-0.5.25.dist-info}/top_level.txt +0 -0
test/integration/connectors/test_vectara.py CHANGED
@@ -1,6 +1,7 @@
 import json
 import os
 import time
+from functools import lru_cache
 from pathlib import Path
 from typing import Generator
 from uuid import uuid4
@@ -25,24 +26,29 @@ from unstructured_ingest.v2.processes.connectors.vectara import (
 )


-def validate_upload(
+def validate_upload(document: dict, expected_data: dict):
+    logger.info(f"validating document: {document}")
     element_id = expected_data["element_id"]
     expected_text = expected_data["text"]
     filename = expected_data["metadata"]["filename"]
     filetype = expected_data["metadata"]["filetype"]
     page_number = expected_data["metadata"]["page_number"]

-
-
-    assert
-
-    assert
-
-    assert
-    assert
+    assert document is not None
+    speech_parts = document["parts"]
+    assert speech_parts
+    first_part = speech_parts[0]
+    assert first_part["text"] == expected_text
+    part_metadata = first_part["metadata"]
+    assert part_metadata
+    assert part_metadata["element_id"] == element_id
+    assert part_metadata["filename"] == filename
+    assert part_metadata["filetype"] == filetype
+    assert part_metadata["page_number"] == page_number


 @requires_env("VECTARA_OAUTH_CLIENT_ID", "VECTARA_OAUTH_SECRET", "VECTARA_CUSTOMER_ID")
+@lru_cache()
 def _get_jwt_token():
     """Connect to the server and get a JWT token."""
     customer_id = os.environ["VECTARA_CUSTOMER_ID"]
@@ -65,23 +71,12 @@ def _get_jwt_token():
     return response_json.get("access_token")


-def query_data(corpus_key: str, element_id: str) -> dict:
+def list_documents(corpus_key: str) -> list[str]:

-    url = f"https://api.vectara.io/v2/corpora/{corpus_key}/
+    url = f"https://api.vectara.io/v2/corpora/{corpus_key}/documents"

     # the query below requires the corpus to have filter attributes for element_id

-    data = json.dumps(
-        {
-            "query": "string",
-            "search": {
-                "metadata_filter": f"part.element_id = '{element_id}'",
-                "lexical_interpolation": 1,
-                "limit": 10,
-            },
-        }
-    )
-
     jwt_token = _get_jwt_token()
     headers = {
         "Content-Type": "application/json",
@@ -90,11 +85,26 @@ def query_data(corpus_key: str, element_id: str) -> dict:
         "X-source": "unstructured",
     }

-    response = requests.
+    response = requests.get(url, headers=headers)
     response.raise_for_status()
     response_json = response.json()
+    documents = response_json.get("documents", [])
+    return documents
+

-
+def fetch_document(corpus_key: str, documents_id: str) -> dict:
+    url = f"https://api.vectara.io/v2/corpora/{corpus_key}/documents/{documents_id}"
+    jwt_token = _get_jwt_token()
+    headers = {
+        "Content-Type": "application/json",
+        "Accept": "application/json",
+        "Authorization": f"Bearer {jwt_token}",
+        "X-source": "unstructured",
+    }
+
+    response = requests.get(url, headers=headers)
+    response.raise_for_status()
+    return response.json()


 def create_corpora(corpus_key: str, corpus_name: str) -> None:
@@ -148,8 +158,8 @@ def delete_corpora(corpus_key: str) -> None:
     response.raise_for_status()


-def list_corpora() -> list:
-    url = "https://api.vectara.io/v2/corpora"
+def get_metadata(corpus_key: str):
+    url = f"https://api.vectara.io/v2/corpora/{corpus_key}"
     jwt_token = _get_jwt_token()
     headers = {
         "Content-Type": "application/json",
@@ -159,35 +169,28 @@ def list_corpora() -> list:
     }
     response = requests.get(url, headers=headers)
     response.raise_for_status()
-
-    if response_json.get("corpora"):
-        return [item["key"] for item in response_json.get("corpora")]
-    else:
-        return []
+    return response.json()


 def wait_for_ready(corpus_key: str, timeout=60, interval=2) -> None:
-    def is_ready_status():
-        corpora_list = list_corpora()
-        return corpus_key in corpora_list
-
     start = time.time()
-
-
-
-
-
-
+    while time.time() - start < timeout:
+        try:
+            get_metadata(corpus_key)
+            return
+        except requests.HTTPError:
+            time.sleep(interval)
+    raise TimeoutError("time out waiting for corpus to be ready")


 def wait_for_delete(corpus_key: str, timeout=60, interval=2) -> None:
     start = time.time()
     while time.time() - start < timeout:
-
-
+        try:
+            get_metadata(corpus_key)
+            time.sleep(interval)
+        except requests.HTTPError:
             return
-        time.sleep(interval)
-
     raise TimeoutError("time out waiting for corpus to delete")


@@ -210,11 +213,23 @@ def corpora_util() -> Generator[str, None, None]:
        wait_for_delete(corpus_key=corpus_key)


+def wait_for_doc_meta(corpus_key: str, timeout=60, interval=1) -> list[str]:
+    start = time.time()
+    while time.time() - start < timeout:
+        all_document_meta = list_documents(corpus_key)
+        if not all_document_meta:
+            time.sleep(interval)
+            continue
+        else:
+            return all_document_meta
+    raise TimeoutError("time out waiting for document to be ready")
+
+
 @pytest.mark.asyncio
 @pytest.mark.tags(VECTARA_CONNECTOR_TYPE, DESTINATION_TAG, "vectara", NOSQL_TAG)
 @requires_env("VECTARA_OAUTH_CLIENT_ID", "VECTARA_OAUTH_SECRET", "VECTARA_CUSTOMER_ID")
 async def test_vectara_destination(
-    upload_file: Path, tmp_path: Path, corpora_util: str, retries=30, interval=
+    upload_file: Path, tmp_path: Path, corpora_util: str, retries=30, interval=1
 ):
     corpus_key = corpora_util
     connection_kwargs = {
@@ -231,7 +246,7 @@ async def test_vectara_destination(
         identifier="mock-file-data",
     )

-    stager_config = VectaraUploadStagerConfig(
+    stager_config = VectaraUploadStagerConfig()
     stager = VectaraUploadStager(upload_stager_config=stager_config)
     new_upload_file = stager.run(
         elements_filepath=upload_file,
@@ -260,11 +275,8 @@ async def test_vectara_destination(
         elements = json.load(upload_fp)
         first_element = elements[0]

-
-
-
-
-
-            break
-
-    validate_upload(response=response, expected_data=first_element)
+    all_document_meta = wait_for_doc_meta(corpus_key)
+    assert len(all_document_meta) == 1
+    document_meta = all_document_meta[0]
+    document = fetch_document(corpus_key=corpus_key, documents_id=document_meta["id"])
+    validate_upload(document=document, expected_data=first_element)
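Note: the test now validates against the document shape returned by Vectara's v2 documents API rather than a query response. A hedged sketch of a payload that would satisfy the new validate_upload assertions; every field value here is illustrative and only the key names are taken from the assertions above:

# Hypothetical fetch_document() result; keys mirror validate_upload's assertions.
document = {
    "id": "doc-1",
    "parts": [
        {
            "text": "expected element text",
            "metadata": {
                "element_id": "abc123",
                "filename": "fake-memo.pdf",
                "filetype": "application/pdf",
                "page_number": 1,
            },
        }
    ],
}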
unstructured_ingest/__version__.py CHANGED
@@ -1 +1 @@
-__version__ = "0.5.21"  # pragma: no cover
+__version__ = "0.5.25"  # pragma: no cover
unstructured_ingest/v2/interfaces/__init__.py CHANGED
@@ -1,6 +1,13 @@
+from unstructured_ingest.v2.types.file_data import (
+    BatchFileData,
+    BatchItem,
+    FileData,
+    FileDataSourceMetadata,
+    SourceIdentifiers,
+)
+
 from .connector import AccessConfig, BaseConnector, ConnectionConfig
 from .downloader import Downloader, DownloaderConfig, DownloadResponse, download_responses
-from .file_data import BatchFileData, BatchItem, FileData, FileDataSourceMetadata, SourceIdentifiers
 from .indexer import Indexer, IndexerConfig
 from .process import BaseProcess
 from .processor import ProcessorConfig
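Note: everything that moved keeps working through the re-export above, but the new module is now the canonical home. A minimal sketch of the two import paths; the connector_type value is a hypothetical placeholder, not anything this diff prescribes:

# Preferred import path as of 0.5.25:
from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers

# Old path, still resolves via the re-export in v2/interfaces/__init__.py:
# from unstructured_ingest.v2.interfaces import FileData

file_data = FileData(
    identifier="example-1",
    connector_type="local",  # hypothetical value, for illustration only
    source_identifiers=SourceIdentifiers(filename="doc.pdf", fullpath="/tmp/doc.pdf"),
)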
unstructured_ingest/v2/interfaces/file_data.py CHANGED
@@ -1,116 +1,13 @@
-import json
-from pathlib import Path
-from typing import Any, Optional
-from uuid import NAMESPACE_DNS, uuid5
-
-from pydantic import BaseModel, Field, ValidationError, field_validator, model_validator
-
-from unstructured_ingest.v2.logger import logger
-
-
-class SourceIdentifiers(BaseModel):
-    filename: str
-    fullpath: str
-    rel_path: Optional[str] = None
-
-    @property
-    def filename_stem(self) -> str:
-        return Path(self.filename).stem
-
-    @property
-    def relative_path(self) -> str:
-        return self.rel_path or self.fullpath
-
-
-class FileDataSourceMetadata(BaseModel):
-    url: Optional[str] = None
-    version: Optional[str] = None
-    record_locator: Optional[dict[str, Any]] = None
-    date_created: Optional[str] = None
-    date_modified: Optional[str] = None
-    date_processed: Optional[str] = None
-    permissions_data: Optional[list[dict[str, Any]]] = None
-    filesize_bytes: Optional[int] = None
-
-
-class FileData(BaseModel):
-    identifier: str
-    connector_type: str
-    source_identifiers: SourceIdentifiers
-    metadata: FileDataSourceMetadata = Field(default_factory=lambda: FileDataSourceMetadata())
-    additional_metadata: dict[str, Any] = Field(default_factory=dict)
-    reprocess: bool = False
-    local_download_path: Optional[str] = None
-    display_name: Optional[str] = None
-
-    @classmethod
-    def from_file(cls, path: str) -> "FileData":
-        path = Path(path).resolve()
-        if not path.exists() or not path.is_file():
-            raise ValueError(f"file path not valid: {path}")
-        with open(str(path.resolve()), "rb") as f:
-            file_data_dict = json.load(f)
-        file_data = cls.model_validate(file_data_dict)
-        return file_data
-
-    @classmethod
-    def cast(cls, file_data: "FileData", **kwargs) -> "FileData":
-        file_data_dict = file_data.model_dump()
-        return cls.model_validate(file_data_dict, **kwargs)
-
-    def to_file(self, path: str) -> None:
-        path = Path(path).resolve()
-        path.parent.mkdir(parents=True, exist_ok=True)
-        with open(str(path.resolve()), "w") as f:
-            json.dump(self.model_dump(), f, indent=2)
-
-
-class BatchItem(BaseModel):
-    identifier: str
-    version: Optional[str] = None
-
-
-class BatchFileData(FileData):
-    identifier: str = Field(init=False)
-    batch_items: list[BatchItem]
-    source_identifiers: Optional[SourceIdentifiers] = None
-
-    @field_validator("batch_items")
-    @classmethod
-    def check_batch_items(cls, v: list[BatchItem]) -> list[BatchItem]:
-        if not v:
-            raise ValueError("batch items cannot be empty")
-        all_identifiers = [item.identifier for item in v]
-        if len(all_identifiers) != len(set(all_identifiers)):
-            raise ValueError(f"duplicate identifiers: {all_identifiers}")
-        sorted_batch_items = sorted(v, key=lambda item: item.identifier)
-        return sorted_batch_items
-
-    @model_validator(mode="before")
-    @classmethod
-    def populate_identifier(cls, data: Any) -> Any:
-        if isinstance(data, dict) and "identifier" not in data:
-            batch_items = data["batch_items"]
-            identifier_data = json.dumps(
-                {item.identifier: item.version for item in batch_items}, sort_keys=True
-            )
-            data["identifier"] = str(uuid5(NAMESPACE_DNS, str(identifier_data)))
-        return data
-
-
-def file_data_from_file(path: str) -> FileData:
-    try:
-        return BatchFileData.from_file(path=path)
-    except ValidationError:
-        logger.debug(f"{path} not detected as batch file data")
-
-    return FileData.from_file(path=path)
-
-
-def file_data_from_dict(data: dict) -> FileData:
-    try:
-        return BatchFileData.model_validate(data)
-    except ValidationError:
-        logger.debug(f"{data} not valid for batch file data")
-
-    return FileData.model_validate(data)
+"""
+COMPATABILITY NOTICE:
+This file has moved to the v2/types/ module.
+The following line exists for backward compatibility.
+"""
+
+from unstructured_ingest.v2.types.file_data import *  # noqa - star imports are bad, but this is for maximal backward compatability
+
+# Eventually this file should go away. Let's start warning users now:
+logger.warning(  # noqa - using logger from the star import
+    "Importing file_data.py through interfaces is deprecated. "
+    "Please use unstructured_ingest.v2.types.file_data instead!"
+)
unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py CHANGED
@@ -1,13 +1,13 @@
-import json
 import os
+import tempfile
 from contextlib import contextmanager
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any, Generator
+from typing import TYPE_CHECKING, Any, Generator, Optional

 from pydantic import Field

-from unstructured_ingest.utils.data_prep import write_data
+from unstructured_ingest.utils.data_prep import get_data_df, write_data
 from unstructured_ingest.v2.interfaces import FileData, Uploader, UploaderConfig
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
@@ -22,6 +22,9 @@ from unstructured_ingest.v2.processes.connectors.sql.databricks_delta_tables imp

 CONNECTOR_TYPE = "databricks_volume_delta_tables"

+if TYPE_CHECKING:
+    from pandas import DataFrame
+

 class DatabricksVolumeDeltaTableUploaderConfig(UploaderConfig, DatabricksPathMixin):
     database: str = Field(description="Database name", default="default")
@@ -30,10 +33,12 @@ class DatabricksVolumeDeltaTableUploaderConfig(UploaderConfig, DatabricksPathMix

 @dataclass
 class DatabricksVolumeDeltaTableStager(DatabricksDeltaTablesUploadStager):
-    def write_output(self, output_path: Path, data: list[dict]) -> None:
+    def write_output(self, output_path: Path, data: list[dict]) -> Path:
         # To avoid new line issues when migrating from volumes into delta tables, omit indenting
         # and always write it as a json file
-
+        final_output_path = output_path.with_suffix(".json")
+        write_data(path=final_output_path, data=data, indent=None)
+        return final_output_path


 @dataclass
@@ -41,6 +46,7 @@ class DatabricksVolumeDeltaTableUploader(Uploader):
     connection_config: DatabricksDeltaTablesConnectionConfig
     upload_config: DatabricksVolumeDeltaTableUploaderConfig
     connector_type: str = CONNECTOR_TYPE
+    _columns: Optional[dict[str, str]] = None

     def precheck(self) -> None:
         with self.connection_config.get_cursor() as cursor:
@@ -84,20 +90,58 @@ class DatabricksVolumeDeltaTableUploader(Uploader):
             cursor.execute(f"USE DATABASE {self.upload_config.database}")
             yield cursor

-    def
-
-
-
-
-
-
+    def get_table_columns(self) -> dict[str, str]:
+        if self._columns is None:
+            with self.get_cursor() as cursor:
+                cursor.execute(f"SELECT * from {self.upload_config.table_name} LIMIT 1")
+                self._columns = {desc[0]: desc[1] for desc in cursor.description}
+        return self._columns
+
+    def _fit_to_schema(self, df: "DataFrame", add_missing_columns: bool = True) -> "DataFrame":
+        import pandas as pd
+
+        table_columns = self.get_table_columns()
+        columns = set(df.columns)
+        schema_fields = set(table_columns.keys())
+        columns_to_drop = columns - schema_fields
+        missing_columns = schema_fields - columns
+
+        if columns_to_drop:
+            logger.info(
+                "Following columns will be dropped to match the table's schema: "
+                f"{', '.join(columns_to_drop)}"
+            )
+        if missing_columns and add_missing_columns:
+            logger.info(
+                "Following null filled columns will be added to match the table's schema:"
+                f" {', '.join(missing_columns)} "
             )
-
-
-
-
-
-
+
+        df = df.drop(columns=columns_to_drop)
+
+        if add_missing_columns:
+            for column in missing_columns:
+                df[column] = pd.Series()
+        return df
+
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        with tempfile.TemporaryDirectory() as temp_dir:
+            df = get_data_df()
+            df = self._fit_to_schema(df=df)
+            temp_path = Path(temp_dir) / path.name
+            df.to_json(temp_path, orient="records", lines=False)
+            with self.get_cursor(staging_allowed_local_path=temp_dir) as cursor:
+                catalog_path = self.get_output_path(file_data=file_data)
+                logger.debug(f"uploading {path.as_posix()} to {catalog_path}")
+                cursor.execute(f"PUT '{temp_path.as_posix()}' INTO '{catalog_path}' OVERWRITE")
+                logger.debug(
+                    f"migrating content from {catalog_path} to "
+                    f"table {self.upload_config.table_name}"
+                )
+                columns = list(df.columns)
+                column_str = ", ".join(columns)
+                sql_statment = f"INSERT INTO `{self.upload_config.table_name}` ({column_str}) SELECT {column_str} FROM json.`{catalog_path}`"  # noqa: E501
+                cursor.execute(sql_statment)


 databricks_volumes_delta_tables_destination_entry = DestinationRegistryEntry(
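Note: _fit_to_schema reconciles the staged DataFrame with the destination table's columns before the INSERT. A self-contained sketch of the same drop/backfill logic on a toy frame; the column names and pretend table schema are hypothetical:

import pandas as pd

df = pd.DataFrame([{"text": "hello", "extra": 1}])          # staged data
table_columns = {"text": "string", "page_number": "int"}    # pretend table schema

columns_to_drop = set(df.columns) - set(table_columns)      # {"extra"}
missing_columns = set(table_columns) - set(df.columns)      # {"page_number"}

df = df.drop(columns=columns_to_drop)
for column in missing_columns:
    df[column] = pd.Series()  # null-filled column, matching the connector's approach

print(sorted(df.columns))  # ['page_number', 'text']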
unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py CHANGED
@@ -6,7 +6,7 @@ from pathlib import Path
 from time import time
 from typing import TYPE_CHECKING, Any, Generator, Optional, Union

-from pydantic import BaseModel, Field, Secret, SecretStr
+from pydantic import BaseModel, Field, Secret, SecretStr, field_validator

 from unstructured_ingest.error import (
     DestinationConnectionError,
@@ -98,6 +98,12 @@ class ElasticsearchConnectionConfig(ConnectionConfig):
     ca_certs: Optional[Path] = None
     access_config: Secret[ElasticsearchAccessConfig]

+    @field_validator("hosts", mode="before")
+    def to_list(cls, value):
+        if isinstance(value, str):
+            return [value]
+        return value
+
     def get_client_kwargs(self) -> dict:
         # Update auth related fields to conform to what the SDK expects based on the
         # supported methods:
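Note: the same hosts coercion is added to the OpenSearch config below. A minimal runnable sketch of what a mode="before" validator does; HostsConfig is a hypothetical stand-in for the connection configs in this diff:

from typing import Optional
from pydantic import BaseModel, field_validator

class HostsConfig(BaseModel):
    hosts: Optional[list[str]] = None

    @field_validator("hosts", mode="before")
    def to_list(cls, value):
        # runs before list[str] validation, so a bare string gets wrapped
        if isinstance(value, str):
            return [value]
        return value

assert HostsConfig(hosts="http://localhost:9200").hosts == ["http://localhost:9200"]
assert HostsConfig(hosts=["http://a:9200"]).hosts == ["http://a:9200"]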
unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py CHANGED
@@ -2,7 +2,7 @@ from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Optional

-from pydantic import BaseModel, Field, Secret
+from pydantic import BaseModel, Field, Secret, field_validator

 from unstructured_ingest.error import (
     DestinationConnectionError,
@@ -78,6 +78,12 @@ class OpenSearchConnectionConfig(ConnectionConfig):

     access_config: Secret[OpenSearchAccessConfig]

+    @field_validator("hosts", mode="before")
+    def to_list(cls, value):
+        if isinstance(value, str):
+            return [value]
+        return value
+
     def get_client_kwargs(self) -> dict:
         # Update auth related fields to conform to what the SDK expects based on the
         # supported methods:
unstructured_ingest/v2/processes/connectors/fsspec/s3.py CHANGED
@@ -33,6 +33,9 @@ from unstructured_ingest.v2.processes.utils.blob_storage import (

 CONNECTOR_TYPE = "s3"

+# https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-keys.html#object-key-guidelines-avoid-characters
+CHARACTERS_TO_AVOID = ["\\", "{", "^", "}", "%", "`", "]", '"', ">", "[", "~", "<", "#", "|"]
+
 if TYPE_CHECKING:
     from s3fs import S3FileSystem

@@ -91,7 +94,7 @@ class S3ConnectionConfig(FsspecConnectionConfig):
         if isinstance(e, PermissionError):
             return UserAuthError(e)
         if isinstance(e, FileNotFoundError):
-            return UserError(e)
+            return UserError(f"File not found: {e}")
         if cause := getattr(e, "__cause__", None):
             error_response = cause.response
             error_meta = error_response["ResponseMetadata"]
@@ -140,6 +143,12 @@ class S3Indexer(FsspecIndexer):
         }
         if metadata:
             record_locator["metadata"] = metadata
+        issue_characters = [char for char in CHARACTERS_TO_AVOID if char in path]
+        if issue_characters:
+            logger.warning(
+                f"File path {path} contains characters "
+                f"that can cause issues with S3: {issue_characters}"
+            )
         return FileDataSourceMetadata(
             date_created=date_created,
             date_modified=date_modified,
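Note: the indexer only warns about such keys; it does not reject them. A standalone sketch of the check; the helper name is hypothetical:

# Characters AWS recommends avoiding, per the S3 object-key guidelines linked above.
CHARACTERS_TO_AVOID = ["\\", "{", "^", "}", "%", "`", "]", '"', ">", "[", "~", "<", "#", "|"]

def problem_chars(path: str) -> list[str]:
    # hypothetical helper mirroring the check added to S3Indexer
    return [char for char in CHARACTERS_TO_AVOID if char in path]

print(problem_chars("bucket/report#final|v2.pdf"))  # ['#', '|']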
unstructured_ingest/v2/processes/connectors/sql/sql.py CHANGED
@@ -251,8 +251,9 @@ class SQLUploadStager(UploadStager):
             df[column] = df[column].apply(str)
         return df

-    def write_output(self, output_path: Path, data: list[dict]) -> None:
+    def write_output(self, output_path: Path, data: list[dict]) -> Path:
         write_data(path=output_path, data=data)
+        return output_path

     def run(
         self,
@@ -278,8 +279,10 @@ class SQLUploadStager(UploadStager):
         output_filename = f"{Path(output_filename).stem}{output_filename_suffix}"
         output_path = self.get_output_path(output_filename=output_filename, output_dir=output_dir)

-        self.write_output(
-
+        final_output_path = self.write_output(
+            output_path=output_path, data=df.to_dict(orient="records")
+        )
+        return final_output_path


 class SQLUploaderConfig(UploaderConfig):
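Note: having write_output return the final path lets a subclass redirect the staged file (the Databricks stager above rewrites the suffix) while run() still reports where the data actually landed. A hedged sketch of a subclass using that hook; the class name is hypothetical:

from pathlib import Path

from unstructured_ingest.utils.data_prep import write_data
from unstructured_ingest.v2.processes.connectors.sql.sql import SQLUploadStager

class JsonSuffixStager(SQLUploadStager):  # hypothetical subclass
    def write_output(self, output_path: Path, data: list[dict]) -> Path:
        final_output_path = output_path.with_suffix(".json")
        write_data(path=final_output_path, data=data)
        return final_output_path  # run() passes this back to the pipeline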
unstructured_ingest/v2/types/__init__.py ADDED
File without changes (new empty module)
unstructured_ingest/v2/types/file_data.py ADDED
@@ -0,0 +1,116 @@
+import json
+from pathlib import Path
+from typing import Any, Optional
+from uuid import NAMESPACE_DNS, uuid5
+
+from pydantic import BaseModel, Field, ValidationError, field_validator, model_validator
+
+from unstructured_ingest.v2.logger import logger
+
+
+class SourceIdentifiers(BaseModel):
+    filename: str
+    fullpath: str
+    rel_path: Optional[str] = None
+
+    @property
+    def filename_stem(self) -> str:
+        return Path(self.filename).stem
+
+    @property
+    def relative_path(self) -> str:
+        return self.rel_path or self.fullpath
+
+
+class FileDataSourceMetadata(BaseModel):
+    url: Optional[str] = None
+    version: Optional[str] = None
+    record_locator: Optional[dict[str, Any]] = None
+    date_created: Optional[str] = None
+    date_modified: Optional[str] = None
+    date_processed: Optional[str] = None
+    permissions_data: Optional[list[dict[str, Any]]] = None
+    filesize_bytes: Optional[int] = None
+
+
+class FileData(BaseModel):
+    identifier: str
+    connector_type: str
+    source_identifiers: SourceIdentifiers
+    metadata: FileDataSourceMetadata = Field(default_factory=lambda: FileDataSourceMetadata())
+    additional_metadata: dict[str, Any] = Field(default_factory=dict)
+    reprocess: bool = False
+    local_download_path: Optional[str] = None
+    display_name: Optional[str] = None
+
+    @classmethod
+    def from_file(cls, path: str) -> "FileData":
+        path = Path(path).resolve()
+        if not path.exists() or not path.is_file():
+            raise ValueError(f"file path not valid: {path}")
+        with open(str(path.resolve()), "rb") as f:
+            file_data_dict = json.load(f)
+        file_data = cls.model_validate(file_data_dict)
+        return file_data
+
+    @classmethod
+    def cast(cls, file_data: "FileData", **kwargs) -> "FileData":
+        file_data_dict = file_data.model_dump()
+        return cls.model_validate(file_data_dict, **kwargs)
+
+    def to_file(self, path: str) -> None:
+        path = Path(path).resolve()
+        path.parent.mkdir(parents=True, exist_ok=True)
+        with open(str(path.resolve()), "w") as f:
+            json.dump(self.model_dump(), f, indent=2)
+
+
+class BatchItem(BaseModel):
+    identifier: str
+    version: Optional[str] = None
+
+
+class BatchFileData(FileData):
+    identifier: str = Field(init=False)
+    batch_items: list[BatchItem]
+    source_identifiers: Optional[SourceIdentifiers] = None
+
+    @field_validator("batch_items")
+    @classmethod
+    def check_batch_items(cls, v: list[BatchItem]) -> list[BatchItem]:
+        if not v:
+            raise ValueError("batch items cannot be empty")
+        all_identifiers = [item.identifier for item in v]
+        if len(all_identifiers) != len(set(all_identifiers)):
+            raise ValueError(f"duplicate identifiers: {all_identifiers}")
+        sorted_batch_items = sorted(v, key=lambda item: item.identifier)
+        return sorted_batch_items
+
+    @model_validator(mode="before")
+    @classmethod
+    def populate_identifier(cls, data: Any) -> Any:
+        if isinstance(data, dict) and "identifier" not in data:
+            batch_items = data["batch_items"]
+            identifier_data = json.dumps(
+                {item.identifier: item.version for item in batch_items}, sort_keys=True
+            )
+            data["identifier"] = str(uuid5(NAMESPACE_DNS, str(identifier_data)))
+        return data
+
+
+def file_data_from_file(path: str) -> FileData:
+    try:
+        return BatchFileData.from_file(path=path)
+    except ValidationError:
+        logger.debug(f"{path} not detected as batch file data")
+
+    return FileData.from_file(path=path)
+
+
+def file_data_from_dict(data: dict) -> FileData:
+    try:
+        return BatchFileData.model_validate(data)
+    except ValidationError:
+        logger.debug(f"{data} not valid for batch file data")
+
+    return FileData.model_validate(data)
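Note: BatchFileData.populate_identifier derives the batch identifier deterministically from its items, so re-running the same batch yields the same UUID. A minimal sketch of that derivation; the item identifiers and versions are illustrative:

import json
from uuid import NAMESPACE_DNS, uuid5

items = {"item-a": "1", "item-b": None}  # identifier -> version, hypothetical
identifier_data = json.dumps(items, sort_keys=True)
print(str(uuid5(NAMESPACE_DNS, str(identifier_data))))  # stable across runs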
{unstructured_ingest-0.5.21.dist-info → unstructured_ingest-0.5.25.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: unstructured-ingest
-Version: 0.5.21
+Version: 0.5.25
 Summary: A library that prepares raw documents for downstream ML tasks.
 Home-page: https://github.com/Unstructured-IO/unstructured-ingest
 Author: Unstructured Technologies
@@ -22,12 +22,12 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.9.0,<3.14
 Description-Content-Type: text/markdown
 License-File: LICENSE.md
+Requires-Dist: opentelemetry-sdk
+Requires-Dist: python-dateutil
 Requires-Dist: click
 Requires-Dist: dataclasses_json
-Requires-Dist: pydantic>=2.7
-Requires-Dist: python-dateutil
-Requires-Dist: opentelemetry-sdk
 Requires-Dist: tqdm
+Requires-Dist: pydantic>=2.7
 Requires-Dist: numpy
 Requires-Dist: pandas
 Provides-Extra: remote
@@ -103,8 +103,8 @@ Requires-Dist: astrapy; extra == "astradb"
 Requires-Dist: numpy; extra == "astradb"
 Requires-Dist: pandas; extra == "astradb"
 Provides-Extra: azure
-Requires-Dist: fsspec; extra == "azure"
 Requires-Dist: adlfs; extra == "azure"
+Requires-Dist: fsspec; extra == "azure"
 Requires-Dist: numpy; extra == "azure"
 Requires-Dist: pandas; extra == "azure"
 Provides-Extra: azure-ai-search
@@ -112,13 +112,13 @@ Requires-Dist: azure-search-documents; extra == "azure-ai-search"
 Requires-Dist: numpy; extra == "azure-ai-search"
 Requires-Dist: pandas; extra == "azure-ai-search"
 Provides-Extra: biomed
-Requires-Dist: bs4; extra == "biomed"
 Requires-Dist: requests; extra == "biomed"
+Requires-Dist: bs4; extra == "biomed"
 Requires-Dist: numpy; extra == "biomed"
 Requires-Dist: pandas; extra == "biomed"
 Provides-Extra: box
-Requires-Dist: fsspec; extra == "box"
 Requires-Dist: boxfs; extra == "box"
+Requires-Dist: fsspec; extra == "box"
 Requires-Dist: numpy; extra == "box"
 Requires-Dist: pandas; extra == "box"
 Provides-Extra: chroma
@@ -148,8 +148,8 @@ Requires-Dist: discord.py; extra == "discord"
 Requires-Dist: numpy; extra == "discord"
 Requires-Dist: pandas; extra == "discord"
 Provides-Extra: dropbox
-Requires-Dist: fsspec; extra == "dropbox"
 Requires-Dist: dropboxdrivefs; extra == "dropbox"
+Requires-Dist: fsspec; extra == "dropbox"
 Requires-Dist: numpy; extra == "dropbox"
 Requires-Dist: pandas; extra == "dropbox"
 Provides-Extra: duckdb
@@ -162,13 +162,13 @@ Requires-Dist: numpy; extra == "elasticsearch"
 Requires-Dist: pandas; extra == "elasticsearch"
 Provides-Extra: gcs
 Requires-Dist: bs4; extra == "gcs"
-Requires-Dist: fsspec; extra == "gcs"
 Requires-Dist: gcsfs; extra == "gcs"
+Requires-Dist: fsspec; extra == "gcs"
 Requires-Dist: numpy; extra == "gcs"
 Requires-Dist: pandas; extra == "gcs"
 Provides-Extra: github
-Requires-Dist: pygithub>1.58.0; extra == "github"
 Requires-Dist: requests; extra == "github"
+Requires-Dist: pygithub>1.58.0; extra == "github"
 Requires-Dist: numpy; extra == "github"
 Requires-Dist: pandas; extra == "github"
 Provides-Extra: gitlab
@@ -180,15 +180,15 @@ Requires-Dist: google-api-python-client; extra == "google-drive"
 Requires-Dist: numpy; extra == "google-drive"
 Requires-Dist: pandas; extra == "google-drive"
 Provides-Extra: hubspot
-Requires-Dist: urllib3; extra == "hubspot"
 Requires-Dist: hubspot-api-client; extra == "hubspot"
+Requires-Dist: urllib3; extra == "hubspot"
 Requires-Dist: numpy; extra == "hubspot"
 Requires-Dist: pandas; extra == "hubspot"
 Provides-Extra: ibm-watsonx-s3
+Requires-Dist: tenacity; extra == "ibm-watsonx-s3"
 Requires-Dist: pyiceberg; extra == "ibm-watsonx-s3"
 Requires-Dist: pyarrow; extra == "ibm-watsonx-s3"
 Requires-Dist: httpx; extra == "ibm-watsonx-s3"
-Requires-Dist: tenacity; extra == "ibm-watsonx-s3"
 Requires-Dist: numpy; extra == "ibm-watsonx-s3"
 Requires-Dist: pandas; extra == "ibm-watsonx-s3"
 Provides-Extra: jira
@@ -217,21 +217,21 @@ Requires-Dist: numpy; extra == "mongodb"
 Requires-Dist: pandas; extra == "mongodb"
 Provides-Extra: neo4j
 Requires-Dist: networkx; extra == "neo4j"
-Requires-Dist: neo4j-rust-ext; extra == "neo4j"
 Requires-Dist: cymple; extra == "neo4j"
+Requires-Dist: neo4j-rust-ext; extra == "neo4j"
 Requires-Dist: numpy; extra == "neo4j"
 Requires-Dist: pandas; extra == "neo4j"
 Provides-Extra: notion
-Requires-Dist: httpx; extra == "notion"
-Requires-Dist: htmlBuilder; extra == "notion"
 Requires-Dist: notion-client; extra == "notion"
 Requires-Dist: backoff; extra == "notion"
+Requires-Dist: htmlBuilder; extra == "notion"
+Requires-Dist: httpx; extra == "notion"
 Requires-Dist: numpy; extra == "notion"
 Requires-Dist: pandas; extra == "notion"
 Provides-Extra: onedrive
-Requires-Dist: bs4; extra == "onedrive"
 Requires-Dist: msal; extra == "onedrive"
 Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
+Requires-Dist: bs4; extra == "onedrive"
 Requires-Dist: numpy; extra == "onedrive"
 Requires-Dist: pandas; extra == "onedrive"
 Provides-Extra: opensearch
@@ -278,8 +278,8 @@ Requires-Dist: simple-salesforce; extra == "salesforce"
 Requires-Dist: numpy; extra == "salesforce"
 Requires-Dist: pandas; extra == "salesforce"
 Provides-Extra: sftp
-Requires-Dist: fsspec; extra == "sftp"
 Requires-Dist: paramiko; extra == "sftp"
+Requires-Dist: fsspec; extra == "sftp"
 Requires-Dist: numpy; extra == "sftp"
 Requires-Dist: pandas; extra == "sftp"
 Provides-Extra: slack
@@ -287,8 +287,8 @@ Requires-Dist: slack_sdk[optional]; extra == "slack"
 Requires-Dist: numpy; extra == "slack"
 Requires-Dist: pandas; extra == "slack"
 Provides-Extra: snowflake
-Requires-Dist: psycopg2-binary; extra == "snowflake"
 Requires-Dist: snowflake-connector-python; extra == "snowflake"
+Requires-Dist: psycopg2-binary; extra == "snowflake"
 Requires-Dist: numpy; extra == "snowflake"
 Requires-Dist: pandas; extra == "snowflake"
 Provides-Extra: wikipedia
@@ -312,21 +312,21 @@ Requires-Dist: singlestoredb; extra == "singlestore"
 Requires-Dist: numpy; extra == "singlestore"
 Requires-Dist: pandas; extra == "singlestore"
 Provides-Extra: vectara
-Requires-Dist: httpx; extra == "vectara"
-Requires-Dist: aiofiles; extra == "vectara"
 Requires-Dist: requests; extra == "vectara"
+Requires-Dist: aiofiles; extra == "vectara"
+Requires-Dist: httpx; extra == "vectara"
 Requires-Dist: numpy; extra == "vectara"
 Requires-Dist: pandas; extra == "vectara"
 Provides-Extra: vastdb
-Requires-Dist: ibis; extra == "vastdb"
 Requires-Dist: pyarrow; extra == "vastdb"
+Requires-Dist: ibis; extra == "vastdb"
 Requires-Dist: vastdb; extra == "vastdb"
 Requires-Dist: numpy; extra == "vastdb"
 Requires-Dist: pandas; extra == "vastdb"
 Provides-Extra: zendesk
-Requires-Dist: bs4; extra == "zendesk"
-Requires-Dist: httpx; extra == "zendesk"
 Requires-Dist: aiofiles; extra == "zendesk"
+Requires-Dist: httpx; extra == "zendesk"
+Requires-Dist: bs4; extra == "zendesk"
 Requires-Dist: numpy; extra == "zendesk"
 Requires-Dist: pandas; extra == "zendesk"
 Provides-Extra: embed-huggingface
@@ -334,8 +334,8 @@ Requires-Dist: sentence-transformers; extra == "embed-huggingface"
 Requires-Dist: numpy; extra == "embed-huggingface"
 Requires-Dist: pandas; extra == "embed-huggingface"
 Provides-Extra: embed-octoai
-Requires-Dist: tiktoken; extra == "embed-octoai"
 Requires-Dist: openai; extra == "embed-octoai"
+Requires-Dist: tiktoken; extra == "embed-octoai"
 Requires-Dist: numpy; extra == "embed-octoai"
 Requires-Dist: pandas; extra == "embed-octoai"
 Provides-Extra: embed-vertexai
@@ -351,13 +351,13 @@ Requires-Dist: mixedbread-ai; extra == "embed-mixedbreadai"
 Requires-Dist: numpy; extra == "embed-mixedbreadai"
 Requires-Dist: pandas; extra == "embed-mixedbreadai"
 Provides-Extra: openai
-Requires-Dist: tiktoken; extra == "openai"
 Requires-Dist: openai; extra == "openai"
+Requires-Dist: tiktoken; extra == "openai"
 Requires-Dist: numpy; extra == "openai"
 Requires-Dist: pandas; extra == "openai"
 Provides-Extra: bedrock
-Requires-Dist: aioboto3; extra == "bedrock"
 Requires-Dist: boto3; extra == "bedrock"
+Requires-Dist: aioboto3; extra == "bedrock"
 Requires-Dist: numpy; extra == "bedrock"
 Requires-Dist: pandas; extra == "bedrock"
 Provides-Extra: togetherai
{unstructured_ingest-0.5.21.dist-info → unstructured_ingest-0.5.25.dist-info}/RECORD RENAMED
@@ -24,7 +24,7 @@ test/integration/connectors/test_qdrant.py,sha256=Yme3ZZ5zIbaZ-yYLUqN2oy0hsrcAfv
 test/integration/connectors/test_redis.py,sha256=YXWWw4m40ZmLrf3eJ85hhT7WSJnri_GY1ieixIicYlI,5102
 test/integration/connectors/test_s3.py,sha256=E1dypeag_E3OIfpQWIz3jb7ctRHRD63UtyTrzyvJzpc,7473
 test/integration/connectors/test_sharepoint.py,sha256=weGby5YD6se7R7KLEq96hxUZYPzwoqZqXXTPhtQWZsQ,7646
-test/integration/connectors/test_vectara.py,sha256=
+test/integration/connectors/test_vectara.py,sha256=thM9vIWn7vcH1xjQK3owuEJMr65Z7L4j7NICsMpsMv8,9290
 test/integration/connectors/test_zendesk.py,sha256=nMBVNlEQr1uvmI1fzUC1bmoa2doXnYp5n4bMJS2FN-o,3727
 test/integration/connectors/databricks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/integration/connectors/databricks/test_volumes_native.py,sha256=KqiapQAV0s_Zv0CO8BwYoiCk30dwrSZzuigUWNRIem0,9559
@@ -113,7 +113,7 @@ test/unit/v2/partitioners/test_partitioner.py,sha256=iIYg7IpftV3LusoO4H8tr1IHY1U
 test/unit/v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/unit/v2/utils/data_generator.py,sha256=UoYVNjG4S4wlaA9gceQ82HIpF9_6I1UTHD1_GrQBHp0,973
 unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
-unstructured_ingest/__version__.py,sha256=
+unstructured_ingest/__version__.py,sha256=A9I2h_N6BTgmKRhQ1HbPOAJuwdOFgMb_aDmK1czvHyc,43
 unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
 unstructured_ingest/interfaces.py,sha256=7DOnDpGvUNlCoFR7UPRGmOarqH5sFtuUOO5vf8X3oTM,31489
 unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
@@ -398,10 +398,10 @@ unstructured_ingest/v2/cli/base/src.py,sha256=cpQ43qQju4e5s_YSaPxUtA70BaisRkTBdj
 unstructured_ingest/v2/cli/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unstructured_ingest/v2/cli/utils/click.py,sha256=1_eJgrwS2DFBl1jZPLsj1vgVgR7agFBIEBe4A_n7mH4,7827
 unstructured_ingest/v2/cli/utils/model_conversion.py,sha256=7eEIkk1KU51-ZNiIfI1KRxlwITNW1xl1YxMAG8BcTk0,7604
-unstructured_ingest/v2/interfaces/__init__.py,sha256=
+unstructured_ingest/v2/interfaces/__init__.py,sha256=Jn5qtWOnmBZzsb2PoQYN3Xj5xHa9thSVc0BEoIN0Pw0,1059
 unstructured_ingest/v2/interfaces/connector.py,sha256=qUFFJ3qgDMenTCZMtVRjq1DIwsVak6pxNjQOH2eVkMw,1623
 unstructured_ingest/v2/interfaces/downloader.py,sha256=Qi_wISgUACZKEPu5p1kUaG3uiCXcr3zWg9z9uRDwoOk,2927
-unstructured_ingest/v2/interfaces/file_data.py,sha256=
+unstructured_ingest/v2/interfaces/file_data.py,sha256=DQYzXr8yjlm6VkGuwQLGJ1sia4Gr0d__POAFLrow1PE,525
 unstructured_ingest/v2/interfaces/indexer.py,sha256=i0oftyifXefxfKa4a3sCfSwkzWGSPE6EvC9sg6fwZgk,833
 unstructured_ingest/v2/interfaces/process.py,sha256=S3A_9gkwwGC-iQxvnpj3Er6IJAjAT5npzpSgxuFAzUM,449
 unstructured_ingest/v2/interfaces/processor.py,sha256=VX7JqXlbG1plxMK8THWhWINPbTICaaUEk4XUXhnOixY,3303
@@ -462,21 +462,21 @@ unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py,sha256=h6q
 unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py,sha256=gjICJJwhDHBLt_L-LrMlvJ3DL1DYtwFpyMLb_zYvOIg,3755
 unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py,sha256=Uss3XPPaq1AsqJOEy4RJgBJw2-bTjrXH2PgtVNYd2w0,3006
 unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py,sha256=g1qYnIrML4TjN7rmC0MGrD5JzAprb6SymBHlEdOumz0,3113
-unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py,sha256=
+unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py,sha256=0kEtIVQSD6RhLAqpc-0BNFQazS7lnsnWalaN3Mdn97g,6805
 unstructured_ingest/v2/processes/connectors/duckdb/__init__.py,sha256=5sVvJCWhU-YkjHIwk4W6BZCanFYK5W4xTpWtQ8xzeB4,561
 unstructured_ingest/v2/processes/connectors/duckdb/base.py,sha256=o3J81DnSwt3lmAh19jXVPAYRZLJ3VyGhaEVO2SIjksQ,2926
 unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py,sha256=NIo2CCiPiuTFotNC891Mbelzg01knItryYGUtOM96xg,4393
 unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py,sha256=RW-Cw94Hs3ZsN8Kb4ciSh_N-Qkp0cqkw_xkJbt8CDNU,4656
 unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py,sha256=Zzc0JNPP-eFqpwWw1Gp-XC8H-s__IgkYKzoagECycZY,829
-unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py,sha256=
-unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py,sha256=
+unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py,sha256=KmlQCA7LXppxhL9e27LBBqNT999nUcc39qe2IkZsUJ8,18988
+unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py,sha256=tzOV0eNMyVHMXE5nedp6u0yyWC0Gn_blklg2ZdoOa4c,6956
 unstructured_ingest/v2/processes/connectors/fsspec/__init__.py,sha256=TtdeImM7Ypl_n6sl7I1JqX6bGSG0t_FqvCqE3Cy24og,1846
 unstructured_ingest/v2/processes/connectors/fsspec/azure.py,sha256=kw0UfGI2fx3oQ8jVpzF45pH8Qg_QP_que5C_VXgnktc,7156
 unstructured_ingest/v2/processes/connectors/fsspec/box.py,sha256=aJCtCHRBAauLwdWEQe704Cm4UHv-ukTXV2bT3SBENVk,5881
 unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py,sha256=epf2okPKqF4R-u_zxEYDJK4g0qhFqf1ejuz8JSJaNyU,8360
 unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py,sha256=0Z--cPh17W_j4jQkSe2BeeD_j0Tt147Z01gqqF58Z9A,14421
 unstructured_ingest/v2/processes/connectors/fsspec/gcs.py,sha256=nlDSKHs8mbXCY5Bok1hGH8UZJCdtnyhZWiRwn180ohk,7177
-unstructured_ingest/v2/processes/connectors/fsspec/s3.py,sha256=
+unstructured_ingest/v2/processes/connectors/fsspec/s3.py,sha256=MtD41jZQXB-fqNzW3Whqq6ydQYDUK6Jub7sSPvgLErw,7130
 unstructured_ingest/v2/processes/connectors/fsspec/sftp.py,sha256=ZimcBJL-Har7GOESb9blzDb8pzPZcmh16YvvHYxYkJM,6373
 unstructured_ingest/v2/processes/connectors/fsspec/utils.py,sha256=jec_Qfe2hbfahBuY-u8FnvHuv933AI5HwPFjOL3kEEY,456
 unstructured_ingest/v2/processes/connectors/ibm_watsonx/__init__.py,sha256=EMG7lyThrYO8W7y3DIxGgNNXtbpdeAdvLd0m4tpO-Io,377
@@ -568,7 +568,7 @@ unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py,sha25
 unstructured_ingest/v2/processes/connectors/sql/postgres.py,sha256=BATfX1PQGT2kl8jAbdNKXTojYKJxh3pJV9-h3OBnHGo,5124
 unstructured_ingest/v2/processes/connectors/sql/singlestore.py,sha256=am2d87kDkpTTB0VbPSX3ce9o6oM9KUQu5y9T_p1kgJw,5711
 unstructured_ingest/v2/processes/connectors/sql/snowflake.py,sha256=r2qgoEF3bUugzgSr3hMJyIm8DKmxsO53ZHXJSNxOsvE,9379
-unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=
+unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=CbysCnBBHtmYkqXiaoZSazI1ombNltrsqFrY-gQzm4U,15683
 unstructured_ingest/v2/processes/connectors/sql/sqlite.py,sha256=6RoBUxMbeuhduvTFlBKMgEH1NKJg7doQjXF_R5cUuX0,5319
 unstructured_ingest/v2/processes/connectors/sql/vastdb.py,sha256=wklJ8p3eMb81FTjS6ukPoILuWN0_KQBfuYGXfE0XrqY,9644
 unstructured_ingest/v2/processes/connectors/weaviate/__init__.py,sha256=NMiwnVWan69KnzVELvaqX34tMhCytIa-C8EDsXVKsEo,856
@@ -581,9 +581,11 @@ unstructured_ingest/v2/processes/connectors/zendesk/client.py,sha256=DDAYQB7catK
 unstructured_ingest/v2/processes/connectors/zendesk/zendesk.py,sha256=R8SXYkRhVUoWEHdGCt2CzcTxxuFundw_0GlGZ34YmbM,8987
 unstructured_ingest/v2/processes/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unstructured_ingest/v2/processes/utils/blob_storage.py,sha256=EWvK4HRYubr9i1UyMhv5cU9u0UzVkCDC_BIm4Uxab7Y,964
-unstructured_ingest
-unstructured_ingest
-unstructured_ingest-0.5.
-unstructured_ingest-0.5.
-unstructured_ingest-0.5.
-unstructured_ingest-0.5.
+unstructured_ingest/v2/types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+unstructured_ingest/v2/types/file_data.py,sha256=kowOhvYy0q_-khX3IuR111AfjkdQezEfxjzK6QDH7oA,3836
+unstructured_ingest-0.5.25.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
+unstructured_ingest-0.5.25.dist-info/METADATA,sha256=Z_PvUmam-C56UwoY92VhbvUd-fubXBHevjSMHKVgPx4,14999
+unstructured_ingest-0.5.25.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+unstructured_ingest-0.5.25.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
+unstructured_ingest-0.5.25.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
+unstructured_ingest-0.5.25.dist-info/RECORD,,
{unstructured_ingest-0.5.21.dist-info → unstructured_ingest-0.5.25.dist-info}/LICENSE.md RENAMED
File without changes
{unstructured_ingest-0.5.21.dist-info → unstructured_ingest-0.5.25.dist-info}/WHEEL RENAMED
File without changes
{unstructured_ingest-0.5.21.dist-info → unstructured_ingest-0.5.25.dist-info}/entry_points.txt RENAMED
File without changes
{unstructured_ingest-0.5.21.dist-info → unstructured_ingest-0.5.25.dist-info}/top_level.txt RENAMED
File without changes