unstructured-ingest 0.3.7__py3-none-any.whl → 0.3.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic.
- test/integration/chunkers/test_chunkers.py +0 -11
- test/integration/connectors/conftest.py +11 -1
- test/integration/connectors/databricks_tests/test_volumes_native.py +4 -3
- test/integration/connectors/duckdb/conftest.py +14 -0
- test/integration/connectors/duckdb/test_duckdb.py +51 -44
- test/integration/connectors/duckdb/test_motherduck.py +37 -48
- test/integration/connectors/elasticsearch/test_elasticsearch.py +26 -4
- test/integration/connectors/elasticsearch/test_opensearch.py +26 -3
- test/integration/connectors/sql/test_postgres.py +102 -91
- test/integration/connectors/sql/test_singlestore.py +111 -99
- test/integration/connectors/sql/test_snowflake.py +142 -117
- test/integration/connectors/sql/test_sqlite.py +86 -75
- test/integration/connectors/test_astradb.py +22 -1
- test/integration/connectors/test_azure_ai_search.py +25 -3
- test/integration/connectors/test_chroma.py +120 -0
- test/integration/connectors/test_confluence.py +4 -4
- test/integration/connectors/test_delta_table.py +1 -0
- test/integration/connectors/test_kafka.py +4 -4
- test/integration/connectors/test_milvus.py +21 -0
- test/integration/connectors/test_mongodb.py +3 -3
- test/integration/connectors/test_neo4j.py +236 -0
- test/integration/connectors/test_pinecone.py +25 -1
- test/integration/connectors/test_qdrant.py +25 -2
- test/integration/connectors/test_s3.py +9 -6
- test/integration/connectors/utils/docker.py +6 -0
- test/integration/connectors/utils/validation/__init__.py +0 -0
- test/integration/connectors/utils/validation/destination.py +88 -0
- test/integration/connectors/utils/validation/equality.py +75 -0
- test/integration/connectors/utils/{validation.py → validation/source.py} +15 -91
- test/integration/connectors/utils/validation/utils.py +36 -0
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/utils/chunking.py +11 -0
- unstructured_ingest/utils/data_prep.py +36 -0
- unstructured_ingest/v2/interfaces/upload_stager.py +70 -6
- unstructured_ingest/v2/interfaces/uploader.py +11 -2
- unstructured_ingest/v2/pipeline/steps/stage.py +3 -1
- unstructured_ingest/v2/processes/connectors/astradb.py +8 -30
- unstructured_ingest/v2/processes/connectors/azure_ai_search.py +16 -40
- unstructured_ingest/v2/processes/connectors/chroma.py +36 -59
- unstructured_ingest/v2/processes/connectors/couchbase.py +42 -52
- unstructured_ingest/v2/processes/connectors/delta_table.py +11 -33
- unstructured_ingest/v2/processes/connectors/duckdb/base.py +26 -26
- unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +29 -20
- unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +37 -44
- unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +5 -30
- unstructured_ingest/v2/processes/connectors/gitlab.py +32 -31
- unstructured_ingest/v2/processes/connectors/google_drive.py +32 -29
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +2 -4
- unstructured_ingest/v2/processes/connectors/kdbai.py +44 -70
- unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +8 -10
- unstructured_ingest/v2/processes/connectors/local.py +13 -2
- unstructured_ingest/v2/processes/connectors/milvus.py +16 -57
- unstructured_ingest/v2/processes/connectors/mongodb.py +4 -8
- unstructured_ingest/v2/processes/connectors/neo4j.py +381 -0
- unstructured_ingest/v2/processes/connectors/pinecone.py +23 -65
- unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +32 -41
- unstructured_ingest/v2/processes/connectors/sql/sql.py +41 -40
- unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +9 -31
- {unstructured_ingest-0.3.7.dist-info → unstructured_ingest-0.3.9.dist-info}/METADATA +21 -17
- {unstructured_ingest-0.3.7.dist-info → unstructured_ingest-0.3.9.dist-info}/RECORD +64 -56
- {unstructured_ingest-0.3.7.dist-info → unstructured_ingest-0.3.9.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.3.7.dist-info → unstructured_ingest-0.3.9.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.3.7.dist-info → unstructured_ingest-0.3.9.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.3.7.dist-info → unstructured_ingest-0.3.9.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py

@@ -1,10 +1,9 @@
 import asyncio
 import json
 from abc import ABC, abstractmethod
-from contextlib import asynccontextmanager
+from contextlib import asynccontextmanager, contextmanager
 from dataclasses import dataclass, field
-from pathlib import Path
-from typing import TYPE_CHECKING, Any, AsyncGenerator, Optional
+from typing import TYPE_CHECKING, Any, AsyncGenerator, Generator, Optional
 
 from pydantic import Field, Secret
 
@@ -24,7 +23,7 @@ from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.utils import get_enhanced_element_id
 
 if TYPE_CHECKING:
-    from qdrant_client import AsyncQdrantClient
+    from qdrant_client import AsyncQdrantClient, QdrantClient
 
 
 class QdrantAccessConfig(AccessConfig, ABC):
@@ -42,8 +41,8 @@ class QdrantConnectionConfig(ConnectionConfig, ABC):
 
     @requires_dependencies(["qdrant_client"], extras="qdrant")
     @asynccontextmanager
-    async def get_client(self) -> AsyncGenerator["AsyncQdrantClient", None]:
-        from qdrant_client import AsyncQdrantClient
+    async def get_async_client(self) -> AsyncGenerator["AsyncQdrantClient", None]:
+        from qdrant_client import AsyncQdrantClient
 
         client_kwargs = self.get_client_kwargs()
         client = AsyncQdrantClient(**client_kwargs)
@@ -52,6 +51,18 @@ class QdrantConnectionConfig(ConnectionConfig, ABC):
         finally:
             await client.close()
 
+    @requires_dependencies(["qdrant_client"], extras="qdrant")
+    @contextmanager
+    def get_client(self) -> Generator["QdrantClient", None, None]:
+        from qdrant_client import QdrantClient
+
+        client_kwargs = self.get_client_kwargs()
+        client = QdrantClient(**client_kwargs)
+        try:
+            yield client
+        finally:
+            client.close()
+
 
 class QdrantUploadStagerConfig(UploadStagerConfig):
     pass
@@ -63,9 +74,9 @@ class QdrantUploadStager(UploadStager, ABC):
         default_factory=lambda: QdrantUploadStagerConfig()
     )
 
-    @staticmethod
-    def conform_dict(data: dict, file_data: FileData) -> dict:
+    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
         """Prepares dictionary in the format that Chroma requires"""
+        data = element_dict.copy()
         return {
             "id": get_enhanced_element_id(element_dict=data, file_data=file_data),
             "vector": data.pop("embeddings", {}),
@@ -80,26 +91,6 @@ class QdrantUploadStager(UploadStager, ABC):
             },
         }
 
-    def run(
-        self,
-        elements_filepath: Path,
-        file_data: FileData,
-        output_dir: Path,
-        output_filename: str,
-        **kwargs: Any,
-    ) -> Path:
-        with open(elements_filepath) as elements_file:
-            elements_contents = json.load(elements_file)
-
-        conformed_elements = [
-            self.conform_dict(data=element, file_data=file_data) for element in elements_contents
-        ]
-        output_path = Path(output_dir) / Path(f"{output_filename}.json")
-
-        with open(output_path, "w") as output_file:
-            json.dump(conformed_elements, output_file)
-        return output_path
-
 
 class QdrantUploaderConfig(UploaderConfig):
     collection_name: str = Field(description="Name of the collection.")
@@ -118,27 +109,27 @@ class QdrantUploader(Uploader, ABC):
 
     @DestinationConnectionError.wrap
     def precheck(self) -> None:
-
-
-
-
-
+        with self.connection_config.get_client() as client:
+            collections_response = client.get_collections()
+            collection_names = [c.name for c in collections_response.collections]
+            if self.upload_config.collection_name not in collection_names:
+                raise DestinationConnectionError(
+                    "collection '{}' not found: {}".format(
+                        self.upload_config.collection_name, ", ".join(collection_names)
+                    )
+                )
 
     def is_async(self):
         return True
 
-    async def run_async(
+    async def run_data_async(
         self,
-        path: Path,
+        data: list[dict],
         file_data: FileData,
         **kwargs: Any,
     ) -> None:
-        with path.open("r") as file:
-            elements: list[dict] = json.load(file)
-
-        logger.debug("Loaded %i elements from %s", len(elements), path)
 
-        batches = list(batch_generator(
+        batches = list(batch_generator(data, batch_size=self.upload_config.batch_size))
         logger.debug(
            "Elements split into %i batches of size %i.",
            len(batches),
@@ -156,7 +147,7 @@ class QdrantUploader(Uploader, ABC):
            len(points),
            self.upload_config.collection_name,
        )
-        async with self.connection_config.get_client() as async_client:
+        async with self.connection_config.get_async_client() as async_client:
            await async_client.upsert(
                self.upload_config.collection_name, points=points, wait=True
            )
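The Qdrant changes follow the pattern applied across connectors in this release: the per-connector run() boilerplate is deleted, conform_dict() becomes an instance-method hook, and uploaders gain in-memory entry points (run_data / run_data_async). The shared file handling now lives in the base class (unstructured_ingest/v2/interfaces/upload_stager.py, +70 -6 above). Below is a minimal sketch of what that shared flow plausibly looks like, assuming the helper names (get_data, get_output_path, write_output) seen in the sql.py hunks further down; the shipped base class may differ in detail.

# Hypothetical sketch of the shared UploadStager flow that replaces the
# removed connector-level run() methods; not the actual library code.
import json
from pathlib import Path
from typing import Any


class UploadStagerSketch:
    def conform_dict(self, element_dict: dict, file_data: Any) -> dict:
        # Connectors such as Qdrant now only override this hook.
        return element_dict

    def get_data(self, elements_filepath: Path) -> list[dict]:
        # Staged element files are JSON arrays of element dicts.
        with open(elements_filepath) as f:
            return json.load(f)

    def get_output_path(self, output_filename: str, output_dir: Path) -> Path:
        output_dir.mkdir(parents=True, exist_ok=True)
        return Path(output_dir) / f"{output_filename}.json"

    def write_output(self, output_path: Path, data: list[dict]) -> None:
        with open(output_path, "w") as f:
            json.dump(data, f)

    def run(
        self,
        elements_filepath: Path,
        file_data: Any,
        output_dir: Path,
        output_filename: str,
        **kwargs: Any,
    ) -> Path:
        # Read, conform each element, write the staged output once.
        elements = self.get_data(elements_filepath=elements_filepath)
        conformed = [
            self.conform_dict(element_dict=e, file_data=file_data) for e in elements
        ]
        output_path = self.get_output_path(
            output_filename=output_filename, output_dir=output_dir
        )
        self.write_output(output_path=output_path, data=conformed)
        return output_path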
unstructured_ingest/v2/processes/connectors/sql/sql.py

@@ -15,7 +15,7 @@ from dateutil import parser
 from pydantic import Field, Secret
 
 from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
-from unstructured_ingest.utils.data_prep import split_dataframe
+from unstructured_ingest.utils.data_prep import get_data_df, split_dataframe
 from unstructured_ingest.v2.constants import RECORD_ID_LABEL
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
@@ -238,27 +238,24 @@ class SQLUploadStagerConfig(UploadStagerConfig):
 class SQLUploadStager(UploadStager):
     upload_stager_config: SQLUploadStagerConfig = field(default_factory=SQLUploadStagerConfig)
 
-
-
-
-
-
-        metadata: dict[str, Any] = element.pop("metadata", {})
-        data_source = metadata.pop("data_source", {})
-        coordinates = metadata.pop("coordinates", {})
+    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
+        data = element_dict.copy()
+        metadata: dict[str, Any] = data.pop("metadata", {})
+        data_source = metadata.pop("data_source", {})
+        coordinates = metadata.pop("coordinates", {})
 
-
-
-
+        data.update(metadata)
+        data.update(data_source)
+        data.update(coordinates)
 
-
+        data["id"] = get_enhanced_element_id(element_dict=data, file_data=file_data)
 
-
-
-
-
+        # remove extraneous, not supported columns
+        element = {k: v for k, v in data.items() if k in _COLUMNS}
+        element[RECORD_ID_LABEL] = file_data.identifier
+        return element
 
-
+    def conform_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
         for column in filter(lambda x: x in df.columns, _DATE_COLUMNS):
             df[column] = df[column].apply(parse_date_string)
         for column in filter(
@@ -283,19 +280,19 @@ class SQLUploadStager(UploadStager):
         output_filename: str,
         **kwargs: Any,
     ) -> Path:
-        with open(elements_filepath) as elements_file:
-            elements_contents: list[dict] = json.load(elements_file)
+        elements_contents = self.get_data(elements_filepath=elements_filepath)
 
-        df =
-
-
-
-
-
-
+        df = pd.DataFrame(
+            data=[
+                self.conform_dict(element_dict=element_dict, file_data=file_data)
+                for element_dict in elements_contents
+            ]
+        )
+        df = self.conform_dataframe(df=df)
 
-
-
+        output_path = self.get_output_path(output_filename=output_filename, output_dir=output_dir)
+
+        self.write_output(output_path=output_path, data=df.to_dict(orient="records"))
         return output_path
 
 
@@ -361,8 +358,15 @@ class SQLUploader(Uploader):
         for column in missing_columns:
             df[column] = pd.Series()
 
-    def upload_contents(self, path: Path) -> None:
-
+    def upload_dataframe(self, df: pd.DataFrame, file_data: FileData) -> None:
+        if self.can_delete():
+            self.delete_by_record_id(file_data=file_data)
+        else:
+            logger.warning(
+                f"table doesn't contain expected "
+                f"record id column "
+                f"{self.upload_config.record_id_key}, skipping delete"
+            )
         df.replace({np.nan: None}, inplace=True)
         self._fit_to_schema(df=df, columns=self.get_table_columns())
 
@@ -411,13 +415,10 @@ class SQLUploader(Uploader):
         rowcount = cursor.rowcount
         logger.info(f"deleted {rowcount} rows from table {self.upload_config.table_name}")
 
+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        df = pd.DataFrame(data)
+        self.upload_dataframe(df=df, file_data=file_data)
+
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        if self.can_delete():
-            self.delete_by_record_id(file_data=file_data)
-        else:
-            logger.warning(
-                f"table doesn't contain expected "
-                f"record id column "
-                f"{self.upload_config.record_id_key}, skipping delete"
-            )
-        self.upload_contents(path=path)
+        df = get_data_df(path=path)
+        self.upload_dataframe(df=df, file_data=file_data)
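With this refactor the SQL uploader has two entry points: run_data() for element dicts already in memory, and run() for staged files, which are now loaded through the new get_data_df helper imported from unstructured_ingest.utils.data_prep (+36 lines above). The following is a rough, hypothetical stand-in for that helper, written under the assumption that staged files are either JSON arrays or newline-delimited JSON (ndjson is a new core dependency in this release); the real function may behave differently.

# Hypothetical stand-in for unstructured_ingest.utils.data_prep.get_data_df;
# the shipped helper may differ (e.g. dedicated NDJSON handling, chunked reads).
import json
from pathlib import Path

import pandas as pd


def get_data_df(path: Path) -> pd.DataFrame:
    # A JSON array starts with "["; otherwise treat the file as one dict per line.
    with open(path) as f:
        first = f.read(1)
        f.seek(0)
        if first == "[":
            records = json.load(f)
        else:
            records = [json.loads(line) for line in f if line.strip()]
    return pd.DataFrame(data=records)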
unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py

@@ -3,7 +3,6 @@ from abc import ABC, abstractmethod
 from contextlib import contextmanager
 from dataclasses import dataclass, field
 from datetime import date, datetime
-from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional
 
 from dateutil import parser
@@ -74,11 +73,11 @@ class WeaviateUploadStager(UploadStager):
             logger.debug(f"date {date_string} string not a timestamp: {e}")
         return parser.parse(date_string)
 
-    @classmethod
-    def conform_dict(cls, data: dict, file_data: FileData) -> dict:
+    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
         """
         Updates the element dictionary to conform to the Weaviate schema
         """
+        data = element_dict.copy()
         working_data = data.copy()
         # Dict as string formatting
         if (
@@ -111,7 +110,7 @@ class WeaviateUploadStager(UploadStager):
             .get("data_source", {})
             .get("date_created")
         ):
-            working_data["metadata"]["data_source"]["date_created"] = cls.parse_date_string(
+            working_data["metadata"]["data_source"]["date_created"] = self.parse_date_string(
                 date_created
             ).strftime(
                 "%Y-%m-%dT%H:%M:%S.%fZ",
@@ -122,7 +121,7 @@ class WeaviateUploadStager(UploadStager):
             .get("data_source", {})
             .get("date_modified")
         ):
-            working_data["metadata"]["data_source"]["date_modified"] = cls.parse_date_string(
+            working_data["metadata"]["data_source"]["date_modified"] = self.parse_date_string(
                 date_modified
             ).strftime(
                 "%Y-%m-%dT%H:%M:%S.%fZ",
@@ -133,14 +132,14 @@ class WeaviateUploadStager(UploadStager):
             .get("data_source", {})
             .get("date_processed")
         ):
-            working_data["metadata"]["data_source"]["date_processed"] = cls.parse_date_string(
+            working_data["metadata"]["data_source"]["date_processed"] = self.parse_date_string(
                 date_processed
             ).strftime(
                 "%Y-%m-%dT%H:%M:%S.%fZ",
             )
 
         if last_modified := working_data.get("metadata", {}).get("last_modified"):
-            working_data["metadata"]["last_modified"] = cls.parse_date_string(
+            working_data["metadata"]["last_modified"] = self.parse_date_string(
                 last_modified
             ).strftime(
                 "%Y-%m-%dT%H:%M:%S.%fZ",
@@ -159,25 +158,6 @@ class WeaviateUploadStager(UploadStager):
         working_data[RECORD_ID_LABEL] = file_data.identifier
         return working_data
 
-    def run(
-        self,
-        elements_filepath: Path,
-        file_data: FileData,
-        output_dir: Path,
-        output_filename: str,
-        **kwargs: Any,
-    ) -> Path:
-        with open(elements_filepath) as elements_file:
-            elements_contents = json.load(elements_file)
-        updated_elements = [
-            self.conform_dict(data=element, file_data=file_data) for element in elements_contents
-        ]
-        output_path = Path(output_dir) / Path(f"{output_filename}.json")
-        output_path.parent.mkdir(parents=True, exist_ok=True)
-        with open(output_path, "w") as output_file:
-            json.dump(updated_elements, output_file, indent=2)
-        return output_path
-
 
 class WeaviateUploaderConfig(UploaderConfig):
     collection: str = Field(description="The name of the collection this object belongs to")
@@ -268,18 +248,16 @@ class WeaviateUploader(Uploader, ABC):
             if not resp.failed and not resp.successful:
                 break
 
-    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        with path.open("r") as file:
-            elements_dict = json.load(file)
+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
        logger.info(
-            f"writing {len(elements_dict)} objects to destination "
+            f"writing {len(data)} objects to destination "
            f"class {self.connection_config.access_config} "
        )
 
        with self.connection_config.get_client() as weaviate_client:
            self.delete_by_record_id(client=weaviate_client, file_data=file_data)
            with self.upload_config.get_batch_client(client=weaviate_client) as batch_client:
-                for e in elements_dict:
+                for e in data:
                    vector = e.pop("embeddings", None)
                    batch_client.add_object(
                        collection=self.upload_config.collection,
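The Weaviate stager keeps its date handling: string timestamps are run through parse_date_string (now called via self instead of cls) and re-serialized with the "%Y-%m-%dT%H:%M:%S.%fZ" format Weaviate expects. A standalone illustration of that conversion using dateutil directly; to_weaviate_timestamp is a hypothetical helper name, and the stager's real parse_date_string also tolerates numeric Unix timestamps.

# Illustration only: mirrors the RFC 3339 target format used by the stager.
from dateutil import parser


def to_weaviate_timestamp(date_string: str) -> str:
    # Parse a date string and re-emit it in Weaviate's expected format.
    return parser.parse(date_string).strftime("%Y-%m-%dT%H:%M:%S.%fZ")


print(to_weaviate_timestamp("2024-05-01 13:45:00"))  # 2024-05-01T13:45:00.000000Z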
{unstructured_ingest-0.3.7.dist-info → unstructured_ingest-0.3.9.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: unstructured-ingest
-Version: 0.3.7
+Version: 0.3.9
 Summary: A library that prepares raw documents for downstream ML tasks.
 Home-page: https://github.com/Unstructured-IO/unstructured-ingest
 Author: Unstructured Technologies
@@ -22,13 +22,14 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.9.0,<3.13
 Description-Content-Type: text/markdown
 License-File: LICENSE.md
-Requires-Dist:
-Requires-Dist: tqdm
+Requires-Dist: click
 Requires-Dist: pydantic>=2.7
-Requires-Dist:
+Requires-Dist: pandas
 Requires-Dist: dataclasses-json
-Requires-Dist: click
 Requires-Dist: python-dateutil
+Requires-Dist: tqdm
+Requires-Dist: ndjson
+Requires-Dist: opentelemetry-sdk
 Provides-Extra: airtable
 Requires-Dist: pyairtable; extra == "airtable"
 Provides-Extra: astradb
@@ -41,8 +42,8 @@ Requires-Dist: azure-search-documents; extra == "azure-ai-search"
 Provides-Extra: bedrock
 Requires-Dist: boto3; extra == "bedrock"
 Provides-Extra: biomed
-Requires-Dist: requests; extra == "biomed"
 Requires-Dist: bs4; extra == "biomed"
+Requires-Dist: requests; extra == "biomed"
 Provides-Extra: box
 Requires-Dist: fsspec; extra == "box"
 Requires-Dist: boxfs; extra == "box"
@@ -69,8 +70,8 @@ Requires-Dist: unstructured[docx]; extra == "doc"
 Provides-Extra: docx
 Requires-Dist: unstructured[docx]; extra == "docx"
 Provides-Extra: dropbox
-Requires-Dist: dropboxdrivefs; extra == "dropbox"
 Requires-Dist: fsspec; extra == "dropbox"
+Requires-Dist: dropboxdrivefs; extra == "dropbox"
 Provides-Extra: duckdb
 Requires-Dist: duckdb; extra == "duckdb"
 Provides-Extra: elasticsearch
@@ -80,8 +81,8 @@ Requires-Dist: sentence-transformers; extra == "embed-huggingface"
 Provides-Extra: embed-mixedbreadai
 Requires-Dist: mixedbread-ai; extra == "embed-mixedbreadai"
 Provides-Extra: embed-octoai
-Requires-Dist: tiktoken; extra == "embed-octoai"
 Requires-Dist: openai; extra == "embed-octoai"
+Requires-Dist: tiktoken; extra == "embed-octoai"
 Provides-Extra: embed-vertexai
 Requires-Dist: vertexai; extra == "embed-vertexai"
 Provides-Extra: embed-voyageai
@@ -89,19 +90,19 @@ Requires-Dist: voyageai; extra == "embed-voyageai"
 Provides-Extra: epub
 Requires-Dist: unstructured[epub]; extra == "epub"
 Provides-Extra: gcs
-Requires-Dist: gcsfs; extra == "gcs"
 Requires-Dist: fsspec; extra == "gcs"
 Requires-Dist: bs4; extra == "gcs"
+Requires-Dist: gcsfs; extra == "gcs"
 Provides-Extra: github
-Requires-Dist: requests; extra == "github"
 Requires-Dist: pygithub>1.58.0; extra == "github"
+Requires-Dist: requests; extra == "github"
 Provides-Extra: gitlab
 Requires-Dist: python-gitlab; extra == "gitlab"
 Provides-Extra: google-drive
 Requires-Dist: google-api-python-client; extra == "google-drive"
 Provides-Extra: hubspot
-Requires-Dist: hubspot-api-client; extra == "hubspot"
 Requires-Dist: urllib3; extra == "hubspot"
+Requires-Dist: hubspot-api-client; extra == "hubspot"
 Provides-Extra: jira
 Requires-Dist: atlassian-python-api; extra == "jira"
 Provides-Extra: kafka
@@ -118,11 +119,14 @@ Provides-Extra: mongodb
 Requires-Dist: pymongo; extra == "mongodb"
 Provides-Extra: msg
 Requires-Dist: unstructured[msg]; extra == "msg"
+Provides-Extra: neo4j
+Requires-Dist: neo4j; extra == "neo4j"
+Requires-Dist: cymple; extra == "neo4j"
 Provides-Extra: notion
-Requires-Dist: notion-client; extra == "notion"
-Requires-Dist: backoff; extra == "notion"
 Requires-Dist: htmlBuilder; extra == "notion"
 Requires-Dist: httpx; extra == "notion"
+Requires-Dist: backoff; extra == "notion"
+Requires-Dist: notion-client; extra == "notion"
 Provides-Extra: odt
 Requires-Dist: unstructured[odt]; extra == "odt"
 Provides-Extra: onedrive
@@ -130,8 +134,8 @@ Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
 Requires-Dist: msal; extra == "onedrive"
 Requires-Dist: bs4; extra == "onedrive"
 Provides-Extra: openai
-Requires-Dist: tiktoken; extra == "openai"
 Requires-Dist: openai; extra == "openai"
+Requires-Dist: tiktoken; extra == "openai"
 Provides-Extra: opensearch
 Requires-Dist: opensearch-py; extra == "opensearch"
 Provides-Extra: org
@@ -160,13 +164,13 @@ Requires-Dist: unstructured[rst]; extra == "rst"
 Provides-Extra: rtf
 Requires-Dist: unstructured[rtf]; extra == "rtf"
 Provides-Extra: s3
-Requires-Dist: s3fs; extra == "s3"
 Requires-Dist: fsspec; extra == "s3"
+Requires-Dist: s3fs; extra == "s3"
 Provides-Extra: salesforce
 Requires-Dist: simple-salesforce; extra == "salesforce"
 Provides-Extra: sftp
-Requires-Dist: paramiko; extra == "sftp"
 Requires-Dist: fsspec; extra == "sftp"
+Requires-Dist: paramiko; extra == "sftp"
 Provides-Extra: sharepoint
 Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
 Requires-Dist: msal; extra == "sharepoint"
@@ -175,8 +179,8 @@ Requires-Dist: singlestoredb; extra == "singlestore"
 Provides-Extra: slack
 Requires-Dist: slack-sdk[optional]; extra == "slack"
 Provides-Extra: snowflake
-Requires-Dist: psycopg2-binary; extra == "snowflake"
 Requires-Dist: snowflake-connector-python; extra == "snowflake"
+Requires-Dist: psycopg2-binary; extra == "snowflake"
 Provides-Extra: togetherai
 Requires-Dist: together; extra == "togetherai"
 Provides-Extra: tsv