unstructured-ingest 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of unstructured-ingest might be problematic.
- test/integration/connectors/elasticsearch/__init__.py +0 -0
- test/integration/connectors/elasticsearch/conftest.py +34 -0
- test/integration/connectors/elasticsearch/test_elasticsearch.py +308 -0
- test/integration/connectors/elasticsearch/test_opensearch.py +302 -0
- test/integration/connectors/sql/test_postgres.py +10 -4
- test/integration/connectors/sql/test_singlestore.py +8 -4
- test/integration/connectors/sql/test_snowflake.py +10 -6
- test/integration/connectors/sql/test_sqlite.py +4 -4
- test/integration/connectors/test_astradb.py +50 -3
- test/integration/connectors/test_delta_table.py +46 -0
- test/integration/connectors/test_kafka.py +40 -6
- test/integration/connectors/test_lancedb.py +210 -0
- test/integration/connectors/test_milvus.py +141 -0
- test/integration/connectors/test_mongodb.py +332 -0
- test/integration/connectors/test_pinecone.py +53 -1
- test/integration/connectors/utils/docker.py +81 -15
- test/integration/connectors/utils/validation.py +10 -0
- test/integration/connectors/weaviate/__init__.py +0 -0
- test/integration/connectors/weaviate/conftest.py +15 -0
- test/integration/connectors/weaviate/test_local.py +131 -0
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/pipeline/reformat/embedding.py +1 -1
- unstructured_ingest/utils/data_prep.py +9 -1
- unstructured_ingest/v2/processes/connectors/__init__.py +3 -16
- unstructured_ingest/v2/processes/connectors/astradb.py +2 -2
- unstructured_ingest/v2/processes/connectors/azure_ai_search.py +4 -0
- unstructured_ingest/v2/processes/connectors/delta_table.py +20 -4
- unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
- unstructured_ingest/v2/processes/connectors/{elasticsearch.py → elasticsearch/elasticsearch.py} +92 -46
- unstructured_ingest/v2/processes/connectors/{opensearch.py → elasticsearch/opensearch.py} +1 -1
- unstructured_ingest/v2/processes/connectors/google_drive.py +1 -1
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +6 -0
- unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +17 -0
- unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
- unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
- unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
- unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +161 -0
- unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
- unstructured_ingest/v2/processes/connectors/milvus.py +72 -27
- unstructured_ingest/v2/processes/connectors/mongodb.py +122 -111
- unstructured_ingest/v2/processes/connectors/pinecone.py +24 -7
- unstructured_ingest/v2/processes/connectors/sql/sql.py +97 -26
- unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +25 -0
- unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +164 -0
- unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
- unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
- unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +299 -0
- {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.2.dist-info}/METADATA +19 -19
- {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.2.dist-info}/RECORD +54 -33
- unstructured_ingest/v2/processes/connectors/weaviate.py +0 -242
- /test/integration/connectors/{test_azure_cog_search.py → test_azure_ai_search.py} +0 -0
- {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.2.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.2.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.2.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.2.dist-info}/top_level.txt +0 -0
test/integration/connectors/weaviate/test_local.py
@@ -0,0 +1,131 @@
+import json
+import time
+from pathlib import Path
+
+import pytest
+import requests
+import weaviate
+from weaviate.client import WeaviateClient
+
+from test.integration.connectors.utils.constants import DESTINATION_TAG
+from test.integration.connectors.utils.docker import container_context
+from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
+from unstructured_ingest.v2.processes.connectors.weaviate.local import (
+    CONNECTOR_TYPE,
+    LocalWeaviateConnectionConfig,
+    LocalWeaviateUploader,
+    LocalWeaviateUploaderConfig,
+    LocalWeaviateUploadStager,
+)
+
+COLLECTION_NAME = "elements"
+
+
+def wait_for_container(timeout: int = 10, interval: int = 1) -> None:
+    start_time = time.time()
+    while time.time() - start_time < timeout:
+        try:
+            requests.get("http://localhost:8080/v1/.well-known/read")
+            return
+        except Exception as e:
+            print(f"Failed to validate container healthy, sleeping for {interval} seconds: {e}")
+            time.sleep(interval)
+    raise TimeoutError("Docker container never came up healthy")
+
+
+@pytest.fixture
+def collection(collections_schema_config: dict) -> str:
+    with container_context(
+        image="semitechnologies/weaviate:1.27.3",
+        ports={8080: 8080, 50051: 50051},
+    ):
+        wait_for_container()
+        with weaviate.connect_to_local() as weaviate_client:
+            weaviate_client.collections.create_from_dict(config=collections_schema_config)
+        yield COLLECTION_NAME
+
+
+def get_count(client: WeaviateClient) -> int:
+    collection = client.collections.get(COLLECTION_NAME)
+    resp = collection.aggregate.over_all(total_count=True)
+    return resp.total_count
+
+
+def validate_count(expected_count: int, retries: int = 10, interval: int = 1) -> None:
+    with weaviate.connect_to_local() as weaviate_client:
+        current_count = get_count(client=weaviate_client)
+        retry_count = 0
+        while current_count != expected_count and retry_count < retries:
+            retry_count += 1
+            time.sleep(interval)
+            current_count = get_count(client=weaviate_client)
+        assert current_count == expected_count, (
+            f"Expected count ({expected_count}) doesn't match how "
+            f"much came back from collection: {current_count}"
+        )
+
+
+def run_uploader_and_validate(
+    uploader: LocalWeaviateUploader, path: Path, file_data: FileData, expected_count: int
+):
+    uploader.precheck()
+    uploader.run(path=path, file_data=file_data)
+    validate_count(expected_count=expected_count)
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+def test_weaviate_local_destination(upload_file: Path, collection: str, tmp_path: Path):
+    file_data = FileData(
+        source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
+        connector_type=CONNECTOR_TYPE,
+        identifier="mock file data",
+    )
+    stager = LocalWeaviateUploadStager()
+
+    staged_filepath = stager.run(
+        elements_filepath=upload_file,
+        file_data=file_data,
+        output_dir=tmp_path,
+        output_filename=upload_file.name,
+    )
+    dynamic_uploader = LocalWeaviateUploader(
+        upload_config=LocalWeaviateUploaderConfig(
+            collection=COLLECTION_NAME,
+        ),
+        connection_config=LocalWeaviateConnectionConfig(),
+    )
+    fixed_size_uploader = LocalWeaviateUploader(
+        upload_config=LocalWeaviateUploaderConfig(
+            collection=COLLECTION_NAME, batch_size=10, dynamic_batch=False
+        ),
+        connection_config=LocalWeaviateConnectionConfig(),
+    )
+    rate_limited_uploader = LocalWeaviateUploader(
+        upload_config=LocalWeaviateUploaderConfig(
+            collection=COLLECTION_NAME, requests_per_minute=50, dynamic_batch=False
+        ),
+        connection_config=LocalWeaviateConnectionConfig(),
+    )
+    with staged_filepath.open() as f:
+        staged_elements = json.load(f)
+    expected_count = len(staged_elements)
+
+    run_uploader_and_validate(
+        uploader=dynamic_uploader,
+        path=staged_filepath,
+        file_data=file_data,
+        expected_count=expected_count,
+    )
+    run_uploader_and_validate(
+        uploader=fixed_size_uploader,
+        path=staged_filepath,
+        file_data=file_data,
+        expected_count=expected_count,
+    )
+    run_uploader_and_validate(
+        uploader=rate_limited_uploader,
+        path=staged_filepath,
+        file_data=file_data,
+        expected_count=expected_count,
+    )
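
Note: the test above leans on fixtures defined outside this file: upload_file from the shared integration conftest, and collections_schema_config from the new weaviate/conftest.py (+15 lines, not expanded in this diff). A plausible minimal shape for the latter, using Weaviate's dict-based collection schema; the property list is an illustrative guess, not the released fixture:

import pytest

@pytest.fixture
def collections_schema_config() -> dict:
    # hypothetical schema matching COLLECTION_NAME = "elements" in the test
    return {
        "class": "elements",
        "properties": [{"name": "text", "dataType": ["text"]}],
        "vectorizer": "none",
    }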

unstructured_ingest/__version__.py
@@ -1 +1 @@
-__version__ = "0.3.0"  # pragma: no cover
+__version__ = "0.3.2"  # pragma: no cover

unstructured_ingest/utils/data_prep.py
@@ -1,7 +1,9 @@
 import itertools
 import json
 from datetime import datetime
-from typing import Any, Iterable, Optional, Sequence, TypeVar, cast
+from typing import Any, Generator, Iterable, Optional, Sequence, TypeVar, cast
+
+import pandas as pd
 
 DATE_FORMATS = ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d+%H:%M:%S", "%Y-%m-%dT%H:%M:%S%z")
 
@@ -9,6 +11,12 @@ T = TypeVar("T")
 IterableT = Iterable[T]
 
 
+def split_dataframe(df: pd.DataFrame, chunk_size: int = 100) -> Generator[pd.DataFrame, None, None]:
+    num_chunks = len(df) // chunk_size + 1
+    for i in range(num_chunks):
+        yield df[i * chunk_size : (i + 1) * chunk_size]
+
+
 def batch_generator(iterable: IterableT, batch_size: int = 100) -> IterableT:
     """A helper function to break an iterable into batches of size batch_size."""
    it = iter(iterable)
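
Note: a quick usage sketch of the new split_dataframe helper (pandas is now a module-level import of data_prep, so it must be installed); the sample frame below is illustrative:

import pandas as pd

from unstructured_ingest.utils.data_prep import split_dataframe

df = pd.DataFrame({"text": [f"element-{i}" for i in range(250)]})
print([len(chunk) for chunk in split_dataframe(df, chunk_size=100)])  # [100, 100, 50]

Because num_chunks is computed as len(df) // chunk_size + 1, a frame whose length divides evenly (e.g. 200 rows with chunk_size=100) yields a trailing empty chunk, so callers that care should skip empty frames.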

unstructured_ingest/v2/processes/connectors/__init__.py
@@ -1,10 +1,13 @@
 from __future__ import annotations
 
 import unstructured_ingest.v2.processes.connectors.databricks  # noqa: F401
+import unstructured_ingest.v2.processes.connectors.elasticsearch  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.fsspec  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.kafka  # noqa: F401
+import unstructured_ingest.v2.processes.connectors.lancedb  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.qdrant  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.sql  # noqa: F401
+import unstructured_ingest.v2.processes.connectors.weaviate  # noqa: F401
 from unstructured_ingest.v2.processes.connector_registry import (
     add_destination_entry,
     add_source_entry,
@@ -24,8 +27,6 @@ from .couchbase import CONNECTOR_TYPE as COUCHBASE_CONNECTOR_TYPE
 from .couchbase import couchbase_destination_entry, couchbase_source_entry
 from .delta_table import CONNECTOR_TYPE as DELTA_TABLE_CONNECTOR_TYPE
 from .delta_table import delta_table_destination_entry
-from .elasticsearch import CONNECTOR_TYPE as ELASTICSEARCH_CONNECTOR_TYPE
-from .elasticsearch import elasticsearch_destination_entry, elasticsearch_source_entry
 from .gitlab import CONNECTOR_TYPE as GITLAB_CONNECTOR_TYPE
 from .gitlab import gitlab_source_entry
 from .google_drive import CONNECTOR_TYPE as GOOGLE_DRIVE_CONNECTOR_TYPE
@@ -40,8 +41,6 @@ from .mongodb import CONNECTOR_TYPE as MONGODB_CONNECTOR_TYPE
 from .mongodb import mongodb_destination_entry, mongodb_source_entry
 from .onedrive import CONNECTOR_TYPE as ONEDRIVE_CONNECTOR_TYPE
 from .onedrive import onedrive_destination_entry, onedrive_source_entry
-from .opensearch import CONNECTOR_TYPE as OPENSEARCH_CONNECTOR_TYPE
-from .opensearch import opensearch_destination_entry, opensearch_source_entry
 from .outlook import CONNECTOR_TYPE as OUTLOOK_CONNECTOR_TYPE
 from .outlook import outlook_source_entry
 from .pinecone import CONNECTOR_TYPE as PINECONE_CONNECTOR_TYPE
@@ -52,8 +51,6 @@ from .sharepoint import CONNECTOR_TYPE as SHAREPOINT_CONNECTOR_TYPE
 from .sharepoint import sharepoint_source_entry
 from .slack import CONNECTOR_TYPE as SLACK_CONNECTOR_TYPE
 from .slack import slack_source_entry
-from .weaviate import CONNECTOR_TYPE as WEAVIATE_CONNECTOR_TYPE
-from .weaviate import weaviate_destination_entry
 
 add_source_entry(source_type=ASTRA_DB_CONNECTOR_TYPE, entry=astra_db_source_entry)
 add_destination_entry(destination_type=ASTRA_DB_CONNECTOR_TYPE, entry=astra_db_destination_entry)
@@ -67,10 +64,6 @@ add_destination_entry(
     destination_type=DELTA_TABLE_CONNECTOR_TYPE, entry=delta_table_destination_entry
 )
 
-add_source_entry(source_type=ELASTICSEARCH_CONNECTOR_TYPE, entry=elasticsearch_source_entry)
-add_destination_entry(
-    destination_type=ELASTICSEARCH_CONNECTOR_TYPE, entry=elasticsearch_destination_entry
-)
 
 add_source_entry(source_type=GOOGLE_DRIVE_CONNECTOR_TYPE, entry=google_drive_source_entry)
 
@@ -80,15 +73,9 @@ add_destination_entry(destination_type=LOCAL_CONNECTOR_TYPE, entry=local_destination_entry)
 add_source_entry(source_type=ONEDRIVE_CONNECTOR_TYPE, entry=onedrive_source_entry)
 add_destination_entry(destination_type=ONEDRIVE_CONNECTOR_TYPE, entry=onedrive_destination_entry)
 
-add_source_entry(source_type=OPENSEARCH_CONNECTOR_TYPE, entry=opensearch_source_entry)
-add_destination_entry(
-    destination_type=OPENSEARCH_CONNECTOR_TYPE, entry=opensearch_destination_entry
-)
 
 add_source_entry(source_type=SALESFORCE_CONNECTOR_TYPE, entry=salesforce_source_entry)
 
-add_destination_entry(destination_type=WEAVIATE_CONNECTOR_TYPE, entry=weaviate_destination_entry)
-
 add_destination_entry(destination_type=MONGODB_CONNECTOR_TYPE, entry=mongodb_destination_entry)
 add_source_entry(source_type=MONGODB_CONNECTOR_TYPE, entry=mongodb_source_entry)
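
Note: the churn above is a relocation, not a removal. The elasticsearch, opensearch, and weaviate entries now register themselves when their subpackage is imported, so the top-level module only needs the bare noqa-tagged imports. The pattern, reduced to plain Python with a toy registry (names below are illustrative, not the library's API):

destination_registry: dict[str, str] = {}

def add_destination_entry(destination_type: str, entry: str) -> None:
    destination_registry[destination_type] = entry

# conceptually what `import ...connectors.weaviate` now triggers at import time:
add_destination_entry("weaviate", "weaviate destination entry")
print(sorted(destination_registry))  # populated purely by import side effects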

unstructured_ingest/v2/processes/connectors/astradb.py
@@ -170,7 +170,7 @@ class AstraDBIndexer(Indexer):
 
     def precheck(self) -> None:
         try:
-            self.get_collection()
+            self.get_collection().options()
         except Exception as e:
             logger.error(f"Failed to validate connection {e}", exc_info=True)
             raise SourceConnectionError(f"failed to validate connection: {e}")
@@ -345,7 +345,7 @@ class AstraDBUploader(Uploader):
                 connection_config=self.connection_config,
                 collection_name=self.upload_config.collection_name,
                 keyspace=self.upload_config.keyspace,
-            )
+            ).options()
         except Exception as e:
             logger.error(f"Failed to validate connection {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")
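
Note: both prechecks now chain .options() onto the collection handle. Getting a handle is typically lazy, so the old code could pass precheck without ever talking to the server; .options() forces a metadata round trip. The idea in miniature, with LazyCollection as a hypothetical stand-in for the real Astra DB collection:

class LazyCollection:
    def __init__(self, name: str, reachable: bool) -> None:
        self.name, self.reachable = name, reachable  # no I/O on construction

    def options(self) -> dict:
        # stands in for the server round trip that actually validates access
        if not self.reachable:
            raise ConnectionError(f"collection {self.name} unreachable")
        return {"name": self.name}

LazyCollection("elements", reachable=False)  # old-style precheck: silently "passes"
try:
    LazyCollection("elements", reachable=False).options()  # new-style: fails loudly
except ConnectionError as err:
    print(err)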

unstructured_ingest/v2/processes/connectors/azure_ai_search.py
@@ -155,6 +155,10 @@ class AzureAISearchUploadStager(UploadStager):
             self.conform_dict(data=element, file_data=file_data) for element in elements_contents
         ]
 
+        if Path(output_filename).suffix != ".json":
+            output_filename = f"{output_filename}.json"
+        else:
+            output_filename = f"{Path(output_filename).stem}.json"
         output_path = Path(output_dir) / Path(f"{output_filename}.json")
         output_path.parent.mkdir(parents=True, exist_ok=True)
         with open(output_path, "w") as output_file:
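
Note: the added branch normalizes the staged output name, but the unchanged context line still wraps it in f"{output_filename}.json", so as shown a stager given "report.pdf" would write "report.pdf.json.json"; the renamed elasticsearch module below avoids this by joining output_filename directly. The added branch in isolation:

from pathlib import Path

def normalize(output_filename: str) -> str:
    if Path(output_filename).suffix != ".json":
        return f"{output_filename}.json"
    return f"{Path(output_filename).stem}.json"

print(normalize("report.pdf"))   # report.pdf.json
print(normalize("report.json"))  # report.json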

unstructured_ingest/v2/processes/connectors/delta_table.py
@@ -1,7 +1,8 @@
 import json
 import os
+import traceback
 from dataclasses import dataclass, field
-from multiprocessing import Process
+from multiprocessing import Process, Queue
 from pathlib import Path
 from typing import Any, Optional
 from urllib.parse import urlparse
@@ -27,6 +28,15 @@ from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
 CONNECTOR_TYPE = "delta_table"
 
 
+def write_deltalake_with_error_handling(queue, **kwargs):
+    from deltalake.writer import write_deltalake
+
+    try:
+        write_deltalake(**kwargs)
+    except Exception:
+        queue.put(traceback.format_exc())
+
+
 class DeltaTableAccessConfig(AccessConfig):
     aws_access_key_id: Optional[str] = Field(default=None, description="AWS Access Key Id")
     aws_secret_access_key: Optional[str] = Field(default=None, description="AWS Secret Access Key")
@@ -157,7 +167,6 @@ class DeltaTableUploader(Uploader):
 
     @requires_dependencies(["deltalake"], extras="delta-table")
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        from deltalake.writer import write_deltalake
 
         df = self.read_dataframe(path)
         updated_upload_path = os.path.join(
@@ -176,17 +185,24 @@ class DeltaTableUploader(Uploader):
             "mode": "overwrite",
             "storage_options": storage_options,
         }
+        queue = Queue()
         # NOTE: deltalake writer on Linux sometimes can finish but still trigger a SIGABRT and cause
         # ingest to fail, even though all tasks are completed normally. Putting the writer into a
         # process mitigates this issue by ensuring python interpreter waits properly for deltalake's
         # rust backend to finish
         writer = Process(
-            target=write_deltalake,
-            kwargs=writer_kwargs,
+            target=write_deltalake_with_error_handling,
+            kwargs={"queue": queue, **writer_kwargs},
         )
         writer.start()
         writer.join()
 
+        # Check if the queue has any exception message
+        if not queue.empty():
+            error_message = queue.get()
+            logger.error(f"Exception occurred in write_deltalake: {error_message}")
+            raise RuntimeError(f"Error in write_deltalake: {error_message}")
+
 
 delta_table_destination_entry = DestinationRegistryEntry(
     connection_config=DeltaTableConnectionConfig,
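
Note: the same error-propagation pattern in isolation: run a crash-prone writer in a child process (here to sidestep the SIGABRT noted above) and surface its traceback to the parent through a Queue. flaky_writer is a hypothetical stand-in for write_deltalake:

import traceback
from multiprocessing import Process, Queue

def flaky_writer(queue: Queue, **kwargs) -> None:
    try:
        raise ValueError(f"cannot write {kwargs['table_uri']}")
    except Exception:
        queue.put(traceback.format_exc())  # stringify: traceback objects don't pickle

if __name__ == "__main__":
    queue = Queue()
    writer = Process(target=flaky_writer, kwargs={"queue": queue, "table_uri": "s3://bucket/table"})
    writer.start()
    writer.join()
    if not queue.empty():  # mirror the connector: an empty queue means success
        raise RuntimeError(f"Error in child process: {queue.get()}")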

unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py
@@ -0,0 +1,19 @@
+from unstructured_ingest.v2.processes.connector_registry import (
+    add_destination_entry,
+    add_source_entry,
+)
+
+from .elasticsearch import CONNECTOR_TYPE as ELASTICSEARCH_CONNECTOR_TYPE
+from .elasticsearch import elasticsearch_destination_entry, elasticsearch_source_entry
+from .opensearch import CONNECTOR_TYPE as OPENSEARCH_CONNECTOR_TYPE
+from .opensearch import opensearch_destination_entry, opensearch_source_entry
+
+add_source_entry(source_type=ELASTICSEARCH_CONNECTOR_TYPE, entry=elasticsearch_source_entry)
+add_destination_entry(
+    destination_type=ELASTICSEARCH_CONNECTOR_TYPE, entry=elasticsearch_destination_entry
+)
+
+add_source_entry(source_type=OPENSEARCH_CONNECTOR_TYPE, entry=opensearch_source_entry)
+add_destination_entry(
+    destination_type=OPENSEARCH_CONNECTOR_TYPE, entry=opensearch_destination_entry
+)

unstructured_ingest/v2/processes/connectors/{elasticsearch.py → elasticsearch/elasticsearch.py}
RENAMED
@@ -2,6 +2,7 @@ import hashlib
 import json
 import sys
 import uuid
+from contextlib import contextmanager
 from dataclasses import dataclass, field
 from pathlib import Path
 from time import time
@@ -13,9 +14,11 @@ from unstructured_ingest.error import (
     DestinationConnectionError,
     SourceConnectionError,
     SourceConnectionNetworkError,
+    WriteError,
 )
 from unstructured_ingest.utils.data_prep import flatten_dict, generator_batching_wbytes
 from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.constants import RECORD_ID_LABEL
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
@@ -26,6 +29,7 @@ from unstructured_ingest.v2.interfaces import (
     FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
+    SourceIdentifiers,
     Uploader,
     UploaderConfig,
     UploadStager,
@@ -116,19 +120,12 @@ class ElasticsearchConnectionConfig(ConnectionConfig):
         return client_kwargs
 
     @requires_dependencies(["elasticsearch"], extras="elasticsearch")
-    def get_client(self) -> "ElasticsearchClient":
+    @contextmanager
+    def get_client(self) -> Generator["ElasticsearchClient", None, None]:
         from elasticsearch import Elasticsearch as ElasticsearchClient
 
-        client = ElasticsearchClient(**self.get_client_kwargs())
-        self.check_connection(client=client)
-        return client
-
-    def check_connection(self, client: "ElasticsearchClient"):
-        try:
-            client.perform_request("HEAD", "/", headers={"accept": "application/json"})
-        except Exception as e:
-            logger.error(f"failed to validate connection: {e}", exc_info=True)
-            raise SourceConnectionError(f"failed to validate connection: {e}")
+        with ElasticsearchClient(**self.get_client_kwargs()) as client:
+            yield client
 
 
 class ElasticsearchIndexerConfig(IndexerConfig):
@@ -144,7 +141,16 @@ class ElasticsearchIndexer(Indexer):
 
     def precheck(self) -> None:
         try:
-            self.connection_config.get_client()
+            with self.connection_config.get_client() as client:
+                if not client.ping():
+                    raise SourceConnectionError("cluster not detected")
+                indices = client.indices.get_alias(index="*")
+                if self.index_config.index_name not in indices:
+                    raise SourceConnectionError(
+                        "index {} not found: {}".format(
+                            self.index_config.index_name, ", ".join(indices.keys())
+                        )
+                    )
         except Exception as e:
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise SourceConnectionError(f"failed to validate connection: {e}")
@@ -160,15 +166,15 @@ class ElasticsearchIndexer(Indexer):
         scan = self.load_scan()
 
         scan_query: dict = {"stored_fields": [], "query": {"match_all": {}}}
-        client = self.connection_config.get_client()
-        hits = scan(
-            client,
-            query=scan_query,
-            scroll="1m",
-            index=self.index_config.index_name,
-        )
+        with self.connection_config.get_client() as client:
+            hits = scan(
+                client,
+                query=scan_query,
+                scroll="1m",
+                index=self.index_config.index_name,
+            )
 
-        return {hit["_id"] for hit in hits}
+            return {hit["_id"] for hit in hits}
 
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
         all_ids = self._get_doc_ids()
@@ -257,6 +263,7 @@ class ElasticsearchDownloader(Downloader):
             file_data=FileData(
                 identifier=filename_id,
                 connector_type=CONNECTOR_TYPE,
+                source_identifiers=SourceIdentifiers(filename=filename, fullpath=filename),
                 metadata=FileDataSourceMetadata(
                     version=str(result["_version"]) if "_version" in result else None,
                     date_processed=str(time()),
@@ -318,7 +325,7 @@ class ElasticsearchUploadStagerConfig(UploadStagerConfig):
 class ElasticsearchUploadStager(UploadStager):
     upload_stager_config: ElasticsearchUploadStagerConfig
 
-    def conform_dict(self, data: dict) -> dict:
+    def conform_dict(self, data: dict, file_data: FileData) -> dict:
         resp = {
             "_index": self.upload_stager_config.index_name,
             "_id": str(uuid.uuid4()),
@@ -327,6 +334,7 @@ class ElasticsearchUploadStager(UploadStager):
                 "embeddings": data.pop("embeddings", None),
                 "text": data.pop("text", None),
                 "type": data.pop("type", None),
+                RECORD_ID_LABEL: file_data.identifier,
             },
         }
         if "metadata" in data and isinstance(data["metadata"], dict):
@@ -343,10 +351,17 @@ class ElasticsearchUploadStager(UploadStager):
     ) -> Path:
         with open(elements_filepath) as elements_file:
             elements_contents = json.load(elements_file)
-        conformed_elements = [self.conform_dict(data=element) for element in elements_contents]
-        output_path = Path(output_dir) / Path(f"{output_filename}.json")
+        conformed_elements = [
+            self.conform_dict(data=element, file_data=file_data) for element in elements_contents
+        ]
+        if Path(output_filename).suffix != ".json":
+            output_filename = f"{output_filename}.json"
+        else:
+            output_filename = f"{Path(output_filename).stem}.json"
+        output_path = Path(output_dir) / output_filename
+        output_path.parent.mkdir(parents=True, exist_ok=True)
         with open(output_path, "w") as output_file:
-            json.dump(conformed_elements, output_file)
+            json.dump(conformed_elements, output_file, indent=2)
         return output_path
 
 
@@ -363,6 +378,10 @@ class ElasticsearchUploaderConfig(UploaderConfig):
     num_threads: int = Field(
         default=4, description="Number of threads to be used while uploading content"
     )
+    record_id_key: str = Field(
+        default=RECORD_ID_LABEL,
+        description="searchable key to find entries for the same record on previous runs",
+    )
 
 
 @dataclass
@@ -373,7 +392,16 @@ class ElasticsearchUploader(Uploader):
 
     def precheck(self) -> None:
         try:
-            self.connection_config.get_client()
+            with self.connection_config.get_client() as client:
+                if not client.ping():
+                    raise DestinationConnectionError("cluster not detected")
+                indices = client.indices.get_alias(index="*")
+                if self.upload_config.index_name not in indices:
+                    raise SourceConnectionError(
+                        "index {} not found: {}".format(
+                            self.upload_config.index_name, ", ".join(indices.keys())
+                        )
+                    )
         except Exception as e:
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")
@@ -384,6 +412,23 @@ class ElasticsearchUploader(Uploader):
 
         return parallel_bulk
 
+    def delete_by_record_id(self, client, file_data: FileData) -> None:
+        logger.debug(
+            f"deleting any content with metadata {RECORD_ID_LABEL}={file_data.identifier} "
+            f"from {self.upload_config.index_name} index"
+        )
+        delete_resp = client.delete_by_query(
+            index=self.upload_config.index_name,
+            body={"query": {"match": {self.upload_config.record_id_key: file_data.identifier}}},
+        )
+        logger.info(
+            "deleted {} records from index {}".format(
+                delete_resp["deleted"], self.upload_config.index_name
+            )
+        )
+        if failures := delete_resp.get("failures"):
+            raise WriteError(f"failed to delete records: {failures}")
+
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         parallel_bulk = self.load_parallel_bulk()
         with path.open("r") as file:
@@ -397,28 +442,29 @@ class ElasticsearchUploader(Uploader):
             f"{self.upload_config.num_threads} (number of) threads"
         )
 
-        client = self.connection_config.get_client()
-        if not client.indices.exists(index=self.upload_config.index_name):
-            logger.warning(
-                f"{(self.__class__.__name__).replace('Uploader', '')} index does not exist: "
-                f"{self.upload_config.index_name}. "
-                f"This may cause issues when uploading."
-            )
-        for batch in generator_batching_wbytes(
-            elements_dict, batch_size_limit_bytes=self.upload_config.batch_size_bytes
-        ):
-            for success, info in parallel_bulk(
-                client=client,
-                actions=batch,
-                thread_count=self.upload_config.num_threads,
+        with self.connection_config.get_client() as client:
+            self.delete_by_record_id(client=client, file_data=file_data)
+            if not client.indices.exists(index=self.upload_config.index_name):
+                logger.warning(
+                    f"{(self.__class__.__name__).replace('Uploader', '')} index does not exist: "
+                    f"{self.upload_config.index_name}. "
+                    f"This may cause issues when uploading."
+                )
+            for batch in generator_batching_wbytes(
+                elements_dict, batch_size_limit_bytes=self.upload_config.batch_size_bytes
             ):
-                if not success:
-                    logger.error(
-                        "upload failed for a batch in "
-                        f"{(self.__class__.__name__).replace('Uploader', '')} "
-                        "destination connector:",
-                        info,
-                    )
+                for success, info in parallel_bulk(
+                    client=client,
+                    actions=batch,
+                    thread_count=self.upload_config.num_threads,
+                ):
+                    if not success:
+                        logger.error(
+                            "upload failed for a batch in "
+                            f"{(self.__class__.__name__).replace('Uploader', '')} "
+                            "destination connector:",
+                            info,
+                        )
 
 
 elasticsearch_source_entry = SourceRegistryEntry(
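
Note: the central refactor in this file is get_client() becoming a @contextmanager, so every caller (precheck, indexing, upload) opens and closes the client deterministically instead of holding a long-lived instance. The shape in miniature, with DummyClient as a stand-in for elasticsearch.Elasticsearch (which is itself usable as a context manager):

from contextlib import contextmanager
from typing import Generator

class DummyClient:
    def __enter__(self) -> "DummyClient":
        print("connection opened")
        return self

    def __exit__(self, *exc) -> None:
        print("connection closed")

    def ping(self) -> bool:
        return True

@contextmanager
def get_client() -> Generator[DummyClient, None, None]:
    with DummyClient() as client:
        yield client

with get_client() as client:
    assert client.ping()  # "connection opened" and "connection closed" bracket the block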

unstructured_ingest/v2/processes/connectors/{opensearch.py → elasticsearch/opensearch.py}
RENAMED
@@ -17,7 +17,7 @@ from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
 )
-from unstructured_ingest.v2.processes.connectors.elasticsearch import (
+from unstructured_ingest.v2.processes.connectors.elasticsearch.elasticsearch import (
     ElasticsearchDownloader,
     ElasticsearchDownloaderConfig,
     ElasticsearchIndexer,

unstructured_ingest/v2/processes/connectors/google_drive.py
@@ -161,7 +161,7 @@ class GoogleDriveIndexer(Indexer):
             and isinstance(parent_root_path, str)
         ):
             fullpath = f"{parent_path}/{filename}"
-            rel_path = fullpath.replace(parent_root_path, "")[1:]
+            rel_path = Path(fullpath).relative_to(parent_root_path).as_posix()
             source_identifiers = SourceIdentifiers(
                 filename=filename, fullpath=fullpath, rel_path=rel_path
             )
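
Note: assuming the truncated removed line was a string replace (as the leftover "rel_path = fullpath." suggests), the motivation is that replace() corrupts paths whenever the root segment recurs deeper in the tree, while Path.relative_to() is exact:

from pathlib import Path

parent_root_path = "drive"
fullpath = "drive/reports/drive/q3.pdf"

print(fullpath.replace(parent_root_path, "")[1:])               # reports//q3.pdf  (mangled)
print(Path(fullpath).relative_to(parent_root_path).as_posix())  # reports/drive/q3.pdf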

unstructured_ingest/v2/processes/connectors/kafka/kafka.py
@@ -161,6 +161,12 @@ class KafkaIndexer(Indexer, ABC):
             current_topics = [
                 topic for topic in cluster_meta.topics if topic != "__consumer_offsets"
             ]
+            if self.index_config.topic not in current_topics:
+                raise SourceConnectionError(
+                    "expected topic {} not detected in cluster: {}".format(
+                        self.index_config.topic, ", ".join(current_topics)
+                    )
+                )
             logger.info(f"successfully checked available topics: {current_topics}")
         except Exception as e:
             logger.error(f"failed to validate connection: {e}", exc_info=True)

unstructured_ingest/v2/processes/connectors/lancedb/__init__.py
@@ -0,0 +1,17 @@
+from __future__ import annotations
+
+from unstructured_ingest.v2.processes.connector_registry import add_destination_entry
+
+from .aws import CONNECTOR_TYPE as LANCEDB_S3_CONNECTOR_TYPE
+from .aws import lancedb_aws_destination_entry
+from .azure import CONNECTOR_TYPE as LANCEDB_AZURE_CONNECTOR_TYPE
+from .azure import lancedb_azure_destination_entry
+from .gcp import CONNECTOR_TYPE as LANCEDB_GCS_CONNECTOR_TYPE
+from .gcp import lancedb_gcp_destination_entry
+from .local import CONNECTOR_TYPE as LANCEDB_LOCAL_CONNECTOR_TYPE
+from .local import lancedb_local_destination_entry
+
+add_destination_entry(LANCEDB_S3_CONNECTOR_TYPE, lancedb_aws_destination_entry)
+add_destination_entry(LANCEDB_AZURE_CONNECTOR_TYPE, lancedb_azure_destination_entry)
+add_destination_entry(LANCEDB_GCS_CONNECTOR_TYPE, lancedb_gcp_destination_entry)
+add_destination_entry(LANCEDB_LOCAL_CONNECTOR_TYPE, lancedb_local_destination_entry)