unstructured-ingest 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of unstructured-ingest might be problematic.

Files changed (55)
  1. test/integration/connectors/elasticsearch/__init__.py +0 -0
  2. test/integration/connectors/elasticsearch/conftest.py +34 -0
  3. test/integration/connectors/elasticsearch/test_elasticsearch.py +308 -0
  4. test/integration/connectors/elasticsearch/test_opensearch.py +302 -0
  5. test/integration/connectors/sql/test_postgres.py +10 -4
  6. test/integration/connectors/sql/test_singlestore.py +8 -4
  7. test/integration/connectors/sql/test_snowflake.py +10 -6
  8. test/integration/connectors/sql/test_sqlite.py +4 -4
  9. test/integration/connectors/test_astradb.py +50 -3
  10. test/integration/connectors/test_delta_table.py +46 -0
  11. test/integration/connectors/test_kafka.py +40 -6
  12. test/integration/connectors/test_lancedb.py +210 -0
  13. test/integration/connectors/test_milvus.py +141 -0
  14. test/integration/connectors/test_mongodb.py +332 -0
  15. test/integration/connectors/test_pinecone.py +53 -1
  16. test/integration/connectors/utils/docker.py +81 -15
  17. test/integration/connectors/utils/validation.py +10 -0
  18. test/integration/connectors/weaviate/__init__.py +0 -0
  19. test/integration/connectors/weaviate/conftest.py +15 -0
  20. test/integration/connectors/weaviate/test_local.py +131 -0
  21. unstructured_ingest/__version__.py +1 -1
  22. unstructured_ingest/pipeline/reformat/embedding.py +1 -1
  23. unstructured_ingest/utils/data_prep.py +9 -1
  24. unstructured_ingest/v2/processes/connectors/__init__.py +3 -16
  25. unstructured_ingest/v2/processes/connectors/astradb.py +2 -2
  26. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +4 -0
  27. unstructured_ingest/v2/processes/connectors/delta_table.py +20 -4
  28. unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
  29. unstructured_ingest/v2/processes/connectors/{elasticsearch.py → elasticsearch/elasticsearch.py} +92 -46
  30. unstructured_ingest/v2/processes/connectors/{opensearch.py → elasticsearch/opensearch.py} +1 -1
  31. unstructured_ingest/v2/processes/connectors/google_drive.py +1 -1
  32. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +6 -0
  33. unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +17 -0
  34. unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
  35. unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
  36. unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
  37. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +161 -0
  38. unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
  39. unstructured_ingest/v2/processes/connectors/milvus.py +72 -27
  40. unstructured_ingest/v2/processes/connectors/mongodb.py +122 -111
  41. unstructured_ingest/v2/processes/connectors/pinecone.py +24 -7
  42. unstructured_ingest/v2/processes/connectors/sql/sql.py +97 -26
  43. unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +25 -0
  44. unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +164 -0
  45. unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
  46. unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
  47. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +299 -0
  48. {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.2.dist-info}/METADATA +19 -19
  49. {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.2.dist-info}/RECORD +54 -33
  50. unstructured_ingest/v2/processes/connectors/weaviate.py +0 -242
  51. /test/integration/connectors/{test_azure_cog_search.py → test_azure_ai_search.py} +0 -0
  52. {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.2.dist-info}/LICENSE.md +0 -0
  53. {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.2.dist-info}/WHEEL +0 -0
  54. {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.2.dist-info}/entry_points.txt +0 -0
  55. {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,131 @@
+ import json
+ import time
+ from pathlib import Path
+
+ import pytest
+ import requests
+ import weaviate
+ from weaviate.client import WeaviateClient
+
+ from test.integration.connectors.utils.constants import DESTINATION_TAG
+ from test.integration.connectors.utils.docker import container_context
+ from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
+ from unstructured_ingest.v2.processes.connectors.weaviate.local import (
+     CONNECTOR_TYPE,
+     LocalWeaviateConnectionConfig,
+     LocalWeaviateUploader,
+     LocalWeaviateUploaderConfig,
+     LocalWeaviateUploadStager,
+ )
+
+ COLLECTION_NAME = "elements"
+
+
+ def wait_for_container(timeout: int = 10, interval: int = 1) -> None:
+     start_time = time.time()
+     while time.time() - start_time < timeout:
+         try:
+             requests.get("http://localhost:8080/v1/.well-known/read")
+             return
+         except Exception as e:
+             print(f"Failed to validate container healthy, sleeping for {interval} seconds: {e}")
+             time.sleep(interval)
+     raise TimeoutError("Docker container never came up healthy")
+
+
+ @pytest.fixture
+ def collection(collections_schema_config: dict) -> str:
+     with container_context(
+         image="semitechnologies/weaviate:1.27.3",
+         ports={8080: 8080, 50051: 50051},
+     ):
+         wait_for_container()
+         with weaviate.connect_to_local() as weaviate_client:
+             weaviate_client.collections.create_from_dict(config=collections_schema_config)
+         yield COLLECTION_NAME
+
+
+ def get_count(client: WeaviateClient) -> int:
+     collection = client.collections.get(COLLECTION_NAME)
+     resp = collection.aggregate.over_all(total_count=True)
+     return resp.total_count
+
+
+ def validate_count(expected_count: int, retries: int = 10, interval: int = 1) -> None:
+     with weaviate.connect_to_local() as weaviate_client:
+         current_count = get_count(client=weaviate_client)
+         retry_count = 0
+         while current_count != expected_count and retry_count < retries:
+             retry_count += 1
+             time.sleep(interval)
+             current_count = get_count(client=weaviate_client)
+         assert current_count == expected_count, (
+             f"Expected count ({expected_count}) doesn't match how "
+             f"much came back from collection: {current_count}"
+         )
+
+
+ def run_uploader_and_validate(
+     uploader: LocalWeaviateUploader, path: Path, file_data: FileData, expected_count: int
+ ):
+     uploader.precheck()
+     uploader.run(path=path, file_data=file_data)
+     validate_count(expected_count=expected_count)
+
+
+ @pytest.mark.asyncio
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
+ def test_weaviate_local_destination(upload_file: Path, collection: str, tmp_path: Path):
+     file_data = FileData(
+         source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
+         connector_type=CONNECTOR_TYPE,
+         identifier="mock file data",
+     )
+     stager = LocalWeaviateUploadStager()
+
+     staged_filepath = stager.run(
+         elements_filepath=upload_file,
+         file_data=file_data,
+         output_dir=tmp_path,
+         output_filename=upload_file.name,
+     )
+     dynamic_uploader = LocalWeaviateUploader(
+         upload_config=LocalWeaviateUploaderConfig(
+             collection=COLLECTION_NAME,
+         ),
+         connection_config=LocalWeaviateConnectionConfig(),
+     )
+     fixed_size_uploader = LocalWeaviateUploader(
+         upload_config=LocalWeaviateUploaderConfig(
+             collection=COLLECTION_NAME, batch_size=10, dynamic_batch=False
+         ),
+         connection_config=LocalWeaviateConnectionConfig(),
+     )
+     rate_limited_uploader = LocalWeaviateUploader(
+         upload_config=LocalWeaviateUploaderConfig(
+             collection=COLLECTION_NAME, requests_per_minute=50, dynamic_batch=False
+         ),
+         connection_config=LocalWeaviateConnectionConfig(),
+     )
+     with staged_filepath.open() as f:
+         staged_elements = json.load(f)
+     expected_count = len(staged_elements)
+
+     run_uploader_and_validate(
+         uploader=dynamic_uploader,
+         path=staged_filepath,
+         file_data=file_data,
+         expected_count=expected_count,
+     )
+     run_uploader_and_validate(
+         uploader=fixed_size_uploader,
+         path=staged_filepath,
+         file_data=file_data,
+         expected_count=expected_count,
+     )
+     run_uploader_and_validate(
+         uploader=rate_limited_uploader,
+         path=staged_filepath,
+         file_data=file_data,
+         expected_count=expected_count,
+     )
@@ -1 +1 @@
- __version__ = "0.3.0" # pragma: no cover
+ __version__ = "0.3.2" # pragma: no cover
@@ -61,4 +61,4 @@ class Embedder(ReformatNode):
          return None
 
      def get_path(self) -> Path:
-         return (Path(self.pipeline_context.work_dir) / "embedded").resolve()
+         return (Path(self.pipeline_context.work_dir) / "embedded.py").resolve()
@@ -1,7 +1,9 @@
  import itertools
  import json
  from datetime import datetime
- from typing import Any, Iterable, Optional, Sequence, TypeVar, cast
+ from typing import Any, Generator, Iterable, Optional, Sequence, TypeVar, cast
+
+ import pandas as pd
 
  DATE_FORMATS = ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d+%H:%M:%S", "%Y-%m-%dT%H:%M:%S%z")
 
@@ -9,6 +11,12 @@ T = TypeVar("T")
  IterableT = Iterable[T]
 
 
+ def split_dataframe(df: pd.DataFrame, chunk_size: int = 100) -> Generator[pd.DataFrame, None, None]:
+     num_chunks = len(df) // chunk_size + 1
+     for i in range(num_chunks):
+         yield df[i * chunk_size : (i + 1) * chunk_size]
+
+
  def batch_generator(iterable: IterableT, batch_size: int = 100) -> IterableT:
      """A helper function to break an iterable into batches of size batch_size."""
      it = iter(iterable)
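
Note: a minimal usage sketch of the new split_dataframe helper in data_prep (the sample dataframe below is illustrative, not part of the package):

import pandas as pd

from unstructured_ingest.utils.data_prep import split_dataframe

# 250 rows come back as slices of at most 100 rows each: 100, 100, 50.
df = pd.DataFrame({"id": range(250), "text": ["chunk"] * 250})
for chunk in split_dataframe(df, chunk_size=100):
    print(len(chunk))
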
@@ -1,10 +1,13 @@
  from __future__ import annotations
 
  import unstructured_ingest.v2.processes.connectors.databricks # noqa: F401
+ import unstructured_ingest.v2.processes.connectors.elasticsearch # noqa: F401
  import unstructured_ingest.v2.processes.connectors.fsspec # noqa: F401
  import unstructured_ingest.v2.processes.connectors.kafka # noqa: F401
+ import unstructured_ingest.v2.processes.connectors.lancedb # noqa: F401
  import unstructured_ingest.v2.processes.connectors.qdrant # noqa: F401
  import unstructured_ingest.v2.processes.connectors.sql # noqa: F401
+ import unstructured_ingest.v2.processes.connectors.weaviate # noqa: F401
  from unstructured_ingest.v2.processes.connector_registry import (
      add_destination_entry,
      add_source_entry,
@@ -24,8 +27,6 @@ from .couchbase import CONNECTOR_TYPE as COUCHBASE_CONNECTOR_TYPE
  from .couchbase import couchbase_destination_entry, couchbase_source_entry
  from .delta_table import CONNECTOR_TYPE as DELTA_TABLE_CONNECTOR_TYPE
  from .delta_table import delta_table_destination_entry
- from .elasticsearch import CONNECTOR_TYPE as ELASTICSEARCH_CONNECTOR_TYPE
- from .elasticsearch import elasticsearch_destination_entry, elasticsearch_source_entry
  from .gitlab import CONNECTOR_TYPE as GITLAB_CONNECTOR_TYPE
  from .gitlab import gitlab_source_entry
  from .google_drive import CONNECTOR_TYPE as GOOGLE_DRIVE_CONNECTOR_TYPE
@@ -40,8 +41,6 @@ from .mongodb import CONNECTOR_TYPE as MONGODB_CONNECTOR_TYPE
  from .mongodb import mongodb_destination_entry, mongodb_source_entry
  from .onedrive import CONNECTOR_TYPE as ONEDRIVE_CONNECTOR_TYPE
  from .onedrive import onedrive_destination_entry, onedrive_source_entry
- from .opensearch import CONNECTOR_TYPE as OPENSEARCH_CONNECTOR_TYPE
- from .opensearch import opensearch_destination_entry, opensearch_source_entry
  from .outlook import CONNECTOR_TYPE as OUTLOOK_CONNECTOR_TYPE
  from .outlook import outlook_source_entry
  from .pinecone import CONNECTOR_TYPE as PINECONE_CONNECTOR_TYPE
@@ -52,8 +51,6 @@ from .sharepoint import CONNECTOR_TYPE as SHAREPOINT_CONNECTOR_TYPE
  from .sharepoint import sharepoint_source_entry
  from .slack import CONNECTOR_TYPE as SLACK_CONNECTOR_TYPE
  from .slack import slack_source_entry
- from .weaviate import CONNECTOR_TYPE as WEAVIATE_CONNECTOR_TYPE
- from .weaviate import weaviate_destination_entry
 
  add_source_entry(source_type=ASTRA_DB_CONNECTOR_TYPE, entry=astra_db_source_entry)
  add_destination_entry(destination_type=ASTRA_DB_CONNECTOR_TYPE, entry=astra_db_destination_entry)
@@ -67,10 +64,6 @@ add_destination_entry(
      destination_type=DELTA_TABLE_CONNECTOR_TYPE, entry=delta_table_destination_entry
  )
 
- add_source_entry(source_type=ELASTICSEARCH_CONNECTOR_TYPE, entry=elasticsearch_source_entry)
- add_destination_entry(
-     destination_type=ELASTICSEARCH_CONNECTOR_TYPE, entry=elasticsearch_destination_entry
- )
 
  add_source_entry(source_type=GOOGLE_DRIVE_CONNECTOR_TYPE, entry=google_drive_source_entry)
 
@@ -80,15 +73,9 @@ add_destination_entry(destination_type=LOCAL_CONNECTOR_TYPE, entry=local_destina
  add_source_entry(source_type=ONEDRIVE_CONNECTOR_TYPE, entry=onedrive_source_entry)
  add_destination_entry(destination_type=ONEDRIVE_CONNECTOR_TYPE, entry=onedrive_destination_entry)
 
- add_source_entry(source_type=OPENSEARCH_CONNECTOR_TYPE, entry=opensearch_source_entry)
- add_destination_entry(
-     destination_type=OPENSEARCH_CONNECTOR_TYPE, entry=opensearch_destination_entry
- )
 
  add_source_entry(source_type=SALESFORCE_CONNECTOR_TYPE, entry=salesforce_source_entry)
 
- add_destination_entry(destination_type=WEAVIATE_CONNECTOR_TYPE, entry=weaviate_destination_entry)
-
  add_destination_entry(destination_type=MONGODB_CONNECTOR_TYPE, entry=mongodb_destination_entry)
  add_source_entry(source_type=MONGODB_CONNECTOR_TYPE, entry=mongodb_source_entry)
 
@@ -170,7 +170,7 @@ class AstraDBIndexer(Indexer):
 
      def precheck(self) -> None:
          try:
-             self.get_collection()
+             self.get_collection().options()
          except Exception as e:
              logger.error(f"Failed to validate connection {e}", exc_info=True)
              raise SourceConnectionError(f"failed to validate connection: {e}")
@@ -345,7 +345,7 @@ class AstraDBUploader(Uploader):
                  connection_config=self.connection_config,
                  collection_name=self.upload_config.collection_name,
                  keyspace=self.upload_config.keyspace,
-             )
+             ).options()
          except Exception as e:
              logger.error(f"Failed to validate connection {e}", exc_info=True)
              raise DestinationConnectionError(f"failed to validate connection: {e}")
@@ -155,6 +155,10 @@ class AzureAISearchUploadStager(UploadStager):
              self.conform_dict(data=element, file_data=file_data) for element in elements_contents
          ]
 
+         if Path(output_filename).suffix != ".json":
+             output_filename = f"{output_filename}.json"
+         else:
+             output_filename = f"{Path(output_filename).stem}.json"
          output_path = Path(output_dir) / Path(f"{output_filename}.json")
          output_path.parent.mkdir(parents=True, exist_ok=True)
          with open(output_path, "w") as output_file:
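
Note: the stager now normalizes the staged output filename so it ends in a single .json suffix. A small sketch of that logic in isolation (normalize_json_filename is an illustrative name, not a function shipped in the package):

from pathlib import Path

def normalize_json_filename(output_filename: str) -> str:
    # Append ".json" when it is missing; otherwise rebuild "<stem>.json".
    if Path(output_filename).suffix != ".json":
        return f"{output_filename}.json"
    return f"{Path(output_filename).stem}.json"

assert normalize_json_filename("report") == "report.json"
assert normalize_json_filename("report.json") == "report.json"
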
@@ -1,7 +1,8 @@
  import json
  import os
+ import traceback
  from dataclasses import dataclass, field
- from multiprocessing import Process
+ from multiprocessing import Process, Queue
  from pathlib import Path
  from typing import Any, Optional
  from urllib.parse import urlparse
@@ -27,6 +28,15 @@ from unstructured_ingest.v2.processes.connector_registry import DestinationRegis
  CONNECTOR_TYPE = "delta_table"
 
 
+ def write_deltalake_with_error_handling(queue, **kwargs):
+     from deltalake.writer import write_deltalake
+
+     try:
+         write_deltalake(**kwargs)
+     except Exception:
+         queue.put(traceback.format_exc())
+
+
  class DeltaTableAccessConfig(AccessConfig):
      aws_access_key_id: Optional[str] = Field(default=None, description="AWS Access Key Id")
      aws_secret_access_key: Optional[str] = Field(default=None, description="AWS Secret Access Key")
@@ -157,7 +167,6 @@ class DeltaTableUploader(Uploader):
 
      @requires_dependencies(["deltalake"], extras="delta-table")
      def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-         from deltalake.writer import write_deltalake
 
          df = self.read_dataframe(path)
          updated_upload_path = os.path.join(
@@ -176,17 +185,24 @@ class DeltaTableUploader(Uploader):
              "mode": "overwrite",
              "storage_options": storage_options,
          }
+         queue = Queue()
          # NOTE: deltalake writer on Linux sometimes can finish but still trigger a SIGABRT and cause
          # ingest to fail, even though all tasks are completed normally. Putting the writer into a
          # process mitigates this issue by ensuring python interpreter waits properly for deltalake's
          # rust backend to finish
          writer = Process(
-             target=write_deltalake,
-             kwargs=writer_kwargs,
+             target=write_deltalake_with_error_handling,
+             kwargs={"queue": queue, **writer_kwargs},
          )
          writer.start()
          writer.join()
 
+         # Check if the queue has any exception message
+         if not queue.empty():
+             error_message = queue.get()
+             logger.error(f"Exception occurred in write_deltalake: {error_message}")
+             raise RuntimeError(f"Error in write_deltalake: {error_message}")
+
 
  delta_table_destination_entry = DestinationRegistryEntry(
      connection_config=DeltaTableConnectionConfig,
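
Note: exceptions raised inside a multiprocessing.Process do not propagate to the parent, which is why the writer now reports failures through a Queue drained after join(). A self-contained sketch of that pattern (flaky_write is an illustrative stand-in for write_deltalake, not part of the package):

import traceback
from multiprocessing import Process, Queue


def flaky_write(queue: Queue, **kwargs) -> None:
    # Stand-in for the real writer: push the formatted traceback on failure.
    try:
        raise ValueError(f"could not write {kwargs['table_uri']}")
    except Exception:
        queue.put(traceback.format_exc())


if __name__ == "__main__":
    queue = Queue()
    writer = Process(target=flaky_write, kwargs={"queue": queue, "table_uri": "s3://bucket/table"})
    writer.start()
    writer.join()
    # A non-empty queue means the child failed; surface it in the parent.
    if not queue.empty():
        raise RuntimeError(f"Error in child process: {queue.get()}")
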
@@ -0,0 +1,19 @@
+ from unstructured_ingest.v2.processes.connector_registry import (
+     add_destination_entry,
+     add_source_entry,
+ )
+
+ from .elasticsearch import CONNECTOR_TYPE as ELASTICSEARCH_CONNECTOR_TYPE
+ from .elasticsearch import elasticsearch_destination_entry, elasticsearch_source_entry
+ from .opensearch import CONNECTOR_TYPE as OPENSEARCH_CONNECTOR_TYPE
+ from .opensearch import opensearch_destination_entry, opensearch_source_entry
+
+ add_source_entry(source_type=ELASTICSEARCH_CONNECTOR_TYPE, entry=elasticsearch_source_entry)
+ add_destination_entry(
+     destination_type=ELASTICSEARCH_CONNECTOR_TYPE, entry=elasticsearch_destination_entry
+ )
+
+ add_source_entry(source_type=OPENSEARCH_CONNECTOR_TYPE, entry=opensearch_source_entry)
+ add_destination_entry(
+     destination_type=OPENSEARCH_CONNECTOR_TYPE, entry=opensearch_destination_entry
+ )
@@ -2,6 +2,7 @@ import hashlib
  import json
  import sys
  import uuid
+ from contextlib import contextmanager
  from dataclasses import dataclass, field
  from pathlib import Path
  from time import time
@@ -13,9 +14,11 @@ from unstructured_ingest.error import (
      DestinationConnectionError,
      SourceConnectionError,
      SourceConnectionNetworkError,
+     WriteError,
  )
  from unstructured_ingest.utils.data_prep import flatten_dict, generator_batching_wbytes
  from unstructured_ingest.utils.dep_check import requires_dependencies
+ from unstructured_ingest.v2.constants import RECORD_ID_LABEL
  from unstructured_ingest.v2.interfaces import (
      AccessConfig,
      ConnectionConfig,
@@ -26,6 +29,7 @@ from unstructured_ingest.v2.interfaces import (
      FileDataSourceMetadata,
      Indexer,
      IndexerConfig,
+     SourceIdentifiers,
      Uploader,
      UploaderConfig,
      UploadStager,
@@ -116,19 +120,12 @@ class ElasticsearchConnectionConfig(ConnectionConfig):
          return client_kwargs
 
      @requires_dependencies(["elasticsearch"], extras="elasticsearch")
-     def get_client(self) -> "ElasticsearchClient":
+     @contextmanager
+     def get_client(self) -> Generator["ElasticsearchClient", None, None]:
          from elasticsearch import Elasticsearch as ElasticsearchClient
 
-         client = ElasticsearchClient(**self.get_client_kwargs())
-         self.check_connection(client=client)
-         return client
-
-     def check_connection(self, client: "ElasticsearchClient"):
-         try:
-             client.perform_request("HEAD", "/", headers={"accept": "application/json"})
-         except Exception as e:
-             logger.error(f"failed to validate connection: {e}", exc_info=True)
-             raise SourceConnectionError(f"failed to validate connection: {e}")
+         with ElasticsearchClient(**self.get_client_kwargs()) as client:
+             yield client
 
 
  class ElasticsearchIndexerConfig(IndexerConfig):
@@ -144,7 +141,16 @@ class ElasticsearchIndexer(Indexer):
 
      def precheck(self) -> None:
          try:
-             self.connection_config.get_client()
+             with self.connection_config.get_client() as client:
+                 if not client.ping():
+                     raise SourceConnectionError("cluster not detected")
+                 indices = client.indices.get_alias(index="*")
+                 if self.index_config.index_name not in indices:
+                     raise SourceConnectionError(
+                         "index {} not found: {}".format(
+                             self.index_config.index_name, ", ".join(indices.keys())
+                         )
+                     )
          except Exception as e:
              logger.error(f"failed to validate connection: {e}", exc_info=True)
              raise SourceConnectionError(f"failed to validate connection: {e}")
@@ -160,15 +166,15 @@ class ElasticsearchIndexer(Indexer):
          scan = self.load_scan()
 
          scan_query: dict = {"stored_fields": [], "query": {"match_all": {}}}
-         client = self.connection_config.get_client()
-         hits = scan(
-             client,
-             query=scan_query,
-             scroll="1m",
-             index=self.index_config.index_name,
-         )
+         with self.connection_config.get_client() as client:
+             hits = scan(
+                 client,
+                 query=scan_query,
+                 scroll="1m",
+                 index=self.index_config.index_name,
+             )
 
-         return {hit["_id"] for hit in hits}
+             return {hit["_id"] for hit in hits}
 
      def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
          all_ids = self._get_doc_ids()
@@ -257,6 +263,7 @@ class ElasticsearchDownloader(Downloader):
              file_data=FileData(
                  identifier=filename_id,
                  connector_type=CONNECTOR_TYPE,
+                 source_identifiers=SourceIdentifiers(filename=filename, fullpath=filename),
                  metadata=FileDataSourceMetadata(
                      version=str(result["_version"]) if "_version" in result else None,
                      date_processed=str(time()),
@@ -318,7 +325,7 @@ class ElasticsearchUploadStagerConfig(UploadStagerConfig):
  class ElasticsearchUploadStager(UploadStager):
      upload_stager_config: ElasticsearchUploadStagerConfig
 
-     def conform_dict(self, data: dict) -> dict:
+     def conform_dict(self, data: dict, file_data: FileData) -> dict:
          resp = {
              "_index": self.upload_stager_config.index_name,
              "_id": str(uuid.uuid4()),
@@ -327,6 +334,7 @@ class ElasticsearchUploadStager(UploadStager):
                  "embeddings": data.pop("embeddings", None),
                  "text": data.pop("text", None),
                  "type": data.pop("type", None),
+                 RECORD_ID_LABEL: file_data.identifier,
              },
          }
          if "metadata" in data and isinstance(data["metadata"], dict):
@@ -343,10 +351,17 @@ class ElasticsearchUploadStager(UploadStager):
      ) -> Path:
          with open(elements_filepath) as elements_file:
              elements_contents = json.load(elements_file)
-         conformed_elements = [self.conform_dict(data=element) for element in elements_contents]
-         output_path = Path(output_dir) / Path(f"{output_filename}.json")
+         conformed_elements = [
+             self.conform_dict(data=element, file_data=file_data) for element in elements_contents
+         ]
+         if Path(output_filename).suffix != ".json":
+             output_filename = f"{output_filename}.json"
+         else:
+             output_filename = f"{Path(output_filename).stem}.json"
+         output_path = Path(output_dir) / output_filename
+         output_path.parent.mkdir(parents=True, exist_ok=True)
          with open(output_path, "w") as output_file:
-             json.dump(conformed_elements, output_file)
+             json.dump(conformed_elements, output_file, indent=2)
          return output_path
 
 
@@ -363,6 +378,10 @@ class ElasticsearchUploaderConfig(UploaderConfig):
      num_threads: int = Field(
          default=4, description="Number of threads to be used while uploading content"
      )
+     record_id_key: str = Field(
+         default=RECORD_ID_LABEL,
+         description="searchable key to find entries for the same record on previous runs",
+     )
 
 
  @dataclass
@@ -373,7 +392,16 @@ class ElasticsearchUploader(Uploader):
 
      def precheck(self) -> None:
          try:
-             self.connection_config.get_client()
+             with self.connection_config.get_client() as client:
+                 if not client.ping():
+                     raise DestinationConnectionError("cluster not detected")
+                 indices = client.indices.get_alias(index="*")
+                 if self.upload_config.index_name not in indices:
+                     raise SourceConnectionError(
+                         "index {} not found: {}".format(
+                             self.upload_config.index_name, ", ".join(indices.keys())
+                         )
+                     )
          except Exception as e:
              logger.error(f"failed to validate connection: {e}", exc_info=True)
              raise DestinationConnectionError(f"failed to validate connection: {e}")
@@ -384,6 +412,23 @@ class ElasticsearchUploader(Uploader):
 
          return parallel_bulk
 
+     def delete_by_record_id(self, client, file_data: FileData) -> None:
+         logger.debug(
+             f"deleting any content with metadata {RECORD_ID_LABEL}={file_data.identifier} "
+             f"from {self.upload_config.index_name} index"
+         )
+         delete_resp = client.delete_by_query(
+             index=self.upload_config.index_name,
+             body={"query": {"match": {self.upload_config.record_id_key: file_data.identifier}}},
+         )
+         logger.info(
+             "deleted {} records from index {}".format(
+                 delete_resp["deleted"], self.upload_config.index_name
+             )
+         )
+         if failures := delete_resp.get("failures"):
+             raise WriteError(f"failed to delete records: {failures}")
+
      def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
          parallel_bulk = self.load_parallel_bulk()
          with path.open("r") as file:
@@ -397,28 +442,29 @@ class ElasticsearchUploader(Uploader):
              f"{self.upload_config.num_threads} (number of) threads"
          )
 
-         client = self.connection_config.get_client()
-         if not client.indices.exists(index=self.upload_config.index_name):
-             logger.warning(
-                 f"{(self.__class__.__name__).replace('Uploader', '')} index does not exist: "
-                 f"{self.upload_config.index_name}. "
-                 f"This may cause issues when uploading."
-             )
-         for batch in generator_batching_wbytes(
-             elements_dict, batch_size_limit_bytes=self.upload_config.batch_size_bytes
-         ):
-             for success, info in parallel_bulk(
-                 client=client,
-                 actions=batch,
-                 thread_count=self.upload_config.num_threads,
+         with self.connection_config.get_client() as client:
+             self.delete_by_record_id(client=client, file_data=file_data)
+             if not client.indices.exists(index=self.upload_config.index_name):
+                 logger.warning(
+                     f"{(self.__class__.__name__).replace('Uploader', '')} index does not exist: "
+                     f"{self.upload_config.index_name}. "
+                     f"This may cause issues when uploading."
+                 )
+             for batch in generator_batching_wbytes(
+                 elements_dict, batch_size_limit_bytes=self.upload_config.batch_size_bytes
              ):
-                 if not success:
-                     logger.error(
-                         "upload failed for a batch in "
-                         f"{(self.__class__.__name__).replace('Uploader', '')} "
-                         "destination connector:",
-                         info,
-                     )
+                 for success, info in parallel_bulk(
+                     client=client,
+                     actions=batch,
+                     thread_count=self.upload_config.num_threads,
+                 ):
+                     if not success:
+                         logger.error(
+                             "upload failed for a batch in "
+                             f"{(self.__class__.__name__).replace('Uploader', '')} "
+                             "destination connector:",
+                             info,
+                         )
 
 
  elasticsearch_source_entry = SourceRegistryEntry(
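
Note: with get_client() now a context manager and a record-id label stamped on every staged document, the uploader deletes whatever a previous run wrote for the same record before re-indexing it. A condensed sketch of that replace-by-record strategy against a raw elasticsearch client (the host, index name, document, and the "record_id" label value are assumptions for illustration):

from elasticsearch import Elasticsearch

RECORD_ID_LABEL = "record_id"  # assumed value of the shared constant

with Elasticsearch("http://localhost:9200") as client:
    # Drop documents written for this record by earlier runs...
    client.delete_by_query(
        index="elements",
        body={"query": {"match": {RECORD_ID_LABEL: "mock-file-id"}}},
    )
    # ...then index the freshly staged documents.
    client.index(index="elements", document={"text": "hello", RECORD_ID_LABEL: "mock-file-id"})
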
@@ -17,7 +17,7 @@ from unstructured_ingest.v2.processes.connector_registry import (
      DestinationRegistryEntry,
      SourceRegistryEntry,
  )
- from unstructured_ingest.v2.processes.connectors.elasticsearch import (
+ from unstructured_ingest.v2.processes.connectors.elasticsearch.elasticsearch import (
      ElasticsearchDownloader,
      ElasticsearchDownloaderConfig,
      ElasticsearchIndexer,
@@ -161,7 +161,7 @@ class GoogleDriveIndexer(Indexer):
              and isinstance(parent_root_path, str)
          ):
              fullpath = f"{parent_path}/{filename}"
-             rel_path = fullpath.replace(parent_root_path, "")
+             rel_path = Path(fullpath).relative_to(parent_root_path).as_posix()
              source_identifiers = SourceIdentifiers(
                  filename=filename, fullpath=fullpath, rel_path=rel_path
              )
@@ -161,6 +161,12 @@ class KafkaIndexer(Indexer, ABC):
              current_topics = [
                  topic for topic in cluster_meta.topics if topic != "__consumer_offsets"
              ]
+             if self.index_config.topic not in current_topics:
+                 raise SourceConnectionError(
+                     "expected topic {} not detected in cluster: {}".format(
+                         self.index_config.topic, ", ".join(current_topics)
+                     )
+                 )
              logger.info(f"successfully checked available topics: {current_topics}")
          except Exception as e:
              logger.error(f"failed to validate connection: {e}", exc_info=True)
@@ -0,0 +1,17 @@
+ from __future__ import annotations
+
+ from unstructured_ingest.v2.processes.connector_registry import add_destination_entry
+
+ from .aws import CONNECTOR_TYPE as LANCEDB_S3_CONNECTOR_TYPE
+ from .aws import lancedb_aws_destination_entry
+ from .azure import CONNECTOR_TYPE as LANCEDB_AZURE_CONNECTOR_TYPE
+ from .azure import lancedb_azure_destination_entry
+ from .gcp import CONNECTOR_TYPE as LANCEDB_GCS_CONNECTOR_TYPE
+ from .gcp import lancedb_gcp_destination_entry
+ from .local import CONNECTOR_TYPE as LANCEDB_LOCAL_CONNECTOR_TYPE
+ from .local import lancedb_local_destination_entry
+
+ add_destination_entry(LANCEDB_S3_CONNECTOR_TYPE, lancedb_aws_destination_entry)
+ add_destination_entry(LANCEDB_AZURE_CONNECTOR_TYPE, lancedb_azure_destination_entry)
+ add_destination_entry(LANCEDB_GCS_CONNECTOR_TYPE, lancedb_gcp_destination_entry)
+ add_destination_entry(LANCEDB_LOCAL_CONNECTOR_TYPE, lancedb_local_destination_entry)