unstructured-ingest 0.2.2__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- test/integration/connectors/elasticsearch/__init__.py +0 -0
- test/integration/connectors/elasticsearch/conftest.py +34 -0
- test/integration/connectors/elasticsearch/test_elasticsearch.py +308 -0
- test/integration/connectors/elasticsearch/test_opensearch.py +302 -0
- test/integration/connectors/sql/test_postgres.py +10 -4
- test/integration/connectors/sql/test_singlestore.py +8 -4
- test/integration/connectors/sql/test_snowflake.py +10 -6
- test/integration/connectors/sql/test_sqlite.py +4 -4
- test/integration/connectors/test_astradb.py +156 -0
- test/integration/connectors/test_azure_cog_search.py +233 -0
- test/integration/connectors/test_delta_table.py +46 -0
- test/integration/connectors/test_kafka.py +150 -16
- test/integration/connectors/test_lancedb.py +209 -0
- test/integration/connectors/test_milvus.py +141 -0
- test/integration/connectors/test_pinecone.py +213 -0
- test/integration/connectors/test_s3.py +23 -0
- test/integration/connectors/utils/docker.py +81 -15
- test/integration/connectors/utils/validation.py +10 -0
- test/integration/connectors/weaviate/__init__.py +0 -0
- test/integration/connectors/weaviate/conftest.py +15 -0
- test/integration/connectors/weaviate/test_local.py +131 -0
- test/unit/v2/__init__.py +0 -0
- test/unit/v2/chunkers/__init__.py +0 -0
- test/unit/v2/chunkers/test_chunkers.py +49 -0
- test/unit/v2/connectors/__init__.py +0 -0
- test/unit/v2/embedders/__init__.py +0 -0
- test/unit/v2/embedders/test_bedrock.py +36 -0
- test/unit/v2/embedders/test_huggingface.py +48 -0
- test/unit/v2/embedders/test_mixedbread.py +37 -0
- test/unit/v2/embedders/test_octoai.py +35 -0
- test/unit/v2/embedders/test_openai.py +35 -0
- test/unit/v2/embedders/test_togetherai.py +37 -0
- test/unit/v2/embedders/test_vertexai.py +37 -0
- test/unit/v2/embedders/test_voyageai.py +38 -0
- test/unit/v2/partitioners/__init__.py +0 -0
- test/unit/v2/partitioners/test_partitioner.py +63 -0
- test/unit/v2/utils/__init__.py +0 -0
- test/unit/v2/utils/data_generator.py +32 -0
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/cmds/__init__.py +2 -2
- unstructured_ingest/cli/cmds/{azure_cognitive_search.py → azure_ai_search.py} +9 -9
- unstructured_ingest/connector/{azure_cognitive_search.py → azure_ai_search.py} +9 -9
- unstructured_ingest/pipeline/reformat/embedding.py +1 -1
- unstructured_ingest/runner/writers/__init__.py +2 -2
- unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
- unstructured_ingest/utils/data_prep.py +9 -1
- unstructured_ingest/v2/constants.py +2 -0
- unstructured_ingest/v2/processes/connectors/__init__.py +7 -20
- unstructured_ingest/v2/processes/connectors/airtable.py +2 -2
- unstructured_ingest/v2/processes/connectors/astradb.py +35 -23
- unstructured_ingest/v2/processes/connectors/{azure_cognitive_search.py → azure_ai_search.py} +116 -35
- unstructured_ingest/v2/processes/connectors/confluence.py +2 -2
- unstructured_ingest/v2/processes/connectors/couchbase.py +1 -0
- unstructured_ingest/v2/processes/connectors/delta_table.py +37 -9
- unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
- unstructured_ingest/v2/processes/connectors/{elasticsearch.py → elasticsearch/elasticsearch.py} +93 -46
- unstructured_ingest/v2/processes/connectors/{opensearch.py → elasticsearch/opensearch.py} +1 -1
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +27 -0
- unstructured_ingest/v2/processes/connectors/google_drive.py +3 -3
- unstructured_ingest/v2/processes/connectors/kafka/__init__.py +6 -2
- unstructured_ingest/v2/processes/connectors/kafka/cloud.py +38 -2
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +84 -23
- unstructured_ingest/v2/processes/connectors/kafka/local.py +32 -4
- unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +17 -0
- unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
- unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
- unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
- unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +161 -0
- unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
- unstructured_ingest/v2/processes/connectors/milvus.py +72 -27
- unstructured_ingest/v2/processes/connectors/onedrive.py +2 -3
- unstructured_ingest/v2/processes/connectors/outlook.py +2 -2
- unstructured_ingest/v2/processes/connectors/pinecone.py +101 -13
- unstructured_ingest/v2/processes/connectors/sharepoint.py +3 -2
- unstructured_ingest/v2/processes/connectors/slack.py +2 -2
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +16 -8
- unstructured_ingest/v2/processes/connectors/sql/sql.py +97 -26
- unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
- unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +164 -0
- unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
- unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
- unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +289 -0
- {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.1.dist-info}/METADATA +20 -19
- {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.1.dist-info}/RECORD +91 -50
- unstructured_ingest/runner/writers/azure_cognitive_search.py +0 -24
- unstructured_ingest/v2/processes/connectors/weaviate.py +0 -242
- /test/integration/embedders/{togetherai.py → test_togetherai.py} +0 -0
- /test/unit/{test_interfaces_v2.py → v2/test_interfaces.py} +0 -0
- /test/unit/{test_utils_v2.py → v2/test_utils.py} +0 -0
- {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.1.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.1.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.1.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.1.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/airtable.py

@@ -12,11 +12,11 @@ from unstructured_ingest.v2.interfaces import (
     ConnectionConfig,
     Downloader,
     DownloaderConfig,
+    DownloadResponse,
     FileData,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
-    download_responses,
 )
 from unstructured_ingest.v2.processes.connector_registry import (
     SourceRegistryEntry,
@@ -214,7 +214,7 @@ class AirtableDownloader(Downloader):
         row_dict.update(table_row["fields"])
         return row_dict
 
-    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
+    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
         table_meta = AirtableTableMeta.model_validate(file_data.additional_metadata)
         table_contents = self.get_table_contents(table_meta=table_meta)
         df = pandas.DataFrame.from_dict(
unstructured_ingest/v2/processes/connectors/astradb.py

@@ -19,6 +19,7 @@ from unstructured_ingest.error import (
 )
 from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.constants import RECORD_ID_LABEL
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
@@ -129,11 +130,6 @@ class AstraDBIndexerConfig(IndexerConfig):
         "numbers, and underscores."
     )
     keyspace: Optional[str] = Field(default=None, description="The Astra DB connection keyspace.")
-    namespace: Optional[str] = Field(
-        default=None,
-        description="The Astra DB connection namespace.",
-        deprecated="Please use 'keyspace' instead.",
-    )
     batch_size: int = Field(default=20, description="Number of records per batch")
 
 
@@ -147,21 +143,17 @@ class AstraDBUploaderConfig(UploaderConfig):
         "Note that the collection name must only include letters, "
         "numbers, and underscores."
     )
-    embedding_dimension: int = Field(
-        default=384, description="The dimensionality of the embeddings"
-    )
     keyspace: Optional[str] = Field(default=None, description="The Astra DB connection keyspace.")
-    namespace: Optional[str] = Field(
-        default=None,
-        description="The Astra DB connection namespace.",
-        deprecated="Please use 'keyspace' instead.",
-    )
     requested_indexing_policy: Optional[dict[str, Any]] = Field(
         default=None,
         description="The indexing policy to use for the collection.",
         examples=['{"deny": ["metadata"]}'],
     )
     batch_size: int = Field(default=20, description="Number of records per batch")
+    record_id_key: str = Field(
+        default=RECORD_ID_LABEL,
+        description="searchable key to find entries for the same record on previous runs",
+    )
 
 
 @dataclass
@@ -173,12 +165,12 @@ class AstraDBIndexer(Indexer):
         return get_astra_collection(
             connection_config=self.connection_config,
             collection_name=self.index_config.collection_name,
-            keyspace=self.index_config.keyspace
+            keyspace=self.index_config.keyspace,
         )
 
     def precheck(self) -> None:
         try:
-            self.get_collection()
+            self.get_collection().options()
         except Exception as e:
             logger.error(f"Failed to validate connection {e}", exc_info=True)
             raise SourceConnectionError(f"failed to validate connection: {e}")
@@ -223,7 +215,7 @@ class AstraDBIndexer(Indexer):
                 additional_metadata={
                     "ids": list(batch),
                     "collection_name": self.index_config.collection_name,
-                    "keyspace": self.index_config.keyspace
+                    "keyspace": self.index_config.keyspace,
                 },
             )
             yield fd
@@ -309,10 +301,11 @@ class AstraDBUploadStager(UploadStager):
         default_factory=lambda: AstraDBUploadStagerConfig()
     )
 
-    def conform_dict(self, element_dict: dict) -> dict:
+    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
         return {
             "$vector": element_dict.pop("embeddings", None),
             "content": element_dict.pop("text", None),
+            RECORD_ID_LABEL: file_data.identifier,
             "metadata": element_dict,
         }
 
@@ -328,10 +321,15 @@ class AstraDBUploadStager(UploadStager):
             elements_contents = json.load(elements_file)
         conformed_elements = []
         for element in elements_contents:
-            conformed_elements.append(self.conform_dict(element_dict=element))
-
+            conformed_elements.append(self.conform_dict(element_dict=element, file_data=file_data))
+        output_filename_path = Path(output_filename)
+        if output_filename_path.suffix == ".json":
+            output_path = Path(output_dir) / output_filename_path
+        else:
+            output_path = Path(output_dir) / output_filename_path.with_suffix(".json")
+        output_path.parent.mkdir(parents=True, exist_ok=True)
         with open(output_path, "w") as output_file:
-            json.dump(conformed_elements, output_file)
+            json.dump(conformed_elements, output_file, indent=2)
         return output_path
 
 
@@ -346,8 +344,8 @@ class AstraDBUploader(Uploader):
             get_astra_collection(
                 connection_config=self.connection_config,
                 collection_name=self.upload_config.collection_name,
-                keyspace=self.upload_config.keyspace
-            )
+                keyspace=self.upload_config.keyspace,
+            ).options()
         except Exception as e:
             logger.error(f"Failed to validate connection {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")
@@ -357,7 +355,19 @@ class AstraDBUploader(Uploader):
         return get_astra_collection(
             connection_config=self.connection_config,
             collection_name=self.upload_config.collection_name,
-            keyspace=self.upload_config.keyspace
+            keyspace=self.upload_config.keyspace,
+        )
+
+    def delete_by_record_id(self, collection: "AstraDBCollection", file_data: FileData):
+        logger.debug(
+            f"deleting records from collection {collection.name} "
+            f"with {self.upload_config.record_id_key} "
+            f"set to {file_data.identifier}"
+        )
+        delete_filter = {self.upload_config.record_id_key: {"$eq": file_data.identifier}}
+        delete_resp = collection.delete_many(filter=delete_filter)
+        logger.debug(
+            f"deleted {delete_resp.deleted_count} records from collection {collection.name}"
         )
 
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
@@ -371,6 +381,8 @@ class AstraDBUploader(Uploader):
         astra_db_batch_size = self.upload_config.batch_size
         collection = self.get_collection()
 
+        self.delete_by_record_id(collection=collection, file_data=file_data)
+
         for chunk in batch_generator(elements_dict, astra_db_batch_size):
             collection.insert_many(chunk)
 
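The AstraDB changes above make re-ingestion idempotent: the stager tags every element with RECORD_ID_LABEL, and the uploader's delete_by_record_id removes whatever a previous run wrote for the same file before the new batches are inserted. A minimal sketch of that delete-then-insert pattern, using an in-memory list as a hypothetical stand-in for the remote collection (the real connector calls collection.delete_many and collection.insert_many on an Astra DB collection):

    RECORD_ID_LABEL = "record_id"

    collection: list[dict] = []  # hypothetical in-memory stand-in for the Astra DB collection

    def upsert_record(record_id: str, elements: list[dict], batch_size: int = 20) -> None:
        # 1) drop anything written for this record on a previous run
        collection[:] = [doc for doc in collection if doc.get(RECORD_ID_LABEL) != record_id]
        # 2) stage each element with the record id so later runs can find it again
        staged = [{**element, RECORD_ID_LABEL: record_id} for element in elements]
        # 3) insert in batches, mirroring batch_generator(elements_dict, batch_size)
        for start in range(0, len(staged), batch_size):
            collection.extend(staged[start : start + batch_size])

    upsert_record("file-123", [{"content": "hello"}, {"content": "world"}])
    upsert_record("file-123", [{"content": "hello, revised"}])  # rerun replaces instead of duplicating
    assert len(collection) == 1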
unstructured_ingest/v2/processes/connectors/{azure_cognitive_search.py → azure_ai_search.py}
RENAMED
@@ -9,6 +9,7 @@ from pydantic import Field, Secret
 from unstructured_ingest.error import DestinationConnectionError, WriteError
 from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.constants import RECORD_ID_LABEL
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
@@ -26,18 +27,18 @@ from unstructured_ingest.v2.processes.connectors.utils import parse_datetime
 
 if TYPE_CHECKING:
     from azure.search.documents import SearchClient
+    from azure.search.documents.indexes import SearchIndexClient
 
+CONNECTOR_TYPE = "azure_ai_search"
 
-CONNECTOR_TYPE = "azure_cognitive_search"
 
-
-
-    azure_cognitive_search_key: str = Field(
+class AzureAISearchAccessConfig(AccessConfig):
+    azure_ai_search_key: str = Field(
         alias="key", description="Credential that is used for authenticating to an Azure service"
     )
 
 
-class AzureCognitiveSearchConnectionConfig(ConnectionConfig):
+class AzureAISearchConnectionConfig(ConnectionConfig):
     endpoint: str = Field(
         description="The URL endpoint of an Azure AI (Cognitive) search service. "
         "In the form of https://{{service_name}}.search.windows.net"
@@ -45,10 +46,10 @@ class AzureCognitiveSearchConnectionConfig(ConnectionConfig):
     index: str = Field(
         description="The name of the Azure AI (Cognitive) Search index to connect to."
     )
-    access_config: Secret[
+    access_config: Secret[AzureAISearchAccessConfig]
 
-    @requires_dependencies(["azure.search", "azure.core"], extras="azure-
-    def
+    @requires_dependencies(["azure.search", "azure.core"], extras="azure-ai-search")
+    def get_search_client(self) -> "SearchClient":
         from azure.core.credentials import AzureKeyCredential
         from azure.search.documents import SearchClient
 
@@ -56,27 +57,43 @@ class AzureCognitiveSearchConnectionConfig(ConnectionConfig):
             endpoint=self.endpoint,
             index_name=self.index,
             credential=AzureKeyCredential(
-                self.access_config.get_secret_value().
+                self.access_config.get_secret_value().azure_ai_search_key
+            ),
+        )
+
+    @requires_dependencies(["azure.search", "azure.core"], extras="azure-ai-search")
+    def get_search_index_client(self) -> "SearchIndexClient":
+        from azure.core.credentials import AzureKeyCredential
+        from azure.search.documents.indexes import SearchIndexClient
+
+        return SearchIndexClient(
+            endpoint=self.endpoint,
+            credential=AzureKeyCredential(
+                self.access_config.get_secret_value().azure_ai_search_key
             ),
         )
 
 
-class
+class AzureAISearchUploadStagerConfig(UploadStagerConfig):
     pass
 
 
-class
+class AzureAISearchUploaderConfig(UploaderConfig):
     batch_size: int = Field(default=100, description="Number of records per batch")
+    record_id_key: str = Field(
+        default=RECORD_ID_LABEL,
+        description="searchable key to find entries for the same record on previous runs",
+    )
 
 
 @dataclass
-class AzureCognitiveSearchUploadStager(UploadStager):
-    upload_stager_config:
-        default_factory=lambda:
+class AzureAISearchUploadStager(UploadStager):
+    upload_stager_config: AzureAISearchUploadStagerConfig = field(
+        default_factory=lambda: AzureAISearchUploadStagerConfig()
     )
 
     @staticmethod
-    def conform_dict(data: dict) -> dict:
+    def conform_dict(data: dict, file_data: FileData) -> dict:
         """
         updates the dictionary that is from each Element being converted into a dict/json
         into a dictionary that conforms to the schema expected by the
@@ -84,6 +101,7 @@ class AzureCognitiveSearchUploadStager(UploadStager):
         """
 
         data["id"] = str(uuid.uuid4())
+        data[RECORD_ID_LABEL] = file_data.identifier
 
         if points := data.get("metadata", {}).get("coordinates", {}).get("points"):
             data["metadata"]["coordinates"]["points"] = json.dumps(points)
@@ -124,6 +142,7 @@ class AzureCognitiveSearchUploadStager(UploadStager):
 
     def run(
         self,
+        file_data: FileData,
         elements_filepath: Path,
         output_dir: Path,
         output_filename: str,
@@ -132,23 +151,63 @@ class AzureCognitiveSearchUploadStager(UploadStager):
         with open(elements_filepath) as elements_file:
             elements_contents = json.load(elements_file)
 
-        conformed_elements = [
+        conformed_elements = [
+            self.conform_dict(data=element, file_data=file_data) for element in elements_contents
+        ]
 
+        if Path(output_filename).suffix != ".json":
+            output_filename = f"{output_filename}.json"
+        else:
+            output_filename = f"{Path(output_filename).stem}.json"
         output_path = Path(output_dir) / Path(f"{output_filename}.json")
+        output_path.parent.mkdir(parents=True, exist_ok=True)
         with open(output_path, "w") as output_file:
-            json.dump(conformed_elements, output_file)
+            json.dump(conformed_elements, output_file, indent=2)
         return output_path
 
 
 @dataclass
-class AzureCognitiveSearchUploader(Uploader):
-    upload_config:
-    connection_config:
+class AzureAISearchUploader(Uploader):
+    upload_config: AzureAISearchUploaderConfig
+    connection_config: AzureAISearchConnectionConfig
     connector_type: str = CONNECTOR_TYPE
 
+    def query_docs(self, record_id: str, index_key: str) -> list[str]:
+        client = self.connection_config.get_search_client()
+        results = list(client.search(filter=f"record_id eq '{record_id}'", select=[index_key]))
+        return [result[index_key] for result in results]
+
+    def delete_by_record_id(self, file_data: FileData, index_key: str) -> None:
+        logger.debug(
+            f"deleting any content with metadata "
+            f"{self.upload_config.record_id_key}={file_data.identifier} "
+            f"from azure cognitive search index: {self.connection_config.index}"
+        )
+        doc_ids_to_delete = self.query_docs(record_id=file_data.identifier, index_key=index_key)
+        if not doc_ids_to_delete:
+            return
+        client: SearchClient = self.connection_config.get_search_client()
+        results = client.delete_documents(
+            documents=[{index_key: doc_id} for doc_id in doc_ids_to_delete]
+        )
+        errors = []
+        success = []
+        for result in results:
+            if result.succeeded:
+                success.append(result)
+            else:
+                errors.append(result)
+        logger.debug(f"results: {len(success)} successes, {len(errors)} failures")
+        if errors:
+            raise WriteError(
+                ", ".join(
+                    [f"[{error.status_code}] {error.error_message}" for error in errors],
+                ),
+            )
+
     @DestinationConnectionError.wrap
-    @requires_dependencies(["azure"], extras="azure-
-    def write_dict(self,
+    @requires_dependencies(["azure"], extras="azure-ai-search")
+    def write_dict(self, elements_dict: list[dict[str, Any]]) -> None:
         import azure.core.exceptions
 
         logger.info(
@@ -156,7 +215,7 @@ class AzureCognitiveSearchUploader(Uploader):
             f"index at {self.connection_config.index}",
         )
         try:
-            results = self.connection_config.
+            results = self.connection_config.get_search_client().upload_documents(
                 documents=elements_dict
             )
 
@@ -174,24 +233,42 @@ class AzureCognitiveSearchUploader(Uploader):
             raise WriteError(
                 ", ".join(
                     [
-                        f"{error.
+                        f"{error.azure_ai_search_key}: "
                         f"[{error.status_code}] {error.error_message}"
                         for error in errors
                     ],
                 ),
            )
 
+    def can_delete(self) -> bool:
+        search_index_client = self.connection_config.get_search_index_client()
+        index = search_index_client.get_index(name=self.connection_config.index)
+        index_fields = index.fields
+        record_id_fields = [
+            field for field in index_fields if field.name == self.upload_config.record_id_key
+        ]
+        if not record_id_fields:
+            return False
+        record_id_field = record_id_fields[0]
+        return record_id_field.filterable
+
+    def get_index_key(self) -> str:
+        search_index_client = self.connection_config.get_search_index_client()
+        index = search_index_client.get_index(name=self.connection_config.index)
+        index_fields = index.fields
+        key_fields = [field for field in index_fields if field.key]
+        if not key_fields:
+            raise ValueError("no key field found in index fields")
+        return key_fields[0].name
+
     def precheck(self) -> None:
         try:
-            client = self.connection_config.
+            client = self.connection_config.get_search_client()
             client.get_document_count()
         except Exception as e:
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")
 
-    def write_dict_wrapper(self, elements_dict):
-        return self.write_dict(elements_dict=elements_dict)
-
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         with path.open("r") as file:
             elements_dict = json.load(file)
@@ -201,17 +278,21 @@ class AzureCognitiveSearchUploader(Uploader):
             f" index at {str(self.connection_config.index)}"
             f" with batch size {str(self.upload_config.batch_size)}"
         )
+        if self.can_delete():
+            index_key = self.get_index_key()
+            self.delete_by_record_id(file_data=file_data, index_key=index_key)
+        else:
+            logger.warning("criteria for deleting previous content not met, skipping")
 
         batch_size = self.upload_config.batch_size
-
         for chunk in batch_generator(elements_dict, batch_size):
             self.write_dict(elements_dict=chunk)  # noqa: E203
 
 
-
-    connection_config=
-    uploader=
-    uploader_config=
-    upload_stager=
-    upload_stager_config=
+azure_ai_search_destination_entry = DestinationRegistryEntry(
+    connection_config=AzureAISearchConnectionConfig,
+    uploader=AzureAISearchUploader,
+    uploader_config=AzureAISearchUploaderConfig,
+    upload_stager=AzureAISearchUploadStager,
+    upload_stager_config=AzureAISearchUploadStagerConfig,
 )
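In the Azure AI Search uploader above, deletion of stale content is gated on the index schema: can_delete returns True only when the configured record_id field is filterable, and get_index_key finds the index's key field so delete_documents can address the old documents. A small sketch of those schema checks over hypothetical field objects (only the attributes the checks rely on are modeled; the real field objects come from azure.search.documents.indexes):

    from dataclasses import dataclass

    @dataclass
    class IndexField:  # hypothetical stand-in for the SDK's index field objects
        name: str
        key: bool = False
        filterable: bool = False

    def can_delete(fields: list[IndexField], record_id_key: str = "record_id") -> bool:
        # deleting by record id is only safe if that field can be used in a filter
        matches = [f for f in fields if f.name == record_id_key]
        return bool(matches) and matches[0].filterable

    def get_index_key(fields: list[IndexField]) -> str:
        # the key field is what delete_documents() needs to identify each document
        key_fields = [f for f in fields if f.key]
        if not key_fields:
            raise ValueError("no key field found in index fields")
        return key_fields[0].name

    fields = [IndexField("id", key=True), IndexField("record_id", filterable=True)]
    if can_delete(fields):
        print(f"stale documents can be removed via key field {get_index_key(fields)!r}")
    else:
        print("criteria for deleting previous content not met, skipping")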
unstructured_ingest/v2/processes/connectors/confluence.py

@@ -11,12 +11,12 @@ from unstructured_ingest.v2.interfaces import (
     ConnectionConfig,
     Downloader,
     DownloaderConfig,
+    DownloadResponse,
     FileData,
     FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
-    download_responses,
 )
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
@@ -154,7 +154,7 @@ class ConfluenceDownloader(Downloader):
     download_config: ConfluenceDownloaderConfig = field(default_factory=ConfluenceDownloaderConfig)
     connector_type: str = CONNECTOR_TYPE
 
-    def run(self, file_data: FileData, **kwargs) -> download_responses:
+    def run(self, file_data: FileData, **kwargs) -> DownloadResponse:
         doc_id = file_data.identifier
         try:
             client = self.connection_config.get_client()

unstructured_ingest/v2/processes/connectors/couchbase.py

@@ -205,6 +205,7 @@ class CouchbaseIndexer(Indexer):
             yield FileData(
                 identifier=identified,
                 connector_type=CONNECTOR_TYPE,
+                doc_type="batch",
                 metadata=FileDataSourceMetadata(
                     url=f"{self.connection_config.connection_string}/"
                     f"{self.connection_config.bucket}",
unstructured_ingest/v2/processes/connectors/delta_table.py

@@ -1,9 +1,11 @@
 import json
 import os
+import traceback
 from dataclasses import dataclass, field
-from multiprocessing import Process
+from multiprocessing import Process, Queue
 from pathlib import Path
 from typing import Any, Optional
+from urllib.parse import urlparse
 
 import pandas as pd
 from pydantic import Field, Secret
@@ -26,6 +28,15 @@ from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
 CONNECTOR_TYPE = "delta_table"
 
 
+def write_deltalake_with_error_handling(queue, **kwargs):
+    from deltalake.writer import write_deltalake
+
+    try:
+        write_deltalake(**kwargs)
+    except Exception:
+        queue.put(traceback.format_exc())
+
+
 class DeltaTableAccessConfig(AccessConfig):
     aws_access_key_id: Optional[str] = Field(default=None, description="AWS Access Key Id")
     aws_secret_access_key: Optional[str] = Field(default=None, description="AWS Secret Access Key")
@@ -94,7 +105,7 @@ class DeltaTableUploader(Uploader):
     connection_config: DeltaTableConnectionConfig
     connector_type: str = CONNECTOR_TYPE
 
-    @requires_dependencies(["
+    @requires_dependencies(["boto3"], extras="delta-table")
     def precheck(self):
         secrets = self.connection_config.access_config.get_secret_value()
         if (
@@ -102,13 +113,24 @@ class DeltaTableUploader(Uploader):
             and secrets.aws_access_key_id
             and secrets.aws_secret_access_key
         ):
-            from
+            from boto3 import client
+
+            url = urlparse(self.connection_config.table_uri)
+            bucket_name = url.netloc
+            dir_path = url.path.lstrip("/")
 
             try:
-
-
+                s3_client = client(
+                    "s3",
+                    aws_access_key_id=secrets.aws_access_key_id,
+                    aws_secret_access_key=secrets.aws_secret_access_key,
                 )
-
+                s3_client.put_object(Bucket=bucket_name, Key=dir_path, Body=b"")
+
+                response = s3_client.get_bucket_location(Bucket=bucket_name)
+
+                if self.connection_config.aws_region != response.get("LocationConstraint"):
+                    raise ValueError("Wrong AWS Region was provided.")
 
         except Exception as e:
             logger.error(f"failed to validate connection: {e}", exc_info=True)
@@ -145,7 +167,6 @@ class DeltaTableUploader(Uploader):
 
     @requires_dependencies(["deltalake"], extras="delta-table")
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        from deltalake.writer import write_deltalake
 
         df = self.read_dataframe(path)
         updated_upload_path = os.path.join(
@@ -164,17 +185,24 @@ class DeltaTableUploader(Uploader):
             "mode": "overwrite",
             "storage_options": storage_options,
         }
+        queue = Queue()
         # NOTE: deltalake writer on Linux sometimes can finish but still trigger a SIGABRT and cause
         # ingest to fail, even though all tasks are completed normally. Putting the writer into a
        # process mitigates this issue by ensuring python interpreter waits properly for deltalake's
         # rust backend to finish
         writer = Process(
-            target=
-            kwargs=writer_kwargs,
+            target=write_deltalake_with_error_handling,
+            kwargs={"queue": queue, **writer_kwargs},
         )
         writer.start()
         writer.join()
 
+        # Check if the queue has any exception message
+        if not queue.empty():
+            error_message = queue.get()
+            logger.error(f"Exception occurred in write_deltalake: {error_message}")
+            raise RuntimeError(f"Error in write_deltalake: {error_message}")
+
 
 delta_table_destination_entry = DestinationRegistryEntry(
     connection_config=DeltaTableConnectionConfig,
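The delta_table change runs the deltalake write in a child process (to sidestep the SIGABRT noted in the inline comment) and adds a multiprocessing.Queue so the child can hand any traceback back to the parent, turning a silent failure into a raised error. A standard-library sketch of that error-propagation pattern, with a hypothetical flaky_write standing in for deltalake's writer:

    import traceback
    from multiprocessing import Process, Queue

    def flaky_write(**kwargs):
        # hypothetical stand-in for deltalake.writer.write_deltalake
        raise RuntimeError(f"could not write {kwargs.get('table_or_uri')}")

    def write_with_error_handling(queue, **kwargs):
        # any exception in the child lands on the queue as a formatted traceback
        try:
            flaky_write(**kwargs)
        except Exception:
            queue.put(traceback.format_exc())

    if __name__ == "__main__":
        queue = Queue()
        writer = Process(
            target=write_with_error_handling,
            kwargs={"queue": queue, "table_or_uri": "s3://bucket/table"},
        )
        writer.start()
        writer.join()  # join() alone would report nothing; the queue surfaces the failure
        if not queue.empty():
            print(f"child write failed:\n{queue.get()}")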
unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py

@@ -0,0 +1,19 @@
+from unstructured_ingest.v2.processes.connector_registry import (
+    add_destination_entry,
+    add_source_entry,
+)
+
+from .elasticsearch import CONNECTOR_TYPE as ELASTICSEARCH_CONNECTOR_TYPE
+from .elasticsearch import elasticsearch_destination_entry, elasticsearch_source_entry
+from .opensearch import CONNECTOR_TYPE as OPENSEARCH_CONNECTOR_TYPE
+from .opensearch import opensearch_destination_entry, opensearch_source_entry
+
+add_source_entry(source_type=ELASTICSEARCH_CONNECTOR_TYPE, entry=elasticsearch_source_entry)
+add_destination_entry(
+    destination_type=ELASTICSEARCH_CONNECTOR_TYPE, entry=elasticsearch_destination_entry
+)
+
+add_source_entry(source_type=OPENSEARCH_CONNECTOR_TYPE, entry=opensearch_source_entry)
+add_destination_entry(
+    destination_type=OPENSEARCH_CONNECTOR_TYPE, entry=opensearch_destination_entry
+)
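The new elasticsearch subpackage __init__ registers both the Elasticsearch and OpenSearch connectors as a side effect of import, via add_source_entry and add_destination_entry. A rough sketch of what such a registry boils down to, using a plain dict and a hypothetical trimmed-down entry type (the real implementation lives in unstructured_ingest.v2.processes.connector_registry):

    from dataclasses import dataclass
    from typing import Any

    @dataclass
    class SourceRegistryEntry:  # hypothetical, heavily trimmed version of the real entry
        indexer: Any
        downloader: Any

    source_registry: dict[str, SourceRegistryEntry] = {}

    def add_source_entry(source_type: str, entry: SourceRegistryEntry) -> None:
        if source_type in source_registry:
            raise ValueError(f"source connector already registered: {source_type}")
        source_registry[source_type] = entry

    # a connector package registers itself at import time, as the new __init__ does
    add_source_entry("elasticsearch", SourceRegistryEntry(indexer=object, downloader=object))
    print(sorted(source_registry))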