unstructured-ingest 0.2.2__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of unstructured-ingest might be problematic.
- test/integration/connectors/test_astradb.py +109 -0
- test/integration/connectors/test_azure_cog_search.py +233 -0
- test/integration/connectors/test_kafka.py +116 -16
- test/integration/connectors/test_pinecone.py +161 -0
- test/integration/connectors/test_s3.py +23 -0
- test/unit/v2/__init__.py +0 -0
- test/unit/v2/chunkers/__init__.py +0 -0
- test/unit/v2/chunkers/test_chunkers.py +49 -0
- test/unit/v2/connectors/__init__.py +0 -0
- test/unit/v2/embedders/__init__.py +0 -0
- test/unit/v2/embedders/test_bedrock.py +36 -0
- test/unit/v2/embedders/test_huggingface.py +48 -0
- test/unit/v2/embedders/test_mixedbread.py +37 -0
- test/unit/v2/embedders/test_octoai.py +35 -0
- test/unit/v2/embedders/test_openai.py +35 -0
- test/unit/v2/embedders/test_togetherai.py +37 -0
- test/unit/v2/embedders/test_vertexai.py +37 -0
- test/unit/v2/embedders/test_voyageai.py +38 -0
- test/unit/v2/partitioners/__init__.py +0 -0
- test/unit/v2/partitioners/test_partitioner.py +63 -0
- test/unit/v2/utils/__init__.py +0 -0
- test/unit/v2/utils/data_generator.py +32 -0
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/cmds/__init__.py +2 -2
- unstructured_ingest/cli/cmds/{azure_cognitive_search.py → azure_ai_search.py} +9 -9
- unstructured_ingest/connector/{azure_cognitive_search.py → azure_ai_search.py} +9 -9
- unstructured_ingest/runner/writers/__init__.py +2 -2
- unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
- unstructured_ingest/v2/constants.py +2 -0
- unstructured_ingest/v2/processes/connectors/__init__.py +4 -4
- unstructured_ingest/v2/processes/connectors/airtable.py +2 -2
- unstructured_ingest/v2/processes/connectors/astradb.py +33 -21
- unstructured_ingest/v2/processes/connectors/{azure_cognitive_search.py → azure_ai_search.py} +112 -35
- unstructured_ingest/v2/processes/connectors/confluence.py +2 -2
- unstructured_ingest/v2/processes/connectors/couchbase.py +1 -0
- unstructured_ingest/v2/processes/connectors/delta_table.py +17 -5
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +1 -0
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +27 -0
- unstructured_ingest/v2/processes/connectors/google_drive.py +3 -3
- unstructured_ingest/v2/processes/connectors/kafka/__init__.py +6 -2
- unstructured_ingest/v2/processes/connectors/kafka/cloud.py +38 -2
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +78 -23
- unstructured_ingest/v2/processes/connectors/kafka/local.py +32 -4
- unstructured_ingest/v2/processes/connectors/onedrive.py +2 -3
- unstructured_ingest/v2/processes/connectors/outlook.py +2 -2
- unstructured_ingest/v2/processes/connectors/pinecone.py +83 -12
- unstructured_ingest/v2/processes/connectors/sharepoint.py +3 -2
- unstructured_ingest/v2/processes/connectors/slack.py +2 -2
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +16 -8
- {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.0.dist-info}/METADATA +20 -19
- {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.0.dist-info}/RECORD +58 -37
- unstructured_ingest/runner/writers/azure_cognitive_search.py +0 -24
- /test/integration/embedders/{togetherai.py → test_togetherai.py} +0 -0
- /test/unit/{test_interfaces_v2.py → v2/test_interfaces.py} +0 -0
- /test/unit/{test_utils_v2.py → v2/test_utils.py} +0 -0
- {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.0.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.0.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.0.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.0.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/__init__.py
CHANGED

@@ -14,8 +14,8 @@ from .airtable import CONNECTOR_TYPE as AIRTABLE_CONNECTOR_TYPE
 from .airtable import airtable_source_entry
 from .astradb import CONNECTOR_TYPE as ASTRA_DB_CONNECTOR_TYPE
 from .astradb import astra_db_destination_entry, astra_db_source_entry
-from .azure_cognitive_search import CONNECTOR_TYPE as AZURE_COGNITIVE_SEARCH_CONNECTOR_TYPE
-from .azure_cognitive_search import azure_cognitive_search_destination_entry
+from .azure_ai_search import CONNECTOR_TYPE as AZURE_AI_SEARCH_CONNECTOR_TYPE
+from .azure_ai_search import azure_ai_search_destination_entry
 from .chroma import CONNECTOR_TYPE as CHROMA_CONNECTOR_TYPE
 from .chroma import chroma_destination_entry
 from .confluence import CONNECTOR_TYPE as CONFLUENCE_CONNECTOR_TYPE

@@ -97,8 +97,8 @@ add_source_entry(source_type=SHAREPOINT_CONNECTOR_TYPE, entry=sharepoint_source_entry)

 add_destination_entry(destination_type=MILVUS_CONNECTOR_TYPE, entry=milvus_destination_entry)
 add_destination_entry(
-    destination_type=AZURE_COGNITIVE_SEARCH_CONNECTOR_TYPE,
-    entry=azure_cognitive_search_destination_entry,
+    destination_type=AZURE_AI_SEARCH_CONNECTOR_TYPE,
+    entry=azure_ai_search_destination_entry,
 )

 add_destination_entry(destination_type=KDBAI_CONNECTOR_TYPE, entry=kdbai_destination_entry)
unstructured_ingest/v2/processes/connectors/airtable.py
CHANGED

@@ -12,11 +12,11 @@ from unstructured_ingest.v2.interfaces import (
     ConnectionConfig,
     Downloader,
     DownloaderConfig,
+    DownloadResponse,
     FileData,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
-    download_responses,
 )
 from unstructured_ingest.v2.processes.connector_registry import (
     SourceRegistryEntry,

@@ -214,7 +214,7 @@ class AirtableDownloader(Downloader):
             row_dict.update(table_row["fields"])
         return row_dict

-    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
+    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
         table_meta = AirtableTableMeta.model_validate(file_data.additional_metadata)
         table_contents = self.get_table_contents(table_meta=table_meta)
         df = pandas.DataFrame.from_dict(
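Across the source connectors touched in this release (airtable here, confluence and google_drive below), the return annotation of Downloader.run switches from the removed download_responses alias to the DownloadResponse type imported from unstructured_ingest.v2.interfaces. A minimal sketch of the new shape, assuming a connector whose fetch logic is stubbed out (fetch_bytes is a hypothetical placeholder, not part of the library; get_download_path and generate_download_response are the base-class helpers used in the hunks below):

from typing import Any

from unstructured_ingest.v2.interfaces import Downloader, DownloadResponse, FileData


class ExampleDownloader(Downloader):
    def fetch_bytes(self, file_data: FileData) -> bytes:
        # hypothetical: stands in for the connector-specific fetch call
        raise NotImplementedError

    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        download_path = self.get_download_path(file_data=file_data)
        download_path.parent.mkdir(parents=True, exist_ok=True)
        download_path.write_bytes(self.fetch_bytes(file_data))
        # generate_download_response pairs the written path with its FileData
        return self.generate_download_response(file_data=file_data, download_path=download_path)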
unstructured_ingest/v2/processes/connectors/astradb.py
CHANGED

@@ -19,6 +19,7 @@ from unstructured_ingest.error import (
 )
 from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.constants import RECORD_ID_LABEL
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,

@@ -129,11 +130,6 @@ class AstraDBIndexerConfig(IndexerConfig):
         "numbers, and underscores."
     )
     keyspace: Optional[str] = Field(default=None, description="The Astra DB connection keyspace.")
-    namespace: Optional[str] = Field(
-        default=None,
-        description="The Astra DB connection namespace.",
-        deprecated="Please use 'keyspace' instead.",
-    )
     batch_size: int = Field(default=20, description="Number of records per batch")


@@ -147,21 +143,17 @@ class AstraDBUploaderConfig(UploaderConfig):
         "Note that the collection name must only include letters, "
         "numbers, and underscores."
     )
-    embedding_dimension: int = Field(
-        default=384, description="The dimensionality of the embeddings"
-    )
     keyspace: Optional[str] = Field(default=None, description="The Astra DB connection keyspace.")
-    namespace: Optional[str] = Field(
-        default=None,
-        description="The Astra DB connection namespace.",
-        deprecated="Please use 'keyspace' instead.",
-    )
     requested_indexing_policy: Optional[dict[str, Any]] = Field(
         default=None,
         description="The indexing policy to use for the collection.",
         examples=['{"deny": ["metadata"]}'],
     )
     batch_size: int = Field(default=20, description="Number of records per batch")
+    record_id_key: str = Field(
+        default=RECORD_ID_LABEL,
+        description="searchable key to find entries for the same record on previous runs",
+    )


 @dataclass

@@ -173,7 +165,7 @@ class AstraDBIndexer(Indexer):
         return get_astra_collection(
             connection_config=self.connection_config,
             collection_name=self.index_config.collection_name,
-            keyspace=self.index_config.keyspace
+            keyspace=self.index_config.keyspace,
         )

     def precheck(self) -> None:

@@ -223,7 +215,7 @@ class AstraDBIndexer(Indexer):
                 additional_metadata={
                     "ids": list(batch),
                     "collection_name": self.index_config.collection_name,
-                    "keyspace": self.index_config.keyspace
+                    "keyspace": self.index_config.keyspace,
                 },
             )
             yield fd

@@ -309,10 +301,11 @@ class AstraDBUploadStager(UploadStager):
         default_factory=lambda: AstraDBUploadStagerConfig()
     )

-    def conform_dict(self, element_dict: dict) -> dict:
+    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
         return {
             "$vector": element_dict.pop("embeddings", None),
             "content": element_dict.pop("text", None),
+            RECORD_ID_LABEL: file_data.identifier,
             "metadata": element_dict,
         }

@@ -328,10 +321,15 @@ class AstraDBUploadStager(UploadStager):
             elements_contents = json.load(elements_file)
         conformed_elements = []
         for element in elements_contents:
-            conformed_elements.append(self.conform_dict(element_dict=element))
-
+            conformed_elements.append(self.conform_dict(element_dict=element, file_data=file_data))
+        output_filename_path = Path(output_filename)
+        if output_filename_path.suffix == ".json":
+            output_path = Path(output_dir) / output_filename_path
+        else:
+            output_path = Path(output_dir) / output_filename_path.with_suffix(".json")
+        output_path.parent.mkdir(parents=True, exist_ok=True)
         with open(output_path, "w") as output_file:
-            json.dump(conformed_elements, output_file)
+            json.dump(conformed_elements, output_file, indent=2)
         return output_path


@@ -346,7 +344,7 @@ class AstraDBUploader(Uploader):
             get_astra_collection(
                 connection_config=self.connection_config,
                 collection_name=self.upload_config.collection_name,
-                keyspace=self.upload_config.keyspace
+                keyspace=self.upload_config.keyspace,
             )
         except Exception as e:
             logger.error(f"Failed to validate connection {e}", exc_info=True)

@@ -357,7 +355,19 @@ class AstraDBUploader(Uploader):
         return get_astra_collection(
             connection_config=self.connection_config,
             collection_name=self.upload_config.collection_name,
-            keyspace=self.upload_config.keyspace
+            keyspace=self.upload_config.keyspace,
+        )
+
+    def delete_by_record_id(self, collection: "AstraDBCollection", file_data: FileData):
+        logger.debug(
+            f"deleting records from collection {collection.name} "
+            f"with {self.upload_config.record_id_key} "
+            f"set to {file_data.identifier}"
+        )
+        delete_filter = {self.upload_config.record_id_key: {"$eq": file_data.identifier}}
+        delete_resp = collection.delete_many(filter=delete_filter)
+        logger.debug(
+            f"deleted {delete_resp.deleted_count} records from collection {collection.name}"
         )

     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:

@@ -371,6 +381,8 @@ class AstraDBUploader(Uploader):
         astra_db_batch_size = self.upload_config.batch_size
         collection = self.get_collection()

+        self.delete_by_record_id(collection=collection, file_data=file_data)
+
         for chunk in batch_generator(elements_dict, astra_db_batch_size):
             collection.insert_many(chunk)
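The AstraDB stager now stamps every staged document with RECORD_ID_LABEL set to file_data.identifier, and the uploader calls delete_by_record_id before insert_many, so re-ingesting the same source file replaces its previously written rows instead of appending duplicates. A minimal sketch of that replace-on-rerun pattern against an Astra DB collection handle (the helper name and the literal label value are assumptions; the delete filter and batching mirror the diff above):

# Sketch only: `collection` is an astrapy collection handle, and the literal value of
# RECORD_ID_LABEL is assumed from the "record_id eq ..." filter used by the Azure
# connector below.
RECORD_ID_LABEL = "record_id"


def replace_record(collection, record_id: str, docs: list[dict], batch_size: int = 20) -> None:
    # Drop whatever this record produced on a previous run...
    collection.delete_many(filter={RECORD_ID_LABEL: {"$eq": record_id}})
    # ...then insert the freshly staged documents (each already stamped with the label).
    for start in range(0, len(docs), batch_size):
        collection.insert_many(docs[start : start + batch_size])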
unstructured_ingest/v2/processes/connectors/{azure_cognitive_search.py → azure_ai_search.py}
RENAMED

@@ -9,6 +9,7 @@ from pydantic import Field, Secret
 from unstructured_ingest.error import DestinationConnectionError, WriteError
 from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.constants import RECORD_ID_LABEL
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,

@@ -26,18 +27,18 @@ from unstructured_ingest.v2.processes.connectors.utils import parse_datetime

 if TYPE_CHECKING:
     from azure.search.documents import SearchClient
+    from azure.search.documents.indexes import SearchIndexClient

+CONNECTOR_TYPE = "azure_ai_search"

-CONNECTOR_TYPE = "azure_cognitive_search"

-
-class AzureCognitiveSearchAccessConfig(AccessConfig):
-    azure_cognitive_search_key: str = Field(
+class AzureAISearchAccessConfig(AccessConfig):
+    azure_ai_search_key: str = Field(
         alias="key", description="Credential that is used for authenticating to an Azure service"
     )


-class AzureCognitiveSearchConnectionConfig(ConnectionConfig):
+class AzureAISearchConnectionConfig(ConnectionConfig):
     endpoint: str = Field(
         description="The URL endpoint of an Azure AI (Cognitive) search service. "
         "In the form of https://{{service_name}}.search.windows.net"

@@ -45,10 +46,10 @@ class AzureCognitiveSearchConnectionConfig(ConnectionConfig):
     index: str = Field(
         description="The name of the Azure AI (Cognitive) Search index to connect to."
     )
-    access_config: Secret[AzureCognitiveSearchAccessConfig]
+    access_config: Secret[AzureAISearchAccessConfig]

-    @requires_dependencies(["azure.search", "azure.core"], extras="azure-cognitive-search")
-    def
+    @requires_dependencies(["azure.search", "azure.core"], extras="azure-ai-search")
+    def get_search_client(self) -> "SearchClient":
         from azure.core.credentials import AzureKeyCredential
         from azure.search.documents import SearchClient

@@ -56,27 +57,43 @@ class AzureCognitiveSearchConnectionConfig(ConnectionConfig):
             endpoint=self.endpoint,
             index_name=self.index,
             credential=AzureKeyCredential(
-                self.access_config.get_secret_value().azure_cognitive_search_key
+                self.access_config.get_secret_value().azure_ai_search_key
+            ),
+        )
+
+    @requires_dependencies(["azure.search", "azure.core"], extras="azure-ai-search")
+    def get_search_index_client(self) -> "SearchIndexClient":
+        from azure.core.credentials import AzureKeyCredential
+        from azure.search.documents.indexes import SearchIndexClient
+
+        return SearchIndexClient(
+            endpoint=self.endpoint,
+            credential=AzureKeyCredential(
+                self.access_config.get_secret_value().azure_ai_search_key
             ),
         )


-class AzureCognitiveSearchUploadStagerConfig(UploadStagerConfig):
+class AzureAISearchUploadStagerConfig(UploadStagerConfig):
     pass


-class AzureCognitiveSearchUploaderConfig(UploaderConfig):
+class AzureAISearchUploaderConfig(UploaderConfig):
     batch_size: int = Field(default=100, description="Number of records per batch")
+    record_id_key: str = Field(
+        default=RECORD_ID_LABEL,
+        description="searchable key to find entries for the same record on previous runs",
+    )


 @dataclass
-class AzureCognitiveSearchUploadStager(UploadStager):
-    upload_stager_config: AzureCognitiveSearchUploadStagerConfig = field(
-        default_factory=lambda: AzureCognitiveSearchUploadStagerConfig()
+class AzureAISearchUploadStager(UploadStager):
+    upload_stager_config: AzureAISearchUploadStagerConfig = field(
+        default_factory=lambda: AzureAISearchUploadStagerConfig()
     )

     @staticmethod
-    def conform_dict(data: dict) -> dict:
+    def conform_dict(data: dict, file_data: FileData) -> dict:
         """
         updates the dictionary that is from each Element being converted into a dict/json
         into a dictionary that conforms to the schema expected by the

@@ -84,6 +101,7 @@ class AzureCognitiveSearchUploadStager(UploadStager):
         """

         data["id"] = str(uuid.uuid4())
+        data[RECORD_ID_LABEL] = file_data.identifier

         if points := data.get("metadata", {}).get("coordinates", {}).get("points"):
             data["metadata"]["coordinates"]["points"] = json.dumps(points)

@@ -124,6 +142,7 @@ class AzureCognitiveSearchUploadStager(UploadStager):

     def run(
         self,
+        file_data: FileData,
         elements_filepath: Path,
         output_dir: Path,
         output_filename: str,

@@ -132,23 +151,59 @@ class AzureCognitiveSearchUploadStager(UploadStager):
         with open(elements_filepath) as elements_file:
             elements_contents = json.load(elements_file)

-        conformed_elements = [
+        conformed_elements = [
+            self.conform_dict(data=element, file_data=file_data) for element in elements_contents
+        ]

         output_path = Path(output_dir) / Path(f"{output_filename}.json")
+        output_path.parent.mkdir(parents=True, exist_ok=True)
         with open(output_path, "w") as output_file:
-            json.dump(conformed_elements, output_file)
+            json.dump(conformed_elements, output_file, indent=2)
         return output_path


 @dataclass
-class AzureCognitiveSearchUploader(Uploader):
-    upload_config: AzureCognitiveSearchUploaderConfig
-    connection_config: AzureCognitiveSearchConnectionConfig
+class AzureAISearchUploader(Uploader):
+    upload_config: AzureAISearchUploaderConfig
+    connection_config: AzureAISearchConnectionConfig
     connector_type: str = CONNECTOR_TYPE

+    def query_docs(self, record_id: str, index_key: str) -> list[str]:
+        client = self.connection_config.get_search_client()
+        results = list(client.search(filter=f"record_id eq '{record_id}'", select=[index_key]))
+        return [result[index_key] for result in results]
+
+    def delete_by_record_id(self, file_data: FileData, index_key: str) -> None:
+        logger.debug(
+            f"deleting any content with metadata "
+            f"{self.upload_config.record_id_key}={file_data.identifier} "
+            f"from azure cognitive search index: {self.connection_config.index}"
+        )
+        doc_ids_to_delete = self.query_docs(record_id=file_data.identifier, index_key=index_key)
+        if not doc_ids_to_delete:
+            return
+        client: SearchClient = self.connection_config.get_search_client()
+        results = client.delete_documents(
+            documents=[{index_key: doc_id} for doc_id in doc_ids_to_delete]
+        )
+        errors = []
+        success = []
+        for result in results:
+            if result.succeeded:
+                success.append(result)
+            else:
+                errors.append(result)
+        logger.debug(f"results: {len(success)} successes, {len(errors)} failures")
+        if errors:
+            raise WriteError(
+                ", ".join(
+                    [f"[{error.status_code}] {error.error_message}" for error in errors],
+                ),
+            )
+
     @DestinationConnectionError.wrap
-    @requires_dependencies(["azure"], extras="azure-cognitive-search")
-    def write_dict(self,
+    @requires_dependencies(["azure"], extras="azure-ai-search")
+    def write_dict(self, elements_dict: list[dict[str, Any]]) -> None:
         import azure.core.exceptions

         logger.info(

@@ -156,7 +211,7 @@ class AzureCognitiveSearchUploader(Uploader):
             f"index at {self.connection_config.index}",
         )
         try:
-            results = self.connection_config.
+            results = self.connection_config.get_search_client().upload_documents(
                 documents=elements_dict
             )

@@ -174,24 +229,42 @@ class AzureCognitiveSearchUploader(Uploader):
             raise WriteError(
                 ", ".join(
                     [
-                        f"{error.azure_cognitive_search_key}: "
+                        f"{error.azure_ai_search_key}: "
                         f"[{error.status_code}] {error.error_message}"
                         for error in errors
                     ],
                 ),
             )

+    def can_delete(self) -> bool:
+        search_index_client = self.connection_config.get_search_index_client()
+        index = search_index_client.get_index(name=self.connection_config.index)
+        index_fields = index.fields
+        record_id_fields = [
+            field for field in index_fields if field.name == self.upload_config.record_id_key
+        ]
+        if not record_id_fields:
+            return False
+        record_id_field = record_id_fields[0]
+        return record_id_field.filterable
+
+    def get_index_key(self) -> str:
+        search_index_client = self.connection_config.get_search_index_client()
+        index = search_index_client.get_index(name=self.connection_config.index)
+        index_fields = index.fields
+        key_fields = [field for field in index_fields if field.key]
+        if not key_fields:
+            raise ValueError("no key field found in index fields")
+        return key_fields[0].name
+
     def precheck(self) -> None:
         try:
-            client = self.connection_config.
+            client = self.connection_config.get_search_client()
             client.get_document_count()
         except Exception as e:
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")

-    def write_dict_wrapper(self, elements_dict):
-        return self.write_dict(elements_dict=elements_dict)
-
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         with path.open("r") as file:
             elements_dict = json.load(file)

@@ -201,17 +274,21 @@ class AzureCognitiveSearchUploader(Uploader):
             f" index at {str(self.connection_config.index)}"
             f" with batch size {str(self.upload_config.batch_size)}"
         )
+        if self.can_delete():
+            index_key = self.get_index_key()
+            self.delete_by_record_id(file_data=file_data, index_key=index_key)
+        else:
+            logger.warning("criteria for deleting previous content not met, skipping")

         batch_size = self.upload_config.batch_size
-
         for chunk in batch_generator(elements_dict, batch_size):
             self.write_dict(elements_dict=chunk)  # noqa: E203


-azure_cognitive_search_destination_entry = DestinationRegistryEntry(
-    connection_config=AzureCognitiveSearchConnectionConfig,
-    uploader=AzureCognitiveSearchUploader,
-    uploader_config=AzureCognitiveSearchUploaderConfig,
-    upload_stager=AzureCognitiveSearchUploadStager,
-    upload_stager_config=AzureCognitiveSearchUploadStagerConfig,
+azure_ai_search_destination_entry = DestinationRegistryEntry(
+    connection_config=AzureAISearchConnectionConfig,
+    uploader=AzureAISearchUploader,
+    uploader_config=AzureAISearchUploaderConfig,
+    upload_stager=AzureAISearchUploadStager,
+    upload_stager_config=AzureAISearchUploadStagerConfig,
 )
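Besides the rename from azure_cognitive_search to azure_ai_search, the uploader now clears previously written content for a record before re-uploading it, but only when can_delete() finds that the index has a filterable field matching record_id_key; otherwise it logs a warning and appends. A sketch of configuring the renamed destination with the classes and fields introduced above (the endpoint, index name, and key values are placeholders, and "record_id" is the assumed default label):

# Sketch: configuring the renamed Azure AI Search destination. Class and field names
# come from the diff above; all values are placeholders.
from unstructured_ingest.v2.processes.connectors.azure_ai_search import (
    AzureAISearchAccessConfig,
    AzureAISearchConnectionConfig,
    AzureAISearchUploaderConfig,
)

connection_config = AzureAISearchConnectionConfig(
    endpoint="https://<service_name>.search.windows.net",
    index="my-index",
    access_config=AzureAISearchAccessConfig(key="<api-key>"),
)
uploader_config = AzureAISearchUploaderConfig(
    batch_size=100,
    # record_id_key must exist as a filterable field in the index for the
    # delete-by-record-id pass (can_delete) to run before each upload.
    record_id_key="record_id",
)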
unstructured_ingest/v2/processes/connectors/confluence.py
CHANGED

@@ -11,12 +11,12 @@ from unstructured_ingest.v2.interfaces import (
     ConnectionConfig,
     Downloader,
     DownloaderConfig,
+    DownloadResponse,
     FileData,
     FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
-    download_responses,
 )
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (

@@ -154,7 +154,7 @@ class ConfluenceDownloader(Downloader):
     download_config: ConfluenceDownloaderConfig = field(default_factory=ConfluenceDownloaderConfig)
     connector_type: str = CONNECTOR_TYPE

-    def run(self, file_data: FileData, **kwargs) -> download_responses:
+    def run(self, file_data: FileData, **kwargs) -> DownloadResponse:
         doc_id = file_data.identifier
         try:
             client = self.connection_config.get_client()
unstructured_ingest/v2/processes/connectors/couchbase.py
CHANGED

@@ -205,6 +205,7 @@ class CouchbaseIndexer(Indexer):
             yield FileData(
                 identifier=identified,
                 connector_type=CONNECTOR_TYPE,
+                doc_type="batch",
                 metadata=FileDataSourceMetadata(
                     url=f"{self.connection_config.connection_string}/"
                     f"{self.connection_config.bucket}",
unstructured_ingest/v2/processes/connectors/delta_table.py
CHANGED

@@ -4,6 +4,7 @@ from dataclasses import dataclass, field
 from multiprocessing import Process
 from pathlib import Path
 from typing import Any, Optional
+from urllib.parse import urlparse

 import pandas as pd
 from pydantic import Field, Secret

@@ -94,7 +95,7 @@ class DeltaTableUploader(Uploader):
     connection_config: DeltaTableConnectionConfig
     connector_type: str = CONNECTOR_TYPE

-    @requires_dependencies(["
+    @requires_dependencies(["boto3"], extras="delta-table")
     def precheck(self):
         secrets = self.connection_config.access_config.get_secret_value()
         if (

@@ -102,13 +103,24 @@ class DeltaTableUploader(Uploader):
             and secrets.aws_access_key_id
             and secrets.aws_secret_access_key
         ):
-            from
+            from boto3 import client
+
+            url = urlparse(self.connection_config.table_uri)
+            bucket_name = url.netloc
+            dir_path = url.path.lstrip("/")

            try:
-
-
+                s3_client = client(
+                    "s3",
+                    aws_access_key_id=secrets.aws_access_key_id,
+                    aws_secret_access_key=secrets.aws_secret_access_key,
                )
-
+                s3_client.put_object(Bucket=bucket_name, Key=dir_path, Body=b"")
+
+                response = s3_client.get_bucket_location(Bucket=bucket_name)
+
+                if self.connection_config.aws_region != response.get("LocationConstraint"):
+                    raise ValueError("Wrong AWS Region was provided.")

            except Exception as e:
                logger.error(f"failed to validate connection: {e}", exc_info=True)
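The rewritten precheck validates S3-backed table URIs directly with boto3: it parses connection_config.table_uri, writes an empty object to the target prefix, and compares get_bucket_location against the configured aws_region. The URI split it relies on works like this (bucket and prefix values are illustrative):

# Sketch of how the precheck derives the bucket and key from an S3 table URI.
from urllib.parse import urlparse

url = urlparse("s3://my-bucket/path/to/delta-table")
bucket_name = url.netloc         # "my-bucket"
dir_path = url.path.lstrip("/")  # "path/to/delta-table"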
unstructured_ingest/v2/processes/connectors/elasticsearch.py
CHANGED

@@ -191,6 +191,7 @@ class ElasticsearchIndexer(Indexer):
             yield FileData(
                 identifier=identified,
                 connector_type=CONNECTOR_TYPE,
+                doc_type="batch",
                 metadata=FileDataSourceMetadata(
                     url=f"{self.connection_config.hosts[0]}/{self.index_config.index_name}",
                     date_processed=str(time()),
unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py
CHANGED

@@ -1,6 +1,9 @@
 from __future__ import annotations

+import os
 import random
+import shutil
+import tempfile
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional, TypeVar

@@ -207,12 +210,35 @@ class FsspecDownloader(Downloader):
             **self.connection_config.get_access_config(),
         )

+    def handle_directory_download(self, lpath: Path) -> None:
+        # If the object's name contains certain characters (i.e. '?'), it
+        # gets downloaded into a new directory of the same name. This
+        # reconciles that with what is expected, which is to download it
+        # as a file that is not within a directory.
+        if not lpath.is_dir():
+            return
+        desired_name = lpath.name
+        files_in_dir = [file for file in lpath.iterdir() if file.is_file()]
+        if not files_in_dir:
+            raise ValueError(f"no files in {lpath}")
+        if len(files_in_dir) > 1:
+            raise ValueError(
+                "Multiple files in {}: {}".format(lpath, ", ".join([str(f) for f in files_in_dir]))
+            )
+        file = files_in_dir[0]
+        with tempfile.TemporaryDirectory() as temp_dir:
+            temp_location = os.path.join(temp_dir, desired_name)
+            shutil.copyfile(src=file, dst=temp_location)
+            shutil.rmtree(lpath)
+            shutil.move(src=temp_location, dst=lpath)
+
     def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
         download_path = self.get_download_path(file_data=file_data)
         download_path.parent.mkdir(parents=True, exist_ok=True)
         try:
             rpath = file_data.additional_metadata["original_file_path"]
             self.fs.get(rpath=rpath, lpath=download_path.as_posix())
+            self.handle_directory_download(lpath=download_path)
         except Exception as e:
             logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
             raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")

@@ -224,6 +250,7 @@ class FsspecDownloader(Downloader):
         try:
             rpath = file_data.additional_metadata["original_file_path"]
             await self.fs.get(rpath=rpath, lpath=download_path.as_posix())
+            self.handle_directory_download(lpath=download_path)
         except Exception as e:
             logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
             raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
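handle_directory_download covers an fsspec quirk: when an object key contains characters such as '?', fs.get can materialize it as a directory of the same name with the real file nested inside, instead of a plain file at the expected path. A self-contained illustration of the same reconciliation steps on a POSIX filesystem (all paths and contents are made up):

# Demonstration of the flattening behavior added above: a single file nested inside a
# directory named like the expected download path is collapsed back into a plain file.
import shutil
import tempfile
from pathlib import Path

with tempfile.TemporaryDirectory() as root:
    lpath = Path(root) / "report.pdf?version=2"            # expected download *file* path
    lpath.mkdir(parents=True)                              # ...but it arrived as a directory
    (lpath / "report.pdf?version=2").write_bytes(b"pdf bytes")

    # same reconciliation steps as handle_directory_download
    inner = next(p for p in lpath.iterdir() if p.is_file())
    tmp = Path(root) / "tmp_copy"
    shutil.copyfile(inner, tmp)
    shutil.rmtree(lpath)
    shutil.move(str(tmp), str(lpath))

    assert lpath.is_file() and lpath.read_bytes() == b"pdf bytes"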
unstructured_ingest/v2/processes/connectors/google_drive.py
CHANGED

@@ -19,12 +19,12 @@ from unstructured_ingest.v2.interfaces import (
     ConnectionConfig,
     Downloader,
     DownloaderConfig,
+    DownloadResponse,
     FileData,
     FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
-    download_responses,
 )
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry

@@ -294,7 +294,7 @@ class GoogleDriveDownloader(Downloader):
             _, downloaded = downloader.next_chunk()
         return downloaded

-    def _write_file(self, file_data: FileData, file_contents: io.BytesIO):
+    def _write_file(self, file_data: FileData, file_contents: io.BytesIO) -> DownloadResponse:
         download_path = self.get_download_path(file_data=file_data)
         download_path.parent.mkdir(parents=True, exist_ok=True)
         logger.debug(f"writing {file_data.source_identifiers.fullpath} to {download_path}")

@@ -303,7 +303,7 @@ class GoogleDriveDownloader(Downloader):
         return self.generate_download_response(file_data=file_data, download_path=download_path)

     @requires_dependencies(["googleapiclient"], extras="google-drive")
-    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
+    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
         from googleapiclient.http import MediaIoBaseDownload

         logger.debug(f"fetching file: {file_data.source_identifiers.fullpath}")
unstructured_ingest/v2/processes/connectors/kafka/__init__.py
CHANGED

@@ -1,13 +1,17 @@
 from __future__ import annotations

 from unstructured_ingest.v2.processes.connector_registry import (
+    add_destination_entry,
     add_source_entry,
 )

 from .cloud import CONNECTOR_TYPE as CLOUD_CONNECTOR
-from .cloud import kafka_cloud_source_entry
+from .cloud import kafka_cloud_destination_entry, kafka_cloud_source_entry
 from .local import CONNECTOR_TYPE as LOCAL_CONNECTOR
-from .local import kafka_local_source_entry
+from .local import kafka_local_destination_entry, kafka_local_source_entry

 add_source_entry(source_type=LOCAL_CONNECTOR, entry=kafka_local_source_entry)
+add_destination_entry(destination_type=LOCAL_CONNECTOR, entry=kafka_local_destination_entry)
+
 add_source_entry(source_type=CLOUD_CONNECTOR, entry=kafka_cloud_source_entry)
+add_destination_entry(destination_type=CLOUD_CONNECTOR, entry=kafka_cloud_destination_entry)