unstructured-ingest 0.2.1__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- test/integration/connectors/test_astradb.py +109 -0
- test/integration/connectors/test_azure_cog_search.py +233 -0
- test/integration/connectors/test_confluence.py +113 -0
- test/integration/connectors/test_kafka.py +167 -0
- test/integration/connectors/test_onedrive.py +112 -0
- test/integration/connectors/test_pinecone.py +161 -0
- test/integration/connectors/test_qdrant.py +137 -0
- test/integration/connectors/test_s3.py +23 -0
- test/integration/connectors/utils/docker.py +2 -1
- test/integration/connectors/utils/validation.py +73 -22
- test/unit/v2/__init__.py +0 -0
- test/unit/v2/chunkers/__init__.py +0 -0
- test/unit/v2/chunkers/test_chunkers.py +49 -0
- test/unit/v2/connectors/__init__.py +0 -0
- test/unit/v2/embedders/__init__.py +0 -0
- test/unit/v2/embedders/test_bedrock.py +36 -0
- test/unit/v2/embedders/test_huggingface.py +48 -0
- test/unit/v2/embedders/test_mixedbread.py +37 -0
- test/unit/v2/embedders/test_octoai.py +35 -0
- test/unit/v2/embedders/test_openai.py +35 -0
- test/unit/v2/embedders/test_togetherai.py +37 -0
- test/unit/v2/embedders/test_vertexai.py +37 -0
- test/unit/v2/embedders/test_voyageai.py +38 -0
- test/unit/v2/partitioners/__init__.py +0 -0
- test/unit/v2/partitioners/test_partitioner.py +63 -0
- test/unit/v2/utils/__init__.py +0 -0
- test/unit/v2/utils/data_generator.py +32 -0
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/cmds/__init__.py +2 -2
- unstructured_ingest/cli/cmds/{azure_cognitive_search.py → azure_ai_search.py} +9 -9
- unstructured_ingest/connector/{azure_cognitive_search.py → azure_ai_search.py} +9 -9
- unstructured_ingest/connector/kafka.py +0 -1
- unstructured_ingest/interfaces.py +7 -7
- unstructured_ingest/runner/writers/__init__.py +2 -2
- unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
- unstructured_ingest/v2/constants.py +2 -0
- unstructured_ingest/v2/processes/chunker.py +2 -2
- unstructured_ingest/v2/processes/connectors/__init__.py +16 -5
- unstructured_ingest/v2/processes/connectors/airtable.py +2 -2
- unstructured_ingest/v2/processes/connectors/astradb.py +33 -21
- unstructured_ingest/v2/processes/connectors/{azure_cognitive_search.py → azure_ai_search.py} +112 -35
- unstructured_ingest/v2/processes/connectors/confluence.py +195 -0
- unstructured_ingest/v2/processes/connectors/couchbase.py +1 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +2 -4
- unstructured_ingest/v2/processes/connectors/delta_table.py +17 -5
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +1 -0
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +28 -10
- unstructured_ingest/v2/processes/connectors/gitlab.py +267 -0
- unstructured_ingest/v2/processes/connectors/google_drive.py +3 -3
- unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
- unstructured_ingest/v2/processes/connectors/kafka/cloud.py +118 -0
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +251 -0
- unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +165 -5
- unstructured_ingest/v2/processes/connectors/outlook.py +2 -2
- unstructured_ingest/v2/processes/connectors/pinecone.py +83 -12
- unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
- unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
- unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
- unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +168 -0
- unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
- unstructured_ingest/v2/processes/connectors/sharepoint.py +3 -2
- unstructured_ingest/v2/processes/connectors/slack.py +2 -2
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +16 -8
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +3 -1
- unstructured_ingest/v2/processes/connectors/sql/sql.py +2 -4
- unstructured_ingest/v2/processes/partitioner.py +14 -3
- unstructured_ingest/v2/unstructured_api.py +24 -10
- {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/METADATA +17 -16
- {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/RECORD +77 -41
- unstructured_ingest/runner/writers/azure_cognitive_search.py +0 -24
- /test/integration/embedders/{togetherai.py → test_togetherai.py} +0 -0
- /test/unit/{test_interfaces_v2.py → v2/test_interfaces.py} +0 -0
- /test/unit/{test_utils_v2.py → v2/test_utils.py} +0 -0
- {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/__init__.py
CHANGED
@@ -2,6 +2,8 @@ from __future__ import annotations
 
 import unstructured_ingest.v2.processes.connectors.databricks  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.fsspec  # noqa: F401
+import unstructured_ingest.v2.processes.connectors.kafka  # noqa: F401
+import unstructured_ingest.v2.processes.connectors.qdrant  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.sql  # noqa: F401
 from unstructured_ingest.v2.processes.connector_registry import (
     add_destination_entry,
@@ -12,16 +14,20 @@ from .airtable import CONNECTOR_TYPE as AIRTABLE_CONNECTOR_TYPE
 from .airtable import airtable_source_entry
 from .astradb import CONNECTOR_TYPE as ASTRA_DB_CONNECTOR_TYPE
 from .astradb import astra_db_destination_entry, astra_db_source_entry
-from .azure_cognitive_search import CONNECTOR_TYPE as AZURE_COGNITIVE_SEARCH_CONNECTOR_TYPE
-from .azure_cognitive_search import azure_cognitive_search_destination_entry
+from .azure_ai_search import CONNECTOR_TYPE as AZURE_AI_SEARCH_CONNECTOR_TYPE
+from .azure_ai_search import azure_ai_search_destination_entry
 from .chroma import CONNECTOR_TYPE as CHROMA_CONNECTOR_TYPE
 from .chroma import chroma_destination_entry
+from .confluence import CONNECTOR_TYPE as CONFLUENCE_CONNECTOR_TYPE
+from .confluence import confluence_source_entry
 from .couchbase import CONNECTOR_TYPE as COUCHBASE_CONNECTOR_TYPE
 from .couchbase import couchbase_destination_entry, couchbase_source_entry
 from .delta_table import CONNECTOR_TYPE as DELTA_TABLE_CONNECTOR_TYPE
 from .delta_table import delta_table_destination_entry
 from .elasticsearch import CONNECTOR_TYPE as ELASTICSEARCH_CONNECTOR_TYPE
 from .elasticsearch import elasticsearch_destination_entry, elasticsearch_source_entry
+from .gitlab import CONNECTOR_TYPE as GITLAB_CONNECTOR_TYPE
+from .gitlab import gitlab_source_entry
 from .google_drive import CONNECTOR_TYPE as GOOGLE_DRIVE_CONNECTOR_TYPE
 from .google_drive import google_drive_source_entry
 from .kdbai import CONNECTOR_TYPE as KDBAI_CONNECTOR_TYPE
@@ -33,7 +39,7 @@ from .milvus import milvus_destination_entry
 from .mongodb import CONNECTOR_TYPE as MONGODB_CONNECTOR_TYPE
 from .mongodb import mongodb_destination_entry, mongodb_source_entry
 from .onedrive import CONNECTOR_TYPE as ONEDRIVE_CONNECTOR_TYPE
-from .onedrive import onedrive_source_entry
+from .onedrive import onedrive_destination_entry, onedrive_source_entry
 from .opensearch import CONNECTOR_TYPE as OPENSEARCH_CONNECTOR_TYPE
 from .opensearch import opensearch_destination_entry, opensearch_source_entry
 from .outlook import CONNECTOR_TYPE as OUTLOOK_CONNECTOR_TYPE
@@ -72,6 +78,7 @@ add_source_entry(source_type=LOCAL_CONNECTOR_TYPE, entry=local_source_entry)
 add_destination_entry(destination_type=LOCAL_CONNECTOR_TYPE, entry=local_destination_entry)
 
 add_source_entry(source_type=ONEDRIVE_CONNECTOR_TYPE, entry=onedrive_source_entry)
+add_destination_entry(destination_type=ONEDRIVE_CONNECTOR_TYPE, entry=onedrive_destination_entry)
 
 add_source_entry(source_type=OPENSEARCH_CONNECTOR_TYPE, entry=opensearch_source_entry)
 add_destination_entry(
@@ -90,8 +97,8 @@ add_source_entry(source_type=SHAREPOINT_CONNECTOR_TYPE, entry=sharepoint_source_entry)
 
 add_destination_entry(destination_type=MILVUS_CONNECTOR_TYPE, entry=milvus_destination_entry)
 add_destination_entry(
-    destination_type=AZURE_COGNITIVE_SEARCH_CONNECTOR_TYPE,
-    entry=azure_cognitive_search_destination_entry,
+    destination_type=AZURE_AI_SEARCH_CONNECTOR_TYPE,
+    entry=azure_ai_search_destination_entry,
 )
 
 add_destination_entry(destination_type=KDBAI_CONNECTOR_TYPE, entry=kdbai_destination_entry)
@@ -99,4 +106,8 @@ add_source_entry(source_type=AIRTABLE_CONNECTOR_TYPE, entry=airtable_source_entry)
 
 add_source_entry(source_type=OUTLOOK_CONNECTOR_TYPE, entry=outlook_source_entry)
 
+add_source_entry(source_type=GITLAB_CONNECTOR_TYPE, entry=gitlab_source_entry)
+
 add_source_entry(source_type=SLACK_CONNECTOR_TYPE, entry=slack_source_entry)
+
+add_source_entry(source_type=CONFLUENCE_CONNECTOR_TYPE, entry=confluence_source_entry)
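
The block above is the whole integration surface for a connector: build a registry entry, then register it at import time. A minimal sketch of the same pattern, reusing the real classes from the new Confluence connector (the "confluence-copy" type name is an arbitrary illustration, not something this release registers):

# Hedged sketch: wiring a source connector through the same registry API this
# release uses for gitlab and confluence. The classes are the real ones from
# the new confluence module; only the "confluence-copy" name is made up.
from unstructured_ingest.v2.processes.connector_registry import (
    SourceRegistryEntry,
    add_source_entry,
)
from unstructured_ingest.v2.processes.connectors.confluence import (
    ConfluenceConnectionConfig,
    ConfluenceDownloader,
    ConfluenceDownloaderConfig,
    ConfluenceIndexer,
    ConfluenceIndexerConfig,
)

# Same shape as confluence_source_entry in connectors/confluence.py below
entry = SourceRegistryEntry(
    connection_config=ConfluenceConnectionConfig,
    indexer_config=ConfluenceIndexerConfig,
    indexer=ConfluenceIndexer,
    downloader_config=ConfluenceDownloaderConfig,
    downloader=ConfluenceDownloader,
)
add_source_entry(source_type="confluence-copy", entry=entry)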
unstructured_ingest/v2/processes/connectors/airtable.py
CHANGED
@@ -12,11 +12,11 @@ from unstructured_ingest.v2.interfaces import (
     ConnectionConfig,
     Downloader,
     DownloaderConfig,
+    DownloadResponse,
     FileData,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
-    download_responses,
 )
 from unstructured_ingest.v2.processes.connector_registry import (
     SourceRegistryEntry,
@@ -214,7 +214,7 @@ class AirtableDownloader(Downloader):
             row_dict.update(table_row["fields"])
         return row_dict
 
-    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
+    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
         table_meta = AirtableTableMeta.model_validate(file_data.additional_metadata)
         table_contents = self.get_table_contents(table_meta=table_meta)
         df = pandas.DataFrame.from_dict(
unstructured_ingest/v2/processes/connectors/astradb.py
CHANGED
@@ -19,6 +19,7 @@ from unstructured_ingest.error import (
 )
 from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.constants import RECORD_ID_LABEL
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
@@ -129,11 +130,6 @@ class AstraDBIndexerConfig(IndexerConfig):
         "numbers, and underscores."
     )
     keyspace: Optional[str] = Field(default=None, description="The Astra DB connection keyspace.")
-    namespace: Optional[str] = Field(
-        default=None,
-        description="The Astra DB connection namespace.",
-        deprecated="Please use 'keyspace' instead.",
-    )
     batch_size: int = Field(default=20, description="Number of records per batch")
 
 
@@ -147,21 +143,17 @@ class AstraDBUploaderConfig(UploaderConfig):
         "Note that the collection name must only include letters, "
         "numbers, and underscores."
     )
-    embedding_dimension: int = Field(
-        default=384, description="The dimensionality of the embeddings"
-    )
     keyspace: Optional[str] = Field(default=None, description="The Astra DB connection keyspace.")
-    namespace: Optional[str] = Field(
-        default=None,
-        description="The Astra DB connection namespace.",
-        deprecated="Please use 'keyspace' instead.",
-    )
     requested_indexing_policy: Optional[dict[str, Any]] = Field(
         default=None,
         description="The indexing policy to use for the collection.",
         examples=['{"deny": ["metadata"]}'],
     )
     batch_size: int = Field(default=20, description="Number of records per batch")
+    record_id_key: str = Field(
+        default=RECORD_ID_LABEL,
+        description="searchable key to find entries for the same record on previous runs",
+    )
 
 
 @dataclass
@@ -173,7 +165,7 @@ class AstraDBIndexer(Indexer):
         return get_astra_collection(
             connection_config=self.connection_config,
             collection_name=self.index_config.collection_name,
-            keyspace=self.index_config.keyspace
+            keyspace=self.index_config.keyspace,
         )
 
     def precheck(self) -> None:
@@ -223,7 +215,7 @@ class AstraDBIndexer(Indexer):
                 additional_metadata={
                     "ids": list(batch),
                     "collection_name": self.index_config.collection_name,
-                    "keyspace": self.index_config.keyspace
+                    "keyspace": self.index_config.keyspace,
                 },
             )
             yield fd
@@ -309,10 +301,11 @@ class AstraDBUploadStager(UploadStager):
         default_factory=lambda: AstraDBUploadStagerConfig()
     )
 
-    def conform_dict(self, element_dict: dict) -> dict:
+    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
         return {
             "$vector": element_dict.pop("embeddings", None),
             "content": element_dict.pop("text", None),
+            RECORD_ID_LABEL: file_data.identifier,
             "metadata": element_dict,
         }
 
@@ -328,10 +321,15 @@ class AstraDBUploadStager(UploadStager):
             elements_contents = json.load(elements_file)
         conformed_elements = []
         for element in elements_contents:
-            conformed_elements.append(self.conform_dict(element_dict=element))
-
+            conformed_elements.append(self.conform_dict(element_dict=element, file_data=file_data))
+        output_filename_path = Path(output_filename)
+        if output_filename_path.suffix == ".json":
+            output_path = Path(output_dir) / output_filename_path
+        else:
+            output_path = Path(output_dir) / output_filename_path.with_suffix(".json")
+        output_path.parent.mkdir(parents=True, exist_ok=True)
         with open(output_path, "w") as output_file:
-            json.dump(conformed_elements, output_file)
+            json.dump(conformed_elements, output_file, indent=2)
         return output_path
 
 
@@ -346,7 +344,7 @@ class AstraDBUploader(Uploader):
             get_astra_collection(
                 connection_config=self.connection_config,
                 collection_name=self.upload_config.collection_name,
-                keyspace=self.upload_config.keyspace
+                keyspace=self.upload_config.keyspace,
             )
         except Exception as e:
             logger.error(f"Failed to validate connection {e}", exc_info=True)
@@ -357,7 +355,19 @@ class AstraDBUploader(Uploader):
         return get_astra_collection(
             connection_config=self.connection_config,
             collection_name=self.upload_config.collection_name,
-            keyspace=self.upload_config.keyspace
+            keyspace=self.upload_config.keyspace,
+        )
+
+    def delete_by_record_id(self, collection: "AstraDBCollection", file_data: FileData):
+        logger.debug(
+            f"deleting records from collection {collection.name} "
+            f"with {self.upload_config.record_id_key} "
+            f"set to {file_data.identifier}"
+        )
+        delete_filter = {self.upload_config.record_id_key: {"$eq": file_data.identifier}}
+        delete_resp = collection.delete_many(filter=delete_filter)
+        logger.debug(
+            f"deleted {delete_resp.deleted_count} records from collection {collection.name}"
         )
 
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
@@ -371,6 +381,8 @@ class AstraDBUploader(Uploader):
         astra_db_batch_size = self.upload_config.batch_size
         collection = self.get_collection()
 
+        self.delete_by_record_id(collection=collection, file_data=file_data)
+
         for chunk in batch_generator(elements_dict, astra_db_batch_size):
             collection.insert_many(chunk)
 
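
The net effect of the new record_id_key plumbing: the stager tags every element with the identifier of the file it came from, and the uploader deletes rows carrying that tag before inserting, so re-running a pipeline replaces a document's rows instead of duplicating them. A condensed, hedged sketch of that delete-then-insert step (`collection` is the astrapy collection returned by get_astra_collection() in the connector; the literal value of RECORD_ID_LABEL is not shown in this diff, though the hard-coded filter in the Azure AI Search connector below suggests it is "record_id"):

# Hedged sketch of the behavior AstraDBUploader.run() now performs.
RECORD_ID_LABEL = "record_id"  # assumption: actual value lives in unstructured_ingest/v2/constants.py


def upload_idempotently(collection, file_id: str, elements: list[dict], batch_size: int = 20) -> None:
    # Drop any rows written for this file on a previous run...
    resp = collection.delete_many(filter={RECORD_ID_LABEL: {"$eq": file_id}})
    print(f"deleted {resp.deleted_count} stale rows")
    # ...then insert the freshly staged elements in batches, mirroring
    # batch_generator() + insert_many() in the connector.
    for start in range(0, len(elements), batch_size):
        collection.insert_many(elements[start : start + batch_size])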
unstructured_ingest/v2/processes/connectors/{azure_cognitive_search.py → azure_ai_search.py}
RENAMED
@@ -9,6 +9,7 @@ from pydantic import Field, Secret
 from unstructured_ingest.error import DestinationConnectionError, WriteError
 from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.constants import RECORD_ID_LABEL
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
@@ -26,18 +27,18 @@ from unstructured_ingest.v2.processes.connectors.utils import parse_datetime
 
 if TYPE_CHECKING:
     from azure.search.documents import SearchClient
+    from azure.search.documents.indexes import SearchIndexClient
 
+CONNECTOR_TYPE = "azure_ai_search"
 
-CONNECTOR_TYPE = "azure_cognitive_search"
 
-
-class AzureCognitiveSearchAccessConfig(AccessConfig):
-    azure_cognitive_search_key: str = Field(
+class AzureAISearchAccessConfig(AccessConfig):
+    azure_ai_search_key: str = Field(
         alias="key", description="Credential that is used for authenticating to an Azure service"
     )
 
 
-class AzureCognitiveSearchConnectionConfig(ConnectionConfig):
+class AzureAISearchConnectionConfig(ConnectionConfig):
     endpoint: str = Field(
         description="The URL endpoint of an Azure AI (Cognitive) search service. "
         "In the form of https://{{service_name}}.search.windows.net"
@@ -45,10 +46,10 @@ class AzureAISearchConnectionConfig(ConnectionConfig):
     index: str = Field(
         description="The name of the Azure AI (Cognitive) Search index to connect to."
     )
-    access_config: Secret[AzureCognitiveSearchAccessConfig]
+    access_config: Secret[AzureAISearchAccessConfig]
 
-    @requires_dependencies(["azure.search", "azure.core"], extras="azure-cognitive-search")
-    def generate_client(self) -> "SearchClient":
+    @requires_dependencies(["azure.search", "azure.core"], extras="azure-ai-search")
+    def get_search_client(self) -> "SearchClient":
         from azure.core.credentials import AzureKeyCredential
         from azure.search.documents import SearchClient
 
@@ -56,27 +57,43 @@ class AzureAISearchConnectionConfig(ConnectionConfig):
             endpoint=self.endpoint,
             index_name=self.index,
             credential=AzureKeyCredential(
-                self.access_config.get_secret_value().azure_cognitive_search_key
+                self.access_config.get_secret_value().azure_ai_search_key
+            ),
+        )
+
+    @requires_dependencies(["azure.search", "azure.core"], extras="azure-ai-search")
+    def get_search_index_client(self) -> "SearchIndexClient":
+        from azure.core.credentials import AzureKeyCredential
+        from azure.search.documents.indexes import SearchIndexClient
+
+        return SearchIndexClient(
+            endpoint=self.endpoint,
+            credential=AzureKeyCredential(
+                self.access_config.get_secret_value().azure_ai_search_key
             ),
         )
 
 
-class AzureCognitiveSearchUploadStagerConfig(UploadStagerConfig):
+class AzureAISearchUploadStagerConfig(UploadStagerConfig):
     pass
 
 
-class AzureCognitiveSearchUploaderConfig(UploaderConfig):
+class AzureAISearchUploaderConfig(UploaderConfig):
     batch_size: int = Field(default=100, description="Number of records per batch")
+    record_id_key: str = Field(
+        default=RECORD_ID_LABEL,
+        description="searchable key to find entries for the same record on previous runs",
+    )
 
 
 @dataclass
-class AzureCognitiveSearchUploadStager(UploadStager):
-    upload_stager_config: AzureCognitiveSearchUploadStagerConfig = field(
-        default_factory=lambda: AzureCognitiveSearchUploadStagerConfig()
+class AzureAISearchUploadStager(UploadStager):
+    upload_stager_config: AzureAISearchUploadStagerConfig = field(
        default_factory=lambda: AzureAISearchUploadStagerConfig()
     )
 
     @staticmethod
-    def conform_dict(data: dict) -> dict:
+    def conform_dict(data: dict, file_data: FileData) -> dict:
         """
         updates the dictionary that is from each Element being converted into a dict/json
         into a dictionary that conforms to the schema expected by the
@@ -84,6 +101,7 @@ class AzureAISearchUploadStager(UploadStager):
         """
 
         data["id"] = str(uuid.uuid4())
+        data[RECORD_ID_LABEL] = file_data.identifier
 
         if points := data.get("metadata", {}).get("coordinates", {}).get("points"):
             data["metadata"]["coordinates"]["points"] = json.dumps(points)
@@ -124,6 +142,7 @@ class AzureAISearchUploadStager(UploadStager):
 
     def run(
         self,
+        file_data: FileData,
         elements_filepath: Path,
         output_dir: Path,
         output_filename: str,
@@ -132,23 +151,59 @@ class AzureAISearchUploadStager(UploadStager):
         with open(elements_filepath) as elements_file:
             elements_contents = json.load(elements_file)
 
-        conformed_elements = [self.conform_dict(data=element) for element in elements_contents]
+        conformed_elements = [
+            self.conform_dict(data=element, file_data=file_data) for element in elements_contents
+        ]
 
         output_path = Path(output_dir) / Path(f"{output_filename}.json")
+        output_path.parent.mkdir(parents=True, exist_ok=True)
         with open(output_path, "w") as output_file:
-            json.dump(conformed_elements, output_file)
+            json.dump(conformed_elements, output_file, indent=2)
         return output_path
 
 
 @dataclass
-class AzureCognitiveSearchUploader(Uploader):
-    upload_config: AzureCognitiveSearchUploaderConfig
-    connection_config: AzureCognitiveSearchConnectionConfig
+class AzureAISearchUploader(Uploader):
+    upload_config: AzureAISearchUploaderConfig
+    connection_config: AzureAISearchConnectionConfig
     connector_type: str = CONNECTOR_TYPE
 
+    def query_docs(self, record_id: str, index_key: str) -> list[str]:
+        client = self.connection_config.get_search_client()
+        results = list(client.search(filter=f"record_id eq '{record_id}'", select=[index_key]))
+        return [result[index_key] for result in results]
+
+    def delete_by_record_id(self, file_data: FileData, index_key: str) -> None:
+        logger.debug(
+            f"deleting any content with metadata "
+            f"{self.upload_config.record_id_key}={file_data.identifier} "
+            f"from azure cognitive search index: {self.connection_config.index}"
+        )
+        doc_ids_to_delete = self.query_docs(record_id=file_data.identifier, index_key=index_key)
+        if not doc_ids_to_delete:
+            return
+        client: SearchClient = self.connection_config.get_search_client()
+        results = client.delete_documents(
+            documents=[{index_key: doc_id} for doc_id in doc_ids_to_delete]
+        )
+        errors = []
+        success = []
+        for result in results:
+            if result.succeeded:
+                success.append(result)
+            else:
+                errors.append(result)
+        logger.debug(f"results: {len(success)} successes, {len(errors)} failures")
+        if errors:
+            raise WriteError(
+                ", ".join(
+                    [f"[{error.status_code}] {error.error_message}" for error in errors],
                ),
            )
+
     @DestinationConnectionError.wrap
-    @requires_dependencies(["azure"], extras="azure-cognitive-search")
-    def write_dict(self, *args, elements_dict: list[dict[str, Any]], **kwargs) -> None:
+    @requires_dependencies(["azure"], extras="azure-ai-search")
+    def write_dict(self, elements_dict: list[dict[str, Any]]) -> None:
         import azure.core.exceptions
 
         logger.info(
@@ -156,7 +211,7 @@ class AzureAISearchUploader(Uploader):
             f"index at {self.connection_config.index}",
         )
         try:
-            results = self.connection_config.generate_client().upload_documents(
+            results = self.connection_config.get_search_client().upload_documents(
                 documents=elements_dict
             )
 
@@ -174,24 +229,42 @@ class AzureAISearchUploader(Uploader):
             raise WriteError(
                 ", ".join(
                     [
-                        f"{error.azure_cognitive_search_key}: "
+                        f"{error.azure_ai_search_key}: "
                         f"[{error.status_code}] {error.error_message}"
                         for error in errors
                     ],
                 ),
             )
 
+    def can_delete(self) -> bool:
+        search_index_client = self.connection_config.get_search_index_client()
+        index = search_index_client.get_index(name=self.connection_config.index)
+        index_fields = index.fields
+        record_id_fields = [
+            field for field in index_fields if field.name == self.upload_config.record_id_key
+        ]
+        if not record_id_fields:
+            return False
+        record_id_field = record_id_fields[0]
+        return record_id_field.filterable
+
+    def get_index_key(self) -> str:
+        search_index_client = self.connection_config.get_search_index_client()
+        index = search_index_client.get_index(name=self.connection_config.index)
+        index_fields = index.fields
+        key_fields = [field for field in index_fields if field.key]
+        if not key_fields:
+            raise ValueError("no key field found in index fields")
+        return key_fields[0].name
+
     def precheck(self) -> None:
         try:
-            client = self.connection_config.generate_client()
+            client = self.connection_config.get_search_client()
             client.get_document_count()
         except Exception as e:
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")
 
-    def write_dict_wrapper(self, elements_dict):
-        return self.write_dict(elements_dict=elements_dict)
-
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         with path.open("r") as file:
             elements_dict = json.load(file)
@@ -201,17 +274,21 @@ class AzureAISearchUploader(Uploader):
             f" index at {str(self.connection_config.index)}"
             f" with batch size {str(self.upload_config.batch_size)}"
         )
+        if self.can_delete():
+            index_key = self.get_index_key()
+            self.delete_by_record_id(file_data=file_data, index_key=index_key)
+        else:
+            logger.warning("criteria for deleting previous content not met, skipping")
 
         batch_size = self.upload_config.batch_size
-
         for chunk in batch_generator(elements_dict, batch_size):
             self.write_dict(elements_dict=chunk)  # noqa: E203
 
 
-azure_cognitive_search_destination_entry = DestinationRegistryEntry(
-    connection_config=AzureCognitiveSearchConnectionConfig,
-    uploader=AzureCognitiveSearchUploader,
-    uploader_config=AzureCognitiveSearchUploaderConfig,
-    upload_stager=AzureCognitiveSearchUploadStager,
-    upload_stager_config=AzureCognitiveSearchUploadStagerConfig,
+azure_ai_search_destination_entry = DestinationRegistryEntry(
+    connection_config=AzureAISearchConnectionConfig,
+    uploader=AzureAISearchUploader,
+    uploader_config=AzureAISearchUploaderConfig,
+    upload_stager=AzureAISearchUploadStager,
+    upload_stager_config=AzureAISearchUploadStagerConfig,
 )
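
Deletion of stale documents is gated on the index schema here: can_delete() only returns True when the index has a field named after record_id_key that is filterable, and get_index_key() locates the index's key field to address documents for deletion. A hedged sketch of an index definition that satisfies both checks, using the azure-search-documents SDK (service endpoint, key, and index name are placeholders; the "record_id" field name matches the literal used in query_docs() above):

# Hedged sketch: an index that passes the new can_delete() check.
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SearchableField,
    SearchFieldDataType,
    SearchIndex,
    SimpleField,
)

fields = [
    # get_index_key() locates this field via field.key
    SimpleField(name="id", type=SearchFieldDataType.String, key=True),
    # can_delete() requires this field to exist and be filterable
    SimpleField(name="record_id", type=SearchFieldDataType.String, filterable=True),
    SearchableField(name="content", type=SearchFieldDataType.String),
]

client = SearchIndexClient(
    endpoint="https://myservice.search.windows.net",  # placeholder
    credential=AzureKeyCredential("<admin-key>"),  # placeholder
)
client.create_index(SearchIndex(name="my-index", fields=fields))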
unstructured_ingest/v2/processes/connectors/confluence.py
ADDED
@@ -0,0 +1,195 @@
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import TYPE_CHECKING, Generator, List, Optional
+
+from pydantic import Field, Secret
+
+from unstructured_ingest.error import SourceConnectionError
+from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.interfaces import (
+    AccessConfig,
+    ConnectionConfig,
+    Downloader,
+    DownloaderConfig,
+    DownloadResponse,
+    FileData,
+    FileDataSourceMetadata,
+    Indexer,
+    IndexerConfig,
+    SourceIdentifiers,
+)
+from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.processes.connector_registry import (
+    SourceRegistryEntry,
+)
+
+if TYPE_CHECKING:
+    from atlassian import Confluence
+
+CONNECTOR_TYPE = "confluence"
+
+
+class ConfluenceAccessConfig(AccessConfig):
+    api_token: str = Field(description="Confluence API token")
+
+
+class ConfluenceConnectionConfig(ConnectionConfig):
+    url: str = Field(description="URL of the Confluence instance")
+    user_email: str = Field(description="User email for authentication")
+    access_config: Secret[ConfluenceAccessConfig] = Field(
+        description="Access configuration for Confluence"
+    )
+
+    @requires_dependencies(["atlassian"], extras="confluence")
+    def get_client(self) -> "Confluence":
+        from atlassian import Confluence
+
+        access_configs = self.access_config.get_secret_value()
+        return Confluence(
+            url=self.url,
+            username=self.user_email,
+            password=access_configs.api_token,
+        )
+
+
+class ConfluenceIndexerConfig(IndexerConfig):
+    max_num_of_spaces: int = Field(500, description="Maximum number of spaces to index")
+    max_num_of_docs_from_each_space: int = Field(
+        100, description="Maximum number of documents to fetch from each space"
+    )
+    spaces: Optional[List[str]] = Field(None, description="List of specific space keys to index")
+
+
+@dataclass
+class ConfluenceIndexer(Indexer):
+    connection_config: ConfluenceConnectionConfig
+    index_config: ConfluenceIndexerConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    def precheck(self) -> bool:
+        try:
+
+            # Attempt to retrieve a list of spaces with limit=1.
+            # This should only succeed if all creds are valid
+            client = self.connection_config.get_client()
+            client.get_all_spaces(limit=1)
+            logger.info("Connection to Confluence successful.")
+            return True
+        except Exception as e:
+            logger.error(f"Failed to connect to Confluence: {e}", exc_info=True)
+            raise SourceConnectionError(f"Failed to connect to Confluence: {e}")
+
+    def _get_space_ids(self) -> List[str]:
+        spaces = self.index_config.spaces
+        if spaces:
+            return spaces
+        else:
+            client = self.connection_config.get_client()
+            all_spaces = client.get_all_spaces(limit=self.index_config.max_num_of_spaces)
+            space_ids = [space["key"] for space in all_spaces["results"]]
+            return space_ids
+
+    def _get_docs_ids_within_one_space(self, space_id: str) -> List[dict]:
+        client = self.connection_config.get_client()
+        pages = client.get_all_pages_from_space(
+            space=space_id,
+            start=0,
+            limit=self.index_config.max_num_of_docs_from_each_space,
+            expand=None,
+            content_type="page",
+            status=None,
+        )
+        doc_ids = [{"space_id": space_id, "doc_id": page["id"]} for page in pages]
+        return doc_ids
+
+    def run(self) -> Generator[FileData, None, None]:
+        from time import time
+
+        space_ids = self._get_space_ids()
+        for space_id in space_ids:
+            doc_ids = self._get_docs_ids_within_one_space(space_id)
+            for doc in doc_ids:
+                doc_id = doc["doc_id"]
+                # Build metadata
+                metadata = FileDataSourceMetadata(
+                    date_processed=str(time()),
+                    url=f"{self.connection_config.url}/pages/{doc_id}",
+                    record_locator={
+                        "space_id": space_id,
+                        "document_id": doc_id,
+                    },
+                )
+                additional_metadata = {
+                    "space_id": space_id,
+                    "document_id": doc_id,
+                }
+
+                # Construct relative path and filename
+                filename = f"{doc_id}.html"
+                relative_path = str(Path(space_id) / filename)
+
+                source_identifiers = SourceIdentifiers(
+                    filename=filename,
+                    fullpath=relative_path,
+                    rel_path=relative_path,
+                )
+
+                file_data = FileData(
+                    identifier=doc_id,
+                    connector_type=self.connector_type,
+                    metadata=metadata,
+                    additional_metadata=additional_metadata,
+                    source_identifiers=source_identifiers,
+                )
+                yield file_data
+
+
+class ConfluenceDownloaderConfig(DownloaderConfig):
+    pass
+
+
+@dataclass
+class ConfluenceDownloader(Downloader):
+    connection_config: ConfluenceConnectionConfig
+    download_config: ConfluenceDownloaderConfig = field(default_factory=ConfluenceDownloaderConfig)
+    connector_type: str = CONNECTOR_TYPE
+
+    def run(self, file_data: FileData, **kwargs) -> DownloadResponse:
+        doc_id = file_data.identifier
+        try:
+            client = self.connection_config.get_client()
+            page = client.get_page_by_id(
+                page_id=doc_id,
+                expand="history.lastUpdated,version,body.view",
+            )
+        except Exception as e:
+            logger.error(f"Failed to retrieve page with ID {doc_id}: {e}", exc_info=True)
+            raise SourceConnectionError(f"Failed to retrieve page with ID {doc_id}: {e}")
+
+        if not page:
+            raise ValueError(f"Page with ID {doc_id} does not exist.")
+
+        content = page["body"]["view"]["value"]
+
+        filepath = file_data.source_identifiers.relative_path
+        download_path = Path(self.download_dir) / filepath
+        download_path.parent.mkdir(parents=True, exist_ok=True)
+        with open(download_path, "w", encoding="utf8") as f:
+            f.write(content)
+
+        # Update file_data with metadata
+        file_data.metadata.date_created = page["history"]["createdDate"]
+        file_data.metadata.date_modified = page["version"]["when"]
+        file_data.metadata.version = str(page["version"]["number"])
+        file_data.display_name = page["title"]
+
+        return self.generate_download_response(file_data=file_data, download_path=download_path)
+
+
+confluence_source_entry = SourceRegistryEntry(
+    connection_config=ConfluenceConnectionConfig,
+    indexer_config=ConfluenceIndexerConfig,
+    indexer=ConfluenceIndexer,
+    downloader_config=ConfluenceDownloaderConfig,
+    downloader=ConfluenceDownloader,
+)
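
A hedged end-to-end usage sketch for the new source connector (URL, email, token, space key, and output directory are placeholders; the class names all come from the file above):

# Hedged sketch: indexing and downloading Confluence pages with the new
# connector, outside the full pipeline runner.
from unstructured_ingest.v2.processes.connectors.confluence import (
    ConfluenceAccessConfig,
    ConfluenceConnectionConfig,
    ConfluenceDownloader,
    ConfluenceDownloaderConfig,
    ConfluenceIndexer,
    ConfluenceIndexerConfig,
)

connection = ConfluenceConnectionConfig(
    url="https://example.atlassian.net/wiki",  # placeholder instance
    user_email="user@example.com",
    access_config=ConfluenceAccessConfig(api_token="<api-token>"),
)
indexer = ConfluenceIndexer(
    connection_config=connection,
    index_config=ConfluenceIndexerConfig(spaces=["DOCS"]),  # placeholder space key
)
downloader = ConfluenceDownloader(
    connection_config=connection,
    # download_dir is assumed to be inherited from the base DownloaderConfig
    download_config=ConfluenceDownloaderConfig(download_dir="confluence-output"),
)

# Each FileData describes one page; the downloader writes it as
# <download_dir>/<space_id>/<page_id>.html and fills in page metadata.
for file_data in indexer.run():
    response = downloader.run(file_data=file_data)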