unstructured-ingest 1.2.32__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/README.md +28 -0
- unstructured_ingest/cli/__init__.py +0 -0
- unstructured_ingest/cli/base/__init__.py +4 -0
- unstructured_ingest/cli/base/cmd.py +269 -0
- unstructured_ingest/cli/base/dest.py +84 -0
- unstructured_ingest/cli/base/importer.py +34 -0
- unstructured_ingest/cli/base/src.py +75 -0
- unstructured_ingest/cli/cli.py +24 -0
- unstructured_ingest/cli/cmds.py +14 -0
- unstructured_ingest/cli/utils/__init__.py +0 -0
- unstructured_ingest/cli/utils/click.py +237 -0
- unstructured_ingest/cli/utils/model_conversion.py +222 -0
- unstructured_ingest/data_types/__init__.py +0 -0
- unstructured_ingest/data_types/entities.py +17 -0
- unstructured_ingest/data_types/file_data.py +116 -0
- unstructured_ingest/embed/__init__.py +0 -0
- unstructured_ingest/embed/azure_openai.py +63 -0
- unstructured_ingest/embed/bedrock.py +323 -0
- unstructured_ingest/embed/huggingface.py +69 -0
- unstructured_ingest/embed/interfaces.py +146 -0
- unstructured_ingest/embed/mixedbreadai.py +134 -0
- unstructured_ingest/embed/octoai.py +133 -0
- unstructured_ingest/embed/openai.py +142 -0
- unstructured_ingest/embed/togetherai.py +116 -0
- unstructured_ingest/embed/vertexai.py +109 -0
- unstructured_ingest/embed/voyageai.py +130 -0
- unstructured_ingest/error.py +156 -0
- unstructured_ingest/errors_v2.py +156 -0
- unstructured_ingest/interfaces/__init__.py +27 -0
- unstructured_ingest/interfaces/connector.py +56 -0
- unstructured_ingest/interfaces/downloader.py +90 -0
- unstructured_ingest/interfaces/indexer.py +29 -0
- unstructured_ingest/interfaces/process.py +22 -0
- unstructured_ingest/interfaces/processor.py +88 -0
- unstructured_ingest/interfaces/upload_stager.py +89 -0
- unstructured_ingest/interfaces/uploader.py +67 -0
- unstructured_ingest/logger.py +39 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/otel.py +128 -0
- unstructured_ingest/pipeline/__init__.py +0 -0
- unstructured_ingest/pipeline/interfaces.py +211 -0
- unstructured_ingest/pipeline/otel.py +32 -0
- unstructured_ingest/pipeline/pipeline.py +408 -0
- unstructured_ingest/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/pipeline/steps/chunk.py +78 -0
- unstructured_ingest/pipeline/steps/download.py +206 -0
- unstructured_ingest/pipeline/steps/embed.py +77 -0
- unstructured_ingest/pipeline/steps/filter.py +35 -0
- unstructured_ingest/pipeline/steps/index.py +86 -0
- unstructured_ingest/pipeline/steps/partition.py +77 -0
- unstructured_ingest/pipeline/steps/stage.py +65 -0
- unstructured_ingest/pipeline/steps/uncompress.py +50 -0
- unstructured_ingest/pipeline/steps/upload.py +58 -0
- unstructured_ingest/processes/__init__.py +18 -0
- unstructured_ingest/processes/chunker.py +131 -0
- unstructured_ingest/processes/connector_registry.py +69 -0
- unstructured_ingest/processes/connectors/__init__.py +129 -0
- unstructured_ingest/processes/connectors/airtable.py +238 -0
- unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
- unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
- unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
- unstructured_ingest/processes/connectors/astradb.py +592 -0
- unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
- unstructured_ingest/processes/connectors/chroma.py +193 -0
- unstructured_ingest/processes/connectors/confluence.py +527 -0
- unstructured_ingest/processes/connectors/couchbase.py +336 -0
- unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
- unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
- unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
- unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
- unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
- unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
- unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
- unstructured_ingest/processes/connectors/delta_table.py +310 -0
- unstructured_ingest/processes/connectors/discord.py +161 -0
- unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
- unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
- unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
- unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
- unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
- unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
- unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
- unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
- unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
- unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
- unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
- unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
- unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
- unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
- unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/processes/connectors/github.py +226 -0
- unstructured_ingest/processes/connectors/gitlab.py +270 -0
- unstructured_ingest/processes/connectors/google_drive.py +848 -0
- unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
- unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
- unstructured_ingest/processes/connectors/jira.py +522 -0
- unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
- unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
- unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
- unstructured_ingest/processes/connectors/kafka/local.py +103 -0
- unstructured_ingest/processes/connectors/kdbai.py +156 -0
- unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
- unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
- unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
- unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
- unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
- unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
- unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
- unstructured_ingest/processes/connectors/local.py +227 -0
- unstructured_ingest/processes/connectors/milvus.py +311 -0
- unstructured_ingest/processes/connectors/mongodb.py +389 -0
- unstructured_ingest/processes/connectors/neo4j.py +534 -0
- unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
- unstructured_ingest/processes/connectors/notion/client.py +349 -0
- unstructured_ingest/processes/connectors/notion/connector.py +350 -0
- unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
- unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
- unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
- unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
- unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
- unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
- unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
- unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
- unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
- unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
- unstructured_ingest/processes/connectors/onedrive.py +485 -0
- unstructured_ingest/processes/connectors/outlook.py +242 -0
- unstructured_ingest/processes/connectors/pinecone.py +400 -0
- unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
- unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
- unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
- unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
- unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
- unstructured_ingest/processes/connectors/redisdb.py +214 -0
- unstructured_ingest/processes/connectors/salesforce.py +307 -0
- unstructured_ingest/processes/connectors/sharepoint.py +282 -0
- unstructured_ingest/processes/connectors/slack.py +249 -0
- unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
- unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
- unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
- unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
- unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
- unstructured_ingest/processes/connectors/sql/sql.py +456 -0
- unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
- unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
- unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
- unstructured_ingest/processes/connectors/utils.py +60 -0
- unstructured_ingest/processes/connectors/vectara.py +348 -0
- unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
- unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
- unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
- unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
- unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
- unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
- unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
- unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
- unstructured_ingest/processes/embedder.py +203 -0
- unstructured_ingest/processes/filter.py +60 -0
- unstructured_ingest/processes/partitioner.py +233 -0
- unstructured_ingest/processes/uncompress.py +61 -0
- unstructured_ingest/processes/utils/__init__.py +8 -0
- unstructured_ingest/processes/utils/blob_storage.py +32 -0
- unstructured_ingest/processes/utils/logging/connector.py +365 -0
- unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
- unstructured_ingest/unstructured_api.py +140 -0
- unstructured_ingest/utils/__init__.py +5 -0
- unstructured_ingest/utils/chunking.py +56 -0
- unstructured_ingest/utils/compression.py +72 -0
- unstructured_ingest/utils/constants.py +2 -0
- unstructured_ingest/utils/data_prep.py +216 -0
- unstructured_ingest/utils/dep_check.py +78 -0
- unstructured_ingest/utils/filesystem.py +27 -0
- unstructured_ingest/utils/html.py +174 -0
- unstructured_ingest/utils/ndjson.py +52 -0
- unstructured_ingest/utils/pydantic_models.py +52 -0
- unstructured_ingest/utils/string_and_date_utils.py +74 -0
- unstructured_ingest/utils/table.py +80 -0
- unstructured_ingest/utils/tls.py +15 -0
- unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
- unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
- unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
- unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
- unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
|
@@ -0,0 +1,348 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import json
|
|
3
|
+
import uuid
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from datetime import datetime
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any, Dict, Mapping, Optional
|
|
8
|
+
|
|
9
|
+
from pydantic import Field, Secret
|
|
10
|
+
|
|
11
|
+
from unstructured_ingest.data_types.file_data import FileData
|
|
12
|
+
from unstructured_ingest.error import DestinationConnectionError, ValueError
|
|
13
|
+
from unstructured_ingest.interfaces import (
|
|
14
|
+
AccessConfig,
|
|
15
|
+
ConnectionConfig,
|
|
16
|
+
Uploader,
|
|
17
|
+
UploaderConfig,
|
|
18
|
+
UploadStager,
|
|
19
|
+
UploadStagerConfig,
|
|
20
|
+
)
|
|
21
|
+
from unstructured_ingest.logger import logger
|
|
22
|
+
from unstructured_ingest.processes.connector_registry import DestinationRegistryEntry
|
|
23
|
+
from unstructured_ingest.utils.data_prep import flatten_dict
|
|
24
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
25
|
+
|
|
26
|
+
# Root of the Vectara REST API; request helpers append "/<endpoint>" to it.
BASE_URL = "https://api.vectara.io/v2"

# Identifier stored on the uploader as its connector type.
CONNECTOR_TYPE = "vectara"
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class VectaraAccessConfig(AccessConfig):
    # OAuth2 client credentials. The uploader sends them as "client_id" and
    # "client_secret" in a client_credentials grant to obtain a JWT.
    oauth_client_id: str = Field(
        description="Client ID",
    )
    oauth_secret: str = Field(
        description="Client Secret",
    )
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class VectaraConnectionConfig(ConnectionConfig):
    # Wrapped in Secret; the uploader unwraps it with get_secret_value().
    access_config: Secret[VectaraAccessConfig]

    # Substituted into the "{}" placeholder of token_url.
    customer_id: str

    # Target corpus. The uploader's connection check resolves corpus_key from
    # corpus_name when only the name is given, and compares them when both are set.
    corpus_name: Optional[str] = None
    corpus_key: Optional[str] = None

    # OAuth2 token endpoint template; formatted with customer_id before use.
    token_url: str = (
        "https://vectara-prod-{}.auth.us-west-2.amazoncognito.com/oauth2/token"
    )
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class VectaraUploadStagerConfig(UploadStagerConfig):
    # No Vectara-specific staging options; everything comes from the base config.
    ...
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
@dataclass
class VectaraUploadStager(UploadStager):
    """Convert a JSON file of elements into a single Vectara "core" document."""

    upload_stager_config: VectaraUploadStagerConfig = field(
        default_factory=lambda: VectaraUploadStagerConfig()
    )

    @staticmethod
    def conform_dict(data: dict) -> dict:
        """
        Prepares dictionary in the format that Vectara requires.
        See more detail in https://docs.vectara.com/docs/rest-api/create-corpus-document

        The input is flattened with "-" as the key separator, the leading
        "metadata-" prefix is removed from each key, and only the keys listed in
        ``metadata_map`` are kept (renamed to the mapped name).

        Args:
            data: element dictionary to extract metadata from.

        Returns:
            A flat dictionary containing only the selected metadata fields.
        """
        metadata_map = {
            "page_number": "page_number",
            "data_source-url": "url",
            "filename": "filename",
            "filetype": "filetype",
            "last_modified": "last_modified",
            "element_id": "element_id",
        }
        prefix = "metadata-"
        flattened = flatten_dict(data, separator="-", flatten_lists=True)
        # Strip only a *leading* "metadata-". str.replace would also delete the
        # text from the middle of a key and could turn an unrelated nested key
        # into one of the mapped names.
        return {
            metadata_map[stripped]: value
            for stripped, value in (
                (key.removeprefix(prefix), value) for key, value in flattened.items()
            )
            if stripped in metadata_map
        }

    def process_whole(self, input_file: Path, output_file: Path, file_data: FileData) -> None:
        """
        Read the elements in ``input_file`` and write one Vectara document to ``output_file``.

        Every element becomes one entry of ``document_parts``; the document gets
        a fresh random id and uses ``file_data.identifier`` as its title.

        Args:
            input_file: JSON file holding a list of element dictionaries.
            output_file: destination for the JSON list containing the document.
            file_data: provides the identifier used as the document title.
        """
        with input_file.open() as in_f:
            elements_contents = json.load(in_f)

        logger.info(
            f"Extending {len(elements_contents)} json elements from content in {input_file}"
        )

        document_parts = []
        for element in elements_contents:
            # Pop the text before building metadata so evaluation order matches
            # the original dict literal ("text" first, then "metadata").
            text = element.pop("text", None)
            document_parts.append({"text": text, "metadata": self.conform_dict(data=element)})

        conformed_elements = [
            {
                "id": str(uuid.uuid4()),
                "type": "core",
                "metadata": {
                    "title": file_data.identifier,
                },
                "document_parts": document_parts,
            }
        ]

        with open(output_file, "w") as out_f:
            json.dump(conformed_elements, out_f, indent=2)
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
class VectaraUploaderConfig(UploaderConfig):
    # No Vectara-specific upload options; everything comes from the base config.
    ...
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
@dataclass
class VectaraUploader(Uploader):
    """
    Upload staged documents to a Vectara corpus through the REST API at ``BASE_URL``.

    Requests are authenticated with an OAuth2 client-credentials JWT. The token is
    cached on the instance and fetched again once it is within 60 seconds of expiry.
    """

    connector_type: str = CONNECTOR_TYPE
    upload_config: VectaraUploaderConfig
    connection_config: VectaraConnectionConfig
    # Cached bearer token and the epoch timestamp (seconds) at which it expires.
    _jwt_token: Optional[str] = field(init=False, default=None)
    _jwt_token_expires_ts: Optional[float] = field(init=False, default=None)

    def is_async(self) -> bool:
        """Return True: documents are uploaded through ``run_data_async``."""
        return True

    def precheck(self) -> None:
        """
        Validate credentials and corpus configuration before any upload.

        Raises:
            DestinationConnectionError: if the token cannot be obtained, the corpora
                cannot be listed, or the configured corpus cannot be resolved.
        """
        try:
            self._check_connection_and_corpora()
        except Exception as e:
            logger.error(f"Failed to validate connection {e}", exc_info=True)
            # Chain the cause so the original traceback is preserved.
            raise DestinationConnectionError(f"failed to validate connection: {e}") from e

    def _token_needs_refresh(self) -> bool:
        """Return True when no token is cached or it expires within 60 seconds."""
        if not self._jwt_token or self._jwt_token_expires_ts is None:
            return True
        return self._jwt_token_expires_ts - datetime.now().timestamp() <= 60

    @property
    async def jwt_token_async(self) -> str:
        """Awaitable bearer token; refreshed asynchronously when missing or near expiry."""
        if self._token_needs_refresh():
            self._jwt_token = await self._get_jwt_token_async()
        return self._jwt_token

    @property
    def jwt_token(self) -> str:
        """Bearer token; refreshed synchronously when missing or near expiry."""
        if self._token_needs_refresh():
            self._jwt_token = self._get_jwt_token()
        return self._jwt_token

    def _token_request_args(self) -> tuple[str, dict, dict]:
        """Build the (endpoint, headers, form data) for the client-credentials token request."""
        access_config = self.connection_config.access_config.get_secret_value()
        token_endpoint = self.connection_config.token_url.format(self.connection_config.customer_id)
        headers = {
            "Content-Type": "application/x-www-form-urlencoded",
        }
        data = {
            "grant_type": "client_credentials",
            "client_id": access_config.oauth_client_id,
            "client_secret": access_config.oauth_secret,
        }
        return token_endpoint, headers, data

    def _record_token_response(self, response_json: dict) -> str:
        """Store the expiry timestamp from a token response and return the access token."""
        request_time = datetime.now().timestamp()
        self._jwt_token_expires_ts = request_time + response_json.get("expires_in")
        return response_json.get("access_token")

    # Get Oauth2 JWT token
    @requires_dependencies(["httpx"], extras="vectara")
    async def _get_jwt_token_async(self) -> str:
        """Connect to the server and get a JWT token."""
        import httpx

        token_endpoint, headers, data = self._token_request_args()

        async with httpx.AsyncClient() as client:
            response = await client.post(token_endpoint, headers=headers, data=data)
            response.raise_for_status()
            response_json = response.json()

        return self._record_token_response(response_json)

    # Get Oauth2 JWT token
    @requires_dependencies(["httpx"], extras="vectara")
    def _get_jwt_token(self) -> str:
        """Connect to the server and get a JWT token."""
        import httpx

        token_endpoint, headers, data = self._token_request_args()

        with httpx.Client() as client:
            response = client.post(token_endpoint, headers=headers, data=data)
            response.raise_for_status()
            response_json = response.json()

        return self._record_token_response(response_json)

    @DestinationConnectionError.wrap
    def _check_connection_and_corpora(self) -> None:
        """
        Check the connection for Vectara and validate corpus exists.
        - If more than one corpus with the same name exists - raise error
        - If exactly one corpus exists with this name - use it.
        - If does not exist - raise error.

        When ``corpus_name`` is not configured only the connection is checked.
        """
        # Get token if not already set
        self.jwt_token

        # _request returns the decoded JSON body (a dict), so it must not be
        # tuple-unpacked: unpacking a dict iterates its keys.
        list_corpora_response = self._request(
            http_method="GET",
            endpoint="corpora",
        )

        corpus_name = self.connection_config.corpus_name
        if not corpus_name:
            return

        # Keyed by corpus key so repeated entries for the same key count once.
        possible_corpora_keys_names_map = {
            corpus.get("key"): corpus.get("name")
            for corpus in (list_corpora_response.get("corpora") or [])
            if corpus.get("name") == corpus_name
        }
        matching_keys = list(possible_corpora_keys_names_map)

        if len(matching_keys) > 1:
            raise ValueError(f"Multiple Corpus exist with name {corpus_name} in dest.")
        if not matching_keys:
            raise ValueError(f"No Corpora exist with name {corpus_name} in dest.")

        resolved_key = matching_keys[0]
        if not self.connection_config.corpus_key:
            self.connection_config.corpus_key = resolved_key
        elif self.connection_config.corpus_key != resolved_key:
            raise ValueError("Corpus key does not match provided corpus name.")

    @staticmethod
    def _request_headers(token: str) -> dict:
        """Return the JSON request headers carrying ``token`` as the bearer credential."""
        return {
            "Content-Type": "application/json",
            "Accept": "application/json",
            "Authorization": f"Bearer {token}",
            "X-source": "unstructured",
        }

    @requires_dependencies(["httpx"], extras="vectara")
    async def _async_request(
        self,
        endpoint: str,
        http_method: str = "POST",
        params: Optional[Mapping[str, Any]] = None,
        data: Optional[Mapping[str, Any]] = None,
    ) -> dict:
        """
        Send an authenticated request asynchronously and return the decoded JSON body.

        Args:
            endpoint: path appended to ``BASE_URL``.
            http_method: HTTP verb to use.
            params: optional query parameters.
            data: optional payload sent as the JSON body.

        Raises:
            httpx.HTTPStatusError: via ``raise_for_status`` on an error response.
        """
        import httpx

        url = f"{BASE_URL}/{endpoint}"
        headers = self._request_headers(await self.jwt_token_async)

        async with httpx.AsyncClient() as client:
            response = await client.request(
                method=http_method, url=url, headers=headers, params=params, json=data
            )
            response.raise_for_status()
            return response.json()

    @requires_dependencies(["httpx"], extras="vectara")
    def _request(
        self,
        endpoint: str,
        http_method: str = "POST",
        params: Optional[Mapping[str, Any]] = None,
        data: Optional[Mapping[str, Any]] = None,
    ) -> dict:
        """
        Send an authenticated request synchronously and return the decoded JSON body.

        Args:
            endpoint: path appended to ``BASE_URL``.
            http_method: HTTP verb to use.
            params: optional query parameters.
            data: optional payload sent as the JSON body.

        Raises:
            httpx.HTTPStatusError: via ``raise_for_status`` on an error response.
        """
        import httpx

        url = f"{BASE_URL}/{endpoint}"
        headers = self._request_headers(self.jwt_token)

        with httpx.Client() as client:
            response = client.request(
                method=http_method, url=url, headers=headers, params=params, json=data
            )
            response.raise_for_status()
            return response.json()

    async def _delete_doc(self, doc_id: str) -> dict:
        """
        Delete a document from the Vectara corpus.
        """

        return await self._async_request(
            endpoint=f"corpora/{self.connection_config.corpus_key}/documents/{doc_id}",
            http_method="DELETE",
        )

    async def _index_document(self, document: Dict[str, Any]) -> None:
        """
        Index a document (by uploading it to the Vectara corpus) from the document dictionary

        A failed request is logged and the document is skipped; the error is not
        re-raised. If the response messages report that the document already exists,
        the document is deleted and uploaded again.
        """
        documents_endpoint = f"corpora/{self.connection_config.corpus_key}/documents"

        logger.debug(
            f"Indexing document {document['id']} to corpus key {self.connection_config.corpus_key}"
        )

        try:
            result = await self._async_request(endpoint=documents_endpoint, data=document)
        except Exception as e:
            # NOTE(review): best-effort by design of the original code - the failure
            # is only logged, so callers cannot tell that this document was skipped.
            logger.error(f"exception {e} while indexing document {document['id']}")
            return

        messages = result["messages"] if "messages" in result else None
        already_exists = bool(messages) and (
            "ALREADY_EXISTS" in messages
            or "CONFLICT: Indexing doesn't support updating documents." in messages[0]
        )
        if already_exists:
            logger.info(f"document {document['id']} already exists, re-indexing")
            await self._delete_doc(document["id"])
            await self._async_request(endpoint=documents_endpoint, data=document)
            return

        logger.info(f"indexing document {document['id']} succeeded")

    async def run_data_async(
        self,
        data: list[dict],
        file_data: FileData,
        **kwargs: Any,
    ) -> None:
        """
        Index every document in ``data`` concurrently.

        Args:
            data: document dictionaries; each must contain an "id" key.
            file_data: not used by this uploader.
            **kwargs: ignored.
        """
        logger.info(f"inserting / updating {len(data)} documents to Vectara ")
        await asyncio.gather(*(self._index_document(vdoc) for vdoc in data))
|
|
340
|
+
|
|
341
|
+
|
|
342
|
+
# Registry entry bundling the Vectara connection config, stager and uploader
# classes together with their config classes.
vectara_destination_entry = DestinationRegistryEntry(
    connection_config=VectaraConnectionConfig,
    upload_stager_config=VectaraUploadStagerConfig,
    upload_stager=VectaraUploadStager,
    uploader_config=VectaraUploaderConfig,
    uploader=VectaraUploader,
)
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from unstructured_ingest.processes.connector_registry import (
|
|
4
|
+
add_destination_entry,
|
|
5
|
+
)
|
|
6
|
+
|
|
7
|
+
from .cloud import CONNECTOR_TYPE as CLOUD_WEAVIATE_CONNECTOR_TYPE
|
|
8
|
+
from .cloud import weaviate_cloud_destination_entry
|
|
9
|
+
from .embedded import CONNECTOR_TYPE as EMBEDDED_WEAVIATE_CONNECTOR_TYPE
|
|
10
|
+
from .embedded import weaviate_embedded_destination_entry
|
|
11
|
+
from .local import CONNECTOR_TYPE as LOCAL_WEAVIATE_CONNECTOR_TYPE
|
|
12
|
+
from .local import weaviate_local_destination_entry
|
|
13
|
+
|
|
14
|
+
# Register each Weaviate destination flavour under its own connector type.
# Registration order is kept: local, cloud, embedded.
add_destination_entry(
    destination_type=LOCAL_WEAVIATE_CONNECTOR_TYPE,
    entry=weaviate_local_destination_entry,
)
add_destination_entry(
    destination_type=CLOUD_WEAVIATE_CONNECTOR_TYPE,
    entry=weaviate_cloud_destination_entry,
)
add_destination_entry(
    destination_type=EMBEDDED_WEAVIATE_CONNECTOR_TYPE,
    entry=weaviate_embedded_destination_entry,
)
|
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
from contextlib import contextmanager
|
|
2
|
+
from dataclasses import dataclass, field
|
|
3
|
+
from typing import TYPE_CHECKING, Any, Generator, Optional
|
|
4
|
+
|
|
5
|
+
from pydantic import Field, Secret
|
|
6
|
+
|
|
7
|
+
from unstructured_ingest.error import ValueError
|
|
8
|
+
from unstructured_ingest.processes.connector_registry import DestinationRegistryEntry
|
|
9
|
+
from unstructured_ingest.processes.connectors.weaviate.weaviate import (
|
|
10
|
+
WeaviateAccessConfig,
|
|
11
|
+
WeaviateConnectionConfig,
|
|
12
|
+
WeaviateUploader,
|
|
13
|
+
WeaviateUploaderConfig,
|
|
14
|
+
WeaviateUploadStager,
|
|
15
|
+
WeaviateUploadStagerConfig,
|
|
16
|
+
)
|
|
17
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
18
|
+
|
|
19
|
+
if TYPE_CHECKING:
|
|
20
|
+
from weaviate.auth import AuthCredentials
|
|
21
|
+
from weaviate.client import WeaviateClient
|
|
22
|
+
|
|
23
|
+
# Connector type string for the Weaviate Cloud destination; also used as the
# uploader's ``connector_type`` default below.
CONNECTOR_TYPE = "weaviate-cloud"
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# Credentials for a Weaviate Cloud cluster. Every field is optional here;
# CloudWeaviateConnectionConfig decides which combination is acceptable.
class CloudWeaviateAccessConfig(WeaviateAccessConfig):
    # Token used to build bearer-token credentials.
    access_token: Optional[str] = Field(
        description="Used to create the bearer token.",
        default=None,
    )
    # Used for API-key authentication.
    api_key: Optional[str] = None
    # Used for client-credentials authentication.
    client_secret: Optional[str] = None
    # Used together with the connection config's ``username``.
    password: Optional[str] = None
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# Connection settings for a Weaviate Cloud (WCD) cluster.
#
# Unless ``anonymous`` is set, exactly one authentication approach must be
# configured: API key, bearer token, client secret, or username + password.
class CloudWeaviateConnectionConfig(WeaviateConnectionConfig):
    cluster_url: str = Field(
        description="The WCD cluster URL or hostname to connect to. "
        "Usually in the form: rAnD0mD1g1t5.something.weaviate.cloud"
    )
    username: Optional[str] = None
    anonymous: bool = Field(default=False, description="if set, all auth values will be ignored")
    refresh_token: Optional[str] = Field(
        default=None,
        description="Will tie this value to the bearer token. If not provided, "
        "the authentication will expire once the lifetime of the access token is up.",
    )
    access_config: Secret[CloudWeaviateAccessConfig]

    def model_post_init(self, __context: Any) -> None:
        """Check that exactly one authentication approach is configured.

        Nothing is checked when ``anonymous`` is set. A credential counts as
        provided only when it is non-empty, the same rule the ``get_*_auth``
        helpers use, so validation and credential building agree on what
        "provided" means (an empty string is treated as missing).

        Raises:
            ValueError: if no approach, or more than one, is configured.
        """
        if self.anonymous:
            return
        access_config = self.access_config.get_secret_value()
        auths = {
            "api_key": bool(access_config.api_key),
            "bearer_token": bool(access_config.access_token),
            "client_secret": bool(access_config.client_secret),
            # Password auth needs both halves: the password and the username.
            "client_password": bool(access_config.password) and bool(self.username),
        }
        existing_auths = [auth_method for auth_method, flag in auths.items() if flag]

        if not existing_auths:
            raise ValueError("No auth values provided and anonymous is False")
        if len(existing_auths) > 1:
            raise ValueError(
                "Multiple auth values provided, only one approach can be used: {}".format(
                    ", ".join(existing_auths)
                )
            )

    @requires_dependencies(["weaviate"], extras="weaviate")
    def get_api_key_auth(self) -> Optional["AuthCredentials"]:
        """Return API-key credentials, or None when no API key is set."""
        from weaviate.classes.init import Auth

        if api_key := self.access_config.get_secret_value().api_key:
            return Auth.api_key(api_key=api_key)
        return None

    @requires_dependencies(["weaviate"], extras="weaviate")
    def get_bearer_token_auth(self) -> Optional["AuthCredentials"]:
        """Return bearer-token credentials, or None when no access token is set.

        ``refresh_token`` is passed along as-is and may be None.
        """
        from weaviate.classes.init import Auth

        if access_token := self.access_config.get_secret_value().access_token:
            return Auth.bearer_token(access_token=access_token, refresh_token=self.refresh_token)
        return None

    @requires_dependencies(["weaviate"], extras="weaviate")
    def get_client_secret_auth(self) -> Optional["AuthCredentials"]:
        """Return client-credentials auth, or None when no client secret is set."""
        from weaviate.classes.init import Auth

        if client_secret := self.access_config.get_secret_value().client_secret:
            return Auth.client_credentials(client_secret=client_secret)
        return None

    @requires_dependencies(["weaviate"], extras="weaviate")
    def get_client_password_auth(self) -> Optional["AuthCredentials"]:
        """Return username/password auth, or None unless both values are set."""
        from weaviate.classes.init import Auth

        if (username := self.username) and (
            password := self.access_config.get_secret_value().password
        ):
            return Auth.client_password(username=username, password=password)
        return None

    @requires_dependencies(["weaviate"], extras="weaviate")
    def get_auth(self) -> "AuthCredentials":
        """Return the single configured credentials object.

        Raises:
            ValueError: if zero, or more than one, approach yields credentials.
        """
        candidates = (
            self.get_api_key_auth(),
            self.get_client_secret_auth(),
            self.get_bearer_token_auth(),
            self.get_client_password_auth(),
        )
        auths = [auth for auth in candidates if auth]
        if not auths:
            raise ValueError("No auth values provided and anonymous is False")
        if len(auths) > 1:
            raise ValueError("Multiple auth values provided, only one approach can be used")
        return auths[0]

    @contextmanager
    @requires_dependencies(["weaviate"], extras="weaviate")
    def get_client(self) -> Generator["WeaviateClient", None, None]:
        """Yield a client connected to the configured cloud cluster.

        The connection is used as a context manager, so it is released when
        the caller's ``with`` block exits. No credentials are sent when
        ``anonymous`` is set.
        """
        from weaviate import connect_to_weaviate_cloud
        from weaviate.classes.init import AdditionalConfig

        auth_credentials = None if self.anonymous else self.get_auth()
        with connect_to_weaviate_cloud(
            cluster_url=self.cluster_url,
            auth_credentials=auth_credentials,
            additional_config=AdditionalConfig(timeout=self.get_timeout()),
        ) as weaviate_client:
            yield weaviate_client
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
# Cloud-flavoured name for the shared upload stager config; adds no fields.
class CloudWeaviateUploadStagerConfig(WeaviateUploadStagerConfig):
    pass
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
@dataclass
class CloudWeaviateUploadStager(WeaviateUploadStager):
    """Upload stager for the Weaviate Cloud destination.

    All behaviour comes from ``WeaviateUploadStager``; this subclass only
    binds the cloud-specific config type.
    """

    # Build the default from the cloud-specific config class so the default
    # instance matches the annotated type.
    upload_stager_config: CloudWeaviateUploadStagerConfig = field(
        default_factory=CloudWeaviateUploadStagerConfig
    )
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
# Cloud-flavoured name for the shared uploader config; adds no fields.
class CloudWeaviateUploaderConfig(WeaviateUploaderConfig):
    pass
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
@dataclass
class CloudWeaviateUploader(WeaviateUploader):
    """Uploader for the Weaviate Cloud destination.

    Upload behaviour comes from ``WeaviateUploader``; this subclass binds the
    cloud connection config, uploader config and connector type.
    """

    # NOTE(review): CloudWeaviateConnectionConfig declares ``cluster_url`` and
    # ``access_config`` without defaults, so this default factory presumably
    # fails validation if it is ever invoked -- confirm callers always pass
    # ``connection_config`` explicitly.
    connection_config: CloudWeaviateConnectionConfig = field(
        default_factory=CloudWeaviateConnectionConfig
    )
    upload_config: CloudWeaviateUploaderConfig = field(
        default_factory=CloudWeaviateUploaderConfig
    )
    connector_type: str = CONNECTOR_TYPE
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
# Registry entry bundling the Weaviate Cloud connection config, upload stager
# and uploader classes together with their config models.
weaviate_cloud_destination_entry = DestinationRegistryEntry(
    connection_config=CloudWeaviateConnectionConfig,
    upload_stager=CloudWeaviateUploadStager,
    upload_stager_config=CloudWeaviateUploadStagerConfig,
    uploader=CloudWeaviateUploader,
    uploader_config=CloudWeaviateUploaderConfig,
)
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
from contextlib import contextmanager
|
|
2
|
+
from dataclasses import dataclass, field
|
|
3
|
+
from typing import TYPE_CHECKING, Generator, Optional
|
|
4
|
+
|
|
5
|
+
from pydantic import Field, Secret
|
|
6
|
+
|
|
7
|
+
from unstructured_ingest.processes.connector_registry import DestinationRegistryEntry
|
|
8
|
+
from unstructured_ingest.processes.connectors.weaviate.weaviate import (
|
|
9
|
+
WeaviateAccessConfig,
|
|
10
|
+
WeaviateConnectionConfig,
|
|
11
|
+
WeaviateUploader,
|
|
12
|
+
WeaviateUploaderConfig,
|
|
13
|
+
WeaviateUploadStager,
|
|
14
|
+
WeaviateUploadStagerConfig,
|
|
15
|
+
)
|
|
16
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
17
|
+
|
|
18
|
+
if TYPE_CHECKING:
|
|
19
|
+
from weaviate.client import WeaviateClient
|
|
20
|
+
|
|
21
|
+
# Connector type string for the embedded Weaviate destination; also used as
# the uploader's ``connector_type`` default below.
CONNECTOR_TYPE = "weaviate-embedded"
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# Embedded-flavoured name for the shared access config; adds no fields.
class EmbeddedWeaviateAccessConfig(WeaviateAccessConfig):
    pass
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
# Connection settings for an embedded Weaviate instance, reached through
# ``weaviate.connect_to_embedded``.
class EmbeddedWeaviateConnectionConfig(WeaviateConnectionConfig):
    hostname: str = Field(description="hostname", default="127.0.0.1")
    port: int = Field(description="http port", default=8079)
    grpc_port: int = Field(description="grpc port", default=50050)
    data_path: Optional[str] = Field(
        description="directory where the files making up the "
        "database are stored. If not provided, will "
        "default to underlying SDK implementation",
        default=None,
    )
    # NOTE(review): typed with the base WeaviateAccessConfig rather than
    # EmbeddedWeaviateAccessConfig -- confirm whether that is intentional.
    access_config: Secret[WeaviateAccessConfig] = Field(
        default=WeaviateAccessConfig(), validate_default=True
    )

    @contextmanager
    @requires_dependencies(["weaviate"], extras="weaviate")
    def get_client(self) -> Generator["WeaviateClient", None, None]:
        """Yield a client connected to the embedded Weaviate instance.

        The connection is used as a context manager, so it is released when
        the caller's ``with`` block exits.
        """
        from weaviate import connect_to_embedded
        from weaviate.classes.init import AdditionalConfig

        connection = connect_to_embedded(
            hostname=self.hostname,
            port=self.port,
            grpc_port=self.grpc_port,
            persistence_data_path=self.data_path,
            additional_config=AdditionalConfig(timeout=self.get_timeout()),
        )
        with connection as weaviate_client:
            yield weaviate_client
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
# Embedded-flavoured name for the shared upload stager config; adds no fields.
class EmbeddedWeaviateUploadStagerConfig(WeaviateUploadStagerConfig):
    pass
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
@dataclass
class EmbeddedWeaviateUploadStager(WeaviateUploadStager):
    """Upload stager for the embedded Weaviate destination.

    All behaviour comes from ``WeaviateUploadStager``; this subclass only
    binds the embedded-specific config type.
    """

    # Build the default from the embedded-specific config class so the default
    # instance matches the annotated type.
    upload_stager_config: EmbeddedWeaviateUploadStagerConfig = field(
        default_factory=EmbeddedWeaviateUploadStagerConfig
    )
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
# Embedded-flavoured name for the shared uploader config; adds no fields.
class EmbeddedWeaviateUploaderConfig(WeaviateUploaderConfig):
    pass
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
@dataclass
class EmbeddedWeaviateUploader(WeaviateUploader):
    """Uploader for the embedded Weaviate destination.

    Upload behaviour comes from ``WeaviateUploader``; this subclass binds the
    embedded connection config, uploader config and connector type.
    """

    connection_config: EmbeddedWeaviateConnectionConfig = field(
        default_factory=EmbeddedWeaviateConnectionConfig
    )
    upload_config: EmbeddedWeaviateUploaderConfig = field(
        default_factory=EmbeddedWeaviateUploaderConfig
    )
    connector_type: str = CONNECTOR_TYPE
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
# Registry entry bundling the embedded Weaviate connection config, upload
# stager and uploader classes together with their config models.
weaviate_embedded_destination_entry = DestinationRegistryEntry(
    connection_config=EmbeddedWeaviateConnectionConfig,
    upload_stager=EmbeddedWeaviateUploadStager,
    upload_stager_config=EmbeddedWeaviateUploadStagerConfig,
    uploader=EmbeddedWeaviateUploader,
    uploader_config=EmbeddedWeaviateUploaderConfig,
)
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
from contextlib import contextmanager
|
|
2
|
+
from dataclasses import dataclass, field
|
|
3
|
+
from typing import TYPE_CHECKING, Generator
|
|
4
|
+
|
|
5
|
+
from pydantic import Field, Secret
|
|
6
|
+
|
|
7
|
+
from unstructured_ingest.processes.connector_registry import DestinationRegistryEntry
|
|
8
|
+
from unstructured_ingest.processes.connectors.weaviate.weaviate import (
|
|
9
|
+
WeaviateAccessConfig,
|
|
10
|
+
WeaviateConnectionConfig,
|
|
11
|
+
WeaviateUploader,
|
|
12
|
+
WeaviateUploaderConfig,
|
|
13
|
+
WeaviateUploadStager,
|
|
14
|
+
WeaviateUploadStagerConfig,
|
|
15
|
+
)
|
|
16
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
17
|
+
|
|
18
|
+
if TYPE_CHECKING:
|
|
19
|
+
from weaviate.client import WeaviateClient
|
|
20
|
+
|
|
21
|
+
# Connector type string for the local Weaviate destination; also used as the
# uploader's ``connector_type`` default below.
CONNECTOR_TYPE = "weaviate-local"
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# Local-flavoured name for the shared access config; adds no fields.
class LocalWeaviateAccessConfig(WeaviateAccessConfig):
    pass
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
# Connection settings for a locally running Weaviate instance, reached through
# ``weaviate.connect_to_local`` with only the timeout configured here.
class LocalWeaviateConnectionConfig(WeaviateConnectionConfig):
    # NOTE(review): typed with the base WeaviateAccessConfig rather than
    # LocalWeaviateAccessConfig -- confirm whether that is intentional.
    access_config: Secret[WeaviateAccessConfig] = Field(
        default=WeaviateAccessConfig(), validate_default=True
    )

    @contextmanager
    @requires_dependencies(["weaviate"], extras="weaviate")
    def get_client(self) -> Generator["WeaviateClient", None, None]:
        """Yield a client connected to the local Weaviate instance.

        The connection is used as a context manager, so it is released when
        the caller's ``with`` block exits.
        """
        from weaviate import connect_to_local
        from weaviate.classes.init import AdditionalConfig

        sdk_options = AdditionalConfig(timeout=self.get_timeout())
        with connect_to_local(additional_config=sdk_options) as weaviate_client:
            yield weaviate_client
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
# Local-flavoured name for the shared upload stager config; adds no fields.
class LocalWeaviateUploadStagerConfig(WeaviateUploadStagerConfig):
    pass
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@dataclass
class LocalWeaviateUploadStager(WeaviateUploadStager):
    """Upload stager for the local Weaviate destination.

    All behaviour comes from ``WeaviateUploadStager``; this subclass only
    binds the local-specific config type.
    """

    # Build the default from the local-specific config class so the default
    # instance matches the annotated type.
    upload_stager_config: LocalWeaviateUploadStagerConfig = field(
        default_factory=LocalWeaviateUploadStagerConfig
    )
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
# Local-flavoured name for the shared uploader config; adds no fields.
class LocalWeaviateUploaderConfig(WeaviateUploaderConfig):
    pass
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
@dataclass
class LocalWeaviateUploader(WeaviateUploader):
    """Uploader for the local Weaviate destination.

    Upload behaviour comes from ``WeaviateUploader``; this subclass narrows
    the config field types and sets the connector type.
    """

    # Field order is kept exactly as declared; no defaults are supplied for
    # the two config fields.
    upload_config: LocalWeaviateUploaderConfig
    connector_type: str = CONNECTOR_TYPE
    connection_config: LocalWeaviateConnectionConfig
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
# Registry entry bundling the local Weaviate connection config, upload stager
# and uploader classes together with their config models.
weaviate_local_destination_entry = DestinationRegistryEntry(
    connection_config=LocalWeaviateConnectionConfig,
    upload_stager=LocalWeaviateUploadStager,
    upload_stager_config=LocalWeaviateUploadStagerConfig,
    uploader=LocalWeaviateUploader,
    uploader_config=LocalWeaviateUploaderConfig,
)
|