unstructured-ingest 1.2.32__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/README.md +28 -0
- unstructured_ingest/cli/__init__.py +0 -0
- unstructured_ingest/cli/base/__init__.py +4 -0
- unstructured_ingest/cli/base/cmd.py +269 -0
- unstructured_ingest/cli/base/dest.py +84 -0
- unstructured_ingest/cli/base/importer.py +34 -0
- unstructured_ingest/cli/base/src.py +75 -0
- unstructured_ingest/cli/cli.py +24 -0
- unstructured_ingest/cli/cmds.py +14 -0
- unstructured_ingest/cli/utils/__init__.py +0 -0
- unstructured_ingest/cli/utils/click.py +237 -0
- unstructured_ingest/cli/utils/model_conversion.py +222 -0
- unstructured_ingest/data_types/__init__.py +0 -0
- unstructured_ingest/data_types/entities.py +17 -0
- unstructured_ingest/data_types/file_data.py +116 -0
- unstructured_ingest/embed/__init__.py +0 -0
- unstructured_ingest/embed/azure_openai.py +63 -0
- unstructured_ingest/embed/bedrock.py +323 -0
- unstructured_ingest/embed/huggingface.py +69 -0
- unstructured_ingest/embed/interfaces.py +146 -0
- unstructured_ingest/embed/mixedbreadai.py +134 -0
- unstructured_ingest/embed/octoai.py +133 -0
- unstructured_ingest/embed/openai.py +142 -0
- unstructured_ingest/embed/togetherai.py +116 -0
- unstructured_ingest/embed/vertexai.py +109 -0
- unstructured_ingest/embed/voyageai.py +130 -0
- unstructured_ingest/error.py +156 -0
- unstructured_ingest/errors_v2.py +156 -0
- unstructured_ingest/interfaces/__init__.py +27 -0
- unstructured_ingest/interfaces/connector.py +56 -0
- unstructured_ingest/interfaces/downloader.py +90 -0
- unstructured_ingest/interfaces/indexer.py +29 -0
- unstructured_ingest/interfaces/process.py +22 -0
- unstructured_ingest/interfaces/processor.py +88 -0
- unstructured_ingest/interfaces/upload_stager.py +89 -0
- unstructured_ingest/interfaces/uploader.py +67 -0
- unstructured_ingest/logger.py +39 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/otel.py +128 -0
- unstructured_ingest/pipeline/__init__.py +0 -0
- unstructured_ingest/pipeline/interfaces.py +211 -0
- unstructured_ingest/pipeline/otel.py +32 -0
- unstructured_ingest/pipeline/pipeline.py +408 -0
- unstructured_ingest/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/pipeline/steps/chunk.py +78 -0
- unstructured_ingest/pipeline/steps/download.py +206 -0
- unstructured_ingest/pipeline/steps/embed.py +77 -0
- unstructured_ingest/pipeline/steps/filter.py +35 -0
- unstructured_ingest/pipeline/steps/index.py +86 -0
- unstructured_ingest/pipeline/steps/partition.py +77 -0
- unstructured_ingest/pipeline/steps/stage.py +65 -0
- unstructured_ingest/pipeline/steps/uncompress.py +50 -0
- unstructured_ingest/pipeline/steps/upload.py +58 -0
- unstructured_ingest/processes/__init__.py +18 -0
- unstructured_ingest/processes/chunker.py +131 -0
- unstructured_ingest/processes/connector_registry.py +69 -0
- unstructured_ingest/processes/connectors/__init__.py +129 -0
- unstructured_ingest/processes/connectors/airtable.py +238 -0
- unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
- unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
- unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
- unstructured_ingest/processes/connectors/astradb.py +592 -0
- unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
- unstructured_ingest/processes/connectors/chroma.py +193 -0
- unstructured_ingest/processes/connectors/confluence.py +527 -0
- unstructured_ingest/processes/connectors/couchbase.py +336 -0
- unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
- unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
- unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
- unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
- unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
- unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
- unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
- unstructured_ingest/processes/connectors/delta_table.py +310 -0
- unstructured_ingest/processes/connectors/discord.py +161 -0
- unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
- unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
- unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
- unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
- unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
- unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
- unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
- unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
- unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
- unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
- unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
- unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
- unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
- unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
- unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/processes/connectors/github.py +226 -0
- unstructured_ingest/processes/connectors/gitlab.py +270 -0
- unstructured_ingest/processes/connectors/google_drive.py +848 -0
- unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
- unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
- unstructured_ingest/processes/connectors/jira.py +522 -0
- unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
- unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
- unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
- unstructured_ingest/processes/connectors/kafka/local.py +103 -0
- unstructured_ingest/processes/connectors/kdbai.py +156 -0
- unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
- unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
- unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
- unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
- unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
- unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
- unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
- unstructured_ingest/processes/connectors/local.py +227 -0
- unstructured_ingest/processes/connectors/milvus.py +311 -0
- unstructured_ingest/processes/connectors/mongodb.py +389 -0
- unstructured_ingest/processes/connectors/neo4j.py +534 -0
- unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
- unstructured_ingest/processes/connectors/notion/client.py +349 -0
- unstructured_ingest/processes/connectors/notion/connector.py +350 -0
- unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
- unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
- unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
- unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
- unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
- unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
- unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
- unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
- unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
- unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
- unstructured_ingest/processes/connectors/onedrive.py +485 -0
- unstructured_ingest/processes/connectors/outlook.py +242 -0
- unstructured_ingest/processes/connectors/pinecone.py +400 -0
- unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
- unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
- unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
- unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
- unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
- unstructured_ingest/processes/connectors/redisdb.py +214 -0
- unstructured_ingest/processes/connectors/salesforce.py +307 -0
- unstructured_ingest/processes/connectors/sharepoint.py +282 -0
- unstructured_ingest/processes/connectors/slack.py +249 -0
- unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
- unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
- unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
- unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
- unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
- unstructured_ingest/processes/connectors/sql/sql.py +456 -0
- unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
- unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
- unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
- unstructured_ingest/processes/connectors/utils.py +60 -0
- unstructured_ingest/processes/connectors/vectara.py +348 -0
- unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
- unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
- unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
- unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
- unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
- unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
- unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
- unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
- unstructured_ingest/processes/embedder.py +203 -0
- unstructured_ingest/processes/filter.py +60 -0
- unstructured_ingest/processes/partitioner.py +233 -0
- unstructured_ingest/processes/uncompress.py +61 -0
- unstructured_ingest/processes/utils/__init__.py +8 -0
- unstructured_ingest/processes/utils/blob_storage.py +32 -0
- unstructured_ingest/processes/utils/logging/connector.py +365 -0
- unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
- unstructured_ingest/unstructured_api.py +140 -0
- unstructured_ingest/utils/__init__.py +5 -0
- unstructured_ingest/utils/chunking.py +56 -0
- unstructured_ingest/utils/compression.py +72 -0
- unstructured_ingest/utils/constants.py +2 -0
- unstructured_ingest/utils/data_prep.py +216 -0
- unstructured_ingest/utils/dep_check.py +78 -0
- unstructured_ingest/utils/filesystem.py +27 -0
- unstructured_ingest/utils/html.py +174 -0
- unstructured_ingest/utils/ndjson.py +52 -0
- unstructured_ingest/utils/pydantic_models.py +52 -0
- unstructured_ingest/utils/string_and_date_utils.py +74 -0
- unstructured_ingest/utils/table.py +80 -0
- unstructured_ingest/utils/tls.py +15 -0
- unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
- unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
- unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
- unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
- unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
|
|
3
|
+
from pydantic import Field, Secret
|
|
4
|
+
|
|
5
|
+
from unstructured_ingest.processes.connector_registry import DestinationRegistryEntry
|
|
6
|
+
from unstructured_ingest.processes.connectors.qdrant.qdrant import (
|
|
7
|
+
QdrantAccessConfig,
|
|
8
|
+
QdrantConnectionConfig,
|
|
9
|
+
QdrantUploader,
|
|
10
|
+
QdrantUploaderConfig,
|
|
11
|
+
QdrantUploadStager,
|
|
12
|
+
QdrantUploadStagerConfig,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
# Connector type identifier; used below as the default `connector_type` of CloudQdrantUploader.
CONNECTOR_TYPE = "qdrant-cloud"
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# Secret access settings for the Qdrant Cloud destination.
class CloudQdrantAccessConfig(QdrantAccessConfig):
    # Required field: no default is supplied, so an API key must be provided.
    api_key: str = Field(description="Qdrant API key")
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
# Connection settings for Qdrant Cloud: an endpoint URL plus a secret-wrapped API key.
class CloudQdrantConnectionConfig(QdrantConnectionConfig):
    # NOTE(review): annotated as `str` but defaults to None — confirm whether the
    # URL is meant to be optional or required.
    url: str = Field(description="url of Qdrant Cloud", default=None)
    # No default here, so the access config must be supplied by the caller.
    access_config: Secret[CloudQdrantAccessConfig]

    def get_client_kwargs(self) -> dict:
        """Return the client keyword arguments: `api_key` and `url`."""
        secrets = self.access_config.get_secret_value()
        return dict(api_key=secrets.api_key, url=self.url)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
# Stager config for Qdrant Cloud; adds nothing to QdrantUploadStagerConfig.
class CloudQdrantUploadStagerConfig(QdrantUploadStagerConfig):
    pass
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@dataclass
class CloudQdrantUploadStager(QdrantUploadStager):
    # Qdrant Cloud stager: only narrows the config type; staging logic is inherited.
    upload_stager_config: CloudQdrantUploadStagerConfig
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
# Uploader config for Qdrant Cloud; adds nothing to QdrantUploaderConfig.
class CloudQdrantUploaderConfig(QdrantUploaderConfig):
    pass
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@dataclass
class CloudQdrantUploader(QdrantUploader):
    # Qdrant Cloud uploader: narrows the config types and sets the connector
    # type; upload behaviour is inherited from QdrantUploader.
    connection_config: CloudQdrantConnectionConfig
    upload_config: CloudQdrantUploaderConfig
    connector_type: str = CONNECTOR_TYPE
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
# Registry entry bundling the Qdrant Cloud destination's config and process classes.
qdrant_cloud_destination_entry = DestinationRegistryEntry(
    connection_config=CloudQdrantConnectionConfig,
    upload_stager_config=CloudQdrantUploadStagerConfig,
    upload_stager=CloudQdrantUploadStager,
    uploader_config=CloudQdrantUploaderConfig,
    uploader=CloudQdrantUploader,
)
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
|
|
3
|
+
from pydantic import Field, Secret
|
|
4
|
+
|
|
5
|
+
from unstructured_ingest.processes.connector_registry import DestinationRegistryEntry
|
|
6
|
+
from unstructured_ingest.processes.connectors.qdrant.qdrant import (
|
|
7
|
+
QdrantAccessConfig,
|
|
8
|
+
QdrantConnectionConfig,
|
|
9
|
+
QdrantUploader,
|
|
10
|
+
QdrantUploaderConfig,
|
|
11
|
+
QdrantUploadStager,
|
|
12
|
+
QdrantUploadStagerConfig,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
# Connector type identifier; used below as the default `connector_type` of LocalQdrantUploader.
CONNECTOR_TYPE = "qdrant-local"
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# Access config for local Qdrant; declares no fields of its own.
class LocalQdrantAccessConfig(QdrantAccessConfig):
    pass
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
# Connection settings for QdrantLocal, identified by a persistence path.
class LocalQdrantConnectionConfig(QdrantConnectionConfig):
    # NOTE(review): annotated as `str` but defaults to None — confirm whether the
    # path is meant to be optional or required.
    path: str = Field(description="Persistence path for QdrantLocal.", default=None)
    # Defaults to an empty LocalQdrantAccessConfig, so callers may omit it.
    access_config: Secret[LocalQdrantAccessConfig] = Field(
        validate_default=True,
        default_factory=LocalQdrantAccessConfig,
    )

    def get_client_kwargs(self) -> dict:
        """Return the client keyword arguments: only `path`."""
        return dict(path=self.path)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# Stager config for local Qdrant; adds nothing to QdrantUploadStagerConfig.
class LocalQdrantUploadStagerConfig(QdrantUploadStagerConfig):
    pass
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@dataclass
class LocalQdrantUploadStager(QdrantUploadStager):
    # Local Qdrant stager: only narrows the config type; staging logic is inherited.
    upload_stager_config: LocalQdrantUploadStagerConfig
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
# Uploader config for local Qdrant; adds nothing to QdrantUploaderConfig.
class LocalQdrantUploaderConfig(QdrantUploaderConfig):
    pass
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@dataclass
class LocalQdrantUploader(QdrantUploader):
    # Local Qdrant uploader: narrows the config types and sets the connector
    # type; upload behaviour is inherited from QdrantUploader.
    connection_config: LocalQdrantConnectionConfig
    upload_config: LocalQdrantUploaderConfig
    connector_type: str = CONNECTOR_TYPE
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
# Registry entry bundling the local Qdrant destination's config and process classes.
qdrant_local_destination_entry = DestinationRegistryEntry(
    connection_config=LocalQdrantConnectionConfig,
    upload_stager_config=LocalQdrantUploadStagerConfig,
    upload_stager=LocalQdrantUploadStager,
    uploader_config=LocalQdrantUploaderConfig,
    uploader=LocalQdrantUploader,
)
|
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import json
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
from contextlib import asynccontextmanager, contextmanager
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from typing import TYPE_CHECKING, Any, AsyncGenerator, Generator, Optional
|
|
7
|
+
|
|
8
|
+
from pydantic import Field, Secret
|
|
9
|
+
|
|
10
|
+
from unstructured_ingest.data_types.file_data import FileData
|
|
11
|
+
from unstructured_ingest.error import DestinationConnectionError, WriteError
|
|
12
|
+
from unstructured_ingest.interfaces import (
|
|
13
|
+
AccessConfig,
|
|
14
|
+
ConnectionConfig,
|
|
15
|
+
Uploader,
|
|
16
|
+
UploaderConfig,
|
|
17
|
+
UploadStager,
|
|
18
|
+
UploadStagerConfig,
|
|
19
|
+
)
|
|
20
|
+
from unstructured_ingest.logger import logger
|
|
21
|
+
from unstructured_ingest.utils.data_prep import (
|
|
22
|
+
batch_generator,
|
|
23
|
+
flatten_dict,
|
|
24
|
+
get_enhanced_element_id,
|
|
25
|
+
)
|
|
26
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
27
|
+
|
|
28
|
+
if TYPE_CHECKING:
|
|
29
|
+
from qdrant_client import AsyncQdrantClient, QdrantClient
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# Base access config shared by the Qdrant connector variants; declares no fields itself.
class QdrantAccessConfig(AccessConfig, ABC):
    pass
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# Common connection settings for the Qdrant connectors. Subclasses supply the
# client keyword arguments; this class turns them into sync and async clients.
class QdrantConnectionConfig(ConnectionConfig, ABC):
    access_config: Secret[QdrantAccessConfig] = Field(
        description="Access Config",
        default_factory=QdrantAccessConfig,
        validate_default=True,
    )

    @abstractmethod
    def get_client_kwargs(self) -> dict:
        """Return the keyword arguments passed to the Qdrant client constructors."""

    @requires_dependencies(["qdrant_client"], extras="qdrant")
    @asynccontextmanager
    async def get_async_client(self) -> AsyncGenerator["AsyncQdrantClient", None]:
        """Yield an `AsyncQdrantClient` and close it when the context exits."""
        # Imported lazily so the module loads without the optional dependency.
        from qdrant_client import AsyncQdrantClient

        async_client = AsyncQdrantClient(**self.get_client_kwargs())
        try:
            yield async_client
        finally:
            # Runs on both normal exit and error, so the client is always closed.
            await async_client.close()

    @requires_dependencies(["qdrant_client"], extras="qdrant")
    @contextmanager
    def get_client(self) -> Generator["QdrantClient", None, None]:
        """Yield a `QdrantClient` and close it when the context exits."""
        # Imported lazily so the module loads without the optional dependency.
        from qdrant_client import QdrantClient

        sync_client = QdrantClient(**self.get_client_kwargs())
        try:
            yield sync_client
        finally:
            # Runs on both normal exit and error, so the client is always closed.
            sync_client.close()
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
# Base stager config for the Qdrant connectors; adds nothing to UploadStagerConfig.
class QdrantUploadStagerConfig(UploadStagerConfig):
    pass
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
@dataclass
class QdrantUploadStager(UploadStager, ABC):
    # Stager that reshapes element dictionaries into Qdrant point dictionaries
    # (`id` / `vector` / `payload`), the keys later unpacked into
    # `models.PointStruct` by QdrantUploader.

    upload_stager_config: QdrantUploadStagerConfig = field(
        default_factory=lambda: QdrantUploadStagerConfig()
    )

    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
        """Prepare an element dictionary in the point format that Qdrant requires.

        Args:
            element_dict: The element to convert. A shallow copy is taken, so the
                caller's dictionary keeps its top-level keys.
            file_data: Passed to `get_enhanced_element_id` when computing the point id.

        Returns:
            A dict with `id`, `vector` (the element's `embeddings`, or `{}` when
            absent) and `payload`. The payload carries `text`, a JSON dump of the
            remaining element fields under `element_serialized`, and the result of
            `flatten_dict` on those same fields (called with `-` as the separator).
        """
        data = element_dict.copy()
        # Order matters: the id is computed from the full element first, then
        # `embeddings` and `text` are popped so that neither is repeated in the
        # serialized or flattened payload.
        point_id = get_enhanced_element_id(element_dict=data, file_data=file_data)
        vector = data.pop("embeddings", {})
        text = data.pop("text", None)
        return {
            "id": point_id,
            "vector": vector,
            "payload": {
                "text": text,
                "element_serialized": json.dumps(data),
                **flatten_dict(data, separator="-", flatten_lists=True),
            },
        }
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
# Upload settings shared by the Qdrant uploaders.
class QdrantUploaderConfig(UploaderConfig):
    # Required: target collection; QdrantUploader.precheck verifies it exists.
    collection_name: str = Field(description="Name of the collection.")
    # Size passed to batch_generator when the data is split for upserting.
    batch_size: int = Field(description="Number of records per batch.", default=50)
    # Marked deprecated; QdrantUploader in this module does not read it.
    num_processes: Optional[int] = Field(
        deprecated=True,
        default=1,
        description="Optional limit on number of threads to use for upload.",
    )
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
@dataclass
class QdrantUploader(Uploader, ABC):
    # Base uploader that upserts staged elements into a Qdrant collection in batches.

    upload_config: QdrantUploaderConfig
    connection_config: QdrantConnectionConfig

    @DestinationConnectionError.wrap
    def precheck(self) -> None:
        """Check that the configured collection exists on the Qdrant instance.

        Raises:
            DestinationConnectionError: If the configured collection name is not
                among the collections returned by the client.
        """
        with self.connection_config.get_client() as client:
            existing = [entry.name for entry in client.get_collections().collections]
            target = self.upload_config.collection_name
            if target in existing:
                return
            raise DestinationConnectionError(
                "collection '{}' not found: {}".format(target, ", ".join(existing))
            )

    def is_async(self):
        """Return True; uploads are implemented in `run_data_async`."""
        return True

    async def run_data_async(
        self,
        data: list[dict],
        file_data: FileData,
        **kwargs: Any,
    ) -> None:
        """Split `data` into batches and upsert all of them via `asyncio.gather`.

        `file_data` and `kwargs` are accepted but not used here.
        """
        batch_size = self.upload_config.batch_size
        batches = list(batch_generator(data, batch_size=batch_size))
        logger.debug("Elements split into %i batches of size %i.", len(batches), batch_size)
        pending = [self._upsert_batch(chunk) for chunk in batches]
        await asyncio.gather(*pending)

    async def _upsert_batch(self, batch: list[dict]) -> None:
        """Upsert one batch of point dictionaries into the configured collection.

        Each call opens its own async client for the duration of the upsert.

        Raises:
            WriteError: If opening the client or the upsert call fails.
        """
        # Imported lazily so the module loads without the optional dependency.
        from qdrant_client import models

        points: list[models.PointStruct] = [models.PointStruct(**entry) for entry in batch]
        try:
            collection = self.upload_config.collection_name
            logger.debug("Upserting %i points to the '%s' collection.", len(points), collection)
            async with self.connection_config.get_async_client() as async_client:
                await async_client.upsert(collection, points=points, wait=True)
        except Exception as api_error:
            logger.error(
                "Failed to upsert points to the collection due to the following error %s", api_error
            )
            raise WriteError(f"Qdrant error: {api_error}") from api_error
        else:
            logger.debug("Successfully upsert points to the collection.")
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
|
|
3
|
+
from pydantic import Field, Secret
|
|
4
|
+
|
|
5
|
+
from unstructured_ingest.processes.connector_registry import DestinationRegistryEntry
|
|
6
|
+
from unstructured_ingest.processes.connectors.qdrant.qdrant import (
|
|
7
|
+
QdrantAccessConfig,
|
|
8
|
+
QdrantConnectionConfig,
|
|
9
|
+
QdrantUploader,
|
|
10
|
+
QdrantUploaderConfig,
|
|
11
|
+
QdrantUploadStager,
|
|
12
|
+
QdrantUploadStagerConfig,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
# Connector type identifier; used below as the default `connector_type` of ServerQdrantUploader.
CONNECTOR_TYPE = "qdrant-server"
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# Access config for a Qdrant server; declares no fields of its own.
class ServerQdrantAccessConfig(QdrantAccessConfig):
    pass
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
# Connection settings for a Qdrant server addressed by URL.
class ServerQdrantConnectionConfig(QdrantConnectionConfig):
    # NOTE(review): annotated as `str` but defaults to None — confirm whether the
    # URL is meant to be optional or required.
    url: str = Field(description="url of Qdrant server", default=None)
    # Defaults to an empty ServerQdrantAccessConfig, so callers may omit it.
    access_config: Secret[ServerQdrantAccessConfig] = Field(
        validate_default=True,
        default_factory=ServerQdrantAccessConfig,
    )

    def get_client_kwargs(self) -> dict:
        """Return the client keyword arguments: only `url`."""
        return dict(url=self.url)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
# Stager config for a Qdrant server; adds nothing to QdrantUploadStagerConfig.
class ServerQdrantUploadStagerConfig(QdrantUploadStagerConfig):
    pass
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass
class ServerQdrantUploadStager(QdrantUploadStager):
    # Qdrant server stager: only narrows the config type; staging logic is inherited.
    upload_stager_config: ServerQdrantUploadStagerConfig
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
# Uploader config for a Qdrant server; adds nothing to QdrantUploaderConfig.
class ServerQdrantUploaderConfig(QdrantUploaderConfig):
    pass
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@dataclass
class ServerQdrantUploader(QdrantUploader):
    # Qdrant server uploader: narrows the config types and sets the connector
    # type; upload behaviour is inherited from QdrantUploader.
    connection_config: ServerQdrantConnectionConfig
    upload_config: ServerQdrantUploaderConfig
    connector_type: str = CONNECTOR_TYPE
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
# Registry entry bundling the Qdrant server destination's config and process classes.
qdrant_server_destination_entry = DestinationRegistryEntry(
    connection_config=ServerQdrantConnectionConfig,
    upload_stager_config=ServerQdrantUploadStagerConfig,
    upload_stager=ServerQdrantUploadStager,
    uploader_config=ServerQdrantUploaderConfig,
    uploader=ServerQdrantUploader,
)
|
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from contextlib import asynccontextmanager, contextmanager
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from typing import TYPE_CHECKING, Any, AsyncGenerator, Generator, Optional
|
|
5
|
+
|
|
6
|
+
from pydantic import Field, Secret, model_validator
|
|
7
|
+
|
|
8
|
+
from unstructured_ingest.data_types.file_data import FileData
|
|
9
|
+
from unstructured_ingest.error import DestinationConnectionError, ResponseError, ValueError
|
|
10
|
+
from unstructured_ingest.interfaces import (
|
|
11
|
+
AccessConfig,
|
|
12
|
+
ConnectionConfig,
|
|
13
|
+
Uploader,
|
|
14
|
+
UploaderConfig,
|
|
15
|
+
)
|
|
16
|
+
from unstructured_ingest.logger import logger
|
|
17
|
+
from unstructured_ingest.processes.connector_registry import DestinationRegistryEntry
|
|
18
|
+
from unstructured_ingest.utils.data_prep import batch_generator
|
|
19
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
20
|
+
|
|
21
|
+
if TYPE_CHECKING:
|
|
22
|
+
from redis.asyncio import Redis
|
|
23
|
+
|
|
24
|
+
import asyncio
|
|
25
|
+
|
|
26
|
+
# Connector type identifier; default value of RedisConnectionConfig.connector_type.
CONNECTOR_TYPE = "redis"
|
|
27
|
+
# NOTE(review): not referenced in the visible part of this file — confirm where it is used.
SERVER_API_VERSION = "1"
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# Secret settings for the Redis connection: a full URI and/or a password.
class RedisAccessConfig(AccessConfig):
    # When set, the clients are created from this URI rather than from host/port.
    uri: Optional[str] = Field(
        description="If not anonymous, use this uri, if specified.",
        default=None,
    )
    # Added to the client options only when it is non-empty.
    password: Optional[str] = Field(
        description="Password used to connect to database if uri is "
        "not specified and connection is not anonymous.",
        default=None,
    )
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class RedisConnectionConfig(ConnectionConfig):
    """Connection settings for Redis.

    A connection is described either by `access_config.uri` or, when no uri is
    given, by the discrete `host` / `port` / `ssl` / `username` / `database`
    fields (plus the optional password held in `access_config`).
    """

    access_config: Secret[RedisAccessConfig] = Field(
        validate_default=True,
        default=RedisAccessConfig(),
    )
    host: Optional[str] = Field(
        description=(
            "Hostname or IP address of a Redis instance to connect to if uri is not specified."
        ),
        default=None,
    )
    database: int = Field(description="Database index to connect to.", default=0)
    port: Optional[int] = Field(
        description="Port used to connect to database if uri is not specified.",
        default=6379,
    )
    username: Optional[str] = Field(
        description="Username used to connect to database if uri is not specified.",
        default=None,
    )
    ssl: Optional[bool] = Field(
        description="Whether the connection should use SSL encryption if uri is not specified.",
        default=True,
    )
    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)

    @model_validator(mode="after")
    def validate_host_or_url(self) -> "RedisConnectionConfig":
        """Require host, port and ssl to be set whenever no uri is configured.

        Raises:
            ValueError: if the uri is missing/empty and host is missing/empty,
                or port or ssl is None.
        """
        if self.access_config.get_secret_value().uri:
            # A uri carries the connection details; nothing else to check.
            return self
        if not self.host:
            raise ValueError("Please pass a hostname either directly or through uri")
        if self.port is None:
            raise ValueError("Since URI is not specified, port cannot be None")
        if self.ssl is None:
            raise ValueError("Since URI is not specified, ssl cannot be None")
        return self

    @requires_dependencies(["redis"], extras="redis")
    @asynccontextmanager
    async def create_async_client(self) -> AsyncGenerator["Redis", None]:
        """Yield a `redis.asyncio` client, entered as an async context manager.

        The client is built from the uri when one is set, otherwise from the
        discrete connection fields; the password is passed only when non-empty.
        """
        from redis.asyncio import Redis, from_url

        secrets = self.access_config.get_secret_value()

        if secrets.uri:
            connection = from_url(secrets.uri)
        else:
            connect_kwargs = dict(
                host=self.host,
                port=self.port,
                db=self.database,
                ssl=self.ssl,
                username=self.username,
            )
            if secrets.password:
                connect_kwargs["password"] = secrets.password
            connection = Redis(**connect_kwargs)

        async with connection as client:
            yield client

    @requires_dependencies(["redis"], extras="redis")
    @contextmanager
    def create_client(self) -> Generator["Redis", None, None]:
        """Yield a synchronous `redis` client, entered as a context manager.

        The client is built from the uri when one is set, otherwise from the
        discrete connection fields; the password is passed only when non-empty.
        """
        from redis import Redis, from_url

        secrets = self.access_config.get_secret_value()

        if secrets.uri:
            connection = from_url(secrets.uri)
        else:
            connect_kwargs = dict(
                host=self.host,
                port=self.port,
                db=self.database,
                ssl=self.ssl,
                username=self.username,
            )
            if secrets.password:
                connect_kwargs["password"] = secrets.password
            connection = Redis(**connect_kwargs)

        with connection as client:
            yield client
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
class RedisUploaderConfig(UploaderConfig):
    """Upload tuning for the Redis destination: batch size and key prefix."""

    batch_size: int = Field(description="Number of records per batch", default=100)
    key_prefix: str = Field(description="Prefix for Redis keys", default="")
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def _form_redis_pipeline_error_message(error: str) -> str:
|
|
130
|
+
"""
|
|
131
|
+
Form a user-friendly error message for Redis pipeline errors.
|
|
132
|
+
The error message has `$` character at the beginning and `) of pipeline` at the end.
|
|
133
|
+
Everything between these two strings is the value an should be removed.
|
|
134
|
+
"""
|
|
135
|
+
start = error.find("$")
|
|
136
|
+
end = error.find(") of pipeline")
|
|
137
|
+
if start != -1 and end != -1:
|
|
138
|
+
return error[: start + 1] + "<value>" + error[end:]
|
|
139
|
+
else:
|
|
140
|
+
return error
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
@dataclass
class RedisUploader(Uploader):
    """Write element dicts to Redis, one key per element.

    Each key is `upload_config.key_prefix` followed by the element's
    `element_id`. Values are stored with `JSON.SET` when the server accepts
    that command, and as JSON-encoded strings otherwise.
    """

    upload_config: RedisUploaderConfig
    connection_config: RedisConnectionConfig
    connector_type: str = CONNECTOR_TYPE

    def is_async(self) -> bool:
        """Return True; the upload logic of this class lives in `run_data_async`."""
        return True

    def precheck(self) -> None:
        """Ping the server through a synchronous client.

        Raises:
            DestinationConnectionError: if creating the client or the ping fails.
        """
        try:
            with self.connection_config.create_client() as client:
                client.ping()
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}") from e

    async def run_data_async(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
        """Write every element of `data` to Redis in concurrently awaited batches.

        The first element is written up front to probe whether the server
        supports `JSON.SET`; all batches then use the storage type the probe
        selected.

        Args:
            data: element dicts; each must contain an `element_id` key.
            file_data: not used by this method.
            **kwargs: not used by this method.
        """
        if not data:
            # Nothing to probe with and nothing to write; indexing data[0]
            # below would otherwise raise IndexError.
            logger.info("no objects to write to destination, skipping upload")
            return

        redis_stack = await self._check_redis_stack(data[0])
        logger.info(
            f"writing {len(data)} objects to destination asynchronously, "
            f"db, {self.connection_config.database}, "
            f"at {self.connection_config.host}",
        )

        batches = batch_generator(data, batch_size=self.upload_config.batch_size)
        await asyncio.gather(*(self._write_batch(batch, redis_stack) for batch in batches))

    async def _write_batch(self, batch: list[dict], redis_stack: bool) -> None:
        """Queue one write per element on a transactional pipeline and execute it.

        Args:
            batch: element dicts; each must contain an `element_id` key.
            redis_stack: True to store values with `JSON.SET`, False to store
                JSON-encoded strings with `SET`.
        """
        async with (
            self.connection_config.create_async_client() as async_client,
            async_client.pipeline(transaction=True) as pipe,
        ):
            for element in batch:
                key_with_prefix = f"{self.upload_config.key_prefix}{element['element_id']}"
                if redis_stack:
                    pipe.json().set(key_with_prefix, "$", element)
                else:
                    pipe.set(key_with_prefix, json.dumps(element))
            await pipe.execute()

    @requires_dependencies(["redis"], extras="redis")
    async def _check_redis_stack(self, element: dict) -> bool:
        """Probe for `JSON.SET` support by writing `element`.

        The element is written with `JSON.SET` first. If the server rejects
        that as an unknown command, the element is written as a JSON-encoded
        string instead.

        Args:
            element: element dict containing an `element_id` key.

        Returns:
            True if the `JSON.SET` write succeeded, False if the string
            fallback was used.

        Raises:
            ResponseError: for any other server error; the message has the
                stored value replaced by a placeholder.
        """
        from redis import exceptions as redis_exceptions

        # Older servers quote the command name with backticks; Redis 7+ is
        # believed to use single quotes. Accept both so the fallback triggers
        # on either.
        unknown_json_set_markers = (
            "unknown command `JSON.SET`",
            "unknown command 'JSON.SET'",
        )

        redis_stack = True
        async with (
            self.connection_config.create_async_client() as async_client,
            async_client.pipeline(transaction=True) as pipe,
        ):
            key_with_prefix = f"{self.upload_config.key_prefix}{element['element_id']}"
            try:
                # Redis with stack extension supports JSON type
                await pipe.json().set(key_with_prefix, "$", element).execute()
            except redis_exceptions.ResponseError as e:
                message = _form_redis_pipeline_error_message(str(e))
                if any(marker in message for marker in unknown_json_set_markers):
                    # if this error occurs, Redis server doesn't support JSON type,
                    # so save as string type instead
                    await pipe.set(key_with_prefix, json.dumps(element)).execute()
                    redis_stack = False
                else:
                    # NOTE(review): ResponseError is assumed to be imported at the
                    # top of this module (not visible in this excerpt) — confirm.
                    raise ResponseError(message) from e
        return redis_stack
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
# Registry entry that ties the Redis connection config, uploader config and
# uploader together as one destination connector.
redis_destination_entry = DestinationRegistryEntry(
    uploader=RedisUploader,
    uploader_config=RedisUploaderConfig,
    connection_config=RedisConnectionConfig,
)
|