unstructured-ingest 1.2.32__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic.
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/README.md +28 -0
- unstructured_ingest/cli/__init__.py +0 -0
- unstructured_ingest/cli/base/__init__.py +4 -0
- unstructured_ingest/cli/base/cmd.py +269 -0
- unstructured_ingest/cli/base/dest.py +84 -0
- unstructured_ingest/cli/base/importer.py +34 -0
- unstructured_ingest/cli/base/src.py +75 -0
- unstructured_ingest/cli/cli.py +24 -0
- unstructured_ingest/cli/cmds.py +14 -0
- unstructured_ingest/cli/utils/__init__.py +0 -0
- unstructured_ingest/cli/utils/click.py +237 -0
- unstructured_ingest/cli/utils/model_conversion.py +222 -0
- unstructured_ingest/data_types/__init__.py +0 -0
- unstructured_ingest/data_types/entities.py +17 -0
- unstructured_ingest/data_types/file_data.py +116 -0
- unstructured_ingest/embed/__init__.py +0 -0
- unstructured_ingest/embed/azure_openai.py +63 -0
- unstructured_ingest/embed/bedrock.py +323 -0
- unstructured_ingest/embed/huggingface.py +69 -0
- unstructured_ingest/embed/interfaces.py +146 -0
- unstructured_ingest/embed/mixedbreadai.py +134 -0
- unstructured_ingest/embed/octoai.py +133 -0
- unstructured_ingest/embed/openai.py +142 -0
- unstructured_ingest/embed/togetherai.py +116 -0
- unstructured_ingest/embed/vertexai.py +109 -0
- unstructured_ingest/embed/voyageai.py +130 -0
- unstructured_ingest/error.py +156 -0
- unstructured_ingest/errors_v2.py +156 -0
- unstructured_ingest/interfaces/__init__.py +27 -0
- unstructured_ingest/interfaces/connector.py +56 -0
- unstructured_ingest/interfaces/downloader.py +90 -0
- unstructured_ingest/interfaces/indexer.py +29 -0
- unstructured_ingest/interfaces/process.py +22 -0
- unstructured_ingest/interfaces/processor.py +88 -0
- unstructured_ingest/interfaces/upload_stager.py +89 -0
- unstructured_ingest/interfaces/uploader.py +67 -0
- unstructured_ingest/logger.py +39 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/otel.py +128 -0
- unstructured_ingest/pipeline/__init__.py +0 -0
- unstructured_ingest/pipeline/interfaces.py +211 -0
- unstructured_ingest/pipeline/otel.py +32 -0
- unstructured_ingest/pipeline/pipeline.py +408 -0
- unstructured_ingest/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/pipeline/steps/chunk.py +78 -0
- unstructured_ingest/pipeline/steps/download.py +206 -0
- unstructured_ingest/pipeline/steps/embed.py +77 -0
- unstructured_ingest/pipeline/steps/filter.py +35 -0
- unstructured_ingest/pipeline/steps/index.py +86 -0
- unstructured_ingest/pipeline/steps/partition.py +77 -0
- unstructured_ingest/pipeline/steps/stage.py +65 -0
- unstructured_ingest/pipeline/steps/uncompress.py +50 -0
- unstructured_ingest/pipeline/steps/upload.py +58 -0
- unstructured_ingest/processes/__init__.py +18 -0
- unstructured_ingest/processes/chunker.py +131 -0
- unstructured_ingest/processes/connector_registry.py +69 -0
- unstructured_ingest/processes/connectors/__init__.py +129 -0
- unstructured_ingest/processes/connectors/airtable.py +238 -0
- unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
- unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
- unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
- unstructured_ingest/processes/connectors/astradb.py +592 -0
- unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
- unstructured_ingest/processes/connectors/chroma.py +193 -0
- unstructured_ingest/processes/connectors/confluence.py +527 -0
- unstructured_ingest/processes/connectors/couchbase.py +336 -0
- unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
- unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
- unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
- unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
- unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
- unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
- unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
- unstructured_ingest/processes/connectors/delta_table.py +310 -0
- unstructured_ingest/processes/connectors/discord.py +161 -0
- unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
- unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
- unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
- unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
- unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
- unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
- unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
- unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
- unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
- unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
- unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
- unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
- unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
- unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
- unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/processes/connectors/github.py +226 -0
- unstructured_ingest/processes/connectors/gitlab.py +270 -0
- unstructured_ingest/processes/connectors/google_drive.py +848 -0
- unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
- unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
- unstructured_ingest/processes/connectors/jira.py +522 -0
- unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
- unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
- unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
- unstructured_ingest/processes/connectors/kafka/local.py +103 -0
- unstructured_ingest/processes/connectors/kdbai.py +156 -0
- unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
- unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
- unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
- unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
- unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
- unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
- unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
- unstructured_ingest/processes/connectors/local.py +227 -0
- unstructured_ingest/processes/connectors/milvus.py +311 -0
- unstructured_ingest/processes/connectors/mongodb.py +389 -0
- unstructured_ingest/processes/connectors/neo4j.py +534 -0
- unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
- unstructured_ingest/processes/connectors/notion/client.py +349 -0
- unstructured_ingest/processes/connectors/notion/connector.py +350 -0
- unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
- unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
- unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
- unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
- unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
- unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
- unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
- unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
- unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
- unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
- unstructured_ingest/processes/connectors/onedrive.py +485 -0
- unstructured_ingest/processes/connectors/outlook.py +242 -0
- unstructured_ingest/processes/connectors/pinecone.py +400 -0
- unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
- unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
- unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
- unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
- unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
- unstructured_ingest/processes/connectors/redisdb.py +214 -0
- unstructured_ingest/processes/connectors/salesforce.py +307 -0
- unstructured_ingest/processes/connectors/sharepoint.py +282 -0
- unstructured_ingest/processes/connectors/slack.py +249 -0
- unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
- unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
- unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
- unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
- unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
- unstructured_ingest/processes/connectors/sql/sql.py +456 -0
- unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
- unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
- unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
- unstructured_ingest/processes/connectors/utils.py +60 -0
- unstructured_ingest/processes/connectors/vectara.py +348 -0
- unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
- unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
- unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
- unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
- unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
- unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
- unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
- unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
- unstructured_ingest/processes/embedder.py +203 -0
- unstructured_ingest/processes/filter.py +60 -0
- unstructured_ingest/processes/partitioner.py +233 -0
- unstructured_ingest/processes/uncompress.py +61 -0
- unstructured_ingest/processes/utils/__init__.py +8 -0
- unstructured_ingest/processes/utils/blob_storage.py +32 -0
- unstructured_ingest/processes/utils/logging/connector.py +365 -0
- unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
- unstructured_ingest/unstructured_api.py +140 -0
- unstructured_ingest/utils/__init__.py +5 -0
- unstructured_ingest/utils/chunking.py +56 -0
- unstructured_ingest/utils/compression.py +72 -0
- unstructured_ingest/utils/constants.py +2 -0
- unstructured_ingest/utils/data_prep.py +216 -0
- unstructured_ingest/utils/dep_check.py +78 -0
- unstructured_ingest/utils/filesystem.py +27 -0
- unstructured_ingest/utils/html.py +174 -0
- unstructured_ingest/utils/ndjson.py +52 -0
- unstructured_ingest/utils/pydantic_models.py +52 -0
- unstructured_ingest/utils/string_and_date_utils.py +74 -0
- unstructured_ingest/utils/table.py +80 -0
- unstructured_ingest/utils/tls.py +15 -0
- unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
- unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
- unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
- unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
- unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
unstructured_ingest/processes/connectors/kafka/kafka.py
@@ -0,0 +1,275 @@
+import json
+from abc import ABC, abstractmethod
+from contextlib import contextmanager
+from dataclasses import dataclass, field
+from pathlib import Path
+from time import time
+from typing import TYPE_CHECKING, Any, ContextManager, Generator, Optional
+
+from pydantic import Field, Secret
+
+from unstructured_ingest.data_types.file_data import (
+    FileData,
+    FileDataSourceMetadata,
+    SourceIdentifiers,
+)
+from unstructured_ingest.error import (
+    DestinationConnectionError,
+    SourceConnectionError,
+    SourceConnectionNetworkError,
+    UnstructuredIngestError,
+    ValueError,
+)
+from unstructured_ingest.interfaces import (
+    AccessConfig,
+    ConnectionConfig,
+    Downloader,
+    DownloaderConfig,
+    DownloadResponse,
+    Indexer,
+    IndexerConfig,
+    Uploader,
+    UploaderConfig,
+)
+from unstructured_ingest.logger import logger
+from unstructured_ingest.utils.data_prep import batch_generator
+from unstructured_ingest.utils.dep_check import requires_dependencies
+
+if TYPE_CHECKING:
+    from confluent_kafka import Consumer, Producer
+
+
+class KafkaAccessConfig(AccessConfig, ABC):
+    pass
+
+
+class KafkaConnectionConfig(ConnectionConfig, ABC):
+    access_config: Secret[KafkaAccessConfig]
+    bootstrap_server: str
+    port: int
+    group_id: str = Field(
+        description="A consumer group is a way to allow a pool of consumers "
+        "to divide the consumption of data over topics and partitions.",
+        default="default_group_id",
+    )
+
+    @abstractmethod
+    def get_consumer_configuration(self) -> dict:
+        pass
+
+    @abstractmethod
+    def get_producer_configuration(self) -> dict:
+        pass
+
+    @contextmanager
+    @requires_dependencies(["confluent_kafka"], extras="kafka")
+    def get_consumer(self) -> ContextManager["Consumer"]:
+        from confluent_kafka import Consumer
+
+        consumer = Consumer(self.get_consumer_configuration())
+        try:
+            logger.debug("kafka consumer connected")
+            yield consumer
+        finally:
+            consumer.close()
+
+    @requires_dependencies(["confluent_kafka"], extras="kafka")
+    def get_producer(self) -> "Producer":
+        from confluent_kafka import Producer
+
+        producer = Producer(self.get_producer_configuration())
+        return producer
+
+
+class KafkaIndexerConfig(IndexerConfig):
+    topic: str = Field(description="which topic to consume from")
+    num_messages_to_consume: Optional[int] = 100
+    timeout: Optional[float] = Field(default=3.0, description="polling timeout", ge=3.0)
+
+    def update_consumer(self, consumer: "Consumer") -> None:
+        consumer.subscribe([self.topic])
+
+
+@dataclass
+class KafkaIndexer(Indexer, ABC):
+    connection_config: KafkaConnectionConfig
+    index_config: KafkaIndexerConfig
+
+    @contextmanager
+    def get_consumer(self) -> ContextManager["Consumer"]:
+        with self.connection_config.get_consumer() as consumer:
+            self.index_config.update_consumer(consumer=consumer)
+            yield consumer
+
+    @requires_dependencies(["confluent_kafka"], extras="kafka")
+    def generate_messages(self) -> Generator[Any, None, None]:
+        from confluent_kafka import KafkaError
+
+        messages_consumed = 0
+        max_empty_polls = 10
+        empty_polls = 0
+        num_messages_to_consume = self.index_config.num_messages_to_consume
+        with self.get_consumer() as consumer:
+            while messages_consumed < num_messages_to_consume and empty_polls < max_empty_polls:
+                msg = consumer.poll(timeout=self.index_config.timeout)
+                if msg is None:
+                    logger.debug("No Kafka messages found")
+                    empty_polls += 1
+                    continue
+                if msg.error():
+                    if msg.error().code() == KafkaError._PARTITION_EOF:
+                        logger.info(
+                            "Reached end of partition for topic %s [%d] at offset %d"
+                            % (msg.topic(), msg.partition(), msg.offset())
+                        )
+                        break
+                    else:
+                        raise UnstructuredIngestError(msg.error())
+                try:
+                    empty_polls = 0
+                    messages_consumed += 1
+                    yield msg
+                finally:
+                    consumer.commit(asynchronous=False)
+
+    def generate_file_data(self, msg) -> FileData:
+        msg_content = msg.value().decode("utf8")
+        identifier = f"{msg.topic()}_{msg.partition()}_{msg.offset()}"
+        additional_metadata = {
+            "topic": msg.topic(),
+            "partition": msg.partition(),
+            "offset": msg.offset(),
+            "content": msg_content,
+        }
+        filename = f"{identifier}.txt"
+        return FileData(
+            identifier=identifier,
+            connector_type=self.connector_type,
+            source_identifiers=SourceIdentifiers(
+                filename=filename,
+                fullpath=filename,
+            ),
+            metadata=FileDataSourceMetadata(
+                date_processed=str(time()),
+            ),
+            additional_metadata=additional_metadata,
+            display_name=filename,
+        )
+
+    def run(self) -> Generator[FileData, None, None]:
+        for message in self.generate_messages():
+            yield self.generate_file_data(message)
+
+    async def run_async(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
+        raise NotImplementedError()
+
+    def precheck(self):
+        try:
+            with self.get_consumer() as consumer:
+                # timeout needs at least 3 secs, more info:
+                # https://forum.confluent.io/t/kafkacat-connect-failure-to-confcloud-ssl/2513
+                cluster_meta = consumer.list_topics(timeout=5)
+                current_topics = [
+                    topic for topic in cluster_meta.topics if topic != "__consumer_offsets"
+                ]
+                if self.index_config.topic not in current_topics:
+                    raise SourceConnectionError(
+                        "expected topic '{}' not detected in cluster: '{}'".format(
+                            self.index_config.topic, ", ".join(current_topics)
+                        )
+                    )
+                logger.info(f"successfully checked available topics: {current_topics}")
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")
+
+
+class KafkaDownloaderConfig(DownloaderConfig):
+    pass
+
+
+@dataclass
+class KafkaDownloader(Downloader, ABC):
+    connection_config: KafkaConnectionConfig
+    download_config: KafkaDownloaderConfig = field(default_factory=KafkaDownloaderConfig)
+    version: Optional[str] = None
+    source_url: Optional[str] = None
+
+    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
+        source_identifiers = file_data.source_identifiers
+        if source_identifiers is None:
+            raise ValueError("FileData is missing source_identifiers")
+
+        # Build the download path using source_identifiers
+        download_path = Path(self.download_dir) / source_identifiers.relative_path
+        download_path.parent.mkdir(parents=True, exist_ok=True)
+
+        try:
+            content = file_data.additional_metadata["content"]
+            with open(download_path, "w") as file:
+                file.write(content)
+        except Exception as e:
+            logger.error(f"Failed to download file {file_data.identifier}: {e}")
+            raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
+
+        return self.generate_download_response(file_data=file_data, download_path=download_path)
+
+
+class KafkaUploaderConfig(UploaderConfig):
+    batch_size: int = Field(default=100, description="Batch size")
+    topic: str = Field(description="which topic to write to")
+    timeout: Optional[float] = Field(
+        default=10.0, description="Timeout in seconds to flush batch of messages"
+    )
+
+
+@dataclass
+class KafkaUploader(Uploader, ABC):
+    connection_config: KafkaConnectionConfig
+    upload_config: KafkaUploaderConfig
+
+    def precheck(self):
+        try:
+            with self.connection_config.get_consumer() as consumer:
+                cluster_meta = consumer.list_topics(timeout=self.upload_config.timeout)
+                current_topics = [
+                    topic for topic in cluster_meta.topics if topic != "__consumer_offsets"
+                ]
+                logger.info(f"successfully checked available topics: {current_topics}")
+                if self.upload_config.topic not in current_topics:
+                    raise DestinationConnectionError(
+                        "expected topic '{}' not detected in cluster: '{}'".format(
+                            self.upload_config.topic, ", ".join(current_topics)
+                        )
+                    )
+
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
+    def produce_batch(self, elements: list[dict]) -> None:
+        producer = self.connection_config.get_producer()
+        failed_producer = False
+
+        def acked(err, msg):
+            nonlocal failed_producer
+            if err is not None:
+                failed_producer = True
+                logger.error("Failed to deliver kafka message: %s: %s" % (str(msg), str(err)))
+
+        for element in elements:
+            producer.produce(
+                topic=self.upload_config.topic,
+                value=json.dumps(element),
+                callback=acked,
+            )
+
+        while producer_len := len(producer):
+            logger.debug(f"another iteration of kafka producer flush. Queue length: {producer_len}")
+            producer.flush(timeout=self.upload_config.timeout)
+        if failed_producer:
+            raise UnstructuredIngestError("failed to produce all kafka messages in batch")
+
+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        for element_batch in batch_generator(data, batch_size=self.upload_config.batch_size):
+            self.produce_batch(elements=element_batch)
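A worked example of the FileData mapping above (not part of the diff): for a hypothetical message consumed from topic "docs", partition 0, offset 42, with payload b"hello", generate_file_data produces:

    identifier = "docs_0_42"       # f"{topic}_{partition}_{offset}"
    filename = "docs_0_42.txt"     # reused as filename, fullpath, and display_name
    additional_metadata = {
        "topic": "docs",
        "partition": 0,
        "offset": 42,
        "content": "hello",        # decoded payload travels inside the metadata,
    }                              # so the downloader writes it to disk without
                                   # contacting Kafka again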
unstructured_ingest/processes/connectors/kafka/local.py
@@ -0,0 +1,103 @@
+from dataclasses import dataclass
+from typing import TYPE_CHECKING
+
+from pydantic import Field, Secret
+
+from unstructured_ingest.processes.connector_registry import (
+    DestinationRegistryEntry,
+    SourceRegistryEntry,
+)
+from unstructured_ingest.processes.connectors.kafka.kafka import (
+    KafkaAccessConfig,
+    KafkaConnectionConfig,
+    KafkaDownloader,
+    KafkaDownloaderConfig,
+    KafkaIndexer,
+    KafkaIndexerConfig,
+    KafkaUploader,
+    KafkaUploaderConfig,
+)
+
+if TYPE_CHECKING:
+    pass
+
+CONNECTOR_TYPE = "kafka-local"
+
+
+class LocalKafkaAccessConfig(KafkaAccessConfig):
+    pass
+
+
+class LocalKafkaConnectionConfig(KafkaConnectionConfig):
+    access_config: Secret[LocalKafkaAccessConfig] = Field(
+        default=LocalKafkaAccessConfig(), validate_default=True
+    )
+
+    def get_consumer_configuration(self) -> dict:
+        bootstrap = self.bootstrap_server
+        port = self.port
+
+        conf = {
+            "bootstrap.servers": f"{bootstrap}:{port}",
+            "group.id": self.group_id,
+            "enable.auto.commit": "false",
+            "auto.offset.reset": "earliest",
+        }
+        return conf
+
+    def get_producer_configuration(self) -> dict:
+        bootstrap = self.bootstrap_server
+        port = self.port
+
+        conf = {
+            "bootstrap.servers": f"{bootstrap}:{port}",
+        }
+        return conf
+
+
+class LocalKafkaIndexerConfig(KafkaIndexerConfig):
+    pass
+
+
+@dataclass
+class LocalKafkaIndexer(KafkaIndexer):
+    connection_config: LocalKafkaConnectionConfig
+    index_config: LocalKafkaIndexerConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
+class LocalKafkaDownloaderConfig(KafkaDownloaderConfig):
+    pass
+
+
+@dataclass
+class LocalKafkaDownloader(KafkaDownloader):
+    connection_config: LocalKafkaConnectionConfig
+    download_config: LocalKafkaDownloaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
+class LocalKafkaUploaderConfig(KafkaUploaderConfig):
+    pass
+
+
+@dataclass
+class LocalKafkaUploader(KafkaUploader):
+    connection_config: LocalKafkaConnectionConfig
+    upload_config: LocalKafkaUploaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
+kafka_local_source_entry = SourceRegistryEntry(
+    connection_config=LocalKafkaConnectionConfig,
+    indexer=LocalKafkaIndexer,
+    indexer_config=LocalKafkaIndexerConfig,
+    downloader=LocalKafkaDownloader,
+    downloader_config=LocalKafkaDownloaderConfig,
+)
+
+kafka_local_destination_entry = DestinationRegistryEntry(
+    connection_config=LocalKafkaConnectionConfig,
+    uploader=LocalKafkaUploader,
+    uploader_config=LocalKafkaUploaderConfig,
+)
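A minimal usage sketch for the local variant (not part of the diff; the broker address and topic name are assumptions, and confluent_kafka must be installed, e.g. via the kafka extra):

    from unstructured_ingest.processes.connectors.kafka.local import (
        LocalKafkaConnectionConfig,
        LocalKafkaIndexer,
        LocalKafkaIndexerConfig,
    )

    # Assumed broker on localhost:9092 with an existing "ingest-topic" topic.
    connection = LocalKafkaConnectionConfig(bootstrap_server="localhost", port=9092)
    indexer = LocalKafkaIndexer(
        connection_config=connection,
        index_config=LocalKafkaIndexerConfig(topic="ingest-topic", num_messages_to_consume=10),
    )
    indexer.precheck()               # raises SourceConnectionError if the topic is missing
    for file_data in indexer.run():  # yields one FileData per consumed message
        print(file_data.identifier)  # e.g. "ingest-topic_0_0"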
unstructured_ingest/processes/connectors/kdbai.py
@@ -0,0 +1,156 @@
+from contextlib import contextmanager
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Generator, Optional
+
+from pydantic import Field, Secret
+
+from unstructured_ingest.data_types.file_data import FileData
+from unstructured_ingest.error import DestinationConnectionError
+from unstructured_ingest.interfaces import (
+    AccessConfig,
+    ConnectionConfig,
+    Uploader,
+    UploaderConfig,
+    UploadStager,
+    UploadStagerConfig,
+)
+from unstructured_ingest.logger import logger
+from unstructured_ingest.processes.connector_registry import (
+    DestinationRegistryEntry,
+)
+from unstructured_ingest.utils.data_prep import (
+    flatten_dict,
+    get_data_df,
+    get_enhanced_element_id,
+    split_dataframe,
+)
+from unstructured_ingest.utils.dep_check import requires_dependencies
+
+if TYPE_CHECKING:
+    from kdbai_client import Database, Session, Table
+    from pandas import DataFrame
+
+CONNECTOR_TYPE = "kdbai"
+
+
+class KdbaiAccessConfig(AccessConfig):
+    api_key: Optional[str] = Field(
+        default=None,
+        description="A string for the api-key, can be left empty "
+        "when connecting to local KDBAI instance.",
+    )
+
+
+class KdbaiConnectionConfig(ConnectionConfig):
+    access_config: Secret[KdbaiAccessConfig] = Field(
+        default=KdbaiAccessConfig(), validate_default=True
+    )
+    endpoint: str = Field(
+        default="http://localhost:8082", description="Endpoint url where KDBAI is hosted."
+    )
+
+    @requires_dependencies(["kdbai_client"], extras="kdbai")
+    @contextmanager
+    def get_client(self) -> Generator["Session", None, None]:
+        from kdbai_client import Session
+
+        session = None
+        try:
+            session = Session(
+                api_key=self.access_config.get_secret_value().api_key, endpoint=self.endpoint
+            )
+            yield session
+        finally:
+            if session:
+                session.close()
+
+
+class KdbaiUploadStagerConfig(UploadStagerConfig):
+    pass
+
+
+@dataclass
+class KdbaiUploadStager(UploadStager):
+    upload_stager_config: KdbaiUploadStagerConfig = field(default_factory=KdbaiUploadStagerConfig)
+
+    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
+        data = element_dict.copy()
+        return {
+            "id": get_enhanced_element_id(element_dict=data, file_data=file_data),
+            "element_id": data.get("element_id"),
+            "document": data.pop("text", None),
+            "embeddings": data.get("embeddings"),
+            "metadata": flatten_dict(
+                dictionary=data.get("metadata"),
+                flatten_lists=True,
+                remove_none=True,
+            ),
+        }
+
+
+class KdbaiUploaderConfig(UploaderConfig):
+    database_name: str = Field(
+        default="default", description="The name of the KDBAI database to write into."
+    )
+    table_name: str = Field(description="The name of the KDBAI table to write into.")
+    batch_size: int = Field(default=100, description="Number of records per batch")
+
+
+@dataclass
+class KdbaiUploader(Uploader):
+    connection_config: KdbaiConnectionConfig
+    upload_config: KdbaiUploaderConfig
+    connector_type: str = field(default=CONNECTOR_TYPE, init=False)
+
+    def precheck(self) -> None:
+        try:
+            self.get_database()
+        except Exception as e:
+            logger.error(f"Failed to validate connection {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
+    @contextmanager
+    def get_database(self) -> Generator["Database", None, None]:
+        with self.connection_config.get_client() as client:
+            db = client.database(self.upload_config.database_name)
+            yield db
+
+    @contextmanager
+    def get_table(self) -> Generator["Table", None, None]:
+        with self.get_database() as db:
+            table = db.table(self.upload_config.table_name)
+            yield table
+
+    def upsert_batch(self, batch: "DataFrame"):
+        with self.get_table() as table:
+            table.insert(batch)
+
+    def process_dataframe(self, df: "DataFrame"):
+        logger.debug(
+            f"uploading {len(df)} entries to {self.connection_config.endpoint} "
+            f"db {self.upload_config.database_name} in table {self.upload_config.table_name}"
+        )
+        for batch_df in split_dataframe(df=df, chunk_size=self.upload_config.batch_size):
+            self.upsert_batch(batch=batch_df)
+
+    @requires_dependencies(["pandas"], extras="kdbai")
+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        import pandas as pd
+
+        df = pd.DataFrame(data=data)
+        self.process_dataframe(df=df)
+
+    @requires_dependencies(["pandas"], extras="kdbai")
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        data = get_data_df(path=path)
+        self.process_dataframe(df=data)
+
+
+kdbai_destination_entry = DestinationRegistryEntry(
+    connection_config=KdbaiConnectionConfig,
+    uploader=KdbaiUploader,
+    uploader_config=KdbaiUploaderConfig,
+    upload_stager=KdbaiUploadStager,
+    upload_stager_config=KdbaiUploadStagerConfig,
)
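To illustrate the staging step (not part of the diff): conform_dict reshapes each partitioned element into the flat record that gets inserted into KDB.AI. The element values below are hypothetical, and file_data stands for any FileData instance:

    stager = KdbaiUploadStager()
    record = stager.conform_dict(
        element_dict={
            "element_id": "abc123",
            "text": "Some partitioned text",
            "embeddings": [0.1, 0.2, 0.3],
            "metadata": {"filename": "doc.pdf"},
        },
        file_data=file_data,
    )
    # record["document"] == "Some partitioned text"  ("text" is popped and renamed)
    # record["id"] is a deterministic id derived from the element and file_data
    # record["metadata"] is the flattened metadata dict with None values removed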
unstructured_ingest/processes/connectors/lancedb/__init__.py
@@ -0,0 +1,30 @@
+from __future__ import annotations
+
+from unstructured_ingest.processes.connector_registry import add_destination_entry
+
+from .aws import CONNECTOR_TYPE as LANCEDB_S3_CONNECTOR_TYPE
+from .aws import lancedb_aws_destination_entry
+from .azure import CONNECTOR_TYPE as LANCEDB_AZURE_CONNECTOR_TYPE
+from .azure import lancedb_azure_destination_entry
+from .cloud import CONNECTOR_TYPE as LANCEDB_CLOUD_CONNECTOR_TYPE
+from .cloud import lancedb_cloud_destination_entry
+from .gcp import CONNECTOR_TYPE as LANCEDB_GCS_CONNECTOR_TYPE
+from .gcp import lancedb_gcp_destination_entry
+from .local import CONNECTOR_TYPE as LANCEDB_LOCAL_CONNECTOR_TYPE
+from .local import lancedb_local_destination_entry
+
+add_destination_entry(
+    destination_type=LANCEDB_S3_CONNECTOR_TYPE, entry=lancedb_aws_destination_entry
+)
+add_destination_entry(
+    destination_type=LANCEDB_AZURE_CONNECTOR_TYPE, entry=lancedb_azure_destination_entry
+)
+add_destination_entry(
+    destination_type=LANCEDB_GCS_CONNECTOR_TYPE, entry=lancedb_gcp_destination_entry
+)
+add_destination_entry(
+    destination_type=LANCEDB_LOCAL_CONNECTOR_TYPE, entry=lancedb_local_destination_entry
+)
+add_destination_entry(
+    destination_type=LANCEDB_CLOUD_CONNECTOR_TYPE, entry=lancedb_cloud_destination_entry
+)
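Registration here happens at import time: loading the subpackage executes the five add_destination_entry calls, which is all it takes for the lancedb_* destination types to become visible to the pipeline. A one-line sketch (assuming the package is installed):

    # Side-effect import: runs the add_destination_entry calls above.
    import unstructured_ingest.processes.connectors.lancedb  # noqa: F401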
unstructured_ingest/processes/connectors/lancedb/aws.py
@@ -0,0 +1,43 @@
+from dataclasses import dataclass
+
+from pydantic import Field, Secret
+
+from unstructured_ingest.interfaces.connector import AccessConfig
+from unstructured_ingest.processes.connector_registry import DestinationRegistryEntry
+from unstructured_ingest.processes.connectors.lancedb.lancedb import (
+    LanceDBRemoteConnectionConfig,
+    LanceDBUploader,
+    LanceDBUploaderConfig,
+    LanceDBUploadStager,
+    LanceDBUploadStagerConfig,
+)
+
+CONNECTOR_TYPE = "lancedb_aws"
+
+
+class LanceDBAwsAccessConfig(AccessConfig):
+    aws_access_key_id: str = Field(description="The AWS access key ID to use.")
+    aws_secret_access_key: str = Field(description="The AWS secret access key to use.")
+
+
+class LanceDBAwsConnectionConfig(LanceDBRemoteConnectionConfig):
+    access_config: Secret[LanceDBAwsAccessConfig]
+
+    def get_storage_options(self) -> dict:
+        return {**self.access_config.get_secret_value().model_dump(), "timeout": self.timeout}
+
+
+@dataclass
+class LanceDBAwsUploader(LanceDBUploader):
+    upload_config: LanceDBUploaderConfig
+    connection_config: LanceDBAwsConnectionConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
+lancedb_aws_destination_entry = DestinationRegistryEntry(
+    connection_config=LanceDBAwsConnectionConfig,
+    uploader=LanceDBAwsUploader,
+    uploader_config=LanceDBUploaderConfig,
+    upload_stager_config=LanceDBUploadStagerConfig,
+    upload_stager=LanceDBUploadStager,
+)
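get_storage_options merges the unwrapped secret fields with the shared timeout into the plain dict handed to LanceDB; azure.py and cloud.py below repeat the same pattern with their own credential fields. A sketch with placeholder credentials (LanceDBRemoteConnectionConfig, which supplies the timeout field and any other required base fields, lives in lancedb.py outside this excerpt):

    from unstructured_ingest.processes.connectors.lancedb.aws import (
        LanceDBAwsAccessConfig,
        LanceDBAwsConnectionConfig,
    )

    config = LanceDBAwsConnectionConfig(
        access_config=LanceDBAwsAccessConfig(
            aws_access_key_id="AKIA...",        # placeholder credentials
            aws_secret_access_key="wJalr...",
        ),
        # ...plus whatever base fields LanceDBRemoteConnectionConfig requires
    )
    config.get_storage_options()
    # {"aws_access_key_id": "AKIA...", "aws_secret_access_key": "wJalr...", "timeout": ...}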
unstructured_ingest/processes/connectors/lancedb/azure.py
@@ -0,0 +1,43 @@
+from dataclasses import dataclass
+
+from pydantic import Field, Secret
+
+from unstructured_ingest.interfaces.connector import AccessConfig
+from unstructured_ingest.processes.connector_registry import DestinationRegistryEntry
+from unstructured_ingest.processes.connectors.lancedb.lancedb import (
+    LanceDBRemoteConnectionConfig,
+    LanceDBUploader,
+    LanceDBUploaderConfig,
+    LanceDBUploadStager,
+    LanceDBUploadStagerConfig,
+)
+
+CONNECTOR_TYPE = "lancedb_azure"
+
+
+class LanceDBAzureAccessConfig(AccessConfig):
+    azure_storage_account_name: str = Field(description="The name of the azure storage account.")
+    azure_storage_account_key: str = Field(description="The serialized azure service account key.")
+
+
+class LanceDBAzureConnectionConfig(LanceDBRemoteConnectionConfig):
+    access_config: Secret[LanceDBAzureAccessConfig]
+
+    def get_storage_options(self) -> dict:
+        return {**self.access_config.get_secret_value().model_dump(), "timeout": self.timeout}
+
+
+@dataclass
+class LanceDBAzureUploader(LanceDBUploader):
+    upload_config: LanceDBUploaderConfig
+    connection_config: LanceDBAzureConnectionConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
+lancedb_azure_destination_entry = DestinationRegistryEntry(
+    connection_config=LanceDBAzureConnectionConfig,
+    uploader=LanceDBAzureUploader,
+    uploader_config=LanceDBUploaderConfig,
+    upload_stager_config=LanceDBUploadStagerConfig,
+    upload_stager=LanceDBUploadStager,
+)
unstructured_ingest/processes/connectors/lancedb/cloud.py
@@ -0,0 +1,42 @@
+from dataclasses import dataclass
+
+from pydantic import Field, Secret
+
+from unstructured_ingest.interfaces.connector import AccessConfig
+from unstructured_ingest.processes.connector_registry import DestinationRegistryEntry
+from unstructured_ingest.processes.connectors.lancedb.lancedb import (
+    LanceDBRemoteConnectionConfig,
+    LanceDBUploader,
+    LanceDBUploaderConfig,
+    LanceDBUploadStager,
+    LanceDBUploadStagerConfig,
+)
+
+CONNECTOR_TYPE = "lancedb_cloud"
+
+
+class LanceDBCloudAccessConfig(AccessConfig):
+    api_key: str = Field(description="Api key associated with LanceDb cloud")
+
+
+class LanceDBCloudConnectionConfig(LanceDBRemoteConnectionConfig):
+    access_config: Secret[LanceDBCloudAccessConfig]
+
+    def get_storage_options(self) -> dict:
+        return {**self.access_config.get_secret_value().model_dump(), "timeout": self.timeout}
+
+
+@dataclass
+class LanceDBCloudUploader(LanceDBUploader):
+    upload_config: LanceDBUploaderConfig
+    connection_config: LanceDBCloudConnectionConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
+lancedb_cloud_destination_entry = DestinationRegistryEntry(
+    connection_config=LanceDBCloudConnectionConfig,
+    uploader=LanceDBCloudUploader,
+    uploader_config=LanceDBUploaderConfig,
+    upload_stager_config=LanceDBUploadStagerConfig,
+    upload_stager=LanceDBUploadStager,
+)