unstructured-ingest 1.2.32__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/README.md +28 -0
- unstructured_ingest/cli/__init__.py +0 -0
- unstructured_ingest/cli/base/__init__.py +4 -0
- unstructured_ingest/cli/base/cmd.py +269 -0
- unstructured_ingest/cli/base/dest.py +84 -0
- unstructured_ingest/cli/base/importer.py +34 -0
- unstructured_ingest/cli/base/src.py +75 -0
- unstructured_ingest/cli/cli.py +24 -0
- unstructured_ingest/cli/cmds.py +14 -0
- unstructured_ingest/cli/utils/__init__.py +0 -0
- unstructured_ingest/cli/utils/click.py +237 -0
- unstructured_ingest/cli/utils/model_conversion.py +222 -0
- unstructured_ingest/data_types/__init__.py +0 -0
- unstructured_ingest/data_types/entities.py +17 -0
- unstructured_ingest/data_types/file_data.py +116 -0
- unstructured_ingest/embed/__init__.py +0 -0
- unstructured_ingest/embed/azure_openai.py +63 -0
- unstructured_ingest/embed/bedrock.py +323 -0
- unstructured_ingest/embed/huggingface.py +69 -0
- unstructured_ingest/embed/interfaces.py +146 -0
- unstructured_ingest/embed/mixedbreadai.py +134 -0
- unstructured_ingest/embed/octoai.py +133 -0
- unstructured_ingest/embed/openai.py +142 -0
- unstructured_ingest/embed/togetherai.py +116 -0
- unstructured_ingest/embed/vertexai.py +109 -0
- unstructured_ingest/embed/voyageai.py +130 -0
- unstructured_ingest/error.py +156 -0
- unstructured_ingest/errors_v2.py +156 -0
- unstructured_ingest/interfaces/__init__.py +27 -0
- unstructured_ingest/interfaces/connector.py +56 -0
- unstructured_ingest/interfaces/downloader.py +90 -0
- unstructured_ingest/interfaces/indexer.py +29 -0
- unstructured_ingest/interfaces/process.py +22 -0
- unstructured_ingest/interfaces/processor.py +88 -0
- unstructured_ingest/interfaces/upload_stager.py +89 -0
- unstructured_ingest/interfaces/uploader.py +67 -0
- unstructured_ingest/logger.py +39 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/otel.py +128 -0
- unstructured_ingest/pipeline/__init__.py +0 -0
- unstructured_ingest/pipeline/interfaces.py +211 -0
- unstructured_ingest/pipeline/otel.py +32 -0
- unstructured_ingest/pipeline/pipeline.py +408 -0
- unstructured_ingest/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/pipeline/steps/chunk.py +78 -0
- unstructured_ingest/pipeline/steps/download.py +206 -0
- unstructured_ingest/pipeline/steps/embed.py +77 -0
- unstructured_ingest/pipeline/steps/filter.py +35 -0
- unstructured_ingest/pipeline/steps/index.py +86 -0
- unstructured_ingest/pipeline/steps/partition.py +77 -0
- unstructured_ingest/pipeline/steps/stage.py +65 -0
- unstructured_ingest/pipeline/steps/uncompress.py +50 -0
- unstructured_ingest/pipeline/steps/upload.py +58 -0
- unstructured_ingest/processes/__init__.py +18 -0
- unstructured_ingest/processes/chunker.py +131 -0
- unstructured_ingest/processes/connector_registry.py +69 -0
- unstructured_ingest/processes/connectors/__init__.py +129 -0
- unstructured_ingest/processes/connectors/airtable.py +238 -0
- unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
- unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
- unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
- unstructured_ingest/processes/connectors/astradb.py +592 -0
- unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
- unstructured_ingest/processes/connectors/chroma.py +193 -0
- unstructured_ingest/processes/connectors/confluence.py +527 -0
- unstructured_ingest/processes/connectors/couchbase.py +336 -0
- unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
- unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
- unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
- unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
- unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
- unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
- unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
- unstructured_ingest/processes/connectors/delta_table.py +310 -0
- unstructured_ingest/processes/connectors/discord.py +161 -0
- unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
- unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
- unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
- unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
- unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
- unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
- unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
- unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
- unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
- unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
- unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
- unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
- unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
- unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
- unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/processes/connectors/github.py +226 -0
- unstructured_ingest/processes/connectors/gitlab.py +270 -0
- unstructured_ingest/processes/connectors/google_drive.py +848 -0
- unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
- unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
- unstructured_ingest/processes/connectors/jira.py +522 -0
- unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
- unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
- unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
- unstructured_ingest/processes/connectors/kafka/local.py +103 -0
- unstructured_ingest/processes/connectors/kdbai.py +156 -0
- unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
- unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
- unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
- unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
- unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
- unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
- unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
- unstructured_ingest/processes/connectors/local.py +227 -0
- unstructured_ingest/processes/connectors/milvus.py +311 -0
- unstructured_ingest/processes/connectors/mongodb.py +389 -0
- unstructured_ingest/processes/connectors/neo4j.py +534 -0
- unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
- unstructured_ingest/processes/connectors/notion/client.py +349 -0
- unstructured_ingest/processes/connectors/notion/connector.py +350 -0
- unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
- unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
- unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
- unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
- unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
- unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
- unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
- unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
- unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
- unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
- unstructured_ingest/processes/connectors/onedrive.py +485 -0
- unstructured_ingest/processes/connectors/outlook.py +242 -0
- unstructured_ingest/processes/connectors/pinecone.py +400 -0
- unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
- unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
- unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
- unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
- unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
- unstructured_ingest/processes/connectors/redisdb.py +214 -0
- unstructured_ingest/processes/connectors/salesforce.py +307 -0
- unstructured_ingest/processes/connectors/sharepoint.py +282 -0
- unstructured_ingest/processes/connectors/slack.py +249 -0
- unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
- unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
- unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
- unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
- unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
- unstructured_ingest/processes/connectors/sql/sql.py +456 -0
- unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
- unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
- unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
- unstructured_ingest/processes/connectors/utils.py +60 -0
- unstructured_ingest/processes/connectors/vectara.py +348 -0
- unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
- unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
- unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
- unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
- unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
- unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
- unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
- unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
- unstructured_ingest/processes/embedder.py +203 -0
- unstructured_ingest/processes/filter.py +60 -0
- unstructured_ingest/processes/partitioner.py +233 -0
- unstructured_ingest/processes/uncompress.py +61 -0
- unstructured_ingest/processes/utils/__init__.py +8 -0
- unstructured_ingest/processes/utils/blob_storage.py +32 -0
- unstructured_ingest/processes/utils/logging/connector.py +365 -0
- unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
- unstructured_ingest/unstructured_api.py +140 -0
- unstructured_ingest/utils/__init__.py +5 -0
- unstructured_ingest/utils/chunking.py +56 -0
- unstructured_ingest/utils/compression.py +72 -0
- unstructured_ingest/utils/constants.py +2 -0
- unstructured_ingest/utils/data_prep.py +216 -0
- unstructured_ingest/utils/dep_check.py +78 -0
- unstructured_ingest/utils/filesystem.py +27 -0
- unstructured_ingest/utils/html.py +174 -0
- unstructured_ingest/utils/ndjson.py +52 -0
- unstructured_ingest/utils/pydantic_models.py +52 -0
- unstructured_ingest/utils/string_and_date_utils.py +74 -0
- unstructured_ingest/utils/table.py +80 -0
- unstructured_ingest/utils/tls.py +15 -0
- unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
- unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
- unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
- unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
- unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
from contextlib import contextmanager
|
|
2
|
+
from dataclasses import dataclass, field
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import TYPE_CHECKING, Any, Generator, Optional
|
|
5
|
+
|
|
6
|
+
from pydantic import Field, Secret
|
|
7
|
+
|
|
8
|
+
from unstructured_ingest.data_types.file_data import FileData
|
|
9
|
+
from unstructured_ingest.error import DestinationConnectionError, ValueError
|
|
10
|
+
from unstructured_ingest.interfaces import (
|
|
11
|
+
AccessConfig,
|
|
12
|
+
ConnectionConfig,
|
|
13
|
+
Uploader,
|
|
14
|
+
UploaderConfig,
|
|
15
|
+
UploadStagerConfig,
|
|
16
|
+
)
|
|
17
|
+
from unstructured_ingest.logger import logger
|
|
18
|
+
from unstructured_ingest.processes.connector_registry import DestinationRegistryEntry
|
|
19
|
+
from unstructured_ingest.processes.connectors.duckdb.base import BaseDuckDBUploadStager
|
|
20
|
+
from unstructured_ingest.utils.data_prep import get_data_df
|
|
21
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
22
|
+
|
|
23
|
+
# These names are only needed for annotations, so the optional `duckdb` and
# `pandas` packages are not imported at runtime by this guard.
if TYPE_CHECKING:
    from duckdb import DuckDBPyConnection as DuckDBConnection
    from pandas import DataFrame

# Identifier shared by the connection config and the uploader defined below.
CONNECTOR_TYPE = "duckdb"
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class DuckDBAccessConfig(AccessConfig):
    """Access settings for the DuckDB connector; declares no fields of its own."""
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class DuckDBConnectionConfig(ConnectionConfig):
    """Connection settings for a DuckDB database file.

    `database` is the path handed to `duckdb.connect`; `db_schema` and `table`
    name the schema and table configured for this connection.

    Raises:
        ValueError: when `database` is not set. Note that `ValueError` here is
            the class this module imports from `unstructured_ingest.error`,
            not the builtin.
    """

    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
    database: Optional[str] = Field(
        default=None,
        description="Database name. Path to the DuckDB .db file. If the file does "
        "not exist, it will be created at the specified path.",
    )
    db_schema: Optional[str] = Field(
        default="main",
        description="Schema name. Schema in the database where the elements table is located.",
    )
    table: Optional[str] = Field(
        default="elements",
        description="Table name. Table name into which the elements data is inserted.",
    )
    access_config: Secret[DuckDBAccessConfig] = Field(
        default=DuckDBAccessConfig(), validate_default=True
    )

    def model_post_init(self, context: Any, /) -> None:
        """Run the `database` check once the model has been built.

        `__post_init__` is a dataclass hook that pydantic models do not call,
        so without this bridge the check below never executes.
        NOTE(review): assumes ConnectionConfig is a pydantic BaseModel - confirm.
        """
        super().model_post_init(context)
        self.__post_init__()

    def __post_init__(self):
        """Reject a configuration that does not name a database file."""
        if self.database is None:
            raise ValueError(
                "A DuckDB connection requires a path to a *.db or *.duckdb file "
                "through the `database` argument"
            )

    @requires_dependencies(["duckdb"], extras="duckdb")
    @contextmanager
    def get_client(self) -> Generator["DuckDBConnection", None, None]:
        """Yield a connection opened on `self.database`, closed on exit."""
        # Imported lazily: `duckdb` is an optional extra.
        import duckdb

        with duckdb.connect(self.database) as client:
            yield client

    @contextmanager
    def get_cursor(self) -> Generator["DuckDBConnection", None, None]:
        """Yield a cursor from a fresh connection; both are closed on exit."""
        with self.get_client() as client, client.cursor() as cursor:
            yield cursor
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
class DuckDBUploadStagerConfig(UploadStagerConfig):
    """Upload-stager settings for DuckDB; declares no fields of its own."""
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
@dataclass
class DuckDBUploadStager(BaseDuckDBUploadStager):
    """Upload stager for DuckDB; behaviour comes from `BaseDuckDBUploadStager`."""

    # A fresh default config is built for every stager instance.
    upload_stager_config: DuckDBUploadStagerConfig = field(
        default_factory=DuckDBUploadStagerConfig
    )
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
class DuckDBUploaderConfig(UploaderConfig):
    """Uploader settings for DuckDB."""

    # The field's own description marks it as not used.
    batch_size: int = Field(
        default=50,
        description="[Not-used] Number of records per batch",
    )
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
@dataclass
class DuckDBUploader(Uploader):
    """Uploader that inserts element records into a DuckDB table."""

    connector_type: str = CONNECTOR_TYPE
    upload_config: DuckDBUploaderConfig
    connection_config: DuckDBConnectionConfig

    @staticmethod
    def _quote_identifier(name: Any) -> str:
        """Return `name` as a double-quoted SQL identifier, doubling embedded quotes."""
        return '"' + str(name).replace('"', '""') + '"'

    def precheck(self) -> None:
        """Verify the database is reachable by running a trivial query.

        Raises:
            DestinationConnectionError: if connecting or querying fails; the
                original exception is attached as the cause.
        """
        try:
            with self.connection_config.get_cursor() as cursor:
                cursor.execute("SELECT 1;")
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}") from e

    def upload_dataframe(self, df: "DataFrame") -> None:
        """Insert every row of `df` into the configured schema and table.

        Columns are matched by name (`BY NAME`).
        """
        logger.debug(f"uploading {len(df)} entries to {self.connection_config.database} ")

        # Quote the identifiers so reserved words or special characters in the
        # configured names cannot break (or alter) the statement.
        schema_name = self._quote_identifier(self.connection_config.db_schema)
        table_name = self._quote_identifier(self.connection_config.table)

        with self.connection_config.get_client() as conn:
            # `df` inside the SQL text is resolved by DuckDB to the local
            # DataFrame of the same name, so the parameter must stay named `df`.
            conn.query(f"INSERT INTO {schema_name}.{table_name} BY NAME SELECT * FROM df")

    @requires_dependencies(["pandas"], extras="duckdb")
    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
        """Build a DataFrame from in-memory records and upload it."""
        import pandas as pd

        df = pd.DataFrame(data=data)
        self.upload_dataframe(df=df)

    @requires_dependencies(["pandas"], extras="duckdb")
    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        """Load the staged file at `path` as a DataFrame and upload it."""
        df = get_data_df(path)
        self.upload_dataframe(df=df)
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
# Bundles the DuckDB classes defined in this module into one destination entry.
duckdb_destination_entry = DestinationRegistryEntry(
    connection_config=DuckDBConnectionConfig,
    upload_stager_config=DuckDBUploadStagerConfig,
    upload_stager=DuckDBUploadStager,
    uploader_config=DuckDBUploaderConfig,
    uploader=DuckDBUploader,
)
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
from contextlib import contextmanager
|
|
2
|
+
from dataclasses import dataclass, field
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import TYPE_CHECKING, Any, Generator, Optional
|
|
5
|
+
|
|
6
|
+
from pydantic import Field, Secret
|
|
7
|
+
|
|
8
|
+
from unstructured_ingest.__version__ import __version__ as unstructured_io_ingest_version
|
|
9
|
+
from unstructured_ingest.data_types.file_data import FileData
|
|
10
|
+
from unstructured_ingest.error import DestinationConnectionError
|
|
11
|
+
from unstructured_ingest.interfaces import (
|
|
12
|
+
AccessConfig,
|
|
13
|
+
ConnectionConfig,
|
|
14
|
+
Uploader,
|
|
15
|
+
UploaderConfig,
|
|
16
|
+
UploadStagerConfig,
|
|
17
|
+
)
|
|
18
|
+
from unstructured_ingest.logger import logger
|
|
19
|
+
from unstructured_ingest.processes.connector_registry import DestinationRegistryEntry
|
|
20
|
+
from unstructured_ingest.processes.connectors.duckdb.base import BaseDuckDBUploadStager
|
|
21
|
+
from unstructured_ingest.utils.data_prep import get_data_df
|
|
22
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
23
|
+
|
|
24
|
+
# These names are only needed for annotations, so the optional `duckdb` and
# `pandas` packages are not imported at runtime by this guard.
if TYPE_CHECKING:
    from duckdb import DuckDBPyConnection as MotherDuckConnection
    from pandas import DataFrame

# Identifier shared by the connection config and the uploader defined below.
CONNECTOR_TYPE = "motherduck"
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class MotherDuckAccessConfig(AccessConfig):
    """Access settings for MotherDuck: the authentication token."""

    # The default is None, so the annotation must admit None as well.
    md_token: Optional[str] = Field(default=None, description="MotherDuck token")
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class MotherDuckConnectionConfig(ConnectionConfig):
    """Connection settings for a MotherDuck database.

    `database` is the database selected with `USE` after connecting;
    `db_schema` and `table` name the schema and table configured for this
    connection.
    """

    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
    database: str = Field(
        description="Database name. Name of the MotherDuck database.",
    )
    db_schema: Optional[str] = Field(
        default="main",
        description="Schema name. Schema in the database where the elements table is located.",
    )
    table: Optional[str] = Field(
        default="elements",
        description="Table name. Table name into which the elements data is inserted.",
    )
    access_config: Secret[MotherDuckAccessConfig] = Field(
        default=MotherDuckAccessConfig(), validate_default=True
    )

    @requires_dependencies(["duckdb"], extras="duckdb")
    @contextmanager
    def get_client(self) -> Generator["MotherDuckConnection", None, None]:
        """Yield a MotherDuck connection with `self.database` selected.

        Raises:
            ValueError: if no MotherDuck token is configured.
        """
        # Imported lazily: `duckdb` is an optional extra.
        import duckdb

        access_config = self.access_config.get_secret_value()
        if access_config.md_token is None:
            # Fail early instead of sending the literal text "None" as the token.
            raise ValueError(
                "A MotherDuck connection requires a token through the `md_token` argument"
            )

        # Double any embedded quote so the name stays a single quoted identifier.
        escaped_database = self.database.replace('"', '""')
        user_agent = f"unstructured-io-ingest/{unstructured_io_ingest_version}"
        with duckdb.connect(
            f"md:?motherduck_token={access_config.md_token}",
            config={"custom_user_agent": user_agent},
        ) as conn:
            conn.sql(f'USE "{escaped_database}"')
            yield conn

    @contextmanager
    def get_cursor(self) -> Generator["MotherDuckConnection", None, None]:
        """Yield a cursor from a fresh connection; both are closed on exit."""
        with self.get_client() as client, client.cursor() as cursor:
            yield cursor
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
class MotherDuckUploadStagerConfig(UploadStagerConfig):
    """Upload-stager settings for MotherDuck; declares no fields of its own."""
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
@dataclass
class MotherDuckUploadStager(BaseDuckDBUploadStager):
    """Upload stager for MotherDuck; behaviour comes from `BaseDuckDBUploadStager`."""

    # A fresh default config is built for every stager instance.
    upload_stager_config: MotherDuckUploadStagerConfig = field(
        default_factory=MotherDuckUploadStagerConfig
    )
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
class MotherDuckUploaderConfig(UploaderConfig):
    """Uploader settings for MotherDuck."""

    # The field's own description marks it as not used.
    batch_size: int = Field(
        default=50,
        description="[Not-used] Number of records per batch",
    )
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
@dataclass
class MotherDuckUploader(Uploader):
    """Uploader that inserts element records into a MotherDuck table."""

    connector_type: str = CONNECTOR_TYPE
    upload_config: MotherDuckUploaderConfig
    connection_config: MotherDuckConnectionConfig

    @staticmethod
    def _quote_identifier(name: Any) -> str:
        """Return `name` as a double-quoted SQL identifier, doubling embedded quotes."""
        return '"' + str(name).replace('"', '""') + '"'

    def precheck(self) -> None:
        """Verify the database is reachable by running a trivial query.

        Raises:
            DestinationConnectionError: if connecting or querying fails; the
                original exception is attached as the cause.
        """
        try:
            with self.connection_config.get_cursor() as cursor:
                cursor.execute("SELECT 1;")
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}") from e

    def upload_dataframe(self, df: "DataFrame") -> None:
        """Insert every row of `df` into the configured database, schema and table.

        Columns are matched by name (`BY NAME`).
        """
        logger.debug(f"uploading {len(df)} entries to {self.connection_config.database} ")

        # Same quoting as before, but embedded double quotes are now doubled
        # so a name can never terminate its own identifier.
        database = self._quote_identifier(self.connection_config.database)
        db_schema = self._quote_identifier(self.connection_config.db_schema)
        table = self._quote_identifier(self.connection_config.table)

        with self.connection_config.get_client() as conn:
            # `df` inside the SQL text is resolved by DuckDB to the local
            # DataFrame of the same name, so the parameter must stay named `df`.
            conn.query(f"INSERT INTO {database}.{db_schema}.{table} BY NAME SELECT * FROM df")

    @requires_dependencies(["pandas"], extras="duckdb")
    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
        """Build a DataFrame from in-memory records and upload it."""
        import pandas as pd

        df = pd.DataFrame(data=data)
        self.upload_dataframe(df=df)

    @requires_dependencies(["pandas"], extras="duckdb")
    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        """Load the staged file at `path` as a DataFrame and upload it."""
        df = get_data_df(path)
        self.upload_dataframe(df=df)
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
# Bundles the MotherDuck classes defined in this module into one destination entry.
motherduck_destination_entry = DestinationRegistryEntry(
    connection_config=MotherDuckConnectionConfig,
    upload_stager_config=MotherDuckUploadStagerConfig,
    upload_stager=MotherDuckUploadStager,
    uploader_config=MotherDuckUploaderConfig,
    uploader=MotherDuckUploader,
)
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
from unstructured_ingest.processes.connector_registry import (
|
|
2
|
+
add_destination_entry,
|
|
3
|
+
add_source_entry,
|
|
4
|
+
)
|
|
5
|
+
|
|
6
|
+
from .elasticsearch import CONNECTOR_TYPE as ELASTICSEARCH_CONNECTOR_TYPE
|
|
7
|
+
from .elasticsearch import elasticsearch_destination_entry, elasticsearch_source_entry
|
|
8
|
+
from .opensearch import CONNECTOR_TYPE as OPENSEARCH_CONNECTOR_TYPE
|
|
9
|
+
from .opensearch import opensearch_destination_entry, opensearch_source_entry
|
|
10
|
+
|
|
11
|
+
# Register the Elasticsearch connector as both a source and a destination.
add_source_entry(
    source_type=ELASTICSEARCH_CONNECTOR_TYPE,
    entry=elasticsearch_source_entry,
)
add_destination_entry(
    destination_type=ELASTICSEARCH_CONNECTOR_TYPE,
    entry=elasticsearch_destination_entry,
)

# Register the OpenSearch connector as both a source and a destination.
add_source_entry(
    source_type=OPENSEARCH_CONNECTOR_TYPE,
    entry=opensearch_source_entry,
)
add_destination_entry(
    destination_type=OPENSEARCH_CONNECTOR_TYPE,
    entry=opensearch_destination_entry,
)
|