unstructured-ingest 1.2.32__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/README.md +28 -0
- unstructured_ingest/cli/__init__.py +0 -0
- unstructured_ingest/cli/base/__init__.py +4 -0
- unstructured_ingest/cli/base/cmd.py +269 -0
- unstructured_ingest/cli/base/dest.py +84 -0
- unstructured_ingest/cli/base/importer.py +34 -0
- unstructured_ingest/cli/base/src.py +75 -0
- unstructured_ingest/cli/cli.py +24 -0
- unstructured_ingest/cli/cmds.py +14 -0
- unstructured_ingest/cli/utils/__init__.py +0 -0
- unstructured_ingest/cli/utils/click.py +237 -0
- unstructured_ingest/cli/utils/model_conversion.py +222 -0
- unstructured_ingest/data_types/__init__.py +0 -0
- unstructured_ingest/data_types/entities.py +17 -0
- unstructured_ingest/data_types/file_data.py +116 -0
- unstructured_ingest/embed/__init__.py +0 -0
- unstructured_ingest/embed/azure_openai.py +63 -0
- unstructured_ingest/embed/bedrock.py +323 -0
- unstructured_ingest/embed/huggingface.py +69 -0
- unstructured_ingest/embed/interfaces.py +146 -0
- unstructured_ingest/embed/mixedbreadai.py +134 -0
- unstructured_ingest/embed/octoai.py +133 -0
- unstructured_ingest/embed/openai.py +142 -0
- unstructured_ingest/embed/togetherai.py +116 -0
- unstructured_ingest/embed/vertexai.py +109 -0
- unstructured_ingest/embed/voyageai.py +130 -0
- unstructured_ingest/error.py +156 -0
- unstructured_ingest/errors_v2.py +156 -0
- unstructured_ingest/interfaces/__init__.py +27 -0
- unstructured_ingest/interfaces/connector.py +56 -0
- unstructured_ingest/interfaces/downloader.py +90 -0
- unstructured_ingest/interfaces/indexer.py +29 -0
- unstructured_ingest/interfaces/process.py +22 -0
- unstructured_ingest/interfaces/processor.py +88 -0
- unstructured_ingest/interfaces/upload_stager.py +89 -0
- unstructured_ingest/interfaces/uploader.py +67 -0
- unstructured_ingest/logger.py +39 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/otel.py +128 -0
- unstructured_ingest/pipeline/__init__.py +0 -0
- unstructured_ingest/pipeline/interfaces.py +211 -0
- unstructured_ingest/pipeline/otel.py +32 -0
- unstructured_ingest/pipeline/pipeline.py +408 -0
- unstructured_ingest/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/pipeline/steps/chunk.py +78 -0
- unstructured_ingest/pipeline/steps/download.py +206 -0
- unstructured_ingest/pipeline/steps/embed.py +77 -0
- unstructured_ingest/pipeline/steps/filter.py +35 -0
- unstructured_ingest/pipeline/steps/index.py +86 -0
- unstructured_ingest/pipeline/steps/partition.py +77 -0
- unstructured_ingest/pipeline/steps/stage.py +65 -0
- unstructured_ingest/pipeline/steps/uncompress.py +50 -0
- unstructured_ingest/pipeline/steps/upload.py +58 -0
- unstructured_ingest/processes/__init__.py +18 -0
- unstructured_ingest/processes/chunker.py +131 -0
- unstructured_ingest/processes/connector_registry.py +69 -0
- unstructured_ingest/processes/connectors/__init__.py +129 -0
- unstructured_ingest/processes/connectors/airtable.py +238 -0
- unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
- unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
- unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
- unstructured_ingest/processes/connectors/astradb.py +592 -0
- unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
- unstructured_ingest/processes/connectors/chroma.py +193 -0
- unstructured_ingest/processes/connectors/confluence.py +527 -0
- unstructured_ingest/processes/connectors/couchbase.py +336 -0
- unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
- unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
- unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
- unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
- unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
- unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
- unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
- unstructured_ingest/processes/connectors/delta_table.py +310 -0
- unstructured_ingest/processes/connectors/discord.py +161 -0
- unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
- unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
- unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
- unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
- unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
- unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
- unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
- unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
- unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
- unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
- unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
- unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
- unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
- unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
- unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/processes/connectors/github.py +226 -0
- unstructured_ingest/processes/connectors/gitlab.py +270 -0
- unstructured_ingest/processes/connectors/google_drive.py +848 -0
- unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
- unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
- unstructured_ingest/processes/connectors/jira.py +522 -0
- unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
- unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
- unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
- unstructured_ingest/processes/connectors/kafka/local.py +103 -0
- unstructured_ingest/processes/connectors/kdbai.py +156 -0
- unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
- unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
- unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
- unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
- unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
- unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
- unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
- unstructured_ingest/processes/connectors/local.py +227 -0
- unstructured_ingest/processes/connectors/milvus.py +311 -0
- unstructured_ingest/processes/connectors/mongodb.py +389 -0
- unstructured_ingest/processes/connectors/neo4j.py +534 -0
- unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
- unstructured_ingest/processes/connectors/notion/client.py +349 -0
- unstructured_ingest/processes/connectors/notion/connector.py +350 -0
- unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
- unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
- unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
- unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
- unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
- unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
- unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
- unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
- unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
- unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
- unstructured_ingest/processes/connectors/onedrive.py +485 -0
- unstructured_ingest/processes/connectors/outlook.py +242 -0
- unstructured_ingest/processes/connectors/pinecone.py +400 -0
- unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
- unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
- unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
- unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
- unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
- unstructured_ingest/processes/connectors/redisdb.py +214 -0
- unstructured_ingest/processes/connectors/salesforce.py +307 -0
- unstructured_ingest/processes/connectors/sharepoint.py +282 -0
- unstructured_ingest/processes/connectors/slack.py +249 -0
- unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
- unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
- unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
- unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
- unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
- unstructured_ingest/processes/connectors/sql/sql.py +456 -0
- unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
- unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
- unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
- unstructured_ingest/processes/connectors/utils.py +60 -0
- unstructured_ingest/processes/connectors/vectara.py +348 -0
- unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
- unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
- unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
- unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
- unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
- unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
- unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
- unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
- unstructured_ingest/processes/embedder.py +203 -0
- unstructured_ingest/processes/filter.py +60 -0
- unstructured_ingest/processes/partitioner.py +233 -0
- unstructured_ingest/processes/uncompress.py +61 -0
- unstructured_ingest/processes/utils/__init__.py +8 -0
- unstructured_ingest/processes/utils/blob_storage.py +32 -0
- unstructured_ingest/processes/utils/logging/connector.py +365 -0
- unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
- unstructured_ingest/unstructured_api.py +140 -0
- unstructured_ingest/utils/__init__.py +5 -0
- unstructured_ingest/utils/chunking.py +56 -0
- unstructured_ingest/utils/compression.py +72 -0
- unstructured_ingest/utils/constants.py +2 -0
- unstructured_ingest/utils/data_prep.py +216 -0
- unstructured_ingest/utils/dep_check.py +78 -0
- unstructured_ingest/utils/filesystem.py +27 -0
- unstructured_ingest/utils/html.py +174 -0
- unstructured_ingest/utils/ndjson.py +52 -0
- unstructured_ingest/utils/pydantic_models.py +52 -0
- unstructured_ingest/utils/string_and_date_utils.py +74 -0
- unstructured_ingest/utils/table.py +80 -0
- unstructured_ingest/utils/tls.py +15 -0
- unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
- unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
- unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
- unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
- unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from contextlib import contextmanager
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import TYPE_CHECKING, Any, Generator, Optional
|
|
6
|
+
|
|
7
|
+
from pydantic import Field, Secret
|
|
8
|
+
|
|
9
|
+
from unstructured_ingest.data_types.file_data import FileData
|
|
10
|
+
from unstructured_ingest.logger import logger
|
|
11
|
+
from unstructured_ingest.processes.connector_registry import (
|
|
12
|
+
DestinationRegistryEntry,
|
|
13
|
+
SourceRegistryEntry,
|
|
14
|
+
)
|
|
15
|
+
from unstructured_ingest.processes.connectors.sql.sql import (
|
|
16
|
+
_DATE_COLUMNS,
|
|
17
|
+
SQLAccessConfig,
|
|
18
|
+
SqlBatchFileData,
|
|
19
|
+
SQLConnectionConfig,
|
|
20
|
+
SQLDownloader,
|
|
21
|
+
SQLDownloaderConfig,
|
|
22
|
+
SQLIndexer,
|
|
23
|
+
SQLIndexerConfig,
|
|
24
|
+
SQLUploader,
|
|
25
|
+
SQLUploaderConfig,
|
|
26
|
+
SQLUploadStager,
|
|
27
|
+
SQLUploadStagerConfig,
|
|
28
|
+
parse_date_string,
|
|
29
|
+
)
|
|
30
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
31
|
+
|
|
32
|
+
if TYPE_CHECKING:
|
|
33
|
+
from singlestoredb.connection import Connection as SingleStoreConnection
|
|
34
|
+
from singlestoredb.connection import Cursor as SingleStoreCursor
|
|
35
|
+
|
|
36
|
+
# Connector type string; assigned to `connector_type` on the SingleStore
# indexer, downloader and uploader defined in this module.
CONNECTOR_TYPE = "singlestore"
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
# Sensitive SingleStore settings. Only an optional password is declared here;
# anything else comes from SQLAccessConfig.
class SingleStoreAccessConfig(SQLAccessConfig):
    password: Optional[str] = Field(default=None, description="SingleStore password")
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
# Connection settings for a SingleStore database. The password is read from
# the secret-wrapped access config; host, port, user and database are optional
# and are passed to the driver as given (including None).
class SingleStoreConnectionConfig(SQLConnectionConfig):
    access_config: Secret[SingleStoreAccessConfig]
    host: Optional[str] = Field(default=None, description="SingleStore host")
    port: Optional[int] = Field(default=None, description="SingleStore port")
    user: Optional[str] = Field(default=None, description="SingleStore user")
    database: Optional[str] = Field(default=None, description="SingleStore database")

    @contextmanager
    @requires_dependencies(["singlestoredb"], extras="singlestore")
    def get_connection(self) -> Generator["SingleStoreConnection", None, None]:
        """Open a SingleStore connection, yield it, then commit and close it.

        The commit runs whether or not the caller's block raised. The
        connection is closed even when that commit itself raises.

        Yields:
            The open connection returned by ``singlestoredb.connect``.
        """
        import singlestoredb as s2

        connection = s2.connect(
            host=self.host,
            port=self.port,
            database=self.database,
            user=self.user,
            password=self.access_config.get_secret_value().password,
        )
        try:
            yield connection
        finally:
            # Nested so that a failing commit cannot leave the connection open.
            try:
                connection.commit()
            finally:
                connection.close()

    @contextmanager
    def get_cursor(self) -> Generator["SingleStoreCursor", None, None]:
        """Yield a cursor on a fresh connection from ``get_connection``.

        The cursor is closed when the caller's block exits; the connection is
        then committed and closed by ``get_connection``.
        """
        with self.get_connection() as connection, connection.cursor() as cursor:
            try:
                yield cursor
            finally:
                cursor.close()
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
# SingleStore indexer configuration; declares nothing beyond SQLIndexerConfig.
class SingleStoreIndexerConfig(SQLIndexerConfig):
    pass
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
# SQLIndexer bound to the SingleStore connection and indexer config types.
# No methods are overridden; only the field types and connector type are set.
@dataclass
class SingleStoreIndexer(SQLIndexer):
    connection_config: SingleStoreConnectionConfig
    index_config: SingleStoreIndexerConfig
    connector_type: str = CONNECTOR_TYPE
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
# SingleStore downloader configuration; declares nothing beyond SQLDownloaderConfig.
class SingleStoreDownloaderConfig(SQLDownloaderConfig):
    pass
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
@dataclass
class SingleStoreDownloader(SQLDownloader):
    """SQLDownloader that fetches a batch of rows from SingleStore by id."""

    connection_config: SingleStoreConnectionConfig
    download_config: SingleStoreDownloaderConfig
    connector_type: str = CONNECTOR_TYPE
    # Parameter placeholder used when building the IN clause.
    values_delimiter: str = "%s"

    def query_db(self, file_data: SqlBatchFileData) -> tuple[list[tuple], list[str]]:
        """Select the rows whose id column matches the identifiers in the batch.

        Args:
            file_data: Batch descriptor; its ``additional_metadata`` supplies the
                table name and id column, and ``batch_items`` the identifiers.

        Returns:
            A ``(rows, columns)`` pair: the fetched rows and the column names
            taken from the cursor description.
        """
        table_name = file_data.additional_metadata.table_name
        id_column = file_data.additional_metadata.id_column
        ids = tuple(item.identifier for item in file_data.batch_items)
        fields = ",".join(self.download_config.fields) if self.download_config.fields else "*"
        query = f"SELECT {fields} FROM {table_name} WHERE {id_column} IN {self.values_delimiter}"
        # get_cursor closes the cursor as well as the connection on exit.
        with self.connection_config.get_cursor() as cursor:
            logger.debug(f"running query: {query}\nwith values: {(ids,)}")
            # The whole tuple of ids is bound to the single placeholder;
            # presumably the driver expands it to a parenthesised list -- TODO confirm.
            cursor.execute(query, (ids,))
            rows = cursor.fetchall()
            columns = [col[0] for col in cursor.description]
            return rows, columns
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
# SingleStore upload stager configuration; declares nothing beyond SQLUploadStagerConfig.
class SingleStoreUploadStagerConfig(SQLUploadStagerConfig):
    pass
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
# SQLUploadStager bound to the SingleStore stager config type.
# NOTE(review): unlike the indexer, downloader and uploader in this module,
# this class is not decorated with @dataclass -- confirm that is intended.
class SingleStoreUploadStager(SQLUploadStager):
    upload_stager_config: SingleStoreUploadStagerConfig
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
# SingleStore uploader configuration; declares nothing beyond SQLUploaderConfig.
class SingleStoreUploaderConfig(SQLUploaderConfig):
    pass
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
@dataclass
class SingleStoreUploader(SQLUploader):
    """SQLUploader for SingleStore with its own value conversion for rows."""

    upload_config: SingleStoreUploaderConfig = field(default_factory=SingleStoreUploaderConfig)
    connection_config: SingleStoreConnectionConfig
    values_delimiter: str = "%s"
    connector_type: str = CONNECTOR_TYPE

    @requires_dependencies(["pandas"], extras="singlestore")
    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        """Forward to the parent ``run`` with the same arguments."""
        super().run(path=path, file_data=file_data, **kwargs)

    @requires_dependencies(["pandas"], extras="singlestore")
    def prepare_data(
        self, columns: list[str], data: tuple[tuple[Any, ...], ...]
    ) -> list[tuple[Any, ...]]:
        """Convert each row's values into the form written to SingleStore.

        Lists and dicts are serialised with ``json.dumps``. For columns named
        in ``_DATE_COLUMNS``, ``None``/NaN becomes ``None`` and anything else
        goes through ``parse_date_string``. Other values pass through as-is.

        Args:
            columns: Column names, positionally aligned with each row.
            data: Rows of raw values.

        Returns:
            One tuple of converted values per input row.
        """
        import pandas as pd

        def _convert(name: str, cell: Any) -> Any:
            # JSON-encode containers first, so a date column holding a
            # list/dict is checked as its serialised string below.
            if isinstance(cell, (list, dict)):
                cell = json.dumps(cell)
            if name not in _DATE_COLUMNS:
                return cell
            if cell is None or pd.isna(cell):
                return None
            return parse_date_string(cell)

        return [
            tuple(_convert(name, cell) for name, cell in zip(columns, row)) for row in data
        ]
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
# Source-side registry entry: the SingleStore indexer and downloader, each with
# its config class, plus the shared connection config.
singlestore_source_entry = SourceRegistryEntry(
    indexer=SingleStoreIndexer,
    indexer_config=SingleStoreIndexerConfig,
    downloader=SingleStoreDownloader,
    downloader_config=SingleStoreDownloaderConfig,
    connection_config=SingleStoreConnectionConfig,
)

# Destination-side registry entry: the SingleStore upload stager and uploader,
# each with its config class, plus the shared connection config.
singlestore_destination_entry = DestinationRegistryEntry(
    upload_stager=SingleStoreUploadStager,
    upload_stager_config=SingleStoreUploadStagerConfig,
    uploader=SingleStoreUploader,
    uploader_config=SingleStoreUploaderConfig,
    connection_config=SingleStoreConnectionConfig,
)
|
|
@@ -0,0 +1,298 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from contextlib import contextmanager
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import TYPE_CHECKING, Any, Generator, Optional
|
|
6
|
+
|
|
7
|
+
from pydantic import Field, Secret
|
|
8
|
+
|
|
9
|
+
from unstructured_ingest.data_types.file_data import FileData
|
|
10
|
+
from unstructured_ingest.logger import logger
|
|
11
|
+
from unstructured_ingest.processes.connector_registry import (
|
|
12
|
+
DestinationRegistryEntry,
|
|
13
|
+
SourceRegistryEntry,
|
|
14
|
+
)
|
|
15
|
+
from unstructured_ingest.processes.connectors.sql.sql import (
|
|
16
|
+
_DATE_COLUMNS,
|
|
17
|
+
SQLAccessConfig,
|
|
18
|
+
SqlBatchFileData,
|
|
19
|
+
SQLConnectionConfig,
|
|
20
|
+
SQLDownloader,
|
|
21
|
+
SQLDownloaderConfig,
|
|
22
|
+
SQLIndexer,
|
|
23
|
+
SQLIndexerConfig,
|
|
24
|
+
SQLUploader,
|
|
25
|
+
SQLUploaderConfig,
|
|
26
|
+
SQLUploadStager,
|
|
27
|
+
SQLUploadStagerConfig,
|
|
28
|
+
parse_date_string,
|
|
29
|
+
)
|
|
30
|
+
from unstructured_ingest.utils.data_prep import split_dataframe
|
|
31
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
32
|
+
|
|
33
|
+
if TYPE_CHECKING:
|
|
34
|
+
from pandas import DataFrame
|
|
35
|
+
from snowflake.connector import SnowflakeConnection
|
|
36
|
+
from snowflake.connector.cursor import SnowflakeCursor
|
|
37
|
+
|
|
38
|
+
# Connector type string used by the Snowflake classes in this module.
CONNECTOR_TYPE = "snowflake"

# Name of the column holding embeddings; looked up with SHOW COLUMNS by the uploader.
EMBEDDINGS_COLUMN = "embeddings"
# Columns whose values the uploader serialises with json.dumps and wraps in
# PARSE_JSON(...) when building the select list.
_ARRAY_COLUMNS: tuple[str, ...] = (
    "languages",
    "link_urls",
    "link_texts",
    "sent_from",
    "sent_to",
    "emphasized_text_contents",
    "emphasized_text_tags",
)
# Columns treated as vectors; cast to VECTOR(FLOAT, n) when the table reports a dimension.
_VECTOR_COLUMNS: tuple[str, ...] = (EMBEDDINGS_COLUMN,)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
# Sensitive Snowflake settings. Only an optional password is declared here;
# anything else comes from SQLAccessConfig.
class SnowflakeAccessConfig(SQLAccessConfig):
    password: Optional[str] = Field(default=None, description="DB password")
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
# Connection settings for Snowflake. The model's fields are dumped and passed
# as keyword arguments to snowflake.connector.connect (see get_connection).
# NOTE(review): account, database, db_schema and role are annotated `str` but
# default to None -- confirm whether they are meant to be required.
class SnowflakeConnectionConfig(SQLConnectionConfig):
    access_config: Secret[SnowflakeAccessConfig] = Field(
        default=SnowflakeAccessConfig(), validate_default=True
    )
    account: str = Field(
        default=None,
        description="Your account identifier. The account identifier "
        "does not include the snowflakecomputing.com suffix.",
    )
    user: Optional[str] = Field(default=None, description="DB username")
    host: Optional[str] = Field(default=None, description="DB host")
    port: Optional[int] = Field(default=443, description="DB host connection port")
    database: str = Field(
        default=None,
        description="Database name.",
    )
    db_schema: str = Field(default=None, description="Database schema.", alias="schema")
    role: str = Field(
        default=None,
        description="Database role.",
    )
    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)

    @contextmanager
    # The actual snowflake module package name is: snowflake-connector-python
    @requires_dependencies(["snowflake"], extras="snowflake")
    def get_connection(self) -> Generator["SnowflakeConnection", None, None]:
        """Open a Snowflake connection, yield it, then commit and close it.

        Keyword arguments for ``connect`` are the dumped model fields with
        ``db_schema`` renamed to ``schema``, the secret wrapper removed, the
        plain password added, ``paramstyle`` forced to ``qmark`` and every
        ``None`` value dropped.

        The commit runs whether or not the caller's block raised. The
        connection is closed even when that commit itself raises.

        Yields:
            The open connection returned by ``snowflake.connector.connect``.
        """
        # https://docs.snowflake.com/en/developer-guide/python-connector/python-connector-api#label-snowflake-connector-methods-connect
        from snowflake.connector import connect

        connect_kwargs = self.model_dump()
        connect_kwargs["schema"] = connect_kwargs.pop("db_schema")
        # The field is named ``access_config``; the previous key
        # ("access_configs") never matched, so the pop was a no-op.
        connect_kwargs.pop("access_config", None)
        connect_kwargs["password"] = self.access_config.get_secret_value().password
        # https://peps.python.org/pep-0249/#paramstyle
        connect_kwargs["paramstyle"] = "qmark"
        # remove anything that is none
        active_kwargs = {k: v for k, v in connect_kwargs.items() if v is not None}
        connection = connect(**active_kwargs)
        try:
            yield connection
        finally:
            # Nested so that a failing commit cannot leave the connection open.
            try:
                connection.commit()
            finally:
                connection.close()

    @contextmanager
    def get_cursor(self) -> Generator["SnowflakeCursor", None, None]:
        """Yield a cursor on a fresh connection from ``get_connection``.

        The cursor is closed when the caller's block exits; the connection is
        then committed and closed by ``get_connection``.
        """
        with self.get_connection() as connection:
            cursor = connection.cursor()
            try:
                yield cursor
            finally:
                cursor.close()
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
# Snowflake indexer configuration; declares nothing beyond SQLIndexerConfig.
class SnowflakeIndexerConfig(SQLIndexerConfig):
    pass
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
# SQLIndexer bound to the Snowflake connection and indexer config types.
# No methods are overridden; only the field types and connector type are set.
@dataclass
class SnowflakeIndexer(SQLIndexer):
    connection_config: SnowflakeConnectionConfig
    index_config: SnowflakeIndexerConfig
    connector_type: str = CONNECTOR_TYPE
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
# Snowflake downloader configuration; declares nothing beyond SQLDownloaderConfig.
class SnowflakeDownloaderConfig(SQLDownloaderConfig):
    pass
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
@dataclass
class SnowflakeDownloader(SQLDownloader):
    """SQLDownloader that fetches a batch of rows from Snowflake by id."""

    connection_config: SnowflakeConnectionConfig
    download_config: SnowflakeDownloaderConfig
    connector_type: str = CONNECTOR_TYPE
    # qmark-style placeholder, one per id in the IN clause.
    values_delimiter: str = "?"

    # The actual snowflake module package name is: snowflake-connector-python
    @requires_dependencies(["snowflake"], extras="snowflake")
    def query_db(self, file_data: SqlBatchFileData) -> tuple[list[tuple], list[str]]:
        """Select the rows whose id column matches the identifiers in the batch.

        Args:
            file_data: Batch descriptor; its ``additional_metadata`` supplies the
                table name and id column, and ``batch_items`` the identifiers.

        Returns:
            A ``(rows, columns)`` pair. Rows that come back as dicts are
            flattened to tuples of their values; column names are taken from
            the cursor description.
        """
        metadata = file_data.additional_metadata
        table_name = metadata.table_name
        id_column = metadata.id_column
        ids = [item.identifier for item in file_data.batch_items]

        with self.connection_config.get_cursor() as cursor:
            selected = self.download_config.fields
            field_list = ",".join(selected) if selected else "*"
            placeholders = ",".join([self.values_delimiter] * len(ids))
            query = f"SELECT {field_list} FROM {table_name} WHERE {id_column} IN ({placeholders})"
            logger.debug(f"running query: {query}\nwith values: {ids}")
            cursor.execute(query, ids)

            rows = []
            for fetched in cursor.fetchall():
                if isinstance(fetched, dict):
                    fetched = tuple(fetched.values())
                rows.append(fetched)
            columns = [description[0] for description in cursor.description]
            return rows, columns
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
# Snowflake upload stager configuration; declares nothing beyond SQLUploadStagerConfig.
class SnowflakeUploadStagerConfig(SQLUploadStagerConfig):
    pass
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
# SQLUploadStager bound to the Snowflake stager config type.
# NOTE(review): unlike the indexer, downloader and uploader in this module,
# this class is not decorated with @dataclass -- confirm that is intended.
class SnowflakeUploadStager(SQLUploadStager):
    upload_stager_config: SnowflakeUploadStagerConfig
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
# Snowflake uploader configuration; declares nothing beyond SQLUploaderConfig.
class SnowflakeUploaderConfig(SQLUploaderConfig):
    pass
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
@dataclass
|
|
172
|
+
class SnowflakeUploader(SQLUploader):
|
|
173
|
+
upload_config: SnowflakeUploaderConfig = field(default_factory=SnowflakeUploaderConfig)
|
|
174
|
+
connection_config: SnowflakeConnectionConfig
|
|
175
|
+
connector_type: str = CONNECTOR_TYPE
|
|
176
|
+
values_delimiter: str = "?"
|
|
177
|
+
|
|
178
|
+
_embeddings_dimension: Optional[int] = None
|
|
179
|
+
|
|
180
|
+
@property
def embeddings_dimension(self) -> Optional[int]:
    """
    Get the dimension of the embeddings column in the Snowflake table.

    The table is inspected once with ``SHOW COLUMNS`` and the result is cached
    on the instance. If the column is not present, is not of type VECTOR, or
    reports no dimension, ``0`` is cached and returned, so the lookup is not
    repeated on later accesses.
    """
    if self._embeddings_dimension is not None:
        return self._embeddings_dimension

    with self.connection_config.get_cursor() as cursor:
        embeddings_column = cursor.execute(
            f"SHOW COLUMNS LIKE '{EMBEDDINGS_COLUMN}' IN {self.upload_config.table_name}"
        ).fetchone()
        if embeddings_column:
            data_type = {}
            if isinstance(embeddings_column, dict):
                # `or "{}"` also covers a present-but-null value, matching the
                # tuple branch; json.loads(None) would raise TypeError.
                data_type = json.loads(embeddings_column.get("data_type") or "{}")
            elif isinstance(embeddings_column, tuple):
                # presumably index 3 is the data_type column of SHOW COLUMNS -- TODO confirm
                data_type = json.loads(embeddings_column[3] or "{}")
            if isinstance(data_type, dict) and data_type.get("type") == "VECTOR":
                self._embeddings_dimension = data_type.get("dimension")
    # If the _embeddings_dimension is still None, it means the column
    # is not present or not a VECTOR type
    if self._embeddings_dimension is None:
        self._embeddings_dimension = 0
    return self._embeddings_dimension
|
|
204
|
+
|
|
205
|
+
@requires_dependencies(["pandas"], extras="snowflake")
|
|
206
|
+
def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
|
|
207
|
+
super().run(path=path, file_data=file_data, **kwargs)
|
|
208
|
+
|
|
209
|
+
@requires_dependencies(["pandas"], extras="snowflake")
|
|
210
|
+
def prepare_data(
|
|
211
|
+
self, columns: list[str], data: tuple[tuple[Any, ...], ...]
|
|
212
|
+
) -> list[tuple[Any, ...]]:
|
|
213
|
+
import pandas as pd
|
|
214
|
+
|
|
215
|
+
output = []
|
|
216
|
+
for row in data:
|
|
217
|
+
parsed = []
|
|
218
|
+
for column_name, value in zip(columns, row):
|
|
219
|
+
if column_name in _DATE_COLUMNS:
|
|
220
|
+
if value is None or pd.isna(value): # pandas is nan
|
|
221
|
+
parsed.append(None)
|
|
222
|
+
else:
|
|
223
|
+
parsed.append(parse_date_string(value))
|
|
224
|
+
elif column_name in _ARRAY_COLUMNS or column_name in _VECTOR_COLUMNS:
|
|
225
|
+
if not isinstance(value, list) and (
|
|
226
|
+
value is None or pd.isna(value)
|
|
227
|
+
): # pandas is nan
|
|
228
|
+
parsed.append(None)
|
|
229
|
+
else:
|
|
230
|
+
parsed.append(json.dumps(value))
|
|
231
|
+
else:
|
|
232
|
+
parsed.append(value)
|
|
233
|
+
output.append(tuple(parsed))
|
|
234
|
+
return output
|
|
235
|
+
|
|
236
|
+
def _parse_select(self, columns: list[str]) -> str:
|
|
237
|
+
embeddings_dimension = self.embeddings_dimension
|
|
238
|
+
parsed_values = []
|
|
239
|
+
for i, col in enumerate(columns):
|
|
240
|
+
argument_selector = f"${i + 1}"
|
|
241
|
+
if col in _VECTOR_COLUMNS and embeddings_dimension:
|
|
242
|
+
parsed_values.append(
|
|
243
|
+
f"PARSE_JSON({argument_selector})::VECTOR(FLOAT,{embeddings_dimension})"
|
|
244
|
+
)
|
|
245
|
+
elif col in _ARRAY_COLUMNS or col in _VECTOR_COLUMNS:
|
|
246
|
+
parsed_values.append(f"PARSE_JSON({argument_selector})")
|
|
247
|
+
else:
|
|
248
|
+
parsed_values.append(argument_selector)
|
|
249
|
+
return ",".join(parsed_values)
|
|
250
|
+
|
|
251
|
+
def upload_dataframe(self, df: "DataFrame", file_data: FileData) -> None:
|
|
252
|
+
import numpy as np
|
|
253
|
+
|
|
254
|
+
if self.can_delete():
|
|
255
|
+
self.delete_by_record_id(file_data=file_data)
|
|
256
|
+
else:
|
|
257
|
+
logger.warning(
|
|
258
|
+
f"table doesn't contain expected "
|
|
259
|
+
f"record id column "
|
|
260
|
+
f"{self.upload_config.record_id_key}, skipping delete"
|
|
261
|
+
)
|
|
262
|
+
df = self._fit_to_schema(df=df, add_missing_columns=True, case_sensitive=False)
|
|
263
|
+
df.replace({np.nan: None}, inplace=True)
|
|
264
|
+
|
|
265
|
+
columns = list(df.columns)
|
|
266
|
+
stmt = "INSERT INTO {table_name} ({columns}) SELECT {select} FROM VALUES ({values})".format(
|
|
267
|
+
table_name=self.upload_config.table_name,
|
|
268
|
+
columns=",".join(columns),
|
|
269
|
+
select=self._parse_select(columns),
|
|
270
|
+
values=",".join([self.values_delimiter for _ in columns]),
|
|
271
|
+
)
|
|
272
|
+
logger.info(
|
|
273
|
+
f"writing a total of {len(df)} elements via"
|
|
274
|
+
f" document batches to destination"
|
|
275
|
+
f" table named {self.upload_config.table_name}"
|
|
276
|
+
f" with batch size {self.upload_config.batch_size}"
|
|
277
|
+
)
|
|
278
|
+
for rows in split_dataframe(df=df, chunk_size=self.upload_config.batch_size):
|
|
279
|
+
with self.connection_config.get_cursor() as cursor:
|
|
280
|
+
values = self.prepare_data(columns, tuple(rows.itertuples(index=False, name=None)))
|
|
281
|
+
cursor.executemany(stmt, values)
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
# Registry entry bundling the Snowflake source-side classes: the connection
# config plus the indexer and downloader, each with its config.
snowflake_source_entry = SourceRegistryEntry(
    indexer=SnowflakeIndexer,
    indexer_config=SnowflakeIndexerConfig,
    downloader=SnowflakeDownloader,
    downloader_config=SnowflakeDownloaderConfig,
    connection_config=SnowflakeConnectionConfig,
)
|
|
291
|
+
|
|
292
|
+
# Registry entry bundling the Snowflake destination-side classes: the connection
# config plus the upload stager and uploader, each with its config.
snowflake_destination_entry = DestinationRegistryEntry(
    upload_stager=SnowflakeUploadStager,
    upload_stager_config=SnowflakeUploadStagerConfig,
    uploader=SnowflakeUploader,
    uploader_config=SnowflakeUploaderConfig,
    connection_config=SnowflakeConnectionConfig,
)
|