unstructured-ingest 1.2.32__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic.
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/README.md +28 -0
- unstructured_ingest/cli/__init__.py +0 -0
- unstructured_ingest/cli/base/__init__.py +4 -0
- unstructured_ingest/cli/base/cmd.py +269 -0
- unstructured_ingest/cli/base/dest.py +84 -0
- unstructured_ingest/cli/base/importer.py +34 -0
- unstructured_ingest/cli/base/src.py +75 -0
- unstructured_ingest/cli/cli.py +24 -0
- unstructured_ingest/cli/cmds.py +14 -0
- unstructured_ingest/cli/utils/__init__.py +0 -0
- unstructured_ingest/cli/utils/click.py +237 -0
- unstructured_ingest/cli/utils/model_conversion.py +222 -0
- unstructured_ingest/data_types/__init__.py +0 -0
- unstructured_ingest/data_types/entities.py +17 -0
- unstructured_ingest/data_types/file_data.py +116 -0
- unstructured_ingest/embed/__init__.py +0 -0
- unstructured_ingest/embed/azure_openai.py +63 -0
- unstructured_ingest/embed/bedrock.py +323 -0
- unstructured_ingest/embed/huggingface.py +69 -0
- unstructured_ingest/embed/interfaces.py +146 -0
- unstructured_ingest/embed/mixedbreadai.py +134 -0
- unstructured_ingest/embed/octoai.py +133 -0
- unstructured_ingest/embed/openai.py +142 -0
- unstructured_ingest/embed/togetherai.py +116 -0
- unstructured_ingest/embed/vertexai.py +109 -0
- unstructured_ingest/embed/voyageai.py +130 -0
- unstructured_ingest/error.py +156 -0
- unstructured_ingest/errors_v2.py +156 -0
- unstructured_ingest/interfaces/__init__.py +27 -0
- unstructured_ingest/interfaces/connector.py +56 -0
- unstructured_ingest/interfaces/downloader.py +90 -0
- unstructured_ingest/interfaces/indexer.py +29 -0
- unstructured_ingest/interfaces/process.py +22 -0
- unstructured_ingest/interfaces/processor.py +88 -0
- unstructured_ingest/interfaces/upload_stager.py +89 -0
- unstructured_ingest/interfaces/uploader.py +67 -0
- unstructured_ingest/logger.py +39 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/otel.py +128 -0
- unstructured_ingest/pipeline/__init__.py +0 -0
- unstructured_ingest/pipeline/interfaces.py +211 -0
- unstructured_ingest/pipeline/otel.py +32 -0
- unstructured_ingest/pipeline/pipeline.py +408 -0
- unstructured_ingest/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/pipeline/steps/chunk.py +78 -0
- unstructured_ingest/pipeline/steps/download.py +206 -0
- unstructured_ingest/pipeline/steps/embed.py +77 -0
- unstructured_ingest/pipeline/steps/filter.py +35 -0
- unstructured_ingest/pipeline/steps/index.py +86 -0
- unstructured_ingest/pipeline/steps/partition.py +77 -0
- unstructured_ingest/pipeline/steps/stage.py +65 -0
- unstructured_ingest/pipeline/steps/uncompress.py +50 -0
- unstructured_ingest/pipeline/steps/upload.py +58 -0
- unstructured_ingest/processes/__init__.py +18 -0
- unstructured_ingest/processes/chunker.py +131 -0
- unstructured_ingest/processes/connector_registry.py +69 -0
- unstructured_ingest/processes/connectors/__init__.py +129 -0
- unstructured_ingest/processes/connectors/airtable.py +238 -0
- unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
- unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
- unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
- unstructured_ingest/processes/connectors/astradb.py +592 -0
- unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
- unstructured_ingest/processes/connectors/chroma.py +193 -0
- unstructured_ingest/processes/connectors/confluence.py +527 -0
- unstructured_ingest/processes/connectors/couchbase.py +336 -0
- unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
- unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
- unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
- unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
- unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
- unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
- unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
- unstructured_ingest/processes/connectors/delta_table.py +310 -0
- unstructured_ingest/processes/connectors/discord.py +161 -0
- unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
- unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
- unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
- unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
- unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
- unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
- unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
- unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
- unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
- unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
- unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
- unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
- unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
- unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
- unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/processes/connectors/github.py +226 -0
- unstructured_ingest/processes/connectors/gitlab.py +270 -0
- unstructured_ingest/processes/connectors/google_drive.py +848 -0
- unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
- unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
- unstructured_ingest/processes/connectors/jira.py +522 -0
- unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
- unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
- unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
- unstructured_ingest/processes/connectors/kafka/local.py +103 -0
- unstructured_ingest/processes/connectors/kdbai.py +156 -0
- unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
- unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
- unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
- unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
- unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
- unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
- unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
- unstructured_ingest/processes/connectors/local.py +227 -0
- unstructured_ingest/processes/connectors/milvus.py +311 -0
- unstructured_ingest/processes/connectors/mongodb.py +389 -0
- unstructured_ingest/processes/connectors/neo4j.py +534 -0
- unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
- unstructured_ingest/processes/connectors/notion/client.py +349 -0
- unstructured_ingest/processes/connectors/notion/connector.py +350 -0
- unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
- unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
- unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
- unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
- unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
- unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
- unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
- unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
- unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
- unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
- unstructured_ingest/processes/connectors/onedrive.py +485 -0
- unstructured_ingest/processes/connectors/outlook.py +242 -0
- unstructured_ingest/processes/connectors/pinecone.py +400 -0
- unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
- unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
- unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
- unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
- unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
- unstructured_ingest/processes/connectors/redisdb.py +214 -0
- unstructured_ingest/processes/connectors/salesforce.py +307 -0
- unstructured_ingest/processes/connectors/sharepoint.py +282 -0
- unstructured_ingest/processes/connectors/slack.py +249 -0
- unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
- unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
- unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
- unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
- unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
- unstructured_ingest/processes/connectors/sql/sql.py +456 -0
- unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
- unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
- unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
- unstructured_ingest/processes/connectors/utils.py +60 -0
- unstructured_ingest/processes/connectors/vectara.py +348 -0
- unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
- unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
- unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
- unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
- unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
- unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
- unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
- unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
- unstructured_ingest/processes/embedder.py +203 -0
- unstructured_ingest/processes/filter.py +60 -0
- unstructured_ingest/processes/partitioner.py +233 -0
- unstructured_ingest/processes/uncompress.py +61 -0
- unstructured_ingest/processes/utils/__init__.py +8 -0
- unstructured_ingest/processes/utils/blob_storage.py +32 -0
- unstructured_ingest/processes/utils/logging/connector.py +365 -0
- unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
- unstructured_ingest/unstructured_api.py +140 -0
- unstructured_ingest/utils/__init__.py +5 -0
- unstructured_ingest/utils/chunking.py +56 -0
- unstructured_ingest/utils/compression.py +72 -0
- unstructured_ingest/utils/constants.py +2 -0
- unstructured_ingest/utils/data_prep.py +216 -0
- unstructured_ingest/utils/dep_check.py +78 -0
- unstructured_ingest/utils/filesystem.py +27 -0
- unstructured_ingest/utils/html.py +174 -0
- unstructured_ingest/utils/ndjson.py +52 -0
- unstructured_ingest/utils/pydantic_models.py +52 -0
- unstructured_ingest/utils/string_and_date_utils.py +74 -0
- unstructured_ingest/utils/table.py +80 -0
- unstructured_ingest/utils/tls.py +15 -0
- unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
- unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
- unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
- unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
- unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0

unstructured_ingest/processes/connectors/slack.py
@@ -0,0 +1,249 @@
+import hashlib
+import time
+import xml.etree.ElementTree as ET
+from dataclasses import dataclass, field
+from datetime import datetime
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Generator, Optional
+
+from pydantic import Field, Secret
+
+from unstructured_ingest.data_types.file_data import (
+    FileData,
+    FileDataSourceMetadata,
+    SourceIdentifiers,
+)
+from unstructured_ingest.error import SourceConnectionError, ValueError
+from unstructured_ingest.interfaces import (
+    AccessConfig,
+    ConnectionConfig,
+    Downloader,
+    DownloaderConfig,
+    DownloadResponse,
+    Indexer,
+    IndexerConfig,
+)
+from unstructured_ingest.logger import logger
+from unstructured_ingest.processes.connector_registry import SourceRegistryEntry
+from unstructured_ingest.utils.dep_check import requires_dependencies
+
+if TYPE_CHECKING:
+    from slack_sdk import WebClient
+    from slack_sdk.web.async_client import AsyncWebClient
+
+# NOTE: Pagination limit set to the upper end of the recommended range
+# https://api.slack.com/apis/pagination#facts
+PAGINATION_LIMIT = 200
+
+CONNECTOR_TYPE = "slack"
+
+
+class SlackAccessConfig(AccessConfig):
+    token: str = Field(
+        description="Bot token used to access Slack API, must have channels:history scope for the"
+        " bot user."
+    )
+
+
+class SlackConnectionConfig(ConnectionConfig):
+    access_config: Secret[SlackAccessConfig]
+
+    @requires_dependencies(["slack_sdk"], extras="slack")
+    @SourceConnectionError.wrap
+    def get_client(self) -> "WebClient":
+        from slack_sdk import WebClient
+
+        return WebClient(token=self.access_config.get_secret_value().token)
+
+    @requires_dependencies(["slack_sdk"], extras="slack")
+    @SourceConnectionError.wrap
+    def get_async_client(self) -> "AsyncWebClient":
+        from slack_sdk.web.async_client import AsyncWebClient
+
+        return AsyncWebClient(token=self.access_config.get_secret_value().token)
+
+
+class SlackIndexerConfig(IndexerConfig):
+    channels: list[str] = Field(
+        description="Comma-delimited list of Slack channel IDs to pull messages from, can be"
+        " both public or private channels."
+    )
+    start_date: Optional[datetime] = Field(
+        default=None,
+        description="Start date/time in formats YYYY-MM-DD[T]HH:MM[:SS[.ffffff]][Z or [±]HH[:]MM]"
+        " or YYYY-MM-DD",
+    )
+    end_date: Optional[datetime] = Field(
+        default=None,
+        description="End date/time in formats YYYY-MM-DD[T]HH:MM[:SS[.ffffff]][Z or [±]HH[:]MM]"
+        " or YYYY-MM-DD",
+    )
+
+
+@dataclass
+class SlackIndexer(Indexer):
+    index_config: SlackIndexerConfig
+    connection_config: SlackConnectionConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+        client = self.connection_config.get_client()
+        for channel in self.index_config.channels:
+            messages = []
+            oldest = (
+                str(self.index_config.start_date.timestamp())
+                if self.index_config.start_date is not None
+                else None
+            )
+            latest = (
+                str(self.index_config.end_date.timestamp())
+                if self.index_config.end_date is not None
+                else None
+            )
+            for conversation_history in client.conversations_history(
+                channel=channel,
+                oldest=oldest,
+                latest=latest,
+                limit=PAGINATION_LIMIT,
+            ):
+                messages = conversation_history.get("messages", [])
+                if messages:
+                    yield self._messages_to_file_data(messages, channel)
+
+    def _messages_to_file_data(
+        self,
+        messages: list[dict],
+        channel: str,
+    ) -> FileData:
+        ts_oldest = min((message["ts"] for message in messages), key=lambda m: float(m))
+        ts_newest = max((message["ts"] for message in messages), key=lambda m: float(m))
+
+        identifier_base = f"{channel}-{ts_oldest}-{ts_newest}"
+        identifier = hashlib.sha256(identifier_base.encode("utf-8")).hexdigest()
+        filename = identifier[:16]
+
+        source_identifiers = SourceIdentifiers(
+            filename=f"{filename}.xml", fullpath=f"{filename}.xml"
+        )
+        return FileData(
+            identifier=identifier,
+            connector_type=CONNECTOR_TYPE,
+            source_identifiers=source_identifiers,
+            metadata=FileDataSourceMetadata(
+                date_created=ts_oldest,
+                date_modified=ts_newest,
+                date_processed=str(time.time()),
+                record_locator={
+                    "channel": channel,
+                    "oldest": ts_oldest,
+                    "latest": ts_newest,
+                },
+            ),
+            display_name=source_identifiers.fullpath,
+        )
+
+    @SourceConnectionError.wrap
+    def precheck(self) -> None:
+        client = self.connection_config.get_client()
+        for channel in self.index_config.channels:
+            # NOTE: Querying conversations history guarantees that the bot is in the channel
+            client.conversations_history(channel=channel, limit=1)
+
+
+class SlackDownloaderConfig(DownloaderConfig):
+    pass
+
+
+@dataclass
+class SlackDownloader(Downloader):
+    connector_type: str = CONNECTOR_TYPE
+    connection_config: SlackConnectionConfig
+    download_config: SlackDownloaderConfig = field(default_factory=SlackDownloaderConfig)
+
+    def run(self, file_data, **kwargs):
+        raise NotImplementedError
+
+    async def run_async(self, file_data: FileData, **kwargs) -> DownloadResponse:
+        # NOTE: Indexer should provide source identifiers required to generate the download path
+        download_path = self.get_download_path(file_data)
+        if download_path is None:
+            logger.error(
+                "Generated download path is None, source_identifiers might be missing from FileData."
+            )
+            raise ValueError("Generated invalid download path.")
+
+        await self._download_conversation(file_data, download_path)
+        return self.generate_download_response(file_data, download_path)
+
+    def is_async(self):
+        return True
+
+    async def _download_conversation(self, file_data: FileData, download_path: Path) -> None:
+        # NOTE: Indexer should supply the record locator in metadata
+        if (
+            file_data.metadata.record_locator is None
+            or "channel" not in file_data.metadata.record_locator
+            or "oldest" not in file_data.metadata.record_locator
+            or "latest" not in file_data.metadata.record_locator
+        ):
+            logger.error(
+                f"Invalid record locator in metadata: {file_data.metadata.record_locator}."
+                " Keys 'channel', 'oldest' and 'latest' must be present."
+            )
+            raise ValueError("Invalid record locator.")
+
+        client = self.connection_config.get_async_client()
+        messages = []
+        async for conversation_history in await client.conversations_history(
+            channel=file_data.metadata.record_locator["channel"],
+            oldest=file_data.metadata.record_locator["oldest"],
+            latest=file_data.metadata.record_locator["latest"],
+            limit=PAGINATION_LIMIT,
+            # NOTE: In order to get the exact same range of messages as indexer, it provides
+            # timestamps of oldest and newest messages, inclusive=True is necessary to include them
+            inclusive=True,
+        ):
+            messages += conversation_history.get("messages", [])
+
+        conversation = []
+        for message in messages:
+            thread_messages = []
+            async for conversations_replies in await client.conversations_replies(
+                channel=file_data.metadata.record_locator["channel"],
+                ts=message["ts"],
+                limit=PAGINATION_LIMIT,
+            ):
+                thread_messages += conversations_replies.get("messages", [])
+
+            # NOTE: Replies contains the whole thread, including the message referenced by the `ts`
+            # parameter even if it's the only message (there were no replies).
+            # Reference: https://api.slack.com/methods/conversations.replies#markdown
+            conversation.append(thread_messages)
+
+        conversation_xml = self._conversation_to_xml(conversation)
+        download_path.parent.mkdir(exist_ok=True, parents=True)
+        conversation_xml.write(download_path, encoding="utf-8", xml_declaration=True)
+
+    def _conversation_to_xml(self, conversation: list[list[dict]]) -> ET.ElementTree:
+        root = ET.Element("messages")
+
+        for thread in conversation:
+            message, *replies = thread
+            message_elem = ET.SubElement(root, "message")
+            text_elem = ET.SubElement(message_elem, "text")
+            text_elem.text = message.get("text")
+
+            for reply in replies:
+                reply_msg = reply.get("text", "")
+                text_elem.text = "".join([str(text_elem.text), " <reply> ", reply_msg])
+
+        return ET.ElementTree(root)
+
+
+slack_source_entry = SourceRegistryEntry(
+    indexer=SlackIndexer,
+    indexer_config=SlackIndexerConfig,
+    downloader=SlackDownloader,
+    downloader_config=DownloaderConfig,
+    connection_config=SlackConnectionConfig,
+)
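
For orientation, a minimal sketch of how the Slack classes above could be driven by hand. This wiring is an illustration only: the token and channel ID are placeholders, and in practice the pipeline constructs these objects itself from slack_source_entry rather than user code calling run()/run_async() directly.

import asyncio
from datetime import datetime

# Placeholder credentials and channel ID; normally supplied via CLI/pipeline config.
connection_config = SlackConnectionConfig(
    access_config=SlackAccessConfig(token="xoxb-placeholder"),
)
indexer = SlackIndexer(
    index_config=SlackIndexerConfig(
        channels=["C0123456789"],
        start_date=datetime(2024, 1, 1),
    ),
    connection_config=connection_config,
)
downloader = SlackDownloader(connection_config=connection_config)

# Each FileData describes one batch of channel messages; the downloader
# materializes it as an XML conversation dump under the download path.
for file_data in indexer.run():
    asyncio.run(downloader.run_async(file_data=file_data))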

unstructured_ingest/processes/connectors/sql/__init__.py
@@ -0,0 +1,41 @@
+from __future__ import annotations
+
+from unstructured_ingest.processes.connector_registry import (
+    add_destination_entry,
+    add_source_entry,
+)
+
+from .databricks_delta_tables import CONNECTOR_TYPE as DATABRICKS_DELTA_TABLES_CONNECTOR_TYPE
+from .databricks_delta_tables import databricks_delta_tables_destination_entry
+from .postgres import CONNECTOR_TYPE as POSTGRES_CONNECTOR_TYPE
+from .postgres import postgres_destination_entry, postgres_source_entry
+from .singlestore import CONNECTOR_TYPE as SINGLESTORE_CONNECTOR_TYPE
+from .singlestore import singlestore_destination_entry, singlestore_source_entry
+from .snowflake import CONNECTOR_TYPE as SNOWFLAKE_CONNECTOR_TYPE
+from .snowflake import snowflake_destination_entry, snowflake_source_entry
+from .sqlite import CONNECTOR_TYPE as SQLITE_CONNECTOR_TYPE
+from .sqlite import sqlite_destination_entry, sqlite_source_entry
+from .teradata import CONNECTOR_TYPE as TERADATA_CONNECTOR_TYPE
+from .teradata import teradata_destination_entry, teradata_source_entry
+from .vastdb import CONNECTOR_TYPE as VASTDB_CONNECTOR_TYPE
+from .vastdb import vastdb_destination_entry, vastdb_source_entry
+
+add_source_entry(source_type=SQLITE_CONNECTOR_TYPE, entry=sqlite_source_entry)
+add_source_entry(source_type=POSTGRES_CONNECTOR_TYPE, entry=postgres_source_entry)
+add_source_entry(source_type=SNOWFLAKE_CONNECTOR_TYPE, entry=snowflake_source_entry)
+add_source_entry(source_type=SINGLESTORE_CONNECTOR_TYPE, entry=singlestore_source_entry)
+add_source_entry(source_type=TERADATA_CONNECTOR_TYPE, entry=teradata_source_entry)
+add_source_entry(source_type=VASTDB_CONNECTOR_TYPE, entry=vastdb_source_entry)
+
+add_destination_entry(destination_type=SQLITE_CONNECTOR_TYPE, entry=sqlite_destination_entry)
+add_destination_entry(destination_type=POSTGRES_CONNECTOR_TYPE, entry=postgres_destination_entry)
+add_destination_entry(destination_type=SNOWFLAKE_CONNECTOR_TYPE, entry=snowflake_destination_entry)
+add_destination_entry(
+    destination_type=SINGLESTORE_CONNECTOR_TYPE, entry=singlestore_destination_entry
+)
+add_destination_entry(
+    destination_type=DATABRICKS_DELTA_TABLES_CONNECTOR_TYPE,
+    entry=databricks_delta_tables_destination_entry,
+)
+add_destination_entry(destination_type=TERADATA_CONNECTOR_TYPE, entry=teradata_destination_entry)
+add_destination_entry(destination_type=VASTDB_CONNECTOR_TYPE, entry=vastdb_destination_entry)
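
This __init__ module follows a single pattern: import each connector's CONNECTOR_TYPE plus its registry entry objects, then register them at import time. A hypothetical extra connector would slot in the same way; the mysql module and entry names below are invented for illustration, and only add_source_entry and add_destination_entry come from the code above.

from .mysql import CONNECTOR_TYPE as MYSQL_CONNECTOR_TYPE  # hypothetical module
from .mysql import mysql_destination_entry, mysql_source_entry  # hypothetical entries

add_source_entry(source_type=MYSQL_CONNECTOR_TYPE, entry=mysql_source_entry)
add_destination_entry(destination_type=MYSQL_CONNECTOR_TYPE, entry=mysql_destination_entry)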

unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py
@@ -0,0 +1,228 @@
+import json
+import os
+from contextlib import contextmanager
+from dataclasses import dataclass
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Generator, Optional
+
+from pydantic import Field, Secret
+
+from unstructured_ingest.data_types.file_data import FileData
+from unstructured_ingest.error import ValueError
+from unstructured_ingest.logger import logger
+from unstructured_ingest.processes.connector_registry import (
+    DestinationRegistryEntry,
+)
+from unstructured_ingest.processes.connectors.sql.sql import (
+    SQLAccessConfig,
+    SQLConnectionConfig,
+    SQLUploader,
+    SQLUploaderConfig,
+    SQLUploadStager,
+    SQLUploadStagerConfig,
+)
+from unstructured_ingest.utils.data_prep import split_dataframe
+from unstructured_ingest.utils.dep_check import requires_dependencies
+
+if TYPE_CHECKING:
+    from databricks.sdk.core import oauth_service_principal
+    from databricks.sql.client import Connection as DeltaTableConnection
+    from databricks.sql.client import Cursor as DeltaTableCursor
+    from pandas import DataFrame
+
+CONNECTOR_TYPE = "databricks_delta_tables"
+
+
+class DatabricksDeltaTablesAccessConfig(SQLAccessConfig):
+    token: Optional[str] = Field(default=None, description="Databricks Personal Access Token")
+    client_id: Optional[str] = Field(default=None, description="Client ID of the OAuth app.")
+    client_secret: Optional[str] = Field(
+        default=None, description="Client Secret of the OAuth app."
+    )
+
+
+class DatabricksDeltaTablesConnectionConfig(SQLConnectionConfig):
+    access_config: Secret[DatabricksDeltaTablesAccessConfig]
+    server_hostname: str = Field(description="server hostname connection config value")
+    http_path: str = Field(description="http path connection config value")
+
+    @requires_dependencies(["databricks"], extras="databricks-delta-tables")
+    def get_credentials_provider(self) -> "oauth_service_principal":
+        from databricks.sdk.core import Config, oauth_service_principal
+
+        host = f"https://{self.server_hostname}"
+        access_configs = self.access_config.get_secret_value()
+        client_id = access_configs.client_id
+        client_secret = access_configs.client_secret
+
+        def _get_credentials_provider():
+            return oauth_service_principal(
+                Config(
+                    host=host,
+                    client_id=client_id,
+                    client_secret=client_secret,
+                )
+            )
+
+        if client_id and client_secret:
+            return _get_credentials_provider
+
+        return False
+
+    def model_post_init(self, __context: Any) -> None:
+        access_config = self.access_config.get_secret_value()
+        if access_config.token and access_config.client_secret and access_config.client_id:
+            raise ValueError(
+                "Only one form of auth can be provided, either token or client id and secret"
+            )
+        if not access_config.token and not (
+            access_config.client_secret and access_config.client_id
+        ):
+            raise ValueError(
+                "One form of auth must be provided, either token or client id and secret"
+            )
+
+    @contextmanager
+    @requires_dependencies(["databricks"], extras="databricks-delta-tables")
+    def get_connection(self, **connect_kwargs) -> Generator["DeltaTableConnection", None, None]:
+        from databricks.sql import connect
+
+        connect_kwargs = connect_kwargs or {}
+        connect_kwargs["_user_agent_entry"] = os.getenv(
+            "UNSTRUCTURED_USER_AGENT", "unstructuredio_oss"
+        )
+        connect_kwargs["server_hostname"] = connect_kwargs.get(
+            "server_hostname", self.server_hostname
+        )
+        connect_kwargs["http_path"] = connect_kwargs.get("http_path", self.http_path)
+
+        if credential_provider := self.get_credentials_provider():
+            connect_kwargs["credentials_provider"] = credential_provider
+        else:
+            connect_kwargs["access_token"] = self.access_config.get_secret_value().token
+        with connect(**connect_kwargs) as connection:
+            yield connection
+
+    @contextmanager
+    def get_cursor(self, **connect_kwargs) -> Generator["DeltaTableCursor", None, None]:
+        with self.get_connection(**connect_kwargs) as connection:
+            cursor = connection.cursor()
+            yield cursor
+
+
+class DatabricksDeltaTablesUploadStagerConfig(SQLUploadStagerConfig):
+    pass
+
+
+class DatabricksDeltaTablesUploadStager(SQLUploadStager):
+    upload_stager_config: DatabricksDeltaTablesUploadStagerConfig
+
+
+class DatabricksDeltaTablesUploaderConfig(SQLUploaderConfig):
+    catalog: str = Field(description="Name of the catalog in the Databricks Unity Catalog service")
+    database: str = Field(description="Database name", default="default")
+    table_name: str = Field(description="Table name")
+
+
+@dataclass
+class DatabricksDeltaTablesUploader(SQLUploader):
+    upload_config: DatabricksDeltaTablesUploaderConfig
+    connection_config: DatabricksDeltaTablesConnectionConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    @requires_dependencies(["pandas"], extras="databricks-delta-tables")
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        super().run(path=path, file_data=file_data, **kwargs)
+
+    @contextmanager
+    def get_cursor(self) -> Generator[Any, None, None]:
+        with self.connection_config.get_cursor() as cursor:
+            cursor.execute(f"USE CATALOG '{self.upload_config.catalog}'")
+            yield cursor
+
+    def precheck(self) -> None:
+        with self.connection_config.get_cursor() as cursor:
+            cursor.execute("SHOW CATALOGS")
+            catalogs = [r[0] for r in cursor.fetchall()]
+            if self.upload_config.catalog not in catalogs:
+                raise ValueError(
+                    "Catalog {} not found in {}".format(
+                        self.upload_config.catalog, ", ".join(catalogs)
+                    )
+                )
+            cursor.execute(f"USE CATALOG '{self.upload_config.catalog}'")
+            cursor.execute("SHOW DATABASES")
+            databases = [r[0] for r in cursor.fetchall()]
+            if self.upload_config.database not in databases:
+                raise ValueError(
+                    "Database {} not found in {}".format(
+                        self.upload_config.database, ", ".join(databases)
+                    )
+                )
+            cursor.execute("SHOW TABLES")
+            table_names = [r[1] for r in cursor.fetchall()]
+            if self.upload_config.table_name not in table_names:
+                raise ValueError(
+                    "Table {} not found in {}".format(
+                        self.upload_config.table_name, ", ".join(table_names)
+                    )
+                )
+
+    def create_statement(self, columns: list[str], values: tuple[Any, ...]) -> str:
+        values_list = []
+        for v in values:
+            if isinstance(v, dict):
+                values_list.append(json.dumps(v))
+            elif isinstance(v, list):
+                if v and isinstance(v[0], (int, float)):
+                    values_list.append("ARRAY({})".format(", ".join([str(val) for val in v])))
+                else:
+                    values_list.append("ARRAY({})".format(", ".join([f"'{val}'" for val in v])))
+            else:
+                values_list.append(f"'{v}'")
+        statement = "INSERT INTO {table_name} ({columns}) VALUES({values})".format(
+            table_name=self.upload_config.table_name,
+            columns=", ".join(columns),
+            values=", ".join(values_list),
+        )
+        return statement
+
+    @requires_dependencies(["pandas"], extras="databricks-delta-tables")
+    def upload_dataframe(self, df: "DataFrame", file_data: FileData) -> None:
+        import numpy as np
+
+        if self.can_delete():
+            self.delete_by_record_id(file_data=file_data)
+        else:
+            logger.warning(
+                f"table doesn't contain expected "
+                f"record id column "
+                f"{self.upload_config.record_id_key}, skipping delete"
+            )
+        df.replace({np.nan: None}, inplace=True)
+        self._fit_to_schema(df=df)
+
+        columns = list(df.columns)
+        logger.info(
+            f"writing a total of {len(df)} elements via"
+            f" document batches to destination"
+            f" table named {self.upload_config.table_name}"
+            # f" with batch size {self.upload_config.batch_size}"
+        )
+        # TODO: currently variable binding not supported for list data_types,
+        # update once that gets resolved in SDK
+        for rows in split_dataframe(df=df, chunk_size=self.upload_config.batch_size):
+            with self.get_cursor() as cursor:
+                values = self.prepare_data(columns, tuple(rows.itertuples(index=False, name=None)))
+                for v in values:
+                    stmt = self.create_statement(columns=columns, values=v)
+                    cursor.execute(stmt)
+
+
+databricks_delta_tables_destination_entry = DestinationRegistryEntry(
+    connection_config=DatabricksDeltaTablesConnectionConfig,
+    uploader=DatabricksDeltaTablesUploader,
+    uploader_config=DatabricksDeltaTablesUploaderConfig,
+    upload_stager=DatabricksDeltaTablesUploadStager,
+    upload_stager_config=DatabricksDeltaTablesUploadStagerConfig,
+)
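
To make the SQL generation above concrete, here is a trace of what create_statement renders for one prepared row, assuming a table named "elements" (column names and values are invented). Dicts are JSON-encoded as-is, numeric lists become ARRAY(...) literals, string lists become quoted ARRAY(...) literals, and everything else is wrapped in single quotes:

columns = ["element_id", "text", "embeddings", "metadata"]
values = ("abc123", "Hello world", [0.1, 0.2], {"filetype": "pdf"})

# create_statement(columns=columns, values=values) yields (as one line):
# INSERT INTO elements (element_id, text, embeddings, metadata)
#     VALUES('abc123', 'Hello world', ARRAY(0.1, 0.2), {"filetype": "pdf"})

Because scalar values are interpolated with simple quoting rather than bound parameters, the TODO in upload_dataframe about variable binding applies to this statement builder as well.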

unstructured_ingest/processes/connectors/sql/postgres.py
@@ -0,0 +1,168 @@
+from contextlib import contextmanager
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Generator, Optional
+
+from pydantic import Field, Secret
+
+from unstructured_ingest.data_types.file_data import FileData
+from unstructured_ingest.logger import logger
+from unstructured_ingest.processes.connector_registry import (
+    DestinationRegistryEntry,
+    SourceRegistryEntry,
+)
+from unstructured_ingest.processes.connectors.sql.sql import (
+    SQLAccessConfig,
+    SqlBatchFileData,
+    SQLConnectionConfig,
+    SQLDownloader,
+    SQLDownloaderConfig,
+    SQLIndexer,
+    SQLIndexerConfig,
+    SQLUploader,
+    SQLUploaderConfig,
+    SQLUploadStager,
+    SQLUploadStagerConfig,
+)
+from unstructured_ingest.utils.dep_check import requires_dependencies
+
+if TYPE_CHECKING:
+    from psycopg2.extensions import connection as PostgresConnection
+    from psycopg2.extensions import cursor as PostgresCursor
+
+CONNECTOR_TYPE = "postgres"
+
+
+class PostgresAccessConfig(SQLAccessConfig):
+    password: Optional[str] = Field(default=None, description="DB password")
+
+
+class PostgresConnectionConfig(SQLConnectionConfig):
+    access_config: Secret[PostgresAccessConfig] = Field(
+        default=PostgresAccessConfig(), validate_default=True
+    )
+    database: Optional[str] = Field(
+        default=None,
+        description="Database name.",
+    )
+    username: Optional[str] = Field(default=None, description="DB username")
+    host: Optional[str] = Field(default=None, description="DB host")
+    port: Optional[int] = Field(default=5432, description="DB host connection port")
+    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
+
+    @contextmanager
+    @requires_dependencies(["psycopg2"], extras="postgres")
+    def get_connection(self) -> Generator["PostgresConnection", None, None]:
+        from psycopg2 import connect
+
+        access_config = self.access_config.get_secret_value()
+        connection = connect(
+            user=self.username,
+            password=access_config.password,
+            dbname=self.database,
+            host=self.host,
+            port=self.port,
+        )
+        try:
+            yield connection
+        finally:
+            connection.commit()
+            connection.close()
+
+    @contextmanager
+    def get_cursor(self) -> Generator["PostgresCursor", None, None]:
+        with self.get_connection() as connection:
+            cursor = connection.cursor()
+            try:
+                yield cursor
+            finally:
+                cursor.close()
+
+
+class PostgresIndexerConfig(SQLIndexerConfig):
+    pass
+
+
+@dataclass
+class PostgresIndexer(SQLIndexer):
+    connection_config: PostgresConnectionConfig
+    index_config: PostgresIndexerConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
+class PostgresDownloaderConfig(SQLDownloaderConfig):
+    pass
+
+
+@dataclass
+class PostgresDownloader(SQLDownloader):
+    connection_config: PostgresConnectionConfig
+    download_config: PostgresDownloaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    @requires_dependencies(["psycopg2"], extras="postgres")
+    def query_db(self, file_data: SqlBatchFileData) -> tuple[list[tuple], list[str]]:
+        from psycopg2 import sql
+
+        table_name = file_data.additional_metadata.table_name
+        id_column = file_data.additional_metadata.id_column
+        ids = tuple([item.identifier for item in file_data.batch_items])
+
+        with self.connection_config.get_cursor() as cursor:
+            fields = (
+                sql.SQL(",").join(sql.Identifier(field) for field in self.download_config.fields)
+                if self.download_config.fields
+                else sql.SQL("*")
+            )
+
+            query = sql.SQL("SELECT {fields} FROM {table_name} WHERE {id_column} IN %s").format(
+                fields=fields,
+                table_name=sql.Identifier(table_name),
+                id_column=sql.Identifier(id_column),
+            )
+            logger.debug(f"running query: {cursor.mogrify(query, (ids,))}")
+            cursor.execute(query, (ids,))
+            rows = cursor.fetchall()
+            columns = [col[0] for col in cursor.description]
+        return rows, columns
+
+
+class PostgresUploadStagerConfig(SQLUploadStagerConfig):
+    pass
+
+
+class PostgresUploadStager(SQLUploadStager):
+    upload_stager_config: PostgresUploadStagerConfig
+
+
+class PostgresUploaderConfig(SQLUploaderConfig):
+    pass
+
+
+@dataclass
+class PostgresUploader(SQLUploader):
+    upload_config: PostgresUploaderConfig = field(default_factory=PostgresUploaderConfig)
+    connection_config: PostgresConnectionConfig
+    connector_type: str = CONNECTOR_TYPE
+    values_delimiter: str = "%s"
+
+    @requires_dependencies(["pandas"], extras="postgres")
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        super().run(path=path, file_data=file_data, **kwargs)
+
+
+postgres_source_entry = SourceRegistryEntry(
+    connection_config=PostgresConnectionConfig,
+    indexer_config=PostgresIndexerConfig,
+    indexer=PostgresIndexer,
+    downloader_config=PostgresDownloaderConfig,
+    downloader=PostgresDownloader,
+)
+
+postgres_destination_entry = DestinationRegistryEntry(
+    connection_config=PostgresConnectionConfig,
+    uploader=PostgresUploader,
+    uploader_config=PostgresUploaderConfig,
+    upload_stager=PostgresUploadStager,
+    upload_stager_config=PostgresUploadStagerConfig,
+)
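
As a closing sketch, the Postgres connection config above can be exercised on its own. The host, credentials, and queried table are placeholders; commit/close and cursor cleanup happen inside the get_connection and get_cursor context managers shown in the diff.

config = PostgresConnectionConfig(
    access_config=PostgresAccessConfig(password="placeholder"),
    database="ingest",
    username="ingest_user",
    host="localhost",
    port=5432,
)

with config.get_cursor() as cursor:
    cursor.execute("SELECT count(*) FROM elements")  # placeholder table
    print(cursor.fetchone())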