unstructured-ingest 0.0.24__py3-none-any.whl → 0.1.0__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release: this version of unstructured-ingest might be problematic.
- test/__init__.py +0 -0
- test/integration/__init__.py +0 -0
- test/integration/chunkers/__init__.py +0 -0
- test/integration/chunkers/test_chunkers.py +42 -0
- test/integration/connectors/__init__.py +0 -0
- test/integration/connectors/conftest.py +15 -0
- test/integration/connectors/databricks_tests/__init__.py +0 -0
- test/integration/connectors/databricks_tests/test_volumes_native.py +165 -0
- test/integration/connectors/test_postgres.py +100 -0
- test/integration/connectors/test_s3.py +152 -0
- test/integration/connectors/test_sqlite.py +91 -0
- test/integration/connectors/utils/__init__.py +0 -0
- test/integration/connectors/utils/constants.py +7 -0
- test/integration/connectors/utils/docker_compose.py +44 -0
- test/integration/connectors/utils/validation.py +198 -0
- test/integration/embedders/__init__.py +0 -0
- test/integration/embedders/conftest.py +13 -0
- test/integration/embedders/test_bedrock.py +49 -0
- test/integration/embedders/test_huggingface.py +26 -0
- test/integration/embedders/test_mixedbread.py +47 -0
- test/integration/embedders/test_octoai.py +41 -0
- test/integration/embedders/test_openai.py +41 -0
- test/integration/embedders/test_vertexai.py +41 -0
- test/integration/embedders/test_voyageai.py +41 -0
- test/integration/embedders/togetherai.py +43 -0
- test/integration/embedders/utils.py +44 -0
- test/integration/partitioners/__init__.py +0 -0
- test/integration/partitioners/test_partitioner.py +75 -0
- test/integration/utils.py +15 -0
- test/unit/__init__.py +0 -0
- test/unit/embed/__init__.py +0 -0
- test/unit/embed/test_mixedbreadai.py +41 -0
- test/unit/embed/test_octoai.py +20 -0
- test/unit/embed/test_openai.py +20 -0
- test/unit/embed/test_vertexai.py +25 -0
- test/unit/embed/test_voyageai.py +24 -0
- test/unit/test_chunking_utils.py +36 -0
- test/unit/test_error.py +27 -0
- test/unit/test_interfaces.py +280 -0
- test/unit/test_interfaces_v2.py +26 -0
- test/unit/test_logger.py +78 -0
- test/unit/test_utils.py +164 -0
- test/unit/test_utils_v2.py +82 -0
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/interfaces.py +2 -2
- unstructured_ingest/connector/notion/types/block.py +1 -0
- unstructured_ingest/connector/notion/types/database.py +1 -0
- unstructured_ingest/connector/notion/types/page.py +1 -0
- unstructured_ingest/embed/bedrock.py +0 -20
- unstructured_ingest/embed/huggingface.py +0 -21
- unstructured_ingest/embed/interfaces.py +29 -3
- unstructured_ingest/embed/mixedbreadai.py +0 -36
- unstructured_ingest/embed/octoai.py +2 -24
- unstructured_ingest/embed/openai.py +0 -20
- unstructured_ingest/embed/togetherai.py +40 -0
- unstructured_ingest/embed/vertexai.py +0 -20
- unstructured_ingest/embed/voyageai.py +1 -24
- unstructured_ingest/interfaces.py +1 -1
- unstructured_ingest/utils/dep_check.py +12 -0
- unstructured_ingest/v2/cli/utils/click.py +21 -2
- unstructured_ingest/v2/interfaces/connector.py +22 -2
- unstructured_ingest/v2/interfaces/downloader.py +1 -0
- unstructured_ingest/v2/processes/chunker.py +1 -1
- unstructured_ingest/v2/processes/connectors/__init__.py +9 -11
- unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +175 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +125 -32
- unstructured_ingest/v2/processes/connectors/mongodb.py +223 -3
- unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
- unstructured_ingest/v2/processes/connectors/pinecone.py +9 -1
- unstructured_ingest/v2/processes/connectors/sql/__init__.py +13 -0
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +121 -0
- unstructured_ingest/v2/processes/connectors/sql/sql.py +181 -0
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +109 -0
- unstructured_ingest/v2/processes/embedder.py +13 -0
- unstructured_ingest/v2/processes/partitioner.py +2 -1
- {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/METADATA +12 -10
- {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/RECORD +86 -32
- {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/top_level.txt +1 -0
- unstructured_ingest/v2/processes/connectors/sql.py +0 -275
- {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/entry_points.txt +0 -0
unstructured_ingest/v2/processes/connectors/outlook.py (new file)
@@ -0,0 +1,239 @@
+import hashlib
+import time
+from dataclasses import dataclass, field
+from datetime import timezone
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Coroutine, Generator
+
+from pydantic import Field, Secret
+
+from unstructured_ingest.error import SourceConnectionError
+from unstructured_ingest.logger import logger
+from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.interfaces import (
+    AccessConfig,
+    ConnectionConfig,
+    Downloader,
+    DownloaderConfig,
+    FileData,
+    Indexer,
+    IndexerConfig,
+    download_responses,
+)
+from unstructured_ingest.v2.interfaces.file_data import FileDataSourceMetadata, SourceIdentifiers
+from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
+
+MAX_EMAILS_PER_FOLDER = 1_000_000  # Maximum number of emails per folder
+
+if TYPE_CHECKING:
+    from office365.graph_client import GraphClient
+    from office365.outlook.mail.folders.folder import MailFolder
+    from office365.outlook.mail.messages.message import Message
+
+
+CONNECTOR_TYPE = "outlook"
+
+
+class OutlookAccessConfig(AccessConfig):
+    client_credential: str = Field(description="Azure AD App client secret", alias="client_cred")
+
+
+class OutlookConnectionConfig(ConnectionConfig):
+    access_config: Secret[OutlookAccessConfig]
+    client_id: str = Field(description="Azure AD App client ID")
+    tenant: str = Field(
+        default="common", description="ID or domain name associated with your Azure AD instance"
+    )
+    authority_url: str = Field(
+        default="https://login.microsoftonline.com",
+        description="Authentication token provider for Microsoft apps",
+    )
+
+    @requires_dependencies(["msal"], extras="outlook")
+    def _acquire_token(self):
+        """Acquire token via MSAL"""
+        from msal import ConfidentialClientApplication
+
+        # NOTE: It'd be nice to use `msal.authority.AuthorityBuilder` here paired with AZURE_PUBLIC
+        # constant as default in the future but they do not fit well with `authority_url` right now
+        authority_url = f"{self.authority_url.rstrip('/')}/{self.tenant}"
+        app = ConfidentialClientApplication(
+            authority=authority_url,
+            client_id=self.client_id,
+            client_credential=self.access_config.get_secret_value().client_credential,
+        )
+        token = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])
+        return token
+
+    @requires_dependencies(["office365"], extras="outlook")
+    @SourceConnectionError.wrap
+    def get_client(self) -> "GraphClient":
+        from office365.graph_client import GraphClient
+
+        return GraphClient(self._acquire_token)
+
+
+class OutlookIndexerConfig(IndexerConfig):
+    outlook_folders: list[str] = Field(
+        description="Folders to download email messages from. Do not specify subfolders. "
+        "Use quotes if there are spaces in folder names."
+    )
+    recursive: bool = Field(
+        default=False,
+        description="Recursively download files in their respective folders otherwise stop at the"
+        " files in provided folder level.",
+    )
+    user_email: str = Field(description="Outlook email to download messages from.")
+
+
+@dataclass
+class OutlookIndexer(Indexer):
+    index_config: OutlookIndexerConfig
+    connection_config: OutlookConnectionConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+        messages = self._list_messages(recursive=self.index_config.recursive)
+
+        for message in messages:
+            yield self._message_to_file_data(message)
+
+    def run_async(self, **kwargs: Any) -> Coroutine[Any, Any, Any]:
+        raise NotImplementedError
+
+    @SourceConnectionError.wrap
+    def precheck(self) -> None:
+        client = self.connection_config.get_client()
+        client.users[self.index_config.user_email].get().execute_query()
+
+    def is_async(self) -> bool:
+        return False
+
+    def _list_messages(self, recursive: bool) -> list["Message"]:
+        mail_folders = self._get_selected_root_folders()
+        messages = []
+
+        while mail_folders:
+            mail_folder = mail_folders.pop()
+            messages += list(mail_folder.messages.get().top(MAX_EMAILS_PER_FOLDER).execute_query())
+
+            if recursive:
+                mail_folders += list(mail_folder.child_folders.get().execute_query())
+
+        return messages
+
+    def _get_selected_root_folders(self) -> list["MailFolder"]:
+        client_user = self.connection_config.get_client().users[self.index_config.user_email]
+        root_mail_folders = client_user.mail_folders.get().execute_query()
+
+        selected_names_normalized = [
+            folder_name.lower() for folder_name in self.index_config.outlook_folders
+        ]
+        selected_root_mail_folders = [
+            folder
+            for folder in root_mail_folders
+            if folder.display_name.lower() in selected_names_normalized
+        ]
+
+        if not selected_root_mail_folders:
+            logger.error(
+                f"Root folders selected in configuration: {self.index_config.outlook_folders}"
+                f"not found for user email {self.index_config.user_email}. Aborting."
+            )
+            raise ValueError("Root folders selected in configuration not found.")
+
+        return selected_root_mail_folders
+
+    def _message_to_file_data(self, message: "Message") -> FileData:
+        fullpath = self._generate_fullpath(message)
+
+        return FileData(
+            identifier=message.id,
+            connector_type=CONNECTOR_TYPE,
+            source_identifiers=SourceIdentifiers(filename=fullpath.name, fullpath=str(fullpath)),
+            metadata=FileDataSourceMetadata(
+                url=message.resource_url,
+                version=message.change_key,
+                date_modified=str(
+                    message.last_modified_datetime.replace(tzinfo=timezone.utc).timestamp()
+                ),
+                date_created=str(message.created_datetime.replace(tzinfo=timezone.utc).timestamp()),
+                date_processed=str(time.time()),
+                record_locator={
+                    "message_id": message.id,
+                    "user_email": self.index_config.user_email,
+                },
+            ),
+            additional_metadata={
+                "sent_from": str(message.sent_from),
+                "to_recipients": [str(recipient) for recipient in message.to_recipients],
+                "bcc_recipients": [str(recipient) for recipient in message.to_recipients],
+                "subject": message.subject,
+                "conversation_id": message.conversation_id,
+                "is_draft": message.is_draft,
+                "is_read": message.is_read,
+                "has_attachments": message.has_attachments,
+                "importance": message.importance,
+            },
+        )
+
+    def _generate_fullpath(self, message: "Message") -> Path:
+        return Path(hashlib.sha256(message.id.encode("utf-8")).hexdigest()[:16] + ".eml")
+
+
+class OutlookDownloaderConfig(DownloaderConfig):
+    pass
+
+
+@dataclass
+class OutlookDownloader(Downloader):
+    connector_type: str = CONNECTOR_TYPE
+    connection_config: OutlookConnectionConfig
+    download_config: OutlookDownloaderConfig = field(default_factory=OutlookDownloaderConfig)
+
+    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
+        # NOTE: Indexer should provide source identifiers required to generate the download path
+        download_path = self.get_download_path(file_data)
+        if download_path is None:
+            logger.error(
+                "Generated download path is None, source_identifiers might be missing"
+                "from FileData."
+            )
+            raise ValueError("Generated invalid download path.")
+
+        self._download_message(file_data, download_path)
+        return self.generate_download_response(file_data, download_path)
+
+    def is_async(self) -> bool:
+        return False
+
+    def _download_message(self, file_data: FileData, download_path: Path) -> None:
+        # NOTE: Indexer should supply the record locator in metadata
+        if (
+            file_data.metadata.record_locator is None
+            or "user_email" not in file_data.metadata.record_locator
+            or "message_id" not in file_data.metadata.record_locator
+        ):
+            logger.error(
+                f"Invalid record locator in metadata: {file_data.metadata.record_locator}."
+                "Keys 'user_email' and 'message_id' must be present."
+            )
+            raise ValueError("Invalid record locator.")
+
+        user_email = file_data.metadata.record_locator["user_email"]
+        message_id = file_data.metadata.record_locator["message_id"]
+
+        message = self.connection_config.get_client().users[user_email].messages[message_id]
+        download_path.parent.mkdir(exist_ok=True, parents=True)
+
+        with open(download_path, "wb") as file:
+            message.download(file).execute_query()
+
+
+outlook_source_entry = SourceRegistryEntry(
+    indexer=OutlookIndexer,
+    indexer_config=OutlookIndexerConfig,
+    downloader=OutlookDownloader,
+    downloader_config=OutlookDownloaderConfig,
+    connection_config=OutlookConnectionConfig,
+)
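A minimal sketch of driving the new Outlook source connector directly, outside the full ingest pipeline. The class and field names come from the hunk above; the manual indexer-to-downloader wiring, and all placeholder credentials and folder names, are assumptions rather than part of this release.

# Sketch only: placeholder tenant/credentials; the download location falls back to
# whatever OutlookDownloaderConfig's defaults provide.
from unstructured_ingest.v2.processes.connectors.outlook import (
    OutlookAccessConfig,
    OutlookConnectionConfig,
    OutlookDownloader,
    OutlookIndexer,
    OutlookIndexerConfig,
)

connection_config = OutlookConnectionConfig(
    client_id="<azure-app-client-id>",                      # hypothetical placeholder
    tenant="contoso.onmicrosoft.com",                       # hypothetical placeholder
    access_config=OutlookAccessConfig(client_cred="<azure-app-client-secret>"),
)
indexer = OutlookIndexer(
    connection_config=connection_config,
    index_config=OutlookIndexerConfig(outlook_folders=["Inbox"], user_email="user@contoso.com"),
)
downloader = OutlookDownloader(connection_config=connection_config)

indexer.precheck()                          # verifies the mailbox is reachable via Graph
for file_data in indexer.run():             # one FileData per message in the selected folders
    downloader.run(file_data=file_data)     # saves the message as an .eml file locally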
unstructured_ingest/v2/processes/connectors/pinecone.py
@@ -94,6 +94,10 @@ class PineconeUploaderConfig(UploaderConfig):
     pool_threads: Optional[int] = Field(
         default=1, description="Optional limit on number of threads to use for upload"
     )
+    namespace: Optional[str] = Field(
+        default=None,
+        description="The namespace to write to. If not specified, the default namespace is used",
+    )


 @dataclass
@@ -183,7 +187,11 @@ class PineconeUploader(Uploader):
         pool_threads = max_pool_threads
         index = self.connection_config.get_index(pool_threads=pool_threads)
         with index:
-
+            upsert_kwargs = [{"vectors": chunk, "async_req": True} for chunk in chunks]
+            if namespace := self.upload_config.namespace:
+                for kwargs in upsert_kwargs:
+                    kwargs["namespace"] = namespace
+            async_results = [index.upsert(**kwarg) for kwarg in upsert_kwargs]
             # Wait for and retrieve responses (this raises in case of error)
             try:
                 results = [async_result.get() for async_result in async_results]
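This change adds an optional namespace to the Pinecone uploader config and threads it into every async upsert batch. A minimal sketch of the new option (assumed usage, not from the diff); only fields visible in the hunk above are shown, other config fields are omitted.

from unstructured_ingest.v2.processes.connectors.pinecone import PineconeUploaderConfig

upload_config = PineconeUploaderConfig(
    pool_threads=4,            # existing option shown in the context lines above
    namespace="tenant-a",      # new in 0.1.0; None (the default) keeps Pinecone's default namespace
)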
unstructured_ingest/v2/processes/connectors/sql/__init__.py (new file)
@@ -0,0 +1,13 @@
+from __future__ import annotations
+
+from unstructured_ingest.v2.processes.connector_registry import (
+    add_destination_entry,
+)
+
+from .postgres import CONNECTOR_TYPE as POSTGRES_CONNECTOR_TYPE
+from .postgres import postgres_destination_entry
+from .sqlite import CONNECTOR_TYPE as SQLITE_CONNECTOR_TYPE
+from .sqlite import sqlite_destination_entry
+
+add_destination_entry(destination_type=SQLITE_CONNECTOR_TYPE, entry=sqlite_destination_entry)
+add_destination_entry(destination_type=POSTGRES_CONNECTOR_TYPE, entry=postgres_destination_entry)
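The new sql subpackage registers each backend's destination entry at import time. An illustrative sketch of how a further SQL backend could plug into the same registry pattern; the "mysql" module and its names are hypothetical and not part of this release.

from unstructured_ingest.v2.processes.connector_registry import add_destination_entry

from .mysql import CONNECTOR_TYPE as MYSQL_CONNECTOR_TYPE   # hypothetical module
from .mysql import mysql_destination_entry                  # hypothetical registry entry

add_destination_entry(destination_type=MYSQL_CONNECTOR_TYPE, entry=mysql_destination_entry)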
unstructured_ingest/v2/processes/connectors/sql/postgres.py (new file)
@@ -0,0 +1,121 @@
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Optional
+
+import numpy as np
+import pandas as pd
+from pydantic import Field, Secret
+
+from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
+from unstructured_ingest.v2.processes.connectors.sql.sql import (
+    _DATE_COLUMNS,
+    SQLAccessConfig,
+    SQLConnectionConfig,
+    SQLUploader,
+    SQLUploaderConfig,
+    SQLUploadStager,
+    SQLUploadStagerConfig,
+    parse_date_string,
+)
+
+if TYPE_CHECKING:
+    from psycopg2.extensions import connection as PostgresConnection
+
+CONNECTOR_TYPE = "postgres"
+
+
+class PostgresAccessConfig(SQLAccessConfig):
+    password: Optional[str] = Field(default=None, description="DB password")
+
+
+class PostgresConnectionConfig(SQLConnectionConfig):
+    access_config: Secret[PostgresAccessConfig] = Field(
+        default=PostgresAccessConfig(), validate_default=True
+    )
+    database: Optional[str] = Field(
+        default=None,
+        description="Database name.",
+    )
+    username: Optional[str] = Field(default=None, description="DB username")
+    host: Optional[str] = Field(default=None, description="DB host")
+    port: Optional[int] = Field(default=5432, description="DB host connection port")
+    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
+
+    @requires_dependencies(["psycopg2"], extras="postgres")
+    def get_connection(self) -> "PostgresConnection":
+        from psycopg2 import connect
+
+        access_config = self.access_config.get_secret_value()
+        return connect(
+            user=self.username,
+            password=access_config.password,
+            dbname=self.database,
+            host=self.host,
+            port=self.port,
+        )
+
+
+class PostgresUploadStagerConfig(SQLUploadStagerConfig):
+    pass
+
+
+class PostgresUploadStager(SQLUploadStager):
+    upload_stager_config: PostgresUploadStagerConfig
+
+
+class PostgresUploaderConfig(SQLUploaderConfig):
+    pass
+
+
+@dataclass
+class PostgresUploader(SQLUploader):
+    upload_config: PostgresUploaderConfig = field(default_factory=PostgresUploaderConfig)
+    connection_config: PostgresConnectionConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    def prepare_data(
+        self, columns: list[str], data: tuple[tuple[Any, ...], ...]
+    ) -> list[tuple[Any, ...]]:
+        output = []
+        for row in data:
+            parsed = []
+            for column_name, value in zip(columns, row):
+                if column_name in _DATE_COLUMNS:
+                    if value is None:
+                        parsed.append(None)
+                    else:
+                        parsed.append(parse_date_string(value))
+                else:
+                    parsed.append(value)
+            output.append(tuple(parsed))
+        return output
+
+    def upload_contents(self, path: Path) -> None:
+        df = pd.read_json(path, orient="records", lines=True)
+        logger.debug(f"uploading {len(df)} entries to {self.connection_config.database} ")
+        df.replace({np.nan: None}, inplace=True)
+
+        columns = tuple(df.columns)
+        stmt = f"INSERT INTO {self.upload_config.table_name} ({','.join(columns)}) \
+            VALUES({','.join(['%s' for x in columns])})"  # noqa E501
+
+        for rows in pd.read_json(
+            path, orient="records", lines=True, chunksize=self.upload_config.batch_size
+        ):
+            with self.connection_config.get_connection() as conn:
+                values = self.prepare_data(columns, tuple(rows.itertuples(index=False, name=None)))
+                with conn.cursor() as cur:
+                    cur.executemany(stmt, values)
+
+                conn.commit()
+
+
+postgres_destination_entry = DestinationRegistryEntry(
+    connection_config=PostgresConnectionConfig,
+    uploader=PostgresUploader,
+    uploader_config=PostgresUploaderConfig,
+    upload_stager=PostgresUploadStager,
+    upload_stager_config=PostgresUploadStagerConfig,
+)
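A minimal sketch (assumed usage, not from the diff) of pointing the new Postgres destination at a staged JSON-lines file. Host, credentials, table name, and file path are hypothetical placeholders; the target table is expected to already exist with the element columns listed in sql.py, since upload_contents only issues INSERTs.

from pathlib import Path

from unstructured_ingest.v2.processes.connectors.sql.postgres import (
    PostgresAccessConfig,
    PostgresConnectionConfig,
    PostgresUploader,
    PostgresUploaderConfig,
)

uploader = PostgresUploader(
    connection_config=PostgresConnectionConfig(
        host="localhost",                                    # hypothetical placeholder
        port=5432,
        database="elements_db",                              # hypothetical placeholder
        username="ingest",
        access_config=PostgresAccessConfig(password="ingest-password"),
    ),
    upload_config=PostgresUploaderConfig(table_name="elements", batch_size=100),
)
uploader.precheck()                                          # runs "SELECT 1;" on the connection
uploader.upload_contents(path=Path("staged-elements.json"))  # staged output from the upload stager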
unstructured_ingest/v2/processes/connectors/sql/sql.py (new file)
@@ -0,0 +1,181 @@
+import json
+import uuid
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from datetime import date, datetime
+from pathlib import Path
+from typing import Any, Union
+
+import pandas as pd
+from dateutil import parser
+from pydantic import Field, Secret
+
+from unstructured_ingest.error import DestinationConnectionError
+from unstructured_ingest.v2.interfaces import (
+    AccessConfig,
+    ConnectionConfig,
+    FileData,
+    Uploader,
+    UploaderConfig,
+    UploadStager,
+    UploadStagerConfig,
+)
+from unstructured_ingest.v2.logger import logger
+
+_COLUMNS = (
+    "id",
+    "element_id",
+    "text",
+    "embeddings",
+    "type",
+    "system",
+    "layout_width",
+    "layout_height",
+    "points",
+    "url",
+    "version",
+    "date_created",
+    "date_modified",
+    "date_processed",
+    "permissions_data",
+    "record_locator",
+    "category_depth",
+    "parent_id",
+    "attached_filename",
+    "filetype",
+    "last_modified",
+    "file_directory",
+    "filename",
+    "languages",
+    "page_number",
+    "links",
+    "page_name",
+    "link_urls",
+    "link_texts",
+    "sent_from",
+    "sent_to",
+    "subject",
+    "section",
+    "header_footer_type",
+    "emphasized_text_contents",
+    "emphasized_text_tags",
+    "text_as_html",
+    "regex_metadata",
+    "detection_class_prob",
+)
+
+_DATE_COLUMNS = ("date_created", "date_modified", "date_processed", "last_modified")
+
+
+def parse_date_string(date_value: Union[str, int]) -> date:
+    try:
+        timestamp = float(date_value) / 1000 if isinstance(date_value, int) else float(date_value)
+        return datetime.fromtimestamp(timestamp)
+    except Exception as e:
+        logger.debug(f"date {date_value} string not a timestamp: {e}")
+    return parser.parse(date_value)
+
+
+class SQLAccessConfig(AccessConfig):
+    pass
+
+
+class SQLConnectionConfig(ConnectionConfig, ABC):
+    access_config: Secret[SQLAccessConfig] = Field(default=SQLAccessConfig(), validate_default=True)
+
+    @abstractmethod
+    def get_connection(self) -> Any:
+        pass
+
+
+class SQLUploadStagerConfig(UploadStagerConfig):
+    pass
+
+
+@dataclass
+class SQLUploadStager(UploadStager):
+    upload_stager_config: SQLUploadStagerConfig = field(default_factory=SQLUploadStagerConfig)
+
+    def run(
+        self,
+        elements_filepath: Path,
+        file_data: FileData,
+        output_dir: Path,
+        output_filename: str,
+        **kwargs: Any,
+    ) -> Path:
+        with open(elements_filepath) as elements_file:
+            elements_contents: list[dict] = json.load(elements_file)
+        output_path = Path(output_dir) / Path(f"{output_filename}.json")
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+
+        output = []
+        for data in elements_contents:
+            metadata: dict[str, Any] = data.pop("metadata", {})
+            data_source = metadata.pop("data_source", {})
+            coordinates = metadata.pop("coordinates", {})
+
+            data.update(metadata)
+            data.update(data_source)
+            data.update(coordinates)
+
+            data["id"] = str(uuid.uuid4())
+
+            # remove extraneous, not supported columns
+            data = {k: v for k, v in data.items() if k in _COLUMNS}
+
+            output.append(data)
+
+        df = pd.DataFrame.from_dict(output)
+        for column in filter(lambda x: x in df.columns, _DATE_COLUMNS):
+            df[column] = df[column].apply(parse_date_string)
+        for column in filter(
+            lambda x: x in df.columns,
+            ("permissions_data", "record_locator", "points", "links"),
+        ):
+            df[column] = df[column].apply(
+                lambda x: json.dumps(x) if isinstance(x, (list, dict)) else None
+            )
+        for column in filter(
+            lambda x: x in df.columns,
+            ("version", "page_number", "regex_metadata"),
+        ):
+            df[column] = df[column].apply(str)
+
+        with output_path.open("w") as output_file:
+            df.to_json(output_file, orient="records", lines=True)
+        return output_path
+
+
+class SQLUploaderConfig(UploaderConfig):
+    batch_size: int = Field(default=50, description="Number of records per batch")
+    table_name: str = Field(default="elements", description="which table to upload contents to")
+
+
+@dataclass
+class SQLUploader(Uploader):
+    upload_config: SQLUploaderConfig
+    connection_config: SQLConnectionConfig
+
+    def precheck(self) -> None:
+        try:
+            connection = self.connection_config.get_connection()
+            cursor = connection.cursor()
+            cursor.execute("SELECT 1;")
+            cursor.close()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
+    @abstractmethod
+    def prepare_data(
+        self, columns: list[str], data: tuple[tuple[Any, ...], ...]
+    ) -> list[tuple[Any, ...]]:
+        pass
+
+    @abstractmethod
+    def upload_contents(self, path: Path) -> None:
+        pass
+
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        self.upload_contents(path=path)
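An illustrative sketch (not from the diff) of what SQLUploadStager.run does to a single partitioned element: the nested "metadata", "metadata.data_source", and "metadata.coordinates" dicts are flattened into top-level keys, anything outside _COLUMNS is dropped, and a fresh UUID "id" is added before the rows are written as JSON lines. The element values below are made up for illustration.

element_in = {
    "element_id": "abc123",
    "type": "NarrativeText",
    "text": "Hello world",
    "metadata": {
        "filename": "report.pdf",
        "page_number": 2,
        "data_source": {"url": "s3://bucket/report.pdf", "date_created": "1716300000.0"},
        "not_a_known_column": "dropped by the _COLUMNS filter",
    },
}
# After staging, the corresponding JSON-lines row looks roughly like:
# {"id": "<uuid4>", "element_id": "abc123", "type": "NarrativeText", "text": "Hello world",
#  "filename": "report.pdf", "page_number": "2", "url": "s3://bucket/report.pdf",
#  "date_created": <parsed timestamp>}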
unstructured_ingest/v2/processes/connectors/sql/sqlite.py (new file)
@@ -0,0 +1,109 @@
+import json
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import TYPE_CHECKING, Any
+
+import numpy as np
+import pandas as pd
+from pydantic import Field, Secret
+
+from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
+from unstructured_ingest.v2.processes.connectors.sql.sql import (
+    _DATE_COLUMNS,
+    SQLAccessConfig,
+    SQLConnectionConfig,
+    SQLUploader,
+    SQLUploaderConfig,
+    SQLUploadStager,
+    SQLUploadStagerConfig,
+    parse_date_string,
+)
+
+if TYPE_CHECKING:
+    from sqlite3 import Connection as SqliteConnection
+
+CONNECTOR_TYPE = "sqlite"
+
+
+class SQLiteAccessConfig(SQLAccessConfig):
+    pass
+
+
+class SQLiteConnectionConfig(SQLConnectionConfig):
+    access_config: Secret[SQLiteAccessConfig] = Field(
+        default=SQLiteAccessConfig(), validate_default=True
+    )
+    database_path: Path = Field(
+        description="Path to the .db file.",
+    )
+    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
+
+    def get_connection(self) -> "SqliteConnection":
+        from sqlite3 import connect
+
+        return connect(database=self.database_path)
+
+
+class SQLiteUploadStagerConfig(SQLUploadStagerConfig):
+    pass
+
+
+class SQLiteUploadStager(SQLUploadStager):
+    upload_stager_config: SQLiteUploadStagerConfig
+
+
+class SQLiteUploaderConfig(SQLUploaderConfig):
+    pass
+
+
+@dataclass
+class SQLiteUploader(SQLUploader):
+    upload_config: SQLiteUploaderConfig = field(default_factory=SQLiteUploaderConfig)
+    connection_config: SQLiteConnectionConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    def prepare_data(
+        self, columns: list[str], data: tuple[tuple[Any, ...], ...]
+    ) -> list[tuple[Any, ...]]:
+        output = []
+        for row in data:
+            parsed = []
+            for column_name, value in zip(columns, row):
+                if isinstance(value, (list, dict)):
+                    value = json.dumps(value)
+                if column_name in _DATE_COLUMNS:
+                    if value is None:
+                        parsed.append(None)
+                    else:
+                        parsed.append(parse_date_string(value))
+                else:
+                    parsed.append(value)
+            output.append(tuple(parsed))
+        return output
+
+    def upload_contents(self, path: Path) -> None:
+        df = pd.read_json(path, orient="records", lines=True)
+        logger.debug(f"uploading {len(df)} entries to {self.connection_config.database_path} ")
+        df.replace({np.nan: None}, inplace=True)
+
+        columns = tuple(df.columns)
+        stmt = f"INSERT INTO {self.upload_config.table_name} ({','.join(columns)}) \
+            VALUES({','.join(['?' for x in columns])})"  # noqa E501
+
+        for rows in pd.read_json(
+            path, orient="records", lines=True, chunksize=self.upload_config.batch_size
+        ):
+            with self.connection_config.get_connection() as conn:
+                values = self.prepare_data(columns, tuple(rows.itertuples(index=False, name=None)))
+                conn.executemany(stmt, values)
+                conn.commit()
+
+
+sqlite_destination_entry = DestinationRegistryEntry(
+    connection_config=SQLiteConnectionConfig,
+    uploader=SQLiteUploader,
+    uploader_config=SQLiteUploaderConfig,
+    upload_stager=SQLiteUploadStager,
+    upload_stager_config=SQLiteUploadStagerConfig,
+)
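A minimal sketch (assumed usage, not from the diff) of the new SQLite destination. The "elements.db" path, "elements" table, and staged file name are hypothetical; the table must already exist with columns matching the staged JSON-lines file, since upload_contents only INSERTs rows.

from pathlib import Path

from unstructured_ingest.v2.processes.connectors.sql.sqlite import (
    SQLiteConnectionConfig,
    SQLiteUploader,
    SQLiteUploaderConfig,
)

uploader = SQLiteUploader(
    connection_config=SQLiteConnectionConfig(database_path=Path("elements.db")),
    upload_config=SQLiteUploaderConfig(table_name="elements", batch_size=100),
)
uploader.precheck()                                          # issues "SELECT 1;" via sqlite3
uploader.upload_contents(path=Path("staged-elements.json"))  # staged output from the upload stager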