unstructured-ingest 0.0.2.dev0__py3-none-any.whl → 0.0.4__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release: this version of unstructured-ingest might be problematic.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/cli.py +6 -1
- unstructured_ingest/cli/cmds/__init__.py +4 -4
- unstructured_ingest/cli/cmds/{astra.py → astradb.py} +9 -9
- unstructured_ingest/cli/interfaces.py +13 -6
- unstructured_ingest/connector/{astra.py → astradb.py} +29 -29
- unstructured_ingest/connector/biomed.py +12 -5
- unstructured_ingest/connector/confluence.py +3 -3
- unstructured_ingest/connector/github.py +3 -2
- unstructured_ingest/connector/google_drive.py +1 -2
- unstructured_ingest/connector/mongodb.py +1 -2
- unstructured_ingest/connector/notion/client.py +31 -16
- unstructured_ingest/connector/notion/connector.py +3 -2
- unstructured_ingest/connector/registry.py +2 -2
- unstructured_ingest/connector/vectara.py +7 -2
- unstructured_ingest/interfaces.py +13 -9
- unstructured_ingest/pipeline/interfaces.py +8 -3
- unstructured_ingest/pipeline/reformat/chunking.py +13 -9
- unstructured_ingest/pipeline/reformat/embedding.py +3 -3
- unstructured_ingest/runner/__init__.py +2 -2
- unstructured_ingest/runner/{astra.py → astradb.py} +7 -7
- unstructured_ingest/runner/writers/__init__.py +2 -2
- unstructured_ingest/runner/writers/{astra.py → astradb.py} +7 -7
- unstructured_ingest/utils/chunking.py +45 -0
- unstructured_ingest/utils/dep_check.py +1 -1
- unstructured_ingest/utils/google_filetype.py +9 -0
- unstructured_ingest/v2/cli/base/cmd.py +66 -12
- unstructured_ingest/v2/cli/base/dest.py +21 -12
- unstructured_ingest/v2/cli/base/src.py +35 -21
- unstructured_ingest/v2/cli/cmds.py +14 -0
- unstructured_ingest/v2/cli/{utils.py → utils/click.py} +36 -89
- unstructured_ingest/v2/cli/utils/model_conversion.py +199 -0
- unstructured_ingest/v2/interfaces/__init__.py +2 -1
- unstructured_ingest/v2/interfaces/connector.py +5 -7
- unstructured_ingest/v2/interfaces/downloader.py +17 -8
- unstructured_ingest/v2/interfaces/file_data.py +13 -2
- unstructured_ingest/v2/interfaces/indexer.py +3 -4
- unstructured_ingest/v2/interfaces/process.py +3 -4
- unstructured_ingest/v2/interfaces/processor.py +10 -10
- unstructured_ingest/v2/interfaces/upload_stager.py +3 -3
- unstructured_ingest/v2/interfaces/uploader.py +3 -3
- unstructured_ingest/v2/pipeline/interfaces.py +3 -5
- unstructured_ingest/v2/pipeline/pipeline.py +73 -7
- unstructured_ingest/v2/pipeline/steps/chunk.py +5 -11
- unstructured_ingest/v2/pipeline/steps/download.py +90 -24
- unstructured_ingest/v2/pipeline/steps/embed.py +5 -11
- unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
- unstructured_ingest/v2/pipeline/steps/index.py +14 -10
- unstructured_ingest/v2/pipeline/steps/partition.py +5 -5
- unstructured_ingest/v2/pipeline/steps/stage.py +4 -7
- unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -6
- unstructured_ingest/v2/pipeline/steps/upload.py +2 -9
- unstructured_ingest/v2/processes/__init__.py +18 -0
- unstructured_ingest/v2/processes/chunker.py +74 -28
- unstructured_ingest/v2/processes/connector_registry.py +8 -2
- unstructured_ingest/v2/processes/connectors/__init__.py +13 -3
- unstructured_ingest/v2/processes/connectors/{astra.py → astradb.py} +53 -35
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +38 -27
- unstructured_ingest/v2/processes/connectors/chroma.py +38 -27
- unstructured_ingest/v2/processes/connectors/couchbase.py +151 -0
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +95 -31
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +92 -53
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +47 -16
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +23 -13
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +18 -11
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +49 -61
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +46 -13
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +50 -20
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +27 -28
- unstructured_ingest/v2/processes/connectors/google_drive.py +52 -42
- unstructured_ingest/v2/processes/connectors/local.py +36 -28
- unstructured_ingest/v2/processes/connectors/milvus.py +22 -18
- unstructured_ingest/v2/processes/connectors/mongodb.py +32 -22
- unstructured_ingest/v2/processes/connectors/onedrive.py +31 -16
- unstructured_ingest/v2/processes/connectors/opensearch.py +81 -43
- unstructured_ingest/v2/processes/connectors/pinecone.py +29 -23
- unstructured_ingest/v2/processes/connectors/salesforce.py +36 -26
- unstructured_ingest/v2/processes/connectors/sharepoint.py +64 -33
- unstructured_ingest/v2/processes/connectors/singlestore.py +11 -15
- unstructured_ingest/v2/processes/connectors/sql.py +52 -39
- unstructured_ingest/v2/processes/connectors/weaviate.py +35 -18
- unstructured_ingest/v2/processes/embedder.py +106 -47
- unstructured_ingest/v2/processes/filter.py +60 -0
- unstructured_ingest/v2/processes/partitioner.py +79 -33
- unstructured_ingest/v2/processes/uncompress.py +3 -3
- unstructured_ingest/v2/utils.py +45 -0
- unstructured_ingest-0.0.4.dist-info/METADATA +571 -0
- {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/RECORD +92 -116
- {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/WHEEL +1 -1
- unstructured_ingest/v2/cli/cmds/__init__.py +0 -89
- unstructured_ingest/v2/cli/cmds/astra.py +0 -85
- unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +0 -72
- unstructured_ingest/v2/cli/cmds/chroma.py +0 -108
- unstructured_ingest/v2/cli/cmds/databricks_volumes.py +0 -161
- unstructured_ingest/v2/cli/cmds/elasticsearch.py +0 -159
- unstructured_ingest/v2/cli/cmds/fsspec/azure.py +0 -84
- unstructured_ingest/v2/cli/cmds/fsspec/box.py +0 -58
- unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +0 -58
- unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +0 -77
- unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +0 -81
- unstructured_ingest/v2/cli/cmds/fsspec/s3.py +0 -84
- unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +0 -80
- unstructured_ingest/v2/cli/cmds/google_drive.py +0 -74
- unstructured_ingest/v2/cli/cmds/local.py +0 -60
- unstructured_ingest/v2/cli/cmds/milvus.py +0 -72
- unstructured_ingest/v2/cli/cmds/mongodb.py +0 -62
- unstructured_ingest/v2/cli/cmds/onedrive.py +0 -91
- unstructured_ingest/v2/cli/cmds/opensearch.py +0 -93
- unstructured_ingest/v2/cli/cmds/pinecone.py +0 -62
- unstructured_ingest/v2/cli/cmds/salesforce.py +0 -79
- unstructured_ingest/v2/cli/cmds/sharepoint.py +0 -112
- unstructured_ingest/v2/cli/cmds/singlestore.py +0 -96
- unstructured_ingest/v2/cli/cmds/sql.py +0 -84
- unstructured_ingest/v2/cli/cmds/weaviate.py +0 -100
- unstructured_ingest/v2/cli/configs/__init__.py +0 -6
- unstructured_ingest/v2/cli/configs/chunk.py +0 -89
- unstructured_ingest/v2/cli/configs/embed.py +0 -74
- unstructured_ingest/v2/cli/configs/partition.py +0 -99
- unstructured_ingest/v2/cli/configs/processor.py +0 -88
- unstructured_ingest/v2/cli/interfaces.py +0 -27
- unstructured_ingest/v2/pipeline/utils.py +0 -15
- unstructured_ingest-0.0.2.dev0.dist-info/METADATA +0 -321
- /unstructured_ingest/v2/cli/{cmds/fsspec → utils}/__init__.py +0 -0
- {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/top_level.txt +0 -0
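The recurring change across the connector diffs below is a migration of the v2 connector configs from plain dataclasses to pydantic models: fields gain `Field(...)` descriptions, and credentials move behind pydantic's generic `Secret` wrapper so they are masked in reprs and logs and must be unwrapped explicitly. A minimal sketch of that pattern, using hypothetical `DemoAccessConfig`/`DemoConnectionConfig` names (pydantic >= 2.7 is assumed for the generic `Secret` type):

```python
from typing import Optional

from pydantic import BaseModel, Field, Secret


class DemoAccessConfig(BaseModel):
    # Hypothetical stand-in for the per-connector AccessConfig classes
    password: Optional[str] = Field(default=None, description="DB password")


class DemoConnectionConfig(BaseModel):
    host: str = Field(description="DB host")
    # Secret[...] masks the nested model whenever the config is printed or logged
    access_config: Secret[DemoAccessConfig]


config = DemoConnectionConfig(
    host="localhost",
    access_config=DemoAccessConfig(password="hunter2"),  # coerced into Secret
)
print(config.access_config)                              # '**********'
print(config.access_config.get_secret_value().password)  # 'hunter2'
```

The same explicit unwrap step shows up throughout the diffs below, e.g. `self.access_config.get_secret_value().password` in the SingleStore connector.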
unstructured_ingest/v2/processes/connectors/sharepoint.py

```diff
@@ -6,10 +6,9 @@ from time import time
 from typing import TYPE_CHECKING, Any, Generator, Optional
 from urllib.parse import quote
 
-from
+from pydantic import BaseModel, Field, Secret, SecretStr
 
-from unstructured_ingest.
-from unstructured_ingest.error import SourceConnectionNetworkError
+from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
@@ -18,6 +17,7 @@ from unstructured_ingest.v2.interfaces import (
     DownloaderConfig,
     DownloadResponse,
     FileData,
+    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
@@ -55,24 +55,38 @@ class SharepointContentType(Enum):
     LIST = "list"
 
 
-@dataclass
 class SharepointAccessConfig(AccessConfig):
-    client_cred: str
-
-
-
-
-
-
-
-
+    client_cred: str = Field(description="Sharepoint app secret")
+
+
+class SharepointPermissionsConfig(BaseModel):
+    permissions_application_id: str = Field(description="Microsoft Graph API application id")
+    permissions_tenant: str = Field(
+        description="url to get permissions data within tenant.",
+        examples=["https://contoso.onmicrosoft.com"],
+    )
+    permissions_client_cred: SecretStr = Field(
+        description="Microsoft Graph API application credentials"
+    )
+    authority_url: Optional[SecretStr] = Field(
+        repr=False,
+        default_factory=lambda: SecretStr(secret_value="https://login.microsoftonline.com"),
+        description="Permissions authority url",
+        examples=["https://login.microsoftonline.com"],
+    )
 
 
-@dataclass
 class SharepointConnectionConfig(ConnectionConfig):
-    client_id: str
-    site: str
-
+    client_id: str = Field(description="Sharepoint app client ID")
+    site: str = Field(
+        description="Sharepoint site url. Process either base url e.g \
+            https://[tenant].sharepoint.com or relative sites \
+            https://[tenant].sharepoint.com/sites/<site_name>. \
+            To process all sites within the tenant pass a site url as \
+            https://[tenant]-admin.sharepoint.com.\
+            This requires the app to be registered at a tenant level"
+    )
+    access_config: Secret[SharepointAccessConfig]
     permissions_config: Optional[SharepointPermissionsConfig] = None
 
     @requires_dependencies(["office365"], extras="sharepoint")
@@ -81,7 +95,9 @@ class SharepointConnectionConfig(ConnectionConfig):
         from office365.sharepoint.client_context import ClientContext
 
         try:
-            credentials = ClientCredential(
+            credentials = ClientCredential(
+                self.client_id, self.access_config.get_secret_value().client_cred
+            )
             site_client = ClientContext(self.site).with_credentials(credentials)
         except Exception as e:
             logger.error(f"Couldn't set Sharepoint client: {e}")
@@ -93,11 +109,12 @@ class SharepointConnectionConfig(ConnectionConfig):
         from msal import ConfidentialClientApplication
 
         try:
+            client_credential = self.permissions_config.permissions_client_cred.get_secret_value()
             app = ConfidentialClientApplication(
-                authority=f"{self.permissions_config.authority_url}/"
+                authority=f"{self.permissions_config.authority_url.get_secret_value()}/"
                 f"{self.permissions_config.permissions_tenant}",
                 client_id=self.permissions_config.permissions_application_id,
-                client_credential=
+                client_credential=client_credential,
             )
             token = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])
         except ValueError as exc:
@@ -120,13 +137,21 @@ class SharepointConnectionConfig(ConnectionConfig):
         return client
 
 
-@dataclass
 class SharepointIndexerConfig(IndexerConfig):
-    path: Optional[str] =
-
-
-
-
+    path: Optional[str] = Field(
+        defaul=None,
+        description="Path from which to start parsing files. If the connector is to \
+            process all sites within the tenant this filter will be applied to \
+            all sites document libraries.",
+    )
+    recursive: bool = Field(
+        default=False,
+        description="Recursively download files in their respective folders "
+        "otherwise stop at the files in provided folder level.",
+    )
+    omit_files: bool = Field(default=False, description="Don't process files.")
+    omit_pages: bool = Field(default=False, description="Don't process site pages.")
+    omit_lists: bool = Field(default=False, description="Don't process lists.")
 
 
 @dataclass
@@ -134,6 +159,14 @@ class SharepointIndexer(Indexer):
     connection_config: SharepointConnectionConfig
     index_config: SharepointIndexerConfig = field(default_factory=lambda: SharepointIndexerConfig())
 
+    def precheck(self) -> None:
+        try:
+            site_client = self.connection_config.get_client()
+            site_client.site_pages.pages.get().execute_query()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")
+
     def list_files(self, folder: "Folder", recursive: bool = False) -> list["File"]:
         if not recursive:
             folder.expand(["Files"]).get().execute_query()
@@ -187,7 +220,7 @@ class SharepointIndexer(Indexer):
                 fullpath=file_path,
                 rel_path=file_path.replace(self.index_config.path, ""),
             ),
-            metadata=
+            metadata=FileDataSourceMetadata(
                 url=url,
                 version=version,
                 date_modified=str(date_modified_dt.timestamp()) if date_modified_dt else None,
@@ -222,7 +255,7 @@ class SharepointIndexer(Indexer):
                 fullpath=fullpath,
                 rel_path=rel_path,
             ),
-            metadata=
+            metadata=FileDataSourceMetadata(
                 url=absolute_url,
                 version=f"{file.major_version}.{file.minor_version}",
                 date_modified=str(date_modified_dt.timestamp()) if date_modified_dt else None,
@@ -303,7 +336,7 @@ class SharepointIndexer(Indexer):
     def process_permissions(self) -> bool:
         return (
             self.connection_config.permissions_config.permissions_tenant
-            and self.connection_config.permissions_config.permissions_client_cred
+            and self.connection_config.permissions_config.permissions_client_cred.get_secret_value()
            and self.connection_config.permissions_config.permissions_application_id
         )
 
@@ -328,7 +361,6 @@ class SharepointIndexer(Indexer):
             yield file_data
 
 
-@dataclass
 class SharepointDownloaderConfig(DownloaderConfig):
     pass
 
@@ -340,10 +372,9 @@ class SharepointDownloader(Downloader):
     connector_type: str = CONNECTOR_TYPE
 
     def get_download_path(self, file_data: FileData) -> Path:
+        download_path = super().get_download_path(file_data=file_data)
+
         content_type = file_data.additional_metadata.get("sharepoint_content_type")
-        rel_path = file_data.source_identifiers.fullpath
-        rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path
-        download_path = self.download_dir / Path(rel_path)
         if content_type == SharepointContentType.SITEPAGE.value:
             # Update output extension to html if site page
             download_path = download_path.with_suffix(".html")
```
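A rough usage sketch of the reworked Sharepoint connector above; the import path follows the file listing, the credential values are placeholders, and the keyword arguments mirror the fields shown in the diff:

```python
from unstructured_ingest.v2.processes.connectors.sharepoint import (
    SharepointAccessConfig,
    SharepointConnectionConfig,
    SharepointIndexer,
    SharepointIndexerConfig,
)

connection_config = SharepointConnectionConfig(
    client_id="<app-client-id>",  # placeholder
    site="https://contoso.sharepoint.com/sites/example",  # placeholder
    access_config=SharepointAccessConfig(client_cred="<app-secret>"),  # placeholder
)
indexer = SharepointIndexer(
    connection_config=connection_config,
    index_config=SharepointIndexerConfig(path="Shared Documents", recursive=True),
)
# New in this release: fail fast before the pipeline runs.
indexer.precheck()  # raises SourceConnectionError if the site can't be reached
```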
unstructured_ingest/v2/processes/connectors/singlestore.py

```diff
@@ -7,8 +7,8 @@ from typing import TYPE_CHECKING, Any, Optional
 import numpy as np
 import pandas as pd
 from dateutil import parser
+from pydantic import Field, Secret
 
-from unstructured_ingest.enhanced_dataclass import enhanced_field
 from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.utils.table import convert_to_pandas_dataframe
@@ -33,18 +33,16 @@ if TYPE_CHECKING:
 CONNECTOR_TYPE = "singlestore"
 
 
-@dataclass
 class SingleStoreAccessConfig(AccessConfig):
-    password: Optional[str] = None
+    password: Optional[str] = Field(default=None, description="SingleStore password")
 
 
-@dataclass
 class SingleStoreConnectionConfig(ConnectionConfig):
-    host: Optional[str] = None
-    port: Optional[int] = None
-    user: Optional[str] = None
-    database: Optional[str] = None
-    access_config: SingleStoreAccessConfig
+    host: Optional[str] = Field(default=None, description="SingleStore host")
+    port: Optional[int] = Field(default=None, description="SingleStore port")
+    user: Optional[str] = Field(default=None, description="SingleStore user")
+    database: Optional[str] = Field(default=None, description="SingleStore database")
+    access_config: Secret[SingleStoreAccessConfig]
 
     @requires_dependencies(["singlestoredb"], extras="singlestore")
     def get_connection(self) -> "Connection":
@@ -55,14 +53,13 @@ class SingleStoreConnectionConfig(ConnectionConfig):
             port=self.port,
             database=self.database,
             user=self.user,
-            password=self.access_config.password,
+            password=self.access_config.get_secret_value().password,
         )
         return conn
 
 
-@dataclass
 class SingleStoreUploadStagerConfig(UploadStagerConfig):
-    drop_empty_cols: bool = False
+    drop_empty_cols: bool = Field(default=False, description="Drop any columns that have no data")
 
 
 @dataclass
@@ -112,10 +109,9 @@ class SingleStoreUploadStager(UploadStager):
         return output_path
 
 
-@dataclass
 class SingleStoreUploaderConfig(UploaderConfig):
-    table_name: str
-    batch_size: int = 100
+    table_name: str = Field(description="SingleStore table to write contents to")
+    batch_size: int = Field(default=100, description="Batch size when writing to SingleStore")
 
 
 @dataclass
```
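A sketch of how the Secret-wrapped SingleStore config above is used; the values are placeholders, and `get_connection()` (shown in the diff) needs the `singlestore` extra installed:

```python
from unstructured_ingest.v2.processes.connectors.singlestore import (
    SingleStoreAccessConfig,
    SingleStoreConnectionConfig,
)

config = SingleStoreConnectionConfig(
    host="localhost",  # placeholders throughout
    port=3306,
    user="admin",
    database="ingest",
    access_config=SingleStoreAccessConfig(password="<password>"),
)
# The password no longer leaks through repr()/logging; unwrap it explicitly:
password = config.access_config.get_secret_value().password
conn = config.get_connection()  # opens a singlestoredb connection
```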
unstructured_ingest/v2/processes/connectors/sql.py

```diff
@@ -1,16 +1,16 @@
-import enum
 import json
 import uuid
 from dataclasses import dataclass, field
 from datetime import date, datetime
 from pathlib import Path
-from typing import Any, Optional, Union
+from typing import TYPE_CHECKING, Any, Callable, Literal, Optional, Union
 
 import numpy as np
 import pandas as pd
 from dateutil import parser
+from pydantic import Field, Secret
 
-from unstructured_ingest.
+from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
@@ -25,42 +25,48 @@ from unstructured_ingest.v2.interfaces import (
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
 
+if TYPE_CHECKING:
+    from sqlite3 import Connection as SqliteConnection
+
+    from psycopg2.extensions import connection as PostgresConnection
+
 CONNECTOR_TYPE = "sql"
 ELEMENTS_TABLE_NAME = "elements"
+SQLITE_DB = "sqlite"
+POSTGRESQL_DB = "postgresql"
 
 
-@dataclass
 class SQLAccessConfig(AccessConfig):
-    username: Optional[str] = None
-    password: Optional[str] = None
+    username: Optional[str] = Field(default=None, description="DB username")
+    password: Optional[str] = Field(default=None, description="DB password")
 
 
-
-    SQLITE = "sqlite"
-    POSTGRESQL = "postgresql"
+SecreteSQLAccessConfig = Secret[SQLAccessConfig]
 
 
-
-
-
-    # required default value here because of parent class
-        DatabaseType.SQLITE
+class SQLConnectionConfig(ConnectionConfig):
+    db_type: Literal["sqlite", "postgresql"] = Field(
+        default=SQLITE_DB, description="Type of the database backend"
     )
-    database: Optional[str] =
-
-
-
-
+    database: Optional[str] = Field(
+        default=None,
+        description="Database name. For sqlite databases, this is the path to the .db file.",
+    )
+    host: Optional[str] = Field(default=None, description="DB host")
+    port: Optional[int] = Field(default=5432, description="DB host connection port")
+    access_config: SecreteSQLAccessConfig = Field(
+        default_factory=lambda: SecreteSQLAccessConfig(secret_value=SQLAccessConfig())
+    )
+    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
 
     def __post_init__(self):
-        if (self.db_type ==
+        if (self.db_type == SQLITE_DB) and (self.database is None):
             raise ValueError(
                 "A sqlite connection requires a path to a *.db file "
                 "through the `database` argument"
             )
 
 
-@dataclass
 class SQLUploadStagerConfig(UploadStagerConfig):
     pass
 
@@ -134,7 +140,7 @@ class SQLUploadStager(UploadStager):
         **kwargs: Any,
     ) -> Path:
         with open(elements_filepath) as elements_file:
-            elements_contents = json.load(elements_file)
+            elements_contents: list[dict] = json.load(elements_file)
         output_path = Path(output_dir) / Path(f"{output_filename}.json")
         output_path.parent.mkdir(parents=True, exist_ok=True)
 
@@ -151,7 +157,7 @@ class SQLUploadStager(UploadStager):
             data["id"] = str(uuid.uuid4())
 
             # remove extraneous, not supported columns
-
+            data = {k: v for k, v in data.items() if k in _COLUMNS}
 
             output.append(data)
 
@@ -176,37 +182,46 @@ class SQLUploadStager(UploadStager):
         return output_path
 
 
-@dataclass
 class SQLUploaderConfig(UploaderConfig):
-    batch_size: int = 50
+    batch_size: int = Field(default=50, description="Number of records per batch")
 
 
 @dataclass
 class SQLUploader(Uploader):
     connector_type: str = CONNECTOR_TYPE
     upload_config: SQLUploaderConfig
-    connection_config:
+    connection_config: SQLConnectionConfig
+
+    def precheck(self) -> None:
+        try:
+            cursor = self.connection().cursor()
+            cursor.execute("SELECT 1;")
+            cursor.close()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
 
     @property
-    def connection(self):
-        if self.connection_config.db_type ==
+    def connection(self) -> Callable[[], Union["SqliteConnection", "PostgresConnection"]]:
+        if self.connection_config.db_type == POSTGRESQL_DB:
             return self._make_psycopg_connection
-        elif self.connection_config.db_type ==
+        elif self.connection_config.db_type == SQLITE_DB:
             return self._make_sqlite_connection
         raise ValueError(f"Unsupported database {self.connection_config.db_type} connection.")
 
-    def _make_sqlite_connection(self):
+    def _make_sqlite_connection(self) -> "SqliteConnection":
        from sqlite3 import connect
 
         return connect(database=self.connection_config.database)
 
     @requires_dependencies(["psycopg2"], extras="postgres")
-    def _make_psycopg_connection(self):
+    def _make_psycopg_connection(self) -> "PostgresConnection":
         from psycopg2 import connect
 
+        access_config = self.connection_config.access_config.get_secret_value()
         return connect(
-            user=
-            password=
+            user=access_config.username,
+            password=access_config.password,
             dbname=self.connection_config.database,
             host=self.connection_config.host,
             port=self.connection_config.port,
@@ -219,9 +234,7 @@ class SQLUploader(Uploader):
         for row in data:
             parsed = []
             for column_name, value in zip(columns, row):
-                if self.connection_config.db_type ==
-                    value, (list, dict)
-                ):
+                if self.connection_config.db_type == SQLITE_DB and isinstance(value, (list, dict)):
                     value = json.dumps(value)
                 if column_name in _DATE_COLUMNS:
                     if value is None:
@@ -240,14 +253,14 @@ class SQLUploader(Uploader):
 
         columns = tuple(df.columns)
         stmt = f"INSERT INTO {ELEMENTS_TABLE_NAME} ({','.join(columns)}) \
-            VALUES({','.join(['?' if self.connection_config.db_type==
+            VALUES({','.join(['?' if self.connection_config.db_type==SQLITE_DB else '%s' for x in columns])})"  # noqa E501
 
         for rows in pd.read_json(
             content.path, orient="records", lines=True, chunksize=self.upload_config.batch_size
         ):
             with self.connection() as conn:
                 values = self.prepare_data(columns, tuple(rows.itertuples(index=False, name=None)))
-                if self.connection_config.db_type ==
+                if self.connection_config.db_type == SQLITE_DB:
                     conn.executemany(stmt, values)
                 else:
                     with conn.cursor() as cur:
@@ -261,7 +274,7 @@ class SQLUploader(Uploader):
 
 
 sql_destination_entry = DestinationRegistryEntry(
-    connection_config=
+    connection_config=SQLConnectionConfig,
     uploader=SQLUploader,
     uploader_config=SQLUploaderConfig,
     upload_stager=SQLUploadStager,
```
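A sketch of the sqlite path through the rewritten SQL destination above; the `.db` filename is a placeholder. The string constants `SQLITE_DB`/`POSTGRESQL_DB` plus a `Literal`-typed `db_type` field replace the `DatabaseType` enum this diff removes:

```python
from unstructured_ingest.v2.processes.connectors.sql import (
    SQLConnectionConfig,
    SQLUploader,
    SQLUploaderConfig,
)

connection_config = SQLConnectionConfig(db_type="sqlite", database="elements.db")
uploader = SQLUploader(
    upload_config=SQLUploaderConfig(batch_size=50),
    connection_config=connection_config,
)
# New in this release: verify the database is reachable ("SELECT 1;") before
# any data is staged; raises DestinationConnectionError on failure.
uploader.precheck()
```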
unstructured_ingest/v2/processes/connectors/weaviate.py

```diff
@@ -5,8 +5,9 @@ from pathlib import Path
 from typing import TYPE_CHECKING, Any, Optional
 
 from dateutil import parser
+from pydantic import Field, Secret
 
-from unstructured_ingest.
+from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
@@ -29,27 +30,37 @@ if TYPE_CHECKING:
 CONNECTOR_TYPE = "weaviate"
 
 
-@dataclass
 class WeaviateAccessConfig(AccessConfig):
-    access_token: Optional[str] =
+    access_token: Optional[str] = Field(
+        default=None, description="Used to create the bearer token."
+    )
     api_key: Optional[str] = None
     client_secret: Optional[str] = None
     password: Optional[str] = None
 
 
-
+SecretWeaviateAccessConfig = Secret[WeaviateAccessConfig]
+
+
 class WeaviateConnectionConfig(ConnectionConfig):
-    host_url: str
-    class_name: str
-
+    host_url: str = Field(description="Weaviate instance url")
+    class_name: str = Field(
+        description="Name of the class to push the records into, e.g: Pdf-elements"
+    )
+    access_config: SecretWeaviateAccessConfig = Field(
+        default_factory=lambda: SecretWeaviateAccessConfig(secret_value=WeaviateAccessConfig())
+    )
     username: Optional[str] = None
-    anonymous: bool = False
+    anonymous: bool = Field(default=False, description="if set, all auth values will be ignored")
     scope: Optional[list[str]] = None
-    refresh_token: Optional[str] =
-
+    refresh_token: Optional[str] = Field(
+        default=None,
+        description="Will tie this value to the bearer token. If not provided, "
+        "the authentication will expire once the lifetime of the access token is up.",
+    )
+    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
 
 
-@dataclass
 class WeaviateUploadStagerConfig(UploadStagerConfig):
     pass
 
@@ -147,24 +158,29 @@ class WeaviateUploadStager(UploadStager):
         return output_path
 
 
-@dataclass
 class WeaviateUploaderConfig(UploaderConfig):
-    batch_size: int = 100
+    batch_size: int = Field(default=100, description="Number of records per batch")
 
 
 @dataclass
 class WeaviateUploader(Uploader):
     upload_config: WeaviateUploaderConfig
     connection_config: WeaviateConnectionConfig
-    client: Optional["Client"] = field(init=False)
     connector_type: str = CONNECTOR_TYPE
 
     @requires_dependencies(["weaviate"], extras="weaviate")
-    def
+    def get_client(self) -> "Client":
         from weaviate import Client
 
         auth = self._resolve_auth_method()
-
+        return Client(url=self.connection_config.host_url, auth_client_secret=auth)
+
+    def precheck(self) -> None:
+        try:
+            self.get_client()
+        except Exception as e:
+            logger.error(f"Failed to validate connection {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
 
     @requires_dependencies(["weaviate"], extras="weaviate")
     def _resolve_auth_method(self):
@@ -215,8 +231,9 @@ class WeaviateUploader(Uploader):
             f"at {self.connection_config.host_url}",
         )
 
-
-
+        client = self.get_client()
+        client.batch.configure(batch_size=self.upload_config.batch_size)
+        with client.batch as b:
             for e in elements_dict:
                 vector = e.pop("embeddings", None)
                 b.add_data_object(
```