unstructured-ingest 0.0.2.dev0__py3-none-any.whl → 0.0.4__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of unstructured-ingest might be problematic.
Files changed (125)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/cli.py +6 -1
  3. unstructured_ingest/cli/cmds/__init__.py +4 -4
  4. unstructured_ingest/cli/cmds/{astra.py → astradb.py} +9 -9
  5. unstructured_ingest/cli/interfaces.py +13 -6
  6. unstructured_ingest/connector/{astra.py → astradb.py} +29 -29
  7. unstructured_ingest/connector/biomed.py +12 -5
  8. unstructured_ingest/connector/confluence.py +3 -3
  9. unstructured_ingest/connector/github.py +3 -2
  10. unstructured_ingest/connector/google_drive.py +1 -2
  11. unstructured_ingest/connector/mongodb.py +1 -2
  12. unstructured_ingest/connector/notion/client.py +31 -16
  13. unstructured_ingest/connector/notion/connector.py +3 -2
  14. unstructured_ingest/connector/registry.py +2 -2
  15. unstructured_ingest/connector/vectara.py +7 -2
  16. unstructured_ingest/interfaces.py +13 -9
  17. unstructured_ingest/pipeline/interfaces.py +8 -3
  18. unstructured_ingest/pipeline/reformat/chunking.py +13 -9
  19. unstructured_ingest/pipeline/reformat/embedding.py +3 -3
  20. unstructured_ingest/runner/__init__.py +2 -2
  21. unstructured_ingest/runner/{astra.py → astradb.py} +7 -7
  22. unstructured_ingest/runner/writers/__init__.py +2 -2
  23. unstructured_ingest/runner/writers/{astra.py → astradb.py} +7 -7
  24. unstructured_ingest/utils/chunking.py +45 -0
  25. unstructured_ingest/utils/dep_check.py +1 -1
  26. unstructured_ingest/utils/google_filetype.py +9 -0
  27. unstructured_ingest/v2/cli/base/cmd.py +66 -12
  28. unstructured_ingest/v2/cli/base/dest.py +21 -12
  29. unstructured_ingest/v2/cli/base/src.py +35 -21
  30. unstructured_ingest/v2/cli/cmds.py +14 -0
  31. unstructured_ingest/v2/cli/{utils.py → utils/click.py} +36 -89
  32. unstructured_ingest/v2/cli/utils/model_conversion.py +199 -0
  33. unstructured_ingest/v2/interfaces/__init__.py +2 -1
  34. unstructured_ingest/v2/interfaces/connector.py +5 -7
  35. unstructured_ingest/v2/interfaces/downloader.py +17 -8
  36. unstructured_ingest/v2/interfaces/file_data.py +13 -2
  37. unstructured_ingest/v2/interfaces/indexer.py +3 -4
  38. unstructured_ingest/v2/interfaces/process.py +3 -4
  39. unstructured_ingest/v2/interfaces/processor.py +10 -10
  40. unstructured_ingest/v2/interfaces/upload_stager.py +3 -3
  41. unstructured_ingest/v2/interfaces/uploader.py +3 -3
  42. unstructured_ingest/v2/pipeline/interfaces.py +3 -5
  43. unstructured_ingest/v2/pipeline/pipeline.py +73 -7
  44. unstructured_ingest/v2/pipeline/steps/chunk.py +5 -11
  45. unstructured_ingest/v2/pipeline/steps/download.py +90 -24
  46. unstructured_ingest/v2/pipeline/steps/embed.py +5 -11
  47. unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
  48. unstructured_ingest/v2/pipeline/steps/index.py +14 -10
  49. unstructured_ingest/v2/pipeline/steps/partition.py +5 -5
  50. unstructured_ingest/v2/pipeline/steps/stage.py +4 -7
  51. unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -6
  52. unstructured_ingest/v2/pipeline/steps/upload.py +2 -9
  53. unstructured_ingest/v2/processes/__init__.py +18 -0
  54. unstructured_ingest/v2/processes/chunker.py +74 -28
  55. unstructured_ingest/v2/processes/connector_registry.py +8 -2
  56. unstructured_ingest/v2/processes/connectors/__init__.py +13 -3
  57. unstructured_ingest/v2/processes/connectors/{astra.py → astradb.py} +53 -35
  58. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +38 -27
  59. unstructured_ingest/v2/processes/connectors/chroma.py +38 -27
  60. unstructured_ingest/v2/processes/connectors/couchbase.py +151 -0
  61. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +95 -31
  62. unstructured_ingest/v2/processes/connectors/elasticsearch.py +92 -53
  63. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +47 -16
  64. unstructured_ingest/v2/processes/connectors/fsspec/box.py +23 -13
  65. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +18 -11
  66. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +49 -61
  67. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +46 -13
  68. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +50 -20
  69. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +27 -28
  70. unstructured_ingest/v2/processes/connectors/google_drive.py +52 -42
  71. unstructured_ingest/v2/processes/connectors/local.py +36 -28
  72. unstructured_ingest/v2/processes/connectors/milvus.py +22 -18
  73. unstructured_ingest/v2/processes/connectors/mongodb.py +32 -22
  74. unstructured_ingest/v2/processes/connectors/onedrive.py +31 -16
  75. unstructured_ingest/v2/processes/connectors/opensearch.py +81 -43
  76. unstructured_ingest/v2/processes/connectors/pinecone.py +29 -23
  77. unstructured_ingest/v2/processes/connectors/salesforce.py +36 -26
  78. unstructured_ingest/v2/processes/connectors/sharepoint.py +64 -33
  79. unstructured_ingest/v2/processes/connectors/singlestore.py +11 -15
  80. unstructured_ingest/v2/processes/connectors/sql.py +52 -39
  81. unstructured_ingest/v2/processes/connectors/weaviate.py +35 -18
  82. unstructured_ingest/v2/processes/embedder.py +106 -47
  83. unstructured_ingest/v2/processes/filter.py +60 -0
  84. unstructured_ingest/v2/processes/partitioner.py +79 -33
  85. unstructured_ingest/v2/processes/uncompress.py +3 -3
  86. unstructured_ingest/v2/utils.py +45 -0
  87. unstructured_ingest-0.0.4.dist-info/METADATA +571 -0
  88. {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/RECORD +92 -116
  89. {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/WHEEL +1 -1
  90. unstructured_ingest/v2/cli/cmds/__init__.py +0 -89
  91. unstructured_ingest/v2/cli/cmds/astra.py +0 -85
  92. unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +0 -72
  93. unstructured_ingest/v2/cli/cmds/chroma.py +0 -108
  94. unstructured_ingest/v2/cli/cmds/databricks_volumes.py +0 -161
  95. unstructured_ingest/v2/cli/cmds/elasticsearch.py +0 -159
  96. unstructured_ingest/v2/cli/cmds/fsspec/azure.py +0 -84
  97. unstructured_ingest/v2/cli/cmds/fsspec/box.py +0 -58
  98. unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +0 -58
  99. unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +0 -77
  100. unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +0 -81
  101. unstructured_ingest/v2/cli/cmds/fsspec/s3.py +0 -84
  102. unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +0 -80
  103. unstructured_ingest/v2/cli/cmds/google_drive.py +0 -74
  104. unstructured_ingest/v2/cli/cmds/local.py +0 -60
  105. unstructured_ingest/v2/cli/cmds/milvus.py +0 -72
  106. unstructured_ingest/v2/cli/cmds/mongodb.py +0 -62
  107. unstructured_ingest/v2/cli/cmds/onedrive.py +0 -91
  108. unstructured_ingest/v2/cli/cmds/opensearch.py +0 -93
  109. unstructured_ingest/v2/cli/cmds/pinecone.py +0 -62
  110. unstructured_ingest/v2/cli/cmds/salesforce.py +0 -79
  111. unstructured_ingest/v2/cli/cmds/sharepoint.py +0 -112
  112. unstructured_ingest/v2/cli/cmds/singlestore.py +0 -96
  113. unstructured_ingest/v2/cli/cmds/sql.py +0 -84
  114. unstructured_ingest/v2/cli/cmds/weaviate.py +0 -100
  115. unstructured_ingest/v2/cli/configs/__init__.py +0 -6
  116. unstructured_ingest/v2/cli/configs/chunk.py +0 -89
  117. unstructured_ingest/v2/cli/configs/embed.py +0 -74
  118. unstructured_ingest/v2/cli/configs/partition.py +0 -99
  119. unstructured_ingest/v2/cli/configs/processor.py +0 -88
  120. unstructured_ingest/v2/cli/interfaces.py +0 -27
  121. unstructured_ingest/v2/pipeline/utils.py +0 -15
  122. unstructured_ingest-0.0.2.dev0.dist-info/METADATA +0 -321
  123. /unstructured_ingest/v2/cli/{cmds/fsspec → utils}/__init__.py +0 -0
  124. {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/entry_points.txt +0 -0
  125. {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/sharepoint.py

@@ -6,10 +6,9 @@ from time import time
 from typing import TYPE_CHECKING, Any, Generator, Optional
 from urllib.parse import quote
 
-from unstructured.documents.elements import DataSourceMetadata
+from pydantic import BaseModel, Field, Secret, SecretStr
 
-from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin, enhanced_field
-from unstructured_ingest.error import SourceConnectionNetworkError
+from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
@@ -18,6 +17,7 @@ from unstructured_ingest.v2.interfaces import (
     DownloaderConfig,
     DownloadResponse,
     FileData,
+    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
@@ -55,24 +55,38 @@ class SharepointContentType(Enum):
     LIST = "list"
 
 
-@dataclass
 class SharepointAccessConfig(AccessConfig):
-    client_cred: str
-
-
-@dataclass
-class SharepointPermissionsConfig(EnhancedDataClassJsonMixin):
-    permissions_application_id: str
-    permissions_tenant: str
-    permissions_client_cred: str = enhanced_field(sensitive=True)
-    authority_url: Optional[str] = field(repr=False, default="https://login.microsoftonline.com")
+    client_cred: str = Field(description="Sharepoint app secret")
+
+
+class SharepointPermissionsConfig(BaseModel):
+    permissions_application_id: str = Field(description="Microsoft Graph API application id")
+    permissions_tenant: str = Field(
+        description="url to get permissions data within tenant.",
+        examples=["https://contoso.onmicrosoft.com"],
+    )
+    permissions_client_cred: SecretStr = Field(
+        description="Microsoft Graph API application credentials"
+    )
+    authority_url: Optional[SecretStr] = Field(
+        repr=False,
+        default_factory=lambda: SecretStr(secret_value="https://login.microsoftonline.com"),
+        description="Permissions authority url",
+        examples=["https://login.microsoftonline.com"],
+    )
 
 
-@dataclass
 class SharepointConnectionConfig(ConnectionConfig):
-    client_id: str
-    site: str
-    access_config: SharepointAccessConfig = enhanced_field(sensitive=True)
+    client_id: str = Field(description="Sharepoint app client ID")
+    site: str = Field(
+        description="Sharepoint site url. Process either base url e.g \
+        https://[tenant].sharepoint.com or relative sites \
+        https://[tenant].sharepoint.com/sites/<site_name>. \
+        To process all sites within the tenant pass a site url as \
+        https://[tenant]-admin.sharepoint.com.\
+        This requires the app to be registered at a tenant level"
+    )
+    access_config: Secret[SharepointAccessConfig]
     permissions_config: Optional[SharepointPermissionsConfig] = None
 
     @requires_dependencies(["office365"], extras="sharepoint")
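The pattern in this hunk recurs across every connector touched by the release: `@dataclass` plus `enhanced_field(sensitive=True)` gives way to pydantic models, with the whole access config wrapped in `pydantic.Secret` so credentials are masked in reprs and logs and must be unwrapped explicitly at call sites. A minimal sketch of how the wrapper behaves; the `Demo*` names are illustrative stand-ins, not the package's classes:

from pydantic import BaseModel, Field, Secret


class DemoAccessConfig(BaseModel):  # stands in for an AccessConfig subclass
    client_cred: str = Field(description="app secret")


class DemoConnectionConfig(BaseModel):  # stands in for a ConnectionConfig subclass
    client_id: str
    access_config: Secret[DemoAccessConfig]


config = DemoConnectionConfig(
    client_id="my-app",
    access_config=DemoAccessConfig(client_cred="hunter2"),  # coerced into a Secret
)
print(config)  # access_config renders masked: Secret('**********')
print(config.access_config.get_secret_value().client_cred)  # explicit unwrap: 'hunter2'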
@@ -81,7 +95,9 @@ class SharepointConnectionConfig(ConnectionConfig):
         from office365.sharepoint.client_context import ClientContext
 
         try:
-            credentials = ClientCredential(self.client_id, self.access_config.client_cred)
+            credentials = ClientCredential(
+                self.client_id, self.access_config.get_secret_value().client_cred
+            )
             site_client = ClientContext(self.site).with_credentials(credentials)
         except Exception as e:
             logger.error(f"Couldn't set Sharepoint client: {e}")
@@ -93,11 +109,12 @@ class SharepointConnectionConfig(ConnectionConfig):
         from msal import ConfidentialClientApplication
 
         try:
+            client_credential = self.permissions_config.permissions_client_cred.get_secret_value()
             app = ConfidentialClientApplication(
-                authority=f"{self.permissions_config.authority_url}/"
+                authority=f"{self.permissions_config.authority_url.get_secret_value()}/"
                 f"{self.permissions_config.permissions_tenant}",
                 client_id=self.permissions_config.permissions_application_id,
-                client_credential=self.permissions_config.permissions_client_cred,
+                client_credential=client_credential,
             )
             token = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])
         except ValueError as exc:
@@ -120,13 +137,21 @@ class SharepointConnectionConfig(ConnectionConfig):
         return client
 
 
-@dataclass
 class SharepointIndexerConfig(IndexerConfig):
-    path: Optional[str] = None
-    recursive: bool = False
-    omit_files: bool = False
-    omit_pages: bool = False
-    omit_lists: bool = False
+    path: Optional[str] = Field(
+        defaul=None,
+        description="Path from which to start parsing files. If the connector is to \
+        process all sites within the tenant this filter will be applied to \
+        all sites document libraries.",
+    )
+    recursive: bool = Field(
+        default=False,
+        description="Recursively download files in their respective folders "
+        "otherwise stop at the files in provided folder level.",
+    )
+    omit_files: bool = Field(default=False, description="Don't process files.")
+    omit_pages: bool = Field(default=False, description="Don't process site pages.")
+    omit_lists: bool = Field(default=False, description="Don't process lists.")
 
 
 @dataclass
@@ -134,6 +159,14 @@ class SharepointIndexer(Indexer):
     connection_config: SharepointConnectionConfig
     index_config: SharepointIndexerConfig = field(default_factory=lambda: SharepointIndexerConfig())
 
+    def precheck(self) -> None:
+        try:
+            site_client = self.connection_config.get_client()
+            site_client.site_pages.pages.get().execute_query()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")
+
     def list_files(self, folder: "Folder", recursive: bool = False) -> list["File"]:
         if not recursive:
             folder.expand(["Files"]).get().execute_query()
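This `precheck()` hook is new in 0.0.4 and appears on indexers and uploaders throughout the release (the SQL and Weaviate hunks below add the same hook): one cheap authenticated call is made up front so bad credentials fail the pipeline before any documents are processed. The shape of the pattern, reduced to a self-contained sketch with illustrative names:

import logging

logger = logging.getLogger(__name__)


class SourceConnectionError(Exception):
    """Simplified stand-in for unstructured_ingest.error.SourceConnectionError."""


class DemoIndexer:  # illustrative, not the package's Indexer
    def _cheap_connection_check(self) -> None:
        raise RuntimeError("credentials rejected")  # pretend the remote API said no

    def precheck(self) -> None:
        # Surface connection/auth failures before any work is scheduled.
        try:
            self._cheap_connection_check()
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise SourceConnectionError(f"failed to validate connection: {e}") from e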
@@ -187,7 +220,7 @@ class SharepointIndexer(Indexer):
                 fullpath=file_path,
                 rel_path=file_path.replace(self.index_config.path, ""),
             ),
-            metadata=DataSourceMetadata(
+            metadata=FileDataSourceMetadata(
                 url=url,
                 version=version,
                 date_modified=str(date_modified_dt.timestamp()) if date_modified_dt else None,
@@ -222,7 +255,7 @@ class SharepointIndexer(Indexer):
                 fullpath=fullpath,
                 rel_path=rel_path,
             ),
-            metadata=DataSourceMetadata(
+            metadata=FileDataSourceMetadata(
                 url=absolute_url,
                 version=f"{file.major_version}.{file.minor_version}",
                 date_modified=str(date_modified_dt.timestamp()) if date_modified_dt else None,
@@ -303,7 +336,7 @@ class SharepointIndexer(Indexer):
     def process_permissions(self) -> bool:
         return (
             self.connection_config.permissions_config.permissions_tenant
-            and self.connection_config.permissions_config.permissions_client_cred
+            and self.connection_config.permissions_config.permissions_client_cred.get_secret_value()
             and self.connection_config.permissions_config.permissions_application_id
         )
 
@@ -328,7 +361,6 @@ class SharepointIndexer(Indexer):
             yield file_data
 
 
-@dataclass
 class SharepointDownloaderConfig(DownloaderConfig):
     pass
 
@@ -340,10 +372,9 @@ class SharepointDownloader(Downloader):
     connector_type: str = CONNECTOR_TYPE
 
     def get_download_path(self, file_data: FileData) -> Path:
+        download_path = super().get_download_path(file_data=file_data)
+
         content_type = file_data.additional_metadata.get("sharepoint_content_type")
-        rel_path = file_data.source_identifiers.fullpath
-        rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path
-        download_path = self.download_dir / Path(rel_path)
         if content_type == SharepointContentType.SITEPAGE.value:
             # Update output extension to html if site page
             download_path = download_path.with_suffix(".html")
unstructured_ingest/v2/processes/connectors/singlestore.py

@@ -7,8 +7,8 @@ from typing import TYPE_CHECKING, Any, Optional
 import numpy as np
 import pandas as pd
 from dateutil import parser
+from pydantic import Field, Secret
 
-from unstructured_ingest.enhanced_dataclass import enhanced_field
 from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.utils.table import convert_to_pandas_dataframe
@@ -33,18 +33,16 @@ if TYPE_CHECKING:
 CONNECTOR_TYPE = "singlestore"
 
 
-@dataclass
 class SingleStoreAccessConfig(AccessConfig):
-    password: Optional[str] = None
+    password: Optional[str] = Field(default=None, description="SingleStore password")
 
 
-@dataclass
 class SingleStoreConnectionConfig(ConnectionConfig):
-    host: Optional[str] = None
-    port: Optional[int] = None
-    user: Optional[str] = None
-    database: Optional[str] = None
-    access_config: SingleStoreAccessConfig = enhanced_field(sensitive=True)
+    host: Optional[str] = Field(default=None, description="SingleStore host")
+    port: Optional[int] = Field(default=None, description="SingleStore port")
+    user: Optional[str] = Field(default=None, description="SingleStore user")
+    database: Optional[str] = Field(default=None, description="SingleStore database")
+    access_config: Secret[SingleStoreAccessConfig]
 
     @requires_dependencies(["singlestoredb"], extras="singlestore")
     def get_connection(self) -> "Connection":
@@ -55,14 +53,13 @@ class SingleStoreConnectionConfig(ConnectionConfig):
             port=self.port,
             database=self.database,
             user=self.user,
-            password=self.access_config.password,
+            password=self.access_config.get_secret_value().password,
         )
         return conn
 
 
-@dataclass
 class SingleStoreUploadStagerConfig(UploadStagerConfig):
-    drop_empty_cols: bool = False
+    drop_empty_cols: bool = Field(default=False, description="Drop any columns that have no data")
 
 
 @dataclass
@@ -112,10 +109,9 @@ class SingleStoreUploadStager(UploadStager):
         return output_path
 
 
-@dataclass
 class SingleStoreUploaderConfig(UploaderConfig):
-    table_name: str
-    batch_size: int = 100
+    table_name: str = Field(description="SingleStore table to write contents to")
+    batch_size: int = Field(default=100, description="Batch size when writing to SingleStore")
 
 
 @dataclass
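Because these configs are now plain pydantic models, they can be validated straight from untyped input such as parsed CLI options or JSON, with the nested access config landing inside the `Secret` wrapper automatically. A sketch under that assumption; the `Demo*` names are stand-ins, not package classes:

from typing import Optional

from pydantic import BaseModel, Field, Secret


class DemoAccessConfig(BaseModel):  # stands in for SingleStoreAccessConfig
    password: Optional[str] = Field(default=None)


class DemoConnectionConfig(BaseModel):  # stands in for SingleStoreConnectionConfig
    host: Optional[str] = None
    access_config: Secret[DemoAccessConfig]


# A plain dict validates into the nested, Secret-wrapped model:
config = DemoConnectionConfig.model_validate(
    {"host": "svc-xyz.singlestore.com", "access_config": {"password": "hunter2"}}
)
print(config)  # access_config renders as Secret('**********')
print(config.access_config.get_secret_value().password)  # 'hunter2'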
unstructured_ingest/v2/processes/connectors/sql.py

@@ -1,16 +1,16 @@
-import enum
 import json
 import uuid
 from dataclasses import dataclass, field
 from datetime import date, datetime
 from pathlib import Path
-from typing import Any, Optional, Union
+from typing import TYPE_CHECKING, Any, Callable, Literal, Optional, Union
 
 import numpy as np
 import pandas as pd
 from dateutil import parser
+from pydantic import Field, Secret
 
-from unstructured_ingest.enhanced_dataclass import enhanced_field
+from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
@@ -25,42 +25,48 @@ from unstructured_ingest.v2.interfaces import (
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
 
+if TYPE_CHECKING:
+    from sqlite3 import Connection as SqliteConnection
+
+    from psycopg2.extensions import connection as PostgresConnection
+
 CONNECTOR_TYPE = "sql"
 ELEMENTS_TABLE_NAME = "elements"
+SQLITE_DB = "sqlite"
+POSTGRESQL_DB = "postgresql"
 
 
-@dataclass
 class SQLAccessConfig(AccessConfig):
-    username: Optional[str] = None
-    password: Optional[str] = None
+    username: Optional[str] = Field(default=None, description="DB username")
+    password: Optional[str] = Field(default=None, description="DB password")
 
 
-class DatabaseType(str, enum.Enum):
-    SQLITE = "sqlite"
-    POSTGRESQL = "postgresql"
+SecreteSQLAccessConfig = Secret[SQLAccessConfig]
 
 
-@dataclass
-class SimpleSqlConfig(ConnectionConfig):
-    db_type: DatabaseType = (
-        # required default value here because of parent class
-        DatabaseType.SQLITE
+class SQLConnectionConfig(ConnectionConfig):
+    db_type: Literal["sqlite", "postgresql"] = Field(
+        default=SQLITE_DB, description="Type of the database backend"
     )
-    database: Optional[str] = None
-    host: Optional[str] = None
-    port: Optional[int] = 5432
-    access_config: Optional[SQLAccessConfig] = enhanced_field(default=None, sensitive=True)
-    connector_type: str = CONNECTOR_TYPE
+    database: Optional[str] = Field(
+        default=None,
+        description="Database name. For sqlite databases, this is the path to the .db file.",
+    )
+    host: Optional[str] = Field(default=None, description="DB host")
+    port: Optional[int] = Field(default=5432, description="DB host connection port")
+    access_config: SecreteSQLAccessConfig = Field(
+        default_factory=lambda: SecreteSQLAccessConfig(secret_value=SQLAccessConfig())
+    )
+    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
 
     def __post_init__(self):
-        if (self.db_type == DatabaseType.SQLITE) and (self.database is None):
+        if (self.db_type == SQLITE_DB) and (self.database is None):
            raise ValueError(
                 "A sqlite connection requires a path to a *.db file "
                 "through the `database` argument"
             )
 
 
-@dataclass
 class SQLUploadStagerConfig(UploadStagerConfig):
     pass
 
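Note the type change here: the `DatabaseType` enum is gone, and `db_type` is now a `Literal["sqlite", "postgresql"]` backed by module-level string constants. Validation stays just as strict, while comparisons elsewhere in the file become plain string checks. A minimal sketch, with an illustrative class name:

from typing import Literal

from pydantic import BaseModel, Field, ValidationError

SQLITE_DB = "sqlite"


class DemoSQLConfig(BaseModel):  # stands in for SQLConnectionConfig
    db_type: Literal["sqlite", "postgresql"] = Field(default=SQLITE_DB)


print(DemoSQLConfig(db_type="postgresql").db_type)  # accepted
try:
    DemoSQLConfig(db_type="mysql")  # rejected: not one of the allowed literals
except ValidationError as e:
    print(e.errors()[0]["type"])  # 'literal_error'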
@@ -134,7 +140,7 @@ class SQLUploadStager(UploadStager):
         **kwargs: Any,
     ) -> Path:
         with open(elements_filepath) as elements_file:
-            elements_contents = json.load(elements_file)
+            elements_contents: list[dict] = json.load(elements_file)
         output_path = Path(output_dir) / Path(f"{output_filename}.json")
         output_path.parent.mkdir(parents=True, exist_ok=True)
 
@@ -151,7 +157,7 @@ class SQLUploadStager(UploadStager):
         data["id"] = str(uuid.uuid4())
 
         # remove extraneous, not supported columns
-        [data.pop(column) for column in data if column not in _COLUMNS]
+        data = {k: v for k, v in data.items() if k in _COLUMNS}
 
         output.append(data)
 
@@ -176,37 +182,46 @@ class SQLUploadStager(UploadStager):
         return output_path
 
 
-@dataclass
 class SQLUploaderConfig(UploaderConfig):
-    batch_size: int = 50
+    batch_size: int = Field(default=50, description="Number of records per batch")
 
 
 @dataclass
 class SQLUploader(Uploader):
     connector_type: str = CONNECTOR_TYPE
     upload_config: SQLUploaderConfig
-    connection_config: SimpleSqlConfig
+    connection_config: SQLConnectionConfig
+
+    def precheck(self) -> None:
+        try:
+            cursor = self.connection().cursor()
+            cursor.execute("SELECT 1;")
+            cursor.close()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
 
     @property
-    def connection(self):
-        if self.connection_config.db_type == DatabaseType.POSTGRESQL:
+    def connection(self) -> Callable[[], Union["SqliteConnection", "PostgresConnection"]]:
+        if self.connection_config.db_type == POSTGRESQL_DB:
             return self._make_psycopg_connection
-        elif self.connection_config.db_type == DatabaseType.SQLITE:
+        elif self.connection_config.db_type == SQLITE_DB:
             return self._make_sqlite_connection
         raise ValueError(f"Unsupported database {self.connection_config.db_type} connection.")
 
-    def _make_sqlite_connection(self):
+    def _make_sqlite_connection(self) -> "SqliteConnection":
         from sqlite3 import connect
 
         return connect(database=self.connection_config.database)
 
     @requires_dependencies(["psycopg2"], extras="postgres")
-    def _make_psycopg_connection(self):
+    def _make_psycopg_connection(self) -> "PostgresConnection":
         from psycopg2 import connect
 
+        access_config = self.connection_config.access_config.get_secret_value()
         return connect(
-            user=self.connection_config.access_config.username,
-            password=self.connection_config.access_config.password,
+            user=access_config.username,
+            password=access_config.password,
             dbname=self.connection_config.database,
             host=self.connection_config.host,
             port=self.connection_config.port,
@@ -219,9 +234,7 @@ class SQLUploader(Uploader):
         for row in data:
             parsed = []
             for column_name, value in zip(columns, row):
-                if self.connection_config.db_type == DatabaseType.SQLITE and isinstance(
-                    value, (list, dict)
-                ):
+                if self.connection_config.db_type == SQLITE_DB and isinstance(value, (list, dict)):
                     value = json.dumps(value)
                 if column_name in _DATE_COLUMNS:
                     if value is None:
@@ -240,14 +253,14 @@
 
         columns = tuple(df.columns)
         stmt = f"INSERT INTO {ELEMENTS_TABLE_NAME} ({','.join(columns)}) \
-            VALUES({','.join(['?' if self.connection_config.db_type==DatabaseType.SQLITE else '%s' for x in columns])})"  # noqa E501
+            VALUES({','.join(['?' if self.connection_config.db_type==SQLITE_DB else '%s' for x in columns])})"  # noqa E501
 
         for rows in pd.read_json(
             content.path, orient="records", lines=True, chunksize=self.upload_config.batch_size
         ):
             with self.connection() as conn:
                 values = self.prepare_data(columns, tuple(rows.itertuples(index=False, name=None)))
-                if self.connection_config.db_type == DatabaseType.SQLITE:
+                if self.connection_config.db_type == SQLITE_DB:
                     conn.executemany(stmt, values)
                 else:
                     with conn.cursor() as cur:
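The branch on `db_type` in this statement exists because the two DB-API drivers use different parameter placeholders: `sqlite3` uses qmark style (`?`) while `psycopg2` uses format style (`%s`). A standalone illustration of the statement construction; the table and column names here are made up:

# sqlite3 -> qmark placeholders; psycopg2 -> format placeholders
columns = ("id", "text", "type")

for db_type in ("sqlite", "postgresql"):
    placeholder = "?" if db_type == "sqlite" else "%s"
    stmt = (
        f"INSERT INTO elements ({','.join(columns)}) "
        f"VALUES({','.join([placeholder] * len(columns))})"
    )
    print(db_type, "->", stmt)
# sqlite -> INSERT INTO elements (id,text,type) VALUES(?,?,?)
# postgresql -> INSERT INTO elements (id,text,type) VALUES(%s,%s,%s)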
@@ -261,7 +274,7 @@
 
 
 sql_destination_entry = DestinationRegistryEntry(
-    connection_config=SimpleSqlConfig,
+    connection_config=SQLConnectionConfig,
     uploader=SQLUploader,
     uploader_config=SQLUploaderConfig,
     upload_stager=SQLUploadStager,
unstructured_ingest/v2/processes/connectors/weaviate.py

@@ -5,8 +5,9 @@ from pathlib import Path
 from typing import TYPE_CHECKING, Any, Optional
 
 from dateutil import parser
+from pydantic import Field, Secret
 
-from unstructured_ingest.enhanced_dataclass import enhanced_field
+from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
@@ -29,27 +30,37 @@ if TYPE_CHECKING:
 CONNECTOR_TYPE = "weaviate"
 
 
-@dataclass
 class WeaviateAccessConfig(AccessConfig):
-    access_token: Optional[str] = None
+    access_token: Optional[str] = Field(
+        default=None, description="Used to create the bearer token."
+    )
     api_key: Optional[str] = None
     client_secret: Optional[str] = None
     password: Optional[str] = None
 
 
-@dataclass
+SecretWeaviateAccessConfig = Secret[WeaviateAccessConfig]
+
+
 class WeaviateConnectionConfig(ConnectionConfig):
-    host_url: str
-    class_name: str
-    access_config: WeaviateAccessConfig = enhanced_field(sensitive=True)
+    host_url: str = Field(description="Weaviate instance url")
+    class_name: str = Field(
+        description="Name of the class to push the records into, e.g: Pdf-elements"
+    )
+    access_config: SecretWeaviateAccessConfig = Field(
+        default_factory=lambda: SecretWeaviateAccessConfig(secret_value=WeaviateAccessConfig())
+    )
     username: Optional[str] = None
-    anonymous: bool = False
+    anonymous: bool = Field(default=False, description="if set, all auth values will be ignored")
     scope: Optional[list[str]] = None
-    refresh_token: Optional[str] = None
-    connector_type: str = CONNECTOR_TYPE
+    refresh_token: Optional[str] = Field(
+        default=None,
+        description="Will tie this value to the bearer token. If not provided, "
+        "the authentication will expire once the lifetime of the access token is up.",
+    )
+    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
 
 
-@dataclass
 class WeaviateUploadStagerConfig(UploadStagerConfig):
     pass
 
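Unlike the Sharepoint config above, where `access_config` is required, here the `Secret` field gets a `default_factory` wrapping an empty access config, so the field can be omitted entirely (which fits the `anonymous=True` path). Sketched with illustrative stand-in names:

from typing import Optional

from pydantic import BaseModel, Field, Secret


class DemoAccessConfig(BaseModel):  # stands in for WeaviateAccessConfig
    api_key: Optional[str] = None


SecretDemoAccessConfig = Secret[DemoAccessConfig]


class DemoConnectionConfig(BaseModel):  # stands in for WeaviateConnectionConfig
    host_url: str
    # default_factory wraps an empty access config, so callers may omit the field:
    access_config: SecretDemoAccessConfig = Field(
        default_factory=lambda: SecretDemoAccessConfig(secret_value=DemoAccessConfig())
    )


anonymous = DemoConnectionConfig(host_url="http://localhost:8080")
print(anonymous.access_config.get_secret_value().api_key)  # None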
@@ -147,24 +158,29 @@ class WeaviateUploadStager(UploadStager):
         return output_path
 
 
-@dataclass
 class WeaviateUploaderConfig(UploaderConfig):
-    batch_size: int = 100
+    batch_size: int = Field(default=100, description="Number of records per batch")
 
 
 @dataclass
 class WeaviateUploader(Uploader):
     upload_config: WeaviateUploaderConfig
     connection_config: WeaviateConnectionConfig
-    client: Optional["Client"] = field(init=False)
     connector_type: str = CONNECTOR_TYPE
 
     @requires_dependencies(["weaviate"], extras="weaviate")
-    def __post_init__(self):
+    def get_client(self) -> "Client":
         from weaviate import Client
 
         auth = self._resolve_auth_method()
-        self.client = Client(url=self.connection_config.host_url, auth_client_secret=auth)
+        return Client(url=self.connection_config.host_url, auth_client_secret=auth)
+
+    def precheck(self) -> None:
+        try:
+            self.get_client()
+        except Exception as e:
+            logger.error(f"Failed to validate connection {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
 
     @requires_dependencies(["weaviate"], extras="weaviate")
     def _resolve_auth_method(self):
@@ -215,8 +231,9 @@ class WeaviateUploader(Uploader):
             f"at {self.connection_config.host_url}",
         )
 
-        self.client.batch.configure(batch_size=self.upload_config.batch_size)
-        with self.client.batch as b:
+        client = self.get_client()
+        client.batch.configure(batch_size=self.upload_config.batch_size)
+        with client.batch as b:
             for e in elements_dict:
                 vector = e.pop("embeddings", None)
                 b.add_data_object(
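The change above completes the move away from stateful clients: previously `__post_init__` connected at construction time and cached the client on the instance, while now `precheck()` and the upload path each build a short-lived client via `get_client()`. A schematic contrast of the two shapes, with `connect` as a hypothetical stand-in for `weaviate.Client`:

from dataclasses import dataclass, field
from typing import Any, Optional


def connect(url: str) -> Any:  # hypothetical stand-in for weaviate.Client(url=...)
    return object()


@dataclass
class EagerUploader:  # the 0.0.2.dev0 shape
    host_url: str
    client: Optional[Any] = field(init=False)

    def __post_init__(self):
        self.client = connect(self.host_url)  # connects as a side effect of construction


@dataclass
class LazyUploader:  # the 0.0.4 shape
    host_url: str

    def get_client(self) -> Any:
        return connect(self.host_url)  # connects only when actually needed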