unstructured-ingest 0.0.3__py3-none-any.whl → 0.0.4__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (123)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/cli.py +6 -1
  3. unstructured_ingest/cli/cmds/__init__.py +4 -4
  4. unstructured_ingest/cli/cmds/{astra.py → astradb.py} +9 -9
  5. unstructured_ingest/cli/interfaces.py +13 -6
  6. unstructured_ingest/connector/{astra.py → astradb.py} +29 -29
  7. unstructured_ingest/connector/biomed.py +12 -5
  8. unstructured_ingest/connector/confluence.py +3 -3
  9. unstructured_ingest/connector/github.py +3 -2
  10. unstructured_ingest/connector/google_drive.py +1 -2
  11. unstructured_ingest/connector/mongodb.py +1 -2
  12. unstructured_ingest/connector/notion/client.py +31 -16
  13. unstructured_ingest/connector/notion/connector.py +3 -2
  14. unstructured_ingest/connector/registry.py +2 -2
  15. unstructured_ingest/connector/vectara.py +7 -2
  16. unstructured_ingest/interfaces.py +13 -9
  17. unstructured_ingest/pipeline/interfaces.py +8 -3
  18. unstructured_ingest/pipeline/reformat/chunking.py +13 -9
  19. unstructured_ingest/pipeline/reformat/embedding.py +3 -3
  20. unstructured_ingest/runner/__init__.py +2 -2
  21. unstructured_ingest/runner/{astra.py → astradb.py} +7 -7
  22. unstructured_ingest/runner/writers/__init__.py +2 -2
  23. unstructured_ingest/runner/writers/{astra.py → astradb.py} +7 -7
  24. unstructured_ingest/utils/chunking.py +45 -0
  25. unstructured_ingest/utils/dep_check.py +1 -1
  26. unstructured_ingest/utils/google_filetype.py +9 -0
  27. unstructured_ingest/v2/cli/base/cmd.py +57 -13
  28. unstructured_ingest/v2/cli/base/dest.py +21 -12
  29. unstructured_ingest/v2/cli/base/src.py +35 -23
  30. unstructured_ingest/v2/cli/cmds.py +14 -0
  31. unstructured_ingest/v2/cli/{utils.py → utils/click.py} +36 -89
  32. unstructured_ingest/v2/cli/utils/model_conversion.py +199 -0
  33. unstructured_ingest/v2/interfaces/connector.py +5 -7
  34. unstructured_ingest/v2/interfaces/downloader.py +8 -5
  35. unstructured_ingest/v2/interfaces/file_data.py +8 -2
  36. unstructured_ingest/v2/interfaces/indexer.py +3 -4
  37. unstructured_ingest/v2/interfaces/processor.py +10 -10
  38. unstructured_ingest/v2/interfaces/upload_stager.py +3 -3
  39. unstructured_ingest/v2/interfaces/uploader.py +3 -3
  40. unstructured_ingest/v2/pipeline/pipeline.py +1 -5
  41. unstructured_ingest/v2/pipeline/steps/chunk.py +5 -11
  42. unstructured_ingest/v2/pipeline/steps/download.py +13 -11
  43. unstructured_ingest/v2/pipeline/steps/embed.py +5 -11
  44. unstructured_ingest/v2/pipeline/steps/filter.py +1 -6
  45. unstructured_ingest/v2/pipeline/steps/index.py +14 -10
  46. unstructured_ingest/v2/pipeline/steps/partition.py +5 -5
  47. unstructured_ingest/v2/pipeline/steps/stage.py +4 -7
  48. unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -6
  49. unstructured_ingest/v2/pipeline/steps/upload.py +2 -9
  50. unstructured_ingest/v2/processes/__init__.py +18 -0
  51. unstructured_ingest/v2/processes/chunker.py +74 -28
  52. unstructured_ingest/v2/processes/connector_registry.py +8 -2
  53. unstructured_ingest/v2/processes/connectors/__init__.py +13 -3
  54. unstructured_ingest/v2/processes/connectors/{astra.py → astradb.py} +45 -35
  55. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +30 -27
  56. unstructured_ingest/v2/processes/connectors/chroma.py +30 -21
  57. unstructured_ingest/v2/processes/connectors/couchbase.py +151 -0
  58. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +87 -32
  59. unstructured_ingest/v2/processes/connectors/elasticsearch.py +70 -45
  60. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +39 -16
  61. unstructured_ingest/v2/processes/connectors/fsspec/box.py +15 -13
  62. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +10 -11
  63. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +20 -34
  64. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +38 -13
  65. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +31 -17
  66. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -28
  67. unstructured_ingest/v2/processes/connectors/google_drive.py +40 -34
  68. unstructured_ingest/v2/processes/connectors/local.py +22 -14
  69. unstructured_ingest/v2/processes/connectors/milvus.py +22 -18
  70. unstructured_ingest/v2/processes/connectors/mongodb.py +22 -18
  71. unstructured_ingest/v2/processes/connectors/onedrive.py +17 -14
  72. unstructured_ingest/v2/processes/connectors/opensearch.py +66 -56
  73. unstructured_ingest/v2/processes/connectors/pinecone.py +23 -20
  74. unstructured_ingest/v2/processes/connectors/salesforce.py +26 -18
  75. unstructured_ingest/v2/processes/connectors/sharepoint.py +51 -26
  76. unstructured_ingest/v2/processes/connectors/singlestore.py +11 -15
  77. unstructured_ingest/v2/processes/connectors/sql.py +29 -31
  78. unstructured_ingest/v2/processes/connectors/weaviate.py +22 -13
  79. unstructured_ingest/v2/processes/embedder.py +106 -47
  80. unstructured_ingest/v2/processes/filter.py +11 -5
  81. unstructured_ingest/v2/processes/partitioner.py +79 -33
  82. unstructured_ingest/v2/processes/uncompress.py +3 -3
  83. unstructured_ingest/v2/utils.py +45 -0
  84. unstructured_ingest-0.0.4.dist-info/METADATA +571 -0
  85. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.4.dist-info}/RECORD +89 -116
  86. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.4.dist-info}/WHEEL +1 -1
  87. unstructured_ingest/v2/cli/cmds/__init__.py +0 -89
  88. unstructured_ingest/v2/cli/cmds/astra.py +0 -85
  89. unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +0 -72
  90. unstructured_ingest/v2/cli/cmds/chroma.py +0 -108
  91. unstructured_ingest/v2/cli/cmds/databricks_volumes.py +0 -161
  92. unstructured_ingest/v2/cli/cmds/elasticsearch.py +0 -159
  93. unstructured_ingest/v2/cli/cmds/fsspec/azure.py +0 -84
  94. unstructured_ingest/v2/cli/cmds/fsspec/box.py +0 -58
  95. unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +0 -58
  96. unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +0 -69
  97. unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +0 -81
  98. unstructured_ingest/v2/cli/cmds/fsspec/s3.py +0 -84
  99. unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +0 -80
  100. unstructured_ingest/v2/cli/cmds/google_drive.py +0 -74
  101. unstructured_ingest/v2/cli/cmds/local.py +0 -52
  102. unstructured_ingest/v2/cli/cmds/milvus.py +0 -72
  103. unstructured_ingest/v2/cli/cmds/mongodb.py +0 -62
  104. unstructured_ingest/v2/cli/cmds/onedrive.py +0 -91
  105. unstructured_ingest/v2/cli/cmds/opensearch.py +0 -93
  106. unstructured_ingest/v2/cli/cmds/pinecone.py +0 -62
  107. unstructured_ingest/v2/cli/cmds/salesforce.py +0 -79
  108. unstructured_ingest/v2/cli/cmds/sharepoint.py +0 -112
  109. unstructured_ingest/v2/cli/cmds/singlestore.py +0 -96
  110. unstructured_ingest/v2/cli/cmds/sql.py +0 -84
  111. unstructured_ingest/v2/cli/cmds/weaviate.py +0 -100
  112. unstructured_ingest/v2/cli/configs/__init__.py +0 -13
  113. unstructured_ingest/v2/cli/configs/chunk.py +0 -89
  114. unstructured_ingest/v2/cli/configs/embed.py +0 -74
  115. unstructured_ingest/v2/cli/configs/filter.py +0 -28
  116. unstructured_ingest/v2/cli/configs/partition.py +0 -99
  117. unstructured_ingest/v2/cli/configs/processor.py +0 -88
  118. unstructured_ingest/v2/cli/interfaces.py +0 -27
  119. unstructured_ingest/v2/pipeline/utils.py +0 -15
  120. unstructured_ingest-0.0.3.dist-info/METADATA +0 -175
  121. /unstructured_ingest/v2/cli/{cmds/fsspec → utils}/__init__.py +0 -0
  122. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.4.dist-info}/entry_points.txt +0 -0
  123. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.4.dist-info}/top_level.txt +0 -0
@@ -1,7 +1,9 @@
1
1
  from dataclasses import dataclass, field
2
+ from pathlib import Path
2
3
  from typing import TYPE_CHECKING, Optional
3
4
 
4
- from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin, enhanced_field
5
+ from pydantic import BaseModel, Field, Secret
6
+
5
7
  from unstructured_ingest.error import (
6
8
  DestinationConnectionError,
7
9
  )
@@ -35,20 +37,28 @@ CONNECTOR_TYPE = "opensearch"
35
37
  heavily on the Elasticsearch connector code, inheriting the functionality as much as possible."""
36
38
 
37
39
 
38
- @dataclass
39
40
  class OpenSearchAccessConfig(AccessConfig):
40
- password: Optional[str] = enhanced_field(default=None, sensitive=True)
41
- use_ssl: bool = False
42
- verify_certs: bool = False
43
- ssl_show_warn: bool = False
44
- ca_certs: Optional[str] = None
45
- client_cert: Optional[str] = None
46
- client_key: Optional[str] = None
47
-
48
-
49
- @dataclass
50
- class OpenSearchClientInput(EnhancedDataClassJsonMixin):
51
- http_auth: Optional[tuple[str, str]] = enhanced_field(sensitive=True, default=None)
41
+ password: Optional[str] = Field(default=None, description="password when using basic auth")
42
+ use_ssl: bool = Field(default=False, description="use ssl for the connection")
43
+ verify_certs: bool = Field(default=False, description="whether to verify SSL certificates")
44
+ ssl_show_warn: bool = Field(
45
+ default=False, description="show warning when verify certs is disabled"
46
+ )
47
+ ca_certs: Optional[Path] = Field(default=None, description="path to CA bundle")
48
+ client_cert: Optional[Path] = Field(
49
+ default=None,
50
+ description="path to the file containing the private key and the certificate,"
51
+ " or cert only if using client_key",
52
+ )
53
+ client_key: Optional[Path] = Field(
54
+ default=None,
55
+ description="path to the file containing the private key"
56
+ " if using separate cert and key files",
57
+ )
58
+
59
+
60
+ class OpenSearchClientInput(BaseModel):
61
+ http_auth: Secret[Optional[tuple[str, str]]] = None
52
62
  hosts: Optional[list[str]] = None
53
63
  use_ssl: bool = False
54
64
  verify_certs: bool = False
@@ -58,37 +68,41 @@ class OpenSearchClientInput(EnhancedDataClassJsonMixin):
58
68
  client_key: Optional[str] = None
59
69
 
60
70
 
61
- @dataclass
62
71
  class OpenSearchConnectionConfig(ConnectionConfig):
63
- hosts: Optional[list[str]] = None
64
- username: Optional[str] = None
65
- access_config: OpenSearchAccessConfig = enhanced_field(sensitive=True)
72
+ hosts: Optional[list[str]] = Field(
73
+ default=None,
74
+ description="List of the OpenSearch hosts to connect",
75
+ examples=["http://localhost:9200"],
76
+ )
77
+ username: Optional[str] = Field(default=None, description="username when using basic auth")
78
+ access_config: Secret[OpenSearchAccessConfig]
66
79
 
67
80
  def get_client_kwargs(self) -> dict:
68
81
  # Update auth related fields to conform to what the SDK expects based on the
69
82
  # supported methods:
70
83
  # https://github.com/opensearch-project/opensearch-py/blob/main/opensearchpy/client/__init__.py
71
- client_input = OpenSearchClientInput()
84
+ access_config = self.access_config.get_secret_value()
85
+ client_input_kwargs = {}
72
86
  if self.hosts:
73
- client_input.hosts = self.hosts
74
- if self.access_config.use_ssl:
75
- client_input.use_ssl = self.access_config.use_ssl
76
- if self.access_config.verify_certs:
77
- client_input.verify_certs = self.access_config.verify_certs
78
- if self.access_config.ssl_show_warn:
79
- client_input.ssl_show_warn = self.access_config.ssl_show_warn
80
- if self.access_config.ca_certs:
81
- client_input.ca_certs = self.access_config.ca_certs
82
- if self.access_config.client_cert:
83
- client_input.client_cert = self.access_config.client_cert
84
- if self.access_config.client_key:
85
- client_input.client_key = self.access_config.client_key
86
- if self.username and self.access_config.password:
87
- client_input.http_auth = (self.username, self.access_config.password)
88
- logger.debug(
89
- f"OpenSearch client inputs mapped to: {client_input.to_dict(redact_sensitive=True)}"
90
- )
91
- client_kwargs = client_input.to_dict(redact_sensitive=False)
87
+ client_input_kwargs["hosts"] = self.hosts
88
+ if access_config.use_ssl:
89
+ client_input_kwargs["use_ssl"] = access_config.use_ssl
90
+ if access_config.verify_certs:
91
+ client_input_kwargs["verify_certs"] = access_config.verify_certs
92
+ if access_config.ssl_show_warn:
93
+ client_input_kwargs["ssl_show_warn"] = access_config.ssl_show_warn
94
+ if access_config.ca_certs:
95
+ client_input_kwargs["ca_certs"] = str(access_config.ca_certs)
96
+ if access_config.client_cert:
97
+ client_input_kwargs["client_cert"] = str(access_config.client_cert)
98
+ if access_config.client_key:
99
+ client_input_kwargs["client_key"] = str(access_config.client_key)
100
+ if self.username and access_config.password:
101
+ client_input_kwargs["http_auth"] = (self.username, access_config.password)
102
+ client_input = OpenSearchClientInput(**client_input_kwargs)
103
+ logger.debug(f"OpenSearch client inputs mapped to: {client_input.dict()}")
104
+ client_kwargs = client_input.dict()
105
+ client_kwargs["http_auth"] = client_input.http_auth.get_secret_value()
92
106
  client_kwargs = {k: v for k, v in client_kwargs.items() if v is not None}
93
107
  return client_kwargs
94
108
 
@@ -100,15 +114,14 @@ class OpenSearchConnectionConfig(ConnectionConfig):
100
114
  return OpenSearch(**self.get_client_kwargs())
101
115
 
102
116
 
103
- @dataclass
104
- class OpensearchIndexerConfig(ElasticsearchIndexerConfig):
117
+ class OpenSearchIndexerConfig(ElasticsearchIndexerConfig):
105
118
  pass
106
119
 
107
120
 
108
121
  @dataclass
109
122
  class OpenSearchIndexer(ElasticsearchIndexer):
110
123
  connection_config: OpenSearchConnectionConfig
111
- index_config: OpensearchIndexerConfig
124
+ index_config: OpenSearchIndexerConfig
112
125
  client: "OpenSearch" = field(init=False)
113
126
 
114
127
  @requires_dependencies(["opensearchpy"], extras="opensearch")
@@ -118,15 +131,14 @@ class OpenSearchIndexer(ElasticsearchIndexer):
118
131
  return scan
119
132
 
120
133
 
121
- @dataclass
122
- class OpensearchDownloaderConfig(ElasticsearchDownloaderConfig):
134
+ class OpenSearchDownloaderConfig(ElasticsearchDownloaderConfig):
123
135
  pass
124
136
 
125
137
 
126
138
  @dataclass
127
139
  class OpenSearchDownloader(ElasticsearchDownloader):
128
140
  connection_config: OpenSearchConnectionConfig
129
- download_config: OpensearchDownloaderConfig
141
+ download_config: OpenSearchDownloaderConfig
130
142
  connector_type: str = CONNECTOR_TYPE
131
143
 
132
144
  @requires_dependencies(["opensearchpy"], extras="opensearch")
@@ -137,15 +149,14 @@ class OpenSearchDownloader(ElasticsearchDownloader):
137
149
  return AsyncOpenSearch, async_scan
138
150
 
139
151
 
140
- @dataclass
141
- class OpensearchUploaderConfig(ElasticsearchUploaderConfig):
152
+ class OpenSearchUploaderConfig(ElasticsearchUploaderConfig):
142
153
  pass
143
154
 
144
155
 
145
156
  @dataclass
146
157
  class OpenSearchUploader(ElasticsearchUploader):
147
158
  connection_config: OpenSearchConnectionConfig
148
- upload_config: OpensearchUploaderConfig
159
+ upload_config: OpenSearchUploaderConfig
149
160
  connector_type: str = CONNECTOR_TYPE
150
161
 
151
162
  @requires_dependencies(["opensearchpy"], extras="opensearch")
@@ -155,29 +166,28 @@ class OpenSearchUploader(ElasticsearchUploader):
155
166
  return parallel_bulk
156
167
 
157
168
 
158
- @dataclass
159
- class OpensearchUploadStagerConfig(ElasticsearchUploadStagerConfig):
169
+ class OpenSearchUploadStagerConfig(ElasticsearchUploadStagerConfig):
160
170
  pass
161
171
 
162
172
 
163
173
  @dataclass
164
- class OpensearchUploadStager(ElasticsearchUploadStager):
165
- upload_stager_config: OpensearchUploadStagerConfig
174
+ class OpenSearchUploadStager(ElasticsearchUploadStager):
175
+ upload_stager_config: OpenSearchUploadStagerConfig
166
176
 
167
177
 
168
178
  opensearch_source_entry = SourceRegistryEntry(
169
179
  connection_config=OpenSearchConnectionConfig,
170
180
  indexer=OpenSearchIndexer,
171
- indexer_config=OpensearchIndexerConfig,
181
+ indexer_config=OpenSearchIndexerConfig,
172
182
  downloader=OpenSearchDownloader,
173
- downloader_config=OpensearchDownloaderConfig,
183
+ downloader_config=OpenSearchDownloaderConfig,
174
184
  )
175
185
 
176
186
 
177
187
  opensearch_destination_entry = DestinationRegistryEntry(
178
188
  connection_config=OpenSearchConnectionConfig,
179
- upload_stager_config=OpensearchUploadStagerConfig,
180
- upload_stager=OpensearchUploadStager,
181
- uploader_config=OpensearchUploaderConfig,
189
+ upload_stager_config=OpenSearchUploadStagerConfig,
190
+ upload_stager=OpenSearchUploadStager,
191
+ uploader_config=OpenSearchUploaderConfig,
182
192
  uploader=OpenSearchUploader,
183
193
  )
@@ -5,12 +5,11 @@ from dataclasses import dataclass, field
5
5
  from pathlib import Path
6
6
  from typing import TYPE_CHECKING, Any, Optional
7
7
 
8
- from unstructured.staging.base import flatten_dict
9
- from unstructured.utils import requires_dependencies
8
+ from pydantic import Field, Secret
10
9
 
11
- from unstructured_ingest.enhanced_dataclass import enhanced_field
12
10
  from unstructured_ingest.error import DestinationConnectionError
13
- from unstructured_ingest.utils.data_prep import batch_generator
11
+ from unstructured_ingest.utils.data_prep import batch_generator, flatten_dict
12
+ from unstructured_ingest.utils.dep_check import requires_dependencies
14
13
  from unstructured_ingest.v2.interfaces import (
15
14
  AccessConfig,
16
15
  ConnectionConfig,
@@ -32,25 +31,31 @@ if TYPE_CHECKING:
32
31
  CONNECTOR_TYPE = "pinecone"
33
32
 
34
33
 
35
- @dataclass
36
34
  class PineconeAccessConfig(AccessConfig):
37
- api_key: Optional[str] = enhanced_field(default=None, overload_name="pinecone_api_key")
35
+ pinecone_api_key: Optional[str] = Field(
36
+ default=None, description="API key for Pinecone.", alias="api_key"
37
+ )
38
+
39
+
40
+ SecretPineconeAccessConfig = Secret[PineconeAccessConfig]
38
41
 
39
42
 
40
- @dataclass
41
43
  class PineconeConnectionConfig(ConnectionConfig):
42
- index_name: str
43
- environment: str
44
- access_config: PineconeAccessConfig = enhanced_field(sensitive=True)
44
+ index_name: str = Field(description="Name of the index to connect to.")
45
+ environment: str = Field(description="Environment to connect to.")
46
+ access_config: SecretPineconeAccessConfig = Field(
47
+ default_factory=lambda: SecretPineconeAccessConfig(secret_value=PineconeAccessConfig())
48
+ )
45
49
 
46
50
  @requires_dependencies(["pinecone"], extras="pinecone")
47
51
  def get_index(self) -> "PineconeIndex":
48
52
  from pinecone import Pinecone
49
- from unstructured import __version__ as unstructured_version
53
+
54
+ from unstructured_ingest import __version__ as unstructured_version
50
55
 
51
56
  pc = Pinecone(
52
- api_key=self.access_config.api_key,
53
- source_tag=f"unstructured=={unstructured_version}",
57
+ api_key=self.access_config.get_secret_value().pinecone_api_key,
58
+ source_tag=f"unstructured_ingest=={unstructured_version}",
54
59
  )
55
60
 
56
61
  index = pc.Index(self.index_name)
@@ -58,15 +63,13 @@ class PineconeConnectionConfig(ConnectionConfig):
58
63
  return index
59
64
 
60
65
 
61
- @dataclass
62
66
  class PineconeUploadStagerConfig(UploadStagerConfig):
63
67
  pass
64
68
 
65
69
 
66
- @dataclass
67
70
  class PineconeUploaderConfig(UploaderConfig):
68
- batch_size: int = 100
69
- num_of_processes: int = 4
71
+ batch_size: int = Field(default=100, description="Number of records per batch")
72
+ num_processes: int = Field(default=4, description="Number of processes to use for uploading")
70
73
 
71
74
 
72
75
  @dataclass
@@ -154,18 +157,18 @@ class PineconeUploader(Uploader):
154
157
  f" index named {self.connection_config.index_name}"
155
158
  f" environment named {self.connection_config.environment}"
156
159
  f" with batch size {self.upload_config.batch_size}"
157
- f" with {self.upload_config.num_of_processes} (number of) processes"
160
+ f" with {self.upload_config.num_processes} (number of) processes"
158
161
  )
159
162
 
160
163
  pinecone_batch_size = self.upload_config.batch_size
161
164
 
162
- if self.upload_config.num_of_processes == 1:
165
+ if self.upload_config.num_processes == 1:
163
166
  for batch in batch_generator(elements_dict, pinecone_batch_size):
164
167
  self.upsert_batch(batch) # noqa: E203
165
168
 
166
169
  else:
167
170
  with mp.Pool(
168
- processes=self.upload_config.num_of_processes,
171
+ processes=self.upload_config.num_processes,
169
172
  ) as pool:
170
173
  pool.map(
171
174
  self.upsert_batch, list(batch_generator(elements_dict, pinecone_batch_size))
@@ -15,11 +15,11 @@ from email.utils import formatdate
15
15
  from pathlib import Path
16
16
  from string import Template
17
17
  from textwrap import dedent
18
- from typing import TYPE_CHECKING, Any, Generator, Type
18
+ from typing import TYPE_CHECKING, Any, Generator, Optional, Type
19
19
 
20
20
  from dateutil import parser
21
+ from pydantic import Field, Secret
21
22
 
22
- from unstructured_ingest.enhanced_dataclass import enhanced_field
23
23
  from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
24
24
  from unstructured_ingest.utils.dep_check import requires_dependencies
25
25
  from unstructured_ingest.v2.interfaces import (
@@ -75,49 +75,58 @@ $htmlbody
75
75
  )
76
76
 
77
77
 
78
- @dataclass
79
78
  class SalesforceAccessConfig(AccessConfig):
80
79
  consumer_key: str
81
- private_key: str
80
+ private_key_path: Optional[Path] = Field(
81
+ default=None,
82
+ description="Path to the private key file. " "Key file is usually named server.key.",
83
+ )
84
+ private_key: Optional[str] = Field(default=None, description="Contents of the private key")
85
+
86
+ def model_post_init(self, __context: Any) -> None:
87
+ if self.private_key_path is None and self.private_key is None:
88
+ raise ValueError("either private_key or private_key_path must be set")
89
+ if self.private_key is not None and self.private_key_path is not None:
90
+ raise ValueError("only one of private_key or private_key_path must be set")
82
91
 
83
92
  @requires_dependencies(["cryptography"])
84
93
  def get_private_key_value_and_type(self) -> tuple[str, Type]:
85
94
  from cryptography.hazmat.primitives import serialization
86
95
 
87
- try:
88
- serialization.load_pem_private_key(data=self.private_key.encode("utf-8"), password=None)
89
- except ValueError:
90
- pass
91
- else:
96
+ if self.private_key_path and self.private_key_path.is_file():
97
+ return str(self.private_key_path), Path
98
+ if self.private_key:
99
+ try:
100
+ serialization.load_pem_private_key(
101
+ data=str(self.private_key).encode("utf-8"), password=None
102
+ )
103
+ except Exception as e:
104
+ raise ValueError(f"failed to validate private key data: {e}") from e
92
105
  return self.private_key, str
93
106
 
94
- if Path(self.private_key).is_file():
95
- return self.private_key, Path
96
-
97
107
  raise ValueError("private_key does not contain PEM private key or path")
98
108
 
99
109
 
100
- @dataclass
101
110
  class SalesforceConnectionConfig(ConnectionConfig):
102
111
  username: str
103
- access_config: SalesforceAccessConfig = enhanced_field(sensitive=True)
112
+ access_config: Secret[SalesforceAccessConfig]
104
113
 
105
114
  @requires_dependencies(["simple_salesforce"], extras="salesforce")
106
115
  def get_client(self) -> "Salesforce":
107
116
  from simple_salesforce import Salesforce
108
117
 
109
- pkey_value, pkey_type = self.access_config.get_private_key_value_and_type()
118
+ access_config = self.access_config.get_secret_value()
119
+ pkey_value, pkey_type = access_config.get_private_key_value_and_type()
110
120
 
111
121
  return Salesforce(
112
122
  username=self.username,
113
- consumer_key=self.access_config.consumer_key,
123
+ consumer_key=access_config.consumer_key,
114
124
  privatekey_file=pkey_value if pkey_type is Path else None,
115
125
  privatekey=pkey_value if pkey_type is str else None,
116
126
  version=SALESFORCE_API_VERSION,
117
127
  )
118
128
 
119
129
 
120
- @dataclass
121
130
  class SalesforceIndexerConfig(IndexerConfig):
122
131
  categories: list[str]
123
132
 
@@ -201,7 +210,6 @@ class SalesforceIndexer(Indexer):
201
210
  yield f
202
211
 
203
212
 
204
- @dataclass
205
213
  class SalesforceDownloaderConfig(DownloaderConfig):
206
214
  pass
207
215
 
@@ -6,7 +6,8 @@ from time import time
6
6
  from typing import TYPE_CHECKING, Any, Generator, Optional
7
7
  from urllib.parse import quote
8
8
 
9
- from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin, enhanced_field
9
+ from pydantic import BaseModel, Field, Secret, SecretStr
10
+
10
11
  from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
11
12
  from unstructured_ingest.utils.dep_check import requires_dependencies
12
13
  from unstructured_ingest.v2.interfaces import (
@@ -54,24 +55,38 @@ class SharepointContentType(Enum):
54
55
  LIST = "list"
55
56
 
56
57
 
57
- @dataclass
58
58
  class SharepointAccessConfig(AccessConfig):
59
- client_cred: str
60
-
61
-
62
- @dataclass
63
- class SharepointPermissionsConfig(EnhancedDataClassJsonMixin):
64
- permissions_application_id: str
65
- permissions_tenant: str
66
- permissions_client_cred: str = enhanced_field(sensitive=True)
67
- authority_url: Optional[str] = field(repr=False, default="https://login.microsoftonline.com")
59
+ client_cred: str = Field(description="Sharepoint app secret")
60
+
61
+
62
+ class SharepointPermissionsConfig(BaseModel):
63
+ permissions_application_id: str = Field(description="Microsoft Graph API application id")
64
+ permissions_tenant: str = Field(
65
+ description="url to get permissions data within tenant.",
66
+ examples=["https://contoso.onmicrosoft.com"],
67
+ )
68
+ permissions_client_cred: SecretStr = Field(
69
+ description="Microsoft Graph API application credentials"
70
+ )
71
+ authority_url: Optional[SecretStr] = Field(
72
+ repr=False,
73
+ default_factory=lambda: SecretStr(secret_value="https://login.microsoftonline.com"),
74
+ description="Permissions authority url",
75
+ examples=["https://login.microsoftonline.com"],
76
+ )
68
77
 
69
78
 
70
- @dataclass
71
79
  class SharepointConnectionConfig(ConnectionConfig):
72
- client_id: str
73
- site: str
74
- access_config: SharepointAccessConfig = enhanced_field(sensitive=True)
80
+ client_id: str = Field(description="Sharepoint app client ID")
81
+ site: str = Field(
82
+ description="Sharepoint site url. Process either base url e.g \
83
+ https://[tenant].sharepoint.com or relative sites \
84
+ https://[tenant].sharepoint.com/sites/<site_name>. \
85
+ To process all sites within the tenant pass a site url as \
86
+ https://[tenant]-admin.sharepoint.com.\
87
+ This requires the app to be registered at a tenant level"
88
+ )
89
+ access_config: Secret[SharepointAccessConfig]
75
90
  permissions_config: Optional[SharepointPermissionsConfig] = None
76
91
 
77
92
  @requires_dependencies(["office365"], extras="sharepoint")
@@ -80,7 +95,9 @@ class SharepointConnectionConfig(ConnectionConfig):
80
95
  from office365.sharepoint.client_context import ClientContext
81
96
 
82
97
  try:
83
- credentials = ClientCredential(self.client_id, self.access_config.client_cred)
98
+ credentials = ClientCredential(
99
+ self.client_id, self.access_config.get_secret_value().client_cred
100
+ )
84
101
  site_client = ClientContext(self.site).with_credentials(credentials)
85
102
  except Exception as e:
86
103
  logger.error(f"Couldn't set Sharepoint client: {e}")
@@ -92,11 +109,12 @@ class SharepointConnectionConfig(ConnectionConfig):
92
109
  from msal import ConfidentialClientApplication
93
110
 
94
111
  try:
112
+ client_credential = self.permissions_config.permissions_client_cred.get_secret_value()
95
113
  app = ConfidentialClientApplication(
96
- authority=f"{self.permissions_config.authority_url}/"
114
+ authority=f"{self.permissions_config.authority_url.get_secret_value()}/"
97
115
  f"{self.permissions_config.permissions_tenant}",
98
116
  client_id=self.permissions_config.permissions_application_id,
99
- client_credential=self.permissions_config.permissions_client_cred,
117
+ client_credential=client_credential,
100
118
  )
101
119
  token = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])
102
120
  except ValueError as exc:
@@ -119,13 +137,21 @@ class SharepointConnectionConfig(ConnectionConfig):
119
137
  return client
120
138
 
121
139
 
122
- @dataclass
123
140
  class SharepointIndexerConfig(IndexerConfig):
124
- path: Optional[str] = None
125
- recursive: bool = False
126
- omit_files: bool = False
127
- omit_pages: bool = False
128
- omit_lists: bool = False
141
+ path: Optional[str] = Field(
142
+ defaul=None,
143
+ description="Path from which to start parsing files. If the connector is to \
144
+ process all sites within the tenant this filter will be applied to \
145
+ all sites document libraries.",
146
+ )
147
+ recursive: bool = Field(
148
+ default=False,
149
+ description="Recursively download files in their respective folders "
150
+ "otherwise stop at the files in provided folder level.",
151
+ )
152
+ omit_files: bool = Field(default=False, description="Don't process files.")
153
+ omit_pages: bool = Field(default=False, description="Don't process site pages.")
154
+ omit_lists: bool = Field(default=False, description="Don't process lists.")
129
155
 
130
156
 
131
157
  @dataclass
@@ -310,7 +336,7 @@ class SharepointIndexer(Indexer):
310
336
  def process_permissions(self) -> bool:
311
337
  return (
312
338
  self.connection_config.permissions_config.permissions_tenant
313
- and self.connection_config.permissions_config.permissions_client_cred
339
+ and self.connection_config.permissions_config.permissions_client_cred.get_secret_value()
314
340
  and self.connection_config.permissions_config.permissions_application_id
315
341
  )
316
342
 
@@ -335,7 +361,6 @@ class SharepointIndexer(Indexer):
335
361
  yield file_data
336
362
 
337
363
 
338
- @dataclass
339
364
  class SharepointDownloaderConfig(DownloaderConfig):
340
365
  pass
341
366
 
@@ -7,8 +7,8 @@ from typing import TYPE_CHECKING, Any, Optional
7
7
  import numpy as np
8
8
  import pandas as pd
9
9
  from dateutil import parser
10
+ from pydantic import Field, Secret
10
11
 
11
- from unstructured_ingest.enhanced_dataclass import enhanced_field
12
12
  from unstructured_ingest.utils.data_prep import batch_generator
13
13
  from unstructured_ingest.utils.dep_check import requires_dependencies
14
14
  from unstructured_ingest.utils.table import convert_to_pandas_dataframe
@@ -33,18 +33,16 @@ if TYPE_CHECKING:
33
33
  CONNECTOR_TYPE = "singlestore"
34
34
 
35
35
 
36
- @dataclass
37
36
  class SingleStoreAccessConfig(AccessConfig):
38
- password: Optional[str] = None
37
+ password: Optional[str] = Field(default=None, description="SingleStore password")
39
38
 
40
39
 
41
- @dataclass
42
40
  class SingleStoreConnectionConfig(ConnectionConfig):
43
- host: Optional[str] = None
44
- port: Optional[int] = None
45
- user: Optional[str] = None
46
- database: Optional[str] = None
47
- access_config: SingleStoreAccessConfig = enhanced_field(sensitive=True)
41
+ host: Optional[str] = Field(default=None, description="SingleStore host")
42
+ port: Optional[int] = Field(default=None, description="SingleStore port")
43
+ user: Optional[str] = Field(default=None, description="SingleStore user")
44
+ database: Optional[str] = Field(default=None, description="SingleStore database")
45
+ access_config: Secret[SingleStoreAccessConfig]
48
46
 
49
47
  @requires_dependencies(["singlestoredb"], extras="singlestore")
50
48
  def get_connection(self) -> "Connection":
@@ -55,14 +53,13 @@ class SingleStoreConnectionConfig(ConnectionConfig):
55
53
  port=self.port,
56
54
  database=self.database,
57
55
  user=self.user,
58
- password=self.access_config.password,
56
+ password=self.access_config.get_secret_value().password,
59
57
  )
60
58
  return conn
61
59
 
62
60
 
63
- @dataclass
64
61
  class SingleStoreUploadStagerConfig(UploadStagerConfig):
65
- drop_empty_cols: bool = False
62
+ drop_empty_cols: bool = Field(default=False, description="Drop any columns that have no data")
66
63
 
67
64
 
68
65
  @dataclass
@@ -112,10 +109,9 @@ class SingleStoreUploadStager(UploadStager):
112
109
  return output_path
113
110
 
114
111
 
115
- @dataclass
116
112
  class SingleStoreUploaderConfig(UploaderConfig):
117
- table_name: str
118
- batch_size: int = 100
113
+ table_name: str = Field(description="SingleStore table to write contents to")
114
+ batch_size: int = Field(default=100, description="Batch size when writing to SingleStore")
119
115
 
120
116
 
121
117
  @dataclass