unstructured-ingest: 0.0.3 (py3-none-any.whl) → 0.0.4 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (123)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/cli.py +6 -1
  3. unstructured_ingest/cli/cmds/__init__.py +4 -4
  4. unstructured_ingest/cli/cmds/{astra.py → astradb.py} +9 -9
  5. unstructured_ingest/cli/interfaces.py +13 -6
  6. unstructured_ingest/connector/{astra.py → astradb.py} +29 -29
  7. unstructured_ingest/connector/biomed.py +12 -5
  8. unstructured_ingest/connector/confluence.py +3 -3
  9. unstructured_ingest/connector/github.py +3 -2
  10. unstructured_ingest/connector/google_drive.py +1 -2
  11. unstructured_ingest/connector/mongodb.py +1 -2
  12. unstructured_ingest/connector/notion/client.py +31 -16
  13. unstructured_ingest/connector/notion/connector.py +3 -2
  14. unstructured_ingest/connector/registry.py +2 -2
  15. unstructured_ingest/connector/vectara.py +7 -2
  16. unstructured_ingest/interfaces.py +13 -9
  17. unstructured_ingest/pipeline/interfaces.py +8 -3
  18. unstructured_ingest/pipeline/reformat/chunking.py +13 -9
  19. unstructured_ingest/pipeline/reformat/embedding.py +3 -3
  20. unstructured_ingest/runner/__init__.py +2 -2
  21. unstructured_ingest/runner/{astra.py → astradb.py} +7 -7
  22. unstructured_ingest/runner/writers/__init__.py +2 -2
  23. unstructured_ingest/runner/writers/{astra.py → astradb.py} +7 -7
  24. unstructured_ingest/utils/chunking.py +45 -0
  25. unstructured_ingest/utils/dep_check.py +1 -1
  26. unstructured_ingest/utils/google_filetype.py +9 -0
  27. unstructured_ingest/v2/cli/base/cmd.py +57 -13
  28. unstructured_ingest/v2/cli/base/dest.py +21 -12
  29. unstructured_ingest/v2/cli/base/src.py +35 -23
  30. unstructured_ingest/v2/cli/cmds.py +14 -0
  31. unstructured_ingest/v2/cli/{utils.py → utils/click.py} +36 -89
  32. unstructured_ingest/v2/cli/utils/model_conversion.py +199 -0
  33. unstructured_ingest/v2/interfaces/connector.py +5 -7
  34. unstructured_ingest/v2/interfaces/downloader.py +8 -5
  35. unstructured_ingest/v2/interfaces/file_data.py +8 -2
  36. unstructured_ingest/v2/interfaces/indexer.py +3 -4
  37. unstructured_ingest/v2/interfaces/processor.py +10 -10
  38. unstructured_ingest/v2/interfaces/upload_stager.py +3 -3
  39. unstructured_ingest/v2/interfaces/uploader.py +3 -3
  40. unstructured_ingest/v2/pipeline/pipeline.py +1 -5
  41. unstructured_ingest/v2/pipeline/steps/chunk.py +5 -11
  42. unstructured_ingest/v2/pipeline/steps/download.py +13 -11
  43. unstructured_ingest/v2/pipeline/steps/embed.py +5 -11
  44. unstructured_ingest/v2/pipeline/steps/filter.py +1 -6
  45. unstructured_ingest/v2/pipeline/steps/index.py +14 -10
  46. unstructured_ingest/v2/pipeline/steps/partition.py +5 -5
  47. unstructured_ingest/v2/pipeline/steps/stage.py +4 -7
  48. unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -6
  49. unstructured_ingest/v2/pipeline/steps/upload.py +2 -9
  50. unstructured_ingest/v2/processes/__init__.py +18 -0
  51. unstructured_ingest/v2/processes/chunker.py +74 -28
  52. unstructured_ingest/v2/processes/connector_registry.py +8 -2
  53. unstructured_ingest/v2/processes/connectors/__init__.py +13 -3
  54. unstructured_ingest/v2/processes/connectors/{astra.py → astradb.py} +45 -35
  55. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +30 -27
  56. unstructured_ingest/v2/processes/connectors/chroma.py +30 -21
  57. unstructured_ingest/v2/processes/connectors/couchbase.py +151 -0
  58. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +87 -32
  59. unstructured_ingest/v2/processes/connectors/elasticsearch.py +70 -45
  60. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +39 -16
  61. unstructured_ingest/v2/processes/connectors/fsspec/box.py +15 -13
  62. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +10 -11
  63. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +20 -34
  64. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +38 -13
  65. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +31 -17
  66. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -28
  67. unstructured_ingest/v2/processes/connectors/google_drive.py +40 -34
  68. unstructured_ingest/v2/processes/connectors/local.py +22 -14
  69. unstructured_ingest/v2/processes/connectors/milvus.py +22 -18
  70. unstructured_ingest/v2/processes/connectors/mongodb.py +22 -18
  71. unstructured_ingest/v2/processes/connectors/onedrive.py +17 -14
  72. unstructured_ingest/v2/processes/connectors/opensearch.py +66 -56
  73. unstructured_ingest/v2/processes/connectors/pinecone.py +23 -20
  74. unstructured_ingest/v2/processes/connectors/salesforce.py +26 -18
  75. unstructured_ingest/v2/processes/connectors/sharepoint.py +51 -26
  76. unstructured_ingest/v2/processes/connectors/singlestore.py +11 -15
  77. unstructured_ingest/v2/processes/connectors/sql.py +29 -31
  78. unstructured_ingest/v2/processes/connectors/weaviate.py +22 -13
  79. unstructured_ingest/v2/processes/embedder.py +106 -47
  80. unstructured_ingest/v2/processes/filter.py +11 -5
  81. unstructured_ingest/v2/processes/partitioner.py +79 -33
  82. unstructured_ingest/v2/processes/uncompress.py +3 -3
  83. unstructured_ingest/v2/utils.py +45 -0
  84. unstructured_ingest-0.0.4.dist-info/METADATA +571 -0
  85. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.4.dist-info}/RECORD +89 -116
  86. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.4.dist-info}/WHEEL +1 -1
  87. unstructured_ingest/v2/cli/cmds/__init__.py +0 -89
  88. unstructured_ingest/v2/cli/cmds/astra.py +0 -85
  89. unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +0 -72
  90. unstructured_ingest/v2/cli/cmds/chroma.py +0 -108
  91. unstructured_ingest/v2/cli/cmds/databricks_volumes.py +0 -161
  92. unstructured_ingest/v2/cli/cmds/elasticsearch.py +0 -159
  93. unstructured_ingest/v2/cli/cmds/fsspec/azure.py +0 -84
  94. unstructured_ingest/v2/cli/cmds/fsspec/box.py +0 -58
  95. unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +0 -58
  96. unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +0 -69
  97. unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +0 -81
  98. unstructured_ingest/v2/cli/cmds/fsspec/s3.py +0 -84
  99. unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +0 -80
  100. unstructured_ingest/v2/cli/cmds/google_drive.py +0 -74
  101. unstructured_ingest/v2/cli/cmds/local.py +0 -52
  102. unstructured_ingest/v2/cli/cmds/milvus.py +0 -72
  103. unstructured_ingest/v2/cli/cmds/mongodb.py +0 -62
  104. unstructured_ingest/v2/cli/cmds/onedrive.py +0 -91
  105. unstructured_ingest/v2/cli/cmds/opensearch.py +0 -93
  106. unstructured_ingest/v2/cli/cmds/pinecone.py +0 -62
  107. unstructured_ingest/v2/cli/cmds/salesforce.py +0 -79
  108. unstructured_ingest/v2/cli/cmds/sharepoint.py +0 -112
  109. unstructured_ingest/v2/cli/cmds/singlestore.py +0 -96
  110. unstructured_ingest/v2/cli/cmds/sql.py +0 -84
  111. unstructured_ingest/v2/cli/cmds/weaviate.py +0 -100
  112. unstructured_ingest/v2/cli/configs/__init__.py +0 -13
  113. unstructured_ingest/v2/cli/configs/chunk.py +0 -89
  114. unstructured_ingest/v2/cli/configs/embed.py +0 -74
  115. unstructured_ingest/v2/cli/configs/filter.py +0 -28
  116. unstructured_ingest/v2/cli/configs/partition.py +0 -99
  117. unstructured_ingest/v2/cli/configs/processor.py +0 -88
  118. unstructured_ingest/v2/cli/interfaces.py +0 -27
  119. unstructured_ingest/v2/pipeline/utils.py +0 -15
  120. unstructured_ingest-0.0.3.dist-info/METADATA +0 -175
  121. unstructured_ingest/v2/cli/{cmds/fsspec → utils}/__init__.py +0 -0
  122. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.4.dist-info}/entry_points.txt +0 -0
  123. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.4.dist-info}/top_level.txt +0 -0
@@ -5,9 +5,9 @@ from pathlib import Path
5
5
  from time import time
6
6
  from typing import Any, Generator, Optional
7
7
 
8
- from unstructured.utils import requires_dependencies
8
+ from pydantic import Field, Secret
9
9
 
10
- from unstructured_ingest.enhanced_dataclass import enhanced_field
10
+ from unstructured_ingest.utils.dep_check import requires_dependencies
11
11
  from unstructured_ingest.v2.interfaces import (
12
12
  DownloadResponse,
13
13
  FileData,
@@ -32,27 +32,41 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
32
32
  CONNECTOR_TYPE = "s3"
33
33
 
34
34
 
35
- @dataclass
36
35
  class S3IndexerConfig(FsspecIndexerConfig):
37
36
  pass
38
37
 
39
38
 
40
- @dataclass
41
39
  class S3AccessConfig(FsspecAccessConfig):
42
- key: Optional[str] = None
43
- secret: Optional[str] = None
44
- token: Optional[str] = None
40
+ key: Optional[str] = Field(
41
+ default=None,
42
+ description="If not anonymous, use this access key ID, if specified. Takes precedence "
43
+ "over `aws_access_key_id` in client_kwargs.",
44
+ )
45
+ secret: Optional[str] = Field(
46
+ default=None, description="If not anonymous, use this secret access key, if specified."
47
+ )
48
+ token: Optional[str] = Field(
49
+ default=None, description="If not anonymous, use this security token, if specified."
50
+ )
51
+
52
+
53
+ SecretS3AccessConfig = Secret[S3AccessConfig]
45
54
 
46
55
 
47
- @dataclass
48
56
  class S3ConnectionConfig(FsspecConnectionConfig):
49
- supported_protocols: list[str] = field(default_factory=lambda: ["s3", "s3a"])
50
- access_config: S3AccessConfig = enhanced_field(
51
- sensitive=True, default_factory=lambda: S3AccessConfig()
57
+ supported_protocols: list[str] = field(default_factory=lambda: ["s3", "s3a"], init=False)
58
+ access_config: SecretS3AccessConfig = Field(
59
+ default_factory=lambda: SecretS3AccessConfig(secret_value=S3AccessConfig())
52
60
  )
53
- endpoint_url: Optional[str] = None
54
- anonymous: bool = False
55
- connector_type: str = CONNECTOR_TYPE
61
+ endpoint_url: Optional[str] = Field(
62
+ default=None,
63
+ description="Use this endpoint_url, if specified. Needed for "
64
+ "connecting to non-AWS S3 buckets.",
65
+ )
66
+ anonymous: bool = Field(
67
+ default=False, description="Connect to s3 without local AWS credentials."
68
+ )
69
+ connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
56
70
 
57
71
  def get_access_config(self) -> dict[str, Any]:
58
72
  access_configs: dict[str, Any] = {"anon": self.anonymous}
@@ -60,7 +74,9 @@ class S3ConnectionConfig(FsspecConnectionConfig):
60
74
  access_configs["endpoint_url"] = self.endpoint_url
61
75
 
62
76
  # Avoid injecting None by filtering out k,v pairs where the value is None
63
- access_configs.update({k: v for k, v in self.access_config.to_dict().items() if v})
77
+ access_configs.update(
78
+ {k: v for k, v in self.access_config.get_secret_value().dict().items() if v}
79
+ )
64
80
  return access_configs
65
81
 
66
82
 
@@ -116,7 +132,6 @@ class S3Indexer(FsspecIndexer):
116
132
  super().precheck()
117
133
 
118
134
 
119
- @dataclass
120
135
  class S3DownloaderConfig(FsspecDownloaderConfig):
121
136
  pass
122
137
 
@@ -137,7 +152,6 @@ class S3Downloader(FsspecDownloader):
137
152
  return await super().run_async(file_data=file_data, **kwargs)
138
153
 
139
154
 
140
- @dataclass
141
155
  class S3UploaderConfig(FsspecUploaderConfig):
142
156
  pass
143
157
 
@@ -6,7 +6,8 @@ from pathlib import Path
6
6
  from typing import Any, Generator, Optional
7
7
  from urllib.parse import urlparse
8
8
 
9
- from unstructured_ingest.enhanced_dataclass import enhanced_field
9
+ from pydantic import Field, Secret
10
+
10
11
  from unstructured_ingest.utils.dep_check import requires_dependencies
11
12
  from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, UploadContent
12
13
  from unstructured_ingest.v2.processes.connector_registry import (
@@ -27,10 +28,10 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
27
28
  CONNECTOR_TYPE = "sftp"
28
29
 
29
30
 
30
- @dataclass
31
31
  class SftpIndexerConfig(FsspecIndexerConfig):
32
- def __post_init__(self):
33
- super().__post_init__()
32
+
33
+ def model_post_init(self, __context: Any) -> None:
34
+ super().model_post_init(__context)
34
35
  _, ext = os.path.splitext(self.remote_url)
35
36
  parsed_url = urlparse(self.remote_url)
36
37
  if ext:
@@ -39,21 +40,21 @@ class SftpIndexerConfig(FsspecIndexerConfig):
39
40
  self.path_without_protocol = parsed_url.path.lstrip("/")
40
41
 
41
42
 
42
- @dataclass
43
43
  class SftpAccessConfig(FsspecAccessConfig):
44
- password: str
44
+ password: str = Field(description="Password for sftp connection")
45
45
 
46
46
 
47
- @dataclass
48
47
  class SftpConnectionConfig(FsspecConnectionConfig):
49
- supported_protocols: list[str] = field(default_factory=lambda: ["sftp"])
50
- access_config: SftpAccessConfig = enhanced_field(sensitive=True)
51
- connector_type: str = CONNECTOR_TYPE
52
- username: Optional[str] = None
53
- host: Optional[str] = None
54
- port: int = 22
55
- look_for_keys: bool = False
56
- allow_agent: bool = False
48
+ supported_protocols: list[str] = Field(default_factory=lambda: ["sftp"], init=False)
49
+ access_config: Secret[SftpAccessConfig]
50
+ connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
51
+ username: str = Field(description="Username for sftp connection")
52
+ host: Optional[str] = Field(default=None, description="Hostname for sftp connection")
53
+ port: int = Field(default=22, description="Port for sftp connection")
54
+ look_for_keys: bool = Field(
55
+ default=False, description="Whether to search for private key files in ~/.ssh/"
56
+ )
57
+ allow_agent: bool = Field(default=False, description="Whether to connect to the SSH agent.")
57
58
 
58
59
  def get_access_config(self) -> dict[str, Any]:
59
60
  access_config = {
@@ -62,7 +63,7 @@ class SftpConnectionConfig(FsspecConnectionConfig):
62
63
  "port": self.port,
63
64
  "look_for_keys": self.look_for_keys,
64
65
  "allow_agent": self.allow_agent,
65
- "password": self.access_config.password,
66
+ "password": self.access_config.get_secret_value().password,
66
67
  }
67
68
  return access_config
68
69
 
@@ -96,24 +97,15 @@ class SftpIndexer(FsspecIndexer):
96
97
  super().precheck()
97
98
 
98
99
 
99
- @dataclass
100
100
  class SftpDownloaderConfig(FsspecDownloaderConfig):
101
- remote_url: Optional[str] = None
102
-
103
- def __post_init__(self):
104
- # TODO once python3.9 no longer supported and kw_only is allowed in dataclasses, remove:
105
- if not self.remote_url:
106
- raise TypeError(
107
- f"{self.__class__.__name__}.__init__() "
108
- f"missing 1 required positional argument: 'remote_url'"
109
- )
101
+ remote_url: str = Field(description="Remote fsspec URL formatted as `protocol://dir/path`")
110
102
 
111
103
 
112
104
  @dataclass
113
105
  class SftpDownloader(FsspecDownloader):
114
106
  protocol: str = "sftp"
115
107
  connection_config: SftpConnectionConfig
116
- connector_type: str = CONNECTOR_TYPE
108
+ connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
117
109
  download_config: Optional[SftpDownloaderConfig] = field(default_factory=SftpDownloaderConfig)
118
110
 
119
111
  @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
@@ -131,7 +123,6 @@ class SftpDownloader(FsspecDownloader):
131
123
  return await super().run_async(file_data=file_data, **kwargs)
132
124
 
133
125
 
134
- @dataclass
135
126
  class SftpUploaderConfig(FsspecUploaderConfig):
136
127
  pass
137
128
 
@@ -1,18 +1,18 @@
1
1
  import io
2
- import os
2
+ import json
3
3
  from dataclasses import dataclass, field
4
- from typing import TYPE_CHECKING, Any, Generator, Optional, Union
4
+ from pathlib import Path
5
+ from typing import TYPE_CHECKING, Any, Generator, Optional
5
6
 
6
7
  from dateutil import parser
7
- from unstructured.file_utils.google_filetype import GOOGLE_DRIVE_EXPORT_TYPES
8
+ from pydantic import Field, Secret
8
9
 
9
- from unstructured_ingest.enhanced_dataclass import enhanced_field
10
10
  from unstructured_ingest.error import (
11
11
  SourceConnectionError,
12
12
  SourceConnectionNetworkError,
13
13
  )
14
14
  from unstructured_ingest.utils.dep_check import requires_dependencies
15
- from unstructured_ingest.utils.string_and_date_utils import json_to_dict
15
+ from unstructured_ingest.utils.google_filetype import GOOGLE_DRIVE_EXPORT_TYPES
16
16
  from unstructured_ingest.v2.interfaces import (
17
17
  AccessConfig,
18
18
  ConnectionConfig,
@@ -37,46 +37,54 @@ if TYPE_CHECKING:
37
37
  from googleapiclient.http import MediaIoBaseDownload
38
38
 
39
39
 
40
- @dataclass
41
40
  class GoogleDriveAccessConfig(AccessConfig):
42
- service_account_key: Union[str, dict]
41
+ service_account_key: Optional[dict] = Field(
42
+ default=None, description="Credentials values to use for authentication"
43
+ )
44
+ service_account_key_path: Optional[Path] = Field(
45
+ default=None, description="File path to credentials values to use for authentication"
46
+ )
47
+
48
+ def model_post_init(self, __context: Any) -> None:
49
+ if self.service_account_key is None and self.service_account_key_path is None:
50
+ raise ValueError(
51
+ "either service_account_key or service_account_key_path must be provided"
52
+ )
53
+
54
+ def get_service_account_key(self) -> dict:
55
+ key_data = None
56
+ if self.service_account_key_path:
57
+ with self.service_account_key_path.open() as f:
58
+ key_data = json.load(f)
59
+ if key_data and self.service_account_key:
60
+ if key_data == self.service_account_key:
61
+ return key_data
62
+ else:
63
+ raise ValueError(
64
+ "service_account_key and service_account_key_path "
65
+ "both provided and have different values"
66
+ )
67
+ if key_data:
68
+ return key_data
69
+ return self.service_account_key
43
70
 
44
71
 
45
- @dataclass
46
72
  class GoogleDriveConnectionConfig(ConnectionConfig):
47
- drive_id: str
48
- access_config: GoogleDriveAccessConfig = enhanced_field(sensitive=True)
73
+ drive_id: str = Field(description="Google Drive File or Folder ID.")
74
+ access_config: Secret[GoogleDriveAccessConfig]
49
75
 
50
76
  @requires_dependencies(["googleapiclient"], extras="google-drive")
51
77
  def get_files_service(self) -> "GoogleAPIResource":
52
- from google.auth import default, exceptions
78
+ from google.auth import exceptions
53
79
  from google.oauth2 import service_account
54
80
  from googleapiclient.discovery import build
55
81
  from googleapiclient.errors import HttpError
56
82
 
57
- # Service account key can be a dict or a file path(str)
58
- # But the dict may come in as a string
59
- if isinstance(self.access_config.service_account_key, str):
60
- key_path = json_to_dict(self.access_config.service_account_key)
61
- elif isinstance(self.access_config.service_account_key, dict):
62
- key_path = self.access_config.service_account_key
63
- else:
64
- raise TypeError(
65
- f"access_config.service_account_key must be "
66
- f"str or dict, got: {type(self.access_config.service_account_key)}"
67
- )
83
+ access_config = self.access_config.get_secret_value()
84
+ key_data = access_config.get_service_account_key()
68
85
 
69
86
  try:
70
- if isinstance(key_path, dict):
71
- creds = service_account.Credentials.from_service_account_info(key_path)
72
- elif isinstance(key_path, str):
73
- os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = key_path
74
- creds, _ = default()
75
- else:
76
- raise ValueError(
77
- f"key path not recognized as a dictionary or a file path: "
78
- f"[{type(key_path)}] {key_path}",
79
- )
87
+ creds = service_account.Credentials.from_service_account_info(key_data)
80
88
  service = build("drive", "v3", credentials=creds)
81
89
  return service.files()
82
90
 
@@ -86,7 +94,6 @@ class GoogleDriveConnectionConfig(ConnectionConfig):
86
94
  raise ValueError("The provided API key is invalid.")
87
95
 
88
96
 
89
- @dataclass
90
97
  class GoogleDriveIndexerConfig(IndexerConfig):
91
98
  extensions: Optional[list[str]] = None
92
99
  recursive: bool = False
@@ -268,7 +275,6 @@ class GoogleDriveIndexer(Indexer):
268
275
  yield f
269
276
 
270
277
 
271
- @dataclass
272
278
  class GoogleDriveDownloaderConfig(DownloaderConfig):
273
279
  pass
274
280
 
@@ -5,6 +5,8 @@ from pathlib import Path
5
5
  from time import time
6
6
  from typing import Any, Generator
7
7
 
8
+ from pydantic import Field, Secret
9
+
8
10
  from unstructured_ingest.v2.interfaces import (
9
11
  AccessConfig,
10
12
  ConnectionConfig,
@@ -29,20 +31,28 @@ from unstructured_ingest.v2.processes.connector_registry import (
29
31
  CONNECTOR_TYPE = "local"
30
32
 
31
33
 
32
- @dataclass
33
34
  class LocalAccessConfig(AccessConfig):
34
35
  pass
35
36
 
36
37
 
37
- @dataclass
38
+ SecretLocalAccessConfig = Secret[LocalAccessConfig]
39
+
40
+
38
41
  class LocalConnectionConfig(ConnectionConfig):
39
- access_config: LocalAccessConfig = field(default_factory=lambda: LocalAccessConfig())
42
+ access_config: SecretLocalAccessConfig = Field(
43
+ default_factory=lambda: SecretLocalAccessConfig(secret_value=LocalAccessConfig())
44
+ )
40
45
 
41
46
 
42
- @dataclass
43
47
  class LocalIndexerConfig(IndexerConfig):
44
- input_path: str
45
- recursive: bool = False
48
+ input_path: Path = Field(
49
+ description="Path to the location in the local file system that will be processed."
50
+ )
51
+ recursive: bool = Field(
52
+ default=False,
53
+ description="Recursively download files in their respective folders "
54
+ "otherwise stop at the files in provided folder level.",
55
+ )
46
56
 
47
57
  @property
48
58
  def path(self) -> Path:
@@ -122,7 +132,6 @@ class LocalIndexer(Indexer):
122
132
  yield file_data
123
133
 
124
134
 
125
- @dataclass
126
135
  class LocalDownloaderConfig(DownloaderConfig):
127
136
  pass
128
137
 
@@ -130,10 +139,8 @@ class LocalDownloaderConfig(DownloaderConfig):
130
139
  @dataclass
131
140
  class LocalDownloader(Downloader):
132
141
  connector_type: str = CONNECTOR_TYPE
133
- connection_config: LocalConnectionConfig = field(
134
- default_factory=lambda: LocalConnectionConfig()
135
- )
136
- download_config: LocalDownloaderConfig = field(default_factory=lambda: LocalDownloaderConfig())
142
+ connection_config: LocalConnectionConfig = field(default_factory=LocalConnectionConfig)
143
+ download_config: LocalDownloaderConfig = field(default_factory=LocalDownloaderConfig)
137
144
 
138
145
  def get_download_path(self, file_data: FileData) -> Path:
139
146
  return Path(file_data.source_identifiers.fullpath)
@@ -144,9 +151,10 @@ class LocalDownloader(Downloader):
144
151
  )
145
152
 
146
153
 
147
- @dataclass
148
154
  class LocalUploaderConfig(UploaderConfig):
149
- output_dir: str = field(default="structured-output")
155
+ output_dir: str = Field(
156
+ default="structured-output", description="Local path to write partitioned output to"
157
+ )
150
158
 
151
159
  @property
152
160
  def output_path(self) -> Path:
@@ -160,7 +168,7 @@ class LocalUploaderConfig(UploaderConfig):
160
168
  @dataclass
161
169
  class LocalUploader(Uploader):
162
170
  connector_type: str = CONNECTOR_TYPE
163
- upload_config: LocalUploaderConfig = field(default_factory=lambda: LocalUploaderConfig())
171
+ upload_config: LocalUploaderConfig = field(default_factory=LocalUploaderConfig)
164
172
  connection_config: LocalConnectionConfig = field(
165
173
  default_factory=lambda: LocalConnectionConfig()
166
174
  )
@@ -6,8 +6,8 @@ from typing import TYPE_CHECKING, Any, Optional, Union
6
6
 
7
7
  import pandas as pd
8
8
  from dateutil import parser
9
+ from pydantic import Field, Secret
9
10
 
10
- from unstructured_ingest.enhanced_dataclass import enhanced_field
11
11
  from unstructured_ingest.error import WriteError
12
12
  from unstructured_ingest.utils.data_prep import flatten_dict
13
13
  from unstructured_ingest.utils.dep_check import requires_dependencies
@@ -32,24 +32,28 @@ if TYPE_CHECKING:
32
32
  CONNECTOR_TYPE = "milvus"
33
33
 
34
34
 
35
- @dataclass
36
35
  class MilvusAccessConfig(AccessConfig):
37
- password: Optional[str] = None
38
- token: Optional[str] = None
36
+ password: Optional[str] = Field(default=None, description="Milvus password")
37
+ token: Optional[str] = Field(default=None, description="Milvus access token")
38
+
39
+
40
+ SecretMilvusAccessConfig = Secret[MilvusAccessConfig]
39
41
 
40
42
 
41
- @dataclass
42
43
  class MilvusConnectionConfig(ConnectionConfig):
43
- access_config: MilvusAccessConfig = enhanced_field(
44
- sensitive=True, default_factory=lambda: MilvusAccessConfig()
44
+ access_config: SecretMilvusAccessConfig = Field(
45
+ default_factory=lambda: SecretMilvusAccessConfig(secret_value=MilvusAccessConfig())
45
46
  )
46
- uri: Optional[str] = None
47
- user: Optional[str] = None
48
- db_name: Optional[str] = None
47
+ uri: Optional[str] = Field(
48
+ default=None, description="Milvus uri", examples=["http://localhost:19530"]
49
+ )
50
+ user: Optional[str] = Field(default=None, description="Milvus user")
51
+ db_name: Optional[str] = Field(default=None, description="Milvus database name")
49
52
 
50
53
  def get_connection_kwargs(self) -> dict[str, Any]:
51
- access_config_dict = self.access_config.to_dict()
52
- connection_config_dict = self.to_dict()
54
+ access_config = self.access_config.get_secret_value()
55
+ access_config_dict = access_config.dict()
56
+ connection_config_dict = self.dict()
53
57
  connection_config_dict.pop("access_config", None)
54
58
  connection_config_dict.update(access_config_dict)
55
59
  # Drop any that were not set explicitly
@@ -63,7 +67,6 @@ class MilvusConnectionConfig(ConnectionConfig):
63
67
  return MilvusClient(**self.get_connection_kwargs())
64
68
 
65
69
 
66
- @dataclass
67
70
  class MilvusUploadStagerConfig(UploadStagerConfig):
68
71
  pass
69
72
 
@@ -130,10 +133,11 @@ class MilvusUploadStager(UploadStager):
130
133
  return output_path
131
134
 
132
135
 
133
- @dataclass
134
136
  class MilvusUploaderConfig(UploaderConfig):
135
- collection_name: str
136
- num_of_processes: int = 4
137
+ collection_name: str = Field(description="Milvus collections to write to")
138
+ num_processes: int = Field(
139
+ default=4, description="number of processes to use when writing to support parallel writes"
140
+ )
137
141
 
138
142
 
139
143
  @dataclass
@@ -180,13 +184,13 @@ class MilvusUploader(Uploader):
180
184
  self.insert_results(data=data)
181
185
 
182
186
  def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
183
- if self.upload_config.num_of_processes == 1:
187
+ if self.upload_config.num_processes == 1:
184
188
  for content in contents:
185
189
  self.upload(content=content)
186
190
 
187
191
  else:
188
192
  with mp.Pool(
189
- processes=self.upload_config.num_of_processes,
193
+ processes=self.upload_config.num_processes,
190
194
  ) as pool:
191
195
  pool.map(self.upload, contents)
192
196
 
@@ -3,9 +3,9 @@ from dataclasses import dataclass, field
3
3
  from pathlib import Path
4
4
  from typing import TYPE_CHECKING, Any, Optional
5
5
 
6
- from unstructured.__version__ import __version__ as unstructured_version
6
+ from pydantic import Field, Secret
7
7
 
8
- from unstructured_ingest.enhanced_dataclass import enhanced_field
8
+ from unstructured_ingest.__version__ import __version__ as unstructured_version
9
9
  from unstructured_ingest.error import DestinationConnectionError
10
10
  from unstructured_ingest.utils.data_prep import batch_generator
11
11
  from unstructured_ingest.utils.dep_check import requires_dependencies
@@ -31,25 +31,28 @@ CONNECTOR_TYPE = "mongodb"
31
31
  SERVER_API_VERSION = "1"
32
32
 
33
33
 
34
- @dataclass
35
34
  class MongoDBAccessConfig(AccessConfig):
36
- uri: Optional[str] = None
35
+ uri: Optional[str] = Field(default=None, description="URI to user when connecting")
36
+
37
+
38
+ SecretMongoDBAccessConfig = Secret[MongoDBAccessConfig]
37
39
 
38
40
 
39
- @dataclass
40
41
  class MongoDBConnectionConfig(ConnectionConfig):
41
- access_config: MongoDBAccessConfig = enhanced_field(
42
- sensitive=True, default_factory=MongoDBAccessConfig
42
+ access_config: SecretMongoDBAccessConfig = Field(
43
+ default_factory=lambda: SecretMongoDBAccessConfig(secret_value=MongoDBAccessConfig())
43
44
  )
44
- host: Optional[str] = None
45
- database: Optional[str] = None
46
- collection: Optional[str] = None
47
- port: int = 27017
48
- batch_size: int = 100
49
- connector_type: str = CONNECTOR_TYPE
45
+ host: Optional[str] = Field(
46
+ default=None,
47
+ description="hostname or IP address or Unix domain socket path of a single mongod or "
48
+ "mongos instance to connect to, or a list of hostnames",
49
+ )
50
+ database: Optional[str] = Field(default=None, description="database name to connect to")
51
+ collection: Optional[str] = Field(default=None, description="collection name to connect to")
52
+ port: int = Field(default=27017)
53
+ connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
50
54
 
51
55
 
52
- @dataclass
53
56
  class MongoDBUploadStagerConfig(UploadStagerConfig):
54
57
  pass
55
58
 
@@ -77,9 +80,8 @@ class MongoDBUploadStager(UploadStager):
77
80
  return output_path
78
81
 
79
82
 
80
- @dataclass
81
83
  class MongoDBUploaderConfig(UploaderConfig):
82
- batch_size: int = 100
84
+ batch_size: int = Field(default=100, description="Number of records per batch")
83
85
 
84
86
 
85
87
  @dataclass
@@ -102,9 +104,11 @@ class MongoDBUploader(Uploader):
102
104
  from pymongo.driver_info import DriverInfo
103
105
  from pymongo.server_api import ServerApi
104
106
 
105
- if self.connection_config.access_config.uri:
107
+ access_config = self.connection_config.access_config.get_secret_value()
108
+
109
+ if access_config.uri:
106
110
  return MongoClient(
107
- self.connection_config.access_config.uri,
111
+ access_config.uri,
108
112
  server_api=ServerApi(version=SERVER_API_VERSION),
109
113
  driver=DriverInfo(name="unstructured", version=unstructured_version),
110
114
  )
@@ -1,12 +1,12 @@
1
1
  import json
2
- from dataclasses import dataclass, field
2
+ from dataclasses import dataclass
3
3
  from pathlib import Path
4
4
  from time import time
5
5
  from typing import TYPE_CHECKING, Any, Generator, Optional
6
6
 
7
7
  from dateutil import parser
8
+ from pydantic import Field, Secret
8
9
 
9
- from unstructured_ingest.enhanced_dataclass import enhanced_field
10
10
  from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
11
11
  from unstructured_ingest.utils.dep_check import requires_dependencies
12
12
  from unstructured_ingest.v2.interfaces import (
@@ -35,18 +35,23 @@ CONNECTOR_TYPE = "onedrive"
35
35
  MAX_MB_SIZE = 512_000_000
36
36
 
37
37
 
38
- @dataclass
39
38
  class OnedriveAccessConfig(AccessConfig):
40
- client_cred: str
39
+ client_cred: str = Field(description="Microsoft App client secret")
41
40
 
42
41
 
43
- @dataclass
44
42
  class OnedriveConnectionConfig(ConnectionConfig):
45
- client_id: str
46
- user_pname: str
47
- tenant: str = field(repr=False)
48
- authority_url: Optional[str] = field(repr=False, default="https://login.microsoftonline.com")
49
- access_config: OnedriveAccessConfig = enhanced_field(sensitive=True)
43
+ client_id: str = Field(description="Microsoft app client ID")
44
+ user_pname: str = Field(description="User principal name, usually is your Azure AD email.")
45
+ tenant: str = Field(
46
+ repr=False, description="ID or domain name associated with your Azure AD instance"
47
+ )
48
+ authority_url: Optional[str] = Field(
49
+ repr=False,
50
+ default="https://login.microsoftonline.com",
51
+ examples=["https://login.microsoftonline.com"],
52
+ description="Authentication token provider for Microsoft apps",
53
+ )
54
+ access_config: Secret[OnedriveAccessConfig]
50
55
 
51
56
  @requires_dependencies(["msal"], extras="onedrive")
52
57
  def get_token(self):
@@ -56,7 +61,7 @@ class OnedriveConnectionConfig(ConnectionConfig):
56
61
  app = ConfidentialClientApplication(
57
62
  authority=f"{self.authority_url}/{self.tenant}",
58
63
  client_id=self.client_id,
59
- client_credential=self.access_config.client_cred,
64
+ client_credential=self.access_config.get_secret_value().client_cred,
60
65
  )
61
66
  token = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])
62
67
  except ValueError as exc:
@@ -76,9 +81,8 @@ class OnedriveConnectionConfig(ConnectionConfig):
76
81
  return client
77
82
 
78
83
 
79
- @dataclass
80
84
  class OnedriveIndexerConfig(IndexerConfig):
81
- path: Optional[str] = field(default="")
85
+ path: Optional[str] = Field(default="")
82
86
  recursive: bool = False
83
87
 
84
88
 
@@ -171,7 +175,6 @@ class OnedriveIndexer(Indexer):
171
175
  yield file_data
172
176
 
173
177
 
174
- @dataclass
175
178
  class OnedriveDownloaderConfig(DownloaderConfig):
176
179
  pass
177
180