unstructured-ingest 0.0.3__py3-none-any.whl → 0.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (125)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/cli.py +6 -1
  3. unstructured_ingest/cli/cmds/__init__.py +4 -4
  4. unstructured_ingest/cli/cmds/{astra.py → astradb.py} +9 -9
  5. unstructured_ingest/cli/interfaces.py +13 -6
  6. unstructured_ingest/connector/{astra.py → astradb.py} +29 -29
  7. unstructured_ingest/connector/biomed.py +12 -5
  8. unstructured_ingest/connector/confluence.py +3 -3
  9. unstructured_ingest/connector/github.py +3 -2
  10. unstructured_ingest/connector/google_drive.py +1 -2
  11. unstructured_ingest/connector/mongodb.py +1 -2
  12. unstructured_ingest/connector/notion/client.py +31 -16
  13. unstructured_ingest/connector/notion/connector.py +3 -2
  14. unstructured_ingest/connector/registry.py +2 -2
  15. unstructured_ingest/connector/vectara.py +7 -2
  16. unstructured_ingest/interfaces.py +13 -9
  17. unstructured_ingest/pipeline/interfaces.py +8 -3
  18. unstructured_ingest/pipeline/reformat/chunking.py +13 -9
  19. unstructured_ingest/pipeline/reformat/embedding.py +3 -3
  20. unstructured_ingest/runner/__init__.py +2 -2
  21. unstructured_ingest/runner/{astra.py → astradb.py} +7 -7
  22. unstructured_ingest/runner/writers/__init__.py +2 -2
  23. unstructured_ingest/runner/writers/{astra.py → astradb.py} +7 -7
  24. unstructured_ingest/utils/chunking.py +45 -0
  25. unstructured_ingest/utils/dep_check.py +1 -1
  26. unstructured_ingest/utils/google_filetype.py +9 -0
  27. unstructured_ingest/v2/cli/base/cmd.py +57 -13
  28. unstructured_ingest/v2/cli/base/dest.py +21 -12
  29. unstructured_ingest/v2/cli/base/src.py +35 -23
  30. unstructured_ingest/v2/cli/cmds.py +14 -0
  31. unstructured_ingest/v2/cli/{utils.py → utils/click.py} +36 -89
  32. unstructured_ingest/v2/cli/utils/model_conversion.py +199 -0
  33. unstructured_ingest/v2/interfaces/connector.py +5 -7
  34. unstructured_ingest/v2/interfaces/downloader.py +8 -5
  35. unstructured_ingest/v2/interfaces/file_data.py +8 -2
  36. unstructured_ingest/v2/interfaces/indexer.py +3 -4
  37. unstructured_ingest/v2/interfaces/processor.py +10 -10
  38. unstructured_ingest/v2/interfaces/upload_stager.py +3 -3
  39. unstructured_ingest/v2/interfaces/uploader.py +3 -3
  40. unstructured_ingest/v2/pipeline/pipeline.py +9 -6
  41. unstructured_ingest/v2/pipeline/steps/chunk.py +5 -11
  42. unstructured_ingest/v2/pipeline/steps/download.py +13 -11
  43. unstructured_ingest/v2/pipeline/steps/embed.py +5 -11
  44. unstructured_ingest/v2/pipeline/steps/filter.py +1 -6
  45. unstructured_ingest/v2/pipeline/steps/index.py +14 -10
  46. unstructured_ingest/v2/pipeline/steps/partition.py +5 -5
  47. unstructured_ingest/v2/pipeline/steps/stage.py +4 -7
  48. unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -6
  49. unstructured_ingest/v2/pipeline/steps/upload.py +2 -9
  50. unstructured_ingest/v2/processes/__init__.py +18 -0
  51. unstructured_ingest/v2/processes/chunker.py +74 -28
  52. unstructured_ingest/v2/processes/connector_registry.py +8 -2
  53. unstructured_ingest/v2/processes/connectors/__init__.py +18 -3
  54. unstructured_ingest/v2/processes/connectors/{astra.py → astradb.py} +46 -39
  55. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +30 -27
  56. unstructured_ingest/v2/processes/connectors/chroma.py +30 -21
  57. unstructured_ingest/v2/processes/connectors/couchbase.py +333 -0
  58. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +87 -32
  59. unstructured_ingest/v2/processes/connectors/elasticsearch.py +70 -45
  60. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +39 -16
  61. unstructured_ingest/v2/processes/connectors/fsspec/box.py +15 -13
  62. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +10 -11
  63. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +20 -34
  64. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +38 -13
  65. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +31 -17
  66. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -28
  67. unstructured_ingest/v2/processes/connectors/google_drive.py +40 -34
  68. unstructured_ingest/v2/processes/connectors/kdbai.py +170 -0
  69. unstructured_ingest/v2/processes/connectors/local.py +27 -16
  70. unstructured_ingest/v2/processes/connectors/milvus.py +22 -18
  71. unstructured_ingest/v2/processes/connectors/mongodb.py +22 -18
  72. unstructured_ingest/v2/processes/connectors/onedrive.py +17 -14
  73. unstructured_ingest/v2/processes/connectors/opensearch.py +66 -56
  74. unstructured_ingest/v2/processes/connectors/pinecone.py +22 -21
  75. unstructured_ingest/v2/processes/connectors/salesforce.py +26 -18
  76. unstructured_ingest/v2/processes/connectors/sharepoint.py +51 -26
  77. unstructured_ingest/v2/processes/connectors/singlestore.py +11 -15
  78. unstructured_ingest/v2/processes/connectors/sql.py +29 -31
  79. unstructured_ingest/v2/processes/connectors/weaviate.py +22 -13
  80. unstructured_ingest/v2/processes/embedder.py +106 -47
  81. unstructured_ingest/v2/processes/filter.py +11 -5
  82. unstructured_ingest/v2/processes/partitioner.py +79 -33
  83. unstructured_ingest/v2/processes/uncompress.py +3 -3
  84. unstructured_ingest/v2/utils.py +45 -0
  85. unstructured_ingest-0.0.5.dist-info/LICENSE.md +201 -0
  86. unstructured_ingest-0.0.5.dist-info/METADATA +574 -0
  87. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.5.dist-info}/RECORD +91 -116
  88. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.5.dist-info}/WHEEL +1 -1
  89. unstructured_ingest/v2/cli/cmds/__init__.py +0 -89
  90. unstructured_ingest/v2/cli/cmds/astra.py +0 -85
  91. unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +0 -72
  92. unstructured_ingest/v2/cli/cmds/chroma.py +0 -108
  93. unstructured_ingest/v2/cli/cmds/databricks_volumes.py +0 -161
  94. unstructured_ingest/v2/cli/cmds/elasticsearch.py +0 -159
  95. unstructured_ingest/v2/cli/cmds/fsspec/azure.py +0 -84
  96. unstructured_ingest/v2/cli/cmds/fsspec/box.py +0 -58
  97. unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +0 -58
  98. unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +0 -69
  99. unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +0 -81
  100. unstructured_ingest/v2/cli/cmds/fsspec/s3.py +0 -84
  101. unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +0 -80
  102. unstructured_ingest/v2/cli/cmds/google_drive.py +0 -74
  103. unstructured_ingest/v2/cli/cmds/local.py +0 -52
  104. unstructured_ingest/v2/cli/cmds/milvus.py +0 -72
  105. unstructured_ingest/v2/cli/cmds/mongodb.py +0 -62
  106. unstructured_ingest/v2/cli/cmds/onedrive.py +0 -91
  107. unstructured_ingest/v2/cli/cmds/opensearch.py +0 -93
  108. unstructured_ingest/v2/cli/cmds/pinecone.py +0 -62
  109. unstructured_ingest/v2/cli/cmds/salesforce.py +0 -79
  110. unstructured_ingest/v2/cli/cmds/sharepoint.py +0 -112
  111. unstructured_ingest/v2/cli/cmds/singlestore.py +0 -96
  112. unstructured_ingest/v2/cli/cmds/sql.py +0 -84
  113. unstructured_ingest/v2/cli/cmds/weaviate.py +0 -100
  114. unstructured_ingest/v2/cli/configs/__init__.py +0 -13
  115. unstructured_ingest/v2/cli/configs/chunk.py +0 -89
  116. unstructured_ingest/v2/cli/configs/embed.py +0 -74
  117. unstructured_ingest/v2/cli/configs/filter.py +0 -28
  118. unstructured_ingest/v2/cli/configs/partition.py +0 -99
  119. unstructured_ingest/v2/cli/configs/processor.py +0 -88
  120. unstructured_ingest/v2/cli/interfaces.py +0 -27
  121. unstructured_ingest/v2/pipeline/utils.py +0 -15
  122. unstructured_ingest-0.0.3.dist-info/METADATA +0 -175
  123. /unstructured_ingest/v2/cli/{cmds/fsspec → utils}/__init__.py +0 -0
  124. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.5.dist-info}/entry_points.txt +0 -0
  125. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.5.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/google_drive.py

@@ -1,18 +1,18 @@
 import io
-import os
+import json
 from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, Any, Generator, Optional, Union
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Generator, Optional
 
 from dateutil import parser
-from unstructured.file_utils.google_filetype import GOOGLE_DRIVE_EXPORT_TYPES
+from pydantic import Field, Secret
 
-from unstructured_ingest.enhanced_dataclass import enhanced_field
 from unstructured_ingest.error import (
     SourceConnectionError,
     SourceConnectionNetworkError,
 )
 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.utils.string_and_date_utils import json_to_dict
+from unstructured_ingest.utils.google_filetype import GOOGLE_DRIVE_EXPORT_TYPES
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
@@ -37,46 +37,54 @@ if TYPE_CHECKING:
     from googleapiclient.http import MediaIoBaseDownload
 
 
-@dataclass
 class GoogleDriveAccessConfig(AccessConfig):
-    service_account_key: Union[str, dict]
+    service_account_key: Optional[dict] = Field(
+        default=None, description="Credentials values to use for authentication"
+    )
+    service_account_key_path: Optional[Path] = Field(
+        default=None, description="File path to credentials values to use for authentication"
+    )
+
+    def model_post_init(self, __context: Any) -> None:
+        if self.service_account_key is None and self.service_account_key_path is None:
+            raise ValueError(
+                "either service_account_key or service_account_key_path must be provided"
+            )
+
+    def get_service_account_key(self) -> dict:
+        key_data = None
+        if self.service_account_key_path:
+            with self.service_account_key_path.open() as f:
+                key_data = json.load(f)
+        if key_data and self.service_account_key:
+            if key_data == self.service_account_key:
+                return key_data
+            else:
+                raise ValueError(
+                    "service_account_key and service_account_key_path "
+                    "both provided and have different values"
+                )
+        if key_data:
+            return key_data
+        return self.service_account_key
 
 
-@dataclass
 class GoogleDriveConnectionConfig(ConnectionConfig):
-    drive_id: str
-    access_config: GoogleDriveAccessConfig = enhanced_field(sensitive=True)
+    drive_id: str = Field(description="Google Drive File or Folder ID.")
+    access_config: Secret[GoogleDriveAccessConfig]
 
     @requires_dependencies(["googleapiclient"], extras="google-drive")
     def get_files_service(self) -> "GoogleAPIResource":
-        from google.auth import default, exceptions
+        from google.auth import exceptions
         from google.oauth2 import service_account
         from googleapiclient.discovery import build
         from googleapiclient.errors import HttpError
 
-        # Service account key can be a dict or a file path(str)
-        # But the dict may come in as a string
-        if isinstance(self.access_config.service_account_key, str):
-            key_path = json_to_dict(self.access_config.service_account_key)
-        elif isinstance(self.access_config.service_account_key, dict):
-            key_path = self.access_config.service_account_key
-        else:
-            raise TypeError(
-                f"access_config.service_account_key must be "
-                f"str or dict, got: {type(self.access_config.service_account_key)}"
-            )
+        access_config = self.access_config.get_secret_value()
+        key_data = access_config.get_service_account_key()
 
         try:
-            if isinstance(key_path, dict):
-                creds = service_account.Credentials.from_service_account_info(key_path)
-            elif isinstance(key_path, str):
-                os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = key_path
-                creds, _ = default()
-            else:
-                raise ValueError(
-                    f"key path not recognized as a dictionary or a file path: "
-                    f"[{type(key_path)}] {key_path}",
-                )
+            creds = service_account.Credentials.from_service_account_info(key_data)
             service = build("drive", "v3", credentials=creds)
             return service.files()
 
@@ -86,7 +94,6 @@ class GoogleDriveConnectionConfig(ConnectionConfig):
             raise ValueError("The provided API key is invalid.")
 
 
-@dataclass
 class GoogleDriveIndexerConfig(IndexerConfig):
     extensions: Optional[list[str]] = None
     recursive: bool = False
@@ -268,7 +275,6 @@ class GoogleDriveIndexer(Indexer):
             yield f
 
 
-@dataclass
 class GoogleDriveDownloaderConfig(DownloaderConfig):
     pass
 
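The net effect of these hunks is that the Google Drive credentials move behind pydantic's `Secret` wrapper, and the service account key may be supplied either inline as a dict or as a path to a JSON file (conflicting values of both raise a `ValueError`). A minimal usage sketch, not taken from the package itself; the drive ID and key path are placeholders:

```python
from pathlib import Path

from unstructured_ingest.v2.processes.connectors.google_drive import (
    GoogleDriveAccessConfig,
    GoogleDriveConnectionConfig,
)

# Placeholder ID and path; pydantic wraps the access config in Secret on validation.
connection_config = GoogleDriveConnectionConfig(
    drive_id="<folder-or-file-id>",
    access_config=GoogleDriveAccessConfig(
        service_account_key_path=Path("service-account.json"),
    ),
)

# The raw key is only reachable via an explicit unwrap, as get_files_service() does:
files_service = connection_config.get_files_service()
```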
unstructured_ingest/v2/processes/connectors/kdbai.py (new file)

@@ -0,0 +1,170 @@
+import json
+import uuid
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Optional
+
+import numpy as np
+import pandas as pd
+from pydantic import Field, Secret
+
+from unstructured_ingest.error import DestinationConnectionError
+from unstructured_ingest.utils.data_prep import flatten_dict
+from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.interfaces import (
+    AccessConfig,
+    ConnectionConfig,
+    FileData,
+    UploadContent,
+    Uploader,
+    UploaderConfig,
+    UploadStager,
+    UploadStagerConfig,
+)
+from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.processes.connector_registry import (
+    DestinationRegistryEntry,
+)
+
+if TYPE_CHECKING:
+    from kdbai_client import Session, Table
+
+CONNECTOR_TYPE = "kdbai"
+
+
+class KdbaiAccessConfig(AccessConfig):
+    api_key: Optional[str] = Field(
+        default=None,
+        description="A string for the api-key, can be left empty "
+        "when connecting to local KDBAI instance.",
+    )
+
+
+SecretKdbaiAccessConfig = Secret[KdbaiAccessConfig]
+
+
+class KdbaiConnectionConfig(ConnectionConfig):
+    access_config: SecretKdbaiAccessConfig = Field(
+        default=SecretKdbaiAccessConfig(secret_value=KdbaiAccessConfig())
+    )
+    endpoint: str = Field(
+        default="http://localhost:8082", description="Endpoint url where KDBAI is hosted."
+    )
+
+    @requires_dependencies(["kdbai_client"], extras="kdbai")
+    def get_session(self) -> "Session":
+        from kdbai_client import Session
+
+        return Session(
+            api_key=self.access_config.get_secret_value().api_key, endpoint=self.endpoint
+        )
+
+
+class KdbaiUploadStagerConfig(UploadStagerConfig):
+    pass
+
+
+@dataclass
+class KdbaiUploadStager(UploadStager):
+    upload_stager_config: KdbaiUploadStagerConfig = field(default_factory=KdbaiUploadStagerConfig)
+
+    def run(
+        self,
+        elements_filepath: Path,
+        file_data: FileData,
+        output_dir: Path,
+        output_filename: str,
+        **kwargs: Any,
+    ) -> Path:
+        with open(elements_filepath) as elements_file:
+            elements_contents = json.load(elements_file)
+        output_path = Path(output_dir) / Path(f"{output_filename}.json")
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+
+        data = []
+        for element in elements_contents:
+            data.append(
+                {
+                    "id": str(uuid.uuid4()),
+                    "element_id": element.get("element_id"),
+                    "document": element.pop("text", None),
+                    "embeddings": element.get("embeddings"),
+                    "metadata": flatten_dict(
+                        dictionary=element.get("metadata"),
+                        flatten_lists=True,
+                        remove_none=True,
+                    ),
+                }
+            )
+        logger.debug(f"writing {len(data)} elements to {output_path}")
+        with output_path.open("w") as output_file:
+            json.dump(data, output_file, indent=2)
+        return output_path
+
+
+class KdbaiUploaderConfig(UploaderConfig):
+    table_name: str = Field(description="The name of the KDBAI table to write into.")
+    batch_size: int = Field(default=100, description="Number of records per batch")
+
+
+@dataclass
+class KdbaiUploader(Uploader):
+    connection_config: KdbaiConnectionConfig
+    upload_config: KdbaiUploaderConfig
+    connector_type: str = field(default=CONNECTOR_TYPE, init=False)
+
+    def precheck(self) -> None:
+        try:
+            self.get_table()
+        except Exception as e:
+            logger.error(f"Failed to validate connection {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
+    def get_table(self) -> "Table":
+        session: Session = self.connection_config.get_session()
+        table = session.table(self.upload_config.table_name)
+        return table
+
+    def upsert_batch(self, batch: pd.DataFrame):
+        table = self.get_table()
+        table.insert(data=batch)
+
+    def process_dataframe(self, df: pd.DataFrame):
+        logger.debug(
+            f"uploading {len(df)} entries to {self.connection_config.endpoint} "
+            f"db in table {self.upload_config.table_name}"
+        )
+        for _, batch_df in df.groupby(np.arange(len(df)) // self.upload_config.batch_size):
+            self.upsert_batch(batch=batch_df)
+
+    def process_csv(self, csv_paths: list[Path]):
+        logger.debug(f"uploading content from {len(csv_paths)} csv files")
+        df = pd.concat((pd.read_csv(path) for path in csv_paths), ignore_index=True)
+        self.process_dataframe(df=df)
+
+    def process_json(self, json_paths: list[Path]):
+        logger.debug(f"uploading content from {len(json_paths)} json files")
+        all_records = []
+        for p in json_paths:
+            with open(p) as json_file:
+                all_records.extend(json.load(json_file))
+
+        df = pd.DataFrame(data=all_records)
+        self.process_dataframe(df=df)
+
+    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
+        csv_paths = [c.path for c in contents if c.path.suffix == ".csv"]
+        if csv_paths:
+            self.process_csv(csv_paths=csv_paths)
+        json_paths = [c.path for c in contents if c.path.suffix == ".json"]
+        if json_paths:
+            self.process_json(json_paths=json_paths)
+
+
+kdbai_destination_entry = DestinationRegistryEntry(
+    connection_config=KdbaiConnectionConfig,
+    uploader=KdbaiUploader,
+    uploader_config=KdbaiUploaderConfig,
+    upload_stager=KdbaiUploadStager,
+    upload_stager_config=KdbaiUploadStagerConfig,
+)
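This new destination batches rows with `np.arange(len(df)) // batch_size`, i.e. consecutive chunks of `batch_size` rows per insert. A hedged sketch of wiring it up, assuming a KDB.AI instance at the default local endpoint and an already-created table (the table name is a placeholder):

```python
from unstructured_ingest.v2.processes.connectors.kdbai import (
    KdbaiConnectionConfig,
    KdbaiUploader,
    KdbaiUploaderConfig,
)

uploader = KdbaiUploader(
    # access_config defaults to an empty api_key, which suits a local instance
    connection_config=KdbaiConnectionConfig(endpoint="http://localhost:8082"),
    upload_config=KdbaiUploaderConfig(table_name="elements", batch_size=100),
)
uploader.precheck()  # raises DestinationConnectionError if the table is unreachable
```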
unstructured_ingest/v2/processes/connectors/local.py

@@ -5,6 +5,8 @@ from pathlib import Path
 from time import time
 from typing import Any, Generator
 
+from pydantic import Field, Secret
+
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
@@ -29,20 +31,28 @@ from unstructured_ingest.v2.processes.connector_registry import (
 CONNECTOR_TYPE = "local"
 
 
-@dataclass
 class LocalAccessConfig(AccessConfig):
     pass
 
 
-@dataclass
+SecretLocalAccessConfig = Secret[LocalAccessConfig]
+
+
 class LocalConnectionConfig(ConnectionConfig):
-    access_config: LocalAccessConfig = field(default_factory=lambda: LocalAccessConfig())
+    access_config: SecretLocalAccessConfig = Field(
+        default_factory=lambda: SecretLocalAccessConfig(secret_value=LocalAccessConfig())
+    )
 
 
-@dataclass
 class LocalIndexerConfig(IndexerConfig):
-    input_path: str
-    recursive: bool = False
+    input_path: Path = Field(
+        description="Path to the location in the local file system that will be processed."
+    )
+    recursive: bool = Field(
+        default=False,
+        description="Recursively download files in their respective folders "
+        "otherwise stop at the files in provided folder level.",
+    )
 
     @property
     def path(self) -> Path:
@@ -61,9 +71,12 @@ class LocalIndexer(Indexer):
         input_path = self.index_config.path
         if input_path.is_file():
             return [Path(s) for s in glob.glob(f"{self.index_config.path}")]
+        files = []
         if self.index_config.recursive:
-            return list(input_path.rglob("*"))
-        return list(input_path.glob("*"))
+            files.extend(list(input_path.rglob("*")))
+        else:
+            files.extend(list(input_path.glob("*")))
+        return [f for f in files if f.is_file()]
 
     def get_file_metadata(self, path: Path) -> FileDataSourceMetadata:
         stats = path.stat()
@@ -122,7 +135,6 @@ class LocalIndexer(Indexer):
             yield file_data
 
 
-@dataclass
 class LocalDownloaderConfig(DownloaderConfig):
     pass
 
@@ -130,10 +142,8 @@ class LocalDownloaderConfig(DownloaderConfig):
 @dataclass
 class LocalDownloader(Downloader):
     connector_type: str = CONNECTOR_TYPE
-    connection_config: LocalConnectionConfig = field(
-        default_factory=lambda: LocalConnectionConfig()
-    )
-    download_config: LocalDownloaderConfig = field(default_factory=lambda: LocalDownloaderConfig())
+    connection_config: LocalConnectionConfig = field(default_factory=LocalConnectionConfig)
+    download_config: LocalDownloaderConfig = field(default_factory=LocalDownloaderConfig)
 
     def get_download_path(self, file_data: FileData) -> Path:
         return Path(file_data.source_identifiers.fullpath)
@@ -144,9 +154,10 @@
         )
 
 
-@dataclass
 class LocalUploaderConfig(UploaderConfig):
-    output_dir: str = field(default="structured-output")
+    output_dir: str = Field(
+        default="structured-output", description="Local path to write partitioned output to"
+    )
 
     @property
     def output_path(self) -> Path:
@@ -160,7 +171,7 @@ class LocalUploaderConfig(UploaderConfig):
 @dataclass
 class LocalUploader(Uploader):
     connector_type: str = CONNECTOR_TYPE
-    upload_config: LocalUploaderConfig = field(default_factory=lambda: LocalUploaderConfig())
+    upload_config: LocalUploaderConfig = field(default_factory=LocalUploaderConfig)
     connection_config: LocalConnectionConfig = field(
         default_factory=lambda: LocalConnectionConfig()
     )
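The indexer change above also means directories no longer leak into the indexed file list, since results are filtered through `f.is_file()`. A small sketch of the now pydantic-typed config (the path is a placeholder; the coercion behavior is standard pydantic, not specific to this package):

```python
from pathlib import Path

from unstructured_ingest.v2.processes.connectors.local import LocalIndexerConfig

# input_path is now a Path field, so a plain string is coerced on validation
config = LocalIndexerConfig(input_path="./docs", recursive=True)
assert isinstance(config.input_path, Path)
```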
unstructured_ingest/v2/processes/connectors/milvus.py

@@ -6,8 +6,8 @@ from typing import TYPE_CHECKING, Any, Optional, Union
 
 import pandas as pd
 from dateutil import parser
+from pydantic import Field, Secret
 
-from unstructured_ingest.enhanced_dataclass import enhanced_field
 from unstructured_ingest.error import WriteError
 from unstructured_ingest.utils.data_prep import flatten_dict
 from unstructured_ingest.utils.dep_check import requires_dependencies
@@ -32,24 +32,28 @@ if TYPE_CHECKING:
 CONNECTOR_TYPE = "milvus"
 
 
-@dataclass
 class MilvusAccessConfig(AccessConfig):
-    password: Optional[str] = None
-    token: Optional[str] = None
+    password: Optional[str] = Field(default=None, description="Milvus password")
+    token: Optional[str] = Field(default=None, description="Milvus access token")
+
+
+SecretMilvusAccessConfig = Secret[MilvusAccessConfig]
 
 
-@dataclass
 class MilvusConnectionConfig(ConnectionConfig):
-    access_config: MilvusAccessConfig = enhanced_field(
-        sensitive=True, default_factory=lambda: MilvusAccessConfig()
+    access_config: SecretMilvusAccessConfig = Field(
+        default_factory=lambda: SecretMilvusAccessConfig(secret_value=MilvusAccessConfig())
     )
-    uri: Optional[str] = None
-    user: Optional[str] = None
-    db_name: Optional[str] = None
+    uri: Optional[str] = Field(
+        default=None, description="Milvus uri", examples=["http://localhost:19530"]
+    )
+    user: Optional[str] = Field(default=None, description="Milvus user")
+    db_name: Optional[str] = Field(default=None, description="Milvus database name")
 
     def get_connection_kwargs(self) -> dict[str, Any]:
-        access_config_dict = self.access_config.to_dict()
-        connection_config_dict = self.to_dict()
+        access_config = self.access_config.get_secret_value()
+        access_config_dict = access_config.dict()
+        connection_config_dict = self.dict()
         connection_config_dict.pop("access_config", None)
         connection_config_dict.update(access_config_dict)
         # Drop any that were not set explicitly
@@ -63,7 +67,6 @@ class MilvusConnectionConfig(ConnectionConfig):
         return MilvusClient(**self.get_connection_kwargs())
 
 
-@dataclass
 class MilvusUploadStagerConfig(UploadStagerConfig):
     pass
 
@@ -130,10 +133,11 @@ class MilvusUploadStager(UploadStager):
         return output_path
 
 
-@dataclass
 class MilvusUploaderConfig(UploaderConfig):
-    collection_name: str
-    num_of_processes: int = 4
+    collection_name: str = Field(description="Milvus collections to write to")
+    num_processes: int = Field(
+        default=4, description="number of processes to use when writing to support parallel writes"
+    )
 
 
 @dataclass
@@ -180,13 +184,13 @@ class MilvusUploader(Uploader):
         self.insert_results(data=data)
 
     def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
-        if self.upload_config.num_of_processes == 1:
+        if self.upload_config.num_processes == 1:
             for content in contents:
                 self.upload(content=content)
 
         else:
             with mp.Pool(
-                processes=self.upload_config.num_of_processes,
+                processes=self.upload_config.num_processes,
             ) as pool:
                 pool.map(self.upload, contents)
 
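Callers migrating from 0.0.3 should note the breaking rename of `num_of_processes` to `num_processes`. A sketch under the new names (URI and collection name are placeholders):

```python
from unstructured_ingest.v2.processes.connectors.milvus import (
    MilvusConnectionConfig,
    MilvusUploaderConfig,
)

upload_config = MilvusUploaderConfig(collection_name="elements", num_processes=4)
connection_config = MilvusConnectionConfig(uri="http://localhost:19530")

# Unset optional fields are dropped before the kwargs reach MilvusClient:
client_kwargs = connection_config.get_connection_kwargs()
```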
unstructured_ingest/v2/processes/connectors/mongodb.py

@@ -3,9 +3,9 @@ from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Optional
 
-from unstructured.__version__ import __version__ as unstructured_version
+from pydantic import Field, Secret
 
-from unstructured_ingest.enhanced_dataclass import enhanced_field
+from unstructured_ingest.__version__ import __version__ as unstructured_version
 from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
@@ -31,25 +31,28 @@ CONNECTOR_TYPE = "mongodb"
 SERVER_API_VERSION = "1"
 
 
-@dataclass
 class MongoDBAccessConfig(AccessConfig):
-    uri: Optional[str] = None
+    uri: Optional[str] = Field(default=None, description="URI to user when connecting")
+
+
+SecretMongoDBAccessConfig = Secret[MongoDBAccessConfig]
 
 
-@dataclass
 class MongoDBConnectionConfig(ConnectionConfig):
-    access_config: MongoDBAccessConfig = enhanced_field(
-        sensitive=True, default_factory=MongoDBAccessConfig
+    access_config: SecretMongoDBAccessConfig = Field(
+        default_factory=lambda: SecretMongoDBAccessConfig(secret_value=MongoDBAccessConfig())
     )
-    host: Optional[str] = None
-    database: Optional[str] = None
-    collection: Optional[str] = None
-    port: int = 27017
-    batch_size: int = 100
-    connector_type: str = CONNECTOR_TYPE
+    host: Optional[str] = Field(
+        default=None,
+        description="hostname or IP address or Unix domain socket path of a single mongod or "
+        "mongos instance to connect to, or a list of hostnames",
+    )
+    database: Optional[str] = Field(default=None, description="database name to connect to")
+    collection: Optional[str] = Field(default=None, description="collection name to connect to")
+    port: int = Field(default=27017)
+    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
 
 
-@dataclass
 class MongoDBUploadStagerConfig(UploadStagerConfig):
     pass
 
@@ -77,9 +80,8 @@ class MongoDBUploadStager(UploadStager):
         return output_path
 
 
-@dataclass
 class MongoDBUploaderConfig(UploaderConfig):
-    batch_size: int = 100
+    batch_size: int = Field(default=100, description="Number of records per batch")
 
 
 @dataclass
@@ -102,9 +104,11 @@ class MongoDBUploader(Uploader):
         from pymongo.driver_info import DriverInfo
         from pymongo.server_api import ServerApi
 
-        if self.connection_config.access_config.uri:
+        access_config = self.connection_config.access_config.get_secret_value()
+
+        if access_config.uri:
             return MongoClient(
-                self.connection_config.access_config.uri,
+                access_config.uri,
                 server_api=ServerApi(version=SERVER_API_VERSION),
                 driver=DriverInfo(name="unstructured", version=unstructured_version),
             )
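Note that `batch_size` moved off `MongoDBConnectionConfig` onto `MongoDBUploaderConfig`, and the URI now sits behind the `Secret` wrapper. A sketch with placeholder connection details:

```python
from unstructured_ingest.v2.processes.connectors.mongodb import (
    MongoDBAccessConfig,
    MongoDBConnectionConfig,
    MongoDBUploaderConfig,
)

connection_config = MongoDBConnectionConfig(
    access_config=MongoDBAccessConfig(uri="mongodb://localhost:27017"),  # placeholder
    database="ingest",
    collection="elements",
)
upload_config = MongoDBUploaderConfig(batch_size=100)

# Reading the URI back now requires an explicit unwrap:
uri = connection_config.access_config.get_secret_value().uri
```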
unstructured_ingest/v2/processes/connectors/onedrive.py

@@ -1,12 +1,12 @@
 import json
-from dataclasses import dataclass, field
+from dataclasses import dataclass
 from pathlib import Path
 from time import time
 from typing import TYPE_CHECKING, Any, Generator, Optional
 
 from dateutil import parser
+from pydantic import Field, Secret
 
-from unstructured_ingest.enhanced_dataclass import enhanced_field
 from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
@@ -35,18 +35,23 @@ CONNECTOR_TYPE = "onedrive"
 MAX_MB_SIZE = 512_000_000
 
 
-@dataclass
 class OnedriveAccessConfig(AccessConfig):
-    client_cred: str
+    client_cred: str = Field(description="Microsoft App client secret")
 
 
-@dataclass
 class OnedriveConnectionConfig(ConnectionConfig):
-    client_id: str
-    user_pname: str
-    tenant: str = field(repr=False)
-    authority_url: Optional[str] = field(repr=False, default="https://login.microsoftonline.com")
-    access_config: OnedriveAccessConfig = enhanced_field(sensitive=True)
+    client_id: str = Field(description="Microsoft app client ID")
+    user_pname: str = Field(description="User principal name, usually is your Azure AD email.")
+    tenant: str = Field(
+        repr=False, description="ID or domain name associated with your Azure AD instance"
+    )
+    authority_url: Optional[str] = Field(
+        repr=False,
+        default="https://login.microsoftonline.com",
+        examples=["https://login.microsoftonline.com"],
+        description="Authentication token provider for Microsoft apps",
+    )
+    access_config: Secret[OnedriveAccessConfig]
 
     @requires_dependencies(["msal"], extras="onedrive")
     def get_token(self):
@@ -56,7 +61,7 @@ class OnedriveConnectionConfig(ConnectionConfig):
             app = ConfidentialClientApplication(
                 authority=f"{self.authority_url}/{self.tenant}",
                 client_id=self.client_id,
-                client_credential=self.access_config.client_cred,
+                client_credential=self.access_config.get_secret_value().client_cred,
             )
             token = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])
         except ValueError as exc:
@@ -76,9 +81,8 @@ class OnedriveConnectionConfig(ConnectionConfig):
         return client
 
 
-@dataclass
 class OnedriveIndexerConfig(IndexerConfig):
-    path: Optional[str] = field(default="")
+    path: Optional[str] = Field(default="")
     recursive: bool = False
 
 
@@ -171,7 +175,6 @@ class OnedriveIndexer(Indexer):
         yield file_data
 
 
-@dataclass
 class OnedriveDownloaderConfig(DownloaderConfig):
     pass
 
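As with the other connectors, the OneDrive client secret is now `Secret`-wrapped, so reading it back requires `get_secret_value()`, which is what `get_token()` does internally. A hedged sketch with placeholder Azure AD values:

```python
from unstructured_ingest.v2.processes.connectors.onedrive import (
    OnedriveAccessConfig,
    OnedriveConnectionConfig,
)

connection_config = OnedriveConnectionConfig(
    client_id="00000000-0000-0000-0000-000000000000",  # placeholder app ID
    user_pname="user@example.com",                     # placeholder principal name
    tenant="example.onmicrosoft.com",                  # placeholder tenant
    access_config=OnedriveAccessConfig(client_cred="app-secret"),  # placeholder secret
)

# repr=False on tenant/authority_url plus the Secret wrapper keep credentials
# out of logs; acquiring a token unwraps the secret explicitly:
token = connection_config.get_token()
```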