unstructured-ingest 0.0.2.dev0__py3-none-any.whl → 0.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest has been flagged as potentially problematic.

Files changed (125)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/cli.py +6 -1
  3. unstructured_ingest/cli/cmds/__init__.py +4 -4
  4. unstructured_ingest/cli/cmds/{astra.py → astradb.py} +9 -9
  5. unstructured_ingest/cli/interfaces.py +13 -6
  6. unstructured_ingest/connector/{astra.py → astradb.py} +29 -29
  7. unstructured_ingest/connector/biomed.py +12 -5
  8. unstructured_ingest/connector/confluence.py +3 -3
  9. unstructured_ingest/connector/github.py +3 -2
  10. unstructured_ingest/connector/google_drive.py +1 -2
  11. unstructured_ingest/connector/mongodb.py +1 -2
  12. unstructured_ingest/connector/notion/client.py +31 -16
  13. unstructured_ingest/connector/notion/connector.py +3 -2
  14. unstructured_ingest/connector/registry.py +2 -2
  15. unstructured_ingest/connector/vectara.py +7 -2
  16. unstructured_ingest/interfaces.py +13 -9
  17. unstructured_ingest/pipeline/interfaces.py +8 -3
  18. unstructured_ingest/pipeline/reformat/chunking.py +13 -9
  19. unstructured_ingest/pipeline/reformat/embedding.py +3 -3
  20. unstructured_ingest/runner/__init__.py +2 -2
  21. unstructured_ingest/runner/{astra.py → astradb.py} +7 -7
  22. unstructured_ingest/runner/writers/__init__.py +2 -2
  23. unstructured_ingest/runner/writers/{astra.py → astradb.py} +7 -7
  24. unstructured_ingest/utils/chunking.py +45 -0
  25. unstructured_ingest/utils/dep_check.py +1 -1
  26. unstructured_ingest/utils/google_filetype.py +9 -0
  27. unstructured_ingest/v2/cli/base/cmd.py +66 -12
  28. unstructured_ingest/v2/cli/base/dest.py +21 -12
  29. unstructured_ingest/v2/cli/base/src.py +35 -21
  30. unstructured_ingest/v2/cli/cmds.py +14 -0
  31. unstructured_ingest/v2/cli/{utils.py → utils/click.py} +36 -89
  32. unstructured_ingest/v2/cli/utils/model_conversion.py +199 -0
  33. unstructured_ingest/v2/interfaces/__init__.py +2 -1
  34. unstructured_ingest/v2/interfaces/connector.py +5 -7
  35. unstructured_ingest/v2/interfaces/downloader.py +17 -8
  36. unstructured_ingest/v2/interfaces/file_data.py +13 -2
  37. unstructured_ingest/v2/interfaces/indexer.py +3 -4
  38. unstructured_ingest/v2/interfaces/process.py +3 -4
  39. unstructured_ingest/v2/interfaces/processor.py +10 -10
  40. unstructured_ingest/v2/interfaces/upload_stager.py +3 -3
  41. unstructured_ingest/v2/interfaces/uploader.py +3 -3
  42. unstructured_ingest/v2/pipeline/interfaces.py +3 -5
  43. unstructured_ingest/v2/pipeline/pipeline.py +73 -7
  44. unstructured_ingest/v2/pipeline/steps/chunk.py +5 -11
  45. unstructured_ingest/v2/pipeline/steps/download.py +90 -24
  46. unstructured_ingest/v2/pipeline/steps/embed.py +5 -11
  47. unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
  48. unstructured_ingest/v2/pipeline/steps/index.py +14 -10
  49. unstructured_ingest/v2/pipeline/steps/partition.py +5 -5
  50. unstructured_ingest/v2/pipeline/steps/stage.py +4 -7
  51. unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -6
  52. unstructured_ingest/v2/pipeline/steps/upload.py +2 -9
  53. unstructured_ingest/v2/processes/__init__.py +18 -0
  54. unstructured_ingest/v2/processes/chunker.py +74 -28
  55. unstructured_ingest/v2/processes/connector_registry.py +8 -2
  56. unstructured_ingest/v2/processes/connectors/__init__.py +13 -3
  57. unstructured_ingest/v2/processes/connectors/{astra.py → astradb.py} +53 -35
  58. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +38 -27
  59. unstructured_ingest/v2/processes/connectors/chroma.py +38 -27
  60. unstructured_ingest/v2/processes/connectors/couchbase.py +151 -0
  61. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +95 -31
  62. unstructured_ingest/v2/processes/connectors/elasticsearch.py +92 -53
  63. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +47 -16
  64. unstructured_ingest/v2/processes/connectors/fsspec/box.py +23 -13
  65. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +18 -11
  66. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +49 -61
  67. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +46 -13
  68. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +50 -20
  69. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +27 -28
  70. unstructured_ingest/v2/processes/connectors/google_drive.py +52 -42
  71. unstructured_ingest/v2/processes/connectors/local.py +36 -28
  72. unstructured_ingest/v2/processes/connectors/milvus.py +22 -18
  73. unstructured_ingest/v2/processes/connectors/mongodb.py +32 -22
  74. unstructured_ingest/v2/processes/connectors/onedrive.py +31 -16
  75. unstructured_ingest/v2/processes/connectors/opensearch.py +81 -43
  76. unstructured_ingest/v2/processes/connectors/pinecone.py +29 -23
  77. unstructured_ingest/v2/processes/connectors/salesforce.py +36 -26
  78. unstructured_ingest/v2/processes/connectors/sharepoint.py +64 -33
  79. unstructured_ingest/v2/processes/connectors/singlestore.py +11 -15
  80. unstructured_ingest/v2/processes/connectors/sql.py +52 -39
  81. unstructured_ingest/v2/processes/connectors/weaviate.py +35 -18
  82. unstructured_ingest/v2/processes/embedder.py +106 -47
  83. unstructured_ingest/v2/processes/filter.py +60 -0
  84. unstructured_ingest/v2/processes/partitioner.py +79 -33
  85. unstructured_ingest/v2/processes/uncompress.py +3 -3
  86. unstructured_ingest/v2/utils.py +45 -0
  87. unstructured_ingest-0.0.4.dist-info/METADATA +571 -0
  88. {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/RECORD +92 -116
  89. {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/WHEEL +1 -1
  90. unstructured_ingest/v2/cli/cmds/__init__.py +0 -89
  91. unstructured_ingest/v2/cli/cmds/astra.py +0 -85
  92. unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +0 -72
  93. unstructured_ingest/v2/cli/cmds/chroma.py +0 -108
  94. unstructured_ingest/v2/cli/cmds/databricks_volumes.py +0 -161
  95. unstructured_ingest/v2/cli/cmds/elasticsearch.py +0 -159
  96. unstructured_ingest/v2/cli/cmds/fsspec/azure.py +0 -84
  97. unstructured_ingest/v2/cli/cmds/fsspec/box.py +0 -58
  98. unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +0 -58
  99. unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +0 -77
  100. unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +0 -81
  101. unstructured_ingest/v2/cli/cmds/fsspec/s3.py +0 -84
  102. unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +0 -80
  103. unstructured_ingest/v2/cli/cmds/google_drive.py +0 -74
  104. unstructured_ingest/v2/cli/cmds/local.py +0 -60
  105. unstructured_ingest/v2/cli/cmds/milvus.py +0 -72
  106. unstructured_ingest/v2/cli/cmds/mongodb.py +0 -62
  107. unstructured_ingest/v2/cli/cmds/onedrive.py +0 -91
  108. unstructured_ingest/v2/cli/cmds/opensearch.py +0 -93
  109. unstructured_ingest/v2/cli/cmds/pinecone.py +0 -62
  110. unstructured_ingest/v2/cli/cmds/salesforce.py +0 -79
  111. unstructured_ingest/v2/cli/cmds/sharepoint.py +0 -112
  112. unstructured_ingest/v2/cli/cmds/singlestore.py +0 -96
  113. unstructured_ingest/v2/cli/cmds/sql.py +0 -84
  114. unstructured_ingest/v2/cli/cmds/weaviate.py +0 -100
  115. unstructured_ingest/v2/cli/configs/__init__.py +0 -6
  116. unstructured_ingest/v2/cli/configs/chunk.py +0 -89
  117. unstructured_ingest/v2/cli/configs/embed.py +0 -74
  118. unstructured_ingest/v2/cli/configs/partition.py +0 -99
  119. unstructured_ingest/v2/cli/configs/processor.py +0 -88
  120. unstructured_ingest/v2/cli/interfaces.py +0 -27
  121. unstructured_ingest/v2/pipeline/utils.py +0 -15
  122. unstructured_ingest-0.0.2.dev0.dist-info/METADATA +0 -321
  123. /unstructured_ingest/v2/cli/{cmds/fsspec → utils}/__init__.py +0 -0
  124. {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/entry_points.txt +0 -0
  125. {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/top_level.txt +0 -0
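Several astra modules are renamed to astradb in this release (entries 4, 6, 21, 23, and 57 above), so imports that use the old paths break on upgrade. A hypothetical smoke test, assuming unstructured-ingest 0.0.4 and its astra extra are installed:

import importlib

# 0.0.2.dev0 shipped unstructured_ingest.connector.astra; 0.0.4 renames the module.
astradb = importlib.import_module("unstructured_ingest.connector.astradb")
print(astradb.__name__)

The per-file diffs below (s3, sftp, google_drive, local, milvus) show the other release-wide theme: dataclass-based configs replaced with pydantic models and Secret-wrapped access configs.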
unstructured_ingest/v2/processes/connectors/fsspec/s3.py

@@ -5,11 +5,15 @@ from pathlib import Path
 from time import time
 from typing import Any, Generator, Optional
 
-from unstructured.documents.elements import DataSourceMetadata
+from pydantic import Field, Secret
 
-from unstructured_ingest.enhanced_dataclass import enhanced_field
 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, UploadContent
+from unstructured_ingest.v2.interfaces import (
+    DownloadResponse,
+    FileData,
+    FileDataSourceMetadata,
+    UploadContent,
+)
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -28,27 +32,41 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
 CONNECTOR_TYPE = "s3"
 
 
-@dataclass
 class S3IndexerConfig(FsspecIndexerConfig):
     pass
 
 
-@dataclass
 class S3AccessConfig(FsspecAccessConfig):
-    key: Optional[str] = None
-    secret: Optional[str] = None
-    token: Optional[str] = None
+    key: Optional[str] = Field(
+        default=None,
+        description="If not anonymous, use this access key ID, if specified. Takes precedence "
+        "over `aws_access_key_id` in client_kwargs.",
+    )
+    secret: Optional[str] = Field(
+        default=None, description="If not anonymous, use this secret access key, if specified."
+    )
+    token: Optional[str] = Field(
+        default=None, description="If not anonymous, use this security token, if specified."
+    )
+
+
+SecretS3AccessConfig = Secret[S3AccessConfig]
 
 
-@dataclass
 class S3ConnectionConfig(FsspecConnectionConfig):
-    supported_protocols: list[str] = field(default_factory=lambda: ["s3", "s3a"])
-    access_config: S3AccessConfig = enhanced_field(
-        sensitive=True, default_factory=lambda: S3AccessConfig()
+    supported_protocols: list[str] = field(default_factory=lambda: ["s3", "s3a"], init=False)
+    access_config: SecretS3AccessConfig = Field(
+        default_factory=lambda: SecretS3AccessConfig(secret_value=S3AccessConfig())
     )
-    endpoint_url: Optional[str] = None
-    anonymous: bool = False
-    connector_type: str = CONNECTOR_TYPE
+    endpoint_url: Optional[str] = Field(
+        default=None,
+        description="Use this endpoint_url, if specified. Needed for "
+        "connecting to non-AWS S3 buckets.",
+    )
+    anonymous: bool = Field(
+        default=False, description="Connect to s3 without local AWS credentials."
+    )
+    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
 
     def get_access_config(self) -> dict[str, Any]:
         access_configs: dict[str, Any] = {"anon": self.anonymous}
@@ -56,7 +74,9 @@ class S3ConnectionConfig(FsspecConnectionConfig):
             access_configs["endpoint_url"] = self.endpoint_url
 
         # Avoid injecting None by filtering out k,v pairs where the value is None
-        access_configs.update({k: v for k, v in self.access_config.to_dict().items() if v})
+        access_configs.update(
+            {k: v for k, v in self.access_config.get_secret_value().dict().items() if v}
+        )
         return access_configs
 
 
@@ -66,9 +86,10 @@ class S3Indexer(FsspecIndexer):
     index_config: S3IndexerConfig
     connector_type: str = CONNECTOR_TYPE
 
-    def get_metadata(self, path: str) -> DataSourceMetadata:
+    def get_metadata(self, path: str) -> FileDataSourceMetadata:
         date_created = None
         date_modified = None
+        file_size = None
         try:
             modified: Optional[datetime] = self.fs.modified(path)
             if modified:
@@ -76,6 +97,8 @@ class S3Indexer(FsspecIndexer):
                 date_modified = str(modified.timestamp())
         except NotImplementedError:
             pass
+        with contextlib.suppress(AttributeError):
+            file_size = self.fs.size(path)
 
         version = None
         info: dict[str, Any] = self.fs.info(path)
@@ -90,21 +113,25 @@
         }
         if metadata:
             record_locator["metadata"] = metadata
-        return DataSourceMetadata(
+        return FileDataSourceMetadata(
            date_created=date_created,
            date_modified=date_modified,
            date_processed=str(time()),
            version=version,
            url=f"{self.index_config.protocol}://{path}",
            record_locator=record_locator,
+            filesize_bytes=file_size,
        )
 
     @requires_dependencies(["s3fs", "fsspec"], extras="s3")
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
         return super().run(**kwargs)
 
+    @requires_dependencies(["s3fs", "fsspec"], extras="s3")
+    def precheck(self) -> None:
+        super().precheck()
+
 
-@dataclass
 class S3DownloaderConfig(FsspecDownloaderConfig):
     pass
 
@@ -125,7 +152,6 @@ class S3Downloader(FsspecDownloader):
         return await super().run_async(file_data=file_data, **kwargs)
 
 
-@dataclass
 class S3UploaderConfig(FsspecUploaderConfig):
     pass
 
@@ -136,6 +162,10 @@ class S3Uploader(FsspecUploader):
     connection_config: S3ConnectionConfig
     upload_config: S3UploaderConfig = field(default=None)
 
+    @requires_dependencies(["s3fs", "fsspec"], extras="s3")
+    def precheck(self) -> None:
+        super().precheck()
+
     @requires_dependencies(["s3fs", "fsspec"], extras="s3")
     def __post_init__(self):
         super().__post_init__()
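The s3 hunks above show the pattern repeated across connectors: @dataclass configs with enhanced_field(sensitive=True) become pydantic models whose access config is wrapped in pydantic's generic Secret. A minimal sketch of that pattern with stand-in models (not the package's own classes; assumes a pydantic 2.x release that exports the generic Secret):

from typing import Optional

from pydantic import BaseModel, Field, Secret


class DemoAccessConfig(BaseModel):
    key: Optional[str] = Field(default=None, description="Access key ID")
    secret: Optional[str] = Field(default=None, description="Secret access key")


SecretDemoAccessConfig = Secret[DemoAccessConfig]


class DemoConnectionConfig(BaseModel):
    access_config: SecretDemoAccessConfig = Field(
        default_factory=lambda: SecretDemoAccessConfig(secret_value=DemoAccessConfig())
    )
    anonymous: bool = False


conn = DemoConnectionConfig(
    access_config=SecretDemoAccessConfig(
        secret_value=DemoAccessConfig(key="demo-key", secret="demo-secret")
    )
)
print(conn)  # access_config=Secret('**********') anonymous=False
print(conn.access_config.get_secret_value().key)  # demo-key

Because the wrapped model is masked in reprs and dumps, call sites such as get_access_config() now unwrap it explicitly with get_secret_value().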
unstructured_ingest/v2/processes/connectors/fsspec/sftp.py

@@ -6,7 +6,8 @@ from pathlib import Path
 from typing import Any, Generator, Optional
 from urllib.parse import urlparse
 
-from unstructured_ingest.enhanced_dataclass import enhanced_field
+from pydantic import Field, Secret
+
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, UploadContent
 from unstructured_ingest.v2.processes.connector_registry import (
@@ -27,10 +28,10 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
 CONNECTOR_TYPE = "sftp"
 
 
-@dataclass
 class SftpIndexerConfig(FsspecIndexerConfig):
-    def __post_init__(self):
-        super().__post_init__()
+
+    def model_post_init(self, __context: Any) -> None:
+        super().model_post_init(__context)
         _, ext = os.path.splitext(self.remote_url)
         parsed_url = urlparse(self.remote_url)
         if ext:
@@ -39,21 +40,21 @@ class SftpIndexerConfig(FsspecIndexerConfig):
            self.path_without_protocol = parsed_url.path.lstrip("/")
 
 
-@dataclass
 class SftpAccessConfig(FsspecAccessConfig):
-    password: str
+    password: str = Field(description="Password for sftp connection")
 
 
-@dataclass
 class SftpConnectionConfig(FsspecConnectionConfig):
-    supported_protocols: list[str] = field(default_factory=lambda: ["sftp"])
-    access_config: SftpAccessConfig = enhanced_field(sensitive=True)
-    connector_type: str = CONNECTOR_TYPE
-    username: Optional[str] = None
-    host: Optional[str] = None
-    port: int = 22
-    look_for_keys: bool = False
-    allow_agent: bool = False
+    supported_protocols: list[str] = Field(default_factory=lambda: ["sftp"], init=False)
+    access_config: Secret[SftpAccessConfig]
+    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
+    username: str = Field(description="Username for sftp connection")
+    host: Optional[str] = Field(default=None, description="Hostname for sftp connection")
+    port: int = Field(default=22, description="Port for sftp connection")
+    look_for_keys: bool = Field(
+        default=False, description="Whether to search for private key files in ~/.ssh/"
+    )
+    allow_agent: bool = Field(default=False, description="Whether to connect to the SSH agent.")
 
     def get_access_config(self) -> dict[str, Any]:
         access_config = {
@@ -62,7 +63,7 @@ class SftpConnectionConfig(FsspecConnectionConfig):
            "port": self.port,
            "look_for_keys": self.look_for_keys,
            "allow_agent": self.allow_agent,
-            "password": self.access_config.password,
+            "password": self.access_config.get_secret_value().password,
        }
        return access_config
 
@@ -91,25 +92,20 @@ class SftpIndexer(FsspecIndexer):
            file.identifier = new_identifier
            yield file
 
+    @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
+    def precheck(self) -> None:
+        super().precheck()
+
 
-@dataclass
 class SftpDownloaderConfig(FsspecDownloaderConfig):
-    remote_url: Optional[str] = None
-
-    def __post_init__(self):
-        # TODO once python3.9 no longer supported and kw_only is allowed in dataclasses, remove:
-        if not self.remote_url:
-            raise TypeError(
-                f"{self.__class__.__name__}.__init__() "
-                f"missing 1 required positional argument: 'remote_url'"
-            )
+    remote_url: str = Field(description="Remote fsspec URL formatted as `protocol://dir/path`")
 
 
 @dataclass
 class SftpDownloader(FsspecDownloader):
     protocol: str = "sftp"
     connection_config: SftpConnectionConfig
-    connector_type: str = CONNECTOR_TYPE
+    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
     download_config: Optional[SftpDownloaderConfig] = field(default_factory=SftpDownloaderConfig)
 
     @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
@@ -127,7 +123,6 @@ class SftpDownloader(FsspecDownloader):
         return await super().run_async(file_data=file_data, **kwargs)
 
 
-@dataclass
 class SftpUploaderConfig(FsspecUploaderConfig):
     pass
 
@@ -142,6 +137,10 @@ class SftpUploader(FsspecUploader):
     def __post_init__(self):
         super().__post_init__()
 
+    @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
+    def precheck(self) -> None:
+        super().precheck()
+
     @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
     def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
unstructured_ingest/v2/processes/connectors/google_drive.py

@@ -1,23 +1,25 @@
 import io
-import os
+import json
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Generator, Optional, Union
+from typing import TYPE_CHECKING, Any, Generator, Optional
 
 from dateutil import parser
-from unstructured.documents.elements import DataSourceMetadata
-from unstructured.file_utils.google_filetype import GOOGLE_DRIVE_EXPORT_TYPES
+from pydantic import Field, Secret
 
-from unstructured_ingest.enhanced_dataclass import enhanced_field
-from unstructured_ingest.error import SourceConnectionNetworkError
+from unstructured_ingest.error import (
+    SourceConnectionError,
+    SourceConnectionNetworkError,
+)
 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.utils.string_and_date_utils import json_to_dict
+from unstructured_ingest.utils.google_filetype import GOOGLE_DRIVE_EXPORT_TYPES
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
     Downloader,
     DownloaderConfig,
     FileData,
+    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
@@ -35,46 +37,54 @@ if TYPE_CHECKING:
     from googleapiclient.http import MediaIoBaseDownload
 
 
-@dataclass
 class GoogleDriveAccessConfig(AccessConfig):
-    service_account_key: Union[str, dict]
+    service_account_key: Optional[dict] = Field(
+        default=None, description="Credentials values to use for authentication"
+    )
+    service_account_key_path: Optional[Path] = Field(
+        default=None, description="File path to credentials values to use for authentication"
+    )
+
+    def model_post_init(self, __context: Any) -> None:
+        if self.service_account_key is None and self.service_account_key_path is None:
+            raise ValueError(
+                "either service_account_key or service_account_key_path must be provided"
+            )
+
+    def get_service_account_key(self) -> dict:
+        key_data = None
+        if self.service_account_key_path:
+            with self.service_account_key_path.open() as f:
+                key_data = json.load(f)
+        if key_data and self.service_account_key:
+            if key_data == self.service_account_key:
+                return key_data
+            else:
+                raise ValueError(
+                    "service_account_key and service_account_key_path "
+                    "both provided and have different values"
+                )
+        if key_data:
+            return key_data
+        return self.service_account_key
 
 
-@dataclass
 class GoogleDriveConnectionConfig(ConnectionConfig):
-    drive_id: str
-    access_config: GoogleDriveAccessConfig = enhanced_field(sensitive=True)
+    drive_id: str = Field(description="Google Drive File or Folder ID.")
+    access_config: Secret[GoogleDriveAccessConfig]
 
     @requires_dependencies(["googleapiclient"], extras="google-drive")
     def get_files_service(self) -> "GoogleAPIResource":
-        from google.auth import default, exceptions
+        from google.auth import exceptions
         from google.oauth2 import service_account
         from googleapiclient.discovery import build
         from googleapiclient.errors import HttpError
 
-        # Service account key can be a dict or a file path(str)
-        # But the dict may come in as a string
-        if isinstance(self.access_config.service_account_key, str):
-            key_path = json_to_dict(self.access_config.service_account_key)
-        elif isinstance(self.access_config.service_account_key, dict):
-            key_path = self.access_config.service_account_key
-        else:
-            raise TypeError(
-                f"access_config.service_account_key must be "
-                f"str or dict, got: {type(self.access_config.service_account_key)}"
-            )
+        access_config = self.access_config.get_secret_value()
+        key_data = access_config.get_service_account_key()
 
         try:
-            if isinstance(key_path, dict):
-                creds = service_account.Credentials.from_service_account_info(key_path)
-            elif isinstance(key_path, str):
-                os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = key_path
-                creds, _ = default()
-            else:
-                raise ValueError(
-                    f"key path not recognized as a dictionary or a file path: "
-                    f"[{type(key_path)}] {key_path}",
-                )
+            creds = service_account.Credentials.from_service_account_info(key_data)
            service = build("drive", "v3", credentials=creds)
            return service.files()
 
@@ -84,7 +94,6 @@ class GoogleDriveConnectionConfig(ConnectionConfig):
            raise ValueError("The provided API key is invalid.")
 
 
-@dataclass
 class GoogleDriveIndexerConfig(IndexerConfig):
     extensions: Optional[list[str]] = None
     recursive: bool = False
@@ -121,6 +130,13 @@ class GoogleDriveIndexer(Indexer):
        ]
    )
 
+    def precheck(self) -> None:
+        try:
+            self.connection_config.get_files_service()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")
+
     @staticmethod
     def is_dir(record: dict) -> bool:
         return record.get("mimeType") == "application/vnd.google-apps.folder"
@@ -155,7 +171,7 @@
            connector_type=CONNECTOR_TYPE,
            identifier=file_id,
            source_identifiers=source_identifiers,
-            metadata=DataSourceMetadata(
+            metadata=FileDataSourceMetadata(
                url=url,
                version=version,
                date_created=str(date_created_dt.timestamp()),
@@ -259,7 +275,6 @@ class GoogleDriveIndexer(Indexer):
            yield f
 
 
-@dataclass
 class GoogleDriveDownloaderConfig(DownloaderConfig):
     pass
 
@@ -272,11 +287,6 @@ class GoogleDriveDownloader(Downloader):
    )
    connector_type: str = CONNECTOR_TYPE
 
-    def get_download_path(self, file_data: FileData) -> Path:
-        rel_path = file_data.source_identifiers.relative_path
-        rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path
-        return self.download_dir / Path(rel_path)
-
     @SourceConnectionNetworkError.wrap
     def _get_content(self, downloader: "MediaIoBaseDownload") -> bool:
         downloaded = False
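The reworked GoogleDriveAccessConfig replaces runtime type-sniffing of a Union[str, dict] key with two explicit fields, validated in model_post_init and reconciled in get_service_account_key. A hypothetical usage sketch, assuming unstructured-ingest[google-drive] 0.0.4 is installed and the key file path exists:

from unstructured_ingest.v2.processes.connectors.google_drive import (
    GoogleDriveAccessConfig,
    GoogleDriveConnectionConfig,
)

# model_post_init raises ValueError unless a key dict or a key path is given.
access = GoogleDriveAccessConfig(service_account_key_path="/secrets/drive-sa.json")

conn = GoogleDriveConnectionConfig(
    drive_id="my-drive-folder-id",  # hypothetical placeholder ID
    access_config=access,  # coerced into Secret[GoogleDriveAccessConfig] on validation
)
files_service = conn.get_files_service()  # builds the Drive v3 client from the resolved key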
unstructured_ingest/v2/processes/connectors/local.py

@@ -1,12 +1,11 @@
 import glob
-import itertools
 import shutil
 from dataclasses import dataclass, field
 from pathlib import Path
 from time import time
-from typing import Any, Generator, Optional
+from typing import Any, Generator
 
-from unstructured.documents.elements import DataSourceMetadata
+from pydantic import Field, Secret
 
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
@@ -15,6 +14,7 @@ from unstructured_ingest.v2.interfaces import (
     DownloaderConfig,
     DownloadResponse,
     FileData,
+    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
@@ -31,21 +31,28 @@ from unstructured_ingest.v2.processes.connector_registry import (
 CONNECTOR_TYPE = "local"
 
 
-@dataclass
 class LocalAccessConfig(AccessConfig):
     pass
 
 
-@dataclass
+SecretLocalAccessConfig = Secret[LocalAccessConfig]
+
+
 class LocalConnectionConfig(ConnectionConfig):
-    access_config: LocalAccessConfig = field(default_factory=lambda: LocalAccessConfig())
+    access_config: SecretLocalAccessConfig = Field(
+        default_factory=lambda: SecretLocalAccessConfig(secret_value=LocalAccessConfig())
+    )
 
 
-@dataclass
 class LocalIndexerConfig(IndexerConfig):
-    input_path: str
-    recursive: bool = False
-    file_glob: Optional[list[str]] = None
+    input_path: Path = Field(
+        description="Path to the location in the local file system that will be processed."
+    )
+    recursive: bool = Field(
+        default=False,
+        description="Recursively download files in their respective folders "
+        "otherwise stop at the files in provided folder level.",
+    )
 
     @property
     def path(self) -> Path:
@@ -64,16 +71,11 @@ class LocalIndexer(Indexer):
        input_path = self.index_config.path
        if input_path.is_file():
            return [Path(s) for s in glob.glob(f"{self.index_config.path}")]
-        glob_fn = input_path.rglob if self.index_config.recursive else input_path.glob
-        if not self.index_config.file_glob:
-            return list(glob_fn("*"))
-        return list(
-            itertools.chain.from_iterable(
-                glob_fn(pattern) for pattern in self.index_config.file_glob
-            )
-        )
+        if self.index_config.recursive:
+            return list(input_path.rglob("*"))
+        return list(input_path.glob("*"))
 
-    def get_file_metadata(self, path: Path) -> DataSourceMetadata:
+    def get_file_metadata(self, path: Path) -> FileDataSourceMetadata:
        stats = path.stat()
        try:
            date_modified = str(stats.st_mtime)
@@ -93,12 +95,20 @@
        except Exception as e:
            logger.warning(f"Couldn't detect file mode: {e}")
            permissions_data = None
-        return DataSourceMetadata(
+
+        try:
+            filesize_bytes = stats.st_size
+        except Exception as e:
+            logger.warning(f"Couldn't detect file size: {e}")
+            filesize_bytes = None
+
+        return FileDataSourceMetadata(
            date_modified=date_modified,
            date_created=date_created,
            date_processed=str(time()),
            permissions_data=permissions_data,
            record_locator={"path": str(path.resolve())},
+            filesize_bytes=filesize_bytes,
        )
 
    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
@@ -122,7 +132,6 @@
            yield file_data
 
 
-@dataclass
 class LocalDownloaderConfig(DownloaderConfig):
     pass
 
@@ -130,10 +139,8 @@ class LocalDownloaderConfig(DownloaderConfig):
 @dataclass
 class LocalDownloader(Downloader):
     connector_type: str = CONNECTOR_TYPE
-    connection_config: LocalConnectionConfig = field(
-        default_factory=lambda: LocalConnectionConfig()
-    )
-    download_config: LocalDownloaderConfig = field(default_factory=lambda: LocalDownloaderConfig())
+    connection_config: LocalConnectionConfig = field(default_factory=LocalConnectionConfig)
+    download_config: LocalDownloaderConfig = field(default_factory=LocalDownloaderConfig)
 
     def get_download_path(self, file_data: FileData) -> Path:
         return Path(file_data.source_identifiers.fullpath)
@@ -144,9 +151,10 @@
    )
 
 
-@dataclass
 class LocalUploaderConfig(UploaderConfig):
-    output_dir: str = field(default="structured-output")
+    output_dir: str = Field(
+        default="structured-output", description="Local path to write partitioned output to"
+    )
 
     @property
     def output_path(self) -> Path:
@@ -160,7 +168,7 @@ class LocalUploaderConfig(UploaderConfig):
 @dataclass
 class LocalUploader(Uploader):
     connector_type: str = CONNECTOR_TYPE
-    upload_config: LocalUploaderConfig = field(default_factory=lambda: LocalUploaderConfig())
+    upload_config: LocalUploaderConfig = field(default_factory=LocalUploaderConfig)
     connection_config: LocalConnectionConfig = field(
         default_factory=lambda: LocalConnectionConfig()
    )
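Note that the local connector drops the file_glob option (along with the itertools-based pattern filtering), listing is now a plain glob or rglob, and the indexer starts reporting filesize_bytes. A hypothetical run under 0.0.4, assuming unstructured-ingest is installed:

from unstructured_ingest.v2.processes.connectors.local import (
    LocalIndexer,
    LocalIndexerConfig,
)

indexer = LocalIndexer(
    index_config=LocalIndexerConfig(input_path="./docs", recursive=True)
)
for file_data in indexer.run():
    # filesize_bytes is newly populated on FileDataSourceMetadata in 0.0.4
    print(file_data.source_identifiers.fullpath, file_data.metadata.filesize_bytes)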
unstructured_ingest/v2/processes/connectors/milvus.py

@@ -6,8 +6,8 @@ from typing import TYPE_CHECKING, Any, Optional, Union
 
 import pandas as pd
 from dateutil import parser
+from pydantic import Field, Secret
 
-from unstructured_ingest.enhanced_dataclass import enhanced_field
 from unstructured_ingest.error import WriteError
 from unstructured_ingest.utils.data_prep import flatten_dict
 from unstructured_ingest.utils.dep_check import requires_dependencies
@@ -32,24 +32,28 @@ if TYPE_CHECKING:
 CONNECTOR_TYPE = "milvus"
 
 
-@dataclass
 class MilvusAccessConfig(AccessConfig):
-    password: Optional[str] = None
-    token: Optional[str] = None
+    password: Optional[str] = Field(default=None, description="Milvus password")
+    token: Optional[str] = Field(default=None, description="Milvus access token")
+
+
+SecretMilvusAccessConfig = Secret[MilvusAccessConfig]
 
 
-@dataclass
 class MilvusConnectionConfig(ConnectionConfig):
-    access_config: MilvusAccessConfig = enhanced_field(
-        sensitive=True, default_factory=lambda: MilvusAccessConfig()
+    access_config: SecretMilvusAccessConfig = Field(
+        default_factory=lambda: SecretMilvusAccessConfig(secret_value=MilvusAccessConfig())
    )
-    uri: Optional[str] = None
-    user: Optional[str] = None
-    db_name: Optional[str] = None
+    uri: Optional[str] = Field(
+        default=None, description="Milvus uri", examples=["http://localhost:19530"]
+    )
+    user: Optional[str] = Field(default=None, description="Milvus user")
+    db_name: Optional[str] = Field(default=None, description="Milvus database name")
 
     def get_connection_kwargs(self) -> dict[str, Any]:
-        access_config_dict = self.access_config.to_dict()
-        connection_config_dict = self.to_dict()
+        access_config = self.access_config.get_secret_value()
+        access_config_dict = access_config.dict()
+        connection_config_dict = self.dict()
        connection_config_dict.pop("access_config", None)
        connection_config_dict.update(access_config_dict)
        # Drop any that were not set explicitly
@@ -63,7 +67,6 @@ class MilvusConnectionConfig(ConnectionConfig):
        return MilvusClient(**self.get_connection_kwargs())
 
 
-@dataclass
 class MilvusUploadStagerConfig(UploadStagerConfig):
     pass
 
@@ -130,10 +133,11 @@ class MilvusUploadStager(UploadStager):
        return output_path
 
 
-@dataclass
 class MilvusUploaderConfig(UploaderConfig):
-    collection_name: str
-    num_of_processes: int = 4
+    collection_name: str = Field(description="Milvus collections to write to")
+    num_processes: int = Field(
+        default=4, description="number of processes to use when writing to support parallel writes"
+    )
 
 
 @dataclass
@@ -180,13 +184,13 @@ class MilvusUploader(Uploader):
        self.insert_results(data=data)
 
    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
-        if self.upload_config.num_of_processes == 1:
+        if self.upload_config.num_processes == 1:
            for content in contents:
                self.upload(content=content)
 
        else:
            with mp.Pool(
-                processes=self.upload_config.num_of_processes,
+                processes=self.upload_config.num_processes,
            ) as pool:
                pool.map(self.upload, contents)
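The num_of_processes to num_processes rename in MilvusUploaderConfig is a small breaking change for existing writer configs. A hypothetical configuration under 0.0.4, assuming unstructured-ingest[milvus] is installed:

from unstructured_ingest.v2.processes.connectors.milvus import (
    MilvusConnectionConfig,
    MilvusUploaderConfig,
)

# 0.0.2.dev0 spelled this field num_of_processes; 0.0.4 no longer recognizes the old name.
upload_config = MilvusUploaderConfig(collection_name="elements", num_processes=2)

conn = MilvusConnectionConfig(uri="http://localhost:19530", db_name="default")
# access_config is unwrapped and merged, then unset values are dropped:
print(conn.get_connection_kwargs())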