unstructured-ingest 1.2.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (243)
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/README.md +28 -0
  4. unstructured_ingest/cli/__init__.py +0 -0
  5. unstructured_ingest/cli/base/__init__.py +4 -0
  6. unstructured_ingest/cli/base/cmd.py +269 -0
  7. unstructured_ingest/cli/base/dest.py +84 -0
  8. unstructured_ingest/cli/base/importer.py +34 -0
  9. unstructured_ingest/cli/base/src.py +75 -0
  10. unstructured_ingest/cli/cli.py +24 -0
  11. unstructured_ingest/cli/cmds.py +14 -0
  12. unstructured_ingest/cli/utils/__init__.py +0 -0
  13. unstructured_ingest/cli/utils/click.py +237 -0
  14. unstructured_ingest/cli/utils/model_conversion.py +222 -0
  15. unstructured_ingest/data_types/__init__.py +0 -0
  16. unstructured_ingest/data_types/entities.py +17 -0
  17. unstructured_ingest/data_types/file_data.py +116 -0
  18. unstructured_ingest/embed/__init__.py +0 -0
  19. unstructured_ingest/embed/azure_openai.py +63 -0
  20. unstructured_ingest/embed/bedrock.py +323 -0
  21. unstructured_ingest/embed/huggingface.py +69 -0
  22. unstructured_ingest/embed/interfaces.py +146 -0
  23. unstructured_ingest/embed/mixedbreadai.py +134 -0
  24. unstructured_ingest/embed/octoai.py +133 -0
  25. unstructured_ingest/embed/openai.py +142 -0
  26. unstructured_ingest/embed/togetherai.py +116 -0
  27. unstructured_ingest/embed/vertexai.py +109 -0
  28. unstructured_ingest/embed/voyageai.py +130 -0
  29. unstructured_ingest/error.py +156 -0
  30. unstructured_ingest/errors_v2.py +156 -0
  31. unstructured_ingest/interfaces/__init__.py +27 -0
  32. unstructured_ingest/interfaces/connector.py +56 -0
  33. unstructured_ingest/interfaces/downloader.py +90 -0
  34. unstructured_ingest/interfaces/indexer.py +29 -0
  35. unstructured_ingest/interfaces/process.py +22 -0
  36. unstructured_ingest/interfaces/processor.py +88 -0
  37. unstructured_ingest/interfaces/upload_stager.py +89 -0
  38. unstructured_ingest/interfaces/uploader.py +67 -0
  39. unstructured_ingest/logger.py +39 -0
  40. unstructured_ingest/main.py +11 -0
  41. unstructured_ingest/otel.py +128 -0
  42. unstructured_ingest/pipeline/__init__.py +0 -0
  43. unstructured_ingest/pipeline/interfaces.py +211 -0
  44. unstructured_ingest/pipeline/otel.py +32 -0
  45. unstructured_ingest/pipeline/pipeline.py +408 -0
  46. unstructured_ingest/pipeline/steps/__init__.py +0 -0
  47. unstructured_ingest/pipeline/steps/chunk.py +78 -0
  48. unstructured_ingest/pipeline/steps/download.py +206 -0
  49. unstructured_ingest/pipeline/steps/embed.py +77 -0
  50. unstructured_ingest/pipeline/steps/filter.py +35 -0
  51. unstructured_ingest/pipeline/steps/index.py +86 -0
  52. unstructured_ingest/pipeline/steps/partition.py +77 -0
  53. unstructured_ingest/pipeline/steps/stage.py +65 -0
  54. unstructured_ingest/pipeline/steps/uncompress.py +50 -0
  55. unstructured_ingest/pipeline/steps/upload.py +58 -0
  56. unstructured_ingest/processes/__init__.py +18 -0
  57. unstructured_ingest/processes/chunker.py +131 -0
  58. unstructured_ingest/processes/connector_registry.py +69 -0
  59. unstructured_ingest/processes/connectors/__init__.py +129 -0
  60. unstructured_ingest/processes/connectors/airtable.py +238 -0
  61. unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
  62. unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
  63. unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
  64. unstructured_ingest/processes/connectors/astradb.py +592 -0
  65. unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
  66. unstructured_ingest/processes/connectors/chroma.py +193 -0
  67. unstructured_ingest/processes/connectors/confluence.py +527 -0
  68. unstructured_ingest/processes/connectors/couchbase.py +336 -0
  69. unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
  70. unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
  71. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
  72. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
  73. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
  74. unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
  75. unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
  76. unstructured_ingest/processes/connectors/delta_table.py +310 -0
  77. unstructured_ingest/processes/connectors/discord.py +161 -0
  78. unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
  79. unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
  80. unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
  81. unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
  82. unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
  83. unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
  84. unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
  85. unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
  86. unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
  87. unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
  88. unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
  89. unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
  90. unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
  91. unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
  92. unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
  93. unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
  94. unstructured_ingest/processes/connectors/github.py +226 -0
  95. unstructured_ingest/processes/connectors/gitlab.py +270 -0
  96. unstructured_ingest/processes/connectors/google_drive.py +848 -0
  97. unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
  98. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
  99. unstructured_ingest/processes/connectors/jira.py +522 -0
  100. unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
  101. unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
  102. unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
  103. unstructured_ingest/processes/connectors/kafka/local.py +103 -0
  104. unstructured_ingest/processes/connectors/kdbai.py +156 -0
  105. unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
  106. unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
  107. unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
  108. unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
  109. unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
  110. unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
  111. unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
  112. unstructured_ingest/processes/connectors/local.py +227 -0
  113. unstructured_ingest/processes/connectors/milvus.py +311 -0
  114. unstructured_ingest/processes/connectors/mongodb.py +389 -0
  115. unstructured_ingest/processes/connectors/neo4j.py +534 -0
  116. unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
  117. unstructured_ingest/processes/connectors/notion/client.py +349 -0
  118. unstructured_ingest/processes/connectors/notion/connector.py +350 -0
  119. unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
  120. unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
  121. unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
  122. unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
  123. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  124. unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
  125. unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
  126. unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
  127. unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
  128. unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  129. unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  130. unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  131. unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
  132. unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
  133. unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
  134. unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
  135. unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
  136. unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
  137. unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
  138. unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
  139. unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
  140. unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
  141. unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
  142. unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  143. unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  144. unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  145. unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  146. unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
  147. unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
  148. unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
  149. unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
  150. unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  151. unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
  152. unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
  153. unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
  154. unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  155. unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
  156. unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
  157. unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
  158. unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
  159. unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
  160. unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
  161. unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
  162. unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
  163. unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
  164. unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
  165. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  166. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
  167. unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
  168. unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
  169. unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
  170. unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
  171. unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
  172. unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
  173. unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
  174. unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
  175. unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
  176. unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
  177. unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
  178. unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
  179. unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
  180. unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
  181. unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
  182. unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
  183. unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
  184. unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
  185. unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
  186. unstructured_ingest/processes/connectors/onedrive.py +485 -0
  187. unstructured_ingest/processes/connectors/outlook.py +242 -0
  188. unstructured_ingest/processes/connectors/pinecone.py +400 -0
  189. unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
  190. unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
  191. unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
  192. unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
  193. unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
  194. unstructured_ingest/processes/connectors/redisdb.py +214 -0
  195. unstructured_ingest/processes/connectors/salesforce.py +307 -0
  196. unstructured_ingest/processes/connectors/sharepoint.py +282 -0
  197. unstructured_ingest/processes/connectors/slack.py +249 -0
  198. unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
  199. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
  200. unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
  201. unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
  202. unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
  203. unstructured_ingest/processes/connectors/sql/sql.py +456 -0
  204. unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
  205. unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
  206. unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
  207. unstructured_ingest/processes/connectors/utils.py +60 -0
  208. unstructured_ingest/processes/connectors/vectara.py +348 -0
  209. unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
  210. unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
  211. unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
  212. unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
  213. unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
  214. unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
  215. unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
  216. unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
  217. unstructured_ingest/processes/embedder.py +203 -0
  218. unstructured_ingest/processes/filter.py +60 -0
  219. unstructured_ingest/processes/partitioner.py +233 -0
  220. unstructured_ingest/processes/uncompress.py +61 -0
  221. unstructured_ingest/processes/utils/__init__.py +8 -0
  222. unstructured_ingest/processes/utils/blob_storage.py +32 -0
  223. unstructured_ingest/processes/utils/logging/connector.py +365 -0
  224. unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
  225. unstructured_ingest/unstructured_api.py +140 -0
  226. unstructured_ingest/utils/__init__.py +5 -0
  227. unstructured_ingest/utils/chunking.py +56 -0
  228. unstructured_ingest/utils/compression.py +72 -0
  229. unstructured_ingest/utils/constants.py +2 -0
  230. unstructured_ingest/utils/data_prep.py +216 -0
  231. unstructured_ingest/utils/dep_check.py +78 -0
  232. unstructured_ingest/utils/filesystem.py +27 -0
  233. unstructured_ingest/utils/html.py +174 -0
  234. unstructured_ingest/utils/ndjson.py +52 -0
  235. unstructured_ingest/utils/pydantic_models.py +52 -0
  236. unstructured_ingest/utils/string_and_date_utils.py +74 -0
  237. unstructured_ingest/utils/table.py +80 -0
  238. unstructured_ingest/utils/tls.py +15 -0
  239. unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
  240. unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
  241. unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
  242. unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
  243. unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
@@ -0,0 +1,475 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import random
5
+ import shutil
6
+ import tempfile
7
+ from contextlib import contextmanager
8
+ from dataclasses import dataclass, field
9
+ from pathlib import Path
10
+ from typing import TYPE_CHECKING, Any, Generator, Optional, TypeVar
11
+ from uuid import NAMESPACE_DNS, uuid5
12
+
13
+ from pydantic import BaseModel, Field, Secret
14
+
15
+ from unstructured_ingest.data_types.file_data import (
16
+ FileData,
17
+ FileDataSourceMetadata,
18
+ SourceIdentifiers,
19
+ )
20
+ from unstructured_ingest.error import TypeError, ValueError
21
+ from unstructured_ingest.interfaces import (
22
+ AccessConfig,
23
+ ConnectionConfig,
24
+ Downloader,
25
+ DownloaderConfig,
26
+ DownloadResponse,
27
+ Indexer,
28
+ IndexerConfig,
29
+ Uploader,
30
+ UploaderConfig,
31
+ )
32
+ from unstructured_ingest.processes.connectors.fsspec.utils import sterilize_dict
33
+ from unstructured_ingest.utils.filesystem import mkdir_concurrent_safe
34
+
35
+ if TYPE_CHECKING:
36
+ from fsspec import AbstractFileSystem
37
+
38
# Connector identifier used as the default `connector_type` by the fsspec classes below.
CONNECTOR_TYPE = "fsspec"
39
+
40
+
41
class FileConfig(BaseModel):
    # Base settings for a process that targets a remote fsspec location.
    # `protocol` and `path_without_protocol` are always derived from `remote_url`
    # in `__init__`, overwriting any value the caller supplied for them.
    # (Kept as `#` comments rather than a docstring so the model's generated
    # schema description is left untouched.)
    remote_url: str = Field(description="Remote fsspec URL formatted as `protocol://dir/path`")
    protocol: str = Field(init=False)
    path_without_protocol: str = Field(init=False)
    supported_protocols: list[str] = Field(
        init=False,
        default_factory=lambda: [
            "s3",
            "s3a",
            "abfs",
            "az",
            "gs",
            "gcs",
            "box",
            "dropbox",
            "sftp",
        ],
    )

    def __init__(self, **data):
        """Derive ``protocol`` and ``path_without_protocol`` from ``remote_url``.

        Raises:
            KeyError: if ``remote_url`` is not supplied.
            ValueError: (builtin, from tuple unpacking) if ``remote_url`` has no
                ``://`` separator.
        """
        # Split on the FIRST separator only: a path that itself contains "://"
        # must stay intact instead of breaking the two-value unpacking.
        protocol, path_without_protocol = data["remote_url"].split("://", 1)
        data["protocol"] = protocol
        data["path_without_protocol"] = path_without_protocol
        super().__init__(**data)
65
+
66
+
67
class FsspecIndexerConfig(FileConfig, IndexerConfig):
    # When True the indexer walks the remote path with `find`; otherwise it
    # lists only the top level with `ls`.
    recursive: bool = False
    # Optional cap on how many of the discovered files are indexed (random sample).
    sample_n_files: Optional[int] = None
70
+
71
+
72
class FsspecAccessConfig(AccessConfig):
    # Adds no fields of its own; serves as the access-config type referenced by
    # FsspecConnectionConfig.
    pass
74
+
75
+
76
class FsspecConnectionConfig(ConnectionConfig):
    # Connection settings for fsspec-backed connectors: wraps the secret access
    # config and builds filesystem clients from it.
    access_config: Secret[FsspecAccessConfig]
    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)

    @contextmanager
    def get_client(self, protocol: str) -> Generator["AbstractFileSystem", None, None]:
        """Yield a filesystem instance for ``protocol`` built from the access config.

        No teardown is performed when the ``with`` block exits.
        """
        # Imported lazily so fsspec is only required when a client is requested.
        from fsspec import get_filesystem_class

        filesystem_cls = get_filesystem_class(protocol)
        access_kwargs = self.get_access_config()
        yield filesystem_cls(**access_kwargs)

    def wrap_error(self, e: Exception) -> Exception:
        """Return ``e`` unchanged."""
        return e
91
+
92
+
93
# Type variables bounded by the base fsspec config classes; used to annotate
# the config fields of the indexer/downloader/uploader dataclasses in this module.
FsspecIndexerConfigT = TypeVar("FsspecIndexerConfigT", bound=FsspecIndexerConfig)
FsspecConnectionConfigT = TypeVar("FsspecConnectionConfigT", bound=FsspecConnectionConfig)
95
+
96
+
97
@dataclass
class FsspecIndexer(Indexer):
    """List files at an fsspec location and emit one ``FileData`` per file.

    ``get_metadata`` raises ``NotImplementedError`` here, so ``run`` and
    ``hydrate_file_data`` only work on subclasses that implement it.
    """

    connection_config: FsspecConnectionConfigT
    index_config: FsspecIndexerConfigT
    # Plain default, matching FsspecDownloader/FsspecUploader. A pydantic
    # ``Field(...)`` inside a stdlib dataclass would make the FieldInfo object
    # itself the default value of ``connector_type``.
    connector_type: str = CONNECTOR_TYPE

    @staticmethod
    def _is_non_empty_file(file_info: dict[str, Any]) -> bool:
        """Return True for entries typed ``"file"`` whose reported size is positive."""
        # A missing or None size counts as 0 instead of raising on ``None > 0``.
        return file_info.get("type") == "file" and (file_info.get("size") or 0) > 0

    def wrap_error(self, e: Exception) -> Exception:
        """Delegate error translation to the connection config."""
        return self.connection_config.wrap_error(e=e)

    def precheck(self) -> None:
        """Validate the connection by listing the path and probing one file.

        Lists the configured path and, if at least one entry of type ``"file"``
        exists, calls ``head`` on the first one. Any exception is logged and
        re-raised through ``wrap_error``.
        """
        from fsspec import get_filesystem_class

        self.log_operation_start(
            "Connection validation",
            protocol=self.index_config.protocol,
            path=self.index_config.path_without_protocol,
        )

        try:
            fs = get_filesystem_class(self.index_config.protocol)(
                **self.connection_config.get_access_config(),
            )
            files = fs.ls(path=self.index_config.path_without_protocol, detail=True)
            valid_files = [x.get("name") for x in files if x.get("type") == "file"]
            if not valid_files:
                # Nothing to probe; a successful listing is treated as sufficient.
                self.log_operation_complete("Connection validation", count=0)
                return
            file_to_sample = valid_files[0]
            self.log_debug(f"attempting to make HEAD request for file: {file_to_sample}")
            with self.connection_config.get_client(protocol=self.index_config.protocol) as client:
                client.head(path=file_to_sample)

            self.log_connection_validated(
                connector_type=self.connector_type,
                endpoint=f"{self.index_config.protocol}://{self.index_config.path_without_protocol}",
            )

        except Exception as e:
            self.log_connection_failed(
                connector_type=self.connector_type,
                error=e,
                endpoint=f"{self.index_config.protocol}://{self.index_config.path_without_protocol}",
            )
            raise self.wrap_error(e=e)

    def get_file_info(self) -> list[dict[str, Any]]:
        """Return the info dicts of all non-empty files under the configured path.

        Uses ``ls`` (top level only) or ``find`` (recursive) depending on
        ``index_config.recursive``, then optionally samples
        ``index_config.sample_n_files`` entries.
        """
        if not self.index_config.recursive:
            # fs.ls does not walk directories
            # directories that are listed in cloud storage can cause problems
            # because they are seen as 0 byte files
            with self.connection_config.get_client(protocol=self.index_config.protocol) as client:
                files = client.ls(self.index_config.path_without_protocol, detail=True)

        else:
            # fs.find will recursively walk directories
            # "size" is a common key for all the cloud protocols with fs
            with self.connection_config.get_client(protocol=self.index_config.protocol) as client:
                found = client.find(
                    self.index_config.path_without_protocol,
                    detail=True,
                )
                files = found.values()
        filtered_files = [file for file in files if self._is_non_empty_file(file)]

        if self.index_config.sample_n_files:
            filtered_files = self.sample_n_files(filtered_files, self.index_config.sample_n_files)

        return filtered_files

    def sample_n_files(self, files: list[dict[str, Any]], n) -> list[dict[str, Any]]:
        """Return ``n`` randomly chosen entries, or all of ``files`` if there are at most ``n``."""
        if len(files) <= n:
            self.log_warning(
                f"number of files to be sampled={n} is not smaller than the number"
                f" of files found ({len(files)}). Returning all of the files as the"
                " sample."
            )
            return files

        return random.sample(files, n)

    def get_metadata(self, file_info: dict) -> FileDataSourceMetadata:
        """Build source metadata for one file entry; not implemented in this class."""
        raise NotImplementedError()

    def get_path(self, file_info: dict) -> str:
        """Return the ``"name"`` entry of a file info dict."""
        return file_info["name"]

    def sterilize_info(self, file_data: dict) -> dict:
        """Pass the raw file info through ``sterilize_dict``."""
        return sterilize_dict(data=file_data)

    def create_init_file_data(self, remote_filepath: Optional[str] = None) -> FileData:
        """Build a ``FileData`` from config alone, without any network call.

        Args:
            remote_filepath: Full remote URL; defaults to ``index_config.remote_url``.
        """
        # Create initial file data that requires no network calls and is constructed purely
        # with information that exists in the config
        remote_filepath = remote_filepath or self.index_config.remote_url
        path_without_protocol = remote_filepath.split("://")[1]
        # NOTE(review): removing the path from the full URL leaves only the
        # "protocol://" prefix, so rel_path looks like e.g. "s3://" here.
        # Confirm whether a path relative to the indexed root was intended.
        rel_path = remote_filepath.replace(path_without_protocol, "").lstrip("/")
        return FileData(
            identifier=str(uuid5(NAMESPACE_DNS, remote_filepath)),
            connector_type=self.connector_type,
            display_name=remote_filepath,
            source_identifiers=SourceIdentifiers(
                filename=Path(remote_filepath).name,
                rel_path=rel_path or None,
                fullpath=remote_filepath,
            ),
            metadata=FileDataSourceMetadata(url=remote_filepath),
        )

    def hydrate_file_data(self, init_file_data: FileData) -> None:
        """Attach fetched metadata to ``init_file_data`` in place.

        Lists ``index_config.path_without_protocol`` and requires exactly one
        non-empty file there.

        Raises:
            ValueError: if the listing contains no non-empty file, or more than one.
        """
        # Get file info
        with self.connection_config.get_client(protocol=self.index_config.protocol) as client:
            files = client.ls(self.index_config.path_without_protocol, detail=True)
        filtered_files = [file for file in files if self._is_non_empty_file(file)]
        if not filtered_files:
            raise ValueError(f"{init_file_data} did not reference any valid file")
        if len(filtered_files) > 1:
            raise ValueError(f"{init_file_data} referenced more than one file")
        file_info = filtered_files[0]
        init_file_data.additional_metadata = self.get_metadata(file_info=file_info)

    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
        """Yield a ``FileData`` for every file returned by ``get_file_info``."""
        self.log_indexing_start(f"{self.connector_type} files")

        files = self.get_file_info()
        total_files = len(files)

        self.log_operation_start("File indexing", total_files=total_files)

        for i, file_info in enumerate(files):
            file_path = self.get_path(file_info=file_info)

            # Only log progress for larger operations
            if total_files > 5:
                self.log_progress(
                    current=i + 1, total=total_files, item_type="files", operation="Indexing"
                )

            # Strip only the FIRST occurrence of the indexed root: replacing every
            # occurrence would also eat matching text further down the path
            # (root "a" + "a/banana/a.txt" used to become "bnn/.txt").
            # Note: we remove any remaining leading slashes (Box introduces these)
            # to get a valid relative path
            rel_path = file_path.replace(
                self.index_config.path_without_protocol, "", 1
            ).lstrip("/")

            additional_metadata = self.sterilize_info(file_data=file_info)
            additional_metadata["original_file_path"] = file_path
            yield FileData(
                identifier=str(uuid5(NAMESPACE_DNS, file_path)),
                connector_type=self.connector_type,
                source_identifiers=SourceIdentifiers(
                    filename=Path(file_path).name,
                    rel_path=rel_path or None,
                    fullpath=file_path,
                ),
                metadata=self.get_metadata(file_info=file_info),
                additional_metadata=additional_metadata,
                display_name=file_path,
            )

        self.log_indexing_complete(f"{self.connector_type} files", total_files)
257
+
258
+
259
class FsspecDownloaderConfig(DownloaderConfig):
    # Adds no fields of its own; used as the default `download_config` of
    # FsspecDownloader.
    pass
261
+
262
+
263
# Type variable bounded by the base downloader config; annotates
# FsspecDownloader.download_config.
FsspecDownloaderConfigT = TypeVar("FsspecDownloaderConfigT", bound=FsspecDownloaderConfig)
264
+
265
+
266
@dataclass
class FsspecDownloader(Downloader):
    """Download a single remote file through an fsspec filesystem client."""

    # Prefix of the per-file temporary directory created under ``download_dir``.
    TEMP_DIR_PREFIX = "unstructured_"

    protocol: str
    connection_config: FsspecConnectionConfigT
    connector_type: str = CONNECTOR_TYPE
    download_config: Optional[FsspecDownloaderConfigT] = field(
        default_factory=lambda: FsspecDownloaderConfig()
    )

    def get_download_path(self, file_data: FileData) -> Optional[Path]:
        """Return a fresh local path for ``file_data``, or None without a filename.

        Each call creates a new temporary directory under ``download_dir`` and
        returns ``<temp_dir>/<filename>``.
        """
        source_identifiers = file_data.source_identifiers
        if source_identifiers is None or not source_identifiers.filename:
            return None

        mkdir_concurrent_safe(self.download_dir)

        temp_dir = tempfile.mkdtemp(prefix=self.TEMP_DIR_PREFIX, dir=self.download_dir)
        return Path(temp_dir) / source_identifiers.filename

    def _prepare_download_path(self, file_data: FileData) -> Path:
        """Resolve the local target path and make sure its parent directory exists.

        Raises:
            ValueError: if ``get_download_path`` cannot produce a path, instead of
                failing later with an AttributeError on ``None.parent``.
        """
        download_path = self.get_download_path(file_data=file_data)
        if download_path is None:
            raise ValueError(
                f"cannot determine a download path for file {file_data.identifier}: "
                "source identifiers with a filename are required"
            )
        mkdir_concurrent_safe(download_path.parent)
        return download_path

    def is_async(self) -> bool:
        """Report the ``async_impl`` flag of the filesystem client."""
        with self.connection_config.get_client(protocol=self.protocol) as client:
            return client.async_impl

    def handle_directory_download(self, lpath: Path) -> None:
        """Flatten ``lpath`` into a file if the download produced a directory.

        Raises:
            ValueError: if the directory holds no file, or more than one.
        """
        # If the object's name contains certain characters (i.e. '?'), it
        # gets downloaded into a new directory of the same name. This
        # reconciles that with what is expected, which is to download it
        # as a file that is not within a directory.
        if not lpath.is_dir():
            return
        desired_name = lpath.name
        files_in_dir = [file for file in lpath.iterdir() if file.is_file()]
        if not files_in_dir:
            raise ValueError(f"no files in {lpath}")
        if len(files_in_dir) > 1:
            raise ValueError(
                "Multiple files in {}: {}".format(lpath, ", ".join(str(f) for f in files_in_dir))
            )
        file = files_in_dir[0]
        # Stage the file outside lpath so the directory can be removed before the
        # file is moved back under the directory's own name.
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_location = os.path.join(temp_dir, desired_name)
            shutil.copyfile(src=file, dst=temp_location)
            shutil.rmtree(lpath)
            shutil.move(src=temp_location, dst=lpath)

    def wrap_error(self, e: Exception) -> Exception:
        """Delegate error translation to the connection config."""
        return self.connection_config.wrap_error(e=e)

    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        """Download the file referenced by ``file_data`` and return the response.

        The remote path is read from ``additional_metadata["original_file_path"]``.
        Failures are logged and re-raised through ``wrap_error``.
        """
        download_path = self._prepare_download_path(file_data=file_data)

        rpath = file_data.additional_metadata["original_file_path"]
        file_size = file_data.metadata.filesize_bytes
        self.log_download_start(file_path=rpath, file_id=file_data.identifier, file_size=file_size)

        try:
            with self.connection_config.get_client(protocol=self.protocol) as client:
                client.get_file(rpath=rpath, lpath=download_path.as_posix())
            self.handle_directory_download(lpath=download_path)

        except Exception as e:
            self.log_error(
                "File download failed",
                error=e,
                context={"file_path": rpath, "file_id": file_data.identifier},
            )
            raise self.wrap_error(e=e)

        self.log_download_complete(
            file_path=rpath,
            file_id=file_data.identifier,
            download_path=str(download_path),
        )

        return self.generate_download_response(file_data=file_data, download_path=download_path)

    # NOTE(review): FsspecUploader names its coroutine `run_async`; confirm which
    # name the pipeline invokes for downloaders, and that `client.get_file` is
    # awaitable for the filesystems used here.
    async def async_run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        """Coroutine variant of ``run`` that awaits ``client.get_file``."""
        download_path = self._prepare_download_path(file_data=file_data)
        rpath = file_data.additional_metadata["original_file_path"]
        file_size = file_data.metadata.filesize_bytes
        self.log_download_start(file_path=rpath, file_id=file_data.identifier, file_size=file_size)

        try:
            with self.connection_config.get_client(protocol=self.protocol) as client:
                await client.get_file(rpath=rpath, lpath=download_path.as_posix())
            self.handle_directory_download(lpath=download_path)
        except Exception as e:
            self.log_error(
                "File download failed",
                error=e,
                context={"file_path": rpath, "file_id": file_data.identifier},
            )
            raise self.wrap_error(e=e)

        self.log_download_complete(
            file_path=rpath,
            file_id=file_data.identifier,
            download_path=str(download_path),
        )

        return self.generate_download_response(file_data=file_data, download_path=download_path)
375
+
376
+
377
class FsspecUploaderConfig(FileConfig, UploaderConfig):
    # Combines the remote-URL fields of FileConfig with UploaderConfig; adds no
    # fields of its own.
    pass
379
+
380
+
381
# Type variable bounded by the base uploader config; annotates
# FsspecUploader.upload_config.
FsspecUploaderConfigT = TypeVar("FsspecUploaderConfigT", bound=FsspecUploaderConfig)
382
+
383
+
384
@dataclass
class FsspecUploader(Uploader):
    """Upload local files to the remote location described by ``upload_config``."""

    connector_type: str = CONNECTOR_TYPE
    # Declared with a None default only to satisfy dataclass field ordering;
    # __post_init__ rejects a missing value.
    upload_config: FsspecUploaderConfigT = field(default=None)
    connection_config: FsspecConnectionConfigT

    def is_async(self) -> bool:
        """Report the ``async_impl`` flag of the filesystem client."""
        protocol = self.upload_config.protocol
        with self.connection_config.get_client(protocol=protocol) as client:
            return client.async_impl

    @property
    def fs(self) -> "AbstractFileSystem":
        """Build a new filesystem instance for the upload protocol on each access."""
        from fsspec import get_filesystem_class

        if self.connection_config:
            fs_kwargs = self.connection_config.get_access_config()
        else:
            fs_kwargs = {}
        filesystem_cls = get_filesystem_class(self.upload_config.protocol)
        return filesystem_cls(**fs_kwargs)

    def __post_init__(self):
        """Run the parent hook, then insist that ``upload_config`` was provided."""
        super().__post_init__()
        # TODO: Consider using `kw_only` instead
        if self.upload_config:
            return
        raise TypeError(
            f"{self.__class__.__name__}.__init__() "
            f"missing 1 required positional argument: 'upload_config'"
        )

    def wrap_error(self, e: Exception) -> Exception:
        """Delegate error translation to the connection config."""
        return self.connection_config.wrap_error(e=e)

    def precheck(self) -> None:
        """Validate write access by writing an empty ``_empty`` object to the target path.

        Failures are logged and re-raised through ``wrap_error``.
        """
        from fsspec import get_filesystem_class

        self.log_operation_start("Connection validation", protocol=self.upload_config.protocol)
        endpoint = f"{self.upload_config.protocol}://{self.upload_config.path_without_protocol}"

        try:
            filesystem_cls = get_filesystem_class(self.upload_config.protocol)
            filesystem = filesystem_cls(**self.connection_config.get_access_config())
            probe_path = Path(self.upload_config.path_without_protocol) / "_empty"
            filesystem.write_bytes(path=probe_path.as_posix(), value=b"")
        except Exception as e:
            self.log_connection_failed(
                connector_type=self.connector_type,
                error=e,
                endpoint=endpoint,
            )
            raise self.wrap_error(e=e)
        self.log_connection_validated(connector_type=self.connector_type, endpoint=endpoint)

    def get_upload_path(self, file_data: FileData) -> Path:
        """Return the remote destination: target root + relative path + ``.json`` suffix."""
        root = Path(self.upload_config.path_without_protocol)
        # Leading slashes are stripped so the join cannot discard the root.
        relative = file_data.source_identifiers.relative_path.lstrip("/")
        target = root / relative
        return target.parent / f"{target.name}.json"

    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        """Upload the local file at ``path`` to the destination derived from ``file_data``."""
        local_path = str(path.resolve())
        destination = self.get_upload_path(file_data=file_data).as_posix()
        self.log_upload_start(file_path=local_path, destination=destination)
        try:
            with self.connection_config.get_client(protocol=self.upload_config.protocol) as client:
                client.upload(lpath=local_path, rpath=destination)
        except Exception as e:
            self.log_error(
                "File upload failed",
                error=e,
                context={"file_path": local_path, "destination": destination},
            )
            raise self.wrap_error(e=e)
        self.log_upload_complete(file_path=local_path, destination=destination)

    async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        """Coroutine variant of ``run``; performs the same upload call without awaiting it."""
        local_path = str(path.resolve())
        destination = self.get_upload_path(file_data=file_data).as_posix()
        self.log_upload_start(file_path=local_path, destination=destination)
        try:
            with self.connection_config.get_client(protocol=self.upload_config.protocol) as client:
                client.upload(lpath=local_path, rpath=destination)
        except Exception as e:
            self.log_error(
                "File upload failed",
                error=e,
                context={"file_path": local_path, "destination": destination},
            )
            raise self.wrap_error(e=e)
        self.log_upload_complete(file_path=local_path, destination=destination)
@@ -0,0 +1,203 @@
1
+ from __future__ import annotations
2
+
3
+ from contextlib import contextmanager
4
+ from dataclasses import dataclass, field
5
+ from pathlib import Path
6
+ from time import time
7
+ from typing import TYPE_CHECKING, Any, Generator, Optional, Union
8
+
9
+ from dateutil import parser
10
+ from pydantic import Field, Secret
11
+
12
+ from unstructured_ingest.data_types.file_data import FileDataSourceMetadata
13
+ from unstructured_ingest.error import ProviderError, UserError, ValueError
14
+ from unstructured_ingest.logger import logger
15
+ from unstructured_ingest.processes.connector_registry import (
16
+ DestinationRegistryEntry,
17
+ SourceRegistryEntry,
18
+ )
19
+ from unstructured_ingest.processes.connectors.fsspec.fsspec import (
20
+ FsspecAccessConfig,
21
+ FsspecConnectionConfig,
22
+ FsspecDownloader,
23
+ FsspecDownloaderConfig,
24
+ FsspecIndexer,
25
+ FsspecIndexerConfig,
26
+ FsspecUploader,
27
+ FsspecUploaderConfig,
28
+ )
29
+ from unstructured_ingest.processes.utils.blob_storage import (
30
+ BlobStoreUploadStager,
31
+ BlobStoreUploadStagerConfig,
32
+ )
33
+ from unstructured_ingest.utils.dep_check import requires_dependencies
34
+ from unstructured_ingest.utils.string_and_date_utils import json_to_dict
35
+
36
+ if TYPE_CHECKING:
37
+ from gcsfs import GCSFileSystem
38
+
39
+ CONNECTOR_TYPE = "gcs"
40
+
41
+
42
class GcsIndexerConfig(FsspecIndexerConfig):
    # Indexer settings for the GCS connector. Nothing GCS-specific is declared
    # here; every option is inherited from FsspecIndexerConfig.
    ...
44
+
45
+
46
+ service_account_key_description = """
47
+ Options:
48
+ - ``None``, GCSFS will attempt to guess your credentials in the
49
+ following order: gcloud CLI default, gcsfs cached token, google compute
50
+ metadata service, anonymous.
51
+ - ``'google_default'``, your default gcloud credentials will be used,
52
+ which are typically established by doing ``gcloud login`` in a terminal.
53
+ - ``'cache'``, credentials from previously successful gcsfs
54
+ authentication will be used (use this after "browser" auth succeeded)
55
+ - ``'anon'``, no authentication is performed, and you can only
56
+ access data which is accessible to allUsers (in this case, the project and
57
+ access level parameters are meaningless)
58
+ - ``'browser'``, you get an access code with which you can
59
+ authenticate via a specially provided URL
60
+ - if ``'cloud'``, we assume we are running within google compute
61
+ or google container engine, and query the internal metadata directly for
62
+ a token.
63
+ - you may supply a token generated by the
64
+ [gcloud](https://cloud.google.com/sdk/docs/)
65
+ utility; this is either a python dictionary or the name of a file
66
+ containing the JSON returned by logging in with the gcloud CLI tool.
67
+ """
68
+
69
+
70
class GcsAccessConfig(FsspecAccessConfig):
    # Credentials for the GCS connector. The caller supplies
    # ``service_account_key``; ``token`` is derived from it in
    # ``model_post_init``.
    service_account_key: Optional[str] = Field(
        default=None, description=service_account_key_description
    )
    # Derived value: one of the auth keywords, a parsed credentials dict, or a
    # path to a credentials file. Stays None when no key was given.
    token: Union[str, dict, None] = Field(init=False, default=None)

    def model_post_init(self, __context: Any) -> None:
        """Derive ``token`` from ``service_account_key``.

        The key is interpreted, in order, as:

        1. empty / ``None``: ``token`` is left as ``None``;
        2. one of the auth keywords (``google_default``, ``cache``, ``anon``,
           ``browser``, ``cloud``): stored unchanged;
        3. a string that ``json_to_dict`` turns into a dict: the dict is stored;
        4. a path to an existing file: the path string is stored.

        Raises:
            ValueError: (the ``ValueError`` imported from
                ``unstructured_ingest.error``) when the key matches none of
                the cases above.
        """
        allowed_auth_values = ("google_default", "cache", "anon", "browser", "cloud")
        key = self.service_account_key

        # Case: null value
        if not key:
            return

        # Case: one of auth constants
        if key in allowed_auth_values:
            self.token = key
            return

        # Case: token as json (parsed once and reused)
        parsed_key = json_to_dict(key)
        if isinstance(parsed_key, dict):
            self.token = parsed_key
            return

        # Case: path to token. A long or otherwise unusable string (for
        # example malformed JSON) can make the filesystem probe itself fail
        # with OSError; treat that as "not a file" so the caller gets the
        # intended invalid-token error below.
        try:
            is_key_file = Path(key).is_file()
        except OSError:
            is_key_file = False
        if is_key_file:
            self.token = key
            return

        raise ValueError("Invalid auth token value")
99
+
100
+
101
class GcsConnectionConfig(FsspecConnectionConfig):
    # Connection settings for the GCS connector.
    # Declared with pydantic's ``Field`` like the other fields of this model
    # (previously ``dataclasses.field``).
    supported_protocols: list[str] = Field(default_factory=lambda: ["gs", "gcs"], init=False)
    access_config: Secret[GcsAccessConfig] = Field(default=GcsAccessConfig(), validate_default=True)
    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)

    @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
    @contextmanager
    def get_client(self, protocol: str) -> Generator["GCSFileSystem", None, None]:
        """Yield the filesystem client produced by the parent ``get_client``.

        This override only adds the ``gcsfs``/``fsspec`` dependency check and
        a narrower return annotation.
        """
        with super().get_client(protocol=protocol) as client:
            yield client

    def wrap_error(self, e: Exception) -> Exception:
        """Map an exception from a GCS operation onto the ingest error types.

        Recognised failures are *raised* from inside this method as
        ``UserError`` / ``ProviderError``. Anything else is logged and the
        original exception is *returned*, so callers written as
        ``raise self.wrap_error(e=e)`` work in both cases.

        Args:
            e: The exception caught around the GCS call.

        Returns:
            ``e`` itself when it matches none of the recognised cases.

        Raises:
            UserError: for ``FileNotFoundError``, an ``OSError`` mentioning
                "Forbidden", a ``ValueError`` mentioning "Bad Request", or an
                ``HttpError`` with a 4xx code.
            ProviderError: for an ``HttpError`` with a code of 500 or above.
        """
        # https://github.com/fsspec/gcsfs/blob/main/gcsfs/retry.py#L79
        # Imported here rather than at module level; presumably because gcsfs
        # is an optional dependency (confirm).
        from gcsfs.retry import HttpError

        if isinstance(e, FileNotFoundError):
            raise UserError(f"File not found: {e}")
        if isinstance(e, OSError) and "Forbidden" in str(e):
            raise UserError(e)
        # NOTE(review): ``ValueError`` in this module is the name imported from
        # unstructured_ingest.error, which shadows the builtin. Confirm this
        # check still matches the exception gcsfs raises for a bad request.
        if isinstance(e, ValueError) and "Bad Request" in str(e):
            raise UserError(e)
        if isinstance(e, HttpError) and (status_code := e.code):
            # Fall back to the exception object when it carries no message.
            message = e.message or e
            if 400 <= status_code < 500:
                raise UserError(message)
            if status_code >= 500:
                raise ProviderError(message)
        # Unrecognised failure (including HttpError codes below 400): log it
        # and hand the original exception back to the caller.
        logger.error(f"({type(e)} from gcs): {e}", exc_info=True)
        return e
130
+
131
+
132
@dataclass
class GcsIndexer(FsspecIndexer):
    """Indexer for the GCS connector, built on ``FsspecIndexer``."""

    connection_config: GcsConnectionConfig
    index_config: GcsIndexerConfig
    connector_type: str = CONNECTOR_TYPE

    def get_metadata(self, file_info: dict) -> FileDataSourceMetadata:
        """Convert one file-info dict into ``FileDataSourceMetadata``.

        Args:
            file_info: Listing entry for a single object. ``name`` is
                required; ``updated``, ``timeCreated``, ``size``, ``etag``
                and ``id`` are read when present.

        Returns:
            Metadata whose created/modified dates are epoch-second strings
            (``None`` when the corresponding key is missing or empty),
            stamped with the current time as ``date_processed``.

        Raises:
            KeyError: if ``file_info`` has no ``name`` entry.
        """

        def as_epoch_string(raw_value):
            # Missing or empty timestamps stay None; anything else is parsed
            # with dateutil and rendered as a stringified epoch timestamp.
            if not raw_value:
                return None
            return str(parser.parse(raw_value).timestamp())

        object_path = file_info["name"]
        modified_epoch = as_epoch_string(file_info.get("updated"))
        created_epoch = as_epoch_string(file_info.get("timeCreated"))
        size_in_bytes = file_info.get("size")
        etag = file_info.get("etag")

        locator = {
            "protocol": self.index_config.protocol,
            "remote_file_path": self.index_config.remote_url,
            "file_id": file_info.get("id"),
        }
        return FileDataSourceMetadata(
            date_created=created_epoch,
            date_modified=modified_epoch,
            date_processed=str(time()),
            version=etag,
            url=f"{self.index_config.protocol}://{object_path}",
            record_locator=locator,
            filesize_bytes=size_in_bytes,
        )
164
+
165
+
166
class GcsDownloaderConfig(FsspecDownloaderConfig):
    # Downloader settings for the GCS connector. Nothing GCS-specific is
    # declared here; every option is inherited from FsspecDownloaderConfig.
    ...
168
+
169
+
170
@dataclass
class GcsDownloader(FsspecDownloader):
    """Downloader for the GCS connector, built on ``FsspecDownloader``.

    Only field declarations live here; no methods are overridden in this class.
    """

    # NOTE: field order is significant for the generated dataclass __init__;
    # keep these declarations in this order.
    protocol: str = "gcs"
    connection_config: GcsConnectionConfig
    connector_type: str = CONNECTOR_TYPE
    # A fresh GcsDownloaderConfig is created per instance when none is given.
    download_config: Optional[GcsDownloaderConfig] = field(default_factory=GcsDownloaderConfig)
176
+
177
+
178
class GcsUploaderConfig(FsspecUploaderConfig):
    # Uploader settings for the GCS connector. Nothing GCS-specific is
    # declared here; every option is inherited from FsspecUploaderConfig.
    ...
180
+
181
+
182
@dataclass
class GcsUploader(FsspecUploader):
    """Uploader for the GCS connector, built on ``FsspecUploader``.

    Only field declarations live here; no methods are overridden in this class.
    """

    # NOTE: field order is significant for the generated dataclass __init__;
    # keep these declarations in this order.
    connector_type: str = CONNECTOR_TYPE
    connection_config: GcsConnectionConfig
    # Defaults to None despite the non-optional annotation.
    upload_config: GcsUploaderConfig = None
187
+
188
+
189
# Source-side registration for GCS: bundles the connection config with the
# indexer and downloader classes and their config types.
gcs_source_entry = SourceRegistryEntry(
    connection_config=GcsConnectionConfig,
    indexer_config=GcsIndexerConfig,
    indexer=GcsIndexer,
    downloader_config=GcsDownloaderConfig,
    downloader=GcsDownloader,
)
196
+
197
# Destination-side registration for GCS: bundles the connection config with
# the uploader and the blob-store upload stager, plus their config types.
gcs_destination_entry = DestinationRegistryEntry(
    connection_config=GcsConnectionConfig,
    uploader_config=GcsUploaderConfig,
    uploader=GcsUploader,
    upload_stager_config=BlobStoreUploadStagerConfig,
    upload_stager=BlobStoreUploadStager,
)
203
+ )