unstructured-ingest 1.2.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (243) hide show
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/README.md +28 -0
  4. unstructured_ingest/cli/__init__.py +0 -0
  5. unstructured_ingest/cli/base/__init__.py +4 -0
  6. unstructured_ingest/cli/base/cmd.py +269 -0
  7. unstructured_ingest/cli/base/dest.py +84 -0
  8. unstructured_ingest/cli/base/importer.py +34 -0
  9. unstructured_ingest/cli/base/src.py +75 -0
  10. unstructured_ingest/cli/cli.py +24 -0
  11. unstructured_ingest/cli/cmds.py +14 -0
  12. unstructured_ingest/cli/utils/__init__.py +0 -0
  13. unstructured_ingest/cli/utils/click.py +237 -0
  14. unstructured_ingest/cli/utils/model_conversion.py +222 -0
  15. unstructured_ingest/data_types/__init__.py +0 -0
  16. unstructured_ingest/data_types/entities.py +17 -0
  17. unstructured_ingest/data_types/file_data.py +116 -0
  18. unstructured_ingest/embed/__init__.py +0 -0
  19. unstructured_ingest/embed/azure_openai.py +63 -0
  20. unstructured_ingest/embed/bedrock.py +323 -0
  21. unstructured_ingest/embed/huggingface.py +69 -0
  22. unstructured_ingest/embed/interfaces.py +146 -0
  23. unstructured_ingest/embed/mixedbreadai.py +134 -0
  24. unstructured_ingest/embed/octoai.py +133 -0
  25. unstructured_ingest/embed/openai.py +142 -0
  26. unstructured_ingest/embed/togetherai.py +116 -0
  27. unstructured_ingest/embed/vertexai.py +109 -0
  28. unstructured_ingest/embed/voyageai.py +130 -0
  29. unstructured_ingest/error.py +156 -0
  30. unstructured_ingest/errors_v2.py +156 -0
  31. unstructured_ingest/interfaces/__init__.py +27 -0
  32. unstructured_ingest/interfaces/connector.py +56 -0
  33. unstructured_ingest/interfaces/downloader.py +90 -0
  34. unstructured_ingest/interfaces/indexer.py +29 -0
  35. unstructured_ingest/interfaces/process.py +22 -0
  36. unstructured_ingest/interfaces/processor.py +88 -0
  37. unstructured_ingest/interfaces/upload_stager.py +89 -0
  38. unstructured_ingest/interfaces/uploader.py +67 -0
  39. unstructured_ingest/logger.py +39 -0
  40. unstructured_ingest/main.py +11 -0
  41. unstructured_ingest/otel.py +128 -0
  42. unstructured_ingest/pipeline/__init__.py +0 -0
  43. unstructured_ingest/pipeline/interfaces.py +211 -0
  44. unstructured_ingest/pipeline/otel.py +32 -0
  45. unstructured_ingest/pipeline/pipeline.py +408 -0
  46. unstructured_ingest/pipeline/steps/__init__.py +0 -0
  47. unstructured_ingest/pipeline/steps/chunk.py +78 -0
  48. unstructured_ingest/pipeline/steps/download.py +206 -0
  49. unstructured_ingest/pipeline/steps/embed.py +77 -0
  50. unstructured_ingest/pipeline/steps/filter.py +35 -0
  51. unstructured_ingest/pipeline/steps/index.py +86 -0
  52. unstructured_ingest/pipeline/steps/partition.py +77 -0
  53. unstructured_ingest/pipeline/steps/stage.py +65 -0
  54. unstructured_ingest/pipeline/steps/uncompress.py +50 -0
  55. unstructured_ingest/pipeline/steps/upload.py +58 -0
  56. unstructured_ingest/processes/__init__.py +18 -0
  57. unstructured_ingest/processes/chunker.py +131 -0
  58. unstructured_ingest/processes/connector_registry.py +69 -0
  59. unstructured_ingest/processes/connectors/__init__.py +129 -0
  60. unstructured_ingest/processes/connectors/airtable.py +238 -0
  61. unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
  62. unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
  63. unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
  64. unstructured_ingest/processes/connectors/astradb.py +592 -0
  65. unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
  66. unstructured_ingest/processes/connectors/chroma.py +193 -0
  67. unstructured_ingest/processes/connectors/confluence.py +527 -0
  68. unstructured_ingest/processes/connectors/couchbase.py +336 -0
  69. unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
  70. unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
  71. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
  72. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
  73. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
  74. unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
  75. unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
  76. unstructured_ingest/processes/connectors/delta_table.py +310 -0
  77. unstructured_ingest/processes/connectors/discord.py +161 -0
  78. unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
  79. unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
  80. unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
  81. unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
  82. unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
  83. unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
  84. unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
  85. unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
  86. unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
  87. unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
  88. unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
  89. unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
  90. unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
  91. unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
  92. unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
  93. unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
  94. unstructured_ingest/processes/connectors/github.py +226 -0
  95. unstructured_ingest/processes/connectors/gitlab.py +270 -0
  96. unstructured_ingest/processes/connectors/google_drive.py +848 -0
  97. unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
  98. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
  99. unstructured_ingest/processes/connectors/jira.py +522 -0
  100. unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
  101. unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
  102. unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
  103. unstructured_ingest/processes/connectors/kafka/local.py +103 -0
  104. unstructured_ingest/processes/connectors/kdbai.py +156 -0
  105. unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
  106. unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
  107. unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
  108. unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
  109. unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
  110. unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
  111. unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
  112. unstructured_ingest/processes/connectors/local.py +227 -0
  113. unstructured_ingest/processes/connectors/milvus.py +311 -0
  114. unstructured_ingest/processes/connectors/mongodb.py +389 -0
  115. unstructured_ingest/processes/connectors/neo4j.py +534 -0
  116. unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
  117. unstructured_ingest/processes/connectors/notion/client.py +349 -0
  118. unstructured_ingest/processes/connectors/notion/connector.py +350 -0
  119. unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
  120. unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
  121. unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
  122. unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
  123. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  124. unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
  125. unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
  126. unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
  127. unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
  128. unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  129. unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  130. unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  131. unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
  132. unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
  133. unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
  134. unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
  135. unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
  136. unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
  137. unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
  138. unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
  139. unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
  140. unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
  141. unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
  142. unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  143. unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  144. unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  145. unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  146. unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
  147. unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
  148. unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
  149. unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
  150. unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  151. unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
  152. unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
  153. unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
  154. unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  155. unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
  156. unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
  157. unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
  158. unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
  159. unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
  160. unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
  161. unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
  162. unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
  163. unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
  164. unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
  165. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  166. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
  167. unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
  168. unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
  169. unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
  170. unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
  171. unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
  172. unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
  173. unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
  174. unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
  175. unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
  176. unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
  177. unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
  178. unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
  179. unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
  180. unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
  181. unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
  182. unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
  183. unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
  184. unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
  185. unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
  186. unstructured_ingest/processes/connectors/onedrive.py +485 -0
  187. unstructured_ingest/processes/connectors/outlook.py +242 -0
  188. unstructured_ingest/processes/connectors/pinecone.py +400 -0
  189. unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
  190. unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
  191. unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
  192. unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
  193. unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
  194. unstructured_ingest/processes/connectors/redisdb.py +214 -0
  195. unstructured_ingest/processes/connectors/salesforce.py +307 -0
  196. unstructured_ingest/processes/connectors/sharepoint.py +282 -0
  197. unstructured_ingest/processes/connectors/slack.py +249 -0
  198. unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
  199. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
  200. unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
  201. unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
  202. unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
  203. unstructured_ingest/processes/connectors/sql/sql.py +456 -0
  204. unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
  205. unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
  206. unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
  207. unstructured_ingest/processes/connectors/utils.py +60 -0
  208. unstructured_ingest/processes/connectors/vectara.py +348 -0
  209. unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
  210. unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
  211. unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
  212. unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
  213. unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
  214. unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
  215. unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
  216. unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
  217. unstructured_ingest/processes/embedder.py +203 -0
  218. unstructured_ingest/processes/filter.py +60 -0
  219. unstructured_ingest/processes/partitioner.py +233 -0
  220. unstructured_ingest/processes/uncompress.py +61 -0
  221. unstructured_ingest/processes/utils/__init__.py +8 -0
  222. unstructured_ingest/processes/utils/blob_storage.py +32 -0
  223. unstructured_ingest/processes/utils/logging/connector.py +365 -0
  224. unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
  225. unstructured_ingest/unstructured_api.py +140 -0
  226. unstructured_ingest/utils/__init__.py +5 -0
  227. unstructured_ingest/utils/chunking.py +56 -0
  228. unstructured_ingest/utils/compression.py +72 -0
  229. unstructured_ingest/utils/constants.py +2 -0
  230. unstructured_ingest/utils/data_prep.py +216 -0
  231. unstructured_ingest/utils/dep_check.py +78 -0
  232. unstructured_ingest/utils/filesystem.py +27 -0
  233. unstructured_ingest/utils/html.py +174 -0
  234. unstructured_ingest/utils/ndjson.py +52 -0
  235. unstructured_ingest/utils/pydantic_models.py +52 -0
  236. unstructured_ingest/utils/string_and_date_utils.py +74 -0
  237. unstructured_ingest/utils/table.py +80 -0
  238. unstructured_ingest/utils/tls.py +15 -0
  239. unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
  240. unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
  241. unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
  242. unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
  243. unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
@@ -0,0 +1,253 @@
1
+ import contextlib
2
+ import os
3
+ from contextlib import contextmanager
4
+ from dataclasses import dataclass, field
5
+ from time import time
6
+ from typing import TYPE_CHECKING, Any, Generator, Optional
7
+
8
+ from pydantic import Field, Secret
9
+
10
+ from unstructured_ingest.data_types.file_data import (
11
+ FileDataSourceMetadata,
12
+ )
13
+ from unstructured_ingest.error import ProviderError, UserAuthError, UserError
14
+ from unstructured_ingest.logger import logger
15
+ from unstructured_ingest.processes.connector_registry import (
16
+ DestinationRegistryEntry,
17
+ SourceRegistryEntry,
18
+ )
19
+ from unstructured_ingest.processes.connectors.fsspec.fsspec import (
20
+ FsspecAccessConfig,
21
+ FsspecConnectionConfig,
22
+ FsspecDownloader,
23
+ FsspecDownloaderConfig,
24
+ FsspecIndexer,
25
+ FsspecIndexerConfig,
26
+ FsspecUploader,
27
+ FsspecUploaderConfig,
28
+ )
29
+ from unstructured_ingest.processes.utils.blob_storage import (
30
+ BlobStoreUploadStager,
31
+ BlobStoreUploadStagerConfig,
32
+ )
33
+ from unstructured_ingest.utils.dep_check import requires_dependencies
34
+
35
+ CONNECTOR_TYPE = "s3"
36
+
37
+ # https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-keys.html#object-key-guidelines-avoid-characters # noqa
38
+ CHARACTERS_TO_AVOID = ["\\", "{", "^", "}", "%", "`", "]", '"', ">", "[", "~", "<", "#", "|"]
39
+
40
+ if TYPE_CHECKING:
41
+ from s3fs import S3FileSystem
42
+
43
+
44
class S3IndexerConfig(FsspecIndexerConfig):
    """Indexer settings for the S3 connector; adds no fields to the fsspec base."""
46
+
47
+
48
class S3AccessConfig(FsspecAccessConfig):
    """Optional explicit AWS credentials for the S3 connector.

    Every field defaults to ``None``.
    """

    # Access key ID.
    key: Optional[str] = Field(
        description=(
            "If not anonymous, use this access key ID, if specified. Takes precedence "
            "over `aws_access_key_id` in client_kwargs."
        ),
        default=None,
    )
    # Secret access key.
    secret: Optional[str] = Field(
        description="If not anonymous, use this secret access key, if specified.",
        default=None,
    )
    # Security token.
    token: Optional[str] = Field(
        description="If not anonymous, use this security token, if specified.",
        default=None,
    )
60
+
61
+
62
class S3ConnectionConfig(FsspecConnectionConfig):
    """Connection settings for the S3 connector.

    ``get_access_config`` resolves authentication in this order: explicit
    key/secret/token, then ambient credentials (opt-in through
    ``ambient_credentials`` and additionally gated by the
    ``ALLOW_AMBIENT_CREDENTIALS_S3`` environment variable), then anonymous
    access. If none applies it raises ``UserAuthError``.
    """

    # NOTE(review): this field uses dataclasses.field while the others use
    # pydantic Field - confirm this is intended.
    supported_protocols: list[str] = field(default_factory=lambda: ["s3", "s3a"], init=False)
    access_config: Secret[S3AccessConfig] = Field(default=S3AccessConfig(), validate_default=True)
    endpoint_url: Optional[str] = Field(
        default=None,
        description="Use this endpoint_url, if specified. Needed for "
        "connecting to non-AWS S3 buckets.",
    )
    anonymous: bool = Field(
        default=False, description="Connect to s3 without local AWS credentials."
    )
    ambient_credentials: bool = Field(
        default=False,
        description="Explicitly allow using ambient AWS credentials from .aws folder, "
        "environment variables, or IAM roles. Requires ALLOW_AMBIENT_CREDENTIALS_S3 environment "
        "variable to also be set to 'true' (case insensitive) for security. When False (default), "
        "only explicit credentials or anonymous access are allowed.",
    )
    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)

    def get_access_config(self) -> dict[str, Any]:
        """Build the keyword arguments describing how to authenticate.

        Returns:
            A dict that always contains ``anon`` and ``cache_regions``, plus any
            explicit credentials that are set and ``endpoint_url`` when configured.

        Raises:
            UserAuthError: if ambient credentials are requested but the
                ``ALLOW_AMBIENT_CREDENTIALS_S3`` environment variable is not
                ``true``, or if no authentication method is selected at all.
        """
        access_config = self.access_config.get_secret_value()
        has_explicit_credentials = bool(
            access_config.key or access_config.secret or access_config.token
        )

        access_configs: dict[str, Any]

        if has_explicit_credentials:
            access_configs = {"anon": False}
            # Avoid injecting None by filtering out k,v pairs where the value is None
            access_configs.update(
                {k: v for k, v in access_config.model_dump().items() if v is not None}
            )
        elif self.ambient_credentials:
            if os.getenv("ALLOW_AMBIENT_CREDENTIALS_S3", "").lower() == "true":
                logger.info(
                    "Using ambient AWS credentials (environment variables, .aws folder, IAM roles)"
                )
                access_configs = {"anon": False}
                # Don't pass explicit credentials, let s3fs/boto3 auto-detect
            else:
                # Field allows but environment doesn't - raise error for security
                raise UserAuthError(
                    "Ambient credentials requested (ambient_credentials=True) but "
                    "ALLOW_AMBIENT_CREDENTIALS_S3 environment variable is not set to 'true'. "
                )
        elif self.anonymous:
            access_configs = {"anon": True}
        else:
            # User set anonymous=False but provided no credentials and no ambient permission
            raise UserAuthError(
                "No authentication method specified. anonymous=False but no explicit credentials "
                "provided and ambient_credentials=False."
            )

        if self.endpoint_url:
            access_configs["endpoint_url"] = self.endpoint_url

        # This allows s3fs to properly follow AWS region redirects
        access_configs["cache_regions"] = True

        return access_configs

    @requires_dependencies(["s3fs", "fsspec"], extras="s3")
    @contextmanager
    def get_client(self, protocol: str) -> Generator["S3FileSystem", None, None]:
        """Yield the filesystem client created by the parent class for ``protocol``."""
        with super().get_client(protocol=protocol) as client:
            yield client

    def wrap_error(self, e: Exception) -> Exception:
        """Translate an exception into one of the connector's error types.

        The translated exception is returned, not raised. Exceptions that
        cannot be classified are logged and returned unchanged.

        Args:
            e: The exception to classify.

        Returns:
            ``UserAuthError`` for permission errors, ``UserError`` for missing
            files and 4xx responses, ``ProviderError`` for 5xx responses,
            otherwise ``e`` itself.
        """
        # s3fs maps botocore errors into python ones using mapping here:
        # https://github.com/fsspec/s3fs/blob/main/s3fs/errors.py
        if isinstance(e, PermissionError):
            return UserAuthError(e)
        if isinstance(e, FileNotFoundError):
            return UserError(f"File not found: {e}")

        # Only inspect the chained cause when it actually carries a dict-shaped
        # ``response``; any other cause would otherwise make this error handler
        # itself fail with AttributeError/KeyError and hide the original error.
        cause = getattr(e, "__cause__", None)
        error_response = getattr(cause, "response", None)
        if isinstance(error_response, dict):
            error_meta = error_response.get("ResponseMetadata") or {}
            http_code = error_meta.get("HTTPStatusCode")
            message = (error_response.get("Error") or {}).get("Message", str(e))
            if isinstance(http_code, int):
                if 400 <= http_code < 500:
                    return UserError(message)
                if http_code >= 500:
                    return ProviderError(message)

        logger.error(
            "Unhandled exception from S3 (type: %s, endpoint: %s): %s",
            type(e).__name__,
            self.endpoint_url or "default",
            e,
            exc_info=True,
        )
        return e
156
+
157
+
158
@dataclass
class S3Indexer(FsspecIndexer):
    """Indexer for the S3 connector: derives paths and metadata from listing entries."""

    connection_config: S3ConnectionConfig
    index_config: S3IndexerConfig
    connector_type: str = CONNECTOR_TYPE

    def wrap_error(self, e: Exception) -> Exception:
        """Delegate error translation to the connection config."""
        return self.connection_config.wrap_error(e=e)

    def get_path(self, file_info: dict) -> str:
        """Return the ``Key`` entry of a listing record."""
        return file_info["Key"]

    def get_metadata(self, file_info: dict) -> FileDataSourceMetadata:
        """Build source metadata for one listing record.

        Args:
            file_info: Listing record; reads ``Key`` and, when present,
                ``LastModified``, ``size``/``Size`` and ``ETag``.

        Returns:
            A ``FileDataSourceMetadata`` populated from the record.
        """
        path = file_info["Key"]

        self.log_debug("Getting metadata for S3 object", context={"file_path": path})
        self.log_file_operation("Getting metadata", file_path=path)

        # The same LastModified value feeds both the created and modified dates.
        last_modified = file_info.get("LastModified")
        modified_stamp = str(last_modified.timestamp()) if last_modified else None

        # Use lower-case "size" when it is truthy, otherwise fall back to "Size".
        file_size = file_info.get("size") or file_info.get("Size")

        # The ETag has its surrounding double quotes stripped.
        version = None
        if "ETag" in file_info:
            version = file_info["ETag"].strip('"')

        metadata: dict[str, str] = {}
        with contextlib.suppress(AttributeError):
            with self.connection_config.get_client(
                protocol=self.index_config.protocol
            ) as client:
                metadata = client.metadata(path=path)

        record_locator = {
            "protocol": self.index_config.protocol,
            "remote_file_path": self.index_config.remote_url,
        }
        if metadata:
            record_locator["metadata"] = metadata

        issue_characters = [char for char in CHARACTERS_TO_AVOID if char in path]
        if issue_characters:
            self.log_warning(
                f"File path contains characters that can cause issues with S3: {issue_characters}",
                context={"path": path, "problematic_characters": issue_characters},
            )

        return FileDataSourceMetadata(
            date_created=modified_stamp,
            date_modified=modified_stamp,
            date_processed=str(time()),
            version=version,
            url=f"{self.index_config.protocol}://{path}",
            record_locator=record_locator,
            filesize_bytes=file_size,
        )
214
+
215
+
216
class S3DownloaderConfig(FsspecDownloaderConfig):
    """Downloader settings for the S3 connector; adds no fields to the fsspec base."""
218
+
219
+
220
@dataclass
class S3Downloader(FsspecDownloader):
    """Downloader for the S3 connector; only field declarations differ from the base."""

    # Protocol name for this downloader.
    protocol: str = "s3"
    connection_config: S3ConnectionConfig
    connector_type: str = CONNECTOR_TYPE
    # A fresh default config is created per instance.
    download_config: Optional[S3DownloaderConfig] = field(default_factory=S3DownloaderConfig)
226
+
227
+
228
class S3UploaderConfig(FsspecUploaderConfig):
    """Uploader settings for the S3 connector; adds no fields to the fsspec base."""
230
+
231
+
232
@dataclass
class S3Uploader(FsspecUploader):
    """Uploader for the S3 connector; only field declarations differ from the base."""

    connector_type: str = CONNECTOR_TYPE
    connection_config: S3ConnectionConfig
    # Defaults to None despite the non-optional annotation.
    upload_config: S3UploaderConfig = None
237
+
238
+
239
# Source-side registration: bundles the S3 indexer, downloader and their configs.
s3_source_entry = SourceRegistryEntry(
    connection_config=S3ConnectionConfig,
    indexer_config=S3IndexerConfig,
    indexer=S3Indexer,
    downloader_config=S3DownloaderConfig,
    downloader=S3Downloader,
)
246
+
247
# Destination-side registration: S3 uploader plus the blob-store upload stager.
s3_destination_entry = DestinationRegistryEntry(
    connection_config=S3ConnectionConfig,
    uploader_config=S3UploaderConfig,
    uploader=S3Uploader,
    upload_stager_config=BlobStoreUploadStagerConfig,
    upload_stager=BlobStoreUploadStager,
)
@@ -0,0 +1,177 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ from contextlib import contextmanager
5
+ from dataclasses import dataclass, field
6
+ from pathlib import Path
7
+ from time import time
8
+ from typing import TYPE_CHECKING, Any, Generator, Optional
9
+ from urllib.parse import urlparse
10
+
11
+ from pydantic import Field, Secret
12
+
13
+ from unstructured_ingest.data_types.file_data import FileData, FileDataSourceMetadata
14
+ from unstructured_ingest.processes.connector_registry import (
15
+ DestinationRegistryEntry,
16
+ SourceRegistryEntry,
17
+ )
18
+ from unstructured_ingest.processes.connectors.fsspec.fsspec import (
19
+ FsspecAccessConfig,
20
+ FsspecConnectionConfig,
21
+ FsspecDownloader,
22
+ FsspecDownloaderConfig,
23
+ FsspecIndexer,
24
+ FsspecIndexerConfig,
25
+ FsspecUploader,
26
+ FsspecUploaderConfig,
27
+ )
28
+ from unstructured_ingest.processes.utils.blob_storage import (
29
+ BlobStoreUploadStager,
30
+ BlobStoreUploadStagerConfig,
31
+ )
32
+ from unstructured_ingest.utils.dep_check import requires_dependencies
33
+
34
+ if TYPE_CHECKING:
35
+ from fsspec.implementations.sftp import SFTPFileSystem
36
+
37
+ CONNECTOR_TYPE = "sftp"
38
+
39
+
40
class SftpIndexerConfig(FsspecIndexerConfig):
    """Indexer settings for the SFTP connector."""

    def model_post_init(self, __context: Any) -> None:
        """Derive ``path_without_protocol`` from ``remote_url``.

        When the URL ends in something with a file extension, the parent
        directory of the URL path is used; otherwise the URL path itself.
        Leading slashes are stripped in both cases.
        """
        super().model_post_init(__context)
        url_path = urlparse(self.remote_url).path
        # The extension check runs on the full URL string, not only its path.
        has_extension = bool(os.path.splitext(self.remote_url)[1])
        if has_extension:
            target = Path(url_path).parent.as_posix()
        else:
            target = url_path
        self.path_without_protocol = target.lstrip("/")
49
+
50
+
51
class SftpAccessConfig(FsspecAccessConfig):
    """Secret settings for the SFTP connector."""

    # Required: declared without a default.
    password: str = Field(description="Password for sftp connection")
53
+
54
+
55
class SftpConnectionConfig(FsspecConnectionConfig):
    """Connection settings for the SFTP connector (password authentication)."""

    supported_protocols: list[str] = Field(default_factory=lambda: ["sftp"], init=False)
    access_config: Secret[SftpAccessConfig]
    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
    username: str = Field(description="Username for sftp connection")
    host: Optional[str] = Field(default=None, description="Hostname for sftp connection")
    port: int = Field(default=22, description="Port for sftp connection")
    look_for_keys: bool = Field(
        default=False, description="Whether to search for private key files in ~/.ssh/"
    )
    allow_agent: bool = Field(default=False, description="Whether to connect to the SSH agent.")

    def get_access_config(self) -> dict[str, Any]:
        """Return the keyword arguments used to construct the SFTP filesystem.

        The password is unwrapped from the secret ``access_config``.
        """
        access_config = {
            "username": self.username,
            "host": self.host,
            "port": self.port,
            "look_for_keys": self.look_for_keys,
            "allow_agent": self.allow_agent,
            "password": self.access_config.get_secret_value().password,
        }
        return access_config

    @contextmanager
    @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
    def get_client(self, protocol: str) -> Generator["SFTPFileSystem", None, None]:
        """Yield a filesystem for ``protocol`` and close its SSH client on exit.

        Args:
            protocol: Name passed to fsspec's ``get_filesystem_class``.

        Yields:
            The filesystem instance built from ``get_access_config()``.
        """
        # The paramiko.SSHClient() client that's opened by the SFTPFileSystem
        # never gets closed so explicitly adding that as part of this context manager
        from fsspec import get_filesystem_class

        client: "SFTPFileSystem" = get_filesystem_class(protocol)(
            **self.get_access_config(),
        )
        try:
            yield client
        finally:
            # Close even when the body of the ``with`` block raises; otherwise
            # the SSH connection would leak on every failed operation.
            client.client.close()
90
+
91
+
92
@dataclass
class SftpIndexer(FsspecIndexer):
    """Indexer for the SFTP connector."""

    connection_config: SftpConnectionConfig
    index_config: SftpIndexerConfig
    connector_type: str = CONNECTOR_TYPE

    def __post_init__(self):
        """Let host/port found in the remote URL override the connection config."""
        url = urlparse(self.index_config.remote_url)
        config = self.connection_config
        config.host = url.hostname or config.host
        config.port = url.port or config.port

    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
        """Yield the parent's file data with identifiers prefixed by ``sftp://host:port/``."""
        for file in super().run(**kwargs):
            host = self.connection_config.host
            port = self.connection_config.port
            file.identifier = f"sftp://{host}:{port}/{file.identifier}"
            yield file

    def get_metadata(self, file_info: dict) -> FileDataSourceMetadata:
        """Build source metadata for one listing record.

        Args:
            file_info: Listing record; reads ``name`` and, when present,
                ``time``, ``mtime`` and ``size``.
        """
        path = file_info["name"]

        date_created = None
        if "time" in file_info:
            date_created = str(file_info["time"].timestamp())

        date_modified = None
        if "mtime" in file_info:
            date_modified = str(file_info["mtime"].timestamp())

        file_size = file_info.get("size")

        return FileDataSourceMetadata(
            date_created=date_created,
            date_modified=date_modified,
            date_processed=str(time()),
            url=f"{self.index_config.protocol}://{path}",
            record_locator={
                "protocol": self.index_config.protocol,
                "remote_file_path": self.index_config.remote_url,
            },
            filesize_bytes=file_size,
        )
133
+
134
+
135
class SftpDownloaderConfig(FsspecDownloaderConfig):
    """Downloader settings for the SFTP connector."""

    # Required: declared without a default.
    remote_url: str = Field(
        description="Remote fsspec URL formatted as `protocol://dir/path`",
    )
137
+
138
+
139
@dataclass
class SftpDownloader(FsspecDownloader):
    """Downloader for the SFTP connector."""

    protocol: str = "sftp"
    connection_config: SftpConnectionConfig
    # Plain constant default, matching SftpIndexer and SftpUploader. The stdlib
    # @dataclass decorator does not interpret pydantic's Field(), so using it
    # here made the attribute default a FieldInfo object instead of "sftp".
    connector_type: str = CONNECTOR_TYPE
    # NOTE(review): SftpDownloaderConfig declares remote_url without a default,
    # so this default_factory presumably cannot build a config by itself - confirm.
    download_config: Optional[SftpDownloaderConfig] = field(default_factory=SftpDownloaderConfig)

    def __post_init__(self):
        """Let host/port found in the download URL override the connection config."""
        parsed_url = urlparse(self.download_config.remote_url)
        self.connection_config.host = parsed_url.hostname or self.connection_config.host
        self.connection_config.port = parsed_url.port or self.connection_config.port
150
+
151
+
152
class SftpUploaderConfig(FsspecUploaderConfig):
    """Uploader settings for the SFTP connector; adds no fields to the fsspec base."""
154
+
155
+
156
@dataclass
class SftpUploader(FsspecUploader):
    """Uploader for the SFTP connector; only field declarations differ from the base."""

    connector_type: str = CONNECTOR_TYPE
    connection_config: SftpConnectionConfig
    # Defaults to None despite the non-optional annotation.
    upload_config: SftpUploaderConfig = None
161
+
162
+
163
# Source-side registration: bundles the SFTP indexer, downloader and their configs.
sftp_source_entry = SourceRegistryEntry(
    connection_config=SftpConnectionConfig,
    indexer_config=SftpIndexerConfig,
    indexer=SftpIndexer,
    downloader_config=SftpDownloaderConfig,
    downloader=SftpDownloader,
)
170
+
171
# Destination-side registration: SFTP uploader plus the blob-store upload stager.
sftp_destination_entry = DestinationRegistryEntry(
    connection_config=SftpConnectionConfig,
    uploader_config=SftpUploaderConfig,
    uploader=SftpUploader,
    upload_stager_config=BlobStoreUploadStagerConfig,
    upload_stager=BlobStoreUploadStager,
)
@@ -0,0 +1,17 @@
1
+ import json
2
+ from datetime import datetime
3
+ from pathlib import Path
4
+ from typing import Callable
5
+
6
+
7
def json_serial(obj):
    """Fallback encoder for ``json.dumps`` covering paths and datetimes.

    Returns the POSIX string form of a ``Path`` or the ISO-8601 string of a
    ``datetime``.

    Raises:
        TypeError: if ``obj`` is of any other type.
    """
    if isinstance(obj, datetime):
        return obj.isoformat()
    if isinstance(obj, Path):
        return obj.as_posix()
    raise TypeError(f"Type {type(obj)} not serializable")
13
+
14
+
15
def sterilize_dict(data: dict, default: Callable = json_serial) -> dict:
    """Round-trip ``data`` through JSON and return the decoded result.

    ``default`` is handed to ``json.dumps`` to encode values it cannot
    serialize on its own.
    """
    return json.loads(json.dumps(data, default=default))
@@ -0,0 +1,226 @@
1
+ from dataclasses import dataclass, field
2
+ from pathlib import Path
3
+ from time import time
4
+ from typing import TYPE_CHECKING, Any, Generator, Optional
5
+ from urllib.parse import urlparse
6
+ from uuid import NAMESPACE_DNS, uuid5
7
+
8
+ from pydantic import Field, Secret, field_validator
9
+
10
+ from unstructured_ingest.data_types.file_data import (
11
+ FileData,
12
+ FileDataSourceMetadata,
13
+ SourceIdentifiers,
14
+ )
15
+ from unstructured_ingest.error import (
16
+ ProviderError,
17
+ UnstructuredIngestError,
18
+ UserAuthError,
19
+ UserError,
20
+ )
21
+ from unstructured_ingest.interfaces import (
22
+ AccessConfig,
23
+ ConnectionConfig,
24
+ Downloader,
25
+ DownloaderConfig,
26
+ Indexer,
27
+ IndexerConfig,
28
+ download_responses,
29
+ )
30
+ from unstructured_ingest.logger import logger
31
+ from unstructured_ingest.processes.connector_registry import (
32
+ SourceRegistryEntry,
33
+ )
34
+ from unstructured_ingest.utils.dep_check import requires_dependencies
35
+
36
+ if TYPE_CHECKING:
37
+ from github import ContentFile, GitTreeElement, Repository
38
+ from github import Github as GithubClient
39
+ from github.GithubException import GithubException
40
+ from requests import HTTPError
41
+
42
# Connector identifier used as the default ``connector_type`` of the Github
# indexer and downloader defined below.
CONNECTOR_TYPE = "github"
43
+
44
+
45
class GithubAccessConfig(AccessConfig):
    # Credentials for the Github connector; wrapped in ``Secret`` by
    # GithubConnectionConfig.
    access_token: str = Field(description="Github access token")
47
+
48
+
49
class GithubConnectionConfig(ConnectionConfig):
    # Connection settings for a single Github repository, plus helpers that
    # build an API client and translate Github/HTTP failures into ingest errors.
    access_config: Secret[GithubAccessConfig]
    url: str = Field(description="Github url or repository owner/name pair")

    @field_validator("url", mode="after")
    def conform_url(cls, value: str):
        """Reduce a full Github URL or an ``owner/name`` pair to ``owner/name``.

        Only the path component of the parsed value is kept. Surrounding
        slashes are removed so that a full URL (whose path starts with ``/``)
        and a bare ``owner/name`` pair normalize to the same string; otherwise
        the leading slash leaks into ``get_full_url`` and ``get_repo``.
        """
        parsed_url = urlparse(value)
        return parsed_url.path.strip("/")

    def get_full_url(self):
        """Return the browser URL of the repository on github.com."""
        return f"https://github.com/{self.url}"

    @requires_dependencies(["github"], extras="github")
    def get_client(self) -> "GithubClient":
        """Create a new Github API client authenticated with the access token."""
        from github import Github as GithubClient

        return GithubClient(login_or_token=self.access_config.get_secret_value().access_token)

    def get_repo(self) -> "Repository":
        """Look up the configured repository using a freshly created client."""
        client = self.get_client()
        return client.get_repo(self.url)

    def wrap_github_exception(self, e: "GithubException") -> Exception:
        """Map a ``GithubException`` to an ingest error based on its status.

        Returns:
            ``UserAuthError`` for 401, ``UserError`` for other 4xx codes,
            ``ProviderError`` for 5xx codes, and the original exception for
            anything else.
        """
        data = e.data
        status_code = e.status
        # The payload is not guaranteed to be a dict; fall back to the
        # exception text so building the wrapped error cannot itself fail.
        message = data.get("message") if isinstance(data, dict) else str(e)
        if status_code == 401:
            return UserAuthError(f"Unauthorized access to Github: {message}")
        if 400 <= status_code < 500:
            return UserError(message)
        # ``>=`` so that a plain 500 is reported as a provider failure too.
        if status_code >= 500:
            return ProviderError(message)
        logger.debug(f"unhandled github error: {e}")
        return e

    def wrap_http_error(self, e: "HTTPError") -> Exception:
        """Map a ``requests`` ``HTTPError`` to an ingest error based on its status.

        Returns:
            ``UserAuthError`` for 401, ``UserError`` for other 4xx codes,
            ``ProviderError`` for 5xx codes, and ``UnstructuredIngestError``
            for anything else.
        """
        status_code = e.response.status_code
        if status_code == 401:
            return UserAuthError(f"Unauthorized access to Github: {e.response.text}")
        if 400 <= status_code < 500:
            return UserError(e.response.text)
        # ``>=`` so that a plain 500 is reported as a provider failure too.
        if status_code >= 500:
            return ProviderError(e.response.text)
        logger.debug(f"unhandled http error: {e}")
        return UnstructuredIngestError(str(e))

    # Both optional packages are imported in the body, so both are declared.
    @requires_dependencies(["github", "requests"], extras="github")
    def wrap_error(self, e: Exception) -> Exception:
        """Translate an arbitrary exception into the matching ingest error.

        Github and HTTP errors are dispatched to their dedicated wrappers;
        every other exception becomes an ``UnstructuredIngestError``.
        """
        from github.GithubException import GithubException
        from requests import HTTPError

        if isinstance(e, GithubException):
            return self.wrap_github_exception(e=e)
        if isinstance(e, HTTPError):
            return self.wrap_http_error(e=e)
        logger.debug(f"unhandled error: {e}")
        return UnstructuredIngestError(str(e))
106
+
107
+
108
class GithubIndexerConfig(IndexerConfig):
    # Options selecting which branch is indexed and whether the tree is
    # walked recursively.
    branch: Optional[str] = Field(
        default=None,
        description="Branch to index, use the default if one isn't provided",
    )
    recursive: bool = Field(
        default=True,
        description="Recursively index all files in the repository",
    )
115
+
116
+
117
@dataclass
class GithubIndexer(Indexer):
    """List the files of a Github repository branch and emit them as ``FileData``."""

    connection_config: GithubConnectionConfig
    index_config: GithubIndexerConfig = field(default_factory=GithubIndexerConfig)
    connector_type: str = CONNECTOR_TYPE

    def precheck(self) -> None:
        """Check that the repository can be fetched.

        Raises:
            Exception: whatever ``connection_config.wrap_error`` returns for
                the failure encountered while fetching the repository.
        """
        try:
            self.connection_config.get_repo()
        except Exception as e:
            raise self.connection_config.wrap_error(e=e)

    def get_branch(self) -> str:
        """Return the configured branch, or the repository's default branch.

        The repository is only fetched when no branch is configured, so a
        configured branch costs no API round-trip.
        """
        configured = self.index_config.branch
        if configured:
            return configured
        return self.connection_config.get_repo().default_branch

    def list_files(self) -> list["GitTreeElement"]:
        """Return the git tree entries of the branch that have a non-zero size.

        Entries whose ``size`` is ``None`` or ``0`` are left out.
        """
        repo = self.connection_config.get_repo()
        sha = self.index_config.branch or repo.default_branch
        git_tree = repo.get_git_tree(sha, recursive=self.index_config.recursive)
        return [
            element for element in git_tree.tree if element.size is not None and element.size > 0
        ]

    def convert_element(
        self, element: "GitTreeElement", branch: Optional[str] = None
    ) -> FileData:
        """Build the ``FileData`` record describing one git tree entry.

        Args:
            element: the tree entry to describe.
            branch: branch name used in the display URL and ``fullpath``.
                When omitted it is resolved through ``get_branch``; callers
                converting many elements should resolve it once and pass it in.
        """
        # Resolve the branch a single time; it feeds both the URL and fullpath.
        branch = branch or self.get_branch()
        full_path = f"{self.connection_config.get_full_url()}/blob/{branch}/{element.path}"

        # NOTE(review): the modification time is presumably optional on a tree
        # entry -- confirm; guard so its absence does not abort indexing.
        modified = element.last_modified_datetime
        date_modified = str(modified.timestamp()) if modified is not None else None

        return FileData(
            # Deterministic identifier derived from the display URL.
            identifier=str(uuid5(NAMESPACE_DNS, full_path)),
            connector_type=self.connector_type,
            display_name=full_path,
            source_identifiers=SourceIdentifiers(
                filename=Path(element.path).name,
                fullpath=(Path(branch) / element.path).as_posix(),
                rel_path=element.path,
            ),
            metadata=FileDataSourceMetadata(
                url=element.url,
                version=element.etag,
                record_locator={},
                date_modified=date_modified,
                date_processed=str(time()),
                filesize_bytes=element.size,
                permissions_data=[{"mode": element.mode}],
            ),
        )

    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
        """Yield one ``FileData`` per file returned by ``list_files``."""
        # Resolve the branch once instead of once (or twice) per file.
        branch = self.get_branch()
        for element in self.list_files():
            yield self.convert_element(element=element, branch=branch)
171
+
172
+
173
class GithubDownloaderConfig(DownloaderConfig):
    # Adds nothing to the inherited downloader settings; exists so the Github
    # connector has its own config type.
    ...
175
+
176
+
177
@dataclass
class GithubDownloader(Downloader):
    """Download the contents of an indexed Github file to local disk."""

    download_config: GithubDownloaderConfig
    connection_config: GithubConnectionConfig
    connector_type: str = CONNECTOR_TYPE

    @requires_dependencies(["github"], extras="github")
    def get_file(self, file_data: FileData) -> "ContentFile":
        """Fetch the repository content object for ``file_data``.

        Raises:
            UserError: if the path does not exist in the repository.
        """
        from github.GithubException import UnknownObjectException

        path = file_data.source_identifiers.relative_path
        repo = self.connection_config.get_repo()

        # NOTE(review): no ``ref`` is passed, so this presumably reads from the
        # repository's default branch even when the indexer was configured
        # with a different one -- confirm.
        try:
            content_file = repo.get_contents(path)
        except UnknownObjectException as e:
            logger.error(f"File doesn't exists {self.connection_config.url}/{path}: {e}")
            raise UserError(f"File not found: {path}") from e
        return content_file

    @requires_dependencies(["requests"], extras="github")
    def get_contents(self, content_file: "ContentFile") -> bytes:
        """Return the raw bytes of ``content_file``.

        Uses the inline decoded content when it is non-empty; otherwise the
        file is fetched from its ``download_url``.

        Raises:
            Exception: the wrapped ingest error when the HTTP download fails.
        """
        import requests

        # Read the property once so the content is not decoded twice.
        decoded = content_file.decoded_content
        if decoded:
            return decoded
        resp = requests.get(content_file.download_url)
        try:
            resp.raise_for_status()
        except requests.HTTPError as e:
            raise self.connection_config.wrap_error(e=e) from e
        return resp.content

    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
        """Download one file and return the generated download response."""
        content_file = self.get_file(file_data)
        contents = self.get_contents(content_file)
        download_path = self.get_download_path(file_data)
        # Files in nested repository folders need their local directory to
        # exist; this is a no-op when it is already there.
        download_path.parent.mkdir(parents=True, exist_ok=True)
        with download_path.open("wb") as f:
            f.write(contents)
        return self.generate_download_response(file_data=file_data, download_path=download_path)
218
+
219
+
220
# Source-side registration for Github: pairs the indexer and downloader
# classes with their configuration models.
github_source_entry = SourceRegistryEntry(
    connection_config=GithubConnectionConfig,
    indexer_config=GithubIndexerConfig,
    indexer=GithubIndexer,
    downloader_config=GithubDownloaderConfig,
    downloader=GithubDownloader,
)