unstructured-ingest 1.2.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (243) hide show
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/README.md +28 -0
  4. unstructured_ingest/cli/__init__.py +0 -0
  5. unstructured_ingest/cli/base/__init__.py +4 -0
  6. unstructured_ingest/cli/base/cmd.py +269 -0
  7. unstructured_ingest/cli/base/dest.py +84 -0
  8. unstructured_ingest/cli/base/importer.py +34 -0
  9. unstructured_ingest/cli/base/src.py +75 -0
  10. unstructured_ingest/cli/cli.py +24 -0
  11. unstructured_ingest/cli/cmds.py +14 -0
  12. unstructured_ingest/cli/utils/__init__.py +0 -0
  13. unstructured_ingest/cli/utils/click.py +237 -0
  14. unstructured_ingest/cli/utils/model_conversion.py +222 -0
  15. unstructured_ingest/data_types/__init__.py +0 -0
  16. unstructured_ingest/data_types/entities.py +17 -0
  17. unstructured_ingest/data_types/file_data.py +116 -0
  18. unstructured_ingest/embed/__init__.py +0 -0
  19. unstructured_ingest/embed/azure_openai.py +63 -0
  20. unstructured_ingest/embed/bedrock.py +323 -0
  21. unstructured_ingest/embed/huggingface.py +69 -0
  22. unstructured_ingest/embed/interfaces.py +146 -0
  23. unstructured_ingest/embed/mixedbreadai.py +134 -0
  24. unstructured_ingest/embed/octoai.py +133 -0
  25. unstructured_ingest/embed/openai.py +142 -0
  26. unstructured_ingest/embed/togetherai.py +116 -0
  27. unstructured_ingest/embed/vertexai.py +109 -0
  28. unstructured_ingest/embed/voyageai.py +130 -0
  29. unstructured_ingest/error.py +156 -0
  30. unstructured_ingest/errors_v2.py +156 -0
  31. unstructured_ingest/interfaces/__init__.py +27 -0
  32. unstructured_ingest/interfaces/connector.py +56 -0
  33. unstructured_ingest/interfaces/downloader.py +90 -0
  34. unstructured_ingest/interfaces/indexer.py +29 -0
  35. unstructured_ingest/interfaces/process.py +22 -0
  36. unstructured_ingest/interfaces/processor.py +88 -0
  37. unstructured_ingest/interfaces/upload_stager.py +89 -0
  38. unstructured_ingest/interfaces/uploader.py +67 -0
  39. unstructured_ingest/logger.py +39 -0
  40. unstructured_ingest/main.py +11 -0
  41. unstructured_ingest/otel.py +128 -0
  42. unstructured_ingest/pipeline/__init__.py +0 -0
  43. unstructured_ingest/pipeline/interfaces.py +211 -0
  44. unstructured_ingest/pipeline/otel.py +32 -0
  45. unstructured_ingest/pipeline/pipeline.py +408 -0
  46. unstructured_ingest/pipeline/steps/__init__.py +0 -0
  47. unstructured_ingest/pipeline/steps/chunk.py +78 -0
  48. unstructured_ingest/pipeline/steps/download.py +206 -0
  49. unstructured_ingest/pipeline/steps/embed.py +77 -0
  50. unstructured_ingest/pipeline/steps/filter.py +35 -0
  51. unstructured_ingest/pipeline/steps/index.py +86 -0
  52. unstructured_ingest/pipeline/steps/partition.py +77 -0
  53. unstructured_ingest/pipeline/steps/stage.py +65 -0
  54. unstructured_ingest/pipeline/steps/uncompress.py +50 -0
  55. unstructured_ingest/pipeline/steps/upload.py +58 -0
  56. unstructured_ingest/processes/__init__.py +18 -0
  57. unstructured_ingest/processes/chunker.py +131 -0
  58. unstructured_ingest/processes/connector_registry.py +69 -0
  59. unstructured_ingest/processes/connectors/__init__.py +129 -0
  60. unstructured_ingest/processes/connectors/airtable.py +238 -0
  61. unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
  62. unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
  63. unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
  64. unstructured_ingest/processes/connectors/astradb.py +592 -0
  65. unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
  66. unstructured_ingest/processes/connectors/chroma.py +193 -0
  67. unstructured_ingest/processes/connectors/confluence.py +527 -0
  68. unstructured_ingest/processes/connectors/couchbase.py +336 -0
  69. unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
  70. unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
  71. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
  72. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
  73. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
  74. unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
  75. unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
  76. unstructured_ingest/processes/connectors/delta_table.py +310 -0
  77. unstructured_ingest/processes/connectors/discord.py +161 -0
  78. unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
  79. unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
  80. unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
  81. unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
  82. unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
  83. unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
  84. unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
  85. unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
  86. unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
  87. unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
  88. unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
  89. unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
  90. unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
  91. unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
  92. unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
  93. unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
  94. unstructured_ingest/processes/connectors/github.py +226 -0
  95. unstructured_ingest/processes/connectors/gitlab.py +270 -0
  96. unstructured_ingest/processes/connectors/google_drive.py +848 -0
  97. unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
  98. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
  99. unstructured_ingest/processes/connectors/jira.py +522 -0
  100. unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
  101. unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
  102. unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
  103. unstructured_ingest/processes/connectors/kafka/local.py +103 -0
  104. unstructured_ingest/processes/connectors/kdbai.py +156 -0
  105. unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
  106. unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
  107. unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
  108. unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
  109. unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
  110. unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
  111. unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
  112. unstructured_ingest/processes/connectors/local.py +227 -0
  113. unstructured_ingest/processes/connectors/milvus.py +311 -0
  114. unstructured_ingest/processes/connectors/mongodb.py +389 -0
  115. unstructured_ingest/processes/connectors/neo4j.py +534 -0
  116. unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
  117. unstructured_ingest/processes/connectors/notion/client.py +349 -0
  118. unstructured_ingest/processes/connectors/notion/connector.py +350 -0
  119. unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
  120. unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
  121. unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
  122. unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
  123. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  124. unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
  125. unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
  126. unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
  127. unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
  128. unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  129. unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  130. unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  131. unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
  132. unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
  133. unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
  134. unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
  135. unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
  136. unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
  137. unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
  138. unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
  139. unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
  140. unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
  141. unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
  142. unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  143. unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  144. unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  145. unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  146. unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
  147. unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
  148. unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
  149. unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
  150. unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  151. unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
  152. unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
  153. unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
  154. unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  155. unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
  156. unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
  157. unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
  158. unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
  159. unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
  160. unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
  161. unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
  162. unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
  163. unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
  164. unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
  165. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  166. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
  167. unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
  168. unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
  169. unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
  170. unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
  171. unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
  172. unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
  173. unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
  174. unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
  175. unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
  176. unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
  177. unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
  178. unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
  179. unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
  180. unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
  181. unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
  182. unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
  183. unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
  184. unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
  185. unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
  186. unstructured_ingest/processes/connectors/onedrive.py +485 -0
  187. unstructured_ingest/processes/connectors/outlook.py +242 -0
  188. unstructured_ingest/processes/connectors/pinecone.py +400 -0
  189. unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
  190. unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
  191. unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
  192. unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
  193. unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
  194. unstructured_ingest/processes/connectors/redisdb.py +214 -0
  195. unstructured_ingest/processes/connectors/salesforce.py +307 -0
  196. unstructured_ingest/processes/connectors/sharepoint.py +282 -0
  197. unstructured_ingest/processes/connectors/slack.py +249 -0
  198. unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
  199. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
  200. unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
  201. unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
  202. unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
  203. unstructured_ingest/processes/connectors/sql/sql.py +456 -0
  204. unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
  205. unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
  206. unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
  207. unstructured_ingest/processes/connectors/utils.py +60 -0
  208. unstructured_ingest/processes/connectors/vectara.py +348 -0
  209. unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
  210. unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
  211. unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
  212. unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
  213. unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
  214. unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
  215. unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
  216. unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
  217. unstructured_ingest/processes/embedder.py +203 -0
  218. unstructured_ingest/processes/filter.py +60 -0
  219. unstructured_ingest/processes/partitioner.py +233 -0
  220. unstructured_ingest/processes/uncompress.py +61 -0
  221. unstructured_ingest/processes/utils/__init__.py +8 -0
  222. unstructured_ingest/processes/utils/blob_storage.py +32 -0
  223. unstructured_ingest/processes/utils/logging/connector.py +365 -0
  224. unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
  225. unstructured_ingest/unstructured_api.py +140 -0
  226. unstructured_ingest/utils/__init__.py +5 -0
  227. unstructured_ingest/utils/chunking.py +56 -0
  228. unstructured_ingest/utils/compression.py +72 -0
  229. unstructured_ingest/utils/constants.py +2 -0
  230. unstructured_ingest/utils/data_prep.py +216 -0
  231. unstructured_ingest/utils/dep_check.py +78 -0
  232. unstructured_ingest/utils/filesystem.py +27 -0
  233. unstructured_ingest/utils/html.py +174 -0
  234. unstructured_ingest/utils/ndjson.py +52 -0
  235. unstructured_ingest/utils/pydantic_models.py +52 -0
  236. unstructured_ingest/utils/string_and_date_utils.py +74 -0
  237. unstructured_ingest/utils/table.py +80 -0
  238. unstructured_ingest/utils/tls.py +15 -0
  239. unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
  240. unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
  241. unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
  242. unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
  243. unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
@@ -0,0 +1,242 @@
1
+ import hashlib
2
+ import time
3
+ from dataclasses import dataclass, field
4
+ from datetime import timezone
5
+ from pathlib import Path
6
+ from typing import TYPE_CHECKING, Any, Coroutine, Generator
7
+
8
+ from pydantic import Field, Secret
9
+
10
+ from unstructured_ingest.data_types.file_data import (
11
+ FileData,
12
+ FileDataSourceMetadata,
13
+ SourceIdentifiers,
14
+ )
15
+ from unstructured_ingest.error import SourceConnectionError, ValueError
16
+ from unstructured_ingest.interfaces import (
17
+ AccessConfig,
18
+ ConnectionConfig,
19
+ Downloader,
20
+ DownloaderConfig,
21
+ DownloadResponse,
22
+ Indexer,
23
+ IndexerConfig,
24
+ )
25
+ from unstructured_ingest.logger import logger
26
+ from unstructured_ingest.processes.connector_registry import SourceRegistryEntry
27
+ from unstructured_ingest.utils.dep_check import requires_dependencies
28
+
29
+ MAX_EMAILS_PER_FOLDER = 1_000_000 # Maximum number of emails per folder
30
+
31
+ if TYPE_CHECKING:
32
+ from office365.graph_client import GraphClient
33
+ from office365.outlook.mail.folders.folder import MailFolder
34
+ from office365.outlook.mail.messages.message import Message
35
+
36
+
37
+ CONNECTOR_TYPE = "outlook"
38
+
39
+
40
class OutlookAccessConfig(AccessConfig):
    """Sensitive credentials for the Outlook connector."""

    # Populated from the "client_cred" alias.
    client_credential: str = Field(
        alias="client_cred",
        description="Azure AD App client secret",
    )
42
+
43
+
44
class OutlookConnectionConfig(ConnectionConfig):
    """Connection settings used to authenticate against Microsoft Graph for Outlook."""

    access_config: Secret[OutlookAccessConfig]
    client_id: str = Field(description="Azure AD App client ID")
    tenant: str = Field(
        default="common", description="ID or domain name associated with your Azure AD instance"
    )
    authority_url: str = Field(
        default="https://login.microsoftonline.com",
        description="Authentication token provider for Microsoft apps",
    )

    @requires_dependencies(["msal"], extras="outlook")
    def _acquire_token(self):
        """Acquire a client-credentials token via MSAL.

        Returns:
            The token response returned by
            ``ConfidentialClientApplication.acquire_token_for_client``.

        Raises:
            SourceConnectionError: If the response carries an ``error`` key, i.e. the
                identity provider rejected the request.
        """
        from msal import ConfidentialClientApplication

        # NOTE: It'd be nice to use `msal.authority.AuthorityBuilder` here paired with AZURE_PUBLIC
        # constant as default in the future but they do not fit well with `authority_url` right now
        authority_url = f"{self.authority_url.rstrip('/')}/{self.tenant}"
        app = ConfidentialClientApplication(
            authority=authority_url,
            client_id=self.client_id,
            client_credential=self.access_config.get_secret_value().client_credential,
        )
        token = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])
        # MSAL reports failures in the returned dict instead of raising; surface them here
        # rather than handing a token-less response to the Graph client.
        if "error" in token:
            raise SourceConnectionError(
                "failed to fetch token, {}: {}".format(
                    token["error"], token.get("error_description", "no description provided")
                )
            )
        return token

    @requires_dependencies(["office365"], extras="outlook")
    @SourceConnectionError.wrap
    def get_client(self) -> "GraphClient":
        """Build a ``GraphClient`` that obtains its token through ``_acquire_token``."""
        from office365.graph_client import GraphClient

        # The bound method is passed as a callable, not called here.
        return GraphClient(self._acquire_token)
77
+
78
+
79
class OutlookIndexerConfig(IndexerConfig):
    """Options selecting which mailbox and folders the Outlook indexer reads."""

    outlook_folders: list[str] = Field(
        description=(
            "Folders to download email messages from. Do not specify subfolders. "
            "Use quotes if there are spaces in folder names."
        )
    )
    recursive: bool = Field(
        description=(
            "Recursively download files in their respective folders otherwise stop at the"
            " files in provided folder level."
        ),
        default=False,
    )
    user_email: str = Field(description="Outlook email to download messages from.")
90
+
91
+
92
@dataclass
class OutlookIndexer(Indexer):
    """Indexer that lists messages in the configured Outlook folders of one user."""

    index_config: OutlookIndexerConfig
    connection_config: OutlookConnectionConfig
    connector_type: str = CONNECTOR_TYPE

    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
        """Yield one ``FileData`` per message found in the selected folders."""
        messages = self._list_messages(recursive=self.index_config.recursive)
        yield from (self._message_to_file_data(message) for message in messages)

    def run_async(self, **kwargs: Any) -> Coroutine[Any, Any, Any]:
        """Async indexing is not supported by this connector."""
        raise NotImplementedError

    @SourceConnectionError.wrap
    def precheck(self) -> None:
        """Verify connectivity by fetching the configured user from the Graph client."""
        client = self.connection_config.get_client()
        client.users[self.index_config.user_email].get().execute_query()

    def is_async(self) -> bool:
        return False

    def _list_messages(self, recursive: bool) -> list["Message"]:
        """Collect messages from the selected root folders.

        Args:
            recursive: When true, child folders of every visited folder are visited too.

        Returns:
            All messages gathered, requesting at most ``MAX_EMAILS_PER_FOLDER`` per folder.
        """
        # Used as a stack: folders are popped from the end, children pushed back on.
        pending_folders = self._get_selected_root_folders()
        messages: list["Message"] = []

        while pending_folders:
            mail_folder = pending_folders.pop()
            messages.extend(mail_folder.messages.get().top(MAX_EMAILS_PER_FOLDER).execute_query())

            if recursive:
                pending_folders.extend(mail_folder.child_folders.get().execute_query())

        return messages

    def _get_selected_root_folders(self) -> list["MailFolder"]:
        """Return the user's root mail folders whose names match the configuration.

        Names are compared case-insensitively.

        Raises:
            ValueError: If none of the configured folder names matches a root folder.
        """
        client_user = self.connection_config.get_client().users[self.index_config.user_email]
        root_mail_folders = client_user.mail_folders.get().execute_query()

        selected_names_normalized = {
            folder_name.lower() for folder_name in self.index_config.outlook_folders
        }
        selected_root_mail_folders = [
            folder
            for folder in root_mail_folders
            if folder.display_name.lower() in selected_names_normalized
        ]

        if not selected_root_mail_folders:
            logger.error(
                f"Root folders selected in configuration: {self.index_config.outlook_folders} "
                f"not found for user email {self.index_config.user_email}. Aborting."
            )
            raise ValueError("Root folders selected in configuration not found.")

        return selected_root_mail_folders

    def _message_to_file_data(self, message: "Message") -> FileData:
        """Translate a Graph ``Message`` into the ``FileData`` handed to the downloader."""
        fullpath = self._generate_fullpath(message)
        source_identifiers = SourceIdentifiers(filename=fullpath.name, fullpath=str(fullpath))

        # Timestamps are stamped as UTC before being converted to epoch seconds.
        modified_at = message.last_modified_datetime.replace(tzinfo=timezone.utc).timestamp()
        created_at = message.created_datetime.replace(tzinfo=timezone.utc).timestamp()

        return FileData(
            identifier=message.id,
            connector_type=CONNECTOR_TYPE,
            source_identifiers=source_identifiers,
            metadata=FileDataSourceMetadata(
                url=message.resource_url,
                version=message.change_key,
                date_modified=str(modified_at),
                date_created=str(created_at),
                date_processed=str(time.time()),
                # The downloader needs both keys to fetch the message again.
                record_locator={
                    "message_id": message.id,
                    "user_email": self.index_config.user_email,
                },
            ),
            additional_metadata={
                "sent_from": str(message.sent_from),
                "to_recipients": [str(recipient) for recipient in message.to_recipients],
                # Fixed: this previously repeated the "to" recipients.
                "bcc_recipients": [str(recipient) for recipient in message.bcc_recipients],
                "subject": message.subject,
                "conversation_id": message.conversation_id,
                "is_draft": message.is_draft,
                "is_read": message.is_read,
                "has_attachments": message.has_attachments,
                "importance": message.importance,
            },
            display_name=source_identifiers.fullpath,
        )

    def _generate_fullpath(self, message: "Message") -> Path:
        """Derive a file name from the first 16 hex chars of the SHA-256 of the message id."""
        digest = hashlib.sha256(message.id.encode("utf-8")).hexdigest()
        return Path(digest[:16] + ".eml")
186
+
187
+
188
class OutlookDownloaderConfig(DownloaderConfig):
    """Downloader options for Outlook; adds nothing beyond ``DownloaderConfig``."""
190
+
191
+
192
@dataclass
class OutlookDownloader(Downloader):
    """Downloader that writes one Outlook message to its local download path."""

    connector_type: str = CONNECTOR_TYPE
    connection_config: OutlookConnectionConfig
    download_config: OutlookDownloaderConfig = field(default_factory=OutlookDownloaderConfig)

    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        """Download the message described by ``file_data``.

        Raises:
            ValueError: If no download path could be generated for ``file_data``.
        """
        # NOTE: Indexer should provide source identifiers required to generate the download path
        download_path = self.get_download_path(file_data)
        if download_path is None:
            logger.error(
                "Generated download path is None, source_identifiers might be missing "
                "from FileData."
            )
            raise ValueError("Generated invalid download path.")

        self._download_message(file_data, download_path)
        return self.generate_download_response(file_data, download_path)

    def is_async(self) -> bool:
        return False

    def _download_message(self, file_data: FileData, download_path: Path) -> None:
        """Fetch the message named by the record locator and write it to ``download_path``.

        Raises:
            ValueError: If the record locator is missing or lacks ``user_email`` or
                ``message_id``.
        """
        # NOTE: Indexer should supply the record locator in metadata
        record_locator = file_data.metadata.record_locator
        if (
            record_locator is None
            or "user_email" not in record_locator
            or "message_id" not in record_locator
        ):
            logger.error(
                f"Invalid record locator in metadata: {record_locator}. "
                "Keys 'user_email' and 'message_id' must be present."
            )
            raise ValueError("Invalid record locator.")

        user_email = record_locator["user_email"]
        message_id = record_locator["message_id"]

        message = self.connection_config.get_client().users[user_email].messages[message_id]
        # Create missing parent directories before opening the target file.
        download_path.parent.mkdir(exist_ok=True, parents=True)

        with open(download_path, "wb") as file:
            message.download(file).execute_query()
234
+
235
+
236
# Bundles the Outlook connector's classes into a single source registry entry.
outlook_source_entry = SourceRegistryEntry(
    connection_config=OutlookConnectionConfig,
    indexer_config=OutlookIndexerConfig,
    indexer=OutlookIndexer,
    downloader_config=OutlookDownloaderConfig,
    downloader=OutlookDownloader,
)
@@ -0,0 +1,400 @@
1
+ import json
2
+ import re
3
+ from dataclasses import dataclass, field
4
+ from pathlib import Path
5
+ from typing import TYPE_CHECKING, Any, Literal, Optional
6
+
7
+ from pydantic import Field, Secret
8
+
9
+ from unstructured_ingest.data_types.file_data import FileData
10
+ from unstructured_ingest.error import (
11
+ DestinationConnectionError,
12
+ NotFoundError,
13
+ UnstructuredIngestError,
14
+ UserError,
15
+ )
16
+ from unstructured_ingest.interfaces import (
17
+ AccessConfig,
18
+ ConnectionConfig,
19
+ UploaderConfig,
20
+ UploadStager,
21
+ UploadStagerConfig,
22
+ VectorDBUploader,
23
+ )
24
+ from unstructured_ingest.logger import logger
25
+ from unstructured_ingest.processes.connector_registry import DestinationRegistryEntry
26
+ from unstructured_ingest.utils import ndjson
27
+ from unstructured_ingest.utils.constants import RECORD_ID_LABEL
28
+ from unstructured_ingest.utils.data_prep import (
29
+ flatten_dict,
30
+ generator_batching_wbytes,
31
+ get_enhanced_element_id,
32
+ get_json_data,
33
+ write_data,
34
+ )
35
+ from unstructured_ingest.utils.dep_check import requires_dependencies
36
+
37
+ if TYPE_CHECKING:
38
+ from pinecone import Index as PineconeIndex
39
+ from pinecone import Pinecone
40
+
41
+
42
+ CONNECTOR_TYPE = "pinecone"
43
+ MAX_PAYLOAD_SIZE = 2 * 1024 * 1024 # 2MB
44
+ MAX_POOL_THREADS = 100
45
+ MAX_METADATA_BYTES = 40960 # 40KB https://docs.pinecone.io/reference/quotas-and-limits#hard-limits
46
+ MAX_QUERY_RESULTS = 10000
47
+
48
+
49
class PineconeAccessConfig(AccessConfig):
    """Sensitive credentials for the Pinecone connector."""

    # Populated from the "api_key" alias; defaults to None when not supplied.
    pinecone_api_key: Optional[str] = Field(
        alias="api_key",
        description="API key for Pinecone.",
        default=None,
    )
53
+
54
+
55
class PineconeConnectionConfig(ConnectionConfig):
    """Connection settings for a Pinecone index."""

    index_name: Optional[str] = Field(default=None, description="Name of the index to connect to.")
    access_config: Secret[PineconeAccessConfig] = Field(
        validate_default=True,
        default=PineconeAccessConfig(),
    )

    @requires_dependencies(["pinecone"], extras="pinecone")
    def get_client(self, **index_kwargs) -> "Pinecone":
        """Create a Pinecone client tagged with the running unstructured_ingest version.

        ``index_kwargs`` is accepted but not used when building the client.
        """
        from pinecone import Pinecone

        from unstructured_ingest import __version__ as unstructured_version

        api_key = self.access_config.get_secret_value().pinecone_api_key
        source_tag = f"unstructured_ingest=={unstructured_version}"
        return Pinecone(api_key=api_key, source_tag=source_tag)

    def get_index(self, **index_kwargs) -> "PineconeIndex":
        """Return a handle to the configured index, forwarding ``index_kwargs`` to it."""
        client = self.get_client()
        pinecone_index = client.Index(name=self.index_name, **index_kwargs)

        # The index description is fetched only to be included in the debug log line.
        logger.debug(f"connected to index: {client.describe_index(self.index_name)}")
        return pinecone_index
79
+
80
+
81
# Element/metadata keys used as the default for the stager's `metadata_fields`.
ALLOWED_FIELDS = (
    "element_id", "text", "parent_id",
    "category_depth", "emphasized_text_tags", "emphasized_text_contents",
    "coordinates", "last_modified", "page_number",
    "filename", "is_continuation", "link_urls",
    "link_texts", "text_as_html", "entities",
)
98
+
99
+
100
class PineconeUploadStagerConfig(UploadStagerConfig):
    """Options controlling which element metadata the stager forwards to Pinecone."""

    metadata_fields: list[str] = Field(
        description=(
            "which metadata from the source element to map to the payload metadata being sent to "
            "Pinecone."
        ),
        default=list(ALLOWED_FIELDS),
    )
108
+
109
+
110
class PineconeUploaderConfig(UploaderConfig):
    """Upload options for the Pinecone destination connector."""

    batch_size: Optional[int] = Field(
        description="Optional number of records per batch. Will otherwise limit by size.",
        default=None,
    )
    pool_threads: Optional[int] = Field(
        description="Optional limit on number of threads to use for upload",
        default=1,
    )
    namespace: Optional[str] = Field(
        description="The namespace to write to. If not specified, the default namespace is used",
        default=None,
    )
    record_id_key: str = Field(
        description="searchable key to find entries for the same record on previous runs",
        default=RECORD_ID_LABEL,
    )
126
+
127
+
128
+ @dataclass
129
+ class PineconeUploadStager(UploadStager):
130
+ upload_stager_config: PineconeUploadStagerConfig = field(
131
+ default_factory=lambda: PineconeUploadStagerConfig()
132
+ )
133
+
134
+ def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
135
+ embeddings = element_dict.pop("embeddings", None)
136
+ metadata: dict[str, Any] = element_dict.pop("metadata", {})
137
+ data_source = metadata.pop("data_source", {})
138
+ coordinates = metadata.pop("coordinates", {})
139
+ pinecone_metadata = {}
140
+ for possible_meta in [element_dict, metadata, data_source, coordinates]:
141
+ pinecone_metadata.update(
142
+ {
143
+ k: v
144
+ for k, v in possible_meta.items()
145
+ if k in self.upload_stager_config.metadata_fields
146
+ }
147
+ )
148
+
149
+ metadata = flatten_dict(
150
+ pinecone_metadata,
151
+ separator="-",
152
+ flatten_lists=True,
153
+ remove_none=True,
154
+ )
155
+ metadata_size_bytes = len(json.dumps(metadata).encode())
156
+ if metadata_size_bytes > MAX_METADATA_BYTES:
157
+ logger.info(
158
+ f"Metadata size is {metadata_size_bytes} bytes, which exceeds the limit of"
159
+ f" {MAX_METADATA_BYTES} bytes per vector. Dropping the metadata."
160
+ )
161
+ metadata = {}
162
+
163
+ metadata[RECORD_ID_LABEL] = file_data.identifier
164
+
165
+ # To support more optimal deletes, a prefix is suggested for each record:
166
+ # https://docs.pinecone.io/guides/data/manage-rag-documents#delete-all-records-for-a-parent-document
167
+ return {
168
+ "id": f"{file_data.identifier}#{get_enhanced_element_id(element_dict=element_dict, file_data=file_data)}", # noqa:E501
169
+ "values": embeddings,
170
+ "metadata": metadata,
171
+ }
172
+
173
+ def stream_update(self, input_file: Path, output_file: Path, file_data: FileData) -> None:
174
+ with input_file.open() as in_f:
175
+ reader = ndjson.reader(in_f)
176
+ with output_file.open("w") as out_f:
177
+ writer = ndjson.writer(out_f)
178
+ for element in reader:
179
+ if "embeddings" not in element:
180
+ continue
181
+ conformed_element = self.conform_dict(element_dict=element, file_data=file_data)
182
+ writer.write(row=conformed_element)
183
+ writer.f.flush()
184
+
185
+ def process_whole(self, input_file: Path, output_file: Path, file_data: FileData) -> None:
186
+ elements_contents = get_json_data(path=input_file)
187
+
188
+ conformed_elements = [
189
+ self.conform_dict(element_dict=element, file_data=file_data)
190
+ for element in elements_contents
191
+ if "embeddings" in element
192
+ ]
193
+ write_data(path=output_file, data=conformed_elements)
194
+
195
+
196
@dataclass
class PineconeUploader(VectorDBUploader):
    """Uploader that writes staged records into a Pinecone index.

    For each source file, previously written records for the same file are
    deleted first and the new records are then upserted in batches.
    """

    upload_config: PineconeUploaderConfig
    connection_config: PineconeConnectionConfig
    connector_type: str = CONNECTOR_TYPE

    def init(self, **kwargs: Any) -> None:
        """Ensure the destination index exists; ``kwargs`` go to ``create_destination``."""
        self.create_destination(**kwargs)

    def index_exists(self, index_name: Optional[str]) -> bool:
        """Return whether ``index_name`` (or the configured index) exists.

        Raises:
            DestinationConnectionError: If the lookup fails for any reason other
                than the index not being found.
        """
        from pinecone.exceptions import NotFoundException

        index_name = index_name or self.connection_config.index_name
        pc = self.connection_config.get_client()
        try:
            pc.describe_index(index_name)
        except NotFoundException:
            return False
        except Exception as e:
            logger.error(f"failed to check if pinecone index exists : {e}")
            raise DestinationConnectionError(
                f"failed to check if pinecone index exists : {e}"
            ) from e
        return True

    def precheck(self):
        """Validate the connection and, when configured, that the index exists.

        Raises:
            DestinationConnectionError: On any failure. This includes a missing
                index, because the ``NotFoundError`` raised inside the ``try``
                is caught by the broad handler and re-wrapped.
        """
        try:
            # just a connection check here. not an actual index_exists check
            self.index_exists("just-checking-our-connection")

            if self.connection_config.index_name and not self.index_exists(
                self.connection_config.index_name
            ):
                raise NotFoundError(f"index {self.connection_config.index_name} does not exist")
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}") from e

    def format_destination_name(self, destination_name: str) -> str:
        """Lower-case ``destination_name`` and replace other characters with hyphens.

        Only the character set is normalised here; the length limit mentioned
        below is not enforced by this method.
        """
        # Pinecone naming requirements:
        # can only contain lowercase letters, numbers, and hyphens
        # must be 45 characters or less
        return re.sub(r"[^a-z0-9]", "-", destination_name.lower())

    def create_destination(
        self,
        vector_length: int,
        destination_name: str = "unstructuredautocreated",
        destination_type: Literal["pod", "serverless"] = "serverless",
        serverless_cloud: str = "aws",
        serverless_region: str = "us-east-1",
        pod_environment: str = "us-east1-gcp",
        pod_type: str = "p1.x1",
        pod_count: int = 1,
        **kwargs: Any,
    ) -> bool:
        """Create the index if it does not exist yet.

        The index name is the configured ``connection_config.index_name`` when
        set, otherwise ``destination_name``, passed through
        ``format_destination_name``. The formatted name is stored back on the
        connection config.

        Args:
            vector_length: Dimension of the vectors stored in the index.
            destination_name: Fallback index name when none is configured.
            destination_type: ``"serverless"`` or ``"pod"``.
            serverless_cloud: Cloud passed to ``ServerlessSpec``.
            serverless_region: Region passed to ``ServerlessSpec``.
            pod_environment: Environment passed to ``PodSpec``.
            pod_type: Pod type passed to ``PodSpec``.
            pod_count: Number of pods passed to ``PodSpec``.
            **kwargs: Accepted and ignored.

        Returns:
            ``True`` if an index was created, ``False`` if it already existed.

        Raises:
            ValueError: If ``destination_type`` is not a supported value and the
                index does not exist yet.
        """
        from pinecone import PodSpec, ServerlessSpec

        index_name = self.format_destination_name(
            self.connection_config.index_name or destination_name
        )
        self.connection_config.index_name = index_name

        if self.index_exists(index_name):
            logger.debug(f"index {index_name} already exists, skipping creation")
            return False

        if destination_type == "serverless":
            spec = ServerlessSpec(cloud=serverless_cloud, region=serverless_region)
        elif destination_type == "pod":
            spec = PodSpec(environment=pod_environment, pod_type=pod_type, pods=pod_count)
        else:
            raise ValueError(f"unexpected destination type: {destination_type}")

        logger.info(f"creating pinecone index {index_name}")
        pc = self.connection_config.get_client()
        # Both index kinds must be created under the formatted name, since that
        # is the name stored on the connection config and used for later writes.
        pc.create_index(name=index_name, dimension=vector_length, spec=spec)
        return True

    def pod_delete_by_record_id(self, file_data: FileData) -> None:
        """Delete records whose record-id metadata equals ``file_data.identifier``.

        The delete uses a metadata filter. A ``UserError`` from the delete call
        is logged and not re-raised.
        """
        logger.debug(
            f"deleting any content with metadata "
            f"{self.upload_config.record_id_key}={file_data.identifier} "
            f"from pinecone pod index"
        )
        index = self.connection_config.get_index(pool_threads=MAX_POOL_THREADS)
        delete_kwargs: dict[str, Any] = {
            "filter": {self.upload_config.record_id_key: {"$eq": file_data.identifier}}
        }

        if namespace := self.upload_config.namespace:
            delete_kwargs["namespace"] = namespace
        try:
            index.delete(**delete_kwargs)
        except UserError as e:
            logger.error(f"failed to delete batch of ids: {delete_kwargs} {e}")
        else:
            # Only report success when the delete call did not fail.
            logger.debug(
                f"deleted any content with metadata "
                f"{self.upload_config.record_id_key}={file_data.identifier} "
                f"from pinecone index: {delete_kwargs}"
            )

    def serverless_delete_by_record_id(self, file_data: FileData) -> None:
        """Delete records whose id starts with ``"<file_data.identifier>#"``.

        Ids are listed by prefix and each yielded batch is deleted by id. A
        ``UserError`` on one batch is logged and the remaining batches are
        still processed.
        """
        logger.debug(
            f"deleting any content with metadata "
            f"{self.upload_config.record_id_key}={file_data.identifier} "
            f"from pinecone serverless index"
        )
        index = self.connection_config.get_index(pool_threads=MAX_POOL_THREADS)
        namespace = self.upload_config.namespace

        list_kwargs: dict[str, Any] = {"prefix": f"{file_data.identifier}#"}
        if namespace:
            list_kwargs["namespace"] = namespace

        deleted_ids = 0
        for ids in index.list(**list_kwargs):
            delete_kwargs: dict[str, Any] = {"ids": ids}
            if namespace:
                delete_kwargs["namespace"] = namespace

            try:
                index.delete(**delete_kwargs)
            except UserError as e:
                logger.error(f"failed to delete batch of ids: {delete_kwargs} {e}")
            else:
                # Count a batch only once its delete call has succeeded.
                deleted_ids += len(ids)

        logger.info(
            f"deleted {deleted_ids} records with metadata "
            f"{self.upload_config.record_id_key}={file_data.identifier} "
            f"from pinecone index"
        )

    @requires_dependencies(["pinecone"], extras="pinecone")
    def upsert_batches_async(self, elements_dict: list[dict]):
        """Upsert ``elements_dict`` in size-limited batches using async requests.

        Batches are bounded by ``MAX_PAYLOAD_SIZE - 100`` bytes and, when set,
        by ``upload_config.batch_size`` records. Nothing is sent when there are
        no batches.

        Raises:
            UnstructuredIngestError: If retrieving a result raises
                ``PineconeApiException``.
        """
        from pinecone.exceptions import PineconeApiException

        chunks = list(
            generator_batching_wbytes(
                iterable=elements_dict,
                batch_size_limit_bytes=MAX_PAYLOAD_SIZE - 100,
                max_batch_size=self.upload_config.batch_size,
            )
        )
        logger.info(f"split doc with {len(elements_dict)} elements into {len(chunks)} batches")
        if not chunks:
            # Nothing to send; also avoids requesting an index handle with
            # pool_threads=0, which is what min(len(chunks), ...) would yield.
            return

        pool_threads = min(len(chunks), MAX_POOL_THREADS)
        if self.upload_config.pool_threads:
            pool_threads = min(self.upload_config.pool_threads, pool_threads)

        index = self.connection_config.get_index(pool_threads=pool_threads)
        with index:
            upsert_kwargs: list[dict[str, Any]] = [
                {"vectors": chunk, "async_req": True} for chunk in chunks
            ]
            if namespace := self.upload_config.namespace:
                for kwargs in upsert_kwargs:
                    kwargs["namespace"] = namespace
            async_results = [index.upsert(**kwarg) for kwarg in upsert_kwargs]
            # Wait for and retrieve responses (this raises in case of error)
            try:
                results = [async_result.get() for async_result in async_results]
            except PineconeApiException as api_error:
                raise UnstructuredIngestError(f"http error: {api_error}") from api_error
            logger.debug(f"results: {results}")

    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
        """Replace the records of ``file_data`` in the configured index with ``data``.

        Existing records for the file are deleted with the strategy matching
        the index spec (serverless or pod), then ``data`` is upserted.

        Raises:
            ValueError: If no index name is configured, or the index description
                contains neither a ``serverless`` nor a ``pod`` spec.
        """
        index_name = self.connection_config.index_name
        # Validate before logging or creating a client.
        if not index_name:
            raise ValueError("No index name specified")

        logger.info(
            f"writing a total of {len(data)} elements via"
            f" document batches to destination"
            f" index named {index_name}"
        )
        # Determine if serverless or pod based index
        pinecone_client = self.connection_config.get_client()
        index_description = pinecone_client.describe_index(name=index_name)
        # A missing spec falls through to the ValueError below instead of
        # failing with a TypeError on ``in None``.
        spec = index_description.get("spec") or {}
        if "serverless" in spec:
            self.serverless_delete_by_record_id(file_data=file_data)
        elif "pod" in spec:
            self.pod_delete_by_record_id(file_data=file_data)
        else:
            raise ValueError(f"unexpected spec type in index description: {index_description}")
        self.upsert_batches_async(elements_dict=data)
392
+
393
+
394
# Registry entry bundling the Pinecone connection config, stager and uploader
# classes together with their respective config classes.
pinecone_destination_entry = DestinationRegistryEntry(
    connection_config=PineconeConnectionConfig,
    upload_stager_config=PineconeUploadStagerConfig,
    upload_stager=PineconeUploadStager,
    uploader_config=PineconeUploaderConfig,
    uploader=PineconeUploader,
)
@@ -0,0 +1,16 @@
1
+ from __future__ import annotations
2
+
3
+ from unstructured_ingest.processes.connector_registry import (
4
+ add_destination_entry,
5
+ )
6
+
7
+ from .cloud import CONNECTOR_TYPE as CLOUD_CONNECTOR_TYPE
8
+ from .cloud import qdrant_cloud_destination_entry
9
+ from .local import CONNECTOR_TYPE as LOCAL_CONNECTOR_TYPE
10
+ from .local import qdrant_local_destination_entry
11
+ from .server import CONNECTOR_TYPE as SERVER_CONNECTOR_TYPE
12
+ from .server import qdrant_server_destination_entry
13
+
14
# Register the cloud, server and local Qdrant destination entries, in that
# order, under their respective connector types.
for _connector_type, _entry in (
    (CLOUD_CONNECTOR_TYPE, qdrant_cloud_destination_entry),
    (SERVER_CONNECTOR_TYPE, qdrant_server_destination_entry),
    (LOCAL_CONNECTOR_TYPE, qdrant_local_destination_entry),
):
    add_destination_entry(destination_type=_connector_type, entry=_entry)
# Remove the loop names so the module namespace stays as it was.
del _connector_type, _entry