unstructured-ingest 1.2.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (243) hide show
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/README.md +28 -0
  4. unstructured_ingest/cli/__init__.py +0 -0
  5. unstructured_ingest/cli/base/__init__.py +4 -0
  6. unstructured_ingest/cli/base/cmd.py +269 -0
  7. unstructured_ingest/cli/base/dest.py +84 -0
  8. unstructured_ingest/cli/base/importer.py +34 -0
  9. unstructured_ingest/cli/base/src.py +75 -0
  10. unstructured_ingest/cli/cli.py +24 -0
  11. unstructured_ingest/cli/cmds.py +14 -0
  12. unstructured_ingest/cli/utils/__init__.py +0 -0
  13. unstructured_ingest/cli/utils/click.py +237 -0
  14. unstructured_ingest/cli/utils/model_conversion.py +222 -0
  15. unstructured_ingest/data_types/__init__.py +0 -0
  16. unstructured_ingest/data_types/entities.py +17 -0
  17. unstructured_ingest/data_types/file_data.py +116 -0
  18. unstructured_ingest/embed/__init__.py +0 -0
  19. unstructured_ingest/embed/azure_openai.py +63 -0
  20. unstructured_ingest/embed/bedrock.py +323 -0
  21. unstructured_ingest/embed/huggingface.py +69 -0
  22. unstructured_ingest/embed/interfaces.py +146 -0
  23. unstructured_ingest/embed/mixedbreadai.py +134 -0
  24. unstructured_ingest/embed/octoai.py +133 -0
  25. unstructured_ingest/embed/openai.py +142 -0
  26. unstructured_ingest/embed/togetherai.py +116 -0
  27. unstructured_ingest/embed/vertexai.py +109 -0
  28. unstructured_ingest/embed/voyageai.py +130 -0
  29. unstructured_ingest/error.py +156 -0
  30. unstructured_ingest/errors_v2.py +156 -0
  31. unstructured_ingest/interfaces/__init__.py +27 -0
  32. unstructured_ingest/interfaces/connector.py +56 -0
  33. unstructured_ingest/interfaces/downloader.py +90 -0
  34. unstructured_ingest/interfaces/indexer.py +29 -0
  35. unstructured_ingest/interfaces/process.py +22 -0
  36. unstructured_ingest/interfaces/processor.py +88 -0
  37. unstructured_ingest/interfaces/upload_stager.py +89 -0
  38. unstructured_ingest/interfaces/uploader.py +67 -0
  39. unstructured_ingest/logger.py +39 -0
  40. unstructured_ingest/main.py +11 -0
  41. unstructured_ingest/otel.py +128 -0
  42. unstructured_ingest/pipeline/__init__.py +0 -0
  43. unstructured_ingest/pipeline/interfaces.py +211 -0
  44. unstructured_ingest/pipeline/otel.py +32 -0
  45. unstructured_ingest/pipeline/pipeline.py +408 -0
  46. unstructured_ingest/pipeline/steps/__init__.py +0 -0
  47. unstructured_ingest/pipeline/steps/chunk.py +78 -0
  48. unstructured_ingest/pipeline/steps/download.py +206 -0
  49. unstructured_ingest/pipeline/steps/embed.py +77 -0
  50. unstructured_ingest/pipeline/steps/filter.py +35 -0
  51. unstructured_ingest/pipeline/steps/index.py +86 -0
  52. unstructured_ingest/pipeline/steps/partition.py +77 -0
  53. unstructured_ingest/pipeline/steps/stage.py +65 -0
  54. unstructured_ingest/pipeline/steps/uncompress.py +50 -0
  55. unstructured_ingest/pipeline/steps/upload.py +58 -0
  56. unstructured_ingest/processes/__init__.py +18 -0
  57. unstructured_ingest/processes/chunker.py +131 -0
  58. unstructured_ingest/processes/connector_registry.py +69 -0
  59. unstructured_ingest/processes/connectors/__init__.py +129 -0
  60. unstructured_ingest/processes/connectors/airtable.py +238 -0
  61. unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
  62. unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
  63. unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
  64. unstructured_ingest/processes/connectors/astradb.py +592 -0
  65. unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
  66. unstructured_ingest/processes/connectors/chroma.py +193 -0
  67. unstructured_ingest/processes/connectors/confluence.py +527 -0
  68. unstructured_ingest/processes/connectors/couchbase.py +336 -0
  69. unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
  70. unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
  71. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
  72. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
  73. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
  74. unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
  75. unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
  76. unstructured_ingest/processes/connectors/delta_table.py +310 -0
  77. unstructured_ingest/processes/connectors/discord.py +161 -0
  78. unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
  79. unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
  80. unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
  81. unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
  82. unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
  83. unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
  84. unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
  85. unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
  86. unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
  87. unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
  88. unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
  89. unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
  90. unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
  91. unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
  92. unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
  93. unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
  94. unstructured_ingest/processes/connectors/github.py +226 -0
  95. unstructured_ingest/processes/connectors/gitlab.py +270 -0
  96. unstructured_ingest/processes/connectors/google_drive.py +848 -0
  97. unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
  98. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
  99. unstructured_ingest/processes/connectors/jira.py +522 -0
  100. unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
  101. unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
  102. unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
  103. unstructured_ingest/processes/connectors/kafka/local.py +103 -0
  104. unstructured_ingest/processes/connectors/kdbai.py +156 -0
  105. unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
  106. unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
  107. unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
  108. unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
  109. unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
  110. unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
  111. unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
  112. unstructured_ingest/processes/connectors/local.py +227 -0
  113. unstructured_ingest/processes/connectors/milvus.py +311 -0
  114. unstructured_ingest/processes/connectors/mongodb.py +389 -0
  115. unstructured_ingest/processes/connectors/neo4j.py +534 -0
  116. unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
  117. unstructured_ingest/processes/connectors/notion/client.py +349 -0
  118. unstructured_ingest/processes/connectors/notion/connector.py +350 -0
  119. unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
  120. unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
  121. unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
  122. unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
  123. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  124. unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
  125. unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
  126. unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
  127. unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
  128. unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  129. unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  130. unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  131. unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
  132. unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
  133. unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
  134. unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
  135. unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
  136. unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
  137. unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
  138. unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
  139. unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
  140. unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
  141. unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
  142. unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  143. unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  144. unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  145. unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  146. unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
  147. unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
  148. unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
  149. unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
  150. unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  151. unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
  152. unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
  153. unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
  154. unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  155. unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
  156. unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
  157. unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
  158. unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
  159. unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
  160. unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
  161. unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
  162. unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
  163. unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
  164. unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
  165. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  166. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
  167. unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
  168. unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
  169. unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
  170. unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
  171. unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
  172. unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
  173. unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
  174. unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
  175. unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
  176. unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
  177. unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
  178. unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
  179. unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
  180. unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
  181. unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
  182. unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
  183. unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
  184. unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
  185. unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
  186. unstructured_ingest/processes/connectors/onedrive.py +485 -0
  187. unstructured_ingest/processes/connectors/outlook.py +242 -0
  188. unstructured_ingest/processes/connectors/pinecone.py +400 -0
  189. unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
  190. unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
  191. unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
  192. unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
  193. unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
  194. unstructured_ingest/processes/connectors/redisdb.py +214 -0
  195. unstructured_ingest/processes/connectors/salesforce.py +307 -0
  196. unstructured_ingest/processes/connectors/sharepoint.py +282 -0
  197. unstructured_ingest/processes/connectors/slack.py +249 -0
  198. unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
  199. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
  200. unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
  201. unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
  202. unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
  203. unstructured_ingest/processes/connectors/sql/sql.py +456 -0
  204. unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
  205. unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
  206. unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
  207. unstructured_ingest/processes/connectors/utils.py +60 -0
  208. unstructured_ingest/processes/connectors/vectara.py +348 -0
  209. unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
  210. unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
  211. unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
  212. unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
  213. unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
  214. unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
  215. unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
  216. unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
  217. unstructured_ingest/processes/embedder.py +203 -0
  218. unstructured_ingest/processes/filter.py +60 -0
  219. unstructured_ingest/processes/partitioner.py +233 -0
  220. unstructured_ingest/processes/uncompress.py +61 -0
  221. unstructured_ingest/processes/utils/__init__.py +8 -0
  222. unstructured_ingest/processes/utils/blob_storage.py +32 -0
  223. unstructured_ingest/processes/utils/logging/connector.py +365 -0
  224. unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
  225. unstructured_ingest/unstructured_api.py +140 -0
  226. unstructured_ingest/utils/__init__.py +5 -0
  227. unstructured_ingest/utils/chunking.py +56 -0
  228. unstructured_ingest/utils/compression.py +72 -0
  229. unstructured_ingest/utils/constants.py +2 -0
  230. unstructured_ingest/utils/data_prep.py +216 -0
  231. unstructured_ingest/utils/dep_check.py +78 -0
  232. unstructured_ingest/utils/filesystem.py +27 -0
  233. unstructured_ingest/utils/html.py +174 -0
  234. unstructured_ingest/utils/ndjson.py +52 -0
  235. unstructured_ingest/utils/pydantic_models.py +52 -0
  236. unstructured_ingest/utils/string_and_date_utils.py +74 -0
  237. unstructured_ingest/utils/table.py +80 -0
  238. unstructured_ingest/utils/tls.py +15 -0
  239. unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
  240. unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
  241. unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
  242. unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
  243. unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
@@ -0,0 +1,348 @@
1
+ import asyncio
2
+ import json
3
+ import uuid
4
+ from dataclasses import dataclass, field
5
+ from datetime import datetime
6
+ from pathlib import Path
7
+ from typing import Any, Dict, Mapping, Optional
8
+
9
+ from pydantic import Field, Secret
10
+
11
+ from unstructured_ingest.data_types.file_data import FileData
12
+ from unstructured_ingest.error import DestinationConnectionError, ValueError
13
+ from unstructured_ingest.interfaces import (
14
+ AccessConfig,
15
+ ConnectionConfig,
16
+ Uploader,
17
+ UploaderConfig,
18
+ UploadStager,
19
+ UploadStagerConfig,
20
+ )
21
+ from unstructured_ingest.logger import logger
22
+ from unstructured_ingest.processes.connector_registry import DestinationRegistryEntry
23
+ from unstructured_ingest.utils.data_prep import flatten_dict
24
+ from unstructured_ingest.utils.dep_check import requires_dependencies
25
+
26
# Root URL of the Vectara v2 REST API; request endpoints are appended as "<BASE_URL>/<endpoint>".
+ BASE_URL = "https://api.vectara.io/v2"
27
+
28
# Connector identifier; used as the default value of VectaraUploader.connector_type.
+ CONNECTOR_TYPE = "vectara"
29
+
30
+
31
# OAuth2 client credentials; the uploader sends them as "client_id" / "client_secret"
# in the client-credentials token request.
+ class VectaraAccessConfig(AccessConfig):
32
+ oauth_client_id: str = Field(description="Client ID")
33
+ oauth_secret: str = Field(description="Client Secret")
34
+
35
+
36
# Connection settings for the Vectara destination.
+ class VectaraConnectionConfig(ConnectionConfig):
37
# Credentials wrapped in Secret; read with get_secret_value() when the token request is built.
+ access_config: Secret[VectaraAccessConfig]
38
# Substituted into the "{}" placeholder of token_url.
+ customer_id: str
39
# Optional corpus name; when set, the uploader's connection check looks the corpus up by name.
+ corpus_name: Optional[str] = None
40
# Corpus key used in the "corpora/<corpus_key>/documents" endpoints; the connection check
# fills it in from corpus_name when it is left unset.
+ corpus_key: Optional[str] = None
41
# OAuth2 token endpoint template, formatted with customer_id.
+ token_url: str = "https://vectara-prod-{}.auth.us-west-2.amazoncognito.com/oauth2/token"
42
+
43
+
44
# Configuration for VectaraUploadStager; declares no options of its own.
+ class VectaraUploadStagerConfig(UploadStagerConfig):
45
+ pass
46
+
47
+
48
@dataclass
class VectaraUploadStager(UploadStager):
    """Reshape a JSON file of elements into the document payload sent to Vectara.

    All elements of one input file are combined into a single document of type
    ``"core"``; each element becomes one entry of ``document_parts``.
    """

    upload_stager_config: VectaraUploadStagerConfig = field(
        default_factory=VectaraUploadStagerConfig
    )

    @staticmethod
    def conform_dict(data: dict) -> dict:
        """
        Prepares dictionary in the format that Vectara requires.
        See more detail in https://docs.vectara.com/docs/rest-api/create-corpus-document

        The input is flattened with "-" as separator, the leading "metadata-"
        prefix is removed from each key, and only the whitelisted metadata
        fields are kept (optionally renamed).

        Args:
            data: a single element dictionary (without its text).

        Returns:
            A flat dictionary containing only the selected metadata fields.
        """
        # flattened source key (prefix removed) -> key name sent to Vectara
        metadata_map = {
            "page_number": "page_number",
            "data_source-url": "url",
            "filename": "filename",
            "filetype": "filetype",
            "last_modified": "last_modified",
            "element_id": "element_id",
        }
        flattened = flatten_dict(data, separator="-", flatten_lists=True)

        conformed = {}
        for key, value in flattened.items():
            # Strip only a leading "metadata-"; str.replace would also rewrite
            # occurrences in the middle of a nested key.
            target_key = metadata_map.get(key.removeprefix("metadata-"))
            if target_key is not None:
                conformed[target_key] = value
        return conformed

    def process_whole(self, input_file: Path, output_file: Path, file_data: FileData) -> None:
        """Convert the elements in ``input_file`` into one Vectara document.

        Args:
            input_file: JSON file holding a list of element dictionaries.
            output_file: destination path; receives a JSON list with one document.
            file_data: its ``identifier`` is stored as the document title.
        """
        with input_file.open() as in_f:
            elements_contents = json.load(in_f)

        logger.info(
            f"Extending {len(elements_contents)} json elements from content in {input_file}"
        )

        document_parts = []
        for element in elements_contents:
            # Take the text out of the element first; what remains is passed on
            # as the source of the part's metadata.
            text = element.pop("text", None)
            document_parts.append({"text": text, "metadata": self.conform_dict(data=element)})

        conformed_elements = [
            {
                # A fresh random id is generated on every staging run.
                "id": str(uuid.uuid4()),
                "type": "core",
                "metadata": {
                    "title": file_data.identifier,
                },
                "document_parts": document_parts,
            }
        ]

        with output_file.open("w") as out_f:
            json.dump(conformed_elements, out_f, indent=2)
103
+
104
+
105
# Configuration for VectaraUploader; declares no options of its own.
+ class VectaraUploaderConfig(UploaderConfig):
106
+ pass
107
+
108
+
109
@dataclass
class VectaraUploader(Uploader):
    """Upload staged documents to a Vectara corpus through the v2 REST API.

    Requests are authorised with an OAuth2 client-credentials JWT token that is
    cached on the instance and requested again once it is within 60 seconds of
    its expiry time.
    """

    connector_type: str = CONNECTOR_TYPE
    upload_config: VectaraUploaderConfig
    connection_config: VectaraConnectionConfig
    # Cached bearer token and the POSIX timestamp at which it expires.
    _jwt_token: Optional[str] = field(init=False, default=None)
    _jwt_token_expires_ts: Optional[float] = field(init=False, default=None)

    def is_async(self) -> bool:
        """Return True; this uploader implements ``run_data_async``."""
        return True

    def precheck(self) -> None:
        """Validate the credentials and the configured corpus.

        Raises:
            DestinationConnectionError: if obtaining a token, listing the corpora
                or validating the corpus name/key fails.
        """
        try:
            self._check_connection_and_corpora()
        except Exception as e:
            logger.error(f"Failed to validate connection {e}", exc_info=True)
            # Chain the cause so the original traceback is preserved.
            raise DestinationConnectionError(f"failed to validate connection: {e}") from e

    def _token_needs_refresh(self) -> bool:
        """Return True when no token is cached or it expires within 60 seconds."""
        if not self._jwt_token or self._jwt_token_expires_ts is None:
            return True
        return self._jwt_token_expires_ts - datetime.now().timestamp() <= 60

    @property
    async def jwt_token_async(self) -> str:
        """Awaitable property returning a valid JWT token, refreshing it if needed."""
        if self._token_needs_refresh():
            self._jwt_token = await self._get_jwt_token_async()
        return self._jwt_token

    @property
    def jwt_token(self) -> str:
        """Return a valid JWT token, refreshing it synchronously if needed."""
        if self._token_needs_refresh():
            self._jwt_token = self._get_jwt_token()
        return self._jwt_token

    def _token_request_args(self) -> tuple[str, dict, dict]:
        """Build the endpoint, headers and form data of the OAuth2 token request."""
        access_config = self.connection_config.access_config.get_secret_value()
        token_endpoint = self.connection_config.token_url.format(self.connection_config.customer_id)
        headers = {
            "Content-Type": "application/x-www-form-urlencoded",
        }
        data = {
            "grant_type": "client_credentials",
            "client_id": access_config.oauth_client_id,
            "client_secret": access_config.oauth_secret,
        }
        return token_endpoint, headers, data

    def _record_token_response(self, response_json: dict) -> str:
        """Store the expiry timestamp from a token response and return the access token."""
        request_time = datetime.now().timestamp()
        self._jwt_token_expires_ts = request_time + response_json.get("expires_in")
        return response_json.get("access_token")

    # Get Oauth2 JWT token
    @requires_dependencies(["httpx"], extras="vectara")
    async def _get_jwt_token_async(self) -> str:
        """Connect to the server and get a JWT token (asynchronous variant)."""
        import httpx

        token_endpoint, headers, data = self._token_request_args()

        async with httpx.AsyncClient() as client:
            response = await client.post(token_endpoint, headers=headers, data=data)
            response.raise_for_status()
            response_json = response.json()

        return self._record_token_response(response_json)

    # Get Oauth2 JWT token
    @requires_dependencies(["httpx"], extras="vectara")
    def _get_jwt_token(self) -> str:
        """Connect to the server and get a JWT token (synchronous variant)."""
        import httpx

        token_endpoint, headers, data = self._token_request_args()

        with httpx.Client() as client:
            response = client.post(token_endpoint, headers=headers, data=data)
            response.raise_for_status()
            response_json = response.json()

        return self._record_token_response(response_json)

    @DestinationConnectionError.wrap
    def _check_connection_and_corpora(self) -> None:
        """
        Check the connection for Vectara and validate corpus exists.
        - If more than one corpus with the same name exists - raise error
        - If exactly one corpus exists with this name - use it.
        - If does not exist - raise error.

        The corpus lookup only happens when ``corpus_name`` is configured.
        """
        # Get token if not already set
        self.jwt_token

        # _request returns the decoded JSON body itself; tuple-unpacking it would
        # bind the dictionary's keys instead of the response.
        list_corpora_response = self._request(
            http_method="GET",
            endpoint="corpora",
        )

        corpus_name = self.connection_config.corpus_name
        if not corpus_name:
            return

        # NOTE(review): only the corpora contained in this single response are
        # inspected - confirm whether the endpoint paginates its results.
        possible_corpora_keys_names_map = {
            corpus.get("key"): corpus.get("name")
            for corpus in list_corpora_response.get("corpora") or []
            if corpus.get("name") == corpus_name
        }

        if len(possible_corpora_keys_names_map) > 1:
            raise ValueError(f"Multiple Corpus exist with name {corpus_name} in dest.")
        if not possible_corpora_keys_names_map:
            raise ValueError(f"No Corpora exist with name {corpus_name} in dest.")

        (matched_key,) = possible_corpora_keys_names_map
        if not self.connection_config.corpus_key:
            self.connection_config.corpus_key = matched_key
        elif self.connection_config.corpus_key != matched_key:
            raise ValueError("Corpus key does not match provided corpus name.")

    @staticmethod
    def _request_headers(token: str) -> dict:
        """Return the JSON request headers carrying ``token`` as bearer credential."""
        return {
            "Content-Type": "application/json",
            "Accept": "application/json",
            "Authorization": f"Bearer {token}",
            "X-source": "unstructured",
        }

    @requires_dependencies(["httpx"], extras="vectara")
    async def _async_request(
        self,
        endpoint: str,
        http_method: str = "POST",
        params: Optional[Mapping[str, Any]] = None,
        data: Optional[Mapping[str, Any]] = None,
    ) -> dict:
        """Send an authorised request to ``BASE_URL/endpoint`` asynchronously.

        Args:
            endpoint: path appended to ``BASE_URL``.
            http_method: HTTP verb to use.
            params: optional query parameters.
            data: optional JSON request body.

        Returns:
            The decoded JSON response body.

        Raises:
            httpx.HTTPStatusError: if the response status is an error.
        """
        import httpx

        url = f"{BASE_URL}/{endpoint}"
        headers = self._request_headers(await self.jwt_token_async)

        async with httpx.AsyncClient() as client:
            response = await client.request(
                method=http_method, url=url, headers=headers, params=params, json=data
            )
            response.raise_for_status()
            return response.json()

    @requires_dependencies(["httpx"], extras="vectara")
    def _request(
        self,
        endpoint: str,
        http_method: str = "POST",
        params: Optional[Mapping[str, Any]] = None,
        data: Optional[Mapping[str, Any]] = None,
    ) -> dict:
        """Send an authorised request to ``BASE_URL/endpoint`` synchronously.

        Args:
            endpoint: path appended to ``BASE_URL``.
            http_method: HTTP verb to use.
            params: optional query parameters.
            data: optional JSON request body.

        Returns:
            The decoded JSON response body.

        Raises:
            httpx.HTTPStatusError: if the response status is an error.
        """
        import httpx

        url = f"{BASE_URL}/{endpoint}"
        headers = self._request_headers(self.jwt_token)

        with httpx.Client() as client:
            response = client.request(
                method=http_method, url=url, headers=headers, params=params, json=data
            )
            response.raise_for_status()
            return response.json()

    async def _delete_doc(self, doc_id: str) -> dict:
        """
        Delete a document from the Vectara corpus.

        Returns the decoded JSON body of the DELETE response.
        """
        return await self._async_request(
            endpoint=f"corpora/{self.connection_config.corpus_key}/documents/{doc_id}",
            http_method="DELETE",
        )

    async def _index_document(self, document: Dict[str, Any]) -> None:
        """
        Index a document (by uploading it to the Vectara corpus) from the document dictionary

        A request failure is logged and not propagated. When the response reports
        that the document already exists, the document is deleted and posted again.
        """
        documents_endpoint = f"corpora/{self.connection_config.corpus_key}/documents"

        logger.debug(
            f"Indexing document {document['id']} to corpus key {self.connection_config.corpus_key}"
        )

        try:
            result = await self._async_request(endpoint=documents_endpoint, data=document)
        except Exception as e:
            logger.error(f"exception {e} while indexing document {document['id']}")
            return

        messages = result["messages"] if "messages" in result else None
        already_exists = bool(messages) and (
            "ALREADY_EXISTS" in messages
            or "CONFLICT: Indexing doesn't support updating documents." in messages[0]
        )
        if already_exists:
            logger.info(f"document {document['id']} already exists, re-indexing")
            await self._delete_doc(document["id"])
            await self._async_request(endpoint=documents_endpoint, data=document)
            return

        logger.info(f"indexing document {document['id']} succeeded")

    async def run_data_async(
        self,
        data: list[dict],
        file_data: FileData,
        **kwargs: Any,
    ) -> None:
        """Index every document in ``data`` concurrently.

        Args:
            data: document dictionaries; each must contain an ``"id"`` entry.
            file_data: not used by this uploader.
            **kwargs: ignored.
        """
        logger.info(f"inserting / updating {len(data)} documents to Vectara ")
        await asyncio.gather(*(self._index_document(vdoc) for vdoc in data))
340
+
341
+
342
# Destination registry entry bundling the Vectara connection config, uploader and
# upload stager classes together with their config classes.
+ vectara_destination_entry = DestinationRegistryEntry(
343
+ connection_config=VectaraConnectionConfig,
344
+ uploader=VectaraUploader,
345
+ uploader_config=VectaraUploaderConfig,
346
+ upload_stager=VectaraUploadStager,
347
+ upload_stager_config=VectaraUploadStagerConfig,
348
+ )
@@ -0,0 +1,22 @@
1
+ from __future__ import annotations
2
+
3
+ from unstructured_ingest.processes.connector_registry import (
4
+ add_destination_entry,
5
+ )
6
+
7
+ from .cloud import CONNECTOR_TYPE as CLOUD_WEAVIATE_CONNECTOR_TYPE
8
+ from .cloud import weaviate_cloud_destination_entry
9
+ from .embedded import CONNECTOR_TYPE as EMBEDDED_WEAVIATE_CONNECTOR_TYPE
10
+ from .embedded import weaviate_embedded_destination_entry
11
+ from .local import CONNECTOR_TYPE as LOCAL_WEAVIATE_CONNECTOR_TYPE
12
+ from .local import weaviate_local_destination_entry
13
+
14
# Register the local, cloud and embedded Weaviate destinations at import time,
# each under the CONNECTOR_TYPE constant of its own module.
+ add_destination_entry(
15
+ destination_type=LOCAL_WEAVIATE_CONNECTOR_TYPE, entry=weaviate_local_destination_entry
16
+ )
17
+ add_destination_entry(
18
+ destination_type=CLOUD_WEAVIATE_CONNECTOR_TYPE, entry=weaviate_cloud_destination_entry
19
+ )
20
+ add_destination_entry(
21
+ destination_type=EMBEDDED_WEAVIATE_CONNECTOR_TYPE, entry=weaviate_embedded_destination_entry
22
+ )
@@ -0,0 +1,166 @@
1
+ from contextlib import contextmanager
2
+ from dataclasses import dataclass, field
3
+ from typing import TYPE_CHECKING, Any, Generator, Optional
4
+
5
+ from pydantic import Field, Secret
6
+
7
+ from unstructured_ingest.error import ValueError
8
+ from unstructured_ingest.processes.connector_registry import DestinationRegistryEntry
9
+ from unstructured_ingest.processes.connectors.weaviate.weaviate import (
10
+ WeaviateAccessConfig,
11
+ WeaviateConnectionConfig,
12
+ WeaviateUploader,
13
+ WeaviateUploaderConfig,
14
+ WeaviateUploadStager,
15
+ WeaviateUploadStagerConfig,
16
+ )
17
+ from unstructured_ingest.utils.dep_check import requires_dependencies
18
+
19
+ if TYPE_CHECKING:
20
+ from weaviate.auth import AuthCredentials
21
+ from weaviate.client import WeaviateClient
22
+
23
# Identifier for the Weaviate Cloud connector; used below as the uploader's
# default ``connector_type``.
CONNECTOR_TYPE = "weaviate-cloud"
24
+
25
+
26
class CloudWeaviateAccessConfig(WeaviateAccessConfig):
    """Credential values for Weaviate Cloud; every field defaults to ``None``."""

    access_token: Optional[str] = Field(
        description="Used to create the bearer token.",
        default=None,
    )
    api_key: Optional[str] = None
    client_secret: Optional[str] = None
    password: Optional[str] = None
33
+
34
+
35
class CloudWeaviateConnectionConfig(WeaviateConnectionConfig):
    """Connection settings for a Weaviate Cloud (WCD) cluster.

    Unless ``anonymous`` is set, exactly one authentication approach must be
    configured: API key, bearer token, client secret, or username + password.
    """

    cluster_url: str = Field(
        description="The WCD cluster URL or hostname to connect to. "
        "Usually in the form: rAnD0mD1g1t5.something.weaviate.cloud"
    )
    username: Optional[str] = None
    anonymous: bool = Field(default=False, description="if set, all auth values will be ignored")
    refresh_token: Optional[str] = Field(
        default=None,
        description="Will tie this value to the bearer token. If not provided, "
        "the authentication will expire once the lifetime of the access token is up.",
    )
    access_config: Secret[CloudWeaviateAccessConfig]

    def model_post_init(self, __context: Any) -> None:
        """Check that exactly one auth approach is configured.

        Skipped entirely when ``anonymous`` is set.

        Raises:
            ValueError: (the ``unstructured_ingest.error`` one) when no auth
                approach, or more than one, is configured.
        """
        if self.anonymous:
            return
        secrets = self.access_config.get_secret_value()
        # (label, is-configured) pairs, in the order they are reported.
        candidates = (
            ("api_key", secrets.api_key is not None),
            ("bearer_token", secrets.access_token is not None),
            ("client_secret", secrets.client_secret is not None),
            (
                "client_password",
                secrets.password is not None and self.username is not None,
            ),
        )
        configured = [label for label, present in candidates if present]

        if not configured:
            raise ValueError("No auth values provided and anonymous is False")
        if len(configured) > 1:
            raise ValueError(
                "Multiple auth values provided, only one approach can be used: {}".format(
                    ", ".join(configured)
                )
            )

    @requires_dependencies(["weaviate"], extras="weaviate")
    def get_api_key_auth(self) -> Optional["AuthCredentials"]:
        """Return API-key credentials, or ``None`` when no API key is set."""
        from weaviate.classes.init import Auth

        api_key = self.access_config.get_secret_value().api_key
        if not api_key:
            return None
        return Auth.api_key(api_key=api_key)

    @requires_dependencies(["weaviate"], extras="weaviate")
    def get_bearer_token_auth(self) -> Optional["AuthCredentials"]:
        """Return bearer-token credentials, or ``None`` when no access token is set."""
        from weaviate.classes.init import Auth

        access_token = self.access_config.get_secret_value().access_token
        if not access_token:
            return None
        return Auth.bearer_token(access_token=access_token, refresh_token=self.refresh_token)

    @requires_dependencies(["weaviate"], extras="weaviate")
    def get_client_secret_auth(self) -> Optional["AuthCredentials"]:
        """Return client-credentials auth, or ``None`` when no client secret is set."""
        from weaviate.classes.init import Auth

        client_secret = self.access_config.get_secret_value().client_secret
        if not client_secret:
            return None
        return Auth.client_credentials(client_secret=client_secret)

    @requires_dependencies(["weaviate"], extras="weaviate")
    def get_client_password_auth(self) -> Optional["AuthCredentials"]:
        """Return username/password auth, or ``None`` unless both values are set."""
        from weaviate.classes.init import Auth

        username = self.username
        if not username:
            return None
        password = self.access_config.get_secret_value().password
        if not password:
            return None
        return Auth.client_password(username=username, password=password)

    @requires_dependencies(["weaviate"], extras="weaviate")
    def get_auth(self) -> "AuthCredentials":
        """Return the single configured credentials object.

        Raises:
            ValueError: when no auth approach, or more than one, yields
                credentials.
        """
        candidates = (
            self.get_api_key_auth(),
            self.get_client_secret_auth(),
            self.get_bearer_token_auth(),
            self.get_client_password_auth(),
        )
        available = [credentials for credentials in candidates if credentials]
        if not available:
            raise ValueError("No auth values provided and anonymous is False")
        if len(available) > 1:
            raise ValueError("Multiple auth values provided, only one approach can be used")
        return available[0]

    @contextmanager
    @requires_dependencies(["weaviate"], extras="weaviate")
    def get_client(self) -> Generator["WeaviateClient", None, None]:
        """Yield a client connected to the configured cloud cluster.

        The connection is used as a context manager, so it is released when
        the caller's ``with`` block exits.
        """
        from weaviate import connect_to_weaviate_cloud
        from weaviate.classes.init import AdditionalConfig

        # Anonymous access sends no credentials at all.
        credentials = self.get_auth() if not self.anonymous else None
        client_options = AdditionalConfig(timeout=self.get_timeout())
        with connect_to_weaviate_cloud(
            cluster_url=self.cluster_url,
            auth_credentials=credentials,
            additional_config=client_options,
        ) as client:
            yield client
132
+
133
+
134
class CloudWeaviateUploadStagerConfig(WeaviateUploadStagerConfig):
    """Upload stager config for Weaviate Cloud; adds nothing to the base config."""
136
+
137
+
138
@dataclass
class CloudWeaviateUploadStager(WeaviateUploadStager):
    """Upload stager for the Weaviate Cloud destination.

    Adds no behaviour of its own; it only narrows the config type.
    """

    # Build the cloud-specific config by default so the default value matches
    # the annotated type (the base WeaviateUploadStagerConfig was built before).
    upload_stager_config: CloudWeaviateUploadStagerConfig = field(
        default_factory=CloudWeaviateUploadStagerConfig
    )
143
+
144
+
145
class CloudWeaviateUploaderConfig(WeaviateUploaderConfig):
    """Uploader config for Weaviate Cloud; adds nothing to the base config."""
147
+
148
+
149
@dataclass
class CloudWeaviateUploader(WeaviateUploader):
    """Uploader for the Weaviate Cloud destination.

    Adds no behaviour of its own; it narrows the config types and sets the
    connector type.
    """

    # NOTE(review): CloudWeaviateConnectionConfig declares cluster_url and
    # access_config without defaults, so this factory presumably fails
    # validation if it is ever invoked; callers likely always pass
    # connection_config explicitly. Confirm.
    connection_config: CloudWeaviateConnectionConfig = field(
        default_factory=CloudWeaviateConnectionConfig
    )
    upload_config: CloudWeaviateUploaderConfig = field(
        default_factory=CloudWeaviateUploaderConfig
    )
    connector_type: str = CONNECTOR_TYPE
158
+
159
+
160
# Destination registry entry bundling the Weaviate Cloud connection config,
# upload stager and uploader classes together with their config classes.
weaviate_cloud_destination_entry = DestinationRegistryEntry(
    connection_config=CloudWeaviateConnectionConfig,
    upload_stager_config=CloudWeaviateUploadStagerConfig,
    upload_stager=CloudWeaviateUploadStager,
    uploader_config=CloudWeaviateUploaderConfig,
    uploader=CloudWeaviateUploader,
)
@@ -0,0 +1,90 @@
1
+ from contextlib import contextmanager
2
+ from dataclasses import dataclass, field
3
+ from typing import TYPE_CHECKING, Generator, Optional
4
+
5
+ from pydantic import Field, Secret
6
+
7
+ from unstructured_ingest.processes.connector_registry import DestinationRegistryEntry
8
+ from unstructured_ingest.processes.connectors.weaviate.weaviate import (
9
+ WeaviateAccessConfig,
10
+ WeaviateConnectionConfig,
11
+ WeaviateUploader,
12
+ WeaviateUploaderConfig,
13
+ WeaviateUploadStager,
14
+ WeaviateUploadStagerConfig,
15
+ )
16
+ from unstructured_ingest.utils.dep_check import requires_dependencies
17
+
18
+ if TYPE_CHECKING:
19
+ from weaviate.client import WeaviateClient
20
+
21
# Identifier for the embedded Weaviate connector; used below as the
# uploader's default ``connector_type``.
CONNECTOR_TYPE = "weaviate-embedded"
22
+
23
+
24
class EmbeddedWeaviateAccessConfig(WeaviateAccessConfig):
    """Access config for embedded Weaviate; adds nothing to the base config."""
26
+
27
+
28
class EmbeddedWeaviateConnectionConfig(WeaviateConnectionConfig):
    """Connection settings for an embedded Weaviate instance."""

    hostname: str = Field(description="hostname", default="127.0.0.1")
    port: int = Field(description="http port", default=8079)
    grpc_port: int = Field(description="grpc port", default=50050)
    data_path: Optional[str] = Field(
        default=None,
        description="directory where the files making up the "
        "database are stored. If not provided, will "
        "default to underlying SDK implementation",
    )
    # No credentials are needed here, so an empty base access config is the default.
    access_config: Secret[WeaviateAccessConfig] = Field(
        default=WeaviateAccessConfig(), validate_default=True
    )

    @contextmanager
    @requires_dependencies(["weaviate"], extras="weaviate")
    def get_client(self) -> Generator["WeaviateClient", None, None]:
        """Yield a client connected to the embedded instance.

        The connection is used as a context manager, so it is released when
        the caller's ``with`` block exits.
        """
        from weaviate import connect_to_embedded
        from weaviate.classes.init import AdditionalConfig

        connect_kwargs = {
            "hostname": self.hostname,
            "port": self.port,
            "grpc_port": self.grpc_port,
            "persistence_data_path": self.data_path,
            "additional_config": AdditionalConfig(timeout=self.get_timeout()),
        }
        with connect_to_embedded(**connect_kwargs) as client:
            yield client
56
+
57
+
58
class EmbeddedWeaviateUploadStagerConfig(WeaviateUploadStagerConfig):
    """Upload stager config for embedded Weaviate; adds nothing to the base config."""
60
+
61
+
62
@dataclass
class EmbeddedWeaviateUploadStager(WeaviateUploadStager):
    """Upload stager for the embedded Weaviate destination.

    Adds no behaviour of its own; it only narrows the config type.
    """

    # Build the embedded-specific config by default so the default value matches
    # the annotated type (the base WeaviateUploadStagerConfig was built before).
    upload_stager_config: EmbeddedWeaviateUploadStagerConfig = field(
        default_factory=EmbeddedWeaviateUploadStagerConfig
    )
67
+
68
+
69
class EmbeddedWeaviateUploaderConfig(WeaviateUploaderConfig):
    """Uploader config for embedded Weaviate; adds nothing to the base config."""
71
+
72
+
73
@dataclass
class EmbeddedWeaviateUploader(WeaviateUploader):
    """Uploader for the embedded Weaviate destination.

    Adds no behaviour of its own; it narrows the config types and sets the
    connector type.
    """

    connection_config: EmbeddedWeaviateConnectionConfig = field(
        default_factory=EmbeddedWeaviateConnectionConfig
    )
    upload_config: EmbeddedWeaviateUploaderConfig = field(
        default_factory=EmbeddedWeaviateUploaderConfig
    )
    connector_type: str = CONNECTOR_TYPE
82
+
83
+
84
# Destination registry entry bundling the embedded Weaviate connection config,
# upload stager and uploader classes together with their config classes.
weaviate_embedded_destination_entry = DestinationRegistryEntry(
    connection_config=EmbeddedWeaviateConnectionConfig,
    upload_stager_config=EmbeddedWeaviateUploadStagerConfig,
    upload_stager=EmbeddedWeaviateUploadStager,
    uploader_config=EmbeddedWeaviateUploaderConfig,
    uploader=EmbeddedWeaviateUploader,
)
@@ -0,0 +1,73 @@
1
+ from contextlib import contextmanager
2
+ from dataclasses import dataclass, field
3
+ from typing import TYPE_CHECKING, Generator
4
+
5
+ from pydantic import Field, Secret
6
+
7
+ from unstructured_ingest.processes.connector_registry import DestinationRegistryEntry
8
+ from unstructured_ingest.processes.connectors.weaviate.weaviate import (
9
+ WeaviateAccessConfig,
10
+ WeaviateConnectionConfig,
11
+ WeaviateUploader,
12
+ WeaviateUploaderConfig,
13
+ WeaviateUploadStager,
14
+ WeaviateUploadStagerConfig,
15
+ )
16
+ from unstructured_ingest.utils.dep_check import requires_dependencies
17
+
18
+ if TYPE_CHECKING:
19
+ from weaviate.client import WeaviateClient
20
+
21
# Identifier for the local Weaviate connector; used below as the uploader's
# default ``connector_type``.
CONNECTOR_TYPE = "weaviate-local"
22
+
23
+
24
class LocalWeaviateAccessConfig(WeaviateAccessConfig):
    """Access config for local Weaviate; adds nothing to the base config."""
26
+
27
+
28
class LocalWeaviateConnectionConfig(WeaviateConnectionConfig):
    """Connection settings for a locally running Weaviate instance."""

    # No credentials are needed here, so an empty base access config is the default.
    access_config: Secret[WeaviateAccessConfig] = Field(
        validate_default=True,
        default=WeaviateAccessConfig(),
    )

    @contextmanager
    @requires_dependencies(["weaviate"], extras="weaviate")
    def get_client(self) -> Generator["WeaviateClient", None, None]:
        """Yield a client connected via ``connect_to_local``.

        Only the timeout is passed through; host and port are left to the
        SDK's defaults. The connection is used as a context manager, so it is
        released when the caller's ``with`` block exits.
        """
        from weaviate import connect_to_local
        from weaviate.classes.init import AdditionalConfig

        client_options = AdditionalConfig(timeout=self.get_timeout())
        with connect_to_local(additional_config=client_options) as client:
            yield client
43
+
44
+
45
class LocalWeaviateUploadStagerConfig(WeaviateUploadStagerConfig):
    """Upload stager config for local Weaviate; adds nothing to the base config."""
47
+
48
+
49
@dataclass
class LocalWeaviateUploadStager(WeaviateUploadStager):
    """Upload stager for the local Weaviate destination.

    Adds no behaviour of its own; it only narrows the config type.
    """

    # Build the local-specific config by default so the default value matches
    # the annotated type (the base WeaviateUploadStagerConfig was built before).
    upload_stager_config: LocalWeaviateUploadStagerConfig = field(
        default_factory=LocalWeaviateUploadStagerConfig
    )
54
+
55
+
56
class LocalWeaviateUploaderConfig(WeaviateUploaderConfig):
    """Uploader config for local Weaviate; adds nothing to the base config."""
58
+
59
+
60
@dataclass
class LocalWeaviateUploader(WeaviateUploader):
    """Uploader for the local Weaviate destination.

    Adds no behaviour of its own; it narrows the config types and sets the
    connector type.
    """

    # NOTE(review): connection_config has no default yet follows a defaulted
    # field; this presumably only works because the fields are inherited from
    # the base dataclass and keep their original positions. Confirm before
    # reordering.
    upload_config: LocalWeaviateUploaderConfig
    connector_type: str = CONNECTOR_TYPE
    connection_config: LocalWeaviateConnectionConfig
65
+
66
+
67
# Destination registry entry bundling the local Weaviate connection config,
# upload stager and uploader classes together with their config classes.
weaviate_local_destination_entry = DestinationRegistryEntry(
    connection_config=LocalWeaviateConnectionConfig,
    upload_stager_config=LocalWeaviateUploadStagerConfig,
    upload_stager=LocalWeaviateUploadStager,
    uploader_config=LocalWeaviateUploaderConfig,
    uploader=LocalWeaviateUploader,
)
+ )