unstructured-ingest 1.2.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (243) hide show
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/README.md +28 -0
  4. unstructured_ingest/cli/__init__.py +0 -0
  5. unstructured_ingest/cli/base/__init__.py +4 -0
  6. unstructured_ingest/cli/base/cmd.py +269 -0
  7. unstructured_ingest/cli/base/dest.py +84 -0
  8. unstructured_ingest/cli/base/importer.py +34 -0
  9. unstructured_ingest/cli/base/src.py +75 -0
  10. unstructured_ingest/cli/cli.py +24 -0
  11. unstructured_ingest/cli/cmds.py +14 -0
  12. unstructured_ingest/cli/utils/__init__.py +0 -0
  13. unstructured_ingest/cli/utils/click.py +237 -0
  14. unstructured_ingest/cli/utils/model_conversion.py +222 -0
  15. unstructured_ingest/data_types/__init__.py +0 -0
  16. unstructured_ingest/data_types/entities.py +17 -0
  17. unstructured_ingest/data_types/file_data.py +116 -0
  18. unstructured_ingest/embed/__init__.py +0 -0
  19. unstructured_ingest/embed/azure_openai.py +63 -0
  20. unstructured_ingest/embed/bedrock.py +323 -0
  21. unstructured_ingest/embed/huggingface.py +69 -0
  22. unstructured_ingest/embed/interfaces.py +146 -0
  23. unstructured_ingest/embed/mixedbreadai.py +134 -0
  24. unstructured_ingest/embed/octoai.py +133 -0
  25. unstructured_ingest/embed/openai.py +142 -0
  26. unstructured_ingest/embed/togetherai.py +116 -0
  27. unstructured_ingest/embed/vertexai.py +109 -0
  28. unstructured_ingest/embed/voyageai.py +130 -0
  29. unstructured_ingest/error.py +156 -0
  30. unstructured_ingest/errors_v2.py +156 -0
  31. unstructured_ingest/interfaces/__init__.py +27 -0
  32. unstructured_ingest/interfaces/connector.py +56 -0
  33. unstructured_ingest/interfaces/downloader.py +90 -0
  34. unstructured_ingest/interfaces/indexer.py +29 -0
  35. unstructured_ingest/interfaces/process.py +22 -0
  36. unstructured_ingest/interfaces/processor.py +88 -0
  37. unstructured_ingest/interfaces/upload_stager.py +89 -0
  38. unstructured_ingest/interfaces/uploader.py +67 -0
  39. unstructured_ingest/logger.py +39 -0
  40. unstructured_ingest/main.py +11 -0
  41. unstructured_ingest/otel.py +128 -0
  42. unstructured_ingest/pipeline/__init__.py +0 -0
  43. unstructured_ingest/pipeline/interfaces.py +211 -0
  44. unstructured_ingest/pipeline/otel.py +32 -0
  45. unstructured_ingest/pipeline/pipeline.py +408 -0
  46. unstructured_ingest/pipeline/steps/__init__.py +0 -0
  47. unstructured_ingest/pipeline/steps/chunk.py +78 -0
  48. unstructured_ingest/pipeline/steps/download.py +206 -0
  49. unstructured_ingest/pipeline/steps/embed.py +77 -0
  50. unstructured_ingest/pipeline/steps/filter.py +35 -0
  51. unstructured_ingest/pipeline/steps/index.py +86 -0
  52. unstructured_ingest/pipeline/steps/partition.py +77 -0
  53. unstructured_ingest/pipeline/steps/stage.py +65 -0
  54. unstructured_ingest/pipeline/steps/uncompress.py +50 -0
  55. unstructured_ingest/pipeline/steps/upload.py +58 -0
  56. unstructured_ingest/processes/__init__.py +18 -0
  57. unstructured_ingest/processes/chunker.py +131 -0
  58. unstructured_ingest/processes/connector_registry.py +69 -0
  59. unstructured_ingest/processes/connectors/__init__.py +129 -0
  60. unstructured_ingest/processes/connectors/airtable.py +238 -0
  61. unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
  62. unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
  63. unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
  64. unstructured_ingest/processes/connectors/astradb.py +592 -0
  65. unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
  66. unstructured_ingest/processes/connectors/chroma.py +193 -0
  67. unstructured_ingest/processes/connectors/confluence.py +527 -0
  68. unstructured_ingest/processes/connectors/couchbase.py +336 -0
  69. unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
  70. unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
  71. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
  72. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
  73. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
  74. unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
  75. unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
  76. unstructured_ingest/processes/connectors/delta_table.py +310 -0
  77. unstructured_ingest/processes/connectors/discord.py +161 -0
  78. unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
  79. unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
  80. unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
  81. unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
  82. unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
  83. unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
  84. unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
  85. unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
  86. unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
  87. unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
  88. unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
  89. unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
  90. unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
  91. unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
  92. unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
  93. unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
  94. unstructured_ingest/processes/connectors/github.py +226 -0
  95. unstructured_ingest/processes/connectors/gitlab.py +270 -0
  96. unstructured_ingest/processes/connectors/google_drive.py +848 -0
  97. unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
  98. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
  99. unstructured_ingest/processes/connectors/jira.py +522 -0
  100. unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
  101. unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
  102. unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
  103. unstructured_ingest/processes/connectors/kafka/local.py +103 -0
  104. unstructured_ingest/processes/connectors/kdbai.py +156 -0
  105. unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
  106. unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
  107. unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
  108. unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
  109. unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
  110. unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
  111. unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
  112. unstructured_ingest/processes/connectors/local.py +227 -0
  113. unstructured_ingest/processes/connectors/milvus.py +311 -0
  114. unstructured_ingest/processes/connectors/mongodb.py +389 -0
  115. unstructured_ingest/processes/connectors/neo4j.py +534 -0
  116. unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
  117. unstructured_ingest/processes/connectors/notion/client.py +349 -0
  118. unstructured_ingest/processes/connectors/notion/connector.py +350 -0
  119. unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
  120. unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
  121. unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
  122. unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
  123. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  124. unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
  125. unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
  126. unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
  127. unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
  128. unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  129. unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  130. unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  131. unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
  132. unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
  133. unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
  134. unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
  135. unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
  136. unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
  137. unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
  138. unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
  139. unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
  140. unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
  141. unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
  142. unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  143. unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  144. unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  145. unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  146. unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
  147. unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
  148. unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
  149. unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
  150. unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  151. unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
  152. unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
  153. unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
  154. unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  155. unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
  156. unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
  157. unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
  158. unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
  159. unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
  160. unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
  161. unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
  162. unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
  163. unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
  164. unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
  165. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  166. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
  167. unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
  168. unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
  169. unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
  170. unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
  171. unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
  172. unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
  173. unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
  174. unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
  175. unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
  176. unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
  177. unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
  178. unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
  179. unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
  180. unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
  181. unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
  182. unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
  183. unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
  184. unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
  185. unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
  186. unstructured_ingest/processes/connectors/onedrive.py +485 -0
  187. unstructured_ingest/processes/connectors/outlook.py +242 -0
  188. unstructured_ingest/processes/connectors/pinecone.py +400 -0
  189. unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
  190. unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
  191. unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
  192. unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
  193. unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
  194. unstructured_ingest/processes/connectors/redisdb.py +214 -0
  195. unstructured_ingest/processes/connectors/salesforce.py +307 -0
  196. unstructured_ingest/processes/connectors/sharepoint.py +282 -0
  197. unstructured_ingest/processes/connectors/slack.py +249 -0
  198. unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
  199. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
  200. unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
  201. unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
  202. unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
  203. unstructured_ingest/processes/connectors/sql/sql.py +456 -0
  204. unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
  205. unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
  206. unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
  207. unstructured_ingest/processes/connectors/utils.py +60 -0
  208. unstructured_ingest/processes/connectors/vectara.py +348 -0
  209. unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
  210. unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
  211. unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
  212. unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
  213. unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
  214. unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
  215. unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
  216. unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
  217. unstructured_ingest/processes/embedder.py +203 -0
  218. unstructured_ingest/processes/filter.py +60 -0
  219. unstructured_ingest/processes/partitioner.py +233 -0
  220. unstructured_ingest/processes/uncompress.py +61 -0
  221. unstructured_ingest/processes/utils/__init__.py +8 -0
  222. unstructured_ingest/processes/utils/blob_storage.py +32 -0
  223. unstructured_ingest/processes/utils/logging/connector.py +365 -0
  224. unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
  225. unstructured_ingest/unstructured_api.py +140 -0
  226. unstructured_ingest/utils/__init__.py +5 -0
  227. unstructured_ingest/utils/chunking.py +56 -0
  228. unstructured_ingest/utils/compression.py +72 -0
  229. unstructured_ingest/utils/constants.py +2 -0
  230. unstructured_ingest/utils/data_prep.py +216 -0
  231. unstructured_ingest/utils/dep_check.py +78 -0
  232. unstructured_ingest/utils/filesystem.py +27 -0
  233. unstructured_ingest/utils/html.py +174 -0
  234. unstructured_ingest/utils/ndjson.py +52 -0
  235. unstructured_ingest/utils/pydantic_models.py +52 -0
  236. unstructured_ingest/utils/string_and_date_utils.py +74 -0
  237. unstructured_ingest/utils/table.py +80 -0
  238. unstructured_ingest/utils/tls.py +15 -0
  239. unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
  240. unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
  241. unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
  242. unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
  243. unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
@@ -0,0 +1,337 @@
1
+ import json
2
+ import re
3
+ from abc import ABC, abstractmethod
4
+ from contextlib import contextmanager
5
+ from dataclasses import dataclass, field
6
+ from datetime import date, datetime
7
+ from pathlib import Path
8
+ from typing import TYPE_CHECKING, Any, Generator, Optional
9
+
10
+ from dateutil import parser
11
+ from pydantic import Field, Secret
12
+
13
+ from unstructured_ingest.data_types.file_data import FileData
14
+ from unstructured_ingest.error import DestinationConnectionError, ValueError, WriteError
15
+ from unstructured_ingest.interfaces import (
16
+ AccessConfig,
17
+ ConnectionConfig,
18
+ UploaderConfig,
19
+ UploadStager,
20
+ UploadStagerConfig,
21
+ VectorDBUploader,
22
+ )
23
+ from unstructured_ingest.logger import logger
24
+ from unstructured_ingest.utils.constants import RECORD_ID_LABEL
25
+ from unstructured_ingest.utils.dep_check import requires_dependencies
26
+
27
+ if TYPE_CHECKING:
28
+ from weaviate.classes.init import Timeout
29
+ from weaviate.client import WeaviateClient
30
+ from weaviate.collections.batch.client import BatchClient
31
+
32
+ CONNECTOR_TYPE = "weaviate"
33
+
34
+
35
class WeaviateAccessConfig(AccessConfig, ABC):
    """Abstract base access config for Weaviate; declares no fields of its own."""
37
+
38
+
39
class WeaviateConnectionConfig(ConnectionConfig, ABC):
    """Abstract connection settings for Weaviate.

    Carries the three client timeouts plus the secret-wrapped access config.
    Subclasses must provide ``get_client``.
    """

    init_timeout: int = Field(
        default=2,
        ge=0,
        description="Timeout for initialization checks",
    )
    insert_timeout: int = Field(
        default=90,
        ge=0,
        description="Timeout for insert operations",
    )
    query_timeout: int = Field(
        default=30,
        ge=0,
        description="Timeout for query operations",
    )
    access_config: Secret[WeaviateAccessConfig] = Field(
        default=WeaviateAccessConfig(),
        validate_default=True,
    )

    @requires_dependencies(["weaviate"], extras="weaviate")
    def get_timeout(self) -> "Timeout":
        """Build a weaviate ``Timeout`` from the configured init/query/insert values."""
        # Imported lazily so the module loads without the optional weaviate extra.
        from weaviate.classes.init import Timeout

        timeout_kwargs = {
            "init": self.init_timeout,
            "query": self.query_timeout,
            "insert": self.insert_timeout,
        }
        return Timeout(**timeout_kwargs)

    @abstractmethod
    @contextmanager
    def get_client(self) -> Generator["WeaviateClient", None, None]:
        """Context manager yielding a Weaviate client; supplied by subclasses."""
57
+
58
+
59
class WeaviateUploadStagerConfig(UploadStagerConfig):
    """Stager configuration for Weaviate; adds nothing to ``UploadStagerConfig``."""
61
+
62
+
63
@dataclass
class WeaviateUploadStager(UploadStager):
    """Upload stager that reshapes element dicts to conform to the Weaviate schema."""

    upload_stager_config: WeaviateUploadStagerConfig = field(
        default_factory=WeaviateUploadStagerConfig
    )

    @staticmethod
    def parse_date_string(date_string: str) -> date:
        """Parse a date given either as a numeric timestamp or as a date string.

        Args:
            date_string: Value convertible with ``float()`` (treated as a POSIX
                timestamp) or any string ``dateutil.parser.parse`` understands.

        Returns:
            The parsed ``datetime`` (a ``date`` subclass).
        """
        try:
            timestamp = float(date_string)
            # NOTE(review): fromtimestamp() without tz yields naive *local* time, while
            # conform_dict formats the result with a literal "Z" suffix - confirm that
            # UTC is what is intended here.
            return datetime.fromtimestamp(timestamp)
        # Deliberately broad: this module imports a project-level ``ValueError`` that
        # shadows the builtin, so ``except ValueError`` would not catch float() failures.
        except Exception as e:
            logger.debug(f"date {date_string} string not a timestamp: {e}")
            return parser.parse(date_string)

    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
        """Return a copy of ``element_dict`` conformed to the Weaviate schema.

        The input is deep-copied first so the nested ``metadata`` dictionaries of the
        caller's element are never modified. On the copy:

        * structured metadata values are serialized to JSON strings,
        * date fields are normalized to ``%Y-%m-%dT%H:%M:%S.%fZ``,
        * ``version`` and ``page_number`` are cast to ``str``,
        * the record id label is set to ``file_data.identifier``.

        Args:
            element_dict: Element dictionary to conform; left untouched.
            file_data: File data whose ``identifier`` is stored under the record id label.

        Returns:
            The conformed copy.
        """
        import copy

        date_format = "%Y-%m-%dT%H:%M:%S.%fZ"

        # A shallow copy would share the nested dicts mutated below with the caller.
        working_data = copy.deepcopy(element_dict)

        # ``or {}`` also tolerates sections that are present but explicitly None.
        metadata = working_data.get("metadata") or {}
        data_source = metadata.get("data_source") or {}
        coordinates = metadata.get("coordinates") or {}

        # Dicts / arrays are stored as JSON strings, otherwise schema type checking fails.
        json_string_fields = (
            (data_source, "record_locator"),
            (coordinates, "points"),
            (metadata, "links"),
            (data_source, "permissions_data"),
            (metadata, "regex_metadata"),
        )
        for container, key in json_string_fields:
            if value := container.get(key):
                container[key] = json.dumps(value)

        # Datetime formatting
        date_fields = (
            (data_source, "date_created"),
            (data_source, "date_modified"),
            (data_source, "date_processed"),
            (metadata, "last_modified"),
        )
        for container, key in date_fields:
            if value := container.get(key):
                container[key] = self.parse_date_string(value).strftime(date_format)

        # String casting
        for container, key in ((data_source, "version"), (metadata, "page_number")):
            if value := container.get(key):
                container[key] = str(value)

        working_data[RECORD_ID_LABEL] = file_data.identifier
        return working_data
162
+
163
+
164
class WeaviateUploaderConfig(UploaderConfig):
    """Uploader settings for Weaviate: target collection, batching mode and record id key.

    Exactly one batching mode (fixed size, rate limited, dynamic) must be enabled;
    this is enforced in ``model_post_init``.
    """

    collection: Optional[str] = Field(
        description="The name of the collection this object belongs to", default=None
    )
    batch_size: Optional[int] = Field(default=None, description="Number of records per batch")
    requests_per_minute: Optional[int] = Field(default=None, description="Rate limit for upload")
    dynamic_batch: bool = Field(default=True, description="Whether to use dynamic batch")
    record_id_key: str = Field(
        default=RECORD_ID_LABEL,
        description="searchable key to find entries for the same record on previous runs",
    )

    def model_post_init(self, __context: Any) -> None:
        """Validate that one, and only one, batch mode is switched on."""
        mode_flags = (
            ("fixed_size", self.batch_size is not None),
            ("rate_limited", self.requests_per_minute is not None),
            ("dynamic", self.dynamic_batch),
        )
        active_modes = []
        for mode_name, is_enabled in mode_flags:
            if is_enabled:
                active_modes.append(mode_name)

        if len(active_modes) == 0:
            raise ValueError("No batch mode enabled")
        if len(active_modes) > 1:
            joined_modes = ", ".join(active_modes)
            raise ValueError(
                "Multiple batch modes enabled, only one mode can be used: {}".format(joined_modes)
            )
        logger.info(f"Uploader config instantiated with {active_modes[0]} batch mode")

    @contextmanager
    def get_batch_client(self, client: "WeaviateClient") -> Generator["BatchClient", None, None]:
        """Yield a batch client from ``client.batch`` matching the configured mode.

        Modes are checked in the order dynamic, fixed size, rate limited.
        """
        # Pick the batching context first, then enter it in a single place.
        if self.dynamic_batch:
            batch_context = client.batch.dynamic()
        elif self.batch_size:
            batch_context = client.batch.fixed_size(batch_size=self.batch_size)
        elif self.requests_per_minute:
            batch_context = client.batch.rate_limit(requests_per_minute=self.requests_per_minute)
        else:
            raise ValueError("No batch mode enabled")

        with batch_context as batch_client:
            yield batch_client
209
+
210
+
211
@dataclass
class WeaviateUploader(VectorDBUploader, ABC):
    """Abstract uploader that writes element dicts into a Weaviate collection.

    Note: ``ValueError`` raised in this class is the name imported from
    ``unstructured_ingest.error`` at the top of the module, not the builtin.
    """

    upload_config: WeaviateUploaderConfig
    connection_config: WeaviateConnectionConfig

    def _collection_exists(self, collection_name: Optional[str] = None):
        """Return whether ``collection_name`` (default: the configured collection) exists."""
        collection_name = collection_name or self.upload_config.collection
        with self.connection_config.get_client() as weaviate_client:
            return weaviate_client.collections.exists(name=collection_name)

    def precheck(self) -> None:
        """Verify that a client can be opened and that a configured collection exists.

        Raises:
            DestinationConnectionError: On any failure; the original exception is
                chained as ``__cause__``.
        """
        try:
            with self.connection_config.get_client():
                # Connection test successful - client is available but not needed
                pass

            # only if collection name populated should we check that it exists
            if self.upload_config.collection and not self._collection_exists():
                raise DestinationConnectionError(
                    f"collection '{self.upload_config.collection}' does not exist"
                )
        except Exception as e:
            logger.error(f"Failed to validate connection {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}") from e

    def init(self, **kwargs: Any) -> None:
        """Ensure the destination collection exists; kwargs go to ``create_destination``."""
        self.create_destination(**kwargs)

    def format_destination_name(self, destination_name: str) -> str:
        """Normalize a collection name.

        The name must begin with an ASCII letter. That letter is upper-cased and every
        character other than ASCII letters and digits is replaced with an underscore.
        A warning is logged when the result differs from the input.

        Args:
            destination_name: Requested collection name.

        Returns:
            The normalized collection name.

        Raises:
            ValueError: If the name does not start with an ASCII letter (this
                includes the empty string).
        """
        # Any ASCII letter is accepted here; a lowercase one is capitalized below.
        if not re.match(r"^[a-zA-Z]", destination_name):
            raise ValueError("Collection name must start with an uppercase letter")
        # Replace all non-alphanumeric characters with underscores
        formatted = re.sub(r"[^a-zA-Z0-9]", "_", destination_name)
        # Upper-case the first character and leave the rest as is; the slice is simply
        # empty for a one-character name, so no special case is needed.
        formatted = formatted[0].capitalize() + formatted[1:]
        if formatted != destination_name:
            logger.warning(
                f"Given Collection name '{destination_name}' doesn't follow naming conventions. "
                f"Renaming to '{formatted}'"
            )
        return formatted

    def create_destination(
        self,
        destination_name: str = "Unstructuredautocreated",
        vector_length: Optional[int] = None,
        **kwargs: Any,
    ) -> bool:
        """Create the target collection from the bundled default config if it is missing.

        The configured collection name takes precedence over ``destination_name``. The
        normalized name is written back to ``upload_config.collection``.

        Args:
            destination_name: Fallback name when no collection is configured.
            vector_length: Accepted for interface compatibility; not used here.
            **kwargs: Ignored.

        Returns:
            ``True`` if the collection was created, ``False`` if it already existed.
        """
        collection_name = self.upload_config.collection or destination_name
        collection_name = self.format_destination_name(collection_name)
        self.upload_config.collection = collection_name

        if self._collection_exists():
            logger.debug(
                f"Collection with name '{collection_name}' already exists, skipping creation"
            )
            return False

        # The default collection config lives in the assets directory two levels up.
        connectors_dir = Path(__file__).parents[1]
        collection_config_file = connectors_dir / "assets" / "weaviate_collection_config.json"
        with collection_config_file.open() as f:
            collection_config = json.load(f)
        collection_config["class"] = collection_name

        logger.info(f"Creating weaviate collection '{collection_name}' with default configs")
        with self.connection_config.get_client() as weaviate_client:
            weaviate_client.collections.create_from_dict(config=collection_config)
        return True

    def check_for_errors(self, client: "WeaviateClient") -> None:
        """Log every failed batch object and raise if there was at least one.

        Raises:
            WriteError: If ``client.batch.failed_objects`` is non-empty.
        """
        failed_uploads = client.batch.failed_objects
        if not failed_uploads:
            return
        for failure in failed_uploads:
            logger.error(
                f"Failed to upload object with id {failure.original_uuid}: {failure.message}"
            )
        raise WriteError("Failed to upload to weaviate")

    @requires_dependencies(["weaviate"], extras="weaviate")
    def delete_by_record_id(self, client: "WeaviateClient", file_data: FileData) -> None:
        """Delete all objects whose record id property equals ``file_data.identifier``.

        Raises:
            WriteError: If a delete call reports failures.
        """
        from weaviate.classes.query import Filter

        record_id = file_data.identifier
        collection = client.collections.get(self.upload_config.collection)
        delete_filter = Filter.by_property(name=self.upload_config.record_id_key).equal(
            val=record_id
        )
        # There is a configurable maximum limit (QUERY_MAXIMUM_RESULTS) on the number of
        # objects that can be deleted in a single query (default 10,000). To delete
        # more objects than the limit, re-run the query until nothing is deleted.
        while True:
            resp = collection.data.delete_many(where=delete_filter)
            if resp.failed:
                raise WriteError(
                    f"failed to delete records in collection "
                    f"{self.upload_config.collection} with record "
                    f"id property {record_id}"
                )
            # Failures raised above, so "nothing succeeded" means nothing is left.
            if not resp.successful:
                break

    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
        """Replace the record's previous objects with ``data`` in the collection.

        Existing objects for ``file_data.identifier`` are deleted first, then every
        element is added through the configured batch client. The ``embeddings`` entry
        of an element, if any, is sent as the object's vector rather than as a property.
        The dictionaries in ``data`` are not modified.

        Args:
            data: Element dictionaries to upload.
            file_data: File data identifying the record being written.
            **kwargs: Ignored.

        Raises:
            ValueError: If no collection is configured.
            WriteError: If deleting old objects or uploading new ones fails.
        """
        collection = self.upload_config.collection
        if not collection:
            raise ValueError("No collection specified")

        # Log the collection name, not the (secret) access config.
        logger.info(
            f"writing {len(data)} objects to destination "
            f"class {collection} "
        )

        with self.connection_config.get_client() as weaviate_client:
            self.delete_by_record_id(client=weaviate_client, file_data=file_data)
            with self.upload_config.get_batch_client(client=weaviate_client) as batch_client:
                for element in data:
                    # Build the properties without popping, so the caller's dicts keep
                    # their embeddings (e.g. if the upload is attempted again).
                    properties = {
                        key: value for key, value in element.items() if key != "embeddings"
                    }
                    batch_client.add_object(
                        collection=collection,
                        properties=properties,
                        vector=element.get("embeddings"),
                    )
            # Checked once the batch context has exited.
            self.check_for_errors(client=weaviate_client)
@@ -0,0 +1,314 @@
1
+ from dataclasses import dataclass, field
2
+ from datetime import datetime
3
+ from typing import TYPE_CHECKING, Any, AsyncGenerator, Literal, Optional, Union
4
+
5
+ from pydantic import BaseModel, Field, HttpUrl
6
+
7
+ from unstructured_ingest.error import (
8
+ ProviderError,
9
+ RateLimitError,
10
+ UnstructuredIngestError,
11
+ UserAuthError,
12
+ UserError,
13
+ )
14
+ from unstructured_ingest.logger import logger
15
+ from unstructured_ingest.utils.dep_check import requires_dependencies
16
+ from unstructured_ingest.utils.string_and_date_utils import fix_unescaped_unicode
17
+
18
+ if TYPE_CHECKING:
19
+ from httpx import AsyncClient, Client
20
+
21
+
22
class Attachment(BaseModel):
    # Partial model of a Zendesk ticket attachment; only `content_type` is captured.
    # https://developer.zendesk.com/api-reference/ticketing/tickets/ticket-attachments/#json-format
    content_type: Optional[str] = Field(default=None)
25
+
26
+
27
class Via(BaseModel):
    # Model of the Zendesk "via" object attached to tickets and comments.
    # https://developer.zendesk.com/documentation/ticketing/reference-guides/via-object-reference/
    channel: Union[int, str] = Field(...)  # required; no default
    source: dict = Field(default_factory=dict)
31
+
32
+
33
class ZendeskComment(BaseModel):
    # Model of a Zendesk ticket comment.
    # https://developer.zendesk.com/api-reference/ticketing/tickets/ticket_comments/#json-format
    attachments: list[Attachment] = Field(default_factory=list)
    audit_id: Optional[int] = None
    author_id: Optional[int] = None
    body: Optional[str] = None
    created_at: Optional[datetime] = None
    html_body: Optional[str] = None
    id: Optional[int] = None
    metadata: Optional[dict] = None
    plain_body: Optional[str] = None
    public: Optional[bool] = None
    # Populated from the "type" key of the payload.
    comment_type: Literal["Comment", "VoiceComment"] = Field(alias="type")
    uploads: list[str] = Field(default_factory=list)
    via: Optional[Via] = None

    def as_text(self) -> str:
        """Render the comment as newline-terminated ``key: value`` lines.

        The first line is the literal ``comment``; it is followed by the
        non-``None`` values of ``id``, ``author_id``, ``body`` and
        ``created_at`` in the order the fields are dumped by the model.
        """
        wanted = ("id", "author_id", "body", "created_at")
        lines = ["comment"]
        for key, value in self.model_dump().items():
            if value is None or key not in wanted:
                continue
            lines.append(f"{key}: {value}")
        return "".join(f"{line}\n" for line in lines)
59
+
60
+
61
class ZendeskTicket(BaseModel):
    # Model of a Zendesk ticket.
    # https://developer.zendesk.com/api-reference/ticketing/tickets/tickets/#json-format
    allow_attachments: bool = True
    allow_channelback: bool = True
    assignee_email: Optional[str] = None
    assignee_id: Optional[int] = None
    attribute_value_ids: list[int] = Field(default_factory=list)
    brand_id: Optional[int] = None
    collaborator_ids: list[int] = Field(default_factory=list)
    collaborators: list[Union[int, str, dict[str, str]]] = Field(default_factory=list)
    comment: Optional[ZendeskComment] = None
    created_at: Optional[datetime] = None
    custom_fields: list[dict[str, Any]] = Field(default_factory=list)
    custom_status_id: Optional[int] = None
    description: Optional[str] = None
    due_at: Optional[datetime] = None
    email_cc_ids: list[int] = Field(default_factory=list)
    email_ccs: list[dict[str, str]] = Field(default_factory=list)
    external_id: Optional[str] = None
    follower_ids: list[int] = Field(default_factory=list)
    followers: list[dict[str, str]] = Field(default_factory=list)
    followup_ids: list[int] = Field(default_factory=list)
    forum_topic_id: Optional[int] = None
    from_messaging_channel: bool  # required; no default
    generated_timestamp: Optional[datetime] = None
    group_id: Optional[int] = None
    has_incidents: bool = False
    id: Optional[int] = None
    is_public: bool = False
    macro_id: Optional[int] = None
    macro_ids: list[int] = Field(default_factory=list)
    metadata: dict[str, Any] = Field(default_factory=dict)
    organization_id: Optional[int] = None
    priority: Optional[Literal["urgent", "high", "normal", "low"]] = None
    problem_id: Optional[int] = None
    raw_subject: Optional[str] = None
    recipient: Optional[str] = None
    requester: dict[str, str] = Field(default_factory=dict)
    requester_id: int  # required; no default
    safe_update: Optional[bool] = None
    satisfaction_rating: Optional[Union[str, dict[str, Any]]] = None
    sharing_agreement_ids: list[int] = Field(default_factory=list)
    status: Optional[Literal["new", "open", "pending", "hold", "solved", "closed"]] = None
    subject: Optional[str] = None
    submitter_id: Optional[int] = None
    tags: list[str] = Field(default_factory=list)
    ticket_form_id: Optional[int] = None
    # Populated from the "type" key of the payload.
    ticket_type: Optional[Literal["problem", "incident", "question", "task"]] = Field(
        default=None, alias="type"
    )
    updated_at: Optional[datetime] = None
    updated_stamp: Optional[str] = None
    url: Optional[HttpUrl] = None
    via: Optional[Via] = None
    via_followup_source_id: Optional[int] = None
    via_id: Optional[int] = None
    voice_comment: Optional[dict] = None

    def as_text(self) -> str:
        """Render the ticket as newline-terminated ``key: value`` lines.

        The first line is the literal ``ticket``; it is followed by the
        non-``None`` values of ``id``, ``subject``, ``description`` and
        ``created_at`` in the order the fields are dumped by the model.
        """
        wanted = ("id", "subject", "description", "created_at")
        lines = ["ticket"]
        for key, value in self.model_dump().items():
            if value is None or key not in wanted:
                continue
            lines.append(f"{key}: {value}")
        return "".join(f"{line}\n" for line in lines)
129
+
130
+
131
class ZendeskArticle(BaseModel):
    # Model of a Zendesk Help Center article.
    # https://developer.zendesk.com/api-reference/help_center/help-center-api/articles/#json-format
    author_id: Optional[int] = None
    body: Optional[str] = None
    comments_disabled: bool = False
    content_tag_ids: list[str] = Field(default_factory=list)
    created_at: Optional[datetime] = None
    draft: bool = False
    edited_at: Optional[datetime] = None
    html_url: Optional[HttpUrl] = None
    id: int  # required; no default
    label_names: list[str] = Field(default_factory=list)
    locale: str  # required; no default
    outdated: bool = False
    outdated_locales: list[str] = Field(default_factory=list)
    permission_group_id: int  # required; no default
    position: Optional[int] = None
    promoted: bool = False
    section_id: Optional[int] = None
    source_locale: Optional[str] = None
    title: str  # required; no default
    updated_at: Optional[datetime] = None
    url: Optional[HttpUrl] = None
    user_segment_id: Optional[int] = None
    user_segment_ids: list[int] = Field(default_factory=list)
    vote_count: Optional[int] = None
    vote_sum: Optional[int] = None

    def as_html(self) -> str:
        """Return the article as an HTML ``<body>`` fragment.

        A non-empty title is prepended as an ``<h1>`` heading, and the result
        is passed through ``fix_unescaped_unicode`` before being returned.
        """
        # `body` is optional; fall back to an empty string so a missing body
        # does not render as the literal text "None" in the document.
        html = self.body or ""
        if title := self.title:
            html = f"<h1>{title}</h1>{html}"
        return fix_unescaped_unicode(f"<body class='Document' >{html}</body>")
164
+
165
+
166
class ZendeskArticleAttachment(BaseModel):
    # Model of a Zendesk Help Center article attachment; every field is optional.
    # https://developer.zendesk.com/api-reference/help_center/help-center-api/article_attachments/#json-format
    article_id: Optional[int] = Field(default=None)
    content_type: Optional[str] = Field(default=None)
    content_url: Optional[HttpUrl] = Field(default=None)
    created_at: Optional[datetime] = Field(default=None)
    guide_media_id: Optional[str] = Field(default=None)
    id: Optional[int] = Field(default=None)
    inline: bool = Field(default=False)
    locale: Optional[str] = Field(default=None)
    size: Optional[int] = Field(default=None)
    updated_at: Optional[datetime] = Field(default=None)
    url: Optional[HttpUrl] = Field(default=None)
179
+
180
+
181
@dataclass
class ZendeskClient:
    """HTTP client for the Zendesk v2 API of a single subdomain.

    Requests authenticate with the ``("<email>/token", <token>)`` credential
    pair. Constructing the client performs a ``HEAD`` request against
    ``groups.json`` and raises if it does not succeed. Use the instance as an
    async context manager so the underlying httpx clients are closed.
    """

    # Excluded from the generated repr so the API token is not printed in logs.
    token: str = field(repr=False)
    subdomain: str
    email: str
    # Page size requested from paginated endpoints (sent as `page[size]`).
    max_page_size: int = 100
    _async_client: "AsyncClient" = field(init=False, default=None)
    _client: "Client" = field(init=False, default=None)
    _base_url: str = field(init=False, default=None)

    async def __aenter__(self) -> "ZendeskClient":
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        # Close both clients; the sync one is only used for the startup check
        # but still owns a connection pool.
        try:
            await self._async_client.aclose()
        finally:
            self._client.close()

    @requires_dependencies(["httpx"], extras="zendesk")
    def __post_init__(self):
        """Create the httpx clients and verify that the credentials work.

        Raises:
            The exception produced by ``wrap_error`` if the check request fails.
        """
        import httpx

        auth = f"{self.email}/token", self.token
        self._base_url = f"https://{self.subdomain}.zendesk.com/api/v2"
        self._client = httpx.Client(auth=auth)

        # Run check
        try:
            url_to_check = f"{self._base_url}/groups.json"
            resp = self._client.head(url_to_check)
            resp.raise_for_status()
        except Exception as e:
            # Construction is about to fail, so nobody will get a chance to
            # close the client later.
            self._client.close()
            raise self.wrap_error(e=e)

        self._async_client = httpx.AsyncClient(auth=auth)

    @requires_dependencies(["httpx"], extras="zendesk")
    def wrap_error(self, e: Exception) -> Exception:
        """Translate an exception into the matching ingest error type.

        Args:
            e: The exception raised while talking to Zendesk.

        Returns:
            ``e`` itself if it is already an ingest error; otherwise
            ``UserAuthError`` for 401, ``RateLimitError`` for 429,
            ``UserError`` for other 4xx, ``ProviderError`` for 5xx,
            ``UnstructuredIngestError`` for anything that is not an
            ``httpx.HTTPStatusError``, and ``e`` unchanged for any other
            HTTP status error.
        """
        import httpx

        # The get_* generators wrap errors that fetch_content has already
        # wrapped; pass those through so the specific type is not downgraded
        # to a generic UnstructuredIngestError and logged a second time.
        already_wrapped = (
            UnstructuredIngestError,
            UserError,
            UserAuthError,
            RateLimitError,
            ProviderError,
        )
        if isinstance(e, already_wrapped):
            return e

        if not isinstance(e, httpx.HTTPStatusError):
            logger.error(f"unhandled exception from Zendesk client: {e}", exc_info=True)
            return UnstructuredIngestError(str(e))
        url = e.request.url
        response_code = e.response.status_code
        if response_code == 401:
            logger.error(
                f"Failed to connect via auth, "
                f"{url} using zendesk response, status code {response_code}"
            )
            return UserAuthError(e)
        if response_code == 429:
            logger.error(
                f"Failed to connect via rate limits "
                f"{url} using zendesk response, status code {response_code}"
            )
            return RateLimitError(e)
        if 400 <= response_code < 500:
            logger.error(
                f"Failed to connect to {url} using zendesk response, status code {response_code}"
            )
            return UserError(e)
        # Inclusive bound: a plain 500 is a provider error too.
        if response_code >= 500:
            logger.error(
                f"Failed to connect to {url} using zendesk response, status code {response_code}"
            )
            return ProviderError(e)
        logger.error(f"unhandled http status error from Zendesk client: {e}", exc_info=True)
        return e

    async def fetch_content(self, url: str, content_key: str) -> AsyncGenerator[dict, None]:
        """Yield every item under ``content_key`` from a paginated endpoint.

        The first request asks for ``max_page_size`` items per page. Further
        pages are fetched from ``links.next`` for as long as the response
        reports ``meta.has_more``.

        Args:
            url: Endpoint URL without pagination parameters.
            content_key: Key of the response JSON that holds the item list.

        Raises:
            The exception produced by ``wrap_error`` if a request fails.
        """
        url = f"{url}?page[size]={self.max_page_size}"
        while True:
            try:
                response = await self._async_client.get(url)
                response.raise_for_status()
            except Exception as e:
                raise self.wrap_error(e=e)

            data = response.json()
            for content in data[content_key]:
                yield content

            has_more = data.get("meta", {}).get("has_more", False)
            if not has_more:
                break

            url = data["links"]["next"]

    async def get_articles(self) -> AsyncGenerator[ZendeskArticle, None]:
        """Yield every Help Center article as a validated ``ZendeskArticle``."""
        article_url = f"{self._base_url}/help_center/articles.json"

        try:
            async for article_dict in self.fetch_content(url=article_url, content_key="articles"):
                yield ZendeskArticle.model_validate(article_dict)
        except Exception as e:
            raise self.wrap_error(e=e)

    async def get_comments(self, ticket_id: int) -> AsyncGenerator[ZendeskComment, None]:
        """Yield every comment of ticket ``ticket_id`` as a validated ``ZendeskComment``."""
        comments_url = f"{self._base_url}/tickets/{ticket_id}/comments"

        try:
            async for comment_dict in self.fetch_content(url=comments_url, content_key="comments"):
                yield ZendeskComment.model_validate(comment_dict)
        except Exception as e:
            raise self.wrap_error(e=e)

    async def get_tickets(self) -> AsyncGenerator[ZendeskTicket, None]:
        """Yield every ticket as a validated ``ZendeskTicket``."""
        tickets_url = f"{self._base_url}/tickets"

        try:
            async for ticket_dict in self.fetch_content(url=tickets_url, content_key="tickets"):
                yield ZendeskTicket.model_validate(ticket_dict)
        except Exception as e:
            raise self.wrap_error(e=e)

    async def get_article_attachments(
        self, article_id: int
    ) -> AsyncGenerator[ZendeskArticleAttachment, None]:
        """Yield the attachment records of article ``article_id``.

        Only the attachment metadata returned by the listing endpoint is
        produced; the attachment content itself is not downloaded here.
        """
        article_attachment_url = (
            f"{self._base_url}/help_center/articles/{article_id}/attachments"
        )

        try:
            async for attachment_dict in self.fetch_content(
                url=article_attachment_url, content_key="article_attachments"
            ):
                yield ZendeskArticleAttachment.model_validate(attachment_dict)
        except Exception as e:
            raise self.wrap_error(e=e)