unstructured-ingest 1.2.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (243) hide show
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/README.md +28 -0
  4. unstructured_ingest/cli/__init__.py +0 -0
  5. unstructured_ingest/cli/base/__init__.py +4 -0
  6. unstructured_ingest/cli/base/cmd.py +269 -0
  7. unstructured_ingest/cli/base/dest.py +84 -0
  8. unstructured_ingest/cli/base/importer.py +34 -0
  9. unstructured_ingest/cli/base/src.py +75 -0
  10. unstructured_ingest/cli/cli.py +24 -0
  11. unstructured_ingest/cli/cmds.py +14 -0
  12. unstructured_ingest/cli/utils/__init__.py +0 -0
  13. unstructured_ingest/cli/utils/click.py +237 -0
  14. unstructured_ingest/cli/utils/model_conversion.py +222 -0
  15. unstructured_ingest/data_types/__init__.py +0 -0
  16. unstructured_ingest/data_types/entities.py +17 -0
  17. unstructured_ingest/data_types/file_data.py +116 -0
  18. unstructured_ingest/embed/__init__.py +0 -0
  19. unstructured_ingest/embed/azure_openai.py +63 -0
  20. unstructured_ingest/embed/bedrock.py +323 -0
  21. unstructured_ingest/embed/huggingface.py +69 -0
  22. unstructured_ingest/embed/interfaces.py +146 -0
  23. unstructured_ingest/embed/mixedbreadai.py +134 -0
  24. unstructured_ingest/embed/octoai.py +133 -0
  25. unstructured_ingest/embed/openai.py +142 -0
  26. unstructured_ingest/embed/togetherai.py +116 -0
  27. unstructured_ingest/embed/vertexai.py +109 -0
  28. unstructured_ingest/embed/voyageai.py +130 -0
  29. unstructured_ingest/error.py +156 -0
  30. unstructured_ingest/errors_v2.py +156 -0
  31. unstructured_ingest/interfaces/__init__.py +27 -0
  32. unstructured_ingest/interfaces/connector.py +56 -0
  33. unstructured_ingest/interfaces/downloader.py +90 -0
  34. unstructured_ingest/interfaces/indexer.py +29 -0
  35. unstructured_ingest/interfaces/process.py +22 -0
  36. unstructured_ingest/interfaces/processor.py +88 -0
  37. unstructured_ingest/interfaces/upload_stager.py +89 -0
  38. unstructured_ingest/interfaces/uploader.py +67 -0
  39. unstructured_ingest/logger.py +39 -0
  40. unstructured_ingest/main.py +11 -0
  41. unstructured_ingest/otel.py +128 -0
  42. unstructured_ingest/pipeline/__init__.py +0 -0
  43. unstructured_ingest/pipeline/interfaces.py +211 -0
  44. unstructured_ingest/pipeline/otel.py +32 -0
  45. unstructured_ingest/pipeline/pipeline.py +408 -0
  46. unstructured_ingest/pipeline/steps/__init__.py +0 -0
  47. unstructured_ingest/pipeline/steps/chunk.py +78 -0
  48. unstructured_ingest/pipeline/steps/download.py +206 -0
  49. unstructured_ingest/pipeline/steps/embed.py +77 -0
  50. unstructured_ingest/pipeline/steps/filter.py +35 -0
  51. unstructured_ingest/pipeline/steps/index.py +86 -0
  52. unstructured_ingest/pipeline/steps/partition.py +77 -0
  53. unstructured_ingest/pipeline/steps/stage.py +65 -0
  54. unstructured_ingest/pipeline/steps/uncompress.py +50 -0
  55. unstructured_ingest/pipeline/steps/upload.py +58 -0
  56. unstructured_ingest/processes/__init__.py +18 -0
  57. unstructured_ingest/processes/chunker.py +131 -0
  58. unstructured_ingest/processes/connector_registry.py +69 -0
  59. unstructured_ingest/processes/connectors/__init__.py +129 -0
  60. unstructured_ingest/processes/connectors/airtable.py +238 -0
  61. unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
  62. unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
  63. unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
  64. unstructured_ingest/processes/connectors/astradb.py +592 -0
  65. unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
  66. unstructured_ingest/processes/connectors/chroma.py +193 -0
  67. unstructured_ingest/processes/connectors/confluence.py +527 -0
  68. unstructured_ingest/processes/connectors/couchbase.py +336 -0
  69. unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
  70. unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
  71. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
  72. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
  73. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
  74. unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
  75. unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
  76. unstructured_ingest/processes/connectors/delta_table.py +310 -0
  77. unstructured_ingest/processes/connectors/discord.py +161 -0
  78. unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
  79. unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
  80. unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
  81. unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
  82. unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
  83. unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
  84. unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
  85. unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
  86. unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
  87. unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
  88. unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
  89. unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
  90. unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
  91. unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
  92. unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
  93. unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
  94. unstructured_ingest/processes/connectors/github.py +226 -0
  95. unstructured_ingest/processes/connectors/gitlab.py +270 -0
  96. unstructured_ingest/processes/connectors/google_drive.py +848 -0
  97. unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
  98. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
  99. unstructured_ingest/processes/connectors/jira.py +522 -0
  100. unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
  101. unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
  102. unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
  103. unstructured_ingest/processes/connectors/kafka/local.py +103 -0
  104. unstructured_ingest/processes/connectors/kdbai.py +156 -0
  105. unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
  106. unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
  107. unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
  108. unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
  109. unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
  110. unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
  111. unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
  112. unstructured_ingest/processes/connectors/local.py +227 -0
  113. unstructured_ingest/processes/connectors/milvus.py +311 -0
  114. unstructured_ingest/processes/connectors/mongodb.py +389 -0
  115. unstructured_ingest/processes/connectors/neo4j.py +534 -0
  116. unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
  117. unstructured_ingest/processes/connectors/notion/client.py +349 -0
  118. unstructured_ingest/processes/connectors/notion/connector.py +350 -0
  119. unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
  120. unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
  121. unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
  122. unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
  123. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  124. unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
  125. unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
  126. unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
  127. unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
  128. unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  129. unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  130. unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  131. unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
  132. unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
  133. unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
  134. unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
  135. unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
  136. unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
  137. unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
  138. unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
  139. unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
  140. unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
  141. unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
  142. unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  143. unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  144. unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  145. unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  146. unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
  147. unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
  148. unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
  149. unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
  150. unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  151. unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
  152. unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
  153. unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
  154. unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  155. unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
  156. unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
  157. unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
  158. unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
  159. unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
  160. unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
  161. unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
  162. unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
  163. unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
  164. unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
  165. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  166. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
  167. unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
  168. unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
  169. unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
  170. unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
  171. unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
  172. unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
  173. unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
  174. unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
  175. unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
  176. unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
  177. unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
  178. unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
  179. unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
  180. unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
  181. unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
  182. unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
  183. unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
  184. unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
  185. unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
  186. unstructured_ingest/processes/connectors/onedrive.py +485 -0
  187. unstructured_ingest/processes/connectors/outlook.py +242 -0
  188. unstructured_ingest/processes/connectors/pinecone.py +400 -0
  189. unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
  190. unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
  191. unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
  192. unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
  193. unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
  194. unstructured_ingest/processes/connectors/redisdb.py +214 -0
  195. unstructured_ingest/processes/connectors/salesforce.py +307 -0
  196. unstructured_ingest/processes/connectors/sharepoint.py +282 -0
  197. unstructured_ingest/processes/connectors/slack.py +249 -0
  198. unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
  199. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
  200. unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
  201. unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
  202. unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
  203. unstructured_ingest/processes/connectors/sql/sql.py +456 -0
  204. unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
  205. unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
  206. unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
  207. unstructured_ingest/processes/connectors/utils.py +60 -0
  208. unstructured_ingest/processes/connectors/vectara.py +348 -0
  209. unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
  210. unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
  211. unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
  212. unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
  213. unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
  214. unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
  215. unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
  216. unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
  217. unstructured_ingest/processes/embedder.py +203 -0
  218. unstructured_ingest/processes/filter.py +60 -0
  219. unstructured_ingest/processes/partitioner.py +233 -0
  220. unstructured_ingest/processes/uncompress.py +61 -0
  221. unstructured_ingest/processes/utils/__init__.py +8 -0
  222. unstructured_ingest/processes/utils/blob_storage.py +32 -0
  223. unstructured_ingest/processes/utils/logging/connector.py +365 -0
  224. unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
  225. unstructured_ingest/unstructured_api.py +140 -0
  226. unstructured_ingest/utils/__init__.py +5 -0
  227. unstructured_ingest/utils/chunking.py +56 -0
  228. unstructured_ingest/utils/compression.py +72 -0
  229. unstructured_ingest/utils/constants.py +2 -0
  230. unstructured_ingest/utils/data_prep.py +216 -0
  231. unstructured_ingest/utils/dep_check.py +78 -0
  232. unstructured_ingest/utils/filesystem.py +27 -0
  233. unstructured_ingest/utils/html.py +174 -0
  234. unstructured_ingest/utils/ndjson.py +52 -0
  235. unstructured_ingest/utils/pydantic_models.py +52 -0
  236. unstructured_ingest/utils/string_and_date_utils.py +74 -0
  237. unstructured_ingest/utils/table.py +80 -0
  238. unstructured_ingest/utils/tls.py +15 -0
  239. unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
  240. unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
  241. unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
  242. unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
  243. unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
@@ -0,0 +1,241 @@
1
+ from __future__ import annotations
2
+
3
+ import hashlib
4
+ from dataclasses import dataclass
5
+ from pathlib import Path
6
+ from time import time
7
+ from typing import Any, AsyncGenerator, Literal, Union
8
+
9
+ from pydantic import BaseModel, Field, Secret
10
+
11
+ from unstructured_ingest.data_types.file_data import (
12
+ FileData,
13
+ FileDataSourceMetadata,
14
+ SourceIdentifiers,
15
+ )
16
+ from unstructured_ingest.error import SourceConnectionError, ValueError
17
+ from unstructured_ingest.interfaces import (
18
+ AccessConfig,
19
+ ConnectionConfig,
20
+ Downloader,
21
+ DownloaderConfig,
22
+ DownloadResponse,
23
+ Indexer,
24
+ IndexerConfig,
25
+ )
26
+ from unstructured_ingest.logger import logger
27
+ from unstructured_ingest.processes.connector_registry import SourceRegistryEntry
28
+ from unstructured_ingest.utils.dep_check import requires_dependencies
29
+ from unstructured_ingest.utils.html import HtmlMixin
30
+
31
+ from .client import ZendeskArticle, ZendeskClient, ZendeskTicket
32
+
33
+ CONNECTOR_TYPE = "zendesk"
34
+
35
+
36
class ZendeskAdditionalMetadata(BaseModel):
    # Connector-specific payload attached to each indexed Zendesk record.
    # `item_type` tags which of the two record kinds `content` holds.
    item_type: Literal["ticket", "article"]
    # The record itself, as modelled by the sibling `client` module.
    content: Union[ZendeskTicket, ZendeskArticle]
39
+
40
+
41
class ZendeskFileData(FileData):
    # FileData specialisation whose `additional_metadata` is the typed Zendesk payload
    # (item type plus the ticket/article content) instead of a free-form value.
    additional_metadata: ZendeskAdditionalMetadata
43
+
44
+
45
class ZendeskAccessConfig(AccessConfig):
    # Sensitive credentials for Zendesk; wrapped in `Secret[...]` by the connection config.
    api_token: str = Field(
        description="API token for zendesk generated under Apps and Integrations"
    )
49
+
50
+
51
class ZendeskConnectionConfig(ConnectionConfig):
    # Everything needed to reach one Zendesk site: subdomain, account email and API token.
    subdomain: str = Field(description="Subdomain for zendesk site, <sub-domain>.company.com")
    email: str = Field(description="Email for zendesk site registered at the subdomain")
    access_config: Secret[ZendeskAccessConfig]

    def get_client(self) -> ZendeskClient:
        """Return a ZendeskClient built from the email, subdomain and unwrapped API token."""
        api_token = self.access_config.get_secret_value().api_token
        return ZendeskClient(email=self.email, subdomain=self.subdomain, token=api_token)
62
+
63
+
64
class ZendeskIndexerConfig(IndexerConfig):
    # Selects which Zendesk record kinds the indexer lists. `all` yields articles
    # followed by tickets (see ZendeskIndexer.run_async).
    item_type: Literal["tickets", "articles", "all"] = Field(
        default="tickets",
        description="Type of item from zendesk to parse: `tickets`, `articles` or `all`.",
    )
69
+
70
+
71
@dataclass
class ZendeskIndexer(Indexer):
    """Lists Zendesk tickets and/or articles and yields one ZendeskFileData per record."""

    connection_config: ZendeskConnectionConfig
    index_config: ZendeskIndexerConfig
    connector_type: str = CONNECTOR_TYPE

    def precheck(self) -> None:
        """Build a Zendesk client from the connection config.

        NOTE(review): only the client is constructed here; whether that alone verifies
        connectivity depends on ZendeskClient -- confirm.
        """
        self.connection_config.get_client()

    def is_async(self) -> bool:
        """Report that this indexer is driven through ``run_async``."""
        return True

    def _generate_fullpath(self, identifier: str) -> Path:
        """Return a ``.txt`` path named after the first 16 hex chars of the id's SHA-256."""
        digest = hashlib.sha256(identifier.encode("utf-8")).hexdigest()
        return Path(digest[:16] + ".txt")

    async def get_tickets(self) -> AsyncGenerator[ZendeskFileData, None]:
        """Yield a ZendeskFileData for every ticket returned by the client."""
        async with self.connection_config.get_client() as client:
            async for ticket in client.get_tickets():
                identifiers = SourceIdentifiers(
                    filename=f"{ticket.id}.txt", fullpath=f"tickets/{ticket.id}.txt"
                )
                payload = ZendeskAdditionalMetadata(item_type="ticket", content=ticket)
                created, updated = ticket.created_at, ticket.updated_at
                source_metadata = FileDataSourceMetadata(
                    url=str(ticket.url) if ticket.url else None,
                    date_created=created.isoformat() if created else None,
                    date_modified=updated.isoformat() if updated else None,
                    # Epoch seconds at indexing time, stored as a string.
                    date_processed=str(time()),
                )
                yield ZendeskFileData(
                    identifier=str(ticket.id),
                    connector_type=self.connector_type,
                    source_identifiers=identifiers,
                    additional_metadata=payload,
                    metadata=source_metadata,
                    display_name=identifiers.fullpath,
                )

    async def get_articles(self) -> AsyncGenerator[ZendeskFileData, None]:
        """Yield a ZendeskFileData for every article returned by the client."""
        async with self.connection_config.get_client() as client:
            async for article in client.get_articles():
                identifiers = SourceIdentifiers(
                    filename=f"{article.id}.html", fullpath=f"articles/{article.id}.html"
                )
                payload = ZendeskAdditionalMetadata(item_type="article", content=article)
                created, updated = article.created_at, article.updated_at
                source_metadata = FileDataSourceMetadata(
                    url=str(article.url) if article.url else None,
                    date_created=created.isoformat() if created else None,
                    date_modified=updated.isoformat() if updated else None,
                    # Epoch seconds at indexing time, stored as a string.
                    date_processed=str(time()),
                )
                yield ZendeskFileData(
                    identifier=str(article.id),
                    connector_type=self.connector_type,
                    source_identifiers=identifiers,
                    additional_metadata=payload,
                    metadata=source_metadata,
                    display_name=identifiers.fullpath,
                )

    async def run_async(self, **kwargs: Any) -> AsyncGenerator[ZendeskFileData, None]:
        """Yield file data for the configured item type.

        ``articles`` and ``tickets`` yield only that kind; ``all`` yields every article
        first and then every ticket. Any other value raises ``ValueError``.
        """
        item_type = self.index_config.item_type

        if item_type not in ("articles", "tickets", "all"):
            raise ValueError(f"Item type {item_type} is not supported by the indexer")

        if item_type in ("articles", "all"):
            async for article_file_data in self.get_articles():
                yield article_file_data

        if item_type in ("tickets", "all"):
            async for ticket_file_data in self.get_tickets():
                yield ticket_file_data
153
+
154
+
155
class ZendeskDownloaderConfig(DownloaderConfig, HtmlMixin):
    # No Zendesk-specific options: the class only combines the generic downloader
    # settings with whatever HtmlMixin contributes.
    pass
157
+
158
+
159
@dataclass
class ZendeskDownloader(Downloader):
    """Writes indexed Zendesk tickets (as text) and articles (as HTML) to local files."""

    download_config: ZendeskDownloaderConfig
    connection_config: ZendeskConnectionConfig
    connector_type: str = CONNECTOR_TYPE

    def is_async(self) -> bool:
        """Report that this downloader is driven through ``run_async``."""
        return True

    def download_embedded_files(
        self, session, html: str, current_file_data: FileData
    ) -> list[DownloadResponse]:
        """Extract files referenced from ``html`` when ``extract_files`` is enabled.

        Returns an empty list when extraction is disabled or the record has no URL;
        otherwise delegates to ``download_config.extract_embedded_files``, targeting a
        directory named after the record's download path with its suffix removed.
        """
        if not self.download_config.extract_files:
            return []
        url = current_file_data.metadata.url
        if url is None:
            # Single-line message: a triple-quoted f-string here would leak a raw newline
            # and the source indentation into the log output.
            logger.warning(
                f"Missing URL for file: {current_file_data.source_identifiers.filename}. "
                "Skipping file extraction."
            )
            return []
        filepath = current_file_data.source_identifiers.relative_path
        download_path = Path(self.download_dir) / filepath
        download_dir = download_path.with_suffix("")
        return self.download_config.extract_embedded_files(
            url=url,
            download_dir=download_dir,
            original_filedata=current_file_data,
            html=html,
            session=session,
        )

    @requires_dependencies(["aiofiles", "bs4"], extras="zendesk")
    async def download_article(self, article: ZendeskArticle, download_path: Path) -> None:
        """Write the article's HTML, prettified by BeautifulSoup, to ``download_path``."""
        import aiofiles
        import bs4

        article_html = article.as_html()
        soup = bs4.BeautifulSoup(article_html, "html.parser")
        async with aiofiles.open(download_path, "w", encoding="utf8") as f:
            await f.write(soup.prettify())

    @requires_dependencies(["aiofiles"], extras="zendesk")
    async def download_ticket(self, ticket: ZendeskTicket, download_path: Path) -> None:
        """Write the ticket's text followed by the text of each of its comments."""
        import aiofiles

        async with aiofiles.open(download_path, "w", encoding="utf8") as f:
            await f.write(ticket.as_text())
            async with self.connection_config.get_client() as client:
                # Comments are collected in full before any of them is written.
                comments = [comment async for comment in client.get_comments(ticket_id=ticket.id)]
                for comment in comments:
                    await f.write(comment.as_text())

    async def run_async(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        """Download one indexed record and return its download response.

        Raises ``SourceConnectionError`` when the record's item type is neither
        ``article`` nor ``ticket``.
        """
        zendesk_filedata = ZendeskFileData.cast(file_data=file_data)

        item_type = zendesk_filedata.additional_metadata.item_type
        download_path = self.get_download_path(file_data=zendesk_filedata)
        download_path.parent.mkdir(parents=True, exist_ok=True)

        if item_type == "article":
            article = ZendeskArticle.model_validate(zendesk_filedata.additional_metadata.content)
            await self.download_article(article=article, download_path=download_path)
        elif item_type == "ticket":
            ticket = ZendeskTicket.model_validate(zendesk_filedata.additional_metadata.content)
            await self.download_ticket(ticket=ticket, download_path=download_path)
        else:
            raise SourceConnectionError(
                f"Item type {item_type} cannot be handled by the downloader"
            )
        return super().generate_download_response(
            file_data=zendesk_filedata, download_path=download_path
        )
232
+
233
+
234
# Registry entry bundling the Zendesk source connector's config, indexer and downloader classes.
zendesk_source_entry = SourceRegistryEntry(
    connection_config=ZendeskConnectionConfig,
    indexer=ZendeskIndexer,
    indexer_config=ZendeskIndexerConfig,
    downloader=ZendeskDownloader,
    downloader_config=ZendeskDownloaderConfig,
)
@@ -0,0 +1,203 @@
1
+ from abc import ABC
2
+ from dataclasses import dataclass
3
+ from pathlib import Path
4
+ from typing import TYPE_CHECKING, Any, Literal, Optional
5
+
6
+ from pydantic import BaseModel, Field, SecretStr
7
+
8
+ from unstructured_ingest.interfaces.process import BaseProcess
9
+ from unstructured_ingest.utils.data_prep import get_json_data
10
+
11
+ if TYPE_CHECKING:
12
+ from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder
13
+
14
+
15
class EmbedderConfig(BaseModel):
    # User-facing embedding settings plus one factory method per supported provider.
    # `get_embedder` picks the factory that matches `embedding_provider`.
    embedding_provider: Optional[
        Literal[
            "openai",
            "azure-openai",
            "huggingface",
            "bedrock",
            "vertexai",
            "voyageai",
            "octoai",
            "mixedbread-ai",
            "togetherai",
        ]
    ] = Field(default=None, description="Type of the embedding class to be used.")
    embedding_api_key: Optional[SecretStr] = Field(
        default=None,
        description="API key for the embedding model, for the case an API key is needed.",
    )
    embedding_model_name: Optional[str] = Field(
        default=None,
        description="Embedding model name, if needed. "
        "Chooses a particular LLM between different options, to embed with it.",
    )
    embedding_aws_access_key_id: Optional[str] = Field(
        default=None, description="AWS access key used for AWS-based embedders, such as bedrock"
    )
    embedding_aws_secret_access_key: Optional[SecretStr] = Field(
        default=None, description="AWS secret key used for AWS-based embedders, such as bedrock"
    )
    embedding_aws_region: Optional[str] = Field(
        default="us-west-2", description="AWS region used for AWS-based embedders, such as bedrock"
    )
    embedding_azure_endpoint: Optional[str] = Field(
        default=None,
        description="Your Azure endpoint, including the resource, "
        "e.g. `https://example-resource.azure.openai.com/`",
    )
    embedding_azure_api_version: Optional[str] = Field(
        description="Azure API version", default=None
    )

    def get_huggingface_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
        """Build the HuggingFace encoder from ``embedding_kwargs``."""
        from unstructured_ingest.embed.huggingface import (
            HuggingFaceEmbeddingConfig,
            HuggingFaceEmbeddingEncoder,
        )

        return HuggingFaceEmbeddingEncoder(
            config=HuggingFaceEmbeddingConfig.model_validate(embedding_kwargs)
        )

    def get_openai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
        """Build the OpenAI encoder from ``embedding_kwargs``."""
        from unstructured_ingest.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder

        return OpenAIEmbeddingEncoder(config=OpenAIEmbeddingConfig.model_validate(embedding_kwargs))

    def get_azure_openai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
        """Build the Azure OpenAI encoder.

        ``embedding_kwargs`` is accepted for signature parity but not read: the config is
        assembled directly from this model's API key, Azure endpoint and, when set, the
        API version and model name.
        """
        from unstructured_ingest.embed.azure_openai import (
            AzureOpenAIEmbeddingConfig,
            AzureOpenAIEmbeddingEncoder,
        )

        config_kwargs = {
            "api_key": self.embedding_api_key,
            "azure_endpoint": self.embedding_azure_endpoint,
        }
        if api_version := self.embedding_azure_api_version:
            config_kwargs["api_version"] = api_version
        if model_name := self.embedding_model_name:
            config_kwargs["model_name"] = model_name

        return AzureOpenAIEmbeddingEncoder(
            config=AzureOpenAIEmbeddingConfig.model_validate(config_kwargs)
        )

    def get_octoai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
        """Build the OctoAI encoder from ``embedding_kwargs``."""
        from unstructured_ingest.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder

        return OctoAIEmbeddingEncoder(config=OctoAiEmbeddingConfig.model_validate(embedding_kwargs))

    def get_bedrock_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
        """Build the Bedrock encoder from ``embedding_kwargs`` plus the AWS settings.

        The secret access key is optional on this model, so it is only unwrapped and
        forwarded when present. When it is absent the key is left out and validation is
        left to ``BedrockEmbeddingConfig`` rather than failing here on ``None``.
        """
        from unstructured_ingest.embed.bedrock import (
            BedrockEmbeddingConfig,
            BedrockEmbeddingEncoder,
        )

        aws_kwargs: dict[str, Any] = {
            "aws_access_key_id": self.embedding_aws_access_key_id,
            "region_name": self.embedding_aws_region,
        }
        if (secret_key := self.embedding_aws_secret_access_key) is not None:
            aws_kwargs["aws_secret_access_key"] = secret_key.get_secret_value()
        # AWS settings win over any clashing keys the caller supplied.
        embedding_kwargs = embedding_kwargs | aws_kwargs

        return BedrockEmbeddingEncoder(
            config=BedrockEmbeddingConfig.model_validate(embedding_kwargs)
        )

    def get_vertexai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
        """Build the Vertex AI encoder from ``embedding_kwargs``."""
        from unstructured_ingest.embed.vertexai import (
            VertexAIEmbeddingConfig,
            VertexAIEmbeddingEncoder,
        )

        return VertexAIEmbeddingEncoder(
            config=VertexAIEmbeddingConfig.model_validate(embedding_kwargs)
        )

    def get_voyageai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
        """Build the Voyage AI encoder from ``embedding_kwargs``."""
        from unstructured_ingest.embed.voyageai import (
            VoyageAIEmbeddingConfig,
            VoyageAIEmbeddingEncoder,
        )

        return VoyageAIEmbeddingEncoder(
            config=VoyageAIEmbeddingConfig.model_validate(embedding_kwargs)
        )

    def get_mixedbread_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
        """Build the Mixedbread AI encoder from ``embedding_kwargs``."""
        from unstructured_ingest.embed.mixedbreadai import (
            MixedbreadAIEmbeddingConfig,
            MixedbreadAIEmbeddingEncoder,
        )

        return MixedbreadAIEmbeddingEncoder(
            config=MixedbreadAIEmbeddingConfig.model_validate(embedding_kwargs)
        )

    def get_togetherai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
        """Build the Together AI encoder from ``embedding_kwargs``."""
        from unstructured_ingest.embed.togetherai import (
            TogetherAIEmbeddingConfig,
            TogetherAIEmbeddingEncoder,
        )

        return TogetherAIEmbeddingEncoder(
            config=TogetherAIEmbeddingConfig.model_validate(embedding_kwargs)
        )

    def get_embedder(self) -> "BaseEmbeddingEncoder":
        """Return the encoder for ``embedding_provider``.

        The API key (unwrapped) and model name are forwarded only when set.

        Raises:
            ValueError: if ``embedding_provider`` is unset or not a known provider.
        """
        kwargs: dict[str, Any] = {}
        if self.embedding_api_key:
            kwargs["api_key"] = self.embedding_api_key.get_secret_value()
        if self.embedding_model_name:
            kwargs["model_name"] = self.embedding_model_name
        # TODO make this more dynamic to map to encoder configs
        # Looked up through `self` so subclass overrides of a factory are honoured.
        factories = {
            "openai": self.get_openai_embedder,
            "huggingface": self.get_huggingface_embedder,
            "octoai": self.get_octoai_embedder,
            "bedrock": self.get_bedrock_embedder,
            "vertexai": self.get_vertexai_embedder,
            "voyageai": self.get_voyageai_embedder,
            "mixedbread-ai": self.get_mixedbread_embedder,
            "togetherai": self.get_togetherai_embedder,
            "azure-openai": self.get_azure_openai_embedder,
        }
        factory = factories.get(self.embedding_provider)
        if factory is None:
            raise ValueError(f"{self.embedding_provider} not a recognized encoder")
        return factory(embedding_kwargs=kwargs)
183
+
184
+
185
@dataclass
class Embedder(BaseProcess, ABC):
    """Process step that embeds the elements stored in a JSON file.

    A new encoder is obtained from ``config.get_embedder()`` on every call.
    """

    config: EmbedderConfig

    def init(self, **kwargs: Any) -> None:
        """Build an encoder from the config and call its ``initialize`` hook."""
        encoder = self.config.get_embedder()
        encoder.initialize()

    def precheck(self) -> None:
        """Build an encoder from the config and run its ``precheck``."""
        self.config.get_embedder().precheck()

    def run(self, elements_filepath: Path, **kwargs: Any) -> list[dict]:
        """Embed the elements read from ``elements_filepath``.

        Returns an empty list when the file holds no elements; otherwise returns
        whatever the encoder's ``embed_documents`` produces.
        """
        # TODO update base embedder classes to support async
        encoder = self.config.get_embedder()
        loaded = get_json_data(path=elements_filepath)
        if loaded:
            return encoder.embed_documents(elements=loaded)
        return []
@@ -0,0 +1,60 @@
1
+ import fnmatch
2
+ from abc import ABC
3
+ from dataclasses import dataclass, field
4
+ from typing import Any, Callable, Optional
5
+
6
+ from pydantic import BaseModel, Field
7
+
8
+ from unstructured_ingest.data_types.file_data import FileData
9
+ from unstructured_ingest.interfaces.process import BaseProcess
10
+ from unstructured_ingest.logger import logger
11
+
12
+
13
class FiltererConfig(BaseModel):
    """Options controlling which files are accepted for processing."""

    # Glob patterns a file path must match (any one of them); defaults to None.
    file_glob: Optional[list[str]] = Field(
        default=None,
        # Was "which data_types of files": residue of a types -> data_types rename.
        description="file globs to limit which types of files are accepted",
        examples=["*.pdf", "*.html"],
    )
    # Size limit in bytes; defaults to None.
    max_file_size: Optional[int] = Field(
        default=None, description="Max file size to process in bytes"
    )
22
+
23
+
24
@dataclass
class Filterer(BaseProcess, ABC):
    """Process step that drops files which fail any configured filter.

    The active filters are derived from ``config`` once, right after
    construction, and stored in ``filters`` in the order they will be applied.
    """

    config: FiltererConfig = field(default_factory=FiltererConfig)
    # Predicates returning True to keep a file; populated in __post_init__.
    filters: list[Callable[[FileData], bool]] = field(init=False, default_factory=list)

    def __post_init__(self):
        """Register one predicate per filter option that is set in the config."""
        # An empty glob list still registers the glob filter; with no pattern
        # to match, that filter then rejects every file.
        if self.config.file_glob is not None:
            self.filters.append(self.glob_filter)
        # Truthiness check: both None and 0 leave the size filter disabled.
        if self.config.max_file_size:
            self.filters.append(self.file_size_filter)

    def is_async(self) -> bool:
        """Report that this step runs synchronously."""
        return False

    def file_size_filter(self, file_data: FileData) -> bool:
        """Return True when the file is within ``config.max_file_size``.

        Files whose recorded size is missing or zero are always accepted.
        """
        size = file_data.metadata.filesize_bytes
        if not size:
            return True
        return size <= self.config.max_file_size

    def glob_filter(self, file_data: FileData) -> bool:
        """Return True when the file's full path matches any configured glob."""
        path = file_data.source_identifiers.fullpath
        if any(fnmatch.fnmatch(path, pattern) for pattern in self.config.file_glob):
            return True
        logger.debug(f"the file {path!r} is discarded as it does not match any given glob.")
        return False

    def run(self, file_data: FileData, **kwargs: Any) -> Optional[FileData]:
        """Apply the filters in order and stop at the first one that rejects.

        Args:
            file_data: the file record to evaluate.
            **kwargs: accepted but unused.

        Returns:
            ``file_data`` unchanged when every filter accepts it, else ``None``.
        """
        # Named ``predicate`` so the builtin ``filter`` is not shadowed.
        for predicate in self.filters:
            if not predicate(file_data):
                logger.debug(
                    f"filtered out file data due to {predicate.__name__}: {file_data.identifier}"
                )
                return None
        return file_data