unstructured-ingest 1.2.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (243) hide show
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/README.md +28 -0
  4. unstructured_ingest/cli/__init__.py +0 -0
  5. unstructured_ingest/cli/base/__init__.py +4 -0
  6. unstructured_ingest/cli/base/cmd.py +269 -0
  7. unstructured_ingest/cli/base/dest.py +84 -0
  8. unstructured_ingest/cli/base/importer.py +34 -0
  9. unstructured_ingest/cli/base/src.py +75 -0
  10. unstructured_ingest/cli/cli.py +24 -0
  11. unstructured_ingest/cli/cmds.py +14 -0
  12. unstructured_ingest/cli/utils/__init__.py +0 -0
  13. unstructured_ingest/cli/utils/click.py +237 -0
  14. unstructured_ingest/cli/utils/model_conversion.py +222 -0
  15. unstructured_ingest/data_types/__init__.py +0 -0
  16. unstructured_ingest/data_types/entities.py +17 -0
  17. unstructured_ingest/data_types/file_data.py +116 -0
  18. unstructured_ingest/embed/__init__.py +0 -0
  19. unstructured_ingest/embed/azure_openai.py +63 -0
  20. unstructured_ingest/embed/bedrock.py +323 -0
  21. unstructured_ingest/embed/huggingface.py +69 -0
  22. unstructured_ingest/embed/interfaces.py +146 -0
  23. unstructured_ingest/embed/mixedbreadai.py +134 -0
  24. unstructured_ingest/embed/octoai.py +133 -0
  25. unstructured_ingest/embed/openai.py +142 -0
  26. unstructured_ingest/embed/togetherai.py +116 -0
  27. unstructured_ingest/embed/vertexai.py +109 -0
  28. unstructured_ingest/embed/voyageai.py +130 -0
  29. unstructured_ingest/error.py +156 -0
  30. unstructured_ingest/errors_v2.py +156 -0
  31. unstructured_ingest/interfaces/__init__.py +27 -0
  32. unstructured_ingest/interfaces/connector.py +56 -0
  33. unstructured_ingest/interfaces/downloader.py +90 -0
  34. unstructured_ingest/interfaces/indexer.py +29 -0
  35. unstructured_ingest/interfaces/process.py +22 -0
  36. unstructured_ingest/interfaces/processor.py +88 -0
  37. unstructured_ingest/interfaces/upload_stager.py +89 -0
  38. unstructured_ingest/interfaces/uploader.py +67 -0
  39. unstructured_ingest/logger.py +39 -0
  40. unstructured_ingest/main.py +11 -0
  41. unstructured_ingest/otel.py +128 -0
  42. unstructured_ingest/pipeline/__init__.py +0 -0
  43. unstructured_ingest/pipeline/interfaces.py +211 -0
  44. unstructured_ingest/pipeline/otel.py +32 -0
  45. unstructured_ingest/pipeline/pipeline.py +408 -0
  46. unstructured_ingest/pipeline/steps/__init__.py +0 -0
  47. unstructured_ingest/pipeline/steps/chunk.py +78 -0
  48. unstructured_ingest/pipeline/steps/download.py +206 -0
  49. unstructured_ingest/pipeline/steps/embed.py +77 -0
  50. unstructured_ingest/pipeline/steps/filter.py +35 -0
  51. unstructured_ingest/pipeline/steps/index.py +86 -0
  52. unstructured_ingest/pipeline/steps/partition.py +77 -0
  53. unstructured_ingest/pipeline/steps/stage.py +65 -0
  54. unstructured_ingest/pipeline/steps/uncompress.py +50 -0
  55. unstructured_ingest/pipeline/steps/upload.py +58 -0
  56. unstructured_ingest/processes/__init__.py +18 -0
  57. unstructured_ingest/processes/chunker.py +131 -0
  58. unstructured_ingest/processes/connector_registry.py +69 -0
  59. unstructured_ingest/processes/connectors/__init__.py +129 -0
  60. unstructured_ingest/processes/connectors/airtable.py +238 -0
  61. unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
  62. unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
  63. unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
  64. unstructured_ingest/processes/connectors/astradb.py +592 -0
  65. unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
  66. unstructured_ingest/processes/connectors/chroma.py +193 -0
  67. unstructured_ingest/processes/connectors/confluence.py +527 -0
  68. unstructured_ingest/processes/connectors/couchbase.py +336 -0
  69. unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
  70. unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
  71. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
  72. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
  73. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
  74. unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
  75. unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
  76. unstructured_ingest/processes/connectors/delta_table.py +310 -0
  77. unstructured_ingest/processes/connectors/discord.py +161 -0
  78. unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
  79. unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
  80. unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
  81. unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
  82. unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
  83. unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
  84. unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
  85. unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
  86. unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
  87. unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
  88. unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
  89. unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
  90. unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
  91. unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
  92. unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
  93. unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
  94. unstructured_ingest/processes/connectors/github.py +226 -0
  95. unstructured_ingest/processes/connectors/gitlab.py +270 -0
  96. unstructured_ingest/processes/connectors/google_drive.py +848 -0
  97. unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
  98. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
  99. unstructured_ingest/processes/connectors/jira.py +522 -0
  100. unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
  101. unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
  102. unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
  103. unstructured_ingest/processes/connectors/kafka/local.py +103 -0
  104. unstructured_ingest/processes/connectors/kdbai.py +156 -0
  105. unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
  106. unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
  107. unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
  108. unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
  109. unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
  110. unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
  111. unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
  112. unstructured_ingest/processes/connectors/local.py +227 -0
  113. unstructured_ingest/processes/connectors/milvus.py +311 -0
  114. unstructured_ingest/processes/connectors/mongodb.py +389 -0
  115. unstructured_ingest/processes/connectors/neo4j.py +534 -0
  116. unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
  117. unstructured_ingest/processes/connectors/notion/client.py +349 -0
  118. unstructured_ingest/processes/connectors/notion/connector.py +350 -0
  119. unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
  120. unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
  121. unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
  122. unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
  123. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  124. unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
  125. unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
  126. unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
  127. unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
  128. unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  129. unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  130. unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  131. unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
  132. unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
  133. unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
  134. unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
  135. unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
  136. unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
  137. unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
  138. unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
  139. unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
  140. unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
  141. unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
  142. unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  143. unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  144. unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  145. unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  146. unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
  147. unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
  148. unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
  149. unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
  150. unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  151. unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
  152. unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
  153. unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
  154. unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  155. unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
  156. unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
  157. unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
  158. unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
  159. unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
  160. unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
  161. unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
  162. unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
  163. unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
  164. unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
  165. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  166. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
  167. unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
  168. unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
  169. unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
  170. unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
  171. unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
  172. unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
  173. unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
  174. unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
  175. unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
  176. unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
  177. unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
  178. unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
  179. unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
  180. unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
  181. unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
  182. unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
  183. unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
  184. unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
  185. unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
  186. unstructured_ingest/processes/connectors/onedrive.py +485 -0
  187. unstructured_ingest/processes/connectors/outlook.py +242 -0
  188. unstructured_ingest/processes/connectors/pinecone.py +400 -0
  189. unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
  190. unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
  191. unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
  192. unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
  193. unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
  194. unstructured_ingest/processes/connectors/redisdb.py +214 -0
  195. unstructured_ingest/processes/connectors/salesforce.py +307 -0
  196. unstructured_ingest/processes/connectors/sharepoint.py +282 -0
  197. unstructured_ingest/processes/connectors/slack.py +249 -0
  198. unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
  199. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
  200. unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
  201. unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
  202. unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
  203. unstructured_ingest/processes/connectors/sql/sql.py +456 -0
  204. unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
  205. unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
  206. unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
  207. unstructured_ingest/processes/connectors/utils.py +60 -0
  208. unstructured_ingest/processes/connectors/vectara.py +348 -0
  209. unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
  210. unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
  211. unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
  212. unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
  213. unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
  214. unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
  215. unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
  216. unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
  217. unstructured_ingest/processes/embedder.py +203 -0
  218. unstructured_ingest/processes/filter.py +60 -0
  219. unstructured_ingest/processes/partitioner.py +233 -0
  220. unstructured_ingest/processes/uncompress.py +61 -0
  221. unstructured_ingest/processes/utils/__init__.py +8 -0
  222. unstructured_ingest/processes/utils/blob_storage.py +32 -0
  223. unstructured_ingest/processes/utils/logging/connector.py +365 -0
  224. unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
  225. unstructured_ingest/unstructured_api.py +140 -0
  226. unstructured_ingest/utils/__init__.py +5 -0
  227. unstructured_ingest/utils/chunking.py +56 -0
  228. unstructured_ingest/utils/compression.py +72 -0
  229. unstructured_ingest/utils/constants.py +2 -0
  230. unstructured_ingest/utils/data_prep.py +216 -0
  231. unstructured_ingest/utils/dep_check.py +78 -0
  232. unstructured_ingest/utils/filesystem.py +27 -0
  233. unstructured_ingest/utils/html.py +174 -0
  234. unstructured_ingest/utils/ndjson.py +52 -0
  235. unstructured_ingest/utils/pydantic_models.py +52 -0
  236. unstructured_ingest/utils/string_and_date_utils.py +74 -0
  237. unstructured_ingest/utils/table.py +80 -0
  238. unstructured_ingest/utils/tls.py +15 -0
  239. unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
  240. unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
  241. unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
  242. unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
  243. unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
@@ -0,0 +1,156 @@
1
+ from abc import ABC
2
+ from functools import wraps
3
+ from typing import Optional
4
+
5
+
6
class UnstructuredIngestError(Exception, ABC):
    """Root of this module's exception hierarchy.

    Subclasses set ``error_string`` (a ``str.format`` template that
    :meth:`wrap` fills with the text of the exception it converts) and may
    override ``status_code``.
    """

    # Template applied by wrap(); only declared here, so subclasses must assign it.
    error_string: str
    # Numeric status associated with the error; None unless a subclass overrides it.
    status_code: Optional[int] = None

    @classmethod
    def wrap(cls, f):
        """Decorate ``f`` so that ordinary exceptions are re-raised as ``cls``.

        If ``f`` raises an instance of ``cls`` (or of a subclass), that
        exception propagates unchanged. Any other ``Exception`` is converted
        to ``cls(cls.error_string.format(str(error)))`` with the original
        exception chained as ``__cause__``.

        Exceptions that derive only from ``BaseException`` (``KeyboardInterrupt``,
        ``SystemExit``, ``GeneratorExit``, ``asyncio.CancelledError``) are
        deliberately not converted: turning them into an ``Exception`` subclass
        would let generic error handlers swallow interrupts and cancellation.
        """

        @wraps(f)
        def wrapper(*args, **kwargs):
            try:
                return f(*args, **kwargs)
            except cls:
                # Already the requested error type (or a subclass): keep it intact.
                raise
            except Exception as error:
                raise cls(cls.error_string.format(str(error))) from error

        return wrapper
28
+
29
+
30
class ConnectionError(UnstructuredIngestError):
    """Generic connection failure; parent of the source/destination/embedding variants.

    NOTE: within this module the name shadows Python's builtin ``ConnectionError``;
    this class does not derive from the builtin.
    """

    error_string = "Connection error: {}"
    status_code: Optional[int] = 400


class SourceConnectionError(ConnectionError):
    """Failure while getting data from the upstream data source."""

    error_string = "Error in getting data from upstream data source: {}"
    status_code: Optional[int] = 400


class SourceConnectionNetworkError(SourceConnectionError):
    """Failure while connecting to the upstream data source."""

    error_string = "Error in connecting to upstream data source: {}"
    status_code: Optional[int] = 400


class DestinationConnectionError(ConnectionError):
    """Failure while connecting to the downstream data source."""

    error_string = "Error in connecting to downstream data source: {}"
    status_code: Optional[int] = 400


class EmbeddingEncoderConnectionError(ConnectionError):
    """Failure while connecting to the embedding model provider."""

    error_string = "Error in connecting to the embedding model provider: {}"
    status_code: Optional[int] = 400


class UserError(UnstructuredIngestError):
    """Generic user error; parent of the auth, rate-limit and quota errors."""

    error_string = "User error: {}"
    status_code: Optional[int] = 401


class UserAuthError(UserError):
    """User authentication error."""

    error_string = "User authentication error: {}"
    status_code: Optional[int] = 401


class RateLimitError(UserError):
    """Rate limit error."""

    error_string = "Rate limit error: {}"
    status_code: Optional[int] = 429


class NotFoundError(UnstructuredIngestError):
    """Not-found error."""

    error_string = "Not found error: {}"
    status_code: Optional[int] = 404


class TimeoutError(UnstructuredIngestError):
    """Timeout error.

    NOTE: shadows the builtin ``TimeoutError`` within this module and does not
    derive from it.
    """

    error_string = "Timeout error: {}"
    status_code: Optional[int] = 408


class ResponseError(UnstructuredIngestError):
    """Response error."""

    error_string = "Response error: {}"
    status_code: Optional[int] = 400


class WriteError(UnstructuredIngestError):
    """Failure while writing to the downstream data source."""

    error_string = "Error in writing to downstream data source: {}"
    status_code: Optional[int] = 400


class ProviderError(UnstructuredIngestError):
    """Provider error."""

    error_string = "Provider error: {}"
    status_code: Optional[int] = 500


class ValueError(UnstructuredIngestError):
    """Value error; no ``status_code`` override, so it stays ``None``.

    NOTE: shadows the builtin ``ValueError`` within this module and does not
    derive from it.
    """

    error_string = "Value error: {}"


class PartitionError(UnstructuredIngestError):
    """Failure while partitioning content; ``status_code`` stays ``None``."""

    error_string = "Error in partitioning content: {}"


class QuotaError(UserError):
    """Quota error; inherits ``status_code`` 401 from ``UserError``."""

    error_string = "Quota error: {}"


class MissingCategoryError(UnstructuredIngestError):
    """Missing-category error; ``status_code`` stays ``None``."""

    error_string = "Missing category error: {}"


class ValidationError(UnstructuredIngestError):
    """Validation error; ``status_code`` stays ``None``."""

    error_string = "Validation error: {}"


class KeyError(UnstructuredIngestError):
    """Key error; ``status_code`` stays ``None``.

    NOTE: shadows the builtin ``KeyError`` within this module and does not
    derive from it.
    """

    error_string = "Key error: {}"


class FileExistsError(UnstructuredIngestError):
    """File-exists error; ``status_code`` stays ``None``.

    NOTE: shadows the builtin ``FileExistsError`` within this module and does
    not derive from it.
    """

    error_string = "File exists error: {}"


class TypeError(UnstructuredIngestError):
    """Type error; ``status_code`` stays ``None``.

    NOTE: shadows the builtin ``TypeError`` within this module and does not
    derive from it.
    """

    error_string = "Type error: {}"


class IcebergCommitFailedException(UnstructuredIngestError):
    """Failure to commit changes to the iceberg table; ``status_code`` stays ``None``."""

    # NOTE(review): unlike every other template here this one has no "{}", so
    # UnstructuredIngestError.wrap() drops the wrapped exception's text from the
    # message (it is still chained as __cause__). Confirm this is intended.
    error_string = "Failed to commit changes to the iceberg table"
129
+
130
+
131
# Exception classes that is_internal_error() checks against.
# Several entries are subclasses of other entries (e.g. UserAuthError of
# UserError), so for an isinstance test they are covered twice.
# NOTE(review): MissingCategoryError and IcebergCommitFailedException are
# defined in this module but are not listed here - confirm the omission is
# intentional.
recognized_errors = [
    UserError,
    UserAuthError,
    RateLimitError,
    QuotaError,
    ProviderError,
    NotFoundError,
    TypeError,
    ValueError,
    FileExistsError,
    TimeoutError,
    KeyError,
    ResponseError,
    ValidationError,
    PartitionError,
    WriteError,
    ConnectionError,
    SourceConnectionError,
    SourceConnectionNetworkError,
    DestinationConnectionError,
    EmbeddingEncoderConnectionError,
]
153
+
154
+
155
def is_internal_error(e: Exception) -> bool:
    """Return ``True`` when ``e`` is an instance of any class in ``recognized_errors``."""
    # isinstance accepts a tuple of classes, which performs the same
    # "matches at least one" check as looping over the list.
    return isinstance(e, tuple(recognized_errors))
@@ -0,0 +1,156 @@
1
+ from abc import ABC
2
+ from functools import wraps
3
+ from typing import Optional
4
+
5
+
6
class UnstructuredIngestError(Exception, ABC):
    """Root of this module's exception hierarchy.

    Subclasses set ``error_string`` (a ``str.format`` template that
    :meth:`wrap` fills with the text of the exception it converts) and may
    override ``status_code``.
    """

    # Template applied by wrap(); only declared here, so subclasses must assign it.
    error_string: str
    # Numeric status associated with the error; None unless a subclass overrides it.
    status_code: Optional[int] = None

    @classmethod
    def wrap(cls, f):
        """Decorate ``f`` so that ordinary exceptions are re-raised as ``cls``.

        If ``f`` raises an instance of ``cls`` (or of a subclass), that
        exception propagates unchanged. Any other ``Exception`` is converted
        to ``cls(cls.error_string.format(str(error)))`` with the original
        exception chained as ``__cause__``.

        Exceptions that derive only from ``BaseException`` (``KeyboardInterrupt``,
        ``SystemExit``, ``GeneratorExit``, ``asyncio.CancelledError``) are
        deliberately not converted: turning them into an ``Exception`` subclass
        would let generic error handlers swallow interrupts and cancellation.
        """

        @wraps(f)
        def wrapper(*args, **kwargs):
            try:
                return f(*args, **kwargs)
            except cls:
                # Already the requested error type (or a subclass): keep it intact.
                raise
            except Exception as error:
                raise cls(cls.error_string.format(str(error))) from error

        return wrapper
28
+
29
+
30
class ConnectionError(UnstructuredIngestError):
    """Generic connection failure; parent of the source/destination/embedding variants.

    NOTE: within this module the name shadows Python's builtin ``ConnectionError``;
    this class does not derive from the builtin.
    """

    error_string = "Connection error: {}"
    status_code: Optional[int] = 400


class SourceConnectionError(ConnectionError):
    """Failure while getting data from the upstream data source."""

    error_string = "Error in getting data from upstream data source: {}"
    status_code: Optional[int] = 400


class SourceConnectionNetworkError(SourceConnectionError):
    """Failure while connecting to the upstream data source."""

    error_string = "Error in connecting to upstream data source: {}"
    status_code: Optional[int] = 400


class DestinationConnectionError(ConnectionError):
    """Failure while connecting to the downstream data source."""

    error_string = "Error in connecting to downstream data source: {}"
    status_code: Optional[int] = 400


class EmbeddingEncoderConnectionError(ConnectionError):
    """Failure while connecting to the embedding model provider."""

    error_string = "Error in connecting to the embedding model provider: {}"
    status_code: Optional[int] = 400


class UserError(UnstructuredIngestError):
    """Generic user error; parent of the auth, rate-limit and quota errors."""

    error_string = "User error: {}"
    status_code: Optional[int] = 401


class UserAuthError(UserError):
    """User authentication error."""

    error_string = "User authentication error: {}"
    status_code: Optional[int] = 401


class RateLimitError(UserError):
    """Rate limit error."""

    error_string = "Rate limit error: {}"
    status_code: Optional[int] = 429


class NotFoundError(UnstructuredIngestError):
    """Not-found error."""

    error_string = "Not found error: {}"
    status_code: Optional[int] = 404


class TimeoutError(UnstructuredIngestError):
    """Timeout error.

    NOTE: shadows the builtin ``TimeoutError`` within this module and does not
    derive from it.
    """

    error_string = "Timeout error: {}"
    status_code: Optional[int] = 408


class ResponseError(UnstructuredIngestError):
    """Response error."""

    error_string = "Response error: {}"
    status_code: Optional[int] = 400


class WriteError(UnstructuredIngestError):
    """Failure while writing to the downstream data source."""

    error_string = "Error in writing to downstream data source: {}"
    status_code: Optional[int] = 400


class ProviderError(UnstructuredIngestError):
    """Provider error."""

    error_string = "Provider error: {}"
    status_code: Optional[int] = 500


class ValueError(UnstructuredIngestError):
    """Value error; no ``status_code`` override, so it stays ``None``.

    NOTE: shadows the builtin ``ValueError`` within this module and does not
    derive from it.
    """

    error_string = "Value error: {}"


class PartitionError(UnstructuredIngestError):
    """Failure while partitioning content; ``status_code`` stays ``None``."""

    error_string = "Error in partitioning content: {}"


class QuotaError(UserError):
    """Quota error; inherits ``status_code`` 401 from ``UserError``."""

    error_string = "Quota error: {}"


class MissingCategoryError(UnstructuredIngestError):
    """Missing-category error; ``status_code`` stays ``None``."""

    error_string = "Missing category error: {}"


class ValidationError(UnstructuredIngestError):
    """Validation error; ``status_code`` stays ``None``."""

    error_string = "Validation error: {}"


class KeyError(UnstructuredIngestError):
    """Key error; ``status_code`` stays ``None``.

    NOTE: shadows the builtin ``KeyError`` within this module and does not
    derive from it.
    """

    error_string = "Key error: {}"


class FileExistsError(UnstructuredIngestError):
    """File-exists error; ``status_code`` stays ``None``.

    NOTE: shadows the builtin ``FileExistsError`` within this module and does
    not derive from it.
    """

    error_string = "File exists error: {}"


class TypeError(UnstructuredIngestError):
    """Type error; ``status_code`` stays ``None``.

    NOTE: shadows the builtin ``TypeError`` within this module and does not
    derive from it.
    """

    error_string = "Type error: {}"


class IcebergCommitFailedException(UnstructuredIngestError):
    """Failure to commit changes to the iceberg table; ``status_code`` stays ``None``."""

    # NOTE(review): unlike every other template here this one has no "{}", so
    # UnstructuredIngestError.wrap() drops the wrapped exception's text from the
    # message (it is still chained as __cause__). Confirm this is intended.
    error_string = "Failed to commit changes to the iceberg table"
129
+
130
+
131
# Exception classes that is_internal_error() checks against.
# Several entries are subclasses of other entries (e.g. UserAuthError of
# UserError), so for an isinstance test they are covered twice.
# NOTE(review): MissingCategoryError and IcebergCommitFailedException are
# defined in this module but are not listed here - confirm the omission is
# intentional.
recognized_errors = [
    UserError,
    UserAuthError,
    RateLimitError,
    QuotaError,
    ProviderError,
    NotFoundError,
    TypeError,
    ValueError,
    FileExistsError,
    TimeoutError,
    KeyError,
    ResponseError,
    ValidationError,
    PartitionError,
    WriteError,
    ConnectionError,
    SourceConnectionError,
    SourceConnectionNetworkError,
    DestinationConnectionError,
    EmbeddingEncoderConnectionError,
]
153
+
154
+
155
def is_internal_error(e: Exception) -> bool:
    """Return ``True`` when ``e`` is an instance of any class in ``recognized_errors``."""
    # isinstance accepts a tuple of classes, which performs the same
    # "matches at least one" check as looping over the list.
    return isinstance(e, tuple(recognized_errors))
@@ -0,0 +1,27 @@
1
+ from .connector import AccessConfig, BaseConnector, ConnectionConfig
2
+ from .downloader import Downloader, DownloaderConfig, DownloadResponse, download_responses
3
+ from .indexer import Indexer, IndexerConfig
4
+ from .process import BaseProcess
5
+ from .processor import ProcessorConfig
6
+ from .upload_stager import UploadStager, UploadStagerConfig
7
+ from .uploader import UploadContent, Uploader, UploaderConfig, VectorDBUploader
8
+
9
+ __all__ = [
10
+ "DownloadResponse",
11
+ "download_responses",
12
+ "Downloader",
13
+ "DownloaderConfig",
14
+ "Indexer",
15
+ "IndexerConfig",
16
+ "BaseProcess",
17
+ "ProcessorConfig",
18
+ "UploadStager",
19
+ "UploadStagerConfig",
20
+ "Uploader",
21
+ "UploaderConfig",
22
+ "UploadContent",
23
+ "AccessConfig",
24
+ "ConnectionConfig",
25
+ "BaseConnector",
26
+ "VectorDBUploader",
27
+ ]
@@ -0,0 +1,56 @@
1
+ from abc import ABC
2
+ from dataclasses import dataclass
3
+ from typing import Any, TypeVar, Union
4
+
5
+ from pydantic import BaseModel, Secret, model_validator
6
+ from pydantic.types import _SecretBase
7
+
8
+ from unstructured_ingest.processes.utils.logging.connector import ConnectorLoggingMixin
9
+
10
+
11
# Declares no fields of its own; ConnectionConfig in this module holds an
# AccessConfig-bound value wrapped in pydantic's Secret.
class AccessConfig(BaseModel):
    """Meant to designate holding any sensitive information associated with other configs
    and also for access specific configs."""
14
+
15
+
16
+ AccessConfigT = TypeVar("AccessConfigT", bound=AccessConfig)
17
+
18
+
19
# Base model for connection settings. The access configuration is stored
# wrapped in pydantic's Secret and is read back via get_secret_value().
class ConnectionConfig(BaseModel):
    access_config: Secret[AccessConfigT]

    def get_access_config(self) -> dict[str, Any]:
        """Return the unwrapped access config as a plain dict.

        Returns an empty dict when ``access_config`` is falsy (e.g. ``None``
        on a subclass that declares the field optional).
        """
        if not self.access_config:
            return {}
        return self.access_config.get_secret_value().model_dump()

    @model_validator(mode="after")
    def check_access_config(self):
        """Ensure ``access_config`` is a pydantic secret wrapper.

        ``None`` is accepted only when the field annotation is an optional
        (two-member union containing ``None``).

        Raises:
            ValueError: if ``access_config`` is not a ``_SecretBase`` instance.
        """
        access_config = self.access_config
        if self._is_access_config_optional() and access_config is None:
            return self
        if not isinstance(access_config, _SecretBase):
            raise ValueError("access_config must be an instance of SecretBase")
        return self

    def _is_access_config_optional(self) -> bool:
        """Report whether ``access_config`` is annotated as ``Optional[...]``.

        Recognizes both ``typing.Optional[X]`` / ``Union[X, None]`` and the
        PEP 604 spelling ``X | None``; the latter has no ``__origin__``
        attribute, so an attribute-based check would miss it.
        """
        # Local imports keep this block self-contained; both modules are stdlib.
        import types
        from typing import get_args, get_origin

        # pydantic >= 2.11 deprecates reading model_fields through an instance,
        # so go through the class.
        annotation = type(self).model_fields["access_config"].annotation
        # types.UnionType only exists on Python 3.10+; fall back to Union itself.
        union_origins = (Union, getattr(types, "UnionType", Union))
        members = get_args(annotation)
        return (
            get_origin(annotation) in union_origins
            and len(members) == 2
            and type(None) in members
        )
45
+
46
+
47
+ ConnectionConfigT = TypeVar("ConnectionConfigT", bound=ConnectionConfig)
48
+
49
+
50
@dataclass
class BaseConnector(ABC, ConnectorLoggingMixin):
    """Abstract dataclass base that carries a connector's ``connection_config``."""

    connection_config: ConnectionConfigT

    def __post_init__(self):
        """Initialize the logging mixin after dataclass initialization."""
        # The dataclass-generated __init__ does not call the mixin's __init__,
        # so it is invoked explicitly here.
        ConnectorLoggingMixin.__init__(self)
@@ -0,0 +1,90 @@
1
+ import os
2
+ from abc import ABC
3
+ from pathlib import Path
4
+ from typing import Any, Optional, TypedDict, TypeVar, Union
5
+
6
+ from pydantic import BaseModel, Field
7
+
8
+ from unstructured_ingest.data_types.file_data import FileData
9
+ from unstructured_ingest.interfaces.connector import BaseConnector
10
+ from unstructured_ingest.interfaces.process import BaseProcess
11
+
12
+
13
# Settings shared by downloaders; currently only the download directory.
class DownloaderConfig(BaseModel):
    download_dir: Optional[Path] = Field(
        default=None,
        # The two adjacent literals are concatenated, so the first must end with
        # a space or the words run together ("at`$HOME...").
        # NOTE(review): Downloader.download_dir in this module defaults to
        # $HOME/.cache/unstructured/ingest/download/<connector_type>; confirm
        # which path this description should advertise.
        description="Where files are downloaded to, defaults to a location at "
        "`$HOME/.cache/unstructured/ingest/<connector name>/<SHA256>`.",
    )
19
+
20
+
21
+ DownloaderConfigT = TypeVar("DownloaderConfigT", bound=DownloaderConfig)
22
+
23
+
24
# Result of a single download, as built by Downloader.generate_download_response.
class DownloadResponse(TypedDict):
    # The FileData record the download was performed for.
    file_data: FileData
    # The download path passed to generate_download_response.
    path: Path
27
+
28
+
29
+ download_responses = Union[list[DownloadResponse], DownloadResponse]
30
+
31
+
32
class Downloader(BaseProcess, BaseConnector, ABC):
    """Abstract base for downloaders.

    Provides helpers for choosing a local download path and for building the
    ``DownloadResponse``; subclasses implement :meth:`run`.
    """

    connector_type: str
    download_config: DownloaderConfigT

    def get_download_path(self, file_data: FileData) -> Optional[Path]:
        """Return the local path for ``file_data`` under :attr:`download_dir`.

        Returns ``None`` when ``file_data`` has no source identifiers or no
        relative path.
        """
        if not file_data.source_identifiers:
            return None

        rel_path = file_data.source_identifiers.relative_path
        if not rel_path:
            return None

        # Strip every leading slash, not just the first: joining an absolute
        # path onto download_dir would discard download_dir altogether, so a
        # value such as "//x" would otherwise land outside the download directory.
        return self.download_dir / Path(rel_path.lstrip("/"))

    @staticmethod
    def is_float(value: str) -> bool:
        """Return ``True`` if ``float(value)`` succeeds, ``False`` otherwise."""
        try:
            float(value)
        except (TypeError, ValueError):
            # TypeError covers values float() cannot accept at all (e.g. None).
            return False
        return True

    def generate_download_response(
        self, file_data: FileData, download_path: Path
    ) -> DownloadResponse:
        """Build the ``DownloadResponse`` for a file already saved at ``download_path``.

        When both ``date_modified`` and ``date_created`` are present in the
        metadata and parse as floats, the file's timestamps are updated with
        ``os.utime``. ``file_data.local_download_path`` is set to the resolved
        download path.
        """
        metadata = file_data.metadata
        if (
            metadata.date_modified
            and self.is_float(metadata.date_modified)
            and metadata.date_created
            and self.is_float(metadata.date_created)
        ):
            date_modified = float(metadata.date_modified)
            date_created = float(metadata.date_created)
            # os.utime takes (atime, mtime): the creation date is stored as the
            # access time and the modification date as the modification time.
            os.utime(download_path, times=(date_created, date_modified))
        file_data.local_download_path = str(download_path.resolve())
        return DownloadResponse(file_data=file_data, path=download_path)

    @property
    def download_dir(self) -> Path:
        """Directory downloads are written to.

        If the config does not specify one, a default under the user's home
        directory is computed, stored back on ``download_config`` and returned.
        """
        if self.download_config.download_dir is None:
            self.download_config.download_dir = (
                Path.home()
                / ".cache"
                / "unstructured"
                / "ingest"
                / "download"
                / self.connector_type
            ).resolve()
        return self.download_config.download_dir

    def is_async(self) -> bool:
        """Report this process as asynchronous; always ``True`` here."""
        return True

    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
        """Download ``file_data``; subclasses must override."""
        raise NotImplementedError()

    async def run_async(self, file_data: FileData, **kwargs: Any) -> download_responses:
        """Async entry point; by default it calls :meth:`run` directly."""
        return self.run(file_data=file_data, **kwargs)
@@ -0,0 +1,29 @@
1
+ from abc import ABC
2
+ from typing import Any, AsyncGenerator, Generator, Optional, TypeVar
3
+
4
+ from pydantic import BaseModel
5
+
6
+ from unstructured_ingest.data_types.file_data import FileData
7
+ from unstructured_ingest.interfaces.connector import BaseConnector
8
+ from unstructured_ingest.interfaces.process import BaseProcess
9
+
10
+
11
class IndexerConfig(BaseModel):
    """Base configuration model for indexers; declares no fields of its own."""


# Type variable constrained to IndexerConfig and its subclasses.
IndexerConfigT = TypeVar("IndexerConfigT", bound=IndexerConfig)
16
+
17
+
18
class Indexer(BaseProcess, BaseConnector, ABC):
    """Base class for the step that yields ``FileData`` records to process.

    Both ``run`` and ``run_async`` raise ``NotImplementedError`` here; the
    base class only fixes their signatures.
    """

    connector_type: str
    index_config: Optional[IndexerConfigT] = None

    def is_async(self) -> bool:
        """Report this process as synchronous."""
        return False

    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
        """Produce ``FileData`` records; not implemented in the base class."""
        raise NotImplementedError()

    async def run_async(self, **kwargs: Any) -> AsyncGenerator[FileData, None]:
        """Async counterpart of ``run``; not implemented in the base class."""
        # NOTE(review): there is no ``yield`` in this body, so calling it
        # returns a coroutine rather than an async generator.
        raise NotImplementedError()
@@ -0,0 +1,22 @@
1
+ from abc import ABC, abstractmethod
2
+ from dataclasses import dataclass
3
+ from typing import Any
4
+
5
+
6
@dataclass
class BaseProcess(ABC):
    """Abstract base for a unit of work.

    Only ``run`` is abstract; every other hook has a default implementation.
    """

    def is_async(self) -> bool:
        """Return whether the async entry point should be used (default: no)."""
        return False

    def init(self, **kwargs: Any) -> None:
        """Setup hook; does nothing by default."""
        return None

    def precheck(self) -> None:
        """Validation hook; does nothing by default."""
        return None

    @abstractmethod
    def run(self, **kwargs: Any) -> Any:
        """Perform the work. Must be implemented by subclasses."""

    async def run_async(self, **kwargs: Any) -> Any:
        """Async entry point; by default calls ``run`` and returns its result."""
        outcome = self.run(**kwargs)
        return outcome
@@ -0,0 +1,88 @@
1
+ import os
2
+ from asyncio import Semaphore
3
+ from pathlib import Path
4
+ from typing import Any, Optional
5
+
6
+ from pydantic import BaseModel, ConfigDict, Field
7
+
8
# Default location for intermediate pipeline files, resolved once at import time.
DEFAULT_WORK_DIR = str(
    Path.home().joinpath(".cache", "unstructured", "ingest", "pipeline").resolve()
)
9
+
10
+
11
class ProcessorConfig(BaseModel):
    """Settings that control how the pipeline as a whole is executed.

    Besides the user-facing options, the model carries two pieces of runtime
    state: ``status`` and ``semaphore`` (created in ``model_post_init`` when
    ``max_connections`` is set).
    """

    # asyncio.Semaphore is not a pydantic-native type.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    reprocess: bool = Field(
        default=False,
        description="Reprocess a downloaded file even if the relevant structured "
        "output .json file in output directory already exists.",
    )
    verbose: bool = Field(default=False)
    tqdm: bool = Field(default=False, description="Display tqdm progress bar")
    work_dir: str = Field(
        default_factory=lambda: DEFAULT_WORK_DIR,
        description="Where to place working files when processing each step",
    )
    num_processes: int = Field(
        default=2, description="Number of parallel processes with which to process docs"
    )
    max_connections: Optional[int] = Field(
        default=None, description="Limit of concurrent connections"
    )
    raise_on_error: bool = Field(
        default=False,
        description="If set, will raise error if any doc in the pipeline fails. "
        "Otherwise will log error and continue with other docs",
    )
    # Defaults to the INGEST_DISABLE_PARALLELISM environment variable, read
    # each time a config is instantiated without an explicit value.
    disable_parallelism: bool = Field(
        default_factory=lambda: os.getenv("INGEST_DISABLE_PARALLELISM", "false").lower() == "true",
    )
    preserve_downloads: bool = Field(
        default=False, description="Don't delete downloaded files after process completes"
    )
    download_only: bool = Field(
        default=False, description="skip the rest of the process after files are downloaded"
    )
    re_download: bool = Field(
        default=False,
        description="If set, will re-download downloaded files "
        "regardless of if they already exist locally",
    )
    uncompress: bool = Field(
        default=False,
        description="Uncompress any archived files. Currently supporting "
        "zip and tar files based on file extension.",
    )
    iter_delete: bool = Field(
        default=False,
        description="If limited on memory, this can be enabled to delete "
        "cached content as it's used and no longer needed in the pipeline.",
    )
    delete_cache: bool = Field(
        default=False,
        description="If set, will delete the cache work directory when process finishes",
    )

    # OTEL support
    otel_endpoint: Optional[str] = Field(
        default=None, description="OTEL endpoint to publish trace data to"
    )

    # Used to keep track of state in pipeline
    status: dict = Field(default_factory=dict)
    # Excluded from serialization; populated in model_post_init.
    semaphore: Optional[Semaphore] = Field(init=False, default=None, exclude=True)

    def model_post_init(self, __context: Any) -> None:
        """Create the connection-limiting semaphore once fields are validated."""
        if self.max_connections is not None:
            self.semaphore = Semaphore(self.max_connections)

    @property
    def mp_supported(self) -> bool:
        """True when parallelism is enabled and more than one process is requested."""
        return not self.disable_parallelism and self.num_processes > 1

    @property
    def async_supported(self) -> bool:
        """True when async execution may be used.

        Async is off when parallelism is disabled, and also when
        ``max_connections`` is an integer that allows at most one connection.
        """
        if self.disable_parallelism:
            return False
        # isinstance() is already False for None, so no separate None check.
        if isinstance(self.max_connections, int):
            return self.max_connections > 1
        return True