unstructured_ingest-1.2.32-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (243)
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/README.md +28 -0
  4. unstructured_ingest/cli/__init__.py +0 -0
  5. unstructured_ingest/cli/base/__init__.py +4 -0
  6. unstructured_ingest/cli/base/cmd.py +269 -0
  7. unstructured_ingest/cli/base/dest.py +84 -0
  8. unstructured_ingest/cli/base/importer.py +34 -0
  9. unstructured_ingest/cli/base/src.py +75 -0
  10. unstructured_ingest/cli/cli.py +24 -0
  11. unstructured_ingest/cli/cmds.py +14 -0
  12. unstructured_ingest/cli/utils/__init__.py +0 -0
  13. unstructured_ingest/cli/utils/click.py +237 -0
  14. unstructured_ingest/cli/utils/model_conversion.py +222 -0
  15. unstructured_ingest/data_types/__init__.py +0 -0
  16. unstructured_ingest/data_types/entities.py +17 -0
  17. unstructured_ingest/data_types/file_data.py +116 -0
  18. unstructured_ingest/embed/__init__.py +0 -0
  19. unstructured_ingest/embed/azure_openai.py +63 -0
  20. unstructured_ingest/embed/bedrock.py +323 -0
  21. unstructured_ingest/embed/huggingface.py +69 -0
  22. unstructured_ingest/embed/interfaces.py +146 -0
  23. unstructured_ingest/embed/mixedbreadai.py +134 -0
  24. unstructured_ingest/embed/octoai.py +133 -0
  25. unstructured_ingest/embed/openai.py +142 -0
  26. unstructured_ingest/embed/togetherai.py +116 -0
  27. unstructured_ingest/embed/vertexai.py +109 -0
  28. unstructured_ingest/embed/voyageai.py +130 -0
  29. unstructured_ingest/error.py +156 -0
  30. unstructured_ingest/errors_v2.py +156 -0
  31. unstructured_ingest/interfaces/__init__.py +27 -0
  32. unstructured_ingest/interfaces/connector.py +56 -0
  33. unstructured_ingest/interfaces/downloader.py +90 -0
  34. unstructured_ingest/interfaces/indexer.py +29 -0
  35. unstructured_ingest/interfaces/process.py +22 -0
  36. unstructured_ingest/interfaces/processor.py +88 -0
  37. unstructured_ingest/interfaces/upload_stager.py +89 -0
  38. unstructured_ingest/interfaces/uploader.py +67 -0
  39. unstructured_ingest/logger.py +39 -0
  40. unstructured_ingest/main.py +11 -0
  41. unstructured_ingest/otel.py +128 -0
  42. unstructured_ingest/pipeline/__init__.py +0 -0
  43. unstructured_ingest/pipeline/interfaces.py +211 -0
  44. unstructured_ingest/pipeline/otel.py +32 -0
  45. unstructured_ingest/pipeline/pipeline.py +408 -0
  46. unstructured_ingest/pipeline/steps/__init__.py +0 -0
  47. unstructured_ingest/pipeline/steps/chunk.py +78 -0
  48. unstructured_ingest/pipeline/steps/download.py +206 -0
  49. unstructured_ingest/pipeline/steps/embed.py +77 -0
  50. unstructured_ingest/pipeline/steps/filter.py +35 -0
  51. unstructured_ingest/pipeline/steps/index.py +86 -0
  52. unstructured_ingest/pipeline/steps/partition.py +77 -0
  53. unstructured_ingest/pipeline/steps/stage.py +65 -0
  54. unstructured_ingest/pipeline/steps/uncompress.py +50 -0
  55. unstructured_ingest/pipeline/steps/upload.py +58 -0
  56. unstructured_ingest/processes/__init__.py +18 -0
  57. unstructured_ingest/processes/chunker.py +131 -0
  58. unstructured_ingest/processes/connector_registry.py +69 -0
  59. unstructured_ingest/processes/connectors/__init__.py +129 -0
  60. unstructured_ingest/processes/connectors/airtable.py +238 -0
  61. unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
  62. unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
  63. unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
  64. unstructured_ingest/processes/connectors/astradb.py +592 -0
  65. unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
  66. unstructured_ingest/processes/connectors/chroma.py +193 -0
  67. unstructured_ingest/processes/connectors/confluence.py +527 -0
  68. unstructured_ingest/processes/connectors/couchbase.py +336 -0
  69. unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
  70. unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
  71. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
  72. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
  73. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
  74. unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
  75. unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
  76. unstructured_ingest/processes/connectors/delta_table.py +310 -0
  77. unstructured_ingest/processes/connectors/discord.py +161 -0
  78. unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
  79. unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
  80. unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
  81. unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
  82. unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
  83. unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
  84. unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
  85. unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
  86. unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
  87. unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
  88. unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
  89. unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
  90. unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
  91. unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
  92. unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
  93. unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
  94. unstructured_ingest/processes/connectors/github.py +226 -0
  95. unstructured_ingest/processes/connectors/gitlab.py +270 -0
  96. unstructured_ingest/processes/connectors/google_drive.py +848 -0
  97. unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
  98. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
  99. unstructured_ingest/processes/connectors/jira.py +522 -0
  100. unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
  101. unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
  102. unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
  103. unstructured_ingest/processes/connectors/kafka/local.py +103 -0
  104. unstructured_ingest/processes/connectors/kdbai.py +156 -0
  105. unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
  106. unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
  107. unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
  108. unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
  109. unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
  110. unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
  111. unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
  112. unstructured_ingest/processes/connectors/local.py +227 -0
  113. unstructured_ingest/processes/connectors/milvus.py +311 -0
  114. unstructured_ingest/processes/connectors/mongodb.py +389 -0
  115. unstructured_ingest/processes/connectors/neo4j.py +534 -0
  116. unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
  117. unstructured_ingest/processes/connectors/notion/client.py +349 -0
  118. unstructured_ingest/processes/connectors/notion/connector.py +350 -0
  119. unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
  120. unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
  121. unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
  122. unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
  123. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  124. unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
  125. unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
  126. unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
  127. unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
  128. unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  129. unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  130. unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  131. unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
  132. unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
  133. unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
  134. unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
  135. unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
  136. unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
  137. unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
  138. unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
  139. unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
  140. unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
  141. unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
  142. unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  143. unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  144. unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  145. unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  146. unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
  147. unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
  148. unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
  149. unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
  150. unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  151. unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
  152. unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
  153. unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
  154. unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  155. unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
  156. unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
  157. unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
  158. unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
  159. unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
  160. unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
  161. unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
  162. unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
  163. unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
  164. unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
  165. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  166. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
  167. unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
  168. unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
  169. unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
  170. unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
  171. unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
  172. unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
  173. unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
  174. unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
  175. unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
  176. unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
  177. unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
  178. unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
  179. unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
  180. unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
  181. unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
  182. unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
  183. unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
  184. unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
  185. unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
  186. unstructured_ingest/processes/connectors/onedrive.py +485 -0
  187. unstructured_ingest/processes/connectors/outlook.py +242 -0
  188. unstructured_ingest/processes/connectors/pinecone.py +400 -0
  189. unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
  190. unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
  191. unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
  192. unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
  193. unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
  194. unstructured_ingest/processes/connectors/redisdb.py +214 -0
  195. unstructured_ingest/processes/connectors/salesforce.py +307 -0
  196. unstructured_ingest/processes/connectors/sharepoint.py +282 -0
  197. unstructured_ingest/processes/connectors/slack.py +249 -0
  198. unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
  199. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
  200. unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
  201. unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
  202. unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
  203. unstructured_ingest/processes/connectors/sql/sql.py +456 -0
  204. unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
  205. unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
  206. unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
  207. unstructured_ingest/processes/connectors/utils.py +60 -0
  208. unstructured_ingest/processes/connectors/vectara.py +348 -0
  209. unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
  210. unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
  211. unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
  212. unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
  213. unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
  214. unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
  215. unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
  216. unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
  217. unstructured_ingest/processes/embedder.py +203 -0
  218. unstructured_ingest/processes/filter.py +60 -0
  219. unstructured_ingest/processes/partitioner.py +233 -0
  220. unstructured_ingest/processes/uncompress.py +61 -0
  221. unstructured_ingest/processes/utils/__init__.py +8 -0
  222. unstructured_ingest/processes/utils/blob_storage.py +32 -0
  223. unstructured_ingest/processes/utils/logging/connector.py +365 -0
  224. unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
  225. unstructured_ingest/unstructured_api.py +140 -0
  226. unstructured_ingest/utils/__init__.py +5 -0
  227. unstructured_ingest/utils/chunking.py +56 -0
  228. unstructured_ingest/utils/compression.py +72 -0
  229. unstructured_ingest/utils/constants.py +2 -0
  230. unstructured_ingest/utils/data_prep.py +216 -0
  231. unstructured_ingest/utils/dep_check.py +78 -0
  232. unstructured_ingest/utils/filesystem.py +27 -0
  233. unstructured_ingest/utils/html.py +174 -0
  234. unstructured_ingest/utils/ndjson.py +52 -0
  235. unstructured_ingest/utils/pydantic_models.py +52 -0
  236. unstructured_ingest/utils/string_and_date_utils.py +74 -0
  237. unstructured_ingest/utils/table.py +80 -0
  238. unstructured_ingest/utils/tls.py +15 -0
  239. unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
  240. unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
  241. unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
  242. unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
  243. unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
unstructured_ingest/processes/connectors/delta_table.py
@@ -0,0 +1,310 @@
+ import logging
+ import traceback
+ from dataclasses import dataclass, field
+ from multiprocessing import Queue, current_process
+ from pathlib import Path
+ from typing import TYPE_CHECKING, Any, Optional
+ from urllib.parse import urlparse
+
+ from pydantic import Field, Secret
+
+ from unstructured_ingest.data_types.file_data import FileData
+ from unstructured_ingest.error import DestinationConnectionError, ValueError
+ from unstructured_ingest.interfaces import (
+     AccessConfig,
+     ConnectionConfig,
+     Uploader,
+     UploaderConfig,
+     UploadStager,
+     UploadStagerConfig,
+ )
+ from unstructured_ingest.logger import logger
+ from unstructured_ingest.processes.connector_registry import DestinationRegistryEntry
+ from unstructured_ingest.utils.constants import RECORD_ID_LABEL
+ from unstructured_ingest.utils.data_prep import get_data_df, get_json_data
+ from unstructured_ingest.utils.dep_check import requires_dependencies
+ from unstructured_ingest.utils.table import convert_to_pandas_dataframe
+
+ CONNECTOR_TYPE = "delta_table"
+
+ if TYPE_CHECKING:
+     from pandas import DataFrame
+
+
+ @requires_dependencies(["deltalake"], extras="delta-table")
+ def write_deltalake_with_error_handling(queue, **kwargs):
+     from deltalake.writer import write_deltalake
+
+     try:
+         write_deltalake(**kwargs)
+     except Exception:
+         queue.put(traceback.format_exc())
+
+
+ class DeltaTableAccessConfig(AccessConfig):
+     aws_access_key_id: Optional[str] = Field(default=None, description="AWS Access Key Id")
+     aws_secret_access_key: Optional[str] = Field(default=None, description="AWS Secret Access Key")
+
+
+ class DeltaTableConnectionConfig(ConnectionConfig):
+     access_config: Secret[DeltaTableAccessConfig] = Field(
+         default_factory=lambda: Secret[DeltaTableAccessConfig](DeltaTableAccessConfig()),
+         validate_default=True,
+     )
+     aws_region: Optional[str] = Field(default=None, description="AWS Region")
+     table_uri: str = Field(
+         description=(
+             "Local path or path to the target folder in the S3 bucket, "
+             "formatted as s3://my-bucket/my-folder/"
+         ),
+     )
+
+     def update_storage_options(self, storage_options: dict[str, str]) -> None:
+         secrets = self.access_config.get_secret_value()
+         if self.aws_region and secrets.aws_access_key_id and secrets.aws_secret_access_key:
+             storage_options["AWS_REGION"] = self.aws_region
+             storage_options["AWS_ACCESS_KEY_ID"] = secrets.aws_access_key_id
+             storage_options["AWS_SECRET_ACCESS_KEY"] = secrets.aws_secret_access_key
+             # Delta-rs doesn't support concurrent S3 writes without external locks (DynamoDB).
+             # This flag allows single-writer uploads to S3 without using locks, according to:
+             # https://delta-io.github.io/delta-rs/usage/writing/writing-to-s3-with-locking-provider/
+             storage_options["AWS_S3_ALLOW_UNSAFE_RENAME"] = "true"
+
+
+ class DeltaTableUploadStagerConfig(UploadStagerConfig):
+     pass
+
+
+ @dataclass
+ class DeltaTableUploadStager(UploadStager):
+     upload_stager_config: DeltaTableUploadStagerConfig = field(
+         default_factory=lambda: DeltaTableUploadStagerConfig()
+     )
+
+     def run(  # type: ignore[override]
+         self,
+         elements_filepath: Path,
+         file_data: FileData,
+         output_dir: Path,
+         output_filename: str,
+         **kwargs: Any,
+     ) -> Path:
+         elements_contents = get_json_data(elements_filepath)
+         output_path = Path(output_dir) / Path(f"{output_filename}.parquet")
+
+         df = convert_to_pandas_dataframe(elements_dict=elements_contents)
+         # Ensure per-record overwrite/delete semantics: tag each row with the record identifier
+         df[RECORD_ID_LABEL] = file_data.identifier
+         df = df.dropna(axis=1, how="all")
+         df.to_parquet(output_path)
+
+         return output_path
+
+
+ class DeltaTableUploaderConfig(UploaderConfig):
+     pass
+
+
+ @dataclass
+ class DeltaTableUploader(Uploader):
+     upload_config: DeltaTableUploaderConfig
+     connection_config: DeltaTableConnectionConfig
+     connector_type: str = CONNECTOR_TYPE
+
+     @requires_dependencies(["boto3"], extras="delta-table")
+     def precheck(self):
+         secrets = self.connection_config.access_config.get_secret_value()
+         if (
+             self.connection_config.aws_region
+             and secrets.aws_access_key_id
+             and secrets.aws_secret_access_key
+         ):
+             from boto3 import client
+
+             url = urlparse(self.connection_config.table_uri)
+             bucket_name = url.netloc
+             dir_path = url.path.lstrip("/")
+
+             try:
+                 s3_client = client(
+                     "s3",
+                     aws_access_key_id=secrets.aws_access_key_id,
+                     aws_secret_access_key=secrets.aws_secret_access_key,
+                 )
+                 s3_client.put_object(Bucket=bucket_name, Key=dir_path, Body=b"")
+
+                 response = s3_client.get_bucket_location(Bucket=bucket_name)
+
+                 bucket_region = _normalize_location_constraint(response.get("LocationConstraint"))
+
+                 if self.connection_config.aws_region != bucket_region:
+                     raise ValueError(
+                         "Wrong AWS region provided: bucket "
+                         f"'{bucket_name}' resides in '{bucket_region}', "
+                         "but configuration specifies "
+                         f"'{self.connection_config.aws_region}'."
+                     )
+
+             except Exception as e:
+                 logger.error(f"failed to validate connection: {e}", exc_info=True)
+                 raise DestinationConnectionError(f"failed to validate connection: {e}")
+
+     @requires_dependencies(["tenacity"], extras="delta-table")
+     def upload_dataframe(self, df: "DataFrame", file_data: FileData) -> None:
+         upload_path = self.connection_config.table_uri
+         logger.info(
+             f"writing {len(df)} rows to destination table at {upload_path}\ndtypes: {df.dtypes}",
+         )
+         storage_options: dict[str, str] = {}
+         self.connection_config.update_storage_options(storage_options=storage_options)
+
+         # Decide whether the Delta table already exists. If it does, we first delete all rows
+         # belonging to the current record and then append the fresh data. Otherwise we will
+         # create a brand-new table via an overwrite.
+
+         mode = "overwrite"
+         try:
+             from deltalake import DeltaTable  # pylint: disable=import-error
+
+             dt = DeltaTable(upload_path, storage_options=storage_options)
+             logger.debug(f"Table exists: deleting rows for {file_data.identifier}")
+             # Table exists – remove any previous rows for this record_id so that appending is
+             # effectively an idempotent overwrite for the record.
+             dt.delete(predicate=f"{RECORD_ID_LABEL} = '{file_data.identifier}'")
+             mode = "append"
+         except Exception:
+             # Table does not exist yet (or cannot be opened) – we will create it below with
+             # mode="overwrite". All other failures will be captured later by the writer.
+             logger.debug("Table does not exist: creating new table")
+
+         writer_kwargs = {
+             "table_or_uri": upload_path,
+             "data": df,
+             "mode": mode,
+             "schema_mode": "merge",
+             "storage_options": storage_options,
+         }
+
+         from tenacity import (
+             before_log,
+             retry,
+             retry_if_exception,
+             stop_after_attempt,
+             wait_random,
+         )
+
+         def _is_commit_conflict(exc: BaseException) -> bool:  # noqa: ANN401
+             """Return True if exception looks like a Delta Lake commit conflict.
+
+             Besides the canonical *CommitFailed* / *Metadata changed* errors that
+             deltalake surfaces when two writers clash, we occasionally hit
+             messages such as *Delta transaction failed, version 0 already
+             exists* while multiple processes race to create the very first log
+             entry. These situations are equally safe to retry, so detect them
+             too.
+             """
+
+             return isinstance(exc, RuntimeError) and any(
+                 marker in str(exc)
+                 for marker in (
+                     "CommitFailed",
+                     "Metadata changed",
+                     "version 0 already exists",
+                     "version already exists",
+                     "Delta transaction failed",
+                 )
+             )
+
+         @retry(
+             stop=stop_after_attempt(10),
+             wait=wait_random(min=0.2, max=1.0),
+             before=before_log(logger, logging.DEBUG),
+             retry=retry_if_exception(_is_commit_conflict),
+             reraise=True,
+         )
+         def _single_attempt() -> None:
+             """One optimistic transaction: delete old rows, then append new ones."""
+
+             # NOTE: deltalake writer on Linux sometimes can finish but still trigger a SIGABRT and
+             # cause ingest to fail, even though all tasks are completed normally. Putting the writer
+             # into a process mitigates this issue by ensuring python interpreter waits properly for
+             # deltalake's rust backend to finish
+             # Use a multiprocessing context that relies on 'spawn' to avoid inheriting the
+             # parent process' Tokio runtime, which leads to `pyo3_runtime.PanicException`.
+             from multiprocessing import get_context
+
+             ctx = get_context("spawn")
+             queue: "Queue[str]" = ctx.Queue()
+
+             if current_process().daemon:
+                 # write_deltalake_with_error_handling will push any traceback to our queue
+                 write_deltalake_with_error_handling(queue=queue, **writer_kwargs)
+             else:
+                 # On non-daemon processes we still guard against SIGABRT by running in a
+                 # dedicated subprocess created via the 'spawn' method.
+                 writer = ctx.Process(
+                     target=write_deltalake_with_error_handling,
+                     kwargs={"queue": queue, **writer_kwargs},
+                 )
+                 writer.start()
+                 writer.join()
+
+             # First surface any traceback captured inside the subprocess so users see the real
+             # root-cause instead of a generic non-zero exit code.
+             if not queue.empty():
+                 error_message = queue.get()
+                 logger.error("Exception occurred in write_deltalake: %s", error_message)
+                 raise DestinationConnectionError(f"Error in write_deltalake: {error_message}")
+
+             # If the subprocess terminated abnormally but produced no traceback (e.g., SIGABRT),
+             # still raise a helpful error for callers.
+             if not current_process().daemon and writer.exitcode != 0:
+                 raise DestinationConnectionError(
+                     f"write_deltalake subprocess exited with code {writer.exitcode}"
+                 )
+
+         _single_attempt()
+
+     @requires_dependencies(["pandas"], extras="delta-table")
+     def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+         import pandas as pd
+
+         df = pd.DataFrame(data=data)
+         self.upload_dataframe(df=df, file_data=file_data)
+
+     @requires_dependencies(["pandas"], extras="delta-table")
+     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:  # type: ignore[override]
+         df = get_data_df(path)
+         self.upload_dataframe(df=df, file_data=file_data)
+
+
+ def _normalize_location_constraint(location: Optional[str]) -> str:
+     """Return canonical AWS region name for a LocationConstraint value.
+
+     The S3 GetBucketLocation operation returns `null` (`None`) for buckets in
+     the legacy `us-east-1` region and `EU` for very old buckets that were
+     created in the historical `EU` region (now `eu-west-1`). For every other
+     region the API already returns the correct AWS region string. This helper
+     normalises the legacy values so callers can reliably compare regions.
+
+     Args:
+         location: The LocationConstraint value returned by the S3 GetBucketLocation operation.
+
+     Returns:
+         The canonical AWS region name for the given location constraint.
+     """
+
+     if location is None:
+         return "us-east-1"
+     if location == "EU":
+         return "eu-west-1"
+     return location
+
+
+ delta_table_destination_entry = DestinationRegistryEntry(
+     connection_config=DeltaTableConnectionConfig,
+     uploader=DeltaTableUploader,
+     uploader_config=DeltaTableUploaderConfig,
+     upload_stager=DeltaTableUploadStager,
+     upload_stager_config=DeltaTableUploadStagerConfig,
+ )
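
For context, this destination is driven entirely by the Pydantic configs above. A minimal sketch of wiring it up by hand might look like the following; every credential and URI is a placeholder, and calling precheck() directly (outside the normal pipeline runner) is assumed purely for illustration:

# Illustrative sketch only: credentials and table_uri are placeholders,
# not values shipped with the package. Requires the "delta-table" extras.
from unstructured_ingest.processes.connectors.delta_table import (
    DeltaTableAccessConfig,
    DeltaTableConnectionConfig,
    DeltaTableUploader,
    DeltaTableUploaderConfig,
)

connection_config = DeltaTableConnectionConfig(
    access_config=DeltaTableAccessConfig(
        aws_access_key_id="AKIA...",  # placeholder
        aws_secret_access_key="...",  # placeholder
    ),
    aws_region="us-east-1",
    table_uri="s3://my-bucket/my-folder/",
)
uploader = DeltaTableUploader(
    upload_config=DeltaTableUploaderConfig(),
    connection_config=connection_config,
)
uploader.precheck()  # verifies bucket access and that aws_region matches the bucket

Note that when all three credential fields are set, update_storage_options also injects AWS_S3_ALLOW_UNSAFE_RENAME, so S3 writes assume a single writer per table.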
unstructured_ingest/processes/connectors/discord.py
@@ -0,0 +1,161 @@
+ import datetime as dt
+ from dataclasses import dataclass
+ from typing import TYPE_CHECKING, Any, Generator, Optional
+
+ from pydantic import Field, Secret
+
+ from unstructured_ingest.data_types.file_data import (
+     FileData,
+     FileDataSourceMetadata,
+     SourceIdentifiers,
+ )
+ from unstructured_ingest.error import UserAuthError, ValueError
+ from unstructured_ingest.interfaces import (
+     AccessConfig,
+     ConnectionConfig,
+     Downloader,
+     DownloaderConfig,
+     DownloadResponse,
+     Indexer,
+     IndexerConfig,
+ )
+ from unstructured_ingest.logger import logger
+ from unstructured_ingest.processes.connector_registry import SourceRegistryEntry
+ from unstructured_ingest.utils.dep_check import requires_dependencies
+
+ if TYPE_CHECKING:
+     from discord import Client as DiscordClient
+
+ CONNECTOR_TYPE = "discord"
+
+
+ class DiscordAccessConfig(AccessConfig):
+     token: str = Field(description="Discord API token")
+
+
+ class DiscordConnectionConfig(ConnectionConfig):
+     access_config: Secret[DiscordAccessConfig] = Field(
+         default=DiscordAccessConfig, validate_default=True
+     )
+
+     @requires_dependencies(["discord"], extras="discord")
+     def get_client(self) -> "DiscordClient":
+         import discord
+
+         intents = discord.Intents.default()
+         intents.message_content = True
+         return discord.Client(intents=intents)
+
+
+ class DiscordIndexerConfig(IndexerConfig):
+     channels: list[str] = Field(
+         default=None,
+         description="List of Discord channel IDs to process",
+     )
+
+
+ @dataclass
+ class DiscordIndexer(Indexer):
+     connection_config: DiscordConnectionConfig
+     index_config: DiscordIndexerConfig
+
+     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+         self.connection_config.get_client()
+         channels_to_process: set[str] = set(self.index_config.channels or [])
+
+         for channel_id in list(channels_to_process):
+             file_data = self.get_channel_file_data(channel_id=channel_id)
+             if file_data:
+                 yield file_data
+
+     def precheck(self) -> None:
+         if not self.connection_config.access_config.get_secret_value().token:
+             raise UserAuthError("Discord token is missing")
+         if not self.index_config.channels:
+             raise ValueError("No channels provided")
+
+     def get_channel_file_data(self, channel_id: str) -> Optional[FileData]:
+         # Fetch channel metadata
+         identifier = channel_id
+         channel_id = f"{channel_id}.txt"
+         source_identifiers = SourceIdentifiers(
+             filename=channel_id,
+             fullpath=channel_id,
+         )
+         metadata = FileDataSourceMetadata(
+             record_locator={"channel_id": identifier},
+             date_processed=str(dt.datetime.utcnow().isoformat()),
+         )
+         return FileData(
+             identifier=identifier,
+             connector_type=CONNECTOR_TYPE,
+             source_identifiers=source_identifiers,
+             metadata=metadata,
+             display_name=source_identifiers.fullpath,
+         )
+
+
+ class DiscordDownloaderConfig(DownloaderConfig):
+     limit: Optional[int] = Field(
+         default=100, description="Limit on how many messages per channel to pull in"
+     )
+
+
+ @dataclass
+ class DiscordDownloader(Downloader):
+     connection_config: DiscordConnectionConfig
+     download_config: DiscordDownloaderConfig
+     connector_type: str = CONNECTOR_TYPE
+
+     def is_async(self) -> bool:
+         return True
+
+     def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
+         # Synchronous run is not implemented
+         raise NotImplementedError()
+
+     async def run_async(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
+         record_locator = file_data.metadata.record_locator
+
+         if "channel_id" not in record_locator:
+             raise ValueError(f"No channel id in file data record locator: {record_locator}")
+
+         client = self.connection_config.get_client()
+         download_path = self.get_download_path(file_data=file_data)
+         download_path.parent.mkdir(parents=True, exist_ok=True)
+
+         messages = []
+         channel_id = record_locator["channel_id"]
+
+         @client.event
+         async def on_ready():
+             logger.debug("Discord Bot is ready")
+             channel = client.get_channel(int(channel_id))
+             if not channel:
+                 raise ValueError(f"channel not found for id: {channel_id}")
+             logger.debug(f"Processing messages for channel: {channel.name}")
+             async for msg in channel.history(limit=self.download_config.limit):
+                 messages.append(msg)
+             logger.debug(f"Fetched {len(messages)} messages")
+             await client.close()
+
+         try:
+             await client.start(self.connection_config.access_config.get_secret_value().token)
+         finally:
+             await client.close()
+
+         content = "\n".join([message.content for message in messages])
+
+         with open(download_path, "w") as file:
+             file.write(content)
+
+         return self.generate_download_response(file_data=file_data, download_path=download_path)
+
+
+ discord_source_entry = SourceRegistryEntry(
+     indexer=DiscordIndexer,
+     indexer_config=DiscordIndexerConfig,
+     downloader=DiscordDownloader,
+     downloader_config=DiscordDownloaderConfig,
+     connection_config=DiscordConnectionConfig,
+ )
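
As a usage sketch for this source, iterating the indexer directly (rather than through the pipeline) might look like the following; the bot token and channel ID are placeholders, and the "discord" extras are assumed to be installed:

# Illustrative sketch only: token and channel IDs are placeholders.
from unstructured_ingest.processes.connectors.discord import (
    DiscordAccessConfig,
    DiscordConnectionConfig,
    DiscordIndexer,
    DiscordIndexerConfig,
)

connection_config = DiscordConnectionConfig(
    access_config=DiscordAccessConfig(token="MTA...")  # placeholder bot token
)
indexer = DiscordIndexer(
    connection_config=connection_config,
    index_config=DiscordIndexerConfig(channels=["1234567890"]),
)
indexer.precheck()  # fails fast on a missing token or an empty channel list
for file_data in indexer.run():
    # one FileData per channel; the downloader later writes <channel_id>.txt
    print(file_data.identifier, file_data.source_identifiers.fullpath)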
unstructured_ingest/processes/connectors/duckdb/__init__.py
@@ -0,0 +1,15 @@
+ from __future__ import annotations
+
+ from unstructured_ingest.processes.connector_registry import (
+     add_destination_entry,
+ )
+
+ from .duckdb import CONNECTOR_TYPE as DUCKDB_CONNECTOR_TYPE
+ from .duckdb import duckdb_destination_entry
+ from .motherduck import CONNECTOR_TYPE as MOTHERDUCK_CONNECTOR_TYPE
+ from .motherduck import motherduck_destination_entry
+
+ add_destination_entry(destination_type=DUCKDB_CONNECTOR_TYPE, entry=duckdb_destination_entry)
+ add_destination_entry(
+     destination_type=MOTHERDUCK_CONNECTOR_TYPE, entry=motherduck_destination_entry
+ )
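
add_destination_entry is the same hook the package uses to register its built-ins, so third-party code could in principle call it too. A sketch under that assumption, re-registering the existing DuckDB entry under a hypothetical alias (the "duckdb_alias" name is illustrative only):

# Hypothetical example: "duckdb_alias" is not a name defined by the package.
from unstructured_ingest.processes.connector_registry import add_destination_entry
from unstructured_ingest.processes.connectors.duckdb.duckdb import duckdb_destination_entry

add_destination_entry(destination_type="duckdb_alias", entry=duckdb_destination_entry)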
unstructured_ingest/processes/connectors/duckdb/base.py
@@ -0,0 +1,103 @@
+ from dataclasses import dataclass
+ from pathlib import Path
+ from typing import Any
+
+ from unstructured_ingest.data_types.file_data import FileData
+ from unstructured_ingest.interfaces import UploadStager
+ from unstructured_ingest.utils.data_prep import get_enhanced_element_id, get_json_data, write_data
+ from unstructured_ingest.utils.dep_check import requires_dependencies
+
+ _COLUMNS = (
+     "id",
+     "element_id",
+     "text",
+     "embeddings",
+     "type",
+     "system",
+     "layout_width",
+     "layout_height",
+     "points",
+     "url",
+     "version",
+     "date_created",
+     "date_modified",
+     "date_processed",
+     "permissions_data",
+     "record_locator",
+     "category_depth",
+     "parent_id",
+     "attached_filename",
+     "filetype",
+     "last_modified",
+     "file_directory",
+     "filename",
+     "languages",
+     "page_number",
+     "links",
+     "page_name",
+     "link_urls",
+     "link_texts",
+     "sent_from",
+     "sent_to",
+     "subject",
+     "section",
+     "header_footer_type",
+     "emphasized_text_contents",
+     "emphasized_text_tags",
+     "text_as_html",
+     "regex_metadata",
+     "detection_class_prob",
+ )
+
+ # _DATE_COLUMNS = ("date_created", "date_modified", "date_processed", "last_modified")
+
+
+ @dataclass
+ class BaseDuckDBUploadStager(UploadStager):
+     def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
+         data = element_dict.copy()
+         metadata: dict[str, Any] = data.pop("metadata", {})
+         data_source = metadata.pop("data_source", {})
+         coordinates = metadata.pop("coordinates", {})
+
+         data.update(metadata)
+         data.update(data_source)
+         data.update(coordinates)
+
+         data["id"] = get_enhanced_element_id(element_dict=data, file_data=file_data)
+
+         # remove extraneous, not supported columns
+         data = {k: v for k, v in data.items() if k in _COLUMNS}
+         return data
+
+     @requires_dependencies(["pandas"], extras="duckdb")
+     def run(
+         self,
+         elements_filepath: Path,
+         file_data: FileData,
+         output_dir: Path,
+         output_filename: str,
+         **kwargs: Any,
+     ) -> Path:
+         import pandas as pd
+
+         elements_contents = get_json_data(path=elements_filepath)
+         output_filename_suffix = Path(elements_filepath).suffix
+         output_filename = f"{Path(output_filename).stem}{output_filename_suffix}"
+         output_path = self.get_output_path(output_filename=output_filename, output_dir=output_dir)
+
+         output = [
+             self.conform_dict(element_dict=element_dict, file_data=file_data)
+             for element_dict in elements_contents
+         ]
+         df = pd.DataFrame(data=output)
+
+         for column in filter(
+             lambda x: x in df.columns,
+             ("version", "page_number", "regex_metadata"),
+         ):
+             df[column] = df[column].apply(str)
+
+         data = df.to_dict(orient="records")
+         write_data(path=output_path, data=data)
+         return output_path
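
To see what conform_dict does to an element, here is a sketch that replays its flattening steps on a contrived element dict; the id assignment via get_enhanced_element_id is omitted because it needs a FileData, and the element's field values are invented for illustration:

# Contrived element dict, replaying the flattening in BaseDuckDBUploadStager.conform_dict.
from unstructured_ingest.processes.connectors.duckdb.base import _COLUMNS

element = {
    "element_id": "abc123",
    "type": "NarrativeText",
    "text": "Hello world",
    "metadata": {
        "filetype": "text/plain",
        "languages": ["eng"],
        "data_source": {"record_locator": {"path": "/tmp/in.txt"}},
        "coordinates": {"layout_width": 612, "layout_height": 792},
        "unknown_key": "dropped by the column filter",
    },
}

# Hoist metadata, data_source, and coordinates to the top level, then
# keep only the keys that match supported columns.
data = element.copy()
metadata = data.pop("metadata", {})
data_source = metadata.pop("data_source", {})
coordinates = metadata.pop("coordinates", {})
data.update(metadata)
data.update(data_source)
data.update(coordinates)
flattened = {k: v for k, v in data.items() if k in _COLUMNS}
print(sorted(flattened))
# ['element_id', 'filetype', 'languages', 'layout_height', 'layout_width',
#  'record_locator', 'text', 'type']  -- "unknown_key" is filtered out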