unstructured-ingest 1.2.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (243)
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/README.md +28 -0
  4. unstructured_ingest/cli/__init__.py +0 -0
  5. unstructured_ingest/cli/base/__init__.py +4 -0
  6. unstructured_ingest/cli/base/cmd.py +269 -0
  7. unstructured_ingest/cli/base/dest.py +84 -0
  8. unstructured_ingest/cli/base/importer.py +34 -0
  9. unstructured_ingest/cli/base/src.py +75 -0
  10. unstructured_ingest/cli/cli.py +24 -0
  11. unstructured_ingest/cli/cmds.py +14 -0
  12. unstructured_ingest/cli/utils/__init__.py +0 -0
  13. unstructured_ingest/cli/utils/click.py +237 -0
  14. unstructured_ingest/cli/utils/model_conversion.py +222 -0
  15. unstructured_ingest/data_types/__init__.py +0 -0
  16. unstructured_ingest/data_types/entities.py +17 -0
  17. unstructured_ingest/data_types/file_data.py +116 -0
  18. unstructured_ingest/embed/__init__.py +0 -0
  19. unstructured_ingest/embed/azure_openai.py +63 -0
  20. unstructured_ingest/embed/bedrock.py +323 -0
  21. unstructured_ingest/embed/huggingface.py +69 -0
  22. unstructured_ingest/embed/interfaces.py +146 -0
  23. unstructured_ingest/embed/mixedbreadai.py +134 -0
  24. unstructured_ingest/embed/octoai.py +133 -0
  25. unstructured_ingest/embed/openai.py +142 -0
  26. unstructured_ingest/embed/togetherai.py +116 -0
  27. unstructured_ingest/embed/vertexai.py +109 -0
  28. unstructured_ingest/embed/voyageai.py +130 -0
  29. unstructured_ingest/error.py +156 -0
  30. unstructured_ingest/errors_v2.py +156 -0
  31. unstructured_ingest/interfaces/__init__.py +27 -0
  32. unstructured_ingest/interfaces/connector.py +56 -0
  33. unstructured_ingest/interfaces/downloader.py +90 -0
  34. unstructured_ingest/interfaces/indexer.py +29 -0
  35. unstructured_ingest/interfaces/process.py +22 -0
  36. unstructured_ingest/interfaces/processor.py +88 -0
  37. unstructured_ingest/interfaces/upload_stager.py +89 -0
  38. unstructured_ingest/interfaces/uploader.py +67 -0
  39. unstructured_ingest/logger.py +39 -0
  40. unstructured_ingest/main.py +11 -0
  41. unstructured_ingest/otel.py +128 -0
  42. unstructured_ingest/pipeline/__init__.py +0 -0
  43. unstructured_ingest/pipeline/interfaces.py +211 -0
  44. unstructured_ingest/pipeline/otel.py +32 -0
  45. unstructured_ingest/pipeline/pipeline.py +408 -0
  46. unstructured_ingest/pipeline/steps/__init__.py +0 -0
  47. unstructured_ingest/pipeline/steps/chunk.py +78 -0
  48. unstructured_ingest/pipeline/steps/download.py +206 -0
  49. unstructured_ingest/pipeline/steps/embed.py +77 -0
  50. unstructured_ingest/pipeline/steps/filter.py +35 -0
  51. unstructured_ingest/pipeline/steps/index.py +86 -0
  52. unstructured_ingest/pipeline/steps/partition.py +77 -0
  53. unstructured_ingest/pipeline/steps/stage.py +65 -0
  54. unstructured_ingest/pipeline/steps/uncompress.py +50 -0
  55. unstructured_ingest/pipeline/steps/upload.py +58 -0
  56. unstructured_ingest/processes/__init__.py +18 -0
  57. unstructured_ingest/processes/chunker.py +131 -0
  58. unstructured_ingest/processes/connector_registry.py +69 -0
  59. unstructured_ingest/processes/connectors/__init__.py +129 -0
  60. unstructured_ingest/processes/connectors/airtable.py +238 -0
  61. unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
  62. unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
  63. unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
  64. unstructured_ingest/processes/connectors/astradb.py +592 -0
  65. unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
  66. unstructured_ingest/processes/connectors/chroma.py +193 -0
  67. unstructured_ingest/processes/connectors/confluence.py +527 -0
  68. unstructured_ingest/processes/connectors/couchbase.py +336 -0
  69. unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
  70. unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
  71. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
  72. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
  73. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
  74. unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
  75. unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
  76. unstructured_ingest/processes/connectors/delta_table.py +310 -0
  77. unstructured_ingest/processes/connectors/discord.py +161 -0
  78. unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
  79. unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
  80. unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
  81. unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
  82. unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
  83. unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
  84. unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
  85. unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
  86. unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
  87. unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
  88. unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
  89. unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
  90. unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
  91. unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
  92. unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
  93. unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
  94. unstructured_ingest/processes/connectors/github.py +226 -0
  95. unstructured_ingest/processes/connectors/gitlab.py +270 -0
  96. unstructured_ingest/processes/connectors/google_drive.py +848 -0
  97. unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
  98. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
  99. unstructured_ingest/processes/connectors/jira.py +522 -0
  100. unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
  101. unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
  102. unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
  103. unstructured_ingest/processes/connectors/kafka/local.py +103 -0
  104. unstructured_ingest/processes/connectors/kdbai.py +156 -0
  105. unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
  106. unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
  107. unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
  108. unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
  109. unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
  110. unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
  111. unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
  112. unstructured_ingest/processes/connectors/local.py +227 -0
  113. unstructured_ingest/processes/connectors/milvus.py +311 -0
  114. unstructured_ingest/processes/connectors/mongodb.py +389 -0
  115. unstructured_ingest/processes/connectors/neo4j.py +534 -0
  116. unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
  117. unstructured_ingest/processes/connectors/notion/client.py +349 -0
  118. unstructured_ingest/processes/connectors/notion/connector.py +350 -0
  119. unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
  120. unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
  121. unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
  122. unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
  123. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  124. unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
  125. unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
  126. unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
  127. unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
  128. unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  129. unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  130. unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  131. unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
  132. unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
  133. unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
  134. unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
  135. unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
  136. unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
  137. unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
  138. unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
  139. unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
  140. unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
  141. unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
  142. unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  143. unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  144. unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  145. unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  146. unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
  147. unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
  148. unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
  149. unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
  150. unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  151. unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
  152. unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
  153. unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
  154. unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  155. unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
  156. unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
  157. unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
  158. unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
  159. unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
  160. unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
  161. unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
  162. unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
  163. unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
  164. unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
  165. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  166. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
  167. unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
  168. unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
  169. unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
  170. unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
  171. unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
  172. unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
  173. unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
  174. unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
  175. unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
  176. unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
  177. unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
  178. unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
  179. unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
  180. unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
  181. unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
  182. unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
  183. unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
  184. unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
  185. unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
  186. unstructured_ingest/processes/connectors/onedrive.py +485 -0
  187. unstructured_ingest/processes/connectors/outlook.py +242 -0
  188. unstructured_ingest/processes/connectors/pinecone.py +400 -0
  189. unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
  190. unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
  191. unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
  192. unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
  193. unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
  194. unstructured_ingest/processes/connectors/redisdb.py +214 -0
  195. unstructured_ingest/processes/connectors/salesforce.py +307 -0
  196. unstructured_ingest/processes/connectors/sharepoint.py +282 -0
  197. unstructured_ingest/processes/connectors/slack.py +249 -0
  198. unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
  199. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
  200. unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
  201. unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
  202. unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
  203. unstructured_ingest/processes/connectors/sql/sql.py +456 -0
  204. unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
  205. unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
  206. unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
  207. unstructured_ingest/processes/connectors/utils.py +60 -0
  208. unstructured_ingest/processes/connectors/vectara.py +348 -0
  209. unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
  210. unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
  211. unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
  212. unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
  213. unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
  214. unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
  215. unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
  216. unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
  217. unstructured_ingest/processes/embedder.py +203 -0
  218. unstructured_ingest/processes/filter.py +60 -0
  219. unstructured_ingest/processes/partitioner.py +233 -0
  220. unstructured_ingest/processes/uncompress.py +61 -0
  221. unstructured_ingest/processes/utils/__init__.py +8 -0
  222. unstructured_ingest/processes/utils/blob_storage.py +32 -0
  223. unstructured_ingest/processes/utils/logging/connector.py +365 -0
  224. unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
  225. unstructured_ingest/unstructured_api.py +140 -0
  226. unstructured_ingest/utils/__init__.py +5 -0
  227. unstructured_ingest/utils/chunking.py +56 -0
  228. unstructured_ingest/utils/compression.py +72 -0
  229. unstructured_ingest/utils/constants.py +2 -0
  230. unstructured_ingest/utils/data_prep.py +216 -0
  231. unstructured_ingest/utils/dep_check.py +78 -0
  232. unstructured_ingest/utils/filesystem.py +27 -0
  233. unstructured_ingest/utils/html.py +174 -0
  234. unstructured_ingest/utils/ndjson.py +52 -0
  235. unstructured_ingest/utils/pydantic_models.py +52 -0
  236. unstructured_ingest/utils/string_and_date_utils.py +74 -0
  237. unstructured_ingest/utils/table.py +80 -0
  238. unstructured_ingest/utils/tls.py +15 -0
  239. unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
  240. unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
  241. unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
  242. unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
  243. unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
@@ -0,0 +1,130 @@
1
+ from contextlib import contextmanager
2
+ from dataclasses import dataclass, field
3
+ from pathlib import Path
4
+ from typing import TYPE_CHECKING, Any, Generator, Optional
5
+
6
+ from pydantic import Field, Secret
7
+
8
+ from unstructured_ingest.data_types.file_data import FileData
9
+ from unstructured_ingest.error import DestinationConnectionError, ValueError
10
+ from unstructured_ingest.interfaces import (
11
+ AccessConfig,
12
+ ConnectionConfig,
13
+ Uploader,
14
+ UploaderConfig,
15
+ UploadStagerConfig,
16
+ )
17
+ from unstructured_ingest.logger import logger
18
+ from unstructured_ingest.processes.connector_registry import DestinationRegistryEntry
19
+ from unstructured_ingest.processes.connectors.duckdb.base import BaseDuckDBUploadStager
20
+ from unstructured_ingest.utils.data_prep import get_data_df
21
+ from unstructured_ingest.utils.dep_check import requires_dependencies
22
+
23
+ if TYPE_CHECKING:
24
+ from duckdb import DuckDBPyConnection as DuckDBConnection
25
+ from pandas import DataFrame
26
+
27
+ CONNECTOR_TYPE = "duckdb"
28
+
29
+
30
# Access settings for the DuckDB connector. The class body declares no fields
# of its own; it only gives the connection config a typed secret container.
class DuckDBAccessConfig(AccessConfig):
    ...
32
+
33
+
34
# Connection settings for a file-backed DuckDB destination, plus helpers that
# open a client or cursor on that database.
# (Written as a comment rather than a class docstring so the model's generated
# schema description stays unchanged.)
class DuckDBConnectionConfig(ConnectionConfig):
    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
    database: Optional[str] = Field(
        default=None,
        description="Database name. Path to the DuckDB .db file. If the file does "
        "not exist, it will be created at the specified path.",
    )
    db_schema: Optional[str] = Field(
        default="main",
        description="Schema name. Schema in the database where the elements table is located.",
    )
    table: Optional[str] = Field(
        default="elements",
        description="Table name. Table name into which the elements data is inserted.",
    )
    access_config: Secret[DuckDBAccessConfig] = Field(
        default=DuckDBAccessConfig(), validate_default=True
    )

    def model_post_init(self, context: Any) -> None:
        """Run the `database` validation once the model has been built.

        `__post_init__` is a dataclass hook and is not invoked for pydantic
        models, so without this override the check below would never run.
        NOTE(review): assumes `ConnectionConfig` is a pydantic v2 model (the
        fields here use pydantic `Field`/`Secret`) - confirm.
        """
        super().model_post_init(context)
        self.__post_init__()

    def __post_init__(self):
        """Raise `ValueError` when no `database` path was supplied.

        Kept under its original name so any explicit caller keeps working.
        """
        if self.database is None:
            raise ValueError(
                "A DuckDB connection requires a path to a *.db or *.duckdb file "
                "through the `database` argument"
            )

    @requires_dependencies(["duckdb"], extras="duckdb")
    @contextmanager
    def get_client(self) -> Generator["DuckDBConnection", None, None]:
        """Yield a connection opened with `duckdb.connect(self.database)`.

        The connection is used as a context manager, so leaving the `with`
        block of the caller also leaves the connection's own context.
        """
        # Imported lazily: `duckdb` is an optional extra checked by the decorator.
        import duckdb

        with duckdb.connect(self.database) as client:
            yield client

    @contextmanager
    def get_cursor(self) -> Generator["DuckDBConnection", None, None]:
        """Yield a cursor created from a freshly opened client connection."""
        with self.get_client() as client, client.cursor() as cursor:
            yield cursor
72
+
73
+
74
# Stager settings for the DuckDB connector; no fields are declared here beyond
# whatever UploadStagerConfig itself provides.
class DuckDBUploadStagerConfig(UploadStagerConfig):
    ...
76
+
77
+
78
# DuckDB flavour of the shared stager: it only pins the config type and gives
# each instance its own default DuckDBUploadStagerConfig.
@dataclass
class DuckDBUploadStager(BaseDuckDBUploadStager):
    upload_stager_config: DuckDBUploadStagerConfig = field(
        default_factory=DuckDBUploadStagerConfig
    )
83
+
84
+
85
# Uploader settings for the DuckDB connector. The field description itself
# marks `batch_size` as not used.
class DuckDBUploaderConfig(UploaderConfig):
    batch_size: int = Field(
        description="[Not-used] Number of records per batch",
        default=50,
    )
87
+
88
+
89
@dataclass
class DuckDBUploader(Uploader):
    """Uploader that inserts element records into a DuckDB table.

    The target is `<db_schema>.<table>` taken from `connection_config`.
    """

    connector_type: str = CONNECTOR_TYPE
    upload_config: DuckDBUploaderConfig
    connection_config: DuckDBConnectionConfig

    def precheck(self) -> None:
        """Verify the database is reachable by running `SELECT 1;`.

        Raises:
            DestinationConnectionError: if opening the cursor or running the
                probe query fails; the original exception is chained as the
                cause so the root failure stays visible in tracebacks.
        """
        try:
            with self.connection_config.get_cursor() as cursor:
                cursor.execute("SELECT 1;")
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}") from e

    def upload_dataframe(self, df: "DataFrame") -> None:
        """Insert every row of `df` into the configured table, matching columns by name."""
        logger.debug(f"uploading {len(df)} entries to {self.connection_config.database} ")

        target = f"{self.connection_config.db_schema}.{self.connection_config.table}"
        with self.connection_config.get_client() as conn:
            # The SQL text refers to `df` by name; presumably DuckDB resolves it to
            # the local DataFrame above (replacement scan), so keep the name `df`.
            conn.query(f"INSERT INTO {target} BY NAME SELECT * FROM df")

    @requires_dependencies(["pandas"], extras="duckdb")
    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
        """Upload in-memory records by wrapping them in a pandas DataFrame."""
        import pandas as pd

        df = pd.DataFrame(data=data)
        self.upload_dataframe(df=df)

    @requires_dependencies(["pandas"], extras="duckdb")
    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        """Upload the records stored at `path`, loaded through `get_data_df`."""
        df = get_data_df(path)
        self.upload_dataframe(df=df)
122
+
123
+
124
# Destination registry entry bundling the DuckDB connection config, uploader
# and stager classes together with their config classes.
duckdb_destination_entry = DestinationRegistryEntry(
    uploader=DuckDBUploader,
    uploader_config=DuckDBUploaderConfig,
    upload_stager=DuckDBUploadStager,
    upload_stager_config=DuckDBUploadStagerConfig,
    connection_config=DuckDBConnectionConfig,
)
@@ -0,0 +1,130 @@
1
+ from contextlib import contextmanager
2
+ from dataclasses import dataclass, field
3
+ from pathlib import Path
4
+ from typing import TYPE_CHECKING, Any, Generator, Optional
5
+
6
+ from pydantic import Field, Secret
7
+
8
+ from unstructured_ingest.__version__ import __version__ as unstructured_io_ingest_version
9
+ from unstructured_ingest.data_types.file_data import FileData
10
+ from unstructured_ingest.error import DestinationConnectionError
11
+ from unstructured_ingest.interfaces import (
12
+ AccessConfig,
13
+ ConnectionConfig,
14
+ Uploader,
15
+ UploaderConfig,
16
+ UploadStagerConfig,
17
+ )
18
+ from unstructured_ingest.logger import logger
19
+ from unstructured_ingest.processes.connector_registry import DestinationRegistryEntry
20
+ from unstructured_ingest.processes.connectors.duckdb.base import BaseDuckDBUploadStager
21
+ from unstructured_ingest.utils.data_prep import get_data_df
22
+ from unstructured_ingest.utils.dep_check import requires_dependencies
23
+
24
+ if TYPE_CHECKING:
25
+ from duckdb import DuckDBPyConnection as MotherDuckConnection
26
+ from pandas import DataFrame
27
+
28
+ CONNECTOR_TYPE = "motherduck"
29
+
30
+
31
# Secret settings for the MotherDuck connector.
class MotherDuckAccessConfig(AccessConfig):
    # Annotated Optional because the default is None; with a plain `str`
    # annotation an explicit `md_token=None` would be rejected by validation
    # even though it is the declared default.
    md_token: Optional[str] = Field(default=None, description="MotherDuck token")
33
+
34
+
35
# Connection settings for a MotherDuck destination, plus helpers that open a
# client or cursor with the configured database selected.
# (Written as a comment rather than a class docstring so the model's generated
# schema description stays unchanged.)
class MotherDuckConnectionConfig(ConnectionConfig):
    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
    database: str = Field(
        description="Database name. Name of the MotherDuck database.",
    )
    db_schema: Optional[str] = Field(
        default="main",
        description="Schema name. Schema in the database where the elements table is located.",
    )
    table: Optional[str] = Field(
        default="elements",
        description="Table name. Table name into which the elements data is inserted.",
    )
    access_config: Secret[MotherDuckAccessConfig] = Field(
        default=MotherDuckAccessConfig(), validate_default=True
    )

    @requires_dependencies(["duckdb"], extras="duckdb")
    @contextmanager
    def get_client(self) -> Generator["MotherDuckConnection", None, None]:
        """Yield a MotherDuck connection with `self.database` selected via `USE`.

        Raises:
            DestinationConnectionError: if no `md_token` is configured. Without
                this guard the f-string below would send the literal text
                "None" as the token and fail with an unrelated-looking
                authentication error.
        """
        # Imported lazily: `duckdb` is an optional extra checked by the decorator.
        import duckdb

        access_config = self.access_config.get_secret_value()
        if access_config.md_token is None:
            raise DestinationConnectionError(
                "a MotherDuck token is required: set `md_token` in the access config"
            )
        with duckdb.connect(
            f"md:?motherduck_token={access_config.md_token}",
            config={
                "custom_user_agent": f"unstructured-io-ingest/{unstructured_io_ingest_version}"
            },
        ) as conn:
            conn.sql(f'USE "{self.database}"')
            yield conn

    @contextmanager
    def get_cursor(self) -> Generator["MotherDuckConnection", None, None]:
        """Yield a cursor created from a freshly opened client connection."""
        with self.get_client() as client, client.cursor() as cursor:
            yield cursor
71
+
72
+
73
# Stager settings for the MotherDuck connector; no fields are declared here
# beyond whatever UploadStagerConfig itself provides.
class MotherDuckUploadStagerConfig(UploadStagerConfig):
    ...
75
+
76
+
77
# MotherDuck flavour of the shared stager: it only pins the config type and
# gives each instance its own default MotherDuckUploadStagerConfig.
@dataclass
class MotherDuckUploadStager(BaseDuckDBUploadStager):
    upload_stager_config: MotherDuckUploadStagerConfig = field(
        default_factory=MotherDuckUploadStagerConfig
    )
82
+
83
+
84
# Uploader settings for the MotherDuck connector. The field description itself
# marks `batch_size` as not used.
class MotherDuckUploaderConfig(UploaderConfig):
    batch_size: int = Field(
        description="[Not-used] Number of records per batch",
        default=50,
    )
86
+
87
+
88
@dataclass
class MotherDuckUploader(Uploader):
    """Uploader that inserts element records into a MotherDuck table.

    The target is `"<database>"."<db_schema>"."<table>"` taken from
    `connection_config`.
    """

    connector_type: str = CONNECTOR_TYPE
    upload_config: MotherDuckUploaderConfig
    connection_config: MotherDuckConnectionConfig

    def precheck(self) -> None:
        """Verify the database is reachable by running `SELECT 1;`.

        Raises:
            DestinationConnectionError: if opening the cursor or running the
                probe query fails; the original exception is chained as the
                cause so the root failure stays visible in tracebacks.
        """
        try:
            with self.connection_config.get_cursor() as cursor:
                cursor.execute("SELECT 1;")
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}") from e

    def upload_dataframe(self, df: "DataFrame") -> None:
        """Insert every row of `df` into the configured table, matching columns by name."""
        logger.debug(f"uploading {len(df)} entries to {self.connection_config.database} ")
        database = self.connection_config.database
        db_schema = self.connection_config.db_schema
        table = self.connection_config.table

        with self.connection_config.get_client() as conn:
            # The SQL text refers to `df` by name; presumably DuckDB resolves it to
            # the local DataFrame above (replacement scan), so keep the name `df`.
            conn.query(f'INSERT INTO "{database}"."{db_schema}"."{table}" BY NAME SELECT * FROM df')

    @requires_dependencies(["pandas"], extras="duckdb")
    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
        """Upload in-memory records by wrapping them in a pandas DataFrame."""
        import pandas as pd

        df = pd.DataFrame(data=data)
        self.upload_dataframe(df=df)

    @requires_dependencies(["pandas"], extras="duckdb")
    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        """Upload the records stored at `path`, loaded through `get_data_df`."""
        df = get_data_df(path)
        self.upload_dataframe(df=df)
122
+
123
+
124
# Destination registry entry bundling the MotherDuck connection config,
# uploader and stager classes together with their config classes.
motherduck_destination_entry = DestinationRegistryEntry(
    uploader=MotherDuckUploader,
    uploader_config=MotherDuckUploaderConfig,
    upload_stager=MotherDuckUploadStager,
    upload_stager_config=MotherDuckUploadStagerConfig,
    connection_config=MotherDuckConnectionConfig,
)
@@ -0,0 +1,19 @@
1
+ from unstructured_ingest.processes.connector_registry import (
2
+ add_destination_entry,
3
+ add_source_entry,
4
+ )
5
+
6
+ from .elasticsearch import CONNECTOR_TYPE as ELASTICSEARCH_CONNECTOR_TYPE
7
+ from .elasticsearch import elasticsearch_destination_entry, elasticsearch_source_entry
8
+ from .opensearch import CONNECTOR_TYPE as OPENSEARCH_CONNECTOR_TYPE
9
+ from .opensearch import opensearch_destination_entry, opensearch_source_entry
10
+
11
# Elasticsearch: register the source entry, then the destination entry.
add_source_entry(
    source_type=ELASTICSEARCH_CONNECTOR_TYPE,
    entry=elasticsearch_source_entry,
)
add_destination_entry(
    destination_type=ELASTICSEARCH_CONNECTOR_TYPE,
    entry=elasticsearch_destination_entry,
)

# OpenSearch: same pair of registrations under its own connector type.
add_source_entry(
    source_type=OPENSEARCH_CONNECTOR_TYPE,
    entry=opensearch_source_entry,
)
add_destination_entry(
    destination_type=OPENSEARCH_CONNECTOR_TYPE,
    entry=opensearch_destination_entry,
)