unstructured-ingest 1.2.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (243) hide show
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/README.md +28 -0
  4. unstructured_ingest/cli/__init__.py +0 -0
  5. unstructured_ingest/cli/base/__init__.py +4 -0
  6. unstructured_ingest/cli/base/cmd.py +269 -0
  7. unstructured_ingest/cli/base/dest.py +84 -0
  8. unstructured_ingest/cli/base/importer.py +34 -0
  9. unstructured_ingest/cli/base/src.py +75 -0
  10. unstructured_ingest/cli/cli.py +24 -0
  11. unstructured_ingest/cli/cmds.py +14 -0
  12. unstructured_ingest/cli/utils/__init__.py +0 -0
  13. unstructured_ingest/cli/utils/click.py +237 -0
  14. unstructured_ingest/cli/utils/model_conversion.py +222 -0
  15. unstructured_ingest/data_types/__init__.py +0 -0
  16. unstructured_ingest/data_types/entities.py +17 -0
  17. unstructured_ingest/data_types/file_data.py +116 -0
  18. unstructured_ingest/embed/__init__.py +0 -0
  19. unstructured_ingest/embed/azure_openai.py +63 -0
  20. unstructured_ingest/embed/bedrock.py +323 -0
  21. unstructured_ingest/embed/huggingface.py +69 -0
  22. unstructured_ingest/embed/interfaces.py +146 -0
  23. unstructured_ingest/embed/mixedbreadai.py +134 -0
  24. unstructured_ingest/embed/octoai.py +133 -0
  25. unstructured_ingest/embed/openai.py +142 -0
  26. unstructured_ingest/embed/togetherai.py +116 -0
  27. unstructured_ingest/embed/vertexai.py +109 -0
  28. unstructured_ingest/embed/voyageai.py +130 -0
  29. unstructured_ingest/error.py +156 -0
  30. unstructured_ingest/errors_v2.py +156 -0
  31. unstructured_ingest/interfaces/__init__.py +27 -0
  32. unstructured_ingest/interfaces/connector.py +56 -0
  33. unstructured_ingest/interfaces/downloader.py +90 -0
  34. unstructured_ingest/interfaces/indexer.py +29 -0
  35. unstructured_ingest/interfaces/process.py +22 -0
  36. unstructured_ingest/interfaces/processor.py +88 -0
  37. unstructured_ingest/interfaces/upload_stager.py +89 -0
  38. unstructured_ingest/interfaces/uploader.py +67 -0
  39. unstructured_ingest/logger.py +39 -0
  40. unstructured_ingest/main.py +11 -0
  41. unstructured_ingest/otel.py +128 -0
  42. unstructured_ingest/pipeline/__init__.py +0 -0
  43. unstructured_ingest/pipeline/interfaces.py +211 -0
  44. unstructured_ingest/pipeline/otel.py +32 -0
  45. unstructured_ingest/pipeline/pipeline.py +408 -0
  46. unstructured_ingest/pipeline/steps/__init__.py +0 -0
  47. unstructured_ingest/pipeline/steps/chunk.py +78 -0
  48. unstructured_ingest/pipeline/steps/download.py +206 -0
  49. unstructured_ingest/pipeline/steps/embed.py +77 -0
  50. unstructured_ingest/pipeline/steps/filter.py +35 -0
  51. unstructured_ingest/pipeline/steps/index.py +86 -0
  52. unstructured_ingest/pipeline/steps/partition.py +77 -0
  53. unstructured_ingest/pipeline/steps/stage.py +65 -0
  54. unstructured_ingest/pipeline/steps/uncompress.py +50 -0
  55. unstructured_ingest/pipeline/steps/upload.py +58 -0
  56. unstructured_ingest/processes/__init__.py +18 -0
  57. unstructured_ingest/processes/chunker.py +131 -0
  58. unstructured_ingest/processes/connector_registry.py +69 -0
  59. unstructured_ingest/processes/connectors/__init__.py +129 -0
  60. unstructured_ingest/processes/connectors/airtable.py +238 -0
  61. unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
  62. unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
  63. unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
  64. unstructured_ingest/processes/connectors/astradb.py +592 -0
  65. unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
  66. unstructured_ingest/processes/connectors/chroma.py +193 -0
  67. unstructured_ingest/processes/connectors/confluence.py +527 -0
  68. unstructured_ingest/processes/connectors/couchbase.py +336 -0
  69. unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
  70. unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
  71. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
  72. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
  73. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
  74. unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
  75. unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
  76. unstructured_ingest/processes/connectors/delta_table.py +310 -0
  77. unstructured_ingest/processes/connectors/discord.py +161 -0
  78. unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
  79. unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
  80. unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
  81. unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
  82. unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
  83. unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
  84. unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
  85. unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
  86. unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
  87. unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
  88. unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
  89. unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
  90. unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
  91. unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
  92. unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
  93. unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
  94. unstructured_ingest/processes/connectors/github.py +226 -0
  95. unstructured_ingest/processes/connectors/gitlab.py +270 -0
  96. unstructured_ingest/processes/connectors/google_drive.py +848 -0
  97. unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
  98. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
  99. unstructured_ingest/processes/connectors/jira.py +522 -0
  100. unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
  101. unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
  102. unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
  103. unstructured_ingest/processes/connectors/kafka/local.py +103 -0
  104. unstructured_ingest/processes/connectors/kdbai.py +156 -0
  105. unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
  106. unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
  107. unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
  108. unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
  109. unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
  110. unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
  111. unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
  112. unstructured_ingest/processes/connectors/local.py +227 -0
  113. unstructured_ingest/processes/connectors/milvus.py +311 -0
  114. unstructured_ingest/processes/connectors/mongodb.py +389 -0
  115. unstructured_ingest/processes/connectors/neo4j.py +534 -0
  116. unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
  117. unstructured_ingest/processes/connectors/notion/client.py +349 -0
  118. unstructured_ingest/processes/connectors/notion/connector.py +350 -0
  119. unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
  120. unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
  121. unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
  122. unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
  123. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  124. unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
  125. unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
  126. unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
  127. unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
  128. unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  129. unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  130. unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  131. unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
  132. unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
  133. unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
  134. unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
  135. unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
  136. unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
  137. unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
  138. unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
  139. unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
  140. unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
  141. unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
  142. unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  143. unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  144. unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  145. unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  146. unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
  147. unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
  148. unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
  149. unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
  150. unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  151. unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
  152. unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
  153. unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
  154. unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  155. unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
  156. unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
  157. unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
  158. unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
  159. unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
  160. unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
  161. unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
  162. unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
  163. unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
  164. unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
  165. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  166. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
  167. unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
  168. unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
  169. unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
  170. unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
  171. unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
  172. unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
  173. unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
  174. unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
  175. unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
  176. unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
  177. unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
  178. unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
  179. unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
  180. unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
  181. unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
  182. unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
  183. unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
  184. unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
  185. unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
  186. unstructured_ingest/processes/connectors/onedrive.py +485 -0
  187. unstructured_ingest/processes/connectors/outlook.py +242 -0
  188. unstructured_ingest/processes/connectors/pinecone.py +400 -0
  189. unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
  190. unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
  191. unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
  192. unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
  193. unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
  194. unstructured_ingest/processes/connectors/redisdb.py +214 -0
  195. unstructured_ingest/processes/connectors/salesforce.py +307 -0
  196. unstructured_ingest/processes/connectors/sharepoint.py +282 -0
  197. unstructured_ingest/processes/connectors/slack.py +249 -0
  198. unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
  199. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
  200. unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
  201. unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
  202. unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
  203. unstructured_ingest/processes/connectors/sql/sql.py +456 -0
  204. unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
  205. unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
  206. unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
  207. unstructured_ingest/processes/connectors/utils.py +60 -0
  208. unstructured_ingest/processes/connectors/vectara.py +348 -0
  209. unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
  210. unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
  211. unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
  212. unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
  213. unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
  214. unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
  215. unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
  216. unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
  217. unstructured_ingest/processes/embedder.py +203 -0
  218. unstructured_ingest/processes/filter.py +60 -0
  219. unstructured_ingest/processes/partitioner.py +233 -0
  220. unstructured_ingest/processes/uncompress.py +61 -0
  221. unstructured_ingest/processes/utils/__init__.py +8 -0
  222. unstructured_ingest/processes/utils/blob_storage.py +32 -0
  223. unstructured_ingest/processes/utils/logging/connector.py +365 -0
  224. unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
  225. unstructured_ingest/unstructured_api.py +140 -0
  226. unstructured_ingest/utils/__init__.py +5 -0
  227. unstructured_ingest/utils/chunking.py +56 -0
  228. unstructured_ingest/utils/compression.py +72 -0
  229. unstructured_ingest/utils/constants.py +2 -0
  230. unstructured_ingest/utils/data_prep.py +216 -0
  231. unstructured_ingest/utils/dep_check.py +78 -0
  232. unstructured_ingest/utils/filesystem.py +27 -0
  233. unstructured_ingest/utils/html.py +174 -0
  234. unstructured_ingest/utils/ndjson.py +52 -0
  235. unstructured_ingest/utils/pydantic_models.py +52 -0
  236. unstructured_ingest/utils/string_and_date_utils.py +74 -0
  237. unstructured_ingest/utils/table.py +80 -0
  238. unstructured_ingest/utils/tls.py +15 -0
  239. unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
  240. unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
  241. unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
  242. unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
  243. unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
@@ -0,0 +1,176 @@
1
+ import json
2
+ from contextlib import contextmanager
3
+ from dataclasses import dataclass, field
4
+ from pathlib import Path
5
+ from typing import TYPE_CHECKING, Any, Generator, Optional
6
+
7
+ from pydantic import Field, Secret
8
+
9
+ from unstructured_ingest.data_types.file_data import FileData
10
+ from unstructured_ingest.logger import logger
11
+ from unstructured_ingest.processes.connector_registry import (
12
+ DestinationRegistryEntry,
13
+ SourceRegistryEntry,
14
+ )
15
+ from unstructured_ingest.processes.connectors.sql.sql import (
16
+ _DATE_COLUMNS,
17
+ SQLAccessConfig,
18
+ SqlBatchFileData,
19
+ SQLConnectionConfig,
20
+ SQLDownloader,
21
+ SQLDownloaderConfig,
22
+ SQLIndexer,
23
+ SQLIndexerConfig,
24
+ SQLUploader,
25
+ SQLUploaderConfig,
26
+ SQLUploadStager,
27
+ SQLUploadStagerConfig,
28
+ parse_date_string,
29
+ )
30
+ from unstructured_ingest.utils.dep_check import requires_dependencies
31
+
32
+ if TYPE_CHECKING:
33
+ from singlestoredb.connection import Connection as SingleStoreConnection
34
+ from singlestoredb.connection import Cursor as SingleStoreCursor
35
+
36
# Identifier used as the default `connector_type` of the SingleStore
# indexer, downloader and uploader defined in this module.
CONNECTOR_TYPE = "singlestore"
37
+
38
+
39
class SingleStoreAccessConfig(SQLAccessConfig):
    # Sensitive part of the SingleStore connection settings; it is wrapped in
    # `Secret[...]` by SingleStoreConnectionConfig.
    password: Optional[str] = Field(
        default=None,
        description="SingleStore password",
    )
41
+
42
+
43
class SingleStoreConnectionConfig(SQLConnectionConfig):
    # Connection settings for a SingleStore database. The password lives in
    # `access_config`; every other field is passed to `singlestoredb.connect`
    # as-is.
    access_config: Secret[SingleStoreAccessConfig]
    host: Optional[str] = Field(default=None, description="SingleStore host")
    port: Optional[int] = Field(default=None, description="SingleStore port")
    user: Optional[str] = Field(default=None, description="SingleStore user")
    database: Optional[str] = Field(default=None, description="SingleStore database")

    @contextmanager
    @requires_dependencies(["singlestoredb"], extras="singlestore")
    def get_connection(self) -> Generator["SingleStoreConnection", None, None]:
        """Yield an open SingleStore connection, committing and closing it on exit.

        The commit runs whether or not the body of the ``with`` block raised.
        The connection is closed even when that commit itself fails.

        Yields:
            The connection returned by ``singlestoredb.connect``.
        """
        import singlestoredb as s2

        connection = s2.connect(
            host=self.host,
            port=self.port,
            database=self.database,
            user=self.user,
            password=self.access_config.get_secret_value().password,
        )
        try:
            yield connection
        finally:
            # Nested so that a failing commit can no longer skip close() and
            # leak the connection.
            try:
                connection.commit()
            finally:
                connection.close()

    @contextmanager
    def get_cursor(self) -> Generator["SingleStoreCursor", None, None]:
        """Yield a cursor on a fresh connection obtained from `get_connection`.

        The cursor is closed when the block exits, after which the
        connection is committed and closed by `get_connection`.

        Yields:
            A cursor created from the newly opened connection.
        """
        with self.get_connection() as connection, connection.cursor() as cursor:
            try:
                yield cursor
            finally:
                cursor.close()
75
+
76
+
77
class SingleStoreIndexerConfig(SQLIndexerConfig):
    # No SingleStore-specific indexer options; everything is inherited.
    pass
79
+
80
+
81
@dataclass
class SingleStoreIndexer(SQLIndexer):
    """SQL indexer bound to the SingleStore connection and indexer configs."""

    connection_config: SingleStoreConnectionConfig
    index_config: SingleStoreIndexerConfig
    connector_type: str = CONNECTOR_TYPE
86
+
87
+
88
class SingleStoreDownloaderConfig(SQLDownloaderConfig):
    # No SingleStore-specific downloader options; everything is inherited.
    pass
90
+
91
+
92
@dataclass
class SingleStoreDownloader(SQLDownloader):
    """SQL downloader that fetches batches of rows from a SingleStore table."""

    connection_config: SingleStoreConnectionConfig
    download_config: SingleStoreDownloaderConfig
    connector_type: str = CONNECTOR_TYPE
    values_delimiter: str = "%s"

    def query_db(self, file_data: SqlBatchFileData) -> tuple[list[tuple], list[str]]:
        """Fetch the rows whose id column matches the identifiers in the batch.

        Args:
            file_data: Batch descriptor; its ``additional_metadata`` supplies
                the table name and id column, and its ``batch_items`` supply
                the identifiers to select.

        Returns:
            A ``(rows, columns)`` pair: the fetched rows and the column names
            taken from the cursor description.
        """
        table_name = file_data.additional_metadata.table_name
        id_column = file_data.additional_metadata.id_column
        ids = tuple(item.identifier for item in file_data.batch_items)
        # Go through get_cursor() so the cursor is closed once the rows have
        # been read; previously it was created by hand and never closed.
        with self.connection_config.get_cursor() as cursor:
            fields = ",".join(self.download_config.fields) if self.download_config.fields else "*"
            # The whole id tuple is bound to the single placeholder that
            # follows IN.
            query = (
                f"SELECT {fields} FROM {table_name} WHERE {id_column} IN {self.values_delimiter}"
            )
            logger.debug(f"running query: {query}\nwith values: {(ids,)}")
            cursor.execute(query, (ids,))
            rows = cursor.fetchall()
            columns = [col[0] for col in cursor.description]
        return rows, columns
114
+
115
+
116
class SingleStoreUploadStagerConfig(SQLUploadStagerConfig):
    # No SingleStore-specific staging options; everything is inherited.
    pass
118
+
119
+
120
class SingleStoreUploadStager(SQLUploadStager):
    """SQL upload stager typed against the SingleStore stager config."""

    upload_stager_config: SingleStoreUploadStagerConfig
122
+
123
+
124
class SingleStoreUploaderConfig(SQLUploaderConfig):
    # No SingleStore-specific uploader options; everything is inherited.
    pass
126
+
127
+
128
@dataclass
class SingleStoreUploader(SQLUploader):
    """SQL uploader that writes rows to SingleStore using ``%s`` placeholders."""

    upload_config: SingleStoreUploaderConfig = field(default_factory=SingleStoreUploaderConfig)
    connection_config: SingleStoreConnectionConfig
    values_delimiter: str = "%s"
    connector_type: str = CONNECTOR_TYPE

    @requires_dependencies(["pandas"], extras="singlestore")
    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        """Delegate to the base uploader after the pandas dependency check."""
        super().run(path=path, file_data=file_data, **kwargs)

    @requires_dependencies(["pandas"], extras="singlestore")
    def prepare_data(
        self, columns: list[str], data: tuple[tuple[Any, ...], ...]
    ) -> list[tuple[Any, ...]]:
        """Convert raw row values into values that can be bound to the insert.

        Lists and dicts are serialized to JSON strings. Values in date columns
        are parsed with ``parse_date_string``, with None/NaN mapped to None.
        All other values pass through unchanged.

        Args:
            columns: Column names, positionally aligned with each row.
            data: Rows to convert.

        Returns:
            One tuple of converted values per input row.
        """
        import pandas as pd

        def _convert(name: str, cell: Any) -> Any:
            # JSON-encode containers first; the date check below then sees
            # the encoded string, exactly as in the sequential form.
            if isinstance(cell, (list, dict)):
                cell = json.dumps(cell)
            if name not in _DATE_COLUMNS:
                return cell
            if cell is None or pd.isna(cell):
                return None
            return parse_date_string(cell)

        return [tuple(_convert(name, cell) for name, cell in zip(columns, row)) for row in data]
160
+
161
+
162
# Registry entry describing SingleStore as a source: which connection,
# indexer and downloader classes (and their configs) belong together.
singlestore_source_entry = SourceRegistryEntry(
    connection_config=SingleStoreConnectionConfig,
    indexer=SingleStoreIndexer,
    indexer_config=SingleStoreIndexerConfig,
    downloader=SingleStoreDownloader,
    downloader_config=SingleStoreDownloaderConfig,
)
169
+
170
# Registry entry describing SingleStore as a destination: which connection,
# upload stager and uploader classes (and their configs) belong together.
singlestore_destination_entry = DestinationRegistryEntry(
    connection_config=SingleStoreConnectionConfig,
    upload_stager=SingleStoreUploadStager,
    upload_stager_config=SingleStoreUploadStagerConfig,
    uploader=SingleStoreUploader,
    uploader_config=SingleStoreUploaderConfig,
)
@@ -0,0 +1,298 @@
1
+ import json
2
+ from contextlib import contextmanager
3
+ from dataclasses import dataclass, field
4
+ from pathlib import Path
5
+ from typing import TYPE_CHECKING, Any, Generator, Optional
6
+
7
+ from pydantic import Field, Secret
8
+
9
+ from unstructured_ingest.data_types.file_data import FileData
10
+ from unstructured_ingest.logger import logger
11
+ from unstructured_ingest.processes.connector_registry import (
12
+ DestinationRegistryEntry,
13
+ SourceRegistryEntry,
14
+ )
15
+ from unstructured_ingest.processes.connectors.sql.sql import (
16
+ _DATE_COLUMNS,
17
+ SQLAccessConfig,
18
+ SqlBatchFileData,
19
+ SQLConnectionConfig,
20
+ SQLDownloader,
21
+ SQLDownloaderConfig,
22
+ SQLIndexer,
23
+ SQLIndexerConfig,
24
+ SQLUploader,
25
+ SQLUploaderConfig,
26
+ SQLUploadStager,
27
+ SQLUploadStagerConfig,
28
+ parse_date_string,
29
+ )
30
+ from unstructured_ingest.utils.data_prep import split_dataframe
31
+ from unstructured_ingest.utils.dep_check import requires_dependencies
32
+
33
+ if TYPE_CHECKING:
34
+ from pandas import DataFrame
35
+ from snowflake.connector import SnowflakeConnection
36
+ from snowflake.connector.cursor import SnowflakeCursor
37
+
38
# Identifier used as the default `connector_type` of the Snowflake
# connection config, indexer, downloader and uploader in this module.
CONNECTOR_TYPE = "snowflake"

# Name of the column that holds embedding vectors.
EMBEDDINGS_COLUMN = "embeddings"

# Columns whose values are JSON-encoded on upload and wrapped in PARSE_JSON
# in the generated INSERT ... SELECT statement.
_ARRAY_COLUMNS = (
    "languages",
    "link_urls",
    "link_texts",
    "sent_from",
    "sent_to",
    "emphasized_text_contents",
    "emphasized_text_tags",
)

# Columns treated as vectors; cast to VECTOR(FLOAT, n) when the table's
# embeddings dimension is known.
_VECTOR_COLUMNS = (EMBEDDINGS_COLUMN,)
51
+
52
+
53
class SnowflakeAccessConfig(SQLAccessConfig):
    # Sensitive part of the Snowflake connection settings; it is wrapped in
    # `Secret[...]` by SnowflakeConnectionConfig.
    password: Optional[str] = Field(
        default=None,
        description="DB password",
    )
55
+
56
+
57
class SnowflakeConnectionConfig(SQLConnectionConfig):
    # Connection settings for Snowflake. The password lives in
    # `access_config`; the remaining non-None fields are forwarded to
    # `snowflake.connector.connect` as keyword arguments.
    access_config: Secret[SnowflakeAccessConfig] = Field(
        default=SnowflakeAccessConfig(), validate_default=True
    )
    account: str = Field(
        default=None,
        description="Your account identifier. The account identifier "
        "does not include the snowflakecomputing.com suffix.",
    )
    user: Optional[str] = Field(default=None, description="DB username")
    host: Optional[str] = Field(default=None, description="DB host")
    port: Optional[int] = Field(default=443, description="DB host connection port")
    database: str = Field(
        default=None,
        description="Database name.",
    )
    # Exposed under the alias "schema"; renamed back to "schema" before
    # being handed to the connector.
    db_schema: str = Field(default=None, description="Database schema.", alias="schema")
    role: str = Field(
        default=None,
        description="Database role.",
    )
    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)

    @contextmanager
    # The actual snowflake module package name is: snowflake-connector-python
    @requires_dependencies(["snowflake"], extras="snowflake")
    def get_connection(self) -> Generator["SnowflakeConnection", None, None]:
        """Yield an open Snowflake connection, committing and closing it on exit.

        Connection keyword arguments are built from the dumped model: the
        schema is renamed, the secret wrapper is dropped, the plain password
        is added, the ``qmark`` paramstyle is selected, and None values are
        removed. The commit runs whether or not the ``with`` body raised.
        The connection is closed even when that commit itself fails.

        Yields:
            The connection returned by ``snowflake.connector.connect``.
        """
        # https://docs.snowflake.com/en/developer-guide/python-connector/python-connector-api#label-snowflake-connector-methods-connect
        from snowflake.connector import connect

        connect_kwargs = self.model_dump()
        connect_kwargs["schema"] = connect_kwargs.pop("db_schema")
        # The field is named `access_config`; the previous key
        # ("access_configs") never matched, so the Secret wrapper was passed
        # on to connect().
        connect_kwargs.pop("access_config", None)
        connect_kwargs["password"] = self.access_config.get_secret_value().password
        # https://peps.python.org/pep-0249/#paramstyle
        connect_kwargs["paramstyle"] = "qmark"
        # remove anything that is none
        active_kwargs = {k: v for k, v in connect_kwargs.items() if v is not None}
        connection = connect(**active_kwargs)
        try:
            yield connection
        finally:
            # Nested so that a failing commit can no longer skip close() and
            # leak the connection.
            try:
                connection.commit()
            finally:
                connection.close()

    @contextmanager
    def get_cursor(self) -> Generator["SnowflakeCursor", None, None]:
        """Yield a cursor on a fresh connection obtained from `get_connection`.

        The cursor is closed when the block exits, after which the
        connection is committed and closed by `get_connection`.

        Yields:
            A cursor created from the newly opened connection.
        """
        with self.get_connection() as connection:
            cursor = connection.cursor()
            try:
                yield cursor
            finally:
                cursor.close()
110
+
111
+
112
class SnowflakeIndexerConfig(SQLIndexerConfig):
    # No Snowflake-specific indexer options; everything is inherited.
    pass
114
+
115
+
116
@dataclass
class SnowflakeIndexer(SQLIndexer):
    """SQL indexer bound to the Snowflake connection and indexer configs."""

    connection_config: SnowflakeConnectionConfig
    index_config: SnowflakeIndexerConfig
    connector_type: str = CONNECTOR_TYPE
121
+
122
+
123
class SnowflakeDownloaderConfig(SQLDownloaderConfig):
    # No Snowflake-specific downloader options; everything is inherited.
    pass
125
+
126
+
127
@dataclass
class SnowflakeDownloader(SQLDownloader):
    """SQL downloader that fetches batches of rows from a Snowflake table."""

    connection_config: SnowflakeConnectionConfig
    download_config: SnowflakeDownloaderConfig
    connector_type: str = CONNECTOR_TYPE
    values_delimiter: str = "?"

    # The actual snowflake module package name is: snowflake-connector-python
    @requires_dependencies(["snowflake"], extras="snowflake")
    def query_db(self, file_data: SqlBatchFileData) -> tuple[list[tuple], list[str]]:
        """Fetch the rows whose id column matches the identifiers in the batch.

        Args:
            file_data: Batch descriptor; its ``additional_metadata`` supplies
                the table name and id column, and its ``batch_items`` supply
                the identifiers to select.

        Returns:
            A ``(rows, columns)`` pair. Rows that come back as dicts are
            converted to tuples of their values; column names are taken from
            the cursor description.
        """
        metadata = file_data.additional_metadata
        table_name = metadata.table_name
        id_column = metadata.id_column
        ids = [item.identifier for item in file_data.batch_items]

        with self.connection_config.get_cursor() as cursor:
            requested_fields = self.download_config.fields
            selected = ",".join(requested_fields) if requested_fields else "*"
            # One placeholder per identifier.
            placeholders = ",".join([self.values_delimiter] * len(ids))
            query = "SELECT {fields} FROM {table_name} WHERE {id_column} IN ({values})".format(
                table_name=table_name,
                id_column=id_column,
                fields=selected,
                values=placeholders,
            )
            logger.debug(f"running query: {query}\nwith values: {ids}")
            cursor.execute(query, ids)
            rows = []
            for row in cursor.fetchall():
                if isinstance(row, dict):
                    rows.append(tuple(row.values()))
                else:
                    rows.append(row)
            columns = [col[0] for col in cursor.description]
            return rows, columns
157
+
158
+
159
class SnowflakeUploadStagerConfig(SQLUploadStagerConfig):
    # No Snowflake-specific staging options; everything is inherited.
    pass
161
+
162
+
163
class SnowflakeUploadStager(SQLUploadStager):
    """SQL upload stager typed against the Snowflake stager config."""

    upload_stager_config: SnowflakeUploadStagerConfig
165
+
166
+
167
class SnowflakeUploaderConfig(SQLUploaderConfig):
    # No Snowflake-specific uploader options; everything is inherited.
    pass
169
+
170
+
171
@dataclass
class SnowflakeUploader(SQLUploader):
    """SQL uploader that writes rows to Snowflake using ``?`` placeholders.

    Array and vector columns are sent as JSON strings and converted on the
    Snowflake side with ``PARSE_JSON`` (see `_parse_select`).
    """

    upload_config: SnowflakeUploaderConfig = field(default_factory=SnowflakeUploaderConfig)
    connection_config: SnowflakeConnectionConfig
    connector_type: str = CONNECTOR_TYPE
    values_delimiter: str = "?"

    # Cache for `embeddings_dimension`: None means "not looked up yet",
    # 0 means "looked up, no VECTOR embeddings column found".
    _embeddings_dimension: Optional[int] = None

    @property
    def embeddings_dimension(self) -> Optional[int]:
        """
        Get the dimension of the embeddings column in the Snowflake table.

        The table is queried on first access and the result is cached.
        If the column is not present or is not of type VECTOR, returns 0.
        """
        if self._embeddings_dimension is None:
            with self.connection_config.get_cursor() as cursor:
                embeddings_column = cursor.execute(
                    f"SHOW COLUMNS LIKE '{EMBEDDINGS_COLUMN}' IN {self.upload_config.table_name}"
                ).fetchone()
                if embeddings_column:
                    data_type = {}
                    if isinstance(embeddings_column, dict):
                        # Treat a missing, None or empty value as "{}", the
                        # same way the tuple branch below does.
                        data_type = json.loads(embeddings_column.get("data_type") or "{}")
                    elif isinstance(embeddings_column, tuple):
                        data_type = json.loads(embeddings_column[3] or "{}")
                    if isinstance(data_type, dict) and data_type.get("type") == "VECTOR":
                        self._embeddings_dimension = data_type.get("dimension")
            # If the _embeddings_dimension is still None, it means the column
            # is not present or not a VECTOR type
            if self._embeddings_dimension is None:
                self._embeddings_dimension = 0
        return self._embeddings_dimension

    @requires_dependencies(["pandas"], extras="snowflake")
    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        """Delegate to the base uploader after the pandas dependency check."""
        super().run(path=path, file_data=file_data, **kwargs)

    @requires_dependencies(["pandas"], extras="snowflake")
    def prepare_data(
        self, columns: list[str], data: tuple[tuple[Any, ...], ...]
    ) -> list[tuple[Any, ...]]:
        """Convert raw row values into values that can be bound to the insert.

        Date columns are parsed with ``parse_date_string`` (None/NaN become
        None). Array and vector columns are JSON-encoded (non-list None/NaN
        become None). All other values pass through unchanged.

        Args:
            columns: Column names, positionally aligned with each row.
            data: Rows to convert.

        Returns:
            One tuple of converted values per input row.
        """
        import pandas as pd

        output = []
        for row in data:
            parsed = []
            for column_name, value in zip(columns, row):
                if column_name in _DATE_COLUMNS:
                    if value is None or pd.isna(value):  # pandas is nan
                        parsed.append(None)
                    else:
                        parsed.append(parse_date_string(value))
                elif column_name in _ARRAY_COLUMNS or column_name in _VECTOR_COLUMNS:
                    # pd.isna is only evaluated for non-list values.
                    if not isinstance(value, list) and (
                        value is None or pd.isna(value)
                    ):  # pandas is nan
                        parsed.append(None)
                    else:
                        parsed.append(json.dumps(value))
                else:
                    parsed.append(value)
            output.append(tuple(parsed))
        return output

    def _parse_select(self, columns: list[str]) -> str:
        """Build the SELECT list that maps positional VALUES onto the columns.

        Each column becomes ``$<position>``. Array and vector columns are
        wrapped in ``PARSE_JSON``; vector columns are additionally cast to
        ``VECTOR(FLOAT, <dimension>)`` when a non-zero embeddings dimension
        is known.

        Args:
            columns: Column names in insert order.

        Returns:
            The comma-separated SELECT expressions.
        """
        embeddings_dimension = self.embeddings_dimension
        parsed_values = []
        for i, col in enumerate(columns):
            argument_selector = f"${i + 1}"
            if col in _VECTOR_COLUMNS and embeddings_dimension:
                parsed_values.append(
                    f"PARSE_JSON({argument_selector})::VECTOR(FLOAT,{embeddings_dimension})"
                )
            elif col in _ARRAY_COLUMNS or col in _VECTOR_COLUMNS:
                parsed_values.append(f"PARSE_JSON({argument_selector})")
            else:
                parsed_values.append(argument_selector)
        return ",".join(parsed_values)

    def upload_dataframe(self, df: "DataFrame", file_data: FileData) -> None:
        """Insert the dataframe's rows into the configured table in batches.

        Existing rows for this record are deleted first when `can_delete`
        allows it; otherwise a warning is logged. The frame is then fitted to
        the table schema, NaN values are replaced by None, and the rows are
        inserted batch by batch, each batch on its own cursor.

        Args:
            df: Rows to upload.
            file_data: Passed to `delete_by_record_id` to select the rows
                to delete.
        """
        import numpy as np

        if self.can_delete():
            self.delete_by_record_id(file_data=file_data)
        else:
            logger.warning(
                f"table doesn't contain expected "
                f"record id column "
                f"{self.upload_config.record_id_key}, skipping delete"
            )
        df = self._fit_to_schema(df=df, add_missing_columns=True, case_sensitive=False)
        df.replace({np.nan: None}, inplace=True)

        columns = list(df.columns)
        stmt = "INSERT INTO {table_name} ({columns}) SELECT {select} FROM VALUES ({values})".format(
            table_name=self.upload_config.table_name,
            columns=",".join(columns),
            select=self._parse_select(columns),
            values=",".join([self.values_delimiter for _ in columns]),
        )
        logger.info(
            f"writing a total of {len(df)} elements via"
            f" document batches to destination"
            f" table named {self.upload_config.table_name}"
            f" with batch size {self.upload_config.batch_size}"
        )
        for rows in split_dataframe(df=df, chunk_size=self.upload_config.batch_size):
            with self.connection_config.get_cursor() as cursor:
                values = self.prepare_data(columns, tuple(rows.itertuples(index=False, name=None)))
                cursor.executemany(stmt, values)
282
+
283
+
284
# Registry entry describing Snowflake as a source: which connection,
# indexer and downloader classes (and their configs) belong together.
snowflake_source_entry = SourceRegistryEntry(
    connection_config=SnowflakeConnectionConfig,
    indexer=SnowflakeIndexer,
    indexer_config=SnowflakeIndexerConfig,
    downloader=SnowflakeDownloader,
    downloader_config=SnowflakeDownloaderConfig,
)
291
+
292
# Registry entry describing Snowflake as a destination: which connection,
# upload stager and uploader classes (and their configs) belong together.
snowflake_destination_entry = DestinationRegistryEntry(
    connection_config=SnowflakeConnectionConfig,
    upload_stager=SnowflakeUploadStager,
    upload_stager_config=SnowflakeUploadStagerConfig,
    uploader=SnowflakeUploader,
    uploader_config=SnowflakeUploaderConfig,
)