unstructured-ingest 1.2.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (243)
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/README.md +28 -0
  4. unstructured_ingest/cli/__init__.py +0 -0
  5. unstructured_ingest/cli/base/__init__.py +4 -0
  6. unstructured_ingest/cli/base/cmd.py +269 -0
  7. unstructured_ingest/cli/base/dest.py +84 -0
  8. unstructured_ingest/cli/base/importer.py +34 -0
  9. unstructured_ingest/cli/base/src.py +75 -0
  10. unstructured_ingest/cli/cli.py +24 -0
  11. unstructured_ingest/cli/cmds.py +14 -0
  12. unstructured_ingest/cli/utils/__init__.py +0 -0
  13. unstructured_ingest/cli/utils/click.py +237 -0
  14. unstructured_ingest/cli/utils/model_conversion.py +222 -0
  15. unstructured_ingest/data_types/__init__.py +0 -0
  16. unstructured_ingest/data_types/entities.py +17 -0
  17. unstructured_ingest/data_types/file_data.py +116 -0
  18. unstructured_ingest/embed/__init__.py +0 -0
  19. unstructured_ingest/embed/azure_openai.py +63 -0
  20. unstructured_ingest/embed/bedrock.py +323 -0
  21. unstructured_ingest/embed/huggingface.py +69 -0
  22. unstructured_ingest/embed/interfaces.py +146 -0
  23. unstructured_ingest/embed/mixedbreadai.py +134 -0
  24. unstructured_ingest/embed/octoai.py +133 -0
  25. unstructured_ingest/embed/openai.py +142 -0
  26. unstructured_ingest/embed/togetherai.py +116 -0
  27. unstructured_ingest/embed/vertexai.py +109 -0
  28. unstructured_ingest/embed/voyageai.py +130 -0
  29. unstructured_ingest/error.py +156 -0
  30. unstructured_ingest/errors_v2.py +156 -0
  31. unstructured_ingest/interfaces/__init__.py +27 -0
  32. unstructured_ingest/interfaces/connector.py +56 -0
  33. unstructured_ingest/interfaces/downloader.py +90 -0
  34. unstructured_ingest/interfaces/indexer.py +29 -0
  35. unstructured_ingest/interfaces/process.py +22 -0
  36. unstructured_ingest/interfaces/processor.py +88 -0
  37. unstructured_ingest/interfaces/upload_stager.py +89 -0
  38. unstructured_ingest/interfaces/uploader.py +67 -0
  39. unstructured_ingest/logger.py +39 -0
  40. unstructured_ingest/main.py +11 -0
  41. unstructured_ingest/otel.py +128 -0
  42. unstructured_ingest/pipeline/__init__.py +0 -0
  43. unstructured_ingest/pipeline/interfaces.py +211 -0
  44. unstructured_ingest/pipeline/otel.py +32 -0
  45. unstructured_ingest/pipeline/pipeline.py +408 -0
  46. unstructured_ingest/pipeline/steps/__init__.py +0 -0
  47. unstructured_ingest/pipeline/steps/chunk.py +78 -0
  48. unstructured_ingest/pipeline/steps/download.py +206 -0
  49. unstructured_ingest/pipeline/steps/embed.py +77 -0
  50. unstructured_ingest/pipeline/steps/filter.py +35 -0
  51. unstructured_ingest/pipeline/steps/index.py +86 -0
  52. unstructured_ingest/pipeline/steps/partition.py +77 -0
  53. unstructured_ingest/pipeline/steps/stage.py +65 -0
  54. unstructured_ingest/pipeline/steps/uncompress.py +50 -0
  55. unstructured_ingest/pipeline/steps/upload.py +58 -0
  56. unstructured_ingest/processes/__init__.py +18 -0
  57. unstructured_ingest/processes/chunker.py +131 -0
  58. unstructured_ingest/processes/connector_registry.py +69 -0
  59. unstructured_ingest/processes/connectors/__init__.py +129 -0
  60. unstructured_ingest/processes/connectors/airtable.py +238 -0
  61. unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
  62. unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
  63. unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
  64. unstructured_ingest/processes/connectors/astradb.py +592 -0
  65. unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
  66. unstructured_ingest/processes/connectors/chroma.py +193 -0
  67. unstructured_ingest/processes/connectors/confluence.py +527 -0
  68. unstructured_ingest/processes/connectors/couchbase.py +336 -0
  69. unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
  70. unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
  71. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
  72. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
  73. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
  74. unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
  75. unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
  76. unstructured_ingest/processes/connectors/delta_table.py +310 -0
  77. unstructured_ingest/processes/connectors/discord.py +161 -0
  78. unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
  79. unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
  80. unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
  81. unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
  82. unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
  83. unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
  84. unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
  85. unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
  86. unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
  87. unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
  88. unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
  89. unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
  90. unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
  91. unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
  92. unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
  93. unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
  94. unstructured_ingest/processes/connectors/github.py +226 -0
  95. unstructured_ingest/processes/connectors/gitlab.py +270 -0
  96. unstructured_ingest/processes/connectors/google_drive.py +848 -0
  97. unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
  98. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
  99. unstructured_ingest/processes/connectors/jira.py +522 -0
  100. unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
  101. unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
  102. unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
  103. unstructured_ingest/processes/connectors/kafka/local.py +103 -0
  104. unstructured_ingest/processes/connectors/kdbai.py +156 -0
  105. unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
  106. unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
  107. unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
  108. unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
  109. unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
  110. unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
  111. unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
  112. unstructured_ingest/processes/connectors/local.py +227 -0
  113. unstructured_ingest/processes/connectors/milvus.py +311 -0
  114. unstructured_ingest/processes/connectors/mongodb.py +389 -0
  115. unstructured_ingest/processes/connectors/neo4j.py +534 -0
  116. unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
  117. unstructured_ingest/processes/connectors/notion/client.py +349 -0
  118. unstructured_ingest/processes/connectors/notion/connector.py +350 -0
  119. unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
  120. unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
  121. unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
  122. unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
  123. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  124. unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
  125. unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
  126. unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
  127. unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
  128. unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  129. unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  130. unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  131. unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
  132. unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
  133. unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
  134. unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
  135. unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
  136. unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
  137. unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
  138. unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
  139. unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
  140. unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
  141. unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
  142. unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  143. unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  144. unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  145. unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  146. unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
  147. unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
  148. unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
  149. unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
  150. unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  151. unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
  152. unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
  153. unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
  154. unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  155. unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
  156. unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
  157. unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
  158. unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
  159. unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
  160. unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
  161. unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
  162. unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
  163. unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
  164. unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
  165. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  166. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
  167. unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
  168. unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
  169. unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
  170. unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
  171. unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
  172. unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
  173. unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
  174. unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
  175. unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
  176. unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
  177. unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
  178. unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
  179. unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
  180. unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
  181. unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
  182. unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
  183. unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
  184. unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
  185. unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
  186. unstructured_ingest/processes/connectors/onedrive.py +485 -0
  187. unstructured_ingest/processes/connectors/outlook.py +242 -0
  188. unstructured_ingest/processes/connectors/pinecone.py +400 -0
  189. unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
  190. unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
  191. unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
  192. unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
  193. unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
  194. unstructured_ingest/processes/connectors/redisdb.py +214 -0
  195. unstructured_ingest/processes/connectors/salesforce.py +307 -0
  196. unstructured_ingest/processes/connectors/sharepoint.py +282 -0
  197. unstructured_ingest/processes/connectors/slack.py +249 -0
  198. unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
  199. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
  200. unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
  201. unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
  202. unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
  203. unstructured_ingest/processes/connectors/sql/sql.py +456 -0
  204. unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
  205. unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
  206. unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
  207. unstructured_ingest/processes/connectors/utils.py +60 -0
  208. unstructured_ingest/processes/connectors/vectara.py +348 -0
  209. unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
  210. unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
  211. unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
  212. unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
  213. unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
  214. unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
  215. unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
  216. unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
  217. unstructured_ingest/processes/embedder.py +203 -0
  218. unstructured_ingest/processes/filter.py +60 -0
  219. unstructured_ingest/processes/partitioner.py +233 -0
  220. unstructured_ingest/processes/uncompress.py +61 -0
  221. unstructured_ingest/processes/utils/__init__.py +8 -0
  222. unstructured_ingest/processes/utils/blob_storage.py +32 -0
  223. unstructured_ingest/processes/utils/logging/connector.py +365 -0
  224. unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
  225. unstructured_ingest/unstructured_api.py +140 -0
  226. unstructured_ingest/utils/__init__.py +5 -0
  227. unstructured_ingest/utils/chunking.py +56 -0
  228. unstructured_ingest/utils/compression.py +72 -0
  229. unstructured_ingest/utils/constants.py +2 -0
  230. unstructured_ingest/utils/data_prep.py +216 -0
  231. unstructured_ingest/utils/dep_check.py +78 -0
  232. unstructured_ingest/utils/filesystem.py +27 -0
  233. unstructured_ingest/utils/html.py +174 -0
  234. unstructured_ingest/utils/ndjson.py +52 -0
  235. unstructured_ingest/utils/pydantic_models.py +52 -0
  236. unstructured_ingest/utils/string_and_date_utils.py +74 -0
  237. unstructured_ingest/utils/table.py +80 -0
  238. unstructured_ingest/utils/tls.py +15 -0
  239. unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
  240. unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
  241. unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
  242. unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
  243. unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
@@ -0,0 +1,254 @@
1
+ import json
2
+ from contextlib import contextmanager
3
+ from dataclasses import dataclass, field
4
+ from typing import TYPE_CHECKING, Generator, Optional
5
+
6
+ from pydantic import Field, Secret
7
+
8
+ from unstructured_ingest.data_types.file_data import FileData
9
+ from unstructured_ingest.logger import logger
10
+ from unstructured_ingest.processes.connector_registry import (
11
+ DestinationRegistryEntry,
12
+ SourceRegistryEntry,
13
+ )
14
+ from unstructured_ingest.processes.connectors.sql.sql import (
15
+ SQLAccessConfig,
16
+ SqlBatchFileData,
17
+ SQLConnectionConfig,
18
+ SQLDownloader,
19
+ SQLDownloaderConfig,
20
+ SQLIndexer,
21
+ SQLIndexerConfig,
22
+ SQLUploader,
23
+ SQLUploaderConfig,
24
+ SQLUploadStager,
25
+ SQLUploadStagerConfig,
26
+ )
27
+ from unstructured_ingest.utils.data_prep import split_dataframe
28
+ from unstructured_ingest.utils.dep_check import requires_dependencies
29
+
30
+ if TYPE_CHECKING:
31
+ from pandas import DataFrame
32
+ from teradatasql import TeradataConnection, TeradataCursor
33
+
34
# Registry key under which this connector is exposed to the ingest pipeline.
CONNECTOR_TYPE = "teradata"
35
+
36
+
37
class TeradataAccessConfig(SQLAccessConfig):
    """Secret credentials needed to authenticate against a Teradata server."""

    password: str = Field(description="Teradata user password")
39
+
40
+
41
class TeradataConnectionConfig(SQLConnectionConfig):
    """Connection settings for a Teradata database.

    Provides context-managed access to a ``teradatasql`` connection and to a
    cursor bound to a fresh connection.
    """

    access_config: Secret[TeradataAccessConfig]
    host: str = Field(description="Teradata server hostname or IP address")
    user: str = Field(description="Teradata database username")
    database: Optional[str] = Field(
        default=None,
        description="Default database/schema to use for queries",
    )
    dbs_port: int = Field(
        default=1025,
        description="Teradata database port (default: 1025)",
    )
    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)

    @contextmanager
    @requires_dependencies(["teradatasql"], extras="teradata")
    def get_connection(self) -> Generator["TeradataConnection", None, None]:
        """Yield a live Teradata connection; commits and closes it on exit."""
        from teradatasql import connect

        kwargs = dict(
            host=self.host,
            user=self.user,
            password=self.access_config.get_secret_value().password,
            dbs_port=self.dbs_port,
            charset="UTF8",
        )
        # Only pass a default database when one was configured.
        if self.database:
            kwargs["database"] = self.database

        conn = connect(**kwargs)
        try:
            yield conn
        finally:
            # NOTE(review): commit runs in ``finally`` so it also fires on error
            # paths — preserved from the original implementation.
            conn.commit()
            conn.close()

    @contextmanager
    def get_cursor(self) -> Generator["TeradataCursor", None, None]:
        """Yield a cursor on a fresh connection; closes cursor then connection."""
        with self.get_connection() as conn:
            cur = conn.cursor()
            try:
                yield cur
            finally:
                cur.close()
85
+
86
+
87
class TeradataIndexerConfig(SQLIndexerConfig):
    """Indexer options for Teradata; inherits everything from the SQL base config."""
89
+
90
+
91
@dataclass
class TeradataIndexer(SQLIndexer):
    """Enumerates document ids from the configured Teradata table."""

    connection_config: TeradataConnectionConfig
    index_config: TeradataIndexerConfig
    connector_type: str = CONNECTOR_TYPE

    def _get_doc_ids(self) -> list[str]:
        """Override to quote identifiers for Teradata reserved word handling."""
        query = 'SELECT "{id_col}" FROM "{table}"'.format(
            id_col=self.index_config.id_column,
            table=self.index_config.table_name,
        )
        with self.get_cursor() as cursor:
            cursor.execute(query)
            rows = cursor.fetchall()
        return sorted(row[0] for row in rows)
106
+
107
+
108
class TeradataDownloaderConfig(SQLDownloaderConfig):
    """Downloader options for Teradata; inherits everything from the SQL base config."""
110
+
111
+
112
@dataclass
class TeradataDownloader(SQLDownloader):
    """Fetches the rows belonging to a batch of document ids from Teradata."""

    connection_config: TeradataConnectionConfig
    download_config: TeradataDownloaderConfig
    connector_type: str = CONNECTOR_TYPE
    # teradatasql uses qmark-style bind parameters.
    values_delimiter: str = "?"

    def query_db(self, file_data: SqlBatchFileData) -> tuple[list[tuple], list[str]]:
        """Run a parameterized SELECT for the batch; returns (rows, column names)."""
        meta = file_data.additional_metadata
        ids = [item.identifier for item in file_data.batch_items]

        # Quote requested columns, or select everything when none are configured.
        if self.download_config.fields:
            projection = ",".join(f'"{name}"' for name in self.download_config.fields)
        else:
            projection = "*"

        placeholders = ",".join(self.values_delimiter for _ in ids)
        query = (
            f'SELECT {projection} FROM "{meta.table_name}" '
            f'WHERE "{meta.id_column}" IN ({placeholders})'
        )

        with self.connection_config.get_cursor() as cursor:
            logger.debug(f"running query: {query}\nwith values: {ids}")
            cursor.execute(query, ids)
            rows = cursor.fetchall()
            columns = [col[0] for col in cursor.description]
        return rows, columns
138
+
139
+
140
class TeradataUploadStagerConfig(SQLUploadStagerConfig):
    """Stager options for Teradata; inherits everything from the SQL base config."""
142
+
143
+
144
@dataclass
class TeradataUploadStager(SQLUploadStager):
    """Prepares dataframes for Teradata by JSON-encoding complex values."""

    upload_stager_config: TeradataUploadStagerConfig = field(
        default_factory=TeradataUploadStagerConfig
    )

    def conform_dataframe(self, df: "DataFrame") -> "DataFrame":
        """Apply the base conforming, then serialize list/dict cells to JSON.

        The teradatasql driver cannot handle Python lists/dicts, so columns
        containing them are converted to JSON strings. Only a sample of up to
        10 non-null values per column is inspected to detect complex types.
        """
        df = super().conform_dataframe(df)

        def _jsonify(value):
            # Leave scalars untouched; only containers get encoded.
            return json.dumps(value) if isinstance(value, (list, dict)) else value

        for column in df.columns:
            probe = df[column].dropna().head(10)
            if len(probe) == 0:
                continue
            if probe.apply(lambda v: isinstance(v, (list, dict))).any():
                df[column] = df[column].apply(_jsonify)

        return df
169
+
170
+
171
class TeradataUploaderConfig(SQLUploaderConfig):
    """Uploader options for Teradata; inherits everything from the SQL base config."""
173
+
174
+
175
@dataclass
class TeradataUploader(SQLUploader):
    """Writes staged dataframes into a Teradata table in batches."""

    upload_config: TeradataUploaderConfig = field(default_factory=TeradataUploaderConfig)
    connection_config: TeradataConnectionConfig
    connector_type: str = CONNECTOR_TYPE
    # teradatasql uses qmark-style bind parameters.
    values_delimiter: str = "?"

    def get_table_columns(self) -> list[str]:
        """Return (and cache) the destination table's column names."""
        if self._columns is None:
            probe = f'SELECT TOP 1 * FROM "{self.upload_config.table_name}"'
            with self.get_cursor() as cursor:
                cursor.execute(probe)
                self._columns = [col[0] for col in cursor.description]
        return self._columns

    def delete_by_record_id(self, file_data: FileData) -> None:
        """Delete any rows previously written for this record id."""
        logger.debug(
            f"deleting any content with data "
            f"{self.upload_config.record_id_key}={file_data.identifier} "
            f"from table {self.upload_config.table_name}"
        )
        stmt = (
            f'DELETE FROM "{self.upload_config.table_name}" '
            f'WHERE "{self.upload_config.record_id_key}" = {self.values_delimiter}'
        )
        with self.get_cursor() as cursor:
            cursor.execute(stmt, [file_data.identifier])
            deleted = cursor.rowcount
            if deleted > 0:
                logger.info(f"deleted {deleted} rows from table {self.upload_config.table_name}")

    def upload_dataframe(self, df: "DataFrame", file_data: FileData) -> None:
        """Fit the dataframe to the table schema and insert it batch by batch."""
        import numpy as np

        # Clear out any earlier upload for the same record, when possible.
        if self.can_delete():
            self.delete_by_record_id(file_data=file_data)
        else:
            logger.warning(
                f"table doesn't contain expected "
                f"record id column "
                f"{self.upload_config.record_id_key}, skipping delete"
            )

        df = self._fit_to_schema(df=df)
        # The driver expects SQL NULLs, not NaN.
        df.replace({np.nan: None}, inplace=True)

        column_names = list(df.columns)
        col_clause = ",".join(f'"{name}"' for name in column_names)
        val_clause = ",".join(self.values_delimiter for _ in column_names)
        stmt = (
            f'INSERT INTO "{self.upload_config.table_name}" '
            f"({col_clause}) VALUES({val_clause})"
        )

        logger.info(
            f"writing a total of {len(df)} elements via"
            f" document batches to destination"
            f" table named {self.upload_config.table_name}"
            f" with batch size {self.upload_config.batch_size}"
        )
        for chunk in split_dataframe(df=df, chunk_size=self.upload_config.batch_size):
            payload = self.prepare_data(
                column_names, tuple(chunk.itertuples(index=False, name=None))
            )
            with self.get_cursor() as cursor:
                logger.debug(f"running query: {stmt}")
                cursor.executemany(stmt, payload)
238
+
239
+
240
# Register Teradata as a source connector (index + download)...
teradata_source_entry = SourceRegistryEntry(
    indexer=TeradataIndexer,
    indexer_config=TeradataIndexerConfig,
    downloader=TeradataDownloader,
    downloader_config=TeradataDownloaderConfig,
    connection_config=TeradataConnectionConfig,
)

# ...and as a destination connector (stage + upload).
teradata_destination_entry = DestinationRegistryEntry(
    uploader=TeradataUploader,
    uploader_config=TeradataUploaderConfig,
    upload_stager=TeradataUploadStager,
    upload_stager_config=TeradataUploadStagerConfig,
    connection_config=TeradataConnectionConfig,
)
@@ -0,0 +1,263 @@
1
+ from contextlib import contextmanager
2
+ from dataclasses import dataclass, field
3
+ from pathlib import Path
4
+ from typing import TYPE_CHECKING, Any, Optional
5
+
6
+ from pydantic import Field, Secret
7
+
8
+ from unstructured_ingest.data_types.file_data import (
9
+ FileData,
10
+ )
11
+ from unstructured_ingest.error import DestinationConnectionError
12
+ from unstructured_ingest.logger import logger
13
+ from unstructured_ingest.processes.connector_registry import (
14
+ DestinationRegistryEntry,
15
+ SourceRegistryEntry,
16
+ )
17
+ from unstructured_ingest.processes.connectors.sql.sql import (
18
+ SQLAccessConfig,
19
+ SqlBatchFileData,
20
+ SQLConnectionConfig,
21
+ SQLDownloader,
22
+ SQLDownloaderConfig,
23
+ SQLIndexer,
24
+ SQLIndexerConfig,
25
+ SQLUploader,
26
+ SQLUploaderConfig,
27
+ SQLUploadStager,
28
+ SQLUploadStagerConfig,
29
+ )
30
+ from unstructured_ingest.utils.constants import RECORD_ID_LABEL
31
+ from unstructured_ingest.utils.data_prep import get_enhanced_element_id, split_dataframe
32
+ from unstructured_ingest.utils.dep_check import requires_dependencies
33
+
34
+ if TYPE_CHECKING:
35
+ from pandas import DataFrame
36
+ from vastdb import connect as VastdbConnect
37
+ from vastdb import transaction as VastdbTransaction
38
+ from vastdb.table import Table as VastdbTable
39
+
40
# Registry key under which this connector is exposed to the ingest pipeline.
CONNECTOR_TYPE = "vastdb"
41
+
42
+
43
class VastdbAccessConfig(SQLAccessConfig):
    """Optional endpoint and key-pair credentials for a VAST DB cluster."""

    endpoint: Optional[str] = Field(default=None, description="DB endpoint")
    access_key_id: Optional[str] = Field(default=None, description="access key id")
    access_key_secret: Optional[str] = Field(default=None, description="access key secret")
47
+
48
+
49
class VastdbConnectionConfig(SQLConnectionConfig):
    """Connection settings for VAST DB.

    Exposes context-managed helpers for the session (``get_connection``), an
    open transaction (``get_cursor``) and a table handle (``get_table``).
    """

    access_config: Secret[VastdbAccessConfig] = Field(
        default=VastdbAccessConfig(), validate_default=True
    )
    vastdb_bucket: str
    vastdb_schema: str
    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)

    @requires_dependencies(["vastdb"], extras="vastdb")
    @contextmanager
    def get_connection(self) -> "VastdbConnect":
        """Yield a vastdb session built from the configured credentials."""
        from vastdb import connect

        creds = self.access_config.get_secret_value()
        yield connect(
            endpoint=creds.endpoint,
            access=creds.access_key_id,
            secret=creds.access_key_secret,
        )

    @contextmanager
    def get_cursor(self) -> "VastdbTransaction":
        """Yield an open transaction on a fresh session."""
        with self.get_connection() as session:
            with session.transaction() as tx:
                yield tx

    @contextmanager
    def get_table(self, table_name: str) -> "VastdbTable":
        """Yield the named table inside the configured bucket/schema."""
        with self.get_cursor() as tx:
            handle = tx.bucket(self.vastdb_bucket).schema(self.vastdb_schema)
            yield handle.table(table_name)
82
+
83
+
84
class VastdbIndexerConfig(SQLIndexerConfig):
    """Indexer options for VAST DB; inherits everything from the SQL base config."""
86
+
87
+
88
@dataclass
class VastdbIndexer(SQLIndexer):
    """Enumerates document ids from the configured VAST DB table."""

    connection_config: VastdbConnectionConfig
    index_config: VastdbIndexerConfig
    connector_type: str = CONNECTOR_TYPE

    def _get_doc_ids(self) -> list[str]:
        """Read the id column via PyArrow and return the ids sorted."""
        id_col = self.index_config.id_column
        with self.connection_config.get_table(self.index_config.table_name) as table:
            reader = table.select(columns=[id_col])
            # Build a PyArrow Table from the RecordBatchReader.
            arrow_table = reader.read_all()
        return sorted(row[id_col] for row in arrow_table.to_pylist())

    def precheck(self) -> None:
        """Validate connectivity by issuing a select against the table."""
        try:
            with self.connection_config.get_table(self.index_config.table_name) as table:
                table.select()
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            # NOTE(review): raises DestinationConnectionError even though this is
            # the source-side indexer — preserved from the original; confirm
            # whether SourceConnectionError was intended.
            raise DestinationConnectionError(f"failed to validate connection: {e}")
108
+
109
+
110
class VastdbDownloaderConfig(SQLDownloaderConfig):
    """Downloader options for VAST DB; inherits everything from the SQL base config."""
112
+
113
+
114
@dataclass
class VastdbDownloader(SQLDownloader):
    """Fetches the rows for a batch of document ids from a VAST DB table."""

    connection_config: VastdbConnectionConfig
    download_config: VastdbDownloaderConfig
    connector_type: str = CONNECTOR_TYPE

    @requires_dependencies(["ibis"], extras="vastdb")
    def query_db(self, file_data: SqlBatchFileData) -> tuple[list[tuple], list[str]]:
        """Select the batch's rows using an Ibis deferred predicate on the id column."""
        from ibis import _  # imports the Ibis deferred expression

        meta = file_data.additional_metadata
        batch_ids = tuple(item.identifier for item in file_data.batch_items)

        with self.connection_config.get_table(meta.table_name) as table:
            predicate = _[meta.id_column].isin(batch_ids)

            if self.download_config.fields:
                # Vastdb requires the id column to be included in the fields;
                # dict.fromkeys removes duplicates while keeping order.
                wanted = list(dict.fromkeys(self.download_config.fields + [meta.id_column]))
                reader = table.select(columns=wanted, predicate=predicate)
            else:
                reader = table.select(predicate=predicate)

            arrow_table = reader.read_all()
        frame = arrow_table.to_pandas()
        return [tuple(row) for row in frame.to_numpy()], arrow_table.column_names
141
+
142
+
143
class VastdbUploadStagerConfig(SQLUploadStagerConfig):
    """Stager options for VAST DB, including optional column renaming."""

    rename_columns_map: Optional[dict] = Field(
        default=None,
        description="Map of column names to rename, ex: {'old_name': 'new_name'}",
    )
148
+
149
+
150
@dataclass
class VastdbUploadStager(SQLUploadStager):
    """Flattens element dicts and conforms dataframes for VAST DB upload."""

    upload_stager_config: VastdbUploadStagerConfig = field(default_factory=VastdbUploadStagerConfig)

    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
        """Merge metadata, data_source and coordinates into a flat dict and tag ids."""
        flattened = element_dict.copy()
        metadata: dict[str, Any] = flattened.pop("metadata", {})
        data_source = metadata.pop("data_source", {})
        coordinates = metadata.pop("coordinates", {})

        for extra in (metadata, data_source, coordinates):
            flattened.update(extra)

        flattened["id"] = get_enhanced_element_id(element_dict=flattened, file_data=file_data)
        flattened[RECORD_ID_LABEL] = file_data.identifier
        return flattened

    def conform_dataframe(self, df: "DataFrame") -> "DataFrame":
        """Apply base conforming, then any configured column renames."""
        df = super().conform_dataframe(df=df)
        rename_map = self.upload_stager_config.rename_columns_map
        if rename_map:
            df.rename(columns=rename_map, inplace=True)
        return df
173
+
174
+
175
class VastdbUploaderConfig(SQLUploaderConfig):
    # No VastDB-specific upload options; inherits everything from SQLUploaderConfig.
    pass
177
+
178
+
179
@dataclass
class VastdbUploader(SQLUploader):
    """Uploads staged element rows into a VastDB table, batch by batch."""

    # NOTE(review): upload_config declares a default while connection_config does not;
    # dataclass field ordering is inherited from SQLUploader, so this presumably relies
    # on the base class having already declared connection_config — confirm there.
    upload_config: VastdbUploaderConfig = field(default_factory=VastdbUploaderConfig)
    connection_config: VastdbConnectionConfig
    connector_type: str = CONNECTOR_TYPE

    def precheck(self) -> None:
        """Validate connectivity by opening the target table and building a select.

        Raises:
            DestinationConnectionError: if the table cannot be opened or queried.
        """
        try:
            with self.connection_config.get_table(self.upload_config.table_name) as table:
                table.select()
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}")

    @requires_dependencies(["pandas"], extras="vastdb")
    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        """Delegate to the base SQL uploader; decorated only to enforce the pandas extra."""
        super().run(path=path, file_data=file_data, **kwargs)

    @requires_dependencies(["pyarrow", "pandas"], extras="vastdb")
    def upload_dataframe(self, df: "DataFrame", file_data: FileData) -> None:
        """Replace any previously-uploaded rows for this record, then insert df in batches."""
        import numpy as np
        import pyarrow as pa

        # Delete-then-insert gives upsert-like semantics per source record.
        if self.can_delete():
            self.delete_by_record_id(file_data=file_data)
        else:
            logger.warning(
                f"table doesn't contain expected "
                f"record id column "
                f"{self.upload_config.record_id_key}, skipping delete"
            )
        # Normalize NaN to None before handing rows to pyarrow/VastDB.
        df.replace({np.nan: None}, inplace=True)
        df = self._fit_to_schema(df=df)

        logger.info(
            f"writing a total of {len(df)} elements via"
            f" document batches to destination"
            f" table named {self.upload_config.table_name}"
            f" with batch size {self.upload_config.batch_size}"
        )

        # A fresh table handle is opened for every batch via the context manager.
        for rows in split_dataframe(df=df, chunk_size=self.upload_config.batch_size):
            with self.connection_config.get_table(self.upload_config.table_name) as table:
                pa_table = pa.Table.from_pandas(rows)
                table.insert(pa_table)

    def get_table_columns(self) -> list[str]:
        """Return the destination table's column names, cached after the first lookup."""
        # NOTE(review): assumes self._columns starts as None on the SQLUploader base — verify.
        if self._columns is None:
            with self.connection_config.get_table(self.upload_config.table_name) as table:
                self._columns = table.columns().names
        return self._columns

    @requires_dependencies(["ibis"], extras="vastdb")
    def delete_by_record_id(self, file_data: FileData) -> None:
        """Delete all rows previously written for *file_data*'s record id."""
        from ibis import _  # imports the Ibis deferred expression

        logger.debug(
            f"deleting any content with data "
            f"{self.upload_config.record_id_key}={file_data.identifier} "
            f"from table {self.upload_config.table_name}"
        )
        predicate = _[self.upload_config.record_id_key].isin([file_data.identifier])
        with self.connection_config.get_table(self.upload_config.table_name) as table:
            # Get the internal row id
            rows_to_delete = table.select(
                columns=[], predicate=predicate, internal_row_id=True
            ).read_all()
            table.delete(rows_to_delete)
247
+
248
+
249
# Registry entry exposing VastDB as an ingest source (indexer + downloader).
vastdb_source_entry = SourceRegistryEntry(
    connection_config=VastdbConnectionConfig,
    indexer_config=VastdbIndexerConfig,
    indexer=VastdbIndexer,
    downloader_config=VastdbDownloaderConfig,
    downloader=VastdbDownloader,
)

# Registry entry exposing VastDB as an ingest destination (stager + uploader).
vastdb_destination_entry = DestinationRegistryEntry(
    connection_config=VastdbConnectionConfig,
    uploader=VastdbUploader,
    uploader_config=VastdbUploaderConfig,
    upload_stager=VastdbUploadStager,
    upload_stager_config=VastdbUploadStagerConfig,
)
@@ -0,0 +1,60 @@
1
+ import json
2
+ from datetime import datetime
3
+ from typing import Any, Union
4
+
5
+ from dateutil import parser
6
+ from pydantic import ValidationError
7
+
8
+ from unstructured_ingest.utils.chunking import elements_from_base64_gzipped_json
9
+
10
+
11
def parse_datetime(date_value: Union[int, str, float, datetime]) -> datetime:
    """Coerce an epoch number, numeric string, or date string into a datetime.

    Floats are treated as epoch seconds, ints as epoch milliseconds, numeric
    strings as epoch seconds, and anything else is handed to dateutil's parser.
    """
    if isinstance(date_value, datetime):
        return date_value
    # The float check must precede the int check: floats are read as seconds
    # while ints are assumed to be millisecond timestamps.
    if isinstance(date_value, float):
        return datetime.fromtimestamp(date_value)
    if isinstance(date_value, int):
        return datetime.fromtimestamp(date_value / 1000)

    try:
        return datetime.fromtimestamp(float(date_value))
    except ValueError:
        # Not a usable numeric string; fall back to flexible date-string parsing.
        return parser.parse(date_value)
24
+
25
+
26
def conform_string_to_dict(value: Any) -> dict:
    """Return *value* as a dict, JSON-decoding it if it arrives as a string.

    Raises:
        ValueError: if the input is neither a dict nor a JSON string encoding
            a dict (note json.JSONDecodeError is a subclass of ValueError).
    """
    if isinstance(value, dict):
        return value
    if isinstance(value, str):
        parsed = json.loads(value)
        # Enforce the declared return type: a JSON string may decode to a
        # list/number/etc., which is not a valid mapping.
        if not isinstance(parsed, dict):
            raise ValueError(f"Input could not be mapped to a valid dict: {value}")
        return parsed
    # Bug fix: pydantic's ValidationError cannot be constructed from a plain
    # message string (doing so raises TypeError), so the original raise itself
    # crashed. ValueError keeps callers catching ValueError working, since
    # pydantic's ValidationError derives from it.
    raise ValueError(f"Input could not be mapped to a valid dict: {value}")
32
+
33
+
34
def format_and_truncate_orig_elements(
    element: dict, include_text: bool = False
) -> list[dict[str, Any]]:
    """
    Format and truncate the orig_elements field in *element*'s metadata.

    Removes the text field (unless *include_text* is True) and other large
    metadata fields from each original element, since those are not helpful
    for filtering/searching when used along with chunked elements.

    Args:
        element: A chunked-element dict whose metadata may carry a
            base64-gzipped ``orig_elements`` payload.
        include_text: When True, keep each original element's ``text`` field.

    Returns:
        The decoded, slimmed-down original elements (empty list when absent).
    """
    # Metadata fields that are large and unhelpful for search/filtering.
    bulky_metadata_fields = (
        "image_base64",
        "text_as_html",
        "table_as_cells",
        "link_urls",
        "link_texts",
        "link_start_indexes",
        "emphasized_text_contents",
    )
    raw_orig_elements = element.get("metadata", {}).get("orig_elements")
    if raw_orig_elements is None:
        return []
    orig_elements = []
    # Bug fix: the loop variable no longer shadows the `element` parameter.
    for orig_element in elements_from_base64_gzipped_json(raw_orig_elements):
        if not include_text:
            orig_element.pop("text", None)
        # Bug fix: guard the metadata lookup — the original indexed
        # orig_element["metadata"] directly and raised KeyError when absent.
        orig_metadata = orig_element.get("metadata")
        if orig_metadata is not None:
            for prop in bulky_metadata_fields:
                orig_metadata.pop(prop, None)
        orig_elements.append(orig_element)
    return orig_elements