unstructured-ingest 1.2.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic; review the details below.

Files changed (243):
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/README.md +28 -0
  4. unstructured_ingest/cli/__init__.py +0 -0
  5. unstructured_ingest/cli/base/__init__.py +4 -0
  6. unstructured_ingest/cli/base/cmd.py +269 -0
  7. unstructured_ingest/cli/base/dest.py +84 -0
  8. unstructured_ingest/cli/base/importer.py +34 -0
  9. unstructured_ingest/cli/base/src.py +75 -0
  10. unstructured_ingest/cli/cli.py +24 -0
  11. unstructured_ingest/cli/cmds.py +14 -0
  12. unstructured_ingest/cli/utils/__init__.py +0 -0
  13. unstructured_ingest/cli/utils/click.py +237 -0
  14. unstructured_ingest/cli/utils/model_conversion.py +222 -0
  15. unstructured_ingest/data_types/__init__.py +0 -0
  16. unstructured_ingest/data_types/entities.py +17 -0
  17. unstructured_ingest/data_types/file_data.py +116 -0
  18. unstructured_ingest/embed/__init__.py +0 -0
  19. unstructured_ingest/embed/azure_openai.py +63 -0
  20. unstructured_ingest/embed/bedrock.py +323 -0
  21. unstructured_ingest/embed/huggingface.py +69 -0
  22. unstructured_ingest/embed/interfaces.py +146 -0
  23. unstructured_ingest/embed/mixedbreadai.py +134 -0
  24. unstructured_ingest/embed/octoai.py +133 -0
  25. unstructured_ingest/embed/openai.py +142 -0
  26. unstructured_ingest/embed/togetherai.py +116 -0
  27. unstructured_ingest/embed/vertexai.py +109 -0
  28. unstructured_ingest/embed/voyageai.py +130 -0
  29. unstructured_ingest/error.py +156 -0
  30. unstructured_ingest/errors_v2.py +156 -0
  31. unstructured_ingest/interfaces/__init__.py +27 -0
  32. unstructured_ingest/interfaces/connector.py +56 -0
  33. unstructured_ingest/interfaces/downloader.py +90 -0
  34. unstructured_ingest/interfaces/indexer.py +29 -0
  35. unstructured_ingest/interfaces/process.py +22 -0
  36. unstructured_ingest/interfaces/processor.py +88 -0
  37. unstructured_ingest/interfaces/upload_stager.py +89 -0
  38. unstructured_ingest/interfaces/uploader.py +67 -0
  39. unstructured_ingest/logger.py +39 -0
  40. unstructured_ingest/main.py +11 -0
  41. unstructured_ingest/otel.py +128 -0
  42. unstructured_ingest/pipeline/__init__.py +0 -0
  43. unstructured_ingest/pipeline/interfaces.py +211 -0
  44. unstructured_ingest/pipeline/otel.py +32 -0
  45. unstructured_ingest/pipeline/pipeline.py +408 -0
  46. unstructured_ingest/pipeline/steps/__init__.py +0 -0
  47. unstructured_ingest/pipeline/steps/chunk.py +78 -0
  48. unstructured_ingest/pipeline/steps/download.py +206 -0
  49. unstructured_ingest/pipeline/steps/embed.py +77 -0
  50. unstructured_ingest/pipeline/steps/filter.py +35 -0
  51. unstructured_ingest/pipeline/steps/index.py +86 -0
  52. unstructured_ingest/pipeline/steps/partition.py +77 -0
  53. unstructured_ingest/pipeline/steps/stage.py +65 -0
  54. unstructured_ingest/pipeline/steps/uncompress.py +50 -0
  55. unstructured_ingest/pipeline/steps/upload.py +58 -0
  56. unstructured_ingest/processes/__init__.py +18 -0
  57. unstructured_ingest/processes/chunker.py +131 -0
  58. unstructured_ingest/processes/connector_registry.py +69 -0
  59. unstructured_ingest/processes/connectors/__init__.py +129 -0
  60. unstructured_ingest/processes/connectors/airtable.py +238 -0
  61. unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
  62. unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
  63. unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
  64. unstructured_ingest/processes/connectors/astradb.py +592 -0
  65. unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
  66. unstructured_ingest/processes/connectors/chroma.py +193 -0
  67. unstructured_ingest/processes/connectors/confluence.py +527 -0
  68. unstructured_ingest/processes/connectors/couchbase.py +336 -0
  69. unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
  70. unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
  71. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
  72. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
  73. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
  74. unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
  75. unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
  76. unstructured_ingest/processes/connectors/delta_table.py +310 -0
  77. unstructured_ingest/processes/connectors/discord.py +161 -0
  78. unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
  79. unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
  80. unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
  81. unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
  82. unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
  83. unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
  84. unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
  85. unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
  86. unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
  87. unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
  88. unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
  89. unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
  90. unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
  91. unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
  92. unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
  93. unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
  94. unstructured_ingest/processes/connectors/github.py +226 -0
  95. unstructured_ingest/processes/connectors/gitlab.py +270 -0
  96. unstructured_ingest/processes/connectors/google_drive.py +848 -0
  97. unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
  98. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
  99. unstructured_ingest/processes/connectors/jira.py +522 -0
  100. unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
  101. unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
  102. unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
  103. unstructured_ingest/processes/connectors/kafka/local.py +103 -0
  104. unstructured_ingest/processes/connectors/kdbai.py +156 -0
  105. unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
  106. unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
  107. unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
  108. unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
  109. unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
  110. unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
  111. unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
  112. unstructured_ingest/processes/connectors/local.py +227 -0
  113. unstructured_ingest/processes/connectors/milvus.py +311 -0
  114. unstructured_ingest/processes/connectors/mongodb.py +389 -0
  115. unstructured_ingest/processes/connectors/neo4j.py +534 -0
  116. unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
  117. unstructured_ingest/processes/connectors/notion/client.py +349 -0
  118. unstructured_ingest/processes/connectors/notion/connector.py +350 -0
  119. unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
  120. unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
  121. unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
  122. unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
  123. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  124. unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
  125. unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
  126. unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
  127. unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
  128. unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  129. unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  130. unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  131. unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
  132. unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
  133. unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
  134. unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
  135. unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
  136. unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
  137. unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
  138. unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
  139. unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
  140. unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
  141. unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
  142. unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  143. unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  144. unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  145. unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  146. unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
  147. unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
  148. unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
  149. unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
  150. unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  151. unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
  152. unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
  153. unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
  154. unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  155. unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
  156. unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
  157. unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
  158. unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
  159. unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
  160. unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
  161. unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
  162. unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
  163. unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
  164. unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
  165. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  166. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
  167. unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
  168. unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
  169. unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
  170. unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
  171. unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
  172. unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
  173. unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
  174. unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
  175. unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
  176. unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
  177. unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
  178. unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
  179. unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
  180. unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
  181. unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
  182. unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
  183. unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
  184. unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
  185. unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
  186. unstructured_ingest/processes/connectors/onedrive.py +485 -0
  187. unstructured_ingest/processes/connectors/outlook.py +242 -0
  188. unstructured_ingest/processes/connectors/pinecone.py +400 -0
  189. unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
  190. unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
  191. unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
  192. unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
  193. unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
  194. unstructured_ingest/processes/connectors/redisdb.py +214 -0
  195. unstructured_ingest/processes/connectors/salesforce.py +307 -0
  196. unstructured_ingest/processes/connectors/sharepoint.py +282 -0
  197. unstructured_ingest/processes/connectors/slack.py +249 -0
  198. unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
  199. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
  200. unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
  201. unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
  202. unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
  203. unstructured_ingest/processes/connectors/sql/sql.py +456 -0
  204. unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
  205. unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
  206. unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
  207. unstructured_ingest/processes/connectors/utils.py +60 -0
  208. unstructured_ingest/processes/connectors/vectara.py +348 -0
  209. unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
  210. unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
  211. unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
  212. unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
  213. unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
  214. unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
  215. unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
  216. unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
  217. unstructured_ingest/processes/embedder.py +203 -0
  218. unstructured_ingest/processes/filter.py +60 -0
  219. unstructured_ingest/processes/partitioner.py +233 -0
  220. unstructured_ingest/processes/uncompress.py +61 -0
  221. unstructured_ingest/processes/utils/__init__.py +8 -0
  222. unstructured_ingest/processes/utils/blob_storage.py +32 -0
  223. unstructured_ingest/processes/utils/logging/connector.py +365 -0
  224. unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
  225. unstructured_ingest/unstructured_api.py +140 -0
  226. unstructured_ingest/utils/__init__.py +5 -0
  227. unstructured_ingest/utils/chunking.py +56 -0
  228. unstructured_ingest/utils/compression.py +72 -0
  229. unstructured_ingest/utils/constants.py +2 -0
  230. unstructured_ingest/utils/data_prep.py +216 -0
  231. unstructured_ingest/utils/dep_check.py +78 -0
  232. unstructured_ingest/utils/filesystem.py +27 -0
  233. unstructured_ingest/utils/html.py +174 -0
  234. unstructured_ingest/utils/ndjson.py +52 -0
  235. unstructured_ingest/utils/pydantic_models.py +52 -0
  236. unstructured_ingest/utils/string_and_date_utils.py +74 -0
  237. unstructured_ingest/utils/table.py +80 -0
  238. unstructured_ingest/utils/tls.py +15 -0
  239. unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
  240. unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
  241. unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
  242. unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
  243. unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
@@ -0,0 +1,456 @@
1
+ import hashlib
2
+ import json
3
+ from abc import ABC, abstractmethod
4
+ from contextlib import contextmanager
5
+ from dataclasses import dataclass, field
6
+ from datetime import datetime
7
+ from pathlib import Path
8
+ from time import time
9
+ from typing import TYPE_CHECKING, Any, Generator, Union
10
+
11
+ from dateutil import parser
12
+ from pydantic import BaseModel, Field, Secret
13
+
14
+ from unstructured_ingest.data_types.file_data import (
15
+ BatchFileData,
16
+ BatchItem,
17
+ FileData,
18
+ FileDataSourceMetadata,
19
+ SourceIdentifiers,
20
+ )
21
+ from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
22
+ from unstructured_ingest.interfaces import (
23
+ AccessConfig,
24
+ ConnectionConfig,
25
+ Downloader,
26
+ DownloaderConfig,
27
+ DownloadResponse,
28
+ Indexer,
29
+ IndexerConfig,
30
+ Uploader,
31
+ UploaderConfig,
32
+ UploadStager,
33
+ UploadStagerConfig,
34
+ download_responses,
35
+ )
36
+ from unstructured_ingest.logger import logger
37
+ from unstructured_ingest.utils.constants import RECORD_ID_LABEL
38
+ from unstructured_ingest.utils.data_prep import (
39
+ get_data_df,
40
+ get_enhanced_element_id,
41
+ get_json_data,
42
+ split_dataframe,
43
+ write_data,
44
+ )
45
+
46
+ if TYPE_CHECKING:
47
+ from pandas import DataFrame
48
+
49
# Metadata columns holding date/time values: the stager converts them to epoch
# floats and SQLUploader.prepare_data parses them back into datetime objects.
_DATE_COLUMNS = ("date_created", "date_modified", "date_processed", "last_modified")
50
+
51
+
52
class SqlAdditionalMetadata(BaseModel):
    """Per-batch context identifying the source table and its unique-id column."""

    # Table the batch of ids was read from.
    table_name: str
    # Column used as the per-row unique identifier.
    id_column: str
55
+
56
+
57
class SqlBatchFileData(BatchFileData):
    """Batch file data enriched with the SQL table/id-column context."""

    additional_metadata: SqlAdditionalMetadata
59
+
60
+
61
def parse_date_string(date_value: Union[str, int]) -> datetime:
    """Best-effort conversion of *date_value* into a datetime.

    Tries, in order: a numeric epoch timestamp (ints are treated as epoch
    milliseconds, numeric strings as epoch seconds), an ISO-8601 string, and
    finally dateutil's flexible parser.
    """
    # Numeric path first: ints are epoch milliseconds, numeric strings seconds.
    try:
        if isinstance(date_value, int):
            seconds = float(date_value) / 1000
        else:
            seconds = float(date_value)
        return datetime.fromtimestamp(seconds)
    except Exception as e:
        logger.debug(f"date {date_value} string not a timestamp: {e}")

    # Strings that are not timestamps: try strict ISO-8601 before falling back.
    if isinstance(date_value, str):
        try:
            return datetime.fromisoformat(date_value)
        except Exception:
            pass
    # Last resort: dateutil handles a wide range of human-readable formats.
    return parser.parse(date_value)
74
+
75
+
76
class SQLAccessConfig(AccessConfig):
    """Base access config for SQL connectors; no secrets required by default."""

    pass
78
+
79
+
80
class SQLConnectionConfig(ConnectionConfig, ABC):
    """Abstract connection config for SQL connectors.

    Concrete subclasses supply driver-specific connection and cursor context
    managers; everything else in this module talks to the database only
    through these two hooks.
    """

    access_config: Secret[SQLAccessConfig] = Field(default=SQLAccessConfig(), validate_default=True)

    @abstractmethod
    @contextmanager
    def get_connection(self) -> Generator[Any, None, None]:
        """Yield a live DB-API connection; implementations own open/commit/close."""
        pass

    @abstractmethod
    @contextmanager
    def get_cursor(self) -> Generator[Any, None, None]:
        """Yield a cursor on a live connection; implementations own cleanup."""
        pass
92
+
93
+
94
class SQLIndexerConfig(IndexerConfig):
    """Settings controlling which table is indexed and how ids are batched."""

    table_name: str
    id_column: str
    # Number of row ids grouped into each emitted batch.
    batch_size: int = 100
98
+
99
+
100
class SQLIndexer(Indexer, ABC):
    """Lists every id in the configured table and emits them in batches."""

    connection_config: SQLConnectionConfig
    index_config: SQLIndexerConfig

    @contextmanager
    def get_cursor(self) -> Generator[Any, None, None]:
        # Delegate cursor lifecycle to the connection config.
        with self.connection_config.get_cursor() as cursor:
            yield cursor

    def _get_doc_ids(self) -> list[str]:
        """Return all values of the id column, sorted so batching is stable."""
        with self.get_cursor() as cursor:
            cursor.execute(
                f"SELECT {self.index_config.id_column} FROM {self.index_config.table_name}"
            )
            results = cursor.fetchall()
            ids = sorted([result[0] for result in results])
            return ids

    def precheck(self) -> None:
        """Validate connectivity with a trivial query.

        Raises:
            SourceConnectionError: if the query fails for any reason.
        """
        try:
            with self.get_cursor() as cursor:
                cursor.execute("SELECT 1;")
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise SourceConnectionError(f"failed to validate connection: {e}")

    def run(self, **kwargs: Any) -> Generator[SqlBatchFileData, None, None]:
        """Yield one SqlBatchFileData per batch_size-sized chunk of table ids."""
        ids = self._get_doc_ids()
        id_batches: list[frozenset[str]] = [
            frozenset(
                ids[
                    i * self.index_config.batch_size : (i + 1)  # noqa
                    * self.index_config.batch_size
                ]
            )
            # Ceiling division so a final partial batch is not dropped.
            for i in range(
                (len(ids) + self.index_config.batch_size - 1) // self.index_config.batch_size
            )
        ]

        for batch in id_batches:
            batch_items = [BatchItem(identifier=str(b)) for b in batch]
            # NOTE(review): frozenset iteration order is arbitrary, so the ids
            # shown in display_name are not necessarily the batch's extremes.
            display_name = (
                f"{self.index_config.table_name}-{self.index_config.id_column}"
                f"-[{batch_items[0].identifier}..{batch_items[-1].identifier}]"
            )
            yield SqlBatchFileData(
                connector_type=self.connector_type,
                metadata=FileDataSourceMetadata(
                    date_processed=str(time()),
                ),
                additional_metadata=SqlAdditionalMetadata(
                    table_name=self.index_config.table_name, id_column=self.index_config.id_column
                ),
                batch_items=batch_items,
                display_name=display_name,
            )
158
+
159
+
160
class SQLDownloaderConfig(DownloaderConfig):
    """Download settings for SQL sources."""

    # Columns to select when querying; empty means all columns.
    fields: list[str] = field(default_factory=list)
162
+
163
+
164
class SQLDownloader(Downloader, ABC):
    """Fetches the rows for a batch of ids and writes each row out as a CSV."""

    connection_config: SQLConnectionConfig
    download_config: SQLDownloaderConfig

    @contextmanager
    def get_cursor(self) -> Generator[Any, None, None]:
        # Delegate cursor lifecycle to the connection config.
        with self.connection_config.get_cursor() as cursor:
            yield cursor

    @abstractmethod
    def query_db(self, file_data: SqlBatchFileData) -> tuple[list[tuple], list[str]]:
        """Return (rows, column_names) for the ids listed in *file_data*."""
        pass

    def sql_to_df(self, rows: list[tuple], columns: list[str]) -> list["DataFrame"]:
        """Convert query results into one single-row DataFrame per record."""
        import pandas as pd

        data = [dict(zip(columns, row)) for row in rows]
        df = pd.DataFrame(data)
        # Split into per-record frames so each becomes its own downloaded file.
        dfs = [pd.DataFrame([row.values], columns=df.columns) for index, row in df.iterrows()]
        return dfs

    def get_data(self, file_data: SqlBatchFileData) -> list["DataFrame"]:
        """Query the database and return one DataFrame per fetched record."""
        rows, columns = self.query_db(file_data=file_data)
        return self.sql_to_df(rows=rows, columns=columns)

    def get_identifier(self, table_name: str, record_id: str) -> str:
        """Build a filename-safe record id.

        When a field subset is configured, a short hash of it is appended so the
        same record downloaded with different field selections gets distinct files.
        """
        f = f"{table_name}-{record_id}"
        if self.download_config.fields:
            f = "{}-{}".format(
                f,
                hashlib.sha256(",".join(self.download_config.fields).encode()).hexdigest()[:8],
            )
        return f

    def generate_download_response(
        self, result: "DataFrame", file_data: SqlBatchFileData
    ) -> DownloadResponse:
        """Persist a single-record DataFrame as CSV and wrap it in a download response."""
        id_column = file_data.additional_metadata.id_column
        table_name = file_data.additional_metadata.table_name
        record_id = result.iloc[0][id_column]
        filename_id = self.get_identifier(table_name=table_name, record_id=record_id)
        filename = f"{filename_id}.csv"
        download_path = self.download_dir / Path(filename)
        logger.debug(
            f"Downloading results from table {table_name} and id {record_id} to {download_path}"
        )
        download_path.parent.mkdir(parents=True, exist_ok=True)
        result.to_csv(download_path, index=False)
        file_data.source_identifiers = SourceIdentifiers(
            filename=filename,
            fullpath=filename,
        )
        # Narrow the batch file data down to a single FileData keyed by this record.
        cast_file_data = FileData.cast(file_data=file_data)
        cast_file_data.identifier = filename_id
        return super().generate_download_response(
            file_data=cast_file_data, download_path=download_path
        )

    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
        """Download every record of the batch; returns one response per record."""
        sql_filedata = SqlBatchFileData.cast(file_data=file_data)
        data_dfs = self.get_data(file_data=sql_filedata)
        download_responses = []
        for df in data_dfs:
            download_responses.append(
                self.generate_download_response(result=df, file_data=sql_filedata)
            )
        return download_responses
231
+
232
+
233
class SQLUploadStagerConfig(UploadStagerConfig):
    """No stager-specific options; present for interface completeness."""

    pass
235
+
236
+
237
@dataclass
class SQLUploadStager(UploadStager):
    """Flattens partitioned elements into table-ready row dicts."""

    upload_stager_config: SQLUploadStagerConfig = field(default_factory=SQLUploadStagerConfig)

    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
        """Flatten nested metadata/data_source/coordinates into one flat row dict."""
        data = element_dict.copy()
        metadata: dict[str, Any] = data.pop("metadata", {})
        data_source = metadata.pop("data_source", {})
        coordinates = metadata.pop("coordinates", {})

        data.update(metadata)
        data.update(data_source)
        data.update(coordinates)

        # Deterministic element id derived from the element and its source record.
        data["id"] = get_enhanced_element_id(element_dict=data, file_data=file_data)

        # Tag each row with the source record id so re-runs can replace prior rows.
        data[RECORD_ID_LABEL] = file_data.identifier
        return data

    def conform_dataframe(self, df: "DataFrame") -> "DataFrame":
        """Coerce column values into SQL-friendly scalar types (mutates *df*)."""
        # Date columns -> epoch floats.
        for column in filter(lambda x: x in df.columns, _DATE_COLUMNS):
            df[column] = df[column].apply(parse_date_string).apply(lambda date: date.timestamp())
        # Structured values -> JSON strings; non-list/dict values become None.
        for column in filter(
            lambda x: x in df.columns,
            ("permissions_data", "record_locator", "points", "links"),
        ):
            df[column] = df[column].apply(
                lambda x: json.dumps(x) if isinstance(x, (list, dict)) else None
            )
        # Mixed-type scalars -> strings.
        for column in filter(
            lambda x: x in df.columns,
            ("version", "page_number", "regex_metadata"),
        ):
            df[column] = df[column].apply(str)
        return df

    def write_output(self, output_path: Path, data: list[dict]) -> Path:
        """Write the conformed rows to *output_path* and return it."""
        write_data(path=output_path, data=data)
        return output_path

    def run(
        self,
        elements_filepath: Path,
        file_data: FileData,
        output_dir: Path,
        output_filename: str,
        **kwargs: Any,
    ) -> Path:
        """Read elements, conform them, and write the staged output file."""
        import pandas as pd

        elements_contents = get_json_data(path=elements_filepath)

        df = pd.DataFrame(
            data=[
                self.conform_dict(element_dict=element_dict, file_data=file_data)
                for element_dict in elements_contents
            ]
        )
        df = self.conform_dataframe(df=df)

        # Keep the input's extension (e.g. .json/.ndjson) on the staged output.
        output_filename_suffix = Path(elements_filepath).suffix
        output_filename = f"{Path(output_filename).stem}{output_filename_suffix}"
        output_path = self.get_output_path(output_filename=output_filename, output_dir=output_dir)

        final_output_path = self.write_output(
            output_path=output_path, data=df.to_dict(orient="records")
        )
        return final_output_path
305
+
306
+
307
class SQLUploaderConfig(UploaderConfig):
    """Settings for writing staged elements into a destination table."""

    batch_size: int = Field(default=50, description="Number of records per batch")
    table_name: str = Field(default="elements", description="which table to upload contents to")
    record_id_key: str = Field(
        default=RECORD_ID_LABEL,
        description="searchable key to find entries for the same record on previous runs",
    )
314
+
315
+
316
@dataclass
class SQLUploader(Uploader):
    """Uploads staged element rows into a destination SQL table.

    On each run it deletes rows left by previous runs of the same record (when
    the table exposes the record-id column), conforms the dataframe to the
    table's schema, then inserts with parameterized ``executemany`` batches.
    """

    upload_config: SQLUploaderConfig
    connection_config: SQLConnectionConfig
    # Parameter placeholder for the driver's paramstyle; "?" (qmark) by default,
    # subclasses override for drivers that use e.g. "%s".
    values_delimiter: str = "?"
    # Lazy cache of the destination table's column names (see get_table_columns).
    _columns: Union[list[str], None] = field(init=False, default=None)

    def precheck(self) -> None:
        """Validate connectivity with a trivial query.

        Raises:
            DestinationConnectionError: if the query fails for any reason.
        """
        try:
            with self.get_cursor() as cursor:
                cursor.execute("SELECT 1;")
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}")

    @contextmanager
    def get_cursor(self) -> Generator[Any, None, None]:
        # Delegate cursor lifecycle to the connection config.
        with self.connection_config.get_cursor() as cursor:
            yield cursor

    def prepare_data(
        self, columns: list[str], data: tuple[tuple[Any, ...], ...]
    ) -> list[tuple[Any, ...]]:
        """Convert date-column values back into datetimes; pass others through."""
        import pandas as pd

        output = []
        for row in data:
            parsed = []
            for column_name, value in zip(columns, row):
                if column_name in _DATE_COLUMNS:
                    # NaN sneaks in via pandas for missing dates; store NULL instead.
                    if value is None or pd.isna(value):
                        parsed.append(None)
                    else:
                        parsed.append(parse_date_string(value))
                else:
                    parsed.append(value)
            output.append(tuple(parsed))
        return output

    def _fit_to_schema(
        self, df: "DataFrame", add_missing_columns: bool = True, case_sensitive: bool = True
    ) -> "DataFrame":
        """Drop columns absent from the table and (optionally) add null-filled
        columns for table fields the dataframe lacks."""
        import pandas as pd

        table_columns = self.get_table_columns()
        columns = set(df.columns if case_sensitive else df.columns.str.lower())
        schema_fields = set(
            table_columns if case_sensitive else {col.lower() for col in table_columns}
        )
        columns_to_drop = columns - schema_fields
        missing_columns = schema_fields - columns

        if columns_to_drop:
            logger.info(
                "Following columns will be dropped to match the table's schema: "
                f"{', '.join(columns_to_drop)}"
            )
        if missing_columns and add_missing_columns:
            logger.info(
                "Following null filled columns will be added to match the table's schema:"
                f" {', '.join(missing_columns)} "
            )

        df = df.drop(columns=columns_to_drop)

        if add_missing_columns:
            for column in missing_columns:
                df[column] = pd.Series()
        return df

    def upload_dataframe(self, df: "DataFrame", file_data: FileData) -> None:
        """Replace any prior rows for this record, then batch-insert *df*."""
        import numpy as np

        if self.can_delete():
            self.delete_by_record_id(file_data=file_data)
        else:
            logger.warning(
                f"table doesn't contain expected "
                f"record id column "
                f"{self.upload_config.record_id_key}, skipping delete"
            )
        df = self._fit_to_schema(df=df)
        # Databases expect NULL, not NaN.
        df.replace({np.nan: None}, inplace=True)

        columns = list(df.columns)
        # Identifiers can't be bound as parameters; values are parameterized below.
        stmt = "INSERT INTO {table_name} ({columns}) VALUES({values})".format(
            table_name=self.upload_config.table_name,
            columns=",".join(columns),
            values=",".join([self.values_delimiter for _ in columns]),
        )
        logger.info(
            f"writing a total of {len(df)} elements via"
            f" document batches to destination"
            f" table named {self.upload_config.table_name}"
            f" with batch size {self.upload_config.batch_size}"
        )
        for rows in split_dataframe(df=df, chunk_size=self.upload_config.batch_size):
            with self.get_cursor() as cursor:
                values = self.prepare_data(columns, tuple(rows.itertuples(index=False, name=None)))
                logger.debug(f"running query: {stmt}")
                cursor.executemany(stmt, values)

    def get_table_columns(self) -> list[str]:
        """Return (and cache) the destination table's column names."""
        if self._columns is None:
            with self.get_cursor() as cursor:
                cursor.execute(f"SELECT * from {self.upload_config.table_name} LIMIT 1")
                self._columns = [desc[0] for desc in cursor.description]
        return self._columns

    def can_delete(self) -> bool:
        """True when the table exposes the configured record-id column."""
        return self.upload_config.record_id_key in self.get_table_columns()

    def delete_by_record_id(self, file_data: FileData) -> None:
        """Delete all rows previously uploaded for this record id."""
        logger.debug(
            f"deleting any content with data "
            f"{self.upload_config.record_id_key}={file_data.identifier} "
            f"from table {self.upload_config.table_name}"
        )
        stmt = f"DELETE FROM {self.upload_config.table_name} WHERE {self.upload_config.record_id_key} = {self.values_delimiter}"  # noqa: E501
        with self.get_cursor() as cursor:
            cursor.execute(stmt, [file_data.identifier])
            rowcount = cursor.rowcount
            if rowcount > 0:
                logger.info(f"deleted {rowcount} rows from table {self.upload_config.table_name}")

    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
        """Upload already-loaded row dicts."""
        import pandas as pd

        df = pd.DataFrame(data)
        self.upload_dataframe(df=df, file_data=file_data)

    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        """Load staged rows from *path* and upload them."""
        df = get_data_df(path=path)
        self.upload_dataframe(df=df, file_data=file_data)
@@ -0,0 +1,179 @@
1
+ import json
2
+ from contextlib import contextmanager
3
+ from dataclasses import dataclass, field
4
+ from pathlib import Path
5
+ from typing import TYPE_CHECKING, Any, Generator
6
+
7
+ from pydantic import Field, Secret, model_validator
8
+
9
+ from unstructured_ingest.data_types.file_data import FileData
10
+ from unstructured_ingest.error import ValueError
11
+ from unstructured_ingest.logger import logger
12
+ from unstructured_ingest.processes.connector_registry import (
13
+ DestinationRegistryEntry,
14
+ SourceRegistryEntry,
15
+ )
16
+ from unstructured_ingest.processes.connectors.sql.sql import (
17
+ _DATE_COLUMNS,
18
+ SQLAccessConfig,
19
+ SqlBatchFileData,
20
+ SQLConnectionConfig,
21
+ SQLDownloader,
22
+ SQLDownloaderConfig,
23
+ SQLIndexer,
24
+ SQLIndexerConfig,
25
+ SQLUploader,
26
+ SQLUploaderConfig,
27
+ SQLUploadStager,
28
+ SQLUploadStagerConfig,
29
+ parse_date_string,
30
+ )
31
+ from unstructured_ingest.utils.dep_check import requires_dependencies
32
+
33
+ if TYPE_CHECKING:
34
+ from sqlite3 import Connection as SqliteConnection
35
+ from sqlite3 import Cursor as SqliteCursor
36
+
37
+
38
# Registry key identifying this connector throughout the ingest pipeline.
CONNECTOR_TYPE = "sqlite"
39
+
40
+
41
class SQLiteAccessConfig(SQLAccessConfig):
    """SQLite is a local file database, so no credentials are required."""

    pass
43
+
44
+
45
class SQLiteConnectionConfig(SQLConnectionConfig):
    """Connection settings for a local SQLite database file."""

    access_config: Secret[SQLiteAccessConfig] = Field(
        default=SQLiteAccessConfig(), validate_default=True
    )
    database_path: Path = Field(
        description="Path to the .db file.",
    )

    @model_validator(mode="after")
    def check_database_path(self) -> "SQLiteConnectionConfig":
        """Fail fast at config time if the database file is missing or not a file."""
        if not self.database_path.exists():
            raise ValueError(f"{self.database_path} does not exist")
        if not self.database_path.is_file():
            raise ValueError(f"{self.database_path} is not a valid file")
        return self

    @contextmanager
    def get_connection(self) -> Generator["SqliteConnection", None, None]:
        """Yield a sqlite3 connection; commit on success, roll back on error.

        Previously the commit lived in ``finally``, which persisted partial
        writes even when the managed block raised. Now the transaction is
        committed only on a clean exit and rolled back otherwise; the
        connection is always closed.
        """
        from sqlite3 import connect

        connection = connect(database=self.database_path)
        try:
            yield connection
            connection.commit()
        except Exception:
            connection.rollback()
            raise
        finally:
            connection.close()

    @contextmanager
    def get_cursor(self) -> Generator["SqliteCursor", None, None]:
        """Yield a cursor from a fresh connection, closing the cursor afterwards."""
        with self.get_connection() as connection:
            cursor = connection.cursor()
            try:
                yield cursor
            finally:
                cursor.close()
80
+
81
+
82
class SQLiteIndexerConfig(SQLIndexerConfig):
    """No SQLite-specific indexing options beyond the generic SQL ones."""

    pass
84
+
85
+
86
@dataclass
class SQLiteIndexer(SQLIndexer):
    # All indexing behavior is inherited from the generic SQL indexer; this
    # subclass only pins the connector type label for the registry.
    connection_config: SQLConnectionConfig
    index_config: SQLIndexerConfig
    connector_type: str = CONNECTOR_TYPE
91
+
92
+
93
class SQLiteDownloaderConfig(SQLDownloaderConfig):
    """No SQLite-specific download options beyond the generic SQL ones."""

    pass
95
+
96
+
97
@dataclass
class SQLiteDownloader(SQLDownloader):
    """Fetch rows for a batch of indexed ids from the SQLite source table."""

    connection_config: SQLConnectionConfig
    download_config: SQLDownloaderConfig
    connector_type: str = CONNECTOR_TYPE
    # sqlite3 uses "?" as its positional parameter placeholder
    values_delimiter: str = "?"

    def query_db(self, file_data: SqlBatchFileData) -> tuple[list[tuple], list[str]]:
        """Run a single parameterized SELECT covering every id in the batch.

        Returns the fetched rows and the corresponding column names.
        """
        meta = file_data.additional_metadata
        table_name = meta.table_name
        id_column = meta.id_column
        ids = [item.identifier for item in file_data.batch_items]
        # Restrict the projection to the configured fields when provided.
        if self.download_config.fields:
            fields = ",".join(self.download_config.fields)
        else:
            fields = "*"
        placeholders = ",".join(self.values_delimiter for _ in ids)
        query = f"SELECT {fields} FROM {table_name} WHERE {id_column} IN ({placeholders})"
        with self.connection_config.get_connection() as sqlite_connection:
            cursor = sqlite_connection.cursor()
            logger.debug(f"running query: {query}\nwith values: {ids}")
            cursor.execute(query, ids)
            fetched = cursor.fetchall()
            column_names = [col[0] for col in cursor.description]
            return fetched, column_names
118
+
119
+
120
class SQLiteUploadStagerConfig(SQLUploadStagerConfig):
    """No SQLite-specific staging options beyond the generic SQL ones."""

    pass
122
+
123
+
124
class SQLiteUploadStager(SQLUploadStager):
    # Staging logic lives entirely in the generic SQL stager; this subclass
    # only narrows the config type.
    upload_stager_config: SQLiteUploadStagerConfig
126
+
127
+
128
class SQLiteUploaderConfig(SQLUploaderConfig):
    """No SQLite-specific upload options beyond the generic SQL ones."""

    pass
130
+
131
+
132
@dataclass
class SQLiteUploader(SQLUploader):
    """Write staged element rows into a SQLite destination table."""

    upload_config: SQLiteUploaderConfig = field(default_factory=SQLiteUploaderConfig)
    connection_config: SQLiteConnectionConfig
    connector_type: str = CONNECTOR_TYPE

    @requires_dependencies(["pandas"])
    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        """Delegate to the generic SQL uploader once pandas is available."""
        super().run(path=path, file_data=file_data, **kwargs)

    @requires_dependencies(["pandas"])
    def prepare_data(
        self, columns: list[str], data: tuple[tuple[Any, ...], ...]
    ) -> list[tuple[Any, ...]]:
        """Coerce row values into types sqlite3 can bind.

        Lists/dicts are serialized to JSON strings; values in known date
        columns are parsed to datetimes, with None/NaN normalized to NULL.
        """
        import pandas as pd

        prepared_rows = []
        for raw_row in data:
            prepared = []
            for column_name, cell in zip(columns, raw_row):
                if isinstance(cell, (list, dict)):
                    cell = json.dumps(cell)
                if column_name not in _DATE_COLUMNS:
                    prepared.append(cell)
                elif cell is None or pd.isna(cell):
                    prepared.append(None)
                else:
                    prepared.append(parse_date_string(cell))
            prepared_rows.append(tuple(prepared))
        return prepared_rows
163
+
164
+
165
# Registers SQLite as a source: the indexer enumerates row ids, the downloader
# then fetches those rows in batches.
sqlite_source_entry = SourceRegistryEntry(
    connection_config=SQLiteConnectionConfig,
    indexer_config=SQLiteIndexerConfig,
    indexer=SQLiteIndexer,
    downloader_config=SQLiteDownloaderConfig,
    downloader=SQLiteDownloader,
)
172
+
173
# Registers SQLite as a destination: the stager shapes element dicts into
# rows, the uploader writes them to the target table.
sqlite_destination_entry = DestinationRegistryEntry(
    connection_config=SQLiteConnectionConfig,
    uploader=SQLiteUploader,
    uploader_config=SQLiteUploaderConfig,
    upload_stager=SQLiteUploadStager,
    upload_stager_config=SQLiteUploadStagerConfig,
)