unstructured-ingest 1.2.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (243) hide show
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/README.md +28 -0
  4. unstructured_ingest/cli/__init__.py +0 -0
  5. unstructured_ingest/cli/base/__init__.py +4 -0
  6. unstructured_ingest/cli/base/cmd.py +269 -0
  7. unstructured_ingest/cli/base/dest.py +84 -0
  8. unstructured_ingest/cli/base/importer.py +34 -0
  9. unstructured_ingest/cli/base/src.py +75 -0
  10. unstructured_ingest/cli/cli.py +24 -0
  11. unstructured_ingest/cli/cmds.py +14 -0
  12. unstructured_ingest/cli/utils/__init__.py +0 -0
  13. unstructured_ingest/cli/utils/click.py +237 -0
  14. unstructured_ingest/cli/utils/model_conversion.py +222 -0
  15. unstructured_ingest/data_types/__init__.py +0 -0
  16. unstructured_ingest/data_types/entities.py +17 -0
  17. unstructured_ingest/data_types/file_data.py +116 -0
  18. unstructured_ingest/embed/__init__.py +0 -0
  19. unstructured_ingest/embed/azure_openai.py +63 -0
  20. unstructured_ingest/embed/bedrock.py +323 -0
  21. unstructured_ingest/embed/huggingface.py +69 -0
  22. unstructured_ingest/embed/interfaces.py +146 -0
  23. unstructured_ingest/embed/mixedbreadai.py +134 -0
  24. unstructured_ingest/embed/octoai.py +133 -0
  25. unstructured_ingest/embed/openai.py +142 -0
  26. unstructured_ingest/embed/togetherai.py +116 -0
  27. unstructured_ingest/embed/vertexai.py +109 -0
  28. unstructured_ingest/embed/voyageai.py +130 -0
  29. unstructured_ingest/error.py +156 -0
  30. unstructured_ingest/errors_v2.py +156 -0
  31. unstructured_ingest/interfaces/__init__.py +27 -0
  32. unstructured_ingest/interfaces/connector.py +56 -0
  33. unstructured_ingest/interfaces/downloader.py +90 -0
  34. unstructured_ingest/interfaces/indexer.py +29 -0
  35. unstructured_ingest/interfaces/process.py +22 -0
  36. unstructured_ingest/interfaces/processor.py +88 -0
  37. unstructured_ingest/interfaces/upload_stager.py +89 -0
  38. unstructured_ingest/interfaces/uploader.py +67 -0
  39. unstructured_ingest/logger.py +39 -0
  40. unstructured_ingest/main.py +11 -0
  41. unstructured_ingest/otel.py +128 -0
  42. unstructured_ingest/pipeline/__init__.py +0 -0
  43. unstructured_ingest/pipeline/interfaces.py +211 -0
  44. unstructured_ingest/pipeline/otel.py +32 -0
  45. unstructured_ingest/pipeline/pipeline.py +408 -0
  46. unstructured_ingest/pipeline/steps/__init__.py +0 -0
  47. unstructured_ingest/pipeline/steps/chunk.py +78 -0
  48. unstructured_ingest/pipeline/steps/download.py +206 -0
  49. unstructured_ingest/pipeline/steps/embed.py +77 -0
  50. unstructured_ingest/pipeline/steps/filter.py +35 -0
  51. unstructured_ingest/pipeline/steps/index.py +86 -0
  52. unstructured_ingest/pipeline/steps/partition.py +77 -0
  53. unstructured_ingest/pipeline/steps/stage.py +65 -0
  54. unstructured_ingest/pipeline/steps/uncompress.py +50 -0
  55. unstructured_ingest/pipeline/steps/upload.py +58 -0
  56. unstructured_ingest/processes/__init__.py +18 -0
  57. unstructured_ingest/processes/chunker.py +131 -0
  58. unstructured_ingest/processes/connector_registry.py +69 -0
  59. unstructured_ingest/processes/connectors/__init__.py +129 -0
  60. unstructured_ingest/processes/connectors/airtable.py +238 -0
  61. unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
  62. unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
  63. unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
  64. unstructured_ingest/processes/connectors/astradb.py +592 -0
  65. unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
  66. unstructured_ingest/processes/connectors/chroma.py +193 -0
  67. unstructured_ingest/processes/connectors/confluence.py +527 -0
  68. unstructured_ingest/processes/connectors/couchbase.py +336 -0
  69. unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
  70. unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
  71. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
  72. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
  73. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
  74. unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
  75. unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
  76. unstructured_ingest/processes/connectors/delta_table.py +310 -0
  77. unstructured_ingest/processes/connectors/discord.py +161 -0
  78. unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
  79. unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
  80. unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
  81. unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
  82. unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
  83. unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
  84. unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
  85. unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
  86. unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
  87. unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
  88. unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
  89. unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
  90. unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
  91. unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
  92. unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
  93. unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
  94. unstructured_ingest/processes/connectors/github.py +226 -0
  95. unstructured_ingest/processes/connectors/gitlab.py +270 -0
  96. unstructured_ingest/processes/connectors/google_drive.py +848 -0
  97. unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
  98. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
  99. unstructured_ingest/processes/connectors/jira.py +522 -0
  100. unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
  101. unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
  102. unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
  103. unstructured_ingest/processes/connectors/kafka/local.py +103 -0
  104. unstructured_ingest/processes/connectors/kdbai.py +156 -0
  105. unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
  106. unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
  107. unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
  108. unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
  109. unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
  110. unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
  111. unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
  112. unstructured_ingest/processes/connectors/local.py +227 -0
  113. unstructured_ingest/processes/connectors/milvus.py +311 -0
  114. unstructured_ingest/processes/connectors/mongodb.py +389 -0
  115. unstructured_ingest/processes/connectors/neo4j.py +534 -0
  116. unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
  117. unstructured_ingest/processes/connectors/notion/client.py +349 -0
  118. unstructured_ingest/processes/connectors/notion/connector.py +350 -0
  119. unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
  120. unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
  121. unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
  122. unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
  123. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  124. unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
  125. unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
  126. unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
  127. unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
  128. unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  129. unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  130. unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  131. unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
  132. unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
  133. unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
  134. unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
  135. unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
  136. unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
  137. unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
  138. unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
  139. unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
  140. unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
  141. unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
  142. unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  143. unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  144. unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  145. unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  146. unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
  147. unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
  148. unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
  149. unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
  150. unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  151. unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
  152. unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
  153. unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
  154. unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  155. unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
  156. unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
  157. unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
  158. unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
  159. unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
  160. unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
  161. unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
  162. unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
  163. unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
  164. unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
  165. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  166. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
  167. unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
  168. unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
  169. unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
  170. unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
  171. unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
  172. unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
  173. unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
  174. unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
  175. unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
  176. unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
  177. unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
  178. unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
  179. unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
  180. unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
  181. unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
  182. unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
  183. unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
  184. unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
  185. unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
  186. unstructured_ingest/processes/connectors/onedrive.py +485 -0
  187. unstructured_ingest/processes/connectors/outlook.py +242 -0
  188. unstructured_ingest/processes/connectors/pinecone.py +400 -0
  189. unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
  190. unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
  191. unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
  192. unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
  193. unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
  194. unstructured_ingest/processes/connectors/redisdb.py +214 -0
  195. unstructured_ingest/processes/connectors/salesforce.py +307 -0
  196. unstructured_ingest/processes/connectors/sharepoint.py +282 -0
  197. unstructured_ingest/processes/connectors/slack.py +249 -0
  198. unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
  199. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
  200. unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
  201. unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
  202. unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
  203. unstructured_ingest/processes/connectors/sql/sql.py +456 -0
  204. unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
  205. unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
  206. unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
  207. unstructured_ingest/processes/connectors/utils.py +60 -0
  208. unstructured_ingest/processes/connectors/vectara.py +348 -0
  209. unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
  210. unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
  211. unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
  212. unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
  213. unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
  214. unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
  215. unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
  216. unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
  217. unstructured_ingest/processes/embedder.py +203 -0
  218. unstructured_ingest/processes/filter.py +60 -0
  219. unstructured_ingest/processes/partitioner.py +233 -0
  220. unstructured_ingest/processes/uncompress.py +61 -0
  221. unstructured_ingest/processes/utils/__init__.py +8 -0
  222. unstructured_ingest/processes/utils/blob_storage.py +32 -0
  223. unstructured_ingest/processes/utils/logging/connector.py +365 -0
  224. unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
  225. unstructured_ingest/unstructured_api.py +140 -0
  226. unstructured_ingest/utils/__init__.py +5 -0
  227. unstructured_ingest/utils/chunking.py +56 -0
  228. unstructured_ingest/utils/compression.py +72 -0
  229. unstructured_ingest/utils/constants.py +2 -0
  230. unstructured_ingest/utils/data_prep.py +216 -0
  231. unstructured_ingest/utils/dep_check.py +78 -0
  232. unstructured_ingest/utils/filesystem.py +27 -0
  233. unstructured_ingest/utils/html.py +174 -0
  234. unstructured_ingest/utils/ndjson.py +52 -0
  235. unstructured_ingest/utils/pydantic_models.py +52 -0
  236. unstructured_ingest/utils/string_and_date_utils.py +74 -0
  237. unstructured_ingest/utils/table.py +80 -0
  238. unstructured_ingest/utils/tls.py +15 -0
  239. unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
  240. unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
  241. unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
  242. unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
  243. unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
@@ -0,0 +1,93 @@
1
+ from dataclasses import dataclass, field
2
+ from typing import Optional
3
+
4
+ from pydantic import Field, Secret
5
+
6
+ from unstructured_ingest.processes.connector_registry import (
7
+ DestinationRegistryEntry,
8
+ SourceRegistryEntry,
9
+ )
10
+ from unstructured_ingest.processes.connectors.databricks.volumes import (
11
+ DatabricksVolumesAccessConfig,
12
+ DatabricksVolumesConnectionConfig,
13
+ DatabricksVolumesDownloader,
14
+ DatabricksVolumesDownloaderConfig,
15
+ DatabricksVolumesIndexer,
16
+ DatabricksVolumesIndexerConfig,
17
+ DatabricksVolumesUploader,
18
+ DatabricksVolumesUploaderConfig,
19
+ )
20
+ from unstructured_ingest.processes.utils.blob_storage import (
21
+ BlobStoreUploadStager,
22
+ BlobStoreUploadStagerConfig,
23
+ )
24
+
25
+ CONNECTOR_TYPE = "databricks_volumes_aws"
26
+
27
+
28
class DatabricksAWSVolumesAccessConfig(DatabricksVolumesAccessConfig):
    # Access settings for the AWS flavour of the Databricks Volumes connector.
    # Every field declared here is optional and defaults to None.
    account_id: Optional[str] = Field(
        description="The Databricks account ID for the Databricks accounts endpoint",
        default=None,
    )
    profile: Optional[str] = None
    token: Optional[str] = Field(
        description="The Databricks personal access token (PAT)",
        default=None,
    )
38
+
39
+
40
class DatabricksAWSVolumesConnectionConfig(DatabricksVolumesConnectionConfig):
    # Types access_config as the AWS access config wrapped in pydantic's Secret.
    access_config: Secret[DatabricksAWSVolumesAccessConfig]
42
+
43
+
44
class DatabricksAWSVolumesIndexerConfig(DatabricksVolumesIndexerConfig):
    # AWS-specific indexer config type; declares no fields of its own.
    ...
46
+
47
+
48
@dataclass
class DatabricksAWSVolumesIndexer(DatabricksVolumesIndexer):
    """Indexer registered for the AWS Databricks Volumes connector.

    Declares its connection and index configs with the AWS-specific types and
    sets ``connector_type`` to this module's ``CONNECTOR_TYPE``.
    """

    connection_config: DatabricksAWSVolumesConnectionConfig
    index_config: DatabricksAWSVolumesIndexerConfig
    connector_type: str = CONNECTOR_TYPE
53
+
54
+
55
class DatabricksAWSVolumesDownloaderConfig(DatabricksVolumesDownloaderConfig):
    # AWS-specific downloader config type; declares no fields of its own.
    ...
57
+
58
+
59
@dataclass
class DatabricksAWSVolumesDownloader(DatabricksVolumesDownloader):
    """Downloader registered for the AWS Databricks Volumes connector.

    Declares its connection and download configs with the AWS-specific types
    and sets ``connector_type`` to this module's ``CONNECTOR_TYPE``.
    """

    connection_config: DatabricksAWSVolumesConnectionConfig
    # Annotated with the AWS-specific config (a subclass of the base downloader
    # config) so it agrees with the downloader_config registered for this class
    # in the source registry entry and with the indexer/uploader annotations.
    download_config: DatabricksAWSVolumesDownloaderConfig
    connector_type: str = CONNECTOR_TYPE
64
+
65
+
66
class DatabricksAWSVolumesUploaderConfig(DatabricksVolumesUploaderConfig):
    # AWS-specific uploader config type; declares no fields of its own.
    ...
68
+
69
+
70
@dataclass
class DatabricksAWSVolumesUploader(DatabricksVolumesUploader):
    """Uploader registered for the AWS Databricks Volumes connector.

    ``upload_config`` defaults to a freshly constructed
    ``DatabricksAWSVolumesUploaderConfig`` when the caller does not supply one.
    """

    connection_config: DatabricksAWSVolumesConnectionConfig
    upload_config: DatabricksAWSVolumesUploaderConfig = field(
        default_factory=DatabricksAWSVolumesUploaderConfig,
    )
    connector_type: str = CONNECTOR_TYPE
77
+
78
+
79
databricks_aws_volumes_destination_entry = DestinationRegistryEntry(
    # Destination side of the AWS connector: AWS connection/uploader classes
    # combined with the blob-store upload stager.
    connection_config=DatabricksAWSVolumesConnectionConfig,
    upload_stager=BlobStoreUploadStager,
    upload_stager_config=BlobStoreUploadStagerConfig,
    uploader=DatabricksAWSVolumesUploader,
    uploader_config=DatabricksAWSVolumesUploaderConfig,
)
86
+
87
databricks_aws_volumes_source_entry = SourceRegistryEntry(
    # Source side of the AWS connector: indexer and downloader with their
    # AWS-specific config classes.
    connection_config=DatabricksAWSVolumesConnectionConfig,
    indexer_config=DatabricksAWSVolumesIndexerConfig,
    indexer=DatabricksAWSVolumesIndexer,
    downloader_config=DatabricksAWSVolumesDownloaderConfig,
    downloader=DatabricksAWSVolumesDownloader,
)
@@ -0,0 +1,108 @@
1
+ from dataclasses import dataclass, field
2
+ from typing import Optional
3
+
4
+ from pydantic import Field, Secret
5
+
6
+ from unstructured_ingest.processes.connector_registry import (
7
+ DestinationRegistryEntry,
8
+ SourceRegistryEntry,
9
+ )
10
+ from unstructured_ingest.processes.connectors.databricks.volumes import (
11
+ DatabricksVolumesAccessConfig,
12
+ DatabricksVolumesConnectionConfig,
13
+ DatabricksVolumesDownloader,
14
+ DatabricksVolumesDownloaderConfig,
15
+ DatabricksVolumesIndexer,
16
+ DatabricksVolumesIndexerConfig,
17
+ DatabricksVolumesUploader,
18
+ DatabricksVolumesUploaderConfig,
19
+ )
20
+ from unstructured_ingest.processes.utils.blob_storage import (
21
+ BlobStoreUploadStager,
22
+ BlobStoreUploadStagerConfig,
23
+ )
24
+
25
+ CONNECTOR_TYPE = "databricks_volumes_azure"
26
+
27
+
28
class DatabricksAzureVolumesAccessConfig(DatabricksVolumesAccessConfig):
    # Access settings for the Azure flavour of the Databricks Volumes connector.
    # Every field declared here is optional and defaults to None.
    account_id: Optional[str] = Field(
        description="The Databricks account ID for the Databricks accounts endpoint.",
        default=None,
    )
    profile: Optional[str] = None
    azure_workspace_resource_id: Optional[str] = Field(
        description=(
            "The Azure Resource Manager ID for the Azure Databricks workspace, "
            "which is exchanged for a Databricks host URL."
        ),
        default=None,
    )
    azure_client_secret: Optional[str] = Field(
        description="The Azure AD service principal’s client secret.",
        default=None,
    )
    azure_client_id: Optional[str] = Field(
        description="The Azure AD service principal’s application ID.",
        default=None,
    )
    azure_tenant_id: Optional[str] = Field(
        description="The Azure AD service principal’s tenant ID.",
        default=None,
    )
    azure_environment: Optional[str] = Field(
        description="The Azure environment type for a specific set of API endpoints",
        examples=["Public", "UsGov", "China", "Germany"],
        default=None,
    )
53
+
54
+
55
class DatabricksAzureVolumesConnectionConfig(DatabricksVolumesConnectionConfig):
    # Types access_config as the Azure access config wrapped in pydantic's Secret.
    access_config: Secret[DatabricksAzureVolumesAccessConfig]
57
+
58
+
59
class DatabricksAzureVolumesIndexerConfig(DatabricksVolumesIndexerConfig):
    # Azure-specific indexer config type; declares no fields of its own.
    ...
61
+
62
+
63
@dataclass
class DatabricksAzureVolumesIndexer(DatabricksVolumesIndexer):
    """Indexer registered for the Azure Databricks Volumes connector.

    Declares its connection and index configs with the Azure-specific types
    and sets ``connector_type`` to this module's ``CONNECTOR_TYPE``.
    """

    connection_config: DatabricksAzureVolumesConnectionConfig
    index_config: DatabricksAzureVolumesIndexerConfig
    connector_type: str = CONNECTOR_TYPE
68
+
69
+
70
class DatabricksAzureVolumesDownloaderConfig(DatabricksVolumesDownloaderConfig):
    # Azure-specific downloader config type; declares no fields of its own.
    ...
72
+
73
+
74
@dataclass
class DatabricksAzureVolumesDownloader(DatabricksVolumesDownloader):
    """Downloader registered for the Azure Databricks Volumes connector.

    Declares its connection and download configs with the Azure-specific types
    and sets ``connector_type`` to this module's ``CONNECTOR_TYPE``.
    """

    connection_config: DatabricksAzureVolumesConnectionConfig
    # Annotated with the Azure-specific config (a subclass of the base
    # downloader config) so it agrees with the downloader_config registered for
    # this class in the source registry entry and with the indexer/uploader
    # annotations.
    download_config: DatabricksAzureVolumesDownloaderConfig
    connector_type: str = CONNECTOR_TYPE
79
+
80
+
81
class DatabricksAzureVolumesUploaderConfig(DatabricksVolumesUploaderConfig):
    # Azure-specific uploader config type; declares no fields of its own.
    ...
83
+
84
+
85
@dataclass
class DatabricksAzureVolumesUploader(DatabricksVolumesUploader):
    """Uploader registered for the Azure Databricks Volumes connector.

    ``upload_config`` defaults to a freshly constructed
    ``DatabricksAzureVolumesUploaderConfig`` when the caller does not supply one.
    """

    connection_config: DatabricksAzureVolumesConnectionConfig
    upload_config: DatabricksAzureVolumesUploaderConfig = field(
        default_factory=DatabricksAzureVolumesUploaderConfig,
    )
    connector_type: str = CONNECTOR_TYPE
92
+
93
+
94
databricks_azure_volumes_destination_entry = DestinationRegistryEntry(
    # Destination side of the Azure connector: Azure connection/uploader
    # classes combined with the blob-store upload stager.
    connection_config=DatabricksAzureVolumesConnectionConfig,
    upload_stager=BlobStoreUploadStager,
    upload_stager_config=BlobStoreUploadStagerConfig,
    uploader=DatabricksAzureVolumesUploader,
    uploader_config=DatabricksAzureVolumesUploaderConfig,
)
101
+
102
databricks_azure_volumes_source_entry = SourceRegistryEntry(
    # Source side of the Azure connector: indexer and downloader with their
    # Azure-specific config classes.
    connection_config=DatabricksAzureVolumesConnectionConfig,
    indexer_config=DatabricksAzureVolumesIndexerConfig,
    indexer=DatabricksAzureVolumesIndexer,
    downloader_config=DatabricksAzureVolumesDownloaderConfig,
    downloader=DatabricksAzureVolumesDownloader,
)
@@ -0,0 +1,91 @@
1
+ from dataclasses import dataclass, field
2
+ from typing import Optional
3
+
4
+ from pydantic import Field, Secret
5
+
6
+ from unstructured_ingest.processes.connector_registry import (
7
+ DestinationRegistryEntry,
8
+ SourceRegistryEntry,
9
+ )
10
+ from unstructured_ingest.processes.connectors.databricks.volumes import (
11
+ DatabricksVolumesAccessConfig,
12
+ DatabricksVolumesConnectionConfig,
13
+ DatabricksVolumesDownloader,
14
+ DatabricksVolumesDownloaderConfig,
15
+ DatabricksVolumesIndexer,
16
+ DatabricksVolumesIndexerConfig,
17
+ DatabricksVolumesUploader,
18
+ DatabricksVolumesUploaderConfig,
19
+ )
20
+ from unstructured_ingest.processes.utils.blob_storage import (
21
+ BlobStoreUploadStager,
22
+ BlobStoreUploadStagerConfig,
23
+ )
24
+
25
+ CONNECTOR_TYPE = "databricks_volumes_gcp"
26
+
27
+
28
class DatabricksGoogleVolumesAccessConfig(DatabricksVolumesAccessConfig):
    # Access settings for the GCP flavour of the Databricks Volumes connector.
    # Every field declared here is optional and defaults to None.
    account_id: Optional[str] = Field(
        description="The Databricks account ID for the Databricks accounts endpoint.",
        default=None,
    )
    profile: Optional[str] = None
    google_credentials: Optional[str] = None
    google_service_account: Optional[str] = None
36
+
37
+
38
class DatabricksGoogleVolumesConnectionConfig(DatabricksVolumesConnectionConfig):
    # Types access_config as the GCP access config wrapped in pydantic's Secret.
    access_config: Secret[DatabricksGoogleVolumesAccessConfig]
40
+
41
+
42
class DatabricksGoogleVolumesIndexerConfig(DatabricksVolumesIndexerConfig):
    # GCP-specific indexer config type; declares no fields of its own.
    ...
44
+
45
+
46
@dataclass
class DatabricksGoogleVolumesIndexer(DatabricksVolumesIndexer):
    """Indexer registered for the GCP Databricks Volumes connector.

    Declares its connection and index configs with the GCP-specific types and
    sets ``connector_type`` to this module's ``CONNECTOR_TYPE``.
    """

    connection_config: DatabricksGoogleVolumesConnectionConfig
    index_config: DatabricksGoogleVolumesIndexerConfig
    connector_type: str = CONNECTOR_TYPE
51
+
52
+
53
class DatabricksGoogleVolumesDownloaderConfig(DatabricksVolumesDownloaderConfig):
    # GCP-specific downloader config type; declares no fields of its own.
    ...
55
+
56
+
57
@dataclass
class DatabricksGoogleVolumesDownloader(DatabricksVolumesDownloader):
    """Downloader registered for the GCP Databricks Volumes connector.

    Declares its connection and download configs with the GCP-specific types
    and sets ``connector_type`` to this module's ``CONNECTOR_TYPE``.
    """

    connection_config: DatabricksGoogleVolumesConnectionConfig
    # Annotated with the GCP-specific config (a subclass of the base downloader
    # config) so it agrees with the downloader_config registered for this class
    # in the source registry entry and with the indexer/uploader annotations.
    download_config: DatabricksGoogleVolumesDownloaderConfig
    connector_type: str = CONNECTOR_TYPE
62
+
63
+
64
class DatabricksGoogleVolumesUploaderConfig(DatabricksVolumesUploaderConfig):
    # GCP-specific uploader config type; declares no fields of its own.
    ...
66
+
67
+
68
@dataclass
class DatabricksGoogleVolumesUploader(DatabricksVolumesUploader):
    """Uploader registered for the GCP Databricks Volumes connector.

    ``upload_config`` defaults to a freshly constructed
    ``DatabricksGoogleVolumesUploaderConfig`` when the caller does not supply one.
    """

    connection_config: DatabricksGoogleVolumesConnectionConfig
    upload_config: DatabricksGoogleVolumesUploaderConfig = field(
        default_factory=DatabricksGoogleVolumesUploaderConfig,
    )
    connector_type: str = CONNECTOR_TYPE
75
+
76
+
77
databricks_gcp_volumes_destination_entry = DestinationRegistryEntry(
    # Destination side of the GCP connector: GCP connection/uploader classes
    # combined with the blob-store upload stager.
    connection_config=DatabricksGoogleVolumesConnectionConfig,
    upload_stager=BlobStoreUploadStager,
    upload_stager_config=BlobStoreUploadStagerConfig,
    uploader=DatabricksGoogleVolumesUploader,
    uploader_config=DatabricksGoogleVolumesUploaderConfig,
)
84
+
85
databricks_gcp_volumes_source_entry = SourceRegistryEntry(
    # Source side of the GCP connector: indexer and downloader with their
    # GCP-specific config classes.
    connection_config=DatabricksGoogleVolumesConnectionConfig,
    indexer_config=DatabricksGoogleVolumesIndexerConfig,
    indexer=DatabricksGoogleVolumesIndexer,
    downloader_config=DatabricksGoogleVolumesDownloaderConfig,
    downloader=DatabricksGoogleVolumesDownloader,
)
@@ -0,0 +1,92 @@
1
+ from dataclasses import dataclass
2
+ from typing import Optional
3
+
4
+ from pydantic import Field, Secret
5
+
6
+ from unstructured_ingest.processes.connector_registry import (
7
+ DestinationRegistryEntry,
8
+ SourceRegistryEntry,
9
+ )
10
+ from unstructured_ingest.processes.connectors.databricks.volumes import (
11
+ DatabricksVolumesAccessConfig,
12
+ DatabricksVolumesConnectionConfig,
13
+ DatabricksVolumesDownloader,
14
+ DatabricksVolumesDownloaderConfig,
15
+ DatabricksVolumesIndexer,
16
+ DatabricksVolumesIndexerConfig,
17
+ DatabricksVolumesUploader,
18
+ DatabricksVolumesUploaderConfig,
19
+ )
20
+ from unstructured_ingest.processes.utils.blob_storage import (
21
+ BlobStoreUploadStager,
22
+ BlobStoreUploadStagerConfig,
23
+ )
24
+
25
+ CONNECTOR_TYPE = "databricks_volumes"
26
+
27
+
28
class DatabricksNativeVolumesAccessConfig(DatabricksVolumesAccessConfig):
    # Access settings for the native ("databricks_volumes") connector.
    # Every field declared here is optional and defaults to None.
    client_id: Optional[str] = Field(description="Client ID of the OAuth app.", default=None)
    client_secret: Optional[str] = Field(
        description="Client Secret of the OAuth app.",
        default=None,
    )
    profile: Optional[str] = None
    azure_workspace_resource_id: Optional[str] = Field(
        description=(
            "The Azure Resource Manager ID for the Azure Databricks workspace, "
            "which is exchanged for a Databricks host URL."
        ),
        default=None,
    )
39
+
40
+
41
class DatabricksNativeVolumesConnectionConfig(DatabricksVolumesConnectionConfig):
    # Types access_config as the native access config wrapped in pydantic's Secret.
    access_config: Secret[DatabricksNativeVolumesAccessConfig]
43
+
44
+
45
class DatabricksNativeVolumesIndexerConfig(DatabricksVolumesIndexerConfig):
    # Native-connector indexer config type; declares no fields of its own.
    ...
47
+
48
+
49
@dataclass
class DatabricksNativeVolumesIndexer(DatabricksVolumesIndexer):
    """Indexer registered for the native Databricks Volumes connector.

    Declares its connection and index configs with the native-specific types
    and sets ``connector_type`` to this module's ``CONNECTOR_TYPE``.
    """

    connection_config: DatabricksNativeVolumesConnectionConfig
    index_config: DatabricksNativeVolumesIndexerConfig
    connector_type: str = CONNECTOR_TYPE
54
+
55
+
56
class DatabricksNativeVolumesDownloaderConfig(DatabricksVolumesDownloaderConfig):
    # Native-connector downloader config type; declares no fields of its own.
    ...
58
+
59
+
60
@dataclass
class DatabricksNativeVolumesDownloader(DatabricksVolumesDownloader):
    """Downloader registered for the native Databricks Volumes connector.

    Declares its connection and download configs with the native-specific
    types and sets ``connector_type`` to this module's ``CONNECTOR_TYPE``.
    """

    connection_config: DatabricksNativeVolumesConnectionConfig
    # Annotated with the native-specific config (a subclass of the base
    # downloader config) so it agrees with the downloader_config registered for
    # this class in the source registry entry and with the indexer/uploader
    # annotations.
    download_config: DatabricksNativeVolumesDownloaderConfig
    connector_type: str = CONNECTOR_TYPE
65
+
66
+
67
class DatabricksNativeVolumesUploaderConfig(DatabricksVolumesUploaderConfig):
    # Native-connector uploader config type; declares no fields of its own.
    ...
69
+
70
+
71
@dataclass
class DatabricksNativeVolumesUploader(DatabricksVolumesUploader):
    """Uploader registered for the native Databricks Volumes connector.

    Declares its connection and upload configs with the native-specific types
    and sets ``connector_type`` to this module's ``CONNECTOR_TYPE``.
    """

    connection_config: DatabricksNativeVolumesConnectionConfig
    # NOTE(review): the AWS/Azure/GCP uploaders give upload_config a
    # default_factory; this one declares it without a default here. Confirm
    # whether that difference is intentional.
    upload_config: DatabricksNativeVolumesUploaderConfig
    connector_type: str = CONNECTOR_TYPE
76
+
77
+
78
databricks_native_volumes_destination_entry = DestinationRegistryEntry(
    # Destination side of the native connector: native connection/uploader
    # classes combined with the blob-store upload stager.
    connection_config=DatabricksNativeVolumesConnectionConfig,
    upload_stager=BlobStoreUploadStager,
    upload_stager_config=BlobStoreUploadStagerConfig,
    uploader=DatabricksNativeVolumesUploader,
    uploader_config=DatabricksNativeVolumesUploaderConfig,
)
85
+
86
databricks_native_volumes_source_entry = SourceRegistryEntry(
    # Source side of the native connector: indexer and downloader with their
    # native-specific config classes.
    connection_config=DatabricksNativeVolumesConnectionConfig,
    indexer_config=DatabricksNativeVolumesIndexerConfig,
    indexer=DatabricksNativeVolumesIndexer,
    downloader_config=DatabricksNativeVolumesDownloaderConfig,
    downloader=DatabricksNativeVolumesDownloader,
)
@@ -0,0 +1,187 @@
1
+ import json
2
+ import os
3
+ from contextlib import contextmanager
4
+ from dataclasses import dataclass, field
5
+ from pathlib import Path
6
+ from typing import TYPE_CHECKING, Any, Generator, Optional
7
+
8
+ from pydantic import Field
9
+
10
+ from unstructured_ingest.data_types.file_data import FileData
11
+ from unstructured_ingest.error import ValueError
12
+ from unstructured_ingest.interfaces import (
13
+ Uploader,
14
+ UploaderConfig,
15
+ UploadStager,
16
+ UploadStagerConfig,
17
+ )
18
+ from unstructured_ingest.logger import logger
19
+ from unstructured_ingest.processes.connector_registry import (
20
+ DestinationRegistryEntry,
21
+ )
22
+ from unstructured_ingest.processes.connectors.databricks.volumes import DatabricksPathMixin
23
+ from unstructured_ingest.processes.connectors.sql.databricks_delta_tables import (
24
+ DatabricksDeltaTablesConnectionConfig,
25
+ DatabricksDeltaTablesUploadStagerConfig,
26
+ )
27
+ from unstructured_ingest.utils.constants import RECORD_ID_LABEL
28
+ from unstructured_ingest.utils.data_prep import get_enhanced_element_id, get_json_data, write_data
29
+
30
# Label attached to the uploader defined in this module via its
# ``connector_type`` field.
CONNECTOR_TYPE = "databricks_volume_delta_tables"

# Placeholder for typing-only imports; currently there are none.
if TYPE_CHECKING:
    pass
34
+
35
+
36
# Uploader settings: which database/table receives the rows. The volume
# location (``catalog``, ``path``) used elsewhere in this module is read from
# this config too -- presumably supplied by DatabricksPathMixin; confirm there.
class DatabricksVolumeDeltaTableUploaderConfig(UploaderConfig, DatabricksPathMixin):
    database: str = Field(default="default", description="Database name")
    # When left as None, the uploader fills this in with its destination name.
    table_name: Optional[str] = Field(default=None, description="Table name")
39
+
40
+
41
# Stager configuration for the volume -> delta table connector. It adds no
# fields of its own; everything is inherited from the base stager config.
class DatabricksVolumeDeltaTableStagerConfig(UploadStagerConfig):
    pass
43
+
44
+
45
@dataclass
class DatabricksVolumeDeltaTableStager(UploadStager):
    """Rewrite an elements file into the JSON shape expected by the uploader."""

    upload_stager_config: DatabricksVolumeDeltaTableStagerConfig = field(
        default_factory=DatabricksVolumeDeltaTableStagerConfig
    )

    def run(
        self,
        elements_filepath: Path,
        output_dir: Path,
        output_filename: str,
        file_data: FileData,
        **kwargs: Any,
    ) -> Path:
        """Stage the elements of one file and return the staged file's path.

        Each element gets a rewritten ``id``, a record-id field set to the
        file's identifier, and its ``metadata`` serialized to a JSON string.

        Args:
            elements_filepath: File holding the elements to stage.
            output_dir: Directory for the staged file; created if missing.
            output_filename: Name of the staged file; its suffix is replaced
                with ``.json``.
            file_data: Description of the source file the elements came from.
            **kwargs: Accepted for interface compatibility; unused here.

        Returns:
            Path of the staged ``.json`` file.
        """
        output_dir.mkdir(parents=True, exist_ok=True)
        # The staged file is always JSON, whatever suffix the caller asked for.
        staged_path = (output_dir / output_filename).with_suffix(".json")

        elements = get_json_data(path=elements_filepath)
        for record in elements:
            enhanced_id = get_enhanced_element_id(element_dict=record, file_data=file_data)
            record["id"] = enhanced_id
            record[RECORD_ID_LABEL] = file_data.identifier
            serialized_metadata = json.dumps(record.get("metadata", {}))
            record["metadata"] = serialized_metadata

        # No indentation: avoids new line issues when the file is later
        # migrated from the volume into a delta table.
        write_data(path=staged_path, data=elements, indent=None)
        return staged_path
71
+
72
+
73
@dataclass
class DatabricksVolumeDeltaTableUploader(Uploader):
    """Upload staged JSON to a Databricks volume, then load it into a delta table.

    The staged file is first copied to the configured volume path with a
    ``PUT`` statement and its rows are then inserted into the target table
    with an ``INSERT INTO ... SELECT ... FROM json.`<path>``` statement.
    """

    connection_config: DatabricksDeltaTablesConnectionConfig
    upload_config: DatabricksVolumeDeltaTableUploaderConfig
    connector_type: str = CONNECTOR_TYPE
    # Lazily populated cache of the target table's columns
    # (see ``get_table_columns``).
    _columns: Optional[dict[str, str]] = None

    def init(self, **kwargs: Any) -> None:
        """Make sure the destination table exists before any upload runs."""
        self.create_destination(**kwargs)

    def create_destination(
        self, destination_name: str = "unstructuredautocreated", **kwargs: Any
    ) -> bool:
        """Create the destination table if it does not exist yet.

        The configured table name is used when set; otherwise
        ``destination_name`` is used and written back to the upload config.

        Args:
            destination_name: Fallback table name.
            **kwargs: Accepted for interface compatibility; unused here.

        Returns:
            True if a table was created, False if it already existed.
        """
        table_name = self.upload_config.table_name or destination_name
        self.upload_config.table_name = table_name
        connectors_dir = Path(__file__).parents[1]
        collection_config_file = connectors_dir / "assets" / "databricks_delta_table_schema.sql"
        with self.get_cursor() as cursor:
            cursor.execute("SHOW TABLES")
            # The second column of each returned row is taken as the table name.
            table_names = [r[1] for r in cursor.fetchall()]
            if table_name in table_names:
                return False
            with collection_config_file.open() as schema_file:
                data_lines = schema_file.readlines()
            # Only the first line of the schema file is rewritten: the
            # "elements" placeholder in it becomes the requested table name.
            data_lines[0] = data_lines[0].replace("elements", table_name)
            destination_schema = "".join([line.strip() for line in data_lines])
            logger.info(f"creating table {table_name} for user")
            cursor.execute(destination_schema)
            return True

    def precheck(self) -> None:
        """Validate that the configured catalog and database exist.

        Raises:
            ValueError: If the catalog is not listed by ``SHOW CATALOGS`` or
                the database is not listed by ``SHOW DATABASES`` within that
                catalog.
        """
        with self.connection_config.get_cursor() as cursor:
            cursor.execute("SHOW CATALOGS")
            catalogs = [r[0] for r in cursor.fetchall()]
            if self.upload_config.catalog not in catalogs:
                raise ValueError(
                    "Catalog {} not found in {}".format(
                        self.upload_config.catalog, ", ".join(catalogs)
                    )
                )
            # Switch catalog first so the database listing is scoped to it.
            cursor.execute(f"USE CATALOG '{self.upload_config.catalog}'")
            cursor.execute("SHOW DATABASES")
            databases = [r[0] for r in cursor.fetchall()]
            if self.upload_config.database not in databases:
                raise ValueError(
                    "Database {} not found in {}".format(
                        self.upload_config.database, ", ".join(databases)
                    )
                )

    def get_output_path(self, file_data: FileData, suffix: str = ".json") -> str:
        """Return the volume path the staged file for ``file_data`` is uploaded to.

        Args:
            file_data: Description of the source file; its filename is reused.
            suffix: Suffix the uploaded file must end with; appended when the
                source filename does not already carry it.

        Returns:
            The configured volume path joined with the adjusted filename.
        """
        filename = Path(file_data.source_identifiers.filename)
        adjusted_filename = filename if filename.suffix == suffix else f"{filename}{suffix}"
        return os.path.join(self.upload_config.path, f"{adjusted_filename}")

    @contextmanager
    def get_cursor(self, **connect_kwargs) -> Generator[Any, None, None]:
        """Yield a cursor already switched to the configured catalog and database.

        Args:
            **connect_kwargs: Forwarded to the connection config's
                ``get_cursor``.
        """
        with self.connection_config.get_cursor(**connect_kwargs) as cursor:
            logger.debug(f"executing: USE CATALOG: '{self.upload_config.catalog}'")
            cursor.execute(f"USE CATALOG '{self.upload_config.catalog}'")
            logger.debug(f"executing: USE DATABASE: {self.upload_config.database}")
            cursor.execute(f"USE DATABASE {self.upload_config.database}")
            yield cursor

    def get_table_columns(self) -> dict[str, str]:
        """Return the target table's columns, querying the table only once.

        Returns:
            Mapping built from the first two entries of each item in the
            cursor description of a one-row ``SELECT *``.
        """
        if self._columns is None:
            with self.get_cursor() as cursor:
                cursor.execute(f"SELECT * from `{self.upload_config.table_name}` LIMIT 1")
                self._columns = {desc[0]: desc[1] for desc in cursor.description}
        return self._columns

    def can_delete(self) -> bool:
        """Return True when the target table has the record-id column."""
        existing_columns = self.get_table_columns()
        return RECORD_ID_LABEL in existing_columns

    def delete_previous_content(self, file_data: FileData) -> None:
        """Delete rows previously written for the same source file.

        Args:
            file_data: Description of the source file; rows whose record-id
                column equals its identifier are removed.
        """
        logger.debug(
            f"deleting any content with metadata "
            f"{RECORD_ID_LABEL}={file_data.identifier} "
            f"from delta table: {self.upload_config.table_name}"
        )
        with self.get_cursor() as cursor:
            cursor.execute(
                f"DELETE FROM `{self.upload_config.table_name}` WHERE {RECORD_ID_LABEL} = '{file_data.identifier}'"  # noqa: E501
            )
            results = cursor.fetchall()
            # The count is only used for the debug message below, so a
            # missing result row must not fail the upload.
            deleted_rows = results[0][0] if results and results[0] else 0
            logger.debug(f"deleted {deleted_rows} rows from table {self.upload_config.table_name}")

    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        """Upload one staged file to the volume and insert its rows into the table.

        Rows from an earlier run for the same file are deleted first when the
        table has the record-id column.

        Args:
            path: Local staged JSON file to upload.
            file_data: Description of the source file the staged data came from.
            **kwargs: Accepted for interface compatibility; unused here.
        """
        if self.can_delete():
            self.delete_previous_content(file_data=file_data)
        # The directory holding the staged file is passed to the connection
        # as the allowed local staging path for the PUT below.
        with self.get_cursor(staging_allowed_local_path=path.parent.as_posix()) as cursor:
            catalog_path = self.get_output_path(file_data=file_data)
            logger.debug(f"uploading {path.as_posix()} to {catalog_path}")
            cursor.execute(f"PUT '{path.as_posix()}' INTO '{catalog_path}' OVERWRITE")
            logger.debug(
                f"migrating content from {catalog_path} to table {self.upload_config.table_name}"
            )
            data = get_json_data(path=path)
            if not data:
                # The column list is derived from the first element, so an
                # empty staged file leaves nothing to insert.
                logger.debug(f"no elements found in {path.as_posix()}, skipping table insert")
                return
            columns = data[0].keys()
            # metadata was staged as a JSON string; parse it back on insert.
            select_columns = ["PARSE_JSON(metadata)" if c == "metadata" else c for c in columns]
            column_str = ", ".join(columns)
            select_column_str = ", ".join(select_columns)
            sql_statement = f"INSERT INTO `{self.upload_config.table_name}` ({column_str}) SELECT {select_column_str} FROM json.`{catalog_path}`"  # noqa: E501
            cursor.execute(sql_statement)
179
+
180
+
181
# Destination registration for the volume -> delta table connector.
# NOTE(review): the stager registered here declares
# DatabricksVolumeDeltaTableStagerConfig as its config type, while the entry
# registers DatabricksDeltaTablesUploadStagerConfig -- confirm this is intended.
databricks_volumes_delta_tables_destination_entry = DestinationRegistryEntry(
    connection_config=DatabricksDeltaTablesConnectionConfig,
    upload_stager_config=DatabricksDeltaTablesUploadStagerConfig,
    upload_stager=DatabricksVolumeDeltaTableStager,
    uploader_config=DatabricksVolumeDeltaTableUploaderConfig,
    uploader=DatabricksVolumeDeltaTableUploader,
)