unstructured-ingest 0.3.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (557)
  1. test/__init__.py +0 -0
  2. test/integration/__init__.py +0 -0
  3. test/integration/chunkers/__init__.py +0 -0
  4. test/integration/chunkers/test_chunkers.py +31 -0
  5. test/integration/connectors/__init__.py +0 -0
  6. test/integration/connectors/conftest.py +38 -0
  7. test/integration/connectors/databricks/__init__.py +0 -0
  8. test/integration/connectors/databricks/test_volumes_native.py +269 -0
  9. test/integration/connectors/discord/__init__.py +0 -0
  10. test/integration/connectors/discord/test_discord.py +90 -0
  11. test/integration/connectors/duckdb/__init__.py +0 -0
  12. test/integration/connectors/duckdb/conftest.py +14 -0
  13. test/integration/connectors/duckdb/test_duckdb.py +89 -0
  14. test/integration/connectors/duckdb/test_motherduck.py +95 -0
  15. test/integration/connectors/elasticsearch/__init__.py +0 -0
  16. test/integration/connectors/elasticsearch/conftest.py +34 -0
  17. test/integration/connectors/elasticsearch/test_elasticsearch.py +330 -0
  18. test/integration/connectors/elasticsearch/test_opensearch.py +325 -0
  19. test/integration/connectors/sql/__init__.py +0 -0
  20. test/integration/connectors/sql/test_postgres.py +195 -0
  21. test/integration/connectors/sql/test_singlestore.py +176 -0
  22. test/integration/connectors/sql/test_snowflake.py +238 -0
  23. test/integration/connectors/sql/test_sqlite.py +162 -0
  24. test/integration/connectors/test_astradb.py +217 -0
  25. test/integration/connectors/test_azure_ai_search.py +255 -0
  26. test/integration/connectors/test_chroma.py +120 -0
  27. test/integration/connectors/test_confluence.py +113 -0
  28. test/integration/connectors/test_delta_table.py +185 -0
  29. test/integration/connectors/test_lancedb.py +247 -0
  30. test/integration/connectors/test_milvus.py +203 -0
  31. test/integration/connectors/test_mongodb.py +335 -0
  32. test/integration/connectors/test_neo4j.py +236 -0
  33. test/integration/connectors/test_notion.py +145 -0
  34. test/integration/connectors/test_onedrive.py +118 -0
  35. test/integration/connectors/test_pinecone.py +288 -0
  36. test/integration/connectors/test_qdrant.py +215 -0
  37. test/integration/connectors/test_redis.py +119 -0
  38. test/integration/connectors/test_s3.py +183 -0
  39. test/integration/connectors/test_vectara.py +270 -0
  40. test/integration/connectors/utils/__init__.py +0 -0
  41. test/integration/connectors/utils/constants.py +7 -0
  42. test/integration/connectors/utils/docker.py +151 -0
  43. test/integration/connectors/utils/docker_compose.py +59 -0
  44. test/integration/connectors/utils/validation/__init__.py +0 -0
  45. test/integration/connectors/utils/validation/destination.py +75 -0
  46. test/integration/connectors/utils/validation/equality.py +75 -0
  47. test/integration/connectors/utils/validation/source.py +299 -0
  48. test/integration/connectors/utils/validation/utils.py +36 -0
  49. test/integration/connectors/weaviate/__init__.py +0 -0
  50. test/integration/connectors/weaviate/conftest.py +15 -0
  51. test/integration/connectors/weaviate/test_cloud.py +34 -0
  52. test/integration/connectors/weaviate/test_local.py +131 -0
  53. test/integration/embedders/__init__.py +0 -0
  54. test/integration/embedders/conftest.py +13 -0
  55. test/integration/embedders/test_azure_openai.py +59 -0
  56. test/integration/embedders/test_bedrock.py +103 -0
  57. test/integration/embedders/test_huggingface.py +26 -0
  58. test/integration/embedders/test_mixedbread.py +71 -0
  59. test/integration/embedders/test_octoai.py +77 -0
  60. test/integration/embedders/test_openai.py +76 -0
  61. test/integration/embedders/test_togetherai.py +71 -0
  62. test/integration/embedders/test_vertexai.py +65 -0
  63. test/integration/embedders/test_voyageai.py +65 -0
  64. test/integration/embedders/utils.py +68 -0
  65. test/integration/partitioners/__init__.py +0 -0
  66. test/integration/partitioners/test_partitioner.py +75 -0
  67. test/integration/utils.py +15 -0
  68. test/unit/__init__.py +0 -0
  69. test/unit/embed/__init__.py +0 -0
  70. test/unit/embed/test_mixedbreadai.py +42 -0
  71. test/unit/embed/test_octoai.py +27 -0
  72. test/unit/embed/test_openai.py +20 -0
  73. test/unit/embed/test_vertexai.py +25 -0
  74. test/unit/embed/test_voyageai.py +24 -0
  75. test/unit/test_error.py +27 -0
  76. test/unit/test_logger.py +78 -0
  77. test/unit/test_utils.py +184 -0
  78. test/unit/v2/__init__.py +0 -0
  79. test/unit/v2/chunkers/__init__.py +0 -0
  80. test/unit/v2/chunkers/test_chunkers.py +49 -0
  81. test/unit/v2/connectors/__init__.py +0 -0
  82. test/unit/v2/connectors/test_confluence.py +39 -0
  83. test/unit/v2/embedders/__init__.py +0 -0
  84. test/unit/v2/embedders/test_bedrock.py +36 -0
  85. test/unit/v2/embedders/test_huggingface.py +48 -0
  86. test/unit/v2/embedders/test_mixedbread.py +37 -0
  87. test/unit/v2/embedders/test_octoai.py +35 -0
  88. test/unit/v2/embedders/test_openai.py +35 -0
  89. test/unit/v2/embedders/test_togetherai.py +37 -0
  90. test/unit/v2/embedders/test_vertexai.py +37 -0
  91. test/unit/v2/embedders/test_voyageai.py +38 -0
  92. test/unit/v2/partitioners/__init__.py +0 -0
  93. test/unit/v2/partitioners/test_partitioner.py +63 -0
  94. test/unit/v2/test_interfaces.py +26 -0
  95. test/unit/v2/test_utils.py +82 -0
  96. test/unit/v2/utils/__init__.py +0 -0
  97. test/unit/v2/utils/data_generator.py +32 -0
  98. unstructured_ingest/__init__.py +1 -0
  99. unstructured_ingest/__version__.py +1 -0
  100. unstructured_ingest/cli/__init__.py +14 -0
  101. unstructured_ingest/cli/base/__init__.py +0 -0
  102. unstructured_ingest/cli/base/cmd.py +19 -0
  103. unstructured_ingest/cli/base/dest.py +87 -0
  104. unstructured_ingest/cli/base/src.py +57 -0
  105. unstructured_ingest/cli/cli.py +37 -0
  106. unstructured_ingest/cli/cmd_factory.py +12 -0
  107. unstructured_ingest/cli/cmds/__init__.py +145 -0
  108. unstructured_ingest/cli/cmds/airtable.py +69 -0
  109. unstructured_ingest/cli/cmds/astradb.py +99 -0
  110. unstructured_ingest/cli/cmds/azure_ai_search.py +65 -0
  111. unstructured_ingest/cli/cmds/biomed.py +52 -0
  112. unstructured_ingest/cli/cmds/chroma.py +104 -0
  113. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  114. unstructured_ingest/cli/cmds/confluence.py +69 -0
  115. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  116. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  117. unstructured_ingest/cli/cmds/discord.py +47 -0
  118. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  119. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  120. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  121. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  122. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  123. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  124. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  125. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  126. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  127. unstructured_ingest/cli/cmds/github.py +54 -0
  128. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  129. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  130. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  131. unstructured_ingest/cli/cmds/jira.py +71 -0
  132. unstructured_ingest/cli/cmds/kafka.py +102 -0
  133. unstructured_ingest/cli/cmds/local.py +43 -0
  134. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  135. unstructured_ingest/cli/cmds/notion.py +48 -0
  136. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  137. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  138. unstructured_ingest/cli/cmds/outlook.py +67 -0
  139. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  140. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  141. unstructured_ingest/cli/cmds/reddit.py +67 -0
  142. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  143. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  144. unstructured_ingest/cli/cmds/slack.py +56 -0
  145. unstructured_ingest/cli/cmds/sql.py +66 -0
  146. unstructured_ingest/cli/cmds/vectara.py +66 -0
  147. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  148. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  149. unstructured_ingest/cli/common.py +7 -0
  150. unstructured_ingest/cli/interfaces.py +663 -0
  151. unstructured_ingest/cli/utils.py +205 -0
  152. unstructured_ingest/connector/__init__.py +0 -0
  153. unstructured_ingest/connector/airtable.py +309 -0
  154. unstructured_ingest/connector/astradb.py +267 -0
  155. unstructured_ingest/connector/azure_ai_search.py +144 -0
  156. unstructured_ingest/connector/biomed.py +320 -0
  157. unstructured_ingest/connector/chroma.py +158 -0
  158. unstructured_ingest/connector/clarifai.py +122 -0
  159. unstructured_ingest/connector/confluence.py +285 -0
  160. unstructured_ingest/connector/databricks_volumes.py +137 -0
  161. unstructured_ingest/connector/delta_table.py +203 -0
  162. unstructured_ingest/connector/discord.py +180 -0
  163. unstructured_ingest/connector/elasticsearch.py +396 -0
  164. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  165. unstructured_ingest/connector/fsspec/azure.py +78 -0
  166. unstructured_ingest/connector/fsspec/box.py +109 -0
  167. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  168. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  169. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  170. unstructured_ingest/connector/fsspec/s3.py +62 -0
  171. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  172. unstructured_ingest/connector/git.py +124 -0
  173. unstructured_ingest/connector/github.py +174 -0
  174. unstructured_ingest/connector/gitlab.py +142 -0
  175. unstructured_ingest/connector/google_drive.py +348 -0
  176. unstructured_ingest/connector/hubspot.py +278 -0
  177. unstructured_ingest/connector/jira.py +469 -0
  178. unstructured_ingest/connector/kafka.py +293 -0
  179. unstructured_ingest/connector/local.py +139 -0
  180. unstructured_ingest/connector/mongodb.py +284 -0
  181. unstructured_ingest/connector/notion/__init__.py +0 -0
  182. unstructured_ingest/connector/notion/client.py +248 -0
  183. unstructured_ingest/connector/notion/connector.py +469 -0
  184. unstructured_ingest/connector/notion/helpers.py +584 -0
  185. unstructured_ingest/connector/notion/interfaces.py +32 -0
  186. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  187. unstructured_ingest/connector/notion/types/block.py +96 -0
  188. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  189. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  190. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  191. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  192. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  193. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  194. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  195. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  196. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  197. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  198. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  199. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  200. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  201. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  202. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  203. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  204. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  205. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  206. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  207. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  208. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  209. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  210. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  211. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  212. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  213. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  214. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  215. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  216. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  217. unstructured_ingest/connector/notion/types/database.py +73 -0
  218. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  219. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  220. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  221. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  222. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  223. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  224. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  225. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  226. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  227. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  228. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  229. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  230. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  231. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  232. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  233. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  234. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  235. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  236. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  237. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  238. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  239. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  240. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  241. unstructured_ingest/connector/notion/types/date.py +26 -0
  242. unstructured_ingest/connector/notion/types/file.py +51 -0
  243. unstructured_ingest/connector/notion/types/page.py +45 -0
  244. unstructured_ingest/connector/notion/types/parent.py +66 -0
  245. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  246. unstructured_ingest/connector/notion/types/user.py +76 -0
  247. unstructured_ingest/connector/onedrive.py +232 -0
  248. unstructured_ingest/connector/opensearch.py +218 -0
  249. unstructured_ingest/connector/outlook.py +285 -0
  250. unstructured_ingest/connector/pinecone.py +140 -0
  251. unstructured_ingest/connector/qdrant.py +144 -0
  252. unstructured_ingest/connector/reddit.py +166 -0
  253. unstructured_ingest/connector/registry.py +109 -0
  254. unstructured_ingest/connector/salesforce.py +301 -0
  255. unstructured_ingest/connector/sharepoint.py +573 -0
  256. unstructured_ingest/connector/slack.py +224 -0
  257. unstructured_ingest/connector/sql.py +199 -0
  258. unstructured_ingest/connector/vectara.py +253 -0
  259. unstructured_ingest/connector/weaviate.py +190 -0
  260. unstructured_ingest/connector/wikipedia.py +208 -0
  261. unstructured_ingest/embed/__init__.py +0 -0
  262. unstructured_ingest/embed/azure_openai.py +31 -0
  263. unstructured_ingest/embed/bedrock.py +193 -0
  264. unstructured_ingest/embed/huggingface.py +52 -0
  265. unstructured_ingest/embed/interfaces.py +117 -0
  266. unstructured_ingest/embed/mixedbreadai.py +233 -0
  267. unstructured_ingest/embed/octoai.py +130 -0
  268. unstructured_ingest/embed/openai.py +116 -0
  269. unstructured_ingest/embed/togetherai.py +106 -0
  270. unstructured_ingest/embed/vertexai.py +126 -0
  271. unstructured_ingest/embed/voyageai.py +130 -0
  272. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  273. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  274. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  275. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  276. unstructured_ingest/error.py +49 -0
  277. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  278. unstructured_ingest/ingest_backoff/_common.py +102 -0
  279. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  280. unstructured_ingest/interfaces.py +852 -0
  281. unstructured_ingest/logger.py +130 -0
  282. unstructured_ingest/main.py +11 -0
  283. unstructured_ingest/pipeline/__init__.py +22 -0
  284. unstructured_ingest/pipeline/copy.py +19 -0
  285. unstructured_ingest/pipeline/doc_factory.py +12 -0
  286. unstructured_ingest/pipeline/interfaces.py +270 -0
  287. unstructured_ingest/pipeline/partition.py +60 -0
  288. unstructured_ingest/pipeline/permissions.py +12 -0
  289. unstructured_ingest/pipeline/pipeline.py +117 -0
  290. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  291. unstructured_ingest/pipeline/reformat/chunking.py +134 -0
  292. unstructured_ingest/pipeline/reformat/embedding.py +64 -0
  293. unstructured_ingest/pipeline/source.py +77 -0
  294. unstructured_ingest/pipeline/utils.py +6 -0
  295. unstructured_ingest/pipeline/write.py +18 -0
  296. unstructured_ingest/processor.py +93 -0
  297. unstructured_ingest/runner/__init__.py +104 -0
  298. unstructured_ingest/runner/airtable.py +35 -0
  299. unstructured_ingest/runner/astradb.py +34 -0
  300. unstructured_ingest/runner/base_runner.py +89 -0
  301. unstructured_ingest/runner/biomed.py +45 -0
  302. unstructured_ingest/runner/confluence.py +35 -0
  303. unstructured_ingest/runner/delta_table.py +34 -0
  304. unstructured_ingest/runner/discord.py +35 -0
  305. unstructured_ingest/runner/elasticsearch.py +40 -0
  306. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  307. unstructured_ingest/runner/fsspec/azure.py +30 -0
  308. unstructured_ingest/runner/fsspec/box.py +28 -0
  309. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  310. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  311. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  312. unstructured_ingest/runner/fsspec/s3.py +28 -0
  313. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  314. unstructured_ingest/runner/github.py +37 -0
  315. unstructured_ingest/runner/gitlab.py +37 -0
  316. unstructured_ingest/runner/google_drive.py +35 -0
  317. unstructured_ingest/runner/hubspot.py +35 -0
  318. unstructured_ingest/runner/jira.py +35 -0
  319. unstructured_ingest/runner/kafka.py +34 -0
  320. unstructured_ingest/runner/local.py +23 -0
  321. unstructured_ingest/runner/mongodb.py +34 -0
  322. unstructured_ingest/runner/notion.py +61 -0
  323. unstructured_ingest/runner/onedrive.py +35 -0
  324. unstructured_ingest/runner/opensearch.py +40 -0
  325. unstructured_ingest/runner/outlook.py +33 -0
  326. unstructured_ingest/runner/reddit.py +35 -0
  327. unstructured_ingest/runner/salesforce.py +33 -0
  328. unstructured_ingest/runner/sharepoint.py +35 -0
  329. unstructured_ingest/runner/slack.py +33 -0
  330. unstructured_ingest/runner/utils.py +47 -0
  331. unstructured_ingest/runner/wikipedia.py +35 -0
  332. unstructured_ingest/runner/writers/__init__.py +48 -0
  333. unstructured_ingest/runner/writers/astradb.py +22 -0
  334. unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
  335. unstructured_ingest/runner/writers/base_writer.py +26 -0
  336. unstructured_ingest/runner/writers/chroma.py +22 -0
  337. unstructured_ingest/runner/writers/clarifai.py +19 -0
  338. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  339. unstructured_ingest/runner/writers/delta_table.py +24 -0
  340. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  341. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  342. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  343. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  344. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  345. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  346. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  347. unstructured_ingest/runner/writers/kafka.py +21 -0
  348. unstructured_ingest/runner/writers/mongodb.py +21 -0
  349. unstructured_ingest/runner/writers/opensearch.py +26 -0
  350. unstructured_ingest/runner/writers/pinecone.py +21 -0
  351. unstructured_ingest/runner/writers/qdrant.py +19 -0
  352. unstructured_ingest/runner/writers/sql.py +22 -0
  353. unstructured_ingest/runner/writers/vectara.py +22 -0
  354. unstructured_ingest/runner/writers/weaviate.py +21 -0
  355. unstructured_ingest/utils/__init__.py +0 -0
  356. unstructured_ingest/utils/chunking.py +56 -0
  357. unstructured_ingest/utils/compression.py +118 -0
  358. unstructured_ingest/utils/data_prep.py +200 -0
  359. unstructured_ingest/utils/dep_check.py +78 -0
  360. unstructured_ingest/utils/google_filetype.py +9 -0
  361. unstructured_ingest/utils/string_and_date_utils.py +49 -0
  362. unstructured_ingest/utils/table.py +73 -0
  363. unstructured_ingest/v2/__init__.py +1 -0
  364. unstructured_ingest/v2/cli/__init__.py +0 -0
  365. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  366. unstructured_ingest/v2/cli/base/cmd.py +269 -0
  367. unstructured_ingest/v2/cli/base/dest.py +85 -0
  368. unstructured_ingest/v2/cli/base/importer.py +34 -0
  369. unstructured_ingest/v2/cli/base/src.py +85 -0
  370. unstructured_ingest/v2/cli/cli.py +24 -0
  371. unstructured_ingest/v2/cli/cmds.py +14 -0
  372. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  373. unstructured_ingest/v2/cli/utils/click.py +237 -0
  374. unstructured_ingest/v2/cli/utils/model_conversion.py +222 -0
  375. unstructured_ingest/v2/constants.py +2 -0
  376. unstructured_ingest/v2/errors.py +18 -0
  377. unstructured_ingest/v2/interfaces/__init__.py +32 -0
  378. unstructured_ingest/v2/interfaces/connector.py +50 -0
  379. unstructured_ingest/v2/interfaces/downloader.py +89 -0
  380. unstructured_ingest/v2/interfaces/file_data.py +116 -0
  381. unstructured_ingest/v2/interfaces/indexer.py +30 -0
  382. unstructured_ingest/v2/interfaces/process.py +19 -0
  383. unstructured_ingest/v2/interfaces/processor.py +88 -0
  384. unstructured_ingest/v2/interfaces/upload_stager.py +102 -0
  385. unstructured_ingest/v2/interfaces/uploader.py +53 -0
  386. unstructured_ingest/v2/logger.py +126 -0
  387. unstructured_ingest/v2/main.py +11 -0
  388. unstructured_ingest/v2/otel.py +111 -0
  389. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  390. unstructured_ingest/v2/pipeline/interfaces.py +211 -0
  391. unstructured_ingest/v2/pipeline/otel.py +32 -0
  392. unstructured_ingest/v2/pipeline/pipeline.py +384 -0
  393. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  394. unstructured_ingest/v2/pipeline/steps/chunk.py +80 -0
  395. unstructured_ingest/v2/pipeline/steps/download.py +207 -0
  396. unstructured_ingest/v2/pipeline/steps/embed.py +79 -0
  397. unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
  398. unstructured_ingest/v2/pipeline/steps/index.py +86 -0
  399. unstructured_ingest/v2/pipeline/steps/partition.py +79 -0
  400. unstructured_ingest/v2/pipeline/steps/stage.py +65 -0
  401. unstructured_ingest/v2/pipeline/steps/uncompress.py +50 -0
  402. unstructured_ingest/v2/pipeline/steps/upload.py +58 -0
  403. unstructured_ingest/v2/processes/__init__.py +18 -0
  404. unstructured_ingest/v2/processes/chunker.py +124 -0
  405. unstructured_ingest/v2/processes/connector_registry.py +69 -0
  406. unstructured_ingest/v2/processes/connectors/__init__.py +117 -0
  407. unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
  408. unstructured_ingest/v2/processes/connectors/astradb.py +402 -0
  409. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +276 -0
  410. unstructured_ingest/v2/processes/connectors/chroma.py +190 -0
  411. unstructured_ingest/v2/processes/connectors/confluence.py +207 -0
  412. unstructured_ingest/v2/processes/connectors/couchbase.py +334 -0
  413. unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
  414. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +208 -0
  415. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
  416. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
  417. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
  418. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
  419. unstructured_ingest/v2/processes/connectors/delta_table.py +191 -0
  420. unstructured_ingest/v2/processes/connectors/discord.py +158 -0
  421. unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
  422. unstructured_ingest/v2/processes/connectors/duckdb/base.py +100 -0
  423. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +127 -0
  424. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +126 -0
  425. unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
  426. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +470 -0
  427. unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +195 -0
  428. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  429. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +197 -0
  430. unstructured_ingest/v2/processes/connectors/fsspec/box.py +170 -0
  431. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +168 -0
  432. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +332 -0
  433. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +197 -0
  434. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +185 -0
  435. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +171 -0
  436. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  437. unstructured_ingest/v2/processes/connectors/gitlab.py +268 -0
  438. unstructured_ingest/v2/processes/connectors/google_drive.py +348 -0
  439. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
  440. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +121 -0
  441. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +273 -0
  442. unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
  443. unstructured_ingest/v2/processes/connectors/kdbai.py +148 -0
  444. unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +30 -0
  445. unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
  446. unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
  447. unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
  448. unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
  449. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +169 -0
  450. unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
  451. unstructured_ingest/v2/processes/connectors/local.py +217 -0
  452. unstructured_ingest/v2/processes/connectors/milvus.py +225 -0
  453. unstructured_ingest/v2/processes/connectors/mongodb.py +361 -0
  454. unstructured_ingest/v2/processes/connectors/neo4j.py +385 -0
  455. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  456. unstructured_ingest/v2/processes/connectors/notion/client.py +349 -0
  457. unstructured_ingest/v2/processes/connectors/notion/connector.py +346 -0
  458. unstructured_ingest/v2/processes/connectors/notion/helpers.py +448 -0
  459. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +32 -0
  460. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  461. unstructured_ingest/v2/processes/connectors/notion/types/block.py +96 -0
  462. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +63 -0
  463. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  464. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  465. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  466. unstructured_ingest/v2/processes/connectors/notion/types/blocks/callout.py +94 -0
  467. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_database.py +23 -0
  468. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +23 -0
  469. unstructured_ingest/v2/processes/connectors/notion/types/blocks/code.py +43 -0
  470. unstructured_ingest/v2/processes/connectors/notion/types/blocks/column_list.py +35 -0
  471. unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py +22 -0
  472. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +36 -0
  473. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +23 -0
  474. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +49 -0
  475. unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py +37 -0
  476. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +21 -0
  477. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  478. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  479. unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  480. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  481. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +49 -0
  482. unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py +37 -0
  483. unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py +57 -0
  484. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py +63 -0
  485. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  486. unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py +30 -0
  487. unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py +42 -0
  488. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +37 -0
  489. unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  490. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +22 -0
  491. unstructured_ingest/v2/processes/connectors/notion/types/database.py +73 -0
  492. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +106 -0
  493. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py +38 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +35 -0
  495. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +34 -0
  496. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py +41 -0
  497. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +36 -0
  498. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +37 -0
  499. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/formula.py +49 -0
  500. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  501. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_time.py +34 -0
  502. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/multiselect.py +73 -0
  503. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +49 -0
  504. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py +41 -0
  505. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/phone_number.py +36 -0
  506. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/relation.py +67 -0
  507. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +43 -0
  508. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +56 -0
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py +69 -0
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py +81 -0
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +37 -0
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/unique_id.py +50 -0
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +37 -0
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +78 -0
  515. unstructured_ingest/v2/processes/connectors/notion/types/date.py +29 -0
  516. unstructured_ingest/v2/processes/connectors/notion/types/file.py +54 -0
  517. unstructured_ingest/v2/processes/connectors/notion/types/page.py +45 -0
  518. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +66 -0
  519. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +189 -0
  520. unstructured_ingest/v2/processes/connectors/notion/types/user.py +79 -0
  521. unstructured_ingest/v2/processes/connectors/onedrive.py +447 -0
  522. unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
  523. unstructured_ingest/v2/processes/connectors/pinecone.py +277 -0
  524. unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
  525. unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
  526. unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
  527. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +160 -0
  528. unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
  529. unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
  530. unstructured_ingest/v2/processes/connectors/salesforce.py +303 -0
  531. unstructured_ingest/v2/processes/connectors/sharepoint.py +448 -0
  532. unstructured_ingest/v2/processes/connectors/slack.py +248 -0
  533. unstructured_ingest/v2/processes/connectors/sql/__init__.py +27 -0
  534. unstructured_ingest/v2/processes/connectors/sql/postgres.py +162 -0
  535. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +166 -0
  536. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +210 -0
  537. unstructured_ingest/v2/processes/connectors/sql/sql.py +434 -0
  538. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +168 -0
  539. unstructured_ingest/v2/processes/connectors/utils.py +29 -0
  540. unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
  541. unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
  542. unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +165 -0
  543. unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
  544. unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
  545. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +267 -0
  546. unstructured_ingest/v2/processes/embedder.py +195 -0
  547. unstructured_ingest/v2/processes/filter.py +60 -0
  548. unstructured_ingest/v2/processes/partitioner.py +188 -0
  549. unstructured_ingest/v2/processes/uncompress.py +61 -0
  550. unstructured_ingest/v2/unstructured_api.py +128 -0
  551. unstructured_ingest/v2/utils.py +61 -0
  552. unstructured_ingest-0.3.13.dist-info/LICENSE.md +201 -0
  553. unstructured_ingest-0.3.13.dist-info/METADATA +205 -0
  554. unstructured_ingest-0.3.13.dist-info/RECORD +557 -0
  555. unstructured_ingest-0.3.13.dist-info/WHEEL +5 -0
  556. unstructured_ingest-0.3.13.dist-info/entry_points.txt +2 -0
  557. unstructured_ingest-0.3.13.dist-info/top_level.txt +2 -0
@@ -0,0 +1,27 @@
1
+ from __future__ import annotations
2
+
3
+ from unstructured_ingest.v2.processes.connector_registry import (
4
+ add_destination_entry,
5
+ add_source_entry,
6
+ )
7
+
8
+ from .postgres import CONNECTOR_TYPE as POSTGRES_CONNECTOR_TYPE
9
+ from .postgres import postgres_destination_entry, postgres_source_entry
10
+ from .singlestore import CONNECTOR_TYPE as SINGLESTORE_CONNECTOR_TYPE
11
+ from .singlestore import singlestore_destination_entry, singlestore_source_entry
12
+ from .snowflake import CONNECTOR_TYPE as SNOWFLAKE_CONNECTOR_TYPE
13
+ from .snowflake import snowflake_destination_entry, snowflake_source_entry
14
+ from .sqlite import CONNECTOR_TYPE as SQLITE_CONNECTOR_TYPE
15
+ from .sqlite import sqlite_destination_entry, sqlite_source_entry
16
+
17
# Register the source side of every SQL connector under its connector type.
# The tuple order preserves the original registration order.
for _connector_type, _entry in (
    (SQLITE_CONNECTOR_TYPE, sqlite_source_entry),
    (POSTGRES_CONNECTOR_TYPE, postgres_source_entry),
    (SNOWFLAKE_CONNECTOR_TYPE, snowflake_source_entry),
    (SINGLESTORE_CONNECTOR_TYPE, singlestore_source_entry),
):
    add_source_entry(source_type=_connector_type, entry=_entry)

# Register the destination side of every SQL connector, in the same order.
for _connector_type, _entry in (
    (SQLITE_CONNECTOR_TYPE, sqlite_destination_entry),
    (POSTGRES_CONNECTOR_TYPE, postgres_destination_entry),
    (SNOWFLAKE_CONNECTOR_TYPE, snowflake_destination_entry),
    (SINGLESTORE_CONNECTOR_TYPE, singlestore_destination_entry),
):
    add_destination_entry(destination_type=_connector_type, entry=_entry)

# Keep the loop helpers out of the package namespace.
del _connector_type, _entry
@@ -0,0 +1,162 @@
1
+ from contextlib import contextmanager
2
+ from dataclasses import dataclass, field
3
+ from typing import TYPE_CHECKING, Generator, Optional
4
+
5
+ from pydantic import Field, Secret
6
+
7
+ from unstructured_ingest.utils.dep_check import requires_dependencies
8
+ from unstructured_ingest.v2.logger import logger
9
+ from unstructured_ingest.v2.processes.connector_registry import (
10
+ DestinationRegistryEntry,
11
+ SourceRegistryEntry,
12
+ )
13
+ from unstructured_ingest.v2.processes.connectors.sql.sql import (
14
+ SQLAccessConfig,
15
+ SqlBatchFileData,
16
+ SQLConnectionConfig,
17
+ SQLDownloader,
18
+ SQLDownloaderConfig,
19
+ SQLIndexer,
20
+ SQLIndexerConfig,
21
+ SQLUploader,
22
+ SQLUploaderConfig,
23
+ SQLUploadStager,
24
+ SQLUploadStagerConfig,
25
+ )
26
+
27
+ if TYPE_CHECKING:
28
+ from psycopg2.extensions import connection as PostgresConnection
29
+ from psycopg2.extensions import cursor as PostgresCursor
30
+
31
+ CONNECTOR_TYPE = "postgres"
32
+
33
+
34
class PostgresAccessConfig(SQLAccessConfig):
    # Sensitive Postgres settings; PostgresConnectionConfig stores this model
    # wrapped in a pydantic `Secret`.
    password: Optional[str] = Field(description="DB password", default=None)
36
+
37
+
38
class PostgresConnectionConfig(SQLConnectionConfig):
    # Connection settings for a Postgres database, plus context managers that
    # hand out a psycopg2 connection / cursor and clean up after use.
    access_config: Secret[PostgresAccessConfig] = Field(
        default=PostgresAccessConfig(), validate_default=True
    )
    database: Optional[str] = Field(
        default=None,
        description="Database name.",
    )
    username: Optional[str] = Field(default=None, description="DB username")
    host: Optional[str] = Field(default=None, description="DB host")
    port: Optional[int] = Field(default=5432, description="DB host connection port")
    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)

    @contextmanager
    @requires_dependencies(["psycopg2"], extras="postgres")
    def get_connection(self) -> Generator["PostgresConnection", None, None]:
        """Yield a psycopg2 connection built from this config.

        On exit the connection is committed and then closed. The close is
        guaranteed even when the commit itself raises, so a failed commit
        can no longer leak the connection.
        """
        from psycopg2 import connect

        access_config = self.access_config.get_secret_value()
        connection = connect(
            user=self.username,
            password=access_config.password,
            dbname=self.database,
            host=self.host,
            port=self.port,
        )
        try:
            yield connection
        finally:
            # Commit as before, but never let a failing commit skip close().
            try:
                connection.commit()
            finally:
                connection.close()

    @contextmanager
    def get_cursor(self) -> Generator["PostgresCursor", None, None]:
        """Yield a cursor on a fresh connection; the cursor is closed on exit.

        The enclosing `get_connection` context then commits and closes the
        connection itself.
        """
        with self.get_connection() as connection:
            cursor = connection.cursor()
            try:
                yield cursor
            finally:
                cursor.close()
78
+
79
+
80
class PostgresIndexerConfig(SQLIndexerConfig):
    # No Postgres-specific indexer options; everything comes from SQLIndexerConfig.
    ...
82
+
83
+
84
@dataclass
class PostgresIndexer(SQLIndexer):
    # Binds the generic SQL indexer to the Postgres config types and connector type.
    connection_config: PostgresConnectionConfig
    index_config: PostgresIndexerConfig
    connector_type: str = CONNECTOR_TYPE
89
+
90
+
91
class PostgresDownloaderConfig(SQLDownloaderConfig):
    # No Postgres-specific downloader options; everything comes from SQLDownloaderConfig.
    ...
93
+
94
+
95
@dataclass
class PostgresDownloader(SQLDownloader):
    # Downloader that fetches one batch of rows from Postgres by id.
    connection_config: PostgresConnectionConfig
    download_config: PostgresDownloaderConfig
    connector_type: str = CONNECTOR_TYPE

    @requires_dependencies(["psycopg2"], extras="postgres")
    def query_db(self, file_data: SqlBatchFileData) -> tuple[list[tuple], list[str]]:
        """Fetch the rows whose id column matches the batch items of `file_data`.

        Identifiers (table, id column, selected fields) are quoted through
        `psycopg2.sql`; the id values are bound as a single tuple parameter.

        Returns:
            A `(rows, columns)` pair: the fetched rows and the column names
            taken from the cursor description.
        """
        from psycopg2 import sql

        table_name = file_data.additional_metadata.table_name
        id_column = file_data.additional_metadata.id_column
        ids = tuple(item.identifier for item in file_data.batch_items)

        with self.connection_config.get_cursor() as cursor:
            # Select only the configured fields, or every column when none are set.
            if self.download_config.fields:
                selected = sql.SQL(",").join(
                    sql.Identifier(name) for name in self.download_config.fields
                )
            else:
                selected = sql.SQL("*")

            template = sql.SQL("SELECT {fields} FROM {table_name} WHERE {id_column} IN %s")
            query = template.format(
                fields=selected,
                table_name=sql.Identifier(table_name),
                id_column=sql.Identifier(id_column),
            )
            params = (ids,)
            logger.debug(f"running query: {cursor.mogrify(query, params)}")
            cursor.execute(query, params)
            rows = cursor.fetchall()
            columns = [description[0] for description in cursor.description]
            return rows, columns
126
+
127
+
128
class PostgresUploadStagerConfig(SQLUploadStagerConfig):
    # No Postgres-specific stager options; everything comes from SQLUploadStagerConfig.
    ...
130
+
131
+
132
class PostgresUploadStager(SQLUploadStager):
    # Re-annotates the stager config with the Postgres-specific config type.
    upload_stager_config: PostgresUploadStagerConfig
134
+
135
+
136
class PostgresUploaderConfig(SQLUploaderConfig):
    # No Postgres-specific uploader options; everything comes from SQLUploaderConfig.
    ...
138
+
139
+
140
@dataclass
class PostgresUploader(SQLUploader):
    # Binds the generic SQL uploader to the Postgres config types.
    upload_config: PostgresUploaderConfig = field(default_factory=PostgresUploaderConfig)
    connection_config: PostgresConnectionConfig
    connector_type: str = CONNECTOR_TYPE
    # psycopg2 uses the "%s" placeholder for bound parameters.
    values_delimiter: str = "%s"
146
+
147
+
148
# Registry entry describing Postgres as a source (indexer + downloader).
postgres_source_entry = SourceRegistryEntry(
    indexer=PostgresIndexer,
    indexer_config=PostgresIndexerConfig,
    downloader=PostgresDownloader,
    downloader_config=PostgresDownloaderConfig,
    connection_config=PostgresConnectionConfig,
)
155
+
156
# Registry entry describing Postgres as a destination (stager + uploader).
postgres_destination_entry = DestinationRegistryEntry(
    upload_stager=PostgresUploadStager,
    upload_stager_config=PostgresUploadStagerConfig,
    uploader=PostgresUploader,
    uploader_config=PostgresUploaderConfig,
    connection_config=PostgresConnectionConfig,
)
@@ -0,0 +1,166 @@
1
+ import json
2
+ from contextlib import contextmanager
3
+ from dataclasses import dataclass, field
4
+ from typing import TYPE_CHECKING, Any, Generator, Optional
5
+
6
+ from pydantic import Field, Secret
7
+
8
+ from unstructured_ingest.v2.logger import logger
9
+ from unstructured_ingest.v2.processes.connector_registry import (
10
+ DestinationRegistryEntry,
11
+ SourceRegistryEntry,
12
+ )
13
+ from unstructured_ingest.v2.processes.connectors.sql.sql import (
14
+ _DATE_COLUMNS,
15
+ SQLAccessConfig,
16
+ SqlBatchFileData,
17
+ SQLConnectionConfig,
18
+ SQLDownloader,
19
+ SQLDownloaderConfig,
20
+ SQLIndexer,
21
+ SQLIndexerConfig,
22
+ SQLUploader,
23
+ SQLUploaderConfig,
24
+ SQLUploadStager,
25
+ SQLUploadStagerConfig,
26
+ parse_date_string,
27
+ )
28
+
29
+ if TYPE_CHECKING:
30
+ from singlestoredb.connection import Connection as SingleStoreConnection
31
+ from singlestoredb.connection import Cursor as SingleStoreCursor
32
+
33
+ CONNECTOR_TYPE = "singlestore"
34
+
35
+
36
class SingleStoreAccessConfig(SQLAccessConfig):
    # Sensitive SingleStore settings; SingleStoreConnectionConfig stores this
    # model wrapped in a pydantic `Secret`.
    password: Optional[str] = Field(description="SingleStore password", default=None)
38
+
39
+
40
class SingleStoreConnectionConfig(SQLConnectionConfig):
    # Connection settings for a SingleStore database, plus context managers
    # that hand out a singlestoredb connection / cursor and clean up after use.
    access_config: Secret[SingleStoreAccessConfig]
    host: Optional[str] = Field(default=None, description="SingleStore host")
    port: Optional[int] = Field(default=None, description="SingleStore port")
    user: Optional[str] = Field(default=None, description="SingleStore user")
    database: Optional[str] = Field(default=None, description="SingleStore database")

    @contextmanager
    def get_connection(self) -> Generator["SingleStoreConnection", None, None]:
        """Yield a singlestoredb connection built from this config.

        On exit the connection is committed and then closed. The close is
        guaranteed even when the commit itself raises, so a failed commit
        can no longer leak the connection.
        """
        import singlestoredb as s2

        connection = s2.connect(
            host=self.host,
            port=self.port,
            database=self.database,
            user=self.user,
            password=self.access_config.get_secret_value().password,
        )
        try:
            yield connection
        finally:
            # Commit as before, but never let a failing commit skip close().
            try:
                connection.commit()
            finally:
                connection.close()

    @contextmanager
    def get_cursor(self) -> Generator["SingleStoreCursor", None, None]:
        """Yield a cursor on a fresh connection; the cursor is closed on exit."""
        with self.get_connection() as connection:
            with connection.cursor() as cursor:
                try:
                    yield cursor
                finally:
                    # NOTE(review): presumably redundant with the `with` above,
                    # which should already close the cursor -- kept as-is.
                    cursor.close()
72
+
73
+
74
class SingleStoreIndexerConfig(SQLIndexerConfig):
    # No SingleStore-specific indexer options; everything comes from SQLIndexerConfig.
    ...
76
+
77
+
78
@dataclass
class SingleStoreIndexer(SQLIndexer):
    # Binds the generic SQL indexer to the SingleStore config types and connector type.
    connection_config: SingleStoreConnectionConfig
    index_config: SingleStoreIndexerConfig
    connector_type: str = CONNECTOR_TYPE
83
+
84
+
85
class SingleStoreDownloaderConfig(SQLDownloaderConfig):
    # No SingleStore-specific downloader options; everything comes from SQLDownloaderConfig.
    ...
87
+
88
+
89
@dataclass
class SingleStoreDownloader(SQLDownloader):
    # Downloader that fetches one batch of rows from SingleStore by id.
    connection_config: SingleStoreConnectionConfig
    download_config: SingleStoreDownloaderConfig
    connector_type: str = CONNECTOR_TYPE
    # Placeholder used for the bound id tuple in the generated query.
    values_delimiter: str = "%s"

    def query_db(self, file_data: SqlBatchFileData) -> tuple[list[tuple], list[str]]:
        """Fetch the rows whose id column matches the batch items of `file_data`.

        The cursor is obtained through `connection_config.get_cursor()` so it
        is closed when the query finishes; previously it was created from a
        raw connection and never closed.

        Returns:
            A `(rows, columns)` pair: the fetched rows and the column names
            taken from the cursor description.
        """
        table_name = file_data.additional_metadata.table_name
        id_column = file_data.additional_metadata.id_column
        ids = tuple(item.identifier for item in file_data.batch_items)
        with self.connection_config.get_cursor() as cursor:
            # Select only the configured fields, or every column when none are set.
            fields = ",".join(self.download_config.fields) if self.download_config.fields else "*"
            # NOTE(review): table/column names are interpolated as plain text;
            # they are assumed to come from trusted connector configuration.
            query = (
                f"SELECT {fields} FROM {table_name} WHERE {id_column} IN {self.values_delimiter}"
            )
            logger.debug(f"running query: {query}\nwith values: {(ids,)}")
            cursor.execute(query, (ids,))
            rows = cursor.fetchall()
            columns = [col[0] for col in cursor.description]
            return rows, columns
111
+
112
+
113
class SingleStoreUploadStagerConfig(SQLUploadStagerConfig):
    # No SingleStore-specific stager options; everything comes from SQLUploadStagerConfig.
    ...
115
+
116
+
117
class SingleStoreUploadStager(SQLUploadStager):
    # Re-annotates the stager config with the SingleStore-specific config type.
    upload_stager_config: SingleStoreUploadStagerConfig
119
+
120
+
121
class SingleStoreUploaderConfig(SQLUploaderConfig):
    # No SingleStore-specific uploader options; everything comes from SQLUploaderConfig.
    ...
123
+
124
+
125
@dataclass
class SingleStoreUploader(SQLUploader):
    # Binds the generic SQL uploader to the SingleStore config types and
    # overrides row preparation.
    upload_config: SingleStoreUploaderConfig = field(default_factory=SingleStoreUploaderConfig)
    connection_config: SingleStoreConnectionConfig
    values_delimiter: str = "%s"
    connector_type: str = CONNECTOR_TYPE

    def prepare_data(
        self, columns: list[str], data: tuple[tuple[Any, ...], ...]
    ) -> list[tuple[Any, ...]]:
        """Convert raw row tuples into values ready for insertion.

        Lists and dicts are serialized to JSON strings. Non-null values in
        columns listed in `_DATE_COLUMNS` are passed through
        `parse_date_string`. Everything else is forwarded unchanged.

        Args:
            columns: Column names, positionally aligned with each row.
            data: Rows to convert.

        Returns:
            One tuple per input row, in the same order.
        """
        prepared_rows = []
        for record in data:
            converted = []
            for column_name, value in zip(columns, record):
                if isinstance(value, (list, dict)):
                    value = json.dumps(value)
                # Only non-null values in date columns need parsing.
                if column_name not in _DATE_COLUMNS or value is None:
                    converted.append(value)
                else:
                    converted.append(parse_date_string(value))
            prepared_rows.append(tuple(converted))
        return prepared_rows
150
+
151
+
152
# Registry entry describing SingleStore as a source (indexer + downloader).
singlestore_source_entry = SourceRegistryEntry(
    indexer=SingleStoreIndexer,
    indexer_config=SingleStoreIndexerConfig,
    downloader=SingleStoreDownloader,
    downloader_config=SingleStoreDownloaderConfig,
    connection_config=SingleStoreConnectionConfig,
)
159
+
160
# Registry entry describing SingleStore as a destination (stager + uploader).
singlestore_destination_entry = DestinationRegistryEntry(
    upload_stager=SingleStoreUploadStager,
    upload_stager_config=SingleStoreUploadStagerConfig,
    uploader=SingleStoreUploader,
    uploader_config=SingleStoreUploaderConfig,
    connection_config=SingleStoreConnectionConfig,
)
@@ -0,0 +1,210 @@
1
+ from contextlib import contextmanager
2
+ from dataclasses import dataclass, field
3
+ from typing import TYPE_CHECKING, Generator, Optional
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+ from pydantic import Field, Secret
8
+
9
+ from unstructured_ingest.utils.data_prep import split_dataframe
10
+ from unstructured_ingest.utils.dep_check import requires_dependencies
11
+ from unstructured_ingest.v2.interfaces.file_data import FileData
12
+ from unstructured_ingest.v2.logger import logger
13
+ from unstructured_ingest.v2.processes.connector_registry import (
14
+ DestinationRegistryEntry,
15
+ SourceRegistryEntry,
16
+ )
17
+ from unstructured_ingest.v2.processes.connectors.sql.sql import (
18
+ SQLAccessConfig,
19
+ SqlBatchFileData,
20
+ SQLConnectionConfig,
21
+ SQLDownloader,
22
+ SQLDownloaderConfig,
23
+ SQLIndexer,
24
+ SQLIndexerConfig,
25
+ SQLUploader,
26
+ SQLUploaderConfig,
27
+ SQLUploadStager,
28
+ SQLUploadStagerConfig,
29
+ )
30
+
31
+ if TYPE_CHECKING:
32
+ from snowflake.connector import SnowflakeConnection
33
+ from snowflake.connector.cursor import SnowflakeCursor
34
+
35
+ CONNECTOR_TYPE = "snowflake"
36
+
37
+
38
class SnowflakeAccessConfig(SQLAccessConfig):
    # Sensitive Snowflake settings; SnowflakeConnectionConfig stores this
    # model wrapped in a pydantic `Secret`.
    password: Optional[str] = Field(description="DB password", default=None)
40
+
41
+
42
class SnowflakeConnectionConfig(SQLConnectionConfig):
    # Connection settings for Snowflake, plus context managers that hand out
    # a snowflake-connector connection / cursor and clean up after use.
    access_config: Secret[SnowflakeAccessConfig] = Field(
        default=SnowflakeAccessConfig(), validate_default=True
    )
    account: str = Field(
        default=None,
        description="Your account identifier. The account identifier "
        "does not include the snowflakecomputing.com suffix.",
    )
    user: Optional[str] = Field(default=None, description="DB username")
    host: Optional[str] = Field(default=None, description="DB host")
    port: Optional[int] = Field(default=443, description="DB host connection port")
    database: str = Field(
        default=None,
        description="Database name.",
    )
    # Stored as `db_schema` but populated through the "schema" alias.
    db_schema: str = Field(default=None, description="Database schema.", alias="schema")
    role: str = Field(
        default=None,
        description="Database role.",
    )
    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)

    @contextmanager
    # The actual snowflake module package name is: snowflake-connector-python
    @requires_dependencies(["snowflake"], extras="snowflake")
    def get_connection(self) -> Generator["SnowflakeConnection", None, None]:
        """Yield a Snowflake connection built from this config.

        The model fields are dumped into keyword arguments for
        `snowflake.connector.connect`, with `db_schema` renamed to `schema`,
        the secret wrapper replaced by the plain password, and `None` values
        dropped. On exit the connection is committed and then closed; the
        close is guaranteed even when the commit itself raises.
        """
        # https://docs.snowflake.com/en/developer-guide/python-connector/python-connector-api#label-snowflake-connector-methods-connect
        from snowflake.connector import connect

        connect_kwargs = self.model_dump()
        connect_kwargs["schema"] = connect_kwargs.pop("db_schema")
        # The field is named `access_config`; the previous key "access_configs"
        # never matched, so the Secret wrapper was forwarded to connect().
        connect_kwargs.pop("access_config", None)
        connect_kwargs["password"] = self.access_config.get_secret_value().password
        # https://peps.python.org/pep-0249/#paramstyle
        connect_kwargs["paramstyle"] = "qmark"
        # NOTE(review): `connector_type` is still forwarded to connect() as
        # before -- confirm whether that is intended.
        # remove anything that is none
        active_kwargs = {k: v for k, v in connect_kwargs.items() if v is not None}
        connection = connect(**active_kwargs)
        try:
            yield connection
        finally:
            # Commit as before, but never let a failing commit skip close().
            try:
                connection.commit()
            finally:
                connection.close()

    @contextmanager
    def get_cursor(self) -> Generator["SnowflakeCursor", None, None]:
        """Yield a cursor on a fresh connection; the cursor is closed on exit."""
        with self.get_connection() as connection:
            cursor = connection.cursor()
            try:
                yield cursor
            finally:
                cursor.close()
95
+
96
+
97
class SnowflakeIndexerConfig(SQLIndexerConfig):
    # No Snowflake-specific indexer options; everything comes from SQLIndexerConfig.
    ...
99
+
100
+
101
@dataclass
class SnowflakeIndexer(SQLIndexer):
    # Binds the generic SQL indexer to the Snowflake config types and connector type.
    connection_config: SnowflakeConnectionConfig
    index_config: SnowflakeIndexerConfig
    connector_type: str = CONNECTOR_TYPE
106
+
107
+
108
class SnowflakeDownloaderConfig(SQLDownloaderConfig):
    # No Snowflake-specific downloader options; everything comes from SQLDownloaderConfig.
    ...
110
+
111
+
112
@dataclass
class SnowflakeDownloader(SQLDownloader):
    # Downloader that fetches one batch of rows from Snowflake by id.
    connection_config: SnowflakeConnectionConfig
    download_config: SnowflakeDownloaderConfig
    connector_type: str = CONNECTOR_TYPE
    # "qmark" paramstyle placeholder, matching the connection's paramstyle.
    values_delimiter: str = "?"

    # The actual snowflake module package name is: snowflake-connector-python
    @requires_dependencies(["snowflake"], extras="snowflake")
    def query_db(self, file_data: SqlBatchFileData) -> tuple[list[tuple], list[str]]:
        """Fetch the rows whose id column matches the batch items of `file_data`.

        One placeholder is emitted per id and the ids are bound positionally.

        Returns:
            A `(rows, columns)` pair: the fetched rows (dict rows are
            flattened to tuples of their values) and the column names taken
            from the cursor description.
        """
        table_name = file_data.additional_metadata.table_name
        id_column = file_data.additional_metadata.id_column
        ids = [item.identifier for item in file_data.batch_items]

        with self.connection_config.get_cursor() as cursor:
            # Select only the configured fields, or every column when none are set.
            if self.download_config.fields:
                selected = ",".join(self.download_config.fields)
            else:
                selected = "*"
            placeholders = ",".join(self.values_delimiter for _ in ids)
            query = f"SELECT {selected} FROM {table_name} WHERE {id_column} IN ({placeholders})"
            logger.debug(f"running query: {query}\nwith values: {ids}")
            cursor.execute(query, ids)
            rows = []
            for fetched in cursor.fetchall():
                # Dict rows are normalised to plain tuples of their values.
                rows.append(tuple(fetched.values()) if isinstance(fetched, dict) else fetched)
            columns = [description[0] for description in cursor.description]
            return rows, columns
142
+
143
+
144
class SnowflakeUploadStagerConfig(SQLUploadStagerConfig):
    # No Snowflake-specific stager options; everything comes from SQLUploadStagerConfig.
    ...
146
+
147
+
148
class SnowflakeUploadStager(SQLUploadStager):
    # Re-annotates the stager config with the Snowflake-specific config type.
    upload_stager_config: SnowflakeUploadStagerConfig
150
+
151
+
152
class SnowflakeUploaderConfig(SQLUploaderConfig):
    # No Snowflake-specific uploader options; everything comes from SQLUploaderConfig.
    ...
154
+
155
+
156
@dataclass
class SnowflakeUploader(SQLUploader):
    # Binds the generic SQL uploader to the Snowflake config types and
    # overrides the dataframe upload to insert row by row.
    upload_config: SnowflakeUploaderConfig = field(default_factory=SnowflakeUploaderConfig)
    connection_config: SnowflakeConnectionConfig
    connector_type: str = CONNECTOR_TYPE
    # "qmark" paramstyle placeholder, matching the connection's paramstyle.
    values_delimiter: str = "?"

    def upload_dataframe(self, df: pd.DataFrame, file_data: FileData) -> None:
        """Insert the rows of `df` into the configured destination table.

        Existing rows for this record are deleted first when `can_delete()`
        allows it. NaN values are replaced with `None` in place, the frame is
        passed through `_fit_to_schema`, and the rows are then inserted one
        statement at a time, using a fresh cursor per batch.

        Args:
            df: Rows to upload; mutated in place.
            file_data: Identifies the record whose previous rows are deleted.
        """
        if not self.can_delete():
            logger.warning(
                f"table doesn't contain expected "
                f"record id column "
                f"{self.upload_config.record_id_key}, skipping delete"
            )
        else:
            self.delete_by_record_id(file_data=file_data)

        df.replace({np.nan: None}, inplace=True)
        self._fit_to_schema(df=df, columns=self.get_table_columns())

        columns = list(df.columns)
        column_list = ",".join(columns)
        placeholders = ",".join(self.values_delimiter for _ in columns)
        table_name = self.upload_config.table_name
        stmt = f"INSERT INTO {table_name} ({column_list}) VALUES({placeholders})"
        logger.info(
            f"writing a total of {len(df)} elements via"
            f" document batches to destination"
            f" table named {self.upload_config.table_name}"
            f" with batch size {self.upload_config.batch_size}"
        )
        for batch in split_dataframe(df=df, chunk_size=self.upload_config.batch_size):
            with self.connection_config.get_cursor() as cursor:
                raw_rows = tuple(batch.itertuples(index=False, name=None))
                prepared = self.prepare_data(columns, raw_rows)
                # TODO: executemany break on 'Binding data in type (list) is not supported'
                for row_values in prepared:
                    logger.debug(f"running query: {stmt}\nwith values: {row_values}")
                    cursor.execute(stmt, row_values)
194
+
195
+
196
# Registry entry describing Snowflake as a source (indexer + downloader).
snowflake_source_entry = SourceRegistryEntry(
    indexer=SnowflakeIndexer,
    indexer_config=SnowflakeIndexerConfig,
    downloader=SnowflakeDownloader,
    downloader_config=SnowflakeDownloaderConfig,
    connection_config=SnowflakeConnectionConfig,
)
203
+
204
# Registry entry describing Snowflake as a destination (stager + uploader).
snowflake_destination_entry = DestinationRegistryEntry(
    upload_stager=SnowflakeUploadStager,
    upload_stager_config=SnowflakeUploadStagerConfig,
    uploader=SnowflakeUploader,
    uploader_config=SnowflakeUploaderConfig,
    connection_config=SnowflakeConnectionConfig,
)