unstructured-ingest 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (356) hide show
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/__init__.py +14 -0
  4. unstructured_ingest/cli/base/__init__.py +0 -0
  5. unstructured_ingest/cli/base/cmd.py +19 -0
  6. unstructured_ingest/cli/base/dest.py +87 -0
  7. unstructured_ingest/cli/base/src.py +57 -0
  8. unstructured_ingest/cli/cli.py +32 -0
  9. unstructured_ingest/cli/cmd_factory.py +12 -0
  10. unstructured_ingest/cli/cmds/__init__.py +145 -0
  11. unstructured_ingest/cli/cmds/airtable.py +69 -0
  12. unstructured_ingest/cli/cmds/astra.py +99 -0
  13. unstructured_ingest/cli/cmds/azure_cognitive_search.py +65 -0
  14. unstructured_ingest/cli/cmds/biomed.py +52 -0
  15. unstructured_ingest/cli/cmds/chroma.py +104 -0
  16. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  17. unstructured_ingest/cli/cmds/confluence.py +69 -0
  18. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  19. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  20. unstructured_ingest/cli/cmds/discord.py +47 -0
  21. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  22. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  23. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  24. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  25. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  26. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  27. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  28. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  29. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  30. unstructured_ingest/cli/cmds/github.py +54 -0
  31. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  32. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  33. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  34. unstructured_ingest/cli/cmds/jira.py +71 -0
  35. unstructured_ingest/cli/cmds/kafka.py +102 -0
  36. unstructured_ingest/cli/cmds/local.py +43 -0
  37. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  38. unstructured_ingest/cli/cmds/notion.py +48 -0
  39. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  40. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  41. unstructured_ingest/cli/cmds/outlook.py +67 -0
  42. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  43. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  44. unstructured_ingest/cli/cmds/reddit.py +67 -0
  45. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  46. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  47. unstructured_ingest/cli/cmds/slack.py +56 -0
  48. unstructured_ingest/cli/cmds/sql.py +66 -0
  49. unstructured_ingest/cli/cmds/vectara.py +66 -0
  50. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  51. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  52. unstructured_ingest/cli/common.py +7 -0
  53. unstructured_ingest/cli/interfaces.py +656 -0
  54. unstructured_ingest/cli/utils.py +205 -0
  55. unstructured_ingest/connector/__init__.py +0 -0
  56. unstructured_ingest/connector/airtable.py +309 -0
  57. unstructured_ingest/connector/astra.py +237 -0
  58. unstructured_ingest/connector/azure_cognitive_search.py +144 -0
  59. unstructured_ingest/connector/biomed.py +313 -0
  60. unstructured_ingest/connector/chroma.py +158 -0
  61. unstructured_ingest/connector/clarifai.py +122 -0
  62. unstructured_ingest/connector/confluence.py +285 -0
  63. unstructured_ingest/connector/databricks_volumes.py +137 -0
  64. unstructured_ingest/connector/delta_table.py +203 -0
  65. unstructured_ingest/connector/discord.py +180 -0
  66. unstructured_ingest/connector/elasticsearch.py +396 -0
  67. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  68. unstructured_ingest/connector/fsspec/azure.py +78 -0
  69. unstructured_ingest/connector/fsspec/box.py +109 -0
  70. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  71. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  72. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  73. unstructured_ingest/connector/fsspec/s3.py +62 -0
  74. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  75. unstructured_ingest/connector/git.py +124 -0
  76. unstructured_ingest/connector/github.py +173 -0
  77. unstructured_ingest/connector/gitlab.py +142 -0
  78. unstructured_ingest/connector/google_drive.py +349 -0
  79. unstructured_ingest/connector/hubspot.py +278 -0
  80. unstructured_ingest/connector/jira.py +469 -0
  81. unstructured_ingest/connector/kafka.py +294 -0
  82. unstructured_ingest/connector/local.py +139 -0
  83. unstructured_ingest/connector/mongodb.py +285 -0
  84. unstructured_ingest/connector/notion/__init__.py +0 -0
  85. unstructured_ingest/connector/notion/client.py +233 -0
  86. unstructured_ingest/connector/notion/connector.py +468 -0
  87. unstructured_ingest/connector/notion/helpers.py +584 -0
  88. unstructured_ingest/connector/notion/interfaces.py +32 -0
  89. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  90. unstructured_ingest/connector/notion/types/block.py +95 -0
  91. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  92. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  93. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  94. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  95. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  96. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  97. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  98. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  99. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  100. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  101. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  102. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  103. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  104. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  105. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  106. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  107. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  108. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  109. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  110. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  111. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  112. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  113. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  114. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  115. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  116. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  117. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  118. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  119. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  120. unstructured_ingest/connector/notion/types/database.py +72 -0
  121. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  122. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  123. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  124. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  125. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  126. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  127. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  128. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  129. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  130. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  131. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  132. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  133. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  134. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  135. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  136. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  137. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  138. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  139. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  140. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  141. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  142. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  143. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  144. unstructured_ingest/connector/notion/types/date.py +26 -0
  145. unstructured_ingest/connector/notion/types/file.py +51 -0
  146. unstructured_ingest/connector/notion/types/page.py +44 -0
  147. unstructured_ingest/connector/notion/types/parent.py +66 -0
  148. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  149. unstructured_ingest/connector/notion/types/user.py +76 -0
  150. unstructured_ingest/connector/onedrive.py +232 -0
  151. unstructured_ingest/connector/opensearch.py +218 -0
  152. unstructured_ingest/connector/outlook.py +285 -0
  153. unstructured_ingest/connector/pinecone.py +140 -0
  154. unstructured_ingest/connector/qdrant.py +144 -0
  155. unstructured_ingest/connector/reddit.py +166 -0
  156. unstructured_ingest/connector/registry.py +109 -0
  157. unstructured_ingest/connector/salesforce.py +301 -0
  158. unstructured_ingest/connector/sharepoint.py +573 -0
  159. unstructured_ingest/connector/slack.py +224 -0
  160. unstructured_ingest/connector/sql.py +199 -0
  161. unstructured_ingest/connector/vectara.py +248 -0
  162. unstructured_ingest/connector/weaviate.py +190 -0
  163. unstructured_ingest/connector/wikipedia.py +208 -0
  164. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  165. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  166. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  167. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  168. unstructured_ingest/error.py +49 -0
  169. unstructured_ingest/evaluate.py +338 -0
  170. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  171. unstructured_ingest/ingest_backoff/_common.py +102 -0
  172. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  173. unstructured_ingest/interfaces.py +838 -0
  174. unstructured_ingest/logger.py +130 -0
  175. unstructured_ingest/main.py +11 -0
  176. unstructured_ingest/pipeline/__init__.py +22 -0
  177. unstructured_ingest/pipeline/copy.py +19 -0
  178. unstructured_ingest/pipeline/doc_factory.py +12 -0
  179. unstructured_ingest/pipeline/interfaces.py +265 -0
  180. unstructured_ingest/pipeline/partition.py +60 -0
  181. unstructured_ingest/pipeline/permissions.py +12 -0
  182. unstructured_ingest/pipeline/pipeline.py +117 -0
  183. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  184. unstructured_ingest/pipeline/reformat/chunking.py +130 -0
  185. unstructured_ingest/pipeline/reformat/embedding.py +66 -0
  186. unstructured_ingest/pipeline/source.py +77 -0
  187. unstructured_ingest/pipeline/utils.py +6 -0
  188. unstructured_ingest/pipeline/write.py +18 -0
  189. unstructured_ingest/processor.py +93 -0
  190. unstructured_ingest/runner/__init__.py +104 -0
  191. unstructured_ingest/runner/airtable.py +35 -0
  192. unstructured_ingest/runner/astra.py +34 -0
  193. unstructured_ingest/runner/base_runner.py +89 -0
  194. unstructured_ingest/runner/biomed.py +45 -0
  195. unstructured_ingest/runner/confluence.py +35 -0
  196. unstructured_ingest/runner/delta_table.py +34 -0
  197. unstructured_ingest/runner/discord.py +35 -0
  198. unstructured_ingest/runner/elasticsearch.py +40 -0
  199. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  200. unstructured_ingest/runner/fsspec/azure.py +30 -0
  201. unstructured_ingest/runner/fsspec/box.py +28 -0
  202. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  203. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  204. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  205. unstructured_ingest/runner/fsspec/s3.py +28 -0
  206. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  207. unstructured_ingest/runner/github.py +37 -0
  208. unstructured_ingest/runner/gitlab.py +37 -0
  209. unstructured_ingest/runner/google_drive.py +35 -0
  210. unstructured_ingest/runner/hubspot.py +35 -0
  211. unstructured_ingest/runner/jira.py +35 -0
  212. unstructured_ingest/runner/kafka.py +34 -0
  213. unstructured_ingest/runner/local.py +23 -0
  214. unstructured_ingest/runner/mongodb.py +34 -0
  215. unstructured_ingest/runner/notion.py +61 -0
  216. unstructured_ingest/runner/onedrive.py +35 -0
  217. unstructured_ingest/runner/opensearch.py +40 -0
  218. unstructured_ingest/runner/outlook.py +33 -0
  219. unstructured_ingest/runner/reddit.py +35 -0
  220. unstructured_ingest/runner/salesforce.py +33 -0
  221. unstructured_ingest/runner/sharepoint.py +35 -0
  222. unstructured_ingest/runner/slack.py +33 -0
  223. unstructured_ingest/runner/utils.py +47 -0
  224. unstructured_ingest/runner/wikipedia.py +35 -0
  225. unstructured_ingest/runner/writers/__init__.py +48 -0
  226. unstructured_ingest/runner/writers/astra.py +22 -0
  227. unstructured_ingest/runner/writers/azure_cognitive_search.py +24 -0
  228. unstructured_ingest/runner/writers/base_writer.py +26 -0
  229. unstructured_ingest/runner/writers/chroma.py +22 -0
  230. unstructured_ingest/runner/writers/clarifai.py +19 -0
  231. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  232. unstructured_ingest/runner/writers/delta_table.py +24 -0
  233. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  234. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  235. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  236. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  237. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  238. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  239. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  240. unstructured_ingest/runner/writers/kafka.py +21 -0
  241. unstructured_ingest/runner/writers/mongodb.py +21 -0
  242. unstructured_ingest/runner/writers/opensearch.py +26 -0
  243. unstructured_ingest/runner/writers/pinecone.py +21 -0
  244. unstructured_ingest/runner/writers/qdrant.py +19 -0
  245. unstructured_ingest/runner/writers/sql.py +22 -0
  246. unstructured_ingest/runner/writers/vectara.py +22 -0
  247. unstructured_ingest/runner/writers/weaviate.py +21 -0
  248. unstructured_ingest/utils/__init__.py +0 -0
  249. unstructured_ingest/utils/compression.py +117 -0
  250. unstructured_ingest/utils/data_prep.py +112 -0
  251. unstructured_ingest/utils/dep_check.py +66 -0
  252. unstructured_ingest/utils/string_and_date_utils.py +39 -0
  253. unstructured_ingest/utils/table.py +73 -0
  254. unstructured_ingest/v2/__init__.py +1 -0
  255. unstructured_ingest/v2/cli/__init__.py +0 -0
  256. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  257. unstructured_ingest/v2/cli/base/cmd.py +215 -0
  258. unstructured_ingest/v2/cli/base/dest.py +76 -0
  259. unstructured_ingest/v2/cli/base/importer.py +34 -0
  260. unstructured_ingest/v2/cli/base/src.py +70 -0
  261. unstructured_ingest/v2/cli/cli.py +24 -0
  262. unstructured_ingest/v2/cli/cmds/__init__.py +87 -0
  263. unstructured_ingest/v2/cli/cmds/astra.py +85 -0
  264. unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +72 -0
  265. unstructured_ingest/v2/cli/cmds/chroma.py +108 -0
  266. unstructured_ingest/v2/cli/cmds/databricks_volumes.py +161 -0
  267. unstructured_ingest/v2/cli/cmds/elasticsearch.py +159 -0
  268. unstructured_ingest/v2/cli/cmds/fsspec/__init__.py +0 -0
  269. unstructured_ingest/v2/cli/cmds/fsspec/azure.py +84 -0
  270. unstructured_ingest/v2/cli/cmds/fsspec/box.py +58 -0
  271. unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +58 -0
  272. unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +77 -0
  273. unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +81 -0
  274. unstructured_ingest/v2/cli/cmds/fsspec/s3.py +84 -0
  275. unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +80 -0
  276. unstructured_ingest/v2/cli/cmds/google_drive.py +74 -0
  277. unstructured_ingest/v2/cli/cmds/local.py +60 -0
  278. unstructured_ingest/v2/cli/cmds/mongodb.py +62 -0
  279. unstructured_ingest/v2/cli/cmds/onedrive.py +91 -0
  280. unstructured_ingest/v2/cli/cmds/opensearch.py +93 -0
  281. unstructured_ingest/v2/cli/cmds/pinecone.py +62 -0
  282. unstructured_ingest/v2/cli/cmds/salesforce.py +79 -0
  283. unstructured_ingest/v2/cli/cmds/sharepoint.py +112 -0
  284. unstructured_ingest/v2/cli/cmds/singlestore.py +96 -0
  285. unstructured_ingest/v2/cli/cmds/sql.py +84 -0
  286. unstructured_ingest/v2/cli/cmds/weaviate.py +100 -0
  287. unstructured_ingest/v2/cli/configs/__init__.py +6 -0
  288. unstructured_ingest/v2/cli/configs/chunk.py +89 -0
  289. unstructured_ingest/v2/cli/configs/embed.py +74 -0
  290. unstructured_ingest/v2/cli/configs/partition.py +99 -0
  291. unstructured_ingest/v2/cli/configs/processor.py +88 -0
  292. unstructured_ingest/v2/cli/interfaces.py +27 -0
  293. unstructured_ingest/v2/cli/utils.py +240 -0
  294. unstructured_ingest/v2/example.py +37 -0
  295. unstructured_ingest/v2/interfaces/__init__.py +29 -0
  296. unstructured_ingest/v2/interfaces/connector.py +32 -0
  297. unstructured_ingest/v2/interfaces/downloader.py +79 -0
  298. unstructured_ingest/v2/interfaces/file_data.py +49 -0
  299. unstructured_ingest/v2/interfaces/indexer.py +28 -0
  300. unstructured_ingest/v2/interfaces/process.py +20 -0
  301. unstructured_ingest/v2/interfaces/processor.py +48 -0
  302. unstructured_ingest/v2/interfaces/upload_stager.py +48 -0
  303. unstructured_ingest/v2/interfaces/uploader.py +39 -0
  304. unstructured_ingest/v2/logger.py +126 -0
  305. unstructured_ingest/v2/main.py +11 -0
  306. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  307. unstructured_ingest/v2/pipeline/interfaces.py +167 -0
  308. unstructured_ingest/v2/pipeline/pipeline.py +284 -0
  309. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  310. unstructured_ingest/v2/pipeline/steps/chunk.py +85 -0
  311. unstructured_ingest/v2/pipeline/steps/download.py +124 -0
  312. unstructured_ingest/v2/pipeline/steps/embed.py +84 -0
  313. unstructured_ingest/v2/pipeline/steps/index.py +61 -0
  314. unstructured_ingest/v2/pipeline/steps/partition.py +78 -0
  315. unstructured_ingest/v2/pipeline/steps/stage.py +64 -0
  316. unstructured_ingest/v2/pipeline/steps/uncompress.py +68 -0
  317. unstructured_ingest/v2/pipeline/steps/upload.py +73 -0
  318. unstructured_ingest/v2/pipeline/utils.py +15 -0
  319. unstructured_ingest/v2/processes/__init__.py +0 -0
  320. unstructured_ingest/v2/processes/chunker.py +97 -0
  321. unstructured_ingest/v2/processes/connector_registry.py +63 -0
  322. unstructured_ingest/v2/processes/connectors/__init__.py +77 -0
  323. unstructured_ingest/v2/processes/connectors/astra.py +152 -0
  324. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +211 -0
  325. unstructured_ingest/v2/processes/connectors/chroma.py +204 -0
  326. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +96 -0
  327. unstructured_ingest/v2/processes/connectors/elasticsearch.py +401 -0
  328. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  329. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +144 -0
  330. unstructured_ingest/v2/processes/connectors/fsspec/box.py +131 -0
  331. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +130 -0
  332. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +342 -0
  333. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +141 -0
  334. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +164 -0
  335. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +166 -0
  336. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  337. unstructured_ingest/v2/processes/connectors/google_drive.py +335 -0
  338. unstructured_ingest/v2/processes/connectors/local.py +204 -0
  339. unstructured_ingest/v2/processes/connectors/mongodb.py +138 -0
  340. unstructured_ingest/v2/processes/connectors/onedrive.py +216 -0
  341. unstructured_ingest/v2/processes/connectors/opensearch.py +155 -0
  342. unstructured_ingest/v2/processes/connectors/pinecone.py +178 -0
  343. unstructured_ingest/v2/processes/connectors/salesforce.py +293 -0
  344. unstructured_ingest/v2/processes/connectors/sharepoint.py +412 -0
  345. unstructured_ingest/v2/processes/connectors/singlestore.py +160 -0
  346. unstructured_ingest/v2/processes/connectors/sql.py +269 -0
  347. unstructured_ingest/v2/processes/connectors/utils.py +19 -0
  348. unstructured_ingest/v2/processes/connectors/weaviate.py +235 -0
  349. unstructured_ingest/v2/processes/embedder.py +76 -0
  350. unstructured_ingest/v2/processes/partitioner.py +166 -0
  351. unstructured_ingest/v2/processes/uncompress.py +43 -0
  352. unstructured_ingest-0.0.0.dist-info/METADATA +319 -0
  353. unstructured_ingest-0.0.0.dist-info/RECORD +356 -0
  354. unstructured_ingest-0.0.0.dist-info/WHEEL +5 -0
  355. unstructured_ingest-0.0.0.dist-info/entry_points.txt +2 -0
  356. unstructured_ingest-0.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,412 @@
1
+ import json
2
+ from dataclasses import dataclass, field
3
+ from enum import Enum
4
+ from pathlib import Path
5
+ from time import time
6
+ from typing import TYPE_CHECKING, Any, Generator, Optional
7
+ from urllib.parse import quote
8
+
9
+ from unstructured.documents.elements import DataSourceMetadata
10
+
11
+ from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin, enhanced_field
12
+ from unstructured_ingest.error import SourceConnectionNetworkError
13
+ from unstructured_ingest.utils.dep_check import requires_dependencies
14
+ from unstructured_ingest.v2.interfaces import (
15
+ AccessConfig,
16
+ ConnectionConfig,
17
+ Downloader,
18
+ DownloaderConfig,
19
+ DownloadResponse,
20
+ FileData,
21
+ Indexer,
22
+ IndexerConfig,
23
+ SourceIdentifiers,
24
+ download_responses,
25
+ )
26
+ from unstructured_ingest.v2.logger import logger
27
+ from unstructured_ingest.v2.processes.connector_registry import (
28
+ SourceRegistryEntry,
29
+ )
30
+
31
+ from .utils import parse_datetime
32
+
33
+ if TYPE_CHECKING:
34
+ from office365.graph_client import GraphClient
35
+ from office365.onedrive.driveitems.driveItem import DriveItem
36
+ from office365.onedrive.drives.drive import Drive
37
+ from office365.onedrive.permissions.permission import Permission
38
+ from office365.onedrive.sites.site import Site
39
+ from office365.sharepoint.client_context import ClientContext
40
+ from office365.sharepoint.files.file import File
41
+ from office365.sharepoint.folders.folder import Folder
42
+ from office365.sharepoint.publishing.pages.page import SitePage
43
+
44
# Value passed as `connector_type` on the FileData records built in this module.
CONNECTOR_TYPE = "sharepoint"

# NOTE(review): the name says "MB" but the value reads like a byte count
# (512 million); it is not referenced in the visible part of the file — confirm the unit.
MAX_MB_SIZE = 512 * 1_000_000
47
+
48
+ # TODO handle other data types possible from Sharepoint
49
+ # examples: https://github.com/vgrem/Office365-REST-Python-Client/tree/master/examples/sharepoint
50
+
51
+
52
class SharepointContentType(Enum):
    """Kinds of Sharepoint content.

    The member value is what gets written under the ``sharepoint_content_type``
    key of a record's additional metadata.
    """

    # Files found in a document library folder.
    DOCUMENT = "document"
    # Site pages.
    SITEPAGE = "site_page"
    # Not referenced in the visible part of the file.
    LIST = "list"
56
+
57
+
58
@dataclass
class SharepointAccessConfig(AccessConfig):
    """Secret part of the Sharepoint connection settings."""

    # Handed to office365's ClientCredential together with the connection's client id.
    client_cred: str
61
+
62
+
63
@dataclass
class SharepointPermissionsConfig(EnhancedDataClassJsonMixin):
    """Settings used to request a Microsoft Graph token for permissions lookups."""

    # Passed to msal's ConfidentialClientApplication as `client_id`.
    permissions_application_id: str
    # Appended to `authority_url` (after a "/") to form the msal authority.
    permissions_tenant: str
    # Passed to msal as `client_credential`; marked sensitive.
    permissions_client_cred: str = enhanced_field(sensitive=True)
    # Base of the authority URL; excluded from the dataclass repr.
    authority_url: Optional[str] = field(default="https://login.microsoftonline.com", repr=False)
69
+
70
+
71
@dataclass
class SharepointConnectionConfig(ConnectionConfig):
    """Connection settings plus factories for the Sharepoint and Graph clients."""

    client_id: str
    # Passed unchanged to office365's ClientContext.
    site: str
    access_config: SharepointAccessConfig = enhanced_field(sensitive=True)
    # Optional; when absent no permissions client is created.
    permissions_config: Optional[SharepointPermissionsConfig] = None

    @requires_dependencies(["office365"], extras="sharepoint")
    def get_client(self) -> "ClientContext":
        """Build a ClientContext for ``self.site`` using the client id and credential.

        Any exception raised while building the client is logged and re-raised.
        """
        from office365.runtime.auth.client_credential import ClientCredential
        from office365.sharepoint.client_context import ClientContext

        try:
            credential = ClientCredential(self.client_id, self.access_config.client_cred)
            context = ClientContext(self.site).with_credentials(credential)
        except Exception as e:
            logger.error(f"Couldn't set Sharepoint client: {e}")
            raise e
        else:
            return context

    @requires_dependencies(["msal"], extras="sharepoint")
    def get_permissions_token(self):
        """Acquire a Microsoft Graph token with the permissions credentials.

        Returns:
            The token payload returned by msal.

        Raises:
            ValueError: re-raised (after logging) when msal rejects the setup.
            SourceConnectionNetworkError: when the payload contains an "error" key.
        """
        from msal import ConfidentialClientApplication

        try:
            authority = (
                f"{self.permissions_config.authority_url}/"
                f"{self.permissions_config.permissions_tenant}"
            )
            msal_app = ConfidentialClientApplication(
                authority=authority,
                client_id=self.permissions_config.permissions_application_id,
                client_credential=self.permissions_config.permissions_client_cred,
            )
            token = msal_app.acquire_token_for_client(
                scopes=["https://graph.microsoft.com/.default"]
            )
        except ValueError as exc:
            logger.error("Couldn't set up credentials for Sharepoint")
            raise exc
        if "error" not in token:
            return token
        raise SourceConnectionNetworkError(
            "failed to fetch token, {}: {}".format(token["error"], token["error_description"])
        )

    @requires_dependencies(["office365"], extras="sharepoint")
    def get_permissions_client(self) -> Optional["GraphClient"]:
        """Return a GraphClient, or None when no permissions config is set."""
        from office365.graph_client import GraphClient

        if self.permissions_config is not None:
            # The bound method itself (not a token) is handed over; presumably the
            # client calls it when it needs a token — confirm against office365.
            return GraphClient(self.get_permissions_token)
        return None
121
+
122
+
123
@dataclass
class SharepointIndexerConfig(IndexerConfig):
    """Options controlling what the Sharepoint indexer lists."""

    # Server-relative folder to index. When unset, the indexer uses the default
    # document library's root folder and stores that folder's name here.
    path: Optional[str] = None
    # Presumably forwarded to the indexer's recursive file listing — the call site
    # is outside the visible code; confirm.
    recursive: bool = False
    # The three flags below are not read in the visible code; the names suggest
    # they skip files, site pages and lists respectively — confirm.
    omit_files: bool = False
    omit_pages: bool = False
    omit_lists: bool = False
130
+
131
+
132
+ @dataclass
133
+ class SharepointIndexer(Indexer):
134
+ connection_config: SharepointConnectionConfig
135
+ index_config: SharepointIndexerConfig = field(default_factory=lambda: SharepointIndexerConfig())
136
+
137
def list_files(self, folder: "Folder", recursive: bool = False) -> list["File"]:
    """List the files in ``folder``, optionally descending into sub-folders.

    Args:
        folder: Sharepoint folder to query; its ``Files`` (and, when recursing,
            ``Folders``) are loaded with one query per folder.
        recursive: when True, files of nested folders are appended after the
            folder's own files.

    Returns:
        A plain list in both modes. The non-recursive branch previously returned
        the raw ``folder.files`` collection, which did not match the annotation
        or the recursive branch.
    """
    if not recursive:
        folder.expand(["Files"]).get().execute_query()
        return list(folder.files)

    folder.expand(["Files", "Folders"]).get().execute_query()
    files: list["File"] = list(folder.files)
    # Snapshot the sub-folders before recursing, as each recursion issues new queries.
    for subfolder in list(folder.folders):
        # Folders whose server-relative URL contains "/Forms" are skipped.
        if "/Forms" in subfolder.serverRelativeUrl:
            continue
        files.extend(self.list_files(subfolder, recursive))
    return files
150
+
151
def get_properties(self, raw_properties: dict) -> dict:
    """Return the truthy, JSON-serializable entries of ``raw_properties``.

    Entries with a falsy value are dropped first; of the rest, any value that
    ``json.dumps`` rejects with ``TypeError`` is dropped as well. Key order is
    preserved.
    """
    serializable = {}
    for key, value in raw_properties.items():
        if not value:
            continue
        try:
            json.dumps(value)
        except TypeError:
            # Not representable as JSON: leave it out of the metadata.
            continue
        serializable[key] = value
    return serializable
161
+
162
def list_pages(self, client: "ClientContext") -> list["SitePage"]:
    """Run the site-pages query on ``client`` and return its result."""
    return client.site_pages.pages.get().execute_query()
165
+
166
def page_to_file_data(self, site_page: "SitePage") -> FileData:
    """Load a site page's properties and describe the page as a FileData record.

    Args:
        site_page: page whose properties are expanded with one query before
            being read.

    Returns:
        FileData identified by the page's ``UniqueId``, with url, version and
        timestamps in the metadata and the JSON-serializable page properties
        (plus ``sharepoint_content_type``) as additional metadata.
    """
    site_page.expand(site_page.properties.keys()).get().execute_query()
    version = site_page.properties.get("Version", None)
    unique_id = site_page.properties.get("UniqueId", None)
    modified_date = site_page.properties.get("Modified", None)
    url = site_page.properties.get("AbsoluteUrl", None)
    date_modified_dt = parse_datetime(modified_date) if modified_date else None
    # This exact timestamp is treated as "no publication date"; presumably it is
    # Sharepoint's placeholder for an unpublished page — confirm.
    date_created_at = (
        parse_datetime(site_page.first_published)
        if (site_page.first_published and site_page.first_published != "0001-01-01T08:00:00Z")
        else None
    )
    file_path = site_page.get_property("Url", "")
    # Drop one leading slash. startswith() also copes with the "" default,
    # where indexing file_path[0] raised IndexError.
    server_path = file_path[1:] if file_path.startswith("/") else file_path
    # index_config.path defaults to None and str.replace() rejects None;
    # replacing "" leaves the path untouched.
    rel_path = file_path.replace(self.index_config.path or "", "")
    additional_metadata = self.get_properties(raw_properties=site_page.properties)
    additional_metadata["sharepoint_content_type"] = SharepointContentType.SITEPAGE.value
    return FileData(
        identifier=unique_id,
        connector_type=CONNECTOR_TYPE,
        source_identifiers=SourceIdentifiers(
            filename=site_page.file_name,
            fullpath=file_path,
            rel_path=rel_path,
        ),
        metadata=DataSourceMetadata(
            url=url,
            version=version,
            date_modified=str(date_modified_dt.timestamp()) if date_modified_dt else None,
            date_created=str(date_created_at.timestamp()) if date_created_at else None,
            date_processed=str(time()),
            record_locator={
                "server_path": server_path,
            },
        ),
        additional_metadata=additional_metadata,
    )
202
+
203
def file_to_file_data(self, client: "ClientContext", file: "File") -> FileData:
    """Load a file's properties and describe the file as a FileData record.

    Args:
        client: Sharepoint client; its ``base_url`` prefixes the file URL and is
            stored in the record locator.
        file: file whose properties are expanded with one query before being read.

    Returns:
        FileData identified by the file's ``unique_id``, with url,
        "major.minor" version and timestamps in the metadata and the
        JSON-serializable file properties (plus ``sharepoint_content_type``)
        as additional metadata.
    """
    file.expand(file.properties.keys()).get().execute_query()
    # The server-relative URL is percent-encoded before being appended to the base URL.
    absolute_url = f"{client.base_url}{quote(file.serverRelativeUrl)}"
    date_modified_dt = (
        parse_datetime(file.time_last_modified) if file.time_last_modified else None
    )

    date_created_at = parse_datetime(file.time_created) if file.time_created else None
    additional_metadata = self.get_properties(raw_properties=file.properties)
    additional_metadata["sharepoint_content_type"] = SharepointContentType.DOCUMENT.value
    fullpath = str(file.serverRelativeUrl)
    # lstrip() removes every leading slash like the former while-loop did, but
    # does not raise IndexError when nothing is left after removing the path.
    rel_path = fullpath.replace(self.index_config.path, "").lstrip("/")
    return FileData(
        identifier=file.unique_id,
        connector_type=CONNECTOR_TYPE,
        source_identifiers=SourceIdentifiers(
            filename=file.name,
            fullpath=fullpath,
            rel_path=rel_path,
        ),
        metadata=DataSourceMetadata(
            url=absolute_url,
            version=f"{file.major_version}.{file.minor_version}",
            date_modified=str(date_modified_dt.timestamp()) if date_modified_dt else None,
            date_created=str(date_created_at.timestamp()) if date_created_at else None,
            date_processed=str(time()),
            record_locator={"server_path": file.serverRelativeUrl, "site_url": client.base_url},
        ),
        additional_metadata=additional_metadata,
    )
235
+
236
def get_root(self, client: "ClientContext") -> "Folder":
    """Return the folder indexing starts from.

    With a configured ``index_config.path`` the folder at that server-relative
    path is returned without running a query here. Otherwise the default
    document library's root folder is fetched and its name is stored back into
    ``index_config.path``.
    """
    configured_path = self.index_config.path
    if not configured_path:
        library = client.web.default_document_library()
        root = library.root_folder.get().execute_query()
        self.index_config.path = root.name
        return root
    return client.web.get_folder_by_server_relative_path(configured_path)
244
+
245
def get_site_url(self, client: "ClientContext") -> str:
    """Fetch the client's web object and return its ``url`` attribute."""
    return client.web.get().execute_query().url
248
+
249
def get_site(self, permissions_client: "GraphClient", site_url) -> "Site":
    """Look up the site at ``site_url`` through the permissions client."""
    site_query = permissions_client.sites.get_by_url(url=site_url)
    return site_query.execute_query()
251
+
252
def get_permissions_items(self, site: "Site") -> list["DriveItem"]:
    """Collect the root-level children of every drive on ``site`` into one list."""
    # TODO find a way to narrow this search down by name of drive
    collected: list["DriveItem"] = []
    for drive in site.drives.get_all().execute_query():
        collected += drive.root.children.get_all().execute_query()
    return collected
259
+
260
def map_permission(self, permission: "Permission") -> dict:
    """Flatten a permission object into a plain dict.

    Scalar attributes are copied as-is (``roles`` as a list); the remaining
    attributes are converted through their ``to_json()`` method.
    """
    mapped = {
        "id": permission.id,
        "roles": list(permission.roles),
        "share_id": permission.share_id,
        "has_password": permission.has_password,
    }
    # Attributes whose value is exported via to_json(); key order is preserved.
    json_backed = (
        "link",
        "granted_to_identities",
        "granted_to",
        "granted_to_v2",
        "granted_to_identities_v2",
        "invitation",
    )
    for attr_name in json_backed:
        mapped[attr_name] = getattr(permission, attr_name).to_json()
    return mapped
273
+
274
def enrich_permissions_on_files(self, all_file_data: list[FileData], site_url: str) -> None:
    """Attach permission data to each file that maps to exactly one drive item.

    Files are matched to drive items by comparing the file's ``ETag`` additional
    metadata with the drive item's ``etag``. Files without an ETag, without a
    match, or with more than one match are left untouched (the last case is logged
    as a warning).

    Args:
        all_file_data: File records to enrich in place.
        site_url: URL of the site whose drives are searched.
    """
    logger.debug("Enriching permissions on files")
    permission_client = self.connection_config.get_permissions_client()
    if permission_client is None:
        return
    site = self.get_site(permissions_client=permission_client, site_url=site_url)

    # Index the drive items by etag once, rather than rescanning the whole list
    # for every file. Per-etag order matches the original list order.
    items_by_etag: dict = {}
    for item in self.get_permissions_items(site=site):
        items_by_etag.setdefault(item.etag, []).append(item)

    for file_data in all_file_data:
        etag = file_data.additional_metadata.get("ETag")
        if not etag:
            continue
        matching_items = items_by_etag.get(etag, [])
        if not matching_items:
            continue
        if len(matching_items) > 1:
            logger.warning(
                "Found multiple drive items with etag matching {}, skipping: {}".format(
                    etag, ", ".join([i.name for i in matching_items])
                )
            )
            continue
        matching_item = matching_items[0]
        permissions: list["Permission"] = matching_item.permissions.get_all().execute_query()
        file_data.metadata.permissions_data = [
            self.map_permission(permission=permission) for permission in permissions
        ]
301
+
302
@property
def process_permissions(self) -> bool:
    """Whether tenant, client credential and application id are all configured.

    The result is coerced to ``bool`` so the property never hands back the raw
    value of the last operand (for example the client credential) or ``None``.
    """
    permissions_config = self.connection_config.permissions_config
    return bool(
        permissions_config.permissions_tenant
        and permissions_config.permissions_client_cred
        and permissions_config.permissions_application_id
    )
309
+
310
def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
    """Yield ``FileData`` for documents first, then for site pages.

    Documents are skipped when ``index_config.omit_files`` is set and pages when
    ``index_config.omit_pages`` is set. All document records are built (and, when
    permissions processing is enabled, enriched) before the first one is yielded.
    """
    client = self.connection_config.get_client()
    root_folder = self.get_root(client=client)
    logger.debug(f"processing content from path: {self.index_config.path}")

    if not self.index_config.omit_files:
        listed_files = self.list_files(root_folder, recursive=self.index_config.recursive)
        documents = [
            self.file_to_file_data(file=entry, client=client) for entry in listed_files
        ]
        if self.process_permissions:
            site_url = self.get_site_url(client=client)
            self.enrich_permissions_on_files(all_file_data=documents, site_url=site_url)
        yield from documents

    if not self.index_config.omit_pages:
        for site_page in self.list_pages(client=client):
            page_data = self.page_to_file_data(site_page=site_page)
            # The site URL is added to the page's record locator here.
            page_data.metadata.record_locator["site_url"] = client.base_url
            yield page_data
329
+
330
+
331
@dataclass
class SharepointDownloaderConfig(DownloaderConfig):
    """Downloader configuration for SharePoint; adds no fields of its own."""
334
+
335
+
336
@dataclass
class SharepointDownloader(Downloader):
    """Downloads SharePoint documents and renders site pages to local HTML files."""

    connection_config: SharepointConnectionConfig
    download_config: SharepointDownloaderConfig
    connector_type: str = CONNECTOR_TYPE

    def get_download_path(self, file_data: FileData) -> Path:
        """Return the local path for ``file_data`` under ``self.download_dir``.

        The path mirrors the record's ``fullpath`` (minus one leading slash); site
        pages get an ``.html`` suffix.
        """
        content_type = file_data.additional_metadata.get("sharepoint_content_type")
        rel_path = file_data.source_identifiers.fullpath
        rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path
        download_path = self.download_dir / Path(rel_path)
        if content_type == SharepointContentType.SITEPAGE.value:
            # Update output extension to html if site page
            download_path = download_path.with_suffix(".html")
        return download_path

    def get_document(self, file_data: FileData) -> DownloadResponse:
        """Download the file identified by ``file_data.identifier`` to its download path."""
        client: "ClientContext" = self.connection_config.get_client()
        file: "File" = client.web.get_file_by_id(unique_id=file_data.identifier)
        download_path = self.get_download_path(file_data=file_data)
        download_path.parent.mkdir(parents=True, exist_ok=True)
        logger.debug(
            f"writing document content {file_data.source_identifiers.fullpath} to {download_path}"
        )
        with download_path.open("wb") as f:
            file.download(f).execute_query()
        return self.generate_download_response(file_data=file_data, download_path=download_path)

    def get_site_page(self, file_data: FileData) -> DownloadResponse:
        """Render a site page to an HTML file from its stored metadata.

        Titles from the ``LayoutWebpartsContent`` JSON and ``innerHTML`` fragments
        from the ``CanvasContent1`` JSON are concatenated inside a single ``<div>``.
        """
        # TODO fetch comments for site page as well
        from lxml import etree, html

        canvas_content_raw = file_data.additional_metadata.get("CanvasContent1")
        layout_web_parts_content_raw = file_data.additional_metadata.get("LayoutWebpartsContent")
        html_content = []
        if layout_web_parts_content_raw:
            layout_web_parts_content = json.loads(layout_web_parts_content_raw)
            for web_part in layout_web_parts_content:
                properties = web_part.get("properties", {})
                if title := properties.get("title"):
                    html_content.append(f"<title>{title}</title>")
        if canvas_content_raw:
            canvas_content = json.loads(canvas_content_raw)
            for canvas_part in canvas_content:
                if inner_html := canvas_part.get("innerHTML"):
                    html_content.append(inner_html)
        htmls = "".join(html_content)
        document = html.fromstring(f"<div>{htmls}</div>")
        download_path = self.get_download_path(file_data=file_data)
        download_path.parent.mkdir(parents=True, exist_ok=True)
        logger.debug(
            f"writing site page content {file_data.source_identifiers.filename} to {download_path}"
        )
        # Write as UTF-8 explicitly: the serialized page is a unicode string and the
        # platform default encoding may be unable to represent it.
        with download_path.open("w", encoding="utf-8") as f:
            f.write(etree.tostring(document, encoding="unicode", pretty_print=True))
        return self.generate_download_response(file_data=file_data, download_path=download_path)

    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
        """Download ``file_data`` according to its ``sharepoint_content_type``.

        Raises:
            ValueError: If the content type metadata is missing or is neither the
                document nor the site page type.
        """
        content_type = file_data.additional_metadata.get("sharepoint_content_type")
        if not content_type:
            raise ValueError(
                f"Missing sharepoint_content_type metadata: {file_data.additional_metadata}"
            )
        if content_type == SharepointContentType.DOCUMENT.value:
            return self.get_document(file_data=file_data)
        if content_type == SharepointContentType.SITEPAGE.value:
            return self.get_site_page(file_data=file_data)
        # Fail loudly instead of implicitly returning None for an unknown type.
        raise ValueError(f"Unsupported sharepoint_content_type: {content_type}")
404
+
405
+
406
# Source registry entry bundling the SharePoint connection, indexer and downloader types.
sharepoint_source_entry = SourceRegistryEntry(
    connection_config=SharepointConnectionConfig,
    indexer=SharepointIndexer,
    indexer_config=SharepointIndexerConfig,
    downloader=SharepointDownloader,
    downloader_config=SharepointDownloaderConfig,
)
@@ -0,0 +1,160 @@
1
+ import json
2
+ from dataclasses import dataclass
3
+ from datetime import date, datetime
4
+ from pathlib import Path
5
+ from typing import TYPE_CHECKING, Any, Optional
6
+
7
+ import numpy as np
8
+ import pandas as pd
9
+ from dateutil import parser
10
+
11
+ from unstructured_ingest.enhanced_dataclass import enhanced_field
12
+ from unstructured_ingest.utils.data_prep import batch_generator
13
+ from unstructured_ingest.utils.dep_check import requires_dependencies
14
+ from unstructured_ingest.utils.table import convert_to_pandas_dataframe
15
+ from unstructured_ingest.v2.interfaces import (
16
+ AccessConfig,
17
+ ConnectionConfig,
18
+ FileData,
19
+ UploadContent,
20
+ Uploader,
21
+ UploaderConfig,
22
+ UploadStager,
23
+ UploadStagerConfig,
24
+ )
25
+ from unstructured_ingest.v2.logger import logger
26
+ from unstructured_ingest.v2.processes.connector_registry import (
27
+ DestinationRegistryEntry,
28
+ )
29
+
30
+ if TYPE_CHECKING:
31
+ from singlestoredb.connection import Connection
32
+
33
# Connector identifier; used as the default ``connector_type`` of the uploader in this module.
CONNECTOR_TYPE: str = "singlestore"
34
+
35
+
36
@dataclass
class SingleStoreAccessConfig(AccessConfig):
    """Access settings for SingleStore; holds the optional connection password."""

    password: Optional[str] = None
39
+
40
+
41
@dataclass
class SingleStoreConnectionConfig(ConnectionConfig):
    """Connection settings for a SingleStore database."""

    host: Optional[str] = None
    port: Optional[int] = None
    user: Optional[str] = None
    database: Optional[str] = None
    access_config: SingleStoreAccessConfig = enhanced_field(sensitive=True)

    @requires_dependencies(["singlestoredb"], extras="singlestore")
    def get_connection(self) -> "Connection":
        """Open and return a ``singlestoredb`` connection built from this config."""
        import singlestoredb as s2

        connect_kwargs = {
            "host": self.host,
            "port": self.port,
            "database": self.database,
            "user": self.user,
            "password": self.access_config.password,
        }
        return s2.connect(**connect_kwargs)
61
+
62
+
63
@dataclass
class SingleStoreUploadStagerConfig(UploadStagerConfig):
    """Stager settings; ``drop_empty_cols`` is forwarded to the dataframe conversion."""

    drop_empty_cols: bool = False
66
+
67
+
68
@dataclass
class SingleStoreUploadStager(UploadStager):
    """Converts an elements JSON file into a CSV file ready for upload."""

    upload_stager_config: SingleStoreUploadStagerConfig

    @staticmethod
    def parse_date_string(date_string: str) -> date:
        """Parse a date given either as a numeric timestamp or as a date string.

        The value is first interpreted as a float timestamp; if that fails it is
        handed to ``dateutil.parser.parse``.
        """
        try:
            timestamp = float(date_string)
            return datetime.fromtimestamp(timestamp)
        except (TypeError, ValueError, OverflowError, OSError) as e:
            # Not a usable timestamp; fall through to free-form date parsing.
            logger.debug(f"date {date_string} string not a timestamp: {e}")
        return parser.parse(date_string)

    def run(
        self,
        elements_filepath: Path,
        file_data: FileData,
        output_dir: Path,
        output_filename: str,
        **kwargs: Any,
    ) -> Path:
        """Stage the elements at ``elements_filepath`` as ``<output_filename>.csv``.

        Known date columns are parsed into datetimes (missing values are left as
        ``None``) and the record locator column is serialized to JSON.

        Returns:
            Path of the CSV file written under ``output_dir``.
        """
        # JSON is read as UTF-8 rather than the platform default encoding.
        with open(elements_filepath, encoding="utf-8") as elements_file:
            elements_contents = json.load(elements_file)
        output_path = Path(output_dir) / Path(f"{output_filename}.csv")
        output_path.parent.mkdir(parents=True, exist_ok=True)

        df = convert_to_pandas_dataframe(
            elements_dict=elements_contents,
            drop_empty_cols=self.upload_stager_config.drop_empty_cols,
        )
        datetime_columns = [
            "data_source_date_created",
            "data_source_date_modified",
            "data_source_date_processed",
        ]
        for column in datetime_columns:
            if column not in df.columns:
                continue
            # Missing values (None/NaN) cannot be parsed as dates; keep them empty
            # instead of letting the date parser raise on them.
            df[column] = df[column].apply(
                lambda value: self.parse_date_string(value) if pd.notna(value) else None
            )
        if "data_source_record_locator" in df.columns:
            df["data_source_record_locator"] = df["data_source_record_locator"].apply(
                lambda x: json.dumps(x) if x else None
            )

        # Let pandas open the file itself so its own encoding and newline handling apply.
        df.to_csv(output_path, index=False)
        return output_path
113
+
114
+
115
@dataclass
class SingleStoreUploaderConfig(UploaderConfig):
    """Uploader settings: destination table and rows per ``executemany`` batch."""

    table_name: str
    batch_size: int = 100
119
+
120
+
121
@dataclass
class SingleStoreUploader(Uploader):
    """Inserts staged CSV rows into the configured SingleStore table."""

    connection_config: SingleStoreConnectionConfig
    upload_config: SingleStoreUploaderConfig
    connector_type: str = CONNECTOR_TYPE

    def upload_csv(self, content: UploadContent) -> None:
        """Read the CSV at ``content.path`` and insert its rows in batches.

        All batches are sent on one cursor and committed once at the end.
        """
        df = pd.read_csv(content.path)
        logger.debug(
            f"uploading {len(df)} entries to {self.connection_config.database} "
            f"db in table {self.upload_config.table_name}"
        )
        column_list = ", ".join(df.columns)
        placeholders = ", ".join(["%s"] * len(df.columns))
        stmt = "INSERT INTO {} ({}) VALUES ({})".format(
            self.upload_config.table_name,
            column_list,
            placeholders,
        )
        logger.debug(f"sql statement: {stmt}")
        # Swap NaN for None before the rows are handed to the database driver.
        df.replace({np.nan: None}, inplace=True)
        rows = list(df.itertuples(index=False, name=None))
        with self.connection_config.get_connection() as conn, conn.cursor() as cur:
            for batch in batch_generator(rows, batch_size=self.upload_config.batch_size):
                cur.executemany(stmt, batch)
            conn.commit()

    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
        """Upload every staged CSV in ``contents``, one after another."""
        for staged in contents:
            self.upload_csv(content=staged)
152
+
153
+
154
# Destination registry entry bundling the SingleStore connection, stager and uploader types.
singlestore_destination_entry = DestinationRegistryEntry(
    connection_config=SingleStoreConnectionConfig,
    upload_stager=SingleStoreUploadStager,
    upload_stager_config=SingleStoreUploadStagerConfig,
    uploader=SingleStoreUploader,
    uploader_config=SingleStoreUploaderConfig,
)