unstructured-ingest 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (356) hide show
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/__init__.py +14 -0
  4. unstructured_ingest/cli/base/__init__.py +0 -0
  5. unstructured_ingest/cli/base/cmd.py +19 -0
  6. unstructured_ingest/cli/base/dest.py +87 -0
  7. unstructured_ingest/cli/base/src.py +57 -0
  8. unstructured_ingest/cli/cli.py +32 -0
  9. unstructured_ingest/cli/cmd_factory.py +12 -0
  10. unstructured_ingest/cli/cmds/__init__.py +145 -0
  11. unstructured_ingest/cli/cmds/airtable.py +69 -0
  12. unstructured_ingest/cli/cmds/astra.py +99 -0
  13. unstructured_ingest/cli/cmds/azure_cognitive_search.py +65 -0
  14. unstructured_ingest/cli/cmds/biomed.py +52 -0
  15. unstructured_ingest/cli/cmds/chroma.py +104 -0
  16. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  17. unstructured_ingest/cli/cmds/confluence.py +69 -0
  18. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  19. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  20. unstructured_ingest/cli/cmds/discord.py +47 -0
  21. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  22. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  23. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  24. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  25. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  26. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  27. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  28. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  29. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  30. unstructured_ingest/cli/cmds/github.py +54 -0
  31. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  32. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  33. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  34. unstructured_ingest/cli/cmds/jira.py +71 -0
  35. unstructured_ingest/cli/cmds/kafka.py +102 -0
  36. unstructured_ingest/cli/cmds/local.py +43 -0
  37. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  38. unstructured_ingest/cli/cmds/notion.py +48 -0
  39. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  40. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  41. unstructured_ingest/cli/cmds/outlook.py +67 -0
  42. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  43. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  44. unstructured_ingest/cli/cmds/reddit.py +67 -0
  45. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  46. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  47. unstructured_ingest/cli/cmds/slack.py +56 -0
  48. unstructured_ingest/cli/cmds/sql.py +66 -0
  49. unstructured_ingest/cli/cmds/vectara.py +66 -0
  50. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  51. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  52. unstructured_ingest/cli/common.py +7 -0
  53. unstructured_ingest/cli/interfaces.py +656 -0
  54. unstructured_ingest/cli/utils.py +205 -0
  55. unstructured_ingest/connector/__init__.py +0 -0
  56. unstructured_ingest/connector/airtable.py +309 -0
  57. unstructured_ingest/connector/astra.py +237 -0
  58. unstructured_ingest/connector/azure_cognitive_search.py +144 -0
  59. unstructured_ingest/connector/biomed.py +313 -0
  60. unstructured_ingest/connector/chroma.py +158 -0
  61. unstructured_ingest/connector/clarifai.py +122 -0
  62. unstructured_ingest/connector/confluence.py +285 -0
  63. unstructured_ingest/connector/databricks_volumes.py +137 -0
  64. unstructured_ingest/connector/delta_table.py +203 -0
  65. unstructured_ingest/connector/discord.py +180 -0
  66. unstructured_ingest/connector/elasticsearch.py +396 -0
  67. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  68. unstructured_ingest/connector/fsspec/azure.py +78 -0
  69. unstructured_ingest/connector/fsspec/box.py +109 -0
  70. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  71. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  72. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  73. unstructured_ingest/connector/fsspec/s3.py +62 -0
  74. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  75. unstructured_ingest/connector/git.py +124 -0
  76. unstructured_ingest/connector/github.py +173 -0
  77. unstructured_ingest/connector/gitlab.py +142 -0
  78. unstructured_ingest/connector/google_drive.py +349 -0
  79. unstructured_ingest/connector/hubspot.py +278 -0
  80. unstructured_ingest/connector/jira.py +469 -0
  81. unstructured_ingest/connector/kafka.py +294 -0
  82. unstructured_ingest/connector/local.py +139 -0
  83. unstructured_ingest/connector/mongodb.py +285 -0
  84. unstructured_ingest/connector/notion/__init__.py +0 -0
  85. unstructured_ingest/connector/notion/client.py +233 -0
  86. unstructured_ingest/connector/notion/connector.py +468 -0
  87. unstructured_ingest/connector/notion/helpers.py +584 -0
  88. unstructured_ingest/connector/notion/interfaces.py +32 -0
  89. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  90. unstructured_ingest/connector/notion/types/block.py +95 -0
  91. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  92. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  93. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  94. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  95. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  96. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  97. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  98. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  99. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  100. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  101. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  102. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  103. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  104. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  105. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  106. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  107. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  108. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  109. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  110. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  111. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  112. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  113. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  114. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  115. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  116. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  117. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  118. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  119. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  120. unstructured_ingest/connector/notion/types/database.py +72 -0
  121. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  122. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  123. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  124. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  125. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  126. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  127. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  128. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  129. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  130. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  131. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  132. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  133. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  134. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  135. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  136. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  137. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  138. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  139. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  140. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  141. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  142. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  143. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  144. unstructured_ingest/connector/notion/types/date.py +26 -0
  145. unstructured_ingest/connector/notion/types/file.py +51 -0
  146. unstructured_ingest/connector/notion/types/page.py +44 -0
  147. unstructured_ingest/connector/notion/types/parent.py +66 -0
  148. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  149. unstructured_ingest/connector/notion/types/user.py +76 -0
  150. unstructured_ingest/connector/onedrive.py +232 -0
  151. unstructured_ingest/connector/opensearch.py +218 -0
  152. unstructured_ingest/connector/outlook.py +285 -0
  153. unstructured_ingest/connector/pinecone.py +140 -0
  154. unstructured_ingest/connector/qdrant.py +144 -0
  155. unstructured_ingest/connector/reddit.py +166 -0
  156. unstructured_ingest/connector/registry.py +109 -0
  157. unstructured_ingest/connector/salesforce.py +301 -0
  158. unstructured_ingest/connector/sharepoint.py +573 -0
  159. unstructured_ingest/connector/slack.py +224 -0
  160. unstructured_ingest/connector/sql.py +199 -0
  161. unstructured_ingest/connector/vectara.py +248 -0
  162. unstructured_ingest/connector/weaviate.py +190 -0
  163. unstructured_ingest/connector/wikipedia.py +208 -0
  164. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  165. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  166. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  167. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  168. unstructured_ingest/error.py +49 -0
  169. unstructured_ingest/evaluate.py +338 -0
  170. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  171. unstructured_ingest/ingest_backoff/_common.py +102 -0
  172. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  173. unstructured_ingest/interfaces.py +838 -0
  174. unstructured_ingest/logger.py +130 -0
  175. unstructured_ingest/main.py +11 -0
  176. unstructured_ingest/pipeline/__init__.py +22 -0
  177. unstructured_ingest/pipeline/copy.py +19 -0
  178. unstructured_ingest/pipeline/doc_factory.py +12 -0
  179. unstructured_ingest/pipeline/interfaces.py +265 -0
  180. unstructured_ingest/pipeline/partition.py +60 -0
  181. unstructured_ingest/pipeline/permissions.py +12 -0
  182. unstructured_ingest/pipeline/pipeline.py +117 -0
  183. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  184. unstructured_ingest/pipeline/reformat/chunking.py +130 -0
  185. unstructured_ingest/pipeline/reformat/embedding.py +66 -0
  186. unstructured_ingest/pipeline/source.py +77 -0
  187. unstructured_ingest/pipeline/utils.py +6 -0
  188. unstructured_ingest/pipeline/write.py +18 -0
  189. unstructured_ingest/processor.py +93 -0
  190. unstructured_ingest/runner/__init__.py +104 -0
  191. unstructured_ingest/runner/airtable.py +35 -0
  192. unstructured_ingest/runner/astra.py +34 -0
  193. unstructured_ingest/runner/base_runner.py +89 -0
  194. unstructured_ingest/runner/biomed.py +45 -0
  195. unstructured_ingest/runner/confluence.py +35 -0
  196. unstructured_ingest/runner/delta_table.py +34 -0
  197. unstructured_ingest/runner/discord.py +35 -0
  198. unstructured_ingest/runner/elasticsearch.py +40 -0
  199. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  200. unstructured_ingest/runner/fsspec/azure.py +30 -0
  201. unstructured_ingest/runner/fsspec/box.py +28 -0
  202. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  203. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  204. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  205. unstructured_ingest/runner/fsspec/s3.py +28 -0
  206. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  207. unstructured_ingest/runner/github.py +37 -0
  208. unstructured_ingest/runner/gitlab.py +37 -0
  209. unstructured_ingest/runner/google_drive.py +35 -0
  210. unstructured_ingest/runner/hubspot.py +35 -0
  211. unstructured_ingest/runner/jira.py +35 -0
  212. unstructured_ingest/runner/kafka.py +34 -0
  213. unstructured_ingest/runner/local.py +23 -0
  214. unstructured_ingest/runner/mongodb.py +34 -0
  215. unstructured_ingest/runner/notion.py +61 -0
  216. unstructured_ingest/runner/onedrive.py +35 -0
  217. unstructured_ingest/runner/opensearch.py +40 -0
  218. unstructured_ingest/runner/outlook.py +33 -0
  219. unstructured_ingest/runner/reddit.py +35 -0
  220. unstructured_ingest/runner/salesforce.py +33 -0
  221. unstructured_ingest/runner/sharepoint.py +35 -0
  222. unstructured_ingest/runner/slack.py +33 -0
  223. unstructured_ingest/runner/utils.py +47 -0
  224. unstructured_ingest/runner/wikipedia.py +35 -0
  225. unstructured_ingest/runner/writers/__init__.py +48 -0
  226. unstructured_ingest/runner/writers/astra.py +22 -0
  227. unstructured_ingest/runner/writers/azure_cognitive_search.py +24 -0
  228. unstructured_ingest/runner/writers/base_writer.py +26 -0
  229. unstructured_ingest/runner/writers/chroma.py +22 -0
  230. unstructured_ingest/runner/writers/clarifai.py +19 -0
  231. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  232. unstructured_ingest/runner/writers/delta_table.py +24 -0
  233. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  234. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  235. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  236. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  237. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  238. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  239. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  240. unstructured_ingest/runner/writers/kafka.py +21 -0
  241. unstructured_ingest/runner/writers/mongodb.py +21 -0
  242. unstructured_ingest/runner/writers/opensearch.py +26 -0
  243. unstructured_ingest/runner/writers/pinecone.py +21 -0
  244. unstructured_ingest/runner/writers/qdrant.py +19 -0
  245. unstructured_ingest/runner/writers/sql.py +22 -0
  246. unstructured_ingest/runner/writers/vectara.py +22 -0
  247. unstructured_ingest/runner/writers/weaviate.py +21 -0
  248. unstructured_ingest/utils/__init__.py +0 -0
  249. unstructured_ingest/utils/compression.py +117 -0
  250. unstructured_ingest/utils/data_prep.py +112 -0
  251. unstructured_ingest/utils/dep_check.py +66 -0
  252. unstructured_ingest/utils/string_and_date_utils.py +39 -0
  253. unstructured_ingest/utils/table.py +73 -0
  254. unstructured_ingest/v2/__init__.py +1 -0
  255. unstructured_ingest/v2/cli/__init__.py +0 -0
  256. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  257. unstructured_ingest/v2/cli/base/cmd.py +215 -0
  258. unstructured_ingest/v2/cli/base/dest.py +76 -0
  259. unstructured_ingest/v2/cli/base/importer.py +34 -0
  260. unstructured_ingest/v2/cli/base/src.py +70 -0
  261. unstructured_ingest/v2/cli/cli.py +24 -0
  262. unstructured_ingest/v2/cli/cmds/__init__.py +87 -0
  263. unstructured_ingest/v2/cli/cmds/astra.py +85 -0
  264. unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +72 -0
  265. unstructured_ingest/v2/cli/cmds/chroma.py +108 -0
  266. unstructured_ingest/v2/cli/cmds/databricks_volumes.py +161 -0
  267. unstructured_ingest/v2/cli/cmds/elasticsearch.py +159 -0
  268. unstructured_ingest/v2/cli/cmds/fsspec/__init__.py +0 -0
  269. unstructured_ingest/v2/cli/cmds/fsspec/azure.py +84 -0
  270. unstructured_ingest/v2/cli/cmds/fsspec/box.py +58 -0
  271. unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +58 -0
  272. unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +77 -0
  273. unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +81 -0
  274. unstructured_ingest/v2/cli/cmds/fsspec/s3.py +84 -0
  275. unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +80 -0
  276. unstructured_ingest/v2/cli/cmds/google_drive.py +74 -0
  277. unstructured_ingest/v2/cli/cmds/local.py +60 -0
  278. unstructured_ingest/v2/cli/cmds/mongodb.py +62 -0
  279. unstructured_ingest/v2/cli/cmds/onedrive.py +91 -0
  280. unstructured_ingest/v2/cli/cmds/opensearch.py +93 -0
  281. unstructured_ingest/v2/cli/cmds/pinecone.py +62 -0
  282. unstructured_ingest/v2/cli/cmds/salesforce.py +79 -0
  283. unstructured_ingest/v2/cli/cmds/sharepoint.py +112 -0
  284. unstructured_ingest/v2/cli/cmds/singlestore.py +96 -0
  285. unstructured_ingest/v2/cli/cmds/sql.py +84 -0
  286. unstructured_ingest/v2/cli/cmds/weaviate.py +100 -0
  287. unstructured_ingest/v2/cli/configs/__init__.py +6 -0
  288. unstructured_ingest/v2/cli/configs/chunk.py +89 -0
  289. unstructured_ingest/v2/cli/configs/embed.py +74 -0
  290. unstructured_ingest/v2/cli/configs/partition.py +99 -0
  291. unstructured_ingest/v2/cli/configs/processor.py +88 -0
  292. unstructured_ingest/v2/cli/interfaces.py +27 -0
  293. unstructured_ingest/v2/cli/utils.py +240 -0
  294. unstructured_ingest/v2/example.py +37 -0
  295. unstructured_ingest/v2/interfaces/__init__.py +29 -0
  296. unstructured_ingest/v2/interfaces/connector.py +32 -0
  297. unstructured_ingest/v2/interfaces/downloader.py +79 -0
  298. unstructured_ingest/v2/interfaces/file_data.py +49 -0
  299. unstructured_ingest/v2/interfaces/indexer.py +28 -0
  300. unstructured_ingest/v2/interfaces/process.py +20 -0
  301. unstructured_ingest/v2/interfaces/processor.py +48 -0
  302. unstructured_ingest/v2/interfaces/upload_stager.py +48 -0
  303. unstructured_ingest/v2/interfaces/uploader.py +39 -0
  304. unstructured_ingest/v2/logger.py +126 -0
  305. unstructured_ingest/v2/main.py +11 -0
  306. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  307. unstructured_ingest/v2/pipeline/interfaces.py +167 -0
  308. unstructured_ingest/v2/pipeline/pipeline.py +284 -0
  309. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  310. unstructured_ingest/v2/pipeline/steps/chunk.py +85 -0
  311. unstructured_ingest/v2/pipeline/steps/download.py +124 -0
  312. unstructured_ingest/v2/pipeline/steps/embed.py +84 -0
  313. unstructured_ingest/v2/pipeline/steps/index.py +61 -0
  314. unstructured_ingest/v2/pipeline/steps/partition.py +78 -0
  315. unstructured_ingest/v2/pipeline/steps/stage.py +64 -0
  316. unstructured_ingest/v2/pipeline/steps/uncompress.py +68 -0
  317. unstructured_ingest/v2/pipeline/steps/upload.py +73 -0
  318. unstructured_ingest/v2/pipeline/utils.py +15 -0
  319. unstructured_ingest/v2/processes/__init__.py +0 -0
  320. unstructured_ingest/v2/processes/chunker.py +97 -0
  321. unstructured_ingest/v2/processes/connector_registry.py +63 -0
  322. unstructured_ingest/v2/processes/connectors/__init__.py +77 -0
  323. unstructured_ingest/v2/processes/connectors/astra.py +152 -0
  324. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +211 -0
  325. unstructured_ingest/v2/processes/connectors/chroma.py +204 -0
  326. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +96 -0
  327. unstructured_ingest/v2/processes/connectors/elasticsearch.py +401 -0
  328. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  329. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +144 -0
  330. unstructured_ingest/v2/processes/connectors/fsspec/box.py +131 -0
  331. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +130 -0
  332. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +342 -0
  333. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +141 -0
  334. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +164 -0
  335. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +166 -0
  336. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  337. unstructured_ingest/v2/processes/connectors/google_drive.py +335 -0
  338. unstructured_ingest/v2/processes/connectors/local.py +204 -0
  339. unstructured_ingest/v2/processes/connectors/mongodb.py +138 -0
  340. unstructured_ingest/v2/processes/connectors/onedrive.py +216 -0
  341. unstructured_ingest/v2/processes/connectors/opensearch.py +155 -0
  342. unstructured_ingest/v2/processes/connectors/pinecone.py +178 -0
  343. unstructured_ingest/v2/processes/connectors/salesforce.py +293 -0
  344. unstructured_ingest/v2/processes/connectors/sharepoint.py +412 -0
  345. unstructured_ingest/v2/processes/connectors/singlestore.py +160 -0
  346. unstructured_ingest/v2/processes/connectors/sql.py +269 -0
  347. unstructured_ingest/v2/processes/connectors/utils.py +19 -0
  348. unstructured_ingest/v2/processes/connectors/weaviate.py +235 -0
  349. unstructured_ingest/v2/processes/embedder.py +76 -0
  350. unstructured_ingest/v2/processes/partitioner.py +166 -0
  351. unstructured_ingest/v2/processes/uncompress.py +43 -0
  352. unstructured_ingest-0.0.0.dist-info/METADATA +319 -0
  353. unstructured_ingest-0.0.0.dist-info/RECORD +356 -0
  354. unstructured_ingest-0.0.0.dist-info/WHEEL +5 -0
  355. unstructured_ingest-0.0.0.dist-info/entry_points.txt +2 -0
  356. unstructured_ingest-0.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,203 @@
1
+ import os
2
+ import typing as t
3
+ from dataclasses import dataclass
4
+ from datetime import datetime as dt
5
+ from multiprocessing import Process
6
+ from pathlib import Path
7
+
8
+ from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
9
+ from unstructured_ingest.interfaces import (
10
+ BaseConnectorConfig,
11
+ BaseDestinationConnector,
12
+ BaseSingleIngestDoc,
13
+ BaseSourceConnector,
14
+ IngestDocCleanupMixin,
15
+ SourceConnectorCleanupMixin,
16
+ SourceMetadata,
17
+ WriteConfig,
18
+ )
19
+ from unstructured_ingest.logger import logger
20
+ from unstructured_ingest.utils.dep_check import requires_dependencies
21
+
22
+ if t.TYPE_CHECKING:
23
+ from deltalake import DeltaTable
24
+
25
+
26
@dataclass
class SimpleDeltaTableConfig(BaseConnectorConfig):
    """Settings identifying a Delta table for the source and destination connectors."""

    # Location of the table. The source connector hands it to ``DeltaTable`` and the
    # destination connector hands it to ``write_deltalake``.
    table_uri: t.Union[str, Path]

    # The three options below are forwarded unchanged to the ``DeltaTable``
    # constructor by the source connector.
    version: t.Optional[int] = None
    storage_options: t.Optional[t.Dict[str, str]] = None
    without_files: bool = False
32
+
33
+
34
@dataclass
class DeltaTableIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
    """Ingest doc for one data file of a Delta table.

    ``get_file`` reads the file at ``uri`` as a parquet dataset and stores it as a
    CSV file inside the configured download directory.
    """

    connector_config: SimpleDeltaTableConfig
    uri: str
    modified_date: str
    created_at: str
    registry_name: str = "delta-table"

    def uri_filename(self) -> str:
        """Return the last path component of ``uri`` with its extension removed."""
        basename = os.path.basename(self.uri)
        return os.path.splitext(basename)[0]

    @property
    def filename(self):
        """Absolute path of the local CSV copy of the file."""
        return (Path(self.read_config.download_dir) / f"{self.uri_filename()}.csv").resolve()

    @property
    def _output_filename(self):
        """Path of the JSON output file, named after the source file."""
        return Path(self.processor_config.output_dir) / f"{self.uri_filename()}.json"

    def _create_full_tmp_dir_path(self):
        """Create the parent directories of the download and output files."""
        self.filename.parent.mkdir(parents=True, exist_ok=True)
        self._output_filename.parent.mkdir(parents=True, exist_ok=True)

    @requires_dependencies(["fsspec"], extras="delta-table")
    def _get_fs_from_uri(self):
        """Resolve the fsspec filesystem that serves ``uri``.

        Raises:
            ImportError: if fsspec needs an optional package to handle the uri.
        """
        from fsspec.core import url_to_fs

        try:
            fs, _ = url_to_fs(self.uri)
        except ImportError as error:
            # Chain the original error so the missing package stays visible in the traceback.
            raise ImportError(
                f"uri {self.uri} may be associated with a filesystem that "
                f"requires additional dependencies: {error}",
            ) from error
        return fs

    def update_source_metadata(self, **kwargs):
        """Populate ``source_metadata`` from the remote file.

        Args:
            **kwargs: may carry ``fs``, an already resolved filesystem for ``uri``.
                When absent, the filesystem is resolved from the uri.
        """
        # Only resolve a filesystem when the caller did not supply one; a default
        # passed to ``dict.get`` would be evaluated on every call.
        fs = kwargs.get("fs")
        if fs is None:
            fs = self._get_fs_from_uri()

        # Check existence first so a missing file is recorded as ``exists=False``
        # instead of failing while computing its version.
        file_exists = fs.exists(self.uri)
        version = None
        if file_exists:
            # NOTE(review): fsspec allows ``protocol`` to be a tuple of aliases, in
            # which case this comparison is always true -- confirm for GCS.
            version = (
                fs.checksum(self.uri) if fs.protocol != "gs" else fs.info(self.uri).get("etag", "")
            )
        self.source_metadata = SourceMetadata(
            date_created=self.created_at,
            date_modified=self.modified_date,
            version=version,
            source_url=self.uri,
            exists=file_exists,
        )

    @SourceConnectionError.wrap
    @BaseSingleIngestDoc.skip_if_file_exists
    def get_file(self):
        """Download the file's rows and write them to ``filename`` as CSV."""
        fs = self._get_fs_from_uri()
        self.update_source_metadata(fs=fs)
        logger.info(f"using a {fs} filesystem to collect table data")
        self._create_full_tmp_dir_path()

        df = self._get_df(filesystem=fs)

        logger.info(f"writing {len(df)} rows to {self.filename}")
        df.to_csv(self.filename)

    @SourceConnectionNetworkError.wrap
    def _get_df(self, filesystem):
        """Read the parquet data at ``uri`` through ``filesystem`` into a pandas DataFrame."""
        import pyarrow.parquet as pq

        return pq.ParquetDataset(self.uri, filesystem=filesystem).read_pandas().to_pandas()
105
+
106
+
107
@dataclass
class DeltaTableSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
    """Source connector that yields one ingest doc per data file of a Delta table."""

    connector_config: SimpleDeltaTableConfig
    # Set by ``initialize``; ``get_ingest_docs`` refuses to run while it is None.
    delta_table: t.Optional["DeltaTable"] = None

    def check_connection(self):
        pass

    @requires_dependencies(["deltalake"], extras="delta-table")
    def initialize(self):
        """Open the configured table and verify that it holds at least one row.

        Raises:
            ValueError: if the table contains no rows.
        """
        from deltalake import DeltaTable

        self.delta_table = DeltaTable(
            table_uri=self.connector_config.table_uri,
            version=self.connector_config.version,
            storage_options=self.connector_config.storage_options,
            without_files=self.connector_config.without_files,
        )
        rows = self.delta_table.to_pyarrow_dataset().count_rows()
        if not rows > 0:
            raise ValueError(f"no data found at {self.connector_config.table_uri}")
        logger.info(f"processing {rows} rows of data")

    def get_ingest_docs(self):
        """Batches the results into distinct docs

        Raises:
            ValueError: if ``initialize`` has not been called.
        """
        if self.delta_table is None:
            raise ValueError("delta table was never initialized")
        actions = self.delta_table.get_add_actions().to_pandas()
        # Key by basename because the lookup below uses the basename of each file
        # uri; an add-action path that includes a directory (e.g. a partition
        # folder) would otherwise never match and raise KeyError.
        mod_date_by_basename = {
            os.path.basename(row["path"]): str(row["modification_time"])
            for _, row in actions.iterrows()
        }
        # ``created_time`` is divided by 1000 before being handed to ``fromtimestamp``.
        created_at = str(dt.fromtimestamp(self.delta_table.metadata().created_time / 1000))
        return [
            DeltaTableIngestDoc(
                connector_config=self.connector_config,
                processor_config=self.processor_config,
                read_config=self.read_config,
                uri=uri,
                modified_date=mod_date_by_basename[os.path.basename(uri)],
                created_at=created_at,
            )
            for uri in self.delta_table.file_uris()
        ]
150
+
151
+
152
@dataclass
class DeltaTableWriteConfig(WriteConfig):
    """Options controlling how the destination connector writes to a Delta table."""

    # Passed to ``convert_to_pandas_dataframe`` when building the frame to write.
    drop_empty_cols: bool = False

    # ``mode`` and ``engine`` are always forwarded to ``write_deltalake``;
    # ``schema_mode`` is forwarded only when it is not None.
    mode: t.Literal["error", "append", "overwrite", "ignore"] = "error"
    schema_mode: t.Optional[t.Literal["merge", "overwrite"]] = None
    engine: t.Literal["pyarrow", "rust"] = "pyarrow"
158
+
159
+
160
@dataclass
class DeltaTableDestinationConnector(BaseDestinationConnector):
    """Destination connector that writes element dictionaries to a Delta table."""

    write_config: DeltaTableWriteConfig
    connector_config: SimpleDeltaTableConfig

    @requires_dependencies(["deltalake"], extras="delta-table")
    def initialize(self):
        pass

    def check_connection(self):
        pass

    @requires_dependencies(["deltalake"], extras="delta-table")
    def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
        """Convert ``elements_dict`` to a DataFrame and write it to the table.

        Args:
            elements_dict: the element dictionaries to write.

        Raises:
            RuntimeError: if the writer subprocess exits with a positive exit
                code, i.e. the write itself failed.
        """
        from deltalake.writer import write_deltalake

        from unstructured_ingest.utils.table import convert_to_pandas_dataframe

        df = convert_to_pandas_dataframe(
            elements_dict=elements_dict,
            drop_empty_cols=self.write_config.drop_empty_cols,
        )
        logger.info(
            f"writing {len(df)} rows to destination table "
            f"at {self.connector_config.table_uri}\ndtypes: {df.dtypes}",
        )
        writer_kwargs = {
            "table_or_uri": self.connector_config.table_uri,
            "data": df,
            "mode": self.write_config.mode,
            "engine": self.write_config.engine,
        }
        if self.write_config.schema_mode is not None:
            writer_kwargs["schema_mode"] = self.write_config.schema_mode
        # NOTE: deltalake writer on Linux sometimes can finish but still trigger a SIGABRT and cause
        # ingest to fail, even though all tasks are completed normally. Putting the writer into a
        # process mitigates this issue by ensuring python interpreter waits properly for deltalake's
        # rust backend to finish
        writer = Process(
            target=write_deltalake,
            kwargs=writer_kwargs,
        )
        writer.start()
        writer.join()

        exit_code = writer.exitcode
        if exit_code is not None and exit_code > 0:
            # A positive exit code means the child terminated through an uncaught
            # exception or sys.exit, so the data was not written; do not report success.
            raise RuntimeError(
                f"delta table writer process failed with exit code {exit_code} "
                f"for table at {self.connector_config.table_uri}"
            )
        if exit_code is not None and exit_code < 0:
            # A negative exit code means termination by a signal. Per the NOTE above
            # this can happen after the write has completed, so only warn.
            logger.warning(
                f"delta table writer process was terminated by signal {-exit_code}; "
                f"verify the table at {self.connector_config.table_uri}"
            )
@@ -0,0 +1,180 @@
1
+ import datetime as dt
2
+ import typing as t
3
+ from dataclasses import dataclass
4
+ from pathlib import Path
5
+
6
+ from unstructured_ingest.enhanced_dataclass import enhanced_field
7
+ from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
8
+ from unstructured_ingest.interfaces import (
9
+ AccessConfig,
10
+ BaseConnectorConfig,
11
+ BaseSingleIngestDoc,
12
+ BaseSourceConnector,
13
+ IngestDocCleanupMixin,
14
+ SourceConnectorCleanupMixin,
15
+ SourceMetadata,
16
+ )
17
+ from unstructured_ingest.logger import logger
18
+ from unstructured_ingest.utils.dep_check import (
19
+ requires_dependencies,
20
+ )
21
+
22
+
23
@dataclass
class DiscordAccessConfig(AccessConfig):
    """Credentials used by the Discord connector."""

    # Token handed to the Discord client/bot when logging in; declared as a
    # sensitive field.
    token: str = enhanced_field(sensitive=True)
26
+
27
+
28
@dataclass
class SimpleDiscordConfig(BaseConnectorConfig):
    """Connector config listing the Discord channels to pull messages from."""

    # Discord Specific Options
    access_config: DiscordAccessConfig
    # One ingest doc is created per entry; each entry is converted with ``int()``
    # before the channel is looked up.
    channels: t.List[str]
    # Handed to every ingest doc as ``days``: when set, only messages from the
    # last ``period`` days are requested.
    period: t.Optional[int] = None
38
+
39
+
40
@dataclass
class DiscordIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
    """Class encapsulating fetching a doc and writing processed results (but not
    doing the processing!).

    Also includes a cleanup method. When things go wrong and the cleanup
    method is not called, the file is left behind on the filesystem to assist debugging.
    """

    connector_config: SimpleDiscordConfig
    channel: str
    days: t.Optional[int] = None
    registry_name: str = "discord"

    # NOTE(crag): probably doesn't matter, but intentionally not defining tmp_download_file
    # __post_init__ for multiprocessing simplicity (no Path objects in initially
    # instantiated object)
    def _tmp_download_file(self):
        """Return the path of the text file the channel's messages are written to."""
        channel_file = self.channel + ".txt"
        return Path(self.read_config.download_dir) / channel_file

    @property
    def _output_filename(self):
        """Path of the JSON output file, named after the channel."""
        output_file = self.channel + ".json"
        return Path(self.processor_config.output_dir) / output_file

    def _create_full_tmp_dir_path(self):
        """Create the directory that will hold the downloaded file."""
        self._tmp_download_file().parent.mkdir(parents=True, exist_ok=True)

    @SourceConnectionNetworkError.wrap
    @requires_dependencies(dependencies=["discord"], extras="discord")
    def _get_messages(self):
        """Actually fetches the data from discord.

        Returns:
            A ``(messages, jump_url)`` tuple. ``messages`` is empty and ``jump_url``
            is None when nothing could be collected.
        """
        import discord
        from discord.ext import commands

        messages: t.List[discord.Message] = []
        jumpurl: t.List[str] = []
        intents = discord.Intents.default()
        intents.message_content = True
        bot = commands.Bot(command_prefix=">", intents=intents)

        @bot.event
        async def on_ready():
            try:
                after_date = None
                if self.days:
                    # Use an aware UTC datetime: discord.py interprets naive
                    # datetimes as local time, which would shift the cutoff.
                    after_date = dt.datetime.now(dt.timezone.utc) - dt.timedelta(days=self.days)
                channel = bot.get_channel(int(self.channel))
                jumpurl.append(channel.jump_url)  # type: ignore
                # NOTE(review): ``history`` is called without ``limit``; discord.py
                # documents a default cap of 100 messages -- confirm this is intended.
                async for msg in channel.history(after=after_date):  # type: ignore
                    messages.append(msg)
            except Exception:
                logger.error("Error fetching messages", exc_info=True)
                raise
            finally:
                # Always close the bot so ``bot.run`` below returns.
                await bot.close()

        bot.run(self.connector_config.access_config.token)
        jump_url = jumpurl[0] if jumpurl else None
        return messages, jump_url

    def update_source_metadata(self, **kwargs):
        """Populate ``source_metadata`` from the channel's messages.

        Args:
            **kwargs: may carry ``messages_tuple``, a ``(messages, jump_url)`` pair
                that was already fetched. When absent, the messages are fetched.
        """
        # Fetch only when the caller did not supply the messages; a default passed
        # to ``dict.get`` would run the bot a second time on every call.
        messages_tuple = kwargs.get("messages_tuple")
        if messages_tuple is None:
            messages_tuple = self._get_messages()
        messages, jump_url = messages_tuple
        if not messages:
            self.source_metadata = SourceMetadata(
                exists=False,
            )
            return
        dates = sorted(m.created_at for m in messages if m.created_at)
        self.source_metadata = SourceMetadata(
            # Guard against messages without a timestamp instead of raising IndexError.
            date_created=dates[0].isoformat() if dates else None,
            date_modified=dates[-1].isoformat() if dates else None,
            source_url=jump_url,
            exists=True,
        )

    @SourceConnectionError.wrap
    @BaseSingleIngestDoc.skip_if_file_exists
    def get_file(self):
        """Fetch the channel's messages and write their contents, one per line.

        Raises:
            ValueError: if no messages were retrieved for the channel.
        """
        self._create_full_tmp_dir_path()

        messages, jump_url = self._get_messages()
        self.update_source_metadata(messages_tuple=(messages, jump_url))
        if not messages:
            raise ValueError(f"Failed to retrieve messages from Discord channel {self.channel}")
        # Explicit encoding so message text does not depend on the platform's
        # default encoding.
        with open(self._tmp_download_file(), "w", encoding="utf-8") as f:
            f.writelines(m.content + "\n" for m in messages)

    @property
    def filename(self):
        """The filename of the file created from a discord channel"""
        return self._tmp_download_file()

    @property
    def version(self) -> t.Optional[str]:
        return None

    @property
    def record_locator(self) -> t.Optional[t.Dict[str, t.Any]]:
        return {
            "channel": self.channel,
        }
145
+
146
+
147
class DiscordSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
    """Source connector that creates one ``DiscordIngestDoc`` per configured channel."""

    connector_config: SimpleDiscordConfig

    def initialize(self):
        pass

    @requires_dependencies(dependencies=["discord"], extras="discord")
    def check_connection(self):
        """Validate the configured token by logging in to Discord.

        Raises:
            SourceConnectionError: if the login attempt fails.
        """
        import asyncio

        import discord
        from discord.client import Client

        token = self.connector_config.access_config.token

        async def _login_only():
            # ``Client.start`` would also open the gateway connection and keep
            # running until the client is closed, so a valid token would never
            # return. Logging in is enough to validate the credentials.
            client = Client(intents=discord.Intents.default())
            try:
                await client.login(token)
            finally:
                await client.close()

        try:
            asyncio.run(_login_only())
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise SourceConnectionError(f"failed to validate connection: {e}") from e

    def get_ingest_docs(self):
        """Return one ingest doc for each channel in the connector config."""
        return [
            DiscordIngestDoc(
                connector_config=self.connector_config,
                processor_config=self.processor_config,
                read_config=self.read_config,
                channel=channel,
                days=self.connector_config.period,
            )
            for channel in self.connector_config.channels
        ]