unstructured-ingest 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (356) hide show
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/__init__.py +14 -0
  4. unstructured_ingest/cli/base/__init__.py +0 -0
  5. unstructured_ingest/cli/base/cmd.py +19 -0
  6. unstructured_ingest/cli/base/dest.py +87 -0
  7. unstructured_ingest/cli/base/src.py +57 -0
  8. unstructured_ingest/cli/cli.py +32 -0
  9. unstructured_ingest/cli/cmd_factory.py +12 -0
  10. unstructured_ingest/cli/cmds/__init__.py +145 -0
  11. unstructured_ingest/cli/cmds/airtable.py +69 -0
  12. unstructured_ingest/cli/cmds/astra.py +99 -0
  13. unstructured_ingest/cli/cmds/azure_cognitive_search.py +65 -0
  14. unstructured_ingest/cli/cmds/biomed.py +52 -0
  15. unstructured_ingest/cli/cmds/chroma.py +104 -0
  16. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  17. unstructured_ingest/cli/cmds/confluence.py +69 -0
  18. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  19. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  20. unstructured_ingest/cli/cmds/discord.py +47 -0
  21. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  22. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  23. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  24. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  25. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  26. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  27. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  28. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  29. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  30. unstructured_ingest/cli/cmds/github.py +54 -0
  31. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  32. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  33. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  34. unstructured_ingest/cli/cmds/jira.py +71 -0
  35. unstructured_ingest/cli/cmds/kafka.py +102 -0
  36. unstructured_ingest/cli/cmds/local.py +43 -0
  37. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  38. unstructured_ingest/cli/cmds/notion.py +48 -0
  39. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  40. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  41. unstructured_ingest/cli/cmds/outlook.py +67 -0
  42. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  43. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  44. unstructured_ingest/cli/cmds/reddit.py +67 -0
  45. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  46. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  47. unstructured_ingest/cli/cmds/slack.py +56 -0
  48. unstructured_ingest/cli/cmds/sql.py +66 -0
  49. unstructured_ingest/cli/cmds/vectara.py +66 -0
  50. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  51. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  52. unstructured_ingest/cli/common.py +7 -0
  53. unstructured_ingest/cli/interfaces.py +656 -0
  54. unstructured_ingest/cli/utils.py +205 -0
  55. unstructured_ingest/connector/__init__.py +0 -0
  56. unstructured_ingest/connector/airtable.py +309 -0
  57. unstructured_ingest/connector/astra.py +237 -0
  58. unstructured_ingest/connector/azure_cognitive_search.py +144 -0
  59. unstructured_ingest/connector/biomed.py +313 -0
  60. unstructured_ingest/connector/chroma.py +158 -0
  61. unstructured_ingest/connector/clarifai.py +122 -0
  62. unstructured_ingest/connector/confluence.py +285 -0
  63. unstructured_ingest/connector/databricks_volumes.py +137 -0
  64. unstructured_ingest/connector/delta_table.py +203 -0
  65. unstructured_ingest/connector/discord.py +180 -0
  66. unstructured_ingest/connector/elasticsearch.py +396 -0
  67. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  68. unstructured_ingest/connector/fsspec/azure.py +78 -0
  69. unstructured_ingest/connector/fsspec/box.py +109 -0
  70. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  71. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  72. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  73. unstructured_ingest/connector/fsspec/s3.py +62 -0
  74. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  75. unstructured_ingest/connector/git.py +124 -0
  76. unstructured_ingest/connector/github.py +173 -0
  77. unstructured_ingest/connector/gitlab.py +142 -0
  78. unstructured_ingest/connector/google_drive.py +349 -0
  79. unstructured_ingest/connector/hubspot.py +278 -0
  80. unstructured_ingest/connector/jira.py +469 -0
  81. unstructured_ingest/connector/kafka.py +294 -0
  82. unstructured_ingest/connector/local.py +139 -0
  83. unstructured_ingest/connector/mongodb.py +285 -0
  84. unstructured_ingest/connector/notion/__init__.py +0 -0
  85. unstructured_ingest/connector/notion/client.py +233 -0
  86. unstructured_ingest/connector/notion/connector.py +468 -0
  87. unstructured_ingest/connector/notion/helpers.py +584 -0
  88. unstructured_ingest/connector/notion/interfaces.py +32 -0
  89. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  90. unstructured_ingest/connector/notion/types/block.py +95 -0
  91. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  92. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  93. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  94. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  95. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  96. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  97. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  98. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  99. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  100. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  101. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  102. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  103. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  104. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  105. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  106. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  107. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  108. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  109. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  110. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  111. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  112. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  113. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  114. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  115. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  116. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  117. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  118. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  119. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  120. unstructured_ingest/connector/notion/types/database.py +72 -0
  121. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  122. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  123. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  124. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  125. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  126. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  127. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  128. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  129. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  130. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  131. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  132. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  133. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  134. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  135. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  136. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  137. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  138. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  139. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  140. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  141. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  142. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  143. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  144. unstructured_ingest/connector/notion/types/date.py +26 -0
  145. unstructured_ingest/connector/notion/types/file.py +51 -0
  146. unstructured_ingest/connector/notion/types/page.py +44 -0
  147. unstructured_ingest/connector/notion/types/parent.py +66 -0
  148. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  149. unstructured_ingest/connector/notion/types/user.py +76 -0
  150. unstructured_ingest/connector/onedrive.py +232 -0
  151. unstructured_ingest/connector/opensearch.py +218 -0
  152. unstructured_ingest/connector/outlook.py +285 -0
  153. unstructured_ingest/connector/pinecone.py +140 -0
  154. unstructured_ingest/connector/qdrant.py +144 -0
  155. unstructured_ingest/connector/reddit.py +166 -0
  156. unstructured_ingest/connector/registry.py +109 -0
  157. unstructured_ingest/connector/salesforce.py +301 -0
  158. unstructured_ingest/connector/sharepoint.py +573 -0
  159. unstructured_ingest/connector/slack.py +224 -0
  160. unstructured_ingest/connector/sql.py +199 -0
  161. unstructured_ingest/connector/vectara.py +248 -0
  162. unstructured_ingest/connector/weaviate.py +190 -0
  163. unstructured_ingest/connector/wikipedia.py +208 -0
  164. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  165. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  166. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  167. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  168. unstructured_ingest/error.py +49 -0
  169. unstructured_ingest/evaluate.py +338 -0
  170. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  171. unstructured_ingest/ingest_backoff/_common.py +102 -0
  172. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  173. unstructured_ingest/interfaces.py +838 -0
  174. unstructured_ingest/logger.py +130 -0
  175. unstructured_ingest/main.py +11 -0
  176. unstructured_ingest/pipeline/__init__.py +22 -0
  177. unstructured_ingest/pipeline/copy.py +19 -0
  178. unstructured_ingest/pipeline/doc_factory.py +12 -0
  179. unstructured_ingest/pipeline/interfaces.py +265 -0
  180. unstructured_ingest/pipeline/partition.py +60 -0
  181. unstructured_ingest/pipeline/permissions.py +12 -0
  182. unstructured_ingest/pipeline/pipeline.py +117 -0
  183. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  184. unstructured_ingest/pipeline/reformat/chunking.py +130 -0
  185. unstructured_ingest/pipeline/reformat/embedding.py +66 -0
  186. unstructured_ingest/pipeline/source.py +77 -0
  187. unstructured_ingest/pipeline/utils.py +6 -0
  188. unstructured_ingest/pipeline/write.py +18 -0
  189. unstructured_ingest/processor.py +93 -0
  190. unstructured_ingest/runner/__init__.py +104 -0
  191. unstructured_ingest/runner/airtable.py +35 -0
  192. unstructured_ingest/runner/astra.py +34 -0
  193. unstructured_ingest/runner/base_runner.py +89 -0
  194. unstructured_ingest/runner/biomed.py +45 -0
  195. unstructured_ingest/runner/confluence.py +35 -0
  196. unstructured_ingest/runner/delta_table.py +34 -0
  197. unstructured_ingest/runner/discord.py +35 -0
  198. unstructured_ingest/runner/elasticsearch.py +40 -0
  199. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  200. unstructured_ingest/runner/fsspec/azure.py +30 -0
  201. unstructured_ingest/runner/fsspec/box.py +28 -0
  202. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  203. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  204. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  205. unstructured_ingest/runner/fsspec/s3.py +28 -0
  206. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  207. unstructured_ingest/runner/github.py +37 -0
  208. unstructured_ingest/runner/gitlab.py +37 -0
  209. unstructured_ingest/runner/google_drive.py +35 -0
  210. unstructured_ingest/runner/hubspot.py +35 -0
  211. unstructured_ingest/runner/jira.py +35 -0
  212. unstructured_ingest/runner/kafka.py +34 -0
  213. unstructured_ingest/runner/local.py +23 -0
  214. unstructured_ingest/runner/mongodb.py +34 -0
  215. unstructured_ingest/runner/notion.py +61 -0
  216. unstructured_ingest/runner/onedrive.py +35 -0
  217. unstructured_ingest/runner/opensearch.py +40 -0
  218. unstructured_ingest/runner/outlook.py +33 -0
  219. unstructured_ingest/runner/reddit.py +35 -0
  220. unstructured_ingest/runner/salesforce.py +33 -0
  221. unstructured_ingest/runner/sharepoint.py +35 -0
  222. unstructured_ingest/runner/slack.py +33 -0
  223. unstructured_ingest/runner/utils.py +47 -0
  224. unstructured_ingest/runner/wikipedia.py +35 -0
  225. unstructured_ingest/runner/writers/__init__.py +48 -0
  226. unstructured_ingest/runner/writers/astra.py +22 -0
  227. unstructured_ingest/runner/writers/azure_cognitive_search.py +24 -0
  228. unstructured_ingest/runner/writers/base_writer.py +26 -0
  229. unstructured_ingest/runner/writers/chroma.py +22 -0
  230. unstructured_ingest/runner/writers/clarifai.py +19 -0
  231. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  232. unstructured_ingest/runner/writers/delta_table.py +24 -0
  233. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  234. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  235. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  236. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  237. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  238. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  239. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  240. unstructured_ingest/runner/writers/kafka.py +21 -0
  241. unstructured_ingest/runner/writers/mongodb.py +21 -0
  242. unstructured_ingest/runner/writers/opensearch.py +26 -0
  243. unstructured_ingest/runner/writers/pinecone.py +21 -0
  244. unstructured_ingest/runner/writers/qdrant.py +19 -0
  245. unstructured_ingest/runner/writers/sql.py +22 -0
  246. unstructured_ingest/runner/writers/vectara.py +22 -0
  247. unstructured_ingest/runner/writers/weaviate.py +21 -0
  248. unstructured_ingest/utils/__init__.py +0 -0
  249. unstructured_ingest/utils/compression.py +117 -0
  250. unstructured_ingest/utils/data_prep.py +112 -0
  251. unstructured_ingest/utils/dep_check.py +66 -0
  252. unstructured_ingest/utils/string_and_date_utils.py +39 -0
  253. unstructured_ingest/utils/table.py +73 -0
  254. unstructured_ingest/v2/__init__.py +1 -0
  255. unstructured_ingest/v2/cli/__init__.py +0 -0
  256. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  257. unstructured_ingest/v2/cli/base/cmd.py +215 -0
  258. unstructured_ingest/v2/cli/base/dest.py +76 -0
  259. unstructured_ingest/v2/cli/base/importer.py +34 -0
  260. unstructured_ingest/v2/cli/base/src.py +70 -0
  261. unstructured_ingest/v2/cli/cli.py +24 -0
  262. unstructured_ingest/v2/cli/cmds/__init__.py +87 -0
  263. unstructured_ingest/v2/cli/cmds/astra.py +85 -0
  264. unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +72 -0
  265. unstructured_ingest/v2/cli/cmds/chroma.py +108 -0
  266. unstructured_ingest/v2/cli/cmds/databricks_volumes.py +161 -0
  267. unstructured_ingest/v2/cli/cmds/elasticsearch.py +159 -0
  268. unstructured_ingest/v2/cli/cmds/fsspec/__init__.py +0 -0
  269. unstructured_ingest/v2/cli/cmds/fsspec/azure.py +84 -0
  270. unstructured_ingest/v2/cli/cmds/fsspec/box.py +58 -0
  271. unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +58 -0
  272. unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +77 -0
  273. unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +81 -0
  274. unstructured_ingest/v2/cli/cmds/fsspec/s3.py +84 -0
  275. unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +80 -0
  276. unstructured_ingest/v2/cli/cmds/google_drive.py +74 -0
  277. unstructured_ingest/v2/cli/cmds/local.py +60 -0
  278. unstructured_ingest/v2/cli/cmds/mongodb.py +62 -0
  279. unstructured_ingest/v2/cli/cmds/onedrive.py +91 -0
  280. unstructured_ingest/v2/cli/cmds/opensearch.py +93 -0
  281. unstructured_ingest/v2/cli/cmds/pinecone.py +62 -0
  282. unstructured_ingest/v2/cli/cmds/salesforce.py +79 -0
  283. unstructured_ingest/v2/cli/cmds/sharepoint.py +112 -0
  284. unstructured_ingest/v2/cli/cmds/singlestore.py +96 -0
  285. unstructured_ingest/v2/cli/cmds/sql.py +84 -0
  286. unstructured_ingest/v2/cli/cmds/weaviate.py +100 -0
  287. unstructured_ingest/v2/cli/configs/__init__.py +6 -0
  288. unstructured_ingest/v2/cli/configs/chunk.py +89 -0
  289. unstructured_ingest/v2/cli/configs/embed.py +74 -0
  290. unstructured_ingest/v2/cli/configs/partition.py +99 -0
  291. unstructured_ingest/v2/cli/configs/processor.py +88 -0
  292. unstructured_ingest/v2/cli/interfaces.py +27 -0
  293. unstructured_ingest/v2/cli/utils.py +240 -0
  294. unstructured_ingest/v2/example.py +37 -0
  295. unstructured_ingest/v2/interfaces/__init__.py +29 -0
  296. unstructured_ingest/v2/interfaces/connector.py +32 -0
  297. unstructured_ingest/v2/interfaces/downloader.py +79 -0
  298. unstructured_ingest/v2/interfaces/file_data.py +49 -0
  299. unstructured_ingest/v2/interfaces/indexer.py +28 -0
  300. unstructured_ingest/v2/interfaces/process.py +20 -0
  301. unstructured_ingest/v2/interfaces/processor.py +48 -0
  302. unstructured_ingest/v2/interfaces/upload_stager.py +48 -0
  303. unstructured_ingest/v2/interfaces/uploader.py +39 -0
  304. unstructured_ingest/v2/logger.py +126 -0
  305. unstructured_ingest/v2/main.py +11 -0
  306. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  307. unstructured_ingest/v2/pipeline/interfaces.py +167 -0
  308. unstructured_ingest/v2/pipeline/pipeline.py +284 -0
  309. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  310. unstructured_ingest/v2/pipeline/steps/chunk.py +85 -0
  311. unstructured_ingest/v2/pipeline/steps/download.py +124 -0
  312. unstructured_ingest/v2/pipeline/steps/embed.py +84 -0
  313. unstructured_ingest/v2/pipeline/steps/index.py +61 -0
  314. unstructured_ingest/v2/pipeline/steps/partition.py +78 -0
  315. unstructured_ingest/v2/pipeline/steps/stage.py +64 -0
  316. unstructured_ingest/v2/pipeline/steps/uncompress.py +68 -0
  317. unstructured_ingest/v2/pipeline/steps/upload.py +73 -0
  318. unstructured_ingest/v2/pipeline/utils.py +15 -0
  319. unstructured_ingest/v2/processes/__init__.py +0 -0
  320. unstructured_ingest/v2/processes/chunker.py +97 -0
  321. unstructured_ingest/v2/processes/connector_registry.py +63 -0
  322. unstructured_ingest/v2/processes/connectors/__init__.py +77 -0
  323. unstructured_ingest/v2/processes/connectors/astra.py +152 -0
  324. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +211 -0
  325. unstructured_ingest/v2/processes/connectors/chroma.py +204 -0
  326. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +96 -0
  327. unstructured_ingest/v2/processes/connectors/elasticsearch.py +401 -0
  328. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  329. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +144 -0
  330. unstructured_ingest/v2/processes/connectors/fsspec/box.py +131 -0
  331. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +130 -0
  332. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +342 -0
  333. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +141 -0
  334. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +164 -0
  335. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +166 -0
  336. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  337. unstructured_ingest/v2/processes/connectors/google_drive.py +335 -0
  338. unstructured_ingest/v2/processes/connectors/local.py +204 -0
  339. unstructured_ingest/v2/processes/connectors/mongodb.py +138 -0
  340. unstructured_ingest/v2/processes/connectors/onedrive.py +216 -0
  341. unstructured_ingest/v2/processes/connectors/opensearch.py +155 -0
  342. unstructured_ingest/v2/processes/connectors/pinecone.py +178 -0
  343. unstructured_ingest/v2/processes/connectors/salesforce.py +293 -0
  344. unstructured_ingest/v2/processes/connectors/sharepoint.py +412 -0
  345. unstructured_ingest/v2/processes/connectors/singlestore.py +160 -0
  346. unstructured_ingest/v2/processes/connectors/sql.py +269 -0
  347. unstructured_ingest/v2/processes/connectors/utils.py +19 -0
  348. unstructured_ingest/v2/processes/connectors/weaviate.py +235 -0
  349. unstructured_ingest/v2/processes/embedder.py +76 -0
  350. unstructured_ingest/v2/processes/partitioner.py +166 -0
  351. unstructured_ingest/v2/processes/uncompress.py +43 -0
  352. unstructured_ingest-0.0.0.dist-info/METADATA +319 -0
  353. unstructured_ingest-0.0.0.dist-info/RECORD +356 -0
  354. unstructured_ingest-0.0.0.dist-info/WHEEL +5 -0
  355. unstructured_ingest-0.0.0.dist-info/entry_points.txt +2 -0
  356. unstructured_ingest-0.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,216 @@
1
+ import json
2
+ from dataclasses import dataclass, field
3
+ from pathlib import Path
4
+ from time import time
5
+ from typing import TYPE_CHECKING, Any, Generator, Optional
6
+
7
+ from dateutil import parser
8
+ from unstructured.documents.elements import DataSourceMetadata
9
+
10
+ from unstructured_ingest.enhanced_dataclass import enhanced_field
11
+ from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
12
+ from unstructured_ingest.utils.dep_check import requires_dependencies
13
+ from unstructured_ingest.v2.interfaces import (
14
+ AccessConfig,
15
+ ConnectionConfig,
16
+ Downloader,
17
+ DownloaderConfig,
18
+ DownloadResponse,
19
+ FileData,
20
+ Indexer,
21
+ IndexerConfig,
22
+ SourceIdentifiers,
23
+ download_responses,
24
+ )
25
+ from unstructured_ingest.v2.logger import logger
26
+ from unstructured_ingest.v2.processes.connector_registry import (
27
+ SourceRegistryEntry,
28
+ )
29
+
30
+ if TYPE_CHECKING:
31
+ from office365.graph_client import GraphClient
32
+ from office365.onedrive.driveitems.driveItem import DriveItem
33
+
34
# Connector name stamped on every FileData record produced by this module.
CONNECTOR_TYPE = "onedrive"
# Size threshold above which a download switches to a chunked session.
# NOTE: despite the "MB" in the name, this value is compared against the
# item's "size" property, which the downloader logs as a byte count.
MAX_MB_SIZE = 512 * 1_000_000
36
+
37
+
38
@dataclass
class OnedriveAccessConfig(AccessConfig):
    """Secret material used to authenticate against OneDrive."""

    # Handed to MSAL as the ``client_credential`` when a token is requested.
    client_cred: str
41
+
42
+
43
@dataclass
class OnedriveConnectionConfig(ConnectionConfig):
    """Connection settings plus token/client construction for a OneDrive user drive."""

    # Application (client) id handed to MSAL.
    client_id: str
    # Key used to select the user via ``client.users[...]``.
    user_pname: str
    # Tenant segment appended to ``authority_url``; excluded from ``repr``.
    tenant: str = field(repr=False)
    # Base login URL; the MSAL authority is built as ``{authority_url}/{tenant}``.
    authority_url: Optional[str] = field(repr=False, default="https://login.microsoftonline.com")
    access_config: OnedriveAccessConfig = enhanced_field(sensitive=True)

    @requires_dependencies(["msal"], extras="onedrive")
    def get_token(self):
        """Acquire a client-credentials token for Microsoft Graph through MSAL.

        Returns:
            The token payload returned by ``acquire_token_for_client``.

        Raises:
            ValueError: re-raised after logging when MSAL raises it while the
                application or token request is being set up.
            SourceConnectionNetworkError: when the returned payload contains
                an ``error`` key.
        """
        from msal import ConfidentialClientApplication

        try:
            app = ConfidentialClientApplication(
                authority=f"{self.authority_url}/{self.tenant}",
                client_id=self.client_id,
                client_credential=self.access_config.client_cred,
            )
            token = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])
        except ValueError:
            logger.error("Couldn't set up credentials for OneDrive")
            # Bare raise keeps the original traceback untouched.
            raise
        if "error" in token:
            # Use .get() so a payload without a description still surfaces the
            # real authentication error instead of failing with a KeyError.
            raise SourceConnectionNetworkError(
                "failed to fetch token, {}: {}".format(
                    token["error"],
                    token.get("error_description", "no error description provided"),
                )
            )
        return token

    @requires_dependencies(["office365"], extras="onedrive")
    def get_client(self) -> "GraphClient":
        """Build a ``GraphClient``, handing it the bound ``get_token`` method."""
        from office365.graph_client import GraphClient

        return GraphClient(self.get_token)
77
+
78
+
79
@dataclass
class OnedriveIndexerConfig(IndexerConfig):
    """Options selecting which part of the drive gets indexed."""

    # Folder path looked up below the drive root; an empty value keeps the root itself.
    path: Optional[str] = ""
    # When true, child folders are walked as well.
    recursive: bool = False
83
+
84
+
85
@dataclass
class OnedriveIndexer(Indexer):
    """Enumerate the files of a OneDrive user drive and describe each one as ``FileData``."""

    connection_config: OnedriveConnectionConfig
    index_config: OnedriveIndexerConfig

    def list_objects(self, folder, recursive) -> list["DriveItem"]:
        """Return the file items found under ``folder``.

        Args:
            folder: drive item whose children are queried.
            recursive: when true, every child folder is listed too.

        Returns:
            Children flagged ``is_file``; with ``recursive`` also those of all
            nested folders.
        """
        drive_items = folder.children.get().execute_query()
        files = [item for item in drive_items if item.is_file]
        if not recursive:
            return files
        for sub_folder in (item for item in drive_items if item.is_folder):
            files.extend(self.list_objects(sub_folder, recursive))
        return files

    def get_root(self, client: "GraphClient") -> "DriveItem":
        """Resolve the folder where indexing starts.

        Returns:
            The user's drive root, or the item at ``index_config.path`` when a
            path is configured.

        Raises:
            ValueError: the configured path resolved to nothing or to a non-folder.
        """
        root = client.users[self.connection_config.user_pname].drive.get().execute_query().root
        if fpath := self.index_config.path:
            root = root.get_by_path(fpath).get().execute_query()
            if root is None or not root.is_folder:
                raise ValueError(f"Unable to find directory, given: {fpath}")
        return root

    def get_properties(self, drive_item: "DriveItem") -> dict:
        """Return the subset of ``drive_item.properties`` that ``json.dumps`` accepts."""
        filtered_properties = {}
        for key, value in drive_item.properties.items():
            try:
                json.dumps(value)
            except TypeError:
                # Deliberate best-effort filter: values json cannot encode are skipped.
                continue
            filtered_properties[key] = value
        return filtered_properties

    def drive_item_to_file_data(self, drive_item: "DriveItem") -> FileData:
        """Convert a drive item into the ``FileData`` record handed to later steps.

        The server path is the parent reference path (text after the last
        ``:``, without one leading slash) joined with the item name. The
        relative path is that server path with the configured index path
        removed from its front.
        """
        file_path = drive_item.parent_reference.path.split(":")[-1]
        file_path = file_path[1:] if file_path and file_path[0] == "/" else file_path
        filename = drive_item.name
        server_path = file_path + "/" + filename

        # Strip the index path only as a leading path prefix. A plain
        # str.replace() would also delete later occurrences of the same text
        # (e.g. index path "a" inside "a/data/a.txt") and fails on None.
        index_path = (self.index_config.path or "").strip("/")
        rel_path = server_path.lstrip("/")
        if index_path and (rel_path == index_path or rel_path.startswith(index_path + "/")):
            rel_path = rel_path[len(index_path):]
        rel_path = rel_path.lstrip("/")

        date_modified_dt = (
            parser.parse(drive_item.last_modified_datetime)
            if drive_item.last_modified_datetime
            else None
        )
        date_created_at = (
            parser.parse(drive_item.created_datetime) if drive_item.created_datetime else None
        )
        return FileData(
            identifier=drive_item.id,
            connector_type=CONNECTOR_TYPE,
            source_identifiers=SourceIdentifiers(
                fullpath=server_path, filename=drive_item.name, rel_path=rel_path
            ),
            metadata=DataSourceMetadata(
                url=drive_item.parent_reference.path + "/" + drive_item.name,
                version=drive_item.etag,
                date_modified=str(date_modified_dt.timestamp()) if date_modified_dt else None,
                # Gate on the created timestamp itself, not on the modified one.
                date_created=str(date_created_at.timestamp()) if date_created_at else None,
                date_processed=str(time()),
                record_locator={
                    "user_pname": self.connection_config.user_pname,
                    "server_relative_path": server_path,
                },
            ),
            additional_metadata=self.get_properties(drive_item=drive_item),
        )

    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
        """Yield one ``FileData`` per file found under the configured root."""
        client = self.connection_config.get_client()
        root = self.get_root(client=client)
        drive_items = self.list_objects(folder=root, recursive=self.index_config.recursive)
        for drive_item in drive_items:
            yield self.drive_item_to_file_data(drive_item=drive_item)
160
+
161
+
162
@dataclass
class OnedriveDownloaderConfig(DownloaderConfig):
    """Downloader settings for OneDrive; declares nothing beyond ``DownloaderConfig``."""
165
+
166
+
167
@dataclass
class OnedriveDownloader(Downloader):
    """Download the OneDrive file described by a ``FileData`` record to local disk."""

    connection_config: OnedriveConnectionConfig
    download_config: OnedriveDownloaderConfig

    @SourceConnectionNetworkError.wrap
    def _fetch_file(self, file_data: FileData):
        """Look up the drive item that ``file_data.source_identifiers.fullpath`` names.

        Raises ``ValueError`` when the identifiers or the full path are missing
        and ``FileNotFoundError`` when the lookup result is falsy.
        NOTE(review): the ``wrap`` decorator may translate these - confirm.
        """
        identifiers = file_data.source_identifiers
        if identifiers is None or not identifiers.fullpath:
            raise ValueError(
                f"file data doesn't have enough information to get "
                f"file content: {file_data.to_dict()}"
            )

        server_relative_path = identifiers.fullpath
        client = self.connection_config.get_client()
        drive = client.users[self.connection_config.user_pname].drive.get().execute_query()
        file = drive.root.get_by_path(server_relative_path).get().execute_query()
        if not file:
            raise FileNotFoundError(f"file not found: {server_relative_path}")
        return file

    def get_download_path(self, file_data: FileData) -> Optional[Path]:
        """Return the local target path: ``download_dir`` joined with the relative path."""
        rel_path = file_data.source_identifiers.relative_path
        if rel_path.startswith("/"):
            # Drop one leading slash so the join stays inside download_dir.
            rel_path = rel_path[1:]
        return self.download_dir / Path(rel_path)

    @SourceConnectionError.wrap
    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
        """Fetch the remote file and write it to the download path.

        Files whose ``size`` property exceeds ``MAX_MB_SIZE`` are written
        through a chunked download session; everything else is downloaded in
        a single call.
        """
        file = self._fetch_file(file_data=file_data)
        fsize = file.get_property("size", 0)
        download_path = self.get_download_path(file_data=file_data)
        download_path.parent.mkdir(parents=True, exist_ok=True)
        logger.info(f"Downloading {file_data.source_identifiers.fullpath} to {download_path}")
        use_chunks = fsize > MAX_MB_SIZE
        if use_chunks:
            logger.info(f"Downloading file with size: {fsize} bytes in chunks")
        with download_path.open(mode="wb") as f:
            if use_chunks:
                file.download_session(f, chunk_size=1024 * 1024 * 100).execute_query()
            else:
                file.download(f).execute_query()
        return DownloadResponse(file_data=file_data, path=download_path)
208
+
209
+
210
# Source registry entry bundling the OneDrive classes defined in this module.
onedrive_source_entry = SourceRegistryEntry(
    indexer=OnedriveIndexer,
    indexer_config=OnedriveIndexerConfig,
    downloader=OnedriveDownloader,
    downloader_config=OnedriveDownloaderConfig,
    connection_config=OnedriveConnectionConfig,
)
@@ -0,0 +1,155 @@
1
+ from dataclasses import dataclass, field
2
+ from typing import TYPE_CHECKING, Optional
3
+
4
+ from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin, enhanced_field
5
+ from unstructured_ingest.error import (
6
+ DestinationConnectionError,
7
+ )
8
+ from unstructured_ingest.utils.dep_check import requires_dependencies
9
+ from unstructured_ingest.v2.interfaces import (
10
+ AccessConfig,
11
+ ConnectionConfig,
12
+ )
13
+ from unstructured_ingest.v2.logger import logger
14
+ from unstructured_ingest.v2.processes.connector_registry import (
15
+ DestinationRegistryEntry,
16
+ SourceRegistryEntry,
17
+ )
18
+ from unstructured_ingest.v2.processes.connectors.elasticsearch import (
19
+ ElasticsearchDownloader,
20
+ ElasticsearchDownloaderConfig,
21
+ ElasticsearchIndexer,
22
+ ElasticsearchIndexerConfig,
23
+ ElasticsearchUploader,
24
+ ElasticsearchUploaderConfig,
25
+ ElasticsearchUploadStager,
26
+ ElasticsearchUploadStagerConfig,
27
+ )
28
+
29
+ if TYPE_CHECKING:
30
+ from opensearchpy import OpenSearch
31
+
32
# Since the actual OpenSearch project is a fork of Elasticsearch, we are relying
# heavily on the Elasticsearch connector code, inheriting the functionality as much as possible.
CONNECTOR_TYPE = "opensearch"
36
+
37
+
38
@dataclass
class OpenSearchAccessConfig(AccessConfig):
    """Credential and TLS-related settings used when building the OpenSearch client."""

    # Flagged as sensitive; combined with the connection config's username
    # to form the client's HTTP auth pair.
    password: Optional[str] = enhanced_field(default=None, sensitive=True)

    # Boolean TLS switches; each one defaults to False.
    use_ssl: bool = False
    verify_certs: bool = False
    ssl_show_warn: bool = False

    # Optional certificate-related values; each one defaults to None.
    ca_certs: Optional[str] = None
    client_cert: Optional[str] = None
    client_key: Optional[str] = None
47
+
48
+
49
@dataclass
class OpenSearchClientInput(EnhancedDataClassJsonMixin):
    """Container for the values that get serialized into OpenSearch client kwargs."""

    # (username, password) pair; flagged as sensitive.
    http_auth: Optional[tuple[str, str]] = enhanced_field(sensitive=True, default=None)
    hosts: Optional[list[str]] = None

    # Boolean TLS switches; each one defaults to False.
    use_ssl: bool = False
    verify_certs: bool = False
    ssl_show_warn: bool = False

    # Optional certificate-related values; each one defaults to None.
    ca_certs: Optional[str] = None
    client_cert: Optional[str] = None
    client_key: Optional[str] = None
59
+
60
+
61
@dataclass
class OpenSearchConnectionConfig(ConnectionConfig):
    """Connection settings for OpenSearch, plus helpers that build the client."""

    hosts: Optional[list[str]] = None
    username: Optional[str] = None
    access_config: OpenSearchAccessConfig = enhanced_field(sensitive=True)

    def get_client_kwargs(self) -> dict:
        """Translate this config into keyword arguments for the OpenSearch client.

        Only truthy settings replace the ``OpenSearchClientInput`` defaults, and
        any entry whose final value is ``None`` is left out of the result.
        """
        # Update auth related fields to conform to what the SDK expects based on the
        # supported methods:
        # https://github.com/opensearch-project/opensearch-py/blob/main/opensearchpy/client/__init__.py
        access = self.access_config
        client_input = OpenSearchClientInput()
        if self.hosts:
            client_input.hosts = self.hosts

        # These settings share a name on the access config and on the client input.
        mirrored_options = (
            "use_ssl",
            "verify_certs",
            "ssl_show_warn",
            "ca_certs",
            "client_cert",
            "client_key",
        )
        for option in mirrored_options:
            setting = getattr(access, option)
            if setting:
                setattr(client_input, option, setting)

        # HTTP auth is only configured when both halves of the pair are present.
        if self.username and access.password:
            client_input.http_auth = (self.username, access.password)
        logger.debug(
            f"OpenSearch client inputs mapped to: {client_input.to_dict(redact_sensitive=True)}"
        )
        populated = client_input.to_dict(redact_sensitive=False)
        return {key: value for key, value in populated.items() if value is not None}

    @DestinationConnectionError.wrap
    @requires_dependencies(["opensearchpy"], extras="opensearch")
    def get_client(self) -> "OpenSearch":
        """Instantiate ``opensearchpy.OpenSearch`` using ``get_client_kwargs()``."""
        from opensearchpy import OpenSearch

        client_kwargs = self.get_client_kwargs()
        return OpenSearch(**client_kwargs)
101
+
102
+
103
@dataclass
class OpenSearchIndexer(ElasticsearchIndexer):
    """Elasticsearch indexer subclass bound to an OpenSearch connection config."""

    connection_config: OpenSearchConnectionConfig
    client: "OpenSearch" = field(init=False)

    @requires_dependencies(["opensearchpy"], extras="opensearch")
    def load_scan(self):
        """Return the ``scan`` helper provided by ``opensearchpy.helpers``."""
        from opensearchpy import helpers

        return helpers.scan
113
+
114
+
115
@dataclass
class OpenSearchDownloader(ElasticsearchDownloader):
    """Elasticsearch downloader subclass bound to an OpenSearch connection config."""

    connection_config: OpenSearchConnectionConfig
    connector_type: str = CONNECTOR_TYPE

    @requires_dependencies(["opensearchpy"], extras="opensearch")
    def load_async(self):
        """Return the ``(AsyncOpenSearch, async_scan)`` pair from ``opensearchpy``."""
        from opensearchpy import AsyncOpenSearch, helpers

        return AsyncOpenSearch, helpers.async_scan
126
+
127
+
128
@dataclass
class OpenSearchUploader(ElasticsearchUploader):
    """Elasticsearch uploader subclass bound to an OpenSearch connection config."""

    connection_config: OpenSearchConnectionConfig
    connector_type: str = CONNECTOR_TYPE

    @requires_dependencies(["opensearchpy"], extras="opensearch")
    def load_parallel_bulk(self):
        """Return the ``parallel_bulk`` helper provided by ``opensearchpy.helpers``."""
        from opensearchpy import helpers

        return helpers.parallel_bulk
138
+
139
+
140
# Source registry entry: OpenSearch-specific connection, indexer and downloader
# classes combined with the Elasticsearch indexer/downloader config classes.
opensearch_source_entry = SourceRegistryEntry(
    connection_config=OpenSearchConnectionConfig,
    indexer_config=ElasticsearchIndexerConfig,
    indexer=OpenSearchIndexer,
    downloader_config=ElasticsearchDownloaderConfig,
    downloader=OpenSearchDownloader,
)
147
+
148
+
149
# Destination registry entry: the OpenSearch connection config and uploader
# combined with the Elasticsearch upload stager and config classes.
opensearch_destination_entry = DestinationRegistryEntry(
    connection_config=OpenSearchConnectionConfig,
    upload_stager=ElasticsearchUploadStager,
    upload_stager_config=ElasticsearchUploadStagerConfig,
    uploader=OpenSearchUploader,
    uploader_config=ElasticsearchUploaderConfig,
)
@@ -0,0 +1,178 @@
1
+ import json
2
+ import multiprocessing as mp
3
+ import uuid
4
+ from dataclasses import dataclass, field
5
+ from pathlib import Path
6
+ from typing import TYPE_CHECKING, Any, Optional
7
+
8
+ from unstructured.ingest.v2.logger import logger
9
+ from unstructured.ingest.v2.processes.connector_registry import (
10
+ DestinationRegistryEntry,
11
+ )
12
+ from unstructured.staging.base import flatten_dict
13
+ from unstructured.utils import requires_dependencies
14
+
15
+ from unstructured_ingest.enhanced_dataclass import enhanced_field
16
+ from unstructured_ingest.error import DestinationConnectionError
17
+ from unstructured_ingest.utils.data_prep import batch_generator
18
+ from unstructured_ingest.v2.interfaces import (
19
+ AccessConfig,
20
+ ConnectionConfig,
21
+ UploadContent,
22
+ Uploader,
23
+ UploaderConfig,
24
+ UploadStager,
25
+ UploadStagerConfig,
26
+ )
27
+
28
+ if TYPE_CHECKING:
29
+ from pinecone import Index as PineconeIndex
30
+
31
+
32
# Connector identifier; used as the default ``connector_type`` of PineconeUploader.
CONNECTOR_TYPE: str = "pinecone"
33
+
34
+
35
@dataclass
class PineconeAccessConfig(AccessConfig):
    """Access settings for Pinecone."""

    # API key passed to the Pinecone client; declared with the overload name
    # "pinecone_api_key".
    api_key: Optional[str] = enhanced_field(default=None, overload_name="pinecone_api_key")
38
+
39
+
40
@dataclass
class PineconeConnectionConfig(ConnectionConfig):
    """Names the target Pinecone index and carries the access config used to reach it."""

    index_name: str
    environment: str
    access_config: PineconeAccessConfig = enhanced_field(sensitive=True)

    @requires_dependencies(["pinecone"], extras="pinecone")
    def get_index(self) -> "PineconeIndex":
        """Create a Pinecone client and return its handle for ``index_name``.

        The client is tagged with the installed ``unstructured`` version, and the
        index description is requested and logged at debug level on every call.
        """
        from pinecone import Pinecone
        from unstructured import __version__ as unstructured_version

        source_tag = f"unstructured=={unstructured_version}"
        pc = Pinecone(api_key=self.access_config.api_key, source_tag=source_tag)

        index_handle = pc.Index(self.index_name)
        logger.debug(f"Connected to index: {pc.describe_index(self.index_name)}")
        return index_handle
59
+
60
+
61
@dataclass
class PineconeUploadStagerConfig(UploadStagerConfig):
    """Stager configuration for Pinecone; declares no fields of its own."""
64
+
65
+
66
@dataclass
class PineconeUploaderConfig(UploaderConfig):
    """Tuning knobs for the Pinecone uploader."""

    # Number of records handed to each upsert call.
    batch_size: int = 100
    # Size of the multiprocessing pool; a value of 1 uploads in the current process.
    num_of_processes: int = 4
70
+
71
+
72
@dataclass
class PineconeUploadStager(UploadStager):
    """Rewrites an elements JSON file into the record layout uploaded to Pinecone."""

    upload_stager_config: PineconeUploadStagerConfig = field(
        default_factory=PineconeUploadStagerConfig
    )

    @staticmethod
    def conform_dict(element_dict: dict) -> dict:
        """Convert one element dict into an ``id`` / ``values`` / ``metadata`` record.

        ``embeddings`` and ``text`` are popped from ``element_dict``, so the dict
        passed in is modified by this call.
        """
        # While flatten_dict enables indexing on various fields,
        # element_serialized enables easily reloading the element object to memory.
        # element_serialized is formed without text/embeddings to avoid data bloating.
        record_id = str(uuid.uuid4())
        embeddings = element_dict.pop("embeddings", None)
        text = element_dict.pop("text", None)

        metadata = {
            "text": text,
            "element_serialized": json.dumps(element_dict),
        }
        # Flattened fields are merged last, so they win on any key collision.
        metadata.update(
            flatten_dict(
                element_dict,
                separator="-",
                flatten_lists=True,
                remove_none=True,
            )
        )
        return {"id": record_id, "values": embeddings, "metadata": metadata}

    def run(
        self,
        elements_filepath: Path,
        output_dir: Path,
        output_filename: str,
        **kwargs: Any,
    ) -> Path:
        """Conform every element in ``elements_filepath`` and write the result.

        The records are written as JSON to ``<output_dir>/<output_filename>.json``
        (parent directories are created as needed) and that path is returned.
        """
        with open(elements_filepath) as elements_file:
            elements_contents = json.load(elements_file)

        conformed_elements = []
        for element in elements_contents:
            conformed_elements.append(self.conform_dict(element_dict=element))

        output_path = Path(output_dir) / f"{output_filename}.json"
        output_path.parent.mkdir(parents=True, exist_ok=True)

        with output_path.open("w") as output_file:
            json.dump(conformed_elements, output_file)
        return output_path
118
+
119
+
120
@dataclass
class PineconeUploader(Uploader):
    """Uploads staged records to the configured Pinecone index in batches."""

    upload_config: PineconeUploaderConfig
    connection_config: PineconeConnectionConfig
    connector_type: str = CONNECTOR_TYPE

    @DestinationConnectionError.wrap
    def check_connection(self):
        """Request the index once; the result is discarded."""
        self.connection_config.get_index()

    @requires_dependencies(["pinecone"], extras="pinecone")
    def upsert_batch(self, batch):
        """Upsert one batch of records into the index.

        Raises:
            DestinationConnectionError: when a ``PineconeApiException`` is raised
                while obtaining the index or upserting.
        """
        from pinecone.exceptions import PineconeApiException

        try:
            response = self.connection_config.get_index().upsert(batch)
        except PineconeApiException as api_error:
            raise DestinationConnectionError(f"http error: {api_error}") from api_error
        logger.debug(f"results: {response}")

    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
        """Load every staged JSON file in ``contents`` and upsert the records.

        With ``num_of_processes == 1`` batches are upserted one after another in
        this process; otherwise they are mapped over a multiprocessing pool.
        """
        elements_dict = []
        for content in contents:
            with open(content.path) as elements_file:
                elements_dict.extend(json.load(elements_file))

        logger.info(
            f"writing document batches to destination"
            f" index named {self.connection_config.index_name}"
            f" environment named {self.connection_config.environment}"
            f" with batch size {self.upload_config.batch_size}"
            f" with {self.upload_config.num_of_processes} (number of) processes"
        )

        pinecone_batch_size = self.upload_config.batch_size
        process_count = self.upload_config.num_of_processes

        if process_count == 1:
            for batch in batch_generator(elements_dict, pinecone_batch_size):
                self.upsert_batch(batch)
            return

        with mp.Pool(processes=process_count) as pool:
            # Batches are materialized into a list before being handed to the pool.
            batches = list(batch_generator(elements_dict, pinecone_batch_size))
            pool.map(self.upsert_batch, batches)
170
+
171
+
172
# Destination registry entry bundling the Pinecone connection config with its
# upload stager and uploader classes and their respective configs.
pinecone_destination_entry = DestinationRegistryEntry(
    connection_config=PineconeConnectionConfig,
    upload_stager=PineconeUploadStager,
    upload_stager_config=PineconeUploadStagerConfig,
    uploader=PineconeUploader,
    uploader_config=PineconeUploaderConfig,
)