unstructured-ingest 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (356)
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/__init__.py +14 -0
  4. unstructured_ingest/cli/base/__init__.py +0 -0
  5. unstructured_ingest/cli/base/cmd.py +19 -0
  6. unstructured_ingest/cli/base/dest.py +87 -0
  7. unstructured_ingest/cli/base/src.py +57 -0
  8. unstructured_ingest/cli/cli.py +32 -0
  9. unstructured_ingest/cli/cmd_factory.py +12 -0
  10. unstructured_ingest/cli/cmds/__init__.py +145 -0
  11. unstructured_ingest/cli/cmds/airtable.py +69 -0
  12. unstructured_ingest/cli/cmds/astra.py +99 -0
  13. unstructured_ingest/cli/cmds/azure_cognitive_search.py +65 -0
  14. unstructured_ingest/cli/cmds/biomed.py +52 -0
  15. unstructured_ingest/cli/cmds/chroma.py +104 -0
  16. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  17. unstructured_ingest/cli/cmds/confluence.py +69 -0
  18. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  19. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  20. unstructured_ingest/cli/cmds/discord.py +47 -0
  21. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  22. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  23. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  24. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  25. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  26. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  27. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  28. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  29. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  30. unstructured_ingest/cli/cmds/github.py +54 -0
  31. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  32. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  33. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  34. unstructured_ingest/cli/cmds/jira.py +71 -0
  35. unstructured_ingest/cli/cmds/kafka.py +102 -0
  36. unstructured_ingest/cli/cmds/local.py +43 -0
  37. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  38. unstructured_ingest/cli/cmds/notion.py +48 -0
  39. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  40. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  41. unstructured_ingest/cli/cmds/outlook.py +67 -0
  42. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  43. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  44. unstructured_ingest/cli/cmds/reddit.py +67 -0
  45. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  46. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  47. unstructured_ingest/cli/cmds/slack.py +56 -0
  48. unstructured_ingest/cli/cmds/sql.py +66 -0
  49. unstructured_ingest/cli/cmds/vectara.py +66 -0
  50. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  51. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  52. unstructured_ingest/cli/common.py +7 -0
  53. unstructured_ingest/cli/interfaces.py +656 -0
  54. unstructured_ingest/cli/utils.py +205 -0
  55. unstructured_ingest/connector/__init__.py +0 -0
  56. unstructured_ingest/connector/airtable.py +309 -0
  57. unstructured_ingest/connector/astra.py +237 -0
  58. unstructured_ingest/connector/azure_cognitive_search.py +144 -0
  59. unstructured_ingest/connector/biomed.py +313 -0
  60. unstructured_ingest/connector/chroma.py +158 -0
  61. unstructured_ingest/connector/clarifai.py +122 -0
  62. unstructured_ingest/connector/confluence.py +285 -0
  63. unstructured_ingest/connector/databricks_volumes.py +137 -0
  64. unstructured_ingest/connector/delta_table.py +203 -0
  65. unstructured_ingest/connector/discord.py +180 -0
  66. unstructured_ingest/connector/elasticsearch.py +396 -0
  67. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  68. unstructured_ingest/connector/fsspec/azure.py +78 -0
  69. unstructured_ingest/connector/fsspec/box.py +109 -0
  70. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  71. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  72. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  73. unstructured_ingest/connector/fsspec/s3.py +62 -0
  74. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  75. unstructured_ingest/connector/git.py +124 -0
  76. unstructured_ingest/connector/github.py +173 -0
  77. unstructured_ingest/connector/gitlab.py +142 -0
  78. unstructured_ingest/connector/google_drive.py +349 -0
  79. unstructured_ingest/connector/hubspot.py +278 -0
  80. unstructured_ingest/connector/jira.py +469 -0
  81. unstructured_ingest/connector/kafka.py +294 -0
  82. unstructured_ingest/connector/local.py +139 -0
  83. unstructured_ingest/connector/mongodb.py +285 -0
  84. unstructured_ingest/connector/notion/__init__.py +0 -0
  85. unstructured_ingest/connector/notion/client.py +233 -0
  86. unstructured_ingest/connector/notion/connector.py +468 -0
  87. unstructured_ingest/connector/notion/helpers.py +584 -0
  88. unstructured_ingest/connector/notion/interfaces.py +32 -0
  89. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  90. unstructured_ingest/connector/notion/types/block.py +95 -0
  91. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  92. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  93. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  94. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  95. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  96. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  97. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  98. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  99. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  100. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  101. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  102. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  103. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  104. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  105. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  106. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  107. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  108. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  109. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  110. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  111. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  112. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  113. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  114. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  115. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  116. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  117. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  118. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  119. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  120. unstructured_ingest/connector/notion/types/database.py +72 -0
  121. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  122. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  123. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  124. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  125. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  126. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  127. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  128. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  129. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  130. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  131. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  132. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  133. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  134. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  135. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  136. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  137. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  138. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  139. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  140. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  141. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  142. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  143. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  144. unstructured_ingest/connector/notion/types/date.py +26 -0
  145. unstructured_ingest/connector/notion/types/file.py +51 -0
  146. unstructured_ingest/connector/notion/types/page.py +44 -0
  147. unstructured_ingest/connector/notion/types/parent.py +66 -0
  148. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  149. unstructured_ingest/connector/notion/types/user.py +76 -0
  150. unstructured_ingest/connector/onedrive.py +232 -0
  151. unstructured_ingest/connector/opensearch.py +218 -0
  152. unstructured_ingest/connector/outlook.py +285 -0
  153. unstructured_ingest/connector/pinecone.py +140 -0
  154. unstructured_ingest/connector/qdrant.py +144 -0
  155. unstructured_ingest/connector/reddit.py +166 -0
  156. unstructured_ingest/connector/registry.py +109 -0
  157. unstructured_ingest/connector/salesforce.py +301 -0
  158. unstructured_ingest/connector/sharepoint.py +573 -0
  159. unstructured_ingest/connector/slack.py +224 -0
  160. unstructured_ingest/connector/sql.py +199 -0
  161. unstructured_ingest/connector/vectara.py +248 -0
  162. unstructured_ingest/connector/weaviate.py +190 -0
  163. unstructured_ingest/connector/wikipedia.py +208 -0
  164. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  165. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  166. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  167. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  168. unstructured_ingest/error.py +49 -0
  169. unstructured_ingest/evaluate.py +338 -0
  170. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  171. unstructured_ingest/ingest_backoff/_common.py +102 -0
  172. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  173. unstructured_ingest/interfaces.py +838 -0
  174. unstructured_ingest/logger.py +130 -0
  175. unstructured_ingest/main.py +11 -0
  176. unstructured_ingest/pipeline/__init__.py +22 -0
  177. unstructured_ingest/pipeline/copy.py +19 -0
  178. unstructured_ingest/pipeline/doc_factory.py +12 -0
  179. unstructured_ingest/pipeline/interfaces.py +265 -0
  180. unstructured_ingest/pipeline/partition.py +60 -0
  181. unstructured_ingest/pipeline/permissions.py +12 -0
  182. unstructured_ingest/pipeline/pipeline.py +117 -0
  183. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  184. unstructured_ingest/pipeline/reformat/chunking.py +130 -0
  185. unstructured_ingest/pipeline/reformat/embedding.py +66 -0
  186. unstructured_ingest/pipeline/source.py +77 -0
  187. unstructured_ingest/pipeline/utils.py +6 -0
  188. unstructured_ingest/pipeline/write.py +18 -0
  189. unstructured_ingest/processor.py +93 -0
  190. unstructured_ingest/runner/__init__.py +104 -0
  191. unstructured_ingest/runner/airtable.py +35 -0
  192. unstructured_ingest/runner/astra.py +34 -0
  193. unstructured_ingest/runner/base_runner.py +89 -0
  194. unstructured_ingest/runner/biomed.py +45 -0
  195. unstructured_ingest/runner/confluence.py +35 -0
  196. unstructured_ingest/runner/delta_table.py +34 -0
  197. unstructured_ingest/runner/discord.py +35 -0
  198. unstructured_ingest/runner/elasticsearch.py +40 -0
  199. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  200. unstructured_ingest/runner/fsspec/azure.py +30 -0
  201. unstructured_ingest/runner/fsspec/box.py +28 -0
  202. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  203. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  204. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  205. unstructured_ingest/runner/fsspec/s3.py +28 -0
  206. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  207. unstructured_ingest/runner/github.py +37 -0
  208. unstructured_ingest/runner/gitlab.py +37 -0
  209. unstructured_ingest/runner/google_drive.py +35 -0
  210. unstructured_ingest/runner/hubspot.py +35 -0
  211. unstructured_ingest/runner/jira.py +35 -0
  212. unstructured_ingest/runner/kafka.py +34 -0
  213. unstructured_ingest/runner/local.py +23 -0
  214. unstructured_ingest/runner/mongodb.py +34 -0
  215. unstructured_ingest/runner/notion.py +61 -0
  216. unstructured_ingest/runner/onedrive.py +35 -0
  217. unstructured_ingest/runner/opensearch.py +40 -0
  218. unstructured_ingest/runner/outlook.py +33 -0
  219. unstructured_ingest/runner/reddit.py +35 -0
  220. unstructured_ingest/runner/salesforce.py +33 -0
  221. unstructured_ingest/runner/sharepoint.py +35 -0
  222. unstructured_ingest/runner/slack.py +33 -0
  223. unstructured_ingest/runner/utils.py +47 -0
  224. unstructured_ingest/runner/wikipedia.py +35 -0
  225. unstructured_ingest/runner/writers/__init__.py +48 -0
  226. unstructured_ingest/runner/writers/astra.py +22 -0
  227. unstructured_ingest/runner/writers/azure_cognitive_search.py +24 -0
  228. unstructured_ingest/runner/writers/base_writer.py +26 -0
  229. unstructured_ingest/runner/writers/chroma.py +22 -0
  230. unstructured_ingest/runner/writers/clarifai.py +19 -0
  231. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  232. unstructured_ingest/runner/writers/delta_table.py +24 -0
  233. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  234. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  235. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  236. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  237. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  238. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  239. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  240. unstructured_ingest/runner/writers/kafka.py +21 -0
  241. unstructured_ingest/runner/writers/mongodb.py +21 -0
  242. unstructured_ingest/runner/writers/opensearch.py +26 -0
  243. unstructured_ingest/runner/writers/pinecone.py +21 -0
  244. unstructured_ingest/runner/writers/qdrant.py +19 -0
  245. unstructured_ingest/runner/writers/sql.py +22 -0
  246. unstructured_ingest/runner/writers/vectara.py +22 -0
  247. unstructured_ingest/runner/writers/weaviate.py +21 -0
  248. unstructured_ingest/utils/__init__.py +0 -0
  249. unstructured_ingest/utils/compression.py +117 -0
  250. unstructured_ingest/utils/data_prep.py +112 -0
  251. unstructured_ingest/utils/dep_check.py +66 -0
  252. unstructured_ingest/utils/string_and_date_utils.py +39 -0
  253. unstructured_ingest/utils/table.py +73 -0
  254. unstructured_ingest/v2/__init__.py +1 -0
  255. unstructured_ingest/v2/cli/__init__.py +0 -0
  256. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  257. unstructured_ingest/v2/cli/base/cmd.py +215 -0
  258. unstructured_ingest/v2/cli/base/dest.py +76 -0
  259. unstructured_ingest/v2/cli/base/importer.py +34 -0
  260. unstructured_ingest/v2/cli/base/src.py +70 -0
  261. unstructured_ingest/v2/cli/cli.py +24 -0
  262. unstructured_ingest/v2/cli/cmds/__init__.py +87 -0
  263. unstructured_ingest/v2/cli/cmds/astra.py +85 -0
  264. unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +72 -0
  265. unstructured_ingest/v2/cli/cmds/chroma.py +108 -0
  266. unstructured_ingest/v2/cli/cmds/databricks_volumes.py +161 -0
  267. unstructured_ingest/v2/cli/cmds/elasticsearch.py +159 -0
  268. unstructured_ingest/v2/cli/cmds/fsspec/__init__.py +0 -0
  269. unstructured_ingest/v2/cli/cmds/fsspec/azure.py +84 -0
  270. unstructured_ingest/v2/cli/cmds/fsspec/box.py +58 -0
  271. unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +58 -0
  272. unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +77 -0
  273. unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +81 -0
  274. unstructured_ingest/v2/cli/cmds/fsspec/s3.py +84 -0
  275. unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +80 -0
  276. unstructured_ingest/v2/cli/cmds/google_drive.py +74 -0
  277. unstructured_ingest/v2/cli/cmds/local.py +60 -0
  278. unstructured_ingest/v2/cli/cmds/mongodb.py +62 -0
  279. unstructured_ingest/v2/cli/cmds/onedrive.py +91 -0
  280. unstructured_ingest/v2/cli/cmds/opensearch.py +93 -0
  281. unstructured_ingest/v2/cli/cmds/pinecone.py +62 -0
  282. unstructured_ingest/v2/cli/cmds/salesforce.py +79 -0
  283. unstructured_ingest/v2/cli/cmds/sharepoint.py +112 -0
  284. unstructured_ingest/v2/cli/cmds/singlestore.py +96 -0
  285. unstructured_ingest/v2/cli/cmds/sql.py +84 -0
  286. unstructured_ingest/v2/cli/cmds/weaviate.py +100 -0
  287. unstructured_ingest/v2/cli/configs/__init__.py +6 -0
  288. unstructured_ingest/v2/cli/configs/chunk.py +89 -0
  289. unstructured_ingest/v2/cli/configs/embed.py +74 -0
  290. unstructured_ingest/v2/cli/configs/partition.py +99 -0
  291. unstructured_ingest/v2/cli/configs/processor.py +88 -0
  292. unstructured_ingest/v2/cli/interfaces.py +27 -0
  293. unstructured_ingest/v2/cli/utils.py +240 -0
  294. unstructured_ingest/v2/example.py +37 -0
  295. unstructured_ingest/v2/interfaces/__init__.py +29 -0
  296. unstructured_ingest/v2/interfaces/connector.py +32 -0
  297. unstructured_ingest/v2/interfaces/downloader.py +79 -0
  298. unstructured_ingest/v2/interfaces/file_data.py +49 -0
  299. unstructured_ingest/v2/interfaces/indexer.py +28 -0
  300. unstructured_ingest/v2/interfaces/process.py +20 -0
  301. unstructured_ingest/v2/interfaces/processor.py +48 -0
  302. unstructured_ingest/v2/interfaces/upload_stager.py +48 -0
  303. unstructured_ingest/v2/interfaces/uploader.py +39 -0
  304. unstructured_ingest/v2/logger.py +126 -0
  305. unstructured_ingest/v2/main.py +11 -0
  306. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  307. unstructured_ingest/v2/pipeline/interfaces.py +167 -0
  308. unstructured_ingest/v2/pipeline/pipeline.py +284 -0
  309. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  310. unstructured_ingest/v2/pipeline/steps/chunk.py +85 -0
  311. unstructured_ingest/v2/pipeline/steps/download.py +124 -0
  312. unstructured_ingest/v2/pipeline/steps/embed.py +84 -0
  313. unstructured_ingest/v2/pipeline/steps/index.py +61 -0
  314. unstructured_ingest/v2/pipeline/steps/partition.py +78 -0
  315. unstructured_ingest/v2/pipeline/steps/stage.py +64 -0
  316. unstructured_ingest/v2/pipeline/steps/uncompress.py +68 -0
  317. unstructured_ingest/v2/pipeline/steps/upload.py +73 -0
  318. unstructured_ingest/v2/pipeline/utils.py +15 -0
  319. unstructured_ingest/v2/processes/__init__.py +0 -0
  320. unstructured_ingest/v2/processes/chunker.py +97 -0
  321. unstructured_ingest/v2/processes/connector_registry.py +63 -0
  322. unstructured_ingest/v2/processes/connectors/__init__.py +77 -0
  323. unstructured_ingest/v2/processes/connectors/astra.py +152 -0
  324. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +211 -0
  325. unstructured_ingest/v2/processes/connectors/chroma.py +204 -0
  326. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +96 -0
  327. unstructured_ingest/v2/processes/connectors/elasticsearch.py +401 -0
  328. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  329. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +144 -0
  330. unstructured_ingest/v2/processes/connectors/fsspec/box.py +131 -0
  331. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +130 -0
  332. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +342 -0
  333. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +141 -0
  334. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +164 -0
  335. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +166 -0
  336. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  337. unstructured_ingest/v2/processes/connectors/google_drive.py +335 -0
  338. unstructured_ingest/v2/processes/connectors/local.py +204 -0
  339. unstructured_ingest/v2/processes/connectors/mongodb.py +138 -0
  340. unstructured_ingest/v2/processes/connectors/onedrive.py +216 -0
  341. unstructured_ingest/v2/processes/connectors/opensearch.py +155 -0
  342. unstructured_ingest/v2/processes/connectors/pinecone.py +178 -0
  343. unstructured_ingest/v2/processes/connectors/salesforce.py +293 -0
  344. unstructured_ingest/v2/processes/connectors/sharepoint.py +412 -0
  345. unstructured_ingest/v2/processes/connectors/singlestore.py +160 -0
  346. unstructured_ingest/v2/processes/connectors/sql.py +269 -0
  347. unstructured_ingest/v2/processes/connectors/utils.py +19 -0
  348. unstructured_ingest/v2/processes/connectors/weaviate.py +235 -0
  349. unstructured_ingest/v2/processes/embedder.py +76 -0
  350. unstructured_ingest/v2/processes/partitioner.py +166 -0
  351. unstructured_ingest/v2/processes/uncompress.py +43 -0
  352. unstructured_ingest-0.0.0.dist-info/METADATA +319 -0
  353. unstructured_ingest-0.0.0.dist-info/RECORD +356 -0
  354. unstructured_ingest-0.0.0.dist-info/WHEEL +5 -0
  355. unstructured_ingest-0.0.0.dist-info/entry_points.txt +2 -0
  356. unstructured_ingest-0.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,189 @@
1
+ # https://developers.notion.com/reference/rich-text
2
+ from dataclasses import dataclass
3
+ from typing import Optional
4
+
5
+ from htmlBuilder.attributes import Href, Style
6
+ from htmlBuilder.tags import A, B, Code, Div, HtmlTag, I, S, Span, U
7
+ from htmlBuilder.tags import Text as HtmlText
8
+
9
+ from unstructured_ingest.connector.notion.interfaces import (
10
+ FromJSONMixin,
11
+ GetHTMLMixin,
12
+ )
13
+ from unstructured_ingest.connector.notion.types.date import Date
14
+ from unstructured_ingest.connector.notion.types.user import People
15
+
16
+
17
@dataclass
class Annotations(FromJSONMixin):
    """Styling flags and color carried by a rich-text object."""

    bold: bool
    code: bool
    italic: bool
    strikethrough: bool
    underline: bool
    color: str

    @classmethod
    def from_dict(cls, data: dict):
        """Build an instance by passing every key of ``data`` as a keyword argument."""
        kwargs = dict(data)
        return cls(**kwargs)
29
+
30
+
31
@dataclass
class Equation(FromJSONMixin, GetHTMLMixin):
    """Equation payload of a rich-text object, holding the expression string."""

    expression: str

    @classmethod
    def from_dict(cls, data: dict):
        """Create an ``Equation`` using the keys of ``data`` as constructor keywords."""
        kwargs = dict(data)
        return cls(**kwargs)

    def get_html(self) -> Optional[HtmlTag]:
        """Return the expression inside a ``Code`` tag, or ``None`` when it is empty."""
        if not self.expression:
            return None
        return Code([], self.expression)
41
+
42
+
43
@dataclass
class MentionDatabase(FromJSONMixin, GetHTMLMixin):
    """Database mention payload, identified by its ``id``."""

    id: str

    @classmethod
    def from_dict(cls, data: dict):
        """Create a ``MentionDatabase`` using the keys of ``data`` as constructor keywords."""
        kwargs = dict(data)
        return cls(**kwargs)

    def get_html(self) -> Optional[HtmlTag]:
        """Return the id inside a ``Div`` tag, or ``None`` when the id is empty."""
        if not self.id:
            return None
        return Div([], self.id)
53
+
54
+
55
@dataclass
class MentionLinkPreview(FromJSONMixin, GetHTMLMixin):
    """Link-preview mention payload, holding the previewed ``url``."""

    url: str

    @classmethod
    def from_dict(cls, data: dict):
        """Create a ``MentionLinkPreview`` using the keys of ``data`` as constructor keywords."""
        kwargs = dict(data)
        return cls(**kwargs)

    def get_html(self) -> Optional[HtmlTag]:
        """Return an anchor whose target and label are both the url, or ``None`` if empty."""
        if not self.url:
            return None
        target = Href(self.url)
        return A([target], self.url)
65
+
66
+
67
@dataclass
class MentionPage(FromJSONMixin, GetHTMLMixin):
    """Page mention payload, identified by its ``id``."""

    id: str

    @classmethod
    def from_dict(cls, data: dict):
        """Create a ``MentionPage`` using the keys of ``data`` as constructor keywords."""
        kwargs = dict(data)
        return cls(**kwargs)

    def get_html(self) -> Optional[HtmlTag]:
        """Return the id inside a ``Div`` tag, or ``None`` when the id is empty."""
        if not self.id:
            return None
        return Div([], self.id)
77
+
78
+
79
@dataclass
class MentionTemplate(FromJSONMixin):
    """Template mention payload of a rich-text mention.

    Both values are optional and default to ``None``, so an instance can be
    built from a payload that carries only one of them.
    """

    template_mention_date: Optional[str] = None
    template_mention_user: Optional[str] = None

    @classmethod
    def from_dict(cls, data: dict):
        """Build a ``MentionTemplate`` from the two known keys of ``data``.

        Args:
            data: Mapping that may contain ``template_mention_date`` and/or
                ``template_mention_user``. Any other key is ignored.

        Returns:
            A new instance; a value missing from ``data`` is stored as ``None``.
        """
        # Per the Notion rich-text reference, the payload holds a "type"
        # discriminator plus only ONE of the two values. Forwarding it as
        # ``cls(**data)`` would raise TypeError (unexpected "type" keyword and
        # a missing field), so pick the known keys explicitly.
        return cls(
            template_mention_date=data.get("template_mention_date"),
            template_mention_user=data.get("template_mention_user"),
        )
87
+
88
+
89
@dataclass
class Mention(FromJSONMixin, GetHTMLMixin):
    """Mention payload of a rich-text object.

    ``type`` names which of the optional attributes is populated; the
    remaining attributes keep their ``None`` default.
    """

    type: str
    database: Optional[MentionDatabase] = None
    date: Optional[Date] = None
    link_preview: Optional[MentionLinkPreview] = None
    page: Optional[MentionPage] = None
    template_mention: Optional[MentionTemplate] = None
    user: Optional[People] = None

    @classmethod
    def from_dict(cls, data: dict):
        """Build a ``Mention`` and parse the sub-payload stored under ``data["type"]``.

        A type that matches none of the known kinds yields an instance with
        only ``type`` set.
        """
        kind = data["type"]
        mention = cls(type=kind)
        # Each known kind is stored in ``data`` under its own name and is kept
        # on the attribute of the same name.
        parsers = (
            ("date", Date),
            ("database", MentionDatabase),
            ("link_preview", MentionLinkPreview),
            ("page", MentionPage),
            ("template_mention", MentionTemplate),
            ("user", People),
        )
        for key, parser in parsers:
            if kind == key:
                setattr(mention, key, parser.from_dict(data[key]))
                break

        return mention

    def get_html(self) -> Optional[HtmlTag]:
        """Return the HTML of the populated member, or ``None``.

        ``None`` is returned when the member for ``type`` is unset and for any
        type without an HTML rendering here (including ``template_mention``).
        """
        kind = self.type
        for key in ("date", "database", "link_preview", "page", "user"):
            if kind == key:
                member = getattr(self, key)
                return member.get_html() if member else None
        return None
131
+
132
+
133
@dataclass
class Text(FromJSONMixin):
    """Text payload of a rich-text object: the content string and an optional link dict."""

    content: str
    link: Optional[dict]

    @classmethod
    def from_dict(cls, data: dict):
        """Create a ``Text`` using the keys of ``data`` as constructor keywords."""
        kwargs = dict(data)
        return cls(**kwargs)
141
+
142
+
143
@dataclass
class RichText(FromJSONMixin, GetHTMLMixin):
    """A rich-text object: plain text plus optional styling, link and typed payload.

    ``type`` selects which of ``text``, ``mention`` or ``equation`` is parsed
    into its dataclass by :meth:`from_dict`.
    """

    type: str
    plain_text: str
    annotations: Optional[Annotations] = None
    href: Optional[str] = None
    text: Optional[Text] = None
    mention: Optional[Mention] = None
    equation: Optional[Equation] = None

    def get_html(self) -> Optional[HtmlTag]:
        """Render ``plain_text`` as HTML, wrapped according to ``href`` and ``annotations``.

        Wrapping order, innermost first: link, bold, code, italic,
        strikethrough, underline. A non-default color is added as a ``style``
        attribute on the outermost tag.
        """
        text = HtmlText(self.plain_text)
        if self.href:
            text = A([Href(self.href)], text)
        if self.annotations:
            annotations = self.annotations
            if annotations.bold:
                text = B([], text)
            if annotations.code:
                text = Code([], text)
            if annotations.italic:
                text = I([], text)
            if annotations.strikethrough:
                text = S([], text)
            if annotations.underline:
                text = U([], text)
            if annotations.color and annotations.color != "default":
                # The style is appended to a tag's attributes, so a bare text
                # node is first wrapped in a Span to give it somewhere to go.
                if isinstance(text, HtmlText):
                    text = Span([], text)
                text.attributes.append(Style(f"color:{annotations.color}"))
        return text

    @classmethod
    def from_dict(cls, data: dict):
        """Build a ``RichText`` from ``data`` without modifying ``data``.

        Args:
            data: Mapping with at least ``type`` and ``plain_text``. An
                ``annotations`` entry, when present and not ``None``, is parsed
                with ``Annotations.from_dict``. The entry named by ``type``
                (``text``, ``mention`` or ``equation``) is parsed into its
                dataclass.

        Raises:
            KeyError: If ``type`` is missing, or the entry named by ``type`` is
                missing for one of the three known types.
        """
        t = data["type"]
        # Work on a shallow copy: popping "annotations" from the caller's dict
        # would silently change it and break a second parse of the same data.
        payload = dict(data)
        raw_annotations = payload.pop("annotations", None)
        rich_text = cls(
            annotations=(
                Annotations.from_dict(raw_annotations) if raw_annotations is not None else None
            ),
            **payload,
        )
        # The raw dict passed through ``**payload`` is replaced by its parsed form.
        if t == "text":
            rich_text.text = Text.from_dict(payload["text"])
        elif t == "mention":
            rich_text.mention = Mention.from_dict(payload["mention"])
        elif t == "equation":
            rich_text.equation = Equation.from_dict(payload["equation"])

        return rich_text
@@ -0,0 +1,76 @@
1
+ # https://developers.notion.com/reference/user
2
+ from dataclasses import dataclass, field
3
+ from typing import Optional
4
+
5
+ from htmlBuilder.attributes import Href
6
+ from htmlBuilder.tags import A, Div, HtmlTag
7
+
8
+ from unstructured_ingest.connector.notion.interfaces import FromJSONMixin, GetHTMLMixin
9
+
10
+
11
@dataclass
class PartialUser(FromJSONMixin):
    """Minimal user reference: an ``id`` plus an ``object`` tag defaulting to ``"user"``."""

    id: str
    object: str = "user"

    @classmethod
    def from_dict(cls, data: dict):
        """Build a ``PartialUser`` from ``data["id"]``; every other key is ignored."""
        user_id = data["id"]
        return cls(id=user_id)
19
+
20
+
21
@dataclass
class User(FromJSONMixin, GetHTMLMixin):
    """A user record with optional type, display name and avatar URL."""

    # Annotated ``str`` to agree with ``PartialUser.object`` in this module,
    # whose default is the string "user".
    object: str
    id: str
    type: Optional[str] = None
    name: Optional[str] = None
    avatar_url: Optional[str] = None

    @classmethod
    def from_dict(cls, data: dict):
        """Create an instance using the keys of ``data`` as constructor keywords."""
        return cls(**data)

    def get_text(self) -> Optional[str]:
        """Return the user's name, as a markdown link to the avatar when one is set.

        Returns:
            ``"[name](avatar_url)"`` when ``avatar_url`` is truthy, otherwise
            ``name`` unchanged (which may be ``None``).
        """
        text = self.name
        if self.avatar_url:
            # The closing parenthesis was missing, producing a broken link.
            text = f"[{text}]({self.avatar_url})"
        return text

    def get_html(self) -> Optional[HtmlTag]:
        """Return the name as an anchor to the avatar URL, or in a ``Div`` without one."""
        if self.avatar_url:
            return A([Href(self.avatar_url)], self.name)
        else:
            return Div([], self.name)
44
+
45
+
46
@dataclass
class People(User):
    """A ``User`` extended with a ``person`` dictionary that defaults to empty."""

    person: dict = field(default_factory=dict)
49
+
50
+
51
@dataclass
class Bots(FromJSONMixin, GetHTMLMixin):
    """A bot record with its bot/owner details, workspace name and optional name/avatar."""

    # Annotated ``str`` to agree with ``PartialUser.object`` in this module,
    # whose default is the string "user".
    object: str
    id: str
    bot: dict
    owner: dict
    type: str
    workspace_name: str
    name: Optional[str] = None
    avatar_url: Optional[str] = None

    @classmethod
    def from_dict(cls, data: dict):
        """Create an instance using the keys of ``data`` as constructor keywords."""
        return cls(**data)

    def get_text(self) -> Optional[str]:
        """Return the bot's name, as a markdown link to the avatar when one is set.

        Returns:
            ``"[name](avatar_url)"`` when ``avatar_url`` is truthy, otherwise
            ``name`` unchanged (which may be ``None``).
        """
        text = self.name
        if self.avatar_url:
            # The closing parenthesis was missing, producing a broken link.
            text = f"[{text}]({self.avatar_url})"
        return text

    def get_html(self) -> Optional[HtmlTag]:
        """Return the name as an anchor to the avatar URL, or in a ``Div`` without one."""
        if self.avatar_url:
            return A([Href(self.avatar_url)], self.name)
        else:
            return Div([], self.name)
@@ -0,0 +1,232 @@
1
+ import typing as t
2
+ from dataclasses import dataclass, field
3
+ from pathlib import Path
4
+
5
+ from unstructured_ingest.enhanced_dataclass import enhanced_field
6
+ from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
7
+ from unstructured_ingest.interfaces import (
8
+ AccessConfig,
9
+ BaseConnectorConfig,
10
+ BaseSingleIngestDoc,
11
+ BaseSourceConnector,
12
+ IngestDocCleanupMixin,
13
+ SourceConnectorCleanupMixin,
14
+ SourceMetadata,
15
+ )
16
+ from unstructured_ingest.logger import logger
17
+ from unstructured_ingest.utils.dep_check import requires_dependencies
18
+ from unstructured_ingest.utils.string_and_date_utils import ensure_isoformat_datetime
19
+
20
+ if t.TYPE_CHECKING:
21
+ from office365.graph_client import GraphClient
22
+ from office365.onedrive.driveitems.driveItem import DriveItem
23
# Size threshold above which OneDriveIngestDoc.get_file downloads in chunks.
# NOTE(review): it is compared against the item's "size" property, which the
# log message reports as bytes -- so presumably bytes despite the name; confirm.
MAX_MB_SIZE = 512 * 1_000_000
24
+
25
+
26
@dataclass
class OneDriveAccessConfig(AccessConfig):
    """Access configuration holding the client credential for OneDrive."""

    # Marked sensitive and excluded from repr; "client_cred" is its overload name.
    client_credential: str = enhanced_field(
        repr=False,
        sensitive=True,
        overload_name="client_cred",
    )
29
+
30
+
31
@dataclass
class SimpleOneDriveConfig(BaseConnectorConfig):
    """Connection settings for OneDrive plus a token factory built from them."""

    access_config: OneDriveAccessConfig
    client_id: str
    user_pname: str
    tenant: str = field(repr=False)
    authority_url: t.Optional[str] = field(repr=False, default="https://login.microsoftonline.com")
    path: t.Optional[str] = field(default="")
    recursive: bool = False

    def __post_init__(self):
        """Validate the mandatory values and expose ``_acquire_token`` as ``token_factory``.

        Raises:
            ValueError: if the client id, client credential or user principal name is empty.
        """
        incomplete = (
            not self.client_id
            or not self.access_config.client_credential
            or not self.user_pname
        )
        if incomplete:
            raise ValueError(
                "Please provide all the following mandatory values:"
                "\n-ms-client_id\n-ms-client_cred\n-ms-user-pname",
            )
        self.token_factory = self._acquire_token

    @SourceConnectionError.wrap
    @requires_dependencies(["msal"])
    def _acquire_token(self):
        """Request a client token for the Microsoft Graph default scope via msal.

        Returns whatever ``acquire_token_for_client`` returns. A ``ValueError``
        raised while setting up or requesting the token is logged and re-raised.
        """
        from msal import ConfidentialClientApplication

        authority = f"{self.authority_url}/{self.tenant}"
        graph_scopes = ["https://graph.microsoft.com/.default"]
        try:
            msal_app = ConfidentialClientApplication(
                authority=authority,
                client_id=self.client_id,
                client_credential=self.access_config.client_credential,
            )
            token = msal_app.acquire_token_for_client(scopes=graph_scopes)
        except ValueError as exc:
            logger.error("Couldn't set up credentials for OneDrive")
            raise exc
        else:
            return token
65
+
66
+
67
@dataclass
class OneDriveIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
    """A single OneDrive file to download, identified by its folder path and name."""

    connector_config: SimpleOneDriveConfig
    file_name: str
    file_path: str
    registry_name: str = "onedrive"

    def __post_init__(self):
        """Validate the file extension and derive the remote and local paths.

        Raises:
            ValueError: if ``file_name`` has no extension.
        """
        self.ext = Path(self.file_name).suffix
        if not self.ext:
            raise ValueError("Unsupported file without extension.")

        self.server_relative_path = self.file_path + "/" + self.file_name
        self._set_download_paths()

    def _set_download_paths(self) -> None:
        """Parses the folder structure from the source and creates the download and output paths"""
        download_path = Path(f"{self.read_config.download_dir}")
        output_path = Path(f"{self.processor_config.output_dir}")

        # Mirror the remote folder structure locally. An empty file_path keeps the
        # configured base directories untouched (the former inner ``== ""`` checks
        # could never be true inside this guard and were removed).
        if parent_path := self.file_path:
            download_path = (download_path / parent_path).resolve()
            output_path = (output_path / parent_path).resolve()

        self.download_dir = download_path
        self.download_filepath = (download_path / self.file_name).resolve()
        output_filename = self.file_name + ".json"
        self.output_dir = output_path
        self.output_filepath = (output_path / output_filename).resolve()

    @property
    def filename(self):
        """Resolved local path the file is downloaded to."""
        return Path(self.download_filepath).resolve()

    @property
    def _output_filename(self):
        """Resolved local path of the ``.json`` output for this file."""
        return Path(self.output_filepath).resolve()

    @property
    def record_locator(self) -> t.Optional[t.Dict[str, t.Any]]:
        """Return the user principal name and server-relative path of this file."""
        return {
            "user_pname": self.connector_config.user_pname,
            "server_relative_path": self.server_relative_path,
        }

    @SourceConnectionNetworkError.wrap
    @requires_dependencies(["office365"], extras="onedrive")
    def _fetch_file(self):
        """Look up this file's drive item by path under the user's drive root."""
        from office365.graph_client import GraphClient

        client = GraphClient(self.connector_config.token_factory)
        root = client.users[self.connector_config.user_pname].drive.get().execute_query().root
        file = root.get_by_path(self.server_relative_path).get().execute_query()
        return file

    def update_source_metadata(self, **kwargs):
        """Populate ``source_metadata`` from the drive item.

        Args:
            **kwargs: may contain ``file``, an already fetched drive item. The item
                is fetched only when the key is absent.
        """
        # ``kwargs.get("file", self._fetch_file())`` evaluated its default eagerly and
        # so issued a network request even when the caller had supplied the item.
        file = kwargs["file"] if "file" in kwargs else self._fetch_file()
        if file is None:
            self.source_metadata = SourceMetadata(
                exists=False,
            )
            return

        version = None
        if (n_versions := len(file.versions)) > 0:
            # Use the id of the last entry in the versions collection.
            version = file.versions[n_versions - 1].properties.get("id", None)

        self.source_metadata = SourceMetadata(
            date_created=ensure_isoformat_datetime(timestamp=file.created_datetime),
            date_modified=ensure_isoformat_datetime(timestamp=file.last_modified_datetime),
            version=version,
            source_url=file.parent_reference.path + "/" + self.file_name,
            exists=True,
        )

    @SourceConnectionError.wrap
    @BaseSingleIngestDoc.skip_if_file_exists
    def get_file(self):
        """Fetch the drive item, record its metadata and download it to ``filename``.

        Items whose ``size`` property exceeds ``MAX_MB_SIZE`` are downloaded through
        a chunked download session; smaller ones in a single request.

        Raises:
            ValueError: if the drive item could not be retrieved.
        """
        file = self._fetch_file()
        self.update_source_metadata(file=file)
        if file is None:
            raise ValueError(
                f"Failed to retrieve file {self.file_path}/{self.file_name}",
            )

        fsize = file.get_property("size", 0)
        self.output_dir.mkdir(parents=True, exist_ok=True)

        if not self.download_dir.is_dir():
            logger.debug(f"Creating directory: {self.download_dir}")
            self.download_dir.mkdir(parents=True, exist_ok=True)

        if fsize > MAX_MB_SIZE:
            logger.info(f"Downloading file with size: {fsize} bytes in chunks")
            with self.filename.open(mode="wb") as f:
                file.download_session(f, chunk_size=1024 * 1024 * 100).execute_query()
        else:
            with self.filename.open(mode="wb") as f:
                file.download(f).execute_query()
        logger.info(f"File downloaded: {self.filename}")
        return
172
+
173
+
174
@dataclass
class OneDriveSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
    """Source connector that lists files in a user's OneDrive and builds ingest docs."""

    connector_config: SimpleOneDriveConfig
    _client: t.Optional["GraphClient"] = field(init=False, default=None)

    @property
    def client(self) -> "GraphClient":
        """Return the ``GraphClient``, creating it on first access."""
        from office365.graph_client import GraphClient

        if self._client is None:
            self._client = GraphClient(self.connector_config.token_factory)
        return self._client

    @requires_dependencies(["office365"], extras="onedrive")
    def initialize(self):
        """Create the client so that later calls can reuse it."""
        _ = self.client

    @requires_dependencies(["office365"], extras="onedrive")
    def check_connection(self):
        """Verify that a token can be acquired and the client can be created.

        Raises:
            SourceConnectionError: if the token response carries an ``error`` entry
                or any other exception occurs; the original exception is chained.
        """
        try:
            token_resp: dict = self.connector_config.token_factory()
            if error := token_resp.get("error"):
                raise SourceConnectionError(
                    "{} ({})".format(error, token_resp.get("error_description"))
                )
            _ = self.client
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            # Chain the cause so the original traceback is preserved.
            raise SourceConnectionError(f"failed to validate connection: {e}") from e

    def _list_objects(self, folder, recursive) -> t.List["DriveItem"]:
        """Return the files directly under ``folder`` and, if ``recursive``, in its subfolders."""
        drive_items = folder.children.get().execute_query()
        files = [item for item in drive_items if item.is_file]
        if not recursive:
            return files
        for subfolder in (item for item in drive_items if item.is_folder):
            files.extend(self._list_objects(subfolder, recursive))
        return files

    def _gen_ingest_doc(self, file: "DriveItem") -> OneDriveIngestDoc:
        """Build an ingest doc from a drive item's name and parent folder path."""
        # Keep what follows the last ":" of the parent reference path, minus one
        # leading slash. ``startswith`` is used instead of indexing ``[0]`` so an
        # empty remainder no longer raises IndexError.
        # NOTE(review): an empty remainder is presumably what files in the drive
        # root produce -- confirm against the Graph parentReference.path format.
        file_path = file.parent_reference.path.split(":")[-1]
        if file_path.startswith("/"):
            file_path = file_path[1:]
        return OneDriveIngestDoc(
            connector_config=self.connector_config,
            processor_config=self.processor_config,
            read_config=self.read_config,
            file_name=file.name,
            file_path=file_path,
        )

    def get_ingest_docs(self):
        """List the files under the configured path (or the drive root) as ingest docs.

        Raises:
            ValueError: if the configured path does not resolve to a folder.
        """
        root = self.client.users[self.connector_config.user_pname].drive.get().execute_query().root
        if fpath := self.connector_config.path:
            root = root.get_by_path(fpath).get().execute_query()
            if root is None or not root.is_folder:
                raise ValueError(f"Unable to find directory, given: {fpath}")
        files = self._list_objects(root, self.connector_config.recursive)
        return [self._gen_ingest_doc(f) for f in files]