unstructured-ingest 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (356)
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/__init__.py +14 -0
  4. unstructured_ingest/cli/base/__init__.py +0 -0
  5. unstructured_ingest/cli/base/cmd.py +19 -0
  6. unstructured_ingest/cli/base/dest.py +87 -0
  7. unstructured_ingest/cli/base/src.py +57 -0
  8. unstructured_ingest/cli/cli.py +32 -0
  9. unstructured_ingest/cli/cmd_factory.py +12 -0
  10. unstructured_ingest/cli/cmds/__init__.py +145 -0
  11. unstructured_ingest/cli/cmds/airtable.py +69 -0
  12. unstructured_ingest/cli/cmds/astra.py +99 -0
  13. unstructured_ingest/cli/cmds/azure_cognitive_search.py +65 -0
  14. unstructured_ingest/cli/cmds/biomed.py +52 -0
  15. unstructured_ingest/cli/cmds/chroma.py +104 -0
  16. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  17. unstructured_ingest/cli/cmds/confluence.py +69 -0
  18. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  19. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  20. unstructured_ingest/cli/cmds/discord.py +47 -0
  21. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  22. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  23. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  24. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  25. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  26. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  27. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  28. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  29. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  30. unstructured_ingest/cli/cmds/github.py +54 -0
  31. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  32. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  33. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  34. unstructured_ingest/cli/cmds/jira.py +71 -0
  35. unstructured_ingest/cli/cmds/kafka.py +102 -0
  36. unstructured_ingest/cli/cmds/local.py +43 -0
  37. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  38. unstructured_ingest/cli/cmds/notion.py +48 -0
  39. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  40. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  41. unstructured_ingest/cli/cmds/outlook.py +67 -0
  42. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  43. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  44. unstructured_ingest/cli/cmds/reddit.py +67 -0
  45. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  46. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  47. unstructured_ingest/cli/cmds/slack.py +56 -0
  48. unstructured_ingest/cli/cmds/sql.py +66 -0
  49. unstructured_ingest/cli/cmds/vectara.py +66 -0
  50. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  51. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  52. unstructured_ingest/cli/common.py +7 -0
  53. unstructured_ingest/cli/interfaces.py +656 -0
  54. unstructured_ingest/cli/utils.py +205 -0
  55. unstructured_ingest/connector/__init__.py +0 -0
  56. unstructured_ingest/connector/airtable.py +309 -0
  57. unstructured_ingest/connector/astra.py +237 -0
  58. unstructured_ingest/connector/azure_cognitive_search.py +144 -0
  59. unstructured_ingest/connector/biomed.py +313 -0
  60. unstructured_ingest/connector/chroma.py +158 -0
  61. unstructured_ingest/connector/clarifai.py +122 -0
  62. unstructured_ingest/connector/confluence.py +285 -0
  63. unstructured_ingest/connector/databricks_volumes.py +137 -0
  64. unstructured_ingest/connector/delta_table.py +203 -0
  65. unstructured_ingest/connector/discord.py +180 -0
  66. unstructured_ingest/connector/elasticsearch.py +396 -0
  67. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  68. unstructured_ingest/connector/fsspec/azure.py +78 -0
  69. unstructured_ingest/connector/fsspec/box.py +109 -0
  70. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  71. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  72. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  73. unstructured_ingest/connector/fsspec/s3.py +62 -0
  74. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  75. unstructured_ingest/connector/git.py +124 -0
  76. unstructured_ingest/connector/github.py +173 -0
  77. unstructured_ingest/connector/gitlab.py +142 -0
  78. unstructured_ingest/connector/google_drive.py +349 -0
  79. unstructured_ingest/connector/hubspot.py +278 -0
  80. unstructured_ingest/connector/jira.py +469 -0
  81. unstructured_ingest/connector/kafka.py +294 -0
  82. unstructured_ingest/connector/local.py +139 -0
  83. unstructured_ingest/connector/mongodb.py +285 -0
  84. unstructured_ingest/connector/notion/__init__.py +0 -0
  85. unstructured_ingest/connector/notion/client.py +233 -0
  86. unstructured_ingest/connector/notion/connector.py +468 -0
  87. unstructured_ingest/connector/notion/helpers.py +584 -0
  88. unstructured_ingest/connector/notion/interfaces.py +32 -0
  89. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  90. unstructured_ingest/connector/notion/types/block.py +95 -0
  91. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  92. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  93. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  94. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  95. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  96. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  97. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  98. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  99. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  100. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  101. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  102. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  103. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  104. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  105. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  106. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  107. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  108. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  109. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  110. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  111. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  112. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  113. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  114. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  115. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  116. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  117. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  118. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  119. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  120. unstructured_ingest/connector/notion/types/database.py +72 -0
  121. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  122. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  123. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  124. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  125. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  126. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  127. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  128. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  129. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  130. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  131. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  132. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  133. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  134. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  135. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  136. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  137. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  138. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  139. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  140. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  141. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  142. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  143. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  144. unstructured_ingest/connector/notion/types/date.py +26 -0
  145. unstructured_ingest/connector/notion/types/file.py +51 -0
  146. unstructured_ingest/connector/notion/types/page.py +44 -0
  147. unstructured_ingest/connector/notion/types/parent.py +66 -0
  148. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  149. unstructured_ingest/connector/notion/types/user.py +76 -0
  150. unstructured_ingest/connector/onedrive.py +232 -0
  151. unstructured_ingest/connector/opensearch.py +218 -0
  152. unstructured_ingest/connector/outlook.py +285 -0
  153. unstructured_ingest/connector/pinecone.py +140 -0
  154. unstructured_ingest/connector/qdrant.py +144 -0
  155. unstructured_ingest/connector/reddit.py +166 -0
  156. unstructured_ingest/connector/registry.py +109 -0
  157. unstructured_ingest/connector/salesforce.py +301 -0
  158. unstructured_ingest/connector/sharepoint.py +573 -0
  159. unstructured_ingest/connector/slack.py +224 -0
  160. unstructured_ingest/connector/sql.py +199 -0
  161. unstructured_ingest/connector/vectara.py +248 -0
  162. unstructured_ingest/connector/weaviate.py +190 -0
  163. unstructured_ingest/connector/wikipedia.py +208 -0
  164. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  165. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  166. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  167. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  168. unstructured_ingest/error.py +49 -0
  169. unstructured_ingest/evaluate.py +338 -0
  170. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  171. unstructured_ingest/ingest_backoff/_common.py +102 -0
  172. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  173. unstructured_ingest/interfaces.py +838 -0
  174. unstructured_ingest/logger.py +130 -0
  175. unstructured_ingest/main.py +11 -0
  176. unstructured_ingest/pipeline/__init__.py +22 -0
  177. unstructured_ingest/pipeline/copy.py +19 -0
  178. unstructured_ingest/pipeline/doc_factory.py +12 -0
  179. unstructured_ingest/pipeline/interfaces.py +265 -0
  180. unstructured_ingest/pipeline/partition.py +60 -0
  181. unstructured_ingest/pipeline/permissions.py +12 -0
  182. unstructured_ingest/pipeline/pipeline.py +117 -0
  183. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  184. unstructured_ingest/pipeline/reformat/chunking.py +130 -0
  185. unstructured_ingest/pipeline/reformat/embedding.py +66 -0
  186. unstructured_ingest/pipeline/source.py +77 -0
  187. unstructured_ingest/pipeline/utils.py +6 -0
  188. unstructured_ingest/pipeline/write.py +18 -0
  189. unstructured_ingest/processor.py +93 -0
  190. unstructured_ingest/runner/__init__.py +104 -0
  191. unstructured_ingest/runner/airtable.py +35 -0
  192. unstructured_ingest/runner/astra.py +34 -0
  193. unstructured_ingest/runner/base_runner.py +89 -0
  194. unstructured_ingest/runner/biomed.py +45 -0
  195. unstructured_ingest/runner/confluence.py +35 -0
  196. unstructured_ingest/runner/delta_table.py +34 -0
  197. unstructured_ingest/runner/discord.py +35 -0
  198. unstructured_ingest/runner/elasticsearch.py +40 -0
  199. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  200. unstructured_ingest/runner/fsspec/azure.py +30 -0
  201. unstructured_ingest/runner/fsspec/box.py +28 -0
  202. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  203. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  204. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  205. unstructured_ingest/runner/fsspec/s3.py +28 -0
  206. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  207. unstructured_ingest/runner/github.py +37 -0
  208. unstructured_ingest/runner/gitlab.py +37 -0
  209. unstructured_ingest/runner/google_drive.py +35 -0
  210. unstructured_ingest/runner/hubspot.py +35 -0
  211. unstructured_ingest/runner/jira.py +35 -0
  212. unstructured_ingest/runner/kafka.py +34 -0
  213. unstructured_ingest/runner/local.py +23 -0
  214. unstructured_ingest/runner/mongodb.py +34 -0
  215. unstructured_ingest/runner/notion.py +61 -0
  216. unstructured_ingest/runner/onedrive.py +35 -0
  217. unstructured_ingest/runner/opensearch.py +40 -0
  218. unstructured_ingest/runner/outlook.py +33 -0
  219. unstructured_ingest/runner/reddit.py +35 -0
  220. unstructured_ingest/runner/salesforce.py +33 -0
  221. unstructured_ingest/runner/sharepoint.py +35 -0
  222. unstructured_ingest/runner/slack.py +33 -0
  223. unstructured_ingest/runner/utils.py +47 -0
  224. unstructured_ingest/runner/wikipedia.py +35 -0
  225. unstructured_ingest/runner/writers/__init__.py +48 -0
  226. unstructured_ingest/runner/writers/astra.py +22 -0
  227. unstructured_ingest/runner/writers/azure_cognitive_search.py +24 -0
  228. unstructured_ingest/runner/writers/base_writer.py +26 -0
  229. unstructured_ingest/runner/writers/chroma.py +22 -0
  230. unstructured_ingest/runner/writers/clarifai.py +19 -0
  231. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  232. unstructured_ingest/runner/writers/delta_table.py +24 -0
  233. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  234. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  235. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  236. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  237. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  238. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  239. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  240. unstructured_ingest/runner/writers/kafka.py +21 -0
  241. unstructured_ingest/runner/writers/mongodb.py +21 -0
  242. unstructured_ingest/runner/writers/opensearch.py +26 -0
  243. unstructured_ingest/runner/writers/pinecone.py +21 -0
  244. unstructured_ingest/runner/writers/qdrant.py +19 -0
  245. unstructured_ingest/runner/writers/sql.py +22 -0
  246. unstructured_ingest/runner/writers/vectara.py +22 -0
  247. unstructured_ingest/runner/writers/weaviate.py +21 -0
  248. unstructured_ingest/utils/__init__.py +0 -0
  249. unstructured_ingest/utils/compression.py +117 -0
  250. unstructured_ingest/utils/data_prep.py +112 -0
  251. unstructured_ingest/utils/dep_check.py +66 -0
  252. unstructured_ingest/utils/string_and_date_utils.py +39 -0
  253. unstructured_ingest/utils/table.py +73 -0
  254. unstructured_ingest/v2/__init__.py +1 -0
  255. unstructured_ingest/v2/cli/__init__.py +0 -0
  256. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  257. unstructured_ingest/v2/cli/base/cmd.py +215 -0
  258. unstructured_ingest/v2/cli/base/dest.py +76 -0
  259. unstructured_ingest/v2/cli/base/importer.py +34 -0
  260. unstructured_ingest/v2/cli/base/src.py +70 -0
  261. unstructured_ingest/v2/cli/cli.py +24 -0
  262. unstructured_ingest/v2/cli/cmds/__init__.py +87 -0
  263. unstructured_ingest/v2/cli/cmds/astra.py +85 -0
  264. unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +72 -0
  265. unstructured_ingest/v2/cli/cmds/chroma.py +108 -0
  266. unstructured_ingest/v2/cli/cmds/databricks_volumes.py +161 -0
  267. unstructured_ingest/v2/cli/cmds/elasticsearch.py +159 -0
  268. unstructured_ingest/v2/cli/cmds/fsspec/__init__.py +0 -0
  269. unstructured_ingest/v2/cli/cmds/fsspec/azure.py +84 -0
  270. unstructured_ingest/v2/cli/cmds/fsspec/box.py +58 -0
  271. unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +58 -0
  272. unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +77 -0
  273. unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +81 -0
  274. unstructured_ingest/v2/cli/cmds/fsspec/s3.py +84 -0
  275. unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +80 -0
  276. unstructured_ingest/v2/cli/cmds/google_drive.py +74 -0
  277. unstructured_ingest/v2/cli/cmds/local.py +60 -0
  278. unstructured_ingest/v2/cli/cmds/mongodb.py +62 -0
  279. unstructured_ingest/v2/cli/cmds/onedrive.py +91 -0
  280. unstructured_ingest/v2/cli/cmds/opensearch.py +93 -0
  281. unstructured_ingest/v2/cli/cmds/pinecone.py +62 -0
  282. unstructured_ingest/v2/cli/cmds/salesforce.py +79 -0
  283. unstructured_ingest/v2/cli/cmds/sharepoint.py +112 -0
  284. unstructured_ingest/v2/cli/cmds/singlestore.py +96 -0
  285. unstructured_ingest/v2/cli/cmds/sql.py +84 -0
  286. unstructured_ingest/v2/cli/cmds/weaviate.py +100 -0
  287. unstructured_ingest/v2/cli/configs/__init__.py +6 -0
  288. unstructured_ingest/v2/cli/configs/chunk.py +89 -0
  289. unstructured_ingest/v2/cli/configs/embed.py +74 -0
  290. unstructured_ingest/v2/cli/configs/partition.py +99 -0
  291. unstructured_ingest/v2/cli/configs/processor.py +88 -0
  292. unstructured_ingest/v2/cli/interfaces.py +27 -0
  293. unstructured_ingest/v2/cli/utils.py +240 -0
  294. unstructured_ingest/v2/example.py +37 -0
  295. unstructured_ingest/v2/interfaces/__init__.py +29 -0
  296. unstructured_ingest/v2/interfaces/connector.py +32 -0
  297. unstructured_ingest/v2/interfaces/downloader.py +79 -0
  298. unstructured_ingest/v2/interfaces/file_data.py +49 -0
  299. unstructured_ingest/v2/interfaces/indexer.py +28 -0
  300. unstructured_ingest/v2/interfaces/process.py +20 -0
  301. unstructured_ingest/v2/interfaces/processor.py +48 -0
  302. unstructured_ingest/v2/interfaces/upload_stager.py +48 -0
  303. unstructured_ingest/v2/interfaces/uploader.py +39 -0
  304. unstructured_ingest/v2/logger.py +126 -0
  305. unstructured_ingest/v2/main.py +11 -0
  306. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  307. unstructured_ingest/v2/pipeline/interfaces.py +167 -0
  308. unstructured_ingest/v2/pipeline/pipeline.py +284 -0
  309. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  310. unstructured_ingest/v2/pipeline/steps/chunk.py +85 -0
  311. unstructured_ingest/v2/pipeline/steps/download.py +124 -0
  312. unstructured_ingest/v2/pipeline/steps/embed.py +84 -0
  313. unstructured_ingest/v2/pipeline/steps/index.py +61 -0
  314. unstructured_ingest/v2/pipeline/steps/partition.py +78 -0
  315. unstructured_ingest/v2/pipeline/steps/stage.py +64 -0
  316. unstructured_ingest/v2/pipeline/steps/uncompress.py +68 -0
  317. unstructured_ingest/v2/pipeline/steps/upload.py +73 -0
  318. unstructured_ingest/v2/pipeline/utils.py +15 -0
  319. unstructured_ingest/v2/processes/__init__.py +0 -0
  320. unstructured_ingest/v2/processes/chunker.py +97 -0
  321. unstructured_ingest/v2/processes/connector_registry.py +63 -0
  322. unstructured_ingest/v2/processes/connectors/__init__.py +77 -0
  323. unstructured_ingest/v2/processes/connectors/astra.py +152 -0
  324. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +211 -0
  325. unstructured_ingest/v2/processes/connectors/chroma.py +204 -0
  326. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +96 -0
  327. unstructured_ingest/v2/processes/connectors/elasticsearch.py +401 -0
  328. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  329. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +144 -0
  330. unstructured_ingest/v2/processes/connectors/fsspec/box.py +131 -0
  331. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +130 -0
  332. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +342 -0
  333. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +141 -0
  334. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +164 -0
  335. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +166 -0
  336. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  337. unstructured_ingest/v2/processes/connectors/google_drive.py +335 -0
  338. unstructured_ingest/v2/processes/connectors/local.py +204 -0
  339. unstructured_ingest/v2/processes/connectors/mongodb.py +138 -0
  340. unstructured_ingest/v2/processes/connectors/onedrive.py +216 -0
  341. unstructured_ingest/v2/processes/connectors/opensearch.py +155 -0
  342. unstructured_ingest/v2/processes/connectors/pinecone.py +178 -0
  343. unstructured_ingest/v2/processes/connectors/salesforce.py +293 -0
  344. unstructured_ingest/v2/processes/connectors/sharepoint.py +412 -0
  345. unstructured_ingest/v2/processes/connectors/singlestore.py +160 -0
  346. unstructured_ingest/v2/processes/connectors/sql.py +269 -0
  347. unstructured_ingest/v2/processes/connectors/utils.py +19 -0
  348. unstructured_ingest/v2/processes/connectors/weaviate.py +235 -0
  349. unstructured_ingest/v2/processes/embedder.py +76 -0
  350. unstructured_ingest/v2/processes/partitioner.py +166 -0
  351. unstructured_ingest/v2/processes/uncompress.py +43 -0
  352. unstructured_ingest-0.0.0.dist-info/METADATA +319 -0
  353. unstructured_ingest-0.0.0.dist-info/RECORD +356 -0
  354. unstructured_ingest-0.0.0.dist-info/WHEEL +5 -0
  355. unstructured_ingest-0.0.0.dist-info/entry_points.txt +2 -0
  356. unstructured_ingest-0.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,468 @@
1
+ import typing as t
2
+ from dataclasses import dataclass, field
3
+ from pathlib import Path
4
+ from uuid import UUID
5
+
6
+ import httpx
7
+
8
+ from unstructured_ingest.enhanced_dataclass import enhanced_field
9
+ from unstructured_ingest.error import SourceConnectionError
10
+ from unstructured_ingest.interfaces import (
11
+ AccessConfig,
12
+ BaseConnectorConfig,
13
+ BaseSingleIngestDoc,
14
+ BaseSourceConnector,
15
+ IngestDocCleanupMixin,
16
+ RetryStrategyConfig,
17
+ SourceConnectorCleanupMixin,
18
+ )
19
+ from unstructured_ingest.logger import logger
20
+ from unstructured_ingest.utils.dep_check import (
21
+ requires_dependencies,
22
+ )
23
+
24
+ NOTION_API_VERSION = "2022-06-28"
25
+ if t.TYPE_CHECKING:
26
+ from unstructured_ingest.connector.notion.client import Client as NotionClient
27
+
28
+
29
@dataclass
class NotionAccessConfig(AccessConfig):
    """Credentials needed to talk to the Notion API."""

    # Notion API key; the ingest docs and the source connector in this module
    # pass it as ``auth`` when they build a Notion client. Declared with
    # ``sensitive=True`` (presumably so it is masked when the config is
    # serialised or logged -- confirm in ``enhanced_dataclass``).
    notion_api_key: str = enhanced_field(sensitive=True)
32
+
33
+
34
@dataclass
class SimpleNotionConfig(BaseConnectorConfig):
    """Connector config selecting which Notion pages and databases to ingest.

    ``page_ids`` and ``database_ids`` are lists of Notion object ids given as
    UUID strings. After construction every id is stored in canonical UUID
    string form (see ``__post_init__``).
    """

    access_config: NotionAccessConfig
    page_ids: t.Optional[t.List[str]] = None
    database_ids: t.Optional[t.List[str]] = None
    # NOTE(review): not read anywhere in the visible part of this module;
    # presumably controls whether child pages/databases are followed -- confirm.
    recursive: bool = False

    def __post_init__(self):
        """Validate and normalise the configured ids.

        Surrounding whitespace is stripped and each id is round-tripped through
        ``uuid.UUID`` so that it ends up in the canonical hyphenated form.
        ``None`` and empty lists are left untouched.

        Raises:
            ValueError: if an id is not a valid UUID string (raised by
                ``uuid.UUID``).
        """
        if self.page_ids:
            self.page_ids = [str(UUID(page_id.strip())) for page_id in self.page_ids]

        if self.database_ids:
            self.database_ids = [
                str(UUID(database_id.strip())) for database_id in self.database_ids
            ]
49
+
50
+
51
@dataclass
class NotionPageIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
    """Class encapsulating fetching a Notion page and writing processed results
    (but not doing the processing!).

    The page is rendered to HTML and stored as ``<page_id>.html`` in the read
    config's download directory; the processed output path is
    ``<page_id>.json`` in the processor config's output directory.

    Also includes a cleanup method. When things go wrong and the cleanup
    method is not called, the file is left behind on the filesystem to assist
    debugging.
    """

    page_id: str
    connector_config: SimpleNotionConfig
    registry_name: str = "notion_page"
    retry_strategy_config: t.Optional[RetryStrategyConfig] = None

    def _tmp_download_file(self):
        """Return the local path the page's HTML is downloaded to."""
        page_file = self.page_id + ".html"
        return Path(self.read_config.download_dir) / page_file

    @property
    def _output_filename(self):
        """Path of the JSON output file for this page."""
        page_file = self.page_id + ".json"
        return Path(self.processor_config.output_dir) / page_file

    def _create_full_tmp_dir_path(self):
        """Create the download directory (and any missing parents)."""
        self._tmp_download_file().parent.mkdir(parents=True, exist_ok=True)

    def _load_file_metadata(self):
        """Return the page metadata, fetching it first if it is not available yet.

        Returns ``None`` when the metadata could not be retrieved (the page was
        not found, or the API error was logged and swallowed by
        ``get_file_metadata``).
        """
        if not getattr(self, "file_metadata", None):
            self.get_file_metadata()
        # ``file_metadata`` stays unset when retrieval failed, so read it
        # defensively instead of raising AttributeError.
        return getattr(self, "file_metadata", None)

    @requires_dependencies(dependencies=["notion_client"], extras="notion")
    def get_client(self):
        """Build a new Notion client authenticated with the configured API key."""
        from unstructured_ingest.connector.notion.client import Client as NotionClient

        # Pin the version of the api to avoid schema changes
        return NotionClient(
            notion_version=NOTION_API_VERSION,
            auth=self.connector_config.access_config.notion_api_key,
            logger=logger,
            log_level=logger.level,
            retry_strategy_config=self.retry_strategy_config,
        )

    @BaseSingleIngestDoc.skip_if_file_exists
    @requires_dependencies(dependencies=["notion_client"], extras="notion")
    def get_file(self):
        """Download the page as HTML into the temporary download file.

        Records the outcome in ``check_exists`` / ``file_exists``. A page that
        Notion reports as not found is recorded as non-existent; any other API
        error is logged and swallowed.
        """
        from notion_client import APIErrorCode, APIResponseError

        from unstructured_ingest.connector.notion.helpers import extract_page_html

        self._create_full_tmp_dir_path()

        client = self.get_client()

        try:
            text_extraction = extract_page_html(
                client=client,
                page_id=self.page_id,
                logger=logger,
            )
            self.check_exists = True
            self.file_exists = True
            if html := text_extraction.html:
                # Write with an explicit encoding: the platform default is
                # locale dependent and may be unable to encode page content.
                with open(self._tmp_download_file(), "w", encoding="utf-8") as page_file:
                    page_file.write(html.render(pretty=True))

        except APIResponseError as error:
            if error.code == APIErrorCode.ObjectNotFound:
                self.check_exists = True
                self.file_exists = False
            else:
                logger.error(f"Error: {error}")

    @requires_dependencies(dependencies=["notion_client"], extras="notion")
    def get_file_metadata(self):
        """Fetch the page's metadata and store it in ``file_metadata``.

        Records the outcome in ``check_exists`` / ``file_exists``. A page that
        Notion reports as not found is recorded as non-existent; any other API
        error is logged and swallowed, leaving those attributes untouched.
        """
        from notion_client import APIErrorCode, APIResponseError

        client = self.get_client()

        # Metadata is read from the pages endpoint; the date_* properties read
        # ``created_time`` / ``last_edited_time`` from its result.
        try:
            self.file_metadata = client.pages.retrieve(page_id=self.page_id)  # type: ignore
            self.check_exists = True
            self.file_exists = True
        except APIResponseError as error:
            if error.code == APIErrorCode.ObjectNotFound:
                self.check_exists = True
                self.file_exists = False
            else:
                logger.error(f"Error: {error}")

    @property
    def date_created(self) -> t.Optional[str]:
        """The date the document was created on the source system."""
        metadata = self._load_file_metadata()
        return metadata.created_time if metadata else None

    @property
    def date_modified(self) -> t.Optional[str]:
        """The date the document was last modified on the source system."""
        metadata = self._load_file_metadata()
        return metadata.last_edited_time if metadata else None

    @property
    def exists(self) -> t.Optional[bool]:
        """Whether the document exists on the remote source.

        ``None`` means existence could not be determined (the metadata lookup
        failed with an error other than "object not found").
        """
        # Neither flag is assigned before the first fetch in this class, so
        # read them defensively.
        if not getattr(self, "check_exists", False):
            self.get_file_metadata()

        return getattr(self, "file_exists", None)

    @property
    def filename(self):
        """The filename of the file created from a notion page"""
        return self._tmp_download_file()
169
+
170
+
171
+ @dataclass
172
+ class NotionDatabaseIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
173
+ """Class encapsulating fetching a doc and writing processed results (but not
174
+ doing the processing!).
175
+
176
+ Also includes a cleanup method. When things go wrong and the cleanup
177
+ method is not called, the file is left behind on the filesystem to assist debugging.
178
+ """
179
+
180
+ database_id: str
181
+ connector_config: SimpleNotionConfig
182
+ retry_strategy_config: t.Optional[RetryStrategyConfig] = None
183
+ registry_name: str = "notion_database"
184
+
185
+ def _tmp_download_file(self):
186
+ page_file = self.database_id + ".html"
187
+ return Path(self.read_config.download_dir) / page_file
188
+
189
+ @property
190
+ def _output_filename(self):
191
+ page_file = self.database_id + ".json"
192
+ return Path(self.processor_config.output_dir) / page_file
193
+
194
+ def _create_full_tmp_dir_path(self):
195
+ self._tmp_download_file().parent.mkdir(parents=True, exist_ok=True)
196
+
197
+ @requires_dependencies(dependencies=["notion_client"], extras="notion")
198
+ def get_client(self):
199
+ from unstructured_ingest.connector.notion.client import Client as NotionClient
200
+
201
+ # Pin the version of the api to avoid schema changes
202
+ return NotionClient(
203
+ notion_version=NOTION_API_VERSION,
204
+ auth=self.connector_config.access_config.notion_api_key,
205
+ logger=logger,
206
+ log_level=logger.level,
207
+ retry_strategy_config=self.retry_strategy_config,
208
+ )
209
+
210
+ @BaseSingleIngestDoc.skip_if_file_exists
211
+ @requires_dependencies(dependencies=["notion_client"], extras="notion")
212
+ def get_file(self):
213
+ from notion_client import APIErrorCode, APIResponseError
214
+
215
+ from unstructured_ingest.connector.notion.helpers import extract_database_html
216
+
217
+ self._create_full_tmp_dir_path()
218
+
219
+ client = self.get_client()
220
+
221
+ try:
222
+ text_extraction = extract_database_html(
223
+ client=client,
224
+ database_id=self.database_id,
225
+ logger=logger,
226
+ )
227
+ self.check_exists = True
228
+ self.file_exists = True
229
+ if html := text_extraction.html:
230
+ with open(self._tmp_download_file(), "w") as page_file:
231
+ page_file.write(html.render(pretty=True))
232
+
233
+ except APIResponseError as error:
234
+ if error.code == APIErrorCode.ObjectNotFound:
235
+ self.check_exists = True
236
+ self.file_exists = False
237
+ else:
238
+ logger.error(f"Error: {error}")
239
+
240
+ @requires_dependencies(dependencies=["notion_client"], extras="notion")
241
+ def get_file_metadata(self):
242
+ from notion_client import APIErrorCode, APIResponseError
243
+
244
+ client = self.get_client()
245
+
246
+ # The Notion block endpoint gives more hierarchical information (parent,child relationships)
247
+ # than the pages endpoint so choosing to use that one to get metadata about the page
248
+ try:
249
+ self.file_metadata = client.databases.retrieve(
250
+ database_id=self.database_id,
251
+ ) # type: ignore
252
+ self.check_exists = True
253
+ self.file_exists = True
254
+ except APIResponseError as error:
255
+ if error.code == APIErrorCode.ObjectNotFound:
256
+ self.check_exists = True
257
+ self.file_exists = False
258
+ else:
259
+ logger.error(f"Error: {error}")
260
+
261
+ @property
262
+ def date_created(self) -> t.Optional[str]:
263
+ """The date the document was created on the source system."""
264
+ if not hasattr(self, "file_metadata") or not self.file_metadata:
265
+ self.get_file_metadata()
266
+
267
+ return self.file_metadata.created_time if self.file_metadata else None
268
+
269
@property
def date_modified(self) -> t.Optional[str]:
    """Last-edited time reported by the source system, or None if unavailable.

    Metadata is fetched lazily via ``get_file_metadata`` when it has not been
    loaded yet (attribute missing or falsy).
    """
    if not getattr(self, "file_metadata", None):
        self.get_file_metadata()

    metadata = self.file_metadata
    if metadata:
        return metadata.last_edited_time
    return None
276
+
277
@property
def exists(self) -> t.Optional[bool]:
    """Whether the document exists on the remote source.

    The cached ``file_exists`` value is returned when an existence check has
    already run; otherwise ``get_file_metadata`` is called first to perform it.
    """
    if not self.check_exists:
        self.get_file_metadata()
    return self.file_exists
286
+
287
@property
def filename(self):
    """Path of the local file that holds the downloaded Notion database."""
    download_path = self._tmp_download_file()
    return download_path
291
+
292
+
293
@dataclass
class NotionSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
    """Source connector that builds ingest docs for Notion pages and databases.

    The pages and databases to ingest are read from ``connector_config``
    (``page_ids`` / ``database_ids``). When ``connector_config.recursive`` is
    set, child pages and child databases discovered under them are added too.
    """

    connector_config: SimpleNotionConfig
    retry_strategy_config: t.Optional[RetryStrategyConfig] = None
    # Lazily created by the ``client`` property; never passed to __init__.
    _client: t.Optional["NotionClient"] = field(init=False, default=None)

    @property
    def client(self) -> "NotionClient":
        """Return the Notion client, creating and caching it on first access."""
        if self._client is None:
            self._client = self.create_client()
        return self._client

    @requires_dependencies(dependencies=["notion_client"], extras="notion")
    def create_client(self) -> "NotionClient":
        """Build a new Notion client from the connector's access config."""
        from unstructured_ingest.connector.notion.client import Client as NotionClient

        return NotionClient(
            notion_version=NOTION_API_VERSION,
            auth=self.connector_config.access_config.notion_api_key,
            logger=logger,
            log_level=logger.level,
            retry_strategy_config=self.retry_strategy_config,
        )

    def check_connection(self):
        """Validate the connection by sending a HEAD request for ``users``.

        Raises:
            SourceConnectionError: if the response carries an HTTP error status.
        """
        try:
            request = self.client._build_request("HEAD", "users")
            response = self.client.client.send(request)
            response.raise_for_status()
        except httpx.HTTPStatusError as http_error:
            logger.error(f"failed to validate connection: {http_error}", exc_info=True)
            # Chain the original error so the HTTP details stay in the traceback.
            raise SourceConnectionError(
                f"failed to validate connection: {http_error}"
            ) from http_error

    @requires_dependencies(dependencies=["notion_client"], extras="notion")
    def initialize(self):
        """Create (and cache) the Notion client so it is ready for later use."""
        _ = self.client

    @requires_dependencies(dependencies=["notion_client"], extras="notion")
    def get_child_page_content(self, page_id: str):
        """Return the recursive child content of a page after checking it exists.

        Raises:
            ValueError: if the status lookup for ``page_id`` does not return 200.
        """
        # sanity check that page id is valid
        resp_code = self.client.pages.retrieve_status(page_id=page_id)
        if resp_code != 200:
            raise ValueError(
                f"page associated with page id could not be found: {page_id}",
            )

        return self.get_child_content(page_id=page_id)

    def get_child_content(self, page_id: str):
        """Return the recursive child content of a page without a validity check."""
        from unstructured_ingest.connector.notion.helpers import (
            get_recursive_content_from_page,
        )

        return get_recursive_content_from_page(
            client=self.client,
            page_id=page_id,
            logger=logger,
        )

    @requires_dependencies(dependencies=["notion_client"], extras="notion")
    def get_child_database_content(self, database_id: str):
        """Return the recursive child content of a database after checking it exists.

        Raises:
            ValueError: if the status lookup for ``database_id`` does not return 200.
        """
        from unstructured_ingest.connector.notion.helpers import (
            get_recursive_content_from_database,
        )

        # sanity check that database id is valid
        resp_code = self.client.databases.retrieve_status(database_id=database_id)
        if resp_code != 200:
            raise ValueError(
                f"database associated with database id could not be found: {database_id}",
            )

        return get_recursive_content_from_database(
            client=self.client,
            database_id=database_id,
            logger=logger,
        )

    def _build_page_doc(self, page_id):
        """Create the ingest doc for a single Notion page id."""
        return NotionPageIngestDoc(
            connector_config=self.connector_config,
            processor_config=self.processor_config,
            retry_strategy_config=self.retry_strategy_config,
            read_config=self.read_config,
            page_id=page_id,
        )

    def _build_database_doc(self, database_id):
        """Create the ingest doc for a single Notion database id."""
        return NotionDatabaseIngestDoc(
            connector_config=self.connector_config,
            processor_config=self.processor_config,
            retry_strategy_config=self.retry_strategy_config,
            read_config=self.read_config,
            database_id=database_id,
        )

    @staticmethod
    def _unique_excluding(ids, excluded):
        """Return ``ids`` without duplicates or ``excluded`` members, in first-seen order."""
        skip = set(excluded)
        # dict.fromkeys de-duplicates while keeping a deterministic order,
        # unlike set(), whose iteration order depends on string hashing.
        return [item for item in dict.fromkeys(ids) if item not in skip]

    def get_ingest_docs(self):
        """Build the ingest docs for all configured pages and databases.

        Configured pages come first, then configured databases. With
        ``connector_config.recursive`` enabled, child pages and then child
        databases found beneath them are appended, skipping duplicates and
        ids that were already configured explicitly.
        """
        page_ids = self.connector_config.page_ids or []
        database_ids = self.connector_config.database_ids or []

        docs: t.List[BaseSingleIngestDoc] = []
        docs += [self._build_page_doc(page_id) for page_id in page_ids]
        docs += [self._build_database_doc(database_id) for database_id in database_ids]

        if not self.connector_config.recursive:
            return docs

        logger.info("Getting recursive content")
        child_pages = []
        child_databases = []
        for page_id in page_ids:
            child_content = self.get_child_page_content(page_id=page_id)
            child_pages.extend(child_content.child_pages)
            child_databases.extend(child_content.child_databases)

        for database_id in database_ids:
            child_content = self.get_child_database_content(database_id=database_id)
            child_pages.extend(child_content.child_pages)
            child_databases.extend(child_content.child_databases)

        # Remove duplicates and anything already requested explicitly
        child_pages = self._unique_excluding(child_pages, page_ids)
        child_databases = self._unique_excluding(child_databases, database_ids)

        if child_pages:
            logger.info(
                "Adding the following child page ids: {}".format(", ".join(child_pages)),
            )
            docs += [self._build_page_doc(page_id) for page_id in child_pages]

        if child_databases:
            logger.info(
                "Adding the following child database ids: {}".format(
                    ", ".join(child_databases),
                ),
            )
            docs += [self._build_database_doc(database_id) for database_id in child_databases]

        return docs