unstructured-ingest 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (356) hide show
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/__init__.py +14 -0
  4. unstructured_ingest/cli/base/__init__.py +0 -0
  5. unstructured_ingest/cli/base/cmd.py +19 -0
  6. unstructured_ingest/cli/base/dest.py +87 -0
  7. unstructured_ingest/cli/base/src.py +57 -0
  8. unstructured_ingest/cli/cli.py +32 -0
  9. unstructured_ingest/cli/cmd_factory.py +12 -0
  10. unstructured_ingest/cli/cmds/__init__.py +145 -0
  11. unstructured_ingest/cli/cmds/airtable.py +69 -0
  12. unstructured_ingest/cli/cmds/astra.py +99 -0
  13. unstructured_ingest/cli/cmds/azure_cognitive_search.py +65 -0
  14. unstructured_ingest/cli/cmds/biomed.py +52 -0
  15. unstructured_ingest/cli/cmds/chroma.py +104 -0
  16. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  17. unstructured_ingest/cli/cmds/confluence.py +69 -0
  18. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  19. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  20. unstructured_ingest/cli/cmds/discord.py +47 -0
  21. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  22. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  23. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  24. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  25. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  26. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  27. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  28. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  29. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  30. unstructured_ingest/cli/cmds/github.py +54 -0
  31. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  32. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  33. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  34. unstructured_ingest/cli/cmds/jira.py +71 -0
  35. unstructured_ingest/cli/cmds/kafka.py +102 -0
  36. unstructured_ingest/cli/cmds/local.py +43 -0
  37. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  38. unstructured_ingest/cli/cmds/notion.py +48 -0
  39. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  40. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  41. unstructured_ingest/cli/cmds/outlook.py +67 -0
  42. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  43. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  44. unstructured_ingest/cli/cmds/reddit.py +67 -0
  45. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  46. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  47. unstructured_ingest/cli/cmds/slack.py +56 -0
  48. unstructured_ingest/cli/cmds/sql.py +66 -0
  49. unstructured_ingest/cli/cmds/vectara.py +66 -0
  50. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  51. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  52. unstructured_ingest/cli/common.py +7 -0
  53. unstructured_ingest/cli/interfaces.py +656 -0
  54. unstructured_ingest/cli/utils.py +205 -0
  55. unstructured_ingest/connector/__init__.py +0 -0
  56. unstructured_ingest/connector/airtable.py +309 -0
  57. unstructured_ingest/connector/astra.py +237 -0
  58. unstructured_ingest/connector/azure_cognitive_search.py +144 -0
  59. unstructured_ingest/connector/biomed.py +313 -0
  60. unstructured_ingest/connector/chroma.py +158 -0
  61. unstructured_ingest/connector/clarifai.py +122 -0
  62. unstructured_ingest/connector/confluence.py +285 -0
  63. unstructured_ingest/connector/databricks_volumes.py +137 -0
  64. unstructured_ingest/connector/delta_table.py +203 -0
  65. unstructured_ingest/connector/discord.py +180 -0
  66. unstructured_ingest/connector/elasticsearch.py +396 -0
  67. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  68. unstructured_ingest/connector/fsspec/azure.py +78 -0
  69. unstructured_ingest/connector/fsspec/box.py +109 -0
  70. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  71. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  72. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  73. unstructured_ingest/connector/fsspec/s3.py +62 -0
  74. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  75. unstructured_ingest/connector/git.py +124 -0
  76. unstructured_ingest/connector/github.py +173 -0
  77. unstructured_ingest/connector/gitlab.py +142 -0
  78. unstructured_ingest/connector/google_drive.py +349 -0
  79. unstructured_ingest/connector/hubspot.py +278 -0
  80. unstructured_ingest/connector/jira.py +469 -0
  81. unstructured_ingest/connector/kafka.py +294 -0
  82. unstructured_ingest/connector/local.py +139 -0
  83. unstructured_ingest/connector/mongodb.py +285 -0
  84. unstructured_ingest/connector/notion/__init__.py +0 -0
  85. unstructured_ingest/connector/notion/client.py +233 -0
  86. unstructured_ingest/connector/notion/connector.py +468 -0
  87. unstructured_ingest/connector/notion/helpers.py +584 -0
  88. unstructured_ingest/connector/notion/interfaces.py +32 -0
  89. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  90. unstructured_ingest/connector/notion/types/block.py +95 -0
  91. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  92. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  93. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  94. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  95. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  96. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  97. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  98. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  99. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  100. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  101. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  102. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  103. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  104. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  105. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  106. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  107. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  108. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  109. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  110. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  111. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  112. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  113. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  114. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  115. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  116. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  117. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  118. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  119. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  120. unstructured_ingest/connector/notion/types/database.py +72 -0
  121. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  122. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  123. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  124. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  125. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  126. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  127. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  128. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  129. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  130. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  131. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  132. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  133. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  134. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  135. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  136. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  137. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  138. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  139. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  140. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  141. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  142. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  143. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  144. unstructured_ingest/connector/notion/types/date.py +26 -0
  145. unstructured_ingest/connector/notion/types/file.py +51 -0
  146. unstructured_ingest/connector/notion/types/page.py +44 -0
  147. unstructured_ingest/connector/notion/types/parent.py +66 -0
  148. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  149. unstructured_ingest/connector/notion/types/user.py +76 -0
  150. unstructured_ingest/connector/onedrive.py +232 -0
  151. unstructured_ingest/connector/opensearch.py +218 -0
  152. unstructured_ingest/connector/outlook.py +285 -0
  153. unstructured_ingest/connector/pinecone.py +140 -0
  154. unstructured_ingest/connector/qdrant.py +144 -0
  155. unstructured_ingest/connector/reddit.py +166 -0
  156. unstructured_ingest/connector/registry.py +109 -0
  157. unstructured_ingest/connector/salesforce.py +301 -0
  158. unstructured_ingest/connector/sharepoint.py +573 -0
  159. unstructured_ingest/connector/slack.py +224 -0
  160. unstructured_ingest/connector/sql.py +199 -0
  161. unstructured_ingest/connector/vectara.py +248 -0
  162. unstructured_ingest/connector/weaviate.py +190 -0
  163. unstructured_ingest/connector/wikipedia.py +208 -0
  164. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  165. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  166. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  167. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  168. unstructured_ingest/error.py +49 -0
  169. unstructured_ingest/evaluate.py +338 -0
  170. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  171. unstructured_ingest/ingest_backoff/_common.py +102 -0
  172. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  173. unstructured_ingest/interfaces.py +838 -0
  174. unstructured_ingest/logger.py +130 -0
  175. unstructured_ingest/main.py +11 -0
  176. unstructured_ingest/pipeline/__init__.py +22 -0
  177. unstructured_ingest/pipeline/copy.py +19 -0
  178. unstructured_ingest/pipeline/doc_factory.py +12 -0
  179. unstructured_ingest/pipeline/interfaces.py +265 -0
  180. unstructured_ingest/pipeline/partition.py +60 -0
  181. unstructured_ingest/pipeline/permissions.py +12 -0
  182. unstructured_ingest/pipeline/pipeline.py +117 -0
  183. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  184. unstructured_ingest/pipeline/reformat/chunking.py +130 -0
  185. unstructured_ingest/pipeline/reformat/embedding.py +66 -0
  186. unstructured_ingest/pipeline/source.py +77 -0
  187. unstructured_ingest/pipeline/utils.py +6 -0
  188. unstructured_ingest/pipeline/write.py +18 -0
  189. unstructured_ingest/processor.py +93 -0
  190. unstructured_ingest/runner/__init__.py +104 -0
  191. unstructured_ingest/runner/airtable.py +35 -0
  192. unstructured_ingest/runner/astra.py +34 -0
  193. unstructured_ingest/runner/base_runner.py +89 -0
  194. unstructured_ingest/runner/biomed.py +45 -0
  195. unstructured_ingest/runner/confluence.py +35 -0
  196. unstructured_ingest/runner/delta_table.py +34 -0
  197. unstructured_ingest/runner/discord.py +35 -0
  198. unstructured_ingest/runner/elasticsearch.py +40 -0
  199. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  200. unstructured_ingest/runner/fsspec/azure.py +30 -0
  201. unstructured_ingest/runner/fsspec/box.py +28 -0
  202. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  203. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  204. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  205. unstructured_ingest/runner/fsspec/s3.py +28 -0
  206. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  207. unstructured_ingest/runner/github.py +37 -0
  208. unstructured_ingest/runner/gitlab.py +37 -0
  209. unstructured_ingest/runner/google_drive.py +35 -0
  210. unstructured_ingest/runner/hubspot.py +35 -0
  211. unstructured_ingest/runner/jira.py +35 -0
  212. unstructured_ingest/runner/kafka.py +34 -0
  213. unstructured_ingest/runner/local.py +23 -0
  214. unstructured_ingest/runner/mongodb.py +34 -0
  215. unstructured_ingest/runner/notion.py +61 -0
  216. unstructured_ingest/runner/onedrive.py +35 -0
  217. unstructured_ingest/runner/opensearch.py +40 -0
  218. unstructured_ingest/runner/outlook.py +33 -0
  219. unstructured_ingest/runner/reddit.py +35 -0
  220. unstructured_ingest/runner/salesforce.py +33 -0
  221. unstructured_ingest/runner/sharepoint.py +35 -0
  222. unstructured_ingest/runner/slack.py +33 -0
  223. unstructured_ingest/runner/utils.py +47 -0
  224. unstructured_ingest/runner/wikipedia.py +35 -0
  225. unstructured_ingest/runner/writers/__init__.py +48 -0
  226. unstructured_ingest/runner/writers/astra.py +22 -0
  227. unstructured_ingest/runner/writers/azure_cognitive_search.py +24 -0
  228. unstructured_ingest/runner/writers/base_writer.py +26 -0
  229. unstructured_ingest/runner/writers/chroma.py +22 -0
  230. unstructured_ingest/runner/writers/clarifai.py +19 -0
  231. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  232. unstructured_ingest/runner/writers/delta_table.py +24 -0
  233. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  234. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  235. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  236. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  237. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  238. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  239. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  240. unstructured_ingest/runner/writers/kafka.py +21 -0
  241. unstructured_ingest/runner/writers/mongodb.py +21 -0
  242. unstructured_ingest/runner/writers/opensearch.py +26 -0
  243. unstructured_ingest/runner/writers/pinecone.py +21 -0
  244. unstructured_ingest/runner/writers/qdrant.py +19 -0
  245. unstructured_ingest/runner/writers/sql.py +22 -0
  246. unstructured_ingest/runner/writers/vectara.py +22 -0
  247. unstructured_ingest/runner/writers/weaviate.py +21 -0
  248. unstructured_ingest/utils/__init__.py +0 -0
  249. unstructured_ingest/utils/compression.py +117 -0
  250. unstructured_ingest/utils/data_prep.py +112 -0
  251. unstructured_ingest/utils/dep_check.py +66 -0
  252. unstructured_ingest/utils/string_and_date_utils.py +39 -0
  253. unstructured_ingest/utils/table.py +73 -0
  254. unstructured_ingest/v2/__init__.py +1 -0
  255. unstructured_ingest/v2/cli/__init__.py +0 -0
  256. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  257. unstructured_ingest/v2/cli/base/cmd.py +215 -0
  258. unstructured_ingest/v2/cli/base/dest.py +76 -0
  259. unstructured_ingest/v2/cli/base/importer.py +34 -0
  260. unstructured_ingest/v2/cli/base/src.py +70 -0
  261. unstructured_ingest/v2/cli/cli.py +24 -0
  262. unstructured_ingest/v2/cli/cmds/__init__.py +87 -0
  263. unstructured_ingest/v2/cli/cmds/astra.py +85 -0
  264. unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +72 -0
  265. unstructured_ingest/v2/cli/cmds/chroma.py +108 -0
  266. unstructured_ingest/v2/cli/cmds/databricks_volumes.py +161 -0
  267. unstructured_ingest/v2/cli/cmds/elasticsearch.py +159 -0
  268. unstructured_ingest/v2/cli/cmds/fsspec/__init__.py +0 -0
  269. unstructured_ingest/v2/cli/cmds/fsspec/azure.py +84 -0
  270. unstructured_ingest/v2/cli/cmds/fsspec/box.py +58 -0
  271. unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +58 -0
  272. unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +77 -0
  273. unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +81 -0
  274. unstructured_ingest/v2/cli/cmds/fsspec/s3.py +84 -0
  275. unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +80 -0
  276. unstructured_ingest/v2/cli/cmds/google_drive.py +74 -0
  277. unstructured_ingest/v2/cli/cmds/local.py +60 -0
  278. unstructured_ingest/v2/cli/cmds/mongodb.py +62 -0
  279. unstructured_ingest/v2/cli/cmds/onedrive.py +91 -0
  280. unstructured_ingest/v2/cli/cmds/opensearch.py +93 -0
  281. unstructured_ingest/v2/cli/cmds/pinecone.py +62 -0
  282. unstructured_ingest/v2/cli/cmds/salesforce.py +79 -0
  283. unstructured_ingest/v2/cli/cmds/sharepoint.py +112 -0
  284. unstructured_ingest/v2/cli/cmds/singlestore.py +96 -0
  285. unstructured_ingest/v2/cli/cmds/sql.py +84 -0
  286. unstructured_ingest/v2/cli/cmds/weaviate.py +100 -0
  287. unstructured_ingest/v2/cli/configs/__init__.py +6 -0
  288. unstructured_ingest/v2/cli/configs/chunk.py +89 -0
  289. unstructured_ingest/v2/cli/configs/embed.py +74 -0
  290. unstructured_ingest/v2/cli/configs/partition.py +99 -0
  291. unstructured_ingest/v2/cli/configs/processor.py +88 -0
  292. unstructured_ingest/v2/cli/interfaces.py +27 -0
  293. unstructured_ingest/v2/cli/utils.py +240 -0
  294. unstructured_ingest/v2/example.py +37 -0
  295. unstructured_ingest/v2/interfaces/__init__.py +29 -0
  296. unstructured_ingest/v2/interfaces/connector.py +32 -0
  297. unstructured_ingest/v2/interfaces/downloader.py +79 -0
  298. unstructured_ingest/v2/interfaces/file_data.py +49 -0
  299. unstructured_ingest/v2/interfaces/indexer.py +28 -0
  300. unstructured_ingest/v2/interfaces/process.py +20 -0
  301. unstructured_ingest/v2/interfaces/processor.py +48 -0
  302. unstructured_ingest/v2/interfaces/upload_stager.py +48 -0
  303. unstructured_ingest/v2/interfaces/uploader.py +39 -0
  304. unstructured_ingest/v2/logger.py +126 -0
  305. unstructured_ingest/v2/main.py +11 -0
  306. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  307. unstructured_ingest/v2/pipeline/interfaces.py +167 -0
  308. unstructured_ingest/v2/pipeline/pipeline.py +284 -0
  309. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  310. unstructured_ingest/v2/pipeline/steps/chunk.py +85 -0
  311. unstructured_ingest/v2/pipeline/steps/download.py +124 -0
  312. unstructured_ingest/v2/pipeline/steps/embed.py +84 -0
  313. unstructured_ingest/v2/pipeline/steps/index.py +61 -0
  314. unstructured_ingest/v2/pipeline/steps/partition.py +78 -0
  315. unstructured_ingest/v2/pipeline/steps/stage.py +64 -0
  316. unstructured_ingest/v2/pipeline/steps/uncompress.py +68 -0
  317. unstructured_ingest/v2/pipeline/steps/upload.py +73 -0
  318. unstructured_ingest/v2/pipeline/utils.py +15 -0
  319. unstructured_ingest/v2/processes/__init__.py +0 -0
  320. unstructured_ingest/v2/processes/chunker.py +97 -0
  321. unstructured_ingest/v2/processes/connector_registry.py +63 -0
  322. unstructured_ingest/v2/processes/connectors/__init__.py +77 -0
  323. unstructured_ingest/v2/processes/connectors/astra.py +152 -0
  324. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +211 -0
  325. unstructured_ingest/v2/processes/connectors/chroma.py +204 -0
  326. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +96 -0
  327. unstructured_ingest/v2/processes/connectors/elasticsearch.py +401 -0
  328. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  329. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +144 -0
  330. unstructured_ingest/v2/processes/connectors/fsspec/box.py +131 -0
  331. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +130 -0
  332. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +342 -0
  333. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +141 -0
  334. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +164 -0
  335. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +166 -0
  336. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  337. unstructured_ingest/v2/processes/connectors/google_drive.py +335 -0
  338. unstructured_ingest/v2/processes/connectors/local.py +204 -0
  339. unstructured_ingest/v2/processes/connectors/mongodb.py +138 -0
  340. unstructured_ingest/v2/processes/connectors/onedrive.py +216 -0
  341. unstructured_ingest/v2/processes/connectors/opensearch.py +155 -0
  342. unstructured_ingest/v2/processes/connectors/pinecone.py +178 -0
  343. unstructured_ingest/v2/processes/connectors/salesforce.py +293 -0
  344. unstructured_ingest/v2/processes/connectors/sharepoint.py +412 -0
  345. unstructured_ingest/v2/processes/connectors/singlestore.py +160 -0
  346. unstructured_ingest/v2/processes/connectors/sql.py +269 -0
  347. unstructured_ingest/v2/processes/connectors/utils.py +19 -0
  348. unstructured_ingest/v2/processes/connectors/weaviate.py +235 -0
  349. unstructured_ingest/v2/processes/embedder.py +76 -0
  350. unstructured_ingest/v2/processes/partitioner.py +166 -0
  351. unstructured_ingest/v2/processes/uncompress.py +43 -0
  352. unstructured_ingest-0.0.0.dist-info/METADATA +319 -0
  353. unstructured_ingest-0.0.0.dist-info/RECORD +356 -0
  354. unstructured_ingest-0.0.0.dist-info/WHEEL +5 -0
  355. unstructured_ingest-0.0.0.dist-info/entry_points.txt +2 -0
  356. unstructured_ingest-0.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,141 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass, field
4
+ from pathlib import Path
5
+ from typing import Any, Generator, Optional, Union
6
+
7
+ from unstructured_ingest.enhanced_dataclass import enhanced_field
8
+ from unstructured_ingest.utils.dep_check import requires_dependencies
9
+ from unstructured_ingest.utils.string_and_date_utils import json_to_dict
10
+ from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, UploadContent
11
+ from unstructured_ingest.v2.processes.connector_registry import (
12
+ DestinationRegistryEntry,
13
+ SourceRegistryEntry,
14
+ )
15
+ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
16
+ FsspecAccessConfig,
17
+ FsspecConnectionConfig,
18
+ FsspecDownloader,
19
+ FsspecDownloaderConfig,
20
+ FsspecIndexer,
21
+ FsspecIndexerConfig,
22
+ FsspecUploader,
23
+ FsspecUploaderConfig,
24
+ )
25
+
26
+ CONNECTOR_TYPE = "gcs"
27
+
28
+
29
@dataclass
class GcsIndexerConfig(FsspecIndexerConfig):
    """Indexer configuration for the GCS connector.

    Adds no fields of its own; everything is inherited from FsspecIndexerConfig.
    """
32
+
33
+
34
@dataclass
class GcsAccessConfig(FsspecAccessConfig):
    """Access settings for the GCS connector.

    ``service_account_key`` is the raw value supplied by the caller. ``token`` is
    derived from it in ``__post_init__`` and is not an ``__init__`` argument.
    """

    service_account_key: Optional[str] = None
    token: Union[str, dict, None] = field(init=False, default=None)

    def __post_init__(self):
        """Resolve ``service_account_key`` into ``token``.

        The value is checked, in order, against these accepted forms:
        empty/None (``token`` stays None), one of the auth keywords, a string
        that ``json_to_dict`` turns into a dict, or a path to an existing file.

        Raises:
            ValueError: if the value matches none of the accepted forms.
        """
        ALLOWED_AUTH_VALUES = "google_default", "cache", "anon", "browser", "cloud"

        # Case: null value
        if not self.service_account_key:
            return

        # Case: one of auth constants
        if self.service_account_key in ALLOWED_AUTH_VALUES:
            self.token = self.service_account_key
            return

        # Case: token as json (parse once and reuse the result)
        parsed_key = json_to_dict(self.service_account_key)
        if isinstance(parsed_key, dict):
            self.token = parsed_key
            return

        # Case: path to token
        try:
            is_key_file = Path(self.service_account_key).is_file()
        except (OSError, ValueError):
            # A value that cannot even be stat'ed (for example a malformed JSON
            # blob longer than the OS file-name limit) is not a key file; fall
            # through to the ValueError below instead of leaking the OS error.
            is_key_file = False
        if is_key_file:
            self.token = self.service_account_key
            return

        raise ValueError("Invalid auth token value")
62
+
63
+
64
@dataclass
class GcsConnectionConfig(FsspecConnectionConfig):
    """Connection configuration for the GCS connector."""

    # URL schemes accepted for this connector.
    supported_protocols: list[str] = field(default_factory=lambda: ["gs", "gcs"])
    # Marked sensitive; defaults to an empty GcsAccessConfig.
    access_config: GcsAccessConfig = enhanced_field(
        sensitive=True,
        default_factory=lambda: GcsAccessConfig(),
    )
    connector_type: str = CONNECTOR_TYPE
71
+
72
+
73
@dataclass
class GcsIndexer(FsspecIndexer):
    """Fsspec indexer bound to the GCS connection and indexer configs."""

    connection_config: GcsConnectionConfig
    index_config: GcsIndexerConfig
    connector_type: str = CONNECTOR_TYPE

    @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
        """Delegate to the parent indexer's ``run`` with the same keyword arguments."""
        return super().run(**kwargs)
82
+
83
+
84
@dataclass
class GcsDownloaderConfig(FsspecDownloaderConfig):
    """Downloader configuration for the GCS connector.

    Adds no fields of its own; everything is inherited from FsspecDownloaderConfig.
    """
87
+
88
+
89
@dataclass
class GcsDownloader(FsspecDownloader):
    """Fsspec downloader bound to the GCS connection and downloader configs."""

    protocol: str = "gcs"
    connection_config: GcsConnectionConfig
    connector_type: str = CONNECTOR_TYPE
    download_config: Optional[GcsDownloaderConfig] = field(
        default_factory=GcsDownloaderConfig,
    )

    @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        """Delegate to the parent downloader's synchronous ``run``."""
        return super().run(file_data=file_data, **kwargs)

    @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
    async def run_async(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        """Delegate to the parent downloader's ``run_async`` and await its result."""
        return await super().run_async(file_data=file_data, **kwargs)
103
+
104
+
105
@dataclass
class GcsUploaderConfig(FsspecUploaderConfig):
    """Uploader configuration for the GCS connector.

    Adds no fields of its own; everything is inherited from FsspecUploaderConfig.
    """
108
+
109
+
110
@dataclass
class GcsUploader(FsspecUploader):
    """Fsspec uploader bound to the GCS connection and uploader configs."""

    connector_type: str = CONNECTOR_TYPE
    connection_config: GcsConnectionConfig
    # NOTE(review): annotated as non-optional but defaults to None; confirm intent.
    upload_config: GcsUploaderConfig = field(default=None)

    @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
    def __post_init__(self):
        """Run the parent ``__post_init__`` behind the dependency decorator."""
        super().__post_init__()

    @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
        """Delegate to the parent uploader's synchronous ``run``."""
        return super().run(contents=contents, **kwargs)

    @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
    async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        """Delegate to the parent uploader's ``run_async`` and await its result."""
        return await super().run_async(path=path, file_data=file_data, **kwargs)
127
+
128
+
129
# Source registry entry bundling the GCS connection config with the indexer and
# downloader classes and their config classes.
gcs_source_entry = SourceRegistryEntry(
    connection_config=GcsConnectionConfig,
    indexer_config=GcsIndexerConfig,
    indexer=GcsIndexer,
    downloader_config=GcsDownloaderConfig,
    downloader=GcsDownloader,
)
136
+
137
# Destination registry entry bundling the GCS connection config with the uploader
# class and its config class.
gcs_destination_entry = DestinationRegistryEntry(
    connection_config=GcsConnectionConfig,
    uploader_config=GcsUploaderConfig,
    uploader=GcsUploader,
)
@@ -0,0 +1,164 @@
1
+ import contextlib
2
+ from dataclasses import dataclass, field
3
+ from datetime import datetime
4
+ from pathlib import Path
5
+ from time import time
6
+ from typing import Any, Generator, Optional
7
+
8
+ from unstructured.documents.elements import DataSourceMetadata
9
+
10
+ from unstructured_ingest.enhanced_dataclass import enhanced_field
11
+ from unstructured_ingest.utils.dep_check import requires_dependencies
12
+ from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, UploadContent
13
+ from unstructured_ingest.v2.processes.connector_registry import (
14
+ DestinationRegistryEntry,
15
+ SourceRegistryEntry,
16
+ )
17
+ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
18
+ FsspecAccessConfig,
19
+ FsspecConnectionConfig,
20
+ FsspecDownloader,
21
+ FsspecDownloaderConfig,
22
+ FsspecIndexer,
23
+ FsspecIndexerConfig,
24
+ FsspecUploader,
25
+ FsspecUploaderConfig,
26
+ )
27
+
28
+ CONNECTOR_TYPE = "s3"
29
+
30
+
31
@dataclass
class S3IndexerConfig(FsspecIndexerConfig):
    """Indexer configuration for the S3 connector.

    Adds no fields of its own; everything is inherited from FsspecIndexerConfig.
    """
34
+
35
+
36
@dataclass
class S3AccessConfig(FsspecAccessConfig):
    """Optional credential fields for the S3 connector; each defaults to None."""

    key: Optional[str] = None
    secret: Optional[str] = None
    token: Optional[str] = None
41
+
42
+
43
@dataclass
class S3ConnectionConfig(FsspecConnectionConfig):
    """Connection configuration for the S3 connector."""

    # URL schemes accepted for this connector.
    supported_protocols: list[str] = field(default_factory=lambda: ["s3", "s3a"])
    # Marked sensitive; defaults to an empty S3AccessConfig.
    access_config: S3AccessConfig = enhanced_field(
        sensitive=True,
        default_factory=lambda: S3AccessConfig(),
    )
    endpoint_url: Optional[str] = None
    anonymous: bool = False
    connector_type: str = CONNECTOR_TYPE

    def get_access_config(self) -> dict[str, Any]:
        """Build the access keyword dictionary for this connection.

        Returns:
            A dict that always contains ``"anon"``, contains ``"endpoint_url"``
            when one is set, and is then updated with every truthy entry of
            ``access_config.to_dict()``.
        """
        # Only truthy values are kept, so None and empty strings are both dropped.
        provided = {
            name: value for name, value in self.access_config.to_dict().items() if value
        }

        settings: dict[str, Any] = {"anon": self.anonymous}
        if self.endpoint_url:
            settings["endpoint_url"] = self.endpoint_url
        settings.update(provided)
        return settings
61
+
62
+
63
@dataclass
class S3Indexer(FsspecIndexer):
    """Fsspec indexer bound to the S3 connection and indexer configs."""

    connection_config: S3ConnectionConfig
    index_config: S3IndexerConfig
    connector_type: str = CONNECTOR_TYPE

    def get_metadata(self, path: str) -> DataSourceMetadata:
        """Collect source metadata for the object at ``path``.

        Args:
            path: Object path passed to ``self.fs.modified``, ``self.fs.info``
                and ``self.fs.metadata``.

        Returns:
            A DataSourceMetadata holding timestamps, version, URL and record locator.
        """
        date_created = None
        date_modified = None
        # The filesystem may not implement modified(); the dates then stay None.
        with contextlib.suppress(NotImplementedError):
            modified: Optional[datetime] = self.fs.modified(path)
            if modified:
                # The modification timestamp is used for both date fields.
                stamp = str(modified.timestamp())
                date_created = stamp
                date_modified = stamp

        info: dict[str, Any] = self.fs.info(path)
        etag = info.get("ETag")
        # The ETag is used as the version, with surrounding double quotes removed.
        version = str(etag).strip('"') if etag else None

        metadata: dict[str, str] = {}
        try:
            metadata = self.fs.metadata(path)
        except AttributeError:
            # Keep the empty default when the metadata lookup raises AttributeError.
            pass

        record_locator = {
            "protocol": self.index_config.protocol,
            "remote_file_path": self.index_config.remote_url,
        }
        if metadata:
            record_locator["metadata"] = metadata

        return DataSourceMetadata(
            date_created=date_created,
            date_modified=date_modified,
            date_processed=str(time()),
            version=version,
            url=f"{self.index_config.protocol}://{path}",
            record_locator=record_locator,
        )

    @requires_dependencies(["s3fs", "fsspec"], extras="s3")
    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
        """Delegate to the parent indexer's ``run`` with the same keyword arguments."""
        return super().run(**kwargs)
105
+
106
+
107
@dataclass
class S3DownloaderConfig(FsspecDownloaderConfig):
    """Downloader configuration for the S3 connector.

    Adds no fields of its own; everything is inherited from FsspecDownloaderConfig.
    """
110
+
111
+
112
@dataclass
class S3Downloader(FsspecDownloader):
    """Fsspec downloader bound to the S3 connection and downloader configs."""

    protocol: str = "s3"
    connection_config: S3ConnectionConfig
    connector_type: str = CONNECTOR_TYPE
    download_config: Optional[S3DownloaderConfig] = field(
        default_factory=S3DownloaderConfig,
    )

    @requires_dependencies(["s3fs", "fsspec"], extras="s3")
    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        """Delegate to the parent downloader's synchronous ``run``."""
        return super().run(file_data=file_data, **kwargs)

    @requires_dependencies(["s3fs", "fsspec"], extras="s3")
    async def run_async(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        """Delegate to the parent downloader's ``run_async`` and await its result."""
        return await super().run_async(file_data=file_data, **kwargs)
126
+
127
+
128
@dataclass
class S3UploaderConfig(FsspecUploaderConfig):
    """Uploader configuration for the S3 connector.

    Adds no fields of its own; everything is inherited from FsspecUploaderConfig.
    """
131
+
132
+
133
@dataclass
class S3Uploader(FsspecUploader):
    """Fsspec uploader bound to the S3 connection and uploader configs."""

    connector_type: str = CONNECTOR_TYPE
    connection_config: S3ConnectionConfig
    # NOTE(review): annotated as non-optional but defaults to None; confirm intent.
    upload_config: S3UploaderConfig = field(default=None)

    @requires_dependencies(["s3fs", "fsspec"], extras="s3")
    def __post_init__(self):
        """Run the parent ``__post_init__`` behind the dependency decorator."""
        super().__post_init__()

    @requires_dependencies(["s3fs", "fsspec"], extras="s3")
    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
        """Delegate to the parent uploader's synchronous ``run``."""
        return super().run(contents=contents, **kwargs)

    @requires_dependencies(["s3fs", "fsspec"], extras="s3")
    async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        """Delegate to the parent uploader's ``run_async`` and await its result."""
        return await super().run_async(path=path, file_data=file_data, **kwargs)
150
+
151
+
152
# Source registry entry bundling the S3 connection config with the indexer and
# downloader classes and their config classes.
s3_source_entry = SourceRegistryEntry(
    connection_config=S3ConnectionConfig,
    indexer_config=S3IndexerConfig,
    indexer=S3Indexer,
    downloader_config=S3DownloaderConfig,
    downloader=S3Downloader,
)
159
+
160
# Destination registry entry bundling the S3 connection config with the uploader
# class and its config class.
s3_destination_entry = DestinationRegistryEntry(
    connection_config=S3ConnectionConfig,
    uploader_config=S3UploaderConfig,
    uploader=S3Uploader,
)
@@ -0,0 +1,166 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ from dataclasses import dataclass, field
5
+ from pathlib import Path
6
+ from typing import Any, Generator, Optional
7
+ from urllib.parse import urlparse
8
+
9
+ from unstructured_ingest.enhanced_dataclass import enhanced_field
10
+ from unstructured_ingest.utils.dep_check import requires_dependencies
11
+ from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, UploadContent
12
+ from unstructured_ingest.v2.processes.connector_registry import (
13
+ DestinationRegistryEntry,
14
+ SourceRegistryEntry,
15
+ )
16
+ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
17
+ FsspecAccessConfig,
18
+ FsspecConnectionConfig,
19
+ FsspecDownloader,
20
+ FsspecDownloaderConfig,
21
+ FsspecIndexer,
22
+ FsspecIndexerConfig,
23
+ FsspecUploader,
24
+ FsspecUploaderConfig,
25
+ )
26
+
27
# Identifier shared by every SFTP connector class defined in this module.
CONNECTOR_TYPE = "sftp"
28
+
29
+
30
@dataclass
class SftpIndexerConfig(FsspecIndexerConfig):
    """Indexer configuration for SFTP sources.

    After the parent's post-init has run, ``path_without_protocol`` is
    recomputed from the path component of ``remote_url``: when that path ends
    in something with a file extension, the containing directory is used;
    otherwise the path itself is used. Leading slashes are stripped in both
    cases.
    """

    def __post_init__(self):
        super().__post_init__()
        parsed_url = urlparse(self.remote_url)
        # Look for the extension in the URL *path* only. Running splitext on
        # the whole URL mistakes a dotted hostname with no path
        # (e.g. "sftp://host.example.com") or a dot inside the query/fragment
        # for a file extension.
        _, ext = os.path.splitext(parsed_url.path)
        if ext:
            # The URL points at a file: index its parent directory.
            self.path_without_protocol = Path(parsed_url.path).parent.as_posix().lstrip("/")
        else:
            self.path_without_protocol = parsed_url.path.lstrip("/")
40
+
41
+
42
@dataclass
class SftpAccessConfig(FsspecAccessConfig):
    """Access settings for SFTP; carries the login password."""

    # Password passed through to the SFTP connection settings.
    password: str
45
+
46
+
47
@dataclass
class SftpConnectionConfig(FsspecConnectionConfig):
    """Connection settings for an SFTP server."""

    supported_protocols: list[str] = field(default_factory=lambda: ["sftp"])
    access_config: SftpAccessConfig = enhanced_field(sensitive=True)
    connector_type: str = CONNECTOR_TYPE
    username: Optional[str] = None
    host: Optional[str] = None
    port: int = 22
    look_for_keys: bool = False
    allow_agent: bool = False

    def get_access_config(self) -> dict[str, Any]:
        """Return the connection settings together with the password.

        The mapping holds ``username``, ``host``, ``port``, ``look_for_keys``
        and ``allow_agent`` from this object, followed by ``password`` taken
        from ``access_config``.
        """
        plain_attrs = ("username", "host", "port", "look_for_keys", "allow_agent")
        settings: dict[str, Any] = {attr: getattr(self, attr) for attr in plain_attrs}
        settings["password"] = self.access_config.password
        return settings
68
+
69
+
70
@dataclass
class SftpIndexer(FsspecIndexer):
    """Indexer for SFTP sources.

    Host and port found in the indexer's ``remote_url`` take precedence over
    the values already stored on the connection config, and every yielded
    file gets an identifier prefixed with ``sftp://host:port/``.
    """

    connection_config: SftpConnectionConfig
    index_config: SftpIndexerConfig
    connector_type: str = CONNECTOR_TYPE

    @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
    def __post_init__(self):
        """Copy host/port from the remote URL onto the connection config."""
        url_parts = urlparse(self.index_config.remote_url)
        conn = self.connection_config
        # Fall back to the configured value when the URL omits a component.
        conn.host = url_parts.hostname or conn.host
        conn.port = url_parts.port or conn.port

    @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
        """Yield the parent's files with their identifiers rewritten as SFTP URLs."""
        for entry in super().run(**kwargs):
            host = self.connection_config.host
            port = self.connection_config.port
            entry.identifier = f"sftp://{host}:{port}/{entry.identifier}"
            yield entry
93
+
94
+
95
+ @dataclass
96
class SftpDownloaderConfig(FsspecDownloaderConfig):
    """Downloader configuration for SFTP; ``remote_url`` is effectively required.

    Raises:
        TypeError: from ``__post_init__`` when ``remote_url`` is missing or empty.
    """

    remote_url: Optional[str] = None

    def __post_init__(self):
        # TODO once python3.9 no longer supported and kw_only is allowed in dataclasses, remove:
        if self.remote_url:
            return
        raise TypeError(
            f"{self.__class__.__name__}.__init__() "
            f"missing 1 required positional argument: 'remote_url'"
        )
106
+
107
+
108
@dataclass
class SftpDownloader(FsspecDownloader):
    """Downloader for SFTP sources.

    Host and port found in the download config's ``remote_url`` take
    precedence over the values stored on the connection config; the download
    itself is delegated to ``FsspecDownloader``.
    """

    protocol: str = "sftp"
    connection_config: SftpConnectionConfig
    connector_type: str = CONNECTOR_TYPE
    download_config: Optional[SftpDownloaderConfig] = field(default_factory=SftpDownloaderConfig)

    @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
    def __post_init__(self):
        """Copy host/port from the remote URL onto the connection config."""
        url_parts = urlparse(self.download_config.remote_url)
        conn = self.connection_config
        # Fall back to the configured value when the URL omits a component.
        conn.host = url_parts.hostname or conn.host
        conn.port = url_parts.port or conn.port

    @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        """Forward a synchronous download of ``file_data`` to the parent class."""
        response = super().run(file_data=file_data, **kwargs)
        return response

    @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
    async def run_async(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        """Forward an asynchronous download of ``file_data`` to the parent class."""
        response = await super().run_async(file_data=file_data, **kwargs)
        return response
128
+
129
+
130
@dataclass
class SftpUploaderConfig(FsspecUploaderConfig):
    """Upload configuration for SFTP; adds nothing to ``FsspecUploaderConfig``."""
133
+
134
+
135
@dataclass
class SftpUploader(FsspecUploader):
    """Uploader registered for the SFTP connector.

    Every method simply defers to ``FsspecUploader``; the overrides exist so
    that each entry point carries the ``requires_dependencies`` decorator for
    the ``sftp`` extra.
    """

    connector_type: str = CONNECTOR_TYPE
    connection_config: SftpConnectionConfig
    # None is only a placeholder default here.
    # NOTE(review): presumably the parent __post_init__ rejects a missing
    # upload_config - confirm against FsspecUploader.
    upload_config: SftpUploaderConfig = None

    @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
    def __post_init__(self):
        """Run the parent's post-init hook behind the dependency guard."""
        super().__post_init__()

    @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
        """Forward a synchronous upload of ``contents`` to the parent class."""
        outcome = super().run(contents=contents, **kwargs)
        return outcome

    @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
    async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        """Forward an asynchronous upload of ``path`` to the parent class."""
        outcome = await super().run_async(path=path, file_data=file_data, **kwargs)
        return outcome
152
+
153
+
154
# Registry entry bundling the SFTP classes used when SFTP acts as a source.
sftp_source_entry = SourceRegistryEntry(
    connection_config=SftpConnectionConfig,
    indexer=SftpIndexer,
    indexer_config=SftpIndexerConfig,
    downloader=SftpDownloader,
    downloader_config=SftpDownloaderConfig,
)

# Registry entry bundling the SFTP classes used when SFTP acts as a destination.
sftp_destination_entry = DestinationRegistryEntry(
    connection_config=SftpConnectionConfig,
    uploader=SftpUploader,
    uploader_config=SftpUploaderConfig,
)
@@ -0,0 +1,17 @@
1
+ import json
2
+ from datetime import datetime
3
+ from pathlib import Path
4
+ from typing import Callable
5
+
6
+
7
def json_serial(obj):
    """Convert values the ``json`` module cannot encode on its own.

    Intended as the ``default=`` hook for ``json.dumps``.

    Args:
        obj: The value that the encoder could not serialize.

    Returns:
        The POSIX-style string for a ``Path``, or the ISO 8601 string for a
        ``datetime``.

    Raises:
        TypeError: If ``obj`` is neither a ``Path`` nor a ``datetime``.
    """
    if isinstance(obj, Path):
        converted = obj.as_posix()
    elif isinstance(obj, datetime):
        converted = obj.isoformat()
    else:
        raise TypeError(f"Type {type(obj)} not serializable")
    return converted
13
+
14
+
15
def sterilize_dict(data: dict, default: Callable = json_serial) -> dict:
    """Return ``data`` after a round trip through JSON.

    The dictionary is encoded with ``json.dumps`` and decoded again, so the
    result holds only JSON-native values.

    Args:
        data: The dictionary to convert.
        default: Hook handed to ``json.dumps`` for values it cannot encode.

    Returns:
        The decoded result of the JSON round trip.
    """
    return json.loads(json.dumps(data, default=default))