unstructured-ingest 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (356) hide show
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/__init__.py +14 -0
  4. unstructured_ingest/cli/base/__init__.py +0 -0
  5. unstructured_ingest/cli/base/cmd.py +19 -0
  6. unstructured_ingest/cli/base/dest.py +87 -0
  7. unstructured_ingest/cli/base/src.py +57 -0
  8. unstructured_ingest/cli/cli.py +32 -0
  9. unstructured_ingest/cli/cmd_factory.py +12 -0
  10. unstructured_ingest/cli/cmds/__init__.py +145 -0
  11. unstructured_ingest/cli/cmds/airtable.py +69 -0
  12. unstructured_ingest/cli/cmds/astra.py +99 -0
  13. unstructured_ingest/cli/cmds/azure_cognitive_search.py +65 -0
  14. unstructured_ingest/cli/cmds/biomed.py +52 -0
  15. unstructured_ingest/cli/cmds/chroma.py +104 -0
  16. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  17. unstructured_ingest/cli/cmds/confluence.py +69 -0
  18. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  19. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  20. unstructured_ingest/cli/cmds/discord.py +47 -0
  21. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  22. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  23. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  24. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  25. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  26. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  27. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  28. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  29. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  30. unstructured_ingest/cli/cmds/github.py +54 -0
  31. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  32. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  33. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  34. unstructured_ingest/cli/cmds/jira.py +71 -0
  35. unstructured_ingest/cli/cmds/kafka.py +102 -0
  36. unstructured_ingest/cli/cmds/local.py +43 -0
  37. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  38. unstructured_ingest/cli/cmds/notion.py +48 -0
  39. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  40. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  41. unstructured_ingest/cli/cmds/outlook.py +67 -0
  42. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  43. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  44. unstructured_ingest/cli/cmds/reddit.py +67 -0
  45. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  46. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  47. unstructured_ingest/cli/cmds/slack.py +56 -0
  48. unstructured_ingest/cli/cmds/sql.py +66 -0
  49. unstructured_ingest/cli/cmds/vectara.py +66 -0
  50. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  51. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  52. unstructured_ingest/cli/common.py +7 -0
  53. unstructured_ingest/cli/interfaces.py +656 -0
  54. unstructured_ingest/cli/utils.py +205 -0
  55. unstructured_ingest/connector/__init__.py +0 -0
  56. unstructured_ingest/connector/airtable.py +309 -0
  57. unstructured_ingest/connector/astra.py +237 -0
  58. unstructured_ingest/connector/azure_cognitive_search.py +144 -0
  59. unstructured_ingest/connector/biomed.py +313 -0
  60. unstructured_ingest/connector/chroma.py +158 -0
  61. unstructured_ingest/connector/clarifai.py +122 -0
  62. unstructured_ingest/connector/confluence.py +285 -0
  63. unstructured_ingest/connector/databricks_volumes.py +137 -0
  64. unstructured_ingest/connector/delta_table.py +203 -0
  65. unstructured_ingest/connector/discord.py +180 -0
  66. unstructured_ingest/connector/elasticsearch.py +396 -0
  67. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  68. unstructured_ingest/connector/fsspec/azure.py +78 -0
  69. unstructured_ingest/connector/fsspec/box.py +109 -0
  70. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  71. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  72. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  73. unstructured_ingest/connector/fsspec/s3.py +62 -0
  74. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  75. unstructured_ingest/connector/git.py +124 -0
  76. unstructured_ingest/connector/github.py +173 -0
  77. unstructured_ingest/connector/gitlab.py +142 -0
  78. unstructured_ingest/connector/google_drive.py +349 -0
  79. unstructured_ingest/connector/hubspot.py +278 -0
  80. unstructured_ingest/connector/jira.py +469 -0
  81. unstructured_ingest/connector/kafka.py +294 -0
  82. unstructured_ingest/connector/local.py +139 -0
  83. unstructured_ingest/connector/mongodb.py +285 -0
  84. unstructured_ingest/connector/notion/__init__.py +0 -0
  85. unstructured_ingest/connector/notion/client.py +233 -0
  86. unstructured_ingest/connector/notion/connector.py +468 -0
  87. unstructured_ingest/connector/notion/helpers.py +584 -0
  88. unstructured_ingest/connector/notion/interfaces.py +32 -0
  89. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  90. unstructured_ingest/connector/notion/types/block.py +95 -0
  91. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  92. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  93. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  94. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  95. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  96. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  97. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  98. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  99. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  100. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  101. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  102. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  103. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  104. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  105. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  106. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  107. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  108. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  109. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  110. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  111. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  112. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  113. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  114. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  115. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  116. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  117. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  118. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  119. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  120. unstructured_ingest/connector/notion/types/database.py +72 -0
  121. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  122. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  123. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  124. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  125. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  126. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  127. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  128. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  129. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  130. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  131. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  132. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  133. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  134. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  135. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  136. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  137. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  138. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  139. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  140. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  141. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  142. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  143. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  144. unstructured_ingest/connector/notion/types/date.py +26 -0
  145. unstructured_ingest/connector/notion/types/file.py +51 -0
  146. unstructured_ingest/connector/notion/types/page.py +44 -0
  147. unstructured_ingest/connector/notion/types/parent.py +66 -0
  148. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  149. unstructured_ingest/connector/notion/types/user.py +76 -0
  150. unstructured_ingest/connector/onedrive.py +232 -0
  151. unstructured_ingest/connector/opensearch.py +218 -0
  152. unstructured_ingest/connector/outlook.py +285 -0
  153. unstructured_ingest/connector/pinecone.py +140 -0
  154. unstructured_ingest/connector/qdrant.py +144 -0
  155. unstructured_ingest/connector/reddit.py +166 -0
  156. unstructured_ingest/connector/registry.py +109 -0
  157. unstructured_ingest/connector/salesforce.py +301 -0
  158. unstructured_ingest/connector/sharepoint.py +573 -0
  159. unstructured_ingest/connector/slack.py +224 -0
  160. unstructured_ingest/connector/sql.py +199 -0
  161. unstructured_ingest/connector/vectara.py +248 -0
  162. unstructured_ingest/connector/weaviate.py +190 -0
  163. unstructured_ingest/connector/wikipedia.py +208 -0
  164. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  165. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  166. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  167. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  168. unstructured_ingest/error.py +49 -0
  169. unstructured_ingest/evaluate.py +338 -0
  170. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  171. unstructured_ingest/ingest_backoff/_common.py +102 -0
  172. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  173. unstructured_ingest/interfaces.py +838 -0
  174. unstructured_ingest/logger.py +130 -0
  175. unstructured_ingest/main.py +11 -0
  176. unstructured_ingest/pipeline/__init__.py +22 -0
  177. unstructured_ingest/pipeline/copy.py +19 -0
  178. unstructured_ingest/pipeline/doc_factory.py +12 -0
  179. unstructured_ingest/pipeline/interfaces.py +265 -0
  180. unstructured_ingest/pipeline/partition.py +60 -0
  181. unstructured_ingest/pipeline/permissions.py +12 -0
  182. unstructured_ingest/pipeline/pipeline.py +117 -0
  183. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  184. unstructured_ingest/pipeline/reformat/chunking.py +130 -0
  185. unstructured_ingest/pipeline/reformat/embedding.py +66 -0
  186. unstructured_ingest/pipeline/source.py +77 -0
  187. unstructured_ingest/pipeline/utils.py +6 -0
  188. unstructured_ingest/pipeline/write.py +18 -0
  189. unstructured_ingest/processor.py +93 -0
  190. unstructured_ingest/runner/__init__.py +104 -0
  191. unstructured_ingest/runner/airtable.py +35 -0
  192. unstructured_ingest/runner/astra.py +34 -0
  193. unstructured_ingest/runner/base_runner.py +89 -0
  194. unstructured_ingest/runner/biomed.py +45 -0
  195. unstructured_ingest/runner/confluence.py +35 -0
  196. unstructured_ingest/runner/delta_table.py +34 -0
  197. unstructured_ingest/runner/discord.py +35 -0
  198. unstructured_ingest/runner/elasticsearch.py +40 -0
  199. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  200. unstructured_ingest/runner/fsspec/azure.py +30 -0
  201. unstructured_ingest/runner/fsspec/box.py +28 -0
  202. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  203. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  204. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  205. unstructured_ingest/runner/fsspec/s3.py +28 -0
  206. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  207. unstructured_ingest/runner/github.py +37 -0
  208. unstructured_ingest/runner/gitlab.py +37 -0
  209. unstructured_ingest/runner/google_drive.py +35 -0
  210. unstructured_ingest/runner/hubspot.py +35 -0
  211. unstructured_ingest/runner/jira.py +35 -0
  212. unstructured_ingest/runner/kafka.py +34 -0
  213. unstructured_ingest/runner/local.py +23 -0
  214. unstructured_ingest/runner/mongodb.py +34 -0
  215. unstructured_ingest/runner/notion.py +61 -0
  216. unstructured_ingest/runner/onedrive.py +35 -0
  217. unstructured_ingest/runner/opensearch.py +40 -0
  218. unstructured_ingest/runner/outlook.py +33 -0
  219. unstructured_ingest/runner/reddit.py +35 -0
  220. unstructured_ingest/runner/salesforce.py +33 -0
  221. unstructured_ingest/runner/sharepoint.py +35 -0
  222. unstructured_ingest/runner/slack.py +33 -0
  223. unstructured_ingest/runner/utils.py +47 -0
  224. unstructured_ingest/runner/wikipedia.py +35 -0
  225. unstructured_ingest/runner/writers/__init__.py +48 -0
  226. unstructured_ingest/runner/writers/astra.py +22 -0
  227. unstructured_ingest/runner/writers/azure_cognitive_search.py +24 -0
  228. unstructured_ingest/runner/writers/base_writer.py +26 -0
  229. unstructured_ingest/runner/writers/chroma.py +22 -0
  230. unstructured_ingest/runner/writers/clarifai.py +19 -0
  231. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  232. unstructured_ingest/runner/writers/delta_table.py +24 -0
  233. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  234. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  235. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  236. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  237. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  238. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  239. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  240. unstructured_ingest/runner/writers/kafka.py +21 -0
  241. unstructured_ingest/runner/writers/mongodb.py +21 -0
  242. unstructured_ingest/runner/writers/opensearch.py +26 -0
  243. unstructured_ingest/runner/writers/pinecone.py +21 -0
  244. unstructured_ingest/runner/writers/qdrant.py +19 -0
  245. unstructured_ingest/runner/writers/sql.py +22 -0
  246. unstructured_ingest/runner/writers/vectara.py +22 -0
  247. unstructured_ingest/runner/writers/weaviate.py +21 -0
  248. unstructured_ingest/utils/__init__.py +0 -0
  249. unstructured_ingest/utils/compression.py +117 -0
  250. unstructured_ingest/utils/data_prep.py +112 -0
  251. unstructured_ingest/utils/dep_check.py +66 -0
  252. unstructured_ingest/utils/string_and_date_utils.py +39 -0
  253. unstructured_ingest/utils/table.py +73 -0
  254. unstructured_ingest/v2/__init__.py +1 -0
  255. unstructured_ingest/v2/cli/__init__.py +0 -0
  256. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  257. unstructured_ingest/v2/cli/base/cmd.py +215 -0
  258. unstructured_ingest/v2/cli/base/dest.py +76 -0
  259. unstructured_ingest/v2/cli/base/importer.py +34 -0
  260. unstructured_ingest/v2/cli/base/src.py +70 -0
  261. unstructured_ingest/v2/cli/cli.py +24 -0
  262. unstructured_ingest/v2/cli/cmds/__init__.py +87 -0
  263. unstructured_ingest/v2/cli/cmds/astra.py +85 -0
  264. unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +72 -0
  265. unstructured_ingest/v2/cli/cmds/chroma.py +108 -0
  266. unstructured_ingest/v2/cli/cmds/databricks_volumes.py +161 -0
  267. unstructured_ingest/v2/cli/cmds/elasticsearch.py +159 -0
  268. unstructured_ingest/v2/cli/cmds/fsspec/__init__.py +0 -0
  269. unstructured_ingest/v2/cli/cmds/fsspec/azure.py +84 -0
  270. unstructured_ingest/v2/cli/cmds/fsspec/box.py +58 -0
  271. unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +58 -0
  272. unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +77 -0
  273. unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +81 -0
  274. unstructured_ingest/v2/cli/cmds/fsspec/s3.py +84 -0
  275. unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +80 -0
  276. unstructured_ingest/v2/cli/cmds/google_drive.py +74 -0
  277. unstructured_ingest/v2/cli/cmds/local.py +60 -0
  278. unstructured_ingest/v2/cli/cmds/mongodb.py +62 -0
  279. unstructured_ingest/v2/cli/cmds/onedrive.py +91 -0
  280. unstructured_ingest/v2/cli/cmds/opensearch.py +93 -0
  281. unstructured_ingest/v2/cli/cmds/pinecone.py +62 -0
  282. unstructured_ingest/v2/cli/cmds/salesforce.py +79 -0
  283. unstructured_ingest/v2/cli/cmds/sharepoint.py +112 -0
  284. unstructured_ingest/v2/cli/cmds/singlestore.py +96 -0
  285. unstructured_ingest/v2/cli/cmds/sql.py +84 -0
  286. unstructured_ingest/v2/cli/cmds/weaviate.py +100 -0
  287. unstructured_ingest/v2/cli/configs/__init__.py +6 -0
  288. unstructured_ingest/v2/cli/configs/chunk.py +89 -0
  289. unstructured_ingest/v2/cli/configs/embed.py +74 -0
  290. unstructured_ingest/v2/cli/configs/partition.py +99 -0
  291. unstructured_ingest/v2/cli/configs/processor.py +88 -0
  292. unstructured_ingest/v2/cli/interfaces.py +27 -0
  293. unstructured_ingest/v2/cli/utils.py +240 -0
  294. unstructured_ingest/v2/example.py +37 -0
  295. unstructured_ingest/v2/interfaces/__init__.py +29 -0
  296. unstructured_ingest/v2/interfaces/connector.py +32 -0
  297. unstructured_ingest/v2/interfaces/downloader.py +79 -0
  298. unstructured_ingest/v2/interfaces/file_data.py +49 -0
  299. unstructured_ingest/v2/interfaces/indexer.py +28 -0
  300. unstructured_ingest/v2/interfaces/process.py +20 -0
  301. unstructured_ingest/v2/interfaces/processor.py +48 -0
  302. unstructured_ingest/v2/interfaces/upload_stager.py +48 -0
  303. unstructured_ingest/v2/interfaces/uploader.py +39 -0
  304. unstructured_ingest/v2/logger.py +126 -0
  305. unstructured_ingest/v2/main.py +11 -0
  306. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  307. unstructured_ingest/v2/pipeline/interfaces.py +167 -0
  308. unstructured_ingest/v2/pipeline/pipeline.py +284 -0
  309. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  310. unstructured_ingest/v2/pipeline/steps/chunk.py +85 -0
  311. unstructured_ingest/v2/pipeline/steps/download.py +124 -0
  312. unstructured_ingest/v2/pipeline/steps/embed.py +84 -0
  313. unstructured_ingest/v2/pipeline/steps/index.py +61 -0
  314. unstructured_ingest/v2/pipeline/steps/partition.py +78 -0
  315. unstructured_ingest/v2/pipeline/steps/stage.py +64 -0
  316. unstructured_ingest/v2/pipeline/steps/uncompress.py +68 -0
  317. unstructured_ingest/v2/pipeline/steps/upload.py +73 -0
  318. unstructured_ingest/v2/pipeline/utils.py +15 -0
  319. unstructured_ingest/v2/processes/__init__.py +0 -0
  320. unstructured_ingest/v2/processes/chunker.py +97 -0
  321. unstructured_ingest/v2/processes/connector_registry.py +63 -0
  322. unstructured_ingest/v2/processes/connectors/__init__.py +77 -0
  323. unstructured_ingest/v2/processes/connectors/astra.py +152 -0
  324. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +211 -0
  325. unstructured_ingest/v2/processes/connectors/chroma.py +204 -0
  326. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +96 -0
  327. unstructured_ingest/v2/processes/connectors/elasticsearch.py +401 -0
  328. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  329. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +144 -0
  330. unstructured_ingest/v2/processes/connectors/fsspec/box.py +131 -0
  331. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +130 -0
  332. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +342 -0
  333. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +141 -0
  334. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +164 -0
  335. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +166 -0
  336. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  337. unstructured_ingest/v2/processes/connectors/google_drive.py +335 -0
  338. unstructured_ingest/v2/processes/connectors/local.py +204 -0
  339. unstructured_ingest/v2/processes/connectors/mongodb.py +138 -0
  340. unstructured_ingest/v2/processes/connectors/onedrive.py +216 -0
  341. unstructured_ingest/v2/processes/connectors/opensearch.py +155 -0
  342. unstructured_ingest/v2/processes/connectors/pinecone.py +178 -0
  343. unstructured_ingest/v2/processes/connectors/salesforce.py +293 -0
  344. unstructured_ingest/v2/processes/connectors/sharepoint.py +412 -0
  345. unstructured_ingest/v2/processes/connectors/singlestore.py +160 -0
  346. unstructured_ingest/v2/processes/connectors/sql.py +269 -0
  347. unstructured_ingest/v2/processes/connectors/utils.py +19 -0
  348. unstructured_ingest/v2/processes/connectors/weaviate.py +235 -0
  349. unstructured_ingest/v2/processes/embedder.py +76 -0
  350. unstructured_ingest/v2/processes/partitioner.py +166 -0
  351. unstructured_ingest/v2/processes/uncompress.py +43 -0
  352. unstructured_ingest-0.0.0.dist-info/METADATA +319 -0
  353. unstructured_ingest-0.0.0.dist-info/RECORD +356 -0
  354. unstructured_ingest-0.0.0.dist-info/WHEEL +5 -0
  355. unstructured_ingest-0.0.0.dist-info/entry_points.txt +2 -0
  356. unstructured_ingest-0.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,160 @@
1
+ """
2
+ Dropbox Connector
3
+ The Dropbox Connector presents a few unusual situations.
4
+ 1) Dropbox does not offer a non-expiring token.
5
+ 2) Dropbox requires a forward slash `/` in front of the remote_file_path. This presents
6
+ some real problems when creating paths: appending a path that begins with a
7
+ forward slash to any path, whether using the / shorthand or joinpath, causes the
8
+ starting path to disappear. So the `/` needs to be stripped off.
9
+ 3) To list and get files from the Dropbox root directory, pass "", " ", or " /" as the path.
10
+ """
11
+
12
+ import re
13
+ from dataclasses import dataclass
14
+ from pathlib import Path
15
+ from typing import Type
16
+
17
+ from unstructured_ingest.connector.fsspec.fsspec import (
18
+ FsspecDestinationConnector,
19
+ FsspecIngestDoc,
20
+ FsspecSourceConnector,
21
+ FsspecWriteConfig,
22
+ SimpleFsspecConfig,
23
+ )
24
+ from unstructured_ingest.enhanced_dataclass import enhanced_field
25
+ from unstructured_ingest.error import SourceConnectionError
26
+ from unstructured_ingest.interfaces import AccessConfig
27
+ from unstructured_ingest.logger import logger
28
+ from unstructured_ingest.utils.dep_check import requires_dependencies
29
+
30
+
31
class MissingFolderError(Exception):
    """Raised when the requested Dropbox folder cannot be found.

    To address the Dropbox root, try the URL `dropbox:// /`.
    """
33
+
34
+
35
@dataclass
class DropboxAccessConfig(AccessConfig):
    """Access settings for the Dropbox connector."""

    # Declared through enhanced_field with sensitive=True, i.e. flagged as a
    # secret value by the enhanced dataclass machinery.
    token: str = enhanced_field(sensitive=True)
38
+
39
+
40
@dataclass
class DropboxWriteConfig(FsspecWriteConfig):
    """Write settings for the Dropbox destination; adds nothing to FsspecWriteConfig."""
43
+
44
+
45
@dataclass
class SimpleDropboxConfig(SimpleFsspecConfig):
    """Fsspec connector configuration specialised with Dropbox access settings."""

    # Defaults to None, so the field may be unset until a config is supplied.
    access_config: DropboxAccessConfig = None
48
+
49
+
50
@dataclass
class DropboxIngestDoc(FsspecIngestDoc):
    """Ingest doc for a single Dropbox file.

    Dropbox requires a forward slash at the front of the folder path, which
    complicates path joining: joining a path that begins with `/` onto another
    path discards the starting path. The local download path and the output
    path are therefore built from the remote path with its leading folder
    prefix removed.
    """

    connector_config: SimpleDropboxConfig
    registry_name: str = "dropbox"

    @SourceConnectionError.wrap
    @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
    def get_file(self):
        """Fetch the file by delegating to the parent class implementation."""
        super().get_file()

    def _path_relative_to_dir(self) -> str:
        """Return `remote_file_path` with the configured folder prefix removed.

        Dropbox uses an empty string `""`, a space `" "` or `" /"` to list the
        root. When `dir_path` is a single space only the leading `/` is
        dropped. Otherwise the leading `/<dir_path>/` is dropped.

        Only the leading prefix is removed. Replacing every occurrence would
        corrupt paths containing a nested folder with the same name as
        `dir_path` (e.g. `/a/b/a/c` with `dir_path="a"` would become `bc`).
        """
        remote_path = self.remote_file_path
        dir_path = self.connector_config.dir_path
        if dir_path == " ":
            return re.sub("^/", "", remote_path)
        prefix = f"/{dir_path}/"
        if remote_path.startswith(prefix):
            return remote_path[len(prefix):]
        return remote_path

    @property
    def _output_filename(self):
        """Path under the processor output directory for this doc's `.json` result."""
        return Path(self.processor_config.output_dir) / f"{self._path_relative_to_dir()}.json"

    def _tmp_download_file(self):
        """Return the local download path, or `""` when no download dir is configured."""
        download_dir: str = self.read_config.download_dir if self.read_config.download_dir else ""
        if not download_dir:
            return ""
        return Path(download_dir) / self._path_relative_to_dir()
95
+
96
+
97
@dataclass
class DropboxSourceConnector(FsspecSourceConnector):
    """Source connector that lists files from a Dropbox folder through fsspec.

    Dropbox requires a forward slash at the front of the folder path, so every
    listing call prefixes `path_without_protocol` with `/`.
    """

    connector_config: SimpleDropboxConfig

    def __post_init__(self):
        # Ingest docs produced by this connector are DropboxIngestDoc instances.
        self.ingest_doc_cls: Type[DropboxIngestDoc] = DropboxIngestDoc

    @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
    def initialize(self):
        """Create the filesystem client and check that the target folder has content.

        Raises:
            SourceConnectionError: creating the filesystem or listing the folder failed.
            ValueError: the listing succeeded but returned no entries.
            MissingFolderError: the listing returned `None`.
        """
        from fsspec import AbstractFileSystem, get_filesystem_class

        try:
            self.fs: AbstractFileSystem = get_filesystem_class(self.connector_config.protocol)(
                **self.connector_config.get_access_config(),
            )
            # Dropbox requires a forward slash at the front of the folder path.
            ls_output = self.fs.ls(f"/{self.connector_config.path_without_protocol}")
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            # Chain the original exception so its traceback is preserved.
            raise SourceConnectionError(f"failed to validate connection: {e}") from e
        if ls_output:
            return
        # An empty (but present) listing means the folder has no objects. This
        # branch was previously unreachable, so empty folders were reported as
        # missing folders.
        if ls_output is not None:
            raise ValueError(
                f"No objects found in {self.connector_config.remote_url}.",
            )
        raise MissingFolderError(
            "There is no folder by that name. For root try `dropbox:// /`",
        )

    def _list_files(self):
        """Return the remote paths of all entries that have a truthy `size`.

        Directories listed in cloud storage can cause problems because they are
        seen as 0-byte files, so entries without a size are dropped.
        """
        # Dropbox requires a forward slash at the front of the folder path.
        remote_root = f"/{self.connector_config.path_without_protocol}"
        if not self.connector_config.recursive:
            # fs.ls does not walk directories
            return [
                entry.get("name")
                for entry in self.fs.ls(remote_root, detail=True)
                if entry.get("size")
            ]
        # fs.find will recursively walk directories
        # "size" is a common key for all the cloud protocols with fs
        return [
            path
            for path, info in self.fs.find(remote_root, detail=True).items()
            if info.get("size")
        ]
155
+
156
+
157
@dataclass
class DropboxDestinationConnector(FsspecDestinationConnector):
    """Destination connector for Dropbox built on FsspecDestinationConnector.

    Declares only its configuration fields; no methods are overridden here.
    """

    # NOTE(review): typed as the generic SimpleFsspecConfig, whereas
    # DropboxSourceConnector uses SimpleDropboxConfig -- confirm this is intended.
    connector_config: SimpleFsspecConfig
    write_config: DropboxWriteConfig
@@ -0,0 +1,359 @@
1
+ import fnmatch
2
+ import json
3
+ import os
4
+ import typing as t
5
+ from abc import ABC
6
+ from contextlib import suppress
7
+ from dataclasses import dataclass
8
+ from pathlib import Path, PurePath
9
+
10
+ from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
11
+ from unstructured_ingest.error import (
12
+ DestinationConnectionError,
13
+ SourceConnectionError,
14
+ SourceConnectionNetworkError,
15
+ )
16
+ from unstructured_ingest.interfaces import (
17
+ BaseConnectorConfig,
18
+ BaseDestinationConnector,
19
+ BaseSingleIngestDoc,
20
+ BaseSourceConnector,
21
+ FsspecConfig,
22
+ IngestDocCleanupMixin,
23
+ SourceConnectorCleanupMixin,
24
+ SourceMetadata,
25
+ WriteConfig,
26
+ )
27
+ from unstructured_ingest.logger import logger
28
+ from unstructured_ingest.utils.compression import (
29
+ TAR_FILE_EXT,
30
+ ZIP_FILE_EXT,
31
+ CompressionSourceConnectorMixin,
32
+ )
33
+ from unstructured_ingest.utils.dep_check import (
34
+ requires_dependencies,
35
+ )
36
+
37
# Protocol prefixes of the remote filesystems handled by the fsspec connectors,
# grouped by storage provider.
SUPPORTED_REMOTE_FSSPEC_PROTOCOLS = [
    *("s3", "s3a"),
    *("abfs", "az"),
    *("gs", "gcs"),
    "box",
    "dropbox",
    "sftp",
]
48
+
49
+
50
@dataclass
class SimpleFsspecConfig(FsspecConfig, BaseConnectorConfig):
    """Connector config combining ``FsspecConfig`` with ``BaseConnectorConfig``.

    Adds no fields or behaviour of its own.
    """
53
+
54
+
55
@dataclass
class FsspecIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
    """Class encapsulating fetching a doc and writing processed results (but not
    doing the processing!).

    Also includes a cleanup method. When things go wrong and the cleanup
    method is not called, the file is left behind on the filesystem to assist debugging.
    """

    connector_config: SimpleFsspecConfig
    remote_file_path: str

    def _tmp_download_file(self):
        """Return the local path the remote file is downloaded to.

        The path is rooted at ``read_config.download_dir`` (an empty, i.e.
        relative, root when unset) and is the remote path with the
        ``"<dir_path>/"`` substring removed.
        """
        download_dir = self.read_config.download_dir if self.read_config.download_dir else ""
        return Path(download_dir) / self.remote_file_path.replace(
            f"{self.connector_config.dir_path}/",
            "",
        )

    @property
    def _output_filename(self):
        """Local path of the ``.json`` output for this doc, under ``output_dir``."""
        # Dynamically parse filename, can change if remote path was pointing to the single
        # file, a directory, or nested directory
        if self.remote_file_path == self.connector_config.path_without_protocol:
            # The remote url points directly at this file: keep only its basename.
            file = self.remote_file_path.split("/")[-1]
            filename = f"{file}.json"
        else:
            # The remote url points at a directory: keep the path relative to it.
            path_without_protocol = (
                self.connector_config.path_without_protocol
                if self.connector_config.path_without_protocol.endswith("/")
                else f"{self.connector_config.path_without_protocol}/"
            )
            filename = f"{self.remote_file_path.replace(path_without_protocol, '')}.json"
        return Path(self.processor_config.output_dir) / filename

    def _create_full_tmp_dir_path(self):
        """Includes "directories" in the object path"""
        self._tmp_download_file().parent.mkdir(parents=True, exist_ok=True)

    @SourceConnectionError.wrap
    @BaseSingleIngestDoc.skip_if_file_exists
    def get_file(self):
        """Fetches the file from the current filesystem and stores it locally."""
        from fsspec import AbstractFileSystem, get_filesystem_class

        self._create_full_tmp_dir_path()
        fs: AbstractFileSystem = get_filesystem_class(self.connector_config.protocol)(
            **self.connector_config.get_access_config(),
        )
        # Download exactly once, through _get_file. The previous code followed this
        # call with a second, identical fs.get to the same local path, which only
        # repeated the transfer.
        self._get_file(fs=fs)
        self.update_source_metadata()

    @SourceConnectionNetworkError.wrap
    def _get_file(self, fs):
        """Copy the remote file to the local download path using ``fs``."""
        fs.get(rpath=self.remote_file_path, lpath=self._tmp_download_file().as_posix())

    @requires_dependencies(["fsspec"])
    def update_source_metadata(self):
        """Populate ``self.source_metadata`` from the remote filesystem.

        Creation and modification dates are left as ``None`` when the
        filesystem implementation raises ``NotImplementedError`` for them.
        """
        from fsspec import AbstractFileSystem, get_filesystem_class

        fs: AbstractFileSystem = get_filesystem_class(self.connector_config.protocol)(
            **self.connector_config.get_access_config(),
        )

        date_created = None
        with suppress(NotImplementedError):
            date_created = fs.created(self.remote_file_path).isoformat()

        date_modified = None
        with suppress(NotImplementedError):
            date_modified = fs.modified(self.remote_file_path).isoformat()

        # For the "gs" protocol the version is taken from the object's etag
        # instead of fs.checksum.
        version = (
            fs.checksum(self.remote_file_path)
            if self.connector_config.protocol != "gs"
            else fs.info(self.remote_file_path).get("etag", "")
        )
        file_exists = fs.exists(self.remote_file_path)
        self.source_metadata = SourceMetadata(
            date_created=date_created,
            date_modified=date_modified,
            version=str(version),
            source_url=f"{self.connector_config.protocol}://{self.remote_file_path}",
            exists=file_exists,
        )

    @property
    def filename(self):
        """The filename of the file after downloading from cloud"""
        return self._tmp_download_file()

    @property
    def record_locator(self) -> t.Optional[t.Dict[str, t.Any]]:
        """Returns the equivalent of ls in dict"""
        return {
            "protocol": self.connector_config.protocol,
            "remote_file_path": self.remote_file_path,
        }
154
+
155
+
156
@dataclass
class FsspecSourceConnector(
    SourceConnectorCleanupMixin,
    CompressionSourceConnectorMixin,
    BaseSourceConnector,
):
    """Objects of this class support fetching document(s) from a remote
    filesystem accessed through fsspec."""

    connector_config: SimpleFsspecConfig

    def check_connection(self):
        """Validate the connection by listing the configured path.

        Raises:
            SourceConnectionError: if creating the filesystem or listing fails.
        """
        from fsspec import get_filesystem_class

        try:
            fs = get_filesystem_class(self.connector_config.protocol)(
                **self.connector_config.get_access_config(),
            )
            fs.ls(path=self.connector_config.path_without_protocol, detail=False)
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            # Chain explicitly so the original failure is kept as the cause.
            raise SourceConnectionError(f"failed to validate connection: {e}") from e

    def __post_init__(self):
        self.ingest_doc_cls: t.Type[FsspecIngestDoc] = FsspecIngestDoc

    def initialize(self):
        """Verify that can get metadata for an object, validates connections info.

        Creates ``self.fs`` and lists the configured path.

        Raises:
            ValueError: if the listing of the configured path is empty.
        """
        from fsspec import AbstractFileSystem, get_filesystem_class

        self.fs: AbstractFileSystem = get_filesystem_class(self.connector_config.protocol)(
            **self.connector_config.get_access_config(),
        )

        ls_output = self.fs.ls(self.connector_config.path_without_protocol, detail=False)
        if not ls_output:
            raise ValueError(
                f"No objects found in {self.connector_config.remote_url}.",
            )

    def _list_files(self):
        """Return the paths of all entries with a positive size under the configured path.

        Entries whose ``"size"`` is ``None`` or missing are treated as size 0
        and skipped instead of raising ``TypeError`` on the comparison.
        """
        if not self.connector_config.recursive:
            # fs.ls does not walk directories
            # directories that are listed in cloud storage can cause problems
            # because they are seen as 0 byte files
            return [
                x.get("name")
                for x in self.fs.ls(self.connector_config.path_without_protocol, detail=True)
                if (x.get("size") or 0) > 0
            ]
        else:
            # fs.find will recursively walk directories
            # "size" is a common key for all the cloud protocols with fs
            return [
                k
                for k, v in self.fs.find(
                    self.connector_config.path_without_protocol,
                    detail=True,
                ).items()
                if (v.get("size") or 0) > 0
            ]

    def does_path_match_glob(self, path: str) -> bool:
        """Return True when no glob is configured or ``path`` matches any configured glob."""
        if self.connector_config.file_glob is None:
            return True
        patterns = self.connector_config.file_glob
        for pattern in patterns:
            if fnmatch.filter([path], pattern):
                return True
        logger.debug(f"The file {path!r} is discarded as it does not match any given glob.")
        return False

    def get_ingest_docs(self):
        """Build the ingest docs for every listed file that passes the glob filter.

        Compressed files are only expanded (via ``process_compressed_doc``)
        when ``connector_config.uncompress`` is set; otherwise they are skipped.
        """
        raw_files = self._list_files()
        # If glob filters provided, use to filter on filepaths
        files = [f for f in raw_files if self.does_path_match_glob(f)]
        # remove compressed files
        compressed_file_ext = TAR_FILE_EXT + ZIP_FILE_EXT
        compressed_files = []
        uncompressed_files = []
        docs: t.List[BaseSingleIngestDoc] = []
        for file in files:
            if any(file.endswith(ext) for ext in compressed_file_ext):
                compressed_files.append(file)
            else:
                uncompressed_files.append(file)
        docs.extend(
            [
                self.ingest_doc_cls(
                    read_config=self.read_config,
                    connector_config=self.connector_config,
                    processor_config=self.processor_config,
                    remote_file_path=file,
                )
                for file in uncompressed_files
            ],
        )
        if not self.connector_config.uncompress:
            return docs
        for compressed_file in compressed_files:
            compressed_doc = self.ingest_doc_cls(
                read_config=self.read_config,
                processor_config=self.processor_config,
                connector_config=self.connector_config,
                remote_file_path=compressed_file,
            )
            try:
                local_ingest_docs = self.process_compressed_doc(doc=compressed_doc)
                logger.info(f"adding {len(local_ingest_docs)} from {compressed_file}")
                docs.extend(local_ingest_docs)
            finally:
                # Always clean up the archive doc's file, even if processing failed.
                compressed_doc.cleanup_file()
        return docs
268
+
269
+
270
@dataclass
class WriteTextConfig(EnhancedDataClassJsonMixin, ABC):
    """Abstract base for write-text option dataclasses; declares no fields itself."""
273
+
274
+
275
@dataclass
class FsspecWriteConfig(WriteConfig):
    """Write config for fsspec destinations, with optional write-text settings."""

    write_text_config: t.Optional[WriteTextConfig] = None

    def get_write_text_config(self) -> t.Dict[str, t.Any]:
        """Return the write-text settings as a dict, or ``{}`` when none are set."""
        config = self.write_text_config
        if not config:
            return {}
        return config.to_dict()
283
+
284
+
285
@dataclass
class FsspecDestinationConnector(BaseDestinationConnector):
    """Destination connector that writes JSON results to a filesystem through fsspec."""

    connector_config: SimpleFsspecConfig
    write_config: FsspecWriteConfig

    def initialize(self):
        """Create ``self.fs`` and validate the connection."""
        from fsspec import AbstractFileSystem, get_filesystem_class

        self.fs: AbstractFileSystem = get_filesystem_class(self.connector_config.protocol)(
            **self.connector_config.get_access_config(),
        )
        self.check_connection()

    def check_connection(self):
        """Validate the connection by listing the first component of the destination path.

        Raises:
            DestinationConnectionError: if creating the filesystem or listing fails.
        """
        from fsspec import AbstractFileSystem, get_filesystem_class

        try:
            fs: AbstractFileSystem = get_filesystem_class(self.connector_config.protocol)(
                **self.connector_config.get_access_config(),
            )

            # e.g. Dropbox path starts with /
            bucket_name = "/" if self.connector_config.path_without_protocol.startswith("/") else ""
            bucket_name += self.connector_config.dir_path.split("/")[0]

            logger.info(f"checking connection for destination {bucket_name}")
            fs.ls(path=bucket_name, detail=False)
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            # Chain explicitly so the original failure is kept as the cause.
            raise DestinationConnectionError(f"failed to validate connection: {e}") from e

    def write_dict(
        self,
        *args,
        elements_dict: t.List[t.Dict[str, t.Any]],
        filename: t.Optional[str] = None,
        indent: int = 4,
        encoding: str = "utf-8",
        **kwargs,
    ) -> None:
        """Serialize ``elements_dict`` to JSON and write it to the destination.

        Args:
            elements_dict: the elements to serialize.
            filename: path below the destination folder; when falsy the
                destination path itself is written to.
            indent: indentation passed to ``json.dumps``.
            encoding: text encoding passed to ``fs.write_text``.

        Extra positional and keyword arguments are accepted and ignored.
        """
        from fsspec import AbstractFileSystem, get_filesystem_class

        fs: AbstractFileSystem = get_filesystem_class(self.connector_config.protocol)(
            **self.connector_config.get_access_config(),
        )

        logger.info(f"Writing content using filesystem: {type(fs).__name__}")

        # The former single-argument os.path.join(output_folder) returned its
        # argument unchanged, so it has been dropped; PurePath inserts the
        # separator between folder and filename below.
        output_folder = self.connector_config.path_without_protocol
        filename = (
            filename.strip(os.sep) if filename else filename
        )  # Make sure filename doesn't begin or end with the file separator
        output_path = str(PurePath(output_folder, filename)) if filename else output_folder
        full_output_path = f"{self.connector_config.protocol}://{output_path}"
        logger.debug(f"uploading content to {full_output_path}")
        write_text_configs = self.write_config.get_write_text_config() if self.write_config else {}
        fs.write_text(
            full_output_path,
            json.dumps(elements_dict, indent=indent),
            encoding=encoding,
            **write_text_configs,
        )

    def get_elements_dict(self, docs: t.List[BaseSingleIngestDoc]) -> t.List[t.Dict[str, t.Any]]:
        # Intentionally a no-op here: returns None despite the annotation.
        pass

    def write(self, docs: t.List[BaseSingleIngestDoc]) -> None:
        """Upload the local JSON output of every doc via ``write_dict``."""
        for doc in docs:
            file_path = doc.base_output_filename
            filename = file_path if file_path else None
            # Read as UTF-8 explicitly so parsing does not depend on the
            # platform's default text encoding.
            with open(doc._output_filename, encoding="utf-8") as json_file:
                logger.debug(f"uploading content from {doc._output_filename}")
                json_list = json.load(json_file)
                self.write_dict(elements_dict=json_list, filename=filename)
@@ -0,0 +1,82 @@
1
+ import typing as t
2
+ from dataclasses import dataclass
3
+ from pathlib import Path
4
+ from typing import Type
5
+
6
+ from unstructured_ingest.connector.fsspec.fsspec import (
7
+ FsspecDestinationConnector,
8
+ FsspecIngestDoc,
9
+ FsspecSourceConnector,
10
+ FsspecWriteConfig,
11
+ SimpleFsspecConfig,
12
+ )
13
+ from unstructured_ingest.enhanced_dataclass import enhanced_field
14
+ from unstructured_ingest.error import SourceConnectionError
15
+ from unstructured_ingest.interfaces import AccessConfig
16
+ from unstructured_ingest.utils.dep_check import requires_dependencies
17
+ from unstructured_ingest.utils.string_and_date_utils import json_to_dict
18
+
19
+
20
@dataclass
class GcsAccessConfig(AccessConfig):
    """Access config for GCS holding the (sensitive) auth token."""

    token: t.Optional[str] = enhanced_field(
        default=None, sensitive=True, overload_name="service_account_key"
    )

    def __post_init__(self):
        """Validate ``token`` and replace it with a dict when it is JSON.

        Accepted values: a falsy value, one of the auth constants below, a
        JSON string that parses to a dict, or the path of an existing file.

        Raises:
            ValueError: if the token matches none of the accepted forms.
        """
        ALLOWED_AUTH_VALUES = "google_default", "cache", "anon", "browser", "cloud"

        # Case: null value
        if not self.token:
            return
        # Case: one of auth constants
        if self.token in ALLOWED_AUTH_VALUES:
            return
        # Case: token as json (parsed once and reused)
        parsed_token = json_to_dict(self.token)
        if isinstance(parsed_token, dict):
            self.token = parsed_token
            return
        # Case: path to token
        if Path(self.token).is_file():
            return

        raise ValueError("Invalid auth token value")
44
+
45
+
46
@dataclass
class GcsWriteConfig(FsspecWriteConfig):
    """Write config for GCS; identical to ``FsspecWriteConfig``."""
49
+
50
+
51
@dataclass
class SimpleGcsConfig(SimpleFsspecConfig):
    """Fsspec connector config whose access config is a ``GcsAccessConfig``."""

    # Defaults to None, i.e. no access config supplied.
    access_config: GcsAccessConfig = None
54
+
55
+
56
@dataclass
class GcsIngestDoc(FsspecIngestDoc):
    """Ingest doc for GCS; defers the download to ``FsspecIngestDoc.get_file``."""

    connector_config: SimpleGcsConfig
    registry_name: str = "gcs"

    @SourceConnectionError.wrap
    @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
    def get_file(self):
        """Run the parent download behind the gcsfs/fsspec dependency check."""
        super().get_file()
65
+
66
+
67
@dataclass
class GcsSourceConnector(FsspecSourceConnector):
    """Source connector for GCS that produces ``GcsIngestDoc`` objects."""

    connector_config: SimpleGcsConfig

    def __post_init__(self):
        # Select the doc class instantiated for each listed file.
        self.ingest_doc_cls: Type[GcsIngestDoc] = GcsIngestDoc

    @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
    def initialize(self):
        """Run the parent initialization behind the gcsfs/fsspec dependency check."""
        super().initialize()
77
+
78
+
79
@dataclass
class GcsDestinationConnector(FsspecDestinationConnector):
    """Fsspec destination connector specialised with the GCS config types."""

    # GCS connection settings.
    connector_config: SimpleGcsConfig
    # GCS write options.
    write_config: GcsWriteConfig
@@ -0,0 +1,62 @@
1
+ import typing as t
2
+ from dataclasses import dataclass
3
+ from typing import Type
4
+
5
+ from unstructured_ingest.connector.fsspec.fsspec import (
6
+ FsspecDestinationConnector,
7
+ FsspecIngestDoc,
8
+ FsspecSourceConnector,
9
+ FsspecWriteConfig,
10
+ SimpleFsspecConfig,
11
+ )
12
+ from unstructured_ingest.enhanced_dataclass import enhanced_field
13
+ from unstructured_ingest.interfaces import AccessConfig
14
+ from unstructured_ingest.utils.dep_check import requires_dependencies
15
+
16
+
17
@dataclass
class S3AccessConfig(AccessConfig):
    """Access config for S3; ``key``, ``secret`` and ``token`` are marked sensitive."""

    # Exposed under the overload name "anonymous"; off by default.
    anon: bool = enhanced_field(default=False, overload_name="anonymous")
    # Optional endpoint url.
    endpoint_url: t.Optional[str] = None
    # Credentials, all optional and all flagged as sensitive.
    key: t.Optional[str] = enhanced_field(default=None, sensitive=True)
    secret: t.Optional[str] = enhanced_field(default=None, sensitive=True)
    token: t.Optional[str] = enhanced_field(default=None, sensitive=True)
24
+
25
+
26
@dataclass
class S3WriteConfig(FsspecWriteConfig):
    """Write config for S3; identical to ``FsspecWriteConfig``."""
29
+
30
+
31
@dataclass
class SimpleS3Config(SimpleFsspecConfig):
    """Fsspec connector config whose access config is an ``S3AccessConfig``."""

    # Defaults to None, i.e. no access config supplied.
    access_config: S3AccessConfig = enhanced_field(default=None)
34
+
35
+
36
@dataclass
class S3IngestDoc(FsspecIngestDoc):
    """Ingest doc for S3; defers the download to ``FsspecIngestDoc.get_file``."""

    connector_config: SimpleS3Config
    remote_file_path: str
    registry_name: str = "s3"

    @requires_dependencies(["s3fs", "fsspec"], extras="s3")
    def get_file(self):
        """Run the parent download behind the s3fs/fsspec dependency check."""
        super().get_file()
45
+
46
+
47
@dataclass
class S3SourceConnector(FsspecSourceConnector):
    """Source connector for S3 that produces ``S3IngestDoc`` objects."""

    connector_config: SimpleS3Config

    def __post_init__(self):
        # Select the doc class instantiated for each listed file.
        self.ingest_doc_cls: Type[S3IngestDoc] = S3IngestDoc
53
+
54
+
55
@dataclass
class S3DestinationConnector(FsspecDestinationConnector):
    """Fsspec destination connector specialised with the S3 config types."""

    # S3 connection settings.
    connector_config: SimpleS3Config
    # S3 write options.
    write_config: S3WriteConfig

    @requires_dependencies(["s3fs", "fsspec"], extras="s3")
    def initialize(self):
        """Run the parent initialization behind the s3fs/fsspec dependency check."""
        super().initialize()