unstructured-ingest 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (356) hide show
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/__init__.py +14 -0
  4. unstructured_ingest/cli/base/__init__.py +0 -0
  5. unstructured_ingest/cli/base/cmd.py +19 -0
  6. unstructured_ingest/cli/base/dest.py +87 -0
  7. unstructured_ingest/cli/base/src.py +57 -0
  8. unstructured_ingest/cli/cli.py +32 -0
  9. unstructured_ingest/cli/cmd_factory.py +12 -0
  10. unstructured_ingest/cli/cmds/__init__.py +145 -0
  11. unstructured_ingest/cli/cmds/airtable.py +69 -0
  12. unstructured_ingest/cli/cmds/astra.py +99 -0
  13. unstructured_ingest/cli/cmds/azure_cognitive_search.py +65 -0
  14. unstructured_ingest/cli/cmds/biomed.py +52 -0
  15. unstructured_ingest/cli/cmds/chroma.py +104 -0
  16. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  17. unstructured_ingest/cli/cmds/confluence.py +69 -0
  18. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  19. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  20. unstructured_ingest/cli/cmds/discord.py +47 -0
  21. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  22. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  23. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  24. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  25. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  26. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  27. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  28. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  29. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  30. unstructured_ingest/cli/cmds/github.py +54 -0
  31. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  32. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  33. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  34. unstructured_ingest/cli/cmds/jira.py +71 -0
  35. unstructured_ingest/cli/cmds/kafka.py +102 -0
  36. unstructured_ingest/cli/cmds/local.py +43 -0
  37. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  38. unstructured_ingest/cli/cmds/notion.py +48 -0
  39. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  40. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  41. unstructured_ingest/cli/cmds/outlook.py +67 -0
  42. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  43. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  44. unstructured_ingest/cli/cmds/reddit.py +67 -0
  45. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  46. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  47. unstructured_ingest/cli/cmds/slack.py +56 -0
  48. unstructured_ingest/cli/cmds/sql.py +66 -0
  49. unstructured_ingest/cli/cmds/vectara.py +66 -0
  50. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  51. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  52. unstructured_ingest/cli/common.py +7 -0
  53. unstructured_ingest/cli/interfaces.py +656 -0
  54. unstructured_ingest/cli/utils.py +205 -0
  55. unstructured_ingest/connector/__init__.py +0 -0
  56. unstructured_ingest/connector/airtable.py +309 -0
  57. unstructured_ingest/connector/astra.py +237 -0
  58. unstructured_ingest/connector/azure_cognitive_search.py +144 -0
  59. unstructured_ingest/connector/biomed.py +313 -0
  60. unstructured_ingest/connector/chroma.py +158 -0
  61. unstructured_ingest/connector/clarifai.py +122 -0
  62. unstructured_ingest/connector/confluence.py +285 -0
  63. unstructured_ingest/connector/databricks_volumes.py +137 -0
  64. unstructured_ingest/connector/delta_table.py +203 -0
  65. unstructured_ingest/connector/discord.py +180 -0
  66. unstructured_ingest/connector/elasticsearch.py +396 -0
  67. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  68. unstructured_ingest/connector/fsspec/azure.py +78 -0
  69. unstructured_ingest/connector/fsspec/box.py +109 -0
  70. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  71. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  72. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  73. unstructured_ingest/connector/fsspec/s3.py +62 -0
  74. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  75. unstructured_ingest/connector/git.py +124 -0
  76. unstructured_ingest/connector/github.py +173 -0
  77. unstructured_ingest/connector/gitlab.py +142 -0
  78. unstructured_ingest/connector/google_drive.py +349 -0
  79. unstructured_ingest/connector/hubspot.py +278 -0
  80. unstructured_ingest/connector/jira.py +469 -0
  81. unstructured_ingest/connector/kafka.py +294 -0
  82. unstructured_ingest/connector/local.py +139 -0
  83. unstructured_ingest/connector/mongodb.py +285 -0
  84. unstructured_ingest/connector/notion/__init__.py +0 -0
  85. unstructured_ingest/connector/notion/client.py +233 -0
  86. unstructured_ingest/connector/notion/connector.py +468 -0
  87. unstructured_ingest/connector/notion/helpers.py +584 -0
  88. unstructured_ingest/connector/notion/interfaces.py +32 -0
  89. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  90. unstructured_ingest/connector/notion/types/block.py +95 -0
  91. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  92. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  93. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  94. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  95. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  96. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  97. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  98. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  99. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  100. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  101. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  102. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  103. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  104. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  105. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  106. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  107. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  108. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  109. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  110. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  111. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  112. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  113. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  114. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  115. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  116. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  117. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  118. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  119. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  120. unstructured_ingest/connector/notion/types/database.py +72 -0
  121. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  122. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  123. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  124. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  125. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  126. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  127. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  128. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  129. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  130. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  131. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  132. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  133. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  134. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  135. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  136. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  137. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  138. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  139. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  140. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  141. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  142. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  143. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  144. unstructured_ingest/connector/notion/types/date.py +26 -0
  145. unstructured_ingest/connector/notion/types/file.py +51 -0
  146. unstructured_ingest/connector/notion/types/page.py +44 -0
  147. unstructured_ingest/connector/notion/types/parent.py +66 -0
  148. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  149. unstructured_ingest/connector/notion/types/user.py +76 -0
  150. unstructured_ingest/connector/onedrive.py +232 -0
  151. unstructured_ingest/connector/opensearch.py +218 -0
  152. unstructured_ingest/connector/outlook.py +285 -0
  153. unstructured_ingest/connector/pinecone.py +140 -0
  154. unstructured_ingest/connector/qdrant.py +144 -0
  155. unstructured_ingest/connector/reddit.py +166 -0
  156. unstructured_ingest/connector/registry.py +109 -0
  157. unstructured_ingest/connector/salesforce.py +301 -0
  158. unstructured_ingest/connector/sharepoint.py +573 -0
  159. unstructured_ingest/connector/slack.py +224 -0
  160. unstructured_ingest/connector/sql.py +199 -0
  161. unstructured_ingest/connector/vectara.py +248 -0
  162. unstructured_ingest/connector/weaviate.py +190 -0
  163. unstructured_ingest/connector/wikipedia.py +208 -0
  164. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  165. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  166. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  167. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  168. unstructured_ingest/error.py +49 -0
  169. unstructured_ingest/evaluate.py +338 -0
  170. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  171. unstructured_ingest/ingest_backoff/_common.py +102 -0
  172. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  173. unstructured_ingest/interfaces.py +838 -0
  174. unstructured_ingest/logger.py +130 -0
  175. unstructured_ingest/main.py +11 -0
  176. unstructured_ingest/pipeline/__init__.py +22 -0
  177. unstructured_ingest/pipeline/copy.py +19 -0
  178. unstructured_ingest/pipeline/doc_factory.py +12 -0
  179. unstructured_ingest/pipeline/interfaces.py +265 -0
  180. unstructured_ingest/pipeline/partition.py +60 -0
  181. unstructured_ingest/pipeline/permissions.py +12 -0
  182. unstructured_ingest/pipeline/pipeline.py +117 -0
  183. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  184. unstructured_ingest/pipeline/reformat/chunking.py +130 -0
  185. unstructured_ingest/pipeline/reformat/embedding.py +66 -0
  186. unstructured_ingest/pipeline/source.py +77 -0
  187. unstructured_ingest/pipeline/utils.py +6 -0
  188. unstructured_ingest/pipeline/write.py +18 -0
  189. unstructured_ingest/processor.py +93 -0
  190. unstructured_ingest/runner/__init__.py +104 -0
  191. unstructured_ingest/runner/airtable.py +35 -0
  192. unstructured_ingest/runner/astra.py +34 -0
  193. unstructured_ingest/runner/base_runner.py +89 -0
  194. unstructured_ingest/runner/biomed.py +45 -0
  195. unstructured_ingest/runner/confluence.py +35 -0
  196. unstructured_ingest/runner/delta_table.py +34 -0
  197. unstructured_ingest/runner/discord.py +35 -0
  198. unstructured_ingest/runner/elasticsearch.py +40 -0
  199. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  200. unstructured_ingest/runner/fsspec/azure.py +30 -0
  201. unstructured_ingest/runner/fsspec/box.py +28 -0
  202. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  203. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  204. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  205. unstructured_ingest/runner/fsspec/s3.py +28 -0
  206. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  207. unstructured_ingest/runner/github.py +37 -0
  208. unstructured_ingest/runner/gitlab.py +37 -0
  209. unstructured_ingest/runner/google_drive.py +35 -0
  210. unstructured_ingest/runner/hubspot.py +35 -0
  211. unstructured_ingest/runner/jira.py +35 -0
  212. unstructured_ingest/runner/kafka.py +34 -0
  213. unstructured_ingest/runner/local.py +23 -0
  214. unstructured_ingest/runner/mongodb.py +34 -0
  215. unstructured_ingest/runner/notion.py +61 -0
  216. unstructured_ingest/runner/onedrive.py +35 -0
  217. unstructured_ingest/runner/opensearch.py +40 -0
  218. unstructured_ingest/runner/outlook.py +33 -0
  219. unstructured_ingest/runner/reddit.py +35 -0
  220. unstructured_ingest/runner/salesforce.py +33 -0
  221. unstructured_ingest/runner/sharepoint.py +35 -0
  222. unstructured_ingest/runner/slack.py +33 -0
  223. unstructured_ingest/runner/utils.py +47 -0
  224. unstructured_ingest/runner/wikipedia.py +35 -0
  225. unstructured_ingest/runner/writers/__init__.py +48 -0
  226. unstructured_ingest/runner/writers/astra.py +22 -0
  227. unstructured_ingest/runner/writers/azure_cognitive_search.py +24 -0
  228. unstructured_ingest/runner/writers/base_writer.py +26 -0
  229. unstructured_ingest/runner/writers/chroma.py +22 -0
  230. unstructured_ingest/runner/writers/clarifai.py +19 -0
  231. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  232. unstructured_ingest/runner/writers/delta_table.py +24 -0
  233. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  234. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  235. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  236. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  237. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  238. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  239. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  240. unstructured_ingest/runner/writers/kafka.py +21 -0
  241. unstructured_ingest/runner/writers/mongodb.py +21 -0
  242. unstructured_ingest/runner/writers/opensearch.py +26 -0
  243. unstructured_ingest/runner/writers/pinecone.py +21 -0
  244. unstructured_ingest/runner/writers/qdrant.py +19 -0
  245. unstructured_ingest/runner/writers/sql.py +22 -0
  246. unstructured_ingest/runner/writers/vectara.py +22 -0
  247. unstructured_ingest/runner/writers/weaviate.py +21 -0
  248. unstructured_ingest/utils/__init__.py +0 -0
  249. unstructured_ingest/utils/compression.py +117 -0
  250. unstructured_ingest/utils/data_prep.py +112 -0
  251. unstructured_ingest/utils/dep_check.py +66 -0
  252. unstructured_ingest/utils/string_and_date_utils.py +39 -0
  253. unstructured_ingest/utils/table.py +73 -0
  254. unstructured_ingest/v2/__init__.py +1 -0
  255. unstructured_ingest/v2/cli/__init__.py +0 -0
  256. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  257. unstructured_ingest/v2/cli/base/cmd.py +215 -0
  258. unstructured_ingest/v2/cli/base/dest.py +76 -0
  259. unstructured_ingest/v2/cli/base/importer.py +34 -0
  260. unstructured_ingest/v2/cli/base/src.py +70 -0
  261. unstructured_ingest/v2/cli/cli.py +24 -0
  262. unstructured_ingest/v2/cli/cmds/__init__.py +87 -0
  263. unstructured_ingest/v2/cli/cmds/astra.py +85 -0
  264. unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +72 -0
  265. unstructured_ingest/v2/cli/cmds/chroma.py +108 -0
  266. unstructured_ingest/v2/cli/cmds/databricks_volumes.py +161 -0
  267. unstructured_ingest/v2/cli/cmds/elasticsearch.py +159 -0
  268. unstructured_ingest/v2/cli/cmds/fsspec/__init__.py +0 -0
  269. unstructured_ingest/v2/cli/cmds/fsspec/azure.py +84 -0
  270. unstructured_ingest/v2/cli/cmds/fsspec/box.py +58 -0
  271. unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +58 -0
  272. unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +77 -0
  273. unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +81 -0
  274. unstructured_ingest/v2/cli/cmds/fsspec/s3.py +84 -0
  275. unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +80 -0
  276. unstructured_ingest/v2/cli/cmds/google_drive.py +74 -0
  277. unstructured_ingest/v2/cli/cmds/local.py +60 -0
  278. unstructured_ingest/v2/cli/cmds/mongodb.py +62 -0
  279. unstructured_ingest/v2/cli/cmds/onedrive.py +91 -0
  280. unstructured_ingest/v2/cli/cmds/opensearch.py +93 -0
  281. unstructured_ingest/v2/cli/cmds/pinecone.py +62 -0
  282. unstructured_ingest/v2/cli/cmds/salesforce.py +79 -0
  283. unstructured_ingest/v2/cli/cmds/sharepoint.py +112 -0
  284. unstructured_ingest/v2/cli/cmds/singlestore.py +96 -0
  285. unstructured_ingest/v2/cli/cmds/sql.py +84 -0
  286. unstructured_ingest/v2/cli/cmds/weaviate.py +100 -0
  287. unstructured_ingest/v2/cli/configs/__init__.py +6 -0
  288. unstructured_ingest/v2/cli/configs/chunk.py +89 -0
  289. unstructured_ingest/v2/cli/configs/embed.py +74 -0
  290. unstructured_ingest/v2/cli/configs/partition.py +99 -0
  291. unstructured_ingest/v2/cli/configs/processor.py +88 -0
  292. unstructured_ingest/v2/cli/interfaces.py +27 -0
  293. unstructured_ingest/v2/cli/utils.py +240 -0
  294. unstructured_ingest/v2/example.py +37 -0
  295. unstructured_ingest/v2/interfaces/__init__.py +29 -0
  296. unstructured_ingest/v2/interfaces/connector.py +32 -0
  297. unstructured_ingest/v2/interfaces/downloader.py +79 -0
  298. unstructured_ingest/v2/interfaces/file_data.py +49 -0
  299. unstructured_ingest/v2/interfaces/indexer.py +28 -0
  300. unstructured_ingest/v2/interfaces/process.py +20 -0
  301. unstructured_ingest/v2/interfaces/processor.py +48 -0
  302. unstructured_ingest/v2/interfaces/upload_stager.py +48 -0
  303. unstructured_ingest/v2/interfaces/uploader.py +39 -0
  304. unstructured_ingest/v2/logger.py +126 -0
  305. unstructured_ingest/v2/main.py +11 -0
  306. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  307. unstructured_ingest/v2/pipeline/interfaces.py +167 -0
  308. unstructured_ingest/v2/pipeline/pipeline.py +284 -0
  309. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  310. unstructured_ingest/v2/pipeline/steps/chunk.py +85 -0
  311. unstructured_ingest/v2/pipeline/steps/download.py +124 -0
  312. unstructured_ingest/v2/pipeline/steps/embed.py +84 -0
  313. unstructured_ingest/v2/pipeline/steps/index.py +61 -0
  314. unstructured_ingest/v2/pipeline/steps/partition.py +78 -0
  315. unstructured_ingest/v2/pipeline/steps/stage.py +64 -0
  316. unstructured_ingest/v2/pipeline/steps/uncompress.py +68 -0
  317. unstructured_ingest/v2/pipeline/steps/upload.py +73 -0
  318. unstructured_ingest/v2/pipeline/utils.py +15 -0
  319. unstructured_ingest/v2/processes/__init__.py +0 -0
  320. unstructured_ingest/v2/processes/chunker.py +97 -0
  321. unstructured_ingest/v2/processes/connector_registry.py +63 -0
  322. unstructured_ingest/v2/processes/connectors/__init__.py +77 -0
  323. unstructured_ingest/v2/processes/connectors/astra.py +152 -0
  324. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +211 -0
  325. unstructured_ingest/v2/processes/connectors/chroma.py +204 -0
  326. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +96 -0
  327. unstructured_ingest/v2/processes/connectors/elasticsearch.py +401 -0
  328. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  329. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +144 -0
  330. unstructured_ingest/v2/processes/connectors/fsspec/box.py +131 -0
  331. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +130 -0
  332. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +342 -0
  333. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +141 -0
  334. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +164 -0
  335. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +166 -0
  336. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  337. unstructured_ingest/v2/processes/connectors/google_drive.py +335 -0
  338. unstructured_ingest/v2/processes/connectors/local.py +204 -0
  339. unstructured_ingest/v2/processes/connectors/mongodb.py +138 -0
  340. unstructured_ingest/v2/processes/connectors/onedrive.py +216 -0
  341. unstructured_ingest/v2/processes/connectors/opensearch.py +155 -0
  342. unstructured_ingest/v2/processes/connectors/pinecone.py +178 -0
  343. unstructured_ingest/v2/processes/connectors/salesforce.py +293 -0
  344. unstructured_ingest/v2/processes/connectors/sharepoint.py +412 -0
  345. unstructured_ingest/v2/processes/connectors/singlestore.py +160 -0
  346. unstructured_ingest/v2/processes/connectors/sql.py +269 -0
  347. unstructured_ingest/v2/processes/connectors/utils.py +19 -0
  348. unstructured_ingest/v2/processes/connectors/weaviate.py +235 -0
  349. unstructured_ingest/v2/processes/embedder.py +76 -0
  350. unstructured_ingest/v2/processes/partitioner.py +166 -0
  351. unstructured_ingest/v2/processes/uncompress.py +43 -0
  352. unstructured_ingest-0.0.0.dist-info/METADATA +319 -0
  353. unstructured_ingest-0.0.0.dist-info/RECORD +356 -0
  354. unstructured_ingest-0.0.0.dist-info/WHEEL +5 -0
  355. unstructured_ingest-0.0.0.dist-info/entry_points.txt +2 -0
  356. unstructured_ingest-0.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,131 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass, field
4
+ from pathlib import Path
5
+ from typing import Any, Generator, Optional
6
+
7
+ from unstructured_ingest.enhanced_dataclass import enhanced_field
8
+ from unstructured_ingest.utils.dep_check import requires_dependencies
9
+ from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, UploadContent
10
+ from unstructured_ingest.v2.processes.connector_registry import (
11
+ DestinationRegistryEntry,
12
+ SourceRegistryEntry,
13
+ )
14
+ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
15
+ FsspecAccessConfig,
16
+ FsspecConnectionConfig,
17
+ FsspecDownloader,
18
+ FsspecDownloaderConfig,
19
+ FsspecIndexer,
20
+ FsspecIndexerConfig,
21
+ FsspecUploader,
22
+ FsspecUploaderConfig,
23
+ )
24
+
25
+ CONNECTOR_TYPE = "box"
26
+
27
+
28
@dataclass
class BoxIndexerConfig(FsspecIndexerConfig):
    """Indexer settings for the Box connector; declares no fields of its own."""
31
+
32
+
33
@dataclass
class BoxAccessConfig(FsspecAccessConfig):
    """Access settings for the Box connector."""

    # Handed to ``JWTAuth.from_settings_file`` by
    # ``BoxConnectionConfig.get_access_config``.
    box_app_config: Optional[str] = None
36
+
37
+
38
@dataclass
class BoxConnectionConfig(FsspecConnectionConfig):
    """Connection settings for the Box connector.

    ``supported_protocols`` defaults to ``["box"]`` and ``access_config`` is
    declared as a sensitive field holding a ``BoxAccessConfig``.
    """

    supported_protocols: list[str] = field(default_factory=lambda: ["box"])
    access_config: BoxAccessConfig = enhanced_field(
        sensitive=True, default_factory=lambda: BoxAccessConfig()
    )
    connector_type: str = CONNECTOR_TYPE

    def get_access_config(self) -> dict[str, Any]:
        """Build the access kwargs, including a freshly created auth object.

        The auth object is created on demand because it is not serializable
        and therefore cannot be stored directly in the config.

        Returns:
            A dict whose ``"oauth"`` entry is the ``JWTAuth`` built from
            ``access_config.box_app_config``, followed by every entry of
            ``access_config.to_dict()`` except ``"box_app_config"``.
        """
        from boxsdk import JWTAuth

        oauth = JWTAuth.from_settings_file(self.access_config.box_app_config)
        remaining = self.access_config.to_dict()
        remaining.pop("box_app_config", None)
        # "oauth" comes first; any same-named key from the config overrides it,
        # exactly as a dict.update() on top of {"oauth": ...} would.
        return {"oauth": oauth, **remaining}
61
+
62
+
63
@dataclass
class BoxIndexer(FsspecIndexer):
    """Fsspec indexer typed against the Box connection and indexer configs."""

    connection_config: BoxConnectionConfig
    index_config: BoxIndexerConfig
    connector_type: str = CONNECTOR_TYPE

    @requires_dependencies(["boxfs"], extras="box")
    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
        """Forward to the parent ``run`` and hand back whatever it returns.

        Wrapped by ``requires_dependencies`` for ``boxfs`` (extras ``box``).
        """
        file_data_stream = super().run(**kwargs)
        return file_data_stream
72
+
73
+
74
@dataclass
class BoxDownloaderConfig(FsspecDownloaderConfig):
    """Downloader settings for the Box connector; declares no fields of its own."""
77
+
78
+
79
@dataclass
class BoxDownloader(FsspecDownloader):
    """Fsspec downloader configured for the ``box`` protocol."""

    protocol: str = "box"
    connection_config: BoxConnectionConfig
    connector_type: str = CONNECTOR_TYPE
    download_config: Optional[BoxDownloaderConfig] = field(default_factory=BoxDownloaderConfig)

    @requires_dependencies(["boxfs"], extras="box")
    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        """Forward a synchronous download request to the parent ``run``."""
        response = super().run(file_data=file_data, **kwargs)
        return response

    @requires_dependencies(["boxfs"], extras="box")
    async def run_async(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        """Forward an asynchronous download request to the parent ``run_async``."""
        response = await super().run_async(file_data=file_data, **kwargs)
        return response
93
+
94
+
95
@dataclass
class BoxUploaderConfig(FsspecUploaderConfig):
    """Uploader settings for the Box connector; declares no fields of its own."""
98
+
99
+
100
@dataclass
class BoxUploader(FsspecUploader):
    """Fsspec uploader typed against the Box connection and uploader configs.

    Every overridden method only forwards to the parent implementation; each
    is wrapped by ``requires_dependencies`` for ``boxfs`` (extras ``box``).
    """

    connector_type: str = CONNECTOR_TYPE
    connection_config: BoxConnectionConfig
    # The default is None, so the annotation has to admit None as well.
    upload_config: Optional[BoxUploaderConfig] = field(default=None)

    @requires_dependencies(["boxfs"], extras="box")
    def __post_init__(self):
        """Run the parent's post-init hook."""
        super().__post_init__()

    @requires_dependencies(["boxfs"], extras="box")
    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
        """Forward a synchronous upload of ``contents`` to the parent ``run``."""
        return super().run(contents=contents, **kwargs)

    @requires_dependencies(["boxfs"], extras="box")
    async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        """Forward an asynchronous upload of ``path`` to the parent ``run_async``."""
        return await super().run_async(path=path, file_data=file_data, **kwargs)
117
+
118
+
119
# Source-side registration: the classes that index and download from Box.
box_source_entry = SourceRegistryEntry(
    connection_config=BoxConnectionConfig,
    indexer_config=BoxIndexerConfig,
    indexer=BoxIndexer,
    downloader_config=BoxDownloaderConfig,
    downloader=BoxDownloader,
)

# Destination-side registration: the classes that upload to Box.
box_destination_entry = DestinationRegistryEntry(
    connection_config=BoxConnectionConfig,
    uploader_config=BoxUploaderConfig,
    uploader=BoxUploader,
)
@@ -0,0 +1,130 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass, field
4
+ from pathlib import Path
5
+ from typing import Any, Generator, Optional
6
+
7
+ from unstructured_ingest.enhanced_dataclass import enhanced_field
8
+ from unstructured_ingest.utils.dep_check import requires_dependencies
9
+ from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, UploadContent
10
+ from unstructured_ingest.v2.processes.connector_registry import (
11
+ DestinationRegistryEntry,
12
+ SourceRegistryEntry,
13
+ )
14
+ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
15
+ FsspecAccessConfig,
16
+ FsspecConnectionConfig,
17
+ FsspecDownloader,
18
+ FsspecDownloaderConfig,
19
+ FsspecIndexer,
20
+ FsspecIndexerConfig,
21
+ FsspecUploader,
22
+ FsspecUploaderConfig,
23
+ )
24
+ from unstructured_ingest.v2.processes.connectors.fsspec.utils import sterilize_dict
25
+
26
+ CONNECTOR_TYPE = "dropbox"
27
+
28
+
29
@dataclass
class DropboxIndexerConfig(FsspecIndexerConfig):
    """Indexer settings for the Dropbox connector; declares no fields of its own."""
32
+
33
+
34
@dataclass
class DropboxAccessConfig(FsspecAccessConfig):
    """Access settings for the Dropbox connector."""

    # Optional token string; unset (None) by default.
    token: Optional[str] = None
37
+
38
+
39
@dataclass
class DropboxConnectionConfig(FsspecConnectionConfig):
    """Connection settings for the Dropbox connector.

    ``supported_protocols`` defaults to ``["dropbox"]`` and ``access_config``
    is declared as a sensitive field holding a ``DropboxAccessConfig``.
    """

    supported_protocols: list[str] = field(default_factory=lambda: ["dropbox"])
    access_config: DropboxAccessConfig = enhanced_field(
        sensitive=True, default_factory=lambda: DropboxAccessConfig()
    )
    connector_type: str = CONNECTOR_TYPE
46
+
47
+
48
@dataclass
class DropboxIndexer(FsspecIndexer):
    """Fsspec indexer typed against the Dropbox connection and indexer configs."""

    connection_config: DropboxConnectionConfig
    index_config: DropboxIndexerConfig
    connector_type: str = CONNECTOR_TYPE

    @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
    def __post_init__(self):
        """Make sure the indexed path starts with ``/``, as Dropbox expects."""
        # NOTE(review): unlike DropboxUploader.__post_init__, this hook does not
        # call super().__post_init__() — confirm that is intentional.
        current_path = self.index_config.path_without_protocol
        if current_path.startswith("/"):
            return
        self.index_config.path_without_protocol = "/" + current_path

    @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
        """Forward to the parent ``run`` and hand back whatever it returns."""
        file_data_stream = super().run(**kwargs)
        return file_data_stream

    def sterilize_info(self, path) -> dict:
        """Look up filesystem info for ``path`` and pass it through ``sterilize_dict``.

        The ``fs.info`` method of the dropboxdrivefs library takes the location
        as a ``url`` keyword rather than ``path``; both refer to the same thing.
        """
        raw_info = self.fs.info(url=path)
        return sterilize_dict(data=raw_info)
69
+
70
+
71
@dataclass
class DropboxDownloaderConfig(FsspecDownloaderConfig):
    """Downloader settings for the Dropbox connector; declares no fields of its own."""
74
+
75
+
76
@dataclass
class DropboxDownloader(FsspecDownloader):
    """Fsspec downloader configured for the ``dropbox`` protocol."""

    protocol: str = "dropbox"
    connection_config: DropboxConnectionConfig
    connector_type: str = CONNECTOR_TYPE
    download_config: Optional[DropboxDownloaderConfig] = field(
        default_factory=DropboxDownloaderConfig
    )

    @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        """Forward a synchronous download request to the parent ``run``."""
        response = super().run(file_data=file_data, **kwargs)
        return response

    @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
    async def run_async(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        """Forward an asynchronous download request to the parent ``run_async``."""
        response = await super().run_async(file_data=file_data, **kwargs)
        return response
92
+
93
+
94
@dataclass
class DropboxUploaderConfig(FsspecUploaderConfig):
    """Uploader settings for the Dropbox connector; declares no fields of its own."""
97
+
98
+
99
@dataclass
class DropboxUploader(FsspecUploader):
    """Fsspec uploader typed against the Dropbox connection and uploader configs.

    Every overridden method only forwards to the parent implementation; each
    is wrapped by ``requires_dependencies`` for ``dropboxdrivefs`` and
    ``fsspec`` (extras ``dropbox``).
    """

    connector_type: str = CONNECTOR_TYPE
    connection_config: DropboxConnectionConfig
    # The default is None, so the annotation has to admit None as well.
    upload_config: Optional[DropboxUploaderConfig] = field(default=None)

    @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
    def __post_init__(self):
        """Run the parent's post-init hook."""
        super().__post_init__()

    @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
        """Forward a synchronous upload of ``contents`` to the parent ``run``."""
        return super().run(contents=contents, **kwargs)

    @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
    async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        """Forward an asynchronous upload of ``path`` to the parent ``run_async``."""
        return await super().run_async(path=path, file_data=file_data, **kwargs)
116
+
117
+
118
# Source-side registration: the classes that index and download from Dropbox.
dropbox_source_entry = SourceRegistryEntry(
    connection_config=DropboxConnectionConfig,
    indexer_config=DropboxIndexerConfig,
    indexer=DropboxIndexer,
    downloader_config=DropboxDownloaderConfig,
    downloader=DropboxDownloader,
)

# Destination-side registration: the classes that upload to Dropbox.
dropbox_destination_entry = DestinationRegistryEntry(
    connection_config=DropboxConnectionConfig,
    uploader_config=DropboxUploaderConfig,
    uploader=DropboxUploader,
)
@@ -0,0 +1,342 @@
1
+ from __future__ import annotations
2
+
3
+ import contextlib
4
+ import fnmatch
5
+ from dataclasses import dataclass, field
6
+ from datetime import datetime
7
+ from pathlib import Path
8
+ from time import time
9
+ from typing import TYPE_CHECKING, Any, Generator, Optional, TypeVar
10
+
11
+ from unstructured.documents.elements import DataSourceMetadata
12
+
13
+ from unstructured_ingest.enhanced_dataclass import enhanced_field
14
+ from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
15
+ from unstructured_ingest.v2.interfaces import (
16
+ AccessConfig,
17
+ ConnectionConfig,
18
+ Downloader,
19
+ DownloaderConfig,
20
+ DownloadResponse,
21
+ FileData,
22
+ Indexer,
23
+ IndexerConfig,
24
+ SourceIdentifiers,
25
+ UploadContent,
26
+ Uploader,
27
+ UploaderConfig,
28
+ )
29
+ from unstructured_ingest.v2.logger import logger
30
+ from unstructured_ingest.v2.processes.connectors.fsspec.utils import sterilize_dict
31
+
32
+ if TYPE_CHECKING:
33
+ from fsspec import AbstractFileSystem
34
+
35
+ CONNECTOR_TYPE = "fsspec"
36
+
37
+
38
class Base(object):
    """Root of the config hierarchy.

    Supplies a no-op ``__post_init__`` so that subclasses can always call
    ``super().__post_init__()`` regardless of what else is in their MRO.
    """

    def __post_init__(self):
        pass


@dataclass
class FileConfig(Base):
    """Remote location expressed as ``<protocol>://<path>``.

    ``protocol`` and ``path_without_protocol`` are not constructor arguments;
    they are derived from ``remote_url`` in ``__post_init__``.
    """

    remote_url: str
    protocol: str = field(init=False)
    path_without_protocol: str = field(init=False)
    supported_protocols: list[str] = field(
        default_factory=lambda: [
            "s3",
            "s3a",
            "abfs",
            "az",
            "gs",
            "gcs",
            "box",
            "dropbox",
            "sftp",
        ]
    )

    def __post_init__(self):
        """Split ``remote_url`` into protocol and path and validate the protocol.

        Raises:
            ValueError: if ``remote_url`` contains no ``://`` separator, or if
                its protocol is not listed in ``supported_protocols``.
        """
        super().__post_init__()
        # partition() splits on the FIRST separator only, so a path that itself
        # contains "://" is kept intact instead of breaking tuple unpacking.
        protocol, separator, path = self.remote_url.partition("://")
        if not separator:
            raise ValueError(
                f"remote_url {self.remote_url!r} is missing a protocol, "
                "expected the form <protocol>://<path>"
            )
        self.protocol = protocol
        self.path_without_protocol = path
        if self.protocol not in self.supported_protocols:
            raise ValueError(
                "Protocol {} not supported yet, only {} are supported.".format(
                    self.protocol, ", ".join(self.supported_protocols)
                ),
            )
71
+
72
+
73
+ @dataclass
74
+ class FsspecIndexerConfig(FileConfig, IndexerConfig):
75
+ recursive: bool = False
76
+ file_glob: Optional[list[str]] = None
77
+
78
+
79
+ @dataclass
80
+ class FsspecAccessConfig(AccessConfig):
81
+ pass
82
+
83
+
84
+ FsspecAccessConfigT = TypeVar("FsspecAccessConfigT", bound=FsspecAccessConfig)
85
+
86
+
87
+ @dataclass
88
+ class FsspecConnectionConfig(ConnectionConfig):
89
+ access_config: FsspecAccessConfigT = enhanced_field(sensitive=True, default=None)
90
+ connector_type: str = CONNECTOR_TYPE
91
+
92
+
93
+ FsspecIndexerConfigT = TypeVar("FsspecIndexerConfigT", bound=FsspecIndexerConfig)
94
+ FsspecConnectionConfigT = TypeVar("FsspecConnectionConfigT", bound=FsspecConnectionConfig)
95
+
96
+
97
+ @dataclass
98
+ class FsspecIndexer(Indexer):
99
+ connection_config: FsspecConnectionConfigT
100
+ index_config: FsspecIndexerConfigT
101
+ connector_type: str = CONNECTOR_TYPE
102
+
103
+ @property
104
+ def fs(self) -> "AbstractFileSystem":
105
+ from fsspec import get_filesystem_class
106
+
107
+ return get_filesystem_class(self.index_config.protocol)(
108
+ **self.connection_config.get_access_config(),
109
+ )
110
+
111
+ def does_path_match_glob(self, path: str) -> bool:
112
+ if self.index_config.file_glob is None:
113
+ return True
114
+ patterns = self.index_config.file_glob
115
+ for pattern in patterns:
116
+ if fnmatch.filter([path], pattern):
117
+ return True
118
+ logger.debug(f"The file {path!r} is discarded as it does not match any given glob.")
119
+ return False
120
+
121
+ def check_connection(self):
122
+ from fsspec import get_filesystem_class
123
+
124
+ try:
125
+ fs = get_filesystem_class(self.index_config.protocol)(
126
+ **self.connection_config.get_access_config(),
127
+ )
128
+ fs.ls(path=self.index_config.path_without_protocol, detail=False)
129
+ except Exception as e:
130
+ logger.error(f"failed to validate connection: {e}", exc_info=True)
131
+ raise SourceConnectionError(f"failed to validate connection: {e}")
132
+
133
+ def list_files(self) -> list[str]:
134
+ if not self.index_config.recursive:
135
+ # fs.ls does not walk directories
136
+ # directories that are listed in cloud storage can cause problems
137
+ # because they are seen as 0 byte files
138
+ found = self.fs.ls(self.index_config.path_without_protocol, detail=True)
139
+ if isinstance(found, list):
140
+ return [
141
+ x.get("name") for x in found if x.get("size") > 0 and x.get("type") == "file"
142
+ ]
143
+ else:
144
+ raise TypeError(f"unhandled response type from ls: {type(found)}")
145
+ else:
146
+ # fs.find will recursively walk directories
147
+ # "size" is a common key for all the cloud protocols with fs
148
+ found = self.fs.find(
149
+ self.index_config.path_without_protocol,
150
+ detail=True,
151
+ )
152
+ if isinstance(found, dict):
153
+ return [
154
+ k for k, v in found.items() if v.get("size") > 0 and v.get("type") == "file"
155
+ ]
156
+ else:
157
+ raise TypeError(f"unhandled response type from find: {type(found)}")
158
+
159
+ def get_metadata(self, path: str) -> DataSourceMetadata:
160
+ date_created = None
161
+ date_modified = None
162
+
163
+ try:
164
+ created: Optional[Any] = self.fs.created(path)
165
+ if created:
166
+ if isinstance(created, datetime):
167
+ date_created = str(created.timestamp())
168
+ else:
169
+ date_created = str(created)
170
+ except NotImplementedError:
171
+ pass
172
+
173
+ try:
174
+ modified: Optional[Any] = self.fs.modified(path)
175
+ if modified:
176
+ if isinstance(modified, datetime):
177
+ date_modified = str(modified.timestamp())
178
+ else:
179
+ date_modified = str(modified)
180
+ except NotImplementedError:
181
+ pass
182
+
183
+ version = self.fs.checksum(path)
184
+ metadata: dict[str, str] = {}
185
+ with contextlib.suppress(AttributeError):
186
+ metadata = self.fs.metadata(path)
187
+ record_locator = {
188
+ "protocol": self.index_config.protocol,
189
+ "remote_file_path": self.index_config.remote_url,
190
+ }
191
+ if metadata:
192
+ record_locator["metadata"] = metadata
193
+ return DataSourceMetadata(
194
+ date_created=date_created,
195
+ date_modified=date_modified,
196
+ date_processed=str(time()),
197
+ version=str(version),
198
+ url=f"{self.index_config.protocol}://{path}",
199
+ record_locator=record_locator,
200
+ )
201
+
202
+ def sterilize_info(self, path) -> dict:
203
+ info = self.fs.info(path=path)
204
+ return sterilize_dict(data=info)
205
+
206
+ def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
207
+ raw_files = self.list_files()
208
+ files = [f for f in raw_files if self.does_path_match_glob(f)]
209
+ for file in files:
210
+ # Note: we remove any remaining leading slashes (Box introduces these)
211
+ # to get a valid relative path
212
+ rel_path = file.replace(self.index_config.path_without_protocol, "").lstrip("/")
213
+ yield FileData(
214
+ identifier=file,
215
+ connector_type=self.connector_type,
216
+ source_identifiers=SourceIdentifiers(
217
+ filename=Path(file).name,
218
+ rel_path=rel_path or None,
219
+ fullpath=file,
220
+ ),
221
+ metadata=self.get_metadata(path=file),
222
+ additional_metadata=self.sterilize_info(path=file),
223
+ )
224
+
225
+
226
+ @dataclass
227
+ class FsspecDownloaderConfig(DownloaderConfig):
228
+ pass
229
+
230
+
231
+ FsspecDownloaderConfigT = TypeVar("FsspecDownloaderConfigT", bound=FsspecDownloaderConfig)
232
+
233
+
234
+ @dataclass
235
+ class FsspecDownloader(Downloader):
236
+ protocol: str
237
+ connection_config: FsspecConnectionConfigT
238
+ connector_type: str = CONNECTOR_TYPE
239
+ download_config: Optional[FsspecDownloaderConfigT] = field(
240
+ default_factory=lambda: FsspecDownloaderConfig()
241
+ )
242
+
243
+ def is_async(self) -> bool:
244
+ return self.fs.async_impl
245
+
246
+ @property
247
+ def fs(self) -> "AbstractFileSystem":
248
+ from fsspec import get_filesystem_class
249
+
250
+ return get_filesystem_class(self.protocol)(
251
+ **self.connection_config.get_access_config(),
252
+ )
253
+
254
+ def get_download_path(self, file_data: FileData) -> Path:
255
+ return (
256
+ self.download_dir / Path(file_data.source_identifiers.relative_path)
257
+ if self.download_config
258
+ else Path(file_data.source_identifiers.rel_path)
259
+ )
260
+
261
+ def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
262
+ download_path = self.get_download_path(file_data=file_data)
263
+ download_path.parent.mkdir(parents=True, exist_ok=True)
264
+ try:
265
+ self.fs.get(rpath=file_data.identifier, lpath=download_path.as_posix())
266
+ except Exception as e:
267
+ logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
268
+ raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
269
+ return self.generate_download_response(file_data=file_data, download_path=download_path)
270
+
271
+ async def async_run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
272
+ download_path = self.get_download_path(file_data=file_data)
273
+ download_path.parent.mkdir(parents=True, exist_ok=True)
274
+ try:
275
+ await self.fs.get(rpath=file_data.identifier, lpath=download_path.as_posix())
276
+ except Exception as e:
277
+ logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
278
+ raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
279
+ return self.generate_download_response(file_data=file_data, download_path=download_path)
280
+
281
+
282
+ @dataclass
283
+ class FsspecUploaderConfig(FileConfig, UploaderConfig):
284
+ overwrite: bool = False
285
+
286
+
287
+ FsspecUploaderConfigT = TypeVar("FsspecUploaderConfigT", bound=FsspecUploaderConfig)
288
+
289
+
290
+ @dataclass
291
+ class FsspecUploader(Uploader):
292
+ connector_type: str = CONNECTOR_TYPE
293
+ upload_config: FsspecUploaderConfigT = field(default=None)
294
+
295
+ @property
296
+ def fs(self) -> "AbstractFileSystem":
297
+ from fsspec import get_filesystem_class
298
+
299
+ fs_kwargs = self.connection_config.get_access_config() if self.connection_config else {}
300
+ return get_filesystem_class(self.upload_config.protocol)(
301
+ **fs_kwargs,
302
+ )
303
+
304
+ def __post_init__(self):
305
+ # TODO once python3.9 no longer supported and kw_only is allowed in dataclasses, remove:
306
+ if not self.upload_config:
307
+ raise TypeError(
308
+ f"{self.__class__.__name__}.__init__() "
309
+ f"missing 1 required positional argument: 'upload_config'"
310
+ )
311
+
312
+ def get_upload_path(self, file_data: FileData) -> Path:
313
+ upload_path = (
314
+ Path(self.upload_config.path_without_protocol)
315
+ / file_data.source_identifiers.relative_path
316
+ )
317
+ updated_upload_path = upload_path.parent / f"{upload_path.name}.json"
318
+ return updated_upload_path
319
+
320
+ def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
321
+ for content in contents:
322
+ self._run(path=content.path, file_data=content.file_data)
323
+
324
+ def _run(self, path: Path, file_data: FileData) -> None:
325
+ path_str = str(path.resolve())
326
+ upload_path = self.get_upload_path(file_data=file_data)
327
+ if self.fs.exists(path=str(upload_path)) and not self.upload_config.overwrite:
328
+ logger.debug(f"Skipping upload of {path} to {upload_path}, file already exists")
329
+ return
330
+ logger.debug(f"Writing local file {path_str} to {upload_path}")
331
+ self.fs.upload(lpath=path_str, rpath=str(upload_path))
332
+
333
+ async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
334
+ upload_path = self.get_upload_path(file_data=file_data)
335
+ path_str = str(path.resolve())
336
+ # Odd that fsspec doesn't run exists() as async even when client support async
337
+ already_exists = self.fs.exists(path=str(upload_path))
338
+ if already_exists and not self.upload_config.overwrite:
339
+ logger.debug(f"Skipping upload of {path} to {upload_path}, file already exists")
340
+ return
341
+ logger.debug(f"Writing local file {path_str} to {upload_path}")
342
+ self.fs.upload(lpath=path_str, rpath=str(upload_path))