unstructured-ingest 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (356)
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/__init__.py +14 -0
  4. unstructured_ingest/cli/base/__init__.py +0 -0
  5. unstructured_ingest/cli/base/cmd.py +19 -0
  6. unstructured_ingest/cli/base/dest.py +87 -0
  7. unstructured_ingest/cli/base/src.py +57 -0
  8. unstructured_ingest/cli/cli.py +32 -0
  9. unstructured_ingest/cli/cmd_factory.py +12 -0
  10. unstructured_ingest/cli/cmds/__init__.py +145 -0
  11. unstructured_ingest/cli/cmds/airtable.py +69 -0
  12. unstructured_ingest/cli/cmds/astra.py +99 -0
  13. unstructured_ingest/cli/cmds/azure_cognitive_search.py +65 -0
  14. unstructured_ingest/cli/cmds/biomed.py +52 -0
  15. unstructured_ingest/cli/cmds/chroma.py +104 -0
  16. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  17. unstructured_ingest/cli/cmds/confluence.py +69 -0
  18. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  19. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  20. unstructured_ingest/cli/cmds/discord.py +47 -0
  21. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  22. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  23. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  24. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  25. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  26. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  27. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  28. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  29. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  30. unstructured_ingest/cli/cmds/github.py +54 -0
  31. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  32. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  33. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  34. unstructured_ingest/cli/cmds/jira.py +71 -0
  35. unstructured_ingest/cli/cmds/kafka.py +102 -0
  36. unstructured_ingest/cli/cmds/local.py +43 -0
  37. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  38. unstructured_ingest/cli/cmds/notion.py +48 -0
  39. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  40. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  41. unstructured_ingest/cli/cmds/outlook.py +67 -0
  42. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  43. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  44. unstructured_ingest/cli/cmds/reddit.py +67 -0
  45. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  46. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  47. unstructured_ingest/cli/cmds/slack.py +56 -0
  48. unstructured_ingest/cli/cmds/sql.py +66 -0
  49. unstructured_ingest/cli/cmds/vectara.py +66 -0
  50. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  51. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  52. unstructured_ingest/cli/common.py +7 -0
  53. unstructured_ingest/cli/interfaces.py +656 -0
  54. unstructured_ingest/cli/utils.py +205 -0
  55. unstructured_ingest/connector/__init__.py +0 -0
  56. unstructured_ingest/connector/airtable.py +309 -0
  57. unstructured_ingest/connector/astra.py +237 -0
  58. unstructured_ingest/connector/azure_cognitive_search.py +144 -0
  59. unstructured_ingest/connector/biomed.py +313 -0
  60. unstructured_ingest/connector/chroma.py +158 -0
  61. unstructured_ingest/connector/clarifai.py +122 -0
  62. unstructured_ingest/connector/confluence.py +285 -0
  63. unstructured_ingest/connector/databricks_volumes.py +137 -0
  64. unstructured_ingest/connector/delta_table.py +203 -0
  65. unstructured_ingest/connector/discord.py +180 -0
  66. unstructured_ingest/connector/elasticsearch.py +396 -0
  67. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  68. unstructured_ingest/connector/fsspec/azure.py +78 -0
  69. unstructured_ingest/connector/fsspec/box.py +109 -0
  70. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  71. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  72. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  73. unstructured_ingest/connector/fsspec/s3.py +62 -0
  74. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  75. unstructured_ingest/connector/git.py +124 -0
  76. unstructured_ingest/connector/github.py +173 -0
  77. unstructured_ingest/connector/gitlab.py +142 -0
  78. unstructured_ingest/connector/google_drive.py +349 -0
  79. unstructured_ingest/connector/hubspot.py +278 -0
  80. unstructured_ingest/connector/jira.py +469 -0
  81. unstructured_ingest/connector/kafka.py +294 -0
  82. unstructured_ingest/connector/local.py +139 -0
  83. unstructured_ingest/connector/mongodb.py +285 -0
  84. unstructured_ingest/connector/notion/__init__.py +0 -0
  85. unstructured_ingest/connector/notion/client.py +233 -0
  86. unstructured_ingest/connector/notion/connector.py +468 -0
  87. unstructured_ingest/connector/notion/helpers.py +584 -0
  88. unstructured_ingest/connector/notion/interfaces.py +32 -0
  89. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  90. unstructured_ingest/connector/notion/types/block.py +95 -0
  91. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  92. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  93. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  94. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  95. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  96. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  97. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  98. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  99. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  100. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  101. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  102. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  103. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  104. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  105. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  106. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  107. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  108. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  109. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  110. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  111. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  112. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  113. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  114. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  115. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  116. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  117. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  118. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  119. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  120. unstructured_ingest/connector/notion/types/database.py +72 -0
  121. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  122. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  123. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  124. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  125. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  126. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  127. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  128. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  129. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  130. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  131. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  132. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  133. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  134. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  135. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  136. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  137. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  138. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  139. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  140. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  141. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  142. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  143. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  144. unstructured_ingest/connector/notion/types/date.py +26 -0
  145. unstructured_ingest/connector/notion/types/file.py +51 -0
  146. unstructured_ingest/connector/notion/types/page.py +44 -0
  147. unstructured_ingest/connector/notion/types/parent.py +66 -0
  148. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  149. unstructured_ingest/connector/notion/types/user.py +76 -0
  150. unstructured_ingest/connector/onedrive.py +232 -0
  151. unstructured_ingest/connector/opensearch.py +218 -0
  152. unstructured_ingest/connector/outlook.py +285 -0
  153. unstructured_ingest/connector/pinecone.py +140 -0
  154. unstructured_ingest/connector/qdrant.py +144 -0
  155. unstructured_ingest/connector/reddit.py +166 -0
  156. unstructured_ingest/connector/registry.py +109 -0
  157. unstructured_ingest/connector/salesforce.py +301 -0
  158. unstructured_ingest/connector/sharepoint.py +573 -0
  159. unstructured_ingest/connector/slack.py +224 -0
  160. unstructured_ingest/connector/sql.py +199 -0
  161. unstructured_ingest/connector/vectara.py +248 -0
  162. unstructured_ingest/connector/weaviate.py +190 -0
  163. unstructured_ingest/connector/wikipedia.py +208 -0
  164. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  165. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  166. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  167. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  168. unstructured_ingest/error.py +49 -0
  169. unstructured_ingest/evaluate.py +338 -0
  170. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  171. unstructured_ingest/ingest_backoff/_common.py +102 -0
  172. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  173. unstructured_ingest/interfaces.py +838 -0
  174. unstructured_ingest/logger.py +130 -0
  175. unstructured_ingest/main.py +11 -0
  176. unstructured_ingest/pipeline/__init__.py +22 -0
  177. unstructured_ingest/pipeline/copy.py +19 -0
  178. unstructured_ingest/pipeline/doc_factory.py +12 -0
  179. unstructured_ingest/pipeline/interfaces.py +265 -0
  180. unstructured_ingest/pipeline/partition.py +60 -0
  181. unstructured_ingest/pipeline/permissions.py +12 -0
  182. unstructured_ingest/pipeline/pipeline.py +117 -0
  183. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  184. unstructured_ingest/pipeline/reformat/chunking.py +130 -0
  185. unstructured_ingest/pipeline/reformat/embedding.py +66 -0
  186. unstructured_ingest/pipeline/source.py +77 -0
  187. unstructured_ingest/pipeline/utils.py +6 -0
  188. unstructured_ingest/pipeline/write.py +18 -0
  189. unstructured_ingest/processor.py +93 -0
  190. unstructured_ingest/runner/__init__.py +104 -0
  191. unstructured_ingest/runner/airtable.py +35 -0
  192. unstructured_ingest/runner/astra.py +34 -0
  193. unstructured_ingest/runner/base_runner.py +89 -0
  194. unstructured_ingest/runner/biomed.py +45 -0
  195. unstructured_ingest/runner/confluence.py +35 -0
  196. unstructured_ingest/runner/delta_table.py +34 -0
  197. unstructured_ingest/runner/discord.py +35 -0
  198. unstructured_ingest/runner/elasticsearch.py +40 -0
  199. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  200. unstructured_ingest/runner/fsspec/azure.py +30 -0
  201. unstructured_ingest/runner/fsspec/box.py +28 -0
  202. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  203. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  204. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  205. unstructured_ingest/runner/fsspec/s3.py +28 -0
  206. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  207. unstructured_ingest/runner/github.py +37 -0
  208. unstructured_ingest/runner/gitlab.py +37 -0
  209. unstructured_ingest/runner/google_drive.py +35 -0
  210. unstructured_ingest/runner/hubspot.py +35 -0
  211. unstructured_ingest/runner/jira.py +35 -0
  212. unstructured_ingest/runner/kafka.py +34 -0
  213. unstructured_ingest/runner/local.py +23 -0
  214. unstructured_ingest/runner/mongodb.py +34 -0
  215. unstructured_ingest/runner/notion.py +61 -0
  216. unstructured_ingest/runner/onedrive.py +35 -0
  217. unstructured_ingest/runner/opensearch.py +40 -0
  218. unstructured_ingest/runner/outlook.py +33 -0
  219. unstructured_ingest/runner/reddit.py +35 -0
  220. unstructured_ingest/runner/salesforce.py +33 -0
  221. unstructured_ingest/runner/sharepoint.py +35 -0
  222. unstructured_ingest/runner/slack.py +33 -0
  223. unstructured_ingest/runner/utils.py +47 -0
  224. unstructured_ingest/runner/wikipedia.py +35 -0
  225. unstructured_ingest/runner/writers/__init__.py +48 -0
  226. unstructured_ingest/runner/writers/astra.py +22 -0
  227. unstructured_ingest/runner/writers/azure_cognitive_search.py +24 -0
  228. unstructured_ingest/runner/writers/base_writer.py +26 -0
  229. unstructured_ingest/runner/writers/chroma.py +22 -0
  230. unstructured_ingest/runner/writers/clarifai.py +19 -0
  231. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  232. unstructured_ingest/runner/writers/delta_table.py +24 -0
  233. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  234. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  235. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  236. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  237. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  238. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  239. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  240. unstructured_ingest/runner/writers/kafka.py +21 -0
  241. unstructured_ingest/runner/writers/mongodb.py +21 -0
  242. unstructured_ingest/runner/writers/opensearch.py +26 -0
  243. unstructured_ingest/runner/writers/pinecone.py +21 -0
  244. unstructured_ingest/runner/writers/qdrant.py +19 -0
  245. unstructured_ingest/runner/writers/sql.py +22 -0
  246. unstructured_ingest/runner/writers/vectara.py +22 -0
  247. unstructured_ingest/runner/writers/weaviate.py +21 -0
  248. unstructured_ingest/utils/__init__.py +0 -0
  249. unstructured_ingest/utils/compression.py +117 -0
  250. unstructured_ingest/utils/data_prep.py +112 -0
  251. unstructured_ingest/utils/dep_check.py +66 -0
  252. unstructured_ingest/utils/string_and_date_utils.py +39 -0
  253. unstructured_ingest/utils/table.py +73 -0
  254. unstructured_ingest/v2/__init__.py +1 -0
  255. unstructured_ingest/v2/cli/__init__.py +0 -0
  256. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  257. unstructured_ingest/v2/cli/base/cmd.py +215 -0
  258. unstructured_ingest/v2/cli/base/dest.py +76 -0
  259. unstructured_ingest/v2/cli/base/importer.py +34 -0
  260. unstructured_ingest/v2/cli/base/src.py +70 -0
  261. unstructured_ingest/v2/cli/cli.py +24 -0
  262. unstructured_ingest/v2/cli/cmds/__init__.py +87 -0
  263. unstructured_ingest/v2/cli/cmds/astra.py +85 -0
  264. unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +72 -0
  265. unstructured_ingest/v2/cli/cmds/chroma.py +108 -0
  266. unstructured_ingest/v2/cli/cmds/databricks_volumes.py +161 -0
  267. unstructured_ingest/v2/cli/cmds/elasticsearch.py +159 -0
  268. unstructured_ingest/v2/cli/cmds/fsspec/__init__.py +0 -0
  269. unstructured_ingest/v2/cli/cmds/fsspec/azure.py +84 -0
  270. unstructured_ingest/v2/cli/cmds/fsspec/box.py +58 -0
  271. unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +58 -0
  272. unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +77 -0
  273. unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +81 -0
  274. unstructured_ingest/v2/cli/cmds/fsspec/s3.py +84 -0
  275. unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +80 -0
  276. unstructured_ingest/v2/cli/cmds/google_drive.py +74 -0
  277. unstructured_ingest/v2/cli/cmds/local.py +60 -0
  278. unstructured_ingest/v2/cli/cmds/mongodb.py +62 -0
  279. unstructured_ingest/v2/cli/cmds/onedrive.py +91 -0
  280. unstructured_ingest/v2/cli/cmds/opensearch.py +93 -0
  281. unstructured_ingest/v2/cli/cmds/pinecone.py +62 -0
  282. unstructured_ingest/v2/cli/cmds/salesforce.py +79 -0
  283. unstructured_ingest/v2/cli/cmds/sharepoint.py +112 -0
  284. unstructured_ingest/v2/cli/cmds/singlestore.py +96 -0
  285. unstructured_ingest/v2/cli/cmds/sql.py +84 -0
  286. unstructured_ingest/v2/cli/cmds/weaviate.py +100 -0
  287. unstructured_ingest/v2/cli/configs/__init__.py +6 -0
  288. unstructured_ingest/v2/cli/configs/chunk.py +89 -0
  289. unstructured_ingest/v2/cli/configs/embed.py +74 -0
  290. unstructured_ingest/v2/cli/configs/partition.py +99 -0
  291. unstructured_ingest/v2/cli/configs/processor.py +88 -0
  292. unstructured_ingest/v2/cli/interfaces.py +27 -0
  293. unstructured_ingest/v2/cli/utils.py +240 -0
  294. unstructured_ingest/v2/example.py +37 -0
  295. unstructured_ingest/v2/interfaces/__init__.py +29 -0
  296. unstructured_ingest/v2/interfaces/connector.py +32 -0
  297. unstructured_ingest/v2/interfaces/downloader.py +79 -0
  298. unstructured_ingest/v2/interfaces/file_data.py +49 -0
  299. unstructured_ingest/v2/interfaces/indexer.py +28 -0
  300. unstructured_ingest/v2/interfaces/process.py +20 -0
  301. unstructured_ingest/v2/interfaces/processor.py +48 -0
  302. unstructured_ingest/v2/interfaces/upload_stager.py +48 -0
  303. unstructured_ingest/v2/interfaces/uploader.py +39 -0
  304. unstructured_ingest/v2/logger.py +126 -0
  305. unstructured_ingest/v2/main.py +11 -0
  306. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  307. unstructured_ingest/v2/pipeline/interfaces.py +167 -0
  308. unstructured_ingest/v2/pipeline/pipeline.py +284 -0
  309. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  310. unstructured_ingest/v2/pipeline/steps/chunk.py +85 -0
  311. unstructured_ingest/v2/pipeline/steps/download.py +124 -0
  312. unstructured_ingest/v2/pipeline/steps/embed.py +84 -0
  313. unstructured_ingest/v2/pipeline/steps/index.py +61 -0
  314. unstructured_ingest/v2/pipeline/steps/partition.py +78 -0
  315. unstructured_ingest/v2/pipeline/steps/stage.py +64 -0
  316. unstructured_ingest/v2/pipeline/steps/uncompress.py +68 -0
  317. unstructured_ingest/v2/pipeline/steps/upload.py +73 -0
  318. unstructured_ingest/v2/pipeline/utils.py +15 -0
  319. unstructured_ingest/v2/processes/__init__.py +0 -0
  320. unstructured_ingest/v2/processes/chunker.py +97 -0
  321. unstructured_ingest/v2/processes/connector_registry.py +63 -0
  322. unstructured_ingest/v2/processes/connectors/__init__.py +77 -0
  323. unstructured_ingest/v2/processes/connectors/astra.py +152 -0
  324. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +211 -0
  325. unstructured_ingest/v2/processes/connectors/chroma.py +204 -0
  326. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +96 -0
  327. unstructured_ingest/v2/processes/connectors/elasticsearch.py +401 -0
  328. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  329. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +144 -0
  330. unstructured_ingest/v2/processes/connectors/fsspec/box.py +131 -0
  331. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +130 -0
  332. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +342 -0
  333. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +141 -0
  334. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +164 -0
  335. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +166 -0
  336. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  337. unstructured_ingest/v2/processes/connectors/google_drive.py +335 -0
  338. unstructured_ingest/v2/processes/connectors/local.py +204 -0
  339. unstructured_ingest/v2/processes/connectors/mongodb.py +138 -0
  340. unstructured_ingest/v2/processes/connectors/onedrive.py +216 -0
  341. unstructured_ingest/v2/processes/connectors/opensearch.py +155 -0
  342. unstructured_ingest/v2/processes/connectors/pinecone.py +178 -0
  343. unstructured_ingest/v2/processes/connectors/salesforce.py +293 -0
  344. unstructured_ingest/v2/processes/connectors/sharepoint.py +412 -0
  345. unstructured_ingest/v2/processes/connectors/singlestore.py +160 -0
  346. unstructured_ingest/v2/processes/connectors/sql.py +269 -0
  347. unstructured_ingest/v2/processes/connectors/utils.py +19 -0
  348. unstructured_ingest/v2/processes/connectors/weaviate.py +235 -0
  349. unstructured_ingest/v2/processes/embedder.py +76 -0
  350. unstructured_ingest/v2/processes/partitioner.py +166 -0
  351. unstructured_ingest/v2/processes/uncompress.py +43 -0
  352. unstructured_ingest-0.0.0.dist-info/METADATA +319 -0
  353. unstructured_ingest-0.0.0.dist-info/RECORD +356 -0
  354. unstructured_ingest-0.0.0.dist-info/WHEEL +5 -0
  355. unstructured_ingest-0.0.0.dist-info/entry_points.txt +2 -0
  356. unstructured_ingest-0.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,401 @@
1
+ import hashlib
2
+ import json
3
+ import sys
4
+ import uuid
5
+ from dataclasses import dataclass, field
6
+ from pathlib import Path
7
+ from time import time
8
+ from typing import TYPE_CHECKING, Any, Generator, Optional
9
+
10
+ from unstructured.documents.elements import DataSourceMetadata
11
+
12
+ from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin, enhanced_field
13
+ from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
14
+ from unstructured_ingest.utils.data_prep import flatten_dict, generator_batching_wbytes
15
+ from unstructured_ingest.utils.dep_check import requires_dependencies
16
+ from unstructured_ingest.v2.interfaces import (
17
+ AccessConfig,
18
+ ConnectionConfig,
19
+ Downloader,
20
+ DownloaderConfig,
21
+ DownloadResponse,
22
+ FileData,
23
+ Indexer,
24
+ IndexerConfig,
25
+ UploadContent,
26
+ Uploader,
27
+ UploaderConfig,
28
+ UploadStager,
29
+ UploadStagerConfig,
30
+ download_responses,
31
+ )
32
+ from unstructured_ingest.v2.logger import logger
33
+ from unstructured_ingest.v2.processes.connector_registry import (
34
+ DestinationRegistryEntry,
35
+ SourceRegistryEntry,
36
+ )
37
+
38
+ if TYPE_CHECKING:
39
+ from elasticsearch import Elasticsearch as ElasticsearchClient
40
+
41
+ CONNECTOR_TYPE = "elasticsearch"
42
+
43
+
44
@dataclass
class ElasticsearchAccessConfig(AccessConfig):
    """Secret values used when authenticating against Elasticsearch.

    Every field is optional; ``ElasticsearchConnectionConfig.get_client_kwargs``
    decides which of them end up in the client arguments.
    """

    # Password used for basic authentication.
    password: Optional[str] = None
    # API key secret. NOTE(review): ``overload_name`` presumably exposes this
    # field under the alternate name "es_api_key" -- confirm in enhanced_field.
    api_key: Optional[str] = enhanced_field(default=None, overload_name="es_api_key")
    # NOTE(review): not read anywhere in the visible part of this module.
    bearer_auth: Optional[str] = None
    # Only consulted (as a truthiness check) when choosing the basic-auth user.
    ssl_assert_fingerprint: Optional[str] = None
50
+
51
+
52
@dataclass
class ElasticsearchClientInput(EnhancedDataClassJsonMixin):
    """Arguments collected for the Elasticsearch client constructor.

    ``ElasticsearchConnectionConfig.get_client_kwargs`` fills in the relevant
    fields, serializes this object to a dict and drops the entries that are
    still ``None``.
    """

    hosts: Optional[list[str]] = None
    cloud_id: Optional[str] = None
    ca_certs: Optional[str] = None
    # Marked sensitive so it can be redacted when the inputs are logged.
    basic_auth: Optional[tuple[str, str]] = enhanced_field(sensitive=True, default=None)
    # NOTE(review): annotated as ``str`` but get_client_kwargs may also assign
    # an ``(api_key_id, api_key)`` tuple here.
    api_key: Optional[str] = enhanced_field(sensitive=True, default=None)
59
+
60
+
61
@dataclass
class ElasticsearchConnectionConfig(ConnectionConfig):
    """Connection settings for Elasticsearch plus helpers that build a client."""

    hosts: Optional[list[str]] = None
    username: Optional[str] = None
    cloud_id: Optional[str] = None
    api_key_id: Optional[str] = None
    ca_certs: Optional[str] = None
    access_config: ElasticsearchAccessConfig = enhanced_field(sensitive=True)

    def get_client_kwargs(self) -> dict:
        """Translate this config into keyword arguments for the Elasticsearch client.

        Returns:
            A dict holding only the client arguments that were actually set
            (entries whose value is ``None`` are removed).
        """
        # Update auth related fields to conform to what the SDK expects based on the
        # supported methods:
        # https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/connecting.html
        access = self.access_config
        client_input = ElasticsearchClientInput()
        if self.hosts:
            client_input.hosts = self.hosts
        if self.cloud_id:
            client_input.cloud_id = self.cloud_id
        if self.ca_certs:
            client_input.ca_certs = self.ca_certs

        # Exactly one auth method is selected; the order of the branches is the
        # precedence between them.
        if access.password and (
            self.cloud_id or self.ca_certs or access.ssl_assert_fingerprint
        ):
            # In this branch the user name is fixed to "elastic"; ``username``
            # is not consulted.
            client_input.basic_auth = ("elastic", access.password)
        elif not self.cloud_id and self.username and access.password:
            client_input.basic_auth = (self.username, access.password)
        elif access.api_key and self.api_key_id:
            client_input.api_key = (self.api_key_id, access.api_key)
        elif access.api_key:
            client_input.api_key = access.api_key

        logger.debug(
            f"Elasticsearch client inputs mapped to: {client_input.to_dict(redact_sensitive=True)}"
        )
        client_kwargs = client_input.to_dict(redact_sensitive=False)
        return {key: value for key, value in client_kwargs.items() if value is not None}

    @requires_dependencies(["elasticsearch"], extras="elasticsearch")
    def get_client(self) -> "ElasticsearchClient":
        """Create an Elasticsearch client and validate that it can connect.

        Raises:
            SourceConnectionError: if the connection check fails.
        """
        from elasticsearch import Elasticsearch as ElasticsearchClient

        client = ElasticsearchClient(**self.get_client_kwargs())
        self.check_connection(client=client)
        return client

    def check_connection(self, client: "ElasticsearchClient"):
        """Issue a ``HEAD /`` request to confirm the client can reach the cluster.

        Raises:
            SourceConnectionError: if the request raises for any reason; the
                original exception is attached as ``__cause__``.
        """
        try:
            client.perform_request("HEAD", "/", headers={"accept": "application/json"})
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            # Chain the cause so callers can still inspect the underlying error.
            raise SourceConnectionError(f"failed to validate connection: {e}") from e
112
+
113
+
114
@dataclass
class ElasticsearchIndexerConfig(IndexerConfig):
    """Indexer settings: the index to enumerate and the id batch size."""

    # Name of the index whose document ids are listed.
    index_name: str
    # Maximum number of document ids grouped into one emitted FileData.
    batch_size: int = 100
118
+
119
+
120
@dataclass
class ElasticsearchIndexer(Indexer):
    """Lists every document id in an index and emits them in fixed-size batches."""

    connection_config: ElasticsearchConnectionConfig
    index_config: ElasticsearchIndexerConfig
    client: "ElasticsearchClient" = field(init=False)
    connector_type: str = CONNECTOR_TYPE

    def __post_init__(self):
        # Building the client also runs the connection check.
        self.client = self.connection_config.get_client()

    @requires_dependencies(["elasticsearch"], extras="elasticsearch")
    def load_scan(self):
        """Import and return ``elasticsearch.helpers.scan`` lazily."""
        from elasticsearch.helpers import scan

        return scan

    def _get_doc_ids(self) -> set[str]:
        """Fetches all document ids in an index"""
        scan = self.load_scan()

        # ``stored_fields: []`` keeps the hits small: only the ``_id`` is needed.
        scan_query: dict = {"stored_fields": [], "query": {"match_all": {}}}
        hits = scan(
            self.client,
            query=scan_query,
            scroll="1m",
            index=self.index_config.index_name,
        )

        return {hit["_id"] for hit in hits}

    @staticmethod
    def _batch_identifier(batch_ids: list[str]) -> str:
        """Return a stable, non-negative decimal identifier for a batch of ids.

        The built-in ``hash()`` is salted per process for strings, so it cannot
        produce identifiers that are reproducible between runs; a SHA-256
        digest of the sorted ids is used instead. The value is reduced to the
        same range the previous ``hash(...) + sys.maxsize + 1`` form produced.
        """
        digest = hashlib.sha256(json.dumps(sorted(batch_ids)).encode("utf-8")).digest()
        return str(int.from_bytes(digest, "big") % (2 * (sys.maxsize + 1)))

    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
        """Yield one ``FileData`` per batch of document ids.

        Each item carries the batch's ids and the index name in
        ``additional_metadata``.

        Raises:
            ValueError: if ``index_config.batch_size`` is smaller than 1.
        """
        batch_size = self.index_config.batch_size
        if batch_size < 1:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

        index_name = self.index_config.index_name
        # Sorting makes batch membership (and so identifiers) reproducible;
        # iteration order of a set of strings differs between processes.
        ids = sorted(self._get_doc_ids())

        # ``hosts`` is optional (e.g. when only ``cloud_id`` is configured), so
        # only build a url when a host is available.
        hosts = self.connection_config.hosts
        url = f"{hosts[0]}/{index_name}" if hosts else None

        for start in range(0, len(ids), batch_size):
            batch = ids[start : start + batch_size]  # noqa
            yield FileData(
                identifier=self._batch_identifier(batch),
                connector_type=CONNECTOR_TYPE,
                metadata=DataSourceMetadata(
                    url=url,
                    date_processed=str(time()),
                ),
                additional_metadata={
                    "ids": batch,
                    "index_name": index_name,
                },
            )
180
+
181
+
182
@dataclass
class ElasticsearchDownloaderConfig(DownloaderConfig):
    """Downloader settings for Elasticsearch."""

    # Passed as the ``_source`` filter of the download query; also folded into
    # the per-record identifier when non-empty.
    fields: list[str] = field(default_factory=list)
185
+
186
+
187
@dataclass
class ElasticsearchDownloader(Downloader):
    """Fetches the documents of one id batch and writes each to a text file."""

    connection_config: ElasticsearchConnectionConfig
    download_config: ElasticsearchDownloaderConfig
    connector_type: str = CONNECTOR_TYPE

    def is_async(self) -> bool:
        """Report that this downloader is driven through ``run_async``."""
        return True

    def get_identifier(self, index_name: str, record_id: str) -> str:
        """Build the per-record identifier, also used as the file name stem.

        When specific fields are configured, the first 8 hex characters of the
        SHA-256 of the comma-joined field names are appended.
        """
        identifier = f"{index_name}-{record_id}"
        selected_fields = self.download_config.fields
        if selected_fields:
            fields_digest = hashlib.sha256(",".join(selected_fields).encode()).hexdigest()[:8]
            identifier = f"{identifier}-{fields_digest}"
        return identifier

    def map_es_results(self, es_results: dict) -> str:
        """Flatten a hit's ``_source`` and join its values, one per line."""
        flattened = flatten_dict(dictionary=es_results["_source"])
        return "\n".join(str(value) for value in flattened.values())

    def generate_download_response(
        self, result: dict, index_name: str, file_data: FileData
    ) -> DownloadResponse:
        """Write one hit to disk and describe it as a ``DownloadResponse``.

        Args:
            result: a single hit returned by the scan.
            index_name: index the hit came from.
            file_data: the batch being processed; only used in the error message.

        Raises:
            SourceConnectionNetworkError: if writing the file fails; the
                original exception is attached as ``__cause__``.
        """
        record_id = result["_id"]
        filename_id = self.get_identifier(index_name=index_name, record_id=record_id)
        filename = f"{filename_id}.txt"
        download_path = self.download_dir / Path(filename)
        logger.debug(
            f"Downloading results from index {index_name} and id {record_id} to {download_path}"
        )
        download_path.parent.mkdir(parents=True, exist_ok=True)
        try:
            with open(download_path, "w", encoding="utf8") as f:
                f.write(self.map_es_results(es_results=result))
        except Exception as e:
            logger.error(
                f"failed to download from index {index_name} "
                f"and id {record_id} to {download_path}: {e}",
                exc_info=True,
            )
            # Chain the cause so the underlying failure is not lost to callers.
            raise SourceConnectionNetworkError(
                f"failed to download file {file_data.identifier}"
            ) from e
        return DownloadResponse(
            file_data=FileData(
                identifier=filename_id,
                connector_type=CONNECTOR_TYPE,
                metadata=DataSourceMetadata(
                    version=str(result["_version"]) if "_version" in result else None,
                    date_processed=str(time()),
                    record_locator={
                        "hosts": self.connection_config.hosts,
                        "index_name": index_name,
                        "document_id": record_id,
                    },
                ),
            ),
            path=download_path,
        )

    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
        """Synchronous download is not supported; use ``run_async``."""
        raise NotImplementedError()

    @requires_dependencies(["elasticsearch"], extras="elasticsearch")
    def load_async(self):
        """Import and return ``(AsyncElasticsearch, async_scan)`` lazily."""
        from elasticsearch import AsyncElasticsearch
        from elasticsearch.helpers import async_scan

        return AsyncElasticsearch, async_scan

    async def run_async(self, file_data: FileData, **kwargs: Any) -> download_responses:
        """Download every document listed in ``file_data.additional_metadata``.

        Reads ``index_name`` and ``ids`` from the batch metadata, scans for
        those ids and returns one download response per hit.
        """
        AsyncClient, async_scan = self.load_async()

        index_name: str = file_data.additional_metadata["index_name"]
        ids: list[str] = file_data.additional_metadata["ids"]

        scan_query = {
            "_source": self.download_config.fields,
            "version": True,
            "query": {"ids": {"values": ids}},
        }

        # Named ``responses`` so the imported ``download_responses`` type alias
        # is not shadowed inside this method.
        responses = []
        async with AsyncClient(**self.connection_config.get_client_kwargs()) as client:
            async for result in async_scan(
                client,
                query=scan_query,
                scroll="1m",
                index=index_name,
            ):
                responses.append(
                    self.generate_download_response(
                        result=result, index_name=index_name, file_data=file_data
                    )
                )
        return responses
286
+
287
+
288
@dataclass
class ElasticsearchUploadStagerConfig(UploadStagerConfig):
    """Settings for staging elements before upload."""

    # Written into the "_index" key of every staged document.
    index_name: str
291
+
292
+
293
@dataclass
class ElasticsearchUploadStager(UploadStager):
    """Reshape a JSON file of elements into bulk-action documents."""

    upload_stager_config: ElasticsearchUploadStagerConfig

    def conform_dict(self, data: dict) -> dict:
        """Build one bulk-action document from a single element dict.

        ``data`` is only read, never modified, so callers can keep using it.

        Args:
            data: Element dict. The keys ``element_id``, ``embeddings``,
                ``text`` and ``type`` are copied (``None`` when missing); a
                dict-valued ``metadata`` key is flattened with ``-`` as the
                separator.

        Returns:
            A dict with ``_index`` (the configured index name), ``_id`` (a
            fresh random UUID string) and ``_source`` (the copied fields).
        """
        resp = {
            "_index": self.upload_stager_config.index_name,
            "_id": str(uuid.uuid4()),
            "_source": {
                "element_id": data.get("element_id"),
                "embeddings": data.get("embeddings"),
                "text": data.get("text"),
                "type": data.get("type"),
            },
        }
        metadata = data.get("metadata")
        if isinstance(metadata, dict):
            resp["_source"]["metadata"] = flatten_dict(metadata, separator="-")
        return resp

    def run(
        self,
        elements_filepath: Path,
        file_data: FileData,
        output_dir: Path,
        output_filename: str,
        **kwargs: Any,
    ) -> Path:
        """Stage every element in ``elements_filepath`` for upload.

        Args:
            elements_filepath: JSON file holding a list of element dicts.
            file_data: Not used by this implementation.
            output_dir: Directory the staged file is written to.
            output_filename: Base name of the staged file; ``.json`` is appended.

        Returns:
            Path of the written JSON file of conformed documents.
        """
        # JSON is UTF-8 by specification; do not rely on the locale default.
        with open(elements_filepath, encoding="utf-8") as elements_file:
            elements_contents = json.load(elements_file)
        conformed_elements = [self.conform_dict(data=element) for element in elements_contents]
        output_path = Path(output_dir) / Path(f"{output_filename}.json")
        with open(output_path, "w", encoding="utf-8") as output_file:
            json.dump(conformed_elements, output_file)
        return output_path
327
+
328
+
329
@dataclass
class ElasticsearchUploaderConfig(UploaderConfig):
    """Settings for the bulk upload step."""

    # Index that is checked for existence and named in the upload logs.
    index_name: str
    # Upper bound, in bytes, passed to the batching helper for each batch.
    batch_size_bytes: int = 15_000_000
    # Passed to the bulk helper as its thread count.
    num_threads: int = 4
334
+
335
+
336
@dataclass
class ElasticsearchUploader(Uploader):
    """Upload staged documents with the ``parallel_bulk`` helper."""

    connector_type: str = CONNECTOR_TYPE
    upload_config: ElasticsearchUploaderConfig
    connection_config: ElasticsearchConnectionConfig

    @requires_dependencies(["elasticsearch"], extras="elasticsearch")
    def load_parallel_bulk(self):
        """Import and return ``elasticsearch.helpers.parallel_bulk``."""
        from elasticsearch.helpers import parallel_bulk

        return parallel_bulk

    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
        """Read every staged file and bulk-upload its documents.

        All files in ``contents`` are loaded into one list, split into
        batches bounded by ``upload_config.batch_size_bytes``, and each batch
        is sent through ``parallel_bulk``. A missing index only produces a
        warning; per-action failures reported by the helper are logged
        together with the detail the helper returned.

        Args:
            contents: Items whose ``path`` points at a JSON file containing
                a list of bulk actions.
        """
        parallel_bulk = self.load_parallel_bulk()
        elements_dict = []
        for content in contents:
            # JSON is UTF-8 by specification; do not rely on the locale default.
            with open(content.path, encoding="utf-8") as elements_file:
                elements = json.load(elements_file)
            elements_dict.extend(elements)
        upload_destination = self.connection_config.hosts or self.connection_config.cloud_id
        logger.info(
            f"writing {len(elements_dict)} elements via document batches to destination "
            f"index named {self.upload_config.index_name} at {upload_destination} with "
            f"batch size (in bytes) {self.upload_config.batch_size_bytes} with "
            f"{self.upload_config.num_threads} (number of) threads"
        )

        # Derived from the class name so subclasses report their own name.
        connector_name = self.__class__.__name__.replace("Uploader", "")

        client = self.connection_config.get_client()
        if not client.indices.exists(index=self.upload_config.index_name):
            logger.warning(
                f"{connector_name} index does not exist: "
                f"{self.upload_config.index_name}. "
                f"This may cause issues when uploading."
            )
        for batch in generator_batching_wbytes(
            elements_dict, batch_size_limit_bytes=self.upload_config.batch_size_bytes
        ):
            for success, info in parallel_bulk(
                client=client,
                actions=batch,
                thread_count=self.upload_config.num_threads,
            ):
                if not success:
                    # The message needs a placeholder for `info`; without one
                    # the failure detail never reaches the log output.
                    logger.error(
                        "upload failed for a batch in %s destination connector: %s",
                        connector_name,
                        info,
                    )
385
+
386
+
387
# Source side of the connector: the indexer and downloader classes with
# their config classes, plus the shared connection config.
elasticsearch_source_entry = SourceRegistryEntry(
    indexer=ElasticsearchIndexer,
    indexer_config=ElasticsearchIndexerConfig,
    downloader=ElasticsearchDownloader,
    downloader_config=ElasticsearchDownloaderConfig,
    connection_config=ElasticsearchConnectionConfig,
)

# Destination side of the connector: the stager and uploader classes with
# their config classes, plus the shared connection config.
elasticsearch_destination_entry = DestinationRegistryEntry(
    upload_stager=ElasticsearchUploadStager,
    upload_stager_config=ElasticsearchUploadStagerConfig,
    uploader=ElasticsearchUploader,
    uploader_config=ElasticsearchUploaderConfig,
    connection_config=ElasticsearchConnectionConfig,
)
@@ -0,0 +1,37 @@
1
+ from __future__ import annotations
2
+
3
+ from unstructured_ingest.v2.processes.connector_registry import (
4
+ add_destination_entry,
5
+ add_source_entry,
6
+ )
7
+
8
+ from .azure import CONNECTOR_TYPE as AZURE_CONNECTOR_TYPE
9
+ from .azure import azure_destination_entry, azure_source_entry
10
+ from .box import CONNECTOR_TYPE as BOX_CONNECTOR_TYPE
11
+ from .box import box_destination_entry, box_source_entry
12
+ from .dropbox import CONNECTOR_TYPE as DROPBOX_CONNECTOR_TYPE
13
+ from .dropbox import dropbox_destination_entry, dropbox_source_entry
14
+ from .gcs import CONNECTOR_TYPE as GCS_CONNECTOR_TYPE
15
+ from .gcs import gcs_destination_entry, gcs_source_entry
16
+ from .s3 import CONNECTOR_TYPE as S3_CONNECTOR_TYPE
17
+ from .s3 import s3_destination_entry, s3_source_entry
18
+ from .sftp import CONNECTOR_TYPE as SFTP_CONNECTOR_TYPE
19
+ from .sftp import sftp_destination_entry, sftp_source_entry
20
+
21
# Register every connector imported above, source entry first and then
# destination entry, in the order: azure, box, dropbox, gcs, s3, sftp.
for _connector_type, _source_entry, _destination_entry in (
    (AZURE_CONNECTOR_TYPE, azure_source_entry, azure_destination_entry),
    (BOX_CONNECTOR_TYPE, box_source_entry, box_destination_entry),
    (DROPBOX_CONNECTOR_TYPE, dropbox_source_entry, dropbox_destination_entry),
    (GCS_CONNECTOR_TYPE, gcs_source_entry, gcs_destination_entry),
    (S3_CONNECTOR_TYPE, s3_source_entry, s3_destination_entry),
    (SFTP_CONNECTOR_TYPE, sftp_source_entry, sftp_destination_entry),
):
    add_source_entry(source_type=_connector_type, entry=_source_entry)
    add_destination_entry(destination_type=_connector_type, entry=_destination_entry)

# Keep the loop variables out of the package namespace.
del _connector_type, _source_entry, _destination_entry
@@ -0,0 +1,144 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass, field
4
+ from pathlib import Path
5
+ from typing import Any, Generator, Optional
6
+
7
+ from unstructured_ingest.enhanced_dataclass import enhanced_field
8
+ from unstructured_ingest.utils.dep_check import requires_dependencies
9
+ from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, UploadContent
10
+ from unstructured_ingest.v2.processes.connector_registry import (
11
+ DestinationRegistryEntry,
12
+ SourceRegistryEntry,
13
+ )
14
+ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
15
+ FsspecAccessConfig,
16
+ FsspecConnectionConfig,
17
+ FsspecDownloader,
18
+ FsspecDownloaderConfig,
19
+ FsspecIndexer,
20
+ FsspecIndexerConfig,
21
+ FsspecUploader,
22
+ FsspecUploaderConfig,
23
+ )
24
+ from unstructured_ingest.v2.processes.connectors.fsspec.utils import json_serial, sterilize_dict
25
+
26
# Identifier used as the default `connector_type` of the Azure classes below.
CONNECTOR_TYPE = "azure"
27
+
28
+
29
def azure_json_serial(obj):
    """Serialization fallback that also understands Azure blob values.

    Passed as ``default=`` to ``sterilize_dict`` by ``AzureIndexer``.

    Args:
        obj: A value that could not be serialized directly.

    Returns:
        ``dict(obj)`` for ``ContentSettings`` instances, ``str(obj)`` for
        ``bytearray`` instances, otherwise whatever ``json_serial`` returns.
    """
    from azure.storage.blob._models import ContentSettings

    if isinstance(obj, ContentSettings):
        converted = dict(obj)
    elif isinstance(obj, bytearray):
        # NOTE(review): str() of a bytearray yields its repr-style text
        # ("bytearray(b'...')"), not decoded content — confirm this is intended.
        converted = str(obj)
    else:
        converted = json_serial(obj)
    return converted
37
+
38
+
39
@dataclass
class AzureIndexerConfig(FsspecIndexerConfig):
    """Indexer settings for Azure; adds nothing to ``FsspecIndexerConfig``."""
42
+
43
+
44
@dataclass
class AzureAccessConfig(FsspecAccessConfig):
    """Azure credentials; every field is optional on its own.

    At least one of ``connection_string`` or ``account_name`` must be
    provided, which is enforced in ``__post_init__``.
    """

    account_name: Optional[str] = None
    account_key: Optional[str] = None
    connection_string: Optional[str] = None
    sas_token: Optional[str] = None

    def __post_init__(self):
        """Validate that the account can be identified.

        Raises:
            ValueError: If both ``connection_string`` and ``account_name``
                are ``None``.
        """
        if self.connection_string is not None or self.account_name is not None:
            return
        raise ValueError("either connection_string or account_name must be set")
54
+
55
+
56
@dataclass
class AzureConnectionConfig(FsspecConnectionConfig):
    """Connection settings for the Azure connector (``az`` protocol)."""

    supported_protocols: list[str] = field(default_factory=lambda: ["az"])
    # NOTE(review): the default factory builds AzureAccessConfig() with no
    # arguments, which its own __post_init__ check appears to reject — so an
    # access config presumably has to be supplied explicitly; confirm.
    access_config: AzureAccessConfig = enhanced_field(
        sensitive=True, default_factory=lambda: AzureAccessConfig()
    )
    connector_type: str = CONNECTOR_TYPE

    def get_access_config(self) -> dict[str, Any]:
        """Return the access settings that actually carry a value.

        The filter is a truthiness test, so it drops ``None`` and also any
        other falsy value such as an empty string.
        """
        raw_config = self.access_config.to_dict()
        return {key: value for key, value in raw_config.items() if value}
70
+
71
+
72
@dataclass
class AzureIndexer(FsspecIndexer):
    """Indexer for Azure built on ``FsspecIndexer``."""

    connection_config: AzureConnectionConfig
    index_config: AzureIndexerConfig
    connector_type: str = CONNECTOR_TYPE

    def sterilize_info(self, path) -> dict:
        """Look up ``path`` on the filesystem and return sterilized info.

        The raw result of ``self.fs.info`` is passed through
        ``sterilize_dict`` with ``azure_json_serial`` as the fallback
        serializer.
        """
        raw_info = self.fs.info(path=path)
        sterilized = sterilize_dict(data=raw_info, default=azure_json_serial)
        return sterilized

    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
        """Delegate to ``FsspecIndexer.run`` behind the dependency check."""
        return super().run(**kwargs)
85
+
86
+
87
@dataclass
class AzureDownloaderConfig(FsspecDownloaderConfig):
    """Downloader settings for Azure; adds nothing to ``FsspecDownloaderConfig``."""
90
+
91
+
92
@dataclass
class AzureDownloader(FsspecDownloader):
    """Downloader for Azure built on ``FsspecDownloader``.

    Both entry points forward to the parent implementation; the overrides
    exist to attach the dependency check for the ``azure`` extra.
    """

    protocol: str = "az"
    connection_config: AzureConnectionConfig
    connector_type: str = CONNECTOR_TYPE
    download_config: Optional[AzureDownloaderConfig] = field(default_factory=AzureDownloaderConfig)

    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        """Forward to ``FsspecDownloader.run`` and return its response."""
        response = super().run(file_data=file_data, **kwargs)
        return response

    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
    async def run_async(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        """Forward to ``FsspecDownloader.run_async`` and return its response."""
        response = await super().run_async(file_data=file_data, **kwargs)
        return response
106
+
107
+
108
@dataclass
class AzureUploaderConfig(FsspecUploaderConfig):
    """Uploader settings for Azure; adds nothing to ``FsspecUploaderConfig``."""
111
+
112
+
113
@dataclass
class AzureUploader(FsspecUploader):
    """Uploader for Azure built on ``FsspecUploader``.

    Every method forwards to the parent implementation; the overrides exist
    to attach the dependency check for the ``azure`` extra.
    """

    connector_type: str = CONNECTOR_TYPE
    connection_config: AzureConnectionConfig
    # Defaults to None even though the annotation is not Optional.
    upload_config: AzureUploaderConfig = field(default=None)

    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
    def __post_init__(self):
        """Run the parent's post-init step behind the dependency check."""
        super().__post_init__()

    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
        """Forward ``contents`` to ``FsspecUploader.run``."""
        outcome = super().run(contents=contents, **kwargs)
        return outcome

    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
    async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        """Forward ``path`` and ``file_data`` to ``FsspecUploader.run_async``."""
        outcome = await super().run_async(path=path, file_data=file_data, **kwargs)
        return outcome
130
+
131
+
132
# Source side of the Azure connector: connection config plus the indexer
# and downloader classes with their config classes.
azure_source_entry = SourceRegistryEntry(
    connection_config=AzureConnectionConfig,
    indexer=AzureIndexer,
    indexer_config=AzureIndexerConfig,
    downloader=AzureDownloader,
    downloader_config=AzureDownloaderConfig,
)

# Destination side of the Azure connector: connection config plus the
# uploader class with its config class.
azure_destination_entry = DestinationRegistryEntry(
    connection_config=AzureConnectionConfig,
    uploader=AzureUploader,
    uploader_config=AzureUploaderConfig,
)