unstructured-ingest 0.0.0__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (356) hide show
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/__init__.py +14 -0
  4. unstructured_ingest/cli/base/__init__.py +0 -0
  5. unstructured_ingest/cli/base/cmd.py +19 -0
  6. unstructured_ingest/cli/base/dest.py +87 -0
  7. unstructured_ingest/cli/base/src.py +57 -0
  8. unstructured_ingest/cli/cli.py +32 -0
  9. unstructured_ingest/cli/cmd_factory.py +12 -0
  10. unstructured_ingest/cli/cmds/__init__.py +145 -0
  11. unstructured_ingest/cli/cmds/airtable.py +69 -0
  12. unstructured_ingest/cli/cmds/astra.py +99 -0
  13. unstructured_ingest/cli/cmds/azure_cognitive_search.py +65 -0
  14. unstructured_ingest/cli/cmds/biomed.py +52 -0
  15. unstructured_ingest/cli/cmds/chroma.py +104 -0
  16. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  17. unstructured_ingest/cli/cmds/confluence.py +69 -0
  18. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  19. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  20. unstructured_ingest/cli/cmds/discord.py +47 -0
  21. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  22. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  23. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  24. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  25. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  26. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  27. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  28. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  29. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  30. unstructured_ingest/cli/cmds/github.py +54 -0
  31. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  32. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  33. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  34. unstructured_ingest/cli/cmds/jira.py +71 -0
  35. unstructured_ingest/cli/cmds/kafka.py +102 -0
  36. unstructured_ingest/cli/cmds/local.py +43 -0
  37. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  38. unstructured_ingest/cli/cmds/notion.py +48 -0
  39. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  40. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  41. unstructured_ingest/cli/cmds/outlook.py +67 -0
  42. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  43. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  44. unstructured_ingest/cli/cmds/reddit.py +67 -0
  45. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  46. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  47. unstructured_ingest/cli/cmds/slack.py +56 -0
  48. unstructured_ingest/cli/cmds/sql.py +66 -0
  49. unstructured_ingest/cli/cmds/vectara.py +66 -0
  50. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  51. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  52. unstructured_ingest/cli/common.py +7 -0
  53. unstructured_ingest/cli/interfaces.py +656 -0
  54. unstructured_ingest/cli/utils.py +205 -0
  55. unstructured_ingest/connector/__init__.py +0 -0
  56. unstructured_ingest/connector/airtable.py +309 -0
  57. unstructured_ingest/connector/astra.py +237 -0
  58. unstructured_ingest/connector/azure_cognitive_search.py +144 -0
  59. unstructured_ingest/connector/biomed.py +313 -0
  60. unstructured_ingest/connector/chroma.py +158 -0
  61. unstructured_ingest/connector/clarifai.py +122 -0
  62. unstructured_ingest/connector/confluence.py +285 -0
  63. unstructured_ingest/connector/databricks_volumes.py +137 -0
  64. unstructured_ingest/connector/delta_table.py +203 -0
  65. unstructured_ingest/connector/discord.py +180 -0
  66. unstructured_ingest/connector/elasticsearch.py +396 -0
  67. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  68. unstructured_ingest/connector/fsspec/azure.py +78 -0
  69. unstructured_ingest/connector/fsspec/box.py +109 -0
  70. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  71. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  72. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  73. unstructured_ingest/connector/fsspec/s3.py +62 -0
  74. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  75. unstructured_ingest/connector/git.py +124 -0
  76. unstructured_ingest/connector/github.py +173 -0
  77. unstructured_ingest/connector/gitlab.py +142 -0
  78. unstructured_ingest/connector/google_drive.py +349 -0
  79. unstructured_ingest/connector/hubspot.py +278 -0
  80. unstructured_ingest/connector/jira.py +469 -0
  81. unstructured_ingest/connector/kafka.py +294 -0
  82. unstructured_ingest/connector/local.py +139 -0
  83. unstructured_ingest/connector/mongodb.py +285 -0
  84. unstructured_ingest/connector/notion/__init__.py +0 -0
  85. unstructured_ingest/connector/notion/client.py +233 -0
  86. unstructured_ingest/connector/notion/connector.py +468 -0
  87. unstructured_ingest/connector/notion/helpers.py +584 -0
  88. unstructured_ingest/connector/notion/interfaces.py +32 -0
  89. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  90. unstructured_ingest/connector/notion/types/block.py +95 -0
  91. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  92. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  93. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  94. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  95. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  96. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  97. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  98. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  99. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  100. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  101. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  102. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  103. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  104. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  105. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  106. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  107. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  108. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  109. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  110. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  111. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  112. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  113. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  114. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  115. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  116. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  117. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  118. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  119. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  120. unstructured_ingest/connector/notion/types/database.py +72 -0
  121. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  122. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  123. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  124. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  125. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  126. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  127. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  128. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  129. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  130. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  131. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  132. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  133. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  134. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  135. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  136. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  137. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  138. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  139. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  140. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  141. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  142. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  143. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  144. unstructured_ingest/connector/notion/types/date.py +26 -0
  145. unstructured_ingest/connector/notion/types/file.py +51 -0
  146. unstructured_ingest/connector/notion/types/page.py +44 -0
  147. unstructured_ingest/connector/notion/types/parent.py +66 -0
  148. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  149. unstructured_ingest/connector/notion/types/user.py +76 -0
  150. unstructured_ingest/connector/onedrive.py +232 -0
  151. unstructured_ingest/connector/opensearch.py +218 -0
  152. unstructured_ingest/connector/outlook.py +285 -0
  153. unstructured_ingest/connector/pinecone.py +140 -0
  154. unstructured_ingest/connector/qdrant.py +144 -0
  155. unstructured_ingest/connector/reddit.py +166 -0
  156. unstructured_ingest/connector/registry.py +109 -0
  157. unstructured_ingest/connector/salesforce.py +301 -0
  158. unstructured_ingest/connector/sharepoint.py +573 -0
  159. unstructured_ingest/connector/slack.py +224 -0
  160. unstructured_ingest/connector/sql.py +199 -0
  161. unstructured_ingest/connector/vectara.py +248 -0
  162. unstructured_ingest/connector/weaviate.py +190 -0
  163. unstructured_ingest/connector/wikipedia.py +208 -0
  164. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  165. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  166. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  167. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  168. unstructured_ingest/error.py +49 -0
  169. unstructured_ingest/evaluate.py +338 -0
  170. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  171. unstructured_ingest/ingest_backoff/_common.py +102 -0
  172. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  173. unstructured_ingest/interfaces.py +838 -0
  174. unstructured_ingest/logger.py +130 -0
  175. unstructured_ingest/main.py +11 -0
  176. unstructured_ingest/pipeline/__init__.py +22 -0
  177. unstructured_ingest/pipeline/copy.py +19 -0
  178. unstructured_ingest/pipeline/doc_factory.py +12 -0
  179. unstructured_ingest/pipeline/interfaces.py +265 -0
  180. unstructured_ingest/pipeline/partition.py +60 -0
  181. unstructured_ingest/pipeline/permissions.py +12 -0
  182. unstructured_ingest/pipeline/pipeline.py +117 -0
  183. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  184. unstructured_ingest/pipeline/reformat/chunking.py +130 -0
  185. unstructured_ingest/pipeline/reformat/embedding.py +66 -0
  186. unstructured_ingest/pipeline/source.py +77 -0
  187. unstructured_ingest/pipeline/utils.py +6 -0
  188. unstructured_ingest/pipeline/write.py +18 -0
  189. unstructured_ingest/processor.py +93 -0
  190. unstructured_ingest/runner/__init__.py +104 -0
  191. unstructured_ingest/runner/airtable.py +35 -0
  192. unstructured_ingest/runner/astra.py +34 -0
  193. unstructured_ingest/runner/base_runner.py +89 -0
  194. unstructured_ingest/runner/biomed.py +45 -0
  195. unstructured_ingest/runner/confluence.py +35 -0
  196. unstructured_ingest/runner/delta_table.py +34 -0
  197. unstructured_ingest/runner/discord.py +35 -0
  198. unstructured_ingest/runner/elasticsearch.py +40 -0
  199. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  200. unstructured_ingest/runner/fsspec/azure.py +30 -0
  201. unstructured_ingest/runner/fsspec/box.py +28 -0
  202. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  203. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  204. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  205. unstructured_ingest/runner/fsspec/s3.py +28 -0
  206. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  207. unstructured_ingest/runner/github.py +37 -0
  208. unstructured_ingest/runner/gitlab.py +37 -0
  209. unstructured_ingest/runner/google_drive.py +35 -0
  210. unstructured_ingest/runner/hubspot.py +35 -0
  211. unstructured_ingest/runner/jira.py +35 -0
  212. unstructured_ingest/runner/kafka.py +34 -0
  213. unstructured_ingest/runner/local.py +23 -0
  214. unstructured_ingest/runner/mongodb.py +34 -0
  215. unstructured_ingest/runner/notion.py +61 -0
  216. unstructured_ingest/runner/onedrive.py +35 -0
  217. unstructured_ingest/runner/opensearch.py +40 -0
  218. unstructured_ingest/runner/outlook.py +33 -0
  219. unstructured_ingest/runner/reddit.py +35 -0
  220. unstructured_ingest/runner/salesforce.py +33 -0
  221. unstructured_ingest/runner/sharepoint.py +35 -0
  222. unstructured_ingest/runner/slack.py +33 -0
  223. unstructured_ingest/runner/utils.py +47 -0
  224. unstructured_ingest/runner/wikipedia.py +35 -0
  225. unstructured_ingest/runner/writers/__init__.py +48 -0
  226. unstructured_ingest/runner/writers/astra.py +22 -0
  227. unstructured_ingest/runner/writers/azure_cognitive_search.py +24 -0
  228. unstructured_ingest/runner/writers/base_writer.py +26 -0
  229. unstructured_ingest/runner/writers/chroma.py +22 -0
  230. unstructured_ingest/runner/writers/clarifai.py +19 -0
  231. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  232. unstructured_ingest/runner/writers/delta_table.py +24 -0
  233. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  234. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  235. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  236. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  237. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  238. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  239. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  240. unstructured_ingest/runner/writers/kafka.py +21 -0
  241. unstructured_ingest/runner/writers/mongodb.py +21 -0
  242. unstructured_ingest/runner/writers/opensearch.py +26 -0
  243. unstructured_ingest/runner/writers/pinecone.py +21 -0
  244. unstructured_ingest/runner/writers/qdrant.py +19 -0
  245. unstructured_ingest/runner/writers/sql.py +22 -0
  246. unstructured_ingest/runner/writers/vectara.py +22 -0
  247. unstructured_ingest/runner/writers/weaviate.py +21 -0
  248. unstructured_ingest/utils/__init__.py +0 -0
  249. unstructured_ingest/utils/compression.py +117 -0
  250. unstructured_ingest/utils/data_prep.py +112 -0
  251. unstructured_ingest/utils/dep_check.py +66 -0
  252. unstructured_ingest/utils/string_and_date_utils.py +39 -0
  253. unstructured_ingest/utils/table.py +73 -0
  254. unstructured_ingest/v2/__init__.py +1 -0
  255. unstructured_ingest/v2/cli/__init__.py +0 -0
  256. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  257. unstructured_ingest/v2/cli/base/cmd.py +215 -0
  258. unstructured_ingest/v2/cli/base/dest.py +76 -0
  259. unstructured_ingest/v2/cli/base/importer.py +34 -0
  260. unstructured_ingest/v2/cli/base/src.py +70 -0
  261. unstructured_ingest/v2/cli/cli.py +24 -0
  262. unstructured_ingest/v2/cli/cmds/__init__.py +87 -0
  263. unstructured_ingest/v2/cli/cmds/astra.py +85 -0
  264. unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +72 -0
  265. unstructured_ingest/v2/cli/cmds/chroma.py +108 -0
  266. unstructured_ingest/v2/cli/cmds/databricks_volumes.py +161 -0
  267. unstructured_ingest/v2/cli/cmds/elasticsearch.py +159 -0
  268. unstructured_ingest/v2/cli/cmds/fsspec/__init__.py +0 -0
  269. unstructured_ingest/v2/cli/cmds/fsspec/azure.py +84 -0
  270. unstructured_ingest/v2/cli/cmds/fsspec/box.py +58 -0
  271. unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +58 -0
  272. unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +77 -0
  273. unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +81 -0
  274. unstructured_ingest/v2/cli/cmds/fsspec/s3.py +84 -0
  275. unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +80 -0
  276. unstructured_ingest/v2/cli/cmds/google_drive.py +74 -0
  277. unstructured_ingest/v2/cli/cmds/local.py +60 -0
  278. unstructured_ingest/v2/cli/cmds/mongodb.py +62 -0
  279. unstructured_ingest/v2/cli/cmds/onedrive.py +91 -0
  280. unstructured_ingest/v2/cli/cmds/opensearch.py +93 -0
  281. unstructured_ingest/v2/cli/cmds/pinecone.py +62 -0
  282. unstructured_ingest/v2/cli/cmds/salesforce.py +79 -0
  283. unstructured_ingest/v2/cli/cmds/sharepoint.py +112 -0
  284. unstructured_ingest/v2/cli/cmds/singlestore.py +96 -0
  285. unstructured_ingest/v2/cli/cmds/sql.py +84 -0
  286. unstructured_ingest/v2/cli/cmds/weaviate.py +100 -0
  287. unstructured_ingest/v2/cli/configs/__init__.py +6 -0
  288. unstructured_ingest/v2/cli/configs/chunk.py +89 -0
  289. unstructured_ingest/v2/cli/configs/embed.py +74 -0
  290. unstructured_ingest/v2/cli/configs/partition.py +99 -0
  291. unstructured_ingest/v2/cli/configs/processor.py +88 -0
  292. unstructured_ingest/v2/cli/interfaces.py +27 -0
  293. unstructured_ingest/v2/cli/utils.py +240 -0
  294. unstructured_ingest/v2/example.py +37 -0
  295. unstructured_ingest/v2/interfaces/__init__.py +29 -0
  296. unstructured_ingest/v2/interfaces/connector.py +32 -0
  297. unstructured_ingest/v2/interfaces/downloader.py +79 -0
  298. unstructured_ingest/v2/interfaces/file_data.py +49 -0
  299. unstructured_ingest/v2/interfaces/indexer.py +28 -0
  300. unstructured_ingest/v2/interfaces/process.py +20 -0
  301. unstructured_ingest/v2/interfaces/processor.py +48 -0
  302. unstructured_ingest/v2/interfaces/upload_stager.py +48 -0
  303. unstructured_ingest/v2/interfaces/uploader.py +39 -0
  304. unstructured_ingest/v2/logger.py +126 -0
  305. unstructured_ingest/v2/main.py +11 -0
  306. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  307. unstructured_ingest/v2/pipeline/interfaces.py +167 -0
  308. unstructured_ingest/v2/pipeline/pipeline.py +284 -0
  309. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  310. unstructured_ingest/v2/pipeline/steps/chunk.py +85 -0
  311. unstructured_ingest/v2/pipeline/steps/download.py +124 -0
  312. unstructured_ingest/v2/pipeline/steps/embed.py +84 -0
  313. unstructured_ingest/v2/pipeline/steps/index.py +61 -0
  314. unstructured_ingest/v2/pipeline/steps/partition.py +78 -0
  315. unstructured_ingest/v2/pipeline/steps/stage.py +64 -0
  316. unstructured_ingest/v2/pipeline/steps/uncompress.py +68 -0
  317. unstructured_ingest/v2/pipeline/steps/upload.py +73 -0
  318. unstructured_ingest/v2/pipeline/utils.py +15 -0
  319. unstructured_ingest/v2/processes/__init__.py +0 -0
  320. unstructured_ingest/v2/processes/chunker.py +97 -0
  321. unstructured_ingest/v2/processes/connector_registry.py +63 -0
  322. unstructured_ingest/v2/processes/connectors/__init__.py +77 -0
  323. unstructured_ingest/v2/processes/connectors/astra.py +152 -0
  324. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +211 -0
  325. unstructured_ingest/v2/processes/connectors/chroma.py +204 -0
  326. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +96 -0
  327. unstructured_ingest/v2/processes/connectors/elasticsearch.py +401 -0
  328. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  329. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +144 -0
  330. unstructured_ingest/v2/processes/connectors/fsspec/box.py +131 -0
  331. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +130 -0
  332. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +342 -0
  333. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +141 -0
  334. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +164 -0
  335. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +166 -0
  336. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  337. unstructured_ingest/v2/processes/connectors/google_drive.py +335 -0
  338. unstructured_ingest/v2/processes/connectors/local.py +204 -0
  339. unstructured_ingest/v2/processes/connectors/mongodb.py +138 -0
  340. unstructured_ingest/v2/processes/connectors/onedrive.py +216 -0
  341. unstructured_ingest/v2/processes/connectors/opensearch.py +155 -0
  342. unstructured_ingest/v2/processes/connectors/pinecone.py +178 -0
  343. unstructured_ingest/v2/processes/connectors/salesforce.py +293 -0
  344. unstructured_ingest/v2/processes/connectors/sharepoint.py +412 -0
  345. unstructured_ingest/v2/processes/connectors/singlestore.py +160 -0
  346. unstructured_ingest/v2/processes/connectors/sql.py +269 -0
  347. unstructured_ingest/v2/processes/connectors/utils.py +19 -0
  348. unstructured_ingest/v2/processes/connectors/weaviate.py +235 -0
  349. unstructured_ingest/v2/processes/embedder.py +76 -0
  350. unstructured_ingest/v2/processes/partitioner.py +166 -0
  351. unstructured_ingest/v2/processes/uncompress.py +43 -0
  352. unstructured_ingest-0.0.0.dist-info/METADATA +319 -0
  353. unstructured_ingest-0.0.0.dist-info/RECORD +356 -0
  354. unstructured_ingest-0.0.0.dist-info/WHEEL +5 -0
  355. unstructured_ingest-0.0.0.dist-info/entry_points.txt +2 -0
  356. unstructured_ingest-0.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,237 @@
1
+ import copy
2
+ import typing as t
3
+ from dataclasses import dataclass, field
4
+ from pathlib import Path
5
+
6
+ from unstructured_ingest import __name__ as integration_name
7
+ from unstructured_ingest.__version__ import __version__ as integration_version
8
+ from unstructured_ingest.enhanced_dataclass import enhanced_field
9
+ from unstructured_ingest.enhanced_dataclass.core import _asdict
10
+ from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
11
+ from unstructured_ingest.interfaces import (
12
+ AccessConfig,
13
+ BaseConnectorConfig,
14
+ BaseDestinationConnector,
15
+ BaseSingleIngestDoc,
16
+ BaseSourceConnector,
17
+ IngestDocCleanupMixin,
18
+ SourceConnectorCleanupMixin,
19
+ SourceMetadata,
20
+ WriteConfig,
21
+ )
22
+ from unstructured_ingest.logger import logger
23
+ from unstructured_ingest.utils.data_prep import batch_generator, flatten_dict
24
+ from unstructured_ingest.utils.dep_check import requires_dependencies
25
+
26
+ if t.TYPE_CHECKING:
27
+ from astrapy.db import AstraDB, AstraDBCollection
28
+
29
# Field paths presumably meant to be left out of the Astra indexing policy.
# NOTE(review): nothing in the visible part of this module reads this
# constant -- confirm where it is consumed.
NON_INDEXED_FIELDS = [
    "metadata._node_content",
    "content",
]
30
+
31
+
32
@dataclass
class AstraAccessConfig(AccessConfig):
    """Credentials for an Astra DB instance.

    Both fields are declared with ``enhanced_field(sensitive=True)``.
    """

    # Handed to the AstraDB client as its ``token`` argument.
    token: str = enhanced_field(sensitive=True)
    # Handed to the AstraDB client as its ``api_endpoint`` argument.
    api_endpoint: str = enhanced_field(sensitive=True)
36
+
37
+
38
@dataclass
class SimpleAstraConfig(BaseConnectorConfig):
    """Connector-level settings used by the Astra source and destination connectors."""

    # Token and API endpoint for the database.
    access_config: AstraAccessConfig
    # Collection that is read from (source) or created and written to (destination).
    collection_name: str
    # Forwarded to the AstraDB client as ``namespace``; ``None`` when not set.
    namespace: t.Optional[str] = None
43
+
44
+
45
@dataclass
class AstraIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
    """A single Astra collection record staged as a local text file.

    ``metadata`` holds the raw record. Its ``_id`` entry names both the
    downloaded ``.txt`` file and the processed ``.json`` output file.
    """

    connector_config: SimpleAstraConfig
    metadata: t.Dict[str, str] = field(default_factory=dict)
    registry_name: str = "astra"

    @property
    def filename(self):
        """Absolute path ``<download_dir>/<collection_name>/<_id>.txt``.

        Raises:
            KeyError: if ``metadata`` has no ``_id`` entry.
        """
        return (
            Path(self.read_config.download_dir)
            / self.connector_config.collection_name
            / f"{self.metadata['_id']}.txt"
        ).resolve()

    @property
    def _output_filename(self):
        """Absolute path ``<output_dir>/<collection_name>/<_id>.json``.

        Raises:
            KeyError: if ``metadata`` has no ``_id`` entry.
        """
        return (
            Path(self.processor_config.output_dir)
            / self.connector_config.collection_name
            / f"{self.metadata['_id']}.json"
        ).resolve()

    def update_source_metadata(self, **kwargs):
        """Set ``source_metadata.exists`` according to whether ``metadata`` is non-empty."""
        self.source_metadata = SourceMetadata(
            exists=bool(self.metadata),
        )

    @SourceConnectionError.wrap
    @requires_dependencies(["astrapy"], extras="astra")
    @BaseSingleIngestDoc.skip_if_file_exists
    def get_file(self):
        """Write the record's flattened values, one per line, to ``self.filename``."""
        self.filename.parent.mkdir(parents=True, exist_ok=True)

        flattened_dict = flatten_dict(dictionary=self.metadata)
        concatenated_values = "\n".join(str(value) for value in flattened_dict.values())

        # Pin the encoding: the platform default (e.g. cp1252 on Windows) can
        # raise or silently mangle non-ASCII record content.
        with open(self.filename, "w", encoding="utf-8") as f:
            f.write(concatenated_values)
89
+
90
+
91
@dataclass
class AstraSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
    """Source connector that turns each record of an Astra collection into an ingest doc."""

    connector_config: SimpleAstraConfig
    _astra_db: t.Optional["AstraDB"] = field(init=False, default=None)
    _astra_db_collection: t.Optional["AstraDBCollection"] = field(init=False, default=None)

    @property
    @requires_dependencies(["astrapy"], extras="astra")
    def astra_db_collection(self) -> "AstraDBCollection":
        """Return the collection handle, building the client on first access."""
        if self._astra_db_collection is None:
            from astrapy.db import AstraDB

            # Build the Astra DB object.
            # caller_name/version for AstraDB tracking
            self._astra_db = AstraDB(
                api_endpoint=self.connector_config.access_config.api_endpoint,
                token=self.connector_config.access_config.token,
                namespace=self.connector_config.namespace,
                caller_name=integration_name,
                caller_version=integration_version,
            )

            # Connect to the configured collection
            self._astra_db_collection = self._astra_db.collection(
                collection_name=self.connector_config.collection_name,
            )
        return self._astra_db_collection  # type: ignore

    @requires_dependencies(["astrapy"], extras="astra")
    @SourceConnectionError.wrap  # type: ignore
    def initialize(self):
        """Force creation of the client and collection handle."""
        _ = self.astra_db_collection

    @requires_dependencies(["astrapy"], extras="astra")
    def check_connection(self):
        """Validate the connection by building the collection handle.

        Raises:
            SourceConnectionError: if building the handle fails for any reason.
        """
        try:
            _ = self.astra_db_collection
        except Exception as e:
            logger.error(f"Failed to validate connection {e}", exc_info=True)
            # Chain the cause so the original traceback is preserved.
            raise SourceConnectionError(f"failed to validate connection: {e}") from e

    @requires_dependencies(["astrapy"], extras="astra")
    def get_ingest_docs(self):  # type: ignore
        """Return one ``AstraIngestDoc`` per record found in the collection."""
        doc_list = []
        # Iterate the paginated results directly instead of first copying
        # them into a throwaway list.
        for record in self.astra_db_collection.paginated_find():
            doc = AstraIngestDoc(
                connector_config=self.connector_config,
                processor_config=self.processor_config,
                read_config=self.read_config,
                metadata=record,
            )
            doc.update_source_metadata()
            doc_list.append(doc)

        return doc_list
151
+
152
+
153
@dataclass
class AstraWriteConfig(WriteConfig):
    """Write-side options for the Astra destination connector."""

    # Passed as ``dimension`` when the destination collection is created.
    embedding_dimension: int
    # When set, sent as the ``indexing`` entry of the collection-creation options.
    requested_indexing_policy: t.Optional[t.Dict[str, t.Any]] = None
    # Number of documents sent per ``insert_many`` call.
    batch_size: int = 20
158
+
159
+
160
@dataclass
class AstraDestinationConnector(BaseDestinationConnector):
    """Destination connector that inserts element dicts into an Astra DB collection."""

    write_config: AstraWriteConfig
    connector_config: SimpleAstraConfig
    _astra_db: t.Optional["AstraDB"] = field(init=False, default=None)
    _astra_db_collection: t.Optional["AstraDBCollection"] = field(init=False, default=None)

    def to_dict(self, **kwargs):
        """Serialize the connector without its live collection handle.

        The _astra_db_collection variable in this dataclass breaks deepcopy due to:
        TypeError: cannot pickle '_thread.lock' object
        When serializing, remove it, meaning client data will need to be reinitialized
        when deserialized.
        """
        # Shallow copy so the handle is cleared on the copy only, not on self.
        self_cp = copy.copy(self)

        if hasattr(self_cp, "_astra_db_collection"):
            setattr(self_cp, "_astra_db_collection", None)

        # NOTE(review): `_astra_db` is left in place on the copy; confirm it
        # serializes cleanly.
        return _asdict(self_cp, **kwargs)

    @property
    @requires_dependencies(["astrapy"], extras="astra")
    def astra_db_collection(self) -> "AstraDBCollection":
        """Return the collection handle, creating client and collection on first access."""
        if self._astra_db_collection is None:
            from astrapy.db import AstraDB

            collection_name = self.connector_config.collection_name
            embedding_dimension = self.write_config.embedding_dimension

            # If the user has requested an indexing policy, pass it to the AstraDB
            requested_indexing_policy = self.write_config.requested_indexing_policy
            options = {"indexing": requested_indexing_policy} if requested_indexing_policy else None

            # caller_name/version for AstraDB tracking
            self._astra_db = AstraDB(
                api_endpoint=self.connector_config.access_config.api_endpoint,
                token=self.connector_config.access_config.token,
                namespace=self.connector_config.namespace,
                caller_name=integration_name,
                caller_version=integration_version,
            )

            # Create and connect to the newly created collection
            self._astra_db_collection = self._astra_db.create_collection(
                collection_name=collection_name,
                dimension=embedding_dimension,
                options=options,
            )
        return self._astra_db_collection

    @requires_dependencies(["astrapy"], extras="astra")
    @DestinationConnectionError.wrap
    def initialize(self):
        """Force creation of the client and the destination collection."""
        _ = self.astra_db_collection

    @requires_dependencies(["astrapy"], extras="astra")
    def check_connection(self):
        """Validate the connection by building the collection handle.

        Raises:
            DestinationConnectionError: if building the handle fails for any reason.
        """
        try:
            _ = self.astra_db_collection
        except Exception as e:
            logger.error(f"Failed to validate connection {e}", exc_info=True)
            # Chain the cause so the original traceback is preserved.
            raise DestinationConnectionError(f"failed to validate connection: {e}") from e

    def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
        """Insert ``elements_dict`` into the collection in batches of ``batch_size``."""
        logger.info(f"Inserting / updating {len(elements_dict)} documents to Astra.")

        astra_batch_size = self.write_config.batch_size

        for batch in batch_generator(elements_dict, astra_batch_size):
            # Use the lazy property, not the private attribute: the attribute
            # is None until the property has run (e.g. after to_dict() has
            # cleared it or when initialize() was never called).
            self.astra_db_collection.insert_many(batch)

    def normalize_dict(self, element_dict: dict) -> dict:
        """Reshape an element dict into ``$vector`` / ``content`` / ``metadata``.

        ``embeddings`` and ``text`` are popped from ``element_dict`` (the input
        is mutated); the remaining dict becomes the ``metadata`` value. Missing
        keys yield ``None``.
        """
        return {
            "$vector": element_dict.pop("embeddings", None),
            "content": element_dict.pop("text", None),
            "metadata": element_dict,
        }
@@ -0,0 +1,144 @@
1
+ import json
2
+ import typing as t
3
+ import uuid
4
+ from dataclasses import dataclass, field
5
+
6
+ from unstructured_ingest.enhanced_dataclass import enhanced_field
7
+ from unstructured_ingest.error import DestinationConnectionError, WriteError
8
+ from unstructured_ingest.interfaces import (
9
+ AccessConfig,
10
+ BaseConnectorConfig,
11
+ BaseDestinationConnector,
12
+ WriteConfig,
13
+ )
14
+ from unstructured_ingest.logger import logger
15
+ from unstructured_ingest.utils.dep_check import requires_dependencies
16
+
17
+ if t.TYPE_CHECKING:
18
+ from azure.search.documents import SearchClient
19
+
20
+
21
@dataclass
class AzureCognitiveSearchAccessConfig(AccessConfig):
    """Credentials for the Azure Cognitive Search destination."""

    # API key used to build the client credential; declared as sensitive.
    key: str = enhanced_field(sensitive=True)
24
+
25
+
26
@dataclass
class SimpleAzureCognitiveSearchStorageConfig(BaseConnectorConfig):
    """Connection settings for an Azure Cognitive Search service."""

    # Service endpoint handed to the search client.
    endpoint: str
    # Carries the API key used to authenticate against the service.
    access_config: AzureCognitiveSearchAccessConfig
30
+
31
+
32
@dataclass
class AzureCognitiveSearchWriteConfig(WriteConfig):
    """Write settings for the Azure Cognitive Search destination."""

    # Name of the index that documents are uploaded to.
    index: str
35
+
36
+
37
@dataclass
class AzureCognitiveSearchDestinationConnector(BaseDestinationConnector):
    """Destination connector that uploads element dictionaries to an Azure
    Cognitive Search index."""

    write_config: AzureCognitiveSearchWriteConfig
    connector_config: SimpleAzureCognitiveSearchStorageConfig
    # Created on first access of ``client``; not an ``__init__`` argument.
    _client: t.Optional["SearchClient"] = field(init=False, default=None)

    @requires_dependencies(["azure.search"], extras="azure-cognitive-search")
    def generate_client(self) -> "SearchClient":
        """Build a new ``SearchClient`` from the endpoint, index name and API key."""
        from azure.core.credentials import AzureKeyCredential
        from azure.search.documents import SearchClient

        # Create a client
        credential = AzureKeyCredential(self.connector_config.access_config.key)
        return SearchClient(
            endpoint=self.connector_config.endpoint,
            index_name=self.write_config.index,
            credential=credential,
        )

    @property
    def client(self) -> "SearchClient":
        """Return the cached ``SearchClient``, creating it on first use."""
        if self._client is None:
            self._client = self.generate_client()
        return self._client

    def check_connection(self):
        """Probe the index by requesting its document count.

        Raises:
            DestinationConnectionError: if the probe fails for any reason; the
                original exception is chained as ``__cause__``.
        """
        try:
            self.client.get_document_count()
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}") from e

    def initialize(self):
        """Create the client eagerly so configuration problems surface early."""
        _ = self.client

    def conform_dict(self, data: dict) -> None:
        """
        updates the dictionary that is from each Element being converted into a dict/json
        into a dictionary that conforms to the schema expected by the
        Azure Cognitive Search index

        The dictionary is modified in place: a fresh UUID is stored under
        ``id``, nested structures are serialized to JSON strings, version and
        page number are stringified and timestamps are reformatted. Fields that
        are missing, empty or ``None`` are left untouched.
        """
        from dateutil import parser  # type: ignore

        timestamp_format = "%Y-%m-%dT%H:%M:%S.%fZ"

        data["id"] = str(uuid.uuid4())

        # ``or {}`` (instead of a .get default) also tolerates keys that are
        # present but explicitly None, which used to raise AttributeError.
        # When the nested dicts exist these names alias them, so the
        # assignments below update ``data`` itself.
        metadata = data.get("metadata") or {}
        coordinates = metadata.get("coordinates") or {}
        data_source = metadata.get("data_source") or {}

        if points := coordinates.get("points"):
            coordinates["points"] = json.dumps(points)
        if version := data_source.get("version"):
            data_source["version"] = str(version)
        for key in ("record_locator", "permissions_data"):
            if value := data_source.get(key):
                data_source[key] = json.dumps(value)
        if links := metadata.get("links"):
            metadata["links"] = [json.dumps(link) for link in links]
        if last_modified := metadata.get("last_modified"):
            metadata["last_modified"] = parser.parse(last_modified).strftime(timestamp_format)
        for key in ("date_created", "date_modified", "date_processed"):
            if value := data_source.get(key):
                data_source[key] = parser.parse(value).strftime(timestamp_format)
        if regex_metadata := metadata.get("regex_metadata"):
            metadata["regex_metadata"] = json.dumps(regex_metadata)
        if page_number := metadata.get("page_number"):
            metadata["page_number"] = str(page_number)

    @requires_dependencies(["azure"], extras="azure-cognitive-search")
    def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
        """Upload the documents to the configured index.

        Args:
            elements_dict: documents to upload in a single request.
            *args, **kwargs: accepted for interface compatibility and ignored.

        Raises:
            WriteError: if the service answers with an HTTP error, or if any
                individual document is reported as not succeeded.
        """
        import azure.core.exceptions

        logger.info(
            f"writing {len(elements_dict)} documents to destination "
            f"index at {self.write_config.index}",
        )
        try:
            results = self.client.upload_documents(documents=elements_dict)

        except azure.core.exceptions.HttpResponseError as http_error:
            raise WriteError(f"http error: {http_error}") from http_error

        # A successful HTTP call can still carry per-document failures.
        errors = []
        success_count = 0
        for result in results:
            if result.succeeded:
                success_count += 1
            else:
                errors.append(result)
        logger.debug(f"results: {success_count} successes, {len(errors)} failures")
        if errors:
            raise WriteError(
                ", ".join(
                    f"{error.key}: [{error.status_code}] {error.error_message}"
                    for error in errors
                ),
            )
@@ -0,0 +1,313 @@
1
+ import os
2
+ import typing as t
3
+ import urllib.request
4
+ from dataclasses import dataclass
5
+ from ftplib import FTP, error_perm
6
+ from pathlib import Path
7
+
8
+ import requests
9
+ from requests.adapters import HTTPAdapter
10
+
11
+ from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
12
+ from unstructured_ingest.interfaces import (
13
+ BaseConnectorConfig,
14
+ BaseSingleIngestDoc,
15
+ BaseSourceConnector,
16
+ IngestDocCleanupMixin,
17
+ SourceConnectorCleanupMixin,
18
+ )
19
+ from unstructured_ingest.logger import logger
20
+ from unstructured_ingest.utils.data_prep import (
21
+ validate_date_args,
22
+ )
23
+
24
# Host name of the NCBI FTP server.
DOMAIN = "ftp.ncbi.nlm.nih.gov"
# The same host written as an ftp:// URL prefix.
FTP_DOMAIN = "ftp://" + DOMAIN
# Directory on the server that all requested paths are resolved against.
PMC_DIR = "pub/pmc"
# Required leading component of every user-supplied path.
PDF_DIR = "oa_pdf"
28
+
29
+
30
@dataclass
class BiomedFileMeta:
    """Locations associated with a single Biomed document."""

    # Remote URL the document is retrieved from.
    ftp_path: str
    # Local path the document is downloaded to.
    download_filepath: str
    # Local path stem of the processed output; ".json" is appended by the ingest doc.
    output_filepath: str
35
+
36
+
37
@dataclass
class SimpleBiomedConfig(BaseConnectorConfig):
    """Connector config where path is the FTP directory path and
    api_id, api_from, api_until are OA Web Service API parameters.

    After construction ``is_api``, ``is_file`` and ``is_dir`` describe which
    kind of source was requested.
    """

    path: t.Optional[str] = None
    # OA Web Service API Options
    api_id: t.Optional[str] = None
    api_from: t.Optional[str] = None
    api_until: t.Optional[str] = None
    max_request_time: int = 45

    def validate_api_inputs(self):
        """Return True when at least one usable OA Web Service argument is set.

        ``api_id`` on its own is sufficient. Every date argument that is
        provided must pass ``validate_date_args``; one failing check makes the
        whole result False instead of being overwritten by a later check.
        """
        valid = bool(self.api_id)
        for date_arg in (self.api_from, self.api_until):
            if not date_arg:
                continue
            # NOTE(review): validate_date_args may raise rather than return
            # False for a malformed date - confirm; both cases reject here.
            if not validate_date_args(date_arg):
                return False
            valid = True
        return valid

    def __post_init__(self):
        """Classify the configuration as API, single-file or directory mode.

        Raises:
            ValueError: if neither a path nor a valid API argument is given,
                if the path does not start with ``PDF_DIR``, or if the path
                cannot be validated on the FTP server.
        """
        self.is_file = False
        self.is_dir = False
        self.is_api = False

        if not self.path:
            if not self.validate_api_inputs():
                raise ValueError(
                    "Path argument or at least one of the "
                    "OA Web Service arguments MUST be provided.",
                )
            self.is_api = True
            return

        self.path = self.path.strip("/")
        if not self.path.lower().startswith(PDF_DIR):
            raise ValueError(f"Path MUST start with {PDF_DIR}")

        path = Path(PMC_DIR) / self.path
        is_pdf = path.suffix == ".pdf"

        # The context manager closes the FTP connection on every exit path.
        with FTP(DOMAIN) as ftp:
            ftp.login()
            try:
                # For a .pdf path only its parent directory can be entered.
                response = ftp.cwd(str(path.parent if is_pdf else path))
            except error_perm as exc:
                reason = str(exc).lower()
                if "no such file or directory" in reason:
                    raise ValueError(f"The path: {path} is not valid.") from exc
                if "not a directory" in reason:
                    self.is_file = True
                else:
                    raise ValueError(
                        f"Something went wrong when validating the path: {path}.",
                    ) from exc
            else:
                # Success has to be classified here: inside the except branch
                # ``response`` was never assigned, so is_dir could not be set.
                if is_pdf:
                    self.is_file = True
                elif "command successful" in response.lower():
                    self.is_dir = True
103
+
104
+
105
@dataclass
class BiomedIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
    """A single Biomed document: where it is fetched from and where it is stored."""

    connector_config: SimpleBiomedConfig
    file_meta: BiomedFileMeta
    registry_name: str = "biomed"

    @property
    def filename(self):
        """Resolved local path of the downloaded file."""
        download_target = Path(self.file_meta.download_filepath)  # type: ignore
        return download_target.resolve()

    @property
    def _output_filename(self):
        """Resolved local path of the output file (output path plus ``.json``)."""
        output_target = Path(f"{self.file_meta.output_filepath}.json")
        return output_target.resolve()

    def cleanup_file(self):
        """Delete the downloaded file unless the read config asks to keep it."""
        keep_download = self.read_config.preserve_downloads or self.read_config.download_only
        if keep_download or not self.filename.is_file():
            return
        logger.debug(f"Cleaning up {self}")
        self.filename.unlink()

    @SourceConnectionError.wrap
    @BaseSingleIngestDoc.skip_if_file_exists
    def get_file(self):
        """Download the document, creating the target directory when needed."""
        target = self.file_meta.download_filepath  # type: ignore
        parent_dir = Path(os.path.dirname(target))  # type: ignore
        if not parent_dir.is_dir():
            logger.debug(f"Creating directory: {parent_dir}")
            parent_dir.mkdir(parents=True, exist_ok=True)
        self._retrieve()
        logger.debug(f"File downloaded: {self.file_meta.download_filepath}")

    @SourceConnectionNetworkError.wrap
    def _retrieve(self):
        """Fetch the remote file to the local download path."""
        source_url = self.file_meta.ftp_path  # type: ignore
        urllib.request.urlretrieve(source_url, self.file_meta.download_filepath)
147
+
148
+
149
class BiomedSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
    """Objects of this class support fetching documents from Biomedical literature FTP directory"""

    connector_config: SimpleBiomedConfig

    def get_base_endpoints_url(self) -> str:
        """Build the OA Web Service URL from the configured API arguments."""
        endpoint_url = "https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?format=pdf"

        if self.connector_config.api_id:
            endpoint_url += f"&id={self.connector_config.api_id}"

        if self.connector_config.api_from:
            endpoint_url += f"&from={self.connector_config.api_from}"

        if self.connector_config.api_until:
            endpoint_url += f"&until={self.connector_config.api_until}"

        return endpoint_url

    def _build_file_meta(self, ftp_path: str, local_path: str) -> BiomedFileMeta:
        """Create the metadata for one remote file, rooted at the download/output dirs."""
        download_path = Path(self.read_config.download_dir) / local_path
        output_path = Path(self.processor_config.output_dir) / local_path
        return BiomedFileMeta(
            ftp_path=ftp_path,
            download_filepath=download_path.resolve().as_posix(),
            output_filepath=output_path.resolve().as_posix(),
        )

    def _list_objects_api(self) -> t.List[BiomedFileMeta]:
        """List files through the OA Web Service, following resumption links.

        Every ``link`` element's ``href`` of a response is treated as a file
        URL, except a trailing one containing ``resumptiontoken``, which is
        used as the URL of the next page.
        """
        from bs4 import BeautifulSoup

        files: t.List[BiomedFileMeta] = []
        endpoint_url: t.Optional[str] = self.get_base_endpoints_url()

        # One session serves all pages and is closed when listing finishes.
        with requests.Session() as session:
            adapter = HTTPAdapter()
            session.mount("http://", adapter)
            session.mount("https://", adapter)

            while endpoint_url:
                response = self._get_request(session=session, endpoint_url=endpoint_url)
                soup = BeautifulSoup(response.content, features="lxml")
                urls = [link["href"] for link in soup.find_all("link")]

                if not urls:
                    break

                if "resumptiontoken" in urls[-1].lower():
                    endpoint_url = urls.pop()
                else:
                    endpoint_url = None

                for url in urls:
                    parts = url.split(PDF_DIR)
                    # URLs without the PDF directory component are skipped.
                    if len(parts) > 1:
                        files.append(self._build_file_meta(url, parts[1].strip("/")))

        return files

    @SourceConnectionNetworkError.wrap
    def _get_request(self, session: requests.Session, endpoint_url: str) -> requests.Response:
        """GET ``endpoint_url`` using the configured request timeout."""
        return session.get(endpoint_url, timeout=self.connector_config.max_request_time)

    def _traverse(self, path: Path, files: t.List[BiomedFileMeta]) -> None:
        """Recursively collect the files below ``path`` into ``files``.

        A directory is treated as a leaf holding files when its first entry
        has a suffix; otherwise every entry is traversed as a sub-directory.

        Raises:
            ValueError: if the server refuses to enter the directory.
        """
        full_path = Path(PMC_DIR) / path
        logger.debug(f"Traversing directory: {full_path}")

        # The listing is fetched and the connection closed before recursing,
        # so connections do not pile up with the depth of the tree.
        with FTP(DOMAIN) as ftp:
            ftp.login()
            try:
                response = ftp.cwd(full_path.as_posix())
            except error_perm as exc:
                raise ValueError(f"{full_path} is not a valid directory.") from exc

            if "command successful" not in response.lower():
                raise ValueError(f"{full_path} is not a valid directory.")

            sub_paths = [path / p for p in ftp.nlst()]

        if not sub_paths:
            return

        if Path(sub_paths[0]).suffix:
            for sub_path in sub_paths:
                posix_sub_path = sub_path.as_posix()
                # Drop the leading path component for the local layout.
                local_path = "/".join(posix_sub_path.split("/")[1:])
                files.append(
                    self._build_file_meta(
                        f"{FTP_DOMAIN}/{PMC_DIR}/{posix_sub_path}",
                        local_path,
                    ),
                )
        else:
            for sub_path in sub_paths:
                self._traverse(sub_path, files)

    def _list_objects(self) -> t.List[BiomedFileMeta]:
        """List files for the configured FTP path (a single file or a directory tree)."""
        # Conform to mypy, null check performed elsewhere.
        # Wouldn't be in this method unless self.config.path exists
        path: str = self.connector_config.path if self.connector_config.path else ""

        if self.connector_config.is_file:
            ftp_path = f"{FTP_DOMAIN}/{PMC_DIR}/{self.connector_config.path}"
            local_path = "/".join(path.split("/")[1:])
            return [self._build_file_meta(ftp_path, local_path)]

        files: t.List[BiomedFileMeta] = []
        self._traverse(Path(path), files)
        return files

    def initialize(self):
        """No set-up is required for this connector."""
        pass

    def check_connection(self):
        """Issue a HEAD request against the OA Web Service endpoint.

        Raises:
            SourceConnectionError: if the request fails (network error or
                timeout) or the response carries an HTTP error status.
        """
        endpoint_url = self.get_base_endpoints_url()
        try:
            # The timeout keeps an unresponsive host from blocking forever.
            resp = requests.head(endpoint_url, timeout=self.connector_config.max_request_time)
            resp.raise_for_status()
        except requests.RequestException as request_error:
            raise SourceConnectionError(
                f"failed to validate connection: {request_error}",
            ) from request_error

    def get_ingest_docs(self):
        """Return one ``BiomedIngestDoc`` per listed file (API or FTP mode)."""
        files = self._list_objects_api() if self.connector_config.is_api else self._list_objects()
        return [
            BiomedIngestDoc(
                processor_config=self.processor_config,
                connector_config=self.connector_config,
                read_config=self.read_config,
                file_meta=file,
            )
            for file in files
        ]