unstructured-ingest 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (356) hide show
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/__init__.py +14 -0
  4. unstructured_ingest/cli/base/__init__.py +0 -0
  5. unstructured_ingest/cli/base/cmd.py +19 -0
  6. unstructured_ingest/cli/base/dest.py +87 -0
  7. unstructured_ingest/cli/base/src.py +57 -0
  8. unstructured_ingest/cli/cli.py +32 -0
  9. unstructured_ingest/cli/cmd_factory.py +12 -0
  10. unstructured_ingest/cli/cmds/__init__.py +145 -0
  11. unstructured_ingest/cli/cmds/airtable.py +69 -0
  12. unstructured_ingest/cli/cmds/astra.py +99 -0
  13. unstructured_ingest/cli/cmds/azure_cognitive_search.py +65 -0
  14. unstructured_ingest/cli/cmds/biomed.py +52 -0
  15. unstructured_ingest/cli/cmds/chroma.py +104 -0
  16. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  17. unstructured_ingest/cli/cmds/confluence.py +69 -0
  18. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  19. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  20. unstructured_ingest/cli/cmds/discord.py +47 -0
  21. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  22. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  23. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  24. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  25. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  26. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  27. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  28. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  29. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  30. unstructured_ingest/cli/cmds/github.py +54 -0
  31. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  32. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  33. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  34. unstructured_ingest/cli/cmds/jira.py +71 -0
  35. unstructured_ingest/cli/cmds/kafka.py +102 -0
  36. unstructured_ingest/cli/cmds/local.py +43 -0
  37. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  38. unstructured_ingest/cli/cmds/notion.py +48 -0
  39. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  40. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  41. unstructured_ingest/cli/cmds/outlook.py +67 -0
  42. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  43. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  44. unstructured_ingest/cli/cmds/reddit.py +67 -0
  45. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  46. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  47. unstructured_ingest/cli/cmds/slack.py +56 -0
  48. unstructured_ingest/cli/cmds/sql.py +66 -0
  49. unstructured_ingest/cli/cmds/vectara.py +66 -0
  50. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  51. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  52. unstructured_ingest/cli/common.py +7 -0
  53. unstructured_ingest/cli/interfaces.py +656 -0
  54. unstructured_ingest/cli/utils.py +205 -0
  55. unstructured_ingest/connector/__init__.py +0 -0
  56. unstructured_ingest/connector/airtable.py +309 -0
  57. unstructured_ingest/connector/astra.py +237 -0
  58. unstructured_ingest/connector/azure_cognitive_search.py +144 -0
  59. unstructured_ingest/connector/biomed.py +313 -0
  60. unstructured_ingest/connector/chroma.py +158 -0
  61. unstructured_ingest/connector/clarifai.py +122 -0
  62. unstructured_ingest/connector/confluence.py +285 -0
  63. unstructured_ingest/connector/databricks_volumes.py +137 -0
  64. unstructured_ingest/connector/delta_table.py +203 -0
  65. unstructured_ingest/connector/discord.py +180 -0
  66. unstructured_ingest/connector/elasticsearch.py +396 -0
  67. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  68. unstructured_ingest/connector/fsspec/azure.py +78 -0
  69. unstructured_ingest/connector/fsspec/box.py +109 -0
  70. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  71. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  72. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  73. unstructured_ingest/connector/fsspec/s3.py +62 -0
  74. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  75. unstructured_ingest/connector/git.py +124 -0
  76. unstructured_ingest/connector/github.py +173 -0
  77. unstructured_ingest/connector/gitlab.py +142 -0
  78. unstructured_ingest/connector/google_drive.py +349 -0
  79. unstructured_ingest/connector/hubspot.py +278 -0
  80. unstructured_ingest/connector/jira.py +469 -0
  81. unstructured_ingest/connector/kafka.py +294 -0
  82. unstructured_ingest/connector/local.py +139 -0
  83. unstructured_ingest/connector/mongodb.py +285 -0
  84. unstructured_ingest/connector/notion/__init__.py +0 -0
  85. unstructured_ingest/connector/notion/client.py +233 -0
  86. unstructured_ingest/connector/notion/connector.py +468 -0
  87. unstructured_ingest/connector/notion/helpers.py +584 -0
  88. unstructured_ingest/connector/notion/interfaces.py +32 -0
  89. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  90. unstructured_ingest/connector/notion/types/block.py +95 -0
  91. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  92. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  93. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  94. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  95. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  96. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  97. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  98. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  99. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  100. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  101. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  102. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  103. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  104. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  105. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  106. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  107. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  108. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  109. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  110. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  111. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  112. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  113. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  114. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  115. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  116. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  117. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  118. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  119. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  120. unstructured_ingest/connector/notion/types/database.py +72 -0
  121. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  122. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  123. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  124. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  125. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  126. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  127. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  128. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  129. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  130. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  131. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  132. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  133. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  134. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  135. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  136. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  137. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  138. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  139. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  140. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  141. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  142. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  143. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  144. unstructured_ingest/connector/notion/types/date.py +26 -0
  145. unstructured_ingest/connector/notion/types/file.py +51 -0
  146. unstructured_ingest/connector/notion/types/page.py +44 -0
  147. unstructured_ingest/connector/notion/types/parent.py +66 -0
  148. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  149. unstructured_ingest/connector/notion/types/user.py +76 -0
  150. unstructured_ingest/connector/onedrive.py +232 -0
  151. unstructured_ingest/connector/opensearch.py +218 -0
  152. unstructured_ingest/connector/outlook.py +285 -0
  153. unstructured_ingest/connector/pinecone.py +140 -0
  154. unstructured_ingest/connector/qdrant.py +144 -0
  155. unstructured_ingest/connector/reddit.py +166 -0
  156. unstructured_ingest/connector/registry.py +109 -0
  157. unstructured_ingest/connector/salesforce.py +301 -0
  158. unstructured_ingest/connector/sharepoint.py +573 -0
  159. unstructured_ingest/connector/slack.py +224 -0
  160. unstructured_ingest/connector/sql.py +199 -0
  161. unstructured_ingest/connector/vectara.py +248 -0
  162. unstructured_ingest/connector/weaviate.py +190 -0
  163. unstructured_ingest/connector/wikipedia.py +208 -0
  164. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  165. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  166. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  167. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  168. unstructured_ingest/error.py +49 -0
  169. unstructured_ingest/evaluate.py +338 -0
  170. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  171. unstructured_ingest/ingest_backoff/_common.py +102 -0
  172. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  173. unstructured_ingest/interfaces.py +838 -0
  174. unstructured_ingest/logger.py +130 -0
  175. unstructured_ingest/main.py +11 -0
  176. unstructured_ingest/pipeline/__init__.py +22 -0
  177. unstructured_ingest/pipeline/copy.py +19 -0
  178. unstructured_ingest/pipeline/doc_factory.py +12 -0
  179. unstructured_ingest/pipeline/interfaces.py +265 -0
  180. unstructured_ingest/pipeline/partition.py +60 -0
  181. unstructured_ingest/pipeline/permissions.py +12 -0
  182. unstructured_ingest/pipeline/pipeline.py +117 -0
  183. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  184. unstructured_ingest/pipeline/reformat/chunking.py +130 -0
  185. unstructured_ingest/pipeline/reformat/embedding.py +66 -0
  186. unstructured_ingest/pipeline/source.py +77 -0
  187. unstructured_ingest/pipeline/utils.py +6 -0
  188. unstructured_ingest/pipeline/write.py +18 -0
  189. unstructured_ingest/processor.py +93 -0
  190. unstructured_ingest/runner/__init__.py +104 -0
  191. unstructured_ingest/runner/airtable.py +35 -0
  192. unstructured_ingest/runner/astra.py +34 -0
  193. unstructured_ingest/runner/base_runner.py +89 -0
  194. unstructured_ingest/runner/biomed.py +45 -0
  195. unstructured_ingest/runner/confluence.py +35 -0
  196. unstructured_ingest/runner/delta_table.py +34 -0
  197. unstructured_ingest/runner/discord.py +35 -0
  198. unstructured_ingest/runner/elasticsearch.py +40 -0
  199. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  200. unstructured_ingest/runner/fsspec/azure.py +30 -0
  201. unstructured_ingest/runner/fsspec/box.py +28 -0
  202. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  203. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  204. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  205. unstructured_ingest/runner/fsspec/s3.py +28 -0
  206. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  207. unstructured_ingest/runner/github.py +37 -0
  208. unstructured_ingest/runner/gitlab.py +37 -0
  209. unstructured_ingest/runner/google_drive.py +35 -0
  210. unstructured_ingest/runner/hubspot.py +35 -0
  211. unstructured_ingest/runner/jira.py +35 -0
  212. unstructured_ingest/runner/kafka.py +34 -0
  213. unstructured_ingest/runner/local.py +23 -0
  214. unstructured_ingest/runner/mongodb.py +34 -0
  215. unstructured_ingest/runner/notion.py +61 -0
  216. unstructured_ingest/runner/onedrive.py +35 -0
  217. unstructured_ingest/runner/opensearch.py +40 -0
  218. unstructured_ingest/runner/outlook.py +33 -0
  219. unstructured_ingest/runner/reddit.py +35 -0
  220. unstructured_ingest/runner/salesforce.py +33 -0
  221. unstructured_ingest/runner/sharepoint.py +35 -0
  222. unstructured_ingest/runner/slack.py +33 -0
  223. unstructured_ingest/runner/utils.py +47 -0
  224. unstructured_ingest/runner/wikipedia.py +35 -0
  225. unstructured_ingest/runner/writers/__init__.py +48 -0
  226. unstructured_ingest/runner/writers/astra.py +22 -0
  227. unstructured_ingest/runner/writers/azure_cognitive_search.py +24 -0
  228. unstructured_ingest/runner/writers/base_writer.py +26 -0
  229. unstructured_ingest/runner/writers/chroma.py +22 -0
  230. unstructured_ingest/runner/writers/clarifai.py +19 -0
  231. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  232. unstructured_ingest/runner/writers/delta_table.py +24 -0
  233. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  234. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  235. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  236. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  237. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  238. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  239. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  240. unstructured_ingest/runner/writers/kafka.py +21 -0
  241. unstructured_ingest/runner/writers/mongodb.py +21 -0
  242. unstructured_ingest/runner/writers/opensearch.py +26 -0
  243. unstructured_ingest/runner/writers/pinecone.py +21 -0
  244. unstructured_ingest/runner/writers/qdrant.py +19 -0
  245. unstructured_ingest/runner/writers/sql.py +22 -0
  246. unstructured_ingest/runner/writers/vectara.py +22 -0
  247. unstructured_ingest/runner/writers/weaviate.py +21 -0
  248. unstructured_ingest/utils/__init__.py +0 -0
  249. unstructured_ingest/utils/compression.py +117 -0
  250. unstructured_ingest/utils/data_prep.py +112 -0
  251. unstructured_ingest/utils/dep_check.py +66 -0
  252. unstructured_ingest/utils/string_and_date_utils.py +39 -0
  253. unstructured_ingest/utils/table.py +73 -0
  254. unstructured_ingest/v2/__init__.py +1 -0
  255. unstructured_ingest/v2/cli/__init__.py +0 -0
  256. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  257. unstructured_ingest/v2/cli/base/cmd.py +215 -0
  258. unstructured_ingest/v2/cli/base/dest.py +76 -0
  259. unstructured_ingest/v2/cli/base/importer.py +34 -0
  260. unstructured_ingest/v2/cli/base/src.py +70 -0
  261. unstructured_ingest/v2/cli/cli.py +24 -0
  262. unstructured_ingest/v2/cli/cmds/__init__.py +87 -0
  263. unstructured_ingest/v2/cli/cmds/astra.py +85 -0
  264. unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +72 -0
  265. unstructured_ingest/v2/cli/cmds/chroma.py +108 -0
  266. unstructured_ingest/v2/cli/cmds/databricks_volumes.py +161 -0
  267. unstructured_ingest/v2/cli/cmds/elasticsearch.py +159 -0
  268. unstructured_ingest/v2/cli/cmds/fsspec/__init__.py +0 -0
  269. unstructured_ingest/v2/cli/cmds/fsspec/azure.py +84 -0
  270. unstructured_ingest/v2/cli/cmds/fsspec/box.py +58 -0
  271. unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +58 -0
  272. unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +77 -0
  273. unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +81 -0
  274. unstructured_ingest/v2/cli/cmds/fsspec/s3.py +84 -0
  275. unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +80 -0
  276. unstructured_ingest/v2/cli/cmds/google_drive.py +74 -0
  277. unstructured_ingest/v2/cli/cmds/local.py +60 -0
  278. unstructured_ingest/v2/cli/cmds/mongodb.py +62 -0
  279. unstructured_ingest/v2/cli/cmds/onedrive.py +91 -0
  280. unstructured_ingest/v2/cli/cmds/opensearch.py +93 -0
  281. unstructured_ingest/v2/cli/cmds/pinecone.py +62 -0
  282. unstructured_ingest/v2/cli/cmds/salesforce.py +79 -0
  283. unstructured_ingest/v2/cli/cmds/sharepoint.py +112 -0
  284. unstructured_ingest/v2/cli/cmds/singlestore.py +96 -0
  285. unstructured_ingest/v2/cli/cmds/sql.py +84 -0
  286. unstructured_ingest/v2/cli/cmds/weaviate.py +100 -0
  287. unstructured_ingest/v2/cli/configs/__init__.py +6 -0
  288. unstructured_ingest/v2/cli/configs/chunk.py +89 -0
  289. unstructured_ingest/v2/cli/configs/embed.py +74 -0
  290. unstructured_ingest/v2/cli/configs/partition.py +99 -0
  291. unstructured_ingest/v2/cli/configs/processor.py +88 -0
  292. unstructured_ingest/v2/cli/interfaces.py +27 -0
  293. unstructured_ingest/v2/cli/utils.py +240 -0
  294. unstructured_ingest/v2/example.py +37 -0
  295. unstructured_ingest/v2/interfaces/__init__.py +29 -0
  296. unstructured_ingest/v2/interfaces/connector.py +32 -0
  297. unstructured_ingest/v2/interfaces/downloader.py +79 -0
  298. unstructured_ingest/v2/interfaces/file_data.py +49 -0
  299. unstructured_ingest/v2/interfaces/indexer.py +28 -0
  300. unstructured_ingest/v2/interfaces/process.py +20 -0
  301. unstructured_ingest/v2/interfaces/processor.py +48 -0
  302. unstructured_ingest/v2/interfaces/upload_stager.py +48 -0
  303. unstructured_ingest/v2/interfaces/uploader.py +39 -0
  304. unstructured_ingest/v2/logger.py +126 -0
  305. unstructured_ingest/v2/main.py +11 -0
  306. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  307. unstructured_ingest/v2/pipeline/interfaces.py +167 -0
  308. unstructured_ingest/v2/pipeline/pipeline.py +284 -0
  309. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  310. unstructured_ingest/v2/pipeline/steps/chunk.py +85 -0
  311. unstructured_ingest/v2/pipeline/steps/download.py +124 -0
  312. unstructured_ingest/v2/pipeline/steps/embed.py +84 -0
  313. unstructured_ingest/v2/pipeline/steps/index.py +61 -0
  314. unstructured_ingest/v2/pipeline/steps/partition.py +78 -0
  315. unstructured_ingest/v2/pipeline/steps/stage.py +64 -0
  316. unstructured_ingest/v2/pipeline/steps/uncompress.py +68 -0
  317. unstructured_ingest/v2/pipeline/steps/upload.py +73 -0
  318. unstructured_ingest/v2/pipeline/utils.py +15 -0
  319. unstructured_ingest/v2/processes/__init__.py +0 -0
  320. unstructured_ingest/v2/processes/chunker.py +97 -0
  321. unstructured_ingest/v2/processes/connector_registry.py +63 -0
  322. unstructured_ingest/v2/processes/connectors/__init__.py +77 -0
  323. unstructured_ingest/v2/processes/connectors/astra.py +152 -0
  324. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +211 -0
  325. unstructured_ingest/v2/processes/connectors/chroma.py +204 -0
  326. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +96 -0
  327. unstructured_ingest/v2/processes/connectors/elasticsearch.py +401 -0
  328. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  329. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +144 -0
  330. unstructured_ingest/v2/processes/connectors/fsspec/box.py +131 -0
  331. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +130 -0
  332. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +342 -0
  333. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +141 -0
  334. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +164 -0
  335. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +166 -0
  336. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  337. unstructured_ingest/v2/processes/connectors/google_drive.py +335 -0
  338. unstructured_ingest/v2/processes/connectors/local.py +204 -0
  339. unstructured_ingest/v2/processes/connectors/mongodb.py +138 -0
  340. unstructured_ingest/v2/processes/connectors/onedrive.py +216 -0
  341. unstructured_ingest/v2/processes/connectors/opensearch.py +155 -0
  342. unstructured_ingest/v2/processes/connectors/pinecone.py +178 -0
  343. unstructured_ingest/v2/processes/connectors/salesforce.py +293 -0
  344. unstructured_ingest/v2/processes/connectors/sharepoint.py +412 -0
  345. unstructured_ingest/v2/processes/connectors/singlestore.py +160 -0
  346. unstructured_ingest/v2/processes/connectors/sql.py +269 -0
  347. unstructured_ingest/v2/processes/connectors/utils.py +19 -0
  348. unstructured_ingest/v2/processes/connectors/weaviate.py +235 -0
  349. unstructured_ingest/v2/processes/embedder.py +76 -0
  350. unstructured_ingest/v2/processes/partitioner.py +166 -0
  351. unstructured_ingest/v2/processes/uncompress.py +43 -0
  352. unstructured_ingest-0.0.0.dist-info/METADATA +319 -0
  353. unstructured_ingest-0.0.0.dist-info/RECORD +356 -0
  354. unstructured_ingest-0.0.0.dist-info/WHEEL +5 -0
  355. unstructured_ingest-0.0.0.dist-info/entry_points.txt +2 -0
  356. unstructured_ingest-0.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,32 @@
1
+ from abc import ABC
2
+ from dataclasses import dataclass
3
+ from typing import Any, TypeVar
4
+
5
+ from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
6
+
7
+
8
@dataclass
class AccessConfig(EnhancedDataClassJsonMixin):
    """Holder for the sensitive values that accompany other configs.

    Also used for configuration that is specific to access.
    """


# Type variable bounded by AccessConfig, used to annotate access-config fields.
AccessConfigT = TypeVar("AccessConfigT", bound=AccessConfig)
15
+
16
+
17
@dataclass
class ConnectionConfig(EnhancedDataClassJsonMixin):
    """Connection settings for a connector, including its access config."""

    access_config: AccessConfigT

    def get_access_config(self) -> dict[str, Any]:
        """Return the access config as a dict, or ``{}`` when it is falsy.

        The dict is produced with ``apply_name_overload=False``.
        """
        access = self.access_config
        return access.to_dict(apply_name_overload=False) if access else {}


# Type variable bounded by ConnectionConfig, used to annotate connection-config fields.
ConnectionConfigT = TypeVar("ConnectionConfigT", bound=ConnectionConfig)
28
+
29
+
30
@dataclass
class BaseConnector(ABC):
    """Abstract dataclass base that carries a connector's connection config."""

    # Connection settings; annotated with the ConnectionConfig-bounded type variable.
    connection_config: ConnectionConfigT
@@ -0,0 +1,79 @@
1
+ import os
2
+ from abc import ABC, abstractmethod
3
+ from dataclasses import dataclass
4
+ from pathlib import Path
5
+ from typing import Any, Optional, TypedDict, TypeVar, Union
6
+
7
+ from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
8
+ from unstructured_ingest.v2.interfaces.connector import BaseConnector
9
+ from unstructured_ingest.v2.interfaces.file_data import FileData
10
+ from unstructured_ingest.v2.interfaces.process import BaseProcess
11
+
12
+
13
@dataclass
class DownloaderConfig(EnhancedDataClassJsonMixin):
    """Settings for a downloader."""

    # Target directory for downloads; None means no directory has been chosen yet.
    download_dir: Optional[Path] = None


# Type variable bounded by DownloaderConfig, used to annotate downloader-config fields.
DownloaderConfigT = TypeVar("DownloaderConfigT", bound=DownloaderConfig)
19
+
20
+
21
class DownloadResponse(TypedDict):
    """Result of one download: the file's metadata record and its local path."""

    file_data: FileData
    path: Path


# A downloader run may yield a single response or a list of them.
download_responses = Union[list[DownloadResponse], DownloadResponse]
27
+
28
+
29
class Downloader(BaseProcess, BaseConnector, ABC):
    """Abstract process that downloads the content described by a ``FileData``."""

    connector_type: str
    download_config: DownloaderConfigT

    @staticmethod
    def is_float(value: str):
        """Return True if ``float(value)`` succeeds, False if it raises ``ValueError``."""
        try:
            float(value)
        except ValueError:
            return False
        else:
            return True

    def generate_download_response(
        self, file_data: FileData, download_path: Path
    ) -> DownloadResponse:
        """Build the ``DownloadResponse`` for a file saved at ``download_path``.

        When both the created and modified dates in the metadata are set and
        parse as floats, they are applied to the file on disk first.
        """
        metadata = file_data.metadata
        has_numeric_dates = (
            metadata.date_modified
            and self.is_float(metadata.date_modified)
            and metadata.date_created
            and self.is_float(metadata.date_created)
        )
        if has_numeric_dates:
            # os.utime takes (atime, mtime): the created date becomes the access
            # time and the modified date becomes the modification time.
            timestamps = (float(metadata.date_created), float(metadata.date_modified))
            os.utime(download_path, times=timestamps)
        return DownloadResponse(file_data=file_data, path=download_path)

    @property
    def download_dir(self) -> Path:
        """Return the configured download directory.

        If none is configured, a per-connector default under the user's home
        cache directory is computed, stored back on the config, and returned.
        """
        config = self.download_config
        if config.download_dir is None:
            default_dir = Path.home().joinpath(
                ".cache", "unstructured", "ingest", "download", self.connector_type
            )
            config.download_dir = default_dir.resolve()
        return config.download_dir

    def is_async(self) -> bool:
        """Report this process as async."""
        return True

    def get_download_path(self, file_data: FileData) -> Optional[Path]:
        """Return the download path for ``file_data``; this base version returns None."""
        return None

    @abstractmethod
    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
        """Download ``file_data`` and return one response or a list of responses."""

    async def run_async(self, file_data: FileData, **kwargs: Any) -> download_responses:
        """Async entry point; by default it calls the synchronous ``run``."""
        responses = self.run(file_data=file_data, **kwargs)
        return responses
@@ -0,0 +1,49 @@
1
+ import json
2
+ from dataclasses import dataclass, field
3
+ from pathlib import Path
4
+ from typing import Any, Literal, Optional
5
+
6
+ from dataclasses_json import DataClassJsonMixin
7
+ from unstructured.documents.elements import DataSourceMetadata
8
+
9
+
10
@dataclass
class SourceIdentifiers:
    """Names and paths that identify a file at its source."""

    filename: str
    fullpath: str
    rel_path: Optional[str] = None

    @property
    def filename_stem(self) -> str:
        """Return ``filename`` without its final suffix."""
        name = Path(self.filename)
        return name.stem

    @property
    def relative_path(self) -> str:
        """Return ``rel_path`` when it is truthy, otherwise ``fullpath``."""
        if self.rel_path:
            return self.rel_path
        return self.fullpath
23
+
24
+
25
@dataclass
class FileData(DataClassJsonMixin):
    """Serializable record describing one document (or batch) in the pipeline."""

    identifier: str
    connector_type: str
    source_identifiers: Optional[SourceIdentifiers] = None
    doc_type: Literal["file", "batch"] = field(default="file")
    metadata: DataSourceMetadata = field(default_factory=DataSourceMetadata)
    additional_metadata: dict[str, Any] = field(default_factory=dict)
    reprocess: bool = False

    @classmethod
    def from_file(cls, path: str) -> "FileData":
        """Load an instance from the JSON file at ``path``.

        Raises:
            ValueError: if ``path`` does not point to an existing regular file.
        """
        resolved = Path(path).resolve()
        # is_file() is already False for a missing path, so one check covers both cases.
        if not resolved.is_file():
            raise ValueError(f"file path not valid: {resolved}")
        with open(str(resolved), "rb") as f:
            file_data_dict = json.load(f)
        # Build through ``cls`` so a subclass calling from_file gets its own type back.
        return cls.from_dict(file_data_dict)

    def to_file(self, path: str) -> None:
        """Write this instance as indented JSON to ``path``, creating parent directories."""
        resolved = Path(path).resolve()
        resolved.parent.mkdir(parents=True, exist_ok=True)
        with open(str(resolved), "w") as f:
            json.dump(self.to_dict(), f, indent=2)
@@ -0,0 +1,28 @@
1
+ from abc import ABC, abstractmethod
2
+ from dataclasses import dataclass
3
+ from typing import Any, Generator, Optional, TypeVar
4
+
5
+ from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
6
+ from unstructured_ingest.v2.interfaces.connector import BaseConnector
7
+ from unstructured_ingest.v2.interfaces.file_data import FileData
8
+ from unstructured_ingest.v2.interfaces.process import BaseProcess
9
+
10
+
11
@dataclass
class IndexerConfig(EnhancedDataClassJsonMixin):
    """Base settings for an indexer; declares no fields of its own."""


# Type variable bounded by IndexerConfig, used to annotate indexer-config fields.
IndexerConfigT = TypeVar("IndexerConfigT", bound=IndexerConfig)
17
+
18
+
19
class Indexer(BaseProcess, BaseConnector, ABC):
    """Abstract process whose ``run`` generates ``FileData`` records."""

    connector_type: str
    index_config: Optional[IndexerConfigT] = None

    def is_async(self) -> bool:
        """Report this process as synchronous."""
        return False

    @abstractmethod
    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
        """Yield one ``FileData`` per indexed item."""
@@ -0,0 +1,20 @@
1
+ from abc import ABC, abstractmethod
2
+ from dataclasses import dataclass
3
+ from typing import Any
4
+
5
+
6
@dataclass
class BaseProcess(ABC):
    """Abstract base for a pipeline process with sync and async entry points."""

    def is_async(self) -> bool:
        """Report whether the process is async; the base answer is False."""
        return False

    @abstractmethod
    def run(self, **kwargs: Any) -> Any:
        """Execute the process; subclasses must implement this."""

    async def run_async(self, **kwargs: Any) -> Any:
        """Async entry point; by default it calls the synchronous ``run``."""
        outcome = self.run(**kwargs)
        return outcome

    def check_connection(self):
        """Hook for a quick check of any external connections the process needs.

        The base implementation does nothing.
        """
@@ -0,0 +1,48 @@
1
+ import os
2
+ from asyncio import Semaphore
3
+ from dataclasses import dataclass, field
4
+ from pathlib import Path
5
+ from typing import Optional
6
+
7
+ from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
8
+
9
# Default pipeline working directory under the user's home cache directory.
DEFAULT_WORK_DIR = str(
    Path.home().joinpath(".cache", "unstructured", "ingest", "pipeline").resolve()
)


@dataclass
class ProcessorConfig(EnhancedDataClassJsonMixin):
    """Settings that control how the pipeline runs."""

    reprocess: bool = False
    verbose: bool = False
    tqdm: bool = False
    # Read at instantiation time, so the module constant is looked up lazily.
    work_dir: str = field(default_factory=lambda: DEFAULT_WORK_DIR)
    num_processes: int = 2
    max_connections: Optional[int] = None
    raise_on_error: bool = False
    # Defaults to True only when INGEST_DISABLE_PARALLELISM is "true" (case-insensitive).
    disable_parallelism: bool = field(
        default_factory=lambda: os.getenv("INGEST_DISABLE_PARALLELISM", "false").lower() == "true"
    )
    preserve_downloads: bool = False
    download_only: bool = False
    max_docs: Optional[int] = None
    re_download: bool = False
    uncompress: bool = False

    # Used to keep track of state in pipeline
    status: dict = field(default_factory=dict)
    semaphore: Optional[Semaphore] = field(init=False, default=None)

    def __post_init__(self):
        """Create the semaphore when a connection limit is configured."""
        limit = self.max_connections
        if limit is not None:
            self.semaphore = Semaphore(limit)

    @property
    def mp_supported(self) -> bool:
        """True when parallelism is enabled and more than one process is requested."""
        if self.disable_parallelism:
            return False
        return self.num_processes > 1

    @property
    def async_supported(self) -> bool:
        """True unless parallelism is disabled or the integer connection limit is 1 or less."""
        if self.disable_parallelism:
            return False
        limit = self.max_connections
        if limit is None or not isinstance(limit, int):
            return True
        return limit > 1
@@ -0,0 +1,48 @@
1
+ from abc import ABC, abstractmethod
2
+ from dataclasses import dataclass
3
+ from pathlib import Path
4
+ from typing import Any, TypeVar
5
+
6
+ from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
7
+ from unstructured_ingest.v2.interfaces.file_data import FileData
8
+ from unstructured_ingest.v2.interfaces.process import BaseProcess
9
+
10
+
11
@dataclass
class UploadStagerConfig(EnhancedDataClassJsonMixin):
    """Base settings for an upload stager; declares no fields of its own."""


# Type variable bounded by UploadStagerConfig, used to annotate stager-config fields.
UploadStagerConfigT = TypeVar("UploadStagerConfigT", bound=UploadStagerConfig)
17
+
18
+
19
@dataclass
class UploadStager(BaseProcess, ABC):
    """Abstract process that takes an elements file and returns a ``Path``."""

    upload_stager_config: UploadStagerConfigT

    @abstractmethod
    def run(
        self,
        elements_filepath: Path,
        file_data: FileData,
        output_dir: Path,
        output_filename: str,
        **kwargs: Any
    ) -> Path:
        """Stage the elements file and return a path; subclasses must implement this."""

    async def run_async(
        self,
        elements_filepath: Path,
        file_data: FileData,
        output_dir: Path,
        output_filename: str,
        **kwargs: Any
    ) -> Path:
        """Async entry point; by default it forwards every argument to ``run``."""
        staged_path = self.run(
            elements_filepath=elements_filepath,
            output_dir=output_dir,
            output_filename=output_filename,
            file_data=file_data,
            **kwargs
        )
        return staged_path
@@ -0,0 +1,39 @@
1
+ from abc import ABC, abstractmethod
2
+ from dataclasses import dataclass
3
+ from pathlib import Path
4
+ from typing import Any, TypeVar
5
+
6
+ from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
7
+ from unstructured_ingest.v2.interfaces.connector import BaseConnector
8
+ from unstructured_ingest.v2.interfaces.file_data import FileData
9
+ from unstructured_ingest.v2.interfaces.process import BaseProcess
10
+
11
+
12
@dataclass
class UploaderConfig(EnhancedDataClassJsonMixin):
    """Base configuration for uploaders; declares no fields of its own."""
15
+
16
+
17
+ UploaderConfigT = TypeVar("UploaderConfigT", bound=UploaderConfig)
18
+
19
+
20
@dataclass
class UploadContent:
    """A single unit of work handed to ``Uploader.run``: a path plus its file data."""

    # Location of the content to upload.
    path: Path
    # FileData record associated with that content.
    file_data: FileData
24
+
25
+
26
@dataclass
class Uploader(BaseProcess, BaseConnector, ABC):
    """Abstract process step that uploads batches of ``UploadContent``.

    Subclasses implement ``run``. ``run_async`` wraps a single path/file_data
    pair into a one-element batch and forwards it to ``run``.
    """

    # Configuration object for this uploader (bounded by UploaderConfig).
    upload_config: UploaderConfigT
    # Identifier of the connector this uploader belongs to.
    connector_type: str

    def is_async(self) -> bool:
        """Report that this uploader runs synchronously unless a subclass overrides it."""
        return False

    @abstractmethod
    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
        """Upload every item in ``contents``. Must be implemented by concrete uploaders."""

    async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        """Coroutine variant: upload one item by delegating to ``run``."""
        single_batch = [UploadContent(path=path, file_data=file_data)]
        return self.run(contents=single_batch, **kwargs)
@@ -0,0 +1,126 @@
1
+ import ast
2
+ import json
3
+ import os
4
+ from logging import Formatter, Logger, StreamHandler, getLevelName, getLogger
5
+ from typing import Any, Callable
6
+
7
+ log_level = os.getenv("INGEST_LOG_LEVEL", "INFO")
8
+ LOGGER_NAME = "unstructured.ingest.v2"
9
+
10
+
11
def default_is_data_sensitive(k: str, v: Any) -> bool:
    """Decide whether the value stored under key ``k`` should be redacted.

    A pair is sensitive when either:
    * the value is truthy and the lower-cased key contains one of the trigger
      substrings, or
    * the lower-cased key is exactly one of the always-sensitive field names
      (regardless of the value).
    """
    always_sensitive = ("account_name", "client_id")
    trigger_substrings = ("key", "cred", "token", "password", "oauth", "secret")
    lowered_key = k.lower()
    if v and any(trigger in lowered_key for trigger in trigger_substrings):
        return True
    return lowered_key in always_sensitive
22
+
23
+
24
def hide_sensitive_fields(
    data: dict, is_sensitive_fn: Callable[[str, Any], bool] = default_is_data_sensitive
) -> dict:
    """
    Return a copy of ``data`` with sensitive values replaced by ``"*******"``.

    Every k, v pair is passed to ``is_sensitive_fn``. A pair flagged as sensitive is
    redacted as a whole and is NOT inspected any further, so a sensitive key holding a
    dict or a JSON string can no longer have its redaction overwritten by the nested
    processing. For non-sensitive pairs:

    * dict values are processed recursively with the same ``is_sensitive_fn``;
    * string values that parse as a JSON object are processed recursively and replaced
      with the ``json.dumps()`` of the redacted dict;
    * everything else is kept as is.

    The input dict itself is not modified.
    """
    new_data = data.copy()
    for k, v in data.items():
        if is_sensitive_fn(k, v):
            # Redact and stop: descending further would put the original content back.
            new_data[k] = "*******"
            continue
        if isinstance(v, dict):
            new_data[k] = hide_sensitive_fields(v, is_sensitive_fn=is_sensitive_fn)
            continue
        if isinstance(v, str):
            # Need to take into account strings generated via json.dumps()
            try:
                json_data = json.loads(v)
            except json.JSONDecodeError:
                continue
            if isinstance(json_data, dict):
                updated_data = hide_sensitive_fields(json_data, is_sensitive_fn=is_sensitive_fn)
                new_data[k] = json.dumps(updated_data)

    return new_data
50
+
51
+
52
def redact_jsons(s: str) -> str:
    """
    Redact sensitive values in every dict-like ``{...}`` span found in ``s``.

    The string is scanned for brace-balanced spans. Each span is parsed as JSON or,
    failing that, as a Python literal (e.g. the ``str()`` of a dict); parsed dicts are
    passed through hide_sensitive_fields() and the span is replaced with the
    ``json.dumps()`` of the redacted dict.

    Spans that cannot be parsed into a JSON-serializable dict (``{name}``, ``{1, 2}``,
    ...) are left untouched instead of raising. An opening ``{`` with no matching ``}``
    is skipped and scanning resumes right after it, so later balanced spans are still
    processed, e.g. the ``{"a": 3}`` in ``'{ text, {"a": 3}'``.

    Brace matching is purely character based: braces inside quoted strings are counted
    like any other brace.
    """
    if "{" not in s:
        return s

    length = len(s)
    candidates = []
    i = 0
    while i < length:
        if s[i] != "{":
            i += 1
            continue
        # Look for the brace that closes the one at position i.
        depth = 0
        end = None
        for pos in range(i, length):
            if s[pos] == "{":
                depth += 1
            elif s[pos] == "}":
                depth -= 1
                if depth == 0:
                    end = pos
                    break
        if end is None:
            # Unbalanced opening brace: ignore it, but keep scanning what follows.
            i += 1
            continue
        candidates.append(s[i : end + 1])
        i = end + 1

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            try:
                # Round-trip through json so keys/values are normalized to JSON types.
                parsed = json.loads(json.dumps(ast.literal_eval(candidate)))
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                # Not a dict-like literal (format placeholder, set, ...): nothing to redact.
                continue
        if not isinstance(parsed, dict):
            continue
        s = s.replace(candidate, json.dumps(hide_sensitive_fields(parsed)))
    return s
92
+
93
+
94
class SensitiveFormatter(Formatter):
    """Formatter that passes every rendered record through ``redact_jsons``."""

    def format(self, record):
        """Render ``record`` and return its redacted form.

        If redaction raises, the rendered text is returned prefixed with
        ``"Failed to redact: "`` rather than propagating the error.
        """
        rendered = super().format(record=record)
        try:
            redacted = redact_jsons(rendered)
        except Exception:
            return f"Failed to redact: {rendered}"
        return redacted
101
+
102
+
103
def remove_root_handlers(logger: Logger) -> None:
    """Detach every handler currently attached to the root logger.

    NOTE(robinson) - in some environments such as Google Colab, there is a root handler
    that does not mask secrets, meaning sensitive info such as api keys appear in logs.
    Removing these when they exist prevents this behavior.
    """
    # Iterate over a snapshot: removeHandler() mutates ``root.handlers`` in place, and
    # looping over the live list would skip every other handler.
    for handler in list(logger.root.handlers):
        logger.root.removeHandler(handler)
110
+
111
+
112
def make_default_logger(level: int) -> Logger:
    """Configure and return the ingest logger.

    A stream handler named ``ingest_log_handler`` with a ``SensitiveFormatter`` is
    attached unless a handler with that name is already present. The logger level is
    then set to ``level`` and the root logger's handlers are removed.
    """
    ingest_logger = getLogger(LOGGER_NAME)

    stream_handler = StreamHandler()
    stream_handler.name = "ingest_log_handler"
    stream_handler.setFormatter(
        SensitiveFormatter("%(asctime)s %(processName)-10s %(levelname)-8s %(message)s")
    )

    already_attached = any(
        existing.name == stream_handler.name for existing in ingest_logger.handlers
    )
    if not already_attached:
        ingest_logger.addHandler(stream_handler)

    ingest_logger.setLevel(level)
    remove_root_handlers(ingest_logger)
    return ingest_logger
124
+
125
+
126
+ logger = make_default_logger(level=getLevelName(log_level.upper()))
@@ -0,0 +1,11 @@
1
+ #!/usr/bin/env python3
2
+ from unstructured_ingest.v2.cli.cli import get_cmd
3
+
4
+
5
def main():
    """Build the ingest command via ``get_cmd`` and invoke it."""
    get_cmd()()
8
+
9
+
10
+ if __name__ == "__main__":
11
+ main()
File without changes
@@ -0,0 +1,167 @@
1
+ import asyncio
2
+ import logging
3
+ import multiprocessing as mp
4
+ from abc import ABC
5
+ from concurrent.futures import ThreadPoolExecutor
6
+ from dataclasses import dataclass
7
+ from functools import wraps
8
+ from pathlib import Path
9
+ from time import time
10
+ from typing import Any, Awaitable, Callable, Optional, TypeVar
11
+
12
+ from tqdm import tqdm
13
+ from tqdm.asyncio import tqdm as tqdm_asyncio
14
+
15
+ from unstructured_ingest.v2.interfaces import BaseProcess, ProcessorConfig
16
+ from unstructured_ingest.v2.logger import logger, make_default_logger
17
+
18
+ BaseProcessT = TypeVar("BaseProcessT", bound=BaseProcess)
19
+ iterable_input = list[dict[str, Any]]
20
+
21
+
22
def timed(func):
    """Decorate a method so that its wall-clock duration is logged at INFO level.

    The log line is emitted whether the wrapped method returns or raises. When the
    wrapped method is ``__call__`` the owning class name (suffixed with ``[cls]``) is
    reported instead of the method name.
    """

    @wraps(func)
    def time_it(self, *args, **kwargs):
        started_at = time()
        try:
            result = func(self, *args, **kwargs)
        finally:
            reported_name = (
                f"{self.__class__.__name__} [cls]"
                if func.__name__ == "__call__"
                else func.__name__
            )
            logger.info(f"{reported_name} took {time() - started_at} seconds")
        return result

    return time_it
36
+
37
+
38
@dataclass
class PipelineStep(ABC):
    """Base class for one step of the ingest pipeline.

    A step wraps a ``process`` and runs it over a list of keyword-argument dicts,
    either serially, across a multiprocessing pool, or concurrently with asyncio,
    depending on the flags exposed by ``context`` and on ``process.is_async()``.

    Subclasses are expected to override ``_run_async``; the base implementation
    raises ``NotImplementedError``.
    """

    # The process executed by this step.
    process: BaseProcessT
    # Shared processor configuration; also used to record per-file failures (``status``).
    context: ProcessorConfig
    # Name of the step; used in logs, progress bars and as the cache sub-directory.
    identifier: str

    def __str__(self):
        return self.identifier

    def process_serially(self, iterable: iterable_input) -> Any:
        """Run the step once per entry of ``iterable`` in the current process.

        Returns a list with one result per entry, or a single-element list with the
        result of a no-argument run when ``iterable`` is empty.
        """
        logger.info("processing content serially")
        if iterable:
            if len(iterable) == 1:
                return [self.run(**iterable[0])]
            if self.context.tqdm:
                return [self.run(**it) for it in tqdm(iterable, desc=self.identifier)]
            return [self.run(**it) for it in iterable]
        return [self.run()]

    async def _process_async(self, iterable: iterable_input) -> Any:
        """Await ``run_async`` for every entry of ``iterable`` concurrently."""
        if iterable:
            if len(iterable) == 1:
                return [await self.run_async(**iterable[0])]
            if self.context.tqdm:
                return await tqdm_asyncio.gather(
                    *[self.run_async(**i) for i in iterable], desc=self.identifier
                )
            return await asyncio.gather(*[self.run_async(**i) for i in iterable])
        return [await self.run_async()]

    def process_async(self, iterable: iterable_input) -> Any:
        """Synchronous entry point that drives ``_process_async`` to completion."""
        logger.info("processing content async")
        return self.asyncio_run(fn=self._process_async, iterable=iterable)

    def asyncio_run(
        self, fn: Callable[[Any, Any], Awaitable[Any]], *args: Any, **kwargs: Any
    ) -> Any:
        """Run the coroutine function ``fn(*args, **kwargs)`` and return its result.

        When no event loop is running in the current thread, ``asyncio.run`` is used
        directly. Otherwise the coroutine is executed with ``asyncio.run`` inside a
        dedicated worker thread so it does not conflict with the running loop.
        """
        try:
            current_loop = asyncio.get_running_loop()
        except RuntimeError:
            # Raised by get_running_loop() when this thread has no running loop.
            current_loop = None
        if current_loop is None:
            return asyncio.run(fn(*args, **kwargs))
        with ThreadPoolExecutor(thread_name_prefix="asyncio") as thread_pool:
            logger.warning(
                f"async code being run in dedicated thread pool "
                f"to not conflict with existing event loop: {current_loop}"
            )

            def wrapped():
                return asyncio.run(fn(*args, **kwargs))

            future = thread_pool.submit(wrapped)
            return future.result()

    def process_multiprocess(self, iterable: iterable_input) -> Any:
        """Run the step over ``iterable`` using a multiprocessing pool.

        Falls back to ``process_serially`` when there is a single entry or a single
        configured process. Always returns a flat list of results (with the tqdm
        progress bar enabled, results arrive in completion order).
        """
        logger.info("processing content across processes")

        if iterable:
            # process_serially() already returns a list; wrapping it again would hand
            # callers a nested [[result]] for single-item inputs.
            if len(iterable) == 1 or self.context.num_processes == 1:
                return self.process_serially(iterable)
            with mp.Pool(
                processes=self.context.num_processes,
                initializer=self._init_logger,
                initargs=(logging.DEBUG if self.context.verbose else logging.INFO,),
            ) as pool:
                if self.context.tqdm:
                    return list(
                        tqdm(
                            pool.imap_unordered(func=self._wrap_mp, iterable=iterable),
                            total=len(iterable),
                            desc=self.identifier,
                        )
                    )
                return pool.map(self._wrap_mp, iterable)
        return [self.run()]

    def _wrap_mp(self, input_kwargs: dict) -> Any:
        # Allow mapping of kwargs via multiprocessing map()
        return self.run(**input_kwargs)

    def _init_logger(self, log_level: int):
        # Init logger for each spawned process when using multiprocessing pool
        make_default_logger(level=log_level)

    @timed
    def __call__(self, iterable: Optional[iterable_input] = None) -> Any:
        """Execute the step, choosing async, multiprocess or serial execution.

        Async is used when both the context and the process support it; otherwise
        multiprocessing is used when the context supports it; otherwise the entries
        are processed serially.
        """
        iterable = iterable or []
        if iterable:
            logger.info(
                f"Calling {self.__class__.__name__} " f"with {len(iterable)} docs",  # type: ignore
            )
        if self.context.async_supported and self.process.is_async():
            return self.process_async(iterable=iterable)
        if self.context.mp_supported:
            return self.process_multiprocess(iterable=iterable)
        return self.process_serially(iterable=iterable)

    def _run(self, fn: Callable, **kwargs: Any) -> Optional[Any]:
        """Synchronous execution: drive ``run_async`` with ``fn`` as the callable to use."""
        return self.asyncio_run(fn=self.run_async, _fn=fn, **kwargs)

    async def _run_async(self, fn: Callable, **kwargs: Any) -> Optional[Any]:
        """Step-specific logic; must be provided by subclasses."""
        raise NotImplementedError

    def run(self, _fn: Optional[Callable] = None, **kwargs: Any) -> Optional[Any]:
        """Run the step once, defaulting to ``process.run`` as the callable.

        Exceptions are logged and, when ``file_data_path`` is among ``kwargs``,
        recorded in ``context.status``. They are re-raised only when
        ``context.raise_on_error`` is set; otherwise ``None`` is returned.
        """
        try:
            fn = _fn or self.process.run
            return self._run(fn=fn, **kwargs)
        except Exception as e:
            logger.error(f"Exception raised while running {self.identifier}", exc_info=e)
            if "file_data_path" in kwargs:
                self.context.status[kwargs["file_data_path"]] = {self.identifier: str(e)}
            if self.context.raise_on_error:
                raise
            return None

    async def run_async(self, _fn: Optional[Callable] = None, **kwargs: Any) -> Optional[Any]:
        """Coroutine counterpart of ``run``, defaulting to ``process.run_async``.

        Error handling mirrors ``run``.
        """
        try:
            fn = _fn or self.process.run_async
            return await self._run_async(fn=fn, **kwargs)
        except Exception as e:
            logger.error(f"Exception raised while running {self.identifier}", exc_info=e)
            if "file_data_path" in kwargs:
                self.context.status[kwargs["file_data_path"]] = {self.identifier: str(e)}
            if self.context.raise_on_error:
                raise
            return None

    @property
    def cache_dir(self) -> Path:
        """Directory for this step: ``<context.work_dir>/<identifier>``."""
        return Path(self.context.work_dir) / self.identifier