unstructured-ingest 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (356) hide show
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/__init__.py +14 -0
  4. unstructured_ingest/cli/base/__init__.py +0 -0
  5. unstructured_ingest/cli/base/cmd.py +19 -0
  6. unstructured_ingest/cli/base/dest.py +87 -0
  7. unstructured_ingest/cli/base/src.py +57 -0
  8. unstructured_ingest/cli/cli.py +32 -0
  9. unstructured_ingest/cli/cmd_factory.py +12 -0
  10. unstructured_ingest/cli/cmds/__init__.py +145 -0
  11. unstructured_ingest/cli/cmds/airtable.py +69 -0
  12. unstructured_ingest/cli/cmds/astra.py +99 -0
  13. unstructured_ingest/cli/cmds/azure_cognitive_search.py +65 -0
  14. unstructured_ingest/cli/cmds/biomed.py +52 -0
  15. unstructured_ingest/cli/cmds/chroma.py +104 -0
  16. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  17. unstructured_ingest/cli/cmds/confluence.py +69 -0
  18. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  19. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  20. unstructured_ingest/cli/cmds/discord.py +47 -0
  21. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  22. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  23. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  24. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  25. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  26. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  27. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  28. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  29. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  30. unstructured_ingest/cli/cmds/github.py +54 -0
  31. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  32. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  33. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  34. unstructured_ingest/cli/cmds/jira.py +71 -0
  35. unstructured_ingest/cli/cmds/kafka.py +102 -0
  36. unstructured_ingest/cli/cmds/local.py +43 -0
  37. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  38. unstructured_ingest/cli/cmds/notion.py +48 -0
  39. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  40. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  41. unstructured_ingest/cli/cmds/outlook.py +67 -0
  42. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  43. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  44. unstructured_ingest/cli/cmds/reddit.py +67 -0
  45. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  46. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  47. unstructured_ingest/cli/cmds/slack.py +56 -0
  48. unstructured_ingest/cli/cmds/sql.py +66 -0
  49. unstructured_ingest/cli/cmds/vectara.py +66 -0
  50. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  51. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  52. unstructured_ingest/cli/common.py +7 -0
  53. unstructured_ingest/cli/interfaces.py +656 -0
  54. unstructured_ingest/cli/utils.py +205 -0
  55. unstructured_ingest/connector/__init__.py +0 -0
  56. unstructured_ingest/connector/airtable.py +309 -0
  57. unstructured_ingest/connector/astra.py +237 -0
  58. unstructured_ingest/connector/azure_cognitive_search.py +144 -0
  59. unstructured_ingest/connector/biomed.py +313 -0
  60. unstructured_ingest/connector/chroma.py +158 -0
  61. unstructured_ingest/connector/clarifai.py +122 -0
  62. unstructured_ingest/connector/confluence.py +285 -0
  63. unstructured_ingest/connector/databricks_volumes.py +137 -0
  64. unstructured_ingest/connector/delta_table.py +203 -0
  65. unstructured_ingest/connector/discord.py +180 -0
  66. unstructured_ingest/connector/elasticsearch.py +396 -0
  67. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  68. unstructured_ingest/connector/fsspec/azure.py +78 -0
  69. unstructured_ingest/connector/fsspec/box.py +109 -0
  70. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  71. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  72. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  73. unstructured_ingest/connector/fsspec/s3.py +62 -0
  74. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  75. unstructured_ingest/connector/git.py +124 -0
  76. unstructured_ingest/connector/github.py +173 -0
  77. unstructured_ingest/connector/gitlab.py +142 -0
  78. unstructured_ingest/connector/google_drive.py +349 -0
  79. unstructured_ingest/connector/hubspot.py +278 -0
  80. unstructured_ingest/connector/jira.py +469 -0
  81. unstructured_ingest/connector/kafka.py +294 -0
  82. unstructured_ingest/connector/local.py +139 -0
  83. unstructured_ingest/connector/mongodb.py +285 -0
  84. unstructured_ingest/connector/notion/__init__.py +0 -0
  85. unstructured_ingest/connector/notion/client.py +233 -0
  86. unstructured_ingest/connector/notion/connector.py +468 -0
  87. unstructured_ingest/connector/notion/helpers.py +584 -0
  88. unstructured_ingest/connector/notion/interfaces.py +32 -0
  89. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  90. unstructured_ingest/connector/notion/types/block.py +95 -0
  91. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  92. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  93. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  94. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  95. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  96. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  97. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  98. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  99. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  100. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  101. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  102. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  103. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  104. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  105. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  106. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  107. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  108. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  109. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  110. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  111. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  112. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  113. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  114. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  115. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  116. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  117. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  118. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  119. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  120. unstructured_ingest/connector/notion/types/database.py +72 -0
  121. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  122. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  123. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  124. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  125. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  126. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  127. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  128. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  129. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  130. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  131. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  132. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  133. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  134. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  135. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  136. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  137. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  138. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  139. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  140. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  141. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  142. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  143. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  144. unstructured_ingest/connector/notion/types/date.py +26 -0
  145. unstructured_ingest/connector/notion/types/file.py +51 -0
  146. unstructured_ingest/connector/notion/types/page.py +44 -0
  147. unstructured_ingest/connector/notion/types/parent.py +66 -0
  148. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  149. unstructured_ingest/connector/notion/types/user.py +76 -0
  150. unstructured_ingest/connector/onedrive.py +232 -0
  151. unstructured_ingest/connector/opensearch.py +218 -0
  152. unstructured_ingest/connector/outlook.py +285 -0
  153. unstructured_ingest/connector/pinecone.py +140 -0
  154. unstructured_ingest/connector/qdrant.py +144 -0
  155. unstructured_ingest/connector/reddit.py +166 -0
  156. unstructured_ingest/connector/registry.py +109 -0
  157. unstructured_ingest/connector/salesforce.py +301 -0
  158. unstructured_ingest/connector/sharepoint.py +573 -0
  159. unstructured_ingest/connector/slack.py +224 -0
  160. unstructured_ingest/connector/sql.py +199 -0
  161. unstructured_ingest/connector/vectara.py +248 -0
  162. unstructured_ingest/connector/weaviate.py +190 -0
  163. unstructured_ingest/connector/wikipedia.py +208 -0
  164. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  165. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  166. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  167. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  168. unstructured_ingest/error.py +49 -0
  169. unstructured_ingest/evaluate.py +338 -0
  170. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  171. unstructured_ingest/ingest_backoff/_common.py +102 -0
  172. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  173. unstructured_ingest/interfaces.py +838 -0
  174. unstructured_ingest/logger.py +130 -0
  175. unstructured_ingest/main.py +11 -0
  176. unstructured_ingest/pipeline/__init__.py +22 -0
  177. unstructured_ingest/pipeline/copy.py +19 -0
  178. unstructured_ingest/pipeline/doc_factory.py +12 -0
  179. unstructured_ingest/pipeline/interfaces.py +265 -0
  180. unstructured_ingest/pipeline/partition.py +60 -0
  181. unstructured_ingest/pipeline/permissions.py +12 -0
  182. unstructured_ingest/pipeline/pipeline.py +117 -0
  183. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  184. unstructured_ingest/pipeline/reformat/chunking.py +130 -0
  185. unstructured_ingest/pipeline/reformat/embedding.py +66 -0
  186. unstructured_ingest/pipeline/source.py +77 -0
  187. unstructured_ingest/pipeline/utils.py +6 -0
  188. unstructured_ingest/pipeline/write.py +18 -0
  189. unstructured_ingest/processor.py +93 -0
  190. unstructured_ingest/runner/__init__.py +104 -0
  191. unstructured_ingest/runner/airtable.py +35 -0
  192. unstructured_ingest/runner/astra.py +34 -0
  193. unstructured_ingest/runner/base_runner.py +89 -0
  194. unstructured_ingest/runner/biomed.py +45 -0
  195. unstructured_ingest/runner/confluence.py +35 -0
  196. unstructured_ingest/runner/delta_table.py +34 -0
  197. unstructured_ingest/runner/discord.py +35 -0
  198. unstructured_ingest/runner/elasticsearch.py +40 -0
  199. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  200. unstructured_ingest/runner/fsspec/azure.py +30 -0
  201. unstructured_ingest/runner/fsspec/box.py +28 -0
  202. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  203. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  204. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  205. unstructured_ingest/runner/fsspec/s3.py +28 -0
  206. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  207. unstructured_ingest/runner/github.py +37 -0
  208. unstructured_ingest/runner/gitlab.py +37 -0
  209. unstructured_ingest/runner/google_drive.py +35 -0
  210. unstructured_ingest/runner/hubspot.py +35 -0
  211. unstructured_ingest/runner/jira.py +35 -0
  212. unstructured_ingest/runner/kafka.py +34 -0
  213. unstructured_ingest/runner/local.py +23 -0
  214. unstructured_ingest/runner/mongodb.py +34 -0
  215. unstructured_ingest/runner/notion.py +61 -0
  216. unstructured_ingest/runner/onedrive.py +35 -0
  217. unstructured_ingest/runner/opensearch.py +40 -0
  218. unstructured_ingest/runner/outlook.py +33 -0
  219. unstructured_ingest/runner/reddit.py +35 -0
  220. unstructured_ingest/runner/salesforce.py +33 -0
  221. unstructured_ingest/runner/sharepoint.py +35 -0
  222. unstructured_ingest/runner/slack.py +33 -0
  223. unstructured_ingest/runner/utils.py +47 -0
  224. unstructured_ingest/runner/wikipedia.py +35 -0
  225. unstructured_ingest/runner/writers/__init__.py +48 -0
  226. unstructured_ingest/runner/writers/astra.py +22 -0
  227. unstructured_ingest/runner/writers/azure_cognitive_search.py +24 -0
  228. unstructured_ingest/runner/writers/base_writer.py +26 -0
  229. unstructured_ingest/runner/writers/chroma.py +22 -0
  230. unstructured_ingest/runner/writers/clarifai.py +19 -0
  231. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  232. unstructured_ingest/runner/writers/delta_table.py +24 -0
  233. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  234. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  235. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  236. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  237. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  238. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  239. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  240. unstructured_ingest/runner/writers/kafka.py +21 -0
  241. unstructured_ingest/runner/writers/mongodb.py +21 -0
  242. unstructured_ingest/runner/writers/opensearch.py +26 -0
  243. unstructured_ingest/runner/writers/pinecone.py +21 -0
  244. unstructured_ingest/runner/writers/qdrant.py +19 -0
  245. unstructured_ingest/runner/writers/sql.py +22 -0
  246. unstructured_ingest/runner/writers/vectara.py +22 -0
  247. unstructured_ingest/runner/writers/weaviate.py +21 -0
  248. unstructured_ingest/utils/__init__.py +0 -0
  249. unstructured_ingest/utils/compression.py +117 -0
  250. unstructured_ingest/utils/data_prep.py +112 -0
  251. unstructured_ingest/utils/dep_check.py +66 -0
  252. unstructured_ingest/utils/string_and_date_utils.py +39 -0
  253. unstructured_ingest/utils/table.py +73 -0
  254. unstructured_ingest/v2/__init__.py +1 -0
  255. unstructured_ingest/v2/cli/__init__.py +0 -0
  256. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  257. unstructured_ingest/v2/cli/base/cmd.py +215 -0
  258. unstructured_ingest/v2/cli/base/dest.py +76 -0
  259. unstructured_ingest/v2/cli/base/importer.py +34 -0
  260. unstructured_ingest/v2/cli/base/src.py +70 -0
  261. unstructured_ingest/v2/cli/cli.py +24 -0
  262. unstructured_ingest/v2/cli/cmds/__init__.py +87 -0
  263. unstructured_ingest/v2/cli/cmds/astra.py +85 -0
  264. unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +72 -0
  265. unstructured_ingest/v2/cli/cmds/chroma.py +108 -0
  266. unstructured_ingest/v2/cli/cmds/databricks_volumes.py +161 -0
  267. unstructured_ingest/v2/cli/cmds/elasticsearch.py +159 -0
  268. unstructured_ingest/v2/cli/cmds/fsspec/__init__.py +0 -0
  269. unstructured_ingest/v2/cli/cmds/fsspec/azure.py +84 -0
  270. unstructured_ingest/v2/cli/cmds/fsspec/box.py +58 -0
  271. unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +58 -0
  272. unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +77 -0
  273. unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +81 -0
  274. unstructured_ingest/v2/cli/cmds/fsspec/s3.py +84 -0
  275. unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +80 -0
  276. unstructured_ingest/v2/cli/cmds/google_drive.py +74 -0
  277. unstructured_ingest/v2/cli/cmds/local.py +60 -0
  278. unstructured_ingest/v2/cli/cmds/mongodb.py +62 -0
  279. unstructured_ingest/v2/cli/cmds/onedrive.py +91 -0
  280. unstructured_ingest/v2/cli/cmds/opensearch.py +93 -0
  281. unstructured_ingest/v2/cli/cmds/pinecone.py +62 -0
  282. unstructured_ingest/v2/cli/cmds/salesforce.py +79 -0
  283. unstructured_ingest/v2/cli/cmds/sharepoint.py +112 -0
  284. unstructured_ingest/v2/cli/cmds/singlestore.py +96 -0
  285. unstructured_ingest/v2/cli/cmds/sql.py +84 -0
  286. unstructured_ingest/v2/cli/cmds/weaviate.py +100 -0
  287. unstructured_ingest/v2/cli/configs/__init__.py +6 -0
  288. unstructured_ingest/v2/cli/configs/chunk.py +89 -0
  289. unstructured_ingest/v2/cli/configs/embed.py +74 -0
  290. unstructured_ingest/v2/cli/configs/partition.py +99 -0
  291. unstructured_ingest/v2/cli/configs/processor.py +88 -0
  292. unstructured_ingest/v2/cli/interfaces.py +27 -0
  293. unstructured_ingest/v2/cli/utils.py +240 -0
  294. unstructured_ingest/v2/example.py +37 -0
  295. unstructured_ingest/v2/interfaces/__init__.py +29 -0
  296. unstructured_ingest/v2/interfaces/connector.py +32 -0
  297. unstructured_ingest/v2/interfaces/downloader.py +79 -0
  298. unstructured_ingest/v2/interfaces/file_data.py +49 -0
  299. unstructured_ingest/v2/interfaces/indexer.py +28 -0
  300. unstructured_ingest/v2/interfaces/process.py +20 -0
  301. unstructured_ingest/v2/interfaces/processor.py +48 -0
  302. unstructured_ingest/v2/interfaces/upload_stager.py +48 -0
  303. unstructured_ingest/v2/interfaces/uploader.py +39 -0
  304. unstructured_ingest/v2/logger.py +126 -0
  305. unstructured_ingest/v2/main.py +11 -0
  306. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  307. unstructured_ingest/v2/pipeline/interfaces.py +167 -0
  308. unstructured_ingest/v2/pipeline/pipeline.py +284 -0
  309. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  310. unstructured_ingest/v2/pipeline/steps/chunk.py +85 -0
  311. unstructured_ingest/v2/pipeline/steps/download.py +124 -0
  312. unstructured_ingest/v2/pipeline/steps/embed.py +84 -0
  313. unstructured_ingest/v2/pipeline/steps/index.py +61 -0
  314. unstructured_ingest/v2/pipeline/steps/partition.py +78 -0
  315. unstructured_ingest/v2/pipeline/steps/stage.py +64 -0
  316. unstructured_ingest/v2/pipeline/steps/uncompress.py +68 -0
  317. unstructured_ingest/v2/pipeline/steps/upload.py +73 -0
  318. unstructured_ingest/v2/pipeline/utils.py +15 -0
  319. unstructured_ingest/v2/processes/__init__.py +0 -0
  320. unstructured_ingest/v2/processes/chunker.py +97 -0
  321. unstructured_ingest/v2/processes/connector_registry.py +63 -0
  322. unstructured_ingest/v2/processes/connectors/__init__.py +77 -0
  323. unstructured_ingest/v2/processes/connectors/astra.py +152 -0
  324. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +211 -0
  325. unstructured_ingest/v2/processes/connectors/chroma.py +204 -0
  326. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +96 -0
  327. unstructured_ingest/v2/processes/connectors/elasticsearch.py +401 -0
  328. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  329. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +144 -0
  330. unstructured_ingest/v2/processes/connectors/fsspec/box.py +131 -0
  331. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +130 -0
  332. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +342 -0
  333. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +141 -0
  334. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +164 -0
  335. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +166 -0
  336. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  337. unstructured_ingest/v2/processes/connectors/google_drive.py +335 -0
  338. unstructured_ingest/v2/processes/connectors/local.py +204 -0
  339. unstructured_ingest/v2/processes/connectors/mongodb.py +138 -0
  340. unstructured_ingest/v2/processes/connectors/onedrive.py +216 -0
  341. unstructured_ingest/v2/processes/connectors/opensearch.py +155 -0
  342. unstructured_ingest/v2/processes/connectors/pinecone.py +178 -0
  343. unstructured_ingest/v2/processes/connectors/salesforce.py +293 -0
  344. unstructured_ingest/v2/processes/connectors/sharepoint.py +412 -0
  345. unstructured_ingest/v2/processes/connectors/singlestore.py +160 -0
  346. unstructured_ingest/v2/processes/connectors/sql.py +269 -0
  347. unstructured_ingest/v2/processes/connectors/utils.py +19 -0
  348. unstructured_ingest/v2/processes/connectors/weaviate.py +235 -0
  349. unstructured_ingest/v2/processes/embedder.py +76 -0
  350. unstructured_ingest/v2/processes/partitioner.py +166 -0
  351. unstructured_ingest/v2/processes/uncompress.py +43 -0
  352. unstructured_ingest-0.0.0.dist-info/METADATA +319 -0
  353. unstructured_ingest-0.0.0.dist-info/RECORD +356 -0
  354. unstructured_ingest-0.0.0.dist-info/WHEEL +5 -0
  355. unstructured_ingest-0.0.0.dist-info/entry_points.txt +2 -0
  356. unstructured_ingest-0.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,49 @@
1
+ from abc import ABC
2
+ from functools import wraps
3
+
4
+
5
class CustomError(Exception, ABC):
    """Base class for errors that can wrap arbitrary failures of a callable.

    Subclasses set ``error_string`` to a format template containing a single
    ``{}`` placeholder; it is filled with ``str()`` of the wrapped exception.
    """

    # Message template; formatted with the text of the original exception.
    error_string: str

    @classmethod
    def wrap(cls, f):
        """Return ``f`` decorated so its failures surface as ``cls``.

        Any ``Exception`` raised by ``f`` that is not already an instance of
        ``cls`` is re-raised as ``cls(cls.error_string.format(str(error)))``,
        chained to the original via ``from``. Exceptions that already are
        instances of ``cls`` (including subclasses) are re-raised unchanged.

        Interpreter-level signals that derive only from ``BaseException``
        (``KeyboardInterrupt``, ``SystemExit``, ``GeneratorExit``) are left
        alone so that Ctrl-C and interpreter shutdown are never disguised as a
        regular, catchable error.
        """

        @wraps(f)
        def wrapper(*args, **kwargs):
            try:
                return f(*args, **kwargs)
            except cls:
                # Already the right error type: keep the original traceback.
                raise
            except Exception as error:
                raise cls(cls.error_string.format(str(error))) from error

        return wrapper
26
+
27
+
28
class SourceConnectionError(CustomError):
    """Error raised when getting data from the upstream data source fails."""

    error_string = "Error in getting data from upstream data source: {}"
30
+
31
+
32
class SourceConnectionNetworkError(SourceConnectionError):
    """Source error specialised for failures connecting to the upstream source."""

    error_string = "Error in connecting to upstream data source: {}"
34
+
35
+
36
class DestinationConnectionError(CustomError):
    """Error raised when connecting to the downstream data source fails."""

    error_string = "Error in connecting to downstream data source: {}"
38
+
39
+
40
class EmbeddingEncoderConnectionError(CustomError):
    """Error raised when connecting to the embedding model provider fails."""

    error_string = "Error in connecting to the embedding model provider: {}"
42
+
43
+
44
class WriteError(CustomError):
    """Error raised when writing to the downstream data source fails."""

    error_string = "Error in writing to downstream data source: {}"
46
+
47
+
48
class PartitionError(CustomError):
    """Error raised when partitioning content fails."""

    error_string = "Error in partitioning content: {}"
@@ -0,0 +1,338 @@
1
+ #! /usr/bin/env python3
2
+
3
+ from typing import List, Optional, Tuple, Union
4
+
5
+ import click
6
+ from unstructured.metrics.evaluate import (
7
+ ElementTypeMetricsCalculator,
8
+ ObjectDetectionMetricsCalculator,
9
+ TableStructureMetricsCalculator,
10
+ TextExtractionMetricsCalculator,
11
+ filter_metrics,
12
+ get_mean_grouping,
13
+ )
14
+
15
+
16
# Root click group; the evaluation sub-commands below attach themselves to it
# through ``@main.command()``. Documented with a comment rather than a
# docstring so the generated ``--help`` text stays unchanged.
@click.group()
def main():
    return None
19
+
20
+
21
@main.command()
@click.option("--output_dir", type=str, help="Directory to structured output.")
@click.option("--source_dir", type=str, help="Directory to source.")
@click.option(
    "--output_list",
    type=str,
    multiple=True,
    help=(
        "Optional: list of selected structured output file names under the "
        "directory to be evaluated. If none, all files under directory will be used."
    ),
)
@click.option(
    "--source_list",
    type=str,
    multiple=True,
    help=(
        "Optional: list of selected source file names under the directory "
        "to be evaluated. If none, all files under directory will be used."
    ),
)
@click.option(
    "--export_dir",
    type=str,
    default="metrics",
    help=(
        "Directory to save the output evaluation metrics to. Default to "
        "your/working/dir/metrics/"
    ),
)
@click.option("--group_by", type=str, help="Input field for aggregation, or leave blank if none.")
@click.option(
    "--weights",
    type=(int, int, int),
    default=(2, 1, 1),
    show_default=True,
    help=(
        "A list of weights to the Levenshtein distance calculation. Takes input as "
        "--weights 2 2 2. See text_extraction.py/calculate_edit_distance for more details."
    ),
)
@click.option(
    "--visualize",
    is_flag=True,
    show_default=True,
    default=False,
    help="Add the flag to show progress bar.",
)
@click.option(
    "--output_type",
    type=str,
    default="json",
    show_default=True,
    help="Takes in either `txt` or `json` as output_type.",
)
def measure_text_extraction_accuracy_command(
    output_dir: str,
    source_dir: str,
    export_dir: str,
    weights: Tuple[int, int, int],
    visualize: bool,
    output_type: str,
    output_list: Optional[List[str]] = None,
    source_list: Optional[List[str]] = None,
    group_by: Optional[str] = None,
):
    """Run TextExtractionMetricsCalculator on the selected files and export the metrics."""
    # Help texts use implicit string concatenation: a backslash continuation
    # inside a literal would embed the source indentation in the message.
    calculator = TextExtractionMetricsCalculator(
        documents_dir=output_dir,
        ground_truths_dir=source_dir,
        group_by=group_by,
        weights=weights,
        document_type=output_type,
    )
    # click delivers ``multiple=True`` options as tuples (empty when omitted).
    return calculator.on_files(
        document_paths=output_list, ground_truth_paths=source_list
    ).calculate(export_dir=export_dir, visualize_progress=visualize, display_agg_df=True)
90
+
91
+
92
@main.command()
@click.option("--output_dir", type=str, help="Directory to structured output.")
@click.option("--source_dir", type=str, help="Directory to structured source.")
@click.option(
    "--output_list",
    type=str,
    multiple=True,
    help=(
        "Optional: list of selected structured output file names under the "
        "directory to be evaluated. If none, all files under directory will be used."
    ),
)
@click.option(
    "--source_list",
    type=str,
    multiple=True,
    help=(
        "Optional: list of selected source file names under the directory "
        "to be evaluated. If none, all files under directory will be used."
    ),
)
@click.option(
    "--export_dir",
    type=str,
    default="metrics",
    help=(
        "Directory to save the output evaluation metrics to. Default to "
        "your/working/dir/metrics/"
    ),
)
@click.option(
    "--visualize",
    is_flag=True,
    show_default=True,
    default=False,
    help="Add the flag to show progress bar.",
)
def measure_element_type_accuracy_command(
    output_dir: str,
    source_dir: str,
    export_dir: str,
    visualize: bool,
    output_list: Optional[List[str]] = None,
    source_list: Optional[List[str]] = None,
):
    """Run ElementTypeMetricsCalculator on the selected files and export the metrics."""
    # Help texts use implicit string concatenation: a backslash continuation
    # inside a literal would embed the source indentation in the message.
    calculator = ElementTypeMetricsCalculator(
        documents_dir=output_dir,
        ground_truths_dir=source_dir,
    )
    return calculator.on_files(
        document_paths=output_list, ground_truth_paths=source_list
    ).calculate(export_dir=export_dir, visualize_progress=visualize, display_agg_df=True)
139
+
140
+
141
@main.command()
@click.option(
    "--group_by",
    type=str,
    required=True,
    help="The category to group by; valid values are 'doctype' and 'connector'.",
)
@click.option(
    "--data_input",
    type=str,
    required=True,
    help="A dataframe or path to the CSV/TSV file containing the data",
)
@click.option(
    "--export_dir",
    type=str,
    default="metrics",
    help=(
        "Directory to save the output evaluation metrics to. Default to "
        "your/working/dir/metrics/"
    ),
)
@click.option(
    "--eval_name",
    type=str,
    help="Evaluated metric. Expecting one of 'text_extraction' or 'element_type'",
)
@click.option(
    "--agg_name",
    type=str,
    help=(
        "String to use with export filename. Default is `cct` for `text_extraction` "
        "and `element-type` for `element_type`"
    ),
)
@click.option(
    "--export_filename", type=str, help="Optional. Define your file name for the output here."
)
def get_mean_grouping_command(
    group_by: str,
    data_input: str,
    export_dir: str,
    eval_name: str,
    agg_name: Optional[str] = None,
    export_filename: Optional[str] = None,
):
    """Forward the command-line options to `get_mean_grouping` and return its result."""
    # Help texts use implicit string concatenation: a backslash continuation
    # inside a literal would embed the source indentation in the message.
    # NOTE(review): --eval_name is neither required nor defaulted, so click
    # passes None when it is omitted - confirm get_mean_grouping accepts that.
    return get_mean_grouping(
        group_by=group_by,
        data_input=data_input,
        export_dir=export_dir,
        eval_name=eval_name,
        agg_name=agg_name,
        export_filename=export_filename,
    )
191
+
192
+
193
@main.command()
@click.option("--output_dir", type=str, help="Directory to structured output.")
@click.option("--source_dir", type=str, help="Directory to structured source.")
@click.option(
    "--output_list",
    type=str,
    multiple=True,
    help=(
        "Optional: list of selected structured output file names under the "
        "directory to be evaluated. If none, all files under directory will be used."
    ),
)
@click.option(
    "--source_list",
    type=str,
    multiple=True,
    help=(
        "Optional: list of selected source file names under the directory "
        "to be evaluated. If none, all files under directory will be used."
    ),
)
@click.option(
    "--export_dir",
    type=str,
    default="metrics",
    help=(
        "Directory to save the output evaluation metrics to. Default to "
        "your/working/dir/metrics/"
    ),
)
@click.option(
    "--visualize",
    is_flag=True,
    show_default=True,
    default=False,
    help="Add the flag to show progress bar.",
)
@click.option(
    "--cutoff",
    type=float,
    show_default=True,
    default=0.8,
    help=(
        "The cutoff value for the element level alignment. "
        "If not set, a default value is used"
    ),
)
def measure_table_structure_accuracy_command(
    output_dir: str,
    source_dir: str,
    export_dir: str,
    visualize: bool,
    output_list: Optional[List[str]] = None,
    source_list: Optional[List[str]] = None,
    cutoff: Optional[float] = None,
):
    """Run TableStructureMetricsCalculator on the selected files and export the metrics."""
    # Help texts use implicit string concatenation: a backslash continuation
    # inside a literal would embed the source indentation in the message.
    # When invoked through click, ``cutoff`` arrives as 0.8 unless overridden;
    # the ``None`` default only applies to direct Python calls.
    calculator = TableStructureMetricsCalculator(
        documents_dir=output_dir,
        ground_truths_dir=source_dir,
        cutoff=cutoff,
    )
    return calculator.on_files(
        document_paths=output_list, ground_truth_paths=source_list
    ).calculate(export_dir=export_dir, visualize_progress=visualize, display_agg_df=True)
250
+
251
+
252
@main.command()
@click.option("--output_dir", type=str, help="Directory to structured output.")
@click.option("--source_dir", type=str, help="Directory to structured source.")
@click.option(
    "--output_list",
    type=str,
    multiple=True,
    help=(
        "Optional: list of selected structured output file names under the "
        "directory to be evaluated. If none, all files under directory will be used."
    ),
)
@click.option(
    "--source_list",
    type=str,
    multiple=True,
    help=(
        "Optional: list of selected source file names under the directory "
        "to be evaluated. If none, all files under directory will be used."
    ),
)
@click.option(
    "--export_dir",
    type=str,
    default="metrics",
    help=(
        "Directory to save the output evaluation metrics to. Default to "
        "your/working/dir/metrics/"
    ),
)
@click.option(
    "--visualize",
    is_flag=True,
    show_default=True,
    default=False,
    help="Add the flag to show progress bar.",
)
def measure_object_detection_metrics_command(
    output_dir: str,
    source_dir: str,
    export_dir: str,
    visualize: bool,
    output_list: Optional[List[str]] = None,
    source_list: Optional[List[str]] = None,
):
    """Run ObjectDetectionMetricsCalculator on the selected files and export the metrics."""
    # All multi-line help texts now share the parenthesised form --output_list
    # already used, instead of backslash continuations inside the literal.
    calculator = ObjectDetectionMetricsCalculator(
        documents_dir=output_dir,
        ground_truths_dir=source_dir,
    )
    return calculator.on_files(
        document_paths=output_list, ground_truth_paths=source_list
    ).calculate(export_dir=export_dir, visualize_progress=visualize, display_agg_df=True)
301
+
302
+
303
@main.command()
@click.option(
    "--data_input", type=str, required=True, help="Takes in path to data file as .tsv .csv .txt"
)
@click.option(
    "--filter_list",
    type=str,
    required=True,
    help="Takes in list of string to filter the data_input.",
)
@click.option(
    "--filter_by",
    type=str,
    default="filename",
    help="Field from data_input to match with filter_list. Default is `filename`.",
)
@click.option(
    "--export_filename", type=str, help="Export filename. Required when return_type is `file`"
)
@click.option("--export_dir", type=str, default="metrics", help="Export directory.")
@click.option(
    "--return_type",
    type=str,
    default="file",
    help="`dataframe` or `file`. Default is `file`.",
)
def filter_metrics_command(
    data_input: str,
    filter_list: Union[str, List[str]],
    filter_by: str = "filename",
    export_filename: Optional[str] = None,
    export_dir: str = "metrics",
    return_type: str = "file",
):
    """Forward the command-line options to `filter_metrics` and return its result."""
    # click passes every declared option to the callback and uses None for
    # options given without a value, so the Python-level defaults in the
    # signature never apply on the command line. The defaults are therefore
    # declared on the options themselves; --filter_by is no longer
    # ``required`` because its help text documents a default.
    return filter_metrics(
        data_input, filter_list, filter_by, export_filename, export_dir, return_type
    )
335
+
336
+
337
# Invoke the click group only when this module is executed as a script.
if __name__ == "__main__":
    main()
@@ -0,0 +1,3 @@
1
+ from ._wrapper import RetryHandler
2
+
3
+ __all__ = ["RetryHandler"]
@@ -0,0 +1,102 @@
1
+ import logging
2
+ import sys
3
+ import traceback
4
+
5
+
6
+ # Default startup handler
7
+ def _log_start(details, logger, log_level):
8
+ max_tried = details.get("max_tries")
9
+ max_time = details.get("max_time")
10
+ if max_tried is not None and max_time is not None:
11
+ s = "%.1fs or %d tries"
12
+ s_args = [max_time, max_tried]
13
+ elif max_tried is not None:
14
+ s = "%d tries"
15
+ s_args = [max_tried]
16
+ else:
17
+ s = "%.1fs"
18
+ s_args = [max_time]
19
+ exception = details.get("exception")
20
+ if isinstance(exception, tuple):
21
+ exception = list(exception)
22
+ elif not isinstance(exception, list):
23
+ exception = [exception]
24
+ exception_s = ", ".join([e.__name__ for e in exception])
25
+ if log_level >= logging.INFO:
26
+ msg = f"Attempting %s(...), will retry for {s} given these issues: %s"
27
+ log_args = [details["target"].__name__] + s_args + [exception_s]
28
+ else:
29
+ msg = f"Attempting %s(%s), will retry for {s} given these issues: %s"
30
+ target_input_list = []
31
+ if args := details.get("args"):
32
+ target_input_list.extend([str(d) for d in args])
33
+ if kwargs := details.get("kwargs"):
34
+ target_input_list.extend([f"{k}={str(v)}" for k, v in kwargs.items()])
35
+ target_input = ", ".join(target_input_list) if target_input_list else ""
36
+ log_args = (
37
+ [
38
+ details["target"].__name__,
39
+ target_input,
40
+ ]
41
+ + s_args
42
+ + [exception_s]
43
+ )
44
+ logger.log(log_level, msg, *log_args)
45
+
46
+
47
+ # Default backoff handler
48
+ def _log_backoff(details, logger, log_level):
49
+ if log_level >= logging.INFO:
50
+ msg = "Backing off %s(...) for %.1fs (%s)"
51
+ log_args = [details["target"].__name__, details["tries"]]
52
+ else:
53
+ msg = "Backing off %.1fs seconds after %d tries calling function %s(%s) -> %s"
54
+ target_input_list = []
55
+ if args := details.get("args"):
56
+ target_input_list.extend([str(d) for d in args])
57
+ if kwargs := details.get("kwargs"):
58
+ target_input_list.extend([f"{k}={str(v)}" for k, v in kwargs.items()])
59
+ target_input = ", ".join(target_input_list) if target_input_list else ""
60
+ log_args = [
61
+ details["wait"],
62
+ details["tries"],
63
+ details["target"].__name__,
64
+ target_input,
65
+ ]
66
+ exc_typ, exc, _ = sys.exc_info()
67
+ if exc is not None:
68
+ exc_fmt = traceback.format_exception_only(exc_typ, exc)[-1]
69
+ log_args.append(exc_fmt.rstrip("\n"))
70
+ else:
71
+ log_args.append(str(details["value"]))
72
+ logger.log(log_level, msg, *log_args)
73
+
74
+
75
+ # Default giveup handler
76
+ def _log_giveup(details, logger, log_level):
77
+ if log_level >= logging.INFO:
78
+ msg = "Giving up %s(...) after %.1fs (%s)"
79
+ log_args = [details["target"].__name__, details["tries"]]
80
+ else:
81
+ msg = "Giving up after %d tries (%.1fs) calling function %s(%s) -> %s"
82
+ target_input_list = []
83
+ if args := details.get("args"):
84
+ target_input_list.extend([str(d) for d in args])
85
+ if kwargs := details.get("kwargs"):
86
+ target_input_list.extend([f"{k}={str(v)}" for k, v in kwargs.items()])
87
+ target_input = ", ".join(target_input_list) if target_input_list else "..."
88
+ log_args = [
89
+ details["tries"],
90
+ details["wait"],
91
+ details["target"].__name__,
92
+ target_input,
93
+ ]
94
+
95
+ exc_typ, exc, _ = sys.exc_info()
96
+ if exc is not None:
97
+ exc_fmt = traceback.format_exception_only(exc_typ, exc)[-1]
98
+ log_args.append(exc_fmt.rstrip("\n"))
99
+ else:
100
+ log_args.append(details["value"])
101
+
102
+ logger.log(log_level, msg, *log_args)
@@ -0,0 +1,122 @@
1
+ # coding:utf-8
2
+ import logging
3
+ from collections.abc import Iterable as IterableType
4
+ from typing import Any, Iterable, Optional, Type, Union
5
+
6
+ from backoff import _sync
7
+ from backoff._common import _config_handlers, _prepare_logger
8
+ from backoff._jitter import full_jitter
9
+ from backoff._typing import (
10
+ _Handler,
11
+ _Jitterer,
12
+ _MaybeCallable,
13
+ _MaybeLogger,
14
+ _MaybeSequence,
15
+ _Predicate,
16
+ _WaitGenerator,
17
+ )
18
+
19
+ from unstructured_ingest.ingest_backoff._common import _log_backoff, _log_giveup, _log_start
20
+
21
+
22
class RetryHandler:
    """Reusable retry configuration that can be invoked on any callable.

    An instance stores the wait generator, the exception type(s) to retry on,
    the retry limits and the event handlers. Calling the instance with a target
    fires the start handlers and then runs the target through
    ``backoff._sync.retry_exception`` using the stored configuration.
    """

    def __init__(
        self,
        wait_gen: _WaitGenerator,
        exception: _MaybeSequence[Type[Exception]],
        *,
        max_tries: Optional[_MaybeCallable[int]] = None,
        max_time: Optional[_MaybeCallable[float]] = None,
        jitter: Union[_Jitterer, None] = full_jitter,
        giveup: _Predicate[Exception] = lambda e: False,
        on_start: Union[_Handler, Iterable[_Handler], None] = None,
        on_success: Union[_Handler, Iterable[_Handler], None] = None,
        on_backoff: Union[_Handler, Iterable[_Handler], None] = None,
        on_giveup: Union[_Handler, Iterable[_Handler], None] = None,
        raise_on_giveup: bool = True,
        logger: _MaybeLogger = "backoff",
        start_log_level: int = logging.INFO,
        backoff_log_level: int = logging.INFO,
        giveup_log_level: int = logging.ERROR,
        **wait_gen_kwargs: Any,
    ):
        """Prepare the handlers and store the retry configuration.

        The start, backoff and give-up handlers are each configured with the
        module's default logging handler at the matching log level; the success
        handlers get no default. Extra keyword arguments are kept and later
        passed on as ``wait_gen_kwargs``.
        """
        log = _prepare_logger(logger)

        success_handlers = _config_handlers(on_success)
        start_handlers = _config_handlers(
            on_start,
            default_handler=_log_start,
            logger=log,
            log_level=start_log_level,
        )
        backoff_handlers = _config_handlers(
            on_backoff,
            default_handler=_log_backoff,
            logger=log,
            log_level=backoff_log_level,
        )
        giveup_handlers = _config_handlers(
            on_giveup,
            default_handler=_log_giveup,
            logger=log,
            log_level=giveup_log_level,
        )

        # A single exception class is not iterable; a sequence of them is.
        if isinstance(exception, IterableType):
            exception_names = ", ".join([e.__name__ for e in exception])
        else:
            exception_names = exception.__name__

        summary = (
            "Initiating retry handler with "
            "max_tries={}, "
            "max_time={}, "
            "exception={}, "
            "start_log_level={}, "
            "backoff_log_level={}, "
            "giveup_log_level={}"
        ).format(
            max_tries,
            max_time,
            exception_names,
            logging.getLevelName(start_log_level),
            logging.getLevelName(backoff_log_level),
            logging.getLevelName(giveup_log_level),
        )
        log.debug(summary)

        # Retry policy.
        self.wait_gen = wait_gen
        self.wait_gen_kwargs = wait_gen_kwargs
        self.exception = exception
        self.max_tries = max_tries
        self.max_time = max_time
        self.jitter = jitter
        self.giveup = giveup
        self.raise_on_giveup = raise_on_giveup
        # Event handlers.
        self.on_start = start_handlers
        self.on_success = success_handlers
        self.on_backoff = backoff_handlers
        self.on_giveup = giveup_handlers

    def __call__(self, target, *args, **kwargs):
        """Fire the start handlers, then call ``target(*args, **kwargs)`` with retries.

        Returns whatever the wrapped call returns.
        """
        start_details = {
            "target": target,
            "args": args,
            "kwargs": kwargs,
            # No attempt has been made yet, so there is nothing to report here.
            "tries": None,
            "elapsed": None,
            "max_tries": self.max_tries,
            "max_time": self.max_time,
            "exception": self.exception,
        }
        _sync._call_handlers(self.on_start, **start_details)

        retrying = _sync.retry_exception(
            target,
            self.wait_gen,
            self.exception,
            max_tries=self.max_tries,
            max_time=self.max_time,
            jitter=self.jitter,
            giveup=self.giveup,
            on_success=self.on_success,
            on_backoff=self.on_backoff,
            on_giveup=self.on_giveup,
            raise_on_giveup=self.raise_on_giveup,
            wait_gen_kwargs=self.wait_gen_kwargs,
        )
        return retrying(*args, **kwargs)