unstructured-ingest 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (356) hide show
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/__init__.py +14 -0
  4. unstructured_ingest/cli/base/__init__.py +0 -0
  5. unstructured_ingest/cli/base/cmd.py +19 -0
  6. unstructured_ingest/cli/base/dest.py +87 -0
  7. unstructured_ingest/cli/base/src.py +57 -0
  8. unstructured_ingest/cli/cli.py +32 -0
  9. unstructured_ingest/cli/cmd_factory.py +12 -0
  10. unstructured_ingest/cli/cmds/__init__.py +145 -0
  11. unstructured_ingest/cli/cmds/airtable.py +69 -0
  12. unstructured_ingest/cli/cmds/astra.py +99 -0
  13. unstructured_ingest/cli/cmds/azure_cognitive_search.py +65 -0
  14. unstructured_ingest/cli/cmds/biomed.py +52 -0
  15. unstructured_ingest/cli/cmds/chroma.py +104 -0
  16. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  17. unstructured_ingest/cli/cmds/confluence.py +69 -0
  18. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  19. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  20. unstructured_ingest/cli/cmds/discord.py +47 -0
  21. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  22. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  23. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  24. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  25. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  26. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  27. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  28. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  29. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  30. unstructured_ingest/cli/cmds/github.py +54 -0
  31. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  32. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  33. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  34. unstructured_ingest/cli/cmds/jira.py +71 -0
  35. unstructured_ingest/cli/cmds/kafka.py +102 -0
  36. unstructured_ingest/cli/cmds/local.py +43 -0
  37. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  38. unstructured_ingest/cli/cmds/notion.py +48 -0
  39. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  40. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  41. unstructured_ingest/cli/cmds/outlook.py +67 -0
  42. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  43. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  44. unstructured_ingest/cli/cmds/reddit.py +67 -0
  45. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  46. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  47. unstructured_ingest/cli/cmds/slack.py +56 -0
  48. unstructured_ingest/cli/cmds/sql.py +66 -0
  49. unstructured_ingest/cli/cmds/vectara.py +66 -0
  50. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  51. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  52. unstructured_ingest/cli/common.py +7 -0
  53. unstructured_ingest/cli/interfaces.py +656 -0
  54. unstructured_ingest/cli/utils.py +205 -0
  55. unstructured_ingest/connector/__init__.py +0 -0
  56. unstructured_ingest/connector/airtable.py +309 -0
  57. unstructured_ingest/connector/astra.py +237 -0
  58. unstructured_ingest/connector/azure_cognitive_search.py +144 -0
  59. unstructured_ingest/connector/biomed.py +313 -0
  60. unstructured_ingest/connector/chroma.py +158 -0
  61. unstructured_ingest/connector/clarifai.py +122 -0
  62. unstructured_ingest/connector/confluence.py +285 -0
  63. unstructured_ingest/connector/databricks_volumes.py +137 -0
  64. unstructured_ingest/connector/delta_table.py +203 -0
  65. unstructured_ingest/connector/discord.py +180 -0
  66. unstructured_ingest/connector/elasticsearch.py +396 -0
  67. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  68. unstructured_ingest/connector/fsspec/azure.py +78 -0
  69. unstructured_ingest/connector/fsspec/box.py +109 -0
  70. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  71. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  72. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  73. unstructured_ingest/connector/fsspec/s3.py +62 -0
  74. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  75. unstructured_ingest/connector/git.py +124 -0
  76. unstructured_ingest/connector/github.py +173 -0
  77. unstructured_ingest/connector/gitlab.py +142 -0
  78. unstructured_ingest/connector/google_drive.py +349 -0
  79. unstructured_ingest/connector/hubspot.py +278 -0
  80. unstructured_ingest/connector/jira.py +469 -0
  81. unstructured_ingest/connector/kafka.py +294 -0
  82. unstructured_ingest/connector/local.py +139 -0
  83. unstructured_ingest/connector/mongodb.py +285 -0
  84. unstructured_ingest/connector/notion/__init__.py +0 -0
  85. unstructured_ingest/connector/notion/client.py +233 -0
  86. unstructured_ingest/connector/notion/connector.py +468 -0
  87. unstructured_ingest/connector/notion/helpers.py +584 -0
  88. unstructured_ingest/connector/notion/interfaces.py +32 -0
  89. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  90. unstructured_ingest/connector/notion/types/block.py +95 -0
  91. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  92. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  93. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  94. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  95. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  96. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  97. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  98. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  99. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  100. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  101. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  102. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  103. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  104. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  105. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  106. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  107. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  108. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  109. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  110. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  111. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  112. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  113. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  114. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  115. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  116. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  117. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  118. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  119. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  120. unstructured_ingest/connector/notion/types/database.py +72 -0
  121. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  122. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  123. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  124. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  125. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  126. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  127. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  128. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  129. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  130. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  131. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  132. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  133. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  134. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  135. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  136. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  137. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  138. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  139. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  140. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  141. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  142. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  143. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  144. unstructured_ingest/connector/notion/types/date.py +26 -0
  145. unstructured_ingest/connector/notion/types/file.py +51 -0
  146. unstructured_ingest/connector/notion/types/page.py +44 -0
  147. unstructured_ingest/connector/notion/types/parent.py +66 -0
  148. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  149. unstructured_ingest/connector/notion/types/user.py +76 -0
  150. unstructured_ingest/connector/onedrive.py +232 -0
  151. unstructured_ingest/connector/opensearch.py +218 -0
  152. unstructured_ingest/connector/outlook.py +285 -0
  153. unstructured_ingest/connector/pinecone.py +140 -0
  154. unstructured_ingest/connector/qdrant.py +144 -0
  155. unstructured_ingest/connector/reddit.py +166 -0
  156. unstructured_ingest/connector/registry.py +109 -0
  157. unstructured_ingest/connector/salesforce.py +301 -0
  158. unstructured_ingest/connector/sharepoint.py +573 -0
  159. unstructured_ingest/connector/slack.py +224 -0
  160. unstructured_ingest/connector/sql.py +199 -0
  161. unstructured_ingest/connector/vectara.py +248 -0
  162. unstructured_ingest/connector/weaviate.py +190 -0
  163. unstructured_ingest/connector/wikipedia.py +208 -0
  164. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  165. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  166. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  167. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  168. unstructured_ingest/error.py +49 -0
  169. unstructured_ingest/evaluate.py +338 -0
  170. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  171. unstructured_ingest/ingest_backoff/_common.py +102 -0
  172. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  173. unstructured_ingest/interfaces.py +838 -0
  174. unstructured_ingest/logger.py +130 -0
  175. unstructured_ingest/main.py +11 -0
  176. unstructured_ingest/pipeline/__init__.py +22 -0
  177. unstructured_ingest/pipeline/copy.py +19 -0
  178. unstructured_ingest/pipeline/doc_factory.py +12 -0
  179. unstructured_ingest/pipeline/interfaces.py +265 -0
  180. unstructured_ingest/pipeline/partition.py +60 -0
  181. unstructured_ingest/pipeline/permissions.py +12 -0
  182. unstructured_ingest/pipeline/pipeline.py +117 -0
  183. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  184. unstructured_ingest/pipeline/reformat/chunking.py +130 -0
  185. unstructured_ingest/pipeline/reformat/embedding.py +66 -0
  186. unstructured_ingest/pipeline/source.py +77 -0
  187. unstructured_ingest/pipeline/utils.py +6 -0
  188. unstructured_ingest/pipeline/write.py +18 -0
  189. unstructured_ingest/processor.py +93 -0
  190. unstructured_ingest/runner/__init__.py +104 -0
  191. unstructured_ingest/runner/airtable.py +35 -0
  192. unstructured_ingest/runner/astra.py +34 -0
  193. unstructured_ingest/runner/base_runner.py +89 -0
  194. unstructured_ingest/runner/biomed.py +45 -0
  195. unstructured_ingest/runner/confluence.py +35 -0
  196. unstructured_ingest/runner/delta_table.py +34 -0
  197. unstructured_ingest/runner/discord.py +35 -0
  198. unstructured_ingest/runner/elasticsearch.py +40 -0
  199. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  200. unstructured_ingest/runner/fsspec/azure.py +30 -0
  201. unstructured_ingest/runner/fsspec/box.py +28 -0
  202. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  203. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  204. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  205. unstructured_ingest/runner/fsspec/s3.py +28 -0
  206. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  207. unstructured_ingest/runner/github.py +37 -0
  208. unstructured_ingest/runner/gitlab.py +37 -0
  209. unstructured_ingest/runner/google_drive.py +35 -0
  210. unstructured_ingest/runner/hubspot.py +35 -0
  211. unstructured_ingest/runner/jira.py +35 -0
  212. unstructured_ingest/runner/kafka.py +34 -0
  213. unstructured_ingest/runner/local.py +23 -0
  214. unstructured_ingest/runner/mongodb.py +34 -0
  215. unstructured_ingest/runner/notion.py +61 -0
  216. unstructured_ingest/runner/onedrive.py +35 -0
  217. unstructured_ingest/runner/opensearch.py +40 -0
  218. unstructured_ingest/runner/outlook.py +33 -0
  219. unstructured_ingest/runner/reddit.py +35 -0
  220. unstructured_ingest/runner/salesforce.py +33 -0
  221. unstructured_ingest/runner/sharepoint.py +35 -0
  222. unstructured_ingest/runner/slack.py +33 -0
  223. unstructured_ingest/runner/utils.py +47 -0
  224. unstructured_ingest/runner/wikipedia.py +35 -0
  225. unstructured_ingest/runner/writers/__init__.py +48 -0
  226. unstructured_ingest/runner/writers/astra.py +22 -0
  227. unstructured_ingest/runner/writers/azure_cognitive_search.py +24 -0
  228. unstructured_ingest/runner/writers/base_writer.py +26 -0
  229. unstructured_ingest/runner/writers/chroma.py +22 -0
  230. unstructured_ingest/runner/writers/clarifai.py +19 -0
  231. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  232. unstructured_ingest/runner/writers/delta_table.py +24 -0
  233. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  234. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  235. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  236. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  237. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  238. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  239. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  240. unstructured_ingest/runner/writers/kafka.py +21 -0
  241. unstructured_ingest/runner/writers/mongodb.py +21 -0
  242. unstructured_ingest/runner/writers/opensearch.py +26 -0
  243. unstructured_ingest/runner/writers/pinecone.py +21 -0
  244. unstructured_ingest/runner/writers/qdrant.py +19 -0
  245. unstructured_ingest/runner/writers/sql.py +22 -0
  246. unstructured_ingest/runner/writers/vectara.py +22 -0
  247. unstructured_ingest/runner/writers/weaviate.py +21 -0
  248. unstructured_ingest/utils/__init__.py +0 -0
  249. unstructured_ingest/utils/compression.py +117 -0
  250. unstructured_ingest/utils/data_prep.py +112 -0
  251. unstructured_ingest/utils/dep_check.py +66 -0
  252. unstructured_ingest/utils/string_and_date_utils.py +39 -0
  253. unstructured_ingest/utils/table.py +73 -0
  254. unstructured_ingest/v2/__init__.py +1 -0
  255. unstructured_ingest/v2/cli/__init__.py +0 -0
  256. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  257. unstructured_ingest/v2/cli/base/cmd.py +215 -0
  258. unstructured_ingest/v2/cli/base/dest.py +76 -0
  259. unstructured_ingest/v2/cli/base/importer.py +34 -0
  260. unstructured_ingest/v2/cli/base/src.py +70 -0
  261. unstructured_ingest/v2/cli/cli.py +24 -0
  262. unstructured_ingest/v2/cli/cmds/__init__.py +87 -0
  263. unstructured_ingest/v2/cli/cmds/astra.py +85 -0
  264. unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +72 -0
  265. unstructured_ingest/v2/cli/cmds/chroma.py +108 -0
  266. unstructured_ingest/v2/cli/cmds/databricks_volumes.py +161 -0
  267. unstructured_ingest/v2/cli/cmds/elasticsearch.py +159 -0
  268. unstructured_ingest/v2/cli/cmds/fsspec/__init__.py +0 -0
  269. unstructured_ingest/v2/cli/cmds/fsspec/azure.py +84 -0
  270. unstructured_ingest/v2/cli/cmds/fsspec/box.py +58 -0
  271. unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +58 -0
  272. unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +77 -0
  273. unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +81 -0
  274. unstructured_ingest/v2/cli/cmds/fsspec/s3.py +84 -0
  275. unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +80 -0
  276. unstructured_ingest/v2/cli/cmds/google_drive.py +74 -0
  277. unstructured_ingest/v2/cli/cmds/local.py +60 -0
  278. unstructured_ingest/v2/cli/cmds/mongodb.py +62 -0
  279. unstructured_ingest/v2/cli/cmds/onedrive.py +91 -0
  280. unstructured_ingest/v2/cli/cmds/opensearch.py +93 -0
  281. unstructured_ingest/v2/cli/cmds/pinecone.py +62 -0
  282. unstructured_ingest/v2/cli/cmds/salesforce.py +79 -0
  283. unstructured_ingest/v2/cli/cmds/sharepoint.py +112 -0
  284. unstructured_ingest/v2/cli/cmds/singlestore.py +96 -0
  285. unstructured_ingest/v2/cli/cmds/sql.py +84 -0
  286. unstructured_ingest/v2/cli/cmds/weaviate.py +100 -0
  287. unstructured_ingest/v2/cli/configs/__init__.py +6 -0
  288. unstructured_ingest/v2/cli/configs/chunk.py +89 -0
  289. unstructured_ingest/v2/cli/configs/embed.py +74 -0
  290. unstructured_ingest/v2/cli/configs/partition.py +99 -0
  291. unstructured_ingest/v2/cli/configs/processor.py +88 -0
  292. unstructured_ingest/v2/cli/interfaces.py +27 -0
  293. unstructured_ingest/v2/cli/utils.py +240 -0
  294. unstructured_ingest/v2/example.py +37 -0
  295. unstructured_ingest/v2/interfaces/__init__.py +29 -0
  296. unstructured_ingest/v2/interfaces/connector.py +32 -0
  297. unstructured_ingest/v2/interfaces/downloader.py +79 -0
  298. unstructured_ingest/v2/interfaces/file_data.py +49 -0
  299. unstructured_ingest/v2/interfaces/indexer.py +28 -0
  300. unstructured_ingest/v2/interfaces/process.py +20 -0
  301. unstructured_ingest/v2/interfaces/processor.py +48 -0
  302. unstructured_ingest/v2/interfaces/upload_stager.py +48 -0
  303. unstructured_ingest/v2/interfaces/uploader.py +39 -0
  304. unstructured_ingest/v2/logger.py +126 -0
  305. unstructured_ingest/v2/main.py +11 -0
  306. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  307. unstructured_ingest/v2/pipeline/interfaces.py +167 -0
  308. unstructured_ingest/v2/pipeline/pipeline.py +284 -0
  309. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  310. unstructured_ingest/v2/pipeline/steps/chunk.py +85 -0
  311. unstructured_ingest/v2/pipeline/steps/download.py +124 -0
  312. unstructured_ingest/v2/pipeline/steps/embed.py +84 -0
  313. unstructured_ingest/v2/pipeline/steps/index.py +61 -0
  314. unstructured_ingest/v2/pipeline/steps/partition.py +78 -0
  315. unstructured_ingest/v2/pipeline/steps/stage.py +64 -0
  316. unstructured_ingest/v2/pipeline/steps/uncompress.py +68 -0
  317. unstructured_ingest/v2/pipeline/steps/upload.py +73 -0
  318. unstructured_ingest/v2/pipeline/utils.py +15 -0
  319. unstructured_ingest/v2/processes/__init__.py +0 -0
  320. unstructured_ingest/v2/processes/chunker.py +97 -0
  321. unstructured_ingest/v2/processes/connector_registry.py +63 -0
  322. unstructured_ingest/v2/processes/connectors/__init__.py +77 -0
  323. unstructured_ingest/v2/processes/connectors/astra.py +152 -0
  324. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +211 -0
  325. unstructured_ingest/v2/processes/connectors/chroma.py +204 -0
  326. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +96 -0
  327. unstructured_ingest/v2/processes/connectors/elasticsearch.py +401 -0
  328. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  329. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +144 -0
  330. unstructured_ingest/v2/processes/connectors/fsspec/box.py +131 -0
  331. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +130 -0
  332. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +342 -0
  333. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +141 -0
  334. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +164 -0
  335. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +166 -0
  336. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  337. unstructured_ingest/v2/processes/connectors/google_drive.py +335 -0
  338. unstructured_ingest/v2/processes/connectors/local.py +204 -0
  339. unstructured_ingest/v2/processes/connectors/mongodb.py +138 -0
  340. unstructured_ingest/v2/processes/connectors/onedrive.py +216 -0
  341. unstructured_ingest/v2/processes/connectors/opensearch.py +155 -0
  342. unstructured_ingest/v2/processes/connectors/pinecone.py +178 -0
  343. unstructured_ingest/v2/processes/connectors/salesforce.py +293 -0
  344. unstructured_ingest/v2/processes/connectors/sharepoint.py +412 -0
  345. unstructured_ingest/v2/processes/connectors/singlestore.py +160 -0
  346. unstructured_ingest/v2/processes/connectors/sql.py +269 -0
  347. unstructured_ingest/v2/processes/connectors/utils.py +19 -0
  348. unstructured_ingest/v2/processes/connectors/weaviate.py +235 -0
  349. unstructured_ingest/v2/processes/embedder.py +76 -0
  350. unstructured_ingest/v2/processes/partitioner.py +166 -0
  351. unstructured_ingest/v2/processes/uncompress.py +43 -0
  352. unstructured_ingest-0.0.0.dist-info/METADATA +319 -0
  353. unstructured_ingest-0.0.0.dist-info/RECORD +356 -0
  354. unstructured_ingest-0.0.0.dist-info/WHEEL +5 -0
  355. unstructured_ingest-0.0.0.dist-info/entry_points.txt +2 -0
  356. unstructured_ingest-0.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,284 @@
1
+ import logging
2
+ import multiprocessing as mp
3
+ from dataclasses import InitVar, dataclass, field
4
+ from time import time
5
+ from typing import Any, Optional, Union
6
+
7
+ from unstructured_ingest.v2.interfaces import ProcessorConfig
8
+ from unstructured_ingest.v2.logger import logger, make_default_logger
9
+ from unstructured_ingest.v2.pipeline.steps.chunk import Chunker, ChunkStep
10
+ from unstructured_ingest.v2.pipeline.steps.download import DownloaderT, DownloadStep
11
+ from unstructured_ingest.v2.pipeline.steps.embed import Embedder, EmbedStep
12
+ from unstructured_ingest.v2.pipeline.steps.index import IndexerT, IndexStep
13
+ from unstructured_ingest.v2.pipeline.steps.partition import Partitioner, PartitionStep
14
+ from unstructured_ingest.v2.pipeline.steps.stage import UploadStager, UploadStageStep
15
+ from unstructured_ingest.v2.pipeline.steps.uncompress import Uncompressor, UncompressStep
16
+ from unstructured_ingest.v2.pipeline.steps.upload import Uploader, UploadStep
17
+ from unstructured_ingest.v2.pipeline.utils import sterilize_dict
18
+ from unstructured_ingest.v2.processes.chunker import ChunkerConfig
19
+ from unstructured_ingest.v2.processes.connector_registry import (
20
+ ConnectionConfig,
21
+ DownloaderConfigT,
22
+ IndexerConfigT,
23
+ UploaderConfigT,
24
+ UploadStagerConfigT,
25
+ destination_registry,
26
+ source_registry,
27
+ )
28
+ from unstructured_ingest.v2.processes.connectors.local import LocalUploader
29
+ from unstructured_ingest.v2.processes.embedder import EmbedderConfig
30
+ from unstructured_ingest.v2.processes.partitioner import PartitionerConfig
31
+
32
+
33
+ class PipelineError(Exception):
34
+ pass
35
+
36
+
37
+ @dataclass
38
+ class Pipeline:
39
+ context: ProcessorConfig
40
+ indexer: InitVar[IndexerT]
41
+ indexer_step: IndexStep = field(init=False)
42
+ downloader: InitVar[DownloaderT]
43
+ downloader_step: DownloadStep = field(init=False)
44
+ partitioner: InitVar[Partitioner]
45
+ partitioner_step: PartitionStep = field(init=False)
46
+ chunker: InitVar[Optional[Chunker]] = None
47
+ chunker_step: ChunkStep = field(init=False, default=None)
48
+ embedder: InitVar[Optional[Embedder]] = None
49
+ embedder_step: EmbedStep = field(init=False, default=None)
50
+ stager: InitVar[Optional[UploadStager]] = None
51
+ stager_step: UploadStageStep = field(init=False, default=None)
52
+ uploader: InitVar[Uploader] = field(default=LocalUploader())
53
+ uploader_step: UploadStep = field(init=False, default=None)
54
+ uncompress_step: UncompressStep = field(init=False, default=None)
55
+
56
+ def __post_init__(
57
+ self,
58
+ indexer: IndexerT,
59
+ downloader: DownloaderT,
60
+ partitioner: Partitioner,
61
+ chunker: Chunker = None,
62
+ embedder: Embedder = None,
63
+ stager: UploadStager = None,
64
+ uploader: Uploader = None,
65
+ ):
66
+ make_default_logger(level=logging.DEBUG if self.context.verbose else logging.INFO)
67
+ self.indexer_step = IndexStep(process=indexer, context=self.context)
68
+ self.downloader_step = DownloadStep(process=downloader, context=self.context)
69
+ self.partitioner_step = PartitionStep(process=partitioner, context=self.context)
70
+ self.chunker_step = ChunkStep(process=chunker, context=self.context) if chunker else None
71
+
72
+ self.embedder_step = EmbedStep(process=embedder, context=self.context) if embedder else None
73
+ # TODO: support initialize() call from each step process
74
+ # Potential long call to download embedder models, run before any fanout:
75
+ if embedder and embedder.config:
76
+ embedder.config.get_embedder().initialize()
77
+
78
+ self.stager_step = UploadStageStep(process=stager, context=self.context) if stager else None
79
+ self.uploader_step = UploadStep(process=uploader, context=self.context)
80
+ if self.context.uncompress:
81
+ process = Uncompressor()
82
+ self.uncompress_step = UncompressStep(process=process, context=self.context)
83
+
84
+ self.check_destination_connector()
85
+
86
+ def check_destination_connector(self):
87
+ # Make sure that if the set destination connector expects a stager, one is also set
88
+ if not self.uploader_step:
89
+ return
90
+ uploader_connector_type = self.uploader_step.process.connector_type
91
+ registry_entry = destination_registry[uploader_connector_type]
92
+ if registry_entry.upload_stager and self.stager_step is None:
93
+ raise ValueError(
94
+ f"pipeline with uploader type {self.uploader_step.process.__class__.__name__} "
95
+ f"expects a stager of type {registry_entry.upload_stager.__name__} "
96
+ f"but one was not set"
97
+ )
98
+
99
+ def cleanup(self):
100
+ pass
101
+
102
+ def log_statuses(self):
103
+ if status := self.context.status:
104
+ logger.error(f"{len(status)} failed documents:")
105
+ for k, v in status.items():
106
+ for kk, vv in v.items():
107
+ logger.error(f"{k}: [{kk}] {vv}")
108
+
109
+ def run(self):
110
+ try:
111
+ start_time = time()
112
+ self._run()
113
+ logger.info(f"Finished ingest process in {time() - start_time}s")
114
+ finally:
115
+ self.log_statuses()
116
+ self.cleanup()
117
+ if self.context.status:
118
+ raise PipelineError("Pipeline did not run successfully")
119
+
120
+ def clean_results(self, results: Optional[list[Union[Any, list[Any]]]]) -> Optional[list[Any]]:
121
+ if not results:
122
+ return None
123
+ results = [r for r in results if r]
124
+ flat = []
125
+ for r in results:
126
+ if isinstance(r, list):
127
+ flat.extend(r)
128
+ else:
129
+ flat.append(r)
130
+ final = [f for f in flat if f]
131
+ return final or None
132
+
133
+ def _run(self):
134
+ logger.info(
135
+ f"Running local pipline: {self} with configs: "
136
+ f"{sterilize_dict(self.context.to_dict(redact_sensitive=True))}"
137
+ )
138
+ if self.context.mp_supported:
139
+ manager = mp.Manager()
140
+ self.context.status = manager.dict()
141
+ else:
142
+ self.context.status = {}
143
+
144
+ # Index into data source
145
+ indices = self.indexer_step.run()
146
+ indices_inputs = [{"file_data_path": i} for i in indices]
147
+ if not indices_inputs:
148
+ return
149
+
150
+ # Download associated content to local file system
151
+ downloaded_data = self.downloader_step(indices_inputs)
152
+ downloaded_data = self.clean_results(results=downloaded_data)
153
+ if not downloaded_data:
154
+ return
155
+
156
+ # Run uncompress if available
157
+ if self.uncompress_step:
158
+ downloaded_data = self.uncompress_step(downloaded_data)
159
+ # Flatten list of lists
160
+ downloaded_data = self.clean_results(results=downloaded_data)
161
+
162
+ if not downloaded_data:
163
+ return
164
+
165
+ # Partition content
166
+ elements = self.partitioner_step(downloaded_data)
167
+ elements = self.clean_results(results=elements)
168
+ if not elements:
169
+ return
170
+
171
+ # Run element specific modifiers
172
+ for step in [self.chunker_step, self.embedder_step, self.stager_step]:
173
+ elements = step(elements) if step else elements
174
+ elements = self.clean_results(results=elements)
175
+ if not elements:
176
+ return
177
+
178
+ # Upload the final result
179
+ self.uploader_step(iterable=elements)
180
+
181
+ def __str__(self):
182
+ s = [str(self.indexer_step), str(self.downloader_step)]
183
+ if uncompress_step := self.uncompress_step:
184
+ s.append(str(uncompress_step))
185
+ s.append(str(self.partitioner_step))
186
+ if chunker_step := self.chunker_step:
187
+ s.append(str(chunker_step))
188
+ if embedder_step := self.embedder_step:
189
+ s.append(str(embedder_step))
190
+ if stager_step := self.stager_step:
191
+ s.append(str(stager_step))
192
+ s.append(str(self.uploader_step))
193
+ return " -> ".join(s)
194
+
195
+ @classmethod
196
+ def from_configs(
197
+ cls,
198
+ context: ProcessorConfig,
199
+ indexer_config: IndexerConfigT,
200
+ downloader_config: DownloaderConfigT,
201
+ source_connection_config: ConnectionConfig,
202
+ partitioner_config: PartitionerConfig,
203
+ chunker_config: Optional[ChunkerConfig] = None,
204
+ embedder_config: Optional[EmbedderConfig] = None,
205
+ destination_connection_config: Optional[ConnectionConfig] = None,
206
+ stager_config: Optional[UploadStagerConfigT] = None,
207
+ uploader_config: Optional[UploaderConfigT] = None,
208
+ ) -> "Pipeline":
209
+ # Get registry key based on indexer config
210
+ source_entry = {
211
+ k: v
212
+ for k, v in source_registry.items()
213
+ if isinstance(indexer_config, v.indexer_config)
214
+ and isinstance(downloader_config, v.downloader_config)
215
+ and isinstance(source_connection_config, v.connection_config)
216
+ }
217
+ if len(source_entry) > 1:
218
+ raise ValueError(
219
+ f"multiple entries found matching provided indexer, "
220
+ f"downloader and connection configs: {source_entry}"
221
+ )
222
+ if len(source_entry) != 1:
223
+ raise ValueError(
224
+ "no entry found in source registry with matching indexer, "
225
+ "downloader and connection configs"
226
+ )
227
+ source = list(source_entry.values())[0]
228
+ pipeline_kwargs = {
229
+ "context": context,
230
+ "indexer": source.indexer(
231
+ index_config=indexer_config, connection_config=source_connection_config
232
+ ),
233
+ "downloader": source.downloader(
234
+ download_config=downloader_config, connection_config=source_connection_config
235
+ ),
236
+ "partitioner": Partitioner(config=partitioner_config),
237
+ }
238
+ if chunker_config:
239
+ pipeline_kwargs["chunker"] = Chunker(config=chunker_config)
240
+ if embedder_config:
241
+ pipeline_kwargs["embedder"] = Embedder(config=embedder_config)
242
+ if not uploader_config:
243
+ return Pipeline(**pipeline_kwargs)
244
+
245
+ destination_entry = {
246
+ k: v
247
+ for k, v in destination_registry.items()
248
+ if isinstance(uploader_config, v.uploader_config)
249
+ }
250
+ if destination_connection_config:
251
+ destination_entry = {
252
+ k: v
253
+ for k, v in destination_entry.items()
254
+ if isinstance(destination_connection_config, v.connection_config)
255
+ }
256
+ if stager_config:
257
+ destination_entry = {
258
+ k: v
259
+ for k, v in destination_entry.items()
260
+ if isinstance(stager_config, v.upload_stager_config)
261
+ }
262
+
263
+ if len(destination_entry) > 1:
264
+ raise ValueError(
265
+ f"multiple entries found matching provided uploader, "
266
+ f"stager and connection configs: {destination_entry}"
267
+ )
268
+ if len(destination_entry) != 1:
269
+ raise ValueError(
270
+ "no entry found in source registry with matching uploader, "
271
+ "stager and connection configs"
272
+ )
273
+
274
+ destination = list(destination_entry.values())[0]
275
+ if stager_config:
276
+ pipeline_kwargs["stager"] = destination.upload_stager(
277
+ upload_stager_config=stager_config
278
+ )
279
+ if uploader_config:
280
+ uploader_kwargs = {"upload_config": uploader_config}
281
+ if destination_connection_config:
282
+ uploader_kwargs["connection_config"] = destination_connection_config
283
+ pipeline_kwargs["uploader"] = destination.uploader(**uploader_kwargs)
284
+ return cls(**pipeline_kwargs)
File without changes
@@ -0,0 +1,85 @@
1
+ import asyncio
2
+ import hashlib
3
+ import json
4
+ from dataclasses import dataclass
5
+ from pathlib import Path
6
+ from typing import Callable, Optional, TypedDict
7
+
8
+ from unstructured.staging.base import elements_to_dicts
9
+
10
+ from unstructured_ingest.v2.interfaces import FileData
11
+ from unstructured_ingest.v2.logger import logger
12
+ from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
13
+ from unstructured_ingest.v2.pipeline.utils import sterilize_dict
14
+ from unstructured_ingest.v2.processes.chunker import Chunker
15
+
16
+ STEP_ID = "chunk"
17
+
18
+
19
class ChunkStepResponse(TypedDict):
    """Result emitted by the chunk step for a single processed file."""

    # Path of the serialized FileData record the chunked content belongs to.
    file_data_path: str
    # Path of the JSON file holding the chunked elements.
    path: str
22
+
23
+
24
@dataclass
class ChunkStep(PipelineStep):
    """Pipeline step that runs a ``Chunker`` and stores its output as JSON.

    Output files are written into ``self.cache_dir`` under a name derived
    from the chunker configuration and the input file name, so an existing
    output can be reused instead of chunking the same input again.
    """

    process: Chunker
    identifier: str = STEP_ID

    def __str__(self):
        strategy = self.process.config.chunking_strategy
        return f"{self.identifier} ({strategy})"

    def __post_init__(self):
        # Log the configuration with sensitive values redacted.
        chunker_config = self.process.config
        config = None
        if chunker_config:
            config = sterilize_dict(chunker_config.to_dict(redact_sensitive=True))
        logger.info(f"Created {self.identifier} with configs: {config}")

    def should_chunk(self, filepath: Path, file_data: FileData) -> bool:
        """Return True when chunking must run for the given output path.

        Chunking runs when reprocessing is requested (on the step context or
        on the file data itself) or when the output file does not exist yet.
        """
        forced = self.context.reprocess or file_data.reprocess
        return True if forced else not filepath.exists()

    def get_output_filepath(self, filename: Path) -> Path:
        """Return the resolved output path for ``filename``, creating its parent."""
        digest = self.get_hash(extras=[filename.name])
        target = (self.cache_dir / f"{digest}.json").resolve()
        target.parent.mkdir(parents=True, exist_ok=True)
        return target

    def _save_output(self, output_filepath: str, chunked_content: list[dict]):
        """Write the chunked element dicts to ``output_filepath`` as indented JSON."""
        with open(str(output_filepath), "w") as out_file:
            logger.debug(f"Writing chunker output to: {output_filepath}")
            json.dump(chunked_content, out_file, indent=2)

    async def _run_async(
        self, fn: Callable, path: str, file_data_path: str, **kwargs
    ) -> ChunkStepResponse:
        """Chunk the elements file at ``path`` unless a usable output exists.

        ``fn`` may be a plain callable or a coroutine function; coroutine
        functions are awaited, under the context semaphore when one is set.
        """
        path = Path(path)
        file_data = FileData.from_file(path=file_data_path)
        output_filepath = self.get_output_filepath(filename=path)
        response = ChunkStepResponse(file_data_path=file_data_path, path=str(output_filepath))

        if not self.should_chunk(filepath=output_filepath, file_data=file_data):
            logger.debug(f"Skipping chunking, output already exists: {output_filepath}")
            return response

        fn_kwargs = {"elements_filepath": path}
        if asyncio.iscoroutinefunction(fn):
            semaphore = self.context.semaphore
            if semaphore:
                async with semaphore:
                    chunked_content_raw = await fn(**fn_kwargs)
            else:
                chunked_content_raw = await fn(**fn_kwargs)
        else:
            chunked_content_raw = fn(**fn_kwargs)

        self._save_output(
            output_filepath=str(output_filepath),
            chunked_content=elements_to_dicts(chunked_content_raw),
        )
        return response

    def get_hash(self, extras: Optional[list[str]]) -> str:
        """Return a 12-character SHA-256 prefix of the chunker config plus ``extras``."""
        parts = [json.dumps(self.process.config.to_dict(), sort_keys=True, ensure_ascii=True)]
        if extras:
            parts.extend(extras)
        return hashlib.sha256("".join(parts).encode()).hexdigest()[:12]
@@ -0,0 +1,124 @@
1
+ import asyncio
2
+ import hashlib
3
+ import json
4
+ from dataclasses import dataclass
5
+ from typing import Callable, Optional, TypedDict, TypeVar
6
+
7
+ from unstructured_ingest.v2.interfaces import FileData, download_responses
8
+ from unstructured_ingest.v2.interfaces.downloader import Downloader
9
+ from unstructured_ingest.v2.logger import logger
10
+ from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
11
+ from unstructured_ingest.v2.pipeline.utils import sterilize_dict
12
+
13
+ DownloaderT = TypeVar("DownloaderT", bound=Downloader)
14
+
15
+ STEP_ID = "download"
16
+
17
+
18
class DownloadStepResponse(TypedDict):
    """Result emitted by the download step for a single downloaded item."""

    # Path of the serialized FileData record describing the item.
    file_data_path: str
    # Local path of the downloaded content.
    path: str
21
+
22
+
23
@dataclass
class DownloadStep(PipelineStep):
    """Pipeline step that runs a ``Downloader`` for each indexed file record.

    A download is skipped when the target already exists locally and is not
    considered stale (see ``should_download``). Downloaders that return a list
    of results get a new file-data record persisted for each entry.
    """

    process: DownloaderT
    identifier: str = STEP_ID

    def __str__(self):
        return f"{self.identifier} ({self.process.__class__.__name__})"

    def __post_init__(self):
        # Log both configs with sensitive values redacted; either may be unset.
        download_config = self.process.download_config
        config = (
            sterilize_dict(download_config.to_dict(redact_sensitive=True))
            if download_config
            else None
        )
        conn = self.process.connection_config
        connection_config = (
            sterilize_dict(conn.to_dict(redact_sensitive=True)) if conn else None
        )
        logger.info(
            f"Created {self.identifier} with configs: {config}, "
            f"connection configs: {connection_config}"
        )

    @staticmethod
    def is_float(value: str):
        """Return True when ``value`` can be converted with ``float()``.

        ``TypeError`` is handled as well as ``ValueError`` so that a
        non-string, non-numeric value reports False instead of raising.
        """
        try:
            float(value)
        except (TypeError, ValueError):
            return False
        return True

    def should_download(self, file_data: FileData, file_data_path: str) -> bool:
        """Decide whether the file described by ``file_data`` must be downloaded.

        Returns True when re-download is forced on the context, when there is
        no existing download path, or when the modification-time check below
        triggers. In the last case the file data is also flagged for
        reprocessing and written back to ``file_data_path``.
        """
        if self.context.re_download:
            return True
        download_path = self.process.get_download_path(file_data=file_data)
        if not download_path or not download_path.exists():
            return True
        date_modified = file_data.metadata.date_modified
        # NOTE(review): this triggers when the local mtime is later than the
        # recorded date_modified; confirm the comparison direction is intended.
        if (
            download_path.is_file()
            and date_modified
            and self.is_float(date_modified)
            and download_path.stat().st_mtime > float(date_modified)
        ):
            # Also update file data to mark this to reprocess since this won't change the filename
            file_data.reprocess = True
            file_data.to_file(path=file_data_path)
            return True
        return False

    async def _run_async(self, fn: Callable, file_data_path: str) -> list[DownloadStepResponse]:
        """Download the file referenced by ``file_data_path`` unless it can be skipped.

        ``fn`` may be a plain callable or a coroutine function; coroutine
        functions are awaited, under the context semaphore when one is set.
        """
        file_data = FileData.from_file(path=file_data_path)
        download_path = self.process.get_download_path(file_data=file_data)
        if not self.should_download(file_data=file_data, file_data_path=file_data_path):
            logger.debug(f"Skipping download, file already exists locally: {download_path}")
            return [DownloadStepResponse(file_data_path=file_data_path, path=str(download_path))]
        fn_kwargs = {"file_data": file_data}
        if not asyncio.iscoroutinefunction(fn):
            download_results = fn(**fn_kwargs)
        elif semaphore := self.context.semaphore:
            async with semaphore:
                download_results = await fn(**fn_kwargs)
        else:
            download_results = await fn(**fn_kwargs)
        return self.create_step_results(
            current_file_data_path=file_data_path, download_results=download_results
        )

    def create_step_results(
        self, current_file_data_path: str, download_results: download_responses
    ) -> list[DownloadStepResponse]:
        """Convert downloader output into step responses.

        A single result reuses ``current_file_data_path``. A list of results
        gets a newly persisted file-data record per entry. ``path`` is always
        stored as ``str`` so both branches match ``DownloadStepResponse``.
        """
        if not isinstance(download_results, list):
            return [
                DownloadStepResponse(
                    file_data_path=current_file_data_path, path=str(download_results["path"])
                )
            ]
        # Supplemental results generated as part of the download process
        return [
            DownloadStepResponse(
                file_data_path=self.persist_new_file_data(file_data=res["file_data"]),
                path=str(res["path"]),
            )
            for res in download_results
        ]

    def persist_new_file_data(self, file_data: FileData) -> str:
        """Write ``file_data`` as JSON into the cache dir and return the file path."""
        record_hash = self.get_hash(extras=[file_data.identifier])
        filepath = (self.cache_dir / f"{record_hash}.json").resolve()
        filepath.parent.mkdir(parents=True, exist_ok=True)
        with open(str(filepath), "w") as f:
            json.dump(file_data.to_dict(), f, indent=2)
        return str(filepath)

    def get_hash(self, extras: Optional[list[str]]) -> str:
        """Return a 12-character SHA-256 prefix of the download config plus ``extras``.

        ``__post_init__`` treats ``download_config`` as optional, so an unset
        config is hashed as an empty dict instead of raising.
        """
        download_config = self.process.download_config
        config_dict = sterilize_dict(download_config.to_dict()) if download_config else {}
        hashable_string = json.dumps(config_dict, sort_keys=True)
        if extras:
            hashable_string += "".join(extras)
        return hashlib.sha256(hashable_string.encode()).hexdigest()[:12]
@@ -0,0 +1,84 @@
1
+ import asyncio
2
+ import hashlib
3
+ import json
4
+ from dataclasses import dataclass
5
+ from pathlib import Path
6
+ from typing import Callable, Optional, TypedDict
7
+
8
+ from unstructured.staging.base import elements_to_dicts
9
+
10
+ from unstructured_ingest.v2.interfaces import FileData
11
+ from unstructured_ingest.v2.logger import logger
12
+ from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
13
+ from unstructured_ingest.v2.pipeline.utils import sterilize_dict
14
+ from unstructured_ingest.v2.processes.embedder import Embedder
15
+
16
+ STEP_ID = "embed"
17
+
18
+
19
class EmbedStepResponse(TypedDict):
    """Result emitted by the embed step for a single processed file."""

    # Path of the serialized FileData record the embedded content belongs to.
    file_data_path: str
    # Path of the JSON file holding the embedded elements.
    path: str
22
+
23
+
24
@dataclass
class EmbedStep(PipelineStep):
    """Pipeline step that runs an ``Embedder`` and stores its output as JSON.

    Output files are written into ``self.cache_dir`` under a name derived
    from the embedder configuration and the input file name, so an existing
    output can be reused instead of embedding the same input again.
    """

    process: Embedder
    identifier: str = STEP_ID

    def __str__(self):
        provider = self.process.config.embedding_provider
        return f"{self.identifier} ({provider})"

    def __post_init__(self):
        # Log the configuration with sensitive values redacted.
        embedder_config = self.process.config
        config = None
        if embedder_config:
            config = sterilize_dict(embedder_config.to_dict(redact_sensitive=True))
        logger.info(f"Created {self.identifier} with configs: {config}")

    def should_embed(self, filepath: Path, file_data: FileData) -> bool:
        """Return True when embedding must run for the given output path.

        Embedding runs when reprocessing is requested (on the step context or
        on the file data itself) or when the output file does not exist yet.
        """
        forced = self.context.reprocess or file_data.reprocess
        return True if forced else not filepath.exists()

    def get_output_filepath(self, filename: Path) -> Path:
        """Return the resolved output path for ``filename``, creating its parent."""
        digest = self.get_hash(extras=[filename.name])
        target = (self.cache_dir / f"{digest}.json").resolve()
        target.parent.mkdir(parents=True, exist_ok=True)
        return target

    def _save_output(self, output_filepath: str, embedded_content: list[dict]):
        """Write the embedded element dicts to ``output_filepath`` as indented JSON."""
        with open(str(output_filepath), "w") as out_file:
            logger.debug(f"Writing embedded output to: {output_filepath}")
            json.dump(embedded_content, out_file, indent=2)

    async def _run_async(self, fn: Callable, path: str, file_data_path: str) -> EmbedStepResponse:
        """Embed the elements file at ``path`` unless a usable output exists.

        ``fn`` may be a plain callable or a coroutine function; coroutine
        functions are awaited, under the context semaphore when one is set.
        """
        path = Path(path)
        file_data = FileData.from_file(path=file_data_path)
        output_filepath = self.get_output_filepath(filename=path)
        response = EmbedStepResponse(file_data_path=file_data_path, path=str(output_filepath))

        if not self.should_embed(filepath=output_filepath, file_data=file_data):
            logger.debug(f"Skipping embedding, output already exists: {output_filepath}")
            return response

        fn_kwargs = {"elements_filepath": path}
        if asyncio.iscoroutinefunction(fn):
            semaphore = self.context.semaphore
            if semaphore:
                async with semaphore:
                    embed_content_raw = await fn(**fn_kwargs)
            else:
                embed_content_raw = await fn(**fn_kwargs)
        else:
            embed_content_raw = fn(**fn_kwargs)

        self._save_output(
            output_filepath=str(output_filepath),
            embedded_content=elements_to_dicts(embed_content_raw),
        )
        return response

    def get_hash(self, extras: Optional[list[str]]) -> str:
        """Return a 12-character SHA-256 prefix of the embedder config plus ``extras``."""
        parts = [json.dumps(self.process.config.to_dict(), sort_keys=True, ensure_ascii=True)]
        if extras:
            parts.extend(extras)
        return hashlib.sha256("".join(parts).encode()).hexdigest()[:12]
@@ -0,0 +1,61 @@
1
+ import hashlib
2
+ import json
3
+ from dataclasses import dataclass
4
+ from typing import Generator, Optional, TypeVar
5
+
6
+ from unstructured_ingest.v2.interfaces.indexer import Indexer
7
+ from unstructured_ingest.v2.logger import logger
8
+ from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
9
+ from unstructured_ingest.v2.pipeline.utils import sterilize_dict
10
+
11
+ IndexerT = TypeVar("IndexerT", bound=Indexer)
12
+
13
+ STEP_ID = "index"
14
+
15
+
16
@dataclass
class IndexStep(PipelineStep):
    """Pipeline step that runs an ``Indexer`` and persists each record it yields.

    Every file-data record produced by the indexer is written as JSON into
    ``self.cache_dir``, and the path of that JSON file is yielded.
    """

    process: IndexerT
    identifier: str = STEP_ID

    def __str__(self):
        return f"{self.identifier} ({self.process.__class__.__name__})"

    def __post_init__(self):
        # Log both configs with sensitive values redacted; either may be unset.
        index_config = self.process.index_config
        config = (
            sterilize_dict(index_config.to_dict(redact_sensitive=True)) if index_config else None
        )
        conn = self.process.connection_config
        connection_config = (
            sterilize_dict(conn.to_dict(redact_sensitive=True)) if conn else None
        )
        logger.info(
            f"Created {self.identifier} with configs: {config}, "
            f"connection configs: {connection_config}"
        )

    def run(self) -> Generator[str, None, None]:
        """Yield the path of a JSON file for each file-data record indexed.

        A record that fails to persist is logged and skipped, unless
        ``self.context.raise_on_error`` is set, in which case the original
        exception is re-raised.
        """
        for file_data in self.process.run():
            logger.debug(f"Generated file data: {file_data.to_dict()}")
            try:
                record_hash = self.get_hash(extras=[file_data.identifier])
                filepath = (self.cache_dir / f"{record_hash}.json").resolve()
                filepath.parent.mkdir(parents=True, exist_ok=True)
                with open(str(filepath), "w") as f:
                    json.dump(file_data.to_dict(), f, indent=2)
                yield str(filepath)
            except Exception:
                logger.error(f"failed to create index for file data: {file_data}", exc_info=True)
                if self.context.raise_on_error:
                    # Bare raise re-raises the active exception unchanged.
                    raise

    def get_hash(self, extras: Optional[list[str]]) -> str:
        """Return a 12-character SHA-256 prefix of the index config plus ``extras``.

        ``__post_init__`` treats ``index_config`` as optional, so an unset
        config is hashed as an empty dict instead of raising for every
        record. Hashes for a set config are unchanged.
        """
        index_config = self.process.index_config
        hashable_string = json.dumps(index_config.to_dict() if index_config else {})
        if extras:
            hashable_string += "".join(extras)
        return hashlib.sha256(hashable_string.encode()).hexdigest()[:12]