unstructured-ingest 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (356) hide show
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/__init__.py +14 -0
  4. unstructured_ingest/cli/base/__init__.py +0 -0
  5. unstructured_ingest/cli/base/cmd.py +19 -0
  6. unstructured_ingest/cli/base/dest.py +87 -0
  7. unstructured_ingest/cli/base/src.py +57 -0
  8. unstructured_ingest/cli/cli.py +32 -0
  9. unstructured_ingest/cli/cmd_factory.py +12 -0
  10. unstructured_ingest/cli/cmds/__init__.py +145 -0
  11. unstructured_ingest/cli/cmds/airtable.py +69 -0
  12. unstructured_ingest/cli/cmds/astra.py +99 -0
  13. unstructured_ingest/cli/cmds/azure_cognitive_search.py +65 -0
  14. unstructured_ingest/cli/cmds/biomed.py +52 -0
  15. unstructured_ingest/cli/cmds/chroma.py +104 -0
  16. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  17. unstructured_ingest/cli/cmds/confluence.py +69 -0
  18. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  19. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  20. unstructured_ingest/cli/cmds/discord.py +47 -0
  21. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  22. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  23. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  24. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  25. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  26. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  27. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  28. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  29. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  30. unstructured_ingest/cli/cmds/github.py +54 -0
  31. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  32. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  33. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  34. unstructured_ingest/cli/cmds/jira.py +71 -0
  35. unstructured_ingest/cli/cmds/kafka.py +102 -0
  36. unstructured_ingest/cli/cmds/local.py +43 -0
  37. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  38. unstructured_ingest/cli/cmds/notion.py +48 -0
  39. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  40. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  41. unstructured_ingest/cli/cmds/outlook.py +67 -0
  42. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  43. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  44. unstructured_ingest/cli/cmds/reddit.py +67 -0
  45. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  46. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  47. unstructured_ingest/cli/cmds/slack.py +56 -0
  48. unstructured_ingest/cli/cmds/sql.py +66 -0
  49. unstructured_ingest/cli/cmds/vectara.py +66 -0
  50. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  51. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  52. unstructured_ingest/cli/common.py +7 -0
  53. unstructured_ingest/cli/interfaces.py +656 -0
  54. unstructured_ingest/cli/utils.py +205 -0
  55. unstructured_ingest/connector/__init__.py +0 -0
  56. unstructured_ingest/connector/airtable.py +309 -0
  57. unstructured_ingest/connector/astra.py +237 -0
  58. unstructured_ingest/connector/azure_cognitive_search.py +144 -0
  59. unstructured_ingest/connector/biomed.py +313 -0
  60. unstructured_ingest/connector/chroma.py +158 -0
  61. unstructured_ingest/connector/clarifai.py +122 -0
  62. unstructured_ingest/connector/confluence.py +285 -0
  63. unstructured_ingest/connector/databricks_volumes.py +137 -0
  64. unstructured_ingest/connector/delta_table.py +203 -0
  65. unstructured_ingest/connector/discord.py +180 -0
  66. unstructured_ingest/connector/elasticsearch.py +396 -0
  67. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  68. unstructured_ingest/connector/fsspec/azure.py +78 -0
  69. unstructured_ingest/connector/fsspec/box.py +109 -0
  70. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  71. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  72. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  73. unstructured_ingest/connector/fsspec/s3.py +62 -0
  74. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  75. unstructured_ingest/connector/git.py +124 -0
  76. unstructured_ingest/connector/github.py +173 -0
  77. unstructured_ingest/connector/gitlab.py +142 -0
  78. unstructured_ingest/connector/google_drive.py +349 -0
  79. unstructured_ingest/connector/hubspot.py +278 -0
  80. unstructured_ingest/connector/jira.py +469 -0
  81. unstructured_ingest/connector/kafka.py +294 -0
  82. unstructured_ingest/connector/local.py +139 -0
  83. unstructured_ingest/connector/mongodb.py +285 -0
  84. unstructured_ingest/connector/notion/__init__.py +0 -0
  85. unstructured_ingest/connector/notion/client.py +233 -0
  86. unstructured_ingest/connector/notion/connector.py +468 -0
  87. unstructured_ingest/connector/notion/helpers.py +584 -0
  88. unstructured_ingest/connector/notion/interfaces.py +32 -0
  89. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  90. unstructured_ingest/connector/notion/types/block.py +95 -0
  91. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  92. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  93. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  94. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  95. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  96. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  97. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  98. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  99. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  100. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  101. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  102. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  103. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  104. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  105. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  106. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  107. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  108. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  109. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  110. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  111. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  112. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  113. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  114. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  115. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  116. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  117. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  118. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  119. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  120. unstructured_ingest/connector/notion/types/database.py +72 -0
  121. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  122. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  123. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  124. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  125. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  126. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  127. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  128. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  129. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  130. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  131. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  132. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  133. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  134. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  135. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  136. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  137. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  138. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  139. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  140. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  141. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  142. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  143. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  144. unstructured_ingest/connector/notion/types/date.py +26 -0
  145. unstructured_ingest/connector/notion/types/file.py +51 -0
  146. unstructured_ingest/connector/notion/types/page.py +44 -0
  147. unstructured_ingest/connector/notion/types/parent.py +66 -0
  148. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  149. unstructured_ingest/connector/notion/types/user.py +76 -0
  150. unstructured_ingest/connector/onedrive.py +232 -0
  151. unstructured_ingest/connector/opensearch.py +218 -0
  152. unstructured_ingest/connector/outlook.py +285 -0
  153. unstructured_ingest/connector/pinecone.py +140 -0
  154. unstructured_ingest/connector/qdrant.py +144 -0
  155. unstructured_ingest/connector/reddit.py +166 -0
  156. unstructured_ingest/connector/registry.py +109 -0
  157. unstructured_ingest/connector/salesforce.py +301 -0
  158. unstructured_ingest/connector/sharepoint.py +573 -0
  159. unstructured_ingest/connector/slack.py +224 -0
  160. unstructured_ingest/connector/sql.py +199 -0
  161. unstructured_ingest/connector/vectara.py +248 -0
  162. unstructured_ingest/connector/weaviate.py +190 -0
  163. unstructured_ingest/connector/wikipedia.py +208 -0
  164. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  165. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  166. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  167. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  168. unstructured_ingest/error.py +49 -0
  169. unstructured_ingest/evaluate.py +338 -0
  170. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  171. unstructured_ingest/ingest_backoff/_common.py +102 -0
  172. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  173. unstructured_ingest/interfaces.py +838 -0
  174. unstructured_ingest/logger.py +130 -0
  175. unstructured_ingest/main.py +11 -0
  176. unstructured_ingest/pipeline/__init__.py +22 -0
  177. unstructured_ingest/pipeline/copy.py +19 -0
  178. unstructured_ingest/pipeline/doc_factory.py +12 -0
  179. unstructured_ingest/pipeline/interfaces.py +265 -0
  180. unstructured_ingest/pipeline/partition.py +60 -0
  181. unstructured_ingest/pipeline/permissions.py +12 -0
  182. unstructured_ingest/pipeline/pipeline.py +117 -0
  183. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  184. unstructured_ingest/pipeline/reformat/chunking.py +130 -0
  185. unstructured_ingest/pipeline/reformat/embedding.py +66 -0
  186. unstructured_ingest/pipeline/source.py +77 -0
  187. unstructured_ingest/pipeline/utils.py +6 -0
  188. unstructured_ingest/pipeline/write.py +18 -0
  189. unstructured_ingest/processor.py +93 -0
  190. unstructured_ingest/runner/__init__.py +104 -0
  191. unstructured_ingest/runner/airtable.py +35 -0
  192. unstructured_ingest/runner/astra.py +34 -0
  193. unstructured_ingest/runner/base_runner.py +89 -0
  194. unstructured_ingest/runner/biomed.py +45 -0
  195. unstructured_ingest/runner/confluence.py +35 -0
  196. unstructured_ingest/runner/delta_table.py +34 -0
  197. unstructured_ingest/runner/discord.py +35 -0
  198. unstructured_ingest/runner/elasticsearch.py +40 -0
  199. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  200. unstructured_ingest/runner/fsspec/azure.py +30 -0
  201. unstructured_ingest/runner/fsspec/box.py +28 -0
  202. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  203. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  204. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  205. unstructured_ingest/runner/fsspec/s3.py +28 -0
  206. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  207. unstructured_ingest/runner/github.py +37 -0
  208. unstructured_ingest/runner/gitlab.py +37 -0
  209. unstructured_ingest/runner/google_drive.py +35 -0
  210. unstructured_ingest/runner/hubspot.py +35 -0
  211. unstructured_ingest/runner/jira.py +35 -0
  212. unstructured_ingest/runner/kafka.py +34 -0
  213. unstructured_ingest/runner/local.py +23 -0
  214. unstructured_ingest/runner/mongodb.py +34 -0
  215. unstructured_ingest/runner/notion.py +61 -0
  216. unstructured_ingest/runner/onedrive.py +35 -0
  217. unstructured_ingest/runner/opensearch.py +40 -0
  218. unstructured_ingest/runner/outlook.py +33 -0
  219. unstructured_ingest/runner/reddit.py +35 -0
  220. unstructured_ingest/runner/salesforce.py +33 -0
  221. unstructured_ingest/runner/sharepoint.py +35 -0
  222. unstructured_ingest/runner/slack.py +33 -0
  223. unstructured_ingest/runner/utils.py +47 -0
  224. unstructured_ingest/runner/wikipedia.py +35 -0
  225. unstructured_ingest/runner/writers/__init__.py +48 -0
  226. unstructured_ingest/runner/writers/astra.py +22 -0
  227. unstructured_ingest/runner/writers/azure_cognitive_search.py +24 -0
  228. unstructured_ingest/runner/writers/base_writer.py +26 -0
  229. unstructured_ingest/runner/writers/chroma.py +22 -0
  230. unstructured_ingest/runner/writers/clarifai.py +19 -0
  231. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  232. unstructured_ingest/runner/writers/delta_table.py +24 -0
  233. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  234. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  235. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  236. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  237. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  238. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  239. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  240. unstructured_ingest/runner/writers/kafka.py +21 -0
  241. unstructured_ingest/runner/writers/mongodb.py +21 -0
  242. unstructured_ingest/runner/writers/opensearch.py +26 -0
  243. unstructured_ingest/runner/writers/pinecone.py +21 -0
  244. unstructured_ingest/runner/writers/qdrant.py +19 -0
  245. unstructured_ingest/runner/writers/sql.py +22 -0
  246. unstructured_ingest/runner/writers/vectara.py +22 -0
  247. unstructured_ingest/runner/writers/weaviate.py +21 -0
  248. unstructured_ingest/utils/__init__.py +0 -0
  249. unstructured_ingest/utils/compression.py +117 -0
  250. unstructured_ingest/utils/data_prep.py +112 -0
  251. unstructured_ingest/utils/dep_check.py +66 -0
  252. unstructured_ingest/utils/string_and_date_utils.py +39 -0
  253. unstructured_ingest/utils/table.py +73 -0
  254. unstructured_ingest/v2/__init__.py +1 -0
  255. unstructured_ingest/v2/cli/__init__.py +0 -0
  256. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  257. unstructured_ingest/v2/cli/base/cmd.py +215 -0
  258. unstructured_ingest/v2/cli/base/dest.py +76 -0
  259. unstructured_ingest/v2/cli/base/importer.py +34 -0
  260. unstructured_ingest/v2/cli/base/src.py +70 -0
  261. unstructured_ingest/v2/cli/cli.py +24 -0
  262. unstructured_ingest/v2/cli/cmds/__init__.py +87 -0
  263. unstructured_ingest/v2/cli/cmds/astra.py +85 -0
  264. unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +72 -0
  265. unstructured_ingest/v2/cli/cmds/chroma.py +108 -0
  266. unstructured_ingest/v2/cli/cmds/databricks_volumes.py +161 -0
  267. unstructured_ingest/v2/cli/cmds/elasticsearch.py +159 -0
  268. unstructured_ingest/v2/cli/cmds/fsspec/__init__.py +0 -0
  269. unstructured_ingest/v2/cli/cmds/fsspec/azure.py +84 -0
  270. unstructured_ingest/v2/cli/cmds/fsspec/box.py +58 -0
  271. unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +58 -0
  272. unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +77 -0
  273. unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +81 -0
  274. unstructured_ingest/v2/cli/cmds/fsspec/s3.py +84 -0
  275. unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +80 -0
  276. unstructured_ingest/v2/cli/cmds/google_drive.py +74 -0
  277. unstructured_ingest/v2/cli/cmds/local.py +60 -0
  278. unstructured_ingest/v2/cli/cmds/mongodb.py +62 -0
  279. unstructured_ingest/v2/cli/cmds/onedrive.py +91 -0
  280. unstructured_ingest/v2/cli/cmds/opensearch.py +93 -0
  281. unstructured_ingest/v2/cli/cmds/pinecone.py +62 -0
  282. unstructured_ingest/v2/cli/cmds/salesforce.py +79 -0
  283. unstructured_ingest/v2/cli/cmds/sharepoint.py +112 -0
  284. unstructured_ingest/v2/cli/cmds/singlestore.py +96 -0
  285. unstructured_ingest/v2/cli/cmds/sql.py +84 -0
  286. unstructured_ingest/v2/cli/cmds/weaviate.py +100 -0
  287. unstructured_ingest/v2/cli/configs/__init__.py +6 -0
  288. unstructured_ingest/v2/cli/configs/chunk.py +89 -0
  289. unstructured_ingest/v2/cli/configs/embed.py +74 -0
  290. unstructured_ingest/v2/cli/configs/partition.py +99 -0
  291. unstructured_ingest/v2/cli/configs/processor.py +88 -0
  292. unstructured_ingest/v2/cli/interfaces.py +27 -0
  293. unstructured_ingest/v2/cli/utils.py +240 -0
  294. unstructured_ingest/v2/example.py +37 -0
  295. unstructured_ingest/v2/interfaces/__init__.py +29 -0
  296. unstructured_ingest/v2/interfaces/connector.py +32 -0
  297. unstructured_ingest/v2/interfaces/downloader.py +79 -0
  298. unstructured_ingest/v2/interfaces/file_data.py +49 -0
  299. unstructured_ingest/v2/interfaces/indexer.py +28 -0
  300. unstructured_ingest/v2/interfaces/process.py +20 -0
  301. unstructured_ingest/v2/interfaces/processor.py +48 -0
  302. unstructured_ingest/v2/interfaces/upload_stager.py +48 -0
  303. unstructured_ingest/v2/interfaces/uploader.py +39 -0
  304. unstructured_ingest/v2/logger.py +126 -0
  305. unstructured_ingest/v2/main.py +11 -0
  306. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  307. unstructured_ingest/v2/pipeline/interfaces.py +167 -0
  308. unstructured_ingest/v2/pipeline/pipeline.py +284 -0
  309. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  310. unstructured_ingest/v2/pipeline/steps/chunk.py +85 -0
  311. unstructured_ingest/v2/pipeline/steps/download.py +124 -0
  312. unstructured_ingest/v2/pipeline/steps/embed.py +84 -0
  313. unstructured_ingest/v2/pipeline/steps/index.py +61 -0
  314. unstructured_ingest/v2/pipeline/steps/partition.py +78 -0
  315. unstructured_ingest/v2/pipeline/steps/stage.py +64 -0
  316. unstructured_ingest/v2/pipeline/steps/uncompress.py +68 -0
  317. unstructured_ingest/v2/pipeline/steps/upload.py +73 -0
  318. unstructured_ingest/v2/pipeline/utils.py +15 -0
  319. unstructured_ingest/v2/processes/__init__.py +0 -0
  320. unstructured_ingest/v2/processes/chunker.py +97 -0
  321. unstructured_ingest/v2/processes/connector_registry.py +63 -0
  322. unstructured_ingest/v2/processes/connectors/__init__.py +77 -0
  323. unstructured_ingest/v2/processes/connectors/astra.py +152 -0
  324. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +211 -0
  325. unstructured_ingest/v2/processes/connectors/chroma.py +204 -0
  326. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +96 -0
  327. unstructured_ingest/v2/processes/connectors/elasticsearch.py +401 -0
  328. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  329. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +144 -0
  330. unstructured_ingest/v2/processes/connectors/fsspec/box.py +131 -0
  331. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +130 -0
  332. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +342 -0
  333. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +141 -0
  334. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +164 -0
  335. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +166 -0
  336. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  337. unstructured_ingest/v2/processes/connectors/google_drive.py +335 -0
  338. unstructured_ingest/v2/processes/connectors/local.py +204 -0
  339. unstructured_ingest/v2/processes/connectors/mongodb.py +138 -0
  340. unstructured_ingest/v2/processes/connectors/onedrive.py +216 -0
  341. unstructured_ingest/v2/processes/connectors/opensearch.py +155 -0
  342. unstructured_ingest/v2/processes/connectors/pinecone.py +178 -0
  343. unstructured_ingest/v2/processes/connectors/salesforce.py +293 -0
  344. unstructured_ingest/v2/processes/connectors/sharepoint.py +412 -0
  345. unstructured_ingest/v2/processes/connectors/singlestore.py +160 -0
  346. unstructured_ingest/v2/processes/connectors/sql.py +269 -0
  347. unstructured_ingest/v2/processes/connectors/utils.py +19 -0
  348. unstructured_ingest/v2/processes/connectors/weaviate.py +235 -0
  349. unstructured_ingest/v2/processes/embedder.py +76 -0
  350. unstructured_ingest/v2/processes/partitioner.py +166 -0
  351. unstructured_ingest/v2/processes/uncompress.py +43 -0
  352. unstructured_ingest-0.0.0.dist-info/METADATA +319 -0
  353. unstructured_ingest-0.0.0.dist-info/RECORD +356 -0
  354. unstructured_ingest-0.0.0.dist-info/WHEEL +5 -0
  355. unstructured_ingest-0.0.0.dist-info/entry_points.txt +2 -0
  356. unstructured_ingest-0.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,838 @@
1
+ """Defines Abstract Base Classes (ABC's) core to batch processing documents
2
+ through Unstructured."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import functools
7
+ import json
8
+ import os
9
+ import re
10
+ from abc import ABC, abstractmethod
11
+ from dataclasses import InitVar, dataclass, field
12
+ from datetime import datetime
13
+ from pathlib import Path
14
+ from typing import Any, Optional, Type, TypeVar
15
+
16
+ from dataclasses_json import DataClassJsonMixin
17
+ from dataclasses_json.core import Json, _decode_dataclass
18
+ from unstructured.documents.elements import DataSourceMetadata
19
+ from unstructured.embed.interfaces import BaseEmbeddingEncoder, Element
20
+ from unstructured.partition.api import partition_via_api
21
+ from unstructured.staging.base import elements_to_dicts, flatten_dict
22
+
23
+ from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin, enhanced_field
24
+ from unstructured_ingest.enhanced_dataclass.core import _asdict
25
+ from unstructured_ingest.error import PartitionError, SourceConnectionError
26
+ from unstructured_ingest.logger import logger
27
+
28
# Type variable bound to DataClassJsonMixin (forward reference by name).
A = TypeVar("A", bound="DataClassJsonMixin")

# -- Using InitVar together with `from __future__ import annotations` raises a TypeError;
# -- giving InitVar a no-op __call__ resolves it.
# -- Background: https://stackoverflow.com/questions/70400639/
InitVar.__call__ = lambda *args: None  # type: ignore

# URL protocols that are accepted for remote fsspec locations.
SUPPORTED_REMOTE_FSSPEC_PROTOCOLS = [
    # amazon s3
    "s3",
    "s3a",
    # azure
    "abfs",
    "az",
    # google cloud storage
    "gs",
    "gcs",
    # other remote file systems
    "box",
    "dropbox",
    "sftp",
]
45
+
46
+
47
@dataclass
class BaseSessionHandle(ABC):
    """Abstract base for resources shared within a single process.

    A typical example is a connection used to make the requests that fetch documents.
    """
51
+
52
+
53
@dataclass
class BaseConfig(EnhancedDataClassJsonMixin, ABC):
    """Common abstract parent of the configuration dataclasses in this module.

    Declares no fields of its own; serialization support comes from
    EnhancedDataClassJsonMixin.
    """
56
+
57
+
58
@dataclass
class AccessConfig(BaseConfig):
    """Holder for access-specific settings.

    Intended as the designated place for any sensitive information that is
    associated with the other configs.
    """
62
+
63
+
64
@dataclass
class RetryStrategyConfig(BaseConfig):
    """Everything the backoff/retry decorator needs to read from `self` when an
    exception triggers a retry.

    Args:
        max_retries: Upper bound on the number of attempts. When the attempts are
            used up, the exception is allowed to escape. None (the default) means
            the number of tries is unlimited. If a callable is passed, it is
            evaluated at runtime and its return value is used.
        max_retry_time: Upper bound on the total time spent retrying. When it has
            expired, the exception is allowed to escape. If a callable is passed,
            it is evaluated at runtime and its return value is used.
    """

    max_retries: Optional[int] = None
    max_retry_time: Optional[float] = None
84
+
85
+
86
@dataclass
class PartitionConfig(BaseConfig):
    """Options for partitioning documents and for shaping the resulting elements.

    All fields have defaults, so the config can be constructed without arguments.
    """

    pdf_infer_table_structure: bool = False
    strategy: str = "auto"
    ocr_languages: Optional[list[str]] = None
    encoding: Optional[str] = None
    # Mutable defaults go through default_factory so instances never share state.
    additional_partition_args: dict[str, Any] = field(default_factory=dict)
    skip_infer_table_types: Optional[list[str]] = None
    fields_include: list[str] = field(
        default_factory=lambda: ["element_id", "text", "type", "metadata", "embeddings"],
    )
    flatten_metadata: bool = False
    metadata_exclude: list[str] = field(default_factory=list)
    metadata_include: list[str] = field(default_factory=list)
    partition_endpoint: Optional[str] = "https://api.unstructured.io/general/v0/general"
    partition_by_api: bool = False
    # Declared as a real sensitive field, the same way PermissionsConfig.client_cred is.
    # Wrapping enhanced_field(...) in str() made the default the repr of the Field
    # object (a non-empty string, so `or None` never applied) and discarded the
    # `sensitive` marker.
    api_key: Optional[str] = enhanced_field(default=None, sensitive=True)
    hi_res_model_name: Optional[str] = None
105
+
106
+
107
@dataclass
class ProcessorConfig(BaseConfig):
    """Settings for a processing run: verbosity, directories, parallelism, error policy."""

    reprocess: bool = False
    verbose: bool = False
    # Resolved once, when the class body is evaluated, relative to the user's home directory.
    work_dir: str = str(
        Path.home().joinpath(".cache", "unstructured", "ingest", "pipeline").resolve()
    )
    output_dir: str = "structured-output"
    num_processes: int = 2
    raise_on_error: bool = False
115
+
116
+
117
@dataclass
class FileStorageConfig(BaseConfig):
    """Location and traversal options for a remote file store.

    `remote_url` is the only required field.
    """

    remote_url: str
    uncompress: bool = False
    recursive: bool = False
    file_glob: Optional[list[str]] = None
123
+
124
+
125
@dataclass
class FsspecConfig(FileStorageConfig):
    """File storage config whose `remote_url` is split into fsspec-style components.

    `protocol`, `path_without_protocol`, `dir_path` and `file_path` are not constructor
    arguments; they are derived from `remote_url` in `__post_init__`.
    """

    access_config: Optional[AccessConfig] = None
    protocol: str = field(init=False)
    path_without_protocol: str = field(init=False)
    dir_path: str = field(init=False)
    file_path: str = field(init=False)

    def get_access_config(self) -> dict[str, Any]:
        """Return the access config as a dict, or an empty dict when none is set."""
        if self.access_config:
            return self.access_config.to_dict(apply_name_overload=False)
        else:
            return {}

    def __post_init__(self):
        """Derive protocol and path components from `remote_url`.

        Raises:
            ValueError: if `remote_url` has no "://" separator, if the protocol is not in
                SUPPORTED_REMOTE_FSSPEC_PROTOCOLS, or if the path part cannot be parsed.
        """
        # partition() splits on the first "://" only. A missing separator gets an explicit
        # message instead of an opaque tuple-unpacking error, and a later "://" inside the
        # path no longer breaks the split.
        protocol, separator, path_without_protocol = self.remote_url.partition("://")
        if not separator:
            raise ValueError(
                f"Invalid path {self.remote_url}. "
                f"Expected <protocol>://<dir-path>/<file-or-dir-path>.",
            )
        self.protocol = protocol
        self.path_without_protocol = path_without_protocol
        if self.protocol not in SUPPORTED_REMOTE_FSSPEC_PROTOCOLS:
            raise ValueError(
                f"Protocol {self.protocol} not supported yet, only "
                f"{SUPPORTED_REMOTE_FSSPEC_PROTOCOLS} are supported.",
            )

        # dropbox root is an empty string
        match = re.match(rf"{self.protocol}://([\s])/", self.remote_url)
        if match and self.protocol == "dropbox":
            self.dir_path = " "
            self.file_path = ""
            return

        # dropbox paths can start with slash
        match = re.match(rf"{self.protocol}:///([^/\s]+?)/([^\s]*)", self.remote_url)
        if match and self.protocol == "dropbox":
            self.dir_path = match.group(1)
            self.file_path = match.group(2) or ""
            return

        # just a path with no trailing prefix
        match = re.match(rf"{self.protocol}://([^/\s]+?)(/*)$", self.remote_url)
        if match:
            self.dir_path = match.group(1)
            self.file_path = ""
            return

        # valid path with a dir and/or file
        match = re.match(rf"{self.protocol}://([^/\s]+?)/([^\s]*)", self.remote_url)
        if not match:
            raise ValueError(
                f"Invalid path {self.remote_url}. "
                f"Expected <protocol>://<dir-path>/<file-or-dir-path>.",
            )
        self.dir_path = match.group(1)
        self.file_path = match.group(2) or ""
177
+
178
+
179
@dataclass
class ReadConfig(BaseConfig):
    """Options governing how source documents are downloaded before processing."""

    # Raw documents are stored here for processing; they are removed afterwards
    # unless preserve_downloads is set.
    download_dir: Optional[str] = ""
    re_download: bool = False
    preserve_downloads: bool = False
    download_only: bool = False
    max_docs: Optional[int] = None
187
+
188
+
189
@dataclass
class EmbeddingConfig(BaseConfig):
    """Selects an embedding provider and carries the settings needed to build its encoder."""

    provider: str
    # Declared as a real sensitive field. Wrapping enhanced_field(...) in str() made the
    # default the repr of the Field object, so `if self.api_key:` in get_embedder() was
    # always true and that repr string was forwarded as the API key.
    api_key: Optional[str] = enhanced_field(default=None, sensitive=True)
    model_name: Optional[str] = None
    aws_access_key_id: Optional[str] = None
    aws_secret_access_key: Optional[str] = None
    aws_region: Optional[str] = None

    def get_embedder(self) -> BaseEmbeddingEncoder:
        """Build the embedding encoder that matches `provider`.

        `api_key` and `model_name` are passed to the encoder config only when set.
        The Bedrock encoder is configured from the AWS fields instead. Provider
        modules are imported inside each branch, so only the selected one is loaded.

        Raises:
            ValueError: if `provider` is not one of the recognized names.
        """
        kwargs: dict[str, Any] = {}
        if self.api_key:
            kwargs["api_key"] = self.api_key
        if self.model_name:
            kwargs["model_name"] = self.model_name
        # TODO make this more dynamic to map to encoder configs
        if self.provider == "langchain-openai":
            from unstructured.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder

            return OpenAIEmbeddingEncoder(config=OpenAIEmbeddingConfig(**kwargs))
        elif self.provider == "langchain-huggingface":
            from unstructured.embed.huggingface import (
                HuggingFaceEmbeddingConfig,
                HuggingFaceEmbeddingEncoder,
            )

            return HuggingFaceEmbeddingEncoder(config=HuggingFaceEmbeddingConfig(**kwargs))
        elif self.provider == "octoai":
            from unstructured.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder

            return OctoAIEmbeddingEncoder(config=OctoAiEmbeddingConfig(**kwargs))
        elif self.provider == "langchain-aws-bedrock":
            from unstructured.embed.bedrock import BedrockEmbeddingConfig, BedrockEmbeddingEncoder

            return BedrockEmbeddingEncoder(
                config=BedrockEmbeddingConfig(
                    aws_access_key_id=self.aws_access_key_id,
                    aws_secret_access_key=self.aws_secret_access_key,
                    region_name=self.aws_region,
                )
            )
        elif self.provider == "langchain-vertexai":
            from unstructured.embed.vertexai import (
                VertexAIEmbeddingConfig,
                VertexAIEmbeddingEncoder,
            )

            return VertexAIEmbeddingEncoder(config=VertexAIEmbeddingConfig(**kwargs))
        elif self.provider == "langchain-voyageai":
            from unstructured.embed.voyageai import (
                VoyageAIEmbeddingConfig,
                VoyageAIEmbeddingEncoder,
            )

            return VoyageAIEmbeddingEncoder(config=VoyageAIEmbeddingConfig(**kwargs))
        else:
            raise ValueError(f"{self.provider} not a recognized encoder")
246
+
247
+
248
@dataclass
class ChunkingConfig(BaseConfig):
    """Chunking options.

    `chunk_elements` is an init-only flag: it is consumed by `__post_init__` and is
    not stored as a field on the instance.
    """

    chunk_elements: InitVar[bool] = False
    chunking_strategy: Optional[str] = None
    combine_text_under_n_chars: Optional[int] = None
    include_orig_elements: Optional[bool] = None
    max_characters: Optional[int] = None
    multipage_sections: Optional[bool] = None
    new_after_n_chars: Optional[int] = None
    overlap: Optional[int] = None
    overlap_all: Optional[bool] = None

    def __post_init__(self, chunk_elements: bool) -> None:
        """Fill in `chunking_strategy` when only `chunk_elements` was requested.

        With `chunk_elements` truthy and no strategy given, the strategy becomes
        'by_title'. In every other case the configured value is left untouched.
        """
        strategy_already_set = self.chunking_strategy is not None
        if not chunk_elements or strategy_already_set:
            return
        self.chunking_strategy = "by_title"
268
+
269
+
270
@dataclass
class PermissionsConfig(BaseConfig):
    """Settings for permissions handling: an application id, a tenant and a client credential.

    Every field is declared through `enhanced_field` with a `permissions_`-prefixed
    `overload_name`.
    """

    application_id: Optional[str] = enhanced_field(overload_name="permissions_application_id")
    tenant: Optional[str] = enhanced_field(overload_name="permissions_tenant")
    # Marked `sensitive=True`; the only field here declared with an explicit default (None).
    client_cred: Optional[str] = enhanced_field(
        default=None, sensitive=True, overload_name="permissions_client_cred"
    )
277
+
278
+
279
# Module-level slot for a session handle; it is initialised to None, i.e. no handle is stored
# at import time.
global_write_session_handle: Optional[BaseSessionHandle] = None
281
+
282
+
283
@dataclass
class WriteConfig(BaseConfig):
    """Write configuration base type; it declares no fields of its own."""

    pass
286
+
287
+
288
@dataclass
class BaseConnectorConfig(BaseConfig, ABC):
    """Abstract base class on which connector-specific configuration attributes are defined."""
291
+
292
+
293
@dataclass
class SourceMetadata(EnhancedDataClassJsonMixin, ABC):
    """Metadata describing a document on its source system; every field defaults to None."""

    # Date the document was created on the source system.
    date_created: Optional[str] = None
    # Date the document was last modified on the source system.
    date_modified: Optional[str] = None
    # Anything that identifies the version of the document (e.g. a modified date or a number).
    version: Optional[str] = None
    # URL of the source document.
    source_url: Optional[str] = None
    # Whether the document exists on the remote source.
    exists: Optional[bool] = None
    # Access control data (permissions / sharing) from the source system.
    permissions_data: Optional[list[dict[str, Any]]] = None
301
+
302
+
303
class IngestDocJsonMixin(EnhancedDataClassJsonMixin):
    """
    JSON mixin for single ingest docs that also serializes selected `@property` values.

    The dataclass-to-dict conversion only covers dataclass fields, so the property names
    listed below are read off the instance explicitly and added to the resulting dict.
    """

    # Added to the dict only when `_source_metadata` has already been populated.
    metadata_properties = [
        "date_created",
        "date_modified",
        "date_processed",
        "exists",
        "permissions_data",
        "version",
        "source_url",
    ]
    # Always added to the dict.
    properties_to_serialize = [
        "base_filename",
        "filename",
        "_output_filename",
        "record_locator",
        "_source_metadata",
        "unique_id",
    ]

    def add_props(self, as_dict: dict[str, Any], props: list[str]):
        """Copy each attribute named in `props` from this instance into `as_dict`.

        `Path` values are stored as strings and `DataClassJsonMixin` values as their
        `to_dict(encode_json=False)` form; anything else is stored unchanged.
        """
        for name in props:
            value = getattr(self, name)
            if isinstance(value, Path):
                value = str(value)
            elif isinstance(value, DataClassJsonMixin):
                value = value.to_dict(encode_json=False)
            as_dict[name] = value

    def to_dict(self, **kwargs) -> dict[str, Json]:
        """Return the dict form of this doc, without `_session_handle`, plus the extra props."""
        serialized = _asdict(self, **kwargs)
        serialized.pop("_session_handle", None)
        self.add_props(as_dict=serialized, props=self.properties_to_serialize)
        if self._source_metadata is not None:
            self.add_props(as_dict=serialized, props=self.metadata_properties)
        return serialized

    @classmethod
    def from_dict(
        cls: Type[A], kvs: Json, *, infer_missing=False, apply_name_overload: bool = True
    ) -> A:
        """Rebuild a doc from `kvs`, restoring `_source_metadata` and `_date_processed`.

        Both values are only restored when present and truthy in `kvs`.
        """
        doc = super().from_dict(
            kvs=kvs, infer_missing=infer_missing, apply_name_overload=apply_name_overload
        )
        meta = kvs.get("_source_metadata")
        if meta:
            doc._source_metadata = SourceMetadata.from_dict(meta)
        date_processed = kvs.get("_date_processed")
        if date_processed:
            doc._date_processed = date_processed
        return doc
358
+
359
+
360
class BatchIngestDocJsonMixin(EnhancedDataClassJsonMixin):
    """
    JSON mixin for batch ingest docs that also serializes selected `@property` values.

    The dataclass-to-dict conversion only covers dataclass fields, so the property names in
    `properties_to_serialize` are read off the instance explicitly and added to the dict.
    """

    properties_to_serialize = ["unique_id"]

    def add_props(self, as_dict: dict[str, Any], props: list[str]):
        """Copy each attribute named in `props` from this instance into `as_dict`.

        `Path` values are stored as strings and `DataClassJsonMixin` values as their
        `to_dict(encode_json=False)` form; anything else is stored unchanged.
        """
        for name in props:
            value = getattr(self, name)
            if isinstance(value, Path):
                value = str(value)
            elif isinstance(value, DataClassJsonMixin):
                value = value.to_dict(encode_json=False)
            as_dict[name] = value

    def to_dict(self, encode_json=False) -> dict[str, Json]:
        """Return the dict form of this batch with the extra properties added."""
        serialized = _asdict(self, encode_json=encode_json)
        self.add_props(as_dict=serialized, props=self.properties_to_serialize)
        return serialized

    @classmethod
    def from_dict(cls: Type[A], kvs: Json, *, infer_missing=False) -> A:
        """Decode `kvs` into an instance of `cls`."""
        return _decode_dataclass(cls, kvs, infer_missing)
387
+
388
+
389
@dataclass
class BaseIngestDoc(ABC):
    """Abstract base for ingest docs: holds the three configs and requires a `unique_id`."""

    processor_config: ProcessorConfig
    read_config: ReadConfig
    connector_config: BaseConnectorConfig

    @property
    @abstractmethod
    def unique_id(self) -> str:
        """Identifier of this doc; must be provided by subclasses."""
        pass
399
+
400
+
401
@dataclass
class BaseSingleIngestDoc(BaseIngestDoc, IngestDocJsonMixin, ABC):
    """An "ingest document" is specific to a connector, and provides
    methods to fetch a single raw document, store it locally for processing, any cleanup
    needed after successful processing of the doc, and the ability to write the doc's
    structured outputs once processed.

    Crucially, it is not responsible for the actual processing of the raw document.
    """

    _source_metadata: Optional[SourceMetadata] = field(init=False, default=None)
    _date_processed: Optional[str] = field(init=False, default=None)

    @property
    def source_metadata(self) -> SourceMetadata:
        """Source metadata, loaded on first access through `update_source_metadata()`.

        Raises:
            ValueError: if `update_source_metadata()` left `_source_metadata` unset.
        """
        if self._source_metadata is None:
            self.update_source_metadata()
        # Provide guarantee that the field was set by update_source_metadata()
        if self._source_metadata is None:
            raise ValueError("failed to set source metadata")
        return self._source_metadata

    @source_metadata.setter
    def source_metadata(self, value: SourceMetadata):
        self._source_metadata = value

    @property
    def date_created(self) -> Optional[str]:
        """The date the document was created on the source system."""
        return self.source_metadata.date_created

    @property
    def date_modified(self) -> Optional[str]:
        """The date the document was last modified on the source system."""
        return self.source_metadata.date_modified

    @property
    def date_processed(self) -> Optional[str]:
        """The date the document was last processed by Unstructured.

        `self._date_processed` is assigned internally in `self.process_file()`."""
        return self._date_processed

    @property
    def exists(self) -> Optional[bool]:
        """Whether the document exists on the remote source."""
        return self.source_metadata.exists

    @property
    @abstractmethod
    def filename(self):
        """The local filename of the document after fetching from remote source."""

    @property
    def base_filename(self) -> Optional[str]:
        """`filename` as a string with the resolved download dir removed.

        Returns None when no download dir is configured or `filename` is falsy.
        """
        if self.read_config.download_dir and self.filename:
            download_path = str(Path(self.read_config.download_dir).resolve())
            full_path = str(self.filename)
            base_path = full_path.replace(download_path, "")
            return base_path
        return None

    @property
    def base_output_filename(self) -> Optional[str]:
        """`_output_filename` as a string with the resolved output dir removed.

        Returns None when no output dir is configured or `_output_filename` is falsy.
        """
        if self.processor_config.output_dir and self._output_filename:
            output_path = str(Path(self.processor_config.output_dir).resolve())
            full_path = str(self._output_filename)
            base_path = full_path.replace(output_path, "")
            return base_path
        return None

    @property
    @abstractmethod
    def _output_filename(self):
        """Filename of the structured output for this doc."""

    @property
    def record_locator(self) -> Optional[dict[str, Any]]:  # Values must be JSON-serializable
        """A dictionary with any data necessary to uniquely identify the document on
        the source system."""
        return None

    @property
    def unique_id(self) -> str:
        """Identifier of this doc; the base implementation returns `filename`."""
        return self.filename

    @property
    def source_url(self) -> Optional[str]:
        """The url of the source document."""
        return self.source_metadata.source_url  # type: ignore

    @property
    def version(self) -> Optional[str]:
        """The version of the source document, this could be the last modified date, an
        explicit version number, or anything else that can be used to uniquely identify
        the version of the document."""
        return self.source_metadata.version  # type: ignore

    @property
    def permissions_data(self) -> Optional[list[dict[str, Any]]]:
        """Access control data, aka permissions or sharing, from the source system."""
        # `source_metadata` loads lazily and raises rather than returning None, so no extra
        # None check is required before reading from it.
        return self.source_metadata.permissions_data  # type: ignore

    @abstractmethod
    def cleanup_file(self):
        """Removes the local copy the file (or anything else) after successful processing."""

    @staticmethod
    def skip_if_file_exists(func):
        """Decorator that checks if a file exists, is not empty, and should not re-download,
        if so log a message indicating as much and skip the decorated function."""

        @functools.wraps(func)
        def wrapper(self, *args, **kwargs):
            if (
                not self.read_config.re_download
                and self.filename.is_file()
                and self.filename.stat().st_size
            ):
                logger.debug(f"File exists: {self.filename}, skipping {func.__name__}")
                return None
            return func(self, *args, **kwargs)

        return wrapper

    # TODO: set as @abstractmethod and pass or raise NotImplementedError
    def update_source_metadata(self, **kwargs) -> None:
        """Sets the SourceMetadata and the properties for the doc"""
        self._source_metadata = SourceMetadata()

    def update_permissions_data(self):
        """Sets the _permissions_data property for the doc.
        This property is later used to fill the corresponding SourceMetadata.permissions_data field,
        and after that carries on to the permissions_data property."""
        self._permissions_data: Optional[list[dict[str, Any]]] = None

    # NOTE(crag): Future BaseIngestDoc classes could define get_file_object() methods
    # in addition to or instead of get_file()
    @abstractmethod
    @SourceConnectionError.wrap
    def get_file(self):
        """Fetches the "remote" doc and stores it locally on the filesystem."""

    def has_output(self) -> bool:
        """Determine if structured output for this doc already exists (and is non-empty)."""
        # bool() so callers get the annotated type instead of the raw file size.
        return bool(self._output_filename.is_file() and self._output_filename.stat().st_size)

    @PartitionError.wrap
    def partition_file(
        self,
        partition_config: PartitionConfig,
        **partition_kwargs,
    ) -> list[Element]:
        """Partition the local file into elements, locally or through the partition API.

        The local path attaches this doc's data-source metadata to the call; the API path
        only forwards the filename, api key, endpoint and `partition_kwargs`.
        """
        from unstructured.partition.auto import partition

        if not partition_config.partition_by_api:
            logger.debug("Using local partition")
            elements = partition(
                filename=str(self.filename),
                data_source_metadata=DataSourceMetadata(
                    url=self.source_url,
                    version=self.version,
                    record_locator=self.record_locator,
                    date_created=self.date_created,
                    date_modified=self.date_modified,
                    date_processed=self.date_processed,
                    permissions_data=self.permissions_data,
                ),
                **partition_kwargs,
            )
        else:
            endpoint = partition_config.partition_endpoint

            logger.debug(f"Using remote partition ({endpoint})")

            elements = partition_via_api(
                filename=str(self.filename),
                api_key=partition_config.api_key,
                api_url=endpoint,
                **partition_kwargs,
            )
            # TODO: add m_data_source_metadata to unstructured-api pipeline_api and then
            # pass the stringified json here
        return elements

    @staticmethod
    def _pop_nested_field(elem: dict[str, Any], dotted_path: str) -> None:
        """Remove the field addressed by a dot-separated path from `elem`, if it exists.

        Nothing is removed when any intermediate key is missing or does not hold a dict, so
        a partially matching path can never delete a same-named field at a shallower level.
        """
        *parent_keys, leaf = dotted_path.split(".")
        current: Any = elem
        for key in parent_keys:
            if not isinstance(current, dict) or key not in current:
                return
            current = current[key]
        if isinstance(current, dict):
            current.pop(leaf, None)

    def process_file(
        self,
        partition_config: PartitionConfig,
        **partition_kwargs,
    ) -> Optional[list[dict[str, Any]]]:
        """Partition the doc and return its element dicts after metadata/field filtering.

        Steps: record the processing timestamp, partition the file, convert the elements to
        dicts, apply `metadata_exclude` or `metadata_include`, keep only the top-level keys
        listed in `fields_include`, and optionally flatten `metadata` into the element.
        The result is also stored on `self.isd_elems_no_filename`.

        Returns:
            The processed element dicts, or None when `read_config.download_only` is set.

        Raises:
            ValueError: if both `metadata_exclude` and `metadata_include` are configured.
        """
        # NOTE: datetime.utcnow() is deprecated since Python 3.12; it is kept because an
        # aware datetime would change the serialized string (adds a UTC offset suffix).
        self._date_processed = datetime.utcnow().isoformat()
        if self.read_config.download_only:
            return None

        # Validate once, before the (potentially expensive) partition call, instead of once
        # per element after partitioning has already been paid for.
        if partition_config.metadata_exclude and partition_config.metadata_include:
            raise ValueError(
                "Arguments `--metadata-include` and `--metadata-exclude` are "
                "mutually exclusive with each other.",
            )

        logger.info(f"Processing {self.filename}")

        elements = self.partition_file(partition_config=partition_config, **partition_kwargs)
        element_dicts = elements_to_dicts(elements)

        self.isd_elems_no_filename: list[dict[str, Any]] = []
        for elem in element_dicts:
            if partition_config.metadata_exclude:
                for ex in partition_config.metadata_exclude:
                    if "." in ex:  # handle nested fields
                        self._pop_nested_field(elem, ex)
                    else:  # handle top-level fields
                        elem["metadata"].pop(ex, None)  # type: ignore[attr-defined]
            elif partition_config.metadata_include:
                in_list = partition_config.metadata_include
                for k in list(elem["metadata"].keys()):  # type: ignore[attr-defined]
                    if k not in in_list:
                        elem["metadata"].pop(k, None)  # type: ignore[attr-defined]
            in_list = partition_config.fields_include
            elem = {k: v for k, v in elem.items() if k in in_list}

            if partition_config.flatten_metadata and "metadata" in elem:
                metadata = elem.pop("metadata")
                elem.update(flatten_dict(metadata, keys_to_omit=["data_source_record_locator"]))

            self.isd_elems_no_filename.append(elem)

        return self.isd_elems_no_filename
636
+
637
+
638
@dataclass
class BaseIngestDocBatch(BaseIngestDoc, BatchIngestDocJsonMixin, ABC):
    """A batch of single ingest docs that are fetched together through `get_files()`."""

    # Docs belonging to this batch; defaults to a fresh empty list per instance.
    ingest_docs: list[BaseSingleIngestDoc] = field(default_factory=list)

    @abstractmethod
    @SourceConnectionError.wrap
    def get_files(self):
        """Fetches the "remote" docs and stores them locally on the filesystem."""
646
+
647
+
648
@dataclass
class BaseConnector(EnhancedDataClassJsonMixin, ABC):
    """Abstract base shared by connectors; subclasses must implement `check_connection()`."""

    @abstractmethod
    def check_connection(self):
        """Connection check hook; no behaviour is defined at this level."""
        pass
653
+
654
+
655
@dataclass
class BaseSourceConnector(BaseConnector, ABC):
    """Abstract Base Class for a connector to a remote source, e.g. S3 or Google Drive."""

    processor_config: ProcessorConfig
    read_config: ReadConfig
    connector_config: BaseConnectorConfig

    @abstractmethod
    def cleanup(self, cur_dir=None):
        """Any additional cleanup needed after processing is complete. E.g., removing
        temporary download dirs that are empty.

        By convention, documents that failed to process are typically not cleaned up."""

    @abstractmethod
    def initialize(self):
        """Initializes the connector. Should also validate the connector is properly
        configured: e.g., list a single document from the source."""

    @abstractmethod
    def get_ingest_docs(self):
        """Returns all ingest docs (derived from BaseIngestDoc).
        This does not imply downloading all the raw documents themselves,
        rather each IngestDoc is capable of fetching its content (in another process)
        with IngestDoc.get_file()."""
681
+
682
+
683
@dataclass
class BaseDestinationConnector(BaseConnector, ABC):
    """Abstract base for destination connectors.

    Subclasses implement `initialize()` and `write_dict()`. `conform_dict()` and
    `normalize_dict()` are optional hooks that run on every element dict before it is
    handed to `write_dict()`.
    """

    write_config: WriteConfig
    connector_config: BaseConnectorConfig

    def __init__(self, write_config: WriteConfig, connector_config: BaseConnectorConfig):
        self.write_config = write_config
        self.connector_config = connector_config

    def conform_dict(self, data: dict[str, Any]) -> None:
        """
        Hook for when the original dictionary needs to be modified in place.

        The base implementation leaves `data` untouched.
        """
        return None

    def normalize_dict(self, element_dict: dict[str, Any]) -> dict[str, Any]:
        """
        Hook for when the original dictionary needs to be mapped to a new one.

        The base implementation returns `element_dict` itself.
        """
        return element_dict

    @abstractmethod
    def initialize(self):
        """Initializes the connector. Should also validate the connector is properly
        configured."""

    def write(self, docs: list[BaseSingleIngestDoc]) -> None:
        """Load the structured output of every doc and pass it through the write pipeline."""
        self.modify_and_write_dict(elements_dict=self.get_elements_dict(docs=docs))

    def get_elements_dict(self, docs: list[BaseSingleIngestDoc]) -> list[dict[str, Any]]:
        """Read each doc's output file as JSON and concatenate the contents into one list."""
        collected: list[dict[str, Any]] = []
        for doc in docs:
            output_path = doc._output_filename
            with open(output_path) as output_file:
                loaded = json.load(output_file)
            logger.info(
                f"Extending {len(loaded)} json elements from content in {output_path}",
            )
            collected.extend(loaded)
        return collected

    @abstractmethod
    def write_dict(self, *args, elements_dict: list[dict[str, Any]], **kwargs) -> None:
        """Write the given element dicts; must be provided by subclasses."""

    def modify_and_write_dict(self, *args, elements_dict: list[dict[str, Any]], **kwargs) -> None:
        """
        Run `conform_dict()` over every element, then `normalize_dict()` over every
        element, and finally hand the normalized list to `write_dict()`.
        """
        # All in-place conforming happens before any element is normalized.
        for element in elements_dict:
            self.conform_dict(data=element)
        normalized: list[dict[str, Any]] = []
        for element in elements_dict:
            normalized.append(self.normalize_dict(element_dict=element))
        return self.write_dict(*args, elements_dict=normalized, **kwargs)

    def write_elements(self, elements: list[Element], *args, **kwargs) -> None:
        """Convert the elements to dicts and pass them through the write pipeline."""
        as_dicts = [element.to_dict() for element in elements]
        self.modify_and_write_dict(*args, elements_dict=as_dicts, **kwargs)
742
+
743
+
744
class SourceConnectorCleanupMixin:
    """Mixin that prunes empty directories left under the download dir."""

    read_config: ReadConfig

    def cleanup(self, cur_dir=None):
        """Recursively remove empty directories below (and including) `cur_dir`.

        Nothing is done when downloads are preserved, when running in download-only mode,
        or when the directory to clean is unset or not a directory. Files are never
        deleted here; a directory is only removed once it contains no entries at all.

        Args:
            cur_dir: directory to clean; defaults to `read_config.download_dir`.
        """
        if self.read_config.preserve_downloads or self.read_config.download_only:
            return
        if cur_dir is None:
            cur_dir = self.read_config.download_dir
        if cur_dir is None:
            return
        cur_path = Path(cur_dir)
        if not cur_path.is_dir():
            return
        # Recurse with full paths instead of os.chdir(): changing the working directory is
        # process-wide, was never restored, and broke multi-component relative paths.
        for entry in os.listdir(cur_path):
            child = cur_path / entry
            # don't traverse symlinks, not that there ever should be any
            if child.is_dir() and not child.is_symlink():
                self.cleanup(child)
        if not os.listdir(cur_path):
            cur_path.rmdir()
764
+
765
+
766
class PermissionsCleanupMixin:
    """Mixin that deletes the permissions data written below the output dir."""

    processor_config: ProcessorConfig

    def cleanup_permissions(self, cur_dir=None):
        """Recursively delete permissions files and the directories that held them.

        A path that does not exist is ignored and a file is removed. For a directory,
        every non-symlink entry is cleaned recursively, then the directory itself is
        removed if it no longer contains any sub-directory.

        Args:
            cur_dir: path to clean; defaults to `<output_dir>/permissions_data`.

        Raises:
            OSError: if a directory without sub-directories still has entries left
                (e.g. a skipped symlink) when it is removed.
        """
        if cur_dir is None:
            cur_dir = Path(self.processor_config.output_dir, "permissions_data")
        cur_path = Path(cur_dir)
        if not cur_path.exists():
            return
        if cur_path.is_file():
            os.remove(cur_path)
            return
        # Recurse with full paths instead of os.chdir(): changing the working directory is
        # process-wide and was never restored to its starting value.
        for entry in os.listdir(cur_path):
            child = cur_path / entry
            # don't traverse symlinks, not that there ever should be any
            if not child.is_symlink():
                self.cleanup_permissions(child)
        has_folders = any((cur_path / entry).is_dir() for entry in os.listdir(cur_path))
        if not has_folders:
            os.rmdir(cur_path)
796
+
797
+
798
class IngestDocCleanupMixin:
    """Mixin providing `cleanup_file()` for docs that keep a local copy in `filename`."""

    read_config: ReadConfig

    @property
    @abstractmethod
    def filename(self):
        """The local filename of the document after fetching from remote source."""

    def cleanup_file(self):
        """Delete the local copy of the file once it has been processed.

        The file is kept when downloads are preserved, when `filename` is not an existing
        file, or when running in download-only mode.
        """
        if self.read_config.preserve_downloads:
            return
        if not self.filename.is_file():
            return
        if self.read_config.download_only:
            return
        logger.debug(f"Cleaning up {self}")
        os.unlink(self.filename)
815
+
816
+
817
class ConfigSessionHandleMixin:
    """Mixin for configs that can create a session handle via `create_session_handle()`."""

    @abstractmethod
    def create_session_handle(self) -> BaseSessionHandle:
        """Creates a session handle that will be assigned on each IngestDoc to share
        session related resources across all document handling for a given subprocess."""
822
+
823
+
824
@dataclass
class IngestDocSessionHandleMixin:
    """Mixin giving a doc a lazily created, replaceable `session_handle`."""

    connector_config: ConfigSessionHandleMixin
    _session_handle: Optional[BaseSessionHandle] = field(default=None, init=False)

    @property
    def session_handle(self):
        """The assigned session handle; one is created from the connector config and
        stored on first access when none has been assigned yet."""
        handle = self._session_handle
        if handle is None:
            handle = self.connector_config.create_session_handle()
            self._session_handle = handle
        return handle

    @session_handle.setter
    def session_handle(self, session_handle: BaseSessionHandle):
        self._session_handle = session_handle