unstructured-ingest 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (356) hide show
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/__init__.py +14 -0
  4. unstructured_ingest/cli/base/__init__.py +0 -0
  5. unstructured_ingest/cli/base/cmd.py +19 -0
  6. unstructured_ingest/cli/base/dest.py +87 -0
  7. unstructured_ingest/cli/base/src.py +57 -0
  8. unstructured_ingest/cli/cli.py +32 -0
  9. unstructured_ingest/cli/cmd_factory.py +12 -0
  10. unstructured_ingest/cli/cmds/__init__.py +145 -0
  11. unstructured_ingest/cli/cmds/airtable.py +69 -0
  12. unstructured_ingest/cli/cmds/astra.py +99 -0
  13. unstructured_ingest/cli/cmds/azure_cognitive_search.py +65 -0
  14. unstructured_ingest/cli/cmds/biomed.py +52 -0
  15. unstructured_ingest/cli/cmds/chroma.py +104 -0
  16. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  17. unstructured_ingest/cli/cmds/confluence.py +69 -0
  18. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  19. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  20. unstructured_ingest/cli/cmds/discord.py +47 -0
  21. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  22. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  23. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  24. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  25. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  26. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  27. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  28. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  29. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  30. unstructured_ingest/cli/cmds/github.py +54 -0
  31. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  32. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  33. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  34. unstructured_ingest/cli/cmds/jira.py +71 -0
  35. unstructured_ingest/cli/cmds/kafka.py +102 -0
  36. unstructured_ingest/cli/cmds/local.py +43 -0
  37. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  38. unstructured_ingest/cli/cmds/notion.py +48 -0
  39. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  40. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  41. unstructured_ingest/cli/cmds/outlook.py +67 -0
  42. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  43. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  44. unstructured_ingest/cli/cmds/reddit.py +67 -0
  45. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  46. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  47. unstructured_ingest/cli/cmds/slack.py +56 -0
  48. unstructured_ingest/cli/cmds/sql.py +66 -0
  49. unstructured_ingest/cli/cmds/vectara.py +66 -0
  50. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  51. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  52. unstructured_ingest/cli/common.py +7 -0
  53. unstructured_ingest/cli/interfaces.py +656 -0
  54. unstructured_ingest/cli/utils.py +205 -0
  55. unstructured_ingest/connector/__init__.py +0 -0
  56. unstructured_ingest/connector/airtable.py +309 -0
  57. unstructured_ingest/connector/astra.py +237 -0
  58. unstructured_ingest/connector/azure_cognitive_search.py +144 -0
  59. unstructured_ingest/connector/biomed.py +313 -0
  60. unstructured_ingest/connector/chroma.py +158 -0
  61. unstructured_ingest/connector/clarifai.py +122 -0
  62. unstructured_ingest/connector/confluence.py +285 -0
  63. unstructured_ingest/connector/databricks_volumes.py +137 -0
  64. unstructured_ingest/connector/delta_table.py +203 -0
  65. unstructured_ingest/connector/discord.py +180 -0
  66. unstructured_ingest/connector/elasticsearch.py +396 -0
  67. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  68. unstructured_ingest/connector/fsspec/azure.py +78 -0
  69. unstructured_ingest/connector/fsspec/box.py +109 -0
  70. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  71. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  72. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  73. unstructured_ingest/connector/fsspec/s3.py +62 -0
  74. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  75. unstructured_ingest/connector/git.py +124 -0
  76. unstructured_ingest/connector/github.py +173 -0
  77. unstructured_ingest/connector/gitlab.py +142 -0
  78. unstructured_ingest/connector/google_drive.py +349 -0
  79. unstructured_ingest/connector/hubspot.py +278 -0
  80. unstructured_ingest/connector/jira.py +469 -0
  81. unstructured_ingest/connector/kafka.py +294 -0
  82. unstructured_ingest/connector/local.py +139 -0
  83. unstructured_ingest/connector/mongodb.py +285 -0
  84. unstructured_ingest/connector/notion/__init__.py +0 -0
  85. unstructured_ingest/connector/notion/client.py +233 -0
  86. unstructured_ingest/connector/notion/connector.py +468 -0
  87. unstructured_ingest/connector/notion/helpers.py +584 -0
  88. unstructured_ingest/connector/notion/interfaces.py +32 -0
  89. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  90. unstructured_ingest/connector/notion/types/block.py +95 -0
  91. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  92. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  93. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  94. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  95. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  96. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  97. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  98. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  99. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  100. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  101. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  102. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  103. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  104. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  105. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  106. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  107. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  108. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  109. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  110. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  111. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  112. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  113. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  114. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  115. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  116. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  117. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  118. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  119. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  120. unstructured_ingest/connector/notion/types/database.py +72 -0
  121. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  122. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  123. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  124. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  125. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  126. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  127. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  128. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  129. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  130. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  131. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  132. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  133. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  134. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  135. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  136. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  137. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  138. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  139. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  140. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  141. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  142. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  143. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  144. unstructured_ingest/connector/notion/types/date.py +26 -0
  145. unstructured_ingest/connector/notion/types/file.py +51 -0
  146. unstructured_ingest/connector/notion/types/page.py +44 -0
  147. unstructured_ingest/connector/notion/types/parent.py +66 -0
  148. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  149. unstructured_ingest/connector/notion/types/user.py +76 -0
  150. unstructured_ingest/connector/onedrive.py +232 -0
  151. unstructured_ingest/connector/opensearch.py +218 -0
  152. unstructured_ingest/connector/outlook.py +285 -0
  153. unstructured_ingest/connector/pinecone.py +140 -0
  154. unstructured_ingest/connector/qdrant.py +144 -0
  155. unstructured_ingest/connector/reddit.py +166 -0
  156. unstructured_ingest/connector/registry.py +109 -0
  157. unstructured_ingest/connector/salesforce.py +301 -0
  158. unstructured_ingest/connector/sharepoint.py +573 -0
  159. unstructured_ingest/connector/slack.py +224 -0
  160. unstructured_ingest/connector/sql.py +199 -0
  161. unstructured_ingest/connector/vectara.py +248 -0
  162. unstructured_ingest/connector/weaviate.py +190 -0
  163. unstructured_ingest/connector/wikipedia.py +208 -0
  164. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  165. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  166. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  167. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  168. unstructured_ingest/error.py +49 -0
  169. unstructured_ingest/evaluate.py +338 -0
  170. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  171. unstructured_ingest/ingest_backoff/_common.py +102 -0
  172. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  173. unstructured_ingest/interfaces.py +838 -0
  174. unstructured_ingest/logger.py +130 -0
  175. unstructured_ingest/main.py +11 -0
  176. unstructured_ingest/pipeline/__init__.py +22 -0
  177. unstructured_ingest/pipeline/copy.py +19 -0
  178. unstructured_ingest/pipeline/doc_factory.py +12 -0
  179. unstructured_ingest/pipeline/interfaces.py +265 -0
  180. unstructured_ingest/pipeline/partition.py +60 -0
  181. unstructured_ingest/pipeline/permissions.py +12 -0
  182. unstructured_ingest/pipeline/pipeline.py +117 -0
  183. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  184. unstructured_ingest/pipeline/reformat/chunking.py +130 -0
  185. unstructured_ingest/pipeline/reformat/embedding.py +66 -0
  186. unstructured_ingest/pipeline/source.py +77 -0
  187. unstructured_ingest/pipeline/utils.py +6 -0
  188. unstructured_ingest/pipeline/write.py +18 -0
  189. unstructured_ingest/processor.py +93 -0
  190. unstructured_ingest/runner/__init__.py +104 -0
  191. unstructured_ingest/runner/airtable.py +35 -0
  192. unstructured_ingest/runner/astra.py +34 -0
  193. unstructured_ingest/runner/base_runner.py +89 -0
  194. unstructured_ingest/runner/biomed.py +45 -0
  195. unstructured_ingest/runner/confluence.py +35 -0
  196. unstructured_ingest/runner/delta_table.py +34 -0
  197. unstructured_ingest/runner/discord.py +35 -0
  198. unstructured_ingest/runner/elasticsearch.py +40 -0
  199. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  200. unstructured_ingest/runner/fsspec/azure.py +30 -0
  201. unstructured_ingest/runner/fsspec/box.py +28 -0
  202. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  203. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  204. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  205. unstructured_ingest/runner/fsspec/s3.py +28 -0
  206. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  207. unstructured_ingest/runner/github.py +37 -0
  208. unstructured_ingest/runner/gitlab.py +37 -0
  209. unstructured_ingest/runner/google_drive.py +35 -0
  210. unstructured_ingest/runner/hubspot.py +35 -0
  211. unstructured_ingest/runner/jira.py +35 -0
  212. unstructured_ingest/runner/kafka.py +34 -0
  213. unstructured_ingest/runner/local.py +23 -0
  214. unstructured_ingest/runner/mongodb.py +34 -0
  215. unstructured_ingest/runner/notion.py +61 -0
  216. unstructured_ingest/runner/onedrive.py +35 -0
  217. unstructured_ingest/runner/opensearch.py +40 -0
  218. unstructured_ingest/runner/outlook.py +33 -0
  219. unstructured_ingest/runner/reddit.py +35 -0
  220. unstructured_ingest/runner/salesforce.py +33 -0
  221. unstructured_ingest/runner/sharepoint.py +35 -0
  222. unstructured_ingest/runner/slack.py +33 -0
  223. unstructured_ingest/runner/utils.py +47 -0
  224. unstructured_ingest/runner/wikipedia.py +35 -0
  225. unstructured_ingest/runner/writers/__init__.py +48 -0
  226. unstructured_ingest/runner/writers/astra.py +22 -0
  227. unstructured_ingest/runner/writers/azure_cognitive_search.py +24 -0
  228. unstructured_ingest/runner/writers/base_writer.py +26 -0
  229. unstructured_ingest/runner/writers/chroma.py +22 -0
  230. unstructured_ingest/runner/writers/clarifai.py +19 -0
  231. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  232. unstructured_ingest/runner/writers/delta_table.py +24 -0
  233. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  234. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  235. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  236. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  237. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  238. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  239. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  240. unstructured_ingest/runner/writers/kafka.py +21 -0
  241. unstructured_ingest/runner/writers/mongodb.py +21 -0
  242. unstructured_ingest/runner/writers/opensearch.py +26 -0
  243. unstructured_ingest/runner/writers/pinecone.py +21 -0
  244. unstructured_ingest/runner/writers/qdrant.py +19 -0
  245. unstructured_ingest/runner/writers/sql.py +22 -0
  246. unstructured_ingest/runner/writers/vectara.py +22 -0
  247. unstructured_ingest/runner/writers/weaviate.py +21 -0
  248. unstructured_ingest/utils/__init__.py +0 -0
  249. unstructured_ingest/utils/compression.py +117 -0
  250. unstructured_ingest/utils/data_prep.py +112 -0
  251. unstructured_ingest/utils/dep_check.py +66 -0
  252. unstructured_ingest/utils/string_and_date_utils.py +39 -0
  253. unstructured_ingest/utils/table.py +73 -0
  254. unstructured_ingest/v2/__init__.py +1 -0
  255. unstructured_ingest/v2/cli/__init__.py +0 -0
  256. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  257. unstructured_ingest/v2/cli/base/cmd.py +215 -0
  258. unstructured_ingest/v2/cli/base/dest.py +76 -0
  259. unstructured_ingest/v2/cli/base/importer.py +34 -0
  260. unstructured_ingest/v2/cli/base/src.py +70 -0
  261. unstructured_ingest/v2/cli/cli.py +24 -0
  262. unstructured_ingest/v2/cli/cmds/__init__.py +87 -0
  263. unstructured_ingest/v2/cli/cmds/astra.py +85 -0
  264. unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +72 -0
  265. unstructured_ingest/v2/cli/cmds/chroma.py +108 -0
  266. unstructured_ingest/v2/cli/cmds/databricks_volumes.py +161 -0
  267. unstructured_ingest/v2/cli/cmds/elasticsearch.py +159 -0
  268. unstructured_ingest/v2/cli/cmds/fsspec/__init__.py +0 -0
  269. unstructured_ingest/v2/cli/cmds/fsspec/azure.py +84 -0
  270. unstructured_ingest/v2/cli/cmds/fsspec/box.py +58 -0
  271. unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +58 -0
  272. unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +77 -0
  273. unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +81 -0
  274. unstructured_ingest/v2/cli/cmds/fsspec/s3.py +84 -0
  275. unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +80 -0
  276. unstructured_ingest/v2/cli/cmds/google_drive.py +74 -0
  277. unstructured_ingest/v2/cli/cmds/local.py +60 -0
  278. unstructured_ingest/v2/cli/cmds/mongodb.py +62 -0
  279. unstructured_ingest/v2/cli/cmds/onedrive.py +91 -0
  280. unstructured_ingest/v2/cli/cmds/opensearch.py +93 -0
  281. unstructured_ingest/v2/cli/cmds/pinecone.py +62 -0
  282. unstructured_ingest/v2/cli/cmds/salesforce.py +79 -0
  283. unstructured_ingest/v2/cli/cmds/sharepoint.py +112 -0
  284. unstructured_ingest/v2/cli/cmds/singlestore.py +96 -0
  285. unstructured_ingest/v2/cli/cmds/sql.py +84 -0
  286. unstructured_ingest/v2/cli/cmds/weaviate.py +100 -0
  287. unstructured_ingest/v2/cli/configs/__init__.py +6 -0
  288. unstructured_ingest/v2/cli/configs/chunk.py +89 -0
  289. unstructured_ingest/v2/cli/configs/embed.py +74 -0
  290. unstructured_ingest/v2/cli/configs/partition.py +99 -0
  291. unstructured_ingest/v2/cli/configs/processor.py +88 -0
  292. unstructured_ingest/v2/cli/interfaces.py +27 -0
  293. unstructured_ingest/v2/cli/utils.py +240 -0
  294. unstructured_ingest/v2/example.py +37 -0
  295. unstructured_ingest/v2/interfaces/__init__.py +29 -0
  296. unstructured_ingest/v2/interfaces/connector.py +32 -0
  297. unstructured_ingest/v2/interfaces/downloader.py +79 -0
  298. unstructured_ingest/v2/interfaces/file_data.py +49 -0
  299. unstructured_ingest/v2/interfaces/indexer.py +28 -0
  300. unstructured_ingest/v2/interfaces/process.py +20 -0
  301. unstructured_ingest/v2/interfaces/processor.py +48 -0
  302. unstructured_ingest/v2/interfaces/upload_stager.py +48 -0
  303. unstructured_ingest/v2/interfaces/uploader.py +39 -0
  304. unstructured_ingest/v2/logger.py +126 -0
  305. unstructured_ingest/v2/main.py +11 -0
  306. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  307. unstructured_ingest/v2/pipeline/interfaces.py +167 -0
  308. unstructured_ingest/v2/pipeline/pipeline.py +284 -0
  309. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  310. unstructured_ingest/v2/pipeline/steps/chunk.py +85 -0
  311. unstructured_ingest/v2/pipeline/steps/download.py +124 -0
  312. unstructured_ingest/v2/pipeline/steps/embed.py +84 -0
  313. unstructured_ingest/v2/pipeline/steps/index.py +61 -0
  314. unstructured_ingest/v2/pipeline/steps/partition.py +78 -0
  315. unstructured_ingest/v2/pipeline/steps/stage.py +64 -0
  316. unstructured_ingest/v2/pipeline/steps/uncompress.py +68 -0
  317. unstructured_ingest/v2/pipeline/steps/upload.py +73 -0
  318. unstructured_ingest/v2/pipeline/utils.py +15 -0
  319. unstructured_ingest/v2/processes/__init__.py +0 -0
  320. unstructured_ingest/v2/processes/chunker.py +97 -0
  321. unstructured_ingest/v2/processes/connector_registry.py +63 -0
  322. unstructured_ingest/v2/processes/connectors/__init__.py +77 -0
  323. unstructured_ingest/v2/processes/connectors/astra.py +152 -0
  324. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +211 -0
  325. unstructured_ingest/v2/processes/connectors/chroma.py +204 -0
  326. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +96 -0
  327. unstructured_ingest/v2/processes/connectors/elasticsearch.py +401 -0
  328. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  329. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +144 -0
  330. unstructured_ingest/v2/processes/connectors/fsspec/box.py +131 -0
  331. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +130 -0
  332. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +342 -0
  333. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +141 -0
  334. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +164 -0
  335. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +166 -0
  336. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  337. unstructured_ingest/v2/processes/connectors/google_drive.py +335 -0
  338. unstructured_ingest/v2/processes/connectors/local.py +204 -0
  339. unstructured_ingest/v2/processes/connectors/mongodb.py +138 -0
  340. unstructured_ingest/v2/processes/connectors/onedrive.py +216 -0
  341. unstructured_ingest/v2/processes/connectors/opensearch.py +155 -0
  342. unstructured_ingest/v2/processes/connectors/pinecone.py +178 -0
  343. unstructured_ingest/v2/processes/connectors/salesforce.py +293 -0
  344. unstructured_ingest/v2/processes/connectors/sharepoint.py +412 -0
  345. unstructured_ingest/v2/processes/connectors/singlestore.py +160 -0
  346. unstructured_ingest/v2/processes/connectors/sql.py +269 -0
  347. unstructured_ingest/v2/processes/connectors/utils.py +19 -0
  348. unstructured_ingest/v2/processes/connectors/weaviate.py +235 -0
  349. unstructured_ingest/v2/processes/embedder.py +76 -0
  350. unstructured_ingest/v2/processes/partitioner.py +166 -0
  351. unstructured_ingest/v2/processes/uncompress.py +43 -0
  352. unstructured_ingest-0.0.0.dist-info/METADATA +319 -0
  353. unstructured_ingest-0.0.0.dist-info/RECORD +356 -0
  354. unstructured_ingest-0.0.0.dist-info/WHEEL +5 -0
  355. unstructured_ingest-0.0.0.dist-info/entry_points.txt +2 -0
  356. unstructured_ingest-0.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,293 @@
1
+ """
2
+ Salesforce Connector
3
+ Able to download Account, Case, Campaign, EmailMessage, Lead
4
+ Salesforce returns everything as a list of json.
5
+ This saves each entry as a separate file to be partitioned.
6
+ Using JWT authorization
7
+ https://developer.salesforce.com/docs/atlas.en-us.sfdx_dev.meta/sfdx_dev/sfdx_dev_auth_key_and_cert.htm
8
+ https://developer.salesforce.com/docs/atlas.en-us.sfdx_dev.meta/sfdx_dev/sfdx_dev_auth_connected_app.htm
9
+ """
10
+
11
+ import json
12
+ from collections import OrderedDict
13
+ from dataclasses import dataclass, field
14
+ from email.utils import formatdate
15
+ from pathlib import Path
16
+ from string import Template
17
+ from textwrap import dedent
18
+ from typing import TYPE_CHECKING, Any, Generator, Type
19
+
20
+ from dateutil import parser
21
+ from unstructured.documents.elements import DataSourceMetadata
22
+
23
+ from unstructured_ingest.enhanced_dataclass import enhanced_field
24
+ from unstructured_ingest.error import SourceConnectionNetworkError
25
+ from unstructured_ingest.utils.dep_check import requires_dependencies
26
+ from unstructured_ingest.v2.interfaces import (
27
+ AccessConfig,
28
+ ConnectionConfig,
29
+ Downloader,
30
+ DownloaderConfig,
31
+ DownloadResponse,
32
+ FileData,
33
+ Indexer,
34
+ IndexerConfig,
35
+ SourceIdentifiers,
36
+ )
37
+ from unstructured_ingest.v2.logger import logger
38
+ from unstructured_ingest.v2.processes.connector_registry import (
39
+ SourceRegistryEntry,
40
+ )
41
+
42
+
43
+ class MissingCategoryError(Exception):
44
+ """There are no categories with that name."""
45
+
46
+
47
+ CONNECTOR_TYPE = "salesforce"
48
+
49
+ if TYPE_CHECKING:
50
+ from simple_salesforce import Salesforce
51
+
52
+ SALESFORCE_API_VERSION = "57.0"
53
+
54
+ # TODO: Add more categories as needed
55
+ ACCEPTED_CATEGORIES: list[str] = ["Account", "Case", "Campaign", "EmailMessage", "Lead"]
56
+
57
+ # Generic minimal email template used only
58
+ # to process EmailMessage records as .eml files
59
+ EMAIL_TEMPLATE = Template(
60
+ """MIME-Version: 1.0
61
+ Date: $date
62
+ Message-ID: $message_identifier
63
+ Subject: $subject
64
+ From: $from_email
65
+ To: $to_email
66
+ Content-Type: multipart/alternative; boundary="00000000000095c9b205eff92630"
67
+ --00000000000095c9b205eff92630
68
+ Content-Type: text/plain; charset="UTF-8"
69
+ $textbody
70
+ --00000000000095c9b205eff92630
71
+ Content-Type: text/html; charset="UTF-8"
72
+ $htmlbody
73
+ --00000000000095c9b205eff92630--
74
+ """,
75
+ )
76
+
77
+
78
+ @dataclass
79
+ class SalesforceAccessConfig(AccessConfig):
80
+ consumer_key: str
81
+ private_key: str
82
+
83
+ @requires_dependencies(["cryptography"])
84
+ def get_private_key_value_and_type(self) -> tuple[str, Type]:
85
+ from cryptography.hazmat.primitives import serialization
86
+
87
+ try:
88
+ serialization.load_pem_private_key(data=self.private_key.encode("utf-8"), password=None)
89
+ except ValueError:
90
+ pass
91
+ else:
92
+ return self.private_key, str
93
+
94
+ if Path(self.private_key).is_file():
95
+ return self.private_key, Path
96
+
97
+ raise ValueError("private_key does not contain PEM private key or path")
98
+
99
+
100
+ @dataclass
101
+ class SalesforceConnectionConfig(ConnectionConfig):
102
+ username: str
103
+ access_config: SalesforceAccessConfig = enhanced_field(sensitive=True)
104
+
105
+ @requires_dependencies(["simple_salesforce"], extras="salesforce")
106
+ def get_client(self) -> "Salesforce":
107
+ from simple_salesforce import Salesforce
108
+
109
+ pkey_value, pkey_type = self.access_config.get_private_key_value_and_type()
110
+
111
+ return Salesforce(
112
+ username=self.username,
113
+ consumer_key=self.access_config.consumer_key,
114
+ privatekey_file=pkey_value if pkey_type is Path else None,
115
+ privatekey=pkey_value if pkey_type is str else None,
116
+ version=SALESFORCE_API_VERSION,
117
+ )
118
+
119
+
120
+ @dataclass
121
+ class SalesforceIndexerConfig(IndexerConfig):
122
+ categories: list[str]
123
+
124
+
125
+ @dataclass
126
+ class SalesforceIndexer(Indexer):
127
+ connection_config: SalesforceConnectionConfig
128
+ index_config: SalesforceIndexerConfig
129
+
130
+ def __post_init__(self):
131
+ for record_type in self.index_config.categories:
132
+ if record_type not in ACCEPTED_CATEGORIES:
133
+ raise ValueError(f"{record_type} not currently an accepted Salesforce category")
134
+
135
+ def get_file_extension(self, record_type) -> str:
136
+ if record_type == "EmailMessage":
137
+ extension = ".eml"
138
+ elif record_type in ["Account", "Lead", "Case", "Campaign"]:
139
+ extension = ".xml"
140
+ else:
141
+ raise MissingCategoryError(
142
+ f"There are no categories with the name: {record_type}",
143
+ )
144
+ return extension
145
+
146
+ @requires_dependencies(["simple_salesforce"], extras="salesforce")
147
+ def list_files(self) -> list[FileData]:
148
+ """Get Salesforce Ids for the records.
149
+ Send them to next phase where each doc gets downloaded into the
150
+ appropriate format for partitioning.
151
+ """
152
+ from simple_salesforce.exceptions import SalesforceMalformedRequest
153
+
154
+ client = self.connection_config.get_client()
155
+
156
+ files_list = []
157
+ for record_type in self.index_config.categories:
158
+ try:
159
+ # Get ids from Salesforce
160
+ records = client.query_all_iter(
161
+ f"select Id, SystemModstamp, CreatedDate, LastModifiedDate from {record_type}",
162
+ )
163
+ for record in records:
164
+ record_with_extension = record["Id"] + self.get_file_extension(
165
+ record["attributes"]["type"]
166
+ )
167
+ files_list.append(
168
+ FileData(
169
+ connector_type=CONNECTOR_TYPE,
170
+ identifier=record["Id"],
171
+ source_identifiers=SourceIdentifiers(
172
+ filename=record_with_extension,
173
+ fullpath=f"{record['attributes']['type']}/{record_with_extension}",
174
+ ),
175
+ metadata=DataSourceMetadata(
176
+ url=record["attributes"]["url"],
177
+ version=str(parser.parse(record["SystemModstamp"]).timestamp()),
178
+ date_created=str(parser.parse(record["CreatedDate"]).timestamp()),
179
+ date_modified=str(
180
+ parser.parse(record["LastModifiedDate"]).timestamp()
181
+ ),
182
+ record_locator={"id": record["Id"]},
183
+ ),
184
+ additional_metadata={"record_type": record["attributes"]["type"]},
185
+ )
186
+ )
187
+ except SalesforceMalformedRequest as e:
188
+ raise SalesforceMalformedRequest(f"Problem with Salesforce query: {e}")
189
+
190
+ return files_list
191
+
192
+ def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
193
+ for f in self.list_files():
194
+ yield f
195
+
196
+
197
+ @dataclass
198
+ class SalesforceDownloaderConfig(DownloaderConfig):
199
+ pass
200
+
201
+
202
+ @dataclass
203
+ class SalesforceDownloader(Downloader):
204
+ connection_config: SalesforceConnectionConfig
205
+ download_config: SalesforceDownloaderConfig = field(
206
+ default_factory=lambda: SalesforceDownloaderConfig()
207
+ )
208
+ connector_type: str = CONNECTOR_TYPE
209
+
210
+ def get_download_path(self, file_data: FileData) -> Path:
211
+ rel_path = file_data.source_identifiers.relative_path
212
+ rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path
213
+ return self.download_dir / Path(rel_path)
214
+
215
+ def _xml_for_record(self, record: OrderedDict) -> str:
216
+ """Creates partitionable xml file from a record"""
217
+ import xml.etree.ElementTree as ET
218
+
219
+ def create_xml_doc(data, parent, prefix=""):
220
+ for key, value in data.items():
221
+ if isinstance(value, OrderedDict):
222
+ create_xml_doc(value, parent, prefix=f"{prefix}{key}.")
223
+ else:
224
+ item = ET.Element("item")
225
+ item.text = f"{prefix}{key}: {value}"
226
+ parent.append(item)
227
+
228
+ root = ET.Element("root")
229
+ create_xml_doc(record, root)
230
+
231
+ xml_string = ET.tostring(root, encoding="utf-8", xml_declaration=True).decode()
232
+ return xml_string
233
+
234
+ def _eml_for_record(self, email_json: dict[str, Any]) -> str:
235
+ """Recreates standard expected .eml format using template."""
236
+ eml = EMAIL_TEMPLATE.substitute(
237
+ date=formatdate(parser.parse(email_json.get("MessageDate")).timestamp()),
238
+ message_identifier=email_json.get("MessageIdentifier"),
239
+ subject=email_json.get("Subject"),
240
+ from_email=email_json.get("FromAddress"),
241
+ to_email=email_json.get("ToAddress"),
242
+ textbody=email_json.get("TextBody"),
243
+ htmlbody=email_json.get("HtmlBody"),
244
+ )
245
+ return dedent(eml)
246
+
247
+ @SourceConnectionNetworkError.wrap
248
+ def _get_response(self, file_data: FileData) -> OrderedDict:
249
+ client = self.connection_config.get_client()
250
+ return client.query(
251
+ f"select FIELDS(STANDARD) from {file_data.additional_metadata['record_type']} where Id='{file_data.identifier}'", # noqa: E501
252
+ )
253
+
254
+ def get_record(self, file_data: FileData) -> OrderedDict:
255
+ # Get record from Salesforce based on id
256
+ response = self._get_response(file_data)
257
+ logger.debug(f"response was returned for salesforce record id: {file_data.identifier}")
258
+ records = response["records"]
259
+ if not records:
260
+ raise ValueError(
261
+ f"No record found with record id {file_data.identifier}: {json.dumps(response)}"
262
+ )
263
+ record_json = records[0]
264
+ return record_json
265
+
266
+ def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
267
+ record = self.get_record(file_data)
268
+
269
+ try:
270
+ if file_data.additional_metadata["record_type"] == "EmailMessage":
271
+ document = self._eml_for_record(record)
272
+ else:
273
+ document = self._xml_for_record(record)
274
+ download_path = self.get_download_path(file_data=file_data)
275
+ download_path.parent.mkdir(parents=True, exist_ok=True)
276
+
277
+ with open(download_path, "w") as page_file:
278
+ page_file.write(document)
279
+
280
+ except Exception as e:
281
+ logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
282
+ raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
283
+
284
+ return self.generate_download_response(file_data=file_data, download_path=download_path)
285
+
286
+
287
+ salesforce_source_entry = SourceRegistryEntry(
288
+ connection_config=SalesforceConnectionConfig,
289
+ indexer_config=SalesforceIndexerConfig,
290
+ indexer=SalesforceIndexer,
291
+ downloader_config=SalesforceDownloaderConfig,
292
+ downloader=SalesforceDownloader,
293
+ )