unstructured-ingest 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (356) hide show
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/__init__.py +14 -0
  4. unstructured_ingest/cli/base/__init__.py +0 -0
  5. unstructured_ingest/cli/base/cmd.py +19 -0
  6. unstructured_ingest/cli/base/dest.py +87 -0
  7. unstructured_ingest/cli/base/src.py +57 -0
  8. unstructured_ingest/cli/cli.py +32 -0
  9. unstructured_ingest/cli/cmd_factory.py +12 -0
  10. unstructured_ingest/cli/cmds/__init__.py +145 -0
  11. unstructured_ingest/cli/cmds/airtable.py +69 -0
  12. unstructured_ingest/cli/cmds/astra.py +99 -0
  13. unstructured_ingest/cli/cmds/azure_cognitive_search.py +65 -0
  14. unstructured_ingest/cli/cmds/biomed.py +52 -0
  15. unstructured_ingest/cli/cmds/chroma.py +104 -0
  16. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  17. unstructured_ingest/cli/cmds/confluence.py +69 -0
  18. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  19. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  20. unstructured_ingest/cli/cmds/discord.py +47 -0
  21. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  22. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  23. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  24. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  25. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  26. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  27. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  28. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  29. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  30. unstructured_ingest/cli/cmds/github.py +54 -0
  31. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  32. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  33. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  34. unstructured_ingest/cli/cmds/jira.py +71 -0
  35. unstructured_ingest/cli/cmds/kafka.py +102 -0
  36. unstructured_ingest/cli/cmds/local.py +43 -0
  37. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  38. unstructured_ingest/cli/cmds/notion.py +48 -0
  39. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  40. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  41. unstructured_ingest/cli/cmds/outlook.py +67 -0
  42. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  43. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  44. unstructured_ingest/cli/cmds/reddit.py +67 -0
  45. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  46. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  47. unstructured_ingest/cli/cmds/slack.py +56 -0
  48. unstructured_ingest/cli/cmds/sql.py +66 -0
  49. unstructured_ingest/cli/cmds/vectara.py +66 -0
  50. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  51. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  52. unstructured_ingest/cli/common.py +7 -0
  53. unstructured_ingest/cli/interfaces.py +656 -0
  54. unstructured_ingest/cli/utils.py +205 -0
  55. unstructured_ingest/connector/__init__.py +0 -0
  56. unstructured_ingest/connector/airtable.py +309 -0
  57. unstructured_ingest/connector/astra.py +237 -0
  58. unstructured_ingest/connector/azure_cognitive_search.py +144 -0
  59. unstructured_ingest/connector/biomed.py +313 -0
  60. unstructured_ingest/connector/chroma.py +158 -0
  61. unstructured_ingest/connector/clarifai.py +122 -0
  62. unstructured_ingest/connector/confluence.py +285 -0
  63. unstructured_ingest/connector/databricks_volumes.py +137 -0
  64. unstructured_ingest/connector/delta_table.py +203 -0
  65. unstructured_ingest/connector/discord.py +180 -0
  66. unstructured_ingest/connector/elasticsearch.py +396 -0
  67. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  68. unstructured_ingest/connector/fsspec/azure.py +78 -0
  69. unstructured_ingest/connector/fsspec/box.py +109 -0
  70. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  71. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  72. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  73. unstructured_ingest/connector/fsspec/s3.py +62 -0
  74. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  75. unstructured_ingest/connector/git.py +124 -0
  76. unstructured_ingest/connector/github.py +173 -0
  77. unstructured_ingest/connector/gitlab.py +142 -0
  78. unstructured_ingest/connector/google_drive.py +349 -0
  79. unstructured_ingest/connector/hubspot.py +278 -0
  80. unstructured_ingest/connector/jira.py +469 -0
  81. unstructured_ingest/connector/kafka.py +294 -0
  82. unstructured_ingest/connector/local.py +139 -0
  83. unstructured_ingest/connector/mongodb.py +285 -0
  84. unstructured_ingest/connector/notion/__init__.py +0 -0
  85. unstructured_ingest/connector/notion/client.py +233 -0
  86. unstructured_ingest/connector/notion/connector.py +468 -0
  87. unstructured_ingest/connector/notion/helpers.py +584 -0
  88. unstructured_ingest/connector/notion/interfaces.py +32 -0
  89. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  90. unstructured_ingest/connector/notion/types/block.py +95 -0
  91. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  92. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  93. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  94. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  95. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  96. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  97. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  98. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  99. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  100. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  101. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  102. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  103. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  104. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  105. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  106. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  107. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  108. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  109. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  110. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  111. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  112. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  113. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  114. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  115. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  116. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  117. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  118. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  119. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  120. unstructured_ingest/connector/notion/types/database.py +72 -0
  121. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  122. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  123. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  124. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  125. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  126. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  127. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  128. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  129. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  130. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  131. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  132. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  133. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  134. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  135. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  136. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  137. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  138. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  139. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  140. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  141. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  142. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  143. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  144. unstructured_ingest/connector/notion/types/date.py +26 -0
  145. unstructured_ingest/connector/notion/types/file.py +51 -0
  146. unstructured_ingest/connector/notion/types/page.py +44 -0
  147. unstructured_ingest/connector/notion/types/parent.py +66 -0
  148. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  149. unstructured_ingest/connector/notion/types/user.py +76 -0
  150. unstructured_ingest/connector/onedrive.py +232 -0
  151. unstructured_ingest/connector/opensearch.py +218 -0
  152. unstructured_ingest/connector/outlook.py +285 -0
  153. unstructured_ingest/connector/pinecone.py +140 -0
  154. unstructured_ingest/connector/qdrant.py +144 -0
  155. unstructured_ingest/connector/reddit.py +166 -0
  156. unstructured_ingest/connector/registry.py +109 -0
  157. unstructured_ingest/connector/salesforce.py +301 -0
  158. unstructured_ingest/connector/sharepoint.py +573 -0
  159. unstructured_ingest/connector/slack.py +224 -0
  160. unstructured_ingest/connector/sql.py +199 -0
  161. unstructured_ingest/connector/vectara.py +248 -0
  162. unstructured_ingest/connector/weaviate.py +190 -0
  163. unstructured_ingest/connector/wikipedia.py +208 -0
  164. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  165. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  166. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  167. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  168. unstructured_ingest/error.py +49 -0
  169. unstructured_ingest/evaluate.py +338 -0
  170. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  171. unstructured_ingest/ingest_backoff/_common.py +102 -0
  172. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  173. unstructured_ingest/interfaces.py +838 -0
  174. unstructured_ingest/logger.py +130 -0
  175. unstructured_ingest/main.py +11 -0
  176. unstructured_ingest/pipeline/__init__.py +22 -0
  177. unstructured_ingest/pipeline/copy.py +19 -0
  178. unstructured_ingest/pipeline/doc_factory.py +12 -0
  179. unstructured_ingest/pipeline/interfaces.py +265 -0
  180. unstructured_ingest/pipeline/partition.py +60 -0
  181. unstructured_ingest/pipeline/permissions.py +12 -0
  182. unstructured_ingest/pipeline/pipeline.py +117 -0
  183. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  184. unstructured_ingest/pipeline/reformat/chunking.py +130 -0
  185. unstructured_ingest/pipeline/reformat/embedding.py +66 -0
  186. unstructured_ingest/pipeline/source.py +77 -0
  187. unstructured_ingest/pipeline/utils.py +6 -0
  188. unstructured_ingest/pipeline/write.py +18 -0
  189. unstructured_ingest/processor.py +93 -0
  190. unstructured_ingest/runner/__init__.py +104 -0
  191. unstructured_ingest/runner/airtable.py +35 -0
  192. unstructured_ingest/runner/astra.py +34 -0
  193. unstructured_ingest/runner/base_runner.py +89 -0
  194. unstructured_ingest/runner/biomed.py +45 -0
  195. unstructured_ingest/runner/confluence.py +35 -0
  196. unstructured_ingest/runner/delta_table.py +34 -0
  197. unstructured_ingest/runner/discord.py +35 -0
  198. unstructured_ingest/runner/elasticsearch.py +40 -0
  199. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  200. unstructured_ingest/runner/fsspec/azure.py +30 -0
  201. unstructured_ingest/runner/fsspec/box.py +28 -0
  202. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  203. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  204. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  205. unstructured_ingest/runner/fsspec/s3.py +28 -0
  206. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  207. unstructured_ingest/runner/github.py +37 -0
  208. unstructured_ingest/runner/gitlab.py +37 -0
  209. unstructured_ingest/runner/google_drive.py +35 -0
  210. unstructured_ingest/runner/hubspot.py +35 -0
  211. unstructured_ingest/runner/jira.py +35 -0
  212. unstructured_ingest/runner/kafka.py +34 -0
  213. unstructured_ingest/runner/local.py +23 -0
  214. unstructured_ingest/runner/mongodb.py +34 -0
  215. unstructured_ingest/runner/notion.py +61 -0
  216. unstructured_ingest/runner/onedrive.py +35 -0
  217. unstructured_ingest/runner/opensearch.py +40 -0
  218. unstructured_ingest/runner/outlook.py +33 -0
  219. unstructured_ingest/runner/reddit.py +35 -0
  220. unstructured_ingest/runner/salesforce.py +33 -0
  221. unstructured_ingest/runner/sharepoint.py +35 -0
  222. unstructured_ingest/runner/slack.py +33 -0
  223. unstructured_ingest/runner/utils.py +47 -0
  224. unstructured_ingest/runner/wikipedia.py +35 -0
  225. unstructured_ingest/runner/writers/__init__.py +48 -0
  226. unstructured_ingest/runner/writers/astra.py +22 -0
  227. unstructured_ingest/runner/writers/azure_cognitive_search.py +24 -0
  228. unstructured_ingest/runner/writers/base_writer.py +26 -0
  229. unstructured_ingest/runner/writers/chroma.py +22 -0
  230. unstructured_ingest/runner/writers/clarifai.py +19 -0
  231. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  232. unstructured_ingest/runner/writers/delta_table.py +24 -0
  233. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  234. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  235. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  236. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  237. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  238. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  239. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  240. unstructured_ingest/runner/writers/kafka.py +21 -0
  241. unstructured_ingest/runner/writers/mongodb.py +21 -0
  242. unstructured_ingest/runner/writers/opensearch.py +26 -0
  243. unstructured_ingest/runner/writers/pinecone.py +21 -0
  244. unstructured_ingest/runner/writers/qdrant.py +19 -0
  245. unstructured_ingest/runner/writers/sql.py +22 -0
  246. unstructured_ingest/runner/writers/vectara.py +22 -0
  247. unstructured_ingest/runner/writers/weaviate.py +21 -0
  248. unstructured_ingest/utils/__init__.py +0 -0
  249. unstructured_ingest/utils/compression.py +117 -0
  250. unstructured_ingest/utils/data_prep.py +112 -0
  251. unstructured_ingest/utils/dep_check.py +66 -0
  252. unstructured_ingest/utils/string_and_date_utils.py +39 -0
  253. unstructured_ingest/utils/table.py +73 -0
  254. unstructured_ingest/v2/__init__.py +1 -0
  255. unstructured_ingest/v2/cli/__init__.py +0 -0
  256. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  257. unstructured_ingest/v2/cli/base/cmd.py +215 -0
  258. unstructured_ingest/v2/cli/base/dest.py +76 -0
  259. unstructured_ingest/v2/cli/base/importer.py +34 -0
  260. unstructured_ingest/v2/cli/base/src.py +70 -0
  261. unstructured_ingest/v2/cli/cli.py +24 -0
  262. unstructured_ingest/v2/cli/cmds/__init__.py +87 -0
  263. unstructured_ingest/v2/cli/cmds/astra.py +85 -0
  264. unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +72 -0
  265. unstructured_ingest/v2/cli/cmds/chroma.py +108 -0
  266. unstructured_ingest/v2/cli/cmds/databricks_volumes.py +161 -0
  267. unstructured_ingest/v2/cli/cmds/elasticsearch.py +159 -0
  268. unstructured_ingest/v2/cli/cmds/fsspec/__init__.py +0 -0
  269. unstructured_ingest/v2/cli/cmds/fsspec/azure.py +84 -0
  270. unstructured_ingest/v2/cli/cmds/fsspec/box.py +58 -0
  271. unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +58 -0
  272. unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +77 -0
  273. unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +81 -0
  274. unstructured_ingest/v2/cli/cmds/fsspec/s3.py +84 -0
  275. unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +80 -0
  276. unstructured_ingest/v2/cli/cmds/google_drive.py +74 -0
  277. unstructured_ingest/v2/cli/cmds/local.py +60 -0
  278. unstructured_ingest/v2/cli/cmds/mongodb.py +62 -0
  279. unstructured_ingest/v2/cli/cmds/onedrive.py +91 -0
  280. unstructured_ingest/v2/cli/cmds/opensearch.py +93 -0
  281. unstructured_ingest/v2/cli/cmds/pinecone.py +62 -0
  282. unstructured_ingest/v2/cli/cmds/salesforce.py +79 -0
  283. unstructured_ingest/v2/cli/cmds/sharepoint.py +112 -0
  284. unstructured_ingest/v2/cli/cmds/singlestore.py +96 -0
  285. unstructured_ingest/v2/cli/cmds/sql.py +84 -0
  286. unstructured_ingest/v2/cli/cmds/weaviate.py +100 -0
  287. unstructured_ingest/v2/cli/configs/__init__.py +6 -0
  288. unstructured_ingest/v2/cli/configs/chunk.py +89 -0
  289. unstructured_ingest/v2/cli/configs/embed.py +74 -0
  290. unstructured_ingest/v2/cli/configs/partition.py +99 -0
  291. unstructured_ingest/v2/cli/configs/processor.py +88 -0
  292. unstructured_ingest/v2/cli/interfaces.py +27 -0
  293. unstructured_ingest/v2/cli/utils.py +240 -0
  294. unstructured_ingest/v2/example.py +37 -0
  295. unstructured_ingest/v2/interfaces/__init__.py +29 -0
  296. unstructured_ingest/v2/interfaces/connector.py +32 -0
  297. unstructured_ingest/v2/interfaces/downloader.py +79 -0
  298. unstructured_ingest/v2/interfaces/file_data.py +49 -0
  299. unstructured_ingest/v2/interfaces/indexer.py +28 -0
  300. unstructured_ingest/v2/interfaces/process.py +20 -0
  301. unstructured_ingest/v2/interfaces/processor.py +48 -0
  302. unstructured_ingest/v2/interfaces/upload_stager.py +48 -0
  303. unstructured_ingest/v2/interfaces/uploader.py +39 -0
  304. unstructured_ingest/v2/logger.py +126 -0
  305. unstructured_ingest/v2/main.py +11 -0
  306. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  307. unstructured_ingest/v2/pipeline/interfaces.py +167 -0
  308. unstructured_ingest/v2/pipeline/pipeline.py +284 -0
  309. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  310. unstructured_ingest/v2/pipeline/steps/chunk.py +85 -0
  311. unstructured_ingest/v2/pipeline/steps/download.py +124 -0
  312. unstructured_ingest/v2/pipeline/steps/embed.py +84 -0
  313. unstructured_ingest/v2/pipeline/steps/index.py +61 -0
  314. unstructured_ingest/v2/pipeline/steps/partition.py +78 -0
  315. unstructured_ingest/v2/pipeline/steps/stage.py +64 -0
  316. unstructured_ingest/v2/pipeline/steps/uncompress.py +68 -0
  317. unstructured_ingest/v2/pipeline/steps/upload.py +73 -0
  318. unstructured_ingest/v2/pipeline/utils.py +15 -0
  319. unstructured_ingest/v2/processes/__init__.py +0 -0
  320. unstructured_ingest/v2/processes/chunker.py +97 -0
  321. unstructured_ingest/v2/processes/connector_registry.py +63 -0
  322. unstructured_ingest/v2/processes/connectors/__init__.py +77 -0
  323. unstructured_ingest/v2/processes/connectors/astra.py +152 -0
  324. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +211 -0
  325. unstructured_ingest/v2/processes/connectors/chroma.py +204 -0
  326. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +96 -0
  327. unstructured_ingest/v2/processes/connectors/elasticsearch.py +401 -0
  328. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  329. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +144 -0
  330. unstructured_ingest/v2/processes/connectors/fsspec/box.py +131 -0
  331. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +130 -0
  332. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +342 -0
  333. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +141 -0
  334. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +164 -0
  335. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +166 -0
  336. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  337. unstructured_ingest/v2/processes/connectors/google_drive.py +335 -0
  338. unstructured_ingest/v2/processes/connectors/local.py +204 -0
  339. unstructured_ingest/v2/processes/connectors/mongodb.py +138 -0
  340. unstructured_ingest/v2/processes/connectors/onedrive.py +216 -0
  341. unstructured_ingest/v2/processes/connectors/opensearch.py +155 -0
  342. unstructured_ingest/v2/processes/connectors/pinecone.py +178 -0
  343. unstructured_ingest/v2/processes/connectors/salesforce.py +293 -0
  344. unstructured_ingest/v2/processes/connectors/sharepoint.py +412 -0
  345. unstructured_ingest/v2/processes/connectors/singlestore.py +160 -0
  346. unstructured_ingest/v2/processes/connectors/sql.py +269 -0
  347. unstructured_ingest/v2/processes/connectors/utils.py +19 -0
  348. unstructured_ingest/v2/processes/connectors/weaviate.py +235 -0
  349. unstructured_ingest/v2/processes/embedder.py +76 -0
  350. unstructured_ingest/v2/processes/partitioner.py +166 -0
  351. unstructured_ingest/v2/processes/uncompress.py +43 -0
  352. unstructured_ingest-0.0.0.dist-info/METADATA +319 -0
  353. unstructured_ingest-0.0.0.dist-info/RECORD +356 -0
  354. unstructured_ingest-0.0.0.dist-info/WHEEL +5 -0
  355. unstructured_ingest-0.0.0.dist-info/entry_points.txt +2 -0
  356. unstructured_ingest-0.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,81 @@
1
+ import os
2
+ from dataclasses import dataclass
3
+ from pathlib import Path
4
+ from typing import Type
5
+ from urllib.parse import urlparse
6
+
7
+ from unstructured_ingest.connector.fsspec.fsspec import (
8
+ FsspecIngestDoc,
9
+ FsspecSourceConnector,
10
+ SimpleFsspecConfig,
11
+ )
12
+ from unstructured_ingest.enhanced_dataclass import enhanced_field
13
+ from unstructured_ingest.error import SourceConnectionError
14
+ from unstructured_ingest.interfaces import AccessConfig
15
+ from unstructured_ingest.logger import logger
16
+ from unstructured_ingest.utils.dep_check import requires_dependencies
17
+
18
+
19
@dataclass
class SftpAccessConfig(AccessConfig):
    """Credentials and connection options used to reach an SFTP server.

    ``host`` and ``port`` carry fallback values only: ``SimpleSftpConfig``
    overwrites them with the hostname/port parsed from the remote URL
    whenever the URL provides them.
    """

    # Login name for the SFTP session.
    username: str
    # Flagged as sensitive through ``enhanced_field`` (presumably so it is
    # hidden when the config is serialized or logged -- confirm in
    # ``enhanced_dataclass``).
    password: str = enhanced_field(sensitive=True)
    # Fallback host, used when the remote URL carries no hostname.
    host: str = ""
    # Fallback port, used when the remote URL carries no explicit port.
    port: int = 22
    # NOTE(review): these look like the paramiko connect options of the same
    # name; confirm how they are forwarded to the filesystem client.
    look_for_keys: bool = False
    allow_agent: bool = False
27
+
28
+
29
@dataclass
class SimpleSftpConfig(SimpleFsspecConfig):
    """Fsspec connector configuration specialised for ``sftp://`` URLs.

    After the base-class initialisation, the remote URL is split into a
    directory, an optional file name, and the host/port stored on the
    access config.
    """

    access_config: SftpAccessConfig = None

    def __post_init__(self):
        """Derive ``file_path``, ``dir_path``, host and port from ``remote_url``.

        A URL whose *path* ends in a component with an extension is treated
        as pointing at a single file; anything else is treated as a
        directory.
        """
        super().__post_init__()

        parsed_url = urlparse(self.remote_url)
        # Look at the URL path only. Running splitext over the full URL
        # mistakes the host's TLD for a file extension when there is no
        # path at all (e.g. "sftp://example.com" -> ".com").
        url_path = parsed_url.path
        _, ext = os.path.splitext(url_path)
        if ext:
            # We only want the file_path if it has an extension
            self.file_path = Path(url_path).name
            self.dir_path = Path(url_path).parent.as_posix().lstrip("/")
        else:
            self.file_path = ""
            self.dir_path = url_path.lstrip("/")
        self.path_without_protocol = self.dir_path
        # Values present in the URL win; otherwise keep what was configured.
        self.access_config.host = parsed_url.hostname or self.access_config.host
        self.access_config.port = parsed_url.port or self.access_config.port
49
+
50
+
51
@dataclass
class SftpIngestDoc(FsspecIngestDoc):
    """Ingest document for a file reached through the SFTP connector."""

    connector_config: SimpleSftpConfig
    # Name under which this document type is registered.
    registry_name: str = "sftp"

    @SourceConnectionError.wrap
    @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
    def get_file(self):
        """Delegate to the fsspec implementation of ``get_file``.

        The override exists to attach the SFTP dependency check
        (``paramiko`` and ``fsspec``) and the ``SourceConnectionError``
        wrapper to the call.
        """
        super().get_file()
60
+
61
+
62
@dataclass
class SftpSourceConnector(FsspecSourceConnector):
    """Source connector that reads documents from an SFTP server via fsspec."""

    connector_config: SimpleSftpConfig

    @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
    def initialize(self):
        """Run the fsspec initialisation after checking SFTP dependencies."""
        super().initialize()

    @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
    def check_connection(self):
        """Validate the configured credentials by building an SFTP filesystem.

        Raises:
            SourceConnectionError: if the filesystem cannot be constructed
                from the access config; the original exception is attached
                as the cause.
        """
        # Imported lazily so the module can be loaded without the optional
        # "sftp" extra installed.
        from fsspec.implementations.sftp import SFTPFileSystem

        try:
            SFTPFileSystem(**self.connector_config.get_access_config())
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            # Chain explicitly so the underlying failure stays visible in
            # the traceback as the direct cause.
            raise SourceConnectionError(f"failed to validate connection: {e}") from e

    def __post_init__(self):
        """Select the ingest-doc class produced by this connector."""
        self.ingest_doc_cls: Type[SftpIngestDoc] = SftpIngestDoc
@@ -0,0 +1,124 @@
1
+ import fnmatch
2
+ import typing as t
3
+ from dataclasses import dataclass, field
4
+ from pathlib import Path
5
+
6
+ from unstructured_ingest.enhanced_dataclass import enhanced_field
7
+ from unstructured_ingest.error import SourceConnectionError
8
+ from unstructured_ingest.interfaces import (
9
+ AccessConfig,
10
+ BaseConnectorConfig,
11
+ BaseSingleIngestDoc,
12
+ BaseSourceConnector,
13
+ IngestDocCleanupMixin,
14
+ SourceConnectorCleanupMixin,
15
+ )
16
+ from unstructured_ingest.logger import logger
17
+
18
+
19
@dataclass
class GitAccessConfig(AccessConfig):
    """Access settings for git-hosted sources."""

    # Optional token, flagged as sensitive. ``overload_name`` presumably maps
    # the field to an external option called "git_access_token" -- confirm
    # in ``enhanced_dataclass``.
    access_token: t.Optional[str] = enhanced_field(
        default=None,
        sensitive=True,
        overload_name="git_access_token",
    )
24
+
25
+
26
@dataclass
class SimpleGitConfig(BaseConnectorConfig):
    """Connector configuration shared by the git-based source connectors."""

    # Repository URL (or another form accepted by the concrete subclass).
    url: str
    access_config: GitAccessConfig
    # Optional branch; when set it is also reported in record locators.
    branch: t.Optional[str] = enhanced_field(
        default=None,
        overload_name="git_branch",
    )
    # Optional glob patterns; a falsy value means "accept every path".
    file_glob: t.Optional[t.List[str]] = enhanced_field(
        default=None,
        overload_name="git_file_glob",
    )
    # Not an ``__init__`` argument and excluded from ``repr``; subclasses
    # are expected to assign it (e.g. in ``__post_init__``).
    repo_path: str = field(init=False, repr=False)
33
+
34
+
35
@dataclass
class GitIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
    """Base ingest document for a single file living in a git repository.

    Subclasses supply the hosting-specific pieces: ``_fetch_content``,
    ``_fetch_and_write`` and ``update_source_metadata``.
    """

    connector_config: SimpleGitConfig = field(repr=False)
    # Path of the file, relative to the repository root.
    path: str

    @property
    def filename(self):
        """Resolved local path the file is downloaded to."""
        download_root = Path(self.read_config.download_dir)
        return download_root.joinpath(self.path).resolve()

    @property
    def _output_filename(self):
        """Path of the JSON output written for this document."""
        output_root = Path(self.processor_config.output_dir)
        return output_root.joinpath(f"{self.path}.json")

    @property
    def record_locator(self) -> t.Dict[str, t.Any]:
        """Identify this document by repository, file path and, if set, branch."""
        locator = {
            "repo_path": self.connector_config.repo_path,
            "file_path": self.path,
        }
        branch = self.connector_config.branch
        if branch is not None:
            locator["branch"] = branch
        return locator

    def _create_full_tmp_dir_path(self):
        """Create the local parent directories mirroring the repository layout."""
        target_dir = self.filename.parent
        target_dir.mkdir(parents=True, exist_ok=True)

    def update_source_metadata(self, **kwargs):
        """Must be provided by the hosting-specific subclass."""
        raise NotImplementedError()

    @SourceConnectionError.wrap
    @BaseSingleIngestDoc.skip_if_file_exists
    def get_file(self):
        """Download the remote file and store it on the local filesystem."""
        self._create_full_tmp_dir_path()
        self._fetch_and_write()

    def _fetch_content(self) -> None:
        """Must be provided by the hosting-specific subclass."""
        raise NotImplementedError()

    def _fetch_and_write(self) -> None:
        """Must be provided by the hosting-specific subclass."""
        raise NotImplementedError()
77
+
78
+
79
@dataclass
class GitSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
    """Base source connector for git-hosted repositories.

    Provides the path filters (supported extension, user-supplied globs)
    shared by the concrete connectors.
    """

    connector_config: SimpleGitConfig

    def initialize(self):
        """No setup is performed at this level."""
        pass

    def check_connection(self):
        """No connection check is performed at this level."""
        pass

    @staticmethod
    def is_file_type_supported(path: str) -> bool:
        """Return True when ``path`` ends with one of the accepted suffixes.

        The comparison is a plain, case-sensitive ``str.endswith``.
        """
        # Workaround to ensure that auto.partition isn't fed with .yaml, .py, etc. files
        # TODO: What to do with no filenames? e.g. LICENSE, Makefile, etc.
        accepted_suffixes = (
            ".md",
            ".txt",
            ".pdf",
            ".doc",
            ".docx",
            ".eml",
            ".heic",
            ".html",
            ".png",
            ".jpg",
            ".ppt",
            ".pptx",
            ".xml",
        )
        if path.endswith(accepted_suffixes):
            return True
        logger.debug(
            f"The file {path!r} is discarded as it does not contain a supported filetype.",
        )
        return False

    def does_path_match_glob(self, path: str) -> bool:
        """Return True when ``path`` matches any configured glob.

        With no globs configured every path is accepted.
        """
        globs = self.connector_config.file_glob
        if not globs:
            return True
        if any(fnmatch.fnmatch(path, candidate) for candidate in globs):
            return True
        logger.debug(f"The file {path!r} is discarded as it does not match any given glob.")
        return False
@@ -0,0 +1,173 @@
1
+ import typing as t
2
+ from dataclasses import dataclass
3
+ from datetime import datetime
4
+ from urllib.parse import urlparse
5
+
6
+ import requests
7
+
8
+ from unstructured_ingest.connector.git import (
9
+ GitIngestDoc,
10
+ GitSourceConnector,
11
+ SimpleGitConfig,
12
+ )
13
+ from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
14
+ from unstructured_ingest.interfaces import SourceMetadata
15
+ from unstructured_ingest.logger import logger
16
+ from unstructured_ingest.utils.dep_check import requires_dependencies
17
+
18
+ if t.TYPE_CHECKING:
19
+ from github.Repository import Repository
20
+
21
+
22
@dataclass
class SimpleGitHubConfig(SimpleGitConfig):
    """Git connector configuration specialised for repositories on github.com."""

    def __post_init__(self):
        """Validate ``self.url`` and derive ``self.repo_path`` from it.

        Accepts either a full ``https://github.com/<owner>/<name>`` URL or a
        bare ``<owner>/<name>`` pair.

        Raises:
            ValueError: if the scheme is not https, the host is not github.com,
                or the path does not consist of exactly two fragments.
        """
        parsed_gh_url = urlparse(self.url)
        path_fragments = [fragment for fragment in parsed_gh_url.path.split("/") if fragment]

        # If a scheme and netloc are provided, ensure they are correct
        # Additionally, ensure that the path contains two fragments
        scheme_ok = not parsed_gh_url.scheme or parsed_gh_url.scheme == "https"
        netloc_ok = not parsed_gh_url.netloc or parsed_gh_url.netloc == "github.com"
        if not (scheme_ok and netloc_ok and len(path_fragments) == 2):
            raise ValueError(
                'Please provide a valid URL, e.g. "https://github.com/Unstructured-IO/unstructured"'
                ' or a repository owner/name pair, e.g. "Unstructured-IO/unstructured".',
            )

        # Store the canonical "owner/name" form: rebuilding it from the
        # validated fragments drops the leading/trailing slashes of the raw
        # URL path, which would otherwise leak into the API request path.
        self.repo_path = "/".join(path_fragments)

    @SourceConnectionError.wrap
    @requires_dependencies(["github"], extras="github")
    def get_repo(self) -> "Repository":
        """Return the repository object for ``self.repo_path``.

        A client is created with the configured access token on every call.
        """
        from github import Github

        github = Github(self.access_config.access_token)
        return github.get_repo(self.repo_path)
50
+
51
+
52
@dataclass
class GitHubIngestDoc(GitIngestDoc):
    """Ingest doc for a single file of a GitHub repository."""

    connector_config: SimpleGitHubConfig
    registry_name: str = "github"

    @property
    def date_created(self) -> t.Optional[str]:
        """Always None: no creation date is provided for this doc."""
        return None

    @requires_dependencies(["github"], extras="github")
    def _fetch_file(self):
        """Return the content-file object for ``self.path``.

        Returns:
            The object returned by ``get_contents``, or None (after logging an
            error) when the path is unknown to the repository.
        """
        from github.GithubException import UnknownObjectException

        try:
            content_file = self.connector_config.get_repo().get_contents(self.path)
        except UnknownObjectException:
            logger.error(f"File doesn't exists {self.connector_config.url}/{self.path}")
            return None

        return content_file

    @SourceConnectionNetworkError.wrap
    def _fetch_content(self, content_file):
        """Return the raw bytes of ``content_file``.

        Args:
            content_file: object returned by ``_fetch_file``; may be None.

        Returns:
            The file bytes, or None when there is no content file or when the
            direct-download fallback does not answer with HTTP 200.
        """
        if content_file is None:
            # The file could not be located; the caller reports the failure.
            return None

        contents = b""
        if (
            not content_file.content  # type: ignore
            and content_file.encoding == "none"  # type: ignore
            and content_file.size  # type: ignore
        ):
            logger.info("File too large for the GitHub API, using direct download link instead.")
            # NOTE: Maybe add a raise_for_status to catch connection timeout or HTTP Errors?
            response = requests.get(content_file.download_url)  # type: ignore
            if response.status_code != 200:
                logger.info("Direct download link has failed... Skipping this file.")
                return None
            contents = response.content
        else:
            contents = content_file.decoded_content  # type: ignore
        return contents

    def update_source_metadata(self, **kwargs):
        """Populate ``self.source_metadata`` from the GitHub content file.

        Args:
            **kwargs: may carry ``content_file`` (possibly None) to reuse an
                already fetched object; the file is fetched only when the key
                is absent.
        """
        # Look the key up explicitly: a ``dict.get`` default would be evaluated
        # eagerly and trigger a second API call even when a file was passed in.
        if "content_file" in kwargs:
            content_file = kwargs["content_file"]
        else:
            content_file = self._fetch_file()

        if content_file is None:
            self.source_metadata = SourceMetadata(
                exists=False,
            )
            return

        last_modified = content_file.last_modified
        date_modified = None
        if last_modified:
            date_modified = datetime.strptime(
                last_modified,
                "%a, %d %b %Y %H:%M:%S %Z",
            ).isoformat()
        self.source_metadata = SourceMetadata(
            date_modified=date_modified,
            version=content_file.etag,
            source_url=content_file.download_url,
            exists=True,
        )

    def _fetch_and_write(self) -> None:
        """Fetch the file, record its metadata and write it to ``self.filename``.

        Raises:
            ValueError: if the file content could not be retrieved.
        """
        content_file = self._fetch_file()
        self.update_source_metadata(content_file=content_file)
        contents = self._fetch_content(content_file)
        if contents is None:
            raise ValueError(
                f"Failed to retrieve file from repo "
                f"{self.connector_config.url}/{self.path}. Check logs",
            )
        with open(self.filename, "wb") as f:
            f.write(contents)
123
+
124
+
125
@dataclass
class GitHubSourceConnector(GitSourceConnector):
    """Source connector that lists the ingestible files of a GitHub repository."""

    connector_config: SimpleGitHubConfig

    @requires_dependencies(["github"], extras="github")
    def check_connection(self):
        """Validate access by issuing a HEAD request for the repository.

        Raises:
            SourceConnectionError: if building the requester or the request
                itself fails; the original exception is kept as the cause.
        """
        from github import Consts
        from github.GithubRetry import GithubRetry
        from github.Requester import Requester

        try:
            requester = Requester(
                auth=self.connector_config.access_config.access_token,
                base_url=Consts.DEFAULT_BASE_URL,
                timeout=Consts.DEFAULT_TIMEOUT,
                user_agent=Consts.DEFAULT_USER_AGENT,
                per_page=Consts.DEFAULT_PER_PAGE,
                verify=True,
                retry=GithubRetry(),
                pool_size=None,
            )
            url_base = (
                "/repositories/" if isinstance(self.connector_config.repo_path, int) else "/repos/"
            )
            url = f"{url_base}{self.connector_config.repo_path}"
            headers, _ = requester.requestJsonAndCheck("HEAD", url)
            logger.debug(f"headers from HEAD request: {headers}")
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise SourceConnectionError(f"failed to validate connection: {e}") from e

    def get_ingest_docs(self):
        """Return one ``GitHubIngestDoc`` per supported file of the repository.

        The tree of the configured branch (or the repository's default branch)
        is loaded recursively; only blobs that pass the file-type and glob
        filters are kept.
        """
        repo = self.connector_config.get_repo()
        # Load the Git tree with all files, and then create Ingest docs
        # for all blobs, i.e. all files, ignoring directories
        sha = self.connector_config.branch or repo.default_branch
        git_tree = repo.get_git_tree(sha, recursive=True)
        return [
            GitHubIngestDoc(
                connector_config=self.connector_config,
                processor_config=self.processor_config,
                read_config=self.read_config,
                path=element.path,
            )
            for element in git_tree.tree
            if element.type == "blob"
            and self.is_file_type_supported(element.path)
            # does_path_match_glob already accepts every path when no glob is set
            and self.does_path_match_glob(element.path)
        ]
@@ -0,0 +1,142 @@
1
+ import typing as t
2
+ from dataclasses import dataclass
3
+ from urllib.parse import urlparse
4
+
5
+ from unstructured_ingest.connector.git import (
6
+ GitIngestDoc,
7
+ GitSourceConnector,
8
+ SimpleGitConfig,
9
+ )
10
+ from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
11
+ from unstructured_ingest.interfaces import SourceMetadata
12
+ from unstructured_ingest.logger import logger
13
+ from unstructured_ingest.utils.dep_check import requires_dependencies
14
+
15
+ if t.TYPE_CHECKING:
16
+ from gitlab.v4.objects.projects import Project
17
+
18
+
19
@dataclass
class SimpleGitlabConfig(SimpleGitConfig):
    """Git connector configuration specialised for GitLab instances."""

    # Overridden in __post_init__ when the configured URL carries its own
    # scheme or host.
    base_url: str = "https://gitlab.com"

    def __post_init__(self):
        """Derive ``base_url`` and ``repo_path`` from ``self.url``."""
        parsed_gh_url = urlparse(self.url)
        # If a scheme or netloc are provided, use the parsed base url
        if parsed_gh_url.scheme or parsed_gh_url.netloc:
            self.base_url = f"{parsed_gh_url.scheme}://{parsed_gh_url.netloc}"
        # Strip slashes on both ends: a trailing "/" would otherwise become
        # part of the project path used for the lookup.
        self.repo_path = parsed_gh_url.path.strip("/")

    @SourceConnectionError.wrap
    @requires_dependencies(["gitlab"], extras="gitlab")
    def get_project(self) -> "Project":
        """Return the project object for ``self.repo_path``.

        A client for ``self.base_url`` is created with the configured access
        token on every call.
        """
        from gitlab import Gitlab

        gitlab = Gitlab(self.base_url, private_token=self.access_config.access_token)
        return gitlab.projects.get(self.repo_path)
39
+
40
+
41
@dataclass
class GitLabIngestDoc(GitIngestDoc):
    """Ingest doc for a single file of a GitLab project."""

    connector_config: SimpleGitlabConfig
    registry_name: str = "gitlab"

    @property
    def date_created(self) -> t.Optional[str]:
        """Always None: no creation date is provided for this doc."""
        return None

    @property
    def date_modified(self) -> t.Optional[str]:
        """Always None: no modification date is provided for this doc."""
        return None

    @property
    def source_url(self) -> t.Optional[str]:
        """Always None: no source URL is provided for this doc."""
        return None

    @SourceConnectionNetworkError.wrap
    @requires_dependencies(["gitlab"], extras="gitlab")
    def _fetch_content(self):
        """Return the project file object for ``self.path``.

        The file is read from the configured branch, or from the project's
        default branch when none is configured.

        Returns:
            The file object, or None (after logging an error) when GitLab
            answers with a 404 for the path.
        """
        # Catch the base class: the file manager re-raises HTTP failures as
        # operation-specific errors that do not derive from GitlabHttpError,
        # while every GitlabError carries ``response_code``.
        from gitlab.exceptions import GitlabError

        try:
            project = self.connector_config.get_project()
            content_file = project.files.get(
                self.path,
                ref=self.connector_config.branch or project.default_branch,
            )
        except GitlabError as e:
            if e.response_code == 404:
                logger.error(f"File doesn't exists {self.connector_config.url}/{self.path}")
                return None
            raise
        return content_file

    def update_source_metadata(self, **kwargs):
        """Populate ``self.source_metadata`` from the GitLab file object.

        Args:
            **kwargs: may carry ``content_file`` (possibly None) to reuse an
                already fetched object; the file is fetched only when the key
                is absent.
        """
        # Look the key up explicitly: a ``dict.get`` default would be evaluated
        # eagerly and trigger a second API call even when a file was passed in.
        if "content_file" in kwargs:
            content_file = kwargs["content_file"]
        else:
            content_file = self._fetch_content()

        if content_file is None:
            # A missing file is a definite "does not exist".
            self.source_metadata = SourceMetadata(
                exists=False,
            )
            return
        self.source_metadata = SourceMetadata(
            version=content_file.attributes.get("last_commit_id", ""),
            exists=True,
        )

    def _fetch_and_write(self) -> None:
        """Fetch the file, record its metadata and write it to ``self.filename``.

        Raises:
            ValueError: if the file could not be retrieved.
        """
        content_file = self._fetch_content()
        self.update_source_metadata(content_file=content_file)
        if content_file is None:
            raise ValueError(
                f"Failed to retrieve file from repo "
                f"{self.connector_config.url}/{self.path}. Check logs.",
            )
        contents = content_file.decode()
        with open(self.filename, "wb") as f:
            f.write(contents)
99
+
100
+
101
@dataclass
class GitLabSourceConnector(GitSourceConnector):
    """Source connector that lists the ingestible files of a GitLab project."""

    connector_config: SimpleGitlabConfig

    @requires_dependencies(["gitlab"], extras="gitlab")
    def check_connection(self):
        """Validate the configured token by authenticating against GitLab.

        Raises:
            SourceConnectionError: if authentication fails; the original
                GitLab error is kept as the cause.
        """
        from gitlab import Gitlab
        from gitlab.exceptions import GitlabError

        try:
            gitlab = Gitlab(
                self.connector_config.base_url,
                private_token=self.connector_config.access_config.access_token,
            )
            gitlab.auth()
        except GitlabError as gitlab_error:
            logger.error(f"failed to validate connection: {gitlab_error}", exc_info=True)
            raise SourceConnectionError(
                f"failed to validate connection: {gitlab_error}",
            ) from gitlab_error

    def get_ingest_docs(self):
        """Return one ``GitLabIngestDoc`` per supported file of the project.

        The tree of the configured branch (or the project's default branch)
        is walked recursively; only blobs that pass the file-type and glob
        filters are kept.
        """
        # Load the Git tree with all files, and then create Ingest docs
        # for all blobs, i.e. all files, ignoring directories
        project = self.connector_config.get_project()
        ref = self.connector_config.branch or project.default_branch
        git_tree = project.repository_tree(
            ref=ref,
            recursive=True,
            iterator=True,
            all=True,
        )
        return [
            GitLabIngestDoc(
                connector_config=self.connector_config,
                processor_config=self.processor_config,
                read_config=self.read_config,
                path=element["path"],
            )
            for element in git_tree
            if element["type"] == "blob"
            and self.is_file_type_supported(element["path"])
            # does_path_match_glob already accepts every path when no glob is set
            and self.does_path_match_glob(element["path"])
        ]